#!/bin/bash

# ==============================================================================
# Script Name: extract_exome.sh
# Description: Extracts exonic regions (exome) from a genomic FASTA and GFF file.
#              Ensures that all sequence IDs in the GFF match those in the FASTA.
# Usage: ./extract_exome.sh genome.fna(.gz) genome.gff(.gz) [output_exome.fa]
# Dependencies: bedtools, samtools, awk, zcat, gunzip, sort, comm
# ==============================================================================

# Exit immediately if a command exits with a non-zero status
set -e

# Function to display usage instructions
usage() {
    echo "Usage: $0 <genome.fna(.gz)> <genome.gff(.gz)> [output_exome.fa]"
    echo
    echo "Arguments:"
    echo "  genome.fna(.gz)   Genomic FASTA file (.fna or .fna.gz)"
    echo "  genome.gff(.gz)   GFF annotation file (.gff or .gff.gz)"
    echo "  output_exome.fa  (Optional) Output FASTA file for exome sequences (default: exome.fa)"
    exit 1
}

# Check if at least two arguments are provided
if [ "$#" -lt 2 ]; then
    echo "Error: Insufficient arguments provided."
    usage
fi

# Assign input arguments to variables
GENOME_FNA_INPUT="$1"
GENOME_GFF_INPUT="$2"

# Assign output file name
OUTPUT_FA="${3:-exome.fa}"

# Function to check if a file is gzipped
is_gzipped() {
    local file="$1"
    if [[ "$file" == *.gz ]]; then
        return 0  # True: gzipped
    else
        return 1  # False: not gzipped
    fi
}

# Function to decompress a file if it's gzipped
decompress_if_needed() {
    local file="$1"
    local tmp_dir="$2"
    local output_var="$3"

    if is_gzipped "$file"; then
        local decompressed_file="$tmp_dir/$(basename "${file%.*}")"
        echo "Decompressing '$file' to '$decompressed_file'..."
        gunzip -c "$file" > "$decompressed_file"
        eval "$output_var='$decompressed_file'"
    else
        echo "Using uncompressed file '$file'..."
        eval "$output_var='$file'"
    fi
}

# Check if input files exist
if [ ! -f "$GENOME_FNA_INPUT" ]; then
    echo "Error: File '$GENOME_FNA_INPUT' not found."
    exit 1
fi

if [ ! -f "$GENOME_GFF_INPUT" ]; then
    echo "Error: File '$GENOME_GFF_INPUT' not found."
    exit 1
fi

# Check if required tools are installed
for cmd in bedtools samtools awk sort comm; do
    if ! command -v "$cmd" &> /dev/null; then
        echo "Error: '$cmd' is not installed. Please install it and try again."
        exit 1
    fi
done

# Determine which decompression tool to use
if command -v zcat &> /dev/null; then
    DECOMPRESS_CMD="zcat"
elif command -v gunzip &> /dev/null; then
    DECOMPRESS_CMD="gunzip -c"
else
    echo "Error: Neither 'zcat' nor 'gunzip' is available for decompression."
    exit 1
fi

# Create a temporary directory for intermediate files
TMP_DIR=$(mktemp -d)
trap "rm -rf $TMP_DIR" EXIT

echo "Temporary directory created at $TMP_DIR"

# Define intermediate file paths
EXONS_BED="$TMP_DIR/exons.bed"
DECOMPRESSED_FA="$TMP_DIR/genome.fna"
DECOMPRESSED_GFF="$TMP_DIR/genome.gff"

# ==============================================================================
# Step 1: Prepare the Genomic FASTA File
# ==============================================================================

echo "Preparing the genomic FASTA file..."

decompress_if_needed "$GENOME_FNA_INPUT" "$TMP_DIR" DECOMPRESSED_FA

# ==============================================================================
# Step 2: Prepare the GFF File
# ==============================================================================

echo "Preparing the GFF file..."

decompress_if_needed "$GENOME_GFF_INPUT" "$TMP_DIR" DECOMPRESSED_GFF

# ==============================================================================
# Step 3: Extract and Compare Sequence IDs
# ==============================================================================

echo "Extracting sequence IDs from FASTA and GFF files..."

# Extract sequence IDs from FASTA
grep '^>' "$DECOMPRESSED_FA" | awk '{print substr($1,2)}' | sort > "$TMP_DIR/fasta_ids.txt"

# Extract sequence IDs from GFF, excluding comment lines
awk '!/^#/ {print $1}' "$DECOMPRESSED_GFF" | sort | uniq > "$TMP_DIR/gff_ids.txt"

echo "Comparing sequence IDs between FASTA and GFF..."

# Find IDs in GFF not in FASTA
comm -23 "$TMP_DIR/gff_ids.txt" "$TMP_DIR/fasta_ids.txt" > "$TMP_DIR/ids_only_in_gff.txt"

# Find IDs in FASTA not in GFF
comm -13 "$TMP_DIR/gff_ids.txt" "$TMP_DIR/fasta_ids.txt" > "$TMP_DIR/ids_only_in_fasta.txt"

# Check for discrepancies
IDS_ONLY_IN_GFF=$(wc -l < "$TMP_DIR/ids_only_in_gff.txt")
IDS_ONLY_IN_FASTA=$(wc -l < "$TMP_DIR/ids_only_in_fasta.txt")

if [ "$IDS_ONLY_IN_GFF" -ne 0 ] || [ "$IDS_ONLY_IN_FASTA" -ne 0 ]; then
    echo "Error: Sequence ID mismatches detected between GFF and FASTA files."

    if [ "$IDS_ONLY_IN_GFF" -ne 0 ]; then
        echo "Sequence IDs present in GFF but NOT in FASTA ($IDS_ONLY_IN_GFF):"
        cat "$TMP_DIR/ids_only_in_gff.txt"
    fi

    if [ "$IDS_ONLY_IN_FASTA" -ne 0 ]; then
        echo "Sequence IDs present in FASTA but NOT in GFF ($IDS_ONLY_IN_FASTA):"
        cat "$TMP_DIR/ids_only_in_fasta.txt"
    fi

    echo "Please ensure that the sequence IDs in both files match exactly."
    exit 1
else
    echo "All sequence IDs in GFF match those in FASTA."
fi

# ==============================================================================
# Step 4: Index the Genomic FASTA File
# ==============================================================================

echo "Indexing the genomic FASTA file using samtools faidx..."

samtools faidx "$DECOMPRESSED_FA"

echo "FASTA indexing completed."

# ==============================================================================
# Step 5: Extract Exon Coordinates and Convert to BED Format
# ==============================================================================

echo "Extracting exon coordinates from GFF file and converting to BED format..."

awk '$3 == "exon" {print $1"\t"($4-1)"\t"$5"\t"$9"\t.\t"$7}' "$DECOMPRESSED_GFF" > "$EXONS_BED"

echo "Exon coordinates saved to $EXONS_BED"

# ==============================================================================
# Step 6: Extract Exonic Sequences using bedtools
# ==============================================================================

echo "Extracting exonic sequences using bedtools..."

bedtools getfasta \
    -fi "$DECOMPRESSED_FA" \
    -bed "$EXONS_BED" \
    -fo "$OUTPUT_FA" \
    -name  # Optional: Use BED name field as FASTA headers

echo "Exonic sequences saved to $OUTPUT_FA"

# ==============================================================================
# Completion Message
# ==============================================================================

echo "Exome extraction completed successfully."
echo "Output FASTA file: $OUTPUT_FA"

