Question

Failed to open VCF file

0

Entering edit mode

7 months ago

melissachua90 ▴ 70

Code:

bwa index $REF_GENOME_DIR/hg38.fa
samtools dict $REF_GENOME_DIR/hg38.fa -o $REF_GENOME_DIR/hg38.dict
samtools faidx hg38.fa


# Known sites VCF 
KNOWN_SITES="$VCF_DIR/00-All.vcf.gz"
bcftools index $VCF_DIR/00-All.vcf.gz

# Create a mapping file for contig names
echo -e "1\tchr1\n2\tchr2\n3\tchr3\n4\tchr4\n5\tchr5\n6\tchr6\n7\tchr7\n8\tchr8\n9\tchr9\n10\tchr10\n11\tchr11\n12\tchr12\n13\tchr13\n14\tchr14\n15\tchr15\n16\tchr16\n17\tchr17\n18\tchr18\n19\tchr19\n20\tchr20\n21\tchr21\n22\tchr22\nX\tchrX\nY\tchrY\nMT\tchrM" > $VCF_DIR/contig_mapping.txt

# Function to update VCF files and ensure only the path is outputted
update_vcf_contigs() {
    local vcf_file=$1
    local fixed_vcf_file=$(echo $vcf_file | sed 's/.vcf.gz/_fixed.vcf.gz/')
    echo "Fixing contig names in $vcf_file..."

    # Create a mapping file with old and new contig names if needed
    bcftools reheader -h $VCF_DIR/contig_mapping.txt $vcf_file -o $fixed_vcf_file
    if tabix -p vcf $fixed_vcf_file; then
        echo "$fixed_vcf_file"
    else
        echo "Error: Failed to index fixed VCF file." >&2
        return 1
    fi
}


# Check updated VCF
KNOWN_SITES=$(update_vcf_contigs "$KNOWN_SITES")
if [ -z "$KNOWN_SITES" ]; then
    echo "Error: VCF file is empty or update failed. Check the contig mapping process."
    exit 1
else
    echo "Updated VCF path: $KNOWN_SITES"
fi


shopt -s nullglob

# Quality Control
for sample_dir in $DATA_DIR/*; do
    # Check if it's a directory containing FASTQ files
    if [ -d "$sample_dir" ] && ls $sample_dir/*.fastq.gz 1> /dev/null 2>&1; then
        sample_name=$(basename "$sample_dir")
        echo "Processing $sample_name..."
        # Look for all possible extensions
        for fastq_file in "$sample_dir"/*.{fastq.gz,fastq,fq.gz,fq}; do
            echo "Found file $fastq_file"
            # Skip if the FastQC output already exists
            if find "$OUTPUT_DIR" -name "${sample_name}*_fastqc.html" | grep -q .; then
                echo "FastQC already executed for files in $sample_dir. Skipping..."
            else
                fastqc -o "$OUTPUT_DIR" "$fastq_file"
            fi
        done
    else
        echo "Skipping non-sample directory or no FASTQ files found: $sample_dir"
    fi
done


# Read Trimming and Adapter Removal
for sample_dir in "$DATA_DIR"/*; do
    if [ -d "$sample_dir" ] && ls $sample_dir/*.fastq.gz 1> /dev/null 2>&1; then
        sample_name=$(basename "$sample_dir")
        echo "Processing $sample_name..."
        found_files=false

        for fastq_file in "$sample_dir"/*.{fastq.gz,fastq,fq.gz,fq}; do
            found_files=true
            base_name=$(basename "$fastq_file" | sed 's/\(.*\)\..*/\1/')

            # Define expected trimmed file names
            trimmed_paired1="$OUTPUT_DIR/${base_name}_1_paired_trimmed.fastq.gz"
            trimmed_unpaired1="$OUTPUT_DIR/${base_name}_1_unpaired_trimmed.fastq.gz"
            trimmed_paired2="$OUTPUT_DIR/${base_name}_2_paired_trimmed.fastq.gz"
            trimmed_unpaired2="$OUTPUT_DIR/${base_name}_2_unpaired_trimmed.fastq.gz"

            # Check if trimmed files already exist
            trimmed_files_exist=true
            for file in $trimmed_paired1 $trimmed_unpaired1 $trimmed_paired2 $trimmed_unpaired2; do
                if [ ! -f "$file" ]; then
                    trimmed_files_exist=false
                    break
                fi
            done

            if [ "$trimmed_files_exist" = false ]; then
                echo "Trimming $fastq_file..."
                # Your trimming logic here
            else
                echo "Trimmed files for $base_name already exist. Skipping."
            fi
        done

        if [ "$found_files" = false ]; then
            echo "No FASTQ files found in $sample_dir, skipping this directory."
        fi
    else
        echo "Skipping non-sample directory or no FASTQ files found: $sample_dir"
    fi
done


shopt -u nullglob

# Loop through each sample directory
for sample_dir in $DATA_DIR/*; do
    if [[ -d "$sample_dir" ]]; then  # Ensure it's a directory
        sample_name=$(basename "$sample_dir")

        # Define paths for output files early
        sam_file="$OUTPUT_DIR/${sample_name}.sam"
        sorted_bam="$OUTPUT_DIR/${sample_name}.sorted.bam"
        marked_bam="$OUTPUT_DIR/${sample_name}.marked.bam"
        recal_table="$OUTPUT_DIR/${sample_name}.recal_data.table"
        recal_bam="$OUTPUT_DIR/${sample_name}.recal.bam"

        # Concatenate all read files for each pair, if necessary
        read1_files=$(find $OUTPUT_DIR -name "${sample_name}_*1.*_trimmed.fastq.gz" | sort)
        read2_files=$(find $OUTPUT_DIR -name "${sample_name}_*2.*_trimmed.fastq.gz" | sort)

        concatenated_read1="$OUTPUT_DIR/${sample_name}_concat_1.fq.gz"
        concatenated_read2="$OUTPUT_DIR/${sample_name}_concat_2.fq.gz"

        # Create concatenated files if multiple files exist and if they are not already created
        if [[ $(echo $read1_files | wc -w) -gt 1 ]] && [ ! -s "$concatenated_read1" ]; then
            cat $read1_files > $concatenated_read1
        fi
        if [[ $(echo $read2_files | wc -w) -gt 1 ]] && [ ! -s "$concatenated_read2" ]; then
            cat $read2_files > $concatenated_read2
        fi

        read1=${concatenated_read1:-$read1_files}
        read2=${concatenated_read2:-$read2_files}

        # Mapping and processing
        if [ ! -s "$sam_file" ]; then
            echo "Mapping reads using Sentieon bwa mem..."
            sentieon bwa mem -R "@RG\tID:$sample_name\tSM:$sample_name\tPL:illumina\tLB:${sample_name}_lib\tPU:unit1" $REF_GENOME_DIR/hg38.fa $read1 $read2 > $sam_file
        fi

        if [ ! -s "$sorted_bam" ]; then
            echo "Converting SAM to BAM, sorting, and indexing..."
            samtools sort -o $sorted_bam $sam_file
            samtools index $sorted_bam
        fi

        if [ ! -s "$marked_bam" ]; then
            echo "Marking duplicates..."
            sentieon driver -i $sorted_bam --algo LocusCollector --fun score_info $metrics_file
            sentieon driver -i $sorted_bam --algo Dedup --score_info $metrics_file --metrics $metrics_file -o $marked_bam
        fi

        if [ ! -s "$recal_table" ] && [ -s "$marked_bam" ]; then
            echo "Generating the BQSR calibration table..."
            sentieon driver -t 30 -r $REF_GENOME_DIR/hg38.fa -i $marked_bam --algo QualCal -k $KNOWN_SITES -q $recal_table
        fi

        if [ ! -s "$recal_bam" ] && [ -s "$recal_table" ]; then
            echo "Applying Base Quality Score Recalibration..."
            sentieon driver -t 30 -r $REF_GENOME_DIR/hg38.fa -i $marked_bam --algo QualCalApply -q $recal_table -o $recal_bam
        fi
    fi
done

# Check and Generate gVCF files if they do not exist
for sample_dir in $DATA_DIR/*; do
    if [ -d "$sample_dir" ]; then
        sample_name=$(basename "$sample_dir")
        bam_file="$OUTPUT_DIR/${sample_name}.sorted.bam" # Adjusted to reflect the correct BAM file naming
        gvcf_file="$OUTPUT_DIR/${sample_name}.g.vcf.gz"  # Define the gVCF file path

        # Check if gVCF file exists
        if [ ! -f "$gvcf_file" ]; then

            echo "Generating gVCF for $sample_name..."

            sentieon driver -t 30 -r $REF_GENOME -i $bam_file --algo Haplotyper --emit_mode gvcf -d $KNOWN_SITES -o $gvcf_file

            echo "gVCF generated at $gvcf_file"
        else
            echo "gVCF file for $sample_name already exists at $gvcf_file."
        fi
    fi
done



# Combine GVCFs
echo "Combining gVCF files..."
gatk CombineGVCFs \
    -R $REF_GENOME \
    $(for gvcf_file in $OUTPUT_DIR/*.g.vcf.gz; do echo "-V $gvcf_file"; done) \
    -O $OUTPUT_DIR/cohort.g.vcf.gz

echo "All gVCF files combined into a cohort gVCF."

# Function to check and fix VCF contig names to match the reference genome
fix_vcf_contigs() {
    local vcf_file=$1
    local fixed_vcf_file=$(echo $vcf_file | sed 's/.vcf.gz/_fixed.vcf.gz/')
    echo "Fixing contig names in $vcf_file..."

    # Create a mapping file with old and new contig names if needed
    echo -e "1\tchr1\n2\tchr2\n3\tchr3\n4\tchr4\n5\tchr5\n6\tchr6\n7\tchr7\n8\tchr8\n9\tchr9\n10\tchr10\n11\tchr11\n12\tchr12\n13\tchr13\n14\tchr14\n15\tchr15\n16\tchr16\n17\tchr17\n18\tchr18\n19\tchr19\n20\tchr20\n21\tchr21\n22\tchr22\nX\tchrX\nY\tchrY\nMT\tchrM" > $VCF_DIR/contig_mapping.txt

    # Replace contigs using bcftools (make sure contig_mapping.txt is accurate for your files)
    bcftools reheader -f contig_mapping.txt $vcf_file -o $fixed_vcf_file
    tabix -p vcf $fixed_vcf_file

    echo "Fixed VCF written to $fixed_vcf_file"
    return $fixed_vcf_file
}

# Check and potentially fix contigs in resource VCFs
hapmap_vcf=$(fix_vcf_contigs hapmap_3.3.hg38.vcf.gz)
omni_vcf=$(fix_vcf_contigs 1000G_omni2.5.hg38.vcf.gz)
G1k_vcf=$(fix_vcf_contigs 1000G_phase1.snps.high_confidence.hg38.vcf.gz)
dbsnp_vcf=$(fix_vcf_contigs Homo_sapiens_assembly38.dbsnp138.vcf)

# Variant Recalibration with fixed VCF files
echo "Starting variant recalibration..."
gatk VariantRecalibrator \
    -R $REF_GENOME \
    -V $OUTPUT_DIR/cohort.g.vcf.gz \
    --resource:hapmap,known=false,training=true,truth=true,prior=15.0 $hapmap_vcf \
    --resource:omni,known=false,training=true,truth=true,prior=12.0 $omni_vcf \
    --resource:1000G,known=false,training=true,truth=false,prior=10.0 $G1k_vcf \
    --resource:dbsnp,known=true,training=false,truth=false,prior=7.0 $dbsnp_vcf \
    -an QD -an MQ -an MQRankSum -an ReadPosRankSum -an FS -an SOR \
    -mode SNP \
    -tranche 100.0 -tranche 99.9 -tranche 99.0 -tranche 90.0 \
    --output $OUTPUT_DIR/cohort.recal \
    --tranches-file $OUTPUT_DIR/cohort.tranches \
    --rscript-file $OUTPUT_DIR/cohort.plots.R

echo "Variant recalibration process completed."

# Apply the recalibration to your cohort VCF
echo "Applying VQSR..."
gatk ApplyVQSR \
    -R $REF_GENOME \
    -V $OUTPUT_DIR/cohort.g.vcf.gz \
    --truth-sensitivity-filter-level 99.7 \
    --tranches-file $OUTPUT_DIR/cohort.tranches \
    --recal-file $OUTPUT_DIR/cohort.recal \
    --mode SNP \
    -O $OUTPUT_DIR/cohort.recalibrated.vcf.gz

echo "Variant recalibration and application complete."

echo "All samples processed."

Traceback:

Generating the BQSR calibration table...
cmdline: /scg/apps/software/sentieon/201808.03/libexec/driver -t 30 -r reference_genomes/hg38/hg38.fa -i 
output/A069C001.marked.bam --algo QualCal -k Fixing contig names in VCF/00-All.vcf.gz... VCF/00-All_fixed.vcf.gz -q output/A069C001.recal_data.table
Error: Failed to open vcf file Fixing: No such file or directory

GATK sentieon BWA-MEM • 432 views

ADD COMMENT • link updated 7 months ago by Pierre Lindenbaum 164k • written 7 months ago by melissachua90 ▴ 70

score 2 · Answer 1 · 2024-05-11

Error: Failed to open vcf file Fixing: No such file or directory

one of your file is missing, obviously.

Check files exist using [ -f ] (not [ -z ] )

we cannot easily debug your script because it's just too long.

run you script with " Defensive BASH Programming ":

set -e
set -u
set -o pipefail

you should learn to work with workflow managers: snakemake/nextflow.