Failed to open VCF file
1
0
Entering edit mode
7 months ago

Code:

bwa index $REF_GENOME_DIR/hg38.fa
samtools dict $REF_GENOME_DIR/hg38.fa -o $REF_GENOME_DIR/hg38.dict
samtools faidx hg38.fa


# Known sites VCF 
KNOWN_SITES="$VCF_DIR/00-All.vcf.gz"
bcftools index $VCF_DIR/00-All.vcf.gz

# Create a mapping file for contig names
echo -e "1\tchr1\n2\tchr2\n3\tchr3\n4\tchr4\n5\tchr5\n6\tchr6\n7\tchr7\n8\tchr8\n9\tchr9\n10\tchr10\n11\tchr11\n12\tchr12\n13\tchr13\n14\tchr14\n15\tchr15\n16\tchr16\n17\tchr17\n18\tchr18\n19\tchr19\n20\tchr20\n21\tchr21\n22\tchr22\nX\tchrX\nY\tchrY\nMT\tchrM" > $VCF_DIR/contig_mapping.txt

# Function to update VCF files and ensure only the path is outputted
update_vcf_contigs() {
    local vcf_file=$1
    local fixed_vcf_file=$(echo $vcf_file | sed 's/.vcf.gz/_fixed.vcf.gz/')
    echo "Fixing contig names in $vcf_file..."

    # Create a mapping file with old and new contig names if needed
    bcftools reheader -h $VCF_DIR/contig_mapping.txt $vcf_file -o $fixed_vcf_file
    if tabix -p vcf $fixed_vcf_file; then
        echo "$fixed_vcf_file"
    else
        echo "Error: Failed to index fixed VCF file." >&2
        return 1
    fi
}


# Check updated VCF
KNOWN_SITES=$(update_vcf_contigs "$KNOWN_SITES")
if [ -z "$KNOWN_SITES" ]; then
    echo "Error: VCF file is empty or update failed. Check the contig mapping process."
    exit 1
else
    echo "Updated VCF path: $KNOWN_SITES"
fi


shopt -s nullglob

# Quality Control
for sample_dir in $DATA_DIR/*; do
    # Check if it's a directory containing FASTQ files
    if [ -d "$sample_dir" ] && ls $sample_dir/*.fastq.gz 1> /dev/null 2>&1; then
        sample_name=$(basename "$sample_dir")
        echo "Processing $sample_name..."
        # Look for all possible extensions
        for fastq_file in "$sample_dir"/*.{fastq.gz,fastq,fq.gz,fq}; do
            echo "Found file $fastq_file"
            # Skip if the FastQC output already exists
            if find "$OUTPUT_DIR" -name "${sample_name}*_fastqc.html" | grep -q .; then
                echo "FastQC already executed for files in $sample_dir. Skipping..."
            else
                fastqc -o "$OUTPUT_DIR" "$fastq_file"
            fi
        done
    else
        echo "Skipping non-sample directory or no FASTQ files found: $sample_dir"
    fi
done


# Read Trimming and Adapter Removal
for sample_dir in "$DATA_DIR"/*; do
    if [ -d "$sample_dir" ] && ls $sample_dir/*.fastq.gz 1> /dev/null 2>&1; then
        sample_name=$(basename "$sample_dir")
        echo "Processing $sample_name..."
        found_files=false

        for fastq_file in "$sample_dir"/*.{fastq.gz,fastq,fq.gz,fq}; do
            found_files=true
            base_name=$(basename "$fastq_file" | sed 's/\(.*\)\..*/\1/')

            # Define expected trimmed file names
            trimmed_paired1="$OUTPUT_DIR/${base_name}_1_paired_trimmed.fastq.gz"
            trimmed_unpaired1="$OUTPUT_DIR/${base_name}_1_unpaired_trimmed.fastq.gz"
            trimmed_paired2="$OUTPUT_DIR/${base_name}_2_paired_trimmed.fastq.gz"
            trimmed_unpaired2="$OUTPUT_DIR/${base_name}_2_unpaired_trimmed.fastq.gz"

            # Check if trimmed files already exist
            trimmed_files_exist=true
            for file in $trimmed_paired1 $trimmed_unpaired1 $trimmed_paired2 $trimmed_unpaired2; do
                if [ ! -f "$file" ]; then
                    trimmed_files_exist=false
                    break
                fi
            done

            if [ "$trimmed_files_exist" = false ]; then
                echo "Trimming $fastq_file..."
                # Your trimming logic here
            else
                echo "Trimmed files for $base_name already exist. Skipping."
            fi
        done

        if [ "$found_files" = false ]; then
            echo "No FASTQ files found in $sample_dir, skipping this directory."
        fi
    else
        echo "Skipping non-sample directory or no FASTQ files found: $sample_dir"
    fi
done


shopt -u nullglob

# Loop through each sample directory
for sample_dir in $DATA_DIR/*; do
    if [[ -d "$sample_dir" ]]; then  # Ensure it's a directory
        sample_name=$(basename "$sample_dir")

        # Define paths for output files early
        sam_file="$OUTPUT_DIR/${sample_name}.sam"
        sorted_bam="$OUTPUT_DIR/${sample_name}.sorted.bam"
        marked_bam="$OUTPUT_DIR/${sample_name}.marked.bam"
        recal_table="$OUTPUT_DIR/${sample_name}.recal_data.table"
        recal_bam="$OUTPUT_DIR/${sample_name}.recal.bam"

        # Concatenate all read files for each pair, if necessary
        read1_files=$(find $OUTPUT_DIR -name "${sample_name}_*1.*_trimmed.fastq.gz" | sort)
        read2_files=$(find $OUTPUT_DIR -name "${sample_name}_*2.*_trimmed.fastq.gz" | sort)

        concatenated_read1="$OUTPUT_DIR/${sample_name}_concat_1.fq.gz"
        concatenated_read2="$OUTPUT_DIR/${sample_name}_concat_2.fq.gz"

        # Create concatenated files if multiple files exist and if they are not already created
        if [[ $(echo $read1_files | wc -w) -gt 1 ]] && [ ! -s "$concatenated_read1" ]; then
            cat $read1_files > $concatenated_read1
        fi
        if [[ $(echo $read2_files | wc -w) -gt 1 ]] && [ ! -s "$concatenated_read2" ]; then
            cat $read2_files > $concatenated_read2
        fi

        read1=${concatenated_read1:-$read1_files}
        read2=${concatenated_read2:-$read2_files}

        # Mapping and processing
        if [ ! -s "$sam_file" ]; then
            echo "Mapping reads using Sentieon bwa mem..."
            sentieon bwa mem -R "@RG\tID:$sample_name\tSM:$sample_name\tPL:illumina\tLB:${sample_name}_lib\tPU:unit1" $REF_GENOME_DIR/hg38.fa $read1 $read2 > $sam_file
        fi

        if [ ! -s "$sorted_bam" ]; then
            echo "Converting SAM to BAM, sorting, and indexing..."
            samtools sort -o $sorted_bam $sam_file
            samtools index $sorted_bam
        fi

        if [ ! -s "$marked_bam" ]; then
            echo "Marking duplicates..."
            sentieon driver -i $sorted_bam --algo LocusCollector --fun score_info $metrics_file
            sentieon driver -i $sorted_bam --algo Dedup --score_info $metrics_file --metrics $metrics_file -o $marked_bam
        fi

        if [ ! -s "$recal_table" ] && [ -s "$marked_bam" ]; then
            echo "Generating the BQSR calibration table..."
            sentieon driver -t 30 -r $REF_GENOME_DIR/hg38.fa -i $marked_bam --algo QualCal -k $KNOWN_SITES -q $recal_table
        fi

        if [ ! -s "$recal_bam" ] && [ -s "$recal_table" ]; then
            echo "Applying Base Quality Score Recalibration..."
            sentieon driver -t 30 -r $REF_GENOME_DIR/hg38.fa -i $marked_bam --algo QualCalApply -q $recal_table -o $recal_bam
        fi
    fi
done

# Check and Generate gVCF files if they do not exist
for sample_dir in $DATA_DIR/*; do
    if [ -d "$sample_dir" ]; then
        sample_name=$(basename "$sample_dir")
        bam_file="$OUTPUT_DIR/${sample_name}.sorted.bam" # Adjusted to reflect the correct BAM file naming
        gvcf_file="$OUTPUT_DIR/${sample_name}.g.vcf.gz"  # Define the gVCF file path

        # Check if gVCF file exists
        if [ ! -f "$gvcf_file" ]; then

            echo "Generating gVCF for $sample_name..."

            sentieon driver -t 30 -r $REF_GENOME -i $bam_file --algo Haplotyper --emit_mode gvcf -d $KNOWN_SITES -o $gvcf_file

            echo "gVCF generated at $gvcf_file"
        else
            echo "gVCF file for $sample_name already exists at $gvcf_file."
        fi
    fi
done



# Combine GVCFs
echo "Combining gVCF files..."
gatk CombineGVCFs \
    -R $REF_GENOME \
    $(for gvcf_file in $OUTPUT_DIR/*.g.vcf.gz; do echo "-V $gvcf_file"; done) \
    -O $OUTPUT_DIR/cohort.g.vcf.gz

echo "All gVCF files combined into a cohort gVCF."

# Function to check and fix VCF contig names to match the reference genome
fix_vcf_contigs() {
    local vcf_file=$1
    local fixed_vcf_file=$(echo $vcf_file | sed 's/.vcf.gz/_fixed.vcf.gz/')
    echo "Fixing contig names in $vcf_file..."

    # Create a mapping file with old and new contig names if needed
    echo -e "1\tchr1\n2\tchr2\n3\tchr3\n4\tchr4\n5\tchr5\n6\tchr6\n7\tchr7\n8\tchr8\n9\tchr9\n10\tchr10\n11\tchr11\n12\tchr12\n13\tchr13\n14\tchr14\n15\tchr15\n16\tchr16\n17\tchr17\n18\tchr18\n19\tchr19\n20\tchr20\n21\tchr21\n22\tchr22\nX\tchrX\nY\tchrY\nMT\tchrM" > $VCF_DIR/contig_mapping.txt

    # Replace contigs using bcftools (make sure contig_mapping.txt is accurate for your files)
    bcftools reheader -f contig_mapping.txt $vcf_file -o $fixed_vcf_file
    tabix -p vcf $fixed_vcf_file

    echo "Fixed VCF written to $fixed_vcf_file"
    return $fixed_vcf_file
}

# Check and potentially fix contigs in resource VCFs
hapmap_vcf=$(fix_vcf_contigs hapmap_3.3.hg38.vcf.gz)
omni_vcf=$(fix_vcf_contigs 1000G_omni2.5.hg38.vcf.gz)
G1k_vcf=$(fix_vcf_contigs 1000G_phase1.snps.high_confidence.hg38.vcf.gz)
dbsnp_vcf=$(fix_vcf_contigs Homo_sapiens_assembly38.dbsnp138.vcf)

# Variant Recalibration with fixed VCF files
echo "Starting variant recalibration..."
gatk VariantRecalibrator \
    -R $REF_GENOME \
    -V $OUTPUT_DIR/cohort.g.vcf.gz \
    --resource:hapmap,known=false,training=true,truth=true,prior=15.0 $hapmap_vcf \
    --resource:omni,known=false,training=true,truth=true,prior=12.0 $omni_vcf \
    --resource:1000G,known=false,training=true,truth=false,prior=10.0 $G1k_vcf \
    --resource:dbsnp,known=true,training=false,truth=false,prior=7.0 $dbsnp_vcf \
    -an QD -an MQ -an MQRankSum -an ReadPosRankSum -an FS -an SOR \
    -mode SNP \
    -tranche 100.0 -tranche 99.9 -tranche 99.0 -tranche 90.0 \
    --output $OUTPUT_DIR/cohort.recal \
    --tranches-file $OUTPUT_DIR/cohort.tranches \
    --rscript-file $OUTPUT_DIR/cohort.plots.R

echo "Variant recalibration process completed."

# Apply the recalibration to your cohort VCF
echo "Applying VQSR..."
gatk ApplyVQSR \
    -R $REF_GENOME \
    -V $OUTPUT_DIR/cohort.g.vcf.gz \
    --truth-sensitivity-filter-level 99.7 \
    --tranches-file $OUTPUT_DIR/cohort.tranches \
    --recal-file $OUTPUT_DIR/cohort.recal \
    --mode SNP \
    -O $OUTPUT_DIR/cohort.recalibrated.vcf.gz

echo "Variant recalibration and application complete."

echo "All samples processed."

Traceback:

Generating the BQSR calibration table...
cmdline: /scg/apps/software/sentieon/201808.03/libexec/driver -t 30 -r reference_genomes/hg38/hg38.fa -i 
output/A069C001.marked.bam --algo QualCal -k Fixing contig names in VCF/00-All.vcf.gz... VCF/00-All_fixed.vcf.gz -q output/A069C001.recal_data.table
Error: Failed to open vcf file Fixing: No such file or directory
GATK sentieon BWA-MEM • 432 views
ADD COMMENT
2
Entering edit mode
7 months ago

Error: Failed to open vcf file Fixing: No such file or directory

one of your file is missing, obviously.

Check files exist using [ -f ] (not [ -z ] )

we cannot easily debug your script because it's just too long.

run you script with " Defensive BASH Programming ":

set -e
set -u
set -o pipefail

you should learn to work with workflow managers: snakemake/nextflow.

ADD COMMENT

Login before adding your answer.

Traffic: 2003 users visited in the last hour
Help About
FAQ
Access RSS
API
Stats

Use of this site constitutes acceptance of our User Agreement and Privacy Policy.

Powered by the version 2.3.6