Entering edit mode
7 months ago
melissachua90
▴
70
Code:
bwa index $REF_GENOME_DIR/hg38.fa
samtools dict $REF_GENOME_DIR/hg38.fa -o $REF_GENOME_DIR/hg38.dict
samtools faidx hg38.fa
# Known sites VCF
KNOWN_SITES="$VCF_DIR/00-All.vcf.gz"
bcftools index $VCF_DIR/00-All.vcf.gz
# Create a mapping file for contig names
echo -e "1\tchr1\n2\tchr2\n3\tchr3\n4\tchr4\n5\tchr5\n6\tchr6\n7\tchr7\n8\tchr8\n9\tchr9\n10\tchr10\n11\tchr11\n12\tchr12\n13\tchr13\n14\tchr14\n15\tchr15\n16\tchr16\n17\tchr17\n18\tchr18\n19\tchr19\n20\tchr20\n21\tchr21\n22\tchr22\nX\tchrX\nY\tchrY\nMT\tchrM" > $VCF_DIR/contig_mapping.txt
# Function to update VCF files and ensure only the path is outputted
update_vcf_contigs() {
local vcf_file=$1
local fixed_vcf_file=$(echo $vcf_file | sed 's/.vcf.gz/_fixed.vcf.gz/')
echo "Fixing contig names in $vcf_file..."
# Create a mapping file with old and new contig names if needed
bcftools reheader -h $VCF_DIR/contig_mapping.txt $vcf_file -o $fixed_vcf_file
if tabix -p vcf $fixed_vcf_file; then
echo "$fixed_vcf_file"
else
echo "Error: Failed to index fixed VCF file." >&2
return 1
fi
}
# Check updated VCF
KNOWN_SITES=$(update_vcf_contigs "$KNOWN_SITES")
if [ -z "$KNOWN_SITES" ]; then
echo "Error: VCF file is empty or update failed. Check the contig mapping process."
exit 1
else
echo "Updated VCF path: $KNOWN_SITES"
fi
shopt -s nullglob
# Quality Control
for sample_dir in $DATA_DIR/*; do
# Check if it's a directory containing FASTQ files
if [ -d "$sample_dir" ] && ls $sample_dir/*.fastq.gz 1> /dev/null 2>&1; then
sample_name=$(basename "$sample_dir")
echo "Processing $sample_name..."
# Look for all possible extensions
for fastq_file in "$sample_dir"/*.{fastq.gz,fastq,fq.gz,fq}; do
echo "Found file $fastq_file"
# Skip if the FastQC output already exists
if find "$OUTPUT_DIR" -name "${sample_name}*_fastqc.html" | grep -q .; then
echo "FastQC already executed for files in $sample_dir. Skipping..."
else
fastqc -o "$OUTPUT_DIR" "$fastq_file"
fi
done
else
echo "Skipping non-sample directory or no FASTQ files found: $sample_dir"
fi
done
# Read Trimming and Adapter Removal
for sample_dir in "$DATA_DIR"/*; do
if [ -d "$sample_dir" ] && ls $sample_dir/*.fastq.gz 1> /dev/null 2>&1; then
sample_name=$(basename "$sample_dir")
echo "Processing $sample_name..."
found_files=false
for fastq_file in "$sample_dir"/*.{fastq.gz,fastq,fq.gz,fq}; do
found_files=true
base_name=$(basename "$fastq_file" | sed 's/\(.*\)\..*/\1/')
# Define expected trimmed file names
trimmed_paired1="$OUTPUT_DIR/${base_name}_1_paired_trimmed.fastq.gz"
trimmed_unpaired1="$OUTPUT_DIR/${base_name}_1_unpaired_trimmed.fastq.gz"
trimmed_paired2="$OUTPUT_DIR/${base_name}_2_paired_trimmed.fastq.gz"
trimmed_unpaired2="$OUTPUT_DIR/${base_name}_2_unpaired_trimmed.fastq.gz"
# Check if trimmed files already exist
trimmed_files_exist=true
for file in $trimmed_paired1 $trimmed_unpaired1 $trimmed_paired2 $trimmed_unpaired2; do
if [ ! -f "$file" ]; then
trimmed_files_exist=false
break
fi
done
if [ "$trimmed_files_exist" = false ]; then
echo "Trimming $fastq_file..."
# Your trimming logic here
else
echo "Trimmed files for $base_name already exist. Skipping."
fi
done
if [ "$found_files" = false ]; then
echo "No FASTQ files found in $sample_dir, skipping this directory."
fi
else
echo "Skipping non-sample directory or no FASTQ files found: $sample_dir"
fi
done
shopt -u nullglob
# Loop through each sample directory
for sample_dir in $DATA_DIR/*; do
if [[ -d "$sample_dir" ]]; then # Ensure it's a directory
sample_name=$(basename "$sample_dir")
# Define paths for output files early
sam_file="$OUTPUT_DIR/${sample_name}.sam"
sorted_bam="$OUTPUT_DIR/${sample_name}.sorted.bam"
marked_bam="$OUTPUT_DIR/${sample_name}.marked.bam"
recal_table="$OUTPUT_DIR/${sample_name}.recal_data.table"
recal_bam="$OUTPUT_DIR/${sample_name}.recal.bam"
# Concatenate all read files for each pair, if necessary
read1_files=$(find $OUTPUT_DIR -name "${sample_name}_*1.*_trimmed.fastq.gz" | sort)
read2_files=$(find $OUTPUT_DIR -name "${sample_name}_*2.*_trimmed.fastq.gz" | sort)
concatenated_read1="$OUTPUT_DIR/${sample_name}_concat_1.fq.gz"
concatenated_read2="$OUTPUT_DIR/${sample_name}_concat_2.fq.gz"
# Create concatenated files if multiple files exist and if they are not already created
if [[ $(echo $read1_files | wc -w) -gt 1 ]] && [ ! -s "$concatenated_read1" ]; then
cat $read1_files > $concatenated_read1
fi
if [[ $(echo $read2_files | wc -w) -gt 1 ]] && [ ! -s "$concatenated_read2" ]; then
cat $read2_files > $concatenated_read2
fi
read1=${concatenated_read1:-$read1_files}
read2=${concatenated_read2:-$read2_files}
# Mapping and processing
if [ ! -s "$sam_file" ]; then
echo "Mapping reads using Sentieon bwa mem..."
sentieon bwa mem -R "@RG\tID:$sample_name\tSM:$sample_name\tPL:illumina\tLB:${sample_name}_lib\tPU:unit1" $REF_GENOME_DIR/hg38.fa $read1 $read2 > $sam_file
fi
if [ ! -s "$sorted_bam" ]; then
echo "Converting SAM to BAM, sorting, and indexing..."
samtools sort -o $sorted_bam $sam_file
samtools index $sorted_bam
fi
if [ ! -s "$marked_bam" ]; then
echo "Marking duplicates..."
sentieon driver -i $sorted_bam --algo LocusCollector --fun score_info $metrics_file
sentieon driver -i $sorted_bam --algo Dedup --score_info $metrics_file --metrics $metrics_file -o $marked_bam
fi
if [ ! -s "$recal_table" ] && [ -s "$marked_bam" ]; then
echo "Generating the BQSR calibration table..."
sentieon driver -t 30 -r $REF_GENOME_DIR/hg38.fa -i $marked_bam --algo QualCal -k $KNOWN_SITES -q $recal_table
fi
if [ ! -s "$recal_bam" ] && [ -s "$recal_table" ]; then
echo "Applying Base Quality Score Recalibration..."
sentieon driver -t 30 -r $REF_GENOME_DIR/hg38.fa -i $marked_bam --algo QualCalApply -q $recal_table -o $recal_bam
fi
fi
done
# Check and Generate gVCF files if they do not exist
for sample_dir in $DATA_DIR/*; do
if [ -d "$sample_dir" ]; then
sample_name=$(basename "$sample_dir")
bam_file="$OUTPUT_DIR/${sample_name}.sorted.bam" # Adjusted to reflect the correct BAM file naming
gvcf_file="$OUTPUT_DIR/${sample_name}.g.vcf.gz" # Define the gVCF file path
# Check if gVCF file exists
if [ ! -f "$gvcf_file" ]; then
echo "Generating gVCF for $sample_name..."
sentieon driver -t 30 -r $REF_GENOME -i $bam_file --algo Haplotyper --emit_mode gvcf -d $KNOWN_SITES -o $gvcf_file
echo "gVCF generated at $gvcf_file"
else
echo "gVCF file for $sample_name already exists at $gvcf_file."
fi
fi
done
# Combine GVCFs
echo "Combining gVCF files..."
gatk CombineGVCFs \
-R $REF_GENOME \
$(for gvcf_file in $OUTPUT_DIR/*.g.vcf.gz; do echo "-V $gvcf_file"; done) \
-O $OUTPUT_DIR/cohort.g.vcf.gz
echo "All gVCF files combined into a cohort gVCF."
# Function to check and fix VCF contig names to match the reference genome
fix_vcf_contigs() {
local vcf_file=$1
local fixed_vcf_file=$(echo $vcf_file | sed 's/.vcf.gz/_fixed.vcf.gz/')
echo "Fixing contig names in $vcf_file..."
# Create a mapping file with old and new contig names if needed
echo -e "1\tchr1\n2\tchr2\n3\tchr3\n4\tchr4\n5\tchr5\n6\tchr6\n7\tchr7\n8\tchr8\n9\tchr9\n10\tchr10\n11\tchr11\n12\tchr12\n13\tchr13\n14\tchr14\n15\tchr15\n16\tchr16\n17\tchr17\n18\tchr18\n19\tchr19\n20\tchr20\n21\tchr21\n22\tchr22\nX\tchrX\nY\tchrY\nMT\tchrM" > $VCF_DIR/contig_mapping.txt
# Replace contigs using bcftools (make sure contig_mapping.txt is accurate for your files)
bcftools reheader -f contig_mapping.txt $vcf_file -o $fixed_vcf_file
tabix -p vcf $fixed_vcf_file
echo "Fixed VCF written to $fixed_vcf_file"
return $fixed_vcf_file
}
# Check and potentially fix contigs in resource VCFs
hapmap_vcf=$(fix_vcf_contigs hapmap_3.3.hg38.vcf.gz)
omni_vcf=$(fix_vcf_contigs 1000G_omni2.5.hg38.vcf.gz)
G1k_vcf=$(fix_vcf_contigs 1000G_phase1.snps.high_confidence.hg38.vcf.gz)
dbsnp_vcf=$(fix_vcf_contigs Homo_sapiens_assembly38.dbsnp138.vcf)
# Variant Recalibration with fixed VCF files
echo "Starting variant recalibration..."
gatk VariantRecalibrator \
-R $REF_GENOME \
-V $OUTPUT_DIR/cohort.g.vcf.gz \
--resource:hapmap,known=false,training=true,truth=true,prior=15.0 $hapmap_vcf \
--resource:omni,known=false,training=true,truth=true,prior=12.0 $omni_vcf \
--resource:1000G,known=false,training=true,truth=false,prior=10.0 $G1k_vcf \
--resource:dbsnp,known=true,training=false,truth=false,prior=7.0 $dbsnp_vcf \
-an QD -an MQ -an MQRankSum -an ReadPosRankSum -an FS -an SOR \
-mode SNP \
-tranche 100.0 -tranche 99.9 -tranche 99.0 -tranche 90.0 \
--output $OUTPUT_DIR/cohort.recal \
--tranches-file $OUTPUT_DIR/cohort.tranches \
--rscript-file $OUTPUT_DIR/cohort.plots.R
echo "Variant recalibration process completed."
# Apply the recalibration to your cohort VCF
echo "Applying VQSR..."
gatk ApplyVQSR \
-R $REF_GENOME \
-V $OUTPUT_DIR/cohort.g.vcf.gz \
--truth-sensitivity-filter-level 99.7 \
--tranches-file $OUTPUT_DIR/cohort.tranches \
--recal-file $OUTPUT_DIR/cohort.recal \
--mode SNP \
-O $OUTPUT_DIR/cohort.recalibrated.vcf.gz
echo "Variant recalibration and application complete."
echo "All samples processed."
Traceback:
Generating the BQSR calibration table...
cmdline: /scg/apps/software/sentieon/201808.03/libexec/driver -t 30 -r reference_genomes/hg38/hg38.fa -i
output/A069C001.marked.bam --algo QualCal -k Fixing contig names in VCF/00-All.vcf.gz... VCF/00-All_fixed.vcf.gz -q output/A069C001.recal_data.table
Error: Failed to open vcf file Fixing: No such file or directory