Question

Comparing quality across samples in a VCF

0

Entering edit mode

6 months ago

ekirsch • 0

Hello, I am trying to compare the quality of the samples within my vcf file. I want to compare the quality of sequencing across two metrics, the different subspecies in my file and the sample type (modern vs. historical). I have been using this script, but when I run it, the output files say that the mean and standard deviation for quality is the same across all the subspecies and in both types of samples which I know is not the case. In the script, I start with a vcf file that contains all the samples, and then filter it based on lists of samples within each subspecies and within each sample type. I am trying to look at the QUAL scores and the QD scores (but if anyone has suggestions for other metrics to look at please let me know).

# Define the directory and file paths
VCF_DIR="/project/sedmands_1143/ekirsch/savannahsparrow/savannahsparrow/41-Passerculus/41-Passerculus/data"
ALL_SAMPLES_VCF="lineremoved_GATK_bgzip_filtered_passerculus.vcf.gz"  # Your comprehensive VCF file
SUBSPECIES_DIR="/project/sedmands_1143/ekirsch/savannahsparrow/savannahsparrow/41-Passerculus/41-Passerculus"
TIME_PERIOD_DIR="/project/sedmands_1143/ekirsch/savannahsparrow/savannahsparrow/41-Passerculus/41-Passerculus"
OUTPUT_DIR="/project/sedmands_1143/ekirsch/savannahsparrow/savannahsparrow/41-Passerculus/41-Passerculus/output"

# Subspecies list files
SUBSPECIES=("alaudinus" "brooksi" "beldingi" "rostratus" "nevadensis")

# Load bcftools (modify this based on your environment)
module load bcftools

# Function to filter the VCF file by subspecies and time period
filter_vcf_by_subspecies_and_time() {
    local subspecies_list=$1
    local time_period_list=$2
    local vcf_file=$3
    local output_vcf=$4
    bcftools view -S $subspecies_list -S $time_period_list $vcf_file -Oz -o $output_vcf
}

# Function to calculate quality stats for QUAL and QD
calculate_quality_stats() {
    local vcf_file=$1
    local output_file=$2
    bcftools query -f '%QUAL\t%QD\n' $vcf_file > $output_file
    awk '
    BEGIN { sum_qual=0; sumsq_qual=0; sum_qd=0; sumsq_qd=0; count=0 }
    {
        sum_qual += $1;
        sumsq_qual += ($1)^2;
        sum_qd += $2;
        sumsq_qd += ($2)^2;
        count++;
    }
    END {
        if (count > 0) {
            mean_qual = sum_qual / count;
            variance_qual = (sumsq_qual - (sum_qual^2)/count) / (count - 1);
            stddev_qual = sqrt(variance_qual);
            mean_qd = sum_qd / count;
            variance_qd = (sumsq_qd - (sum_qd^2)/count) / (count - 1);
            stddev_qd = sqrt(variance_qd);
            print "Mean QUAL:", mean_qual;
            print "Standard Deviation QUAL:", stddev_qual;
            print "Mean QD:", mean_qd;
            print "Standard Deviation QD:", stddev_qd;
        } else {
            print "No data available.";
        }
    }
    ' $output_file
}

# Loop through each subspecies and filter by time period, then calculate quality scores
for subspecies in "${SUBSPECIES[@]}"; do
    subspecies_list="${SUBSPECIES_DIR}/${subspecies}_list.txt"

    # Process historical samples
    historical_output_vcf="${OUTPUT_DIR}/${subspecies}_historical.vcf.gz"
    filter_vcf_by_subspecies_and_time $subspecies_list "${TIME_PERIOD_DIR}/historical_samples.txt" "$VCF_DIR/$ALL_SAMPLES_VCF" $historical_output_vcf
    calculate_quality_stats $historical_output_vcf "${OUTPUT_DIR}/quality_scores_${subspecies}_historical.txt" > "${OUTPUT_DIR}/stats_${subspecies}_historical_quality.txt"

    # Process modern samples
    modern_output_vcf="${OUTPUT_DIR}/${subspecies}_modern.vcf.gz"
    filter_vcf_by_subspecies_and_time $subspecies_list "${TIME_PERIOD_DIR}/modern_samples.txt" "$VCF_DIR/$ALL_SAMPLES_VCF" $modern_output_vcf
    calculate_quality_stats $modern_output_vcf "${OUTPUT_DIR}/quality_scores_${subspecies}_modern.txt" > "${OUTPUT_DIR}/stats_${subspecies}_modern_quality.txt"
done

echo "Quality score calculation completed."

If anyone can find an error in my script or suggest another way to compare quality scores that would be amazing. Thank you!

vcf • 333 views

ADD COMMENT • link updated 5 months ago by Ram 44k • written 6 months ago by ekirsch • 0

score 1 · Answer 1 · 2024-06-04

1

Entering edit mode

5 months ago

Pierre Lindenbaum 164k

may be you could 'just' use datamash ?

bcftools query -f '%QUAL\t%QD'  in.bcf | LC_ALL=C datamash  mean 1 sstdev 1 mean 2 sstdev 2
105122.18351    373919.66838    15.71859    6.1045297111977

ADD COMMENT • link 5 months ago by Pierre Lindenbaum 164k