Question

advice for wattersons theta calculation using vcf file

0

Entering edit mode

5 months ago

ekirsch • 0

Hello everyone,

I'm looking for advice on calculating watterson's theta values for my dataset composed of whole genome sequences. I don't have the .bam files so I can't use ANGSD. I am currently trying to use the allel module in python with the following script. However, the values I am getting after normalizing by the total length of callable sites seem too small to be correct (~2.10 x 10^-10). I am wondering if anyone has calculated wattersons theta using other tools? Also, is normalizing by the total length of callable sites necessary/ good practice? Thank you for your help in advance!

#!/usr/bin/env python3

import allel
import numpy as np
import pandas as pd
import os

# Define paths
input_dir = "/project/sedmands_1143/ekirsch/savannahsparrow/savannahsparrow/41-Passerculus/41-Passerculus/output"
output_dir = "/project/sedmands_1143/ekirsch/savannahsparrow/savannahsparrow/41-Passerculus/41-Passerculus/output"
callable_sites_file = "/project/sedmands_1143/ekirsch/savannahsparrow/savannahsparrow/41-Passerculus/41-Passerculus/callable_sites/41-Passerculus_callable_sites_map_sorted.bed"
vcf_files = ["alaudinus_modern.vcf.gz", "alaudinus_historical.vcf.gz", "brooksi_modern.vcf.gz", "rostratus_modern.vcf.gz", "nevadensis_modern.vcf.gz", "nevadensis_historical.vcf.gz", "beldingi_modern.vcf.gz", "beldingi_historical.vcf.gz"]

# Calculate the total length of callable sites
callable_sites = pd.read_csv(callable_sites_file, sep='\t', header=None, names=['chrom', 'start', 'end'])
total_callable_sites = callable_sites.apply(lambda row: row.end - row.start, axis=1).sum()
print(f"Total callable sites length: {total_callable_sites}")

# Function to calculate Watterson's theta
def calculate_wattersons_theta(vcf_file, period, subspecies):
    # Read VCF file
    vcf_path = os.path.join(input_dir, vcf_file)
    callset = allel.read_vcf(vcf_path)

    # Check if genotype data is present
    if 'calldata/GT' not in callset:
        print(f"No genotype data found in {vcf_file} for {period} {subspecies}. Skipping.")
        return

    # Create genotype array
    genotypes = allel.GenotypeArray(callset['calldata/GT'])

    # Ensure positions are sorted
    pos = callset['variants/POS']
    sort_order = np.argsort(pos)
    pos = pos[sort_order]
    genotypes = genotypes.take(sort_order, axis=0)

    # Calculate allele counts
    ac = genotypes.count_alleles()

    # Define window size and step size
    window_size = 1000
    step_size = 500

    # Calculate Watterson's theta in windows
    theta_w, windows, n_bases, counts = allel.windowed_watterson_theta(pos, ac, size=window_size, step=step_size)

    # Save raw Watterson's theta values to file
    raw_output_file = os.path.join(output_dir, f"raw_wattersons_theta_{period}_filtered_{subspecies}_py.txt")
    with open(raw_output_file, 'w') as f_raw:
        f_raw.write("CHROM\tBIN_START\tTHETA_W\n")
        for i in range(len(windows)):
            chrom = callset['variants/CHROM'][0]
            f_raw.write(f"{chrom}\t{windows[i][0]}\t{theta_w[i]}\n")

    # Normalize Watterson's theta by the total length of callable sites
    theta_w_normalized = theta_w / total_callable_sites

    # Save normalized Watterson's theta values to file
    output_file = os.path.join(output_dir, f"wattersons_theta_{period}_filtered_{subspecies}_py.txt")
    with open(output_file, 'w') as f_norm:
        f_norm.write("CHROM\tBIN_START\tTHETA_W_NORMALIZED\n")
        for i in range(len(windows)):
            chrom = callset['variants/CHROM'][0]
            f_norm.write(f"{chrom}\t{windows[i][0]}\t{theta_w_normalized[i]}\n")

    print(f"Watterson's theta calculated and saved to {raw_output_file} and {output_file}")

# Loop over each subspecies VCF file and calculate Watterson's theta
for vcf_file in vcf_files:
    subspecies_period = vcf_file.replace('.vcf.gz', '').split('_')
    subspecies

diversity vcf theta • 242 views

ADD COMMENT • link 5 months ago by ekirsch • 0