advice for wattersons theta calculation using vcf file
0
0
Entering edit mode
5 months ago
ekirsch • 0

Hello everyone,

I'm looking for advice on calculating watterson's theta values for my dataset composed of whole genome sequences. I don't have the .bam files so I can't use ANGSD. I am currently trying to use the allel module in python with the following script. However, the values I am getting after normalizing by the total length of callable sites seem too small to be correct (~2.10 x 10^-10). I am wondering if anyone has calculated wattersons theta using other tools? Also, is normalizing by the total length of callable sites necessary/ good practice? Thank you for your help in advance!

#!/usr/bin/env python3

import allel
import numpy as np
import pandas as pd
import os

# Define paths
input_dir = "/project/sedmands_1143/ekirsch/savannahsparrow/savannahsparrow/41-Passerculus/41-Passerculus/output"
output_dir = "/project/sedmands_1143/ekirsch/savannahsparrow/savannahsparrow/41-Passerculus/41-Passerculus/output"
callable_sites_file = "/project/sedmands_1143/ekirsch/savannahsparrow/savannahsparrow/41-Passerculus/41-Passerculus/callable_sites/41-Passerculus_callable_sites_map_sorted.bed"
vcf_files = ["alaudinus_modern.vcf.gz", "alaudinus_historical.vcf.gz", "brooksi_modern.vcf.gz", "rostratus_modern.vcf.gz", "nevadensis_modern.vcf.gz", "nevadensis_historical.vcf.gz", "beldingi_modern.vcf.gz", "beldingi_historical.vcf.gz"]

# Calculate the total length of callable sites
callable_sites = pd.read_csv(callable_sites_file, sep='\t', header=None, names=['chrom', 'start', 'end'])
total_callable_sites = callable_sites.apply(lambda row: row.end - row.start, axis=1).sum()
print(f"Total callable sites length: {total_callable_sites}")

# Function to calculate Watterson's theta
def calculate_wattersons_theta(vcf_file, period, subspecies):
    # Read VCF file
    vcf_path = os.path.join(input_dir, vcf_file)
    callset = allel.read_vcf(vcf_path)

    # Check if genotype data is present
    if 'calldata/GT' not in callset:
        print(f"No genotype data found in {vcf_file} for {period} {subspecies}. Skipping.")
        return

    # Create genotype array
    genotypes = allel.GenotypeArray(callset['calldata/GT'])

    # Ensure positions are sorted
    pos = callset['variants/POS']
    sort_order = np.argsort(pos)
    pos = pos[sort_order]
    genotypes = genotypes.take(sort_order, axis=0)

    # Calculate allele counts
    ac = genotypes.count_alleles()

    # Define window size and step size
    window_size = 1000
    step_size = 500

    # Calculate Watterson's theta in windows
    theta_w, windows, n_bases, counts = allel.windowed_watterson_theta(pos, ac, size=window_size, step=step_size)

    # Save raw Watterson's theta values to file
    raw_output_file = os.path.join(output_dir, f"raw_wattersons_theta_{period}_filtered_{subspecies}_py.txt")
    with open(raw_output_file, 'w') as f_raw:
        f_raw.write("CHROM\tBIN_START\tTHETA_W\n")
        for i in range(len(windows)):
            chrom = callset['variants/CHROM'][0]
            f_raw.write(f"{chrom}\t{windows[i][0]}\t{theta_w[i]}\n")

    # Normalize Watterson's theta by the total length of callable sites
    theta_w_normalized = theta_w / total_callable_sites

    # Save normalized Watterson's theta values to file
    output_file = os.path.join(output_dir, f"wattersons_theta_{period}_filtered_{subspecies}_py.txt")
    with open(output_file, 'w') as f_norm:
        f_norm.write("CHROM\tBIN_START\tTHETA_W_NORMALIZED\n")
        for i in range(len(windows)):
            chrom = callset['variants/CHROM'][0]
            f_norm.write(f"{chrom}\t{windows[i][0]}\t{theta_w_normalized[i]}\n")

    print(f"Watterson's theta calculated and saved to {raw_output_file} and {output_file}")

# Loop over each subspecies VCF file and calculate Watterson's theta
for vcf_file in vcf_files:
    subspecies_period = vcf_file.replace('.vcf.gz', '').split('_')
    subspecies
diversity vcf theta • 242 views
ADD COMMENT

Login before adding your answer.

Traffic: 1862 users visited in the last hour
Help About
FAQ
Access RSS
API
Stats

Use of this site constitutes acceptance of our User Agreement and Privacy Policy.

Powered by the version 2.3.6