Entering edit mode
5 months ago
ekirsch
•
0
Hello everyone,
I'm looking for advice on calculating watterson's theta values for my dataset composed of whole genome sequences. I don't have the .bam files so I can't use ANGSD. I am currently trying to use the allel module in python with the following script. However, the values I am getting after normalizing by the total length of callable sites seem too small to be correct (~2.10 x 10^-10). I am wondering if anyone has calculated wattersons theta using other tools? Also, is normalizing by the total length of callable sites necessary/ good practice? Thank you for your help in advance!
#!/usr/bin/env python3
import allel
import numpy as np
import pandas as pd
import os
# Define paths
input_dir = "/project/sedmands_1143/ekirsch/savannahsparrow/savannahsparrow/41-Passerculus/41-Passerculus/output"
output_dir = "/project/sedmands_1143/ekirsch/savannahsparrow/savannahsparrow/41-Passerculus/41-Passerculus/output"
callable_sites_file = "/project/sedmands_1143/ekirsch/savannahsparrow/savannahsparrow/41-Passerculus/41-Passerculus/callable_sites/41-Passerculus_callable_sites_map_sorted.bed"
vcf_files = ["alaudinus_modern.vcf.gz", "alaudinus_historical.vcf.gz", "brooksi_modern.vcf.gz", "rostratus_modern.vcf.gz", "nevadensis_modern.vcf.gz", "nevadensis_historical.vcf.gz", "beldingi_modern.vcf.gz", "beldingi_historical.vcf.gz"]
# Calculate the total length of callable sites
callable_sites = pd.read_csv(callable_sites_file, sep='\t', header=None, names=['chrom', 'start', 'end'])
total_callable_sites = callable_sites.apply(lambda row: row.end - row.start, axis=1).sum()
print(f"Total callable sites length: {total_callable_sites}")
# Function to calculate Watterson's theta
def calculate_wattersons_theta(vcf_file, period, subspecies):
# Read VCF file
vcf_path = os.path.join(input_dir, vcf_file)
callset = allel.read_vcf(vcf_path)
# Check if genotype data is present
if 'calldata/GT' not in callset:
print(f"No genotype data found in {vcf_file} for {period} {subspecies}. Skipping.")
return
# Create genotype array
genotypes = allel.GenotypeArray(callset['calldata/GT'])
# Ensure positions are sorted
pos = callset['variants/POS']
sort_order = np.argsort(pos)
pos = pos[sort_order]
genotypes = genotypes.take(sort_order, axis=0)
# Calculate allele counts
ac = genotypes.count_alleles()
# Define window size and step size
window_size = 1000
step_size = 500
# Calculate Watterson's theta in windows
theta_w, windows, n_bases, counts = allel.windowed_watterson_theta(pos, ac, size=window_size, step=step_size)
# Save raw Watterson's theta values to file
raw_output_file = os.path.join(output_dir, f"raw_wattersons_theta_{period}_filtered_{subspecies}_py.txt")
with open(raw_output_file, 'w') as f_raw:
f_raw.write("CHROM\tBIN_START\tTHETA_W\n")
for i in range(len(windows)):
chrom = callset['variants/CHROM'][0]
f_raw.write(f"{chrom}\t{windows[i][0]}\t{theta_w[i]}\n")
# Normalize Watterson's theta by the total length of callable sites
theta_w_normalized = theta_w / total_callable_sites
# Save normalized Watterson's theta values to file
output_file = os.path.join(output_dir, f"wattersons_theta_{period}_filtered_{subspecies}_py.txt")
with open(output_file, 'w') as f_norm:
f_norm.write("CHROM\tBIN_START\tTHETA_W_NORMALIZED\n")
for i in range(len(windows)):
chrom = callset['variants/CHROM'][0]
f_norm.write(f"{chrom}\t{windows[i][0]}\t{theta_w_normalized[i]}\n")
print(f"Watterson's theta calculated and saved to {raw_output_file} and {output_file}")
# Loop over each subspecies VCF file and calculate Watterson's theta
for vcf_file in vcf_files:
subspecies_period = vcf_file.replace('.vcf.gz', '').split('_')
subspecies