I have 2 groups: treatment and control. We hypothesize that the treatment group has lower coverage over the 3' region of the CDS. I've mapped all of the samples to the CDS sequences, converted the alignments to BED, and now I'm kind of stuck.
There has to be a better way to statistically test whether one group of samples has differential coverage over these regions compared to the other group.
I did the following, but it seems full of assumptions (note: I hacked this together very quickly, so bear with me):
from collections import *
import sys, os, glob
import numpy as np
import pandas as pd
import soothsayer_utils as syu # This is a utility package I wrote
from scipy import stats
# Get CDS sequences
seqs = syu.read_fasta("../assembly_run-25/eco.genes.fna")
data = defaultdict(dict)

# Initialize zero-filled arrays (one position per base) for each gene in each sample
for fp in glob.glob("*.bam.bedtools.cov"): # The output of bedtools coverage
    id_sample = fp.split(".bam")[0]
    for id_gene, length in seqs.map(len).items():
        data[id_sample][id_gene] = np.zeros(length)
# Get the regions covered by each read and add 1 to those positions
for fp in glob.glob("*.bam.bedtools.cov"):
    id_sample = fp.split(".bam")[0]
    for line in syu.pv(syu.read_textfile(fp), id_sample):
        fields = line.split("\t")
        if fields[0] != ".":
            id_gene = fields[0]
            start = int(fields[1]) - 1
            end = int(fields[2])
            data[id_sample][id_gene][start:end] += 1
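For context, this is the column layout my parser is assuming for each line (my reading of the indexing above, not something I've verified against the file format): column 1 = gene/CDS id, column 2 = start, column 3 = end, with the "- 1" treating the start as 1-based. A quick illustrative check with a hypothetical line (the gene id is just borrowed from the output further down):

# Hypothetical line; columns assumed to be gene id, start (1-based?), end
line = "EG10998\t1\t150"
fields = line.split("\t")
start, end = int(fields[1]) - 1, int(fields[2])
assert (start, end) == (0, 150)  # increments positions 0..149 of EG10998's array
# Caveat: if these are plain BED intervals, the start is already 0-based and the
# "- 1" above would shift the coverage window off by one position.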
# Estimate the total number of reads per sample by counting lines in each file
sample_to_depth = defaultdict(int)
for fp in glob.glob("*.bam.bedtools.cov"):
    id_sample = fp.split(".bam")[0]
    with open(fp, "r") as f:
        for i, line in syu.pv(enumerate(f), id_sample):
            sample_to_depth[id_sample] += 1
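Counting lines assumes one line per read. Another option, if the original BAMs are sitting next to the .cov files and are indexed (that's my assumption here), is to read the mapped-read count straight from the BAM index with pysam; a sketch:

import glob
import pysam

sample_to_depth = dict()
for bam_path in glob.glob("*.bam"):
    id_sample = bam_path.split(".bam")[0]
    with pysam.AlignmentFile(bam_path, "rb") as bam:
        # .mapped pulls the number of mapped alignments from the BAM index (.bai)
        sample_to_depth[id_sample] = bam.mapped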
# Create 2 dictionaries, one per group:
# key=gene, value=list of per-base coverage arrays (one per replicate)
data_solvents = defaultdict(list)
data_treatments = defaultdict(list)
solvents = ["DH2O_BR3-PE-P42-1_S42_001", "DH2O_BR2-PE-P28-1_S28_001", "DH2O_BR1-PE-P14-1_S14_001"]
treatments = ["NU_C_12_BR2-PE-P24-1_S24_001", "NU_C_12_BR3-PE-P38-1_S38_001", "NU_C_12_BR1-PE-P10-1_S10_001"]
for id_sample, coverages in data.items():
    if id_sample in solvents:
        depth = sample_to_depth[id_sample]
        for id_gene, covs in coverages.items():
            # covs = covs/depth
            data_solvents[id_gene].append(covs)
    if id_sample in treatments:
        depth = sample_to_depth[id_sample]
        for id_gene, covs in coverages.items():
            # covs = covs/depth
            data_treatments[id_gene].append(covs)
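If I turn the commented-out covs = covs/depth back on, I'd probably scale it to something readable like coverage per million reads so the two groups end up on a comparable scale; a minimal sketch of what I mean (the 1e6 factor is just a convention, not anything principled):

# Sketch: depth-normalized per-base coverage, assuming sample_to_depth holds the
# per-sample read (line) counts computed above.
def normalize_coverage(covs, depth, scale=1e6):
    # covs: 1-D array of per-base counts for one gene in one sample
    return covs * (scale / float(depth))

# ...and then append normalize_coverage(covs, depth) instead of covs in the loop above.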
# Collapse replicates: mean per-base coverage across the samples in each group
for d in [data_solvents, data_treatments]:
    for id_gene, covs in d.items():
        A = np.stack(covs, axis=0)
        d[id_gene] = A.mean(axis=0)
# Median coverage within each quartile of the gene length (0-25%, 25-50%, 50-75%, 75-100%)
def f(x):
    n = len(x)
    p_25 = int(n*0.25)
    p_50 = int(n*0.50)
    p_75 = int(n*0.75)
    p_100 = int(n*1.0)
    percentiles = [0, p_25, p_50, p_75, p_100]
    output = [0,0,0,0]
    for i in range(len(percentiles)-1):
        output[i] = np.median(x[percentiles[i]:percentiles[i+1]])
    return np.asarray(output)
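As a sanity check on f, np.array_split gives the same idea in fewer lines (it won't be bit-identical at the bin boundaries when the length isn't divisible by 4, but it's the same quartile-median summary):

# Sketch: median per-base coverage in each quarter of the gene
def quartile_medians(x):
    return np.asarray([np.median(chunk) for chunk in np.array_split(np.asarray(x), 4)])

quartile_medians(np.arange(100))  # -> array([12., 37., 62., 87.])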
output_solvents = dict()
output_treatments = dict()
for id_gene, mean_coverage in data_solvents.items():
    output_solvents[id_gene] = f(mean_coverage)
for id_gene, mean_coverage in data_treatments.items():
    output_treatments[id_gene] = f(mean_coverage)
# Organize into DataFrames
df_solvents = pd.DataFrame(output_solvents, index=["0-25", "25-50", "50-75", "75-100"]).T.dropna(how="all", axis=0)
df_treatments = pd.DataFrame(output_treatments, index=["0-25", "25-50", "50-75", "75-100"]).T.dropna(how="all", axis=0)
output_p = dict()
output_diff = dict()
for column in df_solvents.columns:
    u = df_solvents[column]
    v = df_treatments[column]
    output_p[column] = stats.wilcoxon(u, v)[-1]
    output_diff[column] = v.median() - u.median()
output_p
# {'0-25': 2.7590531087162958e-117,
# '25-50': 5.163056594586972e-151,
# '50-75': 4.137767887055633e-149,
# '75-100': 5.673772630059623e-134}
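One assumption baked into stats.wilcoxon(u, v) is that it's a paired test, so u and v need to be the same genes in the same order. Since each DataFrame goes through dropna independently, I'd align them on shared genes first; roughly:

# Align the two groups on shared genes so row i of u and row i of v refer to the same gene
shared_genes = df_solvents.index.intersection(df_treatments.index)
for column in df_solvents.columns:
    u = df_solvents.loc[shared_genes, column]
    v = df_treatments.loc[shared_genes, column]
    output_p[column] = stats.wilcoxon(u, v)[-1]
    output_diff[column] = v.median() - u.median()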
df_solvents
# 0-25 25-50 50-75 75-100
# EG11277 0.333333 0.333333 0.333333 0.333333
# EG10998 209.333333 171.000000 250.000000 227.500000
# EG10999 241.000000 125.333333 422.666667 238.833333
# EG11000 257.000000 293.333333 246.333333 204.333333
# G6081 10.833333 61.333333 144.666667 114.333333