from __future__ import print_function

import sys
from itertools import islice

# Nextera index barcode reference lists (currently unused elsewhere in this script).
Nextera_S_barcodes = ['CTCTCTAT', 'GCGTAAGA', 'TATCCTCT', 'AGAGTAGA', 'GTAAGGAG', 'ACTGCATA', 'AAGGAGTA', 'CTAAGCCT']
Nextera_N_barcodes = ['TAAGGCGA', 'CGTACTAG', 'AGGCAGAA', 'TCCTGAGC', 'GGACTCCT', 'TAGGCATG', 'CTCTCTAC', 'CAGAGAGG',
                      'GCTACGCT', 'CGAGGCTG', 'AAGAGGCA', 'GTAGAGGA']


def eprint(*args, **kwargs):
    """Print to stderr so log/progress messages do not mix with the stdout tables."""
    print(*args, file=sys.stderr, **kwargs)


def levenshtein(a, b):
    """Pure-Python Levenshtein (edit) distance in O(min(len(a), len(b))) space."""
    n, m = len(a), len(b)
    if n > m:
        # Make sure n <= m, to use O(min(n, m)) space
        a, b = b, a
        n, m = m, n

    current = list(range(n + 1))
    for i in range(1, m + 1):
        previous, current = current, [i] + [0] * n
        for j in range(1, n + 1):
            add, delete = previous[j] + 1, current[j - 1] + 1
            change = previous[j - 1]
            if a[j - 1] != b[i - 1]:
                change += 1
            current[j] = min(add, delete, change)

    return current[n]
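

# Quick sanity checks (illustrative only, not executed by the pipeline):
#   levenshtein("kitten", "sitting") == 3    # classic example: 2 substitutions + 1 insertion
#   levenshtein("AAGGAGTA", "AAGGACTA") == 1 # a single substitution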

try:
    # python-Levenshtein is a C extension and considerably faster than the
    # pure-Python fallback above.
    from Levenshtein import distance
    lev_dist = distance
    eprint("Levenshtein module found, using it for speedup")
except ImportError:
    lev_dist = levenshtein
    eprint("Levenshtein module not found, using slow python alternative")


def ScanIndexFiles(I1, I2, barcodelength):
    """Read two index FASTQ files in lockstep and count every observed
    I1-I2 barcode pair, truncated to `barcodelength` bases."""
    data = {}
    tRecords = 0
    with open(I1) as i1, open(I2) as i2:
        while True:
            # A FASTQ record is 4 lines: @header, sequence, '+', quality.
            next_record_1 = list(islice(i1, 4))
            next_record_2 = list(islice(i2, 4))
            if not next_record_1:
                break
            if not next_record_2:
                eprint("New record in file 1, but not in file 2. Record %d" % (tRecords))
                return
            if next_record_1[0][0] != '@':
                eprint("Supposed to be a new record in file 1, but is not. Record %d" % (tRecords))
                return
            if next_record_2[0][0] != '@':
                eprint("Supposed to be a new record in file 2, but is not. Record %d" % (tRecords))
                return
            tBc = "%s-%s" % (next_record_1[1][:barcodelength], next_record_2[1][:barcodelength])
            if tBc not in data:
                data[tBc] = 0
            data[tBc] += 1
            tRecords += 1
            if not (tRecords % 10000):
                eprint("Processed %d sequences" % (tRecords))
    return data
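

# Illustrative call (file names and counts are hypothetical):
#   data = ScanIndexFiles("Undetermined_I1.fastq", "Undetermined_I2.fastq", 8)
#   # -> {'TAAGGCGA-CTCTCTAT': 152340, 'TAAGGCTA-CTCTCTAT': 87, ...}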


def CollapseSimilar(data, dist, minoccur):
    """Greedily collapse barcode pairs, most frequent first: if both halves of a
    pair match exactly one already-accepted half within `dist` edits, merge its
    count into that accepted pair; otherwise accept it as a new pair."""
    ndata = {}
    ndatal = {}  # left (I1) barcodes accepted so far
    ndatar = {}  # right (I2) barcodes accepted so far
    cnt = 0
    eprint("Collapsing %d barcodes into similar by distance %d" % (len(data), dist))
    for w in sorted(data, key=data.get, reverse=True):
        if data[w] > minoccur:
            tl = w.split('-')[0]
            tr = w.split('-')[1]
            il = ""
            for l in ndatal.keys():
                if lev_dist(l, tl) <= dist:
                    if il == "":
                        il = l
                    else:  # found more than one similar barcode
                        il = ""  # treat as no match - can't collapse unambiguously
                        break
            ir = ""
            for r in ndatar.keys():
                if lev_dist(r, tr) <= dist:
                    if ir == "":
                        ir = r
                    else:  # found more than one similar barcode
                        ir = ""  # treat as no match - can't collapse unambiguously
                        break
            if (ir != "") and (il != ""):
                # Both halves matched unambiguously: merge w into the accepted pair.
                bc = "%s-%s" % (il, ir)
                if bc not in ndata:
                    ndata[bc] = {}
                    ndata[bc]["count"] = data[w]
                    ndata[bc]["mismatches"] = {}
                    ndatal[il] = 1
                    ndatar[ir] = 1
                else:
                    ndata[bc]["count"] += data[w]
                ndata[bc]["mismatches"][w] = data[w]
            else:
                # No unambiguous match: accept w itself as a new barcode pair.
                bc = w
                if bc in ndata:
                    eprint("Encountered an unexpected key (%s), probably algorithm bug" % (bc))
                    return
                ndata[bc] = {}
                ndata[bc]["count"] = data[w]
                ndata[bc]["mismatches"] = {}
                ndatal[tl] = 1
                ndatar[tr] = 1
        cnt += 1
        if not (cnt % 100):
            eprint("Processed %f %%" % (100.0 * cnt / len(data)))
    return ndata
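

# Illustrative shape of the returned structure (values hypothetical): the variant
# 'TAAGGCTA-...' sits within one edit of the accepted 'TAAGGCGA-...' pair.
#   {'TAAGGCGA-CTCTCTAT': {'count': 152427,
#                          'mismatches': {'TAAGGCTA-CTCTCTAT': 87}}}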


def read_processedfile(fname):
    """Re-read a barcode-count table previously written by this script (one
    'LEFT-RIGHT count' pair per line), skipping progress lines, blank lines,
    and the 'Collapsed' separator."""
    data = {}
    with open(fname) as f:
        for l in f:
            # File iteration yields single lines, so the separator block printed
            # as "\n\n\nCollapsed\n\n" arrives as blank lines plus "Collapsed\n".
            if l.startswith("Processed") or l.startswith("Done.") or l.strip() in ("", "Collapsed"):
                continue
            d = l.strip().split(' ')
            if (len(d) != 2) or (len(d[0].split('-')) != 2) or (len(d[0].split('-')[0]) != len(d[0].split('-')[1])):
                eprint("Unknown format")
                return
            data[d[0]] = int(d[1])
    return data
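

# Expected input lines look like this (values hypothetical), i.e. the raw
# pre-"Collapsed" section of this script's stdout redirected to a file:
#   TAAGGCGA-CTCTCTAT 152340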


if __name__ == "__main__":

    if len(sys.argv) < 6:
        print("usage: Generate_runscripts_v1.py IndexFile1 IndexFile2 barcode_length allowed_mutations min_occurrences")
        sys.exit()
    I1f = sys.argv[1]
    I2f = sys.argv[2]
    NexteraBarcodeL = int(sys.argv[3])
    okmut = int(sys.argv[4])
    min_occur = int(sys.argv[5])

    data = ScanIndexFiles(I1f, I2f, NexteraBarcodeL)
    if data is None:  # a format error was already reported on stderr
        sys.exit(1)
    # Print raw pair counts, most frequent first; the iteration is sorted, so
    # everything after the first count at or below the cutoff can be skipped.
    for w in sorted(data, key=data.get, reverse=True):
        if data[w] > min_occur:
            print(w, data[w])
        else:
            break
    print("\n\n\nCollapsed\n\n")

    coldata = CollapseSimilar(data, okmut, min_occur)
    sd = {}
    for i in coldata.keys():
        sd[i] = coldata[i]["count"]
    # Print collapsed pairs, most frequent first, each followed by the variants
    # (with their counts) that were merged into it.
    for w in sorted(sd, key=sd.get, reverse=True):
        ts = ""
        for i in sorted(coldata[w]["mismatches"], key=coldata[w]["mismatches"].get, reverse=True):
            ts = "%s\t%s\t%d" % (ts, i, coldata[w]["mismatches"][i])
        print("%s\t%d\t%d\t%s" % (w, coldata[w]["count"], len(coldata[w]["mismatches"]), ts))

    eprint("Done.")

# Suggestion: sort the barcodes before counting unique occurrences, then sort
# again so that the most frequent appear at the top of the list.
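# A minimal sketch of that approach with standard coreutils (file name
# hypothetical; assumes one extracted barcode pair per line):
#   sort barcodes.txt | uniq -c | sort -rn | head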