I have a large fastq file, ~40GB of short 50bp illumina reads. The first 10bp are an experiment identifier tag, 18bp follow that are the same for all reads, the remainder of the sequence is the first 22 bp of a gene. I have a script which identifies the experiment and the gene given a "decode" file. The problem is this script is so, so slow.
Any suggestions on how to speed things up would be greatly appreciated.
from collections import defaultdict
import argparse
import itertools
import os
import profile
import sys
class BarSeq( object ):
def __init__( self, fastq, decode, seqID, cwd ):
"""
parameters:
fastq = BarSeq fastq file, expected to be for a single user
decode = Gene UP_tag file
cwd = current working directory
seqID = Experiment ID_tag file
"""
self.fastq = fastq
self.dir = cwd
self.seqid = seqID
self.decode = decode
self.geneList = [] # Gene names
self.geneUpTag = self.getGeneUpTag() # key = gene names, values = list of sequence tags
self.seqIdTag = self.getSeqIdTag() # key = sampleID, values = list [ sequnce, user ]
self.exact_Results = defaultdict(dict) # dict(key=experiment) value = Dict(key = Gene, value = Counts )
self.fuzzy_Results = defaultdict(dict) # for results with 1 mismatch
def matchSeqId( self, read ):
"""
Match a sequence read to an experiment with a seqID tag
seqID tag must match start of string and be an EXACT match.
"""
for key, value in self.seqIdTag.items():
if read.startswith(value[0]):
return key
return
def matchGene( self, read ):
"""
Match GENE UPTAG to sequence read,
Tag must be an exact match
Assumes the Gene part of the sequence begins at the 29th base of the read.
If not you will need to change the base number in geneSeq = read[28:]
to an appropriate value.
"""
for key, value in self.geneUpTag.items():
geneSeq = read[28:]
if geneSeq.startswith(value[0]):
return key
return
def processFastq( self ):
"""
Open and read fastq file read by read, calling functions matchSEQID
and matchGene functions. If match found count and store result.
"""
with open( self.fastq, "rU" ) as data:
for line1, line2, line3, line4 in itertools.zip_longest(*[data]*4):
header = line1.rstrip()
read = line2.rstrip()
sign = line3.rstrip()
qual = line4.rstrip()
# EXACT MATCHING STARTS HERE
SeqID = self.matchSeqId(read) # returns sample/experiment identifer
if SeqID:
geneName = self.matchGene(read)
if geneName:
if SeqID in self.exact_Results:
if geneName in self.exact_Results[SeqID]:
self.exact_Results[SeqID][geneName] += 1
else:
self.exact_Results[SeqID][geneName] = 1
else:
self.exact_Results[SeqID][geneName] = 1
def writeTable( self, data, outName ):
"""
Write gene counts to file as a tab delimited table.
"""
sampleList = list(data.keys()) # get list to maintain order when writing table
sampleList.sort()`
with open( outName, 'w') as out:
out.write('gene')
for name in sampleList:
newName = name + "-" + self.seqIdTag[name][0]
out.write("\t%s" %(newName))
out.write("\n")
for idx in range(0, len(self.geneList)):
out.write("%s\t" %( self.geneList[idx]))
for sample in sampleList:
if self.geneList[idx] not in self.exact_Results[sample]:
out.write('0\t')
else:
out.write('%s\t' %(self.exact_Results[sample][self.geneList[idx]]))
out.write("\n")
def getSeqIdTag( self ):
"""
Get SeqID UP_tag Decode information, put in dictionary
Key = SampleID, value = list[ sequence, user_name]
"""
seqUpTag = defaultdict(list)
with open( self.seqid , 'r') as f:
for line in f:
if line.startswith('Sample'):
continue
line = line.rstrip()
items = line.split()
sampleID = ["".join(items[0:3])]
if sampleID[0] in seqUpTag:
print("Duplicate sampleID %s" %(sampleID[0]))
else:
seqUpTag[sampleID[0]].append(items[4])
seqUpTag[sampleID[0]].append(items[5])
return seqUpTag
def getGeneUpTag( self ):
"""
Get GENE UP_tag Decode information, put in dictionary,
key = GENE, value = list of sequence tags
"""
geneUpTag = defaultdict(list)
with open( self.decode, 'r') as f:
for line in f:
line = line.rstrip()
if line.startswith('Y'):
items = line.split()
geneUpTag[items[0]].append(items[1])
if items[0] not in self.geneList:
self.geneList.append(items[0])
return geneUpTag
def main():
cmdparser = argparse.ArgumentParser(description="Produce a table of gene counts from DNA Bar-Seq data.",
usage='%(prog)s -f fastq -d UP_tag_Decode file -s Seq_ID file', prog='BarSeq.py' )
cmdparser.add_argument('-f', '--Fastq' , action='store' , dest='FASTQ' , help='BarSeq fastq file' , metavar='')
cmdparser.add_argument('-d', '--Decode', action='store' , dest='DECODE', help='GENE UP_tag_Decode file', metavar='')
cmdparser.add_argument('-s', '--Seqid' , action='store' , dest='SEQID' , help='Seq_ID file (experiment ID)', metavar='')
cmdparser.add_argument('-i', '--info' , action='store_true', dest='INFO' , help='Print a more detailed description of program.')
cmdResults = vars(cmdparser.parse_args())
# if no args print help
if len(sys.argv) == 1:
print("")
cmdparser.print_help()
sys.exit(1)
cwd = os.getcwd() # current working dir
# Start processing the data
data = BarSeq(fastq, geneDecode, seqID, cwd)
data.processFastq()
data.writeTable(data.exact_Results,"Exact-Match.table")
if __name__ == "__main__":
main()
Thanks for all the information, very helpful