Hi all!
I was wondering whether anyone would have any insights into how to make the following python script faster. I need to access a set of paired end FASTQ files, and sort the reads into three distinct files depending on which adaptor sequence is present (or whether none of them is present). However, my current implementation takes over 5h+ for around 20 million reads... Would anyone have any hints into how to make it more efficient? I'm already using the low level FastqGeneralIterator which is supposed to be faster...
Thanks in advance!!
import gzip
import sys
from Bio import SeqIO
from Bio import Align
from Bio.SeqIO.QualityIO import FastqGeneralIterator
# Arguments
file1 = sys.argv[1]
file2 = sys.argv[2]
adaptor_root = sys.argv[3]
adaptor_five = sys.argv[4]
adaptor_three = sys.argv[5]
id = sys.argv[6]
cwd_path = sys.argv[7]
temp_path = sys.argv[8]
output_five_file_1 = temp_path + id + "_five_1.fastq.gz"
output_five_file_2 = temp_path + id + "_five_2.fastq.gz"
output_three_file_1 = temp_path + id + "_three_1.fastq.gz"
output_three_file_2 = temp_path + id + "_three_2.fastq.gz"
output_else_file_1 = temp_path + id + "_else_1.fastq.gz"
output_else_file_2 = temp_path + id + "_else_2.fastq.gz"
output_five_1 = gzip.open(output_five_file_1, 'wt', 1)
output_five_2 = gzip.open(output_five_file_2, 'wt', 1)
output_three_1 = gzip.open(output_three_file_1, 'wt', 1)
output_three_2 = gzip.open(output_three_file_2, 'wt', 1)
output_else_1 = gzip.open(output_else_file_1, 'wt', 1)
output_else_2 = gzip.open(output_else_file_2, 'wt', 1)
#Setup aligner with match 1, mismatch 0, local
aligner = Align.PairwiseAligner()
aligner.mode = 'local'
aligner.open_gap_score = 0
with gzip.open(file1, "rt") as handle1:
with gzip.open(file2, "rt") as handle2:
for (title1, seq1, qual1), (title2, seq2, qual2) in zip(FastqGeneralIterator(handle1), FastqGeneralIterator(handle2)):
root_alignment = aligner.align(seq2[0:30], adaptor_root)
if root_alignment.score > 20:
five_alignment = aligner.align(seq2[0:30], adaptor_five).score
three_alignment = aligner.align(seq2[0:30], adaptor_three).score
# if a 5 prime alignment is likelier, then output it there, else output to 3 prime file
if five_alignment > three_alignment:
output_five_1.write("@%s\n%s\n+\n%s\n" % (title1, seq1, qual1))
j = 30 + len(seq2[30:151]) - len(seq2[30:151].lstrip('G'))
output_five_2.write("@%s\n%s\n+\n%s\n" % (title2, seq2[j:151], qual2[j:151]))
elif three_alignment > five_alignment:
output_three_1.write("@%s\n%s\n+\n%s\n" % (title1, seq1, qual1))
output_three_2.write("@%s\n%s\n+\n%s\n" % (title2, seq2[30:151], qual2[30:151]))
else:
output_else_1.write("@%s\n%s\n+\n%s\n" % (title1, seq1, qual1))
output_else_2.write("@%s\n%s\n+\n%s\n" % (title2, seq2, qual2))
output_five_1.close()
output_five_2.close()
output_three_1.close()
output_three_2.close()
output_else_1.close()
output_else_2.close()
Not an answer for your specific question but if you are looking to get this done fast I suggest using filtering mode of
bbduk.sh
from BBMap suite withliteral=sequence
option. You can find a guide here.Sounds like a problem you can run faster using parallelization.
Unrelated: I'd suggest taking a look at the argparse module for parsing command line arguments.