A long run time problem
2
0
Entering edit mode
7.6 years ago
elisheva ▴ 120

Hello everybody!!
I have a serious problem with my script on python.
I have to count the appearances of each dinucleotide for each cDNA, Write it into table.
Then Calculate the ratio between some specific dinucleotides and write this data into another table.
I wrote Python script that does all these.
But the problem is that on big files (500 MB) it takes hours for the algorithm to run...
Can anybody help me improve the run time - by changing the script or translate it to another language like Perl or bash.
Unfortunately I am not close with these languages...
And I understood that python is quite slow on this stuff...
Here is my script:

"""This program gets an organism's cDNA fasta file
and generates a table for the frequency of each dinucleotide for all the cDNAs
and a table for the ratio between each dinucleotides - sense/antisense."""

from Bio import SeqIO
import csv

#Initializes a list of all the posibile dinucleotides
dinucleotides = ['AA','AT','AC','AG',
               'TT','TA','TC','TG',
               'CC','CA','CT','CG',
               'GG','GA','GT','GC']

ratio = ['AA/TT','AC/GT','AG/CT',
         'TT/AA','TC/GA','TG/CA',
         'CC/GG','CA/TG','CT/AG',
         'GG/CC','GA/TC','GT/AC']

def Counting(seq):
    """This function gets a cDNA and returns the frequency of each dinucleotide"""
    counting = {} #A dictionary where the dinucleotides are the keys and the counts are the values.
    #Scans all the dinucleotides and checks how many time they appear in the sequence.    
    for dinuc in dinucleotides:
        for i in range (len(seq)-2):
            pos = i+1,i+2
            if(seq[i:i+2] == dinuc):          
                if(counting.get(dinuc,None)!= None): #If the dictionary at the specific dinucleotide is not empty.
                    counting[dinuc] += 1 #Adds the number of times that the dinucleotide appears.
                else:
                    counting[dinuc] = 1 #initializes the dinucleotide with 1 instance.
            elif (seq[i:i+2] != dinuc and counting.get(dinuc,None)== None): #If the dinucleotide doesn't appear at all.
                counting[dinuc] = 0 #initializes the dinucleotide with 0 instances.

    return counting

def Ratio(c):
    """This function calculates the ratio between each dinucleotide on the sense and antisense."""
    di = dict() #A dictionary for the ratio.

    for ra in ratio:
        r = ra.split('/') #Splits the header for 2 dinucleotides.(r is a list)
        first = r[0] #The first dinucleotide.
        second = r[1] #The second dinucleotide.
        if(c[second] == 0): #In case of division at 0.
            di[ra] = None
        else:
            di[ra] = ((float)(c[first])/c[second]) #The ratio beetween the first dinucleotide and the second.
    return di




def cDNA(Id,seq):
    """This function gets a sequence and it's id and writes into csv file it's details"""

    # A dictionary with all the details: id, length, counting and ratio.
    data = {'Id': Id , 'length': len(seq) , 'dinucleotides' : Counting(seq),'ratio':Ratio(Counting(seq))}

    # build row
    list1 = list() #A list for all the data of table1.
    list2 = list() #A list for all the data of table2.

    list1.append(data['Id']) #The sequence id.    
    list1.append(data['length']) #The length of the sequence.

    #Scans all the "counting" dictionary
    for dinuc in dinucleotides:
        list1.append(data['dinucleotides'][dinuc]) #The counting of all the dinucleotides in the sequence. 

    list2.append(data['Id']) #The sequence's id. 
    for r in ratio:
        list2.append(data['ratio'][r]) #The ratio between each dinucleotides in the sequence.

    # write row to CSV handle
    writer1.writerow(list1)
    writer2.writerow(list2)



def main(path_to_file):


    with open(path_to_file, mode='r') as handle:

    # Use Biopython's parse function to process individual FASTA records. 
        for record in SeqIO.parse(handle, 'fasta'):

            # Extract individual parts of the FASTA record
            identifier = record.id #The sequence's Id.
            sequence = record.seq #The sequence itself.
            sequence = sequence.upper() #Turns all the nucleotides to upper case.
            cDNA(identifier,sequence)

    handle.close()
    table1.close()
    table2.close()
    print "well done\n"

#Generates a csv file for all the information
path_to_file = raw_input()
name = path_to_file[:path_to_file.find('.')] #The organism's name.

table1 =  (open(name + ".csv", "wb+")) #A file for the counting table.
writer1 = csv.writer(table1)
table2 = (open(name + "-ratio.csv", "wb+")) #A file for the ratio table.
writer2 = csv.writer(table2)

header_keys1 = ["Id", "length" ] + dinucleotides #The header of table 1.
header_keys2 = ["Id"] + ratio #The header of table 2
writer1.writerow(header_keys1) #Writes the header into file.
writer2.writerow(header_keys2) #Writes the header into file.

main(path_to_file)

For example (just a tiny file)

>CCE57618 cdna:annotated plasmid:HUSEC2011CHR1:pHUSEC2011-2:166:1143:1 gene:HUS2011_pII0001 gene_biotype:protein_coding transcript_biotype:protein_coding gene_symbol:repA description:replication initiation protein RepFIB
GTGGATAAGTCGTCCGGTGAGCTGGTGACACTGACACCAAACAATAACAACACCGTACAA
CCTGTGGCGCTGATGCGTCTGGGCGTCTTTGTACCGACCCTTAAGTCACTGAAGAACAGT
AAAAAAAATACACTGTCACGCACTGATGCCACGGAAGAACTGACGCGTCTTTCCCTGGCC
CGTGCAGAAGGATTCGATAAGGTTGAGATCACCGGTCCCCGGCTGGATATGGATAACGAT
TTCAAGACCTGGGTGGGGATCATTCATTCCTTTGCCCGCCATAACGTGACTGGTGACAAA
GTTGAACTGCCTTTTGTCGAGTTTGCAAAACTGTGTGGTATACCTTCAAGCCAGTCATCA
CGCAGGCTGCGTGAGCGCATCAGCCCTTCCCTGAAACGCATTGCCGGTACCGTGATCTCC
TTTTCCCGCACCGATGAGAAGCACACCCGGGAATATATCACCCATCTGGTACAGTCAGCC
TACTACGATACTGAACGGGATATTGTTCAGTTACAGGCTGATCCCCGCCTGTTTGAACTG
TACCAGTTTGACAGAAAGGTCCTTCTTCAGCTTAAGGCGATTAATGCCCTGAAGCGACGG
GAGTCCGCCCAGGCACTTTACACCTTTATAGAGAGCCTGCCCCGGGATCCGGCACCGATA
TCGCTGGCGCGGCTACGTGCCCGCCTCAATCTGAAGTCTCCGGTATTTTCCCAGAACCAG
ACGGTCAGACGGGCAATGGAGCAGTTACGTGAGATTGGATATCTTGATTACACGGAGATC
CAGCGGGGGCGAACAAAACTCTTCTGTATTCACTACCGACGTCCCCGGTTAAAAGCGCCG
AATGATGAGAGTAAGGAAAATCCGTTGCCACCTTCACCTGTGGAAAAAGTCAGTCCGGAG
ATGGCGGAGAAACTTGCCCTGCTTGAAAAACTGGGCATCACGCTGGATGACCTGGAAAAA
CTCTTCAAATCCCGCTGA
>CCE57619 cdna:annotated plasmid:HUSEC2011CHR1:pHUSEC2011-2:1219:1338:-1 gene:HUS2011_pII0002 gene_biotype:protein_coding transcript_biotype:protein_coding description:hypothetical protein
ATGTTTTATGAAGGGAGCAATGCCTCAGCATCAGGTTACGGGGTGACTCACGTAAGGGAC
AGGCAGATGGCAGCTCAGCCACAGGCAGCACTGCAGGAAACTGAATATAAACTGCAGTGA
>CCE57620 cdna:annotated plasmid:HUSEC2011CHR1:pHUSEC2011-2:1422:2162:-1 gene:HUS2011_pII0003 gene_biotype:protein_coding transcript_biotype:protein_coding description:site-specific recombinase
ATGAACAATGTCATTCCCCTGCAGAATTCACCAGAACGCGTCTCCCTGTTACCCATTGCG
CCGGGGGTGGATTTTGCAACAGCGCTCTCCCTGAGAAGAATGGCCACTTCCACGGGGGCC
ACACCGGCCTACCTGCTGGCCCCGGAAGTGAGTGCCCTTCTTTTCTATATGCCGGATCAG
CGTCACCATATGCTGTTCGCCACCCTCTGGAATACCGGAATGCGTATTGGCGAAGCCCGG
ATGCTGACACCGGAATCATTTGACCTGGATGGAGTAAGACCGTTTGTGCGGATCCAGTCC
GAAAAAGTGCGTGCGCGACGCGGACGCCCGCCAAAAGATGAAGTGCGCCTGGTTCCGCTG
ACAGATATAAGCTATGTCAGGCAGATGGAAAGCTGGATGATCACCACCCGGCCCCGTCGT
CGTGAACCATTATGGGCCGTGACCGACGAAACCATGCGCAACTGGCTGAAGCAGGCTGTC
AGACGGGCCGAAGCTGACGGGGTACACTTTTCGATTCCGGTAACACCACACACTTTCCGG
CACAGCTATATCATGCACATGCTCTATCACCGCCAGCCCCGGAAAGTCATCCAGGCACTG
GCTGGTCACAGGGATCCACGTTCGATGGAGGTCTATACACGGGTGTTTGCGCTGGATATG
GCTGCCACGCTGGCAGTGCCTTTCACAGGTGACGGACGGGATGCTGCAGAGATCCTGCGT
ACACTGCCTCCCCTGAAGTAA

My output is these 2 csv file:

counting.csv

Id,length,AA,AT,AC,AG,TT,TA,TC,TG,CC,CA,CT,CG,GG,GA,GT,GC
CCE57618,978,74,52,70,52,55,38,55,73,80,61,61,60,60,75,53,57
CCE57619,120,8,7,8,13,4,5,4,9,2,15,6,2,12,7,5,11
CCE57620,741,34,44,50,35,31,22,38,62,68,55,43,52,56,52,35,62

ratio.csv

Id,AA/TT,AC/GT,AG/CT,TT/AA,TC/GA,TG/CA,CC/GG,CA/TG,CT/AG,GG/CC,GA/TC,GT/AC
CCE57618,1.34545454545,1.32075471698,0.852459016393,0.743243243243,0.733333333333,1.19672131148,1.33333333333,0.835616438356,1.17307692308,0.75,1.36363636364,0.757142857143
CCE57619,2.0,1.6,2.16666666667,0.5,0.571428571429,0.6,0.166666666667,1.66666666667,0.461538461538,6.0,1.75,0.625
CCE57620,1.09677419355,1.42857142857,0.813953488372,0.911764705882,0.730769230769,1.12727272727,1.21428571429,0.887096774194,1.22857142857,0.823529411765,1.36842105263,0.7

Sorry for the mess, if anyone can explain how to upload an Excel file it will be very helpful....

Thanks!!!

python csv genome • 4.0k views
ADD COMMENT
0
Entering edit mode

How are you executing this? Where are you specifying the input file?

ADD REPLY
0
Entering edit mode

On this line: main(path_to_file) And then the main function calls the other functions.

ADD REPLY
0
Entering edit mode

Working with a interpreted language like python will always make your code slower. It seems that your problem could be easily paralellized. Give a look at the multiprocessing package or to GNU Parallel.

ADD REPLY
0
Entering edit mode

Is this package available on python?

ADD REPLY
0
Entering edit mode

import multithreading is intended here. "GNU Parallel" will be of no help to you, ignore that. But really, get the core script to run as quickly as possible before making things more complicated with threading.

ADD REPLY
0
Entering edit mode

O.K. I will.
any tips for that?
Or maybe I'll try to write it in another language?

ADD REPLY
0
Entering edit mode

Unless you are writing software which is for production or for using multiple times, python is perfectly fine. With some parallelization, you can easily improve on runtime. The task you are writing for is not too hard.

How many cores does your machine have available?

by changing the script or translate it to another language like Perl or bash. Unfortunately I am not close with these languages... And I understood that python is quite slow on this stuff...

Perl or bash is not going to help (or not a lot). If people talk about faster languages it's about compiled languages such as java or C.

ADD REPLY
5
Entering edit mode
7.6 years ago

Start be rewriting Counting():

def Counting(seq):
    """This function gets a cDNA and returns the frequency of each dinucleotide"""
    counting = {k: 0 for k in dinucleotides}  # This is faster than a list
    #Scan the sequence, looking for all dinucleotides at once
    for i in range(len(seq)-2):
        if seq[i:i+2] in counting:
            counting[seq[i:i+2]] += 1
    return counting
ADD COMMENT
0
Entering edit mode

I'v changed the function and I got this error:

Warning (from warnings module):
  File "C:\Python27\lib\site-packages\Bio\Seq.py", line 152
    "the new string hashing behaviour.", BiopythonWarning)
BiopythonWarning: Biopython Seq objects now use string comparison. Older versions of Biopython used object comparison. During this transition, please use hash(id(my_seq)) or my_dict[id(my_seq)] if you want the old behaviour, or use hash(str(my_seq)) or my_dict[str(my_seq)] for the new string hashing behaviour.
ADD REPLY
0
Entering edit mode

Why do you think that's an error? It explicitly says "Warning".

ADD REPLY
0
Entering edit mode

Yes, sorry. But Can this cause a problem? O.k. I'v changed it to str(sequence) And now it's good.

ADD REPLY
0
Entering edit mode

It shouldn't.

ADD REPLY
0
Entering edit mode

Update
Now it's run for a few minutes.
Thank you so much!!!!!!!

ADD REPLY
0
Entering edit mode

If this solved your issue please mark the answer from Devon as accepted.

ADD REPLY
3
Entering edit mode
7.6 years ago
Joe 21k

I've partially stolen Devon's solution, since that is where I think most of the speed gains were to be made, but it looks to me like you were getting yourself tied up with opening and closing files unecessarily as well.

This code does everything you want I think with the exception of placing headers in to the resulting csv, but that is pretty simple to resolve (see here). You'll need to test it for speed.

The with block handles all the opening and closing of files, and the DictWriter functions from csv are very useful for printing your data directly in to files.

I'm not sure that your ratio function works exactly, as the ratio file only has a single value in it - but I'm not thinking clearly enough at the moment to fix it. Hopefully this will get you on your way though.

The code produces these files: Counts.csv

CCE57618,74,70,53,52,80,55,60,60,57,52,75,73,61,61,55,38
CCE57619,8,8,5,13,2,4,2,12,11,7,7,9,6,15,4,5
CCE57620,34,50,35,35,68,31,52,56,62,44,52,62,43,55,38,22

Ratio.csv

CCE57618,0.7571428571428571
CCE57619,0.625
CCE57620,0.7

Code:

$ python dinucleotide.py --infile sample.fasta --ratiofile ratiofile.csv --countsfile counts.csv

from Bio import SeqIO
import sys
import argparse
import traceback
import warnings


dinucleotides = ['AA','AT','AC','AG',
               'TT','TA','TC','TG',
               'CC','CA','CT','CG',
               'GG','GA','GT','GC']

ratio = ['AA/TT','AC/GT','AG/CT',
         'TT/AA','TC/GA','TG/CA',
         'CC/GG','CA/TG','CT/AG',
         'GG/CC','GA/TC','GT/AC']


def parseArgs():
        """Parse commandline arguments"""

        import argparse
        try:
                parser = argparse.ArgumentParser(description='Count dinucleotide frequency in a multifasta and write the output to a CSV.')
                parser.add_argument('--infile',
                                    action='store',
                                    help='The HHpred output file to parse.')
                parser.add_argument('--ratiofile',
                                    action='store',
                                    help='Output file to store ratios in.')
                parser.add_argument('--countsfile',
                                    action='store',
                                    help='Output file to store counts in.')
        except:
                print "An exception occured with argument parsing. Check your provided options."
                traceback.print_exc()

        return parser.parse_args()



def Ratio(c):
        """This function calculates the ratio between each dinucleotide on the sense and antisense."""
        di = dict() #A dictionary for the ratio.

        for ra in ratio:
                r = ra.split('/') #Splits the header for 2 dinucleotides.(r is a list)
                first = r[0] #The first dinucleotide.
                second = r[1] #The second dinucleotide.
                if(c[second] == 0): #In case of division at 0.
                        di[ra] = None
        else:
                di[ra] = ((float)(c[first])/c[second]) #The ratio beetween the first dinucleotide and the second.

        return di


def Counting(seq):
        """This function gets a cDNA and returns the frequency of each dinucleotide"""
        counting = {k: 0 for k in dinucleotides}

        for i in range(len(seq)-2):
                if seq[i:i+2] in counting:
                        counting[seq[i:i+2]] += 1

        return counting



def main():
        import csv

        args = parseArgs()

        with open(args.infile, 'r') as ifh, open(args.ratiofile, 'wb') as rfh, open(args.countsfile, 'wb') as cfh:
                for record in SeqIO.parse(ifh, 'fasta'):
                        identifier = record.id
                        sequence = record.seq
                        sequence = sequence.upper()

                        count = Counting(sequence)

                        c = csv.DictWriter(cfh, count.keys())
                        cfh.write(identifier + ',')
                        c.writerow(count)

                        ratio = Ratio(count)
                        r = csv.DictWriter(rfh, ratio.keys())
                        rfh.write(identifier + ',')
                        r.writerow(ratio)




if __name__ == '__main__':
        main()
ADD COMMENT
0
Entering edit mode

If nothing else it's a little cleaner and invokes from the commandline ;)

ADD REPLY
0
Entering edit mode

Thanks!!!
I really appreciate it.

ADD REPLY
2
Entering edit mode

I've totally stolen jrj.healey's solution, and made it faster by using defaultdict, storing things in tuples, and only uppercasing the things you need.

Also, if you really want your mind blown, try running it in PyPy :) It will probably take seconds.

import csv
import argparse
import collections
from Bio import SeqIO
parser = argparse.ArgumentParser(description='Count dinucleotide frequency in a multifasta and write the output to a CSV.')
parser.add_argument('--infile',     action='store', help='The HHpred output file to parse.')
parser.add_argument('--ratiofile',  action='store', help='Output file to store ratios in.')
parser.add_argument('--countsfile', action='store', help='Output file to store counts in.')
args = parser.parse_args()
dinucleotides = ['AA','AT','AC','AG','TT','TA','TC','TG','CC','CA','CT','CG','GG','GA','GT','GC']
ratios = [('AA','TT'),('AC','GT'),('AG','CT'),('TT','AA'),('TC','GA'),('TG','CA'),('CC','GG'),('CA','TG'),('CT','AG'),('GG','CC'),('GA','TC'),('GT','AC')]    
with open(args.infile, 'r') as ifh, open(args.ratiofile, 'wb') as rfh, open(args.countsfile, 'wb') as cfh:
    for record in SeqIO.parse(ifh, 'fasta'):
        count = collections.defaultdict(int)
        for i in xrange(len(record.seq)-2): count[record.seq[i:i+2].upper()] += 1
        c = csv.DictWriter(cfh, count.keys())
        cfh.writerecord.id + ',')
        c.writerow(count)
        ratio = dict()
        for first,second in ratios:
            if(c[second] == 0): ratio[first,second] = None
            else:               ratio[first,second] = ((float)(count[first])/count[second]) #The ratio beetween the first dinucleotide and the second.
        r = csv.DictWriter(rfh, ratio.keys())
        rfh.writerecord.id + ',')
        r.writerow(ratio)
ADD REPLY

Login before adding your answer.

Traffic: 1579 users visited in the last hour
Help About
FAQ
Access RSS
API
Stats

Use of this site constitutes acceptance of our User Agreement and Privacy Policy.

Powered by the version 2.3.6