Hey,
I have a script that extract features from a large fasta file (1767 MB) using biopython.
I am sending it as a bash job via ssh remote server. The job is running for two days now.. Is there a way to optimize my script?
I think maybe the problem is in genome_file.seek(0)
import sys
from Bio import SeqIO
import pandas as pd
from Bio.SeqFeature import SeqFeature, FeatureLocation
class target_features():
def __init__(self, table_file, male_ref_genome, output_file_name):
self.features_table = df_from_features_table(table_file)
self.genome_file = male_ref_genome
self.output = output_file_name
def extract_target(self):
"""Grab ten first mRNA features from the features_df
get the matching chromosome seq from the genome file
create a feature object and write it to an output fasta file. """
out = open(self.output, "a")
genome_file = open(self.genome_file)
genome_sequence = SeqIO.parse(genome_file, 'fasta')
#mrna_counter = 0
for index, row in self.features_table.iterrows():
#if mrna_counter >= 11:
# break
if row['# feature'] == 'mRNA':
#mrna_counter = mrna_counter + 1
chr = row['genomic_accession']
start = row['start']
stop = row['end']
strand = row['strand']
product_accession = row['product_accession']
related_accession = row['related_accession']
name = row['name']
assert strand == '+' or strand == '-', "strand value incorrect {}".format(strand)
strand = 1 if strand == '+' else -1
target_feature = SeqFeature(FeatureLocation(start, stop, strand=strand))
# find chromosome seq by id
for record in genome_sequence:
if record.id == chr:
target_feature.location.extract(record.seq)
line = '>{}|{}|{}|{}|{}|{}|{}|{}\n{}\n'.format(record.id, start, stop, strand, product_accession, related_accession, name, stop-start+1, target_feature.location.extract(record.seq))
out.write(line)
break
genome_file.seek(0)
genome_sequence = SeqIO.parse(genome_file, 'fasta')
def df_from_features_table(table_file):
features_df = pd.read_csv(table_file, sep="\t")
return features_df
if __name__ == '__main__':
genome = sys.argv[-3]
features_data_file = sys.argv[-2]
out = sys.argv[-1]
print('genome: ', genome)
print('feature table: ', features_data_file)
print('out: ', out+'\n')
target_features = target_features(features_data_file, genome, out)
print('AFTER CREATE target_features')
target_features.extract_target()
print('AFTER extract target')
Providing a small example of the input and the expected output usually helps to get better answers. Having said this and without knowing what the script does specifically, reading a fasta file (especially if it is big) multiple times is subptimal,
genome_sequence = SeqIO.parse(genome_file, 'fasta')
. Maybe better to give to the main function thegenome_sequence
variable directly.