Entering edit mode
3.1 years ago
e5cf5013
•
0
How can I find out which reading frame produces the final protein? Is there a way to check all of the reading frames?
import re
# Read a multi-FASTA file into `sequences`, a list of (description, sequence)
# pairs in file order.  After this section `descr` and `seq` hold the last
# record read; the rest of the script analyses that `seq`.
filename = input('Enter name of file to parse: ')
sequences = []
descr = None
seq = ''  # defined up front so an empty file leaves a usable (empty) value
chunks = []  # sequence lines of the record currently being read
with open(filename) as fasta_file:
    for raw_line in fasta_file:
        # strip() drops the newline (and any '\r') without eating a real
        # base when the final line has no trailing newline.
        line = raw_line.strip()
        if not line:
            continue  # tolerate blank lines instead of stopping at them
        if line.startswith('>'):
            if descr is not None:  # finish the previous record
                sequences.append((descr, ''.join(chunks)))
            descr = line[1:].strip()
            chunks = []  # start a new sequence
        elif descr is not None:
            # Sequence text that appears before any '>' header belongs to
            # no record and is ignored.
            chunks.append(line)
if descr is not None:
    seq = ''.join(chunks)
    sequences.append((descr, seq))
def find_all_starts(seq):
    """Return the positions of every start codon in a lowercase sequence.

    The result is a tuple of 0-based indices, in ascending order, at which
    the codon 'atg' begins.  Matching is case-sensitive, so a sequence
    without lowercase 'atg' yields an empty tuple.
    """
    start_codon = re.compile('atg')
    return tuple(hit.start() for hit in start_codon.finditer(seq))
# NOTE: the tuple returned here is discarded, and find_all_starts() has no
# side effects, so this call does not influence the rest of the script.
find_all_starts(seq)
def find_first_in_register_stop(seq):
    """Locate the first in-register stop codon of a lowercase sequence.

    A stop codon ('taa', 'tag' or 'tga') is in register when it ends at an
    index divisible by three, i.e. it lines up with codons counted from
    index 0 of `seq`.

    Returns the index just past that stop codon, or -1 when the sequence
    has no in-register stop codon.
    """
    stop_codon = re.compile('(taa|tag|tga)')
    in_register_ends = (
        hit.end() for hit in stop_codon.finditer(seq) if hit.end() % 3 == 0
    )
    # The second argument to next() is the "nothing found" sentinel.
    return next(in_register_ends, -1)
# NOTE: the returned index is discarded, and find_first_in_register_stop()
# has no side effects, so this call does not influence the rest of the script.
find_first_in_register_stop(seq)
def all_orfs(seq):
    """Return all ORFs of a sequence, longest first.

    The sequence is lowercased, then every start codon is paired with the
    first stop codon that is in register with it.  Start codons that share
    a stop codon with an earlier start are skipped, because the earlier
    (longer) ORF already contains them.

    Returns a tuple of (start, stop) index pairs such that
    seq[start:stop] spans the ORF including its stop codon.  Pairs are
    ordered by decreasing ORF length; ties are broken by decreasing start.
    """
    seq = seq.lower()

    # A set gives O(1) "stop already used?" checks inside the loop.
    seen_stops = set()
    # Each entry is (ORF length, ORF start, ORF stop) so that sorting the
    # tuples orders by length first.
    orfs = []
    for start in find_all_starts(seq):
        relative_stop = find_first_in_register_stop(seq[start:])
        if relative_stop == -1:
            continue  # no in-register stop codon after this start
        stop = start + relative_stop
        if stop in seen_stops:
            continue  # contained in a longer ORF found earlier
        seen_stops.add(stop)
        orfs.append((relative_stop, start, stop))

    orfs.sort(reverse=True)
    # Drop the lengths; callers only need the coordinates.
    return tuple((start, stop) for _, start, stop in orfs)
# NOTE: the returned tuple of ORFs is discarded; longest_orf() below calls
# all_orfs() itself, so this call does not influence the rest of the script.
all_orfs(seq)
def longest_orf(seq):
    """Return the longest ORF of a sequence, or '' if it has none.

    The ORF is sliced out of `seq` exactly as it was passed in, so the
    letter case of the input is preserved in the result.
    """
    candidates = all_orfs(seq)
    if not candidates:
        return ''
    # all_orfs() orders its result longest first.
    begin, end = candidates[0]
    return seq[begin:end]
# Longest ORF of `seq` (the sequence left over from reading the input file);
# this is the string handed to translate() below.
final_orf = longest_orf(seq)
# Codon -> one-letter amino acid; '_' marks a stop codon.
_CODON_TABLE = {
    'ata': 'I', 'atc': 'I', 'att': 'I', 'atg': 'M',
    'aca': 'T', 'acc': 'T', 'acg': 'T', 'act': 'T',
    'aac': 'N', 'aat': 'N', 'aaa': 'K', 'aag': 'K',
    'agc': 'S', 'agt': 'S', 'aga': 'R', 'agg': 'R',
    'cta': 'L', 'ctc': 'L', 'ctg': 'L', 'ctt': 'L',
    'cca': 'P', 'ccc': 'P', 'ccg': 'P', 'cct': 'P',
    'cac': 'H', 'cat': 'H', 'caa': 'Q', 'cag': 'Q',
    'cga': 'R', 'cgc': 'R', 'cgg': 'R', 'cgt': 'R',
    'gta': 'V', 'gtc': 'V', 'gtg': 'V', 'gtt': 'V',
    'gca': 'A', 'gcc': 'A', 'gcg': 'A', 'gct': 'A',
    'gac': 'D', 'gat': 'D', 'gaa': 'E', 'gag': 'E',
    'gga': 'G', 'ggc': 'G', 'ggg': 'G', 'ggt': 'G',
    'tca': 'S', 'tcc': 'S', 'tcg': 'S', 'tct': 'S',
    'ttc': 'F', 'ttt': 'F', 'tta': 'L', 'ttg': 'L',
    'tac': 'Y', 'tat': 'Y', 'taa': '_', 'tag': '_',
    'tgc': 'C', 'tgt': 'C', 'tga': '_', 'tgg': 'W',
}


def translate(seq):
    """Translate a DNA sequence into protein and save it as FASTA.

    The sequence is read codon by codon from index 0.  Letter case is
    ignored, so upper-case FASTA sequences translate the same as lowercase
    ones.  Stop codons become '_'; codons missing from the table (for
    example ones containing 'n') become 'X'.  If the sequence length is
    not a multiple of three, nothing is translated and the protein is ''.

    Side effect: (over)writes "my_protein.fasta" in the current directory
    with a ">CLAUD" header line followed by the protein.

    Returns the protein string.
    """
    # The table keys are lowercase; normalise so upper-case input does not
    # raise KeyError.
    seq = seq.lower()

    protein = ""
    if len(seq) % 3 == 0:
        protein = "".join(
            _CODON_TABLE.get(seq[i:i + 3], 'X')
            for i in range(0, len(seq), 3)
        )

    my_protein_file = "my_protein.fasta"
    with open(my_protein_file, "w") as translated_protein_file:
        translated_protein_file.write(">CLAUD\n")
        translated_protein_file.write(protein)
    return protein
# Translate the longest ORF.  translate() writes the protein to
# "my_protein.fasta"; the string it returns is discarded here.
translate(final_orf)