Here, I have written in Python which does the job. If you want to get more information regarding the any uniprot ID then its better to fetch xml (http://www.uniprot.org/uniprot/P02070.xml) file rather then fasta and parse the needed content using either lxml, SAX or dom. If you want then I can write xml parser too.
"""
*This script assumes that all the Uniprot accession are available so haven't written
any exception for unknown accession.
*The fasta file will be saved as O60243_411.fasta (filname_lengthofsequence.fasta)
"""
import urllib2
ids_list = ['O60243', 'Q8IZP7', 'P02070']
uniprot_url = "http://www.uniprot.org/uniprot/"
def get_fasta(id):
url_with_id = "%s%s%s" %(uniprot_url, id, ".fasta")
file_from_uniprot = urllib2.urlopen(url_with_id)
data = file_from_uniprot.read()
"""
- replace all the newlines from the file_from_uniprot by replace('\n') method
and make it one line.
- after that split the line by split('SV=') and get first second item from the list which is 1
- then compute the length of the sequence starting from 1 to length of sequence.
"""
get_only_sequence = data.replace('\n', '').split('SV=')[1]
length_of_sequence = len(get_only_sequence[1:len(get_only_sequence)])
file_output_name = "%s%s%s%s" %(id,"_", length_of_sequence, ".fasta")
with open(file_output_name, "wb") as fasta_file:
fasta_file.write(data)
print "completed"
def main():
for k,v in enumerate(ids_list):
get_fasta(v)
if __name__ == '__main__':
main()
UPDATED
My first attempt wasn't to your requirement which I misunderstood. But, I spare sometime and wrote new script and sure its what you trying to achieve.
I have tried to clear by writing precise comments with lengthy variable names. It creates three different files (fasta, fasta without signal peptide and statistics). Here is a code which I haven't done any refactoring.
import xml.etree.ElementTree as ET
import urllib2
from collections import Counter
ids_list = ['P01165', 'P53634', 'Q8CW46']
uniprot_xml_namespace = "{http://uniprot.org/uniprot}"
uniprot_url = "http://www.uniprot.org/uniprot/"
def get_xml_feature_signal_peptide(id):
url_with_id = "%s%s%s" %(uniprot_url, id, ".xml")
file_to_parse = urllib2.urlopen(url_with_id)
tree = ET.parse(file_to_parse)
"""
from the tree get feature element with attribute signal peptide and find the
position. Here its assume the length of the signal peptide will be equivalent
to end position.
"""
get_feature = tree.findall(".//"+uniprot_xml_namespace+"feature[@type='signal peptide']")
if get_feature:
feature_location_position = []
for eachitem in get_feature:
for item in eachitem.getchildren()[0]:
feature_location_position.append(item.attrib.get('position'))
get_fasta_file_and_statistics(id, feature_location_position)
else:
print "no signal peptide found for id: %s" %id
def get_fasta_file_and_statistics(id, feature):
url_with_id = "%s%s%s" %(uniprot_url, id, ".fasta")
file_from_uniprot = urllib2.urlopen(url_with_id)
data = file_from_uniprot.read()
save_fasta_name = "%s%s%s%s" % (id, "_", feature[1], ".fasta")
save_fasta_stat_name = "%s%s%s%s%s" % (id, "_", feature[1], "_stat", ".txt")
save_fasta_without_sigpep = "%s%s%s%s%s" % (id, "_", feature[1], "_without_pepsig", ".fasta")
with open(save_fasta_name, "wb") as fasta_file:
fasta_file.write(data)
print "completed writing fasta file"
peptide_seq_region = int(feature[1])
get_only_sequence = data.replace('\n', '').split('SV=')[1]
length_of_sequence = len(get_only_sequence[1:len(get_only_sequence)])
complete_sequence = get_only_sequence[1:len(get_only_sequence)]
rm_sig_pep_seq = complete_sequence[peptide_seq_region:len(complete_sequence)]
statistics = Counter(rm_sig_pep_seq)
stat_file = open(save_fasta_stat_name, 'w')
aa_one_letter = "ARNDBCEQZGHILKMFPSTWYV"
for letter in aa_one_letter:
stat_file.writelines('%s : %d \n' % (letter, statistics[letter]))
print "completed writing statistics"
stat_file.close()
accession = data.split('\n')[0]
without_sigpep = open(save_fasta_without_sigpep,'w')
without_sigpep.writelines(accession+'\n'+'\n'.join(rm_sig_pep_seq[pos:pos+100] for pos in xrange(0, len(rm_sig_pep_seq), 100)))
print "Completed writing fasta file without signal peptide residues"
without_sigpep.close()
def main():
for k,v in enumerate(ids_list):
get_xml_feature_signal_peptide(v)
if __name__ == '__main__':
main()
Hope it helps :)
From where is the value of nn obtained? It's something you know already, in the file of IDs, or something you have to calculate from the UniProt record?
It's from UniProt record under Sequence Annotation . Feature Key: Signal peptide , Positions: 1-nn, length: nn
Total number of each residue == sequence length you mean? What you want to do with total number of each residue? just to place them after filename (e.g filename_312)?
Sorry, its total count of each residue (No of Gly, Pro, Ala... in the sequence). When Fasta record is retrieved, it needs to be saved as UniProtID_nn, where nn is the length of signal peptide in the sequence as described in my comment above.