Here, I have written in Python which does the job. If you want to get more information regarding the any uniprot ID then its better to fetch xml (http://www.uniprot.org/uniprot/P02070.xml) file rather then fasta and parse the needed content using either lxml, SAX or dom. If you want then I can write xml parser too.
"""
*This script assumes that all the Uniprot accession are available so haven't written
any exception for unknown accession.
*The fasta file will be saved as O60243_411.fasta (filname_lengthofsequence.fasta)
"""
import urllib2
ids_list = ['O60243', 'Q8IZP7', 'P02070'] # or read from the file (.txt, csv)
uniprot_url = "http://www.uniprot.org/uniprot/" # constant Uniprot Namespace
def get_fasta(id):
# get the fasta record of the Uniprot ID
url_with_id = "%s%s%s" %(uniprot_url, id, ".fasta") # <http://www.uniprot.org/uniprot/O60243.fasta>
file_from_uniprot = urllib2.urlopen(url_with_id)
# file_from_uniprot = urllib2.urlopen(url_with_id).read() is better no extra variable
data = file_from_uniprot.read() # read the retrieve file
"""
- replace all the newlines from the file_from_uniprot by replace('\n') method
and make it one line.
- after that split the line by split('SV=') and get first second item from the list which is 1
- then compute the length of the sequence starting from 1 to length of sequence.
"""
get_only_sequence = data.replace('\n', '').split('SV=')[1]
length_of_sequence = len(get_only_sequence[1:len(get_only_sequence)])
file_output_name = "%s%s%s%s" %(id,"_", length_of_sequence, ".fasta")
# print file_output_name
# write file to local computer
with open(file_output_name, "wb") as fasta_file:
fasta_file.write(data)
print "completed"
def main():
# iterate over the list of IDS
for k,v in enumerate(ids_list):
get_fasta(v)
# or read from a text file
# input_file = open("myuniprot_id.txt").readlines()
# get_fasta(input_file)
if __name__ == '__main__':
main()
UPDATED
My first attempt wasn't to your requirement which I misunderstood. But, I spare sometime and wrote new script and sure its what you trying to achieve.
I have tried to clear by writing precise comments with lengthy variable names. It creates three different files (fasta, fasta without signal peptide and statistics). Here is a code which I haven't done any refactoring.
import xml.etree.ElementTree as ET
import urllib2
from collections import Counter # require Python 2.7 and higher
ids_list = ['P01165', 'P53634', 'Q8CW46']
uniprot_xml_namespace = "{http://uniprot.org/uniprot}"
uniprot_url = "http://www.uniprot.org/uniprot/" # constant Uniprot Namespace
def get_xml_feature_signal_peptide(id):
# get id and fetched the content as xml from Uniprot.
# the xml file from uniprot has all the information which is parsed using ElementTree
url_with_id = "%s%s%s" %(uniprot_url, id, ".xml") # <http://www.uniprot.org/uniprot/O60243.xml>
file_to_parse = urllib2.urlopen(url_with_id)
tree = ET.parse(file_to_parse) # parse the xml file
"""
from the tree get feature element with attribute signal peptide and find the
position. Here its assume the length of the signal peptide will be equivalent
to end position.
"""
get_feature = tree.findall(".//"+uniprot_xml_namespace+"feature[@type='signal peptide']")
if get_feature:
feature_location_position = []
for eachitem in get_feature:
for item in eachitem.getchildren()[0]:
feature_location_position.append(item.attrib.get('position'))
get_fasta_file_and_statistics(id, feature_location_position)
else:
print "no signal peptide found for id: %s" %id
def get_fasta_file_and_statistics(id, feature):
url_with_id = "%s%s%s" %(uniprot_url, id, ".fasta") # <http://www.uniprot.org/uniprot/O60243.fasta>
file_from_uniprot = urllib2.urlopen(url_with_id)
data = file_from_uniprot.read() # read the retrieve file
save_fasta_name = "%s%s%s%s" % (id, "_", feature[1], ".fasta")
save_fasta_stat_name = "%s%s%s%s%s" % (id, "_", feature[1], "_stat", ".txt")
save_fasta_without_sigpep = "%s%s%s%s%s" % (id, "_", feature[1], "_without_pepsig", ".fasta")
# write fasta file to the local computer
with open(save_fasta_name, "wb") as fasta_file:
fasta_file.write(data)
print "completed writing fasta file"
# remove signal peptide from the sequence
peptide_seq_region = int(feature[1])
get_only_sequence = data.replace('\n', '').split('SV=')[1]
length_of_sequence = len(get_only_sequence[1:len(get_only_sequence)])
complete_sequence = get_only_sequence[1:len(get_only_sequence)]
rm_sig_pep_seq = complete_sequence[peptide_seq_region:len(complete_sequence)]
statistics = Counter(rm_sig_pep_seq)
# write statistics file to local computer
stat_file = open(save_fasta_stat_name, 'w')
aa_one_letter = "ARNDBCEQZGHILKMFPSTWYV"
for letter in aa_one_letter:
stat_file.writelines('%s : %d \n' % (letter, statistics[letter]))
print "completed writing statistics"
stat_file.close()
# write fasta file without peptide signal
accession = data.split('\n')[0]
without_sigpep = open(save_fasta_without_sigpep,'w')
# write 100 residues per line and do line break
without_sigpep.writelines(accession+'\n'+'\n'.join(rm_sig_pep_seq[pos:pos+100] for pos in xrange(0, len(rm_sig_pep_seq), 100)))
print "Completed writing fasta file without signal peptide residues"
without_sigpep.close()
def main():
for k,v in enumerate(ids_list):
get_xml_feature_signal_peptide(v)
# OR
# get_xml_feature_signal_peptide('Q8CW46')
# OR
#read from a text file
#input_file = open("myuniprot_id.txt").readlines()
#get_xml_feature_signal_peptide(input_file)
if __name__ == '__main__':
main()
Hope it helps :)
From where is the value of nn obtained? It's something you know already, in the file of IDs, or something you have to calculate from the UniProt record?
It's from UniProt record under Sequence Annotation . Feature Key: Signal peptide , Positions: 1-nn, length: nn
Total number of each residue == sequence length you mean? What you want to do with total number of each residue? just to place them after filename (e.g filename_312)?
Sorry, its total count of each residue (No of Gly, Pro, Ala... in the sequence). When Fasta record is retrieved, it needs to be saved as UniProtID_nn, where nn is the length of signal peptide in the sequence as described in my comment above.