Entering edit mode
5.8 years ago
erick_rc93
▴
30
I'm trying to parse a multifasta file with python with the next script
import re
def loadFasta(filename):
if (filename.endswith(".gz")):
fp = gzip.open(filename, 'rb')
else:
fp = open(filename, 'rb')
# split at headers
data = fp.read().split(">")
fp.close()
# ignore whatever appears before the 1st header
data.pop(0)
headers = []
sequences = []
for sequence in data:
lines = sequence.split('\n')
headers.append(lines.pop(0))
# add an extra "+" to make string "1-referenced"
sequences.append('+' + ''.join(lines))
return (headers, sequences)
header, seq = loadFasta("/path/to/fasta/all_chromosomes.fasta")
for i in xrange(len(header)):
print (header[i])
print (len(seq[i])-1, "bases", seq[i][:30], "...", seq[i][-30:])
print
genome = seq[0]
But when I try to run the above script I get the next message error
Traceback (most recent call last):
File "parser_fasta.py", line 23, in <module>
header, seq = loadFasta("/path/to/fasta/all_chromosomes.fasta")
File "parser_fasta.py", line 10, in loadFasta
data = fp.read().split(">")
TypeError: a bytes-like object is required, not 'str'
any reason you didn't try https://biopython.org/wiki/SeqIO?