I pulled the code below from an old Biostars post (https://www.biostars.org/p/66921/):
import argparse
import sys
import os

import Bio.Entrez

RETMAX = 10**9
GB_EXT = ".gb"


def parse_args(arg_lst):
    parser = argparse.ArgumentParser()
    parser.add_argument("-i", "--input", type=str, required=True,
                        help="A file with accessions to download")
    parser.add_argument("-d", "--database", type=str, required=True,
                        help="NCBI database ID")
    parser.add_argument("-e", "--email", type=str, required=False,
                        default="some_email@somedomain.com",
                        help="An e-mail address")
    parser.add_argument("-b", "--batch", type=int, required=False, default=100,
                        help="The number of accessions to process per request")
    parser.add_argument("-o", "--output_dir", type=str, required=True,
                        help="The directory to write downloaded files to")
    return parser.parse_args(arg_lst)


def read_accessions(fp):
    with open(fp) as acc_lines:
        return [line.strip() for line in acc_lines]


def accessions_to_gb(accessions, db, batchsize, retmax):
    def batch(sequence, size):
        l = len(accessions)
        for start in range(0, l, size):
            yield sequence[start:min(start + size, l)]

    def extract_records(records_handle):
        buffer = []
        for line in records_handle:
            if line.startswith("LOCUS") and buffer:
                # yield accession number and record
                yield buffer[0].split()[1], "".join(buffer)
                buffer = [line]
            else:
                buffer.append(line)
        yield buffer[0].split()[1], "".join(buffer)

    def process_batch(accessions_batch):
        # get GI for query accessions
        query = " ".join(accessions_batch)
        query_handle = Bio.Entrez.esearch(db=db, term=query, retmax=retmax)
        gi_list = Bio.Entrez.read(query_handle)['IdList']
        # get GB files
        search_handle = Bio.Entrez.epost(db=db, id=",".join(gi_list))
        search_results = Bio.Entrez.read(search_handle)
        webenv, query_key = search_results["WebEnv"], search_results["QueryKey"]
        records_handle = Bio.Entrez.efetch(db=db, rettype="gb", retmax=batchsize,
                                           webenv=webenv, query_key=query_key)
        yield from extract_records(records_handle)

    accession_batches = batch(accessions, batchsize)
    for acc_batch in accession_batches:
        yield from process_batch(acc_batch)


def write_record(dir, accession, record):
    with open(os.path.join(dir, accession + GB_EXT), "w") as output:
        print(record, file=output)


def main(argv):
    args = parse_args(argv)
    accessions = read_accessions(os.path.abspath(args.input))
    op_dir = os.path.abspath(args.output_dir)
    if not os.path.exists(op_dir):
        os.makedirs(op_dir)
    dbase = args.database
    Bio.Entrez.email = args.email
    batchsize = args.batch
    for acc, record in accessions_to_gb(accessions, dbase, batchsize, RETMAX):
        write_record(op_dir, acc, record)


if __name__ == "__main__":
    main(sys.argv[1:])
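For context, and to check my own understanding, I think the Biopython part boils down to an Entrez efetch call. This is just a minimal sketch I put together for a single record; the database name "nucleotide" and the hard-coded accession are my own guesses for testing, not something from the post:

import Bio.Entrez

# Minimal standalone test: fetch one GenBank record for one accession.
Bio.Entrez.email = "some_email@somedomain.com"  # placeholder address

# "nucleotide" is my assumption here; the full script takes the database
# name from the -d argument instead of hard-coding it.
handle = Bio.Entrez.efetch(db="nucleotide", id="HQ823621",
                           rettype="gb", retmode="text")
record_text = handle.read()
handle.close()

print(record_text[:300])  # peek at the start of the record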
Part of the program I'm writing pulls about 80 FASTA files from GenBank via accession numbers. I saved the code in a file and ran this in my Windows command line:
C:\Users\mac03\AppData\Local\Programs\Python\Python37\MBSProject>python accesstofasta.py -i HQ823621 -d genbank -o C:\Users\mac03\AppData\Local\Programs\Python\Python37\MBSProject\fastafiles
This error was returned:
Traceback (most recent call last):
File "accesstofasta.py", line 90, in <module>
main(sys.argv[1:])
File "accesstofasta.py", line 77, in main
accessions = read_accessions(os.path.abspath(args.input))
File "accesstofasta.py", line 30, in read_accessions
with open(fp) as acc_lines:
FileNotFoundError: [Errno 2] No such file or directory: 'C:\\Users\\mac03\\AppData\\Local\\Programs\\Python\\Python37\\MBSProject\\HQ823621'
It seems to be looking for the accession number "HQ823621" as a file in the MBSProject folder. I had thought this program would pull directly from GenBank. I only entered one accession number because I wasn't sure how to use the program properly and figured I would try with one first.
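Looking at read_accessions, my guess is that -i wants the path to a text file with one accession per line, not the accession itself. Something like this is what I think is intended (the file name accessions.txt is just my own choice):

# My guess at the expected input: a plain text file with one accession per line.
with open("accessions.txt", "w") as acc_file:
    acc_file.write("HQ823621\n")

and then pointing the script at that file instead of the bare accession, e.g.:

python accesstofasta.py -i accessions.txt -d nucleotide -o fastafiles

(I'm also not sure "genbank" is a valid Entrez database name; "nucleotide" is my guess there, separate from the input-file issue.)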
I've been coding for about 7 weeks and have never used argparse before, so help is greatly appreciated!
Thanks, -Mac
Seems to have worked out great! Thank you.
If an answer was helpful you should upvote it; if it resolved your question, you should mark it as accepted.