Python 3 + Biopython, with a command-line interface. Basically a less hardcoded, more flexible version of Leszek's code that doesn't fail when the list of accessions is so long that GI retrieval results in HTTP Error 414: Request-URI Too Large.
#! /usr/bin/env python3
import argparse
import sys
import os

import Bio.Entrez

# maximum number of UIDs to ask esearch for in a single request
RETMAX = 10**9
GB_EXT = ".gb"


def parse_args(arg_lst):
    parser = argparse.ArgumentParser()
    parser.add_argument("-i", "--input", type=str, required=True,
                        help="A file with accessions to download")
    parser.add_argument("-d", "--database", type=str, required=True,
                        help="NCBI database ID")
    parser.add_argument("-e", "--email", type=str, required=False,
                        default="some_email@somedomain.com",
                        help="An e-mail address")
    parser.add_argument("-b", "--batch", type=int, required=False, default=100,
                        help="The number of accessions to process per request")
    parser.add_argument("-o", "--output_dir", type=str, required=True,
                        help="The directory to write downloaded files to")
    return parser.parse_args(arg_lst)


def read_accessions(fp):
    # one accession per line
    with open(fp) as acc_lines:
        return [line.strip() for line in acc_lines]


def accessions_to_gb(accessions, db, batchsize, retmax):
    def batch(sequence, size):
        # split a sequence into chunks of at most `size` items
        l = len(sequence)
        for start in range(0, l, size):
            yield sequence[start:min(start + size, l)]

    def extract_records(records_handle):
        # split a multi-record GenBank stream on LOCUS lines
        buffer = []
        for line in records_handle:
            if line.startswith("LOCUS") and buffer:
                # yield accession number and record
                yield buffer[0].split()[1], "".join(buffer)
                buffer = [line]
            else:
                buffer.append(line)
        if buffer:
            yield buffer[0].split()[1], "".join(buffer)

    def process_batch(accessions_batch):
        # get GIs for query accessions
        query = " ".join(accessions_batch)
        query_handle = Bio.Entrez.esearch(db=db, term=query, retmax=retmax)
        gi_list = Bio.Entrez.read(query_handle)["IdList"]
        # get GB files via the Entrez history server
        search_handle = Bio.Entrez.epost(db=db, id=",".join(gi_list))
        search_results = Bio.Entrez.read(search_handle)
        webenv, query_key = search_results["WebEnv"], search_results["QueryKey"]
        records_handle = Bio.Entrez.efetch(db=db, rettype="gb", retmode="text",
                                           retmax=batchsize, webenv=webenv,
                                           query_key=query_key)
        yield from extract_records(records_handle)

    accession_batches = batch(accessions, batchsize)
    for acc_batch in accession_batches:
        yield from process_batch(acc_batch)


def write_record(dir, accession, record):
    # write a single GenBank record to <output_dir>/<accession>.gb
    with open(os.path.join(dir, accession + GB_EXT), "w") as output:
        print(record, file=output)


def main(argv):
    args = parse_args(argv)
    accessions = read_accessions(os.path.abspath(args.input))
    op_dir = os.path.abspath(args.output_dir)
    if not os.path.exists(op_dir):
        os.makedirs(op_dir)
    dbase = args.database
    Bio.Entrez.email = args.email
    batchsize = args.batch
    for acc, record in accessions_to_gb(accessions, dbase, batchsize, RETMAX):
        write_record(op_dir, acc, record)


if __name__ == "__main__":
    main(sys.argv[1:])
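For reference, a typical invocation might look like the line below; the script name, input file and output directory are just placeholders, and "nucleotide" is assumed as the target database:

python3 fetch_gb.py -i accessions.txt -d nucleotide -b 100 -o genbank_files -e your_email@example.com

where accessions.txt holds one accession number per line.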
Can you give some examples of your accession numbers? Where are they from?
Ex: A22237, A22239, A32021, A32022, A33397. Those are accessions from NCBI. When you post them using epost, it gives this error: "IDs contain invalid characters which was treated as delimiters." So it appears to me that epost doesn't accept non-numeric characters in the ID field. I tried changing the letters to their ASCII codes, but that didn't help.
One more thing I noticed today: all the BioXXX libraries just stop when they get an error from epost. But I noticed that along with the error, epost still returns a WebEnv and query_key. What it actually does is take the accession number, trim out the non-numeric characters, and search for the resulting GID. So A22237 turns into 22237. I don't know what to do; such a tiny problem is taking up a lot of time.
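For what it's worth, the script above sidesteps that problem by never handing the accessions to epost directly: it first resolves them to numeric UIDs with esearch and only posts those. A minimal standalone sketch of that route, assuming the "nucleotide" database and a placeholder e-mail address (adjust both to your case):

from Bio import Entrez

Entrez.email = "your_email@example.com"  # placeholder, use your own address

accessions = ["A22237", "A22239", "A32021", "A32022", "A33397"]

# esearch accepts accession numbers as a query term and returns numeric UIDs
search = Entrez.read(Entrez.esearch(db="nucleotide",
                                    term=" ".join(accessions),
                                    retmax=len(accessions)))
uids = search["IdList"]

# epost only accepts numeric UIDs, so post those instead of the raw accessions
posted = Entrez.read(Entrez.epost(db="nucleotide", id=",".join(uids)))

# fetch the GenBank records through the history server
handle = Entrez.efetch(db="nucleotide", rettype="gb", retmode="text",
                       webenv=posted["WebEnv"], query_key=posted["QueryKey"])
print(handle.read())

Whether all five of those accessions actually resolve in "nucleotide" is something to verify on your side; the point is only that the letter-containing IDs never reach epost.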