I pulled the code below from an old Biostars post (https://www.biostars.org/p/66921/):
import argparse
import sys
import os

import Bio.Entrez

RETMAX = 10**9
GB_EXT = ".gb"


def parse_args(arg_lst):
    parser = argparse.ArgumentParser()
    parser.add_argument("-i", "--input", type=str, required=True,
                        help="A file with accessions to download")
    parser.add_argument("-d", "--database", type=str, required=True,
                        help="NCBI database ID")
    parser.add_argument("-e", "--email", type=str, required=False,
                        default="some_email@somedomain.com",
                        help="An e-mail address")
    parser.add_argument("-b", "--batch", type=int, required=False, default=100,
                        help="The number of accessions to process per request")
    parser.add_argument("-o", "--output_dir", type=str, required=True,
                        help="The directory to write downloaded files to")
    return parser.parse_args(arg_lst)


def read_accessions(fp):
    with open(fp) as acc_lines:
        return [line.strip() for line in acc_lines]


def accessions_to_gb(accessions, db, batchsize, retmax):
    def batch(sequence, size):
        l = len(accessions)
        for start in range(0, l, size):
            yield sequence[start:min(start + size, l)]

    def extract_records(records_handle):
        buffer = []
        for line in records_handle:
            if line.startswith("LOCUS") and buffer:
                # yield accession number and record
                yield buffer[0].split()[1], "".join(buffer)
                buffer = [line]
            else:
                buffer.append(line)
        yield buffer[0].split()[1], "".join(buffer)

    def process_batch(accessions_batch):
        # get GI for query accessions
        query = " ".join(accessions_batch)
        query_handle = Bio.Entrez.esearch(db=db, term=query, retmax=retmax)
        gi_list = Bio.Entrez.read(query_handle)['IdList']
        # get GB files
        search_handle = Bio.Entrez.epost(db=db, id=",".join(gi_list))
        search_results = Bio.Entrez.read(search_handle)
        webenv, query_key = search_results["WebEnv"], search_results["QueryKey"]
        records_handle = Bio.Entrez.efetch(db=db, rettype="gb", retmax=batchsize,
                                           webenv=webenv, query_key=query_key)
        yield from extract_records(records_handle)

    accession_batches = batch(accessions, batchsize)
    for acc_batch in accession_batches:
        yield from process_batch(acc_batch)


def write_record(dir, accession, record):
    with open(os.path.join(dir, accession + GB_EXT), "w") as output:
        print(record, file=output)


def main(argv):
    args = parse_args(argv)
    accessions = read_accessions(os.path.abspath(args.input))
    op_dir = os.path.abspath(args.output_dir)
    if not os.path.exists(op_dir):
        os.makedirs(op_dir)
    dbase = args.database
    Bio.Entrez.email = args.email
    batchsize = args.batch
    for acc, record in accessions_to_gb(accessions, dbase, batchsize, RETMAX):
        write_record(op_dir, acc, record)


if __name__ == "__main__":
    main(sys.argv[1:])
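For context, and to check my own understanding, I think the Biopython part boils down to an Entrez efetch call. This is just a minimal sketch I put together for a single record; the database name "nucleotide" and the hard-coded accession are my own guesses for testing, not something from the post:

import Bio.Entrez

# Minimal standalone test: fetch one GenBank record for one accession.
Bio.Entrez.email = "some_email@somedomain.com"  # placeholder address

# "nucleotide" is my assumption here; the full script takes the database
# name from the -d argument instead of hard-coding it.
handle = Bio.Entrez.efetch(db="nucleotide", id="HQ823621",
                           rettype="gb", retmode="text")
record_text = handle.read()
handle.close()

print(record_text[:300])  # peek at the start of the record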
Part of the program I'm writing pulls about 80 FASTA files from GenBank via accession numbers. I saved the code in a file and ran this in my Windows command line:
C:\Users\mac03\AppData\Local\Programs\Python\Python37\MBSProject>python accesstofasta.py -i HQ823621 -d genbank -o C:\Users\mac03\AppData\Local\Programs\Python\Python37\MBSProject\fastafiles
This error was returned:
Traceback (most recent call last):
File "accesstofasta.py", line 90, in <module>
main(sys.argv[1:])
File "accesstofasta.py", line 77, in main
accessions = read_accessions(os.path.abspath(args.input))
File "accesstofasta.py", line 30, in read_accessions
with open(fp) as acc_lines:
FileNotFoundError: [Errno 2] No such file or directory: 'C:\\Users\\mac03\\AppData\\Local\\Programs\\Python\\Python37\\MBSProject\\HQ823621'
It seems to be looking for the accession number "HQ823621" as a file in the MBSProject folder. I had thought this program would pull directly from GenBank. I only entered one accession number because I wasn't sure how to use the program properly and figured I would try with one first.
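Looking at read_accessions, my guess is that -i wants the path to a text file with one accession per line, not the accession itself. Something like this is what I think is intended (the file name accessions.txt is just my own choice):

# My guess at the expected input: a plain text file with one accession per line.
with open("accessions.txt", "w") as acc_file:
    acc_file.write("HQ823621\n")

and then pointing the script at that file instead of the bare accession, e.g.:

python accesstofasta.py -i accessions.txt -d nucleotide -o fastafiles

(I'm also not sure "genbank" is a valid Entrez database name; "nucleotide" is my guess there, separate from the input-file issue.)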
I've been coding for about 7 weeks and have never used argparse before, so help is greatly appreciated!
Thanks, -Mac
Seems to have worked out great! Thank you.
If an answer was helpful you should upvote it; if it resolved your question, you should mark it as accepted.