Question

How to extract gene ids from NCBI gene names using either efetch or python?

2

Entering edit mode

5.6 years ago

lakhujanivijay 5.9k

How to fetch gene ids (in RED) from NCBI gene names (in BLUE) using either efetch or python?

I am looking at the script from this link (reproduced below), and it does exactly the opposite of what I want: it takes gene ids and returns the gene names.

from Bio import Entrez
import sys

# Entrez Gene IDs to look up.
id_list = ["3799"]

# Contact e-mail registered with Bio.Entrez (address redacted in this post).
Entrez.email = "*****@gmail.com"

def retrieve_annotation(id_list):
    """Fetch Entrez Gene summaries for the given gene IDs.

    The IDs are uploaded with ``Entrez.epost`` and the summaries are then
    requested with ``Entrez.esummary`` using the ``WebEnv``/``QueryKey``
    pair from the epost reply.

    Args:
        id_list: Sequence of Entrez Gene IDs (strings or integers).

    Returns:
        The parsed esummary reply, exactly as produced by ``Entrez.read``.

    Exits the process with status -1 if parsing the epost reply raises
    ``RuntimeError``.
    """
    # str() lets callers pass integer IDs as well as strings.
    request = Entrez.epost("gene", id=",".join(str(gene_id) for gene_id in id_list))
    try:
        result = Entrez.read(request)
    except RuntimeError as e:
        # Single-argument print(...) behaves the same on Python 2 and 3.
        print("An error occurred while retrieving the annotations.")
        print("The error returned was %s" % e)
        sys.exit(-1)
    finally:
        request.close()

    webEnv = result["WebEnv"]
    queryKey = result["QueryKey"]
    data = Entrez.esummary(db="gene", webenv=webEnv, query_key=queryKey)
    try:
        annotations = Entrez.read(data)
    finally:
        data.close()

    # The parsed reply may nest the per-gene records under
    # DocumentSummarySet -> DocumentSummary; len(annotations) would then
    # count the single top-level key instead of the records.
    try:
        retrieved = len(annotations["DocumentSummarySet"]["DocumentSummary"])
    except (KeyError, TypeError):
        # Flat layout: the reply itself is the collection of records.
        retrieved = len(annotations)

    print("Retrieved %d annotations for %d genes" % (retrieved, len(id_list)))

    return annotations


def print_data(annotation):
    """Print the ID, symbol and name of every gene record in *annotation*.

    Two layouts are accepted:

    * a mapping that nests the records under
      ``annotation["DocumentSummarySet"]["DocumentSummary"]``, where each
      record carries its gene ID in ``record.attributes["uid"]``;
    * a flat iterable of records that each have an ``"Id"`` key.

    Args:
        annotation: Parsed esummary result in one of the layouts above.

    Raises:
        KeyError: If a record lacks ``"NomenclatureSymbol"`` or
            ``"Description"``, or has neither an ``"Id"`` key nor a
            ``"uid"`` attribute.
    """
    # Iterating the nested mapping directly would yield its key strings,
    # not gene records, so unwrap it first.
    if isinstance(annotation, dict) and "DocumentSummarySet" in annotation:
        records = annotation["DocumentSummarySet"]["DocumentSummary"]
    else:
        records = annotation

    for gene_data in records:
        if "Id" in gene_data:
            gene_id = gene_data["Id"]
        else:
            # Nested-layout records keep the gene ID as an element attribute.
            gene_id = gene_data.attributes["uid"]
        gene_symbol = gene_data["NomenclatureSymbol"]
        gene_name = gene_data["Description"]
        # Single-argument print(...) behaves the same on Python 2 and 3.
        print("ID: %s - Gene Symbol: %s - Gene Name: %s" % (gene_id, gene_symbol, gene_name))


# Fetch the summaries for the configured IDs and dump the raw parsed result.
annotation = retrieve_annotation(id_list)

# Single-argument print(...) behaves the same on Python 2 and 3.
print(annotation)

Output

python ncbi.py 
Retrieved 1 annotations for 1 genes
DictElement({u'DocumentSummarySet': DictElement({u'DbBuild': 'Build190501-0100m.1', u'DocumentSummary': [DictElement({u'Status': '0', u'NomenclatureSymbol': 'KIF5B', u'OtherDesignations': 'kinesin-1 heavy chain|conventional kinesin heavy chain|epididymis secretory protein Li 61|kinesin 1 (110-120kD)|kinesin heavy chain|ubiquitous kinesin heavy chain', u'Mim': ['602809'], u'Name': 'KIF5B', u'NomenclatureName': 'kinesin family member 5B', u'CurrentID': '0', u'GenomicInfo': [DictElement({u'ChrAccVer': 'NC_000010.11', u'ChrLoc': '10', u'ExonCount': '27', u'ChrStop': '32009009', u'ChrStart': '32056442'}, attributes={})], u'OtherAliases': 'HEL-S-61, KINH, KNS, KNS1, UKHC', u'Summary': '', u'GeneWeight': '9359', u'GeneticSource': 'genomic', u'MapLocation': '10p11.22', u'ChrSort': '10', u'ChrStart': '32009009', u'LocationHist': [DictElement({u'AssemblyAccVer': 'GCF_000001405.38', u'ChrAccVer': 'NC_000010.11', u'AnnotationRelease': '109', u'ChrStop': '32009009', u'ChrStart': '32056442'}, attributes={}), DictElement({u'AssemblyAccVer': 'GCF_000001405.33', u'ChrAccVer': 'NC_000010.11', u'AnnotationRelease': '108', u'ChrStop': '32009009', u'ChrStart': '32056442'}, attributes={}), DictElement({u'AssemblyAccVer': 'GCF_000306695.2', u'ChrAccVer': 'NC_018921.2', u'AnnotationRelease': '108', u'ChrStop': '32299659', u'ChrStart': '32347070'}, attributes={}), DictElement({u'AssemblyAccVer': 'GCF_000001405.28', u'ChrAccVer': 'NC_000010.11', u'AnnotationRelease': '107', u'ChrStop': '32009009', u'ChrStart': '32056442'}, attributes={}), DictElement({u'AssemblyAccVer': 'GCF_000306695.2', u'ChrAccVer': 'NC_018921.2', u'AnnotationRelease': '107', u'ChrStop': '32299659', u'ChrStart': '32347070'}, attributes={}), DictElement({u'AssemblyAccVer': 'GCF_000001405.25', u'ChrAccVer': 'NC_000010.10', u'AnnotationRelease': '105', u'ChrStop': '32297937', u'ChrStart': '32345370'}, attributes={}), DictElement({u'AssemblyAccVer': 'GCF_000002125.1', u'ChrAccVer': 'AC_000142.1', u'AnnotationRelease': '105', 
u'ChrStop': '32018110', u'ChrStart': '32065918'}, attributes={}), DictElement({u'AssemblyAccVer': 'GCF_000306695.2', u'ChrAccVer': 'NC_018921.2', u'AnnotationRelease': '105', u'ChrStop': '32299659', u'ChrStart': '32347070'}, attributes={})], u'Organism': DictElement({u'CommonName': 'human', u'ScientificName': 'Homo sapiens', u'TaxID': '9606'}, attributes={}), u'NomenclatureStatus': 'Official', u'Chromosome': '10', u'Description': 'kinesin family member 5B'}, attributes={u'uid': u'3799'})]}, attributes={u'status': u'OK'})}, attributes={})

ncbi entrez efetch • 3.4k views

ADD COMMENT • link updated 5.6 years ago by GenoMax 148k • written 5.6 years ago by lakhujanivijay 5.9k

score 2 · Accepted Answer · 2019-05-02

Using EntrezDirect

$ esearch -db gene -query "KIF5B [GENE] AND Homo [ORGN]" | esummary | xtract -pattern DocumentSummary -element Id
3799
3830

More generic solution (output trimmed for brevity):

$ esearch -db gene -query "KIF5B [GENE]" | esummary | xtract -pattern DocumentSummary -element Id,ScientificName
36810   Drosophila melanogaster
3799    Homo sapiens
16573   Mus musculus
117550  Rattus norvegicus
100855651       Canis lupus familiaris
595132  Sus scrofa
100038146       Xenopus tropicalis
514261  Bos taurus
696652  Macaca mulatta
100101320       Xenopus laevis
450390  Pan troglodytes
420472  Gallus gallus
103188818       Callorhinchus milii
101839615       Mesocricetus auratus