I wrote a function that may be of some help:
def _extract_fields(var_set, fields):
    """Apply every rule in ``var_set`` to one already-split line.

    Returns a dict holding only the variables whose rule matched; rules that
    find nothing are skipped rather than raising.
    """
    found = {}
    for name, rule in var_set.items():
        if isinstance(rule, str):
            # Prefix rule ("variablename=value"): take the first field that
            # starts with the prefix and strip only that leading prefix, so a
            # value that happens to contain the prefix text is left intact.
            match = next((f for f in fields if f.startswith(rule)), None)
            if match is not None:
                found[name] = match[len(rule):]
        elif isinstance(rule, int) and not isinstance(rule, bool):
            # Positional rule: the value is the field at a fixed index.
            if -len(fields) <= rule < len(fields):
                found[name] = fields[rule]
        else:
            # Membership rule: the value is the first field that appears in
            # the collection of allowed values.
            try:
                match = next((f for f in fields if f in rule), None)
            except TypeError:
                # The rule does not support membership tests; treat it as
                # "no match" to keep the parser best-effort.
                continue
            if match is not None:
                found[name] = match
    return found


def pull_vars(var_set, line_start, line, multi=False):
    """Parse variables out of the flat-file lines that start with a prefix.

    Each line is split on ``' | '`` and every entry of ``var_set`` is applied
    to the resulting fields in one of three ways, chosen by the rule's type:

    1. ``str``  - the first field starting with that prefix, prefix removed
       (for fields written as ``variablename=value``).
    2. ``int``  - the field at that position within the line.
    3. other    - the first field contained in the given collection of
       possible values.

    Args:
        var_set: Mapping of output variable name to extraction rule.
        line_start: Only lines starting with this string are parsed.
        line: Iterable of the text lines to search.
        multi: When true, parse every matching line and return a list of
            dicts; otherwise parse only the first matching line.

    Returns:
        ``None`` when no line starts with ``line_start``; a list of dicts
        when ``multi`` is true; a single dict otherwise. Variables whose
        rule matched nothing are absent from the dict(s).
    """
    matching = [row.split(' | ') for row in line if row.startswith(line_start)]
    if not matching:
        return None
    if multi:
        # The same line type can occur several times: one dict per line.
        return [_extract_fields(var_set, fields) for fields in matching]
    # The line type is expected to be unique: use its first occurrence.
    return _extract_fields(var_set, matching[0])
def get_snp(q):
    """Fetch SNP flat-file records from Entrez and parse them into dicts.

    Args:
        q: List of SNP identifier strings; they are joined with commas and
            sent to ``Entrez.efetch`` as a single request.

    Returns:
        A dict keyed by the first ``' | '``-separated field of each record's
        first line. Each value is a dict with the keys ``'rs'``, ``'ss'``,
        ``'SNP'``, ``'CLINSIG'``, ``'GMAF'``, ``'CTG'``, ``'LOC'`` and
        ``'SEQ'`` holding whatever ``pull_vars`` returns for that line type:
        a single dict for ``rs``/``SNP``/``CLINSIG``/``GMAF``, a list of
        dicts for ``ss``/``CTG``/``LOC``/``SEQ``, or ``None`` when the record
        has no such line.
    """
    # Parsing rules per line type: (line prefix, may repeat?, rules).
    # Details of rs flat files available here:
    # ftp://ftp.ncbi.nlm.nih.gov/snp/specs/00readme.txt
    # Built once per call because the rules do not depend on the record.
    line_specs = (
        # rs line variables
        ("rs", False, {"organism": 1,
                       "taxId": 2,
                       "snpClass": 3,
                       "genotype": "genotype=",
                       "rsLinkout": "submitterlink=",
                       "date": "updated "}),
        # ss line variables
        ("ss", True, {"ssId": 0,
                      "handle": 1,
                      "locSnpId": 2,
                      "orient": "orient=",
                      "exemplar": "ss_pick="}),
        # SNP line variables
        ("SNP", False, {"observed": "alleles=",
                        "value": "het=",
                        "stdError": "se(het)=",
                        "validated": "validated=",
                        "validProbMin": "min_prob=",
                        "validProbMax": "max_prob=",
                        "validation": "suspect=",
                        "AlleleOrigin": ['unknown', 'germline', 'somatic', 'inherited',
                                         'paternal', 'maternal', 'de-novo', 'bipaternal',
                                         'unipaternal', 'not-tested', 'tested-inconclusive'],
                        # NOTE(review): the trailing ';' in 'ambiguous-location;'
                        # is kept as found - confirm against the flat-file spec.
                        "snpType": ['notwithdrawn', 'artifact', 'gene-duplication',
                                    'duplicate-submission', 'notspecified',
                                    'ambiguous-location;', 'low-map-quality']}),
        # CLINSIG line variables
        ("CLINSIG", False, {"ClinicalSignificance": ['probable-pathogenic', 'pathogenic', 'other']}),
        # GMAF line variables
        ("GMAF", False, {"allele": "allele=",
                         "sampleSize": "count=",
                         "freq": "MAF="}),
        # CTG line variables
        ("CTG", True, {"groupLabel": "assembly=",
                       "chromosome": "chr=",
                       "physmapInt": "chr-pos=",
                       "asnFrom": "ctg-start=",
                       "asnTo": "ctg-end=",
                       "loctype": "loctype=",
                       "orient": "orient="}),
        # LOC line variables
        ("LOC", True, {"symbol": 1,
                       "geneId": "locus_id=",
                       "fxnClass": "fxn-class=",
                       "allele": "allele=",
                       "readingFrame": "frame=",
                       "residue": "residue=",
                       "aaPosition": "aa_position="}),
        # SEQ line variables
        ("SEQ", True, {"gi": 1,
                       "source": "source-db=",
                       "asnFrom": "seq-pos=",
                       "orient": "orient="}),
    )

    handle = Entrez.efetch(db='SNP', id=','.join(q), rettype='flt', retmode='flt')
    try:
        response = handle.read()
    finally:
        # Release the connection even if reading fails.
        handle.close()

    r = {}  # Return dictionary variable
    # Records in the flat-file response are separated by blank lines.
    for snp_info in response.split('\n\n'):
        if not snp_info.strip():
            # Skip empty / whitespace-only chunks instead of recording them
            # under an empty identifier.
            continue
        snp = snp_info.split('\n')
        # The record identifier is the first field of the 'rs' line.
        rsId = snp[0].split(" | ")[0]
        r[rsId] = {
            prefix: pull_vars(rules, prefix, snp, repeats)
            for prefix, repeats, rules in line_specs
        }
    return r
# Example usage: fetch and parse the flat-file records for two SNP ids.
# NOTE(review): requires `Entrez` to be in scope (presumably Biopython's
# `Bio.Entrez`) - no import is shown here; confirm before running.
snp = get_snp(["12009","122"])
The SNP database uses quite different XML from the rest of NCBI. Check out this question for some workarounds: http://biostar.stackexchange.com/questions/12262/find-amino-acid-change-for-snp-using-eutils
Brilliant! That's exactly what I needed - can't believe I didn't find it on my first search. Or think about using a different parser :P