|
def pull_vars(var_set, line_start, line, multi=False):
    """
    Parse variables out of the flat-file lines that begin with ``line_start``.

    Every matching line is split on ' | ' into fields, and each entry of
    ``var_set`` is resolved against those fields in one of three ways,
    selected by the type of the entry's value:

    1.) str   - a "variablename=" style prefix: the first field that starts
                with the prefix is used, with the leading prefix removed.
    2.) int   - a set position within the line: the field at that index.
    3.) other - a limited set of possible values (e.g. a list): the first
                field that is a member of that collection.

    Entries that cannot be resolved are simply left out of the result.

    Args:
        var_set: dict mapping output names to a prefix, a position, or a
            collection of allowed values (see above).
        line_start: prefix used to select which lines are parsed.
        line: iterable of line strings (despite the singular name).
        multi: when true, every matching line is parsed and a list of dicts
            is returned; otherwise only the first matching line is parsed.

    Returns:
        None when no line starts with ``line_start``; otherwise a dict, or a
        list of dicts (one per matching line) when ``multi`` is true.
    """
    lineset = [x.split(' | ') for x in line if x.startswith(line_start)]
    if not lineset:
        return None

    # If the same line exists multiple times - place results into an array.
    if multi:
        return [_extract_vars(var_set, fields) for fields in lineset]

    # Else if the line is always unique, output a single dictionary.
    return _extract_vars(var_set, lineset[0])


def _extract_vars(var_set, fields):
    """Resolve every ``var_set`` entry against one line's list of fields."""
    extracted = {}
    for name, spec in var_set.items():
        if isinstance(spec, str):
            match = next((f for f in fields if f.startswith(spec)), None)
            if match is not None:
                # Slice rather than str.replace so only the leading prefix is
                # removed, not later occurrences of it inside the value.
                extracted[name] = match[len(spec):]
        elif isinstance(spec, int):
            try:
                extracted[name] = fields[spec]
            except IndexError:
                # The line has fewer fields than expected: skip this entry.
                pass
        else:
            try:
                match = next((f for f in fields if f in spec), None)
            except TypeError:
                # spec does not support membership tests: skip this entry.
                continue
            if match is not None:
                extracted[name] = match
    return extracted
|
|
|
def get_snp(q):
    """
    Fetch SNP records from Entrez and parse their flat-file data.

    Args:
        q: list of snp identifiers. A single identifier string is also
            accepted, and non-string identifiers are converted with str().

    Returns:
        dict keyed by the first ' | '-separated field of each record's first
        line (the rs id). Each value is a dict with the keys 'rs', 'ss',
        'SNP', 'CLINSIG', 'GMAF', 'CTG', 'LOC' and 'SEQ' holding the
        pull_vars() output for that line type: a dict for 'rs', 'SNP',
        'CLINSIG' and 'GMAF', a list of dicts for 'ss', 'CTG', 'LOC' and
        'SEQ', or None when the record has no line of that type.
    """
    # Details of rs flat files available here:
    # ftp://ftp.ncbi.nlm.nih.gov/snp/specs/00readme.txt

    # rs line variables
    rs_vars = {"organism": 1,
               "taxId": 2,
               "snpClass": 3,
               "genotype": "genotype=",
               "rsLinkout": "submitterlink=",
               "date": "updated "}

    # ss line variables
    ss_vars = {"ssId": 0,
               "handle": 1,
               "locSnpId": 2,
               "orient": "orient=",
               "exemplar": "ss_pick=",
               }

    # SNP line variables
    # NOTE(review): 'ambiguous-location;' carries a trailing semicolon; kept
    # as-is - confirm against the flat-file spec whether it is intended.
    SNP_vars = {"observed": "alleles=",
                "value": "het=",
                "stdError": "se(het)=",
                "validated": "validated=",
                "validProbMin": "min_prob=",
                "validProbMax": "max_prob=",
                "validation": "suspect=",
                "AlleleOrigin": ['unknown', 'germline', 'somatic', 'inherited', 'paternal', 'maternal', 'de-novo', 'bipaternal', 'unipaternal', 'not-tested', 'tested-inconclusive'],
                "snpType": ['notwithdrawn', 'artifact', 'gene-duplication', 'duplicate-submission', 'notspecified', 'ambiguous-location;', 'low-map-quality']}

    # CLINSIG line variables
    CLINSIG_vars = {"ClinicalSignificance": ['probable-pathogenic', 'pathogenic', 'other']}

    # GMAF line variables
    GMAF_vars = {"allele": "allele=",
                 "sampleSize": "count=",
                 "freq": "MAF="}

    # CTG line variables
    CTG_vars = {"groupLabel": "assembly=",
                "chromosome": "chr=",
                "physmapInt": "chr-pos=",
                "asnFrom": "ctg-start=",
                "asnTo": "ctg-end=",
                "loctype": "loctype=",
                "orient": "orient="}

    # LOC line variables
    LOC_vars = {"symbol": 1,
                "geneId": "locus_id=",
                "fxnClass": "fxn-class=",
                "allele": "allele=",
                "readingFrame": "frame=",
                "residue": "residue=",
                "aaPosition": "aa_position="}

    # SEQ line variables
    SEQ_vars = {"gi": 1,
                "source": "source-db=",
                "asnFrom": "seq-pos=",
                "orient": "orient="}

    # (result key / line prefix, variable spec, line may repeat in a record)
    line_specs = [("rs", rs_vars, False),
                  ("ss", ss_vars, True),
                  ("SNP", SNP_vars, False),
                  ("CLINSIG", CLINSIG_vars, False),
                  ("GMAF", GMAF_vars, False),
                  ("CTG", CTG_vars, True),
                  ("LOC", LOC_vars, True),
                  ("SEQ", SEQ_vars, True)]

    # A bare string would otherwise be joined character by character.
    ids = [q] if isinstance(q, str) else q
    handle = Entrez.efetch(db='SNP', id=','.join(str(i) for i in ids),
                           rettype='flt', retmode='flt')
    try:
        response = handle.read()
    finally:
        # Always release the connection, even if reading fails.
        handle.close()

    r = {}  # Return dictionary variable

    # Parse flat file response: records are separated by blank lines.
    for snp_info in response.split('\n\n'):
        # Drop blank lines so stray newlines around a record cannot create
        # an entry keyed by the empty string.
        snp = [ln for ln in snp_info.split('\n') if ln.strip()]
        if not snp:
            continue

        # The record id is the first field of the first ('rs') line.
        rsId = snp[0].split(" | ")[0]
        r[rsId] = {}
        for key, spec, repeats in line_specs:
            r[rsId][key] = pull_vars(spec, key, snp, repeats)
    return r
|
|
|
|
|
# Example usage: query Entrez for two SNP ids and keep the parsed result.
snp = get_snp(q=["12009", "122"])
There may be more than one amino acid change associated with the SNP, but you can get the annotated ones from your response by looking in the RsStruct elements (or from the HGVS descriptions on NP references in the hgvs elements). For example, calling .getElementsByTagName('hgvs') on the parsed document could be a first step. Consult some general documentation on XML DOM navigation if you need more information.
Thanks for the tip! It seems like etree can also do the job. But back to my original question: how do I get the amino acid change from this XML? I am not very familiar with XML and was relying on the Entrez parser to do the job for me. I have no experience with etree or minidom.