|
#!/usr/bin/env python |
|
|
|
# Install into new environment and activate it before running this script: |
|
# |
|
# conda create -n biostars-212519 python=3 requests numpy pandas |
|
# source activate biostars-212519 |
|
# python identify_encode_controls.py |
|
|
|
import pandas |
|
import numpy as np |
|
import requests |
|
|
|
|
|
def accession_metadata(acc, timeout=60):
    """
    Returns the metadata for ENCODE accession `acc` (e.g., ENCSR000BJN).

    The experiment is requested from encodeproject.org as JSON with
    `frame=object`, and the undecoded response body (bytes) is returned.

    `timeout` is the number of seconds passed to `requests.get`; without one
    a single stalled connection would block the script indefinitely.

    Raises `requests.HTTPError` if the server answers with a 4xx/5xx status,
    and `requests.Timeout` if it does not answer within `timeout`.
    """
    headers = {'accept': 'application/json'}
    url = (
        'https://www.encodeproject.org/experiments/{0}/?frame=object'
        .format(acc)
    )
    response = requests.get(url, headers=headers, timeout=timeout)

    # Fail here, with the offending URL in the message, instead of handing an
    # error body to the caller as if it were experiment metadata.
    response.raise_for_status()
    return response.content
|
|
|
# To obtain a URL like this one, narrow down a search on encodeproject.org,
# press the "download" button, and copy the first line of the file you get.
# The query used here selects all HepG2 ChIP-seq data.
metadata_url = (
    "https://www.encodeproject.org/metadata/type=Experiment&"
    "biosample_term_name=HepG2&assay_title=ChIP-seq&limit=all/metadata.tsv"
)

# The metadata file is tab-delimited. Only the first 100 rows are kept for
# this example.
df = pandas.read_csv(metadata_url, sep='\t')
df = df.head(100)
|
|
|
|
|
def find_controls(acc):
    """
    Returns the list of control accessions for ENCODE accession `acc`.

    The metadata for an accession contains a "possible_controls" field. I'm
    taking that to mean there can be multiple controls, so to be safe a list
    is always returned.

    Raises `KeyError` if the metadata has no "possible_controls" field, and
    `ValueError` if the response body is not valid JSON.
    """
    # Imported locally so this function does not depend on edits elsewhere in
    # the file.
    import json

    # The response is one JSON object, so parse it with the standard library
    # rather than feeding literal JSON to pandas.read_json.
    metadata = json.loads(accession_metadata(acc))

    # Each entry is a path such as "/experiments/ENCSR000XXX/"; splitting on
    # "/" leaves the accession at index 2.
    return [path.split('/')[2] for path in metadata['possible_controls']]
|
|
|
|
|
# Several rows of the metadata share the same accession. Querying each
# distinct accession only once is dramatically faster; the results are
# attached to the full dataframe afterwards.
ds = []
for acc in df['Experiment accession'].unique():
    print('getting metadata for accession:', acc)
    row = {'Experiment accession': acc}
    row['controls'] = find_controls(acc)
    ds.append(row)

# One row per accession, indexed by accession so it can serve as the
# right-hand side of the join.
controls = pandas.DataFrame(ds)
controls = controls.set_index('Experiment accession')

# Attach the controls to every row of the full metadata.
df = df.join(controls, on='Experiment accession')
You'd have to go through them manually; each experiment has a control dataset specified. Well ... almost all of them do. ENCODE is funny sometimes, and this right here is one of the most frustrating reasons why.
That's the only way I have found that works as well. The problem is that it will take more time than I'm willing to spend to go through all the files I have downloaded and find their control experiments manually. I really hope there is another way!