First, download the hierarchy as a JSON file:
https://www.kegg.jp/kegg-bin/get_htext?ko00001.keg
Alternatively, use `urllib` together with `StringIO` to pull it directly from the web.
It's a bit messy, but here's my Python code. Forgive the lack of documentation; it's pretty straightforward. I coded it up on a plane ride:
import pandas as pd
from collections import *
# Flatten the nested hierarchy read from the JSON file into one row per
# fourth-level entry: [level A name, level B name, level C name, level D name].
database = []
kegg_tree = pd.read_json("/Users/jespinoz/Downloads/ko00001.json")
for _, record in kegg_tree.iterrows():
    # Each row's "children" cell is used as a dict with "name" and "children".
    top_node = record["children"]
    cat_1 = top_node["name"]
    for second_node in top_node["children"]:
        cat_2 = second_node["name"]  # possibly the module level (unconfirmed)
        for third_node in second_node["children"]:
            cat_3 = third_node["name"]
            # Third-level nodes without a "children" key contribute no rows.
            for leaf_node in third_node.get("children", ()):
                database.append([cat_1, cat_2, cat_3, leaf_node["name"]])

df_kegg = pd.DataFrame(
    database,
    columns=["Level_A", "Level_B", "Level_C", "Level_D"],
)
def parse_ko_identifiers(x):
    """Return the set of KO identifiers found in the text ``x``.

    The text is upper-cased and split on whitespace. A token is kept when it
    is exactly six characters long, starts with "K", and its remaining five
    characters are ASCII digits (for example "K00844").

    Args:
        x: String to scan, e.g. "K00844  HK; hexokinase [EC:2.7.1.1]".

    Returns:
        A set of upper-cased identifier strings; empty when none are found.
    """
    identifiers = set()
    # split() with no argument handles tabs/newlines and runs of spaces, and
    # never yields empty tokens.
    for token in x.upper().split():
        if len(token) != 6 or not token.startswith("K"):
            continue
        digits = token[1:]
        # isnumeric()/isdigit() alone accept non-ASCII numerals such as
        # "一" or "½"; require ASCII so only 0-9 pass.
        if digits.isascii() and digits.isdigit():
            identifiers.add(token)
    return identifiers
# Attach to every row the set of KO identifiers parsed from its Level_D text.
df_kegg["Level_D-KOs"] = df_kegg["Level_D"].map(parse_ko_identifiers)

# Re-key the rows by KO identifier. A KO that occurs in several rows keeps
# only the last row encountered, because later assignments overwrite earlier
# ones for the same key.
database_expanded = {
    id_ko: entry
    for _, entry in df_kegg.iterrows()
    for id_ko in entry["Level_D-KOs"]
}

# One row per KO, one column per original df_kegg column.
df_kegg_expanded = pd.DataFrame(database_expanded).T
df_kegg_expanded.index.name = "KO"

# Turn each column label into a pair: (text before the first "-", full label).
df_kegg_expanded.columns = df_kegg_expanded.columns.map(
    lambda label: (label.partition("-")[0], label)
)

# For levels A-C, split the text at its first space into an "ID" column (the
# part before it) and a "Name" column (everything after it).
for id_cat in ["Level_A", "Level_B", "Level_C"]:
    full_text = df_kegg_expanded[(id_cat, id_cat)]
    df_kegg_expanded[(id_cat, "ID")] = full_text.map(
        lambda text: text.partition(" ")[0]
    )
    df_kegg_expanded[(id_cat, "Name")] = full_text.map(
        lambda text: text.partition(" ")[2]
    )
def f(x):
    """Return the description part of an entry string.

    When ``x`` contains "; ", everything after the FIRST "; " is returned, so
    a description that itself contains "; " is kept whole rather than being
    cut at the second separator. When ``x`` has no "; ", it is returned
    unchanged.

    Args:
        x: Entry text, e.g. "K00844  HK; hexokinase [EC:2.7.1.1]".

    Returns:
        The text following the first "; ", or ``x`` itself if there is none.
    """
    _, separator, description = x.partition("; ")
    # An empty separator means "; " was not found.
    return description if separator else x
# Derive the Level_D "Name" column by passing the full Level_D text through f.
level_d_text = df_kegg_expanded[("Level_D", "Level_D")]
df_kegg_expanded[("Level_D", "Name")] = level_d_text.map(f)

# Columns whose two labels are identical are relabelled (<level>, "Full");
# all other columns keep their labels.
df_kegg_expanded.columns = df_kegg_expanded.columns.map(
    lambda labels: labels if labels[0] != labels[1] else (labels[0], "Full")
)

# Order the columns by label.
df_kegg_expanded = df_kegg_expanded.sort_index(axis=1)
The resulting table looks like this:
Can you use MetaCyc instead?
https://biodatamining.biomedcentral.com/articles/10.1186/s13040-018-0166-8
See the file `data/pathways.info` in this repository: https://sourceforge.net/p/fun4me/code/ci/master/tree/