KEGG pathway modules
1
0
Entering edit mode
3.0 years ago

Hi everyone,

I'm looking to create a table with genes (ENSEMBL ID or symbol) in the first column and, in the second column, all the pathway modules in which each gene is involved. For example, this should be the result:

| Gene 1 | module 1, module 2 , ... |

| Gene 2 | module 5, module 7 , ... |

| Gene 3 | module 1, module 9 , ... |

Is there a way to obtain such a table from KEGG in Python or in R?

Thank you very much in advance everyone.

KEGG R python • 1.9k views
ADD COMMENT
0
Entering edit mode

I have a solution for pathway IDs in R that involves inverting one of the pathway to gene data structures from the gage library, but it doesn't address Module IDs.

ADD REPLY
1
Entering edit mode
3.0 years ago

Maybe something like this? It will return all genes in all KEGG pathways. If you are interested in a subset of genes, subset the finalDF with your genes of interest. NOTE: As per the comment below, updated on 2021-11-29

library(org.Hs.eg.db)
library(dplyr)
library(tidyr)
library(jsonlite)

# Build a long gene-to-pathway table from the KEGG map shipped with org.Hs.eg.db.
kegg_map <- org.Hs.egPATH
# Keep only the Entrez gene identifiers that are mapped to a KEGG pathway ID
annotated_ids <- mappedkeys(kegg_map)
# One list element per Entrez ID, holding that gene's pathway IDs
pathways_by_gene <- as.list(kegg_map[annotated_ids])
# Stack the list into rows: column 1 carries the Entrez ID, column 2 the pathway ID
gene_pathway_rows <- plyr::ldply(pathways_by_gene, data.frame)
# Pathway IDs get the "map" prefix; mappedDF is joined on KEGG_ID further down
mappedDF <- data.frame(
  ENTREZ_ID = as.numeric(gene_pathway_rows[[1]]),
  KEGG_ID = paste0("map", gene_pathway_rows[[2]])
)

# Retrieving KEGG module data
# The module hierarchy (htext ko00002, JSON format) is downloaded once to the
# home directory and then parsed from that local copy, so the server is only
# contacted a single time.
url <- "https://www.genome.jp/kegg-bin/download_htext?htext=ko00002&format=json&filedir="
kegg_json_file <- path.expand("~/keggM.json")
download_status <- download.file(url, destfile = kegg_json_file, method = "curl")
# download.file() signals failure through a non-zero return code
if (download_status != 0) {
  stop(
    "Download of the KEGG module hierarchy failed (status ",
    download_status, ")",
    call. = FALSE
  )
}

# reading json from the downloaded file
document <- fromJSON(txt = kegg_json_file)
# parsing json
df <- data.frame(Reduce(rbind, document))
# Flatten one top-level branch of the KEGG module hierarchy into a table.
#
# branch       data frame; column 1 holds the class names, column 2 is a list
#              with one data frame per class. Each of those holds the subclass
#              names in column 1 and, in column 2, a list of module data frames
#              that have a `name` column.
#              NOTE(review): layout assumed from how jsonlite simplifies the
#              ko00002 JSON -- confirm if KEGG changes the file.
# module_type  label written to the `module_type` column of every row.
#
# Returns a data frame with one row per module: the columns of the module
# table followed by module, path, p1Path (subclass), p2Path (class) and
# module_type. An empty data frame is returned when the branch has no modules.
flatten_module_branch <- function(branch, module_type) {
  leaves <- list()
  for (i in seq_len(NROW(branch))) {
    subclasses <- branch[[2]][[i]]
    for (j in seq_len(NROW(subclasses))) {
      leaf <- subclasses[[2]][[j]]
      if (NROW(leaf) == 0) {
        next
      }
      # The identifier sits at the start of the name; trimws() drops the
      # separator blank that a 7-character cut picks up after a 6-character ID.
      leaf$module <- trimws(substr(leaf$name, 1, 7))
      # Text between square brackets, e.g. "PATH:map00010 map01200"
      leaf$path <- stringr::str_extract(string = leaf$name, pattern = "(?<=\\[)[^{}]+(?=\\])")
      leaf$path <- sub("PATH:", "", leaf$path)
      leaf$p1Path <- subclasses[j, 1]
      # Class label comes from the class currently being walked (index i)
      leaf$p2Path <- branch[i, 1]
      leaf$module_type <- module_type
      leaves[[length(leaves) + 1L]] <- leaf
    }
  }
  if (length(leaves) == 0) {
    return(data.frame())
  }
  # Bind once at the end instead of growing a data frame inside the loop
  do.call(rbind, leaves)
}

# pathway modules
pathMod <- df[2, 2]
pathModDF <- data.frame(Reduce(rbind, pathMod))
pathway_modules <- flatten_module_branch(pathModDF, "pathway")

# signature modules
sigMod <- df[3, 2]
sigModDF <- data.frame(Reduce(rbind, sigMod))
sig_modules <- flatten_module_branch(sigModDF, "signature")

# One table with every module, each listed once with its own class labels
keggModule <- rbind(pathway_modules, sig_modules)

# module matrix: column 1 is the module ID (second column of keggModule), the
# remaining columns hold that module's pathway IDs, one per cell.
# The width is derived from the data so no pathway is left glued to another
# one in the last cell when a module lists many pathways.
paths_per_module <- stringr::str_count(keggModule$path, " ") + 1
max_paths <- max(1, paths_per_module, na.rm = TRUE)
path_matrix <- stringr::str_split_fixed(keggModule$path, " ", max_paths)
modMat <- data.frame(cbind(keggModule[, 2], path_matrix))
modMat[modMat == ""] <- NA

# long dataframe for module and pathway: one (module, path) row per filled cell,
# collected column by column and bound once at the end
path_columns <- seq_len(ncol(modMat))[-1]
pair_chunks <- lapply(path_columns, function(i) {
  filled <- !is.na(modMat[[i]])
  data.frame(module = modMat[[1]][filled], path = modMat[[i]][filled])
})
path2Mod <- do.call(rbind, pair_chunks)

# deduplication: keep the first occurrence of every module/pathway pair
path2Mod <- path2Mod[!duplicated(path2Mod[c("module", "path")]), ]

# Joining datasets: attach Entrez gene IDs to every module/pathway pair.
# all.x = TRUE keeps pairs whose pathway has no gene in mappedDF (ENTREZ_ID is NA).
kegg <- merge(path2Mod, mappedDF, by.x = "path", by.y = "KEGG_ID", all.x = TRUE)
# A gene can reach the same module through several pathways: keep one row per
# module/gene combination
is_repeat <- duplicated(paste0(kegg$module, kegg$ENTREZ_ID))
keggdedup <- kegg[!is_repeat, ]

# Final dataset: one row per gene, its modules collapsed into a single
# ", "-separated string
finalDF <- aggregate(
  module ~ ENTREZ_ID,
  data = keggdedup,
  FUN = toString,
  na.action = NULL
)
ADD COMMENT
0
Entering edit mode

Thank you for the help, but unfortunately that's not what I'm looking for. The structure of the final data frame is correct, but in the second column, instead of the list of pathways, I need the list of KEGG pathway modules, which is the information that I'm not able to retrieve.

ADD REPLY
1
Entering edit mode

I see; I have now updated my reply accordingly. To map the module IDs in the final table to module names, you may look at the "keggModule" dataset.

ADD REPLY

Login before adding your answer.

Traffic: 1762 users visited in the last hour
Help About
FAQ
Access RSS
API
Stats

Use of this site constitutes acceptance of our User Agreement and Privacy Policy.

Powered by the version 2.3.6