The documentation has an example for RNAseq data. You input your data, map to entrez id, correct multimapped entrez, generate the design and contrasts, load the gene sets, and run gsva. You also might want to check out EGSEA() as well.
#pipeline for RNAseq GSVA analysis
##install libraries
#source("https://bioconductor.org/biocLite.R")
#biocLite(c("limma","GSVA","GSVAdata","org.Hs.eg.db","GSEABase","snow","rlecuyer","edgeR"))
##
#load libraries
library(limma)
library(GSVA)
library(GSVAdata)
library(org.Hs.eg.db)
library(GSEABase)
library(snow)
library(rlecuyer)
library(edgeR)
##make_unique(df)
#usage: analysis_ready_data <- make_unique(data_frame)
#converts ensembl to entrez id and collapses redundant ids by greatest IQR
make_unique <- function(input.data) {
#annotate entrez id
input.data <- data.frame(cbind(input.data, entrez=as.numeric(mapIds(org.Hs.eg.db, keys=gsub("\\..*","",rownames(input.data)), column="ENTREZID", keytype="ENSEMBL",multiVals="first"))))
#remove NA
input.data <- input.data[!is.na(input.data$entrez),]
#parse duplicated entrez id
id.dup <- input.data[duplicated(input.data$entrez) | duplicated(input.data$entrez, fromLast = TRUE),ncol(input.data)]
data.dup <- as.matrix(input.data[duplicated(input.data$entrez) | duplicated(input.data$entrez, fromLast = TRUE),])
ezid.dup <- unique(id.dup)
data.unique <- input.data[!input.data$entrez %in% id.dup,]
rownames(data.unique) <- data.unique$entrez
data.unique$entrez = NULL
#assess IQR for duplicates
data.dup2 <- lapply(ezid.dup, function(x) {
expr <- data.dup[id.dup == x,]
if(is.matrix(expr)){
sd.values <- apply(expr[,1:ncol(input.data)-1], 1, sd)
mean.values <- apply (expr[,1:ncol(input.data)-1], 1, mean)
CV.values <- sd.values/mean.values
CV.values <- gsub("NaN","0",CV.values)
expr <- expr[which(CV.values == max(CV.values))[[1]],]
} else {
expr
}
})
#merge data
merger <- data.frame(do.call(rbind, data.dup2))
rownames(merger) <- merger$entrez
merger$entrez = NULL
eset <- as.matrix(rbind(data.unique, merger))
return(eset)
}
#load logCPM, convert ensembl id to entrez, and for multimaps retain entry with highest IQR
logCPM <- read.table("logCPM.csv",header=TRUE,row.names=1,sep="\t")
logCPM <- make_unique(logCPM)
#load omics, e.g., factor levels for each sample
factor.table <- read.delim("omics.csv", header=TRUE,sep="\t")
groups <- factor(factor.table$group)
institution <- factor(factor.table$Institution)
lane <- factor(factor.table$Lane)
group.intercepts <- model.matrix(~0 + groups + institution + lane)
contrasts <- makeContrasts(
groups1 - (groups2 + groups3) / 2,
groups2 - (groups1 + groups3) / 2,
groups3 - (groups1 + groups2) / 2, levels=group.intercepts)
#C2 collection of curated gene sets that form part of the Molecular Signatures Database (MSigDB) version 3.0.
data(c2BroadSets)
gene.sets <- GeneSetCollection(c2BroadSets)
#calculate GSVA enrichment scores
enrichment.scores <- gsva(logCPM, gene.sets, method = "gsva", mx.diff = TRUE, verbose=TRUE, rnaseq=FALSE, parallel.sz=8)$es.obs
fit <- lmFit(enrichment.scores, group.intercepts)
fit2 <- contrasts.fit(fit, contrasts)
fit2 <- eBayes(fit2,trend = TRUE, robust = TRUE)
topTable(fit2, coef=1)
topTable(fit2, coef=2)
topTable(fit2, coef=3)
Thanks a lot for the detailed reply!
Hi, colonppg:
Can you use GSVA for your RNA-seq RPKM data now? Can you please share your R script with me? I still have a lot problem with it. Thanks very much.
HY
Thanks for the response to the question. Given this is rnaseq data please can you explain the rnaseq=FALSE in your gsva call?
Answering my own question: My vignette didn't explain this but I can now see in https://rdrr.io/bioc/GSVA/man/gsva.html that rnaseq=TRUE is for discrete count data and rnaseq=FALSE is for when continuous values such as TPM are used. Since you have continuous log CPM values above you have correctly set rnaseq=FALSE. The gsva author indicates that this parameter might be renamed in future to avoid confusion :)