|
library(recount) |
|
library(edgeR) |
|
|
|
#/ Get the counts from GTEx via recount as a SummarizedExperiment |
|
#/ => 1.3GB file |
|
options(timeout=600) |
|
download_study("SRP012682", type = "rse-gene") |
|
load(file.path("SRP012682", "rse_gene.Rdata")) |
|
|
|
#/ remove whitespaces in tissue names: |
|
rse_gene$tissue <- gsub(" ", "_", rse_gene$smts) |
|
|
|
#/ see the tissues included: |
|
table(rse_gene$tissue) |
|
|
|
#/ make DGEList |
|
y <- DGEList(counts = assay(rse_gene, "counts"), |
|
group = rse_gene$tissue) |
|
|
|
#/ Subset the entire dataset to three tissues to make the process faster and less |
|
#/ memory intensive, for the sake of this post: |
|
y <- y[,y$samples$group %in% c("Heart", "Lung", "Pancreas")] |
|
|
|
#/ filter lowly-expressed genes: |
|
y <- y[filterByExpr(y, group=y$samples$group),] |
|
|
|
#/ naive CPMs, corrected only for library size: |
|
cpm_by_group_naive <- edgeR::cpmByGroup(y, log=TRUE) |
|
|
|
#/ and with the TMMs, corrected for composition: |
|
y <- calcNormFactors(y) |
|
cpm_by_group_TMM <- edgeR::cpmByGroup(y, log=TRUE) |
|
|
|
#/ Lets look at lung vs pancreas: |
|
naive <- cpm_by_group_naive[,c("Lung", "Pancreas")] |
|
TMM <- cpm_by_group_TMM[,c("Lung", "Pancreas")] |
|
|
|
#/ Make the MA-plot using smoothScatter: |
|
par(mfrow=c(1,2)) |
|
for(i in c("naive", "TMM")){ |
|
|
|
g1 <- get(i)[,1] |
|
g2 <- get(i)[,2] |
|
smoothScatter(x = 0.5 * (g1+g2), # average expr |
|
y = g1-g2, # fold change |
|
xlab = "average log expression", |
|
ylab = "log fold change", |
|
main = i, |
|
ylim=c(-6,6)) |
|
abline(h=0, col="red") |
|
abline(h=log2(2), col="red", lty=2) |
|
abline(h=-log2(2), col="red", lty=2) |
|
|
|
} |
|
|
|
|
Thanks ATpoint, does TMM normalization account for gene length in addition to sequencing depth and RNA composition?
Short answer: No it does not :)