I want to obtain the upregulated and downregulated genes using limma
. However, all the DEGs returned by my code have positive LogFC and none are downregulated (negative LogFC). This observation is consistent across multiple distinct dataframes.
Is there something wrong with my code?
library(limma)
library(edgeR)
group<- factor(type, levels=c("1a", "1b", "1c", "2a", "2b", "2c"))
design <- model.matrix(~0+group)
colnames(design) <- c("samp1a", "samp1b", "samp1c", "samp2a", "samp2b", "samp2c")
rownames(design) <- colnames(df)
contrasts <- makeContrasts(samp1a-samp1b,samp1a-samp1c,samp1a-samp2a,samp1a-samp2b,samp1a-samp2c, samp1b-samp1c,samp1b-samp2a,samp1b-samp2b,samp1b-samp2c, samp1c-samp2a,samp1c-samp2b,samp1c-samp2c,samp2a-samp2b,samp2a-samp2c,samp2b-samp2c,levels=design)
mrna.dge <- DGEList(counts=mat, group=group)
keep <- filterByExpr(mrna.dge, design)
mrna.dge <- mrna.dge[keep,,keep.lib.sizes=FALSE]
mrna.dge <- calcNormFactors(mrna.dge)
logCPM <- cpm(mrna.dge, log=T)
fit <- lmFit(logCPM, design)
efit <- eBayes(fit, trend=TRUE)
efit <- na.omit(efit)
top.ranked.mrna <- topTable(efit, coef=ncol(design), number=100000)
# Get top DEGs
deg <- top.ranked.mrna[top.ranked.mrna$adj.P.Val < 0.05, ]
deg <- deg[order(deg$adj.P.Val, decreasing=F),]
# upregulated and downregulated DEGs
deg.up <- deg[deg$logFC > 0,]
deg.down <- deg[deg$logFC < 0,]
Data:
Data:
> dput(mat[1:20,1:20])
structure(c(391.94, 8, 1, 159.46, 40686.22, 0, 2583, 27, 0, 2110,
1360, 36, 1, 0, 0, 0, 80, 1722, 2590, 4478, 68.91, 75, 0, 247.06,
11009.03, 0, 4720, 16, 0, 2359, 2013, 53, 0, 0, 0, 0, 151, 2615,
2562, 4518, 71.9, 28, 33, 516.7, 3180.79, 0, 2768, 108, 0, 2591,
2388, 33, 0, 0, 0, 0, 199, 3338, 2715, 5936, 325.6, 47, 4, 151.49,
16771.52, 0, 1689, 8, 0, 1744, 1366, 149, 0, 0, 0, 1, 81, 1407,
3010, 4139, 70.42, 199, 1, 243.76, 8407.38, 0, 3618, 91, 0, 2569,
1928, 123, 0, 0, 0, 0, 92, 2144, 3096, 4825, 234.21, 9, 0, 288.14,
3785.26, 4, 2936, 38, 0, 2658, 1338, 24, 0, 0, 1, 0, 200, 1957,
1180, 5419, 36.75, 126, 0, 725.92, 12247.11, 1, 2889, 103, 0,
2983, 2877, 170, 0, 0, 0, 0, 695, 4046, 10246, 10136, 177.65,
1, 0, 142.16, 79779.59, 1, 3792, 61, 2, 1669, 1588, 1, 0, 0,
0, 0, 182, 1804, 1958, 6224, 17.19, 37, 0, 784.96, 3815.74, 0,
3441, 5, 0, 1406, 1990, 38, 0, 0, 0, 0, 117, 2177, 3708, 4412,
47.84, 4, 0, 342.64, 5265.48, 4, 5856, 16, 0, 2229, 1864, 79,
0, 0, 0, 0, 111, 3320, 3034, 4593, 672.33, 120, 3, 121.91, 16235.8,
14, 2354, 84, 0, 2234, 1971, 85, 0, 0, 0, 0, 294, 1954, 3901,
7452, 205.07, 2, 59, 223.86, 19282.71, 0, 9455, 11, 0, 1980,
3503, 1, 0, 0, 0, 0, 140, 2853, 3180, 7491, 253.67, 24, 3, 180.83,
15562.65, 1, 4012, 328, 0, 2296, 1462, 32, 0, 0, 0, 0, 127, 2362,
3604, 3815, 64.81, 1, 2, 245.23, 15341.74, 17, 1495, 0, 0, 1219,
2128, 100, 0, 0, 0, 0, 203, 2410, 2823, 4972, 133.26, 2, 4, 300.43,
17762.11, 0, 4043, 4, 0, 1801, 1391, 39, 0, 0, 0, 2, 121, 1685,
2699, 3679, 60, 4, 1, 174.82, 7955.45, 0, 1776, 1, 0, 850, 539,
46, 0, 0, 0, 0, 16, 1314, 1323, 1405, 131, 2, 4, 61.48, 15515.27,
0, 2059, 4, 0, 889, 506, 15, 0, 0, 0, 0, 14, 850, 793, 1320,
109.29, 5, 0, 203.74, 35661.02, 1, 5089, 145, 0, 2370, 1167,
44, 0, 0, 0, 1, 298, 1984, 2649, 4904, 25.81, 437, 1, 458.92,
9677.4, 4, 409, 18, 0, 2923, 1626, 45, 1, 0, 0, 0, 284, 2055,
1867, 9169, 396.46, 5, 1, 240.29, 97226.72, 55, 1908, 2, 0, 1437,
1086, 38, 1, 0, 0, 3, 97, 1769, 2562, 5130), dim = c(20L, 20L
), dimnames = list(c("A1BG", "A1CF", "A2BP1", "A2LD1", "A2M",
"A2ML1", "A4GALT", "A4GNT", "AAA1", "AAAS", "AACS", "AACSL",
"AADAC", "AADACL2", "AADACL3", "AADACL4", "AADAT", "AAGAB", "AAK1",
"AAMP"), c("TCGA.2K.A9WE.01", "TCGA.2Z.A9J1.01", "TCGA.2Z.A9J3.01",
"TCGA.2Z.A9J5.01", "TCGA.2Z.A9J6.01", "TCGA.2Z.A9J7.01", "TCGA.2Z.A9J8.01",
"TCGA.2Z.A9J9.01", "TCGA.2Z.A9JD.01", "TCGA.2Z.A9JE.01", "TCGA.2Z.A9JI.01",
"TCGA.2Z.A9JJ.01", "TCGA.2Z.A9JL.01", "TCGA.2Z.A9JO.01", "TCGA.2Z.A9JP.01",
"TCGA.2Z.A9JQ.01", "TCGA.2Z.A9JT.01", "TCGA.4A.A93W.01", "TCGA.4A.A93X.01",
"TCGA.4A.A93Y.01")))
> dput(design[1:20,])
structure(c(0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), dim = c(20L,
6L), dimnames = list(c("TCGA.2K.A9WE.01", "TCGA.2Z.A9J1.01",
"TCGA.2Z.A9J3.01", "TCGA.2Z.A9J5.01", "TCGA.2Z.A9J6.01", "TCGA.2Z.A9J7.01",
"TCGA.2Z.A9J8.01", "TCGA.2Z.A9J9.01", "TCGA.2Z.A9JD.01", "TCGA.2Z.A9JE.01",
"TCGA.2Z.A9JI.01", "TCGA.2Z.A9JJ.01", "TCGA.2Z.A9JL.01", "TCGA.2Z.A9JO.01",
"TCGA.2Z.A9JP.01", "TCGA.2Z.A9JQ.01", "TCGA.2Z.A9JT.01", "TCGA.4A.A93W.01",
"TCGA.4A.A93X.01", "TCGA.4A.A93Y.01"), c("group1a", "group1b",
"group1c", "group2a", "group2b", "group2c")))
Please show the design matrix or the code used to create the design matrix.
The result you report is a common occurrence when people mistakingly construct a test for an intercept instead of for a logFC, and showing the design matrix will reveal whether or not that is true. If I had to guess, I suspect you may have used a group-means design matrix and combined it with a coefficient test instead of forming contrasts.
I added the dput. Thanks for the reminder!
Design, not counts. Show the model.matrix. Is this even raw counts what you have there in the dput rather than any normalized values?
Yeah, I added the design matrix.
design <- model.matrix(~0+group)
. Thecounts
is log2-normalized raw data.It'd be helpful to know how many differentially expressed genes there are. If you've got a handful and they're all upregulated, I wouldn't think anything is amiss. If you've got thousands of DE genes, all upregulated, I would be very suspicious.
Several hundred to 10000 DEGs. Would normalization affect logFC?
It shouldn't, unless there's a misspecification in design. Is it the case that the library sizes are very different between the conditions you are comparing? Can you spot-check a few genes on the raw counts or CPMs?
No the library sizes across conditions should be relatively homogenous because it's from the same dataset. I clustered the samples into subtypes and am now comparing the differential expression across the subtypes.