Hi all!
This post is rather lengthy, so I give a brief overview of what I intend to ask:
A) Does one need to check whether each different batch level contains at least 2 different experimental conditions of interest (ie diseased/normal)?
B) Should the distribution of experimental conditions within batches be balanced?
While analyzing a GEO dataset, I was confronted with the sample characteristics below, as returned by Biobase::pData(my_ExpressionSet). I think I should completely discard certain experiments before batch correction.
> des # dput is given below
cell.type disease donor pair batch
Fibroblast affctd AA1 1 1a
Fibroblast affctd AA1 1 2a
Fibroblast affctd AA2 2 1a
Fibroblast affctd AA2 2 2a
...........
iPSC-d-nrn Normal NN2 2 13b
des= structure(list(cell.type = c("Fibroblast", "Fibroblast", "Fibroblast",
"Fibroblast", "Fibroblast", "Fibroblast", "Fibroblast", "Fibroblast",
"Fibroblast", "Fibroblast", "Fibroblast", "Fibroblast", "Fibroblast",
"Fibroblast", "Fibroblast", "Fibroblast", "inducedPSC", "inducedPSC",
"inducedPSC", "inducedPSC", "inducedPSC", "inducedPSC", "inducedPSC",
"inducedPSC", "inducedPSC", "inducedPSC", "inducedPSC", "inducedPSC",
"inducedPSC", "iPSC-d-NPC", "iPSC-d-NPC", "iPSC-d-NPC", "iPSC-d-NPC",
"iPSC-d-NPC", "iPSC-d-NPC", "iPSC-d-NPC", "iPSC-d-NPC", "iPSC-d-NPC",
"iPSC-d-NPC", "iPSC-d-NPC", "iPSC-d-NPC", "iPSC-d-NPC", "iPSC-d-nrn",
"iPSC-d-nrn", "iPSC-d-nrn", "iPSC-d-nrn", "iPSC-d-nrn", "iPSC-d-nrn",
"iPSC-d-nrn", "iPSC-d-nrn", "iPSC-d-nrn", "iPSC-d-nrn", "iPSC-d-nrn",
"iPSC-d-nrn", "iPSC-d-nrn"), disease = c("affctd", "affctd",
"affctd", "affctd", "affctd", "affctd", "Normal", "Normal", "Normal",
"Normal", "Normal", "Normal", "Normal", "Normal", "Normal", "Normal",
"affctd", "affctd", "affctd", "affctd", "affctd", "Normal", "Normal",
"Normal", "Normal", "Normal", "Normal", "Normal", "Normal", "affctd",
"affctd", "affctd", "affctd", "affctd", "Normal", "Normal", "Normal",
"Normal", "Normal", "Normal", "Normal", "Normal", "affctd", "affctd",
"affctd", "affctd", "affctd", "Normal", "Normal", "Normal", "Normal",
"Normal", "Normal", "Normal", "Normal"), donor = c("AA1", "AA1",
"AA2", "AA2", "AA3", "AA4", "AN1", "AN1", "AN2", "AN2", "AN3",
"AN4", "NN1", "NN1", "NN2", "NN3", "AA1", "AA1", "AA2", "AA2",
"AA3", "AN1", "AN1", "AN2", "AN2", "AN3", "NN1", "NN1", "NN2",
"AA1", "AA1", "AA2", "AA2", "AA3", "AN1", "AN1", "AN2", "AN2",
"AN3", "NN1", "NN1", "NN2", "AA1", "AA1", "AA2", "AA2", "AA3",
"AN1", "AN1", "AN2", "AN2", "AN3", "NN1", "NN1", "NN2"), pair = c(1L,
1L, 2L, 2L, 3L, 4L, 1L, 1L, 2L, 2L, 3L, 4L, 1L, 1L, 2L, 3L, 1L,
1L, 2L, 2L, 3L, 1L, 1L, 2L, 2L, 3L, 1L, 1L, 2L, 1L, 1L, 2L, 2L,
3L, 1L, 1L, 2L, 2L, 3L, 1L, 1L, 2L, 1L, 1L, 2L, 2L, 3L, 1L, 1L,
2L, 2L, 3L, 1L, 1L, 2L), batch = c("1a", "2a", "1a", "2a", "1a",
"2a", "3a", "3a", "3a", "3b", "3a", "3b", "4a", "4a", "4a", "4a",
"9a", "10a", "9a", "12a", "12a", "10a", "10a", "11a", "9a", "21a",
"11a", "9a", "12a", "18a", "19a", "15a", "18a", "19a", "16b",
"19a", "16a", "19b", "19a", "16a", "18b", "17a", "14d", "14a",
"14c", "13b", "14c", "13a", "14a", "13b", "14b", "14b", "14b",
"13a", "13b")), class = "data.frame", row.names = c(NA, -55L))
# see whether both diseased/normal conditions are represented in each batch: point A above
table(des$batch, des$disease)
affctd Normal
10a 1 2
11a 0 2
12a 2 1
13a 0 2
13b 1 2
14a 1 1
14b 0 3
14c 2 0
14d 1 0
15a 1 0
16a 0 2
16b 0 1
17a 0 1
18a 2 0
18b 0 1
19a 2 2
19b 0 1
1a 3 0
21a 0 1
2a 3 0
3a 0 4
3b 0 2
4a 0 4
9a 2 2
# try to remove batches lacking arrays from either condition
>bad.batch = apply (table(des$batch, des$disease),1,min) == 0
>table(des$batch, des$disease) [! bad.batch,]
affctd Normal
10a 1 2
12a 2 1
13b 1 2
14a 1 1
19a 2 2
9a 2 2
> keep=rownames(table(des$batch, des$disease) [! bad.batch,])
> data4further.analysis= des[des$batch %in% keep,]
> data4further.analysis
cell.type disease donor pair batch
17 inducedPSC affctd AA1 1 9a
18 inducedPSC affctd AA1 1 10a
19 inducedPSC affctd AA2 2 9a
20 inducedPSC affctd AA2 2 12a
21 inducedPSC affctd AA3 3 12a
22 inducedPSC Normal AN1 1 10a
23 inducedPSC Normal AN1 1 10a
25 inducedPSC Normal AN2 2 9a
28 inducedPSC Normal NN1 1 9a
29 inducedPSC Normal NN2 2 12a
31 iPSC-d-NPC affctd AA1 1 19a
34 iPSC-d-NPC affctd AA3 3 19a
36 iPSC-d-NPC Normal AN1 1 19a
39 iPSC-d-NPC Normal AN3 3 19a
44 iPSC-d-nrn affctd AA1 1 14a
46 iPSC-d-nrn affctd AA2 2 13b
49 iPSC-d-nrn Normal AN1 1 14a
50 iPSC-d-nrn Normal AN2 2 13b
55 iPSC-d-nrn Normal NN2 2 13b
and then proceed to sva::ComBat for batch effect removal, or to model.matrix %>% limma's lmFit and ..., considering batch.
Thank you for your response and for reading my (long) post. I would appreciate it if you also shared your opinion on point A above, which I reiterate/paraphrase here:
A) Does one need to check whether each different batch level contains all different experimental conditions of interest (e.g. diseased/normal)? That is, do you recommend omitting rows
11a
and 13a
containing 0s in this table:
This can only be answered with the exact statistical question in mind. If a batch contains samples of one type only, any observations made on samples in that batch cannot be of high confidence because it is not trivial to deconvolute the underlying cause of the observations. It could be biology or it could be batch effects or a combination of both.
Thank you for your response. I wondered if you know of a reference, or the name of this situation, so that I can search for articles/books myself.
Search for anything batch effect related and most papers/articles will stress the importance of experimental design.
Thank you again, RsmRS.