Hello!
I am performing a differential expression analysis on my scRNA sequencing dataset of n = 75 individuals. With my current setup, I only receive insignificant results.
I test the differential expression for each cell type across three different groups of my dataset (3 disease stages, so mutually exclusive).
This is my code:
# list of SingleCellExperiment objects
sce_merged <- do.call(cbind, sce.list)
library(Matrix)
# Define a minimal count threshold and the minimum number of cells that must have counts > threshold
min_count <- 10
min_cells <- 3
# Retrieve the counts matrix
counts_mat <- assay(sce_merged, "counts")
# Create a logical matrix indicating where counts exceed the threshold.
expressed <- counts_mat > min_count
# For each gene, count the number of cells where it is expressed
num_cells_expressed <- Matrix::rowSums(expressed)
# Determine which genes pass the filter
keep_genes <- num_cells_expressed >= min_cells
cat("Number of genes before filtering by min count and min cells:", nrow(sce_merged), "\n")
sce_merged_filtered <- sce_merged[keep_genes, ]
cat("Number of genes after filtering by min count and min cells:", nrow(sce_merged_filtered), "\n")
# Pseudo-bulking
# ----------------------------------------
# Create pseudo bulks for every patient and every cell type to continue with traditional analysis
summed <- scuttle::aggregateAcrossCells(sce_merged_filtered,
ids=colData(sce_merged_filtered)[,c(short_annotation_type, "orig.ident")])
table(summed[[short_annotation_type]], summed$diagnosis_name)
library(edgeR)
# Removing all pseudo-bulk samples with 'insufficient' cells.
summed.filt <- summed[,summed$ncells >= 10]
cell_count <- table(summed.filt[[short_annotation_type]], summed.filt$diagnosis_name)
print(cell_count)
# identify the cell types that do not exist at least at n=10 in all patients
cell_types_to_remove <- rownames(cell_count)[apply(cell_count, 1, function(x) all(x < 10))]
cat("Cell types to remove: \n", cell_types_to_remove)
summed.filt <- summed.filt[,!summed.filt[[short_annotation_type]] %in% cell_types_to_remove,]
table(summed.filt[[short_annotation_type]], summed.filt$diagnosis_name)
# For each celltype: Repeat
# =================
library(scran)
# choose celltype
label <- "B"
current <- summed[,label==summed[[short_annotation_type]]]
# Creating up a DGEList object for use in edgeR:
library(edgeR)
y <- DGEList(counts(current), samples=colData(current))
# remove combinations containing fewer than 10 cells
discarded <- current$ncells < 10
y <- y[,!discarded]
summary(discarded)
# remove lowly expressed genes
keep <- filterByExpr(y, group=current$diagnosis_name)
y <- y[keep,]
summary(keep)
# correct for composition biases by computing normalizing factors
y <- calcNormFactors(y)
head(y$samples)
# Ensure that diagnosis_name is a factor
y$samples$diagnosis_name <- factor(y$samples$diagnosis_name)
y$samples$diagnosis_name <- relevel(y$samples$diagnosis_name, ref="A")
design <- model.matrix(~0+diagnosis_name, y$samples)
# model mean variance trend
y <- estimateDisp(y, design)
summary(y)
summary(y$trended.dispersion)
fit <- glmFit(y, design)
summary(fit$var.prior)
summary(fit$df.prior)
my.contrasts <- makeContrasts(BvsA=diagnosis_nameB-diagnosis_nameA, CvsB=diagnosis_nameC-diagnosis_nameB, CvsA=diagnosis_nameC-diagnosis_nameA, levels=design)
qlf.BvsA <- glmLRT(fit, contrast=my.contrasts[,"BvsA"])
qlf.BvsA
summary(decideTests(qlf.BvsA))
topTags(qlf.BvsA)
What worries me are two things: First, the prior is NULL
fit <- glmFit(y, design)
summary(fit$var.prior)
Length Class Mode
0 NULL NULL
summary(fit$df.prior)
Length Class Mode
0 NULL NULL
Second, none of the results are significant
topTags(qlf.BvsA)
Coefficient: -1*diagnosis_nameA 1*diagnosis_nameB
logFC logCPM LR PValue FDR
ENSG00000241294 5.501448 9.897842 14.785774 0.0001204407 0.3361499
ENSG00000211663 -4.033216 10.512895 12.668160 0.0003719354 0.5190358
ENSG00000232216 4.016013 10.414781 11.891560 0.0005638886 0.5246044
ENSG00000211976 -4.398647 8.492410 9.963259 0.0015969508 0.9995831
ENSG00000211946 3.264621 7.737786 8.767256 0.0030668652 0.9995831
ENSG00000239855 -3.742213 9.367577 8.291273 0.0039836046 0.9995831
ENSG00000289340 -0.935096 5.140444 7.935606 0.0048471380 0.9995831
ENSG00000244116 -2.242584 8.907894 7.751107 0.0053679615 0.9995831
ENSG00000276775 3.007996 8.695115 7.067390 0.0078499535 0.9995831
ENSG00000240671 3.335387 9.702430 6.858517 0.0088219756 0.9995831
Could you help me find the culprit? Thanks very much in advance!
sessionInfo:
> sessionInfo()
R version 4.4.1 (2024-06-14)
Platform: x86_64-pc-linux-gnu
Running under: CentOS Linux 7 (Core)
Matrix products: default
BLAS/LAPACK: FlexiBLAS OPENBLAS; LAPACK version 3.11.0
locale:
[1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
[3] LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8
[5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8
[7] LC_PAPER=en_US.UTF-8 LC_NAME=C
[9] LC_ADDRESS=C LC_TELEPHONE=C
[11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
time zone: Europe/Berlin
tzcode source: system (glibc)
attached base packages:
[1] stats4 stats graphics grDevices datasets utils methods
[8] base
other attached packages:
[1] scran_1.34.0 edgeR_4.4.2
[3] limma_3.62.2 DESeq2_1.46.0
[5] Matrix_1.7-2 scater_1.34.0
[7] ggplot2_3.5.1 scuttle_1.16.0
[9] Seurat_5.2.1 SeuratObject_5.0.2
[11] sp_2.2-0 SingleCellExperiment_1.28.1
[13] SummarizedExperiment_1.36.0 Biobase_2.66.0
[15] GenomicRanges_1.58.0 GenomeInfoDb_1.42.3
[17] IRanges_2.40.1 S4Vectors_0.44.0
[19] BiocGenerics_0.52.0 MatrixGenerics_1.18.1
[21] matrixStats_1.5.0
loaded via a namespace (and not attached):
[1] RColorBrewer_1.1-3 jsonlite_1.9.1 magrittr_2.0.3
[4] ggbeeswarm_0.7.2 spatstat.utils_3.1-2 farver_2.1.2
[7] zlibbioc_1.52.0 vctrs_0.6.5 ROCR_1.0-11
[10] spatstat.explore_3.3-4 htmltools_0.5.8.1 S4Arrays_1.6.0
[13] BiocNeighbors_2.0.1 SparseArray_1.6.2 sctransform_0.4.1
[16] parallelly_1.42.0 KernSmooth_2.23-26 htmlwidgets_1.6.4
[19] ica_1.0-3 plyr_1.8.9 plotly_4.10.4
[22] zoo_1.8-13 igraph_2.1.4 mime_0.12
[25] lifecycle_1.0.4 pkgconfig_2.0.3 rsvd_1.0.5
[28] R6_2.6.1 fastmap_1.2.0 GenomeInfoDbData_1.2.13
[31] fitdistrplus_1.2-2 future_1.34.0 shiny_1.10.0
[34] digest_0.6.37 colorspace_2.1-1 patchwork_1.3.0
[37] tensor_1.5 dqrng_0.4.1 RSpectra_0.16-2
[40] irlba_2.3.5.1 beachmat_2.22.0 progressr_0.15.1
[43] spatstat.sparse_3.1-0 httr_1.4.7 polyclip_1.10-7
[46] abind_1.4-8 compiler_4.4.1 withr_3.0.2
[49] BiocParallel_1.40.0 viridis_0.6.5 fastDummies_1.7.5
[52] MASS_7.3-65 DelayedArray_0.32.0 bluster_1.16.0
[55] tools_4.4.1 vipor_0.4.7 lmtest_0.9-40
[58] beeswarm_0.4.0 httpuv_1.6.15 future.apply_1.11.3
[61] goftest_1.2-3 glue_1.8.0 nlme_3.1-167
[64] promises_1.3.2 grid_4.4.1 Rtsne_0.17
[67] cluster_2.1.8 reshape2_1.4.4 generics_0.1.3
[70] gtable_0.3.6 spatstat.data_3.1-4 tidyr_1.3.1
[73] data.table_1.17.0 metapod_1.14.0 ScaledMatrix_1.14.0
[76] BiocSingular_1.22.0 XVector_0.46.0 spatstat.geom_3.3-5
[79] RcppAnnoy_0.0.22 ggrepel_0.9.6 RANN_2.6.2
[82] pillar_1.10.1 stringr_1.5.1 spam_2.11-1
[85] RcppHNSW_0.6.0 later_1.4.1 splines_4.4.1
[88] dplyr_1.1.4 lattice_0.22-6 renv_1.1.2
[91] survival_3.8-3 deldir_2.0-4 tidyselect_1.2.1
[94] locfit_1.5-9.11 miniUI_0.1.1.1 pbapply_1.7-2
[97] gridExtra_2.3 scattermore_1.2 statmod_1.5.0
[100] stringi_1.8.4 UCSC.utils_1.2.0 lazyeval_0.2.2
[103] codetools_0.2-20 tibble_3.2.1 BiocManager_1.30.25
[106] cli_3.6.4 uwot_0.2.3 xtable_1.8-4
[109] reticulate_1.41.0 munsell_0.5.1 Rcpp_1.0.14
[112] globals_0.16.3 spatstat.random_3.3-2 png_0.1-8
[115] spatstat.univar_3.1-1 parallel_4.4.1 dotCall64_1.2
[118] listenv_0.9.1 viridisLite_0.4.2 scales_1.3.0
[121] ggridges_0.5.6 purrr_1.0.4 crayon_1.5.3
[124] rlang_1.1.5 cowplot_1.1.3