Hi
I just finished my first task on scRNAseq analysis, please feel free to make any comments. Very appreciate your help. Thank you!
I use SingleR to get cell type annotation. And my general workflow for 4 control samples as follows:
GL1101_data <- Read10X(data.dir = "~/GL1101/", strip.suffix = TRUE)
GL1111_data <- Read10X(data.dir = "~/GL1111/", strip.suffix = TRUE)
GL1131_data <- Read10X(data.dir = "~/GL1131/", strip.suffix = TRUE)
GL1141_data <- Read10X(data.dir = "~/GL1141/", strip.suffix = TRUE)
data1101<- CreateSeuratObject(counts = GL1101_data, project = "GL1101", min.cells = 3, min.features = 200)
data1111<- CreateSeuratObject(counts = GL1111_data, project = "GL1111", min.cells = 3, min.features = 200)
data1131<- CreateSeuratObject(counts = GL1131_data, project = "GL1131", min.cells = 3, min.features = 200)
data1141<- CreateSeuratObject(counts = GL1141_data, project = "GL1141", min.cells = 3, min.features = 200)
data1101 <- subset(data1101, subset = nFeature_RNA > 200 & nFeature_RNA < 4000)
data1111 <- subset(data1111, subset = nFeature_RNA > 400 & nFeature_RNA < 3600)
data1131 <- subset(data1131, subset = nFeature_RNA > 500 & nFeature_RNA < 4000)
data1141 <- subset(data1141, subset = nFeature_RNA > 300 & nFeature_RNA < 4000)
data1101<-NormalizeData(data1101, normalization.method = "LogNormalize", scale.factor = 10000)
data1111<-NormalizeData(data1111, normalization.method = "LogNormalize", scale.factor = 10000)
data1131<-NormalizeData(data1131, normalization.method = "LogNormalize", scale.factor = 10000)
data1141<-NormalizeData(data1141, normalization.method = "LogNormalize", scale.factor = 10000)
data1101$sampleID<-"GL1101"
data1111$sampleID<-"GL1111"
data1131$sampleID<-"GL1131"
data1141$sampleID<-"GL1141"
obj.list<-list(GL1101 = data1101, GL1111 = data1111, GL1131 = data1131, GL1141 = data1141)
# identify variable features for each dataset independently
obj.list <- lapply(X = obj.list, FUN = function(x) {
x <- FindVariableFeatures(x, selection.method = "vst", nfeatures = 2000)
})
features <- SelectIntegrationFeatures(object.list = obj.list)
data.anchors <- FindIntegrationAnchors(object.list = obj.list, normalization.method = "LogNormalize",
anchor.features = features)
data.combined <- IntegrateData(anchorset = data.anchors, normalization.method = "LogNormalize")
data.combined <- ScaleData(data.combined, verbose = FALSE)
data.combined <- RunPCA(data.combined, verbose = FALSE)
data.combined <- RunUMAP(data.combined, reduction = "pca", dims = 1:40)
data.combined<-FindNeighbors(data.combined, reduction = "pca", dim = 1:40)
data.combined<-FindClusters(data.combined, resolution = 0.5)
#load reference dataset, which has cell type annotation information. A SingleCellExpreiment Object
sce.ref<-readRDS("~/sce_ref.rds")
DefaultAssay(data.combined)<- "RNA"
sce.data.combined<-as.SingleCellExperiment(data.combined)
library(SingleR)
pred.data.combined <- SingleR(test=sce.data.combined,
ref=sce.ref, labels=sce.ref$celllabel, de.method="wilcox", de.n = 50)
#Calculating the Rand Index
DefaultAssay(data.combined) <- "integrated"
library(bluster)
pairwiseRand(data.combined$integrated_snn_res.0.5, pred.data.combined$labels, mode="index")
#plot heat map
tab <- table(cluster=data.combined$integrated_snn_res.0.5, label=pred.data.combined$labels)
pheatmap::pheatmap(log10(tab+10))
The rand Index ARI between unsupervised cluster and assigned cell type label is below 0.5, only 0.461. What's worse is that the cluster and cell type is not quite one-to-one map, maybe it has some nested structure, but I barely have any biological and medical background, it's hard for me to tell.
Also I use the following command to plot the clustering after I have assigned the cell type name to each cell.
data.combined$cell.type.label<-pred.data.combined$labels
DimPlot(data.combined, reduction = "umap", group.by = "cell.type.label", label = TRUE, pt.size = 3, label.size = 5)
At the left corner, there are two clusters which does not be labeled, I guess it is because those cells share the same label to the other clusters which actually has already been labeled, it might be just a "label location" issue. But I am wondering whether is it safe to say the cell annotation results are reasonable and solid?