Question

Could Anyone Help Me with Seurat code

0

Entering edit mode

4 months ago

ANHAO • 0

Hi, I'm a post-doc have experience only in bulk RNA-seq. (Basically, I'm a wet experimnenter, and even no one handled bulk RNA-seq data from fastq in our lab). And this time our lab performed single cell RNA-seq experiment. Professor asked me to learn about the Seurat and other packages for QC then handle with our data instead of Loupebrowther. I already succeed on constract a custom reference and finished the cellranger process.

I searched a lot of tutorials and wrote my first code for Seurat part. But because of the lack of background of mathematical konowledge, I'm not sure if it's right. Could anyone help me to judge and give me some advice on my code? Here is my code:

# Setting Background
Sys.setenv(LANGUAGE = "en")
options(stringsAsFactors = FALSE)
rm(list=ls())

setwd("C:/Users/liufinity/Desktop/R_Analysis/20241121")

library(Seurat)
library(SeuratObject)
library(tidyverse)
library(dplyr)
library(patchwork)
library(DoubletFinder)
library(harmony)
library(ggplot2)
library(cowplot)
library(R.utils)
library(scCustomize)
library(RColorBrewer)
library(glmGamPoi)

# import data

x=list.files()

dir = c('old/', "tail/", "young/")
names(dir) = c('old', 'tail', 'young')      

scRNAlist <- list()
for(i in 1:length(dir))
{
  counts <- Read10X(data.dir = dir[i])
  scRNAlist[[i]] <- CreateSeuratObject(counts, min.cells = 3, min.features =200)
}

# Preprocess
for (i in 1:length(scRNAlist)) 
{
  scRNAlist[[i]] <- NormalizeData(scRNAlist[[i]])
  scRNAlist[[i]] <- FindVariableFeatures(scRNAlist[[i]], selection.method = "vst",nfeatures = 2000)
  scRNAlist[[i]] <- ScaleData(scRNAlist[[i]])
  scRNAlist[[i]] <- RunPCA(scRNAlist[[i]])
}


# HB and Mito genes calculation

for (i in 1:length(scRNAlist)) 
{
  scRNAlist[[i]][["percent.mt"]] <- PercentageFeatureSet(scRNAlist[[i]], pattern = "^mt-")

  HB.genes <- c("Hba-x","Hba-a1","Hba-a2","Hbb-bt","Hbb-bs","Hbb-bh1","Hbb-bh2","Hbb-y","Hbq1a","Hbq1b")
  HB_m <- match(HB.genes, rownames(scRNAlist[[i]]@assays$RNA)) 
  HB.genes <- rownames(scRNAlist[[i]]@assays$RNA)[HB_m] 
  HB.genes <- HB.genes[!is.na(HB.genes)] 
  scRNAlist[[i]][["percent.HB"]]<-PercentageFeatureSet(scRNAlist[[i]], features = HB.genes) 

  scRNAlist[[i]][["log10GenesPerUMI"]] <- log10(scRNAlist[[i]]$nFeature_RNA) / log10(scRNAlist[[i]]$nCount_RNA)
}

# Cell cycle related calculation

for (i in 1:length(scRNAlist)) 
{
  CaseMatch(c(cc.genes$s.genes,cc.genes$g2m.genes),VariableFeatures(scRNAlist[[i]]))
  g2m_genes = cc.genes$g2m.genes
  g2m_genes = CaseMatch(search = g2m_genes, match = rownames(scRNAlist[[i]]))
  s_genes = cc.genes$s.genes
  s_genes = CaseMatch(search = s_genes, match = rownames(scRNAlist[[i]]))
  scRNAlist[[i]] <- CellCycleScoring(object=scRNAlist[[i]],  g2m.features=g2m_genes,  s.features=s_genes)
}

# Exclude the influence of cell cycle and mitogenes

for (i in 1:length(scRNAlist)) 
{
 scRNAlist[[i]]$CC.Difference <- scRNAlist[[i]]$S.Score - scRNAlist[[i]]$G2M.Score
 scRNAlist[[i]] <- SCTransform(scRNAlist[[i]], 
                                method = "glmGamPoi", 
                               vars.to.regress = c("percent.mt", "CC.Difference"), 
                               verbose = T)
}

# Visualize and evaluate the cutoff for subset

for (i in 1:length(scRNAlist)) 
{
  col.num <- length(levels(scRNAlist[[i]]@active.ident))
  violin_temp <- VlnPlot(scRNAlist[[i]],
                  features = c("nFeature_RNA", "nCount_RNA", "percent.mt", "percent.HB"), 
                  cols =rainbow(col.num), 
                  pt.size = 0.01, 
                  ncol = 4)
  assign(paste0("violin_before", i), violin_temp)
}
violin_before1
violin_before2
violin_before3


# Subset

scRNAlist.filtered <- lapply(scRNAlist, subset, subset = nFeature_RNA < 6000 & percent.mt < 10 & percent.HB < 3 & log10GenesPerUMI > 0.8)

# DoubletFinder

for (i in 1:length(scRNAlist.filtered)) 
{
  sweep.res.list <- paramSweep(scRNAlist.filtered[[i]], PCs = 1:20, sct = FALSE)
  sweep.stats <- summarizeSweep(sweep.res.list, GT = FALSE)
  bcmvn <- find.pK(sweep.stats)

  pK <- bcmvn %>% 
    filter(BCmetric == max(BCmetric)) %>%
    select(pK) 
  pK <- as.numeric(as.character(pK[[1]]))

  annotations <- scRNAlist.filtered[[i]]@meta.data$seurat_clusters
  homotypic.prop <- modelHomotypic(annotations)           
  nExp_poi <- round(0.075*nrow(scRNAlist.filtered[[i]]@meta.data))  
  nExp_poi.adj <- round(nExp_poi*(1-homotypic.prop))

  scRNAlist.filtered[[i]] <- doubletFinder(scRNAlist.filtered[[i]], PCs = 1:20, pN = 0.25, pK = pK, 
                                  nExp = nExp_poi.adj,reuse.pANN = FALSE, sct = FALSE)
}

# Manually delete the pANN value and left only DF.classifications

scRNAlist.filtered[[1]]$pANN_0.25_0.005_846 <- NULL
scRNAlist.filtered[[1]]$DF.classifications <- scRNAlist.filtered[[1]]$DF.classifications_0.25_0.005_846
scRNAlist.filtered[[1]]$DF.classifications_0.25_0.005_846 <- NULL

scRNAlist.filtered[[2]]$pANN_0.25_0.24_82 <- NULL
scRNAlist.filtered[[2]]$DF.classifications <- scRNAlist.filtered[[2]]$DF.classifications_0.25_0.24_82
scRNAlist.filtered[[2]]$DF.classifications_0.25_0.24_82 <- NULL

scRNAlist.filtered[[3]]$pANN_0.25_0.01_427 <- NULL
scRNAlist.filtered[[3]]$DF.classifications <- scRNAlist.filtered[[3]]$DF.classifications_0.25_0.01_427
scRNAlist.filtered[[3]]$DF.classifications_0.25_0.01_427 <- NULL

# Subset the single
scRNAlist.singlet <- lapply(scRNAlist.filtered, function(x) 
  {subset(x, DF.classifications == "Singlet")})

# Merge
scRNAlist_merge <- merge(x = scRNAlist.filtered[[1]],y = c(scRNAlist.filtered[[2]],scRNAlist.filtered[[3]]), add.cell.ids = dir)
scRNAlist_merge <- ScaleData(scRNAlist_merge)
scRNAlist_merge <- FindVariableFeatures(scRNAlist_merge, selection.method = "vst",nfeatures = 2000)
scRNAlist_merge <- RunPCA(scRNAlist_merge)

# Integrated with harmony
scRNA_harmony <- IntegrateLayers(object = scRNAlist_merge,
                                 method = HarmonyIntegration,
                                 orig.reduction = 'pca',
                                 new.reduction = 'harmony')

scRNA_harmony <- JoinLayers(scRNA_harmony)

# PCA
scRNA_harmony <- RunPCA(scRNA_harmony, features = VariableFeatures(scRNA_harmony)) 
plot1 <- DimPlot(scRNA_harmony, reduction = "pca", group.by="orig.ident") 
ElbowPlot(scRNA_harmony, ndims=50, reduction="pca") 

# Clustering
pc.num=1:10
scRNA_harmony <- FindNeighbors(scRNA_harmony, dims = pc.num) 
scRNA_harmony <- FindClusters(scRNA_harmony, resolution = 1.0)


# tSNE
scRNA_harmony = RunTSNE(scRNA_harmony, dims = pc.num, reduction = "harmony")
embed_tsne <- Embeddings(scRNA_harmony, 'tsne')
write.csv(embed_tsne,'embed_tsne.csv')
plot1 = DimPlot(scRNA_harmony, reduction = "tsne") 
plot2 = DimPlot(scRNA_harmony, reduction = "tsne",group.by = "orig.ident") 

plot1
plot2

# UMAP
scRNA_harmony <- RunUMAP(scRNA_harmony, dims = pc.num, reduction = "harmony")
embed_umap <- Embeddings(scRNA_harmony, 'umap')
write.csv(embed_umap,'embed_umap.csv') 
plot3 = DimPlot(scRNA_harmony, reduction = "umap") 
plot4 = DimPlot(scRNA_harmony, reduction = "umap",group.by = "orig.ident") 
plot3
plot4


# SepecificGenes
FeaturePlot_scCustom(seurat_object = scRNA_harmony, reduction = "tsne", features = "Ai9", pt.size = 1)

# Markers for cluster
markers <- FindAllMarkers(
  object = scRNA_harmony,
  assay = "RNA",  
  slot = "data",  
  test.use = "wilcox",  
  logfc.threshold = 0.5, 
  min.pct = 0.5
)

DoubletFinder Seurat • 895 views

ADD COMMENT • link 4 months ago by ANHAO • 0

0

Entering edit mode

Just follow the Seurat vignettes which represent the developers best practices. I doubt many will go through a wall of code like this. See here for morelinks to interesting literature: scRNA-seq from cell count matrix and DESeq2

ADD REPLY • link 4 months ago by ATpoint 87k

0

Entering edit mode

Basically, I followed the Seurat vignettes. But I modified it with a loop for multiple samples and added a doublet exclusion part. Now I am confused about normalization and scaling. I need to do them first for individual samples if I want to exclude the influence of cell cycle or doublet. But when I want to exclude the infuluence of cell cycle, I still need the scaling to regress. In addition, when integratelayers after merging samples, I need a scaling before the process again. I'm not sure if it makes sense to normalize and scale for 2~3 time before and during the process.

ADD REPLY • link 4 months ago by ANHAO • 0

1

Entering edit mode

I would typically do the steps in the following order:

Import data
Run DoubletFinder and annotate singlets/doublets (that pipeline does its own scaling etc)
Get % features for mito/ribo/etc
Visualise and filter for doublets, nFeature, percent.mt and anything else you threshold
Run basic normalisation in order to be able to do cell-cycle estimation and then run it
Normalise, get HVG and scale the data properly regressing out any variables you want to using your preferred approach
Integrate the data if needed
Run downstream clustering, marker differentials etc

FWIW every time you use default Seurat functions to normalise data, it will pull from 'counts' layer and deposit the data in the 'data' layer, and when you scale it will pull from 'data' layer and deposit in 'scale.data' layer. So you're not really ever double-scaling anything. The only caveat is that running SCTransform() will set the default assay to the new SCT assay. But you can always re-set the default assay to RNA to run the scaling again as if fresh.

ADD REPLY • link 4 months ago by yura.grabovska ▴ 750

0

Entering edit mode

Thanks for your kind reply, dude. I have one question about your doubletfinder step. Without normalizing the data, I cannot run it (It is also mentioned by the author of this package that need the normalizing and scaling in Seurat before run it). Are there any tips about run doublet finder without the nom and scaling in Seurat?

ADD REPLY • link 4 months ago by ANHAO • 0

0

Entering edit mode

## spits out the expected multiplet rate based on input value of library size
get.10x.multiplets <- function(x){
  expected.df <- data.frame(
    "Cells.Recovered" = c(500, 1000, 2000, 3000, 4000, 5000, 6000, 7000, 8000, 9000, 10000),
    "Multiplet.Rate" = c(0.4, 0.8, 1.6, 2.4, 3.2, 4.0, 4.8, 5.6, 6.4, 7.2, 8.0)
  )
  lm.res <- lm(Multiplet.Rate~Cells.Recovered, data = expected.df)
  test.df <- data.frame("Cells.Recovered" = x)
  predict(lm.res, test.df)
}

## runs DoubletFinder on a Seurat object 
get.10x.doublets <- function(x){
  require(DoubletFinder)

  x <- NormalizeData(x)
  x <- FindVariableFeatures(x, selection.method = "vst", nfeatures = 500)
  x <- ScaleData(x, features = VariableFeatures(x))
  x <- RunPCA(x, features = VariableFeatures(x), verbose = FALSE)
  x <- RunUMAP(x, reduction = "pca", dims = 1:20, reduction.key = "UMAPpca_", reduction.name = "UMAP_PCA")
  x <- FindNeighbors(x, reduction = "pca", dims = 1:20)
  x <- FindClusters(x, resolution = 0.3)
  DimPlot(x, group.by = "seurat_clusters")
  x.sweep <- paramSweep(x, PCs = 1:20, sct = FALSE)
  x.sweep.stats <- summarizeSweep(x.sweep, GT = FALSE)
  x.bcmvn <- find.pK(x.sweep.stats)
  x.pK <- as.numeric(as.character(x.bcmvn$pK[which.max(x.bcmvn$BCmetric)]))
  homotypic.prop <- modelHomotypic(x$seurat_clusters)
  mlt.est <- get.10x.multiplets(nrow(x@meta.data))/100
  nExp_poi <- round(mlt.est*nrow(x@meta.data))
  nExp_poi.adj <- round(nExp_poi*(1-homotypic.prop))
  x <- doubletFinder(x, PCs = 1:20, pN = 0.25, pK = x.pK, nExp = nExp_poi, sct = FALSE, reuse.pANN = FALSE)
  x <- doubletFinder(x, PCs = 1:20, pN = 0.25, pK = x.pK, nExp = nExp_poi.adj, sct = FALSE, 
                        reuse.pANN = grep("pANN", colnames(x@meta.data), value = TRUE))
  doublet.df <- x@meta.data[,grep("DF.classifications", colnames(x@meta.data), value = TRUE)]
  colnames(doublet.df) <- c("Estimate", "Adj.Estimate")
  doublet.df$Cellname <- rownames(doublet.df)
  doublet.df
}

Here are my source functions I use to run DoubletFinder.

I import my Seurat data with each library in a list then I run the following:

## calculate doublet probabilities using DoubletFinder - takes a while
doublet.res.list <- lapply(seurat.list, get.10x.doublets)
doublet.res <- do.call(rbind, doublet.res.list)

It's a bit janky and happy for people to tell me as such. But essentially, it runs DoubletFinder completely separately from the rest of the analysis right at the beginning of the script. It does its own necessary normalisation steps and then those objects are kept only internally within the function. This means that I can normalise and scale my data however I like, I only retain the doublet annotation.