Hello,
Here is a small code sample that you can run to plot the intra-cluster correlation distribution of each cluster.
However, I suggest running PCA on the expression matrix first and then calculating the cell-to-cell correlation from the projection of the cells into PC space (e.g. the first 20 PCs). This eliminates redundant features and reduces noise.
You will need a few libraries :
library(tidyr)
library(dplyr)
library(ggplot2)
This simulates a toy expression matrix (genes in rows, cells in columns) with two distinct cell populations that differ in heterogeneity:
set.seed(47)
# Mimic two cell clusters with different variability (heterogeneity).
# Rows are genes, columns are cells.
n_genes <- 500
n_homogenous <- 125
n_heterogenous <- 75

# Shared per-gene signal for the homogenous cluster: genes 301-500 are shifted
# by +1 in every cell of the cluster, which makes its cells correlate with each
# other. This is the same offset the original 0/1 matrix produced when filled
# with byrow = TRUE (300 rows of zeros followed by 200 rows of ones).
gene_shift <- rep(c(0, 1), times = c(300, 200))

# Draw exactly one value per matrix entry. Drawing only 75 * 500 values for a
# 500 x 125 matrix makes R recycle them, so cells 76-125 would be exact copies
# of cells 1-50 (pairwise correlation of exactly 1).
homogenous <- matrix(rnorm(n = n_genes * n_homogenous),
                     nrow = n_genes, ncol = n_homogenous) + gene_shift
heterogenous <- matrix(rnorm(n = n_genes * n_heterogenous),
                       nrow = n_genes, ncol = n_heterogenous)

# Combine both clusters and name the cells cell_1 ... cell_200.
mat <- cbind(homogenous, heterogenous)
colnames(mat) <- paste0("cell_", seq_len(ncol(mat)))
Calculate correlation, as you did:
# Cell-by-cell Pearson correlation: cor() correlates the columns of `mat`,
# so the result has one row and one column per cell.
cor_mat <- stats::cor(x = mat, method = "pearson")
Generate a metadata data.frame with 2 clusters:
# Cell metadata for the two simulated clusters:
# the first 125 cells are labelled C1, the remaining 75 are labelled C2.
cluster_df <- data.frame(
  cell_id = paste0("cell_", seq_len(200)),
  cluster = rep(c("C1", "C2"), times = c(125, 75))
)
Create a cell-to-cell correlation data.frame with the tidyr::pivot_longer function; each row gives the correlation score of a given "cell of origin" with one "other cell":
# Reshape the square correlation matrix into long format:
# one row per (cell_of_origin, other_cell) pair with its correlation.
cor_df <- as.data.frame(cor_mat) %>%
  dplyr::mutate(cell_of_origin = rownames(cor_mat)) %>%
  tidyr::pivot_longer(
    cols = -cell_of_origin,
    names_to = "other_cell",
    values_to = "correlation"
  )
Remove self-correlations (e.g. cell_1 with cell_1), as they are always 1:
# Drop self-pairs (a cell correlated with itself is always 1).
# A logical mask is used instead of `-which(...)`: if no row matched,
# `-which(...)` would be an empty index and silently drop every row.
cor_df <- cor_df[cor_df$cell_of_origin != cor_df$other_cell, ]
Add cluster information (cluster of the cell of origin & cluster of the other cell)
# Look up the cluster label of both cells of every pair. The result is stored
# in `<column>_cluster`; ids missing from cluster_df get NA.
for (id_col in c("cell_of_origin", "other_cell")) {
  meta_row <- match(cor_df[[id_col]], cluster_df$cell_id)
  cor_df[[paste0(id_col, "_cluster")]] <- cluster_df$cluster[meta_row]
}
Keep only the pairs of cells that belong to the same cluster:
# Intra-cluster pairs: both cells carry the same cluster label.
intra_corr <- cor_df[
  with(cor_df, cell_of_origin_cluster == other_cell_cluster),
]
Violin plot of intra correlation distribution by cluster
# One violin per cluster showing the distribution of intra-cluster
# correlations, with the individual pairs overlaid as small jittered points.
ggplot(
  data = intra_corr,
  mapping = aes(
    x = cell_of_origin_cluster,
    y = correlation,
    fill = cell_of_origin_cluster
  )
) +
  geom_violin() +
  theme_classic() +
  geom_jitter(size = 0.2)
For inter-cluster correlation, you can do the same, selecting only cells that don't belong to the same clusters, but this makes sense only if you have more than 2 clusters:
# Inter-cluster pairs: the two cells carry different cluster labels.
inter_corr <- cor_df[
  with(cor_df, cell_of_origin_cluster != other_cell_cluster),
]
# Violin plot of the inter-cluster correlation distribution,
# grouped by the cluster of the cell of origin.
ggplot(
  data = inter_corr,
  mapping = aes(
    x = cell_of_origin_cluster,
    y = correlation,
    fill = cell_of_origin_cluster
  )
) +
  geom_violin() +
  theme_classic()
Wow thank you so much pacome.pr! This is exactly what I was looking to do! Very much appreciated :)