Hi there,
I'm currently running a PCA on an RNAseq dataset. I'm rather new to both R and PCA but so far I have a grip on things besides this issue:
I've managed to plot PC1 vs PC2 for 500 most variable genes and it turns out well. I was able to assign colours to experimental groups (CTRL and SA) and wanted to add an additional level of Sex on the plot. After doing so, however, if you take a good look at colours of the data points, only some of them are somehow wrong (I've confirmed that the data point shape for sex is correct). Can someone please figure out what I'm doing wrong? I don't see why setting col=pca_df$col, suddenly doesn't work after sex has been added as a variable.
# construct data frame w/ PC loadings and add subject labels
pca_df <- as.data.frame(pca$x)
# add a column containing experimental group
pca_df$Group <- dds@colData$Group
# add column containing subject IDs
pca_df$BrainID <- colnames(dds)
# add colors for plotting to df
pca_df$col <- NA
for(i in 1:length(levels(pca_df$Group))){
ind1 <- which(pca_df$Group == levels(pca_df$Group)[i])
pca_df$col[ind1] <- i
}
# plot PC1 vs PC2
plot(pca_df[, 1], pca_df[, 2],
xlab = paste0("PC1 (", (round(percentVar[1], digits=3)*100), "% variance)"),
ylab = paste0("PC2 (", (round(percentVar[2], digits=3)*100), "% variance)"),
main=paste0("PC1 vs PC2 for ", var_feature_n, " most variable genes"),
pch=16, cex=1.35, cex.lab=1.3, cex.axis = 1.15, las=1,
panel.first = grid(),
col=pca_df$col)
# add subject names to data points
text((pca_df[, 2])~(pca_df[, 1]), labels = pca_df$BrainID, cex=0.6, font=2, pos=4)
# add variable to PCA dataframe for sex
pca_df$Sex <- NA
# add a column containing Sex
pca_df$Sex <- dds@colData$Sex
# add variable to PCA dataframe for age
pca_df$Age <- NA
# add a column containing age
pca_df$Age <- dds@colData$Age
# add variable to PCA dataframe for Substance @ death
pca_df$SubstanceDeath <- NA
# add a column containing Substance @ death
pca_df$SubstanceDeath <- dds@colData$SubstanceDeath
# adding sex as variable
# convert string to factor
pca_df$Sex <- factor(pca_df$Sex, levels = c("Female", "Male"))
# plot PC1 vs PC2 but only for female subjects
plot(pca_df[pca_df$Sex=="Female", 1], pca_df[pca_df$Sex=="Female", 2],
xlab = paste0("PC1 (", (round(percentVar[1], digits=3)*100), "% variance)"),
ylab = paste0("PC2 (", (round(percentVar[2], digits=3)*100), "% variance)"),
main=paste0("PC1 vs PC2 for ", var_feature_n, " most variable genes"),
pch=16, cex=1.35, cex.lab=1.3, cex.axis = 1.15, las=1,
panel.first = grid(),
xlim=c(-14,20),
ylim=c(-14,20),
col= pca_df$col)
# add points for Male subjects but use different shape
points(pca_df[pca_df$Sex=="Male", 1], pca_df[pca_df$Sex=="Male", 2],
pch=2, cex=1.35,
col= pca_df$col)
# add legend
legend(14, -5, levels(pca_df$Sex), pch = c(16, 2))
legend(8, -5, levels(pca_df$Group), pch = 16, col =pca_df$col)
# add subject names as text to points
text((pca_df[, 2])~(pca_df[, 1]), labels = pca_df$Group, cex=0.6, font=2, pos=4)
Thank you so much! :)