Hello all, I am doing gene expression profiling of 3 structures(BB, 79A, 28z). I have 9 samples from 3 donors. Each donor has 3 samples, representing 3 structures. For example Donor 1 has structure BB, 79A, 28z , Donor 2 has structure BB, 79A, 28z Donor 3 has structure BB, 79A, 28z. I would like to see the differentially expressed gene of structure 79A. I compare structure 79A vs 28z, 79A vs BB and 28z vs BB. I have attached the pheno column in the picture. Currently I got only 8 samples as another 1 is still in the process of sequencing but I want to try analyze these 8 samples first. I am trying to write my script in glm approach. I am a beginner and quite new to edgeR and I would like to ask for your help to review and confirm whether what I wrote is correct. I have got DEGs from the scripts below however I am not quite reliable with the result. Thank you in advance!!
library(edgeR)
counts <- read.table('NewCounting_file.txt')
#create gene name and gene id variable
gene_name <- counts[,2]
gene_id <- counts[,1]
#Delete gene id and gene name column on counts
counts[1:2] <- list(NULL)
counts_mat <- apply(as.matrix.noquote(counts),2,as.numeric)
row.names(counts_mat) <- gene_name
#Assign group as column name
sample_name <- c("MD7_BB","MD7_79A","MD7_28","BD7_BB","BD7_79A","BD7_28","JD7_79A","JD7_28")
colnames(counts_mat) <- sample_name
## read phenotype data in
group <- c("MD7_BB","MD7_79A","MD7_28","BD7_BB","BD7_79A","BD7_28","JD7_79A","JD7_28")
Pheno <- read.csv("mypheno.csv", stringsAsFactors = TRUE)![enter image description here][1]
dglist <- DGEList(counts=counts_mat, samples = Pheno, group = group)
keep <-filterByExpr(dglist)
summary(keep)
dglist <- dglist[keep, ,keep.lib.sizes=FALSE]
dglist <- calcNormFactors(dglist, method="TMM")
#Design matrix
donors <- factor(dglist$sample$Donors)
groups <- factor(dglist$sample$Structures)
design <- model.matrix(~0 + donors + groups)
dglist<- estimateDisp(dglist, design, robust=TRUE)
dglist$common.dispersion
plotBCV(dglist)
fit <- glmQLFit(dglist,design, robust=TRUE)
##############DGE
##To detect genes that are differentially expressed btw any of three treatments
qlf <- glmQLFTest(fit, coef=4:5)
topTags(qlf)
FDR <- p.adjust(qlf$table$PValue, method='fdr')
sum(FDR < 0.05)
summary(decideTests(qlf))
top <- rownames(topTags(qlf))
cpm(dglist)[top,]
summary(decideTests(qlf))
plotMD(qlf)
abline(h=c(-1,1), col="blue")
###############################
#detect gene that are differential expressed in 79A vs BBz
qlf_79A_BBz <- glmQLFTest(fit, coef=4)
topTags(qlf_79A_BBz)
#total num of genes significantly up-regulated or down regulated at 5%FDR
summary(decideTests(qlf_79A_BBz))
top <- rownames(topTags(qlf_79A_BBz))
cpm(dglist)[top,]
plotMD(qlf_79A_BBz)
abline(h=c(-1,1), col="blue")
qlf_79A_BBz$table <- cbind(qlf_79A_BBz$table, FDR=p.adjust(qlf_79A_BBz$table$PValue, method ='fdr'))
gsign_79A_BBz <- qlf_79A_BBz$table[qlf_79A_BBz$table$FDR<0.05,]
gsign_79A_BBz <- gsign_79A_BBz[order(gsign_79A_BBz$FDR),]
dim(gsign_79A_BBz)
gsign_79A_BBz_df <-data.frame(gsign_79A_BBz)
new_gsign_79A_BBz <- cbind(rownames(gsign_79A_BBz_df),gsign_79A_BBz_df)
rownames(new_gsign_79A_BBz)<-NULL
colnames(new_gsign_79A_BBz)[1]<-"genename"
write.csv(new_gsign_79A_BBz, file="new_gsign_79A_BBz.csv")
###############################
#detect gene that are differential expressed in BB vs 28z
qlf_BB_28z <- glmQLFTest(fit, coef=5)
topTags(qlf_BB_28z)
top <- rownames(topTags(qlf_BB_28z))
topFDR <- top <- rownames(topTags(qlf_BB_28z))
cpm(dglist)[top,]
summary(decideTests(qlf_BB_28z))
plotMD(qlf_BB_28z)
abline(h=c(-1,1), col="blue")
qlf_BB_28z$table <- cbind(qlf_BB_28z$table, FDR=p.adjust(qlf_BB_28z$table$PValue, method ='fdr')
gsign_BB_28z <- qlf_BB_28z$table[qlf_BB_28z$table$FDR<0.05,]
gsign_BB_28z <- gsign_BB_28z[order(gsign_BB_28z$FDR),]
gsign_BB_28z_df <-data.frame(gsign_BB_28z)
new_gsign_BB_28z <- cbind(rownames(gsign_BB_28z_df),gsign_BB_28z_df)
rownames(new_gsign_BB_28z)<-NULL
colnames(new_gsign_BB_28z)[1]<-"genename"
write.csv(new_gsign_BB_28z, file="new_gsign_BB_28z.csv")
######################################
#detect gene that are differential expressed in 28z vs 79A
qlf28z_79A <- glmQLFTest(fit, contrast=c(0,0,0,-1,1))
topTags(qlf28z_79A)
summary(decideTests(qlf28z_79A))
top <- rownames(topTags(qlf28z_79A))
cpm(dglist)[top,]
plotMD(qlf_BB_28z)
abline(h=c(-1,1), col="blue")
qlf28z_79A$table <- cbind(qlf28z_79A$table, FDR=p.adjust(qlf28z_79A$table$PValue, method ='fdr'))
gsign_28z_79A <- qlf28z_79A$table[qlf28z_79A$table$FDR<0.05,]
gsign_28z_79A <- gsign_28z_79A[order(gsign_28z_79A$FDR),]
gsign_28z_79A_df <-data.frame(gsign_28z_79A)
new_gsign_28z_79A <- cbind(rownames(gsign_28z_79A_df),gsign_28z_79A_df)
rownames(new_gsign_28z_79A)<-NULL
colnames(new_gsign_28z_79A)[1]<-"genename"
write.csv(new_gsign_28z_79A, file="new_gsign_28z_79A.csv")
Cross-posted: https://support.bioconductor.org/p/9143145/
Thank you, I thought these two are different forums, I have already deleted in Bioconductor.
If you are asking people to review your code, please remove any uncessary lines which are irrelevant to the review. For example, anything with
view()
,plot()
,head()
,summary()
etc. It's a common courtesy to keep your code as short as possible as to make it easier for others to read. Thank you.Thanks for your comments, I have already removed as suggested