Entering edit mode
10.6 years ago
biotech
▴
570
Hi guys,
I would you to revise my edgeR code since it's possible I missed something important because being quite new on this.
Thanks, Bernardo
############################################################
#htseq-count stats
############################################################
# rRNA and tRNA will be discarded in counts file because the arbitrary mapped reads to these regions
# NOTE: minimun alignment quality is set to 10
# '-t CDS -i ID' will exclude rRNA and tRNA. Also 'Parent' will give the correct locus tag name to each 'feature' in count table.
python -m HTSeq.scripts.count -m intersection-nonempty -a 10 -t CDS -i ID 14.sam HS372.gff > 14.counts
python -m HTSeq.scripts.count -m intersection-nonempty -a 10 -t CDS -i ID 15.sam HS372.gff > 15.counts
python -m HTSeq.scripts.count -m intersection-nonempty -a 10 -t CDS -i ID 16.sam HS372.gff > 16.counts
#Last lines from .counts files
#14.counts
__no_feature 271363
__ambiguous 6
__too_low_aQual 137308
__not_aligned 133836
__alignment_not_unique 0
#15.counts
__no_feature 346247
__ambiguous 3
__too_low_aQual 148014
__not_aligned 135920
__alignment_not_unique 0
#16.counts
__no_feature 1067474
__ambiguous 0
__too_low_aQual 136484
__not_aligned 110488
__alignment_not_unique 0
############################################################
#edgeR
############################################################
#R code
library(edgeR)
library(WriteXLS)
dir () # The tag counts for the two individual libraries are stored in two separate plain text files. In each file, the tag IDs and counts for each tag are provided in a table
targets <- read.delim("targets.txt", stringsAsFactors = FALSE)
targets
# files group description
#1 14.counts biofilm biofilm F9
#2 15.counts planktonic planktonic F9
#3 16.counts stationary stationary F9
RG <- readDGE(targets)
colnames(RG) <- c("biofilm","planktonic","stationary")
RG
dim(RG)
#filter low expressed transcripts
keep <- rowSums(cpm(RG)>1) > 1 #we keep genes that achieve at least one count per million (cpm) in at least TWO samples
RG <- RG[keep,]
dim(RG)
RG$samples$lib.size <- colSums(RG$counts) # After filtering, it is a good idea to reset the library sizes:
RG$samples
#Normalization
RG <- calcNormFactors(RG) # Apply TMM normalization
RG$samples # se manual
RG
############################################################
#Bio_vs_Plank
############################################################
bcv <- 0.2 # Assume a low BCV value of 0.2. The BCV (square root of the common dispersion) here is 20%, stilgthly higher than a typical size for a laboratory experiment with a cell line or a model organism.
et <- exactTest(RG, pair=c("planktonic","biofilm"),dispersion=bcv^2) # exactTest. dispersion = 0.04
et
class(et)
top <- topTags(et) # Top ten differentially expressed tags with their p-values
top
class(top)
cpm(RG)[rownames(top), ] # Check the individual cpm values for the top ten genes
summary(de <- decideTestsDGE(et, p=0.05, adjust="BH")) # The total number of differentiallly expressed genes at FDR< 0.05
# Here the entries for -1, 0 and 1 are for down-regulated, non-differentially expressed and up-regulated tags respectively.
#-1 54
#0 2153
#1 52
detags <- rownames(RG)[as.logical(de)] # detags contains the DE genes at 5% FDR
head (detags)
plotSmear(et, de.tags=detags, ylim=c(-7,7), xlim=c(0,15), cex.lab=1.4, cex.axis=1) # plot all genes and highlight DE genes at 5% FDR
abline(h=c(-1, 1), col="blue") # The blue lines indicate 2-fold changes.
title("Biofilm vs planktonic")
dev.copy2pdf(file = "Figure_1.pdf") #Save as .pdf##
# NOTE -> adjust 'n' depending on the available number of genes
Bio_vs_Plank <- topTags(et, n=2259, adjust.method="BH", sort.by="logFC")
# export excel file
x <- Bio_vs_Plank$table
WriteXLS("x","Bio_vs_Plank.xls", row.names = TRUE, col.names = TRUE)
# export genes for TopGO. DEG_up_pvalue_0.05
x.df <- Bio_vs_Plank$table
x.sub <- subset(x.df, logFC > 0 & PValue < 0.05)
x.sub
DEG_up_Pvalue_005 <- rownames(x.sub)
write.table(DEG_up_Pvalue_005, "Bio_vs_Plank_DEG_up_pvalue_005.txt", quote = FALSE, row.names = FALSE, col.names = FALSE)
# export genes for TopGO. DEG_down_pvalue_0.05
x.df <- Bio_vs_Plank$table
x.sub <- subset(x.df, logFC < 0 & PValue < 0.05)
x.sub
DEG_down_Pvalue_005 <- rownames(x.sub)
write.table(DEG_down_Pvalue_005, "Bio_vs_Plank_DEG_down_pvalue_005.txt", quote = FALSE, row.names = FALSE, col.names = FALSE)
# export reference gene set
reference_set <- rownames(RG$counts)
write.table(reference_set, "reference_set.txt", quote = FALSE, row.names = FALSE, col.names = FALSE)
please edit your post and format it better, it is too long and properly readable at this time
Sorry, I was counter editing, went raw again, I made a gist for it, and was trying to embed
gist fails with
\gist anonymous/49dd230bd6532e3aec2a
the way it works now is that you just paste the actual url to the gist and will embed it (see the main post)
Ah, great :)
The code has been updated.