How to compare and categorize enrichment CO terms from multiple files?
I need to find a way to substitute more specific child GO terms with less specific parent GO terms since my remaining vision is not good enough for comparing them by visually looking at the enriched GO terms. I have copy-pasted my latest version of the R code I am currently using at the end of this email. Could anyone please assist me in accomplishing this or connect me to somebody, who might? I only have 4 weeks left to finish my thesis. If I cannot defend by the end of March my funding will be discontinued.
Could you please respond to me directly via email (Thomas.F.Hahn2@gmail.com) or Skype (tfh002) or cell phone (318 243 3940) because my text to speech software, on which I am depending to access electronic information due to my very limited remaining vision, cannot read out all portions of this website aloud to me? Fortunately, it is much more compatible with the G-mail and Skype interface thus allowing me to communicate much more efficiently and reliably when using those.
I am very grateful for any kind of assistance in earning a PhD in bioinformatics despite being almost blind.
With very warm regards,
Thomas Hahn
Cell phone: 318 243 3940
Skype ID: tfh002
Email: Thomas.F.Hahn2@gmail.com
#################################################################################################
###
### This script combines the two subscripts that preparation and plotting of the log-fold
### yeast expression changes under CR and YEPD
###
### Additional (minor) changes:
### -
###
### Feb 20, 2015
###
#################################################################################################
message("Clean up workspace")
rm(list=ls(all=T))
#------------------------------------------------------------------------------------------------
### Generate scatter plot input file from:
### 1. Annotation file from Expression Lab (Affy)
### 2. Expression file from Expression Lab (Affy)
### Creates:
### 1. *.for.scatterplot.txt file in the <dir.scatter> folder in the current working directory
message("Create Scatterplot input files...")
setwd("D:/Analysis")
## INPUT FILES (relative or full path)
file="Our_data_from_expression_console.txt" # file with data from expression console
annotation = read.delim("Our_data_annotation.txt", header = F) # annotation for replicates
## OUTPUT directory
my.directory = dir.scatter="2015_02_26_Erg6"
chosen.folder = "chosen_genes"
#######################################################
replicates = tapply(annotation[,1], annotation[,2], list)
normdata = read.delim(file, check.names = F)
####Inserted by Patrick on Thursday, February 26th to set the minimum threshold to 7.6
adaptLowerBound <- TRUE # set to FALSE if no adaption so take place !!!
if (adaptLowerBound) {
tmp.colIndices <- 2:7 # column indices of expression values !!!
lowerBound <- 7.61 # hard-coded lower boundary of expression values !!!
normdata.orig <- normdata
normdata.sub <- normdata[,tmp.colIndices]
normdata.sub[which(normdata.sub <= lowerBound, arr.ind=T)] <- lowerBound
normdata[,tmp.colIndices] <- normdata.sub
}
# calculate the mean for replicates and add them to the table
for (i in 1:length(replicates)){
new.data=apply(normdata[,as.character(unlist(replicates[i])),drop=FALSE] ,2, function(x)as.numeric(as.character(x)))
normdata[, names(replicates)[i]] = rowMeans(new.data)
}
# get the ratios that we want
ratios = t(combn(names(replicates),2))
write.ratios = vector()
for (i in 1:nrow(ratios)){
ratio = paste(ratios[i,], collapse="/")
normdata[,ratio] = 2^(normdata[,ratios[i, 1]])/2^(normdata[,ratios[i,2]])
write.ratios = c(write.ratios, ratio)
}
for (i in 1:nrow(ratios)){
ratio = paste(ratios[i,2], ratios[i,1], sep="/")
normdata[,ratio] = 2^(normdata[,ratios[i, 2]])/2^(normdata[,ratios[i,1]])
write.ratios = c(write.ratios, ratio)
}
normdata$Systematic = normdata[,"Representative Public ID"]
normdata$Common = normdata[, "Gene Symbol"]
normdata$Probesets = normdata[, "Probe Set ID"]
multiple=names(which(table(normdata$Systematic)>1))
# remove the ones with multiple probes and leave only the conditions and ratios
final.normdata=normdata[!(as.character(normdata$Systematic) %in% multiple), c("Probesets","Systematic","Common",names(replicates), write.ratios)]
pombe = grep("^SP",as.character(final.normdata$Systematic))
affy = grep("^AF", as.character(final.normdata$Systematic))
remove = c(pombe, affy)
if (length(remove)>0){
final.normdata = final.normdata[-remove,]
}
normfile=gsub(".txt", ".for.scatterplot.txt", file)
dir.create(dir.scatter, showWarnings=F)
fn.scatterInput <- file.path(dir.scatter, normfile)
write.table(final.normdata, fn.scatterInput, sep="\t", row.names=FALSE, col.names=TRUE, quote=F)
message(" Done.")
#------------------------------------------------------------------------------------------------
### Generate scatter plot from:
### 1. *.for.scatterplot.txt file in the "Scatterplot_out" folder in the current working directory (not required to be specified)
### 2. *.txt list of genes
### Creates:
### 1. *.for.scatterplot.txt file in the "Scatterplot_out" folder in the current working directory
message("Create Scatterplot...")
#setwd("D:/Tang/Patrick/Friday_Feb20_2015")
## INPUT FILES
# read in the file with chosen genes from a txt file
# give the names of the chosen files, the colors and the labels
chosen.input.files = c(
"Ergosterol SGD 150 genes.txt",
"Dietary Restriction 72 genes.txt",
"Autophagy SGD 61 genes.txt",
"Sphingolipids 12 genes.txt",
"ER lipid enzymes 9 genes.txt",
"Flipases 7 genes.txt",
"mito lipid enzymes 2 genes.txt",
"Vacuolar Lipid Enzymes 2 genes.txt",
"Phospholipid binding 16 genes.txt",
"Glucose transport 9 genes.txt"
)
# what is the gene universe
#universe=read.delim(file=fn.scatterInput, header=T)
#universe$Systematic=gsub("[.]S1", "", universe$Systematic)# replaces .S1 with empty strings
universe=read.delim(file=fn.scatterInput, header=T)
universe$Systematic=gsub("[.]S1", "", universe$Systematic)# replaces .S1 with empty strings
# select the columns
yaxis="Erg6c.WTc"
#yaxis="Our.WTc.Our.WTy"
#yaxis="WTy"
#yaxis = colnames(universe)[4]#sets the Y axis to the name of the last coumn
#yaxis = colnames(universe)[ncol(universe)]#sets the Y axis to the name of the last coumn
#yaxis=ncol(universe)
#yaxis="Ag15c.Atg15yR1"
yratio=1.5
#xaxis=colnames(universe)[5]
xaxis="Erg6y.WTy"
#xaxis="WT.CR.2.WT.2"
#xaxis="X2.G.12.h"
xratio=1.5
logarithm = TRUE # for the ratio the logarithm should be set to TRUE
enrichment = TRUE
my.legend = TRUE
Brown = FALSE
no.black = FALSE # if set to TRUE it does not do enrichment for the chosen genes. True means it will do everything, except for the chosen genes
black.only = FALSE # put it to FALSE if you want all genes
#Black = TRUE# it does enrichment only for the black genes (put it to FALSE to do it for everybody). False means that it enriches the over and under expressed but not the chosen
if (enrichment == TRUE){
library(GOstats)
library(org.Sc.sgd.db)
}
exclusion.quadrant = FALSE
xaxis.limit = 7.61
yaxis.limit = 7.61
exclusion.color = "violet"
################################
number.of.lines = 5
chosen.colors = c("cornflowerblue", "gray53", "seagreen4", "blue", "deeppink3", "saddlebrown", "red", "black", "blueviolet", "darkblue")
chosen.labels = c("cornflowerblue", "gray53", "seagreen4", "blue", "deeppink3", "saddlebrown", "red", "black", "blueviolet", "darkblue")
my.directory = "2015_02_26_Erg6"
#####################################################################
### choose colors for the points
above.line.color = "yellow"
below.line.color = "cyan"
handpicked.color = "saddlebrown"
middle.genes.color = "moccasin"
above.line.label = "red"
below.line.label = "blue"
handpicked.label = "burlywood4"
middle.genes.label = "navajowhite4"
### choose colors for the lines
upper.diagonal.color = "yellow"
lower.diagonal.color = "yellow"
left.vertical.line = "yellow"
right.vertical.line = "yellow"
upper.horizontal.line = "yellow"
lower.horizontal.line = "yellow"
# define the shapes
shape.above.line = 17
shape.below.line = 15
legend.size = 0.4# font size
point.size = 0.4# size of points in scatter plot
show.above.names = "TRUE" #you can change it to FALSE
show.below.names = "TRUE"
show.middle.names = "FALSE"
show.chosen = "TRUE"
# define parameters for gene ontology enrichment
hgCutoff <- 0.05
FDR.thresh=0.1
text.cex=0.25 # This number tells the font size of the selected genes.
#######################################################################
#######################################################################