I am trying to automatically label some of the data points on a manhattan plot. For those who do not know what a manhattan plot is, it really does not matter. Rewriting the old style R code in ggplot2 seems to be a bigger challenging for me. Since the only feature I want to add to the script is to be able to automatically label some data points nicely (using ggrepel), so I think maybe I could just overlay the labels on the graph plotted with the old code.
My current try is as follows:
library(ggplot2);
library(ggrepel);
d=read.table("a.txt",header=T,fill=T, sep=" ");
dmht='';
dmht<-data.frame(chrom=d[,9], txStart = d[,11], "-log10(PValue)" = -log10(d[,5]))
# sort it
o<-order(dmht[,1],dmht[,2]);
dmht<-data.frame(dmht[o,]);
names(dmht)<-c("chrom", "txStart", "-log10(PValue)");
attach(dmht);
chrs<-c('chr1','chr2','chr3','chr4','chr5','chr6','chr7','chr8','chr9','chr10','chr11','chr12','chr13','chr14','chr15','chr16','chr17','chr18','chr19','chrX','chrY');
chrLabels = c('1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13', '14', '15', '16', '17', '18', '19', 'X', 'Y');
chrLen<-rep(0,length(chrs));
chrMin<-rep(0,length(chrs));
chrMax<-rep(0,length(chrs));
totalLen = 0;
for(i in 1:length(chrs)) {
dchr<-subset(dmht, chrom == chrs[i]);
chrMin[i]<-min(as.numeric(dchr[,2]),na.rm=T);
chrMax[i]<-max(as.numeric(dchr[,2]),na.rm=T);
chrLen[i]<-chrMax[i] - chrMin[i] + 1;
totalLen = totalLen + chrMax[i];
}
ds=read.table("selected2label.txt",header=T,fill=T,sep=" ");
dsel='';
dsel<-data.frame(ds[,9], ds[,11], -log10(ds[,5]) , ds[,1])
ymin = min(as.numeric(dmht[,3], dsel[,3]), na.rm=T);
ymax = max(as.numeric(dmht[,3], dsel[,3]), na.rm=T);
# chr start positions
chrStart<-rep(0,length(chrs));
for(i in 1:length(chrs)) {
if(i == 1) {
chrStart[i] = 1;
}
else {
chrStart[i] = chrStart[i-1] + chrMax[i-1] + 1;
}
}
#dmht=subset(dmht,dmht[,1] < 24); # remove mitochondra snps
colors <- rep(c("blue", "green", "cyan"),9);
png("result/test/mhtTest.png" , width=1600);
par(las=3, lab=c(length(chrLabels),5,7))
# draw the dots
dchr<-subset(dmht, chrom == chrs[1]);
plot(as.numeric(dchr[,2])+chrStart[1], as.numeric(dchr[,3]), col=colors[1], ylim=c(ymin,ymax),xlim=c(chrMin[1],totalLen),axes=F,ylab="-log10(PValue)", xlab="Chromosome", main="");
for(i in 2:length(chrs)) {
dchr<-subset(dmht, chrom == chrs[i]);
points(as.numeric(dchr[,2])+chrStart[i], as.numeric(dchr[,3]), col=colors[i]);
}
axis(side=1,labels=chrLabels,at=chrStart);
axis(side=2);
# draw the quantiles
quants<-quantile(as.numeric(dmht[,3]), p=c(),na.rm=T);
for(q in quants) {
abline(h=q);
}
# draw the abs values
abss<-c();
for(a in abss) {
abline(h=a);
}
# sort it
#os<-order(dsel[,1],dsel[,2]);
#dsel<-data.frame(dsel[os,]);
#dsel
#dsel<-data.frame(dsel);
#dsel
colnames(dsel)<-c("chrom", "txStart", "-log10(PValue)" , 'GENE_ID');
detach(dmht);
attach(dsel);
# highlight the selected dots
for(i in 1:length(chrs)) {
dchr<-subset(dsel, chrom == chrs[i] | paste("chr", chrom, sep="") == chrs[i]);
if(length(dchr[,2]) > 0) {
print(dchr)
# this is the new code
geom_label_repel(data = dchr, aes(label = GENE_ID, x=as.numeric(txStart)+chrStart[i], y = as.numeric(dchr[,3])), size = 5, box.padding = unit(0.35, "lines"), point.padding = unit(0.5, "lines"))
# replacing the line in the old script:
# text(as.numeric(dchr[,2])+chrStart[i], as.numeric(dchr[,3]), dchr[,4])
}
}
dev.off();
The only line I replaced (besides adding "libary(ggplot2)" & "library(ggrepel)") was:
# this is the new code
geom_label_repel(data = dchr, aes(label = GENE_ID, x=as.numeric(txStart)+chrStart[i], y = as.numeric(dchr[,3])), size = 5, box.padding = unit(0.35, "lines"), point.padding = unit(0.5, "lines"))
# replacing the line in the old script:
# text(as.numeric(dchr[,2])+chrStart[i], as.numeric(dchr[,3]), dchr[,4])
The problem is that the labelling does not show up at all.
Your help would be highly appreciated.
"text(..)" works, but it just that the labels could jumble together. Not sure if gridBase has the same problem. The reason I want to use "ggrepel" is that it can automatically label data points without overlaying the text on each other.
From what I understand, gridBase allows you to mix base and grid graphics. So you can use geom_label_repel on top of your plot. See my updated answer. But again, I'm just guessing as I haven't tried it.
Tried what you suggested and various other combinations without luck. Taking a break.
Really wish R were OO-R. :)