In an effort to encourage others to be more explicit with source code I will try to set a good example.
This is for human mirnas illumina 1.5 adapters, alignment using Novoalign, R, Bioconductor, and DESeq:
shell:
wget ftp://mirbase.org/pub/mirbase/CURRENT/hairpin.fa.gz .
gunzip -c hairpin.fa.gz > newhairpin.fa
perl -ne 'unless(/^>/){s/U/T/g;}print' < newhairpin.fa > hairpin.dna.fa
perl -ne 'if(/>hsa/){print;$_=<>;print}' hairpin.dna.fa > hsa_hairpin.dna.fa
novoindex -m hairpin.ndx hsa_hairpin.dna.fa
novoalign -a ATCTCGTATGCCGTCTTCTGCTTG -d hairpin.ndx -F ILMFQ -f control.export.txt.fq -m -l 17 -h 60 -t 65 -o sam -o FullNW > control.export.txt.fq.hairpin.sam
novoalign -a ATCTCGTATGCCGTCTTCTGCTTG -d hairpin.ndx -F ILMFQ -f condition.export.txt.fq -m -l 17 -h 60 -t 65 -o sam -o FullNW > condition.export.txt.fq.hairpin.sam
for f in *hairpin.sam;
do echo "samtools view -b -S $PWD/$f -t $PWD/refs/hairpin_dna.fa > $PWD/$f.bam;
samtools sort $PWD/$f.bam $PWD/$f.sorted;
samtools index $PWD/$f.sorted.bam" | qsub ;
done
R:
library(Rsamtools)
library(Biostrings)
library(stringr)
library(plyr)
library(reshape)
library(DESeq)
rpmNormalize<-FALSE
minCount<-100
s <- read.DNAStringSet("refs/hsa_hairpin.dna.fa",use.names=TRUE)
hsa<-s[str_detect(names(s),"^hsa")]
names(hsa)<-str_match(names(hsa),"^\\S+")
hsaGR<-GRanges(names(hsa),IRanges(1,width(hsa)),strand="*")
mirna<-function(x){
xdf<-countBam(file=paste("./",x,".hairpin.bam",sep=""),param=ScanBamParam(which=hsaGR))[,c("space","records")]
names(xdf)<-c("miRNA","reads")
if(rpmNormalize){xdf$reads<-xdf$reads*1000000/sum(xdf$reads)}
xdf$variable<-x
xdf
}
conds<-c("N","T")
samples<-c("control","condition")
uncasted<-ldply(as.list(samples),.fun=mirna)
rnaSeqReadyDataFrame<-cast(uncasted,miRNA ~ variable,value="reads")
write.csv(rnaSeqReadyDataFrame,file="rawCounts.csv")
rnaSeqReadyDataFrame<-subset(rnaSeqReadyDataFrame,rowSums(rnaSeqReadyDataFrame)>minCount)
countsTable<-as.matrix(rnaSeqReadyDataFrame[,-1])
row.names(countsTable)<-rnaSeqReadyDataFrame$miRNA
cds <- newCountDataSet( countsTable, conds )
cds <- estimateSizeFactors( cds )
cds <- estimateVarianceFunctions( cds )
res <- nbinomTest( cds, "N", "T")
write.csv(res,file="deseqResults.csv")
yes I think the novoalign mirna recipe is -l 15 -t 30 -h 20 scores are backward in novoalign so my -l -h and -t are actually more liberal (since we are looking for mirna editing as well as diff exp). There is no science there but I might have seen that in the novoalign forum.
Thanks for this. I am interested how you came up with the scores for -l -h and -t for novoalign. Trail and error or calculation?
yes I think the novoalign mirna recipe is -l 15 -t 30 -h 20 scores are backward in novoalign so my -l -h and -t are actually more liberal (since we are looking for mirna editing as well as diff exp)
This is very helpful! Thanks for posting the whole script instead of just alluding to parts of the process.