Hello,
I'm using the program 16Stimator (paper here and source here) to estimate 16S rRNA gene copy number from draft genomes. It's comprised of a shell script that calls several other programs and R scripts. I run R on a linux cluster and have gotten most scripts to run successfully, but upon running the R script Estimate_16S23S_PriceBonCIR1R2.R
, I receive the following error:
Rscript ./Scripts/Estimate_16S23S_PriceBonCIR1R2.R $Isolate $Lib $ReadLen $n
Error in if (Df16S$GeneLen[r] > 1400) { :
missing value where TRUE/FALSE needed
Execution halted
The required seqinr
packages is installed. I've found a general answer regarding this error here, but I can't seem to gleen enough information to apply it to my situation. I haven't found anyone else running into this problem. I should mention that the authors state that16Stimator requires R v3.1.0, and I'm using R v3.1.1. I'm not sure if this is significant or not (E.g. some syntax has changed during the version change). Thank you in advance for your insight. Here's the entire script below:
## Estimate 16S and 23S copy numbers
CurrentDir = getwd()
setwd(CurrentDir)
## Start code
args = commandArgs(trailingOnly = TRUE)
Isolate = as.character(args[1])
Lib = as.character(args[2])
ReadLen = as.numeric(args[3])
Seq = as.numeric(args[4])
## Load in gene GC% table and mapped read table
GCtab = read.table(paste("./Beds/", Isolate, ".", Lib, ".gene.gc.txt", sep = ""),
sep = "\t", header = FALSE, stringsAsFactors = FALSE)
colnames(GCtab) = c("Node", "Start", "End", "Gene", "pct_at", "pct_gc", "num_A",
"num_C", "num_G", "num_T", "num_N", "num_oth", "seq_len")
MappedReads = read.table(paste("./Beds/", Isolate, ".", Lib, ".R", Seq,
".gene.mapped.reads.txt", sep = ""),
sep = "\t", header = FALSE, quote = "",
comment.char = "", stringsAsFactors = FALSE)
MappedReads = MappedReads[,(ncol(MappedReads) - 4):ncol(MappedReads)]
colnames(MappedReads) = c("Node", "Start", "End", "Gene", "Overlap")
RDPerGene = data.frame(table(MappedReads$Gene), stringsAsFactors = FALSE)
colnames(RDPerGene) = c("Gene", "Reads")
RDPerGene$Gene = as.character(RDPerGene$Gene)
## Apply GC% correction (Yoon et al, 2009)
## multiply coverage for each gene win by
## looking up corresponding GC cov correction factor
CovCorrectDF = read.csv(paste("./CovCorrect/", Isolate, ".", Lib, ".R", Seq,
".CovCorrectForGC.csv", sep = ""), header = TRUE)
Gene_gc = numeric()
Reads_gc = numeric()
## Gene name has class, name, len, win, separated by "_"
Class = character()
Name = character()
GeneLen = numeric()
GeneWin = numeric()
for(q in 1:nrow(RDPerGene)){
Gene_gc[q] = (round(GCtab[which(GCtab$Gene == RDPerGene$Gene[q]),]$pct_gc,
digits = 2))
Reads_gc[q] = RDPerGene$Reads[q] *
CovCorrectDF[CovCorrectDF$GCprop == Gene_gc[q],]$CovCorrect
String = strsplit(RDPerGene$Gene[q], "_")
Class[q] = String[[1]][1]
Name[q] = String[[1]][2]
GeneLen[q] = as.numeric(String[[1]][3])
GeneWin[q] = as.numeric(String[[1]][4])
}
RDPerGene = data.frame(RDPerGene, Gene_gc, Reads_gc, Class, Name, GeneLen, GeneWin,
stringsAsFactors = FALSE)
write.csv(RDPerGene, paste("./CovCorrect/", Isolate, ".", Lib, ".R", Seq,
".GCorCovPerGeneWin.csv", sep = ""),
quote = FALSE, row.names = FALSE)
## Calculate median 16S/23S coverages, take account of full and partial genes
## if multiple full length 16S or 23S, then sum coverages for each win
## if partial 16S or 23S, then just add wins to vector
Df16S = RDPerGene[which(RDPerGene$Class == "16S"),]
Big16S = data.frame()
Lit16S = data.frame()
SixteenCov = numeric()
for(r in 1:nrow(Df16S)){
if(Df16S$GeneLen[r] > 1400){
Big16S = rbind(Big16S, Df16S[r,])
}else{
Lit16S = rbind(Lit16S, Df16S[r,])
}
}
if(nrow(Big16S) > 0){
for(s in as.numeric(unique(Big16S$GeneWin))){
SixteenCov = c(SixteenCov, sum(Big16S[which(Big16S$GeneWin == s),]$Reads_gc))
}
}
if(nrow(Lit16S) > 0){
SixteenCov = c(SixteenCov, as.numeric(Lit16S$Reads_gc))
}
Df23S = RDPerGene[which(RDPerGene$Class == "23S"),]
Big23S = data.frame()
Lit23S = data.frame()
TwentyThreeCov = numeric()
for(t in 1:nrow(Df23S)){
if(Df23S$GeneLen[t] > 2800){
Big23S = rbind(Big23S, Df23S[t,])
}else{
Lit23S = rbind(Lit23S, Df23S[t,])
}
}
if(nrow(Big23S) > 0){
for(u in as.numeric(unique(Big23S$GeneWin))){
TwentyThreeCov = c(TwentyThreeCov, sum(Big23S[which(Big23S$GeneWin == u),]$Reads_gc))
}
}
if(nrow(Lit23S) > 0){
TwentyThreeCov = c(TwentyThreeCov, as.numeric(Lit23S$Reads_gc))
}
## Apply confidence intervals from Price + Bonett 2002
## Distribution-Free Confidence Intervals for Difference and Ratio of Medians
## Set alpha to desired confidence level
alpha = 0.05
SingCovs = RDPerGene[which(RDPerGene$Class == "Sing"),]$Reads_gc
## Calculate medians for each class (16S, 23S, SingCopyGenes)
Med16S = median(SixteenCov)
Med23S = median(TwentyThreeCov)
MedSings = median(SingCovs)
Ord16S = SixteenCov[order(SixteenCov)]
Ord23S = TwentyThreeCov[order(TwentyThreeCov)]
OrdSings = SingCovs[order(SingCovs)]
c16S = round((length(Ord16S) + 1)/2 - sqrt(length(Ord16S)), digits = 0)
c23S = round((length(Ord23S) + 1)/2 - sqrt(length(Ord23S)), digits = 0)
cSings = round((length(OrdSings) + 1)/2 - sqrt(length(OrdSings)), digits = 0)
if(length(Ord16S) < 132){
p16S = 0
for(i in 1:(c16S - 1)){
p16S = p16S + choose(length(Ord16S), i)
}
p16S = p16S * 0.5^length(Ord16S)
z16S = qnorm(1 - p16S)
}else{
z16S = 2
}
if(length(Ord23S) < 132){
p23S = 0
for(i in 1:(c23S - 1)){
p23S = p23S + choose(length(Ord23S), i)
}
p23S = p23S * 0.5^length(Ord23S)
z23S = qnorm(1 - p23S)
}else{
z23S = 2
}
if(length(OrdSings) < 132){
pSings = 0
for(i in 1:(cSings - 1)){
pSings = pSings + choose(length(OrdSings), i)
}
pSings = pSings * 0.5^length(OrdSings)
zSings = qnorm(1 - pSings)
}else{
zSings = 2
}
N16S = ((log(Ord16S[length(Ord16S) - c16S + 1]) -
log(Ord16S[c16S]))/(2 * z16S))^2
N23S = ((log(Ord23S[length(Ord23S) - c23S + 1]) -
log(Ord23S[c23S]))/(2 * z23S))^2
NSings = ((log(OrdSings[length(OrdSings) - cSings + 1]) -
log(OrdSings[cSings]))/(2 * zSings))^2
zalpha = qnorm(1 - alpha/2)
Est16S = Med16S/MedSings
LowEst16S = (Med16S/MedSings) * exp(-zalpha * sqrt(N16S + NSings))
HighEst16S = (Med16S/MedSings) * exp(zalpha * sqrt(N16S + NSings))
Est23S = Med23S/MedSings
LowEst23S = (Med23S/MedSings) * exp(-zalpha * sqrt(N23S + NSings))
HighEst23S = (Med23S/MedSings) * exp(zalpha * sqrt(N23S + NSings))
Df1623S = data.frame(Isolate, Lib, Seq, Est16S, LowEst16S, HighEst16S,
Est23S, LowEst23S, HighEst23S,
stringsAsFactors = FALSE)
write.csv(Df1623S, paste("./Estimates/", Isolate, ".", Lib, ".R", Seq,
".CopyNumEst.csv", sep = ""),
quote = FALSE, row.names = FALSE)
print("Estimated 16S and 23S copy numbers with Price + Bonett CIs")
So after a bit of investigation, here's what I've come up with:
print
commands showed what each variable contains:Since the variable RDPerGene and it's corresponding output to *.csv file DO contain the information it's supposed to, it seems to me like there's a syntax or contextual issue with how the following commands are assigning information to the Df16S variable:
As I said in another comment, I'm very new to R and unfamiliar with syntax. Also of note, is that I'm using R v3.1.1, and the author's code was written in R v3.1.0; I'm unsure if there were any major (or minor) changes to syntax, etc.
Does anyone have any further ideas for this issue? Perhaps I'm thinking about this incorrectly?