Hello,
You can run a loop over your entire dataset and test each variable in an independent test. This normally takes a lot of time, but you can parallelise these statistical tests in R in order to make them run to completion a lot quicker.
Here ( R functions edited for parallel processing ), I have created some parallelised code for running a condititonal logistic regression model across 100s of thousands of variables. It processes them in blocks of 5000.
All variants/data-points are stored in the data-frame conditional
in this code.
require(doParallel)
cpucores <- 32
setMKLthreads(cpucores)
cores <- makeCluster(detectCores(), type='PSOCK')
registerDoParallel(cores)
Sys.setenv("MC_CORES"=cpucores)
#Create en empty list to hold the formulae
formula.list <- list()
#Store each possible formula in the list
for (j in 1:ncol(conditional))
{
formula.list[[j-11]] <- as.formula(paste("CaseControl ~ strata(FID) + age + gender + PC1 + PC2, colnames(conditional)[j], sep=""))
}
#Run a base model without the SNP - this will replace later models that fail
formula <- as.formula(paste("CaseControl ~ strata(FID) + age + gender + PC1 + PC2 + ", interactions[k], sep=""))
basemodel <- clogit(formula, conditional, method="breslow", ties="breslow", singular.ok=TRUE)
varnames <- paste(paste("chr", map[,1], sep=""), map[,2], map[,4], sep=":")
blocks <- floor((length(varnames))/5000)+1
for (l in 1:blocks)
{
#Run the conditional logistic regression using mclappy, i.e., multiple cores
#First block
if (l==1)
{
print(paste("Processing 5,000 variants, batch ", l, " of ", blocks, sep=""))
print(paste("Index1: 1; ", "Index2: ", (5000*l), sep=""))
models <- mclapply(formula.list[1:(5000*l)], function(x) clogit(x, conditional, method="breslow", ties="breslow", singular.ok=TRUE))
names(models) <- varnames[1:(5000*l)]
}
#Final block
if (l==blocks)
{
print(paste("Processing final batch ", l, " of ", blocks, sep=""))
print(paste("Index1: ", (1+(5000*(l-1))), "; ", "Index2: ", length(formula.list), sep=""))
models <- mclapply(formula.list[(1+(5000*(l-1))):length(formula.list)], function(x) clogit(x, conditional, method="breslow", ties="breslow", singular.ok=TRUE))
names(models) <- varnames[(1+(5000*(l-1))):length(formula.list)]
}
#Any other blocks
if (l>1 && l<blocks)
{
print(paste("Processing 5,000 variants, batch ", l, " of ", blocks, sep=""))
print(paste("Index1: ", (1+(5000*(l-1))), "; ", "Index2: ", (5000*l), sep=""))
models <- mclapply(formula.list[(1+(5000*(l-1))):(5000*l)], function(x) clogit(x, conditional, method="breslow", ties="breslow", singular.ok=TRUE))
names(models) <- varnames[(1+(5000*(l-1))):(5000*l)]
}
#If any models failed, detect them and replace with the base model
wObjects <- mclapply(models, function(x) if (is.null(names(x))) { x <- basemodel } else {x} )
#Exract information from the model and store in a new list
wObjects <- mclapply(wObjects, function(x) data.frame(rownames(summary(x)$coefficients), summary(x)$coefficients))
#Remove age, gender, PC1, and PC2 information
wObjects <- mclapply(wObjects, function(x) x[grep("age|gender|PC1|PC2", rownames(x), invert=TRUE),])
#Convert the list into a data frame for writing
wObject <- do.call(rbind, lapply(wObjects, data.frame, stringsAsFactors=FALSE))
wObject <- data.frame(gsub("\\.[a-zA-Z0-9_():]*", "", rownames(wObject)), wObject)
wObject[,2] <- gsub("factor", "", wObject[,2])
#Output the results to disk
write.table(wObject, "clogit.tsv", sep="\t", quote=FALSE, col.names=FALSE, row.names=FALSE, append=TRUE)
}
Thank you! Will give this a shot!
Also just realised that SAM seems to be an option...
Please use
ADD COMMENT/ADD REPLY
when responding to existing posts to keep threads logically organized.