Power Calculations for Methylation Study

We have methylation data at 480K sites from the illumina 450 methylation chip for 96 controls and 97 asthmatics from the ICAC project.

NOTE

see this site: http://www.ats.ucla.edu/stat/r/dae/t_test_power2.htm for calculating the joint stddev from 2 groups

sqrt((grp1.std^2 + grp2.std^2)/2)

We want to do power calculations for a future study using the general range of differences that we see in this study. Data are back-transformed to 0-1 beta values for ease of interpretation. All y-axes are in pairs of samples required. The alpha level reported is 0.05 / 400K, where we assume only 400K rather than 485K because we will remove probes known to contain SNPs and those with low variance. The power is fixed at 90%.

We show 5 figures assuming a standard deviation at the 5th, 25th, 50th, 75th and 95th percentile. We expect to have less power in when assuming the worst-case 95th percentile. We can see that is the case by looking at the 5th plot

The vertical blue lines are the same in each plot and indicate the 95th, 99th, and 99.9th percentile of the observed differences between mean asthma and mean non-asthma. With the 95th being the most conservative and requiring more pairs.

	# do power calculations for replication of ICAC dmrs.
	# input data looks like:
	#gene case_mean case_sd control_mean control_sd diff ttest.pval
	#runx3_1 89.3 6.8 91.9 6.9 -2.6 5.49E-03
	#runx3_2 82.9 6.0 85.6 6.0 -2.8 9.42E-04
	#runx3_3 80.9 5.8 83.2 5.0 -2.3 1.99E-03
	#tigit 58.7 5.5 62.1 3.9 -3.4 8.62E-03
	#il13 85.6 3.8 86.9 3.2 -1.3 4.15E-03
	#prf1 84.5 3.7 85.7 3.5 -1.2 1.98E-02
	#cat_1 3.2 1.6 3.7 2.0 -0.5 2.11E-02
	#cat_2 2.1 1.6 2.6 1.9 -0.5 2.83E-02
	#alpl 11.5 3.1 13.1 3.9 -1.7 4.20E-04
	#c2 50.0 4.6 51.6 4.0 -1.6 9.70E-03
	#klf6_1 34.0 6.9 36.4 7.9 -2.4 1.60E-02
	#klf6_2 36.0 6.6 39.1 7.1 -3.0 1.18E-03
	#nbr2 81.1 6.8 83.3 6.5 -2.2 1.34E-02

	library(pwr)

	sd.groups = function(sda, sdb){
	sqrt((sda^2 + sdb^2)/2)
	}


	df = read.csv('icac.pwr.txt', sep=" ", header=TRUE)
	rownames(df) = df[,1]

	df$n_80 = NA
	df$n_90 = NA

	newdf = data.frame()
	for (i in 1:nrow(df)){
	row = df[i,]
	print(row)
	sdboth = sd.groups(row$case_sd, row$control_sd)
	diff = row$case_mean - row$control_mean
	# calculate for 80 and 90% power at alpha = 0.05
	row$n_80 = sprintf("%.2f", pwr.t.test(d=diff / sdboth, power=0.8, sig.level=0.05, alternative="l")$n)
	row$n_90 = sprintf("%.2f", pwr.t.test(d=diff / sdboth, power=0.9, sig.level=0.05, alternative="l")$n)
	newdf = rbind(newdf, row)
	}

	write.table(newdf, file="icac.power.table.txt", quote=F, sep="\t", row.names=FALSE)
	stop("wrote icac.power.table.txt")

	library('pwr')
	#png('power.png', width=400, height=800)

	n_probes=400000

	# standard deviations from ICAC asthma study are:
	#[0.0041666408126895304,
	# 0.0077047577252945211,
	# 0.014751697565109346,
	# 0.0339926314167096,
	# 0.085141971177441933]
	# for the 5, 25, 50, 75, 95th percentiles

	pctiles = c(5, 25, 50, 75, 95)
	stds = c(0.0041666408126895304,
	0.0077047577252945211,
	0.014751697565109346,
	0.0339926314167096,
	0.085141971177441933)

	mean_pctiles = c(95, 99, 99.9)
	means = c(0.012177838422436461, 0.024153323315866573, 0.054873396731666163)




	par(mfrow=c(round(0.5 + length(stds) / 2), 2), oma=c( 0, 0, 3, 0 ))
	# go up to 2; means 100% difference at 2 probes
	diffs = seq(0.01, 1, length.out=100)

	j = 1
	for(std in stds){
	pairs = rep(NA, 100)
	i = 1
	for(difference in diffs){
	pwr.calc = pwr.t.test(d=difference / std, power=0.9, type="paired", sig.level=0.05/n_probes)
	pairs[i] = pwr.calc$n
	i = i + 1
	}
	sub=sprintf("effect std:%.3f (%ith percentile)" , std, pctiles[j])
	plot(diffs, pairs, log=c("x", "y"), xlab="methylation difference", ylab="pairs needed",
	sub=sub, type='b')
	abline(v=means, col='blue', lty=2)
	j = j + 1
	}
	mtext("vertical lines at 95th, 99th, 99.9th%\nof mean asthma:non-asthma differences in ICAC", outer=TRUE)

	library('pwr')

	n_probes=400000

	pctiles = c(50, 75, 90)
	# stds observed in icac study 450k data.
	stds = c(0.0077047577252945211,
	0.014751697565109346,
	0.0339926314167096
	)

	# we want to detect the largest differences in means.
	mean_pctiles = c(95, 99, 99.9)
	means = c(0.012177838422436461,
	0.024153323315866573,
	0.054873396731666163)


	mean_pctiles = c(90, 95, 99, 99.9)
	means = c(0.0070, 0.0107, 0.02204, 0.0531)

	pctiles = c(25, 50, 75, 90)
	stds = c(0.0076, 0.01432, 0.0326, 0.0575)


	stouffer_liptak = function(pvalues, adjust=FALSE, lower.tail=TRUE){
	qvalues = qnorm(pvalues, mean=0, sd=1, lower.tail=lower.tail)
	if(adjust){
	sigma = adjust
	C = chol(sigma)
	Cm1 = solve(C) # C^-1
	qvalues = Cm1 %% qvalues # Qstar = C^-1 Q
	}
	Cp = sum(qvalues) / sqrt(length(qvalues))
	pstar = pnorm(Cp, mean=0, sd=1, lower.tail=lower.tail)
	return(list(C=Cp, p=pstar))
	}

	pvals = c(0.1, 0.1, 0.1)
	sigma = matrix(c(1, 0.5, 0.25,
	0.5, 1, 0.5,
	0.25, 0.5, 1), nrow=3)


	png('pwr.png', width=500, height=500)
	par(mfrow=c(1, 1), oma=c( 0, 0, 3, 0 ))
	# go up to 2; means 100% difference at 2 probes
	diffs = seq(0.01, 0.5, length.out=50)

	j = 1
	for(std in stds[[1]]){
	pairs = rep(NA, length(diffs))
	i = 1
	for(difference in diffs){
	if(j == 3 && i == 1){ i = i + 1; next }
	pwr.calc = pwr.t.test(d=difference / std, power=0.8, type="two.sample", sig.level=0.1/n_probes)
	pairs[i] = pwr.calc$n
	i = i + 1
	}
	sub=sprintf("effect std:%.3f (%ith percentile)" , std, pctiles[j])
	plot(diffs[!is.na(pairs)], pairs[!is.na(pairs)], log=c("x", "y"), xlab="methylation difference", ylab="samples needed",
	type='b', xlim=c(0.01, 0.5), ylim=c(0.01, 150))
	abline(v=means, col='blue', lty=2)
	j = j + 1
	}
	mtext("vertical lines at 95th, 99th, 99.9th%\nof mean asthma:non-asthma differences in ICAC", outer=TRUE)


	write("mean_pctile\tsd_pctile\tdifference\tpower-140\tpower-300", stdout())
	si = 0
	for(std in stds){
	mi = 0
	si = si + 1
	for(difference in means){
	mi = mi + 1
	pwr.140 = pwr.t.test(d=difference / std, type="two.sample", sig.level=0.1/n_probes, n=140)
	pwr.300 = pwr.t.test(d=difference / std, type="two.sample", sig.level=0.1/n_probes, n=300)
	write(sprintf("%.1f\t%.1f\t%.3f\t%.3f\t%.3f", mean_pctiles[mi], pctiles[si], difference, pwr.140$power, pwr.300$power), stdout())
	}
	}

	library(pwr)
	#library(mvtnorm)
	library(parallel)
	library(MASS)
	n_samples = 200

	set.seed(4)
	stouffer_liptak = function(pvalues, adjust=FALSE, lower.tail=TRUE){
	qvalues = qnorm(pvalues, mean=0, sd=1, lower.tail=lower.tail)
	sigma = adjust
	C = chol(sigma)
	Cm1 = solve(C) # C^-1
	qvalues = Cm1 %% qvalues # Qstar = C^-1 Q
	Cp = sum(qvalues) / sqrt(length(qvalues))
	pnorm(Cp, mean=0, sd=1, lower.tail=lower.tail)
	}

	cor.val = 0.2

	gen.correlated = function(rho, nrow=100, ncol=2, mean=0, sd=1){
	X = matrix(rnorm(nrow * ncol, mean=mean, sd=sd), nrow=nrow)
	sigma = diag(ncol)
	sigma <- rho ^ abs(row(sigma)-col(sigma))
	X %*% chol(sigma)
	}

	expr_means = c(0.035145, 0.038902, 0.044048, 0.050961)
	expr_means = c( 0.038902, 0.044048, 0.050961)
	expr_sd50 = 0.16183939707577316

	meth_means = c(0.004042, 0.005181, 0.007036, 0.010723)
	meth_means = c( 0.005181, 0.007036, 0.010723)
	meth_sd50 = 0.014325652249100367


	cc = c(rep(1, n_samples / 2), rep(0, n_samples / 2))

	sigma.base = matrix(c(1, 0.01,
	0.01, 1), nrow=2)
	sigma.meth = matrix(c(1, cor.val,
	cor.val, 1), nrow=2)

	n_sims = 10000
	ei = 1
	for (expr_mean in expr_means){
	mi = 1
	row = c()
	for (meth_mean in meth_means){

	sigma = sigma.base

	combined.ps = unlist(mclapply(1:n_sims, function(i){

	simmed.asthma.meth = gen.correlated(cor.val, nrow=n_samples/2, ncol=4, mean=0, sd=meth_sd50)
	simmed.control.meth = gen.correlated(cor.val, nrow=n_samples/2, ncol=4, mean=meth_mean, sd=meth_sd50)

	simmed.asthma.expr = rnorm(n_samples/2, expr_mean, sd=expr_sd50)
	simmed.control.expr = rnorm(n_samples/2, 0, sd=expr_sd50)

	simmed.asthma = cbind(simmed.asthma.meth, simmed.asthma.expr)
	simmed.control = cbind(simmed.control.meth, simmed.control.expr)

	simmed = rbind(simmed.asthma, simmed.control)


	#colnames(simmed) = c("methylation", "expression")
	mp1 = t.test(simmed[,1] ~ cc)$p.value
	mp2 = t.test(simmed[,2] ~ cc)$p.value
	mp3 = t.test(simmed[,3] ~ cc)$p.value
	mp4 = t.test(simmed[,4] ~ cc)$p.value

	sigma.m = cor(simmed[,1:4])

	ep = t.test(simmed[,ncol(simmed)] ~ cc)$p.value

	mp = stouffer_liptak(c(mp1, mp2, mp3, mp4), sigma.m)
	#mp = stouffer_liptak(c(mp, mp4), sigma.meth)
	combined.p = stouffer_liptak(c(ep, mp), sigma.base)
	#write(sprintf("%.4g\t%.4g\t%.4g", mp, ep, combined.p), stdout())
	combined.p
	}, mc.cores=12))
	row = c(row, (sum(combined.ps < 2.5e-6) / n_sims))
	mi = mi + 1
	}
	write(paste(row, collapse="\t"), stdout())
	ei = ei + 1
	}

	if(FALSE){
	pvals = unlist(lapply(1:1000, function(i){
	asthma = rnorm(n_samples / 2, mean=0, sd=0.0326)
	ctrl = rnorm(n_samples / 2, mean=0.0107, sd=0.0326)
	t.test(asthma, ctrl)$p.value
	}))
	print(sum(pvals < 0.05) / length(pvals))
	}

	import pandas as pa
	import numpy as np

	cov = pa.read_table('results/icac-clinical-july-2012.txt')
	#dat = pa.read_table(gzip.open('expr.rows.txt.gz'), index_col=0)
	for f in ("expr.rows.txt.gz", "450k.rows.txt"):

	dat = pa.read_table("results/" + f, index_col=0, compression='gzip' if f.endswith('.gz')
	else None)

	dat = dat.ix[[not p.startswith(('chrX', 'chrY')) for p in dat.index],:]

	if "expr" in f:
	dsd = dat.std(axis=1)
	dat = dat.ix[dsd < np.percentile(dsd, 0.9), :]


	if "450" in f:
	dat = 1 / (1 + np.exp(-dat))
	print dat.min().min(), dat.max().max()

	cov.index = cov.StudyID
	cov = cov.ix[dat.columns, :]

	assert all(cov.StudyID == dat.columns)

	asthma = cov.StudyID[cov.asthma]
	ctrl = cov.StudyID[~cov.asthma]

	dstd = dat.std(axis=1)
	amean = dat[asthma].mean(axis=1)
	cmean = dat[ctrl].mean(axis=1)

	dmean = abs(amean - cmean)

	print f
	print "50std:", np.percentile(dstd, (50,))
	print "mean: c(%s)" % ", ".join("%.6f" % p for p in
	np.percentile(dmean, (80, 85, 90, 95)))