The problem is that with genomic scales the NchooseK operation produces giant numbers that often cause overflows in python. The trick is to use the log-nchoosek operations since this will keep even th elargest numbers within a reasonable range.
You can check out my gist here: git://gist.github.com/1051335.git
or this:
from math import log, exp
from mpmath import loggamma
def logchoose(ni, ki):
try:
lgn1 = loggamma(ni+1)
lgk1 = loggamma(ki+1)
lgnk1 = loggamma(ni-ki+1)
except ValueError:
#print ni,ki
raise ValueError
return lgn1 - (lgnk1 + lgk1)
def gauss_hypergeom(X, n, m, N):
"""Returns the probability of drawing X successes of m marked items
in n draws from a bin of N total items."""
assert N >= m, 'Number of items %i must be larger than the number of marked items %i' % (N, m)
assert m >= X, 'Number of marked items %i must be larger than the number of sucesses %i' % (m, X)
assert n >= X, 'Number of draws %i must be larger than the number of sucesses %i' % (n, X)
assert N >= n, 'Number of draws %i must be smaller than the total number of items %i' % (n, N)
r1 = logchoose(m, X)
try:
r2 = logchoose(N-m, n-X)
except ValueError:
return 0
r3 = logchoose(N,n)
return exp(r1 + r2 - r3)
def hypergeo_cdf(X, n, m, N):
assert N >= m, 'Number of items %i must be larger than the number of marked items %i' % (N, m)
assert m >= X, 'Number of marked items %i must be larger than the number of sucesses %i' % (m, X)
assert n >= X, 'Number of draws %i must be larger than the number of sucesses %i' % (n, X)
assert N >= n, 'Number of draws %i must be smaller than the total number of items %i' % (n, N)
assert N-m >= n-X, 'There are more failures %i than unmarked items %i' % (N-m, n-X)
s = 0
for i in range(1, X+1):
s += max(gauss_hypergeom(i, n, m, N), 0.0)
return min(max(s,0.0), 1)
Everything should be self-explanitory.
Thanks @Will, that's very helpful! and good to know.
Thanks a lot.I have thought log function, but don't know how.