Thanks for the anwsers on this question by @Will and @brentp.But here comes another question,which one is more accurate.As the N grows, the error grows.
Here are the codes:
from math import log, exp
from mpmath import loggamma
from scipy.stats import hypergeom
import time
def logchoose(ni, ki):
try:
lgn1 = loggamma(ni+1)
lgk1 = loggamma(ki+1)
lgnk1 = loggamma(ni-ki+1)
except ValueError:
#print ni,ki
raise ValueError
return lgn1 - (lgnk1 + lgk1)
def gauss_hypergeom(X, n, m, N):
"""Returns the probability of drawing X successes of m marked items
in n draws from a bin of N total items."""
assert N >= m, 'Number of items %i must be larger than the number of marked items %i' % (N, m)
assert m >= X, 'Number of marked items %i must be larger than the number of sucesses %i' % (m, X)
assert n >= X, 'Number of draws %i must be larger than the number of sucesses %i' % (n, X)
assert N >= n, 'Number of draws %i must be smaller than the total number of items %i' % (n, N)
r1 = logchoose(m, X)
try:
r2 = logchoose(N-m, n-X)
except ValueError:
return 0
r3 = logchoose(N,n)
return exp(r1 + r2 - r3)
def hypergeo_cdf(X, n, m, N):
assert N >= m, 'Number of items %i must be larger than the number of marked items %i' % (N, m)
assert m >= X, 'Number of marked items %i must be larger than the number of sucesses %i' % (m, X)
assert n >= X, 'Number of draws %i must be larger than the number of sucesses %i' % (n, X)
assert N >= n, 'Number of draws %i must be smaller than the total number of items %i' % (n, N)
assert N-m >= n-X, 'There are more failures %i than unmarked items %i' % (N-m, n-X)
s = 0
for i in range(0, X+1):
s += max(gauss_hypergeom(i, n, m, N), 0.0)
return min(max(s,0.0), 1)
def hypergeo_sf(X, n, m, N):
assert N >= m, 'Number of items %i must be larger than the number of marked items %i' % (N, m)
assert m >= X, 'Number of marked items %i must be larger than the number of sucesses %i' % (m, X)
assert n >= X, 'Number of draws %i must be larger than the number of sucesses %i' % (n, X)
assert N >= n, 'Number of draws %i must be smaller than the total number of items %i' % (n, N)
assert N-m >= n-X, 'There are more failures %i than unmarked items %i' % (N-m, n-X)
s = 0
for i in range(X, min(m,n)+1):
s += max(gauss_hypergeom(i, n, m, N), 0.0)
return min(max(s,0.0), 1)
print "\n800"
start=time.time()
print "log\t",hypergeo_cdf(20,60,80,800),hypergeo_sf(20,60,80,800),time.time()-start
start=time.time()
print "scipy\t",hypergeom.cdf(20,800,80,60),hypergeom.sf(19,800,80,60),time.time()-start
print "\n8000"
start=time.time()
print "log\t",hypergeo_cdf(200,600,800,8000),hypergeo_sf(200,600,800,8000),time.time()-start
start=time.time()
print "scipy\t",hypergeom.cdf(200,8000,800,600),hypergeom.sf(199,8000,800,600),time.time()-start
print "\n80000"
start=time.time()
print "log\t",hypergeo_cdf(200,600,800,80000),hypergeo_sf(200,600,800,80000),time.time()-start
start=time.time()
print "scipy\t",hypergeom.cdf(200,80000,800,600),hypergeom.sf(199,80000,800,600),time.time()-start
And the result is below:
800
log 0.999999970682 1.77621863705e-07 0.0460000038147
scipy 0.999999970682 1.77621235498e-07 0.00399994850159
8000
log 0.999999999993 2.81852829734e-61 0.466000080109
scipy 1.0 -9.94981874669e-13 0.0139999389648
80000
log 1 2.32353743034e-249 0.475000143051
scipy 0.999999999924 7.60518314991e-11 0.184000015259
As we can see, scipy is faster.But I wonder which one is more accurate.The result is nearly the same when N is small.
and btw ... the
timeit
module in the python standard library makes things like this much easier.I'm newbie...Thanks for your advice.