Source code for sequana.utils.fisher

# extract from https://github.com/painyeph/FishersExactTest/tree/master
# we are only interested in the 2-tail test that is 150 times faster than scipy.

# changes made.
# - only a subset of essential functions were copied (test1l, test1r, test1t)
# - The 'all' flavor was dropped
# - created a single entry point function similar to scipy syntax.
# - uses python caching instead of own caching (same performances but more reliable on long term)
from math import exp, lgamma, log

try:
    from functools import cache
except ImportError:
    from functools import lru_cache as cache


[docs] @cache def lgamma(z): x = 0.1659470187408462e-06 / (z + 7) x += 0.9934937113930748e-05 / (z + 6) x -= 0.1385710331296526 / (z + 5) x += 12.50734324009056 / (z + 4) x -= 176.6150291498386 / (z + 3) x += 771.3234287757674 / (z + 2) x -= 1259.139216722289 / (z + 1) x += 676.5203681218835 / z x += 0.9999999999995183 return log(x) - 5.58106146679532777 - z + (z - 0.5) * log(z + 6.5)
def _maxn(): l = 1 n = 2 h = float("inf") while l < n: if abs(lgamma(n + 1) - lgamma(n) - log(n)) >= 1: h = n else: l = n n = (l + min(h, l * 3)) // 2 return n MAXN = _maxn()
[docs] def fisher_exact(array_2by2, alternative): """Fast Fisher test alternative to scipy""" a = array_2by2[0][0] b = array_2by2[0][1] c = array_2by2[1][0] d = array_2by2[1][1] if alternative == "less": return test_left(a, b, c, d) elif alternative == "greater": return test_right(a, b, c, d) elif alternative == "two-sided": return test_both(a, b, c, d) else: # pragma: no cover raise ValueError("alternative must be less, greater or two-sided")
# two-tails
[docs] def test_both(a, b, c, d): return exp(-mlnTest2t(a, a + b, a + c, a + b + c + d))
[docs] def mlnTest2t(a, ab, ac, abcd): if 0 > a or a > ab or a > ac or ab + ac > abcd + a: raise ValueError("invalid contingency table") if abcd > MAXN: raise OverflowError("the grand total of contingency table is too large") a_min = max(0, ab + ac - abcd) a_max = min(ab, ac) if a_min == a_max: return 0.0 p0 = lgamma(ab + 1) + lgamma(ac + 1) + lgamma(abcd - ac + 1) + lgamma(abcd - ab + 1) - lgamma(abcd + 1) pa = lgamma(a + 1) + lgamma(ab - a + 1) + lgamma(ac - a + 1) + lgamma(abcd - ab - ac + a + 1) st = 1.0 if ab * ac < a * abcd: for i in range(min(a - 1, int(round(ab * ac / abcd))), a_min - 1, -1): pi = lgamma(i + 1) + lgamma(ab - i + 1) + lgamma(ac - i + 1) + lgamma(abcd - ab - ac + i + 1) if pi < pa: continue st_new = st + exp(pa - pi) if st_new == st: break st = st_new for i in range(a + 1, a_max + 1): pi = lgamma(i + 1) + lgamma(ab - i + 1) + lgamma(ac - i + 1) + lgamma(abcd - ab - ac + i + 1) st_new = st + exp(pa - pi) if st_new == st: break st = st_new else: for i in range(a - 1, a_min - 1, -1): pi = lgamma(i + 1) + lgamma(ab - i + 1) + lgamma(ac - i + 1) + lgamma(abcd - ab - ac + i + 1) st_new = st + exp(pa - pi) if st_new == st: break st = st_new for i in range(max(a + 1, int(round(ab * ac / abcd))), a_max + 1): pi = lgamma(i + 1) + lgamma(ab - i + 1) + lgamma(ac - i + 1) + lgamma(abcd - ab - ac + i + 1) if pi < pa: continue st_new = st + exp(pa - pi) if st_new == st: break st = st_new return max(0, pa - p0 - log(st))
# right tail only
[docs] def test_right(a, b, c, d): return exp(-mlnTest2r(a, a + b, a + c, a + b + c + d))
[docs] def mlnTest2r(a, ab, ac, abcd): if 0 > a or a > ab or a > ac or ab + ac > abcd + a: raise ValueError("invalid contingency table") if abcd > MAXN: raise OverflowError("the grand total of contingency table is too large") a_min = max(0, ab + ac - abcd) a_max = min(ab, ac) if a_min == a_max: return 0.0 p0 = lgamma(ab + 1) + lgamma(ac + 1) + lgamma(abcd - ac + 1) + lgamma(abcd - ab + 1) - lgamma(abcd + 1) pa = lgamma(a + 1) + lgamma(ab - a + 1) + lgamma(ac - a + 1) + lgamma(abcd - ab - ac + a + 1) if ab * ac > a * abcd: sl = 0.0 for i in range(a - 1, a_min - 1, -1): sl_new = sl + exp( pa - lgamma(i + 1) - lgamma(ab - i + 1) - lgamma(ac - i + 1) - lgamma(abcd - ab - ac + i + 1) ) if sl_new == sl: break sl = sl_new return -log(1.0 - max(0, exp(p0 - pa) * sl)) else: sr = 1.0 for i in range(a + 1, a_max + 1): sr_new = sr + exp( pa - lgamma(i + 1) - lgamma(ab - i + 1) - lgamma(ac - i + 1) - lgamma(abcd - ab - ac + i + 1) ) if sr_new == sr: break sr = sr_new return max(0, pa - p0 - log(sr))
# left tail only
[docs] def test_left(a, b, c, d): return exp(-mlnTest2l(a, a + b, a + c, a + b + c + d))
[docs] def mlnTest2l(a, ab, ac, abcd): if 0 > a or a > ab or a > ac or ab + ac > abcd + a: raise ValueError("invalid contingency table") if abcd > MAXN: raise OverflowError("the grand total of contingency table is too large") a_min = max(0, ab + ac - abcd) a_max = min(ab, ac) if a_min == a_max: return 0.0 p0 = lgamma(ab + 1) + lgamma(ac + 1) + lgamma(abcd - ac + 1) + lgamma(abcd - ab + 1) - lgamma(abcd + 1) pa = lgamma(a + 1) + lgamma(ab - a + 1) + lgamma(ac - a + 1) + lgamma(abcd - ab - ac + a + 1) if ab * ac < a * abcd: sr = 0.0 for i in range(a + 1, a_max + 1): sr_new = sr + exp( pa - lgamma(i + 1) - lgamma(ab - i + 1) - lgamma(ac - i + 1) - lgamma(abcd - ab - ac + i + 1) ) if sr_new == sr: break sr = sr_new return -log(1.0 - max(0, exp(p0 - pa) * sr)) else: sl = 1.0 for i in range(a - 1, a_min - 1, -1): sl_new = sl + exp( pa - lgamma(i + 1) - lgamma(ab - i + 1) - lgamma(ac - i + 1) - lgamma(abcd - ab - ac + i + 1) ) if sl_new == sl: break sl = sl_new return max(0, pa - p0 - log(sl))