ports//mail/py-spambayes/work/spambayes-1.0.4/spambayes/chi2.py

import math as _math

try:
    True, False
except NameError:
    # Maintain compatibility with Python 2.2
    True, False = 1, 0


def chi2Q(x2, v, exp=_math.exp, min=min):
    """Return prob(chisq >= x2, with v degrees of freedom).

    v must be even.
    """
    assert v & 1 == 0
    # XXX If x2 is very large, exp(-m) will underflow to 0.
    m = x2 / 2.0
    sum = term = exp(-m)
    for i in range(1, v//2):
        term *= m / i
        sum += term
    # With small x2 and large v, accumulated roundoff error, plus error in
    # the platform exp(), can cause this to spill a few ULP above 1.0.  For
    # example, chi2Q(100, 300) on my box has sum == 1.0 + 2.0**-52 at this
    # point.  Returning a value even a teensy bit over 1.0 is no good.
    return min(sum, 1.0)

def normZ(z, sqrt2pi=_math.sqrt(2.0*_math.pi), exp=_math.exp):
    "Return value of the unit Gaussian at z."
    return exp(-z*z/2.0) / sqrt2pi

def normP(z):
    """Return area under the unit Gaussian from -inf to z.

    This is the probability that a zscore is <= z.
    """

    # This is very accurate in a fixed-point sense.  For negative z of
    # large magnitude (<= -8.3), it returns 0.0, essentially because
    # P(-z) is, to machine precision, indistiguishable from 1.0 then.

    # sum <- area from 0 to abs(z).
    a = abs(float(z))
    if a >= 8.3:
        sum = 0.5
    else:
        sum2 = term = a * normZ(a)
        z2 = a*a
        sum = 0.0
        i = 1.0
        while sum != sum2:
            sum = sum2
            i += 2.0
            term *= z2 / i
            sum2 += term

    if z >= 0:
        result = 0.5 + sum
    else:
        result = 0.5 - sum

    return result

def normIQ(p, sqrt=_math.sqrt, ln=_math.log):
    """Return z such that the area under the unit Gaussian from z to +inf is p.

    Must have 0.0 <= p <= 1.0.
    """

    assert 0.0 <= p <= 1.0
    # This is a low-accuracy rational approximation from Abramowitz & Stegun.
    # The absolute error is bounded by 3e-3.

    flipped = False
    if p > 0.5:
        flipped = True
        p = 1.0 - p

    if p == 0.0:
        z = 8.3
    else:
        t = sqrt(-2.0 * ln(p))
        z = t - (2.30753 + .27061*t) / (1. + .99229*t + .04481*t**2)

    if flipped:
        z = -z
    return z

def normIP(p):
    """Return z such that the area under the unit Gaussian from -inf to z is p.

    Must have 0.0 <= p <= 1.0.
    """
    z = normIQ(1.0 - p)
    # One Newton step should double the # of good digits.
    return z + (p - normP(z)) / normZ(z)

def main():
    from spambayes.Histogram import Hist
    import sys

    class WrappedRandom:
        # There's no way W-H is equidistributed in 50 dimensions, so use
        # Marsaglia-wrapping to shuffle it more.

        def __init__(self, baserandom=random.random, tabsize=513):
            self.baserandom = baserandom
            self.n = tabsize
            self.tab = [baserandom() for i in range(tabsize)]
            self.next = baserandom()

        def random(self):
            result = self.next
            i = int(result * self.n)
            self.next = self.tab[i]
            self.tab[i] = self.baserandom()
            return result

    random = WrappedRandom().random
    #from uni import uni as random
    #print random

    def judge(ps, ln=_math.log, ln2=_math.log(2), frexp=_math.frexp):
        H = S = 1.0
        Hexp = Sexp = 0
        for p in ps:
            S *= 1.0 - p
            H *= p
            if S < 1e-200:
                S, e = frexp(S)
                Sexp += e
            if H < 1e-200:
                H, e = frexp(H)
                Hexp += e
        S = ln(S) + Sexp * ln2
        H = ln(H) + Hexp * ln2
        n = len(ps)
        S = 1.0 - chi2Q(-2.0 * S, 2*n)
        H = 1.0 - chi2Q(-2.0 * H, 2*n)
        return S, H, (S-H + 1.0) / 2.0

    warp = 0
    bias = 0.99
    if len(sys.argv) > 1:
        warp = int(sys.argv[1])
    if len(sys.argv) > 2:
        bias = float(sys.argv[2])

    h = Hist(20, lo=0.0, hi=1.0)
    s = Hist(20, lo=0.0, hi=1.0)
    score = Hist(20, lo=0.0, hi=1.0)

    for i in range(5000):
        ps = [random() for j in range(50)]
        s1, h1, score1 = judge(ps + [bias] * warp)
        s.add(s1)
        h.add(h1)
        score.add(score1)

    print "Result for random vectors of 50 probs, +", warp, "forced to", bias

    # Should be uniformly distributed on all-random data.
    print
    print 'H',
    h.display()

    # Should be uniformly distributed on all-random data.
    print
    print 'S',
    s.display()

    # Distribution doesn't really matter.
    print
    print '(S-H+1)/2',
    score.display()

def showscore(ps, ln=_math.log, ln2=_math.log(2), frexp=_math.frexp):
    H = S = 1.0
    Hexp = Sexp = 0
    for p in ps:
        S *= 1.0 - p
        H *= p
        if S < 1e-200:
            S, e = frexp(S)
            Sexp += e
        if H < 1e-200:
            H, e = frexp(H)
            Hexp += e
    S = ln(S) + Sexp * ln2
    H = ln(H) + Hexp * ln2

    n = len(ps)
    probS = chi2Q(-2*S, 2*n)
    probH = chi2Q(-2*H, 2*n)
    print "P(chisq >= %10g | v=%3d) = %10g" % (-2*S, 2*n, probS)
    print "P(chisq >= %10g | v=%3d) = %10g" % (-2*H, 2*n, probH)

    S = 1.0 - probS
    H = 1.0 - probH
    score = (S-H + 1.0) / 2.0
    print "spam prob", S
    print " ham prob", H
    print "(S-H+1)/2", score

if __name__ == '__main__':
    import random
    main()
syntax highlighted by Code2HTML, v. 0.9.1