# ports//mail/py-spambayes/work/spambayes-1.0.4/testtools/cmp.py

#!/usr/local/bin/python2.3

"""
cmp.py sbase1 sbase2

Combines output from sbase1.txt and sbase2.txt, which are created by
rates.py from timtest.py output, and displays comparison statistics to
stdout.
"""

import sys
# Both summary-file base names are required.  Exit with a usage line
# rather than a tuple-unpacking traceback when they are missing.
if len(sys.argv) < 3:
    sys.exit("usage: cmp.py sbase1 sbase2")
f1n, f2n = sys.argv[1:3]

# Return
#  (list of all f-p rates,
#   list of all f-n rates,
#   total f-p,
#   total f-n,
#   average f-p rate,
#   average f-n rate,
#   list of all ham score deviations,
#   list of all spam score deviations,
#   ham score deviation for all runs,
#   spam score deviations for all runs,
# )
# from summary file f.
def suck(f):
    """Parse one summary file and return its statistics.

    `f` is an open file-like object; only its readline() method is used.
    Every line starting with '-> <stat> tested' is echoed to stdout as it
    is read.  Blank lines are ignored.

    Returns the 10-tuple

        (fps, fns, fptot, fntot, fpmean, fnmean,
         hamdev, spamdev, hamdevall, spamdevall)

    where fps/fns are lists of floats, fptot/fntot are ints, fpmean/fnmean
    are floats, hamdev/spamdev are lists of (mean, sdev) tuples and
    hamdevall/spamdevall are single (mean, sdev) tuples, defaulting to
    (0.0, 0.0) when the file has no 'for all runs' line.

    Raises ValueError if the file ends before the four trailing summary
    lines have been read, or if a data line cannot be parsed.
    """
    fns = []
    fps = []
    hamdev = []
    spamdev = []
    hamdevall = spamdevall = (0.0, 0.0)

    get = f.readline

    def last_field(text, what):
        # An empty split means readline() hit end of file (or a blank
        # line) where a summary line was expected.
        fields = text.split()
        if not fields:
            raise ValueError("summary file ended early: missing %s line"
                             % what)
        return fields[-1]

    while 1:
        line = get()
        if not line:
            # readline() returns '' only at end of file.
            raise ValueError("summary file ended before the 'total' line")
        if not line.strip():
            continue
        if line.startswith('-> <stat> tested'):
            # Echo the header verbatim; it already carries its newline.
            sys.stdout.write(line)
        if ' items; mean ' in line:
            # -> <stat> Ham distribution for this pair: 1000 items; mean 0.05; sample sdev 0.68
            # and later "sample " went away
            vals = line.split(';')
            mean = float(vals[1].split()[-1])
            sdev = float(vals[2].split()[-1])
            val = (mean, sdev)
            # Third word of the line: 'Ham', anything else counts as spam.
            typ = vals[0].split()[2]
            if 'for all runs' in line:
                if typ == 'Ham':
                    hamdevall = val
                else:
                    spamdevall = val
            elif 'all in this' in line:
                if typ == 'Ham':
                    hamdev.append(val)
                else:
                    spamdev.append(val)
            continue
        if line.startswith('-> '):
            continue
        if line.startswith('total'):
            break
        # A line with an f-p rate and an f-n rate.
        p, n = map(float, line.split())
        fps.append(p)
        fns.append(n)

    # "total unique false pos 0"
    # "total unique false neg 0"
    # "average fp % 0.0"
    # "average fn % 0.0"
    fptot = int(line.split()[-1])
    fntot = int(last_field(get(), "'total unique false neg'"))
    fpmean = float(last_field(get(), "'average fp %'"))
    fnmean = float(last_field(get(), "'average fn %'"))
    return (fps, fns, fptot, fntot, fpmean, fnmean,
            hamdev, spamdev, hamdevall, spamdevall)

def tag(p1, p2):
    """Describe the move from p1 to p2 as a fixed-width verdict string.

    Equal values give "tied".  Otherwise the verdict is "lost" when p1 is
    smaller than p2 and "won" when it is larger, followed by the
    percentage change relative to p1, or by "+(was 0)" when p1 is zero
    and no percentage can be computed.
    """
    if p1 == p2:
        return "tied          "

    if p1 < p2:
        verdict = "lost "
    else:
        verdict = "won  "

    if not p1:
        return verdict + " +(was 0)"

    change = (p2 - p1) * 100.0 / p1
    return verdict + " %+7.2f%%" % change

def mtag(m1, m2):
    """Format a before/after comparison of two (mean, sdev) pairs.

    The result holds the two means and their percentage change, then the
    two standard deviations and their percentage change.  A percentage
    is replaced by "+(was 0)" when the 'before' value is zero.
    """
    def change(before, after):
        # Percentage change relative to `before`; undefined for zero.
        if not before:
            return "+(was 0)"
        return "%+7.2f%%" % ((after - before) * 100.0 / before)

    mean_before, sdev_before = m1
    mean_after, sdev_after = m2
    pieces = [
        "%7.2f %7.2f " % (mean_before, mean_after),
        change(mean_before, mean_after),
        "     %7.2f %7.2f " % (sdev_before, sdev_after),
        change(sdev_before, sdev_after),
    ]
    return "".join(pieces)

def dump(p1s, p2s):
    alltags = ""
    for p1, p2 in zip(p1s, p2s):
        t = tag(p1, p2)
        print "    %5.3f  %5.3f  %s" % (p1, p2, t)
        alltags += t + " "
    print
    for t in "won", "tied", "lost":
        print "%-4s %2d times" % (t, alltags.count(t))
    print

def dumpdev(meandev1, meandev2):
    for m1, m2 in zip(meandev1, meandev2):
        print mtag(m1, m2)

def windowsfy(fn):
    """Return fn + '.txt' if that path exists, otherwise fn unchanged."""
    import os
    candidate = fn + '.txt'
    if not os.path.exists(candidate):
        return fn
    return candidate

print f1n, '->', f2n


f1n = windowsfy(f1n)
f2n = windowsfy(f2n)

(fp1, fn1, fptot1, fntot1, fpmean1, fnmean1,
 hamdev1, spamdev1, hamdevall1, spamdevall1) = suck(file(f1n))

(fp2, fn2, fptot2, fntot2, fpmean2, fnmean2,
 hamdev2, spamdev2, hamdevall2, spamdevall2) = suck(file(f2n))

print
print "false positive percentages"
dump(fp1, fp2)
print "total unique fp went from", fptot1, "to", fptot2, tag(fptot1, fptot2)
print "mean fp % went from", fpmean1, "to", fpmean2, tag(fpmean1, fpmean2)

print
print "false negative percentages"
dump(fn1, fn2)
print "total unique fn went from", fntot1, "to", fntot2, tag(fntot1, fntot2)
print "mean fn % went from", fnmean1, "to", fnmean2, tag(fnmean1, fnmean2)

print
if len(hamdev1) == len(hamdev2) and len(spamdev1) == len(spamdev2):
    print "ham mean                     ham sdev"
    dumpdev(hamdev1, hamdev2)
    print
    print "ham mean and sdev for all runs"
    dumpdev([hamdevall1], [hamdevall2])


    print
    print "spam mean                    spam sdev"
    dumpdev(spamdev1, spamdev2)
    print
    print "spam mean and sdev for all runs"
    dumpdev([spamdevall1], [spamdevall2])

    print
    diff1 = spamdevall1[0] - hamdevall1[0]
    diff2 = spamdevall2[0] - hamdevall2[0]
    print "ham/spam mean difference: %2.2f %2.2f %+2.2f" % (diff1,
                                                            diff2,
                                                            diff2 - diff1)
else:
    print "[info about ham & spam means & sdevs not available in both files]"
# syntax highlighted by Code2HTML, v. 0.9.1