# ports//mail/py-spambayes/work/spambayes-1.0.4/contrib/spamcounts.py

#!/usr/local/bin/python2.3

"""
Check spamcounts for various tokens or patterns

usage %(prog)s [ -h ] [ -r ] [ -d db ] [ -p ] [ -t ] ...

-h    - print this documentation and exit.
-r    - treat tokens as regular expressions - may not be used with -t
-d db - use db instead of the default found in the options file
-p    - db is actually a pickle
-t    - read message from stdin, tokenize it, then display their counts
        may not be used with -r
"""

from __future__ import division

import sys
import getopt
import re
import sets
import os
import shelve
import pickle
import csv

from spambayes.Options import options, get_pathname_option
from spambayes.tokenizer import tokenize
from spambayes.storage import STATE_KEY

# Program name as invoked; substituted for %(prog)s in the module docstring
# when usage() formats it with globals().
prog = sys.argv[0]

def usage(msg=None):
    """Write the command-line help to stderr.

    If msg is not None it is written on a line of its own first.  The help
    text is the module docstring, stripped of surrounding whitespace, with
    %(name)s placeholders filled in from the module globals.
    """
    err = sys.stderr
    if msg is not None:
        err.write("%s\n" % (msg,))
    help_text = __doc__.strip() % globals()
    err.write("%s\n" % (help_text,))

# From msgs on spambayes mailing list, spam prob is calculated thusly:
## hc = ham token count
## nh = total number of ham messages
## sc = spam token count
## ns = total number of spam messages
## hr = ham ratio = hc / nh
## sr = spam ratio = sc / ns
## p = base spam probability = sr / (sr + hr)
## S = unknown word strength (static factor = 0.45 by default)
## x = unknown word probability (static factor = 0.5 by default)
## n = total number of messages the token appeared in = hc + sc
## sp = final spam probability = ((S * x) + (n * p)) / (S + n)


def print_spamcounts(tokens, db, use_re):
    """Write spam/ham counts and spam probability for tokens to stdout as CSV.

    tokens -- iterable of token strings.  When use_re is true each item is
              instead a regular expression, and every key of db that it
              matches (via re.search) is reported, in sorted order.
    db     -- mapping of token -> (spam count, ham count).  db[STATE_KEY]
              must hold a 3-item record whose last two items are the total
              numbers of spam and ham messages.
    use_re -- true to treat tokens as regular expressions.

    A header row is always written.  Each distinct token is reported once,
    and tokens whose spam and ham counts are both zero are skipped.
    """
    if use_re:
        matched = sets.Set()
        keys = db.keys()
        for pat in tokens:
            # Compile once per pattern instead of looking it up per key.
            search = re.compile(pat).search
            for k in keys:
                if search(k) is not None:
                    matched.add(k)
        # Set iteration order is arbitrary; sort so output is reproducible.
        tokens = list(matched)
        tokens.sort()

    S = options["Classifier", "unknown_word_strength"]
    x = options["Classifier", "unknown_word_prob"]
    _, ns, nh = db[STATE_KEY]
    # A database with no spam (or no ham) trained yet would otherwise raise
    # ZeroDivisionError below.  Treat a zero total as 1 so the ratio is
    # simply 0.  float() keeps the divisions true divisions regardless of
    # the module's __future__ import.
    ns = float(ns or 1)
    nh = float(nh or 1)

    writer = csv.writer(sys.stdout)
    writer.writerow(("token", "nspam", "nham", "spam prob"))
    seen = sets.Set()
    for t in tokens:
        if t in seen:
            continue
        seen.add(t)

        record = db.get(t, (0, 0))
        try:
            sc, hc = record
        except ValueError:
            # 3-item records carry one leading extra field (the STATE_KEY
            # record has this shape); keep only the two counts.
            _, sc, hc = record
        if sc == hc == 0:
            continue

        hr = hc / nh                # ham ratio
        sr = sc / ns                # spam ratio
        p = sr / (sr + hr)          # base spam probability
        n = hc + sc                 # messages the token appeared in
        # Pull p toward the unknown-word probability x with strength S.
        sp = ((S * x) + (n * p)) / (S + n)

        writer.writerow((t, sc, hc, sp))

def main(args):
    try:
        opts, args = getopt.getopt(args, "hrd:t",
                                   ["help", "re", "database=", "pickle",
                                    "tokenize"])
    except getopt.GetoptError, msg:
        usage(msg)
        return 1

    usere = False
    dbname = get_pathname_option("Storage", "persistent_storage_file")
    ispickle = not options["Storage", "persistent_use_database"]
    tokenizestdin = False
    for opt, arg in opts:
        if opt in ("-h", "--help"):
            usage()
            return 0
        elif opt in ("-d", "--database"):
            dbname = arg
        elif opt in ("-r", "--re"):
            usere = True
        elif opt in ("-p", "--pickle"):
            ispickle = True
        elif opt in ("-t", "--tokenize"):
            tokenizestdin = True

    if usere and tokenizestdin:
        usage("-r and -t may not be used at the same time")
        return 1

    dbname = os.path.expanduser(dbname)
    print >> sys.stderr, "db:", dbname
    if ispickle:
        db = pickle.load(file(dbname))
    else:
        db = shelve.open(dbname, flag='r')

    if tokenizestdin:
        args = tokenize(sys.stdin)

    if args:
        print_spamcounts(args, db, usere)
        return 0
    else:
        usage("need tokens on cmd line or -t w/ msg on stdin")
        return 1

if __name__ == "__main__":
    # Script entry point: hand main() everything after the program name and
    # use its return value as the process exit status.
    exit_status = main(sys.argv[1:])
    sys.exit(exit_status)
# syntax highlighted by Code2HTML, v. 0.9.1