#!/usr/local/bin/python2.3
"""
Check spamcounts for various tokens or patterns
usage %(prog)s [ -h ] [ -r ] [ -d db ] [ -p ] [ -t ] ...
-h - print this documentation and exit.
-r - treat tokens as regular expressions - may not be used with -t
-d db - use db instead of the default found in the options file
-p - db is actually a pickle
-t - read message from stdin, tokenize it, then display their counts
may not be used with -r
"""
from __future__ import division
import sys
import getopt
import re
import sets
import os
import shelve
import pickle
import csv
from spambayes.Options import options, get_pathname_option
from spambayes.tokenizer import tokenize
from spambayes.storage import STATE_KEY
prog = sys.argv[0]
def usage(msg=None):
if msg is not None:
print >> sys.stderr, msg
print >> sys.stderr, __doc__.strip() % globals()
# From msgs on spambayes mailing list, spam prob is calculated thusly:
## hc = ham token count
## nh = total number of ham messages
## sc = spam token count
## ns = total number of spam messages
## hr = ham ratio = hc / nh
## sr = spam ratio = sc / ns
## p = base spam probability = sr / (sr + hr)
## S = unknown word strength (static factor = 0.45 by default)
## x = unknown word probability (static factor = 0.5 by default)
## n = total number of messages the token appeared in = hc + sc
## sp = final spam probability = ((S * x) + (n * p)) / (S + n)
def print_spamcounts(tokens, db, use_re):
if use_re:
s = sets.Set()
keys = db.keys()
for pat in tokens:
for k in keys:
if re.search(pat, k) is not None:
s.add(k)
tokens = list(s)
S = options["Classifier", "unknown_word_strength"]
x = options["Classifier", "unknown_word_prob"]
_, ns, nh = db[STATE_KEY]
writer = csv.writer(sys.stdout)
writer.writerow(("token", "nspam", "nham", "spam prob"))
seen = sets.Set()
for t in tokens:
if t in seen:
continue
seen.add(t)
try:
sc, hc = db.get(t, (0, 0))
except ValueError:
_, sc, hc = db.get(t, (0, 0, 0))
if sc == hc == 0:
continue
hr = hc / nh
sr = sc / ns
p = sr / (sr + hr)
n = hc + sc
sp = ((S * x) + (n * p)) / (S + n)
writer.writerow((t, sc, hc, sp))
def main(args):
try:
opts, args = getopt.getopt(args, "hrd:t",
["help", "re", "database=", "pickle",
"tokenize"])
except getopt.GetoptError, msg:
usage(msg)
return 1
usere = False
dbname = get_pathname_option("Storage", "persistent_storage_file")
ispickle = not options["Storage", "persistent_use_database"]
tokenizestdin = False
for opt, arg in opts:
if opt in ("-h", "--help"):
usage()
return 0
elif opt in ("-d", "--database"):
dbname = arg
elif opt in ("-r", "--re"):
usere = True
elif opt in ("-p", "--pickle"):
ispickle = True
elif opt in ("-t", "--tokenize"):
tokenizestdin = True
if usere and tokenizestdin:
usage("-r and -t may not be used at the same time")
return 1
dbname = os.path.expanduser(dbname)
print >> sys.stderr, "db:", dbname
if ispickle:
db = pickle.load(file(dbname))
else:
db = shelve.open(dbname, flag='r')
if tokenizestdin:
args = tokenize(sys.stdin)
if args:
print_spamcounts(args, db, usere)
return 0
else:
usage("need tokens on cmd line or -t w/ msg on stdin")
return 1
if __name__ == "__main__":
sys.exit(main(sys.argv[1:]))
syntax highlighted by Code2HTML, v. 0.9.1