#!/usr/local/bin/python2.3
"""
Create mapping from features to message ids
usage %(prog)s [ options ] mailbox ...
-d mapfile - identify file which will hold mapping information (required)
-t ham|spam - identify the type of messages in the input mailbox(es)
-h - print this documentation and exit
One of '-t ham' or '-t spam' must be given, as must one or more message
sources.
"""
import sys
import getopt
import anydbm
import cPickle as pickle
from spambayes.mboxutils import getmbox
from spambayes.tokenizer import tokenize
from spambayes.Options import options
from spambayes.classifier import Classifier
# Script name as invoked; substituted for %(prog)s when usage() formats
# the module docstring against globals().
prog = sys.argv[0]
def usage(msg=None):
    """Write the usage text to stderr, preceded by msg when one is given.

    msg -- optional error text (or exception object) written on its own
           line before the usage text; nothing extra is written for None.

    The usage text is the stripped module docstring with its %(name)s
    placeholders filled in from the module globals.
    """
    err = sys.stderr
    if msg is not None:
        # "%s" stringifies msg, so exception objects are accepted as well.
        err.write("%s\n" % (msg,))
    usage_text = __doc__.strip() % globals()
    err.write("%s\n" % (usage_text,))
def _record_token(mapdb, token, mboxtype, f, msgid):
    """Note in mapdb that token occurs in message msgid of mailbox f.

    Each mapdb value is a (ham, spam) pair of dicts; each dict maps a
    mailbox name to the set of message ids recorded for the token.
    """
    ham, spam = mapdb.get(token, ({}, {}))
    if mboxtype == "ham":
        bucket = ham
    else:
        bucket = spam
    bucket.setdefault(f, set()).add(msgid)
    # Store the pair back: for a token seen for the first time the pair
    # came from the get() default and is not yet held by mapdb.
    mapdb[token] = (ham, spam)


def mapmessages(f, mboxtype, mapdb):
    """Record the tokens of every message in mailbox f into mapdb.

    f        -- message source handed to getmbox(); also used as the
                mailbox key inside each mapdb entry
    mboxtype -- "ham" records under the ham half of each entry; any
                other value records under the spam half
    mapdb    -- mapping of token -> (ham, spam), updated in place; it
                must support get() and item assignment

    Messages for which msg.get("message-id") is None are counted in the
    progress display but contribute nothing to mapdb.  A running message
    count is written to stdout, followed by a newline at the end.
    """
    i = 0
    for msg in getmbox(f):
        i += 1
        # '\r' returns to the start of the line so the count overwrites
        # itself instead of scrolling.
        sys.stdout.write('\r%s: %d' % (f, i))
        sys.stdout.flush()
        msgid = msg.get("message-id")
        if msgid is None:
            continue
        for t in tokenize(msg):
            _record_token(mapdb, t, mboxtype, f, msgid)
        if options["Classifier", "x-use_bigrams"]:
            # Second pass over the same message using the classifier's
            # enhanced token stream.
            for t in Classifier()._enhance_wordstream(tokenize(msg)):
                _record_token(mapdb, t, mboxtype, f, msgid)
    sys.stdout.write("\n")
def main(args):
    """Parse the command line, then update the pickled map file.

    args -- argument list without the program name (e.g. sys.argv[1:])

    Returns 0 after printing help, 1 after any usage error, and None
    once the map file has been rewritten successfully.
    """
    try:
        opts, args = getopt.getopt(args, "hd:t:",
                                   ["type=", "help", "database="])
    except getopt.GetoptError:
        # sys.exc_info() fetches the exception object without relying on
        # version-specific "except ..., name" syntax.
        usage(sys.exc_info()[1])
        return 1

    mapfile = None
    mboxtype = None
    for opt, arg in opts:
        if opt in ("-h", "--help"):
            usage()
            return 0
        elif opt in ("-d", "--database"):
            mapfile = arg
        elif opt in ("-t", "--type"):
            mboxtype = arg

    if mapfile is None:
        usage("'-d mapfile' is required")
        return 1
    if mboxtype is None:
        usage("'-t ham|spam' is required")
        return 1
    if mboxtype not in ("ham", "spam"):
        usage("mboxtype must be 'ham' or 'spam'")
        return 1
    if not args:
        # The usage text demands one or more message sources; without
        # this check the map file was simply loaded and written back.
        usage("at least one mailbox is required")
        return 1

    # NOTE: unpickling can execute arbitrary code, so mapfile must come
    # from a trusted source.
    try:
        fp = open(mapfile)
        try:
            mapd = pickle.load(fp)
        finally:
            fp.close()
    except IOError:
        # Typically the map file does not exist yet: start from empty.
        mapd = {}

    for f in args:
        mapmessages(f, mboxtype, mapd)

    # Close explicitly so the data is flushed to disk before returning
    # rather than whenever the file object happens to be collected.
    fp = open(mapfile, "w")
    try:
        pickle.dump(mapd, fp)
    finally:
        fp.close()
if __name__ == "__main__":
    # Run as a script: whatever main() returns becomes the exit status.
    status = main(sys.argv[1:])
    sys.exit(status)
# syntax highlighted by Code2HTML, v. 0.9.1