#!/usr/local/bin/python2.3

"""
Create mapping from features to message ids

usage %(prog)s [ options ] mailbox ...

-d mapfile - identify file which will hold mapping information (required)

-t ham|spam - identify the type of messages in the input mailbox(es)

-h - print this documentation and exit

One of '-t ham' or '-t spam' must be given, as must one or more message
sources.
"""

import sys
import getopt
import anydbm
import cPickle as pickle

from spambayes.mboxutils import getmbox
from spambayes.tokenizer import tokenize
from spambayes.Options import options
from spambayes.classifier import Classifier

# Name this script was invoked as; usage() substitutes it for the
# %(prog)s placeholder in the module docstring via globals().
prog = sys.argv[0]

def usage(msg=None):
    """Print an optional message followed by the help text to stderr.

    msg, when not None, is printed first on its own line.  The help text
    is the module docstring, stripped of surrounding whitespace and
    %-formatted against the module globals.
    """
    err = sys.stderr
    if msg is not None:
        print >> err, msg
    # __doc__ here resolves to the module docstring, not this function's.
    helptext = __doc__.strip() % globals()
    print >> err, helptext

def _addmsgid(mapdb, token, mboxtype, f, msgid):
    """Record in mapdb that message msgid from mailbox f contains token.

    mapdb maps each token to a (ham, spam) pair of dictionaries; each of
    those maps a mailbox name to the set of message ids seen in it.  Any
    mboxtype other than "ham" is recorded on the spam side.
    """
    ham, spam = mapdb.get(token, ({}, {}))
    if mboxtype == "ham":
        bymbox = ham
    else:
        bymbox = spam
    msgids = bymbox.get(f, set())
    msgids.add(msgid)
    bymbox[f] = msgids
    # Store the pair back so tokens seen for the first time get an entry.
    mapdb[token] = (ham, spam)

def mapmessages(f, mboxtype, mapdb):
    """Add the tokens of every message in mailbox f to mapdb.

    f        -- mailbox source handed to getmbox(); also used as the
                key under which message ids are grouped.
    mboxtype -- "ham" to record under the ham side of each entry; any
                other value records under the spam side.
    mapdb    -- mapping updated in place (token -> (ham, spam) pair of
                {mailbox: set of message ids} dictionaries).

    Messages without a Message-ID header are counted in the progress
    display but otherwise skipped.  When the "x-use_bigrams" classifier
    option is set, the tokens produced by
    Classifier()._enhance_wordstream() are recorded as well.  A running
    "mailbox: count" progress line is written to stdout.
    """
    i = 0
    for msg in getmbox(f):
        i += 1
        # '\r' rewrites the same terminal line with the running count.
        sys.stdout.write('\r%s: %d' % (f, i))
        sys.stdout.flush()
        msgid = msg.get("message-id")
        if msgid is None:
            # Nothing to map the tokens to without a Message-ID.
            continue
        for t in tokenize(msg):
            _addmsgid(mapdb, t, mboxtype, f, msgid)
        if options["Classifier", "x-use_bigrams"]:
            for t in Classifier()._enhance_wordstream(tokenize(msg)):
                _addmsgid(mapdb, t, mboxtype, f, msgid)
    sys.stdout.write("\n")

def main(args):
    try:
        opts, args = getopt.getopt(args, "hd:t:",
                                   ["type=", "help", "database="])
    except getopt.GetoptError, msg:
        usage(msg)
        return 1

    mapfile = None
    mboxtype = None
    for opt, arg in opts:
        if opt in ("-h", "--help"):
            usage()
            return 0
        elif opt in ("-d", "--database"):
            mapfile = arg

        elif opt in ("-t", "--type"):
            mboxtype = arg

    if mapfile is None:
        usage("'-d mapfile' is required")
        return 1

    if mboxtype is None:
        usage("'-t ham|spam' is required")
        return 1

    if mboxtype not in ("ham", "spam"):
        usage("mboxtype must be 'ham' or 'spam'")
        return 1

    try:
        mapd = pickle.load(file(mapfile))
    except IOError:
        mapd = {}

    for f in args:
        mapmessages(f, mboxtype, mapd)
    pickle.dump(mapd, file(mapfile, "w"))

# Run as a script: hand main() the arguments after the program name and
# use its return value as the process exit status.
if __name__ == "__main__":
    status = main(sys.argv[1:])
    sys.exit(status)


# syntax highlighted by Code2HTML, v. 0.9.1