#!/usr/local/bin/python2.3

"""
Extract messages which contain given features

usage: %(prog)s [ options ]

-d mapfile - specify file which holds feature mapping information (required)

-S file - output spam message file

-H file - output spam message file

-f feature - specify feature to locate (may be given more than once)

-h - print this documentation and exit

At least one of either the -H or -S flags must be given on the command line.
If no features are given on the command line with the -f flag, one or more
files containing messages with X-Spambayes-Evidence headers must be given.
"""

import sys
import getopt
import re
import cPickle as pickle

from spambayes.mboxutils import getmbox

prog = sys.argv[0]

def usage(msg=None):
    if msg is not None:
        print >> sys.stderr, msg
    print >> sys.stderr, __doc__.strip() % globals()

def extractmessages(features, mapdb, hamfile, spamfile):
    """extract messages which contain given features"""
    hamids = {}
    spamids = {}

    for feature in features:
        ham, spam = mapdb.get(feature, ([], []))
        if hamfile is not None:
            for mbox in ham:
                msgids = hamids.get(mbox, set())
                msgids.update(ham.get(mbox, set()))
                hamids[mbox] = msgids
        if spamfile is not None:
            for mbox in spam:
                msgids = spamids.get(mbox, set())
                msgids.update(spam.get(mbox, set()))
                spamids[mbox] = msgids

    # now run through each mailbox in hamids and spamids and print
    # matching messages to relevant ham or spam files
    for mailfile in hamids:
        i = 0
        msgids = hamids[mailfile]
        for msg in getmbox(mailfile):
            if msg.get("message-id") in msgids:
                i += 1
                sys.stdout.write('\r%s: %5d' % (mailfile, i))
                sys.stdout.flush()
                print >> hamfile, msg
    print

    for mailfile in spamids:
        i = 0
        msgids = spamids[mailfile]
        for msg in getmbox(mailfile):
            if msg.get("message-id") in msgids:
                i += 1
                sys.stdout.write('\r%s: %5d' % (mailfile, i))
                sys.stdout.flush()
                print >> spamfile, msg
    print

def main(args):
    try:
        opts, args = getopt.getopt(args, "hd:S:H:f:",
                                   ["help", "database=", "spamfile=",
                                    "hamfile=", "feature="])
    except getopt.GetoptError, msg:
        usage(msg)
        return 1

    mapfile = spamfile = hamfile = None
    features = set()
    for opt, arg in opts:
        if opt in ("-h", "--help"):
            usage()
            return 0
        elif opt in ("-d", "--database"):
            mapfile = arg
        elif opt in ("-H", "--hamfile"):
            hamfile = arg
        elif opt in ("-S", "--spamfile"):
            spamfile = arg
        elif opt in ("-f", "--feature"):
            features.add(arg)

    if hamfile is None and spamfile is None:
        usage("At least one of -S or -H are required")
        return 1

    if mapfile is None:
        usage("'-d mapfile' is required")
        return 1

    try:
        mapd = pickle.load(file(mapfile))
    except IOError:
        usage("Mapfile %s does not exist" % mapfile)
        return 1

    if not features and not args:
        usage("Require at least one feature (-f) arg or one message file")
        return 1

    if not features:
        # extract significant tokens from each message and identify
        # where they came from
        for f in args:
            for msg in getmbox(f):
                evidence = msg.get("X-Spambayes-Evidence", "")
                evidence = re.sub(r"\s+", " ", evidence)
                features = [e.rsplit(": ", 1)[0]
                              for e in evidence.split("; ")[2:]]
                features = set([eval(f) for f in features])
        if not features:
            usage("No X-Spambayes-Evidence headers found")
            return 1

    if spamfile is not None:
        spamfile = file(spamfile, "w")
    if hamfile is not None:
        hamfile = file(hamfile, "w")

    extractmessages(features, mapd, hamfile, spamfile)

if __name__ == "__main__":
    sys.exit(main(sys.argv[1:]))


syntax highlighted by Code2HTML, v. 0.9.1