#! /usr/local/bin/python2.3

### Train spambayes on messages in an MH mailbox, with spam identified
### by identical copies in other designated MH mailboxes.
###
### Run this from a cron job on your server.

"""Usage: %(program)s [OPTIONS] ...

Where OPTIONS is one or more of:
    -h
        show usage and exit
    -d DBNAME
        use the DBM store.  A DBM file is larger than the pickle and
        creating it is slower, but loading it is much faster,
        especially for large word databases.  Recommended for use with
        hammiefilter or any procmail-based filter.
    -D DBNAME
        use the pickle store.  A pickle is smaller and faster to create,
        but much slower to load.  Recommended for use with pop3proxy and
        hammiesrv.
    -e PATH
        directory of all messages (both ham and spam).
    -s PATH
        directory of known spam messages to train on.  These should be
        duplicates of messages in the everything folder.  Can be
        specified more than once.
    -f
        force training, ignoring the trained header.  Use this if you
        need to rebuild your database from scratch.
    -q
        quiet mode; no output
"""

import mboxutils
import getopt
import hammie
import sys
import os
import re
import time
import filecmp

# Script name; substituted for %(program)s when the usage text is printed.
program = sys.argv[0]
# Progress and report output is enabled unless the -q option clears this.
loud = True
# Number of seconds in one day.
day = 60 * 60 * 24
# The following are in days
expire = 30 * 4     # messages this old (or older) are skipped
grouping = 2        # days averaged together into one graph column

def usage(code, msg=''):
    """Write the usage text to stderr, then exit with status `code`.

    A non-empty `msg` is written first, followed by a blank line.  The
    usage text is the module docstring with its %(name)s placeholders
    filled in from the module globals.
    """
    err = sys.stderr
    if msg:
        # One-tuple so that any object is formatted through str().
        err.write("%s\n\n" % (msg,))
    err.write("%s\n" % (__doc__ % globals(),))
    sys.exit(code)

def row(value, spamday, hamday, unsureday, expire_days=None, group_days=None):
    """Render one horizontal line of the text histogram.

    value       -- threshold this line represents; printed as a 5-wide
                   label followed by '|'.
    spamday,
    hamday,
    unsureday   -- per-day counts, indexed by day number.  Days past the
                   end of a sequence count as zero for that sequence only.
    expire_days -- number of days covered by the line (defaults to the
                   module-level `expire`).
    group_days  -- number of consecutive days averaged into one column
                   (defaults to the module-level `grouping`).

    Returns the label plus one character per column, highest day numbers
    first.  A column is ' ' when no series averages at least `value`;
    otherwise it is 's', 'h' or 'u' for the series with the smallest
    average among those that reach `value` (ties keep the earlier letter
    in that order).  The letter is upper-cased when more than one series
    reaches `value`.
    """
    if expire_days is None:
        expire_days = expire
    if group_days is None:
        group_days = grouping

    cells = ["%5d|" % value]
    for j in range((expire_days // group_days) - 1, -1, -1):
        lo = j * group_days
        hi = lo + group_days
        # Slices never raise on short input, so no exception swallowing is
        # needed and each series is padded independently of the others.
        spamv = sum(spamday[lo:hi]) // group_days
        hamv = sum(hamday[lo:hi]) // group_days
        unsurev = sum(unsureday[lo:hi]) // group_days

        count = 0
        char = ' '
        if spamv >= value:
            count += 1
            char = 's'
        if hamv >= value:
            count += 1
            if char == ' ' or hamv < spamv:
                char = 'h'
        if unsurev >= value:
            count += 1
            if (char == ' ' or
                (char == 's' and unsurev < spamv) or
                (char == 'h' and unsurev < hamv)):
                char = 'u'
        if count > 1:
            # Upper case flags a column where bars overlap.
            char = char.upper()
        cells.append(char)
    return "".join(cells)

def legend():
    """Build the month-label line that is printed beneath the graph.

    The line starts as 60 blanks.  A '|' is placed in the column that
    corresponds to the first day of the current month and of each earlier
    month that still falls on the line, and abbreviated month names are
    written between the markers.  Column positions are derived from the
    age in days divided by the module-level `grouping`, counted back from
    column 59.
    """
    def midnight(text):
        # A date-only string parses to local midnight of that date.
        return time.mktime(time.strptime(text, "%d %b %Y"))

    def month_before(first_of_month):
        # Two days before the 1st is always inside the previous month.
        earlier = time.localtime(first_of_month - day * 2)
        return midnight(time.strftime("1 %b %Y", earlier))

    today = midnight(time.strftime("%d %b %Y"))

    def column(stamp):
        return int(59 - ((today - stamp) // day // grouping))

    ruler = " " * 60
    month_start = midnight(time.strftime("1 %b %Y"))
    edge = column(month_start)
    if edge >= 55:
        # Too close to the right-hand end to centre the name; append it.
        ruler = ruler[:edge] + time.strftime("| %b")
    else:
        ruler = ruler[:edge] + "|" + ruler[edge + 1:]
        mid = int((edge + 59) // 2)
        ruler = ruler[:mid] + time.strftime("%b") + ruler[mid + 3:]

    # Walk back one month at a time until the marker falls off the left.
    while True:
        month_start = month_before(month_start)
        next_edge = column(month_start)
        if next_edge < 0:
            break
        ruler = ruler[:next_edge] + "|" + ruler[next_edge + 1:]
        mid = int((edge + next_edge) // 2)
        month_name = time.strftime("%b", time.localtime(month_start))
        ruler = ruler[:mid] + month_name + ruler[mid + 3:]
        edge = next_edge

    if edge >= 4:
        # Room remains left of the last marker: label the partial month.
        mid = int(edge // 2)
        month_name = time.strftime("%b", time.localtime(month_start))
        ruler = ruler[:mid - 2] + month_name + ruler[mid + 1:]
    return ruler

def _spam_files_by_size(spamdirs):
    """Return {size in bytes: [paths]} for the messages in `spamdirs`.

    Only directory entries whose name starts with a digit 1-9 are
    considered.  Grouping by size lets the caller compare a candidate
    message only against spam files of the same length.
    """
    by_size = {}
    for spamdir in spamdirs:
        if loud:
            print("Scanning spamdir (%s):" % spamdir)
        for entry in os.listdir(spamdir):
            if entry[0] in "123456789":
                path = os.path.join(spamdir, entry)
                by_size.setdefault(os.stat(path).st_size, []).append(path)
    return by_size

def _message_age(msg, now, date_re):
    """Return the age of `msg` in days, or None if it cannot be used.

    The date is taken from the first Received header.  None is returned
    when that header is missing, holds no date `date_re` recognises, the
    date cannot be converted, or the date lies after `now`.
    """
    received = msg.get_all("Received")
    if not received:
        return None
    try:
        found = date_re.search(received[0])
        if found is None:
            return None
        stamp = time.mktime(time.strptime(found.group(1), "%d %b %Y"))
    except (TypeError, ValueError, OverflowError):
        # Undatable messages are skipped by the caller, never fatal.
        return None
    age = (now - stamp) // day
    if age < 0:
        return None
    return age

def main():
    """Main program; parse options and go.

    Trains the database named by -d/-D on every sufficiently recent
    message in the -e directory, treating a message as spam when an
    identical file exists in one of the -s directories.  Unless -q is
    given, a per-day graph and totals are printed.  The database is
    stored at the end.

    Exits through usage() (SystemExit) on -h or on invalid options.
    """

    global loud

    try:
        opts, args = getopt.getopt(sys.argv[1:], 'hfqd:D:s:e:')
    except getopt.error:
        usage(2, sys.exc_info()[1])

    if not opts:
        usage(2, "No options given")

    pck = None
    usedb = None
    force = False
    everything = None
    spam = []
    for opt, arg in opts:
        if opt == '-h':
            usage(0)
        elif opt == "-f":
            # NOTE(review): the flag is accepted, but nothing below reads
            # `force`; every message in range is trained regardless.
            force = True
        elif opt == "-q":
            loud = False
        elif opt == '-e':
            everything = arg
        elif opt == '-s':
            spam.append(arg)
        elif opt == "-d":
            usedb = True
            pck = arg
        elif opt == "-D":
            usedb = False
            pck = arg
    if args:
        usage(2, "Positional arguments not allowed")

    if usedb is None:
        usage(2, "Must specify one of -d or -D")

    # Checked before the database is opened so that a bad command line
    # fails with a usage message rather than inside os.listdir().
    if everything is None:
        usage(2, "Must specify -e PATH")

    h = hammie.open(pck, usedb, "c")

    spamsizes = _spam_files_by_size(spam)

    skipcount = 0
    spamcount = 0
    hamcount = 0
    spamday = [0] * expire
    hamday = [0] * expire
    unsureday = [0] * expire
    date_re = re.compile(
        r";.* (\d{1,2} (?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec) \d{2,4})")
    # Local midnight today, so that ages come out in whole days.
    now = time.mktime(time.strptime(time.strftime("%d %b %Y"), "%d %b %Y"))
    if loud:
        print("Scanning everything")
    for f in os.listdir(everything):
        if f[0] not in "123456789":
            continue
        name = os.path.join(everything, f)

        fh = open(name, "rb")
        try:
            msg = mboxutils.get_message(fh)
        finally:
            # Close the file even when parsing raises.
            fh.close()

        # Skip anything that has no date or is too old or from the future
        age = _message_age(msg, now, date_re)
        if age is None or age >= expire:
            skipcount += 1
            if loud and not (skipcount % 100):
                sys.stdout.write("-")
                sys.stdout.flush()
            continue
        age = int(age)

        classification = msg.get("X-Spambayes-Classification")
        if classification is not None and classification.find("unsure") >= 0:
            unsureday[age] += 1

        size = os.stat(name).st_size
        isspam = False
        for candidate in spamsizes.get(size, ()):
            if filecmp.cmp(name, candidate):
                isspam = True
                break       # one identical copy is enough
        if isspam:
            spamcount += 1
            spamday[age] += 1
            if loud and not (spamcount % 100):
                sys.stdout.write("s")
                sys.stdout.flush()
        else:
            hamcount += 1
            hamday[age] += 1
            if loud and not (hamcount % 100):
                sys.stdout.write("h")
                sys.stdout.flush()

        h.train(msg, isspam)

    if loud:
        sys.stdout.write("\n")

        mval = max(max(spamday), max(hamday), max(unsureday))
        scale = (mval + 19) // 20
        print("%5d" % mval)
        for j in range(19, -1, -1):
            print(row(scale * j, spamday, hamday, unsureday))
        print("     +" + ('-' * 60))
        print("      " + legend())
        sys.stdout.write("\n")

        # Avoid dividing by zero when no message was recent enough.
        total = hamcount + spamcount
        if total:
            percent = spamcount * 100.0 / total
        else:
            percent = 0.0
        print("Total: %d ham, %d spam (%.2f%% spam)" % (
            hamcount, spamcount, percent))

    h.store()


# Run the trainer only when this file is executed as a script, not when
# it is imported as a module.
if __name__ == "__main__":
    main()


# syntax highlighted by Code2HTML, v. 0.9.1