#! /usr/local/bin/python2.3
"""\
To train:
    %(program)s -t ham.mbox spam.mbox

To filter mail (using .forward or .qmail):
    |%(program)s Maildir/ Mail/Spam/

To print the score and top evidence for a message or messages:
    %(program)s -s message [message ...]
"""

# A filtered message is filed as spam only when its score is strictly
# greater than this value.
SPAM_CUTOFF = 0.57

# Messages of this many bytes or more are not scored; filter_message gives
# them a score of 0.0, so they are delivered as ham.
SIZE_LIMIT = 5000000 # messages larger are not analyzed
# Number of bytes requested from stdin per read while filtering.
BLOCK_SIZE = 10000
# Per-user directory; train_messages creates it when it does not exist.
RC_DIR = "~/.spambayes"
# Default word database (can be replaced with the -d option).
DB_FILE = RC_DIR + "/wordprobs.cdb"
# Default configuration file (can be replaced with the -c option).
CONFIG_FILE = RC_DIR + "/bayescustomize.ini"

import email
import errno
import getopt
import os
import signal
import socket
import stat
import sys
import time

# Expand the leading "~" of the default database path once, at import time.
# A path supplied later through the -d option is used exactly as given.
DB_FILE = os.path.expanduser(DB_FILE)

def import_spambayes():
    """Import the spambayes names used by this script as module globals.

    Binds mboxutils, CdbClassifier and tokenize at module level.  Before
    the imports run, BAYESCUSTOMIZE is pointed at the expanded CONFIG_FILE
    unless the variable is already present in the environment (an existing
    value is left alone).
    """
    global mboxutils, CdbClassifier, tokenize
    # NOTE(review): presumably spambayes reads BAYESCUSTOMIZE while it is
    # being imported, which is why it is set first -- confirm.
    if 'BAYESCUSTOMIZE' not in os.environ:
        os.environ['BAYESCUSTOMIZE'] = os.path.expanduser(CONFIG_FILE)
    from spambayes import mboxutils
    from spambayes.cdb_classifier import CdbClassifier
    from spambayes.tokenizer import tokenize


# Define True and False as 1 and 0 when the running interpreter does not
# already provide those names.
try:
    True, False
except NameError:
    # Maintain compatibility with Python 2.2
    True, False = 1, 0


# Name this script was invoked as.  usage() fills the %(program)s fields of
# the module docstring from the module globals, so this name must exist.
program = sys.argv[0] # For usage(); referenced by docstring above

def usage(code, msg=''):
    """Write the help text to stderr and exit with status code.

    If msg is true it is printed first, followed by an empty line.  The
    help text is the module docstring with its %(name)s fields filled in
    from the module globals.
    """
    err = sys.stderr
    if msg:
        print >> err, msg
        print >> err
    help_text = __doc__ % globals()
    print >> err, help_text
    sys.exit(code)

def maketmp(dir):
    """Create and open a new, uniquely named file in dir/tmp.

    The file name has the form "<seconds>.<pid>.<hostname>".  The file is
    created with O_EXCL and owner-only read/write permission, so an
    existing file is never reused.  If the name is already taken, or the
    open is interrupted, the attempt is repeated after a two second sleep
    (which also changes the timestamp part of the name), up to 200 times.

    Returns a tuple (file object opened "wb", pathname, filename).

    Raises SystemExit when every attempt failed.  Any OSError other than
    EINTR or EEXIST (for example a missing dir/tmp) propagates unchanged.
    """
    hostname = socket.gethostname()
    pid = os.getpid()
    flags = os.O_WRONLY | os.O_CREAT | os.O_EXCL
    mode = stat.S_IRUSR | stat.S_IWUSR  # rw for the owner only (octal 600)
    for attempt in range(200):
        filename = "%d.%d.%s" % (time.time(), pid, hostname)
        pathname = "%s/tmp/%s" % (dir, filename)
        try:
            fd = os.open(pathname, flags, mode)
        except OSError:
            # os.open reports failures as OSError.  The exception is fetched
            # through sys.exc_info() so this handler parses the same way on
            # every Python version.
            exc = sys.exc_info()[1]
            if exc.errno not in (errno.EINTR, errno.EEXIST):
                raise
        else:
            return (os.fdopen(fd, "wb"), pathname, filename)
        time.sleep(2)
    raise SystemExit("could not create a mail file")

def train(bayes, msgs, is_spam):
    """Feed every message of the mailbox msgs to bayes.learn.

    msgs is handed to mboxutils.getmbox; each message it yields is
    tokenized and learned with the label is_spam.
    """
    for message in mboxutils.getmbox(msgs):
        tokens = tokenize(message)
        bayes.learn(tokens, is_spam)

def train_messages(ham_name, spam_name):
    """Create database using messages."""

    rc_dir = os.path.expanduser(RC_DIR)
    if not os.path.exists(rc_dir):
        print "Creating", RC_DIR, "directory..."
        os.mkdir(rc_dir)
    bayes = CdbClassifier()
    print 'Training with ham...'
    train(bayes, ham_name, False)
    print 'Training with spam...'
    train(bayes, spam_name, True)
    print 'Update probabilities and writing DB...'
    db = open(DB_FILE, "wb")
    bayes.save_wordinfo(db)
    db.close()
    print 'done'

def filter_message(hamdir, spamdir):
    """Read one message from stdin and file it under hamdir or spamdir.

    hamdir  -- directory with tmp/ and new/ subdirectories; the message is
               first written to hamdir/tmp and ends up in hamdir/new when
               it is not classified as spam
    spamdir -- directory with a new/ subdirectory that receives messages
               whose score is greater than SPAM_CUTOFF

    The content of the DTLINE environment variable, if any, is written in
    front of the stored message but is not part of the text that is scored.
    Messages of SIZE_LIMIT bytes or more are not scored and go to hamdir.

    On any failure the temporary file is removed and the exception is
    re-raised.  If the whole operation takes longer than 24 hours the
    alarm handler exits the process with status 1.
    """
    signal.signal(signal.SIGALRM, lambda s, f: sys.exit(1))
    signal.alarm(24 * 60 * 60)

    # write message to temporary file (must be on same partition)
    tmpfile, pathname, filename = maketmp(hamdir)
    try:
        try:
            tmpfile.write(os.environ.get("DTLINE", "")) # delivered-to line
            size = 0
            blocks = []
            while 1:
                block = sys.stdin.read(BLOCK_SIZE)
                if not block:
                    break
                size += len(block)
                if size < SIZE_LIMIT:
                    blocks.append(block)
                else:
                    # Too large to be scored: stop holding the text in
                    # memory, it is only copied to the temporary file.
                    blocks = []
                tmpfile.write(block)
        finally:
            # Release the file handle even when reading or writing failed.
            tmpfile.close()

        if size < SIZE_LIMIT:
            # The intermediate objects are deleted as soon as possible to
            # keep peak memory use down.
            msgdata = ''.join(blocks)
            del blocks
            msg = email.message_from_string(msgdata)
            del msgdata
            db = open(DB_FILE, 'rb')
            try:
                bayes = CdbClassifier(db)
                prob = bayes.spamprob(tokenize(msg))
            finally:
                db.close()
        else:
            prob = 0.0

        if prob > SPAM_CUTOFF:
            os.rename(pathname, "%s/new/%s" % (spamdir, filename))
        else:
            os.rename(pathname, "%s/new/%s" % (hamdir, filename))
    except:
        # Deliberately catches everything, including the SystemExit raised
        # by the alarm handler: the temporary file must not be left behind.
        os.unlink(pathname)
        raise

def print_message_score(msg_name, msg_fp):
    msg = email.message_from_file(msg_fp)
    bayes = CdbClassifier(open(DB_FILE, 'rb'))
    prob, evidence = bayes.spamprob(tokenize(msg), evidence=True)
    print msg_name, prob
    for word, prob in evidence:
        print '  ', `word`, prob

def main():
    """Parse the command line and run the requested mode.

    Options:
      -t       train: the two arguments are a ham and a spam mailbox
      -s       score: print the score of each named message, or of the
               message on stdin when no name is given
      -d FILE  use FILE as the word database
      -c FILE  use FILE as the configuration file

    Without -t or -s the message on stdin is filed into one of the two
    directories given as arguments.  A bad option or a wrong argument
    count prints the usage text and exits with status 2.
    """
    global DB_FILE, CONFIG_FILE

    try:
        opts, args = getopt.getopt(sys.argv[1:], 'tsd:c:')
    except getopt.error, msg:
        usage(2, msg)

    # When several mode flags are given, the last one wins.
    mode_by_flag = {'-t': 'train', '-s': 'score'}
    mode = 'sort'
    for opt, val in opts:
        if opt in mode_by_flag:
            mode = mode_by_flag[opt]
        elif opt == '-d':
            DB_FILE = val
        elif opt == '-c':
            CONFIG_FILE = val
        else:
            assert 0, 'invalid option'

    import_spambayes()

    if mode == 'score':
        if not args:
            print_message_score('<stdin>', sys.stdin)
        for name in args:
            print_message_score(name, open(name))
        return

    # Both remaining modes take exactly two arguments.
    if len(args) != 2:
        usage(2, 'wrong number of arguments')
    if mode == 'train':
        train_messages(args[0], args[1])
    else:
        filter_message(args[0], args[1])


# Run the command line interface only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == "__main__":
    main()


# syntax highlighted by Code2HTML, v. 0.9.1