# ports//mail/py-spambayes/work/spambayes-1.0.4/Outlook2000/train.py

#! /usr/local/bin/python2.3
# Train a classifier from Outlook Mail folders
# Authors: Sean D. True, WebReply.Com, Mark Hammond
# October, 2002
# Copyright PSF, license under the PSF license

import traceback
from win32com.mapi import mapi

try:
    # Merely referencing the names raises NameError on an interpreter
    # that does not define them.
    True, False
except NameError:
    # Maintain compatibility with Python 2.2: fall back to the integers
    # 1 and 0 so the rest of the module can use True/False freely.
    True, False = 1, 0


# Note our Message Database uses PR_SEARCH_KEY, *not* PR_ENTRYID, as the
# latter changes after a Move operation - see msgstore.py
def been_trained_as_ham(msg, cdata):
    """Return True if `msg` is recorded in the message database as ham.

    The database (`cdata.message_db`) is keyed by `msg.searchkey` and
    holds the string '0' for a message trained as ham.  A message with
    no entry at all yields False.
    """
    db = cdata.message_db
    key = msg.searchkey
    if db.has_key(key):
        return db[key] == '0'
    return False

def been_trained_as_spam(msg, cdata):
    """Return True if `msg` is recorded in the message database as spam.

    The database (`cdata.message_db`) is keyed by `msg.searchkey` and
    holds the string '1' for a message trained as spam.  A message with
    no entry at all yields False.
    """
    db = cdata.message_db
    key = msg.searchkey
    if db.has_key(key):
        return db[key] == '1'
    return False

def train_message(msg, is_spam, cdata):
    """Train the classifier on a single message.

    Returns True when the message was newly learned under `is_spam`;
    if the database had it recorded in the other category, that old
    training is unlearned first.  Returns False, doing nothing, when the
    database already records the message in the requested category.

    Exceptions are not caught here - callers must handle them.  This
    function only updates the classifier and the message database; it
    does not write anything back to the message itself.
    """
    from spambayes.tokenizer import tokenize

    db = cdata.message_db
    key = msg.searchkey

    # previous is None for a message never trained before, otherwise it
    # says whether the message was last trained as spam.
    previous = None
    if db.has_key(key):
        previous = db[key] == '1'
    if previous == is_spam:
        # Nothing to do - already trained in this category.
        return False

    stream = msg.GetEmailPackageObject()
    classifier = cdata.bayes
    if previous is not None:
        # Trained the other way before: take that training back out.
        classifier.unlearn(tokenize(stream), previous)

    # Now learn it in the requested category and record that fact.
    classifier.learn(tokenize(stream), is_spam)
    db[key] = ['0', '1'][is_spam]
    cdata.dirty = True
    return True

# Untrain a message.
# Return: None == not previously trained
#         True == was_spam
#         False == was_ham
def untrain_message(msg, cdata):
    """Remove a previously trained message from the classifier.

    Looks `msg.searchkey` up in `cdata.message_db`.  If the message was
    trained, its tokens are unlearned from `cdata.bayes`, the database
    entry is deleted and `cdata.dirty` is set.

    Returns:
        None  - the message had not been trained (nothing is changed)
        True  - the message had been trained as spam
        False - the message had been trained as ham
    """
    from spambayes.tokenizer import tokenize

    db = cdata.message_db
    key = msg.searchkey
    if not db.has_key(key):
        return None

    # A single lookup decides the category: '1' is spam, '0' is ham.
    trained_as = db[key]
    if trained_as == '1':
        was_spam = True
    elif trained_as == '0':
        was_spam = False
    else:
        # Unrecognised marker - treat as "not trained" and leave the
        # record untouched.
        return None

    # Only fetch the message content once we know there is something to
    # unlearn; untrained messages never need it.
    stream = msg.GetEmailPackageObject()
    cdata.bayes.unlearn(tokenize(stream), was_spam)
    del db[key]
    cdata.dirty = True
    return was_spam

def train_folder(f, isspam, cdata, progress):
    num = num_added = 0
    for message in f.GetMessageGenerator():
        if progress.stop_requested():
            break
        progress.tick()
        try:
            if train_message(message, isspam, cdata):
                num_added += 1
        except:
            print "Error training message '%s'" % (message,)
            traceback.print_exc()
        num += 1
    print "Checked", num, "in folder", f.name, "-", num_added, "new entries found."


def real_trainer(classifier_data, config, message_store, progress):
    """Train `classifier_data` from the configured ham and spam folders.

    First counts the messages in all configured folders so the progress
    indicator can be sized, then trains the ham folders followed by the
    spam folders, and finally saves the database.  Returns early,
    without saving, as soon as `progress.stop_requested()` reports true.
    """
    progress.set_status("Counting messages")

    training = config.training
    # One entry per pass, ham first:
    # (folder ids, include sub-folders?, status text, is-spam flag)
    passes = (
        (training.ham_folder_ids, training.ham_include_sub,
         "Processing good folder '%s'", 0),
        (training.spam_folder_ids, training.spam_include_sub,
         "Processing spam folder '%s'", 1),
    )

    total = 0
    for folder_ids, include_sub, status, is_spam in passes:
        for folder in message_store.GetFolderGenerator(folder_ids, include_sub):
            total += folder.count

    progress.set_max_ticks(total + 3)

    for folder_ids, include_sub, status, is_spam in passes:
        for folder in message_store.GetFolderGenerator(folder_ids, include_sub):
            progress.set_status(status % (folder.name,))
            train_folder(folder, is_spam, classifier_data, progress)
            if progress.stop_requested():
                return

    progress.tick()
    if progress.stop_requested():
        return

    # Training is complete: move the progress display on to its next
    # stage and write the database out.
    progress.set_max_ticks(1)
    progress.set_status("Writing the database...")
    classifier_data.Save()

# Called back from the dialog to do the actual training.
def trainer(mgr, config, progress):
    """Run a complete training session (the dialog's callback).

    Reads the `rebuild` and `rescore` options from `config.training`.
    With `rebuild`, training goes into a fresh temporary database that
    the manager's classifier data adopts only if training completes.
    With `rescore`, the training folders are re-filtered afterwards.
    Reports an error through `progress` and returns if either the ham
    or the spam folder list is empty; returns silently if the user
    requests a stop during training.
    """
    training = config.training
    rebuild = training.rebuild
    rescore = training.rescore

    # Training needs at least one folder of each kind.
    if not (training.ham_folder_ids and training.spam_folder_ids):
        progress.error("You must specify at least one spam, and one good folder")
        return

    if not rebuild:
        classifier_data = mgr.classifier_data
    else:
        # Train into a new temporary bayes database; the manager "adopts"
        # it only if we complete.  That way a cancelled run cannot leave
        # a "bad" db behind, and mail arriving during training is never
        # classified with a partially built database.
        import os, manager
        bayes_base = os.path.join(mgr.data_directory, "$sbtemp$default_bayes_database")
        mdb_base = os.path.join(mgr.data_directory, "$sbtemp$default_message_database")
        # Ask which storage manager to use, and create one.
        storage_class = manager.GetStorageManagerClass()
        storage = storage_class(bayes_base, mdb_base)
        classifier_data = manager.ClassifierData(storage, mgr)
        classifier_data.InitNew()

    # Up to three stages: train, save, and (optionally) re-score.
    # Re-scoring is much slower than training, as each message has to be
    # saved back.  Saving can be really slow too, but it only ever gets
    # a single tick.
    stages = ("Training", .9), ("Saving", .1)
    if rescore:
        stages = ("Training", .3), ("Saving", .1), ("Scoring", .6)
    progress.set_stages(stages)

    real_trainer(classifier_data, config, mgr.message_store, progress)

    if progress.stop_requested():
        return

    if rebuild:
        # Hand the freshly built database over to the manager.
        assert mgr.classifier_data is not classifier_data
        mgr.classifier_data.Adopt(classifier_data)
        classifier_data = mgr.classifier_data

    progress.tick()

    if rescore:
        # Point the "filter now" settings at the training folders and
        # run the filter over them.
        trained = mgr.config.training
        filter_config = mgr.config.filter_now
        filter_config.only_unread = False
        filter_config.only_unseen = False
        filter_config.action_all = False
        filter_config.folder_ids = trained.ham_folder_ids + trained.spam_folder_ids
        filter_config.include_sub = trained.ham_include_sub or trained.spam_include_sub
        import filter
        filter.filterer(mgr, mgr.config, progress)

    bayes = classifier_data.bayes
    progress.set_status("Completed training with %d spam and %d good messages" % (bayes.nspam, bayes.nham))
def main():
    print "Sorry - we don't do anything here any more"

# Only call main() when this file is run as a script, not when imported.
if __name__ == "__main__":
    main()
# syntax highlighted by Code2HTML, v. 0.9.1