#! /usr/local/bin/python2.3
# Train a classifier from Outlook Mail folders
# Authors: Sean D. True, WebReply.Com, Mark Hammond
# October, 2002
# Copyright PSF, license under the PSF license
import traceback
from win32com.mapi import mapi
# Make sure the names True and False are usable in this module: merely
# evaluating them raises NameError on an interpreter that does not define
# them, in which case integer stand-ins are bound at module level.
try:
    True, False
except NameError:
    # Maintain compatibility with Python 2.2
    True, False = 1, 0
# Note our Message Database uses PR_SEARCH_KEY, *not* PR_ENTRYID, as the
# latter changes after a Move operation - see msgstore.py
def been_trained_as_ham(msg, cdata):
    """Report whether msg is recorded in the message database as ham.

    The database is keyed by msg.searchkey; a stored value of '0' marks
    a message trained as ham.  Messages with no entry yield False.
    """
    db = cdata.message_db
    key = msg.searchkey
    if db.has_key(key):
        return db[key] == '0'
    return False
def been_trained_as_spam(msg, cdata):
    """Report whether msg is recorded in the message database as spam.

    The database is keyed by msg.searchkey; a stored value of '1' marks
    a message trained as spam.  Messages with no entry yield False.
    """
    db = cdata.message_db
    key = msg.searchkey
    if db.has_key(key):
        return db[key] == '1'
    return False
def train_message(msg, is_spam, cdata):
    """Train the classifier on a single message.

    Returns True when the message was trained: either it had no entry in
    the message database, or it was recorded under the other category (in
    which case the old classification is unlearned first).  Returns False
    when it is already recorded under the requested category.

    No exceptions are handled here; callers must catch their own.
    """
    from spambayes.tokenizer import tokenize

    db = cdata.message_db
    key = msg.searchkey

    # None means "never trained"; otherwise the previous spam/ham verdict.
    if db.has_key(key):
        was_spam = db[key] == '1'
    else:
        was_spam = None

    if was_spam == is_spam:
        # Already recorded under the requested category.
        return False

    # From here on the message is either brand new or was misclassified.
    stream = msg.GetEmailPackageObject()
    if was_spam is not None:
        # Category changed: remove the effect of the earlier training.
        cdata.bayes.unlearn(tokenize(stream), was_spam)

    cdata.bayes.learn(tokenize(stream), is_spam)
    # Record the new category as '0' (ham) or '1' (spam).
    db[key] = ('0', '1')[is_spam]
    cdata.dirty = True
    return True
# Untrain a message.
# Return: None == not previously trained
# True == was_spam
# False == was_ham
def untrain_message(msg, cdata):
    """Remove a message's training from the classifier and the database.

    Returns True if the message had been trained as spam, False if it had
    been trained as ham, and None if it had not been trained at all.
    """
    from spambayes.tokenizer import tokenize
    stream = msg.GetEmailPackageObject()

    # Each row: (result / unlearn flag, check for this category, check for
    # the opposite category).  Spam is examined first, then ham.
    candidates = (
        (True, been_trained_as_spam, been_trained_as_ham),
        (False, been_trained_as_ham, been_trained_as_spam),
    )
    for was_spam, trained_as_this, trained_as_other in candidates:
        if not trained_as_this(msg, cdata):
            continue
        assert not trained_as_other(msg, cdata), "Can't have been both!"
        cdata.bayes.unlearn(tokenize(stream), was_spam)
        del cdata.message_db[msg.searchkey]
        cdata.dirty = True
        return was_spam

    # No record for this message in either category.
    return None
def train_folder(f, isspam, cdata, progress):
num = num_added = 0
for message in f.GetMessageGenerator():
if progress.stop_requested():
break
progress.tick()
try:
if train_message(message, isspam, cdata):
num_added += 1
except:
print "Error training message '%s'" % (message,)
traceback.print_exc()
num += 1
print "Checked", num, "in folder", f.name, "-", num_added, "new entries found."
def real_trainer(classifier_data, config, message_store, progress):
    """Train on the configured ham and spam folders, then save the database.

    Messages are counted first so the progress object can be given a tick
    budget.  Ham folders are processed before spam folders.  If a stop is
    requested at any checkpoint the function returns without saving.
    """
    progress.set_status("Counting messages")

    training = config.training
    # One row per pass: (is_spam flag, folder ids, include sub-folders,
    # status message template).
    passes = (
        (0, training.ham_folder_ids, training.ham_include_sub,
         "Processing good folder '%s'"),
        (1, training.spam_folder_ids, training.spam_include_sub,
         "Processing spam folder '%s'"),
    )

    total = 0
    for _flag, folder_ids, include_sub, _status in passes:
        for folder in message_store.GetFolderGenerator(folder_ids, include_sub):
            total += folder.count
    # The budget is the message count plus three extra ticks.
    progress.set_max_ticks(total + 3)

    for is_spam, folder_ids, include_sub, status in passes:
        for folder in message_store.GetFolderGenerator(folder_ids, include_sub):
            progress.set_status(status % (folder.name,))
            train_folder(folder, is_spam, classifier_data, progress)
            if progress.stop_requested():
                return

    progress.tick()
    if progress.stop_requested():
        return

    # Training is complete: move the progress display to its next stage
    # and write the database out.
    progress.set_max_ticks(1)
    progress.set_status("Writing the database...")
    classifier_data.Save()
# Called back from the dialog to do the actual training.
def trainer(mgr, config, progress):
    """Run a full training session on behalf of the training dialog.

    Validates that both ham and spam folders are configured, optionally
    rebuilds the databases from scratch, trains, optionally re-scores the
    trained folders, and finally reports the spam/ham totals through the
    progress object.  Returns early (with no final status) if a stop is
    requested.
    """
    training = config.training
    rebuild = training.rebuild
    rescore = training.rescore

    if not (training.ham_folder_ids and training.spam_folder_ids):
        progress.error("You must specify at least one spam, and one good folder")
        return

    if not rebuild:
        classifier_data = mgr.classifier_data
    else:
        # Train into a fresh temporary database; the manager only "adopts"
        # it once training completes.  A cancelled run therefore cannot
        # leave a bad database behind, and mail arriving during training
        # is not classified against a partially built one.
        import os
        import manager
        bayes_base = os.path.join(mgr.data_directory, "$sbtemp$default_bayes_database")
        mdb_base = os.path.join(mgr.data_directory, "$sbtemp$default_message_database")
        # Ask the manager module which storage class to use, then create it.
        storage_class = manager.GetStorageManagerClass()
        storage = storage_class(bayes_base, mdb_base)
        classifier_data = manager.ClassifierData(storage, mgr)
        classifier_data.InitNew()

    # Up to three progress stages.  Re-scoring is much slower than
    # training (each message has to be saved back), so it gets the largest
    # share.  Saving can be slow too, but it only ever has one tick.
    if not rescore:
        stages = (("Training", .9), ("Saving", .1))
    else:
        stages = (("Training", .3), ("Saving", .1), ("Scoring", .6))
    progress.set_stages(stages)

    real_trainer(classifier_data, config, mgr.message_store, progress)
    if progress.stop_requested():
        return

    if rebuild:
        # Hand the freshly built database over to the manager and keep
        # working with the manager's instance from here on.
        assert mgr.classifier_data is not classifier_data
        mgr.classifier_data.Adopt(classifier_data)
        classifier_data = mgr.classifier_data
    progress.tick()

    if rescore:
        # Point the "filter now" settings at every folder just trained on,
        # with no unread/unseen restriction, then run the filter.
        filter_cfg = mgr.config.filter_now
        filter_cfg.only_unread = False
        filter_cfg.only_unseen = False
        filter_cfg.action_all = False
        filter_cfg.folder_ids = mgr.config.training.ham_folder_ids + mgr.config.training.spam_folder_ids
        filter_cfg.include_sub = mgr.config.training.ham_include_sub or mgr.config.training.spam_include_sub
        import filter
        filter.filterer(mgr, mgr.config, progress)

    bayes = classifier_data.bayes
    progress.set_status("Completed training with %d spam and %d good messages" % (bayes.nspam, bayes.nham))
def main():
print "Sorry - we don't do anything here any more"
# Run main() only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
# syntax highlighted by Code2HTML, v. 0.9.1