"""regimes.py
This module is not executable - it contains regime definitions
for use with incremental.py. Pass the name of any regime to
incremental.py with the "-r" switch, and it will be loaded from
this module.
Existing regimes are:
'perfect' A train-on-everything regime. The trainer is given
perfect and immediate knowledge of the proper
classification.
'corrected' A train-on-everything regime. The trainer trusts the
classifier result until end-of-group, at which point
all mistrained and non-trained items (fp, fn, and
unsure) are corrected to be trained with their proper
classification.
'balanced_corrected'
A partial-training regime. Works just like the
'corrected' regime, except that if the database is
imbalanced more than 2::1 (or 1::2) then messages
guessed to be of the over-represented class are not
used for training.
'expire4months' This is like 'perfect', except that messages are
untrained after 120 groups have passed.
'nonedge' A partial-training regime, which trains only on messages
which are not properly classified with scores of 1.00 or
0.00 (rounded). All false positives and false negatives
*are* trained.
'fpfnunsure' A partial-training regime, which trains only on
false positives, false negatives and unsures.
'fnunsure' A partial-training regime, which trains only on
false negatives and unsures. This simulates, for
example, a user who deletes all mail classified as spam
without ever examining it for false positives.
"""
###
### This is a training regime for the incremental.py harness.
### It does perfect training on all messages.
###
class perfect:
    """Regime that always reports the true classification.

    The classifier's guess is ignored entirely: whatever the real class of
    a message is, that is what ``guess_action`` hands back.
    """

    def __init__(self):
        # Stateless regime: there is nothing to initialise.
        pass

    def group_action(self, which, test):
        """Do nothing at a group boundary; no corrections are ever pending."""
        return None

    def guess_action(self, which, test, guess, actual, msg):
        """Return ``actual`` unchanged, whatever the classifier guessed."""
        return actual
###
### This is a training regime for the incremental.py harness.
### It does guess-based training on all messages, followed by
### correction to perfect at the end of each group.
###
class corrected:
    """Regime that trusts the classifier, then repairs mistakes per group.

    ``guess_action`` returns the classifier's own verdict (``guess[0]``) and
    remembers every message whose verdict differed from the truth.
    ``group_action`` then untrains the wrongly trained messages, trains
    them (and the unsures) under their real class, and forgets them.

    Convention used here: ``actual < 0`` means spam, otherwise ham; a
    verdict of ``0`` means the classifier was unsure.
    """

    def __init__(self):
        self._reset()

    def _reset(self):
        # Start each group with brand-new lists (not cleared in place), so
        # lists already handed to the harness are never mutated afterwards.
        self.spam_to_ham = []
        self.ham_to_spam = []
        self.unsure_to_ham = []
        self.unsure_to_spam = []

    def group_action(self, which, test):
        """Apply all pending corrections through ``test`` and reset.

        ``test.untrain`` / ``test.train`` are called with ``(ham, spam)``
        lists, in that order.
        """
        # Remove the mistaken training: messages trained as ham that were
        # really spam, and messages trained as spam that were really ham.
        test.untrain(self.ham_to_spam, self.spam_to_ham)
        # Retrain those same messages under their true class.
        test.train(self.spam_to_ham, self.ham_to_spam)
        # Unsures were never trained, so they only need training.
        test.train(self.unsure_to_ham, self.unsure_to_spam)
        self._reset()

    def guess_action(self, which, test, guess, actual, msg):
        """Queue ``msg`` for correction if misjudged; return the verdict."""
        verdict = guess[0]
        if verdict == actual:
            # Correctly classified: nothing to fix later.
            return verdict

        # Choose the pair of queues matching the true class.
        if actual < 0:
            from_unsure, from_wrong = self.unsure_to_spam, self.ham_to_spam
        else:
            from_unsure, from_wrong = self.unsure_to_ham, self.spam_to_ham

        if verdict == 0:
            from_unsure.append(msg)
        else:
            from_wrong.append(msg)
        return verdict
###
### This is a training regime for the incremental.py harness.
### It does guess-based training on all messages, as long
### as the ham::spam ratio stays roughly even (not more than 2::1),
### followed by correction to perfect at the end of each group.
###
class balanced_corrected(corrected):
    """'corrected' regime that declines to train when the data is lopsided.

    While the trained ham-to-spam ratio is above ``ratio_maximum`` (or below
    its reciprocal), messages guessed to be of the over-represented class
    get a return value of 0 (no training) and are not queued for
    correction. Everything else is delegated to ``corrected``.
    """

    # Largest tolerated ham::spam (or spam::ham) ratio.
    ratio_maximum = 2.0

    def guess_action(self, which, test, guess, actual, msg):
        """Return 0 to skip an unbalancing message, else defer to 'corrected'.

        Reads ``test.nham_trained`` and ``test.nspam_trained``.
        """
        verdict = guess[0]
        # Balancing applies only to a definite guess made after both classes
        # have been trained: with only one class known, everything looks
        # like that class, so those cases follow the plain 'corrected' path.
        balancing_applies = (verdict != 0 and
                             test.nham_trained != 0 and
                             test.nspam_trained != 0)
        if balancing_applies:
            ham_per_spam = test.nham_trained / float(test.nspam_trained)
            if verdict == 1 and ham_per_spam > self.ratio_maximum:
                # Already too much ham, and this looks like ham: skip it.
                return 0
            if verdict == -1 and ham_per_spam < (1 / self.ratio_maximum):
                # Already too much spam, and this looks like spam: skip it.
                return 0
        return corrected.guess_action(self, which, test, guess, actual, msg)
###
### This is a training regime for the incremental.py harness.
### It does perfect training for fp, fn, and unsures.
###
class fpfnunsure:
    """Regime that trains only on messages the classifier got wrong.

    A verdict (``guess[0]``) that differs from the truth covers false
    positives, false negatives and unsures (verdict 0); those are reported
    with their real class. Correctly classified messages yield 0, the
    "do not train" value.
    """

    def __init__(self):
        # Stateless regime: there is nothing to initialise.
        pass

    def group_action(self, which, test):
        """Do nothing at a group boundary."""
        return None

    def guess_action(self, which, test, guess, actual, msg):
        """Return ``actual`` for a wrong or unsure verdict, otherwise 0."""
        if guess[0] == actual:
            return 0
        return actual
###
### This is a training regime for the incremental.py harness.
### It does perfect training for false negatives and unsures,
### leaving false positives uncorrected.
###
class fnunsure:
    """Regime that trains only on false negatives and unsures.

    Anything the classifier called spam (a negative verdict) is never
    looked at again, so false positives go uncorrected. A wrong verdict of
    ham (positive) or unsure (0) is reported with the real class; every
    other case yields 0, the "do not train" value.
    """

    def __init__(self):
        # Stateless regime: there is nothing to initialise.
        pass

    def group_action(self, which, test):
        """Do nothing at a group boundary."""
        return None

    def guess_action(self, which, test, guess, actual, msg):
        """Return ``actual`` for a wrong non-spam verdict, otherwise 0."""
        verdict = guess[0]
        # Skip correct verdicts and anything that was classified as spam.
        if verdict == actual or verdict < 0:
            return 0
        return actual
###
### This is a training regime for the incremental.py harness.
### It does perfect training for all messages not already
### properly classified with extreme confidence.
###
class nonedge:
    """Regime that skips only confidently correct classifications.

    A message is reported with its real class when the verdict
    (``guess[0]``) is wrong, or when the score (``guess[1]``) lies strictly
    between 0.005 and 0.995, i.e. would not round to 0.00 or 1.00. Only a
    correct verdict with an extreme score yields 0, the "do not train"
    value.
    """

    def __init__(self):
        # Stateless regime: there is nothing to initialise.
        pass

    def group_action(self, which, test):
        """Do nothing at a group boundary."""
        return None

    def guess_action(self, which, test, guess, actual, msg):
        """Return ``actual`` unless the verdict is right with an edge score."""
        # The score is consulted only when the verdict itself was correct.
        if guess[0] != actual or 0.005 < guess[1] < 0.995:
            return actual
        return 0
###
### This is a training regime for the incremental.py harness.
### It does perfect training on all messages, followed by
### untraining after 120 groups have gone by.
###
class expire4months:
    """Perfect training with expiry of old groups.

    Every message is reported with its real class (as in a train-on-
    everything regime) and also remembered in a per-group bucket.
    ``self.ham`` / ``self.spam`` are lists of buckets ordered newest first:
    index 0 is the group currently being filled, the last index is the
    oldest group still trained. Once ``expire_after`` buckets exist, each
    call to ``group_action`` untrains the oldest one and drops it.

    Fix over the previous version: the fresh bucket is inserted at index 0.
    ``list.insert(-1, x)`` inserts *before the last element*, so index 0
    never advanced past the second bucket; that bucket collected all later
    messages and only the very first group was ever untrained.
    """

    # Number of buckets kept (the current group plus its predecessors).
    # Must be >= 1. The default matches the original hard-coded 120.
    expire_after = 120

    def __init__(self):
        self.ham = [[]]
        self.spam = [[]]

    def group_action(self, which, test):
        """Expire the oldest group if the window is full; open a new bucket.

        ``test.untrain`` is called with ``(ham_list, spam_list)``.
        """
        oldest = self.expire_after - 1
        if len(self.ham) >= self.expire_after:
            test.untrain(self.ham[oldest], self.spam[oldest])
            # Rebind (rather than mutate) to keep only the newer buckets.
            self.ham = self.ham[:oldest]
            self.spam = self.spam[:oldest]
        # The new current bucket goes at the FRONT so that guess_action's
        # use of index 0 always addresses the group now being processed.
        self.ham.insert(0, [])
        self.spam.insert(0, [])

    def guess_action(self, which, test, guess, actual, msg):
        """Record ``msg`` in the current bucket and return ``actual``.

        ``actual < 0`` is treated as spam, anything else as ham.
        """
        if actual < 0:
            self.spam[0].append(msg)
        else:
            self.ham[0].append(msg)
        return actual
if __name__ == "__main__":
    # This module only holds regime definitions; running it directly just
    # shows the usage notes. The parenthesised form prints the same text on
    # Python 2 and avoids a SyntaxError on Python 3.
    print(__doc__)
# syntax highlighted by Code2HTML, v. 0.9.1