#! /usr/local/bin/python2.3

"""Stats.py - Spambayes statistics class.

Classes:
    Stats - provides statistical information about previous activity.

Abstract:

    Provide statistics on the activity that spambayes has done - for
    example the number of messages classified as each type, and the
    number of messages trained as each type.  This information is
    retrieved from the messageinfo database, so is as reliable as that
    is.

To Do:
    o People would like pretty graphs, so maybe that could be done.
    o People have requested time-based statistics - mail per hour,
      spam per hour, and so on.
    o The possible stats to show are pretty much endless.  Some to
      consider would be: percentage of mail that is fp/fn/unsure,
      percentage of mail correctly classified.
    o Suggestions?
"""

# This module is part of the spambayes project, which is Copyright 2002-3
# The Python Software Foundation and is covered by the Python Software
# Foundation license.

__author__ = "Tony Meyer "
__credits__ = "Mark Hammond, all the spambayes folk."
from spambayes.message import msginfoDB


class Stats(object):
    """Tally classification and training activity recorded in msginfoDB.

    Counters (all reset by Reset() and filled in by CalculateStats()):

        total            - number of keys found in msginfoDB.db
        cls_spam         - records whose classification ``c`` is 's'
        cls_ham          - records whose classification ``c`` is 'h'
        cls_unsure       - records whose classification ``c`` is 'u'
        trn_spam         - records whose trained flag ``t`` equals 1
        trn_ham          - records whose trained flag ``t`` equals 0
        trn_unsure_spam  - classified 'u' and trained with ``t`` == 1
        trn_unsure_ham   - classified 'u' and trained with ``t`` == 0
        fp               - classified 's' but trained with ``t`` == 0
        fn               - classified 'h' but trained with ``t`` == 1
    """

    class __empty_msg:
        """Bare record holder handed to msginfoDB._getState().

        Only ``id`` is set here; ``c`` and ``t`` are expected to be
        filled in by msginfoDB._getState() (TODO confirm against
        spambayes.message).
        """

        def getId(self):
            """Return the database key this placeholder stands for."""
            return self.id

    def __init__(self):
        """Compute the statistics immediately from the current database."""
        self.CalculateStats()

    def Reset(self):
        """Set every counter back to zero."""
        self.cls_spam = 0
        self.cls_ham = 0
        self.cls_unsure = 0
        self.trn_spam = 0
        self.trn_ham = 0
        self.trn_unsure_ham = 0
        self.trn_unsure_spam = 0
        self.fp = 0
        self.fn = 0
        self.total = 0

    def CalculateStats(self):
        """Recount every counter by walking all keys in msginfoDB.db."""
        self.Reset()
        for msg in msginfoDB.db.keys():
            self.total += 1
            m = self.__empty_msg()
            m.id = msg
            msginfoDB._getState(m)

            # Classification tallies, plus the cases where later training
            # contradicted (or resolved) the classification.
            if m.c == 's':
                self.cls_spam += 1
                if m.t == 0:
                    self.fp += 1
            elif m.c == 'h':
                self.cls_ham += 1
                if m.t == 1:
                    self.fn += 1
            elif m.c == 'u':
                self.cls_unsure += 1
                if m.t == 0:
                    self.trn_unsure_ham += 1
                elif m.t == 1:
                    self.trn_unsure_spam += 1

            # Training tallies are independent of the classification.
            # Any other value of t (presumably None, i.e. never trained)
            # is counted in neither bucket.
            if m.t == 1:
                self.trn_spam += 1
            elif m.t == 0:
                self.trn_ham += 1

    def GetStats(self):
        """Return the statistics as a list of human-readable sentences.

        Returns a single-element list when no messages have been seen,
        otherwise four sentences: the overall classification split, the
        messages trained as good, the messages trained as spam, and the
        unsure messages that were later trained.
        """
        if self.total == 0:
            return ["SpamBayes has processed zero messages"]
        chunks = []
        push = chunks.append
        perc_ham = 100.0 * self.cls_ham / self.total
        perc_spam = 100.0 * self.cls_spam / self.total
        perc_unsure = 100.0 * self.cls_unsure / self.total
        format_dict = {
            'perc_spam': perc_spam,
            'perc_ham': perc_ham,
            'perc_unsure': perc_unsure,
            'num_seen': self.total
            }
        format_dict.update(self.__dict__)

        # Plural suffixes for the nouns.
        for num, key in [(self.total, "sp1"),
                         (self.trn_ham, "sp2"),
                         (self.trn_spam, "sp3"),
                         (self.trn_unsure_ham, "sp4"),
                         (self.fp, "sp5"),
                         (self.fn, "sp6")]:
            if num == 1:
                format_dict[key] = ''
            else:
                format_dict[key] = 's'

        # Verb agreement for the false positive/negative clauses.
        for num, key in [(self.fp, "wp1"),
                         (self.fn, "wp2")]:
            if num == 1:
                format_dict[key] = 'was a'
            else:
                format_dict[key] = 'were'

        # Verb agreement for the trained counts, so that a count of one
        # reads "1 message was ..." rather than "1 message were ...".
        for num, key in [(self.trn_ham, "wp3"),
                         (self.trn_spam, "wp4"),
                         (self.trn_unsure_ham, "wp5")]:
            if num == 1:
                format_dict[key] = 'was'
            else:
                format_dict[key] = 'were'

        # All three percentages use %.0f so they are rounded the same way;
        # %d would truncate the float instead of rounding it.
        push("SpamBayes has processed %(num_seen)d message%(sp1)s - "
             "%(cls_ham)d (%(perc_ham).0f%%) good, "
             "%(cls_spam)d (%(perc_spam).0f%%) spam "
             "and %(cls_unsure)d (%(perc_unsure).0f%%) unsure." % format_dict)
        push("%(trn_ham)d message%(sp2)s %(wp3)s manually "
             "classified as good (%(fp)d %(wp1)s false positive%(sp5)s)."
             % format_dict)
        push("%(trn_spam)d message%(sp3)s %(wp4)s manually "
             "classified as spam (%(fn)d %(wp2)s false negative%(sp6)s)."
             % format_dict)
        push("%(trn_unsure_ham)d unsure message%(sp4)s %(wp5)s manually "
             "identified as good, and %(trn_unsure_spam)d as spam."
             % format_dict)
        return chunks


if __name__ == '__main__':
    s = Stats()
    # A parenthesised single argument prints identically under the
    # Python 2 print statement and the Python 3 print function.
    print("\n".join(s.GetStats()))