from __future__ import generators import os import random from spambayes.tokenizer import tokenize HAMTEST = None SPAMTEST = None HAMTRAIN = None SPAMTRAIN = None SEED = random.randrange(2000000000) class Msg(object): __slots__ = 'tag', 'guts' def __init__(self, dir, name): path = dir + "/" + name self.tag = path f = open(path, 'rb') self.guts = f.read() f.close() def __iter__(self): return tokenize(self.guts) # Compare msgs by their paths; this is appropriate for sets of msgs. def __hash__(self): return hash(self.tag) def __eq__(self, other): return self.tag == other.tag def __str__(self): return self.guts # The iterator yields a stream of Msg objects, taken from a list of # directories. class MsgStream(object): __slots__ = 'tag', 'directories', 'keep' def __init__(self, tag, directories, keep=None): self.tag = tag self.directories = directories self.keep = keep def __str__(self): return self.tag def produce(self): if self.keep is None: for directory in self.directories: for fname in os.listdir(directory): yield Msg(directory, fname) return # We only want part of the msgs. Shuffle each directory list, but # in such a way that we'll get the same result each time this is # called on the same directory list. for directory in self.directories: all = os.listdir(directory) random.seed(hash(max(all)) ^ SEED) # reproducible across calls random.shuffle(all) del all[self.keep:] all.sort() # seems to speed access on Win98! for fname in all: yield Msg(directory, fname) def __iter__(self): return self.produce() class HamStream(MsgStream): def __init__(self, tag, directories, train=0): if train: MsgStream.__init__(self, tag, directories, HAMTRAIN) else: MsgStream.__init__(self, tag, directories, HAMTEST) class SpamStream(MsgStream): def __init__(self, tag, directories, train=0): if train: MsgStream.__init__(self, tag, directories, SPAMTRAIN) else: MsgStream.__init__(self, tag, directories, SPAMTEST) def setparms(hamtrain, spamtrain, hamtest=None, spamtest=None, seed=None): """Set HAMTEST/TRAIN and SPAMTEST/TRAIN. If seed is not None, also set SEED. If (ham|spam)test are not set, set to the same as the (ham|spam)train numbers (backwards compat option). """ global HAMTEST, SPAMTEST, HAMTRAIN, SPAMTRAIN, SEED HAMTRAIN, SPAMTRAIN = hamtrain, spamtrain if hamtest is None: HAMTEST = HAMTRAIN else: HAMTEST = hamtest if spamtest is None: SPAMTEST = SPAMTRAIN else: SPAMTEST = spamtest if seed is not None: SEED = seed