#!/usr/local/bin/perl
# train.pl
# This tool trains a corpus of messages (a directory containing a nonspam and
# a spam subdirectory, each with messages in individual files) to a specific
# user and reports on errors.
#
# You can use four different training paradigms; the three stock training modes
# included with dspam (teft, toe, tum) or tune (train until no errors) by
# setting toe mode training and rerunning this script until little or no errors
# are generated.
use strict;
use vars qw { $USER $PATH $REPORTING_WINDOW $CORPUS $TRAINING_MODE };
$REPORTING_WINDOW = 250; # How often to summarize
$PATH = "/usr/local/bin"; # Path to dspam binaries
$TRAINING_MODE = "teft"; # Training mode
### DO NOT CONFIGURE BELOW THIS LINE ###
$USER = shift;
$CORPUS = shift;
if ($CORPUS eq "") {
die "Usage: $0 [username] [corpus]";
}
&Train("$CORPUS/nonspam", "$CORPUS/spam");
sub Train {
my($nonspam, $spam) = @_;
my(@nonspam_corpus, @spam_corpus);
my($ic, $sc, $fp, $sm, $num);
print "Training $nonspam / $spam corpora...\n";
@nonspam_corpus = GetFiles($nonspam);
@spam_corpus = GetFiles($spam);
$ic = $sc = $fp = $sm = $num = 0;
while($#nonspam_corpus && $#spam_corpus) {
my($nonspam_msg) = shift(@nonspam_corpus);
my($spam_msg) = shift(@spam_corpus);
my($cmd, $response);
# Process one nonspam
$cmd = "$PATH/dspam --user $USER --mode=$TRAINING_MODE --deliver=stdout < $nonspam/$nonspam_msg";
$response = `$cmd`;
if ($response =~ /X-DSPAM-Result: (Innocent|Whitelisted)/i) {
$ic++;
} else {
$fp++;
print "FP: $nonspam_msg\n";
open(TRAIN, "|$PATH/dspam --user $USER --mode=$TRAINING_MODE --class=innocent --source=error");
print TRAIN $response;
close(TRAIN);
}
# Process one spam
$cmd = "$PATH/dspam --user $USER --mode=$TRAINING_MODE --deliver=stdout < $spam/$spam_msg";
$response = `$cmd`;
if ($response =~ /X-DSPAM-Result: Spam/i) {
$sc++;
} else {
print "SM: $spam_msg\n";
$sm++;
open(TRAIN, "|$PATH/dspam --user $USER --mode=$TRAINING_MODE --class=spam --source=error");
print TRAIN $response;
close(TRAIN);
}
$num+=2;
if ($num % $REPORTING_WINDOW == 0) {
print "Spam Correct : $sc\n";
print "Spam Missed : $sm\n";
print "Nonspam Correct: $ic\n";
print "Nonspam Missed : $fp\n";
print "--------------------\n";
$sc = $sm = $ic = $fp = $num = 0;
}
}
print "Spam Correct : $sc\n";
print "Spam Missed : $sm\n";
print "Nonspam Correct: $ic\n";
print "Nonspam Missed : $fp\n\n";
system("$PATH/dspam_stats -S $USER");
}
sub GetFiles {
my($corpus) = @_;
my(@files);
opendir(DIR, "./$corpus") || die $!;
@files = grep(!/^\./, readdir(DIR));
closedir(DIR);
return @files;
}
syntax highlighted by Code2HTML, v. 0.9.1