# FuzzyOcr plugin, version 2.3b
# Changelog:
# version 2.0
# Replaced imagemagick with netpbm
# Invoke giffix to fix broken gifs before conversion
# Support png images
# Analyze the file to detect the format without content-type
# Added several configuration parameters
# version 2.1
# Added scoring for wrong content-type
# Added scoring for broken gif images
# Added configuration for helper applications
# Added autodisable_score feature to disable the OCR engine if the message has already enough points
# version 2.1b
# Rule bugfix to avoid warnings
# version 2.1c
# Applied patch provided by Howard Kash to fix problems with spamassassin + Mailscanner + FuzzyOcr
# Removed '-' from jpegtopnm arguments to provide backwards compatibility for older netpbm versions
# Fixed typo (treshold -> threshold)
# version 2.2
# Small bugfix in content-type check for jpeg (jpg was not matching), thanks to Matthias Keller
# Added more error handling
# Removed debug files, added logfile instead
# More messages with verbose = 2
# version 2.3
# Multiple scans with different pnm preprocessing and gocr arguments possible
# Support for interlaced gifs
# Support for animated gifs
# Temporary file handling reorganized
# External wordlist support
# Personalized wordlist support
# Spaces are now stripped from wordlist words and OCR results before matching
# Experimental MD5 Database feature
# version 2.3b
# MD5 Database replaced by different feature database
# Corrupted images are now handled better
# Added a timeout function to avoid lockups
# Added threshold overriding on word basis in wordlist
# Various bugfixes
#
#
# written by Christian Holler decoder_at_own-hero_dot_net
package Mail::SpamAssassin::Plugin::FuzzyOcr;
use strict;
use warnings;
use Mail::SpamAssassin;
use Mail::SpamAssassin::Util;
use Mail::SpamAssassin::Plugin;
use String::Approx 'adistr';
use FileHandle;
use Fcntl ':flock';
our @ISA = qw (Mail::SpamAssassin::Plugin);
our @err_msges = (
"Failed to open pipe to external programs with pipe command \"%s\".
Please check that all helper programs are installed and in the correct path.
(Pipe Command \"%s\", Pipe exit code %d (\"%s\"), Temporary file: \"%s\")",
"Unexpected error in pipe to external programs.
Please check that all helper programs are installed and in the correct path.
(Pipe Command \"%s\", Pipe exit code %d (\"%s\"), Temporary file: \"%s\")",
"Cannot open \"%s\" to read previously produced data!
(Previously used pipe: \"%s\", error code %d (\"%s\"), Temporary file: \"%s\")",
"Unexpected error while trying executing gocr with arguments \"%s\".
Make sure the gocr location is specified correctly and the arguments are correct.",
"Failed to open global wordlist \"%s\" for reading.
Please check that path and permissions are correct."
);
our @words = ();
our $self;
our $pms;
# Default values
our $threshold = "0.3";
our $base_score = "4";
our $add_score = "1";
our $wctypescore = "1.5";
our $cimgscore = "2.5";
our $cimgscore2 = "5";
our $countreq = 2;
our $verbose = 1;
our $timeout = 10;
our $pre314 = 0;
our $enable_image_hashing = 0;
our $hashing_learn_scanned = 1;
our ($ts, $th, $tw, $tcn, $tc, $hash_ccnt) = (0.01, 0.01, 0.01, 0.01, 5, 5);
our $giffix = "/usr/local/bin/giffix";
our $giftext = "/usr/local/bin/giftext";
our $gifasm = "/usr/local/bin/gifasm";
our $gifinter = "/usr/local/bin/gifinter";
our $giftopnm = "/usr/local/bin/giftopnm";
our $jpegtopnm = "/usr/local/bin/jpegtopnm";
our $pngtopnm = "/usr/local/bin/pngtopnm";
our $pnmfile = "/usr/local/bin/pnmfile";
our $ppmhist = "/usr/local/bin/ppmhist";
our $convert = "/usr/local/bin/convert";
our $identify = "/usr/local/bin/identify";
our $gocr = "/usr/local/bin/gocr";
our $grep = "/usr/bin/grep";
our $max_images = 5;
our $dscore = 10;
our $logfile = "/usr/local/etc/mail/spamassassin/FuzzyOcr.log";
our $pwordlist = ".spamassassin/fuzzyocr.words";
our $digest_db = "/usr/local/etc/mail/spamassassin/FuzzyOcr.hashdb";
our @scansets = (
'$gocr -i -',
'$gocr -l 180 -d 2 -i -'
);
# constructor: register the eval rule
sub new {
my ( $class, $mailsa ) = @_;
$class = ref($class) || $class;
my $self = $class->SUPER::new($mailsa);
bless( $self, $class );
$self->register_eval_rule("fuzzyocr_check");
$self->register_eval_rule("dummy_check");
return $self;
}
sub parse_config {
my ( $self, $opts ) = @_;
if ( $opts->{key} eq "focr_global_wordlist" ) {
load_global_words( $opts->{value} );
}
elsif ( $opts->{key} eq "focr_personal_wordlist" ) {
$pwordlist = $opts->{value};
}
elsif ( $opts->{key} eq "focr_threshold" ) {
$threshold = $opts->{value};
}
elsif ( $opts->{key} eq "focr_base_score" ) {
$base_score = $opts->{value};
}
elsif ( $opts->{key} eq "focr_add_score" ) {
$add_score = $opts->{value};
}
elsif ( $opts->{key} eq "focr_corrupt_score" ) {
$cimgscore = $opts->{value};
}
elsif ( $opts->{key} eq "focr_corrupt_unfixable_score" ) {
$cimgscore2 = $opts->{value};
}
elsif ( $opts->{key} eq "focr_wrongctype_score" ) {
$wctypescore = $opts->{value};
}
elsif ( $opts->{key} eq "focr_counts_required" ) {
$countreq = $opts->{value};
}
elsif ( $opts->{key} eq "focr_verbose" ) {
$verbose = $opts->{value};
}
elsif ( $opts->{key} eq "focr_timeout" ) {
$timeout = $opts->{value};
}
elsif ( $opts->{key} eq "focr_scansets" ) {
parse_scansets( $opts->{value} );
}
elsif ( $opts->{key} eq "focr_pre314" ) {
$pre314 = $opts->{value};
}
elsif ( $opts->{key} eq "focr_bin_giffix" ) {
$giffix = $opts->{value};
}
elsif ( $opts->{key} eq "focr_bin_giftext" ) {
$giftext = $opts->{value};
}
elsif ( $opts->{key} eq "focr_bin_gifasm" ) {
$gifasm = $opts->{value};
}
elsif ( $opts->{key} eq "focr_bin_gifinter" ) {
$gifinter = $opts->{value};
}
elsif ( $opts->{key} eq "focr_bin_giftopnm" ) {
$giftopnm = $opts->{value};
}
elsif ( $opts->{key} eq "focr_bin_jpegtopnm" ) {
$jpegtopnm = $opts->{value};
}
elsif ( $opts->{key} eq "focr_bin_pngtopnm" ) {
$pngtopnm = $opts->{value};
}
elsif ( $opts->{key} eq "focr_bin_ppmhist" ) {
$ppmhist = $opts->{value};
}
elsif ( $opts->{key} eq "focr_bin_convert" ) {
$convert = $opts->{value};
}
elsif ( $opts->{key} eq "focr_bin_identify" ) {
$identify = $opts->{value};
}
elsif ( $opts->{key} eq "focr_bin_gocr" ) {
$gocr = $opts->{value};
}
elsif ( $opts->{key} eq "focr_bin_grep" ) {
$grep = $opts->{value};
}
elsif ( $opts->{key} eq "focr_gif_max_frames" ) {
$max_images = $opts->{value};
}
elsif ( $opts->{key} eq "focr_autodisable_score" ) {
$dscore = $opts->{value};
}
elsif ( $opts->{key} eq "focr_enable_image_hashing" ) {
$enable_image_hashing = $opts->{value};
}
elsif ( $opts->{key} eq "focr_digest_db" ) {
$digest_db = $opts->{value};
}
elsif ( $opts->{key} eq "focr_hashing_learn_scanned" ) {
$hashing_learn_scanned = $opts->{value};
}
elsif ( $opts->{key} eq "focr_logfile" ) {
$logfile = $opts->{value};
}
}
sub dummy_check {
return 0;
}
sub fuzzyocr_check {
( $self, $pms ) = @_;
my $t = Mail::SpamAssassin::Timeout->new({ secs => $timeout });
$t->run(\&check_fuzzy_ocr);
if ($t->timed_out()) {
logfile("FuzzyOcr received timeout after running \"$timeout\" seconds.");
}
return 0;
}
sub load_global_words {
unless ( -r $_[0] ) {
handle_error( $err_msges[3], ( $_[0] ) );
return;
}
open WORDLIST, "<$_[0]";
while(<WORDLIST>) {
chomp($_);
if (( $_ =~ /^[ \t]*#.*$/ ) or ( $_ =~ /^[^a-zA-Z]$/ )) {
next;
}
$_ =~ s/[ \t]*#.*$//;
push( @words, $_ );
}
close WORDLIST;
return 1;
}
sub load_personal_words {
unless ( -e $_[0] ) {
debuglog("No personal wordlist found, skipping...");
return;
}
unless ( -r $_[0] ) {
debuglog(
"Unable to read from wordlist \"$_[0]\", please make sure that permissions are correct."
);
return;
}
open WORDLIST, "<$_[0]";
while(<WORDLIST>) {
chomp($_);
if ( $_ =~ /^[ \t]*#.*$/ ) {
next;
}
$_ =~ s/[ \t]*#.*$//;
push( @words, $_ );
}
close WORDLIST;
}
sub parse_scansets {
$_[0] =~ s/,[ ]*/,/g;
@scansets = split( ',', $_[0]);
debuglog( "Set scansets to values:\n" . join( "\n", @scansets ) );
return 1;
}
sub max {
unless ( defined( $_[0] ) and defined( $_[1] ) ) { return 0 }
unless ( defined( $_[0] ) ) { return $_[1] }
unless ( defined( $_[1] ) ) { return $_[0] }
if ( $_[0] < $_[1] ) { return $_[1] }
else { return $_[0] }
}
sub reorder {
my $tmp = join( '', @_ );
return split( '\n', $tmp );
}
sub pipe_io {
$SIG{PIPE} = 'IGNORE';
my $pipecmd = shift;
my $input = shift;
my $filecount = 0;
my $silent = 0;
my $ignerror = 0;
my $tmpdir;
my @stdout = ();
my @stderr = ();
my ( $tmpfile, $tfilepath ) = Mail::SpamAssassin::Util::secure_tmpfile();
my ( $errfile, $efilepath ) = Mail::SpamAssassin::Util::secure_tmpfile();
close($tmpfile);
close($errfile);
if ($tmpfile eq $errfile) {
debuglog("Got same tmpfile twice! Aborting pipe_io() to avoid deadlocking");
return ( 1, \@stdout, \@stderr );
unlink($tmpfile);
}
if($pipecmd =~ /\$tmpdir/) {
$tmpdir = Mail::SpamAssassin::Util::secure_tmpdir();
$pipecmd =~ s/\$tmpdir/$tmpdir/g;
$filecount = shift;
} else {
$silent = shift;
$ignerror = shift;
}
$pipecmd =~ s/\$errfile/$errfile/g;
my $pipe_pid = open( PIPE_IN, "| $pipecmd 1>$tmpfile 2>>$errfile" );
unless ($pipe_pid) {
unless($silent) {
handle_error( $err_msges[0], ( $pipecmd, $? >> 8, $!, $tmpfile ) );
}
unlink($tmpfile);
unlink($errfile);
return ( $?, \@stdout, \@stderr );
}
flock( PIPE_IN, LOCK_EX );
print PIPE_IN $input;
flock( PIPE_IN, LOCK_UN );
close(PIPE_IN);
if ($? and not $ignerror) {
unless($silent) {
handle_error( $err_msges[1], ( $pipecmd, $? >> 8, $!, $tmpfile ) );
}
unlink($tmpfile);
unlink($errfile);
return ( $?, \@stdout, \@stderr );
}
if ($filecount) {
my $tsize = 0;
my $tcount = 0;
foreach my $nr (0..$filecount-1) {
my $filesize = 0;
if ($nr < 10) {
$filesize = -s "$tmpdir/out0$nr.gif";
} else {
$filesize = -s "$tmpdir/out$nr.gif";
}
if ($filesize > $tsize) {
$tsize = $filesize;
$tcount = $nr;
}
}
if ($tcount < 10) {
open( PIPE_OUT, "< $tmpdir/out0$tcount.gif" );
} else {
open( PIPE_OUT, "< $tmpdir/out$tcount.gif" );
}
flock( PIPE_OUT, LOCK_EX );
@stdout = <PIPE_OUT>;
flock( PIPE_OUT, LOCK_UN );
close PIPE_OUT;
foreach my $nr (0..$filecount) {
if ($nr < 10) {
unlink("$tmpdir/out0$nr.gif");
} else {
unlink("$tmpdir/out$nr.gif");
}
}
rmdir($tmpdir);
unlink($tmpfile);
unlink($errfile);
return ( 0, \@stdout, \@stderr );
} else {
unless (open( PIPE_OUT, "< $tmpfile" )
and open( PIPE_ERR, "< $errfile" ) )
{
unless($silent) {
handle_error( $err_msges[1], ( $pipecmd, $? >> 8, $!, $tmpfile ) );
}
unlink($tmpfile);
unlink($errfile);
return ( $?, \@stdout, \@stderr );
}
flock( PIPE_OUT, LOCK_EX );
flock( PIPE_ERR, LOCK_EX );
@stdout = <PIPE_OUT>;
@stderr = <PIPE_ERR>;
flock( PIPE_OUT, LOCK_UN );
flock( PIPE_ERR, LOCK_UN );
close(PIPE_OUT);
close(PIPE_ERR);
unlink($tmpfile);
unlink($errfile);
return ( 0, \@stdout, \@stderr );
}
}
sub handle_error {
my ( $err_msg, @var_vals ) = @_;
$err_msg = sprintf( $err_msg, @var_vals );
logfile($err_msg);
}
sub logfile {
my $logtext = $_[0];
my ( $sec, $min, $hour, $mday, $mon, $year, $wday, $yday, $isdst ) = localtime(time);
my $time = sprintf(
"%4d-%02d-%02d %02d:%02d:%02d",
$year + 1900,
$mon + 1, $mday, $hour, $min, $sec
);
$logtext =~ s/\n/\n /g;
unless ( open LOGFILE, ">> $logfile" ) {
warn "Can't open $logfile for writing, check permissions";
}
flock( LOGFILE, LOCK_EX );
seek( LOGFILE, 0, 2 );
print LOGFILE "[$time] $logtext\n";
flock( LOGFILE, LOCK_UN );
close LOGFILE;
}
sub check_image_hash_db {
my $digest = $_[0];
my ($gpf, @gcf) = split('::', $digest);
my ($gs, $gh, $gw, $gcn) = split(':', $gpf);
unless(open(DB, "<$digest_db")) {
debuglog("No Image Hash database found at \"$digest_db\", or permissions wrong.");
return 0;
}
while(<DB>) {
chomp($_);
my ($score, $dpf, @dcf) = split('::', $_);
my ($ds, $dh, $dw, $dcn) = split(':', $dpf);
if ((abs($ds - $gs) / $gs) > $ts) { next; }
if ((abs($dh - $gh) / $gh) > $th) { next; }
if ((abs($dw - $gw) / $gw) > $tw) { next; }
if ((abs($dcn - $gcn) / $gcn) > $tcn) { next; }
my (@dcfs, @gcfs);
foreach (@dcf) {
push(@dcfs, split(':', $_));
}
foreach (@gcf) {
push(@gcfs, split(':', $_));
}
unless (scalar(@gcfs) eq scalar(@dcfs)) {
logfile("Error in database format, aborting...");
return 0;
}
foreach (0..scalar(@gcfs) - 1) {
if (abs($dcfs[$_] - $gcfs[$_]) > $tc) {
next;
}
}
return $score;
}
return 0;
}
sub add_image_hash_db {
my $digest = shift;
my $score = shift;
my $ret;
if (-e $digest_db) {
$ret = open(DB, ">> $digest_db");
} else {
$ret = open(DB, "> $digest_db");
debuglog("Image Hash Database not found to add hash, creating it...");
}
unless ($ret) {
logfile("Unable to open/create Image Hash database at \"$digest_db\", check permissions.");
return;
}
debuglog("Adding hash \"$digest\" to Image Hash database...");
flock( DB, LOCK_EX );
seek( DB, 0, 2 );
print DB "${score}::${digest}\n";
flock( DB, LOCK_UN );
close(DB);
}
sub calc_image_hash {
my ($rcode, $stdout_ref, $stderr_ref);
my $picdata = $_[0];
my ($hash, $h, $w);
my @ca = ( );
my $s = length ( $picdata );
( $rcode, $stdout_ref, $stderr_ref ) = pipe_io("$identify -", $picdata);
foreach (@$stdout_ref) {
if ($_ =~ /([0-9]+)x([0-9]+)/) {
$h = $1;
$w = $2;
last;
}
}
if ($rcode) {
debuglog("Unable to calculate image hash, skipping...");
return ($rcode, $hash);
}
( $rcode, $stdout_ref, $stderr_ref ) = pipe_io("$ppmhist -noheader", $picdata);
if ($rcode) {
debuglog("Unable to calculate image hash, skipping...");
return ($rcode, $hash);
}
my $cnt = 0;
my $c = scalar(@$stdout_ref);
if ($hash_ccnt) {
foreach (@$stdout_ref) {
$_ =~ s/ +/ /g;
my($r, $g, $b, $l, $c) = split(' ', $_);
push(@ca, "::$r:$g:$b:$l:$c");
$cnt++;
if ($cnt ge $hash_ccnt) {
last;
}
}
}
$hash = "$s:$h:$w:$c" . join('', @ca);
return(0, $hash);
}
sub debuglog {
if ( $verbose > 1 ) {
logfile("Debug mode: $_[0]");
}
}
sub wrong_ctype {
my ( $format, $ctype ) = @_;
if ($wctypescore) {
my $debuginfo = "";
if ( $verbose > 0 ) {
$debuginfo =
("Image has format \"$format\" but content-type is \"$ctype\"");
debuglog($debuginfo);
}
for my $set ( 0 .. 3 ) {
$pms->{conf}->{scoreset}->[$set]->{"FUZZY_OCR_WRONG_CTYPE"} =
sprintf( "%0.3f", $wctypescore );
}
$pms->_handle_hit( "FUZZY_OCR_WRONG_CTYPE", $wctypescore, "BODY: ",
$pms->{conf}->{descriptions}->{FUZZY_OCR_WRONG_CTYPE}
. "\n$debuginfo" );
}
}
sub corrupt_img {
my ($unfixable, $err) = @_;
my $score = 0;
if ($unfixable) {
$score = $cimgscore2;
} else {
$score = $cimgscore;
}
if ($score) {
my $debuginfo = "";
if ( $verbose > 0 ) {
chomp($err);
$debuginfo = ("Corrupt image: $err");
debuglog($debuginfo);
}
for my $set ( 0 .. 3 ) {
$pms->{conf}->{scoreset}->[$set]->{"FUZZY_OCR_CORRUPT_IMG"} =
sprintf( "%0.3f", $score );
}
$pms->_handle_hit( "FUZZY_OCR_CORRUPT_IMG", $score, "BODY: ",
$pms->{conf}->{descriptions}->{FUZZY_OCR_CORRUPT_IMG}
. "\n$debuginfo" );
}
}
sub known_img_hash {
my $digest = shift;
my $score = shift;
my $debuginfo = "";
if ( $verbose > 0 ) {
$debuginfo = ("Hash \"$digest\" is in the database.");
debuglog($debuginfo);
}
for my $set ( 0 .. 3 ) {
$pms->{conf}->{scoreset}->[$set]->{"FUZZY_OCR_KNOWN_HASH"} =
sprintf( "%0.3f", $score );
}
$pms->_handle_hit( "FUZZY_OCR_KNOWN_HASH", $score, "BODY: ", $pms->{conf}->{descriptions}->{FUZZY_OCR_KNOWN_HASH} . "\n$debuginfo" );
}
sub check_fuzzy_ocr {
my @found = ();
my $image_type = 0;
my $picture_data;
my @hashes = ();
my $cnt = 0;
my $homedir = (getpwuid($<))[7];
debuglog("Starting FuzzyOcr...");
debuglog("Attempting to load personal wordlist...");
if ($homedir) {
load_personal_words( $homedir . "/$pwordlist" );
} elsif (defined($ENV{HOME})) {
load_personal_words( $ENV{HOME} . "/$pwordlist" );
} else {
debuglog("Variable \$ENV{HOME} not defined and getpwuid failed, personal wordlist function not available...");
}
foreach my $p ( $pms->{msg}->find_parts(qr/^image\b/i) ) {
my $cscore = $pms->get_score();
if ( $cscore > $dscore ) {
debuglog(
"Scan canceled, message has already more than $dscore points.");
return 0;
}
my $ctype = $p->{'type'};
if ( $ctype =~ /image/i ) {
debuglog("Analyzing file with content-type \"$ctype\"");
$picture_data = $p->decode();
my @used_scansets = ();
my $stdout_ref;
my $stderr_ref;
my $rcode = 0;
my $corrupt = 0;
my $digest;
if ( substr($picture_data,0,3) eq "\x47\x49\x46" ) {
if ( $ctype !~ /gif/i ) {
wrong_ctype( "GIF", $ctype );
}
$image_type = 1;
my $interlaced_gif = 0;
my $image_count = 0;
( $rcode, $stdout_ref, $stderr_ref ) =
pipe_io( $giftext, $picture_data, 1, 1);
foreach (@$stdout_ref) {
unless ($interlaced_gif) {
if ( $_ =~ /Image is Interlaced/i ) {
$interlaced_gif = 1;
}
elsif ( $_ =~ /Image is Non Interlaced/i ) {
}
}
if ( $_ =~ /^Image #/ ) {
$image_count++;
}
}
if ($interlaced_gif or ($image_count gt 1)) {
debuglog("Image is interlaced or animated...");
}
else {
debuglog("Image is single non-interlaced...");
( $rcode, $stdout_ref, $stderr_ref ) = pipe_io( "$giffix", $picture_data, 0, 1);
$picture_data = join('', @$stdout_ref);
}
foreach (@$stderr_ref) {
if ( $_ =~ /GIF-LIB error/i ) {
$corrupt = $_;
last;
}
}
if ($corrupt and ($interlaced_gif or ($image_count gt 1))) {
debuglog("Skipping corrupted interlaced image...");
corrupt_img(1, $corrupt);
next;
} elsif ($corrupt) {
unless($picture_data) {
debuglog("Uncorrectable corruption detected, skipping non-interlaced image...");
corrupt_img(1, $corrupt);
next;
}
debuglog("Image is corrupt, but seems fixable, continuing...");
corrupt_img(0, $corrupt);
}
if ($image_count gt 1) {
debuglog("File contains more than one image...");
if ($image_count lt $max_images) {
debuglog("Assembling images...");
( $rcode, $stdout_ref, $stderr_ref ) = pipe_io("$convert - +append -", $picture_data);
if ($rcode) { next; };
$picture_data = join('', @$stdout_ref);
} elsif ($pre314 eq 0) {
debuglog("Image count exceeds limit, skipping some...");
( $rcode, $stdout_ref, $stderr_ref ) = pipe_io("$gifasm -d \$tmpdir/out", $picture_data, $image_count);
if ($rcode) { next; };
$picture_data = join('', @$stdout_ref);
} else {
debuglog("Image count exceeds limit, but your version does not allow the required functions, skipping image...");
next;
}
}
if ($interlaced_gif) {
( $rcode, $stdout_ref, $stderr_ref ) =
pipe_io(
"$gifinter -s | $giftopnm -", $picture_data );
if ($rcode) { next; }
}
else {
( $rcode, $stdout_ref, $stderr_ref ) =
pipe_io( "$giftopnm -", $picture_data );
if ($rcode) { next; }
}
}
elsif ( substr($picture_data,0,2) eq "\xff\xd8" ) {
if ( $ctype !~ /jpe{0,1}g/i ) {
wrong_ctype( "JPEG", $ctype );
}
$image_type = 2;
( $rcode, $stdout_ref, $stderr_ref ) =
pipe_io( "$jpegtopnm", $picture_data );
if ($rcode) { next; }
}
elsif ( substr($picture_data,0,4) eq "\x89\x50\x4e\x47" ) {
if ( $ctype !~ /png/i ) {
wrong_ctype( "PNG", $ctype );
}
$image_type = 3;
( $rcode, $stdout_ref, $stderr_ref ) =
pipe_io( "$pngtopnm -", $picture_data );
if ($rcode) { next; }
}
else {
$image_type = 0;
debuglog(
"Image type not recognized, unknown format. Skipping this image..."
);
next;
}
debuglog("Recognized file type: $image_type");
my @pnmdata = @$stdout_ref;
if($enable_image_hashing) {
debuglog("Calculating the image hash...");
($rcode, $digest) = calc_image_hash(join('', @pnmdata));
if ($rcode) {
debuglog("Error calculating the image hash, skipping hash check...");
} else {
if (my $score = check_image_hash_db($digest)) {
debuglog("Image found in hash database, message is spam...");
debuglog("Scoring with known old score and ending...");
known_img_hash($digest, $score);
return 0;
}
}
debuglog("Hash not yet known to the database, saving for later db storage...");
push(@hashes, $digest);
} else {
debuglog("Image hashing disabled in configuration, skipping...");
}
my @ocr_results = ();
foreach my $scanset (@scansets) {
$scanset =~ s/\$gocr/$gocr/;
( $rcode, $stdout_ref, $stderr_ref ) =
pipe_io( "$scanset", join( '', @pnmdata ), 1);
if ($rcode) {
debuglog(join( '', @$stderr_ref ));
debuglog(
"Skipping scanset \"$scanset\" because of errors, trying next..."
);
next;
}
my @ocrdata = @$stdout_ref;
push( @ocr_results, [@ocrdata] );
push( @used_scansets, $scanset );
}
foreach my $w (@words) {
my $wthreshold;
if ($w =~ /^(.*?)::(0(\.\d+){0,1})/) {
($w, $wthreshold) = ($1, $2);
} else {
$wthreshold = $threshold;
}
$w =~ s/[^a-zA-Z]//g;
$w = lc $w;
my $wcnt = 0;
my $gcnt = 0;
foreach my $ocr_set (@ocr_results) {
my $cwcnt = 0;
foreach (@$ocr_set) {
tr/!;|081/iiioal/;
s/[^a-zA-Z]//g;
$_ = lc;
my $matched = adistr( $w, $_ );
if ( abs($matched) < $wthreshold ) {
$cwcnt++;
debuglog(
"Found word \"$w\" in line\n \"$_\" \n with fuzz of "
. abs($matched)
. " scanned with scanset $used_scansets[$gcnt]"
);
}
}
$wcnt = max( $wcnt, $cwcnt );
$gcnt++;
}
$cnt += $wcnt;
if ( ( $verbose > 0 ) and ($wcnt) ) {
push( @found, "\"$w\" in $wcnt lines" );
}
}
}
}
if ( $cnt >= $countreq ) {
my $score = ( $base_score + ( $cnt - $countreq ) * $add_score );
if($enable_image_hashing and $hashing_learn_scanned) {
debuglog("Message is spam (score $score), storing all image hashes in database...");
foreach (@hashes) {
add_image_hash_db($_, $score);
}
} else {
debuglog("Message is spam (score $score)...");
}
my $debuginfo = "";
if ( $verbose > 0 ) {
$debuginfo =
( "Words found:\n"
. join( "\n", @found )
. "\n($cnt word occurrences found)" );
debuglog($debuginfo);
}
for my $set ( 0 .. 3 ) {
$pms->{conf}->{scoreset}->[$set]->{"FUZZY_OCR"} =
sprintf( "%0.3f", $score );
}
$pms->_handle_hit( "FUZZY_OCR", $score, "BODY: ",
$pms->{conf}->{descriptions}->{FUZZY_OCR} . "\n$debuginfo" );
}
debuglog("FuzzyOcr ending successfully...");
return 0;
}
1;
syntax highlighted by Code2HTML, v. 0.9.1