# <@LICENSE>
# Copyright 2006 Apache Software Foundation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# </@LICENSE>
=head1 NAME
Mail::SpamAssassin::Plugin::BodyRuleBaseExtractor - extract "bases" from body ruleset
=head1 SYNOPSIS
This is a plugin to extract "base" strings from SpamAssassin 'body' rules,
suitable for use in Rule2XSBody rules or other parallel matching algorithms.
=cut
package Mail::SpamAssassin::Plugin::BodyRuleBaseExtractor;
use Mail::SpamAssassin::Plugin;
use Mail::SpamAssassin::Logger;
use Mail::SpamAssassin::Util::Progress;
use strict;
use warnings;
use bytes;
use vars qw(@ISA);
@ISA = qw(Mail::SpamAssassin::Plugin);
use constant DEBUG_RE_PARSING => 0; # noisy!
# a few settings that control what kind of bases are output.
# treat all rules as lowercase for purposes of term extraction?
# $main->{bases_must_be_casei} = 1;
# $main->{bases_can_use_alternations} = 0; # /(foo|bar|baz)/
# $main->{bases_can_use_quantifiers} = 0; # /foo.*bar/ or /foo*bar/ or /foooo?bar/
# $main->{bases_can_use_char_classes} = 0; # /fo[opqr]bar/
# $main->{bases_split_out_alternations} = 1; # /(foo|bar|baz)/ => ["foo", "bar", "baz"]
# TODO: it would be nice to have a clean API to pass such settings
# through to plugins instead of hanging them off $main
###########################################################################
sub new {
my $class = shift;
my $mailsaobject = shift;
$class = ref($class) || $class;
my $self = $class->SUPER::new($mailsaobject);
bless ($self, $class);
$self->{show_progress} = 1; # default
# $self->test(); exit;
return $self;
}
###########################################################################
sub finish_parsing_end {
my ($self, $params) = @_;
my $conf = $params->{conf};
$self->extract_bases($conf);
}
sub extract_bases {
my ($self, $conf) = @_;
my $main = $conf->{main};
if (!$main->{base_extract}) { return; }
info("base extraction starting. this can take a while...");
$self->extract_set($conf, $conf->{body_tests}, 'body');
}
sub extract_set {
my ($self, $conf, $test_set, $ruletype) = @_;
foreach my $pri (keys %{$test_set}) {
my $nicepri = $pri; $nicepri =~ s/-/neg/g;
$self->extract_set_pri($conf, $test_set->{$pri}, $ruletype.'_'.$nicepri);
}
}
###########################################################################
sub extract_set_pri {
my ($self, $conf, $rules, $ruletype) = @_;
my @good_bases = ();
my @failed = ();
my $yes = 0;
my $no = 0;
my $start = time;
$self->{main} = $conf->{main}; # for use in extract_hints()
info ("extracting from rules of type $ruletype");
# attempt to find good "base strings" (simplified regexp subsets) for each
# regexp. We try looking at the regexp from both ends, since there
# may be a good long string of text at the end of the rule.
# require this many chars in a base string for it to be viable
my $min_chars = 3;
my $count = 0;
my $progress;
$self->{show_progress} and $progress = Mail::SpamAssassin::Util::Progress->new({
total => scalar keys %{$rules},
itemtype => 'rules',
});
foreach my $name (keys %{$rules}) {
my $rule = $rules->{$name};
$self->{show_progress} and $progress->update(++$count);
# ignore ReplaceTags rules
# TODO: need cleaner way to do this
next if ($conf->{rules_to_replace}->{$name});
my ($qr, $mods) = $self->simplify_and_qr_regexp($rule);
my @bases;
eval { # catch die()s
@bases = $self->extract_hints($rule, $qr, $mods);
};
$@ and dbg("giving up on regexp: $@");
# if any of the extracted hints in a set are too short, the entire
# set is invalid; this is because each set of N hints represents just
# 1 regexp.
my $minlen;
foreach my $str (@bases) {
my $len = length $str;
if ($len < $min_chars) { $minlen = undef; @bases = (); last; }
elsif (!defined($minlen) || $len < $minlen) { $minlen = $len; }
}
if ($minlen && @bases) {
# dbg("zoom: YES <base>$base</base> <origrule>$rule</origrule>");
# figure out if we have e.g. ["foo", "foob", "foobar"]; in this
# case, we only need to track ["foo"].
my %subsumed = ();
foreach my $base1 (@bases) {
foreach my $base2 (@bases) {
if ($base1 ne $base2 && $base1 =~ /\Q$base2\E/) {
$subsumed{$base1} = 1; # base2 is inside base1; discard the longer
}
}
}
foreach my $base (@bases) {
next if $subsumed{$base};
push @good_bases, { base => $base, orig => $rule, name => $name };
}
$yes++;
}
else {
dbg("zoom: NO $rule");
push @failed, { orig => $rule };
$no++;
}
}
$self->{show_progress} and $progress->final();
dbg ("$ruletype: found ".(scalar @good_bases).
" usable base strings in ".
"$yes rules, skipped $no rules");
# NOTE: re2c will attempt to provide the longest pattern that matched; e.g.
# ("food" =~ "foo" / "food") will return "food". So therefore if a pattern
# subsumes other patterns, we need to return hits for all of them. We also
# need to take care of the case where multiple regexps wind up sharing the
# same base.
#
# Another gotcha, an exception to the subsumption rule; if one pattern isn't
# entirely subsumed (e.g. "food" =~ "foo" / "ood"), then they will be
# returned as two hits, correctly. So we only have to be smart about the
# full-subsumption case; overlapping is taken care of for us, by re2c.
#
# TODO: there's a bug here. Since the code in extract_hints() has been
# modified to support more complex regexps, we can no longer simply assume
# that if pattern A is not contained in pattern B, that means that pattern B
# doesn't subsume it. Consider, for example, A="foo*bar" and
# B="morefobarry"; A is indeed subsumed by B, but we won't be able to test
# that without running the A RE match itself somehow against B.
# same issue remains with:
#
# "foo?bar" / "fobar"
# "fo(?:o|oo|)bar" / "fobar"
# "fo(?:o|oo)?bar" / "fobar"
# "fo(?:o*|baz)bar" / "fobar"
# "(?:fo(?:o*|baz)bar|blargh)" / "fobar"
#
# it's worse with this:
#
# "fo(?:o|oo|)bar" / "foo*bar"
#
# basically, this is impossible to compute without reimplementing most of
# re2c, and it appears the re2c developers don't plan to offer this:
# https://sourceforge.net/tracker/index.php?func=detail&aid=1540845&group_id=96864&atid=616203
$conf->{base_orig}->{$ruletype} = { };
$conf->{base_string}->{$ruletype} = { };
$count = 0;
$self->{show_progress} and $progress = Mail::SpamAssassin::Util::Progress->new({
total => scalar @good_bases,
itemtype => 'bases',
});
foreach my $set1 (@good_bases) {
$self->{show_progress} and $progress->update(++$count);
my $base1 = $set1->{base};
my $orig1 = $set1->{orig};
my $name1 = $set1->{name};
next if ($base1 eq '' or $name1 eq '');
$conf->{base_orig}->{$ruletype}->{$name1} = $orig1;
foreach my $set2 (@good_bases) {
next if ($set1 == $set2);
my $base2 = $set2->{base};
my $name2 = $set2->{name};
# clobber exact dups; this can happen if a regexp outputs the
# same base string multiple times
if ($orig1 eq $set2->{orig} &&
$base1 eq $base2 &&
$name1 eq $name2)
{
$set2->{name} = ''; # clobber
$set2->{base} = '';
}
# skip if either already contains the other rule's name
next if ($name1 =~ /\b\Q$name2\E\b/);
next if ($name2 =~ /\b\Q$name1\E\b/);
next if ($base2 eq '');
next if (length $base1 < length $base2);
next if ($base1 !~ /\Q$base2\E/);
$set1->{name} .= " ".$name2;
# base2 is just a subset of base1
# dbg("zoom: subsuming '$base2' into '$base1': $set1->{name}");
}
}
# we can still have duplicate cases; __FRAUD_PTS and __SARE_FRAUD_BADTHINGS
# both contain "killed" for example, pointing at different rules, which
# the above search hasn't found. Collapse them here with a hash
my %bases = ();
foreach my $set (@good_bases) {
my $base = $set->{base};
next unless $base;
if (defined $bases{$base}) {
$bases{$base} .= " ".$set->{name};
} else {
$bases{$base} = $set->{name};
}
}
foreach my $base (keys %bases) {
# uniq the list, since there are probably dup rules listed
my @list = split (' ', $bases{$base});
my @uniqed;
{
my %u=(); @uniqed = grep {defined} map {
if (exists $u{$_}) { undef; } else { $u{$_}=undef;$_; }
} @list; undef %u;
}
my $key = join ' ', sort @uniqed;
$conf->{base_string}->{$ruletype}->{$base} = $key;
}
$self->{show_progress} and $progress->final();
my $elapsed = time - $start;
info ("$ruletype: ".
(scalar keys %{$conf->{base_string}->{$ruletype}}).
" base strings extracted in $elapsed seconds\n");
}
###########################################################################
# TODO:
# NO /no.{1,10}P(?:er|re)scription.{1,10}(?:needed|require|necessary)/i
# => should extract 'scription' somehow
# /time to refinance|refinanc\w{1,3}\b.{0,16}\bnow\b/i
# => should understand alternations; tricky
sub simplify_and_qr_regexp {
my $self = shift;
my $rule = shift;
my $main = $self->{main};
$rule = Mail::SpamAssassin::Util::regexp_remove_delimiters($rule);
# remove the regexp modifiers, keep for later
my $mods = '';
while ($rule =~ s/^\(\?([a-z]*)\)//) { $mods .= $1; }
# modifier removal
while ($rule =~ s/^\(\?-([a-z]*)\)//) {
foreach my $modchar (split '', $mods) {
$mods =~ s/$modchar//g;
}
}
# now: simplify aspects of the regexp. Bear in mind that we can
# simplify as long as we cause the regexp to become more general;
# more hits is OK, since false positives will be discarded afterwards
# anyway. Simplification that causes the regexp to *not* hit
# stuff that the "real" rule would hit, however, is a bad thing.
if ($main->{bases_must_be_casei}) {
$rule = lc $rule;
$mods =~ s/i//;
# always case-i: /A(?i:ct) N(?i:ow)/ => /Act Now/
$rule =~ s/(?<!\\)\(\?i\:(.*?)\)/$1/gs;
# always case-i: /A(?-i:ct)/ => /Act/
$rule =~ s/(?<!\\)\(\?-i\:(.*?)\)/$1/gs;
# remove (?i)
$rule =~ s/\(\?i\)//gs;
}
else {
die "case-i" if $rule =~ /\(\?i\)/;
die "case-i" if $mods =~ /i/;
}
# remove /m and /s modifiers
$mods =~ s/m//;
$mods =~ s/s//;
# remove (^|\b)'s
# T_KAM_STOCKTIP23 /(EXTREME INNOVATIONS|(^|\b)EXTI($|\b))/is
$rule =~ s/\(\^\|\\b\)//gs;
$rule =~ s/\(\$\|\\b\)//gs;
$rule =~ s/\(\\b\|\^\)//gs;
$rule =~ s/\(\\b\|\$\)//gs;
# remove (?!credit)
$rule =~ s/\(\?\![^\)]+\)//gs;
# remove \b's
$rule =~ s/(?<!\\)\\b//gs;
# remove the "?=" trick
# (?=[dehklnswxy])(horny|nasty|hot|wild|young|....etc...)
$rule =~ s/\(\?\=\[[^\]]+\]\)//gs;
($rule, $mods);
}
sub extract_hints {
my $self = shift;
my $rawrule = shift;
my $rule = shift;
my $mods = shift;
my $main = $self->{main};
my $orig = $rule;
# if there are anchors, give up; we can't get much
# faster than these anyway
die "anchors" if $rule =~ /^\(?(?:\^|\\A)/;
# die "anchors" if $rule =~ /(?:\$|\\Z)\)?$/;
# just remove end-of-string anchors; they're slow so could gain
# from our speedup
$rule =~ s/(?<!\\)(?:\$|\\Z)\)?$//;
# simplify (?:..) to (..)
$main->{bases_allow_noncapture_groups} or
$rule =~ s/\(\?:/\(/g;
# simplify some grouping arrangements so they're easier for us to parse
# (foo)? => (foo|)
$rule =~ s/\((.*?)\)\?/\($1\|\)/gs;
# r? => (r|)
$rule =~ s/(?<!\\)(\w)\?/\($1\|\)/gs;
my ($tmpf, $tmpfh) = Mail::SpamAssassin::Util::secure_tmpfile();
# attempt to find a safe regexp delimiter...
# TODO: would prob be easier to just read this from $rawrule
my $quos = "/"; if ($rule =~ m/\Q${quos}\E/) {
$quos = "#"; if ($rule =~ m/\Q${quos}\E/) {
$quos = "'"; if ($rule =~ m/\Q${quos}\E/) {
$quos = "@"; if ($rule =~ m/\Q${quos}\E/) {
$quos = "*"; if ($rule =~ m/\Q${quos}\E/) {
$quos = "!";
}
}
}
}
}
print $tmpfh "m".$quos.$rule.$quos.$mods;
close $tmpfh or die "cannot write to $tmpf";
my $perl = $self->get_perl();
open (IN, "$perl -c -Mre=debug $tmpf 2>&1 |") or die "cannot run $perl";
my $fullstr = join('', <IN>);
close IN;
unlink $tmpf;
# now parse the -Mre=debug output.
# perl 5.10 format
$fullstr =~ s/^.*\nFinal program:\n//gs;
# perl 5.6/5.8 format
$fullstr =~ s/^(?:.*\n|)size \d[^\n]*\n//gs;
$fullstr =~ s/^(?:.*\n|)first at \d[^\n]*\n//gs;
# common to all
$fullstr =~ s/\nOffsets:.*$//gs;
# clean up every other line that doesn't start with a space
$fullstr =~ s/^\S.*$//gm;
if ($fullstr !~ /((?:\s[^\n]+\n)+)/m) {
die "failed to parse Mre=debug output: $fullstr m".$quos.$rule.$quos.$mods." $rawrule";
}
my $opsstr = $1;
# what's left looks like this:
# 1: EXACTF <v>(3)
# 3: ANYOF[1ILil](14)
# 14: EXACTF <a>(16)
# 16: CURLY {2,7}(29)
# 18: ANYOF[A-Za-z](0)
# 29: SPACE(30)
# 30: EXACTF <http://>(33)
# 33: END(0)
#
DEBUG_RE_PARSING and warn "Mre=debug output: $opsstr";
my @ops = ();
foreach my $op (split(/\n/s, $opsstr)) {
next unless $op;
if ($op =~ /^\s+\d+: (\s*)([A-Z]\w+)\b(.*)(?:\(\d+\))?$/) {
push @ops, [ $1, $2, $3 ];
}
elsif ($op =~ /^ (\s*)<(.*)>\.\.\.\s*$/) {
# 5: TRIE-EXACT[im](44)
# <message contained attachments that have been blocked by guin>...
my $spcs = $1;
# we could use the entire length here, but it's easier to trim to
# the length of a perl 5.8.x/5.6.x EXACT* string; that way our test
# suite results will match, since the sa-update --list extraction will
# be the same for all versions. (The "..." trailer is important btw)
my $str = substr ($2, 0, 55);
push @ops, [ $spcs, '_moretrie', "<$str...>" ];
}
elsif ($op =~ /^ (\s*)(<.*>)\s*(?:\(\d+\))?$/) {
# 5: TRIE-EXACT[am](21)
# <am> (21)
# <might> (12)
push @ops, [ $1, '_moretrie', $2 ];
}
elsif ($op =~ /^ at .+ line \d+$/) {
next; # ' at /local/perl561/lib/5.6.1/i86pc-solaris/re.pm line 109':
}
else {
warn "cannot parse '$op': $opsstr";
next;
}
}
# unroll the branches; returns a list of versions.
# e.g. /foo(bar|baz)argh/ => [ "foobarargh", "foobazargh" ]
my @unrolled;
if ($main->{bases_split_out_alternations}) {
@unrolled = $self->unroll_branches(0, \@ops);
} else {
@unrolled = ( \@ops );
}
# now find the longest DFA-friendly string in each unrolled version
my @longests = ();
foreach my $opsarray (@unrolled) {
my $longestexact = '';
my $buf = '';
# use a closure to keep the code succinct
my $add_candidate = sub {
if (length $buf > length $longestexact) { $longestexact = $buf; }
$buf = '';
};
my $prevop;
foreach my $op (@{$opsarray}) {
my ($spcs, $item, $args) = @{$op};
next if ($item eq 'NOTHING');
# EXACT == case-sensitive
# EXACTF == case-i
# we can do both, since we canonicalize to lc.
if (!$spcs && $item =~ /^EXACT/ && $args =~ /<(.*)>/)
{
$buf .= $1;
if (length $1 >= 55 && $buf =~ s/\.\.\.$//) {
# perl 5.8.x truncates with a "..." here! cut and stop
$add_candidate->();
}
}
# _moretrie == a TRIE-EXACT entry
elsif (!$spcs && $item =~ /^_moretrie/ && $args =~ /<(.*)>/)
{
$buf .= $1;
if (length $1 >= 55 && $buf =~ s/\.\.\.$//) {
# perl 5.8.x truncates with a "..." here! cut and stop
$add_candidate->();
}
}
# /(?:foo|bar|baz){2}/ results in a CURLYX beforehand
elsif ($item =~ /^EXACT/ &&
$prevop && !$prevop->[0] && $prevop->[1] =~ /^CURLYX/ &&
$prevop->[2] =~ /\{(\d+),/ && $1 >= 1 &&
$args =~ /<(.*)>/)
{
$buf .= $1;
if (length $1 >= 55 && $buf =~ s/\.\.\.$//) {
# perl 5.8.x truncates with a "..." here! cut and stop
$add_candidate->();
}
}
# CURLYX, for perl >= 5.9.5
elsif ($item =~ /^_moretrie/ &&
$prevop && !$prevop->[0] && $prevop->[1] =~ /^CURLYX/ &&
$prevop->[2] =~ /\{(\d+),/ && $1 >= 1 &&
$args =~ /<(.*)>/)
{
$buf .= $1;
if (length $1 >= 60 && $buf =~ s/\.\.\.$//) {
# perl 5.8.x truncates with a "..." here! cut and stop
$add_candidate->();
}
}
else {
# not an /^EXACT/; clear the buffer
$add_candidate->();
}
$prevop = $op;
}
$add_candidate->();
if (!$longestexact) {
die "no long-enough string found in $rawrule";
# all unrolled versions must have a long string, otherwise
# we cannot reliably match all variants of the rule
} else {
push @longests, lc $longestexact;
}
}
DEBUG_RE_PARSING and warn "longest base strings: /".join("/", @longests)."/";
return @longests;
}
###########################################################################
sub unroll_branches {
my ($self, $depth, $opslist) = @_;
die "too deep" if ($depth++ > 5);
my @ops = (@{$opslist}); # copy
my @pre_branch_ops = ();
my $branch_spcs;
my $trie_spcs;
my $open_spcs;
# our input looks something like this 2-level structure:
# 1: BOUND(2)
# 2: EXACT <Dear >(5)
# 5: BRANCH(9)
# 6: EXACT <IT>(8)
# 8: NALNUM(24)
# 9: BRANCH(23)
# 10: EXACT <Int>(12)
# 12: BRANCH(14)
# 13: NOTHING(21)
# 14: BRANCH(17)
# 15: EXACT <a>(21)
# 17: BRANCH(20)
# 18: EXACT <er>(21)
# 20: TAIL(21)
# 21: EXACT <net>(24)
# 23: TAIL(24)
# 24: EXACT < shop>(27)
# 27: END(0)
#
# or:
#
# 1: OPEN1(3)
# 3: BRANCH(6)
# 4: EXACT <v>(9)
# 6: BRANCH(9)
# 7: EXACT <\\/>(9)
# 9: CLOSE1(11)
# 11: CURLY {2,5}(14)
# 13: REG_ANY(0)
# 14: EXACT < g r a >(17)
# 17: ANYOF[a-z](28)
# 28: END(0)
#
# or:
#
# 1: EXACT <i >(3)
# 3: OPEN1(5)
# 5: TRIE-EXACT[am](21)
# <am> (21)
# <might> (12)
# 12: OPEN2(14)
# 14: TRIE-EXACT[ ](19)
# < be>
# <>
# 19: CLOSE2(21)
# 21: CLOSE1(23)
# 23: EXACT < c>(25)
DEBUG_RE_PARSING and warn "starting parse";
# this happens for /foo|bar/ instead of /(?:foo|bar)/ ; transform
# it into the latter. bit of a kludge to do this before the loop, but hey.
# note that it doesn't fix the CLOSE1/END ordering to be correct
if (scalar @ops > 1 && $ops[0]->[1] =~ /^BRANCH/) {
my @newops = ([ "", "OPEN1", "" ]);
foreach my $op (@ops) {
push @newops, [ " ".$op->[0], $op->[1], $op->[2] ];
}
push @newops, [ "", "CLOSE1", "" ];
@ops = @newops;
}
# iterate until we start a branch set. using
# /dkjfksl(foo|bar(baz|argh)boo)gab/ as an example, we're at "dkj..."
# just hitting an OPEN is not enough; wait until we see a TRIE-EXACT
# or a BRANCH, *then* unroll the most recent OPEN set.
while (1) {
my $op = shift @ops;
last unless defined $op;
my ($spcs, $item, $args) = @{$op};
DEBUG_RE_PARSING and warn "pre: [$spcs] $item $args";
if ($item =~ /^OPEN/) {
$open_spcs = $spcs;
next; # next will be a BRANCH or TRIE
} elsif ($item =~ /^TRIE/) {
$trie_spcs = $spcs;
last;
} elsif ($item =~ /^BRANCH/) {
$branch_spcs = $spcs;
last;
} elsif ($item =~ /^EXACT/ && defined $open_spcs) {
# perl 5.9.5 does this; f(o|oish) => OPEN, EXACT, TRIE-EXACT
push @pre_branch_ops, [ $open_spcs, $item, $args ];
next;
} elsif (defined $open_spcs) {
# OPEN not followed immediately by BRANCH, EXACT or TRIE-EXACT:
# ignore this OPEN block entirely and don't try to unroll it
undef $open_spcs;
} else {
push @pre_branch_ops, $op;
}
}
# no branches found? we're done unrolling on this one!
if (scalar @ops == 0) {
return [ @pre_branch_ops ];
}
# otherwise we're at the start of a new branch set
# /(foo|bar(baz|argh)boo)gab/
my @alts = ();
my @in_this_branch = ();
DEBUG_RE_PARSING and warn "entering branch: ".
"open='".(defined $open_spcs ? $open_spcs : 'undef')."' ".
"branch='".(defined $branch_spcs ? $branch_spcs : 'undef')."' ".
"trie='".(defined $trie_spcs ? $trie_spcs : 'undef')."'";
# indentation level to remove from "normal" ops (using a s///)
my $open_sub_spcs = ($branch_spcs ? $branch_spcs : "")." ";
my $trie_sub_spcs = "";
while (1) {
my $op = shift @ops;
last unless defined $op;
my ($spcs, $item, $args) = @{$op};
DEBUG_RE_PARSING and warn "in: [$spcs] $item $args";
if (defined $branch_spcs && $branch_spcs eq $spcs && $item =~ /^BRANCH/) { # alt
push @alts, [ @pre_branch_ops, @in_this_branch ];
@in_this_branch = ();
$open_sub_spcs = $branch_spcs." ";
$trie_sub_spcs = "";
next;
}
elsif (defined $branch_spcs && $branch_spcs eq $spcs && $item eq 'TAIL') { # end
push @alts, [ @pre_branch_ops, @in_this_branch ];
undef $branch_spcs;
$open_sub_spcs = "";
$trie_sub_spcs = "";
last;
}
elsif (defined $trie_spcs && $trie_spcs eq $spcs && $item eq '_moretrie') {
if (scalar @in_this_branch > 0) {
push @alts, [ @pre_branch_ops, @in_this_branch ];
}
# use $open_spcs instead of $trie_spcs (which is 2 spcs further indented)
@in_this_branch = ( [ $open_spcs, $item, $args ] );
$open_sub_spcs = ($branch_spcs ? $branch_spcs : "")." ";
$trie_sub_spcs = " ";
next;
}
elsif (defined $open_spcs && $open_spcs eq $spcs && $item =~ /^CLOSE/) { # end
push @alts, [ @pre_branch_ops, @in_this_branch ];
undef $branch_spcs;
undef $open_spcs;
undef $trie_spcs;
$open_sub_spcs = "";
$trie_sub_spcs = "";
last;
}
elsif ($item eq 'END') { # of string
push @alts, [ @pre_branch_ops, @in_this_branch ];
undef $branch_spcs;
undef $open_spcs;
undef $trie_spcs;
$open_sub_spcs = "";
$trie_sub_spcs = "";
last;
}
else {
if ($open_sub_spcs) {
# deindent the space-level to match the opening brace
$spcs =~ s/^$open_sub_spcs//;
# tries also add one more indent level in
$spcs =~ s/^$trie_sub_spcs//;
}
push @in_this_branch, [ $spcs, $item, $args ];
# note that we ignore ops at a deeper $spcs level entirely (until later!)
}
}
if (defined $branch_spcs) {
die "fell off end of string with a branch open: '$branch_spcs'";
}
# we're now after the branch set: /gab/
# @alts looks like [ /dkjfkslfoo/ , /dkjfkslbar(baz|argh)boo/ ]
foreach my $alt (@alts) {
push @{$alt}, @ops; # add all remaining ops to each one
# note that this could include more (?:...); we don't care, since
# those can be handled by recursing
}
# ok, parsed the entire ops list
# @alts looks like [ /dkjfkslfoogab/ , /dkjfkslbar(baz|argh)boogab/ ]
if (DEBUG_RE_PARSING) {
print "unrolled: "; foreach my $alt (@alts) { foreach my $o (@{$alt}) { print "{/$o->[0]/$o->[1]/$o->[2]} "; } print "\n"; }
}
# now recurse, to unroll the remaining branches (if any exist)
my @rets = ();
foreach my $alt (@alts) {
push @rets, $self->unroll_branches($depth, $alt);
}
if (DEBUG_RE_PARSING) {
print "unrolled post-recurse: "; foreach my $alt (@rets) { foreach my $o (@{$alt}) { print "{/$o->[0]/$o->[1]/$o->[2]} "; } print "\n"; }
}
return @rets;
}
###########################################################################
sub test {
my ($self) = @_;
$self->test_split_alt("foo", "/foo/");
$self->test_split_alt("(foo)", "/foo/");
$self->test_split_alt("foo(bar)baz", "/foobarbaz/");
$self->test_split_alt("x(foo|)", "/xfoo/ /x/");
$self->test_split_alt("fo(o|)", "/foo/ /fo/");
$self->test_split_alt("(foo|bar)", "/foo/ /bar/");
$self->test_split_alt("foo|bar", "/foo/ /bar/");
$self->test_split_alt("foo (bar|baz) argh", "/foo bar argh/ /foo baz argh/");
$self->test_split_alt("foo (bar|baz|bl(arg|at)) cough", "/foo bar cough/ /foo baz cough/ /foo blarg cough/ /foo blat cough/");
$self->test_split_alt("(s(otc|tco)k)", "/sotck/ /stcok/");
$self->test_split_alt("(business partner(s|ship|)|silent partner(s|ship|))", "/business partners/ /silent partners/ /business partnership/ /silent partnership/ /business partner/ /silent partner/");
}
sub test_split_alt {
my ($self, $in, $out) = @_;
my @got = $self->split_alt($in);
$out =~ s/^\///;
$out =~ s/\/$//;
my @want = split(/\/ \//, $out);
my $failed = 0;
if (scalar @want != scalar @got) {
warn "FAIL: results count don't match";
$failed++;
}
else {
my %got = map { $_ => 1 } @got;
foreach my $w (@want) {
if (!$got{$w}) {
warn "FAIL: '$w' not found";
$failed++;
}
}
}
if ($failed) {
print "want: /".join('/ /', @want)."/\n";
print "got: /".join('/ /', @got)."/\n";
return 0;
} else {
print "ok\n";
return 1;
}
}
###########################################################################
sub get_perl {
my ($self) = @_;
my $perl;
# allow user override of the perl interpreter to use when
# extracting base strings.
# TODO: expose this via sa-compile command-line option
my $fromconf = $self->{main}->{conf}->{re_parser_perl};
if ($fromconf) {
$perl = $fromconf;
} elsif ($^X =~ m|^/|) {
$perl = $^X;
} else {
use Config;
$perl = $Config{perlpath};
$perl =~ s|/[^/]*$|/$^X|;
}
$perl =~ /^(.*)$/;
return $1;
}
###########################################################################
1;
syntax highlighted by Code2HTML, v. 0.9.1