#! /usr/local/bin/python2.3
"""Module to tokenize email messages for spam filtering."""
from __future__ import generators
import email
import email.Message
import email.Header
import email.Utils
import email.Errors
import re
import math
import time
import os
import binascii
import urlparse
import urllib
try:
from sets import Set
except ImportError:
from compatsets import Set
from spambayes import classifier
from spambayes.Options import options
from spambayes.mboxutils import get_message
try:
True, False
except NameError:
# Maintain compatibility with Python 2.2
True, False = 1, 0
# Patch encodings.aliases to recognize 'ansi_x3_4_1968'
from encodings.aliases import aliases # The aliases dictionary
if not aliases.has_key('ansi_x3_4_1968'):
aliases['ansi_x3_4_1968'] = 'ascii'
del aliases # Not needed any more
##############################################################################
# To fold case or not to fold case? I didn't want to fold case, because
# it hides information in English, and I have no idea what .lower() does
# to other languages; and, indeed, 'FREE' (all caps) turned out to be one
# of the strongest spam indicators in my content-only tests (== one with
# prob 0.99 *and* made it into spamprob's nbest list very often).
#
# Against preserving case, it makes the database size larger, and requires
# more training data to get enough "representative" mixed-case examples.
#
# Running my c.l.py tests didn't support my intuition that case was
# valuable, so it's getting folded away now. Folding or not made no
# significant difference to the false positive rate, and folding made a
# small (but statistically significant all the same) reduction in the
# false negative rate. There is one obvious difference: after folding
# case, conference announcements no longer got high spam scores. Their
# content was usually fine, but they were highly penalized for VISIT OUR
# WEBSITE FOR MORE INFORMATION! kinds of repeated SCREAMING. That is
# indeed the language of advertising, and I halfway regret that folding
# away case no longer picks on them.
#
# Since the f-p rate didn't change, but conference announcements escaped
# that category, something else took their place. It seems to be highly
# off-topic messages, like debates about Microsoft's place in the world.
# Talk about "money" and "lucrative" is indistinguishable now from talk
# about "MONEY" and "LUCRATIVE", and spam mentions MONEY a lot.
##############################################################################
# Character n-grams or words?
#
# With careful multiple-corpora c.l.py tests sticking to case-folded decoded
# text-only portions, and ignoring headers, and with identical special
# parsing & tagging of embedded URLs:
#
# Character 3-grams gave 5x as many false positives as split-on-whitespace
# (s-o-w). The f-n rate was also significantly worse, but within a factor
# of 2. So character 3-grams lost across the board.
#
# Character 5-grams gave 32% more f-ps than split-on-whitespace, but the
# s-o-w fp rate across 20,000 presumed-hams was 0.1%, and this is the
# difference between 23 and 34 f-ps. There aren't enough there to say that's
# significantly more with killer-high confidence. There were plenty of f-ns,
# though, and the f-n rate with character 5-grams was substantially *worse*
# than with character 3-grams (which in turn was substantially worse than
# with s-o-w).
#
# Training on character 5-grams creates many more unique tokens than s-o-w:
# a typical run bloated to 150MB process size. It also ran a lot slower than
# s-o-w, partly related to heavy indexing of a huge out-of-cache wordinfo
# dict. I rarely noticed disk activity when running s-o-w, so rarely bothered
# to look at process size; it was under 30MB last time I looked.
#
# Figuring out *why* a msg scored as it did proved much more mysterious when
# working with character n-grams: they often had no obvious "meaning". In
# contrast, it was always easy to figure out what s-o-w was picking up on.
# 5-grams flagged a msg from Christian Tismer as spam, where he was discussing
# the speed of tasklets under his new implementation of stackless:
#
# prob = 0.99999998959
# prob('ed sw') = 0.01
# prob('http0:pgp') = 0.01
# prob('http0:python') = 0.01
# prob('hlon ') = 0.99
# prob('http0:wwwkeys') = 0.01
# prob('http0:starship') = 0.01
# prob('http0:stackless') = 0.01
# prob('n xp ') = 0.99
# prob('on xp') = 0.99
# prob('p 150') = 0.99
# prob('lon x') = 0.99
# prob(' amd ') = 0.99
# prob(' xp 1') = 0.99
# prob(' athl') = 0.99
# prob('1500+') = 0.99
# prob('xp 15') = 0.99
#
# The spam decision was baffling until I realized that *all* the high-
# probability spam 5-grams there came out of a single phrase:
#
# AMD Athlon XP 1500+
#
# So Christian was punished for using a machine lots of spam tries to sell
# <wink>. In a classic Bayesian classifier, this probably wouldn't have
# mattered, but Graham's throws away almost all the 5-grams from a msg,
# saving only the about-a-dozen farthest from a neutral 0.5. So one bad
# phrase can kill you! This appears to happen very rarely, but happened
# more than once.
#
# The conclusion is that character n-grams have almost nothing to recommend
# them under Graham's scheme: harder to work with, slower, much larger
# database, worse results, and prone to rare mysterious disasters.
#
# There's one area they won hands-down: detecting spam in what I assume are
# Asian languages. In such messages the s-o-w scheme sometimes finds only
# line-ends to split on, and then a "hey, this 'word' is way too big! let's
# ignore it" gimmick kicks in and produces no tokens at all.
#
# [Later: under the s-o-w scheme we now produce character 5-grams for such a
# blob instead of ignoring it, but only if the blob contains high-bit characters;
# e.g., there's no point 5-gramming uuencoded lines, and doing so would
# bloat the database size.]
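#
# [A rough, illustrative sketch of that gimmick -- the helper name and the
#  "5gram:" token prefix below are hypothetical, not this module's actual
#  spelling of it:
#
#      def crack_high_bit_blob(word):
#          # Emit overlapping character 5-grams, but only for blobs that
#          # contain high-bit characters; plain-ASCII blobs (uuencoded
#          # lines and the like) stay summarized by a "skip" token.
#          if has_highbit_char(word):
#              for i in range(len(word) - 4):
#                  yield "5gram:" + word[i : i+5]
# ]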
#
# Interesting: despite that odd example above, the *kinds* of f-p mistakes
# 5-grams made were very much like s-o-w made -- I recognized almost all of
# the 5-gram f-p messages from previous s-o-w runs. For example, both
# schemes have a particular hatred for conference announcements, although
# s-o-w stopped hating them after folding case. But 5-grams still hate them.
# Both schemes also hate msgs discussing HTML with examples, with about equal
# passion. Both schemes hate brief "please subscribe [unsubscribe] me"
# msgs, although 5-grams seems to hate them more.
##############################################################################
# How to tokenize?
#
# I started with string.split() merely for speed. Over time I realized it
# was making interesting context distinctions qualitatively akin to n-gram
# schemes; e.g., "free!!" is a much stronger spam indicator than "free". But
# unlike n-grams (whether word- or character- based) under Graham's scoring
# scheme, this mild context dependence never seems to go over the edge in
# giving "too much" credence to an unlucky phrase.
#
# OTOH, compared to "searching for words", it increases the size of the
# database substantially, less than but close to a factor of 2. This is very
# much less than a word bigram scheme bloats it, but as always an increase
# isn't justified unless the results are better.
#
# Following are stats comparing
#
# for token in text.split(): # left column
#
# to
#
# for token in re.findall(r"[\w$\-\x80-\xff]+", text): # right column
#
# text is case-normalized (text.lower()) in both cases, and the runs were
# identical in all other respects. The results clearly favor the split()
# gimmick, although they vaguely suggest that some sort of compromise
# may do as well with less database burden; e.g., *perhaps* folding runs of
# "punctuation" characters into a canonical representative could do that.
# But the database size is reasonable without that, and plain split() avoids
# having to worry about how to "fold punctuation" in languages other than
# English.
#
# false positive percentages
# 0.000 0.000 tied
# 0.000 0.050 lost
# 0.050 0.150 lost
# 0.000 0.025 lost
# 0.025 0.050 lost
# 0.025 0.075 lost
# 0.050 0.150 lost
# 0.025 0.000 won
# 0.025 0.075 lost
# 0.000 0.025 lost
# 0.075 0.150 lost
# 0.050 0.050 tied
# 0.025 0.050 lost
# 0.000 0.025 lost
# 0.050 0.025 won
# 0.025 0.000 won
# 0.025 0.025 tied
# 0.000 0.025 lost
# 0.025 0.075 lost
# 0.050 0.175 lost
#
# won 3 times
# tied 3 times
# lost 14 times
#
# total unique fp went from 8 to 20
#
# false negative percentages
# 0.945 1.200 lost
# 0.836 1.018 lost
# 1.200 1.200 tied
# 1.418 1.636 lost
# 1.455 1.418 won
# 1.091 1.309 lost
# 1.091 1.272 lost
# 1.236 1.563 lost
# 1.564 1.855 lost
# 1.236 1.491 lost
# 1.563 1.599 lost
# 1.563 1.781 lost
# 1.236 1.709 lost
# 0.836 0.982 lost
# 0.873 1.382 lost
# 1.236 1.527 lost
# 1.273 1.418 lost
# 1.018 1.273 lost
# 1.091 1.091 tied
# 1.490 1.454 won
#
# won 2 times
# tied 2 times
# lost 16 times
#
# total unique fn went from 292 to 302
#
# Later: Here's another tokenization scheme with more promise.
#
# fold case, ignore punctuation, strip a trailing 's' from words (to
# stop Guido griping about "hotel" and "hotels" getting scored as
# distinct clues <wink>) and save both word bigrams and word unigrams
#
# This was the code:
#
# # Tokenize everything in the body.
# lastw = ''
# for w in word_re.findall(text):
# n = len(w)
# # Make sure this range matches in tokenize_word().
# if 3 <= n <= 12:
# if w[-1] == 's':
# w = w[:-1]
# yield w
# if lastw:
# yield lastw + w
# lastw = w + ' '
#
# elif n >= 3:
# lastw = ''
# for t in tokenize_word(w):
# yield t
#
# where
#
# word_re = re.compile(r"[\w$\-\x80-\xff]+")
#
# This at least doubled the process size. It helped the f-n rate
# significantly, but probably hurt the f-p rate (the f-p rate is too low
# with only 4000 hams per run to be confident about changes of such small
# *absolute* magnitude -- 0.025% is a single message in the f-p table):
#
# false positive percentages
# 0.000 0.000 tied
# 0.000 0.075 lost +(was 0)
# 0.050 0.125 lost +150.00%
# 0.025 0.000 won -100.00%
# 0.075 0.025 won -66.67%
# 0.000 0.050 lost +(was 0)
# 0.100 0.175 lost +75.00%
# 0.050 0.050 tied
# 0.025 0.050 lost +100.00%
# 0.025 0.000 won -100.00%
# 0.050 0.125 lost +150.00%
# 0.050 0.025 won -50.00%
# 0.050 0.050 tied
# 0.000 0.025 lost +(was 0)
# 0.000 0.025 lost +(was 0)
# 0.075 0.050 won -33.33%
# 0.025 0.050 lost +100.00%
# 0.000 0.000 tied
# 0.025 0.100 lost +300.00%
# 0.050 0.150 lost +200.00%
#
# won 5 times
# tied 4 times
# lost 11 times
#
# total unique fp went from 13 to 21
#
# false negative percentages
# 0.327 0.218 won -33.33%
# 0.400 0.218 won -45.50%
# 0.327 0.218 won -33.33%
# 0.691 0.691 tied
# 0.545 0.327 won -40.00%
# 0.291 0.218 won -25.09%
# 0.218 0.291 lost +33.49%
# 0.654 0.473 won -27.68%
# 0.364 0.327 won -10.16%
# 0.291 0.182 won -37.46%
# 0.327 0.254 won -22.32%
# 0.691 0.509 won -26.34%
# 0.582 0.473 won -18.73%
# 0.291 0.255 won -12.37%
# 0.364 0.218 won -40.11%
# 0.436 0.327 won -25.00%
# 0.436 0.473 lost +8.49%
# 0.218 0.218 tied
# 0.291 0.255 won -12.37%
# 0.254 0.364 lost +43.31%
#
# won 15 times
# tied 2 times
# lost 3 times
#
# total unique fn went from 106 to 94
##############################################################################
# What about HTML?
#
# Computer geeks seem to view use of HTML in mailing lists and newsgroups as
# a mortal sin. Normal people don't, but so it goes: in a technical list/
# group, every HTML decoration has spamprob 0.99, there are lots of unique
# HTML decorations, and lots of them appear at the very start of the message
# so that Graham's scoring scheme latches on to them tight. As a result,
# any plain text message just containing an HTML example is likely to be
# judged spam (every HTML decoration is an extreme).
#
# So if a message is multipart/alternative with both text/plain and text/html
# branches, we ignore the latter, else newbies would never get a message
# through. If a message is just HTML, it has virtually no chance of getting
# through.
#
# In an effort to let normal people use mailing lists too <wink>, and to
# alleviate the woes of messages merely *discussing* HTML practice, I
# added a gimmick to strip HTML tags after case-normalization and after
# special tagging of embedded URLs. This consisted of a regexp sub pattern,
# where instances got replaced by single blanks:
#
# html_re = re.compile(r"""
# <
# [^\s<>] # e.g., don't match 'a < b' or '<<<' or 'i << 5' or 'a<>b'
# [^>]{0,128} # search for the end '>', but don't chew up the world
# >
# """, re.VERBOSE)
#
# and then
#
# text = html_re.sub(' ', text)
#
# Alas, little good came of this:
#
# false positive percentages
# 0.000 0.000 tied
# 0.000 0.000 tied
# 0.050 0.075 lost
# 0.000 0.000 tied
# 0.025 0.025 tied
# 0.025 0.025 tied
# 0.050 0.050 tied
# 0.025 0.025 tied
# 0.025 0.025 tied
# 0.000 0.050 lost
# 0.075 0.100 lost
# 0.050 0.050 tied
# 0.025 0.025 tied
# 0.000 0.025 lost
# 0.050 0.050 tied
# 0.025 0.025 tied
# 0.025 0.025 tied
# 0.000 0.000 tied
# 0.025 0.050 lost
# 0.050 0.050 tied
#
# won 0 times
# tied 15 times
# lost 5 times
#
# total unique fp went from 8 to 12
#
# false negative percentages
# 0.945 1.164 lost
# 0.836 1.418 lost
# 1.200 1.272 lost
# 1.418 1.272 won
# 1.455 1.273 won
# 1.091 1.382 lost
# 1.091 1.309 lost
# 1.236 1.381 lost
# 1.564 1.745 lost
# 1.236 1.564 lost
# 1.563 1.781 lost
# 1.563 1.745 lost
# 1.236 1.455 lost
# 0.836 0.982 lost
# 0.873 1.309 lost
# 1.236 1.381 lost
# 1.273 1.273 tied
# 1.018 1.273 lost
# 1.091 1.200 lost
# 1.490 1.599 lost
#
# won 2 times
# tied 1 times
# lost 17 times
#
# total unique fn went from 292 to 327
#
# The messages merely discussing HTML were no longer fps, so it did what it
# intended there. But the f-n rate nearly doubled on at least one run -- so
# strong a set of spam indicators is the mere presence of HTML. The increase
# in the number of fps, even though the HTML-discussing msgs left that
# category, remains mysterious to me, but it wasn't a significant increase
# so I let it drop.
#
# Later: If I simply give up on making mailing lists friendly to my sisters
# (they're not nerds, and create wonderfully attractive HTML msgs), a
# compromise is to strip HTML tags from only text/plain msgs. That's
# principled enough so far as it goes, and eliminates the HTML-discussing
# false positives. It remains disturbing that the f-n rate on pure HTML
# msgs increases significantly when stripping tags, so the code here doesn't
# do that part. However, even after stripping tags, the rates above show that
# at least 98% of spams are still correctly identified as spam.
#
# So, if another way is found to slash the f-n rate, the decision here not
# to strip HTML from HTML-only msgs should be revisited.
#
# Later, after the f-n rate got slashed via other means:
#
# false positive percentages
# 0.000 0.000 tied
# 0.000 0.000 tied
# 0.050 0.075 lost +50.00%
# 0.025 0.025 tied
# 0.075 0.025 won -66.67%
# 0.000 0.000 tied
# 0.100 0.100 tied
# 0.050 0.075 lost +50.00%
# 0.025 0.025 tied
# 0.025 0.000 won -100.00%
# 0.050 0.075 lost +50.00%
# 0.050 0.050 tied
# 0.050 0.025 won -50.00%
# 0.000 0.000 tied
# 0.000 0.000 tied
# 0.075 0.075 tied
# 0.025 0.025 tied
# 0.000 0.000 tied
# 0.025 0.025 tied
# 0.050 0.050 tied
#
# won 3 times
# tied 14 times
# lost 3 times
#
# total unique fp went from 13 to 11
#
# false negative percentages
# 0.327 0.400 lost +22.32%
# 0.400 0.400 tied
# 0.327 0.473 lost +44.65%
# 0.691 0.654 won -5.35%
# 0.545 0.473 won -13.21%
# 0.291 0.364 lost +25.09%
# 0.218 0.291 lost +33.49%
# 0.654 0.654 tied
# 0.364 0.473 lost +29.95%
# 0.291 0.327 lost +12.37%
# 0.327 0.291 won -11.01%
# 0.691 0.654 won -5.35%
# 0.582 0.655 lost +12.54%
# 0.291 0.400 lost +37.46%
# 0.364 0.436 lost +19.78%
# 0.436 0.582 lost +33.49%
# 0.436 0.364 won -16.51%
# 0.218 0.291 lost +33.49%
# 0.291 0.400 lost +37.46%
# 0.254 0.327 lost +28.74%
#
# won 5 times
# tied 2 times
# lost 13 times
#
# total unique fn went from 106 to 122
#
# So HTML decorations are still a significant clue when the ham is composed
# of c.l.py traffic. Again, this should be revisited if the f-n rate is
# slashed again.
#
# Later: As the amount of training data increased, the effect of retaining
# HTML tags decreased to insignificance. options.retain_pure_html_tags
# was introduced to control this, and it defaulted to False. Later, as the
# algorithm improved, retain_pure_html_tags was removed.
#
# Later: The decision to ignore "redundant" HTML is also dubious, since
# the text/plain and text/html alternatives may have entirely different
# content. options.ignore_redundant_html was introduced to control this,
# and it defaults to False. Later: ignore_redundant_html was also removed.
##############################################################################
# How big should "a word" be?
#
# As I write this, words less than 3 chars are ignored completely, and words
# with more than 12 are special-cased, replaced with a summary "I skipped
# about so-and-so many chars starting with such-and-such a letter" token.
# This makes sense for English if most of the info is in "regular size"
# words.
#
# A test run boosting to 13 had no effect on f-p rate, and did a little
# better or worse than 12 across runs -- overall, no significant difference.
# The database size is smaller at 12, so there's nothing in favor of 13.
# A test at 11 showed a slight but consistent bad effect on the f-n rate
# (lost 12 times, won once, tied 7 times).
#
# A test with no lower bound showed a significant increase in the f-n rate.
# Curious, but not worth digging into. Boosting the lower bound to 4 is a
# worse idea: f-p and f-n rates both suffered significantly then. I didn't
# try testing with lower bound 2.
#
# Anthony Baxter found that boosting the option skip_max_word_size to 20
# from its default of 12 produced a quite dramatic decrease in the number
# of 'unsure' messages. However, this was coupled with a large increase
# in the FN rate, and it remains unclear whether simply shifting cutoffs
# would have given the same tradeoff (not enough data was posted to tell).
#
# On Tim's c.l.py test, 10-fold CV, ham_cutoff=0.20 and spam_cutoff=0.80:
#
# -> <stat> tested 2000 hams & 1400 spams against 18000 hams & 12600 spams
# [ditto]
#
# filename: max12 max20
# ham:spam: 20000:14000
# 20000:14000
# fp total: 2 2 the same
# fp %: 0.01 0.01
# fn total: 0 0 the same
# fn %: 0.00 0.00
# unsure t: 103 100 slight decrease
# unsure %: 0.30 0.29
# real cost: $40.60 $40.00 slight improvement with these cutoffs
# best cost: $27.00 $27.40 best possible got slightly worse
# h mean: 0.28 0.27
# h sdev: 2.99 2.92
# s mean: 99.94 99.93
# s sdev: 1.41 1.47
# mean diff: 99.66 99.66
# k: 22.65 22.70
#
# "Best possible" in max20 would have been to boost ham_cutoff to 0.50(!),
# and drop spam_cutoff a little to 0.78. This would have traded away most
# of the unsures in return for letting 3 spam through:
#
# -> smallest ham & spam cutoffs 0.5 & 0.78
# -> fp 2; fn 3; unsure ham 11; unsure spam 11
# -> fp rate 0.01%; fn rate 0.0214%; unsure rate 0.0647%
#
# Best possible in max12 was much the same:
#
# -> largest ham & spam cutoffs 0.5 & 0.78
# -> fp 2; fn 3; unsure ham 12; unsure spam 8
# -> fp rate 0.01%; fn rate 0.0214%; unsure rate 0.0588%
#
# The classifier pickle size increased by about 1.5 MB (~8.4% bigger).
#
# Rob Hooft's results were worse:
#
# -> <stat> tested 1600 hams & 580 spams against 14400 hams & 5220 spams
# [...]
# -> <stat> tested 1600 hams & 580 spams against 14400 hams & 5220 spams
# filename: skip12 skip20
# ham:spam: 16000:5800
# 16000:5800
# fp total: 12 13
# fp %: 0.07 0.08
# fn total: 7 7
# fn %: 0.12 0.12
# unsure t: 178 184
# unsure %: 0.82 0.84
# real cost: $162.60 $173.80
# best cost: $106.20 $109.60
# h mean: 0.51 0.52
# h sdev: 4.87 4.92
# s mean: 99.42 99.39
# s sdev: 5.22 5.34
# mean diff: 98.91 98.87
# k: 9.80 9.64
# textparts(msg) returns a set containing all the text components of msg.
# There's no point decoding binary blobs (like images). If a text/plain
# and text/html part happen to have redundant content, it doesn't matter
# to results, since training and scoring are done on the set of all
# words in the msg, without regard to how many times a given word appears.
def textparts(msg):
"""Return a set of all msg parts with content maintype 'text'."""
return Set(filter(lambda part: part.get_content_maintype() == 'text',
msg.walk()))
def octetparts(msg):
"""Return a set of all msg parts with type 'application/octet-stream'."""
return Set(filter(lambda part:
part.get_type() == 'application/octet-stream',
msg.walk()))
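# For illustration, a doctest-style sketch (not executed here) using a
# minimal made-up message:
#
#     >>> msg = email.message_from_string("Content-Type: text/plain\n\nHi!")
#     >>> [part.get_content_type() for part in textparts(msg)]
#     ['text/plain']
#     >>> list(octetparts(msg))
#     []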
has_highbit_char = re.compile(r"[\x80-\xff]").search
# Cheap-ass gimmick to probabilistically find HTML/XML tags.
# Note that <style and HTML comments are handled by crack_html_style()
# and crack_html_comment() instead -- they can be very long, and long
# minimal matches have a nasty habit of blowing the C stack.
html_re = re.compile(r"""
<
(?![\s<>]) # e.g., don't match 'a < b' or '<<<' or 'i<<5' or 'a<>b'
# guessing that other tags are usually "short"
[^>]{0,256} # search for the end '>', but don't run wild
>
""", re.VERBOSE | re.DOTALL)
# Trailing letter serves to reject "hostnames" which are really ip
# addresses. Some spammers forge their apparent ip addresses, so you get
# Received: headers which look like:
# Received: from 199.249.165.175 ([218.5.93.116])
# by manatee.mojam.com (8.12.1-20030917/8.12.1) with SMTP id
# hBIERsqI018090
# for <itinerary@musi-cal.com>; Thu, 18 Dec 2003 08:28:11 -0600
# "199.249.165.175" is who the spamhaus said it was. That's really the
# ip address of the receiving host (manatee.mojam.com), which correctly
# identified the sender's ip address as 218.5.93.116.
#
# Similarly, the more complex character set instead of just \S serves to
# reject Received: headers where the message bounces from one user to
# another on the local machine:
# Received: (from itin@localhost)
# by manatee.mojam.com (8.12.1-20030917/8.12.1/Submit) id hBIEQFxF018044
# for skip@manatee.mojam.com; Thu, 18 Dec 2003 08:26:15 -0600
received_host_re = re.compile(r'from ([a-z0-9._-]+[a-z])[)\s]')
# 99% of the time, the receiving host places the sender's ip address in
# square brackets as it should, but every once in a while it turns up in
# parens. Yahoo seems to be guilty of this minor infraction:
# Received: from unknown (66.218.66.218)
# by m19.grp.scd.yahoo.com with QMQP; 19 Dec 2003 04:06:53 -0000
received_ip_re = re.compile(r'[[(]((\d{1,3}\.?){4})[])]')
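# For illustration, a doctest-style sketch (not executed here). The first
# header is the forged example quoted above, already case-normalized the way
# tokenize_headers() normalizes Received lines; the second is made up:
#
#     >>> hdr = 'from 199.249.165.175 ([218.5.93.116]) by manatee.mojam.com'
#     >>> print received_host_re.search(hdr)   # all-digit "hostname" rejected
#     None
#     >>> received_ip_re.search(hdr).group(1)
#     '218.5.93.116'
#     >>> received_host_re.search('from mail.python.org ([1.2.3.4])').group(1)
#     'mail.python.org'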
message_id_re = re.compile(r'\s*<[^@]+@([^>]+)>\s*')
# I'm usually just splitting on whitespace, but for subject lines I want to
# break things like "Python/Perl comparison?" up. OTOH, I don't want to
# break up the unitized numbers in spammish subject phrases like "Increase
# size 79%" or "Now only $29.95!". Then again, I do want to break up
# "Python-Dev". Runs of punctuation are also interesting in subject lines.
subject_word_re = re.compile(r"[\w\x80-\xff$.%]+")
punctuation_run_re = re.compile(r'\W+')
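# For illustration, a doctest-style sketch (not executed here; the subject
# lines are made up):
#
#     >>> subject_word_re.findall("Python/Perl comparison?")
#     ['Python', 'Perl', 'comparison']
#     >>> subject_word_re.findall("Now only $29.95!")
#     ['Now', 'only', '$29.95']
#     >>> punctuation_run_re.findall("FREE!!!")
#     ['!!!']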
fname_sep_re = re.compile(r'[/\\:]')
def crack_filename(fname):
yield "fname:" + fname
components = fname_sep_re.split(fname)
morethan1 = len(components) > 1
for component in components:
if morethan1:
yield "fname comp:" + component
pieces = urlsep_re.split(component)
if len(pieces) > 1:
for piece in pieces:
yield "fname piece:" + piece
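# For illustration, a doctest-style sketch of crack_filename() on a made-up
# attachment name (not executed here; output wrapped for readability):
#
#     >>> list(crack_filename("images/free-money.gif"))
#     ['fname:images/free-money.gif', 'fname comp:images',
#      'fname comp:free-money.gif', 'fname piece:free-money',
#      'fname piece:gif']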
def tokenize_word(word, _len=len, maxword=options["Tokenizer",
"skip_max_word_size"]):
n = _len(word)
# Make sure this range matches in tokenize().
if 3 <= n <= maxword:
yield word
elif n >= 3:
# A long word.
# Don't want to skip embedded email addresses.
# An earlier scheme also split up the y in x@y on '.'. Not splitting
# improved the f-n rate; the f-p rate didn't care either way.
if n < 40 and '.' in word and word.count('@') == 1:
p1, p2 = word.split('@')
yield 'email name:' + p1
yield 'email addr:' + p2
else:
# There's value in generating a token indicating roughly how
# many chars were skipped. This has real benefit for the f-n
# rate, but is neutral for the f-p rate. I don't know why!
# XXX Figure out why, and/or see if some other way of summarizing
# XXX this info has greater benefit.
if options["Tokenizer", "generate_long_skips"]:
yield "skip:%c %d" % (word[0], n // 10 * 10)
if has_highbit_char(word):
hicount = 0
for i in map(ord, word):
if i >= 128:
hicount += 1
yield "8bit%%:%d" % round(hicount * 100.0 / len(word))
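# For illustration, a doctest-style sketch of tokenize_word() (not executed
# here; assumes the default skip_max_word_size of 12 and generate_long_skips
# left enabled):
#
#     >>> list(tokenize_word("money"))
#     ['money']
#     >>> list(tokenize_word("x" * 25))
#     ['skip:x 20']
#     >>> list(tokenize_word("someone@example.com"))
#     ['email name:someone', 'email addr:example.com']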
# Generate tokens for:
# Content-Type
# and its type= param
# Content-Disposition
# and its filename= param
# all the charsets
#
# This has huge benefit for the f-n rate, and virtually no effect on the f-p
# rate, although it does reduce the variance of the f-p rate across different
# training sets (really marginal msgs, like a brief HTML msg saying just
# "unsubscribe me", are almost always tagged as spam now; before they were
# right on the edge, and now the multipart/alternative pushes them over it
# more consistently).
#
# XXX I put all of this in as one chunk. I don't know which parts are
# XXX most effective; it could be that some parts don't help at all. But
# XXX given the nature of the c.l.py tests, it's not surprising that the
# XXX 'content-type:text/html'
# XXX token is now the single most powerful spam indicator (== makes it
# XXX into the nbest list most often). What *is* a little surprising is
# XXX that this doesn't push more mixed-type msgs into the f-p camp --
# XXX unlike looking at *all* HTML tags, this is just one spam indicator
# XXX instead of dozens, so relevant msg content can cancel it out.
#
# A bug in this code prevented Content-Transfer-Encoding from getting
# picked up. Fixing that bug showed that it didn't help, so the corrected
# code is disabled now (left column without Content-Transfer-Encoding,
# right column with it):
#
# false positive percentages
# 0.000 0.000 tied
# 0.000 0.000 tied
# 0.100 0.100 tied
# 0.000 0.000 tied
# 0.025 0.025 tied
# 0.025 0.025 tied
# 0.100 0.100 tied
# 0.025 0.025 tied
# 0.025 0.025 tied
# 0.050 0.050 tied
# 0.100 0.100 tied
# 0.025 0.025 tied
# 0.025 0.025 tied
# 0.025 0.025 tied
# 0.025 0.025 tied
# 0.025 0.025 tied
# 0.025 0.025 tied
# 0.000 0.025 lost +(was 0)
# 0.025 0.025 tied
# 0.100 0.100 tied
#
# won 0 times
# tied 19 times
# lost 1 times
#
# total unique fp went from 9 to 10
#
# false negative percentages
# 0.364 0.400 lost +9.89%
# 0.400 0.364 won -9.00%
# 0.400 0.436 lost +9.00%
# 0.909 0.872 won -4.07%
# 0.836 0.836 tied
# 0.618 0.618 tied
# 0.291 0.291 tied
# 1.018 0.981 won -3.63%
# 0.982 0.982 tied
# 0.727 0.727 tied
# 0.800 0.800 tied
# 1.163 1.127 won -3.10%
# 0.764 0.836 lost +9.42%
# 0.473 0.473 tied
# 0.473 0.618 lost +30.66%
# 0.727 0.763 lost +4.95%
# 0.655 0.618 won -5.65%
# 0.509 0.473 won -7.07%
# 0.545 0.582 lost +6.79%
# 0.509 0.509 tied
#
# won 6 times
# tied 8 times
# lost 6 times
#
# total unique fn went from 168 to 169
# For support of the replace_nonascii_chars option, build a string.translate
# table that maps all high-bit chars and control chars to a '?' character.
non_ascii_translate_tab = ['?'] * 256
# leave everything from blank up to (but not including) DEL alone
for i in range(32, 127):
non_ascii_translate_tab[i] = chr(i)
# leave "normal" whitespace alone
for ch in ' \t\r\n':
non_ascii_translate_tab[ord(ch)] = ch
del i, ch
non_ascii_translate_tab = ''.join(non_ascii_translate_tab)
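# For illustration (doctest-style, not executed here): '\xe9' is a high-bit
# char and '\x07' a control char, so both get mapped to '?':
#
#     >>> 'caf\xe9\x07 ok'.translate(non_ascii_translate_tab)
#     'caf?? ok'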
def crack_content_xyz(msg):
yield 'content-type:' + msg.get_content_type()
x = msg.get_param('type')
if x is not None:
yield 'content-type/type:' + x.lower()
for x in msg.get_charsets(None):
if x is not None:
yield 'charset:' + x.lower()
x = msg.get('content-disposition')
if x is not None:
yield 'content-disposition:' + x.lower()
try:
fname = msg.get_filename()
if fname is not None:
for x in crack_filename(fname):
yield 'filename:' + x
except TypeError:
# bug in email pkg? see the thread beginning at
# http://mail.python.org/pipermail/spambayes/2003-September/008006.html
# and
# http://mail.python.org/pipermail/spambayes-dev/2003-September/001177.html
yield "filename:<bogus>"
if 0: # disabled; see comment before function
x = msg.get('content-transfer-encoding')
if x is not None:
yield 'content-transfer-encoding:' + x.lower()
# The base64 decoder is actually very forgiving, but flubs one case:
# if no padding is required (no trailing '='), it continues to read
# following lines as if they were still part of the base64 part. We're
# actually stricter here. The *point* is that some mailers tack plain
# text on to the end of base64-encoded text sections.
# Match a line of base64, up to & including the trailing newline.
# We allow for optional leading and trailing whitespace, and don't care
# about line length, but other than that are strict. Group 1 is non-empty
# after a match iff the last significant char on the line is '='; in that
# case, it must be the last line of the base64 section.
base64_re = re.compile(r"""
[ \t]*
[a-zA-Z0-9+/]*
(=*)
[ \t]*
\r?
\n
""", re.VERBOSE)
def try_to_repair_damaged_base64(text):
i = 0
while True:
# text[:i] looks like base64. Does the line starting at i also?
m = base64_re.match(text, i)
if not m:
break
i = m.end()
if m.group(1):
# This line has a trailing '=' -- the base64 part is done.
break
base64text = ''
if i:
base64 = text[:i]
try:
base64text = binascii.a2b_base64(base64)
except:
# There's no point in tokenizing raw base64 gibberish.
pass
return base64text + text[i:]
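# For illustration, a doctest-style sketch (not executed here); "aGVsbG8=" is
# base64 for "hello", and the tacked-on plain text is preserved as-is:
#
#     >>> try_to_repair_damaged_base64("aGVsbG8=\nGreat deals below!\n")
#     'helloGreat deals below!\n'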
def breakdown_host(host):
parts = host.split('.')
for i in range(1, len(parts) + 1):
yield '.'.join(parts[-i:])
def breakdown_ipaddr(ipaddr):
parts = ipaddr.split('.')
for i in range(1, 5):
yield '.'.join(parts[:i])
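# For illustration (doctest-style, not executed here; the host and ip come
# from the Received: examples in the comments above):
#
#     >>> list(breakdown_host('manatee.mojam.com'))
#     ['com', 'mojam.com', 'manatee.mojam.com']
#     >>> list(breakdown_ipaddr('218.5.93.116'))
#     ['218', '218.5', '218.5.93', '218.5.93.116']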
def log2(n, log=math.log, c=math.log(2)):
return log(n)/c
class Stripper(object):
# The retained portions are catenated together with self.separator.
# CAUTION: This used to be blank. But then I noticed spam putting
# HTML comments embedded in words, like
# FR<!--slkdflskjf-->EE!
# Breaking this into "FR" and "EE!" wasn't a real help <wink>.
separator = '' # a subclass can override if this isn't appropriate
def __init__(self, find_start, find_end):
# find_start and find_end have signature
# string, int -> match_object
# where the search starts at string[int:int]. If a match isn't found,
# they must return None. The match_object for find_start, if not
# None, is passed to self.tokenize, which returns a (possibly empty)
# list of tokens to generate. Subclasses may override tokenize().
# Text between find_start and find_end is thrown away, except for
# whatever tokenize() produces. A match_object must support method
# span() -> int, int # the slice bounds of what was matched
self.find_start = find_start
self.find_end = find_end
# Efficiency note: This is cheaper than it looks if there aren't any
# special sections. Under the covers, string[0:] is optimized to
# return string (no new object is built), and likewise ' '.join([string])
# is optimized to return string. It would actually slow this code down
# to special-case these "do nothing" special cases at the Python level!
def analyze(self, text):
i = 0
retained = []
pushretained = retained.append
tokens = []
while True:
m = self.find_start(text, i)
if not m:
pushretained(text[i:])
break
start, end = m.span()
pushretained(text[i : start])
tokens.extend(self.tokenize(m))
m = self.find_end(text, end)
if not m:
# No matching end - act as if the open
# tag did not exist.
pushretained(text[start:])
break
dummy, i = m.span()
return self.separator.join(retained), tokens
def tokenize(self, match_object):
# Override this if you want to suck info out of the start pattern.
return []
# Strip out uuencoded sections and produce tokens. The return value
# is (new_text, sequence_of_tokens), where new_text no longer contains
# uuencoded stuff. Note that we're not bothering to decode it! Maybe
# we should. One of my persistent false negatives is a spam containing
# nothing but a uuencoded money.txt; OTOH, uuencode seems to be on
# its way out (that's an old spam).
uuencode_begin_re = re.compile(r"""
^begin \s+
(\S+) \s+ # capture mode
(\S+) \s* # capture filename
$
""", re.VERBOSE | re.MULTILINE)
uuencode_end_re = re.compile(r"^end\s*\n", re.MULTILINE)
class UUencodeStripper(Stripper):
def __init__(self):
Stripper.__init__(self, uuencode_begin_re.search,
uuencode_end_re.search)
def tokenize(self, m):
mode, fname = m.groups()
return (['uuencode mode:%s' % mode] +
['uuencode:%s' % x for x in crack_filename(fname)])
crack_uuencode = UUencodeStripper().analyze
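# For illustration, a doctest-style sketch of crack_uuencode() on a made-up
# message body (not executed here); the uuencoded line itself is discarded,
# not decoded:
#
#     >>> body = "Hi!\nbegin 644 money.txt\nM9FEL;&5R\n`\nend\nBye.\n"
#     >>> text, toks = crack_uuencode(body)
#     >>> text
#     'Hi!\nBye.\n'
#     >>> toks
#     ['uuencode mode:644', 'uuencode:fname:money.txt', 'uuencode:fname piece:money', 'uuencode:fname piece:txt']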
# Strip and specially tokenize embedded URLish thingies.
url_fancy_re = re.compile(r"""
\b # the preceding character must not be alphanumeric
(?:
(?:
(https? | ftp) # capture the protocol
:// # skip the boilerplate
)|
(?= ftp\.[^\.\s<>"'\x7f-\xff] )| # allow the protocol to be missing, but only if
(?= www\.[^\.\s<>"'\x7f-\xff] ) # the rest of the url starts "www.x" or "ftp.x"
)
# Do a reasonable attempt at detecting the end. It may or may not
# be in HTML, may or may not be in quotes, etc. If it's full of %
# escapes, cool -- that's a clue too.
([^\s<>"'\x7f-\xff]+) # capture the guts
""", re.VERBOSE) # '
url_re = re.compile(r"""
(https? | ftp) # capture the protocol
:// # skip the boilerplate
# Do a reasonable attempt at detecting the end. It may or may not
# be in HTML, may or may not be in quotes, etc. If it's full of %
# escapes, cool -- that's a clue too.
([^\s<>"'\x7f-\xff]+) # capture the guts
""", re.VERBOSE) # '
urlsep_re = re.compile(r"[;?:@&=+,$.]")
class URLStripper(Stripper):
def __init__(self):
# The empty regexp matches anything at once.
if options["Tokenizer", "x-fancy_url_recognition"]:
search = url_fancy_re.search
else:
search = url_re.search
Stripper.__init__(self, search, re.compile("").search)
def tokenize(self, m):
proto, guts = m.groups()
assert guts
if proto is None:
if guts.lower().startswith("www"):
proto = "http"
elif guts.lower().startswith("ftp"):
proto = "ftp"
else:
proto = "unknown"
tokens = ["proto:" + proto]
pushclue = tokens.append
if options["Tokenizer", "x-pick_apart_urls"]:
url = proto + "://" + guts
escapes = re.findall(r'%..', guts)
# roughly how many %nn escapes are there?
if escapes:
pushclue("url:%%%d" % int(log2(len(escapes))))
# %nn escapes are usually intentional obfuscation. Generate a
# lot of correlated tokens if the URL contains a lot of them.
# The classifier will learn which specific ones are and aren't
# spammy.
tokens.extend(["url:" + escape for escape in escapes])
# now remove any obfuscation and probe around a bit
url = urllib.unquote(url)
scheme, netloc, path, params, query, frag = urlparse.urlparse(url)
# one common technique in bogus "please (re-)authorize yourself"
# scams is to make it appear as if you're visiting a valid
# payment-oriented site like PayPal, CitiBank or eBay, when you
# actually aren't. The company's web server appears as the
# beginning of an often long username element in the URL such as
# http://www.paypal.com%65%43%99%35@10.0.1.1/iwantyourccinfo
# generally with an innocuous-looking fragment of text or a
# valid URL as the highlighted link. Usernames should rarely
# appear in URLs (perhaps in a local bookmark you established),
# and never in a URL you receive from an unsolicited email or
# another website.
user_pwd, host_port = urllib.splituser(netloc)
if user_pwd is not None:
pushclue("url:has user")
host, port = urllib.splitport(host_port)
# web servers listening on non-standard ports are suspicious ...
if port is not None:
if (scheme == "http" and port != '80' or
scheme == "https" and port != '443'):
pushclue("url:non-standard %s port" % scheme)
# ... as are web servers associated with raw ip addresses
if re.match("(\d+\.?){4,4}$", host) is not None:
pushclue("url:ip addr")
# make sure we later tokenize the unobfuscated url bits
proto, guts = url.split("://", 1)
# Lose the trailing punctuation for casual embedding, like:
# The code is at http://mystuff.org/here? Didn't resolve.
# or
# I found it at http://mystuff.org/there/. Thanks!
while guts and guts[-1] in '.:?!/':
guts = guts[:-1]
for piece in guts.split('/'):
for chunk in urlsep_re.split(piece):
pushclue("url:" + chunk)
return tokens
received_complaints_re = re.compile(r'\([a-z]+(?:\s+[a-z]+)+\)')
class SlurpingURLStripper(URLStripper):
def __init__(self):
URLStripper.__init__(self)
def analyze(self, text):
# If there are no URLs, then we need to clear the
# wordstream, or whatever was there from the last message
# will be used.
classifier.slurp_wordstream = None
# Continue as normal.
return URLStripper.analyze(self, text)
def tokenize(self, m):
# XXX Note that the 'slurped' tokens are *always* trained
# XXX on; it would be simple to change/parameterize this.
tokens = URLStripper.tokenize(self, m)
if not options["URLRetriever", "x-slurp_urls"]:
return tokens
proto, guts = m.groups()
if proto != "http":
return tokens
assert guts
while guts and guts[-1] in '.:;?!/)':
guts = guts[:-1]
classifier.slurp_wordstream = (proto, guts)
return tokens
if options["URLRetriever", "x-slurp_urls"]:
crack_urls = SlurpingURLStripper().analyze
else:
crack_urls = URLStripper().analyze
# Nuke HTML <style gimmicks.
html_style_start_re = re.compile(r"""
< \s* style\b [^>]* >
""", re.VERBOSE)
class StyleStripper(Stripper):
def __init__(self):
Stripper.__init__(self, html_style_start_re.search,
re.compile(r"</style>").search)
crack_html_style = StyleStripper().analyze
# Nuke HTML comments.
class CommentStripper(Stripper):
def __init__(self):
Stripper.__init__(self,
re.compile(r"<!--|<\s*comment\s*[^>]*>").search,
re.compile(r"-->|</comment>").search)
crack_html_comment = CommentStripper().analyze
# Nuke stuff between <noframes> </noframes> tags.
class NoframesStripper(Stripper):
def __init__(self):
Stripper.__init__(self,
re.compile(r"<\s*noframes\s*>").search,
re.compile(r"</noframes\s*>").search)
crack_noframes = NoframesStripper().analyze
# Scan HTML for constructs often seen in viruses and worms.
# <script </script
# <iframe </iframe
# src=cid:
# height=0 width=0
virus_re = re.compile(r"""
< /? \s* (?: script | iframe) \b
| \b src= ['"]? cid:
| \b (?: height | width) = ['"]? 0
""", re.VERBOSE) # '
def find_html_virus_clues(text):
for bingo in virus_re.findall(text):
yield bingo
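# For illustration (doctest-style, not executed here; the snippet is made up
# and already lowercased, as tokenize_body() lowercases before this point):
#
#     >>> list(find_html_virus_clues('<iframe src=cid:xyz height=0 width=0>'))
#     ['<iframe', 'src=cid:', 'height=0', 'width=0']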
numeric_entity_re = re.compile(r'&#(\d+);')
def numeric_entity_replacer(m):
try:
return chr(int(m.group(1)))
except:
return '?'
breaking_entity_re = re.compile(r"""
  &nbsp;
| < (?: p
      | br
    )
  >
""", re.VERBOSE)
class Tokenizer:
date_hms_re = re.compile(r' (?P<hour>[0-9][0-9])'
r':(?P<minute>[0-9][0-9])'
r'(?::[0-9][0-9])? ')
date_formats = ("%a, %d %b %Y %H:%M:%S (%Z)",
"%a, %d %b %Y %H:%M:%S %Z",
"%d %b %Y %H:%M:%S (%Z)",
"%d %b %Y %H:%M:%S %Z",
"%a, %d %b %Y %H:%M (%Z)",
"%a, %d %b %Y %H:%M %Z",
"%d %b %Y %H:%M (%Z)",
"%d %b %Y %H:%M %Z")
def __init__(self):
if options["Tokenizer", "basic_header_tokenize"]:
self.basic_skip = [re.compile(s)
for s in options["Tokenizer",
"basic_header_skip"]]
def get_message(self, obj):
return get_message(obj)
def tokenize(self, obj):
msg = self.get_message(obj)
for tok in self.tokenize_headers(msg):
yield tok
for tok in self.tokenize_body(msg):
yield tok
def tokenize_headers(self, msg):
# Special tagging of header lines and MIME metadata.
# Content-{Type, Disposition} and their params, and charsets.
# This is done for all MIME sections.
for x in msg.walk():
for w in crack_content_xyz(x):
yield w
# The rest is solely tokenization of header lines.
# XXX The headers in my (Tim's) spam and ham corpora are so different
# XXX (they came from different sources) that including several kinds
# XXX of header analysis renders the classifier's job trivial. So
# XXX lots of this is crippled now, controlled by an ever-growing
# XXX collection of funky options.
# Basic header tokenization
# Tokenize the contents of each header field in the way Subject lines
# are tokenized later.
# XXX Different kinds of tokenization have gotten better results on
# XXX different header lines. No experiments have been run on
# XXX whether the best choice is being made for each of the header
# XXX lines tokenized by this section.
# The name of the header is used as a tag. Tokens look like
# "header:word". The basic approach is simple and effective, but
# also very sensitive to biases in the ham and spam collections.
# For example, if the ham and spam were collected at different
# times, several headers with date/time information will become
# the best discriminators.
# (Not just Date, but Received and X-From_.)
if options["Tokenizer", "basic_header_tokenize"]:
for k, v in msg.items():
k = k.lower()
for rx in self.basic_skip:
if rx.match(k):
break # do nothing -- we're supposed to skip this
else:
# Never found a match -- don't skip this.
for w in subject_word_re.findall(v):
for t in tokenize_word(w):
yield "%s:%s" % (k, t)
if options["Tokenizer", "basic_header_tokenize_only"]:
return
# Habeas Headers - see http://www.habeas.com
if options["Tokenizer", "x-search_for_habeas_headers"]:
habeas_headers = [
("X-Habeas-SWE-1", "winter into spring"),
("X-Habeas-SWE-2", "brightly anticipated"),
("X-Habeas-SWE-3", "like Habeas SWE (tm)"),
("X-Habeas-SWE-4", "Copyright 2002 Habeas (tm)"),
("X-Habeas-SWE-5", "Sender Warranted Email (SWE) (tm). The sender of this"),
("X-Habeas-SWE-6", "email in exchange for a license for this Habeas"),
("X-Habeas-SWE-7", "warrant mark warrants that this is a Habeas Compliant"),
("X-Habeas-SWE-8", "Message (HCM) and not spam. Please report use of this"),
("X-Habeas-SWE-9", "mark in spam to <http://www.habeas.com/report/>.")
]
valid_habeas = 0
invalid_habeas = False
for opt, val in habeas_headers:
habeas = msg.get(opt)
if habeas is not None:
if options["Tokenizer", "x-reduce_habeas_headers"]:
if habeas == val:
valid_habeas += 1
else:
invalid_habeas = True
else:
if habeas == val:
yield opt.lower() + ":valid"
else:
yield opt.lower() + ":invalid"
if options["Tokenizer", "x-reduce_habeas_headers"]:
# If there was any invalid line, we record as invalid.
# If all nine lines were correct, we record as valid.
# Otherwise we ignore.
if invalid_habeas == True:
yield "x-habeas-swe:invalid"
elif valid_habeas == 9:
yield "x-habeas-swe:valid"
# Subject:
# Don't ignore case in Subject lines; e.g., 'free' versus 'FREE' is
# especially significant in this context. Experiment showed a small
# but real benefit to keeping case intact in this specific context.
x = msg.get('subject', '')
try:
subjcharsetlist = email.Header.decode_header(x)
except (binascii.Error, email.Errors.HeaderParseError):
subjcharsetlist = [(x, 'invalid')]
for x, subjcharset in subjcharsetlist:
if subjcharset is not None:
yield 'subjectcharset:' + subjcharset
for w in subject_word_re.findall(x):
for t in tokenize_word(w):
yield 'subject:' + t
for w in punctuation_run_re.findall(x):
yield 'subject:' + w
# Dang -- I can't use Sender:. If I do,
# 'sender:email name:python-list-admin'
# becomes the most powerful indicator in the whole database.
#
# From: # this helps both rates
# Reply-To: # my error rates are too low now to tell about this
# # one (small wins & losses across runs, overall
# # not significant), so leaving it out
# To:, Cc: # These can help, if your ham and spam are sourced
# # from the same location. If not, they'll be horrible.
for field in options["Tokenizer", "address_headers"]:
addrlist = msg.get_all(field, [])
if not addrlist:
yield field + ":none"
continue
noname_count = 0
for name, addr in email.Utils.getaddresses(addrlist):
if name:
try:
subjcharsetlist = email.Header.decode_header(name)
except (binascii.Error, email.Errors.HeaderParseError):
subjcharsetlist = [(name, 'invalid')]
for name, charset in subjcharsetlist:
yield "%s:name:%s" % (field, name.lower())
if charset is not None:
yield "%s:charset:%s" % (field, charset)
else:
noname_count += 1
if addr:
for w in addr.lower().split('@'):
yield "%s:addr:%s" % (field, w)
else:
yield field + ":addr:none"
if noname_count:
yield "%s:no real name:2**%d" % (field,
round(log2(noname_count)))
# Spammers sometimes send out mail alphabetically to fairly large
# numbers of addresses. This results in headers like:
# To: <itinerart@videotron.ca>
# Cc: <itinerant@skyful.com>, <itinerant@netillusions.net>,
# <itineraries@musi-cal.com>, <itinerario@rullet.leidenuniv.nl>,
# <itinerance@sorengo.com>
#
# This token attempts to exploit that property. The above would
# give a common prefix of "itinera" for 6 addresses, yielding a
# gross score of 42. We group scores into buckets by dividing by 10; scores
# greater than 3 are further collapsed below, so this example would yield the
# token "pfxlen:big" rather than "pfxlen:4". The length test
# eliminates the bad case where the message was sent to a single
# individual.
if options["Tokenizer", "summarize_email_prefixes"]:
all_addrs = []
addresses = msg.get_all('to', []) + msg.get_all('cc', [])
for name, addr in email.Utils.getaddresses(addresses):
all_addrs.append(addr.lower())
if len(all_addrs) > 1:
# don't be fooled by "os.path." - commonprefix
# operates char-by-char!
pfx = os.path.commonprefix(all_addrs)
if pfx:
score = (len(pfx) * len(all_addrs)) // 10
# After staring at pfxlen:* values generated from a large
# number of ham & spam I saw that any scores greater
# than 3 were always associated with spam. Collapsing
# all such scores into a single token avoids a bunch of
# hapaxes like "pfxlen:28".
if score > 3:
yield "pfxlen:big"
else:
yield "pfxlen:%d" % score
# same idea as above, but works for addresses in the same domain
# like
# To: "skip" <bugs@mojam.com>, <chris@mojam.com>,
# <concertmaster@mojam.com>, <concerts@mojam.com>,
# <design@mojam.com>, <rob@mojam.com>, <skip@mojam.com>
if options["Tokenizer", "summarize_email_suffixes"]:
all_addrs = []
addresses = msg.get_all('to', []) + msg.get_all('cc', [])
for name, addr in email.Utils.getaddresses(addresses):
# flip address code so following logic is the same as
# that for prefixes
addr = list(addr)
addr.reverse()
addr = "".join(addr)
all_addrs.append(addr.lower())
if len(all_addrs) > 1:
# don't be fooled by "os.path." - commonprefix
# operates char-by-char!
sfx = os.path.commonprefix(all_addrs)
if sfx:
score = (len(sfx) * len(all_addrs)) // 10
# Similar analysis as above regarding suffix length
# I suspect the best cutoff is probably dependent on
# how long the recipient domain is (e.g. "mojam.com" vs.
# "montanaro.dyndns.org")
if score > 5:
yield "sfxlen:big"
else:
yield "sfxlen:%d" % score
# To:
# Cc:
# Count the number of addresses in each of the recipient headers.
for field in ('to', 'cc'):
count = 0
for addrs in msg.get_all(field, []):
count += len(addrs.split(','))
if count > 0:
yield '%s:2**%d' % (field, round(log2(count)))
# These headers seem to work best if they're not tokenized: just
# normalize case and whitespace.
# X-Mailer: This is a pure and significant win for the f-n rate; f-p
# rate isn't affected.
for field in ('x-mailer',):
prefix = field + ':'
x = msg.get(field, 'none').lower()
yield prefix + ' '.join(x.split())
# Received:
# Neil Schemenauer reports good results from this.
if options["Tokenizer", "mine_received_headers"]:
for header in msg.get_all("received", ()):
# everything here should be case insensitive and not be
# split across continuation lines, so normalize whitespace
# and letter case just once per header
header = ' '.join(header.split()).lower()
for clue in received_complaints_re.findall(header):
yield 'received:' + clue
for pat, breakdown in [(received_host_re, breakdown_host),
(received_ip_re, breakdown_ipaddr)]:
m = pat.search(header)
if m:
for tok in breakdown(m.group(1)):
yield 'received:' + tok
# Date:
if options["Tokenizer", "x-generate_time_buckets"]:
for header in msg.get_all("date", ()):
mat = self.date_hms_re.search(header)
# return the time in Date: headers arranged in
# 10-minute buckets
if mat is not None:
h = int(mat.group('hour'))
bucket = int(mat.group('minute')) // 10
yield 'time:%02d:%d' % (h, bucket)
if options["Tokenizer", "x-extract_dow"]:
for header in msg.get_all("date", ()):
# extract the day of the week
for fmt in self.date_formats:
try:
timetuple = time.strptime(header, fmt)
except ValueError:
pass
else:
yield 'dow:%d' % timetuple[6]
break
else:
# if nothing matches, declare the Date: header invalid
yield 'dow:invalid'
# Message-Id: This seems to be a small win and should not
# adversely affect a mixed source corpus so it's always enabled.
msgid = msg.get("message-id", "")
m = message_id_re.match(msgid)
if m:
# looks okay, return the hostname
yield 'message-id:@%s' % m.group(1)
else:
# might be weird instead of invalid but who cares?
yield 'message-id:invalid'
# As suggested by Anthony Baxter, merely counting the number of
# header lines, and in a case-sensitive way, has real value.
# For example, all-caps SUBJECT is a strong spam clue, while
# X-Complaints-To is a strong ham clue.
x2n = {}
if options["Tokenizer", "count_all_header_lines"]:
for x in msg.keys():
x2n[x] = x2n.get(x, 0) + 1
else:
# Do a "safe" approximation to that. When spam and ham are
# collected from different sources, the count of some header
# lines can be too strong a discriminator for accidental
# reasons.
safe_headers = options["Tokenizer", "safe_headers"]
for x in msg.keys():
if x.lower() in safe_headers:
x2n[x] = x2n.get(x, 0) + 1
for x in x2n.items():
yield "header:%s:%d" % x
if options["Tokenizer", "record_header_absence"]:
for k in x2n:
if not k.lower() in options["Tokenizer", "safe_headers"]:
yield "noheader:" + k
def tokenize_body(self, msg, maxword=options["Tokenizer",
"skip_max_word_size"]):
"""Generate a stream of tokens from an email Message.
If options['Tokenizer', 'check_octets'] is True, the first few
undecoded characters of application/octet-stream parts of the
message body become tokens.
"""
if options["Tokenizer", "check_octets"]:
# Find, decode application/octet-stream parts of the body,
# tokenizing the first few characters of each chunk.
for part in octetparts(msg):
try:
text = part.get_payload(decode=True)
except:
yield "control: couldn't decode octet"
text = part.get_payload(decode=False)
if text is None:
yield "control: octet payload is None"
continue
yield "octet:%s" % text[:options["Tokenizer",
"octet_prefix_size"]]
# Find, decode (base64, qp), and tokenize textual parts of the body.
for part in textparts(msg):
# Decode, or take it as-is if decoding fails.
try:
text = part.get_payload(decode=True)
except:
yield "control: couldn't decode"
text = part.get_payload(decode=False)
if text is not None:
text = try_to_repair_damaged_base64(text)
if text is None:
yield 'control: payload is None'
continue
# Replace numeric character entities (like &#97; for the letter
# 'a').
text = numeric_entity_re.sub(numeric_entity_replacer, text)
# Normalize case.
text = text.lower()
if options["Tokenizer", "replace_nonascii_chars"]:
# Replace high-bit chars and control chars with '?'.
text = text.translate(non_ascii_translate_tab)
for t in find_html_virus_clues(text):
yield "virus:%s" % t
# Get rid of uuencoded sections, embedded URLs, <style gimmicks,
# and HTML comments.
for cracker in (crack_uuencode,
crack_urls,
crack_html_style,
crack_html_comment,
crack_noframes):
text, tokens = cracker(text)
for t in tokens:
yield t
# Remove HTML/XML tags. Also &nbsp;, <br> and <p> should each
# be replaced with a space.
text = breaking_entity_re.sub(' ', text)
# It's important to eliminate HTML tags rather than, e.g.,
# replace them with a blank (as this code used to do), else
# simple tricks like
# Wr<!$FS|i|R3$s80sA >inkle Reduc<!$FS|i|R3$s80sA >tion
# can be used to disguise words. <br> and <p> were special-
# cased just above (because browsers break text on those,
# they can't be used to hide words effectively).
text = html_re.sub('', text)
# Tokenize everything in the body.
for w in text.split():
n = len(w)
# Make sure this range matches in tokenize_word().
if 3 <= n <= maxword:
yield w
elif n >= 3:
for t in tokenize_word(w):
yield t
tokenize = Tokenizer().tokenize