#!/usr/local/bin/python

"""
Compute a 'loose' checksum on the msg (file on cmdline or via stdin).

Attempts are made to eliminate content which tends to obscure the 'sameness'
of messages.  This is aimed particularly at spam, which tends to contains
lots of small differences across messages to try and thwart spam filters, in
hopes that at least one copy reaches its desitination.

Before calculating the checksum, this script does the following:

    * delete the message header

    * delete HTML tags which generally contain URLs

    * delete anything which looks like an email address or URL

    * finally, discard everything other than ascii letters and digits (note
      that this will almost certainly be ineffectual for spam written in
      eastern languages such as Korean)

An MD5 checksum is then computed for the resulting text and written to stdout.

"""

import getopt
import sys
import email.Parser
import md5
import re
import time
import binascii

from spambayes.mboxutils import getmbox

def flatten(obj):
    # I do not know how to use the email package very well - all I want here
    # is the body of obj expressed as a string - there is probably a better
    # way to accomplish this which I haven't discovered.
    # three types are possible: string, Message (hasattr(get_payload)), list
    if isinstance(obj, str):
        return obj
    if hasattr(obj, "get_payload"):
        return flatten(obj.get_payload())
    if isinstance(obj, list):
        return "\n".join([flatten(b) for b in obj])
    raise TypeError, ("unrecognized body type: %s" % type(obj))

def generate_checksum(msg):
    data = flatten(msg)

    # modelled after Justin Mason's fuzzy checksummer for SpamAssassin.
    # Message body is cleaned, then broken into lines.  The list of lines is
    # then broken into four parts and separate checksums are generated for
    # each part.  They are then joined together with '.'.  Downstream
    # processes can split those chunks into pieces and consider them
    # separately or in various combinations if desired.

    # Get rid of anything which looks like an HTML tag and downcase it all
    data = re.sub(r"<[^>]*>", "", data).lower()

    # delete anything which looks like a url or email address
    # not sure what a pmguid: url is but it seems to occur frequently in spam
    words = [w for w in data.split(' ')
             if ('@' not in w and
                 (':' not in w or
                  w[:4] != "ftp:" and
                  w[:7] != "mailto:" and
                  w[:5] != "http:" and
                  w[:7] != "gopher:" and
                  w[:8] != "pmguid:"))]

    # delete lines which contain white space
    lines = [line for line in " ".join(words).split('\n') if ' ' in line]

    # +1 guarantees we don't miss lines at the end
    chunksize = len(lines)//4+1
    sum = []
    for i in range(4):
        chunk = "\n".join(lines[i*chunksize:(i+1)*chunksize])
        sum.append(binascii.b2a_hex(md5.new(chunk).digest()))

    return ".".join(sum)

def main(args):
    opts, args = getopt.getopt(args, "")
    for opt, arg in opts:
        pass
    if not args:
        mboxes = [getmbox("-")]
    else:
        mboxes = [getmbox(a) for a in args]

    for mbox in mboxes:
        for msg in mbox:
            print generate_checksum(msg)

if __name__ == "__main__":
    main(sys.argv[1:])