#! /usr/local/bin/python2.3

### Set up the classic Data/Ham/reservoir and Data/Spam/reservoir
### directories from the MH mailboxes ~/Mail/everything and
### ~/Mail/spam (plus ~/Mail/newspam by default).

"""Usage: %(program)s [OPTIONS] ...

Where OPTIONS is one or more of:
    -h
        show usage and exit
    -e PATH
        directory of all messages (ham and spam).
    -s PATH
        directory of known spam messages.  These should be duplicates
        of messages in the everything folder.  Can be specified more
        than once.
"""

import getopt
import sys
import os
import filecmp
import shutil

# Script name as invoked; substituted into the usage text via %(program)s.
program = sys.argv[0]
# When true, main() prints progress messages while scanning.
loud = True
# Number of seconds in one day.
# NOTE(review): day, expire and grouping are not referenced anywhere in
# the visible part of this script -- confirm whether they are still needed.
day = 24 * 60 * 60
# The following are in days
expire = 4 * 30
grouping = 2
# Destination directories (relative to the current working directory)
# that receive the copied ham and spam messages respectively.
hamdir = "Data/Ham/reservoir"
spamdir = "Data/Spam/reservoir"

def usage(code, msg=''):
    """Write the usage text to stderr and terminate the process.

    code -- exit status handed to sys.exit().
    msg  -- optional message (string or exception); when true it is
            written first, followed by an empty line.

    The usage text is the module docstring with %(name)s fields filled
    in from the module globals.  This function never returns.
    """
    err = sys.stderr
    if msg:
        # Message line plus one blank separator line.
        err.write("%s\n\n" % (msg,))
    help_text = __doc__ % globals()
    err.write("%s\n" % (help_text,))
    sys.exit(code)

def _say(text):
    """Print a progress line to stdout when the module-level `loud` is set."""
    if loud:
        sys.stdout.write(text + "\n")


def _is_message(fname):
    """Return true if a directory entry's name starts with a digit 1-9.

    This is the filter used to pick message files out of a mailbox
    directory; entries starting with anything else are skipped.
    """
    return fname[0] in ('1', '2', '3', '4', '5', '6', '7', '8', '9')


def _ensure_dir(path):
    """Create directory `path` (and any parents) unless it already exists."""
    # os.makedirs raises OSError for an existing directory, which used to
    # make the script fail on every run after the first.
    if not os.path.isdir(path):
        os.makedirs(path)


def main():
    """Main program; parse options and go.

    Reads options from sys.argv:
        -h        show usage and exit
        -e PATH   directory holding every message (ham and spam)
        -s PATH   directory of known spam; may be repeated

    With no options at all, ~/Mail/everything is used as the "everything"
    directory and ~/Mail/spam plus ~/Mail/newspam as the spam directories.

    Every message file in the "everything" directory is copied into
    `spamdir` if its contents are identical to some file in one of the
    spam directories, and into `hamdir` otherwise.  Both destination
    directories are created if missing.

    Exits through usage() with status 2 on bad options, on positional
    arguments, or when options are given without -e.
    """
    everything = None
    spam = []

    try:
        opts, args = getopt.getopt(sys.argv[1:], 'hs:e:')
    except getopt.error:
        # sys.exc_info() instead of "except X, e" so the same source
        # parses on both old and new interpreters.
        usage(2, sys.exc_info()[1])

    if opts:
        for opt, arg in opts:
            if opt == '-h':
                usage(0)
            elif opt == '-e':
                everything = arg
            elif opt == '-s':
                spam.append(arg)
    else:
        everything = os.path.expanduser("~/Mail/everything")
        spam = [os.path.expanduser("~/Mail/spam"),
                os.path.expanduser("~/Mail/newspam")]

    # Reject positional arguments whether or not options were given;
    # previously they were silently ignored when no option was present.
    if args:
        usage(2, "Positional arguments not allowed")

    # Options without -e used to crash later in os.listdir(None).
    if everything is None:
        usage(2, "No directory of all messages given (use -e PATH)")

    # Index the known spam by file size so each message only has to be
    # compared against spam files of the same length.
    spamsizes = {}
    for s in spam:
        _say("Scanning spamdir (%s):" % s)
        for f in os.listdir(s):
            if not _is_message(f):
                continue
            name = os.path.join(s, f)
            size = os.stat(name).st_size
            spamsizes.setdefault(size, []).append(name)

    _ensure_dir(spamdir)
    _ensure_dir(hamdir)

    _say("Scanning everything")
    for f in os.listdir(everything):
        if not _is_message(f):
            continue
        name = os.path.join(everything, f)
        size = os.stat(name).st_size
        isspam = False
        for s in spamsizes.get(size, ()):
            # shallow=False forces a content comparison.  The default
            # shallow mode treats files with equal size and mtime as
            # identical without reading them, which could misfile ham.
            if filecmp.cmp(name, s, shallow=False):
                isspam = True
                break  # one match is enough
        if isspam:
            dest = spamdir
        else:
            dest = hamdir
        shutil.copyfile(name, os.path.join(dest, f))


# Run main() only when executed as a script, not when imported.
if __name__ == "__main__":
    main()


# syntax highlighted by Code2HTML, v. 0.9.1