#! /usr/local/bin/python2.3

"""Count the number of messages in Unix mboxes.

Usage: %(programs)s [-g] [-h] path1 ...
Options:

    -h
        Print this help message and exit
    -g
        Do globbing on each path.  This is helpful on Windows, where the
        native shells don't glob.
"""

"""
Stats for Barry's corpora, as of 26-Aug-2002, using then-current 2.3a0:

edu-sig-clean.mbox                 252 (+ unparseable: 0)
python-dev-clean.mbox             8326 (+ unparseable: 0)
mailman-developers-clean.mbox     2427 (+ unparseable: 0)
python-list-clean.mbox          159072 (+ unparseable: 2)
zope3-clean.mbox                  2177 (+ unparseable: 0)

Unparseable messages are likely spam.
zope3-clean.mbox is really from the zope3-dev mailing list.
The Python version matters because the email package varies across releases
in whether it uses strict or lax parsing.
"""

import sys
import mailbox
import email
import getopt
import glob

from spambayes.mboxutils import get_message

try:
    True, False
except NameError:
    # Maintain compatibility with Python 2.2
    True, False = 1, 0


program = sys.argv[0]

def usage(code, msg=''):
    print >> sys.stderr, __doc__
    if msg:
        print >> sys.stderr, msg
    sys.exit(code)

def count(fname):
    fp = open(fname, 'rb')
    mbox = mailbox.PortableUnixMailbox(fp, get_message)
    goodcount = 0
    badcount = 0
    for msg in mbox:
        if msg["to"] is None and msg["cc"] is None:
            badcount += 1
        else:
            goodcount += 1
    fp.close()
    return goodcount, badcount

def main():
    try:
        opts, args = getopt.getopt(sys.argv[1:], 'hg', ['help'])
    except getopt.error, msg:
        usage(1, msg)

    doglob = False
    for opt, arg in opts:
        if opt in ('-h', '--help'):
            usage(0)
        elif opt == '-g':
            doglob = True

    for path in args:
        if doglob:
            fnames = glob.glob(path)
        else:
            fnames = [path]

        for fname in fnames:
            goodn, badn = count(fname)
            print "%-35s %7d (+ unparseable: %d)" % (fname, goodn, badn)

if __name__ == '__main__':
    main()


syntax highlighted by Code2HTML, v. 0.9.1