#!/usr/local/bin/python2.3

"""
rebal.py - rebalance a ham or spam test directory

usage: rebal.py [ options ]
options:
   -d     - dry run; display what would be moved, but don't do it [%(DRYRUN)s]
   -n num - specify number of files per Set dir desired [%(NPERDIR)s]
   -t dir - top directory, holding Set and reservoir subdirs [%(TOPDIR)s]

   -v     - tell user what's happening; opposite of -q [%(VERBOSE)s]
   -q     - be quiet about what's happening; opposite of -v [not %(VERBOSE)s]

   -c     - confirm file moves into Set directory; opposite of -Q [%(CONFIRM)s]
   -Q     - don't confirm moves; opposite of -c; independent of -v/-q

   -h     - display this message and quit

If you have a non-standard test setup, you can use -r/-s instead of -t:
   -r res - specify an alternate reservoir [%(RESDIR)s]
   -s set - specify an alternate Set prefix [%(SETPREFIX)s]

Moves files randomly among the Set subdirectories and a reservoir directory to
leave -n files in each Set directory.  By default, the Set1, Set2, ..., and
reservoir subdirectories under (relative path) Data/Ham/ are rebalanced; this
can be changed with the -t option.  The script will work with a variable
number of Set directories, but they must already exist, and the reservoir
directory must also exist.

It's recommended that you run with the -d (dry run) option first, to see what
the script would do without actually moving any files.  If, e.g., you
accidentally mix up spam Sets with your Ham reservoir, it could be very
difficult to recover from that mistake.

See the module comments for examples.
"""

# Examples:
#
#    rebal.py -n 300
#
# Moves files among the Set1, Set2, ..., and reservoir directories under
# Data/Ham/, leaving 300 files in each Set directory.
#
#    rebal.py -t Data/Spam -n 300
#
# The same, but under Data/Spam/.
#
#    rebal.py -r reservoir -s Set -n 300
#
# The same, but under the Set1, Set2, ..., and reservoir directories
# in the current directory.
#
# Supposing you want to shuffle your Set files around randomly, winding up
# with 300 files in each one, you can execute:
#
#    rebal.py -n 0
#    rebal.py -n 300 -Q
#
# The first moves all files from the various Data/Ham/Set directories to the
# Data/Ham/reservoir directory.  The second run randomly parcels out 300 files
# to each of the Data/Ham/Set directories.

import os
import sys
import random
import glob
import getopt

try:
    # Probe for the boolean builtins: merely looking the names up raises
    # NameError on an interpreter that does not define them.
    True, False
except NameError:
    # Maintain compatibility with Python 2.2 by binding module-level
    # stand-ins with the equivalent integer values.
    True, False = 1, 0


# Default settings.  main() starts from these, and usage() interpolates
# them into the help text.

# Behaviour flags.
DRYRUN = False
CONFIRM = True
VERBOSE = True

# Number of files wanted in each Set directory.
NPERDIR = 4000

# Directory layout: the Set prefix and the reservoir both live under TOPDIR.
TOPDIR = os.path.join("Data", "Ham")
SETPREFIX = os.path.join(TOPDIR, "Set")
RESDIR = os.path.join(TOPDIR, "reservoir")

def usage(msg=None):
    """Write the help text to stderr, preceded by msg when one is given.

       msg, if true, is converted with str() and written first, followed
       by a blank line.  The help text is the module docstring with its
       %(NAME)s fields filled in from the module globals.
    """
    emit = sys.stderr.write
    if msg:
        emit(str(msg) + "\n")
        emit("\n")
    emit(__doc__ % globals() + "\n")

def migrate(f, targetdir, verbose):
    """Move f into targetdir, renaming if needed to avoid name clashes.

       f is the path of the file to move, targetdir is the directory to
       move it into, and verbose, when true, makes the move be reported
       on stdout.

       The basename of the moved file is returned; this may not be the
       same as the basename of f, if the file had to be renamed because
       a file with f's basename already existed in targetdir.  A renamed
       file gets a random decimal number as its new stem and keeps f's
       extension.

       The move is done with os.rename, so any error it raises propagates
       to the caller.
    """

    base = os.path.basename(f)
    out = os.path.join(targetdir, base)
    # The extension is the same for every retry, so split it off once.
    ext = os.path.splitext(base)[1]
    while os.path.exists(out):
        digits = random.randrange(100000000)
        out = os.path.join(targetdir, str(digits) + ext)
    if verbose:
        # A single pre-formatted string prints the same text whether print
        # is a statement or a function.
        print("moving %s to %s" % (f, out))
    os.rename(f, out)
    return os.path.basename(out)

def main(args):
    """Rebalance the Set directories as directed by the command line.

       args is the list of command-line arguments, without the program
       name.

       Files are first moved at random out of any Set directory holding
       more than the requested number, into the reservoir; then each Set
       directory holding fewer is topped up with random files from the
       reservoir.  With -d nothing is moved; the planned moves are only
       printed.

       Returns an exit status: 0 on success or after -h; 1 for bad
       options, a missing reservoir or missing Set directories, too few
       files to go around, or a declined ham/spam mismatch prompt; -1 when
       -t is combined with -r or -s.
    """
    nperdir = NPERDIR
    verbose = VERBOSE
    confirm = CONFIRM
    dryrun = DRYRUN
    topdir = resdir = setprefix = None

    try:
        opts, args = getopt.getopt(args, "dr:s:t:n:vqcQh")
    except getopt.GetoptError:
        # Fetch the exception object through sys.exc_info() instead of
        # binding it in the except clause.
        usage(sys.exc_info()[1])
        return 1

    for opt, arg in opts:
        if opt == "-n":
            # Report a malformed count as a usage error, not a traceback.
            try:
                nperdir = int(arg)
            except ValueError:
                usage("-n requires an integer, not '%s'" % arg)
                return 1
        elif opt == "-t":
            topdir = arg
        elif opt == "-r":
            resdir = arg
        elif opt == "-s":
            setprefix = arg
        elif opt == "-v":
            verbose = True
        elif opt == "-c":
            confirm = True
        elif opt == "-q":
            verbose = False
        elif opt == "-Q":
            confirm = False
        elif opt == "-d":
            dryrun = True
        elif opt == "-h":
            usage()
            return 0
        else:
            raise SystemError("internal error on option '%s'" % opt)

    # A negative count would make the fs[nperdir:] slices below keep the
    # wrong number of files and then trip the assertions.
    if nperdir < 0:
        usage("-n can't be negative")
        return 1

    # Derive setprefix and resdir from topdir, if the latter was given.
    if topdir is not None:
        if resdir is not None or setprefix is not None:
            usage("-t can't be specified with -r or -s")
            return -1
        setprefix = os.path.join(topdir, "Set")
        resdir = os.path.join(topdir, "reservoir")
    else:
        if setprefix is None:
            setprefix = SETPREFIX
        if resdir is None:
            resdir = RESDIR

    # isdir rather than exists: os.listdir would raise on a plain file.
    if not os.path.isdir(resdir):
        sys.stderr.write("reservoir directory %s doesn't exist\n" % resdir)
        return 1
    res = os.listdir(resdir)

    # The glob can also match plain files whose names start with the
    # prefix; keep only directories, since os.listdir raises on the rest.
    dirs = [d for d in glob.glob(setprefix + "*") if os.path.isdir(d)]
    if not dirs:
        sys.stderr.write("no directories starting with %s exist.\n"
                         % setprefix)
        return 1

    # stuff <- list of (directory, files) pairs, where directory is the
    # name of a Set subdirectory, and files is a list of files in that dir.
    stuff = []
    n = len(res)    # total number of all files
    for d in dirs:
        fs = os.listdir(d)
        n += len(fs)
        stuff.append((d, fs))

    if nperdir * len(dirs) > n:
        sys.stderr.write("not enough files to go around - use lower -n.\n")
        return 1

    # weak check against mixing ham and spam
    if ((setprefix.find("Ham") >= 0 and resdir.find("Spam") >= 0) or
        (setprefix.find("Spam") >= 0 and resdir.find("Ham") >= 0)):
        yn = raw_input("Reservoir and Set dirs appear not to match. "
                       "Continue? (y/n) ")
        if yn.lower()[0:1] != 'y':
            return 1

    # If necessary, migrate random files to the reservoir.
    for (d, fs) in stuff:
        if len(fs) <= nperdir:
            continue

        # Retain only nperdir files, moving the rest to reservoir.
        random.shuffle(fs)
        movethese = fs[nperdir:]
        del fs[nperdir:]
        if dryrun:
            print("would move %d files from %s to reservoir %s"
                  % (len(movethese), d, resdir))
            # Nothing moved, but count the files as if they had been so
            # the growth pass below reports realistic numbers.
            res.extend(movethese)
        else:
            for f in movethese:
                # migrate may rename on a clash; track the name actually used.
                newname = migrate(os.path.join(d, f), resdir, verbose)
                res.append(newname)

    # Randomize reservoir once so we can just bite chunks from the end.
    random.shuffle(res)

    # Grow Set* directories from the reservoir as needed.
    for (d, fs) in stuff:
        assert len(fs) <= nperdir
        if nperdir == len(fs):
            continue

        numtomove = nperdir - len(fs)
        assert 0 < numtomove <= len(res)
        movethese = res[-numtomove:]
        del res[-numtomove:]
        if dryrun:
            print("would move %d files from reservoir %s to %s"
                  % (len(movethese), resdir, d))
        else:
            for f in movethese:
                if confirm:
                    # Show the message to the user; close the file
                    # explicitly instead of leaving it to the collector.
                    fp = open(os.path.join(resdir, f))
                    try:
                        print(fp.read())
                    finally:
                        fp.close()
                    ok = raw_input('good enough? ').lower()
                    if not ok.startswith('y'):
                        # A rejected file stays in the reservoir and no
                        # replacement is drawn for it.
                        continue
                migrate(os.path.join(resdir, f), d, verbose)

    return 0

# Script entry point: main's return value becomes the process exit status.
if __name__ == "__main__":
    exit_status = main(sys.argv[1:])
    sys.exit(exit_status)


# syntax highlighted by Code2HTML, v. 0.9.1