#!/usr/bin/env python

"""Whitewash the contents of a Subversion file and its successors.

Usage: svn-obliterate.py REPOS_PATH PATH REVISION
"""

import sys
import os
import string
import re
import bsddb3
from svn import repos, fs, core

###  TODO: Clean out the transactions table.
###  TODO: Clean out the other stuff (maybe).
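###  NOTE: This works by modifying the repository's Berkeley DB tables
###  (db/nodes, db/strings, db/representations) directly, so it assumes
###  the BDB back-end and should not be run while anything else has the
###  repository open.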

def die(msg):
    sys.stderr.write(msg + '\n')
    sys.exit(1)
                     

def get_rep_keys(skel):
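    # SKEL is assumed to be a directory node-revision skel (the only
    # kind the caller passes in), so the six-character '((dir ' header
    # is skipped unconditionally below.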
    # PROP-KEY and DATA-KEY (and maybe EDIT-KEY) follow the header,
    # each possibly preceded by an explicit atom-size prefix.
    size, rest = string.split(skel[6:], ' ', 1)
    path = rest[0:int(size)]
    rest = rest[int(size) + 1:]
    end_header = string.find(rest, ')')
    pieces = string.split(rest[end_header + 2:-1], ' ')
    prop_key = None
    data_key = None
    if pieces[0][0] in string.digits:
        del pieces[0]
    if pieces[0]:
        prop_key = pieces[0]
    if pieces[1][0] in string.digits:
        del pieces[1]
    if pieces[1]:
        data_key = pieces[1]
    return prop_key, data_key


def read_string(strings_db, string_key):
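    # A string in the `strings' table may be split across several
    # records stored under the same key; walk them in order and
    # concatenate the pieces.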
    string_data = ''
    key, value = strings_db.set_location(string_key)
    while key == string_key:
        string_data = string_data + value
        key, value = strings_db.next()
    return string_data


def unparse_dirent_skel(entries):
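    # Rebuild the entries-list skel: one '(NAME-LEN NAME ID-LEN ID)'
    # element per directory entry, all wrapped in a single pair of parens.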
    items = ''
    first_one = 1
    for name, id in entries.items():
        if not first_one:
            items = items + ' '
        first_one = 0
        items = items + '(%d %s %d %s)' % (len(name), name, len(id), id)
    return '(%s)' % items


def parse_dirent_skel(skel):
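    # Walk the entries-list skel, pulling out one (NAME, ID) pair per
    # element.  NAME may carry an explicit atom-size prefix; ID is the
    # dotted node revision ID that closes the element.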
    skel = skel[1:-1]
    entries = {}
    while 1:
        if not len(skel) or skel[0] != '(':
            break
        token, rest = string.split(skel[1:], ' ', 1)
        if skel[1] in string.digits:
            size = token
            name = rest[0:int(size)]
            rest = skel[1 + len(size) + 1 + int(size) + 1:]
        else:
            name = token
        match = re.match(r'([0-9]+ )?([a-zA-Z0-9]+\.[a-zA-Z0-9]+\.[a-zA-Z0-9]+)\)',
                          rest)
        if not match:
            break
        id = match.group(2)
        entries[name] = id
        skel = rest[len(match.group(0)) + 1:]
    return entries


_fulltext_re = re.compile(r'^(\(\(fulltext [^\(]+)\(md5 (16 )?')
def fix_affected_dirlists(node, reps_db, strings_db, affected_nodes, dirlists):
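    # Given a directory NODE skel, rewrite its entries-list string so
    # that it no longer mentions any node in AFFECTED_NODES.  Returns
    # the number of entries removed, or None if the node has no data rep.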
    prop_key, data_key = get_rep_keys(node)
    if not data_key:
        return
    data_rep = reps_db[data_key]
    
    # See if this is a fulltext rep.  If so, the STRING-KEY is a
    # pretty easy find.  We'll rewrite that STRING-KEY's contents and
    # clear the checksum from the REPRESENTATION.
    match = _fulltext_re.match(data_rep)
    if not match:
        die('Unable to handle non-fulltext dirent list "%s"' % data_key)

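    # Skip the 16-byte MD5 digest and the ')) ' that closes the header;
    # what remains should be the STRING-KEY atom (possibly preceded by an
    # explicit size), so the last space-separated piece is the key itself.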
    rep_rest = data_rep[len(match.group(0)) + 16 + 3:-1]
    pieces = string.split(rep_rest, ' ')
    string_key = pieces[-1]
    string_data = read_string(strings_db, string_key)
    entries = parse_dirent_skel(string_data)
    kill_count = 0
    for name, id in entries.items():
        if id in affected_nodes:
            kill_count = kill_count + 1
            del(entries[name])
    if kill_count:
        ### begin txn!
        del(strings_db[string_key])
        strings_db[string_key] = unparse_dirent_skel(entries)
        reps_db[data_key] = match.group(1) + \
                            '(md5 16 \0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0)) ' + \
                            str(len(string_key)) + ' ' + string_key + ')'
        ### end txn!
    return kill_count
    

def parse_node_skel(skel):
    # PREV-ID immediately follows the COMMITTED-PATH, unless there is
    # a skel atom size marker in there first.
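    # Returns a (PREV-ID, IS-DIR) pair; PREV-ID is None when the node
    # has no predecessor.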
    is_dir = 0
    if skel[0:7] == '((file ':
        size, rest = string.split(skel[7:], ' ', 1)
    elif skel[0:6] == '((dir ':
        is_dir = 1
        size, rest = string.split(skel[6:], ' ', 1)
    else:
        die("Unable to parse skel '%s'" % skel)
    path = rest[0:int(size)]
    rest = rest[int(size) + 1:]
    rest = rest[:string.find(rest, ')')]
    pieces = string.split(rest, ' ')
    prev_id = None
    if pieces[0][0] in string.digits:
        del pieces[0]
    if pieces[0] != '':
        prev_id = pieces[0]
    return prev_id, is_dir


def get_node_id(pool, repos_path, path, revision):
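    # Invoked via core.run_app(), which creates POOL and passes it in as
    # the first argument.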
    # Open the repository and filesystem.
    repos_ptr = repos.open(repos_path, pool)
    fs_ptr = repos.fs(repos_ptr)

    # Fetch the node revision ID of interest
    rev_root = fs.revision_root(fs_ptr, int(revision), pool)
    return fs.unparse_id(fs.node_id(rev_root, path, pool), pool)


def append_successors(nodes, node_id, affected_nodes):
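    # Recursively add NODE_ID and every one of its successors to
    # AFFECTED_NODES.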
    node = nodes[node_id]
    affected_nodes.append(node_id)
    for succ_id in node[2]:
        append_successors(nodes, succ_id, affected_nodes)


def main():
    kill_preds = 1

    ### Until this thing learns to purge the 'changes' table, it is
    ### basically useless (because dumps/loads are entirely
    ### 'changes'-table driven).  So just bail.

    print "This script will, at the moment, destroy your repository."
    print "You don't really want that, right?"
    sys.exit(0)
    
    # Parse the commandline arguments.
    argc = len(sys.argv)
    if argc < 4:
        print __doc__
        sys.exit(1)
    repos_path, path, revision = sys.argv[1:4]

    # Fetch the NODE-REV-ID of the PATH@REV which holds our interest.
    sys.stdout.write('Harvesting info for "%s" in r%s.\n' % \
                     (path, revision))
    sys.stdout.write('-- Determining node revision ID... ')
    sys.stdout.flush()
    node_id = core.run_app(get_node_id, repos_path, path, revision)
    sys.stdout.write('done.  [%s]\n' % node_id)

    # Scan the nodes table, parsing skels and building a node tree.
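    # Each entry maps a node revision ID to [PREV-ID, IS-DIR, [successor IDs]].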
    nodes = {}
    sys.stdout.write('-- Building node tree... ')
    sys.stdout.flush()
    nodes_table = os.path.join(repos_path, 'db', 'nodes')
    nodes_db = bsddb3.btopen(nodes_table, 'w')
    for key in nodes_db.keys():
        if key == 'next-key':
            continue
        value = nodes_db[key]
        prev_id, is_dir = parse_node_skel(value)
        nodes[key] = [prev_id, is_dir, []]
    for key in nodes.keys():
        value = nodes[key]
        if value[0]:
            prev_value = nodes[value[0]]
            prev_value[2].append(key)
            nodes[value[0]] = prev_value
    sys.stdout.write('done.  [found %d node(s)]\n' % len(nodes.keys()))

    # Determine the nodes we wish to purge.
    affected_nodes = []
    sys.stdout.write('-- Building node purge list... ')
    sys.stdout.flush()
    prev_id = node_id
    if kill_preds:
        # Walk back to the oldest predecessor so the entire line of
        # history gets purged, not just this node and its successors.
        while nodes[prev_id][0]:
            prev_id = nodes[prev_id][0]
    append_successors(nodes, prev_id, affected_nodes)
    sys.stdout.write('done.  [found %d node(s)]\n' % len(affected_nodes))
    for id in affected_nodes:
        sys.stdout.write('   -- %s\n' % id)

    # Now, the hard part.  We need to find every directory listing
    # that contains one of our to-be-purge nodes, and then remove
    # those nodes from the entries list.
    dirlists = []
    sys.stdout.write('-- Fixing affected directory entries lists... ')
    sys.stdout.flush()
    strings_table = os.path.join(repos_path, 'db', 'strings')
    strings_db = bsddb3.btopen(strings_table, 'w')
    reps_table = os.path.join(repos_path, 'db', 'representations')
    reps_db = bsddb3.btopen(reps_table, 'w')
    dirs_fixed = 0
    entries_fixed = 0
    for key in nodes.keys():
        value = nodes[key]
        if value[1]:
            node = nodes_db[key]
            kill_count = fix_affected_dirlists(node, reps_db, strings_db,
                                               affected_nodes, dirlists)
            if kill_count:
                sys.stdout.write('   -- %s\n' % key)
                dirs_fixed = dirs_fixed + 1
                entries_fixed = entries_fixed + kill_count
    sys.stdout.write('done.  [fixed %d entries in %d dirs]\n' \
                     % (entries_fixed, dirs_fixed))

    sys.stdout.write('-- Removing deleted nodes... ')
    sys.stdout.flush()
    for key in affected_nodes:
        del(nodes_db[key])
    sys.stdout.write('done.  [removed %d nodes]\n' % len(affected_nodes))

    # Cleanup after ourselves.
    strings_db.sync()
    nodes_db.sync()
    reps_db.sync()
    strings_db.close()
    reps_db.close()
    nodes_db.close()

        
if __name__ == '__main__':
    main()

