#!/usr/bin/env python
"""Whitewash the contents of a Subversion file and its successors.
Usage: svn-obliterate.py REPOS_PATH PATH REVISION
"""
import sys
import os
import string
import re
import bsddb3
from svn import repos, fs, core
### TODO: Clean out the transactions table.
### TODO: Clean out the other stuff (maybe).
def die(msg):
    """Report MSG on stderr (newline-terminated) and exit with status 1."""
    line = msg + '\n'
    sys.stderr.write(line)
    raise SystemExit(1)
def get_rep_keys(skel):
    """Pull the PROP-KEY and DATA-KEY atoms out of a node-revision skel.

    SKEL is the raw skel string for a node revision.  It must start with
    '((file ' or '((dir ', followed by an explicitly sized path, the rest
    of the header up to its closing ')', and then the keys.  Each key may
    be preceded by a skel atom size token (a run of digits), which is
    skipped.

    Returns a (prop_key, data_key) tuple.  An element is None when the
    corresponding atom is empty or absent.
    """
    # The lead-in is one character longer for files than for directories.
    # Anything that is not a file skel is treated as a directory skel,
    # which is what this function historically assumed for every input.
    if skel[0:7] == '((file ':
        offset = 7
    else:
        offset = 6

    size, rest = skel[offset:].split(' ', 1)
    # Step over the path itself plus the single separator character
    # that follows it.
    rest = rest[int(size) + 1:]
    end_header = rest.find(')')
    # Everything after the header's ') ', minus the skel's closing paren.
    pieces = rest[end_header + 2:-1].split(' ')

    prop_key = None
    data_key = None

    # A leading digit marks an atom size token rather than the key itself.
    # Guard against empty tokens before peeking at the first character.
    if pieces[0] and pieces[0][0] in string.digits:
        del pieces[0]
    if pieces and pieces[0]:
        prop_key = pieces[0]

    if len(pieces) > 1 and pieces[1] and pieces[1][0] in string.digits:
        del pieces[1]
    if len(pieces) > 1 and pieces[1]:
        data_key = pieces[1]

    return prop_key, data_key
def read_string(strings_db, string_key):
    """Return the concatenation of every record stored under STRING_KEY.

    STRINGS_DB must provide set_location(key) and next(), each returning
    a (key, value) pair.  Reading starts at the record set_location()
    yields and continues while successive records carry STRING_KEY.

    Returns '' if the first record located has a different key.  An
    error raised by set_location() itself is not caught.
    """
    chunks = []
    key, value = strings_db.set_location(string_key)
    while key == string_key:
        chunks.append(value)
        try:
            key, value = strings_db.next()
        except KeyError:
            # Ran off the end of the table: the string we were reading
            # was the last one stored.
            # NOTE(review): assumes the cursor signals end-of-table with
            # a KeyError (or subclass) -- confirm for the bsddb3 build
            # in use.
            break
    return ''.join(chunks)
def unparse_dirent_skel(entries):
    """Serialize a {name: node-id} mapping into a directory-entries skel.

    Each entry becomes '(LEN NAME LEN ID)', with explicit lengths for
    both atoms.  Entries are separated by single spaces and the whole
    list is wrapped in parentheses.  An empty mapping yields '()'.
    """
    items = ['(%d %s %d %s)' % (len(name), name, len(node_id), node_id)
             for name, node_id in entries.items()]
    return '(%s)' % ' '.join(items)
# Matches the node-id half of one directory entry: an optional atom size
# token, three dot-separated alphanumeric fields, and the entry's ')'.
_dirent_id_re = re.compile(
    r'([0-9]+ )?([a-zA-Z0-9]+\.[a-zA-Z0-9]+\.[a-zA-Z0-9]+)\)')


def parse_dirent_skel(skel):
    """Parse a directory-entries skel into a {name: node-id} dictionary.

    SKEL is a parenthesized list of '(NAME ID)' entries in which either
    atom may carry an explicit size token.  Parsing stops quietly at the
    first entry that does not fit that shape; entries parsed before that
    point are still returned.
    """
    # Drop the outer parentheses.
    skel = skel[1:-1]
    entries = {}
    while skel and skel[0] == '(':
        parts = skel[1:].split(' ', 1)
        if len(parts) != 2:
            # No separator at all: treat like any other malformed entry.
            break
        token, rest = parts
        if skel[1] in string.digits:
            # Explicit size: the name is exactly that many characters and
            # may itself contain spaces.
            size = token
            name = rest[0:int(size)]
            # '(' + size token + ' ' + name + ' '
            rest = skel[1 + len(size) + 1 + int(size) + 1:]
        else:
            name = token
        match = _dirent_id_re.match(rest)
        if not match:
            break
        entries[name] = match.group(2)
        # Skip the matched id, its ')' and the space before the next entry.
        skel = rest[len(match.group(0)) + 1:]
    return entries
# Matches the start of a fulltext representation skel up to (and including)
# the 'md5 ' checksum tag and its optional '16 ' size token.  Group 1 is
# everything before the '(md5' list.  Written as a raw string so the
# backslash escapes reach the regex engine untouched.
_fulltext_re = re.compile(r'^(\(\(fulltext [^\(]+)\(md5 (16 )?')
def fix_affected_dirlists(node, reps_db, strings_db, affected_nodes, dirlists):
    """Remove to-be-purged nodes from one directory's entries list.

    NODE is the directory's node-revision skel.  REPS_DB and STRINGS_DB
    are the open 'representations' and 'strings' tables.  AFFECTED_NODES
    holds the node ids being purged.  DIRLISTS is accepted for interface
    compatibility and is not used.

    If any entry of the directory refers to an affected node, the entries
    string is rewritten without those entries and the representation is
    rewritten with an all-zero md5 checksum.

    Returns the number of entries removed (0 if the node has no data
    representation).  Exits via die() if the representation is not a
    fulltext one.
    """
    prop_key, data_key = get_rep_keys(node)
    if not data_key:
        return 0
    data_rep = reps_db[data_key]

    # See if this is a fulltext rep.  If so, the STRING-KEY is a
    # pretty easy find.  We'll wipe that STRING-KEY, and clear the
    # checksum from the REPRESENTATION.
    match = _fulltext_re.match(data_rep)
    if not match:
        die('Unable to handle non-fulltext dirent list "%s"' % data_key)

    # Skip the 16-byte digest and the ')) ' after it; drop the final ')'.
    rep_rest = data_rep[len(match.group(0)) + 16 + 3:-1]
    # The key is the last token; an atom size token may precede it.
    string_key = rep_rest.split(' ')[-1]
    entries = parse_dirent_skel(read_string(strings_db, string_key))

    kill_count = 0
    # Iterate over a snapshot, since entries are deleted along the way.
    for name, node_id in list(entries.items()):
        if node_id in affected_nodes:
            kill_count = kill_count + 1
            del entries[name]

    if kill_count:
        ### begin txn!
        del strings_db[string_key]
        strings_db[string_key] = unparse_dirent_skel(entries)
        reps_db[data_key] = match.group(1) + \
            '(md5 16 \0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0)) ' + \
            str(len(string_key)) + ' ' + string_key + ')'
        ### end txn!
    return kill_count
def parse_node_skel(skel):
    """Extract the predecessor id and node kind from a node-revision skel.

    SKEL must start with '((file ' or '((dir ', followed by an explicitly
    sized path.  The predecessor id, if any, immediately follows the
    path, possibly preceded by a skel atom size token.

    Returns (prev_id, is_dir): PREV_ID is None when the header carries no
    predecessor, and IS_DIR is 1 for directories and 0 for files.  Exits
    via die() when SKEL has neither recognized prefix.
    """
    is_dir = 0
    if skel[0:7] == '((file ':
        size, rest = skel[7:].split(' ', 1)
    elif skel[0:6] == '((dir ':
        is_dir = 1
        size, rest = skel[6:].split(' ', 1)
    else:
        die("Unable to parse skel '%s'" % skel)

    # Step over the path and the separator after it, then keep only what
    # remains of the header (up to its closing paren).
    rest = rest[int(size) + 1:]
    rest = rest[:rest.find(')')]
    pieces = rest.split(' ')

    prev_id = None
    # A leading digit marks an atom size token; an empty first token means
    # the header ended right after the path.
    if pieces[0] and pieces[0][0] in string.digits:
        del pieces[0]
    if pieces and pieces[0] != '':
        prev_id = pieces[0]
    return prev_id, is_dir
def get_node_id(pool, repos_path, path, revision):
    """Return the unparsed node revision id of PATH in REVISION.

    Opens the repository at REPOS_PATH, takes the root of REVISION
    (converted with int()), looks up the node id of PATH under that root
    and returns its unparsed form.  POOL is passed to every binding call
    that takes one.
    """
    # Repository handle first, then the filesystem that lives inside it.
    filesystem = repos.fs(repos.open(repos_path, pool))

    # Resolve PATH against the requested revision's root.
    root = fs.revision_root(filesystem, int(revision), pool)
    node_rev_id = fs.node_id(root, path, pool)
    return fs.unparse_id(node_rev_id, pool)
def append_successors(nodes, node_id, affected_nodes):
    """Append NODE_ID and all of its successors to AFFECTED_NODES.

    NODES maps a node id to a sequence whose third element is the list
    of that node's successor ids.  Ids are appended in depth-first
    pre-order: a node first, then each successor subtree in list order.

    The walk uses an explicit stack rather than recursion, so a long
    successor chain cannot exceed the interpreter's recursion limit.
    Raises KeyError if an id is not present in NODES.
    """
    pending = [node_id]
    while pending:
        current = pending.pop()
        # Look the node up before recording it, so an unknown id is not
        # added to the list.
        successors = list(nodes[current][2])
        affected_nodes.append(current)
        # Push in reverse so the first successor is visited first.
        successors.reverse()
        pending.extend(successors)
def main():
    """Command-line entry point.

    As written, this prints a warning and exits with status 0 before
    doing anything else.  The code after that guard is kept for when the
    tool learns to purge the 'changes' table.  It would: resolve
    PATH@REVISION to a node revision id, build a predecessor/successor
    tree from the 'nodes' table, collect the nodes to purge, strip them
    from every directory entries list, and delete them from 'nodes'.
    """
    kill_preds = 1

    ### Until this thing learns to purge the 'changes', it is
    ### basically useless (because dumps/loads are entirely
    ### 'changes'-table driven).  So just bail.
    sys.stdout.write("This script will, at the moment, destroy your repository.\n")
    sys.stdout.write("You don't really want that, right?\n")
    sys.exit(0)

    # Parse the commandline arguments.
    if len(sys.argv) < 4:
        sys.stdout.write(__doc__ + '\n')
        sys.exit(1)
    repos_path, path, revision = sys.argv[1:4]

    # Fetch the NODE-REV-ID of the PATH@REV which holds our interest.
    sys.stdout.write('Harvesting info for "%s" in r%s.\n' % \
                     (path, revision))
    sys.stdout.write('-- Determining node revision ID... ')
    sys.stdout.flush()
    node_id = core.run_app(get_node_id, repos_path, path, revision)
    sys.stdout.write('done. [%s]\n' % node_id)

    # Scan the nodes table, parsing skels and building a node tree.
    # Each value is [prev_id, is_dir, [successor ids]].
    nodes = {}
    sys.stdout.write('-- Building node tree... ')
    sys.stdout.flush()
    nodes_table = os.path.join(repos_path, 'db', 'nodes')
    nodes_db = bsddb3.btopen(nodes_table, 'w')
    for key in nodes_db.keys():
        if key == 'next-key':
            continue
        prev_id, is_dir = parse_node_skel(nodes_db[key])
        nodes[key] = [prev_id, is_dir, []]
    # Second pass: register every node as a successor of its predecessor.
    for key in nodes.keys():
        prev_id = nodes[key][0]
        if prev_id:
            nodes[prev_id][2].append(key)
    sys.stdout.write('done. [found %d node(s)]\n' % len(nodes))

    # Determine the nodes we wish to purge.
    affected_nodes = []
    sys.stdout.write('-- Building node purge list... ')
    sys.stdout.flush()
    start_id = node_id
    if kill_preds:
        # Walk back to the oldest predecessor so the whole line of
        # history is purged, not just NODE_ID and what came after it.
        while nodes[start_id][0]:
            start_id = nodes[start_id][0]
    append_successors(nodes, start_id, affected_nodes)
    sys.stdout.write('done. [found %d node(s)]\n' % len(affected_nodes))
    for purge_id in affected_nodes:
        sys.stdout.write('  -- %s\n' % purge_id)

    # Now, the hard part.  We need to find every directory listing
    # that contains one of our to-be-purged nodes, and then remove
    # those nodes from the entries list.
    dirlists = []
    sys.stdout.write('-- Fixing affected directory entries lists... ')
    sys.stdout.flush()
    strings_table = os.path.join(repos_path, 'db', 'strings')
    strings_db = bsddb3.btopen(strings_table, 'w')
    reps_table = os.path.join(repos_path, 'db', 'representations')
    reps_db = bsddb3.btopen(reps_table, 'w')
    dirs_fixed = 0
    entries_fixed = 0
    for key in nodes.keys():
        # Only directories carry entries lists.
        if nodes[key][1]:
            node = nodes_db[key]
            kill_count = fix_affected_dirlists(node, reps_db, strings_db,
                                               affected_nodes, dirlists)
            if kill_count:
                sys.stdout.write('  -- %s\n' % key)
                dirs_fixed = dirs_fixed + 1
                entries_fixed = entries_fixed + kill_count
    sys.stdout.write('done. [fixed %d entries in %d dirs]\n' \
                     % (entries_fixed, dirs_fixed))

    sys.stdout.write('-- Removing deleted nodes... ')
    sys.stdout.flush()
    for key in affected_nodes:
        del nodes_db[key]
    sys.stdout.write('done. [removed %d nodes]\n' % len(affected_nodes))

    # Cleanup after ourselves.
    strings_db.sync()
    nodes_db.sync()
    reps_db.sync()
    strings_db.close()
    reps_db.close()
    nodes_db.close()
# Run main() only when this file is executed as a script, not when it is
# imported as a module.
if __name__ == '__main__':
    main()
# syntax highlighted by Code2HTML, v. 0.9.1