"""Optimise the cache."""

import os
from logging import warn

def _already_linked(a, b):
	ai = os.stat(a)
	bi = os.stat(b)
	return (ai.st_dev, ai.st_ino) == (bi.st_dev, bi.st_ino)

def _byte_identical(a, b):
	af = file(a, 'rb')
	bf = file(b, 'rb')
	while True:
		adata = af.read(100)
		bdata = bf.read(100)
		if adata != bdata:
			return False
		if not adata:
			return True

def _link(a, b, tmpfile):
	"""Keep 'a', delete 'b' and hard-link to 'a'"""
	if not _byte_identical(a, b):
		warn("Files should be identical, but they're not!\n%s\n%s", a, b)

	b_dir = os.path.dirname(b)
	old_mode = os.lstat(b_dir).st_mode
	os.chmod(b_dir, old_mode | 0200)	# Need write access briefly
	try:
		os.link(a, tmpfile)
		try:
			os.rename(tmpfile, b)
		except:
			os.unlink(tmpfile)
			raise
	finally:
		os.chmod(b_dir, old_mode)

def optimise(impl_dir):
	"""Scan an implementation cache directory for duplicate files, and
	hard-link any duplicates together to save space.
	@param impl_dir: a $cache/0install.net/implementations directory
	@type impl_dir: str
	@param verbose: display progress messages
	@type verbose: bool
	@return: (unique bytes, duplicated bytes, already linked, manifest size)
	@rtype: (int, int, int, int)"""

	first_copy = {}		# TypeDigest -> Path
	dup_size = uniq_size = already_linked = man_size = 0

	import random

	for x in range(10):
		tmpfile = os.path.join(impl_dir, 'optimise-%d' % random.randint(0, 1000000))
		if not os.path.exists(tmpfile):
			break
	else:
		raise Exception("Can't generate unused tempfile name!")

	for impl in os.listdir(impl_dir):
		if impl.startswith('.') or '=' not in impl:
			warn("Skipping non-implementation '%s'", impl)
			continue
		manifest_path = os.path.join(impl_dir, impl, '.manifest')
		try:
			ms = file(manifest_path)
		except OSError, ex:
			warn("Failed to read manifest file '%s': %s", manifest_path, str(ex))
			continue

		alg = impl.split('=', 1)[0]
		if alg == 'sha1': continue

		man_size += os.path.getsize(manifest_path)

		dir = ""
		for line in ms:
			if line[0] == 'D':
				type, path = line.split(' ', 1)
				assert path.startswith('/')
				dir = path[1:-1]	# Strip slash and newline
				continue

			if line[0] == "S":
				type, digest, size, rest = line.split(' ', 3)
				uniq_size += long(size)
				continue

			assert line[0] in "FX"

			type, digest, mtime, size, path = line.split(' ', 4)
			path = path[:-1]	# Strip newline
			size = long(size)

			key = (type, digest, mtime, size)
			loc_path = (impl, dir, path)

			first_loc = first_copy.get(key, None)
			if first_loc:
				first_full = os.path.join(impl_dir, *first_loc)
				new_full = os.path.join(impl_dir, *loc_path)
				if _already_linked(first_full, new_full):
					already_linked += size
				else:
					_link(first_full, new_full, tmpfile)
					dup_size += size
			else:
				first_copy[key] = loc_path
				uniq_size += size
	return (uniq_size, dup_size, already_linked, man_size)


syntax highlighted by Code2HTML, v. 0.9.1