/* * vdelta.c: vdelta generator. * * ==================================================================== * Copyright (c) 2000-2004 CollabNet. All rights reserved. * * This software is licensed as described in the file COPYING, which * you should have received as part of this distribution. The terms * are also available at http://subversion.tigris.org/license-1.html. * If newer versions of this license are posted there, you may use a * newer version instead, at your option. * * This software consists of voluntary contributions made by many * individuals. For exact contribution history, see the revision * history and logs, available at http://subversion.tigris.org/. * ==================================================================== */ #include #include /* for APR_INLINE */ #include "svn_delta.h" #include "delta.h" /* ==================================================================== */ /* Hash table for vdelta hashing. Each hash bucket is a chain of slots. The index of the slot in the slots array is also the index of the key string in the current window's data stream. The hash table implements a multimap (i.e., hash and key collisions are allowed). To store a key->index mapping, just add slot[index] to the slot chain tn key's bucket (see store_mapping). For a given key, you can traverse the list of match candidates (some of which may be hash collisions) like this: for (slot = buckets[get_bucket(key)]; slot != NULL; slot = slot->next) { index = slot - slots; ... } */ /* Size of a vdelta hash key. */ #define VD_KEY_SIZE 4 /* Hash slot. */ typedef struct hash_slot_t { struct hash_slot_t *next; } hash_slot_t; /* Hash table. */ typedef struct hash_table_t { int num_buckets; /* Number of buckets in the table. */ hash_slot_t **buckets; /* Bucket array. */ hash_slot_t *slots; /* Slots array. */ } hash_table_t; /* Create a hash table with NUM_SLOTS slots. NUM_SLOTS should be the sum of the size of the source and target parts of the delta window. Allocate from POOL. */ static hash_table_t * create_hash_table(apr_size_t num_slots, apr_pool_t *pool) { int i; apr_size_t j; hash_table_t* table = apr_palloc(pool, sizeof(*table)); /* This should be a reasonable number of buckets ... */ table->num_buckets = (num_slots / 3) | 1; table->buckets = apr_palloc(pool, (table->num_buckets * sizeof(*table->buckets))); for (i = 0; i < table->num_buckets; ++i) table->buckets[i] = NULL; table->slots = apr_palloc(pool, num_slots * sizeof(*table->slots)); for (j = 0; j < num_slots; ++j) table->slots[j].next = NULL; return table; } /* Convert a key to a pointer to the key's hash bucket. We use a 2-universal multiplicative hash function. If you're wondering about the selected multiplier, take a look at the comments in apr/tables/apr_hash.c:find_entry for a discussion on fast string hashes; it's very illuminating. [ We use 127 instead of 33 here because I happen to like interesting prime numbers, so there. --xbc ] */ static APR_INLINE apr_uint32_t get_bucket(const hash_table_t *table, const char *key) { int i; apr_uint32_t hash = 0; for (i = 0; i < VD_KEY_SIZE; ++i) hash = hash * 127 + *key++; return hash % table->num_buckets; } /* Store a key->index mapping into the hash table. */ static APR_INLINE void store_mapping(hash_table_t *table, const char* key, apr_size_t idx) { apr_uint32_t bucket = get_bucket(table, key); assert(table->slots[idx].next == NULL); table->slots[idx].next = table->buckets[bucket]; table->buckets[bucket] = &table->slots[idx]; } /* ==================================================================== */ /* Vdelta generator. The article "Delta Algorithms: An Empirical Analysis" by Hunt, Vo and Tichy contains a description of the vdelta algorithm, but it's incomplete. Here's a detailed description: 1. Look up the four bytes starting at the current position pointer. If there are no matches for those four bytes, output an insert, move the position pointer forward by one, and go back to step 1. 2. Determine which of the candidates yields the longest extension. This will be called the "current match". 3. Look up the last three bytes of the current match plus one unmatched byte. If there is no match for those four bytes, the current match is the best match; go to step 6. 4. For each candidate, check backwards to see if it matches the entire match so far. If no candidates satisfy that constraint, the current match is the best match; go to step 6. 5. Among the candidates which do satisfy the constraint, determine which one yields the longest extension. This will be the new "current match." Go back to step 3. 6. Output a block copy instruction, add indexes for the last three positions of the matched data, advance the position pointer by the length of the match, and go back to step 1. Inserts and copies are generated only when the current position is within the target data. Note that the vdelta algorithm allows copies that cross the source/target data boundary. Because our internal delta representation has different opcodes for source and target copies, we split them in two. This means that the opcode stream in the delta window can contain copies shorter than VD_KEY_SIZE. These could be represented by insert ops instead, but we'll leave them in, so that we can merge them again when we convert the delta window to an external format like vcdiff that supports cross -boundary copies. */ /* Find the length of a match within the data window. Note that (match < from && from <= end) must always be true here. */ static APR_INLINE int find_match_len(const char *match, const char *from, const char *end) { const char *here = from; while (here < end && *match == *here) { ++match; ++here; } return here - from; } /* This is the main vdelta generator. */ static void vdelta(svn_txdelta__ops_baton_t *build_baton, const char *data, const char *start, const char *end, svn_boolean_t outputflag, hash_table_t *table, apr_pool_t *pool) { const char *here = start; /* Current position in the buffer. */ const char *insert_from = NULL; /* Start of byte range for insertion. */ for (;;) { const char *current_match, *key; apr_size_t current_match_len = 0; hash_slot_t *slot; svn_boolean_t progress; /* If we're near the end, just insert the last few bytes. */ if (end - here < VD_KEY_SIZE) { const char *from = ((insert_from != NULL) ? insert_from : here); if (outputflag && from < end) svn_txdelta__insert_op(build_baton, svn_txdelta_new, 0, end - from, from, pool); return; } /* Search for the longest match. */ current_match = NULL; current_match_len = 0; key = here; do { /* Try to extend the current match. Our key is the last three matched bytes plus one unmatched byte if we already have a current match, or just the four bytes where we are if we don't have a current match yet. See which mapping yields the longest extension. */ progress = FALSE; for (slot = table->buckets[get_bucket(table, key)]; slot != NULL; slot = slot->next) { const char *match; apr_size_t match_len; if (slot - table->slots < key - here) /* Too close to start */ continue; match = data + (slot - table->slots) - (key - here); match_len = find_match_len(match, here, end); /* We can only copy from the source or from the target, so don't let the match cross START. */ if (match < start && match + match_len > start) match_len = start - match; if (match_len >= VD_KEY_SIZE && match_len > current_match_len) { /* We have a longer match; record it. */ current_match = match; current_match_len = match_len; progress = TRUE; } } if (progress) key = here + current_match_len - (VD_KEY_SIZE - 1); } while (progress && end - key >= VD_KEY_SIZE); if (current_match_len < VD_KEY_SIZE) { /* There is no match here; store a mapping and insert this byte. */ store_mapping(table, here, here - data); if (insert_from == NULL) insert_from = here; here++; continue; } else if (outputflag) { if (insert_from != NULL) { /* Commit the pending insert. */ svn_txdelta__insert_op(build_baton, svn_txdelta_new, 0, here - insert_from, insert_from, pool); insert_from = NULL; } if (current_match < start) /* Copy from source. */ svn_txdelta__insert_op(build_baton, svn_txdelta_source, current_match - data, current_match_len, NULL, pool); else /* Copy from target */ svn_txdelta__insert_op(build_baton, svn_txdelta_target, current_match - start, current_match_len, NULL, pool); } /* Adjust the current position and insert mappings for the last three bytes of the match. */ here += current_match_len; if (end - here >= VD_KEY_SIZE) { char const *last = here - (VD_KEY_SIZE - 1); for (; last < here; ++last) store_mapping(table, last, last - data); } } } void svn_txdelta__vdelta(svn_txdelta__ops_baton_t *build_baton, const char *data, apr_size_t source_len, apr_size_t target_len, apr_pool_t *pool) { hash_table_t *table = create_hash_table(source_len + target_len, pool); vdelta(build_baton, data, data, data + source_len, FALSE, table, pool); vdelta(build_baton, data, data + source_len, data + source_len + target_len, TRUE, table, pool); #if 0 /* This bit of code calculates the hash load and the number of collisions. Please note that a the number of collisions per bucket is one less than the length of the chain. :-) --xbc */ { int i; int empty = 0; int collisions = 0; for (i = 0; i < table->num_buckets; ++i) { hash_slot_t *slot = table->buckets[i]; if (!slot) ++empty; else { slot = slot->next; while (slot != NULL) { ++collisions; slot = slot->next; } } } fprintf(stderr, "Hash stats: load %d, collisions %d\n", 100 - 100 * empty / table->num_buckets, collisions); } #endif }