#include "base.hh" #include #include #include #include #include "sanity.hh" #include "graph.hh" #include "safe_map.hh" #include "numeric_vocab.hh" #include "hash_map.hh" #include "vocab_hash.hh" using boost::shared_ptr; using std::string; using std::vector; using std::set; using std::pair; using std::map; using std::multimap; using std::make_pair; using std::list; using hashmap::hash_set; void get_reconstruction_path(std::string const & start, reconstruction_graph const & graph, reconstruction_path & path) { // This function does a breadth-first search from a starting point, until it // finds some node that matches an arbitrary condition. The intended usage // is for finding reconstruction paths in a database of deltas -- we start // from the node we want to reconstruct, and follow existing deltas outward // until we reach a full-text base. We return the shortest path from // 'start' to a base version. // // The algorithm involves keeping a set of parallel linear paths, starting // from 'start', that move forward through the DAG until we hit a base. // // On each iteration, we extend every active path by one step. If our // extension involves a fork, we duplicate the path. If any path // contains a cycle, we fault. // // If, by extending a path C, we enter a node which another path // D has already seen, we kill path C. This avoids the possibility of // exponential growth in the number of paths due to extensive forking // and merging. // Long ago, we used to do this with the boost graph library, but it // involved loading too much of the storage graph into memory at any // moment. this imperative version only loads the descendents of the // reconstruction node, so it much cheaper in terms of memory. vector< shared_ptr > live_paths; { shared_ptr pth0 = shared_ptr(new reconstruction_path()); pth0->push_back(start); live_paths.push_back(pth0); } shared_ptr selected_path; set seen_nodes; while (!selected_path) { vector< shared_ptr > next_paths; I(!live_paths.empty()); for (vector >::const_iterator i = live_paths.begin(); i != live_paths.end(); ++i) { shared_ptr pth = *i; string tip = pth->back(); if (graph.is_base(tip)) { selected_path = pth; break; } else { // This tip is not a root, so extend the path. set next; graph.get_next(tip, next); I(!next.empty()); // Replicate the path if there's a fork. bool first = true; for (set::const_iterator j = next.begin(); j != next.end(); ++j) { L(FL("considering %s -> %s") % tip % *j); if (seen_nodes.find(*j) == seen_nodes.end()) { shared_ptr pthN; if (first) { pthN = pth; first = false; } else { // NOTE: this is not the first iteration of the loop, and // the first iteration appended one item to pth. So, we // want to remove one before we use it. (Why not just // copy every time? Because that makes this into an // O(n^2) algorithm, in the common case where there is // only one direction to go at each stop.) pthN = shared_ptr(new reconstruction_path(*pth)); I(!pthN->empty()); pthN->pop_back(); } // check for a cycle... 
#ifdef BUILD_UNIT_TESTS

#include <map>

#include "unit_tests.hh"
#include "randomizer.hh"
#include "lexical_cast.hh"

using boost::lexical_cast;
using std::pair;

typedef std::multimap<string, string> rg_map;
struct mock_reconstruction_graph : public reconstruction_graph
{
  rg_map ancestry;
  set<string> bases;
  mock_reconstruction_graph(rg_map const & ancestry,
                            set<string> const & bases)
    : ancestry(ancestry), bases(bases)
  {}
  virtual bool is_base(string const & node) const
  {
    return bases.find(node) != bases.end();
  }
  virtual void get_next(string const & from, set<string> & next) const
  {
    typedef rg_map::const_iterator ci;
    pair<ci, ci> range = ancestry.equal_range(from);
    for (ci i = range.first; i != range.second; ++i)
      next.insert(i->second);
  }
};

static void
make_random_reconstruction_graph(size_t num_nodes, size_t num_random_edges,
                                 size_t num_random_bases,
                                 vector<string> & all_nodes,
                                 rg_map & ancestry,
                                 set<string> & bases,
                                 randomizer & rng)
{
  for (size_t i = 0; i != num_nodes; ++i)
    all_nodes.push_back(lexical_cast<string>(i));
  // We put a single long chain of edges in, to make sure that everything
  // is reconstructable somehow.
  for (size_t i = 1; i != num_nodes; ++i)
    ancestry.insert(make_pair(idx(all_nodes, i - 1), idx(all_nodes, i)));
  bases.insert(all_nodes.back());
  // Then we insert a bunch of random edges too.  These edges always go
  // forwards, to avoid creating cycles (which make
  // get_reconstruction_path unhappy).
  for (size_t i = 0; i != num_random_edges; ++i)
    {
      size_t from_idx = rng.uniform(all_nodes.size() - 1);
      size_t to_idx = from_idx + 1
        + rng.uniform(all_nodes.size() - 1 - from_idx);
      ancestry.insert(make_pair(idx(all_nodes, from_idx),
                                idx(all_nodes, to_idx)));
    }
  // And a bunch of random bases.
  for (size_t i = 0; i != num_random_bases; ++i)
    bases.insert(idx(all_nodes, rng.uniform(all_nodes.size())));
}

static void
check_reconstruction_path(string const & start,
                          reconstruction_graph const & graph,
                          reconstruction_path const & path)
{
  I(!path.empty());
  I(*path.begin() == start);
  reconstruction_path::const_iterator last = path.end();
  --last;
  I(graph.is_base(*last));
  for (reconstruction_path::const_iterator i = path.begin();
       i != last; ++i)
    {
      set<string> children;
      graph.get_next(*i, children);
      reconstruction_path::const_iterator next = i;
      ++next;
      I(children.find(*next) != children.end());
    }
}

static void
run_get_reconstruction_path_tests_on_random_graph(size_t num_nodes,
                                                  size_t num_random_edges,
                                                  size_t num_random_bases,
                                                  randomizer & rng)
{
  vector<string> all_nodes;
  rg_map ancestry;
  set<string> bases;
  make_random_reconstruction_graph(num_nodes, num_random_edges,
                                   num_random_bases,
                                   all_nodes, ancestry, bases, rng);
  mock_reconstruction_graph graph(ancestry, bases);
  for (vector<string>::const_iterator i = all_nodes.begin();
       i != all_nodes.end(); ++i)
    {
      reconstruction_path path;
      get_reconstruction_path(*i, graph, path);
      check_reconstruction_path(*i, graph, path);
    }
}
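// The same diamond as in the comment after get_reconstruction_path, as a
// small deterministic test (an added sketch using the mock graph above;
// the node names are arbitrary).
UNIT_TEST(graph, simple_get_reconstruction_path)
{
  rg_map ancestry;
  ancestry.insert(make_pair("0", "1"));
  ancestry.insert(make_pair("0", "2"));
  ancestry.insert(make_pair("1", "3"));
  ancestry.insert(make_pair("2", "3"));
  set<string> bases;
  bases.insert("3");
  mock_reconstruction_graph graph(ancestry, bases);
  reconstruction_path path;
  get_reconstruction_path("0", graph, path);
  check_reconstruction_path("0", graph, path);
  // Both routes to the base have exactly three nodes, so the shortest
  // path must have exactly three elements.
  I(path.size() == 3);
}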
UNIT_TEST(graph, random_get_reconstruction_path)
{
  randomizer rng;
  // Some arbitrary numbers.
  run_get_reconstruction_path_tests_on_random_graph(100, 100, 10, rng);
  run_get_reconstruction_path_tests_on_random_graph(100, 200, 5, rng);
  run_get_reconstruction_path_tests_on_random_graph(1000, 1000, 50, rng);
  run_get_reconstruction_path_tests_on_random_graph(1000, 2000, 100, rng);
}

#endif // BUILD_UNIT_TESTS


// graph is a parent->child map
void
toposort_rev_ancestry(rev_ancestry_map const & graph,
                      vector<revision_id> & revisions)
{
  typedef multimap<revision_id, revision_id>::const_iterator gi;
  typedef map<revision_id, int>::iterator pi;

  revisions.clear();
  // determine the number of parents for each rev
  map<revision_id, int> pcount;
  for (gi i = graph.begin(); i != graph.end(); ++i)
    pcount.insert(make_pair(i->first, 0));
  for (gi i = graph.begin(); i != graph.end(); ++i)
    ++(pcount[i->second]);

  // find the set of graph roots
  list<revision_id> roots;
  for (pi i = pcount.begin(); i != pcount.end(); ++i)
    if (i->second == 0)
      roots.push_back(i->first);

  while (!roots.empty())
    {
      revision_id cur = roots.front();
      roots.pop_front();
      if (!null_id(cur))
        revisions.push_back(cur);

      for (gi i = graph.lower_bound(cur);
           i != graph.upper_bound(cur); i++)
        if (--(pcount[i->second]) == 0)
          roots.push_back(i->second);
    }
}
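// For illustration (example data, not from the original code): given the
// parent->child map {A -> B, A -> C, B -> D, C -> D}, pcount comes out as
// A:0, B:1, C:1, D:2, so A is the only root; popping A releases B and C,
// and popping those releases D, giving the valid order [A, B, C, D] --
// every parent precedes all of its children.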
// get_uncommon_ancestors
typedef std::pair<rev_height, revision_id> height_rev_pair;

static void
advance_frontier(set<height_rev_pair> & frontier,
                 hash_set<revision_id> & seen,
                 rev_graph const & rg)
{
  const height_rev_pair h_node = *frontier.rbegin();
  const revision_id & node(h_node.second);
  frontier.erase(h_node);
  set<revision_id> parents;
  rg.get_parents(node, parents);
  for (set<revision_id>::const_iterator r = parents.begin();
       r != parents.end(); r++)
    {
      if (seen.find(*r) == seen.end())
        {
          rev_height h;
          rg.get_height(*r, h);
          frontier.insert(make_pair(h, *r));
          seen.insert(*r);
        }
    }
}

void
get_uncommon_ancestors(revision_id const & a,
                       revision_id const & b,
                       rev_graph const & rg,
                       set<revision_id> & a_uncommon_ancs,
                       set<revision_id> & b_uncommon_ancs)
{
  a_uncommon_ancs.clear();
  b_uncommon_ancs.clear();

  // We extend a frontier from each revision until it reaches a revision
  // that has been seen by the other frontier.  By traversing in
  // descending height order we can ensure that any common ancestor will
  // have been 'seen' by both sides before it is traversed.

  set<height_rev_pair> a_frontier, b_frontier, common_frontier;
  {
    rev_height h;
    rg.get_height(a, h);
    a_frontier.insert(make_pair(h, a));
    rg.get_height(b, h);
    b_frontier.insert(make_pair(h, b));
  }

  hash_set<revision_id> a_seen, b_seen, common_seen;
  a_seen.insert(a);
  b_seen.insert(b);

  while (!a_frontier.empty() || !b_frontier.empty())
    {
      // We take the leaf-most (i.e. highest) height entry from any
      // frontier.  Note: the default height is the lowest possible.
      rev_height a_height, b_height, common_height;
      if (!a_frontier.empty())
        a_height = a_frontier.rbegin()->first;
      if (!b_frontier.empty())
        b_height = b_frontier.rbegin()->first;
      if (!common_frontier.empty())
        common_height = common_frontier.rbegin()->first;

      if (a_height > b_height && a_height > common_height)
        {
          a_uncommon_ancs.insert(a_frontier.rbegin()->second);
          advance_frontier(a_frontier, a_seen, rg);
        }
      else if (b_height > a_height && b_height > common_height)
        {
          b_uncommon_ancs.insert(b_frontier.rbegin()->second);
          advance_frontier(b_frontier, b_seen, rg);
        }
      else if (common_height > a_height && common_height > b_height)
        {
          advance_frontier(common_frontier, common_seen, rg);
        }
      else if (a_height == b_height) // may or may not also == common_height
        {
          // If both frontiers are the same, then we can safely say that
          // we've found all uncommon ancestors.  This stopping condition
          // can result in traversing more nodes than required, but is
          // simple.
          if (a_frontier == b_frontier)
            break;

          common_frontier.insert(*a_frontier.rbegin());
          a_frontier.erase(*a_frontier.rbegin());
          b_frontier.erase(*b_frontier.rbegin());
        }
      else if (a_height == common_height)
        {
          a_frontier.erase(*a_frontier.rbegin());
        }
      else if (b_height == common_height)
        {
          b_frontier.erase(*b_frontier.rbegin());
        }
      else
        I(false);
    }
}
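// For illustration (example data, not from the original code): if R is
// the sole common parent of revisions A and B, each frontier first
// reports its own starting revision (whichever sits at the greater
// height goes first), then both frontiers converge on R at the same
// height; the frontiers then compare equal and the loop stops, so the
// result is a_uncommon_ancs = {A} and b_uncommon_ancs = {B}, with R never
// reported by either side.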
#ifdef BUILD_UNIT_TESTS

#include <algorithm>
#include <iterator>

#include "unit_tests.hh"
#include "randomizer.hh"
#include "roster.hh"

static void
get_all_ancestors(revision_id const & start,
                  rev_ancestry_map const & child_to_parent_map,
                  set<revision_id> & ancestors)
{
  ancestors.clear();
  vector<revision_id> frontier;
  frontier.push_back(start);
  while (!frontier.empty())
    {
      revision_id rid = frontier.back();
      frontier.pop_back();
      if (ancestors.find(rid) != ancestors.end())
        continue;
      safe_insert(ancestors, rid);
      typedef rev_ancestry_map::const_iterator ci;
      pair<ci, ci> range = child_to_parent_map.equal_range(rid);
      for (ci i = range.first; i != range.second; ++i)
        frontier.push_back(i->second);
    }
}

struct mock_rev_graph : rev_graph
{
  mock_rev_graph(rev_ancestry_map const & child_to_parent_map)
    : child_to_parent_map(child_to_parent_map)
  {
    // assign sensible heights
    height_map.clear();

    // toposort expects a parent->child map
    rev_ancestry_map parent_to_child;
    for (rev_ancestry_map::const_iterator
           i = child_to_parent_map.begin();
         i != child_to_parent_map.end(); i++)
      {
        parent_to_child.insert(make_pair(i->second, i->first));
      }
    vector<revision_id> topo_revs;
    toposort_rev_ancestry(parent_to_child, topo_revs);

    // This is ugly but works: just give each one a sequential number.
    rev_height top = rev_height::root_height();
    u32 num = 1;
    for (vector<revision_id>::const_iterator r = topo_revs.begin();
         r != topo_revs.end(); r++, num++)
      {
        height_map.insert(make_pair(*r, top.child_height(num)));
      }
  }

  virtual void get_parents(revision_id const & node,
                           set<revision_id> & parents) const
  {
    parents.clear();
    for (rev_ancestry_map::const_iterator
           i = child_to_parent_map.lower_bound(node);
         i != child_to_parent_map.upper_bound(node); i++)
      {
        if (!null_id(i->second))
          safe_insert(parents, i->second);
      }
  }

  virtual void get_children(revision_id const & node,
                            set<revision_id> & children) const
  {
    // not required
    I(false);
  }

  virtual void get_height(revision_id const & rev, rev_height & h) const
  {
    MM(rev);
    h = safe_get(height_map, rev);
  }

  rev_ancestry_map const & child_to_parent_map;
  map<revision_id, rev_height> height_map;
};

static void
run_a_get_uncommon_ancestors_test(rev_ancestry_map const & child_to_parent_map,
                                  revision_id const & left,
                                  revision_id const & right)
{
  set<revision_id> true_left_ancestors, true_right_ancestors;
  get_all_ancestors(left, child_to_parent_map, true_left_ancestors);
  get_all_ancestors(right, child_to_parent_map, true_right_ancestors);
  set<revision_id> true_left_uncommon_ancestors,
                   true_right_uncommon_ancestors;
  MM(true_left_uncommon_ancestors);
  MM(true_right_uncommon_ancestors);
  set_difference(true_left_ancestors.begin(), true_left_ancestors.end(),
                 true_right_ancestors.begin(), true_right_ancestors.end(),
                 inserter(true_left_uncommon_ancestors,
                          true_left_uncommon_ancestors.begin()));
  set_difference(true_right_ancestors.begin(), true_right_ancestors.end(),
                 true_left_ancestors.begin(), true_left_ancestors.end(),
                 inserter(true_right_uncommon_ancestors,
                          true_right_uncommon_ancestors.begin()));

  set<revision_id> calculated_left_uncommon_ancestors,
                   calculated_right_uncommon_ancestors;
  MM(calculated_left_uncommon_ancestors);
  MM(calculated_right_uncommon_ancestors);
  mock_rev_graph rg(child_to_parent_map);
  get_uncommon_ancestors(left, right, rg,
                         calculated_left_uncommon_ancestors,
                         calculated_right_uncommon_ancestors);
  I(calculated_left_uncommon_ancestors == true_left_uncommon_ancestors);
  I(calculated_right_uncommon_ancestors == true_right_uncommon_ancestors);

  get_uncommon_ancestors(right, left, rg,
                         calculated_right_uncommon_ancestors,
                         calculated_left_uncommon_ancestors);
  I(calculated_left_uncommon_ancestors == true_left_uncommon_ancestors);
  I(calculated_right_uncommon_ancestors == true_right_uncommon_ancestors);
}
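// A deterministic companion to the random tests below (an added sketch;
// the shape matches the two-sibling example in the comment following
// get_uncommon_ancestors).
UNIT_TEST(graph, get_uncommon_ancestors_simple_fork)
{
  rev_ancestry_map child_to_parent_map;
  revision_id root(fake_id()), left(fake_id()), right(fake_id());
  MM(root);
  MM(left);
  MM(right);
  child_to_parent_map.insert(make_pair(left, root));
  child_to_parent_map.insert(make_pair(right, root));
  // The helper checks the frontier-based answers against set differences
  // of the full ancestor sets: here {left} and {right}.
  run_a_get_uncommon_ancestors_test(child_to_parent_map, left, right);
}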
UNIT_TEST(graph, get_uncommon_ancestors_nasty_convexity_case)
{
  // This tests the nasty case described in the giant comment above
  // get_uncommon_ancestors:
  //
  //              9
  //              |\                  . Extraneous dots brought to you by the
  //              8 \                 . Committee to Shut Up the C Preprocessor
  //             /|  \                . (CSUCPP), and viewers like you and me.
  //            / |   |
  //           /  7   |
  //          |   |   |
  //          |   6   |
  //          |   |   |
  //          |   5   |
  //          |   |   |
  //          |   4   |
  //          |   |   |
  //          |   :   |  <-- insert arbitrarily many revisions at the ellipsis
  //          |   :   |
  //          |   |   |
  //          1   2   3
  //           \ / \ /
  //            L   R

  rev_ancestry_map child_to_parent_map;
  revision_id left(fake_id()), right(fake_id());
  revision_id one(fake_id()), two(fake_id()), eight(fake_id()),
    three(fake_id()), nine(fake_id());
  MM(left);
  MM(right);
  MM(one);
  MM(two);
  MM(three);
  MM(eight);
  MM(nine);
  child_to_parent_map.insert(make_pair(left, one));
  child_to_parent_map.insert(make_pair(one, eight));
  child_to_parent_map.insert(make_pair(eight, nine));
  child_to_parent_map.insert(make_pair(right, three));
  child_to_parent_map.insert(make_pair(three, nine));

  revision_id middle(fake_id());
  child_to_parent_map.insert(make_pair(left, two));
  child_to_parent_map.insert(make_pair(right, two));
  // Connect node 2 to the bottom of the long middle chain, as in the
  // diagram above.
  child_to_parent_map.insert(make_pair(two, middle));
  // We insert a _lot_ of revisions at the ellipsis, to make sure that
  // whatever sort of step-size is used on the expansion, we can't take
  // the entire middle portion in one big gulp and make the test
  // pointless.
  for (int i = 0; i != 1000; ++i)
    {
      revision_id next(fake_id());
      child_to_parent_map.insert(make_pair(middle, next));
      middle = next;
    }
  child_to_parent_map.insert(make_pair(middle, eight));

  run_a_get_uncommon_ancestors_test(child_to_parent_map, left, right);
}

double const new_root_freq = 0.05;
double const merge_node_freq = 0.2;
double const skip_up_freq = 0.5;

static revision_id
pick_node_from_set(set<revision_id> const & heads,
                   randomizer & rng)
{
  I(!heads.empty());
  size_t which_start = rng.uniform(heads.size());
  set<revision_id>::const_iterator i = heads.begin();
  for (size_t j = 0; j != which_start; ++j)
    ++i;
  return *i;
}

static revision_id
pick_node_or_ancestor(set<revision_id> const & heads,
                      rev_ancestry_map const & child_to_parent_map,
                      randomizer & rng)
{
  revision_id rev = pick_node_from_set(heads, rng);
  // now we recurse up from this starting point
  while (rng.bernoulli(skip_up_freq))
    {
      typedef rev_ancestry_map::const_iterator ci;
      pair<ci, ci> range = child_to_parent_map.equal_range(rev);
      if (range.first == range.second)
        break;
      ci second = range.first;
      ++second;
      if (second == range.second)
        // there was only one parent
        rev = range.first->second;
      else
        {
          // there are two parents, pick one randomly
          if (rng.flip())
            rev = range.first->second;
          else
            rev = second->second;
        }
    }
  return rev;
}

static void
make_random_graph(size_t num_nodes,
                  rev_ancestry_map & child_to_parent_map,
                  vector<revision_id> & nodes,
                  randomizer & rng)
{
  set<revision_id> heads;

  for (size_t i = 0; i != num_nodes; ++i)
    {
      revision_id new_rid = revision_id(fake_id());
      nodes.push_back(new_rid);
      set<revision_id> parents;
      if (heads.empty() || rng.bernoulli(new_root_freq))
        parents.insert(revision_id());
      else if (rng.bernoulli(merge_node_freq) && heads.size() > 1)
        {
          // maybe we'll pick the same node twice and end up not doing a
          // merge, oh well...
          parents.insert(pick_node_from_set(heads, rng));
          parents.insert(pick_node_from_set(heads, rng));
        }
      else
        {
          parents.insert(pick_node_or_ancestor(heads,
                                               child_to_parent_map, rng));
        }
      for (set<revision_id>::const_iterator j = parents.begin();
           j != parents.end(); ++j)
        {
          heads.erase(*j);
          child_to_parent_map.insert(std::make_pair(new_rid, *j));
        }
      safe_insert(heads, new_rid);
    }
}

static void
run_a_get_uncommon_ancestors_random_test(size_t num_nodes,
                                         size_t iterations,
                                         randomizer & rng)
{
  rev_ancestry_map child_to_parent_map;
  vector<revision_id> nodes;
  make_random_graph(num_nodes, child_to_parent_map, nodes, rng);
  for (size_t i = 0; i != iterations; ++i)
    {
      L(FL("get_uncommon_ancestors: random test %s-%s") % num_nodes % i);
      revision_id left = idx(nodes, rng.uniform(nodes.size()));
      revision_id right = idx(nodes, rng.uniform(nodes.size()));
      run_a_get_uncommon_ancestors_test(child_to_parent_map, left, right);
    }
}

UNIT_TEST(graph, get_uncommon_ancestors_randomly)
{
  randomizer rng;
  run_a_get_uncommon_ancestors_random_test(100, 100, rng);
  run_a_get_uncommon_ancestors_random_test(1000, 100, rng);
  run_a_get_uncommon_ancestors_random_test(10000, 1000, rng);
}

#endif // BUILD_UNIT_TESTS

// Local Variables:
// mode: C++
// fill-column: 76
// c-file-style: "gnu"
// indent-tabs-mode: nil
// End:
// vim: et:sw=2:sts=2:ts=2:cino=>2s,{s,\:s,+s,t0,g0,^-2,e-2,n-2,p2s,(0,=s: