#ifndef __DATABASE_HH__ #define __DATABASE_HH__ // Copyright (C) 2002 Graydon Hoare // // This program is made available under the GNU GPL version 2.0 or // greater. See the accompanying file COPYING for details. // // This program is distributed WITHOUT ANY WARRANTY; without even the // implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR // PURPOSE. struct sqlite3; struct sqlite3_stmt; struct cert; int sqlite3_finalize(sqlite3_stmt *); #include "vector.hh" #include #include #include "numeric_vocab.hh" #include "vocab.hh" #include "paths.hh" #include "cleanup.hh" #include "roster.hh" #include "selectors.hh" #include "graph.hh" // FIXME: would be better not to include this everywhere #include "outdated_indicator.hh" #include "lru_writeback_cache.hh" // this file defines a public, typed interface to the database. // the database class encapsulates all knowledge about sqlite, // the schema, and all SQL statements used to access the schema. // // one thing which is rather important to note is that this file // deals with two sorts of version relationships. the versions // stored in the database are all *backwards* from those the program // sees. so for example if you have two versions of a file // // file.1, file.2 // // where file.2 was a modification of file.1, then as far as the rest of // the application is concerned -- and the ancestry graph -- file.1 is the // "old" version and file.2 is the "new" version. note the use of terms // which describe time, and the sequence of edits a user makes to a // file. those are ancestry terms. when the application composes a // patchset, for example, it'll contain the diff delta(file.1, file.2) // // from the database's perspective, however, file.1 is the derived version, // and file.2 is the base version. the base version is stored in the // "files" table, and the *reverse* diff delta(file.2, file.1) is stored in // the "file_deltas" table, under the id of file.1, with the id of file.2 // listed as its base. note the use of the terms which describe // reconstruction; those are storage-system terms. // // the interface *to* the database, and the ancestry version graphs, use // the old / new metaphor of ancestry, but within the database (including // the private helper methods, and the storage version graphs) the // base/derived storage metaphor is used. the only real way to tell which // is which is to look at the parameter names and code. I might try to // express this in the type system some day, but not presently. // // the key phrase to keep repeating when working on this code is: // // "base files are new, derived files are old" // // it makes the code confusing, I know. this is possibly the worst part of // the program. I don't know if there's any way to make it clearer. class transaction_guard; class app_state; struct revision_t; struct query; class rev_height; struct globish; class database { // // --== Opening the database and schema checking ==-- // private: system_path filename; app_state * __app; struct sqlite3 * __sql; enum open_mode { normal_mode = 0, schema_bypass_mode, format_bypass_mode }; void install_functions(app_state * app); struct sqlite3 * sql(enum open_mode mode = normal_mode); void check_filename(); void check_db_exists(); void check_db_nonexistent(); void open(); void close(); void check_format(); public: database(system_path const & file); ~database(); void set_app(app_state * app); void set_filename(system_path const & file); system_path get_filename(); bool is_dbfile(any_path const & file); void ensure_open(); void ensure_open_for_format_changes(); private: void ensure_open_for_maintenance(); public: void check_is_not_rosterified(); bool database_specified(); // // --== Basic SQL interface and statement caching ==-- // private: struct statement { statement() : count(0), stmt(0, sqlite3_finalize) {} int count; cleanup_ptr stmt; }; std::map statement_cache; typedef std::vector< std::vector > results; void fetch(results & res, int const want_cols, int const want_rows, query const & q); void execute(query const & q); bool table_has_entry(std::string const & key, std::string const & column, std::string const & table); // // --== Generic database metadata gathering ==-- // private: std::string count(std::string const & table); std::string space(std::string const & table, std::string const & concatenated_columns, u64 & total); unsigned int page_size(); unsigned int cache_size(); // // --== Transactions ==-- // private: int transaction_level; bool transaction_exclusive; void begin_transaction(bool exclusive); void commit_transaction(); void rollback_transaction(); friend class transaction_guard; // // --== Write-buffering -- tied into transaction ==-- // --== machinery and delta compression machinery ==-- // public: typedef boost::shared_ptr roster_t_cp; typedef boost::shared_ptr marking_map_cp; typedef std::pair cached_roster; private: struct roster_size_estimator { unsigned long operator() (cached_roster const &); }; struct roster_writeback_manager { database & db; roster_writeback_manager(database & db) : db(db) {} void writeout(revision_id const &, cached_roster const &); }; LRUWritebackCache roster_cache; size_t size_delayed_file(file_id const & id, file_data const & dat); bool have_delayed_file(file_id const & id); void load_delayed_file(file_id const & id, file_data & dat); void cancel_delayed_file(file_id const & id); void drop_or_cancel_file(file_id const & id); void schedule_delayed_file(file_id const & id, file_data const & dat); std::map delayed_files; size_t delayed_writes_size; void flush_delayed_writes(); void clear_delayed_writes(); void write_delayed_file(file_id const & new_id, file_data const & dat); void write_delayed_roster(revision_id const & new_id, roster_t const & roster, marking_map const & marking); // // --== Reading/writing delta-compressed objects ==-- // private: // "do we have any entry for 'ident' that is a base version" bool file_or_manifest_base_exists(hexenc const & ident, std::string const & table); bool roster_base_stored(revision_id const & ident); bool roster_base_available(revision_id const & ident); // "do we have any entry for 'ident' that is a delta" bool delta_exists(std::string const & ident, std::string const & table); bool delta_exists(std::string const & ident, std::string const & base, std::string const & table); void get_file_or_manifest_base_unchecked(hexenc const & new_id, data & dat, std::string const & table); void get_file_or_manifest_delta_unchecked(hexenc const & ident, hexenc const & base, delta & del, std::string const & table); void get_roster_base(std::string const & ident, roster_t & roster, marking_map & marking); void get_roster_delta(std::string const & ident, std::string const & base, roster_delta & del); friend struct file_and_manifest_reconstruction_graph; friend struct roster_reconstruction_graph; void get_version(hexenc const & ident, data & dat, std::string const & data_table, std::string const & delta_table); void drop(std::string const & base, std::string const & table); void put_file_delta(file_id const & ident, file_id const & base, file_delta const & del); private: void put_roster_delta(revision_id const & ident, revision_id const & base, roster_delta const & del); void put_version(hexenc const & old_id, hexenc const & new_id, delta const & del, std::string const & data_table, std::string const & delta_table); void put_roster(revision_id const & rev_id, roster_t_cp const & roster, marking_map_cp const & marking); public: bool file_version_exists(file_id const & ident); bool revision_exists(revision_id const & ident); bool roster_link_exists_for_revision(revision_id const & ident); bool roster_exists_for_revision(revision_id const & ident); // get plain version if it exists, or reconstruct version // from deltas (if they exist) void get_file_version(file_id const & ident, file_data & dat); // put file w/o predecessor into db void put_file(file_id const & new_id, file_data const & dat); // store new version and update old version to be a delta void put_file_version(file_id const & old_id, file_id const & new_id, file_delta const & del); void get_arbitrary_file_delta(file_id const & src_id, file_id const & dst_id, file_delta & del); // get plain version if it exists, or reconstruct version // from deltas (if they exist). void get_manifest_version(manifest_id const & ident, manifest_data & dat); // // --== The ancestry graph ==-- // public: void get_revision_ancestry(rev_ancestry_map & graph); void get_revision_parents(revision_id const & ident, std::set & parents); void get_revision_children(revision_id const & ident, std::set & children); void get_leaves(std::set & leaves); void get_revision_manifest(revision_id const & cid, manifest_id & mid); private: // helper void get_ids(std::string const & table, std::set< hexenc > & ids); public: void get_revision_ids(std::set & ids); // this is exposed for 'db check': void get_file_ids(std::set & ids); // // --== Revision reading/writing ==-- // private: void deltify_revision(revision_id const & rid); public: void get_revision(revision_id const & ident, revision_t & cs); void get_revision(revision_id const & ident, revision_data & dat); bool put_revision(revision_id const & new_id, revision_t const & rev); bool put_revision(revision_id const & new_id, revision_data const & dat); // // --== Rosters ==-- // private: u64 next_id_from_table(std::string const & table); void get_roster_version(revision_id const & ros_id, cached_roster & cr); public: node_id next_node_id(); void get_roster(revision_id const & rid, roster_t & roster); void get_roster(revision_id const & rid, roster_t & roster, marking_map & marks); void get_roster(revision_id const & rid, cached_roster & cr); // these are exposed for the use of database_check.cc bool roster_version_exists(revision_id const & ident); void get_roster_ids(std::set & ids); // using roster deltas void get_markings(revision_id const & id, node_id const & nid, marking_t & markings); void get_file_content(revision_id const & id, node_id const & nid, file_id & content); private: struct extractor; struct file_content_extractor; struct markings_extractor; void extract_from_deltas(revision_id const & id, extractor & x); // // --== Keys ==-- // private: void get_keys(std::string const & table, std::vector & keys); public: void get_key_ids(std::vector & pubkeys); void get_key_ids(globish const & pattern, std::vector & pubkeys); void get_public_keys(std::vector & pubkeys); bool public_key_exists(hexenc const & hash); bool public_key_exists(rsa_keypair_id const & ident); void get_pubkey(hexenc const & hash, rsa_keypair_id & ident, base64 & pub_encoded); void get_key(rsa_keypair_id const & ident, base64 & pub_encoded); bool put_key(rsa_keypair_id const & ident, base64 const & pub_encoded); void delete_public_key(rsa_keypair_id const & pub_id); // // --== Certs ==-- // // note: this section is ridiculous. please do something about it. private: bool cert_exists(cert const & t, std::string const & table); void put_cert(cert const & t, std::string const & table); void results_to_certs(results const & res, std::vector & certs); void get_certs(std::vector< cert > & certs, std::string const & table); void get_certs(hexenc const & ident, std::vector< cert > & certs, std::string const & table); void get_certs(cert_name const & name, std::vector< cert > & certs, std::string const & table); void get_certs(hexenc const & ident, cert_name const & name, std::vector< cert > & certs, std::string const & table); void get_certs(hexenc const & ident, cert_name const & name, base64 const & val, std::vector< cert > & certs, std::string const & table); void get_certs(cert_name const & name, base64 const & val, std::vector & certs, std::string const & table); outdated_indicator_factory cert_stamper; public: bool revision_cert_exists(revision const & cert); bool revision_cert_exists(hexenc const & hash); bool put_revision_cert(revision const & cert); // this variant has to be rather coarse and fast, for netsync's use outdated_indicator get_revision_cert_nobranch_index(std::vector< std::pair, std::pair > > & idx); // Only used by database_check.cc outdated_indicator get_revision_certs(std::vector< revision > & certs); outdated_indicator get_revision_certs(cert_name const & name, std::vector< revision > & certs); outdated_indicator get_revision_certs(revision_id const & ident, cert_name const & name, std::vector< revision > & certs); // Only used by get_branch_certs (project.cc) outdated_indicator get_revision_certs(cert_name const & name, base64 const & val, std::vector< revision > & certs); // Only used by revision_is_in_branch (project.cc) outdated_indicator get_revision_certs(revision_id const & ident, cert_name const & name, base64 const & value, std::vector< revision > & certs); // Only used by get_branch_heads (project.cc) outdated_indicator get_revisions_with_cert(cert_name const & name, base64 const & value, std::set & revisions); // Used through project.cc, and by // anc_graph::add_node_for_oldstyle_revision (revision.cc) outdated_indicator get_revision_certs(revision_id const & ident, std::vector< revision > & certs); // Used through get_revision_cert_hashes (project.cc) outdated_indicator get_revision_certs(revision_id const & ident, std::vector< hexenc > & hashes); void get_revision_cert(hexenc const & hash, revision & c); void get_manifest_certs(manifest_id const & ident, std::vector< manifest > & certs); void get_manifest_certs(cert_name const & name, std::vector< manifest > & certs); // // --== Epochs ==-- // public: void get_epochs(std::map & epochs); void get_epoch(epoch_id const & eid, branch_name & branch, epoch_data & epo); bool epoch_exists(epoch_id const & eid); void set_epoch(branch_name const & branch, epoch_data const & epo); void clear_epoch(branch_name const & branch); // // --== Database 'vars' ==-- // public: void get_vars(std::map & vars); void get_var(var_key const & key, var_value & value); bool var_exists(var_key const & key); void set_var(var_key const & key, var_value const & value); void clear_var(var_key const & key); // // --== Completion ==-- // public: void complete(std::string const & partial, std::set & completions); void complete(std::string const & partial, std::set & completions); void complete(std::string const & partial, std::set< std::pair > & completions); void complete(selectors::selector_type ty, std::string const & partial, std::vector > const & limit, std::set & completions); // // --== The 'db' family of top-level commands ==-- // public: void initialize(); void debug(std::string const & sql, std::ostream & out); void dump(std::ostream &); void load(std::istream &); void info(std::ostream &); void version(std::ostream &); void migrate(); void test_migration_step(std::string const &); // for kill_rev_locally: void delete_existing_rev_and_certs(revision_id const & rid); // for kill_branch_certs_locally: void delete_branch_named(cert_value const & branch); // for kill_tag_locally: void delete_tag_named(cert_value const & tag); // misc private: friend void rcs_put_raw_file_edge(hexenc const & old_id, hexenc const & new_id, delta const & del, database & db); public: // branches outdated_indicator get_branches(std::vector & names); outdated_indicator get_branches(globish const & glob, std::vector & names); bool check_integrity(); void get_uncommon_ancestors(revision_id const & a, revision_id const & b, std::set & a_uncommon_ancs, std::set & b_uncommon_ancs); // for changesetify, rosterify void delete_existing_revs_and_certs(); void delete_existing_manifests(); // heights void get_rev_height(revision_id const & id, rev_height & height); void put_rev_height(revision_id const & id, rev_height const & height); bool has_rev_height(rev_height const & height); void delete_existing_heights(); void put_height_for_revision(revision_id const & new_id, revision_t const & rev); // for regenerate_rosters void delete_existing_rosters(); void put_roster_for_revision(revision_id const & new_id, revision_t const & rev); }; // Parent maps are used in a number of places to keep track of all the // parent rosters of a given revision. typedef std::map parent_map; typedef parent_map::value_type parent_entry; inline revision_id const & parent_id(parent_entry const & p) { return p.first; } inline revision_id const & parent_id(parent_map::const_iterator i) { return i->first; } inline database::cached_roster const & parent_cached_roster(parent_entry const & p) { return p.second; } inline database::cached_roster const & parent_cached_roster(parent_map::const_iterator i) { return i->second; } inline roster_t const & parent_roster(parent_entry const & p) { return *(p.second.first); } inline roster_t const & parent_roster(parent_map::const_iterator i) { return *(i->second.first); } inline marking_map const & parent_marking(parent_entry const & p) { return *(p.second.second); } inline marking_map const & parent_marking(parent_map::const_iterator i) { return *(i->second.second); } // Transaction guards nest. Acquire one in any scope you'd like // transaction-protected, and it'll make sure the db aborts a transaction // if there's any exception before you call commit(). // // By default, transaction_guard locks the database exclusively. If the // transaction is intended to be read-only, construct the guard with // exclusive=false. In this case, if a database update is attempted and // another process is accessing the database an exception will be thrown - // uglier and more confusing for the user - however no data inconsistency // should result. // // An exception is thrown if an exclusive transaction_guard is created // while a non-exclusive transaction_guard exists. // // Transaction guards also support splitting long transactions up into // checkpoints. Any time you feel the database is in an // acceptably-consistent state, you can call maybe_checkpoint(nn) with a // given number of bytes. When the number of bytes and number of // maybe_checkpoint() calls exceeds the guard's parameters, the transaction // is committed and reopened. Any time you feel the database has reached a // point where want to ensure a transaction commit, without destructing the // object, you can call do_checkpoint(). // // This does *not* free you from having to call .commit() on the guard when // it "completes" its lifecycle. Here's a way to think of checkpointing: a // normal transaction guard is associated with a program-control // scope. Sometimes (notably in netsync) it is not convenient to create a // scope which exactly matches the size of work-unit you want to commit (a // bunch of packets, or a session-close, whichever comes first) so // checkpointing allows you to use a long-lived transaction guard and mark // off the moments where commits are desired, without destructing the // guard. The guard still performs an error-management task in case of an // exception, so you still have to clean it before destruction using // .commit(). // // Checkpointing also does not override the transaction guard nesting: if // there's an enclosing transaction_guard, your checkpointing calls have no // affect. // // The purpose of checkpointing is to provide an alternative to "many short // transactions" on platforms (OSX in particular) where the overhead of // full commits at high frequency is too high. The solution for these // platforms is to run inside a longer-lived transaction (session-length), // and checkpoint at higher granularity (every megabyte or so). class transaction_guard { bool committed; database & db; bool exclusive; size_t const checkpoint_batch_size; size_t const checkpoint_batch_bytes; size_t checkpointed_calls; size_t checkpointed_bytes; public: transaction_guard(database & d, bool exclusive=true, size_t checkpoint_batch_size=1000, size_t checkpoint_batch_bytes=0xfffff); ~transaction_guard(); void do_checkpoint(); void maybe_checkpoint(size_t nbytes); void commit(); }; // Local Variables: // mode: C++ // fill-column: 76 // c-file-style: "gnu" // indent-tabs-mode: nil // End: // vim: et:sw=2:sts=2:ts=2:cino=>2s,{s,\:s,+s,t0,g0,^-2,e-2,n-2,p2s,(0,=s: #endif // __DATABASE_HH__