// Copyright (c) 2005, Google Inc.
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
//     * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//     * Redistributions in binary form must reproduce the above
// copyright notice, this list of conditions and the following disclaimer
// in the documentation and/or other materials provided with the
// distribution.
//     * Neither the name of Google Inc. nor the names of its
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// ---
// Author: Craig Silverstein
//
// This tests <google/sparse_hash_set>
// This tests <google/sparse_hash_map>
// This tests <google/dense_hash_set>
// This tests <google/dense_hash_map>
// This tests <google/sparsehash/sparsehashtable.h>
// This tests <google/sparsehash/densehashtable.h>
//
// Since {dense,sparse}hashtable is templatized, it's important that
// we test every function in every class in this file -- not just to
// see if it works, but even if it compiles.
#include "config.h" #include #include // for stat() #ifdef HAVE_UNISTD_H #include // for unlink() #endif #include #include // for silly random-number-seed generator #include // for sqrt() #include #include // for insert_iterator #include #include // for setprecision() #include #include HASH_FUN_H // defined in config.h #include #include #include #include #include #include #include // Otherwise, VC++7 warns about size_t -> int in the cout logging lines #ifdef MSVC #pragma warning(disable:4267) #endif using GOOGLE_NAMESPACE::sparse_hash_map; using GOOGLE_NAMESPACE::dense_hash_map; using GOOGLE_NAMESPACE::sparse_hash_set; using GOOGLE_NAMESPACE::dense_hash_set; using GOOGLE_NAMESPACE::sparse_hashtable; using GOOGLE_NAMESPACE::dense_hashtable; using STL_NAMESPACE::map; using STL_NAMESPACE::pair; using STL_NAMESPACE::string; using STL_NAMESPACE::insert_iterator; using STL_NAMESPACE::allocator; using STL_NAMESPACE::equal_to; using STL_NAMESPACE::ostream; #define LOGF STL_NAMESPACE::cout // where we log to; LOGF is a historical name #define CHECK(cond) do { \ if (!(cond)) { \ LOGF << "Test failed: " #cond "\n"; \ exit(1); \ } \ } while (0) #ifndef WIN32 // windows defines its own version static string TmpFile(const char* basename) { return string("/tmp/") + basename; } #endif const char *words[] = {"Baffin\n", // in /usr/dict/words "Boffin\n", // not in "baffin\n", // not in "genial\n", // last word in "Aarhus\n", // first word alphabetically "Zurich\n", // last word alphabetically "Getty\n", }; const char *nwords[] = {"Boffin\n", "baffin\n", }; const char *default_dict[] = {"Aarhus\n", "aback\n", "abandon\n", "Baffin\n", "baffle\n", "bagged\n", "congenial\n", "genial\n", "Getty\n", "indiscreet\n", "linens\n", "pence\n", "reassure\n", "sequel\n", "zoning\n", "zoo\n", "Zurich\n", }; // Apparently identity is not stl-standard, so we define our own template struct Identity { Value& operator()(Value& v) const { return v; } const Value& operator()(const Value& v) const { return v; } }; // Likewise, it's not standard to hash a string. 
// Likewise, it's not standard to hash a string.  Luckily, it is a char*
struct StrHash {
  size_t operator()(const string& s) const {
    return HASH_NAMESPACE::hash<const char*>()(s.c_str());
  }
};

// Let us log the pairs that make up a hash_map
template <class P1, class P2>
ostream& operator<<(ostream& s, const pair<P1, P2>& p) {
  s << "pair(" << p.first << ", " << p.second << ")";
  return s;
}

struct strcmp_fnc {
  bool operator()(const char* s1, const char* s2) const {
    return ((s1 == 0 && s2 == 0) ||
            (s1 && s2 && *s1 == *s2 && strcmp(s1, s2) == 0));
  }
};

template <class T, class H, class I, class C, class A>
static void set_empty_key(sparse_hashtable<T,T,H,I,C,A> *ht, T val) {
}

template <class T, class H, class C>
static void set_empty_key(sparse_hash_set<T,H,C> *ht, T val) {
}

template <class K, class V, class H, class C>
static void set_empty_key(sparse_hash_map<K,V,H,C> *ht, K val) {
}

template <class T, class H, class I, class C, class A>
static void set_empty_key(dense_hashtable<T,T,H,I,C,A> *ht, T val) {
  ht->set_empty_key(val);
}

template <class T, class H, class C>
static void set_empty_key(dense_hash_set<T,H,C> *ht, T val) {
  ht->set_empty_key(val);
}

template <class K, class V, class H, class C>
static void set_empty_key(dense_hash_map<K,V,H,C> *ht, K val) {
  ht->set_empty_key(val);
}

template <class T, class H, class I, class C, class A>
static bool clear_no_resize(sparse_hashtable<T,T,H,I,C,A> *ht) {
  return false;
}

template <class T, class H, class C>
static bool clear_no_resize(sparse_hash_set<T,H,C> *ht) {
  return false;
}

template <class K, class V, class H, class C>
static bool clear_no_resize(sparse_hash_map<K,V,H,C> *ht) {
  return false;
}

template <class T, class H, class I, class C, class A>
static bool clear_no_resize(dense_hashtable<T,T,H,I,C,A> *ht) {
  ht->clear_no_resize();
  return true;
}

template <class T, class H, class C>
static bool clear_no_resize(dense_hash_set<T,H,C> *ht) {
  ht->clear_no_resize();
  return true;
}

template <class K, class V, class H, class C>
static bool clear_no_resize(dense_hash_map<K,V,H,C> *ht) {
  ht->clear_no_resize();
  return true;
}

template <class T, class H, class I, class C, class A>
static void insert(dense_hashtable<T,T,H,I,C,A> *ht, T val) {
  ht->insert(val);
}

template <class T, class H, class C>
static void insert(dense_hash_set<T,H,C> *ht, T val) {
  ht->insert(val);
}

template <class K, class V, class H, class C>
static void insert(dense_hash_map<K,V,H,C> *ht, K val) {
  ht->insert(pair<K,V>(val, V()));
}

template <class T, class H, class I, class C, class A>
static void insert(sparse_hashtable<T,T,H,I,C,A> *ht, T val) {
  ht->insert(val);
}

template <class T, class H, class C>
static void insert(sparse_hash_set<T,H,C> *ht, T val) {
  ht->insert(val);
}

template <class K, class V, class H, class C>
static void insert(sparse_hash_map<K,V,H,C> *ht, K val) {
  ht->insert(pair<K,V>(val, V()));
}

template <class HT, class Iterator>
static void insert(HT *ht, Iterator begin, Iterator end) {
  ht->insert(begin, end);
}

// For hashtable's and hash_set's, the iterator insert works fine (and
// is used).  But for the hash_map's, the iterator insert expects the
// iterators to point to pair's.  So by looping over and calling insert
// on each element individually, the code below automatically expands
// each element into inserting a pair.
template <class K, class V, class H, class C, class Iterator>
static void insert(dense_hash_map<K,V,H,C> *ht, Iterator begin, Iterator end) {
  while (begin != end) {
    insert(ht, *begin);
    ++begin;
  }
}

template <class K, class V, class H, class C, class Iterator>
static void insert(sparse_hash_map<K,V,H,C> *ht, Iterator begin, Iterator end) {
  while (begin != end) {
    insert(ht, *begin);
    ++begin;
  }
}
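
// To make the pair-expansion concrete: with a sparse_hash_map<char*, int>
// (the mapped type used in main()), the element-wise loop above turns
//   insert(&ht, "Boffin\n");
// into
//   ht->insert(pair<char*, int>("Boffin\n", 0));
// i.e. each key arrives paired with a default-constructed value.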
// A version of insert that uses the insert_iterator.  But insert_iterator
// isn't defined for the low level hashtable classes, so we just punt to
// insert.

template <class T, class H, class I, class C, class A>
static void iterator_insert(dense_hashtable<T,T,H,I,C,A>* ht, T val,
                            insert_iterator<dense_hashtable<T,T,H,I,C,A> >* ) {
  ht->insert(val);
}

template <class T, class H, class C>
static void iterator_insert(dense_hash_set<T,H,C>* , T val,
                            insert_iterator<dense_hash_set<T,H,C> >* ii) {
  *(*ii)++ = val;
}

template <class K, class V, class H, class C>
static void iterator_insert(dense_hash_map<K,V,H,C>* , K val,
                            insert_iterator<dense_hash_map<K,V,H,C> >* ii) {
  *(*ii)++ = pair<K,V>(val, V());
}

template <class T, class H, class I, class C, class A>
static void iterator_insert(sparse_hashtable<T,T,H,I,C,A>* ht, T val,
                            insert_iterator<sparse_hashtable<T,T,H,I,C,A> >* ) {
  ht->insert(val);
}

template <class T, class H, class C>
static void iterator_insert(sparse_hash_set<T,H,C>* , T val,
                            insert_iterator<sparse_hash_set<T,H,C> >* ii) {
  *(*ii)++ = val;
}

template <class K, class V, class H, class C>
static void iterator_insert(sparse_hash_map<K,V,H,C>* , K val,
                            insert_iterator<sparse_hash_map<K,V,H,C> >* ii) {
  *(*ii)++ = pair<K,V>(val, V());
}

static void write_item(FILE *fp, const char *val) {
  fwrite(val, strlen(val), 1, fp);           // \n serves to separate
}

// The weird 'const' declarations are desired by the compiler.  Yucko.
static void write_item(FILE *fp, const pair<char* const, int> &val) {
  fwrite(val.first, strlen(val.first), 1, fp);
}

static void write_item(FILE *fp, const string &val) {
  fwrite(val.data(), val.length(), 1, fp);   // \n serves to separate
}

// The weird 'const' declarations are desired by the compiler.  Yucko.
static void write_item(FILE *fp, const pair<const string, int> &val) {
  fwrite(val.first.data(), val.first.length(), 1, fp);
}

static char* read_line(FILE* fp, char* line, int linesize) {
  if ( fgets(line, linesize, fp) == NULL )
    return NULL;
  // normalize windows files :-(
  const size_t linelen = strlen(line);
  if ( linelen >= 2 && line[linelen-2] == '\r' && line[linelen-1] == '\n' ) {
    line[linelen-2] = '\n';
    line[linelen-1] = '\0';
  }
  return line;
}

static void read_item(FILE *fp, char* const* val) {
  char line[1024];
  read_line(fp, line, sizeof(line));
  char **p = const_cast<char**>(val);
  *p = strdup(line);
}

static void read_item(FILE *fp, pair<char* const, int> *val) {
  char line[1024];
  read_line(fp, line, sizeof(line));
  char **p = const_cast<char**>(&val->first);
  *p = strdup(line);
}

static void read_item(FILE *fp, const string* val) {
  char line[1024];
  read_line(fp, line, sizeof(line));
  new(const_cast<string*>(val)) string(line);   // need to use placement new
}

static void read_item(FILE *fp, pair<const string, int> *val) {
  char line[1024];
  read_line(fp, line, sizeof(line));
  new(const_cast<string*>(&val->first)) string(line);
}

static void free_item(char* const* val) {
  free(*val);
}

static void free_item(pair<char* const, int> *val) {
  free(val->first);
}

static int get_int_item(int int_item) {
  return int_item;
}

static int get_int_item(pair<int, int> val) {
  return val.first;
}

static const string& getstrkey(const string& str) {
  return str;
}

static const string& getstrkey(const pair<const string, int> &p) {
  return p.first;
}
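
// A note on the helpers above: items are written back-to-back with no
// explicit delimiter -- the trailing '\n' that every test word already
// carries is what separates records, which is why read_item() can recover
// them with read_line().  read_line() also folds a Windows "\r\n" back to
// "\n", so a saved table round-trips across platforms.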
// Performs tests where the hashtable's value type is assumed to be int.
template <class htint>
void test_int() {
  htint x;
  htint y(1000);
  htint z(64);
  set_empty_key(&x, 0xefefef);
  set_empty_key(&y, 0xefefef);
  set_empty_key(&z, 0xefefef);

  CHECK(y.empty());
  insert(&y, 1);
  CHECK(!y.empty());
  insert(&y, 11);
  insert(&y, 111);
  insert(&y, 1111);
  insert(&y, 11111);
  insert(&y, 111111);
  insert(&y, 1111111);     // 1M, more or less
  insert(&y, 11111111);
  insert(&y, 111111111);
  insert(&y, 1111111111);  // 1B, more or less

  for ( int i = 0; i < 32; ++i )
    insert(&z, i);
  // test the second half of the insert with an insert_iterator
  insert_iterator<htint> insert_iter(z, z.begin());
  for ( int i = 32; i < 64; ++i )
    iterator_insert(&z, i, &insert_iter);

  // only perform the following CHECKs for
  // dense{hashtable, _hash_set, _hash_map}
  if (clear_no_resize(&x)) {
    // make sure x has to increase its number of buckets
    typename htint::size_type empty_bucket_count = x.bucket_count();
    int last_element = 0;
    while (x.bucket_count() == empty_bucket_count) {
      insert(&x, last_element);
      ++last_element;
    }
    // if clear_no_resize is supported (i.e. htint is a
    // dense{hashtable,_hash_set,_hash_map}), it should leave the
    // bucket_count as is.
    typename htint::size_type last_bucket_count = x.bucket_count();
    clear_no_resize(&x);
    CHECK(last_bucket_count == x.bucket_count());
    CHECK(x.empty());
    LOGF << "x has " << x.bucket_count() << " buckets\n";
    LOGF << "x size " << x.size() << "\n";
    // when inserting the same number of elements again, no resize
    // should be necessary
    for (int i = 0; i < last_element; ++i) {
      insert(&x, i);
      CHECK(x.bucket_count() == last_bucket_count);
    }
  }

  for ( typename htint::const_iterator it = y.begin(); it != y.end(); ++it )
    LOGF << "y: " << *it << "\n";
  z.insert(y.begin(), y.end());
  swap(y, z);
  for ( typename htint::iterator it = y.begin(); it != y.end(); ++it )
    LOGF << "y+z: " << *it << "\n";
  LOGF << "z has " << z.bucket_count() << " buckets\n";
  LOGF << "y has " << y.bucket_count() << " buckets\n";
  LOGF << "z size: " << z.size() << "\n";

  for (int i = 0; i < 64; ++i)
    CHECK(y.find(i) != y.end());

  CHECK(z.size() == 10);
  z.set_deleted_key(1010101010);   // an unused value
  z.erase(11111);
  CHECK(z.size() == 9);
  insert(&z, 11111);               // should retake deleted value
  CHECK(z.size() == 10);
  // Do the delete/insert again.  Last time we probably resized; this time no
  z.erase(11111);
  insert(&z, 11111);               // should retake deleted value
  CHECK(z.size() == 10);

  z.erase(-11111);                 // shouldn't do anything
  CHECK(z.size() == 10);
  z.erase(1);
  CHECK(z.size() == 9);
  typename htint::iterator itdel = z.find(1111);
  z.erase(itdel);
  CHECK(z.size() == 8);
  itdel = z.find(2222);            // should be end()
  z.erase(itdel);                  // shouldn't do anything
  CHECK(z.size() == 8);

  for ( typename htint::const_iterator it = z.begin(); it != z.end(); ++it )
    LOGF << "y: " << *it << "\n";
  z.set_deleted_key(1010101011);   // a different unused value
  for ( typename htint::const_iterator it = z.begin(); it != z.end(); ++it )
    LOGF << "y: " << *it << "\n";
  LOGF << "That's " << z.size() << " elements\n";
  z.erase(z.begin(), z.end());
  CHECK(z.empty());

  y.clear();
  CHECK(y.empty());
  LOGF << "y has " << y.bucket_count() << " buckets\n";
}
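
// The clear_no_resize() block in test_int() above pins down the contract
// the dense variants are expected to honor -- roughly (a sketch, not a
// spec):
//   size_type buckets = x.bucket_count();
//   x.clear_no_resize();                   // drops every element...
//   assert(x.empty());
//   assert(x.bucket_count() == buckets);   // ...but keeps the buckets
// so re-inserting the same number of elements never forces a rehash.  The
// sparse variants don't provide clear_no_resize(), which is why the helper
// returns false for them and the block is skipped.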
// Performs tests where the hashtable's value type is assumed to be char*.
// The read_write parameter specifies whether the read/write tests
// should be performed.  Note that densehashtable::write_metadata is not
// implemented, so we only do the read/write tests for the
// sparsehashtable varieties.
template <class ht>
void test_charptr(bool read_write) {
  ht w;
  set_empty_key(&w, (char*) NULL);
  insert(&w, const_cast<char**>(nwords),
         const_cast<char**>(nwords) + sizeof(nwords) / sizeof(*nwords));
  LOGF << "w has " << w.size() << " items\n";
  CHECK(w.size() == 2);
  CHECK(w == w);

  ht x;
  set_empty_key(&x, (char*) NULL);
  long dict_size = 1;       // for size stats -- can't be 0 'cause of division
  map<string, int> counts;
  // Hash the dictionary
  {
    // automake says 'look for all data files in $srcdir.'  OK.
    string filestr = (string(getenv("srcdir") ? getenv("srcdir") : ".") +
                      "/src/words");
    const char* file = filestr.c_str();
    FILE *fp = fopen(file, "rb");
    if ( fp == NULL ) {
      LOGF << "Can't open " << file << ", using small, built-in dict...\n";
      for (size_t i = 0; i < sizeof(default_dict)/sizeof(*default_dict); ++i) {
        insert(&x, strdup(default_dict[i]));
        counts[default_dict[i]] = 0;
      }
    } else {
      char line[1024];
      while ( read_line(fp, line, sizeof(line)) ) {
        insert(&x, strdup(line));
        counts[line] = 0;
      }
      LOGF << "Read " << x.size() << " words from " << file << "\n";
      fclose(fp);
      struct stat buf;
      stat(file, &buf);
      dict_size = buf.st_size;
      LOGF << "Size of " << file << ": " << buf.st_size << " bytes\n";
    }
    for ( char **word = const_cast<char**>(words);
          word < const_cast<char**>(words) + sizeof(words) / sizeof(*words);
          ++word ) {
      if (x.find(*word) == x.end()) {
        CHECK(w.find(*word) != w.end());
      } else {
        CHECK(w.find(*word) == w.end());
      }
    }
  }
  CHECK(counts.size() == x.size());

  // Save the hashtable.
  if (read_write) {
    const string file_string = TmpFile("#hashtable_unittest_dicthash");
    const char* file = file_string.c_str();
    FILE *fp = fopen(file, "wb");
    if ( fp == NULL ) {
      // maybe we can't write to /tmp/.  Try the current directory
      file = "#hashtable_unittest_dicthash";
      fp = fopen(file, "wb");
    }
    if ( fp == NULL ) {
      LOGF << "Can't open " << file << " skipping hashtable save...\n";
    } else {
      x.write_metadata(fp);          // this only writes meta-information
      int count = 0;
      for ( typename ht::iterator it = x.begin(); it != x.end(); ++it ) {
        write_item(fp, *it);
        free_item(&(*it));
        ++count;
      }
      LOGF << "Wrote " << count << " words to " << file << "\n";
      fclose(fp);
      struct stat buf;
      stat(file, &buf);
      LOGF << "Size of " << file << ": " << buf.st_size << " bytes\n";
      LOGF << STL_NAMESPACE::setprecision(3)
           << "Hashtable overhead "
           << (buf.st_size - dict_size) * 100.0 / dict_size
           << "% ("
           << (buf.st_size - dict_size) * 8.0 / count
           << " bits/entry)\n";
      x.clear();

      // Load the hashtable
      fp = fopen(file, "rb");
      if ( fp == NULL ) {
        LOGF << "Can't open " << file << " skipping hashtable reload...\n";
      } else {
        x.read_metadata(fp);         // reads meta-information
        LOGF << "Hashtable size: " << x.size() << "\n";
        int count = 0;
        for ( typename ht::iterator it = x.begin(); it != x.end(); ++it ) {
          read_item(fp, &(*it));
          ++count;
        }
        LOGF << "Read " << count << " words from " << file << "\n";
        fclose(fp);
        unlink(file);
        for ( char **word = const_cast<char**>(words);
              word < const_cast<char**>(words) + sizeof(words) / sizeof(*words);
              ++word ) {
          if (x.find(*word) == x.end()) {
            CHECK(w.find(*word) != w.end());
          } else {
            CHECK(w.find(*word) == w.end());
          }
        }
      }
    }
  }
  for ( typename ht::iterator it = x.begin(); it != x.end(); ++it ) {
    free_item(&(*it));
  }
}
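
// To make the overhead numbers logged above concrete, a worked example with
// made-up sizes: if the raw dictionary is 100,000 bytes and the saved table
// file is 125,000 bytes holding 10,000 words, the test reports
//   (125000 - 100000) * 100.0 / 100000 = 25% overhead
//   (125000 - 100000) * 8.0 / 10000    = 20 bits/entry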
// Performs tests where the hashtable's value type is assumed to
// be string.
// TODO(austern): factor out the bulk of test_charptr and test_string
// into a common function.
template <class ht>
void test_string(bool read_write) {
  ht w;
  set_empty_key(&w, string("-*- empty key -*-"));
  const int N = sizeof(nwords) / sizeof(*nwords);
  string* nwords1 = new string[N];
  for (int i = 0; i < N; ++i)
    nwords1[i] = nwords[i];
  insert(&w, nwords1, nwords1 + N);
  delete[] nwords1;
  LOGF << "w has " << w.size() << " items\n";
  CHECK(w.size() == 2);
  CHECK(w == w);

  ht x;
  set_empty_key(&x, string("-*- empty key -*-"));
  long dict_size = 1;       // for size stats -- can't be 0 'cause of division
  map<string, int> counts;
  // Hash the dictionary
  {
    // automake says 'look for all data files in $srcdir.'  OK.
    string filestr = (string(getenv("srcdir") ? getenv("srcdir") : ".") +
                      "/src/words");
    const char* file = filestr.c_str();
    FILE *fp = fopen(file, "rb");
    if ( fp == NULL ) {
      LOGF << "Can't open " << file << ", using small, built-in dict...\n";
      for (size_t i = 0; i < sizeof(default_dict)/sizeof(*default_dict); ++i) {
        insert(&x, string(default_dict[i]));
        counts[default_dict[i]] = 0;
      }
    } else {
      char line[1024];
      while ( fgets(line, sizeof(line), fp) ) {
        insert(&x, string(line));
        counts[line] = 0;
      }
      LOGF << "Read " << x.size() << " words from " << file << "\n";
      fclose(fp);
      struct stat buf;
      stat(file, &buf);
      dict_size = buf.st_size;
      LOGF << "Size of " << file << ": " << buf.st_size << " bytes\n";
    }
    for ( const char* const* word = words;
          word < words + sizeof(words) / sizeof(*words);
          ++word ) {
      if (x.find(*word) == x.end()) {
        CHECK(w.find(*word) != w.end());
      } else {
        CHECK(w.find(*word) == w.end());
      }
    }
  }
  CHECK(counts.size() == x.size());

  // Save the hashtable.
  if (read_write) {
    const string file_string = TmpFile("#hashtable_unittest_dicthash_str");
    const char* file = file_string.c_str();
    FILE *fp = fopen(file, "wb");
    if ( fp == NULL ) {
      // maybe we can't write to /tmp/.  Try the current directory
      file = "#hashtable_unittest_dicthash_str";
      fp = fopen(file, "wb");
    }
    if ( fp == NULL ) {
      LOGF << "Can't open " << file << " skipping hashtable save...\n";
    } else {
      x.write_metadata(fp);          // this only writes meta-information
      int count = 0;
      for ( typename ht::iterator it = x.begin(); it != x.end(); ++it ) {
        write_item(fp, *it);
        ++count;
      }
      LOGF << "Wrote " << count << " words to " << file << "\n";
      fclose(fp);
      struct stat buf;
      stat(file, &buf);
      LOGF << "Size of " << file << ": " << buf.st_size << " bytes\n";
      LOGF << STL_NAMESPACE::setprecision(3)
           << "Hashtable overhead "
           << (buf.st_size - dict_size) * 100.0 / dict_size
           << "% ("
           << (buf.st_size - dict_size) * 8.0 / count
           << " bits/entry)\n";
      x.clear();

      // Load the hashtable
      fp = fopen(file, "rb");
      if ( fp == NULL ) {
        LOGF << "Can't open " << file << " skipping hashtable reload...\n";
      } else {
        x.read_metadata(fp);         // reads meta-information
        LOGF << "Hashtable size: " << x.size() << "\n";
        int count = 0;
        for ( typename ht::iterator it = x.begin(); it != x.end(); ++it ) {
          read_item(fp, &(*it));
          ++count;
        }
        LOGF << "Read " << count << " words from " << file << "\n";
        fclose(fp);
        unlink(file);
        for ( const char* const* word = words;
              word < words + sizeof(words) / sizeof(*words);
              ++word ) {
          if (x.find(*word) == x.end()) {
            CHECK(w.find(*word) != w.end());
          } else {
            CHECK(w.find(*word) == w.end());
          }
        }
      }
    }
  }

  // ensure that destruction is done properly in clear_no_resize()
  if (!clear_no_resize(&w)) w.clear();
}
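
// Why the reload paths above work: read_metadata() restores the table's
// shape (which slots are occupied) but not the items themselves, so
// iterating afterwards visits slots in the same order write_item() emitted
// them, and read_item() refills each slot in place.  Since the key half of
// a slot is const, read_item() has to const_cast -- strdup()'ing for char*
// keys (freed later by free_item()) and placement-new'ing for string keys.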
// The read_write parameter specifies whether the read/write tests
// should be performed.  Note that densehashtable::write_metadata is not
// implemented, so we only do the read/write tests for the
// sparsehashtable varieties.
template <class ht, class htstr, class htint>
void test(bool read_write) {
  test_int<htint>();
  test_string<htstr>(read_write);
  test_charptr<ht>(read_write);
}

// For data types with trivial copy-constructors and destructors, we
// should use an optimized routine for data-copying, that involves
// memmove.  We test this by keeping count of how many times the
// copy-constructor is called; it should be much less with the
// optimized code.

class Memmove {
 public:
  Memmove(): i_(0) {}
  explicit Memmove(int i): i_(i) {}
  Memmove(const Memmove& that) { this->i_ = that.i_; num_copies_++; }

  int i_;
  static int num_copies_;
};
int Memmove::num_copies_ = 0;

// This is what tells the hashtable code it can use memmove for this class:
_START_GOOGLE_NAMESPACE_
template<> struct has_trivial_copy<Memmove> : true_type { };
template<> struct has_trivial_destructor<Memmove> : true_type { };
_END_GOOGLE_NAMESPACE_

class NoMemmove {
 public:
  NoMemmove(): i_(0) {}
  explicit NoMemmove(int i): i_(i) {}
  NoMemmove(const NoMemmove& that) { this->i_ = that.i_; num_copies_++; }

  int i_;
  static int num_copies_;
};
int NoMemmove::num_copies_ = 0;

void TestSimpleDataTypeOptimizations() {
  {
    sparse_hash_map<int, Memmove> memmove;
    sparse_hash_map<int, NoMemmove> nomemmove;

    Memmove::num_copies_ = 0;    // reset
    NoMemmove::num_copies_ = 0;  // reset
    for (int i = 10000; i > 0; i--) {
      memmove[i] = Memmove(i);
    }
    for (int i = 10000; i > 0; i--) {
      nomemmove[i] = NoMemmove(i);
    }
    LOGF << "sparse_hash_map copies for unoptimized/optimized cases: "
         << NoMemmove::num_copies_ << "/" << Memmove::num_copies_ << "\n";
    CHECK(NoMemmove::num_copies_ > Memmove::num_copies_);
  }
  // Same should hold true for dense_hash_map
  {
    dense_hash_map<int, Memmove> memmove;
    dense_hash_map<int, NoMemmove> nomemmove;
    memmove.set_empty_key(0);
    nomemmove.set_empty_key(0);

    Memmove::num_copies_ = 0;    // reset
    NoMemmove::num_copies_ = 0;  // reset
    for (int i = 10000; i > 0; i--) {
      memmove[i] = Memmove(i);
    }
    for (int i = 10000; i > 0; i--) {
      nomemmove[i] = NoMemmove(i);
    }
    LOGF << "dense_hash_map copies for unoptimized/optimized cases: "
         << NoMemmove::num_copies_ << "/" << Memmove::num_copies_ << "\n";
    CHECK(NoMemmove::num_copies_ > Memmove::num_copies_);
  }
}

int main(int argc, char **argv) {
  // First try with the low-level hashtable interface
  LOGF << "\n\nTEST WITH DENSE_HASHTABLE\n\n";
  test<dense_hashtable<char *, char *, HASH_NAMESPACE::hash<char *>,
                       Identity<char *>, strcmp_fnc, allocator<char *> >,
       dense_hashtable<string, string, StrHash,
                       Identity<string>, equal_to<string>, allocator<string> >,
       dense_hashtable<int, int, HASH_NAMESPACE::hash<int>,
                       Identity<int>, equal_to<int>, allocator<int> > >(
                         false);

  // Now try with hash_set, which should be equivalent
  LOGF << "\n\nTEST WITH DENSE_HASH_SET\n\n";
  test<dense_hash_set<char *, HASH_NAMESPACE::hash<char *>, strcmp_fnc>,
       dense_hash_set<string, StrHash>,
       dense_hash_set<int> >(false);

  // Now try with hash_map, which differs only in insert()
  LOGF << "\n\nTEST WITH DENSE_HASH_MAP\n\n";
  test<dense_hash_map<char *, int, HASH_NAMESPACE::hash<char *>, strcmp_fnc>,
       dense_hash_map<string, int, StrHash>,
       dense_hash_map<int, int> >(false);

  // First try with the low-level hashtable interface
  LOGF << "\n\nTEST WITH SPARSE_HASHTABLE\n\n";
  test<sparse_hashtable<char *, char *, HASH_NAMESPACE::hash<char *>,
                        Identity<char *>, strcmp_fnc, allocator<char *> >,
       sparse_hashtable<string, string, StrHash,
                        Identity<string>, equal_to<string>, allocator<string> >,
       sparse_hashtable<int, int, HASH_NAMESPACE::hash<int>,
                        Identity<int>, equal_to<int>, allocator<int> > >(
                          true);

  // Now try with hash_set, which should be equivalent
  LOGF << "\n\nTEST WITH SPARSE_HASH_SET\n\n";
  test<sparse_hash_set<char *, HASH_NAMESPACE::hash<char *>, strcmp_fnc>,
       sparse_hash_set<string, StrHash>,
       sparse_hash_set<int> >(true);

  // Now try with hash_map, which differs only in insert()
  LOGF << "\n\nTEST WITH SPARSE_HASH_MAP\n\n";
  test<sparse_hash_map<char *, int, HASH_NAMESPACE::hash<char *>, strcmp_fnc>,
       sparse_hash_map<string, int, StrHash>,
       sparse_hash_map<int, int> >(true);

  // Test that we use the optimized routines for simple data types
  LOGF << "\n\nTesting simple-data-type optimizations\n";
  TestSimpleDataTypeOptimizations();

  LOGF << "\nAll tests pass.\n";
  return 0;
}