// strhash.cc see license.txt for copyright and terms of use // code for strhash.h #include "strhash.h" // this module #include "xassert.h" // xassert #include // strcmp // notes on string hash functions **************** // We need to test the various hash functions versus each other for // randomness. Scott suggests simply hashing the same data and // modding it down as a hashtable would and seeing which gives more // collisions. // Note that both hash functions could be improved if they were aware // of how many of their low order bits were really going to be // preserved and then taking the high order bits that would otherwise // be discarded and xoring or adding them to the low order bits. // Hash function 2: // 1) Does not work on architectures other than 32 bit. // 2) Will not work well or at all if the strings do not start on // 32-bit aligned boundaries. // 3) Could be made to work faster if the strings start AND end on // 32-bit aligned boundaries and are padded out with NUL-s. Actually, // if we do this trick, then the code becomes portable with no // #ifdef-s ! All you do is cast the array to an array of ints and // then test for termination by masking off all but the last 8 bits. // Everything else is just operations on ints. You might want to pick // 64-bit primes, but they will work in 32-bit mode as long as the // compiler just truncates their high bits off. // **************** StringHash::StringHash(GetKeyFn gk) : HashTable((HashTable::GetKeyFn)gk, (HashTable::HashFn)coreHash, (HashTable::EqualKeyFn)keyCompare) {} StringHash::~StringHash() {} STATICDEF unsigned StringHash::coreHash(char const *key) { // dsw: not sure if this is the best place for it, but an assertion // failure is better than a segfault // // sm: I don't agree; segfaults arising from NULL derefs are // quite friendly (deref'ing random address is another story). // Anyway, this is fine, but I'd like it to go away in NDEBUG // mode so I'm changing it to use 'xassertdb'. xassertdb(key); // some more references: // http://www.cs.yorku.ca/~oz/hash.html // // Describes three well-known hashes: djb2, sdbm, and K&R ed. 1. // http://burtleburtle.net/bob/hash/doobs.html // // Describes a particular hash function (called simply "My Hash") // and provides justifications for preferring it to several others, // including MD4. // http://www.isthe.com/chongo/tech/comp/fnv/ // // Glen Fowler, Landon Curt Noll, and Phong Vo's hash function. // http://mail.python.org/pipermail/python-dev/2004-April/044235.html // // Start of a discussion thread about changing the string hash // in Python. #if 0 // I pulled this out of my ass.. it's supposed to mimic // a linear congruential random number generator unsigned val = 0x42e9d115; // arbitrary while (*key != 0) { val *= (unsigned)(*key); val += 1; key++; } return val; #endif // 0 // pick a default STRHASH_ALG #ifndef STRHASH_ALG #define STRHASH_ALG 1 #endif // STRHASH_ALG #if STRHASH_ALG == 1 #ifdef SAY_STRHASH_ALG #warning hash function 1: Nelson #endif // SAY_STRHASH_ALG // this one is supposed to be better /* An excellent string hashing function. Adapted from glib's g_str_hash(). Investigation by Karl Nelson . Do a web search for "g_str_hash X31_HASH" if you want to know more. */ /* update: this is the same function as that described in Kernighan and Pike, "The Practice of Programming", section 2.9 */ unsigned h = 0; for (; *key != '\0'; key += 1) { // original X31_HASH h = ( h << 5 ) - h + *key; // h*31 + *key // dsw: this one is better because it does the multiply last; // otherwise the last byte has no hope of modifying the high order // bits // // sm: I'm not convinced it's better. For short strings, say less // than 6 characters, the arithmetic won't overflow a 32-bit // register. In that case, by multiplying last, the hash value is // always a multiple of 31 and hence will suffer many more // collisions. I would like more justification in the form of // experimental measurements before making a change. //h += *key; //h = ( h << 5 ) - h; // h *= 31 } return h; #elif STRHASH_ALG == 2 #ifdef SAY_STRHASH_ALG #warning hash function 2: word-rotate/final-mix #endif // SAY_STRHASH_ALG // FIX: #warning word-rotate/final-mix hash function only works on 32-bit architectures #warning word-rotate/final-mix hash function still needs to be tested for randomness vs nelson // Word-Rotate / Final-Mix hash function by Daniel Wilkerson; A // slighly faster and likely more random hash function; Invented in // collaboration with Simon Goldsmith. // // Supposedly gcc will sometimes recognize this and generate a // single rotate instruction 'ROR'. Thanks to Matt Harren for this. // http://groups.google.com/groups?q=rorl+x86&start=10&hl=en&lr=&ie=UTF-8&oe=UTF-8& // selm=359954C9.3B354F0%40cartsys.com&rnum=11 // http://www.privacy.nb.ca/cryptography/archives/coderpunks/new/1998-10/0096.html #define ROTATE(n, b) (n >> b) | (n << (32 - b)) // Note that UINT_MAX = 4294967295U; // source of primes: http://www.utm.edu/research/primes/lists/small/small.html static unsigned const primeA = 1500450271U; static unsigned const primeB = 2860486313U; // source of primes: http://www.utm.edu/research/primes/lists/2small/0bit.html static unsigned const primeC = (1U<<31) - 99U; static unsigned const primeD = (1U<<30) - 35U; static int count = 0; ++count; unsigned h = primeA; // Stride a word at a time. Note that this works best (or works at // all) if the string is 32-bit aligned. This initial 'if' block is // to prevent an extra unneeded rotate. // // FIX: this would be even faster if all strings were NUL-padded in // length to a multiple of 4; we could then omit all but the last // 'if' and the ragged end after the loop (it doesn't matter if you // tile a few extra NULs into your value). if (!key[0]) goto end0; if (!key[1]) goto end1; if (!key[2]) goto end2; if (!key[3]) goto end3; // No rotate here. h += *( (unsigned *) key ); key += 4; while (1) { // invariant: when we get here, we are ready to rotate if (!key[0]) {h = ROTATE(h, 5); goto end0;} if (!key[1]) {h = ROTATE(h, 5); goto end1;} if (!key[2]) {h = ROTATE(h, 5); goto end2;} if (!key[3]) {h = ROTATE(h, 5); goto end3;} h = ROTATE(h, 5); // FIX: if the start of the string is not 4-byte aligned then this // will be slower on x86 and I think even illegal on MIPS and // perhaps others. To be portable we should ensure this. h += *( (unsigned *) key ); // on my machine plus is faster than xor key += 4; } xfailure("shouldn't get here"); // deal with the ragged end // invariant: when we get here we are ready to add end3: h += *key; h = ROTATE(h, 5); key += 1; end2: h += *key; h = ROTATE(h, 5); key += 1; end1: h += *key; // No rotate nor increment here. #ifndef NDEBUG key += 1; // this is only needed for the assertion below #endif end0: xassertdb(*key=='\0'); // I will say for now that the property of hash functions that we // want is that a change in any input bit has a 50% chance of // inverting any output bit. // // At this point, compare this hash function to the Nelson hash // function above. In Nelson, everytime data is added, the // accumulator, 'h', is "stirred" with a multiplication. Since data // is being added at the low byte and since multiplication // propagates dependencies towards the high bytes and since after // ever add there is at least one multiply, every byte has a chance // to affect the value of every bit above the first byte. // // However, in this hash function we have saved time by simply // "tiling" the data across the accumulator and at this point it // hasn't been "stirred" at all. Since most hashvalues are used by // modding off some high bits, those bits have never had a chance to // affect the final value, so some stirring is needed. How much // "stirring" do we need? // // Consider the 32-bit word in two halves, H and L. // // 1) With a single multiply, any bit of L "has a chance" to affect // any bit in H. The reverse is not true. h *= primeB; // 2) We therefore swap H and L and multiply again. Now, any bit in // H has had a chance to affect any bit in L. We are not done // though, since the high order bits in H have not had a chance to // affect the low order bits of H (yes H). Please note however, // that since L affected H and H affected L, the hight order bits of // L *have* had a chance to affect the low order bits of L. h = ROTATE(h, 16); h *= primeC; // 3) Therefore we swap H and L and multiply once again. Now the // high order bits of H have had a chance to affect L (in 2) which // now can affect H again. Any bit now has "had a chance" to affect // any other bit. h = ROTATE(h, 16); h *= primeD; return h; #undef ROTATE #else #error You must pick a hash function #endif // STRHASH_ALG multi-switch } STATICDEF bool StringHash::keyCompare(char const *key1, char const *key2) { return 0==strcmp(key1, key2); } // ---------------------- test code -------------------- #ifdef TEST_STRHASH #include // cout #include // rand #include // istream #include // filebuf #include "trace.h" // traceProgress #include "crc.h" // crc32 #include "nonport.h" // getMilliseconds #include "array.h" // GrowArray #include "str.h" // string // pair a GrowArray with its size struct StringArray { int tableSize; GrowArray table; bool appendable; StringArray(int tableSize0) : tableSize(tableSize0) , table(tableSize) , appendable(tableSize == 0) {} void append(char *str) { xassert(appendable); table.ensureIndexDoubler(tableSize); table[tableSize] = str; ++tableSize; } }; // data to hash StringArray *dataArray = NULL; char const *id(void *p) { return (char const*)p; } char *randomString() { char *ret = new char[11]; loopi(10) { ret[i] = (rand()%26)+'a'; } ret[10]=0; return ret; } // fill a table with random strings void makeRandomData(int numRandStrs) { dataArray = new StringArray(numRandStrs); {loopi(dataArray->tableSize) { dataArray->table[i] = randomString(); }} } // file the data array with whitespace-delimited strings from a file void readDataFromFile(char *inFileName) { dataArray = new StringArray(0); char *delim = " \t\n\r\v\f"; std::filebuf fb; fb.open (inFileName, ios::in); istream in(&fb); while(true) { stringBuilder s; s.readdelim(in, delim); // cout << ":" << s->pcharc() << ":" << endl; if (in.eof()) break; // // don't insert 0 length strings // if (s->length() == 0) continue; dataArray->append(strdup(s.c_str())); } } void writeData(ostream &out) { cout << "write data" << endl; for(int i=0; itableSize; ++i) { out << dataArray->table[i] << endl; } } // dsw: what is the point of this? // dealloc the test strings // void deleteData() { // {loopi(dataArray->tableSize) { // delete[] dataArray->table[i]; // }} // // delete[] dataArray->table; // } void correctnessTest() { traceProgress() << "start of strhash correctness testing\n"; // insert them all into a hash table StringHash hash(id); {loopi(dataArray->tableSize) { hash.add(dataArray->table[i], dataArray->table[i]); hash.selfCheck(); }} hash.selfCheck(); xassert(hash.getNumEntries() == dataArray->tableSize); // verify that they are all mapped properly {loopi(dataArray->tableSize) { xassert(hash.get(dataArray->table[i]) == dataArray->table[i]); }} hash.selfCheck(); // remove every other one {loopi(dataArray->tableSize) { if (i%2 == 0) { hash.remove(dataArray->table[i]); hash.selfCheck(); } }} hash.selfCheck(); xassert(hash.getNumEntries() == dataArray->tableSize / 2); // verify it {loopi(dataArray->tableSize) { if (i%2 == 0) { xassert(hash.get(dataArray->table[i]) == NULL); } else { xassert(hash.get(dataArray->table[i]) == dataArray->table[i]); } }} hash.selfCheck(); // remove the rest {loopi(dataArray->tableSize) { if (i%2 == 1) { hash.remove(dataArray->table[i]); hash.selfCheck(); } }} hash.selfCheck(); xassert(hash.getNumEntries() == 0); traceProgress() << "end of strhash correctness testing\n"; } void performanceTest(int numPerfRuns) { // test performance of the hash function traceProgress() << "start of strhash performance testing\n"; long startTime = getMilliseconds(); loopj(numPerfRuns) { loopi(dataArray->tableSize) { StringHash::coreHash(dataArray->table[i]); //crc32((unsigned char*)dataArray->table[i], strlen(dataArray->table[i])); //crc32((unsigned char*)dataArray->table[i], 10); } } long stopTime = getMilliseconds(); long duration = stopTime - startTime; cout << "milliseconds to hash: " << duration << endl; traceProgress() << "end of strhash performance testing\n"; } // command-line state int numRandStrs = 0; char *inFileName = NULL; bool dump = false; bool testCor = true; bool testPerf = true; int numPerfRuns = 10000; void usage() { cout << "Test the string hashing module strhash.cc\n" << " --help / -h : print this message\n" << " --[no-]testCor : run the correctness tests\n" << " will fail if data has duplicate strings (?!)\n" << " --[no-]testPerf : run the performance tests\n" << " --numPerfRuns N : loop over data N times during performance run\n" << " --file FILE : use the whitespace-delimited string contents of FILE\n" << " --random N : use N internally generated random strings of length 10;\n" << " N should be even\n" << " --dump : dump out the data after generating/reading it\n" << "The default is '--random 300 --testCor --testPerf --numPerfRuns 10000'." << endl; } void initFromFlags(int &argc, char**&argv) { --argc; ++argv; for(; *argv; --argc, ++argv) { if (strcmp(*argv, "--help")==0 || strcmp(*argv, "-h")==0) { usage(); exit(0); } else if (strcmp(*argv, "--testCor")==0) { testCor = true; } else if (strcmp(*argv, "--no-testCor")==0) { testCor = false; } else if (strcmp(*argv, "--testPerf")==0) { testPerf = true; } else if (strcmp(*argv, "--no-testPerf")==0) { testPerf = false; } else if (strcmp(*argv, "--random")==0) { if (inFileName) { cout << "do not use --random and --file together" << endl; usage(); exit(1); } --argc; ++argv; if (!*argv) { cout << "supply an argument to --random" << endl; usage(); exit(1); } numRandStrs = atoi(*argv); if (!(numRandStrs > 0)) { cout << "argument to --random must be > 0" << endl; usage(); exit(1); } } else if (strcmp(*argv, "--file")==0) { if (numRandStrs) { cout << "do not use --random and --file together" << endl; usage(); exit(1); } --argc; ++argv; if (!*argv) { cout << "supply an argument to --file" << endl; usage(); exit(1); } inFileName = strdup(*argv); xassert(inFileName); } else if (strcmp(*argv, "--numPerfRuns")==0) { --argc; ++argv; if (!*argv) { cout << "supply an argument to --numPerfRuns" << endl; usage(); exit(1); } numPerfRuns = atoi(*argv); if (!(numPerfRuns > 0)) { cout << "argument to --numPerfRuns must be > 0" << endl; usage(); exit(1); } } else if (strcmp(*argv, "--dump")==0) { dump = true; } else { cout << "unrecognized flag " << *argv << endl; usage(); exit(1); } } } int main(int argc, char **argv) { traceAddSys("progress"); #if STRHASH_ALG == 1 cout << "hash function 1: Nelson" << endl; #elif STRHASH_ALG == 2 cout << "hash function 2: word-rotate/final-mix" << endl; #else #error You must pick a hash function #endif // STRHASH_ALG multi-switch // read command line flags initFromFlags(argc, argv); // read data if ((!inFileName) && (!numRandStrs)) { numRandStrs = 300; // default } if (numRandStrs % 2 != 0) { cout << "use an even-number argument for --random" << endl; usage(); exit(1); } if (numRandStrs) { makeRandomData(numRandStrs); } else if (inFileName) { if (testCor) { cout << "Warning: The correctness test fails if strings are duplicated " "and you are reading data from a file." << endl; } readDataFromFile(inFileName); } else { xfailure("goink?"); } // dump data if (dump) { writeData(cout); } // test if (testCor) { correctnessTest(); } if (testPerf) { performanceTest(numPerfRuns); } // delete data // deleteData(); cout << "strhash tests finished\n"; return 0; } #endif // TEST_STRHASH