Fix bug 69271, r=waterson, sr=shaver:

- Don't ape java.lang.String's bogo-sampling hash function for "long" (>=16
  char) strings.
- Add a theory-and-practice comment to pldhash.h that helps analyze when to
  use double hashing (most of the time) vs. when to use chaining.
- Factor ChangeTable out of PL_DHashTableOperate as a subroutine so it can
  also be called from PL_DHashTableEnumerate, if the latter finds that enough
  entries have been removed to be worth a shrink or compress cycle.
This commit is contained in:
brendan%mozilla.org 2001-03-14 07:48:07 +00:00
parent ae45fb03d7
commit 1e8530d216
2 changed files with 185 additions and 65 deletions

View File

@ -17,6 +17,9 @@
* Copyright (C) 1999,2000 Netscape Communications Corporation.
* All Rights Reserved.
*
* Original Contributor:
* Brendan Eich <brendan@mozilla.org>
*
* Contributor(s):
*
* Alternatively, the contents of this file may be used under the
@ -35,6 +38,7 @@
* Double hashing implementation.
* GENERATED BY js/src/plify_jsdhash.sed -- DO NOT EDIT!!!
*/
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "prbit.h"
@ -62,22 +66,12 @@ PL_DHashFreeTable(PLDHashTable *table, void *ptr)
PR_IMPLEMENT(PLDHashNumber)
PL_DHashStringKey(PLDHashTable *table, const void *key)
{
const char *s;
size_t n, m;
PLDHashNumber h;
const unsigned char *s;
s = key;
n = strlen(s);
h = 0;
if (n < 16) {
/* Hash every char in a short string. */
for (; n; s++, n--)
h = (h >> 28) ^ (h << 4) ^ *s;
} else {
/* Sample a la java.lang.String.hash(). */
for (m = n / 8; n >= m; s += m, n -= m)
h = (h >> 28) ^ (h << 4) ^ *s;
}
for (s = key; *s != '\0'; s++)
h = (h >> (PL_DHASH_BITS - 4)) ^ (h << 4) ^ *s;
return h;
}
@ -171,6 +165,17 @@ PL_DHashTableInit(PLDHashTable *table, PLDHashTableOps *ops, void *data,
int log2;
PRUint32 nbytes;
#ifdef DEBUG
    /*
     * Debug-only advisory: report tables whose entrySize favors chaining over
     * double hashing.  More than 6 pointer-sized words is reported as
     * "probably", more than 16 as "definitely".
     */
    if (entrySize > 6 * sizeof(void *)) {
        /*
         * %p requires a void * argument, and its output format is
         * implementation-defined (commonly already carrying a "0x" prefix),
         * so cast the pointer and do not prepend "0x" by hand.
         */
        fprintf(stderr,
                "pldhash: for the table at address %p, the given entrySize"
                " of %lu %s favors chaining over double hashing.\n",
                (void *) table,
                (unsigned long) entrySize,
                (entrySize > 16 * sizeof(void *)) ? "definitely" : "probably");
    }
#endif
table->ops = ops;
table->data = data;
if (capacity < PL_DHASH_MIN_SIZE)
@ -179,10 +184,11 @@ PL_DHashTableInit(PLDHashTable *table, PLDHashTableOps *ops, void *data,
capacity = PR_BIT(log2);
table->hashShift = PL_DHASH_BITS - log2;
table->sizeLog2 = log2;
table->sizeMask = PR_BITMASK(table->sizeLog2);
table->sizeMask = PR_BITMASK(log2);
table->entrySize = entrySize;
table->entryCount = table->removedCount = 0;
nbytes = capacity * entrySize;
table->entryStore = ops->allocTable(table, nbytes);
if (!table->entryStore)
return PR_FALSE;
@ -260,21 +266,75 @@ SearchTable(PLDHashTable *table, const void *key, PLDHashNumber keyHash)
return entry;
}
PR_IMPLEMENT(PLDHashEntryHdr *)
PL_DHashTableOperate(PLDHashTable *table, const void *key, PLDHashOperator op)
static PRBool
ChangeTable(PLDHashTable *table, int deltaLog2, PLDHashEntryHdr *skipEntry)
{
int change;
PLDHashNumber keyHash;
PRUint32 i, size, capacity, nbytes, entrySize;
PLDHashEntryHdr *entry, *oldEntry, *newEntry;
char *entryStore, *newEntryStore, *entryAddr;
int oldLog2, newLog2;
PRUint32 oldCapacity, newCapacity;
char *newEntryStore, *oldEntryStore, *oldEntryAddr;
PRUint32 entrySize, i, nbytes;
PLDHashEntryHdr *oldEntry, *newEntry;
PLDHashGetKey getKey;
PLDHashMoveEntry moveEntry;
/* Usually we don't grow or shrink the table. */
change = 0;
/* Look, but don't touch, until we succeed in getting new entry store. */
oldLog2 = table->sizeLog2;
newLog2 = oldLog2 + deltaLog2;
oldCapacity = PR_BIT(oldLog2);
newCapacity = PR_BIT(newLog2);
entrySize = table->entrySize;
nbytes = newCapacity * entrySize;
/* Avoid 0 and 1 hash codes, they indicate free and deleted entries. */
newEntryStore = table->ops->allocTable(table, nbytes);
if (!newEntryStore)
return PR_FALSE;
table->hashShift = PL_DHASH_BITS - newLog2;
table->sizeLog2 = newLog2;
table->sizeMask = PR_BITMASK(newLog2);
table->removedCount = 0;
memset(newEntryStore, 0, nbytes);
oldEntryAddr = oldEntryStore = table->entryStore;
table->entryStore = newEntryStore;
getKey = table->ops->getKey;
moveEntry = table->ops->moveEntry;
/* Copy only live entries, leaving removed ones (and skipEntry) behind. */
for (i = 0; i < oldCapacity; i++) {
oldEntry = (PLDHashEntryHdr *)oldEntryAddr;
if (oldEntry != skipEntry && ENTRY_IS_LIVE(oldEntry)) {
newEntry = SearchTable(table, getKey(table, oldEntry),
oldEntry->keyHash);
PR_ASSERT(PL_DHASH_ENTRY_IS_FREE(newEntry));
moveEntry(table, oldEntry, newEntry);
newEntry->keyHash = oldEntry->keyHash;
}
oldEntryAddr += entrySize;
}
table->ops->freeTable(table, oldEntryStore);
return PR_TRUE;
}
PR_IMPLEMENT(PLDHashEntryHdr *)
PL_DHashTableOperate(PLDHashTable *table, const void *key, PLDHashOperator op)
{
int biasedDeltaLog2;
PLDHashNumber keyHash;
PLDHashEntryHdr *entry;
PRUint32 size;
/*
* Usually we don't grow or shrink the table, so optimize for test-not-zero
* by biasing the deltaLog2 of -1 (shrink), 0 (compress), or 1 (grow) so that
* the biased no-change value is 0.
*/
#define DELTA_LOG2_BIAS 2
biasedDeltaLog2 = 0;
/* Avoid 0 and 1 hash codes, they indicate free and removed entries. */
keyHash = table->ops->hashKey(table, key);
ENSURE_LIVE_KEYHASH(keyHash);
keyHash *= PL_DHASH_GOLDEN_RATIO;
@ -292,12 +352,16 @@ PL_DHashTableOperate(PLDHashTable *table, const void *key, PLDHashOperator op)
entry->keyHash = keyHash;
table->entryCount++;
/* If alpha is >= .75, set change to trigger table growth below. */
/* If alpha is >= .75, set biasedDeltaLog2 to trigger growth or compression. */
size = PR_BIT(table->sizeLog2);
if (table->entryCount + table->removedCount >= size - (size >> 2)) {
METER(table->stats.grows++);
change = 1;
capacity = size << 1;
if (table->removedCount >= size >> 2) {
METER(table->stats.compresses++);
biasedDeltaLog2 = 0 + DELTA_LOG2_BIAS;
} else {
METER(table->stats.grows++);
biasedDeltaLog2 = 1 + DELTA_LOG2_BIAS;
}
}
}
METER(else table->stats.addHits++);
@ -313,8 +377,7 @@ PL_DHashTableOperate(PLDHashTable *table, const void *key, PLDHashOperator op)
size = PR_BIT(table->sizeLog2);
if (size > PL_DHASH_MIN_SIZE && table->entryCount <= size >> 2) {
METER(table->stats.shrinks++);
change = -1;
capacity = size >> 1;
biasedDeltaLog2 = -1 + DELTA_LOG2_BIAS;
}
}
METER(else table->stats.removeMisses++);
@ -325,11 +388,8 @@ PL_DHashTableOperate(PLDHashTable *table, const void *key, PLDHashOperator op)
PR_ASSERT(0);
}
if (change) {
entrySize = table->entrySize;
nbytes = capacity * entrySize;
newEntryStore = table->ops->allocTable(table, nbytes);
if (!newEntryStore) {
if (biasedDeltaLog2) {
if (!ChangeTable(table, biasedDeltaLog2 - DELTA_LOG2_BIAS, entry)) {
/* If we just grabbed the last free entry, undo and fail hard. */
if (op == PL_DHASH_ADD &&
table->entryCount + table->removedCount == size) {
@ -339,32 +399,8 @@ PL_DHashTableOperate(PLDHashTable *table, const void *key, PLDHashOperator op)
entry = NULL;
}
} else {
memset(newEntryStore, 0, nbytes);
entryStore = table->entryStore;
table->entryStore = newEntryStore;
table->sizeLog2 += change;
table->sizeMask = PR_BITMASK(table->sizeLog2);
table->hashShift = PL_DHASH_BITS - table->sizeLog2;
table->removedCount = 0;
getKey = table->ops->getKey;
moveEntry = table->ops->moveEntry;
entryAddr = entryStore;
for (i = 0; i < size; i++) {
oldEntry = (PLDHashEntryHdr *)entryAddr;
if (oldEntry != entry && ENTRY_IS_LIVE(oldEntry)) {
newEntry = SearchTable(table, getKey(table,oldEntry),
oldEntry->keyHash);
PR_ASSERT(PL_DHASH_ENTRY_IS_FREE(newEntry));
moveEntry(table, oldEntry, newEntry);
newEntry->keyHash = oldEntry->keyHash;
}
entryAddr += entrySize;
}
table->ops->freeTable(table, entryStore);
if (op == PL_DHASH_ADD) {
/* If the table grew or was compressed, add the new (skipped) entry. */
entry = SearchTable(table, key, keyHash);
PR_ASSERT(PL_DHASH_ENTRY_IS_FREE(entry));
entry->keyHash = keyHash;
@ -372,6 +408,8 @@ PL_DHashTableOperate(PLDHashTable *table, const void *key, PLDHashOperator op)
}
}
#undef DELTA_LOG2_BIAS
return entry;
}
@ -388,14 +426,14 @@ PR_IMPLEMENT(PRUint32)
PL_DHashTableEnumerate(PLDHashTable *table, PLDHashEnumerator etor, void *arg)
{
char *entryAddr;
PRUint32 i, j, n, entrySize;
PRUint32 i, j, capacity, entrySize;
PLDHashEntryHdr *entry;
PLDHashOperator op;
entryAddr = table->entryStore;
entrySize = table->entrySize;
n = PR_BIT(table->sizeLog2);
for (i = j = 0; i < n; i++) {
capacity = PR_BIT(table->sizeLog2);
for (i = j = 0; i < capacity; i++) {
entry = (PLDHashEntryHdr *)entryAddr;
if (ENTRY_IS_LIVE(entry)) {
op = etor(table, entry, j++, arg);
@ -408,12 +446,23 @@ PL_DHashTableEnumerate(PLDHashTable *table, PLDHashEnumerator etor, void *arg)
}
entryAddr += entrySize;
}
/* Shrink or compress if enough entries were removed that alpha < .5. */
if (table->removedCount >= capacity >> 2) {
METER(table->stats.enumShrinks++);
capacity = table->entryCount;
capacity += capacity >> 1;
if (capacity < PL_DHASH_MIN_SIZE)
capacity = PL_DHASH_MIN_SIZE;
(void) ChangeTable(table,
PR_CeilingLog2(capacity) - table->sizeLog2,
NULL);
}
return j;
}
#ifdef PL_DHASHMETER
#include <math.h>
#include <stdio.h>
PR_IMPLEMENT(void)
PL_DHashTableDumpMeter(PLDHashTable *table, PLDHashEnumerator dump, FILE *fp)
@ -490,6 +539,8 @@ PL_DHashTableDumpMeter(PLDHashTable *table, PLDHashEnumerator dump, FILE *fp)
fprintf(fp, " removes while enumerating: %u\n", table->stats.removeEnums);
fprintf(fp, " number of grows: %u\n", table->stats.grows);
fprintf(fp, " number of shrinks: %u\n", table->stats.shrinks);
fprintf(fp, " number of compresses: %u\n", table->stats.compresses);
fprintf(fp, "number of enumerate shrinks: %u\n", table->stats.enumShrinks);
if (maxChainLen && hash2) {
fputs("Maximum hash chain:\n", fp);

View File

@ -17,7 +17,10 @@
* Copyright (C) 1999,2000 Netscape Communications Corporation.
* All Rights Reserved.
*
* Contributor(s):
* Original Contributor:
* Brendan Eich <brendan@mozilla.org>
*
* Contributor(s):
*
* Alternatively, the contents of this file may be used under the
* terms of the GNU Public License (the "GPL"), in which case the
@ -88,6 +91,70 @@ struct PLDHashEntryHdr {
* A PLDHashTable is currently 8 words (without the PL_DHASHMETER overhead)
* on most architectures, and may be allocated on the stack or within another
* structure or class (see below for the Init and Finish functions to use).
*
* To decide whether to use double hashing vs. chaining, we need to develop a
* trade-off relation, as follows:
*
* Let alpha be the load factor, esize the entry size in words, count the
* entry count, and pow2 the power-of-two table size in entries.
*
* (PLDHashTable overhead) > (PLHashTable overhead)
* (unused table entry space) > (malloc and .next overhead per entry) +
* (buckets overhead)
* (1 - alpha) * esize * pow2 > 2 * count + pow2
*
* Notice that alpha is by definition (count / pow2):
*
* (1 - alpha) * esize * pow2 > 2 * alpha * pow2 + pow2
* (1 - alpha) * esize > 2 * alpha + 1
*
* esize > (1 + 2 * alpha) / (1 - alpha)
*
* This assumes both tables must keep keyHash, key, and value for each entry,
* where key and value point to separately allocated strings or structures.
* If key and value can be combined into one pointer, then the trade-off is:
*
* esize > (1 + 3 * alpha) / (1 - alpha)
*
* If the entry value can be a subtype of PLDHashEntryHdr, rather than a type
* that must be allocated separately and referenced by an entry.value pointer
* member, and provided key's allocation can be fused with its entry's, then
* k (the words wasted per entry with chaining, i.e. the coefficient of alpha
* in the numerator: 2 and 3 in the two inequalities above) is 4.
*
* To see these curves, feed gnuplot input like so:
*
* gnuplot> f(x,k) = (1 + k * x) / (1 - x)
* gnuplot> plot [0:.75] f(x,2), f(x,3), f(x,4)
*
* For k of 2 and a well-loaded table (alpha > .5), esize must be more than 4
* words for chaining to be more space-efficient than double hashing.
*
* Solving for alpha helps us decide when to shrink an underloaded table:
*
* esize > (1 + k * alpha) / (1 - alpha)
* esize - alpha * esize > 1 + k * alpha
* esize - 1 > (k + esize) * alpha
* (esize - 1) / (k + esize) > alpha
*
* alpha < (esize - 1) / (esize + k)
*
* Therefore double hashing should keep alpha >= (esize - 1) / (esize + k),
* assuming esize is not too large (in which case, chaining should probably be
* used for any alpha). For esize=2 and k=3, we want alpha >= .2; for esize=3
* and k=2, we want alpha >= .4. For k=4, esize could be 6, and alpha >= .5
* would still obtain.
*
* The current implementation uses a constant .25 as alpha's lower bound when
* deciding to shrink the table (while respecting PL_DHASH_MIN_SIZE).
*
* Note a qualitative difference between chaining and double hashing: under
* chaining, entry addresses are stable across table shrinks and grows. With
* double hashing, you can't safely hold an entry pointer and use it after an
* ADD or REMOVE operation.
*
* The moral of this story: there is no one-size-fits-all hash table scheme,
* but for small table entry size, and assuming entry address stability is not
* required, double hashing wins.
*/
struct PLDHashTable {
PLDHashTableOps *ops; /* virtual operations, see below */
@ -114,6 +181,8 @@ struct PLDHashTable {
PRUint32 removeEnums; /* removes done by Enumerate */
PRUint32 grows; /* table expansions */
PRUint32 shrinks; /* table contractions */
PRUint32 compresses; /* table compressions */
PRUint32 enumShrinks; /* contractions after Enumerate */
} stats;
#endif
};