mirror of
https://github.com/mozilla/gecko-dev.git
synced 2025-03-01 05:48:26 +00:00
Fix bug 69271, r=waterson, sr=shaver:
- Don't ape java.lang.String's bogo-sampling hash function for "long" (>=16 char) strings. - Add a theory-and-practice comment in pldhash.h that helps analyze when to use double hashing (most of the time) vs. when to use chaining. - Factor ChangeTable out of PL_DHashTableOperate as a subroutine, so it can also be called from PL_DHashTableEnumerate if the latter finds that enough entries have been removed to be worth a shrink or compress cycle.
This commit is contained in:
parent
ae45fb03d7
commit
1e8530d216
@ -17,6 +17,9 @@
|
||||
* Copyright (C) 1999,2000 Netscape Communications Corporation.
|
||||
* All Rights Reserved.
|
||||
*
|
||||
* Original Contributor:
|
||||
* Brendan Eich <brendan@mozilla.org>
|
||||
*
|
||||
* Contributor(s):
|
||||
*
|
||||
* Alternatively, the contents of this file may be used under the
|
||||
@ -35,6 +38,7 @@
|
||||
* Double hashing implementation.
|
||||
* GENERATED BY js/src/plify_jsdhash.sed -- DO NOT EDIT!!!
|
||||
*/
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include "prbit.h"
|
||||
@ -62,22 +66,12 @@ PL_DHashFreeTable(PLDHashTable *table, void *ptr)
|
||||
PR_IMPLEMENT(PLDHashNumber)
|
||||
PL_DHashStringKey(PLDHashTable *table, const void *key)
|
||||
{
|
||||
const char *s;
|
||||
size_t n, m;
|
||||
PLDHashNumber h;
|
||||
const unsigned char *s;
|
||||
|
||||
s = key;
|
||||
n = strlen(s);
|
||||
h = 0;
|
||||
if (n < 16) {
|
||||
/* Hash every char in a short string. */
|
||||
for (; n; s++, n--)
|
||||
h = (h >> 28) ^ (h << 4) ^ *s;
|
||||
} else {
|
||||
/* Sample a la java.lang.String.hash(). */
|
||||
for (m = n / 8; n >= m; s += m, n -= m)
|
||||
h = (h >> 28) ^ (h << 4) ^ *s;
|
||||
}
|
||||
for (s = key; *s != '\0'; s++)
|
||||
h = (h >> (PL_DHASH_BITS - 4)) ^ (h << 4) ^ *s;
|
||||
return h;
|
||||
}
|
||||
|
||||
@ -171,6 +165,17 @@ PL_DHashTableInit(PLDHashTable *table, PLDHashTableOps *ops, void *data,
|
||||
int log2;
|
||||
PRUint32 nbytes;
|
||||
|
||||
#ifdef DEBUG
|
||||
if (entrySize > 6 * sizeof(void *)) {
|
||||
fprintf(stderr,
|
||||
"pldhash: for the table at address 0x%p, the given entrySize"
|
||||
" of %lu %s favors chaining over double hashing.\n",
|
||||
table,
|
||||
(unsigned long) entrySize,
|
||||
(entrySize > 16 * sizeof(void*)) ? "definitely" : "probably");
|
||||
}
|
||||
#endif
|
||||
|
||||
table->ops = ops;
|
||||
table->data = data;
|
||||
if (capacity < PL_DHASH_MIN_SIZE)
|
||||
@ -179,10 +184,11 @@ PL_DHashTableInit(PLDHashTable *table, PLDHashTableOps *ops, void *data,
|
||||
capacity = PR_BIT(log2);
|
||||
table->hashShift = PL_DHASH_BITS - log2;
|
||||
table->sizeLog2 = log2;
|
||||
table->sizeMask = PR_BITMASK(table->sizeLog2);
|
||||
table->sizeMask = PR_BITMASK(log2);
|
||||
table->entrySize = entrySize;
|
||||
table->entryCount = table->removedCount = 0;
|
||||
nbytes = capacity * entrySize;
|
||||
|
||||
table->entryStore = ops->allocTable(table, nbytes);
|
||||
if (!table->entryStore)
|
||||
return PR_FALSE;
|
||||
@ -260,21 +266,75 @@ SearchTable(PLDHashTable *table, const void *key, PLDHashNumber keyHash)
|
||||
return entry;
|
||||
}
|
||||
|
||||
PR_IMPLEMENT(PLDHashEntryHdr *)
|
||||
PL_DHashTableOperate(PLDHashTable *table, const void *key, PLDHashOperator op)
|
||||
static PRBool
|
||||
ChangeTable(PLDHashTable *table, int deltaLog2, PLDHashEntryHdr *skipEntry)
|
||||
{
|
||||
int change;
|
||||
PLDHashNumber keyHash;
|
||||
PRUint32 i, size, capacity, nbytes, entrySize;
|
||||
PLDHashEntryHdr *entry, *oldEntry, *newEntry;
|
||||
char *entryStore, *newEntryStore, *entryAddr;
|
||||
int oldLog2, newLog2;
|
||||
PRUint32 oldCapacity, newCapacity;
|
||||
char *newEntryStore, *oldEntryStore, *oldEntryAddr;
|
||||
PRUint32 entrySize, i, nbytes;
|
||||
PLDHashEntryHdr *oldEntry, *newEntry;
|
||||
PLDHashGetKey getKey;
|
||||
PLDHashMoveEntry moveEntry;
|
||||
|
||||
/* Usually we don't grow or shrink the table. */
|
||||
change = 0;
|
||||
/* Look, but don't touch, until we succeed in getting new entry store. */
|
||||
oldLog2 = table->sizeLog2;
|
||||
newLog2 = oldLog2 + deltaLog2;
|
||||
oldCapacity = PR_BIT(oldLog2);
|
||||
newCapacity = PR_BIT(newLog2);
|
||||
entrySize = table->entrySize;
|
||||
nbytes = newCapacity * entrySize;
|
||||
|
||||
/* Avoid 0 and 1 hash codes, they indicate free and deleted entries. */
|
||||
newEntryStore = table->ops->allocTable(table, nbytes);
|
||||
if (!newEntryStore)
|
||||
return PR_FALSE;
|
||||
|
||||
table->hashShift = PL_DHASH_BITS - newLog2;
|
||||
table->sizeLog2 = newLog2;
|
||||
table->sizeMask = PR_BITMASK(newLog2);
|
||||
table->removedCount = 0;
|
||||
|
||||
memset(newEntryStore, 0, nbytes);
|
||||
oldEntryAddr = oldEntryStore = table->entryStore;
|
||||
table->entryStore = newEntryStore;
|
||||
getKey = table->ops->getKey;
|
||||
moveEntry = table->ops->moveEntry;
|
||||
|
||||
/* Copy only live entries, leaving removed ones (and skipEntry) behind. */
|
||||
for (i = 0; i < oldCapacity; i++) {
|
||||
oldEntry = (PLDHashEntryHdr *)oldEntryAddr;
|
||||
if (oldEntry != skipEntry && ENTRY_IS_LIVE(oldEntry)) {
|
||||
newEntry = SearchTable(table, getKey(table, oldEntry),
|
||||
oldEntry->keyHash);
|
||||
PR_ASSERT(PL_DHASH_ENTRY_IS_FREE(newEntry));
|
||||
moveEntry(table, oldEntry, newEntry);
|
||||
newEntry->keyHash = oldEntry->keyHash;
|
||||
}
|
||||
oldEntryAddr += entrySize;
|
||||
}
|
||||
|
||||
table->ops->freeTable(table, oldEntryStore);
|
||||
return PR_TRUE;
|
||||
}
|
||||
|
||||
PR_IMPLEMENT(PLDHashEntryHdr *)
|
||||
PL_DHashTableOperate(PLDHashTable *table, const void *key, PLDHashOperator op)
|
||||
{
|
||||
int biasedDeltaLog2;
|
||||
PLDHashNumber keyHash;
|
||||
PLDHashEntryHdr *entry;
|
||||
PRUint32 size;
|
||||
|
||||
/*
|
||||
* Usually we don't grow or shrink the table, so optimize for test-not-zero
|
||||
* by biasing the deltaLog2 of -1 (shrink), 0 (compress), or 1 (grow) so that
|
||||
* the biased no-change value is 0.
|
||||
*/
|
||||
#define DELTA_LOG2_BIAS 2
|
||||
|
||||
biasedDeltaLog2 = 0;
|
||||
|
||||
/* Avoid 0 and 1 hash codes, they indicate free and removed entries. */
|
||||
keyHash = table->ops->hashKey(table, key);
|
||||
ENSURE_LIVE_KEYHASH(keyHash);
|
||||
keyHash *= PL_DHASH_GOLDEN_RATIO;
|
||||
@ -292,12 +352,16 @@ PL_DHashTableOperate(PLDHashTable *table, const void *key, PLDHashOperator op)
|
||||
entry->keyHash = keyHash;
|
||||
table->entryCount++;
|
||||
|
||||
/* If alpha is >= .75, set change to trigger table growth below. */
|
||||
/* If alpha is >= .75, set biasedDeltaLog2 to trigger growth. */
|
||||
size = PR_BIT(table->sizeLog2);
|
||||
if (table->entryCount + table->removedCount >= size - (size >> 2)) {
|
||||
METER(table->stats.grows++);
|
||||
change = 1;
|
||||
capacity = size << 1;
|
||||
if (table->removedCount >= size >> 2) {
|
||||
METER(table->stats.compresses++);
|
||||
biasedDeltaLog2 = 0 + DELTA_LOG2_BIAS;
|
||||
} else {
|
||||
METER(table->stats.grows++);
|
||||
biasedDeltaLog2 = 1 + DELTA_LOG2_BIAS;
|
||||
}
|
||||
}
|
||||
}
|
||||
METER(else table->stats.addHits++);
|
||||
@ -313,8 +377,7 @@ PL_DHashTableOperate(PLDHashTable *table, const void *key, PLDHashOperator op)
|
||||
size = PR_BIT(table->sizeLog2);
|
||||
if (size > PL_DHASH_MIN_SIZE && table->entryCount <= size >> 2) {
|
||||
METER(table->stats.shrinks++);
|
||||
change = -1;
|
||||
capacity = size >> 1;
|
||||
biasedDeltaLog2 = -1 + DELTA_LOG2_BIAS;
|
||||
}
|
||||
}
|
||||
METER(else table->stats.removeMisses++);
|
||||
@ -325,11 +388,8 @@ PL_DHashTableOperate(PLDHashTable *table, const void *key, PLDHashOperator op)
|
||||
PR_ASSERT(0);
|
||||
}
|
||||
|
||||
if (change) {
|
||||
entrySize = table->entrySize;
|
||||
nbytes = capacity * entrySize;
|
||||
newEntryStore = table->ops->allocTable(table, nbytes);
|
||||
if (!newEntryStore) {
|
||||
if (biasedDeltaLog2) {
|
||||
if (!ChangeTable(table, biasedDeltaLog2 - DELTA_LOG2_BIAS, entry)) {
|
||||
/* If we just grabbed the last free entry, undo and fail hard. */
|
||||
if (op == PL_DHASH_ADD &&
|
||||
table->entryCount + table->removedCount == size) {
|
||||
@ -339,32 +399,8 @@ PL_DHashTableOperate(PLDHashTable *table, const void *key, PLDHashOperator op)
|
||||
entry = NULL;
|
||||
}
|
||||
} else {
|
||||
memset(newEntryStore, 0, nbytes);
|
||||
entryStore = table->entryStore;
|
||||
table->entryStore = newEntryStore;
|
||||
|
||||
table->sizeLog2 += change;
|
||||
table->sizeMask = PR_BITMASK(table->sizeLog2);
|
||||
table->hashShift = PL_DHASH_BITS - table->sizeLog2;
|
||||
table->removedCount = 0;
|
||||
|
||||
getKey = table->ops->getKey;
|
||||
moveEntry = table->ops->moveEntry;
|
||||
entryAddr = entryStore;
|
||||
for (i = 0; i < size; i++) {
|
||||
oldEntry = (PLDHashEntryHdr *)entryAddr;
|
||||
if (oldEntry != entry && ENTRY_IS_LIVE(oldEntry)) {
|
||||
newEntry = SearchTable(table, getKey(table,oldEntry),
|
||||
oldEntry->keyHash);
|
||||
PR_ASSERT(PL_DHASH_ENTRY_IS_FREE(newEntry));
|
||||
moveEntry(table, oldEntry, newEntry);
|
||||
newEntry->keyHash = oldEntry->keyHash;
|
||||
}
|
||||
entryAddr += entrySize;
|
||||
}
|
||||
table->ops->freeTable(table, entryStore);
|
||||
|
||||
if (op == PL_DHASH_ADD) {
|
||||
/* If the table grew or was compressed, add the new (skipped) entry. */
|
||||
entry = SearchTable(table, key, keyHash);
|
||||
PR_ASSERT(PL_DHASH_ENTRY_IS_FREE(entry));
|
||||
entry->keyHash = keyHash;
|
||||
@ -372,6 +408,8 @@ PL_DHashTableOperate(PLDHashTable *table, const void *key, PLDHashOperator op)
|
||||
}
|
||||
}
|
||||
|
||||
#undef DELTA_LOG2_BIAS
|
||||
|
||||
return entry;
|
||||
}
|
||||
|
||||
@ -388,14 +426,14 @@ PR_IMPLEMENT(PRUint32)
|
||||
PL_DHashTableEnumerate(PLDHashTable *table, PLDHashEnumerator etor, void *arg)
|
||||
{
|
||||
char *entryAddr;
|
||||
PRUint32 i, j, n, entrySize;
|
||||
PRUint32 i, j, capacity, entrySize;
|
||||
PLDHashEntryHdr *entry;
|
||||
PLDHashOperator op;
|
||||
|
||||
entryAddr = table->entryStore;
|
||||
entrySize = table->entrySize;
|
||||
n = PR_BIT(table->sizeLog2);
|
||||
for (i = j = 0; i < n; i++) {
|
||||
capacity = PR_BIT(table->sizeLog2);
|
||||
for (i = j = 0; i < capacity; i++) {
|
||||
entry = (PLDHashEntryHdr *)entryAddr;
|
||||
if (ENTRY_IS_LIVE(entry)) {
|
||||
op = etor(table, entry, j++, arg);
|
||||
@ -408,12 +446,23 @@ PL_DHashTableEnumerate(PLDHashTable *table, PLDHashEnumerator etor, void *arg)
|
||||
}
|
||||
entryAddr += entrySize;
|
||||
}
|
||||
|
||||
/* Shrink or compress if enough entries were removed that alpha < .5. */
|
||||
if (table->removedCount >= capacity >> 2) {
|
||||
METER(table->stats.enumShrinks++);
|
||||
capacity = table->entryCount;
|
||||
capacity += capacity >> 1;
|
||||
if (capacity < PL_DHASH_MIN_SIZE)
|
||||
capacity = PL_DHASH_MIN_SIZE;
|
||||
(void) ChangeTable(table,
|
||||
PR_CeilingLog2(capacity) - table->sizeLog2,
|
||||
NULL);
|
||||
}
|
||||
return j;
|
||||
}
|
||||
|
||||
#ifdef PL_DHASHMETER
|
||||
#include <math.h>
|
||||
#include <stdio.h>
|
||||
|
||||
PR_IMPLEMENT(void)
|
||||
PL_DHashTableDumpMeter(PLDHashTable *table, PLDHashEnumerator dump, FILE *fp)
|
||||
@ -490,6 +539,8 @@ PL_DHashTableDumpMeter(PLDHashTable *table, PLDHashEnumerator dump, FILE *fp)
|
||||
fprintf(fp, " removes while enumerating: %u\n", table->stats.removeEnums);
|
||||
fprintf(fp, " number of grows: %u\n", table->stats.grows);
|
||||
fprintf(fp, " number of shrinks: %u\n", table->stats.shrinks);
|
||||
fprintf(fp, " number of compresses: %u\n", table->stats.compresses);
|
||||
fprintf(fp, "number of enumerate shrinks: %u\n", table->stats.enumShrinks);
|
||||
|
||||
if (maxChainLen && hash2) {
|
||||
fputs("Maximum hash chain:\n", fp);
|
||||
|
@ -17,7 +17,10 @@
|
||||
* Copyright (C) 1999,2000 Netscape Communications Corporation.
|
||||
* All Rights Reserved.
|
||||
*
|
||||
* Contributor(s):
|
||||
* Original Contributor:
|
||||
* Brendan Eich <brendan@mozilla.org>
|
||||
*
|
||||
* Contributor(s):
|
||||
*
|
||||
* Alternatively, the contents of this file may be used under the
|
||||
* terms of the GNU Public License (the "GPL"), in which case the
|
||||
@ -88,6 +91,70 @@ struct PLDHashEntryHdr {
|
||||
* A PLDHashTable is currently 8 words (without the PL_DHASHMETER overhead)
|
||||
* on most architectures, and may be allocated on the stack or within another
|
||||
* structure or class (see below for the Init and Finish functions to use).
|
||||
*
|
||||
* To decide whether to use double hashing vs. chaining, we need to develop a
|
||||
* trade-off relation, as follows:
|
||||
*
|
||||
* Let alpha be the load factor, esize the entry size in words, count the
|
||||
* entry count, and pow2 the power-of-two table size in entries.
|
||||
*
|
||||
* (PLDHashTable overhead) > (PLHashTable overhead)
|
||||
* (unused table entry space) > (malloc and .next overhead per entry) +
|
||||
* (buckets overhead)
|
||||
* (1 - alpha) * esize * pow2 > 2 * count + pow2
|
||||
*
|
||||
* Notice that alpha is by definition (count / pow2):
|
||||
*
|
||||
* (1 - alpha) * esize * pow2 > 2 * alpha * pow2 + pow2
|
||||
* (1 - alpha) * esize > 2 * alpha + 1
|
||||
*
|
||||
* esize > (1 + 2 * alpha) / (1 - alpha)
|
||||
*
|
||||
* This assumes both tables must keep keyHash, key, and value for each entry,
|
||||
* where key and value point to separately allocated strings or structures.
|
||||
* If key and value can be combined into one pointer, then the trade-off is:
|
||||
*
|
||||
* esize > (1 + 3 * alpha) / (1 - alpha)
|
||||
*
|
||||
* If the entry value can be a subtype of PLDHashEntryHdr, rather than a type
|
||||
* that must be allocated separately and referenced by an entry.value pointer
|
||||
* member, and provided key's allocation can be fused with its entry's, then
|
||||
* k (the words wasted per entry with chaining; 2 and 3 in the two cases above) is 4.
|
||||
*
|
||||
* To see these curves, feed gnuplot input like so:
|
||||
*
|
||||
* gnuplot> f(x,k) = (1 + k * x) / (1 - x)
|
||||
* gnuplot> plot [0:.75] f(x,2), f(x,3), f(x,4)
|
||||
*
|
||||
* For k of 2 and a well-loaded table (alpha > .5), esize must be more than 4
|
||||
* words for chaining to be more space-efficient than double hashing.
|
||||
*
|
||||
* Solving for alpha helps us decide when to shrink an underloaded table:
|
||||
*
|
||||
* esize > (1 + k * alpha) / (1 - alpha)
|
||||
* esize - alpha * esize > 1 + k * alpha
|
||||
* esize - 1 > (k + esize) * alpha
|
||||
* (esize - 1) / (k + esize) > alpha
|
||||
*
|
||||
* alpha < (esize - 1) / (esize + k)
|
||||
*
|
||||
* Therefore double hashing should keep alpha >= (esize - 1) / (esize + k),
|
||||
* assuming esize is not too large (in which case, chaining should probably be
|
||||
* used for any alpha). For esize=2 and k=3, we want alpha >= .2; for esize=3
|
||||
* and k=2, we want alpha >= .4. For k=4, esize could be 6, and alpha >= .5
|
||||
* would still obtain.
|
||||
*
|
||||
* The current implementation uses a constant .25 as alpha's lower bound when
|
||||
* deciding to shrink the table (while respecting PL_DHASH_MIN_SIZE).
|
||||
*
|
||||
* Note a qualitative difference between chaining and double hashing: under
|
||||
* chaining, entry addresses are stable across table shrinks and grows. With
|
||||
* double hashing, you can't safely hold an entry pointer and use it after an
|
||||
* ADD or REMOVE operation.
|
||||
*
|
||||
* The moral of this story: there is no one-size-fits-all hash table scheme,
|
||||
* but for small table entry size, and assuming entry address stability is not
|
||||
* required, double hashing wins.
|
||||
*/
|
||||
struct PLDHashTable {
|
||||
PLDHashTableOps *ops; /* virtual operations, see below */
|
||||
@ -114,6 +181,8 @@ struct PLDHashTable {
|
||||
PRUint32 removeEnums; /* removes done by Enumerate */
|
||||
PRUint32 grows; /* table expansions */
|
||||
PRUint32 shrinks; /* table contractions */
|
||||
PRUint32 compresses; /* table compressions */
|
||||
PRUint32 enumShrinks; /* contractions after Enumerate */
|
||||
} stats;
|
||||
#endif
|
||||
};
|
||||
|
Loading…
x
Reference in New Issue
Block a user