simplify integration of other hash algorithms

This commit is contained in:
Yann Collet 2020-02-28 17:07:12 -08:00
parent dadcbc4a0f
commit e618ec4dcc
5 changed files with 155 additions and 10 deletions

View File

@ -34,13 +34,17 @@ LDFLAGS += -pthread
LDFLAGS += -maes -mavx2
TESTHASHES = 110000000
HASH_SRC := $(sort $(wildcard allcodecs/*.c allcodecs/*.cc))
HASH_OBJ := $(patsubst %.c,%.o,$(HASH_SRC))
.PHONY: default
default: release
.PHONY: all
all: release
collisionsTest: main.o pool.o threading.o sort.o
collisionsTest: main.o pool.o threading.o sort.o $(HASH_OBJ)
$(CXX) $(CPPFLAGS) $(CXXFLAGS) $^ $(LDFLAGS) -o $@
main.o: hashes.h xxh3.h xxhash.h
@ -68,5 +72,5 @@ test: debug
.PHONY: clean
clean:
$(RM) *.o
$(RM) *.o allcodecs/*.o
$(RM) collisionsTest

View File

@ -6,18 +6,22 @@ and comparing the result to an "ideal" target.
The test requires a very large amount of memory.
By default, it will generate 24 billion of 64-bit hashes,
requiring 192 GB of RAM for their storage.
The number of hashes can be modified using command `--nbh=`,
but beware that requiring too few hashes will not provide meaningful information on the algorithm's collision performance.
requiring __192 GB of RAM__ for their storage.
The number of hashes can be modified using command `--nbh=`.
Be aware that testing the collision ratio of 64-bit hashes
requires a very large amount of hashes (several billions) for meaningful measurements.
To reduce RAM usage, an optional filter can be requested, with `--filter`.
It reduces the number of candidates to analyze, and hence the associated RAM budget.
Be aware that the filter also requires RAM
Note that the filter itself requires a lot of RAM
(32 GB by default, can be modified using `--filterlog=`,
a too small filter will not be efficient, aim at ~2 bytes per hash),
and that managing the filter costs a significant CPU budget.
and reading and writing into filter cost a significant CPU budget,
so this method is slower.
It also doesn't allow advanced analysis of partial bitfields,
since most hashes will be discarded and not stored.
The RAM budget will be completed by a list of candidates,
When using the filter, the RAM budget consists of the filter and a list of candidates,
which will be a fraction of original hash list.
Using default settings (24 billions hashes, 32 GB filter),
the number of potential candidates should be reduced to less than 2 billions,
@ -28,6 +32,45 @@ but storage must allocate an upper bound.
For the default test, the expected "optimal" collision rate for a 64-bit hash function is ~18 collisions.
#### How to integrate any hash in the tester
The build script is expecting to compile files in `./allcodecs`.
Put the source code here.
This also works if the hash is a single `*.h` files.
The glue happens in `hashes.h`.
In this file, there are 2 sections :
- Add the required `#include "header.h"`, and create a wrapper,
to respect the format expected by the function pointer.
- Add the wrapper, along with the name and an indication of the output width,
to the table, at the end of `hashes.h`
Build with `make`. Locate your new hash with `./collisionsTest -h`,
it should be listed.
#### Some advice on how to set up a collisions test
The test is primarily driven by the amount of RAM available.
Here's a method to decide the size of the test.
Presuming that RAM budget is not plentiful, for this example 32 GB,
the `--filter` mode is actually compulsory to measure anything meaningful.
Let's plan 50% of memory for the filter, that's 16 GB.
This will be good enough to filter about 10% fewer hashes than this size.
Let's round down to 14 G.
By requesting 14G, the expectation is that the program will automatically
size the filter to 16 GB, and expect to store ~1G candidates,
leaving enough room to breathe for the system.
The command line becomes :
```
./collisionsTest --nbh=14G --filter NameOfHash
```
#### Examples :
Here are a few results produced with this tester :
| Name | nb Collisions | Notes |

View File

@ -0,0 +1,38 @@
/* dummy.c,
* a fake hash algorithm, just to test integration capabilities.
* Part of xxHash project
* Copyright (C) 2012-present, Yann Collet
*
* GPL v2 License
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License along
* with this program; if not, write to the Free Software Foundation, Inc.,
* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
*
* You can contact the author at :
* - xxHash homepage : http://www.xxhash.com
* - xxHash source repository : https://github.com/Cyan4973/xxHash
*/
/* Use quotes, not angle brackets: dummy.h is a local project header,
 * so the preprocessor search must start in this file's own directory. */
#include "dummy.h"

/* badsum32() :
 * Intentionally weak "hash": byte-wise sum of `input`, starting from `seed`.
 * It exists only to exercise the hash-integration plumbing of the tester;
 * it has terrible distribution and collides trivially.
 * @input : buffer to checksum (must be readable over `len` bytes)
 * @len   : buffer size, in bytes
 * @seed  : starting value of the accumulator
 * @return: seed + sum of all bytes (unsigned wrap-around is well defined)
 */
unsigned badsum32(const void* input, size_t len, unsigned seed)
{
    unsigned sum = seed;
    /* cast is a no-op in C, but makes the byte view explicit */
    const unsigned char* in8 = (const unsigned char*)input;
    size_t c;
    for (c=0; c<len; c++)
        sum += in8[c];
    return sum;
}

View File

@ -0,0 +1,44 @@
/* dummy.h,
* a fake hash algorithm, just to test integration capabilities.
* Part of xxHash project
* Copyright (C) 2012-present, Yann Collet
*
* GPL v2 License
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License along
* with this program; if not, write to the Free Software Foundation, Inc.,
* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
*
* You can contact the author at :
* - xxHash homepage : http://www.xxhash.com
* - xxHash source repository : https://github.com/Cyan4973/xxHash
*/
#ifndef DUMMY_H_987987
#define DUMMY_H_987987
#if defined (__cplusplus)
extern "C" {
#endif
#include <stddef.h> /* size_t */
/* badsum32() :
 * Deliberately weak checksum: adds every byte of `input` to `seed`
 * and returns the total. Used only to validate integration of a
 * new hash into the collisions tester. */
unsigned badsum32(const void* input, size_t len, unsigned seed);
#if defined (__cplusplus)
}
#endif
#endif /* DUMMY_H_987987 */

View File

@ -39,6 +39,12 @@ typedef union {
XXH128_hash_t h128;
} UniHash;
/* uniHash32() :
 * Wraps a 32-bit hash result into the generic UniHash union,
 * storing it (zero-extended) into the 64-bit field.
 * NOTE(review): the parameter is declared uint64_t although its name
 * suggests a 32-bit value — confirm whether uint32_t was intended. */
UniHash uniHash32(uint64_t v32)
{ UniHash unih;
unih.h64 = v32;
return unih;
}
UniHash uniHash64(uint64_t v64)
{ UniHash unih;
unih.h64 = v64;
@ -81,7 +87,16 @@ UniHash XXH64_wrapper (const void* data, size_t size)
UniHash XXH32_wrapper (const void* data, size_t size)
{
return uniHash64( XXH32(data, size, 0) );
return uniHash32( XXH32(data, size, 0) );
}
/* === Dummy integration example === */
#include "dummy.h"
/* badsum32_wrapper() :
 * Adapts badsum32() to the UniHash-returning signature expected by
 * the tester's function-pointer table; the seed is fixed to 0. */
UniHash badsum32_wrapper (const void* data, size_t size)
{
return uniHash32( badsum32(data, size, 0) );
}
@ -96,7 +111,7 @@ typedef struct {
int bits;
} hashDescription;
#define HASH_FN_TOTAL 6
#define HASH_FN_TOTAL 7
hashDescription hashfnTable[HASH_FN_TOTAL] = {
{ "xxh3" , XXH3_wrapper, 64 },
@ -105,6 +120,7 @@ hashDescription hashfnTable[HASH_FN_TOTAL] = {
{ "xxh128l", XXH128l_wrapper, 64 },
{ "xxh128h", XXH128h_wrapper, 64 },
{ "xxh32" , XXH32_wrapper, 32 },
{ "badsum32",badsum32_wrapper, 32 },
};
#endif /* HASHES_H_1235465 */