mirror of
https://github.com/FEX-Emu/xxHash.git
synced 2024-11-23 14:39:40 +00:00
simplify integration of other hash algorithms
This commit is contained in:
parent
dadcbc4a0f
commit
e618ec4dcc
@ -34,13 +34,17 @@ LDFLAGS += -pthread
|
||||
LDFLAGS += -maes -mavx2
|
||||
TESTHASHES = 110000000
|
||||
|
||||
HASH_SRC := $(sort $(wildcard allcodecs/*.c allcodecs/*.cc))
|
||||
HASH_OBJ := $(patsubst %.c,%.o,$(HASH_SRC))
|
||||
|
||||
|
||||
.PHONY: default
|
||||
default: release
|
||||
|
||||
.PHONY: all
|
||||
all: release
|
||||
|
||||
collisionsTest: main.o pool.o threading.o sort.o
|
||||
collisionsTest: main.o pool.o threading.o sort.o $(HASH_OBJ)
|
||||
$(CXX) $(CPPFLAGS) $(CXXFLAGS) $^ $(LDFLAGS) -o $@
|
||||
|
||||
main.o: hashes.h xxh3.h xxhash.h
|
||||
@ -68,5 +72,5 @@ test: debug
|
||||
|
||||
.PHONY: clean
|
||||
clean:
|
||||
$(RM) *.o
|
||||
$(RM) *.o allcodecs/*.o
|
||||
$(RM) collisionsTest
|
||||
|
@ -6,18 +6,22 @@ and comparing the result to an "ideal" target.
|
||||
|
||||
The test requires a very large amount of memory.
|
||||
By default, it will generate 24 billion of 64-bit hashes,
|
||||
requiring 192 GB of RAM for their storage.
|
||||
The number of hashes can be modified using command `--nbh=`,
|
||||
but beware that requiring too few hashes will not provide meaningful information on the algorithm's collision performance.
|
||||
requiring __192 GB of RAM__ for their storage.
|
||||
The number of hashes can be modified using command `--nbh=`.
|
||||
be aware that testing the collision ratio of 64-bit hashes
|
||||
requires a very large amount of hashes (several billions) for meaningful measurements.
|
||||
|
||||
To reduce RAM usage, an optional filter can be requested, with `--filter`.
|
||||
It reduces the nb of candidates to analyze, hence associated RAM budget.
|
||||
Be aware that the filter also requires RAM
|
||||
Note that the filter itself requires a lot of RAM
|
||||
(32 GB by default, can be modified using `--filterlog=`,
|
||||
a too small filter will not be efficient, aim at ~2 bytes per hash),
|
||||
and that managing the filter costs a significant CPU budget.
|
||||
and reading and writing into filter cost a significant CPU budget,
|
||||
so this method is slower.
|
||||
It also doesn't allow advanced analysis of partial bitfields,
|
||||
since most hashes will be discarded and not stored.
|
||||
|
||||
The RAM budget will be completed by a list of candidates,
|
||||
When using the filter, the RAM budget consists of the filter and a list of candidates,
|
||||
which will be a fraction of original hash list.
|
||||
Using default settings (24 billion hashes, 32 GB filter),
|
||||
the number of potential candidates should be reduced to less than 2 billions,
|
||||
@ -28,6 +32,45 @@ but storage must allocate an upper bound.
|
||||
|
||||
For the default test, the expected "optimal" collision rate for a 64-bit hash function is ~18 collisions.
|
||||
|
||||
#### How to integrate any hash in the tester
|
||||
|
||||
The build script is expecting to compile files in `./allcodecs`.
|
||||
Put the source code here.
|
||||
This also works if the hash is a single `*.h` file.
|
||||
|
||||
The glue happens in `hashes.h`.
|
||||
In this file, there are 2 sections :
|
||||
- Add the required `#include "header.h"`, and create a wrapper,
|
||||
to respect the format expected by the function pointer.
|
||||
- Add the wrapper, along with the name and an indication of the output width,
|
||||
to the table, at the end of `hashes.h`
|
||||
|
||||
Build with `make`. Locate your new hash with `./collisionsTest -h`,
|
||||
it should be listed.
|
||||
|
||||
|
||||
#### Some advice on how to set up a collisions test
|
||||
|
||||
The test is primarily driven by the amount of RAM available.
|
||||
Here's a method to decide the size of the test.
|
||||
|
||||
Presuming that RAM budget is not plentiful, for this example 32 GB,
|
||||
the `--filter` mode is actually compulsory to measure anything meaningful.
|
||||
Let's plan 50% of memory for the filter, that's 16 GB.
|
||||
This will be good enough to filter about 10% less hashes than this size.
|
||||
Let's round down to 14 G.
|
||||
|
||||
By requesting 14G, the expectation is that the program will automatically
|
||||
size the filter to 16 GB, and expect to store ~1G candidates,
|
||||
leaving enough room to breathe for the system.
|
||||
|
||||
The command line becomes :
|
||||
```
|
||||
./collisionsTest --nbh=14G --filter NameOfHash
|
||||
```
|
||||
|
||||
#### Examples :
|
||||
|
||||
Here are a few results produced with this tester :
|
||||
|
||||
| Name | nb Collisions | Notes |
|
||||
|
38
tests/collisions/allcodecs/dummy.c
Normal file
38
tests/collisions/allcodecs/dummy.c
Normal file
@ -0,0 +1,38 @@
|
||||
/* dummy.c,
|
||||
* a fake hash algorithm, just to test integration capabilities.
|
||||
* Part of xxHash project
|
||||
* Copyright (C) 2012-present, Yann Collet
|
||||
*
|
||||
* GPL v2 License
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License along
|
||||
* with this program; if not, write to the Free Software Foundation, Inc.,
|
||||
* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
*
|
||||
* You can contact the author at :
|
||||
* - xxHash homepage : http://www.xxhash.com
|
||||
* - xxHash source repository : https://github.com/Cyan4973/xxHash
|
||||
*/
|
||||
|
||||
|
||||
#include <dummy.h>
|
||||
|
||||
/*
 * badsum32():
 * A deliberately weak checksum, present only to exercise the
 * hash-integration plumbing of the collisions tester.
 * Adds every input byte to an accumulator initialized with `seed`
 * and returns the running total.
 */
unsigned badsum32(const void* input, size_t len, unsigned seed)
{
    const unsigned char* const bytes = input;
    unsigned total = seed;

    for (size_t i = 0; i < len; i++) {
        total += bytes[i];
    }
    return total;
}
|
44
tests/collisions/allcodecs/dummy.h
Normal file
44
tests/collisions/allcodecs/dummy.h
Normal file
@ -0,0 +1,44 @@
|
||||
/* dummy.h,
|
||||
* a fake hash algorithm, just to test integration capabilities.
|
||||
* Part of xxHash project
|
||||
* Copyright (C) 2012-present, Yann Collet
|
||||
*
|
||||
* GPL v2 License
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License along
|
||||
* with this program; if not, write to the Free Software Foundation, Inc.,
|
||||
* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
*
|
||||
* You can contact the author at :
|
||||
* - xxHash homepage : http://www.xxhash.com
|
||||
* - xxHash source repository : https://github.com/Cyan4973/xxHash
|
||||
*/
|
||||
|
||||
#ifndef DUMMY_H_987987
#define DUMMY_H_987987

#if defined (__cplusplus)
extern "C" {
#endif

#include <stddef.h>   /* size_t */

/*
 * badsum32():
 * Trivially weak byte-sum "hash", provided only to validate the
 * integration of external hash algorithms into the tester.
 * Returns the sum of all input bytes added to `seed`.
 */
unsigned badsum32(const void* input, size_t len, unsigned seed);

#if defined (__cplusplus)
}
#endif

#endif /* DUMMY_H_987987 */
|
@ -39,6 +39,12 @@ typedef union {
|
||||
XXH128_hash_t h128;
|
||||
} UniHash;
|
||||
|
||||
UniHash uniHash32(uint64_t v32)
|
||||
{ UniHash unih;
|
||||
unih.h64 = v32;
|
||||
return unih;
|
||||
}
|
||||
|
||||
UniHash uniHash64(uint64_t v64)
|
||||
{ UniHash unih;
|
||||
unih.h64 = v64;
|
||||
@ -81,7 +87,16 @@ UniHash XXH64_wrapper (const void* data, size_t size)
|
||||
|
||||
/*
 * XXH32_wrapper():
 * Adapts XXH32 (seed fixed at 0) to the generic UniHash-returning
 * function-pointer signature used by the hash function table.
 */
UniHash XXH32_wrapper (const void* data, size_t size)
{
    return uniHash32(XXH32(data, size, 0));
}
|
||||
|
||||
/* === Dummy integration example === */
|
||||
|
||||
#include "dummy.h"
|
||||
|
||||
/*
 * badsum32_wrapper():
 * Adapts the dummy badsum32 checksum (seed fixed at 0) to the generic
 * UniHash-returning signature expected by the hash function table.
 */
UniHash badsum32_wrapper (const void* data, size_t size)
{
    unsigned const sum = badsum32(data, size, 0);
    return uniHash32(sum);
}
|
||||
|
||||
|
||||
@ -96,7 +111,7 @@ typedef struct {
|
||||
int bits;
|
||||
} hashDescription;
|
||||
|
||||
#define HASH_FN_TOTAL 6
|
||||
#define HASH_FN_TOTAL 7
|
||||
|
||||
hashDescription hashfnTable[HASH_FN_TOTAL] = {
|
||||
{ "xxh3" , XXH3_wrapper, 64 },
|
||||
@ -105,6 +120,7 @@ hashDescription hashfnTable[HASH_FN_TOTAL] = {
|
||||
{ "xxh128l", XXH128l_wrapper, 64 },
|
||||
{ "xxh128h", XXH128h_wrapper, 64 },
|
||||
{ "xxh32" , XXH32_wrapper, 32 },
|
||||
{ "badsum32",badsum32_wrapper, 32 },
|
||||
};
|
||||
|
||||
#endif /* HASHES_H_1235465 */
|
||||
|
Loading…
Reference in New Issue
Block a user