From 32030d0033ace8f90310f14e2db8b824fcc0530e Mon Sep 17 00:00:00 2001
From: Yann Collet
Date: Tue, 23 Jul 2019 15:49:54 -0700
Subject: [PATCH] add functions for 128-bit canonical representation

---
 xxh3.h | 23 +++++++++++++++++++++++
 xxhash.h | 24 +++++++++++++++---------
 2 files changed, 38 insertions(+), 9 deletions(-)

diff --git a/xxh3.h b/xxh3.h
index 12bcd46..4de6db0 100644
--- a/xxh3.h
+++ b/xxh3.h
@@ -1543,6 +1543,29 @@ XXH_PUBLIC_API int XXH128_cmp(const void* h128_1, const void* h128_2)
 }
 
 
+/*====== Canonical representation ======*/
+XXH_PUBLIC_API void  /* Serialize hash into dst: the high64 bytes are stored first, then the low64 bytes. */
+XXH128_canonicalFromHash(XXH128_canonical_t* dst, XXH128_hash_t hash)  /* hash is received by value, so the swaps below only modify a local copy */
+{
+    XXH_STATIC_ASSERT(sizeof(XXH128_canonical_t) == sizeof(XXH128_hash_t));  /* the canonical buffer and the hash struct must have the same size */
+    if (XXH_CPU_LITTLE_ENDIAN) {  /* canonical form is big-endian; XXH_swap64 (defined elsewhere) presumably reverses the byte order of each half */
+        hash.high64 = XXH_swap64(hash.high64);
+        hash.low64 = XXH_swap64(hash.low64);
+    }
+    memcpy(dst, &hash.high64, sizeof(hash.high64));  /* high half goes at the start of dst */
+    memcpy((char*)dst + sizeof(hash.high64), &hash.low64, sizeof(hash.low64));  /* low half follows immediately after the high half */
+}
+
+XXH_PUBLIC_API XXH128_hash_t  /* Rebuild a hash value from the canonical bytes pointed to by src. */
+XXH128_hashFromCanonical(const XXH128_canonical_t* src)
+{
+    XXH128_hash_t h;
+    h.high64 = XXH_readBE64(src);  /* high half is read from the start of src; XXH_readBE64 (defined elsewhere) presumably decodes a big-endian 64-bit value */
+    h.low64 = XXH_readBE64(src->digest + 8);  /* low half starts at byte offset 8 of digest */
+    return h;
+}
+
+
 
 
 #ifdef UNDEF_NDEBUG
diff --git a/xxhash.h b/xxhash.h
index 946a01a..1cd5e9c 100644
--- a/xxhash.h
+++ b/xxhash.h
@@ -339,7 +339,7 @@ struct XXH64_state_s {
  * There are still a number of opened questions that community can influence during the experimental period.
  * I'm trying to list a few of them below, though don't consider this list as complete.
  *
- * - 128-bits output type : currently defined as a structure of 2 64-bits fields.
+ * - 128-bits output type : currently defined as a structure of two 64-bits fields.
  * That's because 128-bit values do not exist in C standard.
  * Note that it means that, at byte level, result is not identical depending on endianess.
  * However, at field level, they are identical on all platforms.
@@ -349,23 +349,23 @@ struct XXH64_state_s {
  * Are the names of the inner 64-bit fields important ? Should they be changed ?
  *
  * - Canonical representation : for the 64-bit variant, canonical representation is the same as XXH64() (aka big-endian).
- * What should it be for the 128-bit variant ?
- * Since it's no longer a scalar value, big-endian representation is no longer an obvious choice.
- * One possibility : represent it as the concatenation of two 64-bits canonical representation (aka 2x big-endian)
- * Another one : represent it in the same order as natural order in the struct for little-endian platforms.
- * Less consistent with existing convention for XXH32/XXH64, but may be more natural for little-endian platforms.
+ * For consistency with existing variants, the same rule has been selected for the 128-bit hash,
+ * and canonical representation also uses big-endian convention.
+ * It's less convenient for little-endian cpus (requires swapping registers),
+ * but canonical representation is expected to be useful for serialization and display only,
+ * hence is not a speed critical operation.
  *
  * - Seed type for 128-bits variant : currently, it's a single 64-bit value, like the 64-bit variant.
  * It could be argued that it's more logical to offer a 128-bit seed input parameter for a 128-bit hash.
- * Although it's also more difficult to use, since it requires to declare and pass a structure instead of a value.
- * It would either replace current choice, or add a new one.
+ * But 128-bit seed is more difficult to use, since it requires to pass a structure instead of a scalar value.
+ * Such a variant could either replace current choice, or add a new one.
  * Farmhash, for example, offers both variants (the 128-bits seed variant is called `doubleSeed`).
  * If both 64-bit and 128-bit seeds are possible, which variant should be called XXH128 ?
  *
  * - Result for len==0 : Currently, the result of hashing a zero-length input is `0`.
  * It seems okay as a return value when using all "default" secret and seed (it used to be a request for XXH32/XXH64).
  * But is it still fine to return `0` when secret or seed are non-default ?
- * Are there use case which would depend on a different hash result when the secret is different ?
+ * Are there use cases which would depend on a different hash result for zero-length input when the secret is different ?
  */
 
 #ifdef XXH_NAMESPACE
@@ -514,6 +514,12 @@ XXH_PUBLIC_API int XXH128_isEqual(XXH128_hash_t h1, XXH128_hash_t h2);
 XXH_PUBLIC_API int XXH128_cmp(const void* h128_1, const void* h128_2);
 
 
+/*====== Canonical representation ======*/
+typedef struct { unsigned char digest[16]; } XXH128_canonical_t;  /* 16 raw bytes holding a 128-bit hash in canonical form */
+XXH_PUBLIC_API void XXH128_canonicalFromHash(XXH128_canonical_t* dst, XXH128_hash_t hash);  /* hash value to canonical bytes, written to dst */
+XXH_PUBLIC_API XXH128_hash_t XXH128_hashFromCanonical(const XXH128_canonical_t* src);  /* canonical bytes at src back to a hash value */
+
+
 
 
 #endif /* XXH_NO_LONG_LONG */