diff --git a/CMakeLists.txt b/CMakeLists.txt index 6b9ca7b4..b5abd6f9 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -86,6 +86,7 @@ option(WITH_REDUCED_MEM "Reduced memory usage for special cases (reduces perform option(WITH_NEW_STRATEGIES "Use new strategies" ON) option(WITH_NATIVE_INSTRUCTIONS "Instruct the compiler to use the full instruction set on this host (gcc/clang -march=native)" OFF) +option(WITH_RUNTIME_CPU_DETECTION "Build with runtime detection of CPU architecture" ON) option(WITH_MAINTAINER_WARNINGS "Build with project maintainer warnings" OFF) option(WITH_CODE_COVERAGE "Enable code coverage reporting" OFF) option(WITH_INFLATE_STRICT "Build with strict inflate distance checking" OFF) @@ -287,12 +288,21 @@ if(WITH_NATIVE_INSTRUCTIONS) separate_arguments(NATIVEOPTIONS UNIX_COMMAND "${NATIVEFLAG}") endif() add_compile_options(${NATIVEOPTIONS}) + set(WITH_RUNTIME_CPU_DETECTION OFF) else() message(STATUS "Ignoring WITH_NATIVE_INSTRUCTIONS; not implemented yet on this configuration") set(WITH_NATIVE_INSTRUCTIONS OFF) endif() endif() +# Compile without functable or CPU detection +if(NOT WITH_RUNTIME_CPU_DETECTION) + if(MSVC AND BASEARCH_X86_FOUND) + message(STATUS "WARNING: Microsoft Visual Studio does not support compile time detection of CPU features for \"/arch\" before \"AVX\"") + endif() + add_definitions(-DDISABLE_RUNTIME_CPU_DETECTION) +endif() + # Force disable LTO if WITH_NATIVE_INSTRUCTIONS is not active if(NOT WITH_NATIVE_INSTRUCTIONS) set(CMAKE_INTERPROCEDURAL_OPTIMIZATION OFF) @@ -1302,6 +1312,7 @@ add_feature_info(WITH_OPTIM WITH_OPTIM "Build with optimisation") add_feature_info(WITH_NEW_STRATEGIES WITH_NEW_STRATEGIES "Use new strategies") add_feature_info(WITH_NATIVE_INSTRUCTIONS WITH_NATIVE_INSTRUCTIONS "Instruct the compiler to use the full instruction set on this host (gcc/clang -march=native)") +add_feature_info(WITH_RUNTIME_CPU_DETECTION WITH_RUNTIME_CPU_DETECTION "Build with runtime CPU detection") add_feature_info(WITH_MAINTAINER_WARNINGS WITH_MAINTAINER_WARNINGS "Build with project maintainer warnings") add_feature_info(WITH_CODE_COVERAGE WITH_CODE_COVERAGE "Enable code coverage reporting") add_feature_info(WITH_INFLATE_STRICT WITH_INFLATE_STRICT "Build with strict inflate distance checking") diff --git a/README.md b/README.md index 123a4bbc..411621b5 100644 --- a/README.md +++ b/README.md @@ -94,20 +94,21 @@ make test Build Options ------------- -| CMake | configure | Description | Default | -|:-------------------------|:-------------------------|:--------------------------------------------------------------------------------------|---------| -| ZLIB_COMPAT | --zlib-compat | Compile with zlib compatible API | OFF | -| ZLIB_ENABLE_TESTS | | Build test binaries | ON | -| WITH_GZFILEOP | --without-gzfileops | Compile with support for gzFile related functions | ON | -| WITH_OPTIM | --without-optimizations | Build with optimisations | ON | -| WITH_NEW_STRATEGIES | --without-new-strategies | Use new strategies | ON | -| WITH_NATIVE_INSTRUCTIONS | | Compiles with full instruction set supported on this host (gcc/clang -march=native) | OFF | -| WITH_SANITIZER | | Build with sanitizer (memory, address, undefined) | OFF | -| WITH_GTEST | | Build gtest_zlib | ON | -| WITH_FUZZERS | | Build test/fuzz | OFF | -| WITH_BENCHMARKS | | Build test/benchmarks | OFF | -| WITH_MAINTAINER_WARNINGS | | Build with project maintainer warnings | OFF | -| WITH_CODE_COVERAGE | | Enable code coverage reporting | OFF | +| CMake | configure | Description | Default | +|:---------------------------|:-------------------------|:------------------------------------------------------------------------------------|---------| +| ZLIB_COMPAT | --zlib-compat | Compile with zlib compatible API | OFF | +| ZLIB_ENABLE_TESTS | | Build test binaries | ON | +| WITH_GZFILEOP | --without-gzfileops | Compile with support for gzFile related functions | ON | +| WITH_OPTIM | --without-optimizations | Build with optimisations | ON | +| WITH_NEW_STRATEGIES | --without-new-strategies | Use new strategies | ON | +| WITH_NATIVE_INSTRUCTIONS | | Compiles with full instruction set supported on this host (gcc/clang -march=native) | OFF | +| WITH_RUNTIME_CPU_DETECTION | | Compiles with runtime CPU detection | ON | +| WITH_SANITIZER | | Build with sanitizer (memory, address, undefined) | OFF | +| WITH_GTEST | | Build gtest_zlib | ON | +| WITH_FUZZERS | | Build test/fuzz | OFF | +| WITH_BENCHMARKS | | Build test/benchmarks | OFF | +| WITH_MAINTAINER_WARNINGS | | Build with project maintainer warnings | OFF | +| WITH_CODE_COVERAGE | | Enable code coverage reporting | OFF | Install diff --git a/arch/arm/arm_functions.h b/arch/arm/arm_functions.h index 95a9a7e0..61c68271 100644 --- a/arch/arm/arm_functions.h +++ b/arch/arm/arm_functions.h @@ -5,7 +5,6 @@ #ifndef ARM_FUNCTIONS_H_ #define ARM_FUNCTIONS_H_ - #ifdef ARM_NEON uint32_t adler32_neon(uint32_t adler, const uint8_t *buf, size_t len); uint32_t chunksize_neon(void); @@ -28,4 +27,39 @@ uint32_t crc32_acle(uint32_t crc, const uint8_t *buf, size_t len); void slide_hash_armv6(deflate_state *s); #endif + +#ifdef DISABLE_RUNTIME_CPU_DETECTION +// ARM - SIMD +# if (defined(ARM_SIMD) && defined(__ARM_FEATURE_SIMD32)) || defined(ARM_NOCHECK_SIMD) +# undef native_slide_hash +# define native_slide_hash slide_hash_armv6 +# endif +// ARM - NEON +# if (defined(ARM_NEON) && (defined(__ARM_NEON__) || defined(__ARM_NEON))) || ARM_NOCHECK_NEON +# undef native_adler32 +# define native_adler32 adler32_neon +# undef native_chunkmemset_safe +# define native_chunkmemset_safe chunkmemset_safe_neon +# undef native_chunksize +# define native_chunksize chunksize_neon +# undef native_inflate_fast +# define native_inflate_fast inflate_fast_neon +# undef native_slide_hash +# define native_slide_hash slide_hash_neon +# ifdef HAVE_BUILTIN_CTZLL +# undef native_compare256 +# define native_compare256 compare256_neon +# undef native_longest_match +# define native_longest_match longest_match_neon +# undef native_longest_match_slow +# define native_longest_match_slow longest_match_slow_neon +# endif +# endif +// ARM - ACLE +# if defined(ARM_ACLE) && defined(__ARM_ACLE) && defined(__ARM_FEATURE_CRC32) +# undef native_crc32 +# define native_crc32 crc32_acle +# endif +#endif + #endif /* ARM_FUNCTIONS_H_ */ diff --git a/arch/generic/generic_functions.h b/arch/generic/generic_functions.h index 02b2cdda..997dd4d0 100644 --- a/arch/generic/generic_functions.h +++ b/arch/generic/generic_functions.h @@ -84,4 +84,23 @@ uint32_t longest_match_slow_c(deflate_state *const s, Pos cur_match); # define compare256_generic compare256_c #endif + +#ifdef DISABLE_RUNTIME_CPU_DETECTION +// Generic code +# define native_adler32 adler32_c +# define native_adler32_fold_copy adler32_fold_copy_c +# define native_chunkmemset_safe chunkmemset_safe_c +# define native_chunksize chunksize_c +# define native_crc32 PREFIX(crc32_braid) +# define native_crc32_fold crc32_fold_c +# define native_crc32_fold_copy crc32_fold_copy_c +# define native_crc32_fold_final crc32_fold_final_c +# define native_crc32_fold_reset crc32_fold_reset_c +# define native_inflate_fast inflate_fast_c +# define native_slide_hash slide_hash_c +# define native_longest_match longest_match_generic +# define native_longest_match_slow longest_match_slow_generic +# define native_compare256 compare256_generic +#endif + #endif diff --git a/arch/power/power_functions.h b/arch/power/power_functions.h index c64eafcd..cb6b7650 100644 --- a/arch/power/power_functions.h +++ b/arch/power/power_functions.h @@ -27,4 +27,41 @@ uint32_t longest_match_power9(deflate_state *const s, Pos cur_match); uint32_t longest_match_slow_power9(deflate_state *const s, Pos cur_match); #endif + +#ifdef DISABLE_RUNTIME_CPU_DETECTION +// Power - VMX +# if defined(PPC_VMX) && defined(__ALTIVEC__) +# undef native_adler32 +# define native_adler32 adler32_vmx +# undef native_slide_hash +# define native_slide_hash slide_hash_vmx +# endif +// Power8 - VSX +# if defined(POWER8_VSX) && defined(_ARCH_PWR8) && defined(__VSX__) +# undef native_adler32 +# define native_adler32 adler32_power8 +# undef native_chunkmemset_safe +# define native_chunkmemset_safe chunkmemset_safe_power8 +# undef native_chunksize +# define native_chunksize chunksize_power8 +# undef native_inflate_fast +# define native_inflate_fast inflate_fast_power8 +# undef native_slide_hash +# define native_slide_hash slide_hash_power8 +# endif +# if defined(POWER8_VSX_CRC32) && defined(_ARCH_PWR8) && defined(__VSX__) +# undef native_crc32 +# define native_crc32 crc32_power8 +# endif +// Power9 +# if defined(POWER9) && defined(_ARCH_PWR9) +# undef native_compare256 +# define native_compare256 compare256_power9 +# undef native_longest_match +# define native_longest_match longest_match_power9 +# undef native_longest_match_slow +# define native_longest_match_slow longest_match_slow_power9 +# endif +#endif + #endif /* POWER_FUNCTIONS_H_ */ diff --git a/arch/riscv/riscv_functions.h b/arch/riscv/riscv_functions.h index 90b398fb..015b2fbd 100644 --- a/arch/riscv/riscv_functions.h +++ b/arch/riscv/riscv_functions.h @@ -22,4 +22,28 @@ void slide_hash_rvv(deflate_state *s); void inflate_fast_rvv(PREFIX3(stream) *strm, uint32_t start); #endif +#ifdef DISABLE_RUNTIME_CPU_DETECTION +// RISCV - RVV +# if defined(RISCV_RVV) && defined(__riscv_v) && defined(__linux__) +# undef native_adler32 +# define native_adler32 adler32_rvv +# undef native_adler32_fold_copy +# define native_adler32_fold_copy adler32_fold_copy_rvv +# undef native_chunkmemset_safe +# define native_chunkmemset_safe chunkmemset_safe_rvv +# undef native_chunksize +# define native_chunksize chunksize_rvv +# undef native_compare256 +# define native_compare256 compare256_rvv +# undef native_inflate_fast +# define native_inflate_fast inflate_fast_rvv +# undef native_longest_match +# define native_longest_match longest_match_rvv +# undef native_longest_match_slow +# define native_longest_match_slow longest_match_slow_rvv +# undef native_slide_hash +# define native_slide_hash slide_hash_rvv +# endif +#endif + #endif /* RISCV_FUNCTIONS_H_ */ diff --git a/arch/s390/s390_functions.h b/arch/s390/s390_functions.h index e9f3cda4..e9c67978 100644 --- a/arch/s390/s390_functions.h +++ b/arch/s390/s390_functions.h @@ -9,4 +9,12 @@ uint32_t crc32_s390_vx(uint32_t crc, const uint8_t *buf, size_t len); #endif + +#ifdef DISABLE_RUNTIME_CPU_DETECTION +# if defined(S390_CRC32_VX) && defined(__zarch__) && __ARCH__ >= 11 && defined(__VX__) +# undef native_crc32 +# define native_crc32 = crc32_s390_vx +# endif +#endif + #endif diff --git a/arch/x86/x86_functions.h b/arch/x86/x86_functions.h index 55ec4acc..5aa9b317 100644 --- a/arch/x86/x86_functions.h +++ b/arch/x86/x86_functions.h @@ -67,4 +67,106 @@ uint32_t crc32_fold_vpclmulqdq_final(crc32_fold *crc); uint32_t crc32_vpclmulqdq(uint32_t crc32, const uint8_t *buf, size_t len); #endif + +#ifdef DISABLE_RUNTIME_CPU_DETECTION +// X86 - SSE2 +# if (defined(X86_SSE2) && defined(__SSE2__)) || defined(__x86_64__) || defined(_M_X64) || defined(X86_NOCHECK_SSE2) +# undef native_chunkmemset_safe +# define native_chunkmemset_safe chunkmemset_safe_sse2 +# undef native_chunksize +# define native_chunksize chunksize_sse2 +# undef native_inflate_fast +# define native_inflate_fast inflate_fast_sse2 +# undef native_slide_hash +# define native_slide_hash slide_hash_sse2 +# ifdef HAVE_BUILTIN_CTZ +# undef native_compare256 +# define native_compare256 compare256_sse2 +# undef native_longest_match +# define native_longest_match longest_match_sse2 +# undef native_longest_match_slow +# define native_longest_match_slow longest_match_slow_sse2 +# endif +#endif +// X86 - SSSE3 +# if defined(X86_SSSE3) && defined(__SSSE3__) +# undef native_adler32 +# define native_adler32 adler32_ssse3 +# undef native_chunkmemset_safe +# define native_chunkmemset_safe chunkmemset_safe_ssse3 +# undef native_inflate_fast +# define native_inflate_fast inflate_fast_ssse3 +# endif +// X86 - SSE4.2 +# if defined(X86_SSE42) && defined(__SSE4_2__) +# undef native_adler32_fold_copy +# define native_adler32_fold_copy adler32_fold_copy_sse42 +# endif + +// X86 - PCLMUL +#if defined(X86_PCLMULQDQ_CRC) && defined(__PCLMUL__) +# undef native_crc32 +# define native_crc32 crc32_pclmulqdq +# undef native_crc32_fold +# define native_crc32_fold crc32_fold_pclmulqdq +# undef native_crc32_fold_copy +# define native_crc32_fold_copy crc32_fold_pclmulqdq_copy +# undef native_crc32_fold_final +# define native_crc32_fold_final crc32_fold_pclmulqdq_final +# undef native_crc32_fold_reset +# define native_crc32_fold_reset crc32_fold_pclmulqdq_reset +#endif +// X86 - AVX +# if defined(X86_AVX2) && defined(__AVX2__) +# undef native_adler32 +# define native_adler32 adler32_avx2 +# undef native_adler32_fold_copy +# define native_adler32_fold_copy adler32_fold_copy_avx2 +# undef native_chunkmemset_safe +# define native_chunkmemset_safe chunkmemset_safe_avx2 +# undef native_chunksize +# define native_chunksize chunksize_avx2 +# undef native_inflate_fast +# define native_inflate_fast inflate_fast_avx2 +# undef native_slide_hash +# define native_slide_hash slide_hash_avx2 +# ifdef HAVE_BUILTIN_CTZ +# undef native_compare256 +# define native_compare256 compare256_avx2 +# undef native_longest_match +# define native_longest_match longest_match_avx2 +# undef native_longest_match_slow +# define native_longest_match_slow longest_match_slow_avx2 +# endif +# endif + +// X86 - AVX512 (F,DQ,BW,Vl) +# if defined(X86_AVX512) && defined(__AVX512F__) && defined(__AVX512DQ__) && defined(__AVX512BW__) && defined(__AVX512VL__) +# undef native_adler32 +# define native_adler32 adler32_avx512 +# undef native_adler32_fold_copy +# define native_adler32_fold_copy adler32_fold_copy_avx512 +// X86 - AVX512 (VNNI) +# if defined(X86_AVX512VNNI) && defined(__AVX512VNNI__) +# undef native_adler32 +# define native_adler32 adler32_avx512_vnni +# undef native_adler32_fold_copy +# define native_adler32_fold_copy adler32_fold_copy_avx512_vnni +# endif +// X86 - VPCLMULQDQ +# if defined(__PCLMUL__) && defined(__AVX512F__) && defined(__VPCLMULQDQ__) +# undef native_crc32 +# define native_crc32 crc32_vpclmulqdq +# undef native_crc32_fold +# define native_crc32_fold crc32_fold_vpclmulqdq +# undef native_crc32_fold_copy +# define native_crc32_fold_copy crc32_fold_vpclmulqdq_copy +# undef native_crc32_fold_final +# define native_crc32_fold_final crc32_fold_vpclmulqdq_final +# undef native_crc32_fold_reset +# define native_crc32_fold_reset crc32_fold_vpclmulqdq_reset +# endif +# endif +#endif + #endif /* X86_FUNCTIONS_H_ */ diff --git a/deflate.c b/deflate.c index b542815b..cf77eb94 100644 --- a/deflate.c +++ b/deflate.c @@ -190,8 +190,10 @@ int32_t ZNG_CONDEXPORT PREFIX(deflateInit2)(PREFIX3(stream) *strm, int32_t level deflate_state *s; int wrap = 1; +#ifndef DISABLE_RUNTIME_CPU_DETECTION /* Force initialization functable, because deflate captures function pointers from functable. */ functable.force_init(); +#endif if (strm == NULL) return Z_STREAM_ERROR; diff --git a/functable.c b/functable.c index c9444c04..8012a40b 100644 --- a/functable.c +++ b/functable.c @@ -2,6 +2,7 @@ * Copyright (C) 2017 Hans Kristian Rosbach * For conditions of distribution and use, see copyright notice in zlib.h */ +#ifndef DISABLE_RUNTIME_CPU_DETECTION #include "zbuild.h" #include "functable.h" @@ -122,6 +123,7 @@ static void init_functable(void) { # endif } #endif + // X86 - AVX512 (F,DQ,BW,Vl) #ifdef X86_AVX512 if (cf.x86.has_avx512) { ft.adler32 = &adler32_avx512; @@ -348,3 +350,5 @@ Z_INTERNAL struct functable_s functable = { longest_match_slow_stub, slide_hash_stub, }; + +#endif diff --git a/functable.h b/functable.h index b0d64d49..81a386cd 100644 --- a/functable.h +++ b/functable.h @@ -9,6 +9,19 @@ #include "deflate.h" #include "crc32.h" +#ifdef DISABLE_RUNTIME_CPU_DETECTION + +# include "arch_functions.h" + +/* When compiling with native instructions it is not necessary to use functable. + * Instead we use native_ macro indicating the best available variant of arch-specific + * functions for the current platform. + */ +# define FUNCTABLE_CALL(name) native_ ## name +# define FUNCTABLE_FPTR(name) &native_ ## name + +#else + struct functable_s { void (* force_init) (void); uint32_t (* adler32) (uint32_t adler, const uint8_t *buf, size_t len); @@ -32,8 +45,9 @@ Z_INTERNAL extern struct functable_s functable; /* Explicitly indicate functions are conditionally dispatched. */ -#define FUNCTABLE_CALL(name) functable.name -#define FUNCTABLE_FPTR(name) functable.name - +# define FUNCTABLE_CALL(name) functable.name +# define FUNCTABLE_FPTR(name) functable.name + +#endif #endif diff --git a/inflate.c b/inflate.c index 52b0a29e..9acfbc05 100644 --- a/inflate.c +++ b/inflate.c @@ -139,8 +139,10 @@ int32_t ZNG_CONDEXPORT PREFIX(inflateInit2)(PREFIX3(stream) *strm, int32_t windo int32_t ret; struct inflate_state *state; +#ifndef DISABLE_RUNTIME_CPU_DETECTION /* Initialize functable earlier. */ functable.force_init(); +#endif if (strm == NULL) return Z_STREAM_ERROR;