Mirror of https://github.com/mozilla/gecko-dev.git (synced 2024-10-07 18:04:46 +00:00)

Bug 1807473 - Update libjxl and highway r=tnikkel

Differential Revision: https://phabricator.services.mozilla.com/D166317

Parent: bb9d3ed10b
Commit: c9046ede50
@@ -20,11 +20,11 @@ origin:
# Human-readable identifier for this version/release
# Generally "version NNN", "tag SSS", "bookmark SSS"
release: 22e3d7276f4157d4a47586ba9fd91dd6303f441a
release: f670ea580bb70b4113b63b9cdaa42ba9b10cd13a (2022-11-18T10:04:25Z).
# Revision to pull in
# Must be a long or short commit SHA (long preferred)
revision: 22e3d7276f4157d4a47586ba9fd91dd6303f441a
revision: f670ea580bb70b4113b63b9cdaa42ba9b10cd13a
# The package's license, where possible using the mnemonic from
# https://spdx.org/licenses/
@@ -10,9 +10,9 @@ origin:
url: https://github.com/libjxl/libjxl
release: afa493d9c7c8b47b6ce709180a74a49085291776 (2022-11-12T22:27:21Z).
release: 31e38dae584bae991631750ed6a04f1f6323846a (2023-01-09T10:57:58Z).
revision: afa493d9c7c8b47b6ce709180a74a49085291776
revision: 31e38dae584bae991631750ed6a04f1f6323846a
license: Apache-2.0
third_party/highway/BUILD (vendored, 21 changed lines)

@@ -161,6 +161,8 @@ cc_library(
# These are textual because config macros influence them:
"hwy/detect_targets.h", # private
"hwy/targets.h",
# This .cc file #includes itself through foreach_target.h
"hwy/per_target.cc",
# End of list
"hwy/highway.h", # public
"hwy/foreach_target.h", # public
@@ -179,7 +181,10 @@ cc_library(
"hwy/ops/x86_512-inl.h",
# Select avoids recompiling native arch if only non-native changed
] + select({
":compiler_emscripten": ["hwy/ops/wasm_128-inl.h"],
":compiler_emscripten": [
"hwy/ops/wasm_128-inl.h",
"hwy/ops/wasm_256-inl.h",
],
"//conditions:default": [],
}) + select({
"@platforms//cpu:riscv64": ["hwy/ops/rvv-inl.h"],
@@ -201,6 +206,18 @@ cc_library(
],
)

cc_library(
name = "bit_pack",
compatible_with = [],
copts = COPTS,
textual_hdrs = [
"hwy/contrib/bit_pack/bit_pack-inl.h",
],
deps = [
":hwy",
],
)

cc_library(
name = "dot",
compatible_with = [],
@@ -303,6 +320,7 @@ HWY_TESTS = [
("hwy/contrib/algo/", "copy_test"),
("hwy/contrib/algo/", "find_test"),
("hwy/contrib/algo/", "transform_test"),
("hwy/contrib/bit_pack/", "bit_pack_test"),
("hwy/contrib/dot/", "dot_test"),
("hwy/contrib/image/", "image_test"),
("hwy/contrib/math/", "math_test"),
@@ -349,6 +367,7 @@ HWY_TEST_COPTS = select({

HWY_TEST_DEPS = [
":algo",
":bit_pack",
":dot",
":hwy",
":hwy_test_util",
third_party/highway/CMakeLists.txt (vendored, 61 changed lines)

@@ -19,7 +19,13 @@ if(POLICY CMP0083)
cmake_policy(SET CMP0083 NEW)
endif()

project(hwy VERSION 1.0.1) # Keep in sync with highway.h version
# Workaround for 3.19 raising error 'IMPORTED_LOCATION not set for imported
# target "GTest::gtest_main"'.
if(POLICY CMP0111)
cmake_policy(SET CMP0111 OLD)
endif()

project(hwy VERSION 1.0.2) # Keep in sync with highway.h version

# Directly define the ABI version from the cmake project() version values:
set(LIBRARY_VERSION "${hwy_VERSION}")
@@ -27,6 +33,10 @@ set(LIBRARY_SOVERSION ${hwy_VERSION_MAJOR})

set(CMAKE_CXX_EXTENSIONS OFF)

list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake")
# Search for Atomics implementation:
find_package(Atomics REQUIRED)

# Enabled PIE binaries by default if supported.
include(CheckPIESupported OPTIONAL RESULT_VARIABLE CHECK_PIE_SUPPORTED)
if(CHECK_PIE_SUPPORTED)
@@ -51,6 +61,7 @@ set(HWY_WARNINGS_ARE_ERRORS OFF CACHE BOOL "Add -Werror flag?")
set(HWY_ENABLE_CONTRIB ON CACHE BOOL "Include contrib/")
set(HWY_ENABLE_EXAMPLES ON CACHE BOOL "Build examples")
set(HWY_ENABLE_INSTALL ON CACHE BOOL "Install library")
set(HWY_ENABLE_TESTS ON CACHE BOOL "Enable HWY tests")

include(CheckCXXSourceCompiles)
check_cxx_source_compiles(
@@ -111,6 +122,7 @@ set(HWY_SOURCES
hwy/ops/arm_sve-inl.h
hwy/ops/emu128-inl.h
hwy/ops/generic_ops-inl.h
hwy/ops/rvv-inl.h
hwy/ops/scalar-inl.h
hwy/ops/set_macros-inl.h
hwy/ops/shared-inl.h
@@ -225,8 +237,11 @@ else()
endif() # HWY_CMAKE_ARM7

if(HWY_RISCV)
list(APPEND HWY_FLAGS -march=rv64gcv1p0)
if(${CMAKE_CXX_COMPILER_ID} MATCHES "Clang")
# Not yet supported by GCC. When runtime dispatch is supported and
# implemented, we will remove v from the required flags. Until then, using
# clang for RISC-V will require the CPU to support the V extension (1.0).
list(APPEND HWY_FLAGS -march=rv64gcv1p0)
list(APPEND HWY_FLAGS -menable-experimental-extensions)
endif()
endif()
@@ -277,16 +292,29 @@ target_include_directories(hwy PUBLIC
target_compile_features(hwy PUBLIC cxx_std_11)
set_target_properties(hwy PROPERTIES
LINK_DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/hwy/hwy.version)
# For GCC __atomic_store_8, see #887
target_link_libraries(hwy PRIVATE ${ATOMICS_LIBRARIES})
# not supported by MSVC/Clang, safe to skip (we use DLLEXPORT annotations)
if(UNIX AND NOT APPLE)
if(NOT HWY_EMSCRIPTEN)
# For GCC __atomic_store_8, see #887
target_link_libraries(hwy atomic)
endif()
# not supported by MSVC/Clang, safe to skip (we use DLLEXPORT annotations)
set_property(TARGET hwy APPEND_STRING PROPERTY
LINK_FLAGS " -Wl,--version-script=${CMAKE_CURRENT_SOURCE_DIR}/hwy/hwy.version")
endif()

if (CMAKE_SYSTEM_PROCESSOR MATCHES "unknown")
# uname -p is broken on this system. Try uname -m
EXECUTE_PROCESS( COMMAND uname -m
OUTPUT_STRIP_TRAILING_WHITESPACE
ERROR_QUIET
OUTPUT_VARIABLE HWY_ARCH)
else (CMAKE_SYSTEM_PROCESSOR MATCHES "unknown")
set(HWY_ARCH ${CMAKE_SYSTEM_PROCESSOR})
endif (CMAKE_SYSTEM_PROCESSOR MATCHES "unknown")
message(STATUS "Architecture: " ${HWY_ARCH})
if (HWY_ARCH MATCHES "mips")
target_link_options(hwy PUBLIC "LINKER:-z,noexecstack")
endif (HWY_ARCH MATCHES "mips")

if (HWY_ENABLE_CONTRIB)
add_library(hwy_contrib ${HWY_LIBRARY_TYPE} ${HWY_CONTRIB_SOURCES})
target_link_libraries(hwy_contrib hwy)
@@ -426,7 +454,7 @@ endif() # HWY_ENABLE_EXAMPLES

include(CTest)

if(BUILD_TESTING)
if(BUILD_TESTING AND HWY_ENABLE_TESTS)
enable_testing()
include(GoogleTest)

@@ -458,13 +486,6 @@ set(gtest_force_shared_crt ON CACHE BOOL "" FORCE)
add_subdirectory(${CMAKE_CURRENT_BINARY_DIR}/googletest-src
${CMAKE_CURRENT_BINARY_DIR}/googletest-build
EXCLUDE_FROM_ALL)

# The gtest/gtest_main targets carry header search path
# dependencies automatically when using CMake 2.8.11 or
# later. Otherwise we have to add them here ourselves.
if (CMAKE_VERSION VERSION_LESS 2.8.11)
include_directories("${gtest_SOURCE_DIR}/include")
endif()
endif() # HWY_SYSTEM_GTEST

set(HWY_TEST_FILES
@@ -517,7 +538,11 @@ list(APPEND HWY_TEST_FILES
endif() # HWY_ENABLE_CONTRIB

if(HWY_SYSTEM_GTEST)
set(HWY_GTEST_LIBS GTest::GTest GTest::Main)
if (CMAKE_VERSION VERSION_LESS 3.20)
set(HWY_GTEST_LIBS GTest::GTest GTest::Main)
else()
set(HWY_GTEST_LIBS GTest::gtest GTest::gtest_main)
endif()
else()
set(HWY_GTEST_LIBS gtest gtest_main)
endif()
@@ -534,7 +559,9 @@ foreach (TESTFILE IN LISTS HWY_TEST_FILES)
# that include us may set them.
target_compile_options(${TESTNAME} PRIVATE -DHWY_IS_TEST=1)

target_link_libraries(${TESTNAME} ${HWY_TEST_LIBS} ${HWY_GTEST_LIBS})
target_link_libraries(${TESTNAME} PRIVATE ${HWY_TEST_LIBS} ${HWY_GTEST_LIBS})
# For GCC __atomic_store_8, see #887
target_link_libraries(${TESTNAME} PRIVATE ${ATOMICS_LIBRARIES})
# Output test targets in the test directory.
set_target_properties(${TESTNAME} PROPERTIES RUNTIME_OUTPUT_DIRECTORY "tests")
third_party/highway/README.md (vendored, 39 changed lines)

@@ -55,7 +55,8 @@ layouts, and aligned/padded allocations.

Online demos using Compiler Explorer:

- [generating code for multiple targets](https://gcc.godbolt.org/z/n6rx6xK5h) (recommended)
- [multiple targets with dynamic dispatch](https://gcc.godbolt.org/z/zP7MYe9Yf)
(recommended)
- [single target using -m flags](https://gcc.godbolt.org/z/rGnjMevKG)

Projects using Highway: (to add yours, feel free to raise an issue or contact us
@@ -74,6 +75,10 @@ Supported targets: scalar, S-SSE3, SSE4, AVX2, AVX-512, AVX3_DL (~Icelake,
requires opt-in by defining `HWY_WANT_AVX3_DL`), NEON (ARMv7 and v8), SVE, SVE2,
WASM SIMD, RISC-V V.

`HWY_WASM_EMU256` is a 2x unrolled version of wasm128 and is enabled if
`HWY_WANT_WASM2` is defined. This will remain supported until it is potentially
superseded by a future version of WASM.

SVE was initially tested using farm_sve (see acknowledgments).

### Versioning
@@ -134,6 +139,10 @@ Or you can run `run_tests.sh` (`run_tests.bat` on Windows).

Bazel is also supported for building, but it is not as widely used/tested.

When building for Arm v7, a limitation of current compilers requires you to add
`-DHWY_CMAKE_ARM7:BOOL=ON` to the CMake command line; see #834 and #1032. We
understand that work is underway to remove this limitation.

## Quick start

You can use the `benchmark` inside examples/ as a starting point.
@@ -142,6 +151,9 @@ A [quick-reference page](g3doc/quick_reference.md) briefly lists all operations
and their parameters, and the [instruction_matrix](g3doc/instruction_matrix.pdf)
indicates the number of instructions per operation.

The [FAQ](g3doc/faq.md) answers questions about portability, API design and
where to find more information.

We recommend using full SIMD vectors whenever possible for maximum performance
portability. To obtain them, pass a `ScalableTag<float>` (or equivalently
`HWY_FULL(float)`) tag to functions such as `Zero/Set/Load`. There are two
@@ -163,8 +175,8 @@ Due to ADL restrictions, user code calling Highway ops must either:
hn::Add()`; or
* add using-declarations for each op used: `using hwy::HWY_NAMESPACE::Add;`.

Additionally, each function that calls Highway ops must either be prefixed with
`HWY_ATTR`, OR reside between `HWY_BEFORE_NAMESPACE()` and
Additionally, each function that calls Highway ops (such as `Load`) must either
be prefixed with `HWY_ATTR`, OR reside between `HWY_BEFORE_NAMESPACE()` and
`HWY_AFTER_NAMESPACE()`. Lambda functions currently require `HWY_ATTR` before
their opening brace.

@@ -186,6 +198,27 @@ they use static or dynamic dispatch.
[quick-reference](g3doc/quick_reference.md)) if `HWY_TARGET_INCLUDE` is
defined and `foreach_target.h` is included.

When using dynamic dispatch, `foreach_target.h` is included from translation
units (.cc files), not headers. Headers containing vector code shared between
several translation units require a special include guard, for example the
following taken from `examples/skeleton-inl.h`:

```
#if defined(HIGHWAY_HWY_EXAMPLES_SKELETON_INL_H_) == defined(HWY_TARGET_TOGGLE)
#ifdef HIGHWAY_HWY_EXAMPLES_SKELETON_INL_H_
#undef HIGHWAY_HWY_EXAMPLES_SKELETON_INL_H_
#else
#define HIGHWAY_HWY_EXAMPLES_SKELETON_INL_H_
#endif

#include "hwy/highway.h"
// Your vector code
#endif
```

By convention, we name such headers `-inl.h` because their contents (often
function templates) are usually inlined.

## Compiler flags

Applications should be compiled with optimizations enabled - without inlining,
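For orientation only (not part of the diff): the README excerpt above recommends obtaining full vectors via `ScalableTag`. A minimal static-dispatch sketch, assuming the public Highway ops `Lanes`, `Load`, `Mul` and `Store`, might look like the following; the function name `MulExample` and the aligned-array arguments are placeholders:

```
#include "hwy/highway.h"

namespace hn = hwy::HWY_NAMESPACE;

// Multiplies a[] by b[] into out[]; for brevity, assumes the arrays are
// aligned and n is a multiple of the vector lane count.
HWY_ATTR void MulExample(const float* HWY_RESTRICT a,
                         const float* HWY_RESTRICT b,
                         float* HWY_RESTRICT out, size_t n) {
  const hn::ScalableTag<float> d;  // full vector, per the README advice
  for (size_t i = 0; i < n; i += hn::Lanes(d)) {
    const auto va = hn::Load(d, a + i);
    const auto vb = hn::Load(d, b + i);
    hn::Store(hn::Mul(va, vb), d, out + i);
  }
}
```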
third_party/highway/cmake/FindAtomics.cmake (vendored, new file, 56 lines)

@@ -0,0 +1,56 @@
# Original issue:
# * https://gitlab.kitware.com/cmake/cmake/-/issues/23021#note_1098733
#
# For reference:
# * https://gcc.gnu.org/wiki/Atomic/GCCMM
#
# riscv64 specific:
# * https://lists.debian.org/debian-riscv/2022/01/msg00009.html
#
# ATOMICS_FOUND - system has c++ atomics
# ATOMICS_LIBRARIES - libraries needed to use c++ atomics

include(CheckCXXSourceCompiles)

# RISC-V only has 32-bit and 64-bit atomic instructions. GCC is supposed
# to convert smaller atomics to those larger ones via masking and
# shifting like LLVM, but it's a known bug that it does not. This means
# anything that wants to use atomics on 1-byte or 2-byte types needs
# -latomic, but not 4-byte or 8-byte (though it does no harm).
set(atomic_code
"
#include <atomic>
#include <cstdint>
std::atomic<uint8_t> n8 (0); // riscv64
std::atomic<uint64_t> n64 (0); // armel, mipsel, powerpc
int main() {
++n8;
++n64;
return 0;
}")

# https://gitlab.kitware.com/cmake/cmake/-/issues/24063
set(CMAKE_CXX_STANDARD 11)
check_cxx_source_compiles("${atomic_code}" ATOMICS_LOCK_FREE_INSTRUCTIONS)

if(ATOMICS_LOCK_FREE_INSTRUCTIONS)
set(ATOMICS_FOUND TRUE)
set(ATOMICS_LIBRARIES)
else()
set(CMAKE_REQUIRED_LIBRARIES "-latomic")
check_cxx_source_compiles("${atomic_code}" ATOMICS_IN_LIBRARY)
set(CMAKE_REQUIRED_LIBRARIES)
if(ATOMICS_IN_LIBRARY)
set(ATOMICS_LIBRARY atomic)
include(FindPackageHandleStandardArgs)
find_package_handle_standard_args(Atomics DEFAULT_MSG ATOMICS_LIBRARY)
set(ATOMICS_LIBRARIES ${ATOMICS_LIBRARY})
unset(ATOMICS_LIBRARY)
else()
if(Atomics_FIND_REQUIRED)
message(FATAL_ERROR "Neither lock free instructions nor -latomic found.")
endif()
endif()
endif()
unset(atomic_code)
unset(CMAKE_CXX_STANDARD)
third_party/highway/debian/changelog (vendored, 15 changed lines)

@@ -1,3 +1,18 @@
highway (1.0.2-1) UNRELEASED; urgency=medium

* Add ExclusiveNeither, FindKnownFirstTrue, Ne128
* Add 16-bit SumOfLanes/ReorderWidenMulAccumulate/ReorderDemote2To
* Faster sort for low-entropy input, improved pivot selection
* Add GN build system, Highway FAQ, k32v32 type to vqsort
* CMake: Support find_package(GTest), add rvv-inl.h, add HWY_ENABLE_TESTS
* Fix MIPS and C++20 build, Apple LLVM 10.3 detection, EMU128 AllTrue on RVV
* Fix missing exec_prefix, RVV build, warnings, libatomic linking
* Work around GCC 10.4 issue, disabled RDCYCLE, arm7 with vfpv3
* Documentation/example improvements
* Support static dispatch to SVE2_128 and SVE_256

-- Jan Wassenberg <janwas@google.com> Thu, 27 Oct 2022 17:00:00 +0200

highway (1.0.1-1) UNRELEASED; urgency=medium

* Add Eq128, i64 Mul, unsigned->float ConvertTo
third_party/highway/hwy.gni (vendored, new file, 53 lines)

@@ -0,0 +1,53 @@
_hwy = get_path_info("hwy", "abspath")

hwy_public = [
# Public
"$_hwy/aligned_allocator.h",
"$_hwy/base.h",
"$_hwy/cache_control.h",
"$_hwy/per_target.h",
"$_hwy/print.h",

# Public, textual
"$_hwy/foreach_target.h",
"$_hwy/highway_export.h",
"$_hwy/highway.h",
"$_hwy/print-inl.h",

# Private
"$_hwy/detect_compiler_arch.h",
"$_hwy/detect_targets.h",
"$_hwy/targets.h",

# Private, textual:
"$_hwy/ops/arm_neon-inl.h",
"$_hwy/ops/arm_sve-inl.h",
"$_hwy/ops/emu128-inl.h",
"$_hwy/ops/generic_ops-inl.h",
"$_hwy/ops/scalar-inl.h",
"$_hwy/ops/set_macros-inl.h",
"$_hwy/ops/shared-inl.h",
"$_hwy/ops/x86_128-inl.h",
"$_hwy/ops/x86_256-inl.h",
"$_hwy/ops/x86_512-inl.h",
]

hwy_sources = [
"$_hwy/aligned_allocator.cc",
"$_hwy/per_target.cc",
"$_hwy/print.cc",
"$_hwy/targets.cc",
]

hwy_contrib_public = [
"$_hwy/contrib/algo/copy-inl.h",
"$_hwy/contrib/algo/find-inl.h",
"$_hwy/contrib/algo/transform-inl.h",
"$_hwy/contrib/dot/dot-inl.h",
"$_hwy/contrib/image/image.h",
"$_hwy/contrib/math/math-inl.h",
]

hwy_contrib_sources = [
"$_hwy/contrib/image/image.cc",
]
@@ -48,7 +48,7 @@ class SampleObject {

class FakeAllocator {
public:
// static AllocPtr and FreePtr member to be used with the alligned
// static AllocPtr and FreePtr member to be used with the aligned
// allocator. These functions calls the private non-static members.
static void* StaticAlloc(void* opaque, size_t bytes) {
return reinterpret_cast<FakeAllocator*>(opaque)->Alloc(bytes);
|
86
third_party/highway/hwy/base.h
vendored
86
third_party/highway/hwy/base.h
vendored
@ -143,7 +143,7 @@
|
||||
#define HWY_DEFAULT_UNROLL HWY_UNROLL()
|
||||
#else
|
||||
#define HWY_UNROLL(factor)
|
||||
#define HWY_DEFAULT_UNROLL HWY_UNROLL()
|
||||
#define HWY_DEFAULT_UNROLL
|
||||
#endif
|
||||
|
||||
|
||||
@ -293,6 +293,13 @@ struct alignas(16) K64V64 {
|
||||
uint64_t key;
|
||||
};
|
||||
|
||||
// 32 bit key plus 32 bit value. Allows vqsort recursions to terminate earlier
|
||||
// than when considering both to be a 64-bit key.
|
||||
struct alignas(8) K32V32 {
|
||||
uint32_t value; // little-endian layout
|
||||
uint32_t key;
|
||||
};
|
||||
|
||||
#pragma pack(pop)
|
||||
|
||||
static inline HWY_MAYBE_UNUSED bool operator<(const uint128_t& a,
|
||||
@ -304,6 +311,10 @@ static inline HWY_MAYBE_UNUSED bool operator>(const uint128_t& a,
|
||||
const uint128_t& b) {
|
||||
return b < a;
|
||||
}
|
||||
static inline HWY_MAYBE_UNUSED bool operator==(const uint128_t& a,
|
||||
const uint128_t& b) {
|
||||
return a.lo == b.lo && a.hi == b.hi;
|
||||
}
|
||||
|
||||
static inline HWY_MAYBE_UNUSED bool operator<(const K64V64& a,
|
||||
const K64V64& b) {
|
||||
@ -314,6 +325,24 @@ static inline HWY_MAYBE_UNUSED bool operator>(const K64V64& a,
|
||||
const K64V64& b) {
|
||||
return b < a;
|
||||
}
|
||||
static inline HWY_MAYBE_UNUSED bool operator==(const K64V64& a,
|
||||
const K64V64& b) {
|
||||
return a.key == b.key;
|
||||
}
|
||||
|
||||
static inline HWY_MAYBE_UNUSED bool operator<(const K32V32& a,
|
||||
const K32V32& b) {
|
||||
return a.key < b.key;
|
||||
}
|
||||
// Required for std::greater.
|
||||
static inline HWY_MAYBE_UNUSED bool operator>(const K32V32& a,
|
||||
const K32V32& b) {
|
||||
return b < a;
|
||||
}
|
||||
static inline HWY_MAYBE_UNUSED bool operator==(const K32V32& a,
|
||||
const K32V32& b) {
|
||||
return a.key == b.key;
|
||||
}
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
// Controlling overload resolution (SFINAE)
|
||||
@ -369,6 +398,8 @@ HWY_API constexpr bool IsSame() {
|
||||
hwy::EnableIf<sizeof(T) != (bytes)>* = nullptr
|
||||
#define HWY_IF_LANE_SIZE_LT(T, bytes) \
|
||||
hwy::EnableIf<sizeof(T) < (bytes)>* = nullptr
|
||||
#define HWY_IF_LANE_SIZE_GE(T, bytes) \
|
||||
hwy::EnableIf<sizeof(T) >= (bytes)>* = nullptr
|
||||
|
||||
#define HWY_IF_LANES_PER_BLOCK(T, N, LANES) \
|
||||
hwy::EnableIf<HWY_MIN(sizeof(T) * N, 16) / sizeof(T) == (LANES)>* = nullptr
|
||||
@ -401,16 +432,14 @@ struct Relations<uint8_t> {
|
||||
using Unsigned = uint8_t;
|
||||
using Signed = int8_t;
|
||||
using Wide = uint16_t;
|
||||
enum { is_signed = 0 };
|
||||
enum { is_float = 0 };
|
||||
enum { is_signed = 0, is_float = 0 };
|
||||
};
|
||||
template <>
|
||||
struct Relations<int8_t> {
|
||||
using Unsigned = uint8_t;
|
||||
using Signed = int8_t;
|
||||
using Wide = int16_t;
|
||||
enum { is_signed = 1 };
|
||||
enum { is_float = 0 };
|
||||
enum { is_signed = 1, is_float = 0 };
|
||||
};
|
||||
template <>
|
||||
struct Relations<uint16_t> {
|
||||
@ -418,8 +447,7 @@ struct Relations<uint16_t> {
|
||||
using Signed = int16_t;
|
||||
using Wide = uint32_t;
|
||||
using Narrow = uint8_t;
|
||||
enum { is_signed = 0 };
|
||||
enum { is_float = 0 };
|
||||
enum { is_signed = 0, is_float = 0 };
|
||||
};
|
||||
template <>
|
||||
struct Relations<int16_t> {
|
||||
@ -427,8 +455,7 @@ struct Relations<int16_t> {
|
||||
using Signed = int16_t;
|
||||
using Wide = int32_t;
|
||||
using Narrow = int8_t;
|
||||
enum { is_signed = 1 };
|
||||
enum { is_float = 0 };
|
||||
enum { is_signed = 1, is_float = 0 };
|
||||
};
|
||||
template <>
|
||||
struct Relations<uint32_t> {
|
||||
@ -437,8 +464,7 @@ struct Relations<uint32_t> {
|
||||
using Float = float;
|
||||
using Wide = uint64_t;
|
||||
using Narrow = uint16_t;
|
||||
enum { is_signed = 0 };
|
||||
enum { is_float = 0 };
|
||||
enum { is_signed = 0, is_float = 0 };
|
||||
};
|
||||
template <>
|
||||
struct Relations<int32_t> {
|
||||
@ -447,8 +473,7 @@ struct Relations<int32_t> {
|
||||
using Float = float;
|
||||
using Wide = int64_t;
|
||||
using Narrow = int16_t;
|
||||
enum { is_signed = 1 };
|
||||
enum { is_float = 0 };
|
||||
enum { is_signed = 1, is_float = 0 };
|
||||
};
|
||||
template <>
|
||||
struct Relations<uint64_t> {
|
||||
@ -457,8 +482,7 @@ struct Relations<uint64_t> {
|
||||
using Float = double;
|
||||
using Wide = uint128_t;
|
||||
using Narrow = uint32_t;
|
||||
enum { is_signed = 0 };
|
||||
enum { is_float = 0 };
|
||||
enum { is_signed = 0, is_float = 0 };
|
||||
};
|
||||
template <>
|
||||
struct Relations<int64_t> {
|
||||
@ -466,15 +490,13 @@ struct Relations<int64_t> {
|
||||
using Signed = int64_t;
|
||||
using Float = double;
|
||||
using Narrow = int32_t;
|
||||
enum { is_signed = 1 };
|
||||
enum { is_float = 0 };
|
||||
enum { is_signed = 1, is_float = 0 };
|
||||
};
|
||||
template <>
|
||||
struct Relations<uint128_t> {
|
||||
using Unsigned = uint128_t;
|
||||
using Narrow = uint64_t;
|
||||
enum { is_signed = 0 };
|
||||
enum { is_float = 0 };
|
||||
enum { is_signed = 0, is_float = 0 };
|
||||
};
|
||||
template <>
|
||||
struct Relations<float16_t> {
|
||||
@ -482,16 +504,14 @@ struct Relations<float16_t> {
|
||||
using Signed = int16_t;
|
||||
using Float = float16_t;
|
||||
using Wide = float;
|
||||
enum { is_signed = 1 };
|
||||
enum { is_float = 1 };
|
||||
enum { is_signed = 1, is_float = 1 };
|
||||
};
|
||||
template <>
|
||||
struct Relations<bfloat16_t> {
|
||||
using Unsigned = uint16_t;
|
||||
using Signed = int16_t;
|
||||
using Wide = float;
|
||||
enum { is_signed = 1 };
|
||||
enum { is_float = 1 };
|
||||
enum { is_signed = 1, is_float = 1 };
|
||||
};
|
||||
template <>
|
||||
struct Relations<float> {
|
||||
@ -500,8 +520,7 @@ struct Relations<float> {
|
||||
using Float = float;
|
||||
using Wide = double;
|
||||
using Narrow = float16_t;
|
||||
enum { is_signed = 1 };
|
||||
enum { is_float = 1 };
|
||||
enum { is_signed = 1, is_float = 1 };
|
||||
};
|
||||
template <>
|
||||
struct Relations<double> {
|
||||
@ -509,8 +528,7 @@ struct Relations<double> {
|
||||
using Signed = int64_t;
|
||||
using Float = double;
|
||||
using Narrow = float;
|
||||
enum { is_signed = 1 };
|
||||
enum { is_float = 1 };
|
||||
enum { is_signed = 1, is_float = 1 };
|
||||
};
|
||||
|
||||
template <size_t N>
|
||||
@ -649,6 +667,20 @@ constexpr double HighestValue<double>() {
|
||||
return 1.7976931348623158e+308;
|
||||
}
|
||||
|
||||
// Difference between 1.0 and the next representable value.
|
||||
template <typename T>
|
||||
HWY_API constexpr T Epsilon() {
|
||||
return 1;
|
||||
}
|
||||
template <>
|
||||
constexpr float Epsilon<float>() {
|
||||
return 1.192092896e-7f;
|
||||
}
|
||||
template <>
|
||||
constexpr double Epsilon<double>() {
|
||||
return 2.2204460492503131e-16;
|
||||
}
|
||||
|
||||
// Returns width in bits of the mantissa field in IEEE binary32/64.
|
||||
template <typename T>
|
||||
constexpr int MantissaBits() {
|
||||
|
@@ -13,7 +13,7 @@
// See the License for the specific language governing permissions and
// limitations under the License.

#include <algorithm>
#include <algorithm>  // std::find_if
#include <vector>

#include "hwy/aligned_allocator.h"
third_party/highway/hwy/contrib/bit_pack/bit_pack-inl.h (vendored, new file, 1438 lines): file diff suppressed because it is too large.
third_party/highway/hwy/contrib/bit_pack/bit_pack_test.cc (vendored, new file, 177 lines)

@@ -0,0 +1,177 @@
// Copyright 2022 Google LLC
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include <stdio.h>

#include <vector>

#include "hwy/aligned_allocator.h"
#include "hwy/base.h"
#include "hwy/nanobenchmark.h"

// clang-format off
#undef HWY_TARGET_INCLUDE
#define HWY_TARGET_INCLUDE "hwy/contrib/bit_pack/bit_pack_test.cc" // NOLINT
#include "hwy/foreach_target.h" // IWYU pragma: keep

#include "hwy/contrib/bit_pack/bit_pack-inl.h"
#include "hwy/tests/test_util-inl.h"
// clang-format on

#ifndef HWY_BIT_PACK_BENCHMARK
#define HWY_BIT_PACK_BENCHMARK 0
#endif

HWY_BEFORE_NAMESPACE();
namespace hwy {
// Used to prevent running benchmark (slow) for partial vectors and targets
// except the best available. Global, not per-target, hence must be outside
// HWY_NAMESPACE. Declare first because HWY_ONCE is only true after some code
// has been re-included.
extern size_t last_bits;
extern uint64_t best_target;
#if HWY_ONCE
size_t last_bits = 0;
uint64_t best_target = ~0ull;
#endif
namespace HWY_NAMESPACE {

template <size_t kBits, typename T>
T Random(RandomState& rng) {
return static_cast<T>(Random32(&rng) & kBits);
}

template <typename T>
class Checker {
public:
explicit Checker(size_t num) { raw_.reserve(num); }
void NotifyRaw(T raw) { raw_.push_back(raw); }

void NotifyRawOutput(size_t bits, T raw) {
if (raw_[num_verified_] != raw) {
HWY_ABORT("%zu bits: pos %zu of %zu, expected %.0f actual %.0f\n", bits,
num_verified_, raw_.size(),
static_cast<double>(raw_[num_verified_]),
static_cast<double>(raw));
}
++num_verified_;
}

private:
std::vector<T> raw_;
size_t num_verified_ = 0;
};

template <class PackT>
struct TestPack {
template <typename T, class D>
void operator()(T /* t */, D d) {
const size_t N = Lanes(d);
RandomState rng(N * 129);
const size_t num = N * PackT::kRawVectors;
const size_t packed_size = N * PackT::kPackedVectors;
Checker<T> checker(num);
AlignedFreeUniquePtr<T[]> raw = hwy::AllocateAligned<T>(num);
AlignedFreeUniquePtr<T[]> raw2 = hwy::AllocateAligned<T>(num);
AlignedFreeUniquePtr<T[]> packed = hwy::AllocateAligned<T>(packed_size);

for (size_t i = 0; i < num; ++i) {
raw[i] = Random<PackT::kBits, T>(rng);
checker.NotifyRaw(raw[i]);
}

best_target = HWY_MIN(best_target, HWY_TARGET);
const bool run_bench = HWY_BIT_PACK_BENCHMARK &&
(PackT::kBits != last_bits) &&
(HWY_TARGET == best_target);
last_bits = PackT::kBits;

if (run_bench) {
const size_t kNumInputs = 1;
const size_t num_items = num * size_t(Unpredictable1());
const FuncInput inputs[kNumInputs] = {num_items};
Result results[kNumInputs];

Params p;
p.verbose = false;
p.max_evals = 7;
p.target_rel_mad = 0.002;
const size_t num_results = MeasureClosure(
[&](FuncInput) HWY_ATTR {
PackT().Pack(d, raw.get(), packed.get());
PackT().Unpack(d, packed.get(), raw2.get());
return raw2[Random32(&rng) % num];
},
inputs, kNumInputs, results, p);
if (num_results != kNumInputs) {
fprintf(stderr, "MeasureClosure failed.\n");
return;
}
// Print cycles per element
for (size_t i = 0; i < num_results; ++i) {
const double cycles_per_item =
results[i].ticks / static_cast<double>(results[i].input);
const double mad = results[i].variability * cycles_per_item;
printf("Bits:%2d elements:%3d cyc/elt:%6.3f (+/- %5.3f)\n",
static_cast<int>(PackT::kBits),
static_cast<int>(results[i].input), cycles_per_item, mad);
}
} else {
PackT().Pack(d, raw.get(), packed.get());
PackT().Unpack(d, packed.get(), raw2.get());
}

for (size_t i = 0; i < num; ++i) {
checker.NotifyRawOutput(PackT::kBits, raw2[i]);
}
}
};

void TestAllPack8() {
ForShrinkableVectors<TestPack<detail::Pack8<1>>>()(uint8_t());
ForShrinkableVectors<TestPack<detail::Pack8<2>>>()(uint8_t());
ForShrinkableVectors<TestPack<detail::Pack8<3>>>()(uint8_t());
ForShrinkableVectors<TestPack<detail::Pack8<4>>>()(uint8_t());
ForShrinkableVectors<TestPack<detail::Pack8<5>>>()(uint8_t());
ForShrinkableVectors<TestPack<detail::Pack8<6>>>()(uint8_t());
ForShrinkableVectors<TestPack<detail::Pack8<7>>>()(uint8_t());
ForShrinkableVectors<TestPack<detail::Pack8<8>>>()(uint8_t());
}

void TestAllPack16() {
ForShrinkableVectors<TestPack<detail::Pack16<1>>>()(uint16_t());
ForShrinkableVectors<TestPack<detail::Pack16<2>>>()(uint16_t());
ForShrinkableVectors<TestPack<detail::Pack16<3>>>()(uint16_t());
ForShrinkableVectors<TestPack<detail::Pack16<4>>>()(uint16_t());
ForShrinkableVectors<TestPack<detail::Pack16<5>>>()(uint16_t());
ForShrinkableVectors<TestPack<detail::Pack16<6>>>()(uint16_t());
ForShrinkableVectors<TestPack<detail::Pack16<7>>>()(uint16_t());
ForShrinkableVectors<TestPack<detail::Pack16<8>>>()(uint16_t());
}

// NOLINTNEXTLINE(google-readability-namespace-comments)
} // namespace HWY_NAMESPACE
} // namespace hwy
HWY_AFTER_NAMESPACE();

#if HWY_ONCE

namespace hwy {
HWY_BEFORE_TEST(BitPackTest);
HWY_EXPORT_AND_TEST_P(BitPackTest, TestAllPack8);
HWY_EXPORT_AND_TEST_P(BitPackTest, TestAllPack16);
} // namespace hwy

#endif
@@ -15,7 +15,7 @@

#include "hwy/contrib/image/image.h"

#include <algorithm> // swap
#include <algorithm> // std::swap
#include <cstddef>

#undef HWY_TARGET_INCLUDE
@@ -22,7 +22,6 @@
#include <stdint.h>
#include <string.h>

#include <cstddef>
#include <utility> // std::move

#include "hwy/aligned_allocator.h"
@@ -20,6 +20,7 @@
#include <stdio.h>

#include <cfloat> // FLT_MAX
#include <cmath> // std::abs
#include <type_traits>

// clang-format off
third_party/highway/hwy/contrib/sort/BUILD (vendored, 2 changed lines)

@@ -79,6 +79,8 @@ cc_library(
"vqsort_i32d.cc",
"vqsort_i64a.cc",
"vqsort_i64d.cc",
"vqsort_kv64a.cc",
"vqsort_kv64d.cc",
"vqsort_kv128a.cc",
"vqsort_kv128d.cc",
"vqsort_u16a.cc",
third_party/highway/hwy/contrib/sort/README.md (vendored, 16 changed lines)

@@ -9,10 +9,9 @@ and [paper](https://arxiv.org/abs/2205.05982).

## Instructions

Here are instructions for reproducing our results on x86 Linux (AVX2, AVX-512)
and Arm V1 (NEON, SVE).
Here are instructions for reproducing our results on Linux and AWS (SVE, NEON).

### x86 (Linux)
### Linux

Please first ensure golang, and Clang (tested with 13.0.1) are installed via
your system's package manager.
@@ -43,9 +42,10 @@ make -j8 && sudo make install
cd ..
```

AWS clang is at version 11.1, which generates unnecessary AND instructions which
slow down the sort by 1.15x. We tested with clang trunk as of June 13
AWS clang is at version 11.1, which generates unnecessary `AND` instructions
which slow down the sort by 1.15x. We tested with clang trunk as of June 13
(which reports Git hash 8f6512fea000c3a0d394864bb94e524bee375069). To build:

```
git clone --depth 1 https://github.com/llvm/llvm-project.git
cd llvm-project
@@ -64,6 +64,12 @@ bazel-bin/hwy/contrib/sort/sort_test
bazel-bin/hwy/contrib/sort/bench_sort
```

The above command line enables SVE, which is currently only available on
Graviton 3. You can also test NEON on the same processor, or other Arm CPUs, by
changing the `-march=` option to `--copt=-march=armv8.2-a+crypto`. Note that
such flags will be unnecessary once Clang supports `#pragma target` for NEON and
SVE intrinsics, as it does for x86.

## Results

`bench_sort` outputs the instruction set (AVX3 refers to AVX-512), the sort
@@ -20,8 +20,9 @@
#include <stdint.h>
#include <string.h> // memcpy

#include <algorithm>
#include <cmath> // std::abs
#include <algorithm> // std::sort, std::min, std::max
#include <functional> // std::less, std::greater
#include <thread> // NOLINT
#include <vector>

#include "hwy/base.h"
@@ -81,13 +81,12 @@ HWY_NOINLINE void BenchPartition() {
// The pivot value can influence performance. Do exactly what vqsort will
// do so that the performance (influenced by prefetching and branch
// prediction) is likely to predict the actual performance inside vqsort.
detail::PivotResult result;
const auto pivot = detail::ChoosePivot(d, st, aligned.get(), num_lanes,
buf.get(), rng, result);
detail::DrawSamples(d, st, aligned.get(), num_lanes, buf.get(), rng);
detail::SortSamples(d, st, buf.get());
auto pivot = detail::ChoosePivotByRank(d, st, buf.get());

const Timestamp t0;
detail::Partition(d, st, aligned.get(), 0, num_lanes - 1, pivot,
buf.get());
detail::Partition(d, st, aligned.get(), num_lanes - 1, pivot, buf.get());
seconds.push_back(SecondsSince(t0));
// 'Use' the result to prevent optimizing out the partition.
sum += static_cast<double>(aligned.get()[num_lanes / 2]);
@@ -63,15 +63,16 @@ struct SortConstants {
}

// Chunk := group of keys loaded for sampling a pivot. Matches the typical
// cache line size of 64 bytes to get maximum benefit per L2 miss. If vectors
// are larger, use entire vectors to ensure we do not overrun the array.
static constexpr HWY_INLINE size_t LanesPerChunk(size_t sizeof_t, size_t N) {
return HWY_MAX(64 / sizeof_t, N);
// cache line size of 64 bytes to get maximum benefit per L2 miss. Sort()
// ensures vectors are no larger than that, so this can be independent of the
// vector size and thus constexpr.
static constexpr HWY_INLINE size_t LanesPerChunk(size_t sizeof_t) {
return 64 / sizeof_t;
}

static constexpr HWY_INLINE size_t PivotBufNum(size_t sizeof_t, size_t N) {
// 3 chunks of medians, 1 chunk of median medians plus two padding vectors.
return (3 + 1) * LanesPerChunk(sizeof_t, N) + 2 * N;
return (3 + 1) * LanesPerChunk(sizeof_t) + 2 * N;
}

template <typename T>
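As a quick sanity check of the new constants (our own arithmetic, not part of the diff): for 4-byte lanes, LanesPerChunk(4) is 64 / 4 = 16, so PivotBufNum(4, N) = (3 + 1) * 16 + 2 * N = 64 + 2 * N lanes, independent of the vector width N except for the padding term.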
@@ -21,6 +21,7 @@
#include <stdio.h>
#include <string.h> // memcpy

#include <unordered_map>
#include <vector>

// clang-format off
@@ -49,8 +50,10 @@ using detail::TraitsLane;
#if VQSORT_ENABLED || HWY_IDE
using detail::OrderAscending128;
using detail::OrderAscendingKV128;
using detail::OrderAscendingKV64;
using detail::OrderDescending128;
using detail::OrderDescendingKV128;
using detail::OrderDescendingKV64;
using detail::Traits128;

template <class Traits>
@@ -282,10 +285,10 @@ static HWY_NOINLINE void TestPartition() {

const size_t N1 = st.LanesPerKey();
for (bool in_asc : {false, true}) {
for (int left_i : {0, 1, 2, 3, 4, 5, 6, 7, 8, 12, 15, 22, 28, 29, 30, 31}) {
for (int left_i : {0, 1, 4, 6, 7, 8, 12, 15, 22, 28, 30, 31}) {
const size_t left = static_cast<size_t>(left_i) & ~(N1 - 1);
for (size_t ofs : {N, N + 1, N + 2, N + 3, 2 * N, 2 * N + 1, 2 * N + 2,
2 * N + 3, 3 * N - 1, 4 * N - 3, 4 * N - 2}) {
for (size_t ofs : {N, N + 1, N + 3, 2 * N, 2 * N + 2, 2 * N + 3,
3 * N - 1, 4 * N - 3, 4 * N - 2}) {
const size_t len = (base_case_num + ofs) & ~(N1 - 1);
for (LaneType pivot1 :
{LaneType(0), LaneType(len / 3), LaneType(len / 2),
@@ -311,10 +314,12 @@ static HWY_NOINLINE void TestPartition() {
for (size_t i = 0; i < left; ++i) {
lanes[i] = hwy::LowestValue<LaneType>();
}
std::unordered_map<LaneType, int> counts;
for (size_t i = left; i < right; ++i) {
lanes[i] = static_cast<LaneType>(
in_asc ? LaneType(i + 1) - static_cast<LaneType>(left)
: static_cast<LaneType>(right) - LaneType(i));
++counts[lanes[i]];
if (kDebug >= 2) {
printf("%3zu: %f\n", i, static_cast<double>(lanes[i]));
}
@@ -324,7 +329,8 @@ static HWY_NOINLINE void TestPartition() {
}

size_t border =
detail::Partition(d, st, lanes, left, right, pivot, buf.get());
left + detail::Partition(d, st, lanes + left, right - left,
pivot, buf.get());

if (kDebug >= 2) {
printf("out>>>>>>\n");
@@ -335,7 +341,15 @@ static HWY_NOINLINE void TestPartition() {
printf("%3zu: sentinel %f\n", i, static_cast<double>(lanes[i]));
}
}

for (size_t i = left; i < right; ++i) {
--counts[lanes[i]];
}
for (auto kv : counts) {
if (kv.second != 0) {
PrintValue(kv.first);
HWY_ABORT("Incorrect count %d\n", kv.second);
}
}
VerifyPartition(st, lanes, left, border, right, N1, pivot2);
for (size_t i = 0; i < misalign; ++i) {
if (aligned_lanes[i] != hwy::LowestValue<LaneType>())
@@ -357,15 +371,18 @@ static HWY_NOINLINE void TestPartition() {
}

HWY_NOINLINE void TestAllPartition() {
TestPartition<TraitsLane<OrderAscending<int16_t> > >();
TestPartition<TraitsLane<OrderDescending<int32_t> > >();
TestPartition<Traits128<OrderAscending128> >();

#if !HWY_IS_DEBUG_BUILD
TestPartition<TraitsLane<OrderAscending<int16_t> > >();
TestPartition<TraitsLane<OrderAscending<int64_t> > >();
TestPartition<TraitsLane<OrderDescending<float> > >();
#if HWY_HAVE_FLOAT64
TestPartition<TraitsLane<OrderDescending<double> > >();
#endif
TestPartition<Traits128<OrderAscending128> >();
TestPartition<Traits128<OrderDescending128> >();
#endif
}

// (used for sample selection for choosing a pivot)
@@ -436,7 +453,13 @@ class CompareResults {
const size_t num_keys = copy_.size() / st.LanesPerKey();
Run<Order>(reference, reinterpret_cast<KeyType*>(copy_.data()), num_keys,
shared, /*thread=*/0);

#if VQSORT_PRINT >= 3
fprintf(stderr, "\nExpected:\n");
for (size_t i = 0; i < copy_.size(); ++i) {
PrintValue(copy_[i]);
}
fprintf(stderr, "\n");
#endif
for (size_t i = 0; i < copy_.size(); ++i) {
if (copy_[i] != output[i]) {
if (sizeof(KeyType) == 16) {
@@ -546,7 +569,7 @@ void TestSort(size_t num_lanes) {
}

void TestAllSort() {
for (int num : {129, 504, 20 * 1000, 34567}) {
for (int num : {129, 504, 3 * 1000, 34567}) {
const size_t num_lanes = AdjustedReps(static_cast<size_t>(num));
TestSort<TraitsLane<OrderAscending<int16_t> > >(num_lanes);
TestSort<TraitsLane<OrderDescending<uint16_t> > >(num_lanes);
@@ -572,6 +595,9 @@ void TestAllSort() {
TestSort<Traits128<OrderAscending128> >(num_lanes);
TestSort<Traits128<OrderDescending128> >(num_lanes);

TestSort<TraitsLane<OrderAscendingKV64> >(num_lanes);
TestSort<TraitsLane<OrderDescendingKV64> >(num_lanes);

TestSort<Traits128<OrderAscendingKV128> >(num_lanes);
TestSort<Traits128<OrderDescendingKV128> >(num_lanes);
#endif
third_party/highway/hwy/contrib/sort/traits-inl.h (vendored, 157 changed lines)

@@ -42,6 +42,9 @@ namespace detail {
template <typename T>
struct KeyLane {
static constexpr bool Is128() { return false; }
// False indicates the entire key (i.e. lane) should be compared. KV stands
// for key-value.
static constexpr bool IsKV() { return false; }
constexpr size_t LanesPerKey() const { return 1; }

// What type bench_sort should allocate for generating inputs.
@@ -78,7 +81,20 @@ struct KeyLane {
return Eq(a, b);
}

HWY_INLINE bool Equal1(const T* a, const T* b) { return *a == *b; }
template <class D>
HWY_INLINE Mask<D> NotEqualKeys(D /*tag*/, Vec<D> a, Vec<D> b) const {
return Ne(a, b);
}

// For keys=lanes, any difference counts.
template <class D>
HWY_INLINE bool NoKeyDifference(D /*tag*/, Vec<D> diff) const {
// Must avoid floating-point comparisons (for -0)
const RebindToUnsigned<D> du;
return AllTrue(du, Eq(BitCast(du, diff), Zero(du)));
}

HWY_INLINE bool Equal1(const T* a, const T* b) const { return *a == *b; }

template <class D>
HWY_INLINE Vec<D> ReverseKeys(D d, Vec<D> v) const {
@@ -223,7 +239,7 @@ struct OrderAscending : public KeyLane<T> {

template <class D>
HWY_INLINE Vec<D> PrevValue(D d, Vec<D> v) const {
return Sub(v, Set(d, 1));
return Sub(v, Set(d, hwy::Epsilon<T>()));
}
};

@@ -272,7 +288,142 @@ struct OrderDescending : public KeyLane<T> {

template <class D>
HWY_INLINE Vec<D> PrevValue(D d, Vec<D> v) const {
return Add(v, Set(d, 1));
return Add(v, Set(d, hwy::Epsilon<T>()));
}
};

struct KeyValue64 : public KeyLane<uint64_t> {
// True indicates only part of the key (i.e. lane) should be compared. KV
// stands for key-value.
static constexpr bool IsKV() { return true; }

template <class D>
HWY_INLINE Mask<D> EqualKeys(D /*tag*/, Vec<D> a, Vec<D> b) const {
return Eq(ShiftRight<32>(a), ShiftRight<32>(b));
}

template <class D>
HWY_INLINE Mask<D> NotEqualKeys(D /*tag*/, Vec<D> a, Vec<D> b) const {
return Ne(ShiftRight<32>(a), ShiftRight<32>(b));
}

HWY_INLINE bool Equal1(const uint64_t* a, const uint64_t* b) const {
return (*a >> 32) == (*b >> 32);
}

// Only count differences in the actual key, not the value.
template <class D>
HWY_INLINE bool NoKeyDifference(D /*tag*/, Vec<D> diff) const {
// Must avoid floating-point comparisons (for -0)
const RebindToUnsigned<D> du;
const Vec<decltype(du)> zero = Zero(du);
const Vec<decltype(du)> keys = ShiftRight<32>(diff); // clear values
return AllTrue(du, Eq(BitCast(du, keys), zero));
}
};

struct OrderAscendingKV64 : public KeyValue64 {
using Order = SortAscending;

HWY_INLINE bool Compare1(const LaneType* a, const LaneType* b) {
return (*a >> 32) < (*b >> 32);
}

template <class D>
HWY_INLINE Mask<D> Compare(D /* tag */, Vec<D> a, Vec<D> b) const {
return Lt(ShiftRight<32>(a), ShiftRight<32>(b));
}

// Not required to be stable (preserving the order of equivalent keys), so
// we can include the value in the comparison.
template <class D>
HWY_INLINE Vec<D> First(D /* tag */, const Vec<D> a, const Vec<D> b) const {
return Min(a, b);
}

template <class D>
HWY_INLINE Vec<D> Last(D /* tag */, const Vec<D> a, const Vec<D> b) const {
return Max(a, b);
}

template <class D>
HWY_INLINE Vec<D> FirstOfLanes(D d, Vec<D> v,
uint64_t* HWY_RESTRICT /* buf */) const {
return MinOfLanes(d, v);
}

template <class D>
HWY_INLINE Vec<D> LastOfLanes(D d, Vec<D> v,
uint64_t* HWY_RESTRICT /* buf */) const {
return MaxOfLanes(d, v);
}

// Same as for regular lanes.
template <class D>
HWY_INLINE Vec<D> FirstValue(D d) const {
return Set(d, hwy::LowestValue<TFromD<D> >());
}

template <class D>
HWY_INLINE Vec<D> LastValue(D d) const {
return Set(d, hwy::HighestValue<TFromD<D> >());
}

template <class D>
HWY_INLINE Vec<D> PrevValue(D d, Vec<D> v) const {
return Sub(v, Set(d, uint64_t{1}));
}
};

struct OrderDescendingKV64 : public KeyValue64 {
using Order = SortDescending;

HWY_INLINE bool Compare1(const LaneType* a, const LaneType* b) {
return (*b >> 32) < (*a >> 32);
}

template <class D>
HWY_INLINE Mask<D> Compare(D /* tag */, Vec<D> a, Vec<D> b) const {
return Lt(ShiftRight<32>(b), ShiftRight<32>(a));
}

// Not required to be stable (preserving the order of equivalent keys), so
// we can include the value in the comparison.
template <class D>
HWY_INLINE Vec<D> First(D /* tag */, const Vec<D> a, const Vec<D> b) const {
return Max(a, b);
}

template <class D>
HWY_INLINE Vec<D> Last(D /* tag */, const Vec<D> a, const Vec<D> b) const {
return Min(a, b);
}

template <class D>
HWY_INLINE Vec<D> FirstOfLanes(D d, Vec<D> v,
uint64_t* HWY_RESTRICT /* buf */) const {
return MaxOfLanes(d, v);
}

template <class D>
HWY_INLINE Vec<D> LastOfLanes(D d, Vec<D> v,
uint64_t* HWY_RESTRICT /* buf */) const {
return MinOfLanes(d, v);
}

template <class D>
HWY_INLINE Vec<D> FirstValue(D d) const {
return Set(d, hwy::HighestValue<TFromD<D> >());
}

template <class D>
HWY_INLINE Vec<D> LastValue(D d) const {
return Set(d, hwy::LowestValue<TFromD<D> >());
}

template <class D>
HWY_INLINE Vec<D> PrevValue(D d, Vec<D> v) const {
return Add(v, Set(d, uint64_t{1}));
}
};
@@ -124,6 +124,9 @@ struct KeyAny128 {

// Base class shared between OrderAscending128, OrderDescending128.
struct Key128 : public KeyAny128 {
// False indicates the entire key should be compared. KV means key-value.
static constexpr bool IsKV() { return false; }

// What type to pass to Sorter::operator().
using KeyType = hwy::uint128_t;

@@ -134,7 +137,20 @@ struct Key128 : public KeyAny128 {
return Eq128(d, a, b);
}

HWY_INLINE bool Equal1(const LaneType* a, const LaneType* b) {
template <class D>
HWY_INLINE Mask<D> NotEqualKeys(D d, Vec<D> a, Vec<D> b) const {
return Ne128(d, a, b);
}

// For keys=entire 128 bits, any difference counts.
template <class D>
HWY_INLINE bool NoKeyDifference(D /*tag*/, Vec<D> diff) const {
// Must avoid floating-point comparisons (for -0)
const RebindToUnsigned<D> du;
return AllTrue(du, Eq(BitCast(du, diff), Zero(du)));
}

HWY_INLINE bool Equal1(const LaneType* a, const LaneType* b) const {
return a[0] == b[0] && a[1] == b[1];
}
};
@@ -187,8 +203,12 @@ struct OrderAscending128 : public Key128 {

template <class D>
HWY_INLINE Vec<D> PrevValue(D d, Vec<D> v) const {
const Vec<D> k1 = OddEven(Zero(d), Set(d, 1));
return Sub(v, k1);
const Vec<D> k0 = Zero(d);
const Vec<D> k1 = OddEven(k0, Set(d, uint64_t{1}));
const Mask<D> borrow = Eq(v, k0); // don't-care, lo == 0
// lo == 0? 1 : 0, 0
const Vec<D> adjust = ShiftLeftLanes<1>(IfThenElseZero(borrow, k1));
return Sub(Sub(v, k1), adjust);
}
};

@@ -233,13 +253,21 @@ struct OrderDescending128 : public Key128 {

template <class D>
HWY_INLINE Vec<D> PrevValue(D d, Vec<D> v) const {
const Vec<D> k1 = OddEven(Zero(d), Set(d, 1));
return Add(v, k1);
const Vec<D> k1 = OddEven(Zero(d), Set(d, uint64_t{1}));
const Vec<D> added = Add(v, k1);
const Mask<D> overflowed = Lt(added, v); // false, overflowed
// overflowed? 1 : 0, 0
const Vec<D> adjust = ShiftLeftLanes<1>(IfThenElseZero(overflowed, k1));
return Add(added, adjust);
}
};

// Base class shared between OrderAscendingKV128, OrderDescendingKV128.
struct KeyValue128 : public KeyAny128 {
// True indicates only part of the key (the more significant lane) should be
// compared. KV stands for key-value.
static constexpr bool IsKV() { return true; }

// What type to pass to Sorter::operator().
using KeyType = K64V64;

@@ -250,7 +278,22 @@ struct KeyValue128 : public KeyAny128 {
return Eq128Upper(d, a, b);
}

HWY_INLINE bool Equal1(const LaneType* a, const LaneType* b) {
template <class D>
HWY_INLINE Mask<D> NotEqualKeys(D d, Vec<D> a, Vec<D> b) const {
return Ne128Upper(d, a, b);
}

// Only count differences in the actual key, not the value.
template <class D>
HWY_INLINE bool NoKeyDifference(D /*tag*/, Vec<D> diff) const {
// Must avoid floating-point comparisons (for -0)
const RebindToUnsigned<D> du;
const Vec<decltype(du)> zero = Zero(du);
const Vec<decltype(du)> keys = OddEven(diff, zero); // clear values
return AllTrue(du, Eq(BitCast(du, keys), zero));
}

HWY_INLINE bool Equal1(const LaneType* a, const LaneType* b) const {
return a[1] == b[1];
}
};
@@ -296,7 +339,7 @@ struct OrderAscendingKV128 : public KeyValue128 {

template <class D>
HWY_INLINE Vec<D> PrevValue(D d, Vec<D> v) const {
const Vec<D> k1 = OddEven(Zero(d), Set(d, 1));
const Vec<D> k1 = OddEven(Set(d, uint64_t{1}), Zero(d));
return Sub(v, k1);
}
};
@@ -342,7 +385,7 @@ struct OrderDescendingKV128 : public KeyValue128 {

template <class D>
HWY_INLINE Vec<D> PrevValue(D d, Vec<D> v) const {
const Vec<D> k1 = OddEven(Zero(d), Set(d, 1));
const Vec<D> k1 = OddEven(Set(d, uint64_t{1}), Zero(d));
return Add(v, k1);
}
};
third_party/highway/hwy/contrib/sort/vqsort-inl.h (vendored, 1155 changed lines): file diff suppressed because it is too large.
@@ -85,6 +85,9 @@ class HWY_CONTRIB_DLLEXPORT Sorter {
void operator()(K64V64* HWY_RESTRICT keys, size_t n, SortAscending) const;
void operator()(K64V64* HWY_RESTRICT keys, size_t n, SortDescending) const;

void operator()(K32V32* HWY_RESTRICT keys, size_t n, SortAscending) const;
void operator()(K32V32* HWY_RESTRICT keys, size_t n, SortDescending) const;

// For internal use only
static void Fill24Bytes(const void* seed_heap, size_t seed_num, void* bytes);
static bool HaveFloat64();
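For context (not part of the diff): the hunk above adds K32V32 overloads to Sorter, which the new vqsort_kv64a.cc/vqsort_kv64d.cc files below implement by treating each pair as a uint64 whose upper 32 bits are the key. A minimal usage sketch, assuming only the K32V32 struct from hwy/base.h and the overloads shown here, with SortPairsExample as a placeholder name:

```
#include <vector>

#include "hwy/base.h"                 // hwy::K32V32
#include "hwy/contrib/sort/vqsort.h"  // hwy::Sorter, hwy::SortAscending

void SortPairsExample(std::vector<hwy::K32V32>& pairs) {
  // Sorts by the 32-bit key; the associated 32-bit value travels with it.
  hwy::Sorter()(pairs.data(), pairs.size(), hwy::SortAscending());
}
```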
third_party/highway/hwy/contrib/sort/vqsort_kv64a.cc (vendored, new file, 65 lines)

@@ -0,0 +1,65 @@
// Copyright 2022 Google LLC
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "hwy/contrib/sort/vqsort.h"

#undef HWY_TARGET_INCLUDE
// clang-format off
// (avoid line break, which would prevent Copybara rules from matching)
#define HWY_TARGET_INCLUDE "hwy/contrib/sort/vqsort_kv64a.cc" //NOLINT
// clang-format on
#include "hwy/foreach_target.h" // IWYU pragma: keep

// After foreach_target
#include "hwy/contrib/sort/traits-inl.h"
#include "hwy/contrib/sort/vqsort-inl.h"

HWY_BEFORE_NAMESPACE();
namespace hwy {
namespace HWY_NAMESPACE {

void SortKV64Asc(uint64_t* HWY_RESTRICT keys, size_t num,
uint64_t* HWY_RESTRICT buf) {
#if VQSORT_ENABLED
SortTag<uint64_t> d;
detail::SharedTraits<detail::TraitsLane<detail::OrderAscendingKV64>> st;
Sort(d, st, keys, num, buf);
#else
(void) keys;
(void) num;
(void) buf;
HWY_ASSERT(0);
#endif
}

// NOLINTNEXTLINE(google-readability-namespace-comments)
} // namespace HWY_NAMESPACE
} // namespace hwy
HWY_AFTER_NAMESPACE();

#if HWY_ONCE
namespace hwy {
namespace {
HWY_EXPORT(SortKV64Asc);
} // namespace

void Sorter::operator()(K32V32* HWY_RESTRICT keys, size_t n,
SortAscending) const {
HWY_DYNAMIC_DISPATCH(SortKV64Asc)
(reinterpret_cast<uint64_t*>(keys), n, Get<uint64_t>());
}

} // namespace hwy
#endif // HWY_ONCE
65
third_party/highway/hwy/contrib/sort/vqsort_kv64d.cc
vendored
Normal file
@ -0,0 +1,65 @@
// Copyright 2022 Google LLC
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "hwy/contrib/sort/vqsort.h"

#undef HWY_TARGET_INCLUDE
// clang-format off
// (avoid line break, which would prevent Copybara rules from matching)
#define HWY_TARGET_INCLUDE "hwy/contrib/sort/vqsort_kv64d.cc" //NOLINT
// clang-format on
#include "hwy/foreach_target.h" // IWYU pragma: keep

// After foreach_target
#include "hwy/contrib/sort/traits-inl.h"
#include "hwy/contrib/sort/vqsort-inl.h"

HWY_BEFORE_NAMESPACE();
namespace hwy {
namespace HWY_NAMESPACE {

void SortKV64Desc(uint64_t* HWY_RESTRICT keys, size_t num,
uint64_t* HWY_RESTRICT buf) {
#if VQSORT_ENABLED
SortTag<uint64_t> d;
detail::SharedTraits<detail::TraitsLane<detail::OrderDescendingKV64>> st;
Sort(d, st, keys, num, buf);
#else
(void) keys;
(void) num;
(void) buf;
HWY_ASSERT(0);
#endif
}

// NOLINTNEXTLINE(google-readability-namespace-comments)
} // namespace HWY_NAMESPACE
} // namespace hwy
HWY_AFTER_NAMESPACE();

#if HWY_ONCE
namespace hwy {
namespace {
HWY_EXPORT(SortKV64Desc);
} // namespace

void Sorter::operator()(K32V32* HWY_RESTRICT keys, size_t n,
SortDescending) const {
HWY_DYNAMIC_DISPATCH(SortKV64Desc)
(reinterpret_cast<uint64_t*>(keys), n, Get<uint64_t>());
}

} // namespace hwy
#endif // HWY_ONCE
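Both new files follow Highway's per-target compilation pattern: the translation unit includes itself once per enabled target through foreach_target.h, HWY_EXPORT records each generated SortKV64Asc/SortKV64Desc, and the single HWY_ONCE block dispatches to the best one at runtime. A compressed sketch of the same pattern for user code; my_project, scale.cc, ScaleImpl and ScaleTo are illustrative names, not part of Highway:

// scale.cc (hypothetical): HWY_TARGET_INCLUDE must name this very file.
#undef HWY_TARGET_INCLUDE
#define HWY_TARGET_INCLUDE "scale.cc"
#include "hwy/foreach_target.h"  // re-includes this file once per target
#include "hwy/highway.h"

HWY_BEFORE_NAMESPACE();
namespace my_project {
namespace HWY_NAMESPACE {
namespace hn = hwy::HWY_NAMESPACE;

// Sketch only: assumes n is a multiple of the vector length.
void ScaleImpl(float* HWY_RESTRICT p, size_t n, float factor) {
  const hn::ScalableTag<float> d;
  const auto vf = hn::Set(d, factor);
  for (size_t i = 0; i < n; i += hn::Lanes(d)) {
    hn::StoreU(hn::Mul(hn::LoadU(d, p + i), vf), d, p + i);
  }
}

}  // namespace HWY_NAMESPACE
}  // namespace my_project
HWY_AFTER_NAMESPACE();

#if HWY_ONCE
namespace my_project {
HWY_EXPORT(ScaleImpl);  // table of per-target implementations
void ScaleTo(float* HWY_RESTRICT p, size_t n, float factor) {
  HWY_DYNAMIC_DISPATCH(ScaleImpl)(p, n, factor);  // best target at runtime
}
}  // namespace my_project
#endif  // HWY_ONCE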
12
third_party/highway/hwy/detect_compiler_arch.h
vendored
@ -21,7 +21,8 @@

// Add to #if conditions to prevent IDE from graying out code.
#if (defined __CDT_PARSER__) || (defined __INTELLISENSE__) || \
(defined Q_CREATOR_RUN) || (defined(__CLANGD__))
(defined Q_CREATOR_RUN) || (defined __CLANGD__) || \
(defined GROK_ELLIPSIS_BUILD)
#define HWY_IDE 1
#else
#define HWY_IDE 0
@ -69,7 +70,7 @@
// In case of Apple LLVM (whose version number is unrelated to that of LLVM) or
// an invalid version number, deduce it from the presence of warnings.
// Adapted from https://github.com/simd-everywhere/simde/ simde-detect-clang.h.
#if defined(__APPLE__) || __clang_major__ >= 999
#if defined(__apple_build_version__) || __clang_major__ >= 999
#if __has_warning("-Wbitwise-instead-of-logical")
#define HWY_COMPILER_CLANG 1400
#elif __has_warning("-Wreserved-identifier")
@ -85,7 +86,12 @@
#elif __has_warning("-Wextra-semi-stmt") || \
__has_builtin(__builtin_rotateleft32)
#define HWY_COMPILER_CLANG 800
#elif __has_warning("-Wc++98-compat-extra-semi")
// For reasons unknown, XCode 10.3 (Apple LLVM version 10.0.1) is apparently
// based on Clang 7, but does not support the warning we test.
// See https://en.wikipedia.org/wiki/Xcode#Toolchain_versions and
// https://trac.macports.org/wiki/XcodeVersionInfo.
#elif __has_warning("-Wc++98-compat-extra-semi") || \
(defined(__apple_build_version__) && __apple_build_version__ >= 10010000)
#define HWY_COMPILER_CLANG 700
#else // Anything older than 7.0 is not recommended for Highway.
#define HWY_COMPILER_CLANG 600
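The probe works because each Clang release introduces new warning flags, so __has_warning doubles as a version test even when __clang_major__ is meaningless (Apple toolchains report their own numbering). A standalone sketch of the idea, with an illustrative macro name:

// Detect an underlying Clang >= 14 purely by feature probe.
#if defined(__clang__) && defined(__has_warning)
#if __has_warning("-Wbitwise-instead-of-logical")  // first shipped in Clang 14
#define MY_CLANG_AT_LEAST_14 1
#else
#define MY_CLANG_AT_LEAST_14 0
#endif
#else
#define MY_CLANG_AT_LEAST_14 0
#endif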
79
third_party/highway/hwy/detect_targets.h
vendored
@ -23,7 +23,7 @@
//------------------------------------------------------------------------------
// Optional configuration

// See ../quick_reference.md for documentation of these macros.
// See g3doc/quick_reference.md for documentation of these macros.

// Uncomment to override the default baseline determined from predefined macros:
// #define HWY_BASELINE_TARGETS (HWY_SSE4 | HWY_SCALAR)
@ -169,13 +169,14 @@
#define HWY_ENABLED(targets) \
((targets) & ~((HWY_DISABLED_TARGETS) | (HWY_BROKEN_TARGETS)))

// Opt-out for EMU128 (affected by a GCC <12 bug on ARMv7: see
// https://gcc.gnu.org/bugzilla/show_bug.cgi?id=106187). This is separate from
// HWY_BROKEN_TARGETS because it affects the fallback target, which must always
// be enabled. If 1, we instead choose HWY_SCALAR even without
// Opt-out for EMU128 (affected by a GCC bug on multiple arches, fixed in 12.3:
// see https://gcc.gnu.org/bugzilla/show_bug.cgi?id=106322). This is separate
// from HWY_BROKEN_TARGETS because it affects the fallback target, which must
// always be enabled. If 1, we instead choose HWY_SCALAR even without
// HWY_COMPILE_ONLY_SCALAR being set.
#if !defined(HWY_BROKEN_EMU128) // allow overriding
#if HWY_ARCH_ARM_V7 && HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL < 1140
#if (HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL < 1203) || \
defined(HWY_NO_LIBCXX)
#define HWY_BROKEN_EMU128 1
#else
#define HWY_BROKEN_EMU128 0
@ -215,30 +216,45 @@
#define HWY_BASELINE_PPC8 0
#endif

#if HWY_ARCH_ARM && defined(__ARM_FEATURE_SVE2)
#define HWY_BASELINE_SVE2 HWY_SVE2
#else
#define HWY_BASELINE_SVE2 0
#endif

#if HWY_ARCH_ARM && defined(__ARM_FEATURE_SVE)
// Baseline targets can be used unconditionally, which does not apply to
// HWY_SVE_256 because it requires a vector size of 256 bits. Including SVE_256
// in the baseline would also disable all 'worse' targets (including SVE and
// SVE2) in non-test builds. Therefore we instead add HWY_SVE_256 to
// HWY_ATTAINABLE_TARGETS below.
#define HWY_BASELINE_SVE HWY_SVE
#else
#define HWY_BASELINE_SVE 0
#endif
#define HWY_BASELINE_NEON 0

#if HWY_ARCH_ARM

#if defined(__ARM_FEATURE_SVE2)
#undef HWY_BASELINE_SVE2 // was 0, will be re-defined
// If user specified -msve-vector-bits=128, they assert the vector length is
// 128 bits and we should use the HWY_SVE2_128 (more efficient for some ops).
#if defined(__ARM_FEATURE_SVE_BITS) && __ARM_FEATURE_SVE_BITS == 128
#define HWY_BASELINE_SVE2 HWY_SVE2_128
// Otherwise we're not sure what the vector length will be. The baseline must be
// unconditionally valid, so we can only assume HWY_SVE2. However, when running
// on a CPU with 128-bit vectors, user code that supports dynamic dispatch will
// still benefit from HWY_SVE2_128 because we add it to HWY_ATTAINABLE_TARGETS.
#else
#define HWY_BASELINE_SVE2 HWY_SVE2
#endif // __ARM_FEATURE_SVE_BITS
#endif // __ARM_FEATURE_SVE2

#if defined(__ARM_FEATURE_SVE)
#undef HWY_BASELINE_SVE // was 0, will be re-defined
// See above. If user-specified vector length matches our optimization, use it.
#if defined(__ARM_FEATURE_SVE_BITS) && __ARM_FEATURE_SVE_BITS == 256
#define HWY_BASELINE_SVE HWY_SVE_256
#else
#define HWY_BASELINE_SVE HWY_SVE
#endif // __ARM_FEATURE_SVE_BITS
#endif // __ARM_FEATURE_SVE

// GCC 4.5.4 only defines __ARM_NEON__; 5.4 defines both.
#if HWY_ARCH_ARM && (defined(__ARM_NEON__) || defined(__ARM_NEON))
#if defined(__ARM_NEON__) || defined(__ARM_NEON)
#undef HWY_BASELINE_NEON
#define HWY_BASELINE_NEON HWY_NEON
#else
#define HWY_BASELINE_NEON 0
#endif

#endif // HWY_ARCH_ARM

// Special handling for MSVC because it has fewer predefined macros:
#if HWY_COMPILER_MSVC

@ -372,9 +388,12 @@
#endif
// Defining one of HWY_COMPILE_ONLY_* will trump HWY_COMPILE_ALL_ATTAINABLE.

// x86 compilers generally allow runtime dispatch. On Arm, currently only GCC
// does, and we require Linux to detect CPU capabilities.
#if HWY_ARCH_X86 || (HWY_ARCH_ARM && HWY_COMPILER_GCC_ACTUAL && HWY_OS_LINUX)
// Clang, GCC and MSVC allow runtime dispatch on x86.
#if HWY_ARCH_X86
#define HWY_HAVE_RUNTIME_DISPATCH 1
// On Arm, currently only GCC does, and we require Linux to detect CPU
// capabilities.
#elif HWY_ARCH_ARM && HWY_COMPILER_GCC_ACTUAL && HWY_OS_LINUX
#define HWY_HAVE_RUNTIME_DISPATCH 1
#else
#define HWY_HAVE_RUNTIME_DISPATCH 0
@ -389,15 +408,15 @@
#define HWY_ATTAINABLE_AVX3_DL 0
#endif

#if HWY_ARCH_ARM_A64 && \
((HWY_ENABLED_BASELINE & HWY_SVE) || HWY_HAVE_RUNTIME_DISPATCH)
#if HWY_ARCH_ARM_A64 && (HWY_HAVE_RUNTIME_DISPATCH || \
(HWY_ENABLED_BASELINE & (HWY_SVE | HWY_SVE_256)))
#define HWY_ATTAINABLE_SVE HWY_ENABLED(HWY_SVE | HWY_SVE_256)
#else
#define HWY_ATTAINABLE_SVE 0
#endif

#if HWY_ARCH_ARM_A64 && \
((HWY_ENABLED_BASELINE & HWY_SVE2) || HWY_HAVE_RUNTIME_DISPATCH)
#if HWY_ARCH_ARM_A64 && (HWY_HAVE_RUNTIME_DISPATCH || \
(HWY_ENABLED_BASELINE & (HWY_SVE2 | HWY_SVE2_128)))
#define HWY_ATTAINABLE_SVE2 HWY_ENABLED(HWY_SVE2 | HWY_SVE2_128)
#else
#define HWY_ATTAINABLE_SVE2 0
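These are compile-time knobs, so any override has to be visible before the first Highway header (usually via -D flags or a forced include). A rough sketch of how a project might prune targets and then check whether runtime dispatch was compiled in; the specific target choices are only examples, not a recommendation:

// Must precede the first Highway include (or pass as -D compiler flags).
#define HWY_DISABLED_TARGETS (HWY_SSSE3 | HWY_SSE4)
// #define HWY_BASELINE_TARGETS (HWY_AVX2 | HWY_SCALAR)
#include <cstdio>
#include "hwy/highway.h"
#include "hwy/targets.h"

int main() {
#if HWY_HAVE_RUNTIME_DISPATCH
  std::printf("supported target bits: %lld\n",
              static_cast<long long>(hwy::SupportedTargets()));
#else
  std::printf("static dispatch only on this toolchain/architecture\n");
#endif
  return 0;
}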
@ -21,8 +21,9 @@
#include <stdint.h>
#include <stdio.h>

#include <cmath> // std::abs
#include <memory>
#include <numeric> // iota
#include <numeric> // std::iota, std::inner_product

#undef HWY_TARGET_INCLUDE
#define HWY_TARGET_INCLUDE "hwy/examples/benchmark.cc"
7
third_party/highway/hwy/examples/skeleton.cc
vendored
@ -52,10 +52,11 @@ HWY_ATTR_NO_MSAN void OneFloorLog2(const DF df,
// Type tags for converting to other element types (Rebind = same count).
const hn::RebindToSigned<DF> d32;
const hn::Rebind<uint8_t, DF> d8;
using VI32 = hn::Vec<decltype(d32)>;

const auto u8 = hn::Load(d8, values);
const auto bits = hn::BitCast(d32, hn::ConvertTo(df, hn::PromoteTo(d32, u8)));
const auto exponent = hn::Sub(hn::ShiftRight<23>(bits), hn::Set(d32, 127));
const VI32 vi32 = hn::PromoteTo(d32, hn::Load(d8, values));
const VI32 bits = hn::BitCast(d32, hn::ConvertTo(df, vi32));
const VI32 exponent = hn::Sub(hn::ShiftRight<23>(bits), hn::Set(d32, 127));
hn::Store(hn::DemoteTo(d8, exponent), d8, log2);
}

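The rewritten example keeps the classic trick: convert the (positive) integer to float and read off the biased exponent, which is floor(log2(x)) with no loop. A scalar sketch of the same arithmetic, for reference only:

#include <cstdint>
#include <cstring>

// floor(log2(x)) for x >= 1, via the IEEE-754 exponent of float(x).
uint8_t FloorLog2(uint8_t x) {
  const float f = static_cast<float>(x);  // exact for 8-bit inputs
  uint32_t bits;
  std::memcpy(&bits, &f, sizeof(bits));   // same role as BitCast above
  return static_cast<uint8_t>((bits >> 23) - 127);  // subtract exponent bias
}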
2
third_party/highway/hwy/highway.h
vendored
@ -29,7 +29,7 @@ namespace hwy {
// API version (https://semver.org/); keep in sync with CMakeLists.txt.
#define HWY_MAJOR 1
#define HWY_MINOR 0
#define HWY_PATCH 1
#define HWY_PATCH 2

//------------------------------------------------------------------------------
// Shorthand for tags (defined in shared-inl.h) used to select overloads.
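With the patch number bumped, downstream code can gate on this release at compile time; a minimal sketch using only the macros defined above (the feature named in the comment is an assumption based on this same update):

#include "hwy/highway.h"

// True for Highway 1.0.2 and later, e.g. to use the key-value vqsort overloads.
#if HWY_MAJOR > 1 || (HWY_MAJOR == 1 && (HWY_MINOR > 0 || HWY_PATCH >= 2))
#define HAVE_KV_VQSORT 1
#else
#define HAVE_KV_VQSORT 0
#endif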
1
third_party/highway/hwy/highway_test.cc
vendored
@ -16,6 +16,7 @@
#include <stddef.h>
#include <stdint.h>

#include <algorithm> // std::fill
#include <bitset>

#include "hwy/base.h"
7
third_party/highway/hwy/nanobenchmark.cc
vendored
@ -24,14 +24,15 @@
#include <stdlib.h>
#include <time.h> // clock_gettime

#include <algorithm> // sort
#include <algorithm> // std::sort, std::find_if
#include <array>
#include <atomic>
#include <chrono> //NOLINT
#include <limits>
#include <numeric> // iota
#include <numeric> // std::iota
#include <random>
#include <string>
#include <utility> // std::pair
#include <vector>

#if defined(_WIN32) || defined(_WIN64)
@ -150,7 +151,7 @@ inline Ticks Start() {
// "cc" = flags modified by SHL.
: "rdx", "memory", "cc");
#elif HWY_ARCH_RVV
asm volatile("rdcycle %0" : "=r"(t));
asm volatile("rdtime %0" : "=r"(t));
#elif defined(_WIN32) || defined(_WIN64)
LARGE_INTEGER counter;
(void)QueryPerformanceCounter(&counter);
233
third_party/highway/hwy/ops/arm_neon-inl.h
vendored
@ -22,16 +22,18 @@
|
||||
#include <stddef.h>
|
||||
#include <stdint.h>
|
||||
|
||||
#include "hwy/base.h" // before HWY_DIAGNOSTICS
|
||||
#include "hwy/ops/shared-inl.h"
|
||||
|
||||
HWY_BEFORE_NAMESPACE();
|
||||
|
||||
// Must come after HWY_BEFORE_NAMESPACE so that the intrinsics are compiled with
|
||||
// the same target attribute as our code, see #834.
|
||||
HWY_DIAGNOSTICS(push)
|
||||
HWY_DIAGNOSTICS_OFF(disable : 4701, ignored "-Wuninitialized")
|
||||
#include <arm_neon.h>
|
||||
HWY_DIAGNOSTICS(pop)
|
||||
|
||||
#include "hwy/ops/shared-inl.h"
|
||||
|
||||
HWY_BEFORE_NAMESPACE();
|
||||
// Must come after arm_neon.h.
|
||||
namespace hwy {
|
||||
namespace HWY_NAMESPACE {
|
||||
|
||||
@ -766,6 +768,9 @@ class Vec128 {
|
||||
using Raw = typename detail::Raw128<T, N>::type;
|
||||
|
||||
public:
|
||||
using PrivateT = T; // only for DFromV
|
||||
static constexpr size_t kPrivateN = N; // only for DFromV
|
||||
|
||||
HWY_INLINE Vec128() {}
|
||||
Vec128(const Vec128&) = default;
|
||||
Vec128& operator=(const Vec128&) = default;
|
||||
@ -822,23 +827,11 @@ class Mask128 {
|
||||
template <typename T>
|
||||
using Mask64 = Mask128<T, 8 / sizeof(T)>;
|
||||
|
||||
namespace detail {
|
||||
|
||||
// Deduce Simd<T, N, 0> from Vec128<T, N>
|
||||
struct DeduceD {
|
||||
template <typename T, size_t N>
|
||||
Simd<T, N, 0> operator()(Vec128<T, N>) const {
|
||||
return Simd<T, N, 0>();
|
||||
}
|
||||
};
|
||||
|
||||
} // namespace detail
|
||||
template <class V>
|
||||
using DFromV = Simd<typename V::PrivateT, V::kPrivateN, 0>;
|
||||
|
||||
template <class V>
|
||||
using DFromV = decltype(detail::DeduceD()(V()));
|
||||
|
||||
template <class V>
|
||||
using TFromV = TFromD<DFromV<V>>;
|
||||
using TFromV = typename V::PrivateT;
|
||||
|
||||
// ------------------------------ BitCast
|
||||
|
||||
@ -1025,19 +1018,21 @@ HWY_API Vec128<bfloat16_t, N> Zero(Simd<bfloat16_t, N, 0> /* tag */) {
|
||||
template <class D>
|
||||
using VFromD = decltype(Zero(D()));
|
||||
|
||||
// Returns a vector with uninitialized elements.
|
||||
template <typename T, size_t N>
|
||||
HWY_API Vec128<T, N> Undefined(Simd<T, N, 0> /*d*/) {
|
||||
HWY_DIAGNOSTICS(push)
|
||||
HWY_DIAGNOSTICS_OFF(disable : 4701, ignored "-Wuninitialized")
|
||||
HWY_DIAGNOSTICS(push)
|
||||
HWY_DIAGNOSTICS_OFF(disable : 4701, ignored "-Wuninitialized")
|
||||
#if HWY_COMPILER_GCC_ACTUAL
|
||||
HWY_DIAGNOSTICS_OFF(disable : 4701, ignored "-Wmaybe-uninitialized")
|
||||
#endif
|
||||
|
||||
// Returns a vector with uninitialized elements.
|
||||
template <typename T, size_t N>
|
||||
HWY_API Vec128<T, N> Undefined(Simd<T, N, 0> /*d*/) {
|
||||
typename detail::Raw128<T, N>::type a;
|
||||
return Vec128<T, N>(a);
|
||||
HWY_DIAGNOSTICS(pop)
|
||||
}
|
||||
|
||||
HWY_DIAGNOSTICS(pop)
|
||||
|
||||
// Returns a vector with lane i=[0, N) set to "first" + i.
|
||||
template <typename T, size_t N, typename T2>
|
||||
Vec128<T, N> Iota(const Simd<T, N, 0> d, const T2 first) {
|
||||
@ -2277,6 +2272,12 @@ HWY_API Mask128<T, N> Xor(const Mask128<T, N> a, Mask128<T, N> b) {
|
||||
return MaskFromVec(Xor(VecFromMask(d, a), VecFromMask(d, b)));
|
||||
}
|
||||
|
||||
template <typename T, size_t N>
|
||||
HWY_API Mask128<T, N> ExclusiveNeither(const Mask128<T, N> a, Mask128<T, N> b) {
|
||||
const Simd<T, N, 0> d;
|
||||
return MaskFromVec(AndNot(VecFromMask(d, a), Not(VecFromMask(d, b))));
|
||||
}
|
||||
|
||||
// ================================================== COMPARE
|
||||
|
||||
// Comparisons fill a lane with 1-bits if the condition is true, else 0.
|
||||
@ -2885,12 +2886,19 @@ HWY_API void StoreU(Vec128<bfloat16_t, N> v, Simd<bfloat16_t, N, 0> d,
|
||||
return StoreU(Vec128<uint16_t, N>(v.raw), du16, pu16);
|
||||
}
|
||||
|
||||
HWY_DIAGNOSTICS(push)
|
||||
#if HWY_COMPILER_GCC_ACTUAL
|
||||
HWY_DIAGNOSTICS_OFF(disable : 4701, ignored "-Wmaybe-uninitialized")
|
||||
#endif
|
||||
|
||||
// On ARM, Store is the same as StoreU.
|
||||
template <typename T, size_t N>
|
||||
HWY_API void Store(Vec128<T, N> v, Simd<T, N, 0> d, T* HWY_RESTRICT aligned) {
|
||||
StoreU(v, d, aligned);
|
||||
}
|
||||
|
||||
HWY_DIAGNOSTICS(pop)
|
||||
|
||||
template <typename T, size_t N>
|
||||
HWY_API void BlendedStore(Vec128<T, N> v, Mask128<T, N> m, Simd<T, N, 0> d,
|
||||
T* HWY_RESTRICT p) {
|
||||
@ -3527,6 +3535,11 @@ HWY_API Vec64<double> LowerHalf(const Vec128<double> v) {
|
||||
return Vec64<double>(vget_low_f64(v.raw));
|
||||
}
|
||||
#endif
|
||||
HWY_API Vec64<bfloat16_t> LowerHalf(const Vec128<bfloat16_t> v) {
|
||||
const Full128<uint16_t> du;
|
||||
const Full64<bfloat16_t> dbh;
|
||||
return BitCast(dbh, LowerHalf(BitCast(du, v)));
|
||||
}
|
||||
|
||||
template <typename T, size_t N>
|
||||
HWY_API Vec128<T, N / 2> LowerHalf(Simd<T, N / 2, 0> /* tag */,
|
||||
@ -3727,6 +3740,13 @@ HWY_API Vec64<double> UpperHalf(Full64<double> /* tag */,
|
||||
}
|
||||
#endif
|
||||
|
||||
HWY_API Vec64<bfloat16_t> UpperHalf(Full64<bfloat16_t> dbh,
|
||||
const Vec128<bfloat16_t> v) {
|
||||
const RebindToUnsigned<decltype(dbh)> duh;
|
||||
const Twice<decltype(duh)> du;
|
||||
return BitCast(dbh, UpperHalf(duh, BitCast(du, v)));
|
||||
}
|
||||
|
||||
// Partial
|
||||
template <typename T, size_t N, HWY_IF_LE64(T, N)>
|
||||
HWY_API Vec128<T, (N + 1) / 2> UpperHalf(Half<Simd<T, N, 0>> /* tag */,
|
||||
@ -4243,6 +4263,48 @@ HWY_API Vec128<float, N> ReorderWidenMulAccumulate(Simd<float, N, 0> df32,
|
||||
return MulAdd(BitCast(df32, a0), BitCast(df32, b0), sum0);
|
||||
}
|
||||
|
||||
HWY_API Vec128<int32_t> ReorderWidenMulAccumulate(Full128<int32_t> /*d32*/,
|
||||
Vec128<int16_t> a,
|
||||
Vec128<int16_t> b,
|
||||
const Vec128<int32_t> sum0,
|
||||
Vec128<int32_t>& sum1) {
|
||||
#if HWY_ARCH_ARM_A64
|
||||
sum1 = Vec128<int32_t>(vmlal_high_s16(sum1.raw, a.raw, b.raw));
|
||||
#else
|
||||
const Full64<int16_t> dh;
|
||||
sum1 = Vec128<int32_t>(
|
||||
vmlal_s16(sum1.raw, UpperHalf(dh, a).raw, UpperHalf(dh, b).raw));
|
||||
#endif
|
||||
return Vec128<int32_t>(
|
||||
vmlal_s16(sum0.raw, LowerHalf(a).raw, LowerHalf(b).raw));
|
||||
}
|
||||
|
||||
HWY_API Vec64<int32_t> ReorderWidenMulAccumulate(Full64<int32_t> d32,
|
||||
Vec64<int16_t> a,
|
||||
Vec64<int16_t> b,
|
||||
const Vec64<int32_t> sum0,
|
||||
Vec64<int32_t>& sum1) {
|
||||
// vmlal writes into the upper half, which the caller cannot use, so
|
||||
// split into two halves.
|
||||
const Vec128<int32_t> mul_3210(vmull_s16(a.raw, b.raw));
|
||||
const Vec64<int32_t> mul_32 = UpperHalf(d32, mul_3210);
|
||||
sum1 += mul_32;
|
||||
return sum0 + LowerHalf(mul_3210);
|
||||
}
|
||||
|
||||
HWY_API Vec32<int32_t> ReorderWidenMulAccumulate(Full32<int32_t> d32,
|
||||
Vec32<int16_t> a,
|
||||
Vec32<int16_t> b,
|
||||
const Vec32<int32_t> sum0,
|
||||
Vec32<int32_t>& sum1) {
|
||||
const Vec128<int32_t> mul_xx10(vmull_s16(a.raw, b.raw));
|
||||
const Vec64<int32_t> mul_10(LowerHalf(mul_xx10));
|
||||
const Vec32<int32_t> mul0 = LowerHalf(d32, mul_10);
|
||||
const Vec32<int32_t> mul1 = UpperHalf(d32, mul_10);
|
||||
sum1 += mul1;
|
||||
return sum0 + mul0;
|
||||
}
|
||||
|
||||
// ================================================== COMBINE
|
||||
|
||||
// ------------------------------ Combine (InterleaveLower)
|
||||
@ -4587,6 +4649,32 @@ HWY_API Vec128<bfloat16_t, 2 * N> ReorderDemote2To(
|
||||
return BitCast(dbf16, OddEven(BitCast(du16, a), BitCast(du16, b_in_even)));
|
||||
}
|
||||
|
||||
HWY_API Vec128<int16_t> ReorderDemote2To(Full128<int16_t> d16,
|
||||
Vec128<int32_t> a, Vec128<int32_t> b) {
|
||||
const Vec64<int16_t> a16(vqmovn_s32(a.raw));
|
||||
#if HWY_ARCH_ARM_A64
|
||||
(void)d16;
|
||||
return Vec128<int16_t>(vqmovn_high_s32(a16.raw, b.raw));
|
||||
#else
|
||||
const Vec64<int16_t> b16(vqmovn_s32(b.raw));
|
||||
return Combine(d16, a16, b16);
|
||||
#endif
|
||||
}
|
||||
|
||||
HWY_API Vec64<int16_t> ReorderDemote2To(Full64<int16_t> /*d16*/,
|
||||
Vec64<int32_t> a, Vec64<int32_t> b) {
|
||||
const Full128<int32_t> d32;
|
||||
const Vec128<int32_t> ab = Combine(d32, a, b);
|
||||
return Vec64<int16_t>(vqmovn_s32(ab.raw));
|
||||
}
|
||||
|
||||
HWY_API Vec32<int16_t> ReorderDemote2To(Full32<int16_t> /*d16*/,
|
||||
Vec32<int32_t> a, Vec32<int32_t> b) {
|
||||
const Full128<int32_t> d32;
|
||||
const Vec64<int32_t> ab(vzip1_s32(a.raw, b.raw));
|
||||
return Vec32<int16_t>(vqmovn_s32(Combine(d32, ab, ab).raw));
|
||||
}
|
||||
|
||||
// ================================================== CRYPTO
|
||||
|
||||
#if defined(__ARM_FEATURE_AES) || \
|
||||
@ -4892,7 +4980,8 @@ namespace detail {
|
||||
|
||||
// N=1 for any T: no-op
|
||||
template <typename T>
|
||||
HWY_INLINE Vec128<T, 1> SumOfLanes(const Vec128<T, 1> v) {
|
||||
HWY_INLINE Vec128<T, 1> SumOfLanes(hwy::SizeTag<sizeof(T)> /* tag */,
|
||||
const Vec128<T, 1> v) {
|
||||
return v;
|
||||
}
|
||||
template <typename T>
|
||||
@ -4908,7 +4997,8 @@ HWY_INLINE Vec128<T, 1> MaxOfLanes(hwy::SizeTag<sizeof(T)> /* tag */,
|
||||
|
||||
// u32/i32/f32: N=2
|
||||
template <typename T, HWY_IF_LANE_SIZE(T, 4)>
|
||||
HWY_INLINE Vec128<T, 2> SumOfLanes(const Vec128<T, 2> v10) {
|
||||
HWY_INLINE Vec128<T, 2> SumOfLanes(hwy::SizeTag<4> /* tag */,
|
||||
const Vec128<T, 2> v10) {
|
||||
return v10 + Shuffle2301(v10);
|
||||
}
|
||||
template <typename T>
|
||||
@ -4924,48 +5014,59 @@ HWY_INLINE Vec128<T, 2> MaxOfLanes(hwy::SizeTag<4> /* tag */,
|
||||
|
||||
// full vectors
|
||||
#if HWY_ARCH_ARM_A64
|
||||
HWY_INLINE Vec128<uint32_t> SumOfLanes(const Vec128<uint32_t> v) {
|
||||
HWY_INLINE Vec128<uint32_t> SumOfLanes(hwy::SizeTag<4> /* tag */,
|
||||
const Vec128<uint32_t> v) {
|
||||
return Vec128<uint32_t>(vdupq_n_u32(vaddvq_u32(v.raw)));
|
||||
}
|
||||
HWY_INLINE Vec128<int32_t> SumOfLanes(const Vec128<int32_t> v) {
|
||||
HWY_INLINE Vec128<int32_t> SumOfLanes(hwy::SizeTag<4> /* tag */,
|
||||
const Vec128<int32_t> v) {
|
||||
return Vec128<int32_t>(vdupq_n_s32(vaddvq_s32(v.raw)));
|
||||
}
|
||||
HWY_INLINE Vec128<float> SumOfLanes(const Vec128<float> v) {
|
||||
HWY_INLINE Vec128<float> SumOfLanes(hwy::SizeTag<4> /* tag */,
|
||||
const Vec128<float> v) {
|
||||
return Vec128<float>(vdupq_n_f32(vaddvq_f32(v.raw)));
|
||||
}
|
||||
HWY_INLINE Vec128<uint64_t> SumOfLanes(const Vec128<uint64_t> v) {
|
||||
HWY_INLINE Vec128<uint64_t> SumOfLanes(hwy::SizeTag<8> /* tag */,
|
||||
const Vec128<uint64_t> v) {
|
||||
return Vec128<uint64_t>(vdupq_n_u64(vaddvq_u64(v.raw)));
|
||||
}
|
||||
HWY_INLINE Vec128<int64_t> SumOfLanes(const Vec128<int64_t> v) {
|
||||
HWY_INLINE Vec128<int64_t> SumOfLanes(hwy::SizeTag<8> /* tag */,
|
||||
const Vec128<int64_t> v) {
|
||||
return Vec128<int64_t>(vdupq_n_s64(vaddvq_s64(v.raw)));
|
||||
}
|
||||
HWY_INLINE Vec128<double> SumOfLanes(const Vec128<double> v) {
|
||||
HWY_INLINE Vec128<double> SumOfLanes(hwy::SizeTag<8> /* tag */,
|
||||
const Vec128<double> v) {
|
||||
return Vec128<double>(vdupq_n_f64(vaddvq_f64(v.raw)));
|
||||
}
|
||||
#else
|
||||
// ARMv7 version for everything except doubles.
|
||||
HWY_INLINE Vec128<uint32_t> SumOfLanes(const Vec128<uint32_t> v) {
|
||||
HWY_INLINE Vec128<uint32_t> SumOfLanes(hwy::SizeTag<4> /* tag */,
|
||||
const Vec128<uint32_t> v) {
|
||||
uint32x4x2_t v0 = vuzpq_u32(v.raw, v.raw);
|
||||
uint32x4_t c0 = vaddq_u32(v0.val[0], v0.val[1]);
|
||||
uint32x4x2_t v1 = vuzpq_u32(c0, c0);
|
||||
return Vec128<uint32_t>(vaddq_u32(v1.val[0], v1.val[1]));
|
||||
}
|
||||
HWY_INLINE Vec128<int32_t> SumOfLanes(const Vec128<int32_t> v) {
|
||||
HWY_INLINE Vec128<int32_t> SumOfLanes(hwy::SizeTag<4> /* tag */,
|
||||
const Vec128<int32_t> v) {
|
||||
int32x4x2_t v0 = vuzpq_s32(v.raw, v.raw);
|
||||
int32x4_t c0 = vaddq_s32(v0.val[0], v0.val[1]);
|
||||
int32x4x2_t v1 = vuzpq_s32(c0, c0);
|
||||
return Vec128<int32_t>(vaddq_s32(v1.val[0], v1.val[1]));
|
||||
}
|
||||
HWY_INLINE Vec128<float> SumOfLanes(const Vec128<float> v) {
|
||||
HWY_INLINE Vec128<float> SumOfLanes(hwy::SizeTag<4> /* tag */,
|
||||
const Vec128<float> v) {
|
||||
float32x4x2_t v0 = vuzpq_f32(v.raw, v.raw);
|
||||
float32x4_t c0 = vaddq_f32(v0.val[0], v0.val[1]);
|
||||
float32x4x2_t v1 = vuzpq_f32(c0, c0);
|
||||
return Vec128<float>(vaddq_f32(v1.val[0], v1.val[1]));
|
||||
}
|
||||
HWY_INLINE Vec128<uint64_t> SumOfLanes(const Vec128<uint64_t> v) {
|
||||
HWY_INLINE Vec128<uint64_t> SumOfLanes(hwy::SizeTag<8> /* tag */,
|
||||
const Vec128<uint64_t> v) {
|
||||
return v + Shuffle01(v);
|
||||
}
|
||||
HWY_INLINE Vec128<int64_t> SumOfLanes(const Vec128<int64_t> v) {
|
||||
HWY_INLINE Vec128<int64_t> SumOfLanes(hwy::SizeTag<8> /* tag */,
|
||||
const Vec128<int64_t> v) {
|
||||
return v + Shuffle01(v);
|
||||
}
|
||||
#endif
|
||||
@ -5001,6 +5102,30 @@ HWY_INLINE Vec128<T> MaxOfLanes(hwy::SizeTag<8> /* tag */,
|
||||
return Max(v10, v01);
|
||||
}
|
||||
|
||||
template <size_t N, HWY_IF_GE32(uint16_t, N)>
|
||||
HWY_API Vec128<uint16_t, N> SumOfLanes(hwy::SizeTag<2> /* tag */,
|
||||
Vec128<uint16_t, N> v) {
|
||||
const Simd<uint16_t, N, 0> d;
|
||||
const RepartitionToWide<decltype(d)> d32;
|
||||
const auto even = And(BitCast(d32, v), Set(d32, 0xFFFF));
|
||||
const auto odd = ShiftRight<16>(BitCast(d32, v));
|
||||
const auto sum = SumOfLanes(hwy::SizeTag<4>(), even + odd);
|
||||
// Also broadcast into odd lanes.
|
||||
return OddEven(BitCast(d, ShiftLeft<16>(sum)), BitCast(d, sum));
|
||||
}
|
||||
template <size_t N, HWY_IF_GE32(int16_t, N)>
|
||||
HWY_API Vec128<int16_t, N> SumOfLanes(hwy::SizeTag<2> /* tag */,
|
||||
Vec128<int16_t, N> v) {
|
||||
const Simd<int16_t, N, 0> d;
|
||||
const RepartitionToWide<decltype(d)> d32;
|
||||
// Sign-extend
|
||||
const auto even = ShiftRight<16>(ShiftLeft<16>(BitCast(d32, v)));
|
||||
const auto odd = ShiftRight<16>(BitCast(d32, v));
|
||||
const auto sum = SumOfLanes(hwy::SizeTag<4>(), even + odd);
|
||||
// Also broadcast into odd lanes.
|
||||
return OddEven(BitCast(d, ShiftLeft<16>(sum)), BitCast(d, sum));
|
||||
}
|
||||
|
||||
template <size_t N, HWY_IF_GE32(uint16_t, N)>
|
||||
HWY_API Vec128<uint16_t, N> MinOfLanes(hwy::SizeTag<2> /* tag */,
|
||||
Vec128<uint16_t, N> v) {
|
||||
@ -5053,7 +5178,7 @@ HWY_API Vec128<int16_t, N> MaxOfLanes(hwy::SizeTag<2> /* tag */,
|
||||
|
||||
template <typename T, size_t N>
|
||||
HWY_API Vec128<T, N> SumOfLanes(Simd<T, N, 0> /* tag */, const Vec128<T, N> v) {
|
||||
return detail::SumOfLanes(v);
|
||||
return detail::SumOfLanes(hwy::SizeTag<sizeof(T)>(), v);
|
||||
}
|
||||
template <typename T, size_t N>
|
||||
HWY_API Vec128<T, N> MinOfLanes(Simd<T, N, 0> /* tag */, const Vec128<T, N> v) {
|
||||
@ -5399,6 +5524,15 @@ HWY_API size_t CountTrue(Simd<T, N, 0> d, const Mask128<T, N> mask) {
|
||||
constexpr int kDiv = 4 * sizeof(T);
|
||||
return PopCount(detail::NibblesFromMask(d, mask)) / kDiv;
|
||||
}
|
||||
|
||||
template <typename T, size_t N>
|
||||
HWY_API size_t FindKnownFirstTrue(const Simd<T, N, 0> d,
|
||||
const Mask128<T, N> mask) {
|
||||
const uint64_t nib = detail::NibblesFromMask(d, mask);
|
||||
constexpr size_t kDiv = 4 * sizeof(T);
|
||||
return Num0BitsBelowLS1Bit_Nonzero64(nib) / kDiv;
|
||||
}
|
||||
|
||||
template <typename T, size_t N>
|
||||
HWY_API intptr_t FindFirstTrue(const Simd<T, N, 0> d,
|
||||
const Mask128<T, N> mask) {
|
||||
@ -6334,7 +6468,7 @@ HWY_API void StoreInterleaved4(const Vec128<T> v0, const Vec128<T> v1,
|
||||
template <typename T, size_t N, HWY_IF_LE128(T, N)>
|
||||
HWY_INLINE Mask128<T, N> Lt128(Simd<T, N, 0> d, Vec128<T, N> a,
|
||||
Vec128<T, N> b) {
|
||||
static_assert(!IsSigned<T>() && sizeof(T) == 8, "Use u64");
|
||||
static_assert(!IsSigned<T>() && sizeof(T) == 8, "T must be u64");
|
||||
// Truth table of Eq and Lt for Hi and Lo u64.
|
||||
// (removed lines with (=H && cH) or (=L && cL) - cannot both be true)
|
||||
// =H =L cH cL | out = cH | (=H & cL)
|
||||
@ -6371,7 +6505,7 @@ HWY_INLINE Mask128<T, N> Lt128Upper(Simd<T, N, 0> d, Vec128<T, N> a,
|
||||
template <typename T, size_t N, HWY_IF_LE128(T, N)>
|
||||
HWY_INLINE Mask128<T, N> Eq128(Simd<T, N, 0> d, Vec128<T, N> a,
|
||||
Vec128<T, N> b) {
|
||||
static_assert(!IsSigned<T>() && sizeof(T) == 8, "Use u64");
|
||||
static_assert(!IsSigned<T>() && sizeof(T) == 8, "T must be u64");
|
||||
const Vec128<T, N> eqHL = VecFromMask(d, Eq(a, b));
|
||||
return MaskFromVec(And(Reverse2(d, eqHL), eqHL));
|
||||
}
|
||||
@ -6383,6 +6517,23 @@ HWY_INLINE Mask128<T, N> Eq128Upper(Simd<T, N, 0> d, Vec128<T, N> a,
|
||||
return MaskFromVec(InterleaveUpper(d, eqHL, eqHL));
|
||||
}
|
||||
|
||||
// ------------------------------ Ne128
|
||||
|
||||
template <typename T, size_t N, HWY_IF_LE128(T, N)>
|
||||
HWY_INLINE Mask128<T, N> Ne128(Simd<T, N, 0> d, Vec128<T, N> a,
|
||||
Vec128<T, N> b) {
|
||||
static_assert(!IsSigned<T>() && sizeof(T) == 8, "T must be u64");
|
||||
const Vec128<T, N> neHL = VecFromMask(d, Ne(a, b));
|
||||
return MaskFromVec(Or(Reverse2(d, neHL), neHL));
|
||||
}
|
||||
|
||||
template <typename T, size_t N, HWY_IF_LE128(T, N)>
|
||||
HWY_INLINE Mask128<T, N> Ne128Upper(Simd<T, N, 0> d, Vec128<T, N> a,
|
||||
Vec128<T, N> b) {
|
||||
const Vec128<T, N> neHL = VecFromMask(d, Ne(a, b));
|
||||
return MaskFromVec(InterleaveUpper(d, neHL, neHL));
|
||||
}
|
||||
|
||||
// ------------------------------ Min128, Max128 (Lt128)
|
||||
|
||||
// Without a native OddEven, it seems infeasible to go faster than Lt128.
|
||||
|
243
third_party/highway/hwy/ops/arm_sve-inl.h
vendored
@ -265,6 +265,9 @@ HWY_API size_t Lanes(Simd<T, N, kPow2> d) {
|
||||
HWY_SVE_FOREACH(HWY_SVE_FIRSTN, FirstN, whilelt)
|
||||
#undef HWY_SVE_FIRSTN
|
||||
|
||||
template <class D>
|
||||
using MFromD = decltype(FirstN(D(), 0));
|
||||
|
||||
namespace detail {
|
||||
|
||||
#define HWY_SVE_WRAP_PTRUE(BASE, CHAR, BITS, HALF, NAME, OP) \
|
||||
@ -320,7 +323,9 @@ using VFromD = decltype(Set(D(), TFromD<D>()));
|
||||
|
||||
template <class D>
|
||||
VFromD<D> Zero(D d) {
|
||||
return Set(d, 0);
|
||||
// Cast to support bfloat16_t.
|
||||
const RebindToUnsigned<decltype(d)> du;
|
||||
return BitCast(d, Set(du, 0));
|
||||
}
|
||||
|
||||
// ------------------------------ Undefined
|
||||
@ -638,10 +643,9 @@ HWY_SVE_FOREACH_UIF3264(HWY_SVE_RETV_ARGPVV, Mul, mul)
|
||||
|
||||
// ------------------------------ MulHigh
|
||||
HWY_SVE_FOREACH_UI16(HWY_SVE_RETV_ARGPVV, MulHigh, mulh)
|
||||
namespace detail {
|
||||
// Not part of API, used internally:
|
||||
HWY_SVE_FOREACH_UI32(HWY_SVE_RETV_ARGPVV, MulHigh, mulh)
|
||||
HWY_SVE_FOREACH_U64(HWY_SVE_RETV_ARGPVV, MulHigh, mulh)
|
||||
} // namespace detail
|
||||
|
||||
// ------------------------------ MulFixedPoint15
|
||||
HWY_API svint16_t MulFixedPoint15(svint16_t a, svint16_t b) {
|
||||
@ -732,6 +736,10 @@ HWY_API svbool_t Xor(svbool_t a, svbool_t b) {
|
||||
return svsel_b(a, svnand_b_z(a, a, b), b); // a ? !(a & b) : b.
|
||||
}
|
||||
|
||||
HWY_API svbool_t ExclusiveNeither(svbool_t a, svbool_t b) {
|
||||
return svnor_b_z(HWY_SVE_PTRUE(8), a, b); // !a && !b, undefined if a && b.
|
||||
}
|
||||
|
||||
// ------------------------------ CountTrue
|
||||
|
||||
#define HWY_SVE_COUNT_TRUE(BASE, CHAR, BITS, HALF, NAME, OP) \
|
||||
@ -777,6 +785,12 @@ HWY_API intptr_t FindFirstTrue(D d, svbool_t m) {
|
||||
CountTrue(d, svbrkb_b_z(detail::MakeMask(d), m)));
|
||||
}
|
||||
|
||||
// ------------------------------ FindKnownFirstTrue
|
||||
template <class D>
|
||||
HWY_API size_t FindKnownFirstTrue(D d, svbool_t m) {
|
||||
return CountTrue(d, svbrkb_b_z(detail::MakeMask(d), m));
|
||||
}
|
||||
|
||||
// ------------------------------ IfThenElse
|
||||
#define HWY_SVE_IF_THEN_ELSE(BASE, CHAR, BITS, HALF, NAME, OP) \
|
||||
HWY_API HWY_SVE_V(BASE, BITS) \
|
||||
@ -1221,8 +1235,9 @@ HWY_API svint32_t PromoteTo(Simd<int32_t, N, kPow2> dto, svuint8_t vfrom) {
|
||||
|
||||
// ------------------------------ PromoteTo F
|
||||
|
||||
// Unlike Highway's ZipLower, this returns the same type.
|
||||
namespace detail {
|
||||
HWY_SVE_FOREACH(HWY_SVE_RETV_ARGVV, ZipLower, zip1)
|
||||
HWY_SVE_FOREACH(HWY_SVE_RETV_ARGVV, ZipLowerSame, zip1)
|
||||
} // namespace detail
|
||||
|
||||
template <size_t N, int kPow2>
|
||||
@ -1230,21 +1245,21 @@ HWY_API svfloat32_t PromoteTo(Simd<float32_t, N, kPow2> /* d */,
|
||||
const svfloat16_t v) {
|
||||
// svcvt* expects inputs in even lanes, whereas Highway wants lower lanes, so
|
||||
// first replicate each lane once.
|
||||
const svfloat16_t vv = detail::ZipLower(v, v);
|
||||
const svfloat16_t vv = detail::ZipLowerSame(v, v);
|
||||
return svcvt_f32_f16_x(detail::PTrue(Simd<float16_t, N, kPow2>()), vv);
|
||||
}
|
||||
|
||||
template <size_t N, int kPow2>
|
||||
HWY_API svfloat64_t PromoteTo(Simd<float64_t, N, kPow2> /* d */,
|
||||
const svfloat32_t v) {
|
||||
const svfloat32_t vv = detail::ZipLower(v, v);
|
||||
const svfloat32_t vv = detail::ZipLowerSame(v, v);
|
||||
return svcvt_f64_f32_x(detail::PTrue(Simd<float32_t, N, kPow2>()), vv);
|
||||
}
|
||||
|
||||
template <size_t N, int kPow2>
|
||||
HWY_API svfloat64_t PromoteTo(Simd<float64_t, N, kPow2> /* d */,
|
||||
const svint32_t v) {
|
||||
const svint32_t vv = detail::ZipLower(v, v);
|
||||
const svint32_t vv = detail::ZipLowerSame(v, v);
|
||||
return svcvt_f64_s32_x(detail::PTrue(Simd<int32_t, N, kPow2>()), vv);
|
||||
}
|
||||
|
||||
@ -1431,8 +1446,8 @@ namespace detail {
|
||||
NAME(HWY_SVE_V(BASE, BITS) hi, HWY_SVE_V(BASE, BITS) lo) { \
|
||||
return sv##OP##_##CHAR##BITS(lo, hi); \
|
||||
}
|
||||
HWY_SVE_FOREACH(HWY_SVE_CONCAT_EVERY_SECOND, ConcatEven, uzp1)
|
||||
HWY_SVE_FOREACH(HWY_SVE_CONCAT_EVERY_SECOND, ConcatOdd, uzp2)
|
||||
HWY_SVE_FOREACH(HWY_SVE_CONCAT_EVERY_SECOND, ConcatEvenFull, uzp1)
|
||||
HWY_SVE_FOREACH(HWY_SVE_CONCAT_EVERY_SECOND, ConcatOddFull, uzp2)
|
||||
#if defined(__ARM_FEATURE_SVE_MATMUL_FP64)
|
||||
HWY_SVE_FOREACH(HWY_SVE_CONCAT_EVERY_SECOND, ConcatEvenBlocks, uzp1q)
|
||||
HWY_SVE_FOREACH(HWY_SVE_CONCAT_EVERY_SECOND, ConcatOddBlocks, uzp2q)
|
||||
@ -1455,10 +1470,10 @@ template <class D>
|
||||
HWY_API VFromD<D> ConcatOdd(D d, VFromD<D> hi, VFromD<D> lo) {
|
||||
#if HWY_SVE_IS_POW2
|
||||
(void)d;
|
||||
return detail::ConcatOdd(hi, lo);
|
||||
return detail::ConcatOddFull(hi, lo);
|
||||
#else
|
||||
const VFromD<D> hi_odd = detail::ConcatOdd(hi, hi);
|
||||
const VFromD<D> lo_odd = detail::ConcatOdd(lo, lo);
|
||||
const VFromD<D> hi_odd = detail::ConcatOddFull(hi, hi);
|
||||
const VFromD<D> lo_odd = detail::ConcatOddFull(lo, lo);
|
||||
return detail::Splice(hi_odd, lo_odd, FirstN(d, Lanes(d) / 2));
|
||||
#endif
|
||||
}
|
||||
@ -1467,10 +1482,10 @@ template <class D>
|
||||
HWY_API VFromD<D> ConcatEven(D d, VFromD<D> hi, VFromD<D> lo) {
|
||||
#if HWY_SVE_IS_POW2
|
||||
(void)d;
|
||||
return detail::ConcatEven(hi, lo);
|
||||
return detail::ConcatEvenFull(hi, lo);
|
||||
#else
|
||||
const VFromD<D> hi_odd = detail::ConcatEven(hi, hi);
|
||||
const VFromD<D> lo_odd = detail::ConcatEven(lo, lo);
|
||||
const VFromD<D> hi_odd = detail::ConcatEvenFull(hi, hi);
|
||||
const VFromD<D> lo_odd = detail::ConcatEvenFull(lo, lo);
|
||||
return detail::Splice(hi_odd, lo_odd, FirstN(d, Lanes(d) / 2));
|
||||
#endif
|
||||
}
|
||||
@ -1480,25 +1495,28 @@ HWY_API VFromD<D> ConcatEven(D d, VFromD<D> hi, VFromD<D> lo) {
|
||||
template <size_t N, int kPow2>
|
||||
HWY_API svfloat16_t DemoteTo(Simd<float16_t, N, kPow2> d, const svfloat32_t v) {
|
||||
const svfloat16_t in_even = svcvt_f16_f32_x(detail::PTrue(d), v);
|
||||
return detail::ConcatEven(in_even, in_even); // only low 1/2 of result valid
|
||||
return detail::ConcatEvenFull(in_even,
|
||||
in_even); // lower half
|
||||
}
|
||||
|
||||
template <size_t N, int kPow2>
|
||||
HWY_API svuint16_t DemoteTo(Simd<bfloat16_t, N, kPow2> /* d */, svfloat32_t v) {
|
||||
const svuint16_t in_even = BitCast(ScalableTag<uint16_t>(), v);
|
||||
return detail::ConcatOdd(in_even, in_even); // can ignore upper half of vec
|
||||
return detail::ConcatOddFull(in_even, in_even); // lower half
|
||||
}
|
||||
|
||||
template <size_t N, int kPow2>
|
||||
HWY_API svfloat32_t DemoteTo(Simd<float32_t, N, kPow2> d, const svfloat64_t v) {
|
||||
const svfloat32_t in_even = svcvt_f32_f64_x(detail::PTrue(d), v);
|
||||
return detail::ConcatEven(in_even, in_even); // only low 1/2 of result valid
|
||||
return detail::ConcatEvenFull(in_even,
|
||||
in_even); // lower half
|
||||
}
|
||||
|
||||
template <size_t N, int kPow2>
|
||||
HWY_API svint32_t DemoteTo(Simd<int32_t, N, kPow2> d, const svfloat64_t v) {
|
||||
const svint32_t in_even = svcvt_s32_f64_x(detail::PTrue(d), v);
|
||||
return detail::ConcatEven(in_even, in_even); // only low 1/2 of result valid
|
||||
return detail::ConcatEvenFull(in_even,
|
||||
in_even); // lower half
|
||||
}
|
||||
|
||||
// ------------------------------ ConvertTo F
|
||||
@ -1559,15 +1577,15 @@ HWY_API V InterleaveLower(D d, const V a, const V b) {
|
||||
static_assert(IsSame<TFromD<D>, TFromV<V>>(), "D/V mismatch");
|
||||
#if HWY_TARGET == HWY_SVE2_128
|
||||
(void)d;
|
||||
return detail::ZipLower(a, b);
|
||||
return detail::ZipLowerSame(a, b);
|
||||
#else
|
||||
// Move lower halves of blocks to lower half of vector.
|
||||
const Repartition<uint64_t, decltype(d)> d64;
|
||||
const auto a64 = BitCast(d64, a);
|
||||
const auto b64 = BitCast(d64, b);
|
||||
const auto a_blocks = detail::ConcatEven(a64, a64); // only lower half needed
|
||||
const auto b_blocks = detail::ConcatEven(b64, b64);
|
||||
return detail::ZipLower(BitCast(d, a_blocks), BitCast(d, b_blocks));
|
||||
const auto a_blocks = detail::ConcatEvenFull(a64, a64); // lower half
|
||||
const auto b_blocks = detail::ConcatEvenFull(b64, b64);
|
||||
return detail::ZipLowerSame(BitCast(d, a_blocks), BitCast(d, b_blocks));
|
||||
#endif
|
||||
}
|
||||
|
||||
@ -1582,7 +1600,8 @@ HWY_API V InterleaveLower(const V a, const V b) {
|
||||
// "upper half" requires MaskUpperHalf.
|
||||
#if HWY_TARGET == HWY_SVE2_128
|
||||
namespace detail {
|
||||
HWY_SVE_FOREACH(HWY_SVE_RETV_ARGVV, ZipUpper, zip2)
|
||||
// Unlike Highway's ZipUpper, this returns the same type.
|
||||
HWY_SVE_FOREACH(HWY_SVE_RETV_ARGVV, ZipUpperSame, zip2)
|
||||
} // namespace detail
|
||||
#endif
|
||||
|
||||
@ -1592,15 +1611,15 @@ template <class D, class V = VFromD<D>,
|
||||
HWY_API V InterleaveUpper(D d, const V a, const V b) {
|
||||
#if HWY_TARGET == HWY_SVE2_128
|
||||
(void)d;
|
||||
return detail::ZipUpper(a, b);
|
||||
return detail::ZipUpperSame(a, b);
|
||||
#else
|
||||
// Move upper halves of blocks to lower half of vector.
|
||||
const Repartition<uint64_t, decltype(d)> d64;
|
||||
const auto a64 = BitCast(d64, a);
|
||||
const auto b64 = BitCast(d64, b);
|
||||
const auto a_blocks = detail::ConcatOdd(a64, a64); // only lower half needed
|
||||
const auto b_blocks = detail::ConcatOdd(b64, b64);
|
||||
return detail::ZipLower(BitCast(d, a_blocks), BitCast(d, b_blocks));
|
||||
const auto a_blocks = detail::ConcatOddFull(a64, a64); // lower half
|
||||
const auto b_blocks = detail::ConcatOddFull(b64, b64);
|
||||
return detail::ZipLowerSame(BitCast(d, a_blocks), BitCast(d, b_blocks));
|
||||
#endif
|
||||
}
|
||||
|
||||
@ -1814,12 +1833,17 @@ HWY_API V LowerHalf(const V v) {
|
||||
return v;
|
||||
}
|
||||
|
||||
template <class D2, class V>
|
||||
HWY_API V UpperHalf(const D2 d2, const V v) {
|
||||
template <class DH, class V>
|
||||
HWY_API V UpperHalf(const DH dh, const V v) {
|
||||
const Twice<decltype(dh)> d;
|
||||
// Cast so that we support bfloat16_t.
|
||||
const RebindToUnsigned<decltype(d)> du;
|
||||
const VFromD<decltype(du)> vu = BitCast(du, v);
|
||||
#if HWY_TARGET == HWY_SVE_256 || HWY_TARGET == HWY_SVE2_128 // constexpr Lanes
|
||||
return detail::Ext<Lanes(d2)>(v, v);
|
||||
return BitCast(d, detail::Ext<Lanes(dh)>(vu, vu));
|
||||
#else
|
||||
return detail::Splice(v, v, detail::MaskUpperHalf(Twice<decltype(d2)>()));
|
||||
const MFromD<decltype(du)> mask = detail::MaskUpperHalf(du);
|
||||
return BitCast(d, detail::Splice(vu, vu, mask));
|
||||
#endif
|
||||
}
|
||||
|
||||
@ -1842,14 +1866,14 @@ namespace detail {
|
||||
return sv##OP##_##CHAR##BITS(pg, v); \
|
||||
}
|
||||
|
||||
HWY_SVE_FOREACH_UI(HWY_SVE_REDUCE_ADD, SumOfLanes, addv)
|
||||
HWY_SVE_FOREACH_F(HWY_SVE_REDUCE, SumOfLanes, addv)
|
||||
HWY_SVE_FOREACH_UI(HWY_SVE_REDUCE_ADD, SumOfLanesM, addv)
|
||||
HWY_SVE_FOREACH_F(HWY_SVE_REDUCE, SumOfLanesM, addv)
|
||||
|
||||
HWY_SVE_FOREACH_UI(HWY_SVE_REDUCE, MinOfLanes, minv)
|
||||
HWY_SVE_FOREACH_UI(HWY_SVE_REDUCE, MaxOfLanes, maxv)
|
||||
HWY_SVE_FOREACH_UI(HWY_SVE_REDUCE, MinOfLanesM, minv)
|
||||
HWY_SVE_FOREACH_UI(HWY_SVE_REDUCE, MaxOfLanesM, maxv)
|
||||
// NaN if all are
|
||||
HWY_SVE_FOREACH_F(HWY_SVE_REDUCE, MinOfLanes, minnmv)
|
||||
HWY_SVE_FOREACH_F(HWY_SVE_REDUCE, MaxOfLanes, maxnmv)
|
||||
HWY_SVE_FOREACH_F(HWY_SVE_REDUCE, MinOfLanesM, minnmv)
|
||||
HWY_SVE_FOREACH_F(HWY_SVE_REDUCE, MaxOfLanesM, maxnmv)
|
||||
|
||||
#undef HWY_SVE_REDUCE
|
||||
#undef HWY_SVE_REDUCE_ADD
|
||||
@ -1857,17 +1881,17 @@ HWY_SVE_FOREACH_F(HWY_SVE_REDUCE, MaxOfLanes, maxnmv)
|
||||
|
||||
template <class D, class V>
|
||||
V SumOfLanes(D d, V v) {
|
||||
return Set(d, detail::SumOfLanes(detail::MakeMask(d), v));
|
||||
return Set(d, detail::SumOfLanesM(detail::MakeMask(d), v));
|
||||
}
|
||||
|
||||
template <class D, class V>
|
||||
V MinOfLanes(D d, V v) {
|
||||
return Set(d, detail::MinOfLanes(detail::MakeMask(d), v));
|
||||
return Set(d, detail::MinOfLanesM(detail::MakeMask(d), v));
|
||||
}
|
||||
|
||||
template <class D, class V>
|
||||
V MaxOfLanes(D d, V v) {
|
||||
return Set(d, detail::MaxOfLanes(detail::MakeMask(d), v));
|
||||
return Set(d, detail::MaxOfLanesM(detail::MakeMask(d), v));
|
||||
}
|
||||
|
||||
|
||||
@ -1882,19 +1906,19 @@ namespace detail {
|
||||
return sv##OP##_##CHAR##BITS(mask, v); \
|
||||
}
|
||||
|
||||
HWY_SVE_FOREACH(HWY_SVE_GET_LANE, GetLane, lasta)
|
||||
HWY_SVE_FOREACH(HWY_SVE_GET_LANE, GetLaneM, lasta)
|
||||
#undef HWY_SVE_GET_LANE
|
||||
} // namespace detail
|
||||
|
||||
template <class V>
|
||||
HWY_API TFromV<V> GetLane(V v) {
|
||||
return detail::GetLane(v, detail::PFalse());
|
||||
return detail::GetLaneM(v, detail::PFalse());
|
||||
}
|
||||
|
||||
// ------------------------------ ExtractLane
|
||||
template <class V>
|
||||
HWY_API TFromV<V> ExtractLane(V v, size_t i) {
|
||||
return detail::GetLane(v, FirstN(DFromV<V>(), i));
|
||||
return detail::GetLaneM(v, FirstN(DFromV<V>(), i));
|
||||
}
|
||||
|
||||
// ------------------------------ InsertLane (IfThenElse)
|
||||
@ -2154,7 +2178,7 @@ HWY_API V Compress(V v, svbool_t mask) {
|
||||
// bits 1, 2, 4, 8. Pre-multiply by N so we can use it as an offset for
|
||||
// SetTableIndices.
|
||||
const svuint64_t bits = Shl(Set(du64, 1), Iota(du64, 2));
|
||||
const size_t offset = detail::SumOfLanes(mask, bits);
|
||||
const size_t offset = detail::SumOfLanesM(mask, bits);
|
||||
|
||||
// See CompressIsPartition.
|
||||
alignas(16) static constexpr uint64_t table[4 * 16] = {
|
||||
@ -2196,8 +2220,8 @@ HWY_API V Compress(V v, svbool_t mask16) {
|
||||
// Demote to 16-bit (already in range) - separately so we can splice
|
||||
const V evenL = BitCast(d16, compressedL);
|
||||
const V evenH = BitCast(d16, compressedH);
|
||||
const V v16L = detail::ConcatEven(evenL, evenL); // only lower half needed
|
||||
const V v16H = detail::ConcatEven(evenH, evenH);
|
||||
const V v16L = detail::ConcatEvenFull(evenL, evenL); // lower half
|
||||
const V v16H = detail::ConcatEvenFull(evenH, evenH);
|
||||
|
||||
// We need to combine two vectors of non-constexpr length, so the only option
|
||||
// is Splice, which requires us to synthesize a mask. NOTE: this function uses
|
||||
@ -2240,7 +2264,7 @@ HWY_API V CompressNot(V v, svbool_t mask) {
|
||||
// bits 1, 2, 4, 8. Pre-multiply by N so we can use it as an offset for
|
||||
// SetTableIndices.
|
||||
const svuint64_t bits = Shl(Set(du64, 1), Iota(du64, 2));
|
||||
const size_t offset = detail::SumOfLanes(mask, bits);
|
||||
const size_t offset = detail::SumOfLanesM(mask, bits);
|
||||
|
||||
// See CompressIsPartition.
|
||||
alignas(16) static constexpr uint64_t table[4 * 16] = {
|
||||
@ -2478,7 +2502,7 @@ namespace detail {
|
||||
return sv##OP##_##CHAR##BITS(v, kLane); \
|
||||
}
|
||||
|
||||
HWY_SVE_FOREACH(HWY_SVE_BROADCAST, Broadcast, dup_lane)
|
||||
HWY_SVE_FOREACH(HWY_SVE_BROADCAST, BroadcastLane, dup_lane)
|
||||
#undef HWY_SVE_BROADCAST
|
||||
} // namespace detail
|
||||
#endif
|
||||
@ -2490,7 +2514,7 @@ HWY_API V Broadcast(const V v) {
|
||||
constexpr size_t kLanesPerBlock = detail::LanesPerBlock(du);
|
||||
static_assert(0 <= kLane && kLane < kLanesPerBlock, "Invalid lane");
|
||||
#if HWY_TARGET == HWY_SVE2_128
|
||||
return detail::Broadcast<kLane>(v);
|
||||
return detail::BroadcastLane<kLane>(v);
|
||||
#else
|
||||
auto idx = detail::OffsetsOf128BitBlocks(du, Iota(du, 0));
|
||||
if (kLane != 0) {
|
||||
@ -2585,10 +2609,11 @@ HWY_API VFromD<DW> ZipUpper(DW dw, V a, V b) {
|
||||
template <size_t N, int kPow2>
|
||||
HWY_API svfloat32_t PromoteTo(Simd<float32_t, N, kPow2> df32,
|
||||
const svuint16_t v) {
|
||||
return BitCast(df32, detail::ZipLower(svdup_n_u16(0), v));
|
||||
return BitCast(df32, detail::ZipLowerSame(svdup_n_u16(0), v));
|
||||
}
|
||||
|
||||
// ------------------------------ ReorderDemote2To (OddEven)
|
||||
|
||||
template <size_t N, int kPow2>
|
||||
HWY_API svuint16_t ReorderDemote2To(Simd<bfloat16_t, N, kPow2> dbf16,
|
||||
svfloat32_t a, svfloat32_t b) {
|
||||
@ -2598,6 +2623,21 @@ HWY_API svuint16_t ReorderDemote2To(Simd<bfloat16_t, N, kPow2> dbf16,
|
||||
return BitCast(dbf16, OddEven(BitCast(du16, a), BitCast(du16, b_in_even)));
|
||||
}
|
||||
|
||||
template <size_t N, int kPow2>
|
||||
HWY_API svint16_t ReorderDemote2To(Simd<int16_t, N, kPow2> d16, svint32_t a,
|
||||
svint32_t b) {
|
||||
#if HWY_TARGET == HWY_SVE2 || HWY_TARGET == HWY_SVE2_128
|
||||
(void)d16;
|
||||
const svint16_t a_in_even = svqxtnb_s32(a);
|
||||
return svqxtnt_s32(a_in_even, b);
|
||||
#else
|
||||
const Half<decltype(d16)> dh;
|
||||
const svint16_t a16 = BitCast(dh, detail::SaturateI<int16_t>(a));
|
||||
const svint16_t b16 = BitCast(dh, detail::SaturateI<int16_t>(b));
|
||||
return detail::InterleaveEven(a16, b16);
|
||||
#endif
|
||||
}
|
||||
|
||||
// ------------------------------ ZeroIfNegative (Lt, IfThenElse)
|
||||
template <class V>
|
||||
HWY_API V ZeroIfNegative(const V v) {
|
||||
@ -2716,7 +2756,7 @@ template <class T, HWY_IF_LANE_SIZE(T, 2)>
|
||||
HWY_INLINE svuint8_t BoolFromMask(svbool_t m) {
|
||||
const ScalableTag<uint8_t> d8;
|
||||
const svuint8_t b16 = BitCast(d8, svdup_n_u16_z(m, 1));
|
||||
return detail::ConcatEven(b16, b16); // only lower half needed
|
||||
return detail::ConcatEvenFull(b16, b16); // lower half
|
||||
}
|
||||
template <class T, HWY_IF_LANE_SIZE(T, 4)>
|
||||
HWY_INLINE svuint8_t BoolFromMask(svbool_t m) {
|
||||
@ -2726,7 +2766,7 @@ template <class T, HWY_IF_LANE_SIZE(T, 8)>
|
||||
HWY_INLINE svuint8_t BoolFromMask(svbool_t m) {
|
||||
const ScalableTag<uint32_t> d32;
|
||||
const svuint32_t b64 = BitCast(d32, svdup_n_u64_z(m, 1));
|
||||
return U8FromU32(detail::ConcatEven(b64, b64)); // only lower half needed
|
||||
return U8FromU32(detail::ConcatEvenFull(b64, b64)); // lower half
|
||||
}
|
||||
|
||||
// Compacts groups of 8 u8 into 8 contiguous bits in a 64-bit lane.
|
||||
@ -2791,7 +2831,7 @@ namespace detail {
|
||||
return sv##OP##_##CHAR##BITS(a, b); \
|
||||
}
|
||||
|
||||
HWY_SVE_FOREACH_UI64(HWY_SVE_MUL_EVEN, MulEven, mullb)
|
||||
HWY_SVE_FOREACH_UI64(HWY_SVE_MUL_EVEN, MulEvenNative, mullb)
|
||||
#undef HWY_SVE_MUL_EVEN
|
||||
} // namespace detail
|
||||
#endif
|
||||
@ -2799,27 +2839,28 @@ HWY_SVE_FOREACH_UI64(HWY_SVE_MUL_EVEN, MulEven, mullb)
|
||||
template <class V, class DW = RepartitionToWide<DFromV<V>>>
|
||||
HWY_API VFromD<DW> MulEven(const V a, const V b) {
|
||||
#if HWY_TARGET == HWY_SVE2
|
||||
return BitCast(DW(), detail::MulEven(a, b));
|
||||
return BitCast(DW(), detail::MulEvenNative(a, b));
|
||||
#else
|
||||
const auto lo = Mul(a, b);
|
||||
const auto hi = detail::MulHigh(a, b);
|
||||
const auto hi = MulHigh(a, b);
|
||||
return BitCast(DW(), detail::InterleaveEven(lo, hi));
|
||||
#endif
|
||||
}
|
||||
|
||||
HWY_API svuint64_t MulEven(const svuint64_t a, const svuint64_t b) {
|
||||
const auto lo = Mul(a, b);
|
||||
const auto hi = detail::MulHigh(a, b);
|
||||
const auto hi = MulHigh(a, b);
|
||||
return detail::InterleaveEven(lo, hi);
|
||||
}
|
||||
|
||||
HWY_API svuint64_t MulOdd(const svuint64_t a, const svuint64_t b) {
|
||||
const auto lo = Mul(a, b);
|
||||
const auto hi = detail::MulHigh(a, b);
|
||||
const auto hi = MulHigh(a, b);
|
||||
return detail::InterleaveOdd(lo, hi);
|
||||
}
|
||||
|
||||
// ------------------------------ ReorderWidenMulAccumulate (MulAdd, ZipLower)
|
||||
|
||||
template <size_t N, int kPow2>
|
||||
HWY_API svfloat32_t ReorderWidenMulAccumulate(Simd<float, N, kPow2> df32,
|
||||
svuint16_t a, svuint16_t b,
|
||||
@ -2837,6 +2878,33 @@ HWY_API svfloat32_t ReorderWidenMulAccumulate(Simd<float, N, kPow2> df32,
|
||||
return MulAdd(BitCast(df32, a0), BitCast(df32, b0), sum0);
|
||||
}
|
||||
|
||||
template <size_t N, int kPow2>
|
||||
HWY_API svint32_t ReorderWidenMulAccumulate(Simd<int32_t, N, kPow2> d32,
|
||||
svint16_t a, svint16_t b,
|
||||
const svint32_t sum0,
|
||||
svint32_t& sum1) {
|
||||
#if HWY_TARGET == HWY_SVE2 || HWY_TARGET == HWY_SVE2_128
|
||||
(void)d32;
|
||||
sum1 = svmlalt_s32(sum1, a, b);
|
||||
return svmlalb_s32(sum0, a, b);
|
||||
#else
|
||||
const svbool_t pg = detail::PTrue(d32);
|
||||
const svint32_t a0 = svunpklo_s32(a);
|
||||
const svint32_t b0 = svunpklo_s32(b);
|
||||
svint32_t a1, b1;
|
||||
if (detail::IsFull(d32)) {
|
||||
a1 = svunpkhi_s32(a);
|
||||
b1 = svunpkhi_s32(b);
|
||||
} else {
|
||||
const Rebind<int16_t, decltype(d32)> d16h;
|
||||
a1 = svunpklo_s32(UpperHalf(d16h, a));
|
||||
b1 = svunpklo_s32(UpperHalf(d16h, b));
|
||||
}
|
||||
sum1 = svmla_s32_x(pg, sum1, a1, b1);
|
||||
return svmla_s32_x(pg, sum0, a0, b0);
|
||||
#endif
|
||||
}
|
||||
|
||||
// ------------------------------ AESRound / CLMul
|
||||
|
||||
#if defined(__ARM_FEATURE_SVE2_AES) || \
|
||||
@ -2886,7 +2954,8 @@ HWY_SVE_FOREACH_U(HWY_SVE_DUP, DupOddB, trn2) // actually for bool
|
||||
#if HWY_TARGET == HWY_SVE_256 || HWY_IDE
|
||||
template <class D>
|
||||
HWY_INLINE svuint64_t Lt128Vec(D d, const svuint64_t a, const svuint64_t b) {
|
||||
static_assert(!IsSigned<TFromD<D>>() && sizeof(TFromD<D>) == 8, "Use u64");
|
||||
static_assert(!IsSigned<TFromD<D>>() && sizeof(TFromD<D>) == 8,
|
||||
"D must be u64");
|
||||
const svbool_t eqHx = Eq(a, b); // only odd lanes used
|
||||
// Convert to vector: more pipelines can execute vector TRN* instructions
|
||||
// than the predicate version.
|
||||
@ -2905,7 +2974,8 @@ HWY_INLINE svbool_t Lt128(D d, const svuint64_t a, const svuint64_t b) {
|
||||
#if HWY_TARGET == HWY_SVE_256
|
||||
return MaskFromVec(detail::Lt128Vec(d, a, b));
|
||||
#else
|
||||
static_assert(!IsSigned<TFromD<D>>() && sizeof(TFromD<D>) == 8, "Use u64");
|
||||
static_assert(!IsSigned<TFromD<D>>() && sizeof(TFromD<D>) == 8,
|
||||
"D must be u64");
|
||||
const svbool_t eqHx = Eq(a, b); // only odd lanes used
|
||||
const svbool_t ltHL = Lt(a, b);
|
||||
// Move into upper lane: ltL if the upper half is equal, otherwise ltH.
|
||||
@ -2919,18 +2989,21 @@ HWY_INLINE svbool_t Lt128(D d, const svuint64_t a, const svuint64_t b) {
|
||||
|
||||
template <class D>
|
||||
HWY_INLINE svbool_t Lt128Upper(D d, svuint64_t a, svuint64_t b) {
|
||||
static_assert(!IsSigned<TFromD<D>>() && sizeof(TFromD<D>) == 8, "Use u64");
|
||||
static_assert(!IsSigned<TFromD<D>>() && sizeof(TFromD<D>) == 8,
|
                "D must be u64");
  const svbool_t ltHL = Lt(a, b);
  return detail::DupOddB(d, ltHL);
}

// ------------------------------ Eq128
// ------------------------------ Eq128, Ne128

#if HWY_TARGET == HWY_SVE_256 || HWY_IDE
namespace detail {

template <class D>
HWY_INLINE svuint64_t Eq128Vec(D d, const svuint64_t a, const svuint64_t b) {
  static_assert(!IsSigned<TFromD<D>>() && sizeof(TFromD<D>) == 8, "Use u64");
  static_assert(!IsSigned<TFromD<D>>() && sizeof(TFromD<D>) == 8,
                "D must be u64");
  // Convert to vector: more pipelines can execute vector TRN* instructions
  // than the predicate version.
  const svuint64_t eqHL = VecFromMask(d, Eq(a, b));
@ -2939,6 +3012,20 @@ HWY_INLINE svuint64_t Eq128Vec(D d, const svuint64_t a, const svuint64_t b) {
  const svuint64_t eqLL = DupEven(eqHL);
  return And(eqLL, eqHH);
}

template <class D>
HWY_INLINE svuint64_t Ne128Vec(D d, const svuint64_t a, const svuint64_t b) {
  static_assert(!IsSigned<TFromD<D>>() && sizeof(TFromD<D>) == 8,
                "D must be u64");
  // Convert to vector: more pipelines can execute vector TRN* instructions
  // than the predicate version.
  const svuint64_t neHL = VecFromMask(d, Ne(a, b));
  // Duplicate upper and lower.
  const svuint64_t neHH = DupOdd(neHL);
  const svuint64_t neLL = DupEven(neHL);
  return Or(neLL, neHH);
}

} // namespace detail
#endif

@ -2947,7 +3034,8 @@ HWY_INLINE svbool_t Eq128(D d, const svuint64_t a, const svuint64_t b) {
#if HWY_TARGET == HWY_SVE_256
  return MaskFromVec(detail::Eq128Vec(d, a, b));
#else
  static_assert(!IsSigned<TFromD<D>>() && sizeof(TFromD<D>) == 8, "Use u64");
  static_assert(!IsSigned<TFromD<D>>() && sizeof(TFromD<D>) == 8,
                "D must be u64");
  const svbool_t eqHL = Eq(a, b);
  const svbool_t eqHH = detail::DupOddB(d, eqHL);
  const svbool_t eqLL = detail::DupEvenB(d, eqHL);
@ -2955,15 +3043,38 @@ HWY_INLINE svbool_t Eq128(D d, const svuint64_t a, const svuint64_t b) {
#endif // HWY_TARGET != HWY_SVE_256
}

// ------------------------------ Eq128Upper
template <class D>
HWY_INLINE svbool_t Ne128(D d, const svuint64_t a, const svuint64_t b) {
#if HWY_TARGET == HWY_SVE_256
  return MaskFromVec(detail::Ne128Vec(d, a, b));
#else
  static_assert(!IsSigned<TFromD<D>>() && sizeof(TFromD<D>) == 8,
                "D must be u64");
  const svbool_t neHL = Ne(a, b);
  const svbool_t neHH = detail::DupOddB(d, neHL);
  const svbool_t neLL = detail::DupEvenB(d, neHL);
  return Or(neLL, neHH);
#endif // HWY_TARGET != HWY_SVE_256
}

// ------------------------------ Eq128Upper, Ne128Upper

template <class D>
HWY_INLINE svbool_t Eq128Upper(D d, svuint64_t a, svuint64_t b) {
  static_assert(!IsSigned<TFromD<D>>() && sizeof(TFromD<D>) == 8, "Use u64");
  static_assert(!IsSigned<TFromD<D>>() && sizeof(TFromD<D>) == 8,
                "D must be u64");
  const svbool_t eqHL = Eq(a, b);
  return detail::DupOddB(d, eqHL);
}

template <class D>
HWY_INLINE svbool_t Ne128Upper(D d, svuint64_t a, svuint64_t b) {
  static_assert(!IsSigned<TFromD<D>>() && sizeof(TFromD<D>) == 8,
                "D must be u64");
  const svbool_t neHL = Ne(a, b);
  return detail::DupOddB(d, neHL);
}

// ------------------------------ Min128, Max128 (Lt128)

template <class D>
91
third_party/highway/hwy/ops/emu128-inl.h
vendored
@ -18,6 +18,7 @@

#include <stddef.h>
#include <stdint.h>
#include <cmath> // std::abs, std::isnan

#include "hwy/base.h"
#include "hwy/ops/shared-inl.h"
@ -32,6 +33,9 @@ using Full128 = Simd<T, 16 / sizeof(T), 0>;
// (Wrapper class required for overloading comparison operators.)
template <typename T, size_t N = 16 / sizeof(T)>
struct Vec128 {
  using PrivateT = T; // only for DFromV
  static constexpr size_t kPrivateN = N; // only for DFromV

  HWY_INLINE Vec128() = default;
  Vec128(const Vec128&) = default;
  Vec128& operator=(const Vec128&) = default;
@ -78,23 +82,11 @@ struct Mask128 {
  Raw bits[16 / sizeof(T)] = {};
};

namespace detail {

// Deduce Simd<T, N, 0> from Vec128<T, N>
struct Deduce128 {
  template <typename T, size_t N>
  Simd<T, N, 0> operator()(Vec128<T, N>) const {
    return Simd<T, N, 0>();
  }
};

} // namespace detail
template <class V>
using DFromV = Simd<typename V::PrivateT, V::kPrivateN, 0>;

template <class V>
using DFromV = decltype(detail::Deduce128()(V()));

template <class V>
using TFromV = TFromD<DFromV<V>>;
using TFromV = typename V::PrivateT;

// ------------------------------ BitCast

@ -380,6 +372,12 @@ HWY_API Mask128<T, N> Xor(const Mask128<T, N> a, Mask128<T, N> b) {
  return MaskFromVec(Xor(VecFromMask(d, a), VecFromMask(d, b)));
}

template <typename T, size_t N>
HWY_API Mask128<T, N> ExclusiveNeither(const Mask128<T, N> a, Mask128<T, N> b) {
  const Simd<T, N, 0> d;
  return MaskFromVec(AndNot(VecFromMask(d, a), Not(VecFromMask(d, b))));
}

// ================================================== SHIFTS

// ------------------------------ ShiftLeft/ShiftRight (BroadcastSignBit)
@ -1235,6 +1233,14 @@ HWY_API Mask128<uint64_t> Eq128(Simd<uint64_t, 2, 0> /* tag */,
  return ret;
}

HWY_API Mask128<uint64_t> Ne128(Simd<uint64_t, 2, 0> /* tag */,
                                Vec128<uint64_t> a, const Vec128<uint64_t> b) {
  const bool ne = a.raw[1] != b.raw[1] || a.raw[0] != b.raw[0];
  Mask128<uint64_t> ret;
  ret.bits[0] = ret.bits[1] = Mask128<uint64_t>::FromBool(ne);
  return ret;
}

HWY_API Mask128<uint64_t> Eq128Upper(Simd<uint64_t, 2, 0> /* tag */,
                                     Vec128<uint64_t> a,
                                     const Vec128<uint64_t> b) {
@ -1244,6 +1250,15 @@ HWY_API Mask128<uint64_t> Eq128Upper(Simd<uint64_t, 2, 0> /* tag */,
  return ret;
}

HWY_API Mask128<uint64_t> Ne128Upper(Simd<uint64_t, 2, 0> /* tag */,
                                     Vec128<uint64_t> a,
                                     const Vec128<uint64_t> b) {
  const bool ne = a.raw[1] != b.raw[1];
  Mask128<uint64_t> ret;
  ret.bits[0] = ret.bits[1] = Mask128<uint64_t>::FromBool(ne);
  return ret;
}

// ------------------------------ Min128, Max128 (Lt128)

template <class D, class V = VFromD<D>>
@ -1548,6 +1563,22 @@ HWY_API Vec128<bfloat16_t, 2 * N> ReorderDemote2To(
  return BitCast(dbf16, IfVecThenElse(a_mask, BitCast(du32, a), b_in_lower));
}

template <size_t N>
HWY_API Vec128<int16_t, 2 * N> ReorderDemote2To(Simd<int16_t, 2 * N, 0> /*d16*/,
                                                Vec128<int32_t, N> a,
                                                Vec128<int32_t, N> b) {
  const int16_t min = LimitsMin<int16_t>();
  const int16_t max = LimitsMax<int16_t>();
  Vec128<int16_t, 2 * N> ret;
  for (size_t i = 0; i < N; ++i) {
    ret.raw[i] = static_cast<int16_t>(HWY_MIN(HWY_MAX(min, a.raw[i]), max));
  }
  for (size_t i = 0; i < N; ++i) {
    ret.raw[N + i] = static_cast<int16_t>(HWY_MIN(HWY_MAX(min, b.raw[i]), max));
  }
  return ret;
}

namespace detail {

HWY_INLINE void StoreU16ToF16(const uint16_t val,
@ -2233,9 +2264,8 @@ HWY_API bool AllFalse(Simd<T, N, 0> /* tag */, const Mask128<T, N> mask) {

template <typename T, size_t N>
HWY_API bool AllTrue(Simd<T, N, 0> /* tag */, const Mask128<T, N> mask) {
  using Bits = typename Mask128<T, N>::Raw;
  constexpr Bits kAll = static_cast<Bits>(~Bits{0});
  Bits and_sum = kAll;
  constexpr uint64_t kAll = LimitsMax<typename Mask128<T, N>::Raw>();
  uint64_t and_sum = kAll;
  for (size_t i = 0; i < N; ++i) {
    and_sum &= mask.bits[i];
  }
@ -2280,6 +2310,16 @@ HWY_API size_t CountTrue(Simd<T, N, 0> /* tag */, const Mask128<T, N> mask) {
  return count;
}

template <typename T, size_t N>
HWY_API size_t FindKnownFirstTrue(Simd<T, N, 0> /* tag */,
                                  const Mask128<T, N> mask) {
  for (size_t i = 0; i < N; ++i) {
    if (mask.bits[i] != 0) return i;
  }
  HWY_DASSERT(false);
  return 0;
}

template <typename T, size_t N>
HWY_API intptr_t FindFirstTrue(Simd<T, N, 0> /* tag */,
                               const Mask128<T, N> mask) {
@ -2379,6 +2419,7 @@ HWY_API size_t CompressBitsStore(Vec128<T, N> v,
}

// ------------------------------ ReorderWidenMulAccumulate (MulAdd, ZipLower)

template <size_t N>
HWY_API Vec128<float, N> ReorderWidenMulAccumulate(Simd<float, N, 0> df32,
                                                   Vec128<bfloat16_t, 2 * N> a,
@ -2395,6 +2436,20 @@ HWY_API Vec128<float, N> ReorderWidenMulAccumulate(Simd<float, N, 0> df32,
  return MulAdd(BitCast(df32, a0), BitCast(df32, b0), sum0);
}

template <size_t N>
HWY_API Vec128<int32_t, N> ReorderWidenMulAccumulate(
    Simd<int32_t, N, 0> d32, Vec128<int16_t, 2 * N> a, Vec128<int16_t, 2 * N> b,
    const Vec128<int32_t, N> sum0, Vec128<int32_t, N>& sum1) {
  const Rebind<int16_t, decltype(d32)> d16;
  // Avoid ZipLower/Upper so this also works on big-endian systems.
  const Vec128<int32_t, N> a0 = PromoteTo(d32, LowerHalf(d16, a));
  const Vec128<int32_t, N> a1 = PromoteTo(d32, UpperHalf(d16, a));
  const Vec128<int32_t, N> b0 = PromoteTo(d32, LowerHalf(d16, b));
  const Vec128<int32_t, N> b1 = PromoteTo(d32, UpperHalf(d16, b));
  sum1 = MulAdd(BitCast(d32, a1), BitCast(d32, b1), sum1);
  return MulAdd(BitCast(df32, a0), BitCast(df32, b0), sum0);
}

// ================================================== REDUCTIONS

template <typename T, size_t N>
89
third_party/highway/hwy/ops/generic_ops-inl.h
vendored
@ -15,6 +15,14 @@

// Target-independent types/functions defined after target-specific ops.

#include "hwy/base.h"

// Define detail::Shuffle1230 etc, but only when viewing the current header;
// normally this is included via highway.h, which includes ops/*.h.
#if HWY_IDE && !defined(HWY_HIGHWAY_INCLUDED)
#include "hwy/ops/emu128-inl.h"
#endif // HWY_IDE

// Relies on the external include guard in highway.h.
HWY_BEFORE_NAMESPACE();
namespace hwy {
@ -476,31 +484,15 @@ HWY_API void StoreInterleaved2(const V v0, const V v1, Simd<T, N, 0> d,
  detail::StoreTransposedBlocks2(v10L, v10U, d, unaligned);
}

// 64 bits
template <typename T>
HWY_API void StoreInterleaved2(const Vec64<T> part0, const Vec64<T> part1,
                               Full64<T> /*tag*/, T* HWY_RESTRICT unaligned) {
  // Use full vectors to reduce the number of stores.
  const Full128<T> d_full;
  const Vec128<T> v0{part0.raw};
  const Vec128<T> v1{part1.raw};
  const auto v10 = InterleaveLower(d_full, v0, v1);
  StoreU(v10, d_full, unaligned);
}

// <= 32 bits
template <typename T, size_t N, HWY_IF_LE32(T, N)>
HWY_API void StoreInterleaved2(const Vec128<T, N> part0,
                               const Vec128<T, N> part1, Simd<T, N, 0> /*tag*/,
// <= 64 bits
template <class V, typename T, size_t N, HWY_IF_LE64(T, N)>
HWY_API void StoreInterleaved2(const V part0, const V part1, Simd<T, N, 0> d,
                               T* HWY_RESTRICT unaligned) {
  // Use full vectors to reduce the number of stores.
  const Full128<T> d_full;
  const Vec128<T> v0{part0.raw};
  const Vec128<T> v1{part1.raw};
  const auto v10 = InterleaveLower(d_full, v0, v1);
  alignas(16) T buf[16 / sizeof(T)];
  StoreU(v10, d_full, buf);
  CopyBytes<2 * N * sizeof(T)>(buf, unaligned);
  const Twice<decltype(d)> d2;
  const auto v0 = ZeroExtendVector(d2, part0);
  const auto v1 = ZeroExtendVector(d2, part1);
  const auto v10 = InterleaveLower(d2, v0, v1);
  StoreU(v10, d2, unaligned);
}

// ------------------------------ StoreInterleaved3 (CombineShiftRightBytes,
@ -526,8 +518,9 @@ template <typename T, size_t N, class V, HWY_IF_LANE_SIZE(T, 1),
HWY_API void StoreInterleaved3(const V v0, const V v1, const V v2,
                               Simd<T, N, 0> d, T* HWY_RESTRICT unaligned) {
  const RebindToUnsigned<decltype(d)> du;
  const auto k5 = Set(du, 5);
  const auto k6 = Set(du, 6);
  using TU = TFromD<decltype(du)>;
  const auto k5 = Set(du, TU{5});
  const auto k6 = Set(du, TU{6});

  // Interleave (v0,v1,v2) to (MSB on left, lane 0 on right):
  // v0[5], v2[4],v1[4],v0[4] .. v2[0],v1[0],v0[0]. We're expanding v0 lanes
@ -576,8 +569,8 @@ template <typename T, size_t N, class V, HWY_IF_LANE_SIZE(T, 2),
HWY_API void StoreInterleaved3(const V v0, const V v1, const V v2,
                               Simd<T, N, 0> d, T* HWY_RESTRICT unaligned) {
  const Repartition<uint8_t, decltype(d)> du8;
  const auto k2 = Set(du8, 2 * sizeof(T));
  const auto k3 = Set(du8, 3 * sizeof(T));
  const auto k2 = Set(du8, uint8_t{2 * sizeof(T)});
  const auto k3 = Set(du8, uint8_t{3 * sizeof(T)});

  // Interleave (v0,v1,v2) to (MSB on left, lane 0 on right):
  // v1[2],v0[2], v2[1],v1[1],v0[1], v2[0],v1[0],v0[0]. 0x80 so lanes to be
@ -666,16 +659,15 @@ HWY_API void StoreInterleaved3(const V v0, const V v1, const V v2,
}

// 64-bit vector, 8-bit lanes
template <typename T, HWY_IF_LANE_SIZE(T, 1)>
HWY_API void StoreInterleaved3(const Vec64<T> part0, const Vec64<T> part1,
                               const Vec64<T> part2, Full64<T> d,
                               T* HWY_RESTRICT unaligned) {
template <class V, typename T, HWY_IF_LANE_SIZE(T, 1)>
HWY_API void StoreInterleaved3(const V part0, const V part1, const V part2,
                               Full64<T> d, T* HWY_RESTRICT unaligned) {
  constexpr size_t N = 16 / sizeof(T);
  // Use full vectors for the shuffles and first result.
  const Full128<uint8_t> du;
  const Full128<T> d_full;
  const auto k5 = Set(du, 5);
  const auto k6 = Set(du, 6);
  const auto k5 = Set(du, uint8_t{5});
  const auto k6 = Set(du, uint8_t{6});

  const Vec128<T> v0{part0.raw};
  const Vec128<T> v1{part1.raw};
@ -708,7 +700,7 @@ HWY_API void StoreInterleaved3(const Vec64<T> part0, const Vec64<T> part1,
  const auto B0 = TableLookupBytesOr0(v0, shuf_B0);
  const auto B1 = TableLookupBytesOr0(v1, shuf_B1);
  const auto B2 = TableLookupBytesOr0(v2, shuf_B2);
  const Vec64<T> B{(B0 | B1 | B2).raw};
  const V B{(B0 | B1 | B2).raw};
  StoreU(B, d, unaligned + 1 * N);
}

@ -720,8 +712,8 @@ HWY_API void StoreInterleaved3(const Vec64<T> part0, const Vec64<T> part1,
  const Full128<T> d;
  const Full128<uint8_t> du8;
  constexpr size_t N = 16 / sizeof(T);
  const auto k2 = Set(du8, 2 * sizeof(T));
  const auto k3 = Set(du8, 3 * sizeof(T));
  const auto k2 = Set(du8, uint8_t{2 * sizeof(T)});
  const auto k3 = Set(du8, uint8_t{3 * sizeof(T)});

  const Vec128<T> v0{part0.raw};
  const Vec128<T> v1{part1.raw};
@ -975,7 +967,7 @@ HWY_API void StoreInterleaved4(const Vec128<T, N> part0,
// ------------------------------ AESRound

// Cannot implement on scalar: need at least 16 bytes for TableLookupBytes.
#if HWY_TARGET != HWY_SCALAR
#if HWY_TARGET != HWY_SCALAR || HWY_IDE

// Define for white-box testing, even if native instructions are available.
namespace detail {
@ -991,7 +983,7 @@ namespace detail {
template <class V> // u8
HWY_INLINE V SubBytes(V state) {
  const DFromV<V> du;
  const auto mask = Set(du, 0xF);
  const auto mask = Set(du, uint8_t{0xF});

  // Change polynomial basis to GF(2^4)
  {
@ -1034,7 +1026,7 @@ HWY_INLINE V SubBytes(V state) {
      0xFA, 0x35, 0x2B, 0x41, 0xD1, 0x90, 0x1E, 0x8E};
  const auto affL = TableLookupBytesOr0(LoadDup128(du, kAffineL), outL);
  const auto affU = TableLookupBytesOr0(LoadDup128(du, kAffineU), outU);
  return Xor(Xor(affL, affU), Set(du, 0x63));
  return Xor(Xor(affL, affU), Set(du, uint8_t{0x63}));
}

} // namespace detail
@ -1080,7 +1072,7 @@ HWY_API V MixColumns(const V state) {
      1, 2, 3, 0, 5, 6, 7, 4, 9, 10, 11, 8, 13, 14, 15, 12};
  const RebindToSigned<decltype(du)> di; // can only do signed comparisons
  const auto msb = Lt(BitCast(di, state), Zero(di));
  const auto overflow = BitCast(du, IfThenElseZero(msb, Set(di, 0x1B)));
  const auto overflow = BitCast(du, IfThenElseZero(msb, Set(di, int8_t{0x1B})));
  const auto d = Xor(Add(state, state), overflow); // = state*2 in GF(2^8).
  const auto s2301 = TableLookupBytes(state, LoadDup128(du, k2301));
  const auto d_s2301 = Xor(d, s2301);
@ -1200,7 +1192,7 @@ HWY_API V PopulationCount(V v) {
  HWY_ALIGN constexpr uint8_t kLookup[16] = {
      0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,
  };
  const auto lo = And(v, Set(d, 0xF));
  const auto lo = And(v, Set(d, uint8_t{0xF}));
  const auto hi = ShiftRight<4>(v);
  const auto lookup = LoadDup128(d, kLookup);
  return Add(TableLookupBytes(lookup, hi), TableLookupBytes(lookup, lo));
@ -1215,9 +1207,10 @@ HWY_API V PopulationCount(V v) {
  static_assert(IsSame<TFromD<D>, uint8_t>(), "V must be u8");
  const D d;
  // See https://arxiv.org/pdf/1611.07612.pdf, Figure 3
  v = Sub(v, And(ShiftRight<1>(v), Set(d, 0x55)));
  v = Add(And(ShiftRight<2>(v), Set(d, 0x33)), And(v, Set(d, 0x33)));
  return And(Add(v, ShiftRight<4>(v)), Set(d, 0x0F));
  const V k33 = Set(d, uint8_t{0x33});
  v = Sub(v, And(ShiftRight<1>(v), Set(d, uint8_t{0x55})));
  v = Add(And(ShiftRight<2>(v), k33), And(v, k33));
  return And(Add(v, ShiftRight<4>(v)), Set(d, uint8_t{0x0F}));
}
#endif // HWY_TARGET != HWY_RVV

@ -1227,7 +1220,7 @@ HWY_API V PopulationCount(V v) {
  const D d;
  const Repartition<uint8_t, decltype(d)> d8;
  const auto vals = BitCast(d, PopulationCount(BitCast(d8, v)));
  return Add(ShiftRight<8>(vals), And(vals, Set(d, 0xFF)));
  return Add(ShiftRight<8>(vals), And(vals, Set(d, uint16_t{0xFF})));
}

template <typename V, class D = DFromV<V>, HWY_IF_LANE_SIZE_D(D, 4)>
@ -1236,7 +1229,7 @@ HWY_API V PopulationCount(V v) {
  const D d;
  Repartition<uint16_t, decltype(d)> d16;
  auto vals = BitCast(d, PopulationCount(BitCast(d16, v)));
  return Add(ShiftRight<16>(vals), And(vals, Set(d, 0xFF)));
  return Add(ShiftRight<16>(vals), And(vals, Set(d, uint32_t{0xFF})));
}

#if HWY_HAVE_INTEGER64
@ -1246,7 +1239,7 @@ HWY_API V PopulationCount(V v) {
  const D d;
  Repartition<uint32_t, decltype(d)> d32;
  auto vals = BitCast(d, PopulationCount(BitCast(d32, v)));
  return Add(ShiftRight<32>(vals), And(vals, Set(d, 0xFF)));
  return Add(ShiftRight<32>(vals), And(vals, Set(d, 0xFFULL)));
}
#endif

189
third_party/highway/hwy/ops/rvv-inl.h
vendored
@ -494,9 +494,11 @@ using VFromD = decltype(Set(D(), TFromD<D>()));
|
||||
|
||||
// ------------------------------ Zero
|
||||
|
||||
template <typename T, size_t N, int kPow2>
|
||||
HWY_API VFromD<Simd<T, N, kPow2>> Zero(Simd<T, N, kPow2> d) {
|
||||
return Set(d, T(0));
|
||||
template <class D>
|
||||
HWY_API VFromD<D> Zero(D d) {
|
||||
// Cast to support bfloat16_t.
|
||||
const RebindToUnsigned<decltype(d)> du;
|
||||
return BitCast(d, Set(du, 0));
|
||||
}
|
||||
|
||||
// ------------------------------ Undefined
|
||||
@ -1109,6 +1111,9 @@ HWY_RVV_FOREACH_B(HWY_RVV_RETM_ARGMM, Or, or)
|
||||
// ------------------------------ Xor
|
||||
HWY_RVV_FOREACH_B(HWY_RVV_RETM_ARGMM, Xor, xor)
|
||||
|
||||
// ------------------------------ ExclusiveNeither
|
||||
HWY_RVV_FOREACH_B(HWY_RVV_RETM_ARGMM, ExclusiveNeither, xnor)
|
||||
|
||||
#undef HWY_RVV_RETM_ARGMM
|
||||
|
||||
// ------------------------------ IfThenElse
|
||||
@ -1219,14 +1224,19 @@ HWY_API V IfNegativeThenElse(V v, V yes, V no) {
|
||||
|
||||
// ------------------------------ FindFirstTrue
|
||||
|
||||
#define HWY_RVV_FIND_FIRST_TRUE(SEW, SHIFT, MLEN, NAME, OP) \
|
||||
template <class D> \
|
||||
HWY_API intptr_t FindFirstTrue(D d, HWY_RVV_M(MLEN) m) { \
|
||||
static_assert(MLenFromD(d) == MLEN, "Type mismatch"); \
|
||||
return vfirst_m_b##MLEN(m, Lanes(d)); \
|
||||
#define HWY_RVV_FIND_FIRST_TRUE(SEW, SHIFT, MLEN, NAME, OP) \
|
||||
template <class D> \
|
||||
HWY_API intptr_t FindFirstTrue(D d, HWY_RVV_M(MLEN) m) { \
|
||||
static_assert(MLenFromD(d) == MLEN, "Type mismatch"); \
|
||||
return vfirst_m_b##MLEN(m, Lanes(d)); \
|
||||
} \
|
||||
template <class D> \
|
||||
HWY_API size_t FindKnownFirstTrue(D d, HWY_RVV_M(MLEN) m) { \
|
||||
static_assert(MLenFromD(d) == MLEN, "Type mismatch"); \
|
||||
return static_cast<size_t>(vfirst_m_b##MLEN(m, Lanes(d))); \
|
||||
}
|
||||
|
||||
HWY_RVV_FOREACH_B(HWY_RVV_FIND_FIRST_TRUE, _, _)
|
||||
HWY_RVV_FOREACH_B(HWY_RVV_FIND_FIRST_TRUE, , _)
|
||||
#undef HWY_RVV_FIND_FIRST_TRUE
|
||||
|
||||
// ------------------------------ AllFalse
|
||||
@ -2642,9 +2652,10 @@ HWY_API V ShiftLeftLanes(const D d, const V v) {
|
||||
using TI = TFromD<decltype(di)>;
|
||||
const auto shifted = detail::SlideUp(v, v, kLanes);
|
||||
// Match x86 semantics by zeroing lower lanes in 128-bit blocks
|
||||
const auto idx_mod = detail::AndS(
|
||||
detail::Iota0(di), static_cast<TI>(detail::LanesPerBlock(di) - 1));
|
||||
const auto clear = detail::LtS(BitCast(di, idx_mod), static_cast<TI>(kLanes));
|
||||
const auto idx_mod =
|
||||
detail::AndS(BitCast(di, detail::Iota0(di)),
|
||||
static_cast<TI>(detail::LanesPerBlock(di) - 1));
|
||||
const auto clear = detail::LtS(idx_mod, static_cast<TI>(kLanes));
|
||||
return IfThenZeroElse(clear, shifted);
|
||||
}
|
||||
|
||||
@ -2681,9 +2692,8 @@ HWY_API V ShiftRightLanes(const Simd<T, N, kPow2> d, V v) {
|
||||
// Match x86 semantics by zeroing upper lanes in 128-bit blocks
|
||||
const size_t lpb = detail::LanesPerBlock(di);
|
||||
const auto idx_mod =
|
||||
detail::AndS(detail::Iota0(di), static_cast<TI>(lpb - 1));
|
||||
const auto keep =
|
||||
detail::LtS(BitCast(di, idx_mod), static_cast<TI>(lpb - kLanes));
|
||||
detail::AndS(BitCast(di, detail::Iota0(di)), static_cast<TI>(lpb - 1));
|
||||
const auto keep = detail::LtS(idx_mod, static_cast<TI>(lpb - kLanes));
|
||||
return IfThenElseZero(keep, shifted);
|
||||
}
|
||||
|
||||
@ -2827,12 +2837,14 @@ HWY_API V PopulationCount(V v) {
|
||||
|
||||
// ------------------------------ LoadDup128
|
||||
|
||||
template <class D, typename T = TFromD<D>>
|
||||
HWY_API VFromD<D> LoadDup128(D d, const T* const HWY_RESTRICT p) {
|
||||
const auto loaded = Load(d, p);
|
||||
// Broadcast the first block
|
||||
const auto idx = detail::AndS(detail::Iota0(d),
|
||||
static_cast<T>(detail::LanesPerBlock(d) - 1));
|
||||
template <class D>
|
||||
HWY_API VFromD<D> LoadDup128(D d, const TFromD<D>* const HWY_RESTRICT p) {
|
||||
const VFromD<D> loaded = Load(d, p);
|
||||
// idx must be unsigned for TableLookupLanes.
|
||||
using TU = MakeUnsigned<TFromD<D>>;
|
||||
const TU mask = static_cast<TU>(detail::LanesPerBlock(d) - 1);
|
||||
// Broadcast the first block.
|
||||
const VFromD<RebindToUnsigned<D>> idx = detail::AndS(detail::Iota0(d), mask);
|
||||
return TableLookupLanes(loaded, idx);
|
||||
}
|
||||
|
||||
@ -3086,7 +3098,7 @@ HWY_INLINE V MulOdd(const V a, const V b) {
|
||||
return OddEven(hi, detail::Slide1Down(lo));
|
||||
}
|
||||
|
||||
// ------------------------------ ReorderDemote2To (OddEven)
|
||||
// ------------------------------ ReorderDemote2To (OddEven, Combine)
|
||||
|
||||
template <size_t N, int kPow2>
|
||||
HWY_API VFromD<Simd<uint16_t, N, kPow2>> ReorderDemote2To(
|
||||
@ -3099,22 +3111,42 @@ HWY_API VFromD<Simd<uint16_t, N, kPow2>> ReorderDemote2To(
|
||||
return BitCast(dbf16, OddEven(BitCast(du16, a), BitCast(du16, b_in_even)));
|
||||
}
|
||||
|
||||
// If LMUL is not the max, Combine first to avoid another DemoteTo.
|
||||
template <size_t N, int kPow2, hwy::EnableIf<(kPow2 < 3)>* = nullptr,
|
||||
class D32 = RepartitionToWide<Simd<int16_t, N, kPow2>>>
|
||||
HWY_API VFromD<Simd<int16_t, N, kPow2>> ReorderDemote2To(
|
||||
Simd<int16_t, N, kPow2> d16, VFromD<D32> a, VFromD<D32> b) {
|
||||
const Twice<D32> d32t;
|
||||
const VFromD<decltype(d32t)> ab = Combine(d32t, a, b);
|
||||
return DemoteTo(d16, ab);
|
||||
}
|
||||
|
||||
// Max LMUL: must DemoteTo first, then Combine.
|
||||
template <size_t N, class V32 = VFromD<RepartitionToWide<Simd<int16_t, N, 3>>>>
|
||||
HWY_API VFromD<Simd<int16_t, N, 3>> ReorderDemote2To(Simd<int16_t, N, 3> d16,
|
||||
V32 a, V32 b) {
|
||||
const Half<decltype(d16)> d16h;
|
||||
const VFromD<decltype(d16h)> a16 = DemoteTo(d16h, a);
|
||||
const VFromD<decltype(d16h)> b16 = DemoteTo(d16h, b);
|
||||
return Combine(d16, a16, b16);
|
||||
}
|
||||
|
||||
// ------------------------------ ReorderWidenMulAccumulate (MulAdd, ZipLower)
|
||||
|
||||
template <class DF>
|
||||
using DU16FromDF = RepartitionToNarrow<RebindToUnsigned<DF>>;
|
||||
namespace detail {
|
||||
|
||||
template <size_t N, int kPow2>
|
||||
HWY_API auto ReorderWidenMulAccumulate(Simd<float, N, kPow2> df32,
|
||||
VFromD<DU16FromDF<decltype(df32)>> a,
|
||||
VFromD<DU16FromDF<decltype(df32)>> b,
|
||||
const VFromD<decltype(df32)> sum0,
|
||||
VFromD<decltype(df32)>& sum1)
|
||||
-> VFromD<decltype(df32)> {
|
||||
const DU16FromDF<decltype(df32)> du16;
|
||||
const RebindToUnsigned<decltype(df32)> du32;
|
||||
// Non-overloaded wrapper function so we can define DF32 in template args.
|
||||
template <
|
||||
size_t N, int kPow2, class DF32 = Simd<float, N, kPow2>,
|
||||
class VF32 = VFromD<DF32>,
|
||||
class DU16 = RepartitionToNarrow<RebindToUnsigned<Simd<float, N, kPow2>>>>
|
||||
HWY_API VF32 ReorderWidenMulAccumulateBF16(Simd<float, N, kPow2> df32,
|
||||
VFromD<DU16> a, VFromD<DU16> b,
|
||||
const VF32 sum0, VF32& sum1) {
|
||||
const DU16 du16;
|
||||
const RebindToUnsigned<DF32> du32;
|
||||
using VU32 = VFromD<decltype(du32)>;
|
||||
const VFromD<decltype(du16)> zero = Zero(du16);
|
||||
const VFromD<DU16> zero = Zero(du16);
|
||||
const VU32 a0 = ZipLower(du32, zero, BitCast(du16, a));
|
||||
const VU32 a1 = ZipUpper(du32, zero, BitCast(du16, a));
|
||||
const VU32 b0 = ZipLower(du32, zero, BitCast(du16, b));
|
||||
@ -3123,10 +3155,68 @@ HWY_API auto ReorderWidenMulAccumulate(Simd<float, N, kPow2> df32,
|
||||
return MulAdd(BitCast(df32, a0), BitCast(df32, b0), sum0);
|
||||
}
|
||||
|
||||
#define HWY_RVV_WIDEN_MACC(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \
|
||||
SHIFT, MLEN, NAME, OP) \
|
||||
template <size_t N> \
|
||||
HWY_API HWY_RVV_V(BASE, SEWD, LMULD) NAME( \
|
||||
HWY_RVV_D(BASE, SEWD, N, SHIFT + 1) d, HWY_RVV_V(BASE, SEWD, LMULD) sum, \
|
||||
HWY_RVV_V(BASE, SEW, LMUL) a, HWY_RVV_V(BASE, SEW, LMUL) b) { \
|
||||
return OP##CHAR##SEWD##LMULD(sum, a, b, Lanes(d)); \
|
||||
}
|
||||
|
||||
HWY_RVV_FOREACH_I16(HWY_RVV_WIDEN_MACC, WidenMulAcc, vwmacc_vv_, _EXT_VIRT)
|
||||
#undef HWY_RVV_WIDEN_MACC
|
||||
|
||||
// If LMUL is not the max, we can WidenMul first (3 instructions).
|
||||
template <size_t N, int kPow2, hwy::EnableIf<(kPow2 < 3)>* = nullptr,
|
||||
class D32 = Simd<int32_t, N, kPow2>, class V32 = VFromD<D32>,
|
||||
class D16 = RepartitionToNarrow<D32>>
|
||||
HWY_API VFromD<D32> ReorderWidenMulAccumulateI16(Simd<int32_t, N, kPow2> d32,
|
||||
VFromD<D16> a, VFromD<D16> b,
|
||||
const V32 sum0, V32& sum1) {
|
||||
const Twice<decltype(d32)> d32t;
|
||||
using V32T = VFromD<decltype(d32t)>;
|
||||
V32T sum = Combine(d32t, sum0, sum1);
|
||||
sum = detail::WidenMulAcc(d32t, sum, a, b);
|
||||
sum1 = UpperHalf(d32, sum);
|
||||
return LowerHalf(d32, sum);
|
||||
}
|
||||
|
||||
// Max LMUL: must LowerHalf first (4 instructions).
|
||||
template <size_t N, class D32 = Simd<int32_t, N, 3>, class V32 = VFromD<D32>,
|
||||
class D16 = RepartitionToNarrow<D32>>
|
||||
HWY_API VFromD<D32> ReorderWidenMulAccumulateI16(Simd<int32_t, N, 3> d32,
|
||||
VFromD<D16> a, VFromD<D16> b,
|
||||
const V32 sum0, V32& sum1) {
|
||||
const Half<D16> d16h;
|
||||
using V16H = VFromD<decltype(d16h)>;
|
||||
const V16H a0 = LowerHalf(d16h, a);
|
||||
const V16H a1 = UpperHalf(d16h, a);
|
||||
const V16H b0 = LowerHalf(d16h, b);
|
||||
const V16H b1 = UpperHalf(d16h, b);
|
||||
sum1 = detail::WidenMulAcc(d32, sum1, a1, b1);
|
||||
return detail::WidenMulAcc(d32, sum0, a0, b0);
|
||||
}
|
||||
|
||||
} // namespace detail
|
||||
|
||||
template <size_t N, int kPow2, class VN, class VW>
|
||||
HWY_API VW ReorderWidenMulAccumulate(Simd<float, N, kPow2> d32, VN a, VN b,
|
||||
const VW sum0, VW& sum1) {
|
||||
return detail::ReorderWidenMulAccumulateBF16(d32, a, b, sum0, sum1);
|
||||
}
|
||||
|
||||
template <size_t N, int kPow2, class VN, class VW>
|
||||
HWY_API VW ReorderWidenMulAccumulate(Simd<int32_t, N, kPow2> d32, VN a, VN b,
|
||||
const VW sum0, VW& sum1) {
|
||||
return detail::ReorderWidenMulAccumulateI16(d32, a, b, sum0, sum1);
|
||||
}
|
||||
|
||||
// ------------------------------ Lt128
|
||||
template <class D>
|
||||
HWY_INLINE MFromD<D> Lt128(D d, const VFromD<D> a, const VFromD<D> b) {
|
||||
static_assert(!IsSigned<TFromD<D>>() && sizeof(TFromD<D>) == 8, "Use u64");
|
||||
static_assert(!IsSigned<TFromD<D>>() && sizeof(TFromD<D>) == 8,
|
||||
"D must be u64");
|
||||
// Truth table of Eq and Compare for Hi and Lo u64.
|
||||
// (removed lines with (=H && cH) or (=L && cL) - cannot both be true)
|
||||
// =H =L cH cL | out = cH | (=H & cL)
|
||||
@ -3152,7 +3242,8 @@ HWY_INLINE MFromD<D> Lt128(D d, const VFromD<D> a, const VFromD<D> b) {
|
||||
// ------------------------------ Lt128Upper
|
||||
template <class D>
|
||||
HWY_INLINE MFromD<D> Lt128Upper(D d, const VFromD<D> a, const VFromD<D> b) {
|
||||
static_assert(!IsSigned<TFromD<D>>() && sizeof(TFromD<D>) == 8, "Use u64");
|
||||
static_assert(!IsSigned<TFromD<D>>() && sizeof(TFromD<D>) == 8,
|
||||
"D must be u64");
|
||||
const VFromD<D> ltHL = VecFromMask(d, Lt(a, b));
|
||||
// Replicate H to its neighbor.
|
||||
return MaskFromVec(OddEven(ltHL, detail::Slide1Down(ltHL)));
|
||||
@ -3161,7 +3252,8 @@ HWY_INLINE MFromD<D> Lt128Upper(D d, const VFromD<D> a, const VFromD<D> b) {
|
||||
// ------------------------------ Eq128
|
||||
template <class D>
|
||||
HWY_INLINE MFromD<D> Eq128(D d, const VFromD<D> a, const VFromD<D> b) {
|
||||
static_assert(!IsSigned<TFromD<D>>() && sizeof(TFromD<D>) == 8, "Use u64");
|
||||
static_assert(!IsSigned<TFromD<D>>() && sizeof(TFromD<D>) == 8,
|
||||
"D must be u64");
|
||||
const VFromD<D> eqHL = VecFromMask(d, Eq(a, b));
|
||||
const VFromD<D> eqLH = Reverse2(d, eqHL);
|
||||
return MaskFromVec(And(eqHL, eqLH));
|
||||
@ -3170,12 +3262,33 @@ HWY_INLINE MFromD<D> Eq128(D d, const VFromD<D> a, const VFromD<D> b) {
|
||||
// ------------------------------ Eq128Upper
|
||||
template <class D>
|
||||
HWY_INLINE MFromD<D> Eq128Upper(D d, const VFromD<D> a, const VFromD<D> b) {
|
||||
static_assert(!IsSigned<TFromD<D>>() && sizeof(TFromD<D>) == 8, "Use u64");
|
||||
static_assert(!IsSigned<TFromD<D>>() && sizeof(TFromD<D>) == 8,
|
||||
"D must be u64");
|
||||
const VFromD<D> eqHL = VecFromMask(d, Eq(a, b));
|
||||
// Replicate H to its neighbor.
|
||||
return MaskFromVec(OddEven(eqHL, detail::Slide1Down(eqHL)));
|
||||
}
|
||||
|
||||
// ------------------------------ Ne128
|
||||
template <class D>
|
||||
HWY_INLINE MFromD<D> Ne128(D d, const VFromD<D> a, const VFromD<D> b) {
|
||||
static_assert(!IsSigned<TFromD<D>>() && sizeof(TFromD<D>) == 8,
|
||||
"D must be u64");
|
||||
const VFromD<D> neHL = VecFromMask(d, Ne(a, b));
|
||||
const VFromD<D> neLH = Reverse2(d, neHL);
|
||||
return MaskFromVec(Or(neHL, neLH));
|
||||
}
|
||||
|
||||
// ------------------------------ Ne128Upper
|
||||
template <class D>
|
||||
HWY_INLINE MFromD<D> Ne128Upper(D d, const VFromD<D> a, const VFromD<D> b) {
|
||||
static_assert(!IsSigned<TFromD<D>>() && sizeof(TFromD<D>) == 8,
|
||||
"D must be u64");
|
||||
const VFromD<D> neHL = VecFromMask(d, Ne(a, b));
|
||||
// Replicate H to its neighbor.
|
||||
return MaskFromVec(OddEven(neHL, detail::Slide1Down(neHL)));
|
||||
}
|
||||
|
||||
// ------------------------------ Min128, Max128 (Lt128)
|
||||
|
||||
template <class D>
|
||||
|
128
third_party/highway/hwy/ops/scalar-inl.h
vendored
@ -33,6 +33,9 @@ using Sisd = Simd<T, 1, 0>;
|
||||
// (Wrapper class required for overloading comparison operators.)
|
||||
template <typename T>
|
||||
struct Vec1 {
|
||||
using PrivateT = T; // only for DFromV
|
||||
static constexpr size_t kPrivateN = 1; // only for DFromV
|
||||
|
||||
HWY_INLINE Vec1() = default;
|
||||
Vec1(const Vec1&) = default;
|
||||
Vec1& operator=(const Vec1&) = default;
|
||||
@ -78,23 +81,11 @@ class Mask1 {
|
||||
Raw bits;
|
||||
};
|
||||
|
||||
namespace detail {
|
||||
|
||||
// Deduce Sisd<T> from Vec1<T>
|
||||
struct Deduce1 {
|
||||
template <typename T>
|
||||
Sisd<T> operator()(Vec1<T>) const {
|
||||
return Sisd<T>();
|
||||
}
|
||||
};
|
||||
|
||||
} // namespace detail
|
||||
template <class V>
|
||||
using DFromV = Simd<typename V::PrivateT, V::kPrivateN, 0>;
|
||||
|
||||
template <class V>
|
||||
using DFromV = decltype(detail::Deduce1()(V()));
|
||||
|
||||
template <class V>
|
||||
using TFromV = TFromD<DFromV<V>>;
|
||||
using TFromV = typename V::PrivateT;
|
||||
|
||||
// ------------------------------ BitCast
|
||||
|
||||
@ -341,6 +332,12 @@ HWY_API Mask1<T> Xor(const Mask1<T> a, Mask1<T> b) {
|
||||
return MaskFromVec(Xor(VecFromMask(d, a), VecFromMask(d, b)));
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
HWY_API Mask1<T> ExclusiveNeither(const Mask1<T> a, Mask1<T> b) {
|
||||
const Sisd<T> d;
|
||||
return MaskFromVec(AndNot(VecFromMask(d, a), Not(VecFromMask(d, b))));
|
||||
}
|
||||
|
||||
// ================================================== SHIFTS
|
||||
|
||||
// ------------------------------ ShiftLeft/ShiftRight (BroadcastSignBit)
|
||||
@ -365,7 +362,7 @@ HWY_API Vec1<T> ShiftRight(const Vec1<T> v) {
|
||||
// signed shifts are still implementation-defined.
|
||||
using TU = hwy::MakeUnsigned<T>;
|
||||
const Sisd<TU> du;
|
||||
const TU shifted = BitCast(du, v).raw >> kBits;
|
||||
const TU shifted = static_cast<TU>(BitCast(du, v).raw >> kBits);
|
||||
const TU sign = BitCast(du, BroadcastSignBit(v)).raw;
|
||||
const size_t sign_shift =
|
||||
static_cast<size_t>(static_cast<int>(sizeof(TU)) * 8 - 1 - kBits);
|
||||
@ -426,7 +423,7 @@ HWY_API Vec1<T> ShiftRightSame(const Vec1<T> v, int bits) {
|
||||
// signed shifts are still implementation-defined.
|
||||
using TU = hwy::MakeUnsigned<T>;
|
||||
const Sisd<TU> du;
|
||||
const TU shifted = BitCast(du, v).raw >> bits;
|
||||
const TU shifted = static_cast<TU>(BitCast(du, v).raw >> bits);
|
||||
const TU sign = BitCast(du, BroadcastSignBit(v)).raw;
|
||||
const size_t sign_shift =
|
||||
static_cast<size_t>(static_cast<int>(sizeof(TU)) * 8 - 1 - bits);
|
||||
@ -557,16 +554,47 @@ HWY_API Vec1<uint16_t> AverageRound(const Vec1<uint16_t> a,
|
||||
template <typename T>
|
||||
HWY_API Vec1<T> Abs(const Vec1<T> a) {
|
||||
const T i = a.raw;
|
||||
return (i >= 0 || i == hwy::LimitsMin<T>()) ? a : Vec1<T>(-i);
|
||||
return (i >= 0 || i == hwy::LimitsMin<T>()) ? a : Vec1<T>(static_cast<T>(-i));
|
||||
}
|
||||
HWY_API Vec1<float> Abs(const Vec1<float> a) {
|
||||
return Vec1<float>(std::abs(a.raw));
|
||||
return Vec1<float>(fabsf(a.raw));
|
||||
}
|
||||
HWY_API Vec1<double> Abs(const Vec1<double> a) {
|
||||
return Vec1<double>(std::abs(a.raw));
|
||||
return Vec1<double>(fabs(a.raw));
|
||||
}
|
||||
|
||||
// ------------------------------ min/max
|
||||
// ------------------------------ Min/Max
|
||||
|
||||
// <cmath> may be unavailable, so implement our own.
|
||||
namespace detail {
|
||||
|
||||
static inline float Abs(float f) {
|
||||
uint32_t i;
|
||||
CopyBytes<4>(&f, &i);
|
||||
i &= 0x7FFFFFFFu;
|
||||
CopyBytes<4>(&i, &f);
|
||||
return f;
|
||||
}
|
||||
static inline double Abs(double f) {
|
||||
uint64_t i;
|
||||
CopyBytes<8>(&f, &i);
|
||||
i &= 0x7FFFFFFFFFFFFFFFull;
|
||||
CopyBytes<8>(&i, &f);
|
||||
return f;
|
||||
}
|
||||
|
||||
static inline bool SignBit(float f) {
|
||||
uint32_t i;
|
||||
CopyBytes<4>(&f, &i);
|
||||
return (i >> 31) != 0;
|
||||
}
|
||||
static inline bool SignBit(double f) {
|
||||
uint64_t i;
|
||||
CopyBytes<8>(&f, &i);
|
||||
return (i >> 63) != 0;
|
||||
}
|
||||
|
||||
} // namespace detail
|
||||
|
||||
template <typename T, HWY_IF_NOT_FLOAT(T)>
|
||||
HWY_API Vec1<T> Min(const Vec1<T> a, const Vec1<T> b) {
|
||||
@ -575,8 +603,8 @@ HWY_API Vec1<T> Min(const Vec1<T> a, const Vec1<T> b) {
|
||||
|
||||
template <typename T, HWY_IF_FLOAT(T)>
|
||||
HWY_API Vec1<T> Min(const Vec1<T> a, const Vec1<T> b) {
|
||||
if (std::isnan(a.raw)) return b;
|
||||
if (std::isnan(b.raw)) return a;
|
||||
if (isnan(a.raw)) return b;
|
||||
if (isnan(b.raw)) return a;
|
||||
return Vec1<T>(HWY_MIN(a.raw, b.raw));
|
||||
}
|
||||
|
||||
@ -587,8 +615,8 @@ HWY_API Vec1<T> Max(const Vec1<T> a, const Vec1<T> b) {
|
||||
|
||||
template <typename T, HWY_IF_FLOAT(T)>
|
||||
HWY_API Vec1<T> Max(const Vec1<T> a, const Vec1<T> b) {
|
||||
if (std::isnan(a.raw)) return b;
|
||||
if (std::isnan(b.raw)) return a;
|
||||
if (isnan(a.raw)) return b;
|
||||
if (isnan(b.raw)) return a;
|
||||
return Vec1<T>(HWY_MAX(a.raw, b.raw));
|
||||
}
|
||||
|
||||
@ -707,10 +735,10 @@ HWY_API Vec1<float> ApproximateReciprocalSqrt(const Vec1<float> v) {
|
||||
|
||||
// Square root
|
||||
HWY_API Vec1<float> Sqrt(const Vec1<float> v) {
|
||||
return Vec1<float>(std::sqrt(v.raw));
|
||||
return Vec1<float>(sqrtf(v.raw));
|
||||
}
|
||||
HWY_API Vec1<double> Sqrt(const Vec1<double> v) {
|
||||
return Vec1<double>(std::sqrt(v.raw));
|
||||
return Vec1<double>(sqrt(v.raw));
|
||||
}
|
||||
|
||||
// ------------------------------ Floating-point rounding
|
||||
@ -725,7 +753,7 @@ HWY_API Vec1<T> Round(const Vec1<T> v) {
|
||||
const TI rounded = static_cast<TI>(v.raw + bias);
|
||||
if (rounded == 0) return CopySignToAbs(Vec1<T>(0), v);
|
||||
// Round to even
|
||||
if ((rounded & 1) && std::abs(static_cast<T>(rounded) - v.raw) == T(0.5)) {
|
||||
if ((rounded & 1) && detail::Abs(static_cast<T>(rounded) - v.raw) == T(0.5)) {
|
||||
return Vec1<T>(static_cast<T>(rounded - (v.raw < T(0) ? -1 : 1)));
|
||||
}
|
||||
return Vec1<T>(static_cast<T>(rounded));
|
||||
@ -737,12 +765,12 @@ HWY_API Vec1<int32_t> NearestInt(const Vec1<float> v) {
|
||||
using TI = int32_t;
|
||||
|
||||
const T abs = Abs(v).raw;
|
||||
const bool signbit = std::signbit(v.raw);
|
||||
const bool is_sign = detail::SignBit(v.raw);
|
||||
|
||||
if (!(abs < MantissaEnd<T>())) { // Huge or NaN
|
||||
// Check if too large to cast or NaN
|
||||
if (!(abs <= static_cast<T>(LimitsMax<TI>()))) {
|
||||
return Vec1<TI>(signbit ? LimitsMin<TI>() : LimitsMax<TI>());
|
||||
return Vec1<TI>(is_sign ? LimitsMin<TI>() : LimitsMax<TI>());
|
||||
}
|
||||
return Vec1<int32_t>(static_cast<TI>(v.raw));
|
||||
}
|
||||
@ -750,8 +778,8 @@ HWY_API Vec1<int32_t> NearestInt(const Vec1<float> v) {
|
||||
const TI rounded = static_cast<TI>(v.raw + bias);
|
||||
if (rounded == 0) return Vec1<int32_t>(0);
|
||||
// Round to even
|
||||
if ((rounded & 1) && std::abs(static_cast<T>(rounded) - v.raw) == T(0.5)) {
|
||||
return Vec1<TI>(rounded - (signbit ? -1 : 1));
|
||||
if ((rounded & 1) && detail::Abs(static_cast<T>(rounded) - v.raw) == T(0.5)) {
|
||||
return Vec1<TI>(rounded - (is_sign ? -1 : 1));
|
||||
}
|
||||
return Vec1<TI>(rounded);
|
||||
}
|
||||
@ -1090,19 +1118,19 @@ HWY_API Vec1<ToT> PromoteTo(Sisd<ToT> /* tag */, Vec1<FromT> from) {
|
||||
// so we overload for FromT=double and ToT={float,int32_t}.
|
||||
HWY_API Vec1<float> DemoteTo(Sisd<float> /* tag */, Vec1<double> from) {
|
||||
// Prevent ubsan errors when converting float to narrower integer/float
|
||||
if (std::isinf(from.raw) ||
|
||||
std::fabs(from.raw) > static_cast<double>(HighestValue<float>())) {
|
||||
return Vec1<float>(std::signbit(from.raw) ? LowestValue<float>()
|
||||
: HighestValue<float>());
|
||||
if (isinf(from.raw) ||
|
||||
fabs(from.raw) > static_cast<double>(HighestValue<float>())) {
|
||||
return Vec1<float>(detail::SignBit(from.raw) ? LowestValue<float>()
|
||||
: HighestValue<float>());
|
||||
}
|
||||
return Vec1<float>(static_cast<float>(from.raw));
|
||||
}
|
||||
HWY_API Vec1<int32_t> DemoteTo(Sisd<int32_t> /* tag */, Vec1<double> from) {
|
||||
// Prevent ubsan errors when converting int32_t to narrower integer/int32_t
|
||||
if (std::isinf(from.raw) ||
|
||||
std::fabs(from.raw) > static_cast<double>(HighestValue<int32_t>())) {
|
||||
return Vec1<int32_t>(std::signbit(from.raw) ? LowestValue<int32_t>()
|
||||
: HighestValue<int32_t>());
|
||||
if (isinf(from.raw) ||
|
||||
fabs(from.raw) > static_cast<double>(HighestValue<int32_t>())) {
|
||||
return Vec1<int32_t>(detail::SignBit(from.raw) ? LowestValue<int32_t>()
|
||||
: HighestValue<int32_t>());
|
||||
}
|
||||
return Vec1<int32_t>(static_cast<int32_t>(from.raw));
|
||||
}
|
||||
@ -1196,10 +1224,9 @@ HWY_API Vec1<ToT> ConvertTo(Sisd<ToT> /* tag */, Vec1<FromT> from) {
|
||||
// float## -> int##: return closest representable value. We cannot exactly
|
||||
// represent LimitsMax<ToT> in FromT, so use double.
|
||||
const double f = static_cast<double>(from.raw);
|
||||
if (std::isinf(from.raw) ||
|
||||
std::fabs(f) > static_cast<double>(LimitsMax<ToT>())) {
|
||||
return Vec1<ToT>(std::signbit(from.raw) ? LimitsMin<ToT>()
|
||||
: LimitsMax<ToT>());
|
||||
if (isinf(from.raw) || fabs(f) > static_cast<double>(LimitsMax<ToT>())) {
|
||||
return Vec1<ToT>(detail::SignBit(from.raw) ? LimitsMin<ToT>()
|
||||
: LimitsMax<ToT>());
|
||||
}
|
||||
return Vec1<ToT>(static_cast<ToT>(from.raw));
|
||||
}
|
||||
@ -1468,6 +1495,11 @@ HWY_API intptr_t FindFirstTrue(Sisd<T> /* tag */, const Mask1<T> mask) {
|
||||
return mask.bits == 0 ? -1 : 0;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
HWY_API size_t FindKnownFirstTrue(Sisd<T> /* tag */, const Mask1<T> /* m */) {
|
||||
return 0; // There is only one lane and we know it is true.
|
||||
}
|
||||
|
||||
// ------------------------------ Compress, CompressBits
|
||||
|
||||
template <typename T>
|
||||
@ -1530,6 +1562,14 @@ HWY_API Vec1<float> ReorderWidenMulAccumulate(Sisd<float> /* tag */,
|
||||
Vec1<float>(F32FromBF16(b.raw)), sum0);
|
||||
}
|
||||
|
||||
HWY_API Vec1<int32_t> ReorderWidenMulAccumulate(Sisd<int32_t> /* tag */,
|
||||
Vec1<int16_t> a,
|
||||
Vec1<int16_t> b,
|
||||
const Vec1<int32_t> sum0,
|
||||
Vec1<int32_t>& /* sum1 */) {
|
||||
return Vec1<int32_t>(a.raw * b.raw + sum0.raw);
|
||||
}
|
||||
|
||||
// ================================================== REDUCTIONS
|
||||
|
||||
// Sum of all lanes, i.e. the only one.
|
||||
|
2
third_party/highway/hwy/ops/set_macros-inl.h
vendored
@ -319,7 +319,7 @@
#define HWY_HAVE_FLOAT64 0
#define HWY_MEM_OPS_MIGHT_FAULT 1
#define HWY_NATIVE_FMA 0
#define HWY_CAP_GE256 0
#define HWY_CAP_GE256 1
#define HWY_CAP_GE512 0

#define HWY_NAMESPACE N_WASM_EMU256
17
third_party/highway/hwy/ops/shared-inl.h
vendored
@ -15,7 +15,17 @@

// Per-target definitions shared by ops/*.h and user code.

#include <cmath>
// We are covered by the highway.h include guard, but generic_ops-inl.h
// includes this again #if HWY_IDE.
#if defined(HIGHWAY_HWY_OPS_SHARED_TOGGLE) == \
    defined(HWY_TARGET_TOGGLE)
#ifdef HIGHWAY_HWY_OPS_SHARED_TOGGLE
#undef HIGHWAY_HWY_OPS_SHARED_TOGGLE
#else
#define HIGHWAY_HWY_OPS_SHARED_TOGGLE
#endif

#include <math.h>

#include "hwy/base.h"

@ -218,6 +228,9 @@ using Half = typename D::Half;
template <class D>
using Twice = typename D::Twice;

template <typename T>
using Full16 = Simd<T, 2 / sizeof(T), 0>;

template <typename T>
using Full32 = Simd<T, 4 / sizeof(T), 0>;

@ -309,3 +322,5 @@ using VecArg = V;
} // namespace HWY_NAMESPACE
} // namespace hwy
HWY_AFTER_NAMESPACE();

#endif // HIGHWAY_HWY_OPS_SHARED_TOGGLE
181
third_party/highway/hwy/ops/wasm_128-inl.h
vendored
@ -49,6 +49,11 @@ HWY_BEFORE_NAMESPACE();
|
||||
namespace hwy {
|
||||
namespace HWY_NAMESPACE {
|
||||
|
||||
#if HWY_TARGET == HWY_WASM_EMU256
|
||||
template <typename T>
|
||||
using Full256 = Simd<T, 32 / sizeof(T), 0>;
|
||||
#endif
|
||||
|
||||
namespace detail {
|
||||
|
||||
template <typename T>
|
||||
@ -67,6 +72,9 @@ class Vec128 {
|
||||
using Raw = typename detail::Raw128<T>::type;
|
||||
|
||||
public:
|
||||
using PrivateT = T; // only for DFromV
|
||||
static constexpr size_t kPrivateN = N; // only for DFromV
|
||||
|
||||
// Compound assignment. Only usable if there is a corresponding non-member
|
||||
// binary operator overload. For example, only f32 and f64 support division.
|
||||
HWY_INLINE Vec128& operator*=(const Vec128 other) {
|
||||
@ -100,29 +108,20 @@ using Vec64 = Vec128<T, 8 / sizeof(T)>;
|
||||
template <typename T>
|
||||
using Vec32 = Vec128<T, 4 / sizeof(T)>;
|
||||
|
||||
template <typename T>
|
||||
using Vec16 = Vec128<T, 2 / sizeof(T)>;
|
||||
|
||||
// FF..FF or 0.
|
||||
template <typename T, size_t N = 16 / sizeof(T)>
|
||||
struct Mask128 {
|
||||
typename detail::Raw128<T>::type raw;
|
||||
};
|
||||
|
||||
namespace detail {
|
||||
|
||||
// Deduce Simd<T, N, 0> from Vec128<T, N>
|
||||
struct DeduceD {
|
||||
template <typename T, size_t N>
|
||||
Simd<T, N, 0> operator()(Vec128<T, N>) const {
|
||||
return Simd<T, N, 0>();
|
||||
}
|
||||
};
|
||||
|
||||
} // namespace detail
|
||||
template <class V>
|
||||
using DFromV = Simd<typename V::PrivateT, V::kPrivateN, 0>;
|
||||
|
||||
template <class V>
|
||||
using DFromV = decltype(detail::DeduceD()(V()));
|
||||
|
||||
template <class V>
|
||||
using TFromV = TFromD<DFromV<V>>;
|
||||
using TFromV = typename V::PrivateT;
|
||||
|
||||
// ------------------------------ BitCast
|
||||
|
||||
@ -237,7 +236,7 @@ HWY_API Vec128<T, N> Undefined(Simd<T, N, 0> d) {
|
||||
HWY_DIAGNOSTICS(pop)
|
||||
|
||||
// Returns a vector with lane i=[0, N) set to "first" + i.
|
||||
template <typename T, size_t N, typename T2>
|
||||
template <typename T, size_t N, typename T2, HWY_IF_LE128(T, N)>
|
||||
Vec128<T, N> Iota(const Simd<T, N, 0> d, const T2 first) {
|
||||
HWY_ALIGN T lanes[16 / sizeof(T)];
|
||||
for (size_t i = 0; i < 16 / sizeof(T); ++i) {
|
||||
@ -1219,7 +1218,7 @@ HWY_API Mask128<float, N> operator>=(const Vec128<float, N> a,
|
||||
|
||||
// ------------------------------ FirstN (Iota, Lt)
|
||||
|
||||
template <typename T, size_t N>
|
||||
template <typename T, size_t N, HWY_IF_LE128(T, N)>
|
||||
HWY_API Mask128<T, N> FirstN(const Simd<T, N, 0> d, size_t num) {
|
||||
const RebindToSigned<decltype(d)> di; // Signed comparisons may be cheaper.
|
||||
return RebindMask(d, Iota(di, 0) < Set(di, static_cast<MakeSigned<T>>(num)));
|
||||
@ -1412,6 +1411,12 @@ HWY_API Mask128<T, N> Xor(const Mask128<T, N> a, Mask128<T, N> b) {
|
||||
return MaskFromVec(Xor(VecFromMask(d, a), VecFromMask(d, b)));
|
||||
}
|
||||
|
||||
template <typename T, size_t N>
|
||||
HWY_API Mask128<T, N> ExclusiveNeither(const Mask128<T, N> a, Mask128<T, N> b) {
|
||||
const Simd<T, N, 0> d;
|
||||
return MaskFromVec(AndNot(VecFromMask(d, a), Not(VecFromMask(d, b))));
|
||||
}
|
||||
|
||||
// ------------------------------ Shl (BroadcastSignBit, IfThenElse)
|
||||
|
||||
// The x86 multiply-by-Pow2() trick will not work because WASM saturates
|
||||
@ -1568,7 +1573,7 @@ HWY_API Vec128<T, N> Load(Simd<T, N, 0> /* tag */, const T* HWY_RESTRICT p) {
|
||||
}
|
||||
|
||||
// LoadU == Load.
|
||||
template <typename T, size_t N>
|
||||
template <typename T, size_t N, HWY_IF_LE128(T, N)>
|
||||
HWY_API Vec128<T, N> LoadU(Simd<T, N, 0> d, const T* HWY_RESTRICT p) {
|
||||
return Load(d, p);
|
||||
}
|
||||
@ -2516,7 +2521,7 @@ HWY_API Vec128<T> Shuffle0123(const Vec128<T> v) {
|
||||
// ------------------------------ TableLookupLanes
|
||||
|
||||
// Returned by SetTableIndices for use by TableLookupLanes.
|
||||
template <typename T, size_t N>
|
||||
template <typename T, size_t N = 16 / sizeof(T)>
|
||||
struct Indices128 {
|
||||
__v128_u raw;
|
||||
};
|
||||
@ -2822,7 +2827,7 @@ HWY_API VFromD<DW> ZipUpper(DW dw, V a, V b) {
|
||||
// ------------------------------ Combine (InterleaveLower)
|
||||
|
||||
// N = N/2 + N/2 (upper half undefined)
|
||||
template <typename T, size_t N>
|
||||
template <typename T, size_t N, HWY_IF_LE128(T, N)>
|
||||
HWY_API Vec128<T, N> Combine(Simd<T, N, 0> d, Vec128<T, N / 2> hi_half,
|
||||
Vec128<T, N / 2> lo_half) {
|
||||
const Half<decltype(d)> d2;
|
||||
@ -2836,7 +2841,7 @@ HWY_API Vec128<T, N> Combine(Simd<T, N, 0> d, Vec128<T, N / 2> hi_half,
|
||||
|
||||
// ------------------------------ ZeroExtendVector (Combine, IfThenElseZero)
|
||||
|
||||
template <typename T, size_t N>
|
||||
template <typename T, size_t N, HWY_IF_LE128(T, N)>
|
||||
HWY_API Vec128<T, N> ZeroExtendVector(Simd<T, N, 0> d, Vec128<T, N / 2> lo) {
|
||||
return IfThenElseZero(FirstN(d, N / 2), Vec128<T, N>{lo.raw});
|
||||
}
|
||||
@ -3095,75 +3100,75 @@ HWY_API Vec128<T> ReverseBlocks(Full128<T> /* tag */, const Vec128<T> v) {
|
||||
// ------------------------------ Promotions (part w/ narrow lanes -> full)
|
||||
|
||||
// Unsigned: zero-extend.
|
||||
template <size_t N>
|
||||
template <size_t N, HWY_IF_LE128(uint16_t, N)>
|
||||
HWY_API Vec128<uint16_t, N> PromoteTo(Simd<uint16_t, N, 0> /* tag */,
|
||||
const Vec128<uint8_t, N> v) {
|
||||
return Vec128<uint16_t, N>{wasm_u16x8_extend_low_u8x16(v.raw)};
|
||||
}
|
||||
template <size_t N>
|
||||
template <size_t N, HWY_IF_LE128(uint32_t, N)>
|
||||
HWY_API Vec128<uint32_t, N> PromoteTo(Simd<uint32_t, N, 0> /* tag */,
|
||||
const Vec128<uint8_t, N> v) {
|
||||
return Vec128<uint32_t, N>{
|
||||
wasm_u32x4_extend_low_u16x8(wasm_u16x8_extend_low_u8x16(v.raw))};
|
||||
}
|
||||
template <size_t N>
|
||||
template <size_t N, HWY_IF_LE128(int16_t, N)>
|
||||
HWY_API Vec128<int16_t, N> PromoteTo(Simd<int16_t, N, 0> /* tag */,
|
||||
const Vec128<uint8_t, N> v) {
|
||||
return Vec128<int16_t, N>{wasm_u16x8_extend_low_u8x16(v.raw)};
|
||||
}
|
||||
template <size_t N>
|
||||
template <size_t N, HWY_IF_LE128(int32_t, N)>
|
||||
HWY_API Vec128<int32_t, N> PromoteTo(Simd<int32_t, N, 0> /* tag */,
|
||||
const Vec128<uint8_t, N> v) {
|
||||
return Vec128<int32_t, N>{
|
||||
wasm_u32x4_extend_low_u16x8(wasm_u16x8_extend_low_u8x16(v.raw))};
|
||||
}
|
||||
template <size_t N>
|
||||
template <size_t N, HWY_IF_LE128(uint32_t, N)>
|
||||
HWY_API Vec128<uint32_t, N> PromoteTo(Simd<uint32_t, N, 0> /* tag */,
|
||||
const Vec128<uint16_t, N> v) {
|
||||
return Vec128<uint32_t, N>{wasm_u32x4_extend_low_u16x8(v.raw)};
|
||||
}
|
||||
template <size_t N>
|
||||
template <size_t N, HWY_IF_LE128(uint64_t, N)>
|
||||
HWY_API Vec128<uint64_t, N> PromoteTo(Simd<uint64_t, N, 0> /* tag */,
|
||||
const Vec128<uint32_t, N> v) {
|
||||
return Vec128<uint64_t, N>{wasm_u64x2_extend_low_u32x4(v.raw)};
|
||||
}
|
||||
|
||||
template <size_t N>
|
||||
template <size_t N, HWY_IF_LE128(int32_t, N)>
|
||||
HWY_API Vec128<int32_t, N> PromoteTo(Simd<int32_t, N, 0> /* tag */,
|
||||
const Vec128<uint16_t, N> v) {
|
||||
return Vec128<int32_t, N>{wasm_u32x4_extend_low_u16x8(v.raw)};
|
||||
}
|
||||
|
||||
// Signed: replicate sign bit.
|
||||
template <size_t N>
|
||||
template <size_t N, HWY_IF_LE128(int16_t, N)>
|
||||
HWY_API Vec128<int16_t, N> PromoteTo(Simd<int16_t, N, 0> /* tag */,
|
||||
const Vec128<int8_t, N> v) {
|
||||
return Vec128<int16_t, N>{wasm_i16x8_extend_low_i8x16(v.raw)};
|
||||
}
|
||||
template <size_t N>
|
||||
template <size_t N, HWY_IF_LE128(int32_t, N)>
|
||||
HWY_API Vec128<int32_t, N> PromoteTo(Simd<int32_t, N, 0> /* tag */,
|
||||
const Vec128<int8_t, N> v) {
|
||||
return Vec128<int32_t, N>{
|
||||
wasm_i32x4_extend_low_i16x8(wasm_i16x8_extend_low_i8x16(v.raw))};
|
||||
}
|
||||
template <size_t N>
|
||||
template <size_t N, HWY_IF_LE128(int32_t, N)>
|
||||
HWY_API Vec128<int32_t, N> PromoteTo(Simd<int32_t, N, 0> /* tag */,
|
||||
const Vec128<int16_t, N> v) {
|
||||
return Vec128<int32_t, N>{wasm_i32x4_extend_low_i16x8(v.raw)};
|
||||
}
|
||||
template <size_t N>
|
||||
template <size_t N, HWY_IF_LE128(int64_t, N)>
|
||||
HWY_API Vec128<int64_t, N> PromoteTo(Simd<int64_t, N, 0> /* tag */,
|
||||
const Vec128<int32_t, N> v) {
|
||||
return Vec128<int64_t, N>{wasm_i64x2_extend_low_i32x4(v.raw)};
|
||||
}
|
||||
|
||||
template <size_t N>
|
||||
template <size_t N, HWY_IF_LE128(double, N)>
|
||||
HWY_API Vec128<double, N> PromoteTo(Simd<double, N, 0> /* tag */,
|
||||
const Vec128<int32_t, N> v) {
|
||||
return Vec128<double, N>{wasm_f64x2_convert_low_i32x4(v.raw)};
|
||||
}
|
||||
|
||||
template <size_t N>
|
||||
template <size_t N, HWY_IF_LE128(float, N)>
|
||||
HWY_API Vec128<float, N> PromoteTo(Simd<float, N, 0> df32,
|
||||
const Vec128<float16_t, N> v) {
|
||||
const RebindToSigned<decltype(df32)> di32;
|
||||
@ -3184,7 +3189,7 @@ HWY_API Vec128<float, N> PromoteTo(Simd<float, N, 0> df32,
|
||||
return BitCast(df32, ShiftLeft<31>(sign) | bits32);
|
||||
}
|
||||
|
||||
template <size_t N>
|
||||
template <size_t N, HWY_IF_LE128(float, N)>
|
||||
HWY_API Vec128<float, N> PromoteTo(Simd<float, N, 0> df32,
|
||||
const Vec128<bfloat16_t, N> v) {
|
||||
const Rebind<uint16_t, decltype(df32)> du16;
|
||||
@ -3285,7 +3290,33 @@ HWY_API Vec128<bfloat16_t, 2 * N> ReorderDemote2To(
|
||||
const RebindToUnsigned<decltype(dbf16)> du16;
|
||||
const Repartition<uint32_t, decltype(dbf16)> du32;
|
||||
const Vec128<uint32_t, N> b_in_even = ShiftRight<16>(BitCast(du32, b));
|
||||
return BitCast(dbf16, OddEven(BitCast(du16, a), BitCast(du16, b_in_even)));
|
||||
const auto u16 = OddEven(BitCast(du16, a), BitCast(du16, b_in_even));
|
||||
return BitCast(dbf16, u16);
|
||||
}
|
||||
|
||||
// Specializations for partial vectors because i16x8_narrow_i32x4 sets lanes
|
||||
// above 2*N.
|
||||
HWY_API Vec128<int16_t, 2> ReorderDemote2To(Simd<int16_t, 2, 0> dn,
|
||||
Vec128<int32_t, 1> a,
|
||||
Vec128<int32_t, 1> b) {
|
||||
const Half<decltype(dn)> dnh;
|
||||
// Pretend the result has twice as many lanes so we can InterleaveLower.
|
||||
const Vec128<int16_t, 2> an{DemoteTo(dnh, a).raw};
|
||||
const Vec128<int16_t, 2> bn{DemoteTo(dnh, b).raw};
|
||||
return InterleaveLower(an, bn);
|
||||
}
|
||||
HWY_API Vec128<int16_t, 4> ReorderDemote2To(Simd<int16_t, 4, 0> dn,
|
||||
Vec128<int32_t, 2> a,
|
||||
Vec128<int32_t, 2> b) {
|
||||
const Half<decltype(dn)> dnh;
|
||||
// Pretend the result has twice as many lanes so we can InterleaveLower.
|
||||
const Vec128<int16_t, 4> an{DemoteTo(dnh, a).raw};
|
||||
const Vec128<int16_t, 4> bn{DemoteTo(dnh, b).raw};
|
||||
return InterleaveLower(an, bn);
|
||||
}
|
||||
HWY_API Vec128<int16_t> ReorderDemote2To(Full128<int16_t> /*d16*/,
|
||||
Vec128<int32_t> a, Vec128<int32_t> b) {
|
||||
return Vec128<int16_t>{wasm_i16x8_narrow_i32x4(a.raw, b.raw)};
|
||||
}
|
||||
|
||||
// For already range-limited input [0, 255].
|
||||
@ -3308,8 +3339,8 @@ HWY_API Vec128<To, 1> TruncateTo(Simd<To, 1, 0> /* tag */,
|
||||
return Vec128<To, 1>{v1.raw};
|
||||
}
|
||||
|
||||
HWY_API Vec128<uint8_t, 2> TruncateTo(Simd<uint8_t, 2, 0> /* tag */,
|
||||
const Vec128<uint64_t> v) {
|
||||
HWY_API Vec16<uint8_t> TruncateTo(Full16<uint8_t> /* tag */,
|
||||
const Vec128<uint64_t> v) {
|
||||
const Full128<uint8_t> d;
|
||||
const auto v1 = BitCast(d, v);
|
||||
const auto v2 = ConcatEven(d, v1, v1);
|
||||
@ -3317,16 +3348,16 @@ HWY_API Vec128<uint8_t, 2> TruncateTo(Simd<uint8_t, 2, 0> /* tag */,
|
||||
return LowerHalf(LowerHalf(LowerHalf(ConcatEven(d, v4, v4))));
|
||||
}
|
||||
|
||||
HWY_API Vec128<uint16_t, 2> TruncateTo(Simd<uint16_t, 2, 0> /* tag */,
|
||||
const Vec128<uint64_t> v) {
|
||||
HWY_API Vec32<uint16_t> TruncateTo(Full32<uint16_t> /* tag */,
|
||||
const Vec128<uint64_t> v) {
|
||||
const Full128<uint16_t> d;
|
||||
const auto v1 = BitCast(d, v);
|
||||
const auto v2 = ConcatEven(d, v1, v1);
|
||||
return LowerHalf(LowerHalf(ConcatEven(d, v2, v2)));
|
||||
}
|
||||
|
||||
HWY_API Vec128<uint32_t, 2> TruncateTo(Simd<uint32_t, 2, 0> /* tag */,
|
||||
const Vec128<uint64_t> v) {
|
||||
HWY_API Vec64<uint32_t> TruncateTo(Full64<uint32_t> /* tag */,
|
||||
const Vec128<uint64_t> v) {
|
||||
const Full128<uint32_t> d;
|
||||
const auto v1 = BitCast(d, v);
|
||||
return LowerHalf(ConcatEven(d, v1, v1));
|
||||
@ -3683,6 +3714,13 @@ HWY_API bool AllTrue(const Simd<T, N, 0> /* d */, const Mask128<T, N> m) {
|
||||
return AllTrue(Full128<T>(), Mask128<T>{Or(mask, m).raw});
|
||||
}
|
||||
|
||||
template <typename T, size_t N>
|
||||
HWY_API size_t FindKnownFirstTrue(const Simd<T, N, 0> /* tag */,
|
||||
const Mask128<T, N> mask) {
|
||||
const uint64_t bits = detail::BitsFromMask(mask);
|
||||
return Num0BitsBelowLS1Bit_Nonzero64(bits);
|
||||
}
|
||||
|
||||
template <typename T, size_t N>
|
||||
HWY_API intptr_t FindFirstTrue(const Simd<T, N, 0> /* tag */,
|
||||
const Mask128<T, N> mask) {
|
||||
@ -4102,7 +4140,11 @@ HWY_INLINE Vec128<T, N> CompressNot(Vec128<T, N> v, const uint64_t mask_bits) {
|
||||
|
||||
template <typename T>
|
||||
struct CompressIsPartition {
|
||||
#if HWY_TARGET == HWY_WASM_EMU256
|
||||
enum { value = 0 };
|
||||
#else
|
||||
enum { value = 1 };
|
||||
#endif
|
||||
};
|
||||
|
||||
// Single lane: no-op
|
||||
@ -4265,6 +4307,16 @@ HWY_API Vec128<float, N> ReorderWidenMulAccumulate(Simd<float, N, 0> df32,
|
||||
return MulAdd(BitCast(df32, a0), BitCast(df32, b0), sum0);
|
||||
}
|
||||
|
||||
// Even if N=1, the input is always at least 2 lanes, hence i32x4_dot_i16x8 is
|
||||
// safe.
|
||||
template <size_t N>
|
||||
HWY_API Vec128<int32_t, N> ReorderWidenMulAccumulate(
|
||||
Simd<int32_t, N, 0> /*d32*/, Vec128<int16_t, 2 * N> a,
|
||||
Vec128<int16_t, 2 * N> b, const Vec128<int32_t, N> sum0,
|
||||
Vec128<int32_t, N>& /*sum1*/) {
|
||||
return sum0 + Vec128<int32_t, N>{wasm_i32x4_dot_i16x8(a.raw, b.raw)};
|
||||
}
|
||||
|
||||
// ------------------------------ Reductions
|
||||
|
||||
namespace detail {
|
||||
@ -4353,6 +4405,30 @@ HWY_INLINE Vec128<T> MaxOfLanes(hwy::SizeTag<8> /* tag */,
|
||||
return Max(v10, v01);
|
||||
}
|
||||
|
||||
template <size_t N, HWY_IF_GE32(uint16_t, N)>
|
||||
HWY_API Vec128<uint16_t, N> SumOfLanes(hwy::SizeTag<2> /* tag */,
|
||||
Vec128<uint16_t, N> v) {
|
||||
const Simd<uint16_t, N, 0> d;
|
||||
const RepartitionToWide<decltype(d)> d32;
|
||||
const auto even = And(BitCast(d32, v), Set(d32, 0xFFFF));
|
||||
const auto odd = ShiftRight<16>(BitCast(d32, v));
|
||||
const auto sum = SumOfLanes(hwy::SizeTag<4>(), even + odd);
|
||||
// Also broadcast into odd lanes.
|
||||
return OddEven(BitCast(d, ShiftLeft<16>(sum)), BitCast(d, sum));
|
||||
}
|
||||
template <size_t N, HWY_IF_GE32(int16_t, N)>
|
||||
HWY_API Vec128<int16_t, N> SumOfLanes(hwy::SizeTag<2> /* tag */,
|
||||
Vec128<int16_t, N> v) {
|
||||
const Simd<int16_t, N, 0> d;
|
||||
const RepartitionToWide<decltype(d)> d32;
|
||||
// Sign-extend
|
||||
const auto even = ShiftRight<16>(ShiftLeft<16>(BitCast(d32, v)));
|
||||
const auto odd = ShiftRight<16>(BitCast(d32, v));
|
||||
const auto sum = SumOfLanes(hwy::SizeTag<4>(), even + odd);
|
||||
// Also broadcast into odd lanes.
|
||||
return OddEven(BitCast(d, ShiftLeft<16>(sum)), BitCast(d, sum));
|
||||
}
|
||||
|
||||
template <size_t N, HWY_IF_GE32(uint16_t, N)>
|
||||
HWY_API Vec128<uint16_t, N> MinOfLanes(hwy::SizeTag<2> /* tag */,
|
||||
Vec128<uint16_t, N> v) {
|
||||
@ -4422,7 +4498,7 @@ HWY_API Vec128<T, N> MaxOfLanes(Simd<T, N, 0> /* tag */, const Vec128<T, N> v) {
|
||||
template <typename T, size_t N, HWY_IF_LE128(T, N)>
|
||||
HWY_INLINE Mask128<T, N> Lt128(Simd<T, N, 0> d, Vec128<T, N> a,
|
||||
Vec128<T, N> b) {
|
||||
static_assert(!IsSigned<T>() && sizeof(T) == 8, "Use u64");
|
||||
static_assert(!IsSigned<T>() && sizeof(T) == 8, "T must be u64");
|
||||
// Truth table of Eq and Lt for Hi and Lo u64.
|
||||
// (removed lines with (=H && cH) or (=L && cL) - cannot both be true)
|
||||
// =H =L cH cL | out = cH | (=H & cL)
|
||||
@ -4459,7 +4535,7 @@ HWY_INLINE Mask128<T, N> Lt128Upper(Simd<T, N, 0> d, Vec128<T, N> a,
|
||||
template <typename T, size_t N, HWY_IF_LE128(T, N)>
|
||||
HWY_INLINE Mask128<T, N> Eq128(Simd<T, N, 0> d, Vec128<T, N> a,
|
||||
Vec128<T, N> b) {
|
||||
static_assert(!IsSigned<T>() && sizeof(T) == 8, "Use u64");
|
||||
static_assert(!IsSigned<T>() && sizeof(T) == 8, "T must be u64");
|
||||
const Vec128<T, N> eqHL = VecFromMask(d, Eq(a, b));
|
||||
return MaskFromVec(And(Reverse2(d, eqHL), eqHL));
|
||||
}
|
||||
@ -4471,6 +4547,23 @@ HWY_INLINE Mask128<T, N> Eq128Upper(Simd<T, N, 0> d, Vec128<T, N> a,
|
||||
return MaskFromVec(InterleaveUpper(d, eqHL, eqHL));
|
||||
}
|
||||
|
||||
// ------------------------------ Ne128
|
||||
|
||||
template <typename T, size_t N, HWY_IF_LE128(T, N)>
|
||||
HWY_INLINE Mask128<T, N> Ne128(Simd<T, N, 0> d, Vec128<T, N> a,
|
||||
Vec128<T, N> b) {
|
||||
static_assert(!IsSigned<T>() && sizeof(T) == 8, "T must be u64");
|
||||
const Vec128<T, N> neHL = VecFromMask(d, Ne(a, b));
|
||||
return MaskFromVec(Or(Reverse2(d, neHL), neHL));
|
||||
}
|
||||
|
||||
template <typename T, size_t N, HWY_IF_LE128(T, N)>
|
||||
HWY_INLINE Mask128<T, N> Ne128Upper(Simd<T, N, 0> d, Vec128<T, N> a,
|
||||
Vec128<T, N> b) {
|
||||
const Vec128<T, N> neHL = VecFromMask(d, Ne(a, b));
|
||||
return MaskFromVec(InterleaveUpper(d, neHL, neHL));
|
||||
}
|
||||
|
||||
// ------------------------------ Min128, Max128 (Lt128)
|
||||
|
||||
// Without a native OddEven, it seems infeasible to go faster than Lt128.

3162 third_party/highway/hwy/ops/wasm_256-inl.h vendored
File diff suppressed because it is too large

238 third_party/highway/hwy/ops/x86_128-inl.h vendored
@ -21,7 +21,7 @@
|
||||
#include "hwy/base.h"
|
||||
|
||||
// Avoid uninitialized warnings in GCC's emmintrin.h - see
|
||||
// https://github.com/google/highway/issues/710 and pull/902)
|
||||
// https://github.com/google/highway/issues/710 and pull/902
|
||||
HWY_DIAGNOSTICS(push)
|
||||
#if HWY_COMPILER_GCC_ACTUAL
|
||||
HWY_DIAGNOSTICS_OFF(disable : 4701, ignored "-Wuninitialized")
|
||||
@ -49,17 +49,6 @@ HWY_DIAGNOSTICS_OFF(disable : 4703 6001 26494, ignored "-Wmaybe-uninitialized")
|
||||
HWY_BEFORE_NAMESPACE();
|
||||
namespace hwy {
|
||||
namespace HWY_NAMESPACE {
|
||||
|
||||
#if HWY_TARGET <= HWY_AVX2
|
||||
template <typename T>
|
||||
using Full256 = Simd<T, 32 / sizeof(T), 0>;
|
||||
#endif
|
||||
|
||||
#if HWY_TARGET <= HWY_AVX3
|
||||
template <typename T>
|
||||
using Full512 = Simd<T, 64 / sizeof(T), 0>;
|
||||
#endif
|
||||
|
||||
namespace detail {
|
||||
|
||||
template <typename T>
|
||||
@ -82,6 +71,9 @@ class Vec128 {
|
||||
using Raw = typename detail::Raw128<T>::type;
|
||||
|
||||
public:
|
||||
using PrivateT = T; // only for DFromV
|
||||
static constexpr size_t kPrivateN = N; // only for DFromV
|
||||
|
||||
// Compound assignment. Only usable if there is a corresponding non-member
|
||||
// binary operator overload. For example, only f32 and f64 support division.
|
||||
HWY_INLINE Vec128& operator*=(const Vec128 other) {
|
||||
@ -117,10 +109,6 @@ using Vec32 = Vec128<T, 4 / sizeof(T)>;
|
||||
|
||||
#if HWY_TARGET <= HWY_AVX3
|
||||
|
||||
// Forward-declare for use by DeduceD, see below.
|
||||
template <typename T>
|
||||
class Vec512;
|
||||
|
||||
namespace detail {
|
||||
|
||||
// Template arg: sizeof(lane type)
|
||||
@ -166,49 +154,11 @@ struct Mask128 {
|
||||
|
||||
#endif // HWY_TARGET <= HWY_AVX3
|
||||
|
||||
#if HWY_TARGET <= HWY_AVX2
|
||||
// Forward-declare for use by DeduceD, see below.
|
||||
template <typename T>
|
||||
class Vec256;
|
||||
#endif
|
||||
|
||||
namespace detail {
|
||||
|
||||
// Deduce Simd<T, N, 0> from Vec*<T, N> (pointers because Vec256/512 may be
|
||||
// incomplete types at this point; this is simpler than avoiding multiple
|
||||
// definitions of DFromV via #if)
|
||||
struct DeduceD {
|
||||
template <typename T, size_t N>
|
||||
Simd<T, N, 0> operator()(const Vec128<T, N>*) const {
|
||||
return Simd<T, N, 0>();
|
||||
}
|
||||
#if HWY_TARGET <= HWY_AVX2
|
||||
template <typename T>
|
||||
Full256<T> operator()(const hwy::HWY_NAMESPACE::Vec256<T>*) const {
|
||||
return Full256<T>();
|
||||
}
|
||||
#endif
|
||||
#if HWY_TARGET <= HWY_AVX3
|
||||
template <typename T>
|
||||
Full512<T> operator()(const hwy::HWY_NAMESPACE::Vec512<T>*) const {
|
||||
return Full512<T>();
|
||||
}
|
||||
#endif
|
||||
};
|
||||
|
||||
// Workaround for MSVC v19.14: alias with a dependent type fails to specialize.
|
||||
template <class V>
|
||||
struct ExpandDFromV {
|
||||
using type = decltype(DeduceD()(static_cast<V*>(nullptr)));
|
||||
};
|
||||
|
||||
} // namespace detail
|
||||
using DFromV = Simd<typename V::PrivateT, V::kPrivateN, 0>;
|
||||
|
||||
template <class V>
|
||||
using DFromV = typename detail::ExpandDFromV<V>::type;
|
||||
|
||||
template <class V>
|
||||
using TFromV = TFromD<DFromV<V>>;
|
||||
using TFromV = typename V::PrivateT;
|
||||
|
||||
// ------------------------------ BitCast
|
||||
|
||||
@ -983,6 +933,47 @@ HWY_INLINE Mask128<T, N> Xor(hwy::SizeTag<8> /*tag*/, const Mask128<T, N> a,
|
||||
#endif
|
||||
}
|
||||
|
||||
template <typename T, size_t N>
|
||||
HWY_INLINE Mask128<T, N> ExclusiveNeither(hwy::SizeTag<1> /*tag*/,
|
||||
const Mask128<T, N> a,
|
||||
const Mask128<T, N> b) {
|
||||
#if HWY_COMPILER_HAS_MASK_INTRINSICS
|
||||
return Mask128<T, N>{_kxnor_mask16(a.raw, b.raw)};
|
||||
#else
|
||||
return Mask128<T, N>{static_cast<__mmask16>(~(a.raw ^ b.raw) & 0xFFFF)};
|
||||
#endif
|
||||
}
|
||||
template <typename T, size_t N>
|
||||
HWY_INLINE Mask128<T, N> ExclusiveNeither(hwy::SizeTag<2> /*tag*/,
|
||||
const Mask128<T, N> a,
|
||||
const Mask128<T, N> b) {
|
||||
#if HWY_COMPILER_HAS_MASK_INTRINSICS
|
||||
return Mask128<T, N>{_kxnor_mask8(a.raw, b.raw)};
|
||||
#else
|
||||
return Mask128<T, N>{static_cast<__mmask8>(~(a.raw ^ b.raw) & 0xFF)};
|
||||
#endif
|
||||
}
|
||||
template <typename T, size_t N>
|
||||
HWY_INLINE Mask128<T, N> ExclusiveNeither(hwy::SizeTag<4> /*tag*/,
|
||||
const Mask128<T, N> a,
|
||||
const Mask128<T, N> b) {
|
||||
#if HWY_COMPILER_HAS_MASK_INTRINSICS
|
||||
return Mask128<T, N>{static_cast<__mmask8>(_kxnor_mask8(a.raw, b.raw) & 0xF)};
|
||||
#else
|
||||
return Mask128<T, N>{static_cast<__mmask8>(~(a.raw ^ b.raw) & 0xF)};
|
||||
#endif
|
||||
}
|
||||
template <typename T, size_t N>
|
||||
HWY_INLINE Mask128<T, N> ExclusiveNeither(hwy::SizeTag<8> /*tag*/,
|
||||
const Mask128<T, N> a,
|
||||
const Mask128<T, N> b) {
|
||||
#if HWY_COMPILER_HAS_MASK_INTRINSICS
|
||||
return Mask128<T, N>{static_cast<__mmask8>(_kxnor_mask8(a.raw, b.raw) & 0x3)};
|
||||
#else
|
||||
return Mask128<T, N>{static_cast<__mmask8>(~(a.raw ^ b.raw) & 0x3)};
|
||||
#endif
|
||||
}
|
||||
|
||||
} // namespace detail
|
||||
|
||||
template <typename T, size_t N>
|
||||
@ -1012,6 +1003,11 @@ HWY_API Mask128<T, N> Not(const Mask128<T, N> m) {
|
||||
return Xor(m, Mask128<T, N>::FromBits((1ull << N) - 1));
|
||||
}
|
||||
|
||||
template <typename T, size_t N>
|
||||
HWY_API Mask128<T, N> ExclusiveNeither(const Mask128<T, N> a, Mask128<T, N> b) {
|
||||
return detail::ExclusiveNeither(hwy::SizeTag<sizeof(T)>(), a, b);
|
||||
}
|
||||
|
||||
#else // AVX2 or below
|
||||
|
||||
// ------------------------------ Mask
|
||||
@ -1109,6 +1105,12 @@ HWY_API Mask128<T, N> Xor(const Mask128<T, N> a, Mask128<T, N> b) {
|
||||
return MaskFromVec(Xor(VecFromMask(d, a), VecFromMask(d, b)));
|
||||
}
|
||||
|
||||
template <typename T, size_t N>
|
||||
HWY_API Mask128<T, N> ExclusiveNeither(const Mask128<T, N> a, Mask128<T, N> b) {
|
||||
const Simd<T, N, 0> d;
|
||||
return MaskFromVec(AndNot(VecFromMask(d, a), Not(VecFromMask(d, b))));
|
||||
}
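Illustrative note (not part of the patch): ExclusiveNeither is only meaningful for masks whose lanes are never both true, which is why the AVX-512 branches above may use the XNOR mask intrinsics while this fallback builds a NOR from AndNot/Not. A minimal standalone C++ sketch of that equivalence on plain 4-bit masks (names and setup are ours, not Highway's):

#include <cassert>
#include <cstdint>

int main() {
  // For 4-bit masks a, b with no lane set in both (a & b == 0),
  // "neither a nor b" via NOR equals XNOR restricted to the 4 valid bits.
  for (uint32_t a = 0; a < 16; ++a) {
    for (uint32_t b = 0; b < 16; ++b) {
      if (a & b) continue;  // precondition: lanes are mutually exclusive
      const uint32_t nor_result = ~(a | b) & 0xFu;
      const uint32_t xnor_result = ~(a ^ b) & 0xFu;
      assert(nor_result == xnor_result);
    }
  }
  return 0;
}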
#endif // HWY_TARGET <= HWY_AVX3
|
||||
|
||||
// ------------------------------ ShiftLeft
|
||||
@ -5170,26 +5172,33 @@ HWY_INLINE Vec128<uint64_t> MulOdd(const Vec128<uint64_t> a,
|
||||
|
||||
// ------------------------------ ReorderWidenMulAccumulate (MulAdd, ZipLower)
|
||||
|
||||
template <size_t N>
|
||||
HWY_API Vec128<float, N> ReorderWidenMulAccumulate(Simd<float, N, 0> df32,
|
||||
Vec128<bfloat16_t, 2 * N> a,
|
||||
Vec128<bfloat16_t, 2 * N> b,
|
||||
const Vec128<float, N> sum0,
|
||||
Vec128<float, N>& sum1) {
|
||||
template <class V, size_t N, class D16 = Simd<bfloat16_t, 2 * N, 0>>
|
||||
HWY_API V ReorderWidenMulAccumulate(Simd<float, N, 0> df32, VFromD<D16> a,
|
||||
VFromD<D16> b, const V sum0, V& sum1) {
|
||||
// TODO(janwas): _mm_dpbf16_ps when available
|
||||
const Repartition<uint16_t, decltype(df32)> du16;
|
||||
const RebindToUnsigned<decltype(df32)> du32;
|
||||
const Vec128<uint16_t, 2 * N> zero = Zero(du16);
|
||||
const auto zero = Zero(du16);
|
||||
// Lane order within sum0/1 is undefined, hence we can avoid the
|
||||
// longer-latency lane-crossing PromoteTo.
|
||||
const Vec128<uint32_t, N> a0 = ZipLower(du32, zero, BitCast(du16, a));
|
||||
const Vec128<uint32_t, N> a1 = ZipUpper(du32, zero, BitCast(du16, a));
|
||||
const Vec128<uint32_t, N> b0 = ZipLower(du32, zero, BitCast(du16, b));
|
||||
const Vec128<uint32_t, N> b1 = ZipUpper(du32, zero, BitCast(du16, b));
|
||||
using VU32 = VFromD<RebindToUnsigned<decltype(df32)>>;
|
||||
const VU32 a0 = ZipLower(du32, zero, BitCast(du16, a));
|
||||
const VU32 a1 = ZipUpper(du32, zero, BitCast(du16, a));
|
||||
const VU32 b0 = ZipLower(du32, zero, BitCast(du16, b));
|
||||
const VU32 b1 = ZipUpper(du32, zero, BitCast(du16, b));
|
||||
sum1 = MulAdd(BitCast(df32, a1), BitCast(df32, b1), sum1);
|
||||
return MulAdd(BitCast(df32, a0), BitCast(df32, b0), sum0);
|
||||
}
|
||||
|
||||
// Even if N=1, the input is always at least 2 lanes, hence madd_epi16 is safe.
|
||||
template <size_t N>
|
||||
HWY_API Vec128<int32_t, N> ReorderWidenMulAccumulate(
|
||||
Simd<int32_t, N, 0> /*d32*/, Vec128<int16_t, 2 * N> a,
|
||||
Vec128<int16_t, 2 * N> b, const Vec128<int32_t, N> sum0,
|
||||
Vec128<int32_t, N>& /*sum1*/) {
|
||||
return sum0 + Vec128<int32_t, N>{_mm_madd_epi16(a.raw, b.raw)};
|
||||
}
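Illustrative note (not part of the patch): the int16 overload above relies on _mm_madd_epi16, which multiplies adjacent 16-bit lanes and adds each pair into one 32-bit lane. A hedged scalar model of that pairwise widen-multiply-accumulate (helper names are ours):

#include <cassert>
#include <cstdint>
#include <vector>

// Pairwise widening multiply-accumulate: sum0[i] += a[2i]*b[2i] + a[2i+1]*b[2i+1].
static std::vector<int32_t> WidenMulAccumulate(const std::vector<int16_t>& a,
                                               const std::vector<int16_t>& b,
                                               std::vector<int32_t> sum0) {
  for (size_t i = 0; i < sum0.size(); ++i) {
    sum0[i] += int32_t{a[2 * i]} * b[2 * i] + int32_t{a[2 * i + 1]} * b[2 * i + 1];
  }
  return sum0;
}

int main() {
  const std::vector<int32_t> out = WidenMulAccumulate({1, 2, 3, 4}, {5, 6, 7, 8}, {0, 0});
  assert(out[0] == 1 * 5 + 2 * 6);  // 17
  assert(out[1] == 3 * 7 + 4 * 8);  // 53
  return 0;
}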
// ================================================== CONVERT
|
||||
|
||||
// ------------------------------ Promotions (part w/ narrow lanes -> full)
|
||||
@ -5461,6 +5470,30 @@ HWY_API Vec128<bfloat16_t, 2 * N> ReorderDemote2To(
|
||||
return BitCast(dbf16, OddEven(BitCast(du16, a), BitCast(du16, b_in_even)));
|
||||
}
|
||||
|
||||
// Specializations for partial vectors because packs_epi32 sets lanes above 2*N.
|
||||
HWY_API Vec128<int16_t, 2> ReorderDemote2To(Simd<int16_t, 2, 0> dn,
|
||||
Vec128<int32_t, 1> a,
|
||||
Vec128<int32_t, 1> b) {
|
||||
const Half<decltype(dn)> dnh;
|
||||
// Pretend the result has twice as many lanes so we can InterleaveLower.
|
||||
const Vec128<int16_t, 2> an{DemoteTo(dnh, a).raw};
|
||||
const Vec128<int16_t, 2> bn{DemoteTo(dnh, b).raw};
|
||||
return InterleaveLower(an, bn);
|
||||
}
|
||||
HWY_API Vec128<int16_t, 4> ReorderDemote2To(Simd<int16_t, 4, 0> dn,
|
||||
Vec128<int32_t, 2> a,
|
||||
Vec128<int32_t, 2> b) {
|
||||
const Half<decltype(dn)> dnh;
|
||||
// Pretend the result has twice as many lanes so we can InterleaveLower.
|
||||
const Vec128<int16_t, 4> an{DemoteTo(dnh, a).raw};
|
||||
const Vec128<int16_t, 4> bn{DemoteTo(dnh, b).raw};
|
||||
return InterleaveLower(an, bn);
|
||||
}
|
||||
HWY_API Vec128<int16_t> ReorderDemote2To(Full128<int16_t> /*d16*/,
|
||||
Vec128<int32_t> a, Vec128<int32_t> b) {
|
||||
return Vec128<int16_t>{_mm_packs_epi32(a.raw, b.raw)};
|
||||
}
|
||||
|
||||
template <size_t N>
|
||||
HWY_API Vec128<float, N> DemoteTo(Simd<float, N, 0> /* tag */,
|
||||
const Vec128<double, N> v) {
|
||||
@ -6035,6 +6068,13 @@ HWY_API size_t CountTrue(const Simd<T, N, 0> /* tag */,
|
||||
return PopCount(mask_bits);
|
||||
}
|
||||
|
||||
template <typename T, size_t N>
|
||||
HWY_API size_t FindKnownFirstTrue(const Simd<T, N, 0> /* tag */,
|
||||
const Mask128<T, N> mask) {
|
||||
const uint32_t mask_bits = static_cast<uint32_t>(mask.raw) & ((1u << N) - 1);
|
||||
return Num0BitsBelowLS1Bit_Nonzero32(mask_bits);
|
||||
}
|
||||
|
||||
template <typename T, size_t N>
|
||||
HWY_API intptr_t FindFirstTrue(const Simd<T, N, 0> /* tag */,
|
||||
const Mask128<T, N> mask) {
|
||||
@ -6500,6 +6540,13 @@ HWY_API size_t CountTrue(const Simd<T, N, 0> /* tag */,
|
||||
return PopCount(detail::BitsFromMask(mask));
|
||||
}
|
||||
|
||||
template <typename T, size_t N>
|
||||
HWY_API size_t FindKnownFirstTrue(const Simd<T, N, 0> /* tag */,
|
||||
const Mask128<T, N> mask) {
|
||||
const uint64_t mask_bits = detail::BitsFromMask(mask);
|
||||
return Num0BitsBelowLS1Bit_Nonzero64(mask_bits);
|
||||
}
|
||||
|
||||
template <typename T, size_t N>
|
||||
HWY_API intptr_t FindFirstTrue(const Simd<T, N, 0> /* tag */,
|
||||
const Mask128<T, N> mask) {
|
||||
@ -7161,6 +7208,30 @@ HWY_INLINE Vec128<T> MaxOfLanes(hwy::SizeTag<8> /* tag */,
|
||||
return Max(v10, v01);
|
||||
}
|
||||
|
||||
template <size_t N, HWY_IF_GE32(uint16_t, N)>
|
||||
HWY_API Vec128<uint16_t, N> SumOfLanes(hwy::SizeTag<2> /* tag */,
|
||||
Vec128<uint16_t, N> v) {
|
||||
const Simd<uint16_t, N, 0> d;
|
||||
const RepartitionToWide<decltype(d)> d32;
|
||||
const auto even = And(BitCast(d32, v), Set(d32, 0xFFFF));
|
||||
const auto odd = ShiftRight<16>(BitCast(d32, v));
|
||||
const auto sum = SumOfLanes(hwy::SizeTag<4>(), even + odd);
|
||||
// Also broadcast into odd lanes.
|
||||
return OddEven(BitCast(d, ShiftLeft<16>(sum)), BitCast(d, sum));
|
||||
}
|
||||
template <size_t N, HWY_IF_GE32(int16_t, N)>
|
||||
HWY_API Vec128<int16_t, N> SumOfLanes(hwy::SizeTag<2> /* tag */,
|
||||
Vec128<int16_t, N> v) {
|
||||
const Simd<int16_t, N, 0> d;
|
||||
const RepartitionToWide<decltype(d)> d32;
|
||||
// Sign-extend
|
||||
const auto even = ShiftRight<16>(ShiftLeft<16>(BitCast(d32, v)));
|
||||
const auto odd = ShiftRight<16>(BitCast(d32, v));
|
||||
const auto sum = SumOfLanes(hwy::SizeTag<4>(), even + odd);
|
||||
// Also broadcast into odd lanes.
|
||||
return OddEven(BitCast(d, ShiftLeft<16>(sum)), BitCast(d, sum));
|
||||
}
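Illustrative note (not part of the patch): the new 16-bit SumOfLanes overloads widen the even and odd 16-bit lanes of each 32-bit word, reduce in 32 bits, and then broadcast the (wrapped) 16-bit sum back into every lane. A standalone scalar sketch of the same idea:

#include <cassert>
#include <cstdint>
#include <vector>

static uint16_t SumOfU16Lanes(const std::vector<uint16_t>& lanes) {
  uint32_t even = 0, odd = 0;
  for (size_t i = 0; i < lanes.size(); i += 2) {
    even += lanes[i];      // lanes selected by masking the low 16 bits
    odd += lanes[i + 1];   // lanes selected by a 16-bit right shift
  }
  // The vector code broadcasts this value into both the even and odd lanes.
  return static_cast<uint16_t>(even + odd);
}

int main() {
  assert(SumOfU16Lanes({1, 2, 3, 4, 5, 6, 7, 8}) == 36);
  return 0;
}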
template <size_t N, HWY_IF_GE32(uint16_t, N)>
|
||||
HWY_API Vec128<uint16_t, N> MinOfLanes(hwy::SizeTag<2> /* tag */,
|
||||
Vec128<uint16_t, N> v) {
|
||||
@ -7232,7 +7303,8 @@ namespace detail {
|
||||
// Returns vector-mask for Lt128. Also used by x86_256/x86_512.
|
||||
template <class D, class V = VFromD<D>>
|
||||
HWY_INLINE V Lt128Vec(const D d, const V a, const V b) {
|
||||
static_assert(!IsSigned<TFromD<D>>() && sizeof(TFromD<D>) == 8, "Use u64");
|
||||
static_assert(!IsSigned<TFromD<D>>() && sizeof(TFromD<D>) == 8,
|
||||
"D must be u64");
|
||||
// Truth table of Eq and Lt for Hi and Lo u64.
|
||||
// (removed lines with (=H && cH) or (=L && cL) - cannot both be true)
|
||||
// =H =L cH cL | out = cH | (=H & cL)
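Illustrative note (not part of the patch): the truth table above reduces a 128-bit unsigned comparison to three per-half results: eqH (high halves equal), ltH and ltL. A standalone scalar sketch of the same reduction:

#include <cassert>
#include <cstdint>

// 128-bit a < b from two u64 halves: the high half decides unless it is equal,
// in which case the low half decides, i.e. out = cH | (=H & cL).
static bool Lt128Scalar(uint64_t aH, uint64_t aL, uint64_t bH, uint64_t bL) {
  const bool eqH = aH == bH;
  const bool ltH = aH < bH;
  const bool ltL = aL < bL;
  return ltH || (eqH && ltL);
}

int main() {
  assert(Lt128Scalar(0, 5, 0, 7));      // equal highs: low half decides
  assert(Lt128Scalar(1, ~0ull, 2, 0));  // high half dominates the low half
  assert(!Lt128Scalar(3, 0, 2, ~0ull));
  return 0;
}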
@ -7256,12 +7328,22 @@ HWY_INLINE V Lt128Vec(const D d, const V a, const V b) {
|
||||
// Returns vector-mask for Eq128. Also used by x86_256/x86_512.
|
||||
template <class D, class V = VFromD<D>>
|
||||
HWY_INLINE V Eq128Vec(const D d, const V a, const V b) {
|
||||
static_assert(!IsSigned<TFromD<D>>() && sizeof(TFromD<D>) == 8, "Use u64");
|
||||
static_assert(!IsSigned<TFromD<D>>() && sizeof(TFromD<D>) == 8,
|
||||
"D must be u64");
|
||||
const auto eqHL = VecFromMask(d, Eq(a, b));
|
||||
const auto eqLH = Reverse2(d, eqHL);
|
||||
return And(eqHL, eqLH);
|
||||
}
|
||||
|
||||
template <class D, class V = VFromD<D>>
|
||||
HWY_INLINE V Ne128Vec(const D d, const V a, const V b) {
|
||||
static_assert(!IsSigned<TFromD<D>>() && sizeof(TFromD<D>) == 8,
|
||||
"D must be u64");
|
||||
const auto neHL = VecFromMask(d, Ne(a, b));
|
||||
const auto neLH = Reverse2(d, neHL);
|
||||
return Or(neHL, neLH);
|
||||
}
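Illustrative note (not part of the patch): Eq128Vec and Ne128Vec above combine each per-half comparison mask with its Reverse2 so that both 64-bit lanes of a 128-bit block carry the block-wide answer. The scalar logic being vectorized, as a standalone sketch:

#include <cassert>
#include <cstdint>

static bool Eq128Scalar(uint64_t aH, uint64_t aL, uint64_t bH, uint64_t bL) {
  return (aH == bH) && (aL == bL);  // equal only if both halves are equal
}

static bool Ne128Scalar(uint64_t aH, uint64_t aL, uint64_t bH, uint64_t bL) {
  return (aH != bH) || (aL != bL);  // unequal if either half differs
}

int main() {
  assert(Eq128Scalar(1, 2, 1, 2));
  assert(!Eq128Scalar(1, 2, 1, 3));
  assert(Ne128Scalar(1, 2, 0, 2));
  assert(!Ne128Scalar(0, 0, 0, 0));
  return 0;
}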
template <class D, class V = VFromD<D>>
|
||||
HWY_INLINE V Lt128UpperVec(const D d, const V a, const V b) {
|
||||
// No specialization required for AVX-512: Mask <-> Vec is fast, and
|
||||
@ -7278,6 +7360,14 @@ HWY_INLINE V Eq128UpperVec(const D d, const V a, const V b) {
|
||||
return InterleaveUpper(d, eqHL, eqHL);
|
||||
}
|
||||
|
||||
template <class D, class V = VFromD<D>>
|
||||
HWY_INLINE V Ne128UpperVec(const D d, const V a, const V b) {
|
||||
// No specialization required for AVX-512: Mask <-> Vec is fast, and
|
||||
// copying mask bits to their neighbor seems infeasible.
|
||||
const V neHL = VecFromMask(d, Ne(a, b));
|
||||
return InterleaveUpper(d, neHL, neHL);
|
||||
}
|
||||
|
||||
} // namespace detail
|
||||
|
||||
template <class D, class V = VFromD<D>>
|
||||
@ -7290,6 +7380,11 @@ HWY_API MFromD<D> Eq128(D d, const V a, const V b) {
|
||||
return MaskFromVec(detail::Eq128Vec(d, a, b));
|
||||
}
|
||||
|
||||
template <class D, class V = VFromD<D>>
|
||||
HWY_API MFromD<D> Ne128(D d, const V a, const V b) {
|
||||
return MaskFromVec(detail::Ne128Vec(d, a, b));
|
||||
}
|
||||
|
||||
template <class D, class V = VFromD<D>>
|
||||
HWY_API MFromD<D> Lt128Upper(D d, const V a, const V b) {
|
||||
return MaskFromVec(detail::Lt128UpperVec(d, a, b));
|
||||
@ -7300,6 +7395,11 @@ HWY_API MFromD<D> Eq128Upper(D d, const V a, const V b) {
|
||||
return MaskFromVec(detail::Eq128UpperVec(d, a, b));
|
||||
}
|
||||
|
||||
template <class D, class V = VFromD<D>>
|
||||
HWY_API MFromD<D> Ne128Upper(D d, const V a, const V b) {
|
||||
return MaskFromVec(detail::Ne128UpperVec(d, a, b));
|
||||
}
|
||||
|
||||
// ------------------------------ Min128, Max128 (Lt128)
|
||||
|
||||
// Avoids the extra MaskFromVec in Lt128.

392 third_party/highway/hwy/ops/x86_256-inl.h vendored
@ -83,6 +83,9 @@ class Vec256 {
|
||||
using Raw = typename detail::Raw256<T>::type;
|
||||
|
||||
public:
|
||||
using PrivateT = T; // only for DFromV
|
||||
static constexpr size_t kPrivateN = 32 / sizeof(T); // only for DFromV
|
||||
|
||||
// Compound assignment. Only usable if there is a corresponding non-member
|
||||
// binary operator overload. For example, only f32 and f64 support division.
|
||||
HWY_INLINE Vec256& operator*=(const Vec256 other) {
|
||||
@ -157,6 +160,9 @@ struct Mask256 {
|
||||
|
||||
#endif // HWY_TARGET <= HWY_AVX3
|
||||
|
||||
template <typename T>
|
||||
using Full256 = Simd<T, 32 / sizeof(T), 0>;
|
||||
|
||||
// ------------------------------ BitCast
|
||||
|
||||
namespace detail {
|
||||
@ -764,6 +770,43 @@ HWY_INLINE Mask256<T> Xor(hwy::SizeTag<8> /*tag*/, const Mask256<T> a,
|
||||
#endif
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
HWY_INLINE Mask256<T> ExclusiveNeither(hwy::SizeTag<1> /*tag*/,
|
||||
const Mask256<T> a, const Mask256<T> b) {
|
||||
#if HWY_COMPILER_HAS_MASK_INTRINSICS
|
||||
return Mask256<T>{_kxnor_mask32(a.raw, b.raw)};
|
||||
#else
|
||||
return Mask256<T>{static_cast<__mmask32>(~(a.raw ^ b.raw) & 0xFFFFFFFF)};
|
||||
#endif
|
||||
}
|
||||
template <typename T>
|
||||
HWY_INLINE Mask256<T> ExclusiveNeither(hwy::SizeTag<2> /*tag*/,
|
||||
const Mask256<T> a, const Mask256<T> b) {
|
||||
#if HWY_COMPILER_HAS_MASK_INTRINSICS
|
||||
return Mask256<T>{_kxnor_mask16(a.raw, b.raw)};
|
||||
#else
|
||||
return Mask256<T>{static_cast<__mmask16>(~(a.raw ^ b.raw) & 0xFFFF)};
|
||||
#endif
|
||||
}
|
||||
template <typename T>
|
||||
HWY_INLINE Mask256<T> ExclusiveNeither(hwy::SizeTag<4> /*tag*/,
|
||||
const Mask256<T> a, const Mask256<T> b) {
|
||||
#if HWY_COMPILER_HAS_MASK_INTRINSICS
|
||||
return Mask256<T>{_kxnor_mask8(a.raw, b.raw)};
|
||||
#else
|
||||
return Mask256<T>{static_cast<__mmask8>(~(a.raw ^ b.raw) & 0xFF)};
|
||||
#endif
|
||||
}
|
||||
template <typename T>
|
||||
HWY_INLINE Mask256<T> ExclusiveNeither(hwy::SizeTag<8> /*tag*/,
|
||||
const Mask256<T> a, const Mask256<T> b) {
|
||||
#if HWY_COMPILER_HAS_MASK_INTRINSICS
|
||||
return Mask256<T>{static_cast<__mmask8>(_kxnor_mask8(a.raw, b.raw) & 0xF)};
|
||||
#else
|
||||
return Mask256<T>{static_cast<__mmask8>(~(a.raw ^ b.raw) & 0xF)};
|
||||
#endif
|
||||
}
|
||||
|
||||
} // namespace detail
|
||||
|
||||
template <typename T>
|
||||
@ -793,6 +836,11 @@ HWY_API Mask256<T> Not(const Mask256<T> m) {
|
||||
return Xor(m, Mask256<T>::FromBits((1ull << N) - 1));
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
HWY_API Mask256<T> ExclusiveNeither(const Mask256<T> a, Mask256<T> b) {
|
||||
return detail::ExclusiveNeither(hwy::SizeTag<sizeof(T)>(), a, b);
|
||||
}
|
||||
|
||||
#else // AVX2
|
||||
|
||||
// ------------------------------ Mask
|
||||
@ -883,6 +931,12 @@ HWY_API Mask256<T> Xor(const Mask256<T> a, Mask256<T> b) {
|
||||
return MaskFromVec(Xor(VecFromMask(d, a), VecFromMask(d, b)));
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
HWY_API Mask256<T> ExclusiveNeither(const Mask256<T> a, Mask256<T> b) {
|
||||
const Full256<T> d;
|
||||
return MaskFromVec(AndNot(VecFromMask(d, a), Not(VecFromMask(d, b))));
|
||||
}
|
||||
|
||||
#endif // HWY_TARGET <= HWY_AVX3
|
||||
|
||||
// ================================================== COMPARE
|
||||
@ -2866,6 +2920,7 @@ HWY_API Vec256<float> Shuffle2301(const Vec256<float> v) {
|
||||
return Vec256<float>{_mm256_shuffle_ps(v.raw, v.raw, 0xB1)};
|
||||
}
|
||||
|
||||
// Used by generic_ops-inl.h
|
||||
namespace detail {
|
||||
|
||||
template <typename T, HWY_IF_LANE_SIZE(T, 4)>
|
||||
@ -3694,7 +3749,7 @@ HWY_API Vec256<TI> TableLookupBytes(const Vec128<T, N> bytes,
|
||||
|
||||
namespace detail {
|
||||
|
||||
#if HWY_TARGET > HWY_AVX3 // AVX2 or older
|
||||
#if HWY_TARGET > HWY_AVX3 && !HWY_IDE // AVX2 or older
|
||||
|
||||
// Returns 2^v for use as per-lane multipliers to emulate 16-bit shifts.
|
||||
template <typename T>
|
||||
@ -3721,7 +3776,7 @@ HWY_INLINE Vec256<MakeUnsigned<T>> Pow2(const Vec256<T> v) {
|
||||
|
||||
HWY_INLINE Vec256<uint16_t> Shl(hwy::UnsignedTag /*tag*/, Vec256<uint16_t> v,
|
||||
Vec256<uint16_t> bits) {
|
||||
#if HWY_TARGET <= HWY_AVX3
|
||||
#if HWY_TARGET <= HWY_AVX3 || HWY_IDE
|
||||
return Vec256<uint16_t>{_mm256_sllv_epi16(v.raw, bits.raw)};
|
||||
#else
|
||||
return v * Pow2(bits);
|
||||
@ -3757,7 +3812,7 @@ HWY_API Vec256<T> operator<<(Vec256<T> v, Vec256<T> bits) {
|
||||
// ------------------------------ Shr (MulHigh, IfThenElse, Not)
|
||||
|
||||
HWY_API Vec256<uint16_t> operator>>(Vec256<uint16_t> v, Vec256<uint16_t> bits) {
|
||||
#if HWY_TARGET <= HWY_AVX3
|
||||
#if HWY_TARGET <= HWY_AVX3 || HWY_IDE
|
||||
return Vec256<uint16_t>{_mm256_srlv_epi16(v.raw, bits.raw)};
|
||||
#else
|
||||
Full256<uint16_t> d;
|
||||
@ -3798,7 +3853,7 @@ HWY_API Vec256<int64_t> operator>>(Vec256<int64_t> v, Vec256<int64_t> bits) {
|
||||
|
||||
HWY_INLINE Vec256<uint64_t> MulEven(const Vec256<uint64_t> a,
|
||||
const Vec256<uint64_t> b) {
|
||||
const DFromV<decltype(a)> du64;
|
||||
const Full256<uint64_t> du64;
|
||||
const RepartitionToNarrow<decltype(du64)> du32;
|
||||
const auto maskL = Set(du64, 0xFFFFFFFFULL);
|
||||
const auto a32 = BitCast(du32, a);
|
||||
@ -3827,7 +3882,7 @@ HWY_INLINE Vec256<uint64_t> MulEven(const Vec256<uint64_t> a,
|
||||
|
||||
HWY_INLINE Vec256<uint64_t> MulOdd(const Vec256<uint64_t> a,
|
||||
const Vec256<uint64_t> b) {
|
||||
const DFromV<decltype(a)> du64;
|
||||
const Full256<uint64_t> du64;
|
||||
const RepartitionToNarrow<decltype(du64)> du32;
|
||||
const auto maskL = Set(du64, 0xFFFFFFFFULL);
|
||||
const auto a32 = BitCast(du32, a);
|
||||
@ -3852,25 +3907,13 @@ HWY_INLINE Vec256<uint64_t> MulOdd(const Vec256<uint64_t> a,
|
||||
return InterleaveUpper(du64, mulL, mulH);
|
||||
}
|
||||
|
||||
// ------------------------------ ReorderWidenMulAccumulate (MulAdd, ZipLower)
|
||||
|
||||
HWY_API Vec256<float> ReorderWidenMulAccumulate(Full256<float> df32,
|
||||
Vec256<bfloat16_t> a,
|
||||
Vec256<bfloat16_t> b,
|
||||
const Vec256<float> sum0,
|
||||
Vec256<float>& sum1) {
|
||||
// TODO(janwas): _mm256_dpbf16_ps when available
|
||||
const Repartition<uint16_t, decltype(df32)> du16;
|
||||
const RebindToUnsigned<decltype(df32)> du32;
|
||||
const Vec256<uint16_t> zero = Zero(du16);
|
||||
// Lane order within sum0/1 is undefined, hence we can avoid the
|
||||
// longer-latency lane-crossing PromoteTo.
|
||||
const Vec256<uint32_t> a0 = ZipLower(du32, zero, BitCast(du16, a));
|
||||
const Vec256<uint32_t> a1 = ZipUpper(du32, zero, BitCast(du16, a));
|
||||
const Vec256<uint32_t> b0 = ZipLower(du32, zero, BitCast(du16, b));
|
||||
const Vec256<uint32_t> b1 = ZipUpper(du32, zero, BitCast(du16, b));
|
||||
sum1 = MulAdd(BitCast(df32, a1), BitCast(df32, b1), sum1);
|
||||
return MulAdd(BitCast(df32, a0), BitCast(df32, b0), sum0);
|
||||
// ------------------------------ ReorderWidenMulAccumulate
|
||||
HWY_API Vec256<int32_t> ReorderWidenMulAccumulate(Full256<int32_t> /*d32*/,
|
||||
Vec256<int16_t> a,
|
||||
Vec256<int16_t> b,
|
||||
const Vec256<int32_t> sum0,
|
||||
Vec256<int32_t>& /*sum1*/) {
|
||||
return sum0 + Vec256<int32_t>{_mm256_madd_epi16(a.raw, b.raw)};
|
||||
}
|
||||
|
||||
// ================================================== CONVERT
|
||||
@ -4053,6 +4096,11 @@ HWY_API Vec256<bfloat16_t> ReorderDemote2To(Full256<bfloat16_t> dbf16,
|
||||
return BitCast(dbf16, OddEven(BitCast(du16, a), BitCast(du16, b_in_even)));
|
||||
}
|
||||
|
||||
HWY_API Vec256<int16_t> ReorderDemote2To(Full256<int16_t> /*d16*/,
|
||||
Vec256<int32_t> a, Vec256<int32_t> b) {
|
||||
return Vec256<int16_t>{_mm256_packs_epi32(a.raw, b.raw)};
|
||||
}
|
||||
|
||||
HWY_API Vec128<float> DemoteTo(Full128<float> /* tag */,
|
||||
const Vec256<double> v) {
|
||||
return Vec128<float>{_mm256_cvtpd_ps(v.raw)};
|
||||
@ -4218,7 +4266,7 @@ HWY_API Vec256<float> ConvertTo(HWY_MAYBE_UNUSED Full256<float> df,
|
||||
const RebindToSigned<decltype(df)> d32;
|
||||
|
||||
const auto msk_lo = Set(du32, 0xFFFF);
|
||||
const auto cnst2_16_flt = Set(df, 65536.0f); // 2^16
|
||||
const auto cnst2_16_flt = Set(df, 65536.0f); // 2^16
|
||||
|
||||
// Extract the 16 lowest/highest significant bits of v and cast to signed int
|
||||
const auto v_lo = BitCast(d32, And(v, msk_lo));
|
||||
@ -4238,9 +4286,9 @@ HWY_API Vec256<double> ConvertTo(HWY_MAYBE_UNUSED Full256<double> dd,
|
||||
using VU = VFromD<decltype(d64)>;
|
||||
|
||||
const VU msk_lo = Set(d64, 0xFFFFFFFFULL);
|
||||
const auto cnst2_32_dbl = Set(dd, 4294967296.0); // 2^32
|
||||
const auto cnst2_32_dbl = Set(dd, 4294967296.0); // 2^32
|
||||
|
||||
// Extract the 32 lowest significant bits of v
|
||||
// Extract the 32 lowest significant bits of v
|
||||
const VU v_lo = And(v, msk_lo);
|
||||
const VU v_hi = ShiftRight<32>(v);
|
||||
|
||||
@ -4458,9 +4506,15 @@ HWY_API size_t CountTrue(const Full256<T> /* tag */, const Mask256<T> mask) {
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
HWY_API intptr_t FindFirstTrue(const Full256<T> /* tag */,
|
||||
const Mask256<T> mask) {
|
||||
return mask.raw ? intptr_t(Num0BitsBelowLS1Bit_Nonzero32(mask.raw)) : -1;
|
||||
HWY_API size_t FindKnownFirstTrue(const Full256<T> /* tag */,
|
||||
const Mask256<T> mask) {
|
||||
return Num0BitsBelowLS1Bit_Nonzero32(mask.raw);
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
HWY_API intptr_t FindFirstTrue(const Full256<T> d, const Mask256<T> mask) {
|
||||
return mask.raw ? static_cast<intptr_t>(FindKnownFirstTrue(d, mask))
|
||||
: intptr_t{-1};
|
||||
}
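Illustrative note (not part of the patch): FindKnownFirstTrue assumes at least one lane is true, so it can return the index of the lowest set mask bit without the empty-mask check that FindFirstTrue keeps. A hedged scalar model on plain bitmasks; it uses the GCC/Clang builtin, whereas the patch goes through Highway's Num0BitsBelowLS1Bit_Nonzero* wrappers:

#include <cassert>
#include <cstdint>

static size_t FindKnownFirstTrueBits(uint64_t bits) {
  assert(bits != 0);  // caller guarantees a non-empty mask
  return static_cast<size_t>(__builtin_ctzll(bits));
}

static intptr_t FindFirstTrueBits(uint64_t bits) {
  return bits ? static_cast<intptr_t>(FindKnownFirstTrueBits(bits)) : intptr_t{-1};
}

int main() {
  assert(FindFirstTrueBits(0) == -1);
  assert(FindFirstTrueBits(0b1000) == 3);
  assert(FindKnownFirstTrueBits(0b10100) == 2);
  return 0;
}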
// Beware: the suffix indicates the number of mask bits, not lane size!
|
||||
@ -4903,6 +4957,13 @@ HWY_API size_t CountTrue(const Full256<T> /* tag */, const Mask256<T> mask) {
|
||||
return PopCount(detail::BitsFromMask(mask));
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
HWY_API size_t FindKnownFirstTrue(const Full256<T> /* tag */,
|
||||
const Mask256<T> mask) {
|
||||
const uint64_t mask_bits = detail::BitsFromMask(mask);
|
||||
return Num0BitsBelowLS1Bit_Nonzero64(mask_bits);
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
HWY_API intptr_t FindFirstTrue(const Full256<T> /* tag */,
|
||||
const Mask256<T> mask) {
|
||||
@ -4915,8 +4976,7 @@ HWY_API intptr_t FindFirstTrue(const Full256<T> /* tag */,
|
||||
namespace detail {
|
||||
|
||||
template <typename T, HWY_IF_LANE_SIZE(T, 4)>
|
||||
HWY_INLINE Indices256<uint32_t> IndicesFromBits(Full256<T> d,
|
||||
uint64_t mask_bits) {
|
||||
HWY_INLINE Vec256<uint32_t> IndicesFromBits(Full256<T> d, uint64_t mask_bits) {
|
||||
const RebindToUnsigned<decltype(d)> d32;
|
||||
// We need a masked Iota(). With 8 lanes, there are 256 combinations and a LUT
|
||||
// of SetTableIndices would require 8 KiB, a large part of L1D. The other
|
||||
@ -4925,49 +4985,49 @@ HWY_INLINE Indices256<uint32_t> IndicesFromBits(Full256<T> d,
|
||||
// bits, for a total of 1 KiB.
|
||||
alignas(16) constexpr uint32_t packed_array[256] = {
|
||||
// PrintCompress32x8Tables
|
||||
0x76543210, 0x76543210, 0x76543201, 0x76543210, 0x76543102, 0x76543120,
|
||||
0x76543021, 0x76543210, 0x76542103, 0x76542130, 0x76542031, 0x76542310,
|
||||
0x76541032, 0x76541320, 0x76540321, 0x76543210, 0x76532104, 0x76532140,
|
||||
0x76532041, 0x76532410, 0x76531042, 0x76531420, 0x76530421, 0x76534210,
|
||||
0x76521043, 0x76521430, 0x76520431, 0x76524310, 0x76510432, 0x76514320,
|
||||
0x76504321, 0x76543210, 0x76432105, 0x76432150, 0x76432051, 0x76432510,
|
||||
0x76431052, 0x76431520, 0x76430521, 0x76435210, 0x76421053, 0x76421530,
|
||||
0x76420531, 0x76425310, 0x76410532, 0x76415320, 0x76405321, 0x76453210,
|
||||
0x76321054, 0x76321540, 0x76320541, 0x76325410, 0x76310542, 0x76315420,
|
||||
0x76305421, 0x76354210, 0x76210543, 0x76215430, 0x76205431, 0x76254310,
|
||||
0x76105432, 0x76154320, 0x76054321, 0x76543210, 0x75432106, 0x75432160,
|
||||
0x75432061, 0x75432610, 0x75431062, 0x75431620, 0x75430621, 0x75436210,
|
||||
0x75421063, 0x75421630, 0x75420631, 0x75426310, 0x75410632, 0x75416320,
|
||||
0x75406321, 0x75463210, 0x75321064, 0x75321640, 0x75320641, 0x75326410,
|
||||
0x75310642, 0x75316420, 0x75306421, 0x75364210, 0x75210643, 0x75216430,
|
||||
0x75206431, 0x75264310, 0x75106432, 0x75164320, 0x75064321, 0x75643210,
|
||||
0x74321065, 0x74321650, 0x74320651, 0x74326510, 0x74310652, 0x74316520,
|
||||
0x74306521, 0x74365210, 0x74210653, 0x74216530, 0x74206531, 0x74265310,
|
||||
0x74106532, 0x74165320, 0x74065321, 0x74653210, 0x73210654, 0x73216540,
|
||||
0x73206541, 0x73265410, 0x73106542, 0x73165420, 0x73065421, 0x73654210,
|
||||
0x72106543, 0x72165430, 0x72065431, 0x72654310, 0x71065432, 0x71654320,
|
||||
0x70654321, 0x76543210, 0x65432107, 0x65432170, 0x65432071, 0x65432710,
|
||||
0x65431072, 0x65431720, 0x65430721, 0x65437210, 0x65421073, 0x65421730,
|
||||
0x65420731, 0x65427310, 0x65410732, 0x65417320, 0x65407321, 0x65473210,
|
||||
0x65321074, 0x65321740, 0x65320741, 0x65327410, 0x65310742, 0x65317420,
|
||||
0x65307421, 0x65374210, 0x65210743, 0x65217430, 0x65207431, 0x65274310,
|
||||
0x65107432, 0x65174320, 0x65074321, 0x65743210, 0x64321075, 0x64321750,
|
||||
0x64320751, 0x64327510, 0x64310752, 0x64317520, 0x64307521, 0x64375210,
|
||||
0x64210753, 0x64217530, 0x64207531, 0x64275310, 0x64107532, 0x64175320,
|
||||
0x64075321, 0x64753210, 0x63210754, 0x63217540, 0x63207541, 0x63275410,
|
||||
0x63107542, 0x63175420, 0x63075421, 0x63754210, 0x62107543, 0x62175430,
|
||||
0x62075431, 0x62754310, 0x61075432, 0x61754320, 0x60754321, 0x67543210,
|
||||
0x54321076, 0x54321760, 0x54320761, 0x54327610, 0x54310762, 0x54317620,
|
||||
0x54307621, 0x54376210, 0x54210763, 0x54217630, 0x54207631, 0x54276310,
|
||||
0x54107632, 0x54176320, 0x54076321, 0x54763210, 0x53210764, 0x53217640,
|
||||
0x53207641, 0x53276410, 0x53107642, 0x53176420, 0x53076421, 0x53764210,
|
||||
0x52107643, 0x52176430, 0x52076431, 0x52764310, 0x51076432, 0x51764320,
|
||||
0x50764321, 0x57643210, 0x43210765, 0x43217650, 0x43207651, 0x43276510,
|
||||
0x43107652, 0x43176520, 0x43076521, 0x43765210, 0x42107653, 0x42176530,
|
||||
0x42076531, 0x42765310, 0x41076532, 0x41765320, 0x40765321, 0x47653210,
|
||||
0x32107654, 0x32176540, 0x32076541, 0x32765410, 0x31076542, 0x31765420,
|
||||
0x30765421, 0x37654210, 0x21076543, 0x21765430, 0x20765431, 0x27654310,
|
||||
0x10765432, 0x17654320, 0x07654321, 0x76543210};
|
||||
0x76543210, 0x76543218, 0x76543209, 0x76543298, 0x7654310a, 0x765431a8,
|
||||
0x765430a9, 0x76543a98, 0x7654210b, 0x765421b8, 0x765420b9, 0x76542b98,
|
||||
0x765410ba, 0x76541ba8, 0x76540ba9, 0x7654ba98, 0x7653210c, 0x765321c8,
|
||||
0x765320c9, 0x76532c98, 0x765310ca, 0x76531ca8, 0x76530ca9, 0x7653ca98,
|
||||
0x765210cb, 0x76521cb8, 0x76520cb9, 0x7652cb98, 0x76510cba, 0x7651cba8,
|
||||
0x7650cba9, 0x765cba98, 0x7643210d, 0x764321d8, 0x764320d9, 0x76432d98,
|
||||
0x764310da, 0x76431da8, 0x76430da9, 0x7643da98, 0x764210db, 0x76421db8,
|
||||
0x76420db9, 0x7642db98, 0x76410dba, 0x7641dba8, 0x7640dba9, 0x764dba98,
|
||||
0x763210dc, 0x76321dc8, 0x76320dc9, 0x7632dc98, 0x76310dca, 0x7631dca8,
|
||||
0x7630dca9, 0x763dca98, 0x76210dcb, 0x7621dcb8, 0x7620dcb9, 0x762dcb98,
|
||||
0x7610dcba, 0x761dcba8, 0x760dcba9, 0x76dcba98, 0x7543210e, 0x754321e8,
|
||||
0x754320e9, 0x75432e98, 0x754310ea, 0x75431ea8, 0x75430ea9, 0x7543ea98,
|
||||
0x754210eb, 0x75421eb8, 0x75420eb9, 0x7542eb98, 0x75410eba, 0x7541eba8,
|
||||
0x7540eba9, 0x754eba98, 0x753210ec, 0x75321ec8, 0x75320ec9, 0x7532ec98,
|
||||
0x75310eca, 0x7531eca8, 0x7530eca9, 0x753eca98, 0x75210ecb, 0x7521ecb8,
|
||||
0x7520ecb9, 0x752ecb98, 0x7510ecba, 0x751ecba8, 0x750ecba9, 0x75ecba98,
|
||||
0x743210ed, 0x74321ed8, 0x74320ed9, 0x7432ed98, 0x74310eda, 0x7431eda8,
|
||||
0x7430eda9, 0x743eda98, 0x74210edb, 0x7421edb8, 0x7420edb9, 0x742edb98,
|
||||
0x7410edba, 0x741edba8, 0x740edba9, 0x74edba98, 0x73210edc, 0x7321edc8,
|
||||
0x7320edc9, 0x732edc98, 0x7310edca, 0x731edca8, 0x730edca9, 0x73edca98,
|
||||
0x7210edcb, 0x721edcb8, 0x720edcb9, 0x72edcb98, 0x710edcba, 0x71edcba8,
|
||||
0x70edcba9, 0x7edcba98, 0x6543210f, 0x654321f8, 0x654320f9, 0x65432f98,
|
||||
0x654310fa, 0x65431fa8, 0x65430fa9, 0x6543fa98, 0x654210fb, 0x65421fb8,
|
||||
0x65420fb9, 0x6542fb98, 0x65410fba, 0x6541fba8, 0x6540fba9, 0x654fba98,
|
||||
0x653210fc, 0x65321fc8, 0x65320fc9, 0x6532fc98, 0x65310fca, 0x6531fca8,
|
||||
0x6530fca9, 0x653fca98, 0x65210fcb, 0x6521fcb8, 0x6520fcb9, 0x652fcb98,
|
||||
0x6510fcba, 0x651fcba8, 0x650fcba9, 0x65fcba98, 0x643210fd, 0x64321fd8,
|
||||
0x64320fd9, 0x6432fd98, 0x64310fda, 0x6431fda8, 0x6430fda9, 0x643fda98,
|
||||
0x64210fdb, 0x6421fdb8, 0x6420fdb9, 0x642fdb98, 0x6410fdba, 0x641fdba8,
|
||||
0x640fdba9, 0x64fdba98, 0x63210fdc, 0x6321fdc8, 0x6320fdc9, 0x632fdc98,
|
||||
0x6310fdca, 0x631fdca8, 0x630fdca9, 0x63fdca98, 0x6210fdcb, 0x621fdcb8,
|
||||
0x620fdcb9, 0x62fdcb98, 0x610fdcba, 0x61fdcba8, 0x60fdcba9, 0x6fdcba98,
|
||||
0x543210fe, 0x54321fe8, 0x54320fe9, 0x5432fe98, 0x54310fea, 0x5431fea8,
|
||||
0x5430fea9, 0x543fea98, 0x54210feb, 0x5421feb8, 0x5420feb9, 0x542feb98,
|
||||
0x5410feba, 0x541feba8, 0x540feba9, 0x54feba98, 0x53210fec, 0x5321fec8,
|
||||
0x5320fec9, 0x532fec98, 0x5310feca, 0x531feca8, 0x530feca9, 0x53feca98,
|
||||
0x5210fecb, 0x521fecb8, 0x520fecb9, 0x52fecb98, 0x510fecba, 0x51fecba8,
|
||||
0x50fecba9, 0x5fecba98, 0x43210fed, 0x4321fed8, 0x4320fed9, 0x432fed98,
|
||||
0x4310feda, 0x431feda8, 0x430feda9, 0x43feda98, 0x4210fedb, 0x421fedb8,
|
||||
0x420fedb9, 0x42fedb98, 0x410fedba, 0x41fedba8, 0x40fedba9, 0x4fedba98,
|
||||
0x3210fedc, 0x321fedc8, 0x320fedc9, 0x32fedc98, 0x310fedca, 0x31fedca8,
|
||||
0x30fedca9, 0x3fedca98, 0x210fedcb, 0x21fedcb8, 0x20fedcb9, 0x2fedcb98,
|
||||
0x10fedcba, 0x1fedcba8, 0x0fedcba9, 0xfedcba98};
|
||||
|
||||
// No need to mask because _mm256_permutevar8x32_epi32 ignores bits 3..31.
|
||||
// Just shift each copy of the 32 bit LUT to extract its 4-bit fields.
|
||||
@ -4975,12 +5035,11 @@ HWY_INLINE Indices256<uint32_t> IndicesFromBits(Full256<T> d,
|
||||
// latency, it may be faster to use LoadDup128 and PSHUFB.
|
||||
const auto packed = Set(d32, packed_array[mask_bits]);
|
||||
alignas(32) constexpr uint32_t shifts[8] = {0, 4, 8, 12, 16, 20, 24, 28};
|
||||
return Indices256<uint32_t>{(packed >> Load(d32, shifts)).raw};
|
||||
return packed >> Load(d32, shifts);
|
||||
}
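Illustrative note (not part of the patch): each packed_array entry above stores eight 4-bit lane indices in a single uint32_t; broadcasting that value and shifting each lane's copy by 0, 4, ..., 28 exposes one nibble per lane, and the permute only reads bits 0..2, so the nibble MSB stays free (the FirstN encoding used further down). A standalone scalar sketch of the unpacking:

#include <array>
#include <cassert>
#include <cstdint>

// Recover eight per-lane indices from one packed 32-bit LUT entry.
static std::array<uint32_t, 8> UnpackNibbleIndices(uint32_t packed) {
  std::array<uint32_t, 8> idx{};
  for (uint32_t lane = 0; lane < 8; ++lane) {
    idx[lane] = (packed >> (4 * lane)) & 0x7u;  // permutevar ignores bits 3..31
  }
  return idx;
}

int main() {
  const std::array<uint32_t, 8> idx = UnpackNibbleIndices(0x76543210u);  // identity
  for (uint32_t lane = 0; lane < 8; ++lane) {
    assert(idx[lane] == lane);
  }
  return 0;
}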
template <typename T, HWY_IF_LANE_SIZE(T, 8)>
|
||||
HWY_INLINE Indices256<uint32_t> IndicesFromBits(Full256<T> d,
|
||||
uint64_t mask_bits) {
|
||||
HWY_INLINE Vec256<uint32_t> IndicesFromBits(Full256<T> d, uint64_t mask_bits) {
|
||||
const Repartition<uint32_t, decltype(d)> d32;
|
||||
|
||||
// For 64-bit, we still need 32-bit indices because there is no 64-bit
|
||||
@ -4988,18 +5047,20 @@ HWY_INLINE Indices256<uint32_t> IndicesFromBits(Full256<T> d,
|
||||
// unpacking and load the entire index vector directly.
|
||||
alignas(32) constexpr uint32_t u32_indices[128] = {
|
||||
// PrintCompress64x4PairTables
|
||||
0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7, 2, 3, 0, 1, 4, 5,
|
||||
6, 7, 0, 1, 2, 3, 4, 5, 6, 7, 4, 5, 0, 1, 2, 3, 6, 7, 0, 1, 4, 5,
|
||||
2, 3, 6, 7, 2, 3, 4, 5, 0, 1, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7, 6, 7,
|
||||
0, 1, 2, 3, 4, 5, 0, 1, 6, 7, 2, 3, 4, 5, 2, 3, 6, 7, 0, 1, 4, 5,
|
||||
0, 1, 2, 3, 6, 7, 4, 5, 4, 5, 6, 7, 0, 1, 2, 3, 0, 1, 4, 5, 6, 7,
|
||||
2, 3, 2, 3, 4, 5, 6, 7, 0, 1, 0, 1, 2, 3, 4, 5, 6, 7};
|
||||
return Indices256<uint32_t>{Load(d32, u32_indices + 8 * mask_bits).raw};
|
||||
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 2, 3, 4, 5, 6, 7,
|
||||
10, 11, 0, 1, 4, 5, 6, 7, 8, 9, 10, 11, 4, 5, 6, 7,
|
||||
12, 13, 0, 1, 2, 3, 6, 7, 8, 9, 12, 13, 2, 3, 6, 7,
|
||||
10, 11, 12, 13, 0, 1, 6, 7, 8, 9, 10, 11, 12, 13, 6, 7,
|
||||
14, 15, 0, 1, 2, 3, 4, 5, 8, 9, 14, 15, 2, 3, 4, 5,
|
||||
10, 11, 14, 15, 0, 1, 4, 5, 8, 9, 10, 11, 14, 15, 4, 5,
|
||||
12, 13, 14, 15, 0, 1, 2, 3, 8, 9, 12, 13, 14, 15, 2, 3,
|
||||
10, 11, 12, 13, 14, 15, 0, 1, 8, 9, 10, 11, 12, 13, 14, 15};
|
||||
return Load(d32, u32_indices + 8 * mask_bits);
|
||||
}
|
||||
|
||||
template <typename T, HWY_IF_LANE_SIZE(T, 4)>
|
||||
HWY_INLINE Indices256<uint32_t> IndicesFromNotBits(Full256<T> d,
|
||||
uint64_t mask_bits) {
|
||||
HWY_INLINE Vec256<uint32_t> IndicesFromNotBits(Full256<T> d,
|
||||
uint64_t mask_bits) {
|
||||
const RebindToUnsigned<decltype(d)> d32;
|
||||
// We need a masked Iota(). With 8 lanes, there are 256 combinations and a LUT
|
||||
// of SetTableIndices would require 8 KiB, a large part of L1D. The other
|
||||
@ -5008,49 +5069,49 @@ HWY_INLINE Indices256<uint32_t> IndicesFromNotBits(Full256<T> d,
|
||||
// bits, for a total of 1 KiB.
|
||||
alignas(16) constexpr uint32_t packed_array[256] = {
|
||||
// PrintCompressNot32x8Tables
|
||||
0x76543210, 0x07654321, 0x17654320, 0x10765432, 0x27654310, 0x20765431,
|
||||
0x21765430, 0x21076543, 0x37654210, 0x30765421, 0x31765420, 0x31076542,
|
||||
0x32765410, 0x32076541, 0x32176540, 0x32107654, 0x47653210, 0x40765321,
|
||||
0x41765320, 0x41076532, 0x42765310, 0x42076531, 0x42176530, 0x42107653,
|
||||
0x43765210, 0x43076521, 0x43176520, 0x43107652, 0x43276510, 0x43207651,
|
||||
0x43217650, 0x43210765, 0x57643210, 0x50764321, 0x51764320, 0x51076432,
|
||||
0x52764310, 0x52076431, 0x52176430, 0x52107643, 0x53764210, 0x53076421,
|
||||
0x53176420, 0x53107642, 0x53276410, 0x53207641, 0x53217640, 0x53210764,
|
||||
0x54763210, 0x54076321, 0x54176320, 0x54107632, 0x54276310, 0x54207631,
|
||||
0x54217630, 0x54210763, 0x54376210, 0x54307621, 0x54317620, 0x54310762,
|
||||
0x54327610, 0x54320761, 0x54321760, 0x54321076, 0x67543210, 0x60754321,
|
||||
0x61754320, 0x61075432, 0x62754310, 0x62075431, 0x62175430, 0x62107543,
|
||||
0x63754210, 0x63075421, 0x63175420, 0x63107542, 0x63275410, 0x63207541,
|
||||
0x63217540, 0x63210754, 0x64753210, 0x64075321, 0x64175320, 0x64107532,
|
||||
0x64275310, 0x64207531, 0x64217530, 0x64210753, 0x64375210, 0x64307521,
|
||||
0x64317520, 0x64310752, 0x64327510, 0x64320751, 0x64321750, 0x64321075,
|
||||
0x65743210, 0x65074321, 0x65174320, 0x65107432, 0x65274310, 0x65207431,
|
||||
0x65217430, 0x65210743, 0x65374210, 0x65307421, 0x65317420, 0x65310742,
|
||||
0x65327410, 0x65320741, 0x65321740, 0x65321074, 0x65473210, 0x65407321,
|
||||
0x65417320, 0x65410732, 0x65427310, 0x65420731, 0x65421730, 0x65421073,
|
||||
0x65437210, 0x65430721, 0x65431720, 0x65431072, 0x65432710, 0x65432071,
|
||||
0x65432170, 0x65432107, 0x76543210, 0x70654321, 0x71654320, 0x71065432,
|
||||
0x72654310, 0x72065431, 0x72165430, 0x72106543, 0x73654210, 0x73065421,
|
||||
0x73165420, 0x73106542, 0x73265410, 0x73206541, 0x73216540, 0x73210654,
|
||||
0x74653210, 0x74065321, 0x74165320, 0x74106532, 0x74265310, 0x74206531,
|
||||
0x74216530, 0x74210653, 0x74365210, 0x74306521, 0x74316520, 0x74310652,
|
||||
0x74326510, 0x74320651, 0x74321650, 0x74321065, 0x75643210, 0x75064321,
|
||||
0x75164320, 0x75106432, 0x75264310, 0x75206431, 0x75216430, 0x75210643,
|
||||
0x75364210, 0x75306421, 0x75316420, 0x75310642, 0x75326410, 0x75320641,
|
||||
0x75321640, 0x75321064, 0x75463210, 0x75406321, 0x75416320, 0x75410632,
|
||||
0x75426310, 0x75420631, 0x75421630, 0x75421063, 0x75436210, 0x75430621,
|
||||
0x75431620, 0x75431062, 0x75432610, 0x75432061, 0x75432160, 0x75432106,
|
||||
0x76543210, 0x76054321, 0x76154320, 0x76105432, 0x76254310, 0x76205431,
|
||||
0x76215430, 0x76210543, 0x76354210, 0x76305421, 0x76315420, 0x76310542,
|
||||
0x76325410, 0x76320541, 0x76321540, 0x76321054, 0x76453210, 0x76405321,
|
||||
0x76415320, 0x76410532, 0x76425310, 0x76420531, 0x76421530, 0x76421053,
|
||||
0x76435210, 0x76430521, 0x76431520, 0x76431052, 0x76432510, 0x76432051,
|
||||
0x76432150, 0x76432105, 0x76543210, 0x76504321, 0x76514320, 0x76510432,
|
||||
0x76524310, 0x76520431, 0x76521430, 0x76521043, 0x76534210, 0x76530421,
|
||||
0x76531420, 0x76531042, 0x76532410, 0x76532041, 0x76532140, 0x76532104,
|
||||
0x76543210, 0x76540321, 0x76541320, 0x76541032, 0x76542310, 0x76542031,
|
||||
0x76542130, 0x76542103, 0x76543210, 0x76543021, 0x76543120, 0x76543102,
|
||||
0x76543210, 0x76543201, 0x76543210, 0x76543210};
|
||||
0xfedcba98, 0x8fedcba9, 0x9fedcba8, 0x98fedcba, 0xafedcb98, 0xa8fedcb9,
|
||||
0xa9fedcb8, 0xa98fedcb, 0xbfedca98, 0xb8fedca9, 0xb9fedca8, 0xb98fedca,
|
||||
0xbafedc98, 0xba8fedc9, 0xba9fedc8, 0xba98fedc, 0xcfedba98, 0xc8fedba9,
|
||||
0xc9fedba8, 0xc98fedba, 0xcafedb98, 0xca8fedb9, 0xca9fedb8, 0xca98fedb,
|
||||
0xcbfeda98, 0xcb8feda9, 0xcb9feda8, 0xcb98feda, 0xcbafed98, 0xcba8fed9,
|
||||
0xcba9fed8, 0xcba98fed, 0xdfecba98, 0xd8fecba9, 0xd9fecba8, 0xd98fecba,
|
||||
0xdafecb98, 0xda8fecb9, 0xda9fecb8, 0xda98fecb, 0xdbfeca98, 0xdb8feca9,
|
||||
0xdb9feca8, 0xdb98feca, 0xdbafec98, 0xdba8fec9, 0xdba9fec8, 0xdba98fec,
|
||||
0xdcfeba98, 0xdc8feba9, 0xdc9feba8, 0xdc98feba, 0xdcafeb98, 0xdca8feb9,
|
||||
0xdca9feb8, 0xdca98feb, 0xdcbfea98, 0xdcb8fea9, 0xdcb9fea8, 0xdcb98fea,
|
||||
0xdcbafe98, 0xdcba8fe9, 0xdcba9fe8, 0xdcba98fe, 0xefdcba98, 0xe8fdcba9,
|
||||
0xe9fdcba8, 0xe98fdcba, 0xeafdcb98, 0xea8fdcb9, 0xea9fdcb8, 0xea98fdcb,
|
||||
0xebfdca98, 0xeb8fdca9, 0xeb9fdca8, 0xeb98fdca, 0xebafdc98, 0xeba8fdc9,
|
||||
0xeba9fdc8, 0xeba98fdc, 0xecfdba98, 0xec8fdba9, 0xec9fdba8, 0xec98fdba,
|
||||
0xecafdb98, 0xeca8fdb9, 0xeca9fdb8, 0xeca98fdb, 0xecbfda98, 0xecb8fda9,
|
||||
0xecb9fda8, 0xecb98fda, 0xecbafd98, 0xecba8fd9, 0xecba9fd8, 0xecba98fd,
|
||||
0xedfcba98, 0xed8fcba9, 0xed9fcba8, 0xed98fcba, 0xedafcb98, 0xeda8fcb9,
|
||||
0xeda9fcb8, 0xeda98fcb, 0xedbfca98, 0xedb8fca9, 0xedb9fca8, 0xedb98fca,
|
||||
0xedbafc98, 0xedba8fc9, 0xedba9fc8, 0xedba98fc, 0xedcfba98, 0xedc8fba9,
|
||||
0xedc9fba8, 0xedc98fba, 0xedcafb98, 0xedca8fb9, 0xedca9fb8, 0xedca98fb,
|
||||
0xedcbfa98, 0xedcb8fa9, 0xedcb9fa8, 0xedcb98fa, 0xedcbaf98, 0xedcba8f9,
|
||||
0xedcba9f8, 0xedcba98f, 0xfedcba98, 0xf8edcba9, 0xf9edcba8, 0xf98edcba,
|
||||
0xfaedcb98, 0xfa8edcb9, 0xfa9edcb8, 0xfa98edcb, 0xfbedca98, 0xfb8edca9,
|
||||
0xfb9edca8, 0xfb98edca, 0xfbaedc98, 0xfba8edc9, 0xfba9edc8, 0xfba98edc,
|
||||
0xfcedba98, 0xfc8edba9, 0xfc9edba8, 0xfc98edba, 0xfcaedb98, 0xfca8edb9,
|
||||
0xfca9edb8, 0xfca98edb, 0xfcbeda98, 0xfcb8eda9, 0xfcb9eda8, 0xfcb98eda,
|
||||
0xfcbaed98, 0xfcba8ed9, 0xfcba9ed8, 0xfcba98ed, 0xfdecba98, 0xfd8ecba9,
|
||||
0xfd9ecba8, 0xfd98ecba, 0xfdaecb98, 0xfda8ecb9, 0xfda9ecb8, 0xfda98ecb,
|
||||
0xfdbeca98, 0xfdb8eca9, 0xfdb9eca8, 0xfdb98eca, 0xfdbaec98, 0xfdba8ec9,
|
||||
0xfdba9ec8, 0xfdba98ec, 0xfdceba98, 0xfdc8eba9, 0xfdc9eba8, 0xfdc98eba,
|
||||
0xfdcaeb98, 0xfdca8eb9, 0xfdca9eb8, 0xfdca98eb, 0xfdcbea98, 0xfdcb8ea9,
|
||||
0xfdcb9ea8, 0xfdcb98ea, 0xfdcbae98, 0xfdcba8e9, 0xfdcba9e8, 0xfdcba98e,
|
||||
0xfedcba98, 0xfe8dcba9, 0xfe9dcba8, 0xfe98dcba, 0xfeadcb98, 0xfea8dcb9,
|
||||
0xfea9dcb8, 0xfea98dcb, 0xfebdca98, 0xfeb8dca9, 0xfeb9dca8, 0xfeb98dca,
|
||||
0xfebadc98, 0xfeba8dc9, 0xfeba9dc8, 0xfeba98dc, 0xfecdba98, 0xfec8dba9,
|
||||
0xfec9dba8, 0xfec98dba, 0xfecadb98, 0xfeca8db9, 0xfeca9db8, 0xfeca98db,
|
||||
0xfecbda98, 0xfecb8da9, 0xfecb9da8, 0xfecb98da, 0xfecbad98, 0xfecba8d9,
|
||||
0xfecba9d8, 0xfecba98d, 0xfedcba98, 0xfed8cba9, 0xfed9cba8, 0xfed98cba,
|
||||
0xfedacb98, 0xfeda8cb9, 0xfeda9cb8, 0xfeda98cb, 0xfedbca98, 0xfedb8ca9,
|
||||
0xfedb9ca8, 0xfedb98ca, 0xfedbac98, 0xfedba8c9, 0xfedba9c8, 0xfedba98c,
|
||||
0xfedcba98, 0xfedc8ba9, 0xfedc9ba8, 0xfedc98ba, 0xfedcab98, 0xfedca8b9,
|
||||
0xfedca9b8, 0xfedca98b, 0xfedcba98, 0xfedcb8a9, 0xfedcb9a8, 0xfedcb98a,
|
||||
0xfedcba98, 0xfedcba89, 0xfedcba98, 0xfedcba98};
|
||||
|
||||
// No need to mask because <_mm256_permutevar8x32_epi32> ignores bits 3..31.
|
||||
// Just shift each copy of the 32 bit LUT to extract its 4-bit fields.
|
||||
@ -5058,12 +5119,12 @@ HWY_INLINE Indices256<uint32_t> IndicesFromNotBits(Full256<T> d,
|
||||
// latency, it may be faster to use LoadDup128 and PSHUFB.
|
||||
const auto packed = Set(d32, packed_array[mask_bits]);
|
||||
alignas(32) constexpr uint32_t shifts[8] = {0, 4, 8, 12, 16, 20, 24, 28};
|
||||
return Indices256<uint32_t>{(packed >> Load(d32, shifts)).raw};
|
||||
return packed >> Load(d32, shifts);
|
||||
}
|
||||
|
||||
template <typename T, HWY_IF_LANE_SIZE(T, 8)>
|
||||
HWY_INLINE Indices256<uint32_t> IndicesFromNotBits(Full256<T> d,
|
||||
uint64_t mask_bits) {
|
||||
HWY_INLINE Vec256<uint32_t> IndicesFromNotBits(Full256<T> d,
|
||||
uint64_t mask_bits) {
|
||||
const Repartition<uint32_t, decltype(d)> d32;
|
||||
|
||||
// For 64-bit, we still need 32-bit indices because there is no 64-bit
|
||||
@ -5071,13 +5132,15 @@ HWY_INLINE Indices256<uint32_t> IndicesFromNotBits(Full256<T> d,
|
||||
// unpacking and load the entire index vector directly.
|
||||
alignas(32) constexpr uint32_t u32_indices[128] = {
|
||||
// PrintCompressNot64x4PairTables
|
||||
0, 1, 2, 3, 4, 5, 6, 7, 2, 3, 4, 5, 6, 7, 0, 1, 0, 1, 4, 5, 6, 7,
|
||||
2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 0, 1, 2, 3, 6, 7, 4, 5, 2, 3, 6, 7,
|
||||
0, 1, 4, 5, 0, 1, 6, 7, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 0, 1,
|
||||
2, 3, 4, 5, 6, 7, 2, 3, 4, 5, 0, 1, 6, 7, 0, 1, 4, 5, 2, 3, 6, 7,
|
||||
4, 5, 0, 1, 2, 3, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7, 2, 3, 0, 1, 4, 5,
|
||||
6, 7, 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7};
|
||||
return Indices256<uint32_t>{Load(d32, u32_indices + 8 * mask_bits).raw};
|
||||
8, 9, 10, 11, 12, 13, 14, 15, 10, 11, 12, 13, 14, 15, 8, 9,
|
||||
8, 9, 12, 13, 14, 15, 10, 11, 12, 13, 14, 15, 8, 9, 10, 11,
|
||||
8, 9, 10, 11, 14, 15, 12, 13, 10, 11, 14, 15, 8, 9, 12, 13,
|
||||
8, 9, 14, 15, 10, 11, 12, 13, 14, 15, 8, 9, 10, 11, 12, 13,
|
||||
8, 9, 10, 11, 12, 13, 14, 15, 10, 11, 12, 13, 8, 9, 14, 15,
|
||||
8, 9, 12, 13, 10, 11, 14, 15, 12, 13, 8, 9, 10, 11, 14, 15,
|
||||
8, 9, 10, 11, 12, 13, 14, 15, 10, 11, 8, 9, 12, 13, 14, 15,
|
||||
8, 9, 10, 11, 12, 13, 14, 15, 8, 9, 10, 11, 12, 13, 14, 15};
|
||||
return Load(d32, u32_indices + 8 * mask_bits);
|
||||
}
|
||||
template <typename T, HWY_IF_NOT_LANE_SIZE(T, 2)>
|
||||
HWY_INLINE Vec256<T> Compress(Vec256<T> v, const uint64_t mask_bits) {
|
||||
@ -5085,7 +5148,9 @@ HWY_INLINE Vec256<T> Compress(Vec256<T> v, const uint64_t mask_bits) {
|
||||
const Repartition<uint32_t, decltype(d)> du32;
|
||||
|
||||
HWY_DASSERT(mask_bits < (1ull << (32 / sizeof(T))));
|
||||
const auto indices = IndicesFromBits(d, mask_bits);
|
||||
// 32-bit indices because we only have _mm256_permutevar8x32_epi32 (there is
|
||||
// no instruction for 4x64).
|
||||
const Indices256<uint32_t> indices{IndicesFromBits(d, mask_bits).raw};
|
||||
return BitCast(d, TableLookupLanes(BitCast(du32, v), indices));
|
||||
}
|
||||
|
||||
@ -5135,7 +5200,9 @@ HWY_INLINE Vec256<T> CompressNot(Vec256<T> v, const uint64_t mask_bits) {
|
||||
const Repartition<uint32_t, decltype(d)> du32;
|
||||
|
||||
HWY_DASSERT(mask_bits < (1ull << (32 / sizeof(T))));
|
||||
const auto indices = IndicesFromNotBits(d, mask_bits);
|
||||
// 32-bit indices because we only have _mm256_permutevar8x32_epi32 (there is
|
||||
// no instruction for 4x64).
|
||||
const Indices256<uint32_t> indices{IndicesFromNotBits(d, mask_bits).raw};
|
||||
return BitCast(d, TableLookupLanes(BitCast(du32, v), indices));
|
||||
}
|
||||
|
||||
@ -5199,7 +5266,22 @@ HWY_API size_t CompressBlendedStore(Vec256<T> v, Mask256<T> m, Full256<T> d,
|
||||
T* HWY_RESTRICT unaligned) {
|
||||
const uint64_t mask_bits = detail::BitsFromMask(m);
|
||||
const size_t count = PopCount(mask_bits);
|
||||
BlendedStore(detail::Compress(v, mask_bits), FirstN(d, count), d, unaligned);
|
||||
|
||||
const Repartition<uint32_t, decltype(d)> du32;
|
||||
HWY_DASSERT(mask_bits < (1ull << (32 / sizeof(T))));
|
||||
// 32-bit indices because we only have _mm256_permutevar8x32_epi32 (there is
|
||||
// no instruction for 4x64). Nibble MSB encodes FirstN.
|
||||
const Vec256<uint32_t> idx_and_mask = detail::IndicesFromBits(d, mask_bits);
|
||||
// Shift nibble MSB into MSB
|
||||
const Mask256<uint32_t> mask32 = MaskFromVec(ShiftLeft<28>(idx_and_mask));
|
||||
// First cast to unsigned (RebindMask cannot change lane size)
|
||||
const Mask256<MakeUnsigned<T>> mask_u{mask32.raw};
|
||||
const Mask256<T> mask = RebindMask(d, mask_u);
|
||||
const Vec256<T> compressed =
|
||||
BitCast(d, TableLookupLanes(BitCast(du32, v),
|
||||
Indices256<uint32_t>{idx_and_mask.raw}));
|
||||
|
||||
BlendedStore(compressed, mask, d, unaligned);
|
||||
// Workaround for MSAN not marking output as initialized (b/233326619)
|
||||
#if HWY_IS_MSAN
|
||||
__msan_unpoison(unaligned, count * sizeof(T));
|
||||
@ -5429,6 +5511,28 @@ HWY_INLINE Vec256<T> MaxOfLanes(hwy::SizeTag<8> /* tag */,
|
||||
return Max(v10, v01);
|
||||
}
|
||||
|
||||
HWY_API Vec256<uint16_t> SumOfLanes(hwy::SizeTag<2> /* tag */,
|
||||
Vec256<uint16_t> v) {
|
||||
const Full256<uint16_t> d;
|
||||
const RepartitionToWide<decltype(d)> d32;
|
||||
const auto even = And(BitCast(d32, v), Set(d32, 0xFFFF));
|
||||
const auto odd = ShiftRight<16>(BitCast(d32, v));
|
||||
const auto sum = SumOfLanes(hwy::SizeTag<4>(), even + odd);
|
||||
// Also broadcast into odd lanes.
|
||||
return OddEven(BitCast(d, ShiftLeft<16>(sum)), BitCast(d, sum));
|
||||
}
|
||||
HWY_API Vec256<int16_t> SumOfLanes(hwy::SizeTag<2> /* tag */,
|
||||
Vec256<int16_t> v) {
|
||||
const Full256<int16_t> d;
|
||||
const RepartitionToWide<decltype(d)> d32;
|
||||
// Sign-extend
|
||||
const auto even = ShiftRight<16>(ShiftLeft<16>(BitCast(d32, v)));
|
||||
const auto odd = ShiftRight<16>(BitCast(d32, v));
|
||||
const auto sum = SumOfLanes(hwy::SizeTag<4>(), even + odd);
|
||||
// Also broadcast into odd lanes.
|
||||
return OddEven(BitCast(d, ShiftLeft<16>(sum)), BitCast(d, sum));
|
||||
}
|
||||
|
||||
HWY_API Vec256<uint16_t> MinOfLanes(hwy::SizeTag<2> /* tag */,
|
||||
Vec256<uint16_t> v) {
|
||||
const Full256<uint16_t> d;
|
||||
@ -5475,7 +5579,7 @@ HWY_API Vec256<int16_t> MaxOfLanes(hwy::SizeTag<2> /* tag */,
|
||||
|
||||
} // namespace detail
|
||||
|
||||
// Supported for {uif}32x8, {uif}64x4. Returns the sum in each lane.
|
||||
// Supported for {uif}{32,64},{ui}16. Returns the broadcasted result.
|
||||
template <typename T>
|
||||
HWY_API Vec256<T> SumOfLanes(Full256<T> d, const Vec256<T> vHL) {
|
||||
const Vec256<T> vLH = ConcatLowerUpper(d, vHL, vHL);

128 third_party/highway/hwy/ops/x86_512-inl.h vendored
@ -113,6 +113,9 @@ class Vec512 {
|
||||
using Raw = typename detail::Raw512<T>::type;
|
||||
|
||||
public:
|
||||
using PrivateT = T; // only for DFromV
|
||||
static constexpr size_t kPrivateN = 64 / sizeof(T); // only for DFromV
|
||||
|
||||
// Compound assignment. Only usable if there is a corresponding non-member
|
||||
// binary operator overload. For example, only f32 and f64 support division.
|
||||
HWY_INLINE Vec512& operator*=(const Vec512 other) {
|
||||
@ -146,6 +149,9 @@ struct Mask512 {
|
||||
typename detail::RawMask512<sizeof(T)>::type raw;
|
||||
};
|
||||
|
||||
template <typename T>
|
||||
using Full512 = Simd<T, 64 / sizeof(T), 0>;
|
||||
|
||||
// ------------------------------ BitCast
|
||||
|
||||
namespace detail {
|
||||
@ -1775,6 +1781,43 @@ HWY_INLINE Mask512<T> Xor(hwy::SizeTag<8> /*tag*/, const Mask512<T> a,
|
||||
#endif
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
HWY_INLINE Mask512<T> ExclusiveNeither(hwy::SizeTag<1> /*tag*/,
|
||||
const Mask512<T> a, const Mask512<T> b) {
|
||||
#if HWY_COMPILER_HAS_MASK_INTRINSICS
|
||||
return Mask512<T>{_kxnor_mask64(a.raw, b.raw)};
|
||||
#else
|
||||
return Mask512<T>{~(a.raw ^ b.raw)};
|
||||
#endif
|
||||
}
|
||||
template <typename T>
|
||||
HWY_INLINE Mask512<T> ExclusiveNeither(hwy::SizeTag<2> /*tag*/,
|
||||
const Mask512<T> a, const Mask512<T> b) {
|
||||
#if HWY_COMPILER_HAS_MASK_INTRINSICS
|
||||
return Mask512<T>{_kxnor_mask32(a.raw, b.raw)};
|
||||
#else
|
||||
return Mask512<T>{static_cast<__mmask32>(~(a.raw ^ b.raw) & 0xFFFFFFFF)};
|
||||
#endif
|
||||
}
|
||||
template <typename T>
|
||||
HWY_INLINE Mask512<T> ExclusiveNeither(hwy::SizeTag<4> /*tag*/,
|
||||
const Mask512<T> a, const Mask512<T> b) {
|
||||
#if HWY_COMPILER_HAS_MASK_INTRINSICS
|
||||
return Mask512<T>{_kxnor_mask16(a.raw, b.raw)};
|
||||
#else
|
||||
return Mask512<T>{static_cast<__mmask16>(~(a.raw ^ b.raw) & 0xFFFF)};
|
||||
#endif
|
||||
}
|
||||
template <typename T>
|
||||
HWY_INLINE Mask512<T> ExclusiveNeither(hwy::SizeTag<8> /*tag*/,
|
||||
const Mask512<T> a, const Mask512<T> b) {
|
||||
#if HWY_COMPILER_HAS_MASK_INTRINSICS
|
||||
return Mask512<T>{_kxnor_mask8(a.raw, b.raw)};
|
||||
#else
|
||||
return Mask512<T>{static_cast<__mmask8>(~(a.raw ^ b.raw) & 0xFF)};
|
||||
#endif
|
||||
}
|
||||
|
||||
} // namespace detail
|
||||
|
||||
template <typename T>
|
||||
@ -1802,6 +1845,11 @@ HWY_API Mask512<T> Xor(const Mask512<T> a, Mask512<T> b) {
|
||||
return detail::Xor(hwy::SizeTag<sizeof(T)>(), a, b);
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
HWY_API Mask512<T> ExclusiveNeither(const Mask512<T> a, Mask512<T> b) {
|
||||
return detail::ExclusiveNeither(hwy::SizeTag<sizeof(T)>(), a, b);
|
||||
}
|
||||
|
||||
// ------------------------------ BroadcastSignBit (ShiftRight, compare, mask)
|
||||
|
||||
HWY_API Vec512<int8_t> BroadcastSignBit(const Vec512<int8_t> v) {
|
||||
@ -3285,6 +3333,11 @@ HWY_API Vec512<bfloat16_t> ReorderDemote2To(Full512<bfloat16_t> dbf16,
|
||||
return BitCast(dbf16, OddEven(BitCast(du16, a), BitCast(du16, b_in_even)));
|
||||
}
|
||||
|
||||
HWY_API Vec512<int16_t> ReorderDemote2To(Full512<int16_t> /*d16*/,
|
||||
Vec512<int32_t> a, Vec512<int32_t> b) {
|
||||
return Vec512<int16_t>{_mm512_packs_epi32(a.raw, b.raw)};
|
||||
}
|
||||
|
||||
HWY_API Vec256<float> DemoteTo(Full256<float> /* tag */,
|
||||
const Vec512<double> v) {
|
||||
return Vec256<float>{_mm512_cvtpd_ps(v.raw)};
|
||||
@ -3646,15 +3699,21 @@ HWY_API size_t CountTrue(const Full512<T> /* tag */, const Mask512<T> mask) {
|
||||
}
|
||||
|
||||
template <typename T, HWY_IF_NOT_LANE_SIZE(T, 1)>
|
||||
HWY_API intptr_t FindFirstTrue(const Full512<T> /* tag */,
|
||||
const Mask512<T> mask) {
|
||||
return mask.raw ? intptr_t(Num0BitsBelowLS1Bit_Nonzero32(mask.raw)) : -1;
|
||||
HWY_API size_t FindKnownFirstTrue(const Full512<T> /* tag */,
|
||||
const Mask512<T> mask) {
|
||||
return Num0BitsBelowLS1Bit_Nonzero32(mask.raw);
|
||||
}
|
||||
|
||||
template <typename T, HWY_IF_LANE_SIZE(T, 1)>
|
||||
HWY_API intptr_t FindFirstTrue(const Full512<T> /* tag */,
|
||||
const Mask512<T> mask) {
|
||||
return mask.raw ? intptr_t(Num0BitsBelowLS1Bit_Nonzero64(mask.raw)) : -1;
|
||||
HWY_API size_t FindKnownFirstTrue(const Full512<T> /* tag */,
|
||||
const Mask512<T> mask) {
|
||||
return Num0BitsBelowLS1Bit_Nonzero64(mask.raw);
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
HWY_API intptr_t FindFirstTrue(const Full512<T> d, const Mask512<T> mask) {
|
||||
return mask.raw ? static_cast<intptr_t>(FindKnownFirstTrue(d, mask))
|
||||
: intptr_t{-1};
|
||||
}
|
||||
|
||||
// ------------------------------ Compress
|
||||
@ -3672,7 +3731,9 @@ template <typename T, HWY_IF_LANE_SIZE(T, 8)>
|
||||
HWY_API Vec512<T> Compress(Vec512<T> v, Mask512<T> mask) {
|
||||
// See CompressIsPartition. u64 is faster than u32.
|
||||
alignas(16) constexpr uint64_t packed_array[256] = {
|
||||
// PrintCompress32x8Tables
|
||||
// From PrintCompress32x8Tables, without the FirstN extension (there is
|
||||
// no benefit to including them because 64-bit CompressStore is anyway
|
||||
// masked, but also no harm because TableLookupLanes ignores the MSB).
|
||||
0x76543210, 0x76543210, 0x76543201, 0x76543210, 0x76543102, 0x76543120,
|
||||
0x76543021, 0x76543210, 0x76542103, 0x76542130, 0x76542031, 0x76542310,
|
||||
0x76541032, 0x76541320, 0x76540321, 0x76543210, 0x76532104, 0x76532140,
|
||||
@ -3781,7 +3842,7 @@ HWY_API Vec512<T> Compress(Vec512<T> v, const Mask512<T> mask) {
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
|
||||
16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31};
|
||||
const auto idx = LoadU(du, iota + 32 - num0);
|
||||
const Vec512<uint16_t> idx = LoadU(du, iota + 32 - num0);
|
||||
const Vec512<uint16_t> cu{_mm512_mask_permutexvar_epi16(
|
||||
demoted0.raw, m_upper, idx.raw, demoted1.raw)};
|
||||
#endif // HWY_TARGET == HWY_AVX3_DL
|
||||
@ -3800,7 +3861,9 @@ template <typename T, HWY_IF_LANE_SIZE(T, 8)>
|
||||
HWY_API Vec512<T> CompressNot(Vec512<T> v, Mask512<T> mask) {
|
||||
// See CompressIsPartition. u64 is faster than u32.
|
||||
alignas(16) constexpr uint64_t packed_array[256] = {
|
||||
// PrintCompressNot32x8Tables
|
||||
// From PrintCompressNot32x8Tables, without the FirstN extension (there is
|
||||
// no benefit to including them because 64-bit CompressStore is anyway
|
||||
// masked, but also no harm because TableLookupLanes ignores the MSB).
|
||||
0x76543210, 0x07654321, 0x17654320, 0x10765432, 0x27654310, 0x20765431,
|
||||
0x21765430, 0x21076543, 0x37654210, 0x30765421, 0x31765420, 0x31076542,
|
||||
0x32765410, 0x32076541, 0x32176540, 0x32107654, 0x47653210, 0x40765321,
|
||||
@ -4149,7 +4212,7 @@ HWY_API void StoreTransposedBlocks4(const Vec512<T> i, const Vec512<T> j,
|
||||
|
||||
HWY_INLINE Vec512<uint64_t> MulEven(const Vec512<uint64_t> a,
|
||||
const Vec512<uint64_t> b) {
|
||||
const DFromV<decltype(a)> du64;
|
||||
const Full512<uint64_t> du64;
|
||||
const RepartitionToNarrow<decltype(du64)> du32;
|
||||
const auto maskL = Set(du64, 0xFFFFFFFFULL);
|
||||
const auto a32 = BitCast(du32, a);
|
||||
@ -4178,7 +4241,7 @@ HWY_INLINE Vec512<uint64_t> MulEven(const Vec512<uint64_t> a,
|
||||
|
||||
HWY_INLINE Vec512<uint64_t> MulOdd(const Vec512<uint64_t> a,
|
||||
const Vec512<uint64_t> b) {
|
||||
const DFromV<decltype(a)> du64;
|
||||
const Full512<uint64_t> du64;
|
||||
const RepartitionToNarrow<decltype(du64)> du32;
|
||||
const auto maskL = Set(du64, 0xFFFFFFFFULL);
|
||||
const auto a32 = BitCast(du32, a);
|
||||
@ -4203,25 +4266,13 @@ HWY_INLINE Vec512<uint64_t> MulOdd(const Vec512<uint64_t> a,
|
||||
return InterleaveUpper(du64, mulL, mulH);
|
||||
}
|
||||
|
||||
// ------------------------------ ReorderWidenMulAccumulate (MulAdd, ZipLower)
|
||||
|
||||
HWY_API Vec512<float> ReorderWidenMulAccumulate(Full512<float> df32,
|
||||
Vec512<bfloat16_t> a,
|
||||
Vec512<bfloat16_t> b,
|
||||
const Vec512<float> sum0,
|
||||
Vec512<float>& sum1) {
|
||||
// TODO(janwas): _mm512_dpbf16_ps when available
|
||||
const Repartition<uint16_t, decltype(df32)> du16;
|
||||
const RebindToUnsigned<decltype(df32)> du32;
|
||||
const Vec512<uint16_t> zero = Zero(du16);
|
||||
// Lane order within sum0/1 is undefined, hence we can avoid the
|
||||
// longer-latency lane-crossing PromoteTo.
|
||||
const Vec512<uint32_t> a0 = ZipLower(du32, zero, BitCast(du16, a));
|
||||
const Vec512<uint32_t> a1 = ZipUpper(du32, zero, BitCast(du16, a));
|
||||
const Vec512<uint32_t> b0 = ZipLower(du32, zero, BitCast(du16, b));
|
||||
const Vec512<uint32_t> b1 = ZipUpper(du32, zero, BitCast(du16, b));
|
||||
sum1 = MulAdd(BitCast(df32, a1), BitCast(df32, b1), sum1);
|
||||
return MulAdd(BitCast(df32, a0), BitCast(df32, b0), sum0);
|
||||
// ------------------------------ ReorderWidenMulAccumulate
|
||||
HWY_API Vec512<int32_t> ReorderWidenMulAccumulate(Full512<int32_t> /*d32*/,
|
||||
Vec512<int16_t> a,
|
||||
Vec512<int16_t> b,
|
||||
const Vec512<int32_t> sum0,
|
||||
Vec512<int32_t>& /*sum1*/) {
|
||||
return sum0 + Vec512<int32_t>{_mm512_madd_epi16(a.raw, b.raw)};
|
||||
}
|
||||
|
||||
// ------------------------------ Reductions
|
||||
@ -4245,6 +4296,23 @@ HWY_API Vec512<float> SumOfLanes(Full512<float> d, Vec512<float> v) {
|
||||
HWY_API Vec512<double> SumOfLanes(Full512<double> d, Vec512<double> v) {
|
||||
return Set(d, _mm512_reduce_add_pd(v.raw));
|
||||
}
|
||||
HWY_API Vec512<uint16_t> SumOfLanes(Full512<uint16_t> d, Vec512<uint16_t> v) {
|
||||
const RepartitionToWide<decltype(d)> d32;
|
||||
const auto even = And(BitCast(d32, v), Set(d32, 0xFFFF));
|
||||
const auto odd = ShiftRight<16>(BitCast(d32, v));
|
||||
const auto sum = SumOfLanes(d32, even + odd);
|
||||
// Also broadcast into odd lanes.
|
||||
return OddEven(BitCast(d, ShiftLeft<16>(sum)), BitCast(d, sum));
|
||||
}
|
||||
HWY_API Vec512<int16_t> SumOfLanes(Full512<int16_t> d, Vec512<int16_t> v) {
|
||||
const RepartitionToWide<decltype(d)> d32;
|
||||
// Sign-extend
|
||||
const auto even = ShiftRight<16>(ShiftLeft<16>(BitCast(d32, v)));
|
||||
const auto odd = ShiftRight<16>(BitCast(d32, v));
|
||||
const auto sum = SumOfLanes(d32, even + odd);
|
||||
// Also broadcast into odd lanes.
|
||||
return OddEven(BitCast(d, ShiftLeft<16>(sum)), BitCast(d, sum));
|
||||
}
|
||||
|
||||
// Returns the minimum in each lane.
|
||||
HWY_API Vec512<int32_t> MinOfLanes(Full512<int32_t> d, Vec512<int32_t> v) {
|
||||
|
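An illustrative aside, not part of the vendored change: the ExclusiveNeither overloads added to x86_512-inl.h above compute ~(a ^ b) over the mask bits (the _kxnor_mask* intrinsics on AVX-512), and callers are expected to pass masks where at most one input is true per lane. A standalone scalar model with the same expectations the new mask_test.cc checks use (ExclusiveNeither8 is a hypothetical helper, one bit per lane):

// Illustrative only: scalar model of ExclusiveNeither for an 8-lane bitmask,
// mirroring the fallback path ~(a.raw ^ b.raw) masked to the lane count.
#include <cassert>
#include <cstdint>

uint8_t ExclusiveNeither8(uint8_t a, uint8_t b) {
  // Precondition (as in Highway): no lane is set in both a and b.
  return static_cast<uint8_t>(~(a ^ b));
}

int main() {
  const uint8_t none = 0x00, all = 0xFF;
  // Mirrors the checks added to mask_test.cc in this update.
  assert(ExclusiveNeither8(none, none) == all);
  assert(ExclusiveNeither8(all, none) == none);
  assert(ExclusiveNeither8(none, all) == none);
  return 0;
}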
2
third_party/highway/hwy/print-inl.h
vendored
@ -35,7 +35,7 @@ namespace hwy {
namespace HWY_NAMESPACE {

// Prints lanes around `lane`, in memory order.
template <class D, class V = Vec<D>>
template <class D, class V = VFromD<D>>
void Print(const D d, const char* caption, VecArg<V> v, size_t lane_u = 0,
           size_t max_lanes = 7) {
  const size_t N = Lanes(d);
3
third_party/highway/hwy/targets.cc
vendored
@ -43,7 +43,6 @@
#endif  // HWY_COMPILER_MSVC

#elif HWY_ARCH_ARM && HWY_OS_LINUX
#include <asm/hwcap.h>
#include <sys/auxv.h>
#endif  // HWY_ARCH_*

@ -104,7 +103,7 @@ int64_t supported_targets_for_test_ = 0;
int64_t supported_mask_ = LimitsMax<int64_t>();

#if HWY_ARCH_X86
// Arbritrary bit indices indicating which instruction set extensions are
// Arbitrary bit indices indicating which instruction set extensions are
// supported. Use enum to ensure values are distinct.
enum class FeatureIndex : uint32_t {
  kSSE = 0,
14
third_party/highway/hwy/targets.h
vendored
@ -16,7 +16,11 @@
#ifndef HIGHWAY_HWY_TARGETS_H_
#define HIGHWAY_HWY_TARGETS_H_

// Allows opting out of C++ standard library usage, which is not available in
// some Compiler Explorer environments.
#ifndef HWY_NO_LIBCXX
#include <vector>
#endif

// For SIMD module implementations and their callers. Defines which targets to
// generate and call.
@ -25,7 +29,7 @@
#include "hwy/detect_targets.h"
#include "hwy/highway_export.h"

#if !HWY_ARCH_RVV
#if !HWY_ARCH_RVV && !defined(HWY_NO_LIBCXX)
#include <atomic>
#endif

@ -61,6 +65,8 @@ HWY_DLLEXPORT void DisableTargets(int64_t disabled_targets);
// all targets.
HWY_DLLEXPORT void SetSupportedTargetsForTest(int64_t targets);

#ifndef HWY_NO_LIBCXX

// Return the list of targets in HWY_TARGETS supported by the CPU as a list of
// individual HWY_* target macros such as HWY_SCALAR or HWY_NEON. This list
// is affected by the current SetSupportedTargetsForTest() mock if any.
@ -74,6 +80,8 @@ HWY_INLINE std::vector<int64_t> SupportedAndGeneratedTargets() {
  return ret;
}

#endif  // HWY_NO_LIBCXX

static inline HWY_MAYBE_UNUSED const char* TargetName(int64_t target) {
  switch (target) {
#if HWY_ARCH_X86
@ -296,8 +304,8 @@ struct ChosenTarget {
  }

 private:
  // TODO(janwas): remove #if once <atomic> is available
#if HWY_ARCH_RVV
  // TODO(janwas): remove RVV once <atomic> is available
#if HWY_ARCH_RVV || defined(HWY_NO_LIBCXX)
  int64_t LoadMask() const { return mask_; }
  void StoreMask(int64_t mask) { mask_ = mask; }
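A hedged usage sketch, not taken from the diff: the new HWY_NO_LIBCXX macro above is an opt-out, so a translation unit built without the C++ standard library would define it before including the header (for example via -DHWY_NO_LIBCXX); the <vector>/<atomic> includes and SupportedAndGeneratedTargets() are then skipped while TargetName() remains usable.

// Illustrative only: opting out of standard-library usage as enabled here.
#define HWY_NO_LIBCXX
#include "hwy/targets.h"

const char* ScalarTargetName() {
  // TargetName() does not depend on libc++; HWY_SCALAR comes from
  // detect_targets.h, which targets.h still includes.
  return hwy::TargetName(HWY_SCALAR);
}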
2
third_party/highway/hwy/targets_test.cc
vendored
@ -37,6 +37,7 @@ DECLARE_FUNCTION(SVE_256)
DECLARE_FUNCTION(SVE2_128)
DECLARE_FUNCTION(PPC8)
DECLARE_FUNCTION(WASM)
DECLARE_FUNCTION(WASM_EMU256)
DECLARE_FUNCTION(RVV)
DECLARE_FUNCTION(SCALAR)
DECLARE_FUNCTION(EMU128)
@ -81,6 +82,7 @@ void CheckFakeFunction() {
  CallFunctionForTarget(HWY_SVE2_128, __LINE__);
  CallFunctionForTarget(HWY_PPC8, __LINE__);
  CallFunctionForTarget(HWY_WASM, __LINE__);
  CallFunctionForTarget(HWY_WASM_EMU256, __LINE__);
  CallFunctionForTarget(HWY_RVV, __LINE__);
  // The tables only have space for either HWY_SCALAR or HWY_EMU128; the former
  // is opt-in only.
@ -17,6 +17,8 @@
#include <stdint.h>
#include <string.h>  // memcpy

#include <algorithm>  // std::fill

#undef HWY_TARGET_INCLUDE
#define HWY_TARGET_INCLUDE "tests/blockwise_shift_test.cc"
#include "hwy/foreach_target.h"  // IWYU pragma: keep
@ -17,6 +17,8 @@
#include <stdint.h>
#include <string.h>

#include <algorithm>  // std::fill

#undef HWY_TARGET_INCLUDE
#define HWY_TARGET_INCLUDE "tests/blockwise_test.cc"
#include "hwy/foreach_target.h"  // IWYU pragma: keep
@ -17,6 +17,8 @@
#include <stdint.h>
#include <string.h>  // memcpy

#include <algorithm>  // std::fill

#undef HWY_TARGET_INCLUDE
#define HWY_TARGET_INCLUDE "tests/combine_test.cc"
#include "hwy/foreach_target.h"  // IWYU pragma: keep
54
third_party/highway/hwy/tests/compare_test.cc
vendored
@ -338,7 +338,7 @@ HWY_NOINLINE void TestAllLt128Upper() {
|
||||
ForGEVectors<128, TestLt128Upper>()(uint64_t());
|
||||
}
|
||||
|
||||
struct TestEq128 {
|
||||
struct TestEq128 { // Also Ne128
|
||||
template <typename T, class D>
|
||||
HWY_NOINLINE void operator()(T /*unused*/, D d) {
|
||||
using V = Vec<D>;
|
||||
@ -353,15 +353,24 @@ struct TestEq128 {
|
||||
HWY_ASSERT_MASK_EQ(d, mask_true, Eq128(d, v00, v00));
|
||||
HWY_ASSERT_MASK_EQ(d, mask_true, Eq128(d, v01, v01));
|
||||
HWY_ASSERT_MASK_EQ(d, mask_true, Eq128(d, v10, v10));
|
||||
HWY_ASSERT_MASK_EQ(d, mask_false, Ne128(d, v00, v00));
|
||||
HWY_ASSERT_MASK_EQ(d, mask_false, Ne128(d, v01, v01));
|
||||
HWY_ASSERT_MASK_EQ(d, mask_false, Ne128(d, v10, v10));
|
||||
|
||||
HWY_ASSERT_MASK_EQ(d, mask_false, Eq128(d, v00, v01));
|
||||
HWY_ASSERT_MASK_EQ(d, mask_false, Eq128(d, v01, v10));
|
||||
HWY_ASSERT_MASK_EQ(d, mask_false, Eq128(d, v01, v11));
|
||||
HWY_ASSERT_MASK_EQ(d, mask_true, Ne128(d, v00, v01));
|
||||
HWY_ASSERT_MASK_EQ(d, mask_true, Ne128(d, v01, v10));
|
||||
HWY_ASSERT_MASK_EQ(d, mask_true, Ne128(d, v01, v11));
|
||||
|
||||
// Reversed order
|
||||
HWY_ASSERT_MASK_EQ(d, mask_false, Eq128(d, v01, v00));
|
||||
HWY_ASSERT_MASK_EQ(d, mask_false, Eq128(d, v10, v01));
|
||||
HWY_ASSERT_MASK_EQ(d, mask_false, Eq128(d, v11, v01));
|
||||
HWY_ASSERT_MASK_EQ(d, mask_true, Ne128(d, v01, v00));
|
||||
HWY_ASSERT_MASK_EQ(d, mask_true, Ne128(d, v10, v01));
|
||||
HWY_ASSERT_MASK_EQ(d, mask_true, Ne128(d, v11, v01));
|
||||
|
||||
// Also check 128-bit blocks are independent
|
||||
const V iota = Iota(d, 1);
|
||||
@ -369,10 +378,16 @@ struct TestEq128 {
|
||||
HWY_ASSERT_MASK_EQ(d, mask_false, Eq128(d, iota, Add(iota, v10)));
|
||||
HWY_ASSERT_MASK_EQ(d, mask_false, Eq128(d, Add(iota, v01), iota));
|
||||
HWY_ASSERT_MASK_EQ(d, mask_false, Eq128(d, Add(iota, v10), iota));
|
||||
HWY_ASSERT_MASK_EQ(d, mask_true, Ne128(d, iota, Add(iota, v01)));
|
||||
HWY_ASSERT_MASK_EQ(d, mask_true, Ne128(d, iota, Add(iota, v10)));
|
||||
HWY_ASSERT_MASK_EQ(d, mask_true, Ne128(d, Add(iota, v01), iota));
|
||||
HWY_ASSERT_MASK_EQ(d, mask_true, Ne128(d, Add(iota, v10), iota));
|
||||
|
||||
// Max value
|
||||
const V vm = Make128(d, LimitsMax<T>(), LimitsMax<T>());
|
||||
HWY_ASSERT_MASK_EQ(d, mask_true, Eq128(d, vm, vm));
|
||||
HWY_ASSERT_MASK_EQ(d, mask_false, Ne128(d, vm, vm));
|
||||
|
||||
HWY_ASSERT_MASK_EQ(d, mask_false, Eq128(d, vm, v00));
|
||||
HWY_ASSERT_MASK_EQ(d, mask_false, Eq128(d, vm, v01));
|
||||
HWY_ASSERT_MASK_EQ(d, mask_false, Eq128(d, vm, v10));
|
||||
@ -381,12 +396,21 @@ struct TestEq128 {
|
||||
HWY_ASSERT_MASK_EQ(d, mask_false, Eq128(d, v01, vm));
|
||||
HWY_ASSERT_MASK_EQ(d, mask_false, Eq128(d, v10, vm));
|
||||
HWY_ASSERT_MASK_EQ(d, mask_false, Eq128(d, v11, vm));
|
||||
|
||||
HWY_ASSERT_MASK_EQ(d, mask_true, Ne128(d, vm, v00));
|
||||
HWY_ASSERT_MASK_EQ(d, mask_true, Ne128(d, vm, v01));
|
||||
HWY_ASSERT_MASK_EQ(d, mask_true, Ne128(d, vm, v10));
|
||||
HWY_ASSERT_MASK_EQ(d, mask_true, Ne128(d, vm, v11));
|
||||
HWY_ASSERT_MASK_EQ(d, mask_true, Ne128(d, v00, vm));
|
||||
HWY_ASSERT_MASK_EQ(d, mask_true, Ne128(d, v01, vm));
|
||||
HWY_ASSERT_MASK_EQ(d, mask_true, Ne128(d, v10, vm));
|
||||
HWY_ASSERT_MASK_EQ(d, mask_true, Ne128(d, v11, vm));
|
||||
}
|
||||
};
|
||||
|
||||
HWY_NOINLINE void TestAllEq128() { ForGEVectors<128, TestEq128>()(uint64_t()); }
|
||||
|
||||
struct TestEq128Upper {
|
||||
struct TestEq128Upper { // Also Ne128Upper
|
||||
template <typename T, class D>
|
||||
HWY_NOINLINE void operator()(T /*unused*/, D d) {
|
||||
using V = Vec<D>;
|
||||
@ -401,26 +425,43 @@ struct TestEq128Upper {
|
||||
HWY_ASSERT_MASK_EQ(d, mask_true, Eq128Upper(d, v00, v00));
|
||||
HWY_ASSERT_MASK_EQ(d, mask_true, Eq128Upper(d, v01, v01));
|
||||
HWY_ASSERT_MASK_EQ(d, mask_true, Eq128Upper(d, v10, v10));
|
||||
HWY_ASSERT_MASK_EQ(d, mask_false, Ne128Upper(d, v00, v00));
|
||||
HWY_ASSERT_MASK_EQ(d, mask_false, Ne128Upper(d, v01, v01));
|
||||
HWY_ASSERT_MASK_EQ(d, mask_false, Ne128Upper(d, v10, v10));
|
||||
|
||||
HWY_ASSERT_MASK_EQ(d, mask_true, Eq128Upper(d, v00, v01));
|
||||
HWY_ASSERT_MASK_EQ(d, mask_false, Ne128Upper(d, v00, v01));
|
||||
|
||||
HWY_ASSERT_MASK_EQ(d, mask_false, Eq128Upper(d, v01, v10));
|
||||
HWY_ASSERT_MASK_EQ(d, mask_false, Eq128Upper(d, v01, v11));
|
||||
HWY_ASSERT_MASK_EQ(d, mask_true, Ne128Upper(d, v01, v10));
|
||||
HWY_ASSERT_MASK_EQ(d, mask_true, Ne128Upper(d, v01, v11));
|
||||
|
||||
// Reversed order
|
||||
HWY_ASSERT_MASK_EQ(d, mask_true, Eq128Upper(d, v01, v00));
|
||||
HWY_ASSERT_MASK_EQ(d, mask_false, Ne128Upper(d, v01, v00));
|
||||
|
||||
HWY_ASSERT_MASK_EQ(d, mask_false, Eq128Upper(d, v10, v01));
|
||||
HWY_ASSERT_MASK_EQ(d, mask_false, Eq128Upper(d, v11, v01));
|
||||
HWY_ASSERT_MASK_EQ(d, mask_true, Ne128Upper(d, v10, v01));
|
||||
HWY_ASSERT_MASK_EQ(d, mask_true, Ne128Upper(d, v11, v01));
|
||||
|
||||
// Also check 128-bit blocks are independent
|
||||
const V iota = Iota(d, 1);
|
||||
HWY_ASSERT_MASK_EQ(d, mask_true, Eq128Upper(d, iota, Add(iota, v01)));
|
||||
HWY_ASSERT_MASK_EQ(d, mask_false, Ne128Upper(d, iota, Add(iota, v01)));
|
||||
HWY_ASSERT_MASK_EQ(d, mask_false, Eq128Upper(d, iota, Add(iota, v10)));
|
||||
HWY_ASSERT_MASK_EQ(d, mask_true, Ne128Upper(d, iota, Add(iota, v10)));
|
||||
HWY_ASSERT_MASK_EQ(d, mask_true, Eq128Upper(d, Add(iota, v01), iota));
|
||||
HWY_ASSERT_MASK_EQ(d, mask_false, Ne128Upper(d, Add(iota, v01), iota));
|
||||
HWY_ASSERT_MASK_EQ(d, mask_false, Eq128Upper(d, Add(iota, v10), iota));
|
||||
HWY_ASSERT_MASK_EQ(d, mask_true, Ne128Upper(d, Add(iota, v10), iota));
|
||||
|
||||
// Max value
|
||||
const V vm = Make128(d, LimitsMax<T>(), LimitsMax<T>());
|
||||
HWY_ASSERT_MASK_EQ(d, mask_true, Eq128Upper(d, vm, vm));
|
||||
HWY_ASSERT_MASK_EQ(d, mask_false, Ne128Upper(d, vm, vm));
|
||||
|
||||
HWY_ASSERT_MASK_EQ(d, mask_false, Eq128Upper(d, vm, v00));
|
||||
HWY_ASSERT_MASK_EQ(d, mask_false, Eq128Upper(d, vm, v01));
|
||||
HWY_ASSERT_MASK_EQ(d, mask_false, Eq128Upper(d, vm, v10));
|
||||
@ -429,6 +470,15 @@ struct TestEq128Upper {
|
||||
HWY_ASSERT_MASK_EQ(d, mask_false, Eq128Upper(d, v01, vm));
|
||||
HWY_ASSERT_MASK_EQ(d, mask_false, Eq128Upper(d, v10, vm));
|
||||
HWY_ASSERT_MASK_EQ(d, mask_false, Eq128Upper(d, v11, vm));
|
||||
|
||||
HWY_ASSERT_MASK_EQ(d, mask_true, Ne128Upper(d, vm, v00));
|
||||
HWY_ASSERT_MASK_EQ(d, mask_true, Ne128Upper(d, vm, v01));
|
||||
HWY_ASSERT_MASK_EQ(d, mask_true, Ne128Upper(d, vm, v10));
|
||||
HWY_ASSERT_MASK_EQ(d, mask_true, Ne128Upper(d, vm, v11));
|
||||
HWY_ASSERT_MASK_EQ(d, mask_true, Ne128Upper(d, v00, vm));
|
||||
HWY_ASSERT_MASK_EQ(d, mask_true, Ne128Upper(d, v01, vm));
|
||||
HWY_ASSERT_MASK_EQ(d, mask_true, Ne128Upper(d, v10, vm));
|
||||
HWY_ASSERT_MASK_EQ(d, mask_true, Ne128Upper(d, v11, vm));
|
||||
}
|
||||
};
|
||||
|
||||
|
77
third_party/highway/hwy/tests/compress_test.cc
vendored
@ -37,13 +37,15 @@ namespace HWY_NAMESPACE {
|
||||
#if !HWY_PRINT_TABLES || HWY_IDE
|
||||
|
||||
template <class D, class DI, typename T = TFromD<D>, typename TI = TFromD<DI>>
|
||||
void CheckStored(D d, DI di, size_t expected_pos, size_t actual_pos,
|
||||
size_t num_to_check, const AlignedFreeUniquePtr<T[]>& in,
|
||||
void CheckStored(D d, DI di, const char* op, size_t expected_pos,
|
||||
size_t actual_pos, size_t num_to_check,
|
||||
const AlignedFreeUniquePtr<T[]>& in,
|
||||
const AlignedFreeUniquePtr<TI[]>& mask_lanes,
|
||||
const AlignedFreeUniquePtr<T[]>& expected, const T* actual_u,
|
||||
int line) {
|
||||
if (expected_pos != actual_pos) {
|
||||
hwy::Abort(__FILE__, line, "Size mismatch for %s: expected %d, actual %d\n",
|
||||
hwy::Abort(__FILE__, line,
|
||||
"%s: size mismatch for %s: expected %d, actual %d\n", op,
|
||||
TypeName(T(), Lanes(d)).c_str(), static_cast<int>(expected_pos),
|
||||
static_cast<int>(actual_pos));
|
||||
}
|
||||
@ -51,7 +53,7 @@ void CheckStored(D d, DI di, size_t expected_pos, size_t actual_pos,
|
||||
for (size_t i = 0; i < num_to_check; ++i) {
|
||||
if (!IsEqual(expected[i], actual_u[i])) {
|
||||
const size_t N = Lanes(d);
|
||||
fprintf(stderr, "Mismatch at i=%d of %d, line %d:\n\n",
|
||||
fprintf(stderr, "%s: mismatch at i=%d of %d, line %d:\n\n", op,
|
||||
static_cast<int>(i), static_cast<int>(num_to_check), line);
|
||||
Print(di, "mask", Load(di, mask_lanes.get()), 0, N);
|
||||
Print(d, "in", Load(d, in.get()), 0, N);
|
||||
@ -91,9 +93,9 @@ struct TestCompress {
|
||||
for (size_t rep = 0; rep < AdjustedReps(200); ++rep) {
|
||||
size_t expected_pos = 0;
|
||||
for (size_t i = 0; i < N; ++i) {
|
||||
const uint64_t bits = Random32(&rng);
|
||||
const uint64_t r = Random32(&rng);
|
||||
in_lanes[i] = T(); // cannot initialize float16_t directly.
|
||||
CopyBytes<sizeof(T)>(&bits, &in_lanes[i]); // not same size
|
||||
CopyBytes<sizeof(T)>(&r, &in_lanes[i]); // not same size
|
||||
mask_lanes[i] = (Random32(&rng) & 1024) ? TI(1) : TI(0);
|
||||
if (mask_lanes[i] > 0) {
|
||||
expected[expected_pos++] = in_lanes[i];
|
||||
@ -124,30 +126,32 @@ struct TestCompress {
|
||||
// Compress
|
||||
memset(actual_u, 0, N * sizeof(T));
|
||||
StoreU(Compress(in, mask), d, actual_u);
|
||||
CheckStored(d, di, expected_pos, expected_pos, num_to_check, in_lanes,
|
||||
mask_lanes, expected, actual_u, __LINE__);
|
||||
CheckStored(d, di, "Compress", expected_pos, expected_pos, num_to_check,
|
||||
in_lanes, mask_lanes, expected, actual_u, __LINE__);
|
||||
|
||||
// CompressNot
|
||||
memset(actual_u, 0, N * sizeof(T));
|
||||
StoreU(CompressNot(in, Not(mask)), d, actual_u);
|
||||
CheckStored(d, di, expected_pos, expected_pos, num_to_check, in_lanes,
|
||||
mask_lanes, expected, actual_u, __LINE__);
|
||||
CheckStored(d, di, "CompressNot", expected_pos, expected_pos,
|
||||
num_to_check, in_lanes, mask_lanes, expected, actual_u,
|
||||
__LINE__);
|
||||
|
||||
// CompressStore
|
||||
memset(actual_u, 0, N * sizeof(T));
|
||||
const size_t size1 = CompressStore(in, mask, d, actual_u);
|
||||
// expected_pos instead of num_to_check because this op is not
|
||||
// affected by CompressIsPartition.
|
||||
CheckStored(d, di, expected_pos, size1, expected_pos, in_lanes,
|
||||
mask_lanes, expected, actual_u, __LINE__);
|
||||
CheckStored(d, di, "CompressStore", expected_pos, size1, expected_pos,
|
||||
in_lanes, mask_lanes, expected, actual_u, __LINE__);
|
||||
|
||||
// CompressBlendedStore
|
||||
memset(actual_u, 0, N * sizeof(T));
|
||||
const size_t size2 = CompressBlendedStore(in, mask, d, actual_u);
|
||||
// expected_pos instead of num_to_check because this op only writes
|
||||
// the mask=true lanes.
|
||||
CheckStored(d, di, expected_pos, size2, expected_pos, in_lanes,
|
||||
mask_lanes, expected, actual_u, __LINE__);
|
||||
CheckStored(d, di, "CompressBlendedStore", expected_pos, size2,
|
||||
expected_pos, in_lanes, mask_lanes, expected, actual_u,
|
||||
__LINE__);
|
||||
// Subsequent lanes are untouched.
|
||||
for (size_t i = size2; i < N; ++i) {
|
||||
HWY_ASSERT_EQ(zero, actual_u[i]);
|
||||
@ -156,16 +160,18 @@ struct TestCompress {
|
||||
// CompressBits
|
||||
memset(actual_u, 0, N * sizeof(T));
|
||||
StoreU(CompressBits(in, bits.get()), d, actual_u);
|
||||
CheckStored(d, di, expected_pos, expected_pos, num_to_check, in_lanes,
|
||||
mask_lanes, expected, actual_u, __LINE__);
|
||||
CheckStored(d, di, "CompressBits", expected_pos, expected_pos,
|
||||
num_to_check, in_lanes, mask_lanes, expected, actual_u,
|
||||
__LINE__);
|
||||
|
||||
// CompressBitsStore
|
||||
memset(actual_u, 0, N * sizeof(T));
|
||||
const size_t size3 = CompressBitsStore(in, bits.get(), d, actual_u);
|
||||
// expected_pos instead of num_to_check because this op is not
|
||||
// affected by CompressIsPartition.
|
||||
CheckStored(d, di, expected_pos, size3, expected_pos, in_lanes,
|
||||
mask_lanes, expected, actual_u, __LINE__);
|
||||
CheckStored(d, di, "CompressBitsStore", expected_pos, size3,
|
||||
expected_pos, in_lanes, mask_lanes, expected, actual_u,
|
||||
__LINE__);
|
||||
} // rep
|
||||
} // frac
|
||||
} // operator()
|
||||
@ -230,8 +236,9 @@ struct TestCompressBlocks {
|
||||
// CompressBlocksNot
|
||||
memset(actual.get(), 0, N * sizeof(T));
|
||||
StoreU(CompressBlocksNot(in, Not(mask)), d, actual.get());
|
||||
CheckStored(d, di, expected_pos, expected_pos, num_to_check, in_lanes,
|
||||
mask_lanes, expected, actual.get(), __LINE__);
|
||||
CheckStored(d, di, "CompressBlocksNot", expected_pos, expected_pos,
|
||||
num_to_check, in_lanes, mask_lanes, expected, actual.get(),
|
||||
__LINE__);
|
||||
} // rep
|
||||
#endif // HWY_TARGET == HWY_SCALAR
|
||||
} // operator()
|
||||
@ -305,11 +312,13 @@ void PrintCompressNot16x8Tables() {
|
||||
printf("\n");
|
||||
}
|
||||
|
||||
// Compressed to nibbles, unpacked via variable right shift
|
||||
// Compressed to nibbles, unpacked via variable right shift. Also includes
|
||||
// FirstN bits in the nibble MSB.
|
||||
void PrintCompress32x8Tables() {
|
||||
printf("======================================= 32/64x8\n");
|
||||
constexpr size_t N = 8; // AVX2 or 64-bit AVX3
|
||||
for (uint64_t code = 0; code < (1ull << N); ++code) {
|
||||
const size_t count = PopCount(code);
|
||||
std::array<uint32_t, N> indices{0};
|
||||
size_t pos = 0;
|
||||
// All lanes where mask = true
|
||||
@ -330,6 +339,10 @@ void PrintCompress32x8Tables() {
|
||||
uint64_t packed = 0;
|
||||
for (size_t i = 0; i < N; ++i) {
|
||||
HWY_ASSERT(indices[i] < N);
|
||||
if (i < count) {
|
||||
indices[i] |= N;
|
||||
HWY_ASSERT(indices[i] < 0x10);
|
||||
}
|
||||
packed += indices[i] << (i * 4);
|
||||
}
|
||||
|
||||
@ -344,6 +357,7 @@ void PrintCompressNot32x8Tables() {
|
||||
constexpr size_t N = 8; // AVX2 or 64-bit AVX3
|
||||
for (uint64_t not_code = 0; not_code < (1ull << N); ++not_code) {
|
||||
const uint64_t code = ~not_code;
|
||||
const size_t count = PopCount(code);
|
||||
std::array<uint32_t, N> indices{0};
|
||||
size_t pos = 0;
|
||||
// All lanes where mask = true
|
||||
@ -364,6 +378,10 @@ void PrintCompressNot32x8Tables() {
|
||||
uint64_t packed = 0;
|
||||
for (size_t i = 0; i < N; ++i) {
|
||||
HWY_ASSERT(indices[i] < N);
|
||||
if (i < count) {
|
||||
indices[i] |= N;
|
||||
HWY_ASSERT(indices[i] < 0x10);
|
||||
}
|
||||
packed += indices[i] << (i * 4);
|
||||
}
|
||||
|
||||
@ -504,11 +522,13 @@ void PrintCompressNot64x4Tables() {
|
||||
printf("\n");
|
||||
}
|
||||
|
||||
// Same as above, but prints pairs of u32 indices (for AVX2)
|
||||
// Same as above, but prints pairs of u32 indices (for AVX2). Also includes
|
||||
// FirstN bits in the nibble MSB.
|
||||
void PrintCompress64x4PairTables() {
|
||||
printf("======================================= 64x4 u32 index\n");
|
||||
constexpr size_t N = 4; // AVX2
|
||||
for (uint64_t code = 0; code < (1ull << N); ++code) {
|
||||
const size_t count = PopCount(code);
|
||||
std::array<size_t, N> indices{0};
|
||||
size_t pos = 0;
|
||||
// All lanes where mask = true
|
||||
@ -530,8 +550,10 @@ void PrintCompress64x4PairTables() {
|
||||
// interpreted modulo N. Compression is not worth the extra shift+AND
|
||||
// because the table is anyway only 512 bytes.
|
||||
for (size_t i = 0; i < N; ++i) {
|
||||
printf("%d, %d, ", static_cast<int>(2 * indices[i] + 0),
|
||||
static_cast<int>(2 * indices[i]) + 1);
|
||||
const int first_n_bit = i < count ? 8 : 0;
|
||||
const int low = static_cast<int>(2 * indices[i]) + first_n_bit;
|
||||
HWY_ASSERT(low < 0x10);
|
||||
printf("%d, %d, ", low, low + 1);
|
||||
}
|
||||
}
|
||||
printf("\n");
|
||||
@ -542,6 +564,7 @@ void PrintCompressNot64x4PairTables() {
|
||||
constexpr size_t N = 4; // AVX2
|
||||
for (uint64_t not_code = 0; not_code < (1ull << N); ++not_code) {
|
||||
const uint64_t code = ~not_code;
|
||||
const size_t count = PopCount(code);
|
||||
std::array<size_t, N> indices{0};
|
||||
size_t pos = 0;
|
||||
// All lanes where mask = true
|
||||
@ -563,8 +586,10 @@ void PrintCompressNot64x4PairTables() {
|
||||
// interpreted modulo N. Compression is not worth the extra shift+AND
|
||||
// because the table is anyway only 512 bytes.
|
||||
for (size_t i = 0; i < N; ++i) {
|
||||
printf("%d, %d, ", static_cast<int>(2 * indices[i] + 0),
|
||||
static_cast<int>(2 * indices[i]) + 1);
|
||||
const int first_n_bit = i < count ? 8 : 0;
|
||||
const int low = static_cast<int>(2 * indices[i]) + first_n_bit;
|
||||
HWY_ASSERT(low < 0x10);
|
||||
printf("%d, %d, ", low, low + 1);
|
||||
}
|
||||
}
|
||||
printf("\n");
|
||||
|
2
third_party/highway/hwy/tests/demote_test.cc
vendored
@ -16,6 +16,8 @@
#include <stddef.h>
#include <stdint.h>

#include <cmath>  // std::isfinite

#undef HWY_TARGET_INCLUDE
#define HWY_TARGET_INCLUDE "tests/demote_test.cc"
#include "hwy/foreach_target.h"  // IWYU pragma: keep
3
third_party/highway/hwy/tests/float_test.cc
vendored
@ -18,8 +18,9 @@
#include <stddef.h>
#include <stdint.h>

#include <algorithm>
#include <algorithm>  // std::copy, std::fill
#include <limits>
#include <cmath>  // std::abs, std::isnan, std::isinf, std::ceil, std::floor

#undef HWY_TARGET_INCLUDE
#define HWY_TARGET_INCLUDE "tests/float_test.cc"
2
third_party/highway/hwy/tests/hwy_gtest.h
vendored
@ -22,7 +22,7 @@
#include <stdint.h>

#include <string>
#include <utility>  // std::tuple
#include <tuple>

#include "gtest/gtest.h"
#include "hwy/highway.h"
18
third_party/highway/hwy/tests/mask_test.cc
vendored
@ -17,6 +17,8 @@
|
||||
#include <stdint.h>
|
||||
#include <string.h> // memcmp
|
||||
|
||||
#include <algorithm> // std::fill
|
||||
|
||||
#undef HWY_TARGET_INCLUDE
|
||||
#define HWY_TARGET_INCLUDE "tests/mask_test.cc"
|
||||
#include "hwy/foreach_target.h" // IWYU pragma: keep
|
||||
@ -189,7 +191,7 @@ HWY_NOINLINE void TestAllCountTrue() {
|
||||
ForAllTypes(ForPartialVectors<TestCountTrue>());
|
||||
}
|
||||
|
||||
struct TestFindFirstTrue {
|
||||
struct TestFindFirstTrue { // Also FindKnownFirstTrue
|
||||
template <class T, class D>
|
||||
HWY_NOINLINE void operator()(T /*unused*/, D d) {
|
||||
using TI = MakeSigned<T>; // For mask > 0 comparison
|
||||
@ -203,17 +205,18 @@ struct TestFindFirstTrue {
|
||||
|
||||
HWY_ASSERT_EQ(intptr_t(-1), FindFirstTrue(d, MaskFalse(d)));
|
||||
HWY_ASSERT_EQ(intptr_t(0), FindFirstTrue(d, MaskTrue(d)));
|
||||
HWY_ASSERT_EQ(size_t(0), FindKnownFirstTrue(d, MaskTrue(d)));
|
||||
|
||||
for (size_t code = 1; code < (1ull << max_lanes); ++code) {
|
||||
for (size_t i = 0; i < max_lanes; ++i) {
|
||||
bool_lanes[i] = (code & (1ull << i)) ? TI(1) : TI(0);
|
||||
}
|
||||
|
||||
const intptr_t expected = static_cast<intptr_t>(
|
||||
Num0BitsBelowLS1Bit_Nonzero32(static_cast<uint32_t>(code)));
|
||||
const size_t expected =
|
||||
Num0BitsBelowLS1Bit_Nonzero32(static_cast<uint32_t>(code));
|
||||
const auto mask = RebindMask(d, Gt(Load(di, bool_lanes.get()), Zero(di)));
|
||||
const intptr_t actual = FindFirstTrue(d, mask);
|
||||
HWY_ASSERT_EQ(expected, actual);
|
||||
HWY_ASSERT_EQ(static_cast<intptr_t>(expected), FindFirstTrue(d, mask));
|
||||
HWY_ASSERT_EQ(expected, FindKnownFirstTrue(d, mask));
|
||||
}
|
||||
}
|
||||
};
|
||||
@ -237,6 +240,11 @@ struct TestLogicalMask {
|
||||
HWY_ASSERT_MASK_EQ(d, m0, Not(m_all));
|
||||
HWY_ASSERT_MASK_EQ(d, m_all, Not(m0));
|
||||
|
||||
Print(d, ".", VecFromMask(d, ExclusiveNeither(m0, m0)));
|
||||
HWY_ASSERT_MASK_EQ(d, m_all, ExclusiveNeither(m0, m0));
|
||||
HWY_ASSERT_MASK_EQ(d, m0, ExclusiveNeither(m_all, m0));
|
||||
HWY_ASSERT_MASK_EQ(d, m0, ExclusiveNeither(m0, m_all));
|
||||
|
||||
// For all combinations of zero/nonzero state of subset of lanes:
|
||||
const size_t max_lanes = AdjustedLog2Reps(HWY_MIN(N, size_t(6)));
|
||||
for (size_t code = 0; code < (1ull << max_lanes); ++code) {
|
||||
|
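Illustrative sketch, not part of the test file: the FindKnownFirstTrue checks added to mask_test.cc above rely on the contract that FindKnownFirstTrue(d, mask) may assume at least one lane is true and returns size_t, while FindFirstTrue(d, mask) returns intptr_t and -1 for an all-false mask (the x86_512-inl.h change earlier in this diff now implements the latter in terms of the former). The same relationship modeled on a plain bitmask:

// Illustrative only: bitmask model of FindFirstTrue vs. FindKnownFirstTrue.
#include <cassert>
#include <cstdint>

size_t FindKnownFirstTrueBits(uint32_t bits) {  // precondition: bits != 0
  size_t i = 0;
  while ((bits & 1u) == 0) {
    bits >>= 1;
    ++i;
  }
  return i;
}

intptr_t FindFirstTrueBits(uint32_t bits) {
  return bits ? static_cast<intptr_t>(FindKnownFirstTrueBits(bits)) : -1;
}

int main() {
  assert(FindFirstTrueBits(0) == -1);
  assert(FindKnownFirstTrueBits(0x28) == 3);  // lowest set bit of 0b101000
  assert(FindFirstTrueBits(0x28) == 3);
  return 0;
}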
2
third_party/highway/hwy/tests/memory_test.cc
vendored
@ -23,6 +23,8 @@
#include <stddef.h>
#include <stdint.h>

#include <algorithm>  // std::fill

#undef HWY_TARGET_INCLUDE
#define HWY_TARGET_INCLUDE "tests/memory_test.cc"
#include "hwy/cache_control.h"
86
third_party/highway/hwy/tests/mul_test.cc
vendored
@ -26,6 +26,15 @@ HWY_BEFORE_NAMESPACE();
|
||||
namespace hwy {
|
||||
namespace HWY_NAMESPACE {
|
||||
|
||||
template <size_t kBits>
|
||||
constexpr uint64_t FirstBits() {
|
||||
return (1ull << kBits) - 1;
|
||||
}
|
||||
template <>
|
||||
constexpr uint64_t FirstBits<64>() {
|
||||
return ~uint64_t{0};
|
||||
}
|
||||
|
||||
struct TestUnsignedMul {
|
||||
template <typename T, class D>
|
||||
HWY_NOINLINE void operator()(T /*unused*/, D d) {
|
||||
@ -56,9 +65,8 @@ struct TestUnsignedMul {
|
||||
HWY_ASSERT_VEC_EQ(d, vmax, Mul(vmax, v1));
|
||||
HWY_ASSERT_VEC_EQ(d, vmax, Mul(v1, vmax));
|
||||
|
||||
const size_t bits = sizeof(T) * 8;
|
||||
const uint64_t mask = bits==64 ? (~uint64_t{0}) : (1ull << bits) - 1;
|
||||
const T max2 = (static_cast<uint64_t>(max) * max) & mask;
|
||||
constexpr uint64_t kMask = FirstBits<sizeof(T) * 8>();
|
||||
const T max2 = (static_cast<uint64_t>(max) * max) & kMask;
|
||||
HWY_ASSERT_VEC_EQ(d, Set(d, max2), Mul(vmax, vmax));
|
||||
}
|
||||
};
|
||||
@ -349,64 +357,65 @@ struct TestReorderWidenMulAccumulate {
|
||||
HWY_NOINLINE void operator()(TN /*unused*/, DN dn) {
|
||||
using TW = MakeWide<TN>;
|
||||
const RepartitionToWide<DN> dw;
|
||||
const auto f0 = Zero(dw);
|
||||
const auto f1 = Set(dw, 1.0f);
|
||||
const auto fi = Iota(dw, 1);
|
||||
const auto bf0 = ReorderDemote2To(dn, f0, f0);
|
||||
const auto bf1 = ReorderDemote2To(dn, f1, f1);
|
||||
const auto bfi = ReorderDemote2To(dn, fi, fi);
|
||||
const size_t NW = Lanes(dw);
|
||||
auto delta = AllocateAligned<TW>(2 * NW);
|
||||
for (size_t i = 0; i < 2 * NW; ++i) {
|
||||
delta[i] = 0.0f;
|
||||
}
|
||||
const Half<DN> dnh;
|
||||
using VW = Vec<decltype(dw)>;
|
||||
using VN = Vec<decltype(dn)>;
|
||||
const size_t NN = Lanes(dn);
|
||||
|
||||
const VW f0 = Zero(dw);
|
||||
const VW f1 = Set(dw, TW{1});
|
||||
const VN bf0 = Zero(dn);
|
||||
// Cannot Set() bfloat16_t directly.
|
||||
const VN bf1 = ReorderDemote2To(dn, f1, f1);
|
||||
|
||||
// Any input zero => both outputs zero
|
||||
auto sum1 = f0;
|
||||
VW sum1 = f0;
|
||||
HWY_ASSERT_VEC_EQ(dw, f0,
|
||||
ReorderWidenMulAccumulate(dw, bf0, bf0, f0, sum1));
|
||||
HWY_ASSERT_VEC_EQ(dw, f0, sum1);
|
||||
HWY_ASSERT_VEC_EQ(dw, f0,
|
||||
ReorderWidenMulAccumulate(dw, bf0, bfi, f0, sum1));
|
||||
ReorderWidenMulAccumulate(dw, bf0, bf1, f0, sum1));
|
||||
HWY_ASSERT_VEC_EQ(dw, f0, sum1);
|
||||
HWY_ASSERT_VEC_EQ(dw, f0,
|
||||
ReorderWidenMulAccumulate(dw, bfi, bf0, f0, sum1));
|
||||
ReorderWidenMulAccumulate(dw, bf1, bf0, f0, sum1));
|
||||
HWY_ASSERT_VEC_EQ(dw, f0, sum1);
|
||||
|
||||
// delta[p] := 1.0, all others zero. For each p: Dot(delta, all-ones) == 1.
|
||||
for (size_t p = 0; p < 2 * NW; ++p) {
|
||||
delta[p] = 1.0f;
|
||||
const auto delta0 = Load(dw, delta.get() + 0);
|
||||
const auto delta1 = Load(dw, delta.get() + NW);
|
||||
delta[p] = 0.0f;
|
||||
const auto bf_delta = ReorderDemote2To(dn, delta0, delta1);
|
||||
// delta[p] := 1, all others zero. For each p: Dot(delta, all-ones) == 1.
|
||||
auto delta_w = AllocateAligned<TW>(NN);
|
||||
for (size_t p = 0; p < NN; ++p) {
|
||||
// Workaround for incorrect Clang wasm codegen: re-initialize the entire
|
||||
// array rather than zero-initialize once and then toggle lane p.
|
||||
for (size_t i = 0; i < NN; ++i) {
|
||||
delta_w[i] = static_cast<TW>(i == p);
|
||||
}
|
||||
const VW delta0 = Load(dw, delta_w.get());
|
||||
const VW delta1 = Load(dw, delta_w.get() + NN / 2);
|
||||
const VN delta = ReorderDemote2To(dn, delta0, delta1);
|
||||
|
||||
{
|
||||
sum1 = f0;
|
||||
const auto sum0 =
|
||||
ReorderWidenMulAccumulate(dw, bf_delta, bf1, f0, sum1);
|
||||
HWY_ASSERT_EQ(1.0f, GetLane(SumOfLanes(dw, Add(sum0, sum1))));
|
||||
const VW sum0 = ReorderWidenMulAccumulate(dw, delta, bf1, f0, sum1);
|
||||
HWY_ASSERT_EQ(TW{1}, GetLane(SumOfLanes(dw, Add(sum0, sum1))));
|
||||
}
|
||||
// Swapped arg order
|
||||
{
|
||||
sum1 = f0;
|
||||
const auto sum0 =
|
||||
ReorderWidenMulAccumulate(dw, bf1, bf_delta, f0, sum1);
|
||||
HWY_ASSERT_EQ(1.0f, GetLane(SumOfLanes(dw, Add(sum0, sum1))));
|
||||
const VW sum0 = ReorderWidenMulAccumulate(dw, bf1, delta, f0, sum1);
|
||||
HWY_ASSERT_EQ(TW{1}, GetLane(SumOfLanes(dw, Add(sum0, sum1))));
|
||||
}
|
||||
// Start with nonzero sum0 or sum1
|
||||
{
|
||||
sum1 = delta1;
|
||||
const auto sum0 =
|
||||
ReorderWidenMulAccumulate(dw, bf_delta, bf1, delta0, sum1);
|
||||
HWY_ASSERT_EQ(2.0f, GetLane(SumOfLanes(dw, Add(sum0, sum1))));
|
||||
VW sum0 = PromoteTo(dw, LowerHalf(dnh, delta));
|
||||
sum1 = PromoteTo(dw, UpperHalf(dnh, delta));
|
||||
sum0 = ReorderWidenMulAccumulate(dw, delta, bf1, sum0, sum1);
|
||||
HWY_ASSERT_EQ(TW{2}, GetLane(SumOfLanes(dw, Add(sum0, sum1))));
|
||||
}
|
||||
// Start with nonzero sum0 or sum1, and swap arg order
|
||||
{
|
||||
sum1 = delta1;
|
||||
const auto sum0 =
|
||||
ReorderWidenMulAccumulate(dw, bf1, bf_delta, delta0, sum1);
|
||||
HWY_ASSERT_EQ(2.0f, GetLane(SumOfLanes(dw, Add(sum0, sum1))));
|
||||
VW sum0 = PromoteTo(dw, LowerHalf(dnh, delta));
|
||||
sum1 = PromoteTo(dw, UpperHalf(dnh, delta));
|
||||
sum0 = ReorderWidenMulAccumulate(dw, bf1, delta, sum0, sum1);
|
||||
HWY_ASSERT_EQ(TW{2}, GetLane(SumOfLanes(dw, Add(sum0, sum1))));
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -414,6 +423,7 @@ struct TestReorderWidenMulAccumulate {
|
||||
|
||||
HWY_NOINLINE void TestAllReorderWidenMulAccumulate() {
|
||||
ForShrinkableVectors<TestReorderWidenMulAccumulate>()(bfloat16_t());
|
||||
ForShrinkableVectors<TestReorderWidenMulAccumulate>()(int16_t());
|
||||
}
|
||||
|
||||
// NOLINTNEXTLINE(google-readability-namespace-comments)
|
||||
|
@ -54,6 +54,7 @@ struct TestSumOfLanes {
|
||||
|
||||
HWY_NOINLINE void TestAllSumOfLanes() {
|
||||
ForUIF3264(ForPartialVectors<TestSumOfLanes>());
|
||||
ForUI16(ForPartialVectors<TestSumOfLanes>());
|
||||
}
|
||||
|
||||
struct TestMinOfLanes {
|
||||
@ -170,10 +171,8 @@ HWY_NOINLINE void TestAllMinMaxOfLanes() {
|
||||
const ForPartialVectors<TestMaxOfLanes> test_max;
|
||||
ForUIF3264(test_min);
|
||||
ForUIF3264(test_max);
|
||||
test_min(uint16_t());
|
||||
test_max(uint16_t());
|
||||
test_min(int16_t());
|
||||
test_max(int16_t());
|
||||
ForUI16(test_min);
|
||||
ForUI16(test_max);
|
||||
}
|
||||
|
||||
struct TestSumsOf8 {
|
||||
|
1
third_party/highway/hwy/tests/test_util.h
vendored
@ -22,6 +22,7 @@
#include <stdint.h>
#include <string.h>

#include <cmath>  // std::isnan
#include <string>

#include "hwy/aligned_allocator.h"
@ -16,6 +16,8 @@
#include <stddef.h>
#include <stdint.h>

#include <string>

#undef HWY_TARGET_INCLUDE
#define HWY_TARGET_INCLUDE "tests/test_util_test.cc"
#include "hwy/foreach_target.h"  // IWYU pragma: keep
1
third_party/highway/libhwy-test.pc.in
vendored
@ -1,4 +1,5 @@
prefix=@CMAKE_INSTALL_PREFIX@
exec_prefix=${prefix}
libdir=${exec_prefix}/@CMAKE_INSTALL_LIBDIR@
includedir=${prefix}/@CMAKE_INSTALL_INCLUDEDIR@
6
third_party/highway/run_tests.sh
vendored
@ -19,11 +19,11 @@ cd ..
rm -rf build

#######################################
echo DEBUG Clang 7
echo DEBUG Clang 9
rm -rf build_dbg
mkdir build_dbg
cd build_dbg
CXX=clang++-7 CC=clang-7 cmake .. -DHWY_WARNINGS_ARE_ERRORS:BOOL=ON -DCMAKE_BUILD_TYPE=Debug
CXX=clang++-9 CC=clang-9 cmake .. -DHWY_WARNINGS_ARE_ERRORS:BOOL=ON -DCMAKE_BUILD_TYPE=Debug
make -j
ctest -j
cd ..
@ -41,7 +41,7 @@ cd ..
rm -rf build_32

#######################################
for VER in 8 9 10; do
for VER in 10 11 12; do
  echo GCC $VER
  rm -rf build_g$VER
  mkdir build_g$VER
2
third_party/jpeg-xl/AUTHORS
vendored
@ -24,6 +24,7 @@ Aous Naman <aous@unsw.edu.au>
|
||||
Artem Selishchev
|
||||
Biswapriyo Nath <nathbappai@gmail.com>
|
||||
CanadianBaconBoi <beamconnor@gmail.com>
|
||||
Damiano Albani <damiano.albani@gmail.com>
|
||||
Daniel Novomeský <dnovomesky@gmail.com>
|
||||
David Burnett <vargolsoft@gmail.com>
|
||||
Dirk Lemstra <dirk@lemstra.org>
|
||||
@ -31,6 +32,7 @@ Don Olmstead <don.j.olmstead@gmail.com>
|
||||
Even Rouault <even.rouault@spatialys.com>
|
||||
Fred Brennan <copypaste@kittens.ph>
|
||||
Heiko Becker <heirecka@exherbo.org>
|
||||
Jim Robinson <jimbo2150@gmail.com>
|
||||
Jon Sneyers <jon@cloudinary.com>
|
||||
Kai Hollberg <Schweinepriester@users.noreply.github.com>
|
||||
Kleis Auke Wolthuizen <github@kleisauke.nl>
|
||||
|
1
third_party/jpeg-xl/CHANGELOG.md
vendored
@ -12,6 +12,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
  of the input buffer.
- decoder API: new function `JxlDecoderSetImageBitDepth` to set the bit depth
  of the output buffer.
- encoder API: add an effort 10 option for lossless compression.

## [0.7] - 2022-07-21
4
third_party/jpeg-xl/CMakeLists.txt
vendored
@ -100,6 +100,10 @@ set(JPEGXL_ENABLE_DEVTOOLS false CACHE BOOL
    "Build JPEGXL developer tools.")
set(JPEGXL_ENABLE_TOOLS true CACHE BOOL
    "Build JPEGXL user tools: cjxl and djxl.")
set(JPEGXL_ENABLE_JPEGLI true CACHE BOOL
    "Build jpegli library.")
set(JPEGXL_ENABLE_JPEGLI_LIBJPEG true CACHE BOOL
    "Build libjpeg.so shared library based on jpegli.")
set(JPEGXL_ENABLE_DOXYGEN true CACHE BOOL
    "Generate C API documentation using Doxygen.")
set(JPEGXL_ENABLE_MANPAGES true CACHE BOOL
7
third_party/jpeg-xl/ci.sh
vendored
@ -69,6 +69,11 @@ if [[ "${ENABLE_WASM_SIMD}" -ne "0" ]]; then
|
||||
CMAKE_EXE_LINKER_FLAGS="${CMAKE_EXE_LINKER_FLAGS} -msimd128"
|
||||
fi
|
||||
|
||||
if [[ "${ENABLE_WASM_SIMD}" -eq "2" ]]; then
|
||||
CMAKE_CXX_FLAGS="${CMAKE_CXX_FLAGS} -DHWY_WANT_WASM2"
|
||||
CMAKE_C_FLAGS="${CMAKE_C_FLAGS} -DHWY_WANT_WASM2"
|
||||
fi
|
||||
|
||||
if [[ ! -z "${HWY_BASELINE_TARGETS}" ]]; then
|
||||
CMAKE_CXX_FLAGS="${CMAKE_CXX_FLAGS} -DHWY_BASELINE_TARGETS=${HWY_BASELINE_TARGETS}"
|
||||
fi
|
||||
@ -139,6 +144,7 @@ detect_clang_version() {
|
||||
fi
|
||||
local clang_version=$("${CC:-clang}" --version | head -n1)
|
||||
clang_version=${clang_version#"Debian "}
|
||||
clang_version=${clang_version#"Ubuntu "}
|
||||
local llvm_tag
|
||||
case "${clang_version}" in
|
||||
"clang version 6."*)
|
||||
@ -547,6 +553,7 @@ cmd_coverage_report() {
|
||||
# Only print coverage information for the libjxl directories. The rest
|
||||
# is not part of the code under test.
|
||||
--filter '.*jxl/.*'
|
||||
--exclude '.*_gbench.cc'
|
||||
--exclude '.*_test.cc'
|
||||
--exclude '.*_testonly..*'
|
||||
--exclude '.*_debug.*'
|
||||
|
1
third_party/jpeg-xl/debian/rules
vendored
@ -14,4 +14,5 @@ override_dh_auto_configure:
		-DJPEGXL_FORCE_SYSTEM_GTEST=ON \
		-DJPEGXL_FORCE_SYSTEM_BROTLI=ON \
		-DJPEGXL_FORCE_SYSTEM_HWY=ON \
		-DJPEGXL_ENABLE_JPEGLI_LIBJPEG=OFF \
		-DJPEGXL_ENABLE_PLUGINS=ON
2
third_party/jpeg-xl/deps.sh
vendored
@ -14,7 +14,7 @@ MYDIR=$(dirname $(realpath "$0"))
# Git revisions we use for the given submodules. Update these whenever you
# update a git submodule.
THIRD_PARTY_BROTLI="35ef5c554d888bef217d449346067de05e269b30"
THIRD_PARTY_HIGHWAY="22e3d7276f4157d4a47586ba9fd91dd6303f441a"
THIRD_PARTY_HIGHWAY="f670ea580bb70b4113b63b9cdaa42ba9b10cd13a"
THIRD_PARTY_SKCMS="b25b07b4b07990811de121c0356155b2ba0f4318"
THIRD_PARTY_SJPEG="868ab558fad70fcbe8863ba4e85179eeb81cc840"
THIRD_PARTY_ZLIB="cacf7f1d4e3d44d871b605da3b647f07d718623f"
10
third_party/jpeg-xl/experimental/fast_lossless/README.md
vendored
Normal file
@ -0,0 +1,10 @@
# Fast-lossless
This is a script to compile a standalone version of a JXL encoder that supports
lossless compression, up to 16 bits, of 1- to 4-channel images and animations; it is
very fast and compression is slightly worse than PNG for 8-bit nonphoto content
and better or much better than PNG for all other situations.

The main encoder is made out of two files, `lib/jxl/enc_fast_lossless.{cc,h}`;
it automatically selects and runs a SIMD implementation supported by your CPU.

This folder contains an example build script and `main` file.
@ -20,7 +20,8 @@ fi
|
||||
[ -f lodepng.h ] || curl -o lodepng.h --url 'https://raw.githubusercontent.com/lvandeve/lodepng/8c6a9e30576f07bf470ad6f09458a2dcd7a6a84a/lodepng.h'
|
||||
[ -f lodepng.o ] || "$CXX" lodepng.cpp -O3 -o lodepng.o -c
|
||||
|
||||
"$CXX" -O3 -DFASTLL_ENABLE_NEON_INTRINSICS -fopenmp \
|
||||
"$CXX" -O3 \
|
||||
-I. lodepng.o \
|
||||
"${DIR}"/fast_lossless.cc "${DIR}"/fast_lossless_main.cc \
|
||||
-I"${DIR}"/../../ \
|
||||
"${DIR}"/../../lib/jxl/enc_fast_lossless.cc "${DIR}"/fast_lossless_main.cc \
|
||||
-o fast_lossless
|
||||
|
@ -18,9 +18,10 @@ fi
|
||||
|
||||
[ -f lodepng.cpp ] || curl -o lodepng.cpp --url 'https://raw.githubusercontent.com/lvandeve/lodepng/8c6a9e30576f07bf470ad6f09458a2dcd7a6a84a/lodepng.cpp'
|
||||
[ -f lodepng.h ] || curl -o lodepng.h --url 'https://raw.githubusercontent.com/lvandeve/lodepng/8c6a9e30576f07bf470ad6f09458a2dcd7a6a84a/lodepng.h'
|
||||
[ -f lodepng.o ] || "$CXX" lodepng.cpp -O3 -mavx2 -o lodepng.o -c
|
||||
[ -f lodepng.o ] || "$CXX" lodepng.cpp -O3 -o lodepng.o -c
|
||||
|
||||
"$CXX" -O3 -mavx2 -DFASTLL_ENABLE_AVX2_INTRINSICS -fopenmp \
|
||||
-I. lodepng.o \
|
||||
"$DIR"/fast_lossless.cc "$DIR"/fast_lossless_main.cc \
|
||||
"$CXX" -O3 \
|
||||
-I. -g lodepng.o \
|
||||
-I"$DIR"/../../ \
|
||||
"$DIR"/../../lib/jxl/enc_fast_lossless.cc "$DIR"/fast_lossless_main.cc \
|
||||
-o fast_lossless
|
||||
|
26
third_party/jpeg-xl/experimental/fast_lossless/cross_compile_aarch64.sh
vendored
Normal file
@ -0,0 +1,26 @@
|
||||
#!/usr/bin/env bash
|
||||
# Copyright (c) the JPEG XL Project Authors. All rights reserved.
|
||||
#
|
||||
# Use of this source code is governed by a BSD-style
|
||||
# license that can be found in the LICENSE file.
|
||||
set -e
|
||||
|
||||
DIR=$(realpath "$(dirname "$0")")
|
||||
mkdir -p "$DIR"/build-aarch64
|
||||
cd "$DIR"/build-aarch64
|
||||
|
||||
CXX="${CXX-aarch64-linux-gnu-c++}"
|
||||
if ! command -v "$CXX" >/dev/null ; then
|
||||
printf >&2 '%s: C++ compiler not found\n' "${0##*/}"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
[ -f lodepng.cpp ] || curl -o lodepng.cpp --url 'https://raw.githubusercontent.com/lvandeve/lodepng/8c6a9e30576f07bf470ad6f09458a2dcd7a6a84a/lodepng.cpp'
|
||||
[ -f lodepng.h ] || curl -o lodepng.h --url 'https://raw.githubusercontent.com/lvandeve/lodepng/8c6a9e30576f07bf470ad6f09458a2dcd7a6a84a/lodepng.h'
|
||||
[ -f lodepng.o ] || "$CXX" lodepng.cpp -O3 -o lodepng.o -c
|
||||
|
||||
"$CXX" -O3 -static \
|
||||
-I. lodepng.o \
|
||||
-I"$DIR"/../../ \
|
||||
"$DIR"/../../lib/jxl/enc_fast_lossless.cc "$DIR"/fast_lossless_main.cc \
|
||||
-o fast_lossless
|
File diff suppressed because it is too large
@ -1,23 +0,0 @@
|
||||
// Copyright (c) the JPEG XL Project Authors. All rights reserved.
|
||||
//
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
#ifndef FAST_LOSSLESS_H
|
||||
#define FAST_LOSSLESS_H
|
||||
#include <stdlib.h>
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
size_t JxlFastLosslessEncode(const unsigned char* rgba, size_t width,
|
||||
size_t row_stride, size_t height, size_t nb_chans,
|
||||
size_t bitdepth, int effort,
|
||||
unsigned char** output);
|
||||
|
||||
#ifdef __cplusplus
|
||||
} // extern "C"
|
||||
#endif
|
||||
|
||||
#endif
|
@ -7,16 +7,20 @@
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
|
||||
#include <atomic>
|
||||
#include <chrono>
|
||||
#include <thread>
|
||||
#include <vector>
|
||||
|
||||
#include "fast_lossless.h"
|
||||
#include "lib/jxl/enc_fast_lossless.h"
|
||||
#include "lodepng.h"
|
||||
#include "pam-input.h"
|
||||
|
||||
int main(int argc, char** argv) {
|
||||
if (argc < 3) {
|
||||
fprintf(stderr, "Usage: %s in.png out.jxl [effort] [num_reps]\n", argv[0]);
|
||||
fprintf(stderr,
|
||||
"Usage: %s in.png out.jxl [effort] [num_reps] [num_threads]\n",
|
||||
argv[0]);
|
||||
return 1;
|
||||
}
|
||||
|
||||
@ -24,6 +28,7 @@ int main(int argc, char** argv) {
|
||||
const char* out = argv[2];
|
||||
int effort = argc >= 4 ? atoi(argv[3]) : 2;
|
||||
size_t num_reps = argc >= 5 ? atoi(argv[4]) : 1;
|
||||
size_t num_threads = argc >= 6 ? atoi(argv[5]) : 0;
|
||||
|
||||
if (effort < 0 || effort > 127) {
|
||||
fprintf(
|
||||
@ -44,6 +49,35 @@ int main(int argc, char** argv) {
|
||||
return 1;
|
||||
}
|
||||
|
||||
auto parallel_runner = [](void* num_threads_ptr, void* opaque,
|
||||
void fun(void*, size_t), size_t count) {
|
||||
size_t num_threads = *(size_t*)num_threads_ptr;
|
||||
if (num_threads == 0) {
|
||||
num_threads = std::thread::hardware_concurrency();
|
||||
}
|
||||
if (num_threads > count) {
|
||||
num_threads = count;
|
||||
}
|
||||
if (num_threads == 1) {
|
||||
for (size_t i = 0; i < count; i++) {
|
||||
fun(opaque, i);
|
||||
}
|
||||
} else {
|
||||
std::atomic<int> task{0};
|
||||
std::vector<std::thread> threads;
|
||||
for (size_t i = 0; i < num_threads; i++) {
|
||||
threads.push_back(std::thread([count, opaque, fun, &task]() {
|
||||
while (true) {
|
||||
int t = task++;
|
||||
if (t >= count) break;
|
||||
fun(opaque, t);
|
||||
}
|
||||
}));
|
||||
}
|
||||
for (auto& t : threads) t.join();
|
||||
}
|
||||
};
|
||||
|
||||
size_t encoded_size = 0;
|
||||
unsigned char* encoded = nullptr;
|
||||
size_t stride = width * nb_chans * (bitdepth > 8 ? 2 : 1);
|
||||
@ -51,8 +85,9 @@ int main(int argc, char** argv) {
|
||||
auto start = std::chrono::high_resolution_clock::now();
|
||||
for (size_t _ = 0; _ < num_reps; _++) {
|
||||
free(encoded);
|
||||
encoded_size = JxlFastLosslessEncode(png, width, stride, height, nb_chans,
|
||||
bitdepth, effort, &encoded);
|
||||
encoded_size = JxlFastLosslessEncode(
|
||||
png, width, stride, height, nb_chans, bitdepth,
|
||||
/*big_endian=*/true, effort, &encoded, &num_threads, +parallel_runner);
|
||||
}
|
||||
auto stop = std::chrono::high_resolution_clock::now();
|
||||
if (num_reps > 1) {
|
||||
|
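An illustrative aside rather than libjxl API: the parallel_runner callback added to fast_lossless_main.cc above hands out task indices from a shared atomic counter, so idle threads keep claiming work until `count` tasks have run. The same pattern as a standalone sketch (RunParallel is a hypothetical helper, not part of the library):

// Illustrative only: atomic-counter work distribution, as used by the
// parallel_runner lambda in the hunk above.
#include <atomic>
#include <cstdio>
#include <thread>
#include <vector>

void RunParallel(size_t num_threads, size_t count, void (*fun)(void*, size_t),
                 void* opaque) {
  if (num_threads <= 1) {
    for (size_t i = 0; i < count; i++) fun(opaque, i);
    return;
  }
  std::atomic<size_t> task{0};
  std::vector<std::thread> threads;
  for (size_t i = 0; i < num_threads; i++) {
    // Each worker repeatedly claims the next unprocessed index.
    threads.emplace_back([&] {
      for (size_t t = task++; t < count; t = task++) fun(opaque, t);
    });
  }
  for (auto& t : threads) t.join();
}

int main() {
  std::atomic<size_t> sum{0};
  auto add = [](void* p, size_t i) {
    static_cast<std::atomic<size_t>*>(p)->fetch_add(i);
  };
  RunParallel(4, 100, +add, &sum);
  std::printf("%zu\n", sum.load());  // 4950 = 0 + 1 + ... + 99
  return 0;
}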
@ -270,8 +270,8 @@ bool DecodePAM(const char* filename, uint8_t** buffer, size_t* w, size_t* h,
  const uint8_t* pos = nullptr;
  if (!parser.ParseHeader(&header, &pos)) return false;

  if (header.bits_per_sample == 0 || header.bits_per_sample > 12) {
    return error_msg("PNM: bits_per_sample invalid (can do at most 12-bit)");
  if (header.bits_per_sample == 0 || header.bits_per_sample > 16) {
    return error_msg("PNM: bits_per_sample invalid (can do at most 16-bit)");
  }
  *w = header.xsize;
  *h = header.ysize;
14
third_party/jpeg-xl/lib/CMakeLists.txt
vendored
@ -132,6 +132,15 @@ set(JPEGXL_COVERAGE_FLAGS
|
||||
endif() # JPEGXL_ENABLE_COVERAGE
|
||||
endif() #!MSVC
|
||||
|
||||
# strips the -static suffix from all the elements in LIST
|
||||
function(strip_static OUTPUT_VAR LIB_LIST)
|
||||
foreach(lib IN LISTS ${LIB_LIST})
|
||||
string(REGEX REPLACE "-static$" "" lib "${lib}")
|
||||
list(APPEND out_list "${lib}")
|
||||
endforeach()
|
||||
set(${OUTPUT_VAR} ${out_list} PARENT_SCOPE)
|
||||
endfunction()
|
||||
|
||||
# The jxl library definition.
|
||||
include(jxl.cmake)
|
||||
|
||||
@ -140,6 +149,11 @@ if(JPEGXL_ENABLE_TOOLS)
|
||||
include(jxl_extras.cmake)
|
||||
endif()
|
||||
include(jxl_threads.cmake)
|
||||
# We only build JPEGLI on linux for now.
|
||||
find_package(JPEG)
|
||||
if (JPEG_FOUND AND JPEGXL_ENABLE_JPEGLI)
|
||||
include(jpegli.cmake)
|
||||
endif()
|
||||
|
||||
# Install all the library headers from the source and the generated ones. There
|
||||
# is no distinction on which libraries use which header since it is expected
|
||||
|
111
third_party/jpeg-xl/lib/extras/codec_test.cc
vendored
@ -14,14 +14,17 @@
|
||||
#include <utility>
|
||||
#include <vector>
|
||||
|
||||
#include "lib/extras/dec/jpegli.h"
|
||||
#include "lib/extras/dec/pgx.h"
|
||||
#include "lib/extras/dec/pnm.h"
|
||||
#include "lib/extras/enc/encode.h"
|
||||
#include "lib/extras/encode_jpeg.h"
|
||||
#include "lib/extras/packed_image_convert.h"
|
||||
#include "lib/jxl/base/printf_macros.h"
|
||||
#include "lib/jxl/base/random.h"
|
||||
#include "lib/jxl/base/thread_pool_internal.h"
|
||||
#include "lib/jxl/color_management.h"
|
||||
#include "lib/jxl/enc_butteraugli_comparator.h"
|
||||
#include "lib/jxl/enc_color_management.h"
|
||||
#include "lib/jxl/image.h"
|
||||
#include "lib/jxl/image_bundle.h"
|
||||
@ -174,6 +177,7 @@ struct TestImageParams {
|
||||
bool add_alpha;
|
||||
bool big_endian;
|
||||
bool add_extra_channels;
|
||||
bool jpegli_decode = false;
|
||||
|
||||
bool ShouldTestRoundtrip() const {
|
||||
if (codec == Codec::kPNG) {
|
||||
@ -273,11 +277,32 @@ void TestRoundTrip(const TestImageParams& params, ThreadPool* pool) {
|
||||
color_hints.Add("color_space",
|
||||
params.is_gray ? "Gra_D65_Rel_SRG" : "RGB_D65_SRG_Rel_SRG");
|
||||
}
|
||||
ASSERT_TRUE(DecodeBytes(Span<const uint8_t>(encoded.bitstreams[0]),
|
||||
color_hints, SizeConstraints(), &ppf_out));
|
||||
|
||||
if (params.codec != Codec::kPNM && params.codec != Codec::kPGX &&
|
||||
params.codec != Codec::kEXR) {
|
||||
if (params.codec == Codec::kJPG && params.jpegli_decode) {
|
||||
#if JPEGXL_ENABLE_JPEG
|
||||
ASSERT_TRUE(
|
||||
DecodeJpeg(encoded.bitstreams[0], JXL_TYPE_UINT8, pool, &ppf_out));
|
||||
#endif
|
||||
} else {
|
||||
ASSERT_TRUE(DecodeBytes(Span<const uint8_t>(encoded.bitstreams[0]),
|
||||
color_hints, SizeConstraints(), &ppf_out));
|
||||
}
|
||||
if (params.codec == Codec::kPNG && ppf_out.icc.empty()) {
|
||||
// Decoding a PNG may drop the ICC profile if there's a valid cICP chunk.
|
||||
// Rendering intent is not preserved in this case.
|
||||
EXPECT_EQ(ppf_in.color_encoding.color_space,
|
||||
ppf_out.color_encoding.color_space);
|
||||
EXPECT_EQ(ppf_in.color_encoding.white_point,
|
||||
ppf_out.color_encoding.white_point);
|
||||
if (ppf_in.color_encoding.color_space != JXL_COLOR_SPACE_GRAY) {
|
||||
EXPECT_EQ(ppf_in.color_encoding.primaries,
|
||||
ppf_out.color_encoding.primaries);
|
||||
}
|
||||
EXPECT_EQ(ppf_in.color_encoding.transfer_function,
|
||||
ppf_out.color_encoding.transfer_function);
|
||||
EXPECT_EQ(ppf_out.color_encoding.rendering_intent,
|
||||
JXL_RENDERING_INTENT_RELATIVE);
|
||||
} else if (params.codec != Codec::kPNM && params.codec != Codec::kPGX &&
|
||||
params.codec != Codec::kEXR) {
|
||||
EXPECT_EQ(ppf_in.icc, ppf_out.icc);
|
||||
}
|
||||
|
||||
@ -322,6 +347,10 @@ TEST(CodecTest, TestRoundTrip) {
|
||||
params.add_extra_channels = true;
|
||||
TestRoundTrip(params, &pool);
|
||||
}
|
||||
if (codec == Codec::kJPG) {
|
||||
params.jpegli_decode = true;
|
||||
TestRoundTrip(params, &pool);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -362,6 +391,78 @@ TEST(CodecTest, LosslessPNMRoundtrip) {
|
||||
}
|
||||
}
|
||||
|
||||
#if JPEGXL_ENABLE_JPEG
|
||||
TEST(CodecTest, JpegliXYBEncodeTest) {
|
||||
ThreadPool* pool = nullptr;
|
||||
CodecInOut io;
|
||||
const PaddedBytes orig =
|
||||
ReadTestData("jxl/flower/flower_small.rgb.depth8.ppm");
|
||||
ASSERT_TRUE(SetFromBytes(Span<const uint8_t>(orig), ColorHints(), &io));
|
||||
|
||||
std::vector<uint8_t> compressed;
|
||||
JpegSettings settings;
|
||||
settings.xyb = true;
|
||||
ASSERT_TRUE(EncodeJpeg(io.Main(), settings, pool, &compressed));
|
||||
|
||||
CodecInOut io2;
|
||||
ASSERT_TRUE(
|
||||
SetFromBytes(Span<const uint8_t>(compressed), ColorHints(), &io2));
|
||||
|
||||
double bpp = compressed.size() * 8.0 / (io.xsize() * io.ysize());
|
||||
EXPECT_THAT(bpp, IsSlightlyBelow(1.5f));
|
||||
EXPECT_THAT(ButteraugliDistance(io, io2, ButteraugliParams(), GetJxlCms(),
|
||||
/*distmap=*/nullptr, nullptr),
|
||||
IsSlightlyBelow(1.3f));
|
||||
}
|
||||
|
||||
TEST(CodecTest, JpegliYUVEncodeTest) {
|
||||
ThreadPool* pool = nullptr;
|
||||
CodecInOut io;
|
||||
const PaddedBytes orig =
|
||||
ReadTestData("jxl/flower/flower_small.rgb.depth8.ppm");
|
||||
ASSERT_TRUE(SetFromBytes(Span<const uint8_t>(orig), ColorHints(), &io));
|
||||
|
||||
std::vector<uint8_t> compressed;
|
||||
JpegSettings settings;
|
||||
settings.xyb = false;
|
||||
ASSERT_TRUE(EncodeJpeg(io.Main(), settings, pool, &compressed));
|
||||
|
||||
CodecInOut io2;
|
||||
ASSERT_TRUE(
|
||||
SetFromBytes(Span<const uint8_t>(compressed), ColorHints(), &io2));
|
||||
|
||||
double bpp = compressed.size() * 8.0 / (io.xsize() * io.ysize());
|
||||
EXPECT_THAT(bpp, IsSlightlyBelow(2.3f));
|
||||
EXPECT_THAT(ButteraugliDistance(io, io2, ButteraugliParams(), GetJxlCms(),
|
||||
/*distmap=*/nullptr, nullptr),
|
||||
IsSlightlyBelow(1.3f));
|
||||
}
|
||||
|
||||
TEST(CodecTest, Jpegli16bitRoundtripTest) {
|
||||
ThreadPool* pool = nullptr;
|
||||
CodecInOut io;
|
||||
const PaddedBytes orig = ReadTestData(
|
||||
"external/raw.pixls/"
|
||||
"Google-Pixel2XL-16bit_srgb8_v4_krita.png");
|
||||
ASSERT_TRUE(SetFromBytes(Span<const uint8_t>(orig), ColorHints(), &io));
|
||||
|
||||
std::vector<uint8_t> compressed;
|
||||
JpegSettings settings;
|
||||
settings.xyb = false;
|
||||
ASSERT_TRUE(EncodeJpeg(io.Main(), settings, pool, &compressed));
|
||||
|
||||
PackedPixelFile ppf_out;
|
||||
ASSERT_TRUE(DecodeJpeg(compressed, JXL_TYPE_UINT16, pool, &ppf_out));
|
||||
CodecInOut io2;
|
||||
ASSERT_TRUE(ConvertPackedPixelFileToCodecInOut(ppf_out, pool, &io2));
|
||||
|
||||
EXPECT_THAT(compressed.size(), IsSlightlyBelow(3500u));
|
||||
EXPECT_THAT(ButteraugliDistance(io, io2, ButteraugliParams(), GetJxlCms(),
|
||||
/*distmap=*/nullptr, nullptr),
|
||||
IsSlightlyBelow(1.13f));
|
||||
}
|
||||
#endif
|
||||
|
||||
CodecInOut DecodeRoundtrip(const std::string& pathname, ThreadPool* pool,
|
||||
const ColorHints& color_hints = ColorHints()) {
|
||||
CodecInOut io;
third_party/jpeg-xl/lib/extras/dec/apng.cc | 164 (vendored)
@ -76,11 +76,145 @@ Status DecodeSRGB(const unsigned char* payload, const size_t payload_size,
|
||||
if (payload_size != 1) return JXL_FAILURE("Wrong sRGB size");
|
||||
// (PNG uses the same values as ICC.)
|
||||
if (payload[0] >= 4) return JXL_FAILURE("Invalid Rendering Intent");
|
||||
color_encoding->white_point = JXL_WHITE_POINT_D65;
|
||||
color_encoding->primaries = JXL_PRIMARIES_SRGB;
|
||||
color_encoding->transfer_function = JXL_TRANSFER_FUNCTION_SRGB;
|
||||
color_encoding->rendering_intent =
|
||||
static_cast<JxlRenderingIntent>(payload[0]);
|
||||
return true;
|
||||
}
|
||||
|
||||
// If the cICP profile is not fully supported, return false and leave
|
||||
// color_encoding unmodified.
|
||||
Status DecodeCICP(const unsigned char* payload, const size_t payload_size,
|
||||
JxlColorEncoding* color_encoding) {
|
||||
if (payload_size != 4) return JXL_FAILURE("Wrong cICP size");
|
||||
JxlColorEncoding color_enc = *color_encoding;
|
||||
|
||||
// From https://www.itu.int/rec/T-REC-H.273-202107-I/en
|
||||
if (payload[0] == 1) {
|
||||
// IEC 61966-2-1 sRGB
|
||||
color_enc.primaries = JXL_PRIMARIES_SRGB;
|
||||
color_enc.white_point = JXL_WHITE_POINT_D65;
|
||||
} else if (payload[0] == 4) {
|
||||
// Rec. ITU-R BT.470-6 System M
|
||||
color_enc.primaries = JXL_PRIMARIES_CUSTOM;
|
||||
color_enc.primaries_red_xy[0] = 0.67;
|
||||
color_enc.primaries_red_xy[1] = 0.33;
|
||||
color_enc.primaries_green_xy[0] = 0.21;
|
||||
color_enc.primaries_green_xy[1] = 0.71;
|
||||
color_enc.primaries_blue_xy[0] = 0.14;
|
||||
color_enc.primaries_blue_xy[1] = 0.08;
|
||||
color_enc.white_point = JXL_WHITE_POINT_CUSTOM;
|
||||
color_enc.white_point_xy[0] = 0.310;
|
||||
color_enc.white_point_xy[1] = 0.316;
|
||||
} else if (payload[0] == 5) {
|
||||
// Rec. ITU-R BT.1700-0 625 PAL and 625 SECAM
|
||||
color_enc.primaries = JXL_PRIMARIES_CUSTOM;
|
||||
color_enc.primaries_red_xy[0] = 0.64;
|
||||
color_enc.primaries_red_xy[1] = 0.33;
|
||||
color_enc.primaries_green_xy[0] = 0.29;
|
||||
color_enc.primaries_green_xy[1] = 0.60;
|
||||
color_enc.primaries_blue_xy[0] = 0.15;
|
||||
color_enc.primaries_blue_xy[1] = 0.06;
|
||||
color_enc.white_point = JXL_WHITE_POINT_D65;
|
||||
} else if (payload[0] == 6 || payload[0] == 7) {
|
||||
// SMPTE ST 170 (2004) / SMPTE ST 240 (1999)
|
||||
color_enc.primaries = JXL_PRIMARIES_CUSTOM;
|
||||
color_enc.primaries_red_xy[0] = 0.630;
|
||||
color_enc.primaries_red_xy[1] = 0.340;
|
||||
color_enc.primaries_green_xy[0] = 0.310;
|
||||
color_enc.primaries_green_xy[1] = 0.595;
|
||||
color_enc.primaries_blue_xy[0] = 0.155;
|
||||
color_enc.primaries_blue_xy[1] = 0.070;
|
||||
color_enc.white_point = JXL_WHITE_POINT_D65;
|
||||
} else if (payload[0] == 8) {
|
||||
// Generic film (colour filters using Illuminant C)
|
||||
color_enc.primaries = JXL_PRIMARIES_CUSTOM;
|
||||
color_enc.primaries_red_xy[0] = 0.681;
|
||||
color_enc.primaries_red_xy[1] = 0.319;
|
||||
color_enc.primaries_green_xy[0] = 0.243;
|
||||
color_enc.primaries_green_xy[1] = 0.692;
|
||||
color_enc.primaries_blue_xy[0] = 0.145;
|
||||
color_enc.primaries_blue_xy[1] = 0.049;
|
||||
color_enc.white_point = JXL_WHITE_POINT_CUSTOM;
|
||||
color_enc.white_point_xy[0] = 0.310;
|
||||
color_enc.white_point_xy[1] = 0.316;
|
||||
} else if (payload[0] == 9) {
|
||||
// Rec. ITU-R BT.2100-2
|
||||
color_enc.primaries = JXL_PRIMARIES_2100;
|
||||
color_enc.white_point = JXL_WHITE_POINT_D65;
|
||||
} else if (payload[0] == 10) {
|
||||
// CIE 1931 XYZ
|
||||
color_enc.primaries = JXL_PRIMARIES_CUSTOM;
|
||||
color_enc.primaries_red_xy[0] = 1;
|
||||
color_enc.primaries_red_xy[1] = 0;
|
||||
color_enc.primaries_green_xy[0] = 0;
|
||||
color_enc.primaries_green_xy[1] = 1;
|
||||
color_enc.primaries_blue_xy[0] = 0;
|
||||
color_enc.primaries_blue_xy[1] = 0;
|
||||
color_enc.white_point = JXL_WHITE_POINT_E;
|
||||
} else if (payload[0] == 11) {
|
||||
// SMPTE RP 431-2 (2011)
|
||||
color_enc.primaries = JXL_PRIMARIES_P3;
|
||||
color_enc.white_point = JXL_WHITE_POINT_DCI;
|
||||
} else if (payload[0] == 12) {
|
||||
// SMPTE EG 432-1 (2010)
|
||||
color_enc.primaries = JXL_PRIMARIES_P3;
|
||||
color_enc.white_point = JXL_WHITE_POINT_D65;
|
||||
} else if (payload[0] == 22) {
|
||||
color_enc.primaries = JXL_PRIMARIES_CUSTOM;
|
||||
color_enc.primaries_red_xy[0] = 0.630;
|
||||
color_enc.primaries_red_xy[1] = 0.340;
|
||||
color_enc.primaries_green_xy[0] = 0.295;
|
||||
color_enc.primaries_green_xy[1] = 0.605;
|
||||
color_enc.primaries_blue_xy[0] = 0.155;
|
||||
color_enc.primaries_blue_xy[1] = 0.077;
|
||||
color_enc.white_point = JXL_WHITE_POINT_D65;
|
||||
} else {
|
||||
JXL_WARNING("Unsupported primaries specified in cICP chunk: %d",
|
||||
static_cast<int>(payload[0]));
|
||||
return false;
|
||||
}
|
||||
|
||||
if (payload[1] == 1 || payload[1] == 6 || payload[1] == 14 ||
|
||||
payload[1] == 15) {
|
||||
// Rec. ITU-R BT.709-6
|
||||
color_enc.transfer_function = JXL_TRANSFER_FUNCTION_709;
|
||||
} else if (payload[1] == 4) {
|
||||
// Rec. ITU-R BT.1700-0 625 PAL and 625 SECAM
|
||||
color_enc.transfer_function = JXL_TRANSFER_FUNCTION_GAMMA;
|
||||
color_enc.gamma = 1 / 2.2;
|
||||
} else if (payload[1] == 5) {
|
||||
// Rec. ITU-R BT.470-6 System B, G
|
||||
color_enc.transfer_function = JXL_TRANSFER_FUNCTION_GAMMA;
|
||||
color_enc.gamma = 1 / 2.8;
|
||||
} else if (payload[1] == 8 || payload[1] == 13 || payload[1] == 16 ||
|
||||
payload[1] == 17 || payload[1] == 18) {
|
||||
// These codes all match the corresponding JXL enum values
|
||||
color_enc.transfer_function = static_cast<JxlTransferFunction>(payload[1]);
|
||||
} else {
|
||||
JXL_WARNING("Unsupported transfer function specified in cICP chunk: %d",
|
||||
static_cast<int>(payload[1]));
|
||||
return false;
|
||||
}
|
||||
|
||||
if (payload[2] != 0) {
|
||||
JXL_WARNING("Unsupported color space specified in cICP chunk: %d",
|
||||
static_cast<int>(payload[2]));
|
||||
return false;
|
||||
}
|
||||
if (payload[3] != 1) {
|
||||
JXL_WARNING("Unsupported full-range flag specified in cICP chunk: %d",
|
||||
static_cast<int>(payload[3]));
|
||||
return false;
|
||||
}
|
||||
// cICP has no rendering intent, so use the default
|
||||
color_enc.rendering_intent = JXL_RENDERING_INTENT_RELATIVE;
|
||||
*color_encoding = color_enc;
|
||||
return true;
|
||||
}
|
||||
|
||||
Status DecodeGAMA(const unsigned char* payload, const size_t payload_size,
|
||||
JxlColorEncoding* color_encoding) {
|
||||
if (payload_size != 4) return JXL_FAILURE("Wrong gAMA size");
|
||||
@ -286,6 +420,7 @@ constexpr uint32_t kId_fcTL = 0x4C546366;
|
||||
constexpr uint32_t kId_IDAT = 0x54414449;
|
||||
constexpr uint32_t kId_fdAT = 0x54416466;
|
||||
constexpr uint32_t kId_IEND = 0x444E4549;
|
||||
constexpr uint32_t kId_cICP = 0x50434963;
|
||||
constexpr uint32_t kId_iCCP = 0x50434369;
|
||||
constexpr uint32_t kId_sRGB = 0x42475273;
|
||||
constexpr uint32_t kId_gAMA = 0x414D4167;
|
||||
@ -469,7 +604,8 @@ Status DecodeImageAPNG(const Span<const uint8_t> bytes,
|
||||
|
||||
ppf->frames.clear();
|
||||
|
||||
bool have_color = false, have_srgb = false;
|
||||
bool have_color = false;
|
||||
bool have_cicp = false, have_iccp = false, have_srgb = false;
|
||||
bool errorstate = true;
|
||||
if (id == kId_IHDR && chunkIHDR.size() == 25) {
|
||||
x0 = 0;
|
||||
@ -490,6 +626,7 @@ Status DecodeImageAPNG(const Span<const uint8_t> bytes,
|
||||
ppf->color_encoding.white_point = JXL_WHITE_POINT_D65;
|
||||
ppf->color_encoding.primaries = JXL_PRIMARIES_SRGB;
|
||||
ppf->color_encoding.transfer_function = JXL_TRANSFER_FUNCTION_SRGB;
|
||||
ppf->color_encoding.rendering_intent = JXL_RENDERING_INTENT_RELATIVE;
|
||||
|
||||
if (!processing_start(png_ptr, info_ptr, (void*)&frameRaw, hasInfo,
|
||||
chunkIHDR, chunksInfo)) {
|
||||
@ -625,7 +762,17 @@ Status DecodeImageAPNG(const Span<const uint8_t> bytes,
|
||||
chunk.size() - 4)) {
|
||||
break;
|
||||
}
|
||||
} else if (id == kId_iCCP) {
|
||||
} else if (id == kId_cICP) {
|
||||
// Color profile chunks: cICP has the highest priority, followed by
|
||||
// iCCP and sRGB (which shouldn't co-exist, but if they do, we use
|
||||
// iCCP), followed finally by gAMA and cHRM.
|
||||
if (DecodeCICP(chunk.data() + 8, chunk.size() - 12,
|
||||
&ppf->color_encoding)) {
|
||||
have_cicp = true;
|
||||
have_color = true;
|
||||
ppf->icc.clear();
|
||||
}
|
||||
} else if (!have_cicp && id == kId_iCCP) {
|
||||
if (processing_data(png_ptr, info_ptr, chunk.data(), chunk.size())) {
|
||||
JXL_WARNING("Corrupt iCCP chunk");
|
||||
break;
|
||||
@ -642,19 +789,20 @@ Status DecodeImageAPNG(const Span<const uint8_t> bytes,
|
||||
if (ok && proflen) {
|
||||
ppf->icc.assign(profile, profile + proflen);
|
||||
have_color = true;
|
||||
have_iccp = true;
|
||||
} else {
|
||||
// TODO(eustas): JXL_WARNING?
|
||||
}
|
||||
} else if (id == kId_sRGB) {
|
||||
} else if (!have_cicp && !have_iccp && id == kId_sRGB) {
|
||||
JXL_RETURN_IF_ERROR(DecodeSRGB(chunk.data() + 8, chunk.size() - 12,
|
||||
&ppf->color_encoding));
|
||||
have_srgb = true;
|
||||
have_color = true;
|
||||
} else if (id == kId_gAMA) {
|
||||
} else if (!have_cicp && !have_srgb && !have_iccp && id == kId_gAMA) {
|
||||
JXL_RETURN_IF_ERROR(DecodeGAMA(chunk.data() + 8, chunk.size() - 12,
|
||||
&ppf->color_encoding));
|
||||
have_color = true;
|
||||
} else if (id == kId_cHRM) {
|
||||
} else if (!have_cicp && !have_srgb && !have_iccp && id == kId_cHRM) {
|
||||
JXL_RETURN_IF_ERROR(DecodeCHRM(chunk.data() + 8, chunk.size() - 12,
|
||||
&ppf->color_encoding));
|
||||
have_color = true;
|
||||
@ -677,12 +825,6 @@ Status DecodeImageAPNG(const Span<const uint8_t> bytes,
|
||||
}
|
||||
}
|
||||
|
||||
if (have_srgb) {
|
||||
ppf->color_encoding.white_point = JXL_WHITE_POINT_D65;
|
||||
ppf->color_encoding.primaries = JXL_PRIMARIES_SRGB;
|
||||
ppf->color_encoding.transfer_function = JXL_TRANSFER_FUNCTION_SRGB;
|
||||
ppf->color_encoding.rendering_intent = JXL_RENDERING_INTENT_PERCEPTUAL;
|
||||
}
|
||||
JXL_RETURN_IF_ERROR(ApplyColorHints(
|
||||
color_hints, have_color, ppf->info.num_color_channels == 1, ppf));
|
||||
}
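The apng.cc changes above boil down to a precedence rule between the PNG color chunks: cICP wins over iCCP, iCCP over sRGB, and gAMA/cHRM only apply when none of the stronger chunks were accepted (the real code additionally only counts cICP when DecodeCICP succeeds). A compact sketch of that rule, using the chunk-id constants from the file but a hypothetical helper name that is not part of the patch:

#include <cstdint>

// Chunk type ids as little-endian uint32 values (same constants as apng.cc).
constexpr uint32_t kId_cICP = 0x50434963;
constexpr uint32_t kId_iCCP = 0x50434369;
constexpr uint32_t kId_sRGB = 0x42475273;
constexpr uint32_t kId_gAMA = 0x414D4167;

// Hypothetical illustration of the precedence logic; apng.cc does this inline
// in its chunk-processing loop and also decodes the chunk payloads.
bool ShouldApplyColorChunk(uint32_t id, bool have_cicp, bool have_iccp,
                           bool have_srgb) {
  if (id == kId_cICP) return true;                  // strongest signal
  if (id == kId_iCCP) return !have_cicp;            // skipped once cICP seen
  if (id == kId_sRGB) return !have_cicp && !have_iccp;
  if (id == kId_gAMA) {                             // cHRM is gated the same way
    return !have_cicp && !have_iccp && !have_srgb;  // weakest fallback
  }
  return false;  // not a color chunk
}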
third_party/jpeg-xl/lib/extras/dec/decode.cc | 3 (vendored)
@ -107,7 +107,8 @@ Status DecodeBytes(const Span<const uint8_t> bytes,
  }
#endif
#if JPEGXL_ENABLE_JPEG
  else if (DecodeImageJPG(bytes, color_hints, constraints, ppf)) {
  else if (DecodeImageJPG(bytes, color_hints, constraints,
                          /*output_bit_depth=*/8, ppf)) {
    codec = Codec::kJPG;
  }
#endif
third_party/jpeg-xl/lib/extras/dec/jpegli.cc | 209 (vendored, new file)
@ -0,0 +1,209 @@
|
||||
// Copyright (c) the JPEG XL Project Authors. All rights reserved.
|
||||
//
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
#include "lib/extras/dec/jpegli.h"
|
||||
|
||||
#include <setjmp.h>
|
||||
#include <stdint.h>
|
||||
|
||||
#include <algorithm>
|
||||
#include <numeric>
|
||||
#include <utility>
|
||||
#include <vector>
|
||||
|
||||
#include "lib/jpegli/decode.h"
|
||||
#include "lib/jxl/base/status.h"
|
||||
#include "lib/jxl/sanitizers.h"
|
||||
|
||||
namespace jxl {
|
||||
namespace extras {
|
||||
|
||||
namespace {
|
||||
|
||||
constexpr unsigned char kExifSignature[6] = {0x45, 0x78, 0x69,
|
||||
0x66, 0x00, 0x00};
|
||||
constexpr int kExifMarker = JPEG_APP0 + 1;
|
||||
constexpr int kICCMarker = JPEG_APP0 + 2;
|
||||
|
||||
static inline bool IsJPG(const std::vector<uint8_t>& bytes) {
|
||||
if (bytes.size() < 2) return false;
|
||||
if (bytes[0] != 0xFF || bytes[1] != 0xD8) return false;
|
||||
return true;
|
||||
}
|
||||
|
||||
bool MarkerIsExif(const jpeg_saved_marker_ptr marker) {
|
||||
return marker->marker == kExifMarker &&
|
||||
marker->data_length >= sizeof kExifSignature + 2 &&
|
||||
std::equal(std::begin(kExifSignature), std::end(kExifSignature),
|
||||
marker->data);
|
||||
}
|
||||
|
||||
Status ReadICCProfile(jpeg_decompress_struct* const cinfo,
|
||||
std::vector<uint8_t>* const icc) {
|
||||
uint8_t* icc_data_ptr;
|
||||
unsigned int icc_data_len;
|
||||
if (jpegli_read_icc_profile(cinfo, &icc_data_ptr, &icc_data_len)) {
|
||||
icc->assign(icc_data_ptr, icc_data_ptr + icc_data_len);
|
||||
free(icc_data_ptr);
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
void ReadExif(jpeg_decompress_struct* const cinfo,
|
||||
std::vector<uint8_t>* const exif) {
|
||||
constexpr size_t kExifSignatureSize = sizeof kExifSignature;
|
||||
for (jpeg_saved_marker_ptr marker = cinfo->marker_list; marker != nullptr;
|
||||
marker = marker->next) {
|
||||
// marker is initialized by libjpeg, which we are not instrumenting with
|
||||
// msan.
|
||||
msan::UnpoisonMemory(marker, sizeof(*marker));
|
||||
msan::UnpoisonMemory(marker->data, marker->data_length);
|
||||
if (!MarkerIsExif(marker)) continue;
|
||||
size_t marker_length = marker->data_length - kExifSignatureSize;
|
||||
exif->resize(marker_length);
|
||||
std::copy_n(marker->data + kExifSignatureSize, marker_length, exif->data());
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
void MyErrorExit(j_common_ptr cinfo) {
|
||||
jmp_buf* env = static_cast<jmp_buf*>(cinfo->client_data);
|
||||
(*cinfo->err->output_message)(cinfo);
|
||||
jpegli_destroy_decompress(reinterpret_cast<j_decompress_ptr>(cinfo));
|
||||
longjmp(*env, 1);
|
||||
}
|
||||
|
||||
void MyOutputMessage(j_common_ptr cinfo) {
|
||||
#if JXL_DEBUG_WARNING == 1
|
||||
char buf[JMSG_LENGTH_MAX + 1];
|
||||
(*cinfo->err->format_message)(cinfo, buf);
|
||||
buf[JMSG_LENGTH_MAX] = 0;
|
||||
JXL_WARNING("%s", buf);
|
||||
#endif
|
||||
}
|
||||
|
||||
} // namespace
|
||||
|
||||
Status DecodeJpeg(const std::vector<uint8_t>& compressed,
|
||||
JxlDataType output_data_type, ThreadPool* pool,
|
||||
PackedPixelFile* ppf) {
|
||||
// Don't do anything for non-JPEG files (no need to report an error)
|
||||
if (!IsJPG(compressed)) return false;
|
||||
|
||||
// TODO(veluca): use JPEGData also for pixels?
|
||||
|
||||
// We need to declare all the non-trivial destructor local variables before
|
||||
// the call to setjmp().
|
||||
std::unique_ptr<JSAMPLE[]> row;
|
||||
|
||||
const auto try_catch_block = [&]() -> bool {
|
||||
jpeg_decompress_struct cinfo;
|
||||
// cinfo is initialized by libjpeg, which we are not instrumenting with
|
||||
// msan, therefore we need to initialize cinfo here.
|
||||
msan::UnpoisonMemory(&cinfo, sizeof(cinfo));
|
||||
// Setup error handling in jpeg library so we can deal with broken jpegs in
|
||||
// the fuzzer.
|
||||
jpeg_error_mgr jerr;
|
||||
jmp_buf env;
|
||||
cinfo.err = jpegli_std_error(&jerr);
|
||||
jerr.error_exit = &MyErrorExit;
|
||||
jerr.output_message = &MyOutputMessage;
|
||||
if (setjmp(env)) {
|
||||
return false;
|
||||
}
|
||||
cinfo.client_data = static_cast<void*>(&env);
|
||||
|
||||
jpegli_create_decompress(&cinfo);
|
||||
jpegli_mem_src(&cinfo,
|
||||
reinterpret_cast<const unsigned char*>(compressed.data()),
|
||||
compressed.size());
|
||||
jpegli_save_markers(&cinfo, kICCMarker, 0xFFFF);
|
||||
jpegli_save_markers(&cinfo, kExifMarker, 0xFFFF);
|
||||
const auto failure = [&cinfo](const char* str) -> Status {
|
||||
jpegli_abort_decompress(&cinfo);
|
||||
jpegli_destroy_decompress(&cinfo);
|
||||
return JXL_FAILURE("%s", str);
|
||||
};
|
||||
jpegli_read_header(&cinfo, TRUE);
|
||||
// Might cause a CPU zip bomb.
|
||||
if (cinfo.arith_code) {
|
||||
return failure("arithmetic code JPEGs are not supported");
|
||||
}
|
||||
int nbcomp = cinfo.num_components;
|
||||
if (nbcomp != 1 && nbcomp != 3) {
|
||||
return failure("unsupported number of components in JPEG");
|
||||
}
|
||||
if (!ReadICCProfile(&cinfo, &ppf->icc)) {
|
||||
ppf->icc.clear();
|
||||
// Default to SRGB
|
||||
// Actually, (cinfo.output_components == nbcomp) will be checked after
|
||||
// `jpegli_start_decompress`.
|
||||
ppf->color_encoding.color_space =
|
||||
(nbcomp == 1) ? JXL_COLOR_SPACE_GRAY : JXL_COLOR_SPACE_RGB;
|
||||
ppf->color_encoding.white_point = JXL_WHITE_POINT_D65;
|
||||
ppf->color_encoding.primaries = JXL_PRIMARIES_SRGB;
|
||||
ppf->color_encoding.transfer_function = JXL_TRANSFER_FUNCTION_SRGB;
|
||||
ppf->color_encoding.rendering_intent = JXL_RENDERING_INTENT_PERCEPTUAL;
|
||||
}
|
||||
ReadExif(&cinfo, &ppf->metadata.exif);
|
||||
|
||||
ppf->info.xsize = cinfo.image_width;
|
||||
ppf->info.ysize = cinfo.image_height;
|
||||
if (output_data_type == JXL_TYPE_UINT8) {
|
||||
ppf->info.bits_per_sample = 8;
|
||||
} else if (output_data_type == JXL_TYPE_UINT16) {
|
||||
ppf->info.bits_per_sample = 16;
|
||||
} else {
|
||||
return failure("unsupported data type");
|
||||
}
|
||||
ppf->info.exponent_bits_per_sample = 0;
|
||||
ppf->info.uses_original_profile = true;
|
||||
|
||||
// No alpha in JPG
|
||||
ppf->info.alpha_bits = 0;
|
||||
ppf->info.alpha_exponent_bits = 0;
|
||||
|
||||
ppf->info.num_color_channels = nbcomp;
|
||||
ppf->info.orientation = JXL_ORIENT_IDENTITY;
|
||||
|
||||
// Set output bit depth.
|
||||
cinfo.quantize_colors = FALSE;
|
||||
cinfo.desired_number_of_colors = 1 << ppf->info.bits_per_sample;
|
||||
jpegli_start_decompress(&cinfo);
|
||||
JXL_ASSERT(cinfo.output_components == nbcomp);
|
||||
|
||||
const JxlPixelFormat format{
|
||||
/*num_channels=*/static_cast<uint32_t>(nbcomp),
|
||||
output_data_type,
|
||||
/*endianness=*/JXL_NATIVE_ENDIAN,
|
||||
/*align=*/0,
|
||||
};
|
||||
ppf->frames.clear();
|
||||
// Allocates the frame buffer.
|
||||
ppf->frames.emplace_back(cinfo.image_width, cinfo.image_height, format);
|
||||
const auto& frame = ppf->frames.back();
|
||||
JXL_ASSERT(sizeof(JSAMPLE) * cinfo.output_components * cinfo.image_width <=
|
||||
frame.color.stride);
|
||||
|
||||
for (size_t y = 0; y < cinfo.image_height; ++y) {
|
||||
JSAMPROW rows[] = {reinterpret_cast<JSAMPLE*>(
|
||||
static_cast<uint8_t*>(frame.color.pixels()) +
|
||||
frame.color.stride * y)};
|
||||
jpegli_read_scanlines(&cinfo, rows, 1);
|
||||
msan::UnpoisonMemory(rows[0], sizeof(JSAMPLE) * cinfo.output_components *
|
||||
cinfo.image_width);
|
||||
}
|
||||
|
||||
jpegli_finish_decompress(&cinfo);
|
||||
jpegli_destroy_decompress(&cinfo);
|
||||
return true;
|
||||
};
|
||||
|
||||
return try_catch_block();
|
||||
}
|
||||
|
||||
} // namespace extras
|
||||
} // namespace jxl
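The new decoder above wraps every libjpegli call in the classic setjmp/longjmp error-recovery pattern: error_exit jumps back into the calling frame so that a broken bitstream turns into a false return value rather than a process abort. A trimmed-down sketch of just that skeleton, using the jpegli_* entry points that appear in the file (the actual scanline loop and the extra cleanup the real error handler performs are elided), might be:

#include <setjmp.h>
#include <stddef.h>
#include <stdint.h>

#include "lib/jpegli/decode.h"

namespace {

void ErrorExitViaLongjmp(j_common_ptr cinfo) {
  // Called by libjpegli on a fatal error; jump back to the setjmp() below.
  // The real code also destroys the decompress object here first.
  longjmp(*static_cast<jmp_buf*>(cinfo->client_data), 1);
}

}  // namespace

bool TryDecode(const uint8_t* data, size_t size) {
  // Locals with non-trivial destructors must be declared before setjmp().
  jpeg_decompress_struct cinfo;
  jpeg_error_mgr jerr;
  jmp_buf env;
  cinfo.err = jpegli_std_error(&jerr);
  jerr.error_exit = &ErrorExitViaLongjmp;
  if (setjmp(env)) {
    return false;  // Reached only via longjmp from ErrorExitViaLongjmp.
  }
  cinfo.client_data = static_cast<void*>(&env);
  jpegli_create_decompress(&cinfo);
  jpegli_mem_src(&cinfo, reinterpret_cast<const unsigned char*>(data), size);
  jpegli_read_header(&cinfo, TRUE);
  // ... jpegli_start_decompress / jpegli_read_scanlines would go here ...
  jpegli_destroy_decompress(&cinfo);
  return true;
}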
third_party/jpeg-xl/lib/extras/dec/jpegli.h | 30 (vendored, new file)
@ -0,0 +1,30 @@
// Copyright (c) the JPEG XL Project Authors. All rights reserved.
//
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

#ifndef LIB_EXTRAS_DEC_JPEGLI_H_
#define LIB_EXTRAS_DEC_JPEGLI_H_

// Decodes JPG pixels and metadata in memory using the libjpegli library.

#include <stdint.h>

#include <vector>

#include "jxl/types.h"
#include "lib/extras/packed_image.h"
#include "lib/jxl/base/data_parallel.h"
#include "lib/jxl/base/status.h"

namespace jxl {
namespace extras {

Status DecodeJpeg(const std::vector<uint8_t>& compressed,
                  JxlDataType output_data_type, ThreadPool* pool,
                  PackedPixelFile* ppf);

} // namespace extras
} // namespace jxl

#endif // LIB_EXTRAS_DEC_JPEGLI_H_
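For reference, the codec_test.cc hunks earlier in this patch call the new entry point with a null thread pool; a call site shaped like those tests (the wrapper name here is made up) would be:

#include <vector>

#include "lib/extras/dec/jpegli.h"

// Hypothetical helper: decode an in-memory JPEG into 16-bit samples using the
// jpegli-backed decoder declared above. A null pool means single-threaded.
bool DecodeJpegTo16Bit(const std::vector<uint8_t>& jpeg_bytes,
                       jxl::extras::PackedPixelFile* ppf) {
  return jxl::extras::DecodeJpeg(jpeg_bytes, JXL_TYPE_UINT16,
                                 /*pool=*/nullptr, ppf);
}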
third_party/jpeg-xl/lib/extras/dec/jpg.cc | 20 (vendored)
@ -165,7 +165,7 @@ void MyOutputMessage(j_common_ptr cinfo) {
Status DecodeImageJPG(const Span<const uint8_t> bytes,
                      const ColorHints& color_hints,
                      const SizeConstraints& constraints,
                      PackedPixelFile* ppf) {
                      size_t output_bit_depth, PackedPixelFile* ppf) {
  // Don't do anything for non-JPEG files (no need to report an error)
  if (!IsJPG(bytes)) return false;

@ -175,6 +175,10 @@ Status DecodeImageJPG(const Span<const uint8_t> bytes,
  // the call to setjmp().
  std::unique_ptr<JSAMPLE[]> row;

  if (output_bit_depth == 0 || output_bit_depth > 16) {
    return JXL_FAILURE("Invalid output bitdepth");
  }

  const auto try_catch_block = [&]() -> bool {
    jpeg_decompress_struct cinfo;
    // cinfo is initialized by libjpeg, which we are not instrumenting with

@ -252,12 +256,24 @@ Status DecodeImageJPG(const Span<const uint8_t> bytes,
    ppf->info.num_color_channels = nbcomp;
    ppf->info.orientation = JXL_ORIENT_IDENTITY;

    // Try setting output bit depth. In libjpeg-turbo, this combination of
    // parameters will be ignored, but in libjpegli it will override output bit
    // depth.
    cinfo.quantize_colors = FALSE;
    cinfo.desired_number_of_colors = 1 << output_bit_depth;
    jpeg_start_decompress(&cinfo);
    JXL_ASSERT(cinfo.output_components == nbcomp);
    if (cinfo.desired_number_of_colors == 0) {
      // We know that the output bit depth was set because
      // desired_number_of_colors was reset to zero by libjpegli.
      ppf->info.bits_per_sample = output_bit_depth;
    }
    JxlDataType data_type =
        ppf->info.bits_per_sample <= 8 ? JXL_TYPE_UINT8 : JXL_TYPE_UINT16;

    const JxlPixelFormat format{
        /*num_channels=*/static_cast<uint32_t>(nbcomp),
        /*data_type=*/BITS_IN_JSAMPLE == 8 ? JXL_TYPE_UINT8 : JXL_TYPE_UINT16,
        data_type,
        /*endianness=*/JXL_NATIVE_ENDIAN,
        /*align=*/0,
    };
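The comment in the hunk above describes a deliberate back-channel: setting quantize_colors = FALSE together with desired_number_of_colors is a no-op for libjpeg-turbo, but libjpegli repurposes the field to select the output bit depth and zeroes it to acknowledge the request. A sketch of how a caller could probe for that behaviour (hypothetical function, same field names as the hunk) is:

#include <stdio.h>
#include <jpeglib.h>

// Hypothetical probe: returns the bit depth the decoder will actually emit,
// given a header-parsed cinfo and the depth the caller asked for.
static int NegotiatedBitDepth(jpeg_decompress_struct* cinfo,
                              int requested_bit_depth) {
  cinfo->quantize_colors = FALSE;
  cinfo->desired_number_of_colors = 1 << requested_bit_depth;
  jpeg_start_decompress(cinfo);
  // libjpegli resets the field to 0 to signal that it honoured the request;
  // libjpeg-turbo ignores it and keeps producing BITS_IN_JSAMPLE samples.
  return cinfo->desired_number_of_colors == 0 ? requested_bit_depth
                                              : BITS_IN_JSAMPLE;
}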
third_party/jpeg-xl/lib/extras/dec/jpg.h | 3 (vendored)
@ -25,7 +25,8 @@ namespace extras {
// `elapsed_deinterleave`, if non-null, will be set to the time (in seconds)
// that it took to deinterleave the raw JSAMPLEs to planar floats.
Status DecodeImageJPG(Span<const uint8_t> bytes, const ColorHints& color_hints,
                      const SizeConstraints& constraints, PackedPixelFile* ppf);
                      const SizeConstraints& constraints,
                      size_t output_bit_depth, PackedPixelFile* ppf);

} // namespace extras
} // namespace jxl
third_party/jpeg-xl/lib/extras/dec/jxl.cc | 2 (vendored)
@ -223,7 +223,7 @@ bool DecodeImageJXL(const uint8_t* bytes, size_t bytes_size,
    fprintf(stderr,
            "Input file is truncated (total bytes: %" PRIuS
            ", processed bytes: %" PRIuS
            ") and allow_partial_input was disabled.",
            ") and --allow_partial_files is not present.\n",
            bytes_size, bytes_size - released_size);
    return false;
  } else if (status == JXL_DEC_BOX) {
third_party/jpeg-xl/lib/extras/dec_group_jpeg.cc | 559 (vendored)
@ -1,559 +0,0 @@
|
||||
// Copyright (c) the JPEG XL Project Authors. All rights reserved.
|
||||
//
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
#include "lib/extras/dec_group_jpeg.h"
|
||||
|
||||
#include <stdint.h>
|
||||
#include <string.h>
|
||||
|
||||
#include <algorithm>
|
||||
#include <memory>
|
||||
#include <utility>
|
||||
|
||||
#ifdef MEMORY_SANITIZER
|
||||
#define JXL_MEMORY_SANITIZER 1
|
||||
#elif defined(__has_feature)
|
||||
#if __has_feature(memory_sanitizer)
|
||||
#define JXL_MEMORY_SANITIZER 1
|
||||
#else
|
||||
#define JXL_MEMORY_SANITIZER 0
|
||||
#endif
|
||||
#else
|
||||
#define JXL_MEMORY_SANITIZER 0
|
||||
#endif
|
||||
|
||||
#if JXL_MEMORY_SANITIZER
|
||||
#include "sanitizer/msan_interface.h"
|
||||
#endif
|
||||
|
||||
#undef HWY_TARGET_INCLUDE
|
||||
#define HWY_TARGET_INCLUDE "lib/extras/dec_group_jpeg.cc"
|
||||
#include <hwy/foreach_target.h>
|
||||
#include <hwy/highway.h>
|
||||
|
||||
HWY_BEFORE_NAMESPACE();
|
||||
namespace jxl {
|
||||
namespace HWY_NAMESPACE {
|
||||
|
||||
// These templates are not found via ADL.
|
||||
using hwy::HWY_NAMESPACE::Abs;
|
||||
using hwy::HWY_NAMESPACE::Add;
|
||||
using hwy::HWY_NAMESPACE::Clamp;
|
||||
using hwy::HWY_NAMESPACE::Gt;
|
||||
using hwy::HWY_NAMESPACE::IfThenElseZero;
|
||||
using hwy::HWY_NAMESPACE::Mul;
|
||||
using hwy::HWY_NAMESPACE::MulAdd;
|
||||
using hwy::HWY_NAMESPACE::NearestInt;
|
||||
using hwy::HWY_NAMESPACE::NegMulAdd;
|
||||
using hwy::HWY_NAMESPACE::Rebind;
|
||||
using hwy::HWY_NAMESPACE::Sub;
|
||||
using hwy::HWY_NAMESPACE::Vec;
|
||||
using hwy::HWY_NAMESPACE::Xor;
|
||||
|
||||
using D = HWY_FULL(float);
|
||||
using DI = HWY_FULL(int32_t);
|
||||
constexpr D d;
|
||||
constexpr DI di;
|
||||
|
||||
using D8 = HWY_CAPPED(float, 8);
|
||||
constexpr D8 d8;
|
||||
|
||||
void GatherBlockStats(const int16_t* JXL_RESTRICT coeffs,
|
||||
const size_t coeffs_size, int32_t* JXL_RESTRICT nonzeros,
|
||||
int32_t* JXL_RESTRICT sumabs) {
|
||||
for (size_t i = 0; i < coeffs_size; i += Lanes(d)) {
|
||||
size_t k = i % kDCTBlockSize;
|
||||
const Rebind<int16_t, DI> di16;
|
||||
const Vec<DI> coeff = PromoteTo(di, Load(di16, coeffs + i));
|
||||
const auto abs_coeff = Abs(coeff);
|
||||
const auto not_0 = Gt(abs_coeff, Zero(di));
|
||||
const auto nzero = IfThenElseZero(not_0, Set(di, 1));
|
||||
Store(Add(nzero, Load(di, nonzeros + k)), di, nonzeros + k);
|
||||
Store(Add(abs_coeff, Load(di, sumabs + k)), di, sumabs + k);
|
||||
}
|
||||
}
|
||||
|
||||
void DequantBlock(const int16_t* JXL_RESTRICT qblock,
|
||||
const float* JXL_RESTRICT dequant,
|
||||
const float* JXL_RESTRICT biases, float* JXL_RESTRICT block) {
|
||||
for (size_t k = 0; k < kDCTBlockSize; k += Lanes(d)) {
|
||||
const auto mul = Load(d, dequant + k);
|
||||
const auto bias = Load(d, biases + k);
|
||||
const Rebind<int16_t, DI> di16;
|
||||
const Vec<DI> quant_i = PromoteTo(di, Load(di16, qblock + k));
|
||||
const Rebind<float, DI> df;
|
||||
const auto quant = ConvertTo(df, quant_i);
|
||||
const auto abs_quant = Abs(quant);
|
||||
const auto not_0 = Gt(abs_quant, Zero(df));
|
||||
const auto sign_quant = Xor(quant, abs_quant);
|
||||
const auto biased_quant = Sub(quant, Xor(bias, sign_quant));
|
||||
const auto dequant = IfThenElseZero(not_0, Mul(biased_quant, mul));
|
||||
Store(dequant, d, block + k);
|
||||
}
|
||||
}
|
||||
|
||||
#if HWY_CAP_GE256
|
||||
JXL_INLINE void Transpose8x8Block(const float* JXL_RESTRICT from,
|
||||
float* JXL_RESTRICT to) {
|
||||
const D8 d;
|
||||
auto i0 = Load(d, from);
|
||||
auto i1 = Load(d, from + 1 * 8);
|
||||
auto i2 = Load(d, from + 2 * 8);
|
||||
auto i3 = Load(d, from + 3 * 8);
|
||||
auto i4 = Load(d, from + 4 * 8);
|
||||
auto i5 = Load(d, from + 5 * 8);
|
||||
auto i6 = Load(d, from + 6 * 8);
|
||||
auto i7 = Load(d, from + 7 * 8);
|
||||
|
||||
const auto q0 = InterleaveLower(d, i0, i2);
|
||||
const auto q1 = InterleaveLower(d, i1, i3);
|
||||
const auto q2 = InterleaveUpper(d, i0, i2);
|
||||
const auto q3 = InterleaveUpper(d, i1, i3);
|
||||
const auto q4 = InterleaveLower(d, i4, i6);
|
||||
const auto q5 = InterleaveLower(d, i5, i7);
|
||||
const auto q6 = InterleaveUpper(d, i4, i6);
|
||||
const auto q7 = InterleaveUpper(d, i5, i7);
|
||||
|
||||
const auto r0 = InterleaveLower(d, q0, q1);
|
||||
const auto r1 = InterleaveUpper(d, q0, q1);
|
||||
const auto r2 = InterleaveLower(d, q2, q3);
|
||||
const auto r3 = InterleaveUpper(d, q2, q3);
|
||||
const auto r4 = InterleaveLower(d, q4, q5);
|
||||
const auto r5 = InterleaveUpper(d, q4, q5);
|
||||
const auto r6 = InterleaveLower(d, q6, q7);
|
||||
const auto r7 = InterleaveUpper(d, q6, q7);
|
||||
|
||||
i0 = ConcatLowerLower(d, r4, r0);
|
||||
i1 = ConcatLowerLower(d, r5, r1);
|
||||
i2 = ConcatLowerLower(d, r6, r2);
|
||||
i3 = ConcatLowerLower(d, r7, r3);
|
||||
i4 = ConcatUpperUpper(d, r4, r0);
|
||||
i5 = ConcatUpperUpper(d, r5, r1);
|
||||
i6 = ConcatUpperUpper(d, r6, r2);
|
||||
i7 = ConcatUpperUpper(d, r7, r3);
|
||||
|
||||
Store(i0, d, to);
|
||||
Store(i1, d, to + 1 * 8);
|
||||
Store(i2, d, to + 2 * 8);
|
||||
Store(i3, d, to + 3 * 8);
|
||||
Store(i4, d, to + 4 * 8);
|
||||
Store(i5, d, to + 5 * 8);
|
||||
Store(i6, d, to + 6 * 8);
|
||||
Store(i7, d, to + 7 * 8);
|
||||
}
|
||||
#elif HWY_TARGET != HWY_SCALAR
|
||||
JXL_INLINE void Transpose8x8Block(const float* JXL_RESTRICT from,
|
||||
float* JXL_RESTRICT to) {
|
||||
const HWY_CAPPED(float, 4) d;
|
||||
for (size_t n = 0; n < 8; n += 4) {
|
||||
for (size_t m = 0; m < 8; m += 4) {
|
||||
auto p0 = Load(d, from + n * 8 + m);
|
||||
auto p1 = Load(d, from + (n + 1) * 8 + m);
|
||||
auto p2 = Load(d, from + (n + 2) * 8 + m);
|
||||
auto p3 = Load(d, from + (n + 3) * 8 + m);
|
||||
const auto q0 = InterleaveLower(d, p0, p2);
|
||||
const auto q1 = InterleaveLower(d, p1, p3);
|
||||
const auto q2 = InterleaveUpper(d, p0, p2);
|
||||
const auto q3 = InterleaveUpper(d, p1, p3);
|
||||
|
||||
const auto r0 = InterleaveLower(d, q0, q1);
|
||||
const auto r1 = InterleaveUpper(d, q0, q1);
|
||||
const auto r2 = InterleaveLower(d, q2, q3);
|
||||
const auto r3 = InterleaveUpper(d, q2, q3);
|
||||
Store(r0, d, to + m * 8 + n);
|
||||
Store(r1, d, to + (1 + m) * 8 + n);
|
||||
Store(r2, d, to + (2 + m) * 8 + n);
|
||||
Store(r3, d, to + (3 + m) * 8 + n);
|
||||
}
|
||||
}
|
||||
}
|
||||
#else
|
||||
JXL_INLINE void Transpose8x8Block(const float* JXL_RESTRICT from,
|
||||
float* JXL_RESTRICT to) {
|
||||
for (size_t n = 0; n < 8; ++n) {
|
||||
for (size_t m = 0; m < 8; ++m) {
|
||||
to[8 * n + m] = from[8 * m + n];
|
||||
}
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
template <size_t N>
|
||||
void ForwardEvenOdd(const float* JXL_RESTRICT ain, size_t ain_stride,
|
||||
float* JXL_RESTRICT aout) {
|
||||
for (size_t i = 0; i < N / 2; i++) {
|
||||
auto in1 = LoadU(d8, ain + 2 * i * ain_stride);
|
||||
Store(in1, d8, aout + i * 8);
|
||||
}
|
||||
for (size_t i = N / 2; i < N; i++) {
|
||||
auto in1 = LoadU(d8, ain + (2 * (i - N / 2) + 1) * ain_stride);
|
||||
Store(in1, d8, aout + i * 8);
|
||||
}
|
||||
}
|
||||
|
||||
template <size_t N>
|
||||
void BTranspose(float* JXL_RESTRICT coeff) {
|
||||
for (size_t i = N - 1; i > 0; i--) {
|
||||
auto in1 = Load(d8, coeff + i * 8);
|
||||
auto in2 = Load(d8, coeff + (i - 1) * 8);
|
||||
Store(Add(in1, in2), d8, coeff + i * 8);
|
||||
}
|
||||
constexpr float kSqrt2 = 1.41421356237f;
|
||||
auto sqrt2 = Set(d8, kSqrt2);
|
||||
auto in1 = Load(d8, coeff);
|
||||
Store(Mul(in1, sqrt2), d8, coeff);
|
||||
}
|
||||
|
||||
// Constants for DCT implementation. Generated by the following snippet:
|
||||
// for i in range(N // 2):
|
||||
// print(1.0 / (2 * math.cos((i + 0.5) * math.pi / N)), end=", ")
|
||||
template <size_t N>
|
||||
struct WcMultipliers;
|
||||
|
||||
template <>
|
||||
struct WcMultipliers<4> {
|
||||
static constexpr float kMultipliers[] = {
|
||||
0.541196100146197,
|
||||
1.3065629648763764,
|
||||
};
|
||||
};
|
||||
|
||||
template <>
|
||||
struct WcMultipliers<8> {
|
||||
static constexpr float kMultipliers[] = {
|
||||
0.5097955791041592,
|
||||
0.6013448869350453,
|
||||
0.8999762231364156,
|
||||
2.5629154477415055,
|
||||
};
|
||||
};
|
||||
|
||||
constexpr float WcMultipliers<4>::kMultipliers[];
|
||||
constexpr float WcMultipliers<8>::kMultipliers[];
|
||||
|
||||
template <size_t N>
|
||||
void MultiplyAndAdd(const float* JXL_RESTRICT coeff, float* JXL_RESTRICT out,
|
||||
size_t out_stride) {
|
||||
for (size_t i = 0; i < N / 2; i++) {
|
||||
auto mul = Set(d8, WcMultipliers<N>::kMultipliers[i]);
|
||||
auto in1 = Load(d8, coeff + i * 8);
|
||||
auto in2 = Load(d8, coeff + (N / 2 + i) * 8);
|
||||
auto out1 = MulAdd(mul, in2, in1);
|
||||
auto out2 = NegMulAdd(mul, in2, in1);
|
||||
StoreU(out1, d8, out + i * out_stride);
|
||||
StoreU(out2, d8, out + (N - i - 1) * out_stride);
|
||||
}
|
||||
}
|
||||
|
||||
template <size_t N>
|
||||
struct IDCT1DImpl;
|
||||
|
||||
template <>
|
||||
struct IDCT1DImpl<1> {
|
||||
JXL_INLINE void operator()(const float* from, size_t from_stride, float* to,
|
||||
size_t to_stride) {
|
||||
StoreU(LoadU(d8, from), d8, to);
|
||||
}
|
||||
};
|
||||
|
||||
template <>
|
||||
struct IDCT1DImpl<2> {
|
||||
JXL_INLINE void operator()(const float* from, size_t from_stride, float* to,
|
||||
size_t to_stride) {
|
||||
JXL_DASSERT(from_stride >= 8);
|
||||
JXL_DASSERT(to_stride >= 8);
|
||||
auto in1 = LoadU(d8, from);
|
||||
auto in2 = LoadU(d8, from + from_stride);
|
||||
StoreU(Add(in1, in2), d8, to);
|
||||
StoreU(Sub(in1, in2), d8, to + to_stride);
|
||||
}
|
||||
};
|
||||
|
||||
template <size_t N>
|
||||
struct IDCT1DImpl {
|
||||
void operator()(const float* from, size_t from_stride, float* to,
|
||||
size_t to_stride) {
|
||||
JXL_DASSERT(from_stride >= 8);
|
||||
JXL_DASSERT(to_stride >= 8);
|
||||
HWY_ALIGN float tmp[64];
|
||||
ForwardEvenOdd<N>(from, from_stride, tmp);
|
||||
IDCT1DImpl<N / 2>()(tmp, 8, tmp, 8);
|
||||
BTranspose<N / 2>(tmp + N * 4);
|
||||
IDCT1DImpl<N / 2>()(tmp + N * 4, 8, tmp + N * 4, 8);
|
||||
MultiplyAndAdd<N>(tmp, to, to_stride);
|
||||
}
|
||||
};
|
||||
|
||||
template <size_t N>
|
||||
void IDCT1D(float* JXL_RESTRICT from, float* JXL_RESTRICT output,
|
||||
size_t output_stride) {
|
||||
for (size_t i = 0; i < 8; i += Lanes(d8)) {
|
||||
IDCT1DImpl<N>()(from + i, 8, output + i, output_stride);
|
||||
}
|
||||
}
|
||||
|
||||
void ComputeScaledIDCT(float* JXL_RESTRICT block0, float* JXL_RESTRICT block1,
|
||||
float* JXL_RESTRICT output, size_t output_stride) {
|
||||
Transpose8x8Block(block0, block1);
|
||||
IDCT1D<8>(block1, block0, 8);
|
||||
Transpose8x8Block(block0, block1);
|
||||
IDCT1D<8>(block1, output, output_stride);
|
||||
}
|
||||
|
||||
void DecodeJpegBlock(const int16_t* JXL_RESTRICT qblock,
|
||||
const float* JXL_RESTRICT dequant,
|
||||
const float* JXL_RESTRICT biases,
|
||||
float* JXL_RESTRICT scratch_space,
|
||||
float* JXL_RESTRICT output, size_t output_stride) {
|
||||
float* JXL_RESTRICT block0 = scratch_space;
|
||||
float* JXL_RESTRICT block1 = scratch_space + kDCTBlockSize;
|
||||
DequantBlock(qblock, dequant, biases, block0);
|
||||
ComputeScaledIDCT(block0, block1, output, output_stride);
|
||||
}
|
||||
|
||||
#if HWY_CAP_GE512
|
||||
using hwy::HWY_NAMESPACE::Half;
|
||||
using hwy::HWY_NAMESPACE::Vec;
|
||||
template <size_t i, class DF, class V>
|
||||
HWY_INLINE Vec<Half<Half<DF>>> Quarter(const DF df, V v) {
|
||||
using HF = Half<DF>;
|
||||
using HHF = Half<HF>;
|
||||
auto half = i >= 2 ? UpperHalf(HF(), v) : LowerHalf(HF(), v);
|
||||
return i & 1 ? UpperHalf(HHF(), half) : LowerHalf(HHF(), half);
|
||||
}
|
||||
|
||||
template <class DF, class V>
|
||||
HWY_INLINE Vec<DF> Concat4(const DF df, V v0, V v1, V v2, V v3) {
|
||||
using HF = Half<DF>;
|
||||
return Combine(DF(), Combine(HF(), v3, v2), Combine(HF(), v1, v0));
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
// Stores v0[0], v1[0], v0[1], v1[1], ... to mem, in this order. Mem must be
|
||||
// aligned.
|
||||
template <class DF, class V, typename T>
|
||||
void StoreInterleaved(const DF df, V v0, V v1, T* mem) {
|
||||
static_assert(sizeof(T) == 4, "only use StoreInterleaved for 4-byte types");
|
||||
#if HWY_TARGET == HWY_SCALAR
|
||||
Store(v0, df, mem);
|
||||
Store(v1, df, mem + 1);
|
||||
#elif !HWY_CAP_GE256
|
||||
Store(InterleaveLower(df, v0, v1), df, mem);
|
||||
Store(InterleaveUpper(df, v0, v1), df, mem + Lanes(df));
|
||||
#else
|
||||
if (!HWY_CAP_GE512 || Lanes(df) == 8) {
|
||||
auto t0 = InterleaveLower(df, v0, v1);
|
||||
auto t1 = InterleaveUpper(df, v0, v1);
|
||||
Store(ConcatLowerLower(df, t1, t0), df, mem);
|
||||
Store(ConcatUpperUpper(df, t1, t0), df, mem + Lanes(df));
|
||||
} else {
|
||||
#if HWY_CAP_GE512
|
||||
auto t0 = InterleaveLower(df, v0, v1);
|
||||
auto t1 = InterleaveUpper(df, v0, v1);
|
||||
Store(Concat4(df, Quarter<0>(df, t0), Quarter<0>(df, t1),
|
||||
Quarter<1>(df, t0), Quarter<1>(df, t1)),
|
||||
df, mem);
|
||||
Store(Concat4(df, Quarter<2>(df, t0), Quarter<2>(df, t1),
|
||||
Quarter<3>(df, t0), Quarter<3>(df, t1)),
|
||||
df, mem + Lanes(df));
|
||||
#endif
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
void Upsample2Horizontal(float* JXL_RESTRICT row_in,
|
||||
float* JXL_RESTRICT row_out, size_t len_out) {
|
||||
HWY_FULL(float) df;
|
||||
auto threefour = Set(df, 0.75f);
|
||||
auto onefour = Set(df, 0.25f);
|
||||
const size_t len_in = len_out >> 1;
|
||||
row_in[-1] = row_in[0];
|
||||
row_in[len_in] = row_in[len_in - 1];
|
||||
for (size_t x = 0; x < len_in; x += Lanes(df)) {
|
||||
auto current = Mul(Load(df, row_in + x), threefour);
|
||||
auto prev = LoadU(df, row_in + x - 1);
|
||||
auto next = LoadU(df, row_in + x + 1);
|
||||
auto left = MulAdd(onefour, prev, current);
|
||||
auto right = MulAdd(onefour, next, current);
|
||||
StoreInterleaved(df, left, right, row_out + x * 2);
|
||||
}
|
||||
}
|
||||
|
||||
void Upsample2Vertical(const float* JXL_RESTRICT row_top,
|
||||
const float* JXL_RESTRICT row_mid,
|
||||
const float* JXL_RESTRICT row_bot,
|
||||
float* JXL_RESTRICT row_out0,
|
||||
float* JXL_RESTRICT row_out1, size_t len) {
|
||||
HWY_FULL(float) df;
|
||||
auto threefour = Set(df, 0.75f);
|
||||
auto onefour = Set(df, 0.25f);
|
||||
for (size_t x = 0; x < len; x += Lanes(df)) {
|
||||
auto it = Load(df, row_top + x);
|
||||
auto im = Load(df, row_mid + x);
|
||||
auto ib = Load(df, row_bot + x);
|
||||
auto im_scaled = Mul(im, threefour);
|
||||
Store(MulAdd(it, onefour, im_scaled), df, row_out0 + x);
|
||||
Store(MulAdd(ib, onefour, im_scaled), df, row_out1 + x);
|
||||
}
|
||||
}
|
||||
|
||||
void YCbCrToRGB(float* JXL_RESTRICT row0, float* JXL_RESTRICT row1,
|
||||
float* JXL_RESTRICT row2, size_t xsize) {
|
||||
const HWY_FULL(float) df;
|
||||
|
||||
// Full-range BT.601 as defined by JFIF Clause 7:
|
||||
// https://www.itu.int/rec/T-REC-T.871-201105-I/en
|
||||
const auto c128 = Set(df, 128.0f / 255);
|
||||
const auto crcr = Set(df, 1.402f);
|
||||
const auto cgcb = Set(df, -0.114f * 1.772f / 0.587f);
|
||||
const auto cgcr = Set(df, -0.299f * 1.402f / 0.587f);
|
||||
const auto cbcb = Set(df, 1.772f);
|
||||
|
||||
for (size_t x = 0; x < xsize; x += Lanes(df)) {
|
||||
const auto y_vec = Add(Load(df, row0 + x), c128);
|
||||
const auto cb_vec = Load(df, row1 + x);
|
||||
const auto cr_vec = Load(df, row2 + x);
|
||||
const auto r_vec = MulAdd(crcr, cr_vec, y_vec);
|
||||
const auto g_vec = MulAdd(cgcr, cr_vec, MulAdd(cgcb, cb_vec, y_vec));
|
||||
const auto b_vec = MulAdd(cbcb, cb_vec, y_vec);
|
||||
Store(r_vec, df, row0 + x);
|
||||
Store(g_vec, df, row1 + x);
|
||||
Store(b_vec, df, row2 + x);
|
||||
}
|
||||
}
|
||||
|
||||
void DecenterRow(float* row, size_t xsize) {
|
||||
const HWY_FULL(float) df;
|
||||
const auto c128 = Set(df, 128.0f / 255);
|
||||
for (size_t x = 0; x < xsize; x += Lanes(df)) {
|
||||
Store(Add(Load(df, row + x), c128), df, row + x);
|
||||
}
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
void StoreUnsignedRow(float* JXL_RESTRICT input[3], size_t x0, size_t len,
|
||||
size_t num_channels, T* output) {
|
||||
const HWY_FULL(float) d;
|
||||
auto zero = Zero(d);
|
||||
auto one = Set(d, 1.0f);
|
||||
auto mul = Set(d, (1u << (sizeof(T) * 8)) - 1);
|
||||
const Rebind<T, decltype(d)> du;
|
||||
#if JXL_MEMORY_SANITIZER
|
||||
const size_t padding = RoundUpTo(len, Lanes(d)) - len;
|
||||
for (size_t c = 0; c < num_channels; ++c) {
|
||||
__msan_unpoison(input[c] + x0 + len, sizeof(input[c][0]) * padding);
|
||||
}
|
||||
#endif
|
||||
if (num_channels == 1) {
|
||||
for (size_t i = 0; i < len; i += Lanes(d)) {
|
||||
auto v0 = Mul(Clamp(zero, Load(d, &input[0][x0 + i]), one), mul);
|
||||
Store(DemoteTo(du, NearestInt(v0)), du, &output[i]);
|
||||
}
|
||||
} else if (num_channels == 3) {
|
||||
for (size_t i = 0; i < len; i += Lanes(d)) {
|
||||
auto v0 = Mul(Clamp(zero, Load(d, &input[0][x0 + i]), one), mul);
|
||||
auto v1 = Mul(Clamp(zero, Load(d, &input[1][x0 + i]), one), mul);
|
||||
auto v2 = Mul(Clamp(zero, Load(d, &input[2][x0 + i]), one), mul);
|
||||
StoreInterleaved3(DemoteTo(du, NearestInt(v0)),
|
||||
DemoteTo(du, NearestInt(v1)),
|
||||
DemoteTo(du, NearestInt(v2)), du, &output[3 * i]);
|
||||
}
|
||||
}
|
||||
#if JXL_MEMORY_SANITIZER
|
||||
__msan_poison(output + num_channels * len,
|
||||
sizeof(output[0]) * num_channels * padding);
|
||||
#endif
|
||||
}
|
||||
|
||||
void WriteToPackedImage(float* JXL_RESTRICT rows[3], size_t x0, size_t y0,
|
||||
size_t len, uint8_t* JXL_RESTRICT scratch_space,
|
||||
extras::PackedImage* image) {
|
||||
if (y0 >= image->ysize) return;
|
||||
JxlPixelFormat format = image->format;
|
||||
uint8_t* pixels = reinterpret_cast<uint8_t*>(image->pixels());
|
||||
if (format.data_type == JXL_TYPE_UINT8) {
|
||||
size_t offset = y0 * image->stride + x0 * format.num_channels;
|
||||
JXL_CHECK(offset + len * format.num_channels <= image->pixels_size);
|
||||
StoreUnsignedRow(rows, x0, len, format.num_channels, scratch_space);
|
||||
memcpy(pixels + offset, scratch_space, len * format.num_channels);
|
||||
} else if (format.data_type == JXL_TYPE_UINT16) {
|
||||
size_t offset = y0 * image->stride + x0 * format.num_channels * 2;
|
||||
JXL_CHECK(offset + len * format.num_channels * 2 <= image->pixels_size);
|
||||
uint16_t* tmp = reinterpret_cast<uint16_t*>(scratch_space);
|
||||
StoreUnsignedRow(rows, x0, len, format.num_channels, tmp);
|
||||
// TODO(szabadka) Handle endianness.
|
||||
memcpy(pixels + offset, tmp, len * format.num_channels * 2);
|
||||
}
|
||||
}
|
||||
|
||||
// NOLINTNEXTLINE(google-readability-namespace-comments)
|
||||
} // namespace HWY_NAMESPACE
|
||||
} // namespace jxl
|
||||
HWY_AFTER_NAMESPACE();
|
||||
|
||||
#if HWY_ONCE
|
||||
namespace jxl {
|
||||
namespace {
|
||||
HWY_EXPORT(GatherBlockStats);
|
||||
HWY_EXPORT(DecodeJpegBlock);
|
||||
HWY_EXPORT(Upsample2Horizontal);
|
||||
HWY_EXPORT(Upsample2Vertical);
|
||||
HWY_EXPORT(YCbCrToRGB);
|
||||
HWY_EXPORT(DecenterRow);
|
||||
HWY_EXPORT(WriteToPackedImage);
|
||||
} // namespace
|
||||
|
||||
namespace extras {
|
||||
|
||||
void GatherBlockStats(const int16_t* JXL_RESTRICT coeffs,
|
||||
const size_t coeffs_size, int32_t* JXL_RESTRICT nonzeros,
|
||||
int32_t* JXL_RESTRICT sumabs) {
|
||||
return HWY_DYNAMIC_DISPATCH(GatherBlockStats)(coeffs, coeffs_size, nonzeros,
|
||||
sumabs);
|
||||
}
|
||||
|
||||
void DecodeJpegBlock(const int16_t* JXL_RESTRICT qblock,
|
||||
const float* JXL_RESTRICT dequant_matrices,
|
||||
const float* JXL_RESTRICT biases,
|
||||
float* JXL_RESTRICT scratch_space,
|
||||
float* JXL_RESTRICT output, size_t output_stride) {
|
||||
return HWY_DYNAMIC_DISPATCH(DecodeJpegBlock)(
|
||||
qblock, dequant_matrices, biases, scratch_space, output, output_stride);
|
||||
}
|
||||
|
||||
void Upsample2Horizontal(float* JXL_RESTRICT row_in,
|
||||
float* JXL_RESTRICT row_out, size_t len_out) {
|
||||
return HWY_DYNAMIC_DISPATCH(Upsample2Horizontal)(row_in, row_out, len_out);
|
||||
}
|
||||
|
||||
void Upsample2Vertical(const float* JXL_RESTRICT row_top,
|
||||
const float* JXL_RESTRICT row_mid,
|
||||
const float* JXL_RESTRICT row_bot,
|
||||
float* JXL_RESTRICT row_out0,
|
||||
float* JXL_RESTRICT row_out1, size_t len) {
|
||||
return HWY_DYNAMIC_DISPATCH(Upsample2Vertical)(row_top, row_mid, row_bot,
|
||||
row_out0, row_out1, len);
|
||||
}
|
||||
|
||||
void YCbCrToRGB(float* JXL_RESTRICT row0, float* JXL_RESTRICT row1,
|
||||
float* JXL_RESTRICT row2, size_t xsize) {
|
||||
return HWY_DYNAMIC_DISPATCH(YCbCrToRGB)(row0, row1, row2, xsize);
|
||||
}
|
||||
|
||||
void DecenterRow(float* row, size_t xsize) {
|
||||
return HWY_DYNAMIC_DISPATCH(DecenterRow)(row, xsize);
|
||||
}
|
||||
|
||||
void WriteToPackedImage(float* JXL_RESTRICT rows[3], size_t x0, size_t y0,
|
||||
size_t len, uint8_t* JXL_RESTRICT scratch_space,
|
||||
extras::PackedImage* image) {
|
||||
return HWY_DYNAMIC_DISPATCH(WriteToPackedImage)(rows, x0, y0, len,
|
||||
scratch_space, image);
|
||||
}
|
||||
|
||||
} // namespace extras
|
||||
} // namespace jxl
|
||||
#endif // HWY_ONCE
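The deleted translation unit above followed highway's standard multi-target layout: the file re-includes itself through foreach_target.h, each HWY_NAMESPACE pass compiles one SIMD variant, and the HWY_ONCE block exports wrappers that pick the best variant at runtime. A minimal self-contained sketch of that layout (hypothetical file and function names, not code from this patch) looks like:

// simd_scale.cc -- illustrative skeleton of the foreach_target pattern.
#include <stddef.h>

#undef HWY_TARGET_INCLUDE
#define HWY_TARGET_INCLUDE "simd_scale.cc"
#include <hwy/foreach_target.h>
#include <hwy/highway.h>

HWY_BEFORE_NAMESPACE();
namespace demo {
namespace HWY_NAMESPACE {

// Not always found via ADL, hence the explicit using-declaration.
using hwy::HWY_NAMESPACE::Mul;

// One copy of this function is compiled per enabled target.
void ScaleInPlace(float* data, size_t n, float factor) {
  const HWY_FULL(float) d;
  const auto mul = Set(d, factor);
  // Assumes n is a multiple of the vector width, much as the per-block
  // helpers in the deleted file assumed their fixed 8x8 layout.
  for (size_t i = 0; i < n; i += Lanes(d)) {
    Store(Mul(Load(d, data + i), mul), d, data + i);
  }
}

// NOLINTNEXTLINE(google-readability-namespace-comments)
}  // namespace HWY_NAMESPACE
}  // namespace demo
HWY_AFTER_NAMESPACE();

#if HWY_ONCE
namespace demo {
HWY_EXPORT(ScaleInPlace);

// Runtime-dispatched entry point visible to the rest of the program.
void CallScaleInPlace(float* data, size_t n, float factor) {
  HWY_DYNAMIC_DISPATCH(ScaleInPlace)(data, n, factor);
}
}  // namespace demo
#endif  // HWY_ONCE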
third_party/jpeg-xl/lib/extras/dec_group_jpeg.h | 51 (vendored)
@ -1,51 +0,0 @@
// Copyright (c) the JPEG XL Project Authors. All rights reserved.
//
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

#ifndef LIB_EXTRAS_DEC_GROUP_JPEG_H_
#define LIB_EXTRAS_DEC_GROUP_JPEG_H_

#include <stddef.h>
#include <stdint.h>

#include <vector>

#include "lib/extras/packed_image.h"
#include "lib/jxl/base/compiler_specific.h"

namespace jxl {
namespace extras {

void GatherBlockStats(const int16_t* JXL_RESTRICT coeffs,
                      const size_t coeffs_size, int32_t* JXL_RESTRICT nonzeros,
                      int32_t* JXL_RESTRICT sumabs);

void DecodeJpegBlock(const int16_t* JXL_RESTRICT qblock,
                     const float* JXL_RESTRICT dequant_matrices,
                     const float* JXL_RESTRICT biases,
                     float* JXL_RESTRICT scratch_space,
                     float* JXL_RESTRICT output, size_t output_stride);

void Upsample2Horizontal(float* JXL_RESTRICT row_in,
                         float* JXL_RESTRICT row_out, size_t len_out);

void Upsample2Vertical(const float* JXL_RESTRICT row_top,
                       const float* JXL_RESTRICT row_mid,
                       const float* JXL_RESTRICT row_bot,
                       float* JXL_RESTRICT row_out0,
                       float* JXL_RESTRICT row_out1, size_t len);

void YCbCrToRGB(float* JXL_RESTRICT row0, float* JXL_RESTRICT row1,
                float* JXL_RESTRICT row2, size_t xsize);

void DecenterRow(float* row, size_t xsize);

void WriteToPackedImage(float* JXL_RESTRICT rows[3], size_t x0, size_t y0,
                        size_t len, uint8_t* JXL_RESTRICT scratch_space,
                        extras::PackedImage* image);

} // namespace extras
} // namespace jxl

#endif // LIB_EXTRAS_DEC_GROUP_JPEG_H_
third_party/jpeg-xl/lib/extras/decode_jpeg.cc | 1539 (vendored; diff suppressed because it is too large)
third_party/jpeg-xl/lib/extras/decode_jpeg.h | 276 (vendored)
@ -1,276 +0,0 @@
|
||||
// Copyright (c) the JPEG XL Project Authors. All rights reserved.
|
||||
//
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
#ifndef LIB_EXTRAS_DECODE_JPEG_H_
|
||||
#define LIB_EXTRAS_DECODE_JPEG_H_
|
||||
|
||||
#include <stdint.h>
|
||||
|
||||
#include <array>
|
||||
#include <vector>
|
||||
|
||||
#include "hwy/aligned_allocator.h"
|
||||
#include "lib/extras/packed_image.h"
|
||||
#include "lib/jxl/base/data_parallel.h"
|
||||
#include "lib/jxl/base/status.h"
|
||||
#include "lib/jxl/image.h"
|
||||
|
||||
namespace jxl {
|
||||
namespace extras {
|
||||
|
||||
constexpr int kMaxComponents = 4;
|
||||
|
||||
typedef int16_t coeff_t;
|
||||
|
||||
// Represents one component of a jpeg file.
|
||||
struct JPEGComponent {
|
||||
JPEGComponent()
|
||||
: id(0),
|
||||
h_samp_factor(1),
|
||||
v_samp_factor(1),
|
||||
quant_idx(0),
|
||||
width_in_blocks(0),
|
||||
height_in_blocks(0) {}
|
||||
|
||||
// One-byte id of the component.
|
||||
uint32_t id;
|
||||
// Horizontal and vertical sampling factors.
|
||||
// In interleaved mode, each minimal coded unit (MCU) has
|
||||
// h_samp_factor x v_samp_factor DCT blocks from this component.
|
||||
int h_samp_factor;
|
||||
int v_samp_factor;
|
||||
// The index of the quantization table used for this component.
|
||||
uint32_t quant_idx;
|
||||
// The dimensions of the component measured in 8x8 blocks.
|
||||
uint32_t width_in_blocks;
|
||||
uint32_t height_in_blocks;
|
||||
// The DCT coefficients of this component, laid out block-by-block, divided
|
||||
// through the quantization matrix values.
|
||||
hwy::AlignedFreeUniquePtr<coeff_t[]> coeffs;
|
||||
};
|
||||
|
||||
struct HuffmanTableEntry {
|
||||
// Initialize the value to an invalid symbol so that we can recognize it
|
||||
// when reading the bit stream using a Huffman code with space > 0.
|
||||
HuffmanTableEntry() : bits(0), value(0xffff) {}
|
||||
|
||||
uint8_t bits; // number of bits used for this symbol
|
||||
uint16_t value; // symbol value or table offset
|
||||
};
|
||||
|
||||
// Quantization values for an 8x8 pixel block.
|
||||
struct JPEGQuantTable {
|
||||
std::array<int32_t, kDCTBlockSize> values;
|
||||
// The index of this quantization table as it was parsed from the input JPEG.
|
||||
// Each DQT marker segment contains an 'index' field, and we save this index
|
||||
// here. Valid values are 0 to 3.
|
||||
uint32_t index = 0;
|
||||
};
|
||||
|
||||
// Huffman table indexes and MCU dimensions used for one component of one scan.
|
||||
struct JPEGComponentScanInfo {
|
||||
uint32_t comp_idx;
|
||||
uint32_t dc_tbl_idx;
|
||||
uint32_t ac_tbl_idx;
|
||||
uint32_t mcu_ysize_blocks;
|
||||
uint32_t mcu_xsize_blocks;
|
||||
};
|
||||
|
||||
// Contains information that is used in one scan.
|
||||
struct JPEGScanInfo {
|
||||
// Parameters used for progressive scans (named the same way as in the spec):
|
||||
// Ss : Start of spectral band in zig-zag sequence.
|
||||
// Se : End of spectral band in zig-zag sequence.
|
||||
// Ah : Successive approximation bit position, high.
|
||||
// Al : Successive approximation bit position, low.
|
||||
uint32_t Ss;
|
||||
uint32_t Se;
|
||||
uint32_t Ah;
|
||||
uint32_t Al;
|
||||
uint32_t num_components = 0;
|
||||
std::array<JPEGComponentScanInfo, kMaxComponents> components;
|
||||
size_t MCU_rows;
|
||||
size_t MCU_cols;
|
||||
};
|
||||
|
||||
// State of the decoder that has to be saved before decoding one MCU in case
|
||||
// we run out of the bitstream.
|
||||
struct MCUCodingState {
|
||||
coeff_t last_dc_coeff[kMaxComponents];
|
||||
int eobrun;
|
||||
std::vector<coeff_t> coeffs;
|
||||
};
|
||||
|
||||
// Streaming JPEG decoding object.
|
||||
class JpegDecoder {
|
||||
public:
|
||||
enum class Status {
|
||||
kSuccess,
|
||||
kNeedMoreInput,
|
||||
kError,
|
||||
};
|
||||
|
||||
// Sets the next chunk of input. It must be called before the first call to
|
||||
// ReadHeaders() and every time a reader function returns
|
||||
// Status::kNeedMoreInput.
|
||||
Status SetInput(const uint8_t* data, size_t len);
|
||||
|
||||
// Sets the output image. Must be called between ReadHeaders() and
|
||||
// ReadScanLines(). The provided image must have the same dimensions and number of
|
||||
// channels as the underlying JPEG bitstream.
|
||||
Status SetOutput(PackedImage* image);
|
||||
|
||||
// Reads the header markers up to and including SOF marker. After this returns
|
||||
// kSuccess, the image attribute accessors can be called.
|
||||
Status ReadHeaders();
|
||||
|
||||
// Reads the bitstream after the SOF marker, and fills in at most
|
||||
// max_output_rows scan lines of the provided image. Set *num_output_rows to
|
||||
// the actual number of lines produced.
|
||||
Status ReadScanLines(size_t* num_output_rows, size_t max_output_rows);
|
||||
|
||||
// Image attribute accessors, can be called after ReadHeaders() returns
|
||||
// kSuccess.
|
||||
size_t xsize() const { return xsize_; }
|
||||
size_t ysize() const { return ysize_; }
|
||||
size_t num_channels() const { return components_.size(); }
|
||||
const std::vector<uint8_t>& icc_profile() const { return icc_profile_; }
|
||||
|
||||
private:
|
||||
enum class State {
|
||||
kStart,
|
||||
kProcessMarkers,
|
||||
kScan,
|
||||
kRender,
|
||||
kEnd,
|
||||
};
|
||||
State state_ = State::kStart;
|
||||
|
||||
//
|
||||
// Input handling state.
|
||||
//
|
||||
const uint8_t* next_in_ = nullptr;
|
||||
size_t avail_in_ = 0;
|
||||
// Codestream input data is copied here temporarily when the decoder needs
|
||||
// more input bytes to process the next part of the stream.
|
||||
std::vector<uint8_t> codestream_copy_;
|
||||
// Number of bytes at the end of codestream_copy_ that were not yet consumed
|
||||
// by calling AdvanceInput().
|
||||
size_t codestream_unconsumed_ = 0;
|
||||
// Position in the codestream_copy_ vector that the decoder already finished
|
||||
// processing.
|
||||
size_t codestream_pos_ = 0;
|
||||
// Number of bits after codestream_pos_ that were already processed.
|
||||
size_t codestream_bits_ahead_ = 0;
|
||||
|
||||
//
|
||||
// Marker data processing state.
|
||||
//
|
||||
bool found_soi_ = false;
|
||||
bool found_app0_ = false;
|
||||
bool found_dri_ = false;
|
||||
bool found_sof_ = false;
|
||||
bool found_eoi_ = false;
|
||||
size_t xsize_ = 0;
|
||||
size_t ysize_ = 0;
|
||||
bool is_ycbcr_ = true;
|
||||
size_t icc_index_ = 0;
|
||||
size_t icc_total_ = 0;
|
||||
std::vector<uint8_t> icc_profile_;
|
||||
size_t restart_interval_ = 0;
|
||||
std::vector<JPEGQuantTable> quant_;
|
||||
std::vector<JPEGComponent> components_;
|
||||
std::vector<HuffmanTableEntry> dc_huff_lut_;
|
||||
std::vector<HuffmanTableEntry> ac_huff_lut_;
|
||||
uint8_t huff_slot_defined_[256] = {};
|
||||
|
||||
// Fields defined by SOF marker.
|
||||
bool is_progressive_;
|
||||
int max_h_samp_;
|
||||
int max_v_samp_;
|
||||
size_t iMCU_rows_;
|
||||
size_t iMCU_cols_;
|
||||
size_t iMCU_width_;
|
||||
size_t iMCU_height_;
|
||||
|
||||
// Initialized at start of frame.
|
||||
  uint16_t scan_progression_[kMaxComponents][kDCTBlockSize];

  //
  // Per scan state.
  //
  JPEGScanInfo scan_info_;
  size_t scan_mcu_row_;
  size_t scan_mcu_col_;
  coeff_t last_dc_coeff_[kMaxComponents];
  int eobrun_;
  int restarts_to_go_;
  int next_restart_marker_;

  MCUCodingState mcu_;

  //
  // Rendering state.
  //
  PackedImage* output_;

  Image3F MCU_row_buf_;
  size_t MCU_buf_current_row_;
  size_t MCU_buf_ready_rows_;

  size_t output_row_;
  size_t output_mcu_row_;
  size_t output_ci_;

  // Temporary buffers for vertically upsampled chroma components. We keep a
  // ringbuffer of 3 * kBlockDim rows so that we have access to the previous
  // and next rows.
  std::vector<ImageF> chroma_;
  // In the rendering order, vertically upsampled chroma components come first.
  std::vector<size_t> component_order_;
  hwy::AlignedFreeUniquePtr<float[]> idct_scratch_;
  hwy::AlignedFreeUniquePtr<float[]> upsample_scratch_;
  hwy::AlignedFreeUniquePtr<uint8_t[]> output_scratch_;

  hwy::AlignedFreeUniquePtr<float[]> dequant_;
  // Per channel and per frequency statistics about the number of nonzeros and
  // the sum of coefficient absolute values, used in dequantization bias
  // computation.
  hwy::AlignedFreeUniquePtr<int[]> nonzeros_;
  hwy::AlignedFreeUniquePtr<int[]> sumabs_;
  std::vector<size_t> num_processed_blocks_;
  hwy::AlignedFreeUniquePtr<float[]> biases_;

  void AdvanceInput(size_t size);
  void AdvanceCodestream(size_t size);
  Status RequestMoreInput();
  Status GetCodestreamInput(const uint8_t** data, size_t* len);

  Status ProcessMarker(const uint8_t* data, size_t len, size_t* pos);
  Status ProcessSOF(const uint8_t* data, size_t len);
  Status ProcessSOS(const uint8_t* data, size_t len);
  Status ProcessDHT(const uint8_t* data, size_t len);
  Status ProcessDQT(const uint8_t* data, size_t len);
  Status ProcessDRI(const uint8_t* data, size_t len);
  Status ProcessAPP(const uint8_t* data, size_t len);
  Status ProcessCOM(const uint8_t* data, size_t len);

  Status ProcessScan(const uint8_t* data, size_t len, size_t* pos);

  void SaveMCUCodingState();
  void RestoreMCUCodingState();

  void PrepareForOutput();
  void ProcessOutput(size_t* num_output_rows, size_t max_output_rows);
};

Status DecodeJpeg(const std::vector<uint8_t>& compressed,
                  JxlDataType output_data_type, ThreadPool* pool,
                  PackedPixelFile* ppf);

}  // namespace extras
}  // namespace jxl

#endif  // LIB_EXTRAS_DECODE_JPEG_H_
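Taken together, the declarations above imply a pull-style decode loop: feed bytes with SetInput, retry ReadHeaders/ReadScanLines whenever they report kNeedMoreInput, and hand the decoder a PackedImage sized from the accessors. A minimal sketch of that flow, assuming the header's namespace; the get_more_input callback is a stand-in for whatever supplies compressed bytes and is not part of the API:

#include <cstddef>
#include <cstdint>
#include <functional>

bool DecodeAll(JpegDecoder& dec, PackedImage* image,
               const std::function<bool(const uint8_t**, size_t*)>& get_more_input) {
  using Status = JpegDecoder::Status;
  Status status;
  // Parse markers up to and including SOF, feeding more input on demand.
  while ((status = dec.ReadHeaders()) == Status::kNeedMoreInput) {
    const uint8_t* data;
    size_t len;
    if (!get_more_input(&data, &len)) return false;
    if (dec.SetInput(data, len) != Status::kSuccess) return false;
  }
  if (status != Status::kSuccess) return false;
  // `image` must already match dec.xsize(), dec.ysize() and dec.num_channels().
  if (dec.SetOutput(image) != Status::kSuccess) return false;
  // Pull scan lines until the whole image has been rendered.
  for (size_t total_rows = 0; total_rows < dec.ysize();) {
    size_t rows = 0;
    status = dec.ReadScanLines(&rows, dec.ysize() - total_rows);
    total_rows += rows;
    if (status == Status::kNeedMoreInput) {
      const uint8_t* data;
      size_t len;
      if (!get_more_input(&data, &len)) return false;
      if (dec.SetInput(data, len) != Status::kSuccess) return false;
    } else if (status != Status::kSuccess) {
      return false;
    }
  }
  return true;
}

The parameterized streaming test in decode_jpeg_test.cc below drives exactly this sequence, with libjpeg output as a reference.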
190
third_party/jpeg-xl/lib/extras/decode_jpeg_test.cc
vendored
@ -1,190 +0,0 @@
// Copyright (c) the JPEG XL Project Authors. All rights reserved.
//
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

#include "lib/extras/decode_jpeg.h"

#include <stddef.h>
#include <stdio.h>

#if JPEGXL_ENABLE_JPEG
#include "lib/extras/dec/jpg.h"
#endif
#include "lib/jxl/test_utils.h"
#include "lib/jxl/testdata.h"

namespace jxl {
namespace extras {
namespace {

using test::DistanceRMS;

struct TestConfig {
  std::string fn;
  std::string fn_desc;
  size_t chunk_size;
  size_t max_output_lines;
};

class DecodeJpegTestParam : public ::testing::TestWithParam<TestConfig> {};

TEST_P(DecodeJpegTestParam, Streaming) {
  TestConfig config = GetParam();
  const PaddedBytes compressed = ReadTestData(config.fn.c_str());

#if JPEGXL_ENABLE_JPEG
  PackedPixelFile ppf_libjpeg;
  EXPECT_TRUE(
      DecodeImageJPG(Span<const uint8_t>(compressed.data(), compressed.size()),
                     ColorHints(), SizeConstraints(), &ppf_libjpeg));
  ASSERT_EQ(1, ppf_libjpeg.frames.size());
#endif

  JpegDecoder dec;

  size_t chunk_size = config.chunk_size;
  if (chunk_size == 0) chunk_size = compressed.size();
  size_t pos = std::min(chunk_size, compressed.size());
  ASSERT_EQ(JpegDecoder::Status::kSuccess,
            dec.SetInput(compressed.data(), pos));

  JpegDecoder::Status status;
  for (;;) {
    status = dec.ReadHeaders();
    if (status == JpegDecoder::Status::kNeedMoreInput) {
      ASSERT_LT(pos, compressed.size());
      size_t len = std::min(chunk_size, compressed.size() - pos);
      ASSERT_EQ(JpegDecoder::Status::kSuccess,
                dec.SetInput(compressed.data() + pos, len));
      pos += len;
      continue;
    }
    ASSERT_EQ(status, JpegDecoder::Status::kSuccess);
    break;
  }

#if JPEGXL_ENABLE_JPEG
  EXPECT_EQ(ppf_libjpeg.info.xsize, dec.xsize());
  EXPECT_EQ(ppf_libjpeg.info.ysize, dec.ysize());
  EXPECT_EQ(ppf_libjpeg.info.num_color_channels, dec.num_channels());
#endif

  JxlPixelFormat format = {static_cast<uint32_t>(dec.num_channels()),
                           JXL_TYPE_UINT8, JXL_BIG_ENDIAN, 0};
  PackedImage output(dec.xsize(), dec.ysize(), format);
  ASSERT_EQ(JpegDecoder::Status::kSuccess, dec.SetOutput(&output));

  size_t max_output_lines = config.max_output_lines;
  if (max_output_lines == 0) max_output_lines = dec.ysize();

  size_t total_output_lines = 0;
  while (total_output_lines < dec.ysize()) {
    size_t num_output_lines = 0;
    status = dec.ReadScanLines(&num_output_lines, max_output_lines);
    total_output_lines += num_output_lines;
    if (status == JpegDecoder::Status::kNeedMoreInput) {
      ASSERT_LT(pos, compressed.size());
      size_t len = std::min(chunk_size, compressed.size() - pos);
      ASSERT_EQ(JpegDecoder::Status::kSuccess,
                dec.SetInput(compressed.data() + pos, len));
      pos += len;
      continue;
    }
    ASSERT_EQ(status, JpegDecoder::Status::kSuccess);
    if (total_output_lines < dec.ysize()) {
      EXPECT_EQ(num_output_lines, max_output_lines);
    }
  }

#if JPEGXL_ENABLE_JPEG
  const PackedImage& output_libjpeg = ppf_libjpeg.frames[0].color;
  ASSERT_EQ(output.xsize, output_libjpeg.xsize);
  ASSERT_EQ(output.ysize, output_libjpeg.ysize);
  EXPECT_LE(
      DistanceRMS(reinterpret_cast<const uint8_t*>(output.pixels()),
                  reinterpret_cast<const uint8_t*>(output_libjpeg.pixels()),
                  output.xsize, output.ysize, output.format),
      0.0075);
#endif
}

std::vector<TestConfig> GenerateTests() {
  std::vector<TestConfig> all_tests;
  {
    std::vector<std::pair<std::string, std::string>> testfiles({
        {"jxl/flower/flower.png.im_q85_444.jpg", "Q85YUV444"},
        {"jxl/flower/flower.png.im_q85_420.jpg", "Q85YUV420"},
        {"jxl/flower/flower.png.im_q85_420_progr.jpg", "Q85YUV420PROGR"},
        {"jxl/flower/flower.png.im_q85_420_R13B.jpg", "Q85YUV420R13B"},
    });
    for (const auto& it : testfiles) {
      for (size_t chunk_size : {0, 1, 64, 65536}) {
        for (size_t max_output_lines : {0, 1, 8, 16}) {
          TestConfig config;
          config.fn = it.first;
          config.fn_desc = it.second;
          config.chunk_size = chunk_size;
          config.max_output_lines = max_output_lines;
          all_tests.push_back(config);
        }
      }
    }
  }
  {
    std::vector<std::pair<std::string, std::string>> testfiles({
        {"jxl/flower/flower.png.im_q85_422.jpg", "Q85YUV422"},
        {"jxl/flower/flower.png.im_q85_440.jpg", "Q85YUV440"},
        {"jxl/flower/flower.png.im_q85_444_1x2.jpg", "Q85YUV444_1x2"},
        {"jxl/flower/flower.png.im_q85_asymmetric.jpg", "Q85Asymmetric"},
        {"jxl/flower/flower.png.im_q85_gray.jpg", "Q85Gray"},
        {"jxl/flower/flower.png.im_q85_luma_subsample.jpg", "Q85LumaSubsample"},
        {"jxl/flower/flower.png.im_q85_rgb.jpg", "Q85RGB"},
        {"jxl/flower/flower.png.im_q85_rgb_subsample_blue.jpg",
         "Q85RGBSubsampleBlue"},
    });
    for (const auto& it : testfiles) {
      for (size_t chunk_size : {0, 64}) {
        for (size_t max_output_lines : {0, 16}) {
          TestConfig config;
          config.fn = it.first;
          config.fn_desc = it.second;
          config.chunk_size = chunk_size;
          config.max_output_lines = max_output_lines;
          all_tests.push_back(config);
        }
      }
    }
  }
  return all_tests;
}

std::ostream& operator<<(std::ostream& os, const TestConfig& c) {
  os << c.fn_desc;
  if (c.chunk_size == 0) {
    os << "CompleteInput";
  } else {
    os << "InputChunks" << c.chunk_size;
  }
  if (c.max_output_lines == 0) {
    os << "CompleteOutput";
  } else {
    os << "OutputLines" << c.max_output_lines;
  }
  return os;
}

std::string TestDescription(
    const testing::TestParamInfo<DecodeJpegTestParam::ParamType>& info) {
  std::stringstream name;
  name << info.param;
  return name.str();
}

JXL_GTEST_INSTANTIATE_TEST_SUITE_P(DecodeJpegTest, DecodeJpegTestParam,
                                   testing::ValuesIn(GenerateTests()),
                                   TestDescription);

}  // namespace
}  // namespace extras
}  // namespace jxl
141
third_party/jpeg-xl/lib/extras/encode_jpeg.cc
vendored
@ -27,8 +27,9 @@ namespace jxl {
namespace extras {
namespace HWY_NAMESPACE {

void ComputeDCTCoefficients(const Image3F& opsin, const ImageF& qf,
                            const FrameDimensions& frame_dim, const float* qm,
void ComputeDCTCoefficients(const Image3F& opsin, const bool xyb,
                            const ImageF& qf, const FrameDimensions& frame_dim,
                            const float* qm,
                            std::vector<jpeg::JPEGComponent>* components) {
  int max_samp_factor = 1;
  for (const auto& c : *components) {
@ -75,7 +76,11 @@ void ComputeDCTCoefficients(const Image3F& opsin, const ImageF& qf,
          block[ix * 8 + iy] = cc;
        }
      }
      block[0] = std::round((2040 * dct[0] - 1024) * qmc[0]);
      if (xyb) {
        // ToXYB does not create zero-centered sample values like RgbToYcbcr
        // does, so we apply an offset to the DC values instead.
        block[0] = std::round((2040 * dct[0] - 1024) * qmc[0]);
      }
    }
  }
}
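The constants in the XYB branch above line up with the usual JPEG DC handling once the 8x8 DCT gain is factored in; a small sanity-check sketch (the assumption that XYB samples land roughly in [0, 1] is mine, not stated in the hunk):

// DC of an 8x8 DCT is 8x the block mean, so the 8-bit level shift of 128
// becomes 1024 at DC scale, and the full 8-bit range becomes 255 * 8 = 2040.
constexpr float kDctDcGain = 8.0f;
constexpr float kScale = kDctDcGain * 255.0f;       // 2040
constexpr float kLevelShift = kDctDcGain * 128.0f;  // 1024
static_assert(kScale == 2040.0f && kLevelShift == 1024.0f,
              "matches the constants used for block[0] above");

In other words, (2040 * dct[0] - 1024) maps an XYB DC value onto the same zero-centered range that RgbToYcbcr already produces on the YCbCr path, so that path needs no extra offset.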
@ -95,12 +100,7 @@ HWY_EXPORT(ComputeDCTCoefficients);

namespace {

std::vector<uint8_t> CreateXybICCAppMarker() {
  ColorEncoding c_xyb;
  c_xyb.SetColorSpace(ColorSpace::kXYB);
  c_xyb.rendering_intent = RenderingIntent::kPerceptual;
  JXL_CHECK(c_xyb.CreateICC());
  const auto& icc = c_xyb.ICC();
std::vector<uint8_t> CreateICCAppMarker(const PaddedBytes& icc) {
  std::vector<uint8_t> icc_marker(17 + icc.size());
  // See the APP2 marker format for embedded ICC profile at
  // https://www.color.org/technotes/ICC-Technote-ProfileEmbedding.pdf
@ -116,7 +116,15 @@ std::vector<uint8_t> CreateXybICCAppMarker() {
  return icc_marker;
}

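The 17 extra bytes reserved in CreateICCAppMarker match the standard APP2 ICC chunk header from the technote linked above. A hypothetical single-chunk builder, shown only to spell out that layout (it is not the library's implementation, and the assumption that app_data stores the 0xE2 marker id byte itself is mine):

#include <cstdint>
#include <cstring>
#include <vector>

std::vector<uint8_t> MakeSingleChunkIccApp2(const std::vector<uint8_t>& icc) {
  std::vector<uint8_t> m(17 + icc.size());
  m[0] = 0xE2;  // APP2 marker id (the leading 0xFF is added when serializing)
  // Two-byte big-endian segment length: everything after the 0xFF 0xE2 pair.
  const size_t seg_len = m.size() - 1;
  m[1] = static_cast<uint8_t>(seg_len >> 8);
  m[2] = static_cast<uint8_t>(seg_len & 0xFF);
  std::memcpy(&m[3], "ICC_PROFILE", 12);  // 11 chars plus the terminating NUL
  m[15] = 1;  // chunk sequence number (1-based)
  m[16] = 1;  // total number of chunks
  if (!icc.empty()) std::memcpy(&m[17], icc.data(), icc.size());
  return m;
}

Profiles larger than one APP2 segment would need the multi-chunk form (incrementing m[15] per chunk), which this sketch deliberately ignores.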
static constexpr float kBaseQuantMatrix[] = {
std::vector<uint8_t> CreateXybICCAppMarker() {
  ColorEncoding c_xyb;
  c_xyb.SetColorSpace(ColorSpace::kXYB);
  c_xyb.rendering_intent = RenderingIntent::kPerceptual;
  JXL_CHECK(c_xyb.CreateICC());
  return CreateICCAppMarker(c_xyb.ICC());
}

static constexpr float kBaseQuantMatrixXYB[] = {
    // c = 0
    0.010745695802f,
    0.014724285860f,
@ -314,9 +322,45 @@ static constexpr float kBaseQuantMatrix[] = {
    0.047241950370f,
};

void AddJpegQuantMatrices(const ImageF& qf, float dc_quant, float global_scale,
// Y: mozjpeg q99; Cb, Cr: mozjpeg q95
static constexpr float kBaseQuantMatrixYCbCr[] = {
    // c = 0
    1, 1, 1, 1, 1, 1, 1, 2,  //
    1, 1, 1, 1, 1, 1, 1, 2,  //
    1, 1, 1, 1, 1, 1, 2, 3,  //
    1, 1, 1, 1, 1, 1, 2, 3,  //
    1, 1, 1, 1, 1, 2, 3, 4,  //
    1, 1, 1, 1, 2, 2, 3, 5,  //
    1, 1, 2, 2, 3, 3, 5, 6,  //
    2, 2, 3, 3, 4, 5, 6, 8,  //

    // c = 1
    2, 2, 2, 2, 3, 4, 6, 9,  //
    2, 2, 2, 3, 3, 4, 5, 8,  //
    2, 2, 2, 3, 4, 6, 9, 14,  //
    2, 3, 3, 4, 5, 7, 11, 16,  //
    3, 3, 4, 5, 7, 9, 13, 19,  //
    4, 4, 6, 7, 9, 12, 17, 24,  //
    6, 5, 9, 11, 13, 17, 23, 31,  //
    9, 8, 14, 16, 19, 24, 31, 42,  //

    // c = 2
    2, 2, 2, 2, 3, 4, 6, 9,  //
    2, 2, 2, 3, 3, 4, 5, 8,  //
    2, 2, 2, 3, 4, 6, 9, 14,  //
    2, 3, 3, 4, 5, 7, 11, 16,  //
    3, 3, 4, 5, 7, 9, 13, 19,  //
    4, 4, 6, 7, 9, 12, 17, 24,  //
    6, 5, 9, 11, 13, 17, 23, 31,  //
    9, 8, 14, 16, 19, 24, 31, 42,  //
};

void AddJpegQuantMatrices(const ImageF& qf, bool xyb, float dc_quant,
                          float global_scale,
                          std::vector<jpeg::JPEGQuantTable>* quant_tables,
                          float* qm) {
  const float* const base_quant_matrix =
      xyb ? kBaseQuantMatrixXYB : kBaseQuantMatrixYCbCr;
  // Scale the base quant matrix based on the scaled XYB scales and the quant
  // field.
  float qfmin, qfmax;
@ -324,10 +368,10 @@ void AddJpegQuantMatrices(const ImageF& qf, float dc_quant, float global_scale,
  const float dc_scale = global_scale / dc_quant;
  const float ac_scale = global_scale / qfmax;
  for (size_t c = 0, ix = 0; c < 3; c++) {
    qm[ix] = dc_scale * kBaseQuantMatrix[ix];
    qm[ix] = dc_scale * base_quant_matrix[ix];
    ix++;
    for (size_t j = 1; j < kDCTBlockSize; j++, ix++) {
      qm[ix] = ac_scale * kBaseQuantMatrix[ix];
      qm[ix] = ac_scale * base_quant_matrix[ix];
    }
  }

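Read together, the two scale factors above define one effective quantization step per channel and frequency; a compact restatement (assuming qfmax is the maximum of the adaptive quant field computed just before this hunk, and kDCTBlockSize = 64 as used elsewhere in the file):

// Effective step for channel c, coefficient index k within the 8x8 block.
inline float EffectiveQuantStep(const float* base_quant_matrix, size_t c,
                                size_t k, float global_scale, float dc_quant,
                                float qfmax) {
  const float scale = global_scale / (k == 0 ? dc_quant : qfmax);
  return scale * base_quant_matrix[c * kDCTBlockSize + k];  // == qm[c * 64 + k]
}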
@ -514,26 +558,37 @@ void SetJpegHuffmanCode(const JpegClusteredHistograms& clusters,
}

void FillJPEGData(const Image3F& opsin, const ImageF& qf, float dc_quant,
                  float global_scale, const bool subsample_blue,
                  const FrameDimensions& frame_dim, jpeg::JPEGData* out) {
                  float global_scale, const bool xyb, const bool subsample_blue,
                  const PaddedBytes& icc, const FrameDimensions& frame_dim,
                  jpeg::JPEGData* out) {
  *out = jpeg::JPEGData();
  // ICC
  out->marker_order.push_back(0xe2);
  out->app_data.push_back(CreateXybICCAppMarker());
  if (xyb) {
    out->app_data.push_back(CreateXybICCAppMarker());
  } else {
    out->app_data.push_back(CreateICCAppMarker(icc));
  }

  // DQT
  out->marker_order.emplace_back(0xdb);
  float qm[3 * kDCTBlockSize];
  AddJpegQuantMatrices(qf, dc_quant, global_scale, &out->quant, qm);
  AddJpegQuantMatrices(qf, xyb, dc_quant, global_scale, &out->quant, qm);

  // SOF
  out->marker_order.emplace_back(0xc2);
  out->components.resize(3);
  out->height = frame_dim.ysize;
  out->width = frame_dim.xsize;
  out->components[0].id = 'R';
  out->components[1].id = 'G';
  out->components[2].id = 'B';
  if (xyb) {
    out->components[0].id = 'R';
    out->components[1].id = 'G';
    out->components[2].id = 'B';
  } else {
    out->components[0].id = 1;
    out->components[1].id = 2;
    out->components[2].id = 3;
  }
  size_t max_samp_factor = subsample_blue ? 2 : 1;
  for (size_t c = 0; c < 3; ++c) {
    const size_t factor = (subsample_blue && c == 2) ? 2 : 1;
@ -546,7 +601,7 @@ void FillJPEGData(const Image3F& opsin, const ImageF& qf, float dc_quant,
    out->components[c].quant_idx = c;
  }
  HWY_DYNAMIC_DISPATCH(ComputeDCTCoefficients)
  (opsin, qf, frame_dim, qm, &out->components);
  (opsin, xyb, qf, frame_dim, qm, &out->components);

  // DHT (the actual Huffman codes will be added later).
  out->marker_order.emplace_back(0xc4);
@ -635,9 +690,9 @@ size_t JpegSize(const jpeg::JPEGData& jpeg_data) {

}  // namespace

Status EncodeJpeg(const ImageBundle& input, size_t target_size, float distance,
Status EncodeJpeg(const ImageBundle& input, const JpegSettings& jpeg_settings,
                  ThreadPool* pool, std::vector<uint8_t>* compressed) {
  const bool subsample_blue = true;
  const bool subsample_blue = jpeg_settings.xyb;
  const size_t max_shift = subsample_blue ? 1 : 0;
  FrameDimensions frame_dim;
  frame_dim.Set(input.xsize(), input.ysize(), 1, max_shift, max_shift, false,
@ -651,17 +706,35 @@ Status EncodeJpeg(const ImageBundle& input, size_t target_size, float distance,

  // Compute adaptive quant field.
  ImageF mask;
  ImageF qf = InitialQuantField(distance, opsin, frame_dim, pool, 1.0, &mask);
  ScaleXYB(&opsin);
  ImageF qf = InitialQuantField(jpeg_settings.distance, opsin, frame_dim, pool,
                                1.0, &mask);
  if (jpeg_settings.xyb) {
    ScaleXYB(&opsin);
  } else {
    opsin.ShrinkTo(input.xsize(), input.ysize());
    JXL_RETURN_IF_ERROR(RgbToYcbcr(
        input.color().Plane(0), input.color().Plane(1), input.color().Plane(2),
        &opsin.Plane(0), &opsin.Plane(1), &opsin.Plane(2), pool));
    PadImageToBlockMultipleInPlace(&opsin, 8 << max_shift);
  }

  // Create jpeg data and optimize Huffman codes.
  jpeg::JPEGData jpeg_data;
  float global_scale = 0.66f;
  float dc_quant = InitialQuantDC(distance);
  FillJPEGData(opsin, qf, dc_quant, global_scale, subsample_blue, frame_dim,
               &jpeg_data);
  if (!jpeg_settings.xyb) {
    global_scale /= 500;
    if (input.metadata()->color_encoding.tf.IsPQ()) {
      global_scale *= .4f;
    } else if (input.metadata()->color_encoding.tf.IsHLG()) {
      global_scale *= .5f;
    }
  }
  float dc_quant = InitialQuantDC(jpeg_settings.distance);
  FillJPEGData(opsin, qf, dc_quant, global_scale, jpeg_settings.xyb,
               subsample_blue, input.metadata()->color_encoding.ICC(),
               frame_dim, &jpeg_data);

  if (target_size != 0) {
  if (jpeg_settings.target_size != 0) {
    // Tweak the jpeg data so that the resulting compressed file is
    // approximately target_size long.
    size_t prev_size = 0;
@ -670,7 +743,7 @@ Status EncodeJpeg(const ImageBundle& input, size_t target_size, float distance,
    size_t iter = 0;
    for (;;) {
      size_t size = JpegSize(jpeg_data);
      float error = size * 1.0f / target_size - 1.0f;
      float error = size * 1.0f / jpeg_settings.target_size - 1.0f;
      if (std::abs(error) < std::abs(best_error)) {
        best_error = error;
        best_global_scale = global_scale;
@ -679,13 +752,15 @@ Status EncodeJpeg(const ImageBundle& input, size_t target_size, float distance,
        break;
      }
      global_scale *= 1.0f + error;
      FillJPEGData(opsin, qf, dc_quant, global_scale, subsample_blue, frame_dim,
                   &jpeg_data);
      FillJPEGData(opsin, qf, dc_quant, global_scale, jpeg_settings.xyb,
                   subsample_blue, input.metadata()->color_encoding.ICC(),
                   frame_dim, &jpeg_data);
      prev_size = size;
      ++iter;
    }
    if (best_global_scale != global_scale) {
      FillJPEGData(opsin, qf, dc_quant, best_global_scale, subsample_blue,
      FillJPEGData(opsin, qf, dc_quant, best_global_scale, jpeg_settings.xyb,
                   subsample_blue, input.metadata()->color_encoding.ICC(),
                   frame_dim, &jpeg_data);
    }
  }

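The target_size loop above is a simple proportional feedback on the quantization scale. For instance, if JpegSize() reports 120 000 bytes against a 100 000-byte target, error = 120000 / 100000 - 1 = 0.2 and global_scale is multiplied by 1.2, so the next pass quantizes more coarsely and produces a smaller file; the scale with the smallest |error| seen so far is remembered and used for the final FillJPEGData call.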
8
third_party/jpeg-xl/lib/extras/encode_jpeg.h
vendored
@ -16,7 +16,13 @@
namespace jxl {
namespace extras {

Status EncodeJpeg(const ImageBundle& input, size_t target_size, float distance,
struct JpegSettings {
  bool xyb = true;
  size_t target_size = 0;
  float distance = 1.f;
};

Status EncodeJpeg(const ImageBundle& input, const JpegSettings& jpeg_settings,
                  ThreadPool* pool, std::vector<uint8_t>* compressed);

}  // namespace extras

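With the struct above, callers select the output flavour through JpegSettings rather than loose parameters. A minimal sketch of the new call (the include path and the helper name are illustrative; `bundle` and `pool` are assumed to be a populated jxl::ImageBundle and a jxl::ThreadPool* owned by the caller):

#include <vector>
#include "lib/extras/encode_jpeg.h"

jxl::Status EncodeToClassicJpeg(const jxl::ImageBundle& bundle,
                                jxl::ThreadPool* pool,
                                std::vector<uint8_t>* compressed) {
  jxl::extras::JpegSettings settings;
  settings.xyb = false;       // emit a standard YCbCr JPEG instead of XYB
  settings.distance = 1.5f;   // perceptual distance; lower means higher quality
  settings.target_size = 0;   // 0 disables the size-targeting loop
  return jxl::extras::EncodeJpeg(bundle, settings, pool, compressed);
}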
@ -58,7 +58,6 @@ Status ConvertPackedFrameToImageBundle(const JxlBasicInfo& info,

  JXL_RETURN_IF_ERROR(ConvertFromExternal(
      span, frame.color.xsize, frame.color.ysize, io.metadata.m.color_encoding,
      /*alpha_is_premultiplied=*/info.alpha_premultiplied,
      frame_bits_per_sample, frame.color.format, pool, bundle));

  bundle->extra_channels().resize(io.metadata.m.extra_channel_info.size());

68
third_party/jpeg-xl/lib/include/jxl/decode.h
vendored
@ -168,16 +168,6 @@ typedef enum {
   */
  JXL_DEC_NEED_PREVIEW_OUT_BUFFER = 3,

  /** The decoder is able to decode a DC image and requests setting a DC output
   * buffer using @ref JxlDecoderSetDCOutBuffer. This occurs if @ref
   * JXL_DEC_DC_IMAGE is requested and it is possible to decode a DC image from
   * the codestream and the DC out buffer was not yet set. This event re-occurs
   * for new frames if there are multiple animation frames.
   * @deprecated The DC feature in this form will be removed. For progressive
   * rendering, @ref JxlDecoderFlushImage should be used.
   */
  JXL_DEC_NEED_DC_OUT_BUFFER = 4,

  /** The decoder requests an output buffer to store the full resolution image,
   * which can be set with @ref JxlDecoderSetImageOutBuffer or with @ref
   * JxlDecoderSetImageOutCallback. This event re-occurs for new frames if
@ -260,28 +250,12 @@ typedef enum {
   */
  JXL_DEC_FRAME = 0x400,

  /** Informative event by @ref JxlDecoderProcessInput
   * "JxlDecoderProcessInput": DC image, 8x8 sub-sampled frame, decoded. It is
   * not guaranteed that the decoder will always return DC separately, but when
   * it does it will do so before outputting the full frame. @ref
   * JxlDecoderSetDCOutBuffer must be used after getting the basic image
   * information to be able to get the DC pixels, if not this return status only
   * indicates we're past this point in the codestream. This event occurs max
   * once per frame and always later than @ref JXL_DEC_FRAME and other header
   * events and earlier than full resolution pixel data.
   *
   * @deprecated The DC feature in this form will be removed. For progressive
   * rendering, @ref JxlDecoderFlushImage should be used.
   */
  JXL_DEC_DC_IMAGE = 0x800,

  /** Informative event by @ref JxlDecoderProcessInput
   * "JxlDecoderProcessInput": full frame (or layer, in case coalescing is
   * disabled) is decoded. @ref JxlDecoderSetImageOutBuffer must be used after
   * getting the basic image information to be able to get the image pixels, if
   * not this return status only indicates we're past this point in the
   * codestream. This event occurs max once per frame and always later than @ref
   * JXL_DEC_DC_IMAGE.
   * codestream. This event occurs max once per frame.
   * In this case, @ref JxlDecoderReleaseInput will return all bytes from the
   * end of the frame (or if @ref JXL_DEC_JPEG_RECONSTRUCTION is subscribed to,
   * from the end of the last box that is needed for jpeg reconstruction) as
@ -599,8 +573,6 @@ JXL_EXPORT JxlDecoderStatus JxlDecoderSetCoalescing(JxlDecoder* dec,
 * available and this informative event is subscribed to.
 * @return @ref JXL_DEC_PREVIEW_IMAGE when preview pixel information is
 * available and output in the preview buffer.
 * @return @ref JXL_DEC_DC_IMAGE when DC pixel information (8x8 downscaled
 * version of the image) is available and output is in the DC buffer.
 * @return @ref JXL_DEC_FULL_IMAGE when all pixel information at highest detail
 * is available and has been output in the pixel buffer.
 */
@ -992,44 +964,6 @@ JXL_EXPORT JxlDecoderStatus JxlDecoderGetFrameName(const JxlDecoder* dec,
JXL_EXPORT JxlDecoderStatus JxlDecoderGetExtraChannelBlendInfo(
    const JxlDecoder* dec, size_t index, JxlBlendInfo* blend_info);

/**
 * Returns the minimum size in bytes of the DC image output buffer
 * for the given format. This is the buffer for @ref JxlDecoderSetDCOutBuffer.
 * Requires the basic image information is available in the decoder.
 *
 * @param dec decoder object
 * @param format format of pixels
 * @param size output value, buffer size in bytes
 * @return @ref JXL_DEC_SUCCESS on success, @ref JXL_DEC_ERROR on error, such as
 * information not available yet.
 *
 * @deprecated The DC feature in this form will be removed. Use @ref
 * JxlDecoderFlushImage for progressive rendering.
 */
JXL_DEPRECATED JXL_EXPORT JxlDecoderStatus JxlDecoderDCOutBufferSize(
    const JxlDecoder* dec, const JxlPixelFormat* format, size_t* size);

/**
 * Sets the buffer to write the lower resolution (8x8 sub-sampled) DC image
 * to. The size of the buffer must be at least as large as given by @ref
 * JxlDecoderDCOutBufferSize. The buffer follows the format described by
 * JxlPixelFormat. The DC image has dimensions ceil(xsize / 8) * ceil(ysize /
 * 8). The buffer is owned by the caller.
 *
 * @param dec decoder object
 * @param format format of pixels. Object owned by user and its contents are
 * copied internally.
 * @param buffer buffer type to output the pixel data to
 * @param size size of buffer in bytes
 * @return @ref JXL_DEC_SUCCESS on success, @ref JXL_DEC_ERROR on error, such as
 * size too small.
 *
 * @deprecated The DC feature in this form will be removed. Use @ref
 * JxlDecoderFlushImage for progressive rendering.
 */
JXL_DEPRECATED JXL_EXPORT JxlDecoderStatus JxlDecoderSetDCOutBuffer(
    JxlDecoder* dec, const JxlPixelFormat* format, void* buffer, size_t size);

/**
 * Returns the minimum size in bytes of the image output pixel buffer for the
 * given format. This is the buffer for @ref JxlDecoderSetImageOutBuffer.
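The deprecation notes above repeatedly point at JxlDecoderFlushImage as the replacement for the DC-image path. A rough sketch of that flush-based flow, kept deliberately small (buffer management and multi-frame handling are omitted; `file` holds however much of the codestream is currently available):

#include <jxl/decode_cxx.h>
#include <vector>

bool DecodeWithProgressivePreview(const std::vector<uint8_t>& file,
                                  std::vector<uint8_t>* rgba) {
  JxlDecoderPtr dec = JxlDecoderMake(nullptr);
  if (JXL_DEC_SUCCESS !=
      JxlDecoderSubscribeEvents(dec.get(), JXL_DEC_FULL_IMAGE)) {
    return false;
  }
  JxlDecoderSetInput(dec.get(), file.data(), file.size());
  JxlPixelFormat format = {4, JXL_TYPE_UINT8, JXL_NATIVE_ENDIAN, 0};
  for (;;) {
    JxlDecoderStatus status = JxlDecoderProcessInput(dec.get());
    if (status == JXL_DEC_NEED_IMAGE_OUT_BUFFER) {
      size_t buffer_size;
      if (JXL_DEC_SUCCESS !=
          JxlDecoderImageOutBufferSize(dec.get(), &format, &buffer_size)) {
        return false;
      }
      rgba->resize(buffer_size);
      if (JXL_DEC_SUCCESS !=
          JxlDecoderSetImageOutBuffer(dec.get(), &format, rgba->data(),
                                      rgba->size())) {
        return false;
      }
    } else if (status == JXL_DEC_NEED_MORE_INPUT) {
      // Out of input: flush whatever has been decoded so far into the output
      // buffer instead of relying on the removed DC-image events.
      return JXL_DEC_SUCCESS == JxlDecoderFlushImage(dec.get());
    } else if (status == JXL_DEC_FULL_IMAGE || status == JXL_DEC_SUCCESS) {
      return true;
    } else {
      return false;  // JXL_DEC_ERROR or an unexpected event
    }
  }
}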
|
Some files were not shown because too many files have changed in this diff Show More
Loading…
Reference in New Issue
Block a user