Bug 1807473 - Update libjxl and highway r=tnikkel

Differential Revision: https://phabricator.services.mozilla.com/D166317
Authored by Kagami Sascha Rosylight, 2023-01-09 16:54:44 +00:00
parent bb9d3ed10b
commit c9046ede50
195 changed files with 17395 additions and 7868 deletions


@ -20,11 +20,11 @@ origin:
# Human-readable identifier for this version/release
# Generally "version NNN", "tag SSS", "bookmark SSS"
release: 22e3d7276f4157d4a47586ba9fd91dd6303f441a
release: f670ea580bb70b4113b63b9cdaa42ba9b10cd13a (2022-11-18T10:04:25Z).
# Revision to pull in
# Must be a long or short commit SHA (long preferred)
revision: 22e3d7276f4157d4a47586ba9fd91dd6303f441a
revision: f670ea580bb70b4113b63b9cdaa42ba9b10cd13a
# The package's license, where possible using the mnemonic from
# https://spdx.org/licenses/


@ -10,9 +10,9 @@ origin:
url: https://github.com/libjxl/libjxl
release: afa493d9c7c8b47b6ce709180a74a49085291776 (2022-11-12T22:27:21Z).
release: 31e38dae584bae991631750ed6a04f1f6323846a (2023-01-09T10:57:58Z).
revision: afa493d9c7c8b47b6ce709180a74a49085291776
revision: 31e38dae584bae991631750ed6a04f1f6323846a
license: Apache-2.0


@ -161,6 +161,8 @@ cc_library(
# These are textual because config macros influence them:
"hwy/detect_targets.h", # private
"hwy/targets.h",
# This .cc file #includes itself through foreach_target.h
"hwy/per_target.cc",
# End of list
"hwy/highway.h", # public
"hwy/foreach_target.h", # public
@ -179,7 +181,10 @@ cc_library(
"hwy/ops/x86_512-inl.h",
# Select avoids recompiling native arch if only non-native changed
] + select({
":compiler_emscripten": ["hwy/ops/wasm_128-inl.h"],
":compiler_emscripten": [
"hwy/ops/wasm_128-inl.h",
"hwy/ops/wasm_256-inl.h",
],
"//conditions:default": [],
}) + select({
"@platforms//cpu:riscv64": ["hwy/ops/rvv-inl.h"],
@ -201,6 +206,18 @@ cc_library(
],
)
cc_library(
name = "bit_pack",
compatible_with = [],
copts = COPTS,
textual_hdrs = [
"hwy/contrib/bit_pack/bit_pack-inl.h",
],
deps = [
":hwy",
],
)
cc_library(
name = "dot",
compatible_with = [],
@ -303,6 +320,7 @@ HWY_TESTS = [
("hwy/contrib/algo/", "copy_test"),
("hwy/contrib/algo/", "find_test"),
("hwy/contrib/algo/", "transform_test"),
("hwy/contrib/bit_pack/", "bit_pack_test"),
("hwy/contrib/dot/", "dot_test"),
("hwy/contrib/image/", "image_test"),
("hwy/contrib/math/", "math_test"),
@ -349,6 +367,7 @@ HWY_TEST_COPTS = select({
HWY_TEST_DEPS = [
":algo",
":bit_pack",
":dot",
":hwy",
":hwy_test_util",


@ -19,7 +19,13 @@ if(POLICY CMP0083)
cmake_policy(SET CMP0083 NEW)
endif()
project(hwy VERSION 1.0.1) # Keep in sync with highway.h version
# Workaround for 3.19 raising error 'IMPORTED_LOCATION not set for imported
# target "GTest::gtest_main"'.
if(POLICY CMP0111)
cmake_policy(SET CMP0111 OLD)
endif()
project(hwy VERSION 1.0.2) # Keep in sync with highway.h version
# Directly define the ABI version from the cmake project() version values:
set(LIBRARY_VERSION "${hwy_VERSION}")
@ -27,6 +33,10 @@ set(LIBRARY_SOVERSION ${hwy_VERSION_MAJOR})
set(CMAKE_CXX_EXTENSIONS OFF)
list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake")
# Search for Atomics implementation:
find_package(Atomics REQUIRED)
# Enabled PIE binaries by default if supported.
include(CheckPIESupported OPTIONAL RESULT_VARIABLE CHECK_PIE_SUPPORTED)
if(CHECK_PIE_SUPPORTED)
@ -51,6 +61,7 @@ set(HWY_WARNINGS_ARE_ERRORS OFF CACHE BOOL "Add -Werror flag?")
set(HWY_ENABLE_CONTRIB ON CACHE BOOL "Include contrib/")
set(HWY_ENABLE_EXAMPLES ON CACHE BOOL "Build examples")
set(HWY_ENABLE_INSTALL ON CACHE BOOL "Install library")
set(HWY_ENABLE_TESTS ON CACHE BOOL "Enable HWY tests")
include(CheckCXXSourceCompiles)
check_cxx_source_compiles(
@ -111,6 +122,7 @@ set(HWY_SOURCES
hwy/ops/arm_sve-inl.h
hwy/ops/emu128-inl.h
hwy/ops/generic_ops-inl.h
hwy/ops/rvv-inl.h
hwy/ops/scalar-inl.h
hwy/ops/set_macros-inl.h
hwy/ops/shared-inl.h
@ -225,8 +237,11 @@ else()
endif() # HWY_CMAKE_ARM7
if(HWY_RISCV)
list(APPEND HWY_FLAGS -march=rv64gcv1p0)
if(${CMAKE_CXX_COMPILER_ID} MATCHES "Clang")
# Not yet supported by GCC. When runtime dispatch is supported and
# implemented, we will remove v from the required flags. Until then, using
# clang for RISC-V will require the CPU to support the V extension (1.0).
list(APPEND HWY_FLAGS -march=rv64gcv1p0)
list(APPEND HWY_FLAGS -menable-experimental-extensions)
endif()
endif()
@ -277,16 +292,29 @@ target_include_directories(hwy PUBLIC
target_compile_features(hwy PUBLIC cxx_std_11)
set_target_properties(hwy PROPERTIES
LINK_DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/hwy/hwy.version)
# For GCC __atomic_store_8, see #887
target_link_libraries(hwy PRIVATE ${ATOMICS_LIBRARIES})
# not supported by MSVC/Clang, safe to skip (we use DLLEXPORT annotations)
if(UNIX AND NOT APPLE)
if(NOT HWY_EMSCRIPTEN)
# For GCC __atomic_store_8, see #887
target_link_libraries(hwy atomic)
endif()
# not supported by MSVC/Clang, safe to skip (we use DLLEXPORT annotations)
set_property(TARGET hwy APPEND_STRING PROPERTY
LINK_FLAGS " -Wl,--version-script=${CMAKE_CURRENT_SOURCE_DIR}/hwy/hwy.version")
endif()
if (CMAKE_SYSTEM_PROCESSOR MATCHES "unknown")
# uname -p is broken on this system. Try uname -m
EXECUTE_PROCESS( COMMAND uname -m
OUTPUT_STRIP_TRAILING_WHITESPACE
ERROR_QUIET
OUTPUT_VARIABLE HWY_ARCH)
else (CMAKE_SYSTEM_PROCESSOR MATCHES "unknown")
set(HWY_ARCH ${CMAKE_SYSTEM_PROCESSOR})
endif (CMAKE_SYSTEM_PROCESSOR MATCHES "unknown")
message(STATUS "Architecture: " ${HWY_ARCH})
if (HWY_ARCH MATCHES "mips")
target_link_options(hwy PUBLIC "LINKER:-z,noexecstack")
endif (HWY_ARCH MATCHES "mips")
if (HWY_ENABLE_CONTRIB)
add_library(hwy_contrib ${HWY_LIBRARY_TYPE} ${HWY_CONTRIB_SOURCES})
target_link_libraries(hwy_contrib hwy)
@ -426,7 +454,7 @@ endif() # HWY_ENABLE_EXAMPLES
include(CTest)
if(BUILD_TESTING)
if(BUILD_TESTING AND HWY_ENABLE_TESTS)
enable_testing()
include(GoogleTest)
@ -458,13 +486,6 @@ set(gtest_force_shared_crt ON CACHE BOOL "" FORCE)
add_subdirectory(${CMAKE_CURRENT_BINARY_DIR}/googletest-src
${CMAKE_CURRENT_BINARY_DIR}/googletest-build
EXCLUDE_FROM_ALL)
# The gtest/gtest_main targets carry header search path
# dependencies automatically when using CMake 2.8.11 or
# later. Otherwise we have to add them here ourselves.
if (CMAKE_VERSION VERSION_LESS 2.8.11)
include_directories("${gtest_SOURCE_DIR}/include")
endif()
endif() # HWY_SYSTEM_GTEST
set(HWY_TEST_FILES
@ -517,7 +538,11 @@ list(APPEND HWY_TEST_FILES
endif() # HWY_ENABLE_CONTRIB
if(HWY_SYSTEM_GTEST)
set(HWY_GTEST_LIBS GTest::GTest GTest::Main)
if (CMAKE_VERSION VERSION_LESS 3.20)
set(HWY_GTEST_LIBS GTest::GTest GTest::Main)
else()
set(HWY_GTEST_LIBS GTest::gtest GTest::gtest_main)
endif()
else()
set(HWY_GTEST_LIBS gtest gtest_main)
endif()
@ -534,7 +559,9 @@ foreach (TESTFILE IN LISTS HWY_TEST_FILES)
# that include us may set them.
target_compile_options(${TESTNAME} PRIVATE -DHWY_IS_TEST=1)
target_link_libraries(${TESTNAME} ${HWY_TEST_LIBS} ${HWY_GTEST_LIBS})
target_link_libraries(${TESTNAME} PRIVATE ${HWY_TEST_LIBS} ${HWY_GTEST_LIBS})
# For GCC __atomic_store_8, see #887
target_link_libraries(${TESTNAME} PRIVATE ${ATOMICS_LIBRARIES})
# Output test targets in the test directory.
set_target_properties(${TESTNAME} PROPERTIES RUNTIME_OUTPUT_DIRECTORY "tests")


@ -55,7 +55,8 @@ layouts, and aligned/padded allocations.
Online demos using Compiler Explorer:
- [generating code for multiple targets](https://gcc.godbolt.org/z/n6rx6xK5h) (recommended)
- [multiple targets with dynamic dispatch](https://gcc.godbolt.org/z/zP7MYe9Yf)
(recommended)
- [single target using -m flags](https://gcc.godbolt.org/z/rGnjMevKG)
Projects using Highway: (to add yours, feel free to raise an issue or contact us
@ -74,6 +75,10 @@ Supported targets: scalar, S-SSE3, SSE4, AVX2, AVX-512, AVX3_DL (~Icelake,
requires opt-in by defining `HWY_WANT_AVX3_DL`), NEON (ARMv7 and v8), SVE, SVE2,
WASM SIMD, RISC-V V.
`HWY_WASM_EMU256` is a 2x unrolled version of wasm128 and is enabled if
`HWY_WANT_WASM2` is defined. This will remain supported until it is potentially
superseded by a future version of WASM.
SVE was initially tested using farm_sve (see acknowledgments).
### Versioning
@ -134,6 +139,10 @@ Or you can run `run_tests.sh` (`run_tests.bat` on Windows).
Bazel is also supported for building, but it is not as widely used/tested.
When building for Arm v7, a limitation of current compilers requires you to add
`-DHWY_CMAKE_ARM7:BOOL=ON` to the CMake command line; see #834 and #1032. We
understand that work is underway to remove this limitation.
## Quick start
You can use the `benchmark` inside examples/ as a starting point.
@ -142,6 +151,9 @@ A [quick-reference page](g3doc/quick_reference.md) briefly lists all operations
and their parameters, and the [instruction_matrix](g3doc/instruction_matrix.pdf)
indicates the number of instructions per operation.
The [FAQ](g3doc/faq.md) answers questions about portability, API design and
where to find more information.
We recommend using full SIMD vectors whenever possible for maximum performance
portability. To obtain them, pass a `ScalableTag<float>` (or equivalently
`HWY_FULL(float)`) tag to functions such as `Zero/Set/Load`. There are two
@ -163,8 +175,8 @@ Due to ADL restrictions, user code calling Highway ops must either:
hn::Add()`; or
* add using-declarations for each op used: `using hwy::HWY_NAMESPACE::Add;`.
Additionally, each function that calls Highway ops must either be prefixed with
`HWY_ATTR`, OR reside between `HWY_BEFORE_NAMESPACE()` and
Additionally, each function that calls Highway ops (such as `Load`) must either
be prefixed with `HWY_ATTR`, OR reside between `HWY_BEFORE_NAMESPACE()` and
`HWY_AFTER_NAMESPACE()`. Lambda functions currently require `HWY_ATTR` before
their opening brace.
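For example, a minimal sketch of a function written against these rules (the function and array names are illustrative, not part of Highway):

```
#include "hwy/highway.h"
namespace hn = hwy::HWY_NAMESPACE;

// HWY_ATTR enables the compiled-for target's instructions for this function.
HWY_ATTR void AddOne(const float* HWY_RESTRICT in, float* HWY_RESTRICT out,
                     size_t size) {
  const hn::ScalableTag<float> d;  // full (scalable) vector of float
  const size_t N = hn::Lanes(d);
  // Remainder handling omitted for brevity.
  for (size_t i = 0; i + N <= size; i += N) {
    hn::Store(hn::Add(hn::Load(d, in + i), hn::Set(d, 1.0f)), d, out + i);
  }
}
```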
@ -186,6 +198,27 @@ they use static or dynamic dispatch.
[quick-reference](g3doc/quick_reference.md)) if `HWY_TARGET_INCLUDE` is
defined and `foreach_target.h` is included.
When using dynamic dispatch, `foreach_target.h` is included from translation
units (.cc files), not headers. Headers containing vector code shared between
several translation units require a special include guard, for example the
following taken from `examples/skeleton-inl.h`:
```
#if defined(HIGHWAY_HWY_EXAMPLES_SKELETON_INL_H_) == defined(HWY_TARGET_TOGGLE)
#ifdef HIGHWAY_HWY_EXAMPLES_SKELETON_INL_H_
#undef HIGHWAY_HWY_EXAMPLES_SKELETON_INL_H_
#else
#define HIGHWAY_HWY_EXAMPLES_SKELETON_INL_H_
#endif
#include "hwy/highway.h"
// Your vector code
#endif
```
By convention, we name such headers `-inl.h` because their contents (often
function templates) are usually inlined.
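The translation-unit side then follows the same pattern as `examples/skeleton.cc` (and the vqsort_*.cc files below). A rough sketch, with illustrative names and an assumed file path:

```
// my_module.cc (hypothetical path; HWY_TARGET_INCLUDE must name this file).
#undef HWY_TARGET_INCLUDE
#define HWY_TARGET_INCLUDE "my_module.cc"
#include "hwy/foreach_target.h"  // re-includes this file once per target
// Must come after foreach_target.h:
#include "hwy/highway.h"

HWY_BEFORE_NAMESPACE();
namespace project {
namespace HWY_NAMESPACE {
namespace hn = hwy::HWY_NAMESPACE;

void MulByTwo(float* HWY_RESTRICT p, size_t size) {
  const hn::ScalableTag<float> d;
  for (size_t i = 0; i + hn::Lanes(d) <= size; i += hn::Lanes(d)) {
    const auto v = hn::Load(d, p + i);
    hn::Store(hn::Add(v, v), d, p + i);
  }
}

}  // namespace HWY_NAMESPACE
}  // namespace project
HWY_AFTER_NAMESPACE();

#if HWY_ONCE
namespace project {
HWY_EXPORT(MulByTwo);  // table of per-target pointers, filled by each pass
void CallMulByTwo(float* p, size_t size) {
  HWY_DYNAMIC_DISPATCH(MulByTwo)(p, size);  // selects the best target once
}
}  // namespace project
#endif  // HWY_ONCE
```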
## Compiler flags
Applications should be compiled with optimizations enabled - without inlining,


@ -0,0 +1,56 @@
# Original issue:
# * https://gitlab.kitware.com/cmake/cmake/-/issues/23021#note_1098733
#
# For reference:
# * https://gcc.gnu.org/wiki/Atomic/GCCMM
#
# riscv64 specific:
# * https://lists.debian.org/debian-riscv/2022/01/msg00009.html
#
# ATOMICS_FOUND - system has c++ atomics
# ATOMICS_LIBRARIES - libraries needed to use c++ atomics
include(CheckCXXSourceCompiles)
# RISC-V only has 32-bit and 64-bit atomic instructions. GCC is supposed
# to convert smaller atomics to those larger ones via masking and
# shifting like LLVM, but it's a known bug that it does not. This means
# anything that wants to use atomics on 1-byte or 2-byte types needs
# -latomic, but not 4-byte or 8-byte (though it does no harm).
set(atomic_code
"
#include <atomic>
#include <cstdint>
std::atomic<uint8_t> n8 (0); // riscv64
std::atomic<uint64_t> n64 (0); // armel, mipsel, powerpc
int main() {
++n8;
++n64;
return 0;
}")
# https://gitlab.kitware.com/cmake/cmake/-/issues/24063
set(CMAKE_CXX_STANDARD 11)
check_cxx_source_compiles("${atomic_code}" ATOMICS_LOCK_FREE_INSTRUCTIONS)
if(ATOMICS_LOCK_FREE_INSTRUCTIONS)
set(ATOMICS_FOUND TRUE)
set(ATOMICS_LIBRARIES)
else()
set(CMAKE_REQUIRED_LIBRARIES "-latomic")
check_cxx_source_compiles("${atomic_code}" ATOMICS_IN_LIBRARY)
set(CMAKE_REQUIRED_LIBRARIES)
if(ATOMICS_IN_LIBRARY)
set(ATOMICS_LIBRARY atomic)
include(FindPackageHandleStandardArgs)
find_package_handle_standard_args(Atomics DEFAULT_MSG ATOMICS_LIBRARY)
set(ATOMICS_LIBRARIES ${ATOMICS_LIBRARY})
unset(ATOMICS_LIBRARY)
else()
if(Atomics_FIND_REQUIRED)
message(FATAL_ERROR "Neither lock free instructions nor -latomic found.")
endif()
endif()
endif()
unset(atomic_code)
unset(CMAKE_CXX_STANDARD)


@ -1,3 +1,18 @@
highway (1.0.2-1) UNRELEASED; urgency=medium
* Add ExclusiveNeither, FindKnownFirstTrue, Ne128
* Add 16-bit SumOfLanes/ReorderWidenMulAccumulate/ReorderDemote2To
* Faster sort for low-entropy input, improved pivot selection
* Add GN build system, Highway FAQ, k32v32 type to vqsort
* CMake: Support find_package(GTest), add rvv-inl.h, add HWY_ENABLE_TESTS
* Fix MIPS and C++20 build, Apple LLVM 10.3 detection, EMU128 AllTrue on RVV
* Fix missing exec_prefix, RVV build, warnings, libatomic linking
* Work around GCC 10.4 issue, disabled RDCYCLE, arm7 with vfpv3
* Documentation/example improvements
* Support static dispatch to SVE2_128 and SVE_256
-- Jan Wassenberg <janwas@google.com> Thu, 27 Oct 2022 17:00:00 +0200
highway (1.0.1-1) UNRELEASED; urgency=medium
* Add Eq128, i64 Mul, unsigned->float ConvertTo

third_party/highway/hwy.gni (new vendored file)

@ -0,0 +1,53 @@
_hwy = get_path_info("hwy", "abspath")
hwy_public = [
# Public
"$_hwy/aligned_allocator.h",
"$_hwy/base.h",
"$_hwy/cache_control.h",
"$_hwy/per_target.h",
"$_hwy/print.h",
# Public, textual
"$_hwy/foreach_target.h",
"$_hwy/highway_export.h",
"$_hwy/highway.h",
"$_hwy/print-inl.h",
# Private
"$_hwy/detect_compiler_arch.h",
"$_hwy/detect_targets.h",
"$_hwy/targets.h",
# Private, textual:
"$_hwy/ops/arm_neon-inl.h",
"$_hwy/ops/arm_sve-inl.h",
"$_hwy/ops/emu128-inl.h",
"$_hwy/ops/generic_ops-inl.h",
"$_hwy/ops/scalar-inl.h",
"$_hwy/ops/set_macros-inl.h",
"$_hwy/ops/shared-inl.h",
"$_hwy/ops/x86_128-inl.h",
"$_hwy/ops/x86_256-inl.h",
"$_hwy/ops/x86_512-inl.h",
]
hwy_sources = [
"$_hwy/aligned_allocator.cc",
"$_hwy/per_target.cc",
"$_hwy/print.cc",
"$_hwy/targets.cc",
]
hwy_contrib_public = [
"$_hwy/contrib/algo/copy-inl.h",
"$_hwy/contrib/algo/find-inl.h",
"$_hwy/contrib/algo/transform-inl.h",
"$_hwy/contrib/dot/dot-inl.h",
"$_hwy/contrib/image/image.h",
"$_hwy/contrib/math/math-inl.h",
]
hwy_contrib_sources = [
"$_hwy/contrib/image/image.cc",
]


@ -48,7 +48,7 @@ class SampleObject {
class FakeAllocator {
public:
// static AllocPtr and FreePtr member to be used with the alligned
// static AllocPtr and FreePtr member to be used with the aligned
// allocator. These functions calls the private non-static members.
static void* StaticAlloc(void* opaque, size_t bytes) {
return reinterpret_cast<FakeAllocator*>(opaque)->Alloc(bytes);


@ -143,7 +143,7 @@
#define HWY_DEFAULT_UNROLL HWY_UNROLL()
#else
#define HWY_UNROLL(factor)
#define HWY_DEFAULT_UNROLL HWY_UNROLL()
#define HWY_DEFAULT_UNROLL
#endif
@ -293,6 +293,13 @@ struct alignas(16) K64V64 {
uint64_t key;
};
// 32 bit key plus 32 bit value. Allows vqsort recursions to terminate earlier
// than when considering both to be a 64-bit key.
struct alignas(8) K32V32 {
uint32_t value; // little-endian layout
uint32_t key;
};
#pragma pack(pop)
static inline HWY_MAYBE_UNUSED bool operator<(const uint128_t& a,
@ -304,6 +311,10 @@ static inline HWY_MAYBE_UNUSED bool operator>(const uint128_t& a,
const uint128_t& b) {
return b < a;
}
static inline HWY_MAYBE_UNUSED bool operator==(const uint128_t& a,
const uint128_t& b) {
return a.lo == b.lo && a.hi == b.hi;
}
static inline HWY_MAYBE_UNUSED bool operator<(const K64V64& a,
const K64V64& b) {
@ -314,6 +325,24 @@ static inline HWY_MAYBE_UNUSED bool operator>(const K64V64& a,
const K64V64& b) {
return b < a;
}
static inline HWY_MAYBE_UNUSED bool operator==(const K64V64& a,
const K64V64& b) {
return a.key == b.key;
}
static inline HWY_MAYBE_UNUSED bool operator<(const K32V32& a,
const K32V32& b) {
return a.key < b.key;
}
// Required for std::greater.
static inline HWY_MAYBE_UNUSED bool operator>(const K32V32& a,
const K32V32& b) {
return b < a;
}
static inline HWY_MAYBE_UNUSED bool operator==(const K32V32& a,
const K32V32& b) {
return a.key == b.key;
}
//------------------------------------------------------------------------------
// Controlling overload resolution (SFINAE)
@ -369,6 +398,8 @@ HWY_API constexpr bool IsSame() {
hwy::EnableIf<sizeof(T) != (bytes)>* = nullptr
#define HWY_IF_LANE_SIZE_LT(T, bytes) \
hwy::EnableIf<sizeof(T) < (bytes)>* = nullptr
#define HWY_IF_LANE_SIZE_GE(T, bytes) \
hwy::EnableIf<sizeof(T) >= (bytes)>* = nullptr
#define HWY_IF_LANES_PER_BLOCK(T, N, LANES) \
hwy::EnableIf<HWY_MIN(sizeof(T) * N, 16) / sizeof(T) == (LANES)>* = nullptr
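As an illustrative (non-upstream) sketch, the new `HWY_IF_LANE_SIZE_GE` gates an overload on lane width the same way the existing macros do:

```
// Hypothetical helpers, not part of Highway: chosen by SFINAE on sizeof(T).
template <typename T, HWY_IF_LANE_SIZE_GE(T, 4)>
constexpr size_t HalfLaneBytes() {
  return sizeof(T) / 2;  // selected for 4- and 8-byte lanes
}
template <typename T, HWY_IF_LANE_SIZE_LT(T, 4)>
constexpr size_t HalfLaneBytes() {
  return 1;  // selected for 1- and 2-byte lanes
}
static_assert(HalfLaneBytes<uint64_t>() == 4, "wide lanes use the GE overload");
static_assert(HalfLaneBytes<uint8_t>() == 1, "narrow lanes use the LT overload");
```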
@ -401,16 +432,14 @@ struct Relations<uint8_t> {
using Unsigned = uint8_t;
using Signed = int8_t;
using Wide = uint16_t;
enum { is_signed = 0 };
enum { is_float = 0 };
enum { is_signed = 0, is_float = 0 };
};
template <>
struct Relations<int8_t> {
using Unsigned = uint8_t;
using Signed = int8_t;
using Wide = int16_t;
enum { is_signed = 1 };
enum { is_float = 0 };
enum { is_signed = 1, is_float = 0 };
};
template <>
struct Relations<uint16_t> {
@ -418,8 +447,7 @@ struct Relations<uint16_t> {
using Signed = int16_t;
using Wide = uint32_t;
using Narrow = uint8_t;
enum { is_signed = 0 };
enum { is_float = 0 };
enum { is_signed = 0, is_float = 0 };
};
template <>
struct Relations<int16_t> {
@ -427,8 +455,7 @@ struct Relations<int16_t> {
using Signed = int16_t;
using Wide = int32_t;
using Narrow = int8_t;
enum { is_signed = 1 };
enum { is_float = 0 };
enum { is_signed = 1, is_float = 0 };
};
template <>
struct Relations<uint32_t> {
@ -437,8 +464,7 @@ struct Relations<uint32_t> {
using Float = float;
using Wide = uint64_t;
using Narrow = uint16_t;
enum { is_signed = 0 };
enum { is_float = 0 };
enum { is_signed = 0, is_float = 0 };
};
template <>
struct Relations<int32_t> {
@ -447,8 +473,7 @@ struct Relations<int32_t> {
using Float = float;
using Wide = int64_t;
using Narrow = int16_t;
enum { is_signed = 1 };
enum { is_float = 0 };
enum { is_signed = 1, is_float = 0 };
};
template <>
struct Relations<uint64_t> {
@ -457,8 +482,7 @@ struct Relations<uint64_t> {
using Float = double;
using Wide = uint128_t;
using Narrow = uint32_t;
enum { is_signed = 0 };
enum { is_float = 0 };
enum { is_signed = 0, is_float = 0 };
};
template <>
struct Relations<int64_t> {
@ -466,15 +490,13 @@ struct Relations<int64_t> {
using Signed = int64_t;
using Float = double;
using Narrow = int32_t;
enum { is_signed = 1 };
enum { is_float = 0 };
enum { is_signed = 1, is_float = 0 };
};
template <>
struct Relations<uint128_t> {
using Unsigned = uint128_t;
using Narrow = uint64_t;
enum { is_signed = 0 };
enum { is_float = 0 };
enum { is_signed = 0, is_float = 0 };
};
template <>
struct Relations<float16_t> {
@ -482,16 +504,14 @@ struct Relations<float16_t> {
using Signed = int16_t;
using Float = float16_t;
using Wide = float;
enum { is_signed = 1 };
enum { is_float = 1 };
enum { is_signed = 1, is_float = 1 };
};
template <>
struct Relations<bfloat16_t> {
using Unsigned = uint16_t;
using Signed = int16_t;
using Wide = float;
enum { is_signed = 1 };
enum { is_float = 1 };
enum { is_signed = 1, is_float = 1 };
};
template <>
struct Relations<float> {
@ -500,8 +520,7 @@ struct Relations<float> {
using Float = float;
using Wide = double;
using Narrow = float16_t;
enum { is_signed = 1 };
enum { is_float = 1 };
enum { is_signed = 1, is_float = 1 };
};
template <>
struct Relations<double> {
@ -509,8 +528,7 @@ struct Relations<double> {
using Signed = int64_t;
using Float = double;
using Narrow = float;
enum { is_signed = 1 };
enum { is_float = 1 };
enum { is_signed = 1, is_float = 1 };
};
template <size_t N>
@ -649,6 +667,20 @@ constexpr double HighestValue<double>() {
return 1.7976931348623158e+308;
}
// Difference between 1.0 and the next representable value.
template <typename T>
HWY_API constexpr T Epsilon() {
return 1;
}
template <>
constexpr float Epsilon<float>() {
return 1.192092896e-7f;
}
template <>
constexpr double Epsilon<double>() {
return 2.2204460492503131e-16;
}
// Returns width in bits of the mantissa field in IEEE binary32/64.
template <typename T>
constexpr int MantissaBits() {


@ -13,7 +13,7 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include <algorithm>
#include <algorithm> // std::find_if
#include <vector>
#include "hwy/aligned_allocator.h"

File diff suppressed because it is too large.


@ -0,0 +1,177 @@
// Copyright 2022 Google LLC
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <stdio.h>
#include <vector>
#include "hwy/aligned_allocator.h"
#include "hwy/base.h"
#include "hwy/nanobenchmark.h"
// clang-format off
#undef HWY_TARGET_INCLUDE
#define HWY_TARGET_INCLUDE "hwy/contrib/bit_pack/bit_pack_test.cc" // NOLINT
#include "hwy/foreach_target.h" // IWYU pragma: keep
#include "hwy/contrib/bit_pack/bit_pack-inl.h"
#include "hwy/tests/test_util-inl.h"
// clang-format on
#ifndef HWY_BIT_PACK_BENCHMARK
#define HWY_BIT_PACK_BENCHMARK 0
#endif
HWY_BEFORE_NAMESPACE();
namespace hwy {
// Used to prevent running benchmark (slow) for partial vectors and targets
// except the best available. Global, not per-target, hence must be outside
// HWY_NAMESPACE. Declare first because HWY_ONCE is only true after some code
// has been re-included.
extern size_t last_bits;
extern uint64_t best_target;
#if HWY_ONCE
size_t last_bits = 0;
uint64_t best_target = ~0ull;
#endif
namespace HWY_NAMESPACE {
template <size_t kBits, typename T>
T Random(RandomState& rng) {
return static_cast<T>(Random32(&rng) & kBits);
}
template <typename T>
class Checker {
public:
explicit Checker(size_t num) { raw_.reserve(num); }
void NotifyRaw(T raw) { raw_.push_back(raw); }
void NotifyRawOutput(size_t bits, T raw) {
if (raw_[num_verified_] != raw) {
HWY_ABORT("%zu bits: pos %zu of %zu, expected %.0f actual %.0f\n", bits,
num_verified_, raw_.size(),
static_cast<double>(raw_[num_verified_]),
static_cast<double>(raw));
}
++num_verified_;
}
private:
std::vector<T> raw_;
size_t num_verified_ = 0;
};
template <class PackT>
struct TestPack {
template <typename T, class D>
void operator()(T /* t */, D d) {
const size_t N = Lanes(d);
RandomState rng(N * 129);
const size_t num = N * PackT::kRawVectors;
const size_t packed_size = N * PackT::kPackedVectors;
Checker<T> checker(num);
AlignedFreeUniquePtr<T[]> raw = hwy::AllocateAligned<T>(num);
AlignedFreeUniquePtr<T[]> raw2 = hwy::AllocateAligned<T>(num);
AlignedFreeUniquePtr<T[]> packed = hwy::AllocateAligned<T>(packed_size);
for (size_t i = 0; i < num; ++i) {
raw[i] = Random<PackT::kBits, T>(rng);
checker.NotifyRaw(raw[i]);
}
best_target = HWY_MIN(best_target, HWY_TARGET);
const bool run_bench = HWY_BIT_PACK_BENCHMARK &&
(PackT::kBits != last_bits) &&
(HWY_TARGET == best_target);
last_bits = PackT::kBits;
if (run_bench) {
const size_t kNumInputs = 1;
const size_t num_items = num * size_t(Unpredictable1());
const FuncInput inputs[kNumInputs] = {num_items};
Result results[kNumInputs];
Params p;
p.verbose = false;
p.max_evals = 7;
p.target_rel_mad = 0.002;
const size_t num_results = MeasureClosure(
[&](FuncInput) HWY_ATTR {
PackT().Pack(d, raw.get(), packed.get());
PackT().Unpack(d, packed.get(), raw2.get());
return raw2[Random32(&rng) % num];
},
inputs, kNumInputs, results, p);
if (num_results != kNumInputs) {
fprintf(stderr, "MeasureClosure failed.\n");
return;
}
// Print cycles per element
for (size_t i = 0; i < num_results; ++i) {
const double cycles_per_item =
results[i].ticks / static_cast<double>(results[i].input);
const double mad = results[i].variability * cycles_per_item;
printf("Bits:%2d elements:%3d cyc/elt:%6.3f (+/- %5.3f)\n",
static_cast<int>(PackT::kBits),
static_cast<int>(results[i].input), cycles_per_item, mad);
}
} else {
PackT().Pack(d, raw.get(), packed.get());
PackT().Unpack(d, packed.get(), raw2.get());
}
for (size_t i = 0; i < num; ++i) {
checker.NotifyRawOutput(PackT::kBits, raw2[i]);
}
}
};
void TestAllPack8() {
ForShrinkableVectors<TestPack<detail::Pack8<1>>>()(uint8_t());
ForShrinkableVectors<TestPack<detail::Pack8<2>>>()(uint8_t());
ForShrinkableVectors<TestPack<detail::Pack8<3>>>()(uint8_t());
ForShrinkableVectors<TestPack<detail::Pack8<4>>>()(uint8_t());
ForShrinkableVectors<TestPack<detail::Pack8<5>>>()(uint8_t());
ForShrinkableVectors<TestPack<detail::Pack8<6>>>()(uint8_t());
ForShrinkableVectors<TestPack<detail::Pack8<7>>>()(uint8_t());
ForShrinkableVectors<TestPack<detail::Pack8<8>>>()(uint8_t());
}
void TestAllPack16() {
ForShrinkableVectors<TestPack<detail::Pack16<1>>>()(uint16_t());
ForShrinkableVectors<TestPack<detail::Pack16<2>>>()(uint16_t());
ForShrinkableVectors<TestPack<detail::Pack16<3>>>()(uint16_t());
ForShrinkableVectors<TestPack<detail::Pack16<4>>>()(uint16_t());
ForShrinkableVectors<TestPack<detail::Pack16<5>>>()(uint16_t());
ForShrinkableVectors<TestPack<detail::Pack16<6>>>()(uint16_t());
ForShrinkableVectors<TestPack<detail::Pack16<7>>>()(uint16_t());
ForShrinkableVectors<TestPack<detail::Pack16<8>>>()(uint16_t());
}
// NOLINTNEXTLINE(google-readability-namespace-comments)
} // namespace HWY_NAMESPACE
} // namespace hwy
HWY_AFTER_NAMESPACE();
#if HWY_ONCE
namespace hwy {
HWY_BEFORE_TEST(BitPackTest);
HWY_EXPORT_AND_TEST_P(BitPackTest, TestAllPack8);
HWY_EXPORT_AND_TEST_P(BitPackTest, TestAllPack16);
} // namespace hwy
#endif


@ -15,7 +15,7 @@
#include "hwy/contrib/image/image.h"
#include <algorithm> // swap
#include <algorithm> // std::swap
#include <cstddef>
#undef HWY_TARGET_INCLUDE


@ -22,7 +22,6 @@
#include <stdint.h>
#include <string.h>
#include <cstddef>
#include <utility> // std::move
#include "hwy/aligned_allocator.h"


@ -20,6 +20,7 @@
#include <stdio.h>
#include <cfloat> // FLT_MAX
#include <cmath> // std::abs
#include <type_traits>
// clang-format off


@ -79,6 +79,8 @@ cc_library(
"vqsort_i32d.cc",
"vqsort_i64a.cc",
"vqsort_i64d.cc",
"vqsort_kv64a.cc",
"vqsort_kv64d.cc",
"vqsort_kv128a.cc",
"vqsort_kv128d.cc",
"vqsort_u16a.cc",


@ -9,10 +9,9 @@ and [paper](https://arxiv.org/abs/2205.05982).
## Instructions
Here are instructions for reproducing our results on x86 Linux (AVX2, AVX-512)
and Arm V1 (NEON, SVE).
Here are instructions for reproducing our results on Linux and AWS (SVE, NEON).
### x86 (Linux)
### Linux
Please first ensure golang, and Clang (tested with 13.0.1) are installed via
your system's package manager.
@ -43,9 +42,10 @@ make -j8 && sudo make install
cd ..
```
AWS clang is at version 11.1, which generates unnecessary AND instructions which
slow down the sort by 1.15x. We tested with clang trunk as of June 13
AWS clang is at version 11.1, which generates unnecessary `AND` instructions
which slow down the sort by 1.15x. We tested with clang trunk as of June 13
(which reports Git hash 8f6512fea000c3a0d394864bb94e524bee375069). To build:
```
git clone --depth 1 https://github.com/llvm/llvm-project.git
cd llvm-project
@ -64,6 +64,12 @@ bazel-bin/hwy/contrib/sort/sort_test
bazel-bin/hwy/contrib/sort/bench_sort
```
The above command line enables SVE, which is currently only available on
Graviton 3. You can also test NEON on the same processor, or other Arm CPUs, by
changing the `-march=` option to `--copt=-march=armv8.2-a+crypto`. Note that
such flags will be unnecessary once Clang supports `#pragma target` for NEON and
SVE intrinsics, as it does for x86.
## Results
`bench_sort` outputs the instruction set (AVX3 refers to AVX-512), the sort


@ -20,8 +20,9 @@
#include <stdint.h>
#include <string.h> // memcpy
#include <algorithm>
#include <cmath> // std::abs
#include <algorithm> // std::sort, std::min, std::max
#include <functional> // std::less, std::greater
#include <thread> // NOLINT
#include <vector>
#include "hwy/base.h"


@ -81,13 +81,12 @@ HWY_NOINLINE void BenchPartition() {
// The pivot value can influence performance. Do exactly what vqsort will
// do so that the performance (influenced by prefetching and branch
// prediction) is likely to predict the actual performance inside vqsort.
detail::PivotResult result;
const auto pivot = detail::ChoosePivot(d, st, aligned.get(), num_lanes,
buf.get(), rng, result);
detail::DrawSamples(d, st, aligned.get(), num_lanes, buf.get(), rng);
detail::SortSamples(d, st, buf.get());
auto pivot = detail::ChoosePivotByRank(d, st, buf.get());
const Timestamp t0;
detail::Partition(d, st, aligned.get(), 0, num_lanes - 1, pivot,
buf.get());
detail::Partition(d, st, aligned.get(), num_lanes - 1, pivot, buf.get());
seconds.push_back(SecondsSince(t0));
// 'Use' the result to prevent optimizing out the partition.
sum += static_cast<double>(aligned.get()[num_lanes / 2]);


@ -63,15 +63,16 @@ struct SortConstants {
}
// Chunk := group of keys loaded for sampling a pivot. Matches the typical
// cache line size of 64 bytes to get maximum benefit per L2 miss. If vectors
// are larger, use entire vectors to ensure we do not overrun the array.
static constexpr HWY_INLINE size_t LanesPerChunk(size_t sizeof_t, size_t N) {
return HWY_MAX(64 / sizeof_t, N);
// cache line size of 64 bytes to get maximum benefit per L2 miss. Sort()
// ensures vectors are no larger than that, so this can be independent of the
// vector size and thus constexpr.
static constexpr HWY_INLINE size_t LanesPerChunk(size_t sizeof_t) {
return 64 / sizeof_t;
}
static constexpr HWY_INLINE size_t PivotBufNum(size_t sizeof_t, size_t N) {
// 3 chunks of medians, 1 chunk of median medians plus two padding vectors.
return (3 + 1) * LanesPerChunk(sizeof_t, N) + 2 * N;
return (3 + 1) * LanesPerChunk(sizeof_t) + 2 * N;
}
template <typename T>


@ -21,6 +21,7 @@
#include <stdio.h>
#include <string.h> // memcpy
#include <unordered_map>
#include <vector>
// clang-format off
@ -49,8 +50,10 @@ using detail::TraitsLane;
#if VQSORT_ENABLED || HWY_IDE
using detail::OrderAscending128;
using detail::OrderAscendingKV128;
using detail::OrderAscendingKV64;
using detail::OrderDescending128;
using detail::OrderDescendingKV128;
using detail::OrderDescendingKV64;
using detail::Traits128;
template <class Traits>
@ -282,10 +285,10 @@ static HWY_NOINLINE void TestPartition() {
const size_t N1 = st.LanesPerKey();
for (bool in_asc : {false, true}) {
for (int left_i : {0, 1, 2, 3, 4, 5, 6, 7, 8, 12, 15, 22, 28, 29, 30, 31}) {
for (int left_i : {0, 1, 4, 6, 7, 8, 12, 15, 22, 28, 30, 31}) {
const size_t left = static_cast<size_t>(left_i) & ~(N1 - 1);
for (size_t ofs : {N, N + 1, N + 2, N + 3, 2 * N, 2 * N + 1, 2 * N + 2,
2 * N + 3, 3 * N - 1, 4 * N - 3, 4 * N - 2}) {
for (size_t ofs : {N, N + 1, N + 3, 2 * N, 2 * N + 2, 2 * N + 3,
3 * N - 1, 4 * N - 3, 4 * N - 2}) {
const size_t len = (base_case_num + ofs) & ~(N1 - 1);
for (LaneType pivot1 :
{LaneType(0), LaneType(len / 3), LaneType(len / 2),
@ -311,10 +314,12 @@ static HWY_NOINLINE void TestPartition() {
for (size_t i = 0; i < left; ++i) {
lanes[i] = hwy::LowestValue<LaneType>();
}
std::unordered_map<LaneType, int> counts;
for (size_t i = left; i < right; ++i) {
lanes[i] = static_cast<LaneType>(
in_asc ? LaneType(i + 1) - static_cast<LaneType>(left)
: static_cast<LaneType>(right) - LaneType(i));
++counts[lanes[i]];
if (kDebug >= 2) {
printf("%3zu: %f\n", i, static_cast<double>(lanes[i]));
}
@ -324,7 +329,8 @@ static HWY_NOINLINE void TestPartition() {
}
size_t border =
detail::Partition(d, st, lanes, left, right, pivot, buf.get());
left + detail::Partition(d, st, lanes + left, right - left,
pivot, buf.get());
if (kDebug >= 2) {
printf("out>>>>>>\n");
@ -335,7 +341,15 @@ static HWY_NOINLINE void TestPartition() {
printf("%3zu: sentinel %f\n", i, static_cast<double>(lanes[i]));
}
}
for (size_t i = left; i < right; ++i) {
--counts[lanes[i]];
}
for (auto kv : counts) {
if (kv.second != 0) {
PrintValue(kv.first);
HWY_ABORT("Incorrect count %d\n", kv.second);
}
}
VerifyPartition(st, lanes, left, border, right, N1, pivot2);
for (size_t i = 0; i < misalign; ++i) {
if (aligned_lanes[i] != hwy::LowestValue<LaneType>())
@ -357,15 +371,18 @@ static HWY_NOINLINE void TestPartition() {
}
HWY_NOINLINE void TestAllPartition() {
TestPartition<TraitsLane<OrderAscending<int16_t> > >();
TestPartition<TraitsLane<OrderDescending<int32_t> > >();
TestPartition<Traits128<OrderAscending128> >();
#if !HWY_IS_DEBUG_BUILD
TestPartition<TraitsLane<OrderAscending<int16_t> > >();
TestPartition<TraitsLane<OrderAscending<int64_t> > >();
TestPartition<TraitsLane<OrderDescending<float> > >();
#if HWY_HAVE_FLOAT64
TestPartition<TraitsLane<OrderDescending<double> > >();
#endif
TestPartition<Traits128<OrderAscending128> >();
TestPartition<Traits128<OrderDescending128> >();
#endif
}
// (used for sample selection for choosing a pivot)
@ -436,7 +453,13 @@ class CompareResults {
const size_t num_keys = copy_.size() / st.LanesPerKey();
Run<Order>(reference, reinterpret_cast<KeyType*>(copy_.data()), num_keys,
shared, /*thread=*/0);
#if VQSORT_PRINT >= 3
fprintf(stderr, "\nExpected:\n");
for (size_t i = 0; i < copy_.size(); ++i) {
PrintValue(copy_[i]);
}
fprintf(stderr, "\n");
#endif
for (size_t i = 0; i < copy_.size(); ++i) {
if (copy_[i] != output[i]) {
if (sizeof(KeyType) == 16) {
@ -546,7 +569,7 @@ void TestSort(size_t num_lanes) {
}
void TestAllSort() {
for (int num : {129, 504, 20 * 1000, 34567}) {
for (int num : {129, 504, 3 * 1000, 34567}) {
const size_t num_lanes = AdjustedReps(static_cast<size_t>(num));
TestSort<TraitsLane<OrderAscending<int16_t> > >(num_lanes);
TestSort<TraitsLane<OrderDescending<uint16_t> > >(num_lanes);
@ -572,6 +595,9 @@ void TestAllSort() {
TestSort<Traits128<OrderAscending128> >(num_lanes);
TestSort<Traits128<OrderDescending128> >(num_lanes);
TestSort<TraitsLane<OrderAscendingKV64> >(num_lanes);
TestSort<TraitsLane<OrderDescendingKV64> >(num_lanes);
TestSort<Traits128<OrderAscendingKV128> >(num_lanes);
TestSort<Traits128<OrderDescendingKV128> >(num_lanes);
#endif


@ -42,6 +42,9 @@ namespace detail {
template <typename T>
struct KeyLane {
static constexpr bool Is128() { return false; }
// False indicates the entire key (i.e. lane) should be compared. KV stands
// for key-value.
static constexpr bool IsKV() { return false; }
constexpr size_t LanesPerKey() const { return 1; }
// What type bench_sort should allocate for generating inputs.
@ -78,7 +81,20 @@ struct KeyLane {
return Eq(a, b);
}
HWY_INLINE bool Equal1(const T* a, const T* b) { return *a == *b; }
template <class D>
HWY_INLINE Mask<D> NotEqualKeys(D /*tag*/, Vec<D> a, Vec<D> b) const {
return Ne(a, b);
}
// For keys=lanes, any difference counts.
template <class D>
HWY_INLINE bool NoKeyDifference(D /*tag*/, Vec<D> diff) const {
// Must avoid floating-point comparisons (for -0)
const RebindToUnsigned<D> du;
return AllTrue(du, Eq(BitCast(du, diff), Zero(du)));
}
HWY_INLINE bool Equal1(const T* a, const T* b) const { return *a == *b; }
template <class D>
HWY_INLINE Vec<D> ReverseKeys(D d, Vec<D> v) const {
@ -223,7 +239,7 @@ struct OrderAscending : public KeyLane<T> {
template <class D>
HWY_INLINE Vec<D> PrevValue(D d, Vec<D> v) const {
return Sub(v, Set(d, 1));
return Sub(v, Set(d, hwy::Epsilon<T>()));
}
};
@ -272,7 +288,142 @@ struct OrderDescending : public KeyLane<T> {
template <class D>
HWY_INLINE Vec<D> PrevValue(D d, Vec<D> v) const {
return Add(v, Set(d, 1));
return Add(v, Set(d, hwy::Epsilon<T>()));
}
};
struct KeyValue64 : public KeyLane<uint64_t> {
// True indicates only part of the key (i.e. lane) should be compared. KV
// stands for key-value.
static constexpr bool IsKV() { return true; }
template <class D>
HWY_INLINE Mask<D> EqualKeys(D /*tag*/, Vec<D> a, Vec<D> b) const {
return Eq(ShiftRight<32>(a), ShiftRight<32>(b));
}
template <class D>
HWY_INLINE Mask<D> NotEqualKeys(D /*tag*/, Vec<D> a, Vec<D> b) const {
return Ne(ShiftRight<32>(a), ShiftRight<32>(b));
}
HWY_INLINE bool Equal1(const uint64_t* a, const uint64_t* b) const {
return (*a >> 32) == (*b >> 32);
}
// Only count differences in the actual key, not the value.
template <class D>
HWY_INLINE bool NoKeyDifference(D /*tag*/, Vec<D> diff) const {
// Must avoid floating-point comparisons (for -0)
const RebindToUnsigned<D> du;
const Vec<decltype(du)> zero = Zero(du);
const Vec<decltype(du)> keys = ShiftRight<32>(diff); // clear values
return AllTrue(du, Eq(BitCast(du, keys), zero));
}
};
struct OrderAscendingKV64 : public KeyValue64 {
using Order = SortAscending;
HWY_INLINE bool Compare1(const LaneType* a, const LaneType* b) {
return (*a >> 32) < (*b >> 32);
}
template <class D>
HWY_INLINE Mask<D> Compare(D /* tag */, Vec<D> a, Vec<D> b) const {
return Lt(ShiftRight<32>(a), ShiftRight<32>(b));
}
// Not required to be stable (preserving the order of equivalent keys), so
// we can include the value in the comparison.
template <class D>
HWY_INLINE Vec<D> First(D /* tag */, const Vec<D> a, const Vec<D> b) const {
return Min(a, b);
}
template <class D>
HWY_INLINE Vec<D> Last(D /* tag */, const Vec<D> a, const Vec<D> b) const {
return Max(a, b);
}
template <class D>
HWY_INLINE Vec<D> FirstOfLanes(D d, Vec<D> v,
uint64_t* HWY_RESTRICT /* buf */) const {
return MinOfLanes(d, v);
}
template <class D>
HWY_INLINE Vec<D> LastOfLanes(D d, Vec<D> v,
uint64_t* HWY_RESTRICT /* buf */) const {
return MaxOfLanes(d, v);
}
// Same as for regular lanes.
template <class D>
HWY_INLINE Vec<D> FirstValue(D d) const {
return Set(d, hwy::LowestValue<TFromD<D> >());
}
template <class D>
HWY_INLINE Vec<D> LastValue(D d) const {
return Set(d, hwy::HighestValue<TFromD<D> >());
}
template <class D>
HWY_INLINE Vec<D> PrevValue(D d, Vec<D> v) const {
return Sub(v, Set(d, uint64_t{1}));
}
};
struct OrderDescendingKV64 : public KeyValue64 {
using Order = SortDescending;
HWY_INLINE bool Compare1(const LaneType* a, const LaneType* b) {
return (*b >> 32) < (*a >> 32);
}
template <class D>
HWY_INLINE Mask<D> Compare(D /* tag */, Vec<D> a, Vec<D> b) const {
return Lt(ShiftRight<32>(b), ShiftRight<32>(a));
}
// Not required to be stable (preserving the order of equivalent keys), so
// we can include the value in the comparison.
template <class D>
HWY_INLINE Vec<D> First(D /* tag */, const Vec<D> a, const Vec<D> b) const {
return Max(a, b);
}
template <class D>
HWY_INLINE Vec<D> Last(D /* tag */, const Vec<D> a, const Vec<D> b) const {
return Min(a, b);
}
template <class D>
HWY_INLINE Vec<D> FirstOfLanes(D d, Vec<D> v,
uint64_t* HWY_RESTRICT /* buf */) const {
return MaxOfLanes(d, v);
}
template <class D>
HWY_INLINE Vec<D> LastOfLanes(D d, Vec<D> v,
uint64_t* HWY_RESTRICT /* buf */) const {
return MinOfLanes(d, v);
}
template <class D>
HWY_INLINE Vec<D> FirstValue(D d) const {
return Set(d, hwy::HighestValue<TFromD<D> >());
}
template <class D>
HWY_INLINE Vec<D> LastValue(D d) const {
return Set(d, hwy::LowestValue<TFromD<D> >());
}
template <class D>
HWY_INLINE Vec<D> PrevValue(D d, Vec<D> v) const {
return Add(v, Set(d, uint64_t{1}));
}
};


@ -124,6 +124,9 @@ struct KeyAny128 {
// Base class shared between OrderAscending128, OrderDescending128.
struct Key128 : public KeyAny128 {
// False indicates the entire key should be compared. KV means key-value.
static constexpr bool IsKV() { return false; }
// What type to pass to Sorter::operator().
using KeyType = hwy::uint128_t;
@ -134,7 +137,20 @@ struct Key128 : public KeyAny128 {
return Eq128(d, a, b);
}
HWY_INLINE bool Equal1(const LaneType* a, const LaneType* b) {
template <class D>
HWY_INLINE Mask<D> NotEqualKeys(D d, Vec<D> a, Vec<D> b) const {
return Ne128(d, a, b);
}
// For keys=entire 128 bits, any difference counts.
template <class D>
HWY_INLINE bool NoKeyDifference(D /*tag*/, Vec<D> diff) const {
// Must avoid floating-point comparisons (for -0)
const RebindToUnsigned<D> du;
return AllTrue(du, Eq(BitCast(du, diff), Zero(du)));
}
HWY_INLINE bool Equal1(const LaneType* a, const LaneType* b) const {
return a[0] == b[0] && a[1] == b[1];
}
};
@ -187,8 +203,12 @@ struct OrderAscending128 : public Key128 {
template <class D>
HWY_INLINE Vec<D> PrevValue(D d, Vec<D> v) const {
const Vec<D> k1 = OddEven(Zero(d), Set(d, 1));
return Sub(v, k1);
const Vec<D> k0 = Zero(d);
const Vec<D> k1 = OddEven(k0, Set(d, uint64_t{1}));
const Mask<D> borrow = Eq(v, k0); // don't-care, lo == 0
// lo == 0? 1 : 0, 0
const Vec<D> adjust = ShiftLeftLanes<1>(IfThenElseZero(borrow, k1));
return Sub(Sub(v, k1), adjust);
}
};
@ -233,13 +253,21 @@ struct OrderDescending128 : public Key128 {
template <class D>
HWY_INLINE Vec<D> PrevValue(D d, Vec<D> v) const {
const Vec<D> k1 = OddEven(Zero(d), Set(d, 1));
return Add(v, k1);
const Vec<D> k1 = OddEven(Zero(d), Set(d, uint64_t{1}));
const Vec<D> added = Add(v, k1);
const Mask<D> overflowed = Lt(added, v); // false, overflowed
// overflowed? 1 : 0, 0
const Vec<D> adjust = ShiftLeftLanes<1>(IfThenElseZero(overflowed, k1));
return Add(added, adjust);
}
};
// Base class shared between OrderAscendingKV128, OrderDescendingKV128.
struct KeyValue128 : public KeyAny128 {
// True indicates only part of the key (the more significant lane) should be
// compared. KV stands for key-value.
static constexpr bool IsKV() { return true; }
// What type to pass to Sorter::operator().
using KeyType = K64V64;
@ -250,7 +278,22 @@ struct KeyValue128 : public KeyAny128 {
return Eq128Upper(d, a, b);
}
HWY_INLINE bool Equal1(const LaneType* a, const LaneType* b) {
template <class D>
HWY_INLINE Mask<D> NotEqualKeys(D d, Vec<D> a, Vec<D> b) const {
return Ne128Upper(d, a, b);
}
// Only count differences in the actual key, not the value.
template <class D>
HWY_INLINE bool NoKeyDifference(D /*tag*/, Vec<D> diff) const {
// Must avoid floating-point comparisons (for -0)
const RebindToUnsigned<D> du;
const Vec<decltype(du)> zero = Zero(du);
const Vec<decltype(du)> keys = OddEven(diff, zero); // clear values
return AllTrue(du, Eq(BitCast(du, keys), zero));
}
HWY_INLINE bool Equal1(const LaneType* a, const LaneType* b) const {
return a[1] == b[1];
}
};
@ -296,7 +339,7 @@ struct OrderAscendingKV128 : public KeyValue128 {
template <class D>
HWY_INLINE Vec<D> PrevValue(D d, Vec<D> v) const {
const Vec<D> k1 = OddEven(Zero(d), Set(d, 1));
const Vec<D> k1 = OddEven(Set(d, uint64_t{1}), Zero(d));
return Sub(v, k1);
}
};
@ -342,7 +385,7 @@ struct OrderDescendingKV128 : public KeyValue128 {
template <class D>
HWY_INLINE Vec<D> PrevValue(D d, Vec<D> v) const {
const Vec<D> k1 = OddEven(Zero(d), Set(d, 1));
const Vec<D> k1 = OddEven(Set(d, uint64_t{1}), Zero(d));
return Add(v, k1);
}
};

File diff suppressed because it is too large.


@ -85,6 +85,9 @@ class HWY_CONTRIB_DLLEXPORT Sorter {
void operator()(K64V64* HWY_RESTRICT keys, size_t n, SortAscending) const;
void operator()(K64V64* HWY_RESTRICT keys, size_t n, SortDescending) const;
void operator()(K32V32* HWY_RESTRICT keys, size_t n, SortAscending) const;
void operator()(K32V32* HWY_RESTRICT keys, size_t n, SortDescending) const;
// For internal use only
static void Fill24Bytes(const void* seed_heap, size_t seed_num, void* bytes);
static bool HaveFloat64();
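A usage sketch for the new K32V32 overloads (illustrative caller; assumes vqsort/hwy_contrib is linked in):

```
#include <vector>
#include "hwy/base.h"                 // hwy::K32V32
#include "hwy/contrib/sort/vqsort.h"  // hwy::Sorter, hwy::SortAscending

// Sorts by the 32-bit key; each 32-bit value stays attached to its key.
void SortPairs(std::vector<hwy::K32V32>& pairs) {
  hwy::Sorter sorter;  // allocates its internal buffer once
  sorter(pairs.data(), pairs.size(), hwy::SortAscending());
}
```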


@ -0,0 +1,65 @@
// Copyright 2022 Google LLC
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "hwy/contrib/sort/vqsort.h"
#undef HWY_TARGET_INCLUDE
// clang-format off
// (avoid line break, which would prevent Copybara rules from matching)
#define HWY_TARGET_INCLUDE "hwy/contrib/sort/vqsort_kv64a.cc" //NOLINT
// clang-format on
#include "hwy/foreach_target.h" // IWYU pragma: keep
// After foreach_target
#include "hwy/contrib/sort/traits-inl.h"
#include "hwy/contrib/sort/vqsort-inl.h"
HWY_BEFORE_NAMESPACE();
namespace hwy {
namespace HWY_NAMESPACE {
void SortKV64Asc(uint64_t* HWY_RESTRICT keys, size_t num,
uint64_t* HWY_RESTRICT buf) {
#if VQSORT_ENABLED
SortTag<uint64_t> d;
detail::SharedTraits<detail::TraitsLane<detail::OrderAscendingKV64>> st;
Sort(d, st, keys, num, buf);
#else
(void) keys;
(void) num;
(void) buf;
HWY_ASSERT(0);
#endif
}
// NOLINTNEXTLINE(google-readability-namespace-comments)
} // namespace HWY_NAMESPACE
} // namespace hwy
HWY_AFTER_NAMESPACE();
#if HWY_ONCE
namespace hwy {
namespace {
HWY_EXPORT(SortKV64Asc);
} // namespace
void Sorter::operator()(K32V32* HWY_RESTRICT keys, size_t n,
SortAscending) const {
HWY_DYNAMIC_DISPATCH(SortKV64Asc)
(reinterpret_cast<uint64_t*>(keys), n, Get<uint64_t>());
}
} // namespace hwy
#endif // HWY_ONCE


@ -0,0 +1,65 @@
// Copyright 2022 Google LLC
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "hwy/contrib/sort/vqsort.h"
#undef HWY_TARGET_INCLUDE
// clang-format off
// (avoid line break, which would prevent Copybara rules from matching)
#define HWY_TARGET_INCLUDE "hwy/contrib/sort/vqsort_kv64d.cc" //NOLINT
// clang-format on
#include "hwy/foreach_target.h" // IWYU pragma: keep
// After foreach_target
#include "hwy/contrib/sort/traits-inl.h"
#include "hwy/contrib/sort/vqsort-inl.h"
HWY_BEFORE_NAMESPACE();
namespace hwy {
namespace HWY_NAMESPACE {
void SortKV64Desc(uint64_t* HWY_RESTRICT keys, size_t num,
uint64_t* HWY_RESTRICT buf) {
#if VQSORT_ENABLED
SortTag<uint64_t> d;
detail::SharedTraits<detail::TraitsLane<detail::OrderDescendingKV64>> st;
Sort(d, st, keys, num, buf);
#else
(void) keys;
(void) num;
(void) buf;
HWY_ASSERT(0);
#endif
}
// NOLINTNEXTLINE(google-readability-namespace-comments)
} // namespace HWY_NAMESPACE
} // namespace hwy
HWY_AFTER_NAMESPACE();
#if HWY_ONCE
namespace hwy {
namespace {
HWY_EXPORT(SortKV64Desc);
} // namespace
void Sorter::operator()(K32V32* HWY_RESTRICT keys, size_t n,
SortDescending) const {
HWY_DYNAMIC_DISPATCH(SortKV64Desc)
(reinterpret_cast<uint64_t*>(keys), n, Get<uint64_t>());
}
} // namespace hwy
#endif // HWY_ONCE


@ -21,7 +21,8 @@
// Add to #if conditions to prevent IDE from graying out code.
#if (defined __CDT_PARSER__) || (defined __INTELLISENSE__) || \
(defined Q_CREATOR_RUN) || (defined(__CLANGD__))
(defined Q_CREATOR_RUN) || (defined __CLANGD__) || \
(defined GROK_ELLIPSIS_BUILD)
#define HWY_IDE 1
#else
#define HWY_IDE 0
@ -69,7 +70,7 @@
// In case of Apple LLVM (whose version number is unrelated to that of LLVM) or
// an invalid version number, deduce it from the presence of warnings.
// Adapted from https://github.com/simd-everywhere/simde/ simde-detect-clang.h.
#if defined(__APPLE__) || __clang_major__ >= 999
#if defined(__apple_build_version__) || __clang_major__ >= 999
#if __has_warning("-Wbitwise-instead-of-logical")
#define HWY_COMPILER_CLANG 1400
#elif __has_warning("-Wreserved-identifier")
@ -85,7 +86,12 @@
#elif __has_warning("-Wextra-semi-stmt") || \
__has_builtin(__builtin_rotateleft32)
#define HWY_COMPILER_CLANG 800
#elif __has_warning("-Wc++98-compat-extra-semi")
// For reasons unknown, XCode 10.3 (Apple LLVM version 10.0.1) is apparently
// based on Clang 7, but does not support the warning we test.
// See https://en.wikipedia.org/wiki/Xcode#Toolchain_versions and
// https://trac.macports.org/wiki/XcodeVersionInfo.
#elif __has_warning("-Wc++98-compat-extra-semi") || \
(defined(__apple_build_version__) && __apple_build_version__ >= 10010000)
#define HWY_COMPILER_CLANG 700
#else // Anything older than 7.0 is not recommended for Highway.
#define HWY_COMPILER_CLANG 600


@ -23,7 +23,7 @@
//------------------------------------------------------------------------------
// Optional configuration
// See ../quick_reference.md for documentation of these macros.
// See g3doc/quick_reference.md for documentation of these macros.
// Uncomment to override the default baseline determined from predefined macros:
// #define HWY_BASELINE_TARGETS (HWY_SSE4 | HWY_SCALAR)
@ -169,13 +169,14 @@
#define HWY_ENABLED(targets) \
((targets) & ~((HWY_DISABLED_TARGETS) | (HWY_BROKEN_TARGETS)))
// Opt-out for EMU128 (affected by a GCC <12 bug on ARMv7: see
// https://gcc.gnu.org/bugzilla/show_bug.cgi?id=106187). This is separate from
// HWY_BROKEN_TARGETS because it affects the fallback target, which must always
// be enabled. If 1, we instead choose HWY_SCALAR even without
// Opt-out for EMU128 (affected by a GCC bug on multiple arches, fixed in 12.3:
// see https://gcc.gnu.org/bugzilla/show_bug.cgi?id=106322). This is separate
// from HWY_BROKEN_TARGETS because it affects the fallback target, which must
// always be enabled. If 1, we instead choose HWY_SCALAR even without
// HWY_COMPILE_ONLY_SCALAR being set.
#if !defined(HWY_BROKEN_EMU128) // allow overriding
#if HWY_ARCH_ARM_V7 && HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL < 1140
#if (HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL < 1203) || \
defined(HWY_NO_LIBCXX)
#define HWY_BROKEN_EMU128 1
#else
#define HWY_BROKEN_EMU128 0
@ -215,30 +216,45 @@
#define HWY_BASELINE_PPC8 0
#endif
#if HWY_ARCH_ARM && defined(__ARM_FEATURE_SVE2)
#define HWY_BASELINE_SVE2 HWY_SVE2
#else
#define HWY_BASELINE_SVE2 0
#endif
#if HWY_ARCH_ARM && defined(__ARM_FEATURE_SVE)
// Baseline targets can be used unconditionally, which does not apply to
// HWY_SVE_256 because it requires a vector size of 256 bits. Including SVE_256
// in the baseline would also disable all 'worse' targets (including SVE and
// SVE2) in non-test builds. Therefore we instead add HWY_SVE_256 to
// HWY_ATTAINABLE_TARGETS below.
#define HWY_BASELINE_SVE HWY_SVE
#else
#define HWY_BASELINE_SVE 0
#endif
#define HWY_BASELINE_NEON 0
#if HWY_ARCH_ARM
#if defined(__ARM_FEATURE_SVE2)
#undef HWY_BASELINE_SVE2 // was 0, will be re-defined
// If user specified -msve-vector-bits=128, they assert the vector length is
// 128 bits and we should use the HWY_SVE2_128 (more efficient for some ops).
#if defined(__ARM_FEATURE_SVE_BITS) && __ARM_FEATURE_SVE_BITS == 128
#define HWY_BASELINE_SVE2 HWY_SVE2_128
// Otherwise we're not sure what the vector length will be. The baseline must be
// unconditionally valid, so we can only assume HWY_SVE2. However, when running
// on a CPU with 128-bit vectors, user code that supports dynamic dispatch will
// still benefit from HWY_SVE2_128 because we add it to HWY_ATTAINABLE_TARGETS.
#else
#define HWY_BASELINE_SVE2 HWY_SVE2
#endif // __ARM_FEATURE_SVE_BITS
#endif // __ARM_FEATURE_SVE2
#if defined(__ARM_FEATURE_SVE)
#undef HWY_BASELINE_SVE // was 0, will be re-defined
// See above. If user-specified vector length matches our optimization, use it.
#if defined(__ARM_FEATURE_SVE_BITS) && __ARM_FEATURE_SVE_BITS == 256
#define HWY_BASELINE_SVE HWY_SVE_256
#else
#define HWY_BASELINE_SVE HWY_SVE
#endif // __ARM_FEATURE_SVE_BITS
#endif // __ARM_FEATURE_SVE
// GCC 4.5.4 only defines __ARM_NEON__; 5.4 defines both.
#if HWY_ARCH_ARM && (defined(__ARM_NEON__) || defined(__ARM_NEON))
#if defined(__ARM_NEON__) || defined(__ARM_NEON)
#undef HWY_BASELINE_NEON
#define HWY_BASELINE_NEON HWY_NEON
#else
#define HWY_BASELINE_NEON 0
#endif
#endif // HWY_ARCH_ARM
// Special handling for MSVC because it has fewer predefined macros:
#if HWY_COMPILER_MSVC
@ -372,9 +388,12 @@
#endif
// Defining one of HWY_COMPILE_ONLY_* will trump HWY_COMPILE_ALL_ATTAINABLE.
// x86 compilers generally allow runtime dispatch. On Arm, currently only GCC
// does, and we require Linux to detect CPU capabilities.
#if HWY_ARCH_X86 || (HWY_ARCH_ARM && HWY_COMPILER_GCC_ACTUAL && HWY_OS_LINUX)
// Clang, GCC and MSVC allow runtime dispatch on x86.
#if HWY_ARCH_X86
#define HWY_HAVE_RUNTIME_DISPATCH 1
// On Arm, currently only GCC does, and we require Linux to detect CPU
// capabilities.
#elif HWY_ARCH_ARM && HWY_COMPILER_GCC_ACTUAL && HWY_OS_LINUX
#define HWY_HAVE_RUNTIME_DISPATCH 1
#else
#define HWY_HAVE_RUNTIME_DISPATCH 0
@ -389,15 +408,15 @@
#define HWY_ATTAINABLE_AVX3_DL 0
#endif
#if HWY_ARCH_ARM_A64 && \
((HWY_ENABLED_BASELINE & HWY_SVE) || HWY_HAVE_RUNTIME_DISPATCH)
#if HWY_ARCH_ARM_A64 && (HWY_HAVE_RUNTIME_DISPATCH || \
(HWY_ENABLED_BASELINE & (HWY_SVE | HWY_SVE_256)))
#define HWY_ATTAINABLE_SVE HWY_ENABLED(HWY_SVE | HWY_SVE_256)
#else
#define HWY_ATTAINABLE_SVE 0
#endif
#if HWY_ARCH_ARM_A64 && \
((HWY_ENABLED_BASELINE & HWY_SVE2) || HWY_HAVE_RUNTIME_DISPATCH)
#if HWY_ARCH_ARM_A64 && (HWY_HAVE_RUNTIME_DISPATCH || \
(HWY_ENABLED_BASELINE & (HWY_SVE2 | HWY_SVE2_128)))
#define HWY_ATTAINABLE_SVE2 HWY_ENABLED(HWY_SVE2 | HWY_SVE2_128)
#else
#define HWY_ATTAINABLE_SVE2 0


@ -21,8 +21,9 @@
#include <stdint.h>
#include <stdio.h>
#include <cmath> // std::abs
#include <memory>
#include <numeric> // iota
#include <numeric> // std::iota, std::inner_product
#undef HWY_TARGET_INCLUDE
#define HWY_TARGET_INCLUDE "hwy/examples/benchmark.cc"


@ -52,10 +52,11 @@ HWY_ATTR_NO_MSAN void OneFloorLog2(const DF df,
// Type tags for converting to other element types (Rebind = same count).
const hn::RebindToSigned<DF> d32;
const hn::Rebind<uint8_t, DF> d8;
using VI32 = hn::Vec<decltype(d32)>;
const auto u8 = hn::Load(d8, values);
const auto bits = hn::BitCast(d32, hn::ConvertTo(df, hn::PromoteTo(d32, u8)));
const auto exponent = hn::Sub(hn::ShiftRight<23>(bits), hn::Set(d32, 127));
const VI32 vi32 = hn::PromoteTo(d32, hn::Load(d8, values));
const VI32 bits = hn::BitCast(d32, hn::ConvertTo(df, vi32));
const VI32 exponent = hn::Sub(hn::ShiftRight<23>(bits), hn::Set(d32, 127));
hn::Store(hn::DemoteTo(d8, exponent), d8, log2);
}


@ -29,7 +29,7 @@ namespace hwy {
// API version (https://semver.org/); keep in sync with CMakeLists.txt.
#define HWY_MAJOR 1
#define HWY_MINOR 0
#define HWY_PATCH 1
#define HWY_PATCH 2
//------------------------------------------------------------------------------
// Shorthand for tags (defined in shared-inl.h) used to select overloads.


@ -16,6 +16,7 @@
#include <stddef.h>
#include <stdint.h>
#include <algorithm> // std::fill
#include <bitset>
#include "hwy/base.h"


@ -24,14 +24,15 @@
#include <stdlib.h>
#include <time.h> // clock_gettime
#include <algorithm> // sort
#include <algorithm> // std::sort, std::find_if
#include <array>
#include <atomic>
#include <chrono> //NOLINT
#include <limits>
#include <numeric> // iota
#include <numeric> // std::iota
#include <random>
#include <string>
#include <utility> // std::pair
#include <vector>
#if defined(_WIN32) || defined(_WIN64)
@ -150,7 +151,7 @@ inline Ticks Start() {
// "cc" = flags modified by SHL.
: "rdx", "memory", "cc");
#elif HWY_ARCH_RVV
asm volatile("rdcycle %0" : "=r"(t));
asm volatile("rdtime %0" : "=r"(t));
#elif defined(_WIN32) || defined(_WIN64)
LARGE_INTEGER counter;
(void)QueryPerformanceCounter(&counter);


@ -22,16 +22,18 @@
#include <stddef.h>
#include <stdint.h>
#include "hwy/base.h" // before HWY_DIAGNOSTICS
#include "hwy/ops/shared-inl.h"
HWY_BEFORE_NAMESPACE();
// Must come after HWY_BEFORE_NAMESPACE so that the intrinsics are compiled with
// the same target attribute as our code, see #834.
HWY_DIAGNOSTICS(push)
HWY_DIAGNOSTICS_OFF(disable : 4701, ignored "-Wuninitialized")
#include <arm_neon.h>
HWY_DIAGNOSTICS(pop)
#include "hwy/ops/shared-inl.h"
HWY_BEFORE_NAMESPACE();
// Must come after arm_neon.h.
namespace hwy {
namespace HWY_NAMESPACE {
@ -766,6 +768,9 @@ class Vec128 {
using Raw = typename detail::Raw128<T, N>::type;
public:
using PrivateT = T; // only for DFromV
static constexpr size_t kPrivateN = N; // only for DFromV
HWY_INLINE Vec128() {}
Vec128(const Vec128&) = default;
Vec128& operator=(const Vec128&) = default;
@ -822,23 +827,11 @@ class Mask128 {
template <typename T>
using Mask64 = Mask128<T, 8 / sizeof(T)>;
namespace detail {
// Deduce Simd<T, N, 0> from Vec128<T, N>
struct DeduceD {
template <typename T, size_t N>
Simd<T, N, 0> operator()(Vec128<T, N>) const {
return Simd<T, N, 0>();
}
};
} // namespace detail
template <class V>
using DFromV = Simd<typename V::PrivateT, V::kPrivateN, 0>;
template <class V>
using DFromV = decltype(detail::DeduceD()(V()));
template <class V>
using TFromV = TFromD<DFromV<V>>;
using TFromV = typename V::PrivateT;
// ------------------------------ BitCast
@ -1025,19 +1018,21 @@ HWY_API Vec128<bfloat16_t, N> Zero(Simd<bfloat16_t, N, 0> /* tag */) {
template <class D>
using VFromD = decltype(Zero(D()));
// Returns a vector with uninitialized elements.
template <typename T, size_t N>
HWY_API Vec128<T, N> Undefined(Simd<T, N, 0> /*d*/) {
HWY_DIAGNOSTICS(push)
HWY_DIAGNOSTICS_OFF(disable : 4701, ignored "-Wuninitialized")
HWY_DIAGNOSTICS(push)
HWY_DIAGNOSTICS_OFF(disable : 4701, ignored "-Wuninitialized")
#if HWY_COMPILER_GCC_ACTUAL
HWY_DIAGNOSTICS_OFF(disable : 4701, ignored "-Wmaybe-uninitialized")
#endif
// Returns a vector with uninitialized elements.
template <typename T, size_t N>
HWY_API Vec128<T, N> Undefined(Simd<T, N, 0> /*d*/) {
typename detail::Raw128<T, N>::type a;
return Vec128<T, N>(a);
HWY_DIAGNOSTICS(pop)
}
HWY_DIAGNOSTICS(pop)
// Returns a vector with lane i=[0, N) set to "first" + i.
template <typename T, size_t N, typename T2>
Vec128<T, N> Iota(const Simd<T, N, 0> d, const T2 first) {
@ -2277,6 +2272,12 @@ HWY_API Mask128<T, N> Xor(const Mask128<T, N> a, Mask128<T, N> b) {
return MaskFromVec(Xor(VecFromMask(d, a), VecFromMask(d, b)));
}
template <typename T, size_t N>
HWY_API Mask128<T, N> ExclusiveNeither(const Mask128<T, N> a, Mask128<T, N> b) {
const Simd<T, N, 0> d;
return MaskFromVec(AndNot(VecFromMask(d, a), Not(VecFromMask(d, b))));
}
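
For reference, the new ExclusiveNeither(a, b) mask op returns true where neither mask is set; the result is unspecified where both are set (the SVE backend maps it to NOR under that precondition). A minimal usage sketch with static dispatch and an illustrative function name, not part of the diff, assuming limit >= 0 so the two masks cannot both be true:

#include <stddef.h>
#include "hwy/highway.h"
namespace hn = hwy::HWY_NAMESPACE;

// Counts lanes that are neither negative nor greater than `limit`.
// n is assumed to be a multiple of Lanes(d).
size_t CountInRange(const float* HWY_RESTRICT p, size_t n, float limit) {
  const hn::ScalableTag<float> d;
  size_t count = 0;
  for (size_t i = 0; i < n; i += hn::Lanes(d)) {
    const auto v = hn::LoadU(d, p + i);
    const auto neg = hn::Lt(v, hn::Zero(d));
    const auto too_big = hn::Gt(v, hn::Set(d, limit));
    // The two masks are mutually exclusive, so ExclusiveNeither is defined.
    count += hn::CountTrue(d, hn::ExclusiveNeither(neg, too_big));
  }
  return count;
}
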
// ================================================== COMPARE
// Comparisons fill a lane with 1-bits if the condition is true, else 0.
@ -2885,12 +2886,19 @@ HWY_API void StoreU(Vec128<bfloat16_t, N> v, Simd<bfloat16_t, N, 0> d,
return StoreU(Vec128<uint16_t, N>(v.raw), du16, pu16);
}
HWY_DIAGNOSTICS(push)
#if HWY_COMPILER_GCC_ACTUAL
HWY_DIAGNOSTICS_OFF(disable : 4701, ignored "-Wmaybe-uninitialized")
#endif
// On ARM, Store is the same as StoreU.
template <typename T, size_t N>
HWY_API void Store(Vec128<T, N> v, Simd<T, N, 0> d, T* HWY_RESTRICT aligned) {
StoreU(v, d, aligned);
}
HWY_DIAGNOSTICS(pop)
template <typename T, size_t N>
HWY_API void BlendedStore(Vec128<T, N> v, Mask128<T, N> m, Simd<T, N, 0> d,
T* HWY_RESTRICT p) {
@ -3527,6 +3535,11 @@ HWY_API Vec64<double> LowerHalf(const Vec128<double> v) {
return Vec64<double>(vget_low_f64(v.raw));
}
#endif
HWY_API Vec64<bfloat16_t> LowerHalf(const Vec128<bfloat16_t> v) {
const Full128<uint16_t> du;
const Full64<bfloat16_t> dbh;
return BitCast(dbh, LowerHalf(BitCast(du, v)));
}
template <typename T, size_t N>
HWY_API Vec128<T, N / 2> LowerHalf(Simd<T, N / 2, 0> /* tag */,
@ -3727,6 +3740,13 @@ HWY_API Vec64<double> UpperHalf(Full64<double> /* tag */,
}
#endif
HWY_API Vec64<bfloat16_t> UpperHalf(Full64<bfloat16_t> dbh,
const Vec128<bfloat16_t> v) {
const RebindToUnsigned<decltype(dbh)> duh;
const Twice<decltype(duh)> du;
return BitCast(dbh, UpperHalf(duh, BitCast(du, v)));
}
// Partial
template <typename T, size_t N, HWY_IF_LE64(T, N)>
HWY_API Vec128<T, (N + 1) / 2> UpperHalf(Half<Simd<T, N, 0>> /* tag */,
@ -4243,6 +4263,48 @@ HWY_API Vec128<float, N> ReorderWidenMulAccumulate(Simd<float, N, 0> df32,
return MulAdd(BitCast(df32, a0), BitCast(df32, b0), sum0);
}
HWY_API Vec128<int32_t> ReorderWidenMulAccumulate(Full128<int32_t> /*d32*/,
Vec128<int16_t> a,
Vec128<int16_t> b,
const Vec128<int32_t> sum0,
Vec128<int32_t>& sum1) {
#if HWY_ARCH_ARM_A64
sum1 = Vec128<int32_t>(vmlal_high_s16(sum1.raw, a.raw, b.raw));
#else
const Full64<int16_t> dh;
sum1 = Vec128<int32_t>(
vmlal_s16(sum1.raw, UpperHalf(dh, a).raw, UpperHalf(dh, b).raw));
#endif
return Vec128<int32_t>(
vmlal_s16(sum0.raw, LowerHalf(a).raw, LowerHalf(b).raw));
}
HWY_API Vec64<int32_t> ReorderWidenMulAccumulate(Full64<int32_t> d32,
Vec64<int16_t> a,
Vec64<int16_t> b,
const Vec64<int32_t> sum0,
Vec64<int32_t>& sum1) {
// vmlal writes into the upper half, which the caller cannot use, so
// split into two halves.
const Vec128<int32_t> mul_3210(vmull_s16(a.raw, b.raw));
const Vec64<int32_t> mul_32 = UpperHalf(d32, mul_3210);
sum1 += mul_32;
return sum0 + LowerHalf(mul_3210);
}
HWY_API Vec32<int32_t> ReorderWidenMulAccumulate(Full32<int32_t> d32,
Vec32<int16_t> a,
Vec32<int16_t> b,
const Vec32<int32_t> sum0,
Vec32<int32_t>& sum1) {
const Vec128<int32_t> mul_xx10(vmull_s16(a.raw, b.raw));
const Vec64<int32_t> mul_10(LowerHalf(mul_xx10));
const Vec32<int32_t> mul0 = LowerHalf(d32, mul_10);
const Vec32<int32_t> mul1 = UpperHalf(d32, mul_10);
sum1 += mul1;
return sum0 + mul0;
}
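
The int16 overloads above follow the same contract as the bf16 version: products are accumulated into two i32 vectors whose lane placement is unspecified ("Reorder"), so only the combined total across sum0 and sum1 is meaningful. A hedged usage sketch (static dispatch, illustrative function name; not part of the diff):

#include <stddef.h>
#include <stdint.h>
#include "hwy/highway.h"
namespace hn = hwy::HWY_NAMESPACE;

// Dot product of two int16 arrays; n is assumed to be a multiple of
// Lanes(d16). May overflow int32 for long inputs; acceptable for a sketch.
int32_t DotI16(const int16_t* HWY_RESTRICT a, const int16_t* HWY_RESTRICT b,
               size_t n) {
  const hn::ScalableTag<int16_t> d16;
  const hn::RepartitionToWide<decltype(d16)> d32;
  auto sum0 = hn::Zero(d32);
  auto sum1 = hn::Zero(d32);
  for (size_t i = 0; i < n; i += hn::Lanes(d16)) {
    sum0 = hn::ReorderWidenMulAccumulate(d32, hn::LoadU(d16, a + i),
                                         hn::LoadU(d16, b + i), sum0, sum1);
  }
  // Lane order is unspecified, but the sum over both accumulators equals the
  // accumulated dot product.
  return hn::GetLane(hn::SumOfLanes(d32, hn::Add(sum0, sum1)));
}
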
// ================================================== COMBINE
// ------------------------------ Combine (InterleaveLower)
@ -4587,6 +4649,32 @@ HWY_API Vec128<bfloat16_t, 2 * N> ReorderDemote2To(
return BitCast(dbf16, OddEven(BitCast(du16, a), BitCast(du16, b_in_even)));
}
HWY_API Vec128<int16_t> ReorderDemote2To(Full128<int16_t> d16,
Vec128<int32_t> a, Vec128<int32_t> b) {
const Vec64<int16_t> a16(vqmovn_s32(a.raw));
#if HWY_ARCH_ARM_A64
(void)d16;
return Vec128<int16_t>(vqmovn_high_s32(a16.raw, b.raw));
#else
const Vec64<int16_t> b16(vqmovn_s32(b.raw));
return Combine(d16, a16, b16);
#endif
}
HWY_API Vec64<int16_t> ReorderDemote2To(Full64<int16_t> /*d16*/,
Vec64<int32_t> a, Vec64<int32_t> b) {
const Full128<int32_t> d32;
const Vec128<int32_t> ab = Combine(d32, a, b);
return Vec64<int16_t>(vqmovn_s32(ab.raw));
}
HWY_API Vec32<int16_t> ReorderDemote2To(Full32<int16_t> /*d16*/,
Vec32<int32_t> a, Vec32<int32_t> b) {
const Full128<int32_t> d32;
const Vec64<int32_t> ab(vzip1_s32(a.raw, b.raw));
return Vec32<int16_t>(vqmovn_s32(Combine(d32, ab, ab).raw));
}
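
As with the bf16 variant, the int16 ReorderDemote2To packs two i32 vectors into one saturated i16 vector, but where a's and b's lanes end up differs by target (concatenated here, interleaved on SVE2), so callers must not rely on a specific order. A brief sketch under that assumption (static dispatch, illustrative function name; not part of the diff):

#include <stddef.h>
#include <stdint.h>
#include "hwy/highway.h"
namespace hn = hwy::HWY_NAMESPACE;

// Saturates pairs of i32 vectors to i16. The lane order within each output
// vector is target-dependent, which is fine here because the caller only
// needs the multiset of saturated values. n is a multiple of 2 * Lanes(d32).
void SaturateToI16(const int32_t* HWY_RESTRICT in, size_t n,
                   int16_t* HWY_RESTRICT out) {
  const hn::ScalableTag<int32_t> d32;
  const hn::RepartitionToNarrow<decltype(d32)> d16;
  const size_t N32 = hn::Lanes(d32);
  for (size_t i = 0; i < n; i += 2 * N32) {
    const auto a = hn::LoadU(d32, in + i);
    const auto b = hn::LoadU(d32, in + i + N32);
    hn::StoreU(hn::ReorderDemote2To(d16, a, b), d16, out + i);
  }
}
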
// ================================================== CRYPTO
#if defined(__ARM_FEATURE_AES) || \
@ -4892,7 +4980,8 @@ namespace detail {
// N=1 for any T: no-op
template <typename T>
HWY_INLINE Vec128<T, 1> SumOfLanes(const Vec128<T, 1> v) {
HWY_INLINE Vec128<T, 1> SumOfLanes(hwy::SizeTag<sizeof(T)> /* tag */,
const Vec128<T, 1> v) {
return v;
}
template <typename T>
@ -4908,7 +4997,8 @@ HWY_INLINE Vec128<T, 1> MaxOfLanes(hwy::SizeTag<sizeof(T)> /* tag */,
// u32/i32/f32: N=2
template <typename T, HWY_IF_LANE_SIZE(T, 4)>
HWY_INLINE Vec128<T, 2> SumOfLanes(const Vec128<T, 2> v10) {
HWY_INLINE Vec128<T, 2> SumOfLanes(hwy::SizeTag<4> /* tag */,
const Vec128<T, 2> v10) {
return v10 + Shuffle2301(v10);
}
template <typename T>
@ -4924,48 +5014,59 @@ HWY_INLINE Vec128<T, 2> MaxOfLanes(hwy::SizeTag<4> /* tag */,
// full vectors
#if HWY_ARCH_ARM_A64
HWY_INLINE Vec128<uint32_t> SumOfLanes(const Vec128<uint32_t> v) {
HWY_INLINE Vec128<uint32_t> SumOfLanes(hwy::SizeTag<4> /* tag */,
const Vec128<uint32_t> v) {
return Vec128<uint32_t>(vdupq_n_u32(vaddvq_u32(v.raw)));
}
HWY_INLINE Vec128<int32_t> SumOfLanes(const Vec128<int32_t> v) {
HWY_INLINE Vec128<int32_t> SumOfLanes(hwy::SizeTag<4> /* tag */,
const Vec128<int32_t> v) {
return Vec128<int32_t>(vdupq_n_s32(vaddvq_s32(v.raw)));
}
HWY_INLINE Vec128<float> SumOfLanes(const Vec128<float> v) {
HWY_INLINE Vec128<float> SumOfLanes(hwy::SizeTag<4> /* tag */,
const Vec128<float> v) {
return Vec128<float>(vdupq_n_f32(vaddvq_f32(v.raw)));
}
HWY_INLINE Vec128<uint64_t> SumOfLanes(const Vec128<uint64_t> v) {
HWY_INLINE Vec128<uint64_t> SumOfLanes(hwy::SizeTag<8> /* tag */,
const Vec128<uint64_t> v) {
return Vec128<uint64_t>(vdupq_n_u64(vaddvq_u64(v.raw)));
}
HWY_INLINE Vec128<int64_t> SumOfLanes(const Vec128<int64_t> v) {
HWY_INLINE Vec128<int64_t> SumOfLanes(hwy::SizeTag<8> /* tag */,
const Vec128<int64_t> v) {
return Vec128<int64_t>(vdupq_n_s64(vaddvq_s64(v.raw)));
}
HWY_INLINE Vec128<double> SumOfLanes(const Vec128<double> v) {
HWY_INLINE Vec128<double> SumOfLanes(hwy::SizeTag<8> /* tag */,
const Vec128<double> v) {
return Vec128<double>(vdupq_n_f64(vaddvq_f64(v.raw)));
}
#else
// ARMv7 version for everything except doubles.
HWY_INLINE Vec128<uint32_t> SumOfLanes(const Vec128<uint32_t> v) {
HWY_INLINE Vec128<uint32_t> SumOfLanes(hwy::SizeTag<4> /* tag */,
const Vec128<uint32_t> v) {
uint32x4x2_t v0 = vuzpq_u32(v.raw, v.raw);
uint32x4_t c0 = vaddq_u32(v0.val[0], v0.val[1]);
uint32x4x2_t v1 = vuzpq_u32(c0, c0);
return Vec128<uint32_t>(vaddq_u32(v1.val[0], v1.val[1]));
}
HWY_INLINE Vec128<int32_t> SumOfLanes(const Vec128<int32_t> v) {
HWY_INLINE Vec128<int32_t> SumOfLanes(hwy::SizeTag<4> /* tag */,
const Vec128<int32_t> v) {
int32x4x2_t v0 = vuzpq_s32(v.raw, v.raw);
int32x4_t c0 = vaddq_s32(v0.val[0], v0.val[1]);
int32x4x2_t v1 = vuzpq_s32(c0, c0);
return Vec128<int32_t>(vaddq_s32(v1.val[0], v1.val[1]));
}
HWY_INLINE Vec128<float> SumOfLanes(const Vec128<float> v) {
HWY_INLINE Vec128<float> SumOfLanes(hwy::SizeTag<4> /* tag */,
const Vec128<float> v) {
float32x4x2_t v0 = vuzpq_f32(v.raw, v.raw);
float32x4_t c0 = vaddq_f32(v0.val[0], v0.val[1]);
float32x4x2_t v1 = vuzpq_f32(c0, c0);
return Vec128<float>(vaddq_f32(v1.val[0], v1.val[1]));
}
HWY_INLINE Vec128<uint64_t> SumOfLanes(const Vec128<uint64_t> v) {
HWY_INLINE Vec128<uint64_t> SumOfLanes(hwy::SizeTag<8> /* tag */,
const Vec128<uint64_t> v) {
return v + Shuffle01(v);
}
HWY_INLINE Vec128<int64_t> SumOfLanes(const Vec128<int64_t> v) {
HWY_INLINE Vec128<int64_t> SumOfLanes(hwy::SizeTag<8> /* tag */,
const Vec128<int64_t> v) {
return v + Shuffle01(v);
}
#endif
@ -5001,6 +5102,30 @@ HWY_INLINE Vec128<T> MaxOfLanes(hwy::SizeTag<8> /* tag */,
return Max(v10, v01);
}
template <size_t N, HWY_IF_GE32(uint16_t, N)>
HWY_API Vec128<uint16_t, N> SumOfLanes(hwy::SizeTag<2> /* tag */,
Vec128<uint16_t, N> v) {
const Simd<uint16_t, N, 0> d;
const RepartitionToWide<decltype(d)> d32;
const auto even = And(BitCast(d32, v), Set(d32, 0xFFFF));
const auto odd = ShiftRight<16>(BitCast(d32, v));
const auto sum = SumOfLanes(hwy::SizeTag<4>(), even + odd);
// Also broadcast into odd lanes.
return OddEven(BitCast(d, ShiftLeft<16>(sum)), BitCast(d, sum));
}
template <size_t N, HWY_IF_GE32(int16_t, N)>
HWY_API Vec128<int16_t, N> SumOfLanes(hwy::SizeTag<2> /* tag */,
Vec128<int16_t, N> v) {
const Simd<int16_t, N, 0> d;
const RepartitionToWide<decltype(d)> d32;
// Sign-extend
const auto even = ShiftRight<16>(ShiftLeft<16>(BitCast(d32, v)));
const auto odd = ShiftRight<16>(BitCast(d32, v));
const auto sum = SumOfLanes(hwy::SizeTag<4>(), even + odd);
// Also broadcast into odd lanes.
return OddEven(BitCast(d, ShiftLeft<16>(sum)), BitCast(d, sum));
}
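
These 16-bit reductions reuse the existing 32-bit reduction: the even and odd 16-bit halves of each 32-bit lane are summed separately, reduced in 32 bits, and the (truncated) result is broadcast back into both halves. A scalar model of the same computation, as a sketch (not part of the diff; n assumed even, matching the >= 2-lane requirement):

#include <stddef.h>
#include <stdint.h>

// Scalar model of the u16 SumOfLanes path above: view pairs of u16 lanes as
// one u32, add even and odd halves, accumulate in 32 bits, truncate to 16.
uint16_t SumOfLanesU16(const uint16_t* lanes, size_t n) {
  uint32_t sum = 0;
  for (size_t i = 0; i < n; i += 2) {
    const uint32_t pair = static_cast<uint32_t>(lanes[i]) |
                          (static_cast<uint32_t>(lanes[i + 1]) << 16);
    sum += (pair & 0xFFFFu) + (pair >> 16);  // even half + odd half
  }
  return static_cast<uint16_t>(sum);  // every result lane holds this value
}
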
template <size_t N, HWY_IF_GE32(uint16_t, N)>
HWY_API Vec128<uint16_t, N> MinOfLanes(hwy::SizeTag<2> /* tag */,
Vec128<uint16_t, N> v) {
@ -5053,7 +5178,7 @@ HWY_API Vec128<int16_t, N> MaxOfLanes(hwy::SizeTag<2> /* tag */,
template <typename T, size_t N>
HWY_API Vec128<T, N> SumOfLanes(Simd<T, N, 0> /* tag */, const Vec128<T, N> v) {
return detail::SumOfLanes(v);
return detail::SumOfLanes(hwy::SizeTag<sizeof(T)>(), v);
}
template <typename T, size_t N>
HWY_API Vec128<T, N> MinOfLanes(Simd<T, N, 0> /* tag */, const Vec128<T, N> v) {
@ -5399,6 +5524,15 @@ HWY_API size_t CountTrue(Simd<T, N, 0> d, const Mask128<T, N> mask) {
constexpr int kDiv = 4 * sizeof(T);
return PopCount(detail::NibblesFromMask(d, mask)) / kDiv;
}
template <typename T, size_t N>
HWY_API size_t FindKnownFirstTrue(const Simd<T, N, 0> d,
const Mask128<T, N> mask) {
const uint64_t nib = detail::NibblesFromMask(d, mask);
constexpr size_t kDiv = 4 * sizeof(T);
return Num0BitsBelowLS1Bit_Nonzero64(nib) / kDiv;
}
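
FindKnownFirstTrue is the cheaper sibling of FindFirstTrue: Num0BitsBelowLS1Bit_Nonzero64 requires a nonzero argument, so the caller must guarantee at least one true lane. A usage sketch with an illustrative search loop (not part of the diff; assumes the key is present and the buffer is readable up to the next vector boundary):

#include <stddef.h>
#include <stdint.h>
#include "hwy/highway.h"
namespace hn = hwy::HWY_NAMESPACE;

// Returns the index of the first byte equal to `key`, which is known to occur.
size_t FindKnown(const uint8_t* HWY_RESTRICT p, uint8_t key) {
  const hn::ScalableTag<uint8_t> d;
  const auto vkey = hn::Set(d, key);
  for (size_t i = 0;; i += hn::Lanes(d)) {
    const auto eq = hn::Eq(hn::LoadU(d, p + i), vkey);
    // Only call FindKnownFirstTrue once we know a lane is set.
    if (!hn::AllFalse(d, eq)) return i + hn::FindKnownFirstTrue(d, eq);
  }
}
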
template <typename T, size_t N>
HWY_API intptr_t FindFirstTrue(const Simd<T, N, 0> d,
const Mask128<T, N> mask) {
@ -6334,7 +6468,7 @@ HWY_API void StoreInterleaved4(const Vec128<T> v0, const Vec128<T> v1,
template <typename T, size_t N, HWY_IF_LE128(T, N)>
HWY_INLINE Mask128<T, N> Lt128(Simd<T, N, 0> d, Vec128<T, N> a,
Vec128<T, N> b) {
static_assert(!IsSigned<T>() && sizeof(T) == 8, "Use u64");
static_assert(!IsSigned<T>() && sizeof(T) == 8, "T must be u64");
// Truth table of Eq and Lt for Hi and Lo u64.
// (removed lines with (=H && cH) or (=L && cL) - cannot both be true)
// =H =L cH cL | out = cH | (=H & cL)
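
A worked instance of the formula in that comment (out = cH | (=H & cL)), written out as a scalar model with the u64 (lo, hi) halves Highway uses for 128-bit keys; a sketch, not part of the diff:

#include <stdint.h>

// Scalar model of Lt128: the high halves decide unless they are equal, in
// which case the low halves decide.
bool Lt128Scalar(uint64_t a_lo, uint64_t a_hi, uint64_t b_lo, uint64_t b_hi) {
  const bool eqH = (a_hi == b_hi);
  const bool cH = (a_hi < b_hi);
  const bool cL = (a_lo < b_lo);
  return cH | (eqH & cL);
}
// Example: a_hi=1, a_lo=~0ull and b_hi=2, b_lo=0: cH is true (1 < 2), so
// a < b regardless of the low halves.
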
@ -6371,7 +6505,7 @@ HWY_INLINE Mask128<T, N> Lt128Upper(Simd<T, N, 0> d, Vec128<T, N> a,
template <typename T, size_t N, HWY_IF_LE128(T, N)>
HWY_INLINE Mask128<T, N> Eq128(Simd<T, N, 0> d, Vec128<T, N> a,
Vec128<T, N> b) {
static_assert(!IsSigned<T>() && sizeof(T) == 8, "Use u64");
static_assert(!IsSigned<T>() && sizeof(T) == 8, "T must be u64");
const Vec128<T, N> eqHL = VecFromMask(d, Eq(a, b));
return MaskFromVec(And(Reverse2(d, eqHL), eqHL));
}
@ -6383,6 +6517,23 @@ HWY_INLINE Mask128<T, N> Eq128Upper(Simd<T, N, 0> d, Vec128<T, N> a,
return MaskFromVec(InterleaveUpper(d, eqHL, eqHL));
}
// ------------------------------ Ne128
template <typename T, size_t N, HWY_IF_LE128(T, N)>
HWY_INLINE Mask128<T, N> Ne128(Simd<T, N, 0> d, Vec128<T, N> a,
Vec128<T, N> b) {
static_assert(!IsSigned<T>() && sizeof(T) == 8, "T must be u64");
const Vec128<T, N> neHL = VecFromMask(d, Ne(a, b));
return MaskFromVec(Or(Reverse2(d, neHL), neHL));
}
template <typename T, size_t N, HWY_IF_LE128(T, N)>
HWY_INLINE Mask128<T, N> Ne128Upper(Simd<T, N, 0> d, Vec128<T, N> a,
Vec128<T, N> b) {
const Vec128<T, N> neHL = VecFromMask(d, Ne(a, b));
return MaskFromVec(InterleaveUpper(d, neHL, neHL));
}
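
Ne128 and Ne128Upper mirror Eq128/Eq128Upper: u64 lanes are treated as (lo, hi) pairs forming 128-bit keys, and the resulting mask is replicated to both lanes of each pair (or derived only from the upper lane for the *Upper variants). A usage sketch, assuming keys are stored low half first (illustrative function name, not part of the diff):

#include <stddef.h>
#include <stdint.h>
#include "hwy/highway.h"
namespace hn = hwy::HWY_NAMESPACE;

// Counts 128-bit keys (stored as u64 pairs: lo then hi) that differ between
// a and b. num_u64 is a multiple of Lanes(d).
size_t CountDifferent128(const uint64_t* HWY_RESTRICT a,
                         const uint64_t* HWY_RESTRICT b, size_t num_u64) {
  const hn::ScalableTag<uint64_t> d;
  size_t diff_lanes = 0;
  for (size_t i = 0; i < num_u64; i += hn::Lanes(d)) {
    const auto va = hn::LoadU(d, a + i);
    const auto vb = hn::LoadU(d, b + i);
    diff_lanes += hn::CountTrue(d, hn::Ne128(d, va, vb));
  }
  return diff_lanes / 2;  // the mask is set for both lanes of a differing pair
}
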
// ------------------------------ Min128, Max128 (Lt128)
// Without a native OddEven, it seems infeasible to go faster than Lt128.

View File

@ -265,6 +265,9 @@ HWY_API size_t Lanes(Simd<T, N, kPow2> d) {
HWY_SVE_FOREACH(HWY_SVE_FIRSTN, FirstN, whilelt)
#undef HWY_SVE_FIRSTN
template <class D>
using MFromD = decltype(FirstN(D(), 0));
namespace detail {
#define HWY_SVE_WRAP_PTRUE(BASE, CHAR, BITS, HALF, NAME, OP) \
@ -320,7 +323,9 @@ using VFromD = decltype(Set(D(), TFromD<D>()));
template <class D>
VFromD<D> Zero(D d) {
return Set(d, 0);
// Cast to support bfloat16_t.
const RebindToUnsigned<decltype(d)> du;
return BitCast(d, Set(du, 0));
}
// ------------------------------ Undefined
@ -638,10 +643,9 @@ HWY_SVE_FOREACH_UIF3264(HWY_SVE_RETV_ARGPVV, Mul, mul)
// ------------------------------ MulHigh
HWY_SVE_FOREACH_UI16(HWY_SVE_RETV_ARGPVV, MulHigh, mulh)
namespace detail {
// Not part of API, used internally:
HWY_SVE_FOREACH_UI32(HWY_SVE_RETV_ARGPVV, MulHigh, mulh)
HWY_SVE_FOREACH_U64(HWY_SVE_RETV_ARGPVV, MulHigh, mulh)
} // namespace detail
// ------------------------------ MulFixedPoint15
HWY_API svint16_t MulFixedPoint15(svint16_t a, svint16_t b) {
@ -732,6 +736,10 @@ HWY_API svbool_t Xor(svbool_t a, svbool_t b) {
return svsel_b(a, svnand_b_z(a, a, b), b); // a ? !(a & b) : b.
}
HWY_API svbool_t ExclusiveNeither(svbool_t a, svbool_t b) {
return svnor_b_z(HWY_SVE_PTRUE(8), a, b); // !a && !b, undefined if a && b.
}
// ------------------------------ CountTrue
#define HWY_SVE_COUNT_TRUE(BASE, CHAR, BITS, HALF, NAME, OP) \
@ -777,6 +785,12 @@ HWY_API intptr_t FindFirstTrue(D d, svbool_t m) {
CountTrue(d, svbrkb_b_z(detail::MakeMask(d), m)));
}
// ------------------------------ FindKnownFirstTrue
template <class D>
HWY_API size_t FindKnownFirstTrue(D d, svbool_t m) {
return CountTrue(d, svbrkb_b_z(detail::MakeMask(d), m));
}
// ------------------------------ IfThenElse
#define HWY_SVE_IF_THEN_ELSE(BASE, CHAR, BITS, HALF, NAME, OP) \
HWY_API HWY_SVE_V(BASE, BITS) \
@ -1221,8 +1235,9 @@ HWY_API svint32_t PromoteTo(Simd<int32_t, N, kPow2> dto, svuint8_t vfrom) {
// ------------------------------ PromoteTo F
// Unlike Highway's ZipLower, this returns the same type.
namespace detail {
HWY_SVE_FOREACH(HWY_SVE_RETV_ARGVV, ZipLower, zip1)
HWY_SVE_FOREACH(HWY_SVE_RETV_ARGVV, ZipLowerSame, zip1)
} // namespace detail
template <size_t N, int kPow2>
@ -1230,21 +1245,21 @@ HWY_API svfloat32_t PromoteTo(Simd<float32_t, N, kPow2> /* d */,
const svfloat16_t v) {
// svcvt* expects inputs in even lanes, whereas Highway wants lower lanes, so
// first replicate each lane once.
const svfloat16_t vv = detail::ZipLower(v, v);
const svfloat16_t vv = detail::ZipLowerSame(v, v);
return svcvt_f32_f16_x(detail::PTrue(Simd<float16_t, N, kPow2>()), vv);
}
template <size_t N, int kPow2>
HWY_API svfloat64_t PromoteTo(Simd<float64_t, N, kPow2> /* d */,
const svfloat32_t v) {
const svfloat32_t vv = detail::ZipLower(v, v);
const svfloat32_t vv = detail::ZipLowerSame(v, v);
return svcvt_f64_f32_x(detail::PTrue(Simd<float32_t, N, kPow2>()), vv);
}
template <size_t N, int kPow2>
HWY_API svfloat64_t PromoteTo(Simd<float64_t, N, kPow2> /* d */,
const svint32_t v) {
const svint32_t vv = detail::ZipLower(v, v);
const svint32_t vv = detail::ZipLowerSame(v, v);
return svcvt_f64_s32_x(detail::PTrue(Simd<int32_t, N, kPow2>()), vv);
}
@ -1431,8 +1446,8 @@ namespace detail {
NAME(HWY_SVE_V(BASE, BITS) hi, HWY_SVE_V(BASE, BITS) lo) { \
return sv##OP##_##CHAR##BITS(lo, hi); \
}
HWY_SVE_FOREACH(HWY_SVE_CONCAT_EVERY_SECOND, ConcatEven, uzp1)
HWY_SVE_FOREACH(HWY_SVE_CONCAT_EVERY_SECOND, ConcatOdd, uzp2)
HWY_SVE_FOREACH(HWY_SVE_CONCAT_EVERY_SECOND, ConcatEvenFull, uzp1)
HWY_SVE_FOREACH(HWY_SVE_CONCAT_EVERY_SECOND, ConcatOddFull, uzp2)
#if defined(__ARM_FEATURE_SVE_MATMUL_FP64)
HWY_SVE_FOREACH(HWY_SVE_CONCAT_EVERY_SECOND, ConcatEvenBlocks, uzp1q)
HWY_SVE_FOREACH(HWY_SVE_CONCAT_EVERY_SECOND, ConcatOddBlocks, uzp2q)
@ -1455,10 +1470,10 @@ template <class D>
HWY_API VFromD<D> ConcatOdd(D d, VFromD<D> hi, VFromD<D> lo) {
#if HWY_SVE_IS_POW2
(void)d;
return detail::ConcatOdd(hi, lo);
return detail::ConcatOddFull(hi, lo);
#else
const VFromD<D> hi_odd = detail::ConcatOdd(hi, hi);
const VFromD<D> lo_odd = detail::ConcatOdd(lo, lo);
const VFromD<D> hi_odd = detail::ConcatOddFull(hi, hi);
const VFromD<D> lo_odd = detail::ConcatOddFull(lo, lo);
return detail::Splice(hi_odd, lo_odd, FirstN(d, Lanes(d) / 2));
#endif
}
@ -1467,10 +1482,10 @@ template <class D>
HWY_API VFromD<D> ConcatEven(D d, VFromD<D> hi, VFromD<D> lo) {
#if HWY_SVE_IS_POW2
(void)d;
return detail::ConcatEven(hi, lo);
return detail::ConcatEvenFull(hi, lo);
#else
const VFromD<D> hi_odd = detail::ConcatEven(hi, hi);
const VFromD<D> lo_odd = detail::ConcatEven(lo, lo);
const VFromD<D> hi_odd = detail::ConcatEvenFull(hi, hi);
const VFromD<D> lo_odd = detail::ConcatEvenFull(lo, lo);
return detail::Splice(hi_odd, lo_odd, FirstN(d, Lanes(d) / 2));
#endif
}
@ -1480,25 +1495,28 @@ HWY_API VFromD<D> ConcatEven(D d, VFromD<D> hi, VFromD<D> lo) {
template <size_t N, int kPow2>
HWY_API svfloat16_t DemoteTo(Simd<float16_t, N, kPow2> d, const svfloat32_t v) {
const svfloat16_t in_even = svcvt_f16_f32_x(detail::PTrue(d), v);
return detail::ConcatEven(in_even, in_even); // only low 1/2 of result valid
return detail::ConcatEvenFull(in_even,
in_even); // lower half
}
template <size_t N, int kPow2>
HWY_API svuint16_t DemoteTo(Simd<bfloat16_t, N, kPow2> /* d */, svfloat32_t v) {
const svuint16_t in_even = BitCast(ScalableTag<uint16_t>(), v);
return detail::ConcatOdd(in_even, in_even); // can ignore upper half of vec
return detail::ConcatOddFull(in_even, in_even); // lower half
}
template <size_t N, int kPow2>
HWY_API svfloat32_t DemoteTo(Simd<float32_t, N, kPow2> d, const svfloat64_t v) {
const svfloat32_t in_even = svcvt_f32_f64_x(detail::PTrue(d), v);
return detail::ConcatEven(in_even, in_even); // only low 1/2 of result valid
return detail::ConcatEvenFull(in_even,
in_even); // lower half
}
template <size_t N, int kPow2>
HWY_API svint32_t DemoteTo(Simd<int32_t, N, kPow2> d, const svfloat64_t v) {
const svint32_t in_even = svcvt_s32_f64_x(detail::PTrue(d), v);
return detail::ConcatEven(in_even, in_even); // only low 1/2 of result valid
return detail::ConcatEvenFull(in_even,
in_even); // lower half
}
// ------------------------------ ConvertTo F
@ -1559,15 +1577,15 @@ HWY_API V InterleaveLower(D d, const V a, const V b) {
static_assert(IsSame<TFromD<D>, TFromV<V>>(), "D/V mismatch");
#if HWY_TARGET == HWY_SVE2_128
(void)d;
return detail::ZipLower(a, b);
return detail::ZipLowerSame(a, b);
#else
// Move lower halves of blocks to lower half of vector.
const Repartition<uint64_t, decltype(d)> d64;
const auto a64 = BitCast(d64, a);
const auto b64 = BitCast(d64, b);
const auto a_blocks = detail::ConcatEven(a64, a64); // only lower half needed
const auto b_blocks = detail::ConcatEven(b64, b64);
return detail::ZipLower(BitCast(d, a_blocks), BitCast(d, b_blocks));
const auto a_blocks = detail::ConcatEvenFull(a64, a64); // lower half
const auto b_blocks = detail::ConcatEvenFull(b64, b64);
return detail::ZipLowerSame(BitCast(d, a_blocks), BitCast(d, b_blocks));
#endif
}
@ -1582,7 +1600,8 @@ HWY_API V InterleaveLower(const V a, const V b) {
// "upper half" requires MaskUpperHalf.
#if HWY_TARGET == HWY_SVE2_128
namespace detail {
HWY_SVE_FOREACH(HWY_SVE_RETV_ARGVV, ZipUpper, zip2)
// Unlike Highway's ZipUpper, this returns the same type.
HWY_SVE_FOREACH(HWY_SVE_RETV_ARGVV, ZipUpperSame, zip2)
} // namespace detail
#endif
@ -1592,15 +1611,15 @@ template <class D, class V = VFromD<D>,
HWY_API V InterleaveUpper(D d, const V a, const V b) {
#if HWY_TARGET == HWY_SVE2_128
(void)d;
return detail::ZipUpper(a, b);
return detail::ZipUpperSame(a, b);
#else
// Move upper halves of blocks to lower half of vector.
const Repartition<uint64_t, decltype(d)> d64;
const auto a64 = BitCast(d64, a);
const auto b64 = BitCast(d64, b);
const auto a_blocks = detail::ConcatOdd(a64, a64); // only lower half needed
const auto b_blocks = detail::ConcatOdd(b64, b64);
return detail::ZipLower(BitCast(d, a_blocks), BitCast(d, b_blocks));
const auto a_blocks = detail::ConcatOddFull(a64, a64); // lower half
const auto b_blocks = detail::ConcatOddFull(b64, b64);
return detail::ZipLowerSame(BitCast(d, a_blocks), BitCast(d, b_blocks));
#endif
}
@ -1814,12 +1833,17 @@ HWY_API V LowerHalf(const V v) {
return v;
}
template <class D2, class V>
HWY_API V UpperHalf(const D2 d2, const V v) {
template <class DH, class V>
HWY_API V UpperHalf(const DH dh, const V v) {
const Twice<decltype(dh)> d;
// Cast so that we support bfloat16_t.
const RebindToUnsigned<decltype(d)> du;
const VFromD<decltype(du)> vu = BitCast(du, v);
#if HWY_TARGET == HWY_SVE_256 || HWY_TARGET == HWY_SVE2_128 // constexpr Lanes
return detail::Ext<Lanes(d2)>(v, v);
return BitCast(d, detail::Ext<Lanes(dh)>(vu, vu));
#else
return detail::Splice(v, v, detail::MaskUpperHalf(Twice<decltype(d2)>()));
const MFromD<decltype(du)> mask = detail::MaskUpperHalf(du);
return BitCast(d, detail::Splice(vu, vu, mask));
#endif
}
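
The BitCast-to-unsigned detour above (also used in Zero) is the usual workaround for element types the target intrinsics do not cover, notably bfloat16_t: do the lane movement on same-sized unsigned integers and cast back. A generic sketch of the idiom (illustrative helper name, not part of the diff):

#include "hwy/highway.h"
namespace hn = hwy::HWY_NAMESPACE;

// Applies a lane-rearranging op to a type the target may not support natively
// (e.g. bfloat16_t) by round-tripping through the same-sized unsigned type.
template <class D, class V = hn::VFromD<D>>
V OddEvenViaUnsigned(D d, V a, V b) {
  const hn::RebindToUnsigned<D> du;  // u16 lanes for bfloat16_t
  return hn::BitCast(d, hn::OddEven(hn::BitCast(du, a), hn::BitCast(du, b)));
}
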
@ -1842,14 +1866,14 @@ namespace detail {
return sv##OP##_##CHAR##BITS(pg, v); \
}
HWY_SVE_FOREACH_UI(HWY_SVE_REDUCE_ADD, SumOfLanes, addv)
HWY_SVE_FOREACH_F(HWY_SVE_REDUCE, SumOfLanes, addv)
HWY_SVE_FOREACH_UI(HWY_SVE_REDUCE_ADD, SumOfLanesM, addv)
HWY_SVE_FOREACH_F(HWY_SVE_REDUCE, SumOfLanesM, addv)
HWY_SVE_FOREACH_UI(HWY_SVE_REDUCE, MinOfLanes, minv)
HWY_SVE_FOREACH_UI(HWY_SVE_REDUCE, MaxOfLanes, maxv)
HWY_SVE_FOREACH_UI(HWY_SVE_REDUCE, MinOfLanesM, minv)
HWY_SVE_FOREACH_UI(HWY_SVE_REDUCE, MaxOfLanesM, maxv)
// NaN if all are
HWY_SVE_FOREACH_F(HWY_SVE_REDUCE, MinOfLanes, minnmv)
HWY_SVE_FOREACH_F(HWY_SVE_REDUCE, MaxOfLanes, maxnmv)
HWY_SVE_FOREACH_F(HWY_SVE_REDUCE, MinOfLanesM, minnmv)
HWY_SVE_FOREACH_F(HWY_SVE_REDUCE, MaxOfLanesM, maxnmv)
#undef HWY_SVE_REDUCE
#undef HWY_SVE_REDUCE_ADD
@ -1857,17 +1881,17 @@ HWY_SVE_FOREACH_F(HWY_SVE_REDUCE, MaxOfLanes, maxnmv)
template <class D, class V>
V SumOfLanes(D d, V v) {
return Set(d, detail::SumOfLanes(detail::MakeMask(d), v));
return Set(d, detail::SumOfLanesM(detail::MakeMask(d), v));
}
template <class D, class V>
V MinOfLanes(D d, V v) {
return Set(d, detail::MinOfLanes(detail::MakeMask(d), v));
return Set(d, detail::MinOfLanesM(detail::MakeMask(d), v));
}
template <class D, class V>
V MaxOfLanes(D d, V v) {
return Set(d, detail::MaxOfLanes(detail::MakeMask(d), v));
return Set(d, detail::MaxOfLanesM(detail::MakeMask(d), v));
}
@ -1882,19 +1906,19 @@ namespace detail {
return sv##OP##_##CHAR##BITS(mask, v); \
}
HWY_SVE_FOREACH(HWY_SVE_GET_LANE, GetLane, lasta)
HWY_SVE_FOREACH(HWY_SVE_GET_LANE, GetLaneM, lasta)
#undef HWY_SVE_GET_LANE
} // namespace detail
template <class V>
HWY_API TFromV<V> GetLane(V v) {
return detail::GetLane(v, detail::PFalse());
return detail::GetLaneM(v, detail::PFalse());
}
// ------------------------------ ExtractLane
template <class V>
HWY_API TFromV<V> ExtractLane(V v, size_t i) {
return detail::GetLane(v, FirstN(DFromV<V>(), i));
return detail::GetLaneM(v, FirstN(DFromV<V>(), i));
}
// ------------------------------ InsertLane (IfThenElse)
@ -2154,7 +2178,7 @@ HWY_API V Compress(V v, svbool_t mask) {
// bits 1, 2, 4, 8. Pre-multiply by N so we can use it as an offset for
// SetTableIndices.
const svuint64_t bits = Shl(Set(du64, 1), Iota(du64, 2));
const size_t offset = detail::SumOfLanes(mask, bits);
const size_t offset = detail::SumOfLanesM(mask, bits);
// See CompressIsPartition.
alignas(16) static constexpr uint64_t table[4 * 16] = {
@ -2196,8 +2220,8 @@ HWY_API V Compress(V v, svbool_t mask16) {
// Demote to 16-bit (already in range) - separately so we can splice
const V evenL = BitCast(d16, compressedL);
const V evenH = BitCast(d16, compressedH);
const V v16L = detail::ConcatEven(evenL, evenL); // only lower half needed
const V v16H = detail::ConcatEven(evenH, evenH);
const V v16L = detail::ConcatEvenFull(evenL, evenL); // lower half
const V v16H = detail::ConcatEvenFull(evenH, evenH);
// We need to combine two vectors of non-constexpr length, so the only option
// is Splice, which requires us to synthesize a mask. NOTE: this function uses
@ -2240,7 +2264,7 @@ HWY_API V CompressNot(V v, svbool_t mask) {
// bits 1, 2, 4, 8. Pre-multiply by N so we can use it as an offset for
// SetTableIndices.
const svuint64_t bits = Shl(Set(du64, 1), Iota(du64, 2));
const size_t offset = detail::SumOfLanes(mask, bits);
const size_t offset = detail::SumOfLanesM(mask, bits);
// See CompressIsPartition.
alignas(16) static constexpr uint64_t table[4 * 16] = {
@ -2478,7 +2502,7 @@ namespace detail {
return sv##OP##_##CHAR##BITS(v, kLane); \
}
HWY_SVE_FOREACH(HWY_SVE_BROADCAST, Broadcast, dup_lane)
HWY_SVE_FOREACH(HWY_SVE_BROADCAST, BroadcastLane, dup_lane)
#undef HWY_SVE_BROADCAST
} // namespace detail
#endif
@ -2490,7 +2514,7 @@ HWY_API V Broadcast(const V v) {
constexpr size_t kLanesPerBlock = detail::LanesPerBlock(du);
static_assert(0 <= kLane && kLane < kLanesPerBlock, "Invalid lane");
#if HWY_TARGET == HWY_SVE2_128
return detail::Broadcast<kLane>(v);
return detail::BroadcastLane<kLane>(v);
#else
auto idx = detail::OffsetsOf128BitBlocks(du, Iota(du, 0));
if (kLane != 0) {
@ -2585,10 +2609,11 @@ HWY_API VFromD<DW> ZipUpper(DW dw, V a, V b) {
template <size_t N, int kPow2>
HWY_API svfloat32_t PromoteTo(Simd<float32_t, N, kPow2> df32,
const svuint16_t v) {
return BitCast(df32, detail::ZipLower(svdup_n_u16(0), v));
return BitCast(df32, detail::ZipLowerSame(svdup_n_u16(0), v));
}
// ------------------------------ ReorderDemote2To (OddEven)
template <size_t N, int kPow2>
HWY_API svuint16_t ReorderDemote2To(Simd<bfloat16_t, N, kPow2> dbf16,
svfloat32_t a, svfloat32_t b) {
@ -2598,6 +2623,21 @@ HWY_API svuint16_t ReorderDemote2To(Simd<bfloat16_t, N, kPow2> dbf16,
return BitCast(dbf16, OddEven(BitCast(du16, a), BitCast(du16, b_in_even)));
}
template <size_t N, int kPow2>
HWY_API svint16_t ReorderDemote2To(Simd<int16_t, N, kPow2> d16, svint32_t a,
svint32_t b) {
#if HWY_TARGET == HWY_SVE2 || HWY_TARGET == HWY_SVE2_128
(void)d16;
const svint16_t a_in_even = svqxtnb_s32(a);
return svqxtnt_s32(a_in_even, b);
#else
const Half<decltype(d16)> dh;
const svint16_t a16 = BitCast(dh, detail::SaturateI<int16_t>(a));
const svint16_t b16 = BitCast(dh, detail::SaturateI<int16_t>(b));
return detail::InterleaveEven(a16, b16);
#endif
}
// ------------------------------ ZeroIfNegative (Lt, IfThenElse)
template <class V>
HWY_API V ZeroIfNegative(const V v) {
@ -2716,7 +2756,7 @@ template <class T, HWY_IF_LANE_SIZE(T, 2)>
HWY_INLINE svuint8_t BoolFromMask(svbool_t m) {
const ScalableTag<uint8_t> d8;
const svuint8_t b16 = BitCast(d8, svdup_n_u16_z(m, 1));
return detail::ConcatEven(b16, b16); // only lower half needed
return detail::ConcatEvenFull(b16, b16); // lower half
}
template <class T, HWY_IF_LANE_SIZE(T, 4)>
HWY_INLINE svuint8_t BoolFromMask(svbool_t m) {
@ -2726,7 +2766,7 @@ template <class T, HWY_IF_LANE_SIZE(T, 8)>
HWY_INLINE svuint8_t BoolFromMask(svbool_t m) {
const ScalableTag<uint32_t> d32;
const svuint32_t b64 = BitCast(d32, svdup_n_u64_z(m, 1));
return U8FromU32(detail::ConcatEven(b64, b64)); // only lower half needed
return U8FromU32(detail::ConcatEvenFull(b64, b64)); // lower half
}
// Compacts groups of 8 u8 into 8 contiguous bits in a 64-bit lane.
@ -2791,7 +2831,7 @@ namespace detail {
return sv##OP##_##CHAR##BITS(a, b); \
}
HWY_SVE_FOREACH_UI64(HWY_SVE_MUL_EVEN, MulEven, mullb)
HWY_SVE_FOREACH_UI64(HWY_SVE_MUL_EVEN, MulEvenNative, mullb)
#undef HWY_SVE_MUL_EVEN
} // namespace detail
#endif
@ -2799,27 +2839,28 @@ HWY_SVE_FOREACH_UI64(HWY_SVE_MUL_EVEN, MulEven, mullb)
template <class V, class DW = RepartitionToWide<DFromV<V>>>
HWY_API VFromD<DW> MulEven(const V a, const V b) {
#if HWY_TARGET == HWY_SVE2
return BitCast(DW(), detail::MulEven(a, b));
return BitCast(DW(), detail::MulEvenNative(a, b));
#else
const auto lo = Mul(a, b);
const auto hi = detail::MulHigh(a, b);
const auto hi = MulHigh(a, b);
return BitCast(DW(), detail::InterleaveEven(lo, hi));
#endif
}
HWY_API svuint64_t MulEven(const svuint64_t a, const svuint64_t b) {
const auto lo = Mul(a, b);
const auto hi = detail::MulHigh(a, b);
const auto hi = MulHigh(a, b);
return detail::InterleaveEven(lo, hi);
}
HWY_API svuint64_t MulOdd(const svuint64_t a, const svuint64_t b) {
const auto lo = Mul(a, b);
const auto hi = detail::MulHigh(a, b);
const auto hi = MulHigh(a, b);
return detail::InterleaveOdd(lo, hi);
}
// ------------------------------ ReorderWidenMulAccumulate (MulAdd, ZipLower)
template <size_t N, int kPow2>
HWY_API svfloat32_t ReorderWidenMulAccumulate(Simd<float, N, kPow2> df32,
svuint16_t a, svuint16_t b,
@ -2837,6 +2878,33 @@ HWY_API svfloat32_t ReorderWidenMulAccumulate(Simd<float, N, kPow2> df32,
return MulAdd(BitCast(df32, a0), BitCast(df32, b0), sum0);
}
template <size_t N, int kPow2>
HWY_API svint32_t ReorderWidenMulAccumulate(Simd<int32_t, N, kPow2> d32,
svint16_t a, svint16_t b,
const svint32_t sum0,
svint32_t& sum1) {
#if HWY_TARGET == HWY_SVE2 || HWY_TARGET == HWY_SVE2_128
(void)d32;
sum1 = svmlalt_s32(sum1, a, b);
return svmlalb_s32(sum0, a, b);
#else
const svbool_t pg = detail::PTrue(d32);
const svint32_t a0 = svunpklo_s32(a);
const svint32_t b0 = svunpklo_s32(b);
svint32_t a1, b1;
if (detail::IsFull(d32)) {
a1 = svunpkhi_s32(a);
b1 = svunpkhi_s32(b);
} else {
const Rebind<int16_t, decltype(d32)> d16h;
a1 = svunpklo_s32(UpperHalf(d16h, a));
b1 = svunpklo_s32(UpperHalf(d16h, b));
}
sum1 = svmla_s32_x(pg, sum1, a1, b1);
return svmla_s32_x(pg, sum0, a0, b0);
#endif
}
// ------------------------------ AESRound / CLMul
#if defined(__ARM_FEATURE_SVE2_AES) || \
@ -2886,7 +2954,8 @@ HWY_SVE_FOREACH_U(HWY_SVE_DUP, DupOddB, trn2) // actually for bool
#if HWY_TARGET == HWY_SVE_256 || HWY_IDE
template <class D>
HWY_INLINE svuint64_t Lt128Vec(D d, const svuint64_t a, const svuint64_t b) {
static_assert(!IsSigned<TFromD<D>>() && sizeof(TFromD<D>) == 8, "Use u64");
static_assert(!IsSigned<TFromD<D>>() && sizeof(TFromD<D>) == 8,
"D must be u64");
const svbool_t eqHx = Eq(a, b); // only odd lanes used
// Convert to vector: more pipelines can execute vector TRN* instructions
// than the predicate version.
@ -2905,7 +2974,8 @@ HWY_INLINE svbool_t Lt128(D d, const svuint64_t a, const svuint64_t b) {
#if HWY_TARGET == HWY_SVE_256
return MaskFromVec(detail::Lt128Vec(d, a, b));
#else
static_assert(!IsSigned<TFromD<D>>() && sizeof(TFromD<D>) == 8, "Use u64");
static_assert(!IsSigned<TFromD<D>>() && sizeof(TFromD<D>) == 8,
"D must be u64");
const svbool_t eqHx = Eq(a, b); // only odd lanes used
const svbool_t ltHL = Lt(a, b);
// Move into upper lane: ltL if the upper half is equal, otherwise ltH.
@ -2919,18 +2989,21 @@ HWY_INLINE svbool_t Lt128(D d, const svuint64_t a, const svuint64_t b) {
template <class D>
HWY_INLINE svbool_t Lt128Upper(D d, svuint64_t a, svuint64_t b) {
static_assert(!IsSigned<TFromD<D>>() && sizeof(TFromD<D>) == 8, "Use u64");
static_assert(!IsSigned<TFromD<D>>() && sizeof(TFromD<D>) == 8,
"D must be u64");
const svbool_t ltHL = Lt(a, b);
return detail::DupOddB(d, ltHL);
}
// ------------------------------ Eq128
// ------------------------------ Eq128, Ne128
#if HWY_TARGET == HWY_SVE_256 || HWY_IDE
namespace detail {
template <class D>
HWY_INLINE svuint64_t Eq128Vec(D d, const svuint64_t a, const svuint64_t b) {
static_assert(!IsSigned<TFromD<D>>() && sizeof(TFromD<D>) == 8, "Use u64");
static_assert(!IsSigned<TFromD<D>>() && sizeof(TFromD<D>) == 8,
"D must be u64");
// Convert to vector: more pipelines can execute vector TRN* instructions
// than the predicate version.
const svuint64_t eqHL = VecFromMask(d, Eq(a, b));
@ -2939,6 +3012,20 @@ HWY_INLINE svuint64_t Eq128Vec(D d, const svuint64_t a, const svuint64_t b) {
const svuint64_t eqLL = DupEven(eqHL);
return And(eqLL, eqHH);
}
template <class D>
HWY_INLINE svuint64_t Ne128Vec(D d, const svuint64_t a, const svuint64_t b) {
static_assert(!IsSigned<TFromD<D>>() && sizeof(TFromD<D>) == 8,
"D must be u64");
// Convert to vector: more pipelines can execute vector TRN* instructions
// than the predicate version.
const svuint64_t neHL = VecFromMask(d, Ne(a, b));
// Duplicate upper and lower.
const svuint64_t neHH = DupOdd(neHL);
const svuint64_t neLL = DupEven(neHL);
return Or(neLL, neHH);
}
} // namespace detail
#endif
@ -2947,7 +3034,8 @@ HWY_INLINE svbool_t Eq128(D d, const svuint64_t a, const svuint64_t b) {
#if HWY_TARGET == HWY_SVE_256
return MaskFromVec(detail::Eq128Vec(d, a, b));
#else
static_assert(!IsSigned<TFromD<D>>() && sizeof(TFromD<D>) == 8, "Use u64");
static_assert(!IsSigned<TFromD<D>>() && sizeof(TFromD<D>) == 8,
"D must be u64");
const svbool_t eqHL = Eq(a, b);
const svbool_t eqHH = detail::DupOddB(d, eqHL);
const svbool_t eqLL = detail::DupEvenB(d, eqHL);
@ -2955,15 +3043,38 @@ HWY_INLINE svbool_t Eq128(D d, const svuint64_t a, const svuint64_t b) {
#endif // HWY_TARGET != HWY_SVE_256
}
// ------------------------------ Eq128Upper
template <class D>
HWY_INLINE svbool_t Ne128(D d, const svuint64_t a, const svuint64_t b) {
#if HWY_TARGET == HWY_SVE_256
return MaskFromVec(detail::Ne128Vec(d, a, b));
#else
static_assert(!IsSigned<TFromD<D>>() && sizeof(TFromD<D>) == 8,
"D must be u64");
const svbool_t neHL = Ne(a, b);
const svbool_t neHH = detail::DupOddB(d, neHL);
const svbool_t neLL = detail::DupEvenB(d, neHL);
return Or(neLL, neHH);
#endif // HWY_TARGET != HWY_SVE_256
}
// ------------------------------ Eq128Upper, Ne128Upper
template <class D>
HWY_INLINE svbool_t Eq128Upper(D d, svuint64_t a, svuint64_t b) {
static_assert(!IsSigned<TFromD<D>>() && sizeof(TFromD<D>) == 8, "Use u64");
static_assert(!IsSigned<TFromD<D>>() && sizeof(TFromD<D>) == 8,
"D must be u64");
const svbool_t eqHL = Eq(a, b);
return detail::DupOddB(d, eqHL);
}
template <class D>
HWY_INLINE svbool_t Ne128Upper(D d, svuint64_t a, svuint64_t b) {
static_assert(!IsSigned<TFromD<D>>() && sizeof(TFromD<D>) == 8,
"D must be u64");
const svbool_t neHL = Ne(a, b);
return detail::DupOddB(d, neHL);
}
// ------------------------------ Min128, Max128 (Lt128)
template <class D>

View File

@ -18,6 +18,7 @@
#include <stddef.h>
#include <stdint.h>
#include <cmath> // std::abs, std::isnan
#include "hwy/base.h"
#include "hwy/ops/shared-inl.h"
@ -32,6 +33,9 @@ using Full128 = Simd<T, 16 / sizeof(T), 0>;
// (Wrapper class required for overloading comparison operators.)
template <typename T, size_t N = 16 / sizeof(T)>
struct Vec128 {
using PrivateT = T; // only for DFromV
static constexpr size_t kPrivateN = N; // only for DFromV
HWY_INLINE Vec128() = default;
Vec128(const Vec128&) = default;
Vec128& operator=(const Vec128&) = default;
@ -78,23 +82,11 @@ struct Mask128 {
Raw bits[16 / sizeof(T)] = {};
};
namespace detail {
// Deduce Simd<T, N, 0> from Vec128<T, N>
struct Deduce128 {
template <typename T, size_t N>
Simd<T, N, 0> operator()(Vec128<T, N>) const {
return Simd<T, N, 0>();
}
};
} // namespace detail
template <class V>
using DFromV = Simd<typename V::PrivateT, V::kPrivateN, 0>;
template <class V>
using DFromV = decltype(detail::Deduce128()(V()));
template <class V>
using TFromV = TFromD<DFromV<V>>;
using TFromV = typename V::PrivateT;
// ------------------------------ BitCast
@ -380,6 +372,12 @@ HWY_API Mask128<T, N> Xor(const Mask128<T, N> a, Mask128<T, N> b) {
return MaskFromVec(Xor(VecFromMask(d, a), VecFromMask(d, b)));
}
template <typename T, size_t N>
HWY_API Mask128<T, N> ExclusiveNeither(const Mask128<T, N> a, Mask128<T, N> b) {
const Simd<T, N, 0> d;
return MaskFromVec(AndNot(VecFromMask(d, a), Not(VecFromMask(d, b))));
}
// ================================================== SHIFTS
// ------------------------------ ShiftLeft/ShiftRight (BroadcastSignBit)
@ -1235,6 +1233,14 @@ HWY_API Mask128<uint64_t> Eq128(Simd<uint64_t, 2, 0> /* tag */,
return ret;
}
HWY_API Mask128<uint64_t> Ne128(Simd<uint64_t, 2, 0> /* tag */,
Vec128<uint64_t> a, const Vec128<uint64_t> b) {
const bool ne = a.raw[1] != b.raw[1] || a.raw[0] != b.raw[0];
Mask128<uint64_t> ret;
ret.bits[0] = ret.bits[1] = Mask128<uint64_t>::FromBool(ne);
return ret;
}
HWY_API Mask128<uint64_t> Eq128Upper(Simd<uint64_t, 2, 0> /* tag */,
Vec128<uint64_t> a,
const Vec128<uint64_t> b) {
@ -1244,6 +1250,15 @@ HWY_API Mask128<uint64_t> Eq128Upper(Simd<uint64_t, 2, 0> /* tag */,
return ret;
}
HWY_API Mask128<uint64_t> Ne128Upper(Simd<uint64_t, 2, 0> /* tag */,
Vec128<uint64_t> a,
const Vec128<uint64_t> b) {
const bool ne = a.raw[1] != b.raw[1];
Mask128<uint64_t> ret;
ret.bits[0] = ret.bits[1] = Mask128<uint64_t>::FromBool(ne);
return ret;
}
// ------------------------------ Min128, Max128 (Lt128)
template <class D, class V = VFromD<D>>
@ -1548,6 +1563,22 @@ HWY_API Vec128<bfloat16_t, 2 * N> ReorderDemote2To(
return BitCast(dbf16, IfVecThenElse(a_mask, BitCast(du32, a), b_in_lower));
}
template <size_t N>
HWY_API Vec128<int16_t, 2 * N> ReorderDemote2To(Simd<int16_t, 2 * N, 0> /*d16*/,
Vec128<int32_t, N> a,
Vec128<int32_t, N> b) {
const int16_t min = LimitsMin<int16_t>();
const int16_t max = LimitsMax<int16_t>();
Vec128<int16_t, 2 * N> ret;
for (size_t i = 0; i < N; ++i) {
ret.raw[i] = static_cast<int16_t>(HWY_MIN(HWY_MAX(min, a.raw[i]), max));
}
for (size_t i = 0; i < N; ++i) {
ret.raw[N + i] = static_cast<int16_t>(HWY_MIN(HWY_MAX(min, b.raw[i]), max));
}
return ret;
}
namespace detail {
HWY_INLINE void StoreU16ToF16(const uint16_t val,
@ -2233,9 +2264,8 @@ HWY_API bool AllFalse(Simd<T, N, 0> /* tag */, const Mask128<T, N> mask) {
template <typename T, size_t N>
HWY_API bool AllTrue(Simd<T, N, 0> /* tag */, const Mask128<T, N> mask) {
using Bits = typename Mask128<T, N>::Raw;
constexpr Bits kAll = static_cast<Bits>(~Bits{0});
Bits and_sum = kAll;
constexpr uint64_t kAll = LimitsMax<typename Mask128<T, N>::Raw>();
uint64_t and_sum = kAll;
for (size_t i = 0; i < N; ++i) {
and_sum &= mask.bits[i];
}
@ -2280,6 +2310,16 @@ HWY_API size_t CountTrue(Simd<T, N, 0> /* tag */, const Mask128<T, N> mask) {
return count;
}
template <typename T, size_t N>
HWY_API size_t FindKnownFirstTrue(Simd<T, N, 0> /* tag */,
const Mask128<T, N> mask) {
for (size_t i = 0; i < N; ++i) {
if (mask.bits[i] != 0) return i;
}
HWY_DASSERT(false);
return 0;
}
template <typename T, size_t N>
HWY_API intptr_t FindFirstTrue(Simd<T, N, 0> /* tag */,
const Mask128<T, N> mask) {
@ -2379,6 +2419,7 @@ HWY_API size_t CompressBitsStore(Vec128<T, N> v,
}
// ------------------------------ ReorderWidenMulAccumulate (MulAdd, ZipLower)
template <size_t N>
HWY_API Vec128<float, N> ReorderWidenMulAccumulate(Simd<float, N, 0> df32,
Vec128<bfloat16_t, 2 * N> a,
@ -2395,6 +2436,20 @@ HWY_API Vec128<float, N> ReorderWidenMulAccumulate(Simd<float, N, 0> df32,
return MulAdd(BitCast(df32, a0), BitCast(df32, b0), sum0);
}
template <size_t N>
HWY_API Vec128<int32_t, N> ReorderWidenMulAccumulate(
Simd<int32_t, N, 0> d32, Vec128<int16_t, 2 * N> a, Vec128<int16_t, 2 * N> b,
const Vec128<int32_t, N> sum0, Vec128<int32_t, N>& sum1) {
const Rebind<int16_t, decltype(d32)> d16;
// Avoid ZipLower/Upper so this also works on big-endian systems.
const Vec128<int32_t, N> a0 = PromoteTo(d32, LowerHalf(d16, a));
const Vec128<int32_t, N> a1 = PromoteTo(d32, UpperHalf(d16, a));
const Vec128<int32_t, N> b0 = PromoteTo(d32, LowerHalf(d16, b));
const Vec128<int32_t, N> b1 = PromoteTo(d32, UpperHalf(d16, b));
sum1 = MulAdd(BitCast(d32, a1), BitCast(d32, b1), sum1);
return MulAdd(BitCast(d32, a0), BitCast(d32, b0), sum0);
}
// ================================================== REDUCTIONS
template <typename T, size_t N>

View File

@ -15,6 +15,14 @@
// Target-independent types/functions defined after target-specific ops.
#include "hwy/base.h"
// Define detail::Shuffle1230 etc, but only when viewing the current header;
// normally this is included via highway.h, which includes ops/*.h.
#if HWY_IDE && !defined(HWY_HIGHWAY_INCLUDED)
#include "hwy/ops/emu128-inl.h"
#endif // HWY_IDE
// Relies on the external include guard in highway.h.
HWY_BEFORE_NAMESPACE();
namespace hwy {
@ -476,31 +484,15 @@ HWY_API void StoreInterleaved2(const V v0, const V v1, Simd<T, N, 0> d,
detail::StoreTransposedBlocks2(v10L, v10U, d, unaligned);
}
// 64 bits
template <typename T>
HWY_API void StoreInterleaved2(const Vec64<T> part0, const Vec64<T> part1,
Full64<T> /*tag*/, T* HWY_RESTRICT unaligned) {
// Use full vectors to reduce the number of stores.
const Full128<T> d_full;
const Vec128<T> v0{part0.raw};
const Vec128<T> v1{part1.raw};
const auto v10 = InterleaveLower(d_full, v0, v1);
StoreU(v10, d_full, unaligned);
}
// <= 32 bits
template <typename T, size_t N, HWY_IF_LE32(T, N)>
HWY_API void StoreInterleaved2(const Vec128<T, N> part0,
const Vec128<T, N> part1, Simd<T, N, 0> /*tag*/,
// <= 64 bits
template <class V, typename T, size_t N, HWY_IF_LE64(T, N)>
HWY_API void StoreInterleaved2(const V part0, const V part1, Simd<T, N, 0> d,
T* HWY_RESTRICT unaligned) {
// Use full vectors to reduce the number of stores.
const Full128<T> d_full;
const Vec128<T> v0{part0.raw};
const Vec128<T> v1{part1.raw};
const auto v10 = InterleaveLower(d_full, v0, v1);
alignas(16) T buf[16 / sizeof(T)];
StoreU(v10, d_full, buf);
CopyBytes<2 * N * sizeof(T)>(buf, unaligned);
const Twice<decltype(d)> d2;
const auto v0 = ZeroExtendVector(d2, part0);
const auto v1 = ZeroExtendVector(d2, part1);
const auto v10 = InterleaveLower(d2, v0, v1);
StoreU(v10, d2, unaligned);
}
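
The rewritten partial-vector path above now widens via ZeroExtendVector and a single InterleaveLower instead of bouncing through a stack buffer. For context, a typical caller of StoreInterleaved2 (e.g. packing two planes into one interleaved buffer); illustrative only, not part of the diff:

#include <stddef.h>
#include <stdint.h>
#include "hwy/highway.h"
namespace hn = hwy::HWY_NAMESPACE;

// Interleaves two planar u8 arrays so that out[2*i] = x[i], out[2*i+1] = y[i].
// n is assumed to be a multiple of Lanes(d).
void InterleavePlanes(const uint8_t* HWY_RESTRICT x,
                      const uint8_t* HWY_RESTRICT y, size_t n,
                      uint8_t* HWY_RESTRICT out) {
  const hn::ScalableTag<uint8_t> d;
  for (size_t i = 0; i < n; i += hn::Lanes(d)) {
    hn::StoreInterleaved2(hn::LoadU(d, x + i), hn::LoadU(d, y + i), d,
                          out + 2 * i);
  }
}
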
// ------------------------------ StoreInterleaved3 (CombineShiftRightBytes,
@ -526,8 +518,9 @@ template <typename T, size_t N, class V, HWY_IF_LANE_SIZE(T, 1),
HWY_API void StoreInterleaved3(const V v0, const V v1, const V v2,
Simd<T, N, 0> d, T* HWY_RESTRICT unaligned) {
const RebindToUnsigned<decltype(d)> du;
const auto k5 = Set(du, 5);
const auto k6 = Set(du, 6);
using TU = TFromD<decltype(du)>;
const auto k5 = Set(du, TU{5});
const auto k6 = Set(du, TU{6});
// Interleave (v0,v1,v2) to (MSB on left, lane 0 on right):
// v0[5], v2[4],v1[4],v0[4] .. v2[0],v1[0],v0[0]. We're expanding v0 lanes
@ -576,8 +569,8 @@ template <typename T, size_t N, class V, HWY_IF_LANE_SIZE(T, 2),
HWY_API void StoreInterleaved3(const V v0, const V v1, const V v2,
Simd<T, N, 0> d, T* HWY_RESTRICT unaligned) {
const Repartition<uint8_t, decltype(d)> du8;
const auto k2 = Set(du8, 2 * sizeof(T));
const auto k3 = Set(du8, 3 * sizeof(T));
const auto k2 = Set(du8, uint8_t{2 * sizeof(T)});
const auto k3 = Set(du8, uint8_t{3 * sizeof(T)});
// Interleave (v0,v1,v2) to (MSB on left, lane 0 on right):
// v1[2],v0[2], v2[1],v1[1],v0[1], v2[0],v1[0],v0[0]. 0x80 so lanes to be
@ -666,16 +659,15 @@ HWY_API void StoreInterleaved3(const V v0, const V v1, const V v2,
}
// 64-bit vector, 8-bit lanes
template <typename T, HWY_IF_LANE_SIZE(T, 1)>
HWY_API void StoreInterleaved3(const Vec64<T> part0, const Vec64<T> part1,
const Vec64<T> part2, Full64<T> d,
T* HWY_RESTRICT unaligned) {
template <class V, typename T, HWY_IF_LANE_SIZE(T, 1)>
HWY_API void StoreInterleaved3(const V part0, const V part1, const V part2,
Full64<T> d, T* HWY_RESTRICT unaligned) {
constexpr size_t N = 16 / sizeof(T);
// Use full vectors for the shuffles and first result.
const Full128<uint8_t> du;
const Full128<T> d_full;
const auto k5 = Set(du, 5);
const auto k6 = Set(du, 6);
const auto k5 = Set(du, uint8_t{5});
const auto k6 = Set(du, uint8_t{6});
const Vec128<T> v0{part0.raw};
const Vec128<T> v1{part1.raw};
@ -708,7 +700,7 @@ HWY_API void StoreInterleaved3(const Vec64<T> part0, const Vec64<T> part1,
const auto B0 = TableLookupBytesOr0(v0, shuf_B0);
const auto B1 = TableLookupBytesOr0(v1, shuf_B1);
const auto B2 = TableLookupBytesOr0(v2, shuf_B2);
const Vec64<T> B{(B0 | B1 | B2).raw};
const V B{(B0 | B1 | B2).raw};
StoreU(B, d, unaligned + 1 * N);
}
@ -720,8 +712,8 @@ HWY_API void StoreInterleaved3(const Vec64<T> part0, const Vec64<T> part1,
const Full128<T> d;
const Full128<uint8_t> du8;
constexpr size_t N = 16 / sizeof(T);
const auto k2 = Set(du8, 2 * sizeof(T));
const auto k3 = Set(du8, 3 * sizeof(T));
const auto k2 = Set(du8, uint8_t{2 * sizeof(T)});
const auto k3 = Set(du8, uint8_t{3 * sizeof(T)});
const Vec128<T> v0{part0.raw};
const Vec128<T> v1{part1.raw};
@ -975,7 +967,7 @@ HWY_API void StoreInterleaved4(const Vec128<T, N> part0,
// ------------------------------ AESRound
// Cannot implement on scalar: need at least 16 bytes for TableLookupBytes.
#if HWY_TARGET != HWY_SCALAR
#if HWY_TARGET != HWY_SCALAR || HWY_IDE
// Define for white-box testing, even if native instructions are available.
namespace detail {
@ -991,7 +983,7 @@ namespace detail {
template <class V> // u8
HWY_INLINE V SubBytes(V state) {
const DFromV<V> du;
const auto mask = Set(du, 0xF);
const auto mask = Set(du, uint8_t{0xF});
// Change polynomial basis to GF(2^4)
{
@ -1034,7 +1026,7 @@ HWY_INLINE V SubBytes(V state) {
0xFA, 0x35, 0x2B, 0x41, 0xD1, 0x90, 0x1E, 0x8E};
const auto affL = TableLookupBytesOr0(LoadDup128(du, kAffineL), outL);
const auto affU = TableLookupBytesOr0(LoadDup128(du, kAffineU), outU);
return Xor(Xor(affL, affU), Set(du, 0x63));
return Xor(Xor(affL, affU), Set(du, uint8_t{0x63}));
}
} // namespace detail
@ -1080,7 +1072,7 @@ HWY_API V MixColumns(const V state) {
1, 2, 3, 0, 5, 6, 7, 4, 9, 10, 11, 8, 13, 14, 15, 12};
const RebindToSigned<decltype(du)> di; // can only do signed comparisons
const auto msb = Lt(BitCast(di, state), Zero(di));
const auto overflow = BitCast(du, IfThenElseZero(msb, Set(di, 0x1B)));
const auto overflow = BitCast(du, IfThenElseZero(msb, Set(di, int8_t{0x1B})));
const auto d = Xor(Add(state, state), overflow); // = state*2 in GF(2^8).
const auto s2301 = TableLookupBytes(state, LoadDup128(du, k2301));
const auto d_s2301 = Xor(d, s2301);
@ -1200,7 +1192,7 @@ HWY_API V PopulationCount(V v) {
HWY_ALIGN constexpr uint8_t kLookup[16] = {
0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,
};
const auto lo = And(v, Set(d, 0xF));
const auto lo = And(v, Set(d, uint8_t{0xF}));
const auto hi = ShiftRight<4>(v);
const auto lookup = LoadDup128(d, kLookup);
return Add(TableLookupBytes(lookup, hi), TableLookupBytes(lookup, lo));
@ -1215,9 +1207,10 @@ HWY_API V PopulationCount(V v) {
static_assert(IsSame<TFromD<D>, uint8_t>(), "V must be u8");
const D d;
// See https://arxiv.org/pdf/1611.07612.pdf, Figure 3
v = Sub(v, And(ShiftRight<1>(v), Set(d, 0x55)));
v = Add(And(ShiftRight<2>(v), Set(d, 0x33)), And(v, Set(d, 0x33)));
return And(Add(v, ShiftRight<4>(v)), Set(d, 0x0F));
const V k33 = Set(d, uint8_t{0x33});
v = Sub(v, And(ShiftRight<1>(v), Set(d, uint8_t{0x55})));
v = Add(And(ShiftRight<2>(v), k33), And(v, k33));
return And(Add(v, ShiftRight<4>(v)), Set(d, uint8_t{0x0F}));
}
#endif // HWY_TARGET != HWY_RVV
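
The branch above is the classic SWAR population count from the cited paper: fold 1-bit counts into 2-bit counts (0x55), then into 4-bit counts (0x33), then add the two nibbles (0x0F). A scalar walk-through of the same constants on one byte, as a sketch (not part of the diff):

#include <stdint.h>

// Scalar equivalent of the u8 SWAR path above.
uint8_t PopCountByte(uint8_t v) {
  v = static_cast<uint8_t>(v - ((v >> 1) & 0x55));           // 2-bit counts
  v = static_cast<uint8_t>((v & 0x33) + ((v >> 2) & 0x33));  // 4-bit counts
  return static_cast<uint8_t>((v + (v >> 4)) & 0x0F);        // final count
}
// Example: v = 0b10110110 has 5 set bits, and PopCountByte returns 5.
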
@ -1227,7 +1220,7 @@ HWY_API V PopulationCount(V v) {
const D d;
const Repartition<uint8_t, decltype(d)> d8;
const auto vals = BitCast(d, PopulationCount(BitCast(d8, v)));
return Add(ShiftRight<8>(vals), And(vals, Set(d, 0xFF)));
return Add(ShiftRight<8>(vals), And(vals, Set(d, uint16_t{0xFF})));
}
template <typename V, class D = DFromV<V>, HWY_IF_LANE_SIZE_D(D, 4)>
@ -1236,7 +1229,7 @@ HWY_API V PopulationCount(V v) {
const D d;
Repartition<uint16_t, decltype(d)> d16;
auto vals = BitCast(d, PopulationCount(BitCast(d16, v)));
return Add(ShiftRight<16>(vals), And(vals, Set(d, 0xFF)));
return Add(ShiftRight<16>(vals), And(vals, Set(d, uint32_t{0xFF})));
}
#if HWY_HAVE_INTEGER64
@ -1246,7 +1239,7 @@ HWY_API V PopulationCount(V v) {
const D d;
Repartition<uint32_t, decltype(d)> d32;
auto vals = BitCast(d, PopulationCount(BitCast(d32, v)));
return Add(ShiftRight<32>(vals), And(vals, Set(d, 0xFF)));
return Add(ShiftRight<32>(vals), And(vals, Set(d, 0xFFULL)));
}
#endif

View File

@ -494,9 +494,11 @@ using VFromD = decltype(Set(D(), TFromD<D>()));
// ------------------------------ Zero
template <typename T, size_t N, int kPow2>
HWY_API VFromD<Simd<T, N, kPow2>> Zero(Simd<T, N, kPow2> d) {
return Set(d, T(0));
template <class D>
HWY_API VFromD<D> Zero(D d) {
// Cast to support bfloat16_t.
const RebindToUnsigned<decltype(d)> du;
return BitCast(d, Set(du, 0));
}
// ------------------------------ Undefined
@ -1109,6 +1111,9 @@ HWY_RVV_FOREACH_B(HWY_RVV_RETM_ARGMM, Or, or)
// ------------------------------ Xor
HWY_RVV_FOREACH_B(HWY_RVV_RETM_ARGMM, Xor, xor)
// ------------------------------ ExclusiveNeither
HWY_RVV_FOREACH_B(HWY_RVV_RETM_ARGMM, ExclusiveNeither, xnor)
#undef HWY_RVV_RETM_ARGMM
// ------------------------------ IfThenElse
@ -1219,14 +1224,19 @@ HWY_API V IfNegativeThenElse(V v, V yes, V no) {
// ------------------------------ FindFirstTrue
#define HWY_RVV_FIND_FIRST_TRUE(SEW, SHIFT, MLEN, NAME, OP) \
template <class D> \
HWY_API intptr_t FindFirstTrue(D d, HWY_RVV_M(MLEN) m) { \
static_assert(MLenFromD(d) == MLEN, "Type mismatch"); \
return vfirst_m_b##MLEN(m, Lanes(d)); \
#define HWY_RVV_FIND_FIRST_TRUE(SEW, SHIFT, MLEN, NAME, OP) \
template <class D> \
HWY_API intptr_t FindFirstTrue(D d, HWY_RVV_M(MLEN) m) { \
static_assert(MLenFromD(d) == MLEN, "Type mismatch"); \
return vfirst_m_b##MLEN(m, Lanes(d)); \
} \
template <class D> \
HWY_API size_t FindKnownFirstTrue(D d, HWY_RVV_M(MLEN) m) { \
static_assert(MLenFromD(d) == MLEN, "Type mismatch"); \
return static_cast<size_t>(vfirst_m_b##MLEN(m, Lanes(d))); \
}
HWY_RVV_FOREACH_B(HWY_RVV_FIND_FIRST_TRUE, _, _)
HWY_RVV_FOREACH_B(HWY_RVV_FIND_FIRST_TRUE, , _)
#undef HWY_RVV_FIND_FIRST_TRUE
// ------------------------------ AllFalse
@ -2642,9 +2652,10 @@ HWY_API V ShiftLeftLanes(const D d, const V v) {
using TI = TFromD<decltype(di)>;
const auto shifted = detail::SlideUp(v, v, kLanes);
// Match x86 semantics by zeroing lower lanes in 128-bit blocks
const auto idx_mod = detail::AndS(
detail::Iota0(di), static_cast<TI>(detail::LanesPerBlock(di) - 1));
const auto clear = detail::LtS(BitCast(di, idx_mod), static_cast<TI>(kLanes));
const auto idx_mod =
detail::AndS(BitCast(di, detail::Iota0(di)),
static_cast<TI>(detail::LanesPerBlock(di) - 1));
const auto clear = detail::LtS(idx_mod, static_cast<TI>(kLanes));
return IfThenZeroElse(clear, shifted);
}
@ -2681,9 +2692,8 @@ HWY_API V ShiftRightLanes(const Simd<T, N, kPow2> d, V v) {
// Match x86 semantics by zeroing upper lanes in 128-bit blocks
const size_t lpb = detail::LanesPerBlock(di);
const auto idx_mod =
detail::AndS(detail::Iota0(di), static_cast<TI>(lpb - 1));
const auto keep =
detail::LtS(BitCast(di, idx_mod), static_cast<TI>(lpb - kLanes));
detail::AndS(BitCast(di, detail::Iota0(di)), static_cast<TI>(lpb - 1));
const auto keep = detail::LtS(idx_mod, static_cast<TI>(lpb - kLanes));
return IfThenElseZero(keep, shifted);
}
@ -2827,12 +2837,14 @@ HWY_API V PopulationCount(V v) {
// ------------------------------ LoadDup128
template <class D, typename T = TFromD<D>>
HWY_API VFromD<D> LoadDup128(D d, const T* const HWY_RESTRICT p) {
const auto loaded = Load(d, p);
// Broadcast the first block
const auto idx = detail::AndS(detail::Iota0(d),
static_cast<T>(detail::LanesPerBlock(d) - 1));
template <class D>
HWY_API VFromD<D> LoadDup128(D d, const TFromD<D>* const HWY_RESTRICT p) {
const VFromD<D> loaded = Load(d, p);
// idx must be unsigned for TableLookupLanes.
using TU = MakeUnsigned<TFromD<D>>;
const TU mask = static_cast<TU>(detail::LanesPerBlock(d) - 1);
// Broadcast the first block.
const VFromD<RebindToUnsigned<D>> idx = detail::AndS(detail::Iota0(d), mask);
return TableLookupLanes(loaded, idx);
}
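
RVV has no dedicated 128-bit broadcast load, so LoadDup128 loads normally and builds TableLookupLanes indices from Iota masked down to the lanes-per-block, repeating the first block across the whole vector. A scalar model of that index computation (a sketch, not part of the diff; lanes_per_block is a power of two):

#include <stddef.h>
#include <stdint.h>

// Scalar model of the broadcast in LoadDup128: lane i reads from lane
// (i & (lanes_per_block - 1)) of the loaded vector, i.e. the first 128-bit
// block is repeated.
void BroadcastFirstBlock(const uint32_t* loaded, size_t num_lanes,
                         size_t lanes_per_block, uint32_t* out) {
  for (size_t i = 0; i < num_lanes; ++i) {
    out[i] = loaded[i & (lanes_per_block - 1)];  // idx = Iota0 & mask
  }
}
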
@ -3086,7 +3098,7 @@ HWY_INLINE V MulOdd(const V a, const V b) {
return OddEven(hi, detail::Slide1Down(lo));
}
// ------------------------------ ReorderDemote2To (OddEven)
// ------------------------------ ReorderDemote2To (OddEven, Combine)
template <size_t N, int kPow2>
HWY_API VFromD<Simd<uint16_t, N, kPow2>> ReorderDemote2To(
@ -3099,22 +3111,42 @@ HWY_API VFromD<Simd<uint16_t, N, kPow2>> ReorderDemote2To(
return BitCast(dbf16, OddEven(BitCast(du16, a), BitCast(du16, b_in_even)));
}
// If LMUL is not the max, Combine first to avoid another DemoteTo.
template <size_t N, int kPow2, hwy::EnableIf<(kPow2 < 3)>* = nullptr,
class D32 = RepartitionToWide<Simd<int16_t, N, kPow2>>>
HWY_API VFromD<Simd<int16_t, N, kPow2>> ReorderDemote2To(
Simd<int16_t, N, kPow2> d16, VFromD<D32> a, VFromD<D32> b) {
const Twice<D32> d32t;
const VFromD<decltype(d32t)> ab = Combine(d32t, a, b);
return DemoteTo(d16, ab);
}
// Max LMUL: must DemoteTo first, then Combine.
template <size_t N, class V32 = VFromD<RepartitionToWide<Simd<int16_t, N, 3>>>>
HWY_API VFromD<Simd<int16_t, N, 3>> ReorderDemote2To(Simd<int16_t, N, 3> d16,
V32 a, V32 b) {
const Half<decltype(d16)> d16h;
const VFromD<decltype(d16h)> a16 = DemoteTo(d16h, a);
const VFromD<decltype(d16h)> b16 = DemoteTo(d16h, b);
return Combine(d16, a16, b16);
}
// ------------------------------ ReorderWidenMulAccumulate (MulAdd, ZipLower)
template <class DF>
using DU16FromDF = RepartitionToNarrow<RebindToUnsigned<DF>>;
namespace detail {
template <size_t N, int kPow2>
HWY_API auto ReorderWidenMulAccumulate(Simd<float, N, kPow2> df32,
VFromD<DU16FromDF<decltype(df32)>> a,
VFromD<DU16FromDF<decltype(df32)>> b,
const VFromD<decltype(df32)> sum0,
VFromD<decltype(df32)>& sum1)
-> VFromD<decltype(df32)> {
const DU16FromDF<decltype(df32)> du16;
const RebindToUnsigned<decltype(df32)> du32;
// Non-overloaded wrapper function so we can define DF32 in template args.
template <
size_t N, int kPow2, class DF32 = Simd<float, N, kPow2>,
class VF32 = VFromD<DF32>,
class DU16 = RepartitionToNarrow<RebindToUnsigned<Simd<float, N, kPow2>>>>
HWY_API VF32 ReorderWidenMulAccumulateBF16(Simd<float, N, kPow2> df32,
VFromD<DU16> a, VFromD<DU16> b,
const VF32 sum0, VF32& sum1) {
const DU16 du16;
const RebindToUnsigned<DF32> du32;
using VU32 = VFromD<decltype(du32)>;
const VFromD<decltype(du16)> zero = Zero(du16);
const VFromD<DU16> zero = Zero(du16);
const VU32 a0 = ZipLower(du32, zero, BitCast(du16, a));
const VU32 a1 = ZipUpper(du32, zero, BitCast(du16, a));
const VU32 b0 = ZipLower(du32, zero, BitCast(du16, b));
@ -3123,10 +3155,68 @@ HWY_API auto ReorderWidenMulAccumulate(Simd<float, N, kPow2> df32,
return MulAdd(BitCast(df32, a0), BitCast(df32, b0), sum0);
}
#define HWY_RVV_WIDEN_MACC(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \
SHIFT, MLEN, NAME, OP) \
template <size_t N> \
HWY_API HWY_RVV_V(BASE, SEWD, LMULD) NAME( \
HWY_RVV_D(BASE, SEWD, N, SHIFT + 1) d, HWY_RVV_V(BASE, SEWD, LMULD) sum, \
HWY_RVV_V(BASE, SEW, LMUL) a, HWY_RVV_V(BASE, SEW, LMUL) b) { \
return OP##CHAR##SEWD##LMULD(sum, a, b, Lanes(d)); \
}
HWY_RVV_FOREACH_I16(HWY_RVV_WIDEN_MACC, WidenMulAcc, vwmacc_vv_, _EXT_VIRT)
#undef HWY_RVV_WIDEN_MACC
// If LMUL is not the max, we can WidenMul first (3 instructions).
template <size_t N, int kPow2, hwy::EnableIf<(kPow2 < 3)>* = nullptr,
class D32 = Simd<int32_t, N, kPow2>, class V32 = VFromD<D32>,
class D16 = RepartitionToNarrow<D32>>
HWY_API VFromD<D32> ReorderWidenMulAccumulateI16(Simd<int32_t, N, kPow2> d32,
VFromD<D16> a, VFromD<D16> b,
const V32 sum0, V32& sum1) {
const Twice<decltype(d32)> d32t;
using V32T = VFromD<decltype(d32t)>;
V32T sum = Combine(d32t, sum0, sum1);
sum = detail::WidenMulAcc(d32t, sum, a, b);
sum1 = UpperHalf(d32, sum);
return LowerHalf(d32, sum);
}
// Max LMUL: must LowerHalf first (4 instructions).
template <size_t N, class D32 = Simd<int32_t, N, 3>, class V32 = VFromD<D32>,
class D16 = RepartitionToNarrow<D32>>
HWY_API VFromD<D32> ReorderWidenMulAccumulateI16(Simd<int32_t, N, 3> d32,
VFromD<D16> a, VFromD<D16> b,
const V32 sum0, V32& sum1) {
const Half<D16> d16h;
using V16H = VFromD<decltype(d16h)>;
const V16H a0 = LowerHalf(d16h, a);
const V16H a1 = UpperHalf(d16h, a);
const V16H b0 = LowerHalf(d16h, b);
const V16H b1 = UpperHalf(d16h, b);
sum1 = detail::WidenMulAcc(d32, sum1, a1, b1);
return detail::WidenMulAcc(d32, sum0, a0, b0);
}
} // namespace detail
template <size_t N, int kPow2, class VN, class VW>
HWY_API VW ReorderWidenMulAccumulate(Simd<float, N, kPow2> d32, VN a, VN b,
const VW sum0, VW& sum1) {
return detail::ReorderWidenMulAccumulateBF16(d32, a, b, sum0, sum1);
}
template <size_t N, int kPow2, class VN, class VW>
HWY_API VW ReorderWidenMulAccumulate(Simd<int32_t, N, kPow2> d32, VN a, VN b,
const VW sum0, VW& sum1) {
return detail::ReorderWidenMulAccumulateI16(d32, a, b, sum0, sum1);
}
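// Usage sketch (illustrative, not part of the upstream sources): callers
// typically keep two accumulators because the lane order within sum0/sum1 is
// unspecified, and reduce them only after the loop. `pa`, `pb` and `num` are
// assumed caller-provided; `num` is assumed to be a multiple of Lanes(d16).
//
//   const ScalableTag<int32_t> d32;
//   const RepartitionToNarrow<decltype(d32)> d16;  // int16_t
//   auto sum0 = Zero(d32);
//   auto sum1 = Zero(d32);
//   for (size_t i = 0; i < num; i += Lanes(d16)) {
//     const auto a = LoadU(d16, pa + i);
//     const auto b = LoadU(d16, pb + i);
//     sum0 = ReorderWidenMulAccumulate(d32, a, b, sum0, sum1);
//   }
//   const int32_t dot = GetLane(SumOfLanes(d32, Add(sum0, sum1)));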
// ------------------------------ Lt128
template <class D>
HWY_INLINE MFromD<D> Lt128(D d, const VFromD<D> a, const VFromD<D> b) {
static_assert(!IsSigned<TFromD<D>>() && sizeof(TFromD<D>) == 8, "Use u64");
static_assert(!IsSigned<TFromD<D>>() && sizeof(TFromD<D>) == 8,
"D must be u64");
// Truth table of Eq and Compare for Hi and Lo u64.
// (removed lines with (=H && cH) or (=L && cL) - cannot both be true)
// =H =L cH cL | out = cH | (=H & cL)
@ -3152,7 +3242,8 @@ HWY_INLINE MFromD<D> Lt128(D d, const VFromD<D> a, const VFromD<D> b) {
// ------------------------------ Lt128Upper
template <class D>
HWY_INLINE MFromD<D> Lt128Upper(D d, const VFromD<D> a, const VFromD<D> b) {
static_assert(!IsSigned<TFromD<D>>() && sizeof(TFromD<D>) == 8, "Use u64");
static_assert(!IsSigned<TFromD<D>>() && sizeof(TFromD<D>) == 8,
"D must be u64");
const VFromD<D> ltHL = VecFromMask(d, Lt(a, b));
// Replicate H to its neighbor.
return MaskFromVec(OddEven(ltHL, detail::Slide1Down(ltHL)));
@ -3161,7 +3252,8 @@ HWY_INLINE MFromD<D> Lt128Upper(D d, const VFromD<D> a, const VFromD<D> b) {
// ------------------------------ Eq128
template <class D>
HWY_INLINE MFromD<D> Eq128(D d, const VFromD<D> a, const VFromD<D> b) {
static_assert(!IsSigned<TFromD<D>>() && sizeof(TFromD<D>) == 8, "Use u64");
static_assert(!IsSigned<TFromD<D>>() && sizeof(TFromD<D>) == 8,
"D must be u64");
const VFromD<D> eqHL = VecFromMask(d, Eq(a, b));
const VFromD<D> eqLH = Reverse2(d, eqHL);
return MaskFromVec(And(eqHL, eqLH));
@ -3170,12 +3262,33 @@ HWY_INLINE MFromD<D> Eq128(D d, const VFromD<D> a, const VFromD<D> b) {
// ------------------------------ Eq128Upper
template <class D>
HWY_INLINE MFromD<D> Eq128Upper(D d, const VFromD<D> a, const VFromD<D> b) {
static_assert(!IsSigned<TFromD<D>>() && sizeof(TFromD<D>) == 8, "Use u64");
static_assert(!IsSigned<TFromD<D>>() && sizeof(TFromD<D>) == 8,
"D must be u64");
const VFromD<D> eqHL = VecFromMask(d, Eq(a, b));
// Replicate H to its neighbor.
return MaskFromVec(OddEven(eqHL, detail::Slide1Down(eqHL)));
}
// ------------------------------ Ne128
template <class D>
HWY_INLINE MFromD<D> Ne128(D d, const VFromD<D> a, const VFromD<D> b) {
static_assert(!IsSigned<TFromD<D>>() && sizeof(TFromD<D>) == 8,
"D must be u64");
const VFromD<D> neHL = VecFromMask(d, Ne(a, b));
const VFromD<D> neLH = Reverse2(d, neHL);
return MaskFromVec(Or(neHL, neLH));
}
// ------------------------------ Ne128Upper
template <class D>
HWY_INLINE MFromD<D> Ne128Upper(D d, const VFromD<D> a, const VFromD<D> b) {
static_assert(!IsSigned<TFromD<D>>() && sizeof(TFromD<D>) == 8,
"D must be u64");
const VFromD<D> neHL = VecFromMask(d, Ne(a, b));
// Replicate H to its neighbor.
return MaskFromVec(OddEven(neHL, detail::Slide1Down(neHL)));
}
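// Note (assumption, added for clarity): the *128 ops view each aligned pair
// of u64 lanes as one 128-bit value, with the low half in the even lane and
// the high half in the odd lane. For a single pair, a = {1, 0} (value 1) and
// b = {0, 1} (value 2^64) give an all-true Lt128(d, a, b) and Ne128(d, a, b)
// for both lanes of the pair. The *Upper variants compare only the high
// (odd) lane and broadcast that verdict to its even neighbor.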
// ------------------------------ Min128, Max128 (Lt128)
template <class D>

@ -33,6 +33,9 @@ using Sisd = Simd<T, 1, 0>;
// (Wrapper class required for overloading comparison operators.)
template <typename T>
struct Vec1 {
using PrivateT = T; // only for DFromV
static constexpr size_t kPrivateN = 1; // only for DFromV
HWY_INLINE Vec1() = default;
Vec1(const Vec1&) = default;
Vec1& operator=(const Vec1&) = default;
@ -78,23 +81,11 @@ class Mask1 {
Raw bits;
};
namespace detail {
// Deduce Sisd<T> from Vec1<T>
struct Deduce1 {
template <typename T>
Sisd<T> operator()(Vec1<T>) const {
return Sisd<T>();
}
};
} // namespace detail
template <class V>
using DFromV = Simd<typename V::PrivateT, V::kPrivateN, 0>;
template <class V>
using DFromV = decltype(detail::Deduce1()(V()));
template <class V>
using TFromV = TFromD<DFromV<V>>;
using TFromV = typename V::PrivateT;
// ------------------------------ BitCast
@ -341,6 +332,12 @@ HWY_API Mask1<T> Xor(const Mask1<T> a, Mask1<T> b) {
return MaskFromVec(Xor(VecFromMask(d, a), VecFromMask(d, b)));
}
template <typename T>
HWY_API Mask1<T> ExclusiveNeither(const Mask1<T> a, Mask1<T> b) {
const Sisd<T> d;
return MaskFromVec(AndNot(VecFromMask(d, a), Not(VecFromMask(d, b))));
}
// ================================================== SHIFTS
// ------------------------------ ShiftLeft/ShiftRight (BroadcastSignBit)
@ -365,7 +362,7 @@ HWY_API Vec1<T> ShiftRight(const Vec1<T> v) {
// signed shifts are still implementation-defined.
using TU = hwy::MakeUnsigned<T>;
const Sisd<TU> du;
const TU shifted = BitCast(du, v).raw >> kBits;
const TU shifted = static_cast<TU>(BitCast(du, v).raw >> kBits);
const TU sign = BitCast(du, BroadcastSignBit(v)).raw;
const size_t sign_shift =
static_cast<size_t>(static_cast<int>(sizeof(TU)) * 8 - 1 - kBits);
@ -426,7 +423,7 @@ HWY_API Vec1<T> ShiftRightSame(const Vec1<T> v, int bits) {
// signed shifts are still implementation-defined.
using TU = hwy::MakeUnsigned<T>;
const Sisd<TU> du;
const TU shifted = BitCast(du, v).raw >> bits;
const TU shifted = static_cast<TU>(BitCast(du, v).raw >> bits);
const TU sign = BitCast(du, BroadcastSignBit(v)).raw;
const size_t sign_shift =
static_cast<size_t>(static_cast<int>(sizeof(TU)) * 8 - 1 - bits);
@ -557,16 +554,47 @@ HWY_API Vec1<uint16_t> AverageRound(const Vec1<uint16_t> a,
template <typename T>
HWY_API Vec1<T> Abs(const Vec1<T> a) {
const T i = a.raw;
return (i >= 0 || i == hwy::LimitsMin<T>()) ? a : Vec1<T>(-i);
return (i >= 0 || i == hwy::LimitsMin<T>()) ? a : Vec1<T>(static_cast<T>(-i));
}
HWY_API Vec1<float> Abs(const Vec1<float> a) {
return Vec1<float>(std::abs(a.raw));
return Vec1<float>(fabsf(a.raw));
}
HWY_API Vec1<double> Abs(const Vec1<double> a) {
return Vec1<double>(std::abs(a.raw));
return Vec1<double>(fabs(a.raw));
}
// ------------------------------ min/max
// ------------------------------ Min/Max
// <cmath> may be unavailable, so implement our own.
namespace detail {
static inline float Abs(float f) {
uint32_t i;
CopyBytes<4>(&f, &i);
i &= 0x7FFFFFFFu;
CopyBytes<4>(&i, &f);
return f;
}
static inline double Abs(double f) {
uint64_t i;
CopyBytes<8>(&f, &i);
i &= 0x7FFFFFFFFFFFFFFFull;
CopyBytes<8>(&i, &f);
return f;
}
static inline bool SignBit(float f) {
uint32_t i;
CopyBytes<4>(&f, &i);
return (i >> 31) != 0;
}
static inline bool SignBit(double f) {
uint64_t i;
CopyBytes<8>(&f, &i);
return (i >> 63) != 0;
}
} // namespace detail
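// Note (assumption, added for clarity): these helpers read the IEEE-754 bit
// pattern via CopyBytes rather than reinterpret_cast, which avoids
// strict-aliasing undefined behavior and removes the dependency on <cmath>.
// They also behave correctly for signed zero, e.g. detail::SignBit(-0.0f)
// returns true whereas the comparison (-0.0f < 0.0f) is false.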
template <typename T, HWY_IF_NOT_FLOAT(T)>
HWY_API Vec1<T> Min(const Vec1<T> a, const Vec1<T> b) {
@ -575,8 +603,8 @@ HWY_API Vec1<T> Min(const Vec1<T> a, const Vec1<T> b) {
template <typename T, HWY_IF_FLOAT(T)>
HWY_API Vec1<T> Min(const Vec1<T> a, const Vec1<T> b) {
if (std::isnan(a.raw)) return b;
if (std::isnan(b.raw)) return a;
if (isnan(a.raw)) return b;
if (isnan(b.raw)) return a;
return Vec1<T>(HWY_MIN(a.raw, b.raw));
}
@ -587,8 +615,8 @@ HWY_API Vec1<T> Max(const Vec1<T> a, const Vec1<T> b) {
template <typename T, HWY_IF_FLOAT(T)>
HWY_API Vec1<T> Max(const Vec1<T> a, const Vec1<T> b) {
if (std::isnan(a.raw)) return b;
if (std::isnan(b.raw)) return a;
if (isnan(a.raw)) return b;
if (isnan(b.raw)) return a;
return Vec1<T>(HWY_MAX(a.raw, b.raw));
}
@ -707,10 +735,10 @@ HWY_API Vec1<float> ApproximateReciprocalSqrt(const Vec1<float> v) {
// Square root
HWY_API Vec1<float> Sqrt(const Vec1<float> v) {
return Vec1<float>(std::sqrt(v.raw));
return Vec1<float>(sqrtf(v.raw));
}
HWY_API Vec1<double> Sqrt(const Vec1<double> v) {
return Vec1<double>(std::sqrt(v.raw));
return Vec1<double>(sqrt(v.raw));
}
// ------------------------------ Floating-point rounding
@ -725,7 +753,7 @@ HWY_API Vec1<T> Round(const Vec1<T> v) {
const TI rounded = static_cast<TI>(v.raw + bias);
if (rounded == 0) return CopySignToAbs(Vec1<T>(0), v);
// Round to even
if ((rounded & 1) && std::abs(static_cast<T>(rounded) - v.raw) == T(0.5)) {
if ((rounded & 1) && detail::Abs(static_cast<T>(rounded) - v.raw) == T(0.5)) {
return Vec1<T>(static_cast<T>(rounded - (v.raw < T(0) ? -1 : 1)));
}
return Vec1<T>(static_cast<T>(rounded));
@ -737,12 +765,12 @@ HWY_API Vec1<int32_t> NearestInt(const Vec1<float> v) {
using TI = int32_t;
const T abs = Abs(v).raw;
const bool signbit = std::signbit(v.raw);
const bool is_sign = detail::SignBit(v.raw);
if (!(abs < MantissaEnd<T>())) { // Huge or NaN
// Check if too large to cast or NaN
if (!(abs <= static_cast<T>(LimitsMax<TI>()))) {
return Vec1<TI>(signbit ? LimitsMin<TI>() : LimitsMax<TI>());
return Vec1<TI>(is_sign ? LimitsMin<TI>() : LimitsMax<TI>());
}
return Vec1<int32_t>(static_cast<TI>(v.raw));
}
@ -750,8 +778,8 @@ HWY_API Vec1<int32_t> NearestInt(const Vec1<float> v) {
const TI rounded = static_cast<TI>(v.raw + bias);
if (rounded == 0) return Vec1<int32_t>(0);
// Round to even
if ((rounded & 1) && std::abs(static_cast<T>(rounded) - v.raw) == T(0.5)) {
return Vec1<TI>(rounded - (signbit ? -1 : 1));
if ((rounded & 1) && detail::Abs(static_cast<T>(rounded) - v.raw) == T(0.5)) {
return Vec1<TI>(rounded - (is_sign ? -1 : 1));
}
return Vec1<TI>(rounded);
}
@ -1090,19 +1118,19 @@ HWY_API Vec1<ToT> PromoteTo(Sisd<ToT> /* tag */, Vec1<FromT> from) {
// so we overload for FromT=double and ToT={float,int32_t}.
HWY_API Vec1<float> DemoteTo(Sisd<float> /* tag */, Vec1<double> from) {
// Prevent ubsan errors when converting float to narrower integer/float
if (std::isinf(from.raw) ||
std::fabs(from.raw) > static_cast<double>(HighestValue<float>())) {
return Vec1<float>(std::signbit(from.raw) ? LowestValue<float>()
: HighestValue<float>());
if (isinf(from.raw) ||
fabs(from.raw) > static_cast<double>(HighestValue<float>())) {
return Vec1<float>(detail::SignBit(from.raw) ? LowestValue<float>()
: HighestValue<float>());
}
return Vec1<float>(static_cast<float>(from.raw));
}
HWY_API Vec1<int32_t> DemoteTo(Sisd<int32_t> /* tag */, Vec1<double> from) {
// Prevent ubsan errors when converting int32_t to narrower integer/int32_t
if (std::isinf(from.raw) ||
std::fabs(from.raw) > static_cast<double>(HighestValue<int32_t>())) {
return Vec1<int32_t>(std::signbit(from.raw) ? LowestValue<int32_t>()
: HighestValue<int32_t>());
if (isinf(from.raw) ||
fabs(from.raw) > static_cast<double>(HighestValue<int32_t>())) {
return Vec1<int32_t>(detail::SignBit(from.raw) ? LowestValue<int32_t>()
: HighestValue<int32_t>());
}
return Vec1<int32_t>(static_cast<int32_t>(from.raw));
}
@ -1196,10 +1224,9 @@ HWY_API Vec1<ToT> ConvertTo(Sisd<ToT> /* tag */, Vec1<FromT> from) {
// float## -> int##: return closest representable value. We cannot exactly
// represent LimitsMax<ToT> in FromT, so use double.
const double f = static_cast<double>(from.raw);
if (std::isinf(from.raw) ||
std::fabs(f) > static_cast<double>(LimitsMax<ToT>())) {
return Vec1<ToT>(std::signbit(from.raw) ? LimitsMin<ToT>()
: LimitsMax<ToT>());
if (isinf(from.raw) || fabs(f) > static_cast<double>(LimitsMax<ToT>())) {
return Vec1<ToT>(detail::SignBit(from.raw) ? LimitsMin<ToT>()
: LimitsMax<ToT>());
}
return Vec1<ToT>(static_cast<ToT>(from.raw));
}
@ -1468,6 +1495,11 @@ HWY_API intptr_t FindFirstTrue(Sisd<T> /* tag */, const Mask1<T> mask) {
return mask.bits == 0 ? -1 : 0;
}
template <typename T>
HWY_API size_t FindKnownFirstTrue(Sisd<T> /* tag */, const Mask1<T> /* m */) {
return 0; // There is only one lane and we know it is true.
}
// ------------------------------ Compress, CompressBits
template <typename T>
@ -1530,6 +1562,14 @@ HWY_API Vec1<float> ReorderWidenMulAccumulate(Sisd<float> /* tag */,
Vec1<float>(F32FromBF16(b.raw)), sum0);
}
HWY_API Vec1<int32_t> ReorderWidenMulAccumulate(Sisd<int32_t> /* tag */,
Vec1<int16_t> a,
Vec1<int16_t> b,
const Vec1<int32_t> sum0,
Vec1<int32_t>& /* sum1 */) {
return Vec1<int32_t>(a.raw * b.raw + sum0.raw);
}
// ================================================== REDUCTIONS
// Sum of all lanes, i.e. the only one.

@ -319,7 +319,7 @@
#define HWY_HAVE_FLOAT64 0
#define HWY_MEM_OPS_MIGHT_FAULT 1
#define HWY_NATIVE_FMA 0
#define HWY_CAP_GE256 0
#define HWY_CAP_GE256 1
#define HWY_CAP_GE512 0
#define HWY_NAMESPACE N_WASM_EMU256

@ -15,7 +15,17 @@
// Per-target definitions shared by ops/*.h and user code.
#include <cmath>
// We are covered by the highway.h include guard, but generic_ops-inl.h
// includes this again #if HWY_IDE.
#if defined(HIGHWAY_HWY_OPS_SHARED_TOGGLE) == \
defined(HWY_TARGET_TOGGLE)
#ifdef HIGHWAY_HWY_OPS_SHARED_TOGGLE
#undef HIGHWAY_HWY_OPS_SHARED_TOGGLE
#else
#define HIGHWAY_HWY_OPS_SHARED_TOGGLE
#endif
#include <math.h>
#include "hwy/base.h"
@ -218,6 +228,9 @@ using Half = typename D::Half;
template <class D>
using Twice = typename D::Twice;
template <typename T>
using Full16 = Simd<T, 2 / sizeof(T), 0>;
template <typename T>
using Full32 = Simd<T, 4 / sizeof(T), 0>;
@ -309,3 +322,5 @@ using VecArg = V;
} // namespace HWY_NAMESPACE
} // namespace hwy
HWY_AFTER_NAMESPACE();
#endif // HIGHWAY_HWY_OPS_SHARED_TOGGLE

@ -49,6 +49,11 @@ HWY_BEFORE_NAMESPACE();
namespace hwy {
namespace HWY_NAMESPACE {
#if HWY_TARGET == HWY_WASM_EMU256
template <typename T>
using Full256 = Simd<T, 32 / sizeof(T), 0>;
#endif
namespace detail {
template <typename T>
@ -67,6 +72,9 @@ class Vec128 {
using Raw = typename detail::Raw128<T>::type;
public:
using PrivateT = T; // only for DFromV
static constexpr size_t kPrivateN = N; // only for DFromV
// Compound assignment. Only usable if there is a corresponding non-member
// binary operator overload. For example, only f32 and f64 support division.
HWY_INLINE Vec128& operator*=(const Vec128 other) {
@ -100,29 +108,20 @@ using Vec64 = Vec128<T, 8 / sizeof(T)>;
template <typename T>
using Vec32 = Vec128<T, 4 / sizeof(T)>;
template <typename T>
using Vec16 = Vec128<T, 2 / sizeof(T)>;
// FF..FF or 0.
template <typename T, size_t N = 16 / sizeof(T)>
struct Mask128 {
typename detail::Raw128<T>::type raw;
};
namespace detail {
// Deduce Simd<T, N, 0> from Vec128<T, N>
struct DeduceD {
template <typename T, size_t N>
Simd<T, N, 0> operator()(Vec128<T, N>) const {
return Simd<T, N, 0>();
}
};
} // namespace detail
template <class V>
using DFromV = Simd<typename V::PrivateT, V::kPrivateN, 0>;
template <class V>
using DFromV = decltype(detail::DeduceD()(V()));
template <class V>
using TFromV = TFromD<DFromV<V>>;
using TFromV = typename V::PrivateT;
// ------------------------------ BitCast
@ -237,7 +236,7 @@ HWY_API Vec128<T, N> Undefined(Simd<T, N, 0> d) {
HWY_DIAGNOSTICS(pop)
// Returns a vector with lane i=[0, N) set to "first" + i.
template <typename T, size_t N, typename T2>
template <typename T, size_t N, typename T2, HWY_IF_LE128(T, N)>
Vec128<T, N> Iota(const Simd<T, N, 0> d, const T2 first) {
HWY_ALIGN T lanes[16 / sizeof(T)];
for (size_t i = 0; i < 16 / sizeof(T); ++i) {
@ -1219,7 +1218,7 @@ HWY_API Mask128<float, N> operator>=(const Vec128<float, N> a,
// ------------------------------ FirstN (Iota, Lt)
template <typename T, size_t N>
template <typename T, size_t N, HWY_IF_LE128(T, N)>
HWY_API Mask128<T, N> FirstN(const Simd<T, N, 0> d, size_t num) {
const RebindToSigned<decltype(d)> di; // Signed comparisons may be cheaper.
return RebindMask(d, Iota(di, 0) < Set(di, static_cast<MakeSigned<T>>(num)));
@ -1412,6 +1411,12 @@ HWY_API Mask128<T, N> Xor(const Mask128<T, N> a, Mask128<T, N> b) {
return MaskFromVec(Xor(VecFromMask(d, a), VecFromMask(d, b)));
}
template <typename T, size_t N>
HWY_API Mask128<T, N> ExclusiveNeither(const Mask128<T, N> a, Mask128<T, N> b) {
const Simd<T, N, 0> d;
return MaskFromVec(AndNot(VecFromMask(d, a), Not(VecFromMask(d, b))));
}
// ------------------------------ Shl (BroadcastSignBit, IfThenElse)
// The x86 multiply-by-Pow2() trick will not work because WASM saturates
@ -1568,7 +1573,7 @@ HWY_API Vec128<T, N> Load(Simd<T, N, 0> /* tag */, const T* HWY_RESTRICT p) {
}
// LoadU == Load.
template <typename T, size_t N>
template <typename T, size_t N, HWY_IF_LE128(T, N)>
HWY_API Vec128<T, N> LoadU(Simd<T, N, 0> d, const T* HWY_RESTRICT p) {
return Load(d, p);
}
@ -2516,7 +2521,7 @@ HWY_API Vec128<T> Shuffle0123(const Vec128<T> v) {
// ------------------------------ TableLookupLanes
// Returned by SetTableIndices for use by TableLookupLanes.
template <typename T, size_t N>
template <typename T, size_t N = 16 / sizeof(T)>
struct Indices128 {
__v128_u raw;
};
@ -2822,7 +2827,7 @@ HWY_API VFromD<DW> ZipUpper(DW dw, V a, V b) {
// ------------------------------ Combine (InterleaveLower)
// N = N/2 + N/2 (upper half undefined)
template <typename T, size_t N>
template <typename T, size_t N, HWY_IF_LE128(T, N)>
HWY_API Vec128<T, N> Combine(Simd<T, N, 0> d, Vec128<T, N / 2> hi_half,
Vec128<T, N / 2> lo_half) {
const Half<decltype(d)> d2;
@ -2836,7 +2841,7 @@ HWY_API Vec128<T, N> Combine(Simd<T, N, 0> d, Vec128<T, N / 2> hi_half,
// ------------------------------ ZeroExtendVector (Combine, IfThenElseZero)
template <typename T, size_t N>
template <typename T, size_t N, HWY_IF_LE128(T, N)>
HWY_API Vec128<T, N> ZeroExtendVector(Simd<T, N, 0> d, Vec128<T, N / 2> lo) {
return IfThenElseZero(FirstN(d, N / 2), Vec128<T, N>{lo.raw});
}
@ -3095,75 +3100,75 @@ HWY_API Vec128<T> ReverseBlocks(Full128<T> /* tag */, const Vec128<T> v) {
// ------------------------------ Promotions (part w/ narrow lanes -> full)
// Unsigned: zero-extend.
template <size_t N>
template <size_t N, HWY_IF_LE128(uint16_t, N)>
HWY_API Vec128<uint16_t, N> PromoteTo(Simd<uint16_t, N, 0> /* tag */,
const Vec128<uint8_t, N> v) {
return Vec128<uint16_t, N>{wasm_u16x8_extend_low_u8x16(v.raw)};
}
template <size_t N>
template <size_t N, HWY_IF_LE128(uint32_t, N)>
HWY_API Vec128<uint32_t, N> PromoteTo(Simd<uint32_t, N, 0> /* tag */,
const Vec128<uint8_t, N> v) {
return Vec128<uint32_t, N>{
wasm_u32x4_extend_low_u16x8(wasm_u16x8_extend_low_u8x16(v.raw))};
}
template <size_t N>
template <size_t N, HWY_IF_LE128(int16_t, N)>
HWY_API Vec128<int16_t, N> PromoteTo(Simd<int16_t, N, 0> /* tag */,
const Vec128<uint8_t, N> v) {
return Vec128<int16_t, N>{wasm_u16x8_extend_low_u8x16(v.raw)};
}
template <size_t N>
template <size_t N, HWY_IF_LE128(int32_t, N)>
HWY_API Vec128<int32_t, N> PromoteTo(Simd<int32_t, N, 0> /* tag */,
const Vec128<uint8_t, N> v) {
return Vec128<int32_t, N>{
wasm_u32x4_extend_low_u16x8(wasm_u16x8_extend_low_u8x16(v.raw))};
}
template <size_t N>
template <size_t N, HWY_IF_LE128(uint32_t, N)>
HWY_API Vec128<uint32_t, N> PromoteTo(Simd<uint32_t, N, 0> /* tag */,
const Vec128<uint16_t, N> v) {
return Vec128<uint32_t, N>{wasm_u32x4_extend_low_u16x8(v.raw)};
}
template <size_t N>
template <size_t N, HWY_IF_LE128(uint64_t, N)>
HWY_API Vec128<uint64_t, N> PromoteTo(Simd<uint64_t, N, 0> /* tag */,
const Vec128<uint32_t, N> v) {
return Vec128<uint64_t, N>{wasm_u64x2_extend_low_u32x4(v.raw)};
}
template <size_t N>
template <size_t N, HWY_IF_LE128(int32_t, N)>
HWY_API Vec128<int32_t, N> PromoteTo(Simd<int32_t, N, 0> /* tag */,
const Vec128<uint16_t, N> v) {
return Vec128<int32_t, N>{wasm_u32x4_extend_low_u16x8(v.raw)};
}
// Signed: replicate sign bit.
template <size_t N>
template <size_t N, HWY_IF_LE128(int16_t, N)>
HWY_API Vec128<int16_t, N> PromoteTo(Simd<int16_t, N, 0> /* tag */,
const Vec128<int8_t, N> v) {
return Vec128<int16_t, N>{wasm_i16x8_extend_low_i8x16(v.raw)};
}
template <size_t N>
template <size_t N, HWY_IF_LE128(int32_t, N)>
HWY_API Vec128<int32_t, N> PromoteTo(Simd<int32_t, N, 0> /* tag */,
const Vec128<int8_t, N> v) {
return Vec128<int32_t, N>{
wasm_i32x4_extend_low_i16x8(wasm_i16x8_extend_low_i8x16(v.raw))};
}
template <size_t N>
template <size_t N, HWY_IF_LE128(int32_t, N)>
HWY_API Vec128<int32_t, N> PromoteTo(Simd<int32_t, N, 0> /* tag */,
const Vec128<int16_t, N> v) {
return Vec128<int32_t, N>{wasm_i32x4_extend_low_i16x8(v.raw)};
}
template <size_t N>
template <size_t N, HWY_IF_LE128(int64_t, N)>
HWY_API Vec128<int64_t, N> PromoteTo(Simd<int64_t, N, 0> /* tag */,
const Vec128<int32_t, N> v) {
return Vec128<int64_t, N>{wasm_i64x2_extend_low_i32x4(v.raw)};
}
template <size_t N>
template <size_t N, HWY_IF_LE128(double, N)>
HWY_API Vec128<double, N> PromoteTo(Simd<double, N, 0> /* tag */,
const Vec128<int32_t, N> v) {
return Vec128<double, N>{wasm_f64x2_convert_low_i32x4(v.raw)};
}
template <size_t N>
template <size_t N, HWY_IF_LE128(float, N)>
HWY_API Vec128<float, N> PromoteTo(Simd<float, N, 0> df32,
const Vec128<float16_t, N> v) {
const RebindToSigned<decltype(df32)> di32;
@ -3184,7 +3189,7 @@ HWY_API Vec128<float, N> PromoteTo(Simd<float, N, 0> df32,
return BitCast(df32, ShiftLeft<31>(sign) | bits32);
}
template <size_t N>
template <size_t N, HWY_IF_LE128(float, N)>
HWY_API Vec128<float, N> PromoteTo(Simd<float, N, 0> df32,
const Vec128<bfloat16_t, N> v) {
const Rebind<uint16_t, decltype(df32)> du16;
@ -3285,7 +3290,33 @@ HWY_API Vec128<bfloat16_t, 2 * N> ReorderDemote2To(
const RebindToUnsigned<decltype(dbf16)> du16;
const Repartition<uint32_t, decltype(dbf16)> du32;
const Vec128<uint32_t, N> b_in_even = ShiftRight<16>(BitCast(du32, b));
return BitCast(dbf16, OddEven(BitCast(du16, a), BitCast(du16, b_in_even)));
const auto u16 = OddEven(BitCast(du16, a), BitCast(du16, b_in_even));
return BitCast(dbf16, u16);
}
// Specializations for partial vectors because i16x8_narrow_i32x4 sets lanes
// above 2*N.
HWY_API Vec128<int16_t, 2> ReorderDemote2To(Simd<int16_t, 2, 0> dn,
Vec128<int32_t, 1> a,
Vec128<int32_t, 1> b) {
const Half<decltype(dn)> dnh;
// Pretend the result has twice as many lanes so we can InterleaveLower.
const Vec128<int16_t, 2> an{DemoteTo(dnh, a).raw};
const Vec128<int16_t, 2> bn{DemoteTo(dnh, b).raw};
return InterleaveLower(an, bn);
}
HWY_API Vec128<int16_t, 4> ReorderDemote2To(Simd<int16_t, 4, 0> dn,
Vec128<int32_t, 2> a,
Vec128<int32_t, 2> b) {
const Half<decltype(dn)> dnh;
// Pretend the result has twice as many lanes so we can InterleaveLower.
const Vec128<int16_t, 4> an{DemoteTo(dnh, a).raw};
const Vec128<int16_t, 4> bn{DemoteTo(dnh, b).raw};
return InterleaveLower(an, bn);
}
HWY_API Vec128<int16_t> ReorderDemote2To(Full128<int16_t> /*d16*/,
Vec128<int32_t> a, Vec128<int32_t> b) {
return Vec128<int16_t>{wasm_i16x8_narrow_i32x4(a.raw, b.raw)};
}
// For already range-limited input [0, 255].
@ -3308,8 +3339,8 @@ HWY_API Vec128<To, 1> TruncateTo(Simd<To, 1, 0> /* tag */,
return Vec128<To, 1>{v1.raw};
}
HWY_API Vec128<uint8_t, 2> TruncateTo(Simd<uint8_t, 2, 0> /* tag */,
const Vec128<uint64_t> v) {
HWY_API Vec16<uint8_t> TruncateTo(Full16<uint8_t> /* tag */,
const Vec128<uint64_t> v) {
const Full128<uint8_t> d;
const auto v1 = BitCast(d, v);
const auto v2 = ConcatEven(d, v1, v1);
@ -3317,16 +3348,16 @@ HWY_API Vec128<uint8_t, 2> TruncateTo(Simd<uint8_t, 2, 0> /* tag */,
return LowerHalf(LowerHalf(LowerHalf(ConcatEven(d, v4, v4))));
}
HWY_API Vec128<uint16_t, 2> TruncateTo(Simd<uint16_t, 2, 0> /* tag */,
const Vec128<uint64_t> v) {
HWY_API Vec32<uint16_t> TruncateTo(Full32<uint16_t> /* tag */,
const Vec128<uint64_t> v) {
const Full128<uint16_t> d;
const auto v1 = BitCast(d, v);
const auto v2 = ConcatEven(d, v1, v1);
return LowerHalf(LowerHalf(ConcatEven(d, v2, v2)));
}
HWY_API Vec128<uint32_t, 2> TruncateTo(Simd<uint32_t, 2, 0> /* tag */,
const Vec128<uint64_t> v) {
HWY_API Vec64<uint32_t> TruncateTo(Full64<uint32_t> /* tag */,
const Vec128<uint64_t> v) {
const Full128<uint32_t> d;
const auto v1 = BitCast(d, v);
return LowerHalf(ConcatEven(d, v1, v1));
@ -3683,6 +3714,13 @@ HWY_API bool AllTrue(const Simd<T, N, 0> /* d */, const Mask128<T, N> m) {
return AllTrue(Full128<T>(), Mask128<T>{Or(mask, m).raw});
}
template <typename T, size_t N>
HWY_API size_t FindKnownFirstTrue(const Simd<T, N, 0> /* tag */,
const Mask128<T, N> mask) {
const uint64_t bits = detail::BitsFromMask(mask);
return Num0BitsBelowLS1Bit_Nonzero64(bits);
}
template <typename T, size_t N>
HWY_API intptr_t FindFirstTrue(const Simd<T, N, 0> /* tag */,
const Mask128<T, N> mask) {
@ -4102,7 +4140,11 @@ HWY_INLINE Vec128<T, N> CompressNot(Vec128<T, N> v, const uint64_t mask_bits) {
template <typename T>
struct CompressIsPartition {
#if HWY_TARGET == HWY_WASM_EMU256
enum { value = 0 };
#else
enum { value = 1 };
#endif
};
// Single lane: no-op
@ -4265,6 +4307,16 @@ HWY_API Vec128<float, N> ReorderWidenMulAccumulate(Simd<float, N, 0> df32,
return MulAdd(BitCast(df32, a0), BitCast(df32, b0), sum0);
}
// Even if N=1, the input is always at least 2 lanes, hence i32x4_dot_i16x8 is
// safe.
template <size_t N>
HWY_API Vec128<int32_t, N> ReorderWidenMulAccumulate(
Simd<int32_t, N, 0> /*d32*/, Vec128<int16_t, 2 * N> a,
Vec128<int16_t, 2 * N> b, const Vec128<int32_t, N> sum0,
Vec128<int32_t, N>& /*sum1*/) {
return sum0 + Vec128<int32_t, N>{wasm_i32x4_dot_i16x8(a.raw, b.raw)};
}
// ------------------------------ Reductions
namespace detail {
@ -4353,6 +4405,30 @@ HWY_INLINE Vec128<T> MaxOfLanes(hwy::SizeTag<8> /* tag */,
return Max(v10, v01);
}
template <size_t N, HWY_IF_GE32(uint16_t, N)>
HWY_API Vec128<uint16_t, N> SumOfLanes(hwy::SizeTag<2> /* tag */,
Vec128<uint16_t, N> v) {
const Simd<uint16_t, N, 0> d;
const RepartitionToWide<decltype(d)> d32;
const auto even = And(BitCast(d32, v), Set(d32, 0xFFFF));
const auto odd = ShiftRight<16>(BitCast(d32, v));
const auto sum = SumOfLanes(hwy::SizeTag<4>(), even + odd);
// Also broadcast into odd lanes.
return OddEven(BitCast(d, ShiftLeft<16>(sum)), BitCast(d, sum));
}
template <size_t N, HWY_IF_GE32(int16_t, N)>
HWY_API Vec128<int16_t, N> SumOfLanes(hwy::SizeTag<2> /* tag */,
Vec128<int16_t, N> v) {
const Simd<int16_t, N, 0> d;
const RepartitionToWide<decltype(d)> d32;
// Sign-extend
const auto even = ShiftRight<16>(ShiftLeft<16>(BitCast(d32, v)));
const auto odd = ShiftRight<16>(BitCast(d32, v));
const auto sum = SumOfLanes(hwy::SizeTag<4>(), even + odd);
// Also broadcast into odd lanes.
return OddEven(BitCast(d, ShiftLeft<16>(sum)), BitCast(d, sum));
}
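// Note (assumption, not from the upstream sources): lacking a native 16-bit
// horizontal add, the vector is reinterpreted as 32-bit lanes, the even and
// odd 16-bit halves are extracted (zero- or sign-extended), summed as 32-bit
// lanes via the existing SumOfLanes, and the total is then re-broadcast into
// both 16-bit halves of every lane. E.g. u16 lanes {1, 2, 3, 4} give
// even = {1, 3} and odd = {2, 4}; their sum {3, 7} reduces to 10, so every
// resulting u16 lane holds 10 (modulo 2^16, as usual for u16 arithmetic).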
template <size_t N, HWY_IF_GE32(uint16_t, N)>
HWY_API Vec128<uint16_t, N> MinOfLanes(hwy::SizeTag<2> /* tag */,
Vec128<uint16_t, N> v) {
@ -4422,7 +4498,7 @@ HWY_API Vec128<T, N> MaxOfLanes(Simd<T, N, 0> /* tag */, const Vec128<T, N> v) {
template <typename T, size_t N, HWY_IF_LE128(T, N)>
HWY_INLINE Mask128<T, N> Lt128(Simd<T, N, 0> d, Vec128<T, N> a,
Vec128<T, N> b) {
static_assert(!IsSigned<T>() && sizeof(T) == 8, "Use u64");
static_assert(!IsSigned<T>() && sizeof(T) == 8, "T must be u64");
// Truth table of Eq and Lt for Hi and Lo u64.
// (removed lines with (=H && cH) or (=L && cL) - cannot both be true)
// =H =L cH cL | out = cH | (=H & cL)
@ -4459,7 +4535,7 @@ HWY_INLINE Mask128<T, N> Lt128Upper(Simd<T, N, 0> d, Vec128<T, N> a,
template <typename T, size_t N, HWY_IF_LE128(T, N)>
HWY_INLINE Mask128<T, N> Eq128(Simd<T, N, 0> d, Vec128<T, N> a,
Vec128<T, N> b) {
static_assert(!IsSigned<T>() && sizeof(T) == 8, "Use u64");
static_assert(!IsSigned<T>() && sizeof(T) == 8, "T must be u64");
const Vec128<T, N> eqHL = VecFromMask(d, Eq(a, b));
return MaskFromVec(And(Reverse2(d, eqHL), eqHL));
}
@ -4471,6 +4547,23 @@ HWY_INLINE Mask128<T, N> Eq128Upper(Simd<T, N, 0> d, Vec128<T, N> a,
return MaskFromVec(InterleaveUpper(d, eqHL, eqHL));
}
// ------------------------------ Ne128
template <typename T, size_t N, HWY_IF_LE128(T, N)>
HWY_INLINE Mask128<T, N> Ne128(Simd<T, N, 0> d, Vec128<T, N> a,
Vec128<T, N> b) {
static_assert(!IsSigned<T>() && sizeof(T) == 8, "T must be u64");
const Vec128<T, N> neHL = VecFromMask(d, Ne(a, b));
return MaskFromVec(Or(Reverse2(d, neHL), neHL));
}
template <typename T, size_t N, HWY_IF_LE128(T, N)>
HWY_INLINE Mask128<T, N> Ne128Upper(Simd<T, N, 0> d, Vec128<T, N> a,
Vec128<T, N> b) {
const Vec128<T, N> neHL = VecFromMask(d, Ne(a, b));
return MaskFromVec(InterleaveUpper(d, neHL, neHL));
}
// ------------------------------ Min128, Max128 (Lt128)
// Without a native OddEven, it seems infeasible to go faster than Lt128.

File diff suppressed because it is too large

@ -21,7 +21,7 @@
#include "hwy/base.h"
// Avoid uninitialized warnings in GCC's emmintrin.h - see
// https://github.com/google/highway/issues/710 and pull/902)
// https://github.com/google/highway/issues/710 and pull/902
HWY_DIAGNOSTICS(push)
#if HWY_COMPILER_GCC_ACTUAL
HWY_DIAGNOSTICS_OFF(disable : 4701, ignored "-Wuninitialized")
@ -49,17 +49,6 @@ HWY_DIAGNOSTICS_OFF(disable : 4703 6001 26494, ignored "-Wmaybe-uninitialized")
HWY_BEFORE_NAMESPACE();
namespace hwy {
namespace HWY_NAMESPACE {
#if HWY_TARGET <= HWY_AVX2
template <typename T>
using Full256 = Simd<T, 32 / sizeof(T), 0>;
#endif
#if HWY_TARGET <= HWY_AVX3
template <typename T>
using Full512 = Simd<T, 64 / sizeof(T), 0>;
#endif
namespace detail {
template <typename T>
@ -82,6 +71,9 @@ class Vec128 {
using Raw = typename detail::Raw128<T>::type;
public:
using PrivateT = T; // only for DFromV
static constexpr size_t kPrivateN = N; // only for DFromV
// Compound assignment. Only usable if there is a corresponding non-member
// binary operator overload. For example, only f32 and f64 support division.
HWY_INLINE Vec128& operator*=(const Vec128 other) {
@ -117,10 +109,6 @@ using Vec32 = Vec128<T, 4 / sizeof(T)>;
#if HWY_TARGET <= HWY_AVX3
// Forward-declare for use by DeduceD, see below.
template <typename T>
class Vec512;
namespace detail {
// Template arg: sizeof(lane type)
@ -166,49 +154,11 @@ struct Mask128 {
#endif // HWY_TARGET <= HWY_AVX3
#if HWY_TARGET <= HWY_AVX2
// Forward-declare for use by DeduceD, see below.
template <typename T>
class Vec256;
#endif
namespace detail {
// Deduce Simd<T, N, 0> from Vec*<T, N> (pointers because Vec256/512 may be
// incomplete types at this point; this is simpler than avoiding multiple
// definitions of DFromV via #if)
struct DeduceD {
template <typename T, size_t N>
Simd<T, N, 0> operator()(const Vec128<T, N>*) const {
return Simd<T, N, 0>();
}
#if HWY_TARGET <= HWY_AVX2
template <typename T>
Full256<T> operator()(const hwy::HWY_NAMESPACE::Vec256<T>*) const {
return Full256<T>();
}
#endif
#if HWY_TARGET <= HWY_AVX3
template <typename T>
Full512<T> operator()(const hwy::HWY_NAMESPACE::Vec512<T>*) const {
return Full512<T>();
}
#endif
};
// Workaround for MSVC v19.14: alias with a dependent type fails to specialize.
template <class V>
struct ExpandDFromV {
using type = decltype(DeduceD()(static_cast<V*>(nullptr)));
};
} // namespace detail
template <class V>
using DFromV = Simd<typename V::PrivateT, V::kPrivateN, 0>;
template <class V>
using DFromV = typename detail::ExpandDFromV<V>::type;
template <class V>
using TFromV = TFromD<DFromV<V>>;
using TFromV = typename V::PrivateT;
// ------------------------------ BitCast
@ -983,6 +933,47 @@ HWY_INLINE Mask128<T, N> Xor(hwy::SizeTag<8> /*tag*/, const Mask128<T, N> a,
#endif
}
template <typename T, size_t N>
HWY_INLINE Mask128<T, N> ExclusiveNeither(hwy::SizeTag<1> /*tag*/,
const Mask128<T, N> a,
const Mask128<T, N> b) {
#if HWY_COMPILER_HAS_MASK_INTRINSICS
return Mask128<T, N>{_kxnor_mask16(a.raw, b.raw)};
#else
return Mask128<T, N>{static_cast<__mmask16>(~(a.raw ^ b.raw) & 0xFFFF)};
#endif
}
template <typename T, size_t N>
HWY_INLINE Mask128<T, N> ExclusiveNeither(hwy::SizeTag<2> /*tag*/,
const Mask128<T, N> a,
const Mask128<T, N> b) {
#if HWY_COMPILER_HAS_MASK_INTRINSICS
return Mask128<T, N>{_kxnor_mask8(a.raw, b.raw)};
#else
return Mask128<T, N>{static_cast<__mmask8>(~(a.raw ^ b.raw) & 0xFF)};
#endif
}
template <typename T, size_t N>
HWY_INLINE Mask128<T, N> ExclusiveNeither(hwy::SizeTag<4> /*tag*/,
const Mask128<T, N> a,
const Mask128<T, N> b) {
#if HWY_COMPILER_HAS_MASK_INTRINSICS
return Mask128<T, N>{static_cast<__mmask8>(_kxnor_mask8(a.raw, b.raw) & 0xF)};
#else
return Mask128<T, N>{static_cast<__mmask8>(~(a.raw ^ b.raw) & 0xF)};
#endif
}
template <typename T, size_t N>
HWY_INLINE Mask128<T, N> ExclusiveNeither(hwy::SizeTag<8> /*tag*/,
const Mask128<T, N> a,
const Mask128<T, N> b) {
#if HWY_COMPILER_HAS_MASK_INTRINSICS
return Mask128<T, N>{static_cast<__mmask8>(_kxnor_mask8(a.raw, b.raw) & 0x3)};
#else
return Mask128<T, N>{static_cast<__mmask8>(~(a.raw ^ b.raw) & 0x3)};
#endif
}
} // namespace detail
template <typename T, size_t N>
@ -1012,6 +1003,11 @@ HWY_API Mask128<T, N> Not(const Mask128<T, N> m) {
return Xor(m, Mask128<T, N>::FromBits((1ull << N) - 1));
}
template <typename T, size_t N>
HWY_API Mask128<T, N> ExclusiveNeither(const Mask128<T, N> a, Mask128<T, N> b) {
return detail::ExclusiveNeither(hwy::SizeTag<sizeof(T)>(), a, b);
}
#else // AVX2 or below
// ------------------------------ Mask
@ -1109,6 +1105,12 @@ HWY_API Mask128<T, N> Xor(const Mask128<T, N> a, Mask128<T, N> b) {
return MaskFromVec(Xor(VecFromMask(d, a), VecFromMask(d, b)));
}
template <typename T, size_t N>
HWY_API Mask128<T, N> ExclusiveNeither(const Mask128<T, N> a, Mask128<T, N> b) {
const Simd<T, N, 0> d;
return MaskFromVec(AndNot(VecFromMask(d, a), Not(VecFromMask(d, b))));
}
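// Note on ExclusiveNeither (assumption, added for clarity): callers are
// expected to guarantee that a and b are never both true in the same lane.
// Under that precondition the AVX-512 kxnor form ~(a ^ b) and this generic
// form AndNot(a, Not(b)) == ~a & ~b agree: the result is true exactly where
// neither input is true.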
#endif // HWY_TARGET <= HWY_AVX3
// ------------------------------ ShiftLeft
@ -5170,26 +5172,33 @@ HWY_INLINE Vec128<uint64_t> MulOdd(const Vec128<uint64_t> a,
// ------------------------------ ReorderWidenMulAccumulate (MulAdd, ZipLower)
template <size_t N>
HWY_API Vec128<float, N> ReorderWidenMulAccumulate(Simd<float, N, 0> df32,
Vec128<bfloat16_t, 2 * N> a,
Vec128<bfloat16_t, 2 * N> b,
const Vec128<float, N> sum0,
Vec128<float, N>& sum1) {
template <class V, size_t N, class D16 = Simd<bfloat16_t, 2 * N, 0>>
HWY_API V ReorderWidenMulAccumulate(Simd<float, N, 0> df32, VFromD<D16> a,
VFromD<D16> b, const V sum0, V& sum1) {
// TODO(janwas): _mm_dpbf16_ps when available
const Repartition<uint16_t, decltype(df32)> du16;
const RebindToUnsigned<decltype(df32)> du32;
const Vec128<uint16_t, 2 * N> zero = Zero(du16);
const auto zero = Zero(du16);
// Lane order within sum0/1 is undefined, hence we can avoid the
// longer-latency lane-crossing PromoteTo.
const Vec128<uint32_t, N> a0 = ZipLower(du32, zero, BitCast(du16, a));
const Vec128<uint32_t, N> a1 = ZipUpper(du32, zero, BitCast(du16, a));
const Vec128<uint32_t, N> b0 = ZipLower(du32, zero, BitCast(du16, b));
const Vec128<uint32_t, N> b1 = ZipUpper(du32, zero, BitCast(du16, b));
using VU32 = VFromD<RebindToUnsigned<decltype(df32)>>;
const VU32 a0 = ZipLower(du32, zero, BitCast(du16, a));
const VU32 a1 = ZipUpper(du32, zero, BitCast(du16, a));
const VU32 b0 = ZipLower(du32, zero, BitCast(du16, b));
const VU32 b1 = ZipUpper(du32, zero, BitCast(du16, b));
sum1 = MulAdd(BitCast(df32, a1), BitCast(df32, b1), sum1);
return MulAdd(BitCast(df32, a0), BitCast(df32, b0), sum0);
}
// Even if N=1, the input is always at least 2 lanes, hence madd_epi16 is safe.
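// (Note, assumption: _mm_madd_epi16 multiplies adjacent pairs of i16 lanes
// and adds each pair of i32 products, so the pairwise sums already fit in
// one accumulator; sum1 is therefore left untouched and callers that add
// sum0 + sum1 afterwards still obtain the correct total.)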
template <size_t N>
HWY_API Vec128<int32_t, N> ReorderWidenMulAccumulate(
Simd<int32_t, N, 0> /*d32*/, Vec128<int16_t, 2 * N> a,
Vec128<int16_t, 2 * N> b, const Vec128<int32_t, N> sum0,
Vec128<int32_t, N>& /*sum1*/) {
return sum0 + Vec128<int32_t, N>{_mm_madd_epi16(a.raw, b.raw)};
}
// ================================================== CONVERT
// ------------------------------ Promotions (part w/ narrow lanes -> full)
@ -5461,6 +5470,30 @@ HWY_API Vec128<bfloat16_t, 2 * N> ReorderDemote2To(
return BitCast(dbf16, OddEven(BitCast(du16, a), BitCast(du16, b_in_even)));
}
// Specializations for partial vectors because packs_epi32 sets lanes above 2*N.
HWY_API Vec128<int16_t, 2> ReorderDemote2To(Simd<int16_t, 2, 0> dn,
Vec128<int32_t, 1> a,
Vec128<int32_t, 1> b) {
const Half<decltype(dn)> dnh;
// Pretend the result has twice as many lanes so we can InterleaveLower.
const Vec128<int16_t, 2> an{DemoteTo(dnh, a).raw};
const Vec128<int16_t, 2> bn{DemoteTo(dnh, b).raw};
return InterleaveLower(an, bn);
}
HWY_API Vec128<int16_t, 4> ReorderDemote2To(Simd<int16_t, 4, 0> dn,
Vec128<int32_t, 2> a,
Vec128<int32_t, 2> b) {
const Half<decltype(dn)> dnh;
// Pretend the result has twice as many lanes so we can InterleaveLower.
const Vec128<int16_t, 4> an{DemoteTo(dnh, a).raw};
const Vec128<int16_t, 4> bn{DemoteTo(dnh, b).raw};
return InterleaveLower(an, bn);
}
HWY_API Vec128<int16_t> ReorderDemote2To(Full128<int16_t> /*d16*/,
Vec128<int32_t> a, Vec128<int32_t> b) {
return Vec128<int16_t>{_mm_packs_epi32(a.raw, b.raw)};
}
template <size_t N>
HWY_API Vec128<float, N> DemoteTo(Simd<float, N, 0> /* tag */,
const Vec128<double, N> v) {
@ -6035,6 +6068,13 @@ HWY_API size_t CountTrue(const Simd<T, N, 0> /* tag */,
return PopCount(mask_bits);
}
template <typename T, size_t N>
HWY_API size_t FindKnownFirstTrue(const Simd<T, N, 0> /* tag */,
const Mask128<T, N> mask) {
const uint32_t mask_bits = static_cast<uint32_t>(mask.raw) & ((1u << N) - 1);
return Num0BitsBelowLS1Bit_Nonzero32(mask_bits);
}
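// Usage note (assumption, not from the upstream sources): FindKnownFirstTrue
// requires at least one true lane and skips the emptiness check performed by
// FindFirstTrue, e.g.:
//
//   if (!AllFalse(d, m)) {
//     const size_t first = FindKnownFirstTrue(d, m);
//     // ... use `first` ...
//   }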
template <typename T, size_t N>
HWY_API intptr_t FindFirstTrue(const Simd<T, N, 0> /* tag */,
const Mask128<T, N> mask) {
@ -6500,6 +6540,13 @@ HWY_API size_t CountTrue(const Simd<T, N, 0> /* tag */,
return PopCount(detail::BitsFromMask(mask));
}
template <typename T, size_t N>
HWY_API size_t FindKnownFirstTrue(const Simd<T, N, 0> /* tag */,
const Mask128<T, N> mask) {
const uint64_t mask_bits = detail::BitsFromMask(mask);
return Num0BitsBelowLS1Bit_Nonzero64(mask_bits);
}
template <typename T, size_t N>
HWY_API intptr_t FindFirstTrue(const Simd<T, N, 0> /* tag */,
const Mask128<T, N> mask) {
@ -7161,6 +7208,30 @@ HWY_INLINE Vec128<T> MaxOfLanes(hwy::SizeTag<8> /* tag */,
return Max(v10, v01);
}
template <size_t N, HWY_IF_GE32(uint16_t, N)>
HWY_API Vec128<uint16_t, N> SumOfLanes(hwy::SizeTag<2> /* tag */,
Vec128<uint16_t, N> v) {
const Simd<uint16_t, N, 0> d;
const RepartitionToWide<decltype(d)> d32;
const auto even = And(BitCast(d32, v), Set(d32, 0xFFFF));
const auto odd = ShiftRight<16>(BitCast(d32, v));
const auto sum = SumOfLanes(hwy::SizeTag<4>(), even + odd);
// Also broadcast into odd lanes.
return OddEven(BitCast(d, ShiftLeft<16>(sum)), BitCast(d, sum));
}
template <size_t N, HWY_IF_GE32(int16_t, N)>
HWY_API Vec128<int16_t, N> SumOfLanes(hwy::SizeTag<2> /* tag */,
Vec128<int16_t, N> v) {
const Simd<int16_t, N, 0> d;
const RepartitionToWide<decltype(d)> d32;
// Sign-extend
const auto even = ShiftRight<16>(ShiftLeft<16>(BitCast(d32, v)));
const auto odd = ShiftRight<16>(BitCast(d32, v));
const auto sum = SumOfLanes(hwy::SizeTag<4>(), even + odd);
// Also broadcast into odd lanes.
return OddEven(BitCast(d, ShiftLeft<16>(sum)), BitCast(d, sum));
}
template <size_t N, HWY_IF_GE32(uint16_t, N)>
HWY_API Vec128<uint16_t, N> MinOfLanes(hwy::SizeTag<2> /* tag */,
Vec128<uint16_t, N> v) {
@ -7232,7 +7303,8 @@ namespace detail {
// Returns vector-mask for Lt128. Also used by x86_256/x86_512.
template <class D, class V = VFromD<D>>
HWY_INLINE V Lt128Vec(const D d, const V a, const V b) {
static_assert(!IsSigned<TFromD<D>>() && sizeof(TFromD<D>) == 8, "Use u64");
static_assert(!IsSigned<TFromD<D>>() && sizeof(TFromD<D>) == 8,
"D must be u64");
// Truth table of Eq and Lt for Hi and Lo u64.
// (removed lines with (=H && cH) or (=L && cL) - cannot both be true)
// =H =L cH cL | out = cH | (=H & cL)
@ -7256,12 +7328,22 @@ HWY_INLINE V Lt128Vec(const D d, const V a, const V b) {
// Returns vector-mask for Eq128. Also used by x86_256/x86_512.
template <class D, class V = VFromD<D>>
HWY_INLINE V Eq128Vec(const D d, const V a, const V b) {
static_assert(!IsSigned<TFromD<D>>() && sizeof(TFromD<D>) == 8, "Use u64");
static_assert(!IsSigned<TFromD<D>>() && sizeof(TFromD<D>) == 8,
"D must be u64");
const auto eqHL = VecFromMask(d, Eq(a, b));
const auto eqLH = Reverse2(d, eqHL);
return And(eqHL, eqLH);
}
template <class D, class V = VFromD<D>>
HWY_INLINE V Ne128Vec(const D d, const V a, const V b) {
static_assert(!IsSigned<TFromD<D>>() && sizeof(TFromD<D>) == 8,
"D must be u64");
const auto neHL = VecFromMask(d, Ne(a, b));
const auto neLH = Reverse2(d, neHL);
return Or(neHL, neLH);
}
template <class D, class V = VFromD<D>>
HWY_INLINE V Lt128UpperVec(const D d, const V a, const V b) {
// No specialization required for AVX-512: Mask <-> Vec is fast, and
@ -7278,6 +7360,14 @@ HWY_INLINE V Eq128UpperVec(const D d, const V a, const V b) {
return InterleaveUpper(d, eqHL, eqHL);
}
template <class D, class V = VFromD<D>>
HWY_INLINE V Ne128UpperVec(const D d, const V a, const V b) {
// No specialization required for AVX-512: Mask <-> Vec is fast, and
// copying mask bits to their neighbor seems infeasible.
const V neHL = VecFromMask(d, Ne(a, b));
return InterleaveUpper(d, neHL, neHL);
}
} // namespace detail
template <class D, class V = VFromD<D>>
@ -7290,6 +7380,11 @@ HWY_API MFromD<D> Eq128(D d, const V a, const V b) {
return MaskFromVec(detail::Eq128Vec(d, a, b));
}
template <class D, class V = VFromD<D>>
HWY_API MFromD<D> Ne128(D d, const V a, const V b) {
return MaskFromVec(detail::Ne128Vec(d, a, b));
}
template <class D, class V = VFromD<D>>
HWY_API MFromD<D> Lt128Upper(D d, const V a, const V b) {
return MaskFromVec(detail::Lt128UpperVec(d, a, b));
@ -7300,6 +7395,11 @@ HWY_API MFromD<D> Eq128Upper(D d, const V a, const V b) {
return MaskFromVec(detail::Eq128UpperVec(d, a, b));
}
template <class D, class V = VFromD<D>>
HWY_API MFromD<D> Ne128Upper(D d, const V a, const V b) {
return MaskFromVec(detail::Ne128UpperVec(d, a, b));
}
// ------------------------------ Min128, Max128 (Lt128)
// Avoids the extra MaskFromVec in Lt128.

@ -83,6 +83,9 @@ class Vec256 {
using Raw = typename detail::Raw256<T>::type;
public:
using PrivateT = T; // only for DFromV
static constexpr size_t kPrivateN = 32 / sizeof(T); // only for DFromV
// Compound assignment. Only usable if there is a corresponding non-member
// binary operator overload. For example, only f32 and f64 support division.
HWY_INLINE Vec256& operator*=(const Vec256 other) {
@ -157,6 +160,9 @@ struct Mask256 {
#endif // HWY_TARGET <= HWY_AVX3
template <typename T>
using Full256 = Simd<T, 32 / sizeof(T), 0>;
// ------------------------------ BitCast
namespace detail {
@ -764,6 +770,43 @@ HWY_INLINE Mask256<T> Xor(hwy::SizeTag<8> /*tag*/, const Mask256<T> a,
#endif
}
template <typename T>
HWY_INLINE Mask256<T> ExclusiveNeither(hwy::SizeTag<1> /*tag*/,
const Mask256<T> a, const Mask256<T> b) {
#if HWY_COMPILER_HAS_MASK_INTRINSICS
return Mask256<T>{_kxnor_mask32(a.raw, b.raw)};
#else
return Mask256<T>{static_cast<__mmask32>(~(a.raw ^ b.raw) & 0xFFFFFFFF)};
#endif
}
template <typename T>
HWY_INLINE Mask256<T> ExclusiveNeither(hwy::SizeTag<2> /*tag*/,
const Mask256<T> a, const Mask256<T> b) {
#if HWY_COMPILER_HAS_MASK_INTRINSICS
return Mask256<T>{_kxnor_mask16(a.raw, b.raw)};
#else
return Mask256<T>{static_cast<__mmask16>(~(a.raw ^ b.raw) & 0xFFFF)};
#endif
}
template <typename T>
HWY_INLINE Mask256<T> ExclusiveNeither(hwy::SizeTag<4> /*tag*/,
const Mask256<T> a, const Mask256<T> b) {
#if HWY_COMPILER_HAS_MASK_INTRINSICS
return Mask256<T>{_kxnor_mask8(a.raw, b.raw)};
#else
return Mask256<T>{static_cast<__mmask8>(~(a.raw ^ b.raw) & 0xFF)};
#endif
}
template <typename T>
HWY_INLINE Mask256<T> ExclusiveNeither(hwy::SizeTag<8> /*tag*/,
const Mask256<T> a, const Mask256<T> b) {
#if HWY_COMPILER_HAS_MASK_INTRINSICS
return Mask256<T>{static_cast<__mmask8>(_kxnor_mask8(a.raw, b.raw) & 0xF)};
#else
return Mask256<T>{static_cast<__mmask8>(~(a.raw ^ b.raw) & 0xF)};
#endif
}
} // namespace detail
template <typename T>
@ -793,6 +836,11 @@ HWY_API Mask256<T> Not(const Mask256<T> m) {
return Xor(m, Mask256<T>::FromBits((1ull << N) - 1));
}
template <typename T>
HWY_API Mask256<T> ExclusiveNeither(const Mask256<T> a, Mask256<T> b) {
return detail::ExclusiveNeither(hwy::SizeTag<sizeof(T)>(), a, b);
}
#else // AVX2
// ------------------------------ Mask
@ -883,6 +931,12 @@ HWY_API Mask256<T> Xor(const Mask256<T> a, Mask256<T> b) {
return MaskFromVec(Xor(VecFromMask(d, a), VecFromMask(d, b)));
}
template <typename T>
HWY_API Mask256<T> ExclusiveNeither(const Mask256<T> a, Mask256<T> b) {
const Full256<T> d;
return MaskFromVec(AndNot(VecFromMask(d, a), Not(VecFromMask(d, b))));
}
#endif // HWY_TARGET <= HWY_AVX3
// ================================================== COMPARE
@ -2866,6 +2920,7 @@ HWY_API Vec256<float> Shuffle2301(const Vec256<float> v) {
return Vec256<float>{_mm256_shuffle_ps(v.raw, v.raw, 0xB1)};
}
// Used by generic_ops-inl.h
namespace detail {
template <typename T, HWY_IF_LANE_SIZE(T, 4)>
@ -3694,7 +3749,7 @@ HWY_API Vec256<TI> TableLookupBytes(const Vec128<T, N> bytes,
namespace detail {
#if HWY_TARGET > HWY_AVX3 // AVX2 or older
#if HWY_TARGET > HWY_AVX3 && !HWY_IDE // AVX2 or older
// Returns 2^v for use as per-lane multipliers to emulate 16-bit shifts.
template <typename T>
@ -3721,7 +3776,7 @@ HWY_INLINE Vec256<MakeUnsigned<T>> Pow2(const Vec256<T> v) {
HWY_INLINE Vec256<uint16_t> Shl(hwy::UnsignedTag /*tag*/, Vec256<uint16_t> v,
Vec256<uint16_t> bits) {
#if HWY_TARGET <= HWY_AVX3
#if HWY_TARGET <= HWY_AVX3 || HWY_IDE
return Vec256<uint16_t>{_mm256_sllv_epi16(v.raw, bits.raw)};
#else
return v * Pow2(bits);
@ -3757,7 +3812,7 @@ HWY_API Vec256<T> operator<<(Vec256<T> v, Vec256<T> bits) {
// ------------------------------ Shr (MulHigh, IfThenElse, Not)
HWY_API Vec256<uint16_t> operator>>(Vec256<uint16_t> v, Vec256<uint16_t> bits) {
#if HWY_TARGET <= HWY_AVX3
#if HWY_TARGET <= HWY_AVX3 || HWY_IDE
return Vec256<uint16_t>{_mm256_srlv_epi16(v.raw, bits.raw)};
#else
Full256<uint16_t> d;
@ -3798,7 +3853,7 @@ HWY_API Vec256<int64_t> operator>>(Vec256<int64_t> v, Vec256<int64_t> bits) {
HWY_INLINE Vec256<uint64_t> MulEven(const Vec256<uint64_t> a,
const Vec256<uint64_t> b) {
const DFromV<decltype(a)> du64;
const Full256<uint64_t> du64;
const RepartitionToNarrow<decltype(du64)> du32;
const auto maskL = Set(du64, 0xFFFFFFFFULL);
const auto a32 = BitCast(du32, a);
@ -3827,7 +3882,7 @@ HWY_INLINE Vec256<uint64_t> MulEven(const Vec256<uint64_t> a,
HWY_INLINE Vec256<uint64_t> MulOdd(const Vec256<uint64_t> a,
const Vec256<uint64_t> b) {
const DFromV<decltype(a)> du64;
const Full256<uint64_t> du64;
const RepartitionToNarrow<decltype(du64)> du32;
const auto maskL = Set(du64, 0xFFFFFFFFULL);
const auto a32 = BitCast(du32, a);
@ -3852,25 +3907,13 @@ HWY_INLINE Vec256<uint64_t> MulOdd(const Vec256<uint64_t> a,
return InterleaveUpper(du64, mulL, mulH);
}
// ------------------------------ ReorderWidenMulAccumulate (MulAdd, ZipLower)
HWY_API Vec256<float> ReorderWidenMulAccumulate(Full256<float> df32,
Vec256<bfloat16_t> a,
Vec256<bfloat16_t> b,
const Vec256<float> sum0,
Vec256<float>& sum1) {
// TODO(janwas): _mm256_dpbf16_ps when available
const Repartition<uint16_t, decltype(df32)> du16;
const RebindToUnsigned<decltype(df32)> du32;
const Vec256<uint16_t> zero = Zero(du16);
// Lane order within sum0/1 is undefined, hence we can avoid the
// longer-latency lane-crossing PromoteTo.
const Vec256<uint32_t> a0 = ZipLower(du32, zero, BitCast(du16, a));
const Vec256<uint32_t> a1 = ZipUpper(du32, zero, BitCast(du16, a));
const Vec256<uint32_t> b0 = ZipLower(du32, zero, BitCast(du16, b));
const Vec256<uint32_t> b1 = ZipUpper(du32, zero, BitCast(du16, b));
sum1 = MulAdd(BitCast(df32, a1), BitCast(df32, b1), sum1);
return MulAdd(BitCast(df32, a0), BitCast(df32, b0), sum0);
// ------------------------------ ReorderWidenMulAccumulate
HWY_API Vec256<int32_t> ReorderWidenMulAccumulate(Full256<int32_t> /*d32*/,
Vec256<int16_t> a,
Vec256<int16_t> b,
const Vec256<int32_t> sum0,
Vec256<int32_t>& /*sum1*/) {
return sum0 + Vec256<int32_t>{_mm256_madd_epi16(a.raw, b.raw)};
}
// ================================================== CONVERT
@ -4053,6 +4096,11 @@ HWY_API Vec256<bfloat16_t> ReorderDemote2To(Full256<bfloat16_t> dbf16,
return BitCast(dbf16, OddEven(BitCast(du16, a), BitCast(du16, b_in_even)));
}
HWY_API Vec256<int16_t> ReorderDemote2To(Full256<int16_t> /*d16*/,
Vec256<int32_t> a, Vec256<int32_t> b) {
return Vec256<int16_t>{_mm256_packs_epi32(a.raw, b.raw)};
}
HWY_API Vec128<float> DemoteTo(Full128<float> /* tag */,
const Vec256<double> v) {
return Vec128<float>{_mm256_cvtpd_ps(v.raw)};
@ -4218,7 +4266,7 @@ HWY_API Vec256<float> ConvertTo(HWY_MAYBE_UNUSED Full256<float> df,
const RebindToSigned<decltype(df)> d32;
const auto msk_lo = Set(du32, 0xFFFF);
const auto cnst2_16_flt = Set(df, 65536.0f); // 2^16
const auto cnst2_16_flt = Set(df, 65536.0f); // 2^16
// Extract the 16 lowest/highest significant bits of v and cast to signed int
const auto v_lo = BitCast(d32, And(v, msk_lo));
@ -4238,9 +4286,9 @@ HWY_API Vec256<double> ConvertTo(HWY_MAYBE_UNUSED Full256<double> dd,
using VU = VFromD<decltype(d64)>;
const VU msk_lo = Set(d64, 0xFFFFFFFFULL);
const auto cnst2_32_dbl = Set(dd, 4294967296.0); // 2^32
const auto cnst2_32_dbl = Set(dd, 4294967296.0); // 2^32
// Extract the 32 lowest significant bits of v
// Extract the 32 lowest significant bits of v
const VU v_lo = And(v, msk_lo);
const VU v_hi = ShiftRight<32>(v);
@ -4458,9 +4506,15 @@ HWY_API size_t CountTrue(const Full256<T> /* tag */, const Mask256<T> mask) {
}
template <typename T>
HWY_API intptr_t FindFirstTrue(const Full256<T> /* tag */,
const Mask256<T> mask) {
return mask.raw ? intptr_t(Num0BitsBelowLS1Bit_Nonzero32(mask.raw)) : -1;
HWY_API size_t FindKnownFirstTrue(const Full256<T> /* tag */,
const Mask256<T> mask) {
return Num0BitsBelowLS1Bit_Nonzero32(mask.raw);
}
template <typename T>
HWY_API intptr_t FindFirstTrue(const Full256<T> d, const Mask256<T> mask) {
return mask.raw ? static_cast<intptr_t>(FindKnownFirstTrue(d, mask))
: intptr_t{-1};
}
// Beware: the suffix indicates the number of mask bits, not lane size!
@ -4903,6 +4957,13 @@ HWY_API size_t CountTrue(const Full256<T> /* tag */, const Mask256<T> mask) {
return PopCount(detail::BitsFromMask(mask));
}
template <typename T>
HWY_API size_t FindKnownFirstTrue(const Full256<T> /* tag */,
const Mask256<T> mask) {
const uint64_t mask_bits = detail::BitsFromMask(mask);
return Num0BitsBelowLS1Bit_Nonzero64(mask_bits);
}
template <typename T>
HWY_API intptr_t FindFirstTrue(const Full256<T> /* tag */,
const Mask256<T> mask) {
@ -4915,8 +4976,7 @@ HWY_API intptr_t FindFirstTrue(const Full256<T> /* tag */,
namespace detail {
template <typename T, HWY_IF_LANE_SIZE(T, 4)>
HWY_INLINE Indices256<uint32_t> IndicesFromBits(Full256<T> d,
uint64_t mask_bits) {
HWY_INLINE Vec256<uint32_t> IndicesFromBits(Full256<T> d, uint64_t mask_bits) {
const RebindToUnsigned<decltype(d)> d32;
// We need a masked Iota(). With 8 lanes, there are 256 combinations and a LUT
// of SetTableIndices would require 8 KiB, a large part of L1D. The other
@ -4925,49 +4985,49 @@ HWY_INLINE Indices256<uint32_t> IndicesFromBits(Full256<T> d,
// bits, for a total of 1 KiB.
alignas(16) constexpr uint32_t packed_array[256] = {
// PrintCompress32x8Tables
0x76543210, 0x76543210, 0x76543201, 0x76543210, 0x76543102, 0x76543120,
0x76543021, 0x76543210, 0x76542103, 0x76542130, 0x76542031, 0x76542310,
0x76541032, 0x76541320, 0x76540321, 0x76543210, 0x76532104, 0x76532140,
0x76532041, 0x76532410, 0x76531042, 0x76531420, 0x76530421, 0x76534210,
0x76521043, 0x76521430, 0x76520431, 0x76524310, 0x76510432, 0x76514320,
0x76504321, 0x76543210, 0x76432105, 0x76432150, 0x76432051, 0x76432510,
0x76431052, 0x76431520, 0x76430521, 0x76435210, 0x76421053, 0x76421530,
0x76420531, 0x76425310, 0x76410532, 0x76415320, 0x76405321, 0x76453210,
0x76321054, 0x76321540, 0x76320541, 0x76325410, 0x76310542, 0x76315420,
0x76305421, 0x76354210, 0x76210543, 0x76215430, 0x76205431, 0x76254310,
0x76105432, 0x76154320, 0x76054321, 0x76543210, 0x75432106, 0x75432160,
0x75432061, 0x75432610, 0x75431062, 0x75431620, 0x75430621, 0x75436210,
0x75421063, 0x75421630, 0x75420631, 0x75426310, 0x75410632, 0x75416320,
0x75406321, 0x75463210, 0x75321064, 0x75321640, 0x75320641, 0x75326410,
0x75310642, 0x75316420, 0x75306421, 0x75364210, 0x75210643, 0x75216430,
0x75206431, 0x75264310, 0x75106432, 0x75164320, 0x75064321, 0x75643210,
0x74321065, 0x74321650, 0x74320651, 0x74326510, 0x74310652, 0x74316520,
0x74306521, 0x74365210, 0x74210653, 0x74216530, 0x74206531, 0x74265310,
0x74106532, 0x74165320, 0x74065321, 0x74653210, 0x73210654, 0x73216540,
0x73206541, 0x73265410, 0x73106542, 0x73165420, 0x73065421, 0x73654210,
0x72106543, 0x72165430, 0x72065431, 0x72654310, 0x71065432, 0x71654320,
0x70654321, 0x76543210, 0x65432107, 0x65432170, 0x65432071, 0x65432710,
0x65431072, 0x65431720, 0x65430721, 0x65437210, 0x65421073, 0x65421730,
0x65420731, 0x65427310, 0x65410732, 0x65417320, 0x65407321, 0x65473210,
0x65321074, 0x65321740, 0x65320741, 0x65327410, 0x65310742, 0x65317420,
0x65307421, 0x65374210, 0x65210743, 0x65217430, 0x65207431, 0x65274310,
0x65107432, 0x65174320, 0x65074321, 0x65743210, 0x64321075, 0x64321750,
0x64320751, 0x64327510, 0x64310752, 0x64317520, 0x64307521, 0x64375210,
0x64210753, 0x64217530, 0x64207531, 0x64275310, 0x64107532, 0x64175320,
0x64075321, 0x64753210, 0x63210754, 0x63217540, 0x63207541, 0x63275410,
0x63107542, 0x63175420, 0x63075421, 0x63754210, 0x62107543, 0x62175430,
0x62075431, 0x62754310, 0x61075432, 0x61754320, 0x60754321, 0x67543210,
0x54321076, 0x54321760, 0x54320761, 0x54327610, 0x54310762, 0x54317620,
0x54307621, 0x54376210, 0x54210763, 0x54217630, 0x54207631, 0x54276310,
0x54107632, 0x54176320, 0x54076321, 0x54763210, 0x53210764, 0x53217640,
0x53207641, 0x53276410, 0x53107642, 0x53176420, 0x53076421, 0x53764210,
0x52107643, 0x52176430, 0x52076431, 0x52764310, 0x51076432, 0x51764320,
0x50764321, 0x57643210, 0x43210765, 0x43217650, 0x43207651, 0x43276510,
0x43107652, 0x43176520, 0x43076521, 0x43765210, 0x42107653, 0x42176530,
0x42076531, 0x42765310, 0x41076532, 0x41765320, 0x40765321, 0x47653210,
0x32107654, 0x32176540, 0x32076541, 0x32765410, 0x31076542, 0x31765420,
0x30765421, 0x37654210, 0x21076543, 0x21765430, 0x20765431, 0x27654310,
0x10765432, 0x17654320, 0x07654321, 0x76543210};
0x76543210, 0x76543218, 0x76543209, 0x76543298, 0x7654310a, 0x765431a8,
0x765430a9, 0x76543a98, 0x7654210b, 0x765421b8, 0x765420b9, 0x76542b98,
0x765410ba, 0x76541ba8, 0x76540ba9, 0x7654ba98, 0x7653210c, 0x765321c8,
0x765320c9, 0x76532c98, 0x765310ca, 0x76531ca8, 0x76530ca9, 0x7653ca98,
0x765210cb, 0x76521cb8, 0x76520cb9, 0x7652cb98, 0x76510cba, 0x7651cba8,
0x7650cba9, 0x765cba98, 0x7643210d, 0x764321d8, 0x764320d9, 0x76432d98,
0x764310da, 0x76431da8, 0x76430da9, 0x7643da98, 0x764210db, 0x76421db8,
0x76420db9, 0x7642db98, 0x76410dba, 0x7641dba8, 0x7640dba9, 0x764dba98,
0x763210dc, 0x76321dc8, 0x76320dc9, 0x7632dc98, 0x76310dca, 0x7631dca8,
0x7630dca9, 0x763dca98, 0x76210dcb, 0x7621dcb8, 0x7620dcb9, 0x762dcb98,
0x7610dcba, 0x761dcba8, 0x760dcba9, 0x76dcba98, 0x7543210e, 0x754321e8,
0x754320e9, 0x75432e98, 0x754310ea, 0x75431ea8, 0x75430ea9, 0x7543ea98,
0x754210eb, 0x75421eb8, 0x75420eb9, 0x7542eb98, 0x75410eba, 0x7541eba8,
0x7540eba9, 0x754eba98, 0x753210ec, 0x75321ec8, 0x75320ec9, 0x7532ec98,
0x75310eca, 0x7531eca8, 0x7530eca9, 0x753eca98, 0x75210ecb, 0x7521ecb8,
0x7520ecb9, 0x752ecb98, 0x7510ecba, 0x751ecba8, 0x750ecba9, 0x75ecba98,
0x743210ed, 0x74321ed8, 0x74320ed9, 0x7432ed98, 0x74310eda, 0x7431eda8,
0x7430eda9, 0x743eda98, 0x74210edb, 0x7421edb8, 0x7420edb9, 0x742edb98,
0x7410edba, 0x741edba8, 0x740edba9, 0x74edba98, 0x73210edc, 0x7321edc8,
0x7320edc9, 0x732edc98, 0x7310edca, 0x731edca8, 0x730edca9, 0x73edca98,
0x7210edcb, 0x721edcb8, 0x720edcb9, 0x72edcb98, 0x710edcba, 0x71edcba8,
0x70edcba9, 0x7edcba98, 0x6543210f, 0x654321f8, 0x654320f9, 0x65432f98,
0x654310fa, 0x65431fa8, 0x65430fa9, 0x6543fa98, 0x654210fb, 0x65421fb8,
0x65420fb9, 0x6542fb98, 0x65410fba, 0x6541fba8, 0x6540fba9, 0x654fba98,
0x653210fc, 0x65321fc8, 0x65320fc9, 0x6532fc98, 0x65310fca, 0x6531fca8,
0x6530fca9, 0x653fca98, 0x65210fcb, 0x6521fcb8, 0x6520fcb9, 0x652fcb98,
0x6510fcba, 0x651fcba8, 0x650fcba9, 0x65fcba98, 0x643210fd, 0x64321fd8,
0x64320fd9, 0x6432fd98, 0x64310fda, 0x6431fda8, 0x6430fda9, 0x643fda98,
0x64210fdb, 0x6421fdb8, 0x6420fdb9, 0x642fdb98, 0x6410fdba, 0x641fdba8,
0x640fdba9, 0x64fdba98, 0x63210fdc, 0x6321fdc8, 0x6320fdc9, 0x632fdc98,
0x6310fdca, 0x631fdca8, 0x630fdca9, 0x63fdca98, 0x6210fdcb, 0x621fdcb8,
0x620fdcb9, 0x62fdcb98, 0x610fdcba, 0x61fdcba8, 0x60fdcba9, 0x6fdcba98,
0x543210fe, 0x54321fe8, 0x54320fe9, 0x5432fe98, 0x54310fea, 0x5431fea8,
0x5430fea9, 0x543fea98, 0x54210feb, 0x5421feb8, 0x5420feb9, 0x542feb98,
0x5410feba, 0x541feba8, 0x540feba9, 0x54feba98, 0x53210fec, 0x5321fec8,
0x5320fec9, 0x532fec98, 0x5310feca, 0x531feca8, 0x530feca9, 0x53feca98,
0x5210fecb, 0x521fecb8, 0x520fecb9, 0x52fecb98, 0x510fecba, 0x51fecba8,
0x50fecba9, 0x5fecba98, 0x43210fed, 0x4321fed8, 0x4320fed9, 0x432fed98,
0x4310feda, 0x431feda8, 0x430feda9, 0x43feda98, 0x4210fedb, 0x421fedb8,
0x420fedb9, 0x42fedb98, 0x410fedba, 0x41fedba8, 0x40fedba9, 0x4fedba98,
0x3210fedc, 0x321fedc8, 0x320fedc9, 0x32fedc98, 0x310fedca, 0x31fedca8,
0x30fedca9, 0x3fedca98, 0x210fedcb, 0x21fedcb8, 0x20fedcb9, 0x2fedcb98,
0x10fedcba, 0x1fedcba8, 0x0fedcba9, 0xfedcba98};
// No need to mask because _mm256_permutevar8x32_epi32 ignores bits 3..31.
// Just shift each copy of the 32 bit LUT to extract its 4-bit fields.
@ -4975,12 +5035,11 @@ HWY_INLINE Indices256<uint32_t> IndicesFromBits(Full256<T> d,
// latency, it may be faster to use LoadDup128 and PSHUFB.
const auto packed = Set(d32, packed_array[mask_bits]);
alignas(32) constexpr uint32_t shifts[8] = {0, 4, 8, 12, 16, 20, 24, 28};
return Indices256<uint32_t>{(packed >> Load(d32, shifts)).raw};
return packed >> Load(d32, shifts);
}
template <typename T, HWY_IF_LANE_SIZE(T, 8)>
HWY_INLINE Indices256<uint32_t> IndicesFromBits(Full256<T> d,
uint64_t mask_bits) {
HWY_INLINE Vec256<uint32_t> IndicesFromBits(Full256<T> d, uint64_t mask_bits) {
const Repartition<uint32_t, decltype(d)> d32;
// For 64-bit, we still need 32-bit indices because there is no 64-bit
@ -4988,18 +5047,20 @@ HWY_INLINE Indices256<uint32_t> IndicesFromBits(Full256<T> d,
// unpacking and load the entire index vector directly.
alignas(32) constexpr uint32_t u32_indices[128] = {
// PrintCompress64x4PairTables
0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7, 2, 3, 0, 1, 4, 5,
6, 7, 0, 1, 2, 3, 4, 5, 6, 7, 4, 5, 0, 1, 2, 3, 6, 7, 0, 1, 4, 5,
2, 3, 6, 7, 2, 3, 4, 5, 0, 1, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7, 6, 7,
0, 1, 2, 3, 4, 5, 0, 1, 6, 7, 2, 3, 4, 5, 2, 3, 6, 7, 0, 1, 4, 5,
0, 1, 2, 3, 6, 7, 4, 5, 4, 5, 6, 7, 0, 1, 2, 3, 0, 1, 4, 5, 6, 7,
2, 3, 2, 3, 4, 5, 6, 7, 0, 1, 0, 1, 2, 3, 4, 5, 6, 7};
return Indices256<uint32_t>{Load(d32, u32_indices + 8 * mask_bits).raw};
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 2, 3, 4, 5, 6, 7,
10, 11, 0, 1, 4, 5, 6, 7, 8, 9, 10, 11, 4, 5, 6, 7,
12, 13, 0, 1, 2, 3, 6, 7, 8, 9, 12, 13, 2, 3, 6, 7,
10, 11, 12, 13, 0, 1, 6, 7, 8, 9, 10, 11, 12, 13, 6, 7,
14, 15, 0, 1, 2, 3, 4, 5, 8, 9, 14, 15, 2, 3, 4, 5,
10, 11, 14, 15, 0, 1, 4, 5, 8, 9, 10, 11, 14, 15, 4, 5,
12, 13, 14, 15, 0, 1, 2, 3, 8, 9, 12, 13, 14, 15, 2, 3,
10, 11, 12, 13, 14, 15, 0, 1, 8, 9, 10, 11, 12, 13, 14, 15};
return Load(d32, u32_indices + 8 * mask_bits);
}
template <typename T, HWY_IF_LANE_SIZE(T, 4)>
HWY_INLINE Indices256<uint32_t> IndicesFromNotBits(Full256<T> d,
uint64_t mask_bits) {
HWY_INLINE Vec256<uint32_t> IndicesFromNotBits(Full256<T> d,
uint64_t mask_bits) {
const RebindToUnsigned<decltype(d)> d32;
// We need a masked Iota(). With 8 lanes, there are 256 combinations and a LUT
// of SetTableIndices would require 8 KiB, a large part of L1D. The other
@ -5008,49 +5069,49 @@ HWY_INLINE Indices256<uint32_t> IndicesFromNotBits(Full256<T> d,
// bits, for a total of 1 KiB.
alignas(16) constexpr uint32_t packed_array[256] = {
// PrintCompressNot32x8Tables
0x76543210, 0x07654321, 0x17654320, 0x10765432, 0x27654310, 0x20765431,
0x21765430, 0x21076543, 0x37654210, 0x30765421, 0x31765420, 0x31076542,
0x32765410, 0x32076541, 0x32176540, 0x32107654, 0x47653210, 0x40765321,
0x41765320, 0x41076532, 0x42765310, 0x42076531, 0x42176530, 0x42107653,
0x43765210, 0x43076521, 0x43176520, 0x43107652, 0x43276510, 0x43207651,
0x43217650, 0x43210765, 0x57643210, 0x50764321, 0x51764320, 0x51076432,
0x52764310, 0x52076431, 0x52176430, 0x52107643, 0x53764210, 0x53076421,
0x53176420, 0x53107642, 0x53276410, 0x53207641, 0x53217640, 0x53210764,
0x54763210, 0x54076321, 0x54176320, 0x54107632, 0x54276310, 0x54207631,
0x54217630, 0x54210763, 0x54376210, 0x54307621, 0x54317620, 0x54310762,
0x54327610, 0x54320761, 0x54321760, 0x54321076, 0x67543210, 0x60754321,
0x61754320, 0x61075432, 0x62754310, 0x62075431, 0x62175430, 0x62107543,
0x63754210, 0x63075421, 0x63175420, 0x63107542, 0x63275410, 0x63207541,
0x63217540, 0x63210754, 0x64753210, 0x64075321, 0x64175320, 0x64107532,
0x64275310, 0x64207531, 0x64217530, 0x64210753, 0x64375210, 0x64307521,
0x64317520, 0x64310752, 0x64327510, 0x64320751, 0x64321750, 0x64321075,
0x65743210, 0x65074321, 0x65174320, 0x65107432, 0x65274310, 0x65207431,
0x65217430, 0x65210743, 0x65374210, 0x65307421, 0x65317420, 0x65310742,
0x65327410, 0x65320741, 0x65321740, 0x65321074, 0x65473210, 0x65407321,
0x65417320, 0x65410732, 0x65427310, 0x65420731, 0x65421730, 0x65421073,
0x65437210, 0x65430721, 0x65431720, 0x65431072, 0x65432710, 0x65432071,
0x65432170, 0x65432107, 0x76543210, 0x70654321, 0x71654320, 0x71065432,
0x72654310, 0x72065431, 0x72165430, 0x72106543, 0x73654210, 0x73065421,
0x73165420, 0x73106542, 0x73265410, 0x73206541, 0x73216540, 0x73210654,
0x74653210, 0x74065321, 0x74165320, 0x74106532, 0x74265310, 0x74206531,
0x74216530, 0x74210653, 0x74365210, 0x74306521, 0x74316520, 0x74310652,
0x74326510, 0x74320651, 0x74321650, 0x74321065, 0x75643210, 0x75064321,
0x75164320, 0x75106432, 0x75264310, 0x75206431, 0x75216430, 0x75210643,
0x75364210, 0x75306421, 0x75316420, 0x75310642, 0x75326410, 0x75320641,
0x75321640, 0x75321064, 0x75463210, 0x75406321, 0x75416320, 0x75410632,
0x75426310, 0x75420631, 0x75421630, 0x75421063, 0x75436210, 0x75430621,
0x75431620, 0x75431062, 0x75432610, 0x75432061, 0x75432160, 0x75432106,
0x76543210, 0x76054321, 0x76154320, 0x76105432, 0x76254310, 0x76205431,
0x76215430, 0x76210543, 0x76354210, 0x76305421, 0x76315420, 0x76310542,
0x76325410, 0x76320541, 0x76321540, 0x76321054, 0x76453210, 0x76405321,
0x76415320, 0x76410532, 0x76425310, 0x76420531, 0x76421530, 0x76421053,
0x76435210, 0x76430521, 0x76431520, 0x76431052, 0x76432510, 0x76432051,
0x76432150, 0x76432105, 0x76543210, 0x76504321, 0x76514320, 0x76510432,
0x76524310, 0x76520431, 0x76521430, 0x76521043, 0x76534210, 0x76530421,
0x76531420, 0x76531042, 0x76532410, 0x76532041, 0x76532140, 0x76532104,
0x76543210, 0x76540321, 0x76541320, 0x76541032, 0x76542310, 0x76542031,
0x76542130, 0x76542103, 0x76543210, 0x76543021, 0x76543120, 0x76543102,
0x76543210, 0x76543201, 0x76543210, 0x76543210};
0xfedcba98, 0x8fedcba9, 0x9fedcba8, 0x98fedcba, 0xafedcb98, 0xa8fedcb9,
0xa9fedcb8, 0xa98fedcb, 0xbfedca98, 0xb8fedca9, 0xb9fedca8, 0xb98fedca,
0xbafedc98, 0xba8fedc9, 0xba9fedc8, 0xba98fedc, 0xcfedba98, 0xc8fedba9,
0xc9fedba8, 0xc98fedba, 0xcafedb98, 0xca8fedb9, 0xca9fedb8, 0xca98fedb,
0xcbfeda98, 0xcb8feda9, 0xcb9feda8, 0xcb98feda, 0xcbafed98, 0xcba8fed9,
0xcba9fed8, 0xcba98fed, 0xdfecba98, 0xd8fecba9, 0xd9fecba8, 0xd98fecba,
0xdafecb98, 0xda8fecb9, 0xda9fecb8, 0xda98fecb, 0xdbfeca98, 0xdb8feca9,
0xdb9feca8, 0xdb98feca, 0xdbafec98, 0xdba8fec9, 0xdba9fec8, 0xdba98fec,
0xdcfeba98, 0xdc8feba9, 0xdc9feba8, 0xdc98feba, 0xdcafeb98, 0xdca8feb9,
0xdca9feb8, 0xdca98feb, 0xdcbfea98, 0xdcb8fea9, 0xdcb9fea8, 0xdcb98fea,
0xdcbafe98, 0xdcba8fe9, 0xdcba9fe8, 0xdcba98fe, 0xefdcba98, 0xe8fdcba9,
0xe9fdcba8, 0xe98fdcba, 0xeafdcb98, 0xea8fdcb9, 0xea9fdcb8, 0xea98fdcb,
0xebfdca98, 0xeb8fdca9, 0xeb9fdca8, 0xeb98fdca, 0xebafdc98, 0xeba8fdc9,
0xeba9fdc8, 0xeba98fdc, 0xecfdba98, 0xec8fdba9, 0xec9fdba8, 0xec98fdba,
0xecafdb98, 0xeca8fdb9, 0xeca9fdb8, 0xeca98fdb, 0xecbfda98, 0xecb8fda9,
0xecb9fda8, 0xecb98fda, 0xecbafd98, 0xecba8fd9, 0xecba9fd8, 0xecba98fd,
0xedfcba98, 0xed8fcba9, 0xed9fcba8, 0xed98fcba, 0xedafcb98, 0xeda8fcb9,
0xeda9fcb8, 0xeda98fcb, 0xedbfca98, 0xedb8fca9, 0xedb9fca8, 0xedb98fca,
0xedbafc98, 0xedba8fc9, 0xedba9fc8, 0xedba98fc, 0xedcfba98, 0xedc8fba9,
0xedc9fba8, 0xedc98fba, 0xedcafb98, 0xedca8fb9, 0xedca9fb8, 0xedca98fb,
0xedcbfa98, 0xedcb8fa9, 0xedcb9fa8, 0xedcb98fa, 0xedcbaf98, 0xedcba8f9,
0xedcba9f8, 0xedcba98f, 0xfedcba98, 0xf8edcba9, 0xf9edcba8, 0xf98edcba,
0xfaedcb98, 0xfa8edcb9, 0xfa9edcb8, 0xfa98edcb, 0xfbedca98, 0xfb8edca9,
0xfb9edca8, 0xfb98edca, 0xfbaedc98, 0xfba8edc9, 0xfba9edc8, 0xfba98edc,
0xfcedba98, 0xfc8edba9, 0xfc9edba8, 0xfc98edba, 0xfcaedb98, 0xfca8edb9,
0xfca9edb8, 0xfca98edb, 0xfcbeda98, 0xfcb8eda9, 0xfcb9eda8, 0xfcb98eda,
0xfcbaed98, 0xfcba8ed9, 0xfcba9ed8, 0xfcba98ed, 0xfdecba98, 0xfd8ecba9,
0xfd9ecba8, 0xfd98ecba, 0xfdaecb98, 0xfda8ecb9, 0xfda9ecb8, 0xfda98ecb,
0xfdbeca98, 0xfdb8eca9, 0xfdb9eca8, 0xfdb98eca, 0xfdbaec98, 0xfdba8ec9,
0xfdba9ec8, 0xfdba98ec, 0xfdceba98, 0xfdc8eba9, 0xfdc9eba8, 0xfdc98eba,
0xfdcaeb98, 0xfdca8eb9, 0xfdca9eb8, 0xfdca98eb, 0xfdcbea98, 0xfdcb8ea9,
0xfdcb9ea8, 0xfdcb98ea, 0xfdcbae98, 0xfdcba8e9, 0xfdcba9e8, 0xfdcba98e,
0xfedcba98, 0xfe8dcba9, 0xfe9dcba8, 0xfe98dcba, 0xfeadcb98, 0xfea8dcb9,
0xfea9dcb8, 0xfea98dcb, 0xfebdca98, 0xfeb8dca9, 0xfeb9dca8, 0xfeb98dca,
0xfebadc98, 0xfeba8dc9, 0xfeba9dc8, 0xfeba98dc, 0xfecdba98, 0xfec8dba9,
0xfec9dba8, 0xfec98dba, 0xfecadb98, 0xfeca8db9, 0xfeca9db8, 0xfeca98db,
0xfecbda98, 0xfecb8da9, 0xfecb9da8, 0xfecb98da, 0xfecbad98, 0xfecba8d9,
0xfecba9d8, 0xfecba98d, 0xfedcba98, 0xfed8cba9, 0xfed9cba8, 0xfed98cba,
0xfedacb98, 0xfeda8cb9, 0xfeda9cb8, 0xfeda98cb, 0xfedbca98, 0xfedb8ca9,
0xfedb9ca8, 0xfedb98ca, 0xfedbac98, 0xfedba8c9, 0xfedba9c8, 0xfedba98c,
0xfedcba98, 0xfedc8ba9, 0xfedc9ba8, 0xfedc98ba, 0xfedcab98, 0xfedca8b9,
0xfedca9b8, 0xfedca98b, 0xfedcba98, 0xfedcb8a9, 0xfedcb9a8, 0xfedcb98a,
0xfedcba98, 0xfedcba89, 0xfedcba98, 0xfedcba98};
// No need to mask because <_mm256_permutevar8x32_epi32> ignores bits 3..31.
// Just shift each copy of the 32 bit LUT to extract its 4-bit fields.
@ -5058,12 +5119,12 @@ HWY_INLINE Indices256<uint32_t> IndicesFromNotBits(Full256<T> d,
// latency, it may be faster to use LoadDup128 and PSHUFB.
const auto packed = Set(d32, packed_array[mask_bits]);
alignas(32) constexpr uint32_t shifts[8] = {0, 4, 8, 12, 16, 20, 24, 28};
return Indices256<uint32_t>{(packed >> Load(d32, shifts)).raw};
return packed >> Load(d32, shifts);
}
template <typename T, HWY_IF_LANE_SIZE(T, 8)>
HWY_INLINE Indices256<uint32_t> IndicesFromNotBits(Full256<T> d,
uint64_t mask_bits) {
HWY_INLINE Vec256<uint32_t> IndicesFromNotBits(Full256<T> d,
uint64_t mask_bits) {
const Repartition<uint32_t, decltype(d)> d32;
// For 64-bit, we still need 32-bit indices because there is no 64-bit
@ -5071,13 +5132,15 @@ HWY_INLINE Indices256<uint32_t> IndicesFromNotBits(Full256<T> d,
// unpacking and load the entire index vector directly.
alignas(32) constexpr uint32_t u32_indices[128] = {
// PrintCompressNot64x4PairTables
0, 1, 2, 3, 4, 5, 6, 7, 2, 3, 4, 5, 6, 7, 0, 1, 0, 1, 4, 5, 6, 7,
2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 0, 1, 2, 3, 6, 7, 4, 5, 2, 3, 6, 7,
0, 1, 4, 5, 0, 1, 6, 7, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 0, 1,
2, 3, 4, 5, 6, 7, 2, 3, 4, 5, 0, 1, 6, 7, 0, 1, 4, 5, 2, 3, 6, 7,
4, 5, 0, 1, 2, 3, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7, 2, 3, 0, 1, 4, 5,
6, 7, 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7};
return Indices256<uint32_t>{Load(d32, u32_indices + 8 * mask_bits).raw};
8, 9, 10, 11, 12, 13, 14, 15, 10, 11, 12, 13, 14, 15, 8, 9,
8, 9, 12, 13, 14, 15, 10, 11, 12, 13, 14, 15, 8, 9, 10, 11,
8, 9, 10, 11, 14, 15, 12, 13, 10, 11, 14, 15, 8, 9, 12, 13,
8, 9, 14, 15, 10, 11, 12, 13, 14, 15, 8, 9, 10, 11, 12, 13,
8, 9, 10, 11, 12, 13, 14, 15, 10, 11, 12, 13, 8, 9, 14, 15,
8, 9, 12, 13, 10, 11, 14, 15, 12, 13, 8, 9, 10, 11, 14, 15,
8, 9, 10, 11, 12, 13, 14, 15, 10, 11, 8, 9, 12, 13, 14, 15,
8, 9, 10, 11, 12, 13, 14, 15, 8, 9, 10, 11, 12, 13, 14, 15};
return Load(d32, u32_indices + 8 * mask_bits);
}
template <typename T, HWY_IF_NOT_LANE_SIZE(T, 2)>
HWY_INLINE Vec256<T> Compress(Vec256<T> v, const uint64_t mask_bits) {
@ -5085,7 +5148,9 @@ HWY_INLINE Vec256<T> Compress(Vec256<T> v, const uint64_t mask_bits) {
const Repartition<uint32_t, decltype(d)> du32;
HWY_DASSERT(mask_bits < (1ull << (32 / sizeof(T))));
const auto indices = IndicesFromBits(d, mask_bits);
// 32-bit indices because we only have _mm256_permutevar8x32_epi32 (there is
// no instruction for 4x64).
const Indices256<uint32_t> indices{IndicesFromBits(d, mask_bits).raw};
return BitCast(d, TableLookupLanes(BitCast(du32, v), indices));
}
@ -5135,7 +5200,9 @@ HWY_INLINE Vec256<T> CompressNot(Vec256<T> v, const uint64_t mask_bits) {
const Repartition<uint32_t, decltype(d)> du32;
HWY_DASSERT(mask_bits < (1ull << (32 / sizeof(T))));
const auto indices = IndicesFromNotBits(d, mask_bits);
// 32-bit indices because we only have _mm256_permutevar8x32_epi32 (there is
// no instruction for 4x64).
const Indices256<uint32_t> indices{IndicesFromNotBits(d, mask_bits).raw};
return BitCast(d, TableLookupLanes(BitCast(du32, v), indices));
}
@ -5199,7 +5266,22 @@ HWY_API size_t CompressBlendedStore(Vec256<T> v, Mask256<T> m, Full256<T> d,
T* HWY_RESTRICT unaligned) {
const uint64_t mask_bits = detail::BitsFromMask(m);
const size_t count = PopCount(mask_bits);
BlendedStore(detail::Compress(v, mask_bits), FirstN(d, count), d, unaligned);
const Repartition<uint32_t, decltype(d)> du32;
HWY_DASSERT(mask_bits < (1ull << (32 / sizeof(T))));
// 32-bit indices because we only have _mm256_permutevar8x32_epi32 (there is
// no instruction for 4x64). Nibble MSB encodes FirstN.
const Vec256<uint32_t> idx_and_mask = detail::IndicesFromBits(d, mask_bits);
// Shift nibble MSB into MSB
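// (Each u32 lane of idx_and_mask carries the FirstN flag from IndicesFromBits
// in bit 3; shifting it into bit 31 lets MaskFromVec recover the FirstN mask.)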
const Mask256<uint32_t> mask32 = MaskFromVec(ShiftLeft<28>(idx_and_mask));
// First cast to unsigned (RebindMask cannot change lane size)
const Mask256<MakeUnsigned<T>> mask_u{mask32.raw};
const Mask256<T> mask = RebindMask(d, mask_u);
const Vec256<T> compressed =
BitCast(d, TableLookupLanes(BitCast(du32, v),
Indices256<uint32_t>{idx_and_mask.raw}));
BlendedStore(compressed, mask, d, unaligned);
// Workaround for MSAN not marking output as initialized (b/233326619)
#if HWY_IS_MSAN
__msan_unpoison(unaligned, count * sizeof(T));
@ -5429,6 +5511,28 @@ HWY_INLINE Vec256<T> MaxOfLanes(hwy::SizeTag<8> /* tag */,
return Max(v10, v01);
}
HWY_API Vec256<uint16_t> SumOfLanes(hwy::SizeTag<2> /* tag */,
Vec256<uint16_t> v) {
const Full256<uint16_t> d;
const RepartitionToWide<decltype(d)> d32;
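// Sum the even and odd u16 halves in u32 lanes; the low 16 bits of the total
// equal the u16 lane sum (mod 2^16), which is broadcast below.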
const auto even = And(BitCast(d32, v), Set(d32, 0xFFFF));
const auto odd = ShiftRight<16>(BitCast(d32, v));
const auto sum = SumOfLanes(hwy::SizeTag<4>(), even + odd);
// Also broadcast into odd lanes.
return OddEven(BitCast(d, ShiftLeft<16>(sum)), BitCast(d, sum));
}
HWY_API Vec256<int16_t> SumOfLanes(hwy::SizeTag<2> /* tag */,
Vec256<int16_t> v) {
const Full256<int16_t> d;
const RepartitionToWide<decltype(d)> d32;
// Sign-extend
const auto even = ShiftRight<16>(ShiftLeft<16>(BitCast(d32, v)));
const auto odd = ShiftRight<16>(BitCast(d32, v));
const auto sum = SumOfLanes(hwy::SizeTag<4>(), even + odd);
// Also broadcast into odd lanes.
return OddEven(BitCast(d, ShiftLeft<16>(sum)), BitCast(d, sum));
}
HWY_API Vec256<uint16_t> MinOfLanes(hwy::SizeTag<2> /* tag */,
Vec256<uint16_t> v) {
const Full256<uint16_t> d;
@ -5475,7 +5579,7 @@ HWY_API Vec256<int16_t> MaxOfLanes(hwy::SizeTag<2> /* tag */,
} // namespace detail
// Supported for {uif}32x8, {uif}64x4. Returns the sum in each lane.
// Supported for {uif}{32,64},{ui}16. Returns the broadcasted result.
template <typename T>
HWY_API Vec256<T> SumOfLanes(Full256<T> d, const Vec256<T> vHL) {
const Vec256<T> vLH = ConcatLowerUpper(d, vHL, vHL);

View File

@ -113,6 +113,9 @@ class Vec512 {
using Raw = typename detail::Raw512<T>::type;
public:
using PrivateT = T; // only for DFromV
static constexpr size_t kPrivateN = 64 / sizeof(T); // only for DFromV
// Compound assignment. Only usable if there is a corresponding non-member
// binary operator overload. For example, only f32 and f64 support division.
HWY_INLINE Vec512& operator*=(const Vec512 other) {
@ -146,6 +149,9 @@ struct Mask512 {
typename detail::RawMask512<sizeof(T)>::type raw;
};
template <typename T>
using Full512 = Simd<T, 64 / sizeof(T), 0>;
// ------------------------------ BitCast
namespace detail {
@ -1775,6 +1781,43 @@ HWY_INLINE Mask512<T> Xor(hwy::SizeTag<8> /*tag*/, const Mask512<T> a,
#endif
}
template <typename T>
HWY_INLINE Mask512<T> ExclusiveNeither(hwy::SizeTag<1> /*tag*/,
const Mask512<T> a, const Mask512<T> b) {
#if HWY_COMPILER_HAS_MASK_INTRINSICS
return Mask512<T>{_kxnor_mask64(a.raw, b.raw)};
#else
return Mask512<T>{~(a.raw ^ b.raw)};
#endif
}
template <typename T>
HWY_INLINE Mask512<T> ExclusiveNeither(hwy::SizeTag<2> /*tag*/,
const Mask512<T> a, const Mask512<T> b) {
#if HWY_COMPILER_HAS_MASK_INTRINSICS
return Mask512<T>{_kxnor_mask32(a.raw, b.raw)};
#else
return Mask512<T>{static_cast<__mmask32>(~(a.raw ^ b.raw) & 0xFFFFFFFF)};
#endif
}
template <typename T>
HWY_INLINE Mask512<T> ExclusiveNeither(hwy::SizeTag<4> /*tag*/,
const Mask512<T> a, const Mask512<T> b) {
#if HWY_COMPILER_HAS_MASK_INTRINSICS
return Mask512<T>{_kxnor_mask16(a.raw, b.raw)};
#else
return Mask512<T>{static_cast<__mmask16>(~(a.raw ^ b.raw) & 0xFFFF)};
#endif
}
template <typename T>
HWY_INLINE Mask512<T> ExclusiveNeither(hwy::SizeTag<8> /*tag*/,
const Mask512<T> a, const Mask512<T> b) {
#if HWY_COMPILER_HAS_MASK_INTRINSICS
return Mask512<T>{_kxnor_mask8(a.raw, b.raw)};
#else
return Mask512<T>{static_cast<__mmask8>(~(a.raw ^ b.raw) & 0xFF)};
#endif
}
} // namespace detail
template <typename T>
@ -1802,6 +1845,11 @@ HWY_API Mask512<T> Xor(const Mask512<T> a, Mask512<T> b) {
return detail::Xor(hwy::SizeTag<sizeof(T)>(), a, b);
}
template <typename T>
HWY_API Mask512<T> ExclusiveNeither(const Mask512<T> a, Mask512<T> b) {
return detail::ExclusiveNeither(hwy::SizeTag<sizeof(T)>(), a, b);
}
// ------------------------------ BroadcastSignBit (ShiftRight, compare, mask)
HWY_API Vec512<int8_t> BroadcastSignBit(const Vec512<int8_t> v) {
@ -3285,6 +3333,11 @@ HWY_API Vec512<bfloat16_t> ReorderDemote2To(Full512<bfloat16_t> dbf16,
return BitCast(dbf16, OddEven(BitCast(du16, a), BitCast(du16, b_in_even)));
}
HWY_API Vec512<int16_t> ReorderDemote2To(Full512<int16_t> /*d16*/,
Vec512<int32_t> a, Vec512<int32_t> b) {
return Vec512<int16_t>{_mm512_packs_epi32(a.raw, b.raw)};
}
HWY_API Vec256<float> DemoteTo(Full256<float> /* tag */,
const Vec512<double> v) {
return Vec256<float>{_mm512_cvtpd_ps(v.raw)};
@ -3646,15 +3699,21 @@ HWY_API size_t CountTrue(const Full512<T> /* tag */, const Mask512<T> mask) {
}
template <typename T, HWY_IF_NOT_LANE_SIZE(T, 1)>
HWY_API intptr_t FindFirstTrue(const Full512<T> /* tag */,
const Mask512<T> mask) {
return mask.raw ? intptr_t(Num0BitsBelowLS1Bit_Nonzero32(mask.raw)) : -1;
HWY_API size_t FindKnownFirstTrue(const Full512<T> /* tag */,
const Mask512<T> mask) {
return Num0BitsBelowLS1Bit_Nonzero32(mask.raw);
}
template <typename T, HWY_IF_LANE_SIZE(T, 1)>
HWY_API intptr_t FindFirstTrue(const Full512<T> /* tag */,
const Mask512<T> mask) {
return mask.raw ? intptr_t(Num0BitsBelowLS1Bit_Nonzero64(mask.raw)) : -1;
HWY_API size_t FindKnownFirstTrue(const Full512<T> /* tag */,
const Mask512<T> mask) {
return Num0BitsBelowLS1Bit_Nonzero64(mask.raw);
}
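// FindFirstTrue (below) also handles an all-false mask; FindKnownFirstTrue
// requires at least one true lane because the _Nonzero bit scans above assume
// a nonzero argument.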
template <typename T>
HWY_API intptr_t FindFirstTrue(const Full512<T> d, const Mask512<T> mask) {
return mask.raw ? static_cast<intptr_t>(FindKnownFirstTrue(d, mask))
: intptr_t{-1};
}
// ------------------------------ Compress
@ -3672,7 +3731,9 @@ template <typename T, HWY_IF_LANE_SIZE(T, 8)>
HWY_API Vec512<T> Compress(Vec512<T> v, Mask512<T> mask) {
// See CompressIsPartition. u64 is faster than u32.
alignas(16) constexpr uint64_t packed_array[256] = {
// PrintCompress32x8Tables
// From PrintCompress32x8Tables, without the FirstN extension (there is
// no benefit to including them because 64-bit CompressStore is anyway
// masked, but also no harm because TableLookupLanes ignores the MSB).
0x76543210, 0x76543210, 0x76543201, 0x76543210, 0x76543102, 0x76543120,
0x76543021, 0x76543210, 0x76542103, 0x76542130, 0x76542031, 0x76542310,
0x76541032, 0x76541320, 0x76540321, 0x76543210, 0x76532104, 0x76532140,
@ -3781,7 +3842,7 @@ HWY_API Vec512<T> Compress(Vec512<T> v, const Mask512<T> mask) {
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31};
const auto idx = LoadU(du, iota + 32 - num0);
const Vec512<uint16_t> idx = LoadU(du, iota + 32 - num0);
const Vec512<uint16_t> cu{_mm512_mask_permutexvar_epi16(
demoted0.raw, m_upper, idx.raw, demoted1.raw)};
#endif // HWY_TARGET == HWY_AVX3_DL
@ -3800,7 +3861,9 @@ template <typename T, HWY_IF_LANE_SIZE(T, 8)>
HWY_API Vec512<T> CompressNot(Vec512<T> v, Mask512<T> mask) {
// See CompressIsPartition. u64 is faster than u32.
alignas(16) constexpr uint64_t packed_array[256] = {
// PrintCompressNot32x8Tables
// From PrintCompressNot32x8Tables, without the FirstN extension (there is
// no benefit to including them because 64-bit CompressStore is anyway
// masked, but also no harm because TableLookupLanes ignores the MSB).
0x76543210, 0x07654321, 0x17654320, 0x10765432, 0x27654310, 0x20765431,
0x21765430, 0x21076543, 0x37654210, 0x30765421, 0x31765420, 0x31076542,
0x32765410, 0x32076541, 0x32176540, 0x32107654, 0x47653210, 0x40765321,
@ -4149,7 +4212,7 @@ HWY_API void StoreTransposedBlocks4(const Vec512<T> i, const Vec512<T> j,
HWY_INLINE Vec512<uint64_t> MulEven(const Vec512<uint64_t> a,
const Vec512<uint64_t> b) {
const DFromV<decltype(a)> du64;
const Full512<uint64_t> du64;
const RepartitionToNarrow<decltype(du64)> du32;
const auto maskL = Set(du64, 0xFFFFFFFFULL);
const auto a32 = BitCast(du32, a);
@ -4178,7 +4241,7 @@ HWY_INLINE Vec512<uint64_t> MulEven(const Vec512<uint64_t> a,
HWY_INLINE Vec512<uint64_t> MulOdd(const Vec512<uint64_t> a,
const Vec512<uint64_t> b) {
const DFromV<decltype(a)> du64;
const Full512<uint64_t> du64;
const RepartitionToNarrow<decltype(du64)> du32;
const auto maskL = Set(du64, 0xFFFFFFFFULL);
const auto a32 = BitCast(du32, a);
@ -4203,25 +4266,13 @@ HWY_INLINE Vec512<uint64_t> MulOdd(const Vec512<uint64_t> a,
return InterleaveUpper(du64, mulL, mulH);
}
// ------------------------------ ReorderWidenMulAccumulate (MulAdd, ZipLower)
HWY_API Vec512<float> ReorderWidenMulAccumulate(Full512<float> df32,
Vec512<bfloat16_t> a,
Vec512<bfloat16_t> b,
const Vec512<float> sum0,
Vec512<float>& sum1) {
// TODO(janwas): _mm512_dpbf16_ps when available
const Repartition<uint16_t, decltype(df32)> du16;
const RebindToUnsigned<decltype(df32)> du32;
const Vec512<uint16_t> zero = Zero(du16);
// Lane order within sum0/1 is undefined, hence we can avoid the
// longer-latency lane-crossing PromoteTo.
const Vec512<uint32_t> a0 = ZipLower(du32, zero, BitCast(du16, a));
const Vec512<uint32_t> a1 = ZipUpper(du32, zero, BitCast(du16, a));
const Vec512<uint32_t> b0 = ZipLower(du32, zero, BitCast(du16, b));
const Vec512<uint32_t> b1 = ZipUpper(du32, zero, BitCast(du16, b));
sum1 = MulAdd(BitCast(df32, a1), BitCast(df32, b1), sum1);
return MulAdd(BitCast(df32, a0), BitCast(df32, b0), sum0);
// ------------------------------ ReorderWidenMulAccumulate
HWY_API Vec512<int32_t> ReorderWidenMulAccumulate(Full512<int32_t> /*d32*/,
Vec512<int16_t> a,
Vec512<int16_t> b,
const Vec512<int32_t> sum0,
Vec512<int32_t>& /*sum1*/) {
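// _mm512_madd_epi16 multiplies adjacent i16 pairs and adds each pair into one
// i32 lane, so the entire widened product fits in the returned sum0 and sum1
// is left unchanged.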
return sum0 + Vec512<int32_t>{_mm512_madd_epi16(a.raw, b.raw)};
}
// ------------------------------ Reductions
@ -4245,6 +4296,23 @@ HWY_API Vec512<float> SumOfLanes(Full512<float> d, Vec512<float> v) {
HWY_API Vec512<double> SumOfLanes(Full512<double> d, Vec512<double> v) {
return Set(d, _mm512_reduce_add_pd(v.raw));
}
HWY_API Vec512<uint16_t> SumOfLanes(Full512<uint16_t> d, Vec512<uint16_t> v) {
const RepartitionToWide<decltype(d)> d32;
const auto even = And(BitCast(d32, v), Set(d32, 0xFFFF));
const auto odd = ShiftRight<16>(BitCast(d32, v));
const auto sum = SumOfLanes(d32, even + odd);
// Also broadcast into odd lanes.
return OddEven(BitCast(d, ShiftLeft<16>(sum)), BitCast(d, sum));
}
HWY_API Vec512<int16_t> SumOfLanes(Full512<int16_t> d, Vec512<int16_t> v) {
const RepartitionToWide<decltype(d)> d32;
// Sign-extend
const auto even = ShiftRight<16>(ShiftLeft<16>(BitCast(d32, v)));
const auto odd = ShiftRight<16>(BitCast(d32, v));
const auto sum = SumOfLanes(d32, even + odd);
// Also broadcast into odd lanes.
return OddEven(BitCast(d, ShiftLeft<16>(sum)), BitCast(d, sum));
}
// Returns the minimum in each lane.
HWY_API Vec512<int32_t> MinOfLanes(Full512<int32_t> d, Vec512<int32_t> v) {

View File

@ -35,7 +35,7 @@ namespace hwy {
namespace HWY_NAMESPACE {
// Prints lanes around `lane`, in memory order.
template <class D, class V = Vec<D>>
template <class D, class V = VFromD<D>>
void Print(const D d, const char* caption, VecArg<V> v, size_t lane_u = 0,
size_t max_lanes = 7) {
const size_t N = Lanes(d);

View File

@ -43,7 +43,6 @@
#endif // HWY_COMPILER_MSVC
#elif HWY_ARCH_ARM && HWY_OS_LINUX
#include <asm/hwcap.h>
#include <sys/auxv.h>
#endif // HWY_ARCH_*
@ -104,7 +103,7 @@ int64_t supported_targets_for_test_ = 0;
int64_t supported_mask_ = LimitsMax<int64_t>();
#if HWY_ARCH_X86
// Arbritrary bit indices indicating which instruction set extensions are
// Arbitrary bit indices indicating which instruction set extensions are
// supported. Use enum to ensure values are distinct.
enum class FeatureIndex : uint32_t {
kSSE = 0,

View File

@ -16,7 +16,11 @@
#ifndef HIGHWAY_HWY_TARGETS_H_
#define HIGHWAY_HWY_TARGETS_H_
// Allows opting out of C++ standard library usage, which is not available in
// some Compiler Explorer environments.
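// Define HWY_NO_LIBCXX (e.g. -DHWY_NO_LIBCXX on the compiler command line) to
// skip <vector> and <atomic> and omit SupportedAndGeneratedTargets().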
#ifndef HWY_NO_LIBCXX
#include <vector>
#endif
// For SIMD module implementations and their callers. Defines which targets to
// generate and call.
@ -25,7 +29,7 @@
#include "hwy/detect_targets.h"
#include "hwy/highway_export.h"
#if !HWY_ARCH_RVV
#if !HWY_ARCH_RVV && !defined(HWY_NO_LIBCXX)
#include <atomic>
#endif
@ -61,6 +65,8 @@ HWY_DLLEXPORT void DisableTargets(int64_t disabled_targets);
// all targets.
HWY_DLLEXPORT void SetSupportedTargetsForTest(int64_t targets);
#ifndef HWY_NO_LIBCXX
// Return the list of targets in HWY_TARGETS supported by the CPU as a list of
// individual HWY_* target macros such as HWY_SCALAR or HWY_NEON. This list
// is affected by the current SetSupportedTargetsForTest() mock if any.
@ -74,6 +80,8 @@ HWY_INLINE std::vector<int64_t> SupportedAndGeneratedTargets() {
return ret;
}
#endif // HWY_NO_LIBCXX
static inline HWY_MAYBE_UNUSED const char* TargetName(int64_t target) {
switch (target) {
#if HWY_ARCH_X86
@ -296,8 +304,8 @@ struct ChosenTarget {
}
private:
// TODO(janwas): remove #if once <atomic> is available
#if HWY_ARCH_RVV
// TODO(janwas): remove RVV once <atomic> is available
#if HWY_ARCH_RVV || defined(HWY_NO_LIBCXX)
int64_t LoadMask() const { return mask_; }
void StoreMask(int64_t mask) { mask_ = mask; }

View File

@ -37,6 +37,7 @@ DECLARE_FUNCTION(SVE_256)
DECLARE_FUNCTION(SVE2_128)
DECLARE_FUNCTION(PPC8)
DECLARE_FUNCTION(WASM)
DECLARE_FUNCTION(WASM_EMU256)
DECLARE_FUNCTION(RVV)
DECLARE_FUNCTION(SCALAR)
DECLARE_FUNCTION(EMU128)
@ -81,6 +82,7 @@ void CheckFakeFunction() {
CallFunctionForTarget(HWY_SVE2_128, __LINE__);
CallFunctionForTarget(HWY_PPC8, __LINE__);
CallFunctionForTarget(HWY_WASM, __LINE__);
CallFunctionForTarget(HWY_WASM_EMU256, __LINE__);
CallFunctionForTarget(HWY_RVV, __LINE__);
// The tables only have space for either HWY_SCALAR or HWY_EMU128; the former
// is opt-in only.

View File

@ -17,6 +17,8 @@
#include <stdint.h>
#include <string.h> // memcpy
#include <algorithm> // std::fill
#undef HWY_TARGET_INCLUDE
#define HWY_TARGET_INCLUDE "tests/blockwise_shift_test.cc"
#include "hwy/foreach_target.h" // IWYU pragma: keep

View File

@ -17,6 +17,8 @@
#include <stdint.h>
#include <string.h>
#include <algorithm> // std::fill
#undef HWY_TARGET_INCLUDE
#define HWY_TARGET_INCLUDE "tests/blockwise_test.cc"
#include "hwy/foreach_target.h" // IWYU pragma: keep

View File

@ -17,6 +17,8 @@
#include <stdint.h>
#include <string.h> // memcpy
#include <algorithm> // std::fill
#undef HWY_TARGET_INCLUDE
#define HWY_TARGET_INCLUDE "tests/combine_test.cc"
#include "hwy/foreach_target.h" // IWYU pragma: keep

View File

@ -338,7 +338,7 @@ HWY_NOINLINE void TestAllLt128Upper() {
ForGEVectors<128, TestLt128Upper>()(uint64_t());
}
struct TestEq128 {
struct TestEq128 { // Also Ne128
template <typename T, class D>
HWY_NOINLINE void operator()(T /*unused*/, D d) {
using V = Vec<D>;
@ -353,15 +353,24 @@ struct TestEq128 {
HWY_ASSERT_MASK_EQ(d, mask_true, Eq128(d, v00, v00));
HWY_ASSERT_MASK_EQ(d, mask_true, Eq128(d, v01, v01));
HWY_ASSERT_MASK_EQ(d, mask_true, Eq128(d, v10, v10));
HWY_ASSERT_MASK_EQ(d, mask_false, Ne128(d, v00, v00));
HWY_ASSERT_MASK_EQ(d, mask_false, Ne128(d, v01, v01));
HWY_ASSERT_MASK_EQ(d, mask_false, Ne128(d, v10, v10));
HWY_ASSERT_MASK_EQ(d, mask_false, Eq128(d, v00, v01));
HWY_ASSERT_MASK_EQ(d, mask_false, Eq128(d, v01, v10));
HWY_ASSERT_MASK_EQ(d, mask_false, Eq128(d, v01, v11));
HWY_ASSERT_MASK_EQ(d, mask_true, Ne128(d, v00, v01));
HWY_ASSERT_MASK_EQ(d, mask_true, Ne128(d, v01, v10));
HWY_ASSERT_MASK_EQ(d, mask_true, Ne128(d, v01, v11));
// Reversed order
HWY_ASSERT_MASK_EQ(d, mask_false, Eq128(d, v01, v00));
HWY_ASSERT_MASK_EQ(d, mask_false, Eq128(d, v10, v01));
HWY_ASSERT_MASK_EQ(d, mask_false, Eq128(d, v11, v01));
HWY_ASSERT_MASK_EQ(d, mask_true, Ne128(d, v01, v00));
HWY_ASSERT_MASK_EQ(d, mask_true, Ne128(d, v10, v01));
HWY_ASSERT_MASK_EQ(d, mask_true, Ne128(d, v11, v01));
// Also check 128-bit blocks are independent
const V iota = Iota(d, 1);
@ -369,10 +378,16 @@ struct TestEq128 {
HWY_ASSERT_MASK_EQ(d, mask_false, Eq128(d, iota, Add(iota, v10)));
HWY_ASSERT_MASK_EQ(d, mask_false, Eq128(d, Add(iota, v01), iota));
HWY_ASSERT_MASK_EQ(d, mask_false, Eq128(d, Add(iota, v10), iota));
HWY_ASSERT_MASK_EQ(d, mask_true, Ne128(d, iota, Add(iota, v01)));
HWY_ASSERT_MASK_EQ(d, mask_true, Ne128(d, iota, Add(iota, v10)));
HWY_ASSERT_MASK_EQ(d, mask_true, Ne128(d, Add(iota, v01), iota));
HWY_ASSERT_MASK_EQ(d, mask_true, Ne128(d, Add(iota, v10), iota));
// Max value
const V vm = Make128(d, LimitsMax<T>(), LimitsMax<T>());
HWY_ASSERT_MASK_EQ(d, mask_true, Eq128(d, vm, vm));
HWY_ASSERT_MASK_EQ(d, mask_false, Ne128(d, vm, vm));
HWY_ASSERT_MASK_EQ(d, mask_false, Eq128(d, vm, v00));
HWY_ASSERT_MASK_EQ(d, mask_false, Eq128(d, vm, v01));
HWY_ASSERT_MASK_EQ(d, mask_false, Eq128(d, vm, v10));
@ -381,12 +396,21 @@ struct TestEq128 {
HWY_ASSERT_MASK_EQ(d, mask_false, Eq128(d, v01, vm));
HWY_ASSERT_MASK_EQ(d, mask_false, Eq128(d, v10, vm));
HWY_ASSERT_MASK_EQ(d, mask_false, Eq128(d, v11, vm));
HWY_ASSERT_MASK_EQ(d, mask_true, Ne128(d, vm, v00));
HWY_ASSERT_MASK_EQ(d, mask_true, Ne128(d, vm, v01));
HWY_ASSERT_MASK_EQ(d, mask_true, Ne128(d, vm, v10));
HWY_ASSERT_MASK_EQ(d, mask_true, Ne128(d, vm, v11));
HWY_ASSERT_MASK_EQ(d, mask_true, Ne128(d, v00, vm));
HWY_ASSERT_MASK_EQ(d, mask_true, Ne128(d, v01, vm));
HWY_ASSERT_MASK_EQ(d, mask_true, Ne128(d, v10, vm));
HWY_ASSERT_MASK_EQ(d, mask_true, Ne128(d, v11, vm));
}
};
HWY_NOINLINE void TestAllEq128() { ForGEVectors<128, TestEq128>()(uint64_t()); }
struct TestEq128Upper {
struct TestEq128Upper { // Also Ne128Upper
template <typename T, class D>
HWY_NOINLINE void operator()(T /*unused*/, D d) {
using V = Vec<D>;
@ -401,26 +425,43 @@ struct TestEq128Upper {
HWY_ASSERT_MASK_EQ(d, mask_true, Eq128Upper(d, v00, v00));
HWY_ASSERT_MASK_EQ(d, mask_true, Eq128Upper(d, v01, v01));
HWY_ASSERT_MASK_EQ(d, mask_true, Eq128Upper(d, v10, v10));
HWY_ASSERT_MASK_EQ(d, mask_false, Ne128Upper(d, v00, v00));
HWY_ASSERT_MASK_EQ(d, mask_false, Ne128Upper(d, v01, v01));
HWY_ASSERT_MASK_EQ(d, mask_false, Ne128Upper(d, v10, v10));
HWY_ASSERT_MASK_EQ(d, mask_true, Eq128Upper(d, v00, v01));
HWY_ASSERT_MASK_EQ(d, mask_false, Ne128Upper(d, v00, v01));
HWY_ASSERT_MASK_EQ(d, mask_false, Eq128Upper(d, v01, v10));
HWY_ASSERT_MASK_EQ(d, mask_false, Eq128Upper(d, v01, v11));
HWY_ASSERT_MASK_EQ(d, mask_true, Ne128Upper(d, v01, v10));
HWY_ASSERT_MASK_EQ(d, mask_true, Ne128Upper(d, v01, v11));
// Reversed order
HWY_ASSERT_MASK_EQ(d, mask_true, Eq128Upper(d, v01, v00));
HWY_ASSERT_MASK_EQ(d, mask_false, Ne128Upper(d, v01, v00));
HWY_ASSERT_MASK_EQ(d, mask_false, Eq128Upper(d, v10, v01));
HWY_ASSERT_MASK_EQ(d, mask_false, Eq128Upper(d, v11, v01));
HWY_ASSERT_MASK_EQ(d, mask_true, Ne128Upper(d, v10, v01));
HWY_ASSERT_MASK_EQ(d, mask_true, Ne128Upper(d, v11, v01));
// Also check 128-bit blocks are independent
const V iota = Iota(d, 1);
HWY_ASSERT_MASK_EQ(d, mask_true, Eq128Upper(d, iota, Add(iota, v01)));
HWY_ASSERT_MASK_EQ(d, mask_false, Ne128Upper(d, iota, Add(iota, v01)));
HWY_ASSERT_MASK_EQ(d, mask_false, Eq128Upper(d, iota, Add(iota, v10)));
HWY_ASSERT_MASK_EQ(d, mask_true, Ne128Upper(d, iota, Add(iota, v10)));
HWY_ASSERT_MASK_EQ(d, mask_true, Eq128Upper(d, Add(iota, v01), iota));
HWY_ASSERT_MASK_EQ(d, mask_false, Ne128Upper(d, Add(iota, v01), iota));
HWY_ASSERT_MASK_EQ(d, mask_false, Eq128Upper(d, Add(iota, v10), iota));
HWY_ASSERT_MASK_EQ(d, mask_true, Ne128Upper(d, Add(iota, v10), iota));
// Max value
const V vm = Make128(d, LimitsMax<T>(), LimitsMax<T>());
HWY_ASSERT_MASK_EQ(d, mask_true, Eq128Upper(d, vm, vm));
HWY_ASSERT_MASK_EQ(d, mask_false, Ne128Upper(d, vm, vm));
HWY_ASSERT_MASK_EQ(d, mask_false, Eq128Upper(d, vm, v00));
HWY_ASSERT_MASK_EQ(d, mask_false, Eq128Upper(d, vm, v01));
HWY_ASSERT_MASK_EQ(d, mask_false, Eq128Upper(d, vm, v10));
@ -429,6 +470,15 @@ struct TestEq128Upper {
HWY_ASSERT_MASK_EQ(d, mask_false, Eq128Upper(d, v01, vm));
HWY_ASSERT_MASK_EQ(d, mask_false, Eq128Upper(d, v10, vm));
HWY_ASSERT_MASK_EQ(d, mask_false, Eq128Upper(d, v11, vm));
HWY_ASSERT_MASK_EQ(d, mask_true, Ne128Upper(d, vm, v00));
HWY_ASSERT_MASK_EQ(d, mask_true, Ne128Upper(d, vm, v01));
HWY_ASSERT_MASK_EQ(d, mask_true, Ne128Upper(d, vm, v10));
HWY_ASSERT_MASK_EQ(d, mask_true, Ne128Upper(d, vm, v11));
HWY_ASSERT_MASK_EQ(d, mask_true, Ne128Upper(d, v00, vm));
HWY_ASSERT_MASK_EQ(d, mask_true, Ne128Upper(d, v01, vm));
HWY_ASSERT_MASK_EQ(d, mask_true, Ne128Upper(d, v10, vm));
HWY_ASSERT_MASK_EQ(d, mask_true, Ne128Upper(d, v11, vm));
}
};

View File

@ -37,13 +37,15 @@ namespace HWY_NAMESPACE {
#if !HWY_PRINT_TABLES || HWY_IDE
template <class D, class DI, typename T = TFromD<D>, typename TI = TFromD<DI>>
void CheckStored(D d, DI di, size_t expected_pos, size_t actual_pos,
size_t num_to_check, const AlignedFreeUniquePtr<T[]>& in,
void CheckStored(D d, DI di, const char* op, size_t expected_pos,
size_t actual_pos, size_t num_to_check,
const AlignedFreeUniquePtr<T[]>& in,
const AlignedFreeUniquePtr<TI[]>& mask_lanes,
const AlignedFreeUniquePtr<T[]>& expected, const T* actual_u,
int line) {
if (expected_pos != actual_pos) {
hwy::Abort(__FILE__, line, "Size mismatch for %s: expected %d, actual %d\n",
hwy::Abort(__FILE__, line,
"%s: size mismatch for %s: expected %d, actual %d\n", op,
TypeName(T(), Lanes(d)).c_str(), static_cast<int>(expected_pos),
static_cast<int>(actual_pos));
}
@ -51,7 +53,7 @@ void CheckStored(D d, DI di, size_t expected_pos, size_t actual_pos,
for (size_t i = 0; i < num_to_check; ++i) {
if (!IsEqual(expected[i], actual_u[i])) {
const size_t N = Lanes(d);
fprintf(stderr, "Mismatch at i=%d of %d, line %d:\n\n",
fprintf(stderr, "%s: mismatch at i=%d of %d, line %d:\n\n", op,
static_cast<int>(i), static_cast<int>(num_to_check), line);
Print(di, "mask", Load(di, mask_lanes.get()), 0, N);
Print(d, "in", Load(d, in.get()), 0, N);
@ -91,9 +93,9 @@ struct TestCompress {
for (size_t rep = 0; rep < AdjustedReps(200); ++rep) {
size_t expected_pos = 0;
for (size_t i = 0; i < N; ++i) {
const uint64_t bits = Random32(&rng);
const uint64_t r = Random32(&rng);
in_lanes[i] = T(); // cannot initialize float16_t directly.
CopyBytes<sizeof(T)>(&bits, &in_lanes[i]); // not same size
CopyBytes<sizeof(T)>(&r, &in_lanes[i]); // not same size
mask_lanes[i] = (Random32(&rng) & 1024) ? TI(1) : TI(0);
if (mask_lanes[i] > 0) {
expected[expected_pos++] = in_lanes[i];
@ -124,30 +126,32 @@ struct TestCompress {
// Compress
memset(actual_u, 0, N * sizeof(T));
StoreU(Compress(in, mask), d, actual_u);
CheckStored(d, di, expected_pos, expected_pos, num_to_check, in_lanes,
mask_lanes, expected, actual_u, __LINE__);
CheckStored(d, di, "Compress", expected_pos, expected_pos, num_to_check,
in_lanes, mask_lanes, expected, actual_u, __LINE__);
// CompressNot
memset(actual_u, 0, N * sizeof(T));
StoreU(CompressNot(in, Not(mask)), d, actual_u);
CheckStored(d, di, expected_pos, expected_pos, num_to_check, in_lanes,
mask_lanes, expected, actual_u, __LINE__);
CheckStored(d, di, "CompressNot", expected_pos, expected_pos,
num_to_check, in_lanes, mask_lanes, expected, actual_u,
__LINE__);
// CompressStore
memset(actual_u, 0, N * sizeof(T));
const size_t size1 = CompressStore(in, mask, d, actual_u);
// expected_pos instead of num_to_check because this op is not
// affected by CompressIsPartition.
CheckStored(d, di, expected_pos, size1, expected_pos, in_lanes,
mask_lanes, expected, actual_u, __LINE__);
CheckStored(d, di, "CompressStore", expected_pos, size1, expected_pos,
in_lanes, mask_lanes, expected, actual_u, __LINE__);
// CompressBlendedStore
memset(actual_u, 0, N * sizeof(T));
const size_t size2 = CompressBlendedStore(in, mask, d, actual_u);
// expected_pos instead of num_to_check because this op only writes
// the mask=true lanes.
CheckStored(d, di, expected_pos, size2, expected_pos, in_lanes,
mask_lanes, expected, actual_u, __LINE__);
CheckStored(d, di, "CompressBlendedStore", expected_pos, size2,
expected_pos, in_lanes, mask_lanes, expected, actual_u,
__LINE__);
// Subsequent lanes are untouched.
for (size_t i = size2; i < N; ++i) {
HWY_ASSERT_EQ(zero, actual_u[i]);
@ -156,16 +160,18 @@ struct TestCompress {
// CompressBits
memset(actual_u, 0, N * sizeof(T));
StoreU(CompressBits(in, bits.get()), d, actual_u);
CheckStored(d, di, expected_pos, expected_pos, num_to_check, in_lanes,
mask_lanes, expected, actual_u, __LINE__);
CheckStored(d, di, "CompressBits", expected_pos, expected_pos,
num_to_check, in_lanes, mask_lanes, expected, actual_u,
__LINE__);
// CompressBitsStore
memset(actual_u, 0, N * sizeof(T));
const size_t size3 = CompressBitsStore(in, bits.get(), d, actual_u);
// expected_pos instead of num_to_check because this op is not
// affected by CompressIsPartition.
CheckStored(d, di, expected_pos, size3, expected_pos, in_lanes,
mask_lanes, expected, actual_u, __LINE__);
CheckStored(d, di, "CompressBitsStore", expected_pos, size3,
expected_pos, in_lanes, mask_lanes, expected, actual_u,
__LINE__);
} // rep
} // frac
} // operator()
@ -230,8 +236,9 @@ struct TestCompressBlocks {
// CompressBlocksNot
memset(actual.get(), 0, N * sizeof(T));
StoreU(CompressBlocksNot(in, Not(mask)), d, actual.get());
CheckStored(d, di, expected_pos, expected_pos, num_to_check, in_lanes,
mask_lanes, expected, actual.get(), __LINE__);
CheckStored(d, di, "CompressBlocksNot", expected_pos, expected_pos,
num_to_check, in_lanes, mask_lanes, expected, actual.get(),
__LINE__);
} // rep
#endif // HWY_TARGET == HWY_SCALAR
} // operator()
@ -305,11 +312,13 @@ void PrintCompressNot16x8Tables() {
printf("\n");
}
// Compressed to nibbles, unpacked via variable right shift
// Compressed to nibbles, unpacked via variable right shift. Also includes
// FirstN bits in the nibble MSB.
void PrintCompress32x8Tables() {
printf("======================================= 32/64x8\n");
constexpr size_t N = 8; // AVX2 or 64-bit AVX3
for (uint64_t code = 0; code < (1ull << N); ++code) {
const size_t count = PopCount(code);
std::array<uint32_t, N> indices{0};
size_t pos = 0;
// All lanes where mask = true
@ -330,6 +339,10 @@ void PrintCompress32x8Tables() {
uint64_t packed = 0;
for (size_t i = 0; i < N; ++i) {
HWY_ASSERT(indices[i] < N);
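// The first PopCount(code) indices are the mask=true lanes; OR in N (= 8) to
// set the nibble MSB, which IndicesFromBits also uses to derive FirstN.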
if (i < count) {
indices[i] |= N;
HWY_ASSERT(indices[i] < 0x10);
}
packed += indices[i] << (i * 4);
}
@ -344,6 +357,7 @@ void PrintCompressNot32x8Tables() {
constexpr size_t N = 8; // AVX2 or 64-bit AVX3
for (uint64_t not_code = 0; not_code < (1ull << N); ++not_code) {
const uint64_t code = ~not_code;
const size_t count = PopCount(code);
std::array<uint32_t, N> indices{0};
size_t pos = 0;
// All lanes where mask = true
@ -364,6 +378,10 @@ void PrintCompressNot32x8Tables() {
uint64_t packed = 0;
for (size_t i = 0; i < N; ++i) {
HWY_ASSERT(indices[i] < N);
if (i < count) {
indices[i] |= N;
HWY_ASSERT(indices[i] < 0x10);
}
packed += indices[i] << (i * 4);
}
@ -504,11 +522,13 @@ void PrintCompressNot64x4Tables() {
printf("\n");
}
// Same as above, but prints pairs of u32 indices (for AVX2)
// Same as above, but prints pairs of u32 indices (for AVX2). Also includes
// FirstN bits in the nibble MSB.
void PrintCompress64x4PairTables() {
printf("======================================= 64x4 u32 index\n");
constexpr size_t N = 4; // AVX2
for (uint64_t code = 0; code < (1ull << N); ++code) {
const size_t count = PopCount(code);
std::array<size_t, N> indices{0};
size_t pos = 0;
// All lanes where mask = true
@ -530,8 +550,10 @@ void PrintCompress64x4PairTables() {
// interpreted modulo N. Compression is not worth the extra shift+AND
// because the table is anyway only 512 bytes.
for (size_t i = 0; i < N; ++i) {
printf("%d, %d, ", static_cast<int>(2 * indices[i] + 0),
static_cast<int>(2 * indices[i]) + 1);
const int first_n_bit = i < count ? 8 : 0;
const int low = static_cast<int>(2 * indices[i]) + first_n_bit;
HWY_ASSERT(low < 0x10);
printf("%d, %d, ", low, low + 1);
}
}
printf("\n");
@ -542,6 +564,7 @@ void PrintCompressNot64x4PairTables() {
constexpr size_t N = 4; // AVX2
for (uint64_t not_code = 0; not_code < (1ull << N); ++not_code) {
const uint64_t code = ~not_code;
const size_t count = PopCount(code);
std::array<size_t, N> indices{0};
size_t pos = 0;
// All lanes where mask = true
@ -563,8 +586,10 @@ void PrintCompressNot64x4PairTables() {
// interpreted modulo N. Compression is not worth the extra shift+AND
// because the table is anyway only 512 bytes.
for (size_t i = 0; i < N; ++i) {
printf("%d, %d, ", static_cast<int>(2 * indices[i] + 0),
static_cast<int>(2 * indices[i]) + 1);
const int first_n_bit = i < count ? 8 : 0;
const int low = static_cast<int>(2 * indices[i]) + first_n_bit;
HWY_ASSERT(low < 0x10);
printf("%d, %d, ", low, low + 1);
}
}
printf("\n");

View File

@ -16,6 +16,8 @@
#include <stddef.h>
#include <stdint.h>
#include <cmath> // std::isfinite
#undef HWY_TARGET_INCLUDE
#define HWY_TARGET_INCLUDE "tests/demote_test.cc"
#include "hwy/foreach_target.h" // IWYU pragma: keep

View File

@ -18,8 +18,9 @@
#include <stddef.h>
#include <stdint.h>
#include <algorithm>
#include <algorithm> // std::copy, std::fill
#include <limits>
#include <cmath> // std::abs, std::isnan, std::isinf, std::ceil, std::floor
#undef HWY_TARGET_INCLUDE
#define HWY_TARGET_INCLUDE "tests/float_test.cc"

View File

@ -22,7 +22,7 @@
#include <stdint.h>
#include <string>
#include <utility> // std::tuple
#include <tuple>
#include "gtest/gtest.h"
#include "hwy/highway.h"

View File

@ -17,6 +17,8 @@
#include <stdint.h>
#include <string.h> // memcmp
#include <algorithm> // std::fill
#undef HWY_TARGET_INCLUDE
#define HWY_TARGET_INCLUDE "tests/mask_test.cc"
#include "hwy/foreach_target.h" // IWYU pragma: keep
@ -189,7 +191,7 @@ HWY_NOINLINE void TestAllCountTrue() {
ForAllTypes(ForPartialVectors<TestCountTrue>());
}
struct TestFindFirstTrue {
struct TestFindFirstTrue { // Also FindKnownFirstTrue
template <class T, class D>
HWY_NOINLINE void operator()(T /*unused*/, D d) {
using TI = MakeSigned<T>; // For mask > 0 comparison
@ -203,17 +205,18 @@ struct TestFindFirstTrue {
HWY_ASSERT_EQ(intptr_t(-1), FindFirstTrue(d, MaskFalse(d)));
HWY_ASSERT_EQ(intptr_t(0), FindFirstTrue(d, MaskTrue(d)));
HWY_ASSERT_EQ(size_t(0), FindKnownFirstTrue(d, MaskTrue(d)));
for (size_t code = 1; code < (1ull << max_lanes); ++code) {
for (size_t i = 0; i < max_lanes; ++i) {
bool_lanes[i] = (code & (1ull << i)) ? TI(1) : TI(0);
}
const intptr_t expected = static_cast<intptr_t>(
Num0BitsBelowLS1Bit_Nonzero32(static_cast<uint32_t>(code)));
const size_t expected =
Num0BitsBelowLS1Bit_Nonzero32(static_cast<uint32_t>(code));
const auto mask = RebindMask(d, Gt(Load(di, bool_lanes.get()), Zero(di)));
const intptr_t actual = FindFirstTrue(d, mask);
HWY_ASSERT_EQ(expected, actual);
HWY_ASSERT_EQ(static_cast<intptr_t>(expected), FindFirstTrue(d, mask));
HWY_ASSERT_EQ(expected, FindKnownFirstTrue(d, mask));
}
}
};
@ -237,6 +240,11 @@ struct TestLogicalMask {
HWY_ASSERT_MASK_EQ(d, m0, Not(m_all));
HWY_ASSERT_MASK_EQ(d, m_all, Not(m0));
Print(d, ".", VecFromMask(d, ExclusiveNeither(m0, m0)));
HWY_ASSERT_MASK_EQ(d, m_all, ExclusiveNeither(m0, m0));
HWY_ASSERT_MASK_EQ(d, m0, ExclusiveNeither(m_all, m0));
HWY_ASSERT_MASK_EQ(d, m0, ExclusiveNeither(m0, m_all));
// For all combinations of zero/nonzero state of subset of lanes:
const size_t max_lanes = AdjustedLog2Reps(HWY_MIN(N, size_t(6)));
for (size_t code = 0; code < (1ull << max_lanes); ++code) {

View File

@ -23,6 +23,8 @@
#include <stddef.h>
#include <stdint.h>
#include <algorithm> // std::fill
#undef HWY_TARGET_INCLUDE
#define HWY_TARGET_INCLUDE "tests/memory_test.cc"
#include "hwy/cache_control.h"

View File

@ -26,6 +26,15 @@ HWY_BEFORE_NAMESPACE();
namespace hwy {
namespace HWY_NAMESPACE {
template <size_t kBits>
constexpr uint64_t FirstBits() {
return (1ull << kBits) - 1;
}
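// Specialization because shifting a 64-bit value left by 64 bits is undefined.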
template <>
constexpr uint64_t FirstBits<64>() {
return ~uint64_t{0};
}
struct TestUnsignedMul {
template <typename T, class D>
HWY_NOINLINE void operator()(T /*unused*/, D d) {
@ -56,9 +65,8 @@ struct TestUnsignedMul {
HWY_ASSERT_VEC_EQ(d, vmax, Mul(vmax, v1));
HWY_ASSERT_VEC_EQ(d, vmax, Mul(v1, vmax));
const size_t bits = sizeof(T) * 8;
const uint64_t mask = bits==64 ? (~uint64_t{0}) : (1ull << bits) - 1;
const T max2 = (static_cast<uint64_t>(max) * max) & mask;
constexpr uint64_t kMask = FirstBits<sizeof(T) * 8>();
const T max2 = (static_cast<uint64_t>(max) * max) & kMask;
HWY_ASSERT_VEC_EQ(d, Set(d, max2), Mul(vmax, vmax));
}
};
@ -349,64 +357,65 @@ struct TestReorderWidenMulAccumulate {
HWY_NOINLINE void operator()(TN /*unused*/, DN dn) {
using TW = MakeWide<TN>;
const RepartitionToWide<DN> dw;
const auto f0 = Zero(dw);
const auto f1 = Set(dw, 1.0f);
const auto fi = Iota(dw, 1);
const auto bf0 = ReorderDemote2To(dn, f0, f0);
const auto bf1 = ReorderDemote2To(dn, f1, f1);
const auto bfi = ReorderDemote2To(dn, fi, fi);
const size_t NW = Lanes(dw);
auto delta = AllocateAligned<TW>(2 * NW);
for (size_t i = 0; i < 2 * NW; ++i) {
delta[i] = 0.0f;
}
const Half<DN> dnh;
using VW = Vec<decltype(dw)>;
using VN = Vec<decltype(dn)>;
const size_t NN = Lanes(dn);
const VW f0 = Zero(dw);
const VW f1 = Set(dw, TW{1});
const VN bf0 = Zero(dn);
// Cannot Set() bfloat16_t directly.
const VN bf1 = ReorderDemote2To(dn, f1, f1);
// Any input zero => both outputs zero
auto sum1 = f0;
VW sum1 = f0;
HWY_ASSERT_VEC_EQ(dw, f0,
ReorderWidenMulAccumulate(dw, bf0, bf0, f0, sum1));
HWY_ASSERT_VEC_EQ(dw, f0, sum1);
HWY_ASSERT_VEC_EQ(dw, f0,
ReorderWidenMulAccumulate(dw, bf0, bfi, f0, sum1));
ReorderWidenMulAccumulate(dw, bf0, bf1, f0, sum1));
HWY_ASSERT_VEC_EQ(dw, f0, sum1);
HWY_ASSERT_VEC_EQ(dw, f0,
ReorderWidenMulAccumulate(dw, bfi, bf0, f0, sum1));
ReorderWidenMulAccumulate(dw, bf1, bf0, f0, sum1));
HWY_ASSERT_VEC_EQ(dw, f0, sum1);
// delta[p] := 1.0, all others zero. For each p: Dot(delta, all-ones) == 1.
for (size_t p = 0; p < 2 * NW; ++p) {
delta[p] = 1.0f;
const auto delta0 = Load(dw, delta.get() + 0);
const auto delta1 = Load(dw, delta.get() + NW);
delta[p] = 0.0f;
const auto bf_delta = ReorderDemote2To(dn, delta0, delta1);
// delta[p] := 1, all others zero. For each p: Dot(delta, all-ones) == 1.
auto delta_w = AllocateAligned<TW>(NN);
for (size_t p = 0; p < NN; ++p) {
// Workaround for incorrect Clang wasm codegen: re-initialize the entire
// array rather than zero-initialize once and then toggle lane p.
for (size_t i = 0; i < NN; ++i) {
delta_w[i] = static_cast<TW>(i == p);
}
const VW delta0 = Load(dw, delta_w.get());
const VW delta1 = Load(dw, delta_w.get() + NN / 2);
const VN delta = ReorderDemote2To(dn, delta0, delta1);
{
sum1 = f0;
const auto sum0 =
ReorderWidenMulAccumulate(dw, bf_delta, bf1, f0, sum1);
HWY_ASSERT_EQ(1.0f, GetLane(SumOfLanes(dw, Add(sum0, sum1))));
const VW sum0 = ReorderWidenMulAccumulate(dw, delta, bf1, f0, sum1);
HWY_ASSERT_EQ(TW{1}, GetLane(SumOfLanes(dw, Add(sum0, sum1))));
}
// Swapped arg order
{
sum1 = f0;
const auto sum0 =
ReorderWidenMulAccumulate(dw, bf1, bf_delta, f0, sum1);
HWY_ASSERT_EQ(1.0f, GetLane(SumOfLanes(dw, Add(sum0, sum1))));
const VW sum0 = ReorderWidenMulAccumulate(dw, bf1, delta, f0, sum1);
HWY_ASSERT_EQ(TW{1}, GetLane(SumOfLanes(dw, Add(sum0, sum1))));
}
// Start with nonzero sum0 or sum1
{
sum1 = delta1;
const auto sum0 =
ReorderWidenMulAccumulate(dw, bf_delta, bf1, delta0, sum1);
HWY_ASSERT_EQ(2.0f, GetLane(SumOfLanes(dw, Add(sum0, sum1))));
VW sum0 = PromoteTo(dw, LowerHalf(dnh, delta));
sum1 = PromoteTo(dw, UpperHalf(dnh, delta));
sum0 = ReorderWidenMulAccumulate(dw, delta, bf1, sum0, sum1);
HWY_ASSERT_EQ(TW{2}, GetLane(SumOfLanes(dw, Add(sum0, sum1))));
}
// Start with nonzero sum0 or sum1, and swap arg order
{
sum1 = delta1;
const auto sum0 =
ReorderWidenMulAccumulate(dw, bf1, bf_delta, delta0, sum1);
HWY_ASSERT_EQ(2.0f, GetLane(SumOfLanes(dw, Add(sum0, sum1))));
VW sum0 = PromoteTo(dw, LowerHalf(dnh, delta));
sum1 = PromoteTo(dw, UpperHalf(dnh, delta));
sum0 = ReorderWidenMulAccumulate(dw, bf1, delta, sum0, sum1);
HWY_ASSERT_EQ(TW{2}, GetLane(SumOfLanes(dw, Add(sum0, sum1))));
}
}
}
@ -414,6 +423,7 @@ struct TestReorderWidenMulAccumulate {
HWY_NOINLINE void TestAllReorderWidenMulAccumulate() {
ForShrinkableVectors<TestReorderWidenMulAccumulate>()(bfloat16_t());
ForShrinkableVectors<TestReorderWidenMulAccumulate>()(int16_t());
}
// NOLINTNEXTLINE(google-readability-namespace-comments)

View File

@ -54,6 +54,7 @@ struct TestSumOfLanes {
HWY_NOINLINE void TestAllSumOfLanes() {
ForUIF3264(ForPartialVectors<TestSumOfLanes>());
ForUI16(ForPartialVectors<TestSumOfLanes>());
}
struct TestMinOfLanes {
@ -170,10 +171,8 @@ HWY_NOINLINE void TestAllMinMaxOfLanes() {
const ForPartialVectors<TestMaxOfLanes> test_max;
ForUIF3264(test_min);
ForUIF3264(test_max);
test_min(uint16_t());
test_max(uint16_t());
test_min(int16_t());
test_max(int16_t());
ForUI16(test_min);
ForUI16(test_max);
}
struct TestSumsOf8 {

View File

@ -22,6 +22,7 @@
#include <stdint.h>
#include <string.h>
#include <cmath> // std::isnan
#include <string>
#include "hwy/aligned_allocator.h"

View File

@ -16,6 +16,8 @@
#include <stddef.h>
#include <stdint.h>
#include <string>
#undef HWY_TARGET_INCLUDE
#define HWY_TARGET_INCLUDE "tests/test_util_test.cc"
#include "hwy/foreach_target.h" // IWYU pragma: keep

View File

@ -1,4 +1,5 @@
prefix=@CMAKE_INSTALL_PREFIX@
exec_prefix=${prefix}
libdir=${exec_prefix}/@CMAKE_INSTALL_LIBDIR@
includedir=${prefix}/@CMAKE_INSTALL_INCLUDEDIR@

View File

@ -19,11 +19,11 @@ cd ..
rm -rf build
#######################################
echo DEBUG Clang 7
echo DEBUG Clang 9
rm -rf build_dbg
mkdir build_dbg
cd build_dbg
CXX=clang++-7 CC=clang-7 cmake .. -DHWY_WARNINGS_ARE_ERRORS:BOOL=ON -DCMAKE_BUILD_TYPE=Debug
CXX=clang++-9 CC=clang-9 cmake .. -DHWY_WARNINGS_ARE_ERRORS:BOOL=ON -DCMAKE_BUILD_TYPE=Debug
make -j
ctest -j
cd ..
@ -41,7 +41,7 @@ cd ..
rm -rf build_32
#######################################
for VER in 8 9 10; do
for VER in 10 11 12; do
echo GCC $VER
rm -rf build_g$VER
mkdir build_g$VER

View File

@ -24,6 +24,7 @@ Aous Naman <aous@unsw.edu.au>
Artem Selishchev
Biswapriyo Nath <nathbappai@gmail.com>
CanadianBaconBoi <beamconnor@gmail.com>
Damiano Albani <damiano.albani@gmail.com>
Daniel Novomeský <dnovomesky@gmail.com>
David Burnett <vargolsoft@gmail.com>
Dirk Lemstra <dirk@lemstra.org>
@ -31,6 +32,7 @@ Don Olmstead <don.j.olmstead@gmail.com>
Even Rouault <even.rouault@spatialys.com>
Fred Brennan <copypaste@kittens.ph>
Heiko Becker <heirecka@exherbo.org>
Jim Robinson <jimbo2150@gmail.com>
Jon Sneyers <jon@cloudinary.com>
Kai Hollberg <Schweinepriester@users.noreply.github.com>
Kleis Auke Wolthuizen <github@kleisauke.nl>

View File

@ -12,6 +12,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
of the input buffer.
- decoder API: new function `JxlDecoderSetImageBitDepth` to set the bit depth
of the output buffer.
- encoder API: add an effort 10 option for lossless compression.
## [0.7] - 2022-07-21

View File

@ -100,6 +100,10 @@ set(JPEGXL_ENABLE_DEVTOOLS false CACHE BOOL
"Build JPEGXL developer tools.")
set(JPEGXL_ENABLE_TOOLS true CACHE BOOL
"Build JPEGXL user tools: cjxl and djxl.")
set(JPEGXL_ENABLE_JPEGLI true CACHE BOOL
"Build jpegli library.")
set(JPEGXL_ENABLE_JPEGLI_LIBJPEG true CACHE BOOL
"Build libjpeg.so shared library based on jpegli.")
set(JPEGXL_ENABLE_DOXYGEN true CACHE BOOL
"Generate C API documentation using Doxygen.")
set(JPEGXL_ENABLE_MANPAGES true CACHE BOOL

View File

@ -69,6 +69,11 @@ if [[ "${ENABLE_WASM_SIMD}" -ne "0" ]]; then
CMAKE_EXE_LINKER_FLAGS="${CMAKE_EXE_LINKER_FLAGS} -msimd128"
fi
if [[ "${ENABLE_WASM_SIMD}" -eq "2" ]]; then
CMAKE_CXX_FLAGS="${CMAKE_CXX_FLAGS} -DHWY_WANT_WASM2"
CMAKE_C_FLAGS="${CMAKE_C_FLAGS} -DHWY_WANT_WASM2"
fi
if [[ ! -z "${HWY_BASELINE_TARGETS}" ]]; then
CMAKE_CXX_FLAGS="${CMAKE_CXX_FLAGS} -DHWY_BASELINE_TARGETS=${HWY_BASELINE_TARGETS}"
fi
@ -139,6 +144,7 @@ detect_clang_version() {
fi
local clang_version=$("${CC:-clang}" --version | head -n1)
clang_version=${clang_version#"Debian "}
clang_version=${clang_version#"Ubuntu "}
local llvm_tag
case "${clang_version}" in
"clang version 6."*)
@ -547,6 +553,7 @@ cmd_coverage_report() {
# Only print coverage information for the libjxl directories. The rest
# is not part of the code under test.
--filter '.*jxl/.*'
--exclude '.*_gbench.cc'
--exclude '.*_test.cc'
--exclude '.*_testonly..*'
--exclude '.*_debug.*'

View File

@ -14,4 +14,5 @@ override_dh_auto_configure:
-DJPEGXL_FORCE_SYSTEM_GTEST=ON \
-DJPEGXL_FORCE_SYSTEM_BROTLI=ON \
-DJPEGXL_FORCE_SYSTEM_HWY=ON \
-DJPEGXL_ENABLE_JPEGLI_LIBJPEG=OFF \
-DJPEGXL_ENABLE_PLUGINS=ON

View File

@ -14,7 +14,7 @@ MYDIR=$(dirname $(realpath "$0"))
# Git revisions we use for the given submodules. Update these whenever you
# update a git submodule.
THIRD_PARTY_BROTLI="35ef5c554d888bef217d449346067de05e269b30"
THIRD_PARTY_HIGHWAY="22e3d7276f4157d4a47586ba9fd91dd6303f441a"
THIRD_PARTY_HIGHWAY="f670ea580bb70b4113b63b9cdaa42ba9b10cd13a"
THIRD_PARTY_SKCMS="b25b07b4b07990811de121c0356155b2ba0f4318"
THIRD_PARTY_SJPEG="868ab558fad70fcbe8863ba4e85179eeb81cc840"
THIRD_PARTY_ZLIB="cacf7f1d4e3d44d871b605da3b647f07d718623f"

View File

@ -0,0 +1,10 @@
# Fast-lossless
This is a script to compile a standalone version of a JXL encoder that supports
lossless compression of 1- to 4-channel images and animations with up to 16 bits
per sample. It is very fast; compression is slightly worse than PNG for 8-bit
non-photographic content, and better or much better than PNG in all other cases.
The main encoder consists of two files, `lib/jxl/enc_fast_lossless.{cc,h}`; it
automatically selects and runs a SIMD implementation supported by your CPU.
This folder contains an example build script and `main` file.
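The call below is a sketch mirroring `fast_lossless_main.cc` in this folder;
`png`, `parallel_runner` and `num_threads` are as defined there, and
`lib/jxl/enc_fast_lossless.h` remains the authoritative prototype.
```c++
// Sketch adapted from fast_lossless_main.cc: `png` is the decoded pixel
// buffer, and `parallel_runner` / `num_threads` are defined as in that file.
unsigned char* encoded = nullptr;
size_t stride = width * nb_chans * (bitdepth > 8 ? 2 : 1);
size_t encoded_size = JxlFastLosslessEncode(
    png, width, stride, height, nb_chans, bitdepth,
    /*big_endian=*/true, effort, &encoded, &num_threads, +parallel_runner);
// `encoded` now holds `encoded_size` bytes of JXL data; free() it when done.
```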

View File

@ -20,7 +20,8 @@ fi
[ -f lodepng.h ] || curl -o lodepng.h --url 'https://raw.githubusercontent.com/lvandeve/lodepng/8c6a9e30576f07bf470ad6f09458a2dcd7a6a84a/lodepng.h'
[ -f lodepng.o ] || "$CXX" lodepng.cpp -O3 -o lodepng.o -c
"$CXX" -O3 -DFASTLL_ENABLE_NEON_INTRINSICS -fopenmp \
"$CXX" -O3 \
-I. lodepng.o \
"${DIR}"/fast_lossless.cc "${DIR}"/fast_lossless_main.cc \
-I"${DIR}"/../../ \
"${DIR}"/../../lib/jxl/enc_fast_lossless.cc "${DIR}"/fast_lossless_main.cc \
-o fast_lossless

View File

@ -18,9 +18,10 @@ fi
[ -f lodepng.cpp ] || curl -o lodepng.cpp --url 'https://raw.githubusercontent.com/lvandeve/lodepng/8c6a9e30576f07bf470ad6f09458a2dcd7a6a84a/lodepng.cpp'
[ -f lodepng.h ] || curl -o lodepng.h --url 'https://raw.githubusercontent.com/lvandeve/lodepng/8c6a9e30576f07bf470ad6f09458a2dcd7a6a84a/lodepng.h'
[ -f lodepng.o ] || "$CXX" lodepng.cpp -O3 -mavx2 -o lodepng.o -c
[ -f lodepng.o ] || "$CXX" lodepng.cpp -O3 -o lodepng.o -c
"$CXX" -O3 -mavx2 -DFASTLL_ENABLE_AVX2_INTRINSICS -fopenmp \
-I. lodepng.o \
"$DIR"/fast_lossless.cc "$DIR"/fast_lossless_main.cc \
"$CXX" -O3 \
-I. -g lodepng.o \
-I"$DIR"/../../ \
"$DIR"/../../lib/jxl/enc_fast_lossless.cc "$DIR"/fast_lossless_main.cc \
-o fast_lossless

View File

@ -0,0 +1,26 @@
#!/usr/bin/env bash
# Copyright (c) the JPEG XL Project Authors. All rights reserved.
#
# Use of this source code is governed by a BSD-style
# license that can be found in the LICENSE file.
set -e
DIR=$(realpath "$(dirname "$0")")
mkdir -p "$DIR"/build-aarch64
cd "$DIR"/build-aarch64
CXX="${CXX-aarch64-linux-gnu-c++}"
if ! command -v "$CXX" >/dev/null ; then
printf >&2 '%s: C++ compiler not found\n' "${0##*/}"
exit 1
fi
[ -f lodepng.cpp ] || curl -o lodepng.cpp --url 'https://raw.githubusercontent.com/lvandeve/lodepng/8c6a9e30576f07bf470ad6f09458a2dcd7a6a84a/lodepng.cpp'
[ -f lodepng.h ] || curl -o lodepng.h --url 'https://raw.githubusercontent.com/lvandeve/lodepng/8c6a9e30576f07bf470ad6f09458a2dcd7a6a84a/lodepng.h'
[ -f lodepng.o ] || "$CXX" lodepng.cpp -O3 -o lodepng.o -c
"$CXX" -O3 -static \
-I. lodepng.o \
-I"$DIR"/../../ \
"$DIR"/../../lib/jxl/enc_fast_lossless.cc "$DIR"/fast_lossless_main.cc \
-o fast_lossless

File diff suppressed because it is too large

View File

@ -1,23 +0,0 @@
// Copyright (c) the JPEG XL Project Authors. All rights reserved.
//
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
#ifndef FAST_LOSSLESS_H
#define FAST_LOSSLESS_H
#include <stdlib.h>
#ifdef __cplusplus
extern "C" {
#endif
size_t JxlFastLosslessEncode(const unsigned char* rgba, size_t width,
size_t row_stride, size_t height, size_t nb_chans,
size_t bitdepth, int effort,
unsigned char** output);
#ifdef __cplusplus
} // extern "C"
#endif
#endif

View File

@ -7,16 +7,20 @@
#include <stdlib.h>
#include <string.h>
#include <atomic>
#include <chrono>
#include <thread>
#include <vector>
#include "fast_lossless.h"
#include "lib/jxl/enc_fast_lossless.h"
#include "lodepng.h"
#include "pam-input.h"
int main(int argc, char** argv) {
if (argc < 3) {
fprintf(stderr, "Usage: %s in.png out.jxl [effort] [num_reps]\n", argv[0]);
fprintf(stderr,
"Usage: %s in.png out.jxl [effort] [num_reps] [num_threads]\n",
argv[0]);
return 1;
}
@ -24,6 +28,7 @@ int main(int argc, char** argv) {
const char* out = argv[2];
int effort = argc >= 4 ? atoi(argv[3]) : 2;
size_t num_reps = argc >= 5 ? atoi(argv[4]) : 1;
size_t num_threads = argc >= 6 ? atoi(argv[5]) : 0;
if (effort < 0 || effort > 127) {
fprintf(
@ -44,6 +49,35 @@ int main(int argc, char** argv) {
return 1;
}
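// JxlFastLosslessEncode takes a pluggable "runner": it receives an opaque
// pointer, a task function and a task count, and decides how to execute the
// tasks. The runner below spreads them over num_threads worker threads using
// an atomic task counter, or runs them inline when a single thread suffices.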
auto parallel_runner = [](void* num_threads_ptr, void* opaque,
void fun(void*, size_t), size_t count) {
size_t num_threads = *(size_t*)num_threads_ptr;
if (num_threads == 0) {
num_threads = std::thread::hardware_concurrency();
}
if (num_threads > count) {
num_threads = count;
}
if (num_threads == 1) {
for (size_t i = 0; i < count; i++) {
fun(opaque, i);
}
} else {
std::atomic<int> task{0};
std::vector<std::thread> threads;
for (size_t i = 0; i < num_threads; i++) {
threads.push_back(std::thread([count, opaque, fun, &task]() {
while (true) {
int t = task++;
if (t >= count) break;
fun(opaque, t);
}
}));
}
for (auto& t : threads) t.join();
}
};
size_t encoded_size = 0;
unsigned char* encoded = nullptr;
size_t stride = width * nb_chans * (bitdepth > 8 ? 2 : 1);
@ -51,8 +85,9 @@ int main(int argc, char** argv) {
auto start = std::chrono::high_resolution_clock::now();
for (size_t _ = 0; _ < num_reps; _++) {
free(encoded);
encoded_size = JxlFastLosslessEncode(png, width, stride, height, nb_chans,
bitdepth, effort, &encoded);
encoded_size = JxlFastLosslessEncode(
png, width, stride, height, nb_chans, bitdepth,
/*big_endian=*/true, effort, &encoded, &num_threads, +parallel_runner);
}
auto stop = std::chrono::high_resolution_clock::now();
if (num_reps > 1) {

View File

@ -270,8 +270,8 @@ bool DecodePAM(const char* filename, uint8_t** buffer, size_t* w, size_t* h,
const uint8_t* pos = nullptr;
if (!parser.ParseHeader(&header, &pos)) return false;
if (header.bits_per_sample == 0 || header.bits_per_sample > 12) {
return error_msg("PNM: bits_per_sample invalid (can do at most 12-bit)");
if (header.bits_per_sample == 0 || header.bits_per_sample > 16) {
return error_msg("PNM: bits_per_sample invalid (can do at most 16-bit)");
}
*w = header.xsize;
*h = header.ysize;

View File

@ -132,6 +132,15 @@ set(JPEGXL_COVERAGE_FLAGS
endif() # JPEGXL_ENABLE_COVERAGE
endif() #!MSVC
# strips the -static suffix from all the elements in LIST
function(strip_static OUTPUT_VAR LIB_LIST)
foreach(lib IN LISTS ${LIB_LIST})
string(REGEX REPLACE "-static$" "" lib "${lib}")
list(APPEND out_list "${lib}")
endforeach()
set(${OUTPUT_VAR} ${out_list} PARENT_SCOPE)
endfunction()
# The jxl library definition.
include(jxl.cmake)
@ -140,6 +149,11 @@ if(JPEGXL_ENABLE_TOOLS)
include(jxl_extras.cmake)
endif()
include(jxl_threads.cmake)
# We only build JPEGLI on linux for now.
find_package(JPEG)
if (JPEG_FOUND AND JPEGXL_ENABLE_JPEGLI)
include(jpegli.cmake)
endif()
# Install all the library headers from the source and the generated ones. There
# is no distinction on which libraries use which header since it is expected

View File

@ -14,14 +14,17 @@
#include <utility>
#include <vector>
#include "lib/extras/dec/jpegli.h"
#include "lib/extras/dec/pgx.h"
#include "lib/extras/dec/pnm.h"
#include "lib/extras/enc/encode.h"
#include "lib/extras/encode_jpeg.h"
#include "lib/extras/packed_image_convert.h"
#include "lib/jxl/base/printf_macros.h"
#include "lib/jxl/base/random.h"
#include "lib/jxl/base/thread_pool_internal.h"
#include "lib/jxl/color_management.h"
#include "lib/jxl/enc_butteraugli_comparator.h"
#include "lib/jxl/enc_color_management.h"
#include "lib/jxl/image.h"
#include "lib/jxl/image_bundle.h"
@ -174,6 +177,7 @@ struct TestImageParams {
bool add_alpha;
bool big_endian;
bool add_extra_channels;
bool jpegli_decode = false;
bool ShouldTestRoundtrip() const {
if (codec == Codec::kPNG) {
@ -273,11 +277,32 @@ void TestRoundTrip(const TestImageParams& params, ThreadPool* pool) {
color_hints.Add("color_space",
params.is_gray ? "Gra_D65_Rel_SRG" : "RGB_D65_SRG_Rel_SRG");
}
ASSERT_TRUE(DecodeBytes(Span<const uint8_t>(encoded.bitstreams[0]),
color_hints, SizeConstraints(), &ppf_out));
if (params.codec != Codec::kPNM && params.codec != Codec::kPGX &&
params.codec != Codec::kEXR) {
if (params.codec == Codec::kJPG && params.jpegli_decode) {
#if JPEGXL_ENABLE_JPEG
ASSERT_TRUE(
DecodeJpeg(encoded.bitstreams[0], JXL_TYPE_UINT8, pool, &ppf_out));
#endif
} else {
ASSERT_TRUE(DecodeBytes(Span<const uint8_t>(encoded.bitstreams[0]),
color_hints, SizeConstraints(), &ppf_out));
}
if (params.codec == Codec::kPNG && ppf_out.icc.empty()) {
// Decoding a PNG may drop the ICC profile if there's a valid cICP chunk.
// Rendering intent is not preserved in this case.
EXPECT_EQ(ppf_in.color_encoding.color_space,
ppf_out.color_encoding.color_space);
EXPECT_EQ(ppf_in.color_encoding.white_point,
ppf_out.color_encoding.white_point);
if (ppf_in.color_encoding.color_space != JXL_COLOR_SPACE_GRAY) {
EXPECT_EQ(ppf_in.color_encoding.primaries,
ppf_out.color_encoding.primaries);
}
EXPECT_EQ(ppf_in.color_encoding.transfer_function,
ppf_out.color_encoding.transfer_function);
EXPECT_EQ(ppf_out.color_encoding.rendering_intent,
JXL_RENDERING_INTENT_RELATIVE);
} else if (params.codec != Codec::kPNM && params.codec != Codec::kPGX &&
params.codec != Codec::kEXR) {
EXPECT_EQ(ppf_in.icc, ppf_out.icc);
}
@ -322,6 +347,10 @@ TEST(CodecTest, TestRoundTrip) {
params.add_extra_channels = true;
TestRoundTrip(params, &pool);
}
if (codec == Codec::kJPG) {
params.jpegli_decode = true;
TestRoundTrip(params, &pool);
}
}
}
}
@ -362,6 +391,78 @@ TEST(CodecTest, LosslessPNMRoundtrip) {
}
}
#if JPEGXL_ENABLE_JPEG
TEST(CodecTest, JpegliXYBEncodeTest) {
ThreadPool* pool = nullptr;
CodecInOut io;
const PaddedBytes orig =
ReadTestData("jxl/flower/flower_small.rgb.depth8.ppm");
ASSERT_TRUE(SetFromBytes(Span<const uint8_t>(orig), ColorHints(), &io));
std::vector<uint8_t> compressed;
JpegSettings settings;
settings.xyb = true;
ASSERT_TRUE(EncodeJpeg(io.Main(), settings, pool, &compressed));
CodecInOut io2;
ASSERT_TRUE(
SetFromBytes(Span<const uint8_t>(compressed), ColorHints(), &io2));
double bpp = compressed.size() * 8.0 / (io.xsize() * io.ysize());
EXPECT_THAT(bpp, IsSlightlyBelow(1.5f));
EXPECT_THAT(ButteraugliDistance(io, io2, ButteraugliParams(), GetJxlCms(),
/*distmap=*/nullptr, nullptr),
IsSlightlyBelow(1.3f));
}
TEST(CodecTest, JpegliYUVEncodeTest) {
ThreadPool* pool = nullptr;
CodecInOut io;
const PaddedBytes orig =
ReadTestData("jxl/flower/flower_small.rgb.depth8.ppm");
ASSERT_TRUE(SetFromBytes(Span<const uint8_t>(orig), ColorHints(), &io));
std::vector<uint8_t> compressed;
JpegSettings settings;
settings.xyb = false;
ASSERT_TRUE(EncodeJpeg(io.Main(), settings, pool, &compressed));
CodecInOut io2;
ASSERT_TRUE(
SetFromBytes(Span<const uint8_t>(compressed), ColorHints(), &io2));
double bpp = compressed.size() * 8.0 / (io.xsize() * io.ysize());
EXPECT_THAT(bpp, IsSlightlyBelow(2.3f));
EXPECT_THAT(ButteraugliDistance(io, io2, ButteraugliParams(), GetJxlCms(),
/*distmap=*/nullptr, nullptr),
IsSlightlyBelow(1.3f));
}
TEST(CodecTest, Jpegli16bitRoundtripTest) {
ThreadPool* pool = nullptr;
CodecInOut io;
const PaddedBytes orig = ReadTestData(
"external/raw.pixls/"
"Google-Pixel2XL-16bit_srgb8_v4_krita.png");
ASSERT_TRUE(SetFromBytes(Span<const uint8_t>(orig), ColorHints(), &io));
std::vector<uint8_t> compressed;
JpegSettings settings;
settings.xyb = false;
ASSERT_TRUE(EncodeJpeg(io.Main(), settings, pool, &compressed));
PackedPixelFile ppf_out;
ASSERT_TRUE(DecodeJpeg(compressed, JXL_TYPE_UINT16, pool, &ppf_out));
CodecInOut io2;
ASSERT_TRUE(ConvertPackedPixelFileToCodecInOut(ppf_out, pool, &io2));
EXPECT_THAT(compressed.size(), IsSlightlyBelow(3500u));
EXPECT_THAT(ButteraugliDistance(io, io2, ButteraugliParams(), GetJxlCms(),
/*distmap=*/nullptr, nullptr),
IsSlightlyBelow(1.13f));
}
#endif
CodecInOut DecodeRoundtrip(const std::string& pathname, ThreadPool* pool,
const ColorHints& color_hints = ColorHints()) {
CodecInOut io;

View File

@ -76,11 +76,145 @@ Status DecodeSRGB(const unsigned char* payload, const size_t payload_size,
if (payload_size != 1) return JXL_FAILURE("Wrong sRGB size");
// (PNG uses the same values as ICC.)
if (payload[0] >= 4) return JXL_FAILURE("Invalid Rendering Intent");
color_encoding->white_point = JXL_WHITE_POINT_D65;
color_encoding->primaries = JXL_PRIMARIES_SRGB;
color_encoding->transfer_function = JXL_TRANSFER_FUNCTION_SRGB;
color_encoding->rendering_intent =
static_cast<JxlRenderingIntent>(payload[0]);
return true;
}
// If the cICP profile is not fully supported, return false and leave
// color_encoding unmodified.
Status DecodeCICP(const unsigned char* payload, const size_t payload_size,
JxlColorEncoding* color_encoding) {
if (payload_size != 4) return JXL_FAILURE("Wrong cICP size");
JxlColorEncoding color_enc = *color_encoding;
// From https://www.itu.int/rec/T-REC-H.273-202107-I/en
if (payload[0] == 1) {
// IEC 61966-2-1 sRGB
color_enc.primaries = JXL_PRIMARIES_SRGB;
color_enc.white_point = JXL_WHITE_POINT_D65;
} else if (payload[0] == 4) {
// Rec. ITU-R BT.470-6 System M
color_enc.primaries = JXL_PRIMARIES_CUSTOM;
color_enc.primaries_red_xy[0] = 0.67;
color_enc.primaries_red_xy[1] = 0.33;
color_enc.primaries_green_xy[0] = 0.21;
color_enc.primaries_green_xy[1] = 0.71;
color_enc.primaries_blue_xy[0] = 0.14;
color_enc.primaries_blue_xy[1] = 0.08;
color_enc.white_point = JXL_WHITE_POINT_CUSTOM;
color_enc.white_point_xy[0] = 0.310;
color_enc.white_point_xy[1] = 0.316;
} else if (payload[0] == 5) {
// Rec. ITU-R BT.1700-0 625 PAL and 625 SECAM
color_enc.primaries = JXL_PRIMARIES_CUSTOM;
color_enc.primaries_red_xy[0] = 0.64;
color_enc.primaries_red_xy[1] = 0.33;
color_enc.primaries_green_xy[0] = 0.29;
color_enc.primaries_green_xy[1] = 0.60;
color_enc.primaries_blue_xy[0] = 0.15;
color_enc.primaries_blue_xy[1] = 0.06;
color_enc.white_point = JXL_WHITE_POINT_D65;
} else if (payload[0] == 6 || payload[0] == 7) {
// SMPTE ST 170 (2004) / SMPTE ST 240 (1999)
color_enc.primaries = JXL_PRIMARIES_CUSTOM;
color_enc.primaries_red_xy[0] = 0.630;
color_enc.primaries_red_xy[1] = 0.340;
color_enc.primaries_green_xy[0] = 0.310;
color_enc.primaries_green_xy[1] = 0.595;
color_enc.primaries_blue_xy[0] = 0.155;
color_enc.primaries_blue_xy[1] = 0.070;
color_enc.white_point = JXL_WHITE_POINT_D65;
} else if (payload[0] == 8) {
// Generic film (colour filters using Illuminant C)
color_enc.primaries = JXL_PRIMARIES_CUSTOM;
color_enc.primaries_red_xy[0] = 0.681;
color_enc.primaries_red_xy[1] = 0.319;
color_enc.primaries_green_xy[0] = 0.243;
color_enc.primaries_green_xy[1] = 0.692;
color_enc.primaries_blue_xy[0] = 0.145;
color_enc.primaries_blue_xy[1] = 0.049;
color_enc.white_point = JXL_WHITE_POINT_CUSTOM;
color_enc.white_point_xy[0] = 0.310;
color_enc.white_point_xy[1] = 0.316;
} else if (payload[0] == 9) {
// Rec. ITU-R BT.2100-2
color_enc.primaries = JXL_PRIMARIES_2100;
color_enc.white_point = JXL_WHITE_POINT_D65;
} else if (payload[0] == 10) {
// CIE 1931 XYZ
color_enc.primaries = JXL_PRIMARIES_CUSTOM;
color_enc.primaries_red_xy[0] = 1;
color_enc.primaries_red_xy[1] = 0;
color_enc.primaries_green_xy[0] = 0;
color_enc.primaries_green_xy[1] = 1;
color_enc.primaries_blue_xy[0] = 0;
color_enc.primaries_blue_xy[1] = 0;
color_enc.white_point = JXL_WHITE_POINT_E;
} else if (payload[0] == 11) {
// SMPTE RP 431-2 (2011)
color_enc.primaries = JXL_PRIMARIES_P3;
color_enc.white_point = JXL_WHITE_POINT_DCI;
} else if (payload[0] == 12) {
// SMPTE EG 432-1 (2010)
color_enc.primaries = JXL_PRIMARIES_P3;
color_enc.white_point = JXL_WHITE_POINT_D65;
} else if (payload[0] == 22) {
color_enc.primaries = JXL_PRIMARIES_CUSTOM;
color_enc.primaries_red_xy[0] = 0.630;
color_enc.primaries_red_xy[1] = 0.340;
color_enc.primaries_green_xy[0] = 0.295;
color_enc.primaries_green_xy[1] = 0.605;
color_enc.primaries_blue_xy[0] = 0.155;
color_enc.primaries_blue_xy[1] = 0.077;
color_enc.white_point = JXL_WHITE_POINT_D65;
} else {
JXL_WARNING("Unsupported primaries specified in cICP chunk: %d",
static_cast<int>(payload[0]));
return false;
}
if (payload[1] == 1 || payload[1] == 6 || payload[1] == 14 ||
payload[1] == 15) {
// Rec. ITU-R BT.709-6
color_enc.transfer_function = JXL_TRANSFER_FUNCTION_709;
} else if (payload[1] == 4) {
// Rec. ITU-R BT.1700-0 625 PAL and 625 SECAM
color_enc.transfer_function = JXL_TRANSFER_FUNCTION_GAMMA;
color_enc.gamma = 1 / 2.2;
} else if (payload[1] == 5) {
// Rec. ITU-R BT.470-6 System B, G
color_enc.transfer_function = JXL_TRANSFER_FUNCTION_GAMMA;
color_enc.gamma = 1 / 2.8;
} else if (payload[1] == 8 || payload[1] == 13 || payload[1] == 16 ||
payload[1] == 17 || payload[1] == 18) {
// These codes all match the corresponding JXL enum values
color_enc.transfer_function = static_cast<JxlTransferFunction>(payload[1]);
} else {
JXL_WARNING("Unsupported transfer function specified in cICP chunk: %d",
static_cast<int>(payload[1]));
return false;
}
if (payload[2] != 0) {
JXL_WARNING("Unsupported color space specified in cICP chunk: %d",
static_cast<int>(payload[2]));
return false;
}
if (payload[3] != 1) {
JXL_WARNING("Unsupported full-range flag specified in cICP chunk: %d",
static_cast<int>(payload[3]));
return false;
}
// cICP has no rendering intent, so use the default
color_enc.rendering_intent = JXL_RENDERING_INTENT_RELATIVE;
*color_encoding = color_enc;
return true;
}
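// For instance, a cICP payload of {1, 13, 0, 1} (sRGB primaries, sRGB transfer
// function, RGB encoding, full range) maps to JXL_PRIMARIES_SRGB,
// JXL_WHITE_POINT_D65, JXL_TRANSFER_FUNCTION_SRGB and
// JXL_RENDERING_INTENT_RELATIVE through the branches above.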
Status DecodeGAMA(const unsigned char* payload, const size_t payload_size,
JxlColorEncoding* color_encoding) {
if (payload_size != 4) return JXL_FAILURE("Wrong gAMA size");
@ -286,6 +420,7 @@ constexpr uint32_t kId_fcTL = 0x4C546366;
constexpr uint32_t kId_IDAT = 0x54414449;
constexpr uint32_t kId_fdAT = 0x54416466;
constexpr uint32_t kId_IEND = 0x444E4549;
constexpr uint32_t kId_cICP = 0x50434963;
constexpr uint32_t kId_iCCP = 0x50434369;
constexpr uint32_t kId_sRGB = 0x42475273;
constexpr uint32_t kId_gAMA = 0x414D4167;
@ -469,7 +604,8 @@ Status DecodeImageAPNG(const Span<const uint8_t> bytes,
ppf->frames.clear();
bool have_color = false, have_srgb = false;
bool have_color = false;
bool have_cicp = false, have_iccp = false, have_srgb = false;
bool errorstate = true;
if (id == kId_IHDR && chunkIHDR.size() == 25) {
x0 = 0;
@ -490,6 +626,7 @@ Status DecodeImageAPNG(const Span<const uint8_t> bytes,
ppf->color_encoding.white_point = JXL_WHITE_POINT_D65;
ppf->color_encoding.primaries = JXL_PRIMARIES_SRGB;
ppf->color_encoding.transfer_function = JXL_TRANSFER_FUNCTION_SRGB;
ppf->color_encoding.rendering_intent = JXL_RENDERING_INTENT_RELATIVE;
if (!processing_start(png_ptr, info_ptr, (void*)&frameRaw, hasInfo,
chunkIHDR, chunksInfo)) {
@ -625,7 +762,17 @@ Status DecodeImageAPNG(const Span<const uint8_t> bytes,
chunk.size() - 4)) {
break;
}
} else if (id == kId_iCCP) {
} else if (id == kId_cICP) {
// Color profile chunks: cICP has the highest priority, followed by
// iCCP and sRGB (which shouldn't co-exist, but if they do, we use
// iCCP), followed finally by gAMA and cHRM.
if (DecodeCICP(chunk.data() + 8, chunk.size() - 12,
&ppf->color_encoding)) {
have_cicp = true;
have_color = true;
ppf->icc.clear();
}
} else if (!have_cicp && id == kId_iCCP) {
if (processing_data(png_ptr, info_ptr, chunk.data(), chunk.size())) {
JXL_WARNING("Corrupt iCCP chunk");
break;
@ -642,19 +789,20 @@ Status DecodeImageAPNG(const Span<const uint8_t> bytes,
if (ok && proflen) {
ppf->icc.assign(profile, profile + proflen);
have_color = true;
have_iccp = true;
} else {
// TODO(eustas): JXL_WARNING?
}
} else if (id == kId_sRGB) {
} else if (!have_cicp && !have_iccp && id == kId_sRGB) {
JXL_RETURN_IF_ERROR(DecodeSRGB(chunk.data() + 8, chunk.size() - 12,
&ppf->color_encoding));
have_srgb = true;
have_color = true;
} else if (id == kId_gAMA) {
} else if (!have_cicp && !have_srgb && !have_iccp && id == kId_gAMA) {
JXL_RETURN_IF_ERROR(DecodeGAMA(chunk.data() + 8, chunk.size() - 12,
&ppf->color_encoding));
have_color = true;
} else if (id == kId_cHRM) {
} else if (!have_cicp && !have_srgb && !have_iccp && id == kId_cHRM) {
JXL_RETURN_IF_ERROR(DecodeCHRM(chunk.data() + 8, chunk.size() - 12,
&ppf->color_encoding));
have_color = true;
@ -677,12 +825,6 @@ Status DecodeImageAPNG(const Span<const uint8_t> bytes,
}
}
if (have_srgb) {
ppf->color_encoding.white_point = JXL_WHITE_POINT_D65;
ppf->color_encoding.primaries = JXL_PRIMARIES_SRGB;
ppf->color_encoding.transfer_function = JXL_TRANSFER_FUNCTION_SRGB;
ppf->color_encoding.rendering_intent = JXL_RENDERING_INTENT_PERCEPTUAL;
}
JXL_RETURN_IF_ERROR(ApplyColorHints(
color_hints, have_color, ppf->info.num_color_channels == 1, ppf));
}

View File

@ -107,7 +107,8 @@ Status DecodeBytes(const Span<const uint8_t> bytes,
}
#endif
#if JPEGXL_ENABLE_JPEG
else if (DecodeImageJPG(bytes, color_hints, constraints, ppf)) {
else if (DecodeImageJPG(bytes, color_hints, constraints,
/*output_bit_depth=*/8, ppf)) {
codec = Codec::kJPG;
}
#endif

View File

@ -0,0 +1,209 @@
// Copyright (c) the JPEG XL Project Authors. All rights reserved.
//
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
#include "lib/extras/dec/jpegli.h"
#include <setjmp.h>
#include <stdint.h>
#include <algorithm>
#include <numeric>
#include <utility>
#include <vector>
#include "lib/jpegli/decode.h"
#include "lib/jxl/base/status.h"
#include "lib/jxl/sanitizers.h"
namespace jxl {
namespace extras {
namespace {
constexpr unsigned char kExifSignature[6] = {0x45, 0x78, 0x69,
0x66, 0x00, 0x00};
constexpr int kExifMarker = JPEG_APP0 + 1;
constexpr int kICCMarker = JPEG_APP0 + 2;
static inline bool IsJPG(const std::vector<uint8_t>& bytes) {
if (bytes.size() < 2) return false;
if (bytes[0] != 0xFF || bytes[1] != 0xD8) return false;
return true;
}
bool MarkerIsExif(const jpeg_saved_marker_ptr marker) {
return marker->marker == kExifMarker &&
marker->data_length >= sizeof kExifSignature + 2 &&
std::equal(std::begin(kExifSignature), std::end(kExifSignature),
marker->data);
}
Status ReadICCProfile(jpeg_decompress_struct* const cinfo,
std::vector<uint8_t>* const icc) {
uint8_t* icc_data_ptr;
unsigned int icc_data_len;
if (jpegli_read_icc_profile(cinfo, &icc_data_ptr, &icc_data_len)) {
icc->assign(icc_data_ptr, icc_data_ptr + icc_data_len);
free(icc_data_ptr);
return true;
}
return false;
}
void ReadExif(jpeg_decompress_struct* const cinfo,
std::vector<uint8_t>* const exif) {
constexpr size_t kExifSignatureSize = sizeof kExifSignature;
for (jpeg_saved_marker_ptr marker = cinfo->marker_list; marker != nullptr;
marker = marker->next) {
// marker is initialized by libjpeg, which we are not instrumenting with
// msan.
msan::UnpoisonMemory(marker, sizeof(*marker));
msan::UnpoisonMemory(marker->data, marker->data_length);
if (!MarkerIsExif(marker)) continue;
size_t marker_length = marker->data_length - kExifSignatureSize;
exif->resize(marker_length);
std::copy_n(marker->data + kExifSignatureSize, marker_length, exif->data());
return;
}
}
void MyErrorExit(j_common_ptr cinfo) {
jmp_buf* env = static_cast<jmp_buf*>(cinfo->client_data);
(*cinfo->err->output_message)(cinfo);
jpegli_destroy_decompress(reinterpret_cast<j_decompress_ptr>(cinfo));
longjmp(*env, 1);
}
void MyOutputMessage(j_common_ptr cinfo) {
#if JXL_DEBUG_WARNING == 1
char buf[JMSG_LENGTH_MAX + 1];
(*cinfo->err->format_message)(cinfo, buf);
buf[JMSG_LENGTH_MAX] = 0;
JXL_WARNING("%s", buf);
#endif
}
} // namespace
Status DecodeJpeg(const std::vector<uint8_t>& compressed,
JxlDataType output_data_type, ThreadPool* pool,
PackedPixelFile* ppf) {
// Don't do anything for non-JPEG files (no need to report an error)
if (!IsJPG(compressed)) return false;
// TODO(veluca): use JPEGData also for pixels?
// We need to declare all the non-trivial destructor local variables before
// the call to setjmp().
std::unique_ptr<JSAMPLE[]> row;
const auto try_catch_block = [&]() -> bool {
jpeg_decompress_struct cinfo;
// cinfo is initialized by libjpeg, which we are not instrumenting with
// msan, therefore we need to initialize cinfo here.
msan::UnpoisonMemory(&cinfo, sizeof(cinfo));
// Setup error handling in jpeg library so we can deal with broken jpegs in
// the fuzzer.
jpeg_error_mgr jerr;
jmp_buf env;
cinfo.err = jpegli_std_error(&jerr);
jerr.error_exit = &MyErrorExit;
jerr.output_message = &MyOutputMessage;
if (setjmp(env)) {
return false;
}
cinfo.client_data = static_cast<void*>(&env);
jpegli_create_decompress(&cinfo);
jpegli_mem_src(&cinfo,
reinterpret_cast<const unsigned char*>(compressed.data()),
compressed.size());
jpegli_save_markers(&cinfo, kICCMarker, 0xFFFF);
jpegli_save_markers(&cinfo, kExifMarker, 0xFFFF);
const auto failure = [&cinfo](const char* str) -> Status {
jpegli_abort_decompress(&cinfo);
jpegli_destroy_decompress(&cinfo);
return JXL_FAILURE("%s", str);
};
jpegli_read_header(&cinfo, TRUE);
// Might cause CPU-zip bomb.
if (cinfo.arith_code) {
return failure("arithmetic code JPEGs are not supported");
}
int nbcomp = cinfo.num_components;
if (nbcomp != 1 && nbcomp != 3) {
return failure("unsupported number of components in JPEG");
}
if (!ReadICCProfile(&cinfo, &ppf->icc)) {
ppf->icc.clear();
// Default to SRGB
// Actually, (cinfo.output_components == nbcomp) will be checked after
// `jpegli_start_decompress`.
ppf->color_encoding.color_space =
(nbcomp == 1) ? JXL_COLOR_SPACE_GRAY : JXL_COLOR_SPACE_RGB;
ppf->color_encoding.white_point = JXL_WHITE_POINT_D65;
ppf->color_encoding.primaries = JXL_PRIMARIES_SRGB;
ppf->color_encoding.transfer_function = JXL_TRANSFER_FUNCTION_SRGB;
ppf->color_encoding.rendering_intent = JXL_RENDERING_INTENT_PERCEPTUAL;
}
ReadExif(&cinfo, &ppf->metadata.exif);
ppf->info.xsize = cinfo.image_width;
ppf->info.ysize = cinfo.image_height;
if (output_data_type == JXL_TYPE_UINT8) {
ppf->info.bits_per_sample = 8;
} else if (output_data_type == JXL_TYPE_UINT16) {
ppf->info.bits_per_sample = 16;
} else {
return failure("unsupported data type");
}
ppf->info.exponent_bits_per_sample = 0;
ppf->info.uses_original_profile = true;
// No alpha in JPG
ppf->info.alpha_bits = 0;
ppf->info.alpha_exponent_bits = 0;
ppf->info.num_color_channels = nbcomp;
ppf->info.orientation = JXL_ORIENT_IDENTITY;
// Set output bit depth.
cinfo.quantize_colors = FALSE;
cinfo.desired_number_of_colors = 1 << ppf->info.bits_per_sample;
jpegli_start_decompress(&cinfo);
JXL_ASSERT(cinfo.output_components == nbcomp);
const JxlPixelFormat format{
/*num_channels=*/static_cast<uint32_t>(nbcomp),
output_data_type,
/*endianness=*/JXL_NATIVE_ENDIAN,
/*align=*/0,
};
ppf->frames.clear();
// Allocates the frame buffer.
ppf->frames.emplace_back(cinfo.image_width, cinfo.image_height, format);
const auto& frame = ppf->frames.back();
JXL_ASSERT(sizeof(JSAMPLE) * cinfo.output_components * cinfo.image_width <=
frame.color.stride);
for (size_t y = 0; y < cinfo.image_height; ++y) {
JSAMPROW rows[] = {reinterpret_cast<JSAMPLE*>(
static_cast<uint8_t*>(frame.color.pixels()) +
frame.color.stride * y)};
jpegli_read_scanlines(&cinfo, rows, 1);
msan::UnpoisonMemory(rows[0], sizeof(JSAMPLE) * cinfo.output_components *
cinfo.image_width);
}
jpegli_finish_decompress(&cinfo);
jpegli_destroy_decompress(&cinfo);
return true;
};
return try_catch_block();
}
} // namespace extras
} // namespace jxl

View File

@ -0,0 +1,30 @@
// Copyright (c) the JPEG XL Project Authors. All rights reserved.
//
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
#ifndef LIB_EXTRAS_DEC_JPEGLI_H_
#define LIB_EXTRAS_DEC_JPEGLI_H_
// Decodes JPG pixels and metadata in memory using the libjpegli library.
#include <stdint.h>
#include <vector>
#include "jxl/types.h"
#include "lib/extras/packed_image.h"
#include "lib/jxl/base/data_parallel.h"
#include "lib/jxl/base/status.h"
namespace jxl {
namespace extras {
Status DecodeJpeg(const std::vector<uint8_t>& compressed,
JxlDataType output_data_type, ThreadPool* pool,
PackedPixelFile* ppf);
} // namespace extras
} // namespace jxl
#endif // LIB_EXTRAS_DEC_JPEGLI_H_
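A minimal sketch of calling this API (codec_test.cc above passes a null
ThreadPool in the same way; error handling is elided):

  #include <vector>
  #include "lib/extras/dec/jpegli.h"

  jxl::Status DecodeTo16Bit(const std::vector<uint8_t>& jpeg_bytes,
                            jxl::extras::PackedPixelFile* ppf) {
    // Request 16-bit output samples; decode on the calling thread.
    return jxl::extras::DecodeJpeg(jpeg_bytes, JXL_TYPE_UINT16,
                                   /*pool=*/nullptr, ppf);
  }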

View File

@ -165,7 +165,7 @@ void MyOutputMessage(j_common_ptr cinfo) {
Status DecodeImageJPG(const Span<const uint8_t> bytes,
const ColorHints& color_hints,
const SizeConstraints& constraints,
PackedPixelFile* ppf) {
size_t output_bit_depth, PackedPixelFile* ppf) {
// Don't do anything for non-JPEG files (no need to report an error)
if (!IsJPG(bytes)) return false;
@ -175,6 +175,10 @@ Status DecodeImageJPG(const Span<const uint8_t> bytes,
// the call to setjmp().
std::unique_ptr<JSAMPLE[]> row;
if (output_bit_depth == 0 || output_bit_depth > 16) {
return JXL_FAILURE("Invalid output bitdepth");
}
const auto try_catch_block = [&]() -> bool {
jpeg_decompress_struct cinfo;
// cinfo is initialized by libjpeg, which we are not instrumenting with
@ -252,12 +256,24 @@ Status DecodeImageJPG(const Span<const uint8_t> bytes,
ppf->info.num_color_channels = nbcomp;
ppf->info.orientation = JXL_ORIENT_IDENTITY;
// Try setting output bit depth. In libjpeg-turbo, this combination of
// parameters will be ignored, but in libjpegli it will override output bit
// depth.
cinfo.quantize_colors = FALSE;
cinfo.desired_number_of_colors = 1 << output_bit_depth;
jpeg_start_decompress(&cinfo);
JXL_ASSERT(cinfo.output_components == nbcomp);
if (cinfo.desired_number_of_colors == 0) {
// We know that the output bit depth was set because
// desired_number_of_colors was reset to zero by libjpegli.
ppf->info.bits_per_sample = output_bit_depth;
}
JxlDataType data_type =
ppf->info.bits_per_sample <= 8 ? JXL_TYPE_UINT8 : JXL_TYPE_UINT16;
const JxlPixelFormat format{
/*num_channels=*/static_cast<uint32_t>(nbcomp),
/*data_type=*/BITS_IN_JSAMPLE == 8 ? JXL_TYPE_UINT8 : JXL_TYPE_UINT16,
data_type,
/*endianness=*/JXL_NATIVE_ENDIAN,
/*align=*/0,
};

View File

@ -25,7 +25,8 @@ namespace extras {
// `elapsed_deinterleave`, if non-null, will be set to the time (in seconds)
// that it took to deinterleave the raw JSAMPLEs to planar floats.
Status DecodeImageJPG(Span<const uint8_t> bytes, const ColorHints& color_hints,
const SizeConstraints& constraints, PackedPixelFile* ppf);
const SizeConstraints& constraints,
size_t output_bit_depth, PackedPixelFile* ppf);
} // namespace extras
} // namespace jxl
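A minimal sketch of requesting 16-bit output through the new output_bit_depth
parameter, mirroring the call in dec/decode.cc above (assumed to live inside
namespace jxl::extras; `jpeg_bytes` holds the compressed file):

  PackedPixelFile ppf;
  Status ok = DecodeImageJPG(
      Span<const uint8_t>(jpeg_bytes.data(), jpeg_bytes.size()), ColorHints(),
      SizeConstraints(), /*output_bit_depth=*/16, &ppf);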

View File

@ -223,7 +223,7 @@ bool DecodeImageJXL(const uint8_t* bytes, size_t bytes_size,
fprintf(stderr,
"Input file is truncated (total bytes: %" PRIuS
", processed bytes: %" PRIuS
") and allow_partial_input was disabled.",
") and --allow_partial_files is not present.\n",
bytes_size, bytes_size - released_size);
return false;
} else if (status == JXL_DEC_BOX) {

View File

@ -1,559 +0,0 @@
// Copyright (c) the JPEG XL Project Authors. All rights reserved.
//
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
#include "lib/extras/dec_group_jpeg.h"
#include <stdint.h>
#include <string.h>
#include <algorithm>
#include <memory>
#include <utility>
#ifdef MEMORY_SANITIZER
#define JXL_MEMORY_SANITIZER 1
#elif defined(__has_feature)
#if __has_feature(memory_sanitizer)
#define JXL_MEMORY_SANITIZER 1
#else
#define JXL_MEMORY_SANITIZER 0
#endif
#else
#define JXL_MEMORY_SANITIZER 0
#endif
#if JXL_MEMORY_SANITIZER
#include "sanitizer/msan_interface.h"
#endif
#undef HWY_TARGET_INCLUDE
#define HWY_TARGET_INCLUDE "lib/extras/dec_group_jpeg.cc"
#include <hwy/foreach_target.h>
#include <hwy/highway.h>
HWY_BEFORE_NAMESPACE();
namespace jxl {
namespace HWY_NAMESPACE {
// These templates are not found via ADL.
using hwy::HWY_NAMESPACE::Abs;
using hwy::HWY_NAMESPACE::Add;
using hwy::HWY_NAMESPACE::Clamp;
using hwy::HWY_NAMESPACE::Gt;
using hwy::HWY_NAMESPACE::IfThenElseZero;
using hwy::HWY_NAMESPACE::Mul;
using hwy::HWY_NAMESPACE::MulAdd;
using hwy::HWY_NAMESPACE::NearestInt;
using hwy::HWY_NAMESPACE::NegMulAdd;
using hwy::HWY_NAMESPACE::Rebind;
using hwy::HWY_NAMESPACE::Sub;
using hwy::HWY_NAMESPACE::Vec;
using hwy::HWY_NAMESPACE::Xor;
using D = HWY_FULL(float);
using DI = HWY_FULL(int32_t);
constexpr D d;
constexpr DI di;
using D8 = HWY_CAPPED(float, 8);
constexpr D8 d8;
void GatherBlockStats(const int16_t* JXL_RESTRICT coeffs,
const size_t coeffs_size, int32_t* JXL_RESTRICT nonzeros,
int32_t* JXL_RESTRICT sumabs) {
for (size_t i = 0; i < coeffs_size; i += Lanes(d)) {
size_t k = i % kDCTBlockSize;
const Rebind<int16_t, DI> di16;
const Vec<DI> coeff = PromoteTo(di, Load(di16, coeffs + i));
const auto abs_coeff = Abs(coeff);
const auto not_0 = Gt(abs_coeff, Zero(di));
const auto nzero = IfThenElseZero(not_0, Set(di, 1));
Store(Add(nzero, Load(di, nonzeros + k)), di, nonzeros + k);
Store(Add(abs_coeff, Load(di, sumabs + k)), di, sumabs + k);
}
}
void DequantBlock(const int16_t* JXL_RESTRICT qblock,
const float* JXL_RESTRICT dequant,
const float* JXL_RESTRICT biases, float* JXL_RESTRICT block) {
for (size_t k = 0; k < kDCTBlockSize; k += Lanes(d)) {
const auto mul = Load(d, dequant + k);
const auto bias = Load(d, biases + k);
const Rebind<int16_t, DI> di16;
const Vec<DI> quant_i = PromoteTo(di, Load(di16, qblock + k));
const Rebind<float, DI> df;
const auto quant = ConvertTo(df, quant_i);
const auto abs_quant = Abs(quant);
const auto not_0 = Gt(abs_quant, Zero(df));
const auto sign_quant = Xor(quant, abs_quant);
const auto biased_quant = Sub(quant, Xor(bias, sign_quant));
const auto dequant = IfThenElseZero(not_0, Mul(biased_quant, mul));
Store(dequant, d, block + k);
}
}
#if HWY_CAP_GE256
JXL_INLINE void Transpose8x8Block(const float* JXL_RESTRICT from,
float* JXL_RESTRICT to) {
const D8 d;
auto i0 = Load(d, from);
auto i1 = Load(d, from + 1 * 8);
auto i2 = Load(d, from + 2 * 8);
auto i3 = Load(d, from + 3 * 8);
auto i4 = Load(d, from + 4 * 8);
auto i5 = Load(d, from + 5 * 8);
auto i6 = Load(d, from + 6 * 8);
auto i7 = Load(d, from + 7 * 8);
const auto q0 = InterleaveLower(d, i0, i2);
const auto q1 = InterleaveLower(d, i1, i3);
const auto q2 = InterleaveUpper(d, i0, i2);
const auto q3 = InterleaveUpper(d, i1, i3);
const auto q4 = InterleaveLower(d, i4, i6);
const auto q5 = InterleaveLower(d, i5, i7);
const auto q6 = InterleaveUpper(d, i4, i6);
const auto q7 = InterleaveUpper(d, i5, i7);
const auto r0 = InterleaveLower(d, q0, q1);
const auto r1 = InterleaveUpper(d, q0, q1);
const auto r2 = InterleaveLower(d, q2, q3);
const auto r3 = InterleaveUpper(d, q2, q3);
const auto r4 = InterleaveLower(d, q4, q5);
const auto r5 = InterleaveUpper(d, q4, q5);
const auto r6 = InterleaveLower(d, q6, q7);
const auto r7 = InterleaveUpper(d, q6, q7);
i0 = ConcatLowerLower(d, r4, r0);
i1 = ConcatLowerLower(d, r5, r1);
i2 = ConcatLowerLower(d, r6, r2);
i3 = ConcatLowerLower(d, r7, r3);
i4 = ConcatUpperUpper(d, r4, r0);
i5 = ConcatUpperUpper(d, r5, r1);
i6 = ConcatUpperUpper(d, r6, r2);
i7 = ConcatUpperUpper(d, r7, r3);
Store(i0, d, to);
Store(i1, d, to + 1 * 8);
Store(i2, d, to + 2 * 8);
Store(i3, d, to + 3 * 8);
Store(i4, d, to + 4 * 8);
Store(i5, d, to + 5 * 8);
Store(i6, d, to + 6 * 8);
Store(i7, d, to + 7 * 8);
}
#elif HWY_TARGET != HWY_SCALAR
JXL_INLINE void Transpose8x8Block(const float* JXL_RESTRICT from,
float* JXL_RESTRICT to) {
const HWY_CAPPED(float, 4) d;
for (size_t n = 0; n < 8; n += 4) {
for (size_t m = 0; m < 8; m += 4) {
auto p0 = Load(d, from + n * 8 + m);
auto p1 = Load(d, from + (n + 1) * 8 + m);
auto p2 = Load(d, from + (n + 2) * 8 + m);
auto p3 = Load(d, from + (n + 3) * 8 + m);
const auto q0 = InterleaveLower(d, p0, p2);
const auto q1 = InterleaveLower(d, p1, p3);
const auto q2 = InterleaveUpper(d, p0, p2);
const auto q3 = InterleaveUpper(d, p1, p3);
const auto r0 = InterleaveLower(d, q0, q1);
const auto r1 = InterleaveUpper(d, q0, q1);
const auto r2 = InterleaveLower(d, q2, q3);
const auto r3 = InterleaveUpper(d, q2, q3);
Store(r0, d, to + m * 8 + n);
Store(r1, d, to + (1 + m) * 8 + n);
Store(r2, d, to + (2 + m) * 8 + n);
Store(r3, d, to + (3 + m) * 8 + n);
}
}
}
#else
JXL_INLINE void Transpose8x8Block(const float* JXL_RESTRICT from,
float* JXL_RESTRICT to) {
for (size_t n = 0; n < 8; ++n) {
for (size_t m = 0; m < 8; ++m) {
to[8 * n + m] = from[8 * m + n];
}
}
}
#endif
template <size_t N>
void ForwardEvenOdd(const float* JXL_RESTRICT ain, size_t ain_stride,
float* JXL_RESTRICT aout) {
for (size_t i = 0; i < N / 2; i++) {
auto in1 = LoadU(d8, ain + 2 * i * ain_stride);
Store(in1, d8, aout + i * 8);
}
for (size_t i = N / 2; i < N; i++) {
auto in1 = LoadU(d8, ain + (2 * (i - N / 2) + 1) * ain_stride);
Store(in1, d8, aout + i * 8);
}
}
template <size_t N>
void BTranspose(float* JXL_RESTRICT coeff) {
for (size_t i = N - 1; i > 0; i--) {
auto in1 = Load(d8, coeff + i * 8);
auto in2 = Load(d8, coeff + (i - 1) * 8);
Store(Add(in1, in2), d8, coeff + i * 8);
}
constexpr float kSqrt2 = 1.41421356237f;
auto sqrt2 = Set(d8, kSqrt2);
auto in1 = Load(d8, coeff);
Store(Mul(in1, sqrt2), d8, coeff);
}
// Constants for DCT implementation. Generated by the following snippet:
// for i in range(N // 2):
// print(1.0 / (2 * math.cos((i + 0.5) * math.pi / N)), end=", ")
template <size_t N>
struct WcMultipliers;
template <>
struct WcMultipliers<4> {
static constexpr float kMultipliers[] = {
0.541196100146197,
1.3065629648763764,
};
};
template <>
struct WcMultipliers<8> {
static constexpr float kMultipliers[] = {
0.5097955791041592,
0.6013448869350453,
0.8999762231364156,
2.5629154477415055,
};
};
constexpr float WcMultipliers<4>::kMultipliers[];
constexpr float WcMultipliers<8>::kMultipliers[];
template <size_t N>
void MultiplyAndAdd(const float* JXL_RESTRICT coeff, float* JXL_RESTRICT out,
size_t out_stride) {
for (size_t i = 0; i < N / 2; i++) {
auto mul = Set(d8, WcMultipliers<N>::kMultipliers[i]);
auto in1 = Load(d8, coeff + i * 8);
auto in2 = Load(d8, coeff + (N / 2 + i) * 8);
auto out1 = MulAdd(mul, in2, in1);
auto out2 = NegMulAdd(mul, in2, in1);
StoreU(out1, d8, out + i * out_stride);
StoreU(out2, d8, out + (N - i - 1) * out_stride);
}
}
template <size_t N>
struct IDCT1DImpl;
template <>
struct IDCT1DImpl<1> {
JXL_INLINE void operator()(const float* from, size_t from_stride, float* to,
size_t to_stride) {
StoreU(LoadU(d8, from), d8, to);
}
};
template <>
struct IDCT1DImpl<2> {
JXL_INLINE void operator()(const float* from, size_t from_stride, float* to,
size_t to_stride) {
JXL_DASSERT(from_stride >= 8);
JXL_DASSERT(to_stride >= 8);
auto in1 = LoadU(d8, from);
auto in2 = LoadU(d8, from + from_stride);
StoreU(Add(in1, in2), d8, to);
StoreU(Sub(in1, in2), d8, to + to_stride);
}
};
template <size_t N>
struct IDCT1DImpl {
void operator()(const float* from, size_t from_stride, float* to,
size_t to_stride) {
JXL_DASSERT(from_stride >= 8);
JXL_DASSERT(to_stride >= 8);
HWY_ALIGN float tmp[64];
ForwardEvenOdd<N>(from, from_stride, tmp);
IDCT1DImpl<N / 2>()(tmp, 8, tmp, 8);
BTranspose<N / 2>(tmp + N * 4);
IDCT1DImpl<N / 2>()(tmp + N * 4, 8, tmp + N * 4, 8);
MultiplyAndAdd<N>(tmp, to, to_stride);
}
};
template <size_t N>
void IDCT1D(float* JXL_RESTRICT from, float* JXL_RESTRICT output,
size_t output_stride) {
for (size_t i = 0; i < 8; i += Lanes(d8)) {
IDCT1DImpl<N>()(from + i, 8, output + i, output_stride);
}
}
void ComputeScaledIDCT(float* JXL_RESTRICT block0, float* JXL_RESTRICT block1,
float* JXL_RESTRICT output, size_t output_stride) {
Transpose8x8Block(block0, block1);
IDCT1D<8>(block1, block0, 8);
Transpose8x8Block(block0, block1);
IDCT1D<8>(block1, output, output_stride);
}
void DecodeJpegBlock(const int16_t* JXL_RESTRICT qblock,
const float* JXL_RESTRICT dequant,
const float* JXL_RESTRICT biases,
float* JXL_RESTRICT scratch_space,
float* JXL_RESTRICT output, size_t output_stride) {
float* JXL_RESTRICT block0 = scratch_space;
float* JXL_RESTRICT block1 = scratch_space + kDCTBlockSize;
DequantBlock(qblock, dequant, biases, block0);
ComputeScaledIDCT(block0, block1, output, output_stride);
}
#if HWY_CAP_GE512
using hwy::HWY_NAMESPACE::Half;
using hwy::HWY_NAMESPACE::Vec;
template <size_t i, class DF, class V>
HWY_INLINE Vec<Half<Half<DF>>> Quarter(const DF df, V v) {
using HF = Half<DF>;
using HHF = Half<HF>;
auto half = i >= 2 ? UpperHalf(HF(), v) : LowerHalf(HF(), v);
return i & 1 ? UpperHalf(HHF(), half) : LowerHalf(HHF(), half);
}
template <class DF, class V>
HWY_INLINE Vec<DF> Concat4(const DF df, V v0, V v1, V v2, V v3) {
using HF = Half<DF>;
return Combine(DF(), Combine(HF(), v3, v2), Combine(HF(), v1, v0));
}
#endif
// Stores v0[0], v1[0], v0[1], v1[1], ... to mem, in this order. Mem must be
// aligned.
template <class DF, class V, typename T>
void StoreInterleaved(const DF df, V v0, V v1, T* mem) {
static_assert(sizeof(T) == 4, "only use StoreInterleaved for 4-byte types");
#if HWY_TARGET == HWY_SCALAR
Store(v0, df, mem);
Store(v1, df, mem + 1);
#elif !HWY_CAP_GE256
Store(InterleaveLower(df, v0, v1), df, mem);
Store(InterleaveUpper(df, v0, v1), df, mem + Lanes(df));
#else
if (!HWY_CAP_GE512 || Lanes(df) == 8) {
auto t0 = InterleaveLower(df, v0, v1);
auto t1 = InterleaveUpper(df, v0, v1);
Store(ConcatLowerLower(df, t1, t0), df, mem);
Store(ConcatUpperUpper(df, t1, t0), df, mem + Lanes(df));
} else {
#if HWY_CAP_GE512
auto t0 = InterleaveLower(df, v0, v1);
auto t1 = InterleaveUpper(df, v0, v1);
Store(Concat4(df, Quarter<0>(df, t0), Quarter<0>(df, t1),
Quarter<1>(df, t0), Quarter<1>(df, t1)),
df, mem);
Store(Concat4(df, Quarter<2>(df, t0), Quarter<2>(df, t1),
Quarter<3>(df, t0), Quarter<3>(df, t1)),
df, mem + Lanes(df));
#endif
}
#endif
}
void Upsample2Horizontal(float* JXL_RESTRICT row_in,
float* JXL_RESTRICT row_out, size_t len_out) {
HWY_FULL(float) df;
auto threefour = Set(df, 0.75f);
auto onefour = Set(df, 0.25f);
const size_t len_in = len_out >> 1;
row_in[-1] = row_in[0];
row_in[len_in] = row_in[len_in - 1];
for (size_t x = 0; x < len_in; x += Lanes(df)) {
auto current = Mul(Load(df, row_in + x), threefour);
auto prev = LoadU(df, row_in + x - 1);
auto next = LoadU(df, row_in + x + 1);
auto left = MulAdd(onefour, prev, current);
auto right = MulAdd(onefour, next, current);
StoreInterleaved(df, left, right, row_out + x * 2);
}
}
void Upsample2Vertical(const float* JXL_RESTRICT row_top,
const float* JXL_RESTRICT row_mid,
const float* JXL_RESTRICT row_bot,
float* JXL_RESTRICT row_out0,
float* JXL_RESTRICT row_out1, size_t len) {
HWY_FULL(float) df;
auto threefour = Set(df, 0.75f);
auto onefour = Set(df, 0.25f);
for (size_t x = 0; x < len; x += Lanes(df)) {
auto it = Load(df, row_top + x);
auto im = Load(df, row_mid + x);
auto ib = Load(df, row_bot + x);
auto im_scaled = Mul(im, threefour);
Store(MulAdd(it, onefour, im_scaled), df, row_out0 + x);
Store(MulAdd(ib, onefour, im_scaled), df, row_out1 + x);
}
}
void YCbCrToRGB(float* JXL_RESTRICT row0, float* JXL_RESTRICT row1,
float* JXL_RESTRICT row2, size_t xsize) {
const HWY_FULL(float) df;
// Full-range BT.601 as defined by JFIF Clause 7:
// https://www.itu.int/rec/T-REC-T.871-201105-I/en
const auto c128 = Set(df, 128.0f / 255);
const auto crcr = Set(df, 1.402f);
const auto cgcb = Set(df, -0.114f * 1.772f / 0.587f);
const auto cgcr = Set(df, -0.299f * 1.402f / 0.587f);
const auto cbcb = Set(df, 1.772f);
for (size_t x = 0; x < xsize; x += Lanes(df)) {
const auto y_vec = Add(Load(df, row0 + x), c128);
const auto cb_vec = Load(df, row1 + x);
const auto cr_vec = Load(df, row2 + x);
const auto r_vec = MulAdd(crcr, cr_vec, y_vec);
const auto g_vec = MulAdd(cgcr, cr_vec, MulAdd(cgcb, cb_vec, y_vec));
const auto b_vec = MulAdd(cbcb, cb_vec, y_vec);
Store(r_vec, df, row0 + x);
Store(g_vec, df, row1 + x);
Store(b_vec, df, row2 + x);
}
}
void DecenterRow(float* row, size_t xsize) {
const HWY_FULL(float) df;
const auto c128 = Set(df, 128.0f / 255);
for (size_t x = 0; x < xsize; x += Lanes(df)) {
Store(Add(Load(df, row + x), c128), df, row + x);
}
}
template <typename T>
void StoreUnsignedRow(float* JXL_RESTRICT input[3], size_t x0, size_t len,
size_t num_channels, T* output) {
const HWY_FULL(float) d;
auto zero = Zero(d);
auto one = Set(d, 1.0f);
auto mul = Set(d, (1u << (sizeof(T) * 8)) - 1);
const Rebind<T, decltype(d)> du;
#if JXL_MEMORY_SANITIZER
const size_t padding = RoundUpTo(len, Lanes(d)) - len;
for (size_t c = 0; c < num_channels; ++c) {
__msan_unpoison(input[c] + x0 + len, sizeof(input[c][0]) * padding);
}
#endif
if (num_channels == 1) {
for (size_t i = 0; i < len; i += Lanes(d)) {
auto v0 = Mul(Clamp(zero, Load(d, &input[0][x0 + i]), one), mul);
Store(DemoteTo(du, NearestInt(v0)), du, &output[i]);
}
} else if (num_channels == 3) {
for (size_t i = 0; i < len; i += Lanes(d)) {
auto v0 = Mul(Clamp(zero, Load(d, &input[0][x0 + i]), one), mul);
auto v1 = Mul(Clamp(zero, Load(d, &input[1][x0 + i]), one), mul);
auto v2 = Mul(Clamp(zero, Load(d, &input[2][x0 + i]), one), mul);
StoreInterleaved3(DemoteTo(du, NearestInt(v0)),
DemoteTo(du, NearestInt(v1)),
DemoteTo(du, NearestInt(v2)), du, &output[3 * i]);
}
}
#if JXL_MEMORY_SANITIZER
__msan_poison(output + num_channels * len,
sizeof(output[0]) * num_channels * padding);
#endif
}
void WriteToPackedImage(float* JXL_RESTRICT rows[3], size_t x0, size_t y0,
size_t len, uint8_t* JXL_RESTRICT scratch_space,
extras::PackedImage* image) {
if (y0 >= image->ysize) return;
JxlPixelFormat format = image->format;
uint8_t* pixels = reinterpret_cast<uint8_t*>(image->pixels());
if (format.data_type == JXL_TYPE_UINT8) {
size_t offset = y0 * image->stride + x0 * format.num_channels;
JXL_CHECK(offset + len * format.num_channels <= image->pixels_size);
StoreUnsignedRow(rows, x0, len, format.num_channels, scratch_space);
memcpy(pixels + offset, scratch_space, len * format.num_channels);
} else if (format.data_type == JXL_TYPE_UINT16) {
size_t offset = y0 * image->stride + x0 * format.num_channels * 2;
JXL_CHECK(offset + len * format.num_channels * 2 <= image->pixels_size);
uint16_t* tmp = reinterpret_cast<uint16_t*>(scratch_space);
StoreUnsignedRow(rows, x0, len, format.num_channels, tmp);
// TODO(szabadka) Handle endianness.
memcpy(pixels + offset, tmp, len * format.num_channels * 2);
}
}
// NOLINTNEXTLINE(google-readability-namespace-comments)
} // namespace HWY_NAMESPACE
} // namespace jxl
HWY_AFTER_NAMESPACE();
#if HWY_ONCE
namespace jxl {
namespace {
HWY_EXPORT(GatherBlockStats);
HWY_EXPORT(DecodeJpegBlock);
HWY_EXPORT(Upsample2Horizontal);
HWY_EXPORT(Upsample2Vertical);
HWY_EXPORT(YCbCrToRGB);
HWY_EXPORT(DecenterRow);
HWY_EXPORT(WriteToPackedImage);
} // namespace
namespace extras {
void GatherBlockStats(const int16_t* JXL_RESTRICT coeffs,
const size_t coeffs_size, int32_t* JXL_RESTRICT nonzeros,
int32_t* JXL_RESTRICT sumabs) {
return HWY_DYNAMIC_DISPATCH(GatherBlockStats)(coeffs, coeffs_size, nonzeros,
sumabs);
}
void DecodeJpegBlock(const int16_t* JXL_RESTRICT qblock,
const float* JXL_RESTRICT dequant_matrices,
const float* JXL_RESTRICT biases,
float* JXL_RESTRICT scratch_space,
float* JXL_RESTRICT output, size_t output_stride) {
return HWY_DYNAMIC_DISPATCH(DecodeJpegBlock)(
qblock, dequant_matrices, biases, scratch_space, output, output_stride);
}
void Upsample2Horizontal(float* JXL_RESTRICT row_in,
float* JXL_RESTRICT row_out, size_t len_out) {
return HWY_DYNAMIC_DISPATCH(Upsample2Horizontal)(row_in, row_out, len_out);
}
void Upsample2Vertical(const float* JXL_RESTRICT row_top,
const float* JXL_RESTRICT row_mid,
const float* JXL_RESTRICT row_bot,
float* JXL_RESTRICT row_out0,
float* JXL_RESTRICT row_out1, size_t len) {
return HWY_DYNAMIC_DISPATCH(Upsample2Vertical)(row_top, row_mid, row_bot,
row_out0, row_out1, len);
}
void YCbCrToRGB(float* JXL_RESTRICT row0, float* JXL_RESTRICT row1,
float* JXL_RESTRICT row2, size_t xsize) {
return HWY_DYNAMIC_DISPATCH(YCbCrToRGB)(row0, row1, row2, xsize);
}
void DecenterRow(float* row, size_t xsize) {
return HWY_DYNAMIC_DISPATCH(DecenterRow)(row, xsize);
}
void WriteToPackedImage(float* JXL_RESTRICT rows[3], size_t x0, size_t y0,
size_t len, uint8_t* JXL_RESTRICT scratch_space,
extras::PackedImage* image) {
return HWY_DYNAMIC_DISPATCH(WriteToPackedImage)(rows, x0, y0, len,
scratch_space, image);
}
} // namespace extras
} // namespace jxl
#endif // HWY_ONCE

View File

@ -1,51 +0,0 @@
// Copyright (c) the JPEG XL Project Authors. All rights reserved.
//
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
#ifndef LIB_EXTRAS_DEC_GROUP_JPEG_H_
#define LIB_EXTRAS_DEC_GROUP_JPEG_H_
#include <stddef.h>
#include <stdint.h>
#include <vector>
#include "lib/extras/packed_image.h"
#include "lib/jxl/base/compiler_specific.h"
namespace jxl {
namespace extras {
void GatherBlockStats(const int16_t* JXL_RESTRICT coeffs,
const size_t coeffs_size, int32_t* JXL_RESTRICT nonzeros,
int32_t* JXL_RESTRICT sumabs);
void DecodeJpegBlock(const int16_t* JXL_RESTRICT qblock,
const float* JXL_RESTRICT dequant_matrices,
const float* JXL_RESTRICT biases,
float* JXL_RESTRICT scratch_space,
float* JXL_RESTRICT output, size_t output_stride);
void Upsample2Horizontal(float* JXL_RESTRICT row_in,
float* JXL_RESTRICT row_out, size_t len_out);
void Upsample2Vertical(const float* JXL_RESTRICT row_top,
const float* JXL_RESTRICT row_mid,
const float* JXL_RESTRICT row_bot,
float* JXL_RESTRICT row_out0,
float* JXL_RESTRICT row_out1, size_t len);
void YCbCrToRGB(float* JXL_RESTRICT row0, float* JXL_RESTRICT row1,
float* JXL_RESTRICT row2, size_t xsize);
void DecenterRow(float* row, size_t xsize);
void WriteToPackedImage(float* JXL_RESTRICT rows[3], size_t x0, size_t y0,
size_t len, uint8_t* JXL_RESTRICT scratch_space,
extras::PackedImage* image);
} // namespace extras
} // namespace jxl
#endif // LIB_EXTRAS_DEC_GROUP_JPEG_H_

File diff suppressed because it is too large

View File

@ -1,276 +0,0 @@
// Copyright (c) the JPEG XL Project Authors. All rights reserved.
//
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
#ifndef LIB_EXTRAS_DECODE_JPEG_H_
#define LIB_EXTRAS_DECODE_JPEG_H_
#include <stdint.h>
#include <array>
#include <vector>
#include "hwy/aligned_allocator.h"
#include "lib/extras/packed_image.h"
#include "lib/jxl/base/data_parallel.h"
#include "lib/jxl/base/status.h"
#include "lib/jxl/image.h"
namespace jxl {
namespace extras {
constexpr int kMaxComponents = 4;
typedef int16_t coeff_t;
// Represents one component of a jpeg file.
struct JPEGComponent {
JPEGComponent()
: id(0),
h_samp_factor(1),
v_samp_factor(1),
quant_idx(0),
width_in_blocks(0),
height_in_blocks(0) {}
// One-byte id of the component.
uint32_t id;
// Horizontal and vertical sampling factors.
// In interleaved mode, each minimal coded unit (MCU) has
// h_samp_factor x v_samp_factor DCT blocks from this component.
int h_samp_factor;
int v_samp_factor;
// The index of the quantization table used for this component.
uint32_t quant_idx;
// The dimensions of the component measured in 8x8 blocks.
uint32_t width_in_blocks;
uint32_t height_in_blocks;
// The DCT coefficients of this component, laid out block-by-block, divided
// through the quantization matrix values.
hwy::AlignedFreeUniquePtr<coeff_t[]> coeffs;
};
struct HuffmanTableEntry {
// Initialize the value to an invalid symbol so that we can recognize it
// when reading the bit stream using a Huffman code with space > 0.
HuffmanTableEntry() : bits(0), value(0xffff) {}
uint8_t bits; // number of bits used for this symbol
uint16_t value; // symbol value or table offset
};
// Quantization values for an 8x8 pixel block.
struct JPEGQuantTable {
std::array<int32_t, kDCTBlockSize> values;
// The index of this quantization table as it was parsed from the input JPEG.
// Each DQT marker segment contains an 'index' field, and we save this index
// here. Valid values are 0 to 3.
uint32_t index = 0;
};
// Huffman table indexes and MCU dimensions used for one component of one scan.
struct JPEGComponentScanInfo {
uint32_t comp_idx;
uint32_t dc_tbl_idx;
uint32_t ac_tbl_idx;
uint32_t mcu_ysize_blocks;
uint32_t mcu_xsize_blocks;
};
// Contains information that is used in one scan.
struct JPEGScanInfo {
// Parameters used for progressive scans (named the same way as in the spec):
// Ss : Start of spectral band in zig-zag sequence.
// Se : End of spectral band in zig-zag sequence.
// Ah : Successive approximation bit position, high.
// Al : Successive approximation bit position, low.
uint32_t Ss;
uint32_t Se;
uint32_t Ah;
uint32_t Al;
uint32_t num_components = 0;
std::array<JPEGComponentScanInfo, kMaxComponents> components;
size_t MCU_rows;
size_t MCU_cols;
};
// State of the decoder that has to be saved before decoding one MCU in case
// we run out of the bitstream.
struct MCUCodingState {
coeff_t last_dc_coeff[kMaxComponents];
int eobrun;
std::vector<coeff_t> coeffs;
};
// Streaming JPEG decoding object.
class JpegDecoder {
public:
enum class Status {
kSuccess,
kNeedMoreInput,
kError,
};
// Sets the next chunk of input. It must be called before the first call to
// ReadHeaders() and every time a reader function returns
// Status::kNeedMoreInput.
Status SetInput(const uint8_t* data, size_t len);
// Sets the output image. Must be called between ReadHeaders() and
// ReadScanLines(). The provided image must have the same dimensions and number of
// channels as the underlying JPEG bitstream.
Status SetOutput(PackedImage* image);
// Reads the header markers up to and including SOF marker. After this returns
// kSuccess, the image attribute accessors can be called.
Status ReadHeaders();
// Reads the bitstream after the SOF marker, and fills in at most
// max_output_rows scan lines of the provided image. Set *num_output_rows to
// the actual number of lines produced.
Status ReadScanLines(size_t* num_output_rows, size_t max_output_rows);
// Image attribute accessors, can be called after ReadHeaders() returns
// kSuccess.
size_t xsize() const { return xsize_; }
size_t ysize() const { return ysize_; }
size_t num_channels() const { return components_.size(); }
const std::vector<uint8_t>& icc_profile() const { return icc_profile_; }
private:
enum class State {
kStart,
kProcessMarkers,
kScan,
kRender,
kEnd,
};
State state_ = State::kStart;
//
// Input handling state.
//
const uint8_t* next_in_ = nullptr;
size_t avail_in_ = 0;
// Codestream input data is copied here temporarily when the decoder needs
// more input bytes to process the next part of the stream.
std::vector<uint8_t> codestream_copy_;
// Number of bytes at the end of codestream_copy_ that were not yet consumed
// by calling AdvanceInput().
size_t codestream_unconsumed_ = 0;
// Position in the codestream_copy_ vector that the decoder already finished
// processing.
size_t codestream_pos_ = 0;
// Number of bits after codestream_pos_ that were already processed.
size_t codestream_bits_ahead_ = 0;
//
// Marker data processing state.
//
bool found_soi_ = false;
bool found_app0_ = false;
bool found_dri_ = false;
bool found_sof_ = false;
bool found_eoi_ = false;
size_t xsize_ = 0;
size_t ysize_ = 0;
bool is_ycbcr_ = true;
size_t icc_index_ = 0;
size_t icc_total_ = 0;
std::vector<uint8_t> icc_profile_;
size_t restart_interval_ = 0;
std::vector<JPEGQuantTable> quant_;
std::vector<JPEGComponent> components_;
std::vector<HuffmanTableEntry> dc_huff_lut_;
std::vector<HuffmanTableEntry> ac_huff_lut_;
uint8_t huff_slot_defined_[256] = {};
// Fields defined by SOF marker.
bool is_progressive_;
int max_h_samp_;
int max_v_samp_;
size_t iMCU_rows_;
size_t iMCU_cols_;
size_t iMCU_width_;
size_t iMCU_height_;
// Initialized at start of frame.
uint16_t scan_progression_[kMaxComponents][kDCTBlockSize];
//
// Per scan state.
//
JPEGScanInfo scan_info_;
size_t scan_mcu_row_;
size_t scan_mcu_col_;
coeff_t last_dc_coeff_[kMaxComponents];
int eobrun_;
int restarts_to_go_;
int next_restart_marker_;
MCUCodingState mcu_;
//
// Rendering state.
//
PackedImage* output_;
Image3F MCU_row_buf_;
size_t MCU_buf_current_row_;
size_t MCU_buf_ready_rows_;
size_t output_row_;
size_t output_mcu_row_;
size_t output_ci_;
// Temporary buffers for vertically upsampled chroma components. We keep a
// ringbuffer of 3 * kBlockDim rows so that we have access to the previous and
// next rows.
std::vector<ImageF> chroma_;
// In the rendering order, vertically upsampled chroma components come first.
std::vector<size_t> component_order_;
hwy::AlignedFreeUniquePtr<float[]> idct_scratch_;
hwy::AlignedFreeUniquePtr<float[]> upsample_scratch_;
hwy::AlignedFreeUniquePtr<uint8_t[]> output_scratch_;
hwy::AlignedFreeUniquePtr<float[]> dequant_;
// Per channel and per frequency statistics about the number of nonzeros and
// the sum of coefficient absolute values, used in dequantization bias
// computation.
hwy::AlignedFreeUniquePtr<int[]> nonzeros_;
hwy::AlignedFreeUniquePtr<int[]> sumabs_;
std::vector<size_t> num_processed_blocks_;
hwy::AlignedFreeUniquePtr<float[]> biases_;
void AdvanceInput(size_t size);
void AdvanceCodestream(size_t size);
Status RequestMoreInput();
Status GetCodestreamInput(const uint8_t** data, size_t* len);
Status ProcessMarker(const uint8_t* data, size_t len, size_t* pos);
Status ProcessSOF(const uint8_t* data, size_t len);
Status ProcessSOS(const uint8_t* data, size_t len);
Status ProcessDHT(const uint8_t* data, size_t len);
Status ProcessDQT(const uint8_t* data, size_t len);
Status ProcessDRI(const uint8_t* data, size_t len);
Status ProcessAPP(const uint8_t* data, size_t len);
Status ProcessCOM(const uint8_t* data, size_t len);
Status ProcessScan(const uint8_t* data, size_t len, size_t* pos);
void SaveMCUCodingState();
void RestoreMCUCodingState();
void PrepareForOutput();
void ProcessOutput(size_t* num_output_rows, size_t max_output_rows);
};
Status DecodeJpeg(const std::vector<uint8_t>& compressed,
JxlDataType output_data_type, ThreadPool* pool,
PackedPixelFile* ppf);
} // namespace extras
} // namespace jxl
#endif // LIB_EXTRAS_DECODE_JPEG_H_

View File

@ -1,190 +0,0 @@
// Copyright (c) the JPEG XL Project Authors. All rights reserved.
//
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
#include "lib/extras/decode_jpeg.h"
#include <stddef.h>
#include <stdio.h>
#if JPEGXL_ENABLE_JPEG
#include "lib/extras/dec/jpg.h"
#endif
#include "lib/jxl/test_utils.h"
#include "lib/jxl/testdata.h"
namespace jxl {
namespace extras {
namespace {
using test::DistanceRMS;
struct TestConfig {
std::string fn;
std::string fn_desc;
size_t chunk_size;
size_t max_output_lines;
};
class DecodeJpegTestParam : public ::testing::TestWithParam<TestConfig> {};
TEST_P(DecodeJpegTestParam, Streaming) {
TestConfig config = GetParam();
const PaddedBytes compressed = ReadTestData(config.fn.c_str());
#if JPEGXL_ENABLE_JPEG
PackedPixelFile ppf_libjpeg;
EXPECT_TRUE(
DecodeImageJPG(Span<const uint8_t>(compressed.data(), compressed.size()),
ColorHints(), SizeConstraints(), &ppf_libjpeg));
ASSERT_EQ(1, ppf_libjpeg.frames.size());
#endif
JpegDecoder dec;
size_t chunk_size = config.chunk_size;
if (chunk_size == 0) chunk_size = compressed.size();
size_t pos = std::min(chunk_size, compressed.size());
ASSERT_EQ(JpegDecoder::Status::kSuccess,
dec.SetInput(compressed.data(), pos));
JpegDecoder::Status status;
for (;;) {
status = dec.ReadHeaders();
if (status == JpegDecoder::Status::kNeedMoreInput) {
ASSERT_LT(pos, compressed.size());
size_t len = std::min(chunk_size, compressed.size() - pos);
ASSERT_EQ(JpegDecoder::Status::kSuccess,
dec.SetInput(compressed.data() + pos, len));
pos += len;
continue;
}
ASSERT_EQ(status, JpegDecoder::Status::kSuccess);
break;
}
#if JPEGXL_ENABLE_JPEG
EXPECT_EQ(ppf_libjpeg.info.xsize, dec.xsize());
EXPECT_EQ(ppf_libjpeg.info.ysize, dec.ysize());
EXPECT_EQ(ppf_libjpeg.info.num_color_channels, dec.num_channels());
#endif
JxlPixelFormat format = {static_cast<uint32_t>(dec.num_channels()),
JXL_TYPE_UINT8, JXL_BIG_ENDIAN, 0};
PackedImage output(dec.xsize(), dec.ysize(), format);
ASSERT_EQ(JpegDecoder::Status::kSuccess, dec.SetOutput(&output));
size_t max_output_lines = config.max_output_lines;
if (max_output_lines == 0) max_output_lines = dec.ysize();
size_t total_output_lines = 0;
while (total_output_lines < dec.ysize()) {
size_t num_output_lines = 0;
status = dec.ReadScanLines(&num_output_lines, max_output_lines);
total_output_lines += num_output_lines;
if (status == JpegDecoder::Status::kNeedMoreInput) {
ASSERT_LT(pos, compressed.size());
size_t len = std::min(chunk_size, compressed.size() - pos);
ASSERT_EQ(JpegDecoder::Status::kSuccess,
dec.SetInput(compressed.data() + pos, len));
pos += len;
continue;
}
ASSERT_EQ(status, JpegDecoder::Status::kSuccess);
if (total_output_lines < dec.ysize()) {
EXPECT_EQ(num_output_lines, max_output_lines);
}
}
#if JPEGXL_ENABLE_JPEG
const PackedImage& output_libjpeg = ppf_libjpeg.frames[0].color;
ASSERT_EQ(output.xsize, output_libjpeg.xsize);
ASSERT_EQ(output.ysize, output_libjpeg.ysize);
EXPECT_LE(
DistanceRMS(reinterpret_cast<const uint8_t*>(output.pixels()),
reinterpret_cast<const uint8_t*>(output_libjpeg.pixels()),
output.xsize, output.ysize, output.format),
0.0075);
#endif
}
std::vector<TestConfig> GenerateTests() {
std::vector<TestConfig> all_tests;
{
std::vector<std::pair<std::string, std::string>> testfiles({
{"jxl/flower/flower.png.im_q85_444.jpg", "Q85YUV444"},
{"jxl/flower/flower.png.im_q85_420.jpg", "Q85YUV420"},
{"jxl/flower/flower.png.im_q85_420_progr.jpg", "Q85YUV420PROGR"},
{"jxl/flower/flower.png.im_q85_420_R13B.jpg", "Q85YUV420R13B"},
});
for (const auto& it : testfiles) {
for (size_t chunk_size : {0, 1, 64, 65536}) {
for (size_t max_output_lines : {0, 1, 8, 16}) {
TestConfig config;
config.fn = it.first;
config.fn_desc = it.second;
config.chunk_size = chunk_size;
config.max_output_lines = max_output_lines;
all_tests.push_back(config);
}
}
}
}
{
std::vector<std::pair<std::string, std::string>> testfiles({
{"jxl/flower/flower.png.im_q85_422.jpg", "Q85YUV422"},
{"jxl/flower/flower.png.im_q85_440.jpg", "Q85YUV440"},
{"jxl/flower/flower.png.im_q85_444_1x2.jpg", "Q85YUV444_1x2"},
{"jxl/flower/flower.png.im_q85_asymmetric.jpg", "Q85Asymmetric"},
{"jxl/flower/flower.png.im_q85_gray.jpg", "Q85Gray"},
{"jxl/flower/flower.png.im_q85_luma_subsample.jpg", "Q85LumaSubsample"},
{"jxl/flower/flower.png.im_q85_rgb.jpg", "Q85RGB"},
{"jxl/flower/flower.png.im_q85_rgb_subsample_blue.jpg",
"Q85RGBSubsampleBlue"},
});
for (const auto& it : testfiles) {
for (size_t chunk_size : {0, 64}) {
for (size_t max_output_lines : {0, 16}) {
TestConfig config;
config.fn = it.first;
config.fn_desc = it.second;
config.chunk_size = chunk_size;
config.max_output_lines = max_output_lines;
all_tests.push_back(config);
}
}
}
}
return all_tests;
}
std::ostream& operator<<(std::ostream& os, const TestConfig& c) {
os << c.fn_desc;
if (c.chunk_size == 0) {
os << "CompleteInput";
} else {
os << "InputChunks" << c.chunk_size;
}
if (c.max_output_lines == 0) {
os << "CompleteOutput";
} else {
os << "OutputLines" << c.max_output_lines;
}
return os;
}
std::string TestDescription(
const testing::TestParamInfo<DecodeJpegTestParam::ParamType>& info) {
std::stringstream name;
name << info.param;
return name.str();
}
JXL_GTEST_INSTANTIATE_TEST_SUITE_P(DecodeJpegTest, DecodeJpegTestParam,
testing::ValuesIn(GenerateTests()),
TestDescription);
} // namespace
} // namespace extras
} // namespace jxl
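The removed test above exercises the streaming JpegDecoder interface: feed a chunk with SetInput, loop on ReadHeaders until it stops reporting kNeedMoreInput, attach a PackedImage with SetOutput, then pull rows with ReadScanLines, refilling the input whenever the decoder runs dry. Below is a condensed, non-gtest sketch of that loop using only the calls visible in the test; it assumes PackedImage and JxlPixelFormat are available through lib/extras/decode_jpeg.h, as they were for the test.

#include <algorithm>
#include <cstddef>
#include <cstdint>

#include "lib/extras/decode_jpeg.h"

bool StreamingDecode(const uint8_t* data, size_t size, size_t chunk_size) {
  using jxl::extras::JpegDecoder;
  JpegDecoder dec;
  size_t pos = std::min(chunk_size, size);
  if (dec.SetInput(data, pos) != JpegDecoder::Status::kSuccess) return false;

  // Parse headers, feeding another chunk whenever the decoder asks for more.
  for (;;) {
    JpegDecoder::Status status = dec.ReadHeaders();
    if (status == JpegDecoder::Status::kNeedMoreInput && pos < size) {
      size_t len = std::min(chunk_size, size - pos);
      if (dec.SetInput(data + pos, len) != JpegDecoder::Status::kSuccess) {
        return false;
      }
      pos += len;
      continue;
    }
    if (status != JpegDecoder::Status::kSuccess) return false;
    break;
  }

  // Decode scanlines into an interleaved 8-bit buffer, refilling input as
  // needed, exactly as the parameterized test above does.
  JxlPixelFormat format = {static_cast<uint32_t>(dec.num_channels()),
                           JXL_TYPE_UINT8, JXL_BIG_ENDIAN, 0};
  jxl::extras::PackedImage output(dec.xsize(), dec.ysize(), format);
  if (dec.SetOutput(&output) != JpegDecoder::Status::kSuccess) return false;

  size_t total_output_lines = 0;
  while (total_output_lines < dec.ysize()) {
    size_t num_output_lines = 0;
    JpegDecoder::Status status =
        dec.ReadScanLines(&num_output_lines, dec.ysize());
    total_output_lines += num_output_lines;
    if (status == JpegDecoder::Status::kNeedMoreInput && pos < size) {
      size_t len = std::min(chunk_size, size - pos);
      if (dec.SetInput(data + pos, len) != JpegDecoder::Status::kSuccess) {
        return false;
      }
      pos += len;
      continue;
    }
    if (status != JpegDecoder::Status::kSuccess) return false;
  }
  return true;
}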

View File

@ -27,8 +27,9 @@ namespace jxl {
namespace extras {
namespace HWY_NAMESPACE {
void ComputeDCTCoefficients(const Image3F& opsin, const ImageF& qf,
const FrameDimensions& frame_dim, const float* qm,
void ComputeDCTCoefficients(const Image3F& opsin, const bool xyb,
const ImageF& qf, const FrameDimensions& frame_dim,
const float* qm,
std::vector<jpeg::JPEGComponent>* components) {
int max_samp_factor = 1;
for (const auto& c : *components) {
@ -75,7 +76,11 @@ void ComputeDCTCoefficients(const Image3F& opsin, const ImageF& qf,
block[ix * 8 + iy] = cc;
}
}
block[0] = std::round((2040 * dct[0] - 1024) * qmc[0]);
if (xyb) {
// ToXYB does not create zero-centered sample values like RgbToYcbcr
// does, so we apply an offset to the DC values instead.
block[0] = std::round((2040 * dct[0] - 1024) * qmc[0]);
}
}
}
}
@ -95,12 +100,7 @@ HWY_EXPORT(ComputeDCTCoefficients);
namespace {
std::vector<uint8_t> CreateXybICCAppMarker() {
ColorEncoding c_xyb;
c_xyb.SetColorSpace(ColorSpace::kXYB);
c_xyb.rendering_intent = RenderingIntent::kPerceptual;
JXL_CHECK(c_xyb.CreateICC());
const auto& icc = c_xyb.ICC();
std::vector<uint8_t> CreateICCAppMarker(const PaddedBytes& icc) {
std::vector<uint8_t> icc_marker(17 + icc.size());
// See the APP2 marker format for embedded ICC profile at
// https://www.color.org/technotes/ICC-Technote-ProfileEmbedding.pdf
@ -116,7 +116,15 @@ std::vector<uint8_t> CreateXybICCAppMarker() {
return icc_marker;
}
static constexpr float kBaseQuantMatrix[] = {
std::vector<uint8_t> CreateXybICCAppMarker() {
ColorEncoding c_xyb;
c_xyb.SetColorSpace(ColorSpace::kXYB);
c_xyb.rendering_intent = RenderingIntent::kPerceptual;
JXL_CHECK(c_xyb.CreateICC());
return CreateICCAppMarker(c_xyb.ICC());
}
static constexpr float kBaseQuantMatrixXYB[] = {
// c = 0
0.010745695802f,
0.014724285860f,
@ -314,9 +322,45 @@ static constexpr float kBaseQuantMatrix[] = {
0.047241950370f,
};
void AddJpegQuantMatrices(const ImageF& qf, float dc_quant, float global_scale,
// Y: mozjpeg q99; Cb, Cr: mozjpeg q95
static constexpr float kBaseQuantMatrixYCbCr[] = {
// c = 0
1, 1, 1, 1, 1, 1, 1, 2, //
1, 1, 1, 1, 1, 1, 1, 2, //
1, 1, 1, 1, 1, 1, 2, 3, //
1, 1, 1, 1, 1, 1, 2, 3, //
1, 1, 1, 1, 1, 2, 3, 4, //
1, 1, 1, 1, 2, 2, 3, 5, //
1, 1, 2, 2, 3, 3, 5, 6, //
2, 2, 3, 3, 4, 5, 6, 8, //
// c = 1
2, 2, 2, 2, 3, 4, 6, 9, //
2, 2, 2, 3, 3, 4, 5, 8, //
2, 2, 2, 3, 4, 6, 9, 14, //
2, 3, 3, 4, 5, 7, 11, 16, //
3, 3, 4, 5, 7, 9, 13, 19, //
4, 4, 6, 7, 9, 12, 17, 24, //
6, 5, 9, 11, 13, 17, 23, 31, //
9, 8, 14, 16, 19, 24, 31, 42, //
// c = 2
2, 2, 2, 2, 3, 4, 6, 9, //
2, 2, 2, 3, 3, 4, 5, 8, //
2, 2, 2, 3, 4, 6, 9, 14, //
2, 3, 3, 4, 5, 7, 11, 16, //
3, 3, 4, 5, 7, 9, 13, 19, //
4, 4, 6, 7, 9, 12, 17, 24, //
6, 5, 9, 11, 13, 17, 23, 31, //
9, 8, 14, 16, 19, 24, 31, 42, //
};
void AddJpegQuantMatrices(const ImageF& qf, bool xyb, float dc_quant,
float global_scale,
std::vector<jpeg::JPEGQuantTable>* quant_tables,
float* qm) {
const float* const base_quant_matrix =
xyb ? kBaseQuantMatrixXYB : kBaseQuantMatrixYCbCr;
// Scale the base quant matrix based on the scaled XYB scales and the quant
// field.
float qfmin, qfmax;
@ -324,10 +368,10 @@ void AddJpegQuantMatrices(const ImageF& qf, float dc_quant, float global_scale,
const float dc_scale = global_scale / dc_quant;
const float ac_scale = global_scale / qfmax;
for (size_t c = 0, ix = 0; c < 3; c++) {
qm[ix] = dc_scale * kBaseQuantMatrix[ix];
qm[ix] = dc_scale * base_quant_matrix[ix];
ix++;
for (size_t j = 1; j < kDCTBlockSize; j++, ix++) {
qm[ix] = ac_scale * kBaseQuantMatrix[ix];
qm[ix] = ac_scale * base_quant_matrix[ix];
}
}
@ -514,26 +558,37 @@ void SetJpegHuffmanCode(const JpegClusteredHistograms& clusters,
}
void FillJPEGData(const Image3F& opsin, const ImageF& qf, float dc_quant,
float global_scale, const bool subsample_blue,
const FrameDimensions& frame_dim, jpeg::JPEGData* out) {
float global_scale, const bool xyb, const bool subsample_blue,
const PaddedBytes& icc, const FrameDimensions& frame_dim,
jpeg::JPEGData* out) {
*out = jpeg::JPEGData();
// ICC
out->marker_order.push_back(0xe2);
out->app_data.push_back(CreateXybICCAppMarker());
if (xyb) {
out->app_data.push_back(CreateXybICCAppMarker());
} else {
out->app_data.push_back(CreateICCAppMarker(icc));
}
// DQT
out->marker_order.emplace_back(0xdb);
float qm[3 * kDCTBlockSize];
AddJpegQuantMatrices(qf, dc_quant, global_scale, &out->quant, qm);
AddJpegQuantMatrices(qf, xyb, dc_quant, global_scale, &out->quant, qm);
// SOF
out->marker_order.emplace_back(0xc2);
out->components.resize(3);
out->height = frame_dim.ysize;
out->width = frame_dim.xsize;
out->components[0].id = 'R';
out->components[1].id = 'G';
out->components[2].id = 'B';
if (xyb) {
out->components[0].id = 'R';
out->components[1].id = 'G';
out->components[2].id = 'B';
} else {
out->components[0].id = 1;
out->components[1].id = 2;
out->components[2].id = 3;
}
size_t max_samp_factor = subsample_blue ? 2 : 1;
for (size_t c = 0; c < 3; ++c) {
const size_t factor = (subsample_blue && c == 2) ? 2 : 1;
@ -546,7 +601,7 @@ void FillJPEGData(const Image3F& opsin, const ImageF& qf, float dc_quant,
out->components[c].quant_idx = c;
}
HWY_DYNAMIC_DISPATCH(ComputeDCTCoefficients)
(opsin, qf, frame_dim, qm, &out->components);
(opsin, xyb, qf, frame_dim, qm, &out->components);
// DHT (the actual Huffman codes will be added later).
out->marker_order.emplace_back(0xc4);
@ -635,9 +690,9 @@ size_t JpegSize(const jpeg::JPEGData& jpeg_data) {
} // namespace
Status EncodeJpeg(const ImageBundle& input, size_t target_size, float distance,
Status EncodeJpeg(const ImageBundle& input, const JpegSettings& jpeg_settings,
ThreadPool* pool, std::vector<uint8_t>* compressed) {
const bool subsample_blue = true;
const bool subsample_blue = jpeg_settings.xyb;
const size_t max_shift = subsample_blue ? 1 : 0;
FrameDimensions frame_dim;
frame_dim.Set(input.xsize(), input.ysize(), 1, max_shift, max_shift, false,
@ -651,17 +706,35 @@ Status EncodeJpeg(const ImageBundle& input, size_t target_size, float distance,
// Compute adaptive quant field.
ImageF mask;
ImageF qf = InitialQuantField(distance, opsin, frame_dim, pool, 1.0, &mask);
ScaleXYB(&opsin);
ImageF qf = InitialQuantField(jpeg_settings.distance, opsin, frame_dim, pool,
1.0, &mask);
if (jpeg_settings.xyb) {
ScaleXYB(&opsin);
} else {
opsin.ShrinkTo(input.xsize(), input.ysize());
JXL_RETURN_IF_ERROR(RgbToYcbcr(
input.color().Plane(0), input.color().Plane(1), input.color().Plane(2),
&opsin.Plane(0), &opsin.Plane(1), &opsin.Plane(2), pool));
PadImageToBlockMultipleInPlace(&opsin, 8 << max_shift);
}
// Create jpeg data and optimize Huffman codes.
jpeg::JPEGData jpeg_data;
float global_scale = 0.66f;
float dc_quant = InitialQuantDC(distance);
FillJPEGData(opsin, qf, dc_quant, global_scale, subsample_blue, frame_dim,
&jpeg_data);
if (!jpeg_settings.xyb) {
global_scale /= 500;
if (input.metadata()->color_encoding.tf.IsPQ()) {
global_scale *= .4f;
} else if (input.metadata()->color_encoding.tf.IsHLG()) {
global_scale *= .5f;
}
}
float dc_quant = InitialQuantDC(jpeg_settings.distance);
FillJPEGData(opsin, qf, dc_quant, global_scale, jpeg_settings.xyb,
subsample_blue, input.metadata()->color_encoding.ICC(),
frame_dim, &jpeg_data);
if (target_size != 0) {
if (jpeg_settings.target_size != 0) {
// Tweak the jpeg data so that the resulting compressed file is
// approximately target_size long.
size_t prev_size = 0;
@ -670,7 +743,7 @@ Status EncodeJpeg(const ImageBundle& input, size_t target_size, float distance,
size_t iter = 0;
for (;;) {
size_t size = JpegSize(jpeg_data);
float error = size * 1.0f / target_size - 1.0f;
float error = size * 1.0f / jpeg_settings.target_size - 1.0f;
if (std::abs(error) < std::abs(best_error)) {
best_error = error;
best_global_scale = global_scale;
@ -679,13 +752,15 @@ Status EncodeJpeg(const ImageBundle& input, size_t target_size, float distance,
break;
}
global_scale *= 1.0f + error;
FillJPEGData(opsin, qf, dc_quant, global_scale, subsample_blue, frame_dim,
&jpeg_data);
FillJPEGData(opsin, qf, dc_quant, global_scale, jpeg_settings.xyb,
subsample_blue, input.metadata()->color_encoding.ICC(),
frame_dim, &jpeg_data);
prev_size = size;
++iter;
}
if (best_global_scale != global_scale) {
FillJPEGData(opsin, qf, dc_quant, best_global_scale, subsample_blue,
FillJPEGData(opsin, qf, dc_quant, best_global_scale, jpeg_settings.xyb,
subsample_blue, input.metadata()->color_encoding.ICC(),
frame_dim, &jpeg_data);
}
}
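The target-size loop in EncodeJpeg above is a simple proportional search: each pass measures the serialized size, computes error = size / target_size - 1, remembers the global_scale with the smallest |error|, and multiplies global_scale by (1 + error) for the next pass, so a result 20% too large raises the quantization scale by 1.2x. A minimal standalone sketch of that update rule follows; MeasureSize stands in for the FillJPEGData + JpegSize round trip, and the convergence check is only a stand-in because the loop's actual break condition falls in lines not shown in this hunk.

#include <cmath>
#include <cstddef>
#include <functional>

// MeasureSize(global_scale) is a placeholder for "FillJPEGData with this
// scale, then JpegSize(jpeg_data)" from the code above.
float FitGlobalScale(const std::function<size_t(float)>& MeasureSize,
                     float global_scale, size_t target_size, int max_iters) {
  float best_global_scale = global_scale;
  float best_error = 1e10f;
  size_t prev_size = 0;
  for (int iter = 0; iter < max_iters; ++iter) {
    size_t size = MeasureSize(global_scale);
    float error = size * 1.0f / target_size - 1.0f;
    if (std::abs(error) < std::abs(best_error)) {
      best_error = error;
      best_global_scale = global_scale;
    }
    // Stand-in convergence check; the real loop's break condition is in
    // lines not shown in the hunk above.
    if (size == prev_size || std::abs(error) < 0.01f) break;
    // E.g. 120 kB produced against a 100 kB target gives error = 0.2, so the
    // scale grows by 1.2x and the next pass quantizes more coarsely.
    global_scale *= 1.0f + error;
    prev_size = size;
  }
  return best_global_scale;
}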

View File

@ -16,7 +16,13 @@
namespace jxl {
namespace extras {
Status EncodeJpeg(const ImageBundle& input, size_t target_size, float distance,
struct JpegSettings {
bool xyb = true;
size_t target_size = 0;
float distance = 1.f;
};
Status EncodeJpeg(const ImageBundle& input, const JpegSettings& jpeg_settings,
ThreadPool* pool, std::vector<uint8_t>* compressed);
} // namespace extras
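A hypothetical call site for the new entry point, assuming the caller already has a jxl::ImageBundle and a jxl::ThreadPool; libjxl-internal includes are omitted because their header paths are not shown in this hunk.

#include <cstdint>
#include <vector>

// jxl::ImageBundle, jxl::ThreadPool and jxl::Status come from libjxl-internal
// headers whose paths are not visible here, so their includes are omitted.
jxl::Status EncodeToYCbCrJpeg(const jxl::ImageBundle& input,
                              jxl::ThreadPool* pool,
                              std::vector<uint8_t>* compressed) {
  jxl::extras::JpegSettings settings;
  settings.xyb = false;      // classic YCbCr JPEG instead of an XYB JPEG
  settings.distance = 1.0f;  // quality knob, same meaning as before
  settings.target_size = 0;  // 0 leaves the target-size search disabled
  return jxl::extras::EncodeJpeg(input, settings, pool, compressed);
}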

View File

@ -58,7 +58,6 @@ Status ConvertPackedFrameToImageBundle(const JxlBasicInfo& info,
JXL_RETURN_IF_ERROR(ConvertFromExternal(
span, frame.color.xsize, frame.color.ysize, io.metadata.m.color_encoding,
/*alpha_is_premultiplied=*/info.alpha_premultiplied,
frame_bits_per_sample, frame.color.format, pool, bundle));
bundle->extra_channels().resize(io.metadata.m.extra_channel_info.size());

View File

@ -168,16 +168,6 @@ typedef enum {
*/
JXL_DEC_NEED_PREVIEW_OUT_BUFFER = 3,
/** The decoder is able to decode a DC image and requests setting a DC output
* buffer using @ref JxlDecoderSetDCOutBuffer. This occurs if @ref
* JXL_DEC_DC_IMAGE is requested and it is possible to decode a DC image from
* the codestream and the DC out buffer was not yet set. This event re-occurs
* for new frames if there are multiple animation frames.
* @deprecated The DC feature in this form will be removed. For progressive
* rendering, @ref JxlDecoderFlushImage should be used.
*/
JXL_DEC_NEED_DC_OUT_BUFFER = 4,
/** The decoder requests an output buffer to store the full resolution image,
* which can be set with @ref JxlDecoderSetImageOutBuffer or with @ref
* JxlDecoderSetImageOutCallback. This event re-occurs for new frames if
@ -260,28 +250,12 @@ typedef enum {
*/
JXL_DEC_FRAME = 0x400,
/** Informative event by @ref JxlDecoderProcessInput
* "JxlDecoderProcessInput": DC image, 8x8 sub-sampled frame, decoded. It is
* not guaranteed that the decoder will always return DC separately, but when
* it does it will do so before outputting the full frame. @ref
* JxlDecoderSetDCOutBuffer must be used after getting the basic image
* information to be able to get the DC pixels, if not this return status only
* indicates we're past this point in the codestream. This event occurs max
* once per frame and always later than @ref JXL_DEC_FRAME and other header
* events and earlier than full resolution pixel data.
*
* @deprecated The DC feature in this form will be removed. For progressive
* rendering, @ref JxlDecoderFlushImage should be used.
*/
JXL_DEC_DC_IMAGE = 0x800,
/** Informative event by @ref JxlDecoderProcessInput
* "JxlDecoderProcessInput": full frame (or layer, in case coalescing is
* disabled) is decoded. @ref JxlDecoderSetImageOutBuffer must be used after
* getting the basic image information to be able to get the image pixels, if
* not this return status only indicates we're past this point in the
* codestream. This event occurs max once per frame and always later than @ref
* JXL_DEC_DC_IMAGE.
* codestream. This event occurs max once per frame.
* In this case, @ref JxlDecoderReleaseInput will return all bytes from the
* end of the frame (or if @ref JXL_DEC_JPEG_RECONSTRUCTION is subscribed to,
* from the end of the last box that is needed for jpeg reconstruction) as
@ -599,8 +573,6 @@ JXL_EXPORT JxlDecoderStatus JxlDecoderSetCoalescing(JxlDecoder* dec,
* available and this informative event is subscribed to.
* @return @ref JXL_DEC_PREVIEW_IMAGE when preview pixel information is
* available and output in the preview buffer.
* @return @ref JXL_DEC_DC_IMAGE when DC pixel information (8x8 downscaled
* version of the image) is available and output is in the DC buffer.
* @return @ref JXL_DEC_FULL_IMAGE when all pixel information at highest detail
* is available and has been output in the pixel buffer.
*/
@ -992,44 +964,6 @@ JXL_EXPORT JxlDecoderStatus JxlDecoderGetFrameName(const JxlDecoder* dec,
JXL_EXPORT JxlDecoderStatus JxlDecoderGetExtraChannelBlendInfo(
const JxlDecoder* dec, size_t index, JxlBlendInfo* blend_info);
/**
* Returns the minimum size in bytes of the DC image output buffer
* for the given format. This is the buffer for @ref JxlDecoderSetDCOutBuffer.
* Requires the basic image information is available in the decoder.
*
* @param dec decoder object
* @param format format of pixels
* @param size output value, buffer size in bytes
* @return @ref JXL_DEC_SUCCESS on success, @ref JXL_DEC_ERROR on error, such as
* information not available yet.
*
* @deprecated The DC feature in this form will be removed. Use @ref
* JxlDecoderFlushImage for progressive rendering.
*/
JXL_DEPRECATED JXL_EXPORT JxlDecoderStatus JxlDecoderDCOutBufferSize(
const JxlDecoder* dec, const JxlPixelFormat* format, size_t* size);
/**
* Sets the buffer to write the lower resolution (8x8 sub-sampled) DC image
* to. The size of the buffer must be at least as large as given by @ref
* JxlDecoderDCOutBufferSize. The buffer follows the format described by
* JxlPixelFormat. The DC image has dimensions ceil(xsize / 8) * ceil(ysize /
* 8). The buffer is owned by the caller.
*
* @param dec decoder object
* @param format format of pixels. Object owned by user and its contents are
* copied internally.
* @param buffer buffer type to output the pixel data to
* @param size size of buffer in bytes
* @return @ref JXL_DEC_SUCCESS on success, @ref JXL_DEC_ERROR on error, such as
* size too small.
*
* @deprecated The DC feature in this form will be removed. Use @ref
* JxlDecoderFlushImage for progressive rendering.
*/
JXL_DEPRECATED JXL_EXPORT JxlDecoderStatus JxlDecoderSetDCOutBuffer(
JxlDecoder* dec, const JxlPixelFormat* format, void* buffer, size_t size);
/**
* Returns the minimum size in bytes of the image output pixel buffer for the
* given format. This is the buffer for @ref JxlDecoderSetImageOutBuffer.

Some files were not shown because too many files have changed in this diff Show More
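The deprecation notes above consistently redirect DC-image users to JxlDecoderFlushImage for progressive rendering. An illustrative sketch of that pattern with the public decoder API follows; the RGBA/uint8 output format and the abbreviated error handling are just examples, not part of this change.

#include <jxl/decode.h>

#include <cstddef>
#include <cstdint>
#include <vector>

bool DecodeProgressively(const uint8_t* data, size_t size,
                         std::vector<uint8_t>* pixels) {
  JxlDecoder* dec = JxlDecoderCreate(nullptr);
  if (!dec) return false;
  if (JXL_DEC_SUCCESS !=
      JxlDecoderSubscribeEvents(dec, JXL_DEC_BASIC_INFO | JXL_DEC_FULL_IMAGE)) {
    JxlDecoderDestroy(dec);
    return false;
  }
  JxlDecoderSetInput(dec, data, size);
  JxlDecoderCloseInput(dec);  // no more input will arrive

  JxlPixelFormat format = {4, JXL_TYPE_UINT8, JXL_NATIVE_ENDIAN, 0};
  bool have_pixels = false;
  for (;;) {
    JxlDecoderStatus status = JxlDecoderProcessInput(dec);
    if (status == JXL_DEC_NEED_IMAGE_OUT_BUFFER) {
      size_t buffer_size;
      if (JXL_DEC_SUCCESS !=
          JxlDecoderImageOutBufferSize(dec, &format, &buffer_size)) {
        break;
      }
      pixels->resize(buffer_size);
      if (JXL_DEC_SUCCESS !=
          JxlDecoderSetImageOutBuffer(dec, &format, pixels->data(),
                                      pixels->size())) {
        break;
      }
    } else if (status == JXL_DEC_NEED_MORE_INPUT) {
      // Truncated input: flush whatever has been decoded so far into the
      // output buffer, which replaces the removed DC-image path.
      have_pixels = (JXL_DEC_SUCCESS == JxlDecoderFlushImage(dec));
      break;
    } else if (status == JXL_DEC_FULL_IMAGE) {
      have_pixels = true;  // full-resolution pixels are now in *pixels
    } else if (status == JXL_DEC_SUCCESS) {
      break;  // all subscribed events have been delivered
    } else if (status == JXL_DEC_BASIC_INFO) {
      // Dimensions etc. could be read with JxlDecoderGetBasicInfo; omitted.
    } else {
      break;  // JXL_DEC_ERROR or an unexpected event
    }
  }
  JxlDecoderDestroy(dec);
  return have_pixels;
}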