Bug 1807473 - Update libjxl and highway r=tnikkel

Differential Revision: https://phabricator.services.mozilla.com/D166317
Authored by Kagami Sascha Rosylight, 2023-01-09 16:54:44 +00:00
parent bb9d3ed10b
commit c9046ede50
195 changed files with 17395 additions and 7868 deletions


@ -20,11 +20,11 @@ origin:
# Human-readable identifier for this version/release
# Generally "version NNN", "tag SSS", "bookmark SSS"
release: 22e3d7276f4157d4a47586ba9fd91dd6303f441a
release: f670ea580bb70b4113b63b9cdaa42ba9b10cd13a (2022-11-18T10:04:25Z).
# Revision to pull in
# Must be a long or short commit SHA (long preferred)
revision: 22e3d7276f4157d4a47586ba9fd91dd6303f441a
revision: f670ea580bb70b4113b63b9cdaa42ba9b10cd13a
# The package's license, where possible using the mnemonic from
# https://spdx.org/licenses/


@ -10,9 +10,9 @@ origin:
url: https://github.com/libjxl/libjxl
release: afa493d9c7c8b47b6ce709180a74a49085291776 (2022-11-12T22:27:21Z).
release: 31e38dae584bae991631750ed6a04f1f6323846a (2023-01-09T10:57:58Z).
revision: afa493d9c7c8b47b6ce709180a74a49085291776
revision: 31e38dae584bae991631750ed6a04f1f6323846a
license: Apache-2.0


@ -161,6 +161,8 @@ cc_library(
# These are textual because config macros influence them:
"hwy/detect_targets.h", # private
"hwy/targets.h",
# This .cc file #includes itself through foreach_target.h
"hwy/per_target.cc",
# End of list
"hwy/highway.h", # public
"hwy/foreach_target.h", # public
@ -179,7 +181,10 @@ cc_library(
"hwy/ops/x86_512-inl.h",
# Select avoids recompiling native arch if only non-native changed
] + select({
":compiler_emscripten": ["hwy/ops/wasm_128-inl.h"],
":compiler_emscripten": [
"hwy/ops/wasm_128-inl.h",
"hwy/ops/wasm_256-inl.h",
],
"//conditions:default": [],
}) + select({
"@platforms//cpu:riscv64": ["hwy/ops/rvv-inl.h"],
@ -201,6 +206,18 @@ cc_library(
],
)
cc_library(
name = "bit_pack",
compatible_with = [],
copts = COPTS,
textual_hdrs = [
"hwy/contrib/bit_pack/bit_pack-inl.h",
],
deps = [
":hwy",
],
)
cc_library(
name = "dot",
compatible_with = [],
@ -303,6 +320,7 @@ HWY_TESTS = [
("hwy/contrib/algo/", "copy_test"),
("hwy/contrib/algo/", "find_test"),
("hwy/contrib/algo/", "transform_test"),
("hwy/contrib/bit_pack/", "bit_pack_test"),
("hwy/contrib/dot/", "dot_test"),
("hwy/contrib/image/", "image_test"),
("hwy/contrib/math/", "math_test"),
@ -349,6 +367,7 @@ HWY_TEST_COPTS = select({
HWY_TEST_DEPS = [
":algo",
":bit_pack",
":dot",
":hwy",
":hwy_test_util",


@ -19,7 +19,13 @@ if(POLICY CMP0083)
cmake_policy(SET CMP0083 NEW)
endif()
project(hwy VERSION 1.0.1) # Keep in sync with highway.h version
# Workaround for 3.19 raising error 'IMPORTED_LOCATION not set for imported
# target "GTest::gtest_main"'.
if(POLICY CMP0111)
cmake_policy(SET CMP0111 OLD)
endif()
project(hwy VERSION 1.0.2) # Keep in sync with highway.h version
# Directly define the ABI version from the cmake project() version values:
set(LIBRARY_VERSION "${hwy_VERSION}")
@ -27,6 +33,10 @@ set(LIBRARY_SOVERSION ${hwy_VERSION_MAJOR})
set(CMAKE_CXX_EXTENSIONS OFF)
list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake")
# Search for Atomics implementation:
find_package(Atomics REQUIRED)
# Enabled PIE binaries by default if supported.
include(CheckPIESupported OPTIONAL RESULT_VARIABLE CHECK_PIE_SUPPORTED)
if(CHECK_PIE_SUPPORTED)
@ -51,6 +61,7 @@ set(HWY_WARNINGS_ARE_ERRORS OFF CACHE BOOL "Add -Werror flag?")
set(HWY_ENABLE_CONTRIB ON CACHE BOOL "Include contrib/")
set(HWY_ENABLE_EXAMPLES ON CACHE BOOL "Build examples")
set(HWY_ENABLE_INSTALL ON CACHE BOOL "Install library")
set(HWY_ENABLE_TESTS ON CACHE BOOL "Enable HWY tests")
include(CheckCXXSourceCompiles)
check_cxx_source_compiles(
@ -111,6 +122,7 @@ set(HWY_SOURCES
hwy/ops/arm_sve-inl.h
hwy/ops/emu128-inl.h
hwy/ops/generic_ops-inl.h
hwy/ops/rvv-inl.h
hwy/ops/scalar-inl.h
hwy/ops/set_macros-inl.h
hwy/ops/shared-inl.h
@ -225,8 +237,11 @@ else()
endif() # HWY_CMAKE_ARM7
if(HWY_RISCV)
list(APPEND HWY_FLAGS -march=rv64gcv1p0)
if(${CMAKE_CXX_COMPILER_ID} MATCHES "Clang")
# Not yet supported by GCC. When runtime dispatch is supported and
# implemented, we will remove v from the required flags. Until then, using
# clang for RISC-V will require the CPU to support the V extension (1.0).
list(APPEND HWY_FLAGS -march=rv64gcv1p0)
list(APPEND HWY_FLAGS -menable-experimental-extensions)
endif()
endif()
@ -277,16 +292,29 @@ target_include_directories(hwy PUBLIC
target_compile_features(hwy PUBLIC cxx_std_11)
set_target_properties(hwy PROPERTIES
LINK_DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/hwy/hwy.version)
# For GCC __atomic_store_8, see #887
target_link_libraries(hwy PRIVATE ${ATOMICS_LIBRARIES})
# not supported by MSVC/Clang, safe to skip (we use DLLEXPORT annotations)
if(UNIX AND NOT APPLE)
if(NOT HWY_EMSCRIPTEN)
# For GCC __atomic_store_8, see #887
target_link_libraries(hwy atomic)
endif()
# not supported by MSVC/Clang, safe to skip (we use DLLEXPORT annotations)
set_property(TARGET hwy APPEND_STRING PROPERTY
LINK_FLAGS " -Wl,--version-script=${CMAKE_CURRENT_SOURCE_DIR}/hwy/hwy.version")
endif()
if (CMAKE_SYSTEM_PROCESSOR MATCHES "unknown")
# uname -p is broken on this system. Try uname -m
EXECUTE_PROCESS( COMMAND uname -m
OUTPUT_STRIP_TRAILING_WHITESPACE
ERROR_QUIET
OUTPUT_VARIABLE HWY_ARCH)
else (CMAKE_SYSTEM_PROCESSOR MATCHES "unknown")
set(HWY_ARCH ${CMAKE_SYSTEM_PROCESSOR})
endif (CMAKE_SYSTEM_PROCESSOR MATCHES "unknown")
message(STATUS "Architecture: " ${HWY_ARCH})
if (HWY_ARCH MATCHES "mips")
target_link_options(hwy PUBLIC "LINKER:-z,noexecstack")
endif (HWY_ARCH MATCHES "mips")
if (HWY_ENABLE_CONTRIB)
add_library(hwy_contrib ${HWY_LIBRARY_TYPE} ${HWY_CONTRIB_SOURCES})
target_link_libraries(hwy_contrib hwy)
@ -426,7 +454,7 @@ endif() # HWY_ENABLE_EXAMPLES
include(CTest)
if(BUILD_TESTING)
if(BUILD_TESTING AND HWY_ENABLE_TESTS)
enable_testing()
include(GoogleTest)
@ -458,13 +486,6 @@ set(gtest_force_shared_crt ON CACHE BOOL "" FORCE)
add_subdirectory(${CMAKE_CURRENT_BINARY_DIR}/googletest-src
${CMAKE_CURRENT_BINARY_DIR}/googletest-build
EXCLUDE_FROM_ALL)
# The gtest/gtest_main targets carry header search path
# dependencies automatically when using CMake 2.8.11 or
# later. Otherwise we have to add them here ourselves.
if (CMAKE_VERSION VERSION_LESS 2.8.11)
include_directories("${gtest_SOURCE_DIR}/include")
endif()
endif() # HWY_SYSTEM_GTEST
set(HWY_TEST_FILES
@ -517,7 +538,11 @@ list(APPEND HWY_TEST_FILES
endif() # HWY_ENABLE_CONTRIB
if(HWY_SYSTEM_GTEST)
set(HWY_GTEST_LIBS GTest::GTest GTest::Main)
if (CMAKE_VERSION VERSION_LESS 3.20)
set(HWY_GTEST_LIBS GTest::GTest GTest::Main)
else()
set(HWY_GTEST_LIBS GTest::gtest GTest::gtest_main)
endif()
else()
set(HWY_GTEST_LIBS gtest gtest_main)
endif()
@ -534,7 +559,9 @@ foreach (TESTFILE IN LISTS HWY_TEST_FILES)
# that include us may set them.
target_compile_options(${TESTNAME} PRIVATE -DHWY_IS_TEST=1)
target_link_libraries(${TESTNAME} ${HWY_TEST_LIBS} ${HWY_GTEST_LIBS})
target_link_libraries(${TESTNAME} PRIVATE ${HWY_TEST_LIBS} ${HWY_GTEST_LIBS})
# For GCC __atomic_store_8, see #887
target_link_libraries(${TESTNAME} PRIVATE ${ATOMICS_LIBRARIES})
# Output test targets in the test directory.
set_target_properties(${TESTNAME} PROPERTIES RUNTIME_OUTPUT_DIRECTORY "tests")


@ -55,7 +55,8 @@ layouts, and aligned/padded allocations.
Online demos using Compiler Explorer:
- [generating code for multiple targets](https://gcc.godbolt.org/z/n6rx6xK5h) (recommended)
- [multiple targets with dynamic dispatch](https://gcc.godbolt.org/z/zP7MYe9Yf)
(recommended)
- [single target using -m flags](https://gcc.godbolt.org/z/rGnjMevKG)
Projects using Highway: (to add yours, feel free to raise an issue or contact us
@ -74,6 +75,10 @@ Supported targets: scalar, S-SSE3, SSE4, AVX2, AVX-512, AVX3_DL (~Icelake,
requires opt-in by defining `HWY_WANT_AVX3_DL`), NEON (ARMv7 and v8), SVE, SVE2,
WASM SIMD, RISC-V V.
`HWY_WASM_EMU256` is a 2x unrolled version of wasm128 and is enabled if
`HWY_WANT_WASM2` is defined. This will remain supported until it is potentially
superseded by a future version of WASM.
SVE was initially tested using farm_sve (see acknowledgments).
### Versioning
@ -134,6 +139,10 @@ Or you can run `run_tests.sh` (`run_tests.bat` on Windows).
Bazel is also supported for building, but it is not as widely used/tested.
When building for Arm v7, a limitation of current compilers requires you to add
`-DHWY_CMAKE_ARM7:BOOL=ON` to the CMake command line; see #834 and #1032. We
understand that work is underway to remove this limitation.
## Quick start
You can use the `benchmark` inside examples/ as a starting point.
@ -142,6 +151,9 @@ A [quick-reference page](g3doc/quick_reference.md) briefly lists all operations
and their parameters, and the [instruction_matrix](g3doc/instruction_matrix.pdf)
indicates the number of instructions per operation.
The [FAQ](g3doc/faq.md) answers questions about portability, API design and
where to find more information.
We recommend using full SIMD vectors whenever possible for maximum performance
portability. To obtain them, pass a `ScalableTag<float>` (or equivalently
`HWY_FULL(float)`) tag to functions such as `Zero/Set/Load`. There are two
@ -163,8 +175,8 @@ Due to ADL restrictions, user code calling Highway ops must either:
hn::Add()`; or
* add using-declarations for each op used: `using hwy::HWY_NAMESPACE::Add;`.
Additionally, each function that calls Highway ops must either be prefixed with
`HWY_ATTR`, OR reside between `HWY_BEFORE_NAMESPACE()` and
Additionally, each function that calls Highway ops (such as `Load`) must either
be prefixed with `HWY_ATTR`, OR reside between `HWY_BEFORE_NAMESPACE()` and
`HWY_AFTER_NAMESPACE()`. Lambda functions currently require `HWY_ATTR` before
their opening brace.
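For example, a minimal sketch of a function written against these rules (the function and array names are illustrative, not part of Highway):

```
#include "hwy/highway.h"
namespace hn = hwy::HWY_NAMESPACE;

// HWY_ATTR enables the compiled-for target's instructions for this function.
HWY_ATTR void AddOne(const float* HWY_RESTRICT in, float* HWY_RESTRICT out,
                     size_t size) {
  const hn::ScalableTag<float> d;  // full (scalable) vector of float
  const size_t N = hn::Lanes(d);
  // Remainder handling omitted for brevity.
  for (size_t i = 0; i + N <= size; i += N) {
    hn::Store(hn::Add(hn::Load(d, in + i), hn::Set(d, 1.0f)), d, out + i);
  }
}
```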
@ -186,6 +198,27 @@ they use static or dynamic dispatch.
[quick-reference](g3doc/quick_reference.md)) if `HWY_TARGET_INCLUDE` is
defined and `foreach_target.h` is included.
When using dynamic dispatch, `foreach_target.h` is included from translation
units (.cc files), not headers. Headers containing vector code shared between
several translation units require a special include guard, for example the
following taken from `examples/skeleton-inl.h`:
```
#if defined(HIGHWAY_HWY_EXAMPLES_SKELETON_INL_H_) == defined(HWY_TARGET_TOGGLE)
#ifdef HIGHWAY_HWY_EXAMPLES_SKELETON_INL_H_
#undef HIGHWAY_HWY_EXAMPLES_SKELETON_INL_H_
#else
#define HIGHWAY_HWY_EXAMPLES_SKELETON_INL_H_
#endif
#include "hwy/highway.h"
// Your vector code
#endif
```
By convention, we name such headers `-inl.h` because their contents (often
function templates) are usually inlined.
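The translation-unit side then follows the same pattern as `examples/skeleton.cc` (and the vqsort_*.cc files below). A rough sketch, with illustrative names and an assumed file path:

```
// my_module.cc (hypothetical path; HWY_TARGET_INCLUDE must name this file).
#undef HWY_TARGET_INCLUDE
#define HWY_TARGET_INCLUDE "my_module.cc"
#include "hwy/foreach_target.h"  // re-includes this file once per target
// Must come after foreach_target.h:
#include "hwy/highway.h"

HWY_BEFORE_NAMESPACE();
namespace project {
namespace HWY_NAMESPACE {
namespace hn = hwy::HWY_NAMESPACE;

void MulByTwo(float* HWY_RESTRICT p, size_t size) {
  const hn::ScalableTag<float> d;
  for (size_t i = 0; i + hn::Lanes(d) <= size; i += hn::Lanes(d)) {
    const auto v = hn::Load(d, p + i);
    hn::Store(hn::Add(v, v), d, p + i);
  }
}

}  // namespace HWY_NAMESPACE
}  // namespace project
HWY_AFTER_NAMESPACE();

#if HWY_ONCE
namespace project {
HWY_EXPORT(MulByTwo);  // table of per-target pointers, filled by each pass
void CallMulByTwo(float* p, size_t size) {
  HWY_DYNAMIC_DISPATCH(MulByTwo)(p, size);  // selects the best target once
}
}  // namespace project
#endif  // HWY_ONCE
```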
## Compiler flags
Applications should be compiled with optimizations enabled - without inlining,


@ -0,0 +1,56 @@
# Original issue:
# * https://gitlab.kitware.com/cmake/cmake/-/issues/23021#note_1098733
#
# For reference:
# * https://gcc.gnu.org/wiki/Atomic/GCCMM
#
# riscv64 specific:
# * https://lists.debian.org/debian-riscv/2022/01/msg00009.html
#
# ATOMICS_FOUND - system has c++ atomics
# ATOMICS_LIBRARIES - libraries needed to use c++ atomics
include(CheckCXXSourceCompiles)
# RISC-V only has 32-bit and 64-bit atomic instructions. GCC is supposed
# to convert smaller atomics to those larger ones via masking and
# shifting like LLVM, but it's a known bug that it does not. This means
# anything that wants to use atomics on 1-byte or 2-byte types needs
# -latomic, but not 4-byte or 8-byte (though it does no harm).
set(atomic_code
"
#include <atomic>
#include <cstdint>
std::atomic<uint8_t> n8 (0); // riscv64
std::atomic<uint64_t> n64 (0); // armel, mipsel, powerpc
int main() {
++n8;
++n64;
return 0;
}")
# https://gitlab.kitware.com/cmake/cmake/-/issues/24063
set(CMAKE_CXX_STANDARD 11)
check_cxx_source_compiles("${atomic_code}" ATOMICS_LOCK_FREE_INSTRUCTIONS)
if(ATOMICS_LOCK_FREE_INSTRUCTIONS)
set(ATOMICS_FOUND TRUE)
set(ATOMICS_LIBRARIES)
else()
set(CMAKE_REQUIRED_LIBRARIES "-latomic")
check_cxx_source_compiles("${atomic_code}" ATOMICS_IN_LIBRARY)
set(CMAKE_REQUIRED_LIBRARIES)
if(ATOMICS_IN_LIBRARY)
set(ATOMICS_LIBRARY atomic)
include(FindPackageHandleStandardArgs)
find_package_handle_standard_args(Atomics DEFAULT_MSG ATOMICS_LIBRARY)
set(ATOMICS_LIBRARIES ${ATOMICS_LIBRARY})
unset(ATOMICS_LIBRARY)
else()
if(Atomics_FIND_REQUIRED)
message(FATAL_ERROR "Neither lock free instructions nor -latomic found.")
endif()
endif()
endif()
unset(atomic_code)
unset(CMAKE_CXX_STANDARD)


@ -1,3 +1,18 @@
highway (1.0.2-1) UNRELEASED; urgency=medium
* Add ExclusiveNeither, FindKnownFirstTrue, Ne128
* Add 16-bit SumOfLanes/ReorderWidenMulAccumulate/ReorderDemote2To
* Faster sort for low-entropy input, improved pivot selection
* Add GN build system, Highway FAQ, k32v32 type to vqsort
* CMake: Support find_package(GTest), add rvv-inl.h, add HWY_ENABLE_TESTS
* Fix MIPS and C++20 build, Apple LLVM 10.3 detection, EMU128 AllTrue on RVV
* Fix missing exec_prefix, RVV build, warnings, libatomic linking
* Work around GCC 10.4 issue, disabled RDCYCLE, arm7 with vfpv3
* Documentation/example improvements
* Support static dispatch to SVE2_128 and SVE_256
-- Jan Wassenberg <janwas@google.com> Thu, 27 Oct 2022 17:00:00 +0200
highway (1.0.1-1) UNRELEASED; urgency=medium
* Add Eq128, i64 Mul, unsigned->float ConvertTo

third_party/highway/hwy.gni (new vendored file)

@ -0,0 +1,53 @@
_hwy = get_path_info("hwy", "abspath")
hwy_public = [
# Public
"$_hwy/aligned_allocator.h",
"$_hwy/base.h",
"$_hwy/cache_control.h",
"$_hwy/per_target.h",
"$_hwy/print.h",
# Public, textual
"$_hwy/foreach_target.h",
"$_hwy/highway_export.h",
"$_hwy/highway.h",
"$_hwy/print-inl.h",
# Private
"$_hwy/detect_compiler_arch.h",
"$_hwy/detect_targets.h",
"$_hwy/targets.h",
# Private, textual:
"$_hwy/ops/arm_neon-inl.h",
"$_hwy/ops/arm_sve-inl.h",
"$_hwy/ops/emu128-inl.h",
"$_hwy/ops/generic_ops-inl.h",
"$_hwy/ops/scalar-inl.h",
"$_hwy/ops/set_macros-inl.h",
"$_hwy/ops/shared-inl.h",
"$_hwy/ops/x86_128-inl.h",
"$_hwy/ops/x86_256-inl.h",
"$_hwy/ops/x86_512-inl.h",
]
hwy_sources = [
"$_hwy/aligned_allocator.cc",
"$_hwy/per_target.cc",
"$_hwy/print.cc",
"$_hwy/targets.cc",
]
hwy_contrib_public = [
"$_hwy/contrib/algo/copy-inl.h",
"$_hwy/contrib/algo/find-inl.h",
"$_hwy/contrib/algo/transform-inl.h",
"$_hwy/contrib/dot/dot-inl.h",
"$_hwy/contrib/image/image.h",
"$_hwy/contrib/math/math-inl.h",
]
hwy_contrib_sources = [
"$_hwy/contrib/image/image.cc",
]


@ -48,7 +48,7 @@ class SampleObject {
class FakeAllocator {
public:
// static AllocPtr and FreePtr member to be used with the alligned
// static AllocPtr and FreePtr member to be used with the aligned
// allocator. These functions calls the private non-static members.
static void* StaticAlloc(void* opaque, size_t bytes) {
return reinterpret_cast<FakeAllocator*>(opaque)->Alloc(bytes);


@ -143,7 +143,7 @@
#define HWY_DEFAULT_UNROLL HWY_UNROLL()
#else
#define HWY_UNROLL(factor)
#define HWY_DEFAULT_UNROLL HWY_UNROLL()
#define HWY_DEFAULT_UNROLL
#endif
@ -293,6 +293,13 @@ struct alignas(16) K64V64 {
uint64_t key;
};
// 32 bit key plus 32 bit value. Allows vqsort recursions to terminate earlier
// than when considering both to be a 64-bit key.
struct alignas(8) K32V32 {
uint32_t value; // little-endian layout
uint32_t key;
};
#pragma pack(pop)
static inline HWY_MAYBE_UNUSED bool operator<(const uint128_t& a,
@ -304,6 +311,10 @@ static inline HWY_MAYBE_UNUSED bool operator>(const uint128_t& a,
const uint128_t& b) {
return b < a;
}
static inline HWY_MAYBE_UNUSED bool operator==(const uint128_t& a,
const uint128_t& b) {
return a.lo == b.lo && a.hi == b.hi;
}
static inline HWY_MAYBE_UNUSED bool operator<(const K64V64& a,
const K64V64& b) {
@ -314,6 +325,24 @@ static inline HWY_MAYBE_UNUSED bool operator>(const K64V64& a,
const K64V64& b) {
return b < a;
}
static inline HWY_MAYBE_UNUSED bool operator==(const K64V64& a,
const K64V64& b) {
return a.key == b.key;
}
static inline HWY_MAYBE_UNUSED bool operator<(const K32V32& a,
const K32V32& b) {
return a.key < b.key;
}
// Required for std::greater.
static inline HWY_MAYBE_UNUSED bool operator>(const K32V32& a,
const K32V32& b) {
return b < a;
}
static inline HWY_MAYBE_UNUSED bool operator==(const K32V32& a,
const K32V32& b) {
return a.key == b.key;
}
//------------------------------------------------------------------------------
// Controlling overload resolution (SFINAE)
@ -369,6 +398,8 @@ HWY_API constexpr bool IsSame() {
hwy::EnableIf<sizeof(T) != (bytes)>* = nullptr
#define HWY_IF_LANE_SIZE_LT(T, bytes) \
hwy::EnableIf<sizeof(T) < (bytes)>* = nullptr
#define HWY_IF_LANE_SIZE_GE(T, bytes) \
hwy::EnableIf<sizeof(T) >= (bytes)>* = nullptr
#define HWY_IF_LANES_PER_BLOCK(T, N, LANES) \
hwy::EnableIf<HWY_MIN(sizeof(T) * N, 16) / sizeof(T) == (LANES)>* = nullptr
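As an illustrative (non-upstream) sketch, the new `HWY_IF_LANE_SIZE_GE` gates an overload on lane width the same way the existing macros do:

```
// Hypothetical helpers, not part of Highway: chosen by SFINAE on sizeof(T).
template <typename T, HWY_IF_LANE_SIZE_GE(T, 4)>
constexpr size_t HalfLaneBytes() {
  return sizeof(T) / 2;  // selected for 4- and 8-byte lanes
}
template <typename T, HWY_IF_LANE_SIZE_LT(T, 4)>
constexpr size_t HalfLaneBytes() {
  return 1;  // selected for 1- and 2-byte lanes
}
static_assert(HalfLaneBytes<uint64_t>() == 4, "wide lanes use the GE overload");
static_assert(HalfLaneBytes<uint8_t>() == 1, "narrow lanes use the LT overload");
```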
@ -401,16 +432,14 @@ struct Relations<uint8_t> {
using Unsigned = uint8_t;
using Signed = int8_t;
using Wide = uint16_t;
enum { is_signed = 0 };
enum { is_float = 0 };
enum { is_signed = 0, is_float = 0 };
};
template <>
struct Relations<int8_t> {
using Unsigned = uint8_t;
using Signed = int8_t;
using Wide = int16_t;
enum { is_signed = 1 };
enum { is_float = 0 };
enum { is_signed = 1, is_float = 0 };
};
template <>
struct Relations<uint16_t> {
@ -418,8 +447,7 @@ struct Relations<uint16_t> {
using Signed = int16_t;
using Wide = uint32_t;
using Narrow = uint8_t;
enum { is_signed = 0 };
enum { is_float = 0 };
enum { is_signed = 0, is_float = 0 };
};
template <>
struct Relations<int16_t> {
@ -427,8 +455,7 @@ struct Relations<int16_t> {
using Signed = int16_t;
using Wide = int32_t;
using Narrow = int8_t;
enum { is_signed = 1 };
enum { is_float = 0 };
enum { is_signed = 1, is_float = 0 };
};
template <>
struct Relations<uint32_t> {
@ -437,8 +464,7 @@ struct Relations<uint32_t> {
using Float = float;
using Wide = uint64_t;
using Narrow = uint16_t;
enum { is_signed = 0 };
enum { is_float = 0 };
enum { is_signed = 0, is_float = 0 };
};
template <>
struct Relations<int32_t> {
@ -447,8 +473,7 @@ struct Relations<int32_t> {
using Float = float;
using Wide = int64_t;
using Narrow = int16_t;
enum { is_signed = 1 };
enum { is_float = 0 };
enum { is_signed = 1, is_float = 0 };
};
template <>
struct Relations<uint64_t> {
@ -457,8 +482,7 @@ struct Relations<uint64_t> {
using Float = double;
using Wide = uint128_t;
using Narrow = uint32_t;
enum { is_signed = 0 };
enum { is_float = 0 };
enum { is_signed = 0, is_float = 0 };
};
template <>
struct Relations<int64_t> {
@ -466,15 +490,13 @@ struct Relations<int64_t> {
using Signed = int64_t;
using Float = double;
using Narrow = int32_t;
enum { is_signed = 1 };
enum { is_float = 0 };
enum { is_signed = 1, is_float = 0 };
};
template <>
struct Relations<uint128_t> {
using Unsigned = uint128_t;
using Narrow = uint64_t;
enum { is_signed = 0 };
enum { is_float = 0 };
enum { is_signed = 0, is_float = 0 };
};
template <>
struct Relations<float16_t> {
@ -482,16 +504,14 @@ struct Relations<float16_t> {
using Signed = int16_t;
using Float = float16_t;
using Wide = float;
enum { is_signed = 1 };
enum { is_float = 1 };
enum { is_signed = 1, is_float = 1 };
};
template <>
struct Relations<bfloat16_t> {
using Unsigned = uint16_t;
using Signed = int16_t;
using Wide = float;
enum { is_signed = 1 };
enum { is_float = 1 };
enum { is_signed = 1, is_float = 1 };
};
template <>
struct Relations<float> {
@ -500,8 +520,7 @@ struct Relations<float> {
using Float = float;
using Wide = double;
using Narrow = float16_t;
enum { is_signed = 1 };
enum { is_float = 1 };
enum { is_signed = 1, is_float = 1 };
};
template <>
struct Relations<double> {
@ -509,8 +528,7 @@ struct Relations<double> {
using Signed = int64_t;
using Float = double;
using Narrow = float;
enum { is_signed = 1 };
enum { is_float = 1 };
enum { is_signed = 1, is_float = 1 };
};
template <size_t N>
@ -649,6 +667,20 @@ constexpr double HighestValue<double>() {
return 1.7976931348623158e+308;
}
// Difference between 1.0 and the next representable value.
template <typename T>
HWY_API constexpr T Epsilon() {
return 1;
}
template <>
constexpr float Epsilon<float>() {
return 1.192092896e-7f;
}
template <>
constexpr double Epsilon<double>() {
return 2.2204460492503131e-16;
}
// Returns width in bits of the mantissa field in IEEE binary32/64.
template <typename T>
constexpr int MantissaBits() {


@ -13,7 +13,7 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include <algorithm>
#include <algorithm> // std::find_if
#include <vector>
#include "hwy/aligned_allocator.h"

File diff suppressed because it is too large.


@ -0,0 +1,177 @@
// Copyright 2022 Google LLC
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <stdio.h>
#include <vector>
#include "hwy/aligned_allocator.h"
#include "hwy/base.h"
#include "hwy/nanobenchmark.h"
// clang-format off
#undef HWY_TARGET_INCLUDE
#define HWY_TARGET_INCLUDE "hwy/contrib/bit_pack/bit_pack_test.cc" // NOLINT
#include "hwy/foreach_target.h" // IWYU pragma: keep
#include "hwy/contrib/bit_pack/bit_pack-inl.h"
#include "hwy/tests/test_util-inl.h"
// clang-format on
#ifndef HWY_BIT_PACK_BENCHMARK
#define HWY_BIT_PACK_BENCHMARK 0
#endif
HWY_BEFORE_NAMESPACE();
namespace hwy {
// Used to prevent running benchmark (slow) for partial vectors and targets
// except the best available. Global, not per-target, hence must be outside
// HWY_NAMESPACE. Declare first because HWY_ONCE is only true after some code
// has been re-included.
extern size_t last_bits;
extern uint64_t best_target;
#if HWY_ONCE
size_t last_bits = 0;
uint64_t best_target = ~0ull;
#endif
namespace HWY_NAMESPACE {
template <size_t kBits, typename T>
T Random(RandomState& rng) {
return static_cast<T>(Random32(&rng) & kBits);
}
template <typename T>
class Checker {
public:
explicit Checker(size_t num) { raw_.reserve(num); }
void NotifyRaw(T raw) { raw_.push_back(raw); }
void NotifyRawOutput(size_t bits, T raw) {
if (raw_[num_verified_] != raw) {
HWY_ABORT("%zu bits: pos %zu of %zu, expected %.0f actual %.0f\n", bits,
num_verified_, raw_.size(),
static_cast<double>(raw_[num_verified_]),
static_cast<double>(raw));
}
++num_verified_;
}
private:
std::vector<T> raw_;
size_t num_verified_ = 0;
};
template <class PackT>
struct TestPack {
template <typename T, class D>
void operator()(T /* t */, D d) {
const size_t N = Lanes(d);
RandomState rng(N * 129);
const size_t num = N * PackT::kRawVectors;
const size_t packed_size = N * PackT::kPackedVectors;
Checker<T> checker(num);
AlignedFreeUniquePtr<T[]> raw = hwy::AllocateAligned<T>(num);
AlignedFreeUniquePtr<T[]> raw2 = hwy::AllocateAligned<T>(num);
AlignedFreeUniquePtr<T[]> packed = hwy::AllocateAligned<T>(packed_size);
for (size_t i = 0; i < num; ++i) {
raw[i] = Random<PackT::kBits, T>(rng);
checker.NotifyRaw(raw[i]);
}
best_target = HWY_MIN(best_target, HWY_TARGET);
const bool run_bench = HWY_BIT_PACK_BENCHMARK &&
(PackT::kBits != last_bits) &&
(HWY_TARGET == best_target);
last_bits = PackT::kBits;
if (run_bench) {
const size_t kNumInputs = 1;
const size_t num_items = num * size_t(Unpredictable1());
const FuncInput inputs[kNumInputs] = {num_items};
Result results[kNumInputs];
Params p;
p.verbose = false;
p.max_evals = 7;
p.target_rel_mad = 0.002;
const size_t num_results = MeasureClosure(
[&](FuncInput) HWY_ATTR {
PackT().Pack(d, raw.get(), packed.get());
PackT().Unpack(d, packed.get(), raw2.get());
return raw2[Random32(&rng) % num];
},
inputs, kNumInputs, results, p);
if (num_results != kNumInputs) {
fprintf(stderr, "MeasureClosure failed.\n");
return;
}
// Print cycles per element
for (size_t i = 0; i < num_results; ++i) {
const double cycles_per_item =
results[i].ticks / static_cast<double>(results[i].input);
const double mad = results[i].variability * cycles_per_item;
printf("Bits:%2d elements:%3d cyc/elt:%6.3f (+/- %5.3f)\n",
static_cast<int>(PackT::kBits),
static_cast<int>(results[i].input), cycles_per_item, mad);
}
} else {
PackT().Pack(d, raw.get(), packed.get());
PackT().Unpack(d, packed.get(), raw2.get());
}
for (size_t i = 0; i < num; ++i) {
checker.NotifyRawOutput(PackT::kBits, raw2[i]);
}
}
};
void TestAllPack8() {
ForShrinkableVectors<TestPack<detail::Pack8<1>>>()(uint8_t());
ForShrinkableVectors<TestPack<detail::Pack8<2>>>()(uint8_t());
ForShrinkableVectors<TestPack<detail::Pack8<3>>>()(uint8_t());
ForShrinkableVectors<TestPack<detail::Pack8<4>>>()(uint8_t());
ForShrinkableVectors<TestPack<detail::Pack8<5>>>()(uint8_t());
ForShrinkableVectors<TestPack<detail::Pack8<6>>>()(uint8_t());
ForShrinkableVectors<TestPack<detail::Pack8<7>>>()(uint8_t());
ForShrinkableVectors<TestPack<detail::Pack8<8>>>()(uint8_t());
}
void TestAllPack16() {
ForShrinkableVectors<TestPack<detail::Pack16<1>>>()(uint16_t());
ForShrinkableVectors<TestPack<detail::Pack16<2>>>()(uint16_t());
ForShrinkableVectors<TestPack<detail::Pack16<3>>>()(uint16_t());
ForShrinkableVectors<TestPack<detail::Pack16<4>>>()(uint16_t());
ForShrinkableVectors<TestPack<detail::Pack16<5>>>()(uint16_t());
ForShrinkableVectors<TestPack<detail::Pack16<6>>>()(uint16_t());
ForShrinkableVectors<TestPack<detail::Pack16<7>>>()(uint16_t());
ForShrinkableVectors<TestPack<detail::Pack16<8>>>()(uint16_t());
}
// NOLINTNEXTLINE(google-readability-namespace-comments)
} // namespace HWY_NAMESPACE
} // namespace hwy
HWY_AFTER_NAMESPACE();
#if HWY_ONCE
namespace hwy {
HWY_BEFORE_TEST(BitPackTest);
HWY_EXPORT_AND_TEST_P(BitPackTest, TestAllPack8);
HWY_EXPORT_AND_TEST_P(BitPackTest, TestAllPack16);
} // namespace hwy
#endif


@ -15,7 +15,7 @@
#include "hwy/contrib/image/image.h"
#include <algorithm> // swap
#include <algorithm> // std::swap
#include <cstddef>
#undef HWY_TARGET_INCLUDE


@ -22,7 +22,6 @@
#include <stdint.h>
#include <string.h>
#include <cstddef>
#include <utility> // std::move
#include "hwy/aligned_allocator.h"


@ -20,6 +20,7 @@
#include <stdio.h>
#include <cfloat> // FLT_MAX
#include <cmath> // std::abs
#include <type_traits>
// clang-format off


@ -79,6 +79,8 @@ cc_library(
"vqsort_i32d.cc",
"vqsort_i64a.cc",
"vqsort_i64d.cc",
"vqsort_kv64a.cc",
"vqsort_kv64d.cc",
"vqsort_kv128a.cc",
"vqsort_kv128d.cc",
"vqsort_u16a.cc",


@ -9,10 +9,9 @@ and [paper](https://arxiv.org/abs/2205.05982).
## Instructions
Here are instructions for reproducing our results on x86 Linux (AVX2, AVX-512)
and Arm V1 (NEON, SVE).
Here are instructions for reproducing our results on Linux and AWS (SVE, NEON).
### x86 (Linux)
### Linux
Please first ensure golang, and Clang (tested with 13.0.1) are installed via
your system's package manager.
@ -43,9 +42,10 @@ make -j8 && sudo make install
cd ..
```
AWS clang is at version 11.1, which generates unnecessary AND instructions which
slow down the sort by 1.15x. We tested with clang trunk as of June 13
AWS clang is at version 11.1, which generates unnecessary `AND` instructions
which slow down the sort by 1.15x. We tested with clang trunk as of June 13
(which reports Git hash 8f6512fea000c3a0d394864bb94e524bee375069). To build:
```
git clone --depth 1 https://github.com/llvm/llvm-project.git
cd llvm-project
@ -64,6 +64,12 @@ bazel-bin/hwy/contrib/sort/sort_test
bazel-bin/hwy/contrib/sort/bench_sort
```
The above command line enables SVE, which is currently only available on
Graviton 3. You can also test NEON on the same processor, or other Arm CPUs, by
changing the `-march=` option to `--copt=-march=armv8.2-a+crypto`. Note that
such flags will be unnecessary once Clang supports `#pragma target` for NEON and
SVE intrinsics, as it does for x86.
## Results
`bench_sort` outputs the instruction set (AVX3 refers to AVX-512), the sort


@ -20,8 +20,9 @@
#include <stdint.h>
#include <string.h> // memcpy
#include <algorithm>
#include <cmath> // std::abs
#include <algorithm> // std::sort, std::min, std::max
#include <functional> // std::less, std::greater
#include <thread> // NOLINT
#include <vector>
#include "hwy/base.h"


@ -81,13 +81,12 @@ HWY_NOINLINE void BenchPartition() {
// The pivot value can influence performance. Do exactly what vqsort will
// do so that the performance (influenced by prefetching and branch
// prediction) is likely to predict the actual performance inside vqsort.
detail::PivotResult result;
const auto pivot = detail::ChoosePivot(d, st, aligned.get(), num_lanes,
buf.get(), rng, result);
detail::DrawSamples(d, st, aligned.get(), num_lanes, buf.get(), rng);
detail::SortSamples(d, st, buf.get());
auto pivot = detail::ChoosePivotByRank(d, st, buf.get());
const Timestamp t0;
detail::Partition(d, st, aligned.get(), 0, num_lanes - 1, pivot,
buf.get());
detail::Partition(d, st, aligned.get(), num_lanes - 1, pivot, buf.get());
seconds.push_back(SecondsSince(t0));
// 'Use' the result to prevent optimizing out the partition.
sum += static_cast<double>(aligned.get()[num_lanes / 2]);


@ -63,15 +63,16 @@ struct SortConstants {
}
// Chunk := group of keys loaded for sampling a pivot. Matches the typical
// cache line size of 64 bytes to get maximum benefit per L2 miss. If vectors
// are larger, use entire vectors to ensure we do not overrun the array.
static constexpr HWY_INLINE size_t LanesPerChunk(size_t sizeof_t, size_t N) {
return HWY_MAX(64 / sizeof_t, N);
// cache line size of 64 bytes to get maximum benefit per L2 miss. Sort()
// ensures vectors are no larger than that, so this can be independent of the
// vector size and thus constexpr.
static constexpr HWY_INLINE size_t LanesPerChunk(size_t sizeof_t) {
return 64 / sizeof_t;
}
static constexpr HWY_INLINE size_t PivotBufNum(size_t sizeof_t, size_t N) {
// 3 chunks of medians, 1 chunk of median medians plus two padding vectors.
return (3 + 1) * LanesPerChunk(sizeof_t, N) + 2 * N;
return (3 + 1) * LanesPerChunk(sizeof_t) + 2 * N;
}
template <typename T>


@ -21,6 +21,7 @@
#include <stdio.h>
#include <string.h> // memcpy
#include <unordered_map>
#include <vector>
// clang-format off
@ -49,8 +50,10 @@ using detail::TraitsLane;
#if VQSORT_ENABLED || HWY_IDE
using detail::OrderAscending128;
using detail::OrderAscendingKV128;
using detail::OrderAscendingKV64;
using detail::OrderDescending128;
using detail::OrderDescendingKV128;
using detail::OrderDescendingKV64;
using detail::Traits128;
template <class Traits>
@ -282,10 +285,10 @@ static HWY_NOINLINE void TestPartition() {
const size_t N1 = st.LanesPerKey();
for (bool in_asc : {false, true}) {
for (int left_i : {0, 1, 2, 3, 4, 5, 6, 7, 8, 12, 15, 22, 28, 29, 30, 31}) {
for (int left_i : {0, 1, 4, 6, 7, 8, 12, 15, 22, 28, 30, 31}) {
const size_t left = static_cast<size_t>(left_i) & ~(N1 - 1);
for (size_t ofs : {N, N + 1, N + 2, N + 3, 2 * N, 2 * N + 1, 2 * N + 2,
2 * N + 3, 3 * N - 1, 4 * N - 3, 4 * N - 2}) {
for (size_t ofs : {N, N + 1, N + 3, 2 * N, 2 * N + 2, 2 * N + 3,
3 * N - 1, 4 * N - 3, 4 * N - 2}) {
const size_t len = (base_case_num + ofs) & ~(N1 - 1);
for (LaneType pivot1 :
{LaneType(0), LaneType(len / 3), LaneType(len / 2),
@ -311,10 +314,12 @@ static HWY_NOINLINE void TestPartition() {
for (size_t i = 0; i < left; ++i) {
lanes[i] = hwy::LowestValue<LaneType>();
}
std::unordered_map<LaneType, int> counts;
for (size_t i = left; i < right; ++i) {
lanes[i] = static_cast<LaneType>(
in_asc ? LaneType(i + 1) - static_cast<LaneType>(left)
: static_cast<LaneType>(right) - LaneType(i));
++counts[lanes[i]];
if (kDebug >= 2) {
printf("%3zu: %f\n", i, static_cast<double>(lanes[i]));
}
@ -324,7 +329,8 @@ static HWY_NOINLINE void TestPartition() {
}
size_t border =
detail::Partition(d, st, lanes, left, right, pivot, buf.get());
left + detail::Partition(d, st, lanes + left, right - left,
pivot, buf.get());
if (kDebug >= 2) {
printf("out>>>>>>\n");
@ -335,7 +341,15 @@ static HWY_NOINLINE void TestPartition() {
printf("%3zu: sentinel %f\n", i, static_cast<double>(lanes[i]));
}
}
for (size_t i = left; i < right; ++i) {
--counts[lanes[i]];
}
for (auto kv : counts) {
if (kv.second != 0) {
PrintValue(kv.first);
HWY_ABORT("Incorrect count %d\n", kv.second);
}
}
VerifyPartition(st, lanes, left, border, right, N1, pivot2);
for (size_t i = 0; i < misalign; ++i) {
if (aligned_lanes[i] != hwy::LowestValue<LaneType>())
@ -357,15 +371,18 @@ static HWY_NOINLINE void TestPartition() {
}
HWY_NOINLINE void TestAllPartition() {
TestPartition<TraitsLane<OrderAscending<int16_t> > >();
TestPartition<TraitsLane<OrderDescending<int32_t> > >();
TestPartition<Traits128<OrderAscending128> >();
#if !HWY_IS_DEBUG_BUILD
TestPartition<TraitsLane<OrderAscending<int16_t> > >();
TestPartition<TraitsLane<OrderAscending<int64_t> > >();
TestPartition<TraitsLane<OrderDescending<float> > >();
#if HWY_HAVE_FLOAT64
TestPartition<TraitsLane<OrderDescending<double> > >();
#endif
TestPartition<Traits128<OrderAscending128> >();
TestPartition<Traits128<OrderDescending128> >();
#endif
}
// (used for sample selection for choosing a pivot)
@ -436,7 +453,13 @@ class CompareResults {
const size_t num_keys = copy_.size() / st.LanesPerKey();
Run<Order>(reference, reinterpret_cast<KeyType*>(copy_.data()), num_keys,
shared, /*thread=*/0);
#if VQSORT_PRINT >= 3
fprintf(stderr, "\nExpected:\n");
for (size_t i = 0; i < copy_.size(); ++i) {
PrintValue(copy_[i]);
}
fprintf(stderr, "\n");
#endif
for (size_t i = 0; i < copy_.size(); ++i) {
if (copy_[i] != output[i]) {
if (sizeof(KeyType) == 16) {
@ -546,7 +569,7 @@ void TestSort(size_t num_lanes) {
}
void TestAllSort() {
for (int num : {129, 504, 20 * 1000, 34567}) {
for (int num : {129, 504, 3 * 1000, 34567}) {
const size_t num_lanes = AdjustedReps(static_cast<size_t>(num));
TestSort<TraitsLane<OrderAscending<int16_t> > >(num_lanes);
TestSort<TraitsLane<OrderDescending<uint16_t> > >(num_lanes);
@ -572,6 +595,9 @@ void TestAllSort() {
TestSort<Traits128<OrderAscending128> >(num_lanes);
TestSort<Traits128<OrderDescending128> >(num_lanes);
TestSort<TraitsLane<OrderAscendingKV64> >(num_lanes);
TestSort<TraitsLane<OrderDescendingKV64> >(num_lanes);
TestSort<Traits128<OrderAscendingKV128> >(num_lanes);
TestSort<Traits128<OrderDescendingKV128> >(num_lanes);
#endif


@ -42,6 +42,9 @@ namespace detail {
template <typename T>
struct KeyLane {
static constexpr bool Is128() { return false; }
// False indicates the entire key (i.e. lane) should be compared. KV stands
// for key-value.
static constexpr bool IsKV() { return false; }
constexpr size_t LanesPerKey() const { return 1; }
// What type bench_sort should allocate for generating inputs.
@ -78,7 +81,20 @@ struct KeyLane {
return Eq(a, b);
}
HWY_INLINE bool Equal1(const T* a, const T* b) { return *a == *b; }
template <class D>
HWY_INLINE Mask<D> NotEqualKeys(D /*tag*/, Vec<D> a, Vec<D> b) const {
return Ne(a, b);
}
// For keys=lanes, any difference counts.
template <class D>
HWY_INLINE bool NoKeyDifference(D /*tag*/, Vec<D> diff) const {
// Must avoid floating-point comparisons (for -0)
const RebindToUnsigned<D> du;
return AllTrue(du, Eq(BitCast(du, diff), Zero(du)));
}
HWY_INLINE bool Equal1(const T* a, const T* b) const { return *a == *b; }
template <class D>
HWY_INLINE Vec<D> ReverseKeys(D d, Vec<D> v) const {
@ -223,7 +239,7 @@ struct OrderAscending : public KeyLane<T> {
template <class D>
HWY_INLINE Vec<D> PrevValue(D d, Vec<D> v) const {
return Sub(v, Set(d, 1));
return Sub(v, Set(d, hwy::Epsilon<T>()));
}
};
@ -272,7 +288,142 @@ struct OrderDescending : public KeyLane<T> {
template <class D>
HWY_INLINE Vec<D> PrevValue(D d, Vec<D> v) const {
return Add(v, Set(d, 1));
return Add(v, Set(d, hwy::Epsilon<T>()));
}
};
struct KeyValue64 : public KeyLane<uint64_t> {
// True indicates only part of the key (i.e. lane) should be compared. KV
// stands for key-value.
static constexpr bool IsKV() { return true; }
template <class D>
HWY_INLINE Mask<D> EqualKeys(D /*tag*/, Vec<D> a, Vec<D> b) const {
return Eq(ShiftRight<32>(a), ShiftRight<32>(b));
}
template <class D>
HWY_INLINE Mask<D> NotEqualKeys(D /*tag*/, Vec<D> a, Vec<D> b) const {
return Ne(ShiftRight<32>(a), ShiftRight<32>(b));
}
HWY_INLINE bool Equal1(const uint64_t* a, const uint64_t* b) const {
return (*a >> 32) == (*b >> 32);
}
// Only count differences in the actual key, not the value.
template <class D>
HWY_INLINE bool NoKeyDifference(D /*tag*/, Vec<D> diff) const {
// Must avoid floating-point comparisons (for -0)
const RebindToUnsigned<D> du;
const Vec<decltype(du)> zero = Zero(du);
const Vec<decltype(du)> keys = ShiftRight<32>(diff); // clear values
return AllTrue(du, Eq(BitCast(du, keys), zero));
}
};
struct OrderAscendingKV64 : public KeyValue64 {
using Order = SortAscending;
HWY_INLINE bool Compare1(const LaneType* a, const LaneType* b) {
return (*a >> 32) < (*b >> 32);
}
template <class D>
HWY_INLINE Mask<D> Compare(D /* tag */, Vec<D> a, Vec<D> b) const {
return Lt(ShiftRight<32>(a), ShiftRight<32>(b));
}
// Not required to be stable (preserving the order of equivalent keys), so
// we can include the value in the comparison.
template <class D>
HWY_INLINE Vec<D> First(D /* tag */, const Vec<D> a, const Vec<D> b) const {
return Min(a, b);
}
template <class D>
HWY_INLINE Vec<D> Last(D /* tag */, const Vec<D> a, const Vec<D> b) const {
return Max(a, b);
}
template <class D>
HWY_INLINE Vec<D> FirstOfLanes(D d, Vec<D> v,
uint64_t* HWY_RESTRICT /* buf */) const {
return MinOfLanes(d, v);
}
template <class D>
HWY_INLINE Vec<D> LastOfLanes(D d, Vec<D> v,
uint64_t* HWY_RESTRICT /* buf */) const {
return MaxOfLanes(d, v);
}
// Same as for regular lanes.
template <class D>
HWY_INLINE Vec<D> FirstValue(D d) const {
return Set(d, hwy::LowestValue<TFromD<D> >());
}
template <class D>
HWY_INLINE Vec<D> LastValue(D d) const {
return Set(d, hwy::HighestValue<TFromD<D> >());
}
template <class D>
HWY_INLINE Vec<D> PrevValue(D d, Vec<D> v) const {
return Sub(v, Set(d, uint64_t{1}));
}
};
struct OrderDescendingKV64 : public KeyValue64 {
using Order = SortDescending;
HWY_INLINE bool Compare1(const LaneType* a, const LaneType* b) {
return (*b >> 32) < (*a >> 32);
}
template <class D>
HWY_INLINE Mask<D> Compare(D /* tag */, Vec<D> a, Vec<D> b) const {
return Lt(ShiftRight<32>(b), ShiftRight<32>(a));
}
// Not required to be stable (preserving the order of equivalent keys), so
// we can include the value in the comparison.
template <class D>
HWY_INLINE Vec<D> First(D /* tag */, const Vec<D> a, const Vec<D> b) const {
return Max(a, b);
}
template <class D>
HWY_INLINE Vec<D> Last(D /* tag */, const Vec<D> a, const Vec<D> b) const {
return Min(a, b);
}
template <class D>
HWY_INLINE Vec<D> FirstOfLanes(D d, Vec<D> v,
uint64_t* HWY_RESTRICT /* buf */) const {
return MaxOfLanes(d, v);
}
template <class D>
HWY_INLINE Vec<D> LastOfLanes(D d, Vec<D> v,
uint64_t* HWY_RESTRICT /* buf */) const {
return MinOfLanes(d, v);
}
template <class D>
HWY_INLINE Vec<D> FirstValue(D d) const {
return Set(d, hwy::HighestValue<TFromD<D> >());
}
template <class D>
HWY_INLINE Vec<D> LastValue(D d) const {
return Set(d, hwy::LowestValue<TFromD<D> >());
}
template <class D>
HWY_INLINE Vec<D> PrevValue(D d, Vec<D> v) const {
return Add(v, Set(d, uint64_t{1}));
}
};


@ -124,6 +124,9 @@ struct KeyAny128 {
// Base class shared between OrderAscending128, OrderDescending128.
struct Key128 : public KeyAny128 {
// False indicates the entire key should be compared. KV means key-value.
static constexpr bool IsKV() { return false; }
// What type to pass to Sorter::operator().
using KeyType = hwy::uint128_t;
@ -134,7 +137,20 @@ struct Key128 : public KeyAny128 {
return Eq128(d, a, b);
}
HWY_INLINE bool Equal1(const LaneType* a, const LaneType* b) {
template <class D>
HWY_INLINE Mask<D> NotEqualKeys(D d, Vec<D> a, Vec<D> b) const {
return Ne128(d, a, b);
}
// For keys=entire 128 bits, any difference counts.
template <class D>
HWY_INLINE bool NoKeyDifference(D /*tag*/, Vec<D> diff) const {
// Must avoid floating-point comparisons (for -0)
const RebindToUnsigned<D> du;
return AllTrue(du, Eq(BitCast(du, diff), Zero(du)));
}
HWY_INLINE bool Equal1(const LaneType* a, const LaneType* b) const {
return a[0] == b[0] && a[1] == b[1];
}
};
@ -187,8 +203,12 @@ struct OrderAscending128 : public Key128 {
template <class D>
HWY_INLINE Vec<D> PrevValue(D d, Vec<D> v) const {
const Vec<D> k1 = OddEven(Zero(d), Set(d, 1));
return Sub(v, k1);
const Vec<D> k0 = Zero(d);
const Vec<D> k1 = OddEven(k0, Set(d, uint64_t{1}));
const Mask<D> borrow = Eq(v, k0); // don't-care, lo == 0
// lo == 0? 1 : 0, 0
const Vec<D> adjust = ShiftLeftLanes<1>(IfThenElseZero(borrow, k1));
return Sub(Sub(v, k1), adjust);
}
};
@ -233,13 +253,21 @@ struct OrderDescending128 : public Key128 {
template <class D>
HWY_INLINE Vec<D> PrevValue(D d, Vec<D> v) const {
const Vec<D> k1 = OddEven(Zero(d), Set(d, 1));
return Add(v, k1);
const Vec<D> k1 = OddEven(Zero(d), Set(d, uint64_t{1}));
const Vec<D> added = Add(v, k1);
const Mask<D> overflowed = Lt(added, v); // false, overflowed
// overflowed? 1 : 0, 0
const Vec<D> adjust = ShiftLeftLanes<1>(IfThenElseZero(overflowed, k1));
return Add(added, adjust);
}
};
// Base class shared between OrderAscendingKV128, OrderDescendingKV128.
struct KeyValue128 : public KeyAny128 {
// True indicates only part of the key (the more significant lane) should be
// compared. KV stands for key-value.
static constexpr bool IsKV() { return true; }
// What type to pass to Sorter::operator().
using KeyType = K64V64;
@ -250,7 +278,22 @@ struct KeyValue128 : public KeyAny128 {
return Eq128Upper(d, a, b);
}
HWY_INLINE bool Equal1(const LaneType* a, const LaneType* b) {
template <class D>
HWY_INLINE Mask<D> NotEqualKeys(D d, Vec<D> a, Vec<D> b) const {
return Ne128Upper(d, a, b);
}
// Only count differences in the actual key, not the value.
template <class D>
HWY_INLINE bool NoKeyDifference(D /*tag*/, Vec<D> diff) const {
// Must avoid floating-point comparisons (for -0)
const RebindToUnsigned<D> du;
const Vec<decltype(du)> zero = Zero(du);
const Vec<decltype(du)> keys = OddEven(diff, zero); // clear values
return AllTrue(du, Eq(BitCast(du, keys), zero));
}
HWY_INLINE bool Equal1(const LaneType* a, const LaneType* b) const {
return a[1] == b[1];
}
};
@ -296,7 +339,7 @@ struct OrderAscendingKV128 : public KeyValue128 {
template <class D>
HWY_INLINE Vec<D> PrevValue(D d, Vec<D> v) const {
const Vec<D> k1 = OddEven(Zero(d), Set(d, 1));
const Vec<D> k1 = OddEven(Set(d, uint64_t{1}), Zero(d));
return Sub(v, k1);
}
};
@ -342,7 +385,7 @@ struct OrderDescendingKV128 : public KeyValue128 {
template <class D>
HWY_INLINE Vec<D> PrevValue(D d, Vec<D> v) const {
const Vec<D> k1 = OddEven(Zero(d), Set(d, 1));
const Vec<D> k1 = OddEven(Set(d, uint64_t{1}), Zero(d));
return Add(v, k1);
}
};

File diff suppressed because it is too large.


@ -85,6 +85,9 @@ class HWY_CONTRIB_DLLEXPORT Sorter {
void operator()(K64V64* HWY_RESTRICT keys, size_t n, SortAscending) const;
void operator()(K64V64* HWY_RESTRICT keys, size_t n, SortDescending) const;
void operator()(K32V32* HWY_RESTRICT keys, size_t n, SortAscending) const;
void operator()(K32V32* HWY_RESTRICT keys, size_t n, SortDescending) const;
// For internal use only
static void Fill24Bytes(const void* seed_heap, size_t seed_num, void* bytes);
static bool HaveFloat64();
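A usage sketch for the new K32V32 overloads (illustrative caller; assumes vqsort/hwy_contrib is linked in):

```
#include <vector>
#include "hwy/base.h"                 // hwy::K32V32
#include "hwy/contrib/sort/vqsort.h"  // hwy::Sorter, hwy::SortAscending

// Sorts by the 32-bit key; each 32-bit value stays attached to its key.
void SortPairs(std::vector<hwy::K32V32>& pairs) {
  hwy::Sorter sorter;  // allocates its internal buffer once
  sorter(pairs.data(), pairs.size(), hwy::SortAscending());
}
```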


@ -0,0 +1,65 @@
// Copyright 2022 Google LLC
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "hwy/contrib/sort/vqsort.h"
#undef HWY_TARGET_INCLUDE
// clang-format off
// (avoid line break, which would prevent Copybara rules from matching)
#define HWY_TARGET_INCLUDE "hwy/contrib/sort/vqsort_kv64a.cc" //NOLINT
// clang-format on
#include "hwy/foreach_target.h" // IWYU pragma: keep
// After foreach_target
#include "hwy/contrib/sort/traits-inl.h"
#include "hwy/contrib/sort/vqsort-inl.h"
HWY_BEFORE_NAMESPACE();
namespace hwy {
namespace HWY_NAMESPACE {
void SortKV64Asc(uint64_t* HWY_RESTRICT keys, size_t num,
uint64_t* HWY_RESTRICT buf) {
#if VQSORT_ENABLED
SortTag<uint64_t> d;
detail::SharedTraits<detail::TraitsLane<detail::OrderAscendingKV64>> st;
Sort(d, st, keys, num, buf);
#else
(void) keys;
(void) num;
(void) buf;
HWY_ASSERT(0);
#endif
}
// NOLINTNEXTLINE(google-readability-namespace-comments)
} // namespace HWY_NAMESPACE
} // namespace hwy
HWY_AFTER_NAMESPACE();
#if HWY_ONCE
namespace hwy {
namespace {
HWY_EXPORT(SortKV64Asc);
} // namespace
void Sorter::operator()(K32V32* HWY_RESTRICT keys, size_t n,
SortAscending) const {
HWY_DYNAMIC_DISPATCH(SortKV64Asc)
(reinterpret_cast<uint64_t*>(keys), n, Get<uint64_t>());
}
} // namespace hwy
#endif // HWY_ONCE


@ -0,0 +1,65 @@
// Copyright 2022 Google LLC
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "hwy/contrib/sort/vqsort.h"
#undef HWY_TARGET_INCLUDE
// clang-format off
// (avoid line break, which would prevent Copybara rules from matching)
#define HWY_TARGET_INCLUDE "hwy/contrib/sort/vqsort_kv64d.cc" //NOLINT
// clang-format on
#include "hwy/foreach_target.h" // IWYU pragma: keep
// After foreach_target
#include "hwy/contrib/sort/traits-inl.h"
#include "hwy/contrib/sort/vqsort-inl.h"
HWY_BEFORE_NAMESPACE();
namespace hwy {
namespace HWY_NAMESPACE {
void SortKV64Desc(uint64_t* HWY_RESTRICT keys, size_t num,
uint64_t* HWY_RESTRICT buf) {
#if VQSORT_ENABLED
SortTag<uint64_t> d;
detail::SharedTraits<detail::TraitsLane<detail::OrderDescendingKV64>> st;
Sort(d, st, keys, num, buf);
#else
(void) keys;
(void) num;
(void) buf;
HWY_ASSERT(0);
#endif
}
// NOLINTNEXTLINE(google-readability-namespace-comments)
} // namespace HWY_NAMESPACE
} // namespace hwy
HWY_AFTER_NAMESPACE();
#if HWY_ONCE
namespace hwy {
namespace {
HWY_EXPORT(SortKV64Desc);
} // namespace
void Sorter::operator()(K32V32* HWY_RESTRICT keys, size_t n,
SortDescending) const {
HWY_DYNAMIC_DISPATCH(SortKV64Desc)
(reinterpret_cast<uint64_t*>(keys), n, Get<uint64_t>());
}
} // namespace hwy
#endif // HWY_ONCE


@ -21,7 +21,8 @@
// Add to #if conditions to prevent IDE from graying out code.
#if (defined __CDT_PARSER__) || (defined __INTELLISENSE__) || \
(defined Q_CREATOR_RUN) || (defined(__CLANGD__))
(defined Q_CREATOR_RUN) || (defined __CLANGD__) || \
(defined GROK_ELLIPSIS_BUILD)
#define HWY_IDE 1
#else
#define HWY_IDE 0
@ -69,7 +70,7 @@
// In case of Apple LLVM (whose version number is unrelated to that of LLVM) or
// an invalid version number, deduce it from the presence of warnings.
// Adapted from https://github.com/simd-everywhere/simde/ simde-detect-clang.h.
#if defined(__APPLE__) || __clang_major__ >= 999
#if defined(__apple_build_version__) || __clang_major__ >= 999
#if __has_warning("-Wbitwise-instead-of-logical")
#define HWY_COMPILER_CLANG 1400
#elif __has_warning("-Wreserved-identifier")
@ -85,7 +86,12 @@
#elif __has_warning("-Wextra-semi-stmt") || \
__has_builtin(__builtin_rotateleft32)
#define HWY_COMPILER_CLANG 800
#elif __has_warning("-Wc++98-compat-extra-semi")
// For reasons unknown, XCode 10.3 (Apple LLVM version 10.0.1) is apparently
// based on Clang 7, but does not support the warning we test.
// See https://en.wikipedia.org/wiki/Xcode#Toolchain_versions and
// https://trac.macports.org/wiki/XcodeVersionInfo.
#elif __has_warning("-Wc++98-compat-extra-semi") || \
(defined(__apple_build_version__) && __apple_build_version__ >= 10010000)
#define HWY_COMPILER_CLANG 700
#else // Anything older than 7.0 is not recommended for Highway.
#define HWY_COMPILER_CLANG 600


@ -23,7 +23,7 @@
//------------------------------------------------------------------------------
// Optional configuration
// See ../quick_reference.md for documentation of these macros.
// See g3doc/quick_reference.md for documentation of these macros.
// Uncomment to override the default baseline determined from predefined macros:
// #define HWY_BASELINE_TARGETS (HWY_SSE4 | HWY_SCALAR)
@ -169,13 +169,14 @@
#define HWY_ENABLED(targets) \
((targets) & ~((HWY_DISABLED_TARGETS) | (HWY_BROKEN_TARGETS)))
// Opt-out for EMU128 (affected by a GCC <12 bug on ARMv7: see
// https://gcc.gnu.org/bugzilla/show_bug.cgi?id=106187). This is separate from
// HWY_BROKEN_TARGETS because it affects the fallback target, which must always
// be enabled. If 1, we instead choose HWY_SCALAR even without
// Opt-out for EMU128 (affected by a GCC bug on multiple arches, fixed in 12.3:
// see https://gcc.gnu.org/bugzilla/show_bug.cgi?id=106322). This is separate
// from HWY_BROKEN_TARGETS because it affects the fallback target, which must
// always be enabled. If 1, we instead choose HWY_SCALAR even without
// HWY_COMPILE_ONLY_SCALAR being set.
#if !defined(HWY_BROKEN_EMU128) // allow overriding
#if HWY_ARCH_ARM_V7 && HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL < 1140
#if (HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL < 1203) || \
defined(HWY_NO_LIBCXX)
#define HWY_BROKEN_EMU128 1
#else
#define HWY_BROKEN_EMU128 0
@ -215,30 +216,45 @@
#define HWY_BASELINE_PPC8 0
#endif
#if HWY_ARCH_ARM && defined(__ARM_FEATURE_SVE2)
#define HWY_BASELINE_SVE2 HWY_SVE2
#else
#define HWY_BASELINE_SVE2 0
#endif
#if HWY_ARCH_ARM && defined(__ARM_FEATURE_SVE)
// Baseline targets can be used unconditionally, which does not apply to
// HWY_SVE_256 because it requires a vector size of 256 bits. Including SVE_256
// in the baseline would also disable all 'worse' targets (including SVE and
// SVE2) in non-test builds. Therefore we instead add HWY_SVE_256 to
// HWY_ATTAINABLE_TARGETS below.
#define HWY_BASELINE_SVE HWY_SVE
#else
#define HWY_BASELINE_SVE 0
#endif
#define HWY_BASELINE_NEON 0
#if HWY_ARCH_ARM
#if defined(__ARM_FEATURE_SVE2)
#undef HWY_BASELINE_SVE2 // was 0, will be re-defined
// If user specified -msve-vector-bits=128, they assert the vector length is
// 128 bits and we should use the HWY_SVE2_128 (more efficient for some ops).
#if defined(__ARM_FEATURE_SVE_BITS) && __ARM_FEATURE_SVE_BITS == 128
#define HWY_BASELINE_SVE2 HWY_SVE2_128
// Otherwise we're not sure what the vector length will be. The baseline must be
// unconditionally valid, so we can only assume HWY_SVE2. However, when running
// on a CPU with 128-bit vectors, user code that supports dynamic dispatch will
// still benefit from HWY_SVE2_128 because we add it to HWY_ATTAINABLE_TARGETS.
#else
#define HWY_BASELINE_SVE2 HWY_SVE2
#endif // __ARM_FEATURE_SVE_BITS
#endif // __ARM_FEATURE_SVE2
#if defined(__ARM_FEATURE_SVE)
#undef HWY_BASELINE_SVE // was 0, will be re-defined
// See above. If user-specified vector length matches our optimization, use it.
#if defined(__ARM_FEATURE_SVE_BITS) && __ARM_FEATURE_SVE_BITS == 256
#define HWY_BASELINE_SVE HWY_SVE_256
#else
#define HWY_BASELINE_SVE HWY_SVE
#endif // __ARM_FEATURE_SVE_BITS
#endif // __ARM_FEATURE_SVE
// GCC 4.5.4 only defines __ARM_NEON__; 5.4 defines both.
#if HWY_ARCH_ARM && (defined(__ARM_NEON__) || defined(__ARM_NEON))
#if defined(__ARM_NEON__) || defined(__ARM_NEON)
#undef HWY_BASELINE_NEON
#define HWY_BASELINE_NEON HWY_NEON
#else
#define HWY_BASELINE_NEON 0
#endif
#endif // HWY_ARCH_ARM
// Special handling for MSVC because it has fewer predefined macros:
#if HWY_COMPILER_MSVC
@ -372,9 +388,12 @@
#endif
// Defining one of HWY_COMPILE_ONLY_* will trump HWY_COMPILE_ALL_ATTAINABLE.
// x86 compilers generally allow runtime dispatch. On Arm, currently only GCC
// does, and we require Linux to detect CPU capabilities.
#if HWY_ARCH_X86 || (HWY_ARCH_ARM && HWY_COMPILER_GCC_ACTUAL && HWY_OS_LINUX)
// Clang, GCC and MSVC allow runtime dispatch on x86.
#if HWY_ARCH_X86
#define HWY_HAVE_RUNTIME_DISPATCH 1
// On Arm, currently only GCC does, and we require Linux to detect CPU
// capabilities.
#elif HWY_ARCH_ARM && HWY_COMPILER_GCC_ACTUAL && HWY_OS_LINUX
#define HWY_HAVE_RUNTIME_DISPATCH 1
#else
#define HWY_HAVE_RUNTIME_DISPATCH 0
@ -389,15 +408,15 @@
#define HWY_ATTAINABLE_AVX3_DL 0
#endif
#if HWY_ARCH_ARM_A64 && \
((HWY_ENABLED_BASELINE & HWY_SVE) || HWY_HAVE_RUNTIME_DISPATCH)
#if HWY_ARCH_ARM_A64 && (HWY_HAVE_RUNTIME_DISPATCH || \
(HWY_ENABLED_BASELINE & (HWY_SVE | HWY_SVE_256)))
#define HWY_ATTAINABLE_SVE HWY_ENABLED(HWY_SVE | HWY_SVE_256)
#else
#define HWY_ATTAINABLE_SVE 0
#endif
#if HWY_ARCH_ARM_A64 && \
((HWY_ENABLED_BASELINE & HWY_SVE2) || HWY_HAVE_RUNTIME_DISPATCH)
#if HWY_ARCH_ARM_A64 && (HWY_HAVE_RUNTIME_DISPATCH || \
(HWY_ENABLED_BASELINE & (HWY_SVE2 | HWY_SVE2_128)))
#define HWY_ATTAINABLE_SVE2 HWY_ENABLED(HWY_SVE2 | HWY_SVE2_128)
#else
#define HWY_ATTAINABLE_SVE2 0


@ -21,8 +21,9 @@
#include <stdint.h>
#include <stdio.h>
#include <cmath> // std::abs
#include <memory>
#include <numeric> // iota
#include <numeric> // std::iota, std::inner_product
#undef HWY_TARGET_INCLUDE
#define HWY_TARGET_INCLUDE "hwy/examples/benchmark.cc"


@ -52,10 +52,11 @@ HWY_ATTR_NO_MSAN void OneFloorLog2(const DF df,
// Type tags for converting to other element types (Rebind = same count).
const hn::RebindToSigned<DF> d32;
const hn::Rebind<uint8_t, DF> d8;
using VI32 = hn::Vec<decltype(d32)>;
const auto u8 = hn::Load(d8, values);
const auto bits = hn::BitCast(d32, hn::ConvertTo(df, hn::PromoteTo(d32, u8)));
const auto exponent = hn::Sub(hn::ShiftRight<23>(bits), hn::Set(d32, 127));
const VI32 vi32 = hn::PromoteTo(d32, hn::Load(d8, values));
const VI32 bits = hn::BitCast(d32, hn::ConvertTo(df, vi32));
const VI32 exponent = hn::Sub(hn::ShiftRight<23>(bits), hn::Set(d32, 127));
hn::Store(hn::DemoteTo(d8, exponent), d8, log2);
}


@ -29,7 +29,7 @@ namespace hwy {
// API version (https://semver.org/); keep in sync with CMakeLists.txt.
#define HWY_MAJOR 1
#define HWY_MINOR 0
#define HWY_PATCH 1
#define HWY_PATCH 2
//------------------------------------------------------------------------------
// Shorthand for tags (defined in shared-inl.h) used to select overloads.


@ -16,6 +16,7 @@
#include <stddef.h>
#include <stdint.h>
#include <algorithm> // std::fill
#include <bitset>
#include "hwy/base.h"


@ -24,14 +24,15 @@
#include <stdlib.h>
#include <time.h> // clock_gettime
#include <algorithm> // sort
#include <algorithm> // std::sort, std::find_if
#include <array>
#include <atomic>
#include <chrono> //NOLINT
#include <limits>
#include <numeric> // iota
#include <numeric> // std::iota
#include <random>
#include <string>
#include <utility> // std::pair
#include <vector>
#if defined(_WIN32) || defined(_WIN64)
@ -150,7 +151,7 @@ inline Ticks Start() {
// "cc" = flags modified by SHL.
: "rdx", "memory", "cc");
#elif HWY_ARCH_RVV
asm volatile("rdcycle %0" : "=r"(t));
asm volatile("rdtime %0" : "=r"(t));
#elif defined(_WIN32) || defined(_WIN64)
LARGE_INTEGER counter;
(void)QueryPerformanceCounter(&counter);


@ -22,16 +22,18 @@
#include <stddef.h>
#include <stdint.h>
#include "hwy/base.h" // before HWY_DIAGNOSTICS
#include "hwy/ops/shared-inl.h"
HWY_BEFORE_NAMESPACE();
// Must come after HWY_BEFORE_NAMESPACE so that the intrinsics are compiled with
// the same target attribute as our code, see #834.
HWY_DIAGNOSTICS(push)
HWY_DIAGNOSTICS_OFF(disable : 4701, ignored "-Wuninitialized")
#include <arm_neon.h>
HWY_DIAGNOSTICS(pop)
#include "hwy/ops/shared-inl.h"
HWY_BEFORE_NAMESPACE();
// Must come after arm_neon.h.
namespace hwy {
namespace HWY_NAMESPACE {
@ -766,6 +768,9 @@ class Vec128 {
using Raw = typename detail::Raw128<T, N>::type;
public:
using PrivateT = T; // only for DFromV
static constexpr size_t kPrivateN = N; // only for DFromV
HWY_INLINE Vec128() {}
Vec128(const Vec128&) = default;
Vec128& operator=(const Vec128&) = default;
@ -822,23 +827,11 @@ class Mask128 {
template <typename T>
using Mask64 = Mask128<T, 8 / sizeof(T)>;
namespace detail {
// Deduce Simd<T, N, 0> from Vec128<T, N>
struct DeduceD {
template <typename T, size_t N>
Simd<T, N, 0> operator()(Vec128<T, N>) const {
return Simd<T, N, 0>();
}
};
} // namespace detail
template <class V>
using DFromV = Simd<typename V::PrivateT, V::kPrivateN, 0>;
template <class V>
using DFromV = decltype(detail::DeduceD()(V()));
template <class V>
using TFromV = TFromD<DFromV<V>>;
using TFromV = typename V::PrivateT;
// ------------------------------ BitCast
@ -1025,19 +1018,21 @@ HWY_API Vec128<bfloat16_t, N> Zero(Simd<bfloat16_t, N, 0> /* tag */) {
template <class D>
using VFromD = decltype(Zero(D()));
// Returns a vector with uninitialized elements.
template <typename T, size_t N>
HWY_API Vec128<T, N> Undefined(Simd<T, N, 0> /*d*/) {
HWY_DIAGNOSTICS(push)
HWY_DIAGNOSTICS_OFF(disable : 4701, ignored "-Wuninitialized")
HWY_DIAGNOSTICS(push)
HWY_DIAGNOSTICS_OFF(disable : 4701, ignored "-Wuninitialized")
#if HWY_COMPILER_GCC_ACTUAL
HWY_DIAGNOSTICS_OFF(disable : 4701, ignored "-Wmaybe-uninitialized")
#endif
// Returns a vector with uninitialized elements.
template <typename T, size_t N>
HWY_API Vec128<T, N> Undefined(Simd<T, N, 0> /*d*/) {
typename detail::Raw128<T, N>::type a;
return Vec128<T, N>(a);
HWY_DIAGNOSTICS(pop)
}
HWY_DIAGNOSTICS(pop)
// Returns a vector with lane i=[0, N) set to "first" + i.
template <typename T, size_t N, typename T2>
Vec128<T, N> Iota(const Simd<T, N, 0> d, const T2 first) {
@ -2277,6 +2272,12 @@ HWY_API Mask128<T, N> Xor(const Mask128<T, N> a, Mask128<T, N> b) {
return MaskFromVec(Xor(VecFromMask(d, a), VecFromMask(d, b)));
}
template <typename T, size_t N>
HWY_API Mask128<T, N> ExclusiveNeither(const Mask128<T, N> a, Mask128<T, N> b) {
const Simd<T, N, 0> d;
return MaskFromVec(AndNot(VecFromMask(d, a), Not(VecFromMask(d, b))));
}
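
For reference, the new ExclusiveNeither(a, b) mask op returns true where neither mask is set; the result is unspecified where both are set (the SVE backend maps it to NOR under that precondition). A minimal usage sketch with static dispatch and an illustrative function name, not part of the diff, assuming limit >= 0 so the two masks cannot both be true:

#include <stddef.h>
#include "hwy/highway.h"
namespace hn = hwy::HWY_NAMESPACE;

// Counts lanes that are neither negative nor greater than `limit`.
// n is assumed to be a multiple of Lanes(d).
size_t CountInRange(const float* HWY_RESTRICT p, size_t n, float limit) {
  const hn::ScalableTag<float> d;
  size_t count = 0;
  for (size_t i = 0; i < n; i += hn::Lanes(d)) {
    const auto v = hn::LoadU(d, p + i);
    const auto neg = hn::Lt(v, hn::Zero(d));
    const auto too_big = hn::Gt(v, hn::Set(d, limit));
    // The two masks are mutually exclusive, so ExclusiveNeither is defined.
    count += hn::CountTrue(d, hn::ExclusiveNeither(neg, too_big));
  }
  return count;
}
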
// ================================================== COMPARE
// Comparisons fill a lane with 1-bits if the condition is true, else 0.
@ -2885,12 +2886,19 @@ HWY_API void StoreU(Vec128<bfloat16_t, N> v, Simd<bfloat16_t, N, 0> d,
return StoreU(Vec128<uint16_t, N>(v.raw), du16, pu16);
}
HWY_DIAGNOSTICS(push)
#if HWY_COMPILER_GCC_ACTUAL
HWY_DIAGNOSTICS_OFF(disable : 4701, ignored "-Wmaybe-uninitialized")
#endif
// On ARM, Store is the same as StoreU.
template <typename T, size_t N>
HWY_API void Store(Vec128<T, N> v, Simd<T, N, 0> d, T* HWY_RESTRICT aligned) {
StoreU(v, d, aligned);
}
HWY_DIAGNOSTICS(pop)
template <typename T, size_t N>
HWY_API void BlendedStore(Vec128<T, N> v, Mask128<T, N> m, Simd<T, N, 0> d,
T* HWY_RESTRICT p) {
@ -3527,6 +3535,11 @@ HWY_API Vec64<double> LowerHalf(const Vec128<double> v) {
return Vec64<double>(vget_low_f64(v.raw));
}
#endif
HWY_API Vec64<bfloat16_t> LowerHalf(const Vec128<bfloat16_t> v) {
const Full128<uint16_t> du;
const Full64<bfloat16_t> dbh;
return BitCast(dbh, LowerHalf(BitCast(du, v)));
}
template <typename T, size_t N>
HWY_API Vec128<T, N / 2> LowerHalf(Simd<T, N / 2, 0> /* tag */,
@ -3727,6 +3740,13 @@ HWY_API Vec64<double> UpperHalf(Full64<double> /* tag */,
}
#endif
HWY_API Vec64<bfloat16_t> UpperHalf(Full64<bfloat16_t> dbh,
const Vec128<bfloat16_t> v) {
const RebindToUnsigned<decltype(dbh)> duh;
const Twice<decltype(duh)> du;
return BitCast(dbh, UpperHalf(duh, BitCast(du, v)));
}
// Partial
template <typename T, size_t N, HWY_IF_LE64(T, N)>
HWY_API Vec128<T, (N + 1) / 2> UpperHalf(Half<Simd<T, N, 0>> /* tag */,
@ -4243,6 +4263,48 @@ HWY_API Vec128<float, N> ReorderWidenMulAccumulate(Simd<float, N, 0> df32,
return MulAdd(BitCast(df32, a0), BitCast(df32, b0), sum0);
}
HWY_API Vec128<int32_t> ReorderWidenMulAccumulate(Full128<int32_t> /*d32*/,
Vec128<int16_t> a,
Vec128<int16_t> b,
const Vec128<int32_t> sum0,
Vec128<int32_t>& sum1) {
#if HWY_ARCH_ARM_A64
sum1 = Vec128<int32_t>(vmlal_high_s16(sum1.raw, a.raw, b.raw));
#else
const Full64<int16_t> dh;
sum1 = Vec128<int32_t>(
vmlal_s16(sum1.raw, UpperHalf(dh, a).raw, UpperHalf(dh, b).raw));
#endif
return Vec128<int32_t>(
vmlal_s16(sum0.raw, LowerHalf(a).raw, LowerHalf(b).raw));
}
HWY_API Vec64<int32_t> ReorderWidenMulAccumulate(Full64<int32_t> d32,
Vec64<int16_t> a,
Vec64<int16_t> b,
const Vec64<int32_t> sum0,
Vec64<int32_t>& sum1) {
// vmlal writes into the upper half, which the caller cannot use, so
// split into two halves.
const Vec128<int32_t> mul_3210(vmull_s16(a.raw, b.raw));
const Vec64<int32_t> mul_32 = UpperHalf(d32, mul_3210);
sum1 += mul_32;
return sum0 + LowerHalf(mul_3210);
}
HWY_API Vec32<int32_t> ReorderWidenMulAccumulate(Full32<int32_t> d32,
Vec32<int16_t> a,
Vec32<int16_t> b,
const Vec32<int32_t> sum0,
Vec32<int32_t>& sum1) {
const Vec128<int32_t> mul_xx10(vmull_s16(a.raw, b.raw));
const Vec64<int32_t> mul_10(LowerHalf(mul_xx10));
const Vec32<int32_t> mul0 = LowerHalf(d32, mul_10);
const Vec32<int32_t> mul1 = UpperHalf(d32, mul_10);
sum1 += mul1;
return sum0 + mul0;
}
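
The int16 overloads above follow the same contract as the bf16 version: products are accumulated into two i32 vectors whose lane placement is unspecified ("Reorder"), so only the combined total across sum0 and sum1 is meaningful. A hedged usage sketch (static dispatch, illustrative function name; not part of the diff):

#include <stddef.h>
#include <stdint.h>
#include "hwy/highway.h"
namespace hn = hwy::HWY_NAMESPACE;

// Dot product of two int16 arrays; n is assumed to be a multiple of
// Lanes(d16). May overflow int32 for long inputs; acceptable for a sketch.
int32_t DotI16(const int16_t* HWY_RESTRICT a, const int16_t* HWY_RESTRICT b,
               size_t n) {
  const hn::ScalableTag<int16_t> d16;
  const hn::RepartitionToWide<decltype(d16)> d32;
  auto sum0 = hn::Zero(d32);
  auto sum1 = hn::Zero(d32);
  for (size_t i = 0; i < n; i += hn::Lanes(d16)) {
    sum0 = hn::ReorderWidenMulAccumulate(d32, hn::LoadU(d16, a + i),
                                         hn::LoadU(d16, b + i), sum0, sum1);
  }
  // Lane order is unspecified, but the sum over both accumulators equals the
  // accumulated dot product.
  return hn::GetLane(hn::SumOfLanes(d32, hn::Add(sum0, sum1)));
}
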
// ================================================== COMBINE
// ------------------------------ Combine (InterleaveLower)
@ -4587,6 +4649,32 @@ HWY_API Vec128<bfloat16_t, 2 * N> ReorderDemote2To(
return BitCast(dbf16, OddEven(BitCast(du16, a), BitCast(du16, b_in_even)));
}
HWY_API Vec128<int16_t> ReorderDemote2To(Full128<int16_t> d16,
Vec128<int32_t> a, Vec128<int32_t> b) {
const Vec64<int16_t> a16(vqmovn_s32(a.raw));
#if HWY_ARCH_ARM_A64
(void)d16;
return Vec128<int16_t>(vqmovn_high_s32(a16.raw, b.raw));
#else
const Vec64<int16_t> b16(vqmovn_s32(b.raw));
return Combine(d16, a16, b16);
#endif
}
HWY_API Vec64<int16_t> ReorderDemote2To(Full64<int16_t> /*d16*/,
Vec64<int32_t> a, Vec64<int32_t> b) {
const Full128<int32_t> d32;
const Vec128<int32_t> ab = Combine(d32, a, b);
return Vec64<int16_t>(vqmovn_s32(ab.raw));
}
HWY_API Vec32<int16_t> ReorderDemote2To(Full32<int16_t> /*d16*/,
Vec32<int32_t> a, Vec32<int32_t> b) {
const Full128<int32_t> d32;
const Vec64<int32_t> ab(vzip1_s32(a.raw, b.raw));
return Vec32<int16_t>(vqmovn_s32(Combine(d32, ab, ab).raw));
}
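
As with the bf16 variant, the int16 ReorderDemote2To packs two i32 vectors into one saturated i16 vector, but where a's and b's lanes end up differs by target (concatenated here, interleaved on SVE2), so callers must not rely on a specific order. A brief sketch under that assumption (static dispatch, illustrative function name; not part of the diff):

#include <stddef.h>
#include <stdint.h>
#include "hwy/highway.h"
namespace hn = hwy::HWY_NAMESPACE;

// Saturates pairs of i32 vectors to i16. The lane order within each output
// vector is target-dependent, which is fine here because the caller only
// needs the multiset of saturated values. n is a multiple of 2 * Lanes(d32).
void SaturateToI16(const int32_t* HWY_RESTRICT in, size_t n,
                   int16_t* HWY_RESTRICT out) {
  const hn::ScalableTag<int32_t> d32;
  const hn::RepartitionToNarrow<decltype(d32)> d16;
  const size_t N32 = hn::Lanes(d32);
  for (size_t i = 0; i < n; i += 2 * N32) {
    const auto a = hn::LoadU(d32, in + i);
    const auto b = hn::LoadU(d32, in + i + N32);
    hn::StoreU(hn::ReorderDemote2To(d16, a, b), d16, out + i);
  }
}
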
// ================================================== CRYPTO
#if defined(__ARM_FEATURE_AES) || \
@ -4892,7 +4980,8 @@ namespace detail {
// N=1 for any T: no-op
template <typename T>
HWY_INLINE Vec128<T, 1> SumOfLanes(const Vec128<T, 1> v) {
HWY_INLINE Vec128<T, 1> SumOfLanes(hwy::SizeTag<sizeof(T)> /* tag */,
const Vec128<T, 1> v) {
return v;
}
template <typename T>
@ -4908,7 +4997,8 @@ HWY_INLINE Vec128<T, 1> MaxOfLanes(hwy::SizeTag<sizeof(T)> /* tag */,
// u32/i32/f32: N=2
template <typename T, HWY_IF_LANE_SIZE(T, 4)>
HWY_INLINE Vec128<T, 2> SumOfLanes(const Vec128<T, 2> v10) {
HWY_INLINE Vec128<T, 2> SumOfLanes(hwy::SizeTag<4> /* tag */,
const Vec128<T, 2> v10) {
return v10 + Shuffle2301(v10);
}
template <typename T>
@ -4924,48 +5014,59 @@ HWY_INLINE Vec128<T, 2> MaxOfLanes(hwy::SizeTag<4> /* tag */,
// full vectors
#if HWY_ARCH_ARM_A64
HWY_INLINE Vec128<uint32_t> SumOfLanes(const Vec128<uint32_t> v) {
HWY_INLINE Vec128<uint32_t> SumOfLanes(hwy::SizeTag<4> /* tag */,
const Vec128<uint32_t> v) {
return Vec128<uint32_t>(vdupq_n_u32(vaddvq_u32(v.raw)));
}
HWY_INLINE Vec128<int32_t> SumOfLanes(const Vec128<int32_t> v) {
HWY_INLINE Vec128<int32_t> SumOfLanes(hwy::SizeTag<4> /* tag */,
const Vec128<int32_t> v) {
return Vec128<int32_t>(vdupq_n_s32(vaddvq_s32(v.raw)));
}
HWY_INLINE Vec128<float> SumOfLanes(const Vec128<float> v) {
HWY_INLINE Vec128<float> SumOfLanes(hwy::SizeTag<4> /* tag */,
const Vec128<float> v) {
return Vec128<float>(vdupq_n_f32(vaddvq_f32(v.raw)));
}
HWY_INLINE Vec128<uint64_t> SumOfLanes(const Vec128<uint64_t> v) {
HWY_INLINE Vec128<uint64_t> SumOfLanes(hwy::SizeTag<8> /* tag */,
const Vec128<uint64_t> v) {
return Vec128<uint64_t>(vdupq_n_u64(vaddvq_u64(v.raw)));
}
HWY_INLINE Vec128<int64_t> SumOfLanes(const Vec128<int64_t> v) {
HWY_INLINE Vec128<int64_t> SumOfLanes(hwy::SizeTag<8> /* tag */,
const Vec128<int64_t> v) {
return Vec128<int64_t>(vdupq_n_s64(vaddvq_s64(v.raw)));
}
HWY_INLINE Vec128<double> SumOfLanes(const Vec128<double> v) {
HWY_INLINE Vec128<double> SumOfLanes(hwy::SizeTag<8> /* tag */,
const Vec128<double> v) {
return Vec128<double>(vdupq_n_f64(vaddvq_f64(v.raw)));
}
#else
// ARMv7 version for everything except doubles.
HWY_INLINE Vec128<uint32_t> SumOfLanes(const Vec128<uint32_t> v) {
HWY_INLINE Vec128<uint32_t> SumOfLanes(hwy::SizeTag<4> /* tag */,
const Vec128<uint32_t> v) {
uint32x4x2_t v0 = vuzpq_u32(v.raw, v.raw);
uint32x4_t c0 = vaddq_u32(v0.val[0], v0.val[1]);
uint32x4x2_t v1 = vuzpq_u32(c0, c0);
return Vec128<uint32_t>(vaddq_u32(v1.val[0], v1.val[1]));
}
HWY_INLINE Vec128<int32_t> SumOfLanes(const Vec128<int32_t> v) {
HWY_INLINE Vec128<int32_t> SumOfLanes(hwy::SizeTag<4> /* tag */,
const Vec128<int32_t> v) {
int32x4x2_t v0 = vuzpq_s32(v.raw, v.raw);
int32x4_t c0 = vaddq_s32(v0.val[0], v0.val[1]);
int32x4x2_t v1 = vuzpq_s32(c0, c0);
return Vec128<int32_t>(vaddq_s32(v1.val[0], v1.val[1]));
}
HWY_INLINE Vec128<float> SumOfLanes(const Vec128<float> v) {
HWY_INLINE Vec128<float> SumOfLanes(hwy::SizeTag<4> /* tag */,
const Vec128<float> v) {
float32x4x2_t v0 = vuzpq_f32(v.raw, v.raw);
float32x4_t c0 = vaddq_f32(v0.val[0], v0.val[1]);
float32x4x2_t v1 = vuzpq_f32(c0, c0);
return Vec128<float>(vaddq_f32(v1.val[0], v1.val[1]));
}
HWY_INLINE Vec128<uint64_t> SumOfLanes(const Vec128<uint64_t> v) {
HWY_INLINE Vec128<uint64_t> SumOfLanes(hwy::SizeTag<8> /* tag */,
const Vec128<uint64_t> v) {
return v + Shuffle01(v);
}
HWY_INLINE Vec128<int64_t> SumOfLanes(const Vec128<int64_t> v) {
HWY_INLINE Vec128<int64_t> SumOfLanes(hwy::SizeTag<8> /* tag */,
const Vec128<int64_t> v) {
return v + Shuffle01(v);
}
#endif
@ -5001,6 +5102,30 @@ HWY_INLINE Vec128<T> MaxOfLanes(hwy::SizeTag<8> /* tag */,
return Max(v10, v01);
}
template <size_t N, HWY_IF_GE32(uint16_t, N)>
HWY_API Vec128<uint16_t, N> SumOfLanes(hwy::SizeTag<2> /* tag */,
Vec128<uint16_t, N> v) {
const Simd<uint16_t, N, 0> d;
const RepartitionToWide<decltype(d)> d32;
const auto even = And(BitCast(d32, v), Set(d32, 0xFFFF));
const auto odd = ShiftRight<16>(BitCast(d32, v));
const auto sum = SumOfLanes(hwy::SizeTag<4>(), even + odd);
// Also broadcast into odd lanes.
return OddEven(BitCast(d, ShiftLeft<16>(sum)), BitCast(d, sum));
}
template <size_t N, HWY_IF_GE32(int16_t, N)>
HWY_API Vec128<int16_t, N> SumOfLanes(hwy::SizeTag<2> /* tag */,
Vec128<int16_t, N> v) {
const Simd<int16_t, N, 0> d;
const RepartitionToWide<decltype(d)> d32;
// Sign-extend
const auto even = ShiftRight<16>(ShiftLeft<16>(BitCast(d32, v)));
const auto odd = ShiftRight<16>(BitCast(d32, v));
const auto sum = SumOfLanes(hwy::SizeTag<4>(), even + odd);
// Also broadcast into odd lanes.
return OddEven(BitCast(d, ShiftLeft<16>(sum)), BitCast(d, sum));
}
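
These 16-bit reductions reuse the existing 32-bit reduction: the even and odd 16-bit halves of each 32-bit lane are summed separately, reduced in 32 bits, and the (truncated) result is broadcast back into both halves. A scalar model of the same computation, as a sketch (not part of the diff; n assumed even, matching the >= 2-lane requirement):

#include <stddef.h>
#include <stdint.h>

// Scalar model of the u16 SumOfLanes path above: view pairs of u16 lanes as
// one u32, add even and odd halves, accumulate in 32 bits, truncate to 16.
uint16_t SumOfLanesU16(const uint16_t* lanes, size_t n) {
  uint32_t sum = 0;
  for (size_t i = 0; i < n; i += 2) {
    const uint32_t pair = static_cast<uint32_t>(lanes[i]) |
                          (static_cast<uint32_t>(lanes[i + 1]) << 16);
    sum += (pair & 0xFFFFu) + (pair >> 16);  // even half + odd half
  }
  return static_cast<uint16_t>(sum);  // every result lane holds this value
}
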
template <size_t N, HWY_IF_GE32(uint16_t, N)>
HWY_API Vec128<uint16_t, N> MinOfLanes(hwy::SizeTag<2> /* tag */,
Vec128<uint16_t, N> v) {
@ -5053,7 +5178,7 @@ HWY_API Vec128<int16_t, N> MaxOfLanes(hwy::SizeTag<2> /* tag */,
template <typename T, size_t N>
HWY_API Vec128<T, N> SumOfLanes(Simd<T, N, 0> /* tag */, const Vec128<T, N> v) {
return detail::SumOfLanes(v);
return detail::SumOfLanes(hwy::SizeTag<sizeof(T)>(), v);
}
template <typename T, size_t N>
HWY_API Vec128<T, N> MinOfLanes(Simd<T, N, 0> /* tag */, const Vec128<T, N> v) {
@ -5399,6 +5524,15 @@ HWY_API size_t CountTrue(Simd<T, N, 0> d, const Mask128<T, N> mask) {
constexpr int kDiv = 4 * sizeof(T);
return PopCount(detail::NibblesFromMask(d, mask)) / kDiv;
}
template <typename T, size_t N>
HWY_API size_t FindKnownFirstTrue(const Simd<T, N, 0> d,
const Mask128<T, N> mask) {
const uint64_t nib = detail::NibblesFromMask(d, mask);
constexpr size_t kDiv = 4 * sizeof(T);
return Num0BitsBelowLS1Bit_Nonzero64(nib) / kDiv;
}
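
FindKnownFirstTrue is the cheaper sibling of FindFirstTrue: Num0BitsBelowLS1Bit_Nonzero64 requires a nonzero argument, so the caller must guarantee at least one true lane. A usage sketch with an illustrative search loop (not part of the diff; assumes the key is present and the buffer is readable up to the next vector boundary):

#include <stddef.h>
#include <stdint.h>
#include "hwy/highway.h"
namespace hn = hwy::HWY_NAMESPACE;

// Returns the index of the first byte equal to `key`, which is known to occur.
size_t FindKnown(const uint8_t* HWY_RESTRICT p, uint8_t key) {
  const hn::ScalableTag<uint8_t> d;
  const auto vkey = hn::Set(d, key);
  for (size_t i = 0;; i += hn::Lanes(d)) {
    const auto eq = hn::Eq(hn::LoadU(d, p + i), vkey);
    // Only call FindKnownFirstTrue once we know a lane is set.
    if (!hn::AllFalse(d, eq)) return i + hn::FindKnownFirstTrue(d, eq);
  }
}
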
template <typename T, size_t N>
HWY_API intptr_t FindFirstTrue(const Simd<T, N, 0> d,
const Mask128<T, N> mask) {
@ -6334,7 +6468,7 @@ HWY_API void StoreInterleaved4(const Vec128<T> v0, const Vec128<T> v1,
template <typename T, size_t N, HWY_IF_LE128(T, N)>
HWY_INLINE Mask128<T, N> Lt128(Simd<T, N, 0> d, Vec128<T, N> a,
Vec128<T, N> b) {
static_assert(!IsSigned<T>() && sizeof(T) == 8, "Use u64");
static_assert(!IsSigned<T>() && sizeof(T) == 8, "T must be u64");
// Truth table of Eq and Lt for Hi and Lo u64.
// (removed lines with (=H && cH) or (=L && cL) - cannot both be true)
// =H =L cH cL | out = cH | (=H & cL)
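
A worked instance of the formula in that comment (out = cH | (=H & cL)), written out as a scalar model with the u64 (lo, hi) halves Highway uses for 128-bit keys; a sketch, not part of the diff:

#include <stdint.h>

// Scalar model of Lt128: the high halves decide unless they are equal, in
// which case the low halves decide.
bool Lt128Scalar(uint64_t a_lo, uint64_t a_hi, uint64_t b_lo, uint64_t b_hi) {
  const bool eqH = (a_hi == b_hi);
  const bool cH = (a_hi < b_hi);
  const bool cL = (a_lo < b_lo);
  return cH | (eqH & cL);
}
// Example: a_hi=1, a_lo=~0ull and b_hi=2, b_lo=0: cH is true (1 < 2), so
// a < b regardless of the low halves.
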
@ -6371,7 +6505,7 @@ HWY_INLINE Mask128<T, N> Lt128Upper(Simd<T, N, 0> d, Vec128<T, N> a,
template <typename T, size_t N, HWY_IF_LE128(T, N)>
HWY_INLINE Mask128<T, N> Eq128(Simd<T, N, 0> d, Vec128<T, N> a,
Vec128<T, N> b) {
static_assert(!IsSigned<T>() && sizeof(T) == 8, "Use u64");
static_assert(!IsSigned<T>() && sizeof(T) == 8, "T must be u64");
const Vec128<T, N> eqHL = VecFromMask(d, Eq(a, b));
return MaskFromVec(And(Reverse2(d, eqHL), eqHL));
}
@ -6383,6 +6517,23 @@ HWY_INLINE Mask128<T, N> Eq128Upper(Simd<T, N, 0> d, Vec128<T, N> a,
return MaskFromVec(InterleaveUpper(d, eqHL, eqHL));
}
// ------------------------------ Ne128
template <typename T, size_t N, HWY_IF_LE128(T, N)>
HWY_INLINE Mask128<T, N> Ne128(Simd<T, N, 0> d, Vec128<T, N> a,
Vec128<T, N> b) {
static_assert(!IsSigned<T>() && sizeof(T) == 8, "T must be u64");
const Vec128<T, N> neHL = VecFromMask(d, Ne(a, b));
return MaskFromVec(Or(Reverse2(d, neHL), neHL));
}
template <typename T, size_t N, HWY_IF_LE128(T, N)>
HWY_INLINE Mask128<T, N> Ne128Upper(Simd<T, N, 0> d, Vec128<T, N> a,
Vec128<T, N> b) {
const Vec128<T, N> neHL = VecFromMask(d, Ne(a, b));
return MaskFromVec(InterleaveUpper(d, neHL, neHL));
}
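
Ne128 and Ne128Upper mirror Eq128/Eq128Upper: u64 lanes are treated as (lo, hi) pairs forming 128-bit keys, and the resulting mask is replicated to both lanes of each pair (or derived only from the upper lane for the *Upper variants). A usage sketch, assuming keys are stored low half first (illustrative function name, not part of the diff):

#include <stddef.h>
#include <stdint.h>
#include "hwy/highway.h"
namespace hn = hwy::HWY_NAMESPACE;

// Counts 128-bit keys (stored as u64 pairs: lo then hi) that differ between
// a and b. num_u64 is a multiple of Lanes(d).
size_t CountDifferent128(const uint64_t* HWY_RESTRICT a,
                         const uint64_t* HWY_RESTRICT b, size_t num_u64) {
  const hn::ScalableTag<uint64_t> d;
  size_t diff_lanes = 0;
  for (size_t i = 0; i < num_u64; i += hn::Lanes(d)) {
    const auto va = hn::LoadU(d, a + i);
    const auto vb = hn::LoadU(d, b + i);
    diff_lanes += hn::CountTrue(d, hn::Ne128(d, va, vb));
  }
  return diff_lanes / 2;  // the mask is set for both lanes of a differing pair
}
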
// ------------------------------ Min128, Max128 (Lt128)
// Without a native OddEven, it seems infeasible to go faster than Lt128.

View File

@ -265,6 +265,9 @@ HWY_API size_t Lanes(Simd<T, N, kPow2> d) {
HWY_SVE_FOREACH(HWY_SVE_FIRSTN, FirstN, whilelt)
#undef HWY_SVE_FIRSTN
template <class D>
using MFromD = decltype(FirstN(D(), 0));
namespace detail {
#define HWY_SVE_WRAP_PTRUE(BASE, CHAR, BITS, HALF, NAME, OP) \
@ -320,7 +323,9 @@ using VFromD = decltype(Set(D(), TFromD<D>()));
template <class D>
VFromD<D> Zero(D d) {
return Set(d, 0);
// Cast to support bfloat16_t.
const RebindToUnsigned<decltype(d)> du;
return BitCast(d, Set(du, 0));
}
// ------------------------------ Undefined
@ -638,10 +643,9 @@ HWY_SVE_FOREACH_UIF3264(HWY_SVE_RETV_ARGPVV, Mul, mul)
// ------------------------------ MulHigh
HWY_SVE_FOREACH_UI16(HWY_SVE_RETV_ARGPVV, MulHigh, mulh)
namespace detail {
// Not part of API, used internally:
HWY_SVE_FOREACH_UI32(HWY_SVE_RETV_ARGPVV, MulHigh, mulh)
HWY_SVE_FOREACH_U64(HWY_SVE_RETV_ARGPVV, MulHigh, mulh)
} // namespace detail
// ------------------------------ MulFixedPoint15
HWY_API svint16_t MulFixedPoint15(svint16_t a, svint16_t b) {
@ -732,6 +736,10 @@ HWY_API svbool_t Xor(svbool_t a, svbool_t b) {
return svsel_b(a, svnand_b_z(a, a, b), b); // a ? !(a & b) : b.
}
HWY_API svbool_t ExclusiveNeither(svbool_t a, svbool_t b) {
return svnor_b_z(HWY_SVE_PTRUE(8), a, b); // !a && !b, undefined if a && b.
}
// ------------------------------ CountTrue
#define HWY_SVE_COUNT_TRUE(BASE, CHAR, BITS, HALF, NAME, OP) \
@ -777,6 +785,12 @@ HWY_API intptr_t FindFirstTrue(D d, svbool_t m) {
CountTrue(d, svbrkb_b_z(detail::MakeMask(d), m)));
}
// ------------------------------ FindKnownFirstTrue
template <class D>
HWY_API size_t FindKnownFirstTrue(D d, svbool_t m) {
return CountTrue(d, svbrkb_b_z(detail::MakeMask(d), m));
}
// ------------------------------ IfThenElse
#define HWY_SVE_IF_THEN_ELSE(BASE, CHAR, BITS, HALF, NAME, OP) \
HWY_API HWY_SVE_V(BASE, BITS) \
@ -1221,8 +1235,9 @@ HWY_API svint32_t PromoteTo(Simd<int32_t, N, kPow2> dto, svuint8_t vfrom) {
// ------------------------------ PromoteTo F
// Unlike Highway's ZipLower, this returns the same type.
namespace detail {
HWY_SVE_FOREACH(HWY_SVE_RETV_ARGVV, ZipLower, zip1)
HWY_SVE_FOREACH(HWY_SVE_RETV_ARGVV, ZipLowerSame, zip1)
} // namespace detail
template <size_t N, int kPow2>
@ -1230,21 +1245,21 @@ HWY_API svfloat32_t PromoteTo(Simd<float32_t, N, kPow2> /* d */,
const svfloat16_t v) {
// svcvt* expects inputs in even lanes, whereas Highway wants lower lanes, so
// first replicate each lane once.
const svfloat16_t vv = detail::ZipLower(v, v);
const svfloat16_t vv = detail::ZipLowerSame(v, v);
return svcvt_f32_f16_x(detail::PTrue(Simd<float16_t, N, kPow2>()), vv);
}
template <size_t N, int kPow2>
HWY_API svfloat64_t PromoteTo(Simd<float64_t, N, kPow2> /* d */,
const svfloat32_t v) {
const svfloat32_t vv = detail::ZipLower(v, v);
const svfloat32_t vv = detail::ZipLowerSame(v, v);
return svcvt_f64_f32_x(detail::PTrue(Simd<float32_t, N, kPow2>()), vv);
}
template <size_t N, int kPow2>
HWY_API svfloat64_t PromoteTo(Simd<float64_t, N, kPow2> /* d */,
const svint32_t v) {
const svint32_t vv = detail::ZipLower(v, v);
const svint32_t vv = detail::ZipLowerSame(v, v);
return svcvt_f64_s32_x(detail::PTrue(Simd<int32_t, N, kPow2>()), vv);
}
@ -1431,8 +1446,8 @@ namespace detail {
NAME(HWY_SVE_V(BASE, BITS) hi, HWY_SVE_V(BASE, BITS) lo) { \
return sv##OP##_##CHAR##BITS(lo, hi); \
}
HWY_SVE_FOREACH(HWY_SVE_CONCAT_EVERY_SECOND, ConcatEven, uzp1)
HWY_SVE_FOREACH(HWY_SVE_CONCAT_EVERY_SECOND, ConcatOdd, uzp2)
HWY_SVE_FOREACH(HWY_SVE_CONCAT_EVERY_SECOND, ConcatEvenFull, uzp1)
HWY_SVE_FOREACH(HWY_SVE_CONCAT_EVERY_SECOND, ConcatOddFull, uzp2)
#if defined(__ARM_FEATURE_SVE_MATMUL_FP64)
HWY_SVE_FOREACH(HWY_SVE_CONCAT_EVERY_SECOND, ConcatEvenBlocks, uzp1q)
HWY_SVE_FOREACH(HWY_SVE_CONCAT_EVERY_SECOND, ConcatOddBlocks, uzp2q)
@ -1455,10 +1470,10 @@ template <class D>
HWY_API VFromD<D> ConcatOdd(D d, VFromD<D> hi, VFromD<D> lo) {
#if HWY_SVE_IS_POW2
(void)d;
return detail::ConcatOdd(hi, lo);
return detail::ConcatOddFull(hi, lo);
#else
const VFromD<D> hi_odd = detail::ConcatOdd(hi, hi);
const VFromD<D> lo_odd = detail::ConcatOdd(lo, lo);
const VFromD<D> hi_odd = detail::ConcatOddFull(hi, hi);
const VFromD<D> lo_odd = detail::ConcatOddFull(lo, lo);
return detail::Splice(hi_odd, lo_odd, FirstN(d, Lanes(d) / 2));
#endif
}
@ -1467,10 +1482,10 @@ template <class D>
HWY_API VFromD<D> ConcatEven(D d, VFromD<D> hi, VFromD<D> lo) {
#if HWY_SVE_IS_POW2
(void)d;
return detail::ConcatEven(hi, lo);
return detail::ConcatEvenFull(hi, lo);
#else
const VFromD<D> hi_odd = detail::ConcatEven(hi, hi);
const VFromD<D> lo_odd = detail::ConcatEven(lo, lo);
const VFromD<D> hi_odd = detail::ConcatEvenFull(hi, hi);
const VFromD<D> lo_odd = detail::ConcatEvenFull(lo, lo);
return detail::Splice(hi_odd, lo_odd, FirstN(d, Lanes(d) / 2));
#endif
}
@ -1480,25 +1495,28 @@ HWY_API VFromD<D> ConcatEven(D d, VFromD<D> hi, VFromD<D> lo) {
template <size_t N, int kPow2>
HWY_API svfloat16_t DemoteTo(Simd<float16_t, N, kPow2> d, const svfloat32_t v) {
const svfloat16_t in_even = svcvt_f16_f32_x(detail::PTrue(d), v);
return detail::ConcatEven(in_even, in_even); // only low 1/2 of result valid
return detail::ConcatEvenFull(in_even,
in_even); // lower half
}
template <size_t N, int kPow2>
HWY_API svuint16_t DemoteTo(Simd<bfloat16_t, N, kPow2> /* d */, svfloat32_t v) {
const svuint16_t in_even = BitCast(ScalableTag<uint16_t>(), v);
return detail::ConcatOdd(in_even, in_even); // can ignore upper half of vec
return detail::ConcatOddFull(in_even, in_even); // lower half
}
template <size_t N, int kPow2>
HWY_API svfloat32_t DemoteTo(Simd<float32_t, N, kPow2> d, const svfloat64_t v) {
const svfloat32_t in_even = svcvt_f32_f64_x(detail::PTrue(d), v);
return detail::ConcatEven(in_even, in_even); // only low 1/2 of result valid
return detail::ConcatEvenFull(in_even,
in_even); // lower half
}
template <size_t N, int kPow2>
HWY_API svint32_t DemoteTo(Simd<int32_t, N, kPow2> d, const svfloat64_t v) {
const svint32_t in_even = svcvt_s32_f64_x(detail::PTrue(d), v);
return detail::ConcatEven(in_even, in_even); // only low 1/2 of result valid
return detail::ConcatEvenFull(in_even,
in_even); // lower half
}
// ------------------------------ ConvertTo F
@ -1559,15 +1577,15 @@ HWY_API V InterleaveLower(D d, const V a, const V b) {
static_assert(IsSame<TFromD<D>, TFromV<V>>(), "D/V mismatch");
#if HWY_TARGET == HWY_SVE2_128
(void)d;
return detail::ZipLower(a, b);
return detail::ZipLowerSame(a, b);
#else
// Move lower halves of blocks to lower half of vector.
const Repartition<uint64_t, decltype(d)> d64;
const auto a64 = BitCast(d64, a);
const auto b64 = BitCast(d64, b);
const auto a_blocks = detail::ConcatEven(a64, a64); // only lower half needed
const auto b_blocks = detail::ConcatEven(b64, b64);
return detail::ZipLower(BitCast(d, a_blocks), BitCast(d, b_blocks));
const auto a_blocks = detail::ConcatEvenFull(a64, a64); // lower half
const auto b_blocks = detail::ConcatEvenFull(b64, b64);
return detail::ZipLowerSame(BitCast(d, a_blocks), BitCast(d, b_blocks));
#endif
}
@ -1582,7 +1600,8 @@ HWY_API V InterleaveLower(const V a, const V b) {
// "upper half" requires MaskUpperHalf.
#if HWY_TARGET == HWY_SVE2_128
namespace detail {
HWY_SVE_FOREACH(HWY_SVE_RETV_ARGVV, ZipUpper, zip2)
// Unlike Highway's ZipUpper, this returns the same type.
HWY_SVE_FOREACH(HWY_SVE_RETV_ARGVV, ZipUpperSame, zip2)
} // namespace detail
#endif
@ -1592,15 +1611,15 @@ template <class D, class V = VFromD<D>,
HWY_API V InterleaveUpper(D d, const V a, const V b) {
#if HWY_TARGET == HWY_SVE2_128
(void)d;
return detail::ZipUpper(a, b);
return detail::ZipUpperSame(a, b);
#else
// Move upper halves of blocks to lower half of vector.
const Repartition<uint64_t, decltype(d)> d64;
const auto a64 = BitCast(d64, a);
const auto b64 = BitCast(d64, b);
const auto a_blocks = detail::ConcatOdd(a64, a64); // only lower half needed
const auto b_blocks = detail::ConcatOdd(b64, b64);
return detail::ZipLower(BitCast(d, a_blocks), BitCast(d, b_blocks));
const auto a_blocks = detail::ConcatOddFull(a64, a64); // lower half
const auto b_blocks = detail::ConcatOddFull(b64, b64);
return detail::ZipLowerSame(BitCast(d, a_blocks), BitCast(d, b_blocks));
#endif
}
@ -1814,12 +1833,17 @@ HWY_API V LowerHalf(const V v) {
return v;
}
template <class D2, class V>
HWY_API V UpperHalf(const D2 d2, const V v) {
template <class DH, class V>
HWY_API V UpperHalf(const DH dh, const V v) {
const Twice<decltype(dh)> d;
// Cast so that we support bfloat16_t.
const RebindToUnsigned<decltype(d)> du;
const VFromD<decltype(du)> vu = BitCast(du, v);
#if HWY_TARGET == HWY_SVE_256 || HWY_TARGET == HWY_SVE2_128 // constexpr Lanes
return detail::Ext<Lanes(d2)>(v, v);
return BitCast(d, detail::Ext<Lanes(dh)>(vu, vu));
#else
return detail::Splice(v, v, detail::MaskUpperHalf(Twice<decltype(d2)>()));
const MFromD<decltype(du)> mask = detail::MaskUpperHalf(du);
return BitCast(d, detail::Splice(vu, vu, mask));
#endif
}
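
The BitCast-to-unsigned detour above (also used in Zero) is the usual workaround for element types the target intrinsics do not cover, notably bfloat16_t: do the lane movement on same-sized unsigned integers and cast back. A generic sketch of the idiom (illustrative helper name, not part of the diff):

#include "hwy/highway.h"
namespace hn = hwy::HWY_NAMESPACE;

// Applies a lane-rearranging op to a type the target may not support natively
// (e.g. bfloat16_t) by round-tripping through the same-sized unsigned type.
template <class D, class V = hn::VFromD<D>>
V OddEvenViaUnsigned(D d, V a, V b) {
  const hn::RebindToUnsigned<D> du;  // u16 lanes for bfloat16_t
  return hn::BitCast(d, hn::OddEven(hn::BitCast(du, a), hn::BitCast(du, b)));
}
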
@ -1842,14 +1866,14 @@ namespace detail {
return sv##OP##_##CHAR##BITS(pg, v); \
}
HWY_SVE_FOREACH_UI(HWY_SVE_REDUCE_ADD, SumOfLanes, addv)
HWY_SVE_FOREACH_F(HWY_SVE_REDUCE, SumOfLanes, addv)
HWY_SVE_FOREACH_UI(HWY_SVE_REDUCE_ADD, SumOfLanesM, addv)
HWY_SVE_FOREACH_F(HWY_SVE_REDUCE, SumOfLanesM, addv)
HWY_SVE_FOREACH_UI(HWY_SVE_REDUCE, MinOfLanes, minv)
HWY_SVE_FOREACH_UI(HWY_SVE_REDUCE, MaxOfLanes, maxv)
HWY_SVE_FOREACH_UI(HWY_SVE_REDUCE, MinOfLanesM, minv)
HWY_SVE_FOREACH_UI(HWY_SVE_REDUCE, MaxOfLanesM, maxv)
// NaN if all are
HWY_SVE_FOREACH_F(HWY_SVE_REDUCE, MinOfLanes, minnmv)
HWY_SVE_FOREACH_F(HWY_SVE_REDUCE, MaxOfLanes, maxnmv)
HWY_SVE_FOREACH_F(HWY_SVE_REDUCE, MinOfLanesM, minnmv)
HWY_SVE_FOREACH_F(HWY_SVE_REDUCE, MaxOfLanesM, maxnmv)
#undef HWY_SVE_REDUCE
#undef HWY_SVE_REDUCE_ADD
@ -1857,17 +1881,17 @@ HWY_SVE_FOREACH_F(HWY_SVE_REDUCE, MaxOfLanes, maxnmv)
template <class D, class V>
V SumOfLanes(D d, V v) {
return Set(d, detail::SumOfLanes(detail::MakeMask(d), v));
return Set(d, detail::SumOfLanesM(detail::MakeMask(d), v));
}
template <class D, class V>
V MinOfLanes(D d, V v) {
return Set(d, detail::MinOfLanes(detail::MakeMask(d), v));
return Set(d, detail::MinOfLanesM(detail::MakeMask(d), v));
}
template <class D, class V>
V MaxOfLanes(D d, V v) {
return Set(d, detail::MaxOfLanes(detail::MakeMask(d), v));
return Set(d, detail::MaxOfLanesM(detail::MakeMask(d), v));
}
@ -1882,19 +1906,19 @@ namespace detail {
return sv##OP##_##CHAR##BITS(mask, v); \
}
HWY_SVE_FOREACH(HWY_SVE_GET_LANE, GetLane, lasta)
HWY_SVE_FOREACH(HWY_SVE_GET_LANE, GetLaneM, lasta)
#undef HWY_SVE_GET_LANE
} // namespace detail
template <class V>
HWY_API TFromV<V> GetLane(V v) {
return detail::GetLane(v, detail::PFalse());
return detail::GetLaneM(v, detail::PFalse());
}
// ------------------------------ ExtractLane
template <class V>
HWY_API TFromV<V> ExtractLane(V v, size_t i) {
return detail::GetLane(v, FirstN(DFromV<V>(), i));
return detail::GetLaneM(v, FirstN(DFromV<V>(), i));
}
// ------------------------------ InsertLane (IfThenElse)
@ -2154,7 +2178,7 @@ HWY_API V Compress(V v, svbool_t mask) {
// bits 1, 2, 4, 8. Pre-multiply by N so we can use it as an offset for
// SetTableIndices.
const svuint64_t bits = Shl(Set(du64, 1), Iota(du64, 2));
const size_t offset = detail::SumOfLanes(mask, bits);
const size_t offset = detail::SumOfLanesM(mask, bits);
// See CompressIsPartition.
alignas(16) static constexpr uint64_t table[4 * 16] = {
@ -2196,8 +2220,8 @@ HWY_API V Compress(V v, svbool_t mask16) {
// Demote to 16-bit (already in range) - separately so we can splice
const V evenL = BitCast(d16, compressedL);
const V evenH = BitCast(d16, compressedH);
const V v16L = detail::ConcatEven(evenL, evenL); // only lower half needed
const V v16H = detail::ConcatEven(evenH, evenH);
const V v16L = detail::ConcatEvenFull(evenL, evenL); // lower half
const V v16H = detail::ConcatEvenFull(evenH, evenH);
// We need to combine two vectors of non-constexpr length, so the only option
// is Splice, which requires us to synthesize a mask. NOTE: this function uses
@ -2240,7 +2264,7 @@ HWY_API V CompressNot(V v, svbool_t mask) {
// bits 1, 2, 4, 8. Pre-multiply by N so we can use it as an offset for
// SetTableIndices.
const svuint64_t bits = Shl(Set(du64, 1), Iota(du64, 2));
const size_t offset = detail::SumOfLanes(mask, bits);
const size_t offset = detail::SumOfLanesM(mask, bits);
// See CompressIsPartition.
alignas(16) static constexpr uint64_t table[4 * 16] = {
@ -2478,7 +2502,7 @@ namespace detail {
return sv##OP##_##CHAR##BITS(v, kLane); \
}
HWY_SVE_FOREACH(HWY_SVE_BROADCAST, Broadcast, dup_lane)
HWY_SVE_FOREACH(HWY_SVE_BROADCAST, BroadcastLane, dup_lane)
#undef HWY_SVE_BROADCAST
} // namespace detail
#endif
@ -2490,7 +2514,7 @@ HWY_API V Broadcast(const V v) {
constexpr size_t kLanesPerBlock = detail::LanesPerBlock(du);
static_assert(0 <= kLane && kLane < kLanesPerBlock, "Invalid lane");
#if HWY_TARGET == HWY_SVE2_128
return detail::Broadcast<kLane>(v);
return detail::BroadcastLane<kLane>(v);
#else
auto idx = detail::OffsetsOf128BitBlocks(du, Iota(du, 0));
if (kLane != 0) {
@ -2585,10 +2609,11 @@ HWY_API VFromD<DW> ZipUpper(DW dw, V a, V b) {
template <size_t N, int kPow2>
HWY_API svfloat32_t PromoteTo(Simd<float32_t, N, kPow2> df32,
const svuint16_t v) {
return BitCast(df32, detail::ZipLower(svdup_n_u16(0), v));
return BitCast(df32, detail::ZipLowerSame(svdup_n_u16(0), v));
}
// ------------------------------ ReorderDemote2To (OddEven)
template <size_t N, int kPow2>
HWY_API svuint16_t ReorderDemote2To(Simd<bfloat16_t, N, kPow2> dbf16,
svfloat32_t a, svfloat32_t b) {
@ -2598,6 +2623,21 @@ HWY_API svuint16_t ReorderDemote2To(Simd<bfloat16_t, N, kPow2> dbf16,
return BitCast(dbf16, OddEven(BitCast(du16, a), BitCast(du16, b_in_even)));
}
template <size_t N, int kPow2>
HWY_API svint16_t ReorderDemote2To(Simd<int16_t, N, kPow2> d16, svint32_t a,
svint32_t b) {
#if HWY_TARGET == HWY_SVE2 || HWY_TARGET == HWY_SVE2_128
(void)d16;
const svint16_t a_in_even = svqxtnb_s32(a);
return svqxtnt_s32(a_in_even, b);
#else
const Half<decltype(d16)> dh;
const svint16_t a16 = BitCast(dh, detail::SaturateI<int16_t>(a));
const svint16_t b16 = BitCast(dh, detail::SaturateI<int16_t>(b));
return detail::InterleaveEven(a16, b16);
#endif
}
// ------------------------------ ZeroIfNegative (Lt, IfThenElse)
template <class V>
HWY_API V ZeroIfNegative(const V v) {
@ -2716,7 +2756,7 @@ template <class T, HWY_IF_LANE_SIZE(T, 2)>
HWY_INLINE svuint8_t BoolFromMask(svbool_t m) {
const ScalableTag<uint8_t> d8;
const svuint8_t b16 = BitCast(d8, svdup_n_u16_z(m, 1));
return detail::ConcatEven(b16, b16); // only lower half needed
return detail::ConcatEvenFull(b16, b16); // lower half
}
template <class T, HWY_IF_LANE_SIZE(T, 4)>
HWY_INLINE svuint8_t BoolFromMask(svbool_t m) {
@ -2726,7 +2766,7 @@ template <class T, HWY_IF_LANE_SIZE(T, 8)>
HWY_INLINE svuint8_t BoolFromMask(svbool_t m) {
const ScalableTag<uint32_t> d32;
const svuint32_t b64 = BitCast(d32, svdup_n_u64_z(m, 1));
return U8FromU32(detail::ConcatEven(b64, b64)); // only lower half needed
return U8FromU32(detail::ConcatEvenFull(b64, b64)); // lower half
}
// Compacts groups of 8 u8 into 8 contiguous bits in a 64-bit lane.
@ -2791,7 +2831,7 @@ namespace detail {
return sv##OP##_##CHAR##BITS(a, b); \
}
HWY_SVE_FOREACH_UI64(HWY_SVE_MUL_EVEN, MulEven, mullb)
HWY_SVE_FOREACH_UI64(HWY_SVE_MUL_EVEN, MulEvenNative, mullb)
#undef HWY_SVE_MUL_EVEN
} // namespace detail
#endif
@ -2799,27 +2839,28 @@ HWY_SVE_FOREACH_UI64(HWY_SVE_MUL_EVEN, MulEven, mullb)
template <class V, class DW = RepartitionToWide<DFromV<V>>>
HWY_API VFromD<DW> MulEven(const V a, const V b) {
#if HWY_TARGET == HWY_SVE2
return BitCast(DW(), detail::MulEven(a, b));
return BitCast(DW(), detail::MulEvenNative(a, b));
#else
const auto lo = Mul(a, b);
const auto hi = detail::MulHigh(a, b);
const auto hi = MulHigh(a, b);
return BitCast(DW(), detail::InterleaveEven(lo, hi));
#endif
}
HWY_API svuint64_t MulEven(const svuint64_t a, const svuint64_t b) {
const auto lo = Mul(a, b);
const auto hi = detail::MulHigh(a, b);
const auto hi = MulHigh(a, b);
return detail::InterleaveEven(lo, hi);
}
HWY_API svuint64_t MulOdd(const svuint64_t a, const svuint64_t b) {
const auto lo = Mul(a, b);
const auto hi = detail::MulHigh(a, b);
const auto hi = MulHigh(a, b);
return detail::InterleaveOdd(lo, hi);
}
// ------------------------------ ReorderWidenMulAccumulate (MulAdd, ZipLower)
template <size_t N, int kPow2>
HWY_API svfloat32_t ReorderWidenMulAccumulate(Simd<float, N, kPow2> df32,
svuint16_t a, svuint16_t b,
@ -2837,6 +2878,33 @@ HWY_API svfloat32_t ReorderWidenMulAccumulate(Simd<float, N, kPow2> df32,
return MulAdd(BitCast(df32, a0), BitCast(df32, b0), sum0);
}
template <size_t N, int kPow2>
HWY_API svint32_t ReorderWidenMulAccumulate(Simd<int32_t, N, kPow2> d32,
svint16_t a, svint16_t b,
const svint32_t sum0,
svint32_t& sum1) {
#if HWY_TARGET == HWY_SVE2 || HWY_TARGET == HWY_SVE2_128
(void)d32;
sum1 = svmlalt_s32(sum1, a, b);
return svmlalb_s32(sum0, a, b);
#else
const svbool_t pg = detail::PTrue(d32);
const svint32_t a0 = svunpklo_s32(a);
const svint32_t b0 = svunpklo_s32(b);
svint32_t a1, b1;
if (detail::IsFull(d32)) {
a1 = svunpkhi_s32(a);
b1 = svunpkhi_s32(b);
} else {
const Rebind<int16_t, decltype(d32)> d16h;
a1 = svunpklo_s32(UpperHalf(d16h, a));
b1 = svunpklo_s32(UpperHalf(d16h, b));
}
sum1 = svmla_s32_x(pg, sum1, a1, b1);
return svmla_s32_x(pg, sum0, a0, b0);
#endif
}
// ------------------------------ AESRound / CLMul
#if defined(__ARM_FEATURE_SVE2_AES) || \
@ -2886,7 +2954,8 @@ HWY_SVE_FOREACH_U(HWY_SVE_DUP, DupOddB, trn2) // actually for bool
#if HWY_TARGET == HWY_SVE_256 || HWY_IDE
template <class D>
HWY_INLINE svuint64_t Lt128Vec(D d, const svuint64_t a, const svuint64_t b) {
static_assert(!IsSigned<TFromD<D>>() && sizeof(TFromD<D>) == 8, "Use u64");
static_assert(!IsSigned<TFromD<D>>() && sizeof(TFromD<D>) == 8,
"D must be u64");
const svbool_t eqHx = Eq(a, b); // only odd lanes used
// Convert to vector: more pipelines can execute vector TRN* instructions
// than the predicate version.
@ -2905,7 +2974,8 @@ HWY_INLINE svbool_t Lt128(D d, const svuint64_t a, const svuint64_t b) {
#if HWY_TARGET == HWY_SVE_256
return MaskFromVec(detail::Lt128Vec(d, a, b));
#else
static_assert(!IsSigned<TFromD<D>>() && sizeof(TFromD<D>) == 8, "Use u64");
static_assert(!IsSigned<TFromD<D>>() && sizeof(TFromD<D>) == 8,
"D must be u64");
const svbool_t eqHx = Eq(a, b); // only odd lanes used
const svbool_t ltHL = Lt(a, b);
// Move into upper lane: ltL if the upper half is equal, otherwise ltH.
@ -2919,18 +2989,21 @@ HWY_INLINE svbool_t Lt128(D d, const svuint64_t a, const svuint64_t b) {
template <class D>
HWY_INLINE svbool_t Lt128Upper(D d, svuint64_t a, svuint64_t b) {
static_assert(!IsSigned<TFromD<D>>() && sizeof(TFromD<D>) == 8, "Use u64");
static_assert(!IsSigned<TFromD<D>>() && sizeof(TFromD<D>) == 8,
"D must be u64");
const svbool_t ltHL = Lt(a, b);
return detail::DupOddB(d, ltHL);
}
// ------------------------------ Eq128
// ------------------------------ Eq128, Ne128
#if HWY_TARGET == HWY_SVE_256 || HWY_IDE
namespace detail {
template <class D>
HWY_INLINE svuint64_t Eq128Vec(D d, const svuint64_t a, const svuint64_t b) {
static_assert(!IsSigned<TFromD<D>>() && sizeof(TFromD<D>) == 8, "Use u64");
static_assert(!IsSigned<TFromD<D>>() && sizeof(TFromD<D>) == 8,
"D must be u64");
// Convert to vector: more pipelines can execute vector TRN* instructions
// than the predicate version.
const svuint64_t eqHL = VecFromMask(d, Eq(a, b));
@ -2939,6 +3012,20 @@ HWY_INLINE svuint64_t Eq128Vec(D d, const svuint64_t a, const svuint64_t b) {
const svuint64_t eqLL = DupEven(eqHL);
return And(eqLL, eqHH);
}
template <class D>
HWY_INLINE svuint64_t Ne128Vec(D d, const svuint64_t a, const svuint64_t b) {
static_assert(!IsSigned<TFromD<D>>() && sizeof(TFromD<D>) == 8,
"D must be u64");
// Convert to vector: more pipelines can execute vector TRN* instructions
// than the predicate version.
const svuint64_t neHL = VecFromMask(d, Ne(a, b));
// Duplicate upper and lower.
const svuint64_t neHH = DupOdd(neHL);
const svuint64_t neLL = DupEven(neHL);
return Or(neLL, neHH);
}
} // namespace detail
#endif
@ -2947,7 +3034,8 @@ HWY_INLINE svbool_t Eq128(D d, const svuint64_t a, const svuint64_t b) {
#if HWY_TARGET == HWY_SVE_256
return MaskFromVec(detail::Eq128Vec(d, a, b));
#else
static_assert(!IsSigned<TFromD<D>>() && sizeof(TFromD<D>) == 8, "Use u64");
static_assert(!IsSigned<TFromD<D>>() && sizeof(TFromD<D>) == 8,
"D must be u64");
const svbool_t eqHL = Eq(a, b);
const svbool_t eqHH = detail::DupOddB(d, eqHL);
const svbool_t eqLL = detail::DupEvenB(d, eqHL);
@ -2955,15 +3043,38 @@ HWY_INLINE svbool_t Eq128(D d, const svuint64_t a, const svuint64_t b) {
#endif // HWY_TARGET != HWY_SVE_256
}
// ------------------------------ Eq128Upper
template <class D>
HWY_INLINE svbool_t Ne128(D d, const svuint64_t a, const svuint64_t b) {
#if HWY_TARGET == HWY_SVE_256
return MaskFromVec(detail::Ne128Vec(d, a, b));
#else
static_assert(!IsSigned<TFromD<D>>() && sizeof(TFromD<D>) == 8,
"D must be u64");
const svbool_t neHL = Ne(a, b);
const svbool_t neHH = detail::DupOddB(d, neHL);
const svbool_t neLL = detail::DupEvenB(d, neHL);
return Or(neLL, neHH);
#endif // HWY_TARGET != HWY_SVE_256
}
// ------------------------------ Eq128Upper, Ne128Upper
template <class D>
HWY_INLINE svbool_t Eq128Upper(D d, svuint64_t a, svuint64_t b) {
static_assert(!IsSigned<TFromD<D>>() && sizeof(TFromD<D>) == 8, "Use u64");
static_assert(!IsSigned<TFromD<D>>() && sizeof(TFromD<D>) == 8,
"D must be u64");
const svbool_t eqHL = Eq(a, b);
return detail::DupOddB(d, eqHL);
}
template <class D>
HWY_INLINE svbool_t Ne128Upper(D d, svuint64_t a, svuint64_t b) {
static_assert(!IsSigned<TFromD<D>>() && sizeof(TFromD<D>) == 8,
"D must be u64");
const svbool_t neHL = Ne(a, b);
return detail::DupOddB(d, neHL);
}
// ------------------------------ Min128, Max128 (Lt128)
template <class D>

View File

@ -18,6 +18,7 @@
#include <stddef.h>
#include <stdint.h>
#include <cmath> // std::abs, std::isnan
#include "hwy/base.h"
#include "hwy/ops/shared-inl.h"
@ -32,6 +33,9 @@ using Full128 = Simd<T, 16 / sizeof(T), 0>;
// (Wrapper class required for overloading comparison operators.)
template <typename T, size_t N = 16 / sizeof(T)>
struct Vec128 {
using PrivateT = T; // only for DFromV
static constexpr size_t kPrivateN = N; // only for DFromV
HWY_INLINE Vec128() = default;
Vec128(const Vec128&) = default;
Vec128& operator=(const Vec128&) = default;
@ -78,23 +82,11 @@ struct Mask128 {
Raw bits[16 / sizeof(T)] = {};
};
namespace detail {
// Deduce Simd<T, N, 0> from Vec128<T, N>
struct Deduce128 {
template <typename T, size_t N>
Simd<T, N, 0> operator()(Vec128<T, N>) const {
return Simd<T, N, 0>();
}
};
} // namespace detail
template <class V>
using DFromV = Simd<typename V::PrivateT, V::kPrivateN, 0>;
template <class V>
using DFromV = decltype(detail::Deduce128()(V()));
template <class V>
using TFromV = TFromD<DFromV<V>>;
using TFromV = typename V::PrivateT;
// ------------------------------ BitCast
@ -380,6 +372,12 @@ HWY_API Mask128<T, N> Xor(const Mask128<T, N> a, Mask128<T, N> b) {
return MaskFromVec(Xor(VecFromMask(d, a), VecFromMask(d, b)));
}
template <typename T, size_t N>
HWY_API Mask128<T, N> ExclusiveNeither(const Mask128<T, N> a, Mask128<T, N> b) {
const Simd<T, N, 0> d;
return MaskFromVec(AndNot(VecFromMask(d, a), Not(VecFromMask(d, b))));
}
// ================================================== SHIFTS
// ------------------------------ ShiftLeft/ShiftRight (BroadcastSignBit)
@ -1235,6 +1233,14 @@ HWY_API Mask128<uint64_t> Eq128(Simd<uint64_t, 2, 0> /* tag */,
return ret;
}
HWY_API Mask128<uint64_t> Ne128(Simd<uint64_t, 2, 0> /* tag */,
Vec128<uint64_t> a, const Vec128<uint64_t> b) {
const bool ne = a.raw[1] != b.raw[1] || a.raw[0] != b.raw[0];
Mask128<uint64_t> ret;
ret.bits[0] = ret.bits[1] = Mask128<uint64_t>::FromBool(ne);
return ret;
}
HWY_API Mask128<uint64_t> Eq128Upper(Simd<uint64_t, 2, 0> /* tag */,
Vec128<uint64_t> a,
const Vec128<uint64_t> b) {
@ -1244,6 +1250,15 @@ HWY_API Mask128<uint64_t> Eq128Upper(Simd<uint64_t, 2, 0> /* tag */,
return ret;
}
HWY_API Mask128<uint64_t> Ne128Upper(Simd<uint64_t, 2, 0> /* tag */,
Vec128<uint64_t> a,
const Vec128<uint64_t> b) {
const bool ne = a.raw[1] != b.raw[1];
Mask128<uint64_t> ret;
ret.bits[0] = ret.bits[1] = Mask128<uint64_t>::FromBool(ne);
return ret;
}
// ------------------------------ Min128, Max128 (Lt128)
template <class D, class V = VFromD<D>>
@ -1548,6 +1563,22 @@ HWY_API Vec128<bfloat16_t, 2 * N> ReorderDemote2To(
return BitCast(dbf16, IfVecThenElse(a_mask, BitCast(du32, a), b_in_lower));
}
template <size_t N>
HWY_API Vec128<int16_t, 2 * N> ReorderDemote2To(Simd<int16_t, 2 * N, 0> /*d16*/,
Vec128<int32_t, N> a,
Vec128<int32_t, N> b) {
const int16_t min = LimitsMin<int16_t>();
const int16_t max = LimitsMax<int16_t>();
Vec128<int16_t, 2 * N> ret;
for (size_t i = 0; i < N; ++i) {
ret.raw[i] = static_cast<int16_t>(HWY_MIN(HWY_MAX(min, a.raw[i]), max));
}
for (size_t i = 0; i < N; ++i) {
ret.raw[N + i] = static_cast<int16_t>(HWY_MIN(HWY_MAX(min, b.raw[i]), max));
}
return ret;
}
namespace detail {
HWY_INLINE void StoreU16ToF16(const uint16_t val,
@ -2233,9 +2264,8 @@ HWY_API bool AllFalse(Simd<T, N, 0> /* tag */, const Mask128<T, N> mask) {
template <typename T, size_t N>
HWY_API bool AllTrue(Simd<T, N, 0> /* tag */, const Mask128<T, N> mask) {
using Bits = typename Mask128<T, N>::Raw;
constexpr Bits kAll = static_cast<Bits>(~Bits{0});
Bits and_sum = kAll;
constexpr uint64_t kAll = LimitsMax<typename Mask128<T, N>::Raw>();
uint64_t and_sum = kAll;
for (size_t i = 0; i < N; ++i) {
and_sum &= mask.bits[i];
}
@ -2280,6 +2310,16 @@ HWY_API size_t CountTrue(Simd<T, N, 0> /* tag */, const Mask128<T, N> mask) {
return count;
}
template <typename T, size_t N>
HWY_API size_t FindKnownFirstTrue(Simd<T, N, 0> /* tag */,
const Mask128<T, N> mask) {
for (size_t i = 0; i < N; ++i) {
if (mask.bits[i] != 0) return i;
}
HWY_DASSERT(false);
return 0;
}
template <typename T, size_t N>
HWY_API intptr_t FindFirstTrue(Simd<T, N, 0> /* tag */,
const Mask128<T, N> mask) {
@ -2379,6 +2419,7 @@ HWY_API size_t CompressBitsStore(Vec128<T, N> v,
}
// ------------------------------ ReorderWidenMulAccumulate (MulAdd, ZipLower)
template <size_t N>
HWY_API Vec128<float, N> ReorderWidenMulAccumulate(Simd<float, N, 0> df32,
Vec128<bfloat16_t, 2 * N> a,
@ -2395,6 +2436,20 @@ HWY_API Vec128<float, N> ReorderWidenMulAccumulate(Simd<float, N, 0> df32,
return MulAdd(BitCast(df32, a0), BitCast(df32, b0), sum0);
}
template <size_t N>
HWY_API Vec128<int32_t, N> ReorderWidenMulAccumulate(
Simd<int32_t, N, 0> d32, Vec128<int16_t, 2 * N> a, Vec128<int16_t, 2 * N> b,
const Vec128<int32_t, N> sum0, Vec128<int32_t, N>& sum1) {
const Rebind<int16_t, decltype(d32)> d16;
// Avoid ZipLower/Upper so this also works on big-endian systems.
const Vec128<int32_t, N> a0 = PromoteTo(d32, LowerHalf(d16, a));
const Vec128<int32_t, N> a1 = PromoteTo(d32, UpperHalf(d16, a));
const Vec128<int32_t, N> b0 = PromoteTo(d32, LowerHalf(d16, b));
const Vec128<int32_t, N> b1 = PromoteTo(d32, UpperHalf(d16, b));
sum1 = MulAdd(BitCast(d32, a1), BitCast(d32, b1), sum1);
return MulAdd(BitCast(d32, a0), BitCast(d32, b0), sum0);
}
// ================================================== REDUCTIONS
template <typename T, size_t N>

View File

@ -15,6 +15,14 @@
// Target-independent types/functions defined after target-specific ops.
#include "hwy/base.h"
// Define detail::Shuffle1230 etc, but only when viewing the current header;
// normally this is included via highway.h, which includes ops/*.h.
#if HWY_IDE && !defined(HWY_HIGHWAY_INCLUDED)
#include "hwy/ops/emu128-inl.h"
#endif // HWY_IDE
// Relies on the external include guard in highway.h.
HWY_BEFORE_NAMESPACE();
namespace hwy {
@ -476,31 +484,15 @@ HWY_API void StoreInterleaved2(const V v0, const V v1, Simd<T, N, 0> d,
detail::StoreTransposedBlocks2(v10L, v10U, d, unaligned);
}
// 64 bits
template <typename T>
HWY_API void StoreInterleaved2(const Vec64<T> part0, const Vec64<T> part1,
Full64<T> /*tag*/, T* HWY_RESTRICT unaligned) {
// Use full vectors to reduce the number of stores.
const Full128<T> d_full;
const Vec128<T> v0{part0.raw};
const Vec128<T> v1{part1.raw};
const auto v10 = InterleaveLower(d_full, v0, v1);
StoreU(v10, d_full, unaligned);
}
// <= 32 bits
template <typename T, size_t N, HWY_IF_LE32(T, N)>
HWY_API void StoreInterleaved2(const Vec128<T, N> part0,
const Vec128<T, N> part1, Simd<T, N, 0> /*tag*/,
// <= 64 bits
template <class V, typename T, size_t N, HWY_IF_LE64(T, N)>
HWY_API void StoreInterleaved2(const V part0, const V part1, Simd<T, N, 0> d,
T* HWY_RESTRICT unaligned) {
// Use full vectors to reduce the number of stores.
const Full128<T> d_full;
const Vec128<T> v0{part0.raw};
const Vec128<T> v1{part1.raw};
const auto v10 = InterleaveLower(d_full, v0, v1);
alignas(16) T buf[16 / sizeof(T)];
StoreU(v10, d_full, buf);
CopyBytes<2 * N * sizeof(T)>(buf, unaligned);
const Twice<decltype(d)> d2;
const auto v0 = ZeroExtendVector(d2, part0);
const auto v1 = ZeroExtendVector(d2, part1);
const auto v10 = InterleaveLower(d2, v0, v1);
StoreU(v10, d2, unaligned);
}
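
The rewritten partial-vector path above now widens via ZeroExtendVector and a single InterleaveLower instead of bouncing through a stack buffer. For context, a typical caller of StoreInterleaved2 (e.g. packing two planes into one interleaved buffer); illustrative only, not part of the diff:

#include <stddef.h>
#include <stdint.h>
#include "hwy/highway.h"
namespace hn = hwy::HWY_NAMESPACE;

// Interleaves two planar u8 arrays so that out[2*i] = x[i], out[2*i+1] = y[i].
// n is assumed to be a multiple of Lanes(d).
void InterleavePlanes(const uint8_t* HWY_RESTRICT x,
                      const uint8_t* HWY_RESTRICT y, size_t n,
                      uint8_t* HWY_RESTRICT out) {
  const hn::ScalableTag<uint8_t> d;
  for (size_t i = 0; i < n; i += hn::Lanes(d)) {
    hn::StoreInterleaved2(hn::LoadU(d, x + i), hn::LoadU(d, y + i), d,
                          out + 2 * i);
  }
}
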
// ------------------------------ StoreInterleaved3 (CombineShiftRightBytes,
@ -526,8 +518,9 @@ template <typename T, size_t N, class V, HWY_IF_LANE_SIZE(T, 1),
HWY_API void StoreInterleaved3(const V v0, const V v1, const V v2,
Simd<T, N, 0> d, T* HWY_RESTRICT unaligned) {
const RebindToUnsigned<decltype(d)> du;
const auto k5 = Set(du, 5);
const auto k6 = Set(du, 6);
using TU = TFromD<decltype(du)>;
const auto k5 = Set(du, TU{5});
const auto k6 = Set(du, TU{6});
// Interleave (v0,v1,v2) to (MSB on left, lane 0 on right):
// v0[5], v2[4],v1[4],v0[4] .. v2[0],v1[0],v0[0]. We're expanding v0 lanes
@ -576,8 +569,8 @@ template <typename T, size_t N, class V, HWY_IF_LANE_SIZE(T, 2),
HWY_API void StoreInterleaved3(const V v0, const V v1, const V v2,
Simd<T, N, 0> d, T* HWY_RESTRICT unaligned) {
const Repartition<uint8_t, decltype(d)> du8;
const auto k2 = Set(du8, 2 * sizeof(T));
const auto k3 = Set(du8, 3 * sizeof(T));
const auto k2 = Set(du8, uint8_t{2 * sizeof(T)});
const auto k3 = Set(du8, uint8_t{3 * sizeof(T)});
// Interleave (v0,v1,v2) to (MSB on left, lane 0 on right):
// v1[2],v0[2], v2[1],v1[1],v0[1], v2[0],v1[0],v0[0]. 0x80 so lanes to be
@ -666,16 +659,15 @@ HWY_API void StoreInterleaved3(const V v0, const V v1, const V v2,
}
// 64-bit vector, 8-bit lanes
template <typename T, HWY_IF_LANE_SIZE(T, 1)>
HWY_API void StoreInterleaved3(const Vec64<T> part0, const Vec64<T> part1,
const Vec64<T> part2, Full64<T> d,
T* HWY_RESTRICT unaligned) {
template <class V, typename T, HWY_IF_LANE_SIZE(T, 1)>
HWY_API void StoreInterleaved3(const V part0, const V part1, const V part2,
Full64<T> d, T* HWY_RESTRICT unaligned) {
constexpr size_t N = 16 / sizeof(T);
// Use full vectors for the shuffles and first result.
const Full128<uint8_t> du;
const Full128<T> d_full;
const auto k5 = Set(du, 5);
const auto k6 = Set(du, 6);
const auto k5 = Set(du, uint8_t{5});
const auto k6 = Set(du, uint8_t{6});
const Vec128<T> v0{part0.raw};
const Vec128<T> v1{part1.raw};
@ -708,7 +700,7 @@ HWY_API void StoreInterleaved3(const Vec64<T> part0, const Vec64<T> part1,
const auto B0 = TableLookupBytesOr0(v0, shuf_B0);
const auto B1 = TableLookupBytesOr0(v1, shuf_B1);
const auto B2 = TableLookupBytesOr0(v2, shuf_B2);
const Vec64<T> B{(B0 | B1 | B2).raw};
const V B{(B0 | B1 | B2).raw};
StoreU(B, d, unaligned + 1 * N);
}
@ -720,8 +712,8 @@ HWY_API void StoreInterleaved3(const Vec64<T> part0, const Vec64<T> part1,
const Full128<T> d;
const Full128<uint8_t> du8;
constexpr size_t N = 16 / sizeof(T);
const auto k2 = Set(du8, 2 * sizeof(T));
const auto k3 = Set(du8, 3 * sizeof(T));
const auto k2 = Set(du8, uint8_t{2 * sizeof(T)});
const auto k3 = Set(du8, uint8_t{3 * sizeof(T)});
const Vec128<T> v0{part0.raw};
const Vec128<T> v1{part1.raw};
@ -975,7 +967,7 @@ HWY_API void StoreInterleaved4(const Vec128<T, N> part0,
// ------------------------------ AESRound
// Cannot implement on scalar: need at least 16 bytes for TableLookupBytes.
#if HWY_TARGET != HWY_SCALAR
#if HWY_TARGET != HWY_SCALAR || HWY_IDE
// Define for white-box testing, even if native instructions are available.
namespace detail {
@ -991,7 +983,7 @@ namespace detail {
template <class V> // u8
HWY_INLINE V SubBytes(V state) {
const DFromV<V> du;
const auto mask = Set(du, 0xF);
const auto mask = Set(du, uint8_t{0xF});
// Change polynomial basis to GF(2^4)
{
@ -1034,7 +1026,7 @@ HWY_INLINE V SubBytes(V state) {
0xFA, 0x35, 0x2B, 0x41, 0xD1, 0x90, 0x1E, 0x8E};
const auto affL = TableLookupBytesOr0(LoadDup128(du, kAffineL), outL);
const auto affU = TableLookupBytesOr0(LoadDup128(du, kAffineU), outU);
return Xor(Xor(affL, affU), Set(du, 0x63));
return Xor(Xor(affL, affU), Set(du, uint8_t{0x63}));
}
} // namespace detail
@ -1080,7 +1072,7 @@ HWY_API V MixColumns(const V state) {
1, 2, 3, 0, 5, 6, 7, 4, 9, 10, 11, 8, 13, 14, 15, 12};
const RebindToSigned<decltype(du)> di; // can only do signed comparisons
const auto msb = Lt(BitCast(di, state), Zero(di));
const auto overflow = BitCast(du, IfThenElseZero(msb, Set(di, 0x1B)));
const auto overflow = BitCast(du, IfThenElseZero(msb, Set(di, int8_t{0x1B})));
const auto d = Xor(Add(state, state), overflow); // = state*2 in GF(2^8).
const auto s2301 = TableLookupBytes(state, LoadDup128(du, k2301));
const auto d_s2301 = Xor(d, s2301);
@ -1200,7 +1192,7 @@ HWY_API V PopulationCount(V v) {
HWY_ALIGN constexpr uint8_t kLookup[16] = {
0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,
};
const auto lo = And(v, Set(d, 0xF));
const auto lo = And(v, Set(d, uint8_t{0xF}));
const auto hi = ShiftRight<4>(v);
const auto lookup = LoadDup128(d, kLookup);
return Add(TableLookupBytes(lookup, hi), TableLookupBytes(lookup, lo));
@ -1215,9 +1207,10 @@ HWY_API V PopulationCount(V v) {
static_assert(IsSame<TFromD<D>, uint8_t>(), "V must be u8");
const D d;
// See https://arxiv.org/pdf/1611.07612.pdf, Figure 3
v = Sub(v, And(ShiftRight<1>(v), Set(d, 0x55)));
v = Add(And(ShiftRight<2>(v), Set(d, 0x33)), And(v, Set(d, 0x33)));
return And(Add(v, ShiftRight<4>(v)), Set(d, 0x0F));
const V k33 = Set(d, uint8_t{0x33});
v = Sub(v, And(ShiftRight<1>(v), Set(d, uint8_t{0x55})));
v = Add(And(ShiftRight<2>(v), k33), And(v, k33));
return And(Add(v, ShiftRight<4>(v)), Set(d, uint8_t{0x0F}));
}
#endif // HWY_TARGET != HWY_RVV
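
The branch above is the classic SWAR population count from the cited paper: fold 1-bit counts into 2-bit counts (0x55), then into 4-bit counts (0x33), then add the two nibbles (0x0F). A scalar walk-through of the same constants on one byte, as a sketch (not part of the diff):

#include <stdint.h>

// Scalar equivalent of the u8 SWAR path above.
uint8_t PopCountByte(uint8_t v) {
  v = static_cast<uint8_t>(v - ((v >> 1) & 0x55));           // 2-bit counts
  v = static_cast<uint8_t>((v & 0x33) + ((v >> 2) & 0x33));  // 4-bit counts
  return static_cast<uint8_t>((v + (v >> 4)) & 0x0F);        // final count
}
// Example: v = 0b10110110 has 5 set bits, and PopCountByte returns 5.
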
@ -1227,7 +1220,7 @@ HWY_API V PopulationCount(V v) {
const D d;
const Repartition<uint8_t, decltype(d)> d8;
const auto vals = BitCast(d, PopulationCount(BitCast(d8, v)));
return Add(ShiftRight<8>(vals), And(vals, Set(d, 0xFF)));
return Add(ShiftRight<8>(vals), And(vals, Set(d, uint16_t{0xFF})));
}
template <typename V, class D = DFromV<V>, HWY_IF_LANE_SIZE_D(D, 4)>
@ -1236,7 +1229,7 @@ HWY_API V PopulationCount(V v) {
const D d;
Repartition<uint16_t, decltype(d)> d16;
auto vals = BitCast(d, PopulationCount(BitCast(d16, v)));
return Add(ShiftRight<16>(vals), And(vals, Set(d, 0xFF)));
return Add(ShiftRight<16>(vals), And(vals, Set(d, uint32_t{0xFF})));
}
#if HWY_HAVE_INTEGER64
@ -1246,7 +1239,7 @@ HWY_API V PopulationCount(V v) {
const D d;
Repartition<uint32_t, decltype(d)> d32;
auto vals = BitCast(d, PopulationCount(BitCast(d32, v)));
return Add(ShiftRight<32>(vals), And(vals, Set(d, 0xFF)));
return Add(ShiftRight<32>(vals), And(vals, Set(d, 0xFFULL)));
}
#endif

View File

@ -494,9 +494,11 @@ using VFromD = decltype(Set(D(), TFromD<D>()));
// ------------------------------ Zero
template <typename T, size_t N, int kPow2>
HWY_API VFromD<Simd<T, N, kPow2>> Zero(Simd<T, N, kPow2> d) {
return Set(d, T(0));
template <class D>
HWY_API VFromD<D> Zero(D d) {
// Cast to support bfloat16_t.
const RebindToUnsigned<decltype(d)> du;
return BitCast(d, Set(du, 0));
}
// ------------------------------ Undefined
@ -1109,6 +1111,9 @@ HWY_RVV_FOREACH_B(HWY_RVV_RETM_ARGMM, Or, or)
// ------------------------------ Xor
HWY_RVV_FOREACH_B(HWY_RVV_RETM_ARGMM, Xor, xor)
// ------------------------------ ExclusiveNeither
HWY_RVV_FOREACH_B(HWY_RVV_RETM_ARGMM, ExclusiveNeither, xnor)
#undef HWY_RVV_RETM_ARGMM
// ------------------------------ IfThenElse
@ -1219,14 +1224,19 @@ HWY_API V IfNegativeThenElse(V v, V yes, V no) {
// ------------------------------ FindFirstTrue
#define HWY_RVV_FIND_FIRST_TRUE(SEW, SHIFT, MLEN, NAME, OP) \
template <class D> \
HWY_API intptr_t FindFirstTrue(D d, HWY_RVV_M(MLEN) m) { \
static_assert(MLenFromD(d) == MLEN, "Type mismatch"); \
return vfirst_m_b##MLEN(m, Lanes(d)); \
#define HWY_RVV_FIND_FIRST_TRUE(SEW, SHIFT, MLEN, NAME, OP) \
template <class D> \
HWY_API intptr_t FindFirstTrue(D d, HWY_RVV_M(MLEN) m) { \
static_assert(MLenFromD(d) == MLEN, "Type mismatch"); \
return vfirst_m_b##MLEN(m, Lanes(d)); \
} \
template <class D> \
HWY_API size_t FindKnownFirstTrue(D d, HWY_RVV_M(MLEN) m) { \
static_assert(MLenFromD(d) == MLEN, "Type mismatch"); \
return static_cast<size_t>(vfirst_m_b##MLEN(m, Lanes(d))); \
}
HWY_RVV_FOREACH_B(HWY_RVV_FIND_FIRST_TRUE, _, _)
HWY_RVV_FOREACH_B(HWY_RVV_FIND_FIRST_TRUE, , _)
#undef HWY_RVV_FIND_FIRST_TRUE
// ------------------------------ AllFalse
@ -2642,9 +2652,10 @@ HWY_API V ShiftLeftLanes(const D d, const V v) {
using TI = TFromD<decltype(di)>;
const auto shifted = detail::SlideUp(v, v, kLanes);
// Match x86 semantics by zeroing lower lanes in 128-bit blocks
const auto idx_mod = detail::AndS(
detail::Iota0(di), static_cast<TI>(detail::LanesPerBlock(di) - 1));
const auto clear = detail::LtS(BitCast(di, idx_mod), static_cast<TI>(kLanes));
const auto idx_mod =
detail::AndS(BitCast(di, detail::Iota0(di)),
static_cast<TI>(detail::LanesPerBlock(di) - 1));
const auto clear = detail::LtS(idx_mod, static_cast<TI>(kLanes));
return IfThenZeroElse(clear, shifted);
}
@ -2681,9 +2692,8 @@ HWY_API V ShiftRightLanes(const Simd<T, N, kPow2> d, V v) {
// Match x86 semantics by zeroing upper lanes in 128-bit blocks
const size_t lpb = detail::LanesPerBlock(di);
const auto idx_mod =
detail::AndS(detail::Iota0(di), static_cast<TI>(lpb - 1));
const auto keep =
detail::LtS(BitCast(di, idx_mod), static_cast<TI>(lpb - kLanes));
detail::AndS(BitCast(di, detail::Iota0(di)), static_cast<TI>(lpb - 1));
const auto keep = detail::LtS(idx_mod, static_cast<TI>(lpb - kLanes));
return IfThenElseZero(keep, shifted);
}
@ -2827,12 +2837,14 @@ HWY_API V PopulationCount(V v) {
// ------------------------------ LoadDup128
template <class D, typename T = TFromD<D>>
HWY_API VFromD<D> LoadDup128(D d, const T* const HWY_RESTRICT p) {
const auto loaded = Load(d, p);
// Broadcast the first block
const auto idx = detail::AndS(detail::Iota0(d),
static_cast<T>(detail::LanesPerBlock(d) - 1));
template <class D>
HWY_API VFromD<D> LoadDup128(D d, const TFromD<D>* const HWY_RESTRICT p) {
const VFromD<D> loaded = Load(d, p);
// idx must be unsigned for TableLookupLanes.
using TU = MakeUnsigned<TFromD<D>>;
const TU mask = static_cast<TU>(detail::LanesPerBlock(d) - 1);
// Broadcast the first block.
const VFromD<RebindToUnsigned<D>> idx = detail::AndS(detail::Iota0(d), mask);
return TableLookupLanes(loaded, idx);
}
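
RVV has no dedicated 128-bit broadcast load, so LoadDup128 loads normally and builds TableLookupLanes indices from Iota masked down to the lanes-per-block, repeating the first block across the whole vector. A scalar model of that index computation (a sketch, not part of the diff; lanes_per_block is a power of two):

#include <stddef.h>
#include <stdint.h>

// Scalar model of the broadcast in LoadDup128: lane i reads from lane
// (i & (lanes_per_block - 1)) of the loaded vector, i.e. the first 128-bit
// block is repeated.
void BroadcastFirstBlock(const uint32_t* loaded, size_t num_lanes,
                         size_t lanes_per_block, uint32_t* out) {
  for (size_t i = 0; i < num_lanes; ++i) {
    out[i] = loaded[i & (lanes_per_block - 1)];  // idx = Iota0 & mask
  }
}
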
@ -3086,7 +3098,7 @@ HWY_INLINE V MulOdd(const V a, const V b) {
return OddEven(hi, detail::Slide1Down(lo));
}
// ------------------------------ ReorderDemote2To (OddEven)
// ------------------------------ ReorderDemote2To (OddEven, Combine)
template <size_t N, int kPow2>
HWY_API VFromD<Simd<uint16_t, N, kPow2>> ReorderDemote2To(
@ -3099,22 +3111,42 @@ HWY_API VFromD<Simd<uint16_t, N, kPow2>> ReorderDemote2To(
return BitCast(dbf16, OddEven(BitCast(du16, a), BitCast(du16, b_in_even)));
}
// If LMUL is not the max, Combine first to avoid another DemoteTo.
template <size_t N, int kPow2, hwy::EnableIf<(kPow2 < 3)>* = nullptr,
class D32 = RepartitionToWide<Simd<int16_t, N, kPow2>>>
HWY_API VFromD<Simd<int16_t, N, kPow2>> ReorderDemote2To(
Simd<int16_t, N, kPow2> d16, VFromD<D32> a, VFromD<D32> b) {
const Twice<D32> d32t;
const VFromD<decltype(d32t)> ab = Combine(d32t, a, b);
return DemoteTo(d16, ab);
}
// Max LMUL: must DemoteTo first, then Combine.
template <size_t N, class V32 = VFromD<RepartitionToWide<Simd<int16_t, N, 3>>>>
HWY_API VFromD<Simd<int16_t, N, 3>> ReorderDemote2To(Simd<int16_t, N, 3> d16,
V32 a, V32 b) {
const Half<decltype(d16)> d16h;
const VFromD<decltype(d16h)> a16 = DemoteTo(d16h, a);
const VFromD<decltype(d16h)> b16 = DemoteTo(d16h, b);
return Combine(d16, a16, b16);
}
// ------------------------------ ReorderWidenMulAccumulate (MulAdd, ZipLower)
template <class DF>
using DU16FromDF = RepartitionToNarrow<RebindToUnsigned<DF>>;
namespace detail {
template <size_t N, int kPow2>
HWY_API auto ReorderWidenMulAccumulate(Simd<float, N, kPow2> df32,
VFromD<DU16FromDF<decltype(df32)>> a,
VFromD<DU16FromDF<decltype(df32)>> b,
const VFromD<decltype(df32)> sum0,
VFromD<decltype(df32)>& sum1)
-> VFromD<decltype(df32)> {
const DU16FromDF<decltype(df32)> du16;
const RebindToUnsigned<decltype(df32)> du32;
// Non-overloaded wrapper function so we can define DF32 in template args.
template <
size_t N, int kPow2, class DF32 = Simd<float, N, kPow2>,
class VF32 = VFromD<DF32>,
class DU16 = RepartitionToNarrow<RebindToUnsigned<Simd<float, N, kPow2>>>>
HWY_API VF32 ReorderWidenMulAccumulateBF16(Simd<float, N, kPow2> df32,
VFromD<DU16> a, VFromD<DU16> b,
const VF32 sum0, VF32& sum1) {
const DU16 du16;
const RebindToUnsigned<DF32> du32;
using VU32 = VFromD<decltype(du32)>;
const VFromD<decltype(du16)> zero = Zero(du16);
const VFromD<DU16> zero = Zero(du16);
const VU32 a0 = ZipLower(du32, zero, BitCast(du16, a));
const VU32 a1 = ZipUpper(du32, zero, BitCast(du16, a));
const VU32 b0 = ZipLower(du32, zero, BitCast(du16, b));
@ -3123,10 +3155,68 @@ HWY_API auto ReorderWidenMulAccumulate(Simd<float, N, kPow2> df32,
return MulAdd(BitCast(df32, a0), BitCast(df32, b0), sum0);
}
#define HWY_RVV_WIDEN_MACC(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \
SHIFT, MLEN, NAME, OP) \
template <size_t N> \
HWY_API HWY_RVV_V(BASE, SEWD, LMULD) NAME( \
HWY_RVV_D(BASE, SEWD, N, SHIFT + 1) d, HWY_RVV_V(BASE, SEWD, LMULD) sum, \
HWY_RVV_V(BASE, SEW, LMUL) a, HWY_RVV_V(BASE, SEW, LMUL) b) { \
return OP##CHAR##SEWD##LMULD(sum, a, b, Lanes(d)); \
}
HWY_RVV_FOREACH_I16(HWY_RVV_WIDEN_MACC, WidenMulAcc, vwmacc_vv_, _EXT_VIRT)
#undef HWY_RVV_WIDEN_MACC
// If LMUL is not the max, we can WidenMul first (3 instructions).
template <size_t N, int kPow2, hwy::EnableIf<(kPow2 < 3)>* = nullptr,
class D32 = Simd<int32_t, N, kPow2>, class V32 = VFromD<D32>,
class D16 = RepartitionToNarrow<D32>>
HWY_API VFromD<D32> ReorderWidenMulAccumulateI16(Simd<int32_t, N, kPow2> d32,
VFromD<D16> a, VFromD<D16> b,
const V32 sum0, V32& sum1) {
const Twice<decltype(d32)> d32t;
using V32T = VFromD<decltype(d32t)>;
V32T sum = Combine(d32t, sum0, sum1);
sum = detail::WidenMulAcc(d32t, sum, a, b);
sum1 = UpperHalf(d32, sum);
return LowerHalf(d32, sum);
}
// Max LMUL: must LowerHalf first (4 instructions).
template <size_t N, class D32 = Simd<int32_t, N, 3>, class V32 = VFromD<D32>,
class D16 = RepartitionToNarrow<D32>>
HWY_API VFromD<D32> ReorderWidenMulAccumulateI16(Simd<int32_t, N, 3> d32,
VFromD<D16> a, VFromD<D16> b,
const V32 sum0, V32& sum1) {
const Half<D16> d16h;
using V16H = VFromD<decltype(d16h)>;
const V16H a0 = LowerHalf(d16h, a);
const V16H a1 = UpperHalf(d16h, a);
const V16H b0 = LowerHalf(d16h, b);
const V16H b1 = UpperHalf(d16h, b);
sum1 = detail::WidenMulAcc(d32, sum1, a1, b1);
return detail::WidenMulAcc(d32, sum0, a0, b0);
}
} // namespace detail
template <size_t N, int kPow2, class VN, class VW>
HWY_API VW ReorderWidenMulAccumulate(Simd<float, N, kPow2> d32, VN a, VN b,
const VW sum0, VW& sum1) {
return detail::ReorderWidenMulAccumulateBF16(d32, a, b, sum0, sum1);
}
template <size_t N, int kPow2, class VN, class VW>
HWY_API VW ReorderWidenMulAccumulate(Simd<int32_t, N, kPow2> d32, VN a, VN b,
const VW sum0, VW& sum1) {
return detail::ReorderWidenMulAccumulateI16(d32, a, b, sum0, sum1);
}
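// Usage sketch (illustrative, not part of the upstream sources): callers
// typically keep two accumulators because the lane order within sum0/sum1 is
// unspecified, and reduce them only after the loop. `pa`, `pb` and `num` are
// assumed caller-provided; `num` is assumed to be a multiple of Lanes(d16).
//
//   const ScalableTag<int32_t> d32;
//   const RepartitionToNarrow<decltype(d32)> d16;  // int16_t
//   auto sum0 = Zero(d32);
//   auto sum1 = Zero(d32);
//   for (size_t i = 0; i < num; i += Lanes(d16)) {
//     const auto a = LoadU(d16, pa + i);
//     const auto b = LoadU(d16, pb + i);
//     sum0 = ReorderWidenMulAccumulate(d32, a, b, sum0, sum1);
//   }
//   const int32_t dot = GetLane(SumOfLanes(d32, Add(sum0, sum1)));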
// ------------------------------ Lt128
template <class D>
HWY_INLINE MFromD<D> Lt128(D d, const VFromD<D> a, const VFromD<D> b) {
static_assert(!IsSigned<TFromD<D>>() && sizeof(TFromD<D>) == 8, "Use u64");
static_assert(!IsSigned<TFromD<D>>() && sizeof(TFromD<D>) == 8,
"D must be u64");
// Truth table of Eq and Compare for Hi and Lo u64.
// (removed lines with (=H && cH) or (=L && cL) - cannot both be true)
// =H =L cH cL | out = cH | (=H & cL)
@ -3152,7 +3242,8 @@ HWY_INLINE MFromD<D> Lt128(D d, const VFromD<D> a, const VFromD<D> b) {
// ------------------------------ Lt128Upper
template <class D>
HWY_INLINE MFromD<D> Lt128Upper(D d, const VFromD<D> a, const VFromD<D> b) {
static_assert(!IsSigned<TFromD<D>>() && sizeof(TFromD<D>) == 8, "Use u64");
static_assert(!IsSigned<TFromD<D>>() && sizeof(TFromD<D>) == 8,
"D must be u64");
const VFromD<D> ltHL = VecFromMask(d, Lt(a, b));
// Replicate H to its neighbor.
return MaskFromVec(OddEven(ltHL, detail::Slide1Down(ltHL)));
@ -3161,7 +3252,8 @@ HWY_INLINE MFromD<D> Lt128Upper(D d, const VFromD<D> a, const VFromD<D> b) {
// ------------------------------ Eq128
template <class D>
HWY_INLINE MFromD<D> Eq128(D d, const VFromD<D> a, const VFromD<D> b) {
static_assert(!IsSigned<TFromD<D>>() && sizeof(TFromD<D>) == 8, "Use u64");
static_assert(!IsSigned<TFromD<D>>() && sizeof(TFromD<D>) == 8,
"D must be u64");
const VFromD<D> eqHL = VecFromMask(d, Eq(a, b));
const VFromD<D> eqLH = Reverse2(d, eqHL);
return MaskFromVec(And(eqHL, eqLH));
@ -3170,12 +3262,33 @@ HWY_INLINE MFromD<D> Eq128(D d, const VFromD<D> a, const VFromD<D> b) {
// ------------------------------ Eq128Upper
template <class D>
HWY_INLINE MFromD<D> Eq128Upper(D d, const VFromD<D> a, const VFromD<D> b) {
static_assert(!IsSigned<TFromD<D>>() && sizeof(TFromD<D>) == 8, "Use u64");
static_assert(!IsSigned<TFromD<D>>() && sizeof(TFromD<D>) == 8,
"D must be u64");
const VFromD<D> eqHL = VecFromMask(d, Eq(a, b));
// Replicate H to its neighbor.
return MaskFromVec(OddEven(eqHL, detail::Slide1Down(eqHL)));
}
// ------------------------------ Ne128
template <class D>
HWY_INLINE MFromD<D> Ne128(D d, const VFromD<D> a, const VFromD<D> b) {
static_assert(!IsSigned<TFromD<D>>() && sizeof(TFromD<D>) == 8,
"D must be u64");
const VFromD<D> neHL = VecFromMask(d, Ne(a, b));
const VFromD<D> neLH = Reverse2(d, neHL);
return MaskFromVec(Or(neHL, neLH));
}
// ------------------------------ Ne128Upper
template <class D>
HWY_INLINE MFromD<D> Ne128Upper(D d, const VFromD<D> a, const VFromD<D> b) {
static_assert(!IsSigned<TFromD<D>>() && sizeof(TFromD<D>) == 8,
"D must be u64");
const VFromD<D> neHL = VecFromMask(d, Ne(a, b));
// Replicate H to its neighbor.
return MaskFromVec(OddEven(neHL, detail::Slide1Down(neHL)));
}
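// Note (assumption, added for clarity): the *128 ops view each aligned pair
// of u64 lanes as one 128-bit value, with the low half in the even lane and
// the high half in the odd lane. For a single pair, a = {1, 0} (value 1) and
// b = {0, 1} (value 2^64) give an all-true Lt128(d, a, b) and Ne128(d, a, b)
// for both lanes of the pair. The *Upper variants compare only the high
// (odd) lane and broadcast that verdict to its even neighbor.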
// ------------------------------ Min128, Max128 (Lt128)
template <class D>

@ -33,6 +33,9 @@ using Sisd = Simd<T, 1, 0>;
// (Wrapper class required for overloading comparison operators.)
template <typename T>
struct Vec1 {
using PrivateT = T; // only for DFromV
static constexpr size_t kPrivateN = 1; // only for DFromV
HWY_INLINE Vec1() = default;
Vec1(const Vec1&) = default;
Vec1& operator=(const Vec1&) = default;
@ -78,23 +81,11 @@ class Mask1 {
Raw bits;
};
namespace detail {
// Deduce Sisd<T> from Vec1<T>
struct Deduce1 {
template <typename T>
Sisd<T> operator()(Vec1<T>) const {
return Sisd<T>();
}
};
} // namespace detail
template <class V>
using DFromV = Simd<typename V::PrivateT, V::kPrivateN, 0>;
template <class V>
using DFromV = decltype(detail::Deduce1()(V()));
template <class V>
using TFromV = TFromD<DFromV<V>>;
using TFromV = typename V::PrivateT;
// ------------------------------ BitCast
@ -341,6 +332,12 @@ HWY_API Mask1<T> Xor(const Mask1<T> a, Mask1<T> b) {
return MaskFromVec(Xor(VecFromMask(d, a), VecFromMask(d, b)));
}
template <typename T>
HWY_API Mask1<T> ExclusiveNeither(const Mask1<T> a, Mask1<T> b) {
const Sisd<T> d;
return MaskFromVec(AndNot(VecFromMask(d, a), Not(VecFromMask(d, b))));
}
// ================================================== SHIFTS
// ------------------------------ ShiftLeft/ShiftRight (BroadcastSignBit)
@ -365,7 +362,7 @@ HWY_API Vec1<T> ShiftRight(const Vec1<T> v) {
// signed shifts are still implementation-defined.
using TU = hwy::MakeUnsigned<T>;
const Sisd<TU> du;
const TU shifted = BitCast(du, v).raw >> kBits;
const TU shifted = static_cast<TU>(BitCast(du, v).raw >> kBits);
const TU sign = BitCast(du, BroadcastSignBit(v)).raw;
const size_t sign_shift =
static_cast<size_t>(static_cast<int>(sizeof(TU)) * 8 - 1 - kBits);
@ -426,7 +423,7 @@ HWY_API Vec1<T> ShiftRightSame(const Vec1<T> v, int bits) {
// signed shifts are still implementation-defined.
using TU = hwy::MakeUnsigned<T>;
const Sisd<TU> du;
const TU shifted = BitCast(du, v).raw >> bits;
const TU shifted = static_cast<TU>(BitCast(du, v).raw >> bits);
const TU sign = BitCast(du, BroadcastSignBit(v)).raw;
const size_t sign_shift =
static_cast<size_t>(static_cast<int>(sizeof(TU)) * 8 - 1 - bits);
@ -557,16 +554,47 @@ HWY_API Vec1<uint16_t> AverageRound(const Vec1<uint16_t> a,
template <typename T>
HWY_API Vec1<T> Abs(const Vec1<T> a) {
const T i = a.raw;
return (i >= 0 || i == hwy::LimitsMin<T>()) ? a : Vec1<T>(-i);
return (i >= 0 || i == hwy::LimitsMin<T>()) ? a : Vec1<T>(static_cast<T>(-i));
}
HWY_API Vec1<float> Abs(const Vec1<float> a) {
return Vec1<float>(std::abs(a.raw));
return Vec1<float>(fabsf(a.raw));
}
HWY_API Vec1<double> Abs(const Vec1<double> a) {
return Vec1<double>(std::abs(a.raw));
return Vec1<double>(fabs(a.raw));
}
// ------------------------------ min/max
// ------------------------------ Min/Max
// <cmath> may be unavailable, so implement our own.
namespace detail {
static inline float Abs(float f) {
uint32_t i;
CopyBytes<4>(&f, &i);
i &= 0x7FFFFFFFu;
CopyBytes<4>(&i, &f);
return f;
}
static inline double Abs(double f) {
uint64_t i;
CopyBytes<8>(&f, &i);
i &= 0x7FFFFFFFFFFFFFFFull;
CopyBytes<8>(&i, &f);
return f;
}
static inline bool SignBit(float f) {
uint32_t i;
CopyBytes<4>(&f, &i);
return (i >> 31) != 0;
}
static inline bool SignBit(double f) {
uint64_t i;
CopyBytes<8>(&f, &i);
return (i >> 63) != 0;
}
} // namespace detail
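// Note (assumption, added for clarity): these helpers read the IEEE-754 bit
// pattern via CopyBytes rather than reinterpret_cast, which avoids
// strict-aliasing undefined behavior and removes the dependency on <cmath>.
// They also behave correctly for signed zero, e.g. detail::SignBit(-0.0f)
// returns true whereas the comparison (-0.0f < 0.0f) is false.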
template <typename T, HWY_IF_NOT_FLOAT(T)>
HWY_API Vec1<T> Min(const Vec1<T> a, const Vec1<T> b) {
@ -575,8 +603,8 @@ HWY_API Vec1<T> Min(const Vec1<T> a, const Vec1<T> b) {
template <typename T, HWY_IF_FLOAT(T)>
HWY_API Vec1<T> Min(const Vec1<T> a, const Vec1<T> b) {
if (std::isnan(a.raw)) return b;
if (std::isnan(b.raw)) return a;
if (isnan(a.raw)) return b;
if (isnan(b.raw)) return a;
return Vec1<T>(HWY_MIN(a.raw, b.raw));
}
@ -587,8 +615,8 @@ HWY_API Vec1<T> Max(const Vec1<T> a, const Vec1<T> b) {
template <typename T, HWY_IF_FLOAT(T)>
HWY_API Vec1<T> Max(const Vec1<T> a, const Vec1<T> b) {
if (std::isnan(a.raw)) return b;
if (std::isnan(b.raw)) return a;
if (isnan(a.raw)) return b;
if (isnan(b.raw)) return a;
return Vec1<T>(HWY_MAX(a.raw, b.raw));
}
@ -707,10 +735,10 @@ HWY_API Vec1<float> ApproximateReciprocalSqrt(const Vec1<float> v) {
// Square root
HWY_API Vec1<float> Sqrt(const Vec1<float> v) {
return Vec1<float>(std::sqrt(v.raw));
return Vec1<float>(sqrtf(v.raw));
}
HWY_API Vec1<double> Sqrt(const Vec1<double> v) {
return Vec1<double>(std::sqrt(v.raw));
return Vec1<double>(sqrt(v.raw));
}
// ------------------------------ Floating-point rounding
@ -725,7 +753,7 @@ HWY_API Vec1<T> Round(const Vec1<T> v) {
const TI rounded = static_cast<TI>(v.raw + bias);
if (rounded == 0) return CopySignToAbs(Vec1<T>(0), v);
// Round to even
if ((rounded & 1) && std::abs(static_cast<T>(rounded) - v.raw) == T(0.5)) {
if ((rounded & 1) && detail::Abs(static_cast<T>(rounded) - v.raw) == T(0.5)) {
return Vec1<T>(static_cast<T>(rounded - (v.raw < T(0) ? -1 : 1)));
}
return Vec1<T>(static_cast<T>(rounded));
@ -737,12 +765,12 @@ HWY_API Vec1<int32_t> NearestInt(const Vec1<float> v) {
using TI = int32_t;
const T abs = Abs(v).raw;
const bool signbit = std::signbit(v.raw);
const bool is_sign = detail::SignBit(v.raw);
if (!(abs < MantissaEnd<T>())) { // Huge or NaN
// Check if too large to cast or NaN
if (!(abs <= static_cast<T>(LimitsMax<TI>()))) {
return Vec1<TI>(signbit ? LimitsMin<TI>() : LimitsMax<TI>());
return Vec1<TI>(is_sign ? LimitsMin<TI>() : LimitsMax<TI>());
}
return Vec1<int32_t>(static_cast<TI>(v.raw));
}
@ -750,8 +778,8 @@ HWY_API Vec1<int32_t> NearestInt(const Vec1<float> v) {
const TI rounded = static_cast<TI>(v.raw + bias);
if (rounded == 0) return Vec1<int32_t>(0);
// Round to even
if ((rounded & 1) && std::abs(static_cast<T>(rounded) - v.raw) == T(0.5)) {
return Vec1<TI>(rounded - (signbit ? -1 : 1));
if ((rounded & 1) && detail::Abs(static_cast<T>(rounded) - v.raw) == T(0.5)) {
return Vec1<TI>(rounded - (is_sign ? -1 : 1));
}
return Vec1<TI>(rounded);
}
@ -1090,19 +1118,19 @@ HWY_API Vec1<ToT> PromoteTo(Sisd<ToT> /* tag */, Vec1<FromT> from) {
// so we overload for FromT=double and ToT={float,int32_t}.
HWY_API Vec1<float> DemoteTo(Sisd<float> /* tag */, Vec1<double> from) {
// Prevent ubsan errors when converting float to narrower integer/float
if (std::isinf(from.raw) ||
std::fabs(from.raw) > static_cast<double>(HighestValue<float>())) {
return Vec1<float>(std::signbit(from.raw) ? LowestValue<float>()
: HighestValue<float>());
if (isinf(from.raw) ||
fabs(from.raw) > static_cast<double>(HighestValue<float>())) {
return Vec1<float>(detail::SignBit(from.raw) ? LowestValue<float>()
: HighestValue<float>());
}
return Vec1<float>(static_cast<float>(from.raw));
}
HWY_API Vec1<int32_t> DemoteTo(Sisd<int32_t> /* tag */, Vec1<double> from) {
// Prevent ubsan errors when converting int32_t to narrower integer/int32_t
if (std::isinf(from.raw) ||
std::fabs(from.raw) > static_cast<double>(HighestValue<int32_t>())) {
return Vec1<int32_t>(std::signbit(from.raw) ? LowestValue<int32_t>()
: HighestValue<int32_t>());
if (isinf(from.raw) ||
fabs(from.raw) > static_cast<double>(HighestValue<int32_t>())) {
return Vec1<int32_t>(detail::SignBit(from.raw) ? LowestValue<int32_t>()
: HighestValue<int32_t>());
}
return Vec1<int32_t>(static_cast<int32_t>(from.raw));
}
@ -1196,10 +1224,9 @@ HWY_API Vec1<ToT> ConvertTo(Sisd<ToT> /* tag */, Vec1<FromT> from) {
// float## -> int##: return closest representable value. We cannot exactly
// represent LimitsMax<ToT> in FromT, so use double.
const double f = static_cast<double>(from.raw);
if (std::isinf(from.raw) ||
std::fabs(f) > static_cast<double>(LimitsMax<ToT>())) {
return Vec1<ToT>(std::signbit(from.raw) ? LimitsMin<ToT>()
: LimitsMax<ToT>());
if (isinf(from.raw) || fabs(f) > static_cast<double>(LimitsMax<ToT>())) {
return Vec1<ToT>(detail::SignBit(from.raw) ? LimitsMin<ToT>()
: LimitsMax<ToT>());
}
return Vec1<ToT>(static_cast<ToT>(from.raw));
}
@ -1468,6 +1495,11 @@ HWY_API intptr_t FindFirstTrue(Sisd<T> /* tag */, const Mask1<T> mask) {
return mask.bits == 0 ? -1 : 0;
}
template <typename T>
HWY_API size_t FindKnownFirstTrue(Sisd<T> /* tag */, const Mask1<T> /* m */) {
return 0; // There is only one lane and we know it is true.
}
// ------------------------------ Compress, CompressBits
template <typename T>
@ -1530,6 +1562,14 @@ HWY_API Vec1<float> ReorderWidenMulAccumulate(Sisd<float> /* tag */,
Vec1<float>(F32FromBF16(b.raw)), sum0);
}
HWY_API Vec1<int32_t> ReorderWidenMulAccumulate(Sisd<int32_t> /* tag */,
Vec1<int16_t> a,
Vec1<int16_t> b,
const Vec1<int32_t> sum0,
Vec1<int32_t>& /* sum1 */) {
return Vec1<int32_t>(a.raw * b.raw + sum0.raw);
}
// ================================================== REDUCTIONS
// Sum of all lanes, i.e. the only one.

@ -319,7 +319,7 @@
#define HWY_HAVE_FLOAT64 0
#define HWY_MEM_OPS_MIGHT_FAULT 1
#define HWY_NATIVE_FMA 0
#define HWY_CAP_GE256 0
#define HWY_CAP_GE256 1
#define HWY_CAP_GE512 0
#define HWY_NAMESPACE N_WASM_EMU256

@ -15,7 +15,17 @@
// Per-target definitions shared by ops/*.h and user code.
#include <cmath>
// We are covered by the highway.h include guard, but generic_ops-inl.h
// includes this again #if HWY_IDE.
#if defined(HIGHWAY_HWY_OPS_SHARED_TOGGLE) == \
defined(HWY_TARGET_TOGGLE)
#ifdef HIGHWAY_HWY_OPS_SHARED_TOGGLE
#undef HIGHWAY_HWY_OPS_SHARED_TOGGLE
#else
#define HIGHWAY_HWY_OPS_SHARED_TOGGLE
#endif
#include <math.h>
#include "hwy/base.h"
@ -218,6 +228,9 @@ using Half = typename D::Half;
template <class D>
using Twice = typename D::Twice;
template <typename T>
using Full16 = Simd<T, 2 / sizeof(T), 0>;
template <typename T>
using Full32 = Simd<T, 4 / sizeof(T), 0>;
@ -309,3 +322,5 @@ using VecArg = V;
} // namespace HWY_NAMESPACE
} // namespace hwy
HWY_AFTER_NAMESPACE();
#endif // HIGHWAY_HWY_OPS_SHARED_TOGGLE

@ -49,6 +49,11 @@ HWY_BEFORE_NAMESPACE();
namespace hwy {
namespace HWY_NAMESPACE {
#if HWY_TARGET == HWY_WASM_EMU256
template <typename T>
using Full256 = Simd<T, 32 / sizeof(T), 0>;
#endif
namespace detail {
template <typename T>
@ -67,6 +72,9 @@ class Vec128 {
using Raw = typename detail::Raw128<T>::type;
public:
using PrivateT = T; // only for DFromV
static constexpr size_t kPrivateN = N; // only for DFromV
// Compound assignment. Only usable if there is a corresponding non-member
// binary operator overload. For example, only f32 and f64 support division.
HWY_INLINE Vec128& operator*=(const Vec128 other) {
@ -100,29 +108,20 @@ using Vec64 = Vec128<T, 8 / sizeof(T)>;
template <typename T>
using Vec32 = Vec128<T, 4 / sizeof(T)>;
template <typename T>
using Vec16 = Vec128<T, 2 / sizeof(T)>;
// FF..FF or 0.
template <typename T, size_t N = 16 / sizeof(T)>
struct Mask128 {
typename detail::Raw128<T>::type raw;
};
namespace detail {
// Deduce Simd<T, N, 0> from Vec128<T, N>
struct DeduceD {
template <typename T, size_t N>
Simd<T, N, 0> operator()(Vec128<T, N>) const {
return Simd<T, N, 0>();
}
};
} // namespace detail
template <class V>
using DFromV = Simd<typename V::PrivateT, V::kPrivateN, 0>;
template <class V>
using DFromV = decltype(detail::DeduceD()(V()));
template <class V>
using TFromV = TFromD<DFromV<V>>;
using TFromV = typename V::PrivateT;
// ------------------------------ BitCast
@ -237,7 +236,7 @@ HWY_API Vec128<T, N> Undefined(Simd<T, N, 0> d) {
HWY_DIAGNOSTICS(pop)
// Returns a vector with lane i=[0, N) set to "first" + i.
template <typename T, size_t N, typename T2>
template <typename T, size_t N, typename T2, HWY_IF_LE128(T, N)>
Vec128<T, N> Iota(const Simd<T, N, 0> d, const T2 first) {
HWY_ALIGN T lanes[16 / sizeof(T)];
for (size_t i = 0; i < 16 / sizeof(T); ++i) {
@ -1219,7 +1218,7 @@ HWY_API Mask128<float, N> operator>=(const Vec128<float, N> a,
// ------------------------------ FirstN (Iota, Lt)
template <typename T, size_t N>
template <typename T, size_t N, HWY_IF_LE128(T, N)>
HWY_API Mask128<T, N> FirstN(const Simd<T, N, 0> d, size_t num) {
const RebindToSigned<decltype(d)> di; // Signed comparisons may be cheaper.
return RebindMask(d, Iota(di, 0) < Set(di, static_cast<MakeSigned<T>>(num)));
@ -1412,6 +1411,12 @@ HWY_API Mask128<T, N> Xor(const Mask128<T, N> a, Mask128<T, N> b) {
return MaskFromVec(Xor(VecFromMask(d, a), VecFromMask(d, b)));
}
template <typename T, size_t N>
HWY_API Mask128<T, N> ExclusiveNeither(const Mask128<T, N> a, Mask128<T, N> b) {
const Simd<T, N, 0> d;
return MaskFromVec(AndNot(VecFromMask(d, a), Not(VecFromMask(d, b))));
}
// ------------------------------ Shl (BroadcastSignBit, IfThenElse)
// The x86 multiply-by-Pow2() trick will not work because WASM saturates
@ -1568,7 +1573,7 @@ HWY_API Vec128<T, N> Load(Simd<T, N, 0> /* tag */, const T* HWY_RESTRICT p) {
}
// LoadU == Load.
template <typename T, size_t N>
template <typename T, size_t N, HWY_IF_LE128(T, N)>
HWY_API Vec128<T, N> LoadU(Simd<T, N, 0> d, const T* HWY_RESTRICT p) {
return Load(d, p);
}
@ -2516,7 +2521,7 @@ HWY_API Vec128<T> Shuffle0123(const Vec128<T> v) {
// ------------------------------ TableLookupLanes
// Returned by SetTableIndices for use by TableLookupLanes.
template <typename T, size_t N>
template <typename T, size_t N = 16 / sizeof(T)>
struct Indices128 {
__v128_u raw;
};
@ -2822,7 +2827,7 @@ HWY_API VFromD<DW> ZipUpper(DW dw, V a, V b) {
// ------------------------------ Combine (InterleaveLower)
// N = N/2 + N/2 (upper half undefined)
template <typename T, size_t N>
template <typename T, size_t N, HWY_IF_LE128(T, N)>
HWY_API Vec128<T, N> Combine(Simd<T, N, 0> d, Vec128<T, N / 2> hi_half,
Vec128<T, N / 2> lo_half) {
const Half<decltype(d)> d2;
@ -2836,7 +2841,7 @@ HWY_API Vec128<T, N> Combine(Simd<T, N, 0> d, Vec128<T, N / 2> hi_half,
// ------------------------------ ZeroExtendVector (Combine, IfThenElseZero)
template <typename T, size_t N>
template <typename T, size_t N, HWY_IF_LE128(T, N)>
HWY_API Vec128<T, N> ZeroExtendVector(Simd<T, N, 0> d, Vec128<T, N / 2> lo) {
return IfThenElseZero(FirstN(d, N / 2), Vec128<T, N>{lo.raw});
}
@ -3095,75 +3100,75 @@ HWY_API Vec128<T> ReverseBlocks(Full128<T> /* tag */, const Vec128<T> v) {
// ------------------------------ Promotions (part w/ narrow lanes -> full)
// Unsigned: zero-extend.
template <size_t N>
template <size_t N, HWY_IF_LE128(uint16_t, N)>
HWY_API Vec128<uint16_t, N> PromoteTo(Simd<uint16_t, N, 0> /* tag */,
const Vec128<uint8_t, N> v) {
return Vec128<uint16_t, N>{wasm_u16x8_extend_low_u8x16(v.raw)};
}
template <size_t N>
template <size_t N, HWY_IF_LE128(uint32_t, N)>
HWY_API Vec128<uint32_t, N> PromoteTo(Simd<uint32_t, N, 0> /* tag */,
const Vec128<uint8_t, N> v) {
return Vec128<uint32_t, N>{
wasm_u32x4_extend_low_u16x8(wasm_u16x8_extend_low_u8x16(v.raw))};
}
template <size_t N>
template <size_t N, HWY_IF_LE128(int16_t, N)>
HWY_API Vec128<int16_t, N> PromoteTo(Simd<int16_t, N, 0> /* tag */,
const Vec128<uint8_t, N> v) {
return Vec128<int16_t, N>{wasm_u16x8_extend_low_u8x16(v.raw)};
}
template <size_t N>
template <size_t N, HWY_IF_LE128(int32_t, N)>
HWY_API Vec128<int32_t, N> PromoteTo(Simd<int32_t, N, 0> /* tag */,
const Vec128<uint8_t, N> v) {
return Vec128<int32_t, N>{
wasm_u32x4_extend_low_u16x8(wasm_u16x8_extend_low_u8x16(v.raw))};
}
template <size_t N>
template <size_t N, HWY_IF_LE128(uint32_t, N)>
HWY_API Vec128<uint32_t, N> PromoteTo(Simd<uint32_t, N, 0> /* tag */,
const Vec128<uint16_t, N> v) {
return Vec128<uint32_t, N>{wasm_u32x4_extend_low_u16x8(v.raw)};
}
template <size_t N>
template <size_t N, HWY_IF_LE128(uint64_t, N)>
HWY_API Vec128<uint64_t, N> PromoteTo(Simd<uint64_t, N, 0> /* tag */,
const Vec128<uint32_t, N> v) {
return Vec128<uint64_t, N>{wasm_u64x2_extend_low_u32x4(v.raw)};
}
template <size_t N>
template <size_t N, HWY_IF_LE128(int32_t, N)>
HWY_API Vec128<int32_t, N> PromoteTo(Simd<int32_t, N, 0> /* tag */,
const Vec128<uint16_t, N> v) {
return Vec128<int32_t, N>{wasm_u32x4_extend_low_u16x8(v.raw)};
}
// Signed: replicate sign bit.
template <size_t N>
template <size_t N, HWY_IF_LE128(int16_t, N)>
HWY_API Vec128<int16_t, N> PromoteTo(Simd<int16_t, N, 0> /* tag */,
const Vec128<int8_t, N> v) {
return Vec128<int16_t, N>{wasm_i16x8_extend_low_i8x16(v.raw)};
}
template <size_t N>
template <size_t N, HWY_IF_LE128(int32_t, N)>
HWY_API Vec128<int32_t, N> PromoteTo(Simd<int32_t, N, 0> /* tag */,
const Vec128<int8_t, N> v) {
return Vec128<int32_t, N>{
wasm_i32x4_extend_low_i16x8(wasm_i16x8_extend_low_i8x16(v.raw))};
}
template <size_t N>
template <size_t N, HWY_IF_LE128(int32_t, N)>
HWY_API Vec128<int32_t, N> PromoteTo(Simd<int32_t, N, 0> /* tag */,
const Vec128<int16_t, N> v) {
return Vec128<int32_t, N>{wasm_i32x4_extend_low_i16x8(v.raw)};
}
template <size_t N>
template <size_t N, HWY_IF_LE128(int64_t, N)>
HWY_API Vec128<int64_t, N> PromoteTo(Simd<int64_t, N, 0> /* tag */,
const Vec128<int32_t, N> v) {
return Vec128<int64_t, N>{wasm_i64x2_extend_low_i32x4(v.raw)};
}
template <size_t N>
template <size_t N, HWY_IF_LE128(double, N)>
HWY_API Vec128<double, N> PromoteTo(Simd<double, N, 0> /* tag */,
const Vec128<int32_t, N> v) {
return Vec128<double, N>{wasm_f64x2_convert_low_i32x4(v.raw)};
}
template <size_t N>
template <size_t N, HWY_IF_LE128(float, N)>
HWY_API Vec128<float, N> PromoteTo(Simd<float, N, 0> df32,
const Vec128<float16_t, N> v) {
const RebindToSigned<decltype(df32)> di32;
@ -3184,7 +3189,7 @@ HWY_API Vec128<float, N> PromoteTo(Simd<float, N, 0> df32,
return BitCast(df32, ShiftLeft<31>(sign) | bits32);
}
template <size_t N>
template <size_t N, HWY_IF_LE128(float, N)>
HWY_API Vec128<float, N> PromoteTo(Simd<float, N, 0> df32,
const Vec128<bfloat16_t, N> v) {
const Rebind<uint16_t, decltype(df32)> du16;
@ -3285,7 +3290,33 @@ HWY_API Vec128<bfloat16_t, 2 * N> ReorderDemote2To(
const RebindToUnsigned<decltype(dbf16)> du16;
const Repartition<uint32_t, decltype(dbf16)> du32;
const Vec128<uint32_t, N> b_in_even = ShiftRight<16>(BitCast(du32, b));
return BitCast(dbf16, OddEven(BitCast(du16, a), BitCast(du16, b_in_even)));
const auto u16 = OddEven(BitCast(du16, a), BitCast(du16, b_in_even));
return BitCast(dbf16, u16);
}
// Specializations for partial vectors because i16x8_narrow_i32x4 sets lanes
// above 2*N.
HWY_API Vec128<int16_t, 2> ReorderDemote2To(Simd<int16_t, 2, 0> dn,
Vec128<int32_t, 1> a,
Vec128<int32_t, 1> b) {
const Half<decltype(dn)> dnh;
// Pretend the result has twice as many lanes so we can InterleaveLower.
const Vec128<int16_t, 2> an{DemoteTo(dnh, a).raw};
const Vec128<int16_t, 2> bn{DemoteTo(dnh, b).raw};
return InterleaveLower(an, bn);
}
HWY_API Vec128<int16_t, 4> ReorderDemote2To(Simd<int16_t, 4, 0> dn,
Vec128<int32_t, 2> a,
Vec128<int32_t, 2> b) {
const Half<decltype(dn)> dnh;
// Pretend the result has twice as many lanes so we can InterleaveLower.
const Vec128<int16_t, 4> an{DemoteTo(dnh, a).raw};
const Vec128<int16_t, 4> bn{DemoteTo(dnh, b).raw};
return InterleaveLower(an, bn);
}
HWY_API Vec128<int16_t> ReorderDemote2To(Full128<int16_t> /*d16*/,
Vec128<int32_t> a, Vec128<int32_t> b) {
return Vec128<int16_t>{wasm_i16x8_narrow_i32x4(a.raw, b.raw)};
}
// For already range-limited input [0, 255].
@ -3308,8 +3339,8 @@ HWY_API Vec128<To, 1> TruncateTo(Simd<To, 1, 0> /* tag */,
return Vec128<To, 1>{v1.raw};
}
HWY_API Vec128<uint8_t, 2> TruncateTo(Simd<uint8_t, 2, 0> /* tag */,
const Vec128<uint64_t> v) {
HWY_API Vec16<uint8_t> TruncateTo(Full16<uint8_t> /* tag */,
const Vec128<uint64_t> v) {
const Full128<uint8_t> d;
const auto v1 = BitCast(d, v);
const auto v2 = ConcatEven(d, v1, v1);
@ -3317,16 +3348,16 @@ HWY_API Vec128<uint8_t, 2> TruncateTo(Simd<uint8_t, 2, 0> /* tag */,
return LowerHalf(LowerHalf(LowerHalf(ConcatEven(d, v4, v4))));
}
HWY_API Vec128<uint16_t, 2> TruncateTo(Simd<uint16_t, 2, 0> /* tag */,
const Vec128<uint64_t> v) {
HWY_API Vec32<uint16_t> TruncateTo(Full32<uint16_t> /* tag */,
const Vec128<uint64_t> v) {
const Full128<uint16_t> d;
const auto v1 = BitCast(d, v);
const auto v2 = ConcatEven(d, v1, v1);
return LowerHalf(LowerHalf(ConcatEven(d, v2, v2)));
}
HWY_API Vec128<uint32_t, 2> TruncateTo(Simd<uint32_t, 2, 0> /* tag */,
const Vec128<uint64_t> v) {
HWY_API Vec64<uint32_t> TruncateTo(Full64<uint32_t> /* tag */,
const Vec128<uint64_t> v) {
const Full128<uint32_t> d;
const auto v1 = BitCast(d, v);
return LowerHalf(ConcatEven(d, v1, v1));
@ -3683,6 +3714,13 @@ HWY_API bool AllTrue(const Simd<T, N, 0> /* d */, const Mask128<T, N> m) {
return AllTrue(Full128<T>(), Mask128<T>{Or(mask, m).raw});
}
template <typename T, size_t N>
HWY_API size_t FindKnownFirstTrue(const Simd<T, N, 0> /* tag */,
const Mask128<T, N> mask) {
const uint64_t bits = detail::BitsFromMask(mask);
return Num0BitsBelowLS1Bit_Nonzero64(bits);
}
template <typename T, size_t N>
HWY_API intptr_t FindFirstTrue(const Simd<T, N, 0> /* tag */,
const Mask128<T, N> mask) {
@ -4102,7 +4140,11 @@ HWY_INLINE Vec128<T, N> CompressNot(Vec128<T, N> v, const uint64_t mask_bits) {
template <typename T>
struct CompressIsPartition {
#if HWY_TARGET == HWY_WASM_EMU256
enum { value = 0 };
#else
enum { value = 1 };
#endif
};
// Single lane: no-op
@ -4265,6 +4307,16 @@ HWY_API Vec128<float, N> ReorderWidenMulAccumulate(Simd<float, N, 0> df32,
return MulAdd(BitCast(df32, a0), BitCast(df32, b0), sum0);
}
// Even if N=1, the input is always at least 2 lanes, hence i32x4_dot_i16x8 is
// safe.
template <size_t N>
HWY_API Vec128<int32_t, N> ReorderWidenMulAccumulate(
Simd<int32_t, N, 0> /*d32*/, Vec128<int16_t, 2 * N> a,
Vec128<int16_t, 2 * N> b, const Vec128<int32_t, N> sum0,
Vec128<int32_t, N>& /*sum1*/) {
return sum0 + Vec128<int32_t, N>{wasm_i32x4_dot_i16x8(a.raw, b.raw)};
}
// ------------------------------ Reductions
namespace detail {
@ -4353,6 +4405,30 @@ HWY_INLINE Vec128<T> MaxOfLanes(hwy::SizeTag<8> /* tag */,
return Max(v10, v01);
}
template <size_t N, HWY_IF_GE32(uint16_t, N)>
HWY_API Vec128<uint16_t, N> SumOfLanes(hwy::SizeTag<2> /* tag */,
Vec128<uint16_t, N> v) {
const Simd<uint16_t, N, 0> d;
const RepartitionToWide<decltype(d)> d32;
const auto even = And(BitCast(d32, v), Set(d32, 0xFFFF));
const auto odd = ShiftRight<16>(BitCast(d32, v));
const auto sum = SumOfLanes(hwy::SizeTag<4>(), even + odd);
// Also broadcast into odd lanes.
return OddEven(BitCast(d, ShiftLeft<16>(sum)), BitCast(d, sum));
}
template <size_t N, HWY_IF_GE32(int16_t, N)>
HWY_API Vec128<int16_t, N> SumOfLanes(hwy::SizeTag<2> /* tag */,
Vec128<int16_t, N> v) {
const Simd<int16_t, N, 0> d;
const RepartitionToWide<decltype(d)> d32;
// Sign-extend
const auto even = ShiftRight<16>(ShiftLeft<16>(BitCast(d32, v)));
const auto odd = ShiftRight<16>(BitCast(d32, v));
const auto sum = SumOfLanes(hwy::SizeTag<4>(), even + odd);
// Also broadcast into odd lanes.
return OddEven(BitCast(d, ShiftLeft<16>(sum)), BitCast(d, sum));
}
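// Note (assumption, not from the upstream sources): lacking a native 16-bit
// horizontal add, the vector is reinterpreted as 32-bit lanes, the even and
// odd 16-bit halves are extracted (zero- or sign-extended), summed as 32-bit
// lanes via the existing SumOfLanes, and the total is then re-broadcast into
// both 16-bit halves of every lane. E.g. u16 lanes {1, 2, 3, 4} give
// even = {1, 3} and odd = {2, 4}; their sum {3, 7} reduces to 10, so every
// resulting u16 lane holds 10 (modulo 2^16, as usual for u16 arithmetic).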
template <size_t N, HWY_IF_GE32(uint16_t, N)>
HWY_API Vec128<uint16_t, N> MinOfLanes(hwy::SizeTag<2> /* tag */,
Vec128<uint16_t, N> v) {
@ -4422,7 +4498,7 @@ HWY_API Vec128<T, N> MaxOfLanes(Simd<T, N, 0> /* tag */, const Vec128<T, N> v) {
template <typename T, size_t N, HWY_IF_LE128(T, N)>
HWY_INLINE Mask128<T, N> Lt128(Simd<T, N, 0> d, Vec128<T, N> a,
Vec128<T, N> b) {
static_assert(!IsSigned<T>() && sizeof(T) == 8, "Use u64");
static_assert(!IsSigned<T>() && sizeof(T) == 8, "T must be u64");
// Truth table of Eq and Lt for Hi and Lo u64.
// (removed lines with (=H && cH) or (=L && cL) - cannot both be true)
// =H =L cH cL | out = cH | (=H & cL)
@ -4459,7 +4535,7 @@ HWY_INLINE Mask128<T, N> Lt128Upper(Simd<T, N, 0> d, Vec128<T, N> a,
template <typename T, size_t N, HWY_IF_LE128(T, N)>
HWY_INLINE Mask128<T, N> Eq128(Simd<T, N, 0> d, Vec128<T, N> a,
Vec128<T, N> b) {
static_assert(!IsSigned<T>() && sizeof(T) == 8, "Use u64");
static_assert(!IsSigned<T>() && sizeof(T) == 8, "T must be u64");
const Vec128<T, N> eqHL = VecFromMask(d, Eq(a, b));
return MaskFromVec(And(Reverse2(d, eqHL), eqHL));
}
@ -4471,6 +4547,23 @@ HWY_INLINE Mask128<T, N> Eq128Upper(Simd<T, N, 0> d, Vec128<T, N> a,
return MaskFromVec(InterleaveUpper(d, eqHL, eqHL));
}
// ------------------------------ Ne128
template <typename T, size_t N, HWY_IF_LE128(T, N)>
HWY_INLINE Mask128<T, N> Ne128(Simd<T, N, 0> d, Vec128<T, N> a,
Vec128<T, N> b) {
static_assert(!IsSigned<T>() && sizeof(T) == 8, "T must be u64");
const Vec128<T, N> neHL = VecFromMask(d, Ne(a, b));
return MaskFromVec(Or(Reverse2(d, neHL), neHL));
}
template <typename T, size_t N, HWY_IF_LE128(T, N)>
HWY_INLINE Mask128<T, N> Ne128Upper(Simd<T, N, 0> d, Vec128<T, N> a,
Vec128<T, N> b) {
const Vec128<T, N> neHL = VecFromMask(d, Ne(a, b));
return MaskFromVec(InterleaveUpper(d, neHL, neHL));
}
// ------------------------------ Min128, Max128 (Lt128)
// Without a native OddEven, it seems infeasible to go faster than Lt128.

File diff suppressed because it is too large

@ -21,7 +21,7 @@
#include "hwy/base.h"
// Avoid uninitialized warnings in GCC's emmintrin.h - see
// https://github.com/google/highway/issues/710 and pull/902)
// https://github.com/google/highway/issues/710 and pull/902
HWY_DIAGNOSTICS(push)
#if HWY_COMPILER_GCC_ACTUAL
HWY_DIAGNOSTICS_OFF(disable : 4701, ignored "-Wuninitialized")
@ -49,17 +49,6 @@ HWY_DIAGNOSTICS_OFF(disable : 4703 6001 26494, ignored "-Wmaybe-uninitialized")
HWY_BEFORE_NAMESPACE();
namespace hwy {
namespace HWY_NAMESPACE {
#if HWY_TARGET <= HWY_AVX2
template <typename T>
using Full256 = Simd<T, 32 / sizeof(T), 0>;
#endif
#if HWY_TARGET <= HWY_AVX3
template <typename T>
using Full512 = Simd<T, 64 / sizeof(T), 0>;
#endif
namespace detail {
template <typename T>
@ -82,6 +71,9 @@ class Vec128 {
using Raw = typename detail::Raw128<T>::type;
public:
using PrivateT = T; // only for DFromV
static constexpr size_t kPrivateN = N; // only for DFromV
// Compound assignment. Only usable if there is a corresponding non-member
// binary operator overload. For example, only f32 and f64 support division.
HWY_INLINE Vec128& operator*=(const Vec128 other) {
@ -117,10 +109,6 @@ using Vec32 = Vec128<T, 4 / sizeof(T)>;
#if HWY_TARGET <= HWY_AVX3
// Forward-declare for use by DeduceD, see below.
template <typename T>
class Vec512;
namespace detail {
// Template arg: sizeof(lane type)
@ -166,49 +154,11 @@ struct Mask128 {
#endif // HWY_TARGET <= HWY_AVX3
#if HWY_TARGET <= HWY_AVX2
// Forward-declare for use by DeduceD, see below.
template <typename T>
class Vec256;
#endif
namespace detail {
// Deduce Simd<T, N, 0> from Vec*<T, N> (pointers because Vec256/512 may be
// incomplete types at this point; this is simpler than avoiding multiple
// definitions of DFromV via #if)
struct DeduceD {
template <typename T, size_t N>
Simd<T, N, 0> operator()(const Vec128<T, N>*) const {
return Simd<T, N, 0>();
}
#if HWY_TARGET <= HWY_AVX2
template <typename T>
Full256<T> operator()(const hwy::HWY_NAMESPACE::Vec256<T>*) const {
return Full256<T>();
}
#endif
#if HWY_TARGET <= HWY_AVX3
template <typename T>
Full512<T> operator()(const hwy::HWY_NAMESPACE::Vec512<T>*) const {
return Full512<T>();
}
#endif
};
// Workaround for MSVC v19.14: alias with a dependent type fails to specialize.
template <class V>
struct ExpandDFromV {
using type = decltype(DeduceD()(static_cast<V*>(nullptr)));
};
} // namespace detail
template <class V>
using DFromV = Simd<typename V::PrivateT, V::kPrivateN, 0>;
template <class V>
using DFromV = typename detail::ExpandDFromV<V>::type;
template <class V>
using TFromV = TFromD<DFromV<V>>;
using TFromV = typename V::PrivateT;
// ------------------------------ BitCast
@ -983,6 +933,47 @@ HWY_INLINE Mask128<T, N> Xor(hwy::SizeTag<8> /*tag*/, const Mask128<T, N> a,
#endif
}
template <typename T, size_t N>
HWY_INLINE Mask128<T, N> ExclusiveNeither(hwy::SizeTag<1> /*tag*/,
const Mask128<T, N> a,
const Mask128<T, N> b) {
#if HWY_COMPILER_HAS_MASK_INTRINSICS
return Mask128<T, N>{_kxnor_mask16(a.raw, b.raw)};
#else
return Mask128<T, N>{static_cast<__mmask16>(~(a.raw ^ b.raw) & 0xFFFF)};
#endif
}
template <typename T, size_t N>
HWY_INLINE Mask128<T, N> ExclusiveNeither(hwy::SizeTag<2> /*tag*/,
const Mask128<T, N> a,
const Mask128<T, N> b) {
#if HWY_COMPILER_HAS_MASK_INTRINSICS
return Mask128<T, N>{_kxnor_mask8(a.raw, b.raw)};
#else
return Mask128<T, N>{static_cast<__mmask8>(~(a.raw ^ b.raw) & 0xFF)};
#endif
}
template <typename T, size_t N>
HWY_INLINE Mask128<T, N> ExclusiveNeither(hwy::SizeTag<4> /*tag*/,
const Mask128<T, N> a,
const Mask128<T, N> b) {
#if HWY_COMPILER_HAS_MASK_INTRINSICS
return Mask128<T, N>{static_cast<__mmask8>(_kxnor_mask8(a.raw, b.raw) & 0xF)};
#else
return Mask128<T, N>{static_cast<__mmask8>(~(a.raw ^ b.raw) & 0xF)};
#endif
}
template <typename T, size_t N>
HWY_INLINE Mask128<T, N> ExclusiveNeither(hwy::SizeTag<8> /*tag*/,
const Mask128<T, N> a,
const Mask128<T, N> b) {
#if HWY_COMPILER_HAS_MASK_INTRINSICS
return Mask128<T, N>{static_cast<__mmask8>(_kxnor_mask8(a.raw, b.raw) & 0x3)};
#else
return Mask128<T, N>{static_cast<__mmask8>(~(a.raw ^ b.raw) & 0x3)};
#endif
}
} // namespace detail
template <typename T, size_t N>
@ -1012,6 +1003,11 @@ HWY_API Mask128<T, N> Not(const Mask128<T, N> m) {
return Xor(m, Mask128<T, N>::FromBits((1ull << N) - 1));
}
template <typename T, size_t N>
HWY_API Mask128<T, N> ExclusiveNeither(const Mask128<T, N> a, Mask128<T, N> b) {
return detail::ExclusiveNeither(hwy::SizeTag<sizeof(T)>(), a, b);
}
#else // AVX2 or below
// ------------------------------ Mask
@ -1109,6 +1105,12 @@ HWY_API Mask128<T, N> Xor(const Mask128<T, N> a, Mask128<T, N> b) {
return MaskFromVec(Xor(VecFromMask(d, a), VecFromMask(d, b)));
}
template <typename T, size_t N>
HWY_API Mask128<T, N> ExclusiveNeither(const Mask128<T, N> a, Mask128<T, N> b) {
const Simd<T, N, 0> d;
return MaskFromVec(AndNot(VecFromMask(d, a), Not(VecFromMask(d, b))));
}
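// Note on ExclusiveNeither (assumption, added for clarity): callers are
// expected to guarantee that a and b are never both true in the same lane.
// Under that precondition the AVX-512 kxnor form ~(a ^ b) and this generic
// form AndNot(a, Not(b)) == ~a & ~b agree: the result is true exactly where
// neither input is true.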
#endif // HWY_TARGET <= HWY_AVX3
// ------------------------------ ShiftLeft
@ -5170,26 +5172,33 @@ HWY_INLINE Vec128<uint64_t> MulOdd(const Vec128<uint64_t> a,
// ------------------------------ ReorderWidenMulAccumulate (MulAdd, ZipLower)
template <size_t N>
HWY_API Vec128<float, N> ReorderWidenMulAccumulate(Simd<float, N, 0> df32,
Vec128<bfloat16_t, 2 * N> a,
Vec128<bfloat16_t, 2 * N> b,
const Vec128<float, N> sum0,
Vec128<float, N>& sum1) {
template <class V, size_t N, class D16 = Simd<bfloat16_t, 2 * N, 0>>
HWY_API V ReorderWidenMulAccumulate(Simd<float, N, 0> df32, VFromD<D16> a,
VFromD<D16> b, const V sum0, V& sum1) {
// TODO(janwas): _mm_dpbf16_ps when available
const Repartition<uint16_t, decltype(df32)> du16;
const RebindToUnsigned<decltype(df32)> du32;
const Vec128<uint16_t, 2 * N> zero = Zero(du16);
const auto zero = Zero(du16);
// Lane order within sum0/1 is undefined, hence we can avoid the
// longer-latency lane-crossing PromoteTo.
const Vec128<uint32_t, N> a0 = ZipLower(du32, zero, BitCast(du16, a));
const Vec128<uint32_t, N> a1 = ZipUpper(du32, zero, BitCast(du16, a));
const Vec128<uint32_t, N> b0 = ZipLower(du32, zero, BitCast(du16, b));
const Vec128<uint32_t, N> b1 = ZipUpper(du32, zero, BitCast(du16, b));
using VU32 = VFromD<RebindToUnsigned<decltype(df32)>>;
const VU32 a0 = ZipLower(du32, zero, BitCast(du16, a));
const VU32 a1 = ZipUpper(du32, zero, BitCast(du16, a));
const VU32 b0 = ZipLower(du32, zero, BitCast(du16, b));
const VU32 b1 = ZipUpper(du32, zero, BitCast(du16, b));
sum1 = MulAdd(BitCast(df32, a1), BitCast(df32, b1), sum1);
return MulAdd(BitCast(df32, a0), BitCast(df32, b0), sum0);
}
// Even if N=1, the input is always at least 2 lanes, hence madd_epi16 is safe.
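// (Note, assumption: _mm_madd_epi16 multiplies adjacent pairs of i16 lanes
// and adds each pair of i32 products, so the pairwise sums already fit in
// one accumulator; sum1 is therefore left untouched and callers that add
// sum0 + sum1 afterwards still obtain the correct total.)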
template <size_t N>
HWY_API Vec128<int32_t, N> ReorderWidenMulAccumulate(
Simd<int32_t, N, 0> /*d32*/, Vec128<int16_t, 2 * N> a,
Vec128<int16_t, 2 * N> b, const Vec128<int32_t, N> sum0,
Vec128<int32_t, N>& /*sum1*/) {
return sum0 + Vec128<int32_t, N>{_mm_madd_epi16(a.raw, b.raw)};
}
// ================================================== CONVERT
// ------------------------------ Promotions (part w/ narrow lanes -> full)
@ -5461,6 +5470,30 @@ HWY_API Vec128<bfloat16_t, 2 * N> ReorderDemote2To(
return BitCast(dbf16, OddEven(BitCast(du16, a), BitCast(du16, b_in_even)));
}
// Specializations for partial vectors because packs_epi32 sets lanes above 2*N.
HWY_API Vec128<int16_t, 2> ReorderDemote2To(Simd<int16_t, 2, 0> dn,
Vec128<int32_t, 1> a,
Vec128<int32_t, 1> b) {
const Half<decltype(dn)> dnh;
// Pretend the result has twice as many lanes so we can InterleaveLower.
const Vec128<int16_t, 2> an{DemoteTo(dnh, a).raw};
const Vec128<int16_t, 2> bn{DemoteTo(dnh, b).raw};
return InterleaveLower(an, bn);
}
HWY_API Vec128<int16_t, 4> ReorderDemote2To(Simd<int16_t, 4, 0> dn,
Vec128<int32_t, 2> a,
Vec128<int32_t, 2> b) {
const Half<decltype(dn)> dnh;
// Pretend the result has twice as many lanes so we can InterleaveLower.
const Vec128<int16_t, 4> an{DemoteTo(dnh, a).raw};
const Vec128<int16_t, 4> bn{DemoteTo(dnh, b).raw};
return InterleaveLower(an, bn);
}
HWY_API Vec128<int16_t> ReorderDemote2To(Full128<int16_t> /*d16*/,
Vec128<int32_t> a, Vec128<int32_t> b) {
return Vec128<int16_t>{_mm_packs_epi32(a.raw, b.raw)};
}
template <size_t N>
HWY_API Vec128<float, N> DemoteTo(Simd<float, N, 0> /* tag */,
const Vec128<double, N> v) {
@ -6035,6 +6068,13 @@ HWY_API size_t CountTrue(const Simd<T, N, 0> /* tag */,
return PopCount(mask_bits);
}
template <typename T, size_t N>
HWY_API size_t FindKnownFirstTrue(const Simd<T, N, 0> /* tag */,
const Mask128<T, N> mask) {
const uint32_t mask_bits = static_cast<uint32_t>(mask.raw) & ((1u << N) - 1);
return Num0BitsBelowLS1Bit_Nonzero32(mask_bits);
}
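// Usage note (assumption, not from the upstream sources): FindKnownFirstTrue
// requires at least one true lane and skips the emptiness check performed by
// FindFirstTrue, e.g.:
//
//   if (!AllFalse(d, m)) {
//     const size_t first = FindKnownFirstTrue(d, m);
//     // ... use `first` ...
//   }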
template <typename T, size_t N>
HWY_API intptr_t FindFirstTrue(const Simd<T, N, 0> /* tag */,
const Mask128<T, N> mask) {
@ -6500,6 +6540,13 @@ HWY_API size_t CountTrue(const Simd<T, N, 0> /* tag */,
return PopCount(detail::BitsFromMask(mask));
}
template <typename T, size_t N>
HWY_API size_t FindKnownFirstTrue(const Simd<T, N, 0> /* tag */,
const Mask128<T, N> mask) {
const uint64_t mask_bits = detail::BitsFromMask(mask);
return Num0BitsBelowLS1Bit_Nonzero64(mask_bits);
}
template <typename T, size_t N>
HWY_API intptr_t FindFirstTrue(const Simd<T, N, 0> /* tag */,
const Mask128<T, N> mask) {
@ -7161,6 +7208,30 @@ HWY_INLINE Vec128<T> MaxOfLanes(hwy::SizeTag<8> /* tag */,
return Max(v10, v01);
}
template <size_t N, HWY_IF_GE32(uint16_t, N)>
HWY_API Vec128<uint16_t, N> SumOfLanes(hwy::SizeTag<2> /* tag */,
Vec128<uint16_t, N> v) {
const Simd<uint16_t, N, 0> d;
const RepartitionToWide<decltype(d)> d32;
const auto even = And(BitCast(d32, v), Set(d32, 0xFFFF));
const auto odd = ShiftRight<16>(BitCast(d32, v));
const auto sum = SumOfLanes(hwy::SizeTag<4>(), even + odd);
// Also broadcast into odd lanes.
return OddEven(BitCast(d, ShiftLeft<16>(sum)), BitCast(d, sum));
}
template <size_t N, HWY_IF_GE32(int16_t, N)>
HWY_API Vec128<int16_t, N> SumOfLanes(hwy::SizeTag<2> /* tag */,
Vec128<int16_t, N> v) {
const Simd<int16_t, N, 0> d;
const RepartitionToWide<decltype(d)> d32;
// Sign-extend
const auto even = ShiftRight<16>(ShiftLeft<16>(BitCast(d32, v)));
const auto odd = ShiftRight<16>(BitCast(d32, v));
const auto sum = SumOfLanes(hwy::SizeTag<4>(), even + odd);
// Also broadcast into odd lanes.
return OddEven(BitCast(d, ShiftLeft<16>(sum)), BitCast(d, sum));
}
template <size_t N, HWY_IF_GE32(uint16_t, N)>
HWY_API Vec128<uint16_t, N> MinOfLanes(hwy::SizeTag<2> /* tag */,
Vec128<uint16_t, N> v) {
@ -7232,7 +7303,8 @@ namespace detail {
// Returns vector-mask for Lt128. Also used by x86_256/x86_512.
template <class D, class V = VFromD<D>>
HWY_INLINE V Lt128Vec(const D d, const V a, const V b) {
static_assert(!IsSigned<TFromD<D>>() && sizeof(TFromD<D>) == 8, "Use u64");
static_assert(!IsSigned<TFromD<D>>() && sizeof(TFromD<D>) == 8,
"D must be u64");
// Truth table of Eq and Lt for Hi and Lo u64.
// (removed lines with (=H && cH) or (=L && cL) - cannot both be true)
// =H =L cH cL | out = cH | (=H & cL)
@ -7256,12 +7328,22 @@ HWY_INLINE V Lt128Vec(const D d, const V a, const V b) {
// Returns vector-mask for Eq128. Also used by x86_256/x86_512.
template <class D, class V = VFromD<D>>
HWY_INLINE V Eq128Vec(const D d, const V a, const V b) {
static_assert(!IsSigned<TFromD<D>>() && sizeof(TFromD<D>) == 8, "Use u64");
static_assert(!IsSigned<TFromD<D>>() && sizeof(TFromD<D>) == 8,
"D must be u64");
const auto eqHL = VecFromMask(d, Eq(a, b));
const auto eqLH = Reverse2(d, eqHL);
return And(eqHL, eqLH);
}
template <class D, class V = VFromD<D>>
HWY_INLINE V Ne128Vec(const D d, const V a, const V b) {
static_assert(!IsSigned<TFromD<D>>() && sizeof(TFromD<D>) == 8,
"D must be u64");
const auto neHL = VecFromMask(d, Ne(a, b));
const auto neLH = Reverse2(d, neHL);
return Or(neHL, neLH);
}
template <class D, class V = VFromD<D>>
HWY_INLINE V Lt128UpperVec(const D d, const V a, const V b) {
// No specialization required for AVX-512: Mask <-> Vec is fast, and
@ -7278,6 +7360,14 @@ HWY_INLINE V Eq128UpperVec(const D d, const V a, const V b) {
return InterleaveUpper(d, eqHL, eqHL);
}
template <class D, class V = VFromD<D>>
HWY_INLINE V Ne128UpperVec(const D d, const V a, const V b) {
// No specialization required for AVX-512: Mask <-> Vec is fast, and
// copying mask bits to their neighbor seems infeasible.
const V neHL = VecFromMask(d, Ne(a, b));
return InterleaveUpper(d, neHL, neHL);
}
} // namespace detail
template <class D, class V = VFromD<D>>
@ -7290,6 +7380,11 @@ HWY_API MFromD<D> Eq128(D d, const V a, const V b) {
return MaskFromVec(detail::Eq128Vec(d, a, b));
}
template <class D, class V = VFromD<D>>
HWY_API MFromD<D> Ne128(D d, const V a, const V b) {
return MaskFromVec(detail::Ne128Vec(d, a, b));
}
template <class D, class V = VFromD<D>>
HWY_API MFromD<D> Lt128Upper(D d, const V a, const V b) {
return MaskFromVec(detail::Lt128UpperVec(d, a, b));
@ -7300,6 +7395,11 @@ HWY_API MFromD<D> Eq128Upper(D d, const V a, const V b) {
return MaskFromVec(detail::Eq128UpperVec(d, a, b));
}
template <class D, class V = VFromD<D>>
HWY_API MFromD<D> Ne128Upper(D d, const V a, const V b) {
return MaskFromVec(detail::Ne128UpperVec(d, a, b));
}
// ------------------------------ Min128, Max128 (Lt128)
// Avoids the extra MaskFromVec in Lt128.

@ -83,6 +83,9 @@ class Vec256 {
using Raw = typename detail::Raw256<T>::type;
public:
using PrivateT = T; // only for DFromV
static constexpr size_t kPrivateN = 32 / sizeof(T); // only for DFromV
// Compound assignment. Only usable if there is a corresponding non-member
// binary operator overload. For example, only f32 and f64 support division.
HWY_INLINE Vec256& operator*=(const Vec256 other) {
@ -157,6 +160,9 @@ struct Mask256 {
#endif // HWY_TARGET <= HWY_AVX3
template <typename T>
using Full256 = Simd<T, 32 / sizeof(T), 0>;
// ------------------------------ BitCast
namespace detail {
@ -764,6 +770,43 @@ HWY_INLINE Mask256<T> Xor(hwy::SizeTag<8> /*tag*/, const Mask256<T> a,
#endif
}
template <typename T>
HWY_INLINE Mask256<T> ExclusiveNeither(hwy::SizeTag<1> /*tag*/,
const Mask256<T> a, const Mask256<T> b) {
#if HWY_COMPILER_HAS_MASK_INTRINSICS
return Mask256<T>{_kxnor_mask32(a.raw, b.raw)};
#else
return Mask256<T>{static_cast<__mmask32>(~(a.raw ^ b.raw) & 0xFFFFFFFF)};
#endif
}
template <typename T>
HWY_INLINE Mask256<T> ExclusiveNeither(hwy::SizeTag<2> /*tag*/,
const Mask256<T> a, const Mask256<T> b) {
#if HWY_COMPILER_HAS_MASK_INTRINSICS
return Mask256<T>{_kxnor_mask16(a.raw, b.raw)};
#else
return Mask256<T>{static_cast<__mmask16>(~(a.raw ^ b.raw) & 0xFFFF)};
#endif
}
template <typename T>
HWY_INLINE Mask256<T> ExclusiveNeither(hwy::SizeTag<4> /*tag*/,
const Mask256<T> a, const Mask256<T> b) {
#if HWY_COMPILER_HAS_MASK_INTRINSICS
return Mask256<T>{_kxnor_mask8(a.raw, b.raw)};
#else
return Mask256<T>{static_cast<__mmask8>(~(a.raw ^ b.raw) & 0xFF)};
#endif
}
template <typename T>
HWY_INLINE Mask256<T> ExclusiveNeither(hwy::SizeTag<8> /*tag*/,
const Mask256<T> a, const Mask256<T> b) {
#if HWY_COMPILER_HAS_MASK_INTRINSICS
return Mask256<T>{static_cast<__mmask8>(_kxnor_mask8(a.raw, b.raw) & 0xF)};
#else
return Mask256<T>{static_cast<__mmask8>(~(a.raw ^ b.raw) & 0xF)};
#endif
}
} // namespace detail
template <typename T>
@ -793,6 +836,11 @@ HWY_API Mask256<T> Not(const Mask256<T> m) {
return Xor(m, Mask256<T>::FromBits((1ull << N) - 1));
}
template <typename T>
HWY_API Mask256<T> ExclusiveNeither(const Mask256<T> a, Mask256<T> b) {
return detail::ExclusiveNeither(hwy::SizeTag<sizeof(T)>(), a, b);
}
#else // AVX2
// ------------------------------ Mask
@ -883,6 +931,12 @@ HWY_API Mask256<T> Xor(const Mask256<T> a, Mask256<T> b) {
return MaskFromVec(Xor(VecFromMask(d, a), VecFromMask(d, b)));
}
template <typename T>
HWY_API Mask256<T> ExclusiveNeither(const Mask256<T> a, Mask256<T> b) {
const Full256<T> d;
return MaskFromVec(AndNot(VecFromMask(d, a), Not(VecFromMask(d, b))));
}
#endif // HWY_TARGET <= HWY_AVX3
// ================================================== COMPARE
@ -2866,6 +2920,7 @@ HWY_API Vec256<float> Shuffle2301(const Vec256<float> v) {
return Vec256<float>{_mm256_shuffle_ps(v.raw, v.raw, 0xB1)};
}
// Used by generic_ops-inl.h
namespace detail {
template <typename T, HWY_IF_LANE_SIZE(T, 4)>
@ -3694,7 +3749,7 @@ HWY_API Vec256<TI> TableLookupBytes(const Vec128<T, N> bytes,
namespace detail {
#if HWY_TARGET > HWY_AVX3 // AVX2 or older
#if HWY_TARGET > HWY_AVX3 && !HWY_IDE // AVX2 or older
// Returns 2^v for use as per-lane multipliers to emulate 16-bit shifts.
template <typename T>
@ -3721,7 +3776,7 @@ HWY_INLINE Vec256<MakeUnsigned<T>> Pow2(const Vec256<T> v) {
HWY_INLINE Vec256<uint16_t> Shl(hwy::UnsignedTag /*tag*/, Vec256<uint16_t> v,
Vec256<uint16_t> bits) {
#if HWY_TARGET <= HWY_AVX3
#if HWY_TARGET <= HWY_AVX3 || HWY_IDE
return Vec256<uint16_t>{_mm256_sllv_epi16(v.raw, bits.raw)};
#else
return v * Pow2(bits);
@ -3757,7 +3812,7 @@ HWY_API Vec256<T> operator<<(Vec256<T> v, Vec256<T> bits) {
// ------------------------------ Shr (MulHigh, IfThenElse, Not)
HWY_API Vec256<uint16_t> operator>>(Vec256<uint16_t> v, Vec256<uint16_t> bits) {
#if HWY_TARGET <= HWY_AVX3
#if HWY_TARGET <= HWY_AVX3 || HWY_IDE
return Vec256<uint16_t>{_mm256_srlv_epi16(v.raw, bits.raw)};
#else
Full256<uint16_t> d;
@ -3798,7 +3853,7 @@ HWY_API Vec256<int64_t> operator>>(Vec256<int64_t> v, Vec256<int64_t> bits) {
HWY_INLINE Vec256<uint64_t> MulEven(const Vec256<uint64_t> a,
const Vec256<uint64_t> b) {
const DFromV<decltype(a)> du64;
const Full256<uint64_t> du64;
const RepartitionToNarrow<decltype(du64)> du32;
const auto maskL = Set(du64, 0xFFFFFFFFULL);
const auto a32 = BitCast(du32, a);
@ -3827,7 +3882,7 @@ HWY_INLINE Vec256<uint64_t> MulEven(const Vec256<uint64_t> a,
HWY_INLINE Vec256<uint64_t> MulOdd(const Vec256<uint64_t> a,
const Vec256<uint64_t> b) {
const DFromV<decltype(a)> du64;
const Full256<uint64_t> du64;
const RepartitionToNarrow<decltype(du64)> du32;
const auto maskL = Set(du64, 0xFFFFFFFFULL);
const auto a32 = BitCast(du32, a);
@ -3852,25 +3907,13 @@ HWY_INLINE Vec256<uint64_t> MulOdd(const Vec256<uint64_t> a,
return InterleaveUpper(du64, mulL, mulH);
}
// ------------------------------ ReorderWidenMulAccumulate (MulAdd, ZipLower)
HWY_API Vec256<float> ReorderWidenMulAccumulate(Full256<float> df32,
Vec256<bfloat16_t> a,
Vec256<bfloat16_t> b,
const Vec256<float> sum0,
Vec256<float>& sum1) {
// TODO(janwas): _mm256_dpbf16_ps when available
const Repartition<uint16_t, decltype(df32)> du16;
const RebindToUnsigned<decltype(df32)> du32;
const Vec256<uint16_t> zero = Zero(du16);
// Lane order within sum0/1 is undefined, hence we can avoid the
// longer-latency lane-crossing PromoteTo.
const Vec256<uint32_t> a0 = ZipLower(du32, zero, BitCast(du16, a));
const Vec256<uint32_t> a1 = ZipUpper(du32, zero, BitCast(du16, a));
const Vec256<uint32_t> b0 = ZipLower(du32, zero, BitCast(du16, b));
const Vec256<uint32_t> b1 = ZipUpper(du32, zero, BitCast(du16, b));
sum1 = MulAdd(BitCast(df32, a1), BitCast(df32, b1), sum1);
return MulAdd(BitCast(df32, a0), BitCast(df32, b0), sum0);
// ------------------------------ ReorderWidenMulAccumulate
HWY_API Vec256<int32_t> ReorderWidenMulAccumulate(Full256<int32_t> /*d32*/,
Vec256<int16_t> a,
Vec256<int16_t> b,
const Vec256<int32_t> sum0,
Vec256<int32_t>& /*sum1*/) {
return sum0 + Vec256<int32_t>{_mm256_madd_epi16(a.raw, b.raw)};
}
// ================================================== CONVERT
@ -4053,6 +4096,11 @@ HWY_API Vec256<bfloat16_t> ReorderDemote2To(Full256<bfloat16_t> dbf16,
return BitCast(dbf16, OddEven(BitCast(du16, a), BitCast(du16, b_in_even)));
}
HWY_API Vec256<int16_t> ReorderDemote2To(Full256<int16_t> /*d16*/,
Vec256<int32_t> a, Vec256<int32_t> b) {
return Vec256<int16_t>{_mm256_packs_epi32(a.raw, b.raw)};
}
HWY_API Vec128<float> DemoteTo(Full128<float> /* tag */,
const Vec256<double> v) {
return Vec128<float>{_mm256_cvtpd_ps(v.raw)};
@ -4218,7 +4266,7 @@ HWY_API Vec256<float> ConvertTo(HWY_MAYBE_UNUSED Full256<float> df,
const RebindToSigned<decltype(df)> d32;
const auto msk_lo = Set(du32, 0xFFFF);
const auto cnst2_16_flt = Set(df, 65536.0f); // 2^16
const auto cnst2_16_flt = Set(df, 65536.0f); // 2^16
// Extract the 16 lowest/highest significant bits of v and cast to signed int
const auto v_lo = BitCast(d32, And(v, msk_lo));
@ -4238,9 +4286,9 @@ HWY_API Vec256<double> ConvertTo(HWY_MAYBE_UNUSED Full256<double> dd,
using VU = VFromD<decltype(d64)>;
const VU msk_lo = Set(d64, 0xFFFFFFFFULL);
const auto cnst2_32_dbl = Set(dd, 4294967296.0); // 2^32
const auto cnst2_32_dbl = Set(dd, 4294967296.0); // 2^32
// Extract the 32 lowest significant bits of v
// Extract the 32 lowest significant bits of v
const VU v_lo = And(v, msk_lo);
const VU v_hi = ShiftRight<32>(v);
@ -4458,9 +4506,15 @@ HWY_API size_t CountTrue(const Full256<T> /* tag */, const Mask256<T> mask) {
}
template <typename T>
HWY_API intptr_t FindFirstTrue(const Full256<T> /* tag */,
const Mask256<T> mask) {
return mask.raw ? intptr_t(Num0BitsBelowLS1Bit_Nonzero32(mask.raw)) : -1;
HWY_API size_t FindKnownFirstTrue(const Full256<T> /* tag */,
const Mask256<T> mask) {
return Num0BitsBelowLS1Bit_Nonzero32(mask.raw);
}
template <typename T>
HWY_API intptr_t FindFirstTrue(const Full256<T> d, const Mask256<T> mask) {
return mask.raw ? static_cast<intptr_t>(FindKnownFirstTrue(d, mask))
: intptr_t{-1};
}
// Beware: the suffix indicates the number of mask bits, not lane size!
@ -4903,6 +4957,13 @@ HWY_API size_t CountTrue(const Full256<T> /* tag */, const Mask256<T> mask) {
return PopCount(detail::BitsFromMask(mask));
}
template <typename T>
HWY_API size_t FindKnownFirstTrue(const Full256<T> /* tag */,
const Mask256<T> mask) {
const uint64_t mask_bits = detail::BitsFromMask(mask);
return Num0BitsBelowLS1Bit_Nonzero64(mask_bits);
}
template <typename T>
HWY_API intptr_t FindFirstTrue(const Full256<T> /* tag */,
const Mask256<T> mask) {
@ -4915,8 +4976,7 @@ HWY_API intptr_t FindFirstTrue(const Full256<T> /* tag */,
namespace detail {
template <typename T, HWY_IF_LANE_SIZE(T, 4)>
HWY_INLINE Indices256<uint32_t> IndicesFromBits(Full256<T> d,
uint64_t mask_bits) {
HWY_INLINE Vec256<uint32_t> IndicesFromBits(Full256<T> d, uint64_t mask_bits) {
const RebindToUnsigned<decltype(d)> d32;
// We need a masked Iota(). With 8 lanes, there are 256 combinations and a LUT
// of SetTableIndices would require 8 KiB, a large part of L1D. The other
@ -4925,49 +4985,49 @@ HWY_INLINE Indices256<uint32_t> IndicesFromBits(Full256<T> d,
// bits, for a total of 1 KiB.
alignas(16) constexpr uint32_t packed_array[256] = {
// PrintCompress32x8Tables
0x76543210, 0x76543210, 0x76543201, 0x76543210, 0x76543102, 0x76543120,
0x76543021, 0x76543210, 0x76542103, 0x76542130, 0x76542031, 0x76542310,
0x76541032, 0x76541320, 0x76540321, 0x76543210, 0x76532104, 0x76532140,
0x76532041, 0x76532410, 0x76531042, 0x76531420, 0x76530421, 0x76534210,
0x76521043, 0x76521430, 0x76520431, 0x76524310, 0x76510432, 0x76514320,
0x76504321, 0x76543210, 0x76432105, 0x76432150, 0x76432051, 0x76432510,
0x76431052, 0x76431520, 0x76430521, 0x76435210, 0x76421053, 0x76421530,
0x76420531, 0x76425310, 0x76410532, 0x76415320, 0x76405321, 0x76453210,
0x76321054, 0x76321540, 0x76320541, 0x76325410, 0x76310542, 0x76315420,
0x76305421, 0x76354210, 0x76210543, 0x76215430, 0x76205431, 0x76254310,
0x76105432, 0x76154320, 0x76054321, 0x76543210, 0x75432106, 0x75432160,
0x75432061, 0x75432610, 0x75431062, 0x75431620, 0x75430621, 0x75436210,
0x75421063, 0x75421630, 0x75420631, 0x75426310, 0x75410632, 0x75416320,
0x75406321, 0x75463210, 0x75321064, 0x75321640, 0x75320641, 0x75326410,
0x75310642, 0x75316420, 0x75306421, 0x75364210, 0x75210643, 0x75216430,
0x75206431, 0x75264310, 0x75106432, 0x75164320, 0x75064321, 0x75643210,
0x74321065, 0x74321650, 0x74320651, 0x74326510, 0x74310652, 0x74316520,
0x74306521, 0x74365210, 0x74210653, 0x74216530, 0x74206531, 0x74265310,
0x74106532, 0x74165320, 0x74065321, 0x74653210, 0x73210654, 0x73216540,
0x73206541, 0x73265410, 0x73106542, 0x73165420, 0x73065421, 0x73654210,
0x72106543, 0x72165430, 0x72065431, 0x72654310, 0x71065432, 0x71654320,
0x70654321, 0x76543210, 0x65432107, 0x65432170, 0x65432071, 0x65432710,
0x65431072, 0x65431720, 0x65430721, 0x65437210, 0x65421073, 0x65421730,
0x65420731, 0x65427310, 0x65410732, 0x65417320, 0x65407321, 0x65473210,
0x65321074, 0x65321740, 0x65320741, 0x65327410, 0x65310742, 0x65317420,
0x65307421, 0x65374210, 0x65210743, 0x65217430, 0x65207431, 0x65274310,
0x65107432, 0x65174320, 0x65074321, 0x65743210, 0x64321075, 0x64321750,
0x64320751, 0x64327510, 0x64310752, 0x64317520, 0x64307521, 0x64375210,
0x64210753, 0x64217530, 0x64207531, 0x64275310, 0x64107532, 0x64175320,
0x64075321, 0x64753210, 0x63210754, 0x63217540, 0x63207541, 0x63275410,
0x63107542, 0x63175420, 0x63075421, 0x63754210, 0x62107543, 0x62175430,
0x62075431, 0x62754310, 0x61075432, 0x61754320, 0x60754321, 0x67543210,
0x54321076, 0x54321760, 0x54320761, 0x54327610, 0x54310762, 0x54317620,
0x54307621, 0x54376210, 0x54210763, 0x54217630, 0x54207631, 0x54276310,
0x54107632, 0x54176320, 0x54076321, 0x54763210, 0x53210764, 0x53217640,
0x53207641, 0x53276410, 0x53107642, 0x53176420, 0x53076421, 0x53764210,
0x52107643, 0x52176430, 0x52076431, 0x52764310, 0x51076432, 0x51764320,
0x50764321, 0x57643210, 0x43210765, 0x43217650, 0x43207651, 0x43276510,
0x43107652, 0x43176520, 0x43076521, 0x43765210, 0x42107653, 0x42176530,
0x42076531, 0x42765310, 0x41076532, 0x41765320, 0x40765321, 0x47653210,
0x32107654, 0x32176540, 0x32076541, 0x32765410, 0x31076542, 0x31765420,
0x30765421, 0x37654210, 0x21076543, 0x21765430, 0x20765431, 0x27654310,
0x10765432, 0x17654320, 0x07654321, 0x76543210};
0x76543210, 0x76543218, 0x76543209, 0x76543298, 0x7654310a, 0x765431a8,
0x765430a9, 0x76543a98, 0x7654210b, 0x765421b8, 0x765420b9, 0x76542b98,
0x765410ba, 0x76541ba8, 0x76540ba9, 0x7654ba98, 0x7653210c, 0x765321c8,
0x765320c9, 0x76532c98, 0x765310ca, 0x76531ca8, 0x76530ca9, 0x7653ca98,
0x765210cb, 0x76521cb8, 0x76520cb9, 0x7652cb98, 0x76510cba, 0x7651cba8,
0x7650cba9, 0x765cba98, 0x7643210d, 0x764321d8, 0x764320d9, 0x76432d98,
0x764310da, 0x76431da8, 0x76430da9, 0x7643da98, 0x764210db, 0x76421db8,
0x76420db9, 0x7642db98, 0x76410dba, 0x7641dba8, 0x7640dba9, 0x764dba98,
0x763210dc, 0x76321dc8, 0x76320dc9, 0x7632dc98, 0x76310dca, 0x7631dca8,
0x7630dca9, 0x763dca98, 0x76210dcb, 0x7621dcb8, 0x7620dcb9, 0x762dcb98,
0x7610dcba, 0x761dcba8, 0x760dcba9, 0x76dcba98, 0x7543210e, 0x754321e8,
0x754320e9, 0x75432e98, 0x754310ea, 0x75431ea8, 0x75430ea9, 0x7543ea98,
0x754210eb, 0x75421eb8, 0x75420eb9, 0x7542eb98, 0x75410eba, 0x7541eba8,
0x7540eba9, 0x754eba98, 0x753210ec, 0x75321ec8, 0x75320ec9, 0x7532ec98,
0x75310eca, 0x7531eca8, 0x7530eca9, 0x753eca98, 0x75210ecb, 0x7521ecb8,
0x7520ecb9, 0x752ecb98, 0x7510ecba, 0x751ecba8, 0x750ecba9, 0x75ecba98,
0x743210ed, 0x74321ed8, 0x74320ed9, 0x7432ed98, 0x74310eda, 0x7431eda8,
0x7430eda9, 0x743eda98, 0x74210edb, 0x7421edb8, 0x7420edb9, 0x742edb98,
0x7410edba, 0x741edba8, 0x740edba9, 0x74edba98, 0x73210edc, 0x7321edc8,
0x7320edc9, 0x732edc98, 0x7310edca, 0x731edca8, 0x730edca9, 0x73edca98,
0x7210edcb, 0x721edcb8, 0x720edcb9, 0x72edcb98, 0x710edcba, 0x71edcba8,
0x70edcba9, 0x7edcba98, 0x6543210f, 0x654321f8, 0x654320f9, 0x65432f98,
0x654310fa, 0x65431fa8, 0x65430fa9, 0x6543fa98, 0x654210fb, 0x65421fb8,
0x65420fb9, 0x6542fb98, 0x65410fba, 0x6541fba8, 0x6540fba9, 0x654fba98,
0x653210fc, 0x65321fc8, 0x65320fc9, 0x6532fc98, 0x65310fca, 0x6531fca8,
0x6530fca9, 0x653fca98, 0x65210fcb, 0x6521fcb8, 0x6520fcb9, 0x652fcb98,
0x6510fcba, 0x651fcba8, 0x650fcba9, 0x65fcba98, 0x643210fd, 0x64321fd8,
0x64320fd9, 0x6432fd98, 0x64310fda, 0x6431fda8, 0x6430fda9, 0x643fda98,
0x64210fdb, 0x6421fdb8, 0x6420fdb9, 0x642fdb98, 0x6410fdba, 0x641fdba8,
0x640fdba9, 0x64fdba98, 0x63210fdc, 0x6321fdc8, 0x6320fdc9, 0x632fdc98,
0x6310fdca, 0x631fdca8, 0x630fdca9, 0x63fdca98, 0x6210fdcb, 0x621fdcb8,
0x620fdcb9, 0x62fdcb98, 0x610fdcba, 0x61fdcba8, 0x60fdcba9, 0x6fdcba98,
0x543210fe, 0x54321fe8, 0x54320fe9, 0x5432fe98, 0x54310fea, 0x5431fea8,
0x5430fea9, 0x543fea98, 0x54210feb, 0x5421feb8, 0x5420feb9, 0x542feb98,
0x5410feba, 0x541feba8, 0x540feba9, 0x54feba98, 0x53210fec, 0x5321fec8,
0x5320fec9, 0x532fec98, 0x5310feca, 0x531feca8, 0x530feca9, 0x53feca98,
0x5210fecb, 0x521fecb8, 0x520fecb9, 0x52fecb98, 0x510fecba, 0x51fecba8,
0x50fecba9, 0x5fecba98, 0x43210fed, 0x4321fed8, 0x4320fed9, 0x432fed98,
0x4310feda, 0x431feda8, 0x430feda9, 0x43feda98, 0x4210fedb, 0x421fedb8,
0x420fedb9, 0x42fedb98, 0x410fedba, 0x41fedba8, 0x40fedba9, 0x4fedba98,
0x3210fedc, 0x321fedc8, 0x320fedc9, 0x32fedc98, 0x310fedca, 0x31fedca8,
0x30fedca9, 0x3fedca98, 0x210fedcb, 0x21fedcb8, 0x20fedcb9, 0x2fedcb98,
0x10fedcba, 0x1fedcba8, 0x0fedcba9, 0xfedcba98};
// No need to mask because _mm256_permutevar8x32_epi32 ignores bits 3..31.
// Just shift each copy of the 32 bit LUT to extract its 4-bit fields.
@ -4975,12 +5035,11 @@ HWY_INLINE Indices256<uint32_t> IndicesFromBits(Full256<T> d,
// latency, it may be faster to use LoadDup128 and PSHUFB.
const auto packed = Set(d32, packed_array[mask_bits]);
alignas(32) constexpr uint32_t shifts[8] = {0, 4, 8, 12, 16, 20, 24, 28};
return Indices256<uint32_t>{(packed >> Load(d32, shifts)).raw};
return packed >> Load(d32, shifts);
}
template <typename T, HWY_IF_LANE_SIZE(T, 8)>
HWY_INLINE Indices256<uint32_t> IndicesFromBits(Full256<T> d,
uint64_t mask_bits) {
HWY_INLINE Vec256<uint32_t> IndicesFromBits(Full256<T> d, uint64_t mask_bits) {
const Repartition<uint32_t, decltype(d)> d32;
// For 64-bit, we still need 32-bit indices because there is no 64-bit
@ -4988,18 +5047,20 @@ HWY_INLINE Indices256<uint32_t> IndicesFromBits(Full256<T> d,
// unpacking and load the entire index vector directly.
alignas(32) constexpr uint32_t u32_indices[128] = {
// PrintCompress64x4PairTables
0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7, 2, 3, 0, 1, 4, 5,
6, 7, 0, 1, 2, 3, 4, 5, 6, 7, 4, 5, 0, 1, 2, 3, 6, 7, 0, 1, 4, 5,
2, 3, 6, 7, 2, 3, 4, 5, 0, 1, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7, 6, 7,
0, 1, 2, 3, 4, 5, 0, 1, 6, 7, 2, 3, 4, 5, 2, 3, 6, 7, 0, 1, 4, 5,
0, 1, 2, 3, 6, 7, 4, 5, 4, 5, 6, 7, 0, 1, 2, 3, 0, 1, 4, 5, 6, 7,
2, 3, 2, 3, 4, 5, 6, 7, 0, 1, 0, 1, 2, 3, 4, 5, 6, 7};
return Indices256<uint32_t>{Load(d32, u32_indices + 8 * mask_bits).raw};
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 2, 3, 4, 5, 6, 7,
10, 11, 0, 1, 4, 5, 6, 7, 8, 9, 10, 11, 4, 5, 6, 7,
12, 13, 0, 1, 2, 3, 6, 7, 8, 9, 12, 13, 2, 3, 6, 7,
10, 11, 12, 13, 0, 1, 6, 7, 8, 9, 10, 11, 12, 13, 6, 7,
14, 15, 0, 1, 2, 3, 4, 5, 8, 9, 14, 15, 2, 3, 4, 5,
10, 11, 14, 15, 0, 1, 4, 5, 8, 9, 10, 11, 14, 15, 4, 5,
12, 13, 14, 15, 0, 1, 2, 3, 8, 9, 12, 13, 14, 15, 2, 3,
10, 11, 12, 13, 14, 15, 0, 1, 8, 9, 10, 11, 12, 13, 14, 15};
return Load(d32, u32_indices + 8 * mask_bits);
}
template <typename T, HWY_IF_LANE_SIZE(T, 4)>
HWY_INLINE Indices256<uint32_t> IndicesFromNotBits(Full256<T> d,
uint64_t mask_bits) {
HWY_INLINE Vec256<uint32_t> IndicesFromNotBits(Full256<T> d,
uint64_t mask_bits) {
const RebindToUnsigned<decltype(d)> d32;
// We need a masked Iota(). With 8 lanes, there are 256 combinations and a LUT
// of SetTableIndices would require 8 KiB, a large part of L1D. The other
@ -5008,49 +5069,49 @@ HWY_INLINE Indices256<uint32_t> IndicesFromNotBits(Full256<T> d,
// bits, for a total of 1 KiB.
alignas(16) constexpr uint32_t packed_array[256] = {
// PrintCompressNot32x8Tables
0x76543210, 0x07654321, 0x17654320, 0x10765432, 0x27654310, 0x20765431,
0x21765430, 0x21076543, 0x37654210, 0x30765421, 0x31765420, 0x31076542,
0x32765410, 0x32076541, 0x32176540, 0x32107654, 0x47653210, 0x40765321,
0x41765320, 0x41076532, 0x42765310, 0x42076531, 0x42176530, 0x42107653,
0x43765210, 0x43076521, 0x43176520, 0x43107652, 0x43276510, 0x43207651,
0x43217650, 0x43210765, 0x57643210, 0x50764321, 0x51764320, 0x51076432,
0x52764310, 0x52076431, 0x52176430, 0x52107643, 0x53764210, 0x53076421,
0x53176420, 0x53107642, 0x53276410, 0x53207641, 0x53217640, 0x53210764,
0x54763210, 0x54076321, 0x54176320, 0x54107632, 0x54276310, 0x54207631,
0x54217630, 0x54210763, 0x54376210, 0x54307621, 0x54317620, 0x54310762,
0x54327610, 0x54320761, 0x54321760, 0x54321076, 0x67543210, 0x60754321,
0x61754320, 0x61075432, 0x62754310, 0x62075431, 0x62175430, 0x62107543,
0x63754210, 0x63075421, 0x63175420, 0x63107542, 0x63275410, 0x63207541,
0x63217540, 0x63210754, 0x64753210, 0x64075321, 0x64175320, 0x64107532,
0x64275310, 0x64207531, 0x64217530, 0x64210753, 0x64375210, 0x64307521,
0x64317520, 0x64310752, 0x64327510, 0x64320751, 0x64321750, 0x64321075,
0x65743210, 0x65074321, 0x65174320, 0x65107432, 0x65274310, 0x65207431,
0x65217430, 0x65210743, 0x65374210, 0x65307421, 0x65317420, 0x65310742,
0x65327410, 0x65320741, 0x65321740, 0x65321074, 0x65473210, 0x65407321,
0x65417320, 0x65410732, 0x65427310, 0x65420731, 0x65421730, 0x65421073,
0x65437210, 0x65430721, 0x65431720, 0x65431072, 0x65432710, 0x65432071,
0x65432170, 0x65432107, 0x76543210, 0x70654321, 0x71654320, 0x71065432,
0x72654310, 0x72065431, 0x72165430, 0x72106543, 0x73654210, 0x73065421,
0x73165420, 0x73106542, 0x73265410, 0x73206541, 0x73216540, 0x73210654,
0x74653210, 0x74065321, 0x74165320, 0x74106532, 0x74265310, 0x74206531,
0x74216530, 0x74210653, 0x74365210, 0x74306521, 0x74316520, 0x74310652,
0x74326510, 0x74320651, 0x74321650, 0x74321065, 0x75643210, 0x75064321,
0x75164320, 0x75106432, 0x75264310, 0x75206431, 0x75216430, 0x75210643,
0x75364210, 0x75306421, 0x75316420, 0x75310642, 0x75326410, 0x75320641,
0x75321640, 0x75321064, 0x75463210, 0x75406321, 0x75416320, 0x75410632,
0x75426310, 0x75420631, 0x75421630, 0x75421063, 0x75436210, 0x75430621,
0x75431620, 0x75431062, 0x75432610, 0x75432061, 0x75432160, 0x75432106,
0x76543210, 0x76054321, 0x76154320, 0x76105432, 0x76254310, 0x76205431,
0x76215430, 0x76210543, 0x76354210, 0x76305421, 0x76315420, 0x76310542,
0x76325410, 0x76320541, 0x76321540, 0x76321054, 0x76453210, 0x76405321,
0x76415320, 0x76410532, 0x76425310, 0x76420531, 0x76421530, 0x76421053,
0x76435210, 0x76430521, 0x76431520, 0x76431052, 0x76432510, 0x76432051,
0x76432150, 0x76432105, 0x76543210, 0x76504321, 0x76514320, 0x76510432,
0x76524310, 0x76520431, 0x76521430, 0x76521043, 0x76534210, 0x76530421,
0x76531420, 0x76531042, 0x76532410, 0x76532041, 0x76532140, 0x76532104,
0x76543210, 0x76540321, 0x76541320, 0x76541032, 0x76542310, 0x76542031,
0x76542130, 0x76542103, 0x76543210, 0x76543021, 0x76543120, 0x76543102,
0x76543210, 0x76543201, 0x76543210, 0x76543210};
0xfedcba98, 0x8fedcba9, 0x9fedcba8, 0x98fedcba, 0xafedcb98, 0xa8fedcb9,
0xa9fedcb8, 0xa98fedcb, 0xbfedca98, 0xb8fedca9, 0xb9fedca8, 0xb98fedca,
0xbafedc98, 0xba8fedc9, 0xba9fedc8, 0xba98fedc, 0xcfedba98, 0xc8fedba9,
0xc9fedba8, 0xc98fedba, 0xcafedb98, 0xca8fedb9, 0xca9fedb8, 0xca98fedb,
0xcbfeda98, 0xcb8feda9, 0xcb9feda8, 0xcb98feda, 0xcbafed98, 0xcba8fed9,
0xcba9fed8, 0xcba98fed, 0xdfecba98, 0xd8fecba9, 0xd9fecba8, 0xd98fecba,
0xdafecb98, 0xda8fecb9, 0xda9fecb8, 0xda98fecb, 0xdbfeca98, 0xdb8feca9,
0xdb9feca8, 0xdb98feca, 0xdbafec98, 0xdba8fec9, 0xdba9fec8, 0xdba98fec,
0xdcfeba98, 0xdc8feba9, 0xdc9feba8, 0xdc98feba, 0xdcafeb98, 0xdca8feb9,
0xdca9feb8, 0xdca98feb, 0xdcbfea98, 0xdcb8fea9, 0xdcb9fea8, 0xdcb98fea,
0xdcbafe98, 0xdcba8fe9, 0xdcba9fe8, 0xdcba98fe, 0xefdcba98, 0xe8fdcba9,
0xe9fdcba8, 0xe98fdcba, 0xeafdcb98, 0xea8fdcb9, 0xea9fdcb8, 0xea98fdcb,
0xebfdca98, 0xeb8fdca9, 0xeb9fdca8, 0xeb98fdca, 0xebafdc98, 0xeba8fdc9,
0xeba9fdc8, 0xeba98fdc, 0xecfdba98, 0xec8fdba9, 0xec9fdba8, 0xec98fdba,
0xecafdb98, 0xeca8fdb9, 0xeca9fdb8, 0xeca98fdb, 0xecbfda98, 0xecb8fda9,
0xecb9fda8, 0xecb98fda, 0xecbafd98, 0xecba8fd9, 0xecba9fd8, 0xecba98fd,
0xedfcba98, 0xed8fcba9, 0xed9fcba8, 0xed98fcba, 0xedafcb98, 0xeda8fcb9,
0xeda9fcb8, 0xeda98fcb, 0xedbfca98, 0xedb8fca9, 0xedb9fca8, 0xedb98fca,
0xedbafc98, 0xedba8fc9, 0xedba9fc8, 0xedba98fc, 0xedcfba98, 0xedc8fba9,
0xedc9fba8, 0xedc98fba, 0xedcafb98, 0xedca8fb9, 0xedca9fb8, 0xedca98fb,
0xedcbfa98, 0xedcb8fa9, 0xedcb9fa8, 0xedcb98fa, 0xedcbaf98, 0xedcba8f9,
0xedcba9f8, 0xedcba98f, 0xfedcba98, 0xf8edcba9, 0xf9edcba8, 0xf98edcba,
0xfaedcb98, 0xfa8edcb9, 0xfa9edcb8, 0xfa98edcb, 0xfbedca98, 0xfb8edca9,
0xfb9edca8, 0xfb98edca, 0xfbaedc98, 0xfba8edc9, 0xfba9edc8, 0xfba98edc,
0xfcedba98, 0xfc8edba9, 0xfc9edba8, 0xfc98edba, 0xfcaedb98, 0xfca8edb9,
0xfca9edb8, 0xfca98edb, 0xfcbeda98, 0xfcb8eda9, 0xfcb9eda8, 0xfcb98eda,
0xfcbaed98, 0xfcba8ed9, 0xfcba9ed8, 0xfcba98ed, 0xfdecba98, 0xfd8ecba9,
0xfd9ecba8, 0xfd98ecba, 0xfdaecb98, 0xfda8ecb9, 0xfda9ecb8, 0xfda98ecb,
0xfdbeca98, 0xfdb8eca9, 0xfdb9eca8, 0xfdb98eca, 0xfdbaec98, 0xfdba8ec9,
0xfdba9ec8, 0xfdba98ec, 0xfdceba98, 0xfdc8eba9, 0xfdc9eba8, 0xfdc98eba,
0xfdcaeb98, 0xfdca8eb9, 0xfdca9eb8, 0xfdca98eb, 0xfdcbea98, 0xfdcb8ea9,
0xfdcb9ea8, 0xfdcb98ea, 0xfdcbae98, 0xfdcba8e9, 0xfdcba9e8, 0xfdcba98e,
0xfedcba98, 0xfe8dcba9, 0xfe9dcba8, 0xfe98dcba, 0xfeadcb98, 0xfea8dcb9,
0xfea9dcb8, 0xfea98dcb, 0xfebdca98, 0xfeb8dca9, 0xfeb9dca8, 0xfeb98dca,
0xfebadc98, 0xfeba8dc9, 0xfeba9dc8, 0xfeba98dc, 0xfecdba98, 0xfec8dba9,
0xfec9dba8, 0xfec98dba, 0xfecadb98, 0xfeca8db9, 0xfeca9db8, 0xfeca98db,
0xfecbda98, 0xfecb8da9, 0xfecb9da8, 0xfecb98da, 0xfecbad98, 0xfecba8d9,
0xfecba9d8, 0xfecba98d, 0xfedcba98, 0xfed8cba9, 0xfed9cba8, 0xfed98cba,
0xfedacb98, 0xfeda8cb9, 0xfeda9cb8, 0xfeda98cb, 0xfedbca98, 0xfedb8ca9,
0xfedb9ca8, 0xfedb98ca, 0xfedbac98, 0xfedba8c9, 0xfedba9c8, 0xfedba98c,
0xfedcba98, 0xfedc8ba9, 0xfedc9ba8, 0xfedc98ba, 0xfedcab98, 0xfedca8b9,
0xfedca9b8, 0xfedca98b, 0xfedcba98, 0xfedcb8a9, 0xfedcb9a8, 0xfedcb98a,
0xfedcba98, 0xfedcba89, 0xfedcba98, 0xfedcba98};
// No need to mask because <_mm256_permutevar8x32_epi32> ignores bits 3..31.
// Just shift each copy of the 32 bit LUT to extract its 4-bit fields.
@ -5058,12 +5119,12 @@ HWY_INLINE Indices256<uint32_t> IndicesFromNotBits(Full256<T> d,
// latency, it may be faster to use LoadDup128 and PSHUFB.
const auto packed = Set(d32, packed_array[mask_bits]);
alignas(32) constexpr uint32_t shifts[8] = {0, 4, 8, 12, 16, 20, 24, 28};
return Indices256<uint32_t>{(packed >> Load(d32, shifts)).raw};
return packed >> Load(d32, shifts);
}
template <typename T, HWY_IF_LANE_SIZE(T, 8)>
HWY_INLINE Indices256<uint32_t> IndicesFromNotBits(Full256<T> d,
uint64_t mask_bits) {
HWY_INLINE Vec256<uint32_t> IndicesFromNotBits(Full256<T> d,
uint64_t mask_bits) {
const Repartition<uint32_t, decltype(d)> d32;
// For 64-bit, we still need 32-bit indices because there is no 64-bit
@ -5071,13 +5132,15 @@ HWY_INLINE Indices256<uint32_t> IndicesFromNotBits(Full256<T> d,
// unpacking and load the entire index vector directly.
alignas(32) constexpr uint32_t u32_indices[128] = {
// PrintCompressNot64x4PairTables
0, 1, 2, 3, 4, 5, 6, 7, 2, 3, 4, 5, 6, 7, 0, 1, 0, 1, 4, 5, 6, 7,
2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 0, 1, 2, 3, 6, 7, 4, 5, 2, 3, 6, 7,
0, 1, 4, 5, 0, 1, 6, 7, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 0, 1,
2, 3, 4, 5, 6, 7, 2, 3, 4, 5, 0, 1, 6, 7, 0, 1, 4, 5, 2, 3, 6, 7,
4, 5, 0, 1, 2, 3, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7, 2, 3, 0, 1, 4, 5,
6, 7, 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7};
return Indices256<uint32_t>{Load(d32, u32_indices + 8 * mask_bits).raw};
8, 9, 10, 11, 12, 13, 14, 15, 10, 11, 12, 13, 14, 15, 8, 9,
8, 9, 12, 13, 14, 15, 10, 11, 12, 13, 14, 15, 8, 9, 10, 11,
8, 9, 10, 11, 14, 15, 12, 13, 10, 11, 14, 15, 8, 9, 12, 13,
8, 9, 14, 15, 10, 11, 12, 13, 14, 15, 8, 9, 10, 11, 12, 13,
8, 9, 10, 11, 12, 13, 14, 15, 10, 11, 12, 13, 8, 9, 14, 15,
8, 9, 12, 13, 10, 11, 14, 15, 12, 13, 8, 9, 10, 11, 14, 15,
8, 9, 10, 11, 12, 13, 14, 15, 10, 11, 8, 9, 12, 13, 14, 15,
8, 9, 10, 11, 12, 13, 14, 15, 8, 9, 10, 11, 12, 13, 14, 15};
return Load(d32, u32_indices + 8 * mask_bits);
}
template <typename T, HWY_IF_NOT_LANE_SIZE(T, 2)>
HWY_INLINE Vec256<T> Compress(Vec256<T> v, const uint64_t mask_bits) {
@ -5085,7 +5148,9 @@ HWY_INLINE Vec256<T> Compress(Vec256<T> v, const uint64_t mask_bits) {
const Repartition<uint32_t, decltype(d)> du32;
HWY_DASSERT(mask_bits < (1ull << (32 / sizeof(T))));
const auto indices = IndicesFromBits(d, mask_bits);
// 32-bit indices because we only have _mm256_permutevar8x32_epi32 (there is
// no instruction for 4x64).
const Indices256<uint32_t> indices{IndicesFromBits(d, mask_bits).raw};
return BitCast(d, TableLookupLanes(BitCast(du32, v), indices));
}
@ -5135,7 +5200,9 @@ HWY_INLINE Vec256<T> CompressNot(Vec256<T> v, const uint64_t mask_bits) {
const Repartition<uint32_t, decltype(d)> du32;
HWY_DASSERT(mask_bits < (1ull << (32 / sizeof(T))));
const auto indices = IndicesFromNotBits(d, mask_bits);
// 32-bit indices because we only have _mm256_permutevar8x32_epi32 (there is
// no instruction for 4x64).
const Indices256<uint32_t> indices{IndicesFromNotBits(d, mask_bits).raw};
return BitCast(d, TableLookupLanes(BitCast(du32, v), indices));
}
@ -5199,7 +5266,22 @@ HWY_API size_t CompressBlendedStore(Vec256<T> v, Mask256<T> m, Full256<T> d,
T* HWY_RESTRICT unaligned) {
const uint64_t mask_bits = detail::BitsFromMask(m);
const size_t count = PopCount(mask_bits);
BlendedStore(detail::Compress(v, mask_bits), FirstN(d, count), d, unaligned);
const Repartition<uint32_t, decltype(d)> du32;
HWY_DASSERT(mask_bits < (1ull << (32 / sizeof(T))));
// 32-bit indices because we only have _mm256_permutevar8x32_epi32 (there is
// no instruction for 4x64). Nibble MSB encodes FirstN.
const Vec256<uint32_t> idx_and_mask = detail::IndicesFromBits(d, mask_bits);
// Shift nibble MSB into MSB
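// (Each u32 lane of idx_and_mask carries the FirstN flag from IndicesFromBits
// in bit 3; shifting it into bit 31 lets MaskFromVec recover the FirstN mask.)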
const Mask256<uint32_t> mask32 = MaskFromVec(ShiftLeft<28>(idx_and_mask));
// First cast to unsigned (RebindMask cannot change lane size)
const Mask256<MakeUnsigned<T>> mask_u{mask32.raw};
const Mask256<T> mask = RebindMask(d, mask_u);
const Vec256<T> compressed =
BitCast(d, TableLookupLanes(BitCast(du32, v),
Indices256<uint32_t>{idx_and_mask.raw}));
BlendedStore(compressed, mask, d, unaligned);
// Workaround for MSAN not marking output as initialized (b/233326619)
#if HWY_IS_MSAN
__msan_unpoison(unaligned, count * sizeof(T));
@ -5429,6 +5511,28 @@ HWY_INLINE Vec256<T> MaxOfLanes(hwy::SizeTag<8> /* tag */,
return Max(v10, v01);
}
HWY_API Vec256<uint16_t> SumOfLanes(hwy::SizeTag<2> /* tag */,
Vec256<uint16_t> v) {
const Full256<uint16_t> d;
const RepartitionToWide<decltype(d)> d32;
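// Sum the even and odd u16 halves in u32 lanes; the low 16 bits of the total
// equal the u16 lane sum (mod 2^16), which is broadcast below.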
const auto even = And(BitCast(d32, v), Set(d32, 0xFFFF));
const auto odd = ShiftRight<16>(BitCast(d32, v));
const auto sum = SumOfLanes(hwy::SizeTag<4>(), even + odd);
// Also broadcast into odd lanes.
return OddEven(BitCast(d, ShiftLeft<16>(sum)), BitCast(d, sum));
}
HWY_API Vec256<int16_t> SumOfLanes(hwy::SizeTag<2> /* tag */,
Vec256<int16_t> v) {
const Full256<int16_t> d;
const RepartitionToWide<decltype(d)> d32;
// Sign-extend
const auto even = ShiftRight<16>(ShiftLeft<16>(BitCast(d32, v)));
const auto odd = ShiftRight<16>(BitCast(d32, v));
const auto sum = SumOfLanes(hwy::SizeTag<4>(), even + odd);
// Also broadcast into odd lanes.
return OddEven(BitCast(d, ShiftLeft<16>(sum)), BitCast(d, sum));
}
HWY_API Vec256<uint16_t> MinOfLanes(hwy::SizeTag<2> /* tag */,
Vec256<uint16_t> v) {
const Full256<uint16_t> d;
@ -5475,7 +5579,7 @@ HWY_API Vec256<int16_t> MaxOfLanes(hwy::SizeTag<2> /* tag */,
} // namespace detail
// Supported for {uif}32x8, {uif}64x4. Returns the sum in each lane.
// Supported for {uif}{32,64},{ui}16. Returns the broadcasted result.
template <typename T>
HWY_API Vec256<T> SumOfLanes(Full256<T> d, const Vec256<T> vHL) {
const Vec256<T> vLH = ConcatLowerUpper(d, vHL, vHL);

View File

@ -113,6 +113,9 @@ class Vec512 {
using Raw = typename detail::Raw512<T>::type;
public:
using PrivateT = T; // only for DFromV
static constexpr size_t kPrivateN = 64 / sizeof(T); // only for DFromV
// Compound assignment. Only usable if there is a corresponding non-member
// binary operator overload. For example, only f32 and f64 support division.
HWY_INLINE Vec512& operator*=(const Vec512 other) {
@ -146,6 +149,9 @@ struct Mask512 {
typename detail::RawMask512<sizeof(T)>::type raw;
};
template <typename T>
using Full512 = Simd<T, 64 / sizeof(T), 0>;
// ------------------------------ BitCast
namespace detail {
@ -1775,6 +1781,43 @@ HWY_INLINE Mask512<T> Xor(hwy::SizeTag<8> /*tag*/, const Mask512<T> a,
#endif
}
template <typename T>
HWY_INLINE Mask512<T> ExclusiveNeither(hwy::SizeTag<1> /*tag*/,
const Mask512<T> a, const Mask512<T> b) {
#if HWY_COMPILER_HAS_MASK_INTRINSICS
return Mask512<T>{_kxnor_mask64(a.raw, b.raw)};
#else
return Mask512<T>{~(a.raw ^ b.raw)};
#endif
}
template <typename T>
HWY_INLINE Mask512<T> ExclusiveNeither(hwy::SizeTag<2> /*tag*/,
const Mask512<T> a, const Mask512<T> b) {
#if HWY_COMPILER_HAS_MASK_INTRINSICS
return Mask512<T>{_kxnor_mask32(a.raw, b.raw)};
#else
return Mask512<T>{static_cast<__mmask32>(~(a.raw ^ b.raw) & 0xFFFFFFFF)};
#endif
}
template <typename T>
HWY_INLINE Mask512<T> ExclusiveNeither(hwy::SizeTag<4> /*tag*/,
const Mask512<T> a, const Mask512<T> b) {
#if HWY_COMPILER_HAS_MASK_INTRINSICS
return Mask512<T>{_kxnor_mask16(a.raw, b.raw)};
#else
return Mask512<T>{static_cast<__mmask16>(~(a.raw ^ b.raw) & 0xFFFF)};
#endif
}
template <typename T>
HWY_INLINE Mask512<T> ExclusiveNeither(hwy::SizeTag<8> /*tag*/,
const Mask512<T> a, const Mask512<T> b) {
#if HWY_COMPILER_HAS_MASK_INTRINSICS
return Mask512<T>{_kxnor_mask8(a.raw, b.raw)};
#else
return Mask512<T>{static_cast<__mmask8>(~(a.raw ^ b.raw) & 0xFF)};
#endif
}
} // namespace detail
template <typename T>
@ -1802,6 +1845,11 @@ HWY_API Mask512<T> Xor(const Mask512<T> a, Mask512<T> b) {
return detail::Xor(hwy::SizeTag<sizeof(T)>(), a, b);
}
template <typename T>
HWY_API Mask512<T> ExclusiveNeither(const Mask512<T> a, Mask512<T> b) {
return detail::ExclusiveNeither(hwy::SizeTag<sizeof(T)>(), a, b);
}
// ------------------------------ BroadcastSignBit (ShiftRight, compare, mask)
HWY_API Vec512<int8_t> BroadcastSignBit(const Vec512<int8_t> v) {
@ -3285,6 +3333,11 @@ HWY_API Vec512<bfloat16_t> ReorderDemote2To(Full512<bfloat16_t> dbf16,
return BitCast(dbf16, OddEven(BitCast(du16, a), BitCast(du16, b_in_even)));
}
HWY_API Vec512<int16_t> ReorderDemote2To(Full512<int16_t> /*d16*/,
Vec512<int32_t> a, Vec512<int32_t> b) {
return Vec512<int16_t>{_mm512_packs_epi32(a.raw, b.raw)};
}
HWY_API Vec256<float> DemoteTo(Full256<float> /* tag */,
const Vec512<double> v) {
return Vec256<float>{_mm512_cvtpd_ps(v.raw)};
@ -3646,15 +3699,21 @@ HWY_API size_t CountTrue(const Full512<T> /* tag */, const Mask512<T> mask) {
}
template <typename T, HWY_IF_NOT_LANE_SIZE(T, 1)>
HWY_API intptr_t FindFirstTrue(const Full512<T> /* tag */,
const Mask512<T> mask) {
return mask.raw ? intptr_t(Num0BitsBelowLS1Bit_Nonzero32(mask.raw)) : -1;
HWY_API size_t FindKnownFirstTrue(const Full512<T> /* tag */,
const Mask512<T> mask) {
return Num0BitsBelowLS1Bit_Nonzero32(mask.raw);
}
template <typename T, HWY_IF_LANE_SIZE(T, 1)>
HWY_API intptr_t FindFirstTrue(const Full512<T> /* tag */,
const Mask512<T> mask) {
return mask.raw ? intptr_t(Num0BitsBelowLS1Bit_Nonzero64(mask.raw)) : -1;
HWY_API size_t FindKnownFirstTrue(const Full512<T> /* tag */,
const Mask512<T> mask) {
return Num0BitsBelowLS1Bit_Nonzero64(mask.raw);
}
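// FindFirstTrue (below) also handles an all-false mask; FindKnownFirstTrue
// requires at least one true lane because the _Nonzero bit scans above assume
// a nonzero argument.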
template <typename T>
HWY_API intptr_t FindFirstTrue(const Full512<T> d, const Mask512<T> mask) {
return mask.raw ? static_cast<intptr_t>(FindKnownFirstTrue(d, mask))
: intptr_t{-1};
}
// ------------------------------ Compress
@ -3672,7 +3731,9 @@ template <typename T, HWY_IF_LANE_SIZE(T, 8)>
HWY_API Vec512<T> Compress(Vec512<T> v, Mask512<T> mask) {
// See CompressIsPartition. u64 is faster than u32.
alignas(16) constexpr uint64_t packed_array[256] = {
// PrintCompress32x8Tables
// From PrintCompress32x8Tables, without the FirstN extension (there is
// no benefit to including them because 64-bit CompressStore is anyway
// masked, but also no harm because TableLookupLanes ignores the MSB).
0x76543210, 0x76543210, 0x76543201, 0x76543210, 0x76543102, 0x76543120,
0x76543021, 0x76543210, 0x76542103, 0x76542130, 0x76542031, 0x76542310,
0x76541032, 0x76541320, 0x76540321, 0x76543210, 0x76532104, 0x76532140,
@ -3781,7 +3842,7 @@ HWY_API Vec512<T> Compress(Vec512<T> v, const Mask512<T> mask) {
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31};
const auto idx = LoadU(du, iota + 32 - num0);
const Vec512<uint16_t> idx = LoadU(du, iota + 32 - num0);
const Vec512<uint16_t> cu{_mm512_mask_permutexvar_epi16(
demoted0.raw, m_upper, idx.raw, demoted1.raw)};
#endif // HWY_TARGET == HWY_AVX3_DL
@ -3800,7 +3861,9 @@ template <typename T, HWY_IF_LANE_SIZE(T, 8)>
HWY_API Vec512<T> CompressNot(Vec512<T> v, Mask512<T> mask) {
// See CompressIsPartition. u64 is faster than u32.
alignas(16) constexpr uint64_t packed_array[256] = {
// PrintCompressNot32x8Tables
// From PrintCompressNot32x8Tables, without the FirstN extension (there is
// no benefit to including them because 64-bit CompressStore is anyway
// masked, but also no harm because TableLookupLanes ignores the MSB).
0x76543210, 0x07654321, 0x17654320, 0x10765432, 0x27654310, 0x20765431,
0x21765430, 0x21076543, 0x37654210, 0x30765421, 0x31765420, 0x31076542,
0x32765410, 0x32076541, 0x32176540, 0x32107654, 0x47653210, 0x40765321,
@ -4149,7 +4212,7 @@ HWY_API void StoreTransposedBlocks4(const Vec512<T> i, const Vec512<T> j,
HWY_INLINE Vec512<uint64_t> MulEven(const Vec512<uint64_t> a,
const Vec512<uint64_t> b) {
const DFromV<decltype(a)> du64;
const Full512<uint64_t> du64;
const RepartitionToNarrow<decltype(du64)> du32;
const auto maskL = Set(du64, 0xFFFFFFFFULL);
const auto a32 = BitCast(du32, a);
@ -4178,7 +4241,7 @@ HWY_INLINE Vec512<uint64_t> MulEven(const Vec512<uint64_t> a,
HWY_INLINE Vec512<uint64_t> MulOdd(const Vec512<uint64_t> a,
const Vec512<uint64_t> b) {
const DFromV<decltype(a)> du64;
const Full512<uint64_t> du64;
const RepartitionToNarrow<decltype(du64)> du32;
const auto maskL = Set(du64, 0xFFFFFFFFULL);
const auto a32 = BitCast(du32, a);
@ -4203,25 +4266,13 @@ HWY_INLINE Vec512<uint64_t> MulOdd(const Vec512<uint64_t> a,
return InterleaveUpper(du64, mulL, mulH);
}
// ------------------------------ ReorderWidenMulAccumulate (MulAdd, ZipLower)
HWY_API Vec512<float> ReorderWidenMulAccumulate(Full512<float> df32,
Vec512<bfloat16_t> a,
Vec512<bfloat16_t> b,
const Vec512<float> sum0,
Vec512<float>& sum1) {
// TODO(janwas): _mm512_dpbf16_ps when available
const Repartition<uint16_t, decltype(df32)> du16;
const RebindToUnsigned<decltype(df32)> du32;
const Vec512<uint16_t> zero = Zero(du16);
// Lane order within sum0/1 is undefined, hence we can avoid the
// longer-latency lane-crossing PromoteTo.
const Vec512<uint32_t> a0 = ZipLower(du32, zero, BitCast(du16, a));
const Vec512<uint32_t> a1 = ZipUpper(du32, zero, BitCast(du16, a));
const Vec512<uint32_t> b0 = ZipLower(du32, zero, BitCast(du16, b));
const Vec512<uint32_t> b1 = ZipUpper(du32, zero, BitCast(du16, b));
sum1 = MulAdd(BitCast(df32, a1), BitCast(df32, b1), sum1);
return MulAdd(BitCast(df32, a0), BitCast(df32, b0), sum0);
// ------------------------------ ReorderWidenMulAccumulate
HWY_API Vec512<int32_t> ReorderWidenMulAccumulate(Full512<int32_t> /*d32*/,
Vec512<int16_t> a,
Vec512<int16_t> b,
const Vec512<int32_t> sum0,
Vec512<int32_t>& /*sum1*/) {
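// _mm512_madd_epi16 multiplies adjacent i16 pairs and adds each pair into one
// i32 lane, so the entire widened product fits in the returned sum0 and sum1
// is left unchanged.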
return sum0 + Vec512<int32_t>{_mm512_madd_epi16(a.raw, b.raw)};
}
// ------------------------------ Reductions
@ -4245,6 +4296,23 @@ HWY_API Vec512<float> SumOfLanes(Full512<float> d, Vec512<float> v) {
HWY_API Vec512<double> SumOfLanes(Full512<double> d, Vec512<double> v) {
return Set(d, _mm512_reduce_add_pd(v.raw));
}
HWY_API Vec512<uint16_t> SumOfLanes(Full512<uint16_t> d, Vec512<uint16_t> v) {
const RepartitionToWide<decltype(d)> d32;
const auto even = And(BitCast(d32, v), Set(d32, 0xFFFF));
const auto odd = ShiftRight<16>(BitCast(d32, v));
const auto sum = SumOfLanes(d32, even + odd);
// Also broadcast into odd lanes.
return OddEven(BitCast(d, ShiftLeft<16>(sum)), BitCast(d, sum));
}
HWY_API Vec512<int16_t> SumOfLanes(Full512<int16_t> d, Vec512<int16_t> v) {
const RepartitionToWide<decltype(d)> d32;
// Sign-extend
const auto even = ShiftRight<16>(ShiftLeft<16>(BitCast(d32, v)));
const auto odd = ShiftRight<16>(BitCast(d32, v));
const auto sum = SumOfLanes(d32, even + odd);
// Also broadcast into odd lanes.
return OddEven(BitCast(d, ShiftLeft<16>(sum)), BitCast(d, sum));
}
// Returns the minimum in each lane.
HWY_API Vec512<int32_t> MinOfLanes(Full512<int32_t> d, Vec512<int32_t> v) {

View File

@ -35,7 +35,7 @@ namespace hwy {
namespace HWY_NAMESPACE {
// Prints lanes around `lane`, in memory order.
template <class D, class V = Vec<D>>
template <class D, class V = VFromD<D>>
void Print(const D d, const char* caption, VecArg<V> v, size_t lane_u = 0,
size_t max_lanes = 7) {
const size_t N = Lanes(d);

View File

@ -43,7 +43,6 @@
#endif // HWY_COMPILER_MSVC
#elif HWY_ARCH_ARM && HWY_OS_LINUX
#include <asm/hwcap.h>
#include <sys/auxv.h>
#endif // HWY_ARCH_*
@ -104,7 +103,7 @@ int64_t supported_targets_for_test_ = 0;
int64_t supported_mask_ = LimitsMax<int64_t>();
#if HWY_ARCH_X86
// Arbritrary bit indices indicating which instruction set extensions are
// Arbitrary bit indices indicating which instruction set extensions are
// supported. Use enum to ensure values are distinct.
enum class FeatureIndex : uint32_t {
kSSE = 0,

View File

@ -16,7 +16,11 @@
#ifndef HIGHWAY_HWY_TARGETS_H_
#define HIGHWAY_HWY_TARGETS_H_
// Allows opting out of C++ standard library usage, which is not available in
// some Compiler Explorer environments.
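// Define HWY_NO_LIBCXX (e.g. -DHWY_NO_LIBCXX on the compiler command line) to
// skip <vector> and <atomic> and omit SupportedAndGeneratedTargets().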
#ifndef HWY_NO_LIBCXX
#include <vector>
#endif
// For SIMD module implementations and their callers. Defines which targets to
// generate and call.
@ -25,7 +29,7 @@
#include "hwy/detect_targets.h"
#include "hwy/highway_export.h"
#if !HWY_ARCH_RVV
#if !HWY_ARCH_RVV && !defined(HWY_NO_LIBCXX)
#include <atomic>
#endif
@ -61,6 +65,8 @@ HWY_DLLEXPORT void DisableTargets(int64_t disabled_targets);
// all targets.
HWY_DLLEXPORT void SetSupportedTargetsForTest(int64_t targets);
#ifndef HWY_NO_LIBCXX
// Return the list of targets in HWY_TARGETS supported by the CPU as a list of
// individual HWY_* target macros such as HWY_SCALAR or HWY_NEON. This list
// is affected by the current SetSupportedTargetsForTest() mock if any.
@ -74,6 +80,8 @@ HWY_INLINE std::vector<int64_t> SupportedAndGeneratedTargets() {
return ret;
}
#endif // HWY_NO_LIBCXX
static inline HWY_MAYBE_UNUSED const char* TargetName(int64_t target) {
switch (target) {
#if HWY_ARCH_X86
@ -296,8 +304,8 @@ struct ChosenTarget {
}
private:
// TODO(janwas): remove #if once <atomic> is available
#if HWY_ARCH_RVV
// TODO(janwas): remove RVV once <atomic> is available
#if HWY_ARCH_RVV || defined(HWY_NO_LIBCXX)
int64_t LoadMask() const { return mask_; }
void StoreMask(int64_t mask) { mask_ = mask; }

View File

@ -37,6 +37,7 @@ DECLARE_FUNCTION(SVE_256)
DECLARE_FUNCTION(SVE2_128)
DECLARE_FUNCTION(PPC8)
DECLARE_FUNCTION(WASM)
DECLARE_FUNCTION(WASM_EMU256)
DECLARE_FUNCTION(RVV)
DECLARE_FUNCTION(SCALAR)
DECLARE_FUNCTION(EMU128)
@ -81,6 +82,7 @@ void CheckFakeFunction() {
CallFunctionForTarget(HWY_SVE2_128, __LINE__);
CallFunctionForTarget(HWY_PPC8, __LINE__);
CallFunctionForTarget(HWY_WASM, __LINE__);
CallFunctionForTarget(HWY_WASM_EMU256, __LINE__);
CallFunctionForTarget(HWY_RVV, __LINE__);
// The tables only have space for either HWY_SCALAR or HWY_EMU128; the former
// is opt-in only.

View File

@ -17,6 +17,8 @@
#include <stdint.h>
#include <string.h> // memcpy
#include <algorithm> // std::fill
#undef HWY_TARGET_INCLUDE
#define HWY_TARGET_INCLUDE "tests/blockwise_shift_test.cc"
#include "hwy/foreach_target.h" // IWYU pragma: keep

View File

@ -17,6 +17,8 @@
#include <stdint.h>
#include <string.h>
#include <algorithm> // std::fill
#undef HWY_TARGET_INCLUDE
#define HWY_TARGET_INCLUDE "tests/blockwise_test.cc"
#include "hwy/foreach_target.h" // IWYU pragma: keep

View File

@ -17,6 +17,8 @@
#include <stdint.h>
#include <string.h> // memcpy
#include <algorithm> // std::fill
#undef HWY_TARGET_INCLUDE
#define HWY_TARGET_INCLUDE "tests/combine_test.cc"
#include "hwy/foreach_target.h" // IWYU pragma: keep

View File

@ -338,7 +338,7 @@ HWY_NOINLINE void TestAllLt128Upper() {
ForGEVectors<128, TestLt128Upper>()(uint64_t());
}
struct TestEq128 {
struct TestEq128 { // Also Ne128
template <typename T, class D>
HWY_NOINLINE void operator()(T /*unused*/, D d) {
using V = Vec<D>;
@ -353,15 +353,24 @@ struct TestEq128 {
HWY_ASSERT_MASK_EQ(d, mask_true, Eq128(d, v00, v00));
HWY_ASSERT_MASK_EQ(d, mask_true, Eq128(d, v01, v01));
HWY_ASSERT_MASK_EQ(d, mask_true, Eq128(d, v10, v10));
HWY_ASSERT_MASK_EQ(d, mask_false, Ne128(d, v00, v00));
HWY_ASSERT_MASK_EQ(d, mask_false, Ne128(d, v01, v01));
HWY_ASSERT_MASK_EQ(d, mask_false, Ne128(d, v10, v10));
HWY_ASSERT_MASK_EQ(d, mask_false, Eq128(d, v00, v01));
HWY_ASSERT_MASK_EQ(d, mask_false, Eq128(d, v01, v10));
HWY_ASSERT_MASK_EQ(d, mask_false, Eq128(d, v01, v11));
HWY_ASSERT_MASK_EQ(d, mask_true, Ne128(d, v00, v01));
HWY_ASSERT_MASK_EQ(d, mask_true, Ne128(d, v01, v10));
HWY_ASSERT_MASK_EQ(d, mask_true, Ne128(d, v01, v11));
// Reversed order
HWY_ASSERT_MASK_EQ(d, mask_false, Eq128(d, v01, v00));
HWY_ASSERT_MASK_EQ(d, mask_false, Eq128(d, v10, v01));
HWY_ASSERT_MASK_EQ(d, mask_false, Eq128(d, v11, v01));
HWY_ASSERT_MASK_EQ(d, mask_true, Ne128(d, v01, v00));
HWY_ASSERT_MASK_EQ(d, mask_true, Ne128(d, v10, v01));
HWY_ASSERT_MASK_EQ(d, mask_true, Ne128(d, v11, v01));
// Also check 128-bit blocks are independent
const V iota = Iota(d, 1);
@ -369,10 +378,16 @@ struct TestEq128 {
HWY_ASSERT_MASK_EQ(d, mask_false, Eq128(d, iota, Add(iota, v10)));
HWY_ASSERT_MASK_EQ(d, mask_false, Eq128(d, Add(iota, v01), iota));
HWY_ASSERT_MASK_EQ(d, mask_false, Eq128(d, Add(iota, v10), iota));
HWY_ASSERT_MASK_EQ(d, mask_true, Ne128(d, iota, Add(iota, v01)));
HWY_ASSERT_MASK_EQ(d, mask_true, Ne128(d, iota, Add(iota, v10)));
HWY_ASSERT_MASK_EQ(d, mask_true, Ne128(d, Add(iota, v01), iota));
HWY_ASSERT_MASK_EQ(d, mask_true, Ne128(d, Add(iota, v10), iota));
// Max value
const V vm = Make128(d, LimitsMax<T>(), LimitsMax<T>());
HWY_ASSERT_MASK_EQ(d, mask_true, Eq128(d, vm, vm));
HWY_ASSERT_MASK_EQ(d, mask_false, Ne128(d, vm, vm));
HWY_ASSERT_MASK_EQ(d, mask_false, Eq128(d, vm, v00));
HWY_ASSERT_MASK_EQ(d, mask_false, Eq128(d, vm, v01));
HWY_ASSERT_MASK_EQ(d, mask_false, Eq128(d, vm, v10));
@ -381,12 +396,21 @@ struct TestEq128 {
HWY_ASSERT_MASK_EQ(d, mask_false, Eq128(d, v01, vm));
HWY_ASSERT_MASK_EQ(d, mask_false, Eq128(d, v10, vm));
HWY_ASSERT_MASK_EQ(d, mask_false, Eq128(d, v11, vm));
HWY_ASSERT_MASK_EQ(d, mask_true, Ne128(d, vm, v00));
HWY_ASSERT_MASK_EQ(d, mask_true, Ne128(d, vm, v01));
HWY_ASSERT_MASK_EQ(d, mask_true, Ne128(d, vm, v10));
HWY_ASSERT_MASK_EQ(d, mask_true, Ne128(d, vm, v11));
HWY_ASSERT_MASK_EQ(d, mask_true, Ne128(d, v00, vm));
HWY_ASSERT_MASK_EQ(d, mask_true, Ne128(d, v01, vm));
HWY_ASSERT_MASK_EQ(d, mask_true, Ne128(d, v10, vm));
HWY_ASSERT_MASK_EQ(d, mask_true, Ne128(d, v11, vm));
}
};
HWY_NOINLINE void TestAllEq128() { ForGEVectors<128, TestEq128>()(uint64_t()); }
struct TestEq128Upper {
struct TestEq128Upper { // Also Ne128Upper
template <typename T, class D>
HWY_NOINLINE void operator()(T /*unused*/, D d) {
using V = Vec<D>;
@ -401,26 +425,43 @@ struct TestEq128Upper {
HWY_ASSERT_MASK_EQ(d, mask_true, Eq128Upper(d, v00, v00));
HWY_ASSERT_MASK_EQ(d, mask_true, Eq128Upper(d, v01, v01));
HWY_ASSERT_MASK_EQ(d, mask_true, Eq128Upper(d, v10, v10));
HWY_ASSERT_MASK_EQ(d, mask_false, Ne128Upper(d, v00, v00));
HWY_ASSERT_MASK_EQ(d, mask_false, Ne128Upper(d, v01, v01));
HWY_ASSERT_MASK_EQ(d, mask_false, Ne128Upper(d, v10, v10));
HWY_ASSERT_MASK_EQ(d, mask_true, Eq128Upper(d, v00, v01));
HWY_ASSERT_MASK_EQ(d, mask_false, Ne128Upper(d, v00, v01));
HWY_ASSERT_MASK_EQ(d, mask_false, Eq128Upper(d, v01, v10));
HWY_ASSERT_MASK_EQ(d, mask_false, Eq128Upper(d, v01, v11));
HWY_ASSERT_MASK_EQ(d, mask_true, Ne128Upper(d, v01, v10));
HWY_ASSERT_MASK_EQ(d, mask_true, Ne128Upper(d, v01, v11));
// Reversed order
HWY_ASSERT_MASK_EQ(d, mask_true, Eq128Upper(d, v01, v00));
HWY_ASSERT_MASK_EQ(d, mask_false, Ne128Upper(d, v01, v00));
HWY_ASSERT_MASK_EQ(d, mask_false, Eq128Upper(d, v10, v01));
HWY_ASSERT_MASK_EQ(d, mask_false, Eq128Upper(d, v11, v01));
HWY_ASSERT_MASK_EQ(d, mask_true, Ne128Upper(d, v10, v01));
HWY_ASSERT_MASK_EQ(d, mask_true, Ne128Upper(d, v11, v01));
// Also check 128-bit blocks are independent
const V iota = Iota(d, 1);
HWY_ASSERT_MASK_EQ(d, mask_true, Eq128Upper(d, iota, Add(iota, v01)));
HWY_ASSERT_MASK_EQ(d, mask_false, Ne128Upper(d, iota, Add(iota, v01)));
HWY_ASSERT_MASK_EQ(d, mask_false, Eq128Upper(d, iota, Add(iota, v10)));
HWY_ASSERT_MASK_EQ(d, mask_true, Ne128Upper(d, iota, Add(iota, v10)));
HWY_ASSERT_MASK_EQ(d, mask_true, Eq128Upper(d, Add(iota, v01), iota));
HWY_ASSERT_MASK_EQ(d, mask_false, Ne128Upper(d, Add(iota, v01), iota));
HWY_ASSERT_MASK_EQ(d, mask_false, Eq128Upper(d, Add(iota, v10), iota));
HWY_ASSERT_MASK_EQ(d, mask_true, Ne128Upper(d, Add(iota, v10), iota));
// Max value
const V vm = Make128(d, LimitsMax<T>(), LimitsMax<T>());
HWY_ASSERT_MASK_EQ(d, mask_true, Eq128Upper(d, vm, vm));
HWY_ASSERT_MASK_EQ(d, mask_false, Ne128Upper(d, vm, vm));
HWY_ASSERT_MASK_EQ(d, mask_false, Eq128Upper(d, vm, v00));
HWY_ASSERT_MASK_EQ(d, mask_false, Eq128Upper(d, vm, v01));
HWY_ASSERT_MASK_EQ(d, mask_false, Eq128Upper(d, vm, v10));
@ -429,6 +470,15 @@ struct TestEq128Upper {
HWY_ASSERT_MASK_EQ(d, mask_false, Eq128Upper(d, v01, vm));
HWY_ASSERT_MASK_EQ(d, mask_false, Eq128Upper(d, v10, vm));
HWY_ASSERT_MASK_EQ(d, mask_false, Eq128Upper(d, v11, vm));
HWY_ASSERT_MASK_EQ(d, mask_true, Ne128Upper(d, vm, v00));
HWY_ASSERT_MASK_EQ(d, mask_true, Ne128Upper(d, vm, v01));
HWY_ASSERT_MASK_EQ(d, mask_true, Ne128Upper(d, vm, v10));
HWY_ASSERT_MASK_EQ(d, mask_true, Ne128Upper(d, vm, v11));
HWY_ASSERT_MASK_EQ(d, mask_true, Ne128Upper(d, v00, vm));
HWY_ASSERT_MASK_EQ(d, mask_true, Ne128Upper(d, v01, vm));
HWY_ASSERT_MASK_EQ(d, mask_true, Ne128Upper(d, v10, vm));
HWY_ASSERT_MASK_EQ(d, mask_true, Ne128Upper(d, v11, vm));
}
};

View File

@ -37,13 +37,15 @@ namespace HWY_NAMESPACE {
#if !HWY_PRINT_TABLES || HWY_IDE
template <class D, class DI, typename T = TFromD<D>, typename TI = TFromD<DI>>
void CheckStored(D d, DI di, size_t expected_pos, size_t actual_pos,
size_t num_to_check, const AlignedFreeUniquePtr<T[]>& in,
void CheckStored(D d, DI di, const char* op, size_t expected_pos,
size_t actual_pos, size_t num_to_check,
const AlignedFreeUniquePtr<T[]>& in,
const AlignedFreeUniquePtr<TI[]>& mask_lanes,
const AlignedFreeUniquePtr<T[]>& expected, const T* actual_u,
int line) {
if (expected_pos != actual_pos) {
hwy::Abort(__FILE__, line, "Size mismatch for %s: expected %d, actual %d\n",
hwy::Abort(__FILE__, line,
"%s: size mismatch for %s: expected %d, actual %d\n", op,
TypeName(T(), Lanes(d)).c_str(), static_cast<int>(expected_pos),
static_cast<int>(actual_pos));
}
@ -51,7 +53,7 @@ void CheckStored(D d, DI di, size_t expected_pos, size_t actual_pos,
for (size_t i = 0; i < num_to_check; ++i) {
if (!IsEqual(expected[i], actual_u[i])) {
const size_t N = Lanes(d);
fprintf(stderr, "Mismatch at i=%d of %d, line %d:\n\n",
fprintf(stderr, "%s: mismatch at i=%d of %d, line %d:\n\n", op,
static_cast<int>(i), static_cast<int>(num_to_check), line);
Print(di, "mask", Load(di, mask_lanes.get()), 0, N);
Print(d, "in", Load(d, in.get()), 0, N);
@ -91,9 +93,9 @@ struct TestCompress {
for (size_t rep = 0; rep < AdjustedReps(200); ++rep) {
size_t expected_pos = 0;
for (size_t i = 0; i < N; ++i) {
const uint64_t bits = Random32(&rng);
const uint64_t r = Random32(&rng);
in_lanes[i] = T(); // cannot initialize float16_t directly.
CopyBytes<sizeof(T)>(&bits, &in_lanes[i]); // not same size
CopyBytes<sizeof(T)>(&r, &in_lanes[i]); // not same size
mask_lanes[i] = (Random32(&rng) & 1024) ? TI(1) : TI(0);
if (mask_lanes[i] > 0) {
expected[expected_pos++] = in_lanes[i];
@ -124,30 +126,32 @@ struct TestCompress {
// Compress
memset(actual_u, 0, N * sizeof(T));
StoreU(Compress(in, mask), d, actual_u);
CheckStored(d, di, expected_pos, expected_pos, num_to_check, in_lanes,
mask_lanes, expected, actual_u, __LINE__);
CheckStored(d, di, "Compress", expected_pos, expected_pos, num_to_check,
in_lanes, mask_lanes, expected, actual_u, __LINE__);
// CompressNot
memset(actual_u, 0, N * sizeof(T));
StoreU(CompressNot(in, Not(mask)), d, actual_u);
CheckStored(d, di, expected_pos, expected_pos, num_to_check, in_lanes,
mask_lanes, expected, actual_u, __LINE__);
CheckStored(d, di, "CompressNot", expected_pos, expected_pos,
num_to_check, in_lanes, mask_lanes, expected, actual_u,
__LINE__);
// CompressStore
memset(actual_u, 0, N * sizeof(T));
const size_t size1 = CompressStore(in, mask, d, actual_u);
// expected_pos instead of num_to_check because this op is not
// affected by CompressIsPartition.
CheckStored(d, di, expected_pos, size1, expected_pos, in_lanes,
mask_lanes, expected, actual_u, __LINE__);
CheckStored(d, di, "CompressStore", expected_pos, size1, expected_pos,
in_lanes, mask_lanes, expected, actual_u, __LINE__);
// CompressBlendedStore
memset(actual_u, 0, N * sizeof(T));
const size_t size2 = CompressBlendedStore(in, mask, d, actual_u);
// expected_pos instead of num_to_check because this op only writes
// the mask=true lanes.
CheckStored(d, di, expected_pos, size2, expected_pos, in_lanes,
mask_lanes, expected, actual_u, __LINE__);
CheckStored(d, di, "CompressBlendedStore", expected_pos, size2,
expected_pos, in_lanes, mask_lanes, expected, actual_u,
__LINE__);
// Subsequent lanes are untouched.
for (size_t i = size2; i < N; ++i) {
HWY_ASSERT_EQ(zero, actual_u[i]);
@ -156,16 +160,18 @@ struct TestCompress {
// CompressBits
memset(actual_u, 0, N * sizeof(T));
StoreU(CompressBits(in, bits.get()), d, actual_u);
CheckStored(d, di, expected_pos, expected_pos, num_to_check, in_lanes,
mask_lanes, expected, actual_u, __LINE__);
CheckStored(d, di, "CompressBits", expected_pos, expected_pos,
num_to_check, in_lanes, mask_lanes, expected, actual_u,
__LINE__);
// CompressBitsStore
memset(actual_u, 0, N * sizeof(T));
const size_t size3 = CompressBitsStore(in, bits.get(), d, actual_u);
// expected_pos instead of num_to_check because this op is not
// affected by CompressIsPartition.
CheckStored(d, di, expected_pos, size3, expected_pos, in_lanes,
mask_lanes, expected, actual_u, __LINE__);
CheckStored(d, di, "CompressBitsStore", expected_pos, size3,
expected_pos, in_lanes, mask_lanes, expected, actual_u,
__LINE__);
} // rep
} // frac
} // operator()
@ -230,8 +236,9 @@ struct TestCompressBlocks {
// CompressBlocksNot
memset(actual.get(), 0, N * sizeof(T));
StoreU(CompressBlocksNot(in, Not(mask)), d, actual.get());
CheckStored(d, di, expected_pos, expected_pos, num_to_check, in_lanes,
mask_lanes, expected, actual.get(), __LINE__);
CheckStored(d, di, "CompressBlocksNot", expected_pos, expected_pos,
num_to_check, in_lanes, mask_lanes, expected, actual.get(),
__LINE__);
} // rep
#endif // HWY_TARGET == HWY_SCALAR
} // operator()
@ -305,11 +312,13 @@ void PrintCompressNot16x8Tables() {
printf("\n");
}
// Compressed to nibbles, unpacked via variable right shift
// Compressed to nibbles, unpacked via variable right shift. Also includes
// FirstN bits in the nibble MSB.
void PrintCompress32x8Tables() {
printf("======================================= 32/64x8\n");
constexpr size_t N = 8; // AVX2 or 64-bit AVX3
for (uint64_t code = 0; code < (1ull << N); ++code) {
const size_t count = PopCount(code);
std::array<uint32_t, N> indices{0};
size_t pos = 0;
// All lanes where mask = true
@ -330,6 +339,10 @@ void PrintCompress32x8Tables() {
uint64_t packed = 0;
for (size_t i = 0; i < N; ++i) {
HWY_ASSERT(indices[i] < N);
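// The first PopCount(code) indices are the mask=true lanes; OR in N (= 8) to
// set the nibble MSB, which IndicesFromBits also uses to derive FirstN.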
if (i < count) {
indices[i] |= N;
HWY_ASSERT(indices[i] < 0x10);
}
packed += indices[i] << (i * 4);
}
@ -344,6 +357,7 @@ void PrintCompressNot32x8Tables() {
constexpr size_t N = 8; // AVX2 or 64-bit AVX3
for (uint64_t not_code = 0; not_code < (1ull << N); ++not_code) {
const uint64_t code = ~not_code;
const size_t count = PopCount(code);
std::array<uint32_t, N> indices{0};
size_t pos = 0;
// All lanes where mask = true
@ -364,6 +378,10 @@ void PrintCompressNot32x8Tables() {
uint64_t packed = 0;
for (size_t i = 0; i < N; ++i) {
HWY_ASSERT(indices[i] < N);
if (i < count) {
indices[i] |= N;
HWY_ASSERT(indices[i] < 0x10);
}
packed += indices[i] << (i * 4);
}
@ -504,11 +522,13 @@ void PrintCompressNot64x4Tables() {
printf("\n");
}
// Same as above, but prints pairs of u32 indices (for AVX2)
// Same as above, but prints pairs of u32 indices (for AVX2). Also includes
// FirstN bits in the nibble MSB.
void PrintCompress64x4PairTables() {
printf("======================================= 64x4 u32 index\n");
constexpr size_t N = 4; // AVX2
for (uint64_t code = 0; code < (1ull << N); ++code) {
const size_t count = PopCount(code);
std::array<size_t, N> indices{0};
size_t pos = 0;
// All lanes where mask = true
@ -530,8 +550,10 @@ void PrintCompress64x4PairTables() {
// interpreted modulo N. Compression is not worth the extra shift+AND
// because the table is anyway only 512 bytes.
for (size_t i = 0; i < N; ++i) {
printf("%d, %d, ", static_cast<int>(2 * indices[i] + 0),
static_cast<int>(2 * indices[i]) + 1);
const int first_n_bit = i < count ? 8 : 0;
const int low = static_cast<int>(2 * indices[i]) + first_n_bit;
HWY_ASSERT(low < 0x10);
printf("%d, %d, ", low, low + 1);
}
}
printf("\n");
@ -542,6 +564,7 @@ void PrintCompressNot64x4PairTables() {
constexpr size_t N = 4; // AVX2
for (uint64_t not_code = 0; not_code < (1ull << N); ++not_code) {
const uint64_t code = ~not_code;
const size_t count = PopCount(code);
std::array<size_t, N> indices{0};
size_t pos = 0;
// All lanes where mask = true
@ -563,8 +586,10 @@ void PrintCompressNot64x4PairTables() {
// interpreted modulo N. Compression is not worth the extra shift+AND
// because the table is anyway only 512 bytes.
for (size_t i = 0; i < N; ++i) {
printf("%d, %d, ", static_cast<int>(2 * indices[i] + 0),
static_cast<int>(2 * indices[i]) + 1);
const int first_n_bit = i < count ? 8 : 0;
const int low = static_cast<int>(2 * indices[i]) + first_n_bit;
HWY_ASSERT(low < 0x10);
printf("%d, %d, ", low, low + 1);
}
}
printf("\n");

View File

@ -16,6 +16,8 @@
#include <stddef.h>
#include <stdint.h>
#include <cmath> // std::isfinite
#undef HWY_TARGET_INCLUDE
#define HWY_TARGET_INCLUDE "tests/demote_test.cc"
#include "hwy/foreach_target.h" // IWYU pragma: keep

View File

@ -18,8 +18,9 @@
#include <stddef.h>
#include <stdint.h>
#include <algorithm>
#include <algorithm> // std::copy, std::fill
#include <limits>
#include <cmath> // std::abs, std::isnan, std::isinf, std::ceil, std::floor
#undef HWY_TARGET_INCLUDE
#define HWY_TARGET_INCLUDE "tests/float_test.cc"

View File

@ -22,7 +22,7 @@
#include <stdint.h>
#include <string>
#include <utility> // std::tuple
#include <tuple>
#include "gtest/gtest.h"
#include "hwy/highway.h"

View File

@ -17,6 +17,8 @@
#include <stdint.h>
#include <string.h> // memcmp
#include <algorithm> // std::fill
#undef HWY_TARGET_INCLUDE
#define HWY_TARGET_INCLUDE "tests/mask_test.cc"
#include "hwy/foreach_target.h" // IWYU pragma: keep
@ -189,7 +191,7 @@ HWY_NOINLINE void TestAllCountTrue() {
ForAllTypes(ForPartialVectors<TestCountTrue>());
}
struct TestFindFirstTrue {
struct TestFindFirstTrue { // Also FindKnownFirstTrue
template <class T, class D>
HWY_NOINLINE void operator()(T /*unused*/, D d) {
using TI = MakeSigned<T>; // For mask > 0 comparison
@ -203,17 +205,18 @@ struct TestFindFirstTrue {
HWY_ASSERT_EQ(intptr_t(-1), FindFirstTrue(d, MaskFalse(d)));
HWY_ASSERT_EQ(intptr_t(0), FindFirstTrue(d, MaskTrue(d)));
HWY_ASSERT_EQ(size_t(0), FindKnownFirstTrue(d, MaskTrue(d)));
for (size_t code = 1; code < (1ull << max_lanes); ++code) {
for (size_t i = 0; i < max_lanes; ++i) {
bool_lanes[i] = (code & (1ull << i)) ? TI(1) : TI(0);
}
const intptr_t expected = static_cast<intptr_t>(
Num0BitsBelowLS1Bit_Nonzero32(static_cast<uint32_t>(code)));
const size_t expected =
Num0BitsBelowLS1Bit_Nonzero32(static_cast<uint32_t>(code));
const auto mask = RebindMask(d, Gt(Load(di, bool_lanes.get()), Zero(di)));
const intptr_t actual = FindFirstTrue(d, mask);
HWY_ASSERT_EQ(expected, actual);
HWY_ASSERT_EQ(static_cast<intptr_t>(expected), FindFirstTrue(d, mask));
HWY_ASSERT_EQ(expected, FindKnownFirstTrue(d, mask));
}
}
};
@ -237,6 +240,11 @@ struct TestLogicalMask {
HWY_ASSERT_MASK_EQ(d, m0, Not(m_all));
HWY_ASSERT_MASK_EQ(d, m_all, Not(m0));
Print(d, ".", VecFromMask(d, ExclusiveNeither(m0, m0)));
HWY_ASSERT_MASK_EQ(d, m_all, ExclusiveNeither(m0, m0));
HWY_ASSERT_MASK_EQ(d, m0, ExclusiveNeither(m_all, m0));
HWY_ASSERT_MASK_EQ(d, m0, ExclusiveNeither(m0, m_all));
// For all combinations of zero/nonzero state of subset of lanes:
const size_t max_lanes = AdjustedLog2Reps(HWY_MIN(N, size_t(6)));
for (size_t code = 0; code < (1ull << max_lanes); ++code) {

View File

@ -23,6 +23,8 @@
#include <stddef.h>
#include <stdint.h>
#include <algorithm> // std::fill
#undef HWY_TARGET_INCLUDE
#define HWY_TARGET_INCLUDE "tests/memory_test.cc"
#include "hwy/cache_control.h"

View File

@ -26,6 +26,15 @@ HWY_BEFORE_NAMESPACE();
namespace hwy {
namespace HWY_NAMESPACE {
template <size_t kBits>
constexpr uint64_t FirstBits() {
return (1ull << kBits) - 1;
}
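// Specialization because shifting a 64-bit value left by 64 bits is undefined.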
template <>
constexpr uint64_t FirstBits<64>() {
return ~uint64_t{0};
}
struct TestUnsignedMul {
template <typename T, class D>
HWY_NOINLINE void operator()(T /*unused*/, D d) {
@ -56,9 +65,8 @@ struct TestUnsignedMul {
HWY_ASSERT_VEC_EQ(d, vmax, Mul(vmax, v1));
HWY_ASSERT_VEC_EQ(d, vmax, Mul(v1, vmax));
const size_t bits = sizeof(T) * 8;
const uint64_t mask = bits==64 ? (~uint64_t{0}) : (1ull << bits) - 1;
const T max2 = (static_cast<uint64_t>(max) * max) & mask;
constexpr uint64_t kMask = FirstBits<sizeof(T) * 8>();
const T max2 = (static_cast<uint64_t>(max) * max) & kMask;
HWY_ASSERT_VEC_EQ(d, Set(d, max2), Mul(vmax, vmax));
}
};
@ -349,64 +357,65 @@ struct TestReorderWidenMulAccumulate {
HWY_NOINLINE void operator()(TN /*unused*/, DN dn) {
using TW = MakeWide<TN>;
const RepartitionToWide<DN> dw;
const auto f0 = Zero(dw);
const auto f1 = Set(dw, 1.0f);
const auto fi = Iota(dw, 1);
const auto bf0 = ReorderDemote2To(dn, f0, f0);
const auto bf1 = ReorderDemote2To(dn, f1, f1);
const auto bfi = ReorderDemote2To(dn, fi, fi);
const size_t NW = Lanes(dw);
auto delta = AllocateAligned<TW>(2 * NW);
for (size_t i = 0; i < 2 * NW; ++i) {
delta[i] = 0.0f;
}
const Half<DN> dnh;
using VW = Vec<decltype(dw)>;
using VN = Vec<decltype(dn)>;
const size_t NN = Lanes(dn);
const VW f0 = Zero(dw);
const VW f1 = Set(dw, TW{1});
const VN bf0 = Zero(dn);
// Cannot Set() bfloat16_t directly.
const VN bf1 = ReorderDemote2To(dn, f1, f1);
// Any input zero => both outputs zero
auto sum1 = f0;
VW sum1 = f0;
HWY_ASSERT_VEC_EQ(dw, f0,
ReorderWidenMulAccumulate(dw, bf0, bf0, f0, sum1));
HWY_ASSERT_VEC_EQ(dw, f0, sum1);
HWY_ASSERT_VEC_EQ(dw, f0,
ReorderWidenMulAccumulate(dw, bf0, bfi, f0, sum1));
ReorderWidenMulAccumulate(dw, bf0, bf1, f0, sum1));
HWY_ASSERT_VEC_EQ(dw, f0, sum1);
HWY_ASSERT_VEC_EQ(dw, f0,
ReorderWidenMulAccumulate(dw, bfi, bf0, f0, sum1));
ReorderWidenMulAccumulate(dw, bf1, bf0, f0, sum1));
HWY_ASSERT_VEC_EQ(dw, f0, sum1);
// delta[p] := 1.0, all others zero. For each p: Dot(delta, all-ones) == 1.
for (size_t p = 0; p < 2 * NW; ++p) {
delta[p] = 1.0f;
const auto delta0 = Load(dw, delta.get() + 0);
const auto delta1 = Load(dw, delta.get() + NW);
delta[p] = 0.0f;
const auto bf_delta = ReorderDemote2To(dn, delta0, delta1);
// delta[p] := 1, all others zero. For each p: Dot(delta, all-ones) == 1.
auto delta_w = AllocateAligned<TW>(NN);
for (size_t p = 0; p < NN; ++p) {
// Workaround for incorrect Clang wasm codegen: re-initialize the entire
// array rather than zero-initialize once and then toggle lane p.
for (size_t i = 0; i < NN; ++i) {
delta_w[i] = static_cast<TW>(i == p);
}
const VW delta0 = Load(dw, delta_w.get());
const VW delta1 = Load(dw, delta_w.get() + NN / 2);
const VN delta = ReorderDemote2To(dn, delta0, delta1);
{
sum1 = f0;
const auto sum0 =
ReorderWidenMulAccumulate(dw, bf_delta, bf1, f0, sum1);
HWY_ASSERT_EQ(1.0f, GetLane(SumOfLanes(dw, Add(sum0, sum1))));
const VW sum0 = ReorderWidenMulAccumulate(dw, delta, bf1, f0, sum1);
HWY_ASSERT_EQ(TW{1}, GetLane(SumOfLanes(dw, Add(sum0, sum1))));
}
// Swapped arg order
{
sum1 = f0;
const auto sum0 =
ReorderWidenMulAccumulate(dw, bf1, bf_delta, f0, sum1);
HWY_ASSERT_EQ(1.0f, GetLane(SumOfLanes(dw, Add(sum0, sum1))));
const VW sum0 = ReorderWidenMulAccumulate(dw, bf1, delta, f0, sum1);
HWY_ASSERT_EQ(TW{1}, GetLane(SumOfLanes(dw, Add(sum0, sum1))));
}
// Start with nonzero sum0 or sum1
{
sum1 = delta1;
const auto sum0 =
ReorderWidenMulAccumulate(dw, bf_delta, bf1, delta0, sum1);
HWY_ASSERT_EQ(2.0f, GetLane(SumOfLanes(dw, Add(sum0, sum1))));
VW sum0 = PromoteTo(dw, LowerHalf(dnh, delta));
sum1 = PromoteTo(dw, UpperHalf(dnh, delta));
sum0 = ReorderWidenMulAccumulate(dw, delta, bf1, sum0, sum1);
HWY_ASSERT_EQ(TW{2}, GetLane(SumOfLanes(dw, Add(sum0, sum1))));
}
// Start with nonzero sum0 or sum1, and swap arg order
{
sum1 = delta1;
const auto sum0 =
ReorderWidenMulAccumulate(dw, bf1, bf_delta, delta0, sum1);
HWY_ASSERT_EQ(2.0f, GetLane(SumOfLanes(dw, Add(sum0, sum1))));
VW sum0 = PromoteTo(dw, LowerHalf(dnh, delta));
sum1 = PromoteTo(dw, UpperHalf(dnh, delta));
sum0 = ReorderWidenMulAccumulate(dw, bf1, delta, sum0, sum1);
HWY_ASSERT_EQ(TW{2}, GetLane(SumOfLanes(dw, Add(sum0, sum1))));
}
}
}
@ -414,6 +423,7 @@ struct TestReorderWidenMulAccumulate {
HWY_NOINLINE void TestAllReorderWidenMulAccumulate() {
ForShrinkableVectors<TestReorderWidenMulAccumulate>()(bfloat16_t());
ForShrinkableVectors<TestReorderWidenMulAccumulate>()(int16_t());
}
// NOLINTNEXTLINE(google-readability-namespace-comments)

View File

@ -54,6 +54,7 @@ struct TestSumOfLanes {
HWY_NOINLINE void TestAllSumOfLanes() {
ForUIF3264(ForPartialVectors<TestSumOfLanes>());
ForUI16(ForPartialVectors<TestSumOfLanes>());
}
struct TestMinOfLanes {
@ -170,10 +171,8 @@ HWY_NOINLINE void TestAllMinMaxOfLanes() {
const ForPartialVectors<TestMaxOfLanes> test_max;
ForUIF3264(test_min);
ForUIF3264(test_max);
test_min(uint16_t());
test_max(uint16_t());
test_min(int16_t());
test_max(int16_t());
ForUI16(test_min);
ForUI16(test_max);
}
struct TestSumsOf8 {

View File

@ -22,6 +22,7 @@
#include <stdint.h>
#include <string.h>
#include <cmath> // std::isnan
#include <string>
#include "hwy/aligned_allocator.h"

View File

@ -16,6 +16,8 @@
#include <stddef.h>
#include <stdint.h>
#include <string>
#undef HWY_TARGET_INCLUDE
#define HWY_TARGET_INCLUDE "tests/test_util_test.cc"
#include "hwy/foreach_target.h" // IWYU pragma: keep

View File

@ -1,4 +1,5 @@
prefix=@CMAKE_INSTALL_PREFIX@
exec_prefix=${prefix}
libdir=${exec_prefix}/@CMAKE_INSTALL_LIBDIR@
includedir=${prefix}/@CMAKE_INSTALL_INCLUDEDIR@

View File

@ -19,11 +19,11 @@ cd ..
rm -rf build
#######################################
echo DEBUG Clang 7
echo DEBUG Clang 9
rm -rf build_dbg
mkdir build_dbg
cd build_dbg
CXX=clang++-7 CC=clang-7 cmake .. -DHWY_WARNINGS_ARE_ERRORS:BOOL=ON -DCMAKE_BUILD_TYPE=Debug
CXX=clang++-9 CC=clang-9 cmake .. -DHWY_WARNINGS_ARE_ERRORS:BOOL=ON -DCMAKE_BUILD_TYPE=Debug
make -j
ctest -j
cd ..
@ -41,7 +41,7 @@ cd ..
rm -rf build_32
#######################################
for VER in 8 9 10; do
for VER in 10 11 12; do
echo GCC $VER
rm -rf build_g$VER
mkdir build_g$VER

View File

@ -24,6 +24,7 @@ Aous Naman <aous@unsw.edu.au>
Artem Selishchev
Biswapriyo Nath <nathbappai@gmail.com>
CanadianBaconBoi <beamconnor@gmail.com>
Damiano Albani <damiano.albani@gmail.com>
Daniel Novomeský <dnovomesky@gmail.com>
David Burnett <vargolsoft@gmail.com>
Dirk Lemstra <dirk@lemstra.org>
@ -31,6 +32,7 @@ Don Olmstead <don.j.olmstead@gmail.com>
Even Rouault <even.rouault@spatialys.com>
Fred Brennan <copypaste@kittens.ph>
Heiko Becker <heirecka@exherbo.org>
Jim Robinson <jimbo2150@gmail.com>
Jon Sneyers <jon@cloudinary.com>
Kai Hollberg <Schweinepriester@users.noreply.github.com>
Kleis Auke Wolthuizen <github@kleisauke.nl>

View File

@ -12,6 +12,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
of the input buffer.
- decoder API: new function `JxlDecoderSetImageBitDepth` to set the bit depth
of the output buffer.
- encoder API: add an effort 10 option for lossless compression.
## [0.7] - 2022-07-21

View File

@ -100,6 +100,10 @@ set(JPEGXL_ENABLE_DEVTOOLS false CACHE BOOL
"Build JPEGXL developer tools.")
set(JPEGXL_ENABLE_TOOLS true CACHE BOOL
"Build JPEGXL user tools: cjxl and djxl.")
set(JPEGXL_ENABLE_JPEGLI true CACHE BOOL
"Build jpegli library.")
set(JPEGXL_ENABLE_JPEGLI_LIBJPEG true CACHE BOOL
"Build libjpeg.so shared library based on jpegli.")
set(JPEGXL_ENABLE_DOXYGEN true CACHE BOOL
"Generate C API documentation using Doxygen.")
set(JPEGXL_ENABLE_MANPAGES true CACHE BOOL

View File

@ -69,6 +69,11 @@ if [[ "${ENABLE_WASM_SIMD}" -ne "0" ]]; then
CMAKE_EXE_LINKER_FLAGS="${CMAKE_EXE_LINKER_FLAGS} -msimd128"
fi
if [[ "${ENABLE_WASM_SIMD}" -eq "2" ]]; then
CMAKE_CXX_FLAGS="${CMAKE_CXX_FLAGS} -DHWY_WANT_WASM2"
CMAKE_C_FLAGS="${CMAKE_C_FLAGS} -DHWY_WANT_WASM2"
fi
if [[ ! -z "${HWY_BASELINE_TARGETS}" ]]; then
CMAKE_CXX_FLAGS="${CMAKE_CXX_FLAGS} -DHWY_BASELINE_TARGETS=${HWY_BASELINE_TARGETS}"
fi
@ -139,6 +144,7 @@ detect_clang_version() {
fi
local clang_version=$("${CC:-clang}" --version | head -n1)
clang_version=${clang_version#"Debian "}
clang_version=${clang_version#"Ubuntu "}
local llvm_tag
case "${clang_version}" in
"clang version 6."*)
@ -547,6 +553,7 @@ cmd_coverage_report() {
# Only print coverage information for the libjxl directories. The rest
# is not part of the code under test.
--filter '.*jxl/.*'
--exclude '.*_gbench.cc'
--exclude '.*_test.cc'
--exclude '.*_testonly..*'
--exclude '.*_debug.*'

View File

@ -14,4 +14,5 @@ override_dh_auto_configure:
-DJPEGXL_FORCE_SYSTEM_GTEST=ON \
-DJPEGXL_FORCE_SYSTEM_BROTLI=ON \
-DJPEGXL_FORCE_SYSTEM_HWY=ON \
-DJPEGXL_ENABLE_JPEGLI_LIBJPEG=OFF \
-DJPEGXL_ENABLE_PLUGINS=ON

View File

@ -14,7 +14,7 @@ MYDIR=$(dirname $(realpath "$0"))
# Git revisions we use for the given submodules. Update these whenever you
# update a git submodule.
THIRD_PARTY_BROTLI="35ef5c554d888bef217d449346067de05e269b30"
THIRD_PARTY_HIGHWAY="22e3d7276f4157d4a47586ba9fd91dd6303f441a"
THIRD_PARTY_HIGHWAY="f670ea580bb70b4113b63b9cdaa42ba9b10cd13a"
THIRD_PARTY_SKCMS="b25b07b4b07990811de121c0356155b2ba0f4318"
THIRD_PARTY_SJPEG="868ab558fad70fcbe8863ba4e85179eeb81cc840"
THIRD_PARTY_ZLIB="cacf7f1d4e3d44d871b605da3b647f07d718623f"

View File

@ -0,0 +1,10 @@
# Fast-lossless
This is a script to compile a standalone version of a JXL encoder that supports
lossless compression of 1- to 4-channel images and animations with up to 16 bits
per sample. It is very fast; compression is slightly worse than PNG for 8-bit
non-photographic content, and better or much better than PNG in all other cases.
The main encoder consists of two files, `lib/jxl/enc_fast_lossless.{cc,h}`; it
automatically selects and runs a SIMD implementation supported by your CPU.
This folder contains an example build script and `main` file.
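The call below is a sketch mirroring `fast_lossless_main.cc` in this folder;
`png`, `parallel_runner` and `num_threads` are as defined there, and
`lib/jxl/enc_fast_lossless.h` remains the authoritative prototype.
```c++
// Sketch adapted from fast_lossless_main.cc: `png` is the decoded pixel
// buffer, and `parallel_runner` / `num_threads` are defined as in that file.
unsigned char* encoded = nullptr;
size_t stride = width * nb_chans * (bitdepth > 8 ? 2 : 1);
size_t encoded_size = JxlFastLosslessEncode(
    png, width, stride, height, nb_chans, bitdepth,
    /*big_endian=*/true, effort, &encoded, &num_threads, +parallel_runner);
// `encoded` now holds `encoded_size` bytes of JXL data; free() it when done.
```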

View File

@ -20,7 +20,8 @@ fi
[ -f lodepng.h ] || curl -o lodepng.h --url 'https://raw.githubusercontent.com/lvandeve/lodepng/8c6a9e30576f07bf470ad6f09458a2dcd7a6a84a/lodepng.h'
[ -f lodepng.o ] || "$CXX" lodepng.cpp -O3 -o lodepng.o -c
"$CXX" -O3 -DFASTLL_ENABLE_NEON_INTRINSICS -fopenmp \
"$CXX" -O3 \
-I. lodepng.o \
"${DIR}"/fast_lossless.cc "${DIR}"/fast_lossless_main.cc \
-I"${DIR}"/../../ \
"${DIR}"/../../lib/jxl/enc_fast_lossless.cc "${DIR}"/fast_lossless_main.cc \
-o fast_lossless

View File

@ -18,9 +18,10 @@ fi
[ -f lodepng.cpp ] || curl -o lodepng.cpp --url 'https://raw.githubusercontent.com/lvandeve/lodepng/8c6a9e30576f07bf470ad6f09458a2dcd7a6a84a/lodepng.cpp'
[ -f lodepng.h ] || curl -o lodepng.h --url 'https://raw.githubusercontent.com/lvandeve/lodepng/8c6a9e30576f07bf470ad6f09458a2dcd7a6a84a/lodepng.h'
[ -f lodepng.o ] || "$CXX" lodepng.cpp -O3 -mavx2 -o lodepng.o -c
[ -f lodepng.o ] || "$CXX" lodepng.cpp -O3 -o lodepng.o -c
"$CXX" -O3 -mavx2 -DFASTLL_ENABLE_AVX2_INTRINSICS -fopenmp \
-I. lodepng.o \
"$DIR"/fast_lossless.cc "$DIR"/fast_lossless_main.cc \
"$CXX" -O3 \
-I. -g lodepng.o \
-I"$DIR"/../../ \
"$DIR"/../../lib/jxl/enc_fast_lossless.cc "$DIR"/fast_lossless_main.cc \
-o fast_lossless

View File

@ -0,0 +1,26 @@
#!/usr/bin/env bash
# Copyright (c) the JPEG XL Project Authors. All rights reserved.
#
# Use of this source code is governed by a BSD-style
# license that can be found in the LICENSE file.
set -e
DIR=$(realpath "$(dirname "$0")")
mkdir -p "$DIR"/build-aarch64
cd "$DIR"/build-aarch64
CXX="${CXX-aarch64-linux-gnu-c++}"
if ! command -v "$CXX" >/dev/null ; then
printf >&2 '%s: C++ compiler not found\n' "${0##*/}"
exit 1
fi
[ -f lodepng.cpp ] || curl -o lodepng.cpp --url 'https://raw.githubusercontent.com/lvandeve/lodepng/8c6a9e30576f07bf470ad6f09458a2dcd7a6a84a/lodepng.cpp'
[ -f lodepng.h ] || curl -o lodepng.h --url 'https://raw.githubusercontent.com/lvandeve/lodepng/8c6a9e30576f07bf470ad6f09458a2dcd7a6a84a/lodepng.h'
[ -f lodepng.o ] || "$CXX" lodepng.cpp -O3 -o lodepng.o -c
"$CXX" -O3 -static \
-I. lodepng.o \
-I"$DIR"/../../ \
"$DIR"/../../lib/jxl/enc_fast_lossless.cc "$DIR"/fast_lossless_main.cc \
-o fast_lossless

File diff suppressed because it is too large

View File

@ -1,23 +0,0 @@
// Copyright (c) the JPEG XL Project Authors. All rights reserved.
//
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
#ifndef FAST_LOSSLESS_H
#define FAST_LOSSLESS_H
#include <stdlib.h>
#ifdef __cplusplus
extern "C" {
#endif
size_t JxlFastLosslessEncode(const unsigned char* rgba, size_t width,
size_t row_stride, size_t height, size_t nb_chans,
size_t bitdepth, int effort,
unsigned char** output);
#ifdef __cplusplus
} // extern "C"
#endif
#endif

View File

@ -7,16 +7,20 @@
#include <stdlib.h>
#include <string.h>
#include <atomic>
#include <chrono>
#include <thread>
#include <vector>
#include "fast_lossless.h"
#include "lib/jxl/enc_fast_lossless.h"
#include "lodepng.h"
#include "pam-input.h"
int main(int argc, char** argv) {
if (argc < 3) {
fprintf(stderr, "Usage: %s in.png out.jxl [effort] [num_reps]\n", argv[0]);
fprintf(stderr,
"Usage: %s in.png out.jxl [effort] [num_reps] [num_threads]\n",
argv[0]);
return 1;
}
@ -24,6 +28,7 @@ int main(int argc, char** argv) {
const char* out = argv[2];
int effort = argc >= 4 ? atoi(argv[3]) : 2;
size_t num_reps = argc >= 5 ? atoi(argv[4]) : 1;
size_t num_threads = argc >= 6 ? atoi(argv[5]) : 0;
if (effort < 0 || effort > 127) {
fprintf(
@ -44,6 +49,35 @@ int main(int argc, char** argv) {
return 1;
}
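// JxlFastLosslessEncode takes a pluggable "runner": it receives an opaque
// pointer, a task function and a task count, and decides how to execute the
// tasks. The runner below spreads them over num_threads worker threads using
// an atomic task counter, or runs them inline when a single thread suffices.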
auto parallel_runner = [](void* num_threads_ptr, void* opaque,
void fun(void*, size_t), size_t count) {
size_t num_threads = *(size_t*)num_threads_ptr;
if (num_threads == 0) {
num_threads = std::thread::hardware_concurrency();
}
if (num_threads > count) {
num_threads = count;
}
if (num_threads == 1) {
for (size_t i = 0; i < count; i++) {
fun(opaque, i);
}
} else {
std::atomic<int> task{0};
std::vector<std::thread> threads;
for (size_t i = 0; i < num_threads; i++) {
threads.push_back(std::thread([count, opaque, fun, &task]() {
while (true) {
int t = task++;
if (t >= count) break;
fun(opaque, t);
}
}));
}
for (auto& t : threads) t.join();
}
};
size_t encoded_size = 0;
unsigned char* encoded = nullptr;
size_t stride = width * nb_chans * (bitdepth > 8 ? 2 : 1);
@ -51,8 +85,9 @@ int main(int argc, char** argv) {
auto start = std::chrono::high_resolution_clock::now();
for (size_t _ = 0; _ < num_reps; _++) {
free(encoded);
encoded_size = JxlFastLosslessEncode(png, width, stride, height, nb_chans,
bitdepth, effort, &encoded);
encoded_size = JxlFastLosslessEncode(
png, width, stride, height, nb_chans, bitdepth,
/*big_endian=*/true, effort, &encoded, &num_threads, +parallel_runner);
}
auto stop = std::chrono::high_resolution_clock::now();
if (num_reps > 1) {

View File

@ -270,8 +270,8 @@ bool DecodePAM(const char* filename, uint8_t** buffer, size_t* w, size_t* h,
const uint8_t* pos = nullptr;
if (!parser.ParseHeader(&header, &pos)) return false;
if (header.bits_per_sample == 0 || header.bits_per_sample > 12) {
return error_msg("PNM: bits_per_sample invalid (can do at most 12-bit)");
if (header.bits_per_sample == 0 || header.bits_per_sample > 16) {
return error_msg("PNM: bits_per_sample invalid (can do at most 16-bit)");
}
*w = header.xsize;
*h = header.ysize;

View File

@ -132,6 +132,15 @@ set(JPEGXL_COVERAGE_FLAGS
endif() # JPEGXL_ENABLE_COVERAGE
endif() #!MSVC
# strips the -static suffix from all the elements in LIST
function(strip_static OUTPUT_VAR LIB_LIST)
foreach(lib IN LISTS ${LIB_LIST})
string(REGEX REPLACE "-static$" "" lib "${lib}")
list(APPEND out_list "${lib}")
endforeach()
set(${OUTPUT_VAR} ${out_list} PARENT_SCOPE)
endfunction()
# The jxl library definition.
include(jxl.cmake)
@ -140,6 +149,11 @@ if(JPEGXL_ENABLE_TOOLS)
include(jxl_extras.cmake)
endif()
include(jxl_threads.cmake)
# We only build JPEGLI on linux for now.
find_package(JPEG)
if (JPEG_FOUND AND JPEGXL_ENABLE_JPEGLI)
include(jpegli.cmake)
endif()
# Install all the library headers from the source and the generated ones. There
# is no distinction on which libraries use which header since it is expected

View File

@ -14,14 +14,17 @@
#include <utility>
#include <vector>
#include "lib/extras/dec/jpegli.h"
#include "lib/extras/dec/pgx.h"
#include "lib/extras/dec/pnm.h"
#include "lib/extras/enc/encode.h"
#include "lib/extras/encode_jpeg.h"
#include "lib/extras/packed_image_convert.h"
#include "lib/jxl/base/printf_macros.h"
#include "lib/jxl/base/random.h"
#include "lib/jxl/base/thread_pool_internal.h"
#include "lib/jxl/color_management.h"
#include "lib/jxl/enc_butteraugli_comparator.h"
#include "lib/jxl/enc_color_management.h"
#include "lib/jxl/image.h"
#include "lib/jxl/image_bundle.h"
@ -174,6 +177,7 @@ struct TestImageParams {
bool add_alpha;
bool big_endian;
bool add_extra_channels;
bool jpegli_decode = false;
bool ShouldTestRoundtrip() const {
if (codec == Codec::kPNG) {
@ -273,11 +277,32 @@ void TestRoundTrip(const TestImageParams& params, ThreadPool* pool) {
color_hints.Add("color_space",
params.is_gray ? "Gra_D65_Rel_SRG" : "RGB_D65_SRG_Rel_SRG");
}
ASSERT_TRUE(DecodeBytes(Span<const uint8_t>(encoded.bitstreams[0]),
color_hints, SizeConstraints(), &ppf_out));
if (params.codec != Codec::kPNM && params.codec != Codec::kPGX &&
params.codec != Codec::kEXR) {
if (params.codec == Codec::kJPG && params.jpegli_decode) {
#if JPEGXL_ENABLE_JPEG
ASSERT_TRUE(
DecodeJpeg(encoded.bitstreams[0], JXL_TYPE_UINT8, pool, &ppf_out));
#endif
} else {
ASSERT_TRUE(DecodeBytes(Span<const uint8_t>(encoded.bitstreams[0]),
color_hints, SizeConstraints(), &ppf_out));
}
if (params.codec == Codec::kPNG && ppf_out.icc.empty()) {
// Decoding a PNG may drop the ICC profile if there's a valid cICP chunk.
// Rendering intent is not preserved in this case.
EXPECT_EQ(ppf_in.color_encoding.color_space,
ppf_out.color_encoding.color_space);
EXPECT_EQ(ppf_in.color_encoding.white_point,
ppf_out.color_encoding.white_point);
if (ppf_in.color_encoding.color_space != JXL_COLOR_SPACE_GRAY) {
EXPECT_EQ(ppf_in.color_encoding.primaries,
ppf_out.color_encoding.primaries);
}
EXPECT_EQ(ppf_in.color_encoding.transfer_function,
ppf_out.color_encoding.transfer_function);
EXPECT_EQ(ppf_out.color_encoding.rendering_intent,
JXL_RENDERING_INTENT_RELATIVE);
} else if (params.codec != Codec::kPNM && params.codec != Codec::kPGX &&
params.codec != Codec::kEXR) {
EXPECT_EQ(ppf_in.icc, ppf_out.icc);
}
@ -322,6 +347,10 @@ TEST(CodecTest, TestRoundTrip) {
params.add_extra_channels = true;
TestRoundTrip(params, &pool);
}
if (codec == Codec::kJPG) {
params.jpegli_decode = true;
TestRoundTrip(params, &pool);
}
}
}
}
@ -362,6 +391,78 @@ TEST(CodecTest, LosslessPNMRoundtrip) {
}
}
#if JPEGXL_ENABLE_JPEG
TEST(CodecTest, JpegliXYBEncodeTest) {
ThreadPool* pool = nullptr;
CodecInOut io;
const PaddedBytes orig =
ReadTestData("jxl/flower/flower_small.rgb.depth8.ppm");
ASSERT_TRUE(SetFromBytes(Span<const uint8_t>(orig), ColorHints(), &io));
std::vector<uint8_t> compressed;
JpegSettings settings;
settings.xyb = true;
ASSERT_TRUE(EncodeJpeg(io.Main(), settings, pool, &compressed));
CodecInOut io2;
ASSERT_TRUE(
SetFromBytes(Span<const uint8_t>(compressed), ColorHints(), &io2));
double bpp = compressed.size() * 8.0 / (io.xsize() * io.ysize());
EXPECT_THAT(bpp, IsSlightlyBelow(1.5f));
EXPECT_THAT(ButteraugliDistance(io, io2, ButteraugliParams(), GetJxlCms(),
/*distmap=*/nullptr, nullptr),
IsSlightlyBelow(1.3f));
}
TEST(CodecTest, JpegliYUVEncodeTest) {
ThreadPool* pool = nullptr;
CodecInOut io;
const PaddedBytes orig =
ReadTestData("jxl/flower/flower_small.rgb.depth8.ppm");
ASSERT_TRUE(SetFromBytes(Span<const uint8_t>(orig), ColorHints(), &io));
std::vector<uint8_t> compressed;
JpegSettings settings;
settings.xyb = false;
ASSERT_TRUE(EncodeJpeg(io.Main(), settings, pool, &compressed));
CodecInOut io2;
ASSERT_TRUE(
SetFromBytes(Span<const uint8_t>(compressed), ColorHints(), &io2));
double bpp = compressed.size() * 8.0 / (io.xsize() * io.ysize());
EXPECT_THAT(bpp, IsSlightlyBelow(2.3f));
EXPECT_THAT(ButteraugliDistance(io, io2, ButteraugliParams(), GetJxlCms(),
/*distmap=*/nullptr, nullptr),
IsSlightlyBelow(1.3f));
}
TEST(CodecTest, Jpegli16bitRoundtripTest) {
ThreadPool* pool = nullptr;
CodecInOut io;
const PaddedBytes orig = ReadTestData(
"external/raw.pixls/"
"Google-Pixel2XL-16bit_srgb8_v4_krita.png");
ASSERT_TRUE(SetFromBytes(Span<const uint8_t>(orig), ColorHints(), &io));
std::vector<uint8_t> compressed;
JpegSettings settings;
settings.xyb = false;
ASSERT_TRUE(EncodeJpeg(io.Main(), settings, pool, &compressed));
PackedPixelFile ppf_out;
ASSERT_TRUE(DecodeJpeg(compressed, JXL_TYPE_UINT16, pool, &ppf_out));
CodecInOut io2;
ASSERT_TRUE(ConvertPackedPixelFileToCodecInOut(ppf_out, pool, &io2));
EXPECT_THAT(compressed.size(), IsSlightlyBelow(3500u));
EXPECT_THAT(ButteraugliDistance(io, io2, ButteraugliParams(), GetJxlCms(),
/*distmap=*/nullptr, nullptr),
IsSlightlyBelow(1.13f));
}
#endif
CodecInOut DecodeRoundtrip(const std::string& pathname, ThreadPool* pool,
const ColorHints& color_hints = ColorHints()) {
CodecInOut io;

View File

@ -76,11 +76,145 @@ Status DecodeSRGB(const unsigned char* payload, const size_t payload_size,
if (payload_size != 1) return JXL_FAILURE("Wrong sRGB size");
// (PNG uses the same values as ICC.)
if (payload[0] >= 4) return JXL_FAILURE("Invalid Rendering Intent");
color_encoding->white_point = JXL_WHITE_POINT_D65;
color_encoding->primaries = JXL_PRIMARIES_SRGB;
color_encoding->transfer_function = JXL_TRANSFER_FUNCTION_SRGB;
color_encoding->rendering_intent =
static_cast<JxlRenderingIntent>(payload[0]);
return true;
}
// If the cICP profile is not fully supported, return false and leave
// color_encoding unmodified.
Status DecodeCICP(const unsigned char* payload, const size_t payload_size,
JxlColorEncoding* color_encoding) {
if (payload_size != 4) return JXL_FAILURE("Wrong cICP size");
JxlColorEncoding color_enc = *color_encoding;
// From https://www.itu.int/rec/T-REC-H.273-202107-I/en
if (payload[0] == 1) {
// IEC 61966-2-1 sRGB
color_enc.primaries = JXL_PRIMARIES_SRGB;
color_enc.white_point = JXL_WHITE_POINT_D65;
} else if (payload[0] == 4) {
// Rec. ITU-R BT.470-6 System M
color_enc.primaries = JXL_PRIMARIES_CUSTOM;
color_enc.primaries_red_xy[0] = 0.67;
color_enc.primaries_red_xy[1] = 0.33;
color_enc.primaries_green_xy[0] = 0.21;
color_enc.primaries_green_xy[1] = 0.71;
color_enc.primaries_blue_xy[0] = 0.14;
color_enc.primaries_blue_xy[1] = 0.08;
color_enc.white_point = JXL_WHITE_POINT_CUSTOM;
color_enc.white_point_xy[0] = 0.310;
color_enc.white_point_xy[1] = 0.316;
} else if (payload[0] == 5) {
// Rec. ITU-R BT.1700-0 625 PAL and 625 SECAM
color_enc.primaries = JXL_PRIMARIES_CUSTOM;
color_enc.primaries_red_xy[0] = 0.64;
color_enc.primaries_red_xy[1] = 0.33;
color_enc.primaries_green_xy[0] = 0.29;
color_enc.primaries_green_xy[1] = 0.60;
color_enc.primaries_blue_xy[0] = 0.15;
color_enc.primaries_blue_xy[1] = 0.06;
color_enc.white_point = JXL_WHITE_POINT_D65;
} else if (payload[0] == 6 || payload[0] == 7) {
// SMPTE ST 170 (2004) / SMPTE ST 240 (1999)
color_enc.primaries = JXL_PRIMARIES_CUSTOM;
color_enc.primaries_red_xy[0] = 0.630;
color_enc.primaries_red_xy[1] = 0.340;
color_enc.primaries_green_xy[0] = 0.310;
color_enc.primaries_green_xy[1] = 0.595;
color_enc.primaries_blue_xy[0] = 0.155;
color_enc.primaries_blue_xy[1] = 0.070;
color_enc.white_point = JXL_WHITE_POINT_D65;
} else if (payload[0] == 8) {
// Generic film (colour filters using Illuminant C)
color_enc.primaries = JXL_PRIMARIES_CUSTOM;
color_enc.primaries_red_xy[0] = 0.681;
color_enc.primaries_red_xy[1] = 0.319;
color_enc.primaries_green_xy[0] = 0.243;
color_enc.primaries_green_xy[1] = 0.692;
color_enc.primaries_blue_xy[0] = 0.145;
color_enc.primaries_blue_xy[1] = 0.049;
color_enc.white_point = JXL_WHITE_POINT_CUSTOM;
color_enc.white_point_xy[0] = 0.310;
color_enc.white_point_xy[1] = 0.316;
} else if (payload[0] == 9) {
// Rec. ITU-R BT.2100-2
color_enc.primaries = JXL_PRIMARIES_2100;
color_enc.white_point = JXL_WHITE_POINT_D65;
} else if (payload[0] == 10) {
// CIE 1931 XYZ
color_enc.primaries = JXL_PRIMARIES_CUSTOM;
color_enc.primaries_red_xy[0] = 1;
color_enc.primaries_red_xy[1] = 0;
color_enc.primaries_green_xy[0] = 0;
color_enc.primaries_green_xy[1] = 1;
color_enc.primaries_blue_xy[0] = 0;
color_enc.primaries_blue_xy[1] = 0;
color_enc.white_point = JXL_WHITE_POINT_E;
} else if (payload[0] == 11) {
// SMPTE RP 431-2 (2011)
color_enc.primaries = JXL_PRIMARIES_P3;
color_enc.white_point = JXL_WHITE_POINT_DCI;
} else if (payload[0] == 12) {
// SMPTE EG 432-1 (2010)
color_enc.primaries = JXL_PRIMARIES_P3;
color_enc.white_point = JXL_WHITE_POINT_D65;
} else if (payload[0] == 22) {
color_enc.primaries = JXL_PRIMARIES_CUSTOM;
color_enc.primaries_red_xy[0] = 0.630;
color_enc.primaries_red_xy[1] = 0.340;
color_enc.primaries_green_xy[0] = 0.295;
color_enc.primaries_green_xy[1] = 0.605;
color_enc.primaries_blue_xy[0] = 0.155;
color_enc.primaries_blue_xy[1] = 0.077;
color_enc.white_point = JXL_WHITE_POINT_D65;
} else {
JXL_WARNING("Unsupported primaries specified in cICP chunk: %d",
static_cast<int>(payload[0]));
return false;
}
if (payload[1] == 1 || payload[1] == 6 || payload[1] == 14 ||
payload[1] == 15) {
// Rec. ITU-R BT.709-6
color_enc.transfer_function = JXL_TRANSFER_FUNCTION_709;
} else if (payload[1] == 4) {
// Rec. ITU-R BT.1700-0 625 PAL and 625 SECAM
color_enc.transfer_function = JXL_TRANSFER_FUNCTION_GAMMA;
color_enc.gamma = 1 / 2.2;
} else if (payload[1] == 5) {
// Rec. ITU-R BT.470-6 System B, G
color_enc.transfer_function = JXL_TRANSFER_FUNCTION_GAMMA;
color_enc.gamma = 1 / 2.8;
} else if (payload[1] == 8 || payload[1] == 13 || payload[1] == 16 ||
payload[1] == 17 || payload[1] == 18) {
// These codes all match the corresponding JXL enum values
color_enc.transfer_function = static_cast<JxlTransferFunction>(payload[1]);
} else {
JXL_WARNING("Unsupported transfer function specified in cICP chunk: %d",
static_cast<int>(payload[1]));
return false;
}
if (payload[2] != 0) {
JXL_WARNING("Unsupported color space specified in cICP chunk: %d",
static_cast<int>(payload[2]));
return false;
}
if (payload[3] != 1) {
JXL_WARNING("Unsupported full-range flag specified in cICP chunk: %d",
static_cast<int>(payload[3]));
return false;
}
// cICP has no rendering intent, so use the default
color_enc.rendering_intent = JXL_RENDERING_INTENT_RELATIVE;
*color_encoding = color_enc;
return true;
}
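// For instance, a cICP payload of {1, 13, 0, 1} (sRGB primaries, sRGB transfer
// function, RGB encoding, full range) maps to JXL_PRIMARIES_SRGB,
// JXL_WHITE_POINT_D65, JXL_TRANSFER_FUNCTION_SRGB and
// JXL_RENDERING_INTENT_RELATIVE through the branches above.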
Status DecodeGAMA(const unsigned char* payload, const size_t payload_size,
JxlColorEncoding* color_encoding) {
if (payload_size != 4) return JXL_FAILURE("Wrong gAMA size");
@ -286,6 +420,7 @@ constexpr uint32_t kId_fcTL = 0x4C546366;
constexpr uint32_t kId_IDAT = 0x54414449;
constexpr uint32_t kId_fdAT = 0x54416466;
constexpr uint32_t kId_IEND = 0x444E4549;
constexpr uint32_t kId_cICP = 0x50434963;
constexpr uint32_t kId_iCCP = 0x50434369;
constexpr uint32_t kId_sRGB = 0x42475273;
constexpr uint32_t kId_gAMA = 0x414D4167;
@ -469,7 +604,8 @@ Status DecodeImageAPNG(const Span<const uint8_t> bytes,
ppf->frames.clear();
bool have_color = false, have_srgb = false;
bool have_color = false;
bool have_cicp = false, have_iccp = false, have_srgb = false;
bool errorstate = true;
if (id == kId_IHDR && chunkIHDR.size() == 25) {
x0 = 0;
@ -490,6 +626,7 @@ Status DecodeImageAPNG(const Span<const uint8_t> bytes,
ppf->color_encoding.white_point = JXL_WHITE_POINT_D65;
ppf->color_encoding.primaries = JXL_PRIMARIES_SRGB;
ppf->color_encoding.transfer_function = JXL_TRANSFER_FUNCTION_SRGB;
ppf->color_encoding.rendering_intent = JXL_RENDERING_INTENT_RELATIVE;
if (!processing_start(png_ptr, info_ptr, (void*)&frameRaw, hasInfo,
chunkIHDR, chunksInfo)) {
@ -625,7 +762,17 @@ Status DecodeImageAPNG(const Span<const uint8_t> bytes,
chunk.size() - 4)) {
break;
}
} else if (id == kId_iCCP) {
} else if (id == kId_cICP) {
// Color profile chunks: cICP has the highest priority, followed by
// iCCP and sRGB (which shouldn't co-exist, but if they do, we use
// iCCP), followed finally by gAMA and cHRM.
if (DecodeCICP(chunk.data() + 8, chunk.size() - 12,
&ppf->color_encoding)) {
have_cicp = true;
have_color = true;
ppf->icc.clear();
}
} else if (!have_cicp && id == kId_iCCP) {
if (processing_data(png_ptr, info_ptr, chunk.data(), chunk.size())) {
JXL_WARNING("Corrupt iCCP chunk");
break;
@ -642,19 +789,20 @@ Status DecodeImageAPNG(const Span<const uint8_t> bytes,
if (ok && proflen) {
ppf->icc.assign(profile, profile + proflen);
have_color = true;
have_iccp = true;
} else {
// TODO(eustas): JXL_WARNING?
}
} else if (id == kId_sRGB) {
} else if (!have_cicp && !have_iccp && id == kId_sRGB) {
JXL_RETURN_IF_ERROR(DecodeSRGB(chunk.data() + 8, chunk.size() - 12,
&ppf->color_encoding));
have_srgb = true;
have_color = true;
} else if (id == kId_gAMA) {
} else if (!have_cicp && !have_srgb && !have_iccp && id == kId_gAMA) {
JXL_RETURN_IF_ERROR(DecodeGAMA(chunk.data() + 8, chunk.size() - 12,
&ppf->color_encoding));
have_color = true;
} else if (id == kId_cHRM) {
} else if (!have_cicp && !have_srgb && !have_iccp && id == kId_cHRM) {
JXL_RETURN_IF_ERROR(DecodeCHRM(chunk.data() + 8, chunk.size() - 12,
&ppf->color_encoding));
have_color = true;
@ -677,12 +825,6 @@ Status DecodeImageAPNG(const Span<const uint8_t> bytes,
}
}
if (have_srgb) {
ppf->color_encoding.white_point = JXL_WHITE_POINT_D65;
ppf->color_encoding.primaries = JXL_PRIMARIES_SRGB;
ppf->color_encoding.transfer_function = JXL_TRANSFER_FUNCTION_SRGB;
ppf->color_encoding.rendering_intent = JXL_RENDERING_INTENT_PERCEPTUAL;
}
JXL_RETURN_IF_ERROR(ApplyColorHints(
color_hints, have_color, ppf->info.num_color_channels == 1, ppf));
}

View File

@ -107,7 +107,8 @@ Status DecodeBytes(const Span<const uint8_t> bytes,
}
#endif
#if JPEGXL_ENABLE_JPEG
else if (DecodeImageJPG(bytes, color_hints, constraints, ppf)) {
else if (DecodeImageJPG(bytes, color_hints, constraints,
/*output_bit_depth=*/8, ppf)) {
codec = Codec::kJPG;
}
#endif

View File

@ -0,0 +1,209 @@
// Copyright (c) the JPEG XL Project Authors. All rights reserved.
//
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
#include "lib/extras/dec/jpegli.h"
#include <setjmp.h>
#include <stdint.h>
#include <algorithm>
#include <numeric>
#include <utility>
#include <vector>
#include "lib/jpegli/decode.h"
#include "lib/jxl/base/status.h"
#include "lib/jxl/sanitizers.h"
namespace jxl {
namespace extras {
namespace {
constexpr unsigned char kExifSignature[6] = {0x45, 0x78, 0x69,
0x66, 0x00, 0x00};
constexpr int kExifMarker = JPEG_APP0 + 1;
constexpr int kICCMarker = JPEG_APP0 + 2;
static inline bool IsJPG(const std::vector<uint8_t>& bytes) {
if (bytes.size() < 2) return false;
if (bytes[0] != 0xFF || bytes[1] != 0xD8) return false;
return true;
}
bool MarkerIsExif(const jpeg_saved_marker_ptr marker) {
return marker->marker == kExifMarker &&
marker->data_length >= sizeof kExifSignature + 2 &&
std::equal(std::begin(kExifSignature), std::end(kExifSignature),
marker->data);
}
Status ReadICCProfile(jpeg_decompress_struct* const cinfo,
std::vector<uint8_t>* const icc) {
uint8_t* icc_data_ptr;
unsigned int icc_data_len;
if (jpegli_read_icc_profile(cinfo, &icc_data_ptr, &icc_data_len)) {
icc->assign(icc_data_ptr, icc_data_ptr + icc_data_len);
free(icc_data_ptr);
return true;
}
return false;
}
void ReadExif(jpeg_decompress_struct* const cinfo,
std::vector<uint8_t>* const exif) {
constexpr size_t kExifSignatureSize = sizeof kExifSignature;
for (jpeg_saved_marker_ptr marker = cinfo->marker_list; marker != nullptr;
marker = marker->next) {
// marker is initialized by libjpeg, which we are not instrumenting with
// msan.
msan::UnpoisonMemory(marker, sizeof(*marker));
msan::UnpoisonMemory(marker->data, marker->data_length);
if (!MarkerIsExif(marker)) continue;
size_t marker_length = marker->data_length - kExifSignatureSize;
exif->resize(marker_length);
std::copy_n(marker->data + kExifSignatureSize, marker_length, exif->data());
return;
}
}
void MyErrorExit(j_common_ptr cinfo) {
jmp_buf* env = static_cast<jmp_buf*>(cinfo->client_data);
(*cinfo->err->output_message)(cinfo);
jpegli_destroy_decompress(reinterpret_cast<j_decompress_ptr>(cinfo));
longjmp(*env, 1);
}
void MyOutputMessage(j_common_ptr cinfo) {
#if JXL_DEBUG_WARNING == 1
char buf[JMSG_LENGTH_MAX + 1];
(*cinfo->err->format_message)(cinfo, buf);
buf[JMSG_LENGTH_MAX] = 0;
JXL_WARNING("%s", buf);
#endif
}
} // namespace
Status DecodeJpeg(const std::vector<uint8_t>& compressed,
JxlDataType output_data_type, ThreadPool* pool,
PackedPixelFile* ppf) {
// Don't do anything for non-JPEG files (no need to report an error)
if (!IsJPG(compressed)) return false;
// TODO(veluca): use JPEGData also for pixels?
// We need to declare all the non-trivial destructor local variables before
// the call to setjmp().
std::unique_ptr<JSAMPLE[]> row;
const auto try_catch_block = [&]() -> bool {
jpeg_decompress_struct cinfo;
// cinfo is initialized by libjpeg, which we are not instrumenting with
// msan, therefore we need to initialize cinfo here.
msan::UnpoisonMemory(&cinfo, sizeof(cinfo));
// Setup error handling in jpeg library so we can deal with broken jpegs in
// the fuzzer.
jpeg_error_mgr jerr;
jmp_buf env;
cinfo.err = jpegli_std_error(&jerr);
jerr.error_exit = &MyErrorExit;
jerr.output_message = &MyOutputMessage;
if (setjmp(env)) {
return false;
}
cinfo.client_data = static_cast<void*>(&env);
jpegli_create_decompress(&cinfo);
jpegli_mem_src(&cinfo,
reinterpret_cast<const unsigned char*>(compressed.data()),
compressed.size());
jpegli_save_markers(&cinfo, kICCMarker, 0xFFFF);
jpegli_save_markers(&cinfo, kExifMarker, 0xFFFF);
const auto failure = [&cinfo](const char* str) -> Status {
jpegli_abort_decompress(&cinfo);
jpegli_destroy_decompress(&cinfo);
return JXL_FAILURE("%s", str);
};
jpegli_read_header(&cinfo, TRUE);
// Might cause CPU-zip bomb.
if (cinfo.arith_code) {
return failure("arithmetic code JPEGs are not supported");
}
int nbcomp = cinfo.num_components;
if (nbcomp != 1 && nbcomp != 3) {
return failure("unsupported number of components in JPEG");
}
if (!ReadICCProfile(&cinfo, &ppf->icc)) {
ppf->icc.clear();
// Default to SRGB
// Actually, (cinfo.output_components == nbcomp) will be checked after
// `jpegli_start_decompress`.
ppf->color_encoding.color_space =
(nbcomp == 1) ? JXL_COLOR_SPACE_GRAY : JXL_COLOR_SPACE_RGB;
ppf->color_encoding.white_point = JXL_WHITE_POINT_D65;
ppf->color_encoding.primaries = JXL_PRIMARIES_SRGB;
ppf->color_encoding.transfer_function = JXL_TRANSFER_FUNCTION_SRGB;
ppf->color_encoding.rendering_intent = JXL_RENDERING_INTENT_PERCEPTUAL;
}
ReadExif(&cinfo, &ppf->metadata.exif);
ppf->info.xsize = cinfo.image_width;
ppf->info.ysize = cinfo.image_height;
if (output_data_type == JXL_TYPE_UINT8) {
ppf->info.bits_per_sample = 8;
} else if (output_data_type == JXL_TYPE_UINT16) {
ppf->info.bits_per_sample = 16;
} else {
return failure("unsupported data type");
}
ppf->info.exponent_bits_per_sample = 0;
ppf->info.uses_original_profile = true;
// No alpha in JPG
ppf->info.alpha_bits = 0;
ppf->info.alpha_exponent_bits = 0;
ppf->info.num_color_channels = nbcomp;
ppf->info.orientation = JXL_ORIENT_IDENTITY;
// Set output bit depth.
cinfo.quantize_colors = FALSE;
cinfo.desired_number_of_colors = 1 << ppf->info.bits_per_sample;
jpegli_start_decompress(&cinfo);
JXL_ASSERT(cinfo.output_components == nbcomp);
const JxlPixelFormat format{
/*num_channels=*/static_cast<uint32_t>(nbcomp),
output_data_type,
/*endianness=*/JXL_NATIVE_ENDIAN,
/*align=*/0,
};
ppf->frames.clear();
// Allocates the frame buffer.
ppf->frames.emplace_back(cinfo.image_width, cinfo.image_height, format);
const auto& frame = ppf->frames.back();
JXL_ASSERT(sizeof(JSAMPLE) * cinfo.output_components * cinfo.image_width <=
frame.color.stride);
for (size_t y = 0; y < cinfo.image_height; ++y) {
JSAMPROW rows[] = {reinterpret_cast<JSAMPLE*>(
static_cast<uint8_t*>(frame.color.pixels()) +
frame.color.stride * y)};
jpegli_read_scanlines(&cinfo, rows, 1);
msan::UnpoisonMemory(rows[0], sizeof(JSAMPLE) * cinfo.output_components *
cinfo.image_width);
}
jpegli_finish_decompress(&cinfo);
jpegli_destroy_decompress(&cinfo);
return true;
};
return try_catch_block();
}
} // namespace extras
} // namespace jxl

View File

@ -0,0 +1,30 @@
// Copyright (c) the JPEG XL Project Authors. All rights reserved.
//
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
#ifndef LIB_EXTRAS_DEC_JPEGLI_H_
#define LIB_EXTRAS_DEC_JPEGLI_H_
// Decodes JPG pixels and metadata in memory using the libjpegli library.
#include <stdint.h>
#include <vector>
#include "jxl/types.h"
#include "lib/extras/packed_image.h"
#include "lib/jxl/base/data_parallel.h"
#include "lib/jxl/base/status.h"
namespace jxl {
namespace extras {
Status DecodeJpeg(const std::vector<uint8_t>& compressed,
JxlDataType output_data_type, ThreadPool* pool,
PackedPixelFile* ppf);
} // namespace extras
} // namespace jxl
#endif // LIB_EXTRAS_DEC_JPEGLI_H_
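A minimal sketch of calling this API (codec_test.cc above passes a null
ThreadPool in the same way; error handling is elided):

  #include <vector>
  #include "lib/extras/dec/jpegli.h"

  jxl::Status DecodeTo16Bit(const std::vector<uint8_t>& jpeg_bytes,
                            jxl::extras::PackedPixelFile* ppf) {
    // Request 16-bit output samples; decode on the calling thread.
    return jxl::extras::DecodeJpeg(jpeg_bytes, JXL_TYPE_UINT16,
                                   /*pool=*/nullptr, ppf);
  }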

View File

@ -165,7 +165,7 @@ void MyOutputMessage(j_common_ptr cinfo) {
Status DecodeImageJPG(const Span<const uint8_t> bytes,
const ColorHints& color_hints,
const SizeConstraints& constraints,
PackedPixelFile* ppf) {
size_t output_bit_depth, PackedPixelFile* ppf) {
// Don't do anything for non-JPEG files (no need to report an error)
if (!IsJPG(bytes)) return false;
@ -175,6 +175,10 @@ Status DecodeImageJPG(const Span<const uint8_t> bytes,
// the call to setjmp().
std::unique_ptr<JSAMPLE[]> row;
if (output_bit_depth == 0 || output_bit_depth > 16) {
return JXL_FAILURE("Invalid output bitdepth");
}
const auto try_catch_block = [&]() -> bool {
jpeg_decompress_struct cinfo;
// cinfo is initialized by libjpeg, which we are not instrumenting with
@ -252,12 +256,24 @@ Status DecodeImageJPG(const Span<const uint8_t> bytes,
ppf->info.num_color_channels = nbcomp;
ppf->info.orientation = JXL_ORIENT_IDENTITY;
// Try setting output bit depth. In libjpeg-turbo, this combination of
// parameters will be ignored, but in libjpegli it will override output bit
// depth.
cinfo.quantize_colors = FALSE;
cinfo.desired_number_of_colors = 1 << output_bit_depth;
jpeg_start_decompress(&cinfo);
JXL_ASSERT(cinfo.output_components == nbcomp);
if (cinfo.desired_number_of_colors == 0) {
// We know that the output bit depth was set because
// desired_number_of_colors was reset to zero by libjpegli.
ppf->info.bits_per_sample = output_bit_depth;
}
JxlDataType data_type =
ppf->info.bits_per_sample <= 8 ? JXL_TYPE_UINT8 : JXL_TYPE_UINT16;
const JxlPixelFormat format{
/*num_channels=*/static_cast<uint32_t>(nbcomp),
/*data_type=*/BITS_IN_JSAMPLE == 8 ? JXL_TYPE_UINT8 : JXL_TYPE_UINT16,
data_type,
/*endianness=*/JXL_NATIVE_ENDIAN,
/*align=*/0,
};

View File

@ -25,7 +25,8 @@ namespace extras {
// `elapsed_deinterleave`, if non-null, will be set to the time (in seconds)
// that it took to deinterleave the raw JSAMPLEs to planar floats.
Status DecodeImageJPG(Span<const uint8_t> bytes, const ColorHints& color_hints,
const SizeConstraints& constraints, PackedPixelFile* ppf);
const SizeConstraints& constraints,
size_t output_bit_depth, PackedPixelFile* ppf);
} // namespace extras
} // namespace jxl
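A minimal sketch of requesting 16-bit output through the new output_bit_depth
parameter, mirroring the call in dec/decode.cc above (assumed to live inside
namespace jxl::extras; `jpeg_bytes` holds the compressed file):

  PackedPixelFile ppf;
  Status ok = DecodeImageJPG(
      Span<const uint8_t>(jpeg_bytes.data(), jpeg_bytes.size()), ColorHints(),
      SizeConstraints(), /*output_bit_depth=*/16, &ppf);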

View File

@ -223,7 +223,7 @@ bool DecodeImageJXL(const uint8_t* bytes, size_t bytes_size,
fprintf(stderr,
"Input file is truncated (total bytes: %" PRIuS
", processed bytes: %" PRIuS
") and allow_partial_input was disabled.",
") and --allow_partial_files is not present.\n",
bytes_size, bytes_size - released_size);
return false;
} else if (status == JXL_DEC_BOX) {

View File

@ -1,559 +0,0 @@
// Copyright (c) the JPEG XL Project Authors. All rights reserved.
//
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
#include "lib/extras/dec_group_jpeg.h"
#include <stdint.h>
#include <string.h>
#include <algorithm>
#include <memory>
#include <utility>
#ifdef MEMORY_SANITIZER
#define JXL_MEMORY_SANITIZER 1
#elif defined(__has_feature)
#if __has_feature(memory_sanitizer)
#define JXL_MEMORY_SANITIZER 1
#else
#define JXL_MEMORY_SANITIZER 0
#endif
#else
#define JXL_MEMORY_SANITIZER 0
#endif
#if JXL_MEMORY_SANITIZER
#include "sanitizer/msan_interface.h"
#endif
#undef HWY_TARGET_INCLUDE
#define HWY_TARGET_INCLUDE "lib/extras/dec_group_jpeg.cc"
#include <hwy/foreach_target.h>
#include <hwy/highway.h>
HWY_BEFORE_NAMESPACE();
namespace jxl {
namespace HWY_NAMESPACE {
// These templates are not found via ADL.
using hwy::HWY_NAMESPACE::Abs;
using hwy::HWY_NAMESPACE::Add;
using hwy::HWY_NAMESPACE::Clamp;
using hwy::HWY_NAMESPACE::Gt;
using hwy::HWY_NAMESPACE::IfThenElseZero;
using hwy::HWY_NAMESPACE::Mul;
using hwy::HWY_NAMESPACE::MulAdd;
using hwy::HWY_NAMESPACE::NearestInt;
using hwy::HWY_NAMESPACE::NegMulAdd;
using hwy::HWY_NAMESPACE::Rebind;
using hwy::HWY_NAMESPACE::Sub;
using hwy::HWY_NAMESPACE::Vec;
using hwy::HWY_NAMESPACE::Xor;
using D = HWY_FULL(float);
using DI = HWY_FULL(int32_t);
constexpr D d;
constexpr DI di;
using D8 = HWY_CAPPED(float, 8);
constexpr D8 d8;
void GatherBlockStats(const int16_t* JXL_RESTRICT coeffs,
const size_t coeffs_size, int32_t* JXL_RESTRICT nonzeros,
int32_t* JXL_RESTRICT sumabs) {
for (size_t i = 0; i < coeffs_size; i += Lanes(d)) {
size_t k = i % kDCTBlockSize;
const Rebind<int16_t, DI> di16;
const Vec<DI> coeff = PromoteTo(di, Load(di16, coeffs + i));
const auto abs_coeff = Abs(coeff);
const auto not_0 = Gt(abs_coeff, Zero(di));
const auto nzero = IfThenElseZero(not_0, Set(di, 1));
Store(Add(nzero, Load(di, nonzeros + k)), di, nonzeros + k);
Store(Add(abs_coeff, Load(di, sumabs + k)), di, sumabs + k);
}
}
void DequantBlock(const int16_t* JXL_RESTRICT qblock,
const float* JXL_RESTRICT dequant,
const float* JXL_RESTRICT biases, float* JXL_RESTRICT block) {
for (size_t k = 0; k < kDCTBlockSize; k += Lanes(d)) {
const auto mul = Load(d, dequant + k);
const auto bias = Load(d, biases + k);
const Rebind<int16_t, DI> di16;
const Vec<DI> quant_i = PromoteTo(di, Load(di16, qblock + k));
const Rebind<float, DI> df;
const auto quant = ConvertTo(df, quant_i);
const auto abs_quant = Abs(quant);
const auto not_0 = Gt(abs_quant, Zero(df));
const auto sign_quant = Xor(quant, abs_quant);
const auto biased_quant = Sub(quant, Xor(bias, sign_quant));
const auto dequant = IfThenElseZero(not_0, Mul(biased_quant, mul));
Store(dequant, d, block + k);
}
}
#if HWY_CAP_GE256
JXL_INLINE void Transpose8x8Block(const float* JXL_RESTRICT from,
float* JXL_RESTRICT to) {
const D8 d;
auto i0 = Load(d, from);
auto i1 = Load(d, from + 1 * 8);
auto i2 = Load(d, from + 2 * 8);
auto i3 = Load(d, from + 3 * 8);
auto i4 = Load(d, from + 4 * 8);
auto i5 = Load(d, from + 5 * 8);
auto i6 = Load(d, from + 6 * 8);
auto i7 = Load(d, from + 7 * 8);
const auto q0 = InterleaveLower(d, i0, i2);
const auto q1 = InterleaveLower(d, i1, i3);
const auto q2 = InterleaveUpper(d, i0, i2);
const auto q3 = InterleaveUpper(d, i1, i3);
const auto q4 = InterleaveLower(d, i4, i6);
const auto q5 = InterleaveLower(d, i5, i7);
const auto q6 = InterleaveUpper(d, i4, i6);
const auto q7 = InterleaveUpper(d, i5, i7);
const auto r0 = InterleaveLower(d, q0, q1);
const auto r1 = InterleaveUpper(d, q0, q1);
const auto r2 = InterleaveLower(d, q2, q3);
const auto r3 = InterleaveUpper(d, q2, q3);
const auto r4 = InterleaveLower(d, q4, q5);
const auto r5 = InterleaveUpper(d, q4, q5);
const auto r6 = InterleaveLower(d, q6, q7);
const auto r7 = InterleaveUpper(d, q6, q7);
i0 = ConcatLowerLower(d, r4, r0);
i1 = ConcatLowerLower(d, r5, r1);
i2 = ConcatLowerLower(d, r6, r2);
i3 = ConcatLowerLower(d, r7, r3);
i4 = ConcatUpperUpper(d, r4, r0);
i5 = ConcatUpperUpper(d, r5, r1);
i6 = ConcatUpperUpper(d, r6, r2);
i7 = ConcatUpperUpper(d, r7, r3);
Store(i0, d, to);
Store(i1, d, to + 1 * 8);
Store(i2, d, to + 2 * 8);
Store(i3, d, to + 3 * 8);
Store(i4, d, to + 4 * 8);
Store(i5, d, to + 5 * 8);
Store(i6, d, to + 6 * 8);
Store(i7, d, to + 7 * 8);
}
#elif HWY_TARGET != HWY_SCALAR
JXL_INLINE void Transpose8x8Block(const float* JXL_RESTRICT from,
float* JXL_RESTRICT to) {
const HWY_CAPPED(float, 4) d;
for (size_t n = 0; n < 8; n += 4) {
for (size_t m = 0; m < 8; m += 4) {
auto p0 = Load(d, from + n * 8 + m);
auto p1 = Load(d, from + (n + 1) * 8 + m);
auto p2 = Load(d, from + (n + 2) * 8 + m);
auto p3 = Load(d, from + (n + 3) * 8 + m);
const auto q0 = InterleaveLower(d, p0, p2);
const auto q1 = InterleaveLower(d, p1, p3);
const auto q2 = InterleaveUpper(d, p0, p2);
const auto q3 = InterleaveUpper(d, p1, p3);
const auto r0 = InterleaveLower(d, q0, q1);
const auto r1 = InterleaveUpper(d, q0, q1);
const auto r2 = InterleaveLower(d, q2, q3);
const auto r3 = InterleaveUpper(d, q2, q3);
Store(r0, d, to + m * 8 + n);
Store(r1, d, to + (1 + m) * 8 + n);
Store(r2, d, to + (2 + m) * 8 + n);
Store(r3, d, to + (3 + m) * 8 + n);
}
}
}
#else
JXL_INLINE void Transpose8x8Block(const float* JXL_RESTRICT from,
float* JXL_RESTRICT to) {
for (size_t n = 0; n < 8; ++n) {
for (size_t m = 0; m < 8; ++m) {
to[8 * n + m] = from[8 * m + n];
}
}
}
#endif
template <size_t N>
void ForwardEvenOdd(const float* JXL_RESTRICT ain, size_t ain_stride,
float* JXL_RESTRICT aout) {
for (size_t i = 0; i < N / 2; i++) {
auto in1 = LoadU(d8, ain + 2 * i * ain_stride);
Store(in1, d8, aout + i * 8);
}
for (size_t i = N / 2; i < N; i++) {
auto in1 = LoadU(d8, ain + (2 * (i - N / 2) + 1) * ain_stride);
Store(in1, d8, aout + i * 8);
}
}
template <size_t N>
void BTranspose(float* JXL_RESTRICT coeff) {
for (size_t i = N - 1; i > 0; i--) {
auto in1 = Load(d8, coeff + i * 8);
auto in2 = Load(d8, coeff + (i - 1) * 8);
Store(Add(in1, in2), d8, coeff + i * 8);
}
constexpr float kSqrt2 = 1.41421356237f;
auto sqrt2 = Set(d8, kSqrt2);
auto in1 = Load(d8, coeff);
Store(Mul(in1, sqrt2), d8, coeff);
}
// Constants for DCT implementation. Generated by the following snippet:
// for i in range(N // 2):
// print(1.0 / (2 * math.cos((i + 0.5) * math.pi / N)), end=", ")
template <size_t N>
struct WcMultipliers;
template <>
struct WcMultipliers<4> {
static constexpr float kMultipliers[] = {
0.541196100146197,
1.3065629648763764,
};
};
template <>
struct WcMultipliers<8> {
static constexpr float kMultipliers[] = {
0.5097955791041592,
0.6013448869350453,
0.8999762231364156,
2.5629154477415055,
};
};
constexpr float WcMultipliers<4>::kMultipliers[];
constexpr float WcMultipliers<8>::kMultipliers[];
template <size_t N>
void MultiplyAndAdd(const float* JXL_RESTRICT coeff, float* JXL_RESTRICT out,
size_t out_stride) {
for (size_t i = 0; i < N / 2; i++) {
auto mul = Set(d8, WcMultipliers<N>::kMultipliers[i]);
auto in1 = Load(d8, coeff + i * 8);
auto in2 = Load(d8, coeff + (N / 2 + i) * 8);
auto out1 = MulAdd(mul, in2, in1);
auto out2 = NegMulAdd(mul, in2, in1);
StoreU(out1, d8, out + i * out_stride);
StoreU(out2, d8, out + (N - i - 1) * out_stride);
}
}
template <size_t N>
struct IDCT1DImpl;
template <>
struct IDCT1DImpl<1> {
JXL_INLINE void operator()(const float* from, size_t from_stride, float* to,
size_t to_stride) {
StoreU(LoadU(d8, from), d8, to);
}
};
template <>
struct IDCT1DImpl<2> {
JXL_INLINE void operator()(const float* from, size_t from_stride, float* to,
size_t to_stride) {
JXL_DASSERT(from_stride >= 8);
JXL_DASSERT(to_stride >= 8);
auto in1 = LoadU(d8, from);
auto in2 = LoadU(d8, from + from_stride);
StoreU(Add(in1, in2), d8, to);
StoreU(Sub(in1, in2), d8, to + to_stride);
}
};
template <size_t N>
struct IDCT1DImpl {
void operator()(const float* from, size_t from_stride, float* to,
size_t to_stride) {
JXL_DASSERT(from_stride >= 8);
JXL_DASSERT(to_stride >= 8);
HWY_ALIGN float tmp[64];
ForwardEvenOdd<N>(from, from_stride, tmp);
IDCT1DImpl<N / 2>()(tmp, 8, tmp, 8);
BTranspose<N / 2>(tmp + N * 4);
IDCT1DImpl<N / 2>()(tmp + N * 4, 8, tmp + N * 4, 8);
MultiplyAndAdd<N>(tmp, to, to_stride);
}
};
template <size_t N>
void IDCT1D(float* JXL_RESTRICT from, float* JXL_RESTRICT output,
size_t output_stride) {
for (size_t i = 0; i < 8; i += Lanes(d8)) {
IDCT1DImpl<N>()(from + i, 8, output + i, output_stride);
}
}
void ComputeScaledIDCT(float* JXL_RESTRICT block0, float* JXL_RESTRICT block1,
float* JXL_RESTRICT output, size_t output_stride) {
Transpose8x8Block(block0, block1);
IDCT1D<8>(block1, block0, 8);
Transpose8x8Block(block0, block1);
IDCT1D<8>(block1, output, output_stride);
}
void DecodeJpegBlock(const int16_t* JXL_RESTRICT qblock,
const float* JXL_RESTRICT dequant,
const float* JXL_RESTRICT biases,
float* JXL_RESTRICT scratch_space,
float* JXL_RESTRICT output, size_t output_stride) {
float* JXL_RESTRICT block0 = scratch_space;
float* JXL_RESTRICT block1 = scratch_space + kDCTBlockSize;
DequantBlock(qblock, dequant, biases, block0);
ComputeScaledIDCT(block0, block1, output, output_stride);
}
#if HWY_CAP_GE512
using hwy::HWY_NAMESPACE::Half;
using hwy::HWY_NAMESPACE::Vec;
template <size_t i, class DF, class V>
HWY_INLINE Vec<Half<Half<DF>>> Quarter(const DF df, V v) {
using HF = Half<DF>;
using HHF = Half<HF>;
auto half = i >= 2 ? UpperHalf(HF(), v) : LowerHalf(HF(), v);
return i & 1 ? UpperHalf(HHF(), half) : LowerHalf(HHF(), half);
}
template <class DF, class V>
HWY_INLINE Vec<DF> Concat4(const DF df, V v0, V v1, V v2, V v3) {
using HF = Half<DF>;
return Combine(DF(), Combine(HF(), v3, v2), Combine(HF(), v1, v0));
}
#endif
// Stores v0[0], v1[0], v0[1], v1[1], ... to mem, in this order. Mem must be
// aligned.
template <class DF, class V, typename T>
void StoreInterleaved(const DF df, V v0, V v1, T* mem) {
static_assert(sizeof(T) == 4, "only use StoreInterleaved for 4-byte types");
#if HWY_TARGET == HWY_SCALAR
Store(v0, df, mem);
Store(v1, df, mem + 1);
#elif !HWY_CAP_GE256
Store(InterleaveLower(df, v0, v1), df, mem);
Store(InterleaveUpper(df, v0, v1), df, mem + Lanes(df));
#else
if (!HWY_CAP_GE512 || Lanes(df) == 8) {
auto t0 = InterleaveLower(df, v0, v1);
auto t1 = InterleaveUpper(df, v0, v1);
Store(ConcatLowerLower(df, t1, t0), df, mem);
Store(ConcatUpperUpper(df, t1, t0), df, mem + Lanes(df));
} else {
#if HWY_CAP_GE512
auto t0 = InterleaveLower(df, v0, v1);
auto t1 = InterleaveUpper(df, v0, v1);
Store(Concat4(df, Quarter<0>(df, t0), Quarter<0>(df, t1),
Quarter<1>(df, t0), Quarter<1>(df, t1)),
df, mem);
Store(Concat4(df, Quarter<2>(df, t0), Quarter<2>(df, t1),
Quarter<3>(df, t0), Quarter<3>(df, t1)),
df, mem + Lanes(df));
#endif
}
#endif
}
void Upsample2Horizontal(float* JXL_RESTRICT row_in,
float* JXL_RESTRICT row_out, size_t len_out) {
HWY_FULL(float) df;
auto threefour = Set(df, 0.75f);
auto onefour = Set(df, 0.25f);
const size_t len_in = len_out >> 1;
row_in[-1] = row_in[0];
row_in[len_in] = row_in[len_in - 1];
for (size_t x = 0; x < len_in; x += Lanes(df)) {
auto current = Mul(Load(df, row_in + x), threefour);
auto prev = LoadU(df, row_in + x - 1);
auto next = LoadU(df, row_in + x + 1);
auto left = MulAdd(onefour, prev, current);
auto right = MulAdd(onefour, next, current);
StoreInterleaved(df, left, right, row_out + x * 2);
}
}
void Upsample2Vertical(const float* JXL_RESTRICT row_top,
const float* JXL_RESTRICT row_mid,
const float* JXL_RESTRICT row_bot,
float* JXL_RESTRICT row_out0,
float* JXL_RESTRICT row_out1, size_t len) {
HWY_FULL(float) df;
auto threefour = Set(df, 0.75f);
auto onefour = Set(df, 0.25f);
for (size_t x = 0; x < len; x += Lanes(df)) {
auto it = Load(df, row_top + x);
auto im = Load(df, row_mid + x);
auto ib = Load(df, row_bot + x);
auto im_scaled = Mul(im, threefour);
Store(MulAdd(it, onefour, im_scaled), df, row_out0 + x);
Store(MulAdd(ib, onefour, im_scaled), df, row_out1 + x);
}
}
void YCbCrToRGB(float* JXL_RESTRICT row0, float* JXL_RESTRICT row1,
float* JXL_RESTRICT row2, size_t xsize) {
const HWY_FULL(float) df;
// Full-range BT.601 as defined by JFIF Clause 7:
// https://www.itu.int/rec/T-REC-T.871-201105-I/en
const auto c128 = Set(df, 128.0f / 255);
const auto crcr = Set(df, 1.402f);
const auto cgcb = Set(df, -0.114f * 1.772f / 0.587f);
const auto cgcr = Set(df, -0.299f * 1.402f / 0.587f);
const auto cbcb = Set(df, 1.772f);
for (size_t x = 0; x < xsize; x += Lanes(df)) {
const auto y_vec = Add(Load(df, row0 + x), c128);
const auto cb_vec = Load(df, row1 + x);
const auto cr_vec = Load(df, row2 + x);
const auto r_vec = MulAdd(crcr, cr_vec, y_vec);
const auto g_vec = MulAdd(cgcr, cr_vec, MulAdd(cgcb, cb_vec, y_vec));
const auto b_vec = MulAdd(cbcb, cb_vec, y_vec);
Store(r_vec, df, row0 + x);
Store(g_vec, df, row1 + x);
Store(b_vec, df, row2 + x);
}
}
void DecenterRow(float* row, size_t xsize) {
const HWY_FULL(float) df;
const auto c128 = Set(df, 128.0f / 255);
for (size_t x = 0; x < xsize; x += Lanes(df)) {
Store(Add(Load(df, row + x), c128), df, row + x);
}
}
template <typename T>
void StoreUnsignedRow(float* JXL_RESTRICT input[3], size_t x0, size_t len,
size_t num_channels, T* output) {
const HWY_FULL(float) d;
auto zero = Zero(d);
auto one = Set(d, 1.0f);
auto mul = Set(d, (1u << (sizeof(T) * 8)) - 1);
const Rebind<T, decltype(d)> du;
#if JXL_MEMORY_SANITIZER
const size_t padding = RoundUpTo(len, Lanes(d)) - len;
for (size_t c = 0; c < num_channels; ++c) {
__msan_unpoison(input[c] + x0 + len, sizeof(input[c][0]) * padding);
}
#endif
if (num_channels == 1) {
for (size_t i = 0; i < len; i += Lanes(d)) {
auto v0 = Mul(Clamp(zero, Load(d, &input[0][x0 + i]), one), mul);
Store(DemoteTo(du, NearestInt(v0)), du, &output[i]);
}
} else if (num_channels == 3) {
for (size_t i = 0; i < len; i += Lanes(d)) {
auto v0 = Mul(Clamp(zero, Load(d, &input[0][x0 + i]), one), mul);
auto v1 = Mul(Clamp(zero, Load(d, &input[1][x0 + i]), one), mul);
auto v2 = Mul(Clamp(zero, Load(d, &input[2][x0 + i]), one), mul);
StoreInterleaved3(DemoteTo(du, NearestInt(v0)),
DemoteTo(du, NearestInt(v1)),
DemoteTo(du, NearestInt(v2)), du, &output[3 * i]);
}
}
#if JXL_MEMORY_SANITIZER
__msan_poison(output + num_channels * len,
sizeof(output[0]) * num_channels * padding);
#endif
}
void WriteToPackedImage(float* JXL_RESTRICT rows[3], size_t x0, size_t y0,
size_t len, uint8_t* JXL_RESTRICT scratch_space,
extras::PackedImage* image) {
if (y0 >= image->ysize) return;
JxlPixelFormat format = image->format;
uint8_t* pixels = reinterpret_cast<uint8_t*>(image->pixels());
if (format.data_type == JXL_TYPE_UINT8) {
size_t offset = y0 * image->stride + x0 * format.num_channels;
JXL_CHECK(offset + len * format.num_channels <= image->pixels_size);
StoreUnsignedRow(rows, x0, len, format.num_channels, scratch_space);
memcpy(pixels + offset, scratch_space, len * format.num_channels);
} else if (format.data_type == JXL_TYPE_UINT16) {
size_t offset = y0 * image->stride + x0 * format.num_channels * 2;
JXL_CHECK(offset + len * format.num_channels * 2 <= image->pixels_size);
uint16_t* tmp = reinterpret_cast<uint16_t*>(scratch_space);
StoreUnsignedRow(rows, x0, len, format.num_channels, tmp);
// TODO(szabadka) Handle endianness.
memcpy(pixels + offset, tmp, len * format.num_channels * 2);
}
}
// NOLINTNEXTLINE(google-readability-namespace-comments)
} // namespace HWY_NAMESPACE
} // namespace jxl
HWY_AFTER_NAMESPACE();
#if HWY_ONCE
namespace jxl {
namespace {
HWY_EXPORT(GatherBlockStats);
HWY_EXPORT(DecodeJpegBlock);
HWY_EXPORT(Upsample2Horizontal);
HWY_EXPORT(Upsample2Vertical);
HWY_EXPORT(YCbCrToRGB);
HWY_EXPORT(DecenterRow);
HWY_EXPORT(WriteToPackedImage);
} // namespace
namespace extras {
void GatherBlockStats(const int16_t* JXL_RESTRICT coeffs,
const size_t coeffs_size, int32_t* JXL_RESTRICT nonzeros,
int32_t* JXL_RESTRICT sumabs) {
return HWY_DYNAMIC_DISPATCH(GatherBlockStats)(coeffs, coeffs_size, nonzeros,
sumabs);
}
void DecodeJpegBlock(const int16_t* JXL_RESTRICT qblock,
const float* JXL_RESTRICT dequant_matrices,
const float* JXL_RESTRICT biases,
float* JXL_RESTRICT scratch_space,
float* JXL_RESTRICT output, size_t output_stride) {
return HWY_DYNAMIC_DISPATCH(DecodeJpegBlock)(
qblock, dequant_matrices, biases, scratch_space, output, output_stride);
}
void Upsample2Horizontal(float* JXL_RESTRICT row_in,
float* JXL_RESTRICT row_out, size_t len_out) {
return HWY_DYNAMIC_DISPATCH(Upsample2Horizontal)(row_in, row_out, len_out);
}
void Upsample2Vertical(const float* JXL_RESTRICT row_top,
const float* JXL_RESTRICT row_mid,
const float* JXL_RESTRICT row_bot,
float* JXL_RESTRICT row_out0,
float* JXL_RESTRICT row_out1, size_t len) {
return HWY_DYNAMIC_DISPATCH(Upsample2Vertical)(row_top, row_mid, row_bot,
row_out0, row_out1, len);
}
void YCbCrToRGB(float* JXL_RESTRICT row0, float* JXL_RESTRICT row1,
float* JXL_RESTRICT row2, size_t xsize) {
return HWY_DYNAMIC_DISPATCH(YCbCrToRGB)(row0, row1, row2, xsize);
}
void DecenterRow(float* row, size_t xsize) {
return HWY_DYNAMIC_DISPATCH(DecenterRow)(row, xsize);
}
void WriteToPackedImage(float* JXL_RESTRICT rows[3], size_t x0, size_t y0,
size_t len, uint8_t* JXL_RESTRICT scratch_space,
extras::PackedImage* image) {
return HWY_DYNAMIC_DISPATCH(WriteToPackedImage)(rows, x0, y0, len,
scratch_space, image);
}
} // namespace extras
} // namespace jxl
#endif // HWY_ONCE

View File

@ -1,51 +0,0 @@
// Copyright (c) the JPEG XL Project Authors. All rights reserved.
//
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
#ifndef LIB_EXTRAS_DEC_GROUP_JPEG_H_
#define LIB_EXTRAS_DEC_GROUP_JPEG_H_
#include <stddef.h>
#include <stdint.h>
#include <vector>
#include "lib/extras/packed_image.h"
#include "lib/jxl/base/compiler_specific.h"
namespace jxl {
namespace extras {
void GatherBlockStats(const int16_t* JXL_RESTRICT coeffs,
const size_t coeffs_size, int32_t* JXL_RESTRICT nonzeros,
int32_t* JXL_RESTRICT sumabs);
void DecodeJpegBlock(const int16_t* JXL_RESTRICT qblock,
const float* JXL_RESTRICT dequant_matrices,
const float* JXL_RESTRICT biases,
float* JXL_RESTRICT scratch_space,
float* JXL_RESTRICT output, size_t output_stride);
void Upsample2Horizontal(float* JXL_RESTRICT row_in,
float* JXL_RESTRICT row_out, size_t len_out);
void Upsample2Vertical(const float* JXL_RESTRICT row_top,
const float* JXL_RESTRICT row_mid,
const float* JXL_RESTRICT row_bot,
float* JXL_RESTRICT row_out0,
float* JXL_RESTRICT row_out1, size_t len);
void YCbCrToRGB(float* JXL_RESTRICT row0, float* JXL_RESTRICT row1,
float* JXL_RESTRICT row2, size_t xsize);
void DecenterRow(float* row, size_t xsize);
void WriteToPackedImage(float* JXL_RESTRICT rows[3], size_t x0, size_t y0,
size_t len, uint8_t* JXL_RESTRICT scratch_space,
extras::PackedImage* image);
} // namespace extras
} // namespace jxl
#endif // LIB_EXTRAS_DEC_GROUP_JPEG_H_

File diff suppressed because it is too large

View File

@ -1,276 +0,0 @@
// Copyright (c) the JPEG XL Project Authors. All rights reserved.
//
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
#ifndef LIB_EXTRAS_DECODE_JPEG_H_
#define LIB_EXTRAS_DECODE_JPEG_H_
#include <stdint.h>
#include <array>
#include <vector>
#include "hwy/aligned_allocator.h"
#include "lib/extras/packed_image.h"
#include "lib/jxl/base/data_parallel.h"
#include "lib/jxl/base/status.h"
#include "lib/jxl/image.h"
namespace jxl {
namespace extras {
constexpr int kMaxComponents = 4;
typedef int16_t coeff_t;
// Represents one component of a jpeg file.
struct JPEGComponent {
JPEGComponent()
: id(0),
h_samp_factor(1),
v_samp_factor(1),
quant_idx(0),
width_in_blocks(0),
height_in_blocks(0) {}
// One-byte id of the component.
uint32_t id;
// Horizontal and vertical sampling factors.
// In interleaved mode, each minimal coded unit (MCU) has
// h_samp_factor x v_samp_factor DCT blocks from this component.
int h_samp_factor;
int v_samp_factor;
// The index of the quantization table used for this component.
uint32_t quant_idx;
// The dimensions of the component measured in 8x8 blocks.
uint32_t width_in_blocks;
uint32_t height_in_blocks;
// The DCT coefficients of this component, laid out block-by-block, divided
// through the quantization matrix values.
hwy::AlignedFreeUniquePtr<coeff_t[]> coeffs;
};
struct HuffmanTableEntry {
// Initialize the value to an invalid symbol so that we can recognize it
// when reading the bit stream using a Huffman code with space > 0.
HuffmanTableEntry() : bits(0), value(0xffff) {}
uint8_t bits; // number of bits used for this symbol
uint16_t value; // symbol value or table offset
};
// Quantization values for an 8x8 pixel block.
struct JPEGQuantTable {
std::array<int32_t, kDCTBlockSize> values;
// The index of this quantization table as it was parsed from the input JPEG.
// Each DQT marker segment contains an 'index' field, and we save this index
// here. Valid values are 0 to 3.
uint32_t index = 0;
};
// Huffman table indexes and MCU dimensions used for one component of one scan.
struct JPEGComponentScanInfo {
uint32_t comp_idx;
uint32_t dc_tbl_idx;
uint32_t ac_tbl_idx;
uint32_t mcu_ysize_blocks;
uint32_t mcu_xsize_blocks;
};
// Contains information that is used in one scan.
struct JPEGScanInfo {
// Parameters used for progressive scans (named the same way as in the spec):
// Ss : Start of spectral band in zig-zag sequence.
// Se : End of spectral band in zig-zag sequence.
// Ah : Successive approximation bit position, high.
// Al : Successive approximation bit position, low.
uint32_t Ss;
uint32_t Se;
uint32_t Ah;
uint32_t Al;
uint32_t num_components = 0;
std::array<JPEGComponentScanInfo, kMaxComponents> components;
size_t MCU_rows;
size_t MCU_cols;
};
// State of the decoder that has to be saved before decoding one MCU in case
// we run out of the bitstream.
struct MCUCodingState {
coeff_t last_dc_coeff[kMaxComponents];
int eobrun;
std::vector<coeff_t> coeffs;
};
// Streaming JPEG decoding object.
class JpegDecoder {
public:
enum class Status {
kSuccess,
kNeedMoreInput,
kError,
};
// Sets the next chunk of input. It must be called before the first call to
// ReadHeaders() and every time a reader function returns
// Status::kNeedMoreInput.
Status SetInput(const uint8_t* data, size_t len);
// Sets the output image. Must be called between ReadHeaders() and
// ReadScanLines(). The provided image must have the same dimensions and number of
// channels as the underlying JPEG bitstream.
Status SetOutput(PackedImage* image);
// Reads the header markers up to and including SOF marker. After this returns
// kSuccess, the image attribute accessors can be called.
Status ReadHeaders();
// Reads the bitstream after the SOF marker, and fills in at most
// max_output_rows scan lines of the provided image. Set *num_output_rows to
// the actual number of lines produced.
Status ReadScanLines(size_t* num_output_rows, size_t max_output_rows);
// Image attribute accessors, can be called after ReadHeaders() returns
// kSuccess.
size_t xsize() const { return xsize_; }
size_t ysize() const { return ysize_; }
size_t num_channels() const { return components_.size(); }
const std::vector<uint8_t>& icc_profile() const { return icc_profile_; }
private:
enum class State {
kStart,
kProcessMarkers,
kScan,
kRender,
kEnd,
};
State state_ = State::kStart;
//
// Input handling state.
//
const uint8_t* next_in_ = nullptr;
size_t avail_in_ = 0;
// Codestream input data is copied here temporarily when the decoder needs
// more input bytes to process the next part of the stream.
std::vector<uint8_t> codestream_copy_;
// Number of bytes at the end of codestream_copy_ that were not yet consumed
// by calling AdvanceInput().
size_t codestream_unconsumed_ = 0;
// Position in the codestream_copy_ vector that the decoder already finished
// processing.
size_t codestream_pos_ = 0;
// Number of bits after codestream_pos_ that were already processed.
size_t codestream_bits_ahead_ = 0;
//
// Marker data processing state.
//
bool found_soi_ = false;
bool found_app0_ = false;
bool found_dri_ = false;
bool found_sof_ = false;
bool found_eoi_ = false;
size_t xsize_ = 0;
size_t ysize_ = 0;
bool is_ycbcr_ = true;
size_t icc_index_ = 0;
size_t icc_total_ = 0;
std::vector<uint8_t> icc_profile_;
size_t restart_interval_ = 0;
std::vector<JPEGQuantTable> quant_;
std::vector<JPEGComponent> components_;
std::vector<HuffmanTableEntry> dc_huff_lut_;
std::vector<HuffmanTableEntry> ac_huff_lut_;
uint8_t huff_slot_defined_[256] = {};
// Fields defined by SOF marker.
bool is_progressive_;
int max_h_samp_;
int max_v_samp_;
size_t iMCU_rows_;
size_t iMCU_cols_;
size_t iMCU_width_;
size_t iMCU_height_;
// Initialized at start of frame.
uint16_t scan_progression_[kMaxComponents][kDCTBlockSize];
//
// Per scan state.
//
JPEGScanInfo scan_info_;
size_t scan_mcu_row_;
size_t scan_mcu_col_;
coeff_t last_dc_coeff_[kMaxComponents];
int eobrun_;
int restarts_to_go_;
int next_restart_marker_;
MCUCodingState mcu_;
//
// Rendering state.
//
PackedImage* output_;
Image3F MCU_row_buf_;
size_t MCU_buf_current_row_;
size_t MCU_buf_ready_rows_;
size_t output_row_;
size_t output_mcu_row_;
size_t output_ci_;
// Temporary buffers for vertically upsampled chroma components. We keep a
// ringbuffer of 3 * kBlockDim rows so that we have access to the previous and
// next rows.
std::vector<ImageF> chroma_;
// In the rendering order, vertically upsampled chroma components come first.
std::vector<size_t> component_order_;
hwy::AlignedFreeUniquePtr<float[]> idct_scratch_;
hwy::AlignedFreeUniquePtr<float[]> upsample_scratch_;
hwy::AlignedFreeUniquePtr<uint8_t[]> output_scratch_;
hwy::AlignedFreeUniquePtr<float[]> dequant_;
// Per channel and per frequency statistics about the number of nonzeros and
// the sum of coefficient absolute values, used in dequantization bias
// computation.
hwy::AlignedFreeUniquePtr<int[]> nonzeros_;
hwy::AlignedFreeUniquePtr<int[]> sumabs_;
std::vector<size_t> num_processed_blocks_;
hwy::AlignedFreeUniquePtr<float[]> biases_;
void AdvanceInput(size_t size);
void AdvanceCodestream(size_t size);
Status RequestMoreInput();
Status GetCodestreamInput(const uint8_t** data, size_t* len);
Status ProcessMarker(const uint8_t* data, size_t len, size_t* pos);
Status ProcessSOF(const uint8_t* data, size_t len);
Status ProcessSOS(const uint8_t* data, size_t len);
Status ProcessDHT(const uint8_t* data, size_t len);
Status ProcessDQT(const uint8_t* data, size_t len);
Status ProcessDRI(const uint8_t* data, size_t len);
Status ProcessAPP(const uint8_t* data, size_t len);
Status ProcessCOM(const uint8_t* data, size_t len);
Status ProcessScan(const uint8_t* data, size_t len, size_t* pos);
void SaveMCUCodingState();
void RestoreMCUCodingState();
void PrepareForOutput();
void ProcessOutput(size_t* num_output_rows, size_t max_output_rows);
};
Status DecodeJpeg(const std::vector<uint8_t>& compressed,
JxlDataType output_data_type, ThreadPool* pool,
PackedPixelFile* ppf);
} // namespace extras
} // namespace jxl
#endif // LIB_EXTRAS_DECODE_JPEG_H_

View File

@ -1,190 +0,0 @@
// Copyright (c) the JPEG XL Project Authors. All rights reserved.
//
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
#include "lib/extras/decode_jpeg.h"
#include <stddef.h>
#include <stdio.h>
#if JPEGXL_ENABLE_JPEG
#include "lib/extras/dec/jpg.h"
#endif
#include "lib/jxl/test_utils.h"
#include "lib/jxl/testdata.h"
namespace jxl {
namespace extras {
namespace {
using test::DistanceRMS;
struct TestConfig {
std::string fn;
std::string fn_desc;
size_t chunk_size;
size_t max_output_lines;
};
class DecodeJpegTestParam : public ::testing::TestWithParam<TestConfig> {};
TEST_P(DecodeJpegTestParam, Streaming) {
TestConfig config = GetParam();
const PaddedBytes compressed = ReadTestData(config.fn.c_str());
#if JPEGXL_ENABLE_JPEG
PackedPixelFile ppf_libjpeg;
EXPECT_TRUE(
DecodeImageJPG(Span<const uint8_t>(compressed.data(), compressed.size()),
ColorHints(), SizeConstraints(), &ppf_libjpeg));
ASSERT_EQ(1, ppf_libjpeg.frames.size());
#endif
JpegDecoder dec;
size_t chunk_size = config.chunk_size;
if (chunk_size == 0) chunk_size = compressed.size();
size_t pos = std::min(chunk_size, compressed.size());
ASSERT_EQ(JpegDecoder::Status::kSuccess,
dec.SetInput(compressed.data(), pos));
JpegDecoder::Status status;
for (;;) {
status = dec.ReadHeaders();
if (status == JpegDecoder::Status::kNeedMoreInput) {
ASSERT_LT(pos, compressed.size());
size_t len = std::min(chunk_size, compressed.size() - pos);
ASSERT_EQ(JpegDecoder::Status::kSuccess,
dec.SetInput(compressed.data() + pos, len));
pos += len;
continue;
}
ASSERT_EQ(status, JpegDecoder::Status::kSuccess);
break;
}
#if JPEGXL_ENABLE_JPEG
EXPECT_EQ(ppf_libjpeg.info.xsize, dec.xsize());
EXPECT_EQ(ppf_libjpeg.info.ysize, dec.ysize());
EXPECT_EQ(ppf_libjpeg.info.num_color_channels, dec.num_channels());
#endif
JxlPixelFormat format = {static_cast<uint32_t>(dec.num_channels()),
JXL_TYPE_UINT8, JXL_BIG_ENDIAN, 0};
PackedImage output(dec.xsize(), dec.ysize(), format);
ASSERT_EQ(JpegDecoder::Status::kSuccess, dec.SetOutput(&output));
size_t max_output_lines = config.max_output_lines;
if (max_output_lines == 0) max_output_lines = dec.ysize();
size_t total_output_lines = 0;
while (total_output_lines < dec.ysize()) {
size_t num_output_lines = 0;
status = dec.ReadScanLines(&num_output_lines, max_output_lines);
total_output_lines += num_output_lines;
if (status == JpegDecoder::Status::kNeedMoreInput) {
ASSERT_LT(pos, compressed.size());
size_t len = std::min(chunk_size, compressed.size() - pos);
ASSERT_EQ(JpegDecoder::Status::kSuccess,
dec.SetInput(compressed.data() + pos, len));
pos += len;
continue;
}
ASSERT_EQ(status, JpegDecoder::Status::kSuccess);
if (total_output_lines < dec.ysize()) {
EXPECT_EQ(num_output_lines, max_output_lines);
}
}
#if JPEGXL_ENABLE_JPEG
const PackedImage& output_libjpeg = ppf_libjpeg.frames[0].color;
ASSERT_EQ(output.xsize, output_libjpeg.xsize);
ASSERT_EQ(output.ysize, output_libjpeg.ysize);
EXPECT_LE(
DistanceRMS(reinterpret_cast<const uint8_t*>(output.pixels()),
reinterpret_cast<const uint8_t*>(output_libjpeg.pixels()),
output.xsize, output.ysize, output.format),
0.0075);
#endif
}
std::vector<TestConfig> GenerateTests() {
std::vector<TestConfig> all_tests;
{
std::vector<std::pair<std::string, std::string>> testfiles({
{"jxl/flower/flower.png.im_q85_444.jpg", "Q85YUV444"},
{"jxl/flower/flower.png.im_q85_420.jpg", "Q85YUV420"},
{"jxl/flower/flower.png.im_q85_420_progr.jpg", "Q85YUV420PROGR"},
{"jxl/flower/flower.png.im_q85_420_R13B.jpg", "Q85YUV420R13B"},
});
for (const auto& it : testfiles) {
for (size_t chunk_size : {0, 1, 64, 65536}) {
for (size_t max_output_lines : {0, 1, 8, 16}) {
TestConfig config;
config.fn = it.first;
config.fn_desc = it.second;
config.chunk_size = chunk_size;
config.max_output_lines = max_output_lines;
all_tests.push_back(config);
}
}
}
}
{
std::vector<std::pair<std::string, std::string>> testfiles({
{"jxl/flower/flower.png.im_q85_422.jpg", "Q85YUV422"},
{"jxl/flower/flower.png.im_q85_440.jpg", "Q85YUV440"},
{"jxl/flower/flower.png.im_q85_444_1x2.jpg", "Q85YUV444_1x2"},
{"jxl/flower/flower.png.im_q85_asymmetric.jpg", "Q85Asymmetric"},
{"jxl/flower/flower.png.im_q85_gray.jpg", "Q85Gray"},
{"jxl/flower/flower.png.im_q85_luma_subsample.jpg", "Q85LumaSubsample"},
{"jxl/flower/flower.png.im_q85_rgb.jpg", "Q85RGB"},
{"jxl/flower/flower.png.im_q85_rgb_subsample_blue.jpg",
"Q85RGBSubsampleBlue"},
});
for (const auto& it : testfiles) {
for (size_t chunk_size : {0, 64}) {
for (size_t max_output_lines : {0, 16}) {
TestConfig config;
config.fn = it.first;
config.fn_desc = it.second;
config.chunk_size = chunk_size;
config.max_output_lines = max_output_lines;
all_tests.push_back(config);
}
}
}
}
return all_tests;
}
std::ostream& operator<<(std::ostream& os, const TestConfig& c) {
os << c.fn_desc;
if (c.chunk_size == 0) {
os << "CompleteInput";
} else {
os << "InputChunks" << c.chunk_size;
}
if (c.max_output_lines == 0) {
os << "CompleteOutput";
} else {
os << "OutputLines" << c.max_output_lines;
}
return os;
}
std::string TestDescription(
const testing::TestParamInfo<DecodeJpegTestParam::ParamType>& info) {
std::stringstream name;
name << info.param;
return name.str();
}
JXL_GTEST_INSTANTIATE_TEST_SUITE_P(DecodeJpegTest, DecodeJpegTestParam,
testing::ValuesIn(GenerateTests()),
TestDescription);
} // namespace
} // namespace extras
} // namespace jxl
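The removed test above exercises the streaming JpegDecoder interface: feed a chunk with SetInput, loop on ReadHeaders until it stops reporting kNeedMoreInput, attach a PackedImage with SetOutput, then pull rows with ReadScanLines, refilling the input whenever the decoder runs dry. Below is a condensed, non-gtest sketch of that loop using only the calls visible in the test; it assumes PackedImage and JxlPixelFormat are available through lib/extras/decode_jpeg.h, as they were for the test.

#include <algorithm>
#include <cstddef>
#include <cstdint>

#include "lib/extras/decode_jpeg.h"

bool StreamingDecode(const uint8_t* data, size_t size, size_t chunk_size) {
  using jxl::extras::JpegDecoder;
  JpegDecoder dec;
  size_t pos = std::min(chunk_size, size);
  if (dec.SetInput(data, pos) != JpegDecoder::Status::kSuccess) return false;

  // Parse headers, feeding another chunk whenever the decoder asks for more.
  for (;;) {
    JpegDecoder::Status status = dec.ReadHeaders();
    if (status == JpegDecoder::Status::kNeedMoreInput && pos < size) {
      size_t len = std::min(chunk_size, size - pos);
      if (dec.SetInput(data + pos, len) != JpegDecoder::Status::kSuccess) {
        return false;
      }
      pos += len;
      continue;
    }
    if (status != JpegDecoder::Status::kSuccess) return false;
    break;
  }

  // Decode scanlines into an interleaved 8-bit buffer, refilling input as
  // needed, exactly as the parameterized test above does.
  JxlPixelFormat format = {static_cast<uint32_t>(dec.num_channels()),
                           JXL_TYPE_UINT8, JXL_BIG_ENDIAN, 0};
  jxl::extras::PackedImage output(dec.xsize(), dec.ysize(), format);
  if (dec.SetOutput(&output) != JpegDecoder::Status::kSuccess) return false;

  size_t total_output_lines = 0;
  while (total_output_lines < dec.ysize()) {
    size_t num_output_lines = 0;
    JpegDecoder::Status status =
        dec.ReadScanLines(&num_output_lines, dec.ysize());
    total_output_lines += num_output_lines;
    if (status == JpegDecoder::Status::kNeedMoreInput && pos < size) {
      size_t len = std::min(chunk_size, size - pos);
      if (dec.SetInput(data + pos, len) != JpegDecoder::Status::kSuccess) {
        return false;
      }
      pos += len;
      continue;
    }
    if (status != JpegDecoder::Status::kSuccess) return false;
  }
  return true;
}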

View File

@ -27,8 +27,9 @@ namespace jxl {
namespace extras {
namespace HWY_NAMESPACE {
void ComputeDCTCoefficients(const Image3F& opsin, const ImageF& qf,
const FrameDimensions& frame_dim, const float* qm,
void ComputeDCTCoefficients(const Image3F& opsin, const bool xyb,
const ImageF& qf, const FrameDimensions& frame_dim,
const float* qm,
std::vector<jpeg::JPEGComponent>* components) {
int max_samp_factor = 1;
for (const auto& c : *components) {
@ -75,7 +76,11 @@ void ComputeDCTCoefficients(const Image3F& opsin, const ImageF& qf,
block[ix * 8 + iy] = cc;
}
}
block[0] = std::round((2040 * dct[0] - 1024) * qmc[0]);
if (xyb) {
// ToXYB does not create zero-centered sample values like RgbToYcbcr
// does, so we apply an offset to the DC values instead.
block[0] = std::round((2040 * dct[0] - 1024) * qmc[0]);
}
}
}
}
@ -95,12 +100,7 @@ HWY_EXPORT(ComputeDCTCoefficients);
namespace {
std::vector<uint8_t> CreateXybICCAppMarker() {
ColorEncoding c_xyb;
c_xyb.SetColorSpace(ColorSpace::kXYB);
c_xyb.rendering_intent = RenderingIntent::kPerceptual;
JXL_CHECK(c_xyb.CreateICC());
const auto& icc = c_xyb.ICC();
std::vector<uint8_t> CreateICCAppMarker(const PaddedBytes& icc) {
std::vector<uint8_t> icc_marker(17 + icc.size());
// See the APP2 marker format for embedded ICC profile at
// https://www.color.org/technotes/ICC-Technote-ProfileEmbedding.pdf
@ -116,7 +116,15 @@ std::vector<uint8_t> CreateXybICCAppMarker() {
return icc_marker;
}
static constexpr float kBaseQuantMatrix[] = {
std::vector<uint8_t> CreateXybICCAppMarker() {
ColorEncoding c_xyb;
c_xyb.SetColorSpace(ColorSpace::kXYB);
c_xyb.rendering_intent = RenderingIntent::kPerceptual;
JXL_CHECK(c_xyb.CreateICC());
return CreateICCAppMarker(c_xyb.ICC());
}
static constexpr float kBaseQuantMatrixXYB[] = {
// c = 0
0.010745695802f,
0.014724285860f,
@ -314,9 +322,45 @@ static constexpr float kBaseQuantMatrix[] = {
0.047241950370f,
};
void AddJpegQuantMatrices(const ImageF& qf, float dc_quant, float global_scale,
// Y: mozjpeg q99; Cb, Cr: mozjpeg q95
static constexpr float kBaseQuantMatrixYCbCr[] = {
// c = 0
1, 1, 1, 1, 1, 1, 1, 2, //
1, 1, 1, 1, 1, 1, 1, 2, //
1, 1, 1, 1, 1, 1, 2, 3, //
1, 1, 1, 1, 1, 1, 2, 3, //
1, 1, 1, 1, 1, 2, 3, 4, //
1, 1, 1, 1, 2, 2, 3, 5, //
1, 1, 2, 2, 3, 3, 5, 6, //
2, 2, 3, 3, 4, 5, 6, 8, //
// c = 1
2, 2, 2, 2, 3, 4, 6, 9, //
2, 2, 2, 3, 3, 4, 5, 8, //
2, 2, 2, 3, 4, 6, 9, 14, //
2, 3, 3, 4, 5, 7, 11, 16, //
3, 3, 4, 5, 7, 9, 13, 19, //
4, 4, 6, 7, 9, 12, 17, 24, //
6, 5, 9, 11, 13, 17, 23, 31, //
9, 8, 14, 16, 19, 24, 31, 42, //
// c = 2
2, 2, 2, 2, 3, 4, 6, 9, //
2, 2, 2, 3, 3, 4, 5, 8, //
2, 2, 2, 3, 4, 6, 9, 14, //
2, 3, 3, 4, 5, 7, 11, 16, //
3, 3, 4, 5, 7, 9, 13, 19, //
4, 4, 6, 7, 9, 12, 17, 24, //
6, 5, 9, 11, 13, 17, 23, 31, //
9, 8, 14, 16, 19, 24, 31, 42, //
};
void AddJpegQuantMatrices(const ImageF& qf, bool xyb, float dc_quant,
float global_scale,
std::vector<jpeg::JPEGQuantTable>* quant_tables,
float* qm) {
const float* const base_quant_matrix =
xyb ? kBaseQuantMatrixXYB : kBaseQuantMatrixYCbCr;
// Scale the base quant matrix based on the scaled XYB scales and the quant
// field.
float qfmin, qfmax;
@ -324,10 +368,10 @@ void AddJpegQuantMatrices(const ImageF& qf, float dc_quant, float global_scale,
const float dc_scale = global_scale / dc_quant;
const float ac_scale = global_scale / qfmax;
for (size_t c = 0, ix = 0; c < 3; c++) {
qm[ix] = dc_scale * kBaseQuantMatrix[ix];
qm[ix] = dc_scale * base_quant_matrix[ix];
ix++;
for (size_t j = 1; j < kDCTBlockSize; j++, ix++) {
qm[ix] = ac_scale * kBaseQuantMatrix[ix];
qm[ix] = ac_scale * base_quant_matrix[ix];
}
}
@ -514,26 +558,37 @@ void SetJpegHuffmanCode(const JpegClusteredHistograms& clusters,
}
void FillJPEGData(const Image3F& opsin, const ImageF& qf, float dc_quant,
float global_scale, const bool subsample_blue,
const FrameDimensions& frame_dim, jpeg::JPEGData* out) {
float global_scale, const bool xyb, const bool subsample_blue,
const PaddedBytes& icc, const FrameDimensions& frame_dim,
jpeg::JPEGData* out) {
*out = jpeg::JPEGData();
// ICC
out->marker_order.push_back(0xe2);
out->app_data.push_back(CreateXybICCAppMarker());
if (xyb) {
out->app_data.push_back(CreateXybICCAppMarker());
} else {
out->app_data.push_back(CreateICCAppMarker(icc));
}
// DQT
out->marker_order.emplace_back(0xdb);
float qm[3 * kDCTBlockSize];
AddJpegQuantMatrices(qf, dc_quant, global_scale, &out->quant, qm);
AddJpegQuantMatrices(qf, xyb, dc_quant, global_scale, &out->quant, qm);
// SOF
out->marker_order.emplace_back(0xc2);
out->components.resize(3);
out->height = frame_dim.ysize;
out->width = frame_dim.xsize;
out->components[0].id = 'R';
out->components[1].id = 'G';
out->components[2].id = 'B';
if (xyb) {
out->components[0].id = 'R';
out->components[1].id = 'G';
out->components[2].id = 'B';
} else {
out->components[0].id = 1;
out->components[1].id = 2;
out->components[2].id = 3;
}
size_t max_samp_factor = subsample_blue ? 2 : 1;
for (size_t c = 0; c < 3; ++c) {
const size_t factor = (subsample_blue && c == 2) ? 2 : 1;
@ -546,7 +601,7 @@ void FillJPEGData(const Image3F& opsin, const ImageF& qf, float dc_quant,
out->components[c].quant_idx = c;
}
HWY_DYNAMIC_DISPATCH(ComputeDCTCoefficients)
(opsin, qf, frame_dim, qm, &out->components);
(opsin, xyb, qf, frame_dim, qm, &out->components);
// DHT (the actual Huffman codes will be added later).
out->marker_order.emplace_back(0xc4);
@ -635,9 +690,9 @@ size_t JpegSize(const jpeg::JPEGData& jpeg_data) {
} // namespace
Status EncodeJpeg(const ImageBundle& input, size_t target_size, float distance,
Status EncodeJpeg(const ImageBundle& input, const JpegSettings& jpeg_settings,
ThreadPool* pool, std::vector<uint8_t>* compressed) {
const bool subsample_blue = true;
const bool subsample_blue = jpeg_settings.xyb;
const size_t max_shift = subsample_blue ? 1 : 0;
FrameDimensions frame_dim;
frame_dim.Set(input.xsize(), input.ysize(), 1, max_shift, max_shift, false,
@ -651,17 +706,35 @@ Status EncodeJpeg(const ImageBundle& input, size_t target_size, float distance,
// Compute adaptive quant field.
ImageF mask;
ImageF qf = InitialQuantField(distance, opsin, frame_dim, pool, 1.0, &mask);
ScaleXYB(&opsin);
ImageF qf = InitialQuantField(jpeg_settings.distance, opsin, frame_dim, pool,
1.0, &mask);
if (jpeg_settings.xyb) {
ScaleXYB(&opsin);
} else {
opsin.ShrinkTo(input.xsize(), input.ysize());
JXL_RETURN_IF_ERROR(RgbToYcbcr(
input.color().Plane(0), input.color().Plane(1), input.color().Plane(2),
&opsin.Plane(0), &opsin.Plane(1), &opsin.Plane(2), pool));
PadImageToBlockMultipleInPlace(&opsin, 8 << max_shift);
}
// Create jpeg data and optimize Huffman codes.
jpeg::JPEGData jpeg_data;
float global_scale = 0.66f;
float dc_quant = InitialQuantDC(distance);
FillJPEGData(opsin, qf, dc_quant, global_scale, subsample_blue, frame_dim,
&jpeg_data);
if (!jpeg_settings.xyb) {
global_scale /= 500;
if (input.metadata()->color_encoding.tf.IsPQ()) {
global_scale *= .4f;
} else if (input.metadata()->color_encoding.tf.IsHLG()) {
global_scale *= .5f;
}
}
float dc_quant = InitialQuantDC(jpeg_settings.distance);
FillJPEGData(opsin, qf, dc_quant, global_scale, jpeg_settings.xyb,
subsample_blue, input.metadata()->color_encoding.ICC(),
frame_dim, &jpeg_data);
if (target_size != 0) {
if (jpeg_settings.target_size != 0) {
// Tweak the jpeg data so that the resulting compressed file is
// approximately target_size long.
size_t prev_size = 0;
@ -670,7 +743,7 @@ Status EncodeJpeg(const ImageBundle& input, size_t target_size, float distance,
size_t iter = 0;
for (;;) {
size_t size = JpegSize(jpeg_data);
float error = size * 1.0f / target_size - 1.0f;
float error = size * 1.0f / jpeg_settings.target_size - 1.0f;
if (std::abs(error) < std::abs(best_error)) {
best_error = error;
best_global_scale = global_scale;
@ -679,13 +752,15 @@ Status EncodeJpeg(const ImageBundle& input, size_t target_size, float distance,
break;
}
global_scale *= 1.0f + error;
FillJPEGData(opsin, qf, dc_quant, global_scale, subsample_blue, frame_dim,
&jpeg_data);
FillJPEGData(opsin, qf, dc_quant, global_scale, jpeg_settings.xyb,
subsample_blue, input.metadata()->color_encoding.ICC(),
frame_dim, &jpeg_data);
prev_size = size;
++iter;
}
if (best_global_scale != global_scale) {
FillJPEGData(opsin, qf, dc_quant, best_global_scale, subsample_blue,
FillJPEGData(opsin, qf, dc_quant, best_global_scale, jpeg_settings.xyb,
subsample_blue, input.metadata()->color_encoding.ICC(),
frame_dim, &jpeg_data);
}
}
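The target-size loop in EncodeJpeg above is a simple proportional search: each pass measures the serialized size, computes error = size / target_size - 1, remembers the global_scale with the smallest |error|, and multiplies global_scale by (1 + error) for the next pass, so a result 20% too large raises the quantization scale by 1.2x. A minimal standalone sketch of that update rule follows; MeasureSize stands in for the FillJPEGData + JpegSize round trip, and the convergence check is only a stand-in because the loop's actual break condition falls in lines not shown in this hunk.

#include <cmath>
#include <cstddef>
#include <functional>

// MeasureSize(global_scale) is a placeholder for "FillJPEGData with this
// scale, then JpegSize(jpeg_data)" from the code above.
float FitGlobalScale(const std::function<size_t(float)>& MeasureSize,
                     float global_scale, size_t target_size, int max_iters) {
  float best_global_scale = global_scale;
  float best_error = 1e10f;
  size_t prev_size = 0;
  for (int iter = 0; iter < max_iters; ++iter) {
    size_t size = MeasureSize(global_scale);
    float error = size * 1.0f / target_size - 1.0f;
    if (std::abs(error) < std::abs(best_error)) {
      best_error = error;
      best_global_scale = global_scale;
    }
    // Stand-in convergence check; the real loop's break condition is in
    // lines not shown in the hunk above.
    if (size == prev_size || std::abs(error) < 0.01f) break;
    // E.g. 120 kB produced against a 100 kB target gives error = 0.2, so the
    // scale grows by 1.2x and the next pass quantizes more coarsely.
    global_scale *= 1.0f + error;
    prev_size = size;
  }
  return best_global_scale;
}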

View File

@ -16,7 +16,13 @@
namespace jxl {
namespace extras {
Status EncodeJpeg(const ImageBundle& input, size_t target_size, float distance,
struct JpegSettings {
bool xyb = true;
size_t target_size = 0;
float distance = 1.f;
};
Status EncodeJpeg(const ImageBundle& input, const JpegSettings& jpeg_settings,
ThreadPool* pool, std::vector<uint8_t>* compressed);
} // namespace extras
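A hypothetical call site for the new entry point, assuming the caller already has a jxl::ImageBundle and a jxl::ThreadPool; libjxl-internal includes are omitted because their header paths are not shown in this hunk.

#include <cstdint>
#include <vector>

// jxl::ImageBundle, jxl::ThreadPool and jxl::Status come from libjxl-internal
// headers whose paths are not visible here, so their includes are omitted.
jxl::Status EncodeToYCbCrJpeg(const jxl::ImageBundle& input,
                              jxl::ThreadPool* pool,
                              std::vector<uint8_t>* compressed) {
  jxl::extras::JpegSettings settings;
  settings.xyb = false;      // classic YCbCr JPEG instead of an XYB JPEG
  settings.distance = 1.0f;  // quality knob, same meaning as before
  settings.target_size = 0;  // 0 leaves the target-size search disabled
  return jxl::extras::EncodeJpeg(input, settings, pool, compressed);
}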

View File

@ -58,7 +58,6 @@ Status ConvertPackedFrameToImageBundle(const JxlBasicInfo& info,
JXL_RETURN_IF_ERROR(ConvertFromExternal(
span, frame.color.xsize, frame.color.ysize, io.metadata.m.color_encoding,
/*alpha_is_premultiplied=*/info.alpha_premultiplied,
frame_bits_per_sample, frame.color.format, pool, bundle));
bundle->extra_channels().resize(io.metadata.m.extra_channel_info.size());

View File

@ -168,16 +168,6 @@ typedef enum {
*/
JXL_DEC_NEED_PREVIEW_OUT_BUFFER = 3,
/** The decoder is able to decode a DC image and requests setting a DC output
* buffer using @ref JxlDecoderSetDCOutBuffer. This occurs if @ref
* JXL_DEC_DC_IMAGE is requested and it is possible to decode a DC image from
* the codestream and the DC out buffer was not yet set. This event re-occurs
* for new frames if there are multiple animation frames.
* @deprecated The DC feature in this form will be removed. For progressive
* rendering, @ref JxlDecoderFlushImage should be used.
*/
JXL_DEC_NEED_DC_OUT_BUFFER = 4,
/** The decoder requests an output buffer to store the full resolution image,
* which can be set with @ref JxlDecoderSetImageOutBuffer or with @ref
* JxlDecoderSetImageOutCallback. This event re-occurs for new frames if
@ -260,28 +250,12 @@ typedef enum {
*/
JXL_DEC_FRAME = 0x400,
/** Informative event by @ref JxlDecoderProcessInput
* "JxlDecoderProcessInput": DC image, 8x8 sub-sampled frame, decoded. It is
* not guaranteed that the decoder will always return DC separately, but when
* it does it will do so before outputting the full frame. @ref
* JxlDecoderSetDCOutBuffer must be used after getting the basic image
* information to be able to get the DC pixels, if not this return status only
* indicates we're past this point in the codestream. This event occurs max
* once per frame and always later than @ref JXL_DEC_FRAME and other header
* events and earlier than full resolution pixel data.
*
* @deprecated The DC feature in this form will be removed. For progressive
* rendering, @ref JxlDecoderFlushImage should be used.
*/
JXL_DEC_DC_IMAGE = 0x800,
/** Informative event by @ref JxlDecoderProcessInput
* "JxlDecoderProcessInput": full frame (or layer, in case coalescing is
* disabled) is decoded. @ref JxlDecoderSetImageOutBuffer must be used after
* getting the basic image information to be able to get the image pixels, if
* not this return status only indicates we're past this point in the
* codestream. This event occurs max once per frame and always later than @ref
* JXL_DEC_DC_IMAGE.
* codestream. This event occurs max once per frame.
* In this case, @ref JxlDecoderReleaseInput will return all bytes from the
* end of the frame (or if @ref JXL_DEC_JPEG_RECONSTRUCTION is subscribed to,
* from the end of the last box that is needed for jpeg reconstruction) as
@ -599,8 +573,6 @@ JXL_EXPORT JxlDecoderStatus JxlDecoderSetCoalescing(JxlDecoder* dec,
* available and this informative event is subscribed to.
* @return @ref JXL_DEC_PREVIEW_IMAGE when preview pixel information is
* available and output in the preview buffer.
* @return @ref JXL_DEC_DC_IMAGE when DC pixel information (8x8 downscaled
* version of the image) is available and output is in the DC buffer.
* @return @ref JXL_DEC_FULL_IMAGE when all pixel information at highest detail
* is available and has been output in the pixel buffer.
*/
@ -992,44 +964,6 @@ JXL_EXPORT JxlDecoderStatus JxlDecoderGetFrameName(const JxlDecoder* dec,
JXL_EXPORT JxlDecoderStatus JxlDecoderGetExtraChannelBlendInfo(
const JxlDecoder* dec, size_t index, JxlBlendInfo* blend_info);
/**
* Returns the minimum size in bytes of the DC image output buffer
* for the given format. This is the buffer for @ref JxlDecoderSetDCOutBuffer.
* Requires the basic image information is available in the decoder.
*
* @param dec decoder object
* @param format format of pixels
* @param size output value, buffer size in bytes
* @return @ref JXL_DEC_SUCCESS on success, @ref JXL_DEC_ERROR on error, such as
* information not available yet.
*
* @deprecated The DC feature in this form will be removed. Use @ref
* JxlDecoderFlushImage for progressive rendering.
*/
JXL_DEPRECATED JXL_EXPORT JxlDecoderStatus JxlDecoderDCOutBufferSize(
const JxlDecoder* dec, const JxlPixelFormat* format, size_t* size);
/**
* Sets the buffer to write the lower resolution (8x8 sub-sampled) DC image
* to. The size of the buffer must be at least as large as given by @ref
* JxlDecoderDCOutBufferSize. The buffer follows the format described by
* JxlPixelFormat. The DC image has dimensions ceil(xsize / 8) * ceil(ysize /
* 8). The buffer is owned by the caller.
*
* @param dec decoder object
* @param format format of pixels. Object owned by user and its contents are
* copied internally.
* @param buffer buffer type to output the pixel data to
* @param size size of buffer in bytes
* @return @ref JXL_DEC_SUCCESS on success, @ref JXL_DEC_ERROR on error, such as
* size too small.
*
* @deprecated The DC feature in this form will be removed. Use @ref
* JxlDecoderFlushImage for progressive rendering.
*/
JXL_DEPRECATED JXL_EXPORT JxlDecoderStatus JxlDecoderSetDCOutBuffer(
JxlDecoder* dec, const JxlPixelFormat* format, void* buffer, size_t size);
/**
* Returns the minimum size in bytes of the image output pixel buffer for the
* given format. This is the buffer for @ref JxlDecoderSetImageOutBuffer.

Some files were not shown because too many files have changed in this diff Show More
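The deprecation notes above consistently redirect DC-image users to JxlDecoderFlushImage for progressive rendering. An illustrative sketch of that pattern with the public decoder API follows; the RGBA/uint8 output format and the abbreviated error handling are just examples, not part of this change.

#include <jxl/decode.h>

#include <cstddef>
#include <cstdint>
#include <vector>

bool DecodeProgressively(const uint8_t* data, size_t size,
                         std::vector<uint8_t>* pixels) {
  JxlDecoder* dec = JxlDecoderCreate(nullptr);
  if (!dec) return false;
  if (JXL_DEC_SUCCESS !=
      JxlDecoderSubscribeEvents(dec, JXL_DEC_BASIC_INFO | JXL_DEC_FULL_IMAGE)) {
    JxlDecoderDestroy(dec);
    return false;
  }
  JxlDecoderSetInput(dec, data, size);
  JxlDecoderCloseInput(dec);  // no more input will arrive

  JxlPixelFormat format = {4, JXL_TYPE_UINT8, JXL_NATIVE_ENDIAN, 0};
  bool have_pixels = false;
  for (;;) {
    JxlDecoderStatus status = JxlDecoderProcessInput(dec);
    if (status == JXL_DEC_NEED_IMAGE_OUT_BUFFER) {
      size_t buffer_size;
      if (JXL_DEC_SUCCESS !=
          JxlDecoderImageOutBufferSize(dec, &format, &buffer_size)) {
        break;
      }
      pixels->resize(buffer_size);
      if (JXL_DEC_SUCCESS !=
          JxlDecoderSetImageOutBuffer(dec, &format, pixels->data(),
                                      pixels->size())) {
        break;
      }
    } else if (status == JXL_DEC_NEED_MORE_INPUT) {
      // Truncated input: flush whatever has been decoded so far into the
      // output buffer, which replaces the removed DC-image path.
      have_pixels = (JXL_DEC_SUCCESS == JxlDecoderFlushImage(dec));
      break;
    } else if (status == JXL_DEC_FULL_IMAGE) {
      have_pixels = true;  // full-resolution pixels are now in *pixels
    } else if (status == JXL_DEC_SUCCESS) {
      break;  // all subscribed events have been delivered
    } else if (status == JXL_DEC_BASIC_INFO) {
      // Dimensions etc. could be read with JxlDecoderGetBasicInfo; omitted.
    } else {
      break;  // JXL_DEC_ERROR or an unexpected event
    }
  }
  JxlDecoderDestroy(dec);
  return have_pixels;
}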