[libc] New version of the mem* framework
This version is more composable and also simpler, at the expense of being more explicit and more verbose. It also provides minimal implementations for ARM platforms. Codegen can be checked at https://godbolt.org/z/x19zvE59v

Differential Revision: https://reviews.llvm.org/D135134
Parent: 5a859432f3
Commit: 9721687835
@@ -33,7 +33,7 @@ void StringWriter::write(char new_char, size_t len) {
     len = available_capacity;
 
   if (len > 0) {
-    inline_memset(cur_buffer, new_char, len);
+    inline_memset(cur_buffer, static_cast<uint8_t>(new_char), len);
     cur_buffer += len;
     available_capacity -= len;
   }
@@ -14,8 +14,8 @@ namespace __llvm_libc {
 
 LLVM_LIBC_FUNCTION(int, bcmp,
                    (const void *lhs, const void *rhs, size_t count)) {
-  return inline_bcmp(static_cast<const char *>(lhs),
-                     static_cast<const char *>(rhs), count);
+  return static_cast<int>(inline_bcmp(static_cast<const char *>(lhs),
+                                      static_cast<const char *>(rhs), count));
 }
 
 } // namespace __llvm_libc
@@ -15,8 +15,8 @@ namespace __llvm_libc {
 
 LLVM_LIBC_FUNCTION(int, memcmp,
                    (const void *lhs, const void *rhs, size_t count)) {
-  return inline_memcmp(static_cast<const char *>(lhs),
-                       static_cast<const char *>(rhs), count);
+  return static_cast<int>(inline_memcmp(static_cast<const char *>(lhs),
+                                        static_cast<const char *>(rhs), count));
 }
 
 } // namespace __llvm_libc
@@ -9,36 +9,104 @@
 #include "src/string/memmove.h"
 
 #include "src/__support/common.h"
-#include "src/__support/integer_operations.h"
-#include "src/string/memory_utils/elements.h"
+#include "src/string/memory_utils/op_aarch64.h"
+#include "src/string/memory_utils/op_builtin.h"
+#include "src/string/memory_utils/op_generic.h"
+#include "src/string/memory_utils/op_x86.h"
 #include <stddef.h> // size_t, ptrdiff_t
 
+#include <stdio.h>
+
 namespace __llvm_libc {
 
-static inline void inline_memmove(char *dst, const char *src, size_t count) {
-  using namespace __llvm_libc::scalar;
+[[maybe_unused]] static inline void
+inline_memmove_embedded_tiny(Ptr dst, CPtr src, size_t count) {
+  if ((count == 0) || (dst == src))
+    return;
+  if (dst < src) {
+#pragma nounroll
+    for (size_t offset = 0; offset < count; ++offset)
+      builtin::Memcpy<1>::block(dst + offset, src + offset);
+  } else {
+#pragma nounroll
+    for (ptrdiff_t offset = count; offset >= 0; --offset)
+      builtin::Memcpy<1>::block(dst + offset, src + offset);
+  }
+}
+
+template <size_t MaxSize>
+[[maybe_unused]] static inline void inline_memmove_generic(Ptr dst, CPtr src,
+                                                           size_t count) {
   if (count == 0)
     return;
   if (count == 1)
-    return move<_1>(dst, src);
+    return generic::Memmove<1, MaxSize>::block(dst, src);
   if (count <= 4)
-    return move<HeadTail<_2>>(dst, src, count);
+    return generic::Memmove<2, MaxSize>::head_tail(dst, src, count);
   if (count <= 8)
-    return move<HeadTail<_4>>(dst, src, count);
+    return generic::Memmove<4, MaxSize>::head_tail(dst, src, count);
   if (count <= 16)
-    return move<HeadTail<_8>>(dst, src, count);
+    return generic::Memmove<8, MaxSize>::head_tail(dst, src, count);
  if (count <= 32)
-    return move<HeadTail<_16>>(dst, src, count);
+    return generic::Memmove<16, MaxSize>::head_tail(dst, src, count);
   if (count <= 64)
-    return move<HeadTail<_32>>(dst, src, count);
+    return generic::Memmove<32, MaxSize>::head_tail(dst, src, count);
   if (count <= 128)
-    return move<HeadTail<_64>>(dst, src, count);
+    return generic::Memmove<64, MaxSize>::head_tail(dst, src, count);
+  if (dst < src) {
+    generic::Memmove<32, MaxSize>::template align_forward<Arg::Src>(dst, src,
+                                                                    count);
+    return generic::Memmove<64, MaxSize>::loop_and_tail_forward(dst, src,
+                                                                count);
+  } else {
+    generic::Memmove<32, MaxSize>::template align_backward<Arg::Src>(dst, src,
+                                                                     count);
+    return generic::Memmove<64, MaxSize>::loop_and_tail_backward(dst, src,
+                                                                 count);
+  }
+}
 
-  using AlignedMoveLoop = Align<_16, Arg::Src>::Then<Loop<_64>>;
-  if (dst < src)
-    return move<AlignedMoveLoop>(dst, src, count);
-  else if (dst > src)
-    return move_backward<AlignedMoveLoop>(dst, src, count);
+static inline void inline_memmove(Ptr dst, CPtr src, size_t count) {
+#if defined(LLVM_LIBC_ARCH_X86) || defined(LLVM_LIBC_ARCH_AARCH64)
+#if defined(LLVM_LIBC_ARCH_X86)
+  static constexpr size_t kMaxSize = x86::kAvx512F ? 64
+                                     : x86::kAvx   ? 32
+                                     : x86::kSse2  ? 16
+                                                   : 8;
+#elif defined(LLVM_LIBC_ARCH_AARCH64)
+  static constexpr size_t kMaxSize = aarch64::kNeon ? 16 : 8;
+#endif
+  // return inline_memmove_generic<kMaxSize>(dst, src, count);
+  if (count == 0)
+    return;
+  if (count == 1)
+    return generic::Memmove<1, kMaxSize>::block(dst, src);
+  if (count <= 4)
+    return generic::Memmove<2, kMaxSize>::head_tail(dst, src, count);
+  if (count <= 8)
+    return generic::Memmove<4, kMaxSize>::head_tail(dst, src, count);
+  if (count <= 16)
+    return generic::Memmove<8, kMaxSize>::head_tail(dst, src, count);
+  if (count <= 32)
+    return generic::Memmove<16, kMaxSize>::head_tail(dst, src, count);
+  if (count <= 64)
+    return generic::Memmove<32, kMaxSize>::head_tail(dst, src, count);
+  if (count <= 128)
+    return generic::Memmove<64, kMaxSize>::head_tail(dst, src, count);
+  if (dst < src) {
+    generic::Memmove<32, kMaxSize>::align_forward<Arg::Src>(dst, src, count);
+    return generic::Memmove<64, kMaxSize>::loop_and_tail_forward(dst, src,
+                                                                 count);
+  } else {
+    generic::Memmove<32, kMaxSize>::align_backward<Arg::Src>(dst, src, count);
+    return generic::Memmove<64, kMaxSize>::loop_and_tail_backward(dst, src,
+                                                                  count);
+  }
+#elif defined(LLVM_LIBC_ARCH_ARM)
+  return inline_memmove_embedded_tiny(dst, src, count);
+#else
+#error "Unsupported platform"
+#endif
+}
 
 LLVM_LIBC_FUNCTION(void *, memmove,
@@ -2,13 +2,17 @@
 add_header_library(
   memory_utils
   HDRS
-    utils.h
-    elements.h
     bcmp_implementations.h
     bzero_implementations.h
     memcmp_implementations.h
     memcpy_implementations.h
     memset_implementations.h
+    op_aarch64.h
+    op_higher_order.h
+    op_builtin.h
+    op_generic.h
+    op_x86.h
+    utils.h
+  DEPS
+    libc.src.__support.CPP.bit
 )
libc/src/string/memory_utils/README.md (new file, 97 lines)
@@ -0,0 +1,97 @@
# The mem* framework

The framework handles the following mem* functions:
 - `memcpy`
 - `memmove`
 - `memset`
 - `bzero`
 - `bcmp`
 - `memcmp`

## Building blocks

These functions can be built out of a set of lower-level operations:
 - **`block`** : operates on a block of `SIZE` bytes.
 - **`tail`** : operates on the last `SIZE` bytes of the buffer (i.e., `[dst + count - SIZE, dst + count]`).
 - **`head_tail`** : operates on the first and last `SIZE` bytes. This is the same as calling `block` and `tail`.
 - **`loop_and_tail`** : calls `block` in a loop to consume as much as possible of the `count` bytes, and handles the remaining bytes with a `tail` operation.

As an illustration, let's take the example of a trivial `memset` implementation:

```C++
extern "C" void memset(char *dst, int value, size_t count) {
  if (count == 0) return;
  if (count == 1) return Memset<1>::block(dst, value);
  if (count == 2) return Memset<2>::block(dst, value);
  if (count == 3) return Memset<3>::block(dst, value);
  if (count <= 8) return Memset<4>::head_tail(dst, value, count);  // Note that 0 to 4 bytes are written twice.
  if (count <= 16) return Memset<8>::head_tail(dst, value, count); // Same here.
  return Memset<16>::loop_and_tail(dst, value, count);
}
```

Now let's have a look at the `Memset` structure:

```C++
template <size_t Size>
struct Memset {
  static constexpr size_t SIZE = Size;

  static inline void block(Ptr dst, uint8_t value) {
    // Implement me
  }

  static inline void tail(Ptr dst, uint8_t value, size_t count) {
    block(dst + count - SIZE, value);
  }

  static inline void head_tail(Ptr dst, uint8_t value, size_t count) {
    block(dst, value);
    tail(dst, value, count);
  }

  static inline void loop_and_tail(Ptr dst, uint8_t value, size_t count) {
    size_t offset = 0;
    do {
      block(dst + offset, value);
      offset += SIZE;
    } while (offset < count - SIZE);
    tail(dst, value, count);
  }
};
```

As you can see, `tail`, `head_tail` and `loop_and_tail` are higher-order functions that build on each other. Only `block` really needs to be implemented.
In earlier designs these higher-order functions were implemented as templated functions, but it turned out to be more readable to state the implementations explicitly.
**This design is useful because it provides customization points.** For instance, for `bcmp` on `aarch64` we can provide a better implementation of `head_tail` using vector reduction intrinsics.
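To make that customization point concrete, here is a minimal sketch of what such a NEON `head_tail` could look like. This is an illustration, not the implementation in this commit; the struct name `BcmpNeon16` is hypothetical, and only the NEON intrinsics themselves are real:

```C++
#include <arm_neon.h>
#include <stddef.h>
#include <stdint.h>

// Hypothetical 16-byte bcmp element with a NEON 'head_tail'.
struct BcmpNeon16 {
  static constexpr size_t SIZE = 16;
  // Returns zero iff the first and last SIZE bytes of both buffers match.
  // Precondition: SIZE <= count <= 2 * SIZE.
  static inline uint32_t head_tail(const char *p1, const char *p2,
                                   size_t count) {
    const uint8_t *a = (const uint8_t *)p1;
    const uint8_t *b = (const uint8_t *)p2;
    const uint8x16_t head = veorq_u8(vld1q_u8(a), vld1q_u8(b));
    const uint8x16_t tail = veorq_u8(vld1q_u8(a + count - SIZE),
                                     vld1q_u8(b + count - SIZE));
    // One horizontal max reduction over the combined XOR detects any
    // differing byte in either the head or the tail.
    return vmaxvq_u8(vorrq_u8(head, tail));
  }
};
```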

## Scoped specializations

We can have several specializations of the `Memset` structure. Depending on the target requirements, we can use one or several scopes for the same implementation.

In the following example we use the `generic` implementation for the small sizes but the `x86` implementation for the loop.
```C++
extern "C" void memset(char *dst, int value, size_t count) {
  if (count == 0) return;
  if (count == 1) return generic::Memset<1>::block(dst, value);
  if (count == 2) return generic::Memset<2>::block(dst, value);
  if (count == 3) return generic::Memset<3>::block(dst, value);
  if (count <= 8) return generic::Memset<4>::head_tail(dst, value, count);
  if (count <= 16) return generic::Memset<8>::head_tail(dst, value, count);
  return x86::Memset<16>::loop_and_tail(dst, value, count);
}
```

### The `builtin` scope

Ultimately we would like the compiler to provide the code for the `block` function. For this we rely on dedicated builtins available in Clang (e.g., [`__builtin_memset_inline`](https://clang.llvm.org/docs/LanguageExtensions.html#guaranteed-inlined-memset)).
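A minimal sketch of what a `builtin` scope `block` could look like, assuming a Clang recent enough to provide `__builtin_memset_inline` (the struct layout here is illustrative):

```C++
#include <stddef.h>
#include <stdint.h>

namespace builtin {
// Hypothetical 'builtin' scope block: defer entirely to the compiler.
template <size_t Size> struct Memset {
  static constexpr size_t SIZE = Size;
  static inline void block(char *dst, uint8_t value) {
    // Guaranteed to be expanded inline: it can never lower to a call back
    // into the C library, which matters when building libc itself.
    __builtin_memset_inline(dst, value, SIZE);
  }
};
} // namespace builtin
```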

### The `generic` scope

In this scope we define pure C++ implementations using native integral types and Clang vector extensions.
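As a sketch of the idea (illustrative names, assuming Clang's vector extensions), a `generic` block can splat the byte into a vector-sized value and issue a single unaligned store:

```C++
#include <stddef.h>
#include <stdint.h>

namespace generic {
// Hypothetical 'generic' scope block built on Clang vector extensions.
template <size_t Size> struct Memset {
  static constexpr size_t SIZE = Size;
  using Vector = uint8_t __attribute__((__vector_size__(Size)));
  static inline void block(char *dst, uint8_t value) {
    const Vector splat = Vector{} + value; // broadcast 'value' to all lanes
    // A single vector-sized store; the compiler picks the instruction
    // (e.g. SSE/AVX on x86, NEON on aarch64).
    __builtin_memcpy(dst, &splat, SIZE);
  }
};
} // namespace generic
```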

### The arch-specific scopes

Then come the implementations that use architecture- or microarchitecture-specific features (e.g., `rep;movsb` for `x86` or `dc zva` for `aarch64`).

The purpose here is to rely on builtins as much as possible and to fall back to `asm volatile` as a last resort.
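For example, an `asm volatile` fallback for `x86` string moves could look like the sketch below (the helper name `repmovsb` is hypothetical; the instruction and constraints are standard x86 inline assembly):

```C++
#include <stddef.h>

namespace x86 {
// Hypothetical helper: copy exactly 'count' bytes with x86 string moves.
static inline void repmovsb(void *dst, const void *src, size_t count) {
  // rdi/rsi/rcx are read and updated by the instruction, hence the "+"
  // constraints; "memory" tells the compiler memory may have changed.
  asm volatile("rep movsb"
               : "+D"(dst), "+S"(src), "+c"(count)
               :
               : "memory");
}
} // namespace x86
```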
@@ -11,49 +11,164 @@
 #include "src/__support/architectures.h"
 #include "src/__support/common.h"
-#include "src/string/memory_utils/elements.h"
+#include "src/string/memory_utils/op_aarch64.h"
+#include "src/string/memory_utils/op_builtin.h"
+#include "src/string/memory_utils/op_generic.h"
+#include "src/string/memory_utils/op_x86.h"
 
 #include <stddef.h> // size_t
 
 namespace __llvm_libc {
 
-// Fixed-size difference between 'lhs' and 'rhs'.
-template <typename Element> bool differs(const char *lhs, const char *rhs) {
-  return !Element::equals(lhs, rhs);
-}
-// Runtime-size difference between 'lhs' and 'rhs'.
-template <typename Element>
-bool differs(const char *lhs, const char *rhs, size_t size) {
-  return !Element::equals(lhs, rhs, size);
-}
-
-static inline int inline_bcmp(const char *lhs, const char *rhs, size_t count) {
-#if defined(LLVM_LIBC_ARCH_X86)
-  using namespace ::__llvm_libc::x86;
-#elif defined(LLVM_LIBC_ARCH_AARCH64)
-  using namespace ::__llvm_libc::aarch64;
-#else
-  using namespace ::__llvm_libc::scalar;
-#endif
-  if (count == 0)
-    return 0;
-  if (count == 1)
-    return differs<_1>(lhs, rhs);
-  if (count == 2)
-    return differs<_2>(lhs, rhs);
-  if (count == 3)
-    return differs<_3>(lhs, rhs);
-  if (count <= 8)
-    return differs<HeadTail<_4>>(lhs, rhs, count);
-  if (count <= 16)
-    return differs<HeadTail<_8>>(lhs, rhs, count);
-  if (count <= 32)
-    return differs<HeadTail<_16>>(lhs, rhs, count);
-  if (count <= 64)
-    return differs<HeadTail<_32>>(lhs, rhs, count);
-  if (count <= 128)
-    return differs<HeadTail<_64>>(lhs, rhs, count);
-  return differs<Align<_32>::Then<Loop<_32>>>(lhs, rhs, count);
-}
+[[maybe_unused]] static inline BcmpReturnType
+inline_bcmp_embedded_tiny(CPtr p1, CPtr p2, size_t count) {
+#pragma nounroll
+  for (size_t offset = 0; offset < count; ++offset)
+    if (auto value = generic::Bcmp<1>::block(p1 + offset, p2 + offset))
+      return value;
+  return BcmpReturnType::ZERO();
+}
+
+#if defined(LLVM_LIBC_ARCH_X86) || defined(LLVM_LIBC_ARCH_AARCH64)
+[[maybe_unused]] static inline BcmpReturnType
+inline_bcmp_generic_gt16(CPtr p1, CPtr p2, size_t count) {
+  if (count < 256)
+    return generic::Bcmp<16>::loop_and_tail(p1, p2, count);
+  if (auto value = generic::Bcmp<64>::block(p1, p2))
+    return value;
+  align_to_next_boundary<64, Arg::P1>(p1, p2, count);
+  return generic::Bcmp<64>::loop_and_tail(p1, p2, count);
+}
+#endif // defined(LLVM_LIBC_ARCH_X86) || defined(LLVM_LIBC_ARCH_AARCH64)
+
+#if defined(LLVM_LIBC_ARCH_X86)
+[[maybe_unused]] static inline BcmpReturnType
+inline_bcmp_x86_sse2_gt16(CPtr p1, CPtr p2, size_t count) {
+  if (count <= 32)
+    return x86::sse2::Bcmp<16>::head_tail(p1, p2, count);
+  if (count < 256)
+    return x86::sse2::Bcmp<16>::loop_and_tail(p1, p2, count);
+  if (auto value = x86::sse2::Bcmp<16>::block(p1, p2))
+    return value;
+  align_to_next_boundary<16, Arg::P1>(p1, p2, count);
+  return x86::sse2::Bcmp<64>::loop_and_tail(p1, p2, count);
+}
+
+[[maybe_unused]] static inline BcmpReturnType
+inline_bcmp_x86_avx2_gt16(CPtr p1, CPtr p2, size_t count) {
+  if (count <= 32)
+    return x86::sse2::Bcmp<16>::head_tail(p1, p2, count);
+  if (count <= 64)
+    return x86::avx2::Bcmp<32>::head_tail(p1, p2, count);
+  if (count <= 128)
+    return x86::avx2::Bcmp<64>::head_tail(p1, p2, count);
+  if (unlikely(count >= 256)) {
+    if (auto value = x86::avx2::Bcmp<64>::block(p1, p2))
+      return value;
+    align_to_next_boundary<64, Arg::P1>(p1, p2, count);
+  }
+  return x86::avx2::Bcmp<64>::loop_and_tail(p1, p2, count);
+}
+
+[[maybe_unused]] static inline BcmpReturnType
+inline_bcmp_x86_avx512bw_gt16(CPtr p1, CPtr p2, size_t count) {
+  if (count <= 32)
+    return x86::sse2::Bcmp<16>::head_tail(p1, p2, count);
+  if (count <= 64)
+    return x86::avx2::Bcmp<32>::head_tail(p1, p2, count);
+  if (count <= 128)
+    return x86::avx512bw::Bcmp<64>::head_tail(p1, p2, count);
+  if (unlikely(count >= 256)) {
+    if (auto value = x86::avx512bw::Bcmp<64>::block(p1, p2))
+      return value;
+    align_to_next_boundary<64, Arg::P1>(p1, p2, count);
+  }
+  return x86::avx512bw::Bcmp<64>::loop_and_tail(p1, p2, count);
+}
+
+[[maybe_unused]] static inline BcmpReturnType inline_bcmp_x86(CPtr p1, CPtr p2,
+                                                              size_t count) {
+  if (count == 0)
+    return BcmpReturnType::ZERO();
+  if (count == 1)
+    return generic::Bcmp<1>::block(p1, p2);
+  if (count == 2)
+    return generic::Bcmp<2>::block(p1, p2);
+  if (count <= 4)
+    return generic::Bcmp<2>::head_tail(p1, p2, count);
+  if (count <= 8)
+    return generic::Bcmp<4>::head_tail(p1, p2, count);
+  if (count <= 16)
+    return generic::Bcmp<8>::head_tail(p1, p2, count);
+  if constexpr (x86::kAvx512BW)
+    return inline_bcmp_x86_avx512bw_gt16(p1, p2, count);
+  else if constexpr (x86::kAvx2)
+    return inline_bcmp_x86_avx2_gt16(p1, p2, count);
+  else if constexpr (x86::kSse2)
+    return inline_bcmp_x86_sse2_gt16(p1, p2, count);
+  else
+    return inline_bcmp_generic_gt16(p1, p2, count);
+}
+#endif // defined(LLVM_LIBC_ARCH_X86)
+
+#if defined(LLVM_LIBC_ARCH_AARCH64)
+[[maybe_unused]] static inline BcmpReturnType
+inline_bcmp_aarch64(CPtr p1, CPtr p2, size_t count) {
+  if (likely(count <= 32)) {
+    if (unlikely(count >= 16)) {
+      return generic::Bcmp<16>::head_tail(p1, p2, count);
+    }
+    switch (count) {
+    case 0:
+      return BcmpReturnType::ZERO();
+    case 1:
+      return generic::Bcmp<1>::block(p1, p2);
+    case 2:
+      return generic::Bcmp<2>::block(p1, p2);
+    case 3:
+      return generic::Bcmp<2>::head_tail(p1, p2, count);
+    case 4:
+      return generic::Bcmp<4>::block(p1, p2);
+    case 5:
+    case 6:
+    case 7:
+      return generic::Bcmp<4>::head_tail(p1, p2, count);
+    case 8:
+      return generic::Bcmp<8>::block(p1, p2);
+    case 9:
+    case 10:
+    case 11:
+    case 12:
+    case 13:
+    case 14:
+    case 15:
+      return generic::Bcmp<8>::head_tail(p1, p2, count);
+    }
+  }
+
+  if (count <= 64)
+    return generic::Bcmp<32>::head_tail(p1, p2, count);
+
+  // Aligned loop if > 256, otherwise normal loop
+  if (count > 256) {
+    if (auto value = generic::Bcmp<32>::block(p1, p2))
+      return value;
+    align_to_next_boundary<16, Arg::P1>(p1, p2, count);
+  }
+  return generic::Bcmp<32>::loop_and_tail(p1, p2, count);
+}
+#endif // defined(LLVM_LIBC_ARCH_AARCH64)
+
+static inline BcmpReturnType inline_bcmp(CPtr p1, CPtr p2, size_t count) {
+#if defined(LLVM_LIBC_ARCH_X86)
+  return inline_bcmp_x86(p1, p2, count);
+#elif defined(LLVM_LIBC_ARCH_AARCH64)
+  return inline_bcmp_aarch64(p1, p2, count);
+#elif defined(LLVM_LIBC_ARCH_ARM)
+  return inline_bcmp_embedded_tiny(p1, p2, count);
+#else
+#error "Unsupported platform"
+#endif
+}
+
 } // namespace __llvm_libc
@ -1,774 +0,0 @@
|
||||
//===-- Elementary operations to compose memory primitives ----------------===//
|
||||
//
|
||||
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
||||
// See https://llvm.org/LICENSE.txt for license information.
|
||||
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
#ifndef LLVM_LIBC_SRC_STRING_MEMORY_UTILS_ELEMENTS_H
|
||||
#define LLVM_LIBC_SRC_STRING_MEMORY_UTILS_ELEMENTS_H
|
||||
|
||||
#include <stddef.h> // size_t
|
||||
#include <stdint.h> // uint8_t, uint16_t, uint32_t, uint64_t
|
||||
|
||||
#include "src/__support/endian.h"
|
||||
#include "src/string/memory_utils/utils.h"
|
||||
|
||||
namespace __llvm_libc {
|
||||
|
||||
// Elementary Operations
|
||||
// --------------------------------
|
||||
// We define abstract elementary operations acting on fixed chunks of memory.
|
||||
// These are low level building blocks that are meant to be assembled to compose
|
||||
// higher order abstractions. Each function is defined twice: once with
|
||||
// fixed-size operations, and once with runtime-size operations.
|
||||
|
||||
// Fixed-size copy from 'src' to 'dst'.
|
||||
template <typename Element>
|
||||
void copy(char *__restrict dst, const char *__restrict src) {
|
||||
Element::copy(dst, src);
|
||||
}
|
||||
// Runtime-size copy from 'src' to 'dst'.
|
||||
template <typename Element>
|
||||
void copy(char *__restrict dst, const char *__restrict src, size_t size) {
|
||||
Element::copy(dst, src, size);
|
||||
}
|
||||
|
||||
// Fixed-size move from 'src' to 'dst'.
|
||||
template <typename Element> void move(char *dst, const char *src) {
|
||||
Element::move(dst, src);
|
||||
}
|
||||
// Runtime-size move from 'src' to 'dst'.
|
||||
template <typename Element> void move(char *dst, const char *src, size_t size) {
|
||||
Element::move(dst, src, size);
|
||||
}
|
||||
// Runtime-size move from 'src' to 'dst'.
|
||||
template <typename Element>
|
||||
void move_backward(char *dst, const char *src, size_t size) {
|
||||
Element::move_backward(dst, src, size);
|
||||
}
|
||||
|
||||
// Fixed-size equality between 'lhs' and 'rhs'.
|
||||
template <typename Element> bool equals(const char *lhs, const char *rhs) {
|
||||
return Element::equals(lhs, rhs);
|
||||
}
|
||||
// Runtime-size equality between 'lhs' and 'rhs'.
|
||||
template <typename Element>
|
||||
bool equals(const char *lhs, const char *rhs, size_t size) {
|
||||
return Element::equals(lhs, rhs, size);
|
||||
}
|
||||
|
||||
// Fixed-size three-way comparison between 'lhs' and 'rhs'.
|
||||
template <typename Element>
|
||||
int three_way_compare(const char *lhs, const char *rhs) {
|
||||
return Element::three_way_compare(lhs, rhs);
|
||||
}
|
||||
// Runtime-size three-way comparison between 'lhs' and 'rhs'.
|
||||
template <typename Element>
|
||||
int three_way_compare(const char *lhs, const char *rhs, size_t size) {
|
||||
return Element::three_way_compare(lhs, rhs, size);
|
||||
}
|
||||
|
||||
// Fixed-size initialization.
|
||||
template <typename Element>
|
||||
void splat_set(char *dst, const unsigned char value) {
|
||||
Element::splat_set(dst, value);
|
||||
}
|
||||
// Runtime-size initialization.
|
||||
template <typename Element>
|
||||
void splat_set(char *dst, const unsigned char value, size_t size) {
|
||||
Element::splat_set(dst, value, size);
|
||||
}
|
||||
|
||||
// Stack placeholder for Move operations.
|
||||
template <typename Element> struct Storage { char bytes[Element::SIZE]; };
|
||||
|
||||
// Fixed-size Higher-Order Operations
|
||||
// ----------------------------------
|
||||
// - Repeated<Type, ElementCount>: Repeat the operation several times in a row.
|
||||
// - Chained<Types...>: Chain the operation of several types.
|
||||
|
||||
// Repeat the operation several times in a row.
|
||||
template <typename Element, size_t ElementCount> struct Repeated {
|
||||
static constexpr size_t SIZE = ElementCount * Element::SIZE;
|
||||
|
||||
static void copy(char *__restrict dst, const char *__restrict src) {
|
||||
for (size_t i = 0; i < ElementCount; ++i) {
|
||||
const size_t offset = i * Element::SIZE;
|
||||
Element::copy(dst + offset, src + offset);
|
||||
}
|
||||
}
|
||||
|
||||
static void move(char *dst, const char *src) {
|
||||
const auto value = load(src);
|
||||
store(dst, value);
|
||||
}
|
||||
|
||||
static bool equals(const char *lhs, const char *rhs) {
|
||||
for (size_t i = 0; i < ElementCount; ++i) {
|
||||
const size_t offset = i * Element::SIZE;
|
||||
if (!Element::equals(lhs + offset, rhs + offset))
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
static int three_way_compare(const char *lhs, const char *rhs) {
|
||||
for (size_t i = 0; i < ElementCount; ++i) {
|
||||
const size_t offset = i * Element::SIZE;
|
||||
// We make the assumption that 'equals' is cheaper than
|
||||
// 'three_way_compare'.
|
||||
if (Element::equals(lhs + offset, rhs + offset))
|
||||
continue;
|
||||
return Element::three_way_compare(lhs + offset, rhs + offset);
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
static void splat_set(char *dst, const unsigned char value) {
|
||||
for (size_t i = 0; i < ElementCount; ++i) {
|
||||
const size_t offset = i * Element::SIZE;
|
||||
Element::splat_set(dst + offset, value);
|
||||
}
|
||||
}
|
||||
|
||||
static Storage<Repeated> load(const char *ptr) {
|
||||
Storage<Repeated> value;
|
||||
copy(reinterpret_cast<char *>(&value), ptr);
|
||||
return value;
|
||||
}
|
||||
|
||||
static void store(char *ptr, Storage<Repeated> value) {
|
||||
copy(ptr, reinterpret_cast<const char *>(&value));
|
||||
}
|
||||
};
|
||||
|
||||
template <typename Element> struct Repeated<Element, 0> {
|
||||
static void move(char *, const char *) {}
|
||||
};
|
||||
|
||||
// Chain the operation of several types.
|
||||
// For instance, to handle a 3 bytes operation, one can use:
|
||||
// Chained<UINT16, UINT8>::Operation();
|
||||
template <typename... Types> struct Chained;
|
||||
|
||||
template <typename Head, typename... Tail> struct Chained<Head, Tail...> {
|
||||
static constexpr size_t SIZE = Head::SIZE + Chained<Tail...>::SIZE;
|
||||
|
||||
static void copy(char *__restrict dst, const char *__restrict src) {
|
||||
Chained<Tail...>::copy(dst + Head::SIZE, src + Head::SIZE);
|
||||
__llvm_libc::copy<Head>(dst, src);
|
||||
}
|
||||
|
||||
static void move(char *dst, const char *src) {
|
||||
const auto value = Head::load(src);
|
||||
Chained<Tail...>::move(dst + Head::SIZE, src + Head::SIZE);
|
||||
Head::store(dst, value);
|
||||
}
|
||||
|
||||
static bool equals(const char *lhs, const char *rhs) {
|
||||
if (!__llvm_libc::equals<Head>(lhs, rhs))
|
||||
return false;
|
||||
return Chained<Tail...>::equals(lhs + Head::SIZE, rhs + Head::SIZE);
|
||||
}
|
||||
|
||||
static int three_way_compare(const char *lhs, const char *rhs) {
|
||||
if (__llvm_libc::equals<Head>(lhs, rhs))
|
||||
return Chained<Tail...>::three_way_compare(lhs + Head::SIZE,
|
||||
rhs + Head::SIZE);
|
||||
return __llvm_libc::three_way_compare<Head>(lhs, rhs);
|
||||
}
|
||||
|
||||
static void splat_set(char *dst, const unsigned char value) {
|
||||
Chained<Tail...>::splat_set(dst + Head::SIZE, value);
|
||||
__llvm_libc::splat_set<Head>(dst, value);
|
||||
}
|
||||
};
|
||||
|
||||
template <> struct Chained<> {
|
||||
static constexpr size_t SIZE = 0;
|
||||
static void copy(char *__restrict, const char *__restrict) {}
|
||||
static void move(char *, const char *) {}
|
||||
static bool equals(const char *, const char *) { return true; }
|
||||
static int three_way_compare(const char *, const char *) { return 0; }
|
||||
static void splat_set(char *, const unsigned char) {}
|
||||
};
|
||||
|
||||
// Overlap ElementA and ElementB so they span Size bytes.
|
||||
template <size_t Size, typename ElementA, typename ElementB = ElementA>
|
||||
struct Overlap {
|
||||
static constexpr size_t SIZE = Size;
|
||||
static_assert(ElementB::SIZE <= ElementA::SIZE, "ElementB too big");
|
||||
static_assert(ElementA::SIZE <= Size, "ElementA too big");
|
||||
static_assert((ElementA::SIZE + ElementB::SIZE) >= Size,
|
||||
"Elements too small to overlap");
|
||||
static constexpr size_t OFFSET = SIZE - ElementB::SIZE;
|
||||
|
||||
static void copy(char *__restrict dst, const char *__restrict src) {
|
||||
ElementA::copy(dst, src);
|
||||
ElementB::copy(dst + OFFSET, src + OFFSET);
|
||||
}
|
||||
|
||||
static void move(char *dst, const char *src) {
|
||||
const auto value_a = ElementA::load(src);
|
||||
const auto value_b = ElementB::load(src + OFFSET);
|
||||
ElementB::store(dst + OFFSET, value_b);
|
||||
ElementA::store(dst, value_a);
|
||||
}
|
||||
|
||||
static bool equals(const char *lhs, const char *rhs) {
|
||||
if (!ElementA::equals(lhs, rhs))
|
||||
return false;
|
||||
if (!ElementB::equals(lhs + OFFSET, rhs + OFFSET))
|
||||
return false;
|
||||
return true;
|
||||
}
|
||||
|
||||
static int three_way_compare(const char *lhs, const char *rhs) {
|
||||
if (!ElementA::equals(lhs, rhs))
|
||||
return ElementA::three_way_compare(lhs, rhs);
|
||||
if (!ElementB::equals(lhs + OFFSET, rhs + OFFSET))
|
||||
return ElementB::three_way_compare(lhs + OFFSET, rhs + OFFSET);
|
||||
return 0;
|
||||
}
|
||||
|
||||
static void splat_set(char *dst, const unsigned char value) {
|
||||
ElementA::splat_set(dst, value);
|
||||
ElementB::splat_set(dst + OFFSET, value);
|
||||
}
|
||||
};
|
||||
|
||||
// Runtime-size Higher-Order Operations
|
||||
// ------------------------------------
|
||||
// - Tail<T>: Perform the operation on the last 'T::SIZE' bytes of the buffer.
|
||||
// - HeadTail<T>: Perform the operation on the first and last 'T::SIZE' bytes
|
||||
// of the buffer.
|
||||
// - Loop<T>: Perform a loop of fixed-sized operations.
|
||||
|
||||
// Perform the operation on the last 'T::SIZE' bytes of the buffer.
|
||||
//
|
||||
// e.g. with
|
||||
// [1234567812345678123]
|
||||
// [__XXXXXXXXXXXXXX___]
|
||||
// [________XXXXXXXX___]
|
||||
//
|
||||
// Precondition: `size >= T::SIZE`.
|
||||
template <typename T> struct Tail {
|
||||
static void copy(char *__restrict dst, const char *__restrict src,
|
||||
size_t size) {
|
||||
return T::copy(dst + offset(size), src + offset(size));
|
||||
}
|
||||
|
||||
static bool equals(const char *lhs, const char *rhs, size_t size) {
|
||||
return T::equals(lhs + offset(size), rhs + offset(size));
|
||||
}
|
||||
|
||||
static int three_way_compare(const char *lhs, const char *rhs, size_t size) {
|
||||
return T::three_way_compare(lhs + offset(size), rhs + offset(size));
|
||||
}
|
||||
|
||||
static void splat_set(char *dst, const unsigned char value, size_t size) {
|
||||
return T::splat_set(dst + offset(size), value);
|
||||
}
|
||||
|
||||
static size_t offset(size_t size) { return size - T::SIZE; }
|
||||
};
|
||||
|
||||
// Perform the operation on the first and last 'T::SIZE' bytes of the buffer.
|
||||
// This is useful for overlapping operations.
|
||||
//
|
||||
// e.g. with
|
||||
// [1234567812345678123]
|
||||
// [__XXXXXXXXXXXXXX___]
|
||||
// [__XXXXXXXX_________]
|
||||
// [________XXXXXXXX___]
|
||||
//
|
||||
// Precondition: `size >= T::SIZE && size <= 2 x T::SIZE`.
|
||||
template <typename T> struct HeadTail {
|
||||
static void copy(char *__restrict dst, const char *__restrict src,
|
||||
size_t size) {
|
||||
T::copy(dst, src);
|
||||
Tail<T>::copy(dst, src, size);
|
||||
}
|
||||
|
||||
static void move(char *dst, const char *src, size_t size) {
|
||||
const size_t offset = Tail<T>::offset(size);
|
||||
const auto head_value = T::load(src);
|
||||
const auto tail_value = T::load(src + offset);
|
||||
T::store(dst + offset, tail_value);
|
||||
T::store(dst, head_value);
|
||||
}
|
||||
|
||||
static bool equals(const char *lhs, const char *rhs, size_t size) {
|
||||
if (!T::equals(lhs, rhs))
|
||||
return false;
|
||||
return Tail<T>::equals(lhs, rhs, size);
|
||||
}
|
||||
|
||||
static int three_way_compare(const char *lhs, const char *rhs, size_t size) {
|
||||
if (!T::equals(lhs, rhs))
|
||||
return T::three_way_compare(lhs, rhs);
|
||||
return Tail<T>::three_way_compare(lhs, rhs, size);
|
||||
}
|
||||
|
||||
static void splat_set(char *dst, const unsigned char value, size_t size) {
|
||||
T::splat_set(dst, value);
|
||||
Tail<T>::splat_set(dst, value, size);
|
||||
}
|
||||
};
|
||||
|
||||
// Simple loop ending with a Tail operation.
|
||||
//
|
||||
// e.g. with
|
||||
// [12345678123456781234567812345678]
|
||||
// [__XXXXXXXXXXXXXXXXXXXXXXXXXXXX___]
|
||||
// [__XXXXXXXX_______________________]
|
||||
// [__________XXXXXXXX_______________]
|
||||
// [__________________XXXXXXXX_______]
|
||||
// [______________________XXXXXXXX___]
|
||||
//
|
||||
// Precondition:
|
||||
// - size >= T::SIZE
|
||||
template <typename T, typename TailT = T> struct Loop {
|
||||
static_assert(T::SIZE == TailT::SIZE,
|
||||
"Tail type must have the same size as T");
|
||||
|
||||
static void copy(char *__restrict dst, const char *__restrict src,
|
||||
size_t size) {
|
||||
size_t offset = 0;
|
||||
do {
|
||||
T::copy(dst + offset, src + offset);
|
||||
offset += T::SIZE;
|
||||
} while (offset < size - T::SIZE);
|
||||
Tail<TailT>::copy(dst, src, size);
|
||||
}
|
||||
|
||||
// Move forward suitable when dst < src. We load the tail bytes before
|
||||
// handling the loop.
|
||||
//
|
||||
// e.g. Moving two bytes
|
||||
// [ | | | | |]
|
||||
// [___XXXXXXXXXXXXXXXXXXXXXXXXXXXXXX___]
|
||||
// [_________________________LLLLLLLL___]
|
||||
// [___LLLLLLLL_________________________]
|
||||
// [_SSSSSSSS___________________________]
|
||||
// [___________LLLLLLLL_________________]
|
||||
// [_________SSSSSSSS___________________]
|
||||
// [___________________LLLLLLLL_________]
|
||||
// [_________________SSSSSSSS___________]
|
||||
// [_______________________SSSSSSSS_____]
|
||||
static void move(char *dst, const char *src, size_t size) {
|
||||
const size_t tail_offset = Tail<T>::offset(size);
|
||||
const auto tail_value = TailT::load(src + tail_offset);
|
||||
size_t offset = 0;
|
||||
do {
|
||||
T::move(dst + offset, src + offset);
|
||||
offset += T::SIZE;
|
||||
} while (offset < size - T::SIZE);
|
||||
TailT::store(dst + tail_offset, tail_value);
|
||||
}
|
||||
|
||||
// Move forward suitable when dst > src. We load the head bytes before
|
||||
// handling the loop.
|
||||
//
|
||||
// e.g. Moving two bytes
|
||||
// [ | | | | |]
|
||||
// [___XXXXXXXXXXXXXXXXXXXXXXXXXXXXXX___]
|
||||
// [___LLLLLLLL_________________________]
|
||||
// [_________________________LLLLLLLL___]
|
||||
// [___________________________SSSSSSSS_]
|
||||
// [_________________LLLLLLLL___________]
|
||||
// [___________________SSSSSSSS_________]
|
||||
// [_________LLLLLLLL___________________]
|
||||
// [___________SSSSSSSS_________________]
|
||||
// [_____SSSSSSSS_______________________]
|
||||
static void move_backward(char *dst, const char *src, size_t size) {
|
||||
const auto head_value = TailT::load(src);
|
||||
ptrdiff_t offset = size - T::SIZE;
|
||||
do {
|
||||
T::move(dst + offset, src + offset);
|
||||
offset -= T::SIZE;
|
||||
} while (offset >= 0);
|
||||
TailT::store(dst, head_value);
|
||||
}
|
||||
|
||||
static bool equals(const char *lhs, const char *rhs, size_t size) {
|
||||
size_t offset = 0;
|
||||
do {
|
||||
if (!T::equals(lhs + offset, rhs + offset))
|
||||
return false;
|
||||
offset += T::SIZE;
|
||||
} while (offset < size - T::SIZE);
|
||||
return Tail<TailT>::equals(lhs, rhs, size);
|
||||
}
|
||||
|
||||
static int three_way_compare(const char *lhs, const char *rhs, size_t size) {
|
||||
size_t offset = 0;
|
||||
do {
|
||||
if (!T::equals(lhs + offset, rhs + offset))
|
||||
return T::three_way_compare(lhs + offset, rhs + offset);
|
||||
offset += T::SIZE;
|
||||
} while (offset < size - T::SIZE);
|
||||
return Tail<TailT>::three_way_compare(lhs, rhs, size);
|
||||
}
|
||||
|
||||
static void splat_set(char *dst, const unsigned char value, size_t size) {
|
||||
size_t offset = 0;
|
||||
do {
|
||||
T::splat_set(dst + offset, value);
|
||||
offset += T::SIZE;
|
||||
} while (offset < size - T::SIZE);
|
||||
Tail<TailT>::splat_set(dst, value, size);
|
||||
}
|
||||
};
|
||||
|
||||
namespace internal {
|
||||
|
||||
template <Arg arg> struct ArgSelector {};
|
||||
|
||||
template <> struct ArgSelector<Arg::_1> {
|
||||
template <typename T1, typename T2>
|
||||
static T1 *__restrict &Select(T1 *__restrict &p1ref, T2 *__restrict &) {
|
||||
return p1ref;
|
||||
}
|
||||
};
|
||||
|
||||
template <> struct ArgSelector<Arg::_2> {
|
||||
template <typename T1, typename T2>
|
||||
static T2 *__restrict &Select(T1 *__restrict &, T2 *__restrict &p2ref) {
|
||||
return p2ref;
|
||||
}
|
||||
};
|
||||
|
||||
// Provides a specialized bump function that adjusts pointers and size so first
|
||||
// argument (resp. second argument) gets aligned to Alignment.
|
||||
// We make sure the compiler knows about the adjusted pointer alignment.
|
||||
// The 'additional_bumps' parameter allows to reach previous / next aligned
|
||||
// pointers.
|
||||
template <Arg arg, size_t Alignment> struct Align {
|
||||
template <typename T1, typename T2>
|
||||
static void bump(T1 *__restrict &p1ref, T2 *__restrict &p2ref, size_t &size,
|
||||
int additional_bumps = 0) {
|
||||
auto &aligned_ptr = ArgSelector<arg>::Select(p1ref, p2ref);
|
||||
auto offset = offset_to_next_aligned<Alignment>(aligned_ptr);
|
||||
adjust(offset + additional_bumps * Alignment, p1ref, p2ref, size);
|
||||
aligned_ptr = assume_aligned<Alignment>(aligned_ptr);
|
||||
}
|
||||
};
|
||||
|
||||
} // namespace internal
|
||||
|
||||
// An alignment operation that:
|
||||
// - executes the 'AlignmentT' operation
|
||||
// - bumps 'dst' or 'src' (resp. 'lhs' or 'rhs') pointers so that the selected
|
||||
// pointer gets aligned, size is decreased accordingly.
|
||||
// - calls the 'NextT' operation.
|
||||
//
|
||||
// e.g. A 16-byte Destination Aligned 32-byte Loop Copy can be written as:
|
||||
// copy<Align<_16, Arg::Dst>::Then<Loop<_32>>>(dst, src, count);
|
||||
template <typename AlignmentT, Arg AlignOn = Arg::_1> struct Align {
|
||||
private:
|
||||
static constexpr size_t ALIGNMENT = AlignmentT::SIZE;
|
||||
static_assert(ALIGNMENT > 1, "Alignment must be more than 1");
|
||||
static_assert(is_power2(ALIGNMENT), "Alignment must be a power of 2");
|
||||
|
||||
public:
|
||||
template <typename NextT> struct Then {
|
||||
static void copy(char *__restrict dst, const char *__restrict src,
|
||||
size_t size) {
|
||||
AlignmentT::copy(dst, src);
|
||||
internal::Align<AlignOn, ALIGNMENT>::bump(dst, src, size);
|
||||
NextT::copy(dst, src, size);
|
||||
}
|
||||
|
||||
// Move forward suitable when dst < src. The alignment is performed with an
|
||||
// HeadTail operation of size ∈ [Alignment, 2 x Alignment].
|
||||
//
|
||||
// e.g. Moving two bytes and making sure src is then aligned.
|
||||
// [ | | | | ]
|
||||
// [____XXXXXXXXXXXXXXXXXXXXXXXXXXXX_]
|
||||
// [____LLLLLLLL_____________________]
|
||||
// [___________LLLLLLLL______________]
|
||||
// [_SSSSSSSS________________________]
|
||||
// [________SSSSSSSS_________________]
|
||||
//
|
||||
// e.g. Moving two bytes and making sure dst is then aligned.
|
||||
// [ | | | | ]
|
||||
// [____XXXXXXXXXXXXXXXXXXXXXXXXXXXX_]
|
||||
// [____LLLLLLLL_____________________]
|
||||
// [______LLLLLLLL___________________]
|
||||
// [_SSSSSSSS________________________]
|
||||
// [___SSSSSSSS______________________]
|
||||
static void move(char *dst, const char *src, size_t size) {
|
||||
char *next_dst = dst;
|
||||
const char *next_src = src;
|
||||
size_t next_size = size;
|
||||
internal::Align<AlignOn, ALIGNMENT>::bump(next_dst, next_src, next_size,
|
||||
1);
|
||||
HeadTail<AlignmentT>::move(dst, src, size - next_size);
|
||||
NextT::move(next_dst, next_src, next_size);
|
||||
}
|
||||
|
||||
// Move backward suitable when dst > src. The alignment is performed with an
|
||||
// HeadTail operation of size ∈ [Alignment, 2 x Alignment].
|
||||
//
|
||||
// e.g. Moving two bytes backward and making sure src is then aligned.
|
||||
// [ | | | | ]
|
||||
// [____XXXXXXXXXXXXXXXXXXXXXXXX_____]
|
||||
// [ _________________LLLLLLLL_______]
|
||||
// [ ___________________LLLLLLLL_____]
|
||||
// [____________________SSSSSSSS_____]
|
||||
// [______________________SSSSSSSS___]
|
||||
//
|
||||
// e.g. Moving two bytes and making sure dst is then aligned.
|
||||
// [ | | | | ]
|
||||
// [____XXXXXXXXXXXXXXXXXXXXXXXX_____]
|
||||
// [ _______________LLLLLLLL_________]
|
||||
// [ ___________________LLLLLLLL_____]
|
||||
// [__________________SSSSSSSS_______]
|
||||
// [______________________SSSSSSSS___]
|
||||
static void move_backward(char *dst, const char *src, size_t size) {
|
||||
char *headtail_dst = dst + size;
|
||||
const char *headtail_src = src + size;
|
||||
size_t headtail_size = 0;
|
||||
internal::Align<AlignOn, ALIGNMENT>::bump(headtail_dst, headtail_src,
|
||||
headtail_size, -2);
|
||||
HeadTail<AlignmentT>::move(headtail_dst, headtail_src, headtail_size);
|
||||
NextT::move_backward(dst, src, size - headtail_size);
|
||||
}
|
||||
|
||||
static bool equals(const char *lhs, const char *rhs, size_t size) {
|
||||
if (!AlignmentT::equals(lhs, rhs))
|
||||
return false;
|
||||
internal::Align<AlignOn, ALIGNMENT>::bump(lhs, rhs, size);
|
||||
return NextT::equals(lhs, rhs, size);
|
||||
}
|
||||
|
||||
static int three_way_compare(const char *lhs, const char *rhs,
|
||||
size_t size) {
|
||||
if (!AlignmentT::equals(lhs, rhs))
|
||||
return AlignmentT::three_way_compare(lhs, rhs);
|
||||
internal::Align<AlignOn, ALIGNMENT>::bump(lhs, rhs, size);
|
||||
return NextT::three_way_compare(lhs, rhs, size);
|
||||
}
|
||||
|
||||
static void splat_set(char *dst, const unsigned char value, size_t size) {
|
||||
AlignmentT::splat_set(dst, value);
|
||||
char *dummy = nullptr;
|
||||
internal::Align<Arg::_1, ALIGNMENT>::bump(dst, dummy, size);
|
||||
NextT::splat_set(dst, value, size);
|
||||
}
|
||||
};
|
||||
};
|
||||
|
||||
// An operation that allows to skip the specified amount of bytes.
|
||||
template <ptrdiff_t Bytes> struct Skip {
|
||||
template <typename NextT> struct Then {
|
||||
static void copy(char *__restrict dst, const char *__restrict src,
|
||||
size_t size) {
|
||||
NextT::copy(dst + Bytes, src + Bytes, size - Bytes);
|
||||
}
|
||||
|
||||
static void copy(char *__restrict dst, const char *__restrict src) {
|
||||
NextT::copy(dst + Bytes, src + Bytes);
|
||||
}
|
||||
|
||||
static bool equals(const char *lhs, const char *rhs, size_t size) {
|
||||
return NextT::equals(lhs + Bytes, rhs + Bytes, size - Bytes);
|
||||
}
|
||||
|
||||
static bool equals(const char *lhs, const char *rhs) {
|
||||
return NextT::equals(lhs + Bytes, rhs + Bytes);
|
||||
}
|
||||
|
||||
static int three_way_compare(const char *lhs, const char *rhs,
|
||||
size_t size) {
|
||||
return NextT::three_way_compare(lhs + Bytes, rhs + Bytes, size - Bytes);
|
||||
}
|
||||
|
||||
static int three_way_compare(const char *lhs, const char *rhs) {
|
||||
return NextT::three_way_compare(lhs + Bytes, rhs + Bytes);
|
||||
}
|
||||
|
||||
static void splat_set(char *dst, const unsigned char value, size_t size) {
|
||||
NextT::splat_set(dst + Bytes, value, size - Bytes);
|
||||
}
|
||||
|
||||
static void splat_set(char *dst, const unsigned char value) {
|
||||
NextT::splat_set(dst + Bytes, value);
|
||||
}
|
||||
};
|
||||
};
|
||||
|
||||
// Fixed-size Builtin Operations
|
||||
// -----------------------------
|
||||
// Note: Do not use 'builtin' right now as it requires the implementation of the
|
||||
// `_inline` versions of all the builtins. Theoretically, Clang can still turn
|
||||
// them into calls to the C library leading to reentrancy problems.
|
||||
namespace builtin {
|
||||
|
||||
#ifndef __has_builtin
|
||||
#define __has_builtin(x) 0 // Compatibility with non-clang compilers.
|
||||
#endif
|
||||
|
||||
template <size_t Size> struct Builtin {
|
||||
static constexpr size_t SIZE = Size;
|
||||
|
||||
static void copy(char *__restrict dst, const char *__restrict src) {
|
||||
#if LLVM_LIBC_HAVE_MEMORY_SANITIZER || LLVM_LIBC_HAVE_ADDRESS_SANITIZER
|
||||
for_loop_copy(dst, src);
|
||||
#elif __has_builtin(__builtin_memcpy_inline)
|
||||
// __builtin_memcpy_inline guarantees to never call external functions.
|
||||
// Unfortunately it is not widely available.
|
||||
__builtin_memcpy_inline(dst, src, SIZE);
|
||||
#else
|
||||
for_loop_copy(dst, src);
|
||||
#endif
|
||||
}
|
||||
|
||||
static void move(char *dst, const char *src) {
|
||||
#if LLVM_LIBC_HAVE_MEMORY_SANITIZER || LLVM_LIBC_HAVE_ADDRESS_SANITIZER
|
||||
for_loop_move(dst, src);
|
||||
#elif __has_builtin(__builtin_memmove)
|
||||
__builtin_memmove(dst, src, SIZE);
|
||||
#else
|
||||
for_loop_move(dst, src);
|
||||
#endif
|
||||
}
|
||||
|
||||
#if __has_builtin(__builtin_memcmp_inline)
|
||||
#define LLVM_LIBC_MEMCMP __builtin_memcmp_inline
|
||||
#else
|
||||
#define LLVM_LIBC_MEMCMP __builtin_memcmp
|
||||
#endif
|
||||
|
||||
static bool equals(const char *lhs, const char *rhs) {
|
||||
return LLVM_LIBC_MEMCMP(lhs, rhs, SIZE) == 0;
|
||||
}
|
||||
|
||||
static int three_way_compare(const char *lhs, const char *rhs) {
|
||||
return LLVM_LIBC_MEMCMP(lhs, rhs, SIZE);
|
||||
}
|
||||
|
||||
static void splat_set(char *dst, const unsigned char value) {
|
||||
__builtin_memset(dst, value, SIZE);
|
||||
}
|
||||
|
||||
private:
|
||||
// Copies `SIZE` bytes from `src` to `dst` using a for loop.
|
||||
// This code requires the use of `-fno-builtin-memcpy` to prevent the compiler
|
||||
// from turning the for-loop back into `__builtin_memcpy`.
|
||||
static void for_loop_copy(char *__restrict dst, const char *__restrict src) {
|
||||
for (size_t i = 0; i < SIZE; ++i)
|
||||
dst[i] = src[i];
|
||||
}
|
||||
|
||||
static void for_loop_move(char *dst, const char *src) {
|
||||
for (size_t i = 0; i < SIZE; ++i)
|
||||
dst[i] = src[i];
|
||||
}
|
||||
};
|
||||
|
||||
using _1 = Builtin<1>;
|
||||
using _2 = Builtin<2>;
|
||||
using _3 = Builtin<3>;
|
||||
using _4 = Builtin<4>;
|
||||
using _8 = Builtin<8>;
|
||||
using _16 = Builtin<16>;
|
||||
using _32 = Builtin<32>;
|
||||
using _64 = Builtin<64>;
|
||||
using _128 = Builtin<128>;
|
||||
|
||||
} // namespace builtin
|
||||
|
||||
// Fixed-size Scalar Operations
|
||||
// ----------------------------
|
||||
namespace scalar {
|
||||
|
||||
// The Scalar type makes use of simple sized integers.
|
||||
template <typename T> struct Scalar {
|
||||
static constexpr size_t SIZE = sizeof(T);
|
||||
|
||||
static void copy(char *__restrict dst, const char *__restrict src) {
|
||||
store(dst, load(src));
|
||||
}
|
||||
|
||||
static void move(char *dst, const char *src) { store(dst, load(src)); }
|
||||
|
||||
static bool equals(const char *lhs, const char *rhs) {
|
||||
return load(lhs) == load(rhs);
|
||||
}
|
||||
|
||||
static int three_way_compare(const char *lhs, const char *rhs) {
|
||||
return scalar_three_way_compare(load(lhs), load(rhs));
|
||||
}
|
||||
|
||||
static void splat_set(char *dst, const unsigned char value) {
|
||||
store(dst, get_splatted_value(value));
|
||||
}
|
||||
|
||||
static int scalar_three_way_compare(T a, T b);
|
||||
|
||||
static T load(const char *ptr) {
|
||||
T value;
|
||||
builtin::Builtin<SIZE>::copy(reinterpret_cast<char *>(&value), ptr);
|
||||
return value;
|
||||
}
|
||||
static void store(char *ptr, T value) {
|
||||
builtin::Builtin<SIZE>::copy(ptr, reinterpret_cast<const char *>(&value));
|
||||
}
|
||||
|
||||
private:
|
||||
static T get_splatted_value(const unsigned char value) {
|
||||
return T(~0) / T(0xFF) * T(value);
|
||||
}
|
||||
};
|
||||
|
||||
template <>
|
||||
inline int Scalar<uint8_t>::scalar_three_way_compare(uint8_t a, uint8_t b) {
|
||||
const int16_t la = Endian::to_big_endian(a);
|
||||
const int16_t lb = Endian::to_big_endian(b);
|
||||
return la - lb;
|
||||
}
|
||||
template <>
|
||||
inline int Scalar<uint16_t>::scalar_three_way_compare(uint16_t a, uint16_t b) {
|
||||
const int32_t la = Endian::to_big_endian(a);
|
||||
const int32_t lb = Endian::to_big_endian(b);
|
||||
return la - lb;
|
||||
}
|
||||
template <>
|
||||
inline int Scalar<uint32_t>::scalar_three_way_compare(uint32_t a, uint32_t b) {
|
||||
const uint32_t la = Endian::to_big_endian(a);
|
||||
const uint32_t lb = Endian::to_big_endian(b);
|
||||
return la > lb ? 1 : la < lb ? -1 : 0;
|
||||
}
|
||||
template <>
|
||||
inline int Scalar<uint64_t>::scalar_three_way_compare(uint64_t a, uint64_t b) {
|
||||
const uint64_t la = Endian::to_big_endian(a);
|
||||
const uint64_t lb = Endian::to_big_endian(b);
|
||||
return la > lb ? 1 : la < lb ? -1 : 0;
|
||||
}
|
||||
|
||||
using UINT8 = Scalar<uint8_t>; // 1 Byte
|
||||
using UINT16 = Scalar<uint16_t>; // 2 Bytes
|
||||
using UINT32 = Scalar<uint32_t>; // 4 Bytes
|
||||
using UINT64 = Scalar<uint64_t>; // 8 Bytes
|
||||
|
||||
using _1 = UINT8;
|
||||
using _2 = UINT16;
|
||||
using _3 = Chained<UINT16, UINT8>;
|
||||
using _4 = UINT32;
|
||||
using _8 = UINT64;
|
||||
using _16 = Repeated<_8, 2>;
|
||||
using _32 = Repeated<_8, 4>;
|
||||
using _64 = Repeated<_8, 8>;
|
||||
using _128 = Repeated<_8, 16>;
|
||||
|
||||
} // namespace scalar
|
||||
} // namespace __llvm_libc
|
||||
|
||||
#include <src/string/memory_utils/elements_aarch64.h>
|
||||
#include <src/string/memory_utils/elements_x86.h>
|
||||
|
||||
#endif // LLVM_LIBC_SRC_STRING_MEMORY_UTILS_ELEMENTS_H
|
libc/src/string/memory_utils/elements_aarch64.h (deleted, 130 lines)
@@ -1,130 +0,0 @@
//===-- Elementary operations for aarch64 --------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#ifndef LLVM_LIBC_SRC_STRING_MEMORY_UTILS_ELEMENTS_AARCH64_H
#define LLVM_LIBC_SRC_STRING_MEMORY_UTILS_ELEMENTS_AARCH64_H

#include "src/__support/architectures.h"

#if defined(LLVM_LIBC_ARCH_AARCH64)

#include <src/string/memory_utils/elements.h>
#include <stddef.h> // size_t
#include <stdint.h> // uint8_t, uint16_t, uint32_t, uint64_t

#ifdef __ARM_NEON
#include <arm_neon.h>
#endif

namespace __llvm_libc {
namespace aarch64_memset {
#ifdef __ARM_NEON
struct Splat8 {
  static constexpr size_t SIZE = 8;
  static void splat_set(char *dst, const unsigned char value) {
    vst1_u8((uint8_t *)dst, vdup_n_u8(value));
  }
};

struct Splat16 {
  static constexpr size_t SIZE = 16;
  static void splat_set(char *dst, const unsigned char value) {
    vst1q_u8((uint8_t *)dst, vdupq_n_u8(value));
  }
};

using _8 = Splat8;
using _16 = Splat16;
#else
using _8 = __llvm_libc::scalar::_8;
using _16 = Repeated<_8, 2>;
#endif // __ARM_NEON

using _1 = __llvm_libc::scalar::_1;
using _2 = __llvm_libc::scalar::_2;
using _3 = __llvm_libc::scalar::_3;
using _4 = __llvm_libc::scalar::_4;
using _32 = Chained<_16, _16>;
using _64 = Chained<_32, _32>;

struct Zva64 {
  static constexpr size_t SIZE = 64;

  static void splat_set(char *dst, const unsigned char) {
#if __SIZEOF_POINTER__ == 4
    asm("dc zva, %w[dst]" : : [dst] "r"(dst) : "memory");
#else
    asm("dc zva, %[dst]" : : [dst] "r"(dst) : "memory");
#endif
  }
};

inline static bool hasZva() {
  uint64_t zva_val;
  asm("mrs %[zva_val], dczid_el0" : [zva_val] "=r"(zva_val));
  // DC ZVA is permitted if DZP, bit [4], is zero.
  // BS, bits [3:0], is log2 of the block size in words.
  // So the next line checks whether the instruction is permitted and the block
  // size is 16 words (i.e. 64 bytes).
  return (zva_val & 0b11111) == 0b00100;
}

} // namespace aarch64_memset

namespace aarch64 {

using _1 = __llvm_libc::scalar::_1;
using _2 = __llvm_libc::scalar::_2;
using _3 = __llvm_libc::scalar::_3;
using _4 = __llvm_libc::scalar::_4;
using _8 = __llvm_libc::scalar::_8;
using _16 = __llvm_libc::scalar::_16;

#ifdef __ARM_NEON
struct N32 {
  static constexpr size_t SIZE = 32;
  static bool equals(const char *lhs, const char *rhs) {
    uint8x16_t l_0 = vld1q_u8((const uint8_t *)lhs);
    uint8x16_t r_0 = vld1q_u8((const uint8_t *)rhs);
    uint8x16_t l_1 = vld1q_u8((const uint8_t *)(lhs + 16));
    uint8x16_t r_1 = vld1q_u8((const uint8_t *)(rhs + 16));
    uint8x16_t temp = vpmaxq_u8(veorq_u8(l_0, r_0), veorq_u8(l_1, r_1));
    uint64_t res =
        vgetq_lane_u64(vreinterpretq_u64_u8(vpmaxq_u8(temp, temp)), 0);
    return res == 0;
  }
  static int three_way_compare(const char *lhs, const char *rhs) {
    uint8x16_t l_0 = vld1q_u8((const uint8_t *)lhs);
    uint8x16_t r_0 = vld1q_u8((const uint8_t *)rhs);
    uint8x16_t l_1 = vld1q_u8((const uint8_t *)(lhs + 16));
    uint8x16_t r_1 = vld1q_u8((const uint8_t *)(rhs + 16));
    uint8x16_t temp = vpmaxq_u8(veorq_u8(l_0, r_0), veorq_u8(l_1, r_1));
    uint64_t res =
        vgetq_lane_u64(vreinterpretq_u64_u8(vpmaxq_u8(temp, temp)), 0);
    if (res == 0)
      return 0;
    size_t index = (__builtin_ctzl(res) >> 3) << 2;
    uint32_t l = *((const uint32_t *)(lhs + index));
    uint32_t r = *((const uint32_t *)(rhs + index));
    return __llvm_libc::scalar::_4::scalar_three_way_compare(l, r);
  }
};

using _32 = N32;
using _64 = Repeated<_32, 2>;
#else
using _32 = __llvm_libc::scalar::_32;
using _64 = __llvm_libc::scalar::_64;
#endif // __ARM_NEON

} // namespace aarch64
} // namespace __llvm_libc

#endif // defined(LLVM_LIBC_ARCH_AARCH64)

#endif // LLVM_LIBC_SRC_STRING_MEMORY_UTILS_ELEMENTS_AARCH64_H
@ -1,189 +0,0 @@
//===-- Elementary operations for x86 -------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#ifndef LLVM_LIBC_SRC_STRING_MEMORY_UTILS_ELEMENTS_X86_H
#define LLVM_LIBC_SRC_STRING_MEMORY_UTILS_ELEMENTS_X86_H

#include "src/__support/CPP/bit.h"
#include "src/__support/architectures.h"

#if defined(LLVM_LIBC_ARCH_X86)

#include <stddef.h> // size_t
#include <stdint.h> // uint8_t, uint16_t, uint32_t, uint64_t

#ifdef __SSE2__
#include <immintrin.h>
#endif // __SSE2__

#include "src/string/memory_utils/elements.h" // __llvm_libc::scalar

// Fixed-size Vector Operations
// ----------------------------

namespace __llvm_libc {
namespace x86 {

#ifdef __SSE2__
template <typename Base> struct Vector : public Base {
static void copy(char *__restrict dst, const char *__restrict src) {
Base::store(dst, Base::load(src));
}

static void move(char *dst, const char *src) {
Base::store(dst, Base::load(src));
}

static bool equals(const char *a, const char *b) {
return Base::not_equal_mask(Base::load(a), Base::load(b)) == 0;
}

static int three_way_compare(const char *a, const char *b) {
const auto mask = Base::not_equal_mask(Base::load(a), Base::load(b));
if (!mask)
return 0;
return char_diff(a, b, mask);
}

static void splat_set(char *dst, const unsigned char value) {
Base::store(dst, Base::get_splatted_value(value));
}

static int char_diff(const char *a, const char *b, uint64_t mask) {
const size_t diff_index = __builtin_ctzll(mask);
const int ca = (unsigned char)a[diff_index];
const int cb = (unsigned char)b[diff_index];
return ca - cb;
}
};

struct M128 {
static constexpr size_t SIZE = 16;
using T = char __attribute__((__vector_size__(SIZE)));
static uint16_t mask(T value) {
// NOLINTNEXTLINE(llvmlibc-callee-namespace)
return static_cast<uint16_t>(
_mm_movemask_epi8(cpp::bit_cast<__m128i>(value)));
}
static uint16_t not_equal_mask(T a, T b) { return mask(a != b); }
static T load(const char *ptr) {
// NOLINTNEXTLINE(llvmlibc-callee-namespace)
return cpp::bit_cast<T>(
_mm_loadu_si128(reinterpret_cast<__m128i_u const *>(ptr)));
}
static void store(char *ptr, T value) {
// NOLINTNEXTLINE(llvmlibc-callee-namespace)
return _mm_storeu_si128(reinterpret_cast<__m128i_u *>(ptr),
cpp::bit_cast<__m128i>(value));
}
static T get_splatted_value(const char v) {
const T splatted = {v, v, v, v, v, v, v, v, v, v, v, v, v, v, v, v};
return splatted;
}
};

using Vector128 = Vector<M128>; // 16 Bytes

#ifdef __AVX2__
struct M256 {
static constexpr size_t SIZE = 32;
using T = char __attribute__((__vector_size__(SIZE)));
static uint32_t mask(T value) {
// NOLINTNEXTLINE(llvmlibc-callee-namespace)
return _mm256_movemask_epi8(cpp::bit_cast<__m256i>(value));
}
static uint32_t not_equal_mask(T a, T b) { return mask(a != b); }
static T load(const char *ptr) {
// NOLINTNEXTLINE(llvmlibc-callee-namespace)
return cpp::bit_cast<T>(
_mm256_loadu_si256(reinterpret_cast<__m256i const *>(ptr)));
}
static void store(char *ptr, T value) {
// NOLINTNEXTLINE(llvmlibc-callee-namespace)
return _mm256_storeu_si256(reinterpret_cast<__m256i *>(ptr),
cpp::bit_cast<__m256i>(value));
}
static T get_splatted_value(const char v) {
const T splatted = {v, v, v, v, v, v, v, v, v, v, v, v, v, v, v, v,
v, v, v, v, v, v, v, v, v, v, v, v, v, v, v, v};
return splatted;
}
};

using Vector256 = Vector<M256>; // 32 Bytes

#if defined(__AVX512F__) and defined(__AVX512BW__)
struct M512 {
static constexpr size_t SIZE = 64;
using T = char __attribute__((__vector_size__(SIZE)));
static uint64_t not_equal_mask(T a, T b) {
// NOLINTNEXTLINE(llvmlibc-callee-namespace)
return _mm512_cmpneq_epi8_mask(cpp::bit_cast<__m512i>(a),
cpp::bit_cast<__m512i>(b));
}
static T load(const char *ptr) {
// NOLINTNEXTLINE(llvmlibc-callee-namespace)
return cpp::bit_cast<T>(_mm512_loadu_epi8(ptr));
}
static void store(char *ptr, T value) {
// NOLINTNEXTLINE(llvmlibc-callee-namespace)
return _mm512_storeu_epi8(ptr, cpp::bit_cast<__m512i>(value));
}
static T get_splatted_value(const char v) {
const T splatted = {v, v, v, v, v, v, v, v, v, v, v, v, v, v, v, v,
v, v, v, v, v, v, v, v, v, v, v, v, v, v, v, v,
v, v, v, v, v, v, v, v, v, v, v, v, v, v, v, v,
v, v, v, v, v, v, v, v, v, v, v, v, v, v, v, v};
return splatted;
}
};
using Vector512 = Vector<M512>;

#endif // defined(__AVX512F__) and defined(__AVX512BW__)
#endif // __AVX2__
#endif // __SSE2__

using _1 = __llvm_libc::scalar::_1;
using _2 = __llvm_libc::scalar::_2;
using _3 = __llvm_libc::scalar::_3;
using _4 = __llvm_libc::scalar::_4;
using _8 = __llvm_libc::scalar::_8;
#if defined(__AVX512F__) && defined(__AVX512BW__)
using _16 = __llvm_libc::x86::Vector128;
using _32 = __llvm_libc::x86::Vector256;
using _64 = __llvm_libc::x86::Vector512;
using _128 = __llvm_libc::Repeated<_64, 2>;
#elif defined(__AVX2__)
using _16 = __llvm_libc::x86::Vector128;
using _32 = __llvm_libc::x86::Vector256;
using _64 = __llvm_libc::Repeated<_32, 2>;
using _128 = __llvm_libc::Repeated<_32, 4>;
#elif defined(__SSE2__)
using _16 = __llvm_libc::x86::Vector128;
using _32 = __llvm_libc::Repeated<_16, 2>;
using _64 = __llvm_libc::Repeated<_16, 4>;
using _128 = __llvm_libc::Repeated<_16, 8>;
#else
using _16 = __llvm_libc::Repeated<_8, 2>;
using _32 = __llvm_libc::Repeated<_8, 4>;
using _64 = __llvm_libc::Repeated<_8, 8>;
using _128 = __llvm_libc::Repeated<_8, 16>;
#endif

struct Accelerator {
static void copy(char *dst, const char *src, size_t count) {
asm volatile("rep movsb" : "+D"(dst), "+S"(src), "+c"(count) : : "memory");
}
};

} // namespace x86
} // namespace __llvm_libc

#endif // defined(LLVM_LIBC_ARCH_X86)

#endif // LLVM_LIBC_SRC_STRING_MEMORY_UTILS_ELEMENTS_X86_H
@ -11,92 +11,133 @@

#include "src/__support/architectures.h"
#include "src/__support/common.h"
#include "src/string/memory_utils/elements.h"
#include "src/string/memory_utils/op_aarch64.h"
#include "src/string/memory_utils/op_builtin.h"
#include "src/string/memory_utils/op_generic.h"
#include "src/string/memory_utils/op_x86.h"
#include "src/string/memory_utils/utils.h"

#include <stddef.h> // size_t

namespace __llvm_libc {
[[maybe_unused]] static inline MemcmpReturnType
inline_memcmp_embedded_tiny(CPtr p1, CPtr p2, size_t count) {
#pragma nounroll
for (size_t offset = 0; offset < count; ++offset)
if (auto value = generic::Memcmp<1>::block(p1, p2))
return value;
return MemcmpReturnType::ZERO();
}

#if defined(LLVM_LIBC_ARCH_X86) || defined(LLVM_LIBC_ARCH_AARCH64)
[[maybe_unused]] static inline MemcmpReturnType
inline_memcmp_generic_gt16(CPtr p1, CPtr p2, size_t count) {
if (unlikely(count >= 384)) {
if (auto value = generic::Memcmp<16>::block(p1, p2))
return value;
align_to_next_boundary<16, Arg::P1>(p1, p2, count);
}
return generic::Memcmp<16>::loop_and_tail(p1, p2, count);
}
#endif // defined(LLVM_LIBC_ARCH_X86) || defined(LLVM_LIBC_ARCH_AARCH64)

static inline int inline_memcmp(const char *lhs, const char *rhs,
size_t count) {
#if defined(LLVM_LIBC_ARCH_X86)
/////////////////////////////////////////////////////////////////////////////
// LLVM_LIBC_ARCH_X86
/////////////////////////////////////////////////////////////////////////////
using namespace __llvm_libc::x86;
if (count == 0)
return 0;
if (count == 1)
return three_way_compare<_1>(lhs, rhs);
if (count == 2)
return three_way_compare<_2>(lhs, rhs);
if (count == 3)
return three_way_compare<_3>(lhs, rhs);
if (count <= 8)
return three_way_compare<HeadTail<_4>>(lhs, rhs, count);
if (count <= 16)
return three_way_compare<HeadTail<_8>>(lhs, rhs, count);
if (count <= 32)
return three_way_compare<HeadTail<_16>>(lhs, rhs, count);
if (count <= 64)
return three_way_compare<HeadTail<_32>>(lhs, rhs, count);
if (count <= 128)
return three_way_compare<HeadTail<_64>>(lhs, rhs, count);
return three_way_compare<Align<_32>::Then<Loop<_32>>>(lhs, rhs, count);
#elif defined(LLVM_LIBC_ARCH_AARCH64)
/////////////////////////////////////////////////////////////////////////////
// LLVM_LIBC_ARCH_AARCH64
/////////////////////////////////////////////////////////////////////////////
using namespace ::__llvm_libc::aarch64;
if (count == 0) // [0, 0]
return 0;
if (count == 1) // [1, 1]
return three_way_compare<_1>(lhs, rhs);
if (count == 2) // [2, 2]
return three_way_compare<_2>(lhs, rhs);
if (count == 3) // [3, 3]
return three_way_compare<_3>(lhs, rhs);
if (count < 8) // [4, 7]
return three_way_compare<HeadTail<_4>>(lhs, rhs, count);
if (count < 16) // [8, 15]
return three_way_compare<HeadTail<_8>>(lhs, rhs, count);
if (unlikely(count >= 128)) // [128, ∞]
return three_way_compare<Align<_16>::Then<Loop<_32>>>(lhs, rhs, count);
if (!equals<_16>(lhs, rhs)) // [16, 16]
return three_way_compare<_16>(lhs, rhs);
if (count < 32) // [17, 31]
return three_way_compare<Tail<_16>>(lhs, rhs, count);
if (!equals<Skip<16>::Then<_16>>(lhs, rhs)) // [32, 32]
return three_way_compare<Skip<16>::Then<_16>>(lhs, rhs);
if (count < 64) // [33, 63]
return three_way_compare<Tail<_32>>(lhs, rhs, count);
// [64, 127]
return three_way_compare<Skip<32>::Then<Loop<_16>>>(lhs, rhs, count);
#else
/////////////////////////////////////////////////////////////////////////////
// Default
/////////////////////////////////////////////////////////////////////////////
using namespace ::__llvm_libc::scalar;
[[maybe_unused]] static inline MemcmpReturnType
inline_memcmp_x86_sse2_gt16(CPtr p1, CPtr p2, size_t count) {
if (unlikely(count >= 384)) {
if (auto value = x86::sse2::Memcmp<16>::block(p1, p2))
return value;
align_to_next_boundary<16, Arg::P1>(p1, p2, count);
}
return x86::sse2::Memcmp<16>::loop_and_tail(p1, p2, count);
}

if (count == 0)
return 0;
if (count == 1)
return three_way_compare<_1>(lhs, rhs);
if (count == 2)
return three_way_compare<_2>(lhs, rhs);
if (count == 3)
return three_way_compare<_3>(lhs, rhs);
if (count <= 8)
return three_way_compare<HeadTail<_4>>(lhs, rhs, count);
if (count <= 16)
return three_way_compare<HeadTail<_8>>(lhs, rhs, count);
[[maybe_unused]] static inline MemcmpReturnType
inline_memcmp_x86_avx2_gt16(CPtr p1, CPtr p2, size_t count) {
if (count <= 32)
return three_way_compare<HeadTail<_16>>(lhs, rhs, count);
return x86::sse2::Memcmp<16>::head_tail(p1, p2, count);
if (count <= 64)
return three_way_compare<HeadTail<_32>>(lhs, rhs, count);
return x86::avx2::Memcmp<32>::head_tail(p1, p2, count);
if (count <= 128)
return three_way_compare<HeadTail<_64>>(lhs, rhs, count);
return three_way_compare<Align<_32>::Then<Loop<_32>>>(lhs, rhs, count);
return x86::avx2::Memcmp<64>::head_tail(p1, p2, count);
if (unlikely(count >= 384)) {
if (auto value = x86::avx2::Memcmp<32>::block(p1, p2))
return value;
align_to_next_boundary<32, Arg::P1>(p1, p2, count);
}
return x86::avx2::Memcmp<32>::loop_and_tail(p1, p2, count);
}

[[maybe_unused]] static inline MemcmpReturnType
inline_memcmp_x86_avx512bw_gt16(CPtr p1, CPtr p2, size_t count) {
if (count <= 32)
return x86::sse2::Memcmp<16>::head_tail(p1, p2, count);
if (count <= 64)
return x86::avx2::Memcmp<32>::head_tail(p1, p2, count);
if (count <= 128)
return x86::avx512bw::Memcmp<64>::head_tail(p1, p2, count);
if (unlikely(count >= 384)) {
if (auto value = x86::avx512bw::Memcmp<64>::block(p1, p2))
return value;
align_to_next_boundary<64, Arg::P1>(p1, p2, count);
}
return x86::avx512bw::Memcmp<64>::loop_and_tail(p1, p2, count);
}
#endif // defined(LLVM_LIBC_ARCH_X86)

#if defined(LLVM_LIBC_ARCH_AARCH64)
[[maybe_unused]] static inline MemcmpReturnType
inline_memcmp_aarch64_neon_gt16(CPtr p1, CPtr p2, size_t count) {
if (unlikely(count >= 128)) { // [128, ∞]
if (auto value = generic::Memcmp<16>::block(p1, p2))
return value;
align_to_next_boundary<16, Arg::P1>(p1, p2, count);
return generic::Memcmp<32>::loop_and_tail(p1, p2, count);
}
if (count < 32) // [17, 31]
return generic::Memcmp<16>::tail(p1, p2, count);
if (generic::Bcmp<16>::block(p1 + 16, p2 + 16)) // [32, 32]
return generic::Memcmp<16>::block(p1 + 16, p2 + 16);
if (count < 64) // [33, 63]
return generic::Memcmp<32>::tail(p1, p2, count);
// [64, 127]
return generic::Memcmp<16>::loop_and_tail(p1 + 32, p2 + 32, count - 32);
}
#endif // defined(LLVM_LIBC_ARCH_AARCH64)

static inline MemcmpReturnType inline_memcmp(CPtr p1, CPtr p2, size_t count) {
#if defined(LLVM_LIBC_ARCH_X86) || defined(LLVM_LIBC_ARCH_AARCH64)
if (count == 0)
return MemcmpReturnType::ZERO();
if (count == 1)
return generic::Memcmp<1>::block(p1, p2);
if (count == 2)
return generic::Memcmp<2>::block(p1, p2);
if (count == 3)
return generic::Memcmp<3>::block(p1, p2);
if (count <= 8)
return generic::Memcmp<4>::head_tail(p1, p2, count);
if (count <= 16)
return generic::Memcmp<8>::head_tail(p1, p2, count);
#if defined(LLVM_LIBC_ARCH_X86)
if constexpr (x86::kAvx512BW)
return inline_memcmp_x86_avx512bw_gt16(p1, p2, count);
else if constexpr (x86::kAvx2)
return inline_memcmp_x86_avx2_gt16(p1, p2, count);
else if constexpr (x86::kSse2)
return inline_memcmp_x86_sse2_gt16(p1, p2, count);
else
return inline_memcmp_generic_gt16(p1, p2, count);
#elif defined(LLVM_LIBC_ARCH_AARCH64)
if constexpr (aarch64::kNeon)
return inline_memcmp_aarch64_neon_gt16(p1, p2, count);
else
return inline_memcmp_generic_gt16(p1, p2, count);
#endif
#elif defined(LLVM_LIBC_ARCH_ARM)
return inline_memcmp_embedded_tiny(p1, p2, count);
#else
#error "Unsupported platform"
#endif
}
@ -11,142 +11,123 @@

#include "src/__support/architectures.h"
#include "src/__support/common.h"
#include "src/string/memory_utils/elements.h"
#include "src/string/memory_utils/op_aarch64.h"
#include "src/string/memory_utils/op_builtin.h"
#include "src/string/memory_utils/op_generic.h"
#include "src/string/memory_utils/op_x86.h"
#include "src/string/memory_utils/utils.h"

#include <stddef.h> // size_t

// Design rationale
// ================
//
// Using a profiler to observe size distributions for calls into libc
// functions, it was found that most operations act on a small number of
// bytes. This makes it important to favor small sizes.
//
// The tests for `count` are in ascending order so the cost of branching is
// proportional to the cost of copying.
//
// The function is written in C++ for several reasons:
// - The compiler can __see__ the code; this is useful when performing Profile
//   Guided Optimization as the optimized code can take advantage of branching
//   probabilities.
// - It also allows for easier customization and favors testing multiple
//   implementation parameters.
// - As compilers and processors get better, the generated code is improved
//   with little change on the code side.

namespace __llvm_libc {

[[maybe_unused]] static inline void
inline_memcpy_embedded_tiny(char *__restrict dst, const char *__restrict src,
size_t count) {
#pragma nounroll
for (size_t offset = 0; offset < count; ++offset)
builtin::Memcpy<1>::block(dst + offset, src + offset);
}

#if defined(LLVM_LIBC_ARCH_X86)
[[maybe_unused]] static inline void
inline_memcpy_x86(char *__restrict dst, const char *__restrict src,
size_t count) {
if (count == 0)
return;
if (count == 1)
return builtin::Memcpy<1>::block(dst, src);
if (count == 2)
return builtin::Memcpy<2>::block(dst, src);
if (count == 3)
return builtin::Memcpy<3>::block(dst, src);
if (count == 4)
return builtin::Memcpy<4>::block(dst, src);
if (count < 8)
return builtin::Memcpy<4>::head_tail(dst, src, count);
if (count < 16)
return builtin::Memcpy<8>::head_tail(dst, src, count);
if (count < 32)
return builtin::Memcpy<16>::head_tail(dst, src, count);
if (count < 64)
return builtin::Memcpy<32>::head_tail(dst, src, count);
if (count < 128)
return builtin::Memcpy<64>::head_tail(dst, src, count);
if (x86::kAvx && count < 256)
return builtin::Memcpy<128>::head_tail(dst, src, count);
builtin::Memcpy<32>::block(dst, src);
align_to_next_boundary<32, Arg::Dst>(dst, src, count);
static constexpr size_t kBlockSize = x86::kAvx ? 64 : 32;
return builtin::Memcpy<kBlockSize>::loop_and_tail(dst, src, count);
}

[[maybe_unused]] static inline void inline_memcpy_x86_maybe_interpose_repmovsb(
char *__restrict dst, const char *__restrict src, size_t count) {
// Whether to use rep;movsb exclusively, not at all, or only above a certain
// threshold.
// TODO: Use only a single preprocessor definition to simplify the code.
#ifndef LLVM_LIBC_MEMCPY_X86_USE_REPMOVSB_FROM_SIZE
#define LLVM_LIBC_MEMCPY_X86_USE_REPMOVSB_FROM_SIZE -1
#endif

static constexpr bool kUseOnlyRepMovsb =
LLVM_LIBC_IS_DEFINED(LLVM_LIBC_MEMCPY_X86_USE_ONLY_REPMOVSB);
static constexpr size_t kRepMovsbThreshold =
LLVM_LIBC_MEMCPY_X86_USE_REPMOVSB_FROM_SIZE;
if constexpr (kUseOnlyRepMovsb)
return x86::Memcpy::repmovsb(dst, src, count);
else if constexpr (kRepMovsbThreshold >= 0) {
if (unlikely(count >= kRepMovsbThreshold))
return x86::Memcpy::repmovsb(dst, src, count);
else
return inline_memcpy_x86(dst, src, count);
} else {
return inline_memcpy_x86(dst, src, count);
}
}
#endif // defined(LLVM_LIBC_ARCH_X86)
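// Hedged usage note (illustration, not part of the commit): the knobs above
// are meant to be supplied on the compiler command line, e.g.
//   -DLLVM_LIBC_MEMCPY_X86_USE_ONLY_REPMOVSB            -> always use rep;movsb
//   -DLLVM_LIBC_MEMCPY_X86_USE_REPMOVSB_FROM_SIZE=1024  -> rep;movsb for count >= 1024
// Leaving both undefined keeps the pure C++ inline_memcpy_x86 path: the
// default of -1, converted to size_t, makes the threshold effectively
// unreachable.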

#if defined(LLVM_LIBC_ARCH_AARCH64)
[[maybe_unused]] static inline void
inline_memcpy_aarch64(char *__restrict dst, const char *__restrict src,
size_t count) {
if (count == 0)
return;
if (count == 1)
return builtin::Memcpy<1>::block(dst, src);
if (count == 2)
return builtin::Memcpy<2>::block(dst, src);
if (count == 3)
return builtin::Memcpy<3>::block(dst, src);
if (count == 4)
return builtin::Memcpy<4>::block(dst, src);
if (count < 8)
return builtin::Memcpy<4>::head_tail(dst, src, count);
if (count < 16)
return builtin::Memcpy<8>::head_tail(dst, src, count);
if (count < 32)
return builtin::Memcpy<16>::head_tail(dst, src, count);
if (count < 64)
return builtin::Memcpy<32>::head_tail(dst, src, count);
if (count < 128)
return builtin::Memcpy<64>::head_tail(dst, src, count);
builtin::Memcpy<16>::block(dst, src);
align_to_next_boundary<16, Arg::Src>(dst, src, count);
return builtin::Memcpy<64>::loop_and_tail(dst, src, count);
}
#endif // defined(LLVM_LIBC_ARCH_AARCH64)

static inline void inline_memcpy(char *__restrict dst,
const char *__restrict src, size_t count) {
using namespace __llvm_libc::builtin;
#if defined(LLVM_LIBC_ARCH_X86)
/////////////////////////////////////////////////////////////////////////////
// LLVM_LIBC_ARCH_X86
/////////////////////////////////////////////////////////////////////////////

// Whether to use only rep;movsb.
constexpr bool USE_ONLY_REP_MOVSB =
LLVM_LIBC_IS_DEFINED(LLVM_LIBC_MEMCPY_X86_USE_ONLY_REPMOVSB);

// kRepMovsBSize == -1 : Only CopyAligned is used.
// kRepMovsBSize ==  0 : Only RepMovsb is used.
// else CopyAligned is used up to kRepMovsBSize and then RepMovsb.
constexpr size_t REP_MOVS_B_SIZE =
#if defined(LLVM_LIBC_MEMCPY_X86_USE_REPMOVSB_FROM_SIZE)
LLVM_LIBC_MEMCPY_X86_USE_REPMOVSB_FROM_SIZE;
#else
-1;
#endif // LLVM_LIBC_MEMCPY_X86_USE_REPMOVSB_FROM_SIZE

// Whether target supports AVX instructions.
constexpr bool HAS_AVX = LLVM_LIBC_IS_DEFINED(__AVX__);

#if defined(__AVX__)
using LoopBlockSize = _64;
#else
using LoopBlockSize = _32;
#endif

if (USE_ONLY_REP_MOVSB)
return copy<x86::Accelerator>(dst, src, count);

if (count == 0)
return;
if (count == 1)
return copy<_1>(dst, src);
if (count == 2)
return copy<_2>(dst, src);
if (count == 3)
return copy<_3>(dst, src);
if (count == 4)
return copy<_4>(dst, src);
if (count < 8)
return copy<HeadTail<_4>>(dst, src, count);
if (count < 16)
return copy<HeadTail<_8>>(dst, src, count);
if (count < 32)
return copy<HeadTail<_16>>(dst, src, count);
if (count < 64)
return copy<HeadTail<_32>>(dst, src, count);
if (count < 128)
return copy<HeadTail<_64>>(dst, src, count);
if (HAS_AVX && count < 256)
return copy<HeadTail<_128>>(dst, src, count);
if (count <= REP_MOVS_B_SIZE)
return copy<Align<_32, Arg::Dst>::Then<Loop<LoopBlockSize>>>(dst, src,
count);
return copy<x86::Accelerator>(dst, src, count);
return inline_memcpy_x86_maybe_interpose_repmovsb(dst, src, count);
#elif defined(LLVM_LIBC_ARCH_AARCH64)
/////////////////////////////////////////////////////////////////////////////
// LLVM_LIBC_ARCH_AARCH64
/////////////////////////////////////////////////////////////////////////////
if (count == 0)
return;
if (count == 1)
return copy<_1>(dst, src);
if (count == 2)
return copy<_2>(dst, src);
if (count == 3)
return copy<_3>(dst, src);
if (count == 4)
return copy<_4>(dst, src);
if (count < 8)
return copy<HeadTail<_4>>(dst, src, count);
if (count < 16)
return copy<HeadTail<_8>>(dst, src, count);
if (count < 32)
return copy<HeadTail<_16>>(dst, src, count);
if (count < 64)
return copy<HeadTail<_32>>(dst, src, count);
if (count < 128)
return copy<HeadTail<_64>>(dst, src, count);
return copy<Align<_16, Arg::Src>::Then<Loop<_64>>>(dst, src, count);
return inline_memcpy_aarch64(dst, src, count);
#elif defined(LLVM_LIBC_ARCH_ARM)
return inline_memcpy_embedded_tiny(dst, src, count);
#else
/////////////////////////////////////////////////////////////////////////////
// Default
/////////////////////////////////////////////////////////////////////////////
if (count == 0)
return;
if (count == 1)
return copy<_1>(dst, src);
if (count == 2)
return copy<_2>(dst, src);
if (count == 3)
return copy<_3>(dst, src);
if (count == 4)
return copy<_4>(dst, src);
if (count < 8)
return copy<HeadTail<_4>>(dst, src, count);
if (count < 16)
return copy<HeadTail<_8>>(dst, src, count);
if (count < 32)
return copy<HeadTail<_16>>(dst, src, count);
if (count < 64)
return copy<HeadTail<_32>>(dst, src, count);
if (count < 128)
return copy<HeadTail<_64>>(dst, src, count);
return copy<Align<_32, Arg::Src>::Then<Loop<_32>>>(dst, src, count);
#error "Unsupported platform"
#endif
}
@ -10,126 +10,104 @@
#define LLVM_LIBC_SRC_STRING_MEMORY_UTILS_MEMSET_IMPLEMENTATIONS_H

#include "src/__support/architectures.h"
#include "src/string/memory_utils/elements.h"
#include "src/string/memory_utils/op_aarch64.h"
#include "src/string/memory_utils/op_builtin.h"
#include "src/string/memory_utils/op_generic.h"
#include "src/string/memory_utils/op_x86.h"
#include "src/string/memory_utils/utils.h"

#include <stddef.h> // size_t

namespace __llvm_libc {

// A general purpose implementation assuming cheap unaligned writes for sizes:
// 1, 2, 4, 8, 16, 32 and 64 Bytes. Note that some architectures can't store 32
// or 64 Bytes at a time; the compiler will expand them as needed.
//
// This implementation is subject to change as we benchmark more processors. We
// may also want to customize it for processors with specialized instructions
// that perform better (e.g. `rep stosb`).
//
// A note on the apparent discrepancy in the use of 32 vs 64 Bytes writes.
// We want to balance two things here:
// - The number of redundant writes (when using `SetBlockOverlap`),
// - The number of conditionals for sizes <=128 (~90% of memset calls are for
//   such sizes).
//
// For the range 64-128:
// - SetBlockOverlap<64> uses no conditionals but always writes 128 Bytes; this
//   is wasteful near 65 but efficient toward 128.
// - SetAlignedBlocks<32> would consume between 3 and 4 conditionals and write
//   96 or 128 Bytes.
// - Another approach could be to use a hybrid approach copy<64>+Overlap<32>
//   for 65-96 and copy<96>+Overlap<32> for 97-128.
//
// Benchmarks showed that redundant writes were cheap (for Intel X86) but
// conditionals were expensive, even on processors that do not support writing
// 64B at a time (pre-AVX512F). We also want to favor short functions that
// allow more hot code to fit in the iL1 cache.
//
// Above 128 we have to use conditionals since we don't know the upper bound in
// advance. SetAlignedBlocks<64> may waste up to 63 Bytes, SetAlignedBlocks<32>
// may waste up to 31 Bytes. Benchmarks showed that SetAlignedBlocks<64> was not
// superior for sizes that mattered.
inline static void inline_memset(char *dst, unsigned char value, size_t count) {
[[maybe_unused]] inline static void
inline_memset_embedded_tiny(Ptr dst, uint8_t value, size_t count) {
#pragma nounroll
for (size_t offset = 0; offset < count; ++offset)
generic::Memset<1, 1>::block(dst + offset, value);
}

#if defined(LLVM_LIBC_ARCH_X86)
/////////////////////////////////////////////////////////////////////////////
// LLVM_LIBC_ARCH_X86
/////////////////////////////////////////////////////////////////////////////
using namespace __llvm_libc::x86;
template <size_t MaxSize>
[[maybe_unused]] inline static void inline_memset_x86(Ptr dst, uint8_t value,
size_t count) {
if (count == 0)
return;
if (count == 1)
return splat_set<_1>(dst, value);
return generic::Memset<1, MaxSize>::block(dst, value);
if (count == 2)
return splat_set<_2>(dst, value);
return generic::Memset<2, MaxSize>::block(dst, value);
if (count == 3)
return splat_set<_3>(dst, value);
return generic::Memset<3, MaxSize>::block(dst, value);
if (count <= 8)
return splat_set<HeadTail<_4>>(dst, value, count);
return generic::Memset<4, MaxSize>::head_tail(dst, value, count);
if (count <= 16)
return splat_set<HeadTail<_8>>(dst, value, count);
return generic::Memset<8, MaxSize>::head_tail(dst, value, count);
if (count <= 32)
return splat_set<HeadTail<_16>>(dst, value, count);
return generic::Memset<16, MaxSize>::head_tail(dst, value, count);
if (count <= 64)
return splat_set<HeadTail<_32>>(dst, value, count);
return generic::Memset<32, MaxSize>::head_tail(dst, value, count);
if (count <= 128)
return splat_set<HeadTail<_64>>(dst, value, count);
return splat_set<Align<_32, Arg::Dst>::Then<Loop<_32>>>(dst, value, count);
#elif defined(LLVM_LIBC_ARCH_AARCH64)
/////////////////////////////////////////////////////////////////////////////
// LLVM_LIBC_ARCH_AARCH64
/////////////////////////////////////////////////////////////////////////////
using namespace __llvm_libc::aarch64_memset;
return generic::Memset<64, MaxSize>::head_tail(dst, value, count);
// Aligned loop
generic::Memset<32, MaxSize>::block(dst, value);
align_to_next_boundary<32>(dst, count);
return generic::Memset<32, MaxSize>::loop_and_tail(dst, value, count);
}
#endif // defined(LLVM_LIBC_ARCH_X86)

#if defined(LLVM_LIBC_ARCH_AARCH64)
template <size_t MaxSize>
[[maybe_unused]] inline static void
inline_memset_aarch64(Ptr dst, uint8_t value, size_t count) {
if (count == 0)
return;
if (count <= 3) {
splat_set<_1>(dst, value);
generic::Memset<1, MaxSize>::block(dst, value);
if (count > 1)
splat_set<Tail<_2>>(dst, value, count);
generic::Memset<2, MaxSize>::tail(dst, value, count);
return;
}
if (count <= 8)
return splat_set<HeadTail<_4>>(dst, value, count);
return generic::Memset<4, MaxSize>::head_tail(dst, value, count);
if (count <= 16)
return splat_set<HeadTail<_8>>(dst, value, count);
return generic::Memset<8, MaxSize>::head_tail(dst, value, count);
if (count <= 32)
return splat_set<HeadTail<_16>>(dst, value, count);
return generic::Memset<16, MaxSize>::head_tail(dst, value, count);
if (count <= (32 + 64)) {
splat_set<_32>(dst, value);
generic::Memset<32, MaxSize>::block(dst, value);
if (count <= 64)
return splat_set<Tail<_32>>(dst, value, count);
splat_set<Skip<32>::Then<_32>>(dst, value);
splat_set<Tail<_32>>(dst, value, count);
return generic::Memset<32, MaxSize>::tail(dst, value, count);
generic::Memset<32, MaxSize>::block(dst + 32, value);
generic::Memset<32, MaxSize>::tail(dst, value, count);
return;
}
if (count >= 448 && value == 0 && hasZva())
return splat_set<Align<_64, Arg::_1>::Then<Loop<Zva64, _64>>>(dst, 0,
count);
else
return splat_set<Align<_16, Arg::_1>::Then<Loop<_64>>>(dst, value, count);
#else
/////////////////////////////////////////////////////////////////////////////
// Default
/////////////////////////////////////////////////////////////////////////////
using namespace ::__llvm_libc::scalar;
if (count >= 448 && value == 0 && aarch64::neon::hasZva()) {
generic::Memset<64, MaxSize>::block(dst, 0);
align_to_next_boundary<64>(dst, count);
return aarch64::neon::BzeroCacheLine<64>::loop_and_tail(dst, 0, count);
} else {
generic::Memset<16, MaxSize>::block(dst, value);
align_to_next_boundary<16>(dst, count);
return generic::Memset<64, MaxSize>::loop_and_tail(dst, value, count);
}
}
#endif // defined(LLVM_LIBC_ARCH_AARCH64)

if (count == 0)
return;
if (count == 1)
return splat_set<_1>(dst, value);
if (count == 2)
return splat_set<_2>(dst, value);
if (count == 3)
return splat_set<_3>(dst, value);
if (count <= 8)
return splat_set<HeadTail<_4>>(dst, value, count);
if (count <= 16)
return splat_set<HeadTail<_8>>(dst, value, count);
if (count <= 32)
return splat_set<HeadTail<_16>>(dst, value, count);
if (count <= 64)
return splat_set<HeadTail<_32>>(dst, value, count);
if (count <= 128)
return splat_set<HeadTail<_64>>(dst, value, count);
return splat_set<Align<_32, Arg::Dst>::Then<Loop<_32>>>(dst, value, count);
inline static void inline_memset(Ptr dst, uint8_t value, size_t count) {
#if defined(LLVM_LIBC_ARCH_X86)
static constexpr size_t kMaxSize = x86::kAvx512F ? 64
                                   : x86::kAvx   ? 32
                                   : x86::kSse2  ? 16
                                                 : 8;
return inline_memset_x86<kMaxSize>(dst, value, count);
#elif defined(LLVM_LIBC_ARCH_AARCH64)
static constexpr size_t kMaxSize = aarch64::kNeon ? 16 : 8;
return inline_memset_aarch64<kMaxSize>(dst, value, count);
#elif defined(LLVM_LIBC_ARCH_ARM)
return inline_memset_embedded_tiny(dst, value, count);
#else
#error "Unsupported platform"
#endif
}
175  libc/src/string/memory_utils/op_aarch64.h  Normal file
@ -0,0 +1,175 @@
//===-- aarch64 implementation of memory function building blocks ---------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file provides aarch64 specific building blocks to compose memory
// functions.
//
//===----------------------------------------------------------------------===//
#ifndef LLVM_LIBC_SRC_STRING_MEMORY_UTILS_OP_AARCH64_H
#define LLVM_LIBC_SRC_STRING_MEMORY_UTILS_OP_AARCH64_H

#include "src/__support/architectures.h"

#if defined(LLVM_LIBC_ARCH_AARCH64)

#include "src/__support/common.h"
#include "src/string/memory_utils/op_generic.h"

#ifdef __ARM_NEON
#include <arm_neon.h>
#endif //__ARM_NEON

namespace __llvm_libc::aarch64 {

static inline constexpr bool kNeon = LLVM_LIBC_IS_DEFINED(__ARM_NEON);

namespace neon {

template <size_t Size> struct BzeroCacheLine {
static constexpr size_t SIZE = Size;

static inline void block(Ptr dst, uint8_t) {
static_assert(Size == 64);
#if __SIZEOF_POINTER__ == 4
asm("dc zva, %w[dst]" : : [dst] "r"(dst) : "memory");
#else
asm("dc zva, %[dst]" : : [dst] "r"(dst) : "memory");
#endif
}

static inline void loop_and_tail(Ptr dst, uint8_t value, size_t count) {
static_assert(Size > 1);
size_t offset = 0;
do {
block(dst + offset, value);
offset += SIZE;
} while (offset < count - SIZE);
// Unaligned store, we can't use 'dc zva' here.
static constexpr size_t kMaxSize = kNeon ? 16 : 8;
generic::Memset<Size, kMaxSize>::tail(dst, value, count);
}
};

inline static bool hasZva() {
uint64_t zva_val;
asm("mrs %[zva_val], dczid_el0" : [zva_val] "=r"(zva_val));
// DC ZVA is permitted if DZP, bit [4], is zero.
// BS, bits [3:0], is the log2 of the block size in words.
// So the next line checks whether the instruction is permitted and the block
// size is 16 words (i.e. 64 bytes).
return (zva_val & 0b11111) == 0b00100;
}
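// Worked decode of the check above (illustration only, not part of the
// commit): with DCZID_EL0 == 0b00100, DZP (bit 4) is 0 so DC ZVA is
// permitted, and BS == 4 encodes a block of 2^4 == 16 four-byte words,
// i.e. the 64-byte cache line that the Zva64 path expects.
static_assert((0b00100 & 0b10000) == 0, "DZP clear: DC ZVA permitted");
static_assert((1u << (0b00100 & 0b01111)) * 4 == 64,
              "BS == 4 -> 16 words -> 64 bytes");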

} // namespace neon

///////////////////////////////////////////////////////////////////////////////
// Memset

///////////////////////////////////////////////////////////////////////////////
// Bcmp
template <size_t Size> struct Bcmp {
static constexpr size_t SIZE = Size;
static constexpr size_t BlockSize = 32;

static const unsigned char *as_u8(CPtr ptr) {
return reinterpret_cast<const unsigned char *>(ptr);
}

static inline BcmpReturnType block(CPtr p1, CPtr p2) {
if constexpr (Size == BlockSize) {
auto _p1 = as_u8(p1);
auto _p2 = as_u8(p2);
uint8x16_t a = vld1q_u8(_p1);
uint8x16_t b = vld1q_u8(_p1 + 16);
uint8x16_t n = vld1q_u8(_p2);
uint8x16_t o = vld1q_u8(_p2 + 16);
uint8x16_t an = veorq_u8(a, n);
uint8x16_t bo = veorq_u8(b, o);
// anbo = (a ^ n) | (b ^ o). At least one byte is nonzero if there is
// a difference between the two buffers. We reduce this value down to 4
// bytes in two steps. First, calculate the saturated move value when
// going from 2x64b to 2x32b. Second, compute the max of the 2x32b to get
// a single 32 bit nonzero value if a mismatch occurred.
uint8x16_t anbo = vorrq_u8(an, bo);
uint32x2_t anbo_reduced = vqmovn_u64(vreinterpretq_u64_u8(anbo));
return vmaxv_u32(anbo_reduced);
} else if constexpr ((Size % BlockSize) == 0) {
for (size_t offset = 0; offset < Size; offset += BlockSize)
if (auto value = Bcmp<BlockSize>::block(p1 + offset, p2 + offset))
return value;
} else {
deferred_static_assert("SIZE not implemented");
}
return BcmpReturnType::ZERO();
}

static inline BcmpReturnType tail(CPtr p1, CPtr p2, size_t count) {
return block(p1 + count - SIZE, p2 + count - SIZE);
}

static inline BcmpReturnType head_tail(CPtr p1, CPtr p2, size_t count) {
if constexpr (Size <= 8) {
return generic::Bcmp<Size>::head_tail(p1, p2, count);
} else if constexpr (Size == 16) {
auto _p1 = as_u8(p1);
auto _p2 = as_u8(p2);
uint8x16_t a = vld1q_u8(_p1);
uint8x16_t b = vld1q_u8(_p1 + count - 16);
uint8x16_t n = vld1q_u8(_p2);
uint8x16_t o = vld1q_u8(_p2 + count - 16);
uint8x16_t an = veorq_u8(a, n);
uint8x16_t bo = veorq_u8(b, o);
// anbo = (a ^ n) | (b ^ o)
uint8x16_t anbo = vorrq_u8(an, bo);
uint32x2_t anbo_reduced = vqmovn_u64(vreinterpretq_u64_u8(anbo));
return vmaxv_u32(anbo_reduced);
} else if constexpr (Size == 32) {
auto _p1 = as_u8(p1);
auto _p2 = as_u8(p2);
uint8x16_t a = vld1q_u8(_p1);
uint8x16_t b = vld1q_u8(_p1 + 16);
uint8x16_t c = vld1q_u8(_p1 + count - 16);
uint8x16_t d = vld1q_u8(_p1 + count - 32);
uint8x16_t n = vld1q_u8(_p2);
uint8x16_t o = vld1q_u8(_p2 + 16);
uint8x16_t p = vld1q_u8(_p2 + count - 16);
uint8x16_t q = vld1q_u8(_p2 + count - 32);
uint8x16_t an = veorq_u8(a, n);
uint8x16_t bo = veorq_u8(b, o);
uint8x16_t cp = veorq_u8(c, p);
uint8x16_t dq = veorq_u8(d, q);
uint8x16_t anbo = vorrq_u8(an, bo);
uint8x16_t cpdq = vorrq_u8(cp, dq);
// abnocpdq = ((a ^ n) | (b ^ o)) | ((c ^ p) | (d ^ q)). Reduce this to
// a nonzero 32 bit value if a mismatch occurred.
uint64x2_t abnocpdq = vreinterpretq_u64_u8(anbo | cpdq);
uint32x2_t abnocpdq_reduced = vqmovn_u64(abnocpdq);
return vmaxv_u32(abnocpdq_reduced);
} else {
deferred_static_assert("SIZE not implemented");
}
return BcmpReturnType::ZERO();
}

static inline BcmpReturnType loop_and_tail(CPtr p1, CPtr p2, size_t count) {
static_assert(Size > 1);
size_t offset = 0;
do {
if (auto value = block(p1 + offset, p2 + offset))
return value;
offset += SIZE;
} while (offset < count - SIZE);
return tail(p1, p2, count);
}
};
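// Scalar model of the NEON reduction used above (illustration only, not part
// of the commit): saturate each 64-bit half to 32 bits, then take the max;
// the result is nonzero iff either half was nonzero, mirroring
// vqmovn_u64 + vmaxv_u32.
constexpr uint32_t demo_saturate_u64_to_u32(uint64_t v) {
  return v > 0xFFFFFFFFu ? 0xFFFFFFFFu : static_cast<uint32_t>(v);
}
constexpr uint32_t demo_reduce(uint64_t lo, uint64_t hi) {
  const uint32_t a = demo_saturate_u64_to_u32(lo);
  const uint32_t b = demo_saturate_u64_to_u32(hi);
  return a > b ? a : b;
}
static_assert(demo_reduce(0, 0) == 0, "equal buffers reduce to zero");
static_assert(demo_reduce(0x0000010000000000ULL, 0) != 0,
              "a difference in a high byte survives the saturating narrow");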

} // namespace __llvm_libc::aarch64

#endif // LLVM_LIBC_ARCH_AARCH64

#endif // LLVM_LIBC_SRC_STRING_MEMORY_UTILS_OP_AARCH64_H
148  libc/src/string/memory_utils/op_builtin.h  Normal file
@ -0,0 +1,148 @@
//===-- Implementation using the __builtin_XXX_inline ---------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file provides generic C++ building blocks to compose memory functions.
// They rely on the compiler to generate the best possible code through the use
// of the `__builtin_XXX_inline` builtins. These builtins are currently only
// available in Clang.
//
//===----------------------------------------------------------------------===//
#ifndef LLVM_LIBC_SRC_STRING_MEMORY_UTILS_OP_BUILTIN_H
#define LLVM_LIBC_SRC_STRING_MEMORY_UTILS_OP_BUILTIN_H

#include "src/string/memory_utils/utils.h"

namespace __llvm_libc::builtin {

///////////////////////////////////////////////////////////////////////////////
// Memcpy
template <size_t Size> struct Memcpy {
static constexpr size_t SIZE = Size;
static inline void block(Ptr __restrict dst, CPtr __restrict src) {
#ifdef LLVM_LIBC_HAS_BUILTIN_MEMCPY_INLINE
return __builtin_memcpy_inline(dst, src, SIZE);
#else
deferred_static_assert("Missing __builtin_memcpy_inline");
(void)dst;
(void)src;
#endif
}

static inline void tail(Ptr __restrict dst, CPtr __restrict src,
size_t count) {
block(dst + count - SIZE, src + count - SIZE);
}

static inline void head_tail(Ptr __restrict dst, CPtr __restrict src,
size_t count) {
block(dst, src);
tail(dst, src, count);
}

static inline void loop_and_tail(Ptr __restrict dst, CPtr __restrict src,
size_t count) {
static_assert(Size > 1);
size_t offset = 0;
do {
block(dst + offset, src + offset);
offset += SIZE;
} while (offset < count - SIZE);
tail(dst, src, count);
}
};
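// Worked sketch of the head/tail pattern above (illustration only, not part
// of the commit): for Size == 8 and 8 <= count <= 16, block() covers bytes
// [0, 8) and tail() covers bytes [count - 8, count); e.g. for count == 13 the
// two stores overlap on [5, 8), which is harmless for memcpy semantics, and
// the whole range is written with exactly two fixed-size copies and no loop.
[[maybe_unused]] static inline void demo_head_tail8(Ptr __restrict dst,
                                                    CPtr __restrict src,
                                                    size_t count) {
  Memcpy<8>::head_tail(dst, src, count); // head: [0, 8), tail: [count-8, count)
}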

///////////////////////////////////////////////////////////////////////////////
// Memset
template <size_t Size> struct Memset {
using ME = Memset;
static constexpr size_t SIZE = Size;
static inline void block(Ptr dst, uint8_t value) {
#ifdef LLVM_LIBC_HAS_BUILTIN_MEMSET_INLINE
__builtin_memset_inline(dst, value, Size);
#else
deferred_static_assert("Missing __builtin_memset_inline");
(void)dst;
(void)value;
#endif
}

static inline void tail(Ptr dst, uint8_t value, size_t count) {
block(dst + count - SIZE, value);
}

static inline void head_tail(Ptr dst, uint8_t value, size_t count) {
block(dst, value);
tail(dst, value, count);
}

static inline void loop_and_tail(Ptr dst, uint8_t value, size_t count) {
static_assert(Size > 1);
size_t offset = 0;
do {
block(dst + offset, value);
offset += SIZE;
} while (offset < count - SIZE);
tail(dst, value, count);
}
};

///////////////////////////////////////////////////////////////////////////////
// Bcmp
template <size_t Size> struct Bcmp {
using ME = Bcmp;
static constexpr size_t SIZE = Size;
static inline BcmpReturnType block(CPtr, CPtr) {
deferred_static_assert("Missing __builtin_memcmp_inline");
return BcmpReturnType::ZERO();
}

static inline BcmpReturnType tail(CPtr, CPtr, size_t) {
deferred_static_assert("Not implemented");
return BcmpReturnType::ZERO();
}

static inline BcmpReturnType head_tail(CPtr, CPtr, size_t) {
deferred_static_assert("Not implemented");
return BcmpReturnType::ZERO();
}

static inline BcmpReturnType loop_and_tail(CPtr, CPtr, size_t) {
deferred_static_assert("Not implemented");
return BcmpReturnType::ZERO();
}
};

///////////////////////////////////////////////////////////////////////////////
// Memcmp
template <size_t Size> struct Memcmp {
using ME = Memcmp;
static constexpr size_t SIZE = Size;
static inline MemcmpReturnType block(CPtr, CPtr) {
deferred_static_assert("Missing __builtin_memcmp_inline");
return MemcmpReturnType::ZERO();
}

static inline MemcmpReturnType tail(CPtr, CPtr, size_t) {
deferred_static_assert("Not implemented");
return MemcmpReturnType::ZERO();
}

static inline MemcmpReturnType head_tail(CPtr, CPtr, size_t) {
deferred_static_assert("Not implemented");
return MemcmpReturnType::ZERO();
}

static inline MemcmpReturnType loop_and_tail(CPtr, CPtr, size_t) {
deferred_static_assert("Not implemented");
return MemcmpReturnType::ZERO();
}
};

} // namespace __llvm_libc::builtin

#endif // LLVM_LIBC_SRC_STRING_MEMORY_UTILS_OP_BUILTIN_H
466  libc/src/string/memory_utils/op_generic.h  Normal file
@ -0,0 +1,466 @@
//===-- Generic implementation of memory function building blocks ---------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file provides generic C++ building blocks.
// Depending on the requested size, the block operation uses unsigned integral
// types, vector types or an array of the type with the maximum size.
//
// The maximum size is passed as a template argument. For instance, on x86
// platforms that only support integral types the maximum size would be 8
// (corresponding to uint64_t). On this platform if we request the size 32, this
// would be treated as a cpp::array<uint64_t, 4>.
//
// On the other hand, if the platform is x86 with support for AVX the maximum
// size is 32 and the operation can be handled with a single native operation.
//
//===----------------------------------------------------------------------===//

#ifndef LLVM_LIBC_SRC_STRING_MEMORY_UTILS_OP_GENERIC_H
#define LLVM_LIBC_SRC_STRING_MEMORY_UTILS_OP_GENERIC_H

#include "src/__support/CPP/array.h"
#include "src/__support/CPP/type_traits.h"
#include "src/__support/endian.h"
#include "src/string/memory_utils/op_builtin.h"
#include "src/string/memory_utils/utils.h"

#include <stdint.h>

namespace __llvm_libc::generic {

// CTPair and CTMap below implement a compile time map.
// This is useful to map from a Size to a type handling this size.
//
// Example usage:
// using MyMap = CTMap<CTPair<1, uint8_t>,
//                     CTPair<2, uint16_t>>;
// ...
// using UInt8T = MyMap::find_type<1>;
template <size_t I, typename T> struct CTPair {
using type = T;
static CTPair get_pair(cpp::integral_constant<size_t, I>) { return {}; }
};
template <typename... Pairs> struct CTMap : public Pairs... {
using Pairs::get_pair...;
template <size_t I>
using find_type =
typename decltype(get_pair(cpp::integral_constant<size_t, I>{}))::type;
};
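// Compilable version of the example above (illustration only, not part of
// the commit):
using DemoMap = CTMap<CTPair<1, uint8_t>, CTPair<2, uint16_t>>;
static_assert(cpp::is_same_v<DemoMap::find_type<1>, uint8_t>);
static_assert(cpp::is_same_v<DemoMap::find_type<2>, uint16_t>);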

// Helper to test if a type is void.
template <typename T> inline constexpr bool is_void_v = cpp::is_same_v<T, void>;

// Implements load, store and splat for unsigned integral types.
template <typename T> struct ScalarType {
using Type = T;
static_assert(cpp::is_integral_v<Type> && !cpp::is_signed_v<Type>);

static inline Type load(CPtr src) { return ::__llvm_libc::load<Type>(src); }
static inline void store(Ptr dst, Type value) {
::__llvm_libc::store<Type>(dst, value);
}
static inline Type splat(uint8_t value) {
return Type(~0) / Type(0xFF) * Type(value);
}
};
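// Worked example of the splat arithmetic above (illustration only, not part
// of the commit): for Type == uint32_t, ~0 / 0xFF == 0x01010101, so
// multiplying by the byte value replicates it into every byte of the word.
static_assert(0xFFFFFFFFu / 0xFFu == 0x01010101u);
static_assert(0x01010101u * 0xABu == 0xABABABABu);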

// Implements load, store and splat for vector types.
template <size_t Size> struct VectorType {
using Type = uint8_t __attribute__((__vector_size__(Size)));
static inline Type load(CPtr src) { return ::__llvm_libc::load<Type>(src); }
static inline void store(Ptr dst, Type value) {
::__llvm_libc::store<Type>(dst, value);
}
static inline Type splat(uint8_t value) {
Type Out;
// This for loop is optimized out for vector types.
for (size_t i = 0; i < Size; ++i)
Out[i] = static_cast<uint8_t>(value);
return Out;
}
};

// We currently don't support 8- or 16-bit platforms; it must be 32- or 64-bit.
static_assert((UINTPTR_MAX == 4294967295U) ||
(UINTPTR_MAX == 18446744073709551615UL));

// Map from sizes to structures offering static load, store and splat methods.
// Note: On platforms lacking vector support, we use the ArrayType below and
// decompose the operation in smaller pieces.
using NativeTypeMap =
CTMap<CTPair<1, ScalarType<uint8_t>>,  //
CTPair<2, ScalarType<uint16_t>>, //
CTPair<4, ScalarType<uint32_t>>, //
#if defined(LLVM_LIBC_ARCH_X86_64) || defined(LLVM_LIBC_ARCH_AARCH64)
CTPair<8, ScalarType<uint64_t>>, // Not available on 32bit
#endif //
CTPair<16, VectorType<16>>, //
CTPair<32, VectorType<32>>, //
CTPair<64, VectorType<64>>>;

// Implements load, store and splat for sizes not natively supported by the
// platform. SubType is either ScalarType or VectorType.
template <typename SubType, size_t ArraySize> struct ArrayType {
using Type = cpp::array<typename SubType::Type, ArraySize>;
static constexpr size_t SizeOfElement = sizeof(typename SubType::Type);
static inline Type load(CPtr src) {
Type Value;
for (size_t I = 0; I < ArraySize; ++I)
Value[I] = SubType::load(src + (I * SizeOfElement));
return Value;
}
static inline void store(Ptr dst, Type Value) {
for (size_t I = 0; I < ArraySize; ++I)
SubType::store(dst + (I * SizeOfElement), Value[I]);
}
static inline Type splat(uint8_t value) {
Type Out;
for (size_t I = 0; I < ArraySize; ++I)
Out[I] = SubType::splat(value);
return Out;
}
};

// Checks whether we should use an ArrayType.
template <size_t Size, size_t MaxSize> static constexpr bool useArrayType() {
return (Size > MaxSize) && ((Size % MaxSize) == 0) &&
!is_void_v<NativeTypeMap::find_type<MaxSize>>;
}

// Compute the type to handle an operation of Size bytes knowing that the
// underlying platform only support native types up to MaxSize bytes.
template <size_t Size, size_t MaxSize>
using getTypeFor = cpp::conditional_t<
useArrayType<Size, MaxSize>(),
ArrayType<NativeTypeMap::find_type<MaxSize>, Size / MaxSize>,
NativeTypeMap::find_type<Size>>;
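// Worked example (illustration only, not part of the commit): a 32-byte
// operation on a platform whose widest native type is 16 bytes decomposes
// into an array of two 16-byte vector operations.
static_assert(useArrayType<32, 16>());
static_assert(cpp::is_same_v<getTypeFor<32, 16>,
                             ArrayType<VectorType<16>, 2>>);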

///////////////////////////////////////////////////////////////////////////////
// Memcpy
// When building with clang we can delegate to the builtin implementation.
///////////////////////////////////////////////////////////////////////////////

template <size_t Size> using Memcpy = builtin::Memcpy<Size>;

///////////////////////////////////////////////////////////////////////////////
// Memset
// The MaxSize template argument gives the maximum size handled natively by the
// platform. For instance on x86 with AVX support this would be 32. If a size
// greater than MaxSize is requested we break the operation down in smaller
// pieces of size MaxSize.
///////////////////////////////////////////////////////////////////////////////
template <size_t Size, size_t MaxSize> struct Memset {
static_assert(is_power2(MaxSize));
static constexpr size_t SIZE = Size;

static inline void block(Ptr dst, uint8_t value) {
if constexpr (Size == 3) {
Memset<1, MaxSize>::block(dst + 2, value);
Memset<2, MaxSize>::block(dst, value);
} else {
using T = getTypeFor<Size, MaxSize>;
if constexpr (is_void_v<T>) {
deferred_static_assert("Unimplemented Size");
} else {
T::store(dst, T::splat(value));
}
}
}

static inline void tail(Ptr dst, uint8_t value, size_t count) {
block(dst + count - SIZE, value);
}

static inline void head_tail(Ptr dst, uint8_t value, size_t count) {
block(dst, value);
tail(dst, value, count);
}

static inline void loop_and_tail(Ptr dst, uint8_t value, size_t count) {
static_assert(SIZE > 1);
size_t offset = 0;
do {
block(dst + offset, value);
offset += SIZE;
} while (offset < count - SIZE);
tail(dst, value, count);
}
};
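// Illustration (not part of the commit): with MaxSize == 16, a 64-byte
// Memset block decomposes into an array of four 16-byte vector stores,
// whereas MaxSize == 64 handles it with a single native store.
static_assert(useArrayType<64, 16>());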
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
// Bcmp
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
template <size_t Size> struct Bcmp {
|
||||
static constexpr size_t SIZE = Size;
|
||||
static constexpr size_t MaxSize = 8;
|
||||
|
||||
template <typename T> static inline uint32_t load_xor(CPtr p1, CPtr p2) {
|
||||
return load<T>(p1) ^ load<T>(p2);
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
static inline uint32_t load_not_equal(CPtr p1, CPtr p2) {
|
||||
return load<T>(p1) != load<T>(p2);
|
||||
}
|
||||
|
||||
static inline BcmpReturnType block(CPtr p1, CPtr p2) {
|
||||
static constexpr size_t MaxSize = 8;
|
||||
if constexpr (Size == 1) {
|
||||
return load_xor<uint8_t>(p1, p2);
|
||||
} else if constexpr (Size == 2) {
|
||||
return load_xor<uint16_t>(p1, p2);
|
||||
} else if constexpr (Size == 4) {
|
||||
return load_xor<uint32_t>(p1, p2);
|
||||
} else if constexpr (Size == 8) {
|
||||
return load_not_equal<uint64_t>(p1, p2);
|
||||
} else if constexpr (useArrayType<Size, MaxSize>()) {
|
||||
for (size_t offset = 0; offset < Size; offset += MaxSize)
|
||||
if (auto value = Bcmp<MaxSize>::block(p1 + offset, p2 + offset))
|
||||
return value;
|
||||
} else {
|
||||
deferred_static_assert("Unimplemented Size");
|
||||
}
|
||||
return BcmpReturnType::ZERO();
|
||||
}
|
||||
|
||||
static inline BcmpReturnType tail(CPtr p1, CPtr p2, size_t count) {
|
||||
return block(p1 + count - SIZE, p2 + count - SIZE);
|
||||
}
|
||||
|
||||
static inline BcmpReturnType head_tail(CPtr p1, CPtr p2, size_t count) {
|
||||
return block(p1, p2) | tail(p1, p2, count);
|
||||
}
|
||||
|
||||
static inline BcmpReturnType loop_and_tail(CPtr p1, CPtr p2, size_t count) {
|
||||
static_assert(Size > 1);
|
||||
size_t offset = 0;
|
||||
do {
|
||||
if (auto value = block(p1 + offset, p2 + offset))
|
||||
return value;
|
||||
offset += SIZE;
|
||||
} while (offset < count - SIZE);
|
||||
return tail(p1, p2, count);
|
||||
}
|
||||
};

///////////////////////////////////////////////////////////////////////////////
// Memcmp
///////////////////////////////////////////////////////////////////////////////
template <size_t Size> struct Memcmp {
  static constexpr size_t SIZE = Size;
  static constexpr size_t MaxSize = 8;

  template <typename T> static inline T load_be(CPtr ptr) {
    return Endian::to_big_endian(load<T>(ptr));
  }

  template <typename T>
  static inline MemcmpReturnType load_be_diff(CPtr p1, CPtr p2) {
    return load_be<T>(p1) - load_be<T>(p2);
  }

  template <typename T>
  static inline MemcmpReturnType load_be_cmp(CPtr p1, CPtr p2) {
    const auto la = load_be<T>(p1);
    const auto lb = load_be<T>(p2);
    return la > lb ? 1 : la < lb ? -1 : 0;
  }

  static inline MemcmpReturnType block(CPtr p1, CPtr p2) {
    if constexpr (Size == 1) {
      return load_be_diff<uint8_t>(p1, p2);
    } else if constexpr (Size == 2) {
      return load_be_diff<uint16_t>(p1, p2);
    } else if constexpr (Size == 4) {
      return load_be_cmp<uint32_t>(p1, p2);
    } else if constexpr (Size == 8) {
      return load_be_cmp<uint64_t>(p1, p2);
    } else if constexpr (useArrayType<Size, MaxSize>()) {
      for (size_t offset = 0; offset < Size; offset += MaxSize)
        if (Bcmp<MaxSize>::block(p1 + offset, p2 + offset))
          return Memcmp<MaxSize>::block(p1 + offset, p2 + offset);
      return MemcmpReturnType::ZERO();
    } else if constexpr (Size == 3) {
      if (auto value = Memcmp<2>::block(p1, p2))
        return value;
      return Memcmp<1>::block(p1 + 2, p2 + 2);
    } else {
      deferred_static_assert("Unimplemented Size");
    }
  }

  static inline MemcmpReturnType tail(CPtr p1, CPtr p2, size_t count) {
    return block(p1 + count - SIZE, p2 + count - SIZE);
  }

  static inline MemcmpReturnType head_tail(CPtr p1, CPtr p2, size_t count) {
    if (auto value = block(p1, p2))
      return value;
    return tail(p1, p2, count);
  }

  static inline MemcmpReturnType loop_and_tail(CPtr p1, CPtr p2, size_t count) {
    static_assert(Size > 1);
    size_t offset = 0;
    do {
      if (auto value = block(p1 + offset, p2 + offset))
        return value;
      offset += SIZE;
    } while (offset < count - SIZE);
    return tail(p1, p2, count);
  }
};
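
Editorial aside on why the big-endian loads are correct (this reasoning is implied but not spelled out by the patch): byte-swapping both words makes the first mismatching byte the most significant differing byte, so unsigned integer order agrees with memcmp's byte-wise order. It also shows why the 1- and 2-byte blocks may return the raw difference (it fits in an int32) while the 4- and 8-byte blocks must reduce to -1/0/1 to avoid overflow.

// Hypothetical two-byte illustration on a little-endian host, not part of
// this commit: p1 = {0x02, 0x01}, p2 = {0x01, 0xFF}.
// Native loads give 0x0102 vs 0xFF01 and would wrongly order p1 < p2;
// the big-endian views 0x0201 vs 0x01FF correctly order p1 > p2, matching
// byte-wise memcmp on the first byte (0x02 > 0x01).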

///////////////////////////////////////////////////////////////////////////////
// Memmove
///////////////////////////////////////////////////////////////////////////////

template <size_t Size, size_t MaxSize> struct Memmove {
  static_assert(is_power2(MaxSize));
  using T = getTypeFor<Size, MaxSize>;
  static constexpr size_t SIZE = Size;

  static inline void block(Ptr dst, CPtr src) {
    if constexpr (is_void_v<T>) {
      deferred_static_assert("Unimplemented Size");
    } else {
      T::store(dst, T::load(src));
    }
  }

  static inline void head_tail(Ptr dst, CPtr src, size_t count) {
    const size_t offset = count - Size;
    if constexpr (is_void_v<T>) {
      deferred_static_assert("Unimplemented Size");
    } else {
      // The load and store operations can be performed in any order as long as
      // they are not interleaved. More investigations are needed to determine
      // the best order.
      const auto head = T::load(src);
      const auto tail = T::load(src + offset);
      T::store(dst, head);
      T::store(dst + offset, tail);
    }
  }

  // Align forward is suitable when dst < src. The alignment is performed with
  // a HeadTail operation of count ∈ [Alignment, 2 x Alignment].
  //
  // e.g. Moving two bytes forward, we make sure src is aligned.
  // [ | | | | ]
  // [____XXXXXXXXXXXXXXXXXXXXXXXXXXXX_]
  // [____LLLLLLLL_____________________]
  // [___________LLLLLLLA______________]
  // [_SSSSSSSS________________________]
  // [________SSSSSSSS_________________]
  //
  // e.g. Moving two bytes forward, we make sure dst is aligned.
  // [ | | | | ]
  // [____XXXXXXXXXXXXXXXXXXXXXXXXXXXX_]
  // [____LLLLLLLL_____________________]
  // [______LLLLLLLL___________________]
  // [_SSSSSSSS________________________]
  // [___SSSSSSSA______________________]
  template <Arg AlignOn>
  static inline void align_forward(Ptr &dst, CPtr &src, size_t &count) {
    Ptr prev_dst = dst;
    CPtr prev_src = src;
    size_t prev_count = count;
    align_to_next_boundary<Size, AlignOn>(dst, src, count);
    adjust(Size, dst, src, count);
    head_tail(prev_dst, prev_src, prev_count - count);
  }

  // Align backward is suitable when dst > src. The alignment is performed with
  // a HeadTail operation of count ∈ [Alignment, 2 x Alignment].
  //
  // e.g. Moving two bytes backward, we make sure src is aligned.
  // [ | | | | ]
  // [____XXXXXXXXXXXXXXXXXXXXXXXX_____]
  // [ _________________ALLLLLLL_______]
  // [ ___________________LLLLLLLL_____]
  // [____________________SSSSSSSS_____]
  // [______________________SSSSSSSS___]
  //
  // e.g. Moving two bytes backward, we make sure dst is aligned.
  // [ | | | | ]
  // [____XXXXXXXXXXXXXXXXXXXXXXXX_____]
  // [ _______________LLLLLLLL_________]
  // [ ___________________LLLLLLLL_____]
  // [__________________ASSSSSSS_______]
  // [______________________SSSSSSSS___]
  template <Arg AlignOn>
  static inline void align_backward(Ptr &dst, CPtr &src, size_t &count) {
    Ptr headtail_dst = dst + count;
    CPtr headtail_src = src + count;
    size_t headtail_size = 0;
    align_to_next_boundary<Size, AlignOn>(headtail_dst, headtail_src,
                                          headtail_size);
    adjust(-2 * Size, headtail_dst, headtail_src, headtail_size);
    head_tail(headtail_dst, headtail_src, headtail_size);
    count -= headtail_size;
  }

  // Moving forward is suitable when dst < src. We load the tail bytes before
  // handling the loop.
  //
  // e.g. Moving two bytes
  // [ | | | | |]
  // [___XXXXXXXXXXXXXXXXXXXXXXXXXXXXXX___]
  // [_________________________LLLLLLLL___]
  // [___LLLLLLLL_________________________]
  // [_SSSSSSSS___________________________]
  // [___________LLLLLLLL_________________]
  // [_________SSSSSSSS___________________]
  // [___________________LLLLLLLL_________]
  // [_________________SSSSSSSS___________]
  // [_______________________SSSSSSSS_____]
  static inline void loop_and_tail_forward(Ptr dst, CPtr src, size_t count) {
    static_assert(Size > 1);
    const size_t tail_offset = count - Size;
    const auto tail_value = T::load(src + tail_offset);
    size_t offset = 0;
#pragma nounroll
    do {
      block(dst + offset, src + offset);
      offset += Size;
    } while (offset < count - Size);
    T::store(dst + tail_offset, tail_value);
  }

  // Moving backward is suitable when dst > src. We load the head bytes before
  // handling the loop.
  //
  // e.g. Moving two bytes
  // [ | | | | |]
  // [___XXXXXXXXXXXXXXXXXXXXXXXXXXXXXX___]
  // [___LLLLLLLL_________________________]
  // [_________________________LLLLLLLL___]
  // [___________________________SSSSSSSS_]
  // [_________________LLLLLLLL___________]
  // [___________________SSSSSSSS_________]
  // [_________LLLLLLLL___________________]
  // [___________SSSSSSSS_________________]
  // [_____SSSSSSSS_______________________]
  static inline void loop_and_tail_backward(Ptr dst, CPtr src, size_t count) {
    static_assert(Size > 1);
    const auto head_value = T::load(src);
    ptrdiff_t offset = count - Size;
#pragma nounroll
    do {
      block(dst + offset, src + offset);
      offset -= Size;
    } while (offset >= 0);
    T::store(dst, head_value);
  }
};
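
Editorial aside: a hedged sketch of how a dispatcher might drive this struct, picking the forward or backward variant from the pointer order. The helper name, the 16-byte block size, and the alignment arguments are illustrative, not lifted from the patch.

// Editorial sketch, not part of this commit. Assumes MaxSize >= 16 (so T is
// not void) and that small counts were already handled by head_tail branches.
template <size_t MaxSize>
[[maybe_unused]] static inline void
inline_memmove_sketch(Ptr dst, CPtr src, size_t count) {
  using Mem = Memmove<16, MaxSize>;
  if (dst < src) {
    Mem::template align_forward<Arg::Src>(dst, src, count);
    return Mem::loop_and_tail_forward(dst, src, count);
  } else {
    Mem::template align_backward<Arg::Dst>(dst, src, count);
    return Mem::loop_and_tail_backward(dst, src, count);
  }
}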

} // namespace __llvm_libc::generic

#endif // LLVM_LIBC_SRC_STRING_MEMORY_UTILS_OP_GENERIC_H

219
libc/src/string/memory_utils/op_x86.h
Normal file
@ -0,0 +1,219 @@
//===-- x86 implementation of memory function building blocks -------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file provides x86 specific building blocks to compose memory functions.
//
//===----------------------------------------------------------------------===//
#ifndef LLVM_LIBC_SRC_STRING_MEMORY_UTILS_OP_X86_H
#define LLVM_LIBC_SRC_STRING_MEMORY_UTILS_OP_X86_H

#include "src/__support/architectures.h"

#if defined(LLVM_LIBC_ARCH_X86_64)

#include "src/__support/common.h"
#include "src/string/memory_utils/op_builtin.h"
#include "src/string/memory_utils/op_generic.h"

#ifdef __SSE2__
#include <immintrin.h>
#else
// Define fake functions to prevent the compiler from failing on undefined
// functions in case SSE2 is not present.
#define _mm512_cmpneq_epi8_mask(A, B) 0
#define _mm_movemask_epi8(A) 0
#define _mm256_movemask_epi8(A) 0
#endif // __SSE2__

namespace __llvm_libc::x86 {

// A set of constants to check compile time features.
static inline constexpr bool kSse2 = LLVM_LIBC_IS_DEFINED(__SSE2__);
static inline constexpr bool kAvx = LLVM_LIBC_IS_DEFINED(__AVX__);
static inline constexpr bool kAvx2 = LLVM_LIBC_IS_DEFINED(__AVX2__);
static inline constexpr bool kAvx512F = LLVM_LIBC_IS_DEFINED(__AVX512F__);
static inline constexpr bool kAvx512BW = LLVM_LIBC_IS_DEFINED(__AVX512BW__);

///////////////////////////////////////////////////////////////////////////////
// Memcpy repmovsb implementation
struct Memcpy {
  static void repmovsb(char *dst, const char *src, size_t count) {
    asm volatile("rep movsb" : "+D"(dst), "+S"(src), "+c"(count) : : "memory");
  }
};
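
Editorial note on the inline assembly (my reading, not text from the patch): the "+D", "+S" and "+c" constraints pin dst, src and count to RDI, RSI and RCX, which is exactly the implicit register contract of `rep movsb`, and the "memory" clobber keeps the compiler from caching memory accesses across the copy. The SysV ABI guarantees the direction flag is clear, so the copy always runs forward.

// Hypothetical call site, not part of this commit:
// void copy_page(char *dst, const char *src) {
//   x86::Memcpy::repmovsb(dst, src, 4096); // forward byte copy via rep movsb.
// }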

///////////////////////////////////////////////////////////////////////////////
// Bcmp

// Base implementation for the Bcmp specializations.
// - BlockSize is either 16, 32 or 64 depending on the available compile time
//   features, it is used to switch between "single native operation" or a
//   "sequence of native operations".
// - BlockBcmp is the function that implements the bcmp logic.
template <size_t Size, size_t BlockSize, auto BlockBcmp> struct BcmpImpl {
  static inline BcmpReturnType block(CPtr p1, CPtr p2) {
    if constexpr (Size == BlockSize) {
      return BlockBcmp(p1, p2);
    } else if constexpr (Size % BlockSize == 0) {
      for (size_t offset = 0; offset < Size; offset += BlockSize)
        if (auto value = BlockBcmp(p1 + offset, p2 + offset))
          return value;
    } else {
      deferred_static_assert("SIZE not implemented");
    }
    return BcmpReturnType::ZERO();
  }

  static inline BcmpReturnType tail(CPtr p1, CPtr p2, size_t count) {
    return block(p1 + count - Size, p2 + count - Size);
  }

  static inline BcmpReturnType head_tail(CPtr p1, CPtr p2, size_t count) {
    return block(p1, p2) | tail(p1, p2, count);
  }

  static inline BcmpReturnType loop_and_tail(CPtr p1, CPtr p2, size_t count) {
    static_assert(Size > 1);
    size_t offset = 0;
    do {
      if (auto value = block(p1 + offset, p2 + offset))
        return value;
      offset += Size;
    } while (offset < count - Size);
    return tail(p1, p2, count);
  }
};
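
Editorial example of the `Size % BlockSize == 0` branch: with Size == 64 and BlockSize == 16 the loop issues four 16-byte compares and stops at the first non-zero mask, so the whole 64-byte compare is built from the single-block primitive.

// Hypothetical instantiation, not part of this commit, using the sse2::Bcmp
// alias defined just below:
// static inline BcmpReturnType bcmp64_via_sse2(CPtr p1, CPtr p2) {
//   return sse2::Bcmp<64>::block(p1, p2); // four 16-byte vector compares.
// }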

namespace sse2 {
static inline BcmpReturnType bcmp16(CPtr p1, CPtr p2) {
  using T = char __attribute__((__vector_size__(16)));
  // A mask indicating which bytes differ after loading 16 bytes from p1 and p2.
  const int mask = _mm_movemask_epi8(load<T>(p1) != load<T>(p2));
  return static_cast<uint32_t>(mask);
}
template <size_t Size> using Bcmp = BcmpImpl<Size, 16, bcmp16>;
} // namespace sse2

namespace avx2 {
static inline BcmpReturnType bcmp32(CPtr p1, CPtr p2) {
  using T = char __attribute__((__vector_size__(32)));
  // A mask indicating which bytes differ after loading 32 bytes from p1 and p2.
  const int mask = _mm256_movemask_epi8(load<T>(p1) != load<T>(p2));
  // _mm256_movemask_epi8 returns an int but it is to be interpreted as a
  // 32-bit mask.
  return static_cast<uint32_t>(mask);
}
template <size_t Size> using Bcmp = BcmpImpl<Size, 32, bcmp32>;
} // namespace avx2

namespace avx512bw {
static inline BcmpReturnType bcmp64(CPtr p1, CPtr p2) {
  using T = char __attribute__((__vector_size__(64)));
  // A mask indicating which bytes differ after loading 64 bytes from p1 and p2.
  const uint64_t mask = _mm512_cmpneq_epi8_mask(load<T>(p1), load<T>(p2));
  const bool mask_is_set = mask != 0;
  return static_cast<uint32_t>(mask_is_set);
}
template <size_t Size> using Bcmp = BcmpImpl<Size, 64, bcmp64>;
} // namespace avx512bw

// Assuming that the mask is non zero, the index of the first mismatching byte
// is the number of trailing zeros in the mask. Trailing zeros and not leading
// zeros because the x86 architecture is little endian.
static inline MemcmpReturnType char_diff_no_zero(CPtr p1, CPtr p2,
                                                 uint64_t mask) {
  const size_t diff_index = __builtin_ctzll(mask);
  // Compare as unsigned bytes, as memcmp requires; int16_t is wide enough to
  // hold the difference of two uint8_t values without overflow.
  const int16_t ca = static_cast<uint8_t>(p1[diff_index]);
  const int16_t cb = static_cast<uint8_t>(p2[diff_index]);
  return ca - cb;
}
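
Editorial worked example of the trailing-zero logic, assuming the values shown:

// If bytes 0-2 match and byte 3 differs:
//   p1 = "abcX...", p2 = "abcY..."   ->  mask = 0b1000
//   __builtin_ctzll(0b1000) == 3     ->  diff_index = 3
//   ca = 'X' (0x58), cb = 'Y' (0x59) ->  returns 0x58 - 0x59 = -1 (p1 < p2)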

///////////////////////////////////////////////////////////////////////////////
// Memcmp

// Base implementation for the Memcmp specializations.
// - BlockSize is either 16, 32 or 64 depending on the available compile time
//   features, it is used to switch between "single native operation" or a
//   "sequence of native operations".
// - BlockMemcmp is the function that implements the memcmp logic.
// - BlockBcmp is the function that implements the bcmp logic.
template <size_t Size, size_t BlockSize, auto BlockMemcmp, auto BlockBcmp>
struct MemcmpImpl {
  static inline MemcmpReturnType block(CPtr p1, CPtr p2) {
    if constexpr (Size == BlockSize) {
      return BlockMemcmp(p1, p2);
    } else if constexpr (Size % BlockSize == 0) {
      for (size_t offset = 0; offset < Size; offset += BlockSize)
        if (auto value = BlockBcmp(p1 + offset, p2 + offset))
          return BlockMemcmp(p1 + offset, p2 + offset);
    } else {
      deferred_static_assert("SIZE not implemented");
    }
    return MemcmpReturnType::ZERO();
  }

  static inline MemcmpReturnType tail(CPtr p1, CPtr p2, size_t count) {
    return block(p1 + count - Size, p2 + count - Size);
  }

  static inline MemcmpReturnType head_tail(CPtr p1, CPtr p2, size_t count) {
    if (auto value = block(p1, p2))
      return value;
    return tail(p1, p2, count);
  }

  static inline MemcmpReturnType loop_and_tail(CPtr p1, CPtr p2, size_t count) {
    static_assert(Size > 1);
    size_t offset = 0;
    do {
      if (auto value = block(p1 + offset, p2 + offset))
        return value;
      offset += Size;
    } while (offset < count - Size);
    return tail(p1, p2, count);
  }
};

namespace sse2 {
static inline MemcmpReturnType memcmp16(CPtr p1, CPtr p2) {
  using T = char __attribute__((__vector_size__(16)));
  // A mask indicating which bytes differ after loading 16 bytes from p1 and p2.
  if (int mask = _mm_movemask_epi8(load<T>(p1) != load<T>(p2)))
    return char_diff_no_zero(p1, p2, mask);
  return MemcmpReturnType::ZERO();
}
template <size_t Size> using Memcmp = MemcmpImpl<Size, 16, memcmp16, bcmp16>;
} // namespace sse2

namespace avx2 {
static inline MemcmpReturnType memcmp32(CPtr p1, CPtr p2) {
  using T = char __attribute__((__vector_size__(32)));
  // A mask indicating which bytes differ after loading 32 bytes from p1 and p2.
  if (int mask = _mm256_movemask_epi8(load<T>(p1) != load<T>(p2)))
    return char_diff_no_zero(p1, p2, mask);
  return MemcmpReturnType::ZERO();
}
template <size_t Size> using Memcmp = MemcmpImpl<Size, 32, memcmp32, bcmp32>;
} // namespace avx2

namespace avx512bw {
static inline MemcmpReturnType memcmp64(CPtr p1, CPtr p2) {
  using T = char __attribute__((__vector_size__(64)));
  // A mask indicating which bytes differ after loading 64 bytes from p1 and p2.
  if (uint64_t mask = _mm512_cmpneq_epi8_mask(load<T>(p1), load<T>(p2)))
    return char_diff_no_zero(p1, p2, mask);
  return MemcmpReturnType::ZERO();
}
template <size_t Size> using Memcmp = MemcmpImpl<Size, 64, memcmp64, bcmp64>;
} // namespace avx512bw

} // namespace __llvm_libc::x86

#endif // LLVM_LIBC_ARCH_X86_64

#endif // LLVM_LIBC_SRC_STRING_MEMORY_UTILS_OP_X86_H

@ -9,19 +9,8 @@
#ifndef LLVM_LIBC_SRC_MEMORY_UTILS_UTILS_H
#define LLVM_LIBC_SRC_MEMORY_UTILS_UTILS_H

#include "src/__support/architectures.h"

// Cache line sizes for ARM: These values are not strictly correct since
// cache line sizes depend on implementations, not architectures. There
// are even implementations with cache line sizes configurable at boot
// time.
#if defined(LLVM_LIBC_ARCH_AARCH64) || defined(LLVM_LIBC_ARCH_X86)
#define LLVM_LIBC_CACHELINE_SIZE 64
#elif defined(LLVM_LIBC_ARCH_ARM)
#define LLVM_LIBC_CACHELINE_SIZE 32
#else
#error "Unsupported platform for memory functions."
#endif
#include "src/__support/CPP/bit.h"
#include "src/__support/CPP/type_traits.h"

#include <stddef.h> // size_t
#include <stdint.h> // intptr_t / uintptr_t
@ -62,32 +51,46 @@ static constexpr size_t ge_power2(size_t value) {
  return is_power2_or_zero(value) ? value : 1ULL << (log2(value) + 1);
}

template <size_t alignment> intptr_t offset_from_last_aligned(const void *ptr) {
// Returns the number of bytes to subtract from ptr to get to the previous
// multiple of alignment. If ptr is already aligned returns 0.
template <size_t alignment> uintptr_t distance_to_align_down(const void *ptr) {
  static_assert(is_power2(alignment), "alignment must be a power of 2");
  return reinterpret_cast<uintptr_t>(ptr) & (alignment - 1U);
}

template <size_t alignment> intptr_t offset_to_next_aligned(const void *ptr) {
// Returns the number of bytes to add to ptr to get to the next multiple of
// alignment. If ptr is already aligned returns 0.
template <size_t alignment> uintptr_t distance_to_align_up(const void *ptr) {
  static_assert(is_power2(alignment), "alignment must be a power of 2");
  // The logic is not straightforward and involves unsigned modulo arithmetic
  // but the generated code is as fast as it can be.
  return -reinterpret_cast<uintptr_t>(ptr) & (alignment - 1U);
}

// Returns the offset from `ptr` to the next cache line.
static inline intptr_t offset_to_next_cache_line(const void *ptr) {
  return offset_to_next_aligned<LLVM_LIBC_CACHELINE_SIZE>(ptr);
// Returns the number of bytes to add to ptr to get to the next multiple of
// alignment. If ptr is already aligned returns alignment.
template <size_t alignment>
uintptr_t distance_to_next_aligned(const void *ptr) {
  return alignment - distance_to_align_down<alignment>(ptr);
}
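
Editorial worked example of the three helpers for alignment == 16 (the addresses are illustrative):

// distance_to_align_down<16>(0x1005) == 0x1005 & 0xF  == 5   (back to 0x1000)
// distance_to_align_up<16>(0x1005)   == -0x1005 & 0xF == 11  (up to 0x1010)
// distance_to_next_aligned<16>(0x1000) == 16 - 0      == 16  (never returns 0)
// The -ptr & (alignment - 1) trick works because two's complement negation
// makes the low bits of -ptr exactly the count needed to reach the next
// multiple, and the mask discards everything else.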

// Returns the same pointer but notifies the compiler that it is aligned.
template <size_t alignment, typename T> static T *assume_aligned(T *ptr) {
  return reinterpret_cast<T *>(__builtin_assume_aligned(ptr, alignment));
}

#if defined __has_builtin
#if __has_builtin(__builtin_memcpy_inline)
#define LLVM_LIBC_HAS_BUILTIN_MEMCPY_INLINE
#endif
#endif

#if defined __has_builtin
#if __has_builtin(__builtin_memset_inline)
#define LLVM_LIBC_HAS_BUILTIN_MEMSET_INLINE
#endif
#endif

// Performs a constant count copy.
template <size_t Size>
static inline void memcpy_inline(void *__restrict dst,
@ -103,28 +106,56 @@ static inline void memcpy_inline(void *__restrict dst,
using Ptr = char *;        // Pointer to raw data.
using CPtr = const char *; // Const pointer to raw data.

// Loads bytes from memory (possibly unaligned) and materializes them as type.
// This type makes sure that we don't accidentally promote an integral type to
// another one. It is only constructible from the exact T type.
template <typename T> struct StrictIntegralType {
  static_assert(cpp::is_integral_v<T>);

  // Can only be constructed from a T.
  template <typename U, cpp::enable_if_t<cpp::is_same_v<U, T>, bool> = 0>
  StrictIntegralType(U value) : value(value) {}

  // Allows using the type in an if statement.
  explicit operator bool() const { return value; }

  // If type is unsigned (bcmp) we allow bitwise OR operations.
  StrictIntegralType operator|(const StrictIntegralType &Rhs) const {
    static_assert(!cpp::is_signed_v<T>);
    return value | Rhs.value;
  }

  // For interaction with the C API we allow explicit conversion back to the
  // `int` type.
  explicit operator int() const {
    // bit_cast makes sure that T and int have the same size.
    return cpp::bit_cast<int>(value);
  }

  // Helper to get the zero value.
  static inline constexpr StrictIntegralType ZERO() { return {T(0)}; }

private:
  T value;
};

using MemcmpReturnType = StrictIntegralType<int32_t>;
using BcmpReturnType = StrictIntegralType<uint32_t>;
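
Editorial sketch of what the strict constructor buys (the variable names are hypothetical): the SFINAE constraint means a narrower integer cannot silently promote into a return type.

// BcmpReturnType ok  = uint32_t(1);  // compiles: U is exactly uint32_t.
// BcmpReturnType bad = uint16_t(1);  // rejected: U != T, ctor SFINAEd out.
// if (ok)                            // explicit operator bool in conditions.
//   return ok | BcmpReturnType::ZERO(); // operator| allowed: T is unsigned.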

// Loads bytes from memory (possibly unaligned) and materializes them as
// type.
template <typename T> static inline T load(CPtr ptr) {
  T Out;
  memcpy_inline<sizeof(T)>(&Out, ptr);
  return Out;
}

// Stores a value of type T in memory (possibly unaligned)
// Stores a value of type T in memory (possibly unaligned).
template <typename T> static inline void store(Ptr ptr, T value) {
  memcpy_inline<sizeof(T)>(ptr, &value);
}

// For an operation like memset that operates on a pointer and a count,
// advances the pointer by offset bytes and decreases count by the same amount.
static inline void adjust(ptrdiff_t offset, Ptr &ptr, size_t &count) {
  ptr += offset;
  count -= offset;
}

// For an operation like memcpy or memcmp that operates on two pointers and a
// count, advances the pointers by offset bytes and decreases count by the same
// amount.
// Advances the pointers p1 and p2 by offset bytes and decreases count by the
// same amount.
template <typename T1, typename T2>
static inline void adjust(ptrdiff_t offset, T1 *__restrict &p1,
                          T2 *__restrict &p2, size_t &count) {
@ -133,31 +164,37 @@ static inline void adjust(ptrdiff_t offset, T1 *__restrict &p1,
  count -= offset;
}

// For an operation like memset that operates on a pointer and a count,
// advances the pointer so it is aligned to SIZE bytes and decreases count by
// the same amount.
// Advances p1 and p2 so p1 gets aligned to the next SIZE bytes boundary
// and decreases count by the same amount.
// We make sure the compiler knows about the adjusted pointer alignment.
template <size_t SIZE> void align(Ptr &ptr, size_t &count) {
  adjust(offset_to_next_aligned<SIZE>(ptr), ptr, count);
  ptr = assume_aligned<SIZE>(ptr);
template <size_t SIZE, typename T1, typename T2>
void align_p1_to_next_boundary(T1 *__restrict &p1, T2 *__restrict &p2,
                               size_t &count) {
  adjust(distance_to_next_aligned<SIZE>(p1), p1, p2, count);
  p1 = assume_aligned<SIZE>(p1);
}

// For an operation like memcpy or memcmp that operates on two pointers and a
// count, advances the pointers so one of them gets aligned to SIZE bytes and
// decreases count by the same amount.
// We make sure the compiler knows about the adjusted pointer alignment.
enum class Arg { _1, _2, Dst = _1, Src = _2, Lhs = _1, Rhs = _2 };
// Same as align_p1_to_next_boundary above but with a single pointer instead.
template <size_t SIZE, typename T1>
void align_to_next_boundary(T1 *&p1, size_t &count) {
  CPtr dummy;
  align_p1_to_next_boundary<SIZE>(p1, dummy, count);
}

// An enum class that discriminates between the first and second pointer.
enum class Arg { P1, P2, Dst = P1, Src = P2 };

// Same as align_p1_to_next_boundary but allows for aligning p2 instead of p1.
// Precondition: &p1 != &p2
template <size_t SIZE, Arg AlignOn, typename T1, typename T2>
void align(T1 *__restrict &p1, T2 *__restrict &p2, size_t &count) {
  if constexpr (AlignOn == Arg::_1) {
    adjust(offset_to_next_aligned<SIZE>(p1), p1, p2, count);
    p1 = assume_aligned<SIZE>(p1);
  } else if constexpr (AlignOn == Arg::_2) {
    adjust(offset_to_next_aligned<SIZE>(p2), p1, p2, count);
    p2 = assume_aligned<SIZE>(p2);
  } else {
    deferred_static_assert("AlignOn must be either Arg::_1 or Arg::_2");
  }
void align_to_next_boundary(T1 *__restrict &p1, T2 *__restrict &p2,
                            size_t &count) {
  if constexpr (AlignOn == Arg::P1)
    align_p1_to_next_boundary<SIZE>(p1, p2, count);
  else if constexpr (AlignOn == Arg::P2)
    align_p1_to_next_boundary<SIZE>(p2, p1, count); // swapping p1 and p2.
  else
    deferred_static_assert("AlignOn must be either Arg::P1 or Arg::P2");
}
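
Hypothetical usage sketch (not from the patch): align the destination of a copy loop to 32 bytes while keeping `src` and `count` in lockstep.

// align_to_next_boundary<32, Arg::P1>(dst, src, count);
// // Here uintptr_t(dst) % 32 == 0 and between 1 and 32 bytes of the range
// // were skipped; callers handle those bytes separately with a head
// // operation, as Memmove::align_forward in op_generic.h does.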

} // namespace __llvm_libc

@ -13,8 +13,8 @@
namespace __llvm_libc {

LLVM_LIBC_FUNCTION(void *, memset, (void *dst, int value, size_t count)) {
  inline_memset(reinterpret_cast<char *>(dst),
                static_cast<unsigned char>(value), count);
  inline_memset(reinterpret_cast<char *>(dst), static_cast<uint8_t>(value),
                count);
  return dst;
}

@ -12,25 +12,25 @@
TEST(LlvmLibcBcmpTest, CmpZeroByte) {
  const char *lhs = "ab";
  const char *rhs = "bc";
  EXPECT_EQ(__llvm_libc::bcmp(lhs, rhs, 0), 0);
  ASSERT_EQ(__llvm_libc::bcmp(lhs, rhs, 0), 0);
}

TEST(LlvmLibcBcmpTest, LhsRhsAreTheSame) {
  const char *lhs = "ab";
  const char *rhs = "ab";
  EXPECT_EQ(__llvm_libc::bcmp(lhs, rhs, 2), 0);
  ASSERT_EQ(__llvm_libc::bcmp(lhs, rhs, 2), 0);
}

TEST(LlvmLibcBcmpTest, LhsBeforeRhsLexically) {
  const char *lhs = "ab";
  const char *rhs = "ac";
  EXPECT_NE(__llvm_libc::bcmp(lhs, rhs, 2), 0);
  ASSERT_NE(__llvm_libc::bcmp(lhs, rhs, 2), 0);
}

TEST(LlvmLibcBcmpTest, LhsAfterRhsLexically) {
  const char *lhs = "ac";
  const char *rhs = "ab";
  EXPECT_NE(__llvm_libc::bcmp(lhs, rhs, 2), 0);
  ASSERT_NE(__llvm_libc::bcmp(lhs, rhs, 2), 0);
}

TEST(LlvmLibcBcmpTest, Sweep) {
@ -46,13 +46,13 @@ TEST(LlvmLibcBcmpTest, Sweep) {
  reset(lhs);
  reset(rhs);
  for (size_t i = 0; i < K_MAX_SIZE; ++i)
    EXPECT_EQ(__llvm_libc::bcmp(lhs, rhs, i), 0);
    ASSERT_EQ(__llvm_libc::bcmp(lhs, rhs, i), 0);

  reset(lhs);
  reset(rhs);
  for (size_t i = 0; i < K_MAX_SIZE; ++i) {
    rhs[i] = 'b';
    EXPECT_NE(__llvm_libc::bcmp(lhs, rhs, K_MAX_SIZE), 0);
    ASSERT_NE(__llvm_libc::bcmp(lhs, rhs, K_MAX_SIZE), 0);
    rhs[i] = 'a';
  }
}

@ -20,7 +20,7 @@ TEST(LlvmLibcMemmoveTest, MoveZeroByte) {
  void *const Dst = Buffer;
  void *const Ret = __llvm_libc::memmove(Dst, Buffer + 2, 0);
  EXPECT_EQ(Ret, Dst);
  EXPECT_MEM_EQ(Buffer, Expected);
  ASSERT_MEM_EQ(Buffer, Expected);
}

TEST(LlvmLibcMemmoveTest, DstAndSrcPointToSameAddress) {
@ -29,7 +29,7 @@ TEST(LlvmLibcMemmoveTest, DstAndSrcPointToSameAddress) {
  void *const Dst = Buffer;
  void *const Ret = __llvm_libc::memmove(Dst, Buffer, 1);
  EXPECT_EQ(Ret, Dst);
  EXPECT_MEM_EQ(Buffer, Expected);
  ASSERT_MEM_EQ(Buffer, Expected);
}

TEST(LlvmLibcMemmoveTest, DstStartsBeforeSrc) {
@ -40,7 +40,7 @@ TEST(LlvmLibcMemmoveTest, DstStartsBeforeSrc) {
  void *const Dst = Buffer + 1;
  void *const Ret = __llvm_libc::memmove(Dst, Buffer + 2, 2);
  EXPECT_EQ(Ret, Dst);
  EXPECT_MEM_EQ(Buffer, Expected);
  ASSERT_MEM_EQ(Buffer, Expected);
}

TEST(LlvmLibcMemmoveTest, DstStartsAfterSrc) {
@ -49,7 +49,7 @@ TEST(LlvmLibcMemmoveTest, DstStartsAfterSrc) {
  void *const Dst = Buffer + 2;
  void *const Ret = __llvm_libc::memmove(Dst, Buffer + 1, 2);
  EXPECT_EQ(Ret, Dst);
  EXPECT_MEM_EQ(Buffer, Expected);
  ASSERT_MEM_EQ(Buffer, Expected);
}

// e.g. `Dst` follows `src`.
@ -62,7 +62,7 @@ TEST(LlvmLibcMemmoveTest, SrcFollowDst) {
  void *const Dst = Buffer + 1;
  void *const Ret = __llvm_libc::memmove(Dst, Buffer + 2, 1);
  EXPECT_EQ(Ret, Dst);
  EXPECT_MEM_EQ(Buffer, Expected);
  ASSERT_MEM_EQ(Buffer, Expected);
}

TEST(LlvmLibcMemmoveTest, DstFollowSrc) {
@ -71,7 +71,7 @@ TEST(LlvmLibcMemmoveTest, DstFollowSrc) {
  void *const Dst = Buffer + 2;
  void *const Ret = __llvm_libc::memmove(Dst, Buffer + 1, 1);
  EXPECT_EQ(Ret, Dst);
  EXPECT_MEM_EQ(Buffer, Expected);
  ASSERT_MEM_EQ(Buffer, Expected);
}

static constexpr int kMaxSize = 512;
@ -106,7 +106,7 @@ TEST(LlvmLibcMemmoveTest, Thorough) {
      void *const Ret =
          __llvm_libc::memmove(Dst, Buffer.data() + SrcOffset, Size);
      EXPECT_EQ(Ret, Dst);
      EXPECT_MEM_EQ(Buffer, Expected);
      ASSERT_MEM_EQ(Buffer, Expected);
    }
  }
}

@ -3,8 +3,6 @@ add_libc_unittest(
  SUITE
    libc_string_unittests
  SRCS
    elements_test.cpp
    memory_access_test.cpp
    utils_test.cpp
  COMPILE_OPTIONS
    ${LIBC_COMPILE_OPTIONS_NATIVE}

@ -1,137 +0,0 @@
//===-- Unittests for memory_utils ----------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "src/__support/CPP/array.h"
#include "src/__support/CPP/span.h"
#include "src/string/memory_utils/elements.h"
#include "utils/UnitTest/Test.h"

namespace __llvm_libc {

// Registering Types
using FixedSizeTypes = testing::TypeList<
#if defined(__SSE2__)
    x86::Vector128, //
#endif // __SSE2__
#if defined(__AVX2__)
    x86::Vector256, //
#endif // __AVX2__
#if defined(__AVX512F__) and defined(__AVX512BW__)
    x86::Vector512, //
#endif // defined(__AVX512F__) and defined(__AVX512BW__)
    scalar::UINT8,                                          //
    scalar::UINT16,                                         //
    scalar::UINT32,                                         //
    scalar::UINT64,                                         //
    Repeated<scalar::UINT64, 2>,                            //
    Repeated<scalar::UINT64, 4>,                            //
    Repeated<scalar::UINT64, 8>,                            //
    Repeated<scalar::UINT64, 16>,                           //
    Repeated<scalar::UINT64, 32>,                           //
    Chained<scalar::UINT16, scalar::UINT8>,                 //
    Chained<scalar::UINT32, scalar::UINT16, scalar::UINT8>, //
    builtin::_1,                                            //
    builtin::_2,                                            //
    builtin::_3,                                            //
    builtin::_4,                                            //
    builtin::_8                                             //
    >;

char GetRandomChar() {
  // Simple linear congruential generator (the classic ANSI C constants).
  static constexpr const uint64_t a = 1103515245;
  static constexpr const uint64_t c = 12345;
  static constexpr const uint64_t m = 1ULL << 31;
  static uint64_t seed = 123456789;
  seed = (a * seed + c) % m;
  return seed;
}

void Randomize(cpp::span<char> buffer) {
  for (auto &current : buffer)
    current = GetRandomChar();
}

template <typename Element> using Buffer = cpp::array<char, Element::SIZE>;

template <typename Element> Buffer<Element> GetRandomBuffer() {
  Buffer<Element> buffer;
  Randomize(buffer);
  return buffer;
}

TYPED_TEST(LlvmLibcMemoryElements, copy, FixedSizeTypes) {
  Buffer<ParamType> Dst;
  const auto buffer = GetRandomBuffer<ParamType>();
  copy<ParamType>(Dst.data(), buffer.data());
  for (size_t i = 0; i < ParamType::SIZE; ++i)
    EXPECT_EQ(Dst[i], buffer[i]);
}

template <typename T> T copy(const T &Input) {
  T Output;
  for (size_t I = 0; I < Input.size(); ++I)
    Output[I] = Input[I];
  return Output;
}

TYPED_TEST(LlvmLibcMemoryElements, Move, FixedSizeTypes) {
  constexpr size_t SIZE = ParamType::SIZE;
  using LargeBuffer = cpp::array<char, SIZE * 2>;
  LargeBuffer GroundTruth;
  Randomize(GroundTruth);
  // Forward, we move the SIZE first bytes from offset 0 to SIZE.
  for (size_t Offset = 0; Offset < SIZE; ++Offset) {
    LargeBuffer Buffer = copy(GroundTruth);
    move<ParamType>(&Buffer[Offset], &Buffer[0]);
    for (size_t I = 0; I < SIZE; ++I)
      EXPECT_EQ(Buffer[I + Offset], GroundTruth[I]);
  }
  // Backward, we move the SIZE last bytes from offset 0 to SIZE.
  for (size_t Offset = 0; Offset < SIZE; ++Offset) {
    LargeBuffer Buffer = copy(GroundTruth);
    move<ParamType>(&Buffer[Offset], &Buffer[SIZE]);
    for (size_t I = 0; I < SIZE; ++I)
      EXPECT_EQ(Buffer[I + Offset], GroundTruth[SIZE + I]);
  }
}

TYPED_TEST(LlvmLibcMemoryElements, Equals, FixedSizeTypes) {
  const auto buffer = GetRandomBuffer<ParamType>();
  EXPECT_TRUE(equals<ParamType>(buffer.data(), buffer.data()));
}

TYPED_TEST(LlvmLibcMemoryElements, three_way_compare, FixedSizeTypes) {
  Buffer<ParamType> initial;
  for (auto &c : initial)
    c = 5;

  // Testing equality
  EXPECT_EQ(three_way_compare<ParamType>(initial.data(), initial.data()), 0);

  // Testing all mismatching positions
  for (size_t i = 0; i < ParamType::SIZE; ++i) {
    auto copy = initial;
    ++copy[i]; // copy is now lexicographically greater than initial
    const auto *less = initial.data();
    const auto *greater = copy.data();
    EXPECT_LT(three_way_compare<ParamType>(less, greater), 0);
    EXPECT_GT(three_way_compare<ParamType>(greater, less), 0);
  }
}

TYPED_TEST(LlvmLibcMemoryElements, Splat, FixedSizeTypes) {
  Buffer<ParamType> Dst;
  const cpp::array<char, 3> values = {char(0x00), char(0x7F), char(0xFF)};
  for (char value : values) {
    splat_set<ParamType>(Dst.data(), value);
    for (size_t i = 0; i < ParamType::SIZE; ++i)
      EXPECT_EQ(Dst[i], value);
  }
}

} // namespace __llvm_libc

@ -1,228 +0,0 @@
//===-- Unittests for memory_utils ----------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#define LLVM_LIBC_UNITTEST_OBSERVE 1

#include "src/__support/CPP/array.h"
#include "src/string/memory_utils/elements.h"
#include "utils/UnitTest/Test.h"

#include <stdio.h>
#include <string.h>

namespace __llvm_libc {

static constexpr const size_t kMaxBuffer = 32;

struct BufferAccess : cpp::array<char, kMaxBuffer + 1> {
  BufferAccess() { Reset(); }
  void Reset() {
    for (auto &value : *this)
      value = '0';
    this->operator[](kMaxBuffer) = '\0';
  }
  void Touch(ptrdiff_t offset, size_t size) {
    if (offset < 0)
      return;
    for (size_t i = 0; i < size; ++i)
      ++(*this)[offset + i];
  }
  operator const char *() const { return this->data(); }
};

struct Buffer {
  ptrdiff_t Offset(const char *ptr) const {
    const bool contained = ptr >= data.begin() && ptr < data.end();
    return contained ? ptr - data.begin() : -1;
  }
  void Reset() {
    reads.Reset();
    writes.Reset();
  }
  cpp::array<char, kMaxBuffer> data;
  BufferAccess __attribute__((aligned(64))) reads;
  BufferAccess __attribute__((aligned(64))) writes;
};

struct MemoryAccessObserver {
  void ObserveRead(const char *ptr, size_t size) {
    Buffer1.reads.Touch(Buffer1.Offset(ptr), size);
    Buffer2.reads.Touch(Buffer2.Offset(ptr), size);
  }

  void ObserveWrite(const char *ptr, size_t size) {
    Buffer1.writes.Touch(Buffer1.Offset(ptr), size);
    Buffer2.writes.Touch(Buffer2.Offset(ptr), size);
  }

  void Reset() {
    Buffer1.Reset();
    Buffer2.Reset();
  }

  Buffer Buffer1;
  Buffer Buffer2;
};

MemoryAccessObserver Observer;

template <size_t Size> struct TestingElement {
  static constexpr size_t SIZE = Size;

  static void copy(char *__restrict dst, const char *__restrict src) {
    Observer.ObserveRead(src, SIZE);
    Observer.ObserveWrite(dst, SIZE);
  }

  static bool equals(const char *lhs, const char *rhs) {
    Observer.ObserveRead(lhs, SIZE);
    Observer.ObserveRead(rhs, SIZE);
    return true;
  }

  static int three_way_compare(const char *lhs, const char *rhs) {
    Observer.ObserveRead(lhs, SIZE);
    Observer.ObserveRead(rhs, SIZE);
    return 0;
  }

  static void splat_set(char *dst, const unsigned char value) {
    Observer.ObserveWrite(dst, SIZE);
  }
};

using Types = testing::TypeList<
    TestingElement<1>,                                               // 1 Byte
    TestingElement<2>,                                               // 2 Bytes
    TestingElement<4>,                                               // 4 Bytes
    Repeated<TestingElement<2>, 3>,                                  // 6 Bytes
    Chained<TestingElement<4>, TestingElement<2>, TestingElement<1>> // 7 Bytes
    >;

struct LlvmLibcTestAccessBase : public testing::Test {

  template <typename HigherOrder, size_t Size, size_t Offset = 0>
  void checkOperations(const BufferAccess &expected) {
    static const BufferAccess untouched;

    Observer.Reset();
    HigherOrder::copy(dst_ptr() + Offset, src_ptr() + Offset, Size);
    ASSERT_STREQ(src().writes, untouched);
    ASSERT_STREQ(dst().reads, untouched);
    ASSERT_STREQ(src().reads, expected);
    ASSERT_STREQ(dst().writes, expected);
    Observer.Reset();
    HigherOrder::equals(lhs_ptr() + Offset, rhs_ptr() + Offset, Size);
    ASSERT_STREQ(lhs().writes, untouched);
    ASSERT_STREQ(rhs().writes, untouched);
    ASSERT_STREQ(lhs().reads, expected);
    ASSERT_STREQ(rhs().reads, expected);
    Observer.Reset();
    HigherOrder::three_way_compare(lhs_ptr() + Offset, rhs_ptr() + Offset,
                                   Size);
    ASSERT_STREQ(lhs().writes, untouched);
    ASSERT_STREQ(rhs().writes, untouched);
    ASSERT_STREQ(lhs().reads, expected);
    ASSERT_STREQ(rhs().reads, expected);
    Observer.Reset();
    HigherOrder::splat_set(dst_ptr() + Offset, 5, Size);
    ASSERT_STREQ(src().reads, untouched);
    ASSERT_STREQ(src().writes, untouched);
    ASSERT_STREQ(dst().reads, untouched);
    ASSERT_STREQ(dst().writes, expected);
  }

  void checkMaxAccess(const BufferAccess &expected, int max) {
    for (size_t i = 0; i < kMaxBuffer; ++i) {
      int value = (int)expected[i] - '0';
      ASSERT_GE(value, 0);
      ASSERT_LE(value, max);
    }
  }

private:
  const Buffer &lhs() const { return Observer.Buffer1; }
  const Buffer &rhs() const { return Observer.Buffer2; }
  const Buffer &src() const { return Observer.Buffer2; }
  const Buffer &dst() const { return Observer.Buffer1; }
  Buffer &dst() { return Observer.Buffer1; }

  char *dst_ptr() { return dst().data.begin(); }
  const char *src_ptr() { return src().data.begin(); }
  const char *lhs_ptr() { return lhs().data.begin(); }
  const char *rhs_ptr() { return rhs().data.begin(); }
};

template <typename ParamType>
struct LlvmLibcTestAccessTail : public LlvmLibcTestAccessBase {

  void TearDown() override {
    static constexpr size_t Size = 10;

    BufferAccess expected;
    expected.Touch(Size - ParamType::SIZE, ParamType::SIZE);

    checkMaxAccess(expected, 1);
    checkOperations<Tail<ParamType>, Size>(expected);
  }
};
TYPED_TEST_F(LlvmLibcTestAccessTail, Operations, Types) {}

template <typename ParamType>
struct LlvmLibcTestAccessHeadTail : public LlvmLibcTestAccessBase {
  void TearDown() override {
    static constexpr size_t Size = 10;

    BufferAccess expected;
    expected.Touch(0, ParamType::SIZE);
    expected.Touch(Size - ParamType::SIZE, ParamType::SIZE);

    checkMaxAccess(expected, 2);
    checkOperations<HeadTail<ParamType>, Size>(expected);
  }
};
TYPED_TEST_F(LlvmLibcTestAccessHeadTail, Operations, Types) {}

template <typename ParamType>
struct LlvmLibcTestAccessLoop : public LlvmLibcTestAccessBase {
  void TearDown() override {
    static constexpr size_t Size = 20;

    BufferAccess expected;
    for (size_t i = 0; i < Size - ParamType::SIZE; i += ParamType::SIZE)
      expected.Touch(i, ParamType::SIZE);
    expected.Touch(Size - ParamType::SIZE, ParamType::SIZE);

    checkMaxAccess(expected, 2);
    checkOperations<Loop<ParamType>, Size>(expected);
  }
};
TYPED_TEST_F(LlvmLibcTestAccessLoop, Operations, Types) {}

template <typename ParamType>
struct LlvmLibcTestAccessAlignedAccess : public LlvmLibcTestAccessBase {
  void TearDown() override {
    static constexpr size_t Size = 10;
    static constexpr size_t Offset = 2;
    using AlignmentT = TestingElement<4>;

    BufferAccess expected;
    expected.Touch(Offset, AlignmentT::SIZE);
    expected.Touch(AlignmentT::SIZE, ParamType::SIZE);
    expected.Touch(Offset + Size - ParamType::SIZE, ParamType::SIZE);

    checkMaxAccess(expected, 3);
    checkOperations<Align<AlignmentT, Arg::_1>::Then<HeadTail<ParamType>>, Size,
                    Offset>(expected);
    checkOperations<Align<AlignmentT, Arg::_2>::Then<HeadTail<ParamType>>, Size,
                    Offset>(expected);
  }
};
TYPED_TEST_F(LlvmLibcTestAccessAlignedAccess, Operations, Types) {}

} // namespace __llvm_libc

@ -72,55 +72,41 @@ TEST(LlvmLibcUtilsTest, GEPowerOf2) {
    EXPECT_EQ(ge_power2(i), kExpectedValues[i]);
}

using I = intptr_t;
using UINT = uintptr_t;

// Converts an offset into a pointer.
const void *forge(size_t offset) {
  return reinterpret_cast<const void *>(offset);
}

TEST(LlvmLibcUtilsTest, OffsetToNextAligned) {
  EXPECT_EQ(offset_to_next_aligned<16>(forge(0)), I(0));
  EXPECT_EQ(offset_to_next_aligned<16>(forge(1)), I(15));
  EXPECT_EQ(offset_to_next_aligned<16>(forge(16)), I(0));
  EXPECT_EQ(offset_to_next_aligned<16>(forge(15)), I(1));
  EXPECT_EQ(offset_to_next_aligned<32>(forge(16)), I(16));
TEST(LlvmLibcUtilsTest, DistanceToNextAligned) {
  EXPECT_EQ(distance_to_next_aligned<16>(forge(0)), UINT(16));
  EXPECT_EQ(distance_to_next_aligned<16>(forge(1)), UINT(15));
  EXPECT_EQ(distance_to_next_aligned<16>(forge(16)), UINT(16));
  EXPECT_EQ(distance_to_next_aligned<16>(forge(15)), UINT(1));
  EXPECT_EQ(distance_to_next_aligned<32>(forge(16)), UINT(16));
}

TEST(LlvmLibcUtilsTest, OffsetFromLastAligned) {
  EXPECT_EQ(offset_from_last_aligned<16>(forge(0)), I(0));
  EXPECT_EQ(offset_from_last_aligned<16>(forge(1)), I(1));
  EXPECT_EQ(offset_from_last_aligned<16>(forge(16)), I(0));
  EXPECT_EQ(offset_from_last_aligned<16>(forge(15)), I(15));
  EXPECT_EQ(offset_from_last_aligned<32>(forge(16)), I(16));
TEST(LlvmLibcUtilsTest, DistanceToAlignUp) {
  EXPECT_EQ(distance_to_align_up<16>(forge(0)), UINT(0));
  EXPECT_EQ(distance_to_align_up<16>(forge(1)), UINT(15));
  EXPECT_EQ(distance_to_align_up<16>(forge(16)), UINT(0));
  EXPECT_EQ(distance_to_align_up<16>(forge(15)), UINT(1));
  EXPECT_EQ(distance_to_align_up<32>(forge(16)), UINT(16));
}

TEST(LlvmLibcUtilsTest, OffsetToNextCacheLine) {
  EXPECT_GT(LLVM_LIBC_CACHELINE_SIZE, 0);
  EXPECT_EQ(offset_to_next_cache_line(forge(0)), I(0));
  EXPECT_EQ(offset_to_next_cache_line(forge(1)),
            I(LLVM_LIBC_CACHELINE_SIZE - 1));
  EXPECT_EQ(offset_to_next_cache_line(forge(LLVM_LIBC_CACHELINE_SIZE)), I(0));
  EXPECT_EQ(offset_to_next_cache_line(forge(LLVM_LIBC_CACHELINE_SIZE - 1)),
            I(1));
}

TEST(LlvmLibcUtilsTest, Adjust1) {
  char a;
  const size_t base_size = 10;
  for (size_t I = -2; I < 2; ++I) {
    auto *ptr = &a;
    size_t size = base_size;
    adjust(I, ptr, size);
    EXPECT_EQ(intptr_t(ptr), intptr_t(&a + I));
    EXPECT_EQ(size, base_size - I);
  }
TEST(LlvmLibcUtilsTest, DistanceToAlignDown) {
  EXPECT_EQ(distance_to_align_down<16>(forge(0)), UINT(0));
  EXPECT_EQ(distance_to_align_down<16>(forge(1)), UINT(1));
  EXPECT_EQ(distance_to_align_down<16>(forge(16)), UINT(0));
  EXPECT_EQ(distance_to_align_down<16>(forge(15)), UINT(15));
  EXPECT_EQ(distance_to_align_down<32>(forge(16)), UINT(16));
}

TEST(LlvmLibcUtilsTest, Adjust2) {
  char a, b;
  const size_t base_size = 10;
  for (size_t I = -2; I < 2; ++I) {
  for (ptrdiff_t I = -2; I < 2; ++I) {
    auto *p1 = &a;
    auto *p2 = &b;
    size_t size = base_size;
@ -131,19 +117,6 @@ TEST(LlvmLibcUtilsTest, Adjust2) {
  }
}

TEST(LlvmLibcUtilsTest, Align1) {
  char a;
  const size_t base_size = 10;
  {
    auto *ptr = &a;
    size_t size = base_size;
    align<128>(ptr, size);
    EXPECT_TRUE(uintptr_t(ptr) % 128 == 0);
    EXPECT_GE(ptr, &a);
    EXPECT_EQ(size_t(ptr - &a), base_size - size);
  }
}

TEST(LlvmLibcUtilsTest, Align2) {
  char a, b;
  const size_t base_size = 10;
@ -151,10 +124,10 @@ TEST(LlvmLibcUtilsTest, Align2) {
    auto *p1 = &a;
    auto *p2 = &b;
    size_t size = base_size;
    align<128, Arg::_1>(p1, p2, size);
    align_to_next_boundary<128, Arg::P1>(p1, p2, size);
    EXPECT_TRUE(uintptr_t(p1) % 128 == 0);
    EXPECT_GE(p1, &a);
    EXPECT_GE(p2, &b);
    EXPECT_GT(p1, &a);
    EXPECT_GT(p2, &b);
    EXPECT_EQ(size_t(p1 - &a), base_size - size);
    EXPECT_EQ(size_t(p2 - &b), base_size - size);
  }
@ -162,10 +135,10 @@ TEST(LlvmLibcUtilsTest, Align2) {
    auto *p1 = &a;
    auto *p2 = &b;
    size_t size = base_size;
    align<128, Arg::_2>(p1, p2, size);
    align_to_next_boundary<128, Arg::P2>(p1, p2, size);
    EXPECT_TRUE(uintptr_t(p2) % 128 == 0);
    EXPECT_GE(p1, &a);
    EXPECT_GE(p2, &b);
    EXPECT_GT(p1, &a);
    EXPECT_GT(p2, &b);
    EXPECT_EQ(size_t(p1 - &a), base_size - size);
    EXPECT_EQ(size_t(p2 - &b), base_size - size);
  }

@ -973,9 +973,10 @@ no_sanitize_features = [
cc_library(
    name = "string_memory_utils",
    hdrs = [
        "src/string/memory_utils/elements.h",
        "src/string/memory_utils/elements_aarch64.h",
        "src/string/memory_utils/elements_x86.h",
        "src/string/memory_utils/op_aarch64.h",
        "src/string/memory_utils/op_builtin.h",
        "src/string/memory_utils/op_generic.h",
        "src/string/memory_utils/op_x86.h",
        "src/string/memory_utils/utils.h",
    ],
    textual_hdrs = [
@ -988,6 +989,8 @@ cc_library(
    deps = [
        ":__support_common",
        ":__support_cpp_bit",
        ":__support_cpp_type_traits",
        ":__support_cpp_array",
        ":libc_root",
    ],
)