diff --git a/README.OpenSource b/README.OpenSource
index 55af8e65..adb4d8cd 100644
--- a/README.OpenSource
+++ b/README.OpenSource
@@ -3,8 +3,8 @@
         "Name": "vixl",
         "License": "BSD 3-clause",
         "License File": "LICENCE",
-        "Version Number": "7.0.0",
-        "Owner": "huanghuijin@huawei.com",
+        "Version Number": "8.0.0",
+        "Owner": "liyiming13@huawei.com",
         "Upstream URL": "https://github.com/Linaro/vixl",
         "Description": "vixl is a programmatic assemblers to generate A64, A32 or T32 code at runtime."
     }
diff --git a/README.md b/README.md
index f0255eaf..f114ac6a 100644
--- a/README.md
+++ b/README.md
@@ -1,4 +1,4 @@
-VIXL: ARMv8 Runtime Code Generation Library 7.0.0
+VIXL: ARMv8 Runtime Code Generation Library 8.0.0
 =================================================
 
 Contents:
diff --git a/SConstruct b/SConstruct
index 934a81e3..b855d646 100644
--- a/SConstruct
+++ b/SConstruct
@@ -98,7 +98,9 @@ options = {
       'CCFLAGS' : ['-O3'],
       },
     'simulator:aarch64' : {
-      'CCFLAGS' : ['-DVIXL_INCLUDE_SIMULATOR_AARCH64'],
+      'CCFLAGS' : ['-DVIXL_INCLUDE_SIMULATOR_AARCH64',
+                   '-pthread'],
+      'LINKFLAGS' : ['-pthread']
       },
     'symbols:on' : {
       'CCFLAGS' : ['-g'],
@@ -120,6 +122,9 @@ options = {
     'coverage:on' : {
       'CCFLAGS': ['-fprofile-instr-generate', '-fcoverage-mapping'],
       'LINKFLAGS': ['-fprofile-instr-generate', '-fcoverage-mapping']
+      },
+    'implicit_checks:on' : {
+      'CCFLAGS' : ['-DVIXL_ENABLE_IMPLICIT_CHECKS'],
       }
     }
 
@@ -265,6 +270,10 @@ vars.AddVariables(
     EnumVariable('negative_testing',
                   'Enable negative testing (needs exceptions)',
                  'off', allowed_values=['on', 'off']),
+    EnumVariable('implicit_checks',
+                 'Allow signals raised from simulated invalid (e.g: out of'
+                 + ' bounds) memory reads to be handled by the host.',
+                 'off', allowed_values=['on', 'off']),
     DefaultVariable('symbols', 'Include debugging symbols in the binaries',
                     ['on', 'off']),
     DefaultVariable('simulator', 'Simulators to include', ['aarch64', 'none']),
diff --git a/doc/range-limits.md b/doc/range-limits.md
new file mode 100644
index 00000000..cd7cf8bb
--- /dev/null
+++ b/doc/range-limits.md
@@ -0,0 +1,148 @@
+Immediate Range Limits in VIXL
+==============================
+
+VIXL's macro assembler tries to increase the range of branches and literal loads
+automatically for you, but applications must still be aware of these extended
+limits, and stay within them, in order to ensure valid code is generated.
+
+In debug builds, assertions prevent exceeding these limits at run time. In
+release builds, for performance reasons, the application is responsible for
+staying within the limits.
+
+You should decide what corrections should be applied in your application if it
+exceeds these limits.
+
+Terms
+-----
+
+**Bind** assigning an address to a label such that the instructions that refer
+to the label can be assigned PC-relative offsets.
+
+**Forward** a forward branch or load literal refers to a location that will
+be bound later in code generation, i.e. at a higher address.
+
+**Backward** a backward branch or load literal refers to a location that has
+already been bound earlier in code generation, i.e. at a lower address.
+
+**Instruction range** the range of values that can be encoded in the instruction
+to be generated. Outside the instruction range, additional instructions may be
+generated to increase the range, branching further than would be possible in
+one instruction, for example.
+
+**Veneer** a sequence of additional instructions produced to increase the
+instruction range.
+
+**Adjusted PC** the PC including its architecturally-defined offset. In AArch32
+T32, this is the current PC plus four bytes. In AArch64, there is no adjustment;
+Adjusted PC is equal to PC.
+
+AArch64
+-------
+
+### Branches
+
+All instructions and targets must be aligned to the instruction size, four
+bytes.
+
+#### Unconditional immediate branches (`B`)
+
+* Unconditional immediate branches have an instruction range of -134,217,728 to
++134,217,724 bytes from the current PC.
+* No veneers are applied to unconditional immediate branches to extend their
+instruction range.
+* Callers can use the function `IsValidImmPCOffset(UncondBranchType, offset)` to
+check `offset` (in units of instruction) is within the instruction range.
+
+#### Conditional branches (`B.cond`) and compare-and-branch (`CBZ`, `CBNZ`)
+
+* Conditional branch and compare-and-branch instructions have the same
+instruction range.
+* The instruction range is -1,048,576 to +1,048,572 bytes from the current PC.
+* Veneers are applied to extend the range to -134,217,724 to +135,266,296 bytes
+from the current PC.
+  * Unconditional branch range minus one instruction backwards.
+  * Unconditional branch range plus conditional branch range forwards.
+* Callers can use the functions `IsValidImmPCOffset(CondBranchType, offset)` and
+`IsValidImmPCOffset(CompareBranchType, offset)` to check `offset` (in units of
+instruction) is within the instruction range.
+
+#### Test-and-branch (`TBZ`, `TBNZ`)
+
+* Test-and-branch instructions have an instruction range of -32,768 to +32,764
+bytes from the current PC.
+* Veneers are applied to extend the range to -134,217,728 to +135,299,062 bytes
+from the current PC.
+  * Unconditional branch range minus one instruction backwards.
+  * Unconditional branch range plus test-and-branch range forwards.
+* Callers can use the function `IsValidImmPCOffset(TestBranchType, offset)` to
+check `offset` (in units of instruction) is within the instruction range.
+
+### Literals
+
+#### Compute PC-relative address (`ADR`)
+
+* Compute PC-relative address instructions have an instruction range of
+-1,048,576 to +1,048,575 bytes from the current PC.
+* No veneers are applied to extend the instruction range.
+* Callers can use `IsInt21(offset)` to check `offset` (in bytes) is within the
+instruction range.
+
+#### Load from PC-relative address (`LDR`)
+
+* Load from PC-relative address instructions have an instruction range of
+-1,048,576 to +1,048,572 bytes from the current PC. The offset must be four-byte
+aligned.
+* Automatically-placed literals (e.g. those created by `Ldr(reg, literal_value)`)
+will be emitted into code such that they are in range of the instructions that
+refer to them.
+* Veneers are not applied to manually-placed literals, i.e. those created by
+`Literal<T> x(value)` and emitted by `place()`.
+* Callers can use `IsInt19(offset)` to check `offset` (in units of instruction)
+is within the instruction range.
+
+AArch32
+-------
+
+Limits stated in this section relate to the T32 instruction encodings only.
+
+### Branches
+
+#### Unconditional immediate branches (`B`)
+
+* Unconditional immediate branches have an instruction range of -16,777,216 to
++16,777,214 bytes from the current adjusted PC.
+* Veneers are applied to forward branches to extend them to an unlimited range.
+* No veneers are applied to backward branches.
+
+#### Conditional immediate branches (`B`)
+
+* Conditional immediate branches have an instruction range of -1,048,576 to
++1,048,574 bytes from the current adjusted PC.
+* Veneers are applied to forward branches to extend them to an unlimited range.
+* Veneers are applied to backward branches to extend the range to that of
+unconditional immediate branches, -16,777,216 bytes from the current adjusted
+PC.
+
+#### Compare and branch (`CBZ`, `CBNZ`)
+
+* Compare and branch has an instruction range of 0 to +126 bytes from the
+current adjusted PC.
+* Veneers are applied to forward branches to extend them to an unlimited range.
+* Veneers are applied to backward branches to extend the range to that of
+unconditional immediate branches, -16,777,216 bytes from the current adjusted
+PC.
+
+### Literals
+
+#### Compute/load PC-relative address (`ADR`, `LDR`)
+
+* Compute and load PC-relative address instructions have the same instruction
+range.
+* The instruction range is -4,095 to +4,095 bytes from the current adjusted PC.
+The PC is aligned down to a four-byte boundary before the offset is added.
+* Automatically-placed literals (i.e. those created by `Literal<T> x(value)`)
+will be emitted into code such that they are in range of the instructions that
+refer to them.
+* Veneers are not applied to manually-placed literals, i.e. those created by
+`Literal<T> x(value, RawLiteral::kManuallyPlaced)` and emitted by `Place()`.
+
diff --git a/src/aarch32/instructions-aarch32.cc b/src/aarch32/instructions-aarch32.cc
index fe5458f1..f3ed0e01 100644
--- a/src/aarch32/instructions-aarch32.cc
+++ b/src/aarch32/instructions-aarch32.cc
@@ -636,20 +636,15 @@ ImmediateT32::ImmediateT32(uint32_t imm) {
 }
 
 
-static inline uint32_t ror(uint32_t x, int i) {
-  VIXL_ASSERT((0 < i) && (i < 32));
-  return (x >> i) | (x << (32 - i));
-}
-
-
 bool ImmediateT32::IsImmediateT32(uint32_t imm) {
   /* abcdefgh abcdefgh abcdefgh abcdefgh */
-  if ((imm ^ ror(imm, 8)) == 0) return true;
+  if (AllBytesMatch(imm)) return true;
   /* 00000000 abcdefgh 00000000 abcdefgh */
   /* abcdefgh 00000000 abcdefgh 00000000 */
-  if ((imm ^ ror(imm, 16)) == 0 &&
-      (((imm & 0xff00) == 0) || ((imm & 0xff) == 0)))
+  if (AllHalfwordsMatch(imm) &&
+      (((imm & 0xff00) == 0) || ((imm & 0xff) == 0))) {
     return true;
+  }
   /* isolate least-significant set bit */
   uint32_t lsb = imm & UnsignedNegate(imm);
   /* if imm is less than lsb*256 then it fits, but instead we test imm/256 to
@@ -697,7 +692,7 @@ bool ImmediateA32::IsImmediateA32(uint32_t imm) {
   if (imm < 256) return true;
   /* avoid getting confused by wrapped-around bytes (this transform has no
    * effect on pass/fail results) */
-  if (imm & 0xff000000) imm = ror(imm, 16);
+  if (imm & 0xff000000) imm = static_cast<uint32_t>(RotateRight(imm, 16, 32));
   /* copy odd-numbered set bits into even-numbered bits immediately below, so
    * that the least-significant set bit is always an even bit */
   imm = imm | ((imm >> 1) & 0x55555555);
diff --git a/src/aarch32/location-aarch32.h b/src/aarch32/location-aarch32.h
index 0959a55a..38800046 100644
--- a/src/aarch32/location-aarch32.h
+++ b/src/aarch32/location-aarch32.h
@@ -80,6 +80,8 @@ class Location : public LocationBase<int32_t> {
 #endif
   }
 
+  Location(Location&&) = default; // movable
+
   bool IsReferenced() const { return referenced_; }
 
  private:
diff --git a/src/aarch32/macro-assembler-aarch32.cc b/src/aarch32/macro-assembler-aarch32.cc
index 3a837ae8..e04f6905 100644
--- a/src/aarch32/macro-assembler-aarch32.cc
+++ b/src/aarch32/macro-assembler-aarch32.cc
@@ -1268,6 +1268,57 @@ void MacroAssembler::Delegate(InstructionType type,
 }
 
 
+void MacroAssembler::Delegate(InstructionType type,
+                              InstructionCondSizeL instruction,
+                              Condition cond,
+                              EncodingSize size,
+                              Location* location) {
+  VIXL_ASSERT(type == kB);
+
+  CONTEXT_SCOPE;
+
+  // Apply veneer to increase range of backwards conditional branches.
+  // This replaces:
+  //   label:
+  //    <instructions>
+  //    bcond label   ; T3
+  // With:
+  //   label:
+  //    <instructions>
+  //    binvcond skip ; T1
+  //    b label       ; T4
+  //   skip:
+  Location::Offset offset = location->GetLocation() -
+    (GetCursorOffset() + GetArchitectureStatePCOffset());
+  if (IsUsingT32() && location->IsBound() && ((offset & 0x1) == 0) &&
+      !cond.Is(al) && cond.IsNotNever()) {
+    // Bound locations must be earlier in the code.
+    VIXL_ASSERT(offset < 0);
+
+    // The offset must be within range of a T4 branch, accounting for the
+    // conditional branch (T1) we emit first, in order to jump over it.
+    offset -= k16BitT32InstructionSizeInBytes;
+    if (offset >= -16777216) {
+      CodeBufferCheckScope scope(this, k16BitT32InstructionSizeInBytes +
+                                       k32BitT32InstructionSizeInBytes);
+#ifndef PANDA_BUILD
+      Label skip;
+#else
+      Label skip(allocator_);
+#endif
+      b(cond.Negate(), Narrow, &skip);
+      b(location);
+      Bind(&skip);
+      return;
+    } else {
+      VIXL_ABORT_WITH_MSG("Conditional branch too far for veneer.\n");
+    }
+  }
+
+  Assembler::Delegate(type, instruction, cond, size, location);
+}
+
+
 template <typename T>
 static inline bool IsI64BitPattern(T imm) {
   for (T mask = 0xff << ((sizeof(T) - 1) * 8); mask != 0; mask >>= 8) {
diff --git a/src/aarch32/macro-assembler-aarch32.h b/src/aarch32/macro-assembler-aarch32.h
index 9742b1bc..f1ef2edb 100644
--- a/src/aarch32/macro-assembler-aarch32.h
+++ b/src/aarch32/macro-assembler-aarch32.h
@@ -1041,6 +1041,12 @@ ITScope(AllocatorWrapper allocator, MacroAssembler* masm,
                         InstructionRL instruction,
                         Register rn,
                         Location* location) VIXL_OVERRIDE;
+  // B
+  virtual void Delegate(InstructionType type,
+                        InstructionCondSizeL instruction,
+                        Condition cond,
+                        EncodingSize size,
+                        Location* location) VIXL_OVERRIDE;
   // VMOV
   virtual void Delegate(InstructionType type,
                         InstructionCondDtSSop instruction,
diff --git a/src/aarch64/assembler-aarch64.cc b/src/aarch64/assembler-aarch64.cc
index c0227177..8e7cee5b 100644
--- a/src/aarch64/assembler-aarch64.cc
+++ b/src/aarch64/assembler-aarch64.cc
@@ -1918,6 +1918,12 @@ void Assembler::sys(int op, const Register& xt) {
 }
 
 
+void Assembler::sysl(int op, const Register& xt) {
+  VIXL_ASSERT(xt.Is64Bits());
+  Emit(SYSL | SysOp(op) | Rt(xt));
+}
+
+
 void Assembler::dc(DataCacheOp op, const Register& rt) {
   if (op == CVAP) VIXL_ASSERT(CPUHas(CPUFeatures::kDCPoP));
   if (op == CVADP) VIXL_ASSERT(CPUHas(CPUFeatures::kDCCVADP));
@@ -1930,6 +1936,35 @@ void Assembler::ic(InstructionCacheOp op, const Register& rt) {
   sys(op, rt);
 }
 
+void Assembler::gcspushm(const Register& rt) {
+  VIXL_ASSERT(CPUHas(CPUFeatures::kGCS));
+  sys(GCSPUSHM, rt);
+}
+
+void Assembler::gcspopm(const Register& rt) {
+  VIXL_ASSERT(CPUHas(CPUFeatures::kGCS));
+  sysl(GCSPOPM, rt);
+}
+
+
+void Assembler::gcsss1(const Register& rt) {
+  VIXL_ASSERT(CPUHas(CPUFeatures::kGCS));
+  sys(GCSSS1, rt);
+}
+
+
+void Assembler::gcsss2(const Register& rt) {
+  VIXL_ASSERT(CPUHas(CPUFeatures::kGCS));
+  sysl(GCSSS2, rt);
+}
+
+
+void Assembler::chkfeat(const Register& rd) {
+  VIXL_ASSERT(rd.Is(x16));
+  USE(rd);
+  hint(CHKFEAT);
+}
+
 
 void Assembler::hint(SystemHint code) { hint(static_cast<int>(code)); }
 
@@ -2913,6 +2948,25 @@ void Assembler::st1(const VRegister& vt, int lane, const MemOperand& dst) {
   LoadStoreStructSingle(vt, lane, dst, NEONLoadStoreSingleStructStore1);
 }
 
+void Assembler::pmull(const VRegister& vd,
+                      const VRegister& vn,
+                      const VRegister& vm) {
+  VIXL_ASSERT(CPUHas(CPUFeatures::kNEON));
+  VIXL_ASSERT(AreSameFormat(vn, vm));
+  VIXL_ASSERT((vn.Is8B() && vd.Is8H()) || (vn.Is1D() && vd.Is1Q()));
+  VIXL_ASSERT(CPUHas(CPUFeatures::kPmull1Q) || vd.Is8H());
+  Emit(VFormat(vn) | NEON_PMULL | Rm(vm) | Rn(vn) | Rd(vd));
+}
+
+void Assembler::pmull2(const VRegister& vd,
+                       const VRegister& vn,
+                       const VRegister& vm) {
+  VIXL_ASSERT(CPUHas(CPUFeatures::kNEON));
+  VIXL_ASSERT(AreSameFormat(vn, vm));
+  VIXL_ASSERT((vn.Is16B() && vd.Is8H()) || (vn.Is2D() && vd.Is1Q()));
+  VIXL_ASSERT(CPUHas(CPUFeatures::kPmull1Q) || vd.Is8H());
+  Emit(VFormat(vn) | NEON_PMULL2 | Rm(vm) | Rn(vn) | Rd(vd));
+}
 
 void Assembler::NEON3DifferentL(const VRegister& vd,
                                 const VRegister& vn,
@@ -2960,8 +3014,6 @@ void Assembler::NEON3DifferentHN(const VRegister& vd,
 
 // clang-format off
 #define NEON_3DIFF_LONG_LIST(V) \
-  V(pmull,  NEON_PMULL,  vn.IsVector() && vn.Is8B())                           \
-  V(pmull2, NEON_PMULL2, vn.IsVector() && vn.Is16B())                          \
   V(saddl,  NEON_SADDL,  vn.IsVector() && vn.IsD())                            \
   V(saddl2, NEON_SADDL2, vn.IsVector() && vn.IsQ())                            \
   V(sabal,  NEON_SABAL,  vn.IsVector() && vn.IsD())                            \
@@ -4336,7 +4388,7 @@ void Assembler::sqrdmlah(const VRegister& vd,
                          const VRegister& vm) {
   VIXL_ASSERT(CPUHas(CPUFeatures::kNEON, CPUFeatures::kRDM));
   VIXL_ASSERT(AreSameFormat(vd, vn, vm));
-  VIXL_ASSERT(vd.IsVector() || !vd.IsQ());
+  VIXL_ASSERT(vd.IsLaneSizeH() || vd.IsLaneSizeS());
 
   Instr format, op = NEON_SQRDMLAH;
   if (vd.IsScalar()) {
@@ -4355,7 +4407,7 @@ void Assembler::sqrdmlsh(const VRegister& vd,
                          const VRegister& vm) {
   VIXL_ASSERT(CPUHas(CPUFeatures::kNEON, CPUFeatures::kRDM));
   VIXL_ASSERT(AreSameFormat(vd, vn, vm));
-  VIXL_ASSERT(vd.IsVector() || !vd.IsQ());
+  VIXL_ASSERT(vd.IsLaneSizeH() || vd.IsLaneSizeS());
 
   Instr format, op = NEON_SQRDMLSH;
   if (vd.IsScalar()) {
@@ -5824,6 +5876,263 @@ void Assembler::ummla(const VRegister& vd, const VRegister& vn, const VRegister&
   Emit(0x6e80a400 | Rd(vd) | Rn(vn) | Rm(vm));
 }
 
+void Assembler::bcax(const VRegister& vd, const VRegister& vn, const VRegister& vm, const VRegister& va) {
+  VIXL_ASSERT(CPUHas(CPUFeatures::kNEON));  // Bit Clear and exclusive-OR; needs NEON and SHA3.
+  VIXL_ASSERT(CPUHas(CPUFeatures::kSHA3));
+  VIXL_ASSERT(vd.Is16B() && vn.Is16B() && vm.Is16B() && va.Is16B());  // va is encoded too, so validate it as eor3 does.
+
+  Emit(0xce200000 | Rd(vd) | Rn(vn) | Rm(vm) | Ra(va));
+}
+
+void Assembler::eor3(const VRegister& vd, const VRegister& vn, const VRegister& vm, const VRegister& va) {
+  VIXL_ASSERT(CPUHas(CPUFeatures::kNEON));
+  VIXL_ASSERT(CPUHas(CPUFeatures::kSHA3));
+  VIXL_ASSERT(vd.Is16B() && vn.Is16B() && vm.Is16B() && va.Is16B());
+
+  Emit(0xce000000 | Rd(vd) | Rn(vn) | Rm(vm) | Ra(va));
+}
+
+void Assembler::xar(const VRegister& vd, const VRegister& vn, const VRegister& vm, int rotate) {
+  VIXL_ASSERT(CPUHas(CPUFeatures::kNEON));  // Exclusive-OR and Rotate; needs NEON and SHA3.
+  VIXL_ASSERT(CPUHas(CPUFeatures::kSHA3));
+  VIXL_ASSERT(vd.Is2D() && vn.Is2D() && vm.Is2D());
+  VIXL_ASSERT(IsUint6(rotate));  // The rotate amount is placed at bit 10 and must fit in six bits.
+
+  Emit(0xce800000 | Rd(vd) | Rn(vn) | Rm(vm) | (static_cast<uint32_t>(rotate) << 10));  // Shift as unsigned, as sm3tt* do.
+}
+
+void Assembler::rax1(const VRegister& vd, const VRegister& vn, const VRegister& vm) {
+  VIXL_ASSERT(CPUHas(CPUFeatures::kNEON));
+  VIXL_ASSERT(CPUHas(CPUFeatures::kSHA3));
+  VIXL_ASSERT(vd.Is2D() && vn.Is2D() && vm.Is2D());
+
+  Emit(0xce608c00 | Rd(vd) | Rn(vn) | Rm(vm));
+}
+
+void Assembler::sha1c(const VRegister& vd, const VRegister& vn, const VRegister& vm) {
+  VIXL_ASSERT(CPUHas(CPUFeatures::kNEON));
+  VIXL_ASSERT(CPUHas(CPUFeatures::kSHA1));
+  VIXL_ASSERT(vd.IsQ() && vn.IsS() && vm.Is4S());
+
+  Emit(0x5e000000 | Rd(vd) | Rn(vn) | Rm(vm));
+}
+
+void Assembler::sha1h(const VRegister& sd, const VRegister& sn) {
+  VIXL_ASSERT(CPUHas(CPUFeatures::kNEON));
+  VIXL_ASSERT(CPUHas(CPUFeatures::kSHA1));
+  VIXL_ASSERT(sd.IsS() && sn.IsS());
+
+  Emit(0x5e280800 | Rd(sd) | Rn(sn));
+}
+
+void Assembler::sha1m(const VRegister& vd, const VRegister& vn, const VRegister& vm) {
+  VIXL_ASSERT(CPUHas(CPUFeatures::kNEON));
+  VIXL_ASSERT(CPUHas(CPUFeatures::kSHA1));
+  VIXL_ASSERT(vd.IsQ() && vn.IsS() && vm.Is4S());
+
+  Emit(0x5e002000 | Rd(vd) | Rn(vn) | Rm(vm));
+}
+
+void Assembler::sha1p(const VRegister& vd, const VRegister& vn, const VRegister& vm) {
+  VIXL_ASSERT(CPUHas(CPUFeatures::kNEON));
+  VIXL_ASSERT(CPUHas(CPUFeatures::kSHA1));
+  VIXL_ASSERT(vd.IsQ() && vn.IsS() && vm.Is4S());
+
+  Emit(0x5e001000 | Rd(vd) | Rn(vn) | Rm(vm));
+}
+
+void Assembler::sha1su0(const VRegister& vd, const VRegister& vn, const VRegister& vm) {
+  VIXL_ASSERT(CPUHas(CPUFeatures::kNEON));
+  VIXL_ASSERT(CPUHas(CPUFeatures::kSHA1));
+  VIXL_ASSERT(vd.Is4S() && vn.Is4S() && vm.Is4S());
+
+  Emit(0x5e003000 | Rd(vd) | Rn(vn) | Rm(vm));
+}
+
+void Assembler::sha1su1(const VRegister& vd, const VRegister& vn) {
+  VIXL_ASSERT(CPUHas(CPUFeatures::kNEON));
+  VIXL_ASSERT(CPUHas(CPUFeatures::kSHA1));
+  VIXL_ASSERT(vd.Is4S() && vn.Is4S());
+
+  Emit(0x5e281800 | Rd(vd) | Rn(vn));
+}
+
+void Assembler::sha256h(const VRegister& vd, const VRegister& vn, const VRegister& vm) {
+  VIXL_ASSERT(CPUHas(CPUFeatures::kNEON));
+  VIXL_ASSERT(CPUHas(CPUFeatures::kSHA2));
+  VIXL_ASSERT(vd.IsQ() && vn.IsQ() && vm.Is4S());
+
+  Emit(0x5e004000 | Rd(vd) | Rn(vn) | Rm(vm));
+}
+
+void Assembler::sha256h2(const VRegister& vd, const VRegister& vn, const VRegister& vm) {
+  VIXL_ASSERT(CPUHas(CPUFeatures::kNEON));
+  VIXL_ASSERT(CPUHas(CPUFeatures::kSHA2));
+  VIXL_ASSERT(vd.IsQ() && vn.IsQ() && vm.Is4S());
+
+  Emit(0x5e005000 | Rd(vd) | Rn(vn) | Rm(vm));
+}
+
+void Assembler::sha256su0(const VRegister& vd, const VRegister& vn) {
+  VIXL_ASSERT(CPUHas(CPUFeatures::kNEON));
+  VIXL_ASSERT(CPUHas(CPUFeatures::kSHA2));
+  VIXL_ASSERT(vd.Is4S() && vn.Is4S());
+
+  Emit(0x5e282800 | Rd(vd) | Rn(vn));
+}
+
+void Assembler::sha256su1(const VRegister& vd, const VRegister& vn, const VRegister& vm) {
+  VIXL_ASSERT(CPUHas(CPUFeatures::kNEON));
+  VIXL_ASSERT(CPUHas(CPUFeatures::kSHA2));
+  VIXL_ASSERT(vd.Is4S() && vn.Is4S() && vm.Is4S());
+
+  Emit(0x5e006000 | Rd(vd) | Rn(vn) | Rm(vm));
+}
+
+void Assembler::sha512h(const VRegister& vd, const VRegister& vn, const VRegister& vm) {
+  VIXL_ASSERT(CPUHas(CPUFeatures::kNEON));
+  VIXL_ASSERT(CPUHas(CPUFeatures::kSHA512));
+  VIXL_ASSERT(vd.IsQ() && vn.IsQ() && vm.Is2D());
+
+  Emit(0xce608000 | Rd(vd) | Rn(vn) | Rm(vm));
+}
+
+void Assembler::sha512h2(const VRegister& vd, const VRegister& vn, const VRegister& vm) {
+  VIXL_ASSERT(CPUHas(CPUFeatures::kNEON));
+  VIXL_ASSERT(CPUHas(CPUFeatures::kSHA512));
+  VIXL_ASSERT(vd.IsQ() && vn.IsQ() && vm.Is2D());
+
+  Emit(0xce608400 | Rd(vd) | Rn(vn) | Rm(vm));
+}
+
+void Assembler::sha512su0(const VRegister& vd, const VRegister& vn) {
+  VIXL_ASSERT(CPUHas(CPUFeatures::kNEON));
+  VIXL_ASSERT(CPUHas(CPUFeatures::kSHA512));
+  VIXL_ASSERT(vd.Is2D() && vn.Is2D());
+
+  Emit(0xcec08000 | Rd(vd) | Rn(vn));
+}
+
+void Assembler::sha512su1(const VRegister& vd, const VRegister& vn, const VRegister& vm) {
+  VIXL_ASSERT(CPUHas(CPUFeatures::kNEON));
+  VIXL_ASSERT(CPUHas(CPUFeatures::kSHA512));
+  VIXL_ASSERT(vd.Is2D() && vn.Is2D() && vm.Is2D());
+
+  Emit(0xce608800 | Rd(vd) | Rn(vn) | Rm(vm));
+}
+
+void Assembler::aesd(const VRegister& vd, const VRegister& vn) {
+  VIXL_ASSERT(CPUHas(CPUFeatures::kNEON));
+  VIXL_ASSERT(CPUHas(CPUFeatures::kAES));
+  VIXL_ASSERT(vd.Is16B() && vn.Is16B());
+
+  Emit(0x4e285800 | Rd(vd) | Rn(vn));
+}
+
+void Assembler::aese(const VRegister& vd, const VRegister& vn) {
+  VIXL_ASSERT(CPUHas(CPUFeatures::kNEON));
+  VIXL_ASSERT(CPUHas(CPUFeatures::kAES));
+  VIXL_ASSERT(vd.Is16B() && vn.Is16B());
+
+  Emit(0x4e284800 | Rd(vd) | Rn(vn));
+}
+
+void Assembler::aesimc(const VRegister& vd, const VRegister& vn) {
+  VIXL_ASSERT(CPUHas(CPUFeatures::kNEON));
+  VIXL_ASSERT(CPUHas(CPUFeatures::kAES));
+  VIXL_ASSERT(vd.Is16B() && vn.Is16B());
+
+  Emit(0x4e287800 | Rd(vd) | Rn(vn));
+}
+
+void Assembler::aesmc(const VRegister& vd, const VRegister& vn) {
+  VIXL_ASSERT(CPUHas(CPUFeatures::kNEON));
+  VIXL_ASSERT(CPUHas(CPUFeatures::kAES));
+  VIXL_ASSERT(vd.Is16B() && vn.Is16B());
+
+  Emit(0x4e286800 | Rd(vd) | Rn(vn));
+}
+
+void Assembler::sm3partw1(const VRegister& vd, const VRegister& vn, const VRegister& vm) {
+  VIXL_ASSERT(CPUHas(CPUFeatures::kNEON));
+  VIXL_ASSERT(CPUHas(CPUFeatures::kSM3));
+  VIXL_ASSERT(vd.Is4S() && vn.Is4S() && vm.Is4S());
+
+  Emit(0xce60c000 | Rd(vd) | Rn(vn) | Rm(vm));
+}
+
+void Assembler::sm3partw2(const VRegister& vd, const VRegister& vn, const VRegister& vm) {
+  VIXL_ASSERT(CPUHas(CPUFeatures::kNEON));
+  VIXL_ASSERT(CPUHas(CPUFeatures::kSM3));
+  VIXL_ASSERT(vd.Is4S() && vn.Is4S() && vm.Is4S());
+
+  Emit(0xce60c400 | Rd(vd) | Rn(vn) | Rm(vm));
+}
+
+void Assembler::sm3ss1(const VRegister& vd, const VRegister& vn, const VRegister& vm, const VRegister& va) {
+  VIXL_ASSERT(CPUHas(CPUFeatures::kNEON));
+  VIXL_ASSERT(CPUHas(CPUFeatures::kSM3));
+  VIXL_ASSERT(vd.Is4S() && vn.Is4S() && vm.Is4S() && va.Is4S());
+
+  Emit(0xce400000 | Rd(vd) | Rn(vn) | Rm(vm) | Ra(va));
+}
+
+void Assembler::sm3tt1a(const VRegister& vd, const VRegister& vn, const VRegister& vm, int index) {
+  VIXL_ASSERT(CPUHas(CPUFeatures::kNEON));
+  VIXL_ASSERT(CPUHas(CPUFeatures::kSM3));
+  VIXL_ASSERT(vd.Is4S() && vn.Is4S() && vm.Is4S());
+  VIXL_ASSERT(IsUint2(index));
+
+  Instr i = static_cast<uint32_t>(index) << 12;
+  Emit(0xce408000 | Rd(vd) | Rn(vn) | Rm(vm) | i);
+}
+
+void Assembler::sm3tt1b(const VRegister& vd, const VRegister& vn, const VRegister& vm, int index) {
+  VIXL_ASSERT(CPUHas(CPUFeatures::kNEON));
+  VIXL_ASSERT(CPUHas(CPUFeatures::kSM3));
+  VIXL_ASSERT(vd.Is4S() && vn.Is4S() && vm.Is4S());
+  VIXL_ASSERT(IsUint2(index));
+
+  Instr i = static_cast<uint32_t>(index) << 12;
+  Emit(0xce408400 | Rd(vd) | Rn(vn) | Rm(vm) | i);
+}
+
+void Assembler::sm3tt2a(const VRegister& vd, const VRegister& vn, const VRegister& vm, int index) {
+  VIXL_ASSERT(CPUHas(CPUFeatures::kNEON));
+  VIXL_ASSERT(CPUHas(CPUFeatures::kSM3));
+  VIXL_ASSERT(vd.Is4S() && vn.Is4S() && vm.Is4S());
+  VIXL_ASSERT(IsUint2(index));
+
+  Instr i = static_cast<uint32_t>(index) << 12;
+  Emit(0xce408800 | Rd(vd) | Rn(vn) | Rm(vm) | i);
+}
+
+void Assembler::sm3tt2b(const VRegister& vd, const VRegister& vn, const VRegister& vm, int index) {
+  VIXL_ASSERT(CPUHas(CPUFeatures::kNEON));
+  VIXL_ASSERT(CPUHas(CPUFeatures::kSM3));
+  VIXL_ASSERT(vd.Is4S() && vn.Is4S() && vm.Is4S());
+  VIXL_ASSERT(IsUint2(index));
+
+  Instr i = static_cast<uint32_t>(index) << 12;
+  Emit(0xce408c00 | Rd(vd) | Rn(vn) | Rm(vm) | i);
+}
+
+void Assembler::sm4e(const VRegister& vd, const VRegister& vn) {
+  VIXL_ASSERT(CPUHas(CPUFeatures::kNEON));
+  VIXL_ASSERT(CPUHas(CPUFeatures::kSM4));
+  VIXL_ASSERT(vd.Is4S() && vn.Is4S());
+
+  Emit(0xcec08400 | Rd(vd) | Rn(vn));
+}
+
+void Assembler::sm4ekey(const VRegister& vd, const VRegister& vn, const VRegister& vm) {
+  VIXL_ASSERT(CPUHas(CPUFeatures::kNEON));
+  VIXL_ASSERT(CPUHas(CPUFeatures::kSM4));
+  VIXL_ASSERT(vd.Is4S() && vn.Is4S() && vm.Is4S());
+
+  Emit(0xce60c800 | Rd(vd) | Rn(vn) | Rm(vm));
+}
+
 // Note:
 // For all ToImm instructions below, a difference in case
 // for the same letter indicates a negated bit.
@@ -6868,6 +7177,7 @@ bool Assembler::CPUHas(SystemRegister sysreg) const {
       return CPUHas(CPUFeatures::kRNG);
     case FPCR:
     case NZCV:
+    case DCZID_EL0:
       break;
   }
   return true;
diff --git a/src/aarch64/assembler-aarch64.h b/src/aarch64/assembler-aarch64.h
index da0a70c4..b4098c13 100644
--- a/src/aarch64/assembler-aarch64.h
+++ b/src/aarch64/assembler-aarch64.h
@@ -2183,6 +2183,9 @@ class Assembler : public vixl::internal::AssemblerBase {
   // System instruction with pre-encoded op (op1:crn:crm:op2).
   void sys(int op, const Register& xt = xzr);
 
+  // System instruction with result.
+  void sysl(int op, const Register& xt = xzr);
+
   // System data cache operation.
   void dc(DataCacheOp op, const Register& rt);
 
@@ -3643,6 +3646,123 @@ class Assembler : public vixl::internal::AssemblerBase {
   // Unsigned 8-bit integer matrix multiply-accumulate (vector).
   void ummla(const VRegister& vd, const VRegister& vn, const VRegister& vm);
 
+  // Bit Clear and exclusive-OR.
+  void bcax(const VRegister& vd,
+            const VRegister& vn,
+            const VRegister& vm,
+            const VRegister& va);
+
+  // Three-way Exclusive-OR.
+  void eor3(const VRegister& vd,
+            const VRegister& vn,
+            const VRegister& vm,
+            const VRegister& va);
+
+  // Exclusive-OR and Rotate.
+  void xar(const VRegister& vd,
+           const VRegister& vn,
+           const VRegister& vm,
+           int rotate);
+
+  // Rotate and Exclusive-OR
+  void rax1(const VRegister& vd, const VRegister& vn, const VRegister& vm);
+
+  // SHA1 hash update (choose).
+  void sha1c(const VRegister& vd, const VRegister& vn, const VRegister& vm);
+
+  // SHA1 fixed rotate.
+  void sha1h(const VRegister& sd, const VRegister& sn);
+
+  // SHA1 hash update (majority).
+  void sha1m(const VRegister& vd, const VRegister& vn, const VRegister& vm);
+
+  // SHA1 hash update (parity).
+  void sha1p(const VRegister& vd, const VRegister& vn, const VRegister& vm);
+
+  // SHA1 schedule update 0.
+  void sha1su0(const VRegister& vd, const VRegister& vn, const VRegister& vm);
+
+  // SHA1 schedule update 1.
+  void sha1su1(const VRegister& vd, const VRegister& vn);
+
+  // SHA256 hash update (part 1).
+  void sha256h(const VRegister& vd, const VRegister& vn, const VRegister& vm);
+
+  // SHA256 hash update (part 2).
+  void sha256h2(const VRegister& vd, const VRegister& vn, const VRegister& vm);
+
+  // SHA256 schedule update 0.
+  void sha256su0(const VRegister& vd, const VRegister& vn);
+
+  // SHA256 schedule update 1.
+  void sha256su1(const VRegister& vd, const VRegister& vn, const VRegister& vm);
+
+  // SHA512 hash update part 1.
+  void sha512h(const VRegister& vd, const VRegister& vn, const VRegister& vm);
+
+  // SHA512 hash update part 2.
+  void sha512h2(const VRegister& vd, const VRegister& vn, const VRegister& vm);
+
+  // SHA512 schedule Update 0.
+  void sha512su0(const VRegister& vd, const VRegister& vn);
+
+  // SHA512 schedule Update 1.
+  void sha512su1(const VRegister& vd, const VRegister& vn, const VRegister& vm);
+
+  // AES single round decryption.
+  void aesd(const VRegister& vd, const VRegister& vn);
+
+  // AES single round encryption.
+  void aese(const VRegister& vd, const VRegister& vn);
+
+  // AES inverse mix columns.
+  void aesimc(const VRegister& vd, const VRegister& vn);
+
+  // AES mix columns.
+  void aesmc(const VRegister& vd, const VRegister& vn);
+
+  // SM3PARTW1.
+  void sm3partw1(const VRegister& vd, const VRegister& vn, const VRegister& vm);
+
+  // SM3PARTW2.
+  void sm3partw2(const VRegister& vd, const VRegister& vn, const VRegister& vm);
+
+  // SM3SS1.
+  void sm3ss1(const VRegister& vd,
+              const VRegister& vn,
+              const VRegister& vm,
+              const VRegister& va);
+
+  // SM3TT1A.
+  void sm3tt1a(const VRegister& vd,
+               const VRegister& vn,
+               const VRegister& vm,
+               int index);
+
+  // SM3TT1B.
+  void sm3tt1b(const VRegister& vd,
+               const VRegister& vn,
+               const VRegister& vm,
+               int index);
+
+  // SM3TT2A.
+  void sm3tt2a(const VRegister& vd,
+               const VRegister& vn,
+               const VRegister& vm,
+               int index);
+
+  // SM3TT2B.
+  void sm3tt2b(const VRegister& vd,
+               const VRegister& vn,
+               const VRegister& vm,
+               int index);
+
+  // SM4 Encode.
+  void sm4e(const VRegister& vd, const VRegister& vn);
+
+  // SM4 Key.
+  void sm4ekey(const VRegister& vd, const VRegister& vn, const VRegister& vm);
+
   // Scalable Vector Extensions.
 
   // Absolute value (predicated).
@@ -7097,6 +7217,21 @@ class Assembler : public vixl::internal::AssemblerBase {
   // Unsigned Minimum.
   void umin(const Register& rd, const Register& rn, const Operand& op);
 
+  // Check feature status.
+  void chkfeat(const Register& rd);
+
+  // Guarded Control Stack Push.
+  void gcspushm(const Register& rt);
+
+  // Guarded Control Stack Pop.
+  void gcspopm(const Register& rt);
+
+  // Guarded Control Stack Switch Stack 1.
+  void gcsss1(const Register& rt);
+
+  // Guarded Control Stack Switch Stack 2.
+  void gcsss2(const Register& rt);
+
   // Emit generic instructions.
 
   // Emit raw instructions into the instruction stream.
@@ -7565,6 +7700,8 @@ class Assembler : public vixl::internal::AssemblerBase {
   static Instr VFormat(VRegister vd) {
     if (vd.Is64Bits()) {
       switch (vd.GetLanes()) {
+        case 1:
+          return NEON_1D;
         case 2:
           return NEON_2S;
         case 4:
diff --git a/src/aarch64/assembler-sve-aarch64.cc b/src/aarch64/assembler-sve-aarch64.cc
index e99cfdcd..0c3c7f88 100644
--- a/src/aarch64/assembler-sve-aarch64.cc
+++ b/src/aarch64/assembler-sve-aarch64.cc
@@ -7410,13 +7410,13 @@ void Assembler::pmullb(const ZRegister& zd,
   //  size<23:22> | Zm<20:16> | op<12> | U<11> | T<10> | Zn<9:5> | Zd<4:0>
 
   VIXL_ASSERT(CPUHas(CPUFeatures::kSVE2));
+  VIXL_ASSERT(CPUHas(CPUFeatures::kSVEPmull128) || !zd.IsLaneSizeQ());
   VIXL_ASSERT(AreSameLaneSize(zn, zm));
   VIXL_ASSERT(!zd.IsLaneSizeB() && !zd.IsLaneSizeS());
   VIXL_ASSERT(zd.GetLaneSizeInBytes() == zn.GetLaneSizeInBytes() * 2);
-  // SVEPmull128 is not supported
-  VIXL_ASSERT(!zd.IsLaneSizeQ());
+  Instr size = zd.IsLaneSizeQ() ? 0 : SVESize(zd);
 
-  Emit(0x45006800 | SVESize(zd) | Rd(zd) | Rn(zn) | Rm(zm));
+  Emit(0x45006800 | size | Rd(zd) | Rn(zn) | Rm(zm));
 }
 
 void Assembler::pmullt(const ZRegister& zd,
@@ -7427,13 +7427,13 @@ void Assembler::pmullt(const ZRegister& zd,
   //  size<23:22> | Zm<20:16> | op<12> | U<11> | T<10> | Zn<9:5> | Zd<4:0>
 
   VIXL_ASSERT(CPUHas(CPUFeatures::kSVE2));
+  VIXL_ASSERT(CPUHas(CPUFeatures::kSVEPmull128) || !zd.IsLaneSizeQ());
   VIXL_ASSERT(AreSameLaneSize(zn, zm));
   VIXL_ASSERT(!zd.IsLaneSizeB() && !zd.IsLaneSizeS());
   VIXL_ASSERT(zd.GetLaneSizeInBytes() == zn.GetLaneSizeInBytes() * 2);
-  // SVEPmull128 is not supported
-  VIXL_ASSERT(!zd.IsLaneSizeQ());
+  Instr size = zd.IsLaneSizeQ() ? 0 : SVESize(zd);
 
-  Emit(0x45006c00 | SVESize(zd) | Rd(zd) | Rn(zn) | Rm(zm));
+  Emit(0x45006c00 | size | Rd(zd) | Rn(zn) | Rm(zm));
 }
 
 void Assembler::raddhnb(const ZRegister& zd,
diff --git a/src/aarch64/constants-aarch64.h b/src/aarch64/constants-aarch64.h
index 20bd12f9..279587cf 100644
--- a/src/aarch64/constants-aarch64.h
+++ b/src/aarch64/constants-aarch64.h
@@ -389,7 +389,8 @@ enum SystemHint {
   BTI    = 32,
   BTI_c  = 34,
   BTI_j  = 36,
-  BTI_jc = 38
+  BTI_jc = 38,
+  CHKFEAT = 40
 };
 
 enum BranchTargetIdentifier {
@@ -500,7 +501,8 @@ enum SystemRegister {
   NZCV = SystemRegisterEncoder<3, 3, 4, 2, 0>::value,
   FPCR = SystemRegisterEncoder<3, 3, 4, 4, 0>::value,
   RNDR = SystemRegisterEncoder<3, 3, 2, 4, 0>::value,    // Random number.
-  RNDRRS = SystemRegisterEncoder<3, 3, 2, 4, 1>::value   // Reseeded random number.
+  RNDRRS = SystemRegisterEncoder<3, 3, 2, 4, 1>::value,  // Reseeded random number.
+  DCZID_EL0 = SystemRegisterEncoder<3, 3, 0, 0, 7>::value
 };
 
 template<int op1, int crn, int crm, int op2>
@@ -534,6 +536,13 @@ enum DataCacheOp {
   CIGDVAC = CacheOpEncoder<3, 7, 14, 5>::value
 };
 
+enum GCSOp {
+  GCSPUSHM = CacheOpEncoder<3, 7, 7, 0>::value,
+  GCSPOPM = CacheOpEncoder<3, 7, 7, 1>::value,
+  GCSSS1 = CacheOpEncoder<3, 7, 7, 2>::value,
+  GCSSS2 = CacheOpEncoder<3, 7, 7, 3>::value
+};
+
 // Some SVE instructions support a predicate constraint pattern. This is
 // interpreted as a VL-dependent value, and is typically used to initialise
 // predicates, or to otherwise limit the number of processed elements.
@@ -942,7 +951,8 @@ enum SystemSysOp {
   SystemSysFixed  = 0xD5080000,
   SystemSysFMask  = 0xFFF80000,
   SystemSysMask   = 0xFFF80000,
-  SYS             = SystemSysFixed | 0x00000000
+  SYS             = SystemSysFixed | 0x00000000,
+  SYSL            = SystemSysFixed | 0x00200000
 };
 
 // Exception.
diff --git a/src/aarch64/cpu-features-auditor-aarch64.cc b/src/aarch64/cpu-features-auditor-aarch64.cc
index 4efdef82..b447cad2 100644
--- a/src/aarch64/cpu-features-auditor-aarch64.cc
+++ b/src/aarch64/cpu-features-auditor-aarch64.cc
@@ -244,16 +244,47 @@ void CPUFeaturesAuditor::VisitConditionalSelect(const Instruction* instr) {
 
 void CPUFeaturesAuditor::VisitCrypto2RegSHA(const Instruction* instr) {
   RecordInstructionFeaturesScope scope(this);
+  if (form_hash_ == "sha256su0_vv_cryptosha2"_h) {
+    scope.Record(CPUFeatures::kNEON, CPUFeatures::kSHA2);
+  } else {
+    scope.Record(CPUFeatures::kNEON, CPUFeatures::kSHA1);
+  }
   USE(instr);
 }
 
 void CPUFeaturesAuditor::VisitCrypto3RegSHA(const Instruction* instr) {
   RecordInstructionFeaturesScope scope(this);
+  switch (form_hash_) {
+    case "sha1c_qsv_cryptosha3"_h:
+    case "sha1m_qsv_cryptosha3"_h:
+    case "sha1p_qsv_cryptosha3"_h:
+    case "sha1su0_vvv_cryptosha3"_h:
+      scope.Record(CPUFeatures::kNEON, CPUFeatures::kSHA1);
+      break;
+    case "sha256h_qqv_cryptosha3"_h:
+    case "sha256h2_qqv_cryptosha3"_h:
+    case "sha256su1_vvv_cryptosha3"_h:
+      scope.Record(CPUFeatures::kNEON, CPUFeatures::kSHA2);
+      break;
+  }
   USE(instr);
 }
 
 void CPUFeaturesAuditor::VisitCryptoAES(const Instruction* instr) {
   RecordInstructionFeaturesScope scope(this);
+  scope.Record(CPUFeatures::kNEON, CPUFeatures::kAES);
+  USE(instr);
+}
+
+void CPUFeaturesAuditor::VisitCryptoSM3(const Instruction* instr) {
+  RecordInstructionFeaturesScope scope(this);
+  scope.Record(CPUFeatures::kNEON, CPUFeatures::kSM3);
+  USE(instr);
+}
+
+void CPUFeaturesAuditor::VisitCryptoSM4(const Instruction* instr) {
+  RecordInstructionFeaturesScope scope(this);
+  scope.Record(CPUFeatures::kNEON, CPUFeatures::kSM4);
   USE(instr);
 }
 
@@ -733,6 +764,12 @@ void CPUFeaturesAuditor::VisitNEON3Different(const Instruction* instr) {
   RecordInstructionFeaturesScope scope(this);
   // All of these instructions require NEON.
   scope.Record(CPUFeatures::kNEON);
+  if (form_hash_ == "pmull_asimddiff_l"_h) {
+    if (instr->GetNEONSize() == 3) {
+      // Source is 1D or 2D, destination is 1Q.
+      scope.Record(CPUFeatures::kPmull1Q);
+    }
+  }
   USE(instr);
 }
 
@@ -1267,91 +1304,93 @@ VIXL_SIMPLE_SVE_VISITOR_LIST(VIXL_DEFINE_SIMPLE_SVE_VISITOR)
 
 void CPUFeaturesAuditor::VisitSystem(const Instruction* instr) {
   RecordInstructionFeaturesScope scope(this);
-  if (instr->Mask(SystemHintFMask) == SystemHintFixed) {
-    CPUFeatures required;
-    switch (instr->GetInstructionBits()) {
-      case PACIA1716:
-      case PACIB1716:
-      case AUTIA1716:
-      case AUTIB1716:
-      case PACIAZ:
-      case PACIASP:
-      case PACIBZ:
-      case PACIBSP:
-      case AUTIAZ:
-      case AUTIASP:
-      case AUTIBZ:
-      case AUTIBSP:
-      case XPACLRI:
-        required.Combine(CPUFeatures::kPAuth);
-        break;
-      default:
-        switch (instr->GetImmHint()) {
-          case ESB:
-            required.Combine(CPUFeatures::kRAS);
-            break;
-          case BTI:
-          case BTI_j:
-          case BTI_c:
-          case BTI_jc:
-            required.Combine(CPUFeatures::kBTI);
-            break;
-          default:
-            break;
-        }
-        break;
-    }
 
-    // These are all HINT instructions, and behave as NOPs if the corresponding
-    // features are not implemented, so we record the corresponding features
-    // only if they are available.
-    if (available_.Has(required)) scope.Record(required);
-  } else if (instr->Mask(SystemSysMask) == SYS) {
-    switch (instr->GetSysOp()) {
-      // DC instruction variants.
-      case CGVAC:
-      case CGDVAC:
-      case CGVAP:
-      case CGDVAP:
-      case CIGVAC:
-      case CIGDVAC:
-      case GVA:
-      case GZVA:
-        scope.Record(CPUFeatures::kMTE);
-        break;
-      case CVAP:
-        scope.Record(CPUFeatures::kDCPoP);
-        break;
-      case CVADP:
-        scope.Record(CPUFeatures::kDCCVADP);
-        break;
-      case IVAU:
-      case CVAC:
-      case CVAU:
-      case CIVAC:
-      case ZVA:
-        // No special CPU features.
-        break;
-    }
-  } else if (instr->Mask(SystemPStateFMask) == SystemPStateFixed) {
-    switch (instr->Mask(SystemPStateMask)) {
-      case CFINV:
-        scope.Record(CPUFeatures::kFlagM);
-        break;
-      case AXFLAG:
-      case XAFLAG:
-        scope.Record(CPUFeatures::kAXFlag);
-        break;
-    }
-  } else if (instr->Mask(SystemSysRegFMask) == SystemSysRegFixed) {
-    if (instr->Mask(SystemSysRegMask) == MRS) {
+  CPUFeatures required;
+  switch (form_hash_) {
+    case "pacib1716_hi_hints"_h:
+    case "pacia1716_hi_hints"_h:
+    case "pacibsp_hi_hints"_h:
+    case "paciasp_hi_hints"_h:
+    case "pacibz_hi_hints"_h:
+    case "paciaz_hi_hints"_h:
+    case "autib1716_hi_hints"_h:
+    case "autia1716_hi_hints"_h:
+    case "autibsp_hi_hints"_h:
+    case "autiasp_hi_hints"_h:
+    case "autibz_hi_hints"_h:
+    case "autiaz_hi_hints"_h:
+    case "xpaclri_hi_hints"_h:
+      required.Combine(CPUFeatures::kPAuth);
+      break;
+    case "esb_hi_hints"_h:
+      required.Combine(CPUFeatures::kRAS);
+      break;
+    case "bti_hb_hints"_h:
+      required.Combine(CPUFeatures::kBTI);
+      break;
+  }
+
+  // The instructions above are all HINTs and behave as NOPs if the
+  // corresponding features are not implemented, so we record the corresponding
+  // features only if they are available.
+  if (available_.Has(required)) scope.Record(required);
+
+  switch (form_hash_) {
+    case "cfinv_m_pstate"_h:
+      scope.Record(CPUFeatures::kFlagM);
+      break;
+    case "axflag_m_pstate"_h:
+    case "xaflag_m_pstate"_h:
+      scope.Record(CPUFeatures::kAXFlag);
+      break;
+    case "mrs_rs_systemmove"_h:
       switch (instr->GetImmSystemRegister()) {
         case RNDR:
         case RNDRRS:
           scope.Record(CPUFeatures::kRNG);
           break;
       }
-    }
+      break;
+    case "sys_cr_systeminstrs"_h:
+      switch (instr->GetSysOp()) {
+        // DC instruction variants.
+        case CGVAC:
+        case CGDVAC:
+        case CGVAP:
+        case CGDVAP:
+        case CIGVAC:
+        case CIGDVAC:
+        case GVA:
+        case GZVA:
+          scope.Record(CPUFeatures::kMTE);
+          break;
+        case CVAP:
+          scope.Record(CPUFeatures::kDCPoP);
+          break;
+        case CVADP:
+          scope.Record(CPUFeatures::kDCCVADP);
+          break;
+        case IVAU:
+        case CVAC:
+        case CVAU:
+        case CIVAC:
+        case ZVA:
+          // No special CPU features.
+          break;
+        case GCSPUSHM:
+        case GCSSS1:
+          scope.Record(CPUFeatures::kGCS);
+          break;
+      }
+      break;
+    case "sysl_rc_systeminstrs"_h:
+      switch (instr->GetSysOp()) {
+        case GCSPOPM:
+        case GCSSS2:
+          scope.Record(CPUFeatures::kGCS);
+          break;
+      }
+      break;
   }
 }
 
@@ -1405,9 +1444,9 @@ void CPUFeaturesAuditor::VisitUnimplemented(const Instruction* instr) {
 void CPUFeaturesAuditor::Visit(Metadata* metadata, const Instruction* instr) {
   VIXL_ASSERT(metadata->count("form") > 0);
   const std::string& form = (*metadata)["form"];
-  uint32_t form_hash = Hash(form.c_str());
+  form_hash_ = Hash(form.c_str());
   const FormToVisitorFnMap* fv = CPUFeaturesAuditor::GetFormToVisitorFnMap();
-  FormToVisitorFnMap::const_iterator it = fv->find(form_hash);
+  FormToVisitorFnMap::const_iterator it = fv->find(form_hash_);
   if (it == fv->end()) {
     RecordInstructionFeaturesScope scope(this);
     std::map<uint32_t, const CPUFeatures> features = {
@@ -1824,10 +1863,30 @@ void CPUFeaturesAuditor::Visit(Metadata* metadata, const Instruction* instr) {
         {"umax_64u_minmax_imm"_h, CPUFeatures::kCSSC},
         {"umin_32u_minmax_imm"_h, CPUFeatures::kCSSC},
         {"umin_64u_minmax_imm"_h, CPUFeatures::kCSSC},
+        {"bcax_vvv16_crypto4"_h,
+         CPUFeatures(CPUFeatures::kNEON, CPUFeatures::kSHA3)},
+        {"eor3_vvv16_crypto4"_h,
+         CPUFeatures(CPUFeatures::kNEON, CPUFeatures::kSHA3)},
+        {"rax1_vvv2_cryptosha512_3"_h,
+         CPUFeatures(CPUFeatures::kNEON, CPUFeatures::kSHA3)},
+        {"xar_vvv2_crypto3_imm6"_h,
+         CPUFeatures(CPUFeatures::kNEON, CPUFeatures::kSHA3)},
+        {"sha512h_qqv_cryptosha512_3"_h,
+         CPUFeatures(CPUFeatures::kNEON, CPUFeatures::kSHA512)},
+        {"sha512h2_qqv_cryptosha512_3"_h,
+         CPUFeatures(CPUFeatures::kNEON, CPUFeatures::kSHA512)},
+        {"sha512su0_vv2_cryptosha512_2"_h,
+         CPUFeatures(CPUFeatures::kNEON, CPUFeatures::kSHA512)},
+        {"sha512su1_vvv2_cryptosha512_3"_h,
+         CPUFeatures(CPUFeatures::kNEON, CPUFeatures::kSHA512)},
+        {"pmullb_z_zz_q"_h,
+         CPUFeatures(CPUFeatures::kSVE2, CPUFeatures::kSVEPmull128)},
+        {"pmullt_z_zz_q"_h,
+         CPUFeatures(CPUFeatures::kSVE2, CPUFeatures::kSVEPmull128)},
     };
 
-    if (features.count(form_hash) > 0) {
-      scope.Record(features[form_hash]);
+    if (features.count(form_hash_) > 0) {
+      scope.Record(features[form_hash_]);
     }
   } else {
     (it->second)(this, instr);
diff --git a/src/aarch64/cpu-features-auditor-aarch64.h b/src/aarch64/cpu-features-auditor-aarch64.h
index aa7bd852..d533d06d 100644
--- a/src/aarch64/cpu-features-auditor-aarch64.h
+++ b/src/aarch64/cpu-features-auditor-aarch64.h
@@ -31,7 +31,7 @@
 #include <iostream>
 #include <unordered_map>
 
-#include "cpu-features.h"
+#include "../cpu-features.h"
 
 #include "decoder-aarch64.h"
 #include "decoder-visitor-map-aarch64.h"
@@ -113,6 +113,8 @@ class CPUFeaturesAuditor : public DecoderVisitor {
 #define DECLARE(A) virtual void Visit##A(const Instruction* instr);
   VISITOR_LIST(DECLARE)
 #undef DECLARE
+  void VisitCryptoSM3(const Instruction* instr);
+  void VisitCryptoSM4(const Instruction* instr);
 
   void LoadStoreHelper(const Instruction* instr);
   void LoadStorePairHelper(const Instruction* instr);
@@ -126,6 +128,7 @@ class CPUFeaturesAuditor : public DecoderVisitor {
   using FormToVisitorFnMap = FormToVisitorFnMapT<CPUFeaturesAuditor>;
 
   static const FormToVisitorFnMap* GetFormToVisitorFnMap();
+  uint32_t form_hash_;
 };
 
 }  // namespace aarch64
diff --git a/src/aarch64/debugger-aarch64.cc b/src/aarch64/debugger-aarch64.cc
index 3c6d2669..1abe7d14 100644
--- a/src/aarch64/debugger-aarch64.cc
+++ b/src/aarch64/debugger-aarch64.cc
@@ -33,7 +33,6 @@
 #include <cstring>
 #include <errno.h>
 #include <limits>
-#include <unistd.h>
 
 namespace vixl {
 namespace aarch64 {
@@ -205,7 +204,7 @@ std::optional<Debugger::RegisterParsedFormat> Debugger::ParseRegString(
     return std::nullopt;
   }
 
-  return {{reg_prefix, *reg_code}};
+  return {{reg_prefix, static_cast<unsigned int>(*reg_code)}};
 }
 
 
diff --git a/src/aarch64/debugger-aarch64.h b/src/aarch64/debugger-aarch64.h
index 2a96ee52..3eefa803 100644
--- a/src/aarch64/debugger-aarch64.h
+++ b/src/aarch64/debugger-aarch64.h
@@ -31,9 +31,9 @@
 #include <unordered_set>
 #include <vector>
 
+#include "../cpu-features.h"
 #include "../globals-vixl.h"
 #include "../utils-vixl.h"
-#include "cpu-features.h"
 
 #include "abi-aarch64.h"
 #include "cpu-features-auditor-aarch64.h"
diff --git a/src/aarch64/decoder-constants-aarch64.h b/src/aarch64/decoder-constants-aarch64.h
index 70e01a10..af50a552 100644
--- a/src/aarch64/decoder-constants-aarch64.h
+++ b/src/aarch64/decoder-constants-aarch64.h
@@ -3764,7 +3764,7 @@ static const DecodeMapping kDecodeMapping[] = {
       {"001110"_b, "autiaz_hi_hints"},
       {"001111"_b, "autibz_hi_hints"},
       {"0100xx"_b, "bti_hb_hints"},
-      {"010100"_b, "chkfeat_hi_hints"},
+      {"010100"_b, "chkfeat_hf_hints"},
       {"0101x1"_b, "hint_hm_hints"},
       {"01x110"_b, "hint_hm_hints"},
       {"10xxxx"_b, "hint_hm_hints"},
diff --git a/src/aarch64/decoder-visitor-map-aarch64.h b/src/aarch64/decoder-visitor-map-aarch64.h
index 8ae438c1..bda71ce1 100644
--- a/src/aarch64/decoder-visitor-map-aarch64.h
+++ b/src/aarch64/decoder-visitor-map-aarch64.h
@@ -2074,7 +2074,6 @@
       {"scvtf_asimdmiscfp16_r"_h, &VISITORCLASS::VisitNEON2RegMiscFP16},       \
       {"ucvtf_asimdmiscfp16_r"_h, &VISITORCLASS::VisitNEON2RegMiscFP16},       \
       {"addhn_asimddiff_n"_h, &VISITORCLASS::VisitNEON3Different},             \
-      {"pmull_asimddiff_l"_h, &VISITORCLASS::VisitNEON3Different},             \
       {"raddhn_asimddiff_n"_h, &VISITORCLASS::VisitNEON3Different},            \
       {"rsubhn_asimddiff_n"_h, &VISITORCLASS::VisitNEON3Different},            \
       {"sabal_asimddiff_l"_h, &VISITORCLASS::VisitNEON3Different},             \
@@ -2592,6 +2591,7 @@
       {"dmb_bo_barriers"_h, &VISITORCLASS::VisitSystem},                       \
       {"dsb_bo_barriers"_h, &VISITORCLASS::VisitSystem},                       \
       {"hint_hm_hints"_h, &VISITORCLASS::VisitSystem},                         \
+      {"chkfeat_hf_hints"_h, &VISITORCLASS::VisitSystem},                      \
       {"mrs_rs_systemmove"_h, &VISITORCLASS::VisitSystem},                     \
       {"msr_sr_systemmove"_h, &VISITORCLASS::VisitSystem},                     \
       {"psb_hc_hints"_h, &VISITORCLASS::VisitSystem},                          \
@@ -2638,7 +2638,6 @@
        &VISITORCLASS::VisitUnconditionalBranchToRegister},                     \
       {"ret_64r_branch_reg"_h,                                                 \
        &VISITORCLASS::VisitUnconditionalBranchToRegister},                     \
-      {"bcax_vvv16_crypto4"_h, &VISITORCLASS::VisitUnimplemented},             \
       {"bfcvtn_asimdmisc_4s"_h, &VISITORCLASS::VisitUnimplemented},            \
       {"bfdot_asimdelem_e"_h, &VISITORCLASS::VisitUnimplemented},              \
       {"bfdot_asimdsame2_d"_h, &VISITORCLASS::VisitUnimplemented},             \
@@ -2646,7 +2645,6 @@
       {"bfmlal_asimdsame2_f"_h, &VISITORCLASS::VisitUnimplemented},            \
       {"bfmmla_asimdsame2_e"_h, &VISITORCLASS::VisitUnimplemented},            \
       {"dsb_bon_barriers"_h, &VISITORCLASS::VisitUnimplemented},               \
-      {"eor3_vvv16_crypto4"_h, &VISITORCLASS::VisitUnimplemented},             \
       {"ld64b_64l_memop"_h, &VISITORCLASS::VisitUnimplemented},                \
       {"ldgm_64bulk_ldsttags"_h, &VISITORCLASS::VisitUnimplemented},           \
       {"ldtrb_32_ldst_unpriv"_h, &VISITORCLASS::VisitUnimplemented},           \
@@ -2658,20 +2656,15 @@
       {"ldtrsw_64_ldst_unpriv"_h, &VISITORCLASS::VisitUnimplemented},          \
       {"ldtr_32_ldst_unpriv"_h, &VISITORCLASS::VisitUnimplemented},            \
       {"ldtr_64_ldst_unpriv"_h, &VISITORCLASS::VisitUnimplemented},            \
-      {"rax1_vvv2_cryptosha512_3"_h, &VISITORCLASS::VisitUnimplemented},       \
-      {"sha512h2_qqv_cryptosha512_3"_h, &VISITORCLASS::VisitUnimplemented},    \
-      {"sha512h_qqv_cryptosha512_3"_h, &VISITORCLASS::VisitUnimplemented},     \
-      {"sha512su0_vv2_cryptosha512_2"_h, &VISITORCLASS::VisitUnimplemented},   \
-      {"sha512su1_vvv2_cryptosha512_3"_h, &VISITORCLASS::VisitUnimplemented},  \
-      {"sm3partw1_vvv4_cryptosha512_3"_h, &VISITORCLASS::VisitUnimplemented},  \
-      {"sm3partw2_vvv4_cryptosha512_3"_h, &VISITORCLASS::VisitUnimplemented},  \
-      {"sm3ss1_vvv4_crypto4"_h, &VISITORCLASS::VisitUnimplemented},            \
-      {"sm3tt1a_vvv4_crypto3_imm2"_h, &VISITORCLASS::VisitUnimplemented},      \
-      {"sm3tt1b_vvv4_crypto3_imm2"_h, &VISITORCLASS::VisitUnimplemented},      \
-      {"sm3tt2a_vvv4_crypto3_imm2"_h, &VISITORCLASS::VisitUnimplemented},      \
-      {"sm3tt2b_vvv_crypto3_imm2"_h, &VISITORCLASS::VisitUnimplemented},       \
-      {"sm4ekey_vvv4_cryptosha512_3"_h, &VISITORCLASS::VisitUnimplemented},    \
-      {"sm4e_vv4_cryptosha512_2"_h, &VISITORCLASS::VisitUnimplemented},        \
+      {"sm3partw1_vvv4_cryptosha512_3"_h, &VISITORCLASS::VisitCryptoSM3},      \
+      {"sm3partw2_vvv4_cryptosha512_3"_h, &VISITORCLASS::VisitCryptoSM3},      \
+      {"sm3ss1_vvv4_crypto4"_h, &VISITORCLASS::VisitCryptoSM3},                \
+      {"sm3tt1a_vvv4_crypto3_imm2"_h, &VISITORCLASS::VisitCryptoSM3},          \
+      {"sm3tt1b_vvv4_crypto3_imm2"_h, &VISITORCLASS::VisitCryptoSM3},          \
+      {"sm3tt2a_vvv4_crypto3_imm2"_h, &VISITORCLASS::VisitCryptoSM3},          \
+      {"sm3tt2b_vvv_crypto3_imm2"_h, &VISITORCLASS::VisitCryptoSM3},           \
+      {"sm4ekey_vvv4_cryptosha512_3"_h, &VISITORCLASS::VisitCryptoSM4},        \
+      {"sm4e_vv4_cryptosha512_2"_h, &VISITORCLASS::VisitCryptoSM4},            \
       {"st64b_64l_memop"_h, &VISITORCLASS::VisitUnimplemented},                \
       {"st64bv_64_memop"_h, &VISITORCLASS::VisitUnimplemented},                \
       {"st64bv0_64_memop"_h, &VISITORCLASS::VisitUnimplemented},               \
@@ -2686,7 +2679,6 @@
       {"ttest_br_systemresult"_h, &VISITORCLASS::VisitUnimplemented},          \
       {"wfet_only_systeminstrswithreg"_h, &VISITORCLASS::VisitUnimplemented},  \
       {"wfit_only_systeminstrswithreg"_h, &VISITORCLASS::VisitUnimplemented},  \
-      {"xar_vvv2_crypto3_imm6"_h, &VISITORCLASS::VisitUnimplemented},          \
       {"bfcvt_z_p_z_s2bf"_h, &VISITORCLASS::VisitUnimplemented},               \
       {"bfcvtnt_z_p_z_s2bf"_h, &VISITORCLASS::VisitUnimplemented},             \
       {"bfdot_z_zzz"_h, &VISITORCLASS::VisitUnimplemented},                    \
@@ -2827,6 +2819,7 @@
       {"fmlal_asimdsame_f"_h, &VISITORCLASS::VisitNEON3Same},                  \
       {"fmlsl2_asimdsame_f"_h, &VISITORCLASS::VisitNEON3Same},                 \
       {"fmlsl_asimdsame_f"_h, &VISITORCLASS::VisitNEON3Same},                  \
+      {"pmull_asimddiff_l"_h, &VISITORCLASS::VisitNEON3Different},             \
       {"ushll_asimdshf_l"_h, &VISITORCLASS::VisitNEONShiftImmediate},          \
       {"sshll_asimdshf_l"_h, &VISITORCLASS::VisitNEONShiftImmediate},          \
       {"shrn_asimdshf_n"_h, &VISITORCLASS::VisitNEONShiftImmediate},           \
@@ -2856,22 +2849,6 @@
        &VISITORCLASS::VisitNEONScalarByIndexedElement},                        \
       {"sqdmull_asisdelem_l"_h,                                                \
        &VISITORCLASS::VisitNEONScalarByIndexedElement},                        \
-      {"fmla_asisdelem_rh_h"_h,                                                \
-       &VISITORCLASS::VisitNEONScalarByIndexedElement},                        \
-      {"fmla_asisdelem_r_sd"_h,                                                \
-       &VISITORCLASS::VisitNEONScalarByIndexedElement},                        \
-      {"fmls_asisdelem_rh_h"_h,                                                \
-       &VISITORCLASS::VisitNEONScalarByIndexedElement},                        \
-      {"fmls_asisdelem_r_sd"_h,                                                \
-       &VISITORCLASS::VisitNEONScalarByIndexedElement},                        \
-      {"fmulx_asisdelem_rh_h"_h,                                               \
-       &VISITORCLASS::VisitNEONScalarByIndexedElement},                        \
-      {"fmulx_asisdelem_r_sd"_h,                                               \
-       &VISITORCLASS::VisitNEONScalarByIndexedElement},                        \
-      {"fmul_asisdelem_rh_h"_h,                                                \
-       &VISITORCLASS::VisitNEONScalarByIndexedElement},                        \
-      {"fmul_asisdelem_r_sd"_h,                                                \
-       &VISITORCLASS::VisitNEONScalarByIndexedElement},                        \
       {"fabd_asisdsame_only"_h, &VISITORCLASS::VisitNEONScalar3Same},          \
       {"facge_asisdsame_only"_h, &VISITORCLASS::VisitNEONScalar3Same},         \
       {"facgt_asisdsame_only"_h, &VISITORCLASS::VisitNEONScalar3Same},         \
@@ -2944,6 +2921,22 @@
       {"frecpe_asisdmisc_r"_h, &VISITORCLASS::VisitNEONScalar2RegMisc},        \
       {"frecpx_asisdmisc_r"_h, &VISITORCLASS::VisitNEONScalar2RegMisc},        \
       {"frsqrte_asisdmisc_r"_h, &VISITORCLASS::VisitNEONScalar2RegMisc},       \
-      {"scvtf_asisdmisc_r"_h, &VISITORCLASS::VisitNEONScalar2RegMisc}, {       \
-    "ucvtf_asisdmisc_r"_h, &VISITORCLASS::VisitNEONScalar2RegMisc              \
+      {"scvtf_asisdmisc_r"_h, &VISITORCLASS::VisitNEONScalar2RegMisc},         \
+      {"ucvtf_asisdmisc_r"_h, &VISITORCLASS::VisitNEONScalar2RegMisc},         \
+      {"fmla_asisdelem_rh_h"_h,                                                \
+       &VISITORCLASS::VisitNEONScalarByIndexedElement},                        \
+      {"fmla_asisdelem_r_sd"_h,                                                \
+       &VISITORCLASS::VisitNEONScalarByIndexedElement},                        \
+      {"fmls_asisdelem_rh_h"_h,                                                \
+       &VISITORCLASS::VisitNEONScalarByIndexedElement},                        \
+      {"fmls_asisdelem_r_sd"_h,                                                \
+       &VISITORCLASS::VisitNEONScalarByIndexedElement},                        \
+      {"fmulx_asisdelem_rh_h"_h,                                               \
+       &VISITORCLASS::VisitNEONScalarByIndexedElement},                        \
+      {"fmulx_asisdelem_r_sd"_h,                                               \
+       &VISITORCLASS::VisitNEONScalarByIndexedElement},                        \
+      {"fmul_asisdelem_rh_h"_h,                                                \
+       &VISITORCLASS::VisitNEONScalarByIndexedElement},                        \
+  {                                                                            \
+    "fmul_asisdelem_r_sd"_h, &VISITORCLASS::VisitNEONScalarByIndexedElement    \
   }
diff --git a/src/aarch64/disasm-aarch64.cc b/src/aarch64/disasm-aarch64.cc
index 86dd7388..4d78369c 100644
--- a/src/aarch64/disasm-aarch64.cc
+++ b/src/aarch64/disasm-aarch64.cc
@@ -2024,7 +2024,7 @@ void Disassembler::DisassembleNoArgs(const Instruction *instr) {
 
 void Disassembler::VisitSystem(const Instruction *instr) {
   const char *mnemonic = mnemonic_.c_str();
-  const char *form = "(System)";
+  const char *form = "";
   const char *suffix = NULL;
 
   switch (form_hash_) {
@@ -2053,6 +2053,10 @@ void Disassembler::VisitSystem(const Instruction *instr) {
           break;
       }
       break;
+    case "chkfeat_hf_hints"_h:
+      mnemonic = "chkfeat";
+      form = "x16";
+      break;
     case "hint_hm_hints"_h:
       form = "'IH";
       break;
@@ -2073,9 +2077,6 @@ void Disassembler::VisitSystem(const Instruction *instr) {
       break;
     }
     case Hash("sys_cr_systeminstrs"): {
-      mnemonic = "dc";
-      suffix = ", 'Xt";
-
       const std::map<uint32_t, const char *> dcop = {
           {IVAU, "ivau"},
           {CVAC, "cvac"},
@@ -2098,17 +2099,36 @@ void Disassembler::VisitSystem(const Instruction *instr) {
       if (dcop.count(sysop)) {
         if (sysop == IVAU) {
           mnemonic = "ic";
+        } else {
+          mnemonic = "dc";
         }
         form = dcop.at(sysop);
+        suffix = ", 'Xt";
+      } else if (sysop == GCSSS1) {
+        mnemonic = "gcsss1";
+        form = "'Xt";
+      } else if (sysop == GCSPUSHM) {
+        mnemonic = "gcspushm";
+        form = "'Xt";
       } else {
         mnemonic = "sys";
         form = "'G1, 'Kn, 'Km, 'G2";
-        if (instr->GetRt() == 31) {
-          suffix = NULL;
+        if (instr->GetRt() < 31) {
+          suffix = ", 'Xt";
         }
-        break;
       }
+      break;
     }
+    case "sysl_rc_systeminstrs"_h:
+      uint32_t sysop = instr->GetSysOp();
+      if (sysop == GCSPOPM) {
+        mnemonic = "gcspopm";
+        form = (instr->GetRt() == 31) ? "" : "'Xt";
+      } else if (sysop == GCSSS2) {
+        mnemonic = "gcsss2";
+        form = "'Xt";
+      }
+      break;
   }
   Format(instr, mnemonic, form, suffix);
 }
@@ -2154,17 +2174,74 @@ void Disassembler::VisitException(const Instruction *instr) {
 
 
 void Disassembler::VisitCrypto2RegSHA(const Instruction *instr) {
-  VisitUnimplemented(instr);
+  const char *form = "'Vd.4s, 'Vn.4s";
+  if (form_hash_ == "sha1h_ss_cryptosha2"_h) {
+    form = "'Sd, 'Sn";
+  }
+  FormatWithDecodedMnemonic(instr, form);
 }
 
 
 void Disassembler::VisitCrypto3RegSHA(const Instruction *instr) {
-  VisitUnimplemented(instr);
+  const char *form = "'Qd, 'Sn, 'Vm.4s";
+  switch (form_hash_) {
+    case "sha1su0_vvv_cryptosha3"_h:
+    case "sha256su1_vvv_cryptosha3"_h:
+      form = "'Vd.4s, 'Vn.4s, 'Vm.4s";
+      break;
+    case "sha256h_qqv_cryptosha3"_h:
+    case "sha256h2_qqv_cryptosha3"_h:
+      form = "'Qd, 'Qn, 'Vm.4s";
+      break;
+  }
+  FormatWithDecodedMnemonic(instr, form);
 }
 
 
 void Disassembler::VisitCryptoAES(const Instruction *instr) {
-  VisitUnimplemented(instr);
+  FormatWithDecodedMnemonic(instr, "'Vd.16b, 'Vn.16b");
+}
+
+void Disassembler::VisitCryptoSM3(const Instruction *instr) {
+  const char *form = "'Vd.4s, 'Vn.4s, 'Vm.";
+  const char *suffix = "4s";
+
+  switch (form_hash_) {
+    case "sm3ss1_vvv4_crypto4"_h:
+      suffix = "4s, 'Va.4s";
+      break;
+    case "sm3tt1a_vvv4_crypto3_imm2"_h:
+    case "sm3tt1b_vvv4_crypto3_imm2"_h:
+    case "sm3tt2a_vvv4_crypto3_imm2"_h:
+    case "sm3tt2b_vvv_crypto3_imm2"_h:
+      suffix = "s['u1312]";
+      break;
+  }
+
+  FormatWithDecodedMnemonic(instr, form, suffix);
+}
+
+void Disassembler::VisitCryptoSM4(const Instruction *instr) {
+  VIXL_ASSERT((form_hash_ == "sm4ekey_vvv4_cryptosha512_3"_h) ||
+              (form_hash_ == "sm4e_vv4_cryptosha512_2"_h));
+  const char *form = "'Vd.4s, 'Vn.4s";
+  const char *suffix =
+      (form_hash_ == "sm4e_vv4_cryptosha512_2"_h) ? NULL : ", 'Vm.4s";
+
+  FormatWithDecodedMnemonic(instr, form, suffix);
+}
+
+void Disassembler::DisassembleSHA512(const Instruction *instr) {
+  const char *form = "'Qd, 'Qn, 'Vm.2d";
+  const char *suffix = NULL;
+  switch (form_hash_) {
+    case "sha512su1_vvv2_cryptosha512_3"_h:
+      suffix = ", 'Vm.2d";
+      VIXL_FALLTHROUGH();
+    case "sha512su0_vv2_cryptosha512_2"_h:
+      form = "'Vd.2d, 'Vn.2d";
+  }
+  FormatWithDecodedMnemonic(instr, form, suffix);
 }
 
 void Disassembler::DisassembleNEON2RegAddlp(const Instruction *instr) {
@@ -2380,13 +2457,19 @@ void Disassembler::VisitNEON3SameFP16(const Instruction *instr) {
 }
 
 void Disassembler::VisitNEON3SameExtra(const Instruction *instr) {
-  static const NEONFormatMap map_usdot = {{30}, {NF_8B, NF_16B}};
+  static const NEONFormatMap map_dot =
+      {{23, 22, 30}, {NF_UNDEF, NF_UNDEF, NF_UNDEF, NF_UNDEF, NF_2S, NF_4S}};
+  static const NEONFormatMap map_fc =
+      {{23, 22, 30},
+       {NF_UNDEF, NF_UNDEF, NF_4H, NF_8H, NF_2S, NF_4S, NF_UNDEF, NF_2D}};
+  static const NEONFormatMap map_rdm =
+      {{23, 22, 30}, {NF_UNDEF, NF_UNDEF, NF_4H, NF_8H, NF_2S, NF_4S}};
 
   const char *mnemonic = mnemonic_.c_str();
   const char *form = "'Vd.%s, 'Vn.%s, 'Vm.%s";
   const char *suffix = NULL;
 
-  NEONFormatDecoder nfd(instr);
+  NEONFormatDecoder nfd(instr, &map_fc);
 
   switch (form_hash_) {
     case "fcmla_asimdsame2_c"_h:
@@ -2399,17 +2482,28 @@ void Disassembler::VisitNEON3SameExtra(const Instruction *instr) {
     case "sdot_asimdsame2_d"_h:
     case "udot_asimdsame2_d"_h:
     case "usdot_asimdsame2_d"_h:
-      nfd.SetFormatMap(1, &map_usdot);
-      nfd.SetFormatMap(2, &map_usdot);
+      nfd.SetFormatMaps(nfd.LogicalFormatMap());
+      nfd.SetFormatMap(0, &map_dot);
       break;
     default:
-      // sqrdml[as]h - nothing to do.
+      nfd.SetFormatMaps(&map_rdm);
       break;
   }
 
   Format(instr, mnemonic, nfd.Substitute(form), suffix);
 }
 
+void Disassembler::DisassembleNEON4Same(const Instruction *instr) {
+  FormatWithDecodedMnemonic(instr, "'Vd.16b, 'Vn.16b, 'Vm.16b, 'Va.16b");
+}
+
+void Disassembler::DisassembleNEONXar(const Instruction *instr) {
+  FormatWithDecodedMnemonic(instr, "'Vd.2d, 'Vn.2d, 'Vm.2d, #'u1510");
+}
+
+void Disassembler::DisassembleNEONRax1(const Instruction *instr) {
+  FormatWithDecodedMnemonic(instr, "'Vd.2d, 'Vn.2d, 'Vm.2d");
+}
 
 void Disassembler::VisitNEON3Different(const Instruction *instr) {
   const char *mnemonic = mnemonic_.c_str();
@@ -2432,11 +2526,6 @@ void Disassembler::VisitNEON3Different(const Instruction *instr) {
       nfd.SetFormatMaps(nfd.LongIntegerFormatMap());
       nfd.SetFormatMap(0, nfd.IntegerFormatMap());
       break;
-    case "pmull_asimddiff_l"_h:
-      if (nfd.GetVectorFormat(0) != kFormat8H) {
-        mnemonic = NULL;
-      }
-      break;
     case "sqdmlal_asimddiff_l"_h:
     case "sqdmlsl_asimddiff_l"_h:
     case "sqdmull_asimddiff_l"_h:
@@ -2448,6 +2537,22 @@ void Disassembler::VisitNEON3Different(const Instruction *instr) {
   Format(instr, nfd.Mnemonic(mnemonic), nfd.Substitute(form));
 }
 
+void Disassembler::DisassembleNEONPolynomialMul(const Instruction *instr) {
+  const char *mnemonic = instr->ExtractBit(30) ? "pmull2" : "pmull";
+  const char *form = NULL;
+  int size = instr->ExtractBits(23, 22);
+  if (size == 0) {
+    // Bits 30:27 of the instruction are x001, where x is the Q bit. Map
+    // this to "8" and "16" by adding 7.
+    form = "'Vd.8h, 'Vn.'u3127+7b, 'Vm.'u3127+7b";
+  } else if (size == 3) {
+    form = "'Vd.1q, 'Vn.'?30:21d, 'Vm.'?30:21d";
+  } else {
+    mnemonic = NULL;
+  }
+  Format(instr, mnemonic, form);
+}
+
 void Disassembler::DisassembleNEONFPAcrossLanes(const Instruction *instr) {
   const char *mnemonic = mnemonic_.c_str();
   const char *form = "'Sd, 'Vn.4s";
@@ -2624,10 +2729,10 @@ void Disassembler::VisitNEONExtract(const Instruction *instr) {
 void Disassembler::VisitNEONLoadStoreMultiStruct(const Instruction *instr) {
   const char *mnemonic = NULL;
   const char *form = NULL;
-  const char *form_1v = "{'Vt.%1$s}, ['Xns]";
-  const char *form_2v = "{'Vt.%1$s, 'Vt2.%1$s}, ['Xns]";
-  const char *form_3v = "{'Vt.%1$s, 'Vt2.%1$s, 'Vt3.%1$s}, ['Xns]";
-  const char *form_4v = "{'Vt.%1$s, 'Vt2.%1$s, 'Vt3.%1$s, 'Vt4.%1$s}, ['Xns]";
+  const char *form_1v = "{'Vt.%s}, ['Xns]";
+  const char *form_2v = "{'Vt.%s, 'Vt2.%s}, ['Xns]";
+  const char *form_3v = "{'Vt.%s, 'Vt2.%s, 'Vt3.%s}, ['Xns]";
+  const char *form_4v = "{'Vt.%s, 'Vt2.%s, 'Vt3.%s, 'Vt4.%s}, ['Xns]";
   NEONFormatDecoder nfd(instr, NEONFormatDecoder::LoadStoreFormatMap());
 
   switch (instr->Mask(NEONLoadStoreMultiStructMask)) {
@@ -2722,11 +2827,10 @@ void Disassembler::VisitNEONLoadStoreMultiStructPostIndex(
     const Instruction *instr) {
   const char *mnemonic = NULL;
   const char *form = NULL;
-  const char *form_1v = "{'Vt.%1$s}, ['Xns], 'Xmr1";
-  const char *form_2v = "{'Vt.%1$s, 'Vt2.%1$s}, ['Xns], 'Xmr2";
-  const char *form_3v = "{'Vt.%1$s, 'Vt2.%1$s, 'Vt3.%1$s}, ['Xns], 'Xmr3";
-  const char *form_4v =
-      "{'Vt.%1$s, 'Vt2.%1$s, 'Vt3.%1$s, 'Vt4.%1$s}, ['Xns], 'Xmr4";
+  const char *form_1v = "{'Vt.%s}, ['Xns], 'Xmr1";
+  const char *form_2v = "{'Vt.%s, 'Vt2.%s}, ['Xns], 'Xmr2";
+  const char *form_3v = "{'Vt.%s, 'Vt2.%s, 'Vt3.%s}, ['Xns], 'Xmr3";
+  const char *form_4v = "{'Vt.%s, 'Vt2.%s, 'Vt3.%s, 'Vt4.%s}, ['Xns], 'Xmr4";
   NEONFormatDecoder nfd(instr, NEONFormatDecoder::LoadStoreFormatMap());
 
   switch (instr->Mask(NEONLoadStoreMultiStructPostIndexMask)) {
@@ -2929,7 +3033,7 @@ void Disassembler::VisitNEONLoadStoreSingleStruct(const Instruction *instr) {
       break;
     case NEON_LD4R:
       mnemonic = "ld4r";
-      form = "{'Vt.%1$s, 'Vt2.%1$s, 'Vt3.%1$s, 'Vt4.%1$s}, ['Xns]";
+      form = "{'Vt.%s, 'Vt2.%s, 'Vt3.%s, 'Vt4.%s}, ['Xns]";
       break;
     default:
       break;
@@ -3089,7 +3193,7 @@ void Disassembler::VisitNEONLoadStoreSingleStructPostIndex(
       break;
     case NEON_LD4R_post:
       mnemonic = "ld4r";
-      form = "{'Vt.%1$s, 'Vt2.%1$s, 'Vt3.%1$s, 'Vt4.%1$s}, ['Xns], 'Xmz4";
+      form = "{'Vt.%s, 'Vt2.%s, 'Vt3.%s, 'Vt4.%s}, ['Xns], 'Xmz4";
       break;
     default:
       break;
@@ -3305,6 +3409,8 @@ void Disassembler::VisitNEONScalar3Same(const Instruction *instr) {
       break;
     case "sqdmulh_asisdsame_only"_h:
     case "sqrdmulh_asisdsame_only"_h:
+    case "sqrdmlah_asisdsame2_only"_h:
+    case "sqrdmlsh_asisdsame2_only"_h:
       if ((vform == kFormatB) || (vform == kFormatD)) {
         mnemonic = NULL;
       }
@@ -3923,8 +4029,7 @@ static bool SVEMoveMaskPreferred(uint64_t value, int lane_bytes_log2) {
     }
 
     // Check 0x0000pq00_0000pq00 or 0xffffpq00_ffffpq00.
-    uint64_t rotvalue = RotateRight(value, 32, 64);
-    if (value == rotvalue) {
+    if (AllWordsMatch(value)) {
       generic_value &= 0xffffffff;
       if ((generic_value == 0xffff) || (generic_value == UINT32_MAX)) {
         return false;
@@ -3932,8 +4037,7 @@ static bool SVEMoveMaskPreferred(uint64_t value, int lane_bytes_log2) {
     }
 
     // Check 0xpq00pq00_pq00pq00.
-    rotvalue = RotateRight(value, 16, 64);
-    if (value == rotvalue) {
+    if (AllHalfwordsMatch(value)) {
       return false;
     }
   } else {
@@ -3947,8 +4051,7 @@ static bool SVEMoveMaskPreferred(uint64_t value, int lane_bytes_log2) {
     }
 
     // Check 0x000000pq_000000pq or 0xffffffpq_ffffffpq.
-    uint64_t rotvalue = RotateRight(value, 32, 64);
-    if (value == rotvalue) {
+    if (AllWordsMatch(value)) {
       generic_value &= 0xffffffff;
       if ((generic_value == 0xff) || (generic_value == UINT32_MAX)) {
         return false;
@@ -3956,8 +4059,7 @@ static bool SVEMoveMaskPreferred(uint64_t value, int lane_bytes_log2) {
     }
 
     // Check 0x00pq00pq_00pq00pq or 0xffpqffpq_ffpqffpq.
-    rotvalue = RotateRight(value, 16, 64);
-    if (value == rotvalue) {
+    if (AllHalfwordsMatch(value)) {
       generic_value &= 0xffff;
       if ((generic_value == 0xff) || (generic_value == UINT16_MAX)) {
         return false;
@@ -3965,8 +4067,7 @@ static bool SVEMoveMaskPreferred(uint64_t value, int lane_bytes_log2) {
     }
 
     // Check 0xpqpqpqpq_pqpqpqpq.
-    rotvalue = RotateRight(value, 8, 64);
-    if (value == rotvalue) {
+    if (AllBytesMatch(value)) {
       return false;
     }
   }
@@ -5748,15 +5849,26 @@ void Disassembler::Disassemble_ZdT_ZnTb(const Instruction *instr) {
   }
 }
 
+void Disassembler::DisassembleSVEPmull(const Instruction *instr) {
+  if (instr->GetSVEVectorFormat() == kFormatVnS) {
+    VisitUnallocated(instr);
+  } else {
+    Disassemble_ZdT_ZnTb_ZmTb(instr);
+  }
+}
+
+void Disassembler::DisassembleSVEPmull128(const Instruction *instr) {
+  FormatWithDecodedMnemonic(instr, "'Zd.q, 'Zn.d, 'Zm.d");
+}
+
 void Disassembler::Disassemble_ZdT_ZnTb_ZmTb(const Instruction *instr) {
-  const char *form = "'Zd.'t, 'Zn.'th, 'Zm.'th";
   if (instr->GetSVEVectorFormat() == kFormatVnB) {
     // TODO: This is correct for saddlbt, ssublbt, subltb, which don't have
-    // b-lane sized form, and for pmull[b|t] as feature `SVEPmull128` isn't
-    // supported, but may need changes for other instructions reaching here.
+    // b-lane sized form, but may need changes for other instructions reaching
+    // here.
     Format(instr, "unimplemented", "(ZdT_ZnTb_ZmTb)");
   } else {
-    Format(instr, mnemonic_.c_str(), form);
+    FormatWithDecodedMnemonic(instr, "'Zd.'t, 'Zn.'th, 'Zm.'th");
   }
 }
 
@@ -6908,6 +7020,9 @@ int Disassembler::SubstituteImmediateField(const Instruction *instr,
         case RNDRRS:
           AppendToOutput("rndrrs");
           break;
+        case DCZID_EL0:
+          AppendToOutput("dczid_el0");
+          break;
         default:
           AppendToOutput("S%d_%d_c%d_c%d_%d",
                          instr->GetSysOp0(),
diff --git a/src/aarch64/disasm-aarch64.h b/src/aarch64/disasm-aarch64.h
index 55c5047a..57c2a1ab 100644
--- a/src/aarch64/disasm-aarch64.h
+++ b/src/aarch64/disasm-aarch64.h
@@ -206,6 +206,8 @@ class Disassembler : public DecoderVisitor {
   void DisassembleSVEBitwiseTernary(const Instruction* instr);
   void DisassembleSVEFlogb(const Instruction* instr);
   void DisassembleSVEFPPair(const Instruction* instr);
+  void DisassembleSVEPmull(const Instruction* instr);
+  void DisassembleSVEPmull128(const Instruction* instr);
 
   void DisassembleNoArgs(const Instruction* instr);
 
@@ -238,6 +240,11 @@ class Disassembler : public DecoderVisitor {
   void DisassembleNEONScalarShiftRightNarrowImm(const Instruction* instr);
   void DisassembleNEONScalar2RegMiscOnlyD(const Instruction* instr);
   void DisassembleNEONFPScalar2RegMisc(const Instruction* instr);
+  void DisassembleNEONPolynomialMul(const Instruction* instr);
+  void DisassembleNEON4Same(const Instruction* instr);
+  void DisassembleNEONXar(const Instruction* instr);
+  void DisassembleNEONRax1(const Instruction* instr);
+  void DisassembleSHA512(const Instruction* instr);
 
   void DisassembleMTELoadTag(const Instruction* instr);
   void DisassembleMTEStoreTag(const Instruction* instr);
@@ -248,6 +255,9 @@ class Disassembler : public DecoderVisitor {
   void Disassemble_Xd_XnSP_Xm(const Instruction* instr);
   void Disassemble_Xd_XnSP_XmSP(const Instruction* instr);
 
+  void VisitCryptoSM3(const Instruction* instr);
+  void VisitCryptoSM4(const Instruction* instr);
+
   void Format(const Instruction* instr,
               const char* mnemonic,
               const char* format0,
diff --git a/src/aarch64/instructions-aarch64.cc b/src/aarch64/instructions-aarch64.cc
index 298a7be8..adef87f4 100644
--- a/src/aarch64/instructions-aarch64.cc
+++ b/src/aarch64/instructions-aarch64.cc
@@ -603,6 +603,28 @@ std::pair<int, int> Instruction::GetSVEMulLongZmAndIndex() const {
   return std::make_pair(reg_code, index);
 }
 
+// Get the register and index for NEON indexed multiplies.
+std::pair<int, int> Instruction::GetNEONMulRmAndIndex() const {
+  int reg_code = GetRm();
+  int index = (GetNEONH() << 2) | (GetNEONL() << 1) | GetNEONM();
+  switch (GetNEONSize()) {
+    case 0:  // FP H-sized elements.
+    case 1:  // Integer H-sized elements.
+      // 4-bit Rm, 3-bit index.
+      reg_code &= 0xf;
+      break;
+    case 2:  // S-sized elements.
+      // 5-bit Rm, 2-bit index.
+      index >>= 1;
+      break;
+    case 3:  // FP D-sized elements.
+      // 5-bit Rm, 1-bit index.
+      index >>= 2;
+      break;
+  }
+  return std::make_pair(reg_code, index);
+}
+
 // Logical immediates can't encode zero, so a return value of zero is used to
 // indicate a failure case. Specifically, where the constraints on imm_s are
 // not met.
@@ -1011,6 +1033,8 @@ VectorFormat VectorFormatHalfWidth(VectorFormat vform) {
       return kFormat4H;
     case kFormat2D:
       return kFormat2S;
+    case kFormat1Q:
+      return kFormat1D;
     case kFormatH:
       return kFormatB;
     case kFormatS:
@@ -1023,6 +1047,8 @@ VectorFormat VectorFormatHalfWidth(VectorFormat vform) {
       return kFormatVnH;
     case kFormatVnD:
       return kFormatVnS;
+    case kFormatVnQ:
+      return kFormatVnD;
     default:
       VIXL_UNREACHABLE();
       return kFormatUndefined;
@@ -1095,6 +1121,8 @@ VectorFormat VectorFormatHalfWidthDoubleLanes(VectorFormat vform) {
       return kFormat2S;
     case kFormat2D:
       return kFormat4S;
+    case kFormat1Q:
+      return kFormat2D;
     case kFormatVnH:
       return kFormatVnB;
     case kFormatVnS:
@@ -1246,6 +1274,7 @@ unsigned RegisterSizeInBitsFromFormat(VectorFormat vform) {
     case kFormat8H:
     case kFormat4S:
     case kFormat2D:
+    case kFormat1Q:
       return kQRegSize;
     default:
       VIXL_UNREACHABLE();
@@ -1283,6 +1312,7 @@ unsigned LaneSizeInBitsFromFormat(VectorFormat vform) {
     case kFormat2D:
     case kFormatVnD:
       return 64;
+    case kFormat1Q:
     case kFormatVnQ:
       return 128;
     case kFormatVnO:
@@ -1348,6 +1378,7 @@ int LaneCountFromFormat(VectorFormat vform) {
     case kFormat2D:
       return 2;
     case kFormat1D:
+    case kFormat1Q:
     case kFormatB:
     case kFormatH:
     case kFormatS:
diff --git a/src/aarch64/instructions-aarch64.h b/src/aarch64/instructions-aarch64.h
index 38a0d67c..00aeb3cf 100644
--- a/src/aarch64/instructions-aarch64.h
+++ b/src/aarch64/instructions-aarch64.h
@@ -217,9 +217,10 @@ enum VectorFormat {
   kFormatVnQ = kFormatSVEQ | kFormatSVE,
   kFormatVnO = kFormatSVEO | kFormatSVE,
 
-  // An artificial value, used by simulator trace tests and a few oddball
+  // Artificial values, used by simulator trace tests and a few oddball
   // instructions (such as FMLAL).
-  kFormat2H = 0xfffffffe
+  kFormat2H = 0xfffffffe,
+  kFormat1Q = 0xfffffffd
 };
 
 // Instructions. ---------------------------------------------------------------
@@ -372,6 +373,7 @@ class Instruction {
 
   std::pair<int, int> GetSVEPermuteIndexAndLaneSizeLog2() const;
 
+  std::pair<int, int> GetNEONMulRmAndIndex() const;
   std::pair<int, int> GetSVEMulZmAndIndex() const;
   std::pair<int, int> GetSVEMulLongZmAndIndex() const;
 
@@ -854,11 +856,13 @@ class NEONFormatDecoder {
   // Set the format mapping for all or individual substitutions.
   void SetFormatMaps(const NEONFormatMap* format0,
                      const NEONFormatMap* format1 = NULL,
-                     const NEONFormatMap* format2 = NULL) {
+                     const NEONFormatMap* format2 = NULL,
+                     const NEONFormatMap* format3 = NULL) {
     VIXL_ASSERT(format0 != NULL);
     formats_[0] = format0;
     formats_[1] = (format1 == NULL) ? formats_[0] : format1;
     formats_[2] = (format2 == NULL) ? formats_[1] : format2;
+    formats_[3] = (format3 == NULL) ? formats_[2] : format3;
   }
   void SetFormatMap(unsigned index, const NEONFormatMap* format) {
     VIXL_ASSERT(index <= ArrayLength(formats_));
@@ -877,12 +881,15 @@ class NEONFormatDecoder {
   const char* Substitute(const char* string,
                          SubstitutionMode mode0 = kFormat,
                          SubstitutionMode mode1 = kFormat,
-                         SubstitutionMode mode2 = kFormat) {
+                         SubstitutionMode mode2 = kFormat,
+                         SubstitutionMode mode3 = kFormat) {
     const char* subst0 = GetSubstitute(0, mode0);
     const char* subst1 = GetSubstitute(1, mode1);
     const char* subst2 = GetSubstitute(2, mode2);
+    const char* subst3 = GetSubstitute(3, mode3);
 
-    if ((subst0 == NULL) || (subst1 == NULL) || (subst2 == NULL)) {
+    if ((subst0 == NULL) || (subst1 == NULL) || (subst2 == NULL) ||
+        (subst3 == NULL)) {
       return NULL;
     }
 
@@ -891,7 +898,8 @@ class NEONFormatDecoder {
              string,
              subst0,
              subst1,
-             subst2);
+             subst2,
+             subst3);
     return form_buffer_;
   }
 
@@ -1129,7 +1137,7 @@ class NEONFormatDecoder {
   }
 
   Instr instrbits_;
-  const NEONFormatMap* formats_[3];
+  const NEONFormatMap* formats_[4];
   char form_buffer_[64];
   char mne_buffer_[16];
 };
diff --git a/src/aarch64/logic-aarch64.cc b/src/aarch64/logic-aarch64.cc
index b41db923..ef5b07af 100644
--- a/src/aarch64/logic-aarch64.cc
+++ b/src/aarch64/logic-aarch64.cc
@@ -36,33 +36,33 @@ namespace aarch64 {
 using vixl::internal::SimFloat16;
 
 template <typename T>
-bool IsFloat64() {
+constexpr bool IsFloat64() {
   return false;
 }
 template <>
-bool IsFloat64<double>() {
+constexpr bool IsFloat64<double>() {
   return true;
 }
 
 template <typename T>
-bool IsFloat32() {
+constexpr bool IsFloat32() {
   return false;
 }
 template <>
-bool IsFloat32<float>() {
+constexpr bool IsFloat32<float>() {
   return true;
 }
 
 template <typename T>
-bool IsFloat16() {
+constexpr bool IsFloat16() {
   return false;
 }
 template <>
-bool IsFloat16<Float16>() {
+constexpr bool IsFloat16<Float16>() {
   return true;
 }
 template <>
-bool IsFloat16<SimFloat16>() {
+constexpr bool IsFloat16<SimFloat16>() {
   return true;
 }
 
@@ -168,11 +168,12 @@ SimFloat16 Simulator::UFixedToFloat16(uint64_t src,
 
 
 uint64_t Simulator::GenerateRandomTag(uint16_t exclude) {
-  uint64_t rtag = nrand48(rand_state_) >> 28;
+  // Generate a 4-bit integer from a 48-bit random number.
+  uint64_t rtag = rand_gen_() >> 44;
   VIXL_ASSERT(IsUint4(rtag));
 
   if (exclude == 0) {
-    exclude = nrand48(rand_state_) >> 27;
+    exclude = static_cast<uint16_t>(rand_gen_() >> 44);
   }
 
   // TODO: implement this to better match the specification, which calls for a
@@ -182,24 +183,28 @@ uint64_t Simulator::GenerateRandomTag(uint16_t exclude) {
 }
 
 
-void Simulator::ld1(VectorFormat vform, LogicVRegister dst, uint64_t addr) {
+bool Simulator::ld1(VectorFormat vform, LogicVRegister dst, uint64_t addr) {
   dst.ClearForWrite(vform);
   for (int i = 0; i < LaneCountFromFormat(vform); i++) {
-    LoadLane(dst, vform, i, addr);
+    if (!LoadLane(dst, vform, i, addr)) {
+      return false;
+    }
     addr += LaneSizeInBytesFromFormat(vform);
   }
+  return true;
 }
 
 
-void Simulator::ld1(VectorFormat vform,
+bool Simulator::ld1(VectorFormat vform,
                     LogicVRegister dst,
                     int index,
                     uint64_t addr) {
-  LoadLane(dst, vform, index, addr);
+  dst.ClearForWrite(vform);
+  return LoadLane(dst, vform, index, addr);
 }
 
 
-void Simulator::ld1r(VectorFormat vform,
+bool Simulator::ld1r(VectorFormat vform,
                      VectorFormat unpack_vform,
                      LogicVRegister dst,
                      uint64_t addr,
@@ -208,20 +213,25 @@ void Simulator::ld1r(VectorFormat vform,
   dst.ClearForWrite(vform);
   for (int i = 0; i < LaneCountFromFormat(vform); i++) {
     if (is_signed) {
-      LoadIntToLane(dst, vform, unpack_size, i, addr);
+      if (!LoadIntToLane(dst, vform, unpack_size, i, addr)) {
+        return false;
+      }
     } else {
-      LoadUintToLane(dst, vform, unpack_size, i, addr);
+      if (!LoadUintToLane(dst, vform, unpack_size, i, addr)) {
+        return false;
+      }
     }
   }
+  return true;
 }
 
 
-void Simulator::ld1r(VectorFormat vform, LogicVRegister dst, uint64_t addr) {
-  ld1r(vform, vform, dst, addr);
+bool Simulator::ld1r(VectorFormat vform, LogicVRegister dst, uint64_t addr) {
+  return ld1r(vform, vform, dst, addr);
 }
 
 
-void Simulator::ld2(VectorFormat vform,
+bool Simulator::ld2(VectorFormat vform,
                     LogicVRegister dst1,
                     LogicVRegister dst2,
                     uint64_t addr1) {
@@ -230,15 +240,17 @@ void Simulator::ld2(VectorFormat vform,
   int esize = LaneSizeInBytesFromFormat(vform);
   uint64_t addr2 = addr1 + esize;
   for (int i = 0; i < LaneCountFromFormat(vform); i++) {
-    LoadLane(dst1, vform, i, addr1);
-    LoadLane(dst2, vform, i, addr2);
+    if (!LoadLane(dst1, vform, i, addr1) || !LoadLane(dst2, vform, i, addr2)) {
+      return false;
+    }
     addr1 += 2 * esize;
     addr2 += 2 * esize;
   }
+  return true;
 }
 
 
-void Simulator::ld2(VectorFormat vform,
+bool Simulator::ld2(VectorFormat vform,
                     LogicVRegister dst1,
                     LogicVRegister dst2,
                     int index,
@@ -246,12 +258,12 @@ void Simulator::ld2(VectorFormat vform,
   dst1.ClearForWrite(vform);
   dst2.ClearForWrite(vform);
   uint64_t addr2 = addr1 + LaneSizeInBytesFromFormat(vform);
-  LoadLane(dst1, vform, index, addr1);
-  LoadLane(dst2, vform, index, addr2);
+  return (LoadLane(dst1, vform, index, addr1) &&
+          LoadLane(dst2, vform, index, addr2));
 }
 
 
-void Simulator::ld2r(VectorFormat vform,
+bool Simulator::ld2r(VectorFormat vform,
                      LogicVRegister dst1,
                      LogicVRegister dst2,
                      uint64_t addr) {
@@ -259,13 +271,15 @@ void Simulator::ld2r(VectorFormat vform,
   dst2.ClearForWrite(vform);
   uint64_t addr2 = addr + LaneSizeInBytesFromFormat(vform);
   for (int i = 0; i < LaneCountFromFormat(vform); i++) {
-    LoadLane(dst1, vform, i, addr);
-    LoadLane(dst2, vform, i, addr2);
+    if (!LoadLane(dst1, vform, i, addr) || !LoadLane(dst2, vform, i, addr2)) {
+      return false;
+    }
   }
+  return true;
 }
 
 
-void Simulator::ld3(VectorFormat vform,
+bool Simulator::ld3(VectorFormat vform,
                     LogicVRegister dst1,
                     LogicVRegister dst2,
                     LogicVRegister dst3,
@@ -277,17 +291,19 @@ void Simulator::ld3(VectorFormat vform,
   uint64_t addr2 = addr1 + esize;
   uint64_t addr3 = addr2 + esize;
   for (int i = 0; i < LaneCountFromFormat(vform); i++) {
-    LoadLane(dst1, vform, i, addr1);
-    LoadLane(dst2, vform, i, addr2);
-    LoadLane(dst3, vform, i, addr3);
+    if (!LoadLane(dst1, vform, i, addr1) || !LoadLane(dst2, vform, i, addr2) ||
+        !LoadLane(dst3, vform, i, addr3)) {
+      return false;
+    }
     addr1 += 3 * esize;
     addr2 += 3 * esize;
     addr3 += 3 * esize;
   }
+  return true;
 }
 
 
-void Simulator::ld3(VectorFormat vform,
+bool Simulator::ld3(VectorFormat vform,
                     LogicVRegister dst1,
                     LogicVRegister dst2,
                     LogicVRegister dst3,
@@ -298,13 +314,13 @@ void Simulator::ld3(VectorFormat vform,
   dst3.ClearForWrite(vform);
   uint64_t addr2 = addr1 + LaneSizeInBytesFromFormat(vform);
   uint64_t addr3 = addr2 + LaneSizeInBytesFromFormat(vform);
-  LoadLane(dst1, vform, index, addr1);
-  LoadLane(dst2, vform, index, addr2);
-  LoadLane(dst3, vform, index, addr3);
+  return (LoadLane(dst1, vform, index, addr1) &&
+          LoadLane(dst2, vform, index, addr2) &&
+          LoadLane(dst3, vform, index, addr3));
 }
 
 
-void Simulator::ld3r(VectorFormat vform,
+bool Simulator::ld3r(VectorFormat vform,
                      LogicVRegister dst1,
                      LogicVRegister dst2,
                      LogicVRegister dst3,
@@ -315,14 +331,16 @@ void Simulator::ld3r(VectorFormat vform,
   uint64_t addr2 = addr + LaneSizeInBytesFromFormat(vform);
   uint64_t addr3 = addr2 + LaneSizeInBytesFromFormat(vform);
   for (int i = 0; i < LaneCountFromFormat(vform); i++) {
-    LoadLane(dst1, vform, i, addr);
-    LoadLane(dst2, vform, i, addr2);
-    LoadLane(dst3, vform, i, addr3);
+    if (!LoadLane(dst1, vform, i, addr) || !LoadLane(dst2, vform, i, addr2) ||
+        !LoadLane(dst3, vform, i, addr3)) {
+      return false;
+    }
   }
+  return true;
 }
 
 
-void Simulator::ld4(VectorFormat vform,
+bool Simulator::ld4(VectorFormat vform,
                     LogicVRegister dst1,
                     LogicVRegister dst2,
                     LogicVRegister dst3,
@@ -337,19 +355,20 @@ void Simulator::ld4(VectorFormat vform,
   uint64_t addr3 = addr2 + esize;
   uint64_t addr4 = addr3 + esize;
   for (int i = 0; i < LaneCountFromFormat(vform); i++) {
-    LoadLane(dst1, vform, i, addr1);
-    LoadLane(dst2, vform, i, addr2);
-    LoadLane(dst3, vform, i, addr3);
-    LoadLane(dst4, vform, i, addr4);
+    if (!LoadLane(dst1, vform, i, addr1) || !LoadLane(dst2, vform, i, addr2) ||
+        !LoadLane(dst3, vform, i, addr3) || !LoadLane(dst4, vform, i, addr4)) {
+      return false;
+    }
     addr1 += 4 * esize;
     addr2 += 4 * esize;
     addr3 += 4 * esize;
     addr4 += 4 * esize;
   }
+  return true;
 }
 
 
-void Simulator::ld4(VectorFormat vform,
+bool Simulator::ld4(VectorFormat vform,
                     LogicVRegister dst1,
                     LogicVRegister dst2,
                     LogicVRegister dst3,
@@ -363,14 +382,14 @@ void Simulator::ld4(VectorFormat vform,
   uint64_t addr2 = addr1 + LaneSizeInBytesFromFormat(vform);
   uint64_t addr3 = addr2 + LaneSizeInBytesFromFormat(vform);
   uint64_t addr4 = addr3 + LaneSizeInBytesFromFormat(vform);
-  LoadLane(dst1, vform, index, addr1);
-  LoadLane(dst2, vform, index, addr2);
-  LoadLane(dst3, vform, index, addr3);
-  LoadLane(dst4, vform, index, addr4);
+  return (LoadLane(dst1, vform, index, addr1) &&
+          LoadLane(dst2, vform, index, addr2) &&
+          LoadLane(dst3, vform, index, addr3) &&
+          LoadLane(dst4, vform, index, addr4));
 }
 
 
-void Simulator::ld4r(VectorFormat vform,
+bool Simulator::ld4r(VectorFormat vform,
                      LogicVRegister dst1,
                      LogicVRegister dst2,
                      LogicVRegister dst3,
@@ -384,57 +403,61 @@ void Simulator::ld4r(VectorFormat vform,
   uint64_t addr3 = addr2 + LaneSizeInBytesFromFormat(vform);
   uint64_t addr4 = addr3 + LaneSizeInBytesFromFormat(vform);
   for (int i = 0; i < LaneCountFromFormat(vform); i++) {
-    LoadLane(dst1, vform, i, addr);
-    LoadLane(dst2, vform, i, addr2);
-    LoadLane(dst3, vform, i, addr3);
-    LoadLane(dst4, vform, i, addr4);
+    if (!LoadLane(dst1, vform, i, addr) || !LoadLane(dst2, vform, i, addr2) ||
+        !LoadLane(dst3, vform, i, addr3) || !LoadLane(dst4, vform, i, addr4)) {
+      return false;
+    }
   }
+  return true;
 }
 
 
-void Simulator::st1(VectorFormat vform, LogicVRegister src, uint64_t addr) {
+bool Simulator::st1(VectorFormat vform, LogicVRegister src, uint64_t addr) {
   for (int i = 0; i < LaneCountFromFormat(vform); i++) {
-    StoreLane(src, vform, i, addr);
+    if (!StoreLane(src, vform, i, addr)) return false;
     addr += LaneSizeInBytesFromFormat(vform);
   }
+  return true;
 }
 
 
-void Simulator::st1(VectorFormat vform,
+bool Simulator::st1(VectorFormat vform,
                     LogicVRegister src,
                     int index,
                     uint64_t addr) {
-  StoreLane(src, vform, index, addr);
+  return StoreLane(src, vform, index, addr);
 }
 
 
-void Simulator::st2(VectorFormat vform,
+bool Simulator::st2(VectorFormat vform,
                     LogicVRegister src,
                     LogicVRegister src2,
                     uint64_t addr) {
   int esize = LaneSizeInBytesFromFormat(vform);
   uint64_t addr2 = addr + esize;
   for (int i = 0; i < LaneCountFromFormat(vform); i++) {
-    StoreLane(src, vform, i, addr);
-    StoreLane(src2, vform, i, addr2);
+    if (!StoreLane(src, vform, i, addr) || !StoreLane(src2, vform, i, addr2)) {
+      return false;
+    }
     addr += 2 * esize;
     addr2 += 2 * esize;
   }
+  return true;
 }
 
 
-void Simulator::st2(VectorFormat vform,
+bool Simulator::st2(VectorFormat vform,
                     LogicVRegister src,
                     LogicVRegister src2,
                     int index,
                     uint64_t addr) {
   int esize = LaneSizeInBytesFromFormat(vform);
-  StoreLane(src, vform, index, addr);
-  StoreLane(src2, vform, index, addr + 1 * esize);
+  return (StoreLane(src, vform, index, addr) &&
+          StoreLane(src2, vform, index, addr + 1 * esize));
 }
 
 
-void Simulator::st3(VectorFormat vform,
+bool Simulator::st3(VectorFormat vform,
                     LogicVRegister src,
                     LogicVRegister src2,
                     LogicVRegister src3,
@@ -443,30 +466,32 @@ void Simulator::st3(VectorFormat vform,
   uint64_t addr2 = addr + esize;
   uint64_t addr3 = addr2 + esize;
   for (int i = 0; i < LaneCountFromFormat(vform); i++) {
-    StoreLane(src, vform, i, addr);
-    StoreLane(src2, vform, i, addr2);
-    StoreLane(src3, vform, i, addr3);
+    if (!StoreLane(src, vform, i, addr) || !StoreLane(src2, vform, i, addr2) ||
+        !StoreLane(src3, vform, i, addr3)) {
+      return false;
+    }
     addr += 3 * esize;
     addr2 += 3 * esize;
     addr3 += 3 * esize;
   }
+  return true;
 }
 
 
-void Simulator::st3(VectorFormat vform,
+bool Simulator::st3(VectorFormat vform,
                     LogicVRegister src,
                     LogicVRegister src2,
                     LogicVRegister src3,
                     int index,
                     uint64_t addr) {
   int esize = LaneSizeInBytesFromFormat(vform);
-  StoreLane(src, vform, index, addr);
-  StoreLane(src2, vform, index, addr + 1 * esize);
-  StoreLane(src3, vform, index, addr + 2 * esize);
+  return (StoreLane(src, vform, index, addr) &&
+          StoreLane(src2, vform, index, addr + 1 * esize) &&
+          StoreLane(src3, vform, index, addr + 2 * esize));
 }
 
 
-void Simulator::st4(VectorFormat vform,
+bool Simulator::st4(VectorFormat vform,
                     LogicVRegister src,
                     LogicVRegister src2,
                     LogicVRegister src3,
@@ -477,19 +502,21 @@ void Simulator::st4(VectorFormat vform,
   uint64_t addr3 = addr2 + esize;
   uint64_t addr4 = addr3 + esize;
   for (int i = 0; i < LaneCountFromFormat(vform); i++) {
-    StoreLane(src, vform, i, addr);
-    StoreLane(src2, vform, i, addr2);
-    StoreLane(src3, vform, i, addr3);
-    StoreLane(src4, vform, i, addr4);
+    if (!StoreLane(src, vform, i, addr) || !StoreLane(src2, vform, i, addr2) ||
+        !StoreLane(src3, vform, i, addr3) ||
+        !StoreLane(src4, vform, i, addr4)) {
+      return false;
+    }
     addr += 4 * esize;
     addr2 += 4 * esize;
     addr3 += 4 * esize;
     addr4 += 4 * esize;
   }
+  return true;
 }
 
 
-void Simulator::st4(VectorFormat vform,
+bool Simulator::st4(VectorFormat vform,
                     LogicVRegister src,
                     LogicVRegister src2,
                     LogicVRegister src3,
@@ -497,10 +524,10 @@ void Simulator::st4(VectorFormat vform,
                     int index,
                     uint64_t addr) {
   int esize = LaneSizeInBytesFromFormat(vform);
-  StoreLane(src, vform, index, addr);
-  StoreLane(src2, vform, index, addr + 1 * esize);
-  StoreLane(src3, vform, index, addr + 2 * esize);
-  StoreLane(src4, vform, index, addr + 3 * esize);
+  return (StoreLane(src, vform, index, addr) &&
+          StoreLane(src2, vform, index, addr + 1 * esize) &&
+          StoreLane(src3, vform, index, addr + 2 * esize) &&
+          StoreLane(src4, vform, index, addr + 3 * esize));
 }
 
 
@@ -895,23 +922,12 @@ LogicVRegister Simulator::sqrdmlsh(VectorFormat vform,
   return sqrdmlsh(vform, dst, src1, dup_element(indexform, temp, src2, index));
 }
 
-
 uint64_t Simulator::PolynomialMult(uint64_t op1,
                                    uint64_t op2,
                                    int lane_size_in_bits) const {
-  VIXL_ASSERT(static_cast<unsigned>(lane_size_in_bits) <= kSRegSize);
-  VIXL_ASSERT(IsUintN(lane_size_in_bits, op1));
-  VIXL_ASSERT(IsUintN(lane_size_in_bits, op2));
-  uint64_t result = 0;
-  for (int i = 0; i < lane_size_in_bits; ++i) {
-    if ((op1 >> i) & 1) {
-      result = result ^ (op2 << i);
-    }
-  }
-  return result;
+  return PolynomialMult128(op1, op2, lane_size_in_bits).second;
 }
 
-
 LogicVRegister Simulator::pmul(VectorFormat vform,
                                LogicVRegister dst,
                                const LogicVRegister& src1,
@@ -933,14 +949,16 @@ LogicVRegister Simulator::pmull(VectorFormat vform,
                                 const LogicVRegister& src1,
                                 const LogicVRegister& src2) {
   dst.ClearForWrite(vform);
-
   VectorFormat vform_src = VectorFormatHalfWidth(vform);
-  for (int i = 0; i < LaneCountFromFormat(vform); i++) {
+
+  // Process the elements in reverse to avoid problems when the destination
+  // register is the same as a source.
+  for (int i = LaneCountFromFormat(vform) - 1; i >= 0; i--) {
     dst.SetUint(vform,
                 i,
-                PolynomialMult(src1.Uint(vform_src, i),
-                               src2.Uint(vform_src, i),
-                               LaneSizeInBitsFromFormat(vform_src)));
+                PolynomialMult128(src1.Uint(vform_src, i),
+                                  src2.Uint(vform_src, i),
+                                  LaneSizeInBitsFromFormat(vform_src)));
   }
 
   return dst;
@@ -951,16 +969,18 @@ LogicVRegister Simulator::pmull2(VectorFormat vform,
                                  LogicVRegister dst,
                                  const LogicVRegister& src1,
                                  const LogicVRegister& src2) {
-  VectorFormat vform_src = VectorFormatHalfWidthDoubleLanes(vform);
   dst.ClearForWrite(vform);
+  VectorFormat vform_src = VectorFormatHalfWidthDoubleLanes(vform);
+
   int lane_count = LaneCountFromFormat(vform);
   for (int i = 0; i < lane_count; i++) {
     dst.SetUint(vform,
                 i,
-                PolynomialMult(src1.Uint(vform_src, lane_count + i),
-                               src2.Uint(vform_src, lane_count + i),
-                               LaneSizeInBitsFromFormat(vform_src)));
+                PolynomialMult128(src1.Uint(vform_src, lane_count + i),
+                                  src2.Uint(vform_src, lane_count + i),
+                                  LaneSizeInBitsFromFormat(vform_src)));
   }
+
   return dst;
 }
 
@@ -2257,7 +2277,10 @@ LogicVRegister Simulator::extractnarrow(VectorFormat dstform,
     }
   }
 
-  if (!upperhalf) {
+  if (upperhalf) {
+    // Clear any bits beyond a Q register.
+    dst.ClearForWrite(kFormat16B);
+  } else {
     dst.ClearForWrite(dstform);
   }
   return dst;
@@ -2491,6 +2514,7 @@ LogicVRegister Simulator::ror(VectorFormat vform,
                               LogicVRegister dst,
                               const LogicVRegister& src,
                               int rotation) {
+  dst.ClearForWrite(vform);
   int width = LaneSizeInBitsFromFormat(vform);
   for (int i = 0; i < LaneCountFromFormat(vform); i++) {
     uint64_t value = src.Uint(vform, i);
@@ -2499,6 +2523,14 @@ LogicVRegister Simulator::ror(VectorFormat vform,
   return dst;
 }
 
+LogicVRegister Simulator::rol(VectorFormat vform,
+                              LogicVRegister dst,
+                              const LogicVRegister& src,
+                              int rotation) {
+  int ror_equivalent = LaneSizeInBitsFromFormat(vform) - rotation;
+  return ror(vform, dst, src, ror_equivalent);
+}
+
 LogicVRegister Simulator::ext(VectorFormat vform,
                               LogicVRegister dst,
                               const LogicVRegister& src1,
@@ -2507,10 +2539,10 @@ LogicVRegister Simulator::ext(VectorFormat vform,
   uint8_t result[kZRegMaxSizeInBytes] = {};
   int lane_count = LaneCountFromFormat(vform);
   for (int i = 0; i < lane_count - index; ++i) {
-    result[i] = src1.Uint(vform, i + index);
+    result[i] = static_cast<uint8_t>(src1.Uint(vform, i + index));
   }
   for (int i = 0; i < index; ++i) {
-    result[lane_count - index + i] = src2.Uint(vform, i);
+    result[lane_count - index + i] = static_cast<uint8_t>(src2.Uint(vform, i));
   }
   dst.ClearForWrite(vform);
   for (int i = 0; i < lane_count; ++i) {
@@ -2707,7 +2739,7 @@ LogicVRegister Simulator::fcmla(VectorFormat vform,
                                 int index,
                                 int rot) {
   if (LaneSizeInBitsFromFormat(vform) == kHRegSize) {
-    VIXL_UNIMPLEMENTED();
+    fcmla<SimFloat16>(vform, dst, src1, src2, dst, index, rot);
   } else if (LaneSizeInBitsFromFormat(vform) == kSRegSize) {
     fcmla<float>(vform, dst, src1, src2, dst, index, rot);
   } else {
@@ -4153,7 +4185,7 @@ LogicVRegister Simulator::sqrdmlash_d(VectorFormat vform,
 
     // Arithmetic shift the whole value right by `esize - 1` bits.
     accum.second = (accum.first << 1) | (accum.second >> (esize - 1));
-    accum.first = -(accum.first >> (esize - 1));
+    accum.first = UnsignedNegate(accum.first >> (esize - 1));
 
     // Perform saturation.
     bool is_pos = (accum.first == 0) ? true : false;
@@ -4531,7 +4563,7 @@ T Simulator::FPMulx(T op1, T op2) {
   if ((IsInf(op1) && (op2 == 0.0)) || (IsInf(op2) && (op1 == 0.0))) {
     // inf * 0.0 returns +/-2.0.
     T two = 2.0;
-    return copysign(1.0, op1) * copysign(1.0, op2) * two;
+    return copysign(T(1.0), op1) * copysign(T(1.0), op2) * two;
   }
   return FPMul(op1, op2);
 }
@@ -4541,8 +4573,8 @@ template <typename T>
 T Simulator::FPMulAdd(T a, T op1, T op2) {
   T result = FPProcessNaNs3(a, op1, op2);
 
-  T sign_a = copysign(1.0, a);
-  T sign_prod = copysign(1.0, op1) * copysign(1.0, op2);
+  T sign_a = copysign(T(1.0), a);
+  T sign_prod = copysign(T(1.0), op1) * copysign(T(1.0), op2);
   bool isinf_prod = IsInf(op1) || IsInf(op2);
   bool operation_generates_nan =
       (IsInf(op1) && (op2 == 0.0)) ||                     // inf * 0.0
@@ -4568,7 +4600,7 @@ T Simulator::FPMulAdd(T a, T op1, T op2) {
   // Work around broken fma implementations for exact zero results: The sign of
   // exact 0.0 results is positive unless both a and op1 * op2 are negative.
   if (((op1 == 0.0) || (op2 == 0.0)) && (a == 0.0)) {
-    return ((sign_a < T(0.0)) && (sign_prod < T(0.0))) ? -0.0 : 0.0;
+    return ((sign_a < T(0.0)) && (sign_prod < T(0.0))) ? T(-0.0) : T(0.0);
   }
 
   result = FusedMultiplyAdd(op1, op2, a);
@@ -4577,7 +4609,7 @@ T Simulator::FPMulAdd(T a, T op1, T op2) {
   // Work around broken fma implementations for rounded zero results: If a is
   // 0.0, the sign of the result is the sign of op1 * op2 before rounding.
   if ((a == 0.0) && (result == 0.0)) {
-    return copysign(0.0, sign_prod);
+    return copysign(T(0.0), sign_prod);
   }
 
   return result;
@@ -4642,9 +4674,9 @@ T Simulator::FPMax(T a, T b) {
 template <typename T>
 T Simulator::FPMaxNM(T a, T b) {
   if (IsQuietNaN(a) && !IsQuietNaN(b)) {
-    a = kFP64NegativeInfinity;
+    a = T(kFP64NegativeInfinity);
   } else if (!IsQuietNaN(a) && IsQuietNaN(b)) {
-    b = kFP64NegativeInfinity;
+    b = T(kFP64NegativeInfinity);
   }
 
   T result = FPProcessNaNs(a, b);
@@ -4669,9 +4701,9 @@ T Simulator::FPMin(T a, T b) {
 template <typename T>
 T Simulator::FPMinNM(T a, T b) {
   if (IsQuietNaN(a) && !IsQuietNaN(b)) {
-    a = kFP64PositiveInfinity;
+    a = T(kFP64PositiveInfinity);
   } else if (!IsQuietNaN(a) && IsQuietNaN(b)) {
-    b = kFP64PositiveInfinity;
+    b = T(kFP64PositiveInfinity);
   }
 
   T result = FPProcessNaNs(a, b);
@@ -4686,8 +4718,8 @@ T Simulator::FPRecipStepFused(T op1, T op2) {
     return two;
   } else if (IsInf(op1) || IsInf(op2)) {
     // Return +inf if signs match, otherwise -inf.
-    return ((op1 >= 0.0) == (op2 >= 0.0)) ? kFP64PositiveInfinity
-                                          : kFP64NegativeInfinity;
+    return ((op1 >= 0.0) == (op2 >= 0.0)) ? T(kFP64PositiveInfinity)
+                                          : T(kFP64NegativeInfinity);
   } else {
     return FusedMultiplyAdd(op1, op2, two);
   }
@@ -4716,8 +4748,8 @@ T Simulator::FPRSqrtStepFused(T op1, T op2) {
     return one_point_five;
   } else if (IsInf(op1) || IsInf(op2)) {
     // Return +inf if signs match, otherwise -inf.
-    return ((op1 >= 0.0) == (op2 >= 0.0)) ? kFP64PositiveInfinity
-                                          : kFP64NegativeInfinity;
+    return ((op1 >= 0.0) == (op2 >= 0.0)) ? T(kFP64PositiveInfinity)
+                                          : T(kFP64NegativeInfinity);
   } else {
     // The multiply-add-halve operation must be fully fused, so avoid interim
     // rounding by checking which operand can be losslessly divided by two
@@ -4746,7 +4778,7 @@ int32_t Simulator::FPToFixedJS(double value) {
       (value == kFP64NegativeInfinity)) {
     // +/- zero and infinity all return zero, however -0 and +/- Infinity also
     // unset the Z-flag.
-    result = 0.0;
+    result = 0;
     if ((value != 0.0) || std::signbit(value)) {
       Z = 0;
     }
@@ -5531,38 +5563,40 @@ LogicVRegister Simulator::fsqrt(VectorFormat vform,
 }
 
 
-#define DEFINE_NEON_FP_PAIR_OP(FNP, FN, OP)                                    \
-  LogicVRegister Simulator::FNP(VectorFormat vform,                            \
-                                LogicVRegister dst,                            \
-                                const LogicVRegister& src1,                    \
-                                const LogicVRegister& src2) {                  \
-    SimVRegister temp1, temp2;                                                 \
-    uzp1(vform, temp1, src1, src2);                                            \
-    uzp2(vform, temp2, src1, src2);                                            \
-    FN(vform, dst, temp1, temp2);                                              \
-    if (IsSVEFormat(vform)) {                                                  \
-      interleave_top_bottom(vform, dst, dst);                                  \
-    }                                                                          \
-    return dst;                                                                \
-  }                                                                            \
-                                                                               \
-  LogicVRegister Simulator::FNP(VectorFormat vform,                            \
-                                LogicVRegister dst,                            \
-                                const LogicVRegister& src) {                   \
-    if (vform == kFormatH) {                                                   \
-      SimFloat16 result(OP(SimFloat16(RawbitsToFloat16(src.Uint(vform, 0))),   \
-                           SimFloat16(RawbitsToFloat16(src.Uint(vform, 1))))); \
-      dst.SetUint(vform, 0, Float16ToRawbits(result));                         \
-    } else if (vform == kFormatS) {                                            \
-      float result = OP(src.Float<float>(0), src.Float<float>(1));             \
-      dst.SetFloat(0, result);                                                 \
-    } else {                                                                   \
-      VIXL_ASSERT(vform == kFormatD);                                          \
-      double result = OP(src.Float<double>(0), src.Float<double>(1));          \
-      dst.SetFloat(0, result);                                                 \
-    }                                                                          \
-    dst.ClearForWrite(vform);                                                  \
-    return dst;                                                                \
+#define DEFINE_NEON_FP_PAIR_OP(FNP, FN, OP)                                   \
+  LogicVRegister Simulator::FNP(VectorFormat vform,                           \
+                                LogicVRegister dst,                           \
+                                const LogicVRegister& src1,                   \
+                                const LogicVRegister& src2) {                 \
+    SimVRegister temp1, temp2;                                                \
+    uzp1(vform, temp1, src1, src2);                                           \
+    uzp2(vform, temp2, src1, src2);                                           \
+    FN(vform, dst, temp1, temp2);                                             \
+    if (IsSVEFormat(vform)) {                                                 \
+      interleave_top_bottom(vform, dst, dst);                                 \
+    }                                                                         \
+    return dst;                                                               \
+  }                                                                           \
+                                                                              \
+  LogicVRegister Simulator::FNP(VectorFormat vform,                           \
+                                LogicVRegister dst,                           \
+                                const LogicVRegister& src) {                  \
+    if (vform == kFormatH) {                                                  \
+      SimFloat16 result(OP(SimFloat16(RawbitsToFloat16(                       \
+                               static_cast<uint16_t>(src.Uint(vform, 0)))),   \
+                           SimFloat16(RawbitsToFloat16(                       \
+                               static_cast<uint16_t>(src.Uint(vform, 1)))))); \
+      dst.SetUint(vform, 0, Float16ToRawbits(result));                        \
+    } else if (vform == kFormatS) {                                           \
+      float result = OP(src.Float<float>(0), src.Float<float>(1));            \
+      dst.SetFloat(0, result);                                                \
+    } else {                                                                  \
+      VIXL_ASSERT(vform == kFormatD);                                         \
+      double result = OP(src.Float<double>(0), src.Float<double>(1));         \
+      dst.SetFloat(0, result);                                                \
+    }                                                                         \
+    dst.ClearForWrite(vform);                                                 \
+    return dst;                                                               \
   }
 NEON_FPPAIRWISE_LIST(DEFINE_NEON_FP_PAIR_OP)
 #undef DEFINE_NEON_FP_PAIR_OP
@@ -5804,7 +5838,8 @@ LogicVRegister Simulator::frint(VectorFormat vform,
   } else if (LaneSizeInBitsFromFormat(vform) == kSRegSize) {
     for (int i = 0; i < LaneCountFromFormat(vform); i++) {
       float input = src.Float<float>(i);
-      float rounded = FPRoundInt(input, rounding_mode, frint_mode);
+      float rounded =
+          static_cast<float>(FPRoundInt(input, rounding_mode, frint_mode));
 
       if (inexact_exception && !IsNaN(input) && (input != rounded)) {
         FPProcessException();
@@ -5966,6 +6001,7 @@ LogicVRegister Simulator::fcvtu(VectorFormat vform,
 LogicVRegister Simulator::fcvtl(VectorFormat vform,
                                 LogicVRegister dst,
                                 const LogicVRegister& src) {
+  dst.ClearForWrite(vform);
   if (LaneSizeInBitsFromFormat(vform) == kSRegSize) {
     for (int i = LaneCountFromFormat(vform) - 1; i >= 0; i--) {
       // TODO: Full support for SimFloat16 in SimRegister(s).
@@ -5986,6 +6022,7 @@ LogicVRegister Simulator::fcvtl(VectorFormat vform,
 LogicVRegister Simulator::fcvtl2(VectorFormat vform,
                                  LogicVRegister dst,
                                  const LogicVRegister& src) {
+  dst.ClearForWrite(vform);
   int lane_count = LaneCountFromFormat(vform);
   if (LaneSizeInBitsFromFormat(vform) == kSRegSize) {
     for (int i = 0; i < lane_count; i++) {
@@ -6031,6 +6068,7 @@ LogicVRegister Simulator::fcvtn(VectorFormat vform,
 LogicVRegister Simulator::fcvtn2(VectorFormat vform,
                                  LogicVRegister dst,
                                  const LogicVRegister& src) {
+  dst.ClearForWrite(vform);
   int lane_count = LaneCountFromFormat(vform) / 2;
   if (LaneSizeInBitsFromFormat(vform) == kHRegSize) {
     for (int i = lane_count - 1; i >= 0; i--) {
@@ -6074,6 +6112,7 @@ LogicVRegister Simulator::fcvtxn2(VectorFormat vform,
                                   LogicVRegister dst,
                                   const LogicVRegister& src) {
   VIXL_ASSERT(LaneSizeInBitsFromFormat(vform) == kSRegSize);
+  dst.ClearForWrite(vform);
   int lane_count = LaneCountFromFormat(vform) / 2;
   for (int i = lane_count - 1; i >= 0; i--) {
     dst.SetFloat(i + lane_count,
@@ -6110,9 +6149,9 @@ T Simulator::FPRecipSqrtEstimate(T op) {
     return FPProcessNaN(op);
   } else if (op == 0.0) {
     if (copysign(1.0, op) < 0.0) {
-      return kFP64NegativeInfinity;
+      return T(kFP64NegativeInfinity);
     } else {
-      return kFP64PositiveInfinity;
+      return T(kFP64PositiveInfinity);
     }
   } else if (copysign(1.0, op) < 0.0) {
     FPProcessException();
@@ -6123,11 +6162,11 @@ T Simulator::FPRecipSqrtEstimate(T op) {
     uint64_t fraction;
     int exp, result_exp;
 
-    if (IsFloat16<T>()) {
+    if constexpr (IsFloat16<T>()) {
       exp = Float16Exp(op);
       fraction = Float16Mantissa(op);
       fraction <<= 42;
-    } else if (IsFloat32<T>()) {
+    } else if constexpr (IsFloat32<T>()) {
       exp = FloatExp(op);
       fraction = FloatMantissa(op);
       fraction <<= 29;
@@ -6152,9 +6191,9 @@ T Simulator::FPRecipSqrtEstimate(T op) {
       scaled = DoublePack(0, 1021, Bits(fraction, 51, 44) << 44);
     }
 
-    if (IsFloat16<T>()) {
+    if constexpr (IsFloat16<T>()) {
       result_exp = (44 - exp) / 2;
-    } else if (IsFloat32<T>()) {
+    } else if constexpr (IsFloat32<T>()) {
       result_exp = (380 - exp) / 2;
     } else {
       VIXL_ASSERT(IsFloat64<T>());
@@ -6163,11 +6202,11 @@ T Simulator::FPRecipSqrtEstimate(T op) {
 
     uint64_t estimate = DoubleToRawbits(recip_sqrt_estimate(scaled));
 
-    if (IsFloat16<T>()) {
+    if constexpr (IsFloat16<T>()) {
       uint16_t exp_bits = static_cast<uint16_t>(Bits(result_exp, 4, 0));
       uint16_t est_bits = static_cast<uint16_t>(Bits(estimate, 51, 42));
       return Float16Pack(0, exp_bits, est_bits);
-    } else if (IsFloat32<T>()) {
+    } else if constexpr (IsFloat32<T>()) {
       uint32_t exp_bits = static_cast<uint32_t>(Bits(result_exp, 7, 0));
       uint32_t est_bits = static_cast<uint32_t>(Bits(estimate, 51, 29));
       return FloatPack(0, exp_bits, est_bits);
@@ -6207,9 +6246,9 @@ template <typename T>
 T Simulator::FPRecipEstimate(T op, FPRounding rounding) {
   uint32_t sign;
 
-  if (IsFloat16<T>()) {
+  if constexpr (IsFloat16<T>()) {
     sign = Float16Sign(op);
-  } else if (IsFloat32<T>()) {
+  } else if constexpr (IsFloat32<T>()) {
     sign = FloatSign(op);
   } else {
     VIXL_ASSERT(IsFloat64<T>());
@@ -6219,10 +6258,10 @@ T Simulator::FPRecipEstimate(T op, FPRounding rounding) {
   if (IsNaN(op)) {
     return FPProcessNaN(op);
   } else if (IsInf(op)) {
-    return (sign == 1) ? -0.0 : 0.0;
+    return (sign == 1) ? T(-0.0) : T(0.0);
   } else if (op == 0.0) {
     FPProcessException();  // FPExc_DivideByZero exception.
-    return (sign == 1) ? kFP64NegativeInfinity : kFP64PositiveInfinity;
+    return (sign == 1) ? T(kFP64NegativeInfinity) : T(kFP64PositiveInfinity);
   } else if ((IsFloat16<T>() && (std::fabs(op) < std::pow(2.0, -16.0))) ||
              (IsFloat32<T>() && (std::fabs(op) < std::pow(2.0, -128.0))) ||
              (IsFloat64<T>() && (std::fabs(op) < std::pow(2.0, -1024.0)))) {
@@ -6245,12 +6284,12 @@ T Simulator::FPRecipEstimate(T op, FPRounding rounding) {
     }
     FPProcessException();  // FPExc_Overflow and FPExc_Inexact.
     if (overflow_to_inf) {
-      return (sign == 1) ? kFP64NegativeInfinity : kFP64PositiveInfinity;
+      return (sign == 1) ? T(kFP64NegativeInfinity) : T(kFP64PositiveInfinity);
     } else {
       // Return FPMaxNormal(sign).
-      if (IsFloat16<T>()) {
+      if constexpr (IsFloat16<T>()) {
         return Float16Pack(sign, 0x1f, 0x3ff);
-      } else if (IsFloat32<T>()) {
+      } else if constexpr (IsFloat32<T>()) {
         return FloatPack(sign, 0xfe, 0x07fffff);
       } else {
         VIXL_ASSERT(IsFloat64<T>());
@@ -6261,12 +6300,12 @@ T Simulator::FPRecipEstimate(T op, FPRounding rounding) {
     uint64_t fraction;
     int exp, result_exp;
 
-    if (IsFloat16<T>()) {
+    if constexpr (IsFloat16<T>()) {
       sign = Float16Sign(op);
       exp = Float16Exp(op);
       fraction = Float16Mantissa(op);
       fraction <<= 42;
-    } else if (IsFloat32<T>()) {
+    } else if constexpr (IsFloat32<T>()) {
       sign = FloatSign(op);
       exp = FloatExp(op);
       fraction = FloatMantissa(op);
@@ -6289,9 +6328,9 @@ T Simulator::FPRecipEstimate(T op, FPRounding rounding) {
 
     double scaled = DoublePack(0, 1022, Bits(fraction, 51, 44) << 44);
 
-    if (IsFloat16<T>()) {
+    if constexpr (IsFloat16<T>()) {
       result_exp = (29 - exp);  // In range 29-30 = -1 to 29+1 = 30.
-    } else if (IsFloat32<T>()) {
+    } else if constexpr (IsFloat32<T>()) {
       result_exp = (253 - exp);  // In range 253-254 = -1 to 253+1 = 254.
     } else {
       VIXL_ASSERT(IsFloat64<T>());
@@ -6307,11 +6346,11 @@ T Simulator::FPRecipEstimate(T op, FPRounding rounding) {
       fraction = (UINT64_C(1) << 50) | Bits(fraction, 51, 2);
       result_exp = 0;
     }
-    if (IsFloat16<T>()) {
+    if constexpr (IsFloat16<T>()) {
       uint16_t exp_bits = static_cast<uint16_t>(Bits(result_exp, 4, 0));
       uint16_t frac_bits = static_cast<uint16_t>(Bits(fraction, 51, 42));
       return Float16Pack(sign, exp_bits, frac_bits);
-    } else if (IsFloat32<T>()) {
+    } else if constexpr (IsFloat32<T>()) {
       uint32_t exp_bits = static_cast<uint32_t>(Bits(result_exp, 7, 0));
       uint32_t frac_bits = static_cast<uint32_t>(Bits(fraction, 51, 29));
       return FloatPack(sign, exp_bits, frac_bits);
@@ -6457,12 +6496,12 @@ LogicVRegister Simulator::frecpx(VectorFormat vform,
     } else {
       int exp;
       uint32_t sign;
-      if (IsFloat16<T>()) {
+      if constexpr (IsFloat16<T>()) {
         sign = Float16Sign(op);
         exp = Float16Exp(op);
         exp = (exp == 0) ? (0x1F - 1) : static_cast<int>(Bits(~exp, 4, 0));
         result = Float16Pack(sign, exp, 0);
-      } else if (IsFloat32<T>()) {
+      } else if constexpr (IsFloat32<T>()) {
         sign = FloatSign(op);
         exp = FloatExp(op);
         exp = (exp == 0) ? (0xFF - 1) : static_cast<int>(Bits(~exp, 7, 0));
@@ -6766,18 +6805,21 @@ LogicVRegister Simulator::fexpa(VectorFormat vform,
 
   if (lane_size == kHRegSize) {
     index_highbit = 4;
-    VIXL_ASSERT(ArrayLength(fexpa_coeff16) == (1U << (index_highbit + 1)));
+    VIXL_ASSERT(ArrayLength(fexpa_coeff16) ==
+                (uint64_t{1} << (index_highbit + 1)));
     fexpa_coeff = fexpa_coeff16;
     op_highbit = 9;
     op_shift = 10;
   } else if (lane_size == kSRegSize) {
-    VIXL_ASSERT(ArrayLength(fexpa_coeff32) == (1U << (index_highbit + 1)));
+    VIXL_ASSERT(ArrayLength(fexpa_coeff32) ==
+                (uint64_t{1} << (index_highbit + 1)));
     fexpa_coeff = fexpa_coeff32;
     op_highbit = 13;
     op_shift = 23;
   } else {
     VIXL_ASSERT(lane_size == kDRegSize);
-    VIXL_ASSERT(ArrayLength(fexpa_coeff64) == (1U << (index_highbit + 1)));
+    VIXL_ASSERT(ArrayLength(fexpa_coeff64) ==
+                (uint64_t{1} << (index_highbit + 1)));
     fexpa_coeff = fexpa_coeff64;
     op_highbit = 16;
     op_shift = 52;
@@ -7274,7 +7316,9 @@ void Simulator::SVEStructuredStoreHelper(VectorFormat vform,
 
     for (int r = 0; r < reg_count; r++) {
       uint64_t element_address = addr.GetElementAddress(i, r);
-      StoreLane(zt[r], unpack_vform, i << unpack_shift, element_address);
+      if (!StoreLane(zt[r], unpack_vform, i << unpack_shift, element_address)) {
+        return;
+      }
     }
   }
 
@@ -7298,7 +7342,7 @@ void Simulator::SVEStructuredStoreHelper(VectorFormat vform,
   }
 }
 
-void Simulator::SVEStructuredLoadHelper(VectorFormat vform,
+bool Simulator::SVEStructuredLoadHelper(VectorFormat vform,
                                         const LogicPRegister& pg,
                                         unsigned zt_code,
                                         const LogicSVEAddressVector& addr,
@@ -7333,9 +7377,13 @@ void Simulator::SVEStructuredLoadHelper(VectorFormat vform,
       }
 
       if (is_signed) {
-        LoadIntToLane(zt[r], vform, msize_in_bytes, i, element_address);
+        if (!LoadIntToLane(zt[r], vform, msize_in_bytes, i, element_address)) {
+          return false;
+        }
       } else {
-        LoadUintToLane(zt[r], vform, msize_in_bytes, i, element_address);
+        if (!LoadUintToLane(zt[r], vform, msize_in_bytes, i, element_address)) {
+          return false;
+        }
       }
     }
   }
@@ -7354,6 +7402,7 @@ void Simulator::SVEStructuredLoadHelper(VectorFormat vform,
                        "<-",
                        addr);
   }
+  return true;
 }
 
 LogicPRegister Simulator::brka(LogicPRegister pd,
@@ -7448,7 +7497,7 @@ void Simulator::SVEFaultTolerantLoadHelper(VectorFormat vform,
 
   // Non-faulting loads are allowed to fail arbitrarily. To stress user
   // code, fail a random element in roughly one in eight full-vector loads.
-  uint32_t rnd = static_cast<uint32_t>(jrand48(rand_state_));
+  uint32_t rnd = static_cast<uint32_t>(rand_gen_());
   int fake_fault_at_lane = rnd % (LaneCountFromFormat(vform) * 8);
 
   for (int i = 0; i < LaneCountFromFormat(vform); i++) {
@@ -7461,7 +7510,9 @@ void Simulator::SVEFaultTolerantLoadHelper(VectorFormat vform,
         // First-faulting loads always load the first active element, regardless
         // of FFR. The result will be discarded if its FFR lane is inactive, but
         // it could still generate a fault.
-        value = MemReadUint(msize_in_bytes, element_address);
+        VIXL_DEFINE_OR_RETURN(mem_result,
+                              MemReadUint(msize_in_bytes, element_address));
+        value = mem_result;
         // All subsequent elements have non-fault semantics.
         type = kSVENonFaultLoad;
 
@@ -7473,7 +7524,9 @@ void Simulator::SVEFaultTolerantLoadHelper(VectorFormat vform,
         bool can_read = (i < fake_fault_at_lane) &&
                         CanReadMemory(element_address, msize_in_bytes);
         if (can_read) {
-          value = MemReadUint(msize_in_bytes, element_address);
+          VIXL_DEFINE_OR_RETURN(mem_result,
+                                MemReadUint(msize_in_bytes, element_address));
+          value = mem_result;
         } else {
           // Propagate the fault to the end of FFR.
           for (int j = i; j < LaneCountFromFormat(vform); j++) {
@@ -7851,6 +7904,653 @@ LogicVRegister Simulator::fmatmul(VectorFormat vform,
   return dst;
 }
 
+template <>
+uint64_t CryptoOp<"choose"_h>(uint64_t x, uint64_t y, uint64_t z) {
+  return ((y ^ z) & x) ^ z;
+}
+
+template <>
+uint64_t CryptoOp<"majority"_h>(uint64_t x, uint64_t y, uint64_t z) {
+  return (x & y) | ((x | y) & z);
+}
+
+template <>
+uint64_t CryptoOp<"parity"_h>(uint64_t x, uint64_t y, uint64_t z) {
+  return x ^ y ^ z;
+}
+
+template <typename T, unsigned A, unsigned B, unsigned C>
+static uint64_t SHASigma(uint64_t x) {
+  return static_cast<T>(RotateRight(x, A, sizeof(T) * kBitsPerByte) ^
+                        RotateRight(x, B, sizeof(T) * kBitsPerByte) ^
+                        RotateRight(x, C, sizeof(T) * kBitsPerByte));
+}
+
+LogicVRegister Simulator::sha2h(LogicVRegister srcdst,
+                                const LogicVRegister& src1,
+                                const LogicVRegister& src2,
+                                bool part1) {
+  uint64_t x[4] = {};
+  uint64_t y[4] = {};
+  if (part1) {
+    // Switch input order based on which part is being handled.
+    srcdst.UintArray(kFormat4S, x);
+    src1.UintArray(kFormat4S, y);
+  } else {
+    src1.UintArray(kFormat4S, x);
+    srcdst.UintArray(kFormat4S, y);
+  }
+
+  for (unsigned i = 0; i < ArrayLength(x); i++) {
+    uint64_t chs = CryptoOp<"choose"_h>(y[0], y[1], y[2]);
+    uint64_t maj = CryptoOp<"majority"_h>(x[0], x[1], x[2]);
+
+    uint64_t w = src2.Uint(kFormat4S, i);
+    uint64_t t = y[3] + SHASigma<uint32_t, 6, 11, 25>(y[0]) + chs + w;
+
+    x[3] += t;
+    y[3] = t + SHASigma<uint32_t, 2, 13, 22>(x[0]) + maj;
+
+    // y:x = ROL(y:x, 32)
+    SHARotateEltsLeftOne(x);
+    SHARotateEltsLeftOne(y);
+    std::swap(x[0], y[0]);
+  }
+
+  srcdst.SetUintArray(kFormat4S, part1 ? x : y);
+  return srcdst;
+}
+
+template <typename T, unsigned A, unsigned B, unsigned C>
+static uint64_t SHASURotate(uint64_t x) {
+  return RotateRight(x, A, sizeof(T) * kBitsPerByte) ^
+         RotateRight(x, B, sizeof(T) * kBitsPerByte) ^
+         ((x & ~static_cast<T>(0)) >> C);
+}
+
+LogicVRegister Simulator::sha2su0(LogicVRegister srcdst,
+                                  const LogicVRegister& src1) {
+  uint64_t w[4] = {};
+  uint64_t result[4];
+  srcdst.UintArray(kFormat4S, w);
+  uint64_t x = src1.Uint(kFormat4S, 0);
+
+  result[0] = SHASURotate<uint32_t, 7, 18, 3>(w[1]) + w[0];
+  result[1] = SHASURotate<uint32_t, 7, 18, 3>(w[2]) + w[1];
+  result[2] = SHASURotate<uint32_t, 7, 18, 3>(w[3]) + w[2];
+  result[3] = SHASURotate<uint32_t, 7, 18, 3>(x) + w[3];
+
+  srcdst.SetUintArray(kFormat4S, result);
+  return srcdst;
+}
+
+LogicVRegister Simulator::sha2su1(LogicVRegister srcdst,
+                                  const LogicVRegister& src1,
+                                  const LogicVRegister& src2) {
+  uint64_t w[4] = {};
+  uint64_t x[4] = {};
+  uint64_t y[4] = {};
+  uint64_t result[4];
+  srcdst.UintArray(kFormat4S, w);
+  src1.UintArray(kFormat4S, x);
+  src2.UintArray(kFormat4S, y);
+
+  result[0] = SHASURotate<uint32_t, 17, 19, 10>(y[2]) + w[0] + x[1];
+  result[1] = SHASURotate<uint32_t, 17, 19, 10>(y[3]) + w[1] + x[2];
+  result[2] = SHASURotate<uint32_t, 17, 19, 10>(result[0]) + w[2] + x[3];
+  result[3] = SHASURotate<uint32_t, 17, 19, 10>(result[1]) + w[3] + y[0];
+
+  srcdst.SetUintArray(kFormat4S, result);
+  return srcdst;
+}
+
+LogicVRegister Simulator::sha512h(LogicVRegister srcdst,
+                                  const LogicVRegister& src1,
+                                  const LogicVRegister& src2) {
+  uint64_t w[2] = {};
+  uint64_t x[2] = {};
+  uint64_t y[2] = {};
+  uint64_t result[2] = {};
+  srcdst.UintArray(kFormat2D, w);
+  src1.UintArray(kFormat2D, x);
+  src2.UintArray(kFormat2D, y);
+
+  result[1] = (y[1] & x[0]) ^ (~y[1] & x[1]);
+  result[1] += SHASigma<uint64_t, 14, 18, 41>(y[1]) + w[1];
+
+  uint64_t tmp = result[1] + y[0];
+
+  result[0] = (tmp & y[1]) ^ (~tmp & x[0]);
+  result[0] += SHASigma<uint64_t, 14, 18, 41>(tmp) + w[0];
+
+  srcdst.SetUintArray(kFormat2D, result);
+  return srcdst;
+}
+
+LogicVRegister Simulator::sha512h2(LogicVRegister srcdst,
+                                   const LogicVRegister& src1,
+                                   const LogicVRegister& src2) {
+  uint64_t w[2] = {};
+  uint64_t x[2] = {};
+  uint64_t y[2] = {};
+  uint64_t result[2] = {};
+  srcdst.UintArray(kFormat2D, w);
+  src1.UintArray(kFormat2D, x);
+  src2.UintArray(kFormat2D, y);
+
+  result[1] = (x[0] & y[1]) ^ (x[0] & y[0]) ^ (y[1] & y[0]);
+  result[1] += SHASigma<uint64_t, 28, 34, 39>(y[0]) + w[1];
+
+  result[0] = (result[1] & y[0]) ^ (result[1] & y[1]) ^ (y[1] & y[0]);
+  result[0] += SHASigma<uint64_t, 28, 34, 39>(result[1]) + w[0];
+
+  srcdst.SetUintArray(kFormat2D, result);
+  return srcdst;
+}
+
+LogicVRegister Simulator::sha512su0(LogicVRegister srcdst,
+                                    const LogicVRegister& src1) {
+  uint64_t w[2] = {};
+  uint64_t x[2] = {};
+  uint64_t result[2] = {};
+  srcdst.UintArray(kFormat2D, w);
+  src1.UintArray(kFormat2D, x);
+
+  result[0] = SHASURotate<uint64_t, 1, 8, 7>(w[1]) + w[0];
+  result[1] = SHASURotate<uint64_t, 1, 8, 7>(x[0]) + w[1];
+
+  srcdst.SetUintArray(kFormat2D, result);
+  return srcdst;
+}
+
+LogicVRegister Simulator::sha512su1(LogicVRegister srcdst,
+                                    const LogicVRegister& src1,
+                                    const LogicVRegister& src2) {
+  uint64_t w[2] = {};
+  uint64_t x[2] = {};
+  uint64_t y[2] = {};
+  uint64_t result[2] = {};
+  srcdst.UintArray(kFormat2D, w);
+  src1.UintArray(kFormat2D, x);
+  src2.UintArray(kFormat2D, y);
+
+  result[1] = w[1] + SHASURotate<uint64_t, 19, 61, 6>(x[1]) + y[1];
+  result[0] = w[0] + SHASURotate<uint64_t, 19, 61, 6>(x[0]) + y[0];
+
+  srcdst.SetUintArray(kFormat2D, result);
+  return srcdst;
+}
+
+static uint8_t GalMul(int table, uint64_t x) {
+  // GF(2^8) multiply-by-constant tables, reduced by the AES polynomial 0x11b.
+  static const uint8_t ffmul02[256] = {
+      0x00, 0x02, 0x04, 0x06, 0x08, 0x0a, 0x0c, 0x0e, 0x10, 0x12, 0x14, 0x16,
+      0x18, 0x1a, 0x1c, 0x1e, 0x20, 0x22, 0x24, 0x26, 0x28, 0x2a, 0x2c, 0x2e,
+      0x30, 0x32, 0x34, 0x36, 0x38, 0x3a, 0x3c, 0x3e, 0x40, 0x42, 0x44, 0x46,
+      0x48, 0x4a, 0x4c, 0x4e, 0x50, 0x52, 0x54, 0x56, 0x58, 0x5a, 0x5c, 0x5e,
+      0x60, 0x62, 0x64, 0x66, 0x68, 0x6a, 0x6c, 0x6e, 0x70, 0x72, 0x74, 0x76,
+      0x78, 0x7a, 0x7c, 0x7e, 0x80, 0x82, 0x84, 0x86, 0x88, 0x8a, 0x8c, 0x8e,
+      0x90, 0x92, 0x94, 0x96, 0x98, 0x9a, 0x9c, 0x9e, 0xa0, 0xa2, 0xa4, 0xa6,
+      0xa8, 0xaa, 0xac, 0xae, 0xb0, 0xb2, 0xb4, 0xb6, 0xb8, 0xba, 0xbc, 0xbe,
+      0xc0, 0xc2, 0xc4, 0xc6, 0xc8, 0xca, 0xcc, 0xce, 0xd0, 0xd2, 0xd4, 0xd6,
+      0xd8, 0xda, 0xdc, 0xde, 0xe0, 0xe2, 0xe4, 0xe6, 0xe8, 0xea, 0xec, 0xee,
+      0xf0, 0xf2, 0xf4, 0xf6, 0xf8, 0xfa, 0xfc, 0xfe, 0x1b, 0x19, 0x1f, 0x1d,
+      0x13, 0x11, 0x17, 0x15, 0x0b, 0x09, 0x0f, 0x0d, 0x03, 0x01, 0x07, 0x05,
+      0x3b, 0x39, 0x3f, 0x3d, 0x33, 0x31, 0x37, 0x35, 0x2b, 0x29, 0x2f, 0x2d,
+      0x23, 0x21, 0x27, 0x25, 0x5b, 0x59, 0x5f, 0x5d, 0x53, 0x51, 0x57, 0x55,
+      0x4b, 0x49, 0x4f, 0x4d, 0x43, 0x41, 0x47, 0x45, 0x7b, 0x79, 0x7f, 0x7d,
+      0x73, 0x71, 0x77, 0x75, 0x6b, 0x69, 0x6f, 0x6d, 0x63, 0x61, 0x67, 0x65,
+      0x9b, 0x99, 0x9f, 0x9d, 0x93, 0x91, 0x97, 0x95, 0x8b, 0x89, 0x8f, 0x8d,
+      0x83, 0x81, 0x87, 0x85, 0xbb, 0xb9, 0xbf, 0xbd, 0xb3, 0xb1, 0xb7, 0xb5,
+      0xab, 0xa9, 0xaf, 0xad, 0xa3, 0xa1, 0xa7, 0xa5, 0xdb, 0xd9, 0xdf, 0xdd,
+      0xd3, 0xd1, 0xd7, 0xd5, 0xcb, 0xc9, 0xcf, 0xcd, 0xc3, 0xc1, 0xc7, 0xc5,
+      0xfb, 0xf9, 0xff, 0xfd, 0xf3, 0xf1, 0xf7, 0xf5, 0xeb, 0xe9, 0xef, 0xed,
+      0xe3, 0xe1, 0xe7, 0xe5,
+  };
+
+  static const uint8_t ffmul03[256] = {
+      0x00, 0x03, 0x06, 0x05, 0x0c, 0x0f, 0x0a, 0x09, 0x18, 0x1b, 0x1e, 0x1d,
+      0x14, 0x17, 0x12, 0x11, 0x30, 0x33, 0x36, 0x35, 0x3c, 0x3f, 0x3a, 0x39,
+      0x28, 0x2b, 0x2e, 0x2d, 0x24, 0x27, 0x22, 0x21, 0x60, 0x63, 0x66, 0x65,
+      0x6c, 0x6f, 0x6a, 0x69, 0x78, 0x7b, 0x7e, 0x7d, 0x74, 0x77, 0x72, 0x71,
+      0x50, 0x53, 0x56, 0x55, 0x5c, 0x5f, 0x5a, 0x59, 0x48, 0x4b, 0x4e, 0x4d,
+      0x44, 0x47, 0x42, 0x41, 0xc0, 0xc3, 0xc6, 0xc5, 0xcc, 0xcf, 0xca, 0xc9,
+      0xd8, 0xdb, 0xde, 0xdd, 0xd4, 0xd7, 0xd2, 0xd1, 0xf0, 0xf3, 0xf6, 0xf5,
+      0xfc, 0xff, 0xfa, 0xf9, 0xe8, 0xeb, 0xee, 0xed, 0xe4, 0xe7, 0xe2, 0xe1,
+      0xa0, 0xa3, 0xa6, 0xa5, 0xac, 0xaf, 0xaa, 0xa9, 0xb8, 0xbb, 0xbe, 0xbd,
+      0xb4, 0xb7, 0xb2, 0xb1, 0x90, 0x93, 0x96, 0x95, 0x9c, 0x9f, 0x9a, 0x99,
+      0x88, 0x8b, 0x8e, 0x8d, 0x84, 0x87, 0x82, 0x81, 0x9b, 0x98, 0x9d, 0x9e,
+      0x97, 0x94, 0x91, 0x92, 0x83, 0x80, 0x85, 0x86, 0x8f, 0x8c, 0x89, 0x8a,
+      0xab, 0xa8, 0xad, 0xae, 0xa7, 0xa4, 0xa1, 0xa2, 0xb3, 0xb0, 0xb5, 0xb6,
+      0xbf, 0xbc, 0xb9, 0xba, 0xfb, 0xf8, 0xfd, 0xfe, 0xf7, 0xf4, 0xf1, 0xf2,
+      0xe3, 0xe0, 0xe5, 0xe6, 0xef, 0xec, 0xe9, 0xea, 0xcb, 0xc8, 0xcd, 0xce,
+      0xc7, 0xc4, 0xc1, 0xc2, 0xd3, 0xd0, 0xd5, 0xd6, 0xdf, 0xdc, 0xd9, 0xda,
+      0x5b, 0x58, 0x5d, 0x5e, 0x57, 0x54, 0x51, 0x52, 0x43, 0x40, 0x45, 0x46,
+      0x4f, 0x4c, 0x49, 0x4a, 0x6b, 0x68, 0x6d, 0x6e, 0x67, 0x64, 0x61, 0x62,
+      0x73, 0x70, 0x75, 0x76, 0x7f, 0x7c, 0x79, 0x7a, 0x3b, 0x38, 0x3d, 0x3e,
+      0x37, 0x34, 0x31, 0x32, 0x23, 0x20, 0x25, 0x26, 0x2f, 0x2c, 0x29, 0x2a,
+      0x0b, 0x08, 0x0d, 0x0e, 0x07, 0x04, 0x01, 0x02, 0x13, 0x10, 0x15, 0x16,
+      0x1f, 0x1c, 0x19, 0x1a,
+  };
+
+  static const uint8_t ffmul09[256] = {
+      0x00, 0x09, 0x12, 0x1b, 0x24, 0x2d, 0x36, 0x3f, 0x48, 0x41, 0x5a, 0x53,
+      0x6c, 0x65, 0x7e, 0x77, 0x90, 0x99, 0x82, 0x8b, 0xb4, 0xbd, 0xa6, 0xaf,
+      0xd8, 0xd1, 0xca, 0xc3, 0xfc, 0xf5, 0xee, 0xe7, 0x3b, 0x32, 0x29, 0x20,
+      0x1f, 0x16, 0x0d, 0x04, 0x73, 0x7a, 0x61, 0x68, 0x57, 0x5e, 0x45, 0x4c,
+      0xab, 0xa2, 0xb9, 0xb0, 0x8f, 0x86, 0x9d, 0x94, 0xe3, 0xea, 0xf1, 0xf8,
+      0xc7, 0xce, 0xd5, 0xdc, 0x76, 0x7f, 0x64, 0x6d, 0x52, 0x5b, 0x40, 0x49,
+      0x3e, 0x37, 0x2c, 0x25, 0x1a, 0x13, 0x08, 0x01, 0xe6, 0xef, 0xf4, 0xfd,
+      0xc2, 0xcb, 0xd0, 0xd9, 0xae, 0xa7, 0xbc, 0xb5, 0x8a, 0x83, 0x98, 0x91,
+      0x4d, 0x44, 0x5f, 0x56, 0x69, 0x60, 0x7b, 0x72, 0x05, 0x0c, 0x17, 0x1e,
+      0x21, 0x28, 0x33, 0x3a, 0xdd, 0xd4, 0xcf, 0xc6, 0xf9, 0xf0, 0xeb, 0xe2,
+      0x95, 0x9c, 0x87, 0x8e, 0xb1, 0xb8, 0xa3, 0xaa, 0xec, 0xe5, 0xfe, 0xf7,
+      0xc8, 0xc1, 0xda, 0xd3, 0xa4, 0xad, 0xb6, 0xbf, 0x80, 0x89, 0x92, 0x9b,
+      0x7c, 0x75, 0x6e, 0x67, 0x58, 0x51, 0x4a, 0x43, 0x34, 0x3d, 0x26, 0x2f,
+      0x10, 0x19, 0x02, 0x0b, 0xd7, 0xde, 0xc5, 0xcc, 0xf3, 0xfa, 0xe1, 0xe8,
+      0x9f, 0x96, 0x8d, 0x84, 0xbb, 0xb2, 0xa9, 0xa0, 0x47, 0x4e, 0x55, 0x5c,
+      0x63, 0x6a, 0x71, 0x78, 0x0f, 0x06, 0x1d, 0x14, 0x2b, 0x22, 0x39, 0x30,
+      0x9a, 0x93, 0x88, 0x81, 0xbe, 0xb7, 0xac, 0xa5, 0xd2, 0xdb, 0xc0, 0xc9,
+      0xf6, 0xff, 0xe4, 0xed, 0x0a, 0x03, 0x18, 0x11, 0x2e, 0x27, 0x3c, 0x35,
+      0x42, 0x4b, 0x50, 0x59, 0x66, 0x6f, 0x74, 0x7d, 0xa1, 0xa8, 0xb3, 0xba,
+      0x85, 0x8c, 0x97, 0x9e, 0xe9, 0xe0, 0xfb, 0xf2, 0xcd, 0xc4, 0xdf, 0xd6,
+      0x31, 0x38, 0x23, 0x2a, 0x15, 0x1c, 0x07, 0x0e, 0x79, 0x70, 0x6b, 0x62,
+      0x5d, 0x54, 0x4f, 0x46,
+  };
+
+  static const uint8_t ffmul0b[256] = {
+      0x00, 0x0b, 0x16, 0x1d, 0x2c, 0x27, 0x3a, 0x31, 0x58, 0x53, 0x4e, 0x45,
+      0x74, 0x7f, 0x62, 0x69, 0xb0, 0xbb, 0xa6, 0xad, 0x9c, 0x97, 0x8a, 0x81,
+      0xe8, 0xe3, 0xfe, 0xf5, 0xc4, 0xcf, 0xd2, 0xd9, 0x7b, 0x70, 0x6d, 0x66,
+      0x57, 0x5c, 0x41, 0x4a, 0x23, 0x28, 0x35, 0x3e, 0x0f, 0x04, 0x19, 0x12,
+      0xcb, 0xc0, 0xdd, 0xd6, 0xe7, 0xec, 0xf1, 0xfa, 0x93, 0x98, 0x85, 0x8e,
+      0xbf, 0xb4, 0xa9, 0xa2, 0xf6, 0xfd, 0xe0, 0xeb, 0xda, 0xd1, 0xcc, 0xc7,
+      0xae, 0xa5, 0xb8, 0xb3, 0x82, 0x89, 0x94, 0x9f, 0x46, 0x4d, 0x50, 0x5b,
+      0x6a, 0x61, 0x7c, 0x77, 0x1e, 0x15, 0x08, 0x03, 0x32, 0x39, 0x24, 0x2f,
+      0x8d, 0x86, 0x9b, 0x90, 0xa1, 0xaa, 0xb7, 0xbc, 0xd5, 0xde, 0xc3, 0xc8,
+      0xf9, 0xf2, 0xef, 0xe4, 0x3d, 0x36, 0x2b, 0x20, 0x11, 0x1a, 0x07, 0x0c,
+      0x65, 0x6e, 0x73, 0x78, 0x49, 0x42, 0x5f, 0x54, 0xf7, 0xfc, 0xe1, 0xea,
+      0xdb, 0xd0, 0xcd, 0xc6, 0xaf, 0xa4, 0xb9, 0xb2, 0x83, 0x88, 0x95, 0x9e,
+      0x47, 0x4c, 0x51, 0x5a, 0x6b, 0x60, 0x7d, 0x76, 0x1f, 0x14, 0x09, 0x02,
+      0x33, 0x38, 0x25, 0x2e, 0x8c, 0x87, 0x9a, 0x91, 0xa0, 0xab, 0xb6, 0xbd,
+      0xd4, 0xdf, 0xc2, 0xc9, 0xf8, 0xf3, 0xee, 0xe5, 0x3c, 0x37, 0x2a, 0x21,
+      0x10, 0x1b, 0x06, 0x0d, 0x64, 0x6f, 0x72, 0x79, 0x48, 0x43, 0x5e, 0x55,
+      0x01, 0x0a, 0x17, 0x1c, 0x2d, 0x26, 0x3b, 0x30, 0x59, 0x52, 0x4f, 0x44,
+      0x75, 0x7e, 0x63, 0x68, 0xb1, 0xba, 0xa7, 0xac, 0x9d, 0x96, 0x8b, 0x80,
+      0xe9, 0xe2, 0xff, 0xf4, 0xc5, 0xce, 0xd3, 0xd8, 0x7a, 0x71, 0x6c, 0x67,
+      0x56, 0x5d, 0x40, 0x4b, 0x22, 0x29, 0x34, 0x3f, 0x0e, 0x05, 0x18, 0x13,
+      0xca, 0xc1, 0xdc, 0xd7, 0xe6, 0xed, 0xf0, 0xfb, 0x92, 0x99, 0x84, 0x8f,
+      0xbe, 0xb5, 0xa8, 0xa3,
+  };
+
+  static const uint8_t ffmul0d[256] = {
+      0x00, 0x0d, 0x1a, 0x17, 0x34, 0x39, 0x2e, 0x23, 0x68, 0x65, 0x72, 0x7f,
+      0x5c, 0x51, 0x46, 0x4b, 0xd0, 0xdd, 0xca, 0xc7, 0xe4, 0xe9, 0xfe, 0xf3,
+      0xb8, 0xb5, 0xa2, 0xaf, 0x8c, 0x81, 0x96, 0x9b, 0xbb, 0xb6, 0xa1, 0xac,
+      0x8f, 0x82, 0x95, 0x98, 0xd3, 0xde, 0xc9, 0xc4, 0xe7, 0xea, 0xfd, 0xf0,
+      0x6b, 0x66, 0x71, 0x7c, 0x5f, 0x52, 0x45, 0x48, 0x03, 0x0e, 0x19, 0x14,
+      0x37, 0x3a, 0x2d, 0x20, 0x6d, 0x60, 0x77, 0x7a, 0x59, 0x54, 0x43, 0x4e,
+      0x05, 0x08, 0x1f, 0x12, 0x31, 0x3c, 0x2b, 0x26, 0xbd, 0xb0, 0xa7, 0xaa,
+      0x89, 0x84, 0x93, 0x9e, 0xd5, 0xd8, 0xcf, 0xc2, 0xe1, 0xec, 0xfb, 0xf6,
+      0xd6, 0xdb, 0xcc, 0xc1, 0xe2, 0xef, 0xf8, 0xf5, 0xbe, 0xb3, 0xa4, 0xa9,
+      0x8a, 0x87, 0x90, 0x9d, 0x06, 0x0b, 0x1c, 0x11, 0x32, 0x3f, 0x28, 0x25,
+      0x6e, 0x63, 0x74, 0x79, 0x5a, 0x57, 0x40, 0x4d, 0xda, 0xd7, 0xc0, 0xcd,
+      0xee, 0xe3, 0xf4, 0xf9, 0xb2, 0xbf, 0xa8, 0xa5, 0x86, 0x8b, 0x9c, 0x91,
+      0x0a, 0x07, 0x10, 0x1d, 0x3e, 0x33, 0x24, 0x29, 0x62, 0x6f, 0x78, 0x75,
+      0x56, 0x5b, 0x4c, 0x41, 0x61, 0x6c, 0x7b, 0x76, 0x55, 0x58, 0x4f, 0x42,
+      0x09, 0x04, 0x13, 0x1e, 0x3d, 0x30, 0x27, 0x2a, 0xb1, 0xbc, 0xab, 0xa6,
+      0x85, 0x88, 0x9f, 0x92, 0xd9, 0xd4, 0xc3, 0xce, 0xed, 0xe0, 0xf7, 0xfa,
+      0xb7, 0xba, 0xad, 0xa0, 0x83, 0x8e, 0x99, 0x94, 0xdf, 0xd2, 0xc5, 0xc8,
+      0xeb, 0xe6, 0xf1, 0xfc, 0x67, 0x6a, 0x7d, 0x70, 0x53, 0x5e, 0x49, 0x44,
+      0x0f, 0x02, 0x15, 0x18, 0x3b, 0x36, 0x21, 0x2c, 0x0c, 0x01, 0x16, 0x1b,
+      0x38, 0x35, 0x22, 0x2f, 0x64, 0x69, 0x7e, 0x73, 0x50, 0x5d, 0x4a, 0x47,
+      0xdc, 0xd1, 0xc6, 0xcb, 0xe8, 0xe5, 0xf2, 0xff, 0xb4, 0xb9, 0xae, 0xa3,
+      0x80, 0x8d, 0x9a, 0x97,
+  };
+
+  static const uint8_t ffmul0e[256] = {
+      0x00, 0x0e, 0x1c, 0x12, 0x38, 0x36, 0x24, 0x2a, 0x70, 0x7e, 0x6c, 0x62,
+      0x48, 0x46, 0x54, 0x5a, 0xe0, 0xee, 0xfc, 0xf2, 0xd8, 0xd6, 0xc4, 0xca,
+      0x90, 0x9e, 0x8c, 0x82, 0xa8, 0xa6, 0xb4, 0xba, 0xdb, 0xd5, 0xc7, 0xc9,
+      0xe3, 0xed, 0xff, 0xf1, 0xab, 0xa5, 0xb7, 0xb9, 0x93, 0x9d, 0x8f, 0x81,
+      0x3b, 0x35, 0x27, 0x29, 0x03, 0x0d, 0x1f, 0x11, 0x4b, 0x45, 0x57, 0x59,
+      0x73, 0x7d, 0x6f, 0x61, 0xad, 0xa3, 0xb1, 0xbf, 0x95, 0x9b, 0x89, 0x87,
+      0xdd, 0xd3, 0xc1, 0xcf, 0xe5, 0xeb, 0xf9, 0xf7, 0x4d, 0x43, 0x51, 0x5f,
+      0x75, 0x7b, 0x69, 0x67, 0x3d, 0x33, 0x21, 0x2f, 0x05, 0x0b, 0x19, 0x17,
+      0x76, 0x78, 0x6a, 0x64, 0x4e, 0x40, 0x52, 0x5c, 0x06, 0x08, 0x1a, 0x14,
+      0x3e, 0x30, 0x22, 0x2c, 0x96, 0x98, 0x8a, 0x84, 0xae, 0xa0, 0xb2, 0xbc,
+      0xe6, 0xe8, 0xfa, 0xf4, 0xde, 0xd0, 0xc2, 0xcc, 0x41, 0x4f, 0x5d, 0x53,
+      0x79, 0x77, 0x65, 0x6b, 0x31, 0x3f, 0x2d, 0x23, 0x09, 0x07, 0x15, 0x1b,
+      0xa1, 0xaf, 0xbd, 0xb3, 0x99, 0x97, 0x85, 0x8b, 0xd1, 0xdf, 0xcd, 0xc3,
+      0xe9, 0xe7, 0xf5, 0xfb, 0x9a, 0x94, 0x86, 0x88, 0xa2, 0xac, 0xbe, 0xb0,
+      0xea, 0xe4, 0xf6, 0xf8, 0xd2, 0xdc, 0xce, 0xc0, 0x7a, 0x74, 0x66, 0x68,
+      0x42, 0x4c, 0x5e, 0x50, 0x0a, 0x04, 0x16, 0x18, 0x32, 0x3c, 0x2e, 0x20,
+      0xec, 0xe2, 0xf0, 0xfe, 0xd4, 0xda, 0xc8, 0xc6, 0x9c, 0x92, 0x80, 0x8e,
+      0xa4, 0xaa, 0xb8, 0xb6, 0x0c, 0x02, 0x10, 0x1e, 0x34, 0x3a, 0x28, 0x26,
+      0x7c, 0x72, 0x60, 0x6e, 0x44, 0x4a, 0x58, 0x56, 0x37, 0x39, 0x2b, 0x25,
+      0x0f, 0x01, 0x13, 0x1d, 0x47, 0x49, 0x5b, 0x55, 0x7f, 0x71, 0x63, 0x6d,
+      0xd7, 0xd9, 0xcb, 0xc5, 0xef, 0xe1, 0xf3, 0xfd, 0xa7, 0xa9, 0xbb, 0xb5,
+      0x9f, 0x91, 0x83, 0x8d,
+  };
+
+  x &= 255;  // Only the low byte of x is used as the table index.
+  switch (table) {  // `table` is the constant to multiply by.
+    case 0x2:
+      return ffmul02[x];
+    case 0x3:
+      return ffmul03[x];
+    case 0x9:
+      return ffmul09[x];
+    case 0xb:
+      return ffmul0b[x];
+    case 0xd:
+      return ffmul0d[x];
+    case 0xe:
+      return ffmul0e[x];
+    case 0:
+      // Case 0 indicates no table lookup, used for some forward mix stages.
+      return static_cast<uint8_t>(x);
+    default:
+      VIXL_UNREACHABLE();
+      return static_cast<uint8_t>(x);
+  }
+}
+
+
+static uint8_t AESMixInner(uint64_t* x, int stage, bool inverse) {
+  VIXL_ASSERT(IsUint2(stage));
+  // Returns byte `stage` of a mixed column; x[0..3] hold the column's bytes.
+  // Multipliers passed to GalMul; 0 selects its no-lookup (times one) case.
+  static const int imc_gm[4] = {0xb, 0xd, 0x9, 0xe};
+  static const int mc_gm[4] = {0x3, 0x0, 0x0, 0x2};
+  const int* gm = inverse ? imc_gm : mc_gm;
+  int index = 3 - stage;
+  // Each stage uses the same multipliers, rotated; `% 4` keeps gm[] in range.
+  uint8_t result = 0;
+  for (int i = 0; i < 4; i++) {
+    result ^= GalMul(gm[(index + i) % 4], x[i]);
+  }
+  return result;
+}
+
+
+LogicVRegister Simulator::aesmix(LogicVRegister dst,
+                                 const LogicVRegister& src,
+                                 bool inverse) {
+  // Copy out all sixteen source bytes before dst is prepared for writing.
+  uint64_t bytes[16] = {};
+  src.UintArray(kFormat16B, bytes);
+  dst.ClearForWrite(kFormat16B);
+  // Mix each group of four bytes; AESMixInner yields one output byte per call.
+  for (int base = 0; base < 16; base += 4) {
+    for (int stage = 0; stage < 4; stage++) {
+      uint8_t mixed = AESMixInner(&bytes[base], stage, inverse);
+      dst.SetUint(kFormat16B, base + stage, mixed);
+    }
+  }
+  return dst;
+}
+
+LogicVRegister Simulator::aes(LogicVRegister dst,
+                              const LogicVRegister& src,
+                              bool decrypt) {
+  dst.ClearForWrite(kFormat16B);
+  // Permutes then substitutes the 16 source bytes; no key is mixed in here.
+  // (Inverse) shift rows: dst[i] = src[shift[i]] (shift_inv when decrypting).
+  uint8_t shift[] = {0, 5, 10, 15, 4, 9, 14, 3, 8, 13, 2, 7, 12, 1, 6, 11};
+  uint8_t shift_inv[] = {0, 13, 10, 7, 4, 1, 14, 11, 8, 5, 2, 15, 12, 9, 6, 3};
+  for (int i = 0; i < LaneCountFromFormat(kFormat16B); i++) {
+    uint8_t index = decrypt ? shift_inv[i] : shift[i];
+    dst.SetUint(kFormat16B, i, src.Uint(kFormat16B, index));
+  }
+
+  // (Inverse) substitute bytes: map each byte of dst through gf2 or gf2_inv.
+  static const uint8_t gf2[256] = {
+      0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5, 0x30, 0x01, 0x67, 0x2b,
+      0xfe, 0xd7, 0xab, 0x76, 0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0,
+      0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0, 0xb7, 0xfd, 0x93, 0x26,
+      0x36, 0x3f, 0xf7, 0xcc, 0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15,
+      0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a, 0x07, 0x12, 0x80, 0xe2,
+      0xeb, 0x27, 0xb2, 0x75, 0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0,
+      0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84, 0x53, 0xd1, 0x00, 0xed,
+      0x20, 0xfc, 0xb1, 0x5b, 0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf,
+      0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85, 0x45, 0xf9, 0x02, 0x7f,
+      0x50, 0x3c, 0x9f, 0xa8, 0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5,
+      0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2, 0xcd, 0x0c, 0x13, 0xec,
+      0x5f, 0x97, 0x44, 0x17, 0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73,
+      0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88, 0x46, 0xee, 0xb8, 0x14,
+      0xde, 0x5e, 0x0b, 0xdb, 0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c,
+      0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79, 0xe7, 0xc8, 0x37, 0x6d,
+      0x8d, 0xd5, 0x4e, 0xa9, 0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08,
+      0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6, 0xe8, 0xdd, 0x74, 0x1f,
+      0x4b, 0xbd, 0x8b, 0x8a, 0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e,
+      0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e, 0xe1, 0xf8, 0x98, 0x11,
+      0x69, 0xd9, 0x8e, 0x94, 0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf,
+      0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68, 0x41, 0x99, 0x2d, 0x0f,
+      0xb0, 0x54, 0xbb, 0x16,
+  };
+  static const uint8_t gf2_inv[256] = {
+      0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38, 0xbf, 0x40, 0xa3, 0x9e,
+      0x81, 0xf3, 0xd7, 0xfb, 0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87,
+      0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb, 0x54, 0x7b, 0x94, 0x32,
+      0xa6, 0xc2, 0x23, 0x3d, 0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e,
+      0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2, 0x76, 0x5b, 0xa2, 0x49,
+      0x6d, 0x8b, 0xd1, 0x25, 0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16,
+      0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92, 0x6c, 0x70, 0x48, 0x50,
+      0xfd, 0xed, 0xb9, 0xda, 0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84,
+      0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a, 0xf7, 0xe4, 0x58, 0x05,
+      0xb8, 0xb3, 0x45, 0x06, 0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02,
+      0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b, 0x3a, 0x91, 0x11, 0x41,
+      0x4f, 0x67, 0xdc, 0xea, 0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73,
+      0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85, 0xe2, 0xf9, 0x37, 0xe8,
+      0x1c, 0x75, 0xdf, 0x6e, 0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89,
+      0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b, 0xfc, 0x56, 0x3e, 0x4b,
+      0xc6, 0xd2, 0x79, 0x20, 0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4,
+      0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31, 0xb1, 0x12, 0x10, 0x59,
+      0x27, 0x80, 0xec, 0x5f, 0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d,
+      0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef, 0xa0, 0xe0, 0x3b, 0x4d,
+      0xae, 0x2a, 0xf5, 0xb0, 0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61,
+      0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26, 0xe1, 0x69, 0x14, 0x63,
+      0x55, 0x21, 0x0c, 0x7d,
+  };
+
+  for (int i = 0; i < LaneCountFromFormat(kFormat16B); i++) {
+    const uint8_t* table = decrypt ? gf2_inv : gf2;
+    dst.SetUint(kFormat16B, i, table[dst.Uint(kFormat16B, i)]);
+  }
+  return dst;
+}
+
+LogicVRegister Simulator::sm3partw1(LogicVRegister srcdst,
+                                    const LogicVRegister& src1,
+                                    const LogicVRegister& src2) {
+  using namespace std::placeholders;
+  auto ROL = std::bind(RotateLeft, _1, _2, kSRegSize);
+  // ROL(v, n) calls RotateLeft(v, n, kSRegSize).
+  SimVRegister temp;
+
+  ext(kFormat16B, temp, src2, temp, 4);
+  rol(kFormat4S, temp, temp, 15);
+  eor(kFormat4S, temp, temp, src1);
+  LogicVRegister r = eor(kFormat4S, temp, temp, srcdst);
+  // Finish on a scalar copy of r's four S lanes, then write it to srcdst.
+  uint64_t result[4] = {};
+  r.UintArray(kFormat4S, result);
+  for (int i = 0; i < 4; i++) {
+    if (i == 3) {
+      // result[3] already contains srcdst[3] ^ src1[3] from the operations
+      // above.
+      result[i] ^= ROL(result[0], 15);  // result[0] is already final here.
+    }
+    result[i] ^= ROL(result[i], 15) ^ ROL(result[i], 23);
+  }
+  srcdst.SetUintArray(kFormat4S, result);
+  return srcdst;
+}
+
+LogicVRegister Simulator::sm3partw2(LogicVRegister srcdst,
+                                    const LogicVRegister& src1,
+                                    const LogicVRegister& src2) {
+  using namespace std::placeholders;
+  auto ROL = std::bind(RotateLeft, _1, _2, kSRegSize);
+  // ROL(v, n) calls RotateLeft(v, n, kSRegSize).
+  SimVRegister temp;
+  VectorFormat vf = kFormat4S;
+
+  rol(vf, temp, src2, 7);
+  LogicVRegister r = eor(vf, temp, temp, src1);
+  eor(vf, srcdst, temp, srcdst);
+  // Lane 3 of srcdst also absorbs a value derived from lane 0 of r.
+  uint64_t tmp2 = ROL(r.Uint(vf, 0), 15);
+  tmp2 ^= ROL(tmp2, 15) ^ ROL(tmp2, 23);
+  srcdst.SetUint(vf, 3, srcdst.Uint(vf, 3) ^ tmp2);
+  return srcdst;
+}
+
+LogicVRegister Simulator::sm3ss1(LogicVRegister dst,
+                                 const LogicVRegister& src1,
+                                 const LogicVRegister& src2,
+                                 const LogicVRegister& src3) {
+  // Only lane 3 of each 4S source is read, and every source lane is fetched
+  // before dst is cleared.
+  const VectorFormat vf = kFormat4S;
+  const uint64_t rotated = RotateLeft(src1.Uint(vf, 3), 12, kSRegSize);
+  const uint64_t addend = src2.Uint(vf, 3) + src3.Uint(vf, 3);
+  const uint64_t sum = rotated + addend;
+  dst.Clear();
+  dst.SetUint(vf, 3, RotateLeft(sum, 7, kSRegSize));
+  return dst;
+}
+
+LogicVRegister Simulator::sm3tt1(LogicVRegister srcdst,
+                                 const LogicVRegister& src1,
+                                 const LogicVRegister& src2,
+                                 int index,
+                                 bool is_a) {
+  VectorFormat vf = kFormat4S;
+  using namespace std::placeholders;
+  auto ROL = std::bind(RotateLeft, _1, _2, kSRegSize);
+  auto sd = std::bind(&LogicVRegister::Uint, srcdst, vf, _1);
+  // sd(i) evaluates Uint(vf, i) on the bind's own copy of srcdst.
+  VIXL_ASSERT(IsUint2(index));
+
+  uint64_t wjprime = src2.Uint(vf, index);
+  uint64_t ss2 = src1.Uint(vf, 3) ^ ROL(sd(3), 12);
+
+  uint64_t tt1;
+  if (is_a) {
+    tt1 = CryptoOp<"parity"_h>(sd(1), sd(2), sd(3));
+  } else {
+    tt1 = CryptoOp<"majority"_h>(sd(1), sd(2), sd(3));
+  }
+  tt1 += sd(0) + ss2 + wjprime;
+  // NOTE(review): sd(1) below runs after ext(); presumably sees shifted lanes.
+  ext(kFormat16B, srcdst, srcdst, srcdst, 4);
+  srcdst.SetUint(vf, 1, ROL(sd(1), 9));
+  srcdst.SetUint(vf, 3, tt1);
+  return srcdst;
+}
+
+LogicVRegister Simulator::sm3tt2(LogicVRegister srcdst,
+                                 const LogicVRegister& src1,
+                                 const LogicVRegister& src2,
+                                 int index,
+                                 bool is_a) {
+  VectorFormat vf = kFormat4S;
+  using namespace std::placeholders;
+  auto ROL = std::bind(RotateLeft, _1, _2, kSRegSize);
+  auto sd = std::bind(&LogicVRegister::Uint, srcdst, vf, _1);
+  // sd(i) evaluates Uint(vf, i) on the bind's own copy of srcdst.
+  VIXL_ASSERT(IsUint2(index));
+
+  uint64_t wj = src2.Uint(vf, index);
+
+  uint64_t tt2;
+  if (is_a) {
+    tt2 = CryptoOp<"parity"_h>(sd(1), sd(2), sd(3));
+  } else {
+    tt2 = CryptoOp<"choose"_h>(sd(3), sd(2), sd(1));
+  }
+  tt2 += sd(0) + src1.Uint(vf, 3) + wj;
+  // NOTE(review): sd(1) below runs after ext(); presumably sees shifted lanes.
+  ext(kFormat16B, srcdst, srcdst, srcdst, 4);
+  srcdst.SetUint(vf, 1, ROL(sd(1), 19));
+  tt2 ^= ROL(tt2, 9) ^ ROL(tt2, 17);
+  srcdst.SetUint(vf, 3, tt2);
+  return srcdst;
+}
+
+static uint64_t SM4SBox(uint64_t x) {
+  static const uint8_t sbox[256] = {
+      0x48, 0x39, 0xcb, 0xd7, 0x3e, 0x5f, 0xee, 0x79, 0x20, 0x4d, 0xdc, 0x3a,
+      0xec, 0x7d, 0xf0, 0x18, 0x84, 0xc6, 0x6e, 0xc5, 0x09, 0xf1, 0xb9, 0x65,
+      0x7e, 0x77, 0x96, 0x0c, 0x4a, 0x97, 0x69, 0x89, 0xb0, 0xb4, 0xe5, 0xb8,
+      0x12, 0xd0, 0x74, 0x2d, 0xbd, 0x7b, 0xcd, 0xa5, 0x88, 0x31, 0xc1, 0x0a,
+      0xd8, 0x5a, 0x10, 0x1f, 0x41, 0x5c, 0xd9, 0x11, 0x7f, 0xbc, 0xdd, 0xbb,
+      0x92, 0xaf, 0x1b, 0x8d, 0x51, 0x5b, 0x6c, 0x6d, 0x72, 0x6a, 0xff, 0x03,
+      0x2f, 0x8e, 0xfd, 0xde, 0x45, 0x37, 0xdb, 0xd5, 0x6f, 0x4e, 0x53, 0x0d,
+      0xab, 0x23, 0x29, 0xc0, 0x60, 0xca, 0x66, 0x82, 0x2e, 0xe2, 0xf6, 0x1d,
+      0xe3, 0xb1, 0x8c, 0xf5, 0x30, 0x32, 0x93, 0xad, 0x55, 0x1a, 0x34, 0x9b,
+      0xa4, 0x5d, 0xae, 0xe0, 0xa1, 0x15, 0x61, 0xf9, 0xce, 0xf2, 0xf7, 0xa3,
+      0xb5, 0x38, 0xc7, 0x40, 0xd2, 0x8a, 0xbf, 0xea, 0x9e, 0xc8, 0xc4, 0xa0,
+      0xe7, 0x02, 0x36, 0x4c, 0x52, 0x27, 0xd3, 0x9f, 0x57, 0x46, 0x00, 0xd4,
+      0x87, 0x78, 0x21, 0x01, 0x3b, 0x7c, 0x22, 0x25, 0xa2, 0xd1, 0x58, 0x63,
+      0x5e, 0x0e, 0x24, 0x1e, 0x35, 0x9d, 0x56, 0x70, 0x4b, 0x0f, 0xeb, 0xf8,
+      0x8b, 0xda, 0x64, 0x71, 0xb2, 0x81, 0x6b, 0x68, 0xa8, 0x4f, 0x85, 0xe6,
+      0x19, 0x3c, 0x59, 0x83, 0xba, 0x17, 0x73, 0xf3, 0xfc, 0xa7, 0x07, 0x47,
+      0xa6, 0x3f, 0x8f, 0x75, 0xfa, 0x94, 0xdf, 0x80, 0x95, 0xe8, 0x08, 0xc9,
+      0xa9, 0x1c, 0xb3, 0xe4, 0x62, 0xac, 0xcf, 0xed, 0x43, 0x0b, 0x54, 0x33,
+      0x7a, 0x98, 0xef, 0x91, 0xf4, 0x50, 0x42, 0x9c, 0x99, 0x06, 0x86, 0x49,
+      0x26, 0x13, 0x44, 0xaa, 0xc3, 0x04, 0xbe, 0x2a, 0x76, 0x9a, 0x67, 0x2b,
+      0x05, 0x2c, 0xfb, 0x28, 0xc2, 0x14, 0xb6, 0x16, 0xb7, 0x3d, 0xe1, 0xcc,
+      0xfe, 0xe9, 0x90, 0xd6,
+  };
+  uint64_t result = 0;  // Bits of x above bit 31 are never read.
+  for (int j = 24; j >= 0; j -= 8) {  // Bytes 3, 2, 1, then 0 of x.
+    uint8_t s = 255 - ((x >> j) & 0xff);  // sbox[] is indexed from its end.
+    result = (result << 8) | sbox[s];
+  }
+  return result;
+}
+
+LogicVRegister Simulator::sm4(LogicVRegister srcdst,
+                              const LogicVRegister& src1,
+                              const LogicVRegister& src2,
+                              bool is_key) {
+  using namespace std::placeholders;
+  auto ROL = std::bind(RotateLeft, _1, _2, kSRegSize);
+  // Four rounds over a four-word state; each round replaces the oldest word.
+  VectorFormat vf = kFormat4S;
+  uint64_t result[4] = {};
+  if (is_key) {
+    src1.UintArray(vf, result);
+  } else {
+    srcdst.UintArray(vf, result);
+  }
+
+  for (int i = 0; i < 4; i++) {
+    uint64_t k = is_key ? src2.Uint(vf, i) : src1.Uint(vf, i);
+    uint64_t intval = result[3] ^ result[2] ^ result[1] ^ k;
+    intval = SM4SBox(intval);
+
+    if (is_key) {
+      intval ^= ROL(intval, 13) ^ ROL(intval, 23);
+    } else {
+      intval ^=
+          ROL(intval, 2) ^ ROL(intval, 10) ^ ROL(intval, 18) ^ ROL(intval, 24);
+    }
+
+    intval ^= result[0];
+    // Shift the state down one word and append the new value as result[3].
+    result[0] = result[1];
+    result[1] = result[2];
+    result[2] = result[3];
+    result[3] = intval;
+  }
+  srcdst.SetUintArray(vf, result);
+  return srcdst;
+}
+
 }  // namespace aarch64
 }  // namespace vixl
 
diff --git a/src/aarch64/macro-assembler-aarch64.cc b/src/aarch64/macro-assembler-aarch64.cc
index 8e1bb2f2..51669fd0 100644
--- a/src/aarch64/macro-assembler-aarch64.cc
+++ b/src/aarch64/macro-assembler-aarch64.cc
@@ -1240,11 +1240,14 @@ void MacroAssembler::Ccmp(const Register& rn,
                           StatusFlags nzcv,
                           Condition cond) {
   VIXL_ASSERT(allow_macro_instructions_);
-  if (operand.IsImmediate() && (operand.GetImmediate() < 0)) {
-    ConditionalCompareMacro(rn, -operand.GetImmediate(), nzcv, cond, CCMN);
-  } else {
-    ConditionalCompareMacro(rn, operand, nzcv, cond, CCMP);
+  if (operand.IsImmediate()) {
+    int64_t imm = operand.GetImmediate();
+    if ((imm < 0) && CanBeNegated(imm)) {
+      ConditionalCompareMacro(rn, -imm, nzcv, cond, CCMN);
+      return;
+    }
   }
+  ConditionalCompareMacro(rn, operand, nzcv, cond, CCMP);
 }
 
 
@@ -1253,11 +1256,14 @@ void MacroAssembler::Ccmn(const Register& rn,
                           StatusFlags nzcv,
                           Condition cond) {
   VIXL_ASSERT(allow_macro_instructions_);
-  if (operand.IsImmediate() && (operand.GetImmediate() < 0)) {
-    ConditionalCompareMacro(rn, -operand.GetImmediate(), nzcv, cond, CCMP);
-  } else {
-    ConditionalCompareMacro(rn, operand, nzcv, cond, CCMN);
+  if (operand.IsImmediate()) {
+    int64_t imm = operand.GetImmediate();
+    if ((imm < 0) && CanBeNegated(imm)) {
+      ConditionalCompareMacro(rn, -imm, nzcv, cond, CCMP);
+      return;
+    }
   }
+  ConditionalCompareMacro(rn, operand, nzcv, cond, CCMN);
 }
 
 
@@ -1491,8 +1497,7 @@ void MacroAssembler::Add(const Register& rd,
   VIXL_ASSERT(allow_macro_instructions_);
   if (operand.IsImmediate()) {
     int64_t imm = operand.GetImmediate();
-    if ((imm < 0) && (imm != std::numeric_limits<int64_t>::min()) &&
-        IsImmAddSub(-imm)) {
+    if ((imm < 0) && CanBeNegated(imm) && IsImmAddSub(-imm)) {
       AddSubMacro(rd, rn, -imm, S, SUB);
       return;
     }
@@ -1579,8 +1584,7 @@ void MacroAssembler::Sub(const Register& rd,
   VIXL_ASSERT(allow_macro_instructions_);
   if (operand.IsImmediate()) {
     int64_t imm = operand.GetImmediate();
-    if ((imm < 0) && (imm != std::numeric_limits<int64_t>::min()) &&
-        IsImmAddSub(-imm)) {
+    if ((imm < 0) && CanBeNegated(imm) && IsImmAddSub(-imm)) {
       AddSubMacro(rd, rn, -imm, S, ADD);
       return;
     }
@@ -1749,7 +1753,7 @@ void MacroAssembler::Fmov(VRegister vd, Float16 imm) {
 
 void MacroAssembler::Neg(const Register& rd, const Operand& operand) {
   VIXL_ASSERT(allow_macro_instructions_);
-  if (operand.IsImmediate()) {
+  if (operand.IsImmediate() && CanBeNegated(operand.GetImmediate())) {
     Mov(rd, -operand.GetImmediate());
   } else {
     Sub(rd, AppropriateZeroRegFor(rd), operand);
@@ -2065,6 +2069,22 @@ void MacroAssembler::Setf16(const Register& wn) {
   setf16(wn);
 }
 
+void MacroAssembler::Chkfeat(const Register& xdn) {
+  // chkfeat is only emitted on x16 here; other registers are staged via x16.
+  VIXL_ASSERT(allow_macro_instructions_);
+  MacroEmissionCheckScope guard(this);
+  if (xdn.Is(x16)) {
+    chkfeat(xdn);
+    return;
+  }
+  UseScratchRegisterScope temps(this);
+  if (!temps.TryAcquire(x16)) {
+    VIXL_ABORT();
+  }
+  Mov(x16, xdn);
+  chkfeat(x16);
+  Mov(xdn, x16);
+}
 
 #define DEFINE_FUNCTION(FN, REGTYPE, REG, OP)                          \
   void MacroAssembler::FN(const REGTYPE REG, const MemOperand& addr) { \
diff --git a/src/aarch64/macro-assembler-aarch64.h b/src/aarch64/macro-assembler-aarch64.h
index 0d231f62..4666550a 100644
--- a/src/aarch64/macro-assembler-aarch64.h
+++ b/src/aarch64/macro-assembler-aarch64.h
@@ -2844,6 +2844,27 @@ MacroAssembler(PandaAllocator* allocator, byte* buffer,
     subps(xd, xn, xm);
   }
   void Cmpp(const Register& xn, const Register& xm) { Subps(xzr, xn, xm); }
+  void Chkfeat(const Register& xdn);
+  void Gcspushm(const Register& rt) {  // Guarded wrapper for gcspushm.
+    VIXL_ASSERT(allow_macro_instructions_);
+    SingleEmissionCheckScope guard(this);
+    gcspushm(rt);
+  }
+  void Gcspopm(const Register& rt = xzr) {  // rt defaults to xzr.
+    VIXL_ASSERT(allow_macro_instructions_);
+    SingleEmissionCheckScope guard(this);
+    gcspopm(rt);
+  }
+  void Gcsss1(const Register& rt) {  // Guarded wrapper for gcsss1.
+    VIXL_ASSERT(allow_macro_instructions_);
+    SingleEmissionCheckScope guard(this);
+    gcsss1(rt);
+  }
+  void Gcsss2(const Register& rt) {  // Guarded wrapper for gcsss2.
+    VIXL_ASSERT(allow_macro_instructions_);
+    SingleEmissionCheckScope guard(this);
+    gcsss2(rt);
+  }
 
 // NEON 3 vector register instructions.
 #define NEON_3VREG_MACRO_LIST(V) \
@@ -2893,6 +2914,7 @@ MacroAssembler(PandaAllocator* allocator, byte* buffer,
   V(pmull2, Pmull2)              \
   V(raddhn, Raddhn)              \
   V(raddhn2, Raddhn2)            \
+  V(rax1, Rax1)                  \
   V(rsubhn, Rsubhn)              \
   V(rsubhn2, Rsubhn2)            \
   V(saba, Saba)                  \
@@ -2905,8 +2927,21 @@ MacroAssembler(PandaAllocator* allocator, byte* buffer,
   V(saddl2, Saddl2)              \
   V(saddw, Saddw)                \
   V(saddw2, Saddw2)              \
+  V(sha1c, Sha1c)                \
+  V(sha1m, Sha1m)                \
+  V(sha1p, Sha1p)                \
+  V(sha1su0, Sha1su0)            \
+  V(sha256h, Sha256h)            \
+  V(sha256h2, Sha256h2)          \
+  V(sha256su1, Sha256su1)        \
+  V(sha512h, Sha512h)            \
+  V(sha512h2, Sha512h2)          \
+  V(sha512su1, Sha512su1)        \
   V(shadd, Shadd)                \
   V(shsub, Shsub)                \
+  V(sm3partw1, Sm3partw1)        \
+  V(sm3partw2, Sm3partw2)        \
+  V(sm4ekey, Sm4ekey)            \
   V(smax, Smax)                  \
   V(smaxp, Smaxp)                \
   V(smin, Smin)                  \
@@ -3001,6 +3036,10 @@ MacroAssembler(PandaAllocator* allocator, byte* buffer,
   V(abs, Abs)                    \
   V(addp, Addp)                  \
   V(addv, Addv)                  \
+  V(aesd, Aesd)                  \
+  V(aese, Aese)                  \
+  V(aesimc, Aesimc)              \
+  V(aesmc, Aesmc)                \
   V(cls, Cls)                    \
   V(clz, Clz)                    \
   V(cnt, Cnt)                    \
@@ -3049,6 +3088,11 @@ MacroAssembler(PandaAllocator* allocator, byte* buffer,
   V(sadalp, Sadalp)              \
   V(saddlp, Saddlp)              \
   V(saddlv, Saddlv)              \
+  V(sha1h, Sha1h)                \
+  V(sha1su1, Sha1su1)            \
+  V(sha256su0, Sha256su0)        \
+  V(sha512su0, Sha512su0)        \
+  V(sm4e, Sm4e)                  \
   V(smaxv, Smaxv)                \
   V(sminv, Sminv)                \
   V(sqabs, Sqabs)                \
@@ -3139,7 +3183,11 @@ MacroAssembler(PandaAllocator* allocator, byte* buffer,
   V(umlsl, Umlsl)                    \
   V(umlsl2, Umlsl2)                  \
   V(sudot, Sudot)                    \
-  V(usdot, Usdot)
+  V(usdot, Usdot)                    \
+  V(sm3tt1a, Sm3tt1a)                \
+  V(sm3tt1b, Sm3tt1b)                \
+  V(sm3tt2a, Sm3tt2a)                \
+  V(sm3tt2b, Sm3tt2b)
 
 
 #define DEFINE_MACRO_ASM_FUNC(ASM, MASM)    \
@@ -3258,6 +3306,14 @@ MacroAssembler(PandaAllocator* allocator, byte* buffer,
   SVE_3VREG_COMMUTATIVE_MACRO_LIST(DEFINE_MACRO_ASM_FUNC)
 #undef DEFINE_MACRO_ASM_FUNC
 
+  void Bcax(const VRegister& vd,
+            const VRegister& vn,
+            const VRegister& vm,
+            const VRegister& va) {  // Guarded wrapper for bcax.
+    VIXL_ASSERT(allow_macro_instructions_);
+    SingleEmissionCheckScope guard(this);
+    bcax(vd, vn, vm, va);
+  }
   void Bic(const VRegister& vd, const int imm8, const int left_shift = 0) {
     VIXL_ASSERT(allow_macro_instructions_);
     SingleEmissionCheckScope guard(this);
@@ -3298,6 +3354,14 @@ MacroAssembler(PandaAllocator* allocator, byte* buffer,
     SingleEmissionCheckScope guard(this);
     dup(vd, rn);
   }
+  void Eor3(const VRegister& vd,
+            const VRegister& vn,
+            const VRegister& vm,
+            const VRegister& va) {  // Guarded wrapper for eor3.
+    VIXL_ASSERT(allow_macro_instructions_);
+    SingleEmissionCheckScope guard(this);
+    eor3(vd, vn, vm, va);
+  }
   void Ext(const VRegister& vd,
            const VRegister& vn,
            const VRegister& vm,
@@ -3594,6 +3658,14 @@ MacroAssembler(PandaAllocator* allocator, byte* buffer,
     SingleEmissionCheckScope guard(this);
     st4(vt, vt2, vt3, vt4, lane, dst);
   }
+  void Sm3ss1(const VRegister& vd,
+              const VRegister& vn,
+              const VRegister& vm,
+              const VRegister& va) {  // Guarded wrapper for sm3ss1.
+    VIXL_ASSERT(allow_macro_instructions_);
+    SingleEmissionCheckScope guard(this);
+    sm3ss1(vd, vn, vm, va);
+  }
   void Smov(const Register& rd, const VRegister& vn, int vn_index) {
     VIXL_ASSERT(allow_macro_instructions_);
     SingleEmissionCheckScope guard(this);
@@ -3604,6 +3676,14 @@ MacroAssembler(PandaAllocator* allocator, byte* buffer,
     SingleEmissionCheckScope guard(this);
     umov(rd, vn, vn_index);
   }
+  void Xar(const VRegister& vd,
+           const VRegister& vn,
+           const VRegister& vm,
+           int rotate) {  // Guarded wrapper for xar.
+    VIXL_ASSERT(allow_macro_instructions_);
+    SingleEmissionCheckScope guard(this);
+    xar(vd, vn, vm, rotate);
+  }
   void Crc32b(const Register& rd, const Register& rn, const Register& rm) {
     VIXL_ASSERT(allow_macro_instructions_);
     SingleEmissionCheckScope guard(this);
@@ -8311,9 +8391,10 @@ MacroAssembler(PandaAllocator* allocator, byte* buffer,
       UseScratchRegisterScope* scratch_scope);
 
   bool LabelIsOutOfRange(Label* label, ImmBranchType branch_type) {
+    int64_t offset = label->GetLocation() - GetCursorOffset();
+    VIXL_ASSERT(IsMultiple(offset, kInstructionSize));
     return !Instruction::IsValidImmPCOffset(branch_type,
-                                            label->GetLocation() -
-                                                GetCursorOffset());
+                                            offset / kInstructionSize);
   }
 
   void ConfigureSimulatorCPUFeaturesHelper(const CPUFeatures& features,
@@ -8727,6 +8808,16 @@ class UseScratchRegisterScope {
     return AcquireFrom(available, kGoverningPRegisterMask).P();
   }
 
+  // Removes required_reg from the list returned by GetScratchRegisterList()
+  // when that list includes an alias of it, and reports whether it did so.
+  // TODO: extend to other scratch register lists.
+  bool TryAcquire(const Register& required_reg) {
+    CPURegList* scratch = masm_->GetScratchRegisterList();
+    if (!scratch->IncludesAliasOf(required_reg)) return false;
+    scratch->Remove(required_reg);
+    return true;
+  }
+
   Register AcquireRegisterOfSize(int size_in_bits);
   Register AcquireSameSizeAs(const Register& reg) {
     return AcquireRegisterOfSize(reg.GetSizeInBits());
diff --git a/src/aarch64/pointer-auth-aarch64.cc b/src/aarch64/pointer-auth-aarch64.cc
index a33f39a8..6bc3751d 100644
--- a/src/aarch64/pointer-auth-aarch64.cc
+++ b/src/aarch64/pointer-auth-aarch64.cc
@@ -151,7 +151,7 @@ uint64_t Simulator::AuthPAC(uint64_t ptr,
 
   uint64_t pac = ComputePAC(original_ptr, context, key);
 
-  uint64_t error_code = 1 << key.number;
+  uint64_t error_code = uint64_t{1} << key.number;
   if ((pac & pac_mask) == (ptr & pac_mask)) {
     return original_ptr;
   } else {
diff --git a/src/aarch64/simulator-aarch64.cc b/src/aarch64/simulator-aarch64.cc
index 8572774b..88827bff 100644
--- a/src/aarch64/simulator-aarch64.cc
+++ b/src/aarch64/simulator-aarch64.cc
@@ -32,8 +32,23 @@
 #include <cstring>
 #include <errno.h>
 #include <limits>
+
+#ifdef _WIN32
+#define WIN32_LEAN_AND_MEAN
+#define NOMINMAX
+#include <Windows.h>
+#undef MultiplyHigh
+#include <Memoryapi.h>
+#else
 #include <sys/mman.h>
 #include <unistd.h>
+#endif
+
+#ifdef _MSC_VER
+#define VIXL_SYNC() MemoryBarrier()
+#else
+#define VIXL_SYNC() __sync_synchronize()
+#endif
 
 namespace vixl {
 namespace aarch64 {
@@ -42,6 +57,25 @@ using vixl::internal::SimFloat16;
 
 const Instruction* Simulator::kEndOfSimAddress = NULL;
 
+MemoryAccessResult TryMemoryAccess(uintptr_t address, uintptr_t access_size) {
+#ifdef VIXL_ENABLE_IMPLICIT_CHECKS
+  for (uintptr_t i = 0; i < access_size; i++) {
+    if (_vixl_internal_ReadMemory(address, i) == MemoryAccessResult::Failure) {
+      // The probe at offset `i` reported a failure; stop at the first one.
+      return MemoryAccessResult::Failure;
+    }
+  }
+
+  // Every offset was probed without a reported failure: either no signal was
+  // raised, or the signal handler did not return MemoryAccessResult::Failure.
+  return MemoryAccessResult::Success;
+#else
+  USE(address);  // Nothing is probed in this configuration.
+  USE(access_size);
+  return MemoryAccessResult::Success;
+#endif  // VIXL_ENABLE_IMPLICIT_CHECKS
+}
+
 bool MetaDataDepot::MetaDataMTE::is_active = false;
 
 void SimSystemRegister::SetBits(int msb, int lsb, uint32_t bits) {
@@ -498,21 +532,27 @@ Simulator::GetFormToVisitorFnMap() {
 Simulator::Simulator(Decoder* decoder, FILE* stream, SimStack::Allocated stack)
     : memory_(std::move(stack)),
       last_instr_(NULL),
-      cpu_features_auditor_(decoder, CPUFeatures::All()) {
+      cpu_features_auditor_(decoder, CPUFeatures::All()),
+      gcs_(kGCSNoStack),
+      gcs_enabled_(false) {
 #else
 Simulator::Simulator(PandaAllocator* allocator, Decoder* decoder, SimStack::Allocated stack, FILE* stream)
     : memory_(std::move(stack)),
       last_instr_(NULL),
       allocator_(allocator),
       cpu_features_auditor_(decoder, CPUFeatures::All()),
-      saved_cpu_features_(allocator_.Adapter()) {
+      saved_cpu_features_(allocator_.Adapter()),
+      gcs_(kGCSNoStack),
+      gcs_enabled_(false) {
 #endif
   // Ensure that shift operations act as the simulator expects.
   VIXL_ASSERT((static_cast<int32_t>(-1) >> 1) == -1);
   VIXL_ASSERT((static_cast<uint32_t>(-1) >> 1) == 0x7fffffff);
 
   // Set up a placeholder pipe for CanReadMemory.
+#ifndef _WIN32
   VIXL_CHECK(pipe(placeholder_pipe_fd_) == 0);
+#endif
 
   // Set up the decoder.
   decoder_ = decoder;
@@ -554,9 +594,8 @@ Simulator::Simulator(PandaAllocator* allocator, Decoder* decoder, SimStack::Allo
   guard_pages_ = false;
 
   // Initialize the common state of RNDR and RNDRRS.
-  uint16_t seed[3] = {11, 22, 33};
-  VIXL_STATIC_ASSERT(sizeof(seed) == sizeof(rand_state_));
-  memcpy(rand_state_, seed, sizeof(rand_state_));
+  uint64_t seed = (11 + (22 << 16) + (static_cast<uint64_t>(33) << 32));
+  rand_gen_.seed(seed);
 
   // Initialize all bits of pseudo predicate register to true.
   LogicPRegister ones(pregister_all_true_);
@@ -634,6 +673,8 @@ void Simulator::ResetState() {
   ResetPRegisters();
 
   WriteSp(memory_.GetStack().GetBase());
+  ResetGCSState();
+  EnableGCSCheck();
 
   pc_ = NULL;
   pc_modified_ = false;
@@ -671,9 +712,16 @@ Simulator::~Simulator() {
 #ifdef PANDA_BUILD
   allocator_.DeleteObject(print_disasm_);
   allocator_.DeleteObject(debugger_);
+#else
+  delete print_disasm_;
 #endif
+#ifndef _WIN32
   close(placeholder_pipe_fd_[0]);
   close(placeholder_pipe_fd_[1]);
+#endif
+  if (IsAllocatedGCS(gcs_)) {
+    GetGCSManager().FreeStack(gcs_);
+  }
 }
 
 
@@ -974,6 +1022,19 @@ vixl_uint128_t Simulator::Add128(vixl_uint128_t x, vixl_uint128_t y) {
   return std::make_pair(sum_hi.first, sum_lo.first);
 }
 
+vixl_uint128_t Simulator::Lsl128(vixl_uint128_t x, unsigned shift) const {
+  // `x` is {high, low}; 0 and 64 are special-cased to avoid shifting by 64.
+  VIXL_ASSERT(shift <= 64);
+  if (shift == 0) return x;
+  if (shift == 64) return std::make_pair(x.second, 0);
+  uint64_t carry = x.second >> (64 - shift);
+  return std::make_pair((x.first << shift) | carry, x.second << shift);
+}
+
+vixl_uint128_t Simulator::Eor128(vixl_uint128_t x, vixl_uint128_t y) const {
+  return std::make_pair(x.first ^ y.first, x.second ^ y.second);
+}
+
 vixl_uint128_t Simulator::Neg128(vixl_uint128_t x) {
   // Negate the integer value. Throw an assertion when the input is INT128_MIN.
   VIXL_ASSERT((x.first != GetSignMask(64)) || (x.second != 0));
@@ -985,11 +1046,11 @@ vixl_uint128_t Simulator::Neg128(vixl_uint128_t x) {
 vixl_uint128_t Simulator::Mul64(uint64_t x, uint64_t y) {
   bool neg_result = false;
   if ((x >> 63) == 1) {
-    x = -x;
+    x = UnsignedNegate(x);
     neg_result = !neg_result;
   }
   if ((y >> 63) == 1) {
-    y = -y;
+    y = UnsignedNegate(y);
     neg_result = !neg_result;
   }
 
@@ -1008,10 +1069,25 @@ vixl_uint128_t Simulator::Mul64(uint64_t x, uint64_t y) {
   vixl_uint128_t result = Add128(a, b);
   result = Add128(result, c);
   result = Add128(result, d);
-  return neg_result ? std::make_pair(-result.first - 1, -result.second)
+  return neg_result ? std::make_pair(UnsignedNegate(result.first) - 1,
+                                     UnsignedNegate(result.second))
                     : result;
 }
 
+vixl_uint128_t Simulator::PolynomialMult128(uint64_t op1,
+                                            uint64_t op2,
+                                            int lane_size_in_bits) const {
+  // Carry-less multiply: XOR in `op2 << bit` for each set bit of `op1`.
+  VIXL_ASSERT(static_cast<unsigned>(lane_size_in_bits) <= kDRegSize);
+  vixl_uint128_t product = std::make_pair(0, 0);
+  const vixl_uint128_t multiplicand = std::make_pair(0, op2);
+  for (int bit = 0; bit < lane_size_in_bits; bit++) {
+    if (((op1 >> bit) & 1) == 0) continue;
+    product = Eor128(product, Lsl128(multiplicand, bit));
+  }
+  return product;
+}
+
 int64_t Simulator::ShiftOperand(unsigned reg_size,
                                 uint64_t uvalue,
                                 Shift shift_type,
@@ -1747,6 +1823,18 @@ void Simulator::PrintSystemRegister(SystemRegister id) {
   }
 }
 
+void Simulator::PrintGCS(bool is_push, uint64_t addr, size_t entry) {
+  // Log a GCS push ("<-") or pop ("->") of `addr`, tagged with `entry`.
+  fprintf(stream_,
+          "# %sgcs0x%04" PRIx64 "[%" PRIu64 "]: %s %s 0x%016" PRIx64 "\n",
+          clr_flag_name,
+          gcs_,
+          static_cast<uint64_t>(entry),
+          clr_normal,
+          is_push ? "<-" : "->",
+          addr);
+}
+
 uint16_t Simulator::PrintPartialAccess(uint16_t access_mask,
                                        uint16_t future_access_mask,
                                        int struct_element_count,
@@ -1794,8 +1882,9 @@ uint16_t Simulator::PrintPartialAccess(uint16_t access_mask,
   const char* sep = "";
   for (int i = struct_element_count - 1; i >= 0; i--) {
     int offset = lane_size_in_bytes * i;
-    uint64_t nibble = MemReadUint(lane_size_in_bytes, address + offset);
-    fprintf(stream_, "%s%0*" PRIx64, sep, lane_size_in_nibbles, nibble);
+    auto nibble = MemReadUint(lane_size_in_bytes, address + offset);
+    VIXL_ASSERT(nibble);
+    fprintf(stream_, "%s%0*" PRIx64, sep, lane_size_in_nibbles, *nibble);
     sep = "'";
   }
   fprintf(stream_,
@@ -2812,6 +2901,23 @@ void Simulator::SimulateSVEInterleavedArithLong(const Instruction* instr) {
   }
 }
 
+void Simulator::SimulateSVEPmull128(const Instruction* instr) {
+  SimVRegister& zd = ReadVRegister(instr->GetRd());
+  SimVRegister& zm = ReadVRegister(instr->GetRm());
+  SimVRegister& zn = ReadVRegister(instr->GetRn());
+  SimVRegister zn_temp, zm_temp;
+  // Pack the even (pmullb) or odd (pmullt) kFormatVnD elements of zn and zm.
+  if (form_hash_ == "pmullb_z_zz_q"_h) {
+    pack_even_elements(kFormatVnD, zn_temp, zn);
+    pack_even_elements(kFormatVnD, zm_temp, zm);
+  } else {
+    VIXL_ASSERT(form_hash_ == "pmullt_z_zz_q"_h);
+    pack_odd_elements(kFormatVnD, zn_temp, zn);
+    pack_odd_elements(kFormatVnD, zm_temp, zm);
+  }
+  pmull(kFormatVnQ, zd, zn_temp, zm_temp);
+}
+
 void Simulator::SimulateSVEIntMulLongVec(const Instruction* instr) {
   VectorFormat vform = instr->GetSVEVectorFormat();
   SimVRegister& zd = ReadVRegister(instr->GetRd());
@@ -2826,15 +2932,15 @@ void Simulator::SimulateSVEIntMulLongVec(const Instruction* instr) {
 
   switch (form_hash_) {
     case "pmullb_z_zz"_h:
-      // '00' is reserved for Q-sized lane.
-      if (vform == kFormatVnB) {
+      // Size '10' is undefined.
+      if (vform == kFormatVnS) {
         VIXL_UNIMPLEMENTED();
       }
       pmull(vform, zd, zn_b, zm_b);
       break;
     case "pmullt_z_zz"_h:
-      // '00' is reserved for Q-sized lane.
-      if (vform == kFormatVnB) {
+      // Size '10' is undefined.
+      if (vform == kFormatVnS) {
         VIXL_UNIMPLEMENTED();
       }
       pmull(vform, zd, zn_t, zm_t);
@@ -3723,6 +3829,7 @@ void Simulator::VisitUnconditionalBranch(const Instruction* instr) {
   switch (instr->Mask(UnconditionalBranchMask)) {
     case BL:
       WriteLr(instr->GetNextInstruction());
+      GCSPush(reinterpret_cast<uint64_t>(instr->GetNextInstruction()));
       VIXL_FALLTHROUGH();
     case B:
       WritePc(instr->GetImmPCOffsetTarget());
@@ -3766,6 +3873,7 @@ void Simulator::VisitUnconditionalBranchToRegister(const Instruction* instr) {
   bool authenticate = false;
   bool link = false;
   bool ret = false;
+  bool compare_gcs = false;
   uint64_t addr = ReadXRegister(instr->GetRn());
   uint64_t context = 0;
 
@@ -3802,16 +3910,13 @@ void Simulator::VisitUnconditionalBranchToRegister(const Instruction* instr) {
       context = ReadXRegister(31, Reg31IsStackPointer);
       VIXL_FALLTHROUGH();
     case RET:
+      compare_gcs = true;
       ret = true;
       break;
     default:
       VIXL_UNREACHABLE();
   }
 
-  if (link) {
-    WriteLr(instr->GetNextInstruction());
-  }
-
   if (authenticate) {
     PACKey key = (instr->ExtractBit(10) == 0) ? kPACKeyIA : kPACKeyIB;
     addr = AuthPAC(addr, context, key, kInstructionPointer);
@@ -3822,6 +3927,34 @@ void Simulator::VisitUnconditionalBranchToRegister(const Instruction* instr) {
     }
   }
 
+  if (compare_gcs) {
+    uint64_t expected_lr = GCSPeek();
+    char msg[128];
+    if (expected_lr != 0) {
+      if ((expected_lr & 0x3) != 0) {
+        snprintf(msg,
+                 sizeof(msg),
+                 "GCS contains misaligned return address: 0x%016" PRIx64 "\n",
+                 expected_lr);
+        ReportGCSFailure(msg);
+      } else if ((addr != 0) && (addr != expected_lr)) {
+        snprintf(msg,
+                 sizeof(msg),
+                 "GCS mismatch: lr = 0x%016" PRIx64 ", gcs = 0x%016" PRIx64
+                 "\n",
+                 addr,
+                 expected_lr);
+        ReportGCSFailure(msg);
+      }
+      GCSPop();
+    }
+  }
+
+  if (link) {
+    WriteLr(instr->GetNextInstruction());
+    GCSPush(reinterpret_cast<uint64_t>(instr->GetNextInstruction()));
+  }
+
   if (!ret) {
     // Check for interceptions to the target address, if one is found, call it.
     MetaDataDepot::BranchInterceptionAbstract* interception =
@@ -4143,10 +4276,12 @@ void Simulator::LoadAcquireRCpcUnscaledOffsetHelper(const Instruction* instr) {
     VIXL_ALIGNMENT_EXCEPTION();
   }
 
-  WriteRegister<T1>(rt, static_cast<T1>(MemRead<T2>(address)));
+  VIXL_DEFINE_OR_RETURN(value, MemRead<T2>(address));
+
+  WriteRegister<T1>(rt, static_cast<T1>(value));
 
   // Approximate load-acquire by issuing a full barrier after the load.
-  __sync_synchronize();
+  VIXL_SYNC();
 
   LogRead(rt, GetPrintRegisterFormat(element_size), address);
 }
@@ -4171,9 +4306,9 @@ void Simulator::StoreReleaseUnscaledOffsetHelper(const Instruction* instr) {
   }
 
   // Approximate store-release by issuing a full barrier after the load.
-  __sync_synchronize();
+  VIXL_SYNC();
 
-  MemWrite<T>(address, ReadRegister<T>(rt));
+  if (!MemWrite<T>(address, ReadRegister<T>(rt))) return;
 
   LogWrite(rt, GetPrintRegisterFormat(element_size), address);
 }
@@ -4260,7 +4395,9 @@ void Simulator::VisitLoadStorePAC(const Instruction* instr) {
   // Verify that the calculated address is available to the host.
   VIXL_ASSERT(address == addr_ptr);
 
-  WriteXRegister(dst, MemRead<uint64_t>(addr_ptr), NoRegLog);
+  VIXL_DEFINE_OR_RETURN(value, MemRead<uint64_t>(addr_ptr));
+
+  WriteXRegister(dst, value, NoRegLog);
   unsigned access_size = 1 << 3;
   LogRead(dst, GetPrintRegisterFormatForSize(access_size), addr_ptr);
 }
@@ -4287,93 +4424,121 @@ void Simulator::LoadStoreHelper(const Instruction* instr,
   int extend_to_size = 0;
   LoadStoreOp op = static_cast<LoadStoreOp>(instr->Mask(LoadStoreMask));
   switch (op) {
-    case LDRB_w:
-      WriteWRegister(srcdst, MemRead<uint8_t>(address), NoRegLog);
+    case LDRB_w: {
+      VIXL_DEFINE_OR_RETURN(value, MemRead<uint8_t>(address));
+      WriteWRegister(srcdst, value, NoRegLog);
       extend_to_size = kWRegSizeInBytes;
       break;
-    case LDRH_w:
-      WriteWRegister(srcdst, MemRead<uint16_t>(address), NoRegLog);
+    }
+    case LDRH_w: {
+      VIXL_DEFINE_OR_RETURN(value, MemRead<uint16_t>(address));
+      WriteWRegister(srcdst, value, NoRegLog);
       extend_to_size = kWRegSizeInBytes;
       break;
-    case LDR_w:
-      WriteWRegister(srcdst, MemRead<uint32_t>(address), NoRegLog);
+    }
+    case LDR_w: {
+      VIXL_DEFINE_OR_RETURN(value, MemRead<uint32_t>(address));
+      WriteWRegister(srcdst, value, NoRegLog);
       extend_to_size = kWRegSizeInBytes;
       break;
-    case LDR_x:
-      WriteXRegister(srcdst, MemRead<uint64_t>(address), NoRegLog);
+    }
+    case LDR_x: {
+      VIXL_DEFINE_OR_RETURN(value, MemRead<uint64_t>(address));
+      WriteXRegister(srcdst, value, NoRegLog);
       extend_to_size = kXRegSizeInBytes;
       break;
-    case LDRSB_w:
-      WriteWRegister(srcdst, MemRead<int8_t>(address), NoRegLog);
+    }
+    case LDRSB_w: {
+      VIXL_DEFINE_OR_RETURN(value, MemRead<int8_t>(address));
+      WriteWRegister(srcdst, value, NoRegLog);
       extend_to_size = kWRegSizeInBytes;
       break;
-    case LDRSH_w:
-      WriteWRegister(srcdst, MemRead<int16_t>(address), NoRegLog);
+    }
+    case LDRSH_w: {
+      VIXL_DEFINE_OR_RETURN(value, MemRead<int16_t>(address));
+      WriteWRegister(srcdst, value, NoRegLog);
       extend_to_size = kWRegSizeInBytes;
       break;
-    case LDRSB_x:
-      WriteXRegister(srcdst, MemRead<int8_t>(address), NoRegLog);
+    }
+    case LDRSB_x: {
+      VIXL_DEFINE_OR_RETURN(value, MemRead<int8_t>(address));
+      WriteXRegister(srcdst, value, NoRegLog);
       extend_to_size = kXRegSizeInBytes;
       break;
-    case LDRSH_x:
-      WriteXRegister(srcdst, MemRead<int16_t>(address), NoRegLog);
+    }
+    case LDRSH_x: {
+      VIXL_DEFINE_OR_RETURN(value, MemRead<int16_t>(address));
+      WriteXRegister(srcdst, value, NoRegLog);
       extend_to_size = kXRegSizeInBytes;
       break;
-    case LDRSW_x:
-      WriteXRegister(srcdst, MemRead<int32_t>(address), NoRegLog);
+    }
+    case LDRSW_x: {
+      VIXL_DEFINE_OR_RETURN(value, MemRead<int32_t>(address));
+      WriteXRegister(srcdst, value, NoRegLog);
       extend_to_size = kXRegSizeInBytes;
       break;
-    case LDR_b:
-      WriteBRegister(srcdst, MemRead<uint8_t>(address), NoRegLog);
+    }
+    case LDR_b: {
+      VIXL_DEFINE_OR_RETURN(value, MemRead<uint8_t>(address));
+      WriteBRegister(srcdst, value, NoRegLog);
       rt_is_vreg = true;
       break;
-    case LDR_h:
-      WriteHRegister(srcdst, MemRead<uint16_t>(address), NoRegLog);
+    }
+    case LDR_h: {
+      VIXL_DEFINE_OR_RETURN(value, MemRead<uint16_t>(address));
+      WriteHRegister(srcdst, value, NoRegLog);
       rt_is_vreg = true;
       break;
-    case LDR_s:
-      WriteSRegister(srcdst, MemRead<float>(address), NoRegLog);
+    }
+    case LDR_s: {
+      VIXL_DEFINE_OR_RETURN(value, MemRead<float>(address));
+      WriteSRegister(srcdst, value, NoRegLog);
       rt_is_vreg = true;
       break;
-    case LDR_d:
-      WriteDRegister(srcdst, MemRead<double>(address), NoRegLog);
+    }
+    case LDR_d: {
+      VIXL_DEFINE_OR_RETURN(value, MemRead<double>(address));
+      WriteDRegister(srcdst, value, NoRegLog);
       rt_is_vreg = true;
       break;
-    case LDR_q:
-      WriteQRegister(srcdst, MemRead<qreg_t>(address), NoRegLog);
+    }
+    case LDR_q: {
+      VIXL_DEFINE_OR_RETURN(value, MemRead<qreg_t>(address));
+      WriteQRegister(srcdst, value, NoRegLog);
       rt_is_vreg = true;
       break;
+    }
 
     case STRB_w:
-      MemWrite<uint8_t>(address, ReadWRegister(srcdst));
+      if (!MemWrite<uint8_t>(address, ReadWRegister(srcdst))) return;
       break;
     case STRH_w:
-      MemWrite<uint16_t>(address, ReadWRegister(srcdst));
+      if (!MemWrite<uint16_t>(address, ReadWRegister(srcdst))) return;
       break;
     case STR_w:
-      MemWrite<uint32_t>(address, ReadWRegister(srcdst));
+      if (!MemWrite<uint32_t>(address, ReadWRegister(srcdst))) return;
       break;
     case STR_x:
-      MemWrite<uint64_t>(address, ReadXRegister(srcdst));
+      if (!MemWrite<uint64_t>(address, ReadXRegister(srcdst))) return;
       break;
     case STR_b:
-      MemWrite<uint8_t>(address, ReadBRegister(srcdst));
+      if (!MemWrite<uint8_t>(address, ReadBRegister(srcdst))) return;
       rt_is_vreg = true;
       break;
     case STR_h:
-      MemWrite<uint16_t>(address, ReadHRegisterBits(srcdst));
+      if (!MemWrite<uint16_t>(address, ReadHRegisterBits(srcdst))) return;
       rt_is_vreg = true;
       break;
     case STR_s:
-      MemWrite<float>(address, ReadSRegister(srcdst));
+      if (!MemWrite<float>(address, ReadSRegister(srcdst))) return;
       rt_is_vreg = true;
       break;
     case STR_d:
-      MemWrite<double>(address, ReadDRegister(srcdst));
+      if (!MemWrite<double>(address, ReadDRegister(srcdst))) return;
       rt_is_vreg = true;
       break;
     case STR_q:
-      MemWrite<qreg_t>(address, ReadQRegister(srcdst));
+      if (!MemWrite<qreg_t>(address, ReadQRegister(srcdst))) return;
       rt_is_vreg = true;
       break;
 
@@ -4454,64 +4619,76 @@ void Simulator::LoadStorePairHelper(const Instruction* instr,
     // Use NoRegLog to suppress the register trace (LOG_REGS, LOG_FP_REGS). We
     // will print a more detailed log.
     case LDP_w: {
-      WriteWRegister(rt, MemRead<uint32_t>(address), NoRegLog);
-      WriteWRegister(rt2, MemRead<uint32_t>(address2), NoRegLog);
+      VIXL_DEFINE_OR_RETURN(value, MemRead<uint32_t>(address));
+      VIXL_DEFINE_OR_RETURN(value2, MemRead<uint32_t>(address2));
+      WriteWRegister(rt, value, NoRegLog);
+      WriteWRegister(rt2, value2, NoRegLog);
       break;
     }
     case LDP_s: {
-      WriteSRegister(rt, MemRead<float>(address), NoRegLog);
-      WriteSRegister(rt2, MemRead<float>(address2), NoRegLog);
+      VIXL_DEFINE_OR_RETURN(value, MemRead<float>(address));
+      VIXL_DEFINE_OR_RETURN(value2, MemRead<float>(address2));
+      WriteSRegister(rt, value, NoRegLog);
+      WriteSRegister(rt2, value2, NoRegLog);
       rt_is_vreg = true;
       break;
     }
     case LDP_x: {
-      WriteXRegister(rt, MemRead<uint64_t>(address), NoRegLog);
-      WriteXRegister(rt2, MemRead<uint64_t>(address2), NoRegLog);
+      VIXL_DEFINE_OR_RETURN(value, MemRead<uint64_t>(address));
+      VIXL_DEFINE_OR_RETURN(value2, MemRead<uint64_t>(address2));
+      WriteXRegister(rt, value, NoRegLog);
+      WriteXRegister(rt2, value2, NoRegLog);
       break;
     }
     case LDP_d: {
-      WriteDRegister(rt, MemRead<double>(address), NoRegLog);
-      WriteDRegister(rt2, MemRead<double>(address2), NoRegLog);
+      VIXL_DEFINE_OR_RETURN(value, MemRead<double>(address));
+      VIXL_DEFINE_OR_RETURN(value2, MemRead<double>(address2));
+      WriteDRegister(rt, value, NoRegLog);
+      WriteDRegister(rt2, value2, NoRegLog);
       rt_is_vreg = true;
       break;
     }
     case LDP_q: {
-      WriteQRegister(rt, MemRead<qreg_t>(address), NoRegLog);
-      WriteQRegister(rt2, MemRead<qreg_t>(address2), NoRegLog);
+      VIXL_DEFINE_OR_RETURN(value, MemRead<qreg_t>(address));
+      VIXL_DEFINE_OR_RETURN(value2, MemRead<qreg_t>(address2));
+      WriteQRegister(rt, value, NoRegLog);
+      WriteQRegister(rt2, value2, NoRegLog);
       rt_is_vreg = true;
       break;
     }
     case LDPSW_x: {
-      WriteXRegister(rt, MemRead<int32_t>(address), NoRegLog);
-      WriteXRegister(rt2, MemRead<int32_t>(address2), NoRegLog);
+      VIXL_DEFINE_OR_RETURN(value, MemRead<int32_t>(address));
+      VIXL_DEFINE_OR_RETURN(value2, MemRead<int32_t>(address2));
+      WriteXRegister(rt, value, NoRegLog);
+      WriteXRegister(rt2, value2, NoRegLog);
       sign_extend = true;
       break;
     }
     case STP_w: {
-      MemWrite<uint32_t>(address, ReadWRegister(rt));
-      MemWrite<uint32_t>(address2, ReadWRegister(rt2));
+      if (!MemWrite<uint32_t>(address, ReadWRegister(rt))) return;
+      if (!MemWrite<uint32_t>(address2, ReadWRegister(rt2))) return;
       break;
     }
     case STP_s: {
-      MemWrite<float>(address, ReadSRegister(rt));
-      MemWrite<float>(address2, ReadSRegister(rt2));
+      if (!MemWrite<float>(address, ReadSRegister(rt))) return;
+      if (!MemWrite<float>(address2, ReadSRegister(rt2))) return;
       rt_is_vreg = true;
       break;
     }
     case STP_x: {
-      MemWrite<uint64_t>(address, ReadXRegister(rt));
-      MemWrite<uint64_t>(address2, ReadXRegister(rt2));
+      if (!MemWrite<uint64_t>(address, ReadXRegister(rt))) return;
+      if (!MemWrite<uint64_t>(address2, ReadXRegister(rt2))) return;
       break;
     }
     case STP_d: {
-      MemWrite<double>(address, ReadDRegister(rt));
-      MemWrite<double>(address2, ReadDRegister(rt2));
+      if (!MemWrite<double>(address, ReadDRegister(rt))) return;
+      if (!MemWrite<double>(address2, ReadDRegister(rt2))) return;
       rt_is_vreg = true;
       break;
     }
     case STP_q: {
-      MemWrite<qreg_t>(address, ReadQRegister(rt));
-      MemWrite<qreg_t>(address2, ReadQRegister(rt2));
+      if (!MemWrite<qreg_t>(address, ReadQRegister(rt))) return;
+      if (!MemWrite<qreg_t>(address2, ReadQRegister(rt2))) return;
       rt_is_vreg = true;
       break;
     }
@@ -4571,18 +4748,19 @@ void Simulator::CompareAndSwapHelper(const Instruction* instr) {
   // associated with that location, even if the compare subsequently fails.
   local_monitor_.Clear();
 
-  T data = MemRead<T>(address);
+  VIXL_DEFINE_OR_RETURN(data, MemRead<T>(address));
+
   if (is_acquire) {
     // Approximate load-acquire by issuing a full barrier after the load.
-    __sync_synchronize();
+    VIXL_SYNC();
   }
 
   if (data == comparevalue) {
     if (is_release) {
       // Approximate store-release by issuing a full barrier before the store.
-      __sync_synchronize();
+      VIXL_SYNC();
     }
-    MemWrite<T>(address, newvalue);
+    if (!MemWrite<T>(address, newvalue)) return;
     LogWrite(rt, GetPrintRegisterFormatForSize(element_size), address);
   }
   WriteRegister<T>(rs, data, NoRegLog);
@@ -4618,12 +4796,12 @@ void Simulator::CompareAndSwapPairHelper(const Instruction* instr) {
   // associated with that location, even if the compare subsequently fails.
   local_monitor_.Clear();
 
-  T data_low = MemRead<T>(address);
-  T data_high = MemRead<T>(address2);
+  VIXL_DEFINE_OR_RETURN(data_low, MemRead<T>(address));
+  VIXL_DEFINE_OR_RETURN(data_high, MemRead<T>(address2));
 
   if (is_acquire) {
     // Approximate load-acquire by issuing a full barrier after the load.
-    __sync_synchronize();
+    VIXL_SYNC();
   }
 
   bool same =
@@ -4631,11 +4809,11 @@ void Simulator::CompareAndSwapPairHelper(const Instruction* instr) {
   if (same) {
     if (is_release) {
       // Approximate store-release by issuing a full barrier before the store.
-      __sync_synchronize();
+      VIXL_SYNC();
     }
 
-    MemWrite<T>(address, newvalue_low);
-    MemWrite<T>(address2, newvalue_high);
+    if (!MemWrite<T>(address, newvalue_low)) return;
+    if (!MemWrite<T>(address2, newvalue_high)) return;
   }
 
   WriteRegister<T>(rs + 1, data_high, NoRegLog);
@@ -4652,6 +4830,7 @@ void Simulator::CompareAndSwapPairHelper(const Instruction* instr) {
 }
 
 bool Simulator::CanReadMemory(uintptr_t address, size_t size) {
+#ifndef _WIN32
   // To simulate fault-tolerant loads, we need to know what host addresses we
   // can access without generating a real fault. One way to do that is to
   // attempt to `write()` the memory to a placeholder pipe[1]. This is more
@@ -4709,6 +4888,44 @@ bool Simulator::CanReadMemory(uintptr_t address, size_t size) {
   }
 
   return can_read;
+#else
+  // To simulate fault-tolerant loads, we need to know what host addresses we
+  // can access without generating a real fault. The pipe-based probe used on
+  // other platforms is not fully compatible with Windows, so query each
+  // region's state and protection with VirtualQuery() instead [2].
+  //
+  // [2]: https://stackoverflow.com/a/18395247/9109981
+
+  bool can_read = true;
+  MEMORY_BASIC_INFORMATION pageInfo;
+
+  size_t checked = 0;
+  while (can_read && (checked < size)) {
+    size_t result = VirtualQuery(reinterpret_cast<void*>(address + checked),
+                                 &pageInfo,
+                                 sizeof(pageInfo));
+    // VirtualQuery() signals failure by returning 0 (never a negative value).
+    if (result == 0) {
+      can_read = false;
+      break;
+    }
+
+    if (pageInfo.State != MEM_COMMIT) {
+      can_read = false;
+      break;
+    }
+
+    if (pageInfo.Protect == PAGE_NOACCESS || pageInfo.Protect == PAGE_EXECUTE) {
+      can_read = false;
+      break;
+    }
+    checked += pageInfo.RegionSize -
+               ((address + checked) -
+                reinterpret_cast<uintptr_t>(pageInfo.BaseAddress));
+  }
+
+  return can_read;
+#endif
 }
 
 void Simulator::PrintExclusiveAccessWarning() {
@@ -4802,54 +5019,66 @@ void Simulator::VisitLoadStoreExclusive(const Instruction* instr) {
           case LDXRB_w:
           case LDAXRB_w:
           case LDARB_w:
-          case LDLARB:
-            WriteWRegister(rt, MemRead<uint8_t>(address), NoRegLog);
+          case LDLARB: {
+            VIXL_DEFINE_OR_RETURN(value, MemRead<uint8_t>(address));
+            WriteWRegister(rt, value, NoRegLog);
             reg_size = kWRegSizeInBytes;
             break;
+          }
           case LDXRH_w:
           case LDAXRH_w:
           case LDARH_w:
-          case LDLARH:
-            WriteWRegister(rt, MemRead<uint16_t>(address), NoRegLog);
+          case LDLARH: {
+            VIXL_DEFINE_OR_RETURN(value, MemRead<uint16_t>(address));
+            WriteWRegister(rt, value, NoRegLog);
             reg_size = kWRegSizeInBytes;
             break;
+          }
           case LDXR_w:
           case LDAXR_w:
           case LDAR_w:
-          case LDLAR_w:
-            WriteWRegister(rt, MemRead<uint32_t>(address), NoRegLog);
+          case LDLAR_w: {
+            VIXL_DEFINE_OR_RETURN(value, MemRead<uint32_t>(address));
+            WriteWRegister(rt, value, NoRegLog);
             reg_size = kWRegSizeInBytes;
             break;
+          }
           case LDXR_x:
           case LDAXR_x:
           case LDAR_x:
-          case LDLAR_x:
-            WriteXRegister(rt, MemRead<uint64_t>(address), NoRegLog);
+          case LDLAR_x: {
+            VIXL_DEFINE_OR_RETURN(value, MemRead<uint64_t>(address));
+            WriteXRegister(rt, value, NoRegLog);
             reg_size = kXRegSizeInBytes;
             break;
+          }
           case LDXP_w:
-          case LDAXP_w:
-            WriteWRegister(rt, MemRead<uint32_t>(address), NoRegLog);
-            WriteWRegister(rt2,
-                           MemRead<uint32_t>(address + element_size),
-                           NoRegLog);
+          case LDAXP_w: {
+            VIXL_DEFINE_OR_RETURN(value, MemRead<uint32_t>(address));
+            VIXL_DEFINE_OR_RETURN(value2,
+                                  MemRead<uint32_t>(address + element_size));
+            WriteWRegister(rt, value, NoRegLog);
+            WriteWRegister(rt2, value2, NoRegLog);
             reg_size = kWRegSizeInBytes;
             break;
+          }
           case LDXP_x:
-          case LDAXP_x:
-            WriteXRegister(rt, MemRead<uint64_t>(address), NoRegLog);
-            WriteXRegister(rt2,
-                           MemRead<uint64_t>(address + element_size),
-                           NoRegLog);
+          case LDAXP_x: {
+            VIXL_DEFINE_OR_RETURN(value, MemRead<uint64_t>(address));
+            VIXL_DEFINE_OR_RETURN(value2,
+                                  MemRead<uint64_t>(address + element_size));
+            WriteXRegister(rt, value, NoRegLog);
+            WriteXRegister(rt2, value2, NoRegLog);
             reg_size = kXRegSizeInBytes;
             break;
+          }
           default:
             VIXL_UNREACHABLE();
         }
 
         if (is_acquire_release) {
           // Approximate load-acquire by issuing a full barrier after the load.
-          __sync_synchronize();
+          VIXL_SYNC();
         }
 
         PrintRegisterFormat format = GetPrintRegisterFormatForSize(reg_size);
@@ -4861,7 +5090,7 @@ void Simulator::VisitLoadStoreExclusive(const Instruction* instr) {
         if (is_acquire_release) {
           // Approximate store-release by issuing a full barrier before the
           // store.
-          __sync_synchronize();
+          VIXL_SYNC();
         }
 
         bool do_store = true;
@@ -4883,35 +5112,41 @@ void Simulator::VisitLoadStoreExclusive(const Instruction* instr) {
             case STLXRB_w:
             case STLRB_w:
             case STLLRB:
-              MemWrite<uint8_t>(address, ReadWRegister(rt));
+              if (!MemWrite<uint8_t>(address, ReadWRegister(rt))) return;
               break;
             case STXRH_w:
             case STLXRH_w:
             case STLRH_w:
             case STLLRH:
-              MemWrite<uint16_t>(address, ReadWRegister(rt));
+              if (!MemWrite<uint16_t>(address, ReadWRegister(rt))) return;
               break;
             case STXR_w:
             case STLXR_w:
             case STLR_w:
             case STLLR_w:
-              MemWrite<uint32_t>(address, ReadWRegister(rt));
+              if (!MemWrite<uint32_t>(address, ReadWRegister(rt))) return;
               break;
             case STXR_x:
             case STLXR_x:
             case STLR_x:
             case STLLR_x:
-              MemWrite<uint64_t>(address, ReadXRegister(rt));
+              if (!MemWrite<uint64_t>(address, ReadXRegister(rt))) return;
               break;
             case STXP_w:
             case STLXP_w:
-              MemWrite<uint32_t>(address, ReadWRegister(rt));
-              MemWrite<uint32_t>(address + element_size, ReadWRegister(rt2));
+              if (!MemWrite<uint32_t>(address, ReadWRegister(rt))) return;
+              if (!MemWrite<uint32_t>(address + element_size,
+                                      ReadWRegister(rt2))) {
+                return;
+              }
               break;
             case STXP_x:
             case STLXP_x:
-              MemWrite<uint64_t>(address, ReadXRegister(rt));
-              MemWrite<uint64_t>(address + element_size, ReadXRegister(rt2));
+              if (!MemWrite<uint64_t>(address, ReadXRegister(rt))) return;
+              if (!MemWrite<uint64_t>(address + element_size,
+                                      ReadXRegister(rt2))) {
+                return;
+              }
               break;
             default:
               VIXL_UNREACHABLE();
@@ -4944,11 +5179,11 @@ void Simulator::AtomicMemorySimpleHelper(const Instruction* instr) {
 
   T value = ReadRegister<T>(rs);
 
-  T data = MemRead<T>(address);
+  VIXL_DEFINE_OR_RETURN(data, MemRead<T>(address));
 
   if (is_acquire) {
     // Approximate load-acquire by issuing a full barrier after the load.
-    __sync_synchronize();
+    VIXL_SYNC();
   }
 
   T result = 0;
@@ -4982,7 +5217,7 @@ void Simulator::AtomicMemorySimpleHelper(const Instruction* instr) {
 
   if (is_release) {
     // Approximate store-release by issuing a full barrier before the store.
-    __sync_synchronize();
+    VIXL_SYNC();
   }
 
   WriteRegister<T>(rt, data, NoRegLog);
@@ -4994,7 +5229,7 @@ void Simulator::AtomicMemorySimpleHelper(const Instruction* instr) {
   PrintRegisterFormat format = GetPrintRegisterFormatForSize(register_size);
   LogExtendingRead(rt, format, element_size, address);
 
-  MemWrite<T>(address, result);
+  if (!MemWrite<T>(address, result)) return;
   format = GetPrintRegisterFormatForSize(element_size);
   LogWrite(rs, format, address);
 }
@@ -5013,17 +5248,18 @@ void Simulator::AtomicMemorySwapHelper(const Instruction* instr) {
 
   CheckIsValidUnalignedAtomicAccess(rn, address, element_size);
 
-  T data = MemRead<T>(address);
+  VIXL_DEFINE_OR_RETURN(data, MemRead<T>(address));
+
   if (is_acquire) {
     // Approximate load-acquire by issuing a full barrier after the load.
-    __sync_synchronize();
+    VIXL_SYNC();
   }
 
   if (is_release) {
     // Approximate store-release by issuing a full barrier before the store.
-    __sync_synchronize();
+    VIXL_SYNC();
   }
-  MemWrite<T>(address, ReadRegister<T>(rs));
+  if (!MemWrite<T>(address, ReadRegister<T>(rs))) return;
 
   WriteRegister<T>(rt, data);
 
@@ -5042,10 +5278,12 @@ void Simulator::LoadAcquireRCpcHelper(const Instruction* instr) {
 
   CheckIsValidUnalignedAtomicAccess(rn, address, element_size);
 
-  WriteRegister<T>(rt, MemRead<T>(address));
+  VIXL_DEFINE_OR_RETURN(value, MemRead<T>(address));
+
+  WriteRegister<T>(rt, value);
 
   // Approximate load-acquire by issuing a full barrier after the load.
-  __sync_synchronize();
+  VIXL_SYNC();
 
   LogRead(rt, GetPrintRegisterFormatForSize(element_size), address);
 }
@@ -5162,30 +5400,42 @@ void Simulator::VisitLoadLiteral(const Instruction* instr) {
   switch (instr->Mask(LoadLiteralMask)) {
     // Use NoRegLog to suppress the register trace (LOG_REGS, LOG_VREGS), then
     // print a more detailed log.
-    case LDR_w_lit:
-      WriteWRegister(rt, MemRead<uint32_t>(address), NoRegLog);
+    case LDR_w_lit: {
+      VIXL_DEFINE_OR_RETURN(value, MemRead<uint32_t>(address));
+      WriteWRegister(rt, value, NoRegLog);
       LogRead(rt, kPrintWReg, address);
       break;
-    case LDR_x_lit:
-      WriteXRegister(rt, MemRead<uint64_t>(address), NoRegLog);
+    }
+    case LDR_x_lit: {
+      VIXL_DEFINE_OR_RETURN(value, MemRead<uint64_t>(address));
+      WriteXRegister(rt, value, NoRegLog);
       LogRead(rt, kPrintXReg, address);
       break;
-    case LDR_s_lit:
-      WriteSRegister(rt, MemRead<float>(address), NoRegLog);
+    }
+    case LDR_s_lit: {
+      VIXL_DEFINE_OR_RETURN(value, MemRead<float>(address));
+      WriteSRegister(rt, value, NoRegLog);
       LogVRead(rt, kPrintSRegFP, address);
       break;
-    case LDR_d_lit:
-      WriteDRegister(rt, MemRead<double>(address), NoRegLog);
+    }
+    case LDR_d_lit: {
+      VIXL_DEFINE_OR_RETURN(value, MemRead<double>(address));
+      WriteDRegister(rt, value, NoRegLog);
       LogVRead(rt, kPrintDRegFP, address);
       break;
-    case LDR_q_lit:
-      WriteQRegister(rt, MemRead<qreg_t>(address), NoRegLog);
+    }
+    case LDR_q_lit: {
+      VIXL_DEFINE_OR_RETURN(value, MemRead<qreg_t>(address));
+      WriteQRegister(rt, value, NoRegLog);
       LogVRead(rt, kPrintReg1Q, address);
       break;
-    case LDRSW_x_lit:
-      WriteXRegister(rt, MemRead<int32_t>(address), NoRegLog);
+    }
+    case LDRSW_x_lit: {
+      VIXL_DEFINE_OR_RETURN(value, MemRead<int32_t>(address));
+      WriteXRegister(rt, value, NoRegLog);
       LogExtendingRead(rt, kPrintXReg, kWRegSizeInBytes, address);
       break;
+    }
 
     // Ignore prfm hint instructions.
     case PRFM_lit:
@@ -5294,7 +5544,7 @@ void Simulator::VisitConditionalSelect(const Instruction* instr) {
         break;
       case CSNEG_w:
       case CSNEG_x:
-        new_val = -new_val;
+        new_val = UnsignedNegate(new_val);
         break;
       default:
         VIXL_UNIMPLEMENTED();
@@ -6014,6 +6264,8 @@ void Simulator::VisitFPIntegerConvert(const Instruction* instr) {
       WriteDRegisterBits(dst, ReadXRegister(src));
       break;
     case FMOV_d1_x:
+      // Zero bits beyond the MSB of a Q register.
+      mov(kFormat16B, ReadVRegister(dst), ReadVRegister(dst));
       LogicVRegister(ReadVRegister(dst))
           .SetUint(kFormatD, 1, ReadXRegister(src));
       break;
@@ -6664,7 +6916,7 @@ bool Simulator::FPProcessNaNs(const Instruction* instr) {
 }
 
 
-void Simulator::SysOp_W(int op, int64_t val) {
+bool Simulator::SysOp_W(int op, int64_t val) {
   switch (op) {
     case IVAU:
     case CVAC:
@@ -6683,15 +6935,30 @@ void Simulator::SysOp_W(int op, int64_t val) {
       // so temporarily disable MTE.
       bool mte_enabled = MetaDataDepot::MetaDataMTE::IsActive();
       MetaDataDepot::MetaDataMTE::SetActive(false);
-      volatile uint8_t y = MemRead<uint8_t>(val);
+      volatile uint8_t y = *MemRead<uint8_t>(val);
       MetaDataDepot::MetaDataMTE::SetActive(mte_enabled);
       USE(y);
-      // TODO: Implement ZVA, GVA, GZVA.
       break;
     }
+    case ZVA: {  // Zero one aligned block of memory containing `val`.
+      if ((dczid_ & 0x10) != 0) {  // Bit 4 set: dc zva is disabled.
+        return false;  // Caller treats false as an unallocated encoding.
+      }
+      int blocksize = (1 << (dczid_ & 0xf)) * kWRegSizeInBytes;
+      VIXL_ASSERT(IsMultiple(blocksize, sizeof(uint64_t)));
+      uintptr_t addr = AlignDown(val, blocksize);  // Start of the block.
+      for (int i = 0; i < blocksize; i += sizeof(uint64_t)) {
+        MemWrite<uint64_t>(addr + i, 0);  // NOTE(review): result ignored.
+        LogWriteU64(0, addr + i);
+      }
+      break;
+    }
+    // TODO: Implement GVA, GZVA.
     default:
       VIXL_UNIMPLEMENTED();
+      return false;
   }
+  return true;
 }
 
 void Simulator::PACHelper(int dst,
@@ -6763,8 +7030,8 @@ void Simulator::VisitSystem(const Instruction* instr) {
           break;
         case RNDR:
         case RNDRRS: {
-          uint64_t high = jrand48(rand_state_);
-          uint64_t low = jrand48(rand_state_);
+          uint64_t high = rand_gen_();
+          uint64_t low = rand_gen_();
           uint64_t rand_num = (high << 32) | (low & 0xffffffff);
           WriteXRegister(instr->GetRt(), rand_num);
           // Simulate successful random number generation.
@@ -6774,10 +7041,21 @@ void Simulator::VisitSystem(const Instruction* instr) {
           LogSystemRegister(NZCV);
           break;
         }
+        case DCZID_EL0:
+          WriteXRegister(instr->GetRt(), dczid_);
+          break;
         default:
           VIXL_UNIMPLEMENTED();
       }
       break;
+    case "chkfeat_hf_hints"_h: {
+      uint64_t feat_select = ReadXRegister(16);
+      uint64_t gcs_enabled = IsGCSCheckEnabled() ? 1 : 0;
+      feat_select &= ~gcs_enabled;
+      WriteXRegister(16, feat_select);
+      break;
+    }
+    case "hint_hm_hints"_h:
     case "nop_hi_hints"_h:
     case "esb_hi_hints"_h:
     case "csdb_hi_hints"_h:
@@ -6859,11 +7137,68 @@ void Simulator::VisitSystem(const Instruction* instr) {
     case "dsb_bo_barriers"_h:
     case "dmb_bo_barriers"_h:
     case "isb_bi_barriers"_h:
-      __sync_synchronize();
+      VIXL_SYNC();
       break;
-    case "sys_cr_systeminstrs"_h:
-      SysOp_W(instr->GetSysOp(), ReadXRegister(instr->GetRt()));
+    case "sys_cr_systeminstrs"_h: {
+      uint64_t rt = ReadXRegister(instr->GetRt());
+      uint32_t sysop = instr->GetSysOp();
+      if (sysop == GCSSS1) {
+        uint64_t incoming_size = rt >> 32;
+        // Drop upper 32 bits to get GCS index.
+        uint64_t incoming_gcs = rt & 0xffffffff;
+        uint64_t outgoing_gcs = ActivateGCS(incoming_gcs);
+        uint64_t incoming_seal = GCSPop();
+        if (((incoming_seal ^ rt) != 1) ||
+            (GetActiveGCSPtr()->size() != incoming_size)) {
+          char msg[128];
+          snprintf(msg,
+                   sizeof(msg),
+                   "GCS: invalid incoming stack: 0x%016" PRIx64 "\n",
+                   incoming_seal);
+          ReportGCSFailure(msg);
+        }
+        GCSPush(outgoing_gcs + 5);
+      } else if (sysop == GCSPUSHM) {
+        GCSPush(ReadXRegister(instr->GetRt()));
+      } else {
+        if (!SysOp_W(sysop, rt)) {
+          VisitUnallocated(instr);
+        }
+      }
       break;
+    }
+    case "sysl_rc_systeminstrs"_h: {
+      uint32_t sysop = instr->GetSysOp();
+      if (sysop == GCSPOPM) {
+        uint64_t addr = GCSPop();
+        WriteXRegister(instr->GetRt(), addr);
+      } else if (sysop == GCSSS2) {
+        uint64_t outgoing_gcs = GCSPop();
+        // Check for token inserted by gcsss1.
+        if ((outgoing_gcs & 7) != 5) {
+          char msg[128];
+          snprintf(msg,
+                   sizeof(msg),
+                   "GCS: outgoing stack has no token: 0x%016" PRIx64 "\n",
+                   outgoing_gcs);
+          ReportGCSFailure(msg);
+        }
+        uint64_t incoming_gcs = ActivateGCS(outgoing_gcs);
+        outgoing_gcs &= ~UINT64_C(0x3ff);
+
+        // Encode the size into the outgoing stack seal, to check later.
+        uint64_t size = GetActiveGCSPtr()->size();
+        VIXL_ASSERT(IsUint32(size));
+        VIXL_ASSERT(IsUint32(outgoing_gcs + 1));
+        uint64_t outgoing_seal = (size << 32) | (outgoing_gcs + 1);
+        GCSPush(outgoing_seal);
+        ActivateGCS(incoming_gcs);
+        WriteXRegister(instr->GetRt(), outgoing_seal - 1);
+      } else {
+        VIXL_UNIMPLEMENTED();
+      }
+      break;
+    }
     default:
       VIXL_UNIMPLEMENTED();
   }
@@ -6928,19 +7263,161 @@ void Simulator::VisitException(const Instruction* instr) {
 
 
 void Simulator::VisitCrypto2RegSHA(const Instruction* instr) {
-  VisitUnimplemented(instr);
+  SimVRegister& rd = ReadVRegister(instr->GetRd());  // Read and written.
+  SimVRegister& rn = ReadVRegister(instr->GetRn());
+
+  switch (form_hash_) {  // No default: unlisted forms do nothing.
+    case "sha1h_ss_cryptosha2"_h:
+      ror(kFormatS, rd, rn, 2);
+      break;
+    case "sha1su1_vv_cryptosha2"_h: {
+      SimVRegister temp;  // NOTE(review): assumed to start zeroed; confirm.
+
+      // temp = srcdst ^ (src >> 32);
+      ext(kFormat16B, temp, rn, temp, 4);
+      eor(kFormat16B, temp, rd, temp);
+
+      // srcdst = ROL(temp, 1) ^ (ROL(temp, 2) << 96)
+      rol(kFormat4S, rd, temp, 1);
+      rol(kFormatS, temp, temp, 2);  // kFormatS will zero bits <127:32>
+      ext(kFormat16B, temp, temp, temp, 4);
+      eor(kFormat16B, rd, rd, temp);
+      break;
+    }
+    case "sha256su0_vv_cryptosha2"_h:
+      sha2su0(rd, rn);
+      break;
+  }
 }
 
 
 void Simulator::VisitCrypto3RegSHA(const Instruction* instr) {
-  VisitUnimplemented(instr);
+  SimVRegister& rd = ReadVRegister(instr->GetRd());
+  SimVRegister& rn = ReadVRegister(instr->GetRn());
+  SimVRegister& rm = ReadVRegister(instr->GetRm());
+
+  switch (form_hash_) {  // No default: unlisted forms do nothing.
+    case "sha1c_qsv_cryptosha3"_h:
+      sha1<"choose"_h>(rd, rn, rm);  // Variant picked by the hashed tag.
+      break;
+    case "sha1m_qsv_cryptosha3"_h:
+      sha1<"majority"_h>(rd, rn, rm);
+      break;
+    case "sha1p_qsv_cryptosha3"_h:
+      sha1<"parity"_h>(rd, rn, rm);
+      break;
+    case "sha1su0_vvv_cryptosha3"_h: {
+      SimVRegister temp;  // Scratch for the intermediate value.
+      ext(kFormat16B, temp, rd, rn, 8);
+      eor(kFormat16B, temp, temp, rd);
+      eor(kFormat16B, rd, temp, rm);
+      break;
+    }
+    case "sha256h_qqv_cryptosha3"_h:
+      sha2h(rd, rn, rm, /* part1 = */ true);
+      break;
+    case "sha256h2_qqv_cryptosha3"_h:
+      sha2h(rd, rn, rm, /* part1 = */ false);
+      break;
+    case "sha256su1_vvv_cryptosha3"_h:
+      sha2su1(rd, rn, rm);
+      break;
+  }
 }
 
 
 void Simulator::VisitCryptoAES(const Instruction* instr) {
-  VisitUnimplemented(instr);
+  SimVRegister& rd = ReadVRegister(instr->GetRd());
+  SimVRegister& rn = ReadVRegister(instr->GetRn());
+  SimVRegister temp;  // Only used by aesd/aese, as input to aes().
+
+  switch (form_hash_) {  // No default: unlisted forms do nothing.
+    case "aesd_b_cryptoaes"_h:
+      eor(kFormat16B, temp, rd, rn);
+      aes(rd, temp, /* decrypt = */ true);
+      break;
+    case "aese_b_cryptoaes"_h:
+      eor(kFormat16B, temp, rd, rn);
+      aes(rd, temp, /* decrypt = */ false);
+      break;
+    case "aesimc_b_cryptoaes"_h:
+      aesmix(rd, rn, /* inverse = */ true);
+      break;
+    case "aesmc_b_cryptoaes"_h:
+      aesmix(rd, rn, /* inverse = */ false);
+      break;
+  }
 }
 
+void Simulator::VisitCryptoSM3(const Instruction* instr) {
+  SimVRegister& rd = ReadVRegister(instr->GetRd());
+  SimVRegister& rn = ReadVRegister(instr->GetRn());
+  SimVRegister& rm = ReadVRegister(instr->GetRm());
+  SimVRegister& ra = ReadVRegister(instr->GetRa());  // Only for sm3ss1.
+  int index = instr->ExtractBits(13, 12);  // Only for sm3tt1*/sm3tt2*.
+
+  bool is_a = false;  // Set for the "a" variants (sm3tt1a, sm3tt2a).
+  switch (form_hash_) {  // No default: unlisted forms do nothing.
+    case "sm3partw1_vvv4_cryptosha512_3"_h:
+      sm3partw1(rd, rn, rm);
+      break;
+    case "sm3partw2_vvv4_cryptosha512_3"_h:
+      sm3partw2(rd, rn, rm);
+      break;
+    case "sm3ss1_vvv4_crypto4"_h:
+      sm3ss1(rd, rn, rm, ra);
+      break;
+    case "sm3tt1a_vvv4_crypto3_imm2"_h:
+      is_a = true;
+      VIXL_FALLTHROUGH();
+    case "sm3tt1b_vvv4_crypto3_imm2"_h:
+      sm3tt1(rd, rn, rm, index, is_a);
+      break;
+    case "sm3tt2a_vvv4_crypto3_imm2"_h:
+      is_a = true;
+      VIXL_FALLTHROUGH();
+    case "sm3tt2b_vvv_crypto3_imm2"_h:  // NOTE(review): "vvv", not "vvv4"?
+      sm3tt2(rd, rn, rm, index, is_a);
+      break;
+  }
+}
+
+void Simulator::VisitCryptoSM4(const Instruction* instr) {
+  SimVRegister& rd = ReadVRegister(instr->GetRd());
+  SimVRegister& rn = ReadVRegister(instr->GetRn());
+  SimVRegister& rm = ReadVRegister(instr->GetRm());
+
+  bool is_key = false;  // True only for sm4ekey.
+  switch (form_hash_) {  // No default: unlisted forms do nothing.
+    case "sm4ekey_vvv4_cryptosha512_3"_h:
+      is_key = true;
+      VIXL_FALLTHROUGH();
+    case "sm4e_vv4_cryptosha512_2"_h:
+      sm4(rd, rn, rm, is_key);  // Both forms share this call.
+      break;
+  }
+}
+
+void Simulator::SimulateSHA512(const Instruction* instr) {
+  SimVRegister& rd = ReadVRegister(instr->GetRd());
+  SimVRegister& rn = ReadVRegister(instr->GetRn());
+  SimVRegister& rm = ReadVRegister(instr->GetRm());
+
+  switch (form_hash_) {  // No default: unlisted forms do nothing.
+    case "sha512h_qqv_cryptosha512_3"_h:
+      sha512h(rd, rn, rm);
+      break;
+    case "sha512h2_qqv_cryptosha512_3"_h:
+      sha512h2(rd, rn, rm);
+      break;
+    case "sha512su0_vv2_cryptosha512_2"_h:
+      sha512su0(rd, rn);  // Two-operand form: rm is not passed.
+      break;
+    case "sha512su1_vvv2_cryptosha512_3"_h:
+      sha512su1(rd, rn, rm);
+      break;
+  }
+}
 
 void Simulator::VisitNEON2RegMisc(const Instruction* instr) {
   NEONFormatDecoder nfd(instr);
@@ -7704,13 +8181,24 @@ void Simulator::VisitNEON3Different(const Instruction* instr) {
   SimVRegister& rd = ReadVRegister(instr->GetRd());
   SimVRegister& rn = ReadVRegister(instr->GetRn());
   SimVRegister& rm = ReadVRegister(instr->GetRm());
+  int size = instr->GetNEONSize();
 
   switch (instr->Mask(NEON3DifferentMask)) {
     case NEON_PMULL:
-      pmull(vf_l, rd, rn, rm);
+      if ((size == 1) || (size == 2)) {  // S/D reserved.
+        VisitUnallocated(instr);
+      } else {
+        if (size == 3) vf_l = kFormat1Q;
+        pmull(vf_l, rd, rn, rm);
+      }
       break;
     case NEON_PMULL2:
-      pmull2(vf_l, rd, rn, rm);
+      if ((size == 1) || (size == 2)) {  // S/D reserved.
+        VisitUnallocated(instr);
+      } else {
+        if (size == 3) vf_l = kFormat1Q;
+        pmull2(vf_l, rd, rn, rm);
+      }
       break;
     case NEON_UADDL:
       uaddl(vf_l, rd, rn, rm);
@@ -7948,22 +8436,14 @@ void Simulator::VisitNEONAcrossLanes(const Instruction* instr) {
 void Simulator::SimulateNEONMulByElementLong(const Instruction* instr) {
   NEONFormatDecoder nfd(instr);
   VectorFormat vf = nfd.GetVectorFormat(nfd.LongIntegerFormatMap());
-
   SimVRegister& rd = ReadVRegister(instr->GetRd());
   SimVRegister& rn = ReadVRegister(instr->GetRn());
 
-  int rm_reg = instr->GetRm();
-  int index = (instr->GetNEONH() << 1) | instr->GetNEONL();
-  if (instr->GetNEONSize() == 1) {
-    rm_reg = instr->GetRmLow16();
-    index = (index << 1) | instr->GetNEONM();
-  }
-  SimVRegister& rm = ReadVRegister(rm_reg);
-
+  std::pair<int, int> rm_and_index = instr->GetNEONMulRmAndIndex();
   SimVRegister temp;
   VectorFormat indexform =
       VectorFormatHalfWidthDoubleLanes(VectorFormatFillQ(vf));
-  dup_element(indexform, temp, rm, index);
+  dup_elements_to_segments(indexform, temp, rm_and_index);
 
   bool is_2 = instr->Mask(NEON_Q) ? true : false;
 
@@ -8037,21 +8517,9 @@ void Simulator::SimulateNEONFPMulByElement(const Instruction* instr) {
   SimVRegister& rd = ReadVRegister(instr->GetRd());
   SimVRegister& rn = ReadVRegister(instr->GetRn());
 
-  int rm_reg = instr->GetRm();
-  int index =
-      (instr->GetNEONH() << 2) | (instr->GetNEONL() << 1) | instr->GetNEONM();
-
-  if ((vform == kFormat4H) || (vform == kFormat8H)) {
-    rm_reg &= 0xf;
-  } else if ((vform == kFormat2S) || (vform == kFormat4S)) {
-    index >>= 1;
-  } else {
-    VIXL_ASSERT(vform == kFormat2D);
-    VIXL_ASSERT(instr->GetNEONL() == 0);
-    index >>= 2;
-  }
-
-  SimVRegister& rm = ReadVRegister(rm_reg);
+  std::pair<int, int> rm_and_index = instr->GetNEONMulRmAndIndex();
+  SimVRegister& rm = ReadVRegister(rm_and_index.first);
+  int index = rm_and_index.second;
 
   switch (form_hash_) {
     case "fmul_asimdelem_rh_h"_h:
@@ -8131,15 +8599,9 @@ void Simulator::VisitNEONByIndexedElement(const Instruction* instr) {
   SimVRegister& rd = ReadVRegister(instr->GetRd());
   SimVRegister& rn = ReadVRegister(instr->GetRn());
 
-  int rm_reg = instr->GetRm();
-  int index = (instr->GetNEONH() << 1) | instr->GetNEONL();
-
-  if ((vform == kFormat4H) || (vform == kFormat8H)) {
-    rm_reg &= 0xf;
-    index = (index << 1) | instr->GetNEONM();
-  }
-
-  SimVRegister& rm = ReadVRegister(rm_reg);
+  std::pair<int, int> rm_and_index = instr->GetNEONMulRmAndIndex();
+  SimVRegister& rm = ReadVRegister(rm_and_index.first);
+  int index = rm_and_index.second;
 
   switch (form_hash_) {
     case "mul_asimdelem_r"_h:
@@ -8180,8 +8642,10 @@ void Simulator::VisitNEONCopy(const Instruction* instr) {
   if (instr->Mask(NEONCopyInsElementMask) == NEON_INS_ELEMENT) {
     int imm4 = instr->GetImmNEON4();
     int rn_index = ExtractSignedBitfield32(31, tz, imm4);
+    mov(kFormat16B, rd, rd);  // Zero bits beyond the MSB of a Q register.
     ins_element(vf, rd, reg_index, rn, rn_index);
   } else if (instr->Mask(NEONCopyInsGeneralMask) == NEON_INS_GENERAL) {
+    mov(kFormat16B, rd, rd);  // Zero bits beyond the MSB of a Q register.
     ins_immediate(vf, rd, reg_index, ReadXRegister(instr->GetRn()));
   } else if (instr->Mask(NEONCopyUmovMask) == NEON_UMOV) {
     uint64_t value = LogicVRegister(rn).Uint(vf, reg_index);
@@ -8249,97 +8713,117 @@ void Simulator::NEONLoadStoreMultiStructHelper(const Instruction* instr,
   switch (instr->Mask(NEONLoadStoreMultiStructPostIndexMask)) {
     case NEON_LD1_4v:
     case NEON_LD1_4v_post:
-      ld1(vf, ReadVRegister(reg[3]), addr[3]);
+      if (!ld1(vf, ReadVRegister(reg[3]), addr[3])) {
+        return;
+      }
       reg_count++;
       VIXL_FALLTHROUGH();
     case NEON_LD1_3v:
     case NEON_LD1_3v_post:
-      ld1(vf, ReadVRegister(reg[2]), addr[2]);
+      if (!ld1(vf, ReadVRegister(reg[2]), addr[2])) {
+        return;
+      }
       reg_count++;
       VIXL_FALLTHROUGH();
     case NEON_LD1_2v:
     case NEON_LD1_2v_post:
-      ld1(vf, ReadVRegister(reg[1]), addr[1]);
+      if (!ld1(vf, ReadVRegister(reg[1]), addr[1])) {
+        return;
+      }
       reg_count++;
       VIXL_FALLTHROUGH();
     case NEON_LD1_1v:
     case NEON_LD1_1v_post:
-      ld1(vf, ReadVRegister(reg[0]), addr[0]);
+      if (!ld1(vf, ReadVRegister(reg[0]), addr[0])) {
+        return;
+      }
       break;
     case NEON_ST1_4v:
     case NEON_ST1_4v_post:
-      st1(vf, ReadVRegister(reg[3]), addr[3]);
+      if (!st1(vf, ReadVRegister(reg[3]), addr[3])) return;
       reg_count++;
       VIXL_FALLTHROUGH();
     case NEON_ST1_3v:
     case NEON_ST1_3v_post:
-      st1(vf, ReadVRegister(reg[2]), addr[2]);
+      if (!st1(vf, ReadVRegister(reg[2]), addr[2])) return;
       reg_count++;
       VIXL_FALLTHROUGH();
     case NEON_ST1_2v:
     case NEON_ST1_2v_post:
-      st1(vf, ReadVRegister(reg[1]), addr[1]);
+      if (!st1(vf, ReadVRegister(reg[1]), addr[1])) return;
       reg_count++;
       VIXL_FALLTHROUGH();
     case NEON_ST1_1v:
     case NEON_ST1_1v_post:
-      st1(vf, ReadVRegister(reg[0]), addr[0]);
+      if (!st1(vf, ReadVRegister(reg[0]), addr[0])) return;
       log_read = false;
       break;
     case NEON_LD2_post:
     case NEON_LD2:
-      ld2(vf, ReadVRegister(reg[0]), ReadVRegister(reg[1]), addr[0]);
+      if (!ld2(vf, ReadVRegister(reg[0]), ReadVRegister(reg[1]), addr[0])) {
+        return;
+      }
       struct_parts = 2;
       reg_count = 2;
       break;
     case NEON_ST2:
     case NEON_ST2_post:
-      st2(vf, ReadVRegister(reg[0]), ReadVRegister(reg[1]), addr[0]);
+      if (!st2(vf, ReadVRegister(reg[0]), ReadVRegister(reg[1]), addr[0])) {
+        return;
+      }
       struct_parts = 2;
       reg_count = 2;
       log_read = false;
       break;
     case NEON_LD3_post:
     case NEON_LD3:
-      ld3(vf,
-          ReadVRegister(reg[0]),
-          ReadVRegister(reg[1]),
-          ReadVRegister(reg[2]),
-          addr[0]);
+      if (!ld3(vf,
+               ReadVRegister(reg[0]),
+               ReadVRegister(reg[1]),
+               ReadVRegister(reg[2]),
+               addr[0])) {
+        return;
+      }
       struct_parts = 3;
       reg_count = 3;
       break;
     case NEON_ST3:
     case NEON_ST3_post:
-      st3(vf,
-          ReadVRegister(reg[0]),
-          ReadVRegister(reg[1]),
-          ReadVRegister(reg[2]),
-          addr[0]);
+      if (!st3(vf,
+               ReadVRegister(reg[0]),
+               ReadVRegister(reg[1]),
+               ReadVRegister(reg[2]),
+               addr[0])) {
+        return;
+      }
       struct_parts = 3;
       reg_count = 3;
       log_read = false;
       break;
     case NEON_ST4:
     case NEON_ST4_post:
-      st4(vf,
-          ReadVRegister(reg[0]),
-          ReadVRegister(reg[1]),
-          ReadVRegister(reg[2]),
-          ReadVRegister(reg[3]),
-          addr[0]);
+      if (!st4(vf,
+               ReadVRegister(reg[0]),
+               ReadVRegister(reg[1]),
+               ReadVRegister(reg[2]),
+               ReadVRegister(reg[3]),
+               addr[0])) {
+        return;
+      }
       struct_parts = 4;
       reg_count = 4;
       log_read = false;
       break;
     case NEON_LD4_post:
     case NEON_LD4:
-      ld4(vf,
-          ReadVRegister(reg[0]),
-          ReadVRegister(reg[1]),
-          ReadVRegister(reg[2]),
-          ReadVRegister(reg[3]),
-          addr[0]);
+      if (!ld4(vf,
+               ReadVRegister(reg[0]),
+               ReadVRegister(reg[1]),
+               ReadVRegister(reg[2]),
+               ReadVRegister(reg[3]),
+               addr[0])) {
+        return;
+      }
       struct_parts = 4;
       reg_count = 4;
       break;
@@ -8514,75 +8998,95 @@ void Simulator::NEONLoadStoreSingleStructHelper(const Instruction* instr,
       reg_count = 1;
       if (replicating) {
         VIXL_ASSERT(do_load);
-        ld1r(vf, ReadVRegister(rt), addr);
+        if (!ld1r(vf, ReadVRegister(rt), addr)) {
+          return;
+        }
       } else if (do_load) {
-        ld1(vf, ReadVRegister(rt), lane, addr);
+        if (!ld1(vf, ReadVRegister(rt), lane, addr)) {
+          return;
+        }
       } else {
-        st1(vf, ReadVRegister(rt), lane, addr);
+        if (!st1(vf, ReadVRegister(rt), lane, addr)) return;
       }
       break;
     case NEONLoadStoreSingle2:
       reg_count = 2;
       if (replicating) {
         VIXL_ASSERT(do_load);
-        ld2r(vf, ReadVRegister(rt), ReadVRegister(rt2), addr);
+        if (!ld2r(vf, ReadVRegister(rt), ReadVRegister(rt2), addr)) {
+          return;
+        }
       } else if (do_load) {
-        ld2(vf, ReadVRegister(rt), ReadVRegister(rt2), lane, addr);
+        if (!ld2(vf, ReadVRegister(rt), ReadVRegister(rt2), lane, addr)) {
+          return;
+        }
       } else {
-        st2(vf, ReadVRegister(rt), ReadVRegister(rt2), lane, addr);
+        if (!st2(vf, ReadVRegister(rt), ReadVRegister(rt2), lane, addr)) return;
       }
       break;
     case NEONLoadStoreSingle3:
       reg_count = 3;
       if (replicating) {
         VIXL_ASSERT(do_load);
-        ld3r(vf,
-             ReadVRegister(rt),
-             ReadVRegister(rt2),
-             ReadVRegister(rt3),
-             addr);
+        if (!ld3r(vf,
+                  ReadVRegister(rt),
+                  ReadVRegister(rt2),
+                  ReadVRegister(rt3),
+                  addr)) {
+          return;
+        }
       } else if (do_load) {
-        ld3(vf,
-            ReadVRegister(rt),
-            ReadVRegister(rt2),
-            ReadVRegister(rt3),
-            lane,
-            addr);
+        if (!ld3(vf,
+                 ReadVRegister(rt),
+                 ReadVRegister(rt2),
+                 ReadVRegister(rt3),
+                 lane,
+                 addr)) {
+          return;
+        }
       } else {
-        st3(vf,
-            ReadVRegister(rt),
-            ReadVRegister(rt2),
-            ReadVRegister(rt3),
-            lane,
-            addr);
+        if (!st3(vf,
+                 ReadVRegister(rt),
+                 ReadVRegister(rt2),
+                 ReadVRegister(rt3),
+                 lane,
+                 addr)) {
+          return;
+        }
       }
       break;
     case NEONLoadStoreSingle4:
       reg_count = 4;
       if (replicating) {
         VIXL_ASSERT(do_load);
-        ld4r(vf,
-             ReadVRegister(rt),
-             ReadVRegister(rt2),
-             ReadVRegister(rt3),
-             ReadVRegister(rt4),
-             addr);
+        if (!ld4r(vf,
+                  ReadVRegister(rt),
+                  ReadVRegister(rt2),
+                  ReadVRegister(rt3),
+                  ReadVRegister(rt4),
+                  addr)) {
+          return;
+        }
       } else if (do_load) {
-        ld4(vf,
-            ReadVRegister(rt),
-            ReadVRegister(rt2),
-            ReadVRegister(rt3),
-            ReadVRegister(rt4),
-            lane,
-            addr);
+        if (!ld4(vf,
+                 ReadVRegister(rt),
+                 ReadVRegister(rt2),
+                 ReadVRegister(rt3),
+                 ReadVRegister(rt4),
+                 lane,
+                 addr)) {
+          return;
+        }
       } else {
-        st4(vf,
-            ReadVRegister(rt),
-            ReadVRegister(rt2),
-            ReadVRegister(rt3),
-            ReadVRegister(rt4),
-            lane,
-            addr);
+        if (!st4(vf,
+                 ReadVRegister(rt),
+                 ReadVRegister(rt2),
+                 ReadVRegister(rt3),
+                 ReadVRegister(rt4),
+                 lane,
+                 addr)) {
+          return;
+        }
       }
       break;
     default:
@@ -8676,7 +9180,7 @@ void Simulator::VisitNEONModifiedImmediate(const Instruction* instr) {
         vform = q ? kFormat2D : kFormat1D;
         imm = 0;
         for (int i = 0; i < 8; ++i) {
-          if (imm8 & (1 << i)) {
+          if (imm8 & (uint64_t{1} << i)) {
             imm |= (UINT64_C(0xff) << (8 * i));
           }
         }
@@ -9156,78 +9660,76 @@ void Simulator::VisitNEONScalar3SameExtra(const Instruction* instr) {
 void Simulator::VisitNEONScalarByIndexedElement(const Instruction* instr) {
   NEONFormatDecoder nfd(instr, NEONFormatDecoder::LongScalarFormatMap());
   VectorFormat vf = nfd.GetVectorFormat();
-  VectorFormat vf_r = nfd.GetVectorFormat(nfd.ScalarFormatMap());
-
   SimVRegister& rd = ReadVRegister(instr->GetRd());
   SimVRegister& rn = ReadVRegister(instr->GetRn());
   ByElementOp Op = NULL;
 
-  int rm_reg = instr->GetRm();
-  int index = (instr->GetNEONH() << 1) | instr->GetNEONL();
-  if (instr->GetNEONSize() == 1) {
-    rm_reg &= 0xf;
-    index = (index << 1) | instr->GetNEONM();
+  std::pair<int, int> rm_and_index = instr->GetNEONMulRmAndIndex();
+  std::unordered_map<uint32_t, ByElementOp> handler = {  // Built per call.
+      {"sqdmull_asisdelem_l"_h, &Simulator::sqdmull},
+      {"sqdmlal_asisdelem_l"_h, &Simulator::sqdmlal},
+      {"sqdmlsl_asisdelem_l"_h, &Simulator::sqdmlsl},
+      {"sqdmulh_asisdelem_r"_h, &Simulator::sqdmulh},
+      {"sqrdmulh_asisdelem_r"_h, &Simulator::sqrdmulh},
+      {"sqrdmlah_asisdelem_r"_h, &Simulator::sqrdmlah},
+      {"sqrdmlsh_asisdelem_r"_h, &Simulator::sqrdmlsh},
+      {"fmul_asisdelem_rh_h"_h, &Simulator::fmul},
+      {"fmul_asisdelem_r_sd"_h, &Simulator::fmul},
+      {"fmla_asisdelem_rh_h"_h, &Simulator::fmla},
+      {"fmla_asisdelem_r_sd"_h, &Simulator::fmla},
+      {"fmls_asisdelem_rh_h"_h, &Simulator::fmls},
+      {"fmls_asisdelem_r_sd"_h, &Simulator::fmls},
+      {"fmulx_asisdelem_rh_h"_h, &Simulator::fmulx},
+      {"fmulx_asisdelem_r_sd"_h, &Simulator::fmulx},
+  };
+
+  std::unordered_map<uint32_t, ByElementOp>::const_iterator it =
+      handler.find(form_hash_);
+
+  if (it == handler.end()) {
+    VIXL_UNIMPLEMENTED();  // NOTE(review): Op stays NULL if this returns.
+  } else {
+    Op = it->second;
   }
 
-  switch (instr->Mask(NEONScalarByIndexedElementMask)) {
-    case NEON_SQDMULL_byelement_scalar:
-      Op = &Simulator::sqdmull;
+  switch (form_hash_) {  // Pick vf; reject formats the form cannot use.
+    case "sqdmull_asisdelem_l"_h:
+    case "sqdmlal_asisdelem_l"_h:
+    case "sqdmlsl_asisdelem_l"_h:
+      if ((vf == kFormatB) || (vf == kFormatH)) {
+        VisitUnallocated(instr);
+        return;
+      }
       break;
-    case NEON_SQDMLAL_byelement_scalar:
-      Op = &Simulator::sqdmlal;
+    case "sqdmulh_asisdelem_r"_h:
+    case "sqrdmulh_asisdelem_r"_h:
+    case "sqrdmlah_asisdelem_r"_h:
+    case "sqrdmlsh_asisdelem_r"_h:
+      vf = nfd.GetVectorFormat(nfd.ScalarFormatMap());
+      if ((vf == kFormatB) || (vf == kFormatD)) {
+        VisitUnallocated(instr);
+        return;
+      }
       break;
-    case NEON_SQDMLSL_byelement_scalar:
-      Op = &Simulator::sqdmlsl;
-      break;
-    case NEON_SQDMULH_byelement_scalar:
-      Op = &Simulator::sqdmulh;
-      vf = vf_r;
-      break;
-    case NEON_SQRDMULH_byelement_scalar:
-      Op = &Simulator::sqrdmulh;
-      vf = vf_r;
-      break;
-    case NEON_SQRDMLAH_byelement_scalar:
-      Op = &Simulator::sqrdmlah;
-      vf = vf_r;
-      break;
-    case NEON_SQRDMLSH_byelement_scalar:
-      Op = &Simulator::sqrdmlsh;
-      vf = vf_r;
-      break;
-    default:
+    case "fmul_asisdelem_r_sd"_h:
+    case "fmla_asisdelem_r_sd"_h:
+    case "fmls_asisdelem_r_sd"_h:
+    case "fmulx_asisdelem_r_sd"_h:
       vf = nfd.GetVectorFormat(nfd.FPScalarFormatMap());
-      index = instr->GetNEONH();
-      if (instr->GetFPType() == 0) {
-        index = (index << 2) | (instr->GetNEONL() << 1) | instr->GetNEONM();
-        rm_reg &= 0xf;
-        vf = kFormatH;
-      } else if ((instr->GetFPType() & 1) == 0) {
-        index = (index << 1) | instr->GetNEONL();
-      }
-      switch (instr->Mask(NEONScalarByIndexedElementFPMask)) {
-        case NEON_FMUL_H_byelement_scalar:
-        case NEON_FMUL_byelement_scalar:
-          Op = &Simulator::fmul;
-          break;
-        case NEON_FMLA_H_byelement_scalar:
-        case NEON_FMLA_byelement_scalar:
-          Op = &Simulator::fmla;
-          break;
-        case NEON_FMLS_H_byelement_scalar:
-        case NEON_FMLS_byelement_scalar:
-          Op = &Simulator::fmls;
-          break;
-        case NEON_FMULX_H_byelement_scalar:
-        case NEON_FMULX_byelement_scalar:
-          Op = &Simulator::fmulx;
-          break;
-        default:
-          VIXL_UNIMPLEMENTED();
-      }
+      break;
+    case "fmul_asisdelem_rh_h"_h:
+    case "fmla_asisdelem_rh_h"_h:
+    case "fmls_asisdelem_rh_h"_h:
+    case "fmulx_asisdelem_rh_h"_h:
+      vf = kFormatH;
+      break;
   }
 
-  (this->*Op)(vf, rd, rn, ReadVRegister(rm_reg), index);
+  (this->*Op)(vf,
+              rd,
+              rn,
+              ReadVRegister(rm_and_index.first),
+              rm_and_index.second);
 }
 
 
@@ -9634,6 +10136,34 @@ void Simulator::VisitNEONPerm(const Instruction* instr) {
   }
 }
 
+void Simulator::SimulateNEONSHA3(const Instruction* instr) {
+  SimVRegister& rd = ReadVRegister(instr->GetRd());
+  SimVRegister& rn = ReadVRegister(instr->GetRn());
+  SimVRegister& rm = ReadVRegister(instr->GetRm());
+  SimVRegister& ra = ReadVRegister(instr->GetRa());
+  SimVRegister temp;  // Scratch: rd is only written by the final operation.
+  switch (form_hash_) {
+    case "bcax_vvv16_crypto4"_h:  // rd = eor(rn, bic(rm, ra))
+      bic(kFormat16B, temp, rm, ra);
+      eor(kFormat16B, rd, rn, temp);
+      break;
+    case "eor3_vvv16_crypto4"_h:  // rd = eor(rn, eor(rm, ra))
+      eor(kFormat16B, temp, rm, ra);
+      eor(kFormat16B, rd, rn, temp);
+      break;
+    case "rax1_vvv2_cryptosha512_3"_h:  // rd = eor(rn, rol(rm, 1))
+      ror(kFormat2D, temp, rm, 63);  // rol(1) => ror(63)
+      eor(kFormat2D, rd, rn, temp);
+      break;
+    case "xar_vvv2_crypto3_imm6"_h: {  // rd = ror(eor(rn, rm), imm6)
+      int rot = instr->ExtractBits(15, 10);  // Braced: scopes the init.
+      eor(kFormat2D, temp, rn, rm);
+      ror(kFormat2D, rd, temp, rot);
+      break;
+    }
+  }
+}
+
 void Simulator::VisitSVEAddressGeneration(const Instruction* instr) {
   SimVRegister& zd = ReadVRegister(instr->GetRd());
   SimVRegister& zn = ReadVRegister(instr->GetRn());
@@ -11820,7 +12350,7 @@ void Simulator::VisitSVEBroadcastIntImm_Unpredicated(const Instruction* instr) {
   VectorFormat format = instr->GetSVEVectorFormat();
   int64_t imm = instr->GetImmSVEIntWideSigned();
   int shift = instr->ExtractBit(13) * 8;
-  imm *= 1 << shift;
+  imm *= uint64_t{1} << shift;
 
   switch (instr->Mask(SVEBroadcastIntImm_UnpredicatedMask)) {
     case DUP_z_i:
@@ -12062,7 +12592,7 @@ void Simulator::VisitSVELoadAndBroadcastElement(const Instruction* instr) {
   VectorFormat unpack_vform =
       SVEFormatFromLaneSizeInBytesLog2(msize_in_bytes_log2);
   SimVRegister temp;
-  ld1r(vform, unpack_vform, temp, base, is_signed);
+  if (!ld1r(vform, unpack_vform, temp, base, is_signed)) return;
   mov_zeroing(vform,
               ReadVRegister(instr->GetRt()),
               ReadPRegister(instr->GetPgLow8()),
@@ -12079,7 +12609,8 @@ void Simulator::VisitSVELoadPredicateRegister(const Instruction* instr) {
       uint64_t base = ReadXRegister(instr->GetRn(), Reg31IsStackPointer);
       uint64_t address = base + multiplier * pl;
       for (int i = 0; i < pl; i++) {
-        pt.Insert(i, MemRead<uint8_t>(address + i));
+        VIXL_DEFINE_OR_RETURN(value, MemRead<uint8_t>(address + i));
+        pt.Insert(i, value);
       }
       LogPRead(instr->GetPt(), address);
       break;
@@ -12100,7 +12631,8 @@ void Simulator::VisitSVELoadVectorRegister(const Instruction* instr) {
       uint64_t base = ReadXRegister(instr->GetRn(), Reg31IsStackPointer);
       uint64_t address = base + multiplier * vl;
       for (int i = 0; i < vl; i++) {
-        zt.Insert(i, MemRead<uint8_t>(address + i));
+        VIXL_DEFINE_OR_RETURN(value, MemRead<uint8_t>(address + i));
+        zt.Insert(i, value);
       }
       LogZRead(instr->GetRt(), address);
       break;
@@ -12486,7 +13018,7 @@ void Simulator::VisitSVELoadAndBroadcastQOWord_ScalarPlusImm(
   VectorFormat vform = SVEFormatFromLaneSizeInBytesLog2(msz);
 
   for (unsigned i = 0; i < dwords; i++) {
-    ld1(kFormatVnD, zt, i, addr + offset + (i * kDRegSizeInBytes));
+    if (!ld1(kFormatVnD, zt, i, addr + offset + (i * kDRegSizeInBytes))) return;
   }
   mov_zeroing(vform, zt, pg, zt);
   dup_element(vform_dst, zt, zt, 0);
@@ -12513,7 +13045,7 @@ void Simulator::VisitSVELoadAndBroadcastQOWord_ScalarPlusScalar(
   VectorFormat vform = SVEFormatFromLaneSizeInBytesLog2(msz);
   offset <<= msz;
   for (unsigned i = 0; i < bytes; i++) {
-    ld1(kFormatVnB, zt, i, addr + offset + i);
+    if (!ld1(kFormatVnB, zt, i, addr + offset + i)) return;
   }
   mov_zeroing(vform, zt, pg, zt);
   dup_element(vform_dst, zt, zt, 0);
@@ -12570,7 +13102,7 @@ void Simulator::VisitSVELoadMultipleStructures_ScalarPlusScalar(
     case LD4H_z_p_br_contiguous:
     case LD4W_z_p_br_contiguous: {
       int msz = instr->ExtractBits(24, 23);
-      uint64_t offset = ReadXRegister(instr->GetRm()) * (1 << msz);
+      uint64_t offset = ReadXRegister(instr->GetRm()) * (uint64_t{1} << msz);
       VectorFormat vform = SVEFormatFromLaneSizeInBytesLog2(msz);
       LogicSVEAddressVector addr(
           ReadXRegister(instr->GetRn(), Reg31IsStackPointer) + offset);
@@ -13006,7 +13538,7 @@ void Simulator::VisitSVEStoreMultipleStructures_ScalarPlusScalar(
     case ST4H_z_p_br_contiguous:
     case ST4W_z_p_br_contiguous: {
       int msz = instr->ExtractBits(24, 23);
-      uint64_t offset = ReadXRegister(instr->GetRm()) * (1 << msz);
+      uint64_t offset = ReadXRegister(instr->GetRm()) * (uint64_t{1} << msz);
       VectorFormat vform = SVEFormatFromLaneSizeInBytesLog2(msz);
       LogicSVEAddressVector addr(
           ReadXRegister(instr->GetRn(), Reg31IsStackPointer) + offset);
@@ -13034,7 +13566,7 @@ void Simulator::VisitSVEStorePredicateRegister(const Instruction* instr) {
       uint64_t base = ReadXRegister(instr->GetRn(), Reg31IsStackPointer);
       uint64_t address = base + multiplier * pl;
       for (int i = 0; i < pl; i++) {
-        MemWrite(address + i, pt.GetLane<uint8_t>(i));
+        if (!MemWrite(address + i, pt.GetLane<uint8_t>(i))) return;
       }
       LogPWrite(instr->GetPt(), address);
       break;
@@ -13055,7 +13587,7 @@ void Simulator::VisitSVEStoreVectorRegister(const Instruction* instr) {
       uint64_t base = ReadXRegister(instr->GetRn(), Reg31IsStackPointer);
       uint64_t address = base + multiplier * vl;
       for (int i = 0; i < vl; i++) {
-        MemWrite(address + i, zt.GetLane<uint8_t>(i));
+        if (!MemWrite(address + i, zt.GetLane<uint8_t>(i))) return;
       }
       LogZWrite(instr->GetRt(), address);
       break;
@@ -14140,7 +14672,7 @@ void Simulator::SimulateMTETagMaskInsert(const Instruction* instr) {
   uint64_t mask = ReadXRegister(instr->GetRm());
   uint64_t tag = GetAllocationTagFromAddress(
       ReadXRegister(instr->GetRn(), Reg31IsStackPointer));
-  uint64_t mask_bit = 1 << tag;
+  uint64_t mask_bit = uint64_t{1} << tag;
   WriteXRegister(instr->GetRd(), mask | mask_bit);
 }
 
@@ -14187,8 +14719,8 @@ void Simulator::SimulateMTEStoreTagPair(const Instruction* instr) {
   int tag = GetAllocationTagFromAddress(rn);
   meta_data_.SetMTETag(address, tag);
 
-  MemWrite<uint64_t>(address, rt);
-  MemWrite<uint64_t>(address + kXRegSizeInBytes, rt2);
+  if (!MemWrite<uint64_t>(address, rt)) return;
+  if (!MemWrite<uint64_t>(address + kXRegSizeInBytes, rt2)) return;
 }
 
 void Simulator::SimulateMTEStoreTag(const Instruction* instr) {
@@ -14250,8 +14782,7 @@ void Simulator::SimulateMTEStoreTag(const Instruction* instr) {
   uintptr_t address = AddressModeHelper(instr->GetRn(), offset, addr_mode);
 
   if (is_zeroing) {
-    if (!IsAligned(reinterpret_cast<uintptr_t>(address),
-                   kMTETagGranuleInBytes)) {
+    if (!IsAligned(address, kMTETagGranuleInBytes)) {
       VIXL_ALIGNMENT_EXCEPTION();
     }
     VIXL_STATIC_ASSERT(kMTETagGranuleInBytes >= sizeof(uint64_t));
@@ -14264,7 +14795,7 @@ void Simulator::SimulateMTEStoreTag(const Instruction* instr) {
 
     size_t fill_offset = 0;
     while (fill_offset < fill_size) {
-      MemWrite<uint64_t>(address + fill_offset, 0);
+      if (!MemWrite<uint64_t>(address + fill_offset, 0)) return;
       fill_offset += sizeof(uint64_t);
     }
   }
@@ -14348,8 +14879,8 @@ void Simulator::SimulateCpyM(const Instruction* instr) {
   }
 
   while (xn--) {
-    uint8_t temp = MemRead<uint8_t>(xs);
-    MemWrite<uint8_t>(xd, temp);
+    VIXL_DEFINE_OR_RETURN(temp, MemRead<uint8_t>(xs));
+    if (!MemWrite<uint8_t>(xd, temp)) return;
     LogMemTransfer(xd, xs, temp);
     xs += step;
     xd += step;
@@ -14388,7 +14919,7 @@ void Simulator::SimulateSetM(const Instruction* instr) {
 
   while (xn--) {
     LogWrite(instr->GetRs(), GetPrintRegPartial(kPrintRegLaneSizeB), xd);
-    MemWrite<uint8_t>(xd++, xs);
+    if (!MemWrite<uint8_t>(xd++, static_cast<uint8_t>(xs))) return;
   }
   WriteXRegister(instr->GetRd(), xd);
   WriteXRegister(instr->GetRn(), 0);
@@ -14598,22 +15129,46 @@ void Simulator::DoRuntimeCall(const Instruction* instr) {
   VIXL_STATIC_ASSERT(kRuntimeCallAddressSize == sizeof(uintptr_t));
   // The appropriate `Simulator::SimulateRuntimeCall()` wrapper and the function
   // to call are passed inlined in the assembly.
-  uintptr_t call_wrapper_address =
-      MemRead<uintptr_t>(instr + kRuntimeCallWrapperOffset);
-  uintptr_t function_address =
-      MemRead<uintptr_t>(instr + kRuntimeCallFunctionOffset);
-  RuntimeCallType call_type = static_cast<RuntimeCallType>(
-      MemRead<uint32_t>(instr + kRuntimeCallTypeOffset));
+  VIXL_DEFINE_OR_RETURN(call_wrapper_address,
+                        MemRead<uintptr_t>(instr + kRuntimeCallWrapperOffset));
+  VIXL_DEFINE_OR_RETURN(function_address,
+                        MemRead<uintptr_t>(instr + kRuntimeCallFunctionOffset));
+  VIXL_DEFINE_OR_RETURN(call_type,
+                        MemRead<uint32_t>(instr + kRuntimeCallTypeOffset));
   auto runtime_call_wrapper =
       reinterpret_cast<void (*)(Simulator*, uintptr_t)>(call_wrapper_address);
 
-  if (call_type == kCallRuntime) {
-    WriteRegister(kLinkRegCode,
-                  instr->GetInstructionAtOffset(kRuntimeCallLength));
+  if (static_cast<RuntimeCallType>(call_type) == kCallRuntime) {
+    const Instruction* addr = instr->GetInstructionAtOffset(kRuntimeCallLength);
+    WriteLr(addr);
+    GCSPush(reinterpret_cast<uint64_t>(addr));
   }
   runtime_call_wrapper(this, function_address);
   // Read the return address from `lr` and write it into `pc`.
-  WritePc(ReadRegister<Instruction*>(kLinkRegCode));
+  uint64_t addr = ReadRegister<uint64_t>(kLinkRegCode);
+  if (IsGCSCheckEnabled()) {
+    uint64_t expected_lr = GCSPeek();
+    char msg[128];
+    if (expected_lr != 0) {
+      if ((expected_lr & 0x3) != 0) {
+        snprintf(msg,
+                 sizeof(msg),
+                 "GCS contains misaligned return address: 0x%016" PRIx64 "\n",
+                 expected_lr);
+        ReportGCSFailure(msg);
+      } else if ((addr != 0) && (addr != expected_lr)) {
+        snprintf(msg,
+                 sizeof(msg),
+                 "GCS mismatch: lr = 0x%016" PRIx64 ", gcs = 0x%016" PRIx64
+                 "\n",
+                 addr,
+                 expected_lr);
+        ReportGCSFailure(msg);
+      }
+      GCSPop();
+    }
+  }
+  WritePc(reinterpret_cast<Instruction*>(addr));
 }
 #else
 void Simulator::DoRuntimeCall(const Instruction* instr) {
@@ -14638,7 +15193,7 @@ void Simulator::DoConfigureCPUFeatures(const Instruction* instr) {
   // Read the kNone-terminated list of features.
   CPUFeatures parameters;
   while (true) {
-    ElementType feature = MemRead<ElementType>(instr + offset);
+    VIXL_DEFINE_OR_RETURN(feature, MemRead<ElementType>(instr + offset));
     offset += element_size;
     if (feature == static_cast<ElementType>(CPUFeatures::kNone)) break;
     parameters.Combine(static_cast<CPUFeatures::Feature>(feature));
@@ -14681,6 +15236,7 @@ void Simulator::DoRestoreCPUFeatures(const Instruction* instr) {
   saved_cpu_features_.pop_back();
 }
 
+#ifdef VIXL_HAS_SIMULATED_MMAP
 void* Simulator::Mmap(
     void* address, size_t length, int prot, int flags, int fd, off_t offset) {
   // The underlying system `mmap` in the simulated environment doesn't recognize
@@ -14713,7 +15269,7 @@ int Simulator::Munmap(void* address, size_t length, int prot) {
 
   return munmap(address, length);
 }
-
+#endif  // VIXL_HAS_SIMULATED_MMAP
 
 }  // namespace aarch64
 }  // namespace vixl
diff --git a/src/aarch64/simulator-aarch64.h b/src/aarch64/simulator-aarch64.h
index cdc17834..8cf085b7 100644
--- a/src/aarch64/simulator-aarch64.h
+++ b/src/aarch64/simulator-aarch64.h
@@ -28,12 +28,14 @@
 #define VIXL_AARCH64_SIMULATOR_AARCH64_H_
 
 #include <memory>
+#include <mutex>
+#include <random>
 #include <unordered_map>
 #include <vector>
 
+#include "../cpu-features.h"
 #include "../globals-vixl.h"
 #include "../utils-vixl.h"
-#include "cpu-features.h"
 
 #include "abi-aarch64.h"
 #include "cpu-features-auditor-aarch64.h"
@@ -68,6 +70,28 @@ namespace aarch64 {
 class Simulator;
 struct RuntimeCallStructHelper;
 
+enum class MemoryAccessResult { Success = 0, Failure = 1 };
+
+// Try to access a piece of memory at the given address. Accessing that memory
+// might raise a signal; a custom signal handler that handles it should set up
+// the native and simulated context in order to continue. Return whether the
+// memory access failed (i.e. raised a signal) or succeeded.
+MemoryAccessResult TryMemoryAccess(uintptr_t address, uintptr_t access_size);
+
+#ifdef VIXL_ENABLE_IMPLICIT_CHECKS
+// Access a byte of memory from the address at the given offset. If the memory
+// could be accessed then return MemoryAccessResult::Success. If the memory
+// could not be accessed, and therefore raised a signal, set up the simulated
+// context and return MemoryAccessResult::Failure.
+//
+// If a signal is raised then it is expected that the signal handler will place
+// MemoryAccessResult::Failure in the native return register and the address of
+// _vixl_internal_AccessMemory_continue into the native instruction pointer.
+extern "C" MemoryAccessResult _vixl_internal_ReadMemory(uintptr_t address,
+                                                        uintptr_t offset);
+extern "C" uintptr_t _vixl_internal_AccessMemory_continue();
+#endif  // VIXL_ENABLE_IMPLICIT_CHECKS
+
 class SimStack {
  public:
   SimStack() {}
@@ -136,7 +160,7 @@ class SimStack {
 
   // Allocate the stack, locking the parameters.
   Allocated Allocate() {
-    size_t align_to = 1 << align_log2_;
+    size_t align_to = uint64_t{1} << align_log2_;
     size_t l = AlignUp(limit_guard_size_, align_to);
     size_t u = AlignUp(usable_size_, align_to);
     size_t b = AlignUp(base_guard_size_, align_to);
@@ -366,7 +390,7 @@ class Memory {
   }
 
   template <typename T, typename A>
-  T Read(A address, Instruction const* pc = nullptr) const {
+  std::optional<T> Read(A address, Instruction const* pc = nullptr) const {
     T value;
     VIXL_STATIC_ASSERT((sizeof(value) == 1) || (sizeof(value) == 2) ||
                        (sizeof(value) == 4) || (sizeof(value) == 8) ||
@@ -378,12 +402,16 @@ class Memory {
     if (!IsMTETagsMatched(address, pc)) {
       VIXL_ABORT_WITH_MSG("Tag mismatch.");
     }
+    if (TryMemoryAccess(reinterpret_cast<uintptr_t>(base), sizeof(value)) ==
+        MemoryAccessResult::Failure) {
+      return std::nullopt;
+    }
     memcpy(&value, base, sizeof(value));
     return value;
   }
 
   template <typename T, typename A>
-  void Write(A address, T value, Instruction const* pc = nullptr) const {
+  bool Write(A address, T value, Instruction const* pc = nullptr) const {
     VIXL_STATIC_ASSERT((sizeof(value) == 1) || (sizeof(value) == 2) ||
                        (sizeof(value) == 4) || (sizeof(value) == 8) ||
                        (sizeof(value) == 16));
@@ -394,11 +422,16 @@ class Memory {
     if (!IsMTETagsMatched(address, pc)) {
       VIXL_ABORT_WITH_MSG("Tag mismatch.");
     }
+    if (TryMemoryAccess(reinterpret_cast<uintptr_t>(base), sizeof(value)) ==
+        MemoryAccessResult::Failure) {
+      return false;
+    }
     memcpy(base, &value, sizeof(value));
+    return true;
   }
 
   template <typename A>
-  uint64_t ReadUint(int size_in_bytes, A address) const {
+  std::optional<uint64_t> ReadUint(int size_in_bytes, A address) const {
     switch (size_in_bytes) {
       case 1:
         return Read<uint8_t>(address);
@@ -414,7 +447,7 @@ class Memory {
   }
 
   template <typename A>
-  int64_t ReadInt(int size_in_bytes, A address) const {
+  std::optional<int64_t> ReadInt(int size_in_bytes, A address) const {
     switch (size_in_bytes) {
       case 1:
         return Read<int8_t>(address);
@@ -430,7 +463,7 @@ class Memory {
   }
 
   template <typename A>
-  void Write(int size_in_bytes, A address, uint64_t value) const {
+  bool Write(int size_in_bytes, A address, uint64_t value) const {
     switch (size_in_bytes) {
       case 1:
         return Write(address, static_cast<uint8_t>(value));
@@ -442,6 +475,7 @@ class Memory {
         return Write(address, value);
     }
     VIXL_UNREACHABLE();
+    return false;
   }
 
   void AppendMetaData(MetaDataDepot* metadata_depot) {
@@ -650,7 +684,7 @@ class LogicPRegister {
 
   void SetAllBits() {
     int chunk_size = sizeof(ChunkType) * kBitsPerByte;
-    ChunkType bits = GetUintMask(chunk_size);
+    ChunkType bits = static_cast<ChunkType>(GetUintMask(chunk_size));
     for (int lane = 0;
          lane < (static_cast<int>(register_.GetSizeInBits() / chunk_size));
          lane++) {
@@ -703,6 +737,8 @@ class LogicPRegister {
   SimPRegister& register_;
 };
 
+using vixl_uint128_t = std::pair<uint64_t, uint64_t>;
+
 // Representation of a vector register, with typed getters and setters for lanes
 // and additional information to represent lane state.
 class LogicVRegister {
@@ -831,6 +867,16 @@ class LogicVRegister {
     }
   }
 
+  void SetUint(VectorFormat vform, int index, vixl_uint128_t value) const {
+    if (LaneSizeInBitsFromFormat(vform) > 64) {  // 128-bit (Q) lanes only.
+      VIXL_ASSERT((vform == kFormat1Q) || (vform == kFormatVnQ));
+      SetUint(kFormatVnD, 2 * index, value.second);  // Even D lane.
+      SetUint(kFormatVnD, 2 * index + 1, value.first);  // Odd D lane.
+      return;
+    }
+    SetUint(vform, index, value.second);  // Narrower lanes ignore `first`.
+  }
+
   void SetUintArray(VectorFormat vform, const uint64_t* src) const {
     ClearForWrite(vform);
     for (int i = 0; i < LaneCountFromFormat(vform); i++) {
@@ -1234,9 +1280,10 @@ class SimExclusiveGlobalMonitor {
   uint32_t seed_;
 };
 
-
 class Debugger;
 
+template <uint32_t mode>
+uint64_t CryptoOp(uint64_t x, uint64_t y, uint64_t z);
 
 class Simulator : public DecoderVisitor {
  public:
@@ -1269,7 +1316,7 @@ class Simulator : public DecoderVisitor {
 
 
 #if defined(VIXL_HAS_ABI_SUPPORT) && __cplusplus >= 201103L && \
-    (defined(__clang__) || GCC_VERSION_OR_NEWER(4, 9, 1))
+    (defined(_MSC_VER) || defined(__clang__) || GCC_VERSION_OR_NEWER(4, 9, 1))
   // Templated `RunFrom` version taking care of passing arguments and returning
   // the result value.
   // This allows code like:
@@ -1472,6 +1519,7 @@ class Simulator : public DecoderVisitor {
   void SimulateSVESaturatingMulAddHigh(const Instruction* instr);
   void SimulateSVESaturatingMulHighIndex(const Instruction* instr);
   void SimulateSVEFPConvertLong(const Instruction* instr);
+  void SimulateSVEPmull128(const Instruction* instr);
   void SimulateMatrixMul(const Instruction* instr);
   void SimulateSVEFPMatrixMul(const Instruction* instr);
   void SimulateNEONMulByElementLong(const Instruction* instr);
@@ -1479,6 +1527,7 @@ class Simulator : public DecoderVisitor {
   void SimulateNEONFPMulByElementLong(const Instruction* instr);
   void SimulateNEONComplexMulByElement(const Instruction* instr);
   void SimulateNEONDotProdByElement(const Instruction* instr);
+  void SimulateNEONSHA3(const Instruction* instr);
   void SimulateMTEAddSubTag(const Instruction* instr);
   void SimulateMTETagMaskInsert(const Instruction* instr);
   void SimulateMTESubPointer(const Instruction* instr);
@@ -1498,7 +1547,10 @@ class Simulator : public DecoderVisitor {
   void SimulateSetGM(const Instruction* instr);
   void SimulateSignedMinMax(const Instruction* instr);
   void SimulateUnsignedMinMax(const Instruction* instr);
+  void SimulateSHA512(const Instruction* instr);
 
+  void VisitCryptoSM3(const Instruction* instr);
+  void VisitCryptoSM4(const Instruction* instr);
 
   // Integer register accessors.
 
@@ -2029,62 +2081,66 @@ class Simulator : public DecoderVisitor {
   }
 
   template <typename T, typename A>
-  T MemRead(A address) const {
+  std::optional<T> MemRead(A address) const {
     Instruction const* pc = ReadPc();
     return memory_.Read<T>(address, pc);
   }
 
   template <typename T, typename A>
-  void MemWrite(A address, T value) const {
+  bool MemWrite(A address, T value) const {
     Instruction const* pc = ReadPc();
     return memory_.Write(address, value, pc);
   }
 
   template <typename A>
-  uint64_t MemReadUint(int size_in_bytes, A address) const {
+  std::optional<uint64_t> MemReadUint(int size_in_bytes, A address) const {
     return memory_.ReadUint(size_in_bytes, address);
   }
 
   template <typename A>
-  int64_t MemReadInt(int size_in_bytes, A address) const {
+  std::optional<int64_t> MemReadInt(int size_in_bytes, A address) const {
     return memory_.ReadInt(size_in_bytes, address);
   }
 
   template <typename A>
-  void MemWrite(int size_in_bytes, A address, uint64_t value) const {
+  bool MemWrite(int size_in_bytes, A address, uint64_t value) const {
     return memory_.Write(size_in_bytes, address, value);
   }
 
-  void LoadLane(LogicVRegister dst,
+  bool LoadLane(LogicVRegister dst,
                 VectorFormat vform,
                 int index,
                 uint64_t addr) const {
     unsigned msize_in_bytes = LaneSizeInBytesFromFormat(vform);
-    LoadUintToLane(dst, vform, msize_in_bytes, index, addr);
+    return LoadUintToLane(dst, vform, msize_in_bytes, index, addr);
   }
 
-  void LoadUintToLane(LogicVRegister dst,
+  bool LoadUintToLane(LogicVRegister dst,
                       VectorFormat vform,
                       unsigned msize_in_bytes,
                       int index,
                       uint64_t addr) const {
-    dst.SetUint(vform, index, MemReadUint(msize_in_bytes, addr));
+    VIXL_DEFINE_OR_RETURN_FALSE(value, MemReadUint(msize_in_bytes, addr));
+    dst.SetUint(vform, index, value);
+    return true;
   }
 
-  void LoadIntToLane(LogicVRegister dst,
+  bool LoadIntToLane(LogicVRegister dst,
                      VectorFormat vform,
                      unsigned msize_in_bytes,
                      int index,
                      uint64_t addr) const {
-    dst.SetInt(vform, index, MemReadInt(msize_in_bytes, addr));
+    VIXL_DEFINE_OR_RETURN_FALSE(value, MemReadInt(msize_in_bytes, addr));
+    dst.SetInt(vform, index, value);
+    return true;
   }
 
-  void StoreLane(const LogicVRegister& src,
+  bool StoreLane(const LogicVRegister& src,
                  VectorFormat vform,
                  int index,
                  uint64_t addr) const {
     unsigned msize_in_bytes = LaneSizeInBytesFromFormat(vform);
-    MemWrite(msize_in_bytes, addr, src.Uint(vform, index));
+    return MemWrite(msize_in_bytes, addr, src.Uint(vform, index));
   }
 
   uint64_t ComputeMemOperandAddress(const MemOperand& mem_op) const;
@@ -2095,12 +2151,14 @@ class Simulator : public DecoderVisitor {
       return ReadCPURegister<T>(operand.GetCPURegister());
     } else {
       VIXL_ASSERT(operand.IsMemOperand());
-      return MemRead<T>(ComputeMemOperandAddress(operand.GetMemOperand()));
+      auto res = MemRead<T>(ComputeMemOperandAddress(operand.GetMemOperand()));
+      VIXL_ASSERT(res);
+      return *res;
     }
   }
 
   template <typename T>
-  void WriteGenericOperand(GenericOperand operand,
+  bool WriteGenericOperand(GenericOperand operand,
                            T value,
                            RegLogMode log_mode = LogRegWrites) {
     if (operand.IsCPURegister()) {
@@ -2116,8 +2174,9 @@ class Simulator : public DecoderVisitor {
       WriteCPURegister(operand.GetCPURegister(), raw, log_mode);
     } else {
       VIXL_ASSERT(operand.IsMemOperand());
-      MemWrite(ComputeMemOperandAddress(operand.GetMemOperand()), value);
+      return MemWrite(ComputeMemOperandAddress(operand.GetMemOperand()), value);
     }
+    return true;
   }
 
   bool ReadN() const { return nzcv_.GetN() != 0; }
@@ -2493,12 +2552,16 @@ class Simulator : public DecoderVisitor {
   // Other state updates, including system registers.
   void PrintSystemRegister(SystemRegister id);
   void PrintTakenBranch(const Instruction* target);
+  void PrintGCS(bool is_push, uint64_t addr, size_t entry);
   void LogSystemRegister(SystemRegister id) {
     if (ShouldTraceSysRegs()) PrintSystemRegister(id);
   }
   void LogTakenBranch(const Instruction* target) {
     if (ShouldTraceBranches()) PrintTakenBranch(target);
   }
+  void LogGCS(bool is_push, uint64_t addr, size_t entry) {
+    if (ShouldTraceSysRegs()) PrintGCS(is_push, addr, entry);
+  }
 
   // Trace memory accesses.
 
@@ -2528,6 +2591,14 @@ class Simulator : public DecoderVisitor {
   void PrintPWrite(int rt_code, uintptr_t address) {
     PrintPAccess(rt_code, "->", address);
   }
+  void PrintWriteU64(uint64_t x, uintptr_t address) {  // Print `x -> address`.
+    fprintf(stream_,
+            "#      0x%016" PRIx64 " -> %s0x%016" PRIxPTR "%s\n",
+            x,  // uint64_t is not `unsigned long` on every host: use PRIx64.
+            clr_memory_address,
+            address,
+            clr_normal);
+  }
 
   // Like Print* (above), but respect GetTraceParameters().
   void LogRead(int rt_code, PrintRegisterFormat format, uintptr_t address) {
@@ -2562,6 +2633,9 @@ class Simulator : public DecoderVisitor {
   void LogPWrite(int rt_code, uintptr_t address) {
     if (ShouldTraceWrites()) PrintPWrite(rt_code, address);
   }
+  void LogWriteU64(uint64_t x, uintptr_t address) {
+    if (ShouldTraceWrites()) PrintWriteU64(x, address);
+  }
   void LogMemTransfer(uintptr_t dst, uintptr_t src, uint8_t value) {
     if (ShouldTraceWrites()) PrintMemTransfer(dst, src, value);
   }
@@ -2860,7 +2934,7 @@ class Simulator : public DecoderVisitor {
     }
 
     if (offset == 0) {
-      while ((exclude & (1 << tag)) != 0) {
+      while ((exclude & (uint64_t{1} << tag)) != 0) {
         tag = (tag + 1) % 16;
       }
     }
@@ -2868,7 +2942,7 @@ class Simulator : public DecoderVisitor {
     while (offset > 0) {
       offset--;
       tag = (tag + 1) % 16;
-      while ((exclude & (1 << tag)) != 0) {
+      while ((exclude & (uint64_t{1} << tag)) != 0) {
         tag = (tag + 1) % 16;
       }
     }
@@ -2880,12 +2954,15 @@ class Simulator : public DecoderVisitor {
     return (addr & ~(UINT64_C(0xf) << 56)) | (tag << 56);
   }
 
+#if __linux__
+#define VIXL_HAS_SIMULATED_MMAP
   // Create or remove a mapping with memory protection. Memory attributes such
   // as MTE and BTI are represented by metadata in Simulator.
   void* Mmap(
       void* address, size_t length, int prot, int flags, int fd, off_t offset);
 
   int Munmap(void* address, size_t length, int prot);
+#endif
 
   // The common CPUFeatures interface with the set of available features.
 
@@ -2908,7 +2985,7 @@ class Simulator : public DecoderVisitor {
 // Also, the initialisation of the tuples in RuntimeCall(Non)Void is incorrect
 // in GCC before 4.9.1: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=51253
 #if defined(VIXL_HAS_ABI_SUPPORT) && __cplusplus >= 201103L && \
-    (defined(__clang__) || GCC_VERSION_OR_NEWER(4, 9, 1))
+    (defined(_MSC_VER) || defined(__clang__) || GCC_VERSION_OR_NEWER(4, 9, 1))
 
 #define VIXL_HAS_SIMULATED_RUNTIME_CALL_SUPPORT
 
@@ -2966,7 +3043,10 @@ class Simulator : public DecoderVisitor {
     R return_value = DoRuntimeCall(function,
                                    argument_operands,
                                    __local_index_sequence_for<P...>{});
-    WriteGenericOperand(abi.GetReturnGenericOperand<R>(), return_value);
+    bool succeeded =
+        WriteGenericOperand(abi.GetReturnGenericOperand<R>(), return_value);
+    USE(succeeded);
+    VIXL_ASSERT(succeeded);
   }
 
   template <typename R, typename... P>
@@ -3154,6 +3234,43 @@ class Simulator : public DecoderVisitor {
 #endif
   }
 
+#ifdef VIXL_ENABLE_IMPLICIT_CHECKS
+  // Returns true if the faulting instruction address (usually the program
+  // counter or instruction pointer) comes from an internal VIXL memory access.
+  // This can be used by signal handlers to check if a signal was raised from
+  // the simulator (via TryMemoryAccess) before the actual
+  // access occurs.
+  bool IsSimulatedMemoryAccess(uintptr_t fault_pc) const {
+    uintptr_t entry = reinterpret_cast<uintptr_t>(&_vixl_internal_ReadMemory);
+    return fault_pc == entry;
+  }
+
+  // Get the instruction address of the internal VIXL memory access continuation
+  // label. Signal handlers can resume execution at this address to return to
+  // TryMemoryAccess which will continue simulation.
+  uintptr_t GetSignalReturnAddress() const {
+    return reinterpret_cast<uintptr_t>(&_vixl_internal_AccessMemory_continue);
+  }
+
+  // Replace the fault address reported by the kernel with the base address of
+  // the simulated memory access (x86-64 hosts only; a no-op elsewhere).
+  //
+  // This is required because TryMemoryAccess reads a section of memory one
+  // byte at a time, so the fault address reported by the kernel may not be the
+  // base address of the memory being accessed.
+  void ReplaceFaultAddress(siginfo_t* siginfo, void* context) {
+#ifdef __x86_64__
+    // The base address being accessed is passed in as the first argument to
+    // _vixl_internal_ReadMemory.
+    ucontext_t* uc = reinterpret_cast<ucontext_t*>(context);
+    siginfo->si_addr = reinterpret_cast<void*>(uc->uc_mcontext.gregs[REG_RDI]);
+#else
+    USE(siginfo);
+    USE(context);
+#endif  // __x86_64__
+  }
+#endif  // VIXL_ENABLE_IMPLICIT_CHECKS
+
  protected:
   const char* clr_normal;
   const char* clr_flag_name;
@@ -3234,8 +3351,9 @@ class Simulator : public DecoderVisitor {
                                             uint64_t left,
                                             uint64_t right,
                                             int carry_in);
-  using vixl_uint128_t = std::pair<uint64_t, uint64_t>;
   vixl_uint128_t Add128(vixl_uint128_t x, vixl_uint128_t y);
+  vixl_uint128_t Lsl128(vixl_uint128_t x, unsigned shift) const;
+  vixl_uint128_t Eor128(vixl_uint128_t x, vixl_uint128_t y) const;
   vixl_uint128_t Mul64(uint64_t x, uint64_t y);
   vixl_uint128_t Neg128(vixl_uint128_t x);
   void LogicalHelper(const Instruction* instr, int64_t op2);
@@ -3317,92 +3435,95 @@ class Simulator : public DecoderVisitor {
   uint64_t PolynomialMult(uint64_t op1,
                           uint64_t op2,
                           int lane_size_in_bits) const;
+  vixl_uint128_t PolynomialMult128(uint64_t op1,
+                                   uint64_t op2,
+                                   int lane_size_in_bits) const;
 
-  void ld1(VectorFormat vform, LogicVRegister dst, uint64_t addr);
-  void ld1(VectorFormat vform, LogicVRegister dst, int index, uint64_t addr);
-  void ld1r(VectorFormat vform, LogicVRegister dst, uint64_t addr);
-  void ld1r(VectorFormat vform,
+  bool ld1(VectorFormat vform, LogicVRegister dst, uint64_t addr);
+  bool ld1(VectorFormat vform, LogicVRegister dst, int index, uint64_t addr);
+  bool ld1r(VectorFormat vform, LogicVRegister dst, uint64_t addr);
+  bool ld1r(VectorFormat vform,
             VectorFormat unpack_vform,
             LogicVRegister dst,
             uint64_t addr,
             bool is_signed = false);
-  void ld2(VectorFormat vform,
+  bool ld2(VectorFormat vform,
            LogicVRegister dst1,
            LogicVRegister dst2,
            uint64_t addr);
-  void ld2(VectorFormat vform,
+  bool ld2(VectorFormat vform,
            LogicVRegister dst1,
            LogicVRegister dst2,
            int index,
            uint64_t addr);
-  void ld2r(VectorFormat vform,
+  bool ld2r(VectorFormat vform,
             LogicVRegister dst1,
             LogicVRegister dst2,
             uint64_t addr);
-  void ld3(VectorFormat vform,
+  bool ld3(VectorFormat vform,
            LogicVRegister dst1,
            LogicVRegister dst2,
            LogicVRegister dst3,
            uint64_t addr);
-  void ld3(VectorFormat vform,
+  bool ld3(VectorFormat vform,
            LogicVRegister dst1,
            LogicVRegister dst2,
            LogicVRegister dst3,
            int index,
            uint64_t addr);
-  void ld3r(VectorFormat vform,
+  bool ld3r(VectorFormat vform,
             LogicVRegister dst1,
             LogicVRegister dst2,
             LogicVRegister dst3,
             uint64_t addr);
-  void ld4(VectorFormat vform,
+  bool ld4(VectorFormat vform,
            LogicVRegister dst1,
            LogicVRegister dst2,
            LogicVRegister dst3,
            LogicVRegister dst4,
            uint64_t addr);
-  void ld4(VectorFormat vform,
+  bool ld4(VectorFormat vform,
            LogicVRegister dst1,
            LogicVRegister dst2,
            LogicVRegister dst3,
            LogicVRegister dst4,
            int index,
            uint64_t addr);
-  void ld4r(VectorFormat vform,
+  bool ld4r(VectorFormat vform,
             LogicVRegister dst1,
             LogicVRegister dst2,
             LogicVRegister dst3,
             LogicVRegister dst4,
             uint64_t addr);
-  void st1(VectorFormat vform, LogicVRegister src, uint64_t addr);
-  void st1(VectorFormat vform, LogicVRegister src, int index, uint64_t addr);
-  void st2(VectorFormat vform,
+  bool st1(VectorFormat vform, LogicVRegister src, uint64_t addr);
+  bool st1(VectorFormat vform, LogicVRegister src, int index, uint64_t addr);
+  bool st2(VectorFormat vform,
            LogicVRegister src,
            LogicVRegister src2,
            uint64_t addr);
-  void st2(VectorFormat vform,
+  bool st2(VectorFormat vform,
            LogicVRegister src,
            LogicVRegister src2,
            int index,
            uint64_t addr);
-  void st3(VectorFormat vform,
+  bool st3(VectorFormat vform,
            LogicVRegister src,
            LogicVRegister src2,
            LogicVRegister src3,
            uint64_t addr);
-  void st3(VectorFormat vform,
+  bool st3(VectorFormat vform,
            LogicVRegister src,
            LogicVRegister src2,
            LogicVRegister src3,
            int index,
            uint64_t addr);
-  void st4(VectorFormat vform,
+  bool st4(VectorFormat vform,
            LogicVRegister src,
            LogicVRegister src2,
            LogicVRegister src3,
            LogicVRegister src4,
            uint64_t addr);
-  void st4(VectorFormat vform,
+  bool st4(VectorFormat vform,
            LogicVRegister src,
            LogicVRegister src2,
            LogicVRegister src3,
@@ -3688,6 +3809,10 @@ class Simulator : public DecoderVisitor {
                      LogicVRegister dst,
                      const LogicVRegister& src,
                      int rotation);
+  LogicVRegister rol(VectorFormat vform,
+                     LogicVRegister dst,
+                     const LogicVRegister& src,
+                     int rotation);
   LogicVRegister ext(VectorFormat vform,
                      LogicVRegister dst,
                      const LogicVRegister& src1,
@@ -4412,6 +4537,95 @@ class Simulator : public DecoderVisitor {
                          LogicVRegister srcdst,
                          const LogicVRegister& src1,
                          const LogicVRegister& src2);
+
+  template <unsigned N>
+  static void SHARotateEltsLeftOne(uint64_t (&x)[N]) {
+    VIXL_STATIC_ASSERT(N == 4);
+    const uint64_t wrapped = x[N - 1];  // The top element wraps round to x[0].
+    for (unsigned i = N - 1; i > 0; i--) {
+      x[i] = x[i - 1];
+    }
+    x[0] = wrapped;
+  }
+
+  template <uint32_t mode>
+  LogicVRegister sha1(LogicVRegister srcdst,
+                      const LogicVRegister& src1,
+                      const LogicVRegister& src2) {
+    uint64_t y = src1.Uint(kFormat4S, 0);
+    uint64_t sd[4] = {};
+    srcdst.UintArray(kFormat4S, sd);
+    // One pass per element of sd; pass i consumes element i of src2.
+    for (unsigned i = 0; i < ArrayLength(sd); i++) {
+      uint64_t t = CryptoOp<mode>(sd[1], sd[2], sd[3]);
+      // Fold sd[0] rotated left by 5, t and the src2 element into y.
+      y += RotateLeft(sd[0], 5, kSRegSize) + t;
+      y += src2.Uint(kFormat4S, i);
+
+      sd[1] = RotateLeft(sd[1], 30, kSRegSize);
+
+      // y:sd = ROL(y:sd, 32)
+      SHARotateEltsLeftOne(sd);
+      std::swap(sd[0], y);
+    }
+
+    srcdst.SetUintArray(kFormat4S, sd);
+    return srcdst;
+  }
+
+  LogicVRegister sha2h(LogicVRegister srcdst,
+                       const LogicVRegister& src1,
+                       const LogicVRegister& src2,
+                       bool part1);
+  LogicVRegister sha2su0(LogicVRegister srcdst, const LogicVRegister& src1);
+  LogicVRegister sha2su1(LogicVRegister srcdst,
+                         const LogicVRegister& src1,
+                         const LogicVRegister& src2);
+  LogicVRegister sha512h(LogicVRegister srcdst,
+                         const LogicVRegister& src1,
+                         const LogicVRegister& src2);
+  LogicVRegister sha512h2(LogicVRegister srcdst,
+                          const LogicVRegister& src1,
+                          const LogicVRegister& src2);
+  LogicVRegister sha512su0(LogicVRegister srcdst, const LogicVRegister& src1);
+  LogicVRegister sha512su1(LogicVRegister srcdst,
+                           const LogicVRegister& src1,
+                           const LogicVRegister& src2);
+
+
+  LogicVRegister aes(LogicVRegister srcdst,
+                     const LogicVRegister& src1,
+                     bool decrypt);
+  LogicVRegister aesmix(LogicVRegister srcdst,
+                        const LogicVRegister& src1,
+                        bool inverse);
+
+  LogicVRegister sm3partw1(LogicVRegister dst,
+                           const LogicVRegister& src1,
+                           const LogicVRegister& src2);
+  LogicVRegister sm3partw2(LogicVRegister dst,
+                           const LogicVRegister& src1,
+                           const LogicVRegister& src2);
+  LogicVRegister sm3ss1(LogicVRegister dst,
+                        const LogicVRegister& src1,
+                        const LogicVRegister& src2,
+                        const LogicVRegister& src3);
+  LogicVRegister sm3tt1(LogicVRegister srcdst,
+                        const LogicVRegister& src1,
+                        const LogicVRegister& src2,
+                        int index,
+                        bool is_a);
+  LogicVRegister sm3tt2(LogicVRegister srcdst,
+                        const LogicVRegister& src1,
+                        const LogicVRegister& src2,
+                        int index,
+                        bool is_a);
+
+  LogicVRegister sm4(LogicVRegister dst,
+                     const LogicVRegister& src1,
+                     const LogicVRegister& src2,
+                     bool is_key);
+
 #define NEON_3VREG_LOGIC_LIST(V) \
   V(addhn)                       \
   V(addhn2)                      \
@@ -4825,7 +5039,7 @@ class Simulator : public DecoderVisitor {
   uint32_t Crc32Checksum(uint32_t acc, T val, uint32_t poly);
   uint32_t Crc32Checksum(uint32_t acc, uint64_t val, uint32_t poly);
 
-  void SysOp_W(int op, int64_t val);
+  bool SysOp_W(int op, int64_t val);
 
   template <typename T>
   T FPRecipSqrtEstimate(T op);
@@ -4979,7 +5193,8 @@ class Simulator : public DecoderVisitor {
                                 unsigned zt_code,
                                 const LogicSVEAddressVector& addr);
   // Load each active zt<i>[lane] from `addr.GetElementAddress(lane, ...)`.
-  void SVEStructuredLoadHelper(VectorFormat vform,
+  // Returns false if a load failed.
+  bool SVEStructuredLoadHelper(VectorFormat vform,
                                const LogicPRegister& pg,
                                unsigned zt_code,
                                const LogicSVEAddressVector& addr,
@@ -5178,10 +5393,12 @@ class Simulator : public DecoderVisitor {
 
   bool CanReadMemory(uintptr_t address, size_t size);
 
+#ifndef _WIN32
   // CanReadMemory needs placeholder file descriptors, so we use a pipe. We can
   // save some system call overhead by opening them on construction, rather than
   // on every call to CanReadMemory.
   int placeholder_pipe_fd_[2];
+#endif
 
   template <typename T>
   static T FPDefaultNaN();
@@ -5265,15 +5482,22 @@ class Simulator : public DecoderVisitor {
   Vector<CPUFeatures>saved_cpu_features_;
 #endif
 
-  // State for *rand48 functions, used to simulate randomness with repeatable
+  // linear_congruential_engine, used to simulate randomness with repeatable
   // behaviour (so that tests are deterministic). This is used to simulate RNDR
   // and RNDRRS, as well as to simulate a source of entropy for architecturally
   // undefined behaviour.
-  uint16_t rand_state_[3];
+  std::linear_congruential_engine<uint64_t,
+                                  0x5DEECE66D,
+                                  0xB,
+                                  static_cast<uint64_t>(1) << 48>
+      rand_gen_;
 
   // A configurable size of SVE vector registers.
   unsigned vector_length_;
 
+  // DCZID_EL0 layout: bit 4 (DZP) = 0 enables DC ZVA; bits 3:0 = log2(words).
+  unsigned dczid_ = (0 << 4) | 4;  // 2^4 words => 64-byte block size.
+
   // Representation of memory attributes such as MTE tagging and BTI page
   // protection in addition to branch interceptions.
   MetaDataDepot meta_data_;
@@ -5287,6 +5511,161 @@ class Simulator : public DecoderVisitor {
 #else
   Debugger* debugger_{nullptr};
 #endif
+
+  // The Guarded Control Stack is represented using a vector, where the more
+  // recently stored addresses are at higher-numbered indices.
+  using GuardedControlStack = std::vector<uint64_t>;
+
+  // The GCSManager handles the synchronisation of GCS across multiple
+  // Simulator instances. Each Simulator has its own stack, but all share
+  // a GCSManager instance. This allows exchanging stacks between Simulators
+  // in a threaded application.
+  class GCSManager {
+   public:
+    // Allocate a new Guarded Control Stack and add it to the vector of stacks.
+    // Returns the stack's id: its stacks_ index shifted left by kPageSizeLog2.
+    uint64_t AllocateStack() {
+      const std::lock_guard<std::mutex> lock(stacks_mtx_);
+      GuardedControlStack* new_stack = new GuardedControlStack;
+      uint64_t result;
+
+      // Put the new stack into the first available slot.
+      for (result = 0; result < stacks_.size(); result++) {
+        if (stacks_[result] == nullptr) {
+          stacks_[result] = new_stack;
+          break;
+        }
+      }
+      // If there were no slots, create a new one.
+      if (result == stacks_.size()) {
+        stacks_.push_back(new_stack);
+      }
+      // Shift the index to look like a stack pointer aligned to a page.
+      result <<= kPageSizeLog2;
+
+      // Push the tagged index onto the new stack as a seal.
+      new_stack->push_back(result + 1);
+      return result;
+    }
+
+    // Free a Guarded Control Stack and set the stacks_ slot to null.
+    void FreeStack(uint64_t gcs) {
+      const std::lock_guard<std::mutex> lock(stacks_mtx_);
+      uint64_t gcs_index = GetGCSIndex(gcs);
+      // An id beyond the end of stacks_ was never allocated either.
+      if ((gcs_index >= stacks_.size()) || (stacks_[gcs_index] == nullptr)) {
+        VIXL_ABORT_WITH_MSG("Tried to free unallocated GCS ");
+      } else {
+        delete stacks_[gcs_index];
+        stacks_[gcs_index] = nullptr;
+      }
+    }
+
+    // Get a pointer to the GCS vector using a GCS id; null if freed.
+    // NOTE(review): reads stacks_ without holding stacks_mtx_.
+    GuardedControlStack* GetGCSPtr(uint64_t gcs) const {
+      return stacks_[GetGCSIndex(gcs)];
+    }
+
+   private:
+    // Recover the stacks_ index from an id; must mirror AllocateStack's shift.
+    uint64_t GetGCSIndex(uint64_t gcs) const { return gcs >> kPageSizeLog2; }
+
+    std::vector<GuardedControlStack*> stacks_;
+    std::mutex stacks_mtx_;
+  };
+
+  // A GCS id indicating no GCS has been allocated.
+  static const uint64_t kGCSNoStack = kPageSize - 1;
+  uint64_t gcs_;
+  bool gcs_enabled_;
+
+ public:
+  GCSManager& GetGCSManager() {
+    static GCSManager manager;
+    return manager;
+  }
+
+  void EnableGCSCheck() { gcs_enabled_ = true; }
+  void DisableGCSCheck() { gcs_enabled_ = false; }
+  bool IsGCSCheckEnabled() const { return gcs_enabled_; }
+
+ private:
+  bool IsAllocatedGCS(uint64_t gcs) const { return gcs != kGCSNoStack; }
+  void ResetGCSState() {
+    GCSManager& m = GetGCSManager();
+    if (IsAllocatedGCS(gcs_)) {
+      m.FreeStack(gcs_);
+    }
+    ActivateGCS(m.AllocateStack());
+    GCSPop();  // Remove seal.
+  }
+
+  // Look up the stack registered under id `gcs` in the shared GCSManager; the
+  // result is null if that slot has been freed.
+  GuardedControlStack* GetGCSPtr(uint64_t gcs) {
+    return GetGCSManager().GetGCSPtr(gcs);
+  }
+  GuardedControlStack* GetActiveGCSPtr() { return GetGCSPtr(gcs_); }
+
+  uint64_t ActivateGCS(uint64_t gcs) {
+    uint64_t outgoing_gcs = gcs_;
+    gcs_ = gcs;
+    return outgoing_gcs;
+  }
+
+  void GCSPush(uint64_t addr) {
+    GuardedControlStack* active = GetActiveGCSPtr();
+    active->push_back(addr);
+    LogGCS(/* is_push = */ true, addr, active->size() - 1);  // New top index.
+  }
+
+  uint64_t GCSPop() {
+    GuardedControlStack* active = GetActiveGCSPtr();
+    if (active->empty()) {
+      return 0;  // Nothing to pop.
+    }
+    const uint64_t popped = active->back();
+    active->pop_back();
+    // After the pop, size() is the index the removed entry occupied.
+    LogGCS(/* is_push = */ false, popped, active->size());
+    return popped;
+  }
+
+  // Return the newest entry of the active stack without popping; 0 if empty.
+  uint64_t GCSPeek() {
+    const GuardedControlStack* active = GetActiveGCSPtr();
+    if (!active->empty()) {
+      return active->back();
+    }
+    return 0;
+  }
+
+  // When GCS checking is enabled: print `msg`, pop and print up to eight of
+  // the newest records of the active stack, then call VIXL_ABORT_WITH_MSG.
+  void ReportGCSFailure(const char* msg) {
+    if (!IsGCSCheckEnabled()) {
+      return;
+    }
+    GuardedControlStack* stack = GetActiveGCSPtr();
+    printf("%s", msg);
+    if (stack != nullptr) {
+      printf("GCS records, most recent first:\n");
+      const int newest = static_cast<int>(stack->size()) - 1;
+      for (int shown = 0; (shown < 8) && !stack->empty(); shown++) {
+        const uint64_t entry = stack->back();
+        stack->pop_back();
+        printf(" gcs%" PRIu64 "[%d]: 0x%016" PRIx64 "\n",
+               gcs_,
+               newest - shown,
+               entry);
+      }
+      printf("End of GCS records.\n");
+    } else {
+      printf("GCS pointer is null\n");
+    }
+    VIXL_ABORT_WITH_MSG("GCS failed ");
+  }
 };
 
 #if defined(VIXL_HAS_SIMULATED_RUNTIME_CALL_SUPPORT) && __cplusplus < 201402L
diff --git a/src/cpu-features.h b/src/cpu-features.h
index 97eb661a..1a041f66 100644
--- a/src/cpu-features.h
+++ b/src/cpu-features.h
@@ -201,7 +201,8 @@ namespace vixl {
   /* Extended BFloat16 instructions                                         */ \
   V(kEBF16,               "EBF16",                  "ebf16")                   \
   V(kSVE_EBF16,           "EBF16 (SVE)",            "sveebf16")                \
-  V(kCSSC,                "CSSC",                   "cssc")
+  V(kCSSC,                "CSSC",                   "cssc")                    \
+  V(kGCS,                 "GCS",                    "gcs")
 // clang-format on
 
 
diff --git a/src/globals-vixl.h b/src/globals-vixl.h
index 2efed250..b096c7f3 100644
--- a/src/globals-vixl.h
+++ b/src/globals-vixl.h
@@ -215,6 +215,18 @@ inline void USE(const T1&, const T2&, const T3&, const T4&) {}
   } while (0)
 #endif
 
+// Evaluate 'init' to an std::optional and return (or, for the _FALSE variant,
+// return false) if it's empty. Otherwise define a variable 'name' holding the
+// value inside the std::optional.
+#define VIXL_DEFINE_OR_RETURN(name, init) \
+  auto opt##name = init;                  \
+  if (!opt##name) return;                 \
+  auto name = *opt##name;
+#define VIXL_DEFINE_OR_RETURN_FALSE(name, init) \
+  auto opt##name = init;                        \
+  if (!opt##name) return false;                 \
+  auto name = *opt##name;
+
 #if __cplusplus >= 201103L
 #define VIXL_NO_RETURN [[noreturn]]
 #else
diff --git a/src/invalset-vixl.h b/src/invalset-vixl.h
index bdd66025..12de273d 100644
--- a/src/invalset-vixl.h
+++ b/src/invalset-vixl.h
@@ -1,4 +1,3 @@
-// Copyright 2015, VIXL authors
 // All rights reserved.
 //
 // Redistribution and use in source and binary forms, with or without
@@ -95,7 +94,7 @@ class InvalSet {
 #else
   InvalSet() = delete;
   InvalSet(AllocatorWrapper alocator);
-  InvalSet(InvalSet&&) = default;
+  InvalSet(InvalSet&&) = default;  // movable
 #endif
   ~InvalSet() VIXL_NEGATIVE_TESTING_ALLOW_EXCEPTION;
 
diff --git a/src/pool-manager-impl.h b/src/pool-manager-impl.h
index 91bc4369..2dfd09c0 100644
--- a/src/pool-manager-impl.h
+++ b/src/pool-manager-impl.h
@@ -491,7 +491,7 @@ void PoolManager<T>::Release(T pc) {
 }
 
 template <typename T>
-PoolManager<T>::~PoolManager<T>() VIXL_NEGATIVE_TESTING_ALLOW_EXCEPTION {
+PoolManager<T>::~PoolManager() VIXL_NEGATIVE_TESTING_ALLOW_EXCEPTION {
 #ifdef VIXL_DEBUG
   // Check for unbound objects.
   for (objects_iter iter = objects_.begin(); iter != objects_.end(); ++iter) {
diff --git a/src/utils-vixl.h b/src/utils-vixl.h
index 9b0dbc29..9e08ba7d 100644
--- a/src/utils-vixl.h
+++ b/src/utils-vixl.h
@@ -385,6 +385,11 @@ inline uint64_t RotateRight(uint64_t value,
   return value & width_mask;
 }
 
+inline uint64_t RotateLeft(uint64_t value,
+                           unsigned int rotate,
+                           unsigned int width) {
+  return RotateRight(value, width - rotate, width);  // Wraps if rotate > width.
+}
 
 // Wrapper class for passing FP16 values through the assembler.
 // This is purely to aid with type checking/casting.
@@ -437,6 +442,12 @@ T UnsignedNegate(T value) {
   return ~value + 1;
 }
 
+template <typename T>
+bool CanBeNegated(T value) {
+  VIXL_STATIC_ASSERT(std::is_signed<T>::value);
+  return value != std::numeric_limits<T>::min();  // -min() would overflow.
+}
+
 // An absolute operation for signed integers that is defined for results outside
 // the representable range. Specifically, Abs(MIN_INT) is MIN_INT.
 template <typename T>
@@ -694,13 +705,14 @@ inline T SignExtend(T val, int size_in_bits) {
 template <typename T>
 T ReverseBytes(T value, int block_bytes_log2) {
   VIXL_ASSERT((sizeof(value) == 4) || (sizeof(value) == 8));
-  VIXL_ASSERT((1U << block_bytes_log2) <= sizeof(value));
+  VIXL_ASSERT((uint64_t{1} << block_bytes_log2) <= sizeof(value));
   // Split the 64-bit value into an 8-bit array, where b[0] is the least
   // significant byte, and b[7] is the most significant.
   uint8_t bytes[8];
   uint64_t mask = UINT64_C(0xff00000000000000);
   for (int i = 7; i >= 0; i--) {
-    bytes[i] = (static_cast<uint64_t>(value) & mask) >> (i * 8);
+    bytes[i] =
+        static_cast<uint8_t>((static_cast<uint64_t>(value) & mask) >> (i * 8));
     mask >>= 8;
   }
 
@@ -757,6 +769,39 @@ bool IsWordAligned(T pointer) {
   return IsAligned<4>(pointer);
 }
 
+template <unsigned BITS, typename T>
+bool IsRepeatingPattern(T value) {
+  VIXL_STATIC_ASSERT(std::is_unsigned<T>::value);
+  VIXL_ASSERT(IsMultiple(sizeof(value) * kBitsPerByte, BITS));
+  VIXL_ASSERT(IsMultiple(BITS, 2));
+  VIXL_STATIC_ASSERT(BITS >= 2);
+#if (defined(__x86_64__) || defined(__i386)) && __clang_major__ >= 17 && \
+    __clang_major__ <= 19
+  // Workaround for https://github.com/llvm/llvm-project/issues/108722
+  unsigned hbits = BITS / 2;
+  T midmask = (~static_cast<T>(0) >> BITS) << hbits;
+  // E.g. for bytes in a word (0xb3b2b1b0): .b3b2b1. == .b2b1b0.
+  return (((value >> hbits) & midmask) == ((value << hbits) & midmask));
+#else  // Rotating by BITS leaves a BITS-periodic value unchanged.
+  return value == RotateRight(value, BITS, sizeof(value) * kBitsPerByte);
+#endif
+}
+
+template <typename T>
+bool AllBytesMatch(T value) {
+  return IsRepeatingPattern<kBitsPerByte>(value);
+}
+
+template <typename T>
+bool AllHalfwordsMatch(T value) {
+  return IsRepeatingPattern<kBitsPerByte * 2>(value);
+}
+
+template <typename T>
+bool AllWordsMatch(T value) {
+  return IsRepeatingPattern<kBitsPerByte * 4>(value);
+}
+
 // Increment a pointer until it has the specified alignment. The alignment must
 // be a power of two.
 template <class T>
diff --git a/test/aarch32/test-assembler-aarch32.cc b/test/aarch32/test-assembler-aarch32.cc
index 3432a806..d97e18be 100644
--- a/test/aarch32/test-assembler-aarch32.cc
+++ b/test/aarch32/test-assembler-aarch32.cc
@@ -177,17 +177,23 @@ namespace aarch32 {
 #ifdef VIXL_INCLUDE_SIMULATOR_AARCH32
 // No simulator yet. We can't test the results.
 
-#define ASSERT_EQUAL_32(expected, result)
+#define ASSERT_EQUAL_32(expected, result) \
+  USE(expected, result)
 
-#define ASSERT_EQUAL_64(expected, result)
+#define ASSERT_EQUAL_64(expected, result) \
+  USE(expected, result)
 
-#define ASSERT_EQUAL_128(expected_h, expected_l, result)
+#define ASSERT_EQUAL_128(expected_h, expected_l, result) \
+  USE(expected_h, expected_l, result)
 
-#define ASSERT_EQUAL_FP32(expected, result)
+#define ASSERT_EQUAL_FP32(expected, result) \
+  USE(expected, result)
 
-#define ASSERT_EQUAL_FP64(expected, result)
+#define ASSERT_EQUAL_FP64(expected, result) \
+  USE(expected, result)
 
-#define ASSERT_EQUAL_NZCV(expected)
+#define ASSERT_EQUAL_NZCV(expected) \
+  USE(expected)
 
 #else
 
@@ -3634,8 +3640,6 @@ static void NearBranchAndLiteralFuzzHelper(InstructionSet isa,
   const int label_count = 15;
   const int literal_count = 31;
   Label* labels;
-  uint64_t* literal_values;
-  Literal<uint64_t>* literals[literal_count];
 
   // Use multiple iterations, as each produces a different predictably random
   // sequence.
@@ -3679,12 +3683,13 @@ static void NearBranchAndLiteralFuzzHelper(InstructionSet isa,
         labels = new Label[label_count];
 
         // Create new literal values.
-        literal_values = new uint64_t[literal_count];
+        std::vector<uint64_t> literal_values;
+        std::vector<Literal<uint64_t>> literals;
         for (int lit = 0; lit < literal_count; lit++) {
           // TODO: Generate pseudo-random data for literals. At the moment, the
           // disassembler breaks if we do this.
-          literal_values[lit] = lit;
-          literals[lit] = new Literal<uint64_t>(literal_values[lit]);
+          literal_values.push_back(lit);
+          literals.emplace_back(Literal<uint64_t>(literal_values[lit]));
         }
 
         for (;;) {
@@ -3736,13 +3741,13 @@ static void NearBranchAndLiteralFuzzHelper(InstructionSet isa,
               __ Nop();
               break;
             case 4:
-              __ Ldr(r2, literals[literal_index]);
+              __ Ldr(r2, &literals[literal_index]);
               __ Cmp(r2, static_cast<uint32_t>(literal_values[literal_index]));
               __ B(ne, &fail);
               __ Mov(r2, 0);
               break;
             case 5:
-              __ Ldrb(r2, literals[literal_index]);
+              __ Ldrb(r2, &literals[literal_index]);
               __ Cmp(r2,
                      static_cast<uint32_t>(literal_values[literal_index]) &
                          0xff);
@@ -3750,7 +3755,7 @@ static void NearBranchAndLiteralFuzzHelper(InstructionSet isa,
               __ Mov(r2, 0);
               break;
             case 6:
-              __ Ldrd(r2, r3, literals[literal_index]);
+              __ Ldrd(r2, r3, &literals[literal_index]);
               __ Cmp(r2, static_cast<uint32_t>(literal_values[literal_index]));
               __ B(ne, &fail);
               __ Mov(r2, 0);
@@ -3761,7 +3766,7 @@ static void NearBranchAndLiteralFuzzHelper(InstructionSet isa,
               __ Mov(r3, 0);
               break;
             case 7:
-              __ Vldr(s0, literals[literal_index]);
+              __ Vldr(s0, &literals[literal_index]);
               __ Vmov(s1, static_cast<uint32_t>(literal_values[literal_index]));
               __ Vcmp(s0, s1);
               __ B(ne, &fail);
@@ -3875,9 +3880,6 @@ static void NearBranchAndLiteralFuzzHelper(InstructionSet isa,
         // independent.
         masm.FinalizeCode(MacroAssembler::kFallThrough);
         delete[] labels;
-        for (int lit = 0; lit < literal_count; lit++) {
-          delete literals[lit];
-        }
       }
     }
   }
diff --git a/test/aarch32/test-disasm-a32.cc b/test/aarch32/test-disasm-a32.cc
index c229c2fb..95bb26ee 100644
--- a/test/aarch32/test-disasm-a32.cc
+++ b/test/aarch32/test-disasm-a32.cc
@@ -1700,6 +1700,41 @@ TEST(macro_assembler_Cbz) {
 }
 
 
+TEST(macro_assembler_b_cond_t32) {
+  SETUP();
+
+#ifdef VIXL_INCLUDE_TARGET_T32
+  // Ensure backward conditional branches are veneered correctly.
+  __ UseT32();
+  int pc_off = __ GetArchitectureStatePCOffset();
+
+  // Largest encodable backwards offset.
+  int curs = __ GetCursorOffset() + pc_off;
+  Label label_neg1m(curs - 1048576);
+  COMPARE_T32(B(ne, &label_neg1m), "bne 0xfff00004\n");
+
+  // Next largest cannot be encoded.
+  curs = __ GetCursorOffset() + pc_off;
+  Label label_neg1m_plus_inst(curs - (1048576 + 2));
+  COMPARE_T32(B(ne, &label_neg1m_plus_inst), "beq 0x00000006\n"
+                                             "b 0xfff00002\n");
+
+  // Offset that requires largest unconditional branch in veneer.
+  curs = __ GetCursorOffset() + pc_off;
+  Label label_neg16m(curs - (16777216 - 2));
+  COMPARE_T32(B(ne, &label_neg16m), "beq 0x00000006\n"
+                                    "b 0xff000006\n");
+
+  // Next largest cannot be veneered.
+  curs = __ GetCursorOffset() + pc_off;
+  Label label_neg16m_plus_inst(curs - 16777216);
+  MUST_FAIL_TEST_T32(B(ne, &label_neg16m_plus_inst),
+                     "Conditional branch too far for veneer.\n");
+#endif
+
+  CLEANUP();
+}
+
 #ifdef VIXL_NEGATIVE_TESTING
 TEST(assembler_crc_negative) {
   SETUP();
diff --git a/test/aarch64/test-api-aarch64.cc b/test/aarch64/test-api-aarch64.cc
index c724f178..3ac9efb7 100644
--- a/test/aarch64/test-api-aarch64.cc
+++ b/test/aarch64/test-api-aarch64.cc
@@ -27,6 +27,7 @@
 #include <cstdio>
 #include <cstring>
 #include <string>
+#include <thread>
 
 #include "test-runner.h"
 #include "test-utils.h"
@@ -1763,6 +1764,24 @@ TEST(sim_stack) {
   VIXL_CHECK(s.IsAccessInGuardRegion(s.GetLimit() - 1280, 2048));
   VIXL_CHECK(s.IsAccessInGuardRegion(s.GetLimit() - 1280, 10000));
 }
+
+void AllocateAndFreeGCS() {
+  Decoder decoder;
+  Simulator simulator(&decoder);
+  auto& manager = simulator.GetGCSManager();
+  for (int iteration = 0; iteration < 100000; iteration++) {
+    const uint64_t stack_id = manager.AllocateStack();
+    manager.FreeStack(stack_id);
+  }
+}
+
+TEST(sim_gcs_manager) {
+  std::thread first(AllocateAndFreeGCS);
+  std::thread second(AllocateAndFreeGCS);
+  // Both threads use the one GCSManager shared by all simulators.
+  first.join();
+  second.join();
+}
 #endif
 
 }  // namespace aarch64
diff --git a/test/aarch64/test-assembler-aarch64.cc b/test/aarch64/test-assembler-aarch64.cc
index 00155471..a86b32e2 100644
--- a/test/aarch64/test-assembler-aarch64.cc
+++ b/test/aarch64/test-assembler-aarch64.cc
@@ -1634,11 +1634,19 @@ TEST(pacia_pacib_autia_autib) {
   START();
 
   Register pointer = x24;
-  Register modifier = x25;
+  Register retry_limit = x25;
+  Register modifier = x26;
+  Label retry;
 
+  // There is a small but not negligible chance (1 in 127 runs) that the PAC
+  // codes for keys A and B will collide, so retry a few times with different
+  // pointers.
   __ Mov(pointer, 0x0000000012345678);
+  __ Mov(retry_limit, 0x0000000012345678 + 32);
   __ Mov(modifier, 0x477d469dec0b8760);
 
+  __ Bind(&retry);
+
   // Generate PACs using keys A and B.
   __ Mov(x0, pointer);
   __ Pacia(x0, modifier);
@@ -1660,21 +1668,24 @@ TEST(pacia_pacib_autia_autib) {
   __ Mov(x5, x0);
   __ Autib(x5, modifier);
 
-  // Mask out just the PAC code bits.
-  // TODO: use Simulator::CalculatePACMask in a nice way.
-  __ And(x0, x0, 0x007f000000000000);
-  __ And(x1, x1, 0x007f000000000000);
+  // Retry on collisions.
+  __ Cmp(x0, x1);
+  __ Ccmp(pointer, x0, ZFlag, ne);
+  __ Ccmp(pointer, x1, ZFlag, ne);
+  __ Ccmp(pointer, x4, ZFlag, ne);
+  __ Ccmp(pointer, x5, ZFlag, ne);
+  __ Ccmp(pointer, retry_limit, ZFlag, eq);
+  __ Cinc(pointer, pointer, ne);
+  __ B(ne, &retry);
 
   END();
 
   if (CAN_RUN()) {
     RUN();
 
-    // Check PAC codes have been generated and aren't equal.
-    // NOTE: with a different ComputePAC implementation, there may be a
-    // collision.
-    ASSERT_NOT_EQUAL_64(0, x0);
-    ASSERT_NOT_EQUAL_64(0, x1);
+    // Check PAC codes have been generated.
+    ASSERT_NOT_EQUAL_64(pointer, x0);
+    ASSERT_NOT_EQUAL_64(pointer, x1);
     ASSERT_NOT_EQUAL_64(x0, x1);
 
     // Pointers correctly authenticated.
@@ -1682,8 +1693,13 @@ TEST(pacia_pacib_autia_autib) {
     ASSERT_EQUAL_64(pointer, x3);
 
     // Pointers corrupted after failing to authenticate.
+#ifdef VIXL_INCLUDE_SIMULATOR_AARCH64
     ASSERT_EQUAL_64(0x0020000012345678, x4);
     ASSERT_EQUAL_64(0x0040000012345678, x5);
+#else
+    ASSERT_NOT_EQUAL_64(pointer, x4);
+    ASSERT_NOT_EQUAL_64(pointer, x5);
+#endif
   }
 }
 
@@ -1694,8 +1710,16 @@ TEST(paciza_pacizb_autiza_autizb) {
   START();
 
   Register pointer = x24;
+  Register retry_limit = x25;
+  Label retry;
 
+  // There is a small but not negligible chance (1 in 127 runs) that the PAC
+  // codes for keys A and B will collide, so retry a few times with different
+  // pointers.
   __ Mov(pointer, 0x0000000012345678);
+  __ Mov(retry_limit, 0x0000000012345678 + 32);
+
+  __ Bind(&retry);
 
   // Generate PACs using keys A and B.
   __ Mov(x0, pointer);
@@ -1718,21 +1742,24 @@ TEST(paciza_pacizb_autiza_autizb) {
   __ Mov(x5, x0);
   __ Autizb(x5);
 
-  // Mask out just the PAC code bits.
-  // TODO: use Simulator::CalculatePACMask in a nice way.
-  __ And(x0, x0, 0x007f000000000000);
-  __ And(x1, x1, 0x007f000000000000);
+  // Retry on collisions.
+  __ Cmp(x0, x1);
+  __ Ccmp(pointer, x0, ZFlag, ne);
+  __ Ccmp(pointer, x1, ZFlag, ne);
+  __ Ccmp(pointer, x4, ZFlag, ne);
+  __ Ccmp(pointer, x5, ZFlag, ne);
+  __ Ccmp(pointer, retry_limit, ZFlag, eq);
+  __ Cinc(pointer, pointer, ne);
+  __ B(ne, &retry);
 
   END();
 
   if (CAN_RUN()) {
     RUN();
 
-    // Check PAC codes have been generated and aren't equal.
-    // NOTE: with a different ComputePAC implementation, there may be a
-    // collision.
-    ASSERT_NOT_EQUAL_64(0, x0);
-    ASSERT_NOT_EQUAL_64(0, x1);
+    // Check PAC codes have been generated.
+    ASSERT_NOT_EQUAL_64(pointer, x0);
+    ASSERT_NOT_EQUAL_64(pointer, x1);
     ASSERT_NOT_EQUAL_64(x0, x1);
 
     // Pointers correctly authenticated.
@@ -1740,8 +1767,13 @@ TEST(paciza_pacizb_autiza_autizb) {
     ASSERT_EQUAL_64(pointer, x3);
 
     // Pointers corrupted after failing to authenticate.
+#ifdef VIXL_INCLUDE_SIMULATOR_AARCH64
     ASSERT_EQUAL_64(0x0020000012345678, x4);
     ASSERT_EQUAL_64(0x0040000012345678, x5);
+#else
+    ASSERT_NOT_EQUAL_64(pointer, x4);
+    ASSERT_NOT_EQUAL_64(pointer, x5);
+#endif
   }
 }
 
@@ -1752,11 +1784,19 @@ TEST(pacda_pacdb_autda_autdb) {
   START();
 
   Register pointer = x24;
-  Register modifier = x25;
+  Register retry_limit = x25;
+  Register modifier = x26;
+  Label retry;
 
+  // There is a small but not negligible chance (1 in 127 runs) that the PAC
+  // codes for keys A and B will collide, so retry a few times with different
+  // pointers.
   __ Mov(pointer, 0x0000000012345678);
+  __ Mov(retry_limit, 0x0000000012345678 + 32);
   __ Mov(modifier, 0x477d469dec0b8760);
 
+  __ Bind(&retry);
+
   // Generate PACs using keys A and B.
   __ Mov(x0, pointer);
   __ Pacda(x0, modifier);
@@ -1778,21 +1818,24 @@ TEST(pacda_pacdb_autda_autdb) {
   __ Mov(x5, x0);
   __ Autdb(x5, modifier);
 
-  // Mask out just the PAC code bits.
-  // TODO: use Simulator::CalculatePACMask in a nice way.
-  __ And(x0, x0, 0x007f000000000000);
-  __ And(x1, x1, 0x007f000000000000);
+  // Retry on collisions.
+  __ Cmp(x0, x1);
+  __ Ccmp(pointer, x0, ZFlag, ne);
+  __ Ccmp(pointer, x1, ZFlag, ne);
+  __ Ccmp(pointer, x4, ZFlag, ne);
+  __ Ccmp(pointer, x5, ZFlag, ne);
+  __ Ccmp(pointer, retry_limit, ZFlag, eq);
+  __ Cinc(pointer, pointer, ne);
+  __ B(ne, &retry);
 
   END();
 
   if (CAN_RUN()) {
     RUN();
 
-    // Check PAC codes have been generated and aren't equal.
-    // NOTE: with a different ComputePAC implementation, there may be a
-    // collision.
-    ASSERT_NOT_EQUAL_64(0, x0);
-    ASSERT_NOT_EQUAL_64(0, x1);
+    // Check PAC codes have been generated.
+    ASSERT_NOT_EQUAL_64(pointer, x0);
+    ASSERT_NOT_EQUAL_64(pointer, x1);
     ASSERT_NOT_EQUAL_64(x0, x1);
 
     // Pointers correctly authenticated.
@@ -1800,8 +1843,13 @@ TEST(pacda_pacdb_autda_autdb) {
     ASSERT_EQUAL_64(pointer, x3);
 
     // Pointers corrupted after failing to authenticate.
+#ifdef VIXL_INCLUDE_SIMULATOR_AARCH64
     ASSERT_EQUAL_64(0x0020000012345678, x4);
     ASSERT_EQUAL_64(0x0040000012345678, x5);
+#else
+    ASSERT_NOT_EQUAL_64(pointer, x4);
+    ASSERT_NOT_EQUAL_64(pointer, x5);
+#endif
   }
 }
 
@@ -1812,8 +1860,16 @@ TEST(pacdza_pacdzb_autdza_autdzb) {
   START();
 
   Register pointer = x24;
+  Register retry_limit = x25;
+  Label retry;
 
+  // There is a small but not negligible chance (1 in 127 runs) that the PAC
+  // codes for keys A and B will collide, so retry a few times with different
+  // pointers.
   __ Mov(pointer, 0x0000000012345678);
+  __ Mov(retry_limit, 0x0000000012345678 + 32);
+
+  __ Bind(&retry);
 
   // Generate PACs using keys A and B.
   __ Mov(x0, pointer);
@@ -1836,21 +1892,24 @@ TEST(pacdza_pacdzb_autdza_autdzb) {
   __ Mov(x5, x0);
   __ Autdzb(x5);
 
-  // Mask out just the PAC code bits.
-  // TODO: use Simulator::CalculatePACMask in a nice way.
-  __ And(x0, x0, 0x007f000000000000);
-  __ And(x1, x1, 0x007f000000000000);
+  // Retry on collisions.
+  __ Cmp(x0, x1);
+  __ Ccmp(pointer, x0, ZFlag, ne);
+  __ Ccmp(pointer, x1, ZFlag, ne);
+  __ Ccmp(pointer, x4, ZFlag, ne);
+  __ Ccmp(pointer, x5, ZFlag, ne);
+  __ Ccmp(pointer, retry_limit, ZFlag, eq);
+  __ Cinc(pointer, pointer, ne);
+  __ B(ne, &retry);
 
   END();
 
   if (CAN_RUN()) {
     RUN();
 
-    // Check PAC codes have been generated and aren't equal.
-    // NOTE: with a different ComputePAC implementation, there may be a
-    // collision.
-    ASSERT_NOT_EQUAL_64(0, x0);
-    ASSERT_NOT_EQUAL_64(0, x1);
+    // Check PAC codes have been generated.
+    ASSERT_NOT_EQUAL_64(pointer, x0);
+    ASSERT_NOT_EQUAL_64(pointer, x1);
     ASSERT_NOT_EQUAL_64(x0, x1);
 
     // Pointers correctly authenticated.
@@ -1858,8 +1917,13 @@ TEST(pacdza_pacdzb_autdza_autdzb) {
     ASSERT_EQUAL_64(pointer, x3);
 
     // Pointers corrupted after failing to authenticate.
+#ifdef VIXL_INCLUDE_SIMULATOR_AARCH64
     ASSERT_EQUAL_64(0x0020000012345678, x4);
     ASSERT_EQUAL_64(0x0040000012345678, x5);
+#else
+    ASSERT_NOT_EQUAL_64(pointer, x4);
+    ASSERT_NOT_EQUAL_64(pointer, x5);
+#endif
   }
 }
 
@@ -1870,11 +1934,19 @@ TEST(pacga_xpaci_xpacd) {
   START();
 
   Register pointer = x24;
-  Register modifier = x25;
+  Register retry_limit = x25;
+  Register modifier = x26;
+  Label retry;
 
+  // There is a small but not negligible chance (1 in 127 runs) that the PAC
+  // codes for keys A and B will collide, so retry a few times with different
+  // pointers.
   __ Mov(pointer, 0x0000000012345678);
+  __ Mov(retry_limit, 0x0000000012345678 + 32);
   __ Mov(modifier, 0x477d469dec0b8760);
 
+  __ Bind(&retry);
+
   // Generate generic PAC.
   __ Pacga(x0, pointer, modifier);
 
@@ -1890,25 +1962,24 @@ TEST(pacga_xpaci_xpacd) {
   __ Xpaci(x3);
   __ Xpacd(x4);
 
-  // Mask out just the PAC code bits.
-  // TODO: use Simulator::CalculatePACMask in a nice way.
-  __ And(x0, x0, 0xffffffff00000000);
-  __ And(x1, x1, 0x007f000000000000);
-  __ And(x2, x2, 0x007f000000000000);
+  // Retry on collisions.
+  __ Cmp(x1, x2);
+  __ Ccmp(pointer, x0, ZFlag, ne);
+  __ Ccmp(pointer, x1, ZFlag, ne);
+  __ Ccmp(pointer, x2, ZFlag, ne);
+  __ Ccmp(pointer, retry_limit, ZFlag, eq);
+  __ Cinc(pointer, pointer, ne);
+  __ B(ne, &retry);
 
   END();
 
   if (CAN_RUN()) {
     RUN();
 
-
-    // Check PAC codes have been generated and aren't equal.
-    // NOTE: with a different ComputePAC implementation, there may be a
-    // collision.
-    ASSERT_NOT_EQUAL_64(0, x0);
-
-    ASSERT_NOT_EQUAL_64(0, x1);
-    ASSERT_NOT_EQUAL_64(0, x2);
+    // Check PAC codes have been generated.
+    ASSERT_NOT_EQUAL_64(pointer, x0);
+    ASSERT_NOT_EQUAL_64(pointer, x1);
+    ASSERT_NOT_EQUAL_64(pointer, x2);
     ASSERT_NOT_EQUAL_64(x1, x2);
 
     ASSERT_EQUAL_64(pointer, x3);
@@ -2576,13 +2647,18 @@ TEST(return_to_reg_auth_guarded) {
   if (CAN_RUN()) {
 #ifdef VIXL_INCLUDE_SIMULATOR_AARCH64
     simulator.SetGuardedPages(true);
-#else
-    VIXL_UNIMPLEMENTED();
 #endif
+    // On hardware, we'll run the test anyway, but mark it as SKIPPED until
+    // we've implemented a mechanism for marking Guarded pages.
+
     RUN();
 
     ASSERT_EQUAL_64(42, x0);
     ASSERT_EQUAL_64(84, x1);
+
+#ifndef VIXL_INCLUDE_SIMULATOR_AARCH64
+    printf("SKIPPED: marking guarded pages is unimplemented on hardware");
+#endif
   }
 }
 
@@ -2615,7 +2691,11 @@ TEST(branch_to_reg_auth_fail) {
   END();
 
   if (CAN_RUN()) {
+#ifdef VIXL_INCLUDE_SIMULATOR_AARCH64
     MUST_FAIL_WITH_MESSAGE(RUN(), "Failed to authenticate pointer.");
+#else
+    printf("SKIPPED: negative PAuth tests are unimplemented on hardware.");
+#endif
   }
 }
 #endif  // VIXL_NEGATIVE_TESTING
@@ -2651,7 +2731,11 @@ TEST(return_to_reg_auth_fail) {
   END();
 
   if (CAN_RUN()) {
+#ifdef VIXL_INCLUDE_SIMULATOR_AARCH64
     MUST_FAIL_WITH_MESSAGE(RUN(), "Failed to authenticate pointer.");
+#else
+    printf("SKIPPED: negative PAuth tests are unimplemented on hardware.");
+#endif
   }
 }
 #endif  // VIXL_NEGATIVE_TESTING
@@ -3654,7 +3738,11 @@ TEST(load_pauth_negative_test) {
   END();
 
   if (CAN_RUN()) {
+#ifdef VIXL_INCLUDE_SIMULATOR_AARCH64
     MUST_FAIL_WITH_MESSAGE(RUN(), "Failed to authenticate pointer.");
+#else
+    printf("SKIPPED: negative PAuth tests are unimplemented on hardware.");
+#endif
   }
 }
 #endif  // VIXL_NEGATIVE_TESTING
@@ -5837,6 +5925,10 @@ TEST(rmif) {
   START();
   __ Mov(x0, 0x0123456789abcdef);
 
+  // Flags whose bit is clear in the `rmif` mask are left unmodified, so NZCV
+  // must be initialised to a known state to make the test reproducible.
+  __ Msr(NZCV, x0);
+
   // Set NZCV to 0b1011 (0xb)
   __ Rmif(x0, 0, NCVFlag);
   __ Mrs(x1, NZCV);
@@ -5883,6 +5975,9 @@ TEST(setf8_setf16) {
   __ Mov(x7, 0x10001);
   __ Mov(x8, 0xfffffffff);
 
+  // These instructions don't modify 'C', so give it a consistent value.
+  __ Ands(xzr, xzr, 0);
+
   __ Setf8(w0);
   __ Mrs(x9, NZCV);
   __ Setf8(w1);
@@ -7231,23 +7326,32 @@ TEST(system_pauth_a) {
   temps.Exclude(x16, x17);
   temps.Include(x10, x11);
 
-  // Backup stack pointer.
+  Register pointer = x21;
+  Register retry_limit = x22;
+  Label retry;
+
+  __ Mov(pointer, 0x0000000012345678);
+  __ Mov(retry_limit, 0x0000000012345678 + 32);
+
+  // Back up stack pointer.
   __ Mov(x20, sp);
 
   // Modifiers
   __ Mov(x16, 0x477d469dec0b8760);
   __ Mov(sp, 0x477d469dec0b8760);
 
+  __ Bind(&retry);
+
   // Generate PACs using the 3 system instructions.
-  __ Mov(x17, 0x0000000012345678);
+  __ Mov(x17, pointer);
   __ Pacia1716();
   __ Mov(x0, x17);
 
-  __ Mov(lr, 0x0000000012345678);
+  __ Mov(lr, pointer);
   __ Paciaz();
   __ Mov(x1, lr);
 
-  __ Mov(lr, 0x0000000012345678);
+  __ Mov(lr, pointer);
   __ Paciasp();
   __ Mov(x2, lr);
 
@@ -7282,41 +7386,51 @@ TEST(system_pauth_a) {
   __ Xpaclri();
   __ Mov(x9, lr);
 
+  // Retry on collisions.
+  __ Cmp(x0, x1);
+  __ Ccmp(pointer, x0, ZFlag, ne);
+  __ Ccmp(pointer, x1, ZFlag, ne);
+  __ Ccmp(pointer, x2, ZFlag, ne);
+  __ Ccmp(pointer, x6, ZFlag, ne);
+  __ Ccmp(pointer, x7, ZFlag, ne);
+  __ Ccmp(pointer, x8, ZFlag, ne);
+  __ Ccmp(pointer, retry_limit, ZFlag, eq);
+  __ Cinc(pointer, pointer, ne);
+  __ B(ne, &retry);
+
   // Restore stack pointer.
   __ Mov(sp, x20);
 
-  // Mask out just the PAC code bits.
-  // TODO: use Simulator::CalculatePACMask in a nice way.
-  __ And(x0, x0, 0x007f000000000000);
-  __ And(x1, x1, 0x007f000000000000);
-  __ And(x2, x2, 0x007f000000000000);
-
   END();
 
   if (CAN_RUN()) {
     RUN();
 
-    // Check PAC codes have been generated and aren't equal.
-    // NOTE: with a different ComputePAC implementation, there may be a
-    // collision.
-    ASSERT_NOT_EQUAL_64(0, x0);
-    ASSERT_NOT_EQUAL_64(0, x1);
-    ASSERT_NOT_EQUAL_64(0, x2);
+    // Check PAC codes have been generated.
+    ASSERT_NOT_EQUAL_64(pointer, x0);
+    ASSERT_NOT_EQUAL_64(pointer, x1);
+    ASSERT_NOT_EQUAL_64(pointer, x2);
     ASSERT_NOT_EQUAL_64(x0, x1);
     ASSERT_EQUAL_64(x0, x2);
 
     // Pointers correctly authenticated.
-    ASSERT_EQUAL_64(0x0000000012345678, x3);
-    ASSERT_EQUAL_64(0x0000000012345678, x4);
-    ASSERT_EQUAL_64(0x0000000012345678, x5);
+    ASSERT_EQUAL_64(pointer, x3);
+    ASSERT_EQUAL_64(pointer, x4);
+    ASSERT_EQUAL_64(pointer, x5);
 
     // Pointers corrupted after failing to authenticate.
+#ifdef VIXL_INCLUDE_SIMULATOR_AARCH64
     ASSERT_EQUAL_64(0x0020000012345678, x6);
     ASSERT_EQUAL_64(0x0020000012345678, x7);
     ASSERT_EQUAL_64(0x0020000012345678, x8);
+#else
+    ASSERT_NOT_EQUAL_64(pointer, x6);
+    ASSERT_NOT_EQUAL_64(pointer, x7);
+    ASSERT_NOT_EQUAL_64(pointer, x8);
+#endif
 
     // Pointer with code stripped.
-    ASSERT_EQUAL_64(0x0000000012345678, x9);
+    ASSERT_EQUAL_64(pointer, x9);
   }
 }
 
@@ -7331,13 +7445,22 @@ TEST(system_pauth_b) {
   temps.Exclude(x16, x17);
   temps.Include(x10, x11);
 
-  // Backup stack pointer.
+  Register pointer = x21;
+  Register retry_limit = x22;
+  Label retry;
+
+  __ Mov(pointer, 0x0000000012345678);
+  __ Mov(retry_limit, 0x0000000012345678 + 32);
+
+  // Back up stack pointer.
   __ Mov(x20, sp);
 
   // Modifiers
   __ Mov(x16, 0x477d469dec0b8760);
   __ Mov(sp, 0x477d469dec0b8760);
 
+  __ Bind(&retry);
+
   // Generate PACs using the 3 system instructions.
   __ Mov(x17, 0x0000000012345678);
   __ Pacib1716();
@@ -7382,15 +7505,21 @@ TEST(system_pauth_b) {
   __ Xpaclri();
   __ Mov(x9, lr);
 
+  // Retry on collisions.
+  __ Cmp(x0, x1);
+  __ Ccmp(pointer, x0, ZFlag, ne);
+  __ Ccmp(pointer, x1, ZFlag, ne);
+  __ Ccmp(pointer, x2, ZFlag, ne);
+  __ Ccmp(pointer, x6, ZFlag, ne);
+  __ Ccmp(pointer, x7, ZFlag, ne);
+  __ Ccmp(pointer, x8, ZFlag, ne);
+  __ Ccmp(pointer, retry_limit, ZFlag, eq);
+  __ Cinc(pointer, pointer, ne);
+  __ B(ne, &retry);
+
   // Restore stack pointer.
   __ Mov(sp, x20);
 
-  // Mask out just the PAC code bits.
-  // TODO: use Simulator::CalculatePACMask in a nice way.
-  __ And(x0, x0, 0x007f000000000000);
-  __ And(x1, x1, 0x007f000000000000);
-  __ And(x2, x2, 0x007f000000000000);
-
   END();
 
   if (CAN_RUN()) {
@@ -7399,24 +7528,30 @@ TEST(system_pauth_b) {
     // Check PAC codes have been generated and aren't equal.
     // NOTE: with a different ComputePAC implementation, there may be a
     // collision.
-    ASSERT_NOT_EQUAL_64(0, x0);
-    ASSERT_NOT_EQUAL_64(0, x1);
-    ASSERT_NOT_EQUAL_64(0, x2);
+    ASSERT_NOT_EQUAL_64(pointer, x0);
+    ASSERT_NOT_EQUAL_64(pointer, x1);
+    ASSERT_NOT_EQUAL_64(pointer, x2);
     ASSERT_NOT_EQUAL_64(x0, x1);
     ASSERT_EQUAL_64(x0, x2);
 
     // Pointers correctly authenticated.
-    ASSERT_EQUAL_64(0x0000000012345678, x3);
-    ASSERT_EQUAL_64(0x0000000012345678, x4);
-    ASSERT_EQUAL_64(0x0000000012345678, x5);
+    ASSERT_EQUAL_64(pointer, x3);
+    ASSERT_EQUAL_64(pointer, x4);
+    ASSERT_EQUAL_64(pointer, x5);
 
     // Pointers corrupted after failing to authenticate.
+#ifdef VIXL_INCLUDE_SIMULATOR_AARCH64
     ASSERT_EQUAL_64(0x0040000012345678, x6);
     ASSERT_EQUAL_64(0x0040000012345678, x7);
     ASSERT_EQUAL_64(0x0040000012345678, x8);
+#else
+    ASSERT_NOT_EQUAL_64(pointer, x6);
+    ASSERT_NOT_EQUAL_64(pointer, x7);
+    ASSERT_NOT_EQUAL_64(pointer, x8);
+#endif
 
     // Pointer with code stripped.
-    ASSERT_EQUAL_64(0x0000000012345678, x9);
+    ASSERT_EQUAL_64(pointer, x9);
   }
 }
 
@@ -7501,11 +7636,12 @@ static void BtiHelper(Register ipreg) {
   __ Blr(x0);
   __ Adr(ipreg, &jump_call_target);
   __ Blr(ipreg);
-  __ Adr(lr, &done);  // Make Ret return to done label.
+  __ Mov(lr, 0);  // Zero lr so we branch to done.
   __ Br(ipreg);
   __ Bind(&call_target, EmitBTI_c);
   __ Ret();
   __ Bind(&jump_call_target, EmitBTI_jc);
+  __ Cbz(lr, &done);
   __ Ret();
   __ Bind(&done);
   END();
@@ -7513,10 +7649,15 @@ static void BtiHelper(Register ipreg) {
   if (CAN_RUN()) {
 #ifdef VIXL_INCLUDE_SIMULATOR_AARCH64
     simulator.SetGuardedPages(true);
-#else
-    VIXL_UNIMPLEMENTED();
 #endif
+    // On hardware, we'll run the test anyway, but mark it as SKIPPED until
+    // we've implemented a mechanism for marking Guarded pages.
+
     RUN();
+
+#ifndef VIXL_INCLUDE_SIMULATOR_AARCH64
+    printf("SKIPPED: marking guarded pages is unimplemented on hardware");
+#endif
   }
 }
 
@@ -7529,36 +7670,42 @@ TEST(unguarded_bti_is_nop) {
   SETUP_WITH_FEATURES(CPUFeatures::kBTI);
 
   Label start, none, c, j, jc;
+  Label jump_to_c, call_to_j;
   START();
   __ B(&start);
   __ Bind(&none, EmitBTI);
   __ Bind(&c, EmitBTI_c);
   __ Bind(&j, EmitBTI_j);
   __ Bind(&jc, EmitBTI_jc);
-  VIXL_CHECK(__ GetSizeOfCodeGeneratedSince(&none) == 4 * kInstructionSize);
+  __ Hint(BTI);
+  __ Hint(BTI_c);
+  __ Hint(BTI_j);
+  __ Hint(BTI_jc);
+  VIXL_CHECK(__ GetSizeOfCodeGeneratedSince(&none) == 8 * kInstructionSize);
+  __ Cmp(x1, 1);
+  __ B(lt, &jump_to_c);
+  __ B(eq, &call_to_j);
   __ Ret();
 
-  Label jump_to_c, call_to_j;
   __ Bind(&start);
   __ Adr(x0, &none);
-  __ Adr(lr, &jump_to_c);
+  __ Mov(x1, 0);
   __ Br(x0);
 
   __ Bind(&jump_to_c);
   __ Adr(x0, &c);
-  __ Adr(lr, &call_to_j);
+  __ Mov(x1, 1);
   __ Br(x0);
 
   __ Bind(&call_to_j);
   __ Adr(x0, &j);
+  __ Mov(x1, 2);
   __ Blr(x0);
   END();
 
   if (CAN_RUN()) {
 #ifdef VIXL_INCLUDE_SIMULATOR_AARCH64
     simulator.SetGuardedPages(false);
-#else
-    VIXL_UNIMPLEMENTED();
 #endif
     RUN();
   }
@@ -7582,12 +7729,12 @@ TEST(bti_jump_to_ip_unidentified) {
   if (CAN_RUN()) {
 #ifdef VIXL_INCLUDE_SIMULATOR_AARCH64
     simulator.SetGuardedPages(true);
-#else
-    VIXL_UNIMPLEMENTED();
-#endif
     MUST_FAIL_WITH_MESSAGE(RUN(),
                            "Executing non-BTI instruction with wrong "
                            "BType.");
+#else
+    printf("SKIPPED: marking guarded pages is unimplemented on hardware");
+#endif
   }
 }
 
@@ -7606,12 +7753,12 @@ TEST(bti_jump_to_unidentified) {
   if (CAN_RUN()) {
 #ifdef VIXL_INCLUDE_SIMULATOR_AARCH64
     simulator.SetGuardedPages(true);
-#else
-    VIXL_UNIMPLEMENTED();
-#endif
     MUST_FAIL_WITH_MESSAGE(RUN(),
                            "Executing non-BTI instruction with wrong "
                            "BType.");
+#else
+    printf("SKIPPED: marking guarded pages is unimplemented on hardware");
+#endif
   }
 }
 
@@ -7630,12 +7777,12 @@ TEST(bti_call_to_unidentified) {
   if (CAN_RUN()) {
 #ifdef VIXL_INCLUDE_SIMULATOR_AARCH64
     simulator.SetGuardedPages(true);
-#else
-    VIXL_UNIMPLEMENTED();
-#endif
     MUST_FAIL_WITH_MESSAGE(RUN(),
                            "Executing non-BTI instruction with wrong "
                            "BType.");
+#else
+    printf("SKIPPED: marking guarded pages is unimplemented on hardware");
+#endif
   }
 }
 
@@ -7655,10 +7802,10 @@ TEST(bti_jump_to_c) {
   if (CAN_RUN()) {
 #ifdef VIXL_INCLUDE_SIMULATOR_AARCH64
     simulator.SetGuardedPages(true);
-#else
-    VIXL_UNIMPLEMENTED();
-#endif
     MUST_FAIL_WITH_MESSAGE(RUN(), "Executing BTI c with wrong BType.");
+#else
+    printf("SKIPPED: marking guarded pages is unimplemented on hardware");
+#endif
   }
 }
 
@@ -7678,10 +7825,10 @@ TEST(bti_call_to_j) {
   if (CAN_RUN()) {
 #ifdef VIXL_INCLUDE_SIMULATOR_AARCH64
     simulator.SetGuardedPages(true);
-#else
-    VIXL_UNIMPLEMENTED();
-#endif
     MUST_FAIL_WITH_MESSAGE(RUN(), "Executing BTI j with wrong BType.");
+#else
+    printf("SKIPPED: marking guarded pages is unimplemented on hardware");
+#endif
   }
 }
 #endif  // VIXL_NEGATIVE_TESTING
@@ -7706,12 +7853,17 @@ TEST(fall_through_bti) {
   if (CAN_RUN()) {
 #ifdef VIXL_INCLUDE_SIMULATOR_AARCH64
     simulator.SetGuardedPages(true);
-#else
-    VIXL_UNIMPLEMENTED();
 #endif
+    // On hardware, we'll run the test anyway, but mark it as SKIPPED until
+    // we've implemented a mechanism for marking Guarded pages.
+
     RUN();
 
     ASSERT_EQUAL_64(4, x0);
+
+#ifndef VIXL_INCLUDE_SIMULATOR_AARCH64
+    printf("SKIPPED: marking guarded pages is unimplemented on hardware");
+#endif
   }
 }
 
@@ -13364,6 +13516,76 @@ TEST(collision_literal_veneer_pools) {
   END();
 }
 
+static void VeneerBackwardBranchHelper(ImmBranchType type, int limit) {
+  SETUP();
+  START();
+  // Code generation test only: the generated code is never executed. `type`
+  // picks the branch under test; `limit` is the nop count that the first check
+  // below expects to remain reachable without a veneer.
+  __ Mov(x0, 1);
+
+  // Non-veneer case: generate 'limit' instructions, plus the branch itself.
+  Label start0;
+  __ Bind(&start0);
+  for (int i = 0; i < limit; i++) {
+    __ Nop();
+  }
+  switch (type) {
+    case CompareBranchType:
+      __ Cbz(x0, &start0);
+      break;
+    case TestBranchType:
+      __ Tbz(x0, 0, &start0);
+      break;
+    default:
+      VIXL_ASSERT(type == CondBranchType);
+      __ B(eq, &start0);
+  }
+  VIXL_CHECK(masm.GetSizeOfCodeGeneratedSince(&start0) ==
+             ((limit + 1) * kInstructionSize));
+
+  // Veneer case: As above, plus one extra nop and a branch for the veneer; we
+  // expect a total of limit + 3 instructions. The branch targets start1, so it
+  // is out of range by exactly one instruction:
+  //  start1:
+  //    nop x (limit + 1)
+  //    tbnz skip_veneer
+  //    b start1
+  //  skip_veneer:
+  //
+  Label start1;
+  __ Bind(&start1);
+  for (int i = 0; i < limit; i++) {
+    __ Nop();
+  }
+  __ Nop();  // One extra instruction to exceed branch range.
+  switch (type) {
+    case CompareBranchType:
+      __ Cbz(x0, &start1);
+      break;
+    case TestBranchType:
+      __ Tbz(x0, 0, &start1);
+      break;
+    default:
+      VIXL_ASSERT(type == CondBranchType);
+      __ B(eq, &start1);
+  }
+  VIXL_CHECK(masm.GetSizeOfCodeGeneratedSince(&start1) ==
+             ((limit + 3) * kInstructionSize));
+
+  END();
+  DISASSEMBLE();
+}
+
+TEST(veneer_backward_tbz) { VeneerBackwardBranchHelper(TestBranchType, 8192); }
+
+TEST(veneer_backward_cbz) {
+  VeneerBackwardBranchHelper(CompareBranchType, 262144);
+}
+
+TEST(veneer_backward_bcond) {
+  VeneerBackwardBranchHelper(CondBranchType, 262144);
+}
 
 TEST(ldr_literal_explicit) {
   SETUP();
@@ -14085,20 +14307,24 @@ TEST(mte_irg) {
 
   __ Bind(&done);
 
-  // Insert random tags, excluding oddly-numbered tags, then orr them together.
-  // After 128 rounds, it's statistically likely that all but the least
-  // significant bit will be set.
+  // Insert random tags, excluding oddly-numbered tags, and set a bit in a
+  // result register for each tag used.
+  // After 128 rounds, it's statistically likely that all even bits in the
+  // least-significant half word will be set.
   __ Mov(x3, 0);
+  __ Mov(x4, 1);
   __ Mov(x10, 128);
   __ Mov(x11, 0xaaaa);
 
   Label loop2;
   __ Bind(&loop2);
   __ Irg(x2, x1, x11);
+  __ Lsr(x2, x2, 56);
+  __ Lsl(x2, x4, x2);
   __ Orr(x3, x3, x2);
   __ Subs(x10, x10, 1);
   __ B(ne, &loop2);
-  __ Lsr(x2, x3, 56);
+  __ Mov(x2, x3);
 
   // Check that excluding all tags results in zero tag insertion.
   __ Mov(x3, 0xffffffffffffffff);
@@ -14109,7 +14335,7 @@ TEST(mte_irg) {
     RUN();
 
     ASSERT_EQUAL_64(0, x1);
-    ASSERT_EQUAL_64(0xe, x2);
+    ASSERT_EQUAL_64(0x5555, x2);
     ASSERT_EQUAL_64(0xf0ffffffffffffff, x3);
   }
 }
@@ -14131,23 +14357,36 @@ TEST(mops_set) {
   __ Setp(x1, x2, x3);
   __ Setm(x1, x2, x3);
   __ Sete(x1, x2, x3);
+  __ Mrs(x20, NZCV);
 
   // x2 is now zero, so this should do nothing.
   __ Setp(x1, x2, x3);
   __ Setm(x1, x2, x3);
   __ Sete(x1, x2, x3);
+  __ Mrs(x21, NZCV);
 
   // Set dst[15] to zero using the masm helper.
   __ Add(x1, x0, 15);
   __ Mov(x2, 1);
   __ Set(x1, x2, xzr);
+  __ Mrs(x22, NZCV);
 
   // Load dst for comparison.
   __ Ldp(x10, x11, MemOperand(x0));
   END();
 
   if (CAN_RUN()) {
+    // Permitted results:
+    //            NZCV    Xd                Xn
+    //  Option A: ....    end of buffer     0
+    //  Option B: ..C.    end of buffer     0
+
+    std::vector<uint64_t> allowed_flags = {NoFlag, CFlag};
+
     RUN();
+    ASSERT_EQUAL_64(allowed_flags, x20);
+    ASSERT_EQUAL_64(allowed_flags, x21);
+    ASSERT_EQUAL_64(allowed_flags, x22);
     ASSERT_EQUAL_64(dst_addr + 16, x1);
     ASSERT_EQUAL_64(0, x2);
     ASSERT_EQUAL_64(0x1234aa, x3);
@@ -14171,11 +14410,20 @@ TEST(mops_setn) {
   __ Mov(x2, 16);
   __ Mov(x3, 0x42);
   __ Setn(x1, x2, x3);
+  __ Mrs(x20, NZCV);
   __ Ldp(x10, x11, MemOperand(x0));
   END();
 
   if (CAN_RUN()) {
+    // Permitted results:
+    //            NZCV    Xd                Xn
+    //  Option A: ....    end of buffer     0
+    //  Option B: ..C.    end of buffer     0
+
+    std::vector<uint64_t> allowed_flags = {NoFlag, CFlag};
+
     RUN();
+    ASSERT_EQUAL_64(allowed_flags, x20);
     ASSERT_EQUAL_64(dst_addr + 16, x1);
     ASSERT_EQUAL_64(0, x2);
     ASSERT_EQUAL_64(0x42, x3);
@@ -14187,10 +14435,10 @@ TEST(mops_setn) {
 TEST(mops_setg) {
   SETUP_WITH_FEATURES(CPUFeatures::kMOPS, CPUFeatures::kMTE);
 
-  uint8_t* dst_addr = nullptr;
+  uint8_t* dst = nullptr;
 #ifdef VIXL_INCLUDE_SIMULATOR_AARCH64
   const int dst_size = 32;
-  dst_addr = reinterpret_cast<uint8_t*>(
+  dst = reinterpret_cast<uint8_t*>(
       simulator.Mmap(NULL,
                      dst_size * sizeof(uint8_t),
                      PROT_READ | PROT_WRITE | PROT_MTE,
@@ -14198,32 +14446,47 @@ TEST(mops_setg) {
                      -1,
                      0));
 
-  VIXL_ASSERT(dst_addr != nullptr);
-  uint8_t* untagged_ptr = AddressUntag(dst_addr);
+  VIXL_ASSERT(dst != nullptr);
+  uint8_t* untagged_ptr = AddressUntag(dst);
   memset(untagged_ptr, 0xc9, dst_size);
 #else
 // TODO: Port the memory allocation to work on MTE supported platform natively.
 // Note that `CAN_RUN` prevents running in MTE-unsupported environments.
 #endif
 
+  uintptr_t dst_addr = reinterpret_cast<uintptr_t>(dst);
+  uint64_t tag_mask = 0xf0ff'ffff'ffff'ffff;
+
   START();
-  __ Mov(x0, reinterpret_cast<uint64_t>(dst_addr));
+  __ Mov(x0, dst_addr);
   __ Gmi(x2, x0, xzr);
   __ Irg(x1, x0, x2);  // Choose new tag for setg destination.
   __ Mov(x2, 16);
   __ Mov(x3, 0x42);
   __ Setg(x1, x2, x3);
+  __ Mrs(x20, NZCV);
 
   __ Ubfx(x4, x1, 56, 4);  // Extract new tag.
   __ Bfi(x0, x4, 56, 4);   // Tag dst_addr so set region can be loaded.
   __ Ldp(x10, x11, MemOperand(x0));
 
-  __ Mov(x0, reinterpret_cast<uint64_t>(dst_addr));
+  __ Mov(x0, dst_addr);
   __ Ldp(x12, x13, MemOperand(x0, 16));  // Unset region has original tag.
+
+  __ And(x1, x1, tag_mask);  // Strip tag for repeatable checks.
   END();
 
   if (CAN_RUN()) {
+    // Permitted results:
+    //            NZCV    Xd                Xn
+    //  Option A: ....    end of buffer     0
+    //  Option B: ..C.    end of buffer     0
+
+    std::vector<uint64_t> allowed_flags = {NoFlag, CFlag};
+
     RUN();
+    ASSERT_EQUAL_64(allowed_flags, x20);
+    ASSERT_EQUAL_64((dst_addr & tag_mask) + 16, x1);
     ASSERT_EQUAL_64(0, x2);
     ASSERT_EQUAL_64(0x42, x3);
     ASSERT_EQUAL_64(0x4242'4242'4242'4242, x10);
@@ -14233,7 +14496,7 @@ TEST(mops_setg) {
   }
 
 #ifdef VIXL_INCLUDE_SIMULATOR_AARCH64
-  simulator.Munmap(dst_addr, dst_size, PROT_MTE);
+  simulator.Munmap(dst, dst_size, PROT_MTE);
 #endif
 }
 
@@ -14251,38 +14514,73 @@ TEST(mops_cpy) {
   __ Mov(x0, buf_addr);
 
   // Copy first eight bytes into second eight.
-  __ Mov(x2, x0);     // src = &buf[0]
-  __ Add(x3, x0, 8);  // dst = &buf[8]
-  __ Mov(x4, 8);      // count = 8
-  __ Cpyp(x3, x2, x4);
-  __ Cpym(x3, x2, x4);
-  __ Cpye(x3, x2, x4);
+  __ Mov(x1, x0);     // src = &buf[0]
+  __ Add(x2, x0, 8);  // dst = &buf[8]
+  __ Mov(x3, 8);      // count = 8
+  __ Cpyp(x2, x1, x3);
+  __ Cpym(x2, x1, x3);
+  __ Cpye(x2, x1, x3);
   __ Ldp(x10, x11, MemOperand(x0));
   __ Mrs(x20, NZCV);
 
-  // Copy first eight bytes to overlapping offset, causing reverse copy.
-  __ Mov(x5, x0);     // src = &buf[0]
-  __ Add(x6, x0, 4);  // dst = &buf[4]
-  __ Mov(x7, 8);      // count = 8
-  __ Cpy(x6, x5, x7);
+  // Copy first eight bytes to overlapping offset, forcing backwards copy.
+  __ Mov(x4, x0);     // src = &buf[0]
+  __ Add(x5, x0, 4);  // dst = &buf[4]
+  __ Mov(x6, 8);      // count = 8
+  __ Cpy(x5, x4, x6);
   __ Ldp(x12, x13, MemOperand(x0));
+  __ Mrs(x21, NZCV);
+
+  // Copy last eight bytes to overlapping offset, forcing forwards copy.
+  __ Add(x7, x0, 8);  // src = &buf[8]
+  __ Add(x8, x0, 6);  // dst = &buf[6]
+  __ Mov(x9, 8);      // count = 8
+  __ Cpy(x8, x7, x9);
+  __ Ldp(x14, x15, MemOperand(x0));
+  __ Mrs(x22, NZCV);
   END();
 
   if (CAN_RUN()) {
+    // Permitted results:
+    //                        NZCV    Xs/Xd               Xn
+    //  Option A (forwards) : ....    ends of buffers     0
+    //  Option A (backwards): ....    starts of buffers   0
+    //  Option B (forwards) : ..C.    ends of buffers     0
+    //  Option B (backwards): N.C.    starts of buffers   0
+
+    std::vector<uint64_t> allowed_backwards_flags = {NoFlag, NCFlag};
+    std::vector<uint64_t> allowed_forwards_flags = {NoFlag, CFlag};
+
     RUN();
-    ASSERT_EQUAL_64(buf_addr + 8, x2);
-    ASSERT_EQUAL_64(buf_addr + 16, x3);
-    ASSERT_EQUAL_64(0, x4);
+    // IMPLEMENTATION DEFINED direction: only a forwards copy advances Xs.
+    if (static_cast<uintptr_t>(core.xreg(1)) > buf_addr) {
+      // Forwards
+      ASSERT_EQUAL_64(buf_addr + 8, x1);
+      ASSERT_EQUAL_64(buf_addr + 16, x2);
+      ASSERT_EQUAL_64(allowed_forwards_flags, x20);
+    } else {
+      // Backwards
+      ASSERT_EQUAL_64(buf_addr, x1);
+      ASSERT_EQUAL_64(buf_addr + 8, x2);
+      ASSERT_EQUAL_64(allowed_backwards_flags, x20);
+    }
+    ASSERT_EQUAL_64(0, x3);  // Xn
     ASSERT_EQUAL_64(0x0706'0504'0302'0100, x10);
     ASSERT_EQUAL_64(0x0706'0504'0302'0100, x11);
-    ASSERT_EQUAL_64(CFlag, x20);
 
-    ASSERT_EQUAL_64(buf_addr, x5);
-    ASSERT_EQUAL_64(buf_addr + 4, x6);
-    ASSERT_EQUAL_64(0, x7);
+    ASSERT_EQUAL_64(buf_addr, x4);      // Xs
+    ASSERT_EQUAL_64(buf_addr + 4, x5);  // Xd
+    ASSERT_EQUAL_64(0, x6);             // Xn
     ASSERT_EQUAL_64(0x0302'0100'0302'0100, x12);
     ASSERT_EQUAL_64(0x0706'0504'0706'0504, x13);
-    ASSERT_EQUAL_NZCV(NCFlag);
+    ASSERT_EQUAL_64(allowed_backwards_flags, x21);
+
+    ASSERT_EQUAL_64(buf_addr + 16, x7);  // Xs
+    ASSERT_EQUAL_64(buf_addr + 14, x8);  // Xd
+    ASSERT_EQUAL_64(0, x9);              // Xn
+    ASSERT_EQUAL_64(0x0504'0100'0302'0100, x14);
+    ASSERT_EQUAL_64(0x0706'0706'0504'0706, x15);
+    ASSERT_EQUAL_64(allowed_forwards_flags, x22);
   }
 }
 
@@ -14302,44 +14600,61 @@ TEST(mops_cpyn) {
   START();
   __ Mov(x0, buf_addr);
 
-  __ Add(x2, x0, 1);  // src = &buf[1]
-  __ Mov(x3, x0);     // dst = &buf[0]
-  __ Mov(x4, 15);     // count = 15
-  __ Cpyn(x3, x2, x4);
+  __ Add(x1, x0, 1);  // src = &buf[1]
+  __ Mov(x2, x0);     // dst = &buf[0]
+  __ Mov(x3, 15);     // count = 15
+  __ Cpyn(x2, x1, x3);
   __ Ldp(x10, x11, MemOperand(x0));
+  __ Mrs(x20, NZCV);
 
-  __ Add(x5, x0, 1);  // src = &buf[1]
-  __ Mov(x6, x0);     // dst = &buf[0]
-  __ Mov(x4, 15);     // count = 15
-  __ Cpyrn(x6, x5, x4);
+  __ Add(x4, x0, 1);  // src = &buf[1]
+  __ Mov(x5, x0);     // dst = &buf[0]
+  __ Mov(x6, 15);     // count = 15
+  __ Cpyrn(x5, x4, x6);
   __ Ldp(x12, x13, MemOperand(x0));
+  __ Mrs(x21, NZCV);
 
   __ Add(x7, x0, 1);  // src = &buf[1]
   __ Mov(x8, x0);     // dst = &buf[0]
-  __ Mov(x4, 15);     // count = 15
-  __ Cpywn(x8, x7, x4);
+  __ Mov(x9, 15);     // count = 15
+  __ Cpywn(x8, x7, x9);
   __ Ldp(x14, x15, MemOperand(x0));
+  __ Mrs(x22, NZCV);
   END();
 
   if (CAN_RUN()) {
+    // Permitted results:
+    //                        NZCV    Xs/Xd               Xn
+    //  Option A (forwards) : ....    ends of buffers     0
+    //  Option A (backwards): ....    starts of buffers   0
+    //  Option B (forwards) : ..C.    ends of buffers     0
+    //  Option B (backwards): N.C.    starts of buffers   0
+    //
+    // All cases overlap to force a forwards copy.
+
+    std::vector<uint64_t> allowed_forwards_flags = {NoFlag, CFlag};
+
     RUN();
-    ASSERT_EQUAL_64(buf_addr + 16, x2);
-    ASSERT_EQUAL_64(buf_addr + 15, x3);
+    ASSERT_EQUAL_64(buf_addr + 16, x1);  // Xs
+    ASSERT_EQUAL_64(buf_addr + 15, x2);  // Xd
+    ASSERT_EQUAL_64(0, x3);              // Xn
+    ASSERT_EQUAL_64(allowed_forwards_flags, x20);
     ASSERT_EQUAL_64(0x0807'0605'0403'0201, x10);
     ASSERT_EQUAL_64(0x0f0f'0e0d'0c0b'0a09, x11);
 
-    ASSERT_EQUAL_64(buf_addr + 16, x5);
-    ASSERT_EQUAL_64(buf_addr + 15, x6);
+    ASSERT_EQUAL_64(buf_addr + 16, x4);  // Xs
+    ASSERT_EQUAL_64(buf_addr + 15, x5);  // Xd
+    ASSERT_EQUAL_64(0, x6);              // Xn
+    ASSERT_EQUAL_64(allowed_forwards_flags, x21);
     ASSERT_EQUAL_64(0x0908'0706'0504'0302, x12);
     ASSERT_EQUAL_64(0x0f0f'0f0e'0d0c'0b0a, x13);
 
-    ASSERT_EQUAL_64(buf_addr + 16, x7);
-    ASSERT_EQUAL_64(buf_addr + 15, x8);
+    ASSERT_EQUAL_64(buf_addr + 16, x7);  // Xs
+    ASSERT_EQUAL_64(buf_addr + 15, x8);  // Xd
+    ASSERT_EQUAL_64(0, x9);              // Xn
+    ASSERT_EQUAL_64(allowed_forwards_flags, x22);
     ASSERT_EQUAL_64(0x0a09'0807'0605'0403, x14);
     ASSERT_EQUAL_64(0x0f0f'0f0f'0e0d'0c0b, x15);
-
-    ASSERT_EQUAL_64(0, x4);
-    ASSERT_EQUAL_NZCV(CFlag);
   }
 }
 
@@ -14353,46 +14668,79 @@ TEST(mops_cpyf) {
     buf[i] = i;
   }
 
-  // This test matches the cpy variant above, but using cpyf will result in a
-  // different answer for the overlapping copy.
+  // As `mops_cpy`, but `cpyf` always copies forwards, so is only useful for
+  // non-overlapping buffers, or those where the source address is greater than
+  // the destination address.
+
   START();
   __ Mov(x0, buf_addr);
 
-  // Copy first eight bytes into second eight.
-  __ Mov(x2, x0);     // src = &buf[0]
-  __ Add(x3, x0, 8);  // dst = &buf[8]
-  __ Mov(x4, 8);      // count = 8
-  __ Cpyf(x3, x2, x4);
+  // Copy first eight bytes into second eight, without overlap.
+  __ Mov(x1, x0);     // src = &buf[0]
+  __ Add(x2, x0, 8);  // dst = &buf[8]
+  __ Mov(x3, 8);      // count = 8
+  __ Cpyfp(x2, x1, x3);
+  __ Cpyfm(x2, x1, x3);
+  __ Cpyfe(x2, x1, x3);
   __ Ldp(x10, x11, MemOperand(x0));
   __ Mrs(x20, NZCV);
 
-  // Copy first eight bytes to overlapping offset.
-  __ Mov(x5, x0);     // src = &buf[0]
-  __ Add(x6, x0, 4);  // dst = &buf[4]
-  __ Mov(x7, 8);      // count = 8
-  __ Cpyf(x6, x5, x7);
+  // Copy last eight bytes to overlapping offset where src > dst.
+  __ Add(x4, x0, 8);  // src = &buf[8]
+  __ Add(x5, x0, 6);  // dst = &buf[6]
+  __ Mov(x6, 8);      // count = 8
+  __ Cpyf(x5, x4, x6);
   __ Ldp(x12, x13, MemOperand(x0));
+  __ Mrs(x21, NZCV);
+
+  // Copy first eight bytes to overlapping offset where src < dst.
+  __ Mov(x7, x0);     // src = &buf[0]
+  __ Add(x8, x0, 4);  // dst = &buf[4]
+  __ Mov(x9, 8);      // count = 8
+  __ Cpyf(x8, x7, x9);
+  // The only testable result is the first and last four bytes, which are not
+  // written at all.
+  __ Ldr(w14, MemOperand(x0));
+  __ Ldr(w15, MemOperand(x0, 12));
+  __ Mrs(x22, NZCV);
+
   END();
 
   if (CAN_RUN()) {
+    // Permitted results:
+    //            NZCV    Xs/Xd               Xn
+    //  Option A: ....    ends of buffers     0
+    //  Option B: ..C.    ends of buffers     0
+
+    std::vector<uint64_t> allowed_forwards_flags = {NoFlag, CFlag};
+
     RUN();
-    ASSERT_EQUAL_64(buf_addr + 8, x2);
-    ASSERT_EQUAL_64(buf_addr + 16, x3);
-    ASSERT_EQUAL_64(0, x4);
+
+    // No overlap.
+    ASSERT_EQUAL_64(buf_addr + 8, x1);   // Xs
+    ASSERT_EQUAL_64(buf_addr + 16, x2);  // Xd
+    ASSERT_EQUAL_64(0, x3);              // Xn
+    ASSERT_EQUAL_64(allowed_forwards_flags, x20);
     ASSERT_EQUAL_64(0x0706'0504'0302'0100, x10);
     ASSERT_EQUAL_64(0x0706'0504'0302'0100, x11);
-    ASSERT_EQUAL_64(CFlag, x20);
 
-    ASSERT_EQUAL_64(buf_addr + 8, x5);
-    ASSERT_EQUAL_64(buf_addr + 12, x6);
-    ASSERT_EQUAL_64(0, x7);
-    ASSERT_EQUAL_NZCV(CFlag);
+    // Overlap, src > dst.
+    ASSERT_EQUAL_64(buf_addr + 16, x4);  // Xs
+    ASSERT_EQUAL_64(buf_addr + 14, x5);  // Xd
+    ASSERT_EQUAL_64(0, x6);              // Xn
+    ASSERT_EQUAL_64(0x0100'0504'0302'0100, x12);
+    ASSERT_EQUAL_64(0x0706'0706'0504'0302, x13);
+    ASSERT_EQUAL_64(allowed_forwards_flags, x21);
 
-    // These results are not architecturally defined. They may change if the
-    // simulator is implemented in a different, but still architecturally
-    // correct, way.
-    ASSERT_EQUAL_64(0x0302'0100'0302'0100, x12);
-    ASSERT_EQUAL_64(0x0706'0504'0302'0100, x13);
+    // Overlap, src < dst.
+    ASSERT_EQUAL_64(buf_addr + 8, x7);   // Xs
+    ASSERT_EQUAL_64(buf_addr + 12, x8);  // Xd
+    ASSERT_EQUAL_64(0, x9);              // Xn
+    // We can only reliably test that the operation didn't write outside the
+    // specified region.
+    ASSERT_EQUAL_32(0x0302'0100, w14);
+    ASSERT_EQUAL_32(0x0706'0706, w15);
+    ASSERT_EQUAL_64(allowed_forwards_flags, x22);
   }
 }
 
@@ -14412,44 +14760,57 @@ TEST(mops_cpyfn) {
   START();
   __ Mov(x0, buf_addr);
 
-  __ Add(x2, x0, 1);  // src = &buf[1]
-  __ Mov(x3, x0);     // dst = &buf[0]
-  __ Mov(x4, 15);     // count = 15
-  __ Cpyfn(x3, x2, x4);
+  __ Add(x1, x0, 1);  // src = &buf[1]
+  __ Mov(x2, x0);     // dst = &buf[0]
+  __ Mov(x3, 15);     // count = 15
+  __ Cpyfn(x2, x1, x3);
   __ Ldp(x10, x11, MemOperand(x0));
+  __ Mrs(x20, NZCV);
 
-  __ Add(x5, x0, 1);  // src = &buf[1]
-  __ Mov(x6, x0);     // dst = &buf[0]
-  __ Mov(x4, 15);     // count = 15
-  __ Cpyfrn(x6, x5, x4);
+  __ Add(x4, x0, 1);  // src = &buf[1]
+  __ Mov(x5, x0);     // dst = &buf[0]
+  __ Mov(x6, 15);     // count = 15
+  __ Cpyfrn(x5, x4, x6);
   __ Ldp(x12, x13, MemOperand(x0));
+  __ Mrs(x21, NZCV);
 
   __ Add(x7, x0, 1);  // src = &buf[1]
   __ Mov(x8, x0);     // dst = &buf[0]
-  __ Mov(x4, 15);     // count = 15
-  __ Cpyfwn(x8, x7, x4);
+  __ Mov(x9, 15);     // count = 15
+  __ Cpyfwn(x8, x7, x9);
   __ Ldp(x14, x15, MemOperand(x0));
+  __ Mrs(x22, NZCV);
   END();
 
   if (CAN_RUN()) {
+    // Permitted results:
+    //            NZCV    Xs/Xd               Xn
+    //  Option A: ....    ends of buffers     0
+    //  Option B: ..C.    ends of buffers     0
+
+    std::vector<uint64_t> allowed_flags = {NoFlag, CFlag};
+
     RUN();
-    ASSERT_EQUAL_64(buf_addr + 16, x2);
-    ASSERT_EQUAL_64(buf_addr + 15, x3);
+    ASSERT_EQUAL_64(buf_addr + 16, x1);  // Xs
+    ASSERT_EQUAL_64(buf_addr + 15, x2);  // Xd
+    ASSERT_EQUAL_64(0, x3);              // Xn
+    ASSERT_EQUAL_64(allowed_flags, x20);
     ASSERT_EQUAL_64(0x0807'0605'0403'0201, x10);
     ASSERT_EQUAL_64(0x0f0f'0e0d'0c0b'0a09, x11);
 
-    ASSERT_EQUAL_64(buf_addr + 16, x5);
-    ASSERT_EQUAL_64(buf_addr + 15, x6);
+    ASSERT_EQUAL_64(buf_addr + 16, x4);  // Xs
+    ASSERT_EQUAL_64(buf_addr + 15, x5);  // Xd
+    ASSERT_EQUAL_64(0, x6);              // Xn
+    ASSERT_EQUAL_64(allowed_flags, x21);
     ASSERT_EQUAL_64(0x0908'0706'0504'0302, x12);
     ASSERT_EQUAL_64(0x0f0f'0f0e'0d0c'0b0a, x13);
 
-    ASSERT_EQUAL_64(buf_addr + 16, x7);
-    ASSERT_EQUAL_64(buf_addr + 15, x8);
+    ASSERT_EQUAL_64(buf_addr + 16, x7);  // Xs
+    ASSERT_EQUAL_64(buf_addr + 15, x8);  // Xd
+    ASSERT_EQUAL_64(0, x9);              // Xn
+    ASSERT_EQUAL_64(allowed_flags, x22);
     ASSERT_EQUAL_64(0x0a09'0807'0605'0403, x14);
     ASSERT_EQUAL_64(0x0f0f'0f0f'0e0d'0c0b, x15);
-
-    ASSERT_EQUAL_64(0, x4);
-    ASSERT_EQUAL_NZCV(CFlag);
   }
 }
 
@@ -14723,6 +15084,298 @@ TEST(cssc_smax) {
   MinMaxHelper(op, true, s64min, s64max, 0, s64max);
 }
 
+static void ChkfeatHelper(uint64_t initial,
+                          uint64_t chkfeat,
+                          CPUFeatures require) {
+  SETUP_WITH_FEATURES(require);
+
+  START();
+  __ Mov(x16, initial);
+  __ Chkfeat(x16);
+  __ Mov(x0, x16);
+
+  __ Mov(x1, initial);
+  __ Chkfeat(x1);
+  END();
+
+  if (CAN_RUN()) {
+    RUN_WITHOUT_SEEN_FEATURE_CHECK();
+    ASSERT_EQUAL_64(chkfeat, x0);
+    ASSERT_EQUAL_64(x0, x1);
+  }
+}
+
+TEST(chkfeat) { ChkfeatHelper(0x0, 0x0, CPUFeatures::None()); }
+
+TEST(chkfeat_gcs) { ChkfeatHelper(0x1, 0x0, CPUFeatures::kGCS); }
+
+TEST(chkfeat_unused) {
+  // Bits 1-63 are reserved. This test ensures that they are unmodified by
+  // `chkfeat`, but it will need to be updated if these bits are assigned in the
+  // future.
+  ChkfeatHelper(0xffff'ffff'ffff'fffe,
+                0xffff'ffff'ffff'fffe,
+                CPUFeatures::None());
+}
+
+TEST(gcs_feature_off) {
+  SETUP();
+
+  START();
+#ifdef VIXL_INCLUDE_SIMULATOR_AARCH64
+  simulator.DisableGCSCheck();
+#else
+// TODO: Disable GCS via operating system for this test, here and in the
+// gcs_off_pac_on test below.
+#endif
+  __ Mov(x16, 0x0123'4567'89ab'cdef);
+  __ Chkfeat(x16);
+
+  // This sequence would fail with GCS enabled.
+  Label lab, end;
+  __ Bl(&lab);
+  __ B(&end);
+
+  __ Bind(&lab);
+  __ Adr(lr, &end);
+  __ Ret();
+
+  __ Bind(&end);
+  END();
+
+  if (CAN_RUN()) {
+    // TODO: This will currently fail on GCS-supporting hardware.
+    RUN();
+    ASSERT_EQUAL_64(0x0123'4567'89ab'cdef, x16);
+  }
+}
+
+TEST(gcs_gcspushm) {
+  SETUP_WITH_FEATURES(CPUFeatures::kGCS);
+
+  Label ret;
+  START();
+  __ Adr(x0, &ret);
+  __ Gcspushm(x0);
+  __ Ret(x0);
+  __ Nop();
+  __ Bind(&ret);
+  END();
+
+  if (CAN_RUN()) {
+    RUN();
+  }
+}
+
+TEST(gcs_gcspopm) {
+  SETUP_WITH_FEATURES(CPUFeatures::kGCS);
+
+  Label lab, ret;
+  START();
+  __ Adr(x0, &ret);
+  __ Bl(&lab);
+  __ Bind(&ret);
+  __ Nop();
+  __ Bind(&lab);
+  __ Gcspopm(x1);
+  END();
+
+  if (CAN_RUN()) {
+    RUN();
+    ASSERT_EQUAL_64(x0, x1);
+  }
+}
+
+TEST(gcs_gcsss1) {
+  SETUP_WITH_FEATURES(CPUFeatures::kGCS);
+
+  START();
+#ifdef VIXL_INCLUDE_SIMULATOR_AARCH64
+  uint64_t new_gcs = simulator.GetGCSManager().AllocateStack();
+  __ Mov(x0, new_gcs);
+#else
+// TODO: Request new GCS from the operating system.
+#endif
+
+  // Partial stack swap to check GCS has changed, and a token is at the top
+  // of the new stack.
+  __ Gcsss1(x0);
+  __ Gcspopm(x1);
+
+  __ Bic(x0, x0, 7);  // Clear LSB of new GCS.
+  __ Bic(x2, x1, 7);  // Clear LSB of old GCS.
+  __ Cmp(x0, x2);
+  __ Cset(x0, eq);
+  __ And(x1, x1, 7);  // In progress token.
+  END();
+
+  if (CAN_RUN()) {
+    RUN();
+    ASSERT_EQUAL_64(0, x0);  // GCS must not be equal.
+    ASSERT_EQUAL_64(5, x1);  // In progress token must be present.
+  }
+}
+
+// TODO: Add extra tests for combinations of PAC and GCS enabled.
+TEST(gcs_stack_swap) {
+  SETUP_WITH_FEATURES(CPUFeatures::kGCS);
+
+  START();
+  Label stack_swap, sub_fn, end;
+#ifdef VIXL_INCLUDE_SIMULATOR_AARCH64
+  uint64_t new_gcs = simulator.GetGCSManager().AllocateStack();
+  __ Mov(x0, new_gcs);
+#else
+// TODO: Request new GCS from the operating system.
+#endif
+  __ Bl(&stack_swap);
+  __ B(&end);
+
+  __ Bind(&stack_swap);
+  __ Gcsss1(x0);  // x0 = new GCS.
+  __ Gcsss2(x1);  // x1 = old GCS.
+  __ Mov(x29, lr);
+  __ Bl(&sub_fn);
+  __ Mov(lr, x29);
+  __ Gcsss1(x1);  // Restore old GCS.
+  __ Gcsss2(x0);
+  __ Ret();
+
+  __ Bind(&sub_fn);
+  __ Mov(x2, 42);
+  __ Ret();
+
+  __ Bind(&end);
+  END();
+
+  if (CAN_RUN()) {
+    RUN();
+    ASSERT_EQUAL_64(42, x2);
+  }
+}
+
+TEST(gcs_off_pac_on) {
+  SETUP_WITH_FEATURES(CPUFeatures::kPAuth);
+
+  START();
+#ifdef VIXL_INCLUDE_SIMULATOR_AARCH64
+  simulator.DisableGCSCheck();
+#else
+// TODO: Disable GCS via operating system for this test, and enable for native.
+#endif
+  __ Mov(x16, 1);
+  __ Chkfeat(x16);
+  __ Mov(x1, x16);
+
+  Label fn1, after_fn1;
+
+  __ Mov(x28, sp);
+  __ Mov(x29, lr);
+  __ Mov(sp, 0x477d469dec0b8760);
+
+  __ Mov(x0, 0);
+  __ B(&after_fn1);
+
+  __ Bind(&fn1);
+  __ Mov(x0, 42);
+  __ Paciasp();
+  __ Retaa();
+
+  __ Bind(&after_fn1);
+  __ Bl(&fn1);
+
+  __ Mov(sp, x28);
+  __ Mov(lr, x29);
+  END();
+
+  if (CAN_RUN()) {
+    RUN();
+
+    ASSERT_EQUAL_64(42, x0);
+    ASSERT_EQUAL_64(1, x1);
+  }
+}
+
+#ifdef VIXL_NEGATIVE_TESTING
+TEST(gcs_negative_test) {
+  SETUP_WITH_FEATURES(CPUFeatures::kGCS);
+
+  Label fn, bad_return_addr, done;
+  START();
+  __ Bl(&fn);
+  __ Nop();  // GCS enforces that fn() returns here...
+
+  __ Bind(&bad_return_addr);
+  __ B(&done);  // ... but this test attempts to return here.
+
+  __ Bind(&fn);
+  __ Adr(lr, &bad_return_addr);
+  __ Ret();
+
+  __ Bind(&done);
+  END();
+
+  if (CAN_RUN()) {
+    MUST_FAIL_WITH_MESSAGE(RUN(), "GCS failed");
+  }
+}
+#endif  // VIXL_NEGATIVE_TESTING
+
+TEST(dc_zva) {
+  SETUP_WITH_FEATURES(CPUFeatures::kNEON);
+
+  const int zva_blocksize = 64;  // Assumed blocksize.
+  uint8_t buf[2 * zva_blocksize];
+  uintptr_t buf_addr = reinterpret_cast<uintptr_t>(buf);
+  uintptr_t aligned_addr = AlignUp(buf_addr, zva_blocksize);
+
+  START();
+  // Skip this test if the ZVA blocksize is not 64 bytes.
+  // Set up initial register values to allow the test to pass when skipped.
+  Label skip;
+  __ Movi(q0.V16B(), 0);
+  __ Movi(q1.V16B(), 0);
+  __ Movi(q2.V16B(), 0);
+  __ Movi(q3.V16B(), 0);
+
+  __ Mrs(x1, DCZID_EL0);
+  __ Cmp(x1, 4);  // 4 => DC ZVA enabled with 64-byte blocks.
+  __ B(ne, &skip);
+
+  // Fill aligned region with a pattern.
+  __ Mov(x0, aligned_addr);
+  __ Movi(q0.V16B(), 0x55);
+  __ Movi(q1.V16B(), 0xaa);
+  __ Movi(q2.V16B(), 0x55);
+  __ Movi(q3.V16B(), 0xaa);
+  __ St4(q0.V16B(), q1.V16B(), q2.V16B(), q3.V16B(), MemOperand(x0));
+
+  // Misalign the address to check DC ZVA re-aligns.
+  __ Add(x0, x0, 42);
+
+  // Clear the aligned region.
+  __ Dc(ZVA, x0);
+
+  // Reload the aligned region to check contents.
+  __ Mov(x0, aligned_addr);
+  __ Ld1(q0.V16B(), q1.V16B(), q2.V16B(), q3.V16B(), MemOperand(x0));
+
+  __ Bind(&skip);
+  END();
+
+  if (CAN_RUN()) {
+    RUN();
+    if (core.xreg(1) == 4) {
+      ASSERT_EQUAL_128(0, 0, q0);
+      ASSERT_EQUAL_128(0, 0, q1);
+      ASSERT_EQUAL_128(0, 0, q2);
+      ASSERT_EQUAL_128(0, 0, q3);
+    } else {
+      printf("SKIPPED: DC ZVA chunksize not 64-bytes");
+    }
+  }
+}
+
 #ifdef VIXL_INCLUDE_SIMULATOR_AARCH64
 // Test the pseudo-instructions that control CPUFeatures dynamically in the
 // Simulator. These are used by the test infrastructure itself, but in a fairly
diff --git a/test/aarch64/test-assembler-fp-aarch64.cc b/test/aarch64/test-assembler-fp-aarch64.cc
index 56073592..22010298 100644
--- a/test/aarch64/test-assembler-fp-aarch64.cc
+++ b/test/aarch64/test-assembler-fp-aarch64.cc
@@ -3670,720 +3670,280 @@ TEST(fcvt_half) {
   }
 }
 
+typedef void (MacroAssembler::*FcvtFn2)(const Register& rd,
+                                        const VRegister& vn);
+typedef void (MacroAssembler::*FcvtFn3)(const Register& rd,
+                                        const VRegister& vn,
+                                        int fbits);
+
+static void GenFcvt(MacroAssembler* m,
+                    FcvtFn2 fn,
+                    const Register& rd,
+                    const VRegister& vn) {
+  (m->*fn)(rd, vn);
+}
+static void GenFcvt(MacroAssembler* m,
+                    FcvtFn3 fn,
+                    const Register& rd,
+                    const VRegister& vn) {
+  (m->*fn)(rd, vn, 0);
+}
+
+template <typename F = FcvtFn2, typename T, size_t N>
+static void FcvtHelper(F fn,
+                       const T (&inputs)[N],
+                       const uint64_t (&expected)[N],
+                       int dstsize) {
+  VIXL_STATIC_ASSERT(N < 16);  // Use fewer than 16 registers.
+
+  SETUP_WITH_FEATURES(CPUFeatures::kFP);
+  START();
+
+  for (unsigned i = 0; i < N; i++) {
+    Register wi = WRegister(i);
+    Register xi = XRegister(i);
+    VRegister si = SRegister(i);
+    VRegister di = DRegister(i);
+
+    if (std::is_same<float, T>::value) {
+      __ Fmov(si, inputs[i]);
+      if (dstsize == kWRegSize) {
+        GenFcvt(&masm, fn, wi, si);
+      } else {
+        VIXL_ASSERT(dstsize == kXRegSize);
+        GenFcvt(&masm, fn, xi, si);
+      }
+    } else {
+      __ Fmov(di, inputs[i]);
+      if (dstsize == kWRegSize) {
+        GenFcvt(&masm, fn, wi, di);
+      } else {
+        VIXL_ASSERT(dstsize == kXRegSize);
+        GenFcvt(&masm, fn, xi, di);
+      }
+    }
+  }
+
+  END();
+  if (CAN_RUN()) {
+    RUN();
+
+    for (unsigned i = 0; i < N; i++) {
+      ASSERT_EQUAL_64(expected[i], XRegister(i));
+    }
+  }
+}
+
+// Largest float/double < INT32_MAX.
+static const float kLargestF32ltI32Max = RawbitsToFloat(0x4effffff);
+static const double kLargestF64ltI32Max = kWMaxInt - 1;
+
+// Smallest float/double > INT32_MIN.
+static const float kSmallestF32gtI32Min = RawbitsToFloat(0xceffffff);
+static const double kSmallestF64gtI32Min = kWMinInt + 1;
+
+// Largest float/double < INT64_MAX.
+static const float kLargestF32ltI64Max = RawbitsToFloat(0x5effffff);
+static const double kLargestF64ltI64Max = RawbitsToDouble(0x43dfffffffffffff);
+
+// Smallest float/double > INT64_MIN.
+static const float kSmallestF32gtI64Min = RawbitsToFloat(0xdeffffff);
+static const double kSmallestF64gtI64Min = RawbitsToDouble(0xc3dfffffffffffff);
+
+// Largest float/double < UINT32_MAX.
+static const float kLargestF32ltU32Max = 0xffffff00;
+static const double kLargestF64ltU32Max = 0xfffffffe;
+
+// Largest float/double < UINT64_MAX.
+static const float kLargestF32ltU64Max = 0xffffff0000000000;
+static const double kLargestF64ltU64Max = 0xfffffffffffff800;
+
+TEST(fcvt_infinity) {
+  float inputs_s[] = {kFP32PositiveInfinity, kFP32NegativeInfinity};
+  double inputs_d[] = {kFP64PositiveInfinity, kFP64NegativeInfinity};
+  uint64_t expected_w[] = {0x7fffffff, 0x80000000};
+  uint64_t expected_x[] = {0x7fffffffffffffff, 0x8000000000000000};
+
+  // Test all combinations of fcvt, input size and output size.
+  FcvtHelper(&MacroAssembler::Fcvtas, inputs_s, expected_w, kWRegSize);
+  FcvtHelper(&MacroAssembler::Fcvtms, inputs_s, expected_w, kWRegSize);
+  FcvtHelper(&MacroAssembler::Fcvtns, inputs_s, expected_w, kWRegSize);
+  FcvtHelper<FcvtFn3>(&MacroAssembler::Fcvtzs, inputs_s, expected_w, kWRegSize);
+
+  FcvtHelper(&MacroAssembler::Fcvtas, inputs_d, expected_w, kWRegSize);
+  FcvtHelper(&MacroAssembler::Fcvtms, inputs_d, expected_w, kWRegSize);
+  FcvtHelper(&MacroAssembler::Fcvtns, inputs_d, expected_w, kWRegSize);
+  FcvtHelper<FcvtFn3>(&MacroAssembler::Fcvtzs, inputs_d, expected_w, kWRegSize);
+
+  FcvtHelper(&MacroAssembler::Fcvtas, inputs_s, expected_x, kXRegSize);
+  FcvtHelper(&MacroAssembler::Fcvtms, inputs_s, expected_x, kXRegSize);
+  FcvtHelper(&MacroAssembler::Fcvtns, inputs_s, expected_x, kXRegSize);
+  FcvtHelper<FcvtFn3>(&MacroAssembler::Fcvtzs, inputs_s, expected_x, kXRegSize);
+
+  FcvtHelper(&MacroAssembler::Fcvtas, inputs_d, expected_x, kXRegSize);
+  FcvtHelper(&MacroAssembler::Fcvtms, inputs_d, expected_x, kXRegSize);
+  FcvtHelper(&MacroAssembler::Fcvtns, inputs_d, expected_x, kXRegSize);
+  FcvtHelper<FcvtFn3>(&MacroAssembler::Fcvtzs, inputs_d, expected_x, kXRegSize);
+}
+
+TEST(fcvt_ws_minmax) {
+  float inputs[] = {kLargestF32ltI32Max, kSmallestF32gtI32Min};
+  uint64_t expected[] = {0x7fffff80, 0x80000080};
+  FcvtHelper(&MacroAssembler::Fcvtas, inputs, expected, kWRegSize);
+  FcvtHelper(&MacroAssembler::Fcvtms, inputs, expected, kWRegSize);
+  FcvtHelper(&MacroAssembler::Fcvtns, inputs, expected, kWRegSize);
+  FcvtHelper<FcvtFn3>(&MacroAssembler::Fcvtzs, inputs, expected, kWRegSize);
+
+  float inputs_u[] = {kLargestF32ltU32Max};
+  uint64_t expected_u[] = {0xffffff00};
+  FcvtHelper(&MacroAssembler::Fcvtau, inputs_u, expected_u, kWRegSize);
+  FcvtHelper(&MacroAssembler::Fcvtmu, inputs_u, expected_u, kWRegSize);
+  FcvtHelper(&MacroAssembler::Fcvtnu, inputs_u, expected_u, kWRegSize);
+  FcvtHelper<FcvtFn3>(&MacroAssembler::Fcvtzu, inputs_u, expected_u, kWRegSize);
+}
+
+TEST(fcvt_wd_minmax) {
+  double inputs[] = {kLargestF64ltI32Max, kSmallestF64gtI32Min};
+  uint64_t expected[] = {0x7ffffffe, 0x80000001};
+  FcvtHelper(&MacroAssembler::Fcvtas, inputs, expected, kWRegSize);
+  FcvtHelper(&MacroAssembler::Fcvtms, inputs, expected, kWRegSize);
+  FcvtHelper(&MacroAssembler::Fcvtns, inputs, expected, kWRegSize);
+  FcvtHelper<FcvtFn3>(&MacroAssembler::Fcvtzs, inputs, expected, kWRegSize);
+
+  double inputs_u[] = {kLargestF64ltU32Max};
+  uint64_t expected_u[] = {0xfffffffe};
+  FcvtHelper(&MacroAssembler::Fcvtau, inputs_u, expected_u, kWRegSize);
+  FcvtHelper(&MacroAssembler::Fcvtmu, inputs_u, expected_u, kWRegSize);
+  FcvtHelper(&MacroAssembler::Fcvtnu, inputs_u, expected_u, kWRegSize);
+  FcvtHelper<FcvtFn3>(&MacroAssembler::Fcvtzu, inputs_u, expected_u, kWRegSize);
+}
+
+TEST(fcvt_xs_minmax) {
+  float inputs[] = {kLargestF32ltI64Max, kSmallestF32gtI64Min};
+  uint64_t expected[] = {0x7fffff8000000000, 0x8000008000000000};
+  FcvtHelper(&MacroAssembler::Fcvtas, inputs, expected, kXRegSize);
+  FcvtHelper(&MacroAssembler::Fcvtms, inputs, expected, kXRegSize);
+  FcvtHelper(&MacroAssembler::Fcvtns, inputs, expected, kXRegSize);
+  FcvtHelper<FcvtFn3>(&MacroAssembler::Fcvtzs, inputs, expected, kXRegSize);
+
+  float inputs_u[] = {kLargestF32ltU64Max};
+  uint64_t expected_u[] = {0xffffff0000000000};
+  FcvtHelper(&MacroAssembler::Fcvtau, inputs_u, expected_u, kXRegSize);
+  FcvtHelper(&MacroAssembler::Fcvtmu, inputs_u, expected_u, kXRegSize);
+  FcvtHelper(&MacroAssembler::Fcvtnu, inputs_u, expected_u, kXRegSize);
+  FcvtHelper<FcvtFn3>(&MacroAssembler::Fcvtzu, inputs_u, expected_u, kXRegSize);
+}
+
+TEST(fcvt_xd_minmax) {
+  double inputs[] = {kLargestF64ltI64Max, kSmallestF64gtI64Min};
+  uint64_t expected[] = {0x7ffffffffffffc00, 0x8000000000000400};
+  FcvtHelper(&MacroAssembler::Fcvtas, inputs, expected, kXRegSize);
+  FcvtHelper(&MacroAssembler::Fcvtms, inputs, expected, kXRegSize);
+  FcvtHelper(&MacroAssembler::Fcvtns, inputs, expected, kXRegSize);
+  FcvtHelper<FcvtFn3>(&MacroAssembler::Fcvtzs, inputs, expected, kXRegSize);
+
+  double inputs_u[] = {kLargestF64ltU64Max};
+  uint64_t expected_u[] = {0xfffffffffffff800};
+  FcvtHelper(&MacroAssembler::Fcvtau, inputs_u, expected_u, kXRegSize);
+  FcvtHelper(&MacroAssembler::Fcvtmu, inputs_u, expected_u, kXRegSize);
+  FcvtHelper(&MacroAssembler::Fcvtnu, inputs_u, expected_u, kXRegSize);
+  FcvtHelper<FcvtFn3>(&MacroAssembler::Fcvtzu, inputs_u, expected_u, kXRegSize);
+}
 
 TEST(fcvtas) {
-  SETUP_WITH_FEATURES(CPUFeatures::kFP);
+  float inputs_s[] = {1.0, 1.1, 2.5, -2.5};
+  double inputs_d[] = {1.0, 1.1, 2.5, -2.5};
+  uint64_t expected_w[] = {1, 1, 3, 0xfffffffd};
+  uint64_t expected_x[] = {1, 1, 3, 0xfffffffffffffffd};
 
-  START();
-  __ Fmov(s0, 1.0);
-  __ Fmov(s1, 1.1);
-  __ Fmov(s2, 2.5);
-  __ Fmov(s3, -2.5);
-  __ Fmov(s4, kFP32PositiveInfinity);
-  __ Fmov(s5, kFP32NegativeInfinity);
-  __ Fmov(s6, 0x7fffff80);  // Largest float < INT32_MAX.
-  __ Fneg(s7, s6);          // Smallest float > INT32_MIN.
-  __ Fmov(d8, 1.0);
-  __ Fmov(d9, 1.1);
-  __ Fmov(d10, 2.5);
-  __ Fmov(d11, -2.5);
-  __ Fmov(d12, kFP64PositiveInfinity);
-  __ Fmov(d13, kFP64NegativeInfinity);
-  __ Fmov(d14, kWMaxInt - 1);
-  __ Fmov(d15, kWMinInt + 1);
-  __ Fmov(s17, 1.1);
-  __ Fmov(s18, 2.5);
-  __ Fmov(s19, -2.5);
-  __ Fmov(s20, kFP32PositiveInfinity);
-  __ Fmov(s21, kFP32NegativeInfinity);
-  __ Fmov(s22, 0x7fffff8000000000);  // Largest float < INT64_MAX.
-  __ Fneg(s23, s22);                 // Smallest float > INT64_MIN.
-  __ Fmov(d24, 1.1);
-  __ Fmov(d25, 2.5);
-  __ Fmov(d26, -2.5);
-  __ Fmov(d27, kFP64PositiveInfinity);
-  __ Fmov(d28, kFP64NegativeInfinity);
-  __ Fmov(d29, 0x7ffffffffffffc00);  // Largest double < INT64_MAX.
-  __ Fneg(d30, d29);                 // Smallest double > INT64_MIN.
-
-  __ Fcvtas(w0, s0);
-  __ Fcvtas(w1, s1);
-  __ Fcvtas(w2, s2);
-  __ Fcvtas(w3, s3);
-  __ Fcvtas(w4, s4);
-  __ Fcvtas(w5, s5);
-  __ Fcvtas(w6, s6);
-  __ Fcvtas(w7, s7);
-  __ Fcvtas(w8, d8);
-  __ Fcvtas(w9, d9);
-  __ Fcvtas(w10, d10);
-  __ Fcvtas(w11, d11);
-  __ Fcvtas(w12, d12);
-  __ Fcvtas(w13, d13);
-  __ Fcvtas(w14, d14);
-  __ Fcvtas(w15, d15);
-  __ Fcvtas(x17, s17);
-  __ Fcvtas(x18, s18);
-  __ Fcvtas(x19, s19);
-  __ Fcvtas(x20, s20);
-  __ Fcvtas(x21, s21);
-  __ Fcvtas(x22, s22);
-  __ Fcvtas(x23, s23);
-  __ Fcvtas(x24, d24);
-  __ Fcvtas(x25, d25);
-  __ Fcvtas(x26, d26);
-  __ Fcvtas(x27, d27);
-  __ Fcvtas(x28, d28);
-  __ Fcvtas(x29, d29);
-  __ Fcvtas(x30, d30);
-  END();
-
-  if (CAN_RUN()) {
-    RUN();
-
-    ASSERT_EQUAL_64(1, x0);
-    ASSERT_EQUAL_64(1, x1);
-    ASSERT_EQUAL_64(3, x2);
-    ASSERT_EQUAL_64(0xfffffffd, x3);
-    ASSERT_EQUAL_64(0x7fffffff, x4);
-    ASSERT_EQUAL_64(0x80000000, x5);
-    ASSERT_EQUAL_64(0x7fffff80, x6);
-    ASSERT_EQUAL_64(0x80000080, x7);
-    ASSERT_EQUAL_64(1, x8);
-    ASSERT_EQUAL_64(1, x9);
-    ASSERT_EQUAL_64(3, x10);
-    ASSERT_EQUAL_64(0xfffffffd, x11);
-    ASSERT_EQUAL_64(0x7fffffff, x12);
-    ASSERT_EQUAL_64(0x80000000, x13);
-    ASSERT_EQUAL_64(0x7ffffffe, x14);
-    ASSERT_EQUAL_64(0x80000001, x15);
-    ASSERT_EQUAL_64(1, x17);
-    ASSERT_EQUAL_64(3, x18);
-    ASSERT_EQUAL_64(0xfffffffffffffffd, x19);
-    ASSERT_EQUAL_64(0x7fffffffffffffff, x20);
-    ASSERT_EQUAL_64(0x8000000000000000, x21);
-    ASSERT_EQUAL_64(0x7fffff8000000000, x22);
-    ASSERT_EQUAL_64(0x8000008000000000, x23);
-    ASSERT_EQUAL_64(1, x24);
-    ASSERT_EQUAL_64(3, x25);
-    ASSERT_EQUAL_64(0xfffffffffffffffd, x26);
-    ASSERT_EQUAL_64(0x7fffffffffffffff, x27);
-    ASSERT_EQUAL_64(0x8000000000000000, x28);
-    ASSERT_EQUAL_64(0x7ffffffffffffc00, x29);
-    ASSERT_EQUAL_64(0x8000000000000400, x30);
-  }
+  FcvtHelper(&MacroAssembler::Fcvtas, inputs_s, expected_w, kWRegSize);
+  FcvtHelper(&MacroAssembler::Fcvtas, inputs_d, expected_w, kWRegSize);
+  FcvtHelper(&MacroAssembler::Fcvtas, inputs_s, expected_x, kXRegSize);
+  FcvtHelper(&MacroAssembler::Fcvtas, inputs_d, expected_x, kXRegSize);
 }
 
-
 TEST(fcvtau) {
-  SETUP_WITH_FEATURES(CPUFeatures::kFP);
+  float inputs_s[] = {1.0, 1.1, 2.5, -2.5, 0x100000000};
+  double inputs_d[] = {1.0, 1.1, 2.5, -2.5, 0x100000000};
+  uint64_t expected_w[] = {1, 1, 3, 0, 0xffffffff};
+  uint64_t expected_x[] = {1, 1, 3, 0, 0x100000000};
 
-  START();
-  __ Fmov(s0, 1.0);
-  __ Fmov(s1, 1.1);
-  __ Fmov(s2, 2.5);
-  __ Fmov(s3, -2.5);
-  __ Fmov(s4, kFP32PositiveInfinity);
-  __ Fmov(s5, kFP32NegativeInfinity);
-  __ Fmov(s6, 0xffffff00);  // Largest float < UINT32_MAX.
-  __ Fmov(d8, 1.0);
-  __ Fmov(d9, 1.1);
-  __ Fmov(d10, 2.5);
-  __ Fmov(d11, -2.5);
-  __ Fmov(d12, kFP64PositiveInfinity);
-  __ Fmov(d13, kFP64NegativeInfinity);
-  __ Fmov(d14, 0xfffffffe);
-  __ Fmov(s16, 1.0);
-  __ Fmov(s17, 1.1);
-  __ Fmov(s18, 2.5);
-  __ Fmov(s19, -2.5);
-  __ Fmov(s20, kFP32PositiveInfinity);
-  __ Fmov(s21, kFP32NegativeInfinity);
-  __ Fmov(s22, 0xffffff0000000000);  // Largest float < UINT64_MAX.
-  __ Fmov(d24, 1.1);
-  __ Fmov(d25, 2.5);
-  __ Fmov(d26, -2.5);
-  __ Fmov(d27, kFP64PositiveInfinity);
-  __ Fmov(d28, kFP64NegativeInfinity);
-  __ Fmov(d29, 0xfffffffffffff800);  // Largest double < UINT64_MAX.
-  __ Fmov(s30, 0x100000000);
-
-  __ Fcvtau(w0, s0);
-  __ Fcvtau(w1, s1);
-  __ Fcvtau(w2, s2);
-  __ Fcvtau(w3, s3);
-  __ Fcvtau(w4, s4);
-  __ Fcvtau(w5, s5);
-  __ Fcvtau(w6, s6);
-  __ Fcvtau(w8, d8);
-  __ Fcvtau(w9, d9);
-  __ Fcvtau(w10, d10);
-  __ Fcvtau(w11, d11);
-  __ Fcvtau(w12, d12);
-  __ Fcvtau(w13, d13);
-  __ Fcvtau(w14, d14);
-  __ Fcvtau(w15, d15);
-  __ Fcvtau(x16, s16);
-  __ Fcvtau(x17, s17);
-  __ Fcvtau(x18, s18);
-  __ Fcvtau(x19, s19);
-  __ Fcvtau(x20, s20);
-  __ Fcvtau(x21, s21);
-  __ Fcvtau(x22, s22);
-  __ Fcvtau(x24, d24);
-  __ Fcvtau(x25, d25);
-  __ Fcvtau(x26, d26);
-  __ Fcvtau(x27, d27);
-  __ Fcvtau(x28, d28);
-  __ Fcvtau(x29, d29);
-  __ Fcvtau(w30, s30);
-  END();
-
-  if (CAN_RUN()) {
-    RUN();
-
-    ASSERT_EQUAL_64(1, x0);
-    ASSERT_EQUAL_64(1, x1);
-    ASSERT_EQUAL_64(3, x2);
-    ASSERT_EQUAL_64(0, x3);
-    ASSERT_EQUAL_64(0xffffffff, x4);
-    ASSERT_EQUAL_64(0, x5);
-    ASSERT_EQUAL_64(0xffffff00, x6);
-    ASSERT_EQUAL_64(1, x8);
-    ASSERT_EQUAL_64(1, x9);
-    ASSERT_EQUAL_64(3, x10);
-    ASSERT_EQUAL_64(0, x11);
-    ASSERT_EQUAL_64(0xffffffff, x12);
-    ASSERT_EQUAL_64(0, x13);
-    ASSERT_EQUAL_64(0xfffffffe, x14);
-    ASSERT_EQUAL_64(1, x16);
-    ASSERT_EQUAL_64(1, x17);
-    ASSERT_EQUAL_64(3, x18);
-    ASSERT_EQUAL_64(0, x19);
-    ASSERT_EQUAL_64(0xffffffffffffffff, x20);
-    ASSERT_EQUAL_64(0, x21);
-    ASSERT_EQUAL_64(0xffffff0000000000, x22);
-    ASSERT_EQUAL_64(1, x24);
-    ASSERT_EQUAL_64(3, x25);
-    ASSERT_EQUAL_64(0, x26);
-    ASSERT_EQUAL_64(0xffffffffffffffff, x27);
-    ASSERT_EQUAL_64(0, x28);
-    ASSERT_EQUAL_64(0xfffffffffffff800, x29);
-    ASSERT_EQUAL_64(0xffffffff, x30);
-  }
+  FcvtHelper(&MacroAssembler::Fcvtau, inputs_s, expected_w, kWRegSize);
+  FcvtHelper(&MacroAssembler::Fcvtau, inputs_d, expected_w, kWRegSize);
+  FcvtHelper(&MacroAssembler::Fcvtau, inputs_s, expected_x, kXRegSize);
+  FcvtHelper(&MacroAssembler::Fcvtau, inputs_d, expected_x, kXRegSize);
 }
 
-
 TEST(fcvtms) {
-  SETUP_WITH_FEATURES(CPUFeatures::kFP);
+  float inputs_s[] = {1.0, 1.1, 1.5, -1.5};
+  double inputs_d[] = {1.0, 1.1, 1.5, -1.5};
+  uint64_t expected_w[] = {1, 1, 1, 0xfffffffe};
+  uint64_t expected_x[] = {1, 1, 1, 0xfffffffffffffffe};
 
-  START();
-  __ Fmov(s0, 1.0);
-  __ Fmov(s1, 1.1);
-  __ Fmov(s2, 1.5);
-  __ Fmov(s3, -1.5);
-  __ Fmov(s4, kFP32PositiveInfinity);
-  __ Fmov(s5, kFP32NegativeInfinity);
-  __ Fmov(s6, 0x7fffff80);  // Largest float < INT32_MAX.
-  __ Fneg(s7, s6);          // Smallest float > INT32_MIN.
-  __ Fmov(d8, 1.0);
-  __ Fmov(d9, 1.1);
-  __ Fmov(d10, 1.5);
-  __ Fmov(d11, -1.5);
-  __ Fmov(d12, kFP64PositiveInfinity);
-  __ Fmov(d13, kFP64NegativeInfinity);
-  __ Fmov(d14, kWMaxInt - 1);
-  __ Fmov(d15, kWMinInt + 1);
-  __ Fmov(s17, 1.1);
-  __ Fmov(s18, 1.5);
-  __ Fmov(s19, -1.5);
-  __ Fmov(s20, kFP32PositiveInfinity);
-  __ Fmov(s21, kFP32NegativeInfinity);
-  __ Fmov(s22, 0x7fffff8000000000);  // Largest float < INT64_MAX.
-  __ Fneg(s23, s22);                 // Smallest float > INT64_MIN.
-  __ Fmov(d24, 1.1);
-  __ Fmov(d25, 1.5);
-  __ Fmov(d26, -1.5);
-  __ Fmov(d27, kFP64PositiveInfinity);
-  __ Fmov(d28, kFP64NegativeInfinity);
-  __ Fmov(d29, 0x7ffffffffffffc00);  // Largest double < INT64_MAX.
-  __ Fneg(d30, d29);                 // Smallest double > INT64_MIN.
-
-  __ Fcvtms(w0, s0);
-  __ Fcvtms(w1, s1);
-  __ Fcvtms(w2, s2);
-  __ Fcvtms(w3, s3);
-  __ Fcvtms(w4, s4);
-  __ Fcvtms(w5, s5);
-  __ Fcvtms(w6, s6);
-  __ Fcvtms(w7, s7);
-  __ Fcvtms(w8, d8);
-  __ Fcvtms(w9, d9);
-  __ Fcvtms(w10, d10);
-  __ Fcvtms(w11, d11);
-  __ Fcvtms(w12, d12);
-  __ Fcvtms(w13, d13);
-  __ Fcvtms(w14, d14);
-  __ Fcvtms(w15, d15);
-  __ Fcvtms(x17, s17);
-  __ Fcvtms(x18, s18);
-  __ Fcvtms(x19, s19);
-  __ Fcvtms(x20, s20);
-  __ Fcvtms(x21, s21);
-  __ Fcvtms(x22, s22);
-  __ Fcvtms(x23, s23);
-  __ Fcvtms(x24, d24);
-  __ Fcvtms(x25, d25);
-  __ Fcvtms(x26, d26);
-  __ Fcvtms(x27, d27);
-  __ Fcvtms(x28, d28);
-  __ Fcvtms(x29, d29);
-  __ Fcvtms(x30, d30);
-  END();
-
-  if (CAN_RUN()) {
-    RUN();
-
-    ASSERT_EQUAL_64(1, x0);
-    ASSERT_EQUAL_64(1, x1);
-    ASSERT_EQUAL_64(1, x2);
-    ASSERT_EQUAL_64(0xfffffffe, x3);
-    ASSERT_EQUAL_64(0x7fffffff, x4);
-    ASSERT_EQUAL_64(0x80000000, x5);
-    ASSERT_EQUAL_64(0x7fffff80, x6);
-    ASSERT_EQUAL_64(0x80000080, x7);
-    ASSERT_EQUAL_64(1, x8);
-    ASSERT_EQUAL_64(1, x9);
-    ASSERT_EQUAL_64(1, x10);
-    ASSERT_EQUAL_64(0xfffffffe, x11);
-    ASSERT_EQUAL_64(0x7fffffff, x12);
-    ASSERT_EQUAL_64(0x80000000, x13);
-    ASSERT_EQUAL_64(0x7ffffffe, x14);
-    ASSERT_EQUAL_64(0x80000001, x15);
-    ASSERT_EQUAL_64(1, x17);
-    ASSERT_EQUAL_64(1, x18);
-    ASSERT_EQUAL_64(0xfffffffffffffffe, x19);
-    ASSERT_EQUAL_64(0x7fffffffffffffff, x20);
-    ASSERT_EQUAL_64(0x8000000000000000, x21);
-    ASSERT_EQUAL_64(0x7fffff8000000000, x22);
-    ASSERT_EQUAL_64(0x8000008000000000, x23);
-    ASSERT_EQUAL_64(1, x24);
-    ASSERT_EQUAL_64(1, x25);
-    ASSERT_EQUAL_64(0xfffffffffffffffe, x26);
-    ASSERT_EQUAL_64(0x7fffffffffffffff, x27);
-    ASSERT_EQUAL_64(0x8000000000000000, x28);
-    ASSERT_EQUAL_64(0x7ffffffffffffc00, x29);
-    ASSERT_EQUAL_64(0x8000000000000400, x30);
-  }
+  FcvtHelper(&MacroAssembler::Fcvtms, inputs_s, expected_w, kWRegSize);
+  FcvtHelper(&MacroAssembler::Fcvtms, inputs_d, expected_w, kWRegSize);
+  FcvtHelper(&MacroAssembler::Fcvtms, inputs_s, expected_x, kXRegSize);
+  FcvtHelper(&MacroAssembler::Fcvtms, inputs_d, expected_x, kXRegSize);
 }
 
-
 TEST(fcvtmu) {
-  SETUP_WITH_FEATURES(CPUFeatures::kFP);
+  float inputs_s[] = {1.0, 1.1, 1.5, -1.5};
+  double inputs_d[] = {1.0, 1.1, 1.5, -1.5};
+  uint64_t expected_w[] = {1, 1, 1, 0};
+  uint64_t expected_x[] = {1, 1, 1, 0};
 
-  START();
-  __ Fmov(s0, 1.0);
-  __ Fmov(s1, 1.1);
-  __ Fmov(s2, 1.5);
-  __ Fmov(s3, -1.5);
-  __ Fmov(s4, kFP32PositiveInfinity);
-  __ Fmov(s5, kFP32NegativeInfinity);
-  __ Fmov(s6, 0x7fffff80);  // Largest float < INT32_MAX.
-  __ Fneg(s7, s6);          // Smallest float > INT32_MIN.
-  __ Fmov(d8, 1.0);
-  __ Fmov(d9, 1.1);
-  __ Fmov(d10, 1.5);
-  __ Fmov(d11, -1.5);
-  __ Fmov(d12, kFP64PositiveInfinity);
-  __ Fmov(d13, kFP64NegativeInfinity);
-  __ Fmov(d14, kWMaxInt - 1);
-  __ Fmov(d15, kWMinInt + 1);
-  __ Fmov(s17, 1.1);
-  __ Fmov(s18, 1.5);
-  __ Fmov(s19, -1.5);
-  __ Fmov(s20, kFP32PositiveInfinity);
-  __ Fmov(s21, kFP32NegativeInfinity);
-  __ Fmov(s22, 0x7fffff8000000000);  // Largest float < INT64_MAX.
-  __ Fneg(s23, s22);                 // Smallest float > INT64_MIN.
-  __ Fmov(d24, 1.1);
-  __ Fmov(d25, 1.5);
-  __ Fmov(d26, -1.5);
-  __ Fmov(d27, kFP64PositiveInfinity);
-  __ Fmov(d28, kFP64NegativeInfinity);
-  __ Fmov(d29, 0x7ffffffffffffc00);  // Largest double < INT64_MAX.
-  __ Fneg(d30, d29);                 // Smallest double > INT64_MIN.
-
-  __ Fcvtmu(w0, s0);
-  __ Fcvtmu(w1, s1);
-  __ Fcvtmu(w2, s2);
-  __ Fcvtmu(w3, s3);
-  __ Fcvtmu(w4, s4);
-  __ Fcvtmu(w5, s5);
-  __ Fcvtmu(w6, s6);
-  __ Fcvtmu(w7, s7);
-  __ Fcvtmu(w8, d8);
-  __ Fcvtmu(w9, d9);
-  __ Fcvtmu(w10, d10);
-  __ Fcvtmu(w11, d11);
-  __ Fcvtmu(w12, d12);
-  __ Fcvtmu(w13, d13);
-  __ Fcvtmu(w14, d14);
-  __ Fcvtmu(x17, s17);
-  __ Fcvtmu(x18, s18);
-  __ Fcvtmu(x19, s19);
-  __ Fcvtmu(x20, s20);
-  __ Fcvtmu(x21, s21);
-  __ Fcvtmu(x22, s22);
-  __ Fcvtmu(x23, s23);
-  __ Fcvtmu(x24, d24);
-  __ Fcvtmu(x25, d25);
-  __ Fcvtmu(x26, d26);
-  __ Fcvtmu(x27, d27);
-  __ Fcvtmu(x28, d28);
-  __ Fcvtmu(x29, d29);
-  __ Fcvtmu(x30, d30);
-  END();
-
-  if (CAN_RUN()) {
-    RUN();
-
-    ASSERT_EQUAL_64(1, x0);
-    ASSERT_EQUAL_64(1, x1);
-    ASSERT_EQUAL_64(1, x2);
-    ASSERT_EQUAL_64(0, x3);
-    ASSERT_EQUAL_64(0xffffffff, x4);
-    ASSERT_EQUAL_64(0, x5);
-    ASSERT_EQUAL_64(0x7fffff80, x6);
-    ASSERT_EQUAL_64(0, x7);
-    ASSERT_EQUAL_64(1, x8);
-    ASSERT_EQUAL_64(1, x9);
-    ASSERT_EQUAL_64(1, x10);
-    ASSERT_EQUAL_64(0, x11);
-    ASSERT_EQUAL_64(0xffffffff, x12);
-    ASSERT_EQUAL_64(0, x13);
-    ASSERT_EQUAL_64(0x7ffffffe, x14);
-    ASSERT_EQUAL_64(1, x17);
-    ASSERT_EQUAL_64(1, x18);
-    ASSERT_EQUAL_64(0, x19);
-    ASSERT_EQUAL_64(0xffffffffffffffff, x20);
-    ASSERT_EQUAL_64(0, x21);
-    ASSERT_EQUAL_64(0x7fffff8000000000, x22);
-    ASSERT_EQUAL_64(0, x23);
-    ASSERT_EQUAL_64(1, x24);
-    ASSERT_EQUAL_64(1, x25);
-    ASSERT_EQUAL_64(0, x26);
-    ASSERT_EQUAL_64(0xffffffffffffffff, x27);
-    ASSERT_EQUAL_64(0, x28);
-    ASSERT_EQUAL_64(0x7ffffffffffffc00, x29);
-    ASSERT_EQUAL_64(0, x30);
-  }
+  FcvtHelper(&MacroAssembler::Fcvtmu, inputs_s, expected_w, kWRegSize);
+  FcvtHelper(&MacroAssembler::Fcvtmu, inputs_d, expected_w, kWRegSize);
+  FcvtHelper(&MacroAssembler::Fcvtmu, inputs_s, expected_x, kXRegSize);
+  FcvtHelper(&MacroAssembler::Fcvtmu, inputs_d, expected_x, kXRegSize);
 }
 
-
 TEST(fcvtns) {
-  SETUP_WITH_FEATURES(CPUFeatures::kFP);
+  float inputs_s[] = {1.0, 1.1, 1.5, -1.5};
+  double inputs_d[] = {1.0, 1.1, 1.5, -1.5};
+  uint64_t expected_w[] = {1, 1, 2, 0xfffffffe};  // 1.5 -> 2, -1.5 -> -2.
+  uint64_t expected_x[] = {1, 1, 2, 0xfffffffffffffffe};  // As above, 64-bit.
 
-  START();
-  __ Fmov(s0, 1.0);
-  __ Fmov(s1, 1.1);
-  __ Fmov(s2, 1.5);
-  __ Fmov(s3, -1.5);
-  __ Fmov(s4, kFP32PositiveInfinity);
-  __ Fmov(s5, kFP32NegativeInfinity);
-  __ Fmov(s6, 0x7fffff80);  // Largest float < INT32_MAX.
-  __ Fneg(s7, s6);          // Smallest float > INT32_MIN.
-  __ Fmov(d8, 1.0);
-  __ Fmov(d9, 1.1);
-  __ Fmov(d10, 1.5);
-  __ Fmov(d11, -1.5);
-  __ Fmov(d12, kFP64PositiveInfinity);
-  __ Fmov(d13, kFP64NegativeInfinity);
-  __ Fmov(d14, kWMaxInt - 1);
-  __ Fmov(d15, kWMinInt + 1);
-  __ Fmov(s17, 1.1);
-  __ Fmov(s18, 1.5);
-  __ Fmov(s19, -1.5);
-  __ Fmov(s20, kFP32PositiveInfinity);
-  __ Fmov(s21, kFP32NegativeInfinity);
-  __ Fmov(s22, 0x7fffff8000000000);  // Largest float < INT64_MAX.
-  __ Fneg(s23, s22);                 // Smallest float > INT64_MIN.
-  __ Fmov(d24, 1.1);
-  __ Fmov(d25, 1.5);
-  __ Fmov(d26, -1.5);
-  __ Fmov(d27, kFP64PositiveInfinity);
-  __ Fmov(d28, kFP64NegativeInfinity);
-  __ Fmov(d29, 0x7ffffffffffffc00);  // Largest double < INT64_MAX.
-  __ Fneg(d30, d29);                 // Smallest double > INT64_MIN.
-
-  __ Fcvtns(w0, s0);
-  __ Fcvtns(w1, s1);
-  __ Fcvtns(w2, s2);
-  __ Fcvtns(w3, s3);
-  __ Fcvtns(w4, s4);
-  __ Fcvtns(w5, s5);
-  __ Fcvtns(w6, s6);
-  __ Fcvtns(w7, s7);
-  __ Fcvtns(w8, d8);
-  __ Fcvtns(w9, d9);
-  __ Fcvtns(w10, d10);
-  __ Fcvtns(w11, d11);
-  __ Fcvtns(w12, d12);
-  __ Fcvtns(w13, d13);
-  __ Fcvtns(w14, d14);
-  __ Fcvtns(w15, d15);
-  __ Fcvtns(x17, s17);
-  __ Fcvtns(x18, s18);
-  __ Fcvtns(x19, s19);
-  __ Fcvtns(x20, s20);
-  __ Fcvtns(x21, s21);
-  __ Fcvtns(x22, s22);
-  __ Fcvtns(x23, s23);
-  __ Fcvtns(x24, d24);
-  __ Fcvtns(x25, d25);
-  __ Fcvtns(x26, d26);
-  __ Fcvtns(x27, d27);
-  __ Fcvtns(x28, d28);
-  __ Fcvtns(x29, d29);
-  __ Fcvtns(x30, d30);
-  END();
-
-  if (CAN_RUN()) {
-    RUN();
-
-    ASSERT_EQUAL_64(1, x0);
-    ASSERT_EQUAL_64(1, x1);
-    ASSERT_EQUAL_64(2, x2);
-    ASSERT_EQUAL_64(0xfffffffe, x3);
-    ASSERT_EQUAL_64(0x7fffffff, x4);
-    ASSERT_EQUAL_64(0x80000000, x5);
-    ASSERT_EQUAL_64(0x7fffff80, x6);
-    ASSERT_EQUAL_64(0x80000080, x7);
-    ASSERT_EQUAL_64(1, x8);
-    ASSERT_EQUAL_64(1, x9);
-    ASSERT_EQUAL_64(2, x10);
-    ASSERT_EQUAL_64(0xfffffffe, x11);
-    ASSERT_EQUAL_64(0x7fffffff, x12);
-    ASSERT_EQUAL_64(0x80000000, x13);
-    ASSERT_EQUAL_64(0x7ffffffe, x14);
-    ASSERT_EQUAL_64(0x80000001, x15);
-    ASSERT_EQUAL_64(1, x17);
-    ASSERT_EQUAL_64(2, x18);
-    ASSERT_EQUAL_64(0xfffffffffffffffe, x19);
-    ASSERT_EQUAL_64(0x7fffffffffffffff, x20);
-    ASSERT_EQUAL_64(0x8000000000000000, x21);
-    ASSERT_EQUAL_64(0x7fffff8000000000, x22);
-    ASSERT_EQUAL_64(0x8000008000000000, x23);
-    ASSERT_EQUAL_64(1, x24);
-    ASSERT_EQUAL_64(2, x25);
-    ASSERT_EQUAL_64(0xfffffffffffffffe, x26);
-    ASSERT_EQUAL_64(0x7fffffffffffffff, x27);
-    ASSERT_EQUAL_64(0x8000000000000000, x28);
-    ASSERT_EQUAL_64(0x7ffffffffffffc00, x29);
-    ASSERT_EQUAL_64(0x8000000000000400, x30);
-  }
+  FcvtHelper(&MacroAssembler::Fcvtns, inputs_s, expected_w, kWRegSize);
+  FcvtHelper(&MacroAssembler::Fcvtns, inputs_d, expected_w, kWRegSize);
+  FcvtHelper(&MacroAssembler::Fcvtns, inputs_s, expected_x, kXRegSize);
+  FcvtHelper(&MacroAssembler::Fcvtns, inputs_d, expected_x, kXRegSize);
 }
 
-
 TEST(fcvtnu) {
-  SETUP_WITH_FEATURES(CPUFeatures::kFP);
+  float inputs_s[] = {1.0, 1.1, 1.5, -1.5, 0x100000000};
+  double inputs_d[] = {1.0, 1.1, 1.5, -1.5, 0x100000000};
+  uint64_t expected_w[] = {1, 1, 2, 0, 0xffffffff};  // 2^32 -> UINT32_MAX.
+  uint64_t expected_x[] = {1, 1, 2, 0, 0x100000000};  // 2^32 fits in 64 bits.
 
-  START();
-  __ Fmov(s0, 1.0);
-  __ Fmov(s1, 1.1);
-  __ Fmov(s2, 1.5);
-  __ Fmov(s3, -1.5);
-  __ Fmov(s4, kFP32PositiveInfinity);
-  __ Fmov(s5, kFP32NegativeInfinity);
-  __ Fmov(s6, 0xffffff00);  // Largest float < UINT32_MAX.
-  __ Fmov(d8, 1.0);
-  __ Fmov(d9, 1.1);
-  __ Fmov(d10, 1.5);
-  __ Fmov(d11, -1.5);
-  __ Fmov(d12, kFP64PositiveInfinity);
-  __ Fmov(d13, kFP64NegativeInfinity);
-  __ Fmov(d14, 0xfffffffe);
-  __ Fmov(s16, 1.0);
-  __ Fmov(s17, 1.1);
-  __ Fmov(s18, 1.5);
-  __ Fmov(s19, -1.5);
-  __ Fmov(s20, kFP32PositiveInfinity);
-  __ Fmov(s21, kFP32NegativeInfinity);
-  __ Fmov(s22, 0xffffff0000000000);  // Largest float < UINT64_MAX.
-  __ Fmov(d24, 1.1);
-  __ Fmov(d25, 1.5);
-  __ Fmov(d26, -1.5);
-  __ Fmov(d27, kFP64PositiveInfinity);
-  __ Fmov(d28, kFP64NegativeInfinity);
-  __ Fmov(d29, 0xfffffffffffff800);  // Largest double < UINT64_MAX.
-  __ Fmov(s30, 0x100000000);
-
-  __ Fcvtnu(w0, s0);
-  __ Fcvtnu(w1, s1);
-  __ Fcvtnu(w2, s2);
-  __ Fcvtnu(w3, s3);
-  __ Fcvtnu(w4, s4);
-  __ Fcvtnu(w5, s5);
-  __ Fcvtnu(w6, s6);
-  __ Fcvtnu(w8, d8);
-  __ Fcvtnu(w9, d9);
-  __ Fcvtnu(w10, d10);
-  __ Fcvtnu(w11, d11);
-  __ Fcvtnu(w12, d12);
-  __ Fcvtnu(w13, d13);
-  __ Fcvtnu(w14, d14);
-  __ Fcvtnu(w15, d15);
-  __ Fcvtnu(x16, s16);
-  __ Fcvtnu(x17, s17);
-  __ Fcvtnu(x18, s18);
-  __ Fcvtnu(x19, s19);
-  __ Fcvtnu(x20, s20);
-  __ Fcvtnu(x21, s21);
-  __ Fcvtnu(x22, s22);
-  __ Fcvtnu(x24, d24);
-  __ Fcvtnu(x25, d25);
-  __ Fcvtnu(x26, d26);
-  __ Fcvtnu(x27, d27);
-  __ Fcvtnu(x28, d28);
-  __ Fcvtnu(x29, d29);
-  __ Fcvtnu(w30, s30);
-  END();
-
-  if (CAN_RUN()) {
-    RUN();
-
-    ASSERT_EQUAL_64(1, x0);
-    ASSERT_EQUAL_64(1, x1);
-    ASSERT_EQUAL_64(2, x2);
-    ASSERT_EQUAL_64(0, x3);
-    ASSERT_EQUAL_64(0xffffffff, x4);
-    ASSERT_EQUAL_64(0, x5);
-    ASSERT_EQUAL_64(0xffffff00, x6);
-    ASSERT_EQUAL_64(1, x8);
-    ASSERT_EQUAL_64(1, x9);
-    ASSERT_EQUAL_64(2, x10);
-    ASSERT_EQUAL_64(0, x11);
-    ASSERT_EQUAL_64(0xffffffff, x12);
-    ASSERT_EQUAL_64(0, x13);
-    ASSERT_EQUAL_64(0xfffffffe, x14);
-    ASSERT_EQUAL_64(1, x16);
-    ASSERT_EQUAL_64(1, x17);
-    ASSERT_EQUAL_64(2, x18);
-    ASSERT_EQUAL_64(0, x19);
-    ASSERT_EQUAL_64(0xffffffffffffffff, x20);
-    ASSERT_EQUAL_64(0, x21);
-    ASSERT_EQUAL_64(0xffffff0000000000, x22);
-    ASSERT_EQUAL_64(1, x24);
-    ASSERT_EQUAL_64(2, x25);
-    ASSERT_EQUAL_64(0, x26);
-    ASSERT_EQUAL_64(0xffffffffffffffff, x27);
-    ASSERT_EQUAL_64(0, x28);
-    ASSERT_EQUAL_64(0xfffffffffffff800, x29);
-    ASSERT_EQUAL_64(0xffffffff, x30);
-  }
+  FcvtHelper(&MacroAssembler::Fcvtnu, inputs_s, expected_w, kWRegSize);
+  FcvtHelper(&MacroAssembler::Fcvtnu, inputs_d, expected_w, kWRegSize);
+  FcvtHelper(&MacroAssembler::Fcvtnu, inputs_s, expected_x, kXRegSize);
+  FcvtHelper(&MacroAssembler::Fcvtnu, inputs_d, expected_x, kXRegSize);
 }
 
-
 TEST(fcvtzs) {
-  SETUP_WITH_FEATURES(CPUFeatures::kFP);
+  float inputs_s[] = {1.0, 1.1, 1.5, -1.5};
+  double inputs_d[] = {1.0, 1.1, 1.5, -1.5};
+  uint64_t expected_w[] = {1, 1, 1, 0xffffffff};  // 1.5 -> 1, -1.5 -> -1.
+  uint64_t expected_x[] = {1, 1, 1, 0xffffffffffffffff};  // As above, 64-bit.
 
-  START();
-  __ Fmov(s0, 1.0);
-  __ Fmov(s1, 1.1);
-  __ Fmov(s2, 1.5);
-  __ Fmov(s3, -1.5);
-  __ Fmov(s4, kFP32PositiveInfinity);
-  __ Fmov(s5, kFP32NegativeInfinity);
-  __ Fmov(s6, 0x7fffff80);  // Largest float < INT32_MAX.
-  __ Fneg(s7, s6);          // Smallest float > INT32_MIN.
-  __ Fmov(d8, 1.0);
-  __ Fmov(d9, 1.1);
-  __ Fmov(d10, 1.5);
-  __ Fmov(d11, -1.5);
-  __ Fmov(d12, kFP64PositiveInfinity);
-  __ Fmov(d13, kFP64NegativeInfinity);
-  __ Fmov(d14, kWMaxInt - 1);
-  __ Fmov(d15, kWMinInt + 1);
-  __ Fmov(s17, 1.1);
-  __ Fmov(s18, 1.5);
-  __ Fmov(s19, -1.5);
-  __ Fmov(s20, kFP32PositiveInfinity);
-  __ Fmov(s21, kFP32NegativeInfinity);
-  __ Fmov(s22, 0x7fffff8000000000);  // Largest float < INT64_MAX.
-  __ Fneg(s23, s22);                 // Smallest float > INT64_MIN.
-  __ Fmov(d24, 1.1);
-  __ Fmov(d25, 1.5);
-  __ Fmov(d26, -1.5);
-  __ Fmov(d27, kFP64PositiveInfinity);
-  __ Fmov(d28, kFP64NegativeInfinity);
-  __ Fmov(d29, 0x7ffffffffffffc00);  // Largest double < INT64_MAX.
-  __ Fneg(d30, d29);                 // Smallest double > INT64_MIN.
+  FcvtHelper<FcvtFn3>(&MacroAssembler::Fcvtzs, inputs_s, expected_w, kWRegSize);
+  FcvtHelper<FcvtFn3>(&MacroAssembler::Fcvtzs, inputs_d, expected_w, kWRegSize);
+  FcvtHelper<FcvtFn3>(&MacroAssembler::Fcvtzs, inputs_s, expected_x, kXRegSize);
+  FcvtHelper<FcvtFn3>(&MacroAssembler::Fcvtzs, inputs_d, expected_x, kXRegSize);
+}
 
-  __ Fcvtzs(w0, s0);
-  __ Fcvtzs(w1, s1);
-  __ Fcvtzs(w2, s2);
-  __ Fcvtzs(w3, s3);
-  __ Fcvtzs(w4, s4);
-  __ Fcvtzs(w5, s5);
-  __ Fcvtzs(w6, s6);
-  __ Fcvtzs(w7, s7);
-  __ Fcvtzs(w8, d8);
-  __ Fcvtzs(w9, d9);
-  __ Fcvtzs(w10, d10);
-  __ Fcvtzs(w11, d11);
-  __ Fcvtzs(w12, d12);
-  __ Fcvtzs(w13, d13);
-  __ Fcvtzs(w14, d14);
-  __ Fcvtzs(w15, d15);
-  __ Fcvtzs(x17, s17);
-  __ Fcvtzs(x18, s18);
-  __ Fcvtzs(x19, s19);
-  __ Fcvtzs(x20, s20);
-  __ Fcvtzs(x21, s21);
-  __ Fcvtzs(x22, s22);
-  __ Fcvtzs(x23, s23);
-  __ Fcvtzs(x24, d24);
-  __ Fcvtzs(x25, d25);
-  __ Fcvtzs(x26, d26);
-  __ Fcvtzs(x27, d27);
-  __ Fcvtzs(x28, d28);
-  __ Fcvtzs(x29, d29);
-  __ Fcvtzs(x30, d30);
-  END();
+TEST(fcvtzu) {
+  float inputs_s[] = {1.0, 1.1, 1.5, -1.5};
+  double inputs_d[] = {1.0, 1.1, 1.5, -1.5};
+  uint64_t expected_w[] = {1, 1, 1, 0};  // Used with kWRegSize; -1.5 expects 0.
+  uint64_t expected_x[] = {1, 1, 1, 0};  // Used with kXRegSize; -1.5 expects 0.
 
-  if (CAN_RUN()) {
-    RUN();
-
-    ASSERT_EQUAL_64(1, x0);
-    ASSERT_EQUAL_64(1, x1);
-    ASSERT_EQUAL_64(1, x2);
-    ASSERT_EQUAL_64(0xffffffff, x3);
-    ASSERT_EQUAL_64(0x7fffffff, x4);
-    ASSERT_EQUAL_64(0x80000000, x5);
-    ASSERT_EQUAL_64(0x7fffff80, x6);
-    ASSERT_EQUAL_64(0x80000080, x7);
-    ASSERT_EQUAL_64(1, x8);
-    ASSERT_EQUAL_64(1, x9);
-    ASSERT_EQUAL_64(1, x10);
-    ASSERT_EQUAL_64(0xffffffff, x11);
-    ASSERT_EQUAL_64(0x7fffffff, x12);
-    ASSERT_EQUAL_64(0x80000000, x13);
-    ASSERT_EQUAL_64(0x7ffffffe, x14);
-    ASSERT_EQUAL_64(0x80000001, x15);
-    ASSERT_EQUAL_64(1, x17);
-    ASSERT_EQUAL_64(1, x18);
-    ASSERT_EQUAL_64(0xffffffffffffffff, x19);
-    ASSERT_EQUAL_64(0x7fffffffffffffff, x20);
-    ASSERT_EQUAL_64(0x8000000000000000, x21);
-    ASSERT_EQUAL_64(0x7fffff8000000000, x22);
-    ASSERT_EQUAL_64(0x8000008000000000, x23);
-    ASSERT_EQUAL_64(1, x24);
-    ASSERT_EQUAL_64(1, x25);
-    ASSERT_EQUAL_64(0xffffffffffffffff, x26);
-    ASSERT_EQUAL_64(0x7fffffffffffffff, x27);
-    ASSERT_EQUAL_64(0x8000000000000000, x28);
-    ASSERT_EQUAL_64(0x7ffffffffffffc00, x29);
-    ASSERT_EQUAL_64(0x8000000000000400, x30);
-  }
+  FcvtHelper<FcvtFn3>(&MacroAssembler::Fcvtzu, inputs_s, expected_w, kWRegSize);
+  FcvtHelper<FcvtFn3>(&MacroAssembler::Fcvtzu, inputs_d, expected_w, kWRegSize);
+  FcvtHelper<FcvtFn3>(&MacroAssembler::Fcvtzu, inputs_s, expected_x, kXRegSize);
+  FcvtHelper<FcvtFn3>(&MacroAssembler::Fcvtzu, inputs_d, expected_x, kXRegSize);
 }
 
 void FjcvtzsHelper(uint64_t value, uint64_t expected, uint32_t expected_z) {
@@ -4489,107 +4049,6 @@ TEST(fjcvtzs) {
   }
 }
 
-TEST(fcvtzu) {
-  SETUP_WITH_FEATURES(CPUFeatures::kFP);
-
-  START();
-  __ Fmov(s0, 1.0);
-  __ Fmov(s1, 1.1);
-  __ Fmov(s2, 1.5);
-  __ Fmov(s3, -1.5);
-  __ Fmov(s4, kFP32PositiveInfinity);
-  __ Fmov(s5, kFP32NegativeInfinity);
-  __ Fmov(s6, 0x7fffff80);  // Largest float < INT32_MAX.
-  __ Fneg(s7, s6);          // Smallest float > INT32_MIN.
-  __ Fmov(d8, 1.0);
-  __ Fmov(d9, 1.1);
-  __ Fmov(d10, 1.5);
-  __ Fmov(d11, -1.5);
-  __ Fmov(d12, kFP64PositiveInfinity);
-  __ Fmov(d13, kFP64NegativeInfinity);
-  __ Fmov(d14, kWMaxInt - 1);
-  __ Fmov(d15, kWMinInt + 1);
-  __ Fmov(s17, 1.1);
-  __ Fmov(s18, 1.5);
-  __ Fmov(s19, -1.5);
-  __ Fmov(s20, kFP32PositiveInfinity);
-  __ Fmov(s21, kFP32NegativeInfinity);
-  __ Fmov(s22, 0x7fffff8000000000);  // Largest float < INT64_MAX.
-  __ Fneg(s23, s22);                 // Smallest float > INT64_MIN.
-  __ Fmov(d24, 1.1);
-  __ Fmov(d25, 1.5);
-  __ Fmov(d26, -1.5);
-  __ Fmov(d27, kFP64PositiveInfinity);
-  __ Fmov(d28, kFP64NegativeInfinity);
-  __ Fmov(d29, 0x7ffffffffffffc00);  // Largest double < INT64_MAX.
-  __ Fneg(d30, d29);                 // Smallest double > INT64_MIN.
-
-  __ Fcvtzu(w0, s0);
-  __ Fcvtzu(w1, s1);
-  __ Fcvtzu(w2, s2);
-  __ Fcvtzu(w3, s3);
-  __ Fcvtzu(w4, s4);
-  __ Fcvtzu(w5, s5);
-  __ Fcvtzu(w6, s6);
-  __ Fcvtzu(w7, s7);
-  __ Fcvtzu(w8, d8);
-  __ Fcvtzu(w9, d9);
-  __ Fcvtzu(w10, d10);
-  __ Fcvtzu(w11, d11);
-  __ Fcvtzu(w12, d12);
-  __ Fcvtzu(w13, d13);
-  __ Fcvtzu(w14, d14);
-  __ Fcvtzu(x17, s17);
-  __ Fcvtzu(x18, s18);
-  __ Fcvtzu(x19, s19);
-  __ Fcvtzu(x20, s20);
-  __ Fcvtzu(x21, s21);
-  __ Fcvtzu(x22, s22);
-  __ Fcvtzu(x23, s23);
-  __ Fcvtzu(x24, d24);
-  __ Fcvtzu(x25, d25);
-  __ Fcvtzu(x26, d26);
-  __ Fcvtzu(x27, d27);
-  __ Fcvtzu(x28, d28);
-  __ Fcvtzu(x29, d29);
-  __ Fcvtzu(x30, d30);
-  END();
-
-  if (CAN_RUN()) {
-    RUN();
-
-    ASSERT_EQUAL_64(1, x0);
-    ASSERT_EQUAL_64(1, x1);
-    ASSERT_EQUAL_64(1, x2);
-    ASSERT_EQUAL_64(0, x3);
-    ASSERT_EQUAL_64(0xffffffff, x4);
-    ASSERT_EQUAL_64(0, x5);
-    ASSERT_EQUAL_64(0x7fffff80, x6);
-    ASSERT_EQUAL_64(0, x7);
-    ASSERT_EQUAL_64(1, x8);
-    ASSERT_EQUAL_64(1, x9);
-    ASSERT_EQUAL_64(1, x10);
-    ASSERT_EQUAL_64(0, x11);
-    ASSERT_EQUAL_64(0xffffffff, x12);
-    ASSERT_EQUAL_64(0, x13);
-    ASSERT_EQUAL_64(0x7ffffffe, x14);
-    ASSERT_EQUAL_64(1, x17);
-    ASSERT_EQUAL_64(1, x18);
-    ASSERT_EQUAL_64(0, x19);
-    ASSERT_EQUAL_64(0xffffffffffffffff, x20);
-    ASSERT_EQUAL_64(0, x21);
-    ASSERT_EQUAL_64(0x7fffff8000000000, x22);
-    ASSERT_EQUAL_64(0, x23);
-    ASSERT_EQUAL_64(1, x24);
-    ASSERT_EQUAL_64(1, x25);
-    ASSERT_EQUAL_64(0, x26);
-    ASSERT_EQUAL_64(0xffffffffffffffff, x27);
-    ASSERT_EQUAL_64(0, x28);
-    ASSERT_EQUAL_64(0x7ffffffffffffc00, x29);
-    ASSERT_EQUAL_64(0, x30);
-  }
-}
-
 // Test that scvtf and ucvtf can convert the 64-bit input into the expected
 // value. All possible values of 'fbits' are tested. The expected value is
 // modified accordingly in each case.
diff --git a/test/aarch64/test-assembler-neon-aarch64.cc b/test/aarch64/test-assembler-neon-aarch64.cc
index 1682d13e..2155db48 100644
--- a/test/aarch64/test-assembler-neon-aarch64.cc
+++ b/test/aarch64/test-assembler-neon-aarch64.cc
@@ -10975,8 +10975,26 @@ TEST(neon_usdot_element) {
   }
 }
 
+TEST(neon_pmull_regression_test) {
+  SETUP_WITH_FEATURES(CPUFeatures::kNEON);
+
+  START();
+  __ Movi(v0.V2D(), 0xdecafc0ffee);
+  __ Pmull(v0.V8H(), v0.V8B(), v0.V8B());  // vd aliases both source operands.
+
+  __ Movi(v1.V2D(), 0xaaaaaaaa55555555);
+  __ Pmull2(v1.V8H(), v1.V16B(), v1.V16B());  // Same aliasing, Pmull2 form.
+  END();
+
+  if (CAN_RUN()) {
+    RUN();
+    ASSERT_EQUAL_128(0x0000000000515450, 0x4455500055555454, q0);
+    ASSERT_EQUAL_128(0x4444444444444444, 0x1111111111111111, q1);
+  }
+}
+
 TEST(zero_high_b) {
-  SETUP_WITH_FEATURES(CPUFeatures::kSVE, CPUFeatures::kNEON, CPUFeatures::kRDM);
+  SETUP_WITH_FEATURES(CPUFeatures::kSVE, CPUFeatures::kNEON);
   START();
 
   __ Mov(x0, 0x55aa42ffaa42ff55);
@@ -10996,7 +11014,7 @@ TEST(zero_high_b) {
   __ Ror(x0, x0, 8);
 
   {
-    ExactAssemblyScope scope(&masm, 81 * kInstructionSize);
+    ExactAssemblyScope scope(&masm, 75 * kInstructionSize);
     __ movi(q9.V16B(), 0x55);
     __ dci(0x5e010409);  // mov b9, v0.b[0]
     __ orr(q30.V16B(), q30.V16B(), q9.V16B());
@@ -11013,14 +11031,6 @@ TEST(zero_high_b) {
     __ dci(0x7e207809);  // sqneg b9, b0
     __ orr(q30.V16B(), q30.V16B(), q9.V16B());
 
-    __ movi(q9.V16B(), 0x55);
-    __ dci(0x7e008429);  // sqrdmlah b9, b1, b0
-    __ orr(q30.V16B(), q30.V16B(), q9.V16B());
-
-    __ movi(q9.V16B(), 0x55);
-    __ dci(0x7e008c29);  // sqrdmlsh b9, b1, b0
-    __ orr(q30.V16B(), q30.V16B(), q9.V16B());
-
     __ movi(q9.V16B(), 0x55);
     __ dci(0x5e205c29);  // sqrshl b9, b1, b0
     __ orr(q30.V16B(), q30.V16B(), q9.V16B());
@@ -11821,10 +11831,7 @@ TEST(zero_high_s) {
 }
 
 TEST(zero_high_d) {
-  SETUP_WITH_FEATURES(CPUFeatures::kSVE,
-                      CPUFeatures::kNEON,
-                      CPUFeatures::kFP,
-                      CPUFeatures::kRDM);
+  SETUP_WITH_FEATURES(CPUFeatures::kSVE, CPUFeatures::kNEON, CPUFeatures::kFP);
   START();
 
   __ Mov(x0, 0x55aa42ffaa42ff55);
@@ -11844,7 +11851,7 @@ TEST(zero_high_d) {
   __ Ror(x0, x0, 8);
 
   {
-    ExactAssemblyScope scope(&masm, 291 * kInstructionSize);
+    ExactAssemblyScope scope(&masm, 285 * kInstructionSize);
     __ movi(q9.V16B(), 0x55);
     __ dci(0x5ee0b809);  // abs d9, d0
     __ orr(q30.V16B(), q30.V16B(), q9.V16B());
@@ -12113,14 +12120,6 @@ TEST(zero_high_d) {
     __ dci(0x7ee07809);  // sqneg d9, d0
     __ orr(q30.V16B(), q30.V16B(), q9.V16B());
 
-    __ movi(q9.V16B(), 0x55);
-    __ dci(0x7ec08429);  // sqrdmlah d9, d1, d0
-    __ orr(q30.V16B(), q30.V16B(), q9.V16B());
-
-    __ movi(q9.V16B(), 0x55);
-    __ dci(0x7ec08c29);  // sqrdmlsh d9, d1, d0
-    __ orr(q30.V16B(), q30.V16B(), q9.V16B());
-
     __ movi(q9.V16B(), 0x55);
     __ dci(0x5ee05c29);  // sqrshl d9, d1, d0
     __ orr(q30.V16B(), q30.V16B(), q9.V16B());
diff --git a/test/aarch64/test-assembler-sve-aarch64.cc b/test/aarch64/test-assembler-sve-aarch64.cc
index f16ab336..cc49d5b1 100644
--- a/test/aarch64/test-assembler-sve-aarch64.cc
+++ b/test/aarch64/test-assembler-sve-aarch64.cc
@@ -19729,6 +19729,709 @@ TEST_SVE(sudot_usdot) {
   }
 }
 
+TEST_SVE(neon_ins_zero_high_regression_test) {
+  SVE_SETUP_WITH_FEATURES(CPUFeatures::kNEON, CPUFeatures::kSVE);
+
+  START();
+  __ Movi(v0.V2D(), 0x0f0e0d0c0b0a0908, 0x0706050403020100);  // Expected value.
+
+  // Check that both forms of ins zero bits <VL-1:128>.
+  __ Index(z1.VnB(), 0, 1);
+  __ Ins(v1.V16B(), 0, wzr);  // General-register source form.
+  __ Index(z2.VnB(), 0, 1);
+  __ Ins(v2.V16B(), 3, v2.V16B(), 3);  // Vector-element source form.
+  END();
+
+  if (CAN_RUN()) {
+    RUN();
+    ASSERT_EQUAL_SVE(z0, z1);
+    ASSERT_EQUAL_SVE(z0, z2);
+  }
+}
+
+TEST_SVE(neon_fcvt_zero_high_regression_test) {
+  SVE_SETUP_WITH_FEATURES(CPUFeatures::kFP,
+                          CPUFeatures::kNEON,
+                          CPUFeatures::kSVE);
+
+  START();
+  __ Mov(z1.VnD(), 0);
+  __ Mov(z2.VnD(), 0);
+  __ Mov(z3.VnD(), 0);
+  __ Mov(z4.VnD(), 0);
+  __ Mov(z5.VnD(), 0);
+  __ Mov(z6.VnD(), 0);
+  __ Mov(z10.VnD(), 0);  // All-zero reference used by the checks below.
+
+  Label done;
+  // Skip calculations for VL128.
+  __ Rdvl(x0, 1);
+  __ Cmp(x0, 16);  // 16 bytes == 128 bits.
+  __ B(eq, &done);  // z1-z6 are still zero here, so they equal z10.
+
+  __ Movi(v0.V2D(), 0x3ff000003f800000);
+  __ Index(z1.VnB(), 0, 1);
+  __ Index(z2.VnB(), 0, 1);
+  __ Index(z3.VnB(), 0, 1);
+  __ Index(z4.VnB(), 0, 1);
+  __ Index(z5.VnB(), 0, 1);
+  __ Index(z6.VnB(), 0, 1);
+
+  // Test zeroing bits <VL-1:128> for fcvtl, fcvtn and fcvtxn.
+  __ Fcvtl(v1.V2D(), v0.V2S());
+  __ Fcvtl2(v2.V2D(), v0.V4S());
+
+  __ Fcvtn(v3.V2S(), v0.V2D());
+  __ Fcvtn2(v4.V4S(), v0.V2D());
+
+  __ Fcvtxn(v5.V2S(), v0.V2D());
+  __ Fcvtxn2(v6.V4S(), v0.V2D());
+
+  // Set the expected non-zero bits to zero.
+  __ Ext(z1.VnB(), z1.VnB(), z10.VnB(), kDRegSizeInBytes * 2);
+  __ Ext(z2.VnB(), z2.VnB(), z10.VnB(), kDRegSizeInBytes * 2);
+  __ Ext(z3.VnB(), z3.VnB(), z10.VnB(), kSRegSizeInBytes * 2);
+  __ Ext(z4.VnB(), z4.VnB(), z10.VnB(), kSRegSizeInBytes * 4);
+  __ Ext(z5.VnB(), z5.VnB(), z10.VnB(), kSRegSizeInBytes * 2);
+  __ Ext(z6.VnB(), z6.VnB(), z10.VnB(), kSRegSizeInBytes * 4);
+
+  __ Bind(&done);
+  END();
+
+  if (CAN_RUN()) {
+    RUN();
+    ASSERT_EQUAL_SVE(z10, z1);  // Every result must be all-zero.
+    ASSERT_EQUAL_SVE(z10, z2);
+    ASSERT_EQUAL_SVE(z10, z3);
+    ASSERT_EQUAL_SVE(z10, z4);
+    ASSERT_EQUAL_SVE(z10, z5);
+    ASSERT_EQUAL_SVE(z10, z6);
+  }
+}
+
+#define TEST_ZEROING(INST)  \
+  __ Index(z0.VnB(), 0, 1); \
+  __ INST;                  \
+  __ Orr(z10.VnB(), z10.VnB(), z0.VnB());  // Accumulate z0 into z10.
+
+TEST_SVE(neon_zero_high) {
+  SVE_SETUP_WITH_FEATURES(CPUFeatures::kFP,
+                          CPUFeatures::kNEON,
+                          CPUFeatures::kNEONHalf,
+                          CPUFeatures::kSVE,
+                          CPUFeatures::kFcma,
+                          CPUFeatures::kFHM,
+                          CPUFeatures::kFrintToFixedSizedInt,
+                          CPUFeatures::kDotProduct,
+                          CPUFeatures::kRDM,
+                          CPUFeatures::kI8MM);
+
+  START();
+  __ Mov(z10.VnD(), 0);  // Initialise cumulative result register.
+
+  TEST_ZEROING(Abs(v0.V16B(), v0.V16B()));
+  TEST_ZEROING(Abs(v0.V2S(), v0.V2S()));
+  TEST_ZEROING(Add(v0.V16B(), v0.V16B(), v0.V16B()));
+  TEST_ZEROING(Add(v0.V2S(), v0.V2S(), v0.V2S()));
+  TEST_ZEROING(Addhn2(v0.V16B(), v0.V8H(), v0.V8H()));
+  TEST_ZEROING(Addhn(v0.V4H(), v0.V4S(), v0.V4S()));
+  TEST_ZEROING(Addp(v0.V16B(), v0.V16B(), v0.V16B()));
+  TEST_ZEROING(Addp(v0.V2S(), v0.V2S(), v0.V2S()));
+  TEST_ZEROING(And(v0.V16B(), v0.V16B(), v0.V16B()));
+  TEST_ZEROING(Bic(v0.V8H(), 0, 0));
+  TEST_ZEROING(Bic(v0.V2S(), 255, 0));
+  TEST_ZEROING(Bic(v0.V16B(), v0.V16B(), v0.V16B()));
+  TEST_ZEROING(Bif(v0.V16B(), v0.V16B(), v0.V16B()));
+  TEST_ZEROING(Bit(v0.V16B(), v0.V16B(), v0.V16B()));
+  TEST_ZEROING(Bsl(v0.V16B(), v0.V16B(), v0.V16B()));
+  TEST_ZEROING(Cls(v0.V16B(), v0.V16B()));
+  TEST_ZEROING(Cls(v0.V2S(), v0.V2S()));
+  TEST_ZEROING(Clz(v0.V16B(), v0.V16B()));
+  TEST_ZEROING(Clz(v0.V2S(), v0.V2S()));
+  TEST_ZEROING(Cmeq(v0.V16B(), v0.V16B(), 0));
+  TEST_ZEROING(Cmeq(v0.V2S(), v0.V2S(), 0));
+  TEST_ZEROING(Cmeq(v0.V16B(), v0.V16B(), v0.V16B()));
+  TEST_ZEROING(Cmeq(v0.V2S(), v0.V2S(), v0.V2S()));
+  TEST_ZEROING(Cmge(v0.V16B(), v0.V16B(), 0));
+  TEST_ZEROING(Cmge(v0.V2S(), v0.V2S(), 0));
+  TEST_ZEROING(Cmge(v0.V16B(), v0.V16B(), v0.V16B()));
+  TEST_ZEROING(Cmge(v0.V2S(), v0.V2S(), v0.V2S()));
+  TEST_ZEROING(Cmgt(v0.V16B(), v0.V16B(), 0));
+  TEST_ZEROING(Cmgt(v0.V2S(), v0.V2S(), 0));
+  TEST_ZEROING(Cmgt(v0.V16B(), v0.V16B(), v0.V16B()));
+  TEST_ZEROING(Cmgt(v0.V2S(), v0.V2S(), v0.V2S()));
+  TEST_ZEROING(Cmhi(v0.V16B(), v0.V16B(), v0.V16B()));
+  TEST_ZEROING(Cmhi(v0.V2S(), v0.V2S(), v0.V2S()));
+  TEST_ZEROING(Cmhs(v0.V16B(), v0.V16B(), v0.V16B()));
+  TEST_ZEROING(Cmhs(v0.V2S(), v0.V2S(), v0.V2S()));
+  TEST_ZEROING(Cmle(v0.V16B(), v0.V16B(), 0));
+  TEST_ZEROING(Cmle(v0.V2S(), v0.V2S(), 0));
+  TEST_ZEROING(Cmlt(v0.V16B(), v0.V16B(), 0));
+  TEST_ZEROING(Cmlt(v0.V2S(), v0.V2S(), 0));
+  TEST_ZEROING(Cmtst(v0.V16B(), v0.V16B(), v0.V16B()));
+  TEST_ZEROING(Cmtst(v0.V2S(), v0.V2S(), v0.V2S()));
+  TEST_ZEROING(Cnt(v0.V16B(), v0.V16B()));
+  TEST_ZEROING(Dup(v0.V2S(), w0));
+  TEST_ZEROING(Dup(v0.V8B(), w0));
+  TEST_ZEROING(Dup(v0.V2S(), v0.S(), 0));
+  TEST_ZEROING(Dup(v0.V8B(), v0.B(), 0));
+  TEST_ZEROING(Eor(v0.V16B(), v0.V16B(), v0.V16B()));
+  TEST_ZEROING(Ext(v0.V16B(), v0.V16B(), v0.V16B(), 0));
+  TEST_ZEROING(Ext(v0.V8B(), v0.V8B(), v0.V8B(), 4));
+  TEST_ZEROING(Fabd(v0.V4S(), v0.V4S(), v0.V4S()));
+  TEST_ZEROING(Fabd(v0.V8H(), v0.V8H(), v0.V8H()));
+  TEST_ZEROING(Fabs(v0.V4S(), v0.V4S()));
+  TEST_ZEROING(Fabs(v0.V8H(), v0.V8H()));
+  TEST_ZEROING(Facge(v0.V2S(), v0.V2S(), v0.V2S()));
+  TEST_ZEROING(Facge(v0.V8H(), v0.V8H(), v0.V8H()));
+  TEST_ZEROING(Facgt(v0.V2S(), v0.V2S(), v0.V2S()));
+  TEST_ZEROING(Facgt(v0.V8H(), v0.V8H(), v0.V8H()));
+  TEST_ZEROING(Fadd(v0.V2S(), v0.V2S(), v0.V2S()));
+  TEST_ZEROING(Fadd(v0.V8H(), v0.V8H(), v0.V8H()));
+  TEST_ZEROING(Faddp(v0.V2S(), v0.V2S(), v0.V2S()));
+  TEST_ZEROING(Faddp(v0.V8H(), v0.V8H(), v0.V8H()));
+  TEST_ZEROING(Fcadd(v0.V2S(), v0.V2S(), v0.V2S(), 90));
+  TEST_ZEROING(Fcadd(v0.V8H(), v0.V8H(), v0.V8H(), 90));
+  TEST_ZEROING(Fcmeq(v0.V2S(), v0.V2S(), 0));
+  TEST_ZEROING(Fcmeq(v0.V8H(), v0.V8H(), 0));
+  TEST_ZEROING(Fcmeq(v0.V2S(), v0.V2S(), v0.V2S()));
+  TEST_ZEROING(Fcmeq(v0.V8H(), v0.V8H(), v0.V8H()));
+  TEST_ZEROING(Fcmge(v0.V2S(), v0.V2S(), 0));
+  TEST_ZEROING(Fcmge(v0.V8H(), v0.V8H(), 0));
+  TEST_ZEROING(Fcmge(v0.V2S(), v0.V2S(), v0.V2S()));
+  TEST_ZEROING(Fcmge(v0.V8H(), v0.V8H(), v0.V8H()));
+  TEST_ZEROING(Fcmgt(v0.V2S(), v0.V2S(), 0));
+  TEST_ZEROING(Fcmgt(v0.V8H(), v0.V8H(), 0));
+  TEST_ZEROING(Fcmgt(v0.V2S(), v0.V2S(), v0.V2S()));
+  TEST_ZEROING(Fcmgt(v0.V8H(), v0.V8H(), v0.V8H()));
+  TEST_ZEROING(Fcmla(v0.V4H(), v0.V4H(), v0.H(), 0, 0));
+  TEST_ZEROING(Fcmla(v0.V4S(), v0.V4S(), v0.S(), 0, 0));
+  TEST_ZEROING(Fcmla(v0.V4S(), v0.V4S(), v0.V4S(), 0));
+  TEST_ZEROING(Fcmla(v0.V4H(), v0.V4H(), v0.V4H(), 0));
+  TEST_ZEROING(Fcmle(v0.V2S(), v0.V2S(), 0));
+  TEST_ZEROING(Fcmle(v0.V8H(), v0.V8H(), 0));
+  TEST_ZEROING(Fcmlt(v0.V2S(), v0.V2S(), 0));
+  TEST_ZEROING(Fcmlt(v0.V8H(), v0.V8H(), 0));
+  TEST_ZEROING(Fcvtas(v0.V2S(), v0.V2S()));
+  TEST_ZEROING(Fcvtas(v0.V8H(), v0.V8H()));
+  TEST_ZEROING(Fcvtau(v0.V2S(), v0.V2S()));
+  TEST_ZEROING(Fcvtau(v0.V8H(), v0.V8H()));
+  TEST_ZEROING(Fcvtl2(v0.V4S(), v0.V8H()));
+  TEST_ZEROING(Fcvtl(v0.V2D(), v0.V2S()));
+  TEST_ZEROING(Fcvtms(v0.V2S(), v0.V2S()));
+  TEST_ZEROING(Fcvtms(v0.V8H(), v0.V8H()));
+  TEST_ZEROING(Fcvtmu(v0.V2S(), v0.V2S()));
+  TEST_ZEROING(Fcvtmu(v0.V8H(), v0.V8H()));
+  TEST_ZEROING(Fcvtn2(v0.V8H(), v0.V4S()));
+  TEST_ZEROING(Fcvtn(v0.V2S(), v0.V2D()));
+  TEST_ZEROING(Fcvtns(v0.V2S(), v0.V2S()));
+  TEST_ZEROING(Fcvtns(v0.V8H(), v0.V8H()));
+  TEST_ZEROING(Fcvtnu(v0.V2S(), v0.V2S()));
+  TEST_ZEROING(Fcvtnu(v0.V8H(), v0.V8H()));
+  TEST_ZEROING(Fcvtps(v0.V2S(), v0.V2S()));
+  TEST_ZEROING(Fcvtps(v0.V8H(), v0.V8H()));
+  TEST_ZEROING(Fcvtpu(v0.V2S(), v0.V2S()));
+  TEST_ZEROING(Fcvtpu(v0.V8H(), v0.V8H()));
+  TEST_ZEROING(Fcvtxn(v0.V2S(), v0.V2D()));
+  TEST_ZEROING(Fcvtxn2(v0.V4S(), v0.V2D()));
+  TEST_ZEROING(Fcvtzs(v0.V2S(), v0.V2S()));
+  TEST_ZEROING(Fcvtzs(v0.V8H(), v0.V8H()));
+  TEST_ZEROING(Fcvtzs(v0.V2D(), v0.V2D(), 8));
+  TEST_ZEROING(Fcvtzu(v0.V2S(), v0.V2S()));
+  TEST_ZEROING(Fcvtzu(v0.V4H(), v0.V4H()));
+  TEST_ZEROING(Fcvtzu(v0.V2D(), v0.V2D(), 8));
+  TEST_ZEROING(Fdiv(v0.V2S(), v0.V2S(), v0.V2S()));
+  TEST_ZEROING(Fdiv(v0.V8H(), v0.V8H(), v0.V8H()));
+  TEST_ZEROING(Fmax(v0.V2S(), v0.V2S(), v0.V2S()));
+  TEST_ZEROING(Fmax(v0.V8H(), v0.V8H(), v0.V8H()));
+  TEST_ZEROING(Fmaxnm(v0.V2S(), v0.V2S(), v0.V2S()));
+  TEST_ZEROING(Fmaxnm(v0.V8H(), v0.V8H(), v0.V8H()));
+  TEST_ZEROING(Fmaxnmp(v0.V2S(), v0.V2S(), v0.V2S()));
+  TEST_ZEROING(Fmaxnmp(v0.V8H(), v0.V8H(), v0.V8H()));
+  TEST_ZEROING(Fmaxp(v0.V2S(), v0.V2S(), v0.V2S()));
+  TEST_ZEROING(Fmaxp(v0.V8H(), v0.V8H(), v0.V8H()));
+  TEST_ZEROING(Fmin(v0.V2S(), v0.V2S(), v0.V2S()));
+  TEST_ZEROING(Fmin(v0.V8H(), v0.V8H(), v0.V8H()));
+  TEST_ZEROING(Fminnm(v0.V2S(), v0.V2S(), v0.V2S()));
+  TEST_ZEROING(Fminnm(v0.V8H(), v0.V8H(), v0.V8H()));
+  TEST_ZEROING(Fminnmp(v0.V2S(), v0.V2S(), v0.V2S()));
+  TEST_ZEROING(Fminnmp(v0.V8H(), v0.V8H(), v0.V8H()));
+  TEST_ZEROING(Fminp(v0.V2S(), v0.V2S(), v0.V2S()));
+  TEST_ZEROING(Fminp(v0.V8H(), v0.V8H(), v0.V8H()));
+  TEST_ZEROING(Fmla(v0.V4S(), v0.V4S(), v0.S(), 0));
+  TEST_ZEROING(Fmla(v0.V4H(), v0.V4H(), v0.H(), 2));
+  TEST_ZEROING(Fmla(v0.V4S(), v0.V4S(), v0.V4S()));
+  TEST_ZEROING(Fmla(v0.V4H(), v0.V4H(), v0.V4H()));
+  TEST_ZEROING(Fmlal2(v0.V4S(), v0.V4H(), v0.H(), 0));
+  TEST_ZEROING(Fmlal2(v0.V2S(), v0.V2H(), v0.H(), 2));
+  TEST_ZEROING(Fmlal2(v0.V4S(), v0.V4H(), v0.V4H()));
+  TEST_ZEROING(Fmlal(v0.V4S(), v0.V4H(), v0.H(), 0));
+  TEST_ZEROING(Fmlal(v0.V2S(), v0.V2H(), v0.H(), 2));
+  TEST_ZEROING(Fmlal(v0.V4S(), v0.V4H(), v0.V4H()));
+  TEST_ZEROING(Fmls(v0.V4S(), v0.V4S(), v0.S(), 0));
+  TEST_ZEROING(Fmls(v0.V4H(), v0.V4H(), v0.H(), 2));
+  TEST_ZEROING(Fmls(v0.V4S(), v0.V4S(), v0.V4S()));
+  TEST_ZEROING(Fmls(v0.V4H(), v0.V4H(), v0.V4H()));
+  TEST_ZEROING(Fmlsl2(v0.V4S(), v0.V4H(), v0.H(), 0));
+  TEST_ZEROING(Fmlsl2(v0.V2S(), v0.V2H(), v0.H(), 2));
+  TEST_ZEROING(Fmlsl2(v0.V4S(), v0.V4H(), v0.V4H()));
+  TEST_ZEROING(Fmlsl(v0.V4S(), v0.V4H(), v0.H(), 0));
+  TEST_ZEROING(Fmlsl(v0.V2S(), v0.V2H(), v0.H(), 2));
+  TEST_ZEROING(Fmlsl(v0.V4S(), v0.V4H(), v0.V4H()));
+  TEST_ZEROING(Fmov(v0.V2D(), 2.0000));
+  TEST_ZEROING(Fmov(v0.V4H(), 2.0000));
+  TEST_ZEROING(Fmov(v0.D(), 1, x1));
+  TEST_ZEROING(Fmul(v0.V4S(), v0.V4S(), v0.S(), 0));
+  TEST_ZEROING(Fmul(v0.V4H(), v0.V4H(), v0.H(), 2));
+  TEST_ZEROING(Fmul(v0.V4S(), v0.V4S(), v0.V4S()));
+  TEST_ZEROING(Fmul(v0.V4H(), v0.V4H(), v0.V4H()));
+  TEST_ZEROING(Fmulx(v0.V4S(), v0.V4S(), v0.S(), 0));
+  TEST_ZEROING(Fmulx(v0.V4H(), v0.V4H(), v0.H(), 2));
+  TEST_ZEROING(Fmulx(v0.V4S(), v0.V4S(), v0.V4S()));
+  TEST_ZEROING(Fmulx(v0.V4H(), v0.V4H(), v0.V4H()));
+  TEST_ZEROING(Fneg(v0.V4S(), v0.V4S()));
+  TEST_ZEROING(Fneg(v0.V4H(), v0.V4H()));
+  TEST_ZEROING(Frecpe(v0.V4S(), v0.V4S()));
+  TEST_ZEROING(Frecpe(v0.V4H(), v0.V4H()));
+  TEST_ZEROING(Frecps(v0.V4S(), v0.V4S(), v0.V4S()));
+  TEST_ZEROING(Frecps(v0.V4H(), v0.V4H(), v0.V4H()));
+  TEST_ZEROING(Frint32x(v0.V4S(), v0.V4S()));
+  TEST_ZEROING(Frint32z(v0.V4S(), v0.V4S()));
+  TEST_ZEROING(Frint64x(v0.V4S(), v0.V4S()));
+  TEST_ZEROING(Frint64z(v0.V4S(), v0.V4S()));
+  TEST_ZEROING(Frinta(v0.V4S(), v0.V4S()));
+  TEST_ZEROING(Frinta(v0.V4H(), v0.V4H()));
+  TEST_ZEROING(Frinti(v0.V4S(), v0.V4S()));
+  TEST_ZEROING(Frinti(v0.V4H(), v0.V4H()));
+  TEST_ZEROING(Frintm(v0.V4S(), v0.V4S()));
+  TEST_ZEROING(Frintm(v0.V4H(), v0.V4H()));
+  TEST_ZEROING(Frintn(v0.V4S(), v0.V4S()));
+  TEST_ZEROING(Frintn(v0.V4H(), v0.V4H()));
+  TEST_ZEROING(Frintp(v0.V4S(), v0.V4S()));
+  TEST_ZEROING(Frintp(v0.V4H(), v0.V4H()));
+  TEST_ZEROING(Frintx(v0.V4S(), v0.V4S()));
+  TEST_ZEROING(Frintx(v0.V4H(), v0.V4H()));
+  TEST_ZEROING(Frintz(v0.V4S(), v0.V4S()));
+  TEST_ZEROING(Frintz(v0.V4H(), v0.V4H()));
+  TEST_ZEROING(Frsqrte(v0.V4S(), v0.V4S()));
+  TEST_ZEROING(Frsqrte(v0.V4H(), v0.V4H()));
+  TEST_ZEROING(Frsqrts(v0.V4S(), v0.V4S(), v0.V4S()));
+  TEST_ZEROING(Frsqrts(v0.V4H(), v0.V4H(), v0.V4H()));
+  TEST_ZEROING(Fsqrt(v0.V4S(), v0.V4S()));
+  TEST_ZEROING(Fsqrt(v0.V4H(), v0.V4H()));
+  TEST_ZEROING(Fsub(v0.V4S(), v0.V4S(), v0.V4S()));
+  TEST_ZEROING(Fsub(v0.V4H(), v0.V4H(), v0.V4H()));
+  TEST_ZEROING(Mov(v0.D(), 0, x0));
+  TEST_ZEROING(Mov(v0.S(), 0, w0));
+  TEST_ZEROING(Mov(v0.H(), 0, w0));
+  TEST_ZEROING(Mov(v0.B(), 0, w0));
+  TEST_ZEROING(Mov(v0.D(), 0, v0.D(), 0));
+  TEST_ZEROING(Mov(v0.S(), 0, v0.S(), 0));
+  TEST_ZEROING(Mov(v0.H(), 0, v0.H(), 0));
+  TEST_ZEROING(Mov(v0.B(), 0, v0.B(), 0));
+  TEST_ZEROING(Mla(v0.V4S(), v0.V4S(), v0.S(), 0));
+  TEST_ZEROING(Mla(v0.V4H(), v0.V4H(), v0.H(), 0));
+  TEST_ZEROING(Mla(v0.V4S(), v0.V4S(), v0.V4S()));
+  TEST_ZEROING(Mla(v0.V4H(), v0.V4H(), v0.V4H()));
+  TEST_ZEROING(Mls(v0.V4S(), v0.V4S(), v0.S(), 0));
+  TEST_ZEROING(Mls(v0.V4H(), v0.V4H(), v0.H(), 0));
+  TEST_ZEROING(Mls(v0.V4S(), v0.V4S(), v0.V4S()));
+  TEST_ZEROING(Mls(v0.V4H(), v0.V4H(), v0.V4H()));
+  TEST_ZEROING(Movi(v0.V2D(), 0xff));
+  TEST_ZEROING(Movi(v0.V2S(), 0xff));
+  TEST_ZEROING(Movi(v0.V4S(), 0x10, LSL, 8));
+  TEST_ZEROING(Movi(v0.V2S(), 0x10, LSL, 8));
+  TEST_ZEROING(Mul(v0.V4S(), v0.V4S(), v0.S(), 0));
+  TEST_ZEROING(Mul(v0.V4H(), v0.V4H(), v0.H(), 0));
+  TEST_ZEROING(Mul(v0.V4S(), v0.V4S(), v0.V4S()));
+  TEST_ZEROING(Mul(v0.V4H(), v0.V4H(), v0.V4H()));
+  TEST_ZEROING(Mvni(v0.V4H(), 0x10, LSL, 8));
+  TEST_ZEROING(Mvni(v0.V4H(), 0x10, LSL, 8));
+  TEST_ZEROING(Neg(v0.V4S(), v0.V4S()));
+  TEST_ZEROING(Neg(v0.V4H(), v0.V4H()));
+  TEST_ZEROING(Mvn(v0.V16B(), v0.V16B()));
+  TEST_ZEROING(Mvn(v0.V8B(), v0.V8B()));
+  TEST_ZEROING(Orn(v0.V8B(), v0.V8B(), v0.V8B()));
+  TEST_ZEROING(Orn(v0.V16B(), v0.V16B(), v0.V16B()));
+  TEST_ZEROING(Orr(v0.V8H(), 0x10, 8));
+  TEST_ZEROING(Orr(v0.V4H(), 0x10, 8));
+  TEST_ZEROING(Mov(v0.V8B(), v0.V8B()));
+  TEST_ZEROING(Mov(v0.V16B(), v0.V16B()));
+  TEST_ZEROING(Pmul(v0.V16B(), v0.V16B(), v0.V16B()));
+  TEST_ZEROING(Pmull(v0.V8H(), v0.V8B(), v0.V8B()));
+  TEST_ZEROING(Pmull2(v0.V8H(), v0.V16B(), v0.V16B()));
+  TEST_ZEROING(Raddhn2(v0.V16B(), v0.V8H(), v0.V8H()));
+  TEST_ZEROING(Raddhn(v0.V4H(), v0.V4S(), v0.V4S()));
+  TEST_ZEROING(Rbit(v0.V8B(), v0.V8B()));
+  TEST_ZEROING(Rbit(v0.V16B(), v0.V16B()));
+  TEST_ZEROING(Rsubhn2(v0.V16B(), v0.V8H(), v0.V8H()));
+  TEST_ZEROING(Rsubhn(v0.V4H(), v0.V4S(), v0.V4S()));
+  TEST_ZEROING(Saba(v0.V16B(), v0.V16B(), v0.V16B()));
+  TEST_ZEROING(Saba(v0.V2S(), v0.V2S(), v0.V2S()));
+  TEST_ZEROING(Saba(v0.V4H(), v0.V4H(), v0.V4H()));
+  TEST_ZEROING(Sabal2(v0.V8H(), v0.V16B(), v0.V16B()));
+  TEST_ZEROING(Sabal(v0.V4S(), v0.V4H(), v0.V4H()));
+  TEST_ZEROING(Sabd(v0.V16B(), v0.V16B(), v0.V16B()));
+  TEST_ZEROING(Sabd(v0.V2S(), v0.V2S(), v0.V2S()));
+  TEST_ZEROING(Sabd(v0.V4H(), v0.V4H(), v0.V4H()));
+  TEST_ZEROING(Sabdl2(v0.V8H(), v0.V16B(), v0.V16B()));
+  TEST_ZEROING(Sabdl(v0.V4S(), v0.V4H(), v0.V4H()));
+  TEST_ZEROING(Sadalp(v0.V8H(), v0.V16B()));
+  TEST_ZEROING(Saddl2(v0.V8H(), v0.V16B(), v0.V16B()));
+  TEST_ZEROING(Saddl(v0.V2D(), v0.V2S(), v0.V2S()));
+  TEST_ZEROING(Saddl(v0.V4S(), v0.V4H(), v0.V4H()));
+  TEST_ZEROING(Saddw2(v0.V8H(), v0.V8H(), v0.V16B()));
+  TEST_ZEROING(Saddw(v0.V4S(), v0.V4S(), v0.V4H()));
+  TEST_ZEROING(Scvtf(v0.V4S(), v0.V4S()));
+  TEST_ZEROING(Scvtf(v0.V8H(), v0.V8H()));
+  TEST_ZEROING(Scvtf(v0.V2D(), v0.V2D(), 8));
+  TEST_ZEROING(Sdot(v0.V4S(), v0.V16B(), v0.S4B(), 0));
+  TEST_ZEROING(Sdot(v0.V2S(), v0.V8B(), v0.S4B(), 0));
+  TEST_ZEROING(Sdot(v0.V4S(), v0.V16B(), v0.V16B()));
+  TEST_ZEROING(Sdot(v0.V2S(), v0.V8B(), v0.V8B()));
+  TEST_ZEROING(Shadd(v0.V16B(), v0.V16B(), v0.V16B()));
+  TEST_ZEROING(Shadd(v0.V4H(), v0.V4H(), v0.V4H()));
+  TEST_ZEROING(Shl(v0.V2D(), v0.V2D(), 56));
+  TEST_ZEROING(Shll2(v0.V8H(), v0.V16B(), 8));
+  TEST_ZEROING(Shll(v0.V2D(), v0.V2S(), 32));
+  TEST_ZEROING(Shsub(v0.V16B(), v0.V16B(), v0.V16B()));
+  TEST_ZEROING(Shsub(v0.V4H(), v0.V4H(), v0.V4H()));
+  TEST_ZEROING(Sli(v0.V2D(), v0.V2D(), 56));
+  TEST_ZEROING(Sli(v0.V2S(), v0.V2S(), 16));
+  TEST_ZEROING(Smax(v0.V16B(), v0.V16B(), v0.V16B()));
+  TEST_ZEROING(Smax(v0.V4H(), v0.V4H(), v0.V4H()));
+  TEST_ZEROING(Smaxp(v0.V16B(), v0.V16B(), v0.V16B()));
+  TEST_ZEROING(Smaxp(v0.V4H(), v0.V4H(), v0.V4H()));
+  TEST_ZEROING(Smin(v0.V16B(), v0.V16B(), v0.V16B()));
+  TEST_ZEROING(Smin(v0.V4H(), v0.V4H(), v0.V4H()));
+  TEST_ZEROING(Sminp(v0.V16B(), v0.V16B(), v0.V16B()));
+  TEST_ZEROING(Sminp(v0.V4H(), v0.V4H(), v0.V4H()));
+  TEST_ZEROING(Smlal2(v0.V8H(), v0.V16B(), v0.V16B()));
+  TEST_ZEROING(Smlal(v0.V2D(), v0.V2S(), v0.V2S()));
+  TEST_ZEROING(Smlal(v0.V2D(), v0.V2S(), v0.S(), 0));
+  TEST_ZEROING(Smlsl2(v0.V8H(), v0.V16B(), v0.V16B()));
+  TEST_ZEROING(Smlsl(v0.V2D(), v0.V2S(), v0.V2S()));
+  TEST_ZEROING(Smlsl(v0.V2D(), v0.V2S(), v0.S(), 0));
+  TEST_ZEROING(Smull2(v0.V8H(), v0.V16B(), v0.V16B()));
+  TEST_ZEROING(Smull(v0.V2D(), v0.V2S(), v0.V2S()));
+  TEST_ZEROING(Smull(v0.V2D(), v0.V2S(), v0.S(), 0));
+  TEST_ZEROING(Sqabs(v0.V16B(), v0.V16B()));
+  TEST_ZEROING(Sqabs(v0.V4H(), v0.V4H()));
+  TEST_ZEROING(Sqadd(v0.V16B(), v0.V16B(), v0.V16B()));
+  TEST_ZEROING(Sqadd(v0.V4H(), v0.V4H(), v0.V4H()));
+  TEST_ZEROING(Sqdmlal2(v0.V4S(), v0.V8H(), v0.V8H()));
+  TEST_ZEROING(Sqdmlal(v0.V2D(), v0.V2S(), v0.V2S()));
+  TEST_ZEROING(Sqdmlal(v0.V2D(), v0.V2S(), v0.S(), 0));
+  TEST_ZEROING(Sqdmlsl2(v0.V4S(), v0.V8H(), v0.V8H()));
+  TEST_ZEROING(Sqdmlsl(v0.V2D(), v0.V2S(), v0.V2S()));
+  TEST_ZEROING(Sqdmlsl(v0.V2D(), v0.V2S(), v0.S(), 0));
+  TEST_ZEROING(Sqdmulh(v0.V4S(), v0.V4S(), v0.S(), 0));
+  TEST_ZEROING(Sqdmulh(v0.V4H(), v0.V4H(), v0.H(), 0));
+  TEST_ZEROING(Sqdmulh(v0.V4S(), v0.V4S(), v0.V4S()));
+  TEST_ZEROING(Sqdmulh(v0.V4H(), v0.V4H(), v0.V4H()));
+  TEST_ZEROING(Sqdmull2(v0.V2D(), v0.V4S(), v0.V4S()));
+  TEST_ZEROING(Sqdmull(v0.V4S(), v0.V4H(), v0.V4H()));
+  TEST_ZEROING(Sqdmull2(v0.V2D(), v0.V4S(), v0.S(), 0));
+  TEST_ZEROING(Sqdmull(v0.V4S(), v0.V4H(), v0.H(), 0));
+  TEST_ZEROING(Sqneg(v0.V16B(), v0.V16B()));
+  TEST_ZEROING(Sqneg(v0.V2S(), v0.V2S()));
+  TEST_ZEROING(Sqrdmlah(v0.V4S(), v0.V4S(), v0.S(), 0));
+  TEST_ZEROING(Sqrdmlah(v0.V4H(), v0.V4H(), v0.H(), 0));
+  TEST_ZEROING(Sqrdmlah(v0.V4S(), v0.V4S(), v0.V4S()));
+  TEST_ZEROING(Sqrdmlah(v0.V4H(), v0.V4H(), v0.V4H()));
+  TEST_ZEROING(Sqrdmlsh(v0.V4S(), v0.V4S(), v0.S(), 0));
+  TEST_ZEROING(Sqrdmlsh(v0.V4H(), v0.V4H(), v0.H(), 0));
+  TEST_ZEROING(Sqrdmlsh(v0.V4S(), v0.V4S(), v0.V4S()));
+  TEST_ZEROING(Sqrdmlsh(v0.V4H(), v0.V4H(), v0.V4H()));
+  TEST_ZEROING(Sqrdmulh(v0.V4S(), v0.V4S(), v0.S(), 0));
+  TEST_ZEROING(Sqrdmulh(v0.V4H(), v0.V4H(), v0.H(), 0));
+  TEST_ZEROING(Sqrdmulh(v0.V4S(), v0.V4S(), v0.V4S()));
+  TEST_ZEROING(Sqrdmulh(v0.V4H(), v0.V4H(), v0.V4H()));
+  TEST_ZEROING(Sqrshl(v0.V16B(), v0.V16B(), v0.V16B()));
+  TEST_ZEROING(Sqrshl(v0.V4H(), v0.V4H(), v0.V4H()));
+  TEST_ZEROING(Sqshl(v0.V16B(), v0.V16B(), v0.V16B()));
+  TEST_ZEROING(Sqshl(v0.V4H(), v0.V4H(), v0.V4H()));
+  TEST_ZEROING(Sqshl(v0.V2D(), v0.V2D(), 56));
+  TEST_ZEROING(Sqshl(v0.V2S(), v0.V2S(), 16));
+  TEST_ZEROING(Sqshlu(v0.V2D(), v0.V2D(), 56));
+  TEST_ZEROING(Sqshlu(v0.V2S(), v0.V2S(), 16));
+  TEST_ZEROING(Sqsub(v0.V16B(), v0.V16B(), v0.V16B()));
+  TEST_ZEROING(Sqsub(v0.V4H(), v0.V4H(), v0.V4H()));
+  TEST_ZEROING(Sqxtn2(v0.V16B(), v0.V8H()));
+  TEST_ZEROING(Sqxtn(v0.V2S(), v0.V2D()));
+  TEST_ZEROING(Sqxtun2(v0.V16B(), v0.V8H()));
+  TEST_ZEROING(Sqxtun(v0.V2S(), v0.V2D()));
+  TEST_ZEROING(Srhadd(v0.V16B(), v0.V16B(), v0.V16B()));
+  TEST_ZEROING(Srhadd(v0.V4H(), v0.V4H(), v0.V4H()));
+  TEST_ZEROING(Sri(v0.V2D(), v0.V2D(), 8));
+  TEST_ZEROING(Sri(v0.V2S(), v0.V2S(), 8));
+  TEST_ZEROING(Srshl(v0.V16B(), v0.V16B(), v0.V16B()));
+  TEST_ZEROING(Srshl(v0.V4H(), v0.V4H(), v0.V4H()));
+  TEST_ZEROING(Srshr(v0.V2D(), v0.V2D(), 8));
+  TEST_ZEROING(Srshr(v0.V2S(), v0.V2S(), 8));
+  TEST_ZEROING(Srsra(v0.V2D(), v0.V2D(), 8));
+  TEST_ZEROING(Srsra(v0.V2S(), v0.V2S(), 8));
+  TEST_ZEROING(Sshl(v0.V16B(), v0.V16B(), v0.V16B()));
+  TEST_ZEROING(Sshl(v0.V4H(), v0.V4H(), v0.V4H()));
+  TEST_ZEROING(Sshr(v0.V2D(), v0.V2D(), 8));
+  TEST_ZEROING(Sshr(v0.V2S(), v0.V2S(), 8));
+  TEST_ZEROING(Ssra(v0.V2D(), v0.V2D(), 8));
+  TEST_ZEROING(Ssra(v0.V2S(), v0.V2S(), 8));
+  TEST_ZEROING(Ssubl2(v0.V8H(), v0.V16B(), v0.V16B()));
+  TEST_ZEROING(Ssubl(v0.V4S(), v0.V4H(), v0.V4H()));
+  TEST_ZEROING(Ssubw2(v0.V8H(), v0.V8H(), v0.V16B()));
+  TEST_ZEROING(Ssubw(v0.V4S(), v0.V4S(), v0.V4H()));
+  TEST_ZEROING(Sub(v0.V16B(), v0.V16B(), v0.V16B()));
+  TEST_ZEROING(Sub(v0.V4H(), v0.V4H(), v0.V4H()));
+  TEST_ZEROING(Subhn2(v0.V16B(), v0.V8H(), v0.V8H()));
+  TEST_ZEROING(Subhn(v0.V4H(), v0.V4S(), v0.V4S()));
+  TEST_ZEROING(Sudot(v0.V4S(), v0.V16B(), v0.S4B(), 0));
+  TEST_ZEROING(Sudot(v0.V2S(), v0.V8B(), v0.S4B(), 2));
+  TEST_ZEROING(Suqadd(v0.V16B(), v0.V16B()));
+  TEST_ZEROING(Suqadd(v0.V4H(), v0.V4H()));
+  TEST_ZEROING(Tbl(v0.V8B(), {v0.V16B()}, v0.V8B()));
+  TEST_ZEROING(Tbl(v0.V16B(), {v0.V16B()}, v0.V16B()));
+  TEST_ZEROING(Tbx(v0.V8B(), {v0.V16B()}, v0.V8B()));
+  TEST_ZEROING(Tbx(v0.V16B(), {v0.V16B()}, v0.V16B()));
+  TEST_ZEROING(Trn1(v0.V16B(), v0.V16B(), v0.V16B()));
+  TEST_ZEROING(Trn1(v0.V4H(), v0.V4H(), v0.V4H()));
+  TEST_ZEROING(Trn2(v0.V16B(), v0.V16B(), v0.V16B()));
+  TEST_ZEROING(Trn2(v0.V4H(), v0.V4H(), v0.V4H()));
+  TEST_ZEROING(Uaba(v0.V16B(), v0.V16B(), v0.V16B()));
+  TEST_ZEROING(Uaba(v0.V4H(), v0.V4H(), v0.V4H()));
+  TEST_ZEROING(Uabal2(v0.V8H(), v0.V16B(), v0.V16B()));
+  TEST_ZEROING(Uabal(v0.V4S(), v0.V4H(), v0.V4H()));
+  TEST_ZEROING(Uabd(v0.V16B(), v0.V16B(), v0.V16B()));
+  TEST_ZEROING(Uabd(v0.V4H(), v0.V4H(), v0.V4H()));
+  TEST_ZEROING(Uabdl2(v0.V8H(), v0.V16B(), v0.V16B()));
+  TEST_ZEROING(Uabdl(v0.V4S(), v0.V4H(), v0.V4H()));
+  TEST_ZEROING(Uadalp(v0.V8H(), v0.V16B()));
+  TEST_ZEROING(Uadalp(v0.V2S(), v0.V4H()));
+  TEST_ZEROING(Uaddl2(v0.V8H(), v0.V16B(), v0.V16B()));
+  TEST_ZEROING(Uaddl(v0.V4S(), v0.V4H(), v0.V4H()));
+  TEST_ZEROING(Uaddlp(v0.V8H(), v0.V16B()));
+  TEST_ZEROING(Uaddlp(v0.V2S(), v0.V4H()));
+  TEST_ZEROING(Uaddw2(v0.V8H(), v0.V8H(), v0.V16B()));
+  TEST_ZEROING(Uaddw(v0.V4S(), v0.V4S(), v0.V4H()));
+  TEST_ZEROING(Ucvtf(v0.V4S(), v0.V4S()));
+  TEST_ZEROING(Ucvtf(v0.V4H(), v0.V4H()));
+  TEST_ZEROING(Ucvtf(v0.V2D(), v0.V2D(), 8));
+  TEST_ZEROING(Ucvtf(v0.V2S(), v0.V2S(), 8));
+  TEST_ZEROING(Udot(v0.V4S(), v0.V16B(), v0.S4B(), 0));
+  TEST_ZEROING(Udot(v0.V2S(), v0.V8B(), v0.S4B(), 0));
+  TEST_ZEROING(Udot(v0.V2S(), v0.V8B(), v0.V8B()));
+  TEST_ZEROING(Udot(v0.V4S(), v0.V16B(), v0.V16B()));
+  TEST_ZEROING(Uhadd(v0.V16B(), v0.V16B(), v0.V16B()));
+  TEST_ZEROING(Uhadd(v0.V4H(), v0.V4H(), v0.V4H()));
+  TEST_ZEROING(Uhsub(v0.V16B(), v0.V16B(), v0.V16B()));
+  TEST_ZEROING(Uhsub(v0.V2S(), v0.V2S(), v0.V2S()));
+  TEST_ZEROING(Umax(v0.V16B(), v0.V16B(), v0.V16B()));
+  TEST_ZEROING(Umax(v0.V4H(), v0.V4H(), v0.V4H()));
+  TEST_ZEROING(Umaxp(v0.V16B(), v0.V16B(), v0.V16B()));
+  TEST_ZEROING(Umaxp(v0.V4H(), v0.V4H(), v0.V4H()));
+  TEST_ZEROING(Umin(v0.V16B(), v0.V16B(), v0.V16B()));
+  TEST_ZEROING(Umin(v0.V4H(), v0.V4H(), v0.V4H()));
+  TEST_ZEROING(Uminp(v0.V16B(), v0.V16B(), v0.V16B()));
+  TEST_ZEROING(Uminp(v0.V4H(), v0.V4H(), v0.V4H()));
+  TEST_ZEROING(Umlal2(v0.V8H(), v0.V16B(), v0.V16B()));
+  TEST_ZEROING(Umlal(v0.V4S(), v0.V4H(), v0.V4H()));
+  TEST_ZEROING(Umlal(v0.V2D(), v0.V2S(), v0.S(), 0));
+  TEST_ZEROING(Umlal(v0.V4S(), v0.V4H(), v0.H(), 0));
+  TEST_ZEROING(Umlsl2(v0.V8H(), v0.V16B(), v0.V16B()));
+  TEST_ZEROING(Umlsl(v0.V4S(), v0.V4H(), v0.V4H()));
+  TEST_ZEROING(Umlsl(v0.V2D(), v0.V2S(), v0.S(), 0));
+  TEST_ZEROING(Umlsl(v0.V4S(), v0.V4H(), v0.H(), 0));
+  TEST_ZEROING(Umull2(v0.V8H(), v0.V16B(), v0.V16B()));
+  TEST_ZEROING(Umull(v0.V4S(), v0.V4H(), v0.V4H()));
+  TEST_ZEROING(Umull(v0.V2D(), v0.V2S(), v0.S(), 0));
+  TEST_ZEROING(Umull(v0.V4S(), v0.V4H(), v0.H(), 0));
+  TEST_ZEROING(Uqadd(v0.V16B(), v0.V16B(), v0.V16B()));
+  TEST_ZEROING(Uqadd(v0.V4H(), v0.V4H(), v0.V4H()));
+  TEST_ZEROING(Uqrshl(v0.V16B(), v0.V16B(), v0.V16B()));
+  TEST_ZEROING(Uqrshl(v0.V4H(), v0.V4H(), v0.V4H()));
+  TEST_ZEROING(Uqshl(v0.V16B(), v0.V16B(), v0.V16B()));
+  TEST_ZEROING(Uqshl(v0.V4H(), v0.V4H(), v0.V4H()));
+  TEST_ZEROING(Uqsub(v0.V16B(), v0.V16B(), v0.V16B()));
+  TEST_ZEROING(Uqsub(v0.V4H(), v0.V4H(), v0.V4H()));
+  TEST_ZEROING(Uqxtn2(v0.V16B(), v0.V8H()));
+  TEST_ZEROING(Uqxtn(v0.V2S(), v0.V2D()));
+  TEST_ZEROING(Urecpe(v0.V2S(), v0.V2S()));
+  TEST_ZEROING(Urecpe(v0.V4S(), v0.V4S()));
+  TEST_ZEROING(Urhadd(v0.V16B(), v0.V16B(), v0.V16B()));
+  TEST_ZEROING(Urhadd(v0.V4H(), v0.V4H(), v0.V4H()));
+  TEST_ZEROING(Urshl(v0.V16B(), v0.V16B(), v0.V16B()));
+  TEST_ZEROING(Urshl(v0.V4H(), v0.V4H(), v0.V4H()));
+  TEST_ZEROING(Urshr(v0.V2D(), v0.V2D(), 8));
+  TEST_ZEROING(Urshr(v0.V2S(), v0.V2S(), 8));
+  TEST_ZEROING(Ursqrte(v0.V4S(), v0.V4S()));
+  TEST_ZEROING(Ursqrte(v0.V2S(), v0.V2S()));
+  TEST_ZEROING(Ursra(v0.V2D(), v0.V2D(), 8));
+  TEST_ZEROING(Ursra(v0.V2S(), v0.V2S(), 8));
+  TEST_ZEROING(Usdot(v0.V4S(), v0.V16B(), v0.S4B(), 0));
+  TEST_ZEROING(Usdot(v0.V2S(), v0.V8B(), v0.S4B(), 1));
+  TEST_ZEROING(Usdot(v0.V4S(), v0.V16B(), v0.V16B()));
+  TEST_ZEROING(Usdot(v0.V2S(), v0.V8B(), v0.V8B()));
+  TEST_ZEROING(Ushl(v0.V16B(), v0.V16B(), v0.V16B()));
+  TEST_ZEROING(Ushl(v0.V4H(), v0.V4H(), v0.V4H()));
+  TEST_ZEROING(Ushr(v0.V2D(), v0.V2D(), 8));
+  TEST_ZEROING(Ushr(v0.V2S(), v0.V2S(), 8));
+  TEST_ZEROING(Usqadd(v0.V16B(), v0.V16B()));
+  TEST_ZEROING(Usqadd(v0.V4H(), v0.V4H()));
+  TEST_ZEROING(Usra(v0.V2D(), v0.V2D(), 8));
+  TEST_ZEROING(Usra(v0.V2S(), v0.V2S(), 8));
+  TEST_ZEROING(Usubl2(v0.V8H(), v0.V16B(), v0.V16B()));
+  TEST_ZEROING(Usubl(v0.V4S(), v0.V4H(), v0.V4H()));
+  TEST_ZEROING(Usubw2(v0.V8H(), v0.V8H(), v0.V16B()));
+  TEST_ZEROING(Usubw(v0.V4S(), v0.V4S(), v0.V4H()));
+  TEST_ZEROING(Uzp1(v0.V16B(), v0.V16B(), v0.V16B()));
+  TEST_ZEROING(Uzp1(v0.V4H(), v0.V4H(), v0.V4H()));
+  TEST_ZEROING(Uzp2(v0.V16B(), v0.V16B(), v0.V16B()));
+  TEST_ZEROING(Uzp2(v0.V4H(), v0.V4H(), v0.V4H()));
+  TEST_ZEROING(Xtn2(v0.V16B(), v0.V8H()));
+  TEST_ZEROING(Xtn(v0.V4H(), v0.V4S()));
+  TEST_ZEROING(Zip1(v0.V16B(), v0.V16B(), v0.V16B()));
+  TEST_ZEROING(Zip1(v0.V4H(), v0.V4H(), v0.V4H()));
+  TEST_ZEROING(Zip2(v0.V16B(), v0.V16B(), v0.V16B()));
+  TEST_ZEROING(Zip2(v0.V4H(), v0.V4H(), v0.V4H()));
+
+  __ Mov(z11.VnD(), 0);
+
+  Label done, zero_127_to_0;
+  __ Rdvl(x0, 1);
+  __ Cmp(x0, 16);
+  __ B(gt, &zero_127_to_0);
+
+  // For 128-bit VL, there's nothing to be tested, so zero the whole register.
+  __ Mov(z10.VnD(), 0);
+  __ B(&done);
+
+  // Set the expected non-zero bits to zero.
+  __ Bind(&zero_127_to_0);
+  __ Ext(z10.VnB(), z10.VnB(), z11.VnB(), kDRegSizeInBytes * 2);
+
+  __ Bind(&done);
+
+  END();
+
+  if (CAN_RUN()) {
+    RUN();
+    ASSERT_EQUAL_SVE(z11, z10);
+  }
+}
+
+#undef TEST_ZEROING
+
+#define TEST_ZEROING_1(INST) \
+  __ Index(z0.VnB(), 0, 1);  \
+  __ INST;                   \
+  __ Orr(z10.VnB(), z10.VnB(), z0.VnB());
+#define TEST_ZEROING_2(INST)              \
+  __ Index(z0.VnB(), 0, 1);               \
+  __ Index(z1.VnB(), 0, 1);               \
+  __ INST;                                \
+  __ Orr(z10.VnB(), z10.VnB(), z0.VnB()); \
+  __ Orr(z10.VnB(), z10.VnB(), z1.VnB());
+#define TEST_ZEROING_3(INST)              \
+  __ Index(z0.VnB(), 0, 1);               \
+  __ Index(z1.VnB(), 0, 1);               \
+  __ Index(z2.VnB(), 0, 1);               \
+  __ INST;                                \
+  __ Orr(z10.VnB(), z10.VnB(), z0.VnB()); \
+  __ Orr(z10.VnB(), z10.VnB(), z1.VnB()); \
+  __ Orr(z10.VnB(), z10.VnB(), z2.VnB());
+#define TEST_ZEROING_4(INST)              \
+  __ Index(z0.VnB(), 0, 1);               \
+  __ Index(z1.VnB(), 0, 1);               \
+  __ Index(z2.VnB(), 0, 1);               \
+  __ Index(z3.VnB(), 0, 1);               \
+  __ INST;                                \
+  __ Orr(z10.VnB(), z10.VnB(), z0.VnB()); \
+  __ Orr(z10.VnB(), z10.VnB(), z1.VnB()); \
+  __ Orr(z10.VnB(), z10.VnB(), z2.VnB()); \
+  __ Orr(z10.VnB(), z10.VnB(), z3.VnB());
+
+TEST_SVE(neon_load_zero_high) {
+  SVE_SETUP_WITH_FEATURES(CPUFeatures::kNEON, CPUFeatures::kSVE);
+
+  START();
+  __ Mov(z10.VnD(), 0);  // Initialise cumulative result register.
+
+  // Initialise x0 to point to the start of a buffer from which data is loaded.
+  // The contents do not need to be defined.
+  const int data_size = 4 * kQRegSizeInBytes;  // Widest load: Ld4 of four Q.
+  uint8_t data[data_size] = {};  // Stack storage stays live across RUN().
+  __ Mov(x0, reinterpret_cast<uintptr_t>(data));
+
+  MemOperand mop = MemOperand(x0);
+  TEST_ZEROING_1(Ld1(v0.V16B(), mop));
+  TEST_ZEROING_1(Ld1(v0.V4H(), mop));
+  TEST_ZEROING_1(Ld1(v0.V16B(), v1.V16B(), mop));
+  TEST_ZEROING_1(Ld1(v0.V4H(), v1.V4H(), mop));
+  TEST_ZEROING_1(Ld1(v0.V16B(), v1.V16B(), v2.V16B(), mop));
+  TEST_ZEROING_1(Ld1(v0.V4H(), v1.V4H(), v2.V4H(), mop));
+  TEST_ZEROING_1(Ld1(v0.V16B(), v1.V16B(), v2.V16B(), v3.V16B(), mop));
+  TEST_ZEROING_1(Ld1(v0.V4H(), v1.V4H(), v2.V4H(), v3.V4H(), mop));
+  TEST_ZEROING_1(Ld1(v0.B(), 1, mop));
+  TEST_ZEROING_1(Ld1(v0.D(), 1, mop));
+  TEST_ZEROING_1(Ld1(v0.H(), 1, mop));
+  TEST_ZEROING_1(Ld1(v0.S(), 1, mop));
+  TEST_ZEROING_1(Ld1r(v0.V16B(), mop));
+  TEST_ZEROING_1(Ld1r(v0.V4H(), mop));
+  TEST_ZEROING_2(Ld2(v0.V16B(), v1.V16B(), mop));
+  TEST_ZEROING_2(Ld2(v0.V4H(), v1.V4H(), mop));
+  TEST_ZEROING_2(Ld2(v0.B(), v1.B(), 1, mop));
+  TEST_ZEROING_2(Ld2(v0.D(), v1.D(), 1, mop));
+  TEST_ZEROING_2(Ld2(v0.H(), v1.H(), 1, mop));
+  TEST_ZEROING_2(Ld2(v0.S(), v1.S(), 1, mop));
+  TEST_ZEROING_2(Ld2r(v0.V16B(), v1.V16B(), mop));
+  TEST_ZEROING_2(Ld2r(v0.V4H(), v1.V4H(), mop));
+  TEST_ZEROING_3(Ld3(v0.V16B(), v1.V16B(), v2.V16B(), mop));
+  TEST_ZEROING_3(Ld3(v0.V4H(), v1.V4H(), v2.V4H(), mop));
+  TEST_ZEROING_3(Ld3(v0.B(), v1.B(), v2.B(), 1, mop));
+  TEST_ZEROING_3(Ld3(v0.D(), v1.D(), v2.D(), 1, mop));
+  TEST_ZEROING_3(Ld3(v0.H(), v1.H(), v2.H(), 1, mop));
+  TEST_ZEROING_3(Ld3(v0.S(), v1.S(), v2.S(), 1, mop));
+  TEST_ZEROING_3(Ld3r(v0.V16B(), v1.V16B(), v2.V16B(), mop));
+  TEST_ZEROING_3(Ld3r(v0.V4H(), v1.V4H(), v2.V4H(), mop));
+  TEST_ZEROING_4(Ld4(v0.V16B(), v1.V16B(), v2.V16B(), v3.V16B(), mop));
+  TEST_ZEROING_4(Ld4(v0.V4H(), v1.V4H(), v2.V4H(), v3.V4H(), mop));
+  TEST_ZEROING_4(Ld4(v0.B(), v1.B(), v2.B(), v3.B(), 1, mop));
+  TEST_ZEROING_4(Ld4(v0.D(), v1.D(), v2.D(), v3.D(), 1, mop));
+  TEST_ZEROING_4(Ld4(v0.H(), v1.H(), v2.H(), v3.H(), 1, mop));
+  TEST_ZEROING_4(Ld4(v0.S(), v1.S(), v2.S(), v3.S(), 1, mop));
+  TEST_ZEROING_4(Ld4r(v0.V16B(), v1.V16B(), v2.V16B(), v3.V16B(), mop));
+  TEST_ZEROING_4(Ld4r(v0.V4H(), v1.V4H(), v2.V4H(), v3.V4H(), mop));
+
+  __ Mov(z11.VnD(), 0);
+
+  Label done, zero_127_to_0;
+  __ Rdvl(x0, 1);
+  __ Cmp(x0, 16);
+  __ B(gt, &zero_127_to_0);
+
+  // For 128-bit VL, there's nothing to be tested, so zero the whole register.
+  __ Mov(z10.VnD(), 0);
+  __ B(&done);
+
+  // Set the expected non-zero bits to zero.
+  __ Bind(&zero_127_to_0);
+  __ Ext(z10.VnB(), z10.VnB(), z11.VnB(), kDRegSizeInBytes * 2);
+
+  __ Bind(&done);
+
+  END();
+
+  if (CAN_RUN()) {
+    RUN();
+    ASSERT_EQUAL_SVE(z11, z10);
+  }
+}
+
+#undef TEST_ZEROING_1
+#undef TEST_ZEROING_2
+#undef TEST_ZEROING_3
+#undef TEST_ZEROING_4
+
 TEST_SVE(sve_load_store_sp_base_regression_test) {
   SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE);
   START();
@@ -19945,6 +20648,8 @@ TEST_SVE(sve_load_store_sp_base_regression_test) {
     __ dci(0xe58043e0);  // str z0, [sp]
   }
 
+  __ Drop(128 * 2 * kXRegSizeInBytes);
+
   END();
 
   if (CAN_RUN()) {
diff --git a/test/aarch64/test-cpu-features-aarch64.cc b/test/aarch64/test-cpu-features-aarch64.cc
index 4a82127f..c018f49b 100644
--- a/test/aarch64/test-cpu-features-aarch64.cc
+++ b/test/aarch64/test-cpu-features-aarch64.cc
@@ -3778,5 +3778,91 @@ TEST_FP_FCMA_NEON_NEONHALF(fcmla_1, fcmla(v0.V8H(), v1.V8H(), v2.H(), 2, 180))
 TEST_FP_FCMA_NEON_NEONHALF(fcmla_2, fcmla(v0.V4H(), v1.V4H(), v2.V4H(), 180))
 TEST_FP_FCMA_NEON_NEONHALF(fcmla_3, fcmla(v0.V8H(), v1.V8H(), v2.V8H(), 0))
 
+#define TEST_FEAT(NAME, ASM)                                            \
+  TEST_TEMPLATE(CPUFeatures(CPUFeatures::kNEON, CPUFeatures::kPmull1Q), \
+                NEON_Pmull1Q_##NAME,                                    \
+                ASM)
+TEST_FEAT(pmull1q_0, pmull(v5.V1Q(), v6.V1D(), v7.V1D()))
+#undef TEST_FEAT
+
+#define TEST_NEON_SHA3(NAME, ASM)                                    \
+  TEST_TEMPLATE(CPUFeatures(CPUFeatures::kNEON, CPUFeatures::kSHA3), \
+                NEON_SHA3_##NAME,                                    \
+                ASM)
+TEST_NEON_SHA3(bcax_0, bcax(v0.V16B(), v1.V16B(), v2.V16B(), v3.V16B()))
+TEST_NEON_SHA3(eor3_0, eor3(v0.V16B(), v1.V16B(), v2.V16B(), v3.V16B()))
+TEST_NEON_SHA3(xar_0, xar(v0.V2D(), v1.V2D(), v2.V2D(), 42))
+TEST_NEON_SHA3(rax1_0, rax1(v0.V2D(), v1.V2D(), v2.V2D()))
+
+#define TEST_NEON_SHA1(NAME, ASM)                                    \
+  TEST_TEMPLATE(CPUFeatures(CPUFeatures::kNEON, CPUFeatures::kSHA1), \
+                NEON_SHA1_##NAME,                                    \
+                ASM)
+TEST_NEON_SHA1(sha1c_0, sha1c(q0, s12, v20.V4S()))
+TEST_NEON_SHA1(sha1m_0, sha1m(q22, s2, v13.V4S()))
+TEST_NEON_SHA1(sha1p_0, sha1p(q31, s5, v15.V4S()))
+TEST_NEON_SHA1(sha1su0_0, sha1su0(v19.V4S(), v9.V4S(), v27.V4S()))
+TEST_NEON_SHA1(sha1h_0, sha1h(s12, s0))
+TEST_NEON_SHA1(sha1su1_0, sha1su1(v2.V4S(), v4.V4S()))
+
+#define TEST_FEAT(NAME, ASM)                                         \
+  TEST_TEMPLATE(CPUFeatures(CPUFeatures::kNEON, CPUFeatures::kSHA2), \
+                NEON_SHA2_##NAME,                                    \
+                ASM)
+TEST_FEAT(sha256h_0, sha256h(q0, q12, v20.V4S()))
+TEST_FEAT(sha256h2_0, sha256h2(q22, q2, v13.V4S()))
+TEST_FEAT(sha256su0_0, sha256su0(v2.V4S(), v4.V4S()))
+TEST_FEAT(sha256su1_0, sha256su1(v19.V4S(), v9.V4S(), v27.V4S()))
+#undef TEST_FEAT
+
+#define TEST_FEAT(NAME, ASM)                                           \
+  TEST_TEMPLATE(CPUFeatures(CPUFeatures::kNEON, CPUFeatures::kSHA512), \
+                NEON_SHA512_##NAME,                                    \
+                ASM)
+TEST_FEAT(sha512h_0, sha512h(q0, q12, v20.V2D()))
+TEST_FEAT(sha512h2_0, sha512h2(q22, q2, v13.V2D()))
+TEST_FEAT(sha512su0_0, sha512su0(v2.V2D(), v4.V2D()))
+TEST_FEAT(sha512su1_0, sha512su1(v19.V2D(), v9.V2D(), v27.V2D()))
+#undef TEST_FEAT
+
+#define TEST_FEAT(NAME, ASM)                                        \
+  TEST_TEMPLATE(CPUFeatures(CPUFeatures::kNEON, CPUFeatures::kAES), \
+                NEON_AES_##NAME,                                    \
+                ASM)
+TEST_FEAT(aesd_0, aesd(v0.V16B(), v29.V16B()))
+TEST_FEAT(aese_0, aese(v0.V16B(), v29.V16B()))
+TEST_FEAT(aesimc_0, aesimc(v0.V16B(), v29.V16B()))
+TEST_FEAT(aesmc_0, aesmc(v0.V16B(), v29.V16B()))
+#undef TEST_FEAT
+
+#define TEST_FEAT(NAME, ASM)                                        \
+  TEST_TEMPLATE(CPUFeatures(CPUFeatures::kNEON, CPUFeatures::kSM3), \
+                NEON_SM3_##NAME,                                    \
+                ASM)
+TEST_FEAT(sm3partw1_0, sm3partw1(v12.V4S(), v13.V4S(), v14.V4S()))
+TEST_FEAT(sm3partw2_0, sm3partw2(v12.V4S(), v13.V4S(), v14.V4S()))
+TEST_FEAT(sm3ss1_0, sm3ss1(v13.V4S(), v15.V4S(), v17.V4S(), v21.V4S()))
+TEST_FEAT(sm3tt1a_0, sm3tt1a(v30.V4S(), v29.V4S(), v9.V4S(), 1))
+TEST_FEAT(sm3tt1b_0, sm3tt1b(v30.V4S(), v29.V4S(), v9.V4S(), 3))
+TEST_FEAT(sm3tt2a_0, sm3tt2a(v30.V4S(), v29.V4S(), v9.V4S(), 2))
+TEST_FEAT(sm3tt2b_0, sm3tt2b(v30.V4S(), v29.V4S(), v9.V4S(), 0))
+#undef TEST_FEAT
+
+#define TEST_FEAT(NAME, ASM)                                        \
+  TEST_TEMPLATE(CPUFeatures(CPUFeatures::kNEON, CPUFeatures::kSM4), \
+                NEON_SM4_##NAME,                                    \
+                ASM)
+TEST_FEAT(sm4e, sm4e(v12.V4S(), v13.V4S()))
+TEST_FEAT(sm4ekey, sm4ekey(v12.V4S(), v13.V4S(), v14.V4S()))
+#undef TEST_FEAT
+
+#define TEST_FEAT(NAME, ASM)                                                \
+  TEST_TEMPLATE(CPUFeatures(CPUFeatures::kSVE2, CPUFeatures::kSVEPmull128), \
+                SVE_PMULL128_##NAME,                                        \
+                ASM)
+TEST_FEAT(pmullb, pmullb(z12.VnQ(), z21.VnD(), z12.VnD()))
+TEST_FEAT(pmullt, pmullt(z12.VnQ(), z21.VnD(), z12.VnD()))
+#undef TEST_FEAT
+
 }  // namespace aarch64
 }  // namespace vixl
diff --git a/test/aarch64/test-disasm-aarch64.cc b/test/aarch64/test-disasm-aarch64.cc
index 2babb9e6..40abef1e 100644
--- a/test/aarch64/test-disasm-aarch64.cc
+++ b/test/aarch64/test-disasm-aarch64.cc
@@ -2611,6 +2611,7 @@ TEST(system_mrs) {
   COMPARE(mrs(x15, FPCR), "mrs x15, fpcr");
   COMPARE(mrs(x20, RNDR), "mrs x20, rndr");
   COMPARE(mrs(x5, RNDRRS), "mrs x5, rndrrs");
+  COMPARE(mrs(x9, DCZID_EL0), "mrs x9, dczid_el0");
 
   // Test mrs that use system registers we haven't named.
   COMPARE(dci(MRS | (0x5555 << 5)), "mrs x0, S3_2_c10_c10_5");
@@ -3359,6 +3360,20 @@ TEST(cssc) {
   CLEANUP();
 }
 
+TEST(gcs) {
+  SETUP();
+
+  COMPARE_MACRO(Chkfeat(x16), "chkfeat x16");
+  COMPARE_MACRO(Gcspopm(x0), "gcspopm x0");
+  COMPARE_MACRO(Gcspopm(), "gcspopm");
+  COMPARE_MACRO(Gcspopm(xzr), "gcspopm");
+  COMPARE_MACRO(Gcsss1(x4), "gcsss1 x4");
+  COMPARE_MACRO(Gcsss2(x2), "gcsss2 x2");
+  COMPARE_MACRO(Gcspushm(x1), "gcspushm x1");
+
+  CLEANUP();
+}
+
 TEST(architecture_features) {
   SETUP();
 
@@ -3543,19 +3558,19 @@ TEST(architecture_features) {
   COMPARE_PREFIX(dci(0xf8e08000), "swpal");      // SWPAL_64_memop
 
   // ARMv8.1 - RDM
-  COMPARE_PREFIX(dci(0x2e008400), "sqrdmlah");  // SQRDMLAH_asimdsame2_only
-  COMPARE_PREFIX(dci(0x2e008c00), "sqrdmlsh");  // SQRDMLSH_asimdsame2_only
+  COMPARE_PREFIX(dci(0x2e808400), "sqrdmlah");  // SQRDMLAH_asimdsame2_only
+  COMPARE_PREFIX(dci(0x2e808c00), "sqrdmlsh");  // SQRDMLSH_asimdsame2_only
   COMPARE_PREFIX(dci(0x2f40d000), "sqrdmlah");  // SQRDMLAH_asimdelem_R
   COMPARE_PREFIX(dci(0x2f40f000), "sqrdmlsh");  // SQRDMLSH_asimdelem_R
-  COMPARE_PREFIX(dci(0x7e008400), "sqrdmlah");  // SQRDMLAH_asisdsame2_only
-  COMPARE_PREFIX(dci(0x7e008c00), "sqrdmlsh");  // SQRDMLSH_asisdsame2_only
+  COMPARE_PREFIX(dci(0x7e408400), "sqrdmlah");  // SQRDMLAH_asisdsame2_only
+  COMPARE_PREFIX(dci(0x7e408c00), "sqrdmlsh");  // SQRDMLSH_asisdsame2_only
   COMPARE_PREFIX(dci(0x7f40d000), "sqrdmlah");  // SQRDMLAH_asisdelem_R
   COMPARE_PREFIX(dci(0x7f40f000), "sqrdmlsh");  // SQRDMLSH_asisdelem_R
 
   // ARMv8.2 - DotProd
-  COMPARE_PREFIX(dci(0x0e009400), "sdot");  // SDOT_asimdsame2_D
+  COMPARE_PREFIX(dci(0x0e809400), "sdot");  // SDOT_asimdsame2_D
   COMPARE_PREFIX(dci(0x0f00e000), "sdot");  // SDOT_asimdelem_D
-  COMPARE_PREFIX(dci(0x2e009400), "udot");  // UDOT_asimdsame2_D
+  COMPARE_PREFIX(dci(0x2e809400), "udot");  // UDOT_asimdsame2_D
   COMPARE_PREFIX(dci(0x2f00e000), "udot");  // UDOT_asimdelem_D
 
   // ARMv8.2 - FHM
@@ -3775,42 +3790,39 @@ TEST(architecture_features) {
   COMPARE_PREFIX(dci(0xd503221f), "esb");  // ESB_HI_hints
 
   // ARMv8.2 - SHA3
-  // COMPARE_PREFIX(dci(0xce000000), "eor3");   // EOR3_VVV16_crypto4
-  // COMPARE_PREFIX(dci(0xce200000), "bcax");   // BCAX_VVV16_crypto4
-  // COMPARE_PREFIX(dci(0xce608c00), "rax1");   // RAX1_VVV2_cryptosha512_3
-  // COMPARE_PREFIX(dci(0xce800000), "xar");   // XAR_VVV2_crypto3_imm6
+  COMPARE_PREFIX(dci(0xce000000), "eor3");  // EOR3_VVV16_crypto4
+  COMPARE_PREFIX(dci(0xce200000), "bcax");  // BCAX_VVV16_crypto4
+  COMPARE_PREFIX(dci(0xce608c00), "rax1");  // RAX1_VVV2_cryptosha512_3
+  COMPARE_PREFIX(dci(0xce800000), "xar");   // XAR_VVV2_crypto3_imm6
 
   // ARMv8.2 - SHA512
-  // COMPARE_PREFIX(dci(0xce608000), "sha512h");   // SHA512H_QQV_cryptosha512_3
-  // COMPARE_PREFIX(dci(0xce608400), "sha512h2");   //
-  // SHA512H2_QQV_cryptosha512_3
-  // COMPARE_PREFIX(dci(0xce608800), "sha512su1");   //
-  // SHA512SU1_VVV2_cryptosha512_3
-  // COMPARE_PREFIX(dci(0xcec08000), "sha512su0");   //
-  // SHA512SU0_VV2_cryptosha512_2
+  COMPARE_PREFIX(dci(0xce608000), "sha512h");   // SHA512H_QQV_cryptosha512_3
+  COMPARE_PREFIX(dci(0xce608400), "sha512h2");  // SHA512H2_QQV_cryptosha512_3
+  COMPARE_PREFIX(dci(0xce608800),
+                 "sha512su1");  // SHA512SU1_VVV2_cryptosha512_3
+  COMPARE_PREFIX(dci(0xcec08000), "sha512su0");  // SHA512SU0_VV2_cryptosha512_2
 
   // ARMv8.2 - SM3
-  // COMPARE_PREFIX(dci(0xce400000), "sm3ss1");   // SM3SS1_VVV4_crypto4
-  // COMPARE_PREFIX(dci(0xce408000), "sm3tt1a");   // SM3TT1A_VVV4_crypto3_imm2
-  // COMPARE_PREFIX(dci(0xce408400), "sm3tt1b");   // SM3TT1B_VVV4_crypto3_imm2
-  // COMPARE_PREFIX(dci(0xce408800), "sm3tt2a");   // SM3TT2A_VVV4_crypto3_imm2
-  // COMPARE_PREFIX(dci(0xce408c00), "sm3tt2b");   // SM3TT2B_VVV_crypto3_imm2
-  // COMPARE_PREFIX(dci(0xce60c000), "sm3partw1");   //
-  // SM3PARTW1_VVV4_cryptosha512_3
-  // COMPARE_PREFIX(dci(0xce60c400), "sm3partw2");   //
-  // SM3PARTW2_VVV4_cryptosha512_3
+  COMPARE_PREFIX(dci(0xce400000), "sm3ss1");   // SM3SS1_VVV4_crypto4
+  COMPARE_PREFIX(dci(0xce408000), "sm3tt1a");  // SM3TT1A_VVV4_crypto3_imm2
+  COMPARE_PREFIX(dci(0xce408400), "sm3tt1b");  // SM3TT1B_VVV4_crypto3_imm2
+  COMPARE_PREFIX(dci(0xce408800), "sm3tt2a");  // SM3TT2A_VVV4_crypto3_imm2
+  COMPARE_PREFIX(dci(0xce408c00), "sm3tt2b");  // SM3TT2B_VVV_crypto3_imm2
+  COMPARE_PREFIX(dci(0xce60c000),
+                 "sm3partw1");  // SM3PARTW1_VVV4_cryptosha512_3
+  COMPARE_PREFIX(dci(0xce60c400),
+                 "sm3partw2");  // SM3PARTW2_VVV4_cryptosha512_3
 
   // ARMv8.2 - SM4
-  // COMPARE_PREFIX(dci(0xce60c800), "sm4ekey");   //
-  // SM4EKEY_VVV4_cryptosha512_3
-  // COMPARE_PREFIX(dci(0xcec08400), "sm4e");   // SM4E_VV4_cryptosha512_2
+  COMPARE_PREFIX(dci(0xce60c800), "sm4ekey");  // SM4EKEY_VVV4_cryptosha512_3
+  COMPARE_PREFIX(dci(0xcec08400), "sm4e");     // SM4E_VV4_cryptosha512_2
 
   // ARMv8.2 - SPE
   // COMPARE_PREFIX(dci(0xd503223f), "psb");   // PSB_HC_hints
 
   // ARMv8.3 - FCMA
   COMPARE_PREFIX(dci(0x2e40c400), "fcmla");  // FCMLA_asimdsame2_C
-  COMPARE_PREFIX(dci(0x2e00e400), "fcadd");  // FCADD_asimdsame2_C
+  COMPARE_PREFIX(dci(0x2e40e400), "fcadd");  // FCADD_asimdsame2_C
   COMPARE_PREFIX(dci(0x2f401000), "fcmla");  // FCMLA_asimdelem_C_H
   COMPARE_PREFIX(dci(0x6f801000), "fcmla");  // FCMLA_asimdelem_C_S
 
diff --git a/test/aarch64/test-disasm-neon-aarch64.cc b/test/aarch64/test-disasm-neon-aarch64.cc
index 14dd18a6..f50e5a60 100644
--- a/test/aarch64/test-disasm-neon-aarch64.cc
+++ b/test/aarch64/test-disasm-neon-aarch64.cc
@@ -1792,6 +1792,34 @@ TEST(neon_3same) {
   COMPARE_MACRO(Pmul(v6.V16B(), v7.V16B(), v8.V16B()),
                 "pmul v6.16b, v7.16b, v8.16b");
 
+  // Check unallocated vector types for SDOT.
+  COMPARE(dci(0x0e009400), "unallocated (Unallocated)");  // 8B
+  COMPARE(dci(0x4e009400), "unallocated (Unallocated)");  // 16B
+  COMPARE(dci(0x0e409400), "unallocated (Unallocated)");  // 4H
+  COMPARE(dci(0x4e409400), "unallocated (Unallocated)");  // 8H
+  COMPARE(dci(0x0ec09400), "unallocated (Unallocated)");  // 1D
+  COMPARE(dci(0x4ec09400), "unallocated (Unallocated)");  // 2D
+
+  // Check unallocated vector types for UDOT.
+  COMPARE(dci(0x2e009400), "unallocated (Unallocated)");  // 8B
+  COMPARE(dci(0x6e009400), "unallocated (Unallocated)");  // 16B
+  COMPARE(dci(0x2e409400), "unallocated (Unallocated)");  // 4H
+  COMPARE(dci(0x6e409400), "unallocated (Unallocated)");  // 8H
+  COMPARE(dci(0x2ec09400), "unallocated (Unallocated)");  // 1D
+  COMPARE(dci(0x6ec09400), "unallocated (Unallocated)");  // 2D
+
+  // Check unallocated vector types for SQRDMLAH.
+  COMPARE(dci(0x2e008400), "unallocated (Unallocated)");  // 8B
+  COMPARE(dci(0x6e008400), "unallocated (Unallocated)");  // 16B
+  COMPARE(dci(0x2ec08400), "unallocated (Unallocated)");  // 1D
+  COMPARE(dci(0x6ec08400), "unallocated (Unallocated)");  // 2D
+
+  // Check unallocated vector types for SQRDMLSH.
+  COMPARE(dci(0x2e008c00), "unallocated (Unallocated)");  // 8B
+  COMPARE(dci(0x6e008c00), "unallocated (Unallocated)");  // 16B
+  COMPARE(dci(0x2ec08c00), "unallocated (Unallocated)");  // 1D
+  COMPARE(dci(0x6ec08c00), "unallocated (Unallocated)");  // 2D
+
   CLEANUP();
 }
 
@@ -1924,6 +1952,16 @@ TEST(neon_3same_extra_fcadd) {
   COMPARE(dci(0x2e00ec00), "unallocated (Unallocated)");  // opcode = 0x1101
   COMPARE(dci(0x2e00fc00), "unallocated (Unallocated)");  // opcode = 0x1111
 
+  // Check unallocated vector types for FCADD.
+  COMPARE(dci(0x2e00e400), "unallocated (Unallocated)");  // 8B
+  COMPARE(dci(0x6e00e400), "unallocated (Unallocated)");  // 16B
+  COMPARE(dci(0x2ec0e400), "unallocated (Unallocated)");  // 1D
+
+  // Check unallocated vector types for FCMLA.
+  COMPARE(dci(0x2e00c400), "unallocated (Unallocated)");  // 8B
+  COMPARE(dci(0x6e00c400), "unallocated (Unallocated)");  // 16B
+  COMPARE(dci(0x2ec0c400), "unallocated (Unallocated)");  // 1D
+
   CLEANUP();
 }
 
@@ -2594,6 +2632,13 @@ TEST(neon_fp_byelement) {
   COMPARE_MACRO(Fcmla(v0.V8H(), v1.V8H(), v31.H(), 3, 0),
                 "fcmla v0.8h, v1.8h, v31.h[3], #0");
 
+  // Check unallocated vector types for FCMLA.
+  COMPARE(dci(0x2f001000), "unallocated (Unallocated)");  // 8B
+  COMPARE(dci(0x6f001000), "unallocated (Unallocated)");  // 16B
+  COMPARE(dci(0x2f801000), "unallocated (Unallocated)");  // 2S
+  COMPARE(dci(0x2fc01000), "unallocated (Unallocated)");  // 1D
+  COMPARE(dci(0x6fc01000), "unallocated (Unallocated)");  // 2D
+
   CLEANUP();
 }
 
@@ -2904,6 +2949,10 @@ TEST(neon_3different) {
                 "pmull v0.8h, v1.8b, v2.8b");
   COMPARE_MACRO(Pmull2(v2.V8H(), v3.V16B(), v4.V16B()),
                 "pmull2 v2.8h, v3.16b, v4.16b");
+  COMPARE_MACRO(Pmull(v5.V1Q(), v6.V1D(), v7.V1D()),
+                "pmull v5.1q, v6.1d, v7.1d");
+  COMPARE_MACRO(Pmull2(v8.V1Q(), v9.V2D(), v10.V2D()),
+                "pmull2 v8.1q, v9.2d, v10.2d");
 
   CLEANUP();
 }
@@ -4467,6 +4516,100 @@ TEST(neon_matmul) {
   CLEANUP();
 }
 
+TEST(neon_sha3) {
+  SETUP();
+  // Expected disassembly for Bcax, Eor3, Xar (with its #imm operand) and Rax1.
+  COMPARE_MACRO(Bcax(v0.V16B(), v1.V16B(), v2.V16B(), v3.V16B()),
+                "bcax v0.16b, v1.16b, v2.16b, v3.16b");
+  COMPARE_MACRO(Eor3(v10.V16B(), v11.V16B(), v12.V16B(), v13.V16B()),
+                "eor3 v10.16b, v11.16b, v12.16b, v13.16b");
+  COMPARE_MACRO(Xar(v20.V2D(), v21.V2D(), v22.V2D(), 42),
+                "xar v20.2d, v21.2d, v22.2d, #42");
+  COMPARE_MACRO(Rax1(v0.V2D(), v1.V2D(), v2.V2D()), "rax1 v0.2d, v1.2d, v2.2d");
+
+  CLEANUP();
+}
+
+TEST(neon_sha1) {
+  SETUP();
+  // Sha1c/Sha1m/Sha1p take q and s scalars plus a 4S vector; Sha1h is scalar.
+  COMPARE_MACRO(Sha1c(q0, s12, v20.V4S()), "sha1c q0, s12, v20.4s");
+  COMPARE_MACRO(Sha1m(q22, s2, v13.V4S()), "sha1m q22, s2, v13.4s");
+  COMPARE_MACRO(Sha1p(q31, s5, v15.V4S()), "sha1p q31, s5, v15.4s");
+  COMPARE_MACRO(Sha1su0(v19.V4S(), v9.V4S(), v27.V4S()),
+                "sha1su0 v19.4s, v9.4s, v27.4s");
+  COMPARE_MACRO(Sha1h(s12, s0), "sha1h s12, s0");
+  COMPARE_MACRO(Sha1su1(v2.V4S(), v4.V4S()), "sha1su1 v2.4s, v4.4s");
+
+  CLEANUP();
+}
+
+TEST(neon_sha2) {
+  SETUP();
+  // Sha256h/Sha256h2 take two q scalars and a 4S vector; su0/su1 are all-4S.
+  COMPARE_MACRO(Sha256h(q0, q12, v20.V4S()), "sha256h q0, q12, v20.4s");
+  COMPARE_MACRO(Sha256h2(q22, q2, v13.V4S()), "sha256h2 q22, q2, v13.4s");
+  COMPARE_MACRO(Sha256su0(v2.V4S(), v4.V4S()), "sha256su0 v2.4s, v4.4s");
+  COMPARE_MACRO(Sha256su1(v19.V4S(), v9.V4S(), v27.V4S()),
+                "sha256su1 v19.4s, v9.4s, v27.4s");
+
+  CLEANUP();
+}
+
+TEST(neon_sha512) {
+  SETUP();
+  // Sha512h/Sha512h2 take two q scalars and a 2D vector; su0/su1 are all-2D.
+  COMPARE_MACRO(Sha512h(q0, q12, v20.V2D()), "sha512h q0, q12, v20.2d");
+  COMPARE_MACRO(Sha512h2(q22, q2, v13.V2D()), "sha512h2 q22, q2, v13.2d");
+  COMPARE_MACRO(Sha512su0(v2.V2D(), v4.V2D()), "sha512su0 v2.2d, v4.2d");
+  COMPARE_MACRO(Sha512su1(v19.V2D(), v9.V2D(), v27.V2D()),
+                "sha512su1 v19.2d, v9.2d, v27.2d");
+
+  CLEANUP();
+}
+
+TEST(neon_aes) {
+  SETUP();
+  // All four AES macros take a 16B destination and a 16B source register.
+  COMPARE_MACRO(Aesd(v0.V16B(), v29.V16B()), "aesd v0.16b, v29.16b");
+  COMPARE_MACRO(Aese(v0.V16B(), v29.V16B()), "aese v0.16b, v29.16b");
+  COMPARE_MACRO(Aesimc(v0.V16B(), v29.V16B()), "aesimc v0.16b, v29.16b");
+  COMPARE_MACRO(Aesmc(v0.V16B(), v29.V16B()), "aesmc v0.16b, v29.16b");
+
+  CLEANUP();
+}
+
+TEST(neon_sm3) {
+  SETUP();
+  // The last argument of Sm3tt* is printed as a lane index, e.g. v9.s[1].
+  COMPARE_MACRO(Sm3partw1(v12.V4S(), v13.V4S(), v14.V4S()),
+                "sm3partw1 v12.4s, v13.4s, v14.4s");
+  COMPARE_MACRO(Sm3partw2(v12.V4S(), v13.V4S(), v14.V4S()),
+                "sm3partw2 v12.4s, v13.4s, v14.4s");
+  COMPARE_MACRO(Sm3ss1(v13.V4S(), v15.V4S(), v17.V4S(), v21.V4S()),
+                "sm3ss1 v13.4s, v15.4s, v17.4s, v21.4s");
+  COMPARE_MACRO(Sm3tt1a(v30.V4S(), v29.V4S(), v9.V4S(), 1),
+                "sm3tt1a v30.4s, v29.4s, v9.s[1]");
+  COMPARE_MACRO(Sm3tt1b(v30.V4S(), v29.V4S(), v9.V4S(), 3),
+                "sm3tt1b v30.4s, v29.4s, v9.s[3]");
+  COMPARE_MACRO(Sm3tt2a(v30.V4S(), v29.V4S(), v9.V4S(), 2),
+                "sm3tt2a v30.4s, v29.4s, v9.s[2]");
+  COMPARE_MACRO(Sm3tt2b(v30.V4S(), v29.V4S(), v9.V4S(), 0),
+                "sm3tt2b v30.4s, v29.4s, v9.s[0]");
+
+  CLEANUP();
+}
+
+TEST(neon_sm4) {
+  SETUP();
+  // Sm4e takes two 4S vector registers; Sm4ekey takes three.
+  COMPARE_MACRO(Sm4e(v12.V4S(), v13.V4S()), "sm4e v12.4s, v13.4s");
+  COMPARE_MACRO(Sm4ekey(v12.V4S(), v13.V4S(), v14.V4S()),
+                "sm4ekey v12.4s, v13.4s, v14.4s");
+
+  CLEANUP();
+}
+
 TEST(neon_unallocated_regression_test) {
   SETUP();
 
@@ -4562,8 +4705,6 @@ TEST(neon_unallocated_regression_test) {
   COMPARE_PREFIX(dci(0x2efb9dbd), "unallocated");  // pmul v.und, v.und, v.und
   COMPARE_PREFIX(dci(0x4eace101), "unallocated");  // pmull v.d, v.s, v.s
   COMPARE_PREFIX(dci(0x0e6de3ad), "unallocated");  // pmull v.s, v.h, v.h
-  COMPARE_PREFIX(dci(0x4ee3e2c0), "unallocated");  // pmull v.und, v.d, v.d
-  COMPARE_PREFIX(dci(0x0eede060), "unallocated");  // pmull v.und, v.und, v.und
   COMPARE_PREFIX(dci(0x6ee00afd), "unallocated");  // rev v.d, v.d
   COMPARE_PREFIX(dci(0x4e601975), "unallocated");  // rev v.h, v.h
   COMPARE_PREFIX(dci(0x4ea019f3), "unallocated");  // rev v.s, v.s
@@ -4633,10 +4774,14 @@ TEST(neon_unallocated_regression_test) {
   COMPARE_PREFIX(dci(0x6fd6d80f), "unallocated");  // sqrdmlah v.d, v.d, v.d[]
   COMPARE_PREFIX(dci(0x2fecdae5),
                  "unallocated");  // sqrdmlah v.und, v.und, v.d[]
+  COMPARE_PREFIX(dci(0x7e008429), "unallocated");  // sqrdmlah b9, b1, b0
+  COMPARE_PREFIX(dci(0x7ec08429), "unallocated");  // sqrdmlah d9, d1, d0
   COMPARE_PREFIX(dci(0x7fe0f992), "unallocated");  // sqrdmlsh d, d, v.d[]
   COMPARE_PREFIX(dci(0x6ff1f9df), "unallocated");  // sqrdmlsh v.d, v.d, v.d[]
   COMPARE_PREFIX(dci(0x2fcdfad1),
                  "unallocated");  // sqrdmlsh v.und, v.und, v.d[]
+  COMPARE_PREFIX(dci(0x7e008c29), "unallocated");  // sqrdmlsh b9, b1, b0
+  COMPARE_PREFIX(dci(0x7ec08c29), "unallocated");  // sqrdmlsh d9, d1, d0
   COMPARE_PREFIX(dci(0x7e23b7fa), "unallocated");  // sqrdmulh b, b, b
   COMPARE_PREFIX(dci(0x5f1ad272), "unallocated");  // sqrdmulh b, b, v.b[]
   COMPARE_PREFIX(dci(0x7ef8b6e0), "unallocated");  // sqrdmulh d, d, d
diff --git a/test/aarch64/test-disasm-sve-aarch64.cc b/test/aarch64/test-disasm-sve-aarch64.cc
index 5e001e7e..fbdff335 100644
--- a/test/aarch64/test-disasm-sve-aarch64.cc
+++ b/test/aarch64/test-disasm-sve-aarch64.cc
@@ -7673,13 +7673,14 @@ TEST(sve2_integer_multiply_long_vector) {
   COMPARE(sqdmullt(z7.VnD(), z4.VnS(), z0.VnS(), 0),
           "sqdmullt z7.d, z4.s, z0.s[0]");
 
-  // Feature `SVEPmull128` is not supported.
-  // COMPARE(pmullb(z12.VnQ(), z21.VnD(), z12.VnD()),
-  //                "pmullb z12.q, z21.d, z12.d");
   COMPARE(pmullb(z12.VnH(), z21.VnB(), z12.VnB()),
           "pmullb z12.h, z21.b, z12.b");
   COMPARE(pmullt(z31.VnD(), z30.VnS(), z26.VnS()),
           "pmullt z31.d, z30.s, z26.s");
+  COMPARE(pmullb(z12.VnQ(), z21.VnD(), z12.VnD()),
+          "pmullb z12.q, z21.d, z12.d");
+  COMPARE(pmullt(z12.VnQ(), z21.VnD(), z12.VnD()),
+          "pmullt z12.q, z21.d, z12.d");
 
   COMPARE(smullb(z10.VnD(), z4.VnS(), z4.VnS()), "smullb z10.d, z4.s, z4.s");
   COMPARE(smullb(z11.VnH(), z14.VnB(), z14.VnB()),
@@ -7701,6 +7702,10 @@ TEST(sve2_integer_multiply_long_vector) {
   COMPARE(umullt(z24.VnH(), z7.VnB(), z16.VnB()), "umullt z24.h, z7.b, z16.b");
   COMPARE(umullt(z24.VnS(), z8.VnH(), z26.VnH()), "umullt z24.s, z8.h, z26.h");
 
+  // Check related but undefined encodings.
+  COMPARE(dci(0x45806800), "unallocated (Unallocated)");  // pmullb s, h, h
+  COMPARE(dci(0x45806c00), "unallocated (Unallocated)");  // pmullt s, h, h
+
   CLEANUP();
 }
 
diff --git a/test/aarch64/test-simulator-aarch64.cc b/test/aarch64/test-simulator-aarch64.cc
index 0a9dabed..e9d8fdbd 100644
--- a/test/aarch64/test-simulator-aarch64.cc
+++ b/test/aarch64/test-simulator-aarch64.cc
@@ -102,6 +102,95 @@ namespace aarch64 {
   /* The simulator can run every test. */                               \
   *skipped = false
 
+#ifdef VIXL_ENABLE_IMPLICIT_CHECKS
+// The signal handler needs access to the simulator.
+Simulator* gImplicitCheckSim;
+
+#ifdef __x86_64__
+#include <signal.h>
+#include <ucontext.h>
+void HandleSegFault(int sig, siginfo_t* info, void* context) {
+  USE(sig);   // Only installed for SIGSEGV by START_IMPLICIT_CHECK().
+  USE(info);  // The fault is identified by its host pc, not by siginfo.
+  Simulator* sim = gImplicitCheckSim;
+
+  // Did the signal come from the simulator? Check the faulting host pc (RIP).
+  ucontext_t* uc = reinterpret_cast<ucontext_t*>(context);
+  uintptr_t fault_pc = uc->uc_mcontext.gregs[REG_RIP];
+  VIXL_CHECK(sim->IsSimulatedMemoryAccess(fault_pc));
+
+  // Increment the counter (simulated x1) each time we handle a signal.
+  int64_t counter = reinterpret_cast<int64_t>(sim->ReadXRegister(1));
+  sim->WriteXRegister(1, ++counter);
+
+  // Return to the VIXL memory access continuation point, which is also the
+  // next instruction, after this handler.
+  uc->uc_mcontext.gregs[REG_RIP] = sim->GetSignalReturnAddress();
+  // Return that the memory access failed, via the host's RAX.
+  uc->uc_mcontext.gregs[REG_RAX] =
+      static_cast<greg_t>(MemoryAccessResult::Failure);
+}
+#endif  // __x86_64__
+
+// Start an implicit check test with a counter (x1) and start label so the
+// number of faults can be counted. Installs HandleSegFault for SIGSEGV. Note:
+// each instruction after the start will be expected to fault.
+#define START_IMPLICIT_CHECK()                                                \
+  gImplicitCheckSim = &simulator;                                             \
+  /* Set up a signal handler to count the number of faulting instructions. */ \
+  struct sigaction sa = {};                                                   \
+  sa.sa_sigaction = HandleSegFault;                                           \
+  sa.sa_flags = SA_SIGINFO; /* Handler uses the three-argument form. */       \
+  VIXL_CHECK(sigaction(SIGSEGV, &sa, NULL) == 0);                             \
+  START();                                                                    \
+  __ Mov(x1, 0); /* Reset the counter. */                                     \
+  /* Use a consistent bad address. */                                         \
+  __ Mov(x15, xzr);                                                           \
+  __ Mov(ip0, xzr);                                                           \
+  /* Load an amount of data to load. */                                       \
+  __ Mov(ip1, 4096);                                                          \
+  [[maybe_unused]] MemOperand bad_memory = MemOperand(ip0);                   \
+  if (masm.GetCPUFeatures()->Has(CPUFeatures::kSVE)) {                        \
+    /* Turn on all lanes to ensure all loads/stores are tested. */            \
+    __ Ptrue(p0.VnB());                                                       \
+    __ Ptrue(p1.VnB());                                                       \
+    __ Ptrue(p2.VnB());                                                       \
+    __ Ptrue(p3.VnB());                                                       \
+    __ Ptrue(p4.VnB());                                                       \
+    __ Ptrue(p5.VnB());                                                       \
+    __ Ptrue(p6.VnB());                                                       \
+    __ Ptrue(p7.VnB());                                                       \
+    __ Ptrue(p8.VnB());                                                       \
+    __ Ptrue(p9.VnB());                                                       \
+    __ Ptrue(p10.VnB());                                                      \
+    __ Ptrue(p11.VnB());                                                      \
+    __ Ptrue(p12.VnB());                                                      \
+    __ Ptrue(p13.VnB());                                                      \
+    __ Ptrue(p14.VnB());                                                      \
+    __ Ptrue(p15.VnB());                                                      \
+  }                                                                           \
+  Label l_start, l_end;                                                       \
+  __ Bind(&l_start);
+
+#define END_IMPLICIT_CHECK()                 \
+  __ Bind(&l_end);                           \
+  /* Return the fault counter (x1) in x0. */ \
+  __ Mov(x0, x1);                            \
+  END();
+
+#define TRY_RUN_IMPLICIT_CHECK()                                              \
+  bool skipped;                                                               \
+  TRY_RUN(&skipped);                                                          \
+  /* Implicit checks should only be used with the simulator. */               \
+  VIXL_CHECK(!skipped);                                                       \
+  /* Check that the number of handled segfaults (x0) matches the number of */ \
+  /* instructions emitted between l_start and l_end. */                       \
+  size_t result = simulator.ReadXRegister(0);                                 \
+  size_t num_of_faulting_instr = masm.GetSizeOfCodeGeneratedSince(&l_start) - \
+                                 masm.GetSizeOfCodeGeneratedSince(&l_end);    \
+  VIXL_CHECK((result * kInstructionSize) == num_of_faulting_instr);
+
+#endif  // VIXL_ENABLE_IMPLICIT_CHECKS
 
 #else  // VIXL_INCLUDE_SIMULATOR_AARCH64
 
@@ -2850,7 +2939,7 @@ static void TestOpImmOpImmNEON(const char* name,
         }
       }
     }
-    VIXL_ASSERT(counted_length == expected_length);
+    VIXL_CHECK(counted_length == expected_length);
     if (error_count > kErrorReportLimit) {
       printf("%u other errors follow.\n", error_count - kErrorReportLimit);
     }
@@ -5012,6 +5101,802 @@ DEFINE_TEST_NEON_FHM_BYELEMENT(fmlsl, Basic, Basic, Basic)
 DEFINE_TEST_NEON_FHM_BYELEMENT(fmlsl2, Basic, Basic, Basic)
 
 
+#ifdef VIXL_ENABLE_IMPLICIT_CHECKS
+TEST(ImplicitCheck) {
+  SETUP_WITH_FEATURES(CPUFeatures::kNEON);
+  START_IMPLICIT_CHECK();
+
+  EmissionCheckScope guard(&masm, masm.GetBuffer()->GetRemainingBytes());
+  // Invalid memory reads. All accesses use bad_memory ([ip0], with ip0 = 0).
+  __ ldar(w3, bad_memory);
+  __ ldar(x4, bad_memory);
+  __ ldarb(w5, bad_memory);
+  __ ldarb(x6, bad_memory);
+  __ ldarh(w7, bad_memory);
+  __ ldarh(x8, bad_memory);
+  __ ldaxp(w9, w10, bad_memory);
+  __ ldaxp(x11, x12, bad_memory);
+  __ ldaxr(w13, bad_memory);
+  __ ldaxr(x14, bad_memory);
+  __ ldaxrb(w15, bad_memory);
+  __ ldaxrb(x16, bad_memory);
+  __ ldaxrh(w17, bad_memory);
+  __ ldaxrh(x18, bad_memory);
+  __ ldnp(w19, w20, bad_memory);
+  __ ldnp(x21, x22, bad_memory);
+  __ ldp(w23, w24, bad_memory);
+  __ ldp(x25, x26, bad_memory);
+  __ ldpsw(x27, x28, bad_memory);
+  __ ldr(w29, bad_memory);
+  __ ldr(x2, bad_memory);
+  __ ldrb(w3, bad_memory);
+  __ ldrb(x4, bad_memory);
+  __ ldrh(w5, bad_memory);
+  __ ldrh(x6, bad_memory);
+  __ ldrsb(w7, bad_memory);
+  __ ldrsb(x8, bad_memory);
+  __ ldrsh(w9, bad_memory);
+  __ ldrsh(x10, bad_memory);
+  __ ldrsw(x11, bad_memory);
+  __ ldur(w12, bad_memory);
+  __ ldur(x13, bad_memory);
+  __ ldurb(w14, bad_memory);
+  __ ldurb(x15, bad_memory);
+  __ ldurh(w16, bad_memory);
+  __ ldurh(x17, bad_memory);
+  __ ldursb(w18, bad_memory);
+  __ ldursb(x19, bad_memory);
+  __ ldursh(w20, bad_memory);
+  __ ldursh(x21, bad_memory);
+  __ ldursw(x22, bad_memory);
+  __ ldxp(w23, w24, bad_memory);
+  __ ldxp(x25, x26, bad_memory);
+  __ ldxr(w27, bad_memory);
+  __ ldxr(x28, bad_memory);
+  __ ldxrb(w29, bad_memory);
+  __ ldxrb(x2, bad_memory);
+  __ ldxrh(w3, bad_memory);
+  __ ldxrh(x4, bad_memory);
+
+  // Invalid memory writes. Note: exclusive store instructions are not tested
+  // because they can fail due to the global monitor before trying to perform a
+  // memory store.
+  __ stlr(w18, bad_memory);
+  __ stlr(x19, bad_memory);
+  __ stlrb(w20, bad_memory);
+  __ stlrb(x21, bad_memory);
+  __ stlrh(w22, bad_memory);
+  __ stlrh(x23, bad_memory);
+  __ stnp(w14, w15, bad_memory);
+  __ stnp(x16, x17, bad_memory);
+  __ stp(w18, w19, bad_memory);
+  __ stp(x20, x21, bad_memory);
+  __ str(w22, bad_memory);
+  __ str(x23, bad_memory);
+  __ strb(w24, bad_memory);
+  __ strb(x25, bad_memory);
+  __ strh(w26, bad_memory);
+  __ strh(x27, bad_memory);
+  __ stur(w28, bad_memory);
+  __ stur(x29, bad_memory);
+  __ sturb(w2, bad_memory);
+  __ sturb(x3, bad_memory);
+  __ sturh(w4, bad_memory);
+  __ sturh(x5, bad_memory);
+
+  END_IMPLICIT_CHECK();
+  TRY_RUN_IMPLICIT_CHECK();  // Checks fault count == instruction count.
+}
+
+TEST(ImplicitCheckNeon) {
+  SETUP_WITH_FEATURES(CPUFeatures::kNEON);
+  START_IMPLICIT_CHECK();
+
+  EmissionCheckScope guard(&masm, masm.GetBuffer()->GetRemainingBytes());
+  __ ld1(v18.V16B(), v19.V16B(), v20.V16B(), v21.V16B(), bad_memory);
+  __ ld1(v23.V16B(), v24.V16B(), v25.V16B(), v26.V16B(), bad_memory);
+  __ ld1(v5.V16B(), v6.V16B(), v7.V16B(), v8.V16B(), bad_memory);
+  __ ld1(v18.V16B(), v19.V16B(), v20.V16B(), bad_memory);
+  __ ld1(v13.V16B(), v14.V16B(), v15.V16B(), bad_memory);
+  __ ld1(v19.V16B(), v20.V16B(), v21.V16B(), bad_memory);
+  __ ld1(v17.V16B(), v18.V16B(), bad_memory);
+  __ ld1(v20.V16B(), v21.V16B(), bad_memory);
+  __ ld1(v28.V16B(), v29.V16B(), bad_memory);
+  __ ld1(v29.V16B(), bad_memory);
+  __ ld1(v21.V16B(), bad_memory);
+  __ ld1(v4.V16B(), bad_memory);
+  __ ld1(v4.V1D(), v5.V1D(), v6.V1D(), v7.V1D(), bad_memory);
+  __ ld1(v17.V1D(), v18.V1D(), v19.V1D(), v20.V1D(), bad_memory);
+  __ ld1(v28.V1D(), v29.V1D(), v30.V1D(), v31.V1D(), bad_memory);
+  __ ld1(v20.V1D(), v21.V1D(), v22.V1D(), bad_memory);
+  __ ld1(v19.V1D(), v20.V1D(), v21.V1D(), bad_memory);
+  __ ld1(v12.V1D(), v13.V1D(), v14.V1D(), bad_memory);
+  __ ld1(v29.V1D(), v30.V1D(), bad_memory);
+  __ ld1(v31.V1D(), v0.V1D(), bad_memory);
+  __ ld1(v3.V1D(), v4.V1D(), bad_memory);
+  __ ld1(v28.V1D(), bad_memory);
+  __ ld1(v11.V1D(), bad_memory);
+  __ ld1(v29.V1D(), bad_memory);
+  __ ld1(v28.V2D(), v29.V2D(), v30.V2D(), v31.V2D(), bad_memory);
+  __ ld1(v8.V2D(), v9.V2D(), v10.V2D(), v11.V2D(), bad_memory);
+  __ ld1(v14.V2D(), v15.V2D(), v16.V2D(), v17.V2D(), bad_memory);
+  __ ld1(v26.V2D(), v27.V2D(), v28.V2D(), bad_memory);
+  __ ld1(v5.V2D(), v6.V2D(), v7.V2D(), bad_memory);
+  __ ld1(v26.V2D(), v27.V2D(), v28.V2D(), bad_memory);
+  __ ld1(v18.V2D(), v19.V2D(), bad_memory);
+  __ ld1(v21.V2D(), v22.V2D(), bad_memory);
+  __ ld1(v17.V2D(), v18.V2D(), bad_memory);
+  __ ld1(v5.V2D(), bad_memory);
+  __ ld1(v6.V2D(), bad_memory);
+  __ ld1(v15.V2D(), bad_memory);
+  __ ld1(v30.V2S(), v31.V2S(), v0.V2S(), v1.V2S(), bad_memory);
+  __ ld1(v24.V2S(), v25.V2S(), v26.V2S(), v27.V2S(), bad_memory);
+  __ ld1(v27.V2S(), v28.V2S(), v29.V2S(), v30.V2S(), bad_memory);
+  __ ld1(v11.V2S(), v12.V2S(), v13.V2S(), bad_memory);
+  __ ld1(v8.V2S(), v9.V2S(), v10.V2S(), bad_memory);
+  __ ld1(v31.V2S(), v0.V2S(), v1.V2S(), bad_memory);
+  __ ld1(v0.V2S(), v1.V2S(), bad_memory);
+  __ ld1(v13.V2S(), v14.V2S(), bad_memory);
+  __ ld1(v3.V2S(), v4.V2S(), bad_memory);
+  __ ld1(v26.V2S(), bad_memory);
+  __ ld1(v0.V2S(), bad_memory);
+  __ ld1(v11.V2S(), bad_memory);
+  __ ld1(v16.V4H(), v17.V4H(), v18.V4H(), v19.V4H(), bad_memory);
+  __ ld1(v24.V4H(), v25.V4H(), v26.V4H(), v27.V4H(), bad_memory);
+  __ ld1(v1.V4H(), v2.V4H(), v3.V4H(), v4.V4H(), bad_memory);
+  __ ld1(v30.V4H(), v31.V4H(), v0.V4H(), bad_memory);
+  __ ld1(v25.V4H(), v26.V4H(), v27.V4H(), bad_memory);
+  __ ld1(v3.V4H(), v4.V4H(), v5.V4H(), bad_memory);
+  __ ld1(v3.V4H(), v4.V4H(), bad_memory);
+  __ ld1(v3.V4H(), v4.V4H(), bad_memory);
+  __ ld1(v23.V4H(), v24.V4H(), bad_memory);
+  __ ld1(v26.V4H(), bad_memory);
+  __ ld1(v1.V4H(), bad_memory);
+  __ ld1(v14.V4H(), bad_memory);
+  __ ld1(v26.V4S(), v27.V4S(), v28.V4S(), v29.V4S(), bad_memory);
+  __ ld1(v28.V4S(), v29.V4S(), v30.V4S(), v31.V4S(), bad_memory);
+  __ ld1(v4.V4S(), v5.V4S(), v6.V4S(), v7.V4S(), bad_memory);
+  __ ld1(v2.V4S(), v3.V4S(), v4.V4S(), bad_memory);
+  __ ld1(v22.V4S(), v23.V4S(), v24.V4S(), bad_memory);
+  __ ld1(v15.V4S(), v16.V4S(), v17.V4S(), bad_memory);
+  __ ld1(v20.V4S(), v21.V4S(), bad_memory);
+  __ ld1(v30.V4S(), v31.V4S(), bad_memory);
+  __ ld1(v11.V4S(), v12.V4S(), bad_memory);
+  __ ld1(v15.V4S(), bad_memory);
+  __ ld1(v12.V4S(), bad_memory);
+  __ ld1(v0.V4S(), bad_memory);
+  __ ld1(v17.V8B(), v18.V8B(), v19.V8B(), v20.V8B(), bad_memory);
+  __ ld1(v5.V8B(), v6.V8B(), v7.V8B(), v8.V8B(), bad_memory);
+  __ ld1(v9.V8B(), v10.V8B(), v11.V8B(), v12.V8B(), bad_memory);
+  __ ld1(v4.V8B(), v5.V8B(), v6.V8B(), bad_memory);
+  __ ld1(v2.V8B(), v3.V8B(), v4.V8B(), bad_memory);
+  __ ld1(v12.V8B(), v13.V8B(), v14.V8B(), bad_memory);
+  __ ld1(v10.V8B(), v11.V8B(), bad_memory);
+  __ ld1(v11.V8B(), v12.V8B(), bad_memory);
+  __ ld1(v27.V8B(), v28.V8B(), bad_memory);
+  __ ld1(v31.V8B(), bad_memory);
+  __ ld1(v10.V8B(), bad_memory);
+  __ ld1(v28.V8B(), bad_memory);
+  __ ld1(v5.V8H(), v6.V8H(), v7.V8H(), v8.V8H(), bad_memory);
+  __ ld1(v2.V8H(), v3.V8H(), v4.V8H(), v5.V8H(), bad_memory);
+  __ ld1(v10.V8H(), v11.V8H(), v12.V8H(), v13.V8H(), bad_memory);
+  __ ld1(v26.V8H(), v27.V8H(), v28.V8H(), bad_memory);
+  __ ld1(v3.V8H(), v4.V8H(), v5.V8H(), bad_memory);
+  __ ld1(v17.V8H(), v18.V8H(), v19.V8H(), bad_memory);
+  __ ld1(v4.V8H(), v5.V8H(), bad_memory);
+  __ ld1(v21.V8H(), v22.V8H(), bad_memory);
+  __ ld1(v4.V8H(), v5.V8H(), bad_memory);
+  __ ld1(v9.V8H(), bad_memory);
+  __ ld1(v27.V8H(), bad_memory);
+  __ ld1(v26.V8H(), bad_memory);
+  __ ld1(v19.B(), 1, bad_memory);
+  __ ld1(v12.B(), 3, bad_memory);
+  __ ld1(v27.B(), 12, bad_memory);
+  __ ld1(v10.D(), 1, bad_memory);
+  __ ld1(v26.D(), 1, bad_memory);
+  __ ld1(v7.D(), 1, bad_memory);
+  __ ld1(v19.H(), 5, bad_memory);
+  __ ld1(v10.H(), 1, bad_memory);
+  __ ld1(v5.H(), 4, bad_memory);
+  __ ld1(v21.S(), 2, bad_memory);
+  __ ld1(v13.S(), 2, bad_memory);
+  __ ld1(v1.S(), 2, bad_memory);
+  __ ld1r(v2.V16B(), bad_memory);
+  __ ld1r(v2.V16B(), bad_memory);
+  __ ld1r(v22.V16B(), bad_memory);
+  __ ld1r(v25.V1D(), bad_memory);
+  __ ld1r(v9.V1D(), bad_memory);
+  __ ld1r(v23.V1D(), bad_memory);
+  __ ld1r(v19.V2D(), bad_memory);
+  __ ld1r(v21.V2D(), bad_memory);
+  __ ld1r(v30.V2D(), bad_memory);
+  __ ld1r(v24.V2S(), bad_memory);
+  __ ld1r(v26.V2S(), bad_memory);
+  __ ld1r(v28.V2S(), bad_memory);
+  __ ld1r(v19.V4H(), bad_memory);
+  __ ld1r(v1.V4H(), bad_memory);
+  __ ld1r(v21.V4H(), bad_memory);
+  __ ld1r(v15.V4S(), bad_memory);
+  __ ld1r(v21.V4S(), bad_memory);
+  __ ld1r(v23.V4S(), bad_memory);
+  __ ld1r(v26.V8B(), bad_memory);
+  __ ld1r(v14.V8B(), bad_memory);
+  __ ld1r(v19.V8B(), bad_memory);
+  __ ld1r(v13.V8H(), bad_memory);
+  __ ld1r(v30.V8H(), bad_memory);
+  __ ld1r(v27.V8H(), bad_memory);
+  __ ld2(v21.V16B(), v22.V16B(), bad_memory);
+  __ ld2(v21.V16B(), v22.V16B(), bad_memory);
+  __ ld2(v12.V16B(), v13.V16B(), bad_memory);
+  __ ld2(v14.V2D(), v15.V2D(), bad_memory);
+  __ ld2(v0.V2D(), v1.V2D(), bad_memory);
+  __ ld2(v12.V2D(), v13.V2D(), bad_memory);
+  __ ld2(v27.V2S(), v28.V2S(), bad_memory);
+  __ ld2(v2.V2S(), v3.V2S(), bad_memory);
+  __ ld2(v12.V2S(), v13.V2S(), bad_memory);
+  __ ld2(v9.V4H(), v10.V4H(), bad_memory);
+  __ ld2(v23.V4H(), v24.V4H(), bad_memory);
+  __ ld2(v1.V4H(), v2.V4H(), bad_memory);
+  __ ld2(v20.V4S(), v21.V4S(), bad_memory);
+  __ ld2(v10.V4S(), v11.V4S(), bad_memory);
+  __ ld2(v24.V4S(), v25.V4S(), bad_memory);
+  __ ld2(v17.V8B(), v18.V8B(), bad_memory);
+  __ ld2(v13.V8B(), v14.V8B(), bad_memory);
+  __ ld2(v7.V8B(), v8.V8B(), bad_memory);
+  __ ld2(v30.V8H(), v31.V8H(), bad_memory);
+  __ ld2(v4.V8H(), v5.V8H(), bad_memory);
+  __ ld2(v13.V8H(), v14.V8H(), bad_memory);
+  __ ld2(v5.B(), v6.B(), 12, bad_memory);
+  __ ld2(v16.B(), v17.B(), 7, bad_memory);
+  __ ld2(v29.B(), v30.B(), 2, bad_memory);
+  __ ld2(v11.D(), v12.D(), 1, bad_memory);
+  __ ld2(v26.D(), v27.D(), 0, bad_memory);
+  __ ld2(v25.D(), v26.D(), 0, bad_memory);
+  __ ld2(v18.H(), v19.H(), 7, bad_memory);
+  __ ld2(v17.H(), v18.H(), 5, bad_memory);
+  __ ld2(v30.H(), v31.H(), 2, bad_memory);
+  __ ld2(v29.S(), v30.S(), 3, bad_memory);
+  __ ld2(v28.S(), v29.S(), 0, bad_memory);
+  __ ld2(v6.S(), v7.S(), 1, bad_memory);
+  __ ld2r(v26.V16B(), v27.V16B(), bad_memory);
+  __ ld2r(v21.V16B(), v22.V16B(), bad_memory);
+  __ ld2r(v5.V16B(), v6.V16B(), bad_memory);
+  __ ld2r(v26.V1D(), v27.V1D(), bad_memory);
+  __ ld2r(v14.V1D(), v15.V1D(), bad_memory);
+  __ ld2r(v23.V1D(), v24.V1D(), bad_memory);
+  __ ld2r(v11.V2D(), v12.V2D(), bad_memory);
+  __ ld2r(v29.V2D(), v30.V2D(), bad_memory);
+  __ ld2r(v15.V2D(), v16.V2D(), bad_memory);
+  __ ld2r(v26.V2S(), v27.V2S(), bad_memory);
+  __ ld2r(v22.V2S(), v23.V2S(), bad_memory);
+  __ ld2r(v2.V2S(), v3.V2S(), bad_memory);
+  __ ld2r(v2.V4H(), v3.V4H(), bad_memory);
+  __ ld2r(v9.V4H(), v10.V4H(), bad_memory);
+  __ ld2r(v6.V4H(), v7.V4H(), bad_memory);
+  __ ld2r(v7.V4S(), v8.V4S(), bad_memory);
+  __ ld2r(v19.V4S(), v20.V4S(), bad_memory);
+  __ ld2r(v21.V4S(), v22.V4S(), bad_memory);
+  __ ld2r(v26.V8B(), v27.V8B(), bad_memory);
+  __ ld2r(v20.V8B(), v21.V8B(), bad_memory);
+  __ ld2r(v11.V8B(), v12.V8B(), bad_memory);
+  __ ld2r(v12.V8H(), v13.V8H(), bad_memory);
+  __ ld2r(v6.V8H(), v7.V8H(), bad_memory);
+  __ ld2r(v25.V8H(), v26.V8H(), bad_memory);
+  __ ld3(v20.V16B(), v21.V16B(), v22.V16B(), bad_memory);
+  __ ld3(v28.V16B(), v29.V16B(), v30.V16B(), bad_memory);
+  __ ld3(v20.V16B(), v21.V16B(), v22.V16B(), bad_memory);
+  __ ld3(v21.V2D(), v22.V2D(), v23.V2D(), bad_memory);
+  __ ld3(v18.V2D(), v19.V2D(), v20.V2D(), bad_memory);
+  __ ld3(v27.V2D(), v28.V2D(), v29.V2D(), bad_memory);
+  __ ld3(v7.V2S(), v8.V2S(), v9.V2S(), bad_memory);
+  __ ld3(v20.V2S(), v21.V2S(), v22.V2S(), bad_memory);
+  __ ld3(v26.V2S(), v27.V2S(), v28.V2S(), bad_memory);
+  __ ld3(v27.V4H(), v28.V4H(), v29.V4H(), bad_memory);
+  __ ld3(v28.V4H(), v29.V4H(), v30.V4H(), bad_memory);
+  __ ld3(v7.V4H(), v8.V4H(), v9.V4H(), bad_memory);
+  __ ld3(v2.V4S(), v3.V4S(), v4.V4S(), bad_memory);
+  __ ld3(v24.V4S(), v25.V4S(), v26.V4S(), bad_memory);
+  __ ld3(v11.V4S(), v12.V4S(), v13.V4S(), bad_memory);
+  __ ld3(v29.V8B(), v30.V8B(), v31.V8B(), bad_memory);
+  __ ld3(v1.V8B(), v2.V8B(), v3.V8B(), bad_memory);
+  __ ld3(v12.V8B(), v13.V8B(), v14.V8B(), bad_memory);
+  __ ld3(v22.V8H(), v23.V8H(), v24.V8H(), bad_memory);
+  __ ld3(v13.V8H(), v14.V8H(), v15.V8H(), bad_memory);
+  __ ld3(v28.V8H(), v29.V8H(), v30.V8H(), bad_memory);
+  __ ld3(v21.B(), v22.B(), v23.B(), 11, bad_memory);
+  __ ld3(v5.B(), v6.B(), v7.B(), 9, bad_memory);
+  __ ld3(v23.B(), v24.B(), v25.B(), 0, bad_memory);
+  __ ld3(v16.D(), v17.D(), v18.D(), 0, bad_memory);
+  __ ld3(v30.D(), v31.D(), v0.D(), 0, bad_memory);
+  __ ld3(v28.D(), v29.D(), v30.D(), 1, bad_memory);
+  __ ld3(v13.H(), v14.H(), v15.H(), 2, bad_memory);
+  __ ld3(v22.H(), v23.H(), v24.H(), 7, bad_memory);
+  __ ld3(v14.H(), v15.H(), v16.H(), 3, bad_memory);
+  __ ld3(v22.S(), v23.S(), v24.S(), 3, bad_memory);
+  __ ld3(v30.S(), v31.S(), v0.S(), 2, bad_memory);
+  __ ld3(v12.S(), v13.S(), v14.S(), 1, bad_memory);
+  __ ld3r(v24.V16B(), v25.V16B(), v26.V16B(), bad_memory);
+  __ ld3r(v24.V16B(), v25.V16B(), v26.V16B(), bad_memory);
+  __ ld3r(v3.V16B(), v4.V16B(), v5.V16B(), bad_memory);
+  __ ld3r(v4.V1D(), v5.V1D(), v6.V1D(), bad_memory);
+  __ ld3r(v7.V1D(), v8.V1D(), v9.V1D(), bad_memory);
+  __ ld3r(v17.V1D(), v18.V1D(), v19.V1D(), bad_memory);
+  __ ld3r(v16.V2D(), v17.V2D(), v18.V2D(), bad_memory);
+  __ ld3r(v20.V2D(), v21.V2D(), v22.V2D(), bad_memory);
+  __ ld3r(v14.V2D(), v15.V2D(), v16.V2D(), bad_memory);
+  __ ld3r(v10.V2S(), v11.V2S(), v12.V2S(), bad_memory);
+  __ ld3r(v0.V2S(), v1.V2S(), v2.V2S(), bad_memory);
+  __ ld3r(v23.V2S(), v24.V2S(), v25.V2S(), bad_memory);
+  __ ld3r(v22.V4H(), v23.V4H(), v24.V4H(), bad_memory);
+  __ ld3r(v6.V4H(), v7.V4H(), v8.V4H(), bad_memory);
+  __ ld3r(v7.V4H(), v8.V4H(), v9.V4H(), bad_memory);
+  __ ld3r(v26.V4S(), v27.V4S(), v28.V4S(), bad_memory);
+  __ ld3r(v0.V4S(), v1.V4S(), v2.V4S(), bad_memory);
+  __ ld3r(v30.V4S(), v31.V4S(), v0.V4S(), bad_memory);
+  __ ld3r(v2.V8B(), v3.V8B(), v4.V8B(), bad_memory);
+  __ ld3r(v10.V8B(), v11.V8B(), v12.V8B(), bad_memory);
+  __ ld3r(v28.V8B(), v29.V8B(), v30.V8B(), bad_memory);
+  __ ld3r(v6.V8H(), v7.V8H(), v8.V8H(), bad_memory);
+  __ ld3r(v29.V8H(), v30.V8H(), v31.V8H(), bad_memory);
+  __ ld3r(v7.V8H(), v8.V8H(), v9.V8H(), bad_memory);
+  __ ld4(v3.V16B(), v4.V16B(), v5.V16B(), v6.V16B(), bad_memory);
+  __ ld4(v2.V16B(), v3.V16B(), v4.V16B(), v5.V16B(), bad_memory);
+  __ ld4(v5.V16B(), v6.V16B(), v7.V16B(), v8.V16B(), bad_memory);
+  __ ld4(v18.V2D(), v19.V2D(), v20.V2D(), v21.V2D(), bad_memory);
+  __ ld4(v4.V2D(), v5.V2D(), v6.V2D(), v7.V2D(), bad_memory);
+  __ ld4(v29.V2D(), v30.V2D(), v31.V2D(), v0.V2D(), bad_memory);
+  __ ld4(v27.V2S(), v28.V2S(), v29.V2S(), v30.V2S(), bad_memory);
+  __ ld4(v24.V2S(), v25.V2S(), v26.V2S(), v27.V2S(), bad_memory);
+  __ ld4(v4.V2S(), v5.V2S(), v6.V2S(), v7.V2S(), bad_memory);
+  __ ld4(v16.V4H(), v17.V4H(), v18.V4H(), v19.V4H(), bad_memory);
+  __ ld4(v23.V4H(), v24.V4H(), v25.V4H(), v26.V4H(), bad_memory);
+  __ ld4(v2.V4H(), v3.V4H(), v4.V4H(), v5.V4H(), bad_memory);
+  __ ld4(v7.V4S(), v8.V4S(), v9.V4S(), v10.V4S(), bad_memory);
+  __ ld4(v28.V4S(), v29.V4S(), v30.V4S(), v31.V4S(), bad_memory);
+  __ ld4(v29.V4S(), v30.V4S(), v31.V4S(), v0.V4S(), bad_memory);
+  __ ld4(v15.V8B(), v16.V8B(), v17.V8B(), v18.V8B(), bad_memory);
+  __ ld4(v27.V8B(), v28.V8B(), v29.V8B(), v30.V8B(), bad_memory);
+  __ ld4(v5.V8B(), v6.V8B(), v7.V8B(), v8.V8B(), bad_memory);
+  __ ld4(v25.V8H(), v26.V8H(), v27.V8H(), v28.V8H(), bad_memory);
+  __ ld4(v2.V8H(), v3.V8H(), v4.V8H(), v5.V8H(), bad_memory);
+  __ ld4(v20.V8H(), v21.V8H(), v22.V8H(), v23.V8H(), bad_memory);
+  __ ld4(v20.B(), v21.B(), v22.B(), v23.B(), 3, bad_memory);
+  __ ld4(v12.B(), v13.B(), v14.B(), v15.B(), 3, bad_memory);
+  __ ld4(v27.B(), v28.B(), v29.B(), v30.B(), 6, bad_memory);
+  __ ld4(v28.D(), v29.D(), v30.D(), v31.D(), 1, bad_memory);
+  __ ld4(v15.D(), v16.D(), v17.D(), v18.D(), 1, bad_memory);
+  __ ld4(v16.D(), v17.D(), v18.D(), v19.D(), 1, bad_memory);
+  __ ld4(v2.H(), v3.H(), v4.H(), v5.H(), 6, bad_memory);
+  __ ld4(v5.H(), v6.H(), v7.H(), v8.H(), 3, bad_memory);
+  __ ld4(v7.H(), v8.H(), v9.H(), v10.H(), 6, bad_memory);
+  __ ld4(v6.S(), v7.S(), v8.S(), v9.S(), 1, bad_memory);
+  __ ld4(v25.S(), v26.S(), v27.S(), v28.S(), 2, bad_memory);
+  __ ld4(v8.S(), v9.S(), v10.S(), v11.S(), 3, bad_memory);
+  __ ld4r(v14.V16B(), v15.V16B(), v16.V16B(), v17.V16B(), bad_memory);
+  __ ld4r(v13.V16B(), v14.V16B(), v15.V16B(), v16.V16B(), bad_memory);
+  __ ld4r(v9.V16B(), v10.V16B(), v11.V16B(), v12.V16B(), bad_memory);
+  __ ld4r(v8.V1D(), v9.V1D(), v10.V1D(), v11.V1D(), bad_memory);
+  __ ld4r(v4.V1D(), v5.V1D(), v6.V1D(), v7.V1D(), bad_memory);
+  __ ld4r(v26.V1D(), v27.V1D(), v28.V1D(), v29.V1D(), bad_memory);
+  __ ld4r(v19.V2D(), v20.V2D(), v21.V2D(), v22.V2D(), bad_memory);
+  __ ld4r(v28.V2D(), v29.V2D(), v30.V2D(), v31.V2D(), bad_memory);
+  __ ld4r(v15.V2D(), v16.V2D(), v17.V2D(), v18.V2D(), bad_memory);
+  __ ld4r(v31.V2S(), v0.V2S(), v1.V2S(), v2.V2S(), bad_memory);
+  __ ld4r(v28.V2S(), v29.V2S(), v30.V2S(), v31.V2S(), bad_memory);
+  __ ld4r(v11.V2S(), v12.V2S(), v13.V2S(), v14.V2S(), bad_memory);
+  __ ld4r(v19.V4H(), v20.V4H(), v21.V4H(), v22.V4H(), bad_memory);
+  __ ld4r(v22.V4H(), v23.V4H(), v24.V4H(), v25.V4H(), bad_memory);
+  __ ld4r(v20.V4H(), v21.V4H(), v22.V4H(), v23.V4H(), bad_memory);
+  __ ld4r(v16.V4S(), v17.V4S(), v18.V4S(), v19.V4S(), bad_memory);
+  __ ld4r(v25.V4S(), v26.V4S(), v27.V4S(), v28.V4S(), bad_memory);
+  __ ld4r(v23.V4S(), v24.V4S(), v25.V4S(), v26.V4S(), bad_memory);
+  __ ld4r(v22.V8B(), v23.V8B(), v24.V8B(), v25.V8B(), bad_memory);
+  __ ld4r(v27.V8B(), v28.V8B(), v29.V8B(), v30.V8B(), bad_memory);
+  __ ld4r(v29.V8B(), v30.V8B(), v31.V8B(), v0.V8B(), bad_memory);
+  __ ld4r(v28.V8H(), v29.V8H(), v30.V8H(), v31.V8H(), bad_memory);
+  __ ld4r(v25.V8H(), v26.V8H(), v27.V8H(), v28.V8H(), bad_memory);
+  __ ld4r(v22.V8H(), v23.V8H(), v24.V8H(), v25.V8H(), bad_memory);
+
+  __ st1(v18.V16B(), v19.V16B(), v20.V16B(), v21.V16B(), bad_memory);
+  __ st1(v10.V16B(), v11.V16B(), v12.V16B(), v13.V16B(), bad_memory);
+  __ st1(v27.V16B(), v28.V16B(), v29.V16B(), v30.V16B(), bad_memory);
+  __ st1(v16.V16B(), v17.V16B(), v18.V16B(), bad_memory);
+  __ st1(v21.V16B(), v22.V16B(), v23.V16B(), bad_memory);
+  __ st1(v9.V16B(), v10.V16B(), v11.V16B(), bad_memory);
+  __ st1(v7.V16B(), v8.V16B(), bad_memory);
+  __ st1(v26.V16B(), v27.V16B(), bad_memory);
+  __ st1(v22.V16B(), v23.V16B(), bad_memory);
+  __ st1(v23.V16B(), bad_memory);
+  __ st1(v28.V16B(), bad_memory);
+  __ st1(v2.V16B(), bad_memory);
+  __ st1(v29.V1D(), v30.V1D(), v31.V1D(), v0.V1D(), bad_memory);
+  __ st1(v12.V1D(), v13.V1D(), v14.V1D(), v15.V1D(), bad_memory);
+  __ st1(v30.V1D(), v31.V1D(), v0.V1D(), v1.V1D(), bad_memory);
+  __ st1(v16.V1D(), v17.V1D(), v18.V1D(), bad_memory);
+  __ st1(v3.V1D(), v4.V1D(), v5.V1D(), bad_memory);
+  __ st1(v14.V1D(), v15.V1D(), v16.V1D(), bad_memory);
+  __ st1(v18.V1D(), v19.V1D(), bad_memory);
+  __ st1(v5.V1D(), v6.V1D(), bad_memory);
+  __ st1(v2.V1D(), v3.V1D(), bad_memory);
+  __ st1(v4.V1D(), bad_memory);
+  __ st1(v27.V1D(), bad_memory);
+  __ st1(v23.V1D(), bad_memory);
+  __ st1(v2.V2D(), v3.V2D(), v4.V2D(), v5.V2D(), bad_memory);
+  __ st1(v22.V2D(), v23.V2D(), v24.V2D(), v25.V2D(), bad_memory);
+  __ st1(v28.V2D(), v29.V2D(), v30.V2D(), v31.V2D(), bad_memory);
+  __ st1(v17.V2D(), v18.V2D(), v19.V2D(), bad_memory);
+  __ st1(v16.V2D(), v17.V2D(), v18.V2D(), bad_memory);
+  __ st1(v22.V2D(), v23.V2D(), v24.V2D(), bad_memory);
+  __ st1(v21.V2D(), v22.V2D(), bad_memory);
+  __ st1(v6.V2D(), v7.V2D(), bad_memory);
+  __ st1(v27.V2D(), v28.V2D(), bad_memory);
+  __ st1(v21.V2D(), bad_memory);
+  __ st1(v29.V2D(), bad_memory);
+  __ st1(v20.V2D(), bad_memory);
+  __ st1(v22.V2S(), v23.V2S(), v24.V2S(), v25.V2S(), bad_memory);
+  __ st1(v8.V2S(), v9.V2S(), v10.V2S(), v11.V2S(), bad_memory);
+  __ st1(v15.V2S(), v16.V2S(), v17.V2S(), v18.V2S(), bad_memory);
+  __ st1(v2.V2S(), v3.V2S(), v4.V2S(), bad_memory);
+  __ st1(v23.V2S(), v24.V2S(), v25.V2S(), bad_memory);
+  __ st1(v7.V2S(), v8.V2S(), v9.V2S(), bad_memory);
+  __ st1(v28.V2S(), v29.V2S(), bad_memory);
+  __ st1(v29.V2S(), v30.V2S(), bad_memory);
+  __ st1(v23.V2S(), v24.V2S(), bad_memory);
+  __ st1(v6.V2S(), bad_memory);
+  __ st1(v11.V2S(), bad_memory);
+  __ st1(v17.V2S(), bad_memory);
+  __ st1(v6.V4H(), v7.V4H(), v8.V4H(), v9.V4H(), bad_memory);
+  __ st1(v9.V4H(), v10.V4H(), v11.V4H(), v12.V4H(), bad_memory);
+  __ st1(v25.V4H(), v26.V4H(), v27.V4H(), v28.V4H(), bad_memory);
+  __ st1(v11.V4H(), v12.V4H(), v13.V4H(), bad_memory);
+  __ st1(v10.V4H(), v11.V4H(), v12.V4H(), bad_memory);
+  __ st1(v12.V4H(), v13.V4H(), v14.V4H(), bad_memory);
+  __ st1(v13.V4H(), v14.V4H(), bad_memory);
+  __ st1(v15.V4H(), v16.V4H(), bad_memory);
+  __ st1(v21.V4H(), v22.V4H(), bad_memory);
+  __ st1(v16.V4H(), bad_memory);
+  __ st1(v8.V4H(), bad_memory);
+  __ st1(v30.V4H(), bad_memory);
+  __ st1(v3.V4S(), v4.V4S(), v5.V4S(), v6.V4S(), bad_memory);
+  __ st1(v25.V4S(), v26.V4S(), v27.V4S(), v28.V4S(), bad_memory);
+  __ st1(v5.V4S(), v6.V4S(), v7.V4S(), v8.V4S(), bad_memory);
+  __ st1(v31.V4S(), v0.V4S(), v1.V4S(), bad_memory);
+  __ st1(v30.V4S(), v31.V4S(), v0.V4S(), bad_memory);
+  __ st1(v6.V4S(), v7.V4S(), v8.V4S(), bad_memory);
+  __ st1(v17.V4S(), v18.V4S(), bad_memory);
+  __ st1(v31.V4S(), v0.V4S(), bad_memory);
+  __ st1(v1.V4S(), v2.V4S(), bad_memory);
+  __ st1(v26.V4S(), bad_memory);
+  __ st1(v15.V4S(), bad_memory);
+  __ st1(v13.V4S(), bad_memory);
+  __ st1(v26.V8B(), v27.V8B(), v28.V8B(), v29.V8B(), bad_memory);
+  __ st1(v10.V8B(), v11.V8B(), v12.V8B(), v13.V8B(), bad_memory);
+  __ st1(v15.V8B(), v16.V8B(), v17.V8B(), v18.V8B(), bad_memory);
+  __ st1(v19.V8B(), v20.V8B(), v21.V8B(), bad_memory);
+  __ st1(v31.V8B(), v0.V8B(), v1.V8B(), bad_memory);
+  __ st1(v9.V8B(), v10.V8B(), v11.V8B(), bad_memory);
+  __ st1(v12.V8B(), v13.V8B(), bad_memory);
+  __ st1(v2.V8B(), v3.V8B(), bad_memory);
+  __ st1(v0.V8B(), v1.V8B(), bad_memory);
+  __ st1(v16.V8B(), bad_memory);
+  __ st1(v25.V8B(), bad_memory);
+  __ st1(v31.V8B(), bad_memory);
+  __ st1(v4.V8H(), v5.V8H(), v6.V8H(), v7.V8H(), bad_memory);
+  __ st1(v3.V8H(), v4.V8H(), v5.V8H(), v6.V8H(), bad_memory);
+  __ st1(v26.V8H(), v27.V8H(), v28.V8H(), v29.V8H(), bad_memory);
+  __ st1(v10.V8H(), v11.V8H(), v12.V8H(), bad_memory);
+  __ st1(v21.V8H(), v22.V8H(), v23.V8H(), bad_memory);
+  __ st1(v18.V8H(), v19.V8H(), v20.V8H(), bad_memory);
+  __ st1(v26.V8H(), v27.V8H(), bad_memory);
+  __ st1(v24.V8H(), v25.V8H(), bad_memory);
+  __ st1(v17.V8H(), v18.V8H(), bad_memory);
+  __ st1(v29.V8H(), bad_memory);
+  __ st1(v19.V8H(), bad_memory);
+  __ st1(v23.V8H(), bad_memory);
+  __ st1(v19.B(), 15, bad_memory);
+  __ st1(v25.B(), 9, bad_memory);
+  __ st1(v4.B(), 8, bad_memory);
+  __ st1(v13.D(), 0, bad_memory);
+  __ st1(v30.D(), 0, bad_memory);
+  __ st1(v3.D(), 0, bad_memory);
+  __ st1(v22.H(), 0, bad_memory);
+  __ st1(v31.H(), 7, bad_memory);
+  __ st1(v23.H(), 3, bad_memory);
+  __ st1(v0.S(), 0, bad_memory);
+  __ st1(v11.S(), 3, bad_memory);
+  __ st1(v24.S(), 3, bad_memory);
+  __ st2(v7.V16B(), v8.V16B(), bad_memory);
+  __ st2(v5.V16B(), v6.V16B(), bad_memory);
+  __ st2(v18.V16B(), v19.V16B(), bad_memory);
+  __ st2(v14.V2D(), v15.V2D(), bad_memory);
+  __ st2(v7.V2D(), v8.V2D(), bad_memory);
+  __ st2(v24.V2D(), v25.V2D(), bad_memory);
+  __ st2(v22.V2S(), v23.V2S(), bad_memory);
+  __ st2(v4.V2S(), v5.V2S(), bad_memory);
+  __ st2(v2.V2S(), v3.V2S(), bad_memory);
+  __ st2(v23.V4H(), v24.V4H(), bad_memory);
+  __ st2(v8.V4H(), v9.V4H(), bad_memory);
+  __ st2(v7.V4H(), v8.V4H(), bad_memory);
+  __ st2(v17.V4S(), v18.V4S(), bad_memory);
+  __ st2(v6.V4S(), v7.V4S(), bad_memory);
+  __ st2(v26.V4S(), v27.V4S(), bad_memory);
+  __ st2(v31.V8B(), v0.V8B(), bad_memory);
+  __ st2(v0.V8B(), v1.V8B(), bad_memory);
+  __ st2(v21.V8B(), v22.V8B(), bad_memory);
+  __ st2(v7.V8H(), v8.V8H(), bad_memory);
+  __ st2(v22.V8H(), v23.V8H(), bad_memory);
+  __ st2(v4.V8H(), v5.V8H(), bad_memory);
+  __ st2(v8.B(), v9.B(), 15, bad_memory);
+  __ st2(v8.B(), v9.B(), 15, bad_memory);
+  __ st2(v7.B(), v8.B(), 4, bad_memory);
+  __ st2(v25.D(), v26.D(), 0, bad_memory);
+  __ st2(v17.D(), v18.D(), 1, bad_memory);
+  __ st2(v3.D(), v4.D(), 1, bad_memory);
+  __ st2(v4.H(), v5.H(), 3, bad_memory);
+  __ st2(v0.H(), v1.H(), 5, bad_memory);
+  __ st2(v22.H(), v23.H(), 2, bad_memory);
+  __ st2(v14.S(), v15.S(), 3, bad_memory);
+  __ st2(v23.S(), v24.S(), 3, bad_memory);
+  __ st2(v0.S(), v1.S(), 2, bad_memory);
+  __ st3(v26.V16B(), v27.V16B(), v28.V16B(), bad_memory);
+  __ st3(v21.V16B(), v22.V16B(), v23.V16B(), bad_memory);
+  __ st3(v24.V16B(), v25.V16B(), v26.V16B(), bad_memory);
+  __ st3(v17.V2D(), v18.V2D(), v19.V2D(), bad_memory);
+  __ st3(v23.V2D(), v24.V2D(), v25.V2D(), bad_memory);
+  __ st3(v10.V2D(), v11.V2D(), v12.V2D(), bad_memory);
+  __ st3(v9.V2S(), v10.V2S(), v11.V2S(), bad_memory);
+  __ st3(v13.V2S(), v14.V2S(), v15.V2S(), bad_memory);
+  __ st3(v22.V2S(), v23.V2S(), v24.V2S(), bad_memory);
+  __ st3(v31.V4H(), v0.V4H(), v1.V4H(), bad_memory);
+  __ st3(v8.V4H(), v9.V4H(), v10.V4H(), bad_memory);
+  __ st3(v19.V4H(), v20.V4H(), v21.V4H(), bad_memory);
+  __ st3(v18.V4S(), v19.V4S(), v20.V4S(), bad_memory);
+  __ st3(v25.V4S(), v26.V4S(), v27.V4S(), bad_memory);
+  __ st3(v16.V4S(), v17.V4S(), v18.V4S(), bad_memory);
+  __ st3(v27.V8B(), v28.V8B(), v29.V8B(), bad_memory);
+  __ st3(v29.V8B(), v30.V8B(), v31.V8B(), bad_memory);
+  __ st3(v30.V8B(), v31.V8B(), v0.V8B(), bad_memory);
+  __ st3(v8.V8H(), v9.V8H(), v10.V8H(), bad_memory);
+  __ st3(v18.V8H(), v19.V8H(), v20.V8H(), bad_memory);
+  __ st3(v18.V8H(), v19.V8H(), v20.V8H(), bad_memory);
+  __ st3(v31.B(), v0.B(), v1.B(), 10, bad_memory);
+  __ st3(v4.B(), v5.B(), v6.B(), 5, bad_memory);
+  __ st3(v5.B(), v6.B(), v7.B(), 1, bad_memory);
+  __ st3(v5.D(), v6.D(), v7.D(), 0, bad_memory);
+  __ st3(v6.D(), v7.D(), v8.D(), 0, bad_memory);
+  __ st3(v0.D(), v1.D(), v2.D(), 0, bad_memory);
+  __ st3(v31.H(), v0.H(), v1.H(), 2, bad_memory);
+  __ st3(v14.H(), v15.H(), v16.H(), 5, bad_memory);
+  __ st3(v21.H(), v22.H(), v23.H(), 6, bad_memory);
+  __ st3(v21.S(), v22.S(), v23.S(), 0, bad_memory);
+  __ st3(v11.S(), v12.S(), v13.S(), 1, bad_memory);
+  __ st3(v15.S(), v16.S(), v17.S(), 0, bad_memory);
+  __ st4(v22.V16B(), v23.V16B(), v24.V16B(), v25.V16B(), bad_memory);
+  __ st4(v24.V16B(), v25.V16B(), v26.V16B(), v27.V16B(), bad_memory);
+  __ st4(v15.V16B(), v16.V16B(), v17.V16B(), v18.V16B(), bad_memory);
+  __ st4(v16.V2D(), v17.V2D(), v18.V2D(), v19.V2D(), bad_memory);
+  __ st4(v17.V2D(), v18.V2D(), v19.V2D(), v20.V2D(), bad_memory);
+  __ st4(v9.V2D(), v10.V2D(), v11.V2D(), v12.V2D(), bad_memory);
+  __ st4(v23.V2S(), v24.V2S(), v25.V2S(), v26.V2S(), bad_memory);
+  __ st4(v15.V2S(), v16.V2S(), v17.V2S(), v18.V2S(), bad_memory);
+  __ st4(v24.V2S(), v25.V2S(), v26.V2S(), v27.V2S(), bad_memory);
+  __ st4(v14.V4H(), v15.V4H(), v16.V4H(), v17.V4H(), bad_memory);
+  __ st4(v18.V4H(), v19.V4H(), v20.V4H(), v21.V4H(), bad_memory);
+  __ st4(v1.V4H(), v2.V4H(), v3.V4H(), v4.V4H(), bad_memory);
+  __ st4(v13.V4S(), v14.V4S(), v15.V4S(), v16.V4S(), bad_memory);
+  __ st4(v6.V4S(), v7.V4S(), v8.V4S(), v9.V4S(), bad_memory);
+  __ st4(v15.V4S(), v16.V4S(), v17.V4S(), v18.V4S(), bad_memory);
+  __ st4(v26.V8B(), v27.V8B(), v28.V8B(), v29.V8B(), bad_memory);
+  __ st4(v25.V8B(), v26.V8B(), v27.V8B(), v28.V8B(), bad_memory);
+  __ st4(v19.V8B(), v20.V8B(), v21.V8B(), v22.V8B(), bad_memory);
+  __ st4(v19.V8H(), v20.V8H(), v21.V8H(), v22.V8H(), bad_memory);
+  __ st4(v15.V8H(), v16.V8H(), v17.V8H(), v18.V8H(), bad_memory);
+  __ st4(v31.V8H(), v0.V8H(), v1.V8H(), v2.V8H(), bad_memory);
+  __ st4(v0.B(), v1.B(), v2.B(), v3.B(), 13, bad_memory);
+  __ st4(v4.B(), v5.B(), v6.B(), v7.B(), 10, bad_memory);
+  __ st4(v9.B(), v10.B(), v11.B(), v12.B(), 9, bad_memory);
+  __ st4(v2.D(), v3.D(), v4.D(), v5.D(), 1, bad_memory);
+  __ st4(v7.D(), v8.D(), v9.D(), v10.D(), 0, bad_memory);
+  __ st4(v31.D(), v0.D(), v1.D(), v2.D(), 1, bad_memory);
+  __ st4(v2.H(), v3.H(), v4.H(), v5.H(), 1, bad_memory);
+  __ st4(v27.H(), v28.H(), v29.H(), v30.H(), 3, bad_memory);
+  __ st4(v24.H(), v25.H(), v26.H(), v27.H(), 4, bad_memory);
+  __ st4(v18.S(), v19.S(), v20.S(), v21.S(), 2, bad_memory);
+  __ st4(v6.S(), v7.S(), v8.S(), v9.S(), 2, bad_memory);
+  __ st4(v25.S(), v26.S(), v27.S(), v28.S(), 1, bad_memory);
+
+  END_IMPLICIT_CHECK();
+  TRY_RUN_IMPLICIT_CHECK();
+}
+
+TEST(ImplicitCheckSve) {
+  SETUP_WITH_FEATURES(CPUFeatures::kSVE,
+                      CPUFeatures::kSVE2,
+                      CPUFeatures::kNEON);
+  START_IMPLICIT_CHECK();
+  // Every SVE access below uses ip0 as its base (presumably a bad address).
+  SVEMemOperand bad_sve_memory = SVEMemOperand(ip0);
+
+  EmissionCheckScope guard(&masm, masm.GetBuffer()->GetRemainingBytes());
+  // Simple, unpredicated Str/Ldr of predicate (P) and vector (Z) registers.
+  __ Str(p12.VnD(), bad_sve_memory);
+  __ Str(p13.VnS(), bad_sve_memory);
+  __ Str(p14.VnH(), bad_sve_memory);
+  __ Str(p15.VnB(), bad_sve_memory);
+  __ Ldr(p8.VnD(), bad_sve_memory);
+  __ Ldr(p9.VnS(), bad_sve_memory);
+  __ Ldr(p10.VnH(), bad_sve_memory);
+  __ Ldr(p11.VnB(), bad_sve_memory);
+
+  __ Str(z0.VnD(), bad_sve_memory);
+  __ Str(z1.VnS(), bad_sve_memory);
+  __ Str(z2.VnH(), bad_sve_memory);
+  __ Str(z3.VnB(), bad_sve_memory);
+  __ Ldr(z20.VnD(), bad_sve_memory);
+  __ Ldr(z21.VnS(), bad_sve_memory);
+  __ Ldr(z22.VnH(), bad_sve_memory);
+  __ Ldr(z23.VnB(), bad_sve_memory);
+
+  // Structured accesses: memory element size matches the lane size.
+  __ St1b(z0.VnB(), p2, bad_sve_memory);
+  __ St1h(z1.VnH(), p1, bad_sve_memory);
+  __ St1w(z2.VnS(), p1, bad_sve_memory);
+  __ St1d(z3.VnD(), p2, bad_sve_memory);
+  __ Ld1b(z20.VnB(), p1.Zeroing(), bad_sve_memory);
+  __ Ld1h(z21.VnH(), p2.Zeroing(), bad_sve_memory);
+  __ Ld1w(z22.VnS(), p1.Zeroing(), bad_sve_memory);
+  __ Ld1d(z23.VnD(), p1.Zeroing(), bad_sve_memory);
+
+  // Structured, packed accesses: memory elements narrower than the lanes.
+  __ St1b(z2.VnH(), p1, bad_sve_memory);
+  __ St1b(z3.VnS(), p2, bad_sve_memory);
+  __ St1b(z4.VnD(), p2, bad_sve_memory);
+  __ St1h(z0.VnS(), p1, bad_sve_memory);
+  __ St1h(z1.VnD(), p1, bad_sve_memory);
+  __ St1w(z2.VnD(), p1, bad_sve_memory);
+  __ Ld1b(z20.VnH(), p1.Zeroing(), bad_sve_memory);
+  __ Ld1b(z21.VnS(), p1.Zeroing(), bad_sve_memory);
+  __ Ld1b(z22.VnD(), p1.Zeroing(), bad_sve_memory);
+  __ Ld1h(z23.VnS(), p2.Zeroing(), bad_sve_memory);
+  __ Ld1h(z24.VnD(), p2.Zeroing(), bad_sve_memory);
+  __ Ld1w(z20.VnD(), p1.Zeroing(), bad_sve_memory);
+  __ Ld1sb(z21.VnH(), p1.Zeroing(), bad_sve_memory);
+  __ Ld1sb(z22.VnS(), p1.Zeroing(), bad_sve_memory);
+  __ Ld1sb(z23.VnD(), p2.Zeroing(), bad_sve_memory);
+  __ Ld1sh(z24.VnS(), p2.Zeroing(), bad_sve_memory);
+  __ Ld1sh(z20.VnD(), p1.Zeroing(), bad_sve_memory);
+  __ Ld1sw(z21.VnD(), p1.Zeroing(), bad_sve_memory);
+
+  // Structured, interleaved accesses: two, three, then four registers each.
+  __ St2b(z0.VnB(), z1.VnB(), p4, bad_sve_memory);
+  __ St2h(z1.VnH(), z2.VnH(), p4, bad_sve_memory);
+  __ St2w(z2.VnS(), z3.VnS(), p3, bad_sve_memory);
+  __ St2d(z3.VnD(), z4.VnD(), p4, bad_sve_memory);
+  __ Ld2b(z20.VnB(), z21.VnB(), p5.Zeroing(), bad_sve_memory);
+  __ Ld2h(z21.VnH(), z22.VnH(), p6.Zeroing(), bad_sve_memory);
+  __ Ld2w(z22.VnS(), z23.VnS(), p6.Zeroing(), bad_sve_memory);
+  __ Ld2d(z23.VnD(), z24.VnD(), p5.Zeroing(), bad_sve_memory);
+
+  __ St3b(z4.VnB(), z5.VnB(), z6.VnB(), p4, bad_sve_memory);
+  __ St3h(z5.VnH(), z6.VnH(), z7.VnH(), p4, bad_sve_memory);
+  __ St3w(z6.VnS(), z7.VnS(), z8.VnS(), p3, bad_sve_memory);
+  __ St3d(z7.VnD(), z8.VnD(), z9.VnD(), p4, bad_sve_memory);
+  __ Ld3b(z24.VnB(), z25.VnB(), z26.VnB(), p5.Zeroing(), bad_sve_memory);
+  __ Ld3h(z25.VnH(), z26.VnH(), z27.VnH(), p6.Zeroing(), bad_sve_memory);
+  __ Ld3w(z26.VnS(), z27.VnS(), z28.VnS(), p6.Zeroing(), bad_sve_memory);
+  __ Ld3d(z27.VnD(), z28.VnD(), z29.VnD(), p5.Zeroing(), bad_sve_memory);
+
+  __ St4b(z31.VnB(), z0.VnB(), z1.VnB(), z2.VnB(), p4, bad_sve_memory);
+  __ St4h(z0.VnH(), z1.VnH(), z2.VnH(), z3.VnH(), p4, bad_sve_memory);
+  __ St4w(z1.VnS(), z2.VnS(), z3.VnS(), z4.VnS(), p3, bad_sve_memory);
+  __ St4d(z2.VnD(), z3.VnD(), z4.VnD(), z5.VnD(), p4, bad_sve_memory);
+  __ Ld4b(z25.VnB(),
+          z26.VnB(),
+          z27.VnB(),
+          z28.VnB(),
+          p5.Zeroing(),
+          bad_sve_memory);
+  __ Ld4h(z26.VnH(),
+          z27.VnH(),
+          z28.VnH(),
+          z29.VnH(),
+          p6.Zeroing(),
+          bad_sve_memory);
+  __ Ld4w(z27.VnS(),
+          z28.VnS(),
+          z29.VnS(),
+          z30.VnS(),
+          p6.Zeroing(),
+          bad_sve_memory);
+  __ Ld4d(z28.VnD(),
+          z29.VnD(),
+          z30.VnD(),
+          z31.VnD(),
+          p5.Zeroing(),
+          bad_sve_memory);
+
+  END_IMPLICIT_CHECK();
+  TRY_RUN_IMPLICIT_CHECK();
+}
+
+TEST(ImplicitCheckAtomics) {
+  SETUP_WITH_FEATURES(CPUFeatures::kNEON, CPUFeatures::kAtomics);
+  START_IMPLICIT_CHECK();
+  // bad_memory is not declared here; presumably START_IMPLICIT_CHECK does.
+  EmissionCheckScope guard(&masm, masm.GetBuffer()->GetRemainingBytes());
+#define INST_LIST(OP)                 \
+  __ Ld##OP##b(w0, w0, bad_memory);   \
+  __ Ld##OP##ab(w0, w1, bad_memory);  \
+  __ Ld##OP##lb(w0, w2, bad_memory);  \
+  __ Ld##OP##alb(w0, w3, bad_memory); \
+  __ Ld##OP##h(w0, w0, bad_memory);   \
+  __ Ld##OP##ah(w0, w1, bad_memory);  \
+  __ Ld##OP##lh(w0, w2, bad_memory);  \
+  __ Ld##OP##alh(w0, w3, bad_memory); \
+  __ Ld##OP(w0, w0, bad_memory);      \
+  __ Ld##OP##a(w0, w1, bad_memory);   \
+  __ Ld##OP##l(w0, w2, bad_memory);   \
+  __ Ld##OP##al(w0, w3, bad_memory);  \
+  __ Ld##OP(x0, x0, bad_memory);      \
+  __ Ld##OP##a(x0, x1, bad_memory);   \
+  __ Ld##OP##l(x0, x2, bad_memory);   \
+  __ Ld##OP##al(x0, x3, bad_memory);  \
+  __ St##OP##b(w0, bad_memory);       \
+  __ St##OP##lb(w0, bad_memory);      \
+  __ St##OP##h(w0, bad_memory);       \
+  __ St##OP##lh(w0, bad_memory);      \
+  __ St##OP(w0, bad_memory);          \
+  __ St##OP##l(w0, bad_memory);       \
+  __ St##OP(x0, bad_memory);          \
+  __ St##OP##l(x0, bad_memory);
+  // Each expansion emits 16 Ld<op>* and 8 St<op>* forms; use it for 8 ops.
+  INST_LIST(add);
+  INST_LIST(set);
+  INST_LIST(eor);
+  INST_LIST(smin);
+  INST_LIST(smax);
+  INST_LIST(umin);
+  INST_LIST(umax);
+  INST_LIST(clr);
+
+#undef INST_LIST
+
+  END_IMPLICIT_CHECK();
+  TRY_RUN_IMPLICIT_CHECK();
+}
+
+TEST(ImplicitCheckMops) {
+  SETUP_WITH_FEATURES(CPUFeatures::kNEON, CPUFeatures::kMOPS);
+  START_IMPLICIT_CHECK();
+  // MOPS set/copy macros; x15, ip0 and ip1 are their register operands.
+  EmissionCheckScope guard(&masm, masm.GetBuffer()->GetRemainingBytes());
+  __ Set(x15, ip1, ip0);
+  __ Setn(x15, ip1, ip0);
+  __ Setg(x15, ip1, ip0);
+  __ Setgn(x15, ip1, ip0);
+
+  __ Cpy(x15, ip0, ip1);
+  __ Cpyn(x15, ip0, ip1);
+  __ Cpyrn(x15, ip0, ip1);
+  __ Cpywn(x15, ip0, ip1);
+  __ Cpyf(x15, ip0, ip1);
+  __ Cpyfn(x15, ip0, ip1);
+  __ Cpyfrn(x15, ip0, ip1);
+  __ Cpyfwn(x15, ip0, ip1);
+
+  // The macro-assembler expands each instruction into prologue, main and
+  // epilogue instructions, of which only the main one will fail. Scale the
+  // counter in x1 by three to cover all three parts, then add three more
+  // (presumably for the Mov, Mul and Add emitted below - TODO confirm).
+  __ Mov(x0, 3);
+  __ Mul(x1, x1, x0);
+  __ Add(x1, x1, x0);
+
+  END_IMPLICIT_CHECK();
+  TRY_RUN_IMPLICIT_CHECK();
+}
+#endif  // VIXL_ENABLE_IMPLICIT_CHECKS
+
 #undef __
 #define __ masm->
 
@@ -5140,6 +6025,7 @@ TEST(RunFrom) {
                                                         3.0);
   VIXL_CHECK(res_double == 6.0);
 }
+
 #endif
 
 
diff --git a/test/aarch64/test-simulator-sve-aarch64.cc b/test/aarch64/test-simulator-sve-aarch64.cc
index 51f7d82f..6b5b9582 100644
--- a/test/aarch64/test-simulator-sve-aarch64.cc
+++ b/test/aarch64/test-simulator-sve-aarch64.cc
@@ -267,5 +267,1776 @@ TEST_SVE(sve_fmatmul_s) {
   }
 }
 
+// Below here, there are tests for Neon instructions. As these forms of test
+// check the entire register state, they also need SVE features.
+
+TEST_SVE(neon_pmull) {
+  SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE,
+                          CPUFeatures::kNEON,
+                          CPUFeatures::kCRC32,
+                          CPUFeatures::kPmull1Q);
+  START();
+  // Emit 40 raw-encoded pmull/pmull2 instructions, then hash the state.
+  SetInitialMachineState(&masm);
+  // state = 0xe2bd2480
+
+  {
+    ExactAssemblyScope scope(&masm, 40 * kInstructionSize);
+    __ dci(0x4e20e000);  // pmull2 v0.8h, v0.16b, v0.16b
+    // vl128 state = 0x5eba4d4f
+    __ dci(0x4e20e228);  // pmull2 v8.8h, v17.16b, v0.16b
+    // vl128 state = 0x86bceb87
+    __ dci(0x4ee0e22a);  // pmull2 v10.1q, v17.2d, v0.2d
+    // vl128 state = 0x1332fe02
+    __ dci(0x0ee8e222);  // pmull v2.1q, v17.1d, v8.1d
+    // vl128 state = 0xd357dc7b
+    __ dci(0x4eece226);  // pmull2 v6.1q, v17.2d, v12.2d
+    // vl128 state = 0xdff409ad
+    __ dci(0x0eece276);  // pmull v22.1q, v19.1d, v12.1d
+    // vl128 state = 0xd8af1dc6
+    __ dci(0x0eede232);  // pmull v18.1q, v17.1d, v13.1d
+    // vl128 state = 0x41e6ed0e
+    __ dci(0x0efde216);  // pmull v22.1q, v16.1d, v29.1d
+    // vl128 state = 0x1f10365f
+    __ dci(0x0effe23e);  // pmull v30.1q, v17.1d, v31.1d
+    // vl128 state = 0x9779ece5
+    __ dci(0x0ee7e23f);  // pmull v31.1q, v17.1d, v7.1d
+    // vl128 state = 0x11fc8ce9
+    __ dci(0x0ee2e23e);  // pmull v30.1q, v17.1d, v2.1d
+    // vl128 state = 0x101d5a6f
+    __ dci(0x0ee2e23c);  // pmull v28.1q, v17.1d, v2.1d
+    // vl128 state = 0xcc4fe26e
+    __ dci(0x0eeae27d);  // pmull v29.1q, v19.1d, v10.1d
+    // vl128 state = 0xc84be9f4
+    __ dci(0x4eeae24d);  // pmull2 v13.1q, v18.2d, v10.2d
+    // vl128 state = 0x2fc540b4
+    __ dci(0x4eeae25d);  // pmull2 v29.1q, v18.2d, v10.2d
+    // vl128 state = 0x1b2d99cd
+    __ dci(0x4eeae2ed);  // pmull2 v13.1q, v23.2d, v10.2d
+    // vl128 state = 0x8a278b95
+    __ dci(0x4eeae2e9);  // pmull2 v9.1q, v23.2d, v10.2d
+    // vl128 state = 0x3359b4c8
+    __ dci(0x4efee2e8);  // pmull2 v8.1q, v23.2d, v30.2d
+    // vl128 state = 0x5c25ed31
+    __ dci(0x4effe3e0);  // pmull2 v0.1q, v31.2d, v31.2d
+    // vl128 state = 0x28ff67d1
+    __ dci(0x4eefe3d0);  // pmull2 v16.1q, v30.2d, v15.2d
+    // vl128 state = 0x1543436d
+    __ dci(0x4ee7e2d1);  // pmull2 v17.1q, v22.2d, v7.2d
+    // vl128 state = 0x71b8bc90
+    __ dci(0x4eefe3d5);  // pmull2 v21.1q, v30.2d, v15.2d
+    // vl128 state = 0x3d35ca02
+    __ dci(0x4eefe314);  // pmull2 v20.1q, v24.2d, v15.2d
+    // vl128 state = 0x40e8fade
+    __ dci(0x4eefe310);  // pmull2 v16.1q, v24.2d, v15.2d
+    // vl128 state = 0xb8affb87
+    __ dci(0x4eefe300);  // pmull2 v0.1q, v24.2d, v15.2d
+    // vl128 state = 0x4824ee5c
+    __ dci(0x4eede350);  // pmull2 v16.1q, v26.2d, v13.2d
+    // vl128 state = 0x39202868
+    __ dci(0x4ee7e354);  // pmull2 v20.1q, v26.2d, v7.2d
+    // vl128 state = 0xc8fde340
+    __ dci(0x4e27e356);  // pmull2 v22.8h, v26.16b, v7.16b
+    // vl128 state = 0x0f02316b
+    __ dci(0x4e37e15e);  // pmull2 v30.8h, v10.16b, v23.16b
+    // vl128 state = 0xced4f8bd
+    __ dci(0x4e33e05f);  // pmull2 v31.8h, v2.16b, v19.16b
+    // vl128 state = 0x0c76bdb3
+    __ dci(0x0e23e05e);  // pmull v30.8h, v2.8b, v3.8b
+    // vl128 state = 0x0e36962b
+    __ dci(0x4e23e25f);  // pmull2 v31.8h, v18.16b, v3.16b
+    // vl128 state = 0x11a8dcc3
+    __ dci(0x4e23e25b);  // pmull2 v27.8h, v18.16b, v3.16b
+    // vl128 state = 0xf01bfe16
+    __ dci(0x4e23e259);  // pmull2 v25.8h, v18.16b, v3.16b
+    // vl128 state = 0xea351afe
+    __ dci(0x4e22e2c9);  // pmull2 v9.8h, v22.16b, v2.16b
+    // vl128 state = 0x16e933ef
+    __ dci(0x4e3ae2c8);  // pmull2 v8.8h, v22.16b, v26.16b
+    // vl128 state = 0x02528a2a
+    __ dci(0x4e32e249);  // pmull2 v9.8h, v18.16b, v18.16b
+    // vl128 state = 0xe7e20633
+    __ dci(0x4e36e20d);  // pmull2 v13.8h, v16.16b, v22.16b
+    // vl128 state = 0x6f231732
+    __ dci(0x4e36e205);  // pmull2 v5.8h, v16.16b, v22.16b
+    // vl128 state = 0x423eb7ea
+    __ dci(0x4e22e20d);  // pmull2 v13.8h, v16.16b, v2.16b
+    // vl128 state = 0xfc0d1c14
+  }
+  // Hash the machine state (presumably stored to `state`) and load it to w0.
+  uint32_t state;
+  ComputeMachineStateHash(&masm, &state);
+  __ Mov(x0, reinterpret_cast<uint64_t>(&state));
+  __ Ldr(w0, MemOperand(x0));
+
+  END();
+  if (CAN_RUN()) {
+    RUN();
+    uint32_t expected_hashes[] = {  // Index: Q-sized lanes per vector - 1.
+        0xfc0d1c14,
+        0x4cb040a3,
+        0x4b913ebe,
+        0xfa35b836,
+        0x78745d20,
+        0x6666b09a,
+        0xee2868f4,
+        0x1936a795,
+        0x1025244a,
+        0xe8551950,
+        0xae73af02,
+        0x0fdd5fc7,
+        0x22e9827b,
+        0x384ce1ac,
+        0xc833cbeb,
+        0x255baab5,
+    };
+    ASSERT_EQUAL_64(expected_hashes[core.GetSVELaneCount(kQRegSize) - 1], x0);
+  }
+}
+
+TEST_SVE(neon_sha1_2reg) {
+  SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE,
+                          CPUFeatures::kNEON,
+                          CPUFeatures::kCRC32,
+                          CPUFeatures::kSHA1);
+  START();
+  // Emit 30 raw-encoded sha1h/sha1su1 instructions, then hash the state.
+  SetInitialMachineState(&masm);
+  // state = 0xe2bd2480
+
+  {
+    ExactAssemblyScope scope(&masm, 30 * kInstructionSize);
+    __ dci(0x5e280800);  // sha1h s0, s0
+    // vl128 state = 0xc388d4f8
+    __ dci(0x5e280a28);  // sha1h s8, s17
+    // vl128 state = 0x5c88b904
+    __ dci(0x5e280a2a);  // sha1h s10, s17
+    // vl128 state = 0x6f63c596
+    __ dci(0x5e281aae);  // sha1su1 v14.4s, v21.4s
+    // vl128 state = 0x85e1119d
+    __ dci(0x5e281abe);  // sha1su1 v30.4s, v21.4s
+    // vl128 state = 0x9b814260
+    __ dci(0x5e281a0e);  // sha1su1 v14.4s, v16.4s
+    // vl128 state = 0x8ccca0ab
+    __ dci(0x5e281a0a);  // sha1su1 v10.4s, v16.4s
+    // vl128 state = 0x42262836
+    __ dci(0x5e281acb);  // sha1su1 v11.4s, v22.4s
+    // vl128 state = 0xabcde33d
+    __ dci(0x5e281acf);  // sha1su1 v15.4s, v22.4s
+    // vl128 state = 0xdf44e7be
+    __ dci(0x5e281adf);  // sha1su1 v31.4s, v22.4s
+    // vl128 state = 0x48c332a3
+    __ dci(0x5e280a9d);  // sha1h s29, s20
+    // vl128 state = 0x56bafe13
+    __ dci(0x5e28188d);  // sha1su1 v13.4s, v4.4s
+    // vl128 state = 0x218eb351
+    __ dci(0x5e2808cf);  // sha1h s15, s6
+    // vl128 state = 0xc1720d9f
+    __ dci(0x5e2808cb);  // sha1h s11, s6
+    // vl128 state = 0x67119e1c
+    __ dci(0x5e2808c9);  // sha1h s9, s6
+    // vl128 state = 0x31f69637
+    __ dci(0x5e2808c1);  // sha1h s1, s6
+    // vl128 state = 0x214a25ff
+    __ dci(0x5e280871);  // sha1h s17, s3
+    // vl128 state = 0xa5e88b55
+    __ dci(0x5e280815);  // sha1h s21, s0
+    // vl128 state = 0xc8c91e29
+    __ dci(0x5e28185d);  // sha1su1 v29.4s, v2.4s
+    // vl128 state = 0x5582c6a8
+    __ dci(0x5e28185f);  // sha1su1 v31.4s, v2.4s
+    // vl128 state = 0xd3288a61
+    __ dci(0x5e28087e);  // sha1h s30, s3
+    // vl128 state = 0x350b39c2
+    __ dci(0x5e28093f);  // sha1h s31, s9
+    // vl128 state = 0xbdc1ac98
+    __ dci(0x5e28093b);  // sha1h s27, s9
+    // vl128 state = 0x62f828bf
+    __ dci(0x5e28092b);  // sha1h s11, s9
+    // vl128 state = 0xc8f2f671
+    __ dci(0x5e2819bb);  // sha1su1 v27.4s, v13.4s
+    // vl128 state = 0x24ec8c34
+    __ dci(0x5e281b93);  // sha1su1 v19.4s, v28.4s
+    // vl128 state = 0x71e188de
+    __ dci(0x5e281b97);  // sha1su1 v23.4s, v28.4s
+    // vl128 state = 0x22490375
+    __ dci(0x5e281b95);  // sha1su1 v21.4s, v28.4s
+    // vl128 state = 0x016b70d1
+    __ dci(0x5e281b51);  // sha1su1 v17.4s, v26.4s
+    // vl128 state = 0xa6252086
+    __ dci(0x5e2819d3);  // sha1su1 v19.4s, v14.4s
+    // vl128 state = 0x78683885
+  }
+  // Hash the machine state (presumably stored to `state`) and load it to w0.
+  uint32_t state;
+  ComputeMachineStateHash(&masm, &state);
+  __ Mov(x0, reinterpret_cast<uint64_t>(&state));
+  __ Ldr(w0, MemOperand(x0));
+
+  END();
+  if (CAN_RUN()) {
+    RUN();
+    uint32_t expected_hashes[] = {  // Index: Q-sized lanes per vector - 1.
+        0x78683885,
+        0x59574c2a,
+        0x449978bf,
+        0x0ddab775,
+        0x1a043ef3,
+        0xf501e2e7,
+        0xa219e725,
+        0xf17f57c8,
+        0x4ccdbf99,
+        0x419d4fc3,
+        0x7302571d,
+        0xd6bee170,
+        0x7d81c301,
+        0xbaa7d729,
+        0xf33f0bc4,
+        0xff8b070a,
+    };
+    ASSERT_EQUAL_64(expected_hashes[core.GetSVELaneCount(kQRegSize) - 1], x0);
+  }
+}
+
+TEST_SVE(neon_sha1_3reg) {
+  SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE,
+                          CPUFeatures::kNEON,
+                          CPUFeatures::kCRC32,
+                          CPUFeatures::kSHA1);
+  START();
+  // Emit 30 raw-encoded sha1c/sha1p/sha1m/sha1su0; then hash the state.
+  SetInitialMachineState(&masm);
+  // state = 0xe2bd2480
+
+  {
+    ExactAssemblyScope scope(&masm, 30 * kInstructionSize);
+    __ dci(0x5e1f02bd);  // sha1c q29, s21, v31.4s
+    // vl128 state = 0xec2a37ad
+    __ dci(0x5e0810af);  // sha1p q15, s5, v8.4s
+    // vl128 state = 0x3fe9252a
+    __ dci(0x5e122227);  // sha1m q7, s17, v18.4s
+    // vl128 state = 0x4465789e
+    __ dci(0x5e0b039d);  // sha1c q29, s28, v11.4s
+    // vl128 state = 0x2186488a
+    __ dci(0x5e1a03e9);  // sha1c q9, s31, v26.4s
+    // vl128 state = 0x9eddf8e3
+    __ dci(0x5e0c138c);  // sha1p q12, s28, v12.4s
+    // vl128 state = 0x0ca7cd3d
+    __ dci(0x5e1f1316);  // sha1p q22, s24, v31.4s
+    // vl128 state = 0xb80a61c0
+    __ dci(0x5e052204);  // sha1m q4, s16, v5.4s
+    // vl128 state = 0x941821ca
+    __ dci(0x5e0a00d6);  // sha1c q22, s6, v10.4s
+    // vl128 state = 0x5e71ccae
+    __ dci(0x5e0e032e);  // sha1c q14, s25, v14.4s
+    // vl128 state = 0x7ed4486a
+    __ dci(0x5e1d1098);  // sha1p q24, s4, v29.4s
+    // vl128 state = 0x0978a637
+    __ dci(0x5e0400d9);  // sha1c q25, s6, v4.4s
+    // vl128 state = 0x34c8609e
+    __ dci(0x5e1a330e);  // sha1su0 v14.4s, v24.4s, v26.4s
+    // vl128 state = 0xcb078fad
+    __ dci(0x5e1e30f5);  // sha1su0 v21.4s, v7.4s, v30.4s
+    // vl128 state = 0x885200be
+    __ dci(0x5e1e32e1);  // sha1su0 v1.4s, v23.4s, v30.4s
+    // vl128 state = 0xabc6a188
+    __ dci(0x5e0733d3);  // sha1su0 v19.4s, v30.4s, v7.4s
+    // vl128 state = 0x37a4fe6f
+    __ dci(0x5e0b22e6);  // sha1m q6, s23, v11.4s
+    // vl128 state = 0x68b788d2
+    __ dci(0x5e011210);  // sha1p q16, s16, v1.4s
+    // vl128 state = 0x6b36b092
+    __ dci(0x5e1702e1);  // sha1c q1, s23, v23.4s
+    // vl128 state = 0x74ef56f5
+    __ dci(0x5e1e30f6);  // sha1su0 v22.4s, v7.4s, v30.4s
+    // vl128 state = 0x5a150dfd
+    __ dci(0x5e1b3348);  // sha1su0 v8.4s, v26.4s, v27.4s
+    // vl128 state = 0xe0a45d9c
+    __ dci(0x5e0a3041);  // sha1su0 v1.4s, v2.4s, v10.4s
+    // vl128 state = 0x6ba02d02
+    __ dci(0x5e17119a);  // sha1p q26, s12, v23.4s
+    // vl128 state = 0x3bf511fc
+    __ dci(0x5e0b32c7);  // sha1su0 v7.4s, v22.4s, v11.4s
+    // vl128 state = 0xf5c513b6
+    __ dci(0x5e063016);  // sha1su0 v22.4s, v0.4s, v6.4s
+    // vl128 state = 0x3eb44b28
+    __ dci(0x5e05323c);  // sha1su0 v28.4s, v17.4s, v5.4s
+    // vl128 state = 0x7c2d3adf
+    __ dci(0x5e1d132a);  // sha1p q10, s25, v29.4s
+    // vl128 state = 0x2b0963c4
+    __ dci(0x5e13003c);  // sha1c q28, s1, v19.4s
+    // vl128 state = 0x4a582d00
+    __ dci(0x5e13322c);  // sha1su0 v12.4s, v17.4s, v19.4s
+    // vl128 state = 0x7bb2cc8c
+    __ dci(0x5e032330);  // sha1m q16, s25, v3.4s
+    // vl128 state = 0x2a8b4c0d
+  }
+  // Hash the machine state (presumably stored to `state`) and load it to w0.
+  uint32_t state;
+  ComputeMachineStateHash(&masm, &state);
+  __ Mov(x0, reinterpret_cast<uint64_t>(&state));
+  __ Ldr(w0, MemOperand(x0));
+
+  END();
+  if (CAN_RUN()) {
+    RUN();
+    uint32_t expected_hashes[] = {  // Index: Q-sized lanes per vector - 1.
+        0x2a8b4c0d,
+        0x114e25bb,
+        0x4f035af9,
+        0x23db7966,
+        0x3d106b42,
+        0x62651fcf,
+        0x44c20879,
+        0xadf71d73,
+        0xe6858f82,
+        0x93a74ae5,
+        0xc270310e,
+        0x3d07058c,
+        0x69f83d0e,
+        0x28c5813b,
+        0xbb9de2c1,
+        0xe06b94cd,
+    };
+    ASSERT_EQUAL_64(expected_hashes[core.GetSVELaneCount(kQRegSize) - 1], x0);
+  }
+}
+
+TEST_SVE(neon_sha2h) {
+  SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE,
+                          CPUFeatures::kNEON,
+                          CPUFeatures::kCRC32,
+                          CPUFeatures::kSHA2);
+  START();
+  // Emit 30 raw-encoded sha256h/sha256h2 instructions, then hash the state.
+  SetInitialMachineState(&masm);
+  // state = 0xe2bd2480
+
+  {
+    ExactAssemblyScope scope(&masm, 30 * kInstructionSize);
+    __ dci(0x5e0152a2);  // sha256h2 q2, q21, v1.4s
+    // vl128 state = 0x6bda8984
+    __ dci(0x5e1552b2);  // sha256h2 q18, q21, v21.4s
+    // vl128 state = 0xe985c68a
+    __ dci(0x5e055293);  // sha256h2 q19, q20, v5.4s
+    // vl128 state = 0xab18a98b
+    __ dci(0x5e055297);  // sha256h2 q23, q20, v5.4s
+    // vl128 state = 0x896bad28
+    __ dci(0x5e0752a7);  // sha256h2 q7, q21, v7.4s
+    // vl128 state = 0x4e00ba08
+    __ dci(0x5e175223);  // sha256h2 q3, q17, v23.4s
+    // vl128 state = 0x380f3893
+    __ dci(0x5e1f5262);  // sha256h2 q2, q19, v31.4s
+    // vl128 state = 0xb431122d
+    __ dci(0x5e1f5272);  // sha256h2 q18, q19, v31.4s
+    // vl128 state = 0x18140047
+    __ dci(0x5e1e4262);  // sha256h q2, q19, v30.4s
+    // vl128 state = 0x721779be
+    __ dci(0x5e164363);  // sha256h q3, q27, v22.4s
+    // vl128 state = 0x383ad878
+    __ dci(0x5e175361);  // sha256h2 q1, q27, v23.4s
+    // vl128 state = 0xd985bd85
+    __ dci(0x5e115360);  // sha256h2 q0, q27, v17.4s
+    // vl128 state = 0xfa5e77f3
+    __ dci(0x5e135270);  // sha256h2 q16, q19, v19.4s
+    // vl128 state = 0x4fc1f5cc
+    __ dci(0x5e195260);  // sha256h2 q0, q19, v25.4s
+    // vl128 state = 0x89435952
+    __ dci(0x5e1952c4);  // sha256h2 q4, q22, v25.4s
+    // vl128 state = 0x93c60c86
+    __ dci(0x5e1a52c6);  // sha256h2 q6, q22, v26.4s
+    // vl128 state = 0xedc42105
+    __ dci(0x5e1a52c4);  // sha256h2 q4, q22, v26.4s
+    // vl128 state = 0xd5d638a8
+    __ dci(0x5e1a4285);  // sha256h q5, q20, v26.4s
+    // vl128 state = 0x9f9da446
+    __ dci(0x5e1a428d);  // sha256h q13, q20, v26.4s
+    // vl128 state = 0x87d49cfb
+    __ dci(0x5e1b42cf);  // sha256h q15, q22, v27.4s
+    // vl128 state = 0xa6802b10
+    __ dci(0x5e1b43ed);  // sha256h q13, q31, v27.4s
+    // vl128 state = 0x2e346937
+    __ dci(0x5e0b436f);  // sha256h q15, q27, v11.4s
+    // vl128 state = 0x1005f372
+    __ dci(0x5e03433f);  // sha256h q31, q25, v3.4s
+    // vl128 state = 0xd908918c
+    __ dci(0x5e13532f);  // sha256h2 q15, q25, v19.4s
+    // vl128 state = 0x31c73fe0
+    __ dci(0x5e01533f);  // sha256h2 q31, q25, v1.4s
+    // vl128 state = 0x84e35a20
+    __ dci(0x5e03523d);  // sha256h2 q29, q17, v3.4s
+    // vl128 state = 0x40da34aa
+    __ dci(0x5e0b527c);  // sha256h2 q28, q19, v11.4s
+    // vl128 state = 0x506a21d9
+    __ dci(0x5e0f5238);  // sha256h2 q24, q17, v15.4s
+    // vl128 state = 0x6a67f033
+    __ dci(0x5e0d5210);  // sha256h2 q16, q16, v13.4s
+    // vl128 state = 0x317e084c
+    __ dci(0x5e0d5214);  // sha256h2 q20, q16, v13.4s
+    // vl128 state = 0xdd0eb379
+  }
+  // Hash the machine state (presumably stored to `state`) and load it to w0.
+  uint32_t state;
+  ComputeMachineStateHash(&masm, &state);
+  __ Mov(x0, reinterpret_cast<uint64_t>(&state));
+  __ Ldr(w0, MemOperand(x0));
+
+  END();
+  if (CAN_RUN()) {
+    RUN();
+    uint32_t expected_hashes[] = {  // Index: Q-sized lanes per vector - 1.
+        0xdd0eb379,
+        0x15384d69,
+        0x32bbc73a,
+        0xc5879e77,
+        0x9241294d,
+        0xfc01bad8,
+        0xf5e79af5,
+        0xee66e696,
+        0x535158e8,
+        0x09cfa8b6,
+        0x8cd83eae,
+        0x93ff18b0,
+        0x561444e4,
+        0xa6249eea,
+        0x830e4c73,
+        0xb516eaae,
+    };
+    ASSERT_EQUAL_64(expected_hashes[core.GetSVELaneCount(kQRegSize) - 1], x0);
+  }
+}
+
+TEST_SVE(neon_sha2su0) {
+  SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE,
+                          CPUFeatures::kNEON,
+                          CPUFeatures::kCRC32,
+                          CPUFeatures::kSHA2);
+  START();
+  // Emits 30 raw sha256su0 words; when runnable, checks a state hash.
+  SetInitialMachineState(&masm);
+  // state = 0xe2bd2480
+  // "vl128 state": presumably running hash; last one is expected_hashes[0].
+  {
+    ExactAssemblyScope scope(&masm, 30 * kInstructionSize);
+    __ dci(0x5e2828e3);  // sha256su0 v3.4s, v7.4s
+    // vl128 state = 0xbc7a7764
+    __ dci(0x5e282be1);  // sha256su0 v1.4s, v31.4s
+    // vl128 state = 0x6138a856
+    __ dci(0x5e282be9);  // sha256su0 v9.4s, v31.4s
+    // vl128 state = 0x49c6be17
+    __ dci(0x5e282beb);  // sha256su0 v11.4s, v31.4s
+    // vl128 state = 0xca658743
+    __ dci(0x5e2829bb);  // sha256su0 v27.4s, v13.4s
+    // vl128 state = 0x1bf1d233
+    __ dci(0x5e2829ba);  // sha256su0 v26.4s, v13.4s
+    // vl128 state = 0xafb0c6ae
+    __ dci(0x5e2829aa);  // sha256su0 v10.4s, v13.4s
+    // vl128 state = 0x2182e90d
+    __ dci(0x5e282b2e);  // sha256su0 v14.4s, v25.4s
+    // vl128 state = 0x401d297d
+    __ dci(0x5e282aaf);  // sha256su0 v15.4s, v21.4s
+    // vl128 state = 0x6c01fefa
+    __ dci(0x5e282aad);  // sha256su0 v13.4s, v21.4s
+    // vl128 state = 0x0f4c191d
+    __ dci(0x5e282a7d);  // sha256su0 v29.4s, v19.4s
+    // vl128 state = 0xcf26aa1b
+    __ dci(0x5e282ad9);  // sha256su0 v25.4s, v22.4s
+    // vl128 state = 0xae04081e
+    __ dci(0x5e282ac9);  // sha256su0 v9.4s, v22.4s
+    // vl128 state = 0x08149009
+    __ dci(0x5e282acb);  // sha256su0 v11.4s, v22.4s
+    // vl128 state = 0xa691e487
+    __ dci(0x5e282ac3);  // sha256su0 v3.4s, v22.4s
+    // vl128 state = 0xd728e1b5
+    __ dci(0x5e282ac7);  // sha256su0 v7.4s, v22.4s
+    // vl128 state = 0x120fac30
+    __ dci(0x5e282ac5);  // sha256su0 v5.4s, v22.4s
+    // vl128 state = 0x88086f82
+    __ dci(0x5e282ac4);  // sha256su0 v4.4s, v22.4s
+    // vl128 state = 0x625160b7
+    __ dci(0x5e282a65);  // sha256su0 v5.4s, v19.4s
+    // vl128 state = 0x308feecd
+    __ dci(0x5e282a6d);  // sha256su0 v13.4s, v19.4s
+    // vl128 state = 0x65f03097
+    __ dci(0x5e282a65);  // sha256su0 v5.4s, v19.4s
+    // vl128 state = 0x44d9fbb6
+    __ dci(0x5e282a67);  // sha256su0 v7.4s, v19.4s
+    // vl128 state = 0x694fe04a
+    __ dci(0x5e282a17);  // sha256su0 v23.4s, v16.4s
+    // vl128 state = 0x3d5c139b
+    __ dci(0x5e282a13);  // sha256su0 v19.4s, v16.4s
+    // vl128 state = 0x922f40a5
+    __ dci(0x5e282b3b);  // sha256su0 v27.4s, v25.4s
+    // vl128 state = 0x4f9c34f2
+    __ dci(0x5e282ab9);  // sha256su0 v25.4s, v21.4s
+    // vl128 state = 0x18a4f581
+    __ dci(0x5e282ab1);  // sha256su0 v17.4s, v21.4s
+    // vl128 state = 0x69da3844
+    __ dci(0x5e282ab9);  // sha256su0 v25.4s, v21.4s
+    // vl128 state = 0x57f8ce0b
+    __ dci(0x5e282a1d);  // sha256su0 v29.4s, v16.4s
+    // vl128 state = 0xafa03001
+    __ dci(0x5e282ad5);  // sha256su0 v21.4s, v22.4s
+    // vl128 state = 0x029b78a8
+  }
+  // Read back `state` through its host address: x0 = &state, w0 = [x0].
+  uint32_t state;
+  ComputeMachineStateHash(&masm, &state);
+  __ Mov(x0, reinterpret_cast<uint64_t>(&state));
+  __ Ldr(w0, MemOperand(x0));
+
+  END();
+  if (CAN_RUN()) {
+    RUN();
+    uint32_t expected_hashes[] = {  // Index: GetSVELaneCount(kQRegSize) - 1.
+        0x029b78a8,
+        0x479a8911,
+        0x6bdbdb48,
+        0x5ef3718b,
+        0x695ce173,
+        0x586543d0,
+        0xd00a22be,
+        0xe63a91b9,
+        0x42bb89a2,
+        0xea48ee79,
+        0x9788ac35,
+        0x1e8599a3,
+        0xd0d2d6ee,
+        0xfe7aaaf7,
+        0x77da6831,
+        0xb93fb875,
+    };
+    ASSERT_EQUAL_64(expected_hashes[core.GetSVELaneCount(kQRegSize) - 1], x0);
+  }
+}
+
+TEST_SVE(neon_sha2su1) {
+  SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE,
+                          CPUFeatures::kNEON,
+                          CPUFeatures::kCRC32,
+                          CPUFeatures::kSHA2);
+  START();
+  // Emits 30 raw sha256su1 words; when runnable, checks a state hash.
+  SetInitialMachineState(&masm);
+  // state = 0xe2bd2480
+  // "vl128 state": presumably running hash; last one is expected_hashes[0].
+  {
+    ExactAssemblyScope scope(&masm, 30 * kInstructionSize);
+    __ dci(0x5e1e6146);  // sha256su1 v6.4s, v10.4s, v30.4s
+    // vl128 state = 0x3bbf7782
+    __ dci(0x5e0f6144);  // sha256su1 v4.4s, v10.4s, v15.4s
+    // vl128 state = 0xf8c83149
+    __ dci(0x5e0e6174);  // sha256su1 v20.4s, v11.4s, v14.4s
+    // vl128 state = 0x3b8c353b
+    __ dci(0x5e0e6170);  // sha256su1 v16.4s, v11.4s, v14.4s
+    // vl128 state = 0x1041e30e
+    __ dci(0x5e0a6131);  // sha256su1 v17.4s, v9.4s, v10.4s
+    // vl128 state = 0xe4d81cd2
+    __ dci(0x5e0a6135);  // sha256su1 v21.4s, v9.4s, v10.4s
+    // vl128 state = 0x24869db3
+    __ dci(0x5e0a6131);  // sha256su1 v17.4s, v9.4s, v10.4s
+    // vl128 state = 0xfb093436
+    __ dci(0x5e0a6199);  // sha256su1 v25.4s, v12.4s, v10.4s
+    // vl128 state = 0x0c7939ba
+    __ dci(0x5e0e639b);  // sha256su1 v27.4s, v28.4s, v14.4s
+    // vl128 state = 0xa7e5c40a
+    __ dci(0x5e0663ab);  // sha256su1 v11.4s, v29.4s, v6.4s
+    // vl128 state = 0xc4ae571c
+    __ dci(0x5e06619b);  // sha256su1 v27.4s, v12.4s, v6.4s
+    // vl128 state = 0xf84ef221
+    __ dci(0x5e066199);  // sha256su1 v25.4s, v12.4s, v6.4s
+    // vl128 state = 0x24f98d3c
+    __ dci(0x5e0e6118);  // sha256su1 v24.4s, v8.4s, v14.4s
+    // vl128 state = 0xcdb43a3b
+    __ dci(0x5e0f601a);  // sha256su1 v26.4s, v0.4s, v15.4s
+    // vl128 state = 0x85fd37e9
+    __ dci(0x5e096012);  // sha256su1 v18.4s, v0.4s, v9.4s
+    // vl128 state = 0xabccd3f6
+    __ dci(0x5e0c601a);  // sha256su1 v26.4s, v0.4s, v12.4s
+    // vl128 state = 0x8c0232e5
+    __ dci(0x5e1c602a);  // sha256su1 v10.4s, v1.4s, v28.4s
+    // vl128 state = 0xcdcf37ba
+    __ dci(0x5e1e622e);  // sha256su1 v14.4s, v17.4s, v30.4s
+    // vl128 state = 0x25129c9a
+    __ dci(0x5e1e623e);  // sha256su1 v30.4s, v17.4s, v30.4s
+    // vl128 state = 0xd0a281b7
+    __ dci(0x5e1e630e);  // sha256su1 v14.4s, v24.4s, v30.4s
+    // vl128 state = 0x3ed92f18
+    __ dci(0x5e1f639e);  // sha256su1 v30.4s, v28.4s, v31.4s
+    // vl128 state = 0xda1056b9
+    __ dci(0x5e0f629f);  // sha256su1 v31.4s, v20.4s, v15.4s
+    // vl128 state = 0x367274fa
+    __ dci(0x5e0f63bd);  // sha256su1 v29.4s, v29.4s, v15.4s
+    // vl128 state = 0x46a79748
+    __ dci(0x5e0f63b5);  // sha256su1 v21.4s, v29.4s, v15.4s
+    // vl128 state = 0xdc427315
+    __ dci(0x5e0b63f7);  // sha256su1 v23.4s, v31.4s, v11.4s
+    // vl128 state = 0x91547f41
+    __ dci(0x5e0263e7);  // sha256su1 v7.4s, v31.4s, v2.4s
+    // vl128 state = 0x1c233ffa
+    __ dci(0x5e0062f7);  // sha256su1 v23.4s, v23.4s, v0.4s
+    // vl128 state = 0x8c2948a1
+    __ dci(0x5e1062c7);  // sha256su1 v7.4s, v22.4s, v16.4s
+    // vl128 state = 0x8b72f498
+    __ dci(0x5e1062c6);  // sha256su1 v6.4s, v22.4s, v16.4s
+    // vl128 state = 0x43d27746
+    __ dci(0x5e1063ee);  // sha256su1 v14.4s, v31.4s, v16.4s
+    // vl128 state = 0xa864e589
+  }
+  // Read back `state` through its host address: x0 = &state, w0 = [x0].
+  uint32_t state;
+  ComputeMachineStateHash(&masm, &state);
+  __ Mov(x0, reinterpret_cast<uint64_t>(&state));
+  __ Ldr(w0, MemOperand(x0));
+
+  END();
+  if (CAN_RUN()) {
+    RUN();
+    uint32_t expected_hashes[] = {  // Index: GetSVELaneCount(kQRegSize) - 1.
+        0xa864e589,
+        0xc588dfe0,
+        0x171add38,
+        0x884ca9db,
+        0x5f47fb6a,
+        0x0bd024c5,
+        0xa6921cce,
+        0x01dc8899,
+        0x0f5b4b19,
+        0x948260c1,
+        0x4d4faafe,
+        0x76ee7ff7,
+        0xd9a56156,
+        0x63c8e138,
+        0xe687f7c3,
+        0x51785434,
+    };
+    ASSERT_EQUAL_64(expected_hashes[core.GetSVELaneCount(kQRegSize) - 1], x0);
+  }
+}
+
+TEST_SVE(neon_sha3) {
+  SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE,
+                          CPUFeatures::kNEON,
+                          CPUFeatures::kCRC32,
+                          CPUFeatures::kSHA3);
+  START();
+  // Emits 60 raw rax1/bcax/eor3/xar words; when runnable, checks a state hash.
+  SetInitialMachineState(&masm);
+  // state = 0xe2bd2480
+  // "vl128 state": presumably running hash; last one is expected_hashes[0].
+  {
+    ExactAssemblyScope scope(&masm, 60 * kInstructionSize);
+    __ dci(0xce608c00);  // rax1 v0.2d, v0.2d, v0.2d
+    // vl128 state = 0x960c2b9f
+    __ dci(0xce608e28);  // rax1 v8.2d, v17.2d, v0.2d
+    // vl128 state = 0x89ea3f7b
+    __ dci(0xce618e6c);  // rax1 v12.2d, v19.2d, v1.2d
+    // vl128 state = 0xa7801384
+    __ dci(0xce718e48);  // rax1 v8.2d, v18.2d, v17.2d
+    // vl128 state = 0x4477d70d
+    __ dci(0xce738e60);  // rax1 v0.2d, v19.2d, v19.2d
+    // vl128 state = 0xdee66854
+    __ dci(0xce6b8e61);  // rax1 v1.2d, v19.2d, v11.2d
+    // vl128 state = 0x2e383dc2
+    __ dci(0xce6e8e60);  // rax1 v0.2d, v19.2d, v14.2d
+    // vl128 state = 0xa022bb6d
+    __ dci(0xce6e8e62);  // rax1 v2.2d, v19.2d, v14.2d
+    // vl128 state = 0x923f5d32
+    __ dci(0xce668e23);  // rax1 v3.2d, v17.2d, v6.2d
+    // vl128 state = 0xc2c6ca00
+    __ dci(0xce260e33);  // bcax v19.16b, v17.16b, v6.16b, v3.16b
+    // vl128 state = 0x517e85e9
+    __ dci(0xce260e23);  // bcax v3.16b, v17.16b, v6.16b, v3.16b
+    // vl128 state = 0xbcf4c332
+    __ dci(0xce260e93);  // bcax v19.16b, v20.16b, v6.16b, v3.16b
+    // vl128 state = 0x5d9d51ef
+    __ dci(0xce260a11);  // bcax v17.16b, v16.16b, v6.16b, v2.16b
+    // vl128 state = 0x69ce0099
+    __ dci(0xce260a15);  // bcax v21.16b, v16.16b, v6.16b, v2.16b
+    // vl128 state = 0x9a2cdc9f
+    __ dci(0xce244a11);  // bcax v17.16b, v16.16b, v4.16b, v18.16b
+    // vl128 state = 0x27eeff29
+    __ dci(0xce304a10);  // bcax v16.16b, v16.16b, v16.16b, v18.16b
+    // vl128 state = 0x6d586875
+    __ dci(0xce314b18);  // bcax v24.16b, v24.16b, v17.16b, v18.16b
+    // vl128 state = 0xe38b6054
+    __ dci(0xce214b28);  // bcax v8.16b, v25.16b, v1.16b, v18.16b
+    // vl128 state = 0x27a3f5f6
+    __ dci(0xce294f38);  // bcax v24.16b, v25.16b, v9.16b, v19.16b
+    // vl128 state = 0x7d7ffa9b
+    __ dci(0xce214e39);  // bcax v25.16b, v17.16b, v1.16b, v19.16b
+    // vl128 state = 0x936374f0
+    __ dci(0xce216a3d);  // bcax v29.16b, v17.16b, v1.16b, v26.16b
+    // vl128 state = 0x1c5136d5
+    __ dci(0xce296b39);  // bcax v25.16b, v25.16b, v9.16b, v26.16b
+    // vl128 state = 0x75cd7131
+    __ dci(0xce216338);  // bcax v24.16b, v25.16b, v1.16b, v24.16b
+    // vl128 state = 0xcc747626
+    __ dci(0xce2163f9);  // bcax v25.16b, v31.16b, v1.16b, v24.16b
+    // vl128 state = 0x9409c8bc
+    __ dci(0xce2043f1);  // bcax v17.16b, v31.16b, v0.16b, v16.16b
+    // vl128 state = 0x8db3a0c8
+    __ dci(0xce2043f5);  // bcax v21.16b, v31.16b, v0.16b, v16.16b
+    // vl128 state = 0xa55f8d7d
+    __ dci(0xce2043e5);  // bcax v5.16b, v31.16b, v0.16b, v16.16b
+    // vl128 state = 0xe1960c7a
+    __ dci(0xce224be7);  // bcax v7.16b, v31.16b, v2.16b, v18.16b
+    // vl128 state = 0xc9599bde
+    __ dci(0xce204bb7);  // bcax v23.16b, v29.16b, v0.16b, v18.16b
+    // vl128 state = 0x7176d08d
+    __ dci(0xce004b9f);  // eor3 v31.16b, v28.16b, v0.16b, v18.16b
+    // vl128 state = 0x10620821
+    __ dci(0xce000baf);  // eor3 v15.16b, v29.16b, v0.16b, v2.16b
+    // vl128 state = 0x0aba0288
+    __ dci(0xce0a0bab);  // eor3 v11.16b, v29.16b, v10.16b, v2.16b
+    // vl128 state = 0xe6517156
+    __ dci(0xce0e1baf);  // eor3 v15.16b, v29.16b, v14.16b, v6.16b
+    // vl128 state = 0x6b7021fb
+    __ dci(0xce0e3fa7);  // eor3 v7.16b, v29.16b, v14.16b, v15.16b
+    // vl128 state = 0x05761b1f
+    __ dci(0xce0e2fe5);  // eor3 v5.16b, v31.16b, v14.16b, v11.16b
+    // vl128 state = 0xe01822c6
+    __ dci(0xce2e2fc7);  // bcax v7.16b, v30.16b, v14.16b, v11.16b
+    // vl128 state = 0xdc6444d7
+    __ dci(0xce3e2dcf);  // bcax v15.16b, v14.16b, v30.16b, v11.16b
+    // vl128 state = 0xa5ecad2e
+    __ dci(0xce3e3fdf);  // bcax v31.16b, v30.16b, v30.16b, v15.16b
+    // vl128 state = 0x2124dc42
+    __ dci(0xce3a3ede);  // bcax v30.16b, v22.16b, v26.16b, v15.16b
+    // vl128 state = 0x57f77204
+    __ dci(0xce3a2e9c);  // bcax v28.16b, v20.16b, v26.16b, v11.16b
+    // vl128 state = 0x6e8d303d
+    __ dci(0xce3a2294);  // bcax v20.16b, v20.16b, v26.16b, v8.16b
+    // vl128 state = 0xdb53d42c
+    __ dci(0xce38029c);  // bcax v28.16b, v20.16b, v24.16b, v0.16b
+    // vl128 state = 0x258d49b8
+    __ dci(0xce38088c);  // bcax v12.16b, v4.16b, v24.16b, v2.16b
+    // vl128 state = 0xe751a348
+    __ dci(0xce28008e);  // bcax v14.16b, v4.16b, v8.16b, v0.16b
+    // vl128 state = 0x8ce0aa1a
+    __ dci(0xce28008a);  // bcax v10.16b, v4.16b, v8.16b, v0.16b
+    // vl128 state = 0x1fdf89a5
+    __ dci(0xce280088);  // bcax v8.16b, v4.16b, v8.16b, v0.16b
+    // vl128 state = 0xcc51f5e1
+    __ dci(0xce2a1089);  // bcax v9.16b, v4.16b, v10.16b, v4.16b
+    // vl128 state = 0xdaf766b0
+    __ dci(0xce0b1081);  // eor3 v1.16b, v4.16b, v11.16b, v4.16b
+    // vl128 state = 0x2da7deb5
+    __ dci(0xce0a1011);  // eor3 v17.16b, v0.16b, v10.16b, v4.16b
+    // vl128 state = 0xcc86f5d4
+    __ dci(0xce121010);  // eor3 v16.16b, v0.16b, v18.16b, v4.16b
+    // vl128 state = 0xfb722105
+    __ dci(0xce921118);  // xar v24.2d, v8.2d, v18.2d, #4
+    // vl128 state = 0x9a7752e3
+    __ dci(0xce9a1199);  // xar v25.2d, v12.2d, v26.2d, #4
+    // vl128 state = 0x83a251c2
+    __ dci(0xce9e11dd);  // xar v29.2d, v14.2d, v30.2d, #4
+    // vl128 state = 0x1e31c9d5
+    __ dci(0xce9e915c);  // xar v28.2d, v10.2d, v30.2d, #36
+    // vl128 state = 0x0e421d73
+    __ dci(0xce1e115d);  // eor3 v29.16b, v10.16b, v30.16b, v4.16b
+    // vl128 state = 0xb5a8c677
+    __ dci(0xce3e515c);  // bcax v28.16b, v10.16b, v30.16b, v20.16b
+    // vl128 state = 0x21587300
+    __ dci(0xce3e5154);  // bcax v20.16b, v10.16b, v30.16b, v20.16b
+    // vl128 state = 0x9459c629
+    __ dci(0xce3e1056);  // bcax v22.16b, v2.16b, v30.16b, v4.16b
+    // vl128 state = 0xdb02263a
+    __ dci(0xce2a105e);  // bcax v30.16b, v2.16b, v10.16b, v4.16b
+    // vl128 state = 0xc9d210aa
+    __ dci(0xce3a5056);  // bcax v22.16b, v2.16b, v26.16b, v20.16b
+    // vl128 state = 0x4cc56293
+  }
+  // Read back `state` through its host address: x0 = &state, w0 = [x0].
+  uint32_t state;
+  ComputeMachineStateHash(&masm, &state);
+  __ Mov(x0, reinterpret_cast<uint64_t>(&state));
+  __ Ldr(w0, MemOperand(x0));
+
+  END();
+  if (CAN_RUN()) {
+    RUN();
+    uint32_t expected_hashes[] = {  // Index: GetSVELaneCount(kQRegSize) - 1.
+        0x4cc56293,
+        0xee8bac03,
+        0xc1253ac9,
+        0x9fe5aa0f,
+        0x43df27f4,
+        0x19f03be6,
+        0xd26c928b,
+        0x7b9da4c4,
+        0xe13149a7,
+        0x9fa11ed9,
+        0xe02cc4dd,
+        0x7848dfe7,
+        0x5ed1726f,
+        0x983e0123,
+        0x34166240,
+        0xc4ee172f,
+    };
+    ASSERT_EQUAL_64(expected_hashes[core.GetSVELaneCount(kQRegSize) - 1], x0);
+  }
+}
+
+TEST_SVE(neon_sha512) {
+  SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE,
+                          CPUFeatures::kNEON,
+                          CPUFeatures::kCRC32,
+                          CPUFeatures::kSHA512);
+  START();
+  // Emits 50 raw sha512h/sha512h2/sha512su1 words; checks a hash if runnable.
+  SetInitialMachineState(&masm);
+  // state = 0xe2bd2480
+  // "vl128 state": presumably running hash; last one is expected_hashes[0].
+  {
+    ExactAssemblyScope scope(&masm, 50 * kInstructionSize);
+    __ dci(0xce6583cc);  // sha512h q12, q30, v5.2d
+    // vl128 state = 0xecc5733a
+    __ dci(0xce6586c8);  // sha512h2 q8, q22, v5.2d
+    // vl128 state = 0xe05f2087
+    __ dci(0xce7586e0);  // sha512h2 q0, q23, v21.2d
+    // vl128 state = 0x1925555b
+    __ dci(0xce7187e8);  // sha512h2 q8, q31, v17.2d
+    // vl128 state = 0x891dba65
+    __ dci(0xce7586ec);  // sha512h2 q12, q23, v21.2d
+    // vl128 state = 0xdfbe3239
+    __ dci(0xce7580fc);  // sha512h q28, q7, v21.2d
+    // vl128 state = 0xba49dbc1
+    __ dci(0xce7580f4);  // sha512h q20, q7, v21.2d
+    // vl128 state = 0x3ad11a23
+    __ dci(0xce6780f6);  // sha512h q22, q7, v7.2d
+    // vl128 state = 0xcf9e1803
+    __ dci(0xce6780f7);  // sha512h q23, q7, v7.2d
+    // vl128 state = 0xe2baee15
+    __ dci(0xce6785e7);  // sha512h2 q7, q15, v7.2d
+    // vl128 state = 0x900a337c
+    __ dci(0xce6f8565);  // sha512h2 q5, q11, v15.2d
+    // vl128 state = 0xc6e5d7eb
+    __ dci(0xce6f8424);  // sha512h2 q4, q1, v15.2d
+    // vl128 state = 0xcbcb6ac1
+    __ dci(0xce6b84a6);  // sha512h2 q6, q5, v11.2d
+    // vl128 state = 0xa3c1a679
+    __ dci(0xce7b848e);  // sha512h2 q14, q4, v27.2d
+    // vl128 state = 0x47c4e54d
+    __ dci(0xce7d849e);  // sha512h2 q30, q4, v29.2d
+    // vl128 state = 0x9f519a29
+    __ dci(0xce7f859c);  // sha512h2 q28, q12, v31.2d
+    // vl128 state = 0xa4433415
+    __ dci(0xce778494);  // sha512h2 q20, q4, v23.2d
+    // vl128 state = 0xf03a69ec
+    __ dci(0xce778484);  // sha512h2 q4, q4, v23.2d
+    // vl128 state = 0x2c728333
+    __ dci(0xce77850c);  // sha512h2 q12, q8, v23.2d
+    // vl128 state = 0xaedc423e
+    __ dci(0xce77815c);  // sha512h q28, q10, v23.2d
+    // vl128 state = 0xea9346ea
+    __ dci(0xce7381cc);  // sha512h q12, q14, v19.2d
+    // vl128 state = 0x05ad87d1
+    __ dci(0xce7a81dc);  // sha512h q28, q14, v26.2d
+    // vl128 state = 0x9b1cd7b3
+    __ dci(0xce7285d4);  // sha512h2 q20, q14, v18.2d
+    // vl128 state = 0x154201ac
+    __ dci(0xce7280d6);  // sha512h q22, q6, v18.2d
+    // vl128 state = 0xd8640492
+    __ dci(0xce7a81d4);  // sha512h q20, q14, v26.2d
+    // vl128 state = 0x908eb258
+    __ dci(0xce7281f0);  // sha512h q16, q15, v18.2d
+    // vl128 state = 0x0067f162
+    __ dci(0xce728572);  // sha512h2 q18, q11, v18.2d
+    // vl128 state = 0xca9bc751
+    __ dci(0xce728422);  // sha512h2 q2, q1, v18.2d
+    // vl128 state = 0x06b7318d
+    __ dci(0xce738412);  // sha512h2 q18, q0, v19.2d
+    // vl128 state = 0xad019588
+    __ dci(0xce718016);  // sha512h q22, q0, v17.2d
+    // vl128 state = 0x55a29e9b
+    __ dci(0xce718834);  // sha512su1 v20.2d, v1.2d, v17.2d
+    // vl128 state = 0x953a9c7a
+    __ dci(0xce738876);  // sha512su1 v22.2d, v3.2d, v19.2d
+    // vl128 state = 0x4f194c71
+    __ dci(0xce638826);  // sha512su1 v6.2d, v1.2d, v3.2d
+    // vl128 state = 0x08e50d47
+    __ dci(0xce6b886e);  // sha512su1 v14.2d, v3.2d, v11.2d
+    // vl128 state = 0x4bdfb870
+    __ dci(0xce6b88de);  // sha512su1 v30.2d, v6.2d, v11.2d
+    // vl128 state = 0xbcf4b6c5
+    __ dci(0xce7f88df);  // sha512su1 v31.2d, v6.2d, v31.2d
+    // vl128 state = 0x916dede1
+    __ dci(0xce6f8acf);  // sha512su1 v15.2d, v22.2d, v15.2d
+    // vl128 state = 0x3b776003
+    __ dci(0xce6d8bcb);  // sha512su1 v11.2d, v30.2d, v13.2d
+    // vl128 state = 0x5d5cb7d9
+    __ dci(0xce6d83ea);  // sha512h q10, q31, v13.2d
+    // vl128 state = 0x18df9e46
+    __ dci(0xce6d8328);  // sha512h q8, q25, v13.2d
+    // vl128 state = 0xde5807d0
+    __ dci(0xce6583b8);  // sha512h q24, q29, v5.2d
+    // vl128 state = 0x861020e7
+    __ dci(0xce6d83f9);  // sha512h q25, q31, v13.2d
+    // vl128 state = 0x39d960f4
+    __ dci(0xce6d8b78);  // sha512su1 v24.2d, v27.2d, v13.2d
+    // vl128 state = 0x3afc2b5c
+    __ dci(0xce6c8968);  // sha512su1 v8.2d, v11.2d, v12.2d
+    // vl128 state = 0x74d44114
+    __ dci(0xce6c8b49);  // sha512su1 v9.2d, v26.2d, v12.2d
+    // vl128 state = 0x72e6b5cd
+    __ dci(0xce6c8b39);  // sha512su1 v25.2d, v25.2d, v12.2d
+    // vl128 state = 0x6aaa4658
+    __ dci(0xce6c8b9d);  // sha512su1 v29.2d, v28.2d, v12.2d
+    // vl128 state = 0x7c076c9b
+    __ dci(0xce648b0d);  // sha512su1 v13.2d, v24.2d, v4.2d
+    // vl128 state = 0x1082519d
+    __ dci(0xce648385);  // sha512h q5, q28, v4.2d
+    // vl128 state = 0x9ed9d190
+    __ dci(0xce648715);  // sha512h2 q21, q24, v4.2d
+    // vl128 state = 0xaace5a02
+  }
+  // Read back `state` through its host address: x0 = &state, w0 = [x0].
+  uint32_t state;
+  ComputeMachineStateHash(&masm, &state);
+  __ Mov(x0, reinterpret_cast<uint64_t>(&state));
+  __ Ldr(w0, MemOperand(x0));
+
+  END();
+  if (CAN_RUN()) {
+    RUN();
+    uint32_t expected_hashes[] = {  // Index: GetSVELaneCount(kQRegSize) - 1.
+        0xaace5a02,
+        0x912905de,
+        0xc62c0756,
+        0xac6646d5,
+        0xd3c2e6af,
+        0x029ae35f,
+        0xf5e83b54,
+        0x49f8d50c,
+        0xc5175320,
+        0xb51c8ebd,
+        0x2dc184b0,
+        0x01e01875,
+        0x28df0d5a,
+        0x01d2fff2,
+        0x5f5f5909,
+        0x6aead9d8,
+    };
+    ASSERT_EQUAL_64(expected_hashes[core.GetSVELaneCount(kQRegSize) - 1], x0);
+  }
+}
+
+TEST_SVE(neon_sha512su0) {
+  SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE,
+                          CPUFeatures::kNEON,
+                          CPUFeatures::kCRC32,
+                          CPUFeatures::kSHA512);
+  START();
+  // Emits 30 raw sha512su0 words; when runnable, checks a state hash.
+  SetInitialMachineState(&masm);
+  // state = 0xe2bd2480
+  // "vl128 state": presumably running hash; last one is expected_hashes[0].
+  {
+    ExactAssemblyScope scope(&masm, 30 * kInstructionSize);
+    __ dci(0xcec083f6);  // sha512su0 v22.2d, v31.2d
+    // vl128 state = 0xf7a54f2b
+    __ dci(0xcec083e6);  // sha512su0 v6.2d, v31.2d
+    // vl128 state = 0x919c170d
+    __ dci(0xcec08347);  // sha512su0 v7.2d, v26.2d
+    // vl128 state = 0x8a1800d6
+    __ dci(0xcec082c6);  // sha512su0 v6.2d, v22.2d
+    // vl128 state = 0x353aa8bf
+    __ dci(0xcec082c4);  // sha512su0 v4.2d, v22.2d
+    // vl128 state = 0x87d75b6c
+    __ dci(0xcec082c0);  // sha512su0 v0.2d, v22.2d
+    // vl128 state = 0xf2ee6974
+    __ dci(0xcec082c1);  // sha512su0 v1.2d, v22.2d
+    // vl128 state = 0xf2ec1e17
+    __ dci(0xcec082c0);  // sha512su0 v0.2d, v22.2d
+    // vl128 state = 0x1bcca060
+    __ dci(0xcec082c4);  // sha512su0 v4.2d, v22.2d
+    // vl128 state = 0x67773394
+    __ dci(0xcec082c5);  // sha512su0 v5.2d, v22.2d
+    // vl128 state = 0xbb344c8d
+    __ dci(0xcec083e1);  // sha512su0 v1.2d, v31.2d
+    // vl128 state = 0x595e2eb0
+    __ dci(0xcec081a5);  // sha512su0 v5.2d, v13.2d
+    // vl128 state = 0x7d7f4e15
+    __ dci(0xcec081a7);  // sha512su0 v7.2d, v13.2d
+    // vl128 state = 0xba4b1bc6
+    __ dci(0xcec081a3);  // sha512su0 v3.2d, v13.2d
+    // vl128 state = 0x2c56ee6e
+    __ dci(0xcec083f3);  // sha512su0 v19.2d, v31.2d
+    // vl128 state = 0xefe9b855
+    __ dci(0xcec08397);  // sha512su0 v23.2d, v28.2d
+    // vl128 state = 0x6f0d20ba
+    __ dci(0xcec08396);  // sha512su0 v22.2d, v28.2d
+    // vl128 state = 0x9be77fdb
+    __ dci(0xcec081b7);  // sha512su0 v23.2d, v13.2d
+    // vl128 state = 0x5d981c55
+    __ dci(0xcec080ff);  // sha512su0 v31.2d, v7.2d
+    // vl128 state = 0x9126079f
+    __ dci(0xcec080fd);  // sha512su0 v29.2d, v7.2d
+    // vl128 state = 0x3199dc9e
+    __ dci(0xcec081dc);  // sha512su0 v28.2d, v14.2d
+    // vl128 state = 0x20fb48d7
+    __ dci(0xcec081cc);  // sha512su0 v12.2d, v14.2d
+    // vl128 state = 0x4ae6221a
+    __ dci(0xcec08088);  // sha512su0 v8.2d, v4.2d
+    // vl128 state = 0x17e8b62d
+    __ dci(0xcec0808a);  // sha512su0 v10.2d, v4.2d
+    // vl128 state = 0x90d73468
+    __ dci(0xcec0809a);  // sha512su0 v26.2d, v4.2d
+    // vl128 state = 0x1f02f97f
+    __ dci(0xcec081de);  // sha512su0 v30.2d, v14.2d
+    // vl128 state = 0xe5ef3e67
+    __ dci(0xcec081bf);  // sha512su0 v31.2d, v13.2d
+    // vl128 state = 0xd1bcc363
+    __ dci(0xcec081bb);  // sha512su0 v27.2d, v13.2d
+    // vl128 state = 0x8bcfab58
+    __ dci(0xcec08033);  // sha512su0 v19.2d, v1.2d
+    // vl128 state = 0x93fb8bad
+    __ dci(0xcec080fb);  // sha512su0 v27.2d, v7.2d
+    // vl128 state = 0x3598e921
+  }
+  // Read back `state` through its host address: x0 = &state, w0 = [x0].
+  uint32_t state;
+  ComputeMachineStateHash(&masm, &state);
+  __ Mov(x0, reinterpret_cast<uint64_t>(&state));
+  __ Ldr(w0, MemOperand(x0));
+
+  END();
+  if (CAN_RUN()) {
+    RUN();
+    uint32_t expected_hashes[] = {  // Index: GetSVELaneCount(kQRegSize) - 1.
+        0x3598e921,
+        0x7e3ee16c,
+        0x4856987c,
+        0x193bda79,
+        0x84154d6f,
+        0x861f1795,
+        0xb74d39b3,
+        0x9653d8b3,
+        0x6690a066,
+        0x00a29b51,
+        0xb2c795ce,
+        0xcbd03b05,
+        0x9fb2aaec,
+        0x0216b732,
+        0x96eb6864,
+        0x4024f5c7,
+    };
+    ASSERT_EQUAL_64(expected_hashes[core.GetSVELaneCount(kQRegSize) - 1], x0);
+  }
+}
+
+TEST_SVE(neon_aes) {
+  SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE,
+                          CPUFeatures::kNEON,
+                          CPUFeatures::kCRC32,
+                          CPUFeatures::kAES);
+  START();
+  // Emits 30 raw aesd/aese words; when runnable, checks a state hash.
+  SetInitialMachineState(&masm);
+  // state = 0xe2bd2480
+  // "vl128 state": presumably running hash; last one is expected_hashes[0].
+  {
+    ExactAssemblyScope scope(&masm, 30 * kInstructionSize);
+    __ dci(0x4e285a86);  // aesd v6.16b, v20.16b
+    // vl128 state = 0x801bfc08
+    __ dci(0x4e2858ae);  // aesd v14.16b, v5.16b
+    // vl128 state = 0xbd83a757
+    __ dci(0x4e2858ac);  // aesd v12.16b, v5.16b
+    // vl128 state = 0x9fb1dc6b
+    __ dci(0x4e2858ae);  // aesd v14.16b, v5.16b
+    // vl128 state = 0xfa1fa7e4
+    __ dci(0x4e28482a);  // aese v10.16b, v1.16b
+    // vl128 state = 0xecfcfe2d
+    __ dci(0x4e28483a);  // aese v26.16b, v1.16b
+    // vl128 state = 0x05e22f07
+    __ dci(0x4e28488a);  // aese v10.16b, v4.16b
+    // vl128 state = 0xdd53df5f
+    __ dci(0x4e28488e);  // aese v14.16b, v4.16b
+    // vl128 state = 0x9d2ac50f
+    __ dci(0x4e28484f);  // aese v15.16b, v2.16b
+    // vl128 state = 0xf45146ab
+    __ dci(0x4e28484b);  // aese v11.16b, v2.16b
+    // vl128 state = 0xf1260a7c
+    __ dci(0x4e28485b);  // aese v27.16b, v2.16b
+    // vl128 state = 0x3a0844da
+    __ dci(0x4e285819);  // aesd v25.16b, v0.16b
+    // vl128 state = 0xaca89993
+    __ dci(0x4e284a09);  // aese v9.16b, v16.16b
+    // vl128 state = 0xef4e9a5f
+    __ dci(0x4e285a4b);  // aesd v11.16b, v18.16b
+    // vl128 state = 0x209a44bc
+    __ dci(0x4e285a4f);  // aesd v15.16b, v18.16b
+    // vl128 state = 0xc6d2d718
+    __ dci(0x4e285a4d);  // aesd v13.16b, v18.16b
+    // vl128 state = 0x1aceef8f
+    __ dci(0x4e285a45);  // aesd v5.16b, v18.16b
+    // vl128 state = 0x7ed056c6
+    __ dci(0x4e285af5);  // aesd v21.16b, v23.16b
+    // vl128 state = 0x429ed71e
+    __ dci(0x4e285a91);  // aesd v17.16b, v20.16b
+    // vl128 state = 0xd7a1f687
+    __ dci(0x4e284ad9);  // aese v25.16b, v22.16b
+    // vl128 state = 0x8fa44574
+    __ dci(0x4e284adb);  // aese v27.16b, v22.16b
+    // vl128 state = 0xd2792169
+    __ dci(0x4e285afa);  // aesd v26.16b, v23.16b
+    // vl128 state = 0xe502f095
+    __ dci(0x4e285bbb);  // aesd v27.16b, v29.16b
+    // vl128 state = 0x0e3d3238
+    __ dci(0x4e285bbf);  // aesd v31.16b, v29.16b
+    // vl128 state = 0x0ad06592
+    __ dci(0x4e285baf);  // aesd v15.16b, v29.16b
+    // vl128 state = 0xb94f3c19
+    __ dci(0x4e284b3f);  // aese v31.16b, v25.16b
+    // vl128 state = 0xf31a0da1
+    __ dci(0x4e284917);  // aese v23.16b, v8.16b
+    // vl128 state = 0x7d2d7811
+    __ dci(0x4e284913);  // aese v19.16b, v8.16b
+    // vl128 state = 0x41b7b854
+    __ dci(0x4e284911);  // aese v17.16b, v8.16b
+    // vl128 state = 0x60600536
+    __ dci(0x4e2849d5);  // aese v21.16b, v14.16b
+    // vl128 state = 0x3e0cc74f
+  }
+  // Read back `state` through its host address: x0 = &state, w0 = [x0].
+  uint32_t state;
+  ComputeMachineStateHash(&masm, &state);
+  __ Mov(x0, reinterpret_cast<uint64_t>(&state));
+  __ Ldr(w0, MemOperand(x0));
+
+  END();
+  if (CAN_RUN()) {
+    RUN();
+    uint32_t expected_hashes[] = {  // Index: GetSVELaneCount(kQRegSize) - 1.
+        0x3e0cc74f,
+        0x7f17ba2e,
+        0xd59f8e91,
+        0x9f15a51b,
+        0x11d92e66,
+        0xcd53d015,
+        0xbc652785,
+        0x6974fa54,
+        0x953d342e,
+        0xf1aa56b3,
+        0xde8ca1d3,
+        0xba408b82,
+        0x48094fa4,
+        0xb757bcf1,
+        0x2cc5be58,
+        0x6e7a0f58,
+    };
+    ASSERT_EQUAL_64(expected_hashes[core.GetSVELaneCount(kQRegSize) - 1], x0);
+  }
+}
+
+TEST_SVE(neon_aesmc) {
+  SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE,
+                          CPUFeatures::kNEON,
+                          CPUFeatures::kCRC32,
+                          CPUFeatures::kAES);
+  START();
+  // Emits 30 raw aesimc/aesmc words; when runnable, checks a state hash.
+  SetInitialMachineState(&masm);
+  // state = 0xe2bd2480
+  // "vl128 state": presumably running hash; last one is expected_hashes[0].
+  {
+    ExactAssemblyScope scope(&masm, 30 * kInstructionSize);
+    __ dci(0x4e287800);  // aesimc v0.16b, v0.16b
+    // vl128 state = 0x03554749
+    __ dci(0x4e287a28);  // aesimc v8.16b, v17.16b
+    // vl128 state = 0x59d5fedd
+    __ dci(0x4e287a2a);  // aesimc v10.16b, v17.16b
+    // vl128 state = 0xcda29514
+    __ dci(0x4e286aae);  // aesmc v14.16b, v21.16b
+    // vl128 state = 0xae8f019a
+    __ dci(0x4e286abe);  // aesmc v30.16b, v21.16b
+    // vl128 state = 0x7b04c6c0
+    __ dci(0x4e286a0e);  // aesmc v14.16b, v16.16b
+    // vl128 state = 0xaf6c5ce6
+    __ dci(0x4e286a0a);  // aesmc v10.16b, v16.16b
+    // vl128 state = 0xf1d7fd2b
+    __ dci(0x4e286acb);  // aesmc v11.16b, v22.16b
+    // vl128 state = 0x5d693c63
+    __ dci(0x4e286acf);  // aesmc v15.16b, v22.16b
+    // vl128 state = 0xec8971ad
+    __ dci(0x4e286adf);  // aesmc v31.16b, v22.16b
+    // vl128 state = 0x6389b200
+    __ dci(0x4e287a9d);  // aesimc v29.16b, v20.16b
+    // vl128 state = 0xd69341fb
+    __ dci(0x4e28688d);  // aesmc v13.16b, v4.16b
+    // vl128 state = 0x6344af95
+    __ dci(0x4e2878cf);  // aesimc v15.16b, v6.16b
+    // vl128 state = 0x5c58dfac
+    __ dci(0x4e2878cb);  // aesimc v11.16b, v6.16b
+    // vl128 state = 0x7dc9cf34
+    __ dci(0x4e2878c9);  // aesimc v9.16b, v6.16b
+    // vl128 state = 0xff4b3544
+    __ dci(0x4e2878c1);  // aesimc v1.16b, v6.16b
+    // vl128 state = 0xd1937de2
+    __ dci(0x4e287871);  // aesimc v17.16b, v3.16b
+    // vl128 state = 0x7cabd208
+    __ dci(0x4e287815);  // aesimc v21.16b, v0.16b
+    // vl128 state = 0xbc06df94
+    __ dci(0x4e28685d);  // aesmc v29.16b, v2.16b
+    // vl128 state = 0xfc4478bb
+    __ dci(0x4e28685f);  // aesmc v31.16b, v2.16b
+    // vl128 state = 0x0c72c200
+    __ dci(0x4e28787e);  // aesimc v30.16b, v3.16b
+    // vl128 state = 0xdd822b9d
+    __ dci(0x4e28793f);  // aesimc v31.16b, v9.16b
+    // vl128 state = 0x1397dcc6
+    __ dci(0x4e28793b);  // aesimc v27.16b, v9.16b
+    // vl128 state = 0x43f3abd6
+    __ dci(0x4e28792b);  // aesimc v11.16b, v9.16b
+    // vl128 state = 0xeb8ca365
+    __ dci(0x4e2869bb);  // aesmc v27.16b, v13.16b
+    // vl128 state = 0x0a957f4f
+    __ dci(0x4e286b93);  // aesmc v19.16b, v28.16b
+    // vl128 state = 0xbc5da8bd
+    __ dci(0x4e286b97);  // aesmc v23.16b, v28.16b
+    // vl128 state = 0xc49343cc
+    __ dci(0x4e286b95);  // aesmc v21.16b, v28.16b
+    // vl128 state = 0x8c80c144
+    __ dci(0x4e286b51);  // aesmc v17.16b, v26.16b
+    // vl128 state = 0xeda3255d
+    __ dci(0x4e2869d3);  // aesmc v19.16b, v14.16b
+    // vl128 state = 0x8db8a9d0
+  }
+  // Read back `state` through its host address: x0 = &state, w0 = [x0].
+  uint32_t state;
+  ComputeMachineStateHash(&masm, &state);
+  __ Mov(x0, reinterpret_cast<uint64_t>(&state));
+  __ Ldr(w0, MemOperand(x0));
+
+  END();
+  if (CAN_RUN()) {
+    RUN();
+    uint32_t expected_hashes[] = {  // Index: GetSVELaneCount(kQRegSize) - 1.
+        0x8db8a9d0,
+        0xb13d8e1e,
+        0x9f33ca70,
+        0x38f7ef7a,
+        0x65352b29,
+        0xc4257260,
+        0xf49587c2,
+        0xb3f61256,
+        0x8ef4a534,
+        0x6e061aa9,
+        0x7270527d,
+        0x3e1f82f9,
+        0x1fe79e60,
+        0x985cab68,
+        0xe77b4484,
+        0xe3817f4e,
+    };
+    ASSERT_EQUAL_64(expected_hashes[core.GetSVELaneCount(kQRegSize) - 1], x0);
+  }
+}
+
+TEST_SVE(neon_sm3) {  // sm3ss1 encodings, checked by state hash.
+  SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE,
+                          CPUFeatures::kNEON,
+                          CPUFeatures::kCRC32,
+                          CPUFeatures::kSM3);
+  START();
+
+  SetInitialMachineState(&masm);
+  // state = 0xe2bd2480
+
+  {
+    ExactAssemblyScope scope(&masm, 10 * kInstructionSize);  // 10 dci words.
+    __ dci(0xce591017);  // sm3ss1 v23.4s, v0.4s, v25.4s, v4.4s
+    // vl128 state = 0xad4bba0a
+    __ dci(0xce49121f);  // sm3ss1 v31.4s, v16.4s, v9.4s, v4.4s
+    // vl128 state = 0x84adef21
+    __ dci(0xce49121e);  // sm3ss1 v30.4s, v16.4s, v9.4s, v4.4s
+    // vl128 state = 0xccfd7e5a
+    __ dci(0xce49301a);  // sm3ss1 v26.4s, v0.4s, v9.4s, v12.4s
+    // vl128 state = 0x60833cc7
+    __ dci(0xce49720a);  // sm3ss1 v10.4s, v16.4s, v9.4s, v28.4s
+    // vl128 state = 0x03f03263
+    __ dci(0xce58721a);  // sm3ss1 v26.4s, v16.4s, v24.4s, v28.4s
+    // vl128 state = 0x31845f40
+    __ dci(0xce58702a);  // sm3ss1 v10.4s, v1.4s, v24.4s, v28.4s
+    // vl128 state = 0x54c64f70
+    __ dci(0xce58753a);  // sm3ss1 v26.4s, v9.4s, v24.4s, v29.4s
+    // vl128 state = 0x3d5cb04f
+    __ dci(0xce507518);  // sm3ss1 v24.4s, v8.4s, v16.4s, v29.4s
+    // vl128 state = 0xe02de221
+    __ dci(0xce406519);  // sm3ss1 v25.4s, v8.4s, v0.4s, v25.4s
+    // vl128 state = 0x73d36ae8
+  }
+
+  uint32_t state;  // Loaded into w0 by the generated code below.
+  ComputeMachineStateHash(&masm, &state);
+  __ Mov(x0, reinterpret_cast<uint64_t>(&state));
+  __ Ldr(w0, MemOperand(x0));
+
+  END();
+  if (CAN_RUN()) {
+    RUN();
+    uint32_t expected_hashes[] = {  // Indexed by Q-sized lane count - 1.
+        0x73d36ae8,  // Equals the final "vl128 state" above.
+        0xcbcda2db,
+        0x6ee9ad3d,
+        0xa6857a16,
+        0xa238ec05,
+        0x1bc82d1d,
+        0xe4530773,
+        0xfb0d092e,
+        0xe62aff0a,
+        0xf56a593f,
+        0x3967d590,
+        0xebcd14a0,
+        0xa7bedcb8,
+        0x867fa43c,
+        0x1679eab5,
+        0x0a836861,
+    };
+    ASSERT_EQUAL_64(expected_hashes[core.GetSVELaneCount(kQRegSize) - 1], x0);
+  }
+}
+
+TEST_SVE(neon_sm3partw12) {  // sm3partw1/sm3partw2, checked by state hash.
+  SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE,
+                          CPUFeatures::kNEON,
+                          CPUFeatures::kCRC32,
+                          CPUFeatures::kSM3);
+  START();
+
+  SetInitialMachineState(&masm);
+  // state = 0xe2bd2480
+
+  {
+    ExactAssemblyScope scope(&masm, 30 * kInstructionSize);  // 30 dci words.
+    __ dci(0xce70c201);  // sm3partw1 v1.4s, v16.4s, v16.4s
+    // vl128 state = 0x6f2069a6
+    __ dci(0xce72c303);  // sm3partw1 v3.4s, v24.4s, v18.4s
+    // vl128 state = 0x986fa56c
+    __ dci(0xce76c381);  // sm3partw1 v1.4s, v28.4s, v22.4s
+    // vl128 state = 0x5dbd953c
+    __ dci(0xce7ec3b1);  // sm3partw1 v17.4s, v29.4s, v30.4s
+    // vl128 state = 0xc72ccca5
+    __ dci(0xce7ac1b5);  // sm3partw1 v21.4s, v13.4s, v26.4s
+    // vl128 state = 0x33cdfd6a
+    __ dci(0xce7ac1b7);  // sm3partw1 v23.4s, v13.4s, v26.4s
+    // vl128 state = 0x4303e945
+    __ dci(0xce7ac1bf);  // sm3partw1 v31.4s, v13.4s, v26.4s
+    // vl128 state = 0x56acac84
+    __ dci(0xce78c1fd);  // sm3partw1 v29.4s, v15.4s, v24.4s
+    // vl128 state = 0x5e2a2793
+    __ dci(0xce78c5df);  // sm3partw2 v31.4s, v14.4s, v24.4s
+    // vl128 state = 0xf7c457f3
+    __ dci(0xce70c55d);  // sm3partw2 v29.4s, v10.4s, v16.4s
+    // vl128 state = 0xfa3557ac
+    __ dci(0xce60c159);  // sm3partw1 v25.4s, v10.4s, v0.4s
+    // vl128 state = 0xb3ae6830
+    __ dci(0xce62c55b);  // sm3partw2 v27.4s, v10.4s, v2.4s
+    // vl128 state = 0xa7747c70
+    __ dci(0xce66c753);  // sm3partw2 v19.4s, v26.4s, v6.4s
+    // vl128 state = 0xb55f5895
+    __ dci(0xce67c551);  // sm3partw2 v17.4s, v10.4s, v7.4s
+    // vl128 state = 0x519b1342
+    __ dci(0xce65c750);  // sm3partw2 v16.4s, v26.4s, v5.4s
+    // vl128 state = 0xc4e6e4b9
+    __ dci(0xce61c718);  // sm3partw2 v24.4s, v24.4s, v1.4s
+    // vl128 state = 0x127c483c
+    __ dci(0xce61c71c);  // sm3partw2 v28.4s, v24.4s, v1.4s
+    // vl128 state = 0x92783ecc
+    __ dci(0xce6dc714);  // sm3partw2 v20.4s, v24.4s, v13.4s
+    // vl128 state = 0xe11e87d3
+    __ dci(0xce65c756);  // sm3partw2 v22.4s, v26.4s, v5.4s
+    // vl128 state = 0x8b6878d0
+    __ dci(0xce65c5d2);  // sm3partw2 v18.4s, v14.4s, v5.4s
+    // vl128 state = 0xf2fb1e86
+    __ dci(0xce64c550);  // sm3partw2 v16.4s, v10.4s, v4.4s
+    // vl128 state = 0x73ad3b0f
+    __ dci(0xce66c578);  // sm3partw2 v24.4s, v11.4s, v6.4s
+    // vl128 state = 0x7e03900d
+    __ dci(0xce76c55c);  // sm3partw2 v28.4s, v10.4s, v22.4s
+    // vl128 state = 0x1d0b5df6
+    __ dci(0xce76c54c);  // sm3partw2 v12.4s, v10.4s, v22.4s
+    // vl128 state = 0x1a3d7a77
+    __ dci(0xce7ec448);  // sm3partw2 v8.4s, v2.4s, v30.4s
+    // vl128 state = 0x3ed2e4bd
+    __ dci(0xce6ec409);  // sm3partw2 v9.4s, v0.4s, v14.4s
+    // vl128 state = 0x826dd348
+    __ dci(0xce6ec52b);  // sm3partw2 v11.4s, v9.4s, v14.4s
+    // vl128 state = 0x3ff5e482
+    __ dci(0xce66c72f);  // sm3partw2 v15.4s, v25.4s, v6.4s
+    // vl128 state = 0x6fd24cd4
+    __ dci(0xce65c73f);  // sm3partw2 v31.4s, v25.4s, v5.4s
+    // vl128 state = 0xd51ac474
+    __ dci(0xce67c77b);  // sm3partw2 v27.4s, v27.4s, v7.4s
+    // vl128 state = 0x720d7419
+  }
+
+  uint32_t state;  // Loaded into w0 by the generated code below.
+  ComputeMachineStateHash(&masm, &state);
+  __ Mov(x0, reinterpret_cast<uint64_t>(&state));
+  __ Ldr(w0, MemOperand(x0));
+
+  END();
+  if (CAN_RUN()) {
+    RUN();
+    uint32_t expected_hashes[] = {  // Indexed by Q-sized lane count - 1.
+        0x720d7419,  // Equals the final "vl128 state" above.
+        0x31445e06,
+        0xd2aee240,
+        0x45a27e4b,
+        0xd6c46f08,
+        0xcaed7f9e,
+        0x734820c7,
+        0x377e1f38,
+        0x12e03585,
+        0x1b9cbe63,
+        0x1d58d49a,
+        0xc160a9dc,
+        0x22c2fe25,
+        0x86b7af0f,
+        0xfeae7bf5,
+        0xf8dfcc40,
+    };
+    ASSERT_EQUAL_64(expected_hashes[core.GetSVELaneCount(kQRegSize) - 1], x0);
+  }
+}
+
+TEST_SVE(neon_sm3tt1) {  // sm3tt1a/sm3tt1b, checked by state hash.
+  SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE,
+                          CPUFeatures::kNEON,
+                          CPUFeatures::kCRC32,
+                          CPUFeatures::kSM3);
+  START();
+
+  SetInitialMachineState(&masm);
+  // state = 0xe2bd2480
+
+  {
+    ExactAssemblyScope scope(&masm, 20 * kInstructionSize);  // 20 dci words.
+    __ dci(0xce53a363);  // sm3tt1a v3.4s, v27.4s, v19.s[2]
+    // vl128 state = 0xaaa8c715
+    __ dci(0xce58a7a7);  // sm3tt1b v7.4s, v29.4s, v24.s[2]
+    // vl128 state = 0xb99a301d
+    __ dci(0xce5eb2b7);  // sm3tt1a v23.4s, v21.4s, v30.s[3]
+    // vl128 state = 0xe8dabe99
+    __ dci(0xce43b6ce);  // sm3tt1b v14.4s, v22.4s, v3.s[3]
+    // vl128 state = 0xaa498ae5
+    __ dci(0xce448027);  // sm3tt1a v7.4s, v1.4s, v4.s[0]
+    // vl128 state = 0x32093547
+    __ dci(0xce4286d8);  // sm3tt1b v24.4s, v22.4s, v2.s[0]
+    // vl128 state = 0xe03e3a81
+    __ dci(0xce44a0f3);  // sm3tt1a v19.4s, v7.4s, v4.s[2]
+    // vl128 state = 0xcb555b4a
+    __ dci(0xce418233);  // sm3tt1a v19.4s, v17.4s, v1.s[0]
+    // vl128 state = 0x751e4f7d
+    __ dci(0xce58a49f);  // sm3tt1b v31.4s, v4.4s, v24.s[2]
+    // vl128 state = 0xcaff7580
+    __ dci(0xce548326);  // sm3tt1a v6.4s, v25.4s, v20.s[0]
+    // vl128 state = 0xc4308a78
+    __ dci(0xce548124);  // sm3tt1a v4.4s, v9.4s, v20.s[0]
+    // vl128 state = 0x1f1bfdfb
+    __ dci(0xce5fb282);  // sm3tt1a v2.4s, v20.4s, v31.s[3]
+    // vl128 state = 0xa632c0b2
+    __ dci(0xce549573);  // sm3tt1b v19.4s, v11.4s, v20.s[1]
+    // vl128 state = 0x7fb7c2d3
+    __ dci(0xce4387ae);  // sm3tt1b v14.4s, v29.4s, v3.s[0]
+    // vl128 state = 0xe8d4c534
+    __ dci(0xce5094eb);  // sm3tt1b v11.4s, v7.4s, v16.s[1]
+    // vl128 state = 0xf34a4fbc
+    __ dci(0xce51b59f);  // sm3tt1b v31.4s, v12.4s, v17.s[3]
+    // vl128 state = 0x98e388e9
+    __ dci(0xce50a7bf);  // sm3tt1b v31.4s, v29.4s, v16.s[2]
+    // vl128 state = 0x7cd7a6ac
+    __ dci(0xce5ca52e);  // sm3tt1b v14.4s, v9.4s, v28.s[2]
+    // vl128 state = 0xce9410c5
+    __ dci(0xce5aa741);  // sm3tt1b v1.4s, v26.4s, v26.s[2]
+    // vl128 state = 0xd83fbd58
+    __ dci(0xce5e94da);  // sm3tt1b v26.4s, v6.4s, v30.s[1]
+    // vl128 state = 0xc6055fe3
+  }
+
+  uint32_t state;  // Loaded into w0 by the generated code below.
+  ComputeMachineStateHash(&masm, &state);
+  __ Mov(x0, reinterpret_cast<uint64_t>(&state));
+  __ Ldr(w0, MemOperand(x0));
+
+  END();
+  if (CAN_RUN()) {
+    RUN();
+    uint32_t expected_hashes[] = {  // Indexed by Q-sized lane count - 1.
+        0xc6055fe3,  // Equals the final "vl128 state" above.
+        0xa2c33f98,
+        0x1cc9a227,
+        0xf29eb254,
+        0xd1739d6e,
+        0x1c4fff34,
+        0x0c182795,
+        0x96e46836,
+        0x43d010c9,
+        0xd7c4f94c,
+        0x78c387f2,
+        0x4319fef3,
+        0x72407eef,
+        0xa77d3869,
+        0x3c81c49a,
+        0x68cc20ef,
+    };
+    ASSERT_EQUAL_64(expected_hashes[core.GetSVELaneCount(kQRegSize) - 1], x0);
+  }
+}
+
+TEST_SVE(neon_sm3tt2) {  // sm3tt2a/sm3tt2b, checked by state hash.
+  SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE,
+                          CPUFeatures::kNEON,
+                          CPUFeatures::kCRC32,
+                          CPUFeatures::kSM3);
+  START();
+
+  SetInitialMachineState(&masm);
+  // state = 0xe2bd2480
+
+  {
+    ExactAssemblyScope scope(&masm, 20 * kInstructionSize);  // 20 dci words.
+    __ dci(0xce439d42);  // sm3tt2b v2.4s, v10.4s, v3.s[1]
+    // vl128 state = 0x388642cc
+    __ dci(0xce42b89d);  // sm3tt2a v29.4s, v4.4s, v2.s[3]
+    // vl128 state = 0x66f4e60a
+    __ dci(0xce4da95d);  // sm3tt2a v29.4s, v10.4s, v13.s[2]
+    // vl128 state = 0x95d4651d
+    __ dci(0xce49b926);  // sm3tt2a v6.4s, v9.4s, v9.s[3]
+    // vl128 state = 0x826919fe
+    __ dci(0xce5cae33);  // sm3tt2b v19.4s, v17.4s, v28.s[2]
+    // vl128 state = 0xb5cfefb0
+    __ dci(0xce478959);  // sm3tt2a v25.4s, v10.4s, v7.s[0]
+    // vl128 state = 0xfe17b730
+    __ dci(0xce549cc2);  // sm3tt2b v2.4s, v6.4s, v20.s[1]
+    // vl128 state = 0x769a0d76
+    __ dci(0xce4c9f90);  // sm3tt2b v16.4s, v28.4s, v12.s[1]
+    // vl128 state = 0x8f633b95
+    __ dci(0xce508d49);  // sm3tt2b v9.4s, v10.4s, v16.s[0]
+    // vl128 state = 0x5eab6daa
+    __ dci(0xce59ad79);  // sm3tt2b v25.4s, v11.4s, v25.s[2]
+    // vl128 state = 0xfb197616
+    __ dci(0xce458fd6);  // sm3tt2b v22.4s, v30.4s, v5.s[0]
+    // vl128 state = 0x875ff29d
+    __ dci(0xce4ab92c);  // sm3tt2a v12.4s, v9.4s, v10.s[3]
+    // vl128 state = 0xad159c01
+    __ dci(0xce598a1c);  // sm3tt2a v28.4s, v16.4s, v25.s[0]
+    // vl128 state = 0x3da313e4
+    __ dci(0xce43989f);  // sm3tt2a v31.4s, v4.4s, v3.s[1]
+    // vl128 state = 0xc0a54179
+    __ dci(0xce459c8a);  // sm3tt2b v10.4s, v4.4s, v5.s[1]
+    // vl128 state = 0x4739cdbf
+    __ dci(0xce539959);  // sm3tt2a v25.4s, v10.4s, v19.s[1]
+    // vl128 state = 0xd85f84ab
+    __ dci(0xce429be1);  // sm3tt2a v1.4s, v31.4s, v2.s[1]
+    // vl128 state = 0x85b5871c
+    __ dci(0xce5d9fe3);  // sm3tt2b v3.4s, v31.4s, v29.s[1]
+    // vl128 state = 0x2be5bd95
+    __ dci(0xce4ebe16);  // sm3tt2b v22.4s, v16.4s, v14.s[3]
+    // vl128 state = 0x2f8146e9
+    __ dci(0xce599a63);  // sm3tt2a v3.4s, v19.4s, v25.s[1]
+    // vl128 state = 0xa6e513e2
+  }
+
+  uint32_t state;  // Loaded into w0 by the generated code below.
+  ComputeMachineStateHash(&masm, &state);
+  __ Mov(x0, reinterpret_cast<uint64_t>(&state));
+  __ Ldr(w0, MemOperand(x0));
+
+  END();
+  if (CAN_RUN()) {
+    RUN();
+    uint32_t expected_hashes[] = {  // Indexed by Q-sized lane count - 1.
+        0xa6e513e2,  // Equals the final "vl128 state" above.
+        0x6bf4ae47,
+        0x74e074db,
+        0xae1a57e0,
+        0x0db67f09,
+        0x85332e49,
+        0xc40d6565,
+        0x07ed81aa,
+        0xfa0e10bb,
+        0x9addadfa,
+        0xa9cea561,
+        0xa481e17b,
+        0x7c2be34e,
+        0xd4cf493f,
+        0x8b30cc5e,
+        0xe44416d3,
+    };
+    ASSERT_EQUAL_64(expected_hashes[core.GetSVELaneCount(kQRegSize) - 1], x0);
+  }
+}
+
+TEST_SVE(neon_sm4e) {  // sm4e encodings, checked by state hash.
+  SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE,
+                          CPUFeatures::kNEON,
+                          CPUFeatures::kCRC32,
+                          CPUFeatures::kSM4);
+  START();
+
+  SetInitialMachineState(&masm);
+  // state = 0xe2bd2480
+
+  {
+    ExactAssemblyScope scope(&masm, 20 * kInstructionSize);  // 20 dci words.
+    __ dci(0xcec08400);  // sm4e v0.4s, v0.4s
+    // vl128 state = 0xa687bacc
+    __ dci(0xcec08628);  // sm4e v8.4s, v17.4s
+    // vl128 state = 0xf174e346
+    __ dci(0xcec0862a);  // sm4e v10.4s, v17.4s
+    // vl128 state = 0xab88f8ca
+    __ dci(0xcec08628);  // sm4e v8.4s, v17.4s
+    // vl128 state = 0x000d3840
+    __ dci(0xcec08638);  // sm4e v24.4s, v17.4s
+    // vl128 state = 0xd980ddc2
+    __ dci(0xcec08688);  // sm4e v8.4s, v20.4s
+    // vl128 state = 0xd501f2c2
+    __ dci(0xcec0868c);  // sm4e v12.4s, v20.4s
+    // vl128 state = 0x699d6b6f
+    __ dci(0xcec0864d);  // sm4e v13.4s, v18.4s
+    // vl128 state = 0x67baf406
+    __ dci(0xcec08649);  // sm4e v9.4s, v18.4s
+    // vl128 state = 0x178b048e
+    __ dci(0xcec08659);  // sm4e v25.4s, v18.4s
+    // vl128 state = 0x552a70d9
+    __ dci(0xcec0865d);  // sm4e v29.4s, v18.4s
+    // vl128 state = 0x3be534d1
+    __ dci(0xcec0865f);  // sm4e v31.4s, v18.4s
+    // vl128 state = 0x396fdf70
+    __ dci(0xcec08657);  // sm4e v23.4s, v18.4s
+    // vl128 state = 0x836c474b
+    __ dci(0xcec086e7);  // sm4e v7.4s, v23.4s
+    // vl128 state = 0x71aebad7
+    __ dci(0xcec08683);  // sm4e v3.4s, v20.4s
+    // vl128 state = 0xadfd515c
+    __ dci(0xcec08681);  // sm4e v1.4s, v20.4s
+    // vl128 state = 0xf1465ab4
+    __ dci(0xcec087c0);  // sm4e v0.4s, v30.4s
+    // vl128 state = 0x8555b40f
+    __ dci(0xcec087c4);  // sm4e v4.4s, v30.4s
+    // vl128 state = 0x2cb3f99f
+    __ dci(0xcec087d4);  // sm4e v20.4s, v30.4s
+    // vl128 state = 0x733336fd
+    __ dci(0xcec085fc);  // sm4e v28.4s, v15.4s
+    // vl128 state = 0x11b138f9
+  }
+
+  uint32_t state;  // Loaded into w0 by the generated code below.
+  ComputeMachineStateHash(&masm, &state);
+  __ Mov(x0, reinterpret_cast<uint64_t>(&state));
+  __ Ldr(w0, MemOperand(x0));
+
+  END();
+  if (CAN_RUN()) {
+    RUN();
+    uint32_t expected_hashes[] = {  // Indexed by Q-sized lane count - 1.
+        0x11b138f9,  // Equals the final "vl128 state" above.
+        0x5993c196,
+        0xb9eef6b5,
+        0xf96d88cf,
+        0x8e92bd49,
+        0x04d27185,
+        0x8833f291,
+        0x77933d5b,
+        0x135500cc,
+        0xe5ca977f,
+        0x3e4536af,
+        0xb169aa9d,
+        0xe0b4425b,
+        0x35c1f76e,
+        0x54e3448a,
+        0x4dbf0c92,
+    };
+    ASSERT_EQUAL_64(expected_hashes[core.GetSVELaneCount(kQRegSize) - 1], x0);
+  }
+}
+
+TEST_SVE(neon_sm4ekey) {  // sm4ekey encodings, checked by state hash.
+  SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE,
+                          CPUFeatures::kNEON,
+                          CPUFeatures::kCRC32,
+                          CPUFeatures::kSM4);
+  START();
+
+  SetInitialMachineState(&masm);
+  // state = 0xe2bd2480
+
+  {
+    ExactAssemblyScope scope(&masm, 20 * kInstructionSize);  // 20 dci words.
+    __ dci(0xce6fc9d4);  // sm4ekey v20.4s, v14.4s, v15.4s
+    // vl128 state = 0x4bb7b396
+    __ dci(0xce6bc8d5);  // sm4ekey v21.4s, v6.4s, v11.4s
+    // vl128 state = 0xf4354b26
+    __ dci(0xce6bc8c5);  // sm4ekey v5.4s, v6.4s, v11.4s
+    // vl128 state = 0x0a331378
+    __ dci(0xce6bc8cd);  // sm4ekey v13.4s, v6.4s, v11.4s
+    // vl128 state = 0x7ed4c2a7
+    __ dci(0xce6fc8e5);  // sm4ekey v5.4s, v7.4s, v15.4s
+    // vl128 state = 0x38a433fd
+    __ dci(0xce6fc8e4);  // sm4ekey v4.4s, v7.4s, v15.4s
+    // vl128 state = 0xc1ad0d76
+    __ dci(0xce6bcaec);  // sm4ekey v12.4s, v23.4s, v11.4s
+    // vl128 state = 0x81660ce3
+    __ dci(0xce6bcae8);  // sm4ekey v8.4s, v23.4s, v11.4s
+    // vl128 state = 0x79f3e5c1
+    __ dci(0xce7bcaaa);  // sm4ekey v10.4s, v21.4s, v27.4s
+    // vl128 state = 0x231e0a79
+    __ dci(0xce72caa8);  // sm4ekey v8.4s, v21.4s, v18.4s
+    // vl128 state = 0xd931c858
+    __ dci(0xce7ac8aa);  // sm4ekey v10.4s, v5.4s, v26.4s
+    // vl128 state = 0x2476ef6a
+    __ dci(0xce7bc888);  // sm4ekey v8.4s, v4.4s, v27.4s
+    // vl128 state = 0xd4a9ac83
+    __ dci(0xce7bc889);  // sm4ekey v9.4s, v4.4s, v27.4s
+    // vl128 state = 0x149fd9b3
+    __ dci(0xce7bc9cd);  // sm4ekey v13.4s, v14.4s, v27.4s
+    // vl128 state = 0xece67fce
+    __ dci(0xce79cbc5);  // sm4ekey v5.4s, v30.4s, v25.4s
+    // vl128 state = 0xccb45863
+    __ dci(0xce71cac4);  // sm4ekey v4.4s, v22.4s, v17.4s
+    // vl128 state = 0xafb23c9d
+    __ dci(0xce71c8e0);  // sm4ekey v0.4s, v7.4s, v17.4s
+    // vl128 state = 0x5c808694
+    __ dci(0xce71c882);  // sm4ekey v2.4s, v4.4s, v17.4s
+    // vl128 state = 0x6cea5132
+    __ dci(0xce73c803);  // sm4ekey v3.4s, v0.4s, v19.4s
+    // vl128 state = 0x67e316db
+    __ dci(0xce71c847);  // sm4ekey v7.4s, v2.4s, v17.4s
+    // vl128 state = 0x317aafac
+  }
+
+  uint32_t state;  // Loaded into w0 by the generated code below.
+  ComputeMachineStateHash(&masm, &state);
+  __ Mov(x0, reinterpret_cast<uint64_t>(&state));
+  __ Ldr(w0, MemOperand(x0));
+
+  END();
+  if (CAN_RUN()) {
+    RUN();
+    uint32_t expected_hashes[] = {  // Indexed by Q-sized lane count - 1.
+        0x317aafac,  // Equals the final "vl128 state" above.
+        0xbacd34de,
+        0x3e92f0b2,
+        0x3043dbe3,
+        0x6dda4d17,
+        0x6e59ba0d,
+        0xa29887cf,
+        0x3bee1f56,
+        0xacd43191,
+        0x97ab7ada,
+        0x39ebcf53,
+        0xea7b411e,
+        0xd8e1efe9,
+        0x2b99fc57,
+        0xf5f62e02,
+        0xd50621d1,
+    };
+    ASSERT_EQUAL_64(expected_hashes[core.GetSVELaneCount(kQRegSize) - 1], x0);
+  }
+}
+
 }  // namespace aarch64
 }  // namespace vixl
diff --git a/test/aarch64/test-simulator-sve2-aarch64.cc b/test/aarch64/test-simulator-sve2-aarch64.cc
index a7c0f401..621754d2 100644
--- a/test/aarch64/test-simulator-sve2-aarch64.cc
+++ b/test/aarch64/test-simulator-sve2-aarch64.cc
@@ -9117,5 +9117,130 @@ TEST_SVE(sve2_extract) {
   }
 }
 
+TEST_SVE(sve2_pmull128) {  // pmullb/pmullt (.q), checked by state hash.
+  SVE_SETUP_WITH_FEATURES(CPUFeatures::kSVE,
+                          CPUFeatures::kSVE2,
+                          CPUFeatures::kNEON,
+                          CPUFeatures::kCRC32,
+                          CPUFeatures::kSVEPmull128);
+  START();
+
+  SetInitialMachineState(&masm);
+  // state = 0xe2bd2480
+
+  {
+    ExactAssemblyScope scope(&masm, 40 * kInstructionSize);  // 40 dci words.
+    __ dci(0x45006800);  // pmullb z0.q, z0.d, z0.d
+    // vl128 state = 0x4107ca0c
+    __ dci(0x45006a28);  // pmullb z8.q, z17.d, z0.d
+    // vl128 state = 0xa87d231a
+    __ dci(0x45016a6c);  // pmullb z12.q, z19.d, z1.d
+    // vl128 state = 0xc547fcf6
+    __ dci(0x45116e68);  // pmullt z8.q, z19.d, z17.d
+    // vl128 state = 0x6a01d521
+    __ dci(0x45106a69);  // pmullb z9.q, z19.d, z16.d
+    // vl128 state = 0x64a7ba8a
+    __ dci(0x45006a4d);  // pmullb z13.q, z18.d, z0.d
+    // vl128 state = 0xe59e3f8e
+    __ dci(0x45086e5d);  // pmullt z29.q, z18.d, z8.d
+    // vl128 state = 0xbfbb9316
+    __ dci(0x450a6e75);  // pmullt z21.q, z19.d, z10.d
+    // vl128 state = 0x29f6a4c7
+    __ dci(0x45126e74);  // pmullt z20.q, z19.d, z18.d
+    // vl128 state = 0x4ced9406
+    __ dci(0x45176e75);  // pmullt z21.q, z19.d, z23.d
+    // vl128 state = 0xd09e5676
+    __ dci(0x45176e77);  // pmullt z23.q, z19.d, z23.d
+    // vl128 state = 0x568c0e25
+    __ dci(0x45176e75);  // pmullt z21.q, z19.d, z23.d
+    // vl128 state = 0xb2f13c36
+    __ dci(0x45176b71);  // pmullb z17.q, z27.d, z23.d
+    // vl128 state = 0x160bec4f
+    __ dci(0x451f6b30);  // pmullb z16.q, z25.d, z31.d
+    // vl128 state = 0x2d7e7f49
+    __ dci(0x451f6b20);  // pmullb z0.q, z25.d, z31.d
+    // vl128 state = 0x113d828b
+    __ dci(0x451f6b90);  // pmullb z16.q, z28.d, z31.d
+    // vl128 state = 0xb8b3b3d9
+    __ dci(0x451f6f12);  // pmullt z18.q, z24.d, z31.d
+    // vl128 state = 0x277aacb8
+    __ dci(0x451f6f16);  // pmullt z22.q, z24.d, z31.d
+    // vl128 state = 0xef79c8da
+    __ dci(0x450b6f17);  // pmullt z23.q, z24.d, z11.d
+    // vl128 state = 0x1dc19104
+    __ dci(0x450a6e1f);  // pmullt z31.q, z16.d, z10.d
+    // vl128 state = 0x3ccb4ea8
+    __ dci(0x451a6e2f);  // pmullt z15.q, z17.d, z26.d
+    // vl128 state = 0x14e13481
+    __ dci(0x45126a3f);  // pmullb z31.q, z17.d, z18.d
+    // vl128 state = 0x4e6502f9
+    __ dci(0x451a6b3e);  // pmullb z30.q, z25.d, z26.d
+    // vl128 state = 0xf6f18478
+    __ dci(0x45126a3a);  // pmullb z26.q, z17.d, z18.d
+    // vl128 state = 0xdd4f14fb
+    __ dci(0x45126afb);  // pmullb z27.q, z23.d, z18.d
+    // vl128 state = 0xcbf3bee2
+    __ dci(0x45126aff);  // pmullb z31.q, z23.d, z18.d
+    // vl128 state = 0x627bec09
+    __ dci(0x45126aef);  // pmullb z15.q, z23.d, z18.d
+    // vl128 state = 0xf5de1fa9
+    __ dci(0x45106abf);  // pmullb z31.q, z21.d, z16.d
+    // vl128 state = 0x44bb6385
+    __ dci(0x451a6abb);  // pmullb z27.q, z21.d, z26.d
+    // vl128 state = 0x5c5fa224
+    __ dci(0x450a68b3);  // pmullb z19.q, z5.d, z10.d
+    // vl128 state = 0x28b6085c
+    __ dci(0x450e69b2);  // pmullb z18.q, z13.d, z14.d
+    // vl128 state = 0x450898d6
+    __ dci(0x450e69b6);  // pmullb z22.q, z13.d, z14.d
+    // vl128 state = 0x79d7911b
+    __ dci(0x450e69b4);  // pmullb z20.q, z13.d, z14.d
+    // vl128 state = 0x98bf6939
+    __ dci(0x450f6924);  // pmullb z4.q, z9.d, z15.d
+    // vl128 state = 0xb8a1bbc7
+    __ dci(0x45176925);  // pmullb z5.q, z9.d, z23.d
+    // vl128 state = 0x631b41c8
+    __ dci(0x451f69a4);  // pmullb z4.q, z13.d, z31.d
+    // vl128 state = 0x617fc272
+    __ dci(0x451b69e0);  // pmullb z0.q, z15.d, z27.d
+    // vl128 state = 0x77780ac1
+    __ dci(0x451b69e8);  // pmullb z8.q, z15.d, z27.d
+    // vl128 state = 0xce5ae18f
+    __ dci(0x450f69e0);  // pmullb z0.q, z15.d, z15.d
+    // vl128 state = 0xa037371a
+    __ dci(0x450b6be8);  // pmullb z8.q, z31.d, z11.d
+    // vl128 state = 0xb59be233
+  }
+
+  uint32_t state;  // Loaded into w0 by the generated code below.
+  ComputeMachineStateHash(&masm, &state);
+  __ Mov(x0, reinterpret_cast<uint64_t>(&state));
+  __ Ldr(w0, MemOperand(x0));
+
+  END();
+  if (CAN_RUN()) {
+    RUN();
+    uint32_t expected_hashes[] = {  // Indexed by Q-sized lane count - 1.
+        0xb59be233,  // Equals the final "vl128 state" above.
+        0x32430624,
+        0x5cc3ec66,
+        0xecfdffe7,
+        0x6d77a270,
+        0xa0d604f2,
+        0x2178aa11,
+        0xabdcbeaa,
+        0xab3b974f,
+        0x11a874f5,
+        0xf2eb6131,
+        0x6d311c6c,
+        0xd4e99b72,
+        0x5177ce8e,
+        0x32aa02f0,
+        0x681ef977,
+    };
+    ASSERT_EQUAL_64(expected_hashes[core.GetSVELaneCount(kQRegSize) - 1], x0);
+  }
+}
+
 }  // namespace aarch64
 }  // namespace vixl
diff --git a/test/aarch64/test-utils-aarch64.cc b/test/aarch64/test-utils-aarch64.cc
index 398ed4fd..c23f4e8b 100644
--- a/test/aarch64/test-utils-aarch64.cc
+++ b/test/aarch64/test-utils-aarch64.cc
@@ -89,6 +89,34 @@ bool Equal64(uint64_t reference,
 }
 
 
+// Returns true if `result` is in `reference_list` (kExpectEqual) or absent
+// from it (kExpectNotEqual); otherwise prints the list and returns false.
+bool Equal64(std::vector<uint64_t> reference_list,
+             const RegisterDump*,
+             uint64_t result,
+             ExpectedResult option) {
+  bool found = false;
+  for (uint64_t reference : reference_list) {
+    if (result == reference) found = true;
+  }
+  switch (option) {
+    case kExpectEqual:
+      if (found) return true;
+      printf("Expected a result in (\n");
+      break;
+    case kExpectNotEqual:
+      if (!found) return true;
+      printf("Expected a result not in (\n");
+      break;
+  }
+  for (uint64_t reference : reference_list) {
+    printf("  0x%016" PRIx64 ",\n", reference);
+  }
+  printf(")\t Found 0x%016" PRIx64 "\n", result);
+  return false;
+}
+
+
 bool Equal128(QRegisterValue expected,
               const RegisterDump*,
               QRegisterValue result) {
@@ -200,6 +228,16 @@ bool Equal64(uint64_t reference,
 }
 
 
+bool Equal64(std::vector<uint64_t> reference_list,
+             const RegisterDump* core,
+             const Register& reg,
+             ExpectedResult option) {
+  VIXL_ASSERT(reg.Is64Bits());  // Only 64-bit registers are accepted here.
+  uint64_t result = core->xreg(reg.GetCode());  // Dumped value of `reg`.
+  return Equal64(reference_list, core, result, option);  // Raw-value overload.
+}
+
+
 bool NotEqual64(uint64_t reference,
                 const RegisterDump* core,
                 const Register& reg) {
diff --git a/test/aarch64/test-utils-aarch64.h b/test/aarch64/test-utils-aarch64.h
index 9cf91549..40a5aa5e 100644
--- a/test/aarch64/test-utils-aarch64.h
+++ b/test/aarch64/test-utils-aarch64.h
@@ -345,6 +345,10 @@ bool Equal64(uint64_t reference,
              const RegisterDump*,
              uint64_t result,
              ExpectedResult option = kExpectEqual);
+bool Equal64(std::vector<uint64_t> reference_list,
+             const RegisterDump*,
+             uint64_t result,
+             ExpectedResult option = kExpectEqual);
 bool Equal128(QRegisterValue expected,
               const RegisterDump*,
               QRegisterValue result);
@@ -358,6 +362,10 @@ bool Equal64(uint64_t reference,
              const RegisterDump* core,
              const Register& reg,
              ExpectedResult option = kExpectEqual);
+bool Equal64(std::vector<uint64_t> reference_list,
+             const RegisterDump* core,
+             const Register& reg,
+             ExpectedResult option = kExpectEqual);
 bool Equal64(uint64_t expected,
              const RegisterDump* core,
              const VRegister& vreg);
diff --git a/test/test-invalset.cc b/test/test-invalset.cc
index ac53a04d..548f67ea 100644
--- a/test/test-invalset.cc
+++ b/test/test-invalset.cc
@@ -397,5 +397,27 @@ TEST(stl_forward_iterator) {
 #endif
 }
 
+TEST(move) {  // Move-constructing a set must leave the source empty.
+  TestSet set1;
+
+  set1.insert(Obj(-123, 456));
+  set1.insert(Obj(2718, 2871828));
+
+  TestSet set2(std::move(set1));
+  VIXL_CHECK(set1.empty());  // The moved-from set is expected to be empty.
+  VIXL_CHECK(set2.size() == 2);
+  VIXL_CHECK(set2.GetMinElement() == Obj(-123, 456));
+
+  // Test with more elements: add 4 * kNPreallocatedElements entries to set2.
+  for (unsigned i = 0; i < 4 * kNPreallocatedElements; i++) {
+    set2.insert(Obj(i, -1));
+  }
+
+  TestSet set3(std::move(set2));
+  VIXL_CHECK(set2.empty());
+  VIXL_CHECK(set3.size() == 2 + 4 * kNPreallocatedElements);
+  VIXL_CHECK(set3.GetMinElement() == Obj(-123, 456));
+}
+
 
 }  // namespace vixl
diff --git a/test/test-pool-manager.cc b/test/test-pool-manager.cc
index eb22ae6f..194154b3 100644
--- a/test/test-pool-manager.cc
+++ b/test/test-pool-manager.cc
@@ -376,7 +376,7 @@ TEST(FuzzObjectDeletedWhenPlaced) {
   }
 
   int32_t pc = 0;
-  for (int i = 0; !objects.empty(); ++i) {
+  while (!objects.empty()) {
     IF_VERBOSE(printf("PC = 0x%x (%d)\n", pc, pc));
     int32_t pc_increment = RandomPCIncrement();
     IF_VERBOSE(printf("Attempting to increment PC by %d\n", pc_increment));
@@ -451,7 +451,7 @@ TEST(FuzzObjectUpdatedWhenPlaced) {
   }
 
   int32_t pc = 0;
-  for (int i = 0; !objects.empty(); ++i) {
+  while (!objects.empty()) {
     IF_VERBOSE(printf("PC = 0x%x (%d)\n", pc, pc));
 
     int32_t pc_increment = RandomPCIncrement();
diff --git a/tools/code_coverage.log b/tools/code_coverage.log
index c27ab83a..d787f6fe 100644
--- a/tools/code_coverage.log
+++ b/tools/code_coverage.log
@@ -14,11 +14,22 @@
 1660224011 82.79% 97.51% 95.50%
 1663161852 82.79% 97.51% 95.50%
 1666104118 82.79% 97.51% 95.50%
+1668785529 82.75% 97.44% 95.40%
 1669202345 82.79% 97.51% 95.51%
 1673432155 82.79% 97.51% 95.51%
 1677171445 82.78% 97.56% 94.81%
 1681814646 82.90% 97.57% 94.87%
 1686666000 82.90% 97.57% 94.87%
 1693487542 82.91% 97.57% 94.87%
+1694008240 82.72% 97.50% 94.95%
+1697036303 82.87% 97.56% 94.76%
+1698228274 82.93% 97.68% 94.90%
+1698330215 82.92% 97.57% 94.88%
 1702052331 82.89% 97.59% 94.77%
+1706691191 82.87% 97.59% 94.74%
 1707395574 82.89% 97.59% 94.77%
+1715261843 82.84% 97.60% 94.69%
+1718190785 82.85% 97.60% 94.70%
+1722595938 82.94% 97.78% 94.72%
+1728570468 82.94% 97.78% 94.71%
+1736874659 82.94% 97.63% 94.78%
diff --git a/tools/lint.py b/tools/lint.py
index 4820439d..f67799b2 100755
--- a/tools/lint.py
+++ b/tools/lint.py
@@ -91,7 +91,7 @@ def Lint(filename, progress_prefix = ''):
   printer.Print(outerr)
 
   # Find the number of errors in this file.
-  res = re.search('Total errors found: (\d+)', outerr)
+  res = re.search(r'Total errors found: (\d+)', outerr)
   if res:
     n_errors_str = res.string[res.start(1):res.end(1)]
     n_errors = int(n_errors_str)
@@ -192,7 +192,7 @@ def IsCppLintAvailable():
     return retcode == 0
 
 
-CPP_EXT_REGEXP = re.compile('\.(cc|h)$')
+CPP_EXT_REGEXP = re.compile(r'\.(cc|h)$')
 def IsLinterInput(filename):
   # lint all C++ files.
   return CPP_EXT_REGEXP.search(filename) != None
diff --git a/tools/util.py b/tools/util.py
index ed41461e..240c6972 100644
--- a/tools/util.py
+++ b/tools/util.py
@@ -89,7 +89,7 @@ def GetCompilerDirectives(env):
     match.group(1): match.group(2)
     for match in [
       # Capture macro name.
-      re.search('^#define (\S+?) (.+)$', macro)
+      re.search(r'^#define (\S+?) (.+)$', macro)
       for macro in out.split('\n')
     ]
     # Filter out non-matches.
@@ -183,7 +183,7 @@ class CompilerInformation(object):
   # "{compiler}-{major}.{minor}". The comparison is done using the provided
   # `operator` argument.
   def CompareVersion(self, operator, description):
-    match = re.search('^(\S+)-(.*?)$', description)
+    match = re.search(r'^(\S+)-(.*?)$', description)
     if not match:
       raise Exception("A version number is required when comparing compilers")
     compiler, version = match.group(1), match.group(2)