Bug 1678097 - Add vector registers on ARM64. r=lth,nbp.

In short, this patch adds support for vector registers on ARM64, including
machinery to save and restore them on the stack.  It also cleans up and
documents some of the save/restore logic.

There are many changes:

* We add a new type, Bitset128, that can represent the register set on ARM64
  with vector registers. This is a mostly-constexpr class with two uint64_t's
  that behaves like a 128-bit integer in the cases we need for register sets.

* Since the new type is 16 bytes wide, misc code throughout the system that
  switches on register set width is updated to handle the case.

* The FloatRegisters::SetType is changed to use Bitset128, and various
  constants defined in ARM64 FloatRegisters are updated to reflect this.  As
  the constants are constexpr, we can be reasonably sure that there are no
  initialization-order problems introduced by this.

* The registers in ARM64 FloatRegisters are rearranged so that single comes
  before double, which comes before vector registers, to conform to the order
  used everywhere else (see the numbering sketch after this list).

* In MacroAssembler.h, the semantics required of PushRegsInMask and its
  associated functions have been further documented.

* For ARM64, the PushRegsInMask family of functions (PushRegsInMask,
  storeRegsInMask, PopRegsInMaskIgnore, ReduceSetForPush, GetPushSizeInBytes,
  getRegisterDumpOffsetInBytes, and the new function
  FloatRegister::BroadcastToAllSizes) has been rewritten to handle
  saving/restoring of SIMD registers.  PushRegsInMask and storeRegsInMask have
  been merged into a single function so as to reduce code duplication and the
  associated danger of mistakenly writing different formats.

* In many ARM64-specific places, `#ifdef ENABLE_WASM_SIMD` guards and their
  non-SIMD fallback paths have been removed, so that in effect we always
  assume that SIMD register state is available on the target, and can be
  loaded and stored, even though we haven't yet enabled ARM64 SIMD support
  per se.

* The routines PushRegsInMaskForWasmStubs, PopRegsInMaskForWasmStubs and
  GetPushSizeInBytesForWasmStubs have been removed, since their operation has
  now been subsumed into their non-`ForWasmStubs` equivalents.

* In Safepoints.cpp, {Write,Read}FloatRegisterMask have been rewritten to
  support 128-bit sets.  One side effect is that reads/writes of 64-bit chunks
  have been changed to use new routines {write,read}Unsigned64 rather than two
  uses of {write,read}Unsigned.  The effect is to save one byte when
  bits 63:32 are all zero.

* Ridealong cleanup: the constant ION_FRAME_SLACK_SIZE has been removed.  It
  has no known uses.

* Note that RABALDR_SIDEALLOC_V128 is still in place in the wasm baseline
  compiler.  This patch does not remove it.
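
To make the new numbering concrete, here is a minimal illustrative sketch (a
standalone restatement, not code from the patch) of how a float register's
Code packs its kind and encoding on ARM64, and which bit it occupies in the
128-bit SetType:

// Illustrative values mirroring Architecture-arm64.h: Kind order
// Single < Double < Simd128, 32 physical registers per kind, and
// code = (kind << 5) | encoding.
#include <cassert>
#include <cstdint>

enum Kind : uint8_t { Single = 0, Double = 1, Simd128 = 2 };
constexpr uint32_t TotalPhys = 32;

// A register's Code packs its kind and its physical encoding.
constexpr uint32_t codeOf(Kind k, uint32_t encoding) {
  return (uint32_t(k) << 5) | encoding;
}

// The bit index of that register inside the 128-bit SetType.
constexpr uint32_t setBitOf(Kind k, uint32_t encoding) {
  return uint32_t(k) * TotalPhys + encoding;
}

int main() {
  // s0 is code 0, d0 is code 32, v0 is code 64: singles before doubles
  // before vectors, matching the order used on the other targets.
  assert(codeOf(Single, 0) == 0);
  assert(codeOf(Double, 0) == 32);
  assert(codeOf(Simd128, 0) == 64);
  // v17 occupies bit 81 of the set, which cannot be represented in the old
  // 64-bit SetType; hence Bitset128.
  assert(setBitOf(Simd128, 17) == 81);
  return 0;
}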

Differential Revision: https://phabricator.services.mozilla.com/D100116
Julian Seward 2021-05-05 05:45:35 +00:00
parent 7f923a39e7
commit 40dabb3533
16 changed files with 765 additions and 440 deletions

View File

@ -48,6 +48,21 @@ class CompactBufferReader {
}
}
uint64_t readVariableLength64() {
uint64_t val = 0;
uint32_t shift = 0;
uint8_t byte;
while (true) {
MOZ_ASSERT(shift < 64);
byte = readByte();
val |= (uint64_t(byte) >> 1) << shift;
shift += 7;
if (!(byte & 1)) {
return val;
}
}
}
public:
CompactBufferReader(const uint8_t* start, const uint8_t* end)
: buffer_(start), end_(end) {}
@ -74,6 +89,7 @@ class CompactBufferReader {
return *reinterpret_cast<const uint32_t*>(buffer_);
}
uint32_t readUnsigned() { return readVariableLength(); }
uint64_t readUnsigned64() { return readVariableLength64(); }
int32_t readSigned() {
uint8_t b = readByte();
bool isNegative = !!(b & (1 << 0));
@ -169,6 +185,13 @@ class CompactBufferWriter {
original >>= 7;
} while (original);
}
void writeUnsigned64(uint64_t value) {
do {
uint8_t byte = ((value & 0x7F) << 1) | (value > 0x7F);
writeByte(byte);
value >>= 7;
} while (value);
}
void writeSigned(int32_t v) {
bool isNegative = v < 0;
uint32_t value = isNegative ? -v : v;
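
For reference, here is a minimal standalone sketch (not part of the patch) of
the variable-length format used by writeUnsigned64 and readUnsigned64 above:
seven payload bits per byte, with bit 0 of each byte acting as a "more bytes
follow" flag.

#include <cassert>
#include <cstdint>
#include <vector>

// Encode: seven payload bits per byte, shifted left by one; bit 0 is set
// when more bytes follow.
static void encode64(uint64_t value, std::vector<uint8_t>& out) {
  do {
    out.push_back(uint8_t(((value & 0x7F) << 1) | (value > 0x7F)));
    value >>= 7;
  } while (value);
}

// Decode: accumulate seven bits at a time until a byte with bit 0 clear.
static uint64_t decode64(const std::vector<uint8_t>& in) {
  uint64_t val = 0;
  uint32_t shift = 0;
  for (uint8_t byte : in) {
    val |= (uint64_t(byte) >> 1) << shift;
    shift += 7;
    if (!(byte & 1)) {
      break;
    }
  }
  return val;
}

int main() {
  // 147 encodes as two bytes: 0x27 (payload 0x13, continue) then 0x02
  // (payload 0x01, stop), and decodes back to 147.
  std::vector<uint8_t> bytes;
  encode64(147, bytes);
  assert(bytes.size() == 2 && bytes[0] == 0x27 && bytes[1] == 0x02);
  assert(decode64(bytes) == 147);
  // A value with bits 63:32 clear costs the same as the old low-half
  // writeUnsigned, and saves the byte the old scheme spent on the zero
  // high half.
  bytes.clear();
  encode64(0xFFFFFFFFull, bytes);
  assert(bytes.size() == 5);
  return 0;
}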

View File

@ -196,7 +196,11 @@ class LUse : public LAllocation {
static const uint32_t POLICY_BITS = 3;
static const uint32_t POLICY_SHIFT = 0;
static const uint32_t POLICY_MASK = (1 << POLICY_BITS) - 1;
#ifdef JS_CODEGEN_ARM64
static const uint32_t REG_BITS = 7;
#else
static const uint32_t REG_BITS = 6;
#endif
static const uint32_t REG_SHIFT = POLICY_SHIFT + POLICY_BITS;
static const uint32_t REG_MASK = (1 << REG_BITS) - 1;
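
For context, the reason seven bits are needed on ARM64, restated as a small
standalone sketch (illustrative constants, not code from the patch): with
three kinds of float register and 32 physical registers per kind,
FloatRegisters::Total becomes 96, so register codes run up to 95 and no
longer fit in six bits.

#include <cstdint>

constexpr uint32_t TotalPhys = 32;  // physical FP/SIMD registers
constexpr uint32_t NumKinds = 3;    // Single, Double, Simd128
constexpr uint32_t MaxFloatCode = NumKinds * TotalPhys - 1;  // 95

static_assert(MaxFloatCode >= (1u << 6), "six bits (codes 0..63) no longer suffice");
static_assert(MaxFloatCode < (1u << 7), "seven bits (codes 0..127) are enough");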

View File

@ -399,13 +399,20 @@ class MacroAssembler : public MacroAssemblerSpecific {
// layout. Any inconsistencies will certainly lead to crashing in generated
// code:
//
// PushRegsInMaskSizeInBytes PushRegsInMask storeRegsInMask
// PopRegsInMask PopRegsInMaskIgnore
// MacroAssembler::PushRegsInMaskSizeInBytes
// MacroAssembler::PushRegsInMask
// MacroAssembler::storeRegsInMask
// MacroAssembler::PopRegsInMask
// MacroAssembler::PopRegsInMaskIgnore
// FloatRegister::getRegisterDumpOffsetInBytes
// (no class) PushRegisterDump
// (union) RegisterContent
//
// To be more exact, the invariants are:
//
// * The save area is conceptually viewed as starting at a highest address
// and working down to some lower address.
// (really, at "highest address - 1") and working down to some lower
// address.
//
// * PushRegsInMask, storeRegsInMask and PopRegsInMask{Ignore} must use
// exactly the same memory layout, when starting from the abovementioned
@ -422,6 +429,44 @@ class MacroAssembler : public MacroAssemblerSpecific {
// * Hence, regardless of whether the save area is created with
// storeRegsInMask or PushRegsInMask, it is guaranteed to fit inside an
// area of size calculated by PushRegsInMaskSizeInBytes.
//
// * For the `ignore` argument of PopRegsInMaskIgnore, equality checking
// for the floating point/SIMD registers is done on the basis of the
// underlying physical register, regardless of width. For example, if the
// to-restore set contains v17 (the SIMD register with encoding 17) and
// the ignore set contains d17 (the double register with encoding 17) then
// no part of the physical register with encoding 17 will be restored.
// (This is probably not true on arm32, since that has aliased float32
// registers; but none of our other targets do.)
//
// * {Push,store}RegsInMask are further constrained as
// follows: when given the argument AllFloatRegisters, the resulting
// memory area must contain exactly all the SIMD/FP registers for the
// target at their widest width (that we care about). [We have no targets
// where the SIMD registers and FP register sets are disjoint.] They must
// be packed end-to-end with no holes, with the register with the lowest
// encoding number (0), as returned by FloatRegister::encoding(), at the
// abovementioned highest address, register 1 just below that, etc.
//
// Furthermore the sizeof(RegisterContent) must equal the size of a SIMD
// register in the abovementioned array.
//
// Furthermore the value returned by
// FloatRegister::getRegisterDumpOffsetInBytes must be a correct index
// into the abovementioned array. Given the constraints, the only correct
// value is `reg.encoding() * sizeof(RegisterContent)`.
// Regarding JitRuntime::generateInvalidator and the first two fields of
// class InvalidationBailoutStack (`fpregs_` and `regs_`). These form their
// own layout-equivalence class. That is, they must be format-consistent.
// But they are not part of the equivalence class that PushRegsInMask et al
// belong to. JitRuntime::generateInvalidator may use PushRegsInMask to
// generate part of the layout, but that's only a happy coincidence; some
// targets roll their own save-code instead.
//
// Nevertheless, because some targets *do* call PushRegsInMask from
// JitRuntime::generateInvalidator, you should check carefully all of the
// ::generateInvalidator methods if you change the PushRegsInMask format.
// The size of the area used by PushRegsInMask.
size_t PushRegsInMaskSizeInBytes(LiveRegisterSet set)
@ -433,7 +478,11 @@ class MacroAssembler : public MacroAssemblerSpecific {
// Like PushRegsInMask, but instead of pushing the registers, store them to
// |dest|. |dest| should point to the end of the reserved space, so the
// first register will be stored at |dest.offset - sizeof(register)|.
// first register will be stored at |dest.offset - sizeof(register)|. It is
// required that |dest.offset| is at least as large as the value computed by
// PushRegsInMaskSizeInBytes for this |set|. In other words, |dest.base|
// must point to either the lowest address in the save area, or some address
// below that.
void storeRegsInMask(LiveRegisterSet set, Address dest, Register scratch)
DEFINED_ON(arm, arm64, mips32, mips64, x86_shared);
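
To read the RegisterContent / dump-offset invariants above concretely, here
is a minimal hedged sketch; the constants are restated locally for ARM64 as
an illustration (the real definitions live in Architecture-arm64.h and are
not included here):

#include <cstdint>

// Illustrative stand-in for FloatRegisters::RegisterContent on ARM64: one
// slot per physical register, wide enough for a full 128-bit vector.
union RegisterContent {
  float s;
  double d;
  uint8_t v128[16];
};
static_assert(sizeof(RegisterContent) == 16, "slot must hold a SIMD register");

// The only offset rule compatible with the packed, encoding-ordered dump
// area described above.
constexpr uint32_t registerDumpOffsetInBytes(uint32_t encoding) {
  return encoding * sizeof(RegisterContent);
}

// For example, the slot for the register with encoding 17 starts at byte
// offset 17 * 16 within the dump area.
static_assert(registerDumpOffsetInBytes(17) == 272, "17 * 16");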

View File

@ -56,29 +56,52 @@ static PackedRegisterMask ReadRegisterMask(CompactBufferReader& stream) {
return stream.readUnsigned();
}
static void WriteFloatRegisterMask(CompactBufferWriter& stream, uint64_t bits) {
if (sizeof(FloatRegisters::SetType) == 1) {
stream.writeByte(bits);
} else if (sizeof(FloatRegisters::SetType) == 4) {
stream.writeUnsigned(bits);
} else {
MOZ_ASSERT(sizeof(FloatRegisters::SetType) == 8);
stream.writeUnsigned(bits & 0xffffffff);
stream.writeUnsigned(bits >> 32);
static void WriteFloatRegisterMask(CompactBufferWriter& stream,
FloatRegisters::SetType bits) {
switch (sizeof(FloatRegisters::SetType)) {
#ifdef JS_CODEGEN_ARM64
case 16:
stream.writeUnsigned64(bits.low());
stream.writeUnsigned64(bits.high());
break;
#else
case 1:
stream.writeByte(bits);
break;
case 4:
stream.writeUnsigned(bits);
break;
case 8:
stream.writeUnsigned64(bits);
break;
#endif
default:
MOZ_CRASH("WriteFloatRegisterMask: unexpected size");
}
}
static int64_t ReadFloatRegisterMask(CompactBufferReader& stream) {
if (sizeof(FloatRegisters::SetType) == 1) {
return stream.readByte();
static FloatRegisters::SetType ReadFloatRegisterMask(
CompactBufferReader& stream) {
switch (sizeof(FloatRegisters::SetType)) {
#ifdef JS_CODEGEN_ARM64
case 16: {
uint64_t low = stream.readUnsigned64();
uint64_t high = stream.readUnsigned64();
return Bitset128(high, low);
}
#else
case 1:
return stream.readByte();
case 2:
case 3:
case 4:
return stream.readUnsigned();
case 8:
return stream.readUnsigned64();
#endif
default:
MOZ_CRASH("ReadFloatRegisterMask: unexpected size");
}
if (sizeof(FloatRegisters::SetType) <= 4) {
return stream.readUnsigned();
}
MOZ_ASSERT(sizeof(FloatRegisters::SetType) == 8);
uint64_t ret = stream.readUnsigned();
ret |= uint64_t(stream.readUnsigned()) << 32;
return ret;
}
void SafepointWriter::writeGcRegs(LSafepoint* safepoint) {
@ -399,6 +422,7 @@ SafepointReader::SafepointReader(IonScript* script, const SafepointIndex* si)
valueSpills_ = GeneralRegisterSet(ReadRegisterMask(stream_));
#endif
}
allFloatSpills_ = FloatRegisterSet(ReadFloatRegisterMask(stream_));
advanceFromGcRegs();

View File

@ -26,12 +26,6 @@
namespace js {
namespace jit {
// In bytes: slots needed for potential memory->memory move spills.
// +8 for cycles
// +4 for gpr spills
// +8 for double spills
static const uint32_t ION_FRAME_SLACK_SIZE = 20;
// These offsets are specific to nunboxing, and capture offsets into the
// components of a js::Value.
static const int32_t NUNBOX32_TYPE_OFFSET = 4;

View File

@ -46,33 +46,71 @@ FloatRegisters::Code FloatRegisters::FromName(const char* name) {
return Invalid;
}
// These assume no SIMD registers as the register sets do not directly support
// SIMD. When SIMD is needed (wasm baseline + stubs), other routines are used.
// This must sync with GetPushSizeInBytes just below and also with
// MacroAssembler::PushRegsInMask.
FloatRegisterSet FloatRegister::ReduceSetForPush(const FloatRegisterSet& s) {
LiveFloatRegisterSet ret;
for (FloatRegisterIterator iter(s); iter.more(); ++iter) {
ret.addUnchecked(FromCode((*iter).encoding()));
}
return ret.set();
SetType all = s.bits();
SetType set128b =
(all & FloatRegisters::AllSimd128Mask) >> FloatRegisters::ShiftSimd128;
SetType doubleSet =
(all & FloatRegisters::AllDoubleMask) >> FloatRegisters::ShiftDouble;
SetType singleSet =
(all & FloatRegisters::AllSingleMask) >> FloatRegisters::ShiftSingle;
// See GetPushSizeInBytes.
SetType set64b = (singleSet | doubleSet) & ~set128b;
SetType reduced = (set128b << FloatRegisters::ShiftSimd128) |
(set64b << FloatRegisters::ShiftDouble);
return FloatRegisterSet(reduced);
}
// Compute the size of the dump area for |s.ReduceSetForPush()|, as defined by
// MacroAssembler::PushRegsInMask for this target.
uint32_t FloatRegister::GetPushSizeInBytes(const FloatRegisterSet& s) {
return s.size() * sizeof(double);
SetType all = s.bits();
SetType set128b =
(all & FloatRegisters::AllSimd128Mask) >> FloatRegisters::ShiftSimd128;
SetType doubleSet =
(all & FloatRegisters::AllDoubleMask) >> FloatRegisters::ShiftDouble;
SetType singleSet =
(all & FloatRegisters::AllSingleMask) >> FloatRegisters::ShiftSingle;
// PushRegsInMask pushes singles as if they were doubles. Also we need to
// remove singles or doubles which are also pushed as part of a vector
// register.
SetType set64b = (singleSet | doubleSet) & ~set128b;
// The "+ 1) & ~1" is to take into account the alignment hole below the
// double-reg dump area. See MacroAssembler::PushRegsInMaskSizeInBytes.
return ((set64b.size() + 1) & ~1) * sizeof(double) +
set128b.size() * SizeOfSimd128;
}
uint32_t FloatRegister::getRegisterDumpOffsetInBytes() {
// Although registers are 128-bits wide, only the first 64 need saving per
// ABI.
return encoding() * sizeof(double);
// See block comment in MacroAssembler.h for further required invariants.
static_assert(sizeof(jit::FloatRegisters::RegisterContent) == 16);
return encoding() * sizeof(jit::FloatRegisters::RegisterContent);
}
#if defined(ENABLE_WASM_SIMD)
uint32_t FloatRegister::GetPushSizeInBytesForWasmStubs(
const FloatRegisterSet& s) {
return s.size() * SizeOfSimd128;
// For N in 0..31, if any of sN, dN or qN is a member of `s`, the returned set
// will contain all of sN, dN and qN.
FloatRegisterSet FloatRegister::BroadcastToAllSizes(const FloatRegisterSet& s) {
SetType all = s.bits();
SetType set128b =
(all & FloatRegisters::AllSimd128Mask) >> FloatRegisters::ShiftSimd128;
SetType doubleSet =
(all & FloatRegisters::AllDoubleMask) >> FloatRegisters::ShiftDouble;
SetType singleSet =
(all & FloatRegisters::AllSingleMask) >> FloatRegisters::ShiftSingle;
SetType merged = set128b | doubleSet | singleSet;
SetType broadcasted = (merged << FloatRegisters::ShiftSimd128) |
(merged << FloatRegisters::ShiftDouble) |
(merged << FloatRegisters::ShiftSingle);
return FloatRegisterSet(broadcasted);
}
#endif
uint32_t GetARM64Flags() { return 0; }
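
To make the bit arithmetic above concrete, here is a simplified standalone
sketch, not code from the patch: it models the three lanes of the 128-bit set
as separate 32-bit masks (one per kind) and mirrors what ReduceSetForPush,
GetPushSizeInBytes and BroadcastToAllSizes compute for a small example set.

#include <cassert>
#include <cstdint>

// Simplified model: one 32-bit mask of physical registers per kind, instead
// of one packed 128-bit set.
struct FpuSet {
  uint32_t single = 0, dbl = 0, simd = 0;
};

static uint32_t popcount32(uint32_t x) {
  uint32_t n = 0;
  while (x) {
    x &= x - 1;  // clear the lowest set bit
    n++;
  }
  return n;
}

// ReduceSetForPush: singles are pushed as doubles, and anything already
// pushed as a vector is not pushed again as a double.
static FpuSet reduceForPush(FpuSet s) {
  uint32_t set64b = (s.single | s.dbl) & ~s.simd;
  return FpuSet{0, set64b, s.simd};
}

// GetPushSizeInBytes: the double count is rounded up to even (for the
// 16-byte alignment hole), plus 16 bytes per vector register.
static uint32_t pushSizeInBytes(FpuSet s) {
  FpuSet r = reduceForPush(s);
  return ((popcount32(r.dbl) + 1) & ~1u) * 8 + popcount32(r.simd) * 16;
}

// BroadcastToAllSizes: if any of sN, dN, qN is present, all three become so.
static FpuSet broadcast(FpuSet s) {
  uint32_t merged = s.single | s.dbl | s.simd;
  return FpuSet{merged, merged, merged};
}

int main() {
  // Example set: { s1, d2, q2, q5 }.
  FpuSet s{1u << 1, 1u << 2, (1u << 2) | (1u << 5)};
  FpuSet r = reduceForPush(s);
  assert(r.dbl == (1u << 1));                 // s1 widened to d1; d2 covered by q2
  assert(r.simd == ((1u << 2) | (1u << 5)));  // q2 and q5 stay as vectors
  assert(pushSizeInBytes(s) == 2 * 8 + 2 * 16);  // one double + hole, two vectors
  FpuSet b = broadcast(FpuSet{0, 1u << 17, 0});  // { d17 }
  assert(b.single == (1u << 17) && b.simd == (1u << 17));
  return 0;
}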

View File

@ -220,6 +220,107 @@ typedef uint32_t PackedRegisterMask;
template <typename T>
class TypedRegisterSet;
// 128-bit bitset for FloatRegisters::SetType.
class Bitset128 {
// The order (hi, lo) looks best in the debugger.
uint64_t hi, lo;
public:
MOZ_IMPLICIT constexpr Bitset128(uint64_t initial) : hi(0), lo(initial) {}
MOZ_IMPLICIT constexpr Bitset128(const Bitset128& that)
: hi(that.hi), lo(that.lo) {}
constexpr Bitset128(uint64_t hi, uint64_t lo) : hi(hi), lo(lo) {}
constexpr uint64_t high() const { return hi; }
constexpr uint64_t low() const { return lo; }
constexpr Bitset128 operator|(Bitset128 that) const {
return Bitset128(hi | that.hi, lo | that.lo);
}
constexpr Bitset128 operator&(Bitset128 that) const {
return Bitset128(hi & that.hi, lo & that.lo);
}
constexpr Bitset128 operator^(Bitset128 that) const {
return Bitset128(hi ^ that.hi, lo ^ that.lo);
}
constexpr Bitset128 operator~() const { return Bitset128(~hi, ~lo); }
// We must avoid shifting by the word width, which is complex. Inlining plus
// shift-by-constant will remove a lot of code in the normal case.
constexpr Bitset128 operator<<(size_t shift) const {
if (shift == 0) {
return *this;
}
if (shift < 64) {
return Bitset128((hi << shift) | (lo >> (64 - shift)), lo << shift);
}
if (shift == 64) {
return Bitset128(lo, 0);
}
return Bitset128(lo << (shift - 64), 0);
}
constexpr Bitset128 operator>>(size_t shift) const {
if (shift == 0) {
return *this;
}
if (shift < 64) {
return Bitset128(hi >> shift, (lo >> shift) | (hi << (64 - shift)));
}
if (shift == 64) {
return Bitset128(0, hi);
}
return Bitset128(0, hi >> (shift - 64));
}
constexpr bool operator==(Bitset128 that) const {
return lo == that.lo && hi == that.hi;
}
constexpr bool operator!=(Bitset128 that) const {
return lo != that.lo || hi != that.hi;
}
constexpr bool operator!() const { return (hi | lo) == 0; }
Bitset128& operator|=(const Bitset128& that) {
hi |= that.hi;
lo |= that.lo;
return *this;
}
Bitset128& operator&=(const Bitset128& that) {
hi &= that.hi;
lo &= that.lo;
return *this;
}
uint32_t size() const {
return mozilla::CountPopulation64(hi) + mozilla::CountPopulation64(lo);
}
uint32_t countTrailingZeroes() const {
if (lo) {
return mozilla::CountTrailingZeroes64(lo);
}
return mozilla::CountTrailingZeroes64(hi) + 64;
}
uint32_t countLeadingZeroes() const {
if (hi) {
return mozilla::CountLeadingZeroes64(hi);
}
return mozilla::CountLeadingZeroes64(lo) + 64;
}
};
class FloatRegisters {
public:
enum FPRegisterID {
@ -324,69 +425,34 @@ class FloatRegisters {
// Eight bits: (invalid << 7) | (kind << 5) | encoding
typedef uint8_t Code;
typedef FPRegisterID Encoding;
typedef uint64_t SetType;
typedef Bitset128 SetType;
// WARNING! About SIMD registers on Arm64:
//
// There is a Kind 'Simd128' but registers of this kind cannot be stored in
// register sets, the kind exists only to tag FloatRegisters as vector
// registers for use outside the sets, see below. The reason for this
// weirdness is that the 64-bit SetType is too small to hold information about
// vector registers, and we have poor options for making SetType larger.
//
// (We have two options for increasing the size of SetType: __uint128_t and
// some simulation of __uint128_t or perhaps __uint96_t. Using __uint128_t
// does not work because C++ compilers generate aligned accesses to
// __uint128_t fields, and structures containing register sets are frequently
// not properly aligned because they are allocated with TempAllocator. Using
// a simulation will result in a lot of code, possibly reduce inlining of set
// operations, and slow down JS compilation. We don't want to pay the penalty
// unless we have to.)
//
// Only the baseline compiler and the wasm stubs code need to deal with Arm64
// vector registers, Ion will never be exposed because JS does not have SIMD
// and we don't have Arm64 wasm support in Ion. So a fix that introduces the
// notion of vector registers but does not allow them to be put into sets
// works fairly well: The baseline compiler manages vector registers by
// managing the double registers which alias the vector registers, while the
// stubs code uses special save and restore paths that always save and restore
// the vector registers when they may contain meaningful data. The complexity
// is local to the stubs code, the lowest level of the register management
// code in the baseline compiler, and the code in this file.
enum Kind : uint8_t {
Double,
Single,
#ifdef ENABLE_WASM_SIMD
Simd128,
#endif
NumTypes
};
static constexpr int NumScalarTypes = 2;
enum Kind : uint8_t { Single, Double, Simd128, NumTypes };
static constexpr Code Invalid = 0x80;
static const char* GetName(uint32_t code) {
// Doubles precede singles, see `Kind` enum above.
// clang-format off
static const char* const Names[] = {
"s0", "s1", "s2", "s3", "s4", "s5", "s6", "s7", "s8", "s9",
"s10", "s11", "s12", "s13", "s14", "s15", "s16", "s17", "s18", "s19",
"s20", "s21", "s22", "s23", "s24", "s25", "s26", "s27", "s28", "s29",
"s30", "s31",
"d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d8", "d9",
"d10", "d11", "d12", "d13", "d14", "d15", "d16", "d17", "d18", "d19",
"d20", "d21", "d22", "d23", "d24", "d25", "d26", "d27", "d28", "d29",
"d30", "d31", "s0", "s1", "s2", "s3", "s4", "s5", "s6", "s7",
"s8", "s9", "s10", "s11", "s12", "s13", "s14", "s15", "s16", "s17",
"s18", "s19", "s20", "s21", "s22", "s23", "s24", "s25", "s26", "s27",
"s28", "s29", "s30", "s31",
#ifdef ENABLE_WASM_SIMD
"d30", "d31",
"v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9",
"v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19",
"v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29",
"v30", "v31",
#endif
};
static_assert(TotalWithSimd == sizeof(Names) / sizeof(Names[0]),
// clang-format on
static_assert(Total == sizeof(Names) / sizeof(Names[0]),
"Table is the correct size");
if (code >= TotalWithSimd) {
if (code >= Total) {
return "invalid";
}
return Names[code];
@ -395,64 +461,83 @@ class FloatRegisters {
static Code FromName(const char* name);
static const uint32_t TotalPhys = 32;
static const uint32_t Total = TotalPhys * NumScalarTypes;
static const uint32_t TotalWithSimd = TotalPhys * NumTypes;
static const uint32_t Total = TotalPhys * NumTypes;
static const uint32_t Allocatable = 31; // Without d31, the scratch register.
static_assert(sizeof(SetType) * 8 >= Total,
"SetType should be large enough to enumerate all registers.");
static const SetType SpreadSingle = SetType(1)
<< (uint32_t(Single) * TotalPhys);
static const SetType SpreadDouble = SetType(1)
<< (uint32_t(Double) * TotalPhys);
static const SetType Spread = SpreadSingle | SpreadDouble;
static constexpr unsigned ShiftSingle = uint32_t(Single) * TotalPhys;
static constexpr unsigned ShiftDouble = uint32_t(Double) * TotalPhys;
static constexpr unsigned ShiftSimd128 = uint32_t(Simd128) * TotalPhys;
static const SetType AllPhysMask = (SetType(1) << TotalPhys) - 1;
static const SetType AllMask = AllPhysMask * Spread;
static const SetType AllDoubleMask = AllPhysMask * SpreadDouble;
static const SetType AllSingleMask = AllPhysMask * SpreadSingle;
static const SetType NoneMask = SetType(0);
static constexpr SetType NoneMask = SetType(0);
static constexpr SetType AllPhysMask = ~(~SetType(0) << TotalPhys);
static constexpr SetType AllSingleMask = AllPhysMask << ShiftSingle;
static constexpr SetType AllDoubleMask = AllPhysMask << ShiftDouble;
static constexpr SetType AllSimd128Mask = AllPhysMask << ShiftSimd128;
static constexpr SetType AllMask =
AllDoubleMask | AllSingleMask | AllSimd128Mask;
static constexpr SetType AliasMask = (SetType(1) << ShiftSingle) |
(SetType(1) << ShiftDouble) |
(SetType(1) << ShiftSimd128);
static_assert(ShiftSingle == 0,
"Or the NonVolatileMask must be computed differently");
// s31 is the ScratchFloatReg.
static constexpr SetType NonVolatileSingleMask =
SetType((1 << FloatRegisters::s8) | (1 << FloatRegisters::s9) |
(1 << FloatRegisters::s10) | (1 << FloatRegisters::s11) |
(1 << FloatRegisters::s12) | (1 << FloatRegisters::s13) |
(1 << FloatRegisters::s14) | (1 << FloatRegisters::s15) |
(1 << FloatRegisters::s16) | (1 << FloatRegisters::s17) |
(1 << FloatRegisters::s18) | (1 << FloatRegisters::s19) |
(1 << FloatRegisters::s20) | (1 << FloatRegisters::s21) |
(1 << FloatRegisters::s22) | (1 << FloatRegisters::s23) |
(1 << FloatRegisters::s24) | (1 << FloatRegisters::s25) |
(1 << FloatRegisters::s26) | (1 << FloatRegisters::s27) |
(1 << FloatRegisters::s28) | (1 << FloatRegisters::s29) |
(1 << FloatRegisters::s30));
static constexpr SetType NonVolatileMask =
(NonVolatileSingleMask << ShiftSingle) |
(NonVolatileSingleMask << ShiftDouble) |
(NonVolatileSingleMask << ShiftSimd128);
static constexpr SetType VolatileMask = AllMask & ~NonVolatileMask;
static constexpr SetType WrapperMask = VolatileMask;
static_assert(ShiftSingle == 0,
"Or the NonAllocatableMask must be computed differently");
// d31 is the ScratchFloatReg.
static const SetType NonVolatileMask =
SetType((1 << FloatRegisters::d8) | (1 << FloatRegisters::d9) |
(1 << FloatRegisters::d10) | (1 << FloatRegisters::d11) |
(1 << FloatRegisters::d12) | (1 << FloatRegisters::d13) |
(1 << FloatRegisters::d14) | (1 << FloatRegisters::d15) |
(1 << FloatRegisters::d16) | (1 << FloatRegisters::d17) |
(1 << FloatRegisters::d18) | (1 << FloatRegisters::d19) |
(1 << FloatRegisters::d20) | (1 << FloatRegisters::d21) |
(1 << FloatRegisters::d22) | (1 << FloatRegisters::d23) |
(1 << FloatRegisters::d24) | (1 << FloatRegisters::d25) |
(1 << FloatRegisters::d26) | (1 << FloatRegisters::d27) |
(1 << FloatRegisters::d28) | (1 << FloatRegisters::d29) |
(1 << FloatRegisters::d30)) *
Spread;
static constexpr SetType NonAllocatableSingleMask =
(SetType(1) << FloatRegisters::s31);
static const SetType VolatileMask = AllMask & ~NonVolatileMask;
static constexpr SetType NonAllocatableMask =
NonAllocatableSingleMask | (NonAllocatableSingleMask << ShiftDouble) |
(NonAllocatableSingleMask << ShiftSimd128);
static const SetType WrapperMask = VolatileMask;
static constexpr SetType AllocatableMask = AllMask & ~NonAllocatableMask;
// d31 is the ScratchFloatReg.
static const SetType NonAllocatableMask =
(SetType(1) << FloatRegisters::d31) * Spread;
static const SetType AllocatableMask = AllMask & ~NonAllocatableMask;
// Content spilled during bailouts.
union RegisterContent {
float s;
double d;
uint8_t v128[16];
};
static constexpr Encoding encoding(Code c) {
// assert() not available in constexpr function.
// assert(c < TotalWithSimd);
// assert(c < Total);
return Encoding(c & 31);
}
static constexpr Kind kind(Code c) {
// assert() not available in constexpr function.
// assert(c < TotalWithSimd && ((c >> 5) & 3) < NumTypes);
// assert(c < Total && ((c >> 5) & 3) < NumTypes);
return Kind((c >> 5) & 3);
}
@ -462,12 +547,6 @@ class FloatRegisters {
}
};
// In bytes: slots needed for potential memory->memory move spills.
// +8 for cycles
// +8 for gpr spills
// +8 for double spills
static const uint32_t ION_FRAME_SLACK_SIZE = 24;
static const uint32_t ShadowStackSpace = 0;
// When our only strategy for far jumps is to encode the offset directly, and
@ -493,19 +572,22 @@ struct FloatRegister {
typedef Codes::SetType SetType;
static uint32_t SetSize(SetType x) {
static_assert(sizeof(SetType) == 8, "SetType must be 64 bits");
static_assert(sizeof(SetType) == 16, "SetType must be 128 bits");
x |= x >> FloatRegisters::TotalPhys;
x |= x >> FloatRegisters::TotalPhys;
x &= FloatRegisters::AllPhysMask;
return mozilla::CountPopulation32(x);
MOZ_ASSERT(x.high() == 0);
MOZ_ASSERT((x.low() >> 32) == 0);
return mozilla::CountPopulation32(x.low());
}
static uint32_t FirstBit(SetType x) {
static_assert(sizeof(SetType) == 8, "SetType");
return mozilla::CountTrailingZeroes64(x);
static_assert(sizeof(SetType) == 16, "SetType");
return x.countTrailingZeroes();
}
static uint32_t LastBit(SetType x) {
static_assert(sizeof(SetType) == 8, "SetType");
return 63 - mozilla::CountLeadingZeroes64(x);
static_assert(sizeof(SetType) == 16, "SetType");
return 127 - x.countLeadingZeroes();
}
static constexpr size_t SizeOfSimd128 = 16;
@ -529,7 +611,7 @@ struct FloatRegister {
: encoding_(0), kind_(FloatRegisters::Double), invalid_(true) {}
static FloatRegister FromCode(uint32_t i) {
MOZ_ASSERT(i < Codes::TotalWithSimd);
MOZ_ASSERT(i < Codes::Total);
return FloatRegister(FloatRegisters::encoding(i), FloatRegisters::kind(i));
}
@ -543,11 +625,7 @@ struct FloatRegister {
}
bool isSimd128() const {
MOZ_ASSERT(!invalid_);
#ifdef ENABLE_WASM_SIMD
return kind_ == FloatRegisters::Simd128;
#else
return false;
#endif
}
bool isInvalid() const { return invalid_; }
@ -561,11 +639,7 @@ struct FloatRegister {
}
FloatRegister asSimd128() const {
MOZ_ASSERT(!invalid_);
#ifdef ENABLE_WASM_SIMD
return FloatRegister(Encoding(encoding_), FloatRegisters::Simd128);
#else
MOZ_CRASH("No SIMD support");
#endif
}
constexpr uint32_t size() const {
@ -576,12 +650,8 @@ struct FloatRegister {
if (kind_ == FloatRegisters::Single) {
return sizeof(float);
}
#ifdef ENABLE_WASM_SIMD
MOZ_ASSERT(kind_ == FloatRegisters::Simd128);
return 16;
#else
MOZ_CRASH("No SIMD support");
#endif
return SizeOfSimd128;
}
constexpr Code code() const {
@ -619,24 +689,18 @@ struct FloatRegister {
return kind_ == other.kind_;
}
// numAliased is used only by Ion's register allocator, ergo we ignore SIMD
// registers here as Ion will not be exposed to SIMD on this platform. See
// comments above in FloatRegisters.
uint32_t numAliased() const { return Codes::NumScalarTypes; }
uint32_t numAliased() const { return Codes::NumTypes; }
uint32_t numAlignedAliased() { return numAliased(); }
FloatRegister aliased(uint32_t aliasIdx) {
MOZ_ASSERT(!invalid_);
MOZ_ASSERT(aliasIdx < numAliased());
return FloatRegister(Encoding(encoding_),
Kind((aliasIdx + kind_) % numAliased()));
}
FloatRegister alignedAliased(uint32_t aliasIdx) {
MOZ_ASSERT(aliasIdx < numAliased());
return aliased(aliasIdx);
Kind((aliasIdx + kind_) % Codes::NumTypes));
}
FloatRegister alignedAliased(uint32_t aliasIdx) { return aliased(aliasIdx); }
SetType alignedOrDominatedAliasedSet() const {
return Codes::Spread << encoding_;
return Codes::AliasMask << encoding_;
}
static constexpr RegTypeName DefaultType = RegTypeName::Float64;
@ -655,11 +719,12 @@ struct FloatRegister {
static TypedRegisterSet<FloatRegister> ReduceSetForPush(
const TypedRegisterSet<FloatRegister>& s);
static uint32_t GetPushSizeInBytes(const TypedRegisterSet<FloatRegister>& s);
#ifdef ENABLE_WASM_SIMD
static uint32_t GetPushSizeInBytesForWasmStubs(
const TypedRegisterSet<FloatRegister>& s);
#endif
uint32_t getRegisterDumpOffsetInBytes();
// For N in 0..31, if any of sN, dN or qN is a member of `s`, the
// returned set will contain all of sN, dN and qN.
static TypedRegisterSet<FloatRegister> BroadcastToAllSizes(
const TypedRegisterSet<FloatRegister>& s);
};
template <>
@ -674,6 +739,12 @@ FloatRegister::LiveAsIndexableSet<RegTypeName::Float64>(SetType set) {
return set & FloatRegisters::AllDoubleMask;
}
template <>
inline FloatRegister::SetType
FloatRegister::LiveAsIndexableSet<RegTypeName::Vector128>(SetType set) {
return set & FloatRegisters::AllSimd128Mask;
}
template <>
inline FloatRegister::SetType
FloatRegister::LiveAsIndexableSet<RegTypeName::Any>(SetType set) {

View File

@ -6,6 +6,8 @@
#include "jit/arm64/MacroAssembler-arm64.h"
#include "mozilla/Maybe.h"
#include "jsmath.h"
#include "jit/arm64/MoveEmitter-arm64.h"
@ -781,256 +783,373 @@ void MacroAssembler::flush() { Assembler::flush(); }
// ===============================================================
// Stack manipulation functions.
//
// These all assume no SIMD registers, because SIMD registers are handled with
// other routines when that is necessary. See lengthy comment in
// Architecture-arm64.h.
// Routines for saving/restoring registers on the stack. The format is:
//
// (highest address)
//
// integer (X) regs in any order size: 8 * # regs
// integer (X) regs in any order size: 8 * # int regs
//
// double (D) regs in any order size: 8 * # regs
// if # int regs is odd,
// then an 8 byte alignment hole size: 0 or 8
//
// double (D) regs in any order size: 8 * # double regs
//
// if # double regs is odd,
// then an 8 byte alignment hole size: 0 or 8
//
// vector (Q) regs in any order size: 16 * # vector regs
//
// (lowest address)
//
// Hence the size of the save area is 0 % 16. And, provided that the base
// (highest) address is 16-aligned, then the vector reg save/restore accesses
// will also be 16-aligned, as will pairwise operations for the double regs.
//
// Implied by this is that the format of the double and vector dump area
// corresponds with what FloatRegister::GetPushSizeInBytes computes.
// See block comment in MacroAssembler.h for more details.
size_t MacroAssembler::PushRegsInMaskSizeInBytes(LiveRegisterSet set) {
return set.gprs().size() * sizeof(intptr_t) + set.fpus().getPushSizeInBytes();
size_t numIntRegs = set.gprs().size();
return ((numIntRegs + 1) & ~1) * sizeof(intptr_t) +
FloatRegister::GetPushSizeInBytes(set.fpus());
}
void MacroAssembler::PushRegsInMask(LiveRegisterSet set) {
mozilla::DebugOnly<size_t> framePushedInitial = framePushed();
// Generate code to dump the values in `set`, either on the stack if `dest` is
// `Nothing` or working backwards from the address denoted by `dest` if it is
// `Some`. These two cases are combined so as to minimise the chance of
// mistakenly generating different formats for the same `set`, given that the
// `Some` `dest` case is used extremely rarely.
static void PushOrStoreRegsInMask(MacroAssembler* masm, LiveRegisterSet set,
mozilla::Maybe<Address> dest) {
static_assert(sizeof(FloatRegisters::RegisterContent) == 16);
// If we're saving to arbitrary memory, check the destination is big enough.
if (dest) {
mozilla::DebugOnly<size_t> bytesRequired =
masm->PushRegsInMaskSizeInBytes(set);
MOZ_ASSERT(dest->offset >= 0);
MOZ_ASSERT(((size_t)dest->offset) >= bytesRequired);
}
// Note the high limit point; we'll check it again later.
mozilla::DebugOnly<size_t> maxExtentInitial =
dest ? dest->offset : masm->framePushed();
// Gather up the integer registers in groups of four, and either push each
// group as a single transfer so as to minimise the number of stack pointer
// changes, or write them individually to memory. Take care to ensure the
// space used remains 16-aligned.
for (GeneralRegisterBackwardIterator iter(set.gprs()); iter.more();) {
vixl::CPURegister src[4] = {vixl::NoCPUReg, vixl::NoCPUReg, vixl::NoCPUReg,
vixl::NoCPUReg};
for (size_t i = 0; i < 4 && iter.more(); i++) {
size_t i;
for (i = 0; i < 4 && iter.more(); i++) {
src[i] = ARMRegister(*iter, 64);
++iter;
adjustFrame(8);
}
vixl::MacroAssembler::Push(src[0], src[1], src[2], src[3]);
MOZ_ASSERT(i > 0);
if (i == 1 || i == 3) {
// Ensure the stack remains 16-aligned
MOZ_ASSERT(!iter.more());
src[i] = vixl::xzr;
i++;
}
MOZ_ASSERT(i == 2 || i == 4);
if (dest) {
for (size_t j = 0; j < i; j++) {
Register ireg = Register::FromCode(src[j].IsZero() ? Registers::xzr
: src[j].code());
dest->offset -= sizeof(intptr_t);
masm->storePtr(ireg, *dest);
}
} else {
masm->adjustFrame(i * 8);
masm->vixl::MacroAssembler::Push(src[0], src[1], src[2], src[3]);
}
}
for (FloatRegisterBackwardIterator iter(set.fpus().reduceSetForPush());
iter.more();) {
vixl::CPURegister src[4] = {vixl::NoCPUReg, vixl::NoCPUReg, vixl::NoCPUReg,
vixl::NoCPUReg};
// Now the same for the FP double registers. Note that because of how
// ReduceSetForPush works, an underlying AArch64 SIMD/FP register can either
// be present as a double register, or as a V128 register, but not both.
// Firstly, round up the registers to be pushed.
MOZ_ASSERT(sizeof(FloatRegisters::RegisterContent) == 8);
for (size_t i = 0; i < 4 && iter.more(); i++) {
FloatRegister reg = *iter;
#ifdef ENABLE_WASM_SIMD
MOZ_RELEASE_ASSERT(reg.isDouble() || reg.isSingle());
#endif
src[i] = ARMFPRegister(reg, 64);
++iter;
adjustFrame(8);
FloatRegisterSet fpuSet(set.fpus().reduceSetForPush());
vixl::CPURegister allSrcs[FloatRegisters::TotalPhys];
size_t numAllSrcs = 0;
for (FloatRegisterBackwardIterator iter(fpuSet); iter.more(); ++iter) {
FloatRegister reg = *iter;
if (reg.isDouble()) {
MOZ_RELEASE_ASSERT(numAllSrcs < FloatRegisters::TotalPhys);
allSrcs[numAllSrcs] = ARMFPRegister(reg, 64);
numAllSrcs++;
} else {
MOZ_ASSERT(reg.isSimd128());
}
vixl::MacroAssembler::Push(src[0], src[1], src[2], src[3]);
}
MOZ_RELEASE_ASSERT(numAllSrcs <= FloatRegisters::TotalPhys);
MOZ_ASSERT(framePushed() - framePushedInitial ==
PushRegsInMaskSizeInBytes(set));
if ((numAllSrcs & 1) == 1) {
// We've got an odd number of doubles. In order to maintain 16-alignment,
// push the last register twice. We'll skip over the duplicate in
// PopRegsInMaskIgnore.
allSrcs[numAllSrcs] = allSrcs[numAllSrcs - 1];
numAllSrcs++;
}
MOZ_RELEASE_ASSERT(numAllSrcs <= FloatRegisters::TotalPhys);
MOZ_RELEASE_ASSERT((numAllSrcs & 1) == 0);
// And now generate the transfers.
size_t i;
if (dest) {
for (i = 0; i < numAllSrcs; i++) {
FloatRegister freg =
FloatRegister(FloatRegisters::FPRegisterID(allSrcs[i].code()),
FloatRegisters::Kind::Double);
dest->offset -= sizeof(double);
masm->storeDouble(freg, *dest);
}
} else {
i = 0;
while (i < numAllSrcs) {
vixl::CPURegister src[4] = {vixl::NoCPUReg, vixl::NoCPUReg,
vixl::NoCPUReg, vixl::NoCPUReg};
size_t j;
for (j = 0; j < 4 && j + i < numAllSrcs; j++) {
src[j] = allSrcs[j + i];
}
masm->adjustFrame(8 * j);
masm->vixl::MacroAssembler::Push(src[0], src[1], src[2], src[3]);
i += j;
}
}
MOZ_ASSERT(i == numAllSrcs);
// Finally, deal with the SIMD (V128) registers. This is a bit simpler
// as there's no need for special-casing to maintain 16-alignment.
numAllSrcs = 0;
for (FloatRegisterBackwardIterator iter(fpuSet); iter.more(); ++iter) {
FloatRegister reg = *iter;
if (reg.isSimd128()) {
MOZ_RELEASE_ASSERT(numAllSrcs < FloatRegisters::TotalPhys);
allSrcs[numAllSrcs] = ARMFPRegister(reg, 128);
numAllSrcs++;
}
}
MOZ_RELEASE_ASSERT(numAllSrcs <= FloatRegisters::TotalPhys);
// Generate the transfers.
if (dest) {
for (i = 0; i < numAllSrcs; i++) {
FloatRegister freg =
FloatRegister(FloatRegisters::FPRegisterID(allSrcs[i].code()),
FloatRegisters::Kind::Simd128);
dest->offset -= FloatRegister::SizeOfSimd128;
masm->storeUnalignedSimd128(freg, *dest);
}
} else {
i = 0;
while (i < numAllSrcs) {
vixl::CPURegister src[4] = {vixl::NoCPUReg, vixl::NoCPUReg,
vixl::NoCPUReg, vixl::NoCPUReg};
size_t j;
for (j = 0; j < 4 && j + i < numAllSrcs; j++) {
src[j] = allSrcs[j + i];
}
masm->adjustFrame(16 * j);
masm->vixl::MacroAssembler::Push(src[0], src[1], src[2], src[3]);
i += j;
}
}
MOZ_ASSERT(i == numAllSrcs);
// Final overrun check.
if (dest) {
MOZ_ASSERT(maxExtentInitial - dest->offset ==
masm->PushRegsInMaskSizeInBytes(set));
} else {
MOZ_ASSERT(masm->framePushed() - maxExtentInitial ==
masm->PushRegsInMaskSizeInBytes(set));
}
}
void MacroAssembler::PushRegsInMask(LiveRegisterSet set) {
PushOrStoreRegsInMask(this, set, mozilla::Nothing());
}
void MacroAssembler::storeRegsInMask(LiveRegisterSet set, Address dest,
Register scratch) {
mozilla::DebugOnly<size_t> offsetInitial = dest.offset;
PushOrStoreRegsInMask(this, set, mozilla::Some(dest));
}
FloatRegisterSet fpuSet(set.fpus().reduceSetForPush());
unsigned numFpu = fpuSet.size();
int32_t diffF = fpuSet.getPushSizeInBytes();
int32_t diffG = set.gprs().size() * sizeof(intptr_t);
MOZ_ASSERT(dest.offset >= diffG + diffF);
for (GeneralRegisterBackwardIterator iter(set.gprs()); iter.more(); ++iter) {
diffG -= sizeof(intptr_t);
dest.offset -= sizeof(intptr_t);
storePtr(*iter, dest);
// This is a helper function for PopRegsInMaskIgnore below. It emits the
// loads described by dests[0] and [1] and offsets[0] and [1], generating a
// load-pair if it can.
static void GeneratePendingLoadsThenFlush(MacroAssembler* masm,
vixl::CPURegister* dests,
uint32_t* offsets,
uint32_t transactionSize) {
// Generate the loads ..
if (!dests[0].IsNone()) {
if (!dests[1].IsNone()) {
// [0] and [1] both present.
if (offsets[0] + transactionSize == offsets[1]) {
masm->Ldp(dests[0], dests[1],
MemOperand(masm->GetStackPointer64(), offsets[0]));
} else {
// Theoretically we could check for a load-pair with the destinations
// switched, but our callers will never generate that. Hence there's
// no loss in giving up at this point and generating two loads.
masm->Ldr(dests[0], MemOperand(masm->GetStackPointer64(), offsets[0]));
masm->Ldr(dests[1], MemOperand(masm->GetStackPointer64(), offsets[1]));
}
} else {
// [0] only.
masm->Ldr(dests[0], MemOperand(masm->GetStackPointer64(), offsets[0]));
}
} else {
if (!dests[1].IsNone()) {
// [1] only. Can't happen because callers always fill [0] before [1].
MOZ_CRASH("GenerateLoadsThenFlush");
} else {
// Neither entry valid. This can happen.
}
}
MOZ_ASSERT(diffG == 0);
for (FloatRegisterBackwardIterator iter(fpuSet); iter.more(); ++iter) {
FloatRegister reg = *iter;
#ifdef ENABLE_WASM_SIMD
MOZ_RELEASE_ASSERT(reg.isDouble() || reg.isSingle());
#endif
diffF -= sizeof(double);
dest.offset -= sizeof(double);
numFpu -= 1;
storeDouble(reg, dest);
}
MOZ_ASSERT(numFpu == 0);
// Padding to keep the stack aligned, taken from the x64 and mips64
// implementations.
diffF -= diffF % sizeof(uintptr_t);
MOZ_ASSERT(diffF == 0);
MOZ_ASSERT(offsetInitial - dest.offset == PushRegsInMaskSizeInBytes(set));
// .. and flush.
dests[0] = dests[1] = vixl::NoCPUReg;
offsets[0] = offsets[1] = 0;
}
void MacroAssembler::PopRegsInMaskIgnore(LiveRegisterSet set,
LiveRegisterSet ignore) {
mozilla::DebugOnly<size_t> framePushedInitial = framePushed();
// The offset of the data from the stack pointer.
uint32_t offset = 0;
for (FloatRegisterIterator iter(set.fpus().reduceSetForPush());
iter.more();) {
vixl::CPURegister dest[2] = {vixl::NoCPUReg, vixl::NoCPUReg};
uint32_t nextOffset = offset;
// The set of FP/SIMD registers we need to restore.
FloatRegisterSet fpuSet(set.fpus().reduceSetForPush());
for (size_t i = 0; i < 2 && iter.more(); i++) {
FloatRegister reg = *iter;
#ifdef ENABLE_WASM_SIMD
MOZ_RELEASE_ASSERT(reg.isDouble() || reg.isSingle());
#endif
if (!ignore.has(reg)) {
dest[i] = ARMFPRegister(reg, 64);
}
++iter;
nextOffset += sizeof(double);
// The set of registers to ignore. BroadcastToAllSizes() is used to avoid
// any ambiguities arising from (eg) `fpuSet` containing q17 but `ignore`
// containing d17.
FloatRegisterSet ignoreFpusBroadcasted(
FloatRegister::BroadcastToAllSizes(ignore.fpus()));
// First recover the SIMD (V128) registers. This is straightforward in that
// we don't need to think about alignment holes.
// These three form a two-entry queue that holds loads that we know we
// need, but which we haven't yet emitted.
vixl::CPURegister pendingDests[2] = {vixl::NoCPUReg, vixl::NoCPUReg};
uint32_t pendingOffsets[2] = {0, 0};
size_t nPending = 0;
for (FloatRegisterIterator iter(fpuSet); iter.more(); ++iter) {
FloatRegister reg = *iter;
if (reg.isDouble()) {
continue;
}
MOZ_RELEASE_ASSERT(reg.isSimd128());
uint32_t offsetForReg = offset;
offset += FloatRegister::SizeOfSimd128;
if (ignoreFpusBroadcasted.hasRegisterIndex(reg)) {
continue;
}
if (!dest[0].IsNone() && !dest[1].IsNone()) {
Ldp(dest[0], dest[1], MemOperand(GetStackPointer64(), offset));
} else if (!dest[0].IsNone()) {
Ldr(dest[0], MemOperand(GetStackPointer64(), offset));
} else if (!dest[1].IsNone()) {
Ldr(dest[1], MemOperand(GetStackPointer64(), offset + sizeof(double)));
MOZ_ASSERT(nPending <= 2);
if (nPending == 2) {
GeneratePendingLoadsThenFlush(this, pendingDests, pendingOffsets, 16);
nPending = 0;
}
pendingDests[nPending] = ARMFPRegister(reg, 128);
pendingOffsets[nPending] = offsetForReg;
nPending++;
}
GeneratePendingLoadsThenFlush(this, pendingDests, pendingOffsets, 16);
nPending = 0;
offset = nextOffset;
MOZ_ASSERT((offset % 16) == 0);
// Now recover the FP double registers. This is more tricky in that we need
// to skip over the lowest-addressed of them if the number of them was odd.
if ((((fpuSet.bits() & FloatRegisters::AllDoubleMask).size()) & 1) == 1) {
offset += sizeof(double);
}
for (FloatRegisterIterator iter(fpuSet); iter.more(); ++iter) {
FloatRegister reg = *iter;
if (reg.isSimd128()) {
continue;
}
/* true but redundant, per loop above: MOZ_RELEASE_ASSERT(reg.isDouble()) */
uint32_t offsetForReg = offset;
offset += sizeof(double);
if (ignoreFpusBroadcasted.hasRegisterIndex(reg)) {
continue;
}
MOZ_ASSERT(nPending <= 2);
if (nPending == 2) {
GeneratePendingLoadsThenFlush(this, pendingDests, pendingOffsets, 8);
nPending = 0;
}
pendingDests[nPending] = ARMFPRegister(reg, 64);
pendingOffsets[nPending] = offsetForReg;
nPending++;
}
GeneratePendingLoadsThenFlush(this, pendingDests, pendingOffsets, 8);
nPending = 0;
MOZ_ASSERT((offset % 16) == 0);
MOZ_ASSERT(offset == set.fpus().getPushSizeInBytes());
for (GeneralRegisterIterator iter(set.gprs()); iter.more();) {
vixl::CPURegister dest[2] = {vixl::NoCPUReg, vixl::NoCPUReg};
uint32_t nextOffset = offset;
// And finally recover the integer registers, again skipping an alignment
// hole if it exists.
for (size_t i = 0; i < 2 && iter.more(); i++) {
if (!ignore.has(*iter)) {
dest[i] = ARMRegister(*iter, 64);
}
++iter;
nextOffset += sizeof(uint64_t);
}
if (!dest[0].IsNone() && !dest[1].IsNone()) {
Ldp(dest[0], dest[1], MemOperand(GetStackPointer64(), offset));
} else if (!dest[0].IsNone()) {
Ldr(dest[0], MemOperand(GetStackPointer64(), offset));
} else if (!dest[1].IsNone()) {
Ldr(dest[1], MemOperand(GetStackPointer64(), offset + sizeof(uint64_t)));
}
offset = nextOffset;
if ((set.gprs().size() & 1) == 1) {
offset += sizeof(uint64_t);
}
for (GeneralRegisterIterator iter(set.gprs()); iter.more(); ++iter) {
Register reg = *iter;
uint32_t offsetForReg = offset;
offset += sizeof(uint64_t);
if (ignore.has(reg)) {
continue;
}
MOZ_ASSERT(nPending <= 2);
if (nPending == 2) {
GeneratePendingLoadsThenFlush(this, pendingDests, pendingOffsets, 8);
nPending = 0;
}
pendingDests[nPending] = ARMRegister(reg, 64);
pendingOffsets[nPending] = offsetForReg;
nPending++;
}
GeneratePendingLoadsThenFlush(this, pendingDests, pendingOffsets, 8);
MOZ_ASSERT((offset % 16) == 0);
size_t bytesPushed = PushRegsInMaskSizeInBytes(set);
MOZ_ASSERT(offset == bytesPushed);
freeStack(bytesPushed);
}
#ifdef ENABLE_WASM_SIMD
void MacroAssemblerCompat::PushRegsInMaskForWasmStubs(LiveRegisterSet set) {
for (GeneralRegisterBackwardIterator iter(set.gprs()); iter.more();) {
vixl::CPURegister src[4] = {vixl::NoCPUReg, vixl::NoCPUReg, vixl::NoCPUReg,
vixl::NoCPUReg};
for (size_t i = 0; i < 4 && iter.more(); i++) {
src[i] = ARMRegister(*iter, 64);
++iter;
asMasm().adjustFrame(8);
}
vixl::MacroAssembler::Push(src[0], src[1], src[2], src[3]);
}
// reduceSetForPush returns a set with the unique encodings and kind==0. For
// each encoding in the set, just push the SIMD register.
for (FloatRegisterBackwardIterator iter(set.fpus().reduceSetForPush());
iter.more();) {
vixl::CPURegister src[4] = {vixl::NoCPUReg, vixl::NoCPUReg, vixl::NoCPUReg,
vixl::NoCPUReg};
for (size_t i = 0; i < 4 && iter.more(); i++) {
src[i] = ARMFPRegister(*iter, 128);
++iter;
asMasm().adjustFrame(FloatRegister::SizeOfSimd128);
}
vixl::MacroAssembler::Push(src[0], src[1], src[2], src[3]);
}
}
void MacroAssemblerCompat::PopRegsInMaskForWasmStubs(LiveRegisterSet set,
LiveRegisterSet ignore) {
// The offset of the data from the stack pointer.
uint32_t offset = 0;
// See comments above
for (FloatRegisterIterator iter(set.fpus().reduceSetForPush());
iter.more();) {
vixl::CPURegister dest[2] = {vixl::NoCPUReg, vixl::NoCPUReg};
uint32_t nextOffset = offset;
for (size_t i = 0; i < 2 && iter.more(); i++) {
if (!ignore.has(*iter)) {
dest[i] = ARMFPRegister(*iter, 128);
}
++iter;
nextOffset += FloatRegister::SizeOfSimd128;
}
if (!dest[0].IsNone() && !dest[1].IsNone()) {
Ldp(dest[0], dest[1], MemOperand(GetStackPointer64(), offset));
} else if (!dest[0].IsNone()) {
Ldr(dest[0], MemOperand(GetStackPointer64(), offset));
} else if (!dest[1].IsNone()) {
Ldr(dest[1], MemOperand(GetStackPointer64(), offset + 16));
}
offset = nextOffset;
}
MOZ_ASSERT(offset ==
FloatRegister::GetPushSizeInBytesForWasmStubs(set.fpus()));
for (GeneralRegisterIterator iter(set.gprs()); iter.more();) {
vixl::CPURegister dest[2] = {vixl::NoCPUReg, vixl::NoCPUReg};
uint32_t nextOffset = offset;
for (size_t i = 0; i < 2 && iter.more(); i++) {
if (!ignore.has(*iter)) {
dest[i] = ARMRegister(*iter, 64);
}
++iter;
nextOffset += sizeof(uint64_t);
}
if (!dest[0].IsNone() && !dest[1].IsNone()) {
Ldp(dest[0], dest[1], MemOperand(GetStackPointer64(), offset));
} else if (!dest[0].IsNone()) {
Ldr(dest[0], MemOperand(GetStackPointer64(), offset));
} else if (!dest[1].IsNone()) {
Ldr(dest[1], MemOperand(GetStackPointer64(), offset + sizeof(uint64_t)));
}
offset = nextOffset;
}
size_t bytesPushed =
set.gprs().size() * sizeof(uint64_t) +
FloatRegister::GetPushSizeInBytesForWasmStubs(set.fpus());
MOZ_ASSERT(offset == bytesPushed);
asMasm().freeStack(bytesPushed);
}
#endif
void MacroAssembler::Push(Register reg) {
push(reg);
adjustFrame(sizeof(intptr_t));
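
As a worked example of the save-area layout documented above (integer regs,
optional hole, doubles, optional hole, vectors), here is a hedged sketch that
restates the size arithmetic with hypothetical register counts rather than
calling the real functions:

#include <cassert>
#include <cstdint>

// Save-area size for a set with nInt integer registers, nDouble double
// registers (after ReduceSetForPush) and nVector V128 registers, following
// the layout above: ints (padded to 16), doubles (padded to 16), vectors.
static uint32_t saveAreaSize(uint32_t nInt, uint32_t nDouble, uint32_t nVector) {
  uint32_t intBytes = ((nInt + 1) & ~1u) * 8;        // plus optional 8-byte hole
  uint32_t doubleBytes = ((nDouble + 1) & ~1u) * 8;  // plus optional 8-byte hole
  uint32_t vectorBytes = nVector * 16;
  return intBytes + doubleBytes + vectorBytes;
}

int main() {
  // Saving, say, {x0, x1, x2}, {d8} and {q16, q17}:
  //   3 ints    -> 24 bytes + 8-byte hole = 32
  //   1 double  ->  8 bytes + 8-byte hole = 16
  //   2 vectors ->                          32
  uint32_t size = saveAreaSize(3, 1, 2);
  assert(size == 80);
  assert(size % 16 == 0);  // the area is always a multiple of 16 bytes
  return 0;
}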

View File

@ -250,11 +250,6 @@ class MacroAssemblerCompat : public vixl::MacroAssembler {
vixl::MacroAssembler::Drop(Operand(ARMRegister(amount, 64)));
}
#ifdef ENABLE_WASM_SIMD
void PushRegsInMaskForWasmStubs(LiveRegisterSet set);
void PopRegsInMaskForWasmStubs(LiveRegisterSet set, LiveRegisterSet ignore);
#endif
// Update sp with the value of the current active stack pointer, if necessary.
void syncStackPtr() {
if (!GetStackPointer64().Is(vixl::sp)) {

View File

@ -59,6 +59,9 @@ void MoveEmitterARM64::emitMove(const MoveOp& move) {
case MoveOp::DOUBLE:
emitDoubleMove(from, to);
break;
case MoveOp::SIMD128:
emitSimd128Move(from, to);
break;
case MoveOp::INT32:
emitInt32Move(from, to);
break;
@ -114,6 +117,28 @@ void MoveEmitterARM64::emitDoubleMove(const MoveOperand& from,
masm.Str(scratch, toMemOperand(to));
}
void MoveEmitterARM64::emitSimd128Move(const MoveOperand& from,
const MoveOperand& to) {
if (from.isFloatReg()) {
if (to.isFloatReg()) {
masm.Mov(toFPReg(to, MoveOp::SIMD128), toFPReg(from, MoveOp::SIMD128));
} else {
masm.Str(toFPReg(from, MoveOp::SIMD128), toMemOperand(to));
}
return;
}
if (to.isFloatReg()) {
masm.Ldr(toFPReg(to, MoveOp::SIMD128), toMemOperand(from));
return;
}
vixl::UseScratchRegisterScope temps(&masm.asVIXL());
const ARMFPRegister scratch = temps.AcquireQ();
masm.Ldr(scratch, toMemOperand(from));
masm.Str(scratch, toMemOperand(to));
}
void MoveEmitterARM64::emitInt32Move(const MoveOperand& from,
const MoveOperand& to) {
if (from.isGeneralReg()) {
@ -213,6 +238,17 @@ void MoveEmitterARM64::breakCycle(const MoveOperand& from,
}
break;
case MoveOp::SIMD128:
if (to.isMemory()) {
vixl::UseScratchRegisterScope temps(&masm.asVIXL());
const ARMFPRegister scratch128 = temps.AcquireQ();
masm.Ldr(scratch128, toMemOperand(to));
masm.Str(scratch128, cycleSlot());
} else {
masm.Str(toFPReg(to, type), cycleSlot());
}
break;
case MoveOp::INT32:
if (to.isMemory()) {
vixl::UseScratchRegisterScope temps(&masm.asVIXL());
@ -265,6 +301,17 @@ void MoveEmitterARM64::completeCycle(const MoveOperand& from,
}
break;
case MoveOp::SIMD128:
if (to.isMemory()) {
vixl::UseScratchRegisterScope temps(&masm.asVIXL());
const ARMFPRegister scratch = temps.AcquireQ();
masm.Ldr(scratch, cycleSlot());
masm.Str(scratch, toMemOperand(to));
} else {
masm.Ldr(toFPReg(to, type), cycleSlot());
}
break;
case MoveOp::INT32:
if (to.isMemory()) {
vixl::UseScratchRegisterScope temps(&masm.asVIXL());

View File

@ -50,12 +50,21 @@ class MoveEmitterARM64 {
}
ARMFPRegister toFPReg(const MoveOperand& operand, MoveOp::Type t) const {
MOZ_ASSERT(operand.isFloatReg());
return ARMFPRegister(operand.floatReg().encoding(),
t == MoveOp::FLOAT32 ? 32 : 64);
switch (t) {
case MoveOp::FLOAT32:
return ARMFPRegister(operand.floatReg().encoding(), 32);
case MoveOp::DOUBLE:
return ARMFPRegister(operand.floatReg().encoding(), 64);
case MoveOp::SIMD128:
return ARMFPRegister(operand.floatReg().encoding(), 128);
default:
MOZ_MAKE_COMPILER_ASSUME_IS_UNREACHABLE("Bad register type");
}
}
void emitFloat32Move(const MoveOperand& from, const MoveOperand& to);
void emitDoubleMove(const MoveOperand& from, const MoveOperand& to);
void emitSimd128Move(const MoveOperand& from, const MoveOperand& to);
void emitInt32Move(const MoveOperand& from, const MoveOperand& to);
void emitGeneralMove(const MoveOperand& from, const MoveOperand& to);

View File

@ -342,6 +342,8 @@ static void PushRegisterDump(MacroAssembler& masm) {
// be pushed. So pushing happens in two phases.
//
// Registers are pushed in reverse order of code.
//
// See block comment in MacroAssembler.h for further required invariants.
// First, push the last four registers, passing zero for sp.
// Zero is pushed for x28 and x31: the pseudo-SP and SP, respectively.

View File

@ -2429,6 +2429,7 @@ class UseScratchRegisterScope {
Register AcquireX() { return AcquireNextAvailable(available_).X(); }
VRegister AcquireS() { return AcquireNextAvailable(availablefp_).S(); }
VRegister AcquireD() { return AcquireNextAvailable(availablefp_).D(); }
VRegister AcquireQ() { return AcquireNextAvailable(availablefp_).Q(); }
Register AcquireSameSizeAs(const Register& reg);

View File

@ -22,21 +22,6 @@
namespace js {
namespace jit {
#if defined(JS_CODEGEN_X86)
// In bytes: slots needed for potential memory->memory move spills.
// +8 for cycles
// +4 for gpr spills
// +8 for double spills
static const uint32_t ION_FRAME_SLACK_SIZE = 20;
#elif defined(JS_CODEGEN_X64)
// In bytes: slots needed for potential memory->memory move spills.
// +8 for cycles
// +8 for gpr spills
// +8 for double spills
static const uint32_t ION_FRAME_SLACK_SIZE = 24;
#endif
#if defined(JS_CODEGEN_X86)
// These offsets are specific to nunboxing, and capture offsets into the
// components of a js::Value.

View File

@ -642,6 +642,9 @@ class BaseRegAlloc {
// masm. (This is the case on ARM64 for now, and is a consequence of needing
// more than 64 bits for FloatRegisters::SetType to represent SIMD registers.
// See lengthy comment in Architecture-arm64.h.)
//
// FIXME: RABALDR_SIDEALLOC_V128 is no longer necessary on ARM64, we should
// be able to use SIMD normally there.
BaseCompilerInterface* bc;
AllocatableGeneralRegisterSet availGPR;

View File

@ -155,42 +155,6 @@ void ABIResultIter::settlePrev() {
cur_ = ABIResult(type, nextStackOffset_);
}
// Register save/restore.
//
// On ARM64, the register sets are not able to represent SIMD registers (see
// lengthy comment in Architecture-arm64.h for information), and so we use a
// hack to save and restore them: on this architecture, when we care about SIMD,
// we call special routines that know about them.
//
// In a couple of cases it is not currently necessary to save and restore SIMD
// registers, but the extra traffic is all along slow paths and not really worth
// optimizing.
static void PushRegsInMask(MacroAssembler& masm, const LiveRegisterSet& set) {
#if defined(ENABLE_WASM_SIMD) && defined(JS_CODEGEN_ARM64)
masm.PushRegsInMaskForWasmStubs(set);
#else
masm.PushRegsInMask(set);
#endif
}
static void PopRegsInMask(MacroAssembler& masm, const LiveRegisterSet& set) {
#if defined(ENABLE_WASM_SIMD) && defined(JS_CODEGEN_ARM64)
masm.PopRegsInMaskForWasmStubs(set, LiveRegisterSet());
#else
masm.PopRegsInMask(set);
#endif
}
static void PopRegsInMaskIgnore(MacroAssembler& masm,
const LiveRegisterSet& set,
const LiveRegisterSet& ignore) {
#if defined(ENABLE_WASM_SIMD) && defined(JS_CODEGEN_ARM64)
masm.PopRegsInMaskForWasmStubs(set, ignore);
#else
masm.PopRegsInMaskIgnore(set, ignore);
#endif
}
#ifdef WASM_CODEGEN_DEBUG
template <class Closure>
static void GenPrint(DebugChannel channel, MacroAssembler& masm,
@ -201,7 +165,7 @@ static void GenPrint(DebugChannel channel, MacroAssembler& masm,
AllocatableRegisterSet regs(RegisterSet::All());
LiveRegisterSet save(regs.asLiveSet());
PushRegsInMask(masm, save);
masm.PushRegsInMask(save);
if (taken) {
regs.take(taken.value());
@ -215,7 +179,7 @@ static void GenPrint(DebugChannel channel, MacroAssembler& masm,
passArgAndCall(IsCompilingWasm(), temp);
}
PopRegsInMask(masm, save);
masm.PopRegsInMask(save);
}
static void GenPrintf(DebugChannel channel, MacroAssembler& masm,
@ -774,15 +738,10 @@ static bool GenerateInterpEntry(MacroAssembler& masm, const FuncExport& fe,
// Save all caller non-volatile registers before we clobber them here and in
// the wasm callee (which does not preserve non-volatile registers).
masm.setFramePushed(0);
PushRegsInMask(masm, NonVolatileRegs);
masm.PushRegsInMask(NonVolatileRegs);
const unsigned nonVolatileRegsPushSize =
#if defined(ENABLE_WASM_SIMD) && defined(JS_CODEGEN_ARM64)
NonVolatileRegs.gprs().size() * sizeof(intptr_t) +
FloatRegister::GetPushSizeInBytesForWasmStubs(NonVolatileRegs.fpus());
#else
masm.PushRegsInMaskSizeInBytes(NonVolatileRegs);
#endif
MOZ_ASSERT(masm.framePushed() == nonVolatileRegsPushSize);
@ -904,7 +863,7 @@ static bool GenerateInterpEntry(MacroAssembler& masm, const FuncExport& fe,
masm.bind(&join);
// Restore clobbered non-volatile registers of the caller.
PopRegsInMask(masm, NonVolatileRegs);
masm.PopRegsInMask(NonVolatileRegs);
MOZ_ASSERT(masm.framePushed() == 0);
#if defined(JS_CODEGEN_ARM64)
@ -992,7 +951,7 @@ static void GenerateBigIntInitialization(MacroAssembler& masm,
// We need to avoid clobbering other argument registers and the input.
AllocatableRegisterSet regs(RegisterSet::Volatile());
LiveRegisterSet save(regs.asLiveSet());
PushRegsInMask(masm, save);
masm.PushRegsInMask(save);
unsigned frameSize = StackDecrementForCall(
ABIStackAlignment, masm.framePushed() + bytesPushedByPrologue, 0);
@ -1013,7 +972,7 @@ static void GenerateBigIntInitialization(MacroAssembler& masm,
LiveRegisterSet ignore;
ignore.add(scratch);
PopRegsInMaskIgnore(masm, save, ignore);
masm.PopRegsInMaskIgnore(save, ignore);
masm.branchTest32(Assembler::Zero, scratch, scratch, fail);
masm.initializeBigInt64(Scalar::BigInt64, scratch, input);
@ -2700,15 +2659,17 @@ static const LiveRegisterSet RegsToPreserve(
// We assume that traps do not happen while lr is live. This both ensures that
// the size of RegsToPreserve is a multiple of 2 (preserving WasmStackAlignment)
// and gives us a register to clobber in the return path.
//
// Note there are no SIMD registers in the set; the doubles in the set stand in
// for SIMD registers, which are pushed as appropriate. See comments above at
// PushRegsInMask and lengthy comment in Architecture-arm64.h.
static const LiveRegisterSet RegsToPreserve(
GeneralRegisterSet(Registers::AllMask &
~((Registers::SetType(1) << RealStackPointer.code()) |
(Registers::SetType(1) << Registers::lr))),
# ifdef ENABLE_WASM_SIMD
FloatRegisterSet(FloatRegisters::AllSimd128Mask));
# else
// If SIMD is not enabled, it's pointless to save/restore the upper 64
// bits of each vector register.
FloatRegisterSet(FloatRegisters::AllDoubleMask));
# endif
#elif defined(JS_CODEGEN_X86) || defined(JS_CODEGEN_X64)
// It's correct to use FloatRegisters::AllMask even when SIMD is not enabled;
// PushRegsInMask strips out the high lanes of the XMM registers in this case,
@ -2762,7 +2723,7 @@ static bool GenerateTrapExit(MacroAssembler& masm, Label* throwLabel,
// Push a dummy word to use as return address below.
WasmPush(masm, ImmWord(TrapExitDummyValue));
unsigned framePushedBeforePreserve = masm.framePushed();
PushRegsInMask(masm, RegsToPreserve);
masm.PushRegsInMask(RegsToPreserve);
unsigned offsetOfReturnWord = masm.framePushed() - framePushedBeforePreserve;
// We know that StackPointer is word-aligned, but not necessarily
@ -2786,7 +2747,7 @@ static bool GenerateTrapExit(MacroAssembler& masm, Label* throwLabel,
// use to jump to via ret.
masm.moveToStackPtr(preAlignStackPointer);
masm.storePtr(ReturnReg, Address(masm.getStackPointer(), offsetOfReturnWord));
PopRegsInMask(masm, RegsToPreserve);
masm.PopRegsInMask(RegsToPreserve);
#ifdef JS_CODEGEN_ARM64
WasmPop(masm, lr);
masm.abiret();
@ -2926,7 +2887,7 @@ static bool GenerateDebugTrapStub(MacroAssembler& masm, Label* throwLabel,
GenerateExitPrologue(masm, 0, ExitReason::Fixed::DebugTrap, offsets);
// Save all registers used between baseline compiler operations.
PushRegsInMask(masm, AllAllocatableRegs);
masm.PushRegsInMask(AllAllocatableRegs);
uint32_t framePushed = masm.framePushed();
@ -2960,7 +2921,7 @@ static bool GenerateDebugTrapStub(MacroAssembler& masm, Label* throwLabel,
#endif
masm.setFramePushed(framePushed);
PopRegsInMask(masm, AllAllocatableRegs);
masm.PopRegsInMask(AllAllocatableRegs);
GenerateExitEpilogue(masm, 0, ExitReason::Fixed::DebugTrap, offsets);