Backed out 13 changesets (bug 1641595, bug 1671998, bug 1672770, bug 1641641, bug 1672697, bug 1672911, bug 1671996, bug 1671907, bug 1640662) for causing sm failures in splat-x64-ion-codegen.js

CLOSED TREE

Backed out changeset b5f6faabcb09 (bug 1641595)
Backed out changeset 326887a2b839 (bug 1641595)
Backed out changeset 2db3227f010f (bug 1672911)
Backed out changeset 857eeed4ed64 (bug 1672770)
Backed out changeset 388b419c10c6 (bug 1672697)
Backed out changeset 3d7f502d9ead (bug 1671998)
Backed out changeset 202d806bac14 (bug 1671996)
Backed out changeset bda987b1c505 (bug 1640662)
Backed out changeset 4e949bd393fa (bug 1640662)
Backed out changeset 164c3fd55821 (bug 1640662)
Backed out changeset 2026ca13fc92 (bug 1640662)
Backed out changeset 2716ecaf485c (bug 1671907)
Backed out changeset 92fe89072e8c (bug 1641641)
Mihai Alexandru Michis 2020-10-26 16:49:09 +02:00
parent d38909b148
commit 80f328510e
40 changed files with 655 additions and 1821 deletions

View File

@ -1,28 +0,0 @@
About x86 ion whitebox code generation tests (*-codegen.js):

These test that extraneous moves are not inserted by the register
allocator or code generator under ideal conditions: when it is in
principle possible for the code generator and register allocator to
use inputs where they are and generate outputs directly in the target
registers.

These tests are both limited in scope and brittle in the face of
changes to the register allocator, but how else would we test that
code generation and register allocation work when presented with the
easiest case? And if they don't work then, when will they work?

For a reliable test, the inputs must be known to be in xmm0, xmm1,
xmm2, etc (the function argument registers) and the result must be
known to be desired in xmm0 (the function result).

Sometimes, to test optimal codegen, we need the inputs to be in
reversed or permuted locations so as to avoid generating moves that
are inserted by the regalloc to adapt to the function signature.

In the test cases, the expected output is expressed as a multi-line
regular expression. The first line of each expected output is the
tail end of the prologue; subsequent lines comprise the operation;
finally there is the beginning of the epilogue. Sometimes there is
only the end of the prologue and the beginning of the operation, as
the operation is long and we don't care about its tail.
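For orientation, here is a minimal sketch of the pattern these tests follow. It is assembled from the test files later in this patch (the expected output is the i8x16.eq case from the comparison tests), assumes an x64 Ion build with SIMD enabled, and uses the SpiderMonkey shell helpers wasmEvalText, wasmDis, and assertEq that the tests themselves use; column spacing in the expected output is abbreviated here, as in the surrounding extract.

// Compile a small wasm function; its v128 parameters arrive in xmm0/xmm1
// and its v128 result is expected in xmm0.
let ins = wasmEvalText(`
  (module
    (func (export "f") (param v128) (param v128) (result v128)
      (i8x16.eq (local.get 0) (local.get 1))))
`);
// Tail of the prologue, then the operation, then the start of the epilogue.
let expected = `
000000.. 48 8b ec mov %rsp, %rbp
000000.. 66 0f 74 c1 pcmpeqb %xmm1, %xmm0
000000.. 5d pop %rbp
`;
let output = wasmDis(ins.exports.f, "ion", true);
// Some builds cannot produce disassembly; there is nothing to check then.
if (output.indexOf('No disassembly available') < 0) {
  assertEq(output.match(new RegExp(expected)) != null, true);
}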

View File

@ -1589,7 +1589,6 @@ assertEq(ins.exports.const_extract_i64x2_1(), -2n);
// operand 1 is v128 in memory
// operand 2 is immediate scalar
// lane index is immediate so we're testing something randomish but not zero
// (note though that fp operations have special cases for zero)
// result is v128 in memory
var ins = wasmEvalText(`
@ -1607,9 +1606,6 @@ var ins = wasmEvalText(`
(func (export "replace_i64x2_1") (param $value i64)
(v128.store (i32.const 0)
(i64x2.replace_lane 1 (v128.load (i32.const 16)) (local.get $value))))
(func (export "replace_f32x4_0") (param $value f32)
(v128.store (i32.const 0)
(f32x4.replace_lane 0 (v128.load (i32.const 16)) (local.get $value))))
(func (export "replace_f32x4_3") (param $value f32)
(v128.store (i32.const 0)
(f32x4.replace_lane 3 (v128.load (i32.const 16)) (local.get $value))))
@ -1652,10 +1648,6 @@ assertSame(get(mem64, 0, 2), upd(as, 1, 42));
var mem32 = new Float32Array(ins.exports.mem.buffer);
var as = [1.5, 2.5, 3.5, 4.5];
set(mem32, 4, as)
ins.exports.replace_f32x4_0(42.5);
assertSame(get(mem32, 0, 4), upd(as, 0, 42.5));
set(mem32, 4, as)
ins.exports.replace_f32x4_3(42.5);
assertSame(get(mem32, 0, 4), upd(as, 3, 42.5));
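The set/get/upd helpers used above are defined earlier in this test file and are not part of this hunk. As an illustrative sketch only, a rough equivalent of the surviving replace_f32x4_3 check, written directly against the typed-array view, is:

var mem32 = new Float32Array(ins.exports.mem.buffer);
// The wat code loads the input vector from byte offset 16 and stores the
// result vector at byte offset 0.
mem32.set([1.5, 2.5, 3.5, 4.5], 4); // element index 4 == byte offset 16
ins.exports.replace_f32x4_3(42.5);
assertEq(mem32[3], 42.5); // lane 3 was replaced
assertEq(mem32[0], 1.5);  // the other lanes are copied through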

View File

@ -1,90 +0,0 @@
// |jit-test| skip-if: !wasmSimdEnabled() || wasmCompileMode() != "ion" || !getBuildConfiguration().x64
// Test that there are no extraneous moves or fixups for sundry SIMD binary
// operations. See README-codegen.md for general information about this type of
// test case.
// Inputs (xmm0, xmm1)
for ( let [op, expected] of [
['f32x4.replace_lane 0',
`
000000.. 48 8b ec mov %rsp, %rbp
000000.. f3 0f 10 c1 movss %xmm1, %xmm0
000000.. 5d pop %rbp
`],
['f32x4.replace_lane 1',
`
000000.. 48 8b ec mov %rsp, %rbp
000000.. 66 0f 3a 21 c1 10 insertps \\$0x10, %xmm1, %xmm0
000000.. 5d pop %rbp
`],
['f32x4.replace_lane 3',
`
000000.. 48 8b ec mov %rsp, %rbp
000000.. 66 0f 3a 21 c1 30 insertps \\$0x30, %xmm1, %xmm0
000000.. 5d pop %rbp
`],
['f64x2.replace_lane 0',
`
000000.. 48 8b ec mov %rsp, %rbp
000000.. f2 0f 10 c1 movsd %xmm1, %xmm0
000000.. 5d pop %rbp
`],
['f64x2.replace_lane 1',
`
000000.. 48 8b ec mov %rsp, %rbp
000000.. 66 0f c6 c1 00 shufpd \\$0x00, %xmm1, %xmm0
000000.. 5d pop %rbp
`],
] ) {
let ptype = op.substring(0,3);
let ins = wasmEvalText(`
(module
(func (export "f") (param v128) (param ${ptype}) (result v128)
(${op} (local.get 0) (local.get 1))))
`);
let output = wasmDis(ins.exports.f, "ion", true);
if (output.indexOf('No disassembly available') >= 0)
continue;
assertEq(output.match(new RegExp(expected)) != null, true);
}
// Inputs (xmm1, xmm0)
for ( let [op, expected] of [
['f32x4.pmin',
`
000000.. 48 8b ec mov %rsp, %rbp
000000.. 0f 5d c1 minps %xmm1, %xmm0
000000.. 5d pop %rbp
`],
['f32x4.pmax',
`
000000.. 48 8b ec mov %rsp, %rbp
000000.. 0f 5f c1 maxps %xmm1, %xmm0
000000.. 5d pop %rbp
`],
['f64x2.pmin',
`
000000.. 48 8b ec mov %rsp, %rbp
000000.. 66 0f 5d c1 minpd %xmm1, %xmm0
000000.. 5d pop %rbp
`],
['f64x2.pmax',
`
000000.. 48 8b ec mov %rsp, %rbp
000000.. 66 0f 5f c1 maxpd %xmm1, %xmm0
000000.. 5d pop %rbp
`],
] ) {
let ins = wasmEvalText(`
(module
(func (export "f") (param v128) (param v128) (result v128)
(${op} (local.get 1) (local.get 0))))
`);
let output = wasmDis(ins.exports.f, "ion", true);
if (output.indexOf('No disassembly available') >= 0)
continue;
assertEq(output.match(new RegExp(expected)) != null, true);
}

View File

@ -1,30 +0,0 @@
// |jit-test| skip-if: !wasmSimdEnabled() || wasmCompileMode() != "ion" || !getBuildConfiguration().x64
// Test that there are no extraneous moves or fixups for SIMD bitselect
// operations. See README-codegen.md for general information about this type of
// test case.
// The codegen enforces onTrue == output so we avoid a move to set that up.
//
// The remaining movdqa is currently unavoidable; it moves the control mask into a temp.
// The temp should be identical to the mask but the regalloc does not currently
// allow this constraint to be enforced.
let expected = `
000000.. 48 8b ec mov %rsp, %rbp
000000.. 66 0f 6f da movdqa %xmm2, %xmm3
000000.. 66 0f db c3 pand %xmm3, %xmm0
000000.. 66 0f df d9 pandn %xmm1, %xmm3
000000.. 66 0f eb c3 por %xmm3, %xmm0
000000.. 5d pop %rbp
`;
let ins = wasmEvalText(`
(module
(func (export "f") (param v128) (param v128) (param v128) (param v128) (result v128)
(v128.bitselect (local.get 0) (local.get 1) (local.get 2))))
`);
let output = wasmDis(ins.exports.f, "ion", true);
if (output.indexOf('No disassembly available') < 0) {
assertEq(output.match(new RegExp(expected)) != null, true);
}

View File

@ -1,224 +0,0 @@
// |jit-test| skip-if: !wasmSimdEnabled() || wasmCompileMode() != "ion" || !getBuildConfiguration().x64
// Test that there are no extraneous moves or fixups for various SIMD comparison
// operations. See README-codegen.md for general information about this type of
// test case.
// Inputs (xmm0, xmm1)
for ( let [op, expected] of [
['i8x16.gt_s',
`
000000.. 48 8b ec mov %rsp, %rbp
000000.. 66 0f 64 c1 pcmpgtb %xmm1, %xmm0
000000.. 5d pop %rbp
`],
['i16x8.gt_s',
`
000000.. 48 8b ec mov %rsp, %rbp
000000.. 66 0f 65 c1 pcmpgtw %xmm1, %xmm0
000000.. 5d pop %rbp
`],
['i32x4.gt_s',
`
000000.. 48 8b ec mov %rsp, %rbp
000000.. 66 0f 66 c1 pcmpgtd %xmm1, %xmm0
000000.. 5d pop %rbp
`],
['i8x16.le_s',
`
000000.. 48 8b ec mov %rsp, %rbp
000000.. 66 0f 64 c1 pcmpgtb %xmm1, %xmm0
000000.. 66 0f ef 05 .. 00 00 00 pxorx 0x0000000000000040, %xmm0
000000.. 5d pop %rbp
`],
['i16x8.le_s',
`
000000.. 48 8b ec mov %rsp, %rbp
000000.. 66 0f 65 c1 pcmpgtw %xmm1, %xmm0
000000.. 66 0f ef 05 .. 00 00 00 pxorx 0x0000000000000040, %xmm0
000000.. 5d pop %rbp
`],
['i32x4.le_s',
`
000000.. 48 8b ec mov %rsp, %rbp
000000.. 66 0f 66 c1 pcmpgtd %xmm1, %xmm0
000000.. 66 0f ef 05 .. 00 00 00 pxorx 0x0000000000000040, %xmm0
000000.. 5d pop %rbp
`],
['i8x16.eq',
`
000000.. 48 8b ec mov %rsp, %rbp
000000.. 66 0f 74 c1 pcmpeqb %xmm1, %xmm0
000000.. 5d pop %rbp
`],
['i16x8.eq',
`
000000.. 48 8b ec mov %rsp, %rbp
000000.. 66 0f 75 c1 pcmpeqw %xmm1, %xmm0
000000.. 5d pop %rbp
`],
['i32x4.eq',
`
000000.. 48 8b ec mov %rsp, %rbp
000000.. 66 0f 76 c1 pcmpeqd %xmm1, %xmm0
000000.. 5d pop %rbp
`],
['i8x16.ne',
`
000000.. 48 8b ec mov %rsp, %rbp
000000.. 66 0f 74 c1 pcmpeqb %xmm1, %xmm0
000000.. 66 0f ef 05 .. 00 00 00 pxorx 0x0000000000000040, %xmm0
000000.. 5d pop %rbp
`],
['i16x8.ne',
`
000000.. 48 8b ec mov %rsp, %rbp
000000.. 66 0f 75 c1 pcmpeqw %xmm1, %xmm0
000000.. 66 0f ef 05 .. 00 00 00 pxorx 0x0000000000000040, %xmm0
000000.. 5d pop %rbp
`],
['i32x4.ne',
`
000000.. 48 8b ec mov %rsp, %rbp
000000.. 66 0f 76 c1 pcmpeqd %xmm1, %xmm0
000000.. 66 0f ef 05 .. 00 00 00 pxorx 0x0000000000000040, %xmm0
000000.. 5d pop %rbp
`],
['f32x4.eq',
`
000000.. 48 8b ec mov %rsp, %rbp
000000.. 0f c2 c1 00 cmpps \\$0x00, %xmm1, %xmm0
000000.. 5d pop %rbp
`],
['f32x4.ne',
`
000000.. 48 8b ec mov %rsp, %rbp
000000.. 0f c2 c1 04 cmpps \\$0x04, %xmm1, %xmm0
000000.. 5d pop %rbp
`],
['f32x4.lt',
`
000000.. 48 8b ec mov %rsp, %rbp
000000.. 0f c2 c1 01 cmpps \\$0x01, %xmm1, %xmm0
000000.. 5d pop %rbp
`],
['f32x4.le',
`
000000.. 48 8b ec mov %rsp, %rbp
000000.. 0f c2 c1 02 cmpps \\$0x02, %xmm1, %xmm0
000000.. 5d pop %rbp
`],
['f64x2.eq',
`
000000.. 48 8b ec mov %rsp, %rbp
000000.. 66 0f c2 c1 00 cmppd \\$0x00, %xmm1, %xmm0
000000.. 5d pop %rbp
`],
['f64x2.ne',
`
000000.. 48 8b ec mov %rsp, %rbp
000000.. 66 0f c2 c1 04 cmppd \\$0x04, %xmm1, %xmm0
000000.. 5d pop %rbp
`],
['f64x2.lt',
`
000000.. 48 8b ec mov %rsp, %rbp
000000.. 66 0f c2 c1 01 cmppd \\$0x01, %xmm1, %xmm0
000000.. 5d pop %rbp
`],
['f64x2.le',
`
000000.. 48 8b ec mov %rsp, %rbp
000000.. 66 0f c2 c1 02 cmppd \\$0x02, %xmm1, %xmm0
000000.. 5d pop %rbp
`],
] ) {
let ins = new WebAssembly.Instance(new WebAssembly.Module(wasmTextToBinary(`
(module
(func (export "f") (param v128) (param v128) (result v128)
(${op} (local.get 0) (local.get 1))))
`)));
let output = wasmDis(ins.exports.f, "ion", true);
if (output.indexOf('No disassembly available') >= 0)
continue;
assertEq(output.match(new RegExp(expected)) != null, true);
}
// Inputs (xmm1, xmm0) because the operation reverses its arguments.
for ( let [op, expected] of [
['i8x16.ge_s',
`
000000.. 48 8b ec mov %rsp, %rbp
000000.. 66 0f 64 c1 pcmpgtb %xmm1, %xmm0
000000.. 66 0f ef 05 .. 00 00 00 pxorx 0x0000000000000040, %xmm0
000000.. 5d pop %rbp
`],
['i16x8.ge_s',
`
000000.. 48 8b ec mov %rsp, %rbp
000000.. 66 0f 65 c1 pcmpgtw %xmm1, %xmm0
000000.. 66 0f ef 05 .. 00 00 00 pxorx 0x0000000000000040, %xmm0
000000.. 5d pop %rbp
`],
['i32x4.ge_s',
`
000000.. 48 8b ec mov %rsp, %rbp
000000.. 66 0f 66 c1 pcmpgtd %xmm1, %xmm0
000000.. 66 0f ef 05 .. 00 00 00 pxorx 0x0000000000000040, %xmm0
000000.. 5d pop %rbp
`],
['i8x16.lt_s',
`
000000.. 48 8b ec mov %rsp, %rbp
000000.. 66 0f 64 c1 pcmpgtb %xmm1, %xmm0
000000.. 5d pop %rbp
`],
['i16x8.lt_s',
`
000000.. 48 8b ec mov %rsp, %rbp
000000.. 66 0f 65 c1 pcmpgtw %xmm1, %xmm0
000000.. 5d pop %rbp
`],
['i32x4.lt_s',
`
000000.. 48 8b ec mov %rsp, %rbp
000000.. 66 0f 66 c1 pcmpgtd %xmm1, %xmm0
000000.. 5d pop %rbp
`],
['f32x4.gt',
`
000000.. 48 8b ec mov %rsp, %rbp
000000.. 0f c2 c1 01 cmpps \\$0x01, %xmm1, %xmm0
000000.. 5d pop %rbp
`],
['f32x4.ge',
`
000000.. 48 8b ec mov %rsp, %rbp
000000.. 0f c2 c1 02 cmpps \\$0x02, %xmm1, %xmm0
000000.. 5d pop %rbp
`],
['f64x2.gt',
`
000000.. 48 8b ec mov %rsp, %rbp
000000.. 66 0f c2 c1 01 cmppd \\$0x01, %xmm1, %xmm0
000000.. 5d pop %rbp
`],
['f64x2.ge',
`
000000.. 48 8b ec mov %rsp, %rbp
000000.. 66 0f c2 c1 02 cmppd \\$0x02, %xmm1, %xmm0
000000.. 5d pop %rbp
`],
] ) {
let ins = new WebAssembly.Instance(new WebAssembly.Module(wasmTextToBinary(`
(module
(func (export "f") (param v128) (param v128) (result v128)
(${op} (local.get 1) (local.get 0))))
`)));
let output = wasmDis(ins.exports.f, "ion", true);
if (output.indexOf('No disassembly available') >= 0)
continue;
assertEq(output.match(new RegExp(expected)) != null, true);
}

View File

@ -1,82 +0,0 @@
// |jit-test| skip-if: !wasmSimdEnabled() || wasmCompileMode() != "ion" || !getBuildConfiguration().x64
// Test that constants that can be synthesized are synthesized. See README-codegen.md
// for general information about this type of test case.
// Inputs (xmm0, xmm1)
for ( let [op, expected] of [
['v128.const i8x16 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0',
`
000000.. 48 8b ec mov %rsp, %rbp
000000.. 66 0f ef c0 pxor %xmm0, %xmm0
000000.. 5d pop %rbp
`],
['v128.const i8x16 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1',
`
000000.. 48 8b ec mov %rsp, %rbp
000000.. 66 0f 75 c0 pcmpeqw %xmm0, %xmm0
000000.. 5d pop %rbp
`],
['v128.const i16x8 0 0 0 0 0 0 0 0',
`
000000.. 48 8b ec mov %rsp, %rbp
000000.. 66 0f ef c0 pxor %xmm0, %xmm0
000000.. 5d pop %rbp
`],
['v128.const i16x8 -1 -1 -1 -1 -1 -1 -1 -1',
`
000000.. 48 8b ec mov %rsp, %rbp
000000.. 66 0f 75 c0 pcmpeqw %xmm0, %xmm0
000000.. 5d pop %rbp
`],
['v128.const i32x4 0 0 0 0',
`
000000.. 48 8b ec mov %rsp, %rbp
000000.. 66 0f ef c0 pxor %xmm0, %xmm0
000000.. 5d pop %rbp
`],
['v128.const i32x4 -1 -1 -1 -1',
`
000000.. 48 8b ec mov %rsp, %rbp
000000.. 66 0f 75 c0 pcmpeqw %xmm0, %xmm0
000000.. 5d pop %rbp
`],
['v128.const i64x2 0 0',
`
000000.. 48 8b ec mov %rsp, %rbp
000000.. 66 0f ef c0 pxor %xmm0, %xmm0
000000.. 5d pop %rbp
`],
['v128.const i64x2 -1 -1',
`
000000.. 48 8b ec mov %rsp, %rbp
000000.. 66 0f 75 c0 pcmpeqw %xmm0, %xmm0
000000.. 5d pop %rbp
`],
['v128.const f32x4 0 0 0 0',
// Arguably this should be xorps but that's for later
`
000000.. 48 8b ec mov %rsp, %rbp
000000.. 66 0f ef c0 pxor %xmm0, %xmm0
000000.. 5d pop %rbp
`],
['v128.const f64x2 0 0',
// Arguably this should be xorpd but that's for later
`
000000.. 48 8b ec mov %rsp, %rbp
000000.. 66 0f ef c0 pxor %xmm0, %xmm0
000000.. 5d pop %rbp
`],
] ) {
let ins = new WebAssembly.Instance(new WebAssembly.Module(wasmTextToBinary(`
(module
(func (export "f") (result v128)
(${op})))
`)));
let output = wasmDis(ins.exports.f, "ion", true);
if (output.indexOf('No disassembly available') >= 0)
continue;
assertEq(output.match(new RegExp(expected)) != null, true);
}

View File

@ -1,42 +0,0 @@
// |jit-test| skip-if: !wasmSimdEnabled() || wasmCompileMode() != "ion" || !getBuildConfiguration().x64
// Test that there are no extraneous moves for various SIMD conversion
// operations. See README-codegen.md for general information about this type of
// test case.
// Inputs (xmm0, xmm1)
for ( let [op, expected] of [
['i32x4.trunc_sat_f32x4_s',
// The movaps is dest -> scratch and needs to be here. The test is
// asserting that there is not an additional (redundant) move here.
`
000000.. 48 8b ec mov %rsp, %rbp
000000.. 44 0f 28 f8 movaps %xmm0, %xmm15
000000.. 45 0f c2 ff 00 cmpps \\$0x00, %xmm15, %xmm15
000000.. 66 41 0f db c7 pand %xmm15, %xmm0
`],
['i32x4.trunc_sat_f32x4_u', `
000000.. 48 8b ec mov %rsp, %rbp
000000.. 66 45 0f ef ff pxor %xmm15, %xmm15
000000.. 41 0f 5f c7 maxps %xmm15, %xmm0
`],
['f32x4.convert_i32x4_u', `
00000023 48 8b ec mov %rsp, %rbp
00000026 66 45 0f ef ff pxor %xmm15, %xmm15
0000002B 66 44 0f 3a 0e f8 55 pblendw \\$0x55, %xmm0, %xmm15
00000032 66 41 0f fa c7 psubd %xmm15, %xmm0
00000037 45 0f 5b ff cvtdq2ps %xmm15, %xmm15
`],
] ) {
let ins = new WebAssembly.Instance(new WebAssembly.Module(wasmTextToBinary(`
(module
(func (export "f") (param v128) (result v128)
(${op} (local.get 0))))`)));
let output = wasmDis(ins.exports.f, "ion", true);
if (output.indexOf('No disassembly available') >= 0)
continue;
assertEq(output.match(new RegExp(expected)) != null, true);
}

View File

@ -1,92 +0,0 @@
// |jit-test| skip-if: !wasmSimdEnabled() || wasmCompileMode() != "ion" || !getBuildConfiguration().x64
// Test that there are no extraneous moves for variable SIMD negate
// instructions. See README-codegen.md for general information about this type
// of test case.
// Integer negates don't have to reuse the input for the output, and prefer
// the registers to be different.
// Inputs (xmm1, xmm0)
for ( let [ op, expected ] of [
['i8x16.neg', `
000000.. 48 8b ec mov %rsp, %rbp
000000.. 66 0f ef c0 pxor %xmm0, %xmm0
000000.. 66 0f f8 c1 psubb %xmm1, %xmm0
000000.. 5d pop %rbp
`],
['i16x8.neg', `
000000.. 48 8b ec mov %rsp, %rbp
000000.. 66 0f ef c0 pxor %xmm0, %xmm0
000000.. 66 0f f9 c1 psubw %xmm1, %xmm0
000000.. 5d pop %rbp
`],
['i32x4.neg', `
000000.. 48 8b ec mov %rsp, %rbp
000000.. 66 0f ef c0 pxor %xmm0, %xmm0
000000.. 66 0f fa c1 psubd %xmm1, %xmm0
000000.. 5d pop %rbp
`],
['i64x2.neg', `
000000.. 48 8b ec mov %rsp, %rbp
000000.. 66 0f ef c0 pxor %xmm0, %xmm0
000000.. 66 0f fb c1 psubq %xmm1, %xmm0
000000.. 5d pop %rbp
`],
] ) {
let ins = new WebAssembly.Instance(new WebAssembly.Module(wasmTextToBinary(`
(module
(func (export "f") (param v128) (param v128) (result v128)
(${op} (local.get 1))))
`)));
let output = wasmDis(ins.exports.f, "ion", true);
if (output.indexOf('No disassembly available') >= 0)
continue;
assertEq(output.match(new RegExp(expected)) != null, true);
}
// Floating point negate and absolute value, and bitwise not, prefer the
// registers to be the same and guarantee that no move is inserted in that case.
// Inputs (xmm0, xmm1)
for ( let [ op, expected ] of [
['f32x4.neg', `
000000.. 48 8b ec mov %rsp, %rbp
000000.. 66 0f ef 05 .. 00 00 00 pxorx 0x00000000000000.., %xmm0
000000.. 5d pop %rbp
`],
['f64x2.neg', `
000000.. 48 8b ec mov %rsp, %rbp
000000.. 66 0f ef 05 .. 00 00 00 pxorx 0x00000000000000.., %xmm0
000000.. 5d pop %rbp
`],
['f32x4.abs', `
000000.. 48 8b ec mov %rsp, %rbp
000000.. 66 0f db 05 .. 00 00 00 pandx 0x00000000000000.., %xmm0
000000.. 5d pop %rbp
`],
['f64x2.abs', `
000000.. 48 8b ec mov %rsp, %rbp
000000.. 66 0f db 05 .. 00 00 00 pandx 0x00000000000000.., %xmm0
000000.. 5d pop %rbp
`],
['v128.not', `
000000.. 48 8b ec mov %rsp, %rbp
000000.. 66 0f ef 05 .. 00 00 00 pxorx 0x00000000000000.., %xmm0
000000.. 5d pop %rbp
`],
] ) {
var ins = new WebAssembly.Instance(new WebAssembly.Module(wasmTextToBinary(`
(module
(func (export "f") (param v128) (result v128)
(${op} (local.get 0))))
`)));
let output = wasmDis(ins.exports.f, "ion", true);
if (output.indexOf('No disassembly available') >= 0)
continue;
assertEq(output.match(new RegExp(expected)) != null, true);
}

View File

@ -1,76 +0,0 @@
// |jit-test| skip-if: !wasmSimdEnabled() || wasmCompileMode() != "ion" || !getBuildConfiguration().x64
// Test that there are no extraneous moves for a constant integer SIMD shift
// that can reuse its input for its output. See README-codegen.md for general
// information about this type of test case.
//
// There are test cases here for all codegen cases that include a potential move
// to set up the operation, but not for all shift operations in general.
// Inputs (xmm0, xmm1)
for ( let [op, expected] of [
['i8x16.shl', `
000000.. 48 8b ec mov %rsp, %rbp
000000.. 66 0f fc c0 paddb %xmm0, %xmm0
000000.. 66 0f fc c0 paddb %xmm0, %xmm0
000000.. 5d pop %rbp
`],
['i16x8.shl', `
000000.. 48 8b ec mov %rsp, %rbp
000000.. 66 0f 71 f0 02 psllw \\$0x02, %xmm0
000000.. 5d pop %rbp
`],
['i32x4.shl', `
000000.. 48 8b ec mov %rsp, %rbp
000000.. 66 0f 72 f0 02 pslld \\$0x02, %xmm0
000000.. 5d pop %rbp
`],
['i64x2.shl', `
000000.. 48 8b ec mov %rsp, %rbp
000000.. 66 0f 73 f0 02 psllq \\$0x02, %xmm0
000000.. 5d pop %rbp
`],
['i8x16.shr_u', `
000000.. 48 8b ec mov %rsp, %rbp
000000.. 66 0f db 05 .. 00 00 00 pandx 0x00000000000000.., %xmm0
000000.. 66 0f 71 d0 02 psrlw \\$0x02, %xmm0
000000.. 5d pop %rbp
`],
['i16x8.shr_s', `
000000.. 48 8b ec mov %rsp, %rbp
000000.. 66 0f 71 e0 02 psraw \\$0x02, %xmm0
000000.. 5d pop %rbp
`],
['i16x8.shr_u', `
000000.. 48 8b ec mov %rsp, %rbp
000000.. 66 0f 71 d0 02 psrlw \\$0x02, %xmm0
000000.. 5d pop %rbp
`],
['i32x4.shr_s', `
000000.. 48 8b ec mov %rsp, %rbp
000000.. 66 0f 72 e0 02 psrad \\$0x02, %xmm0
000000.. 5d pop %rbp
`],
['i32x4.shr_u', `
000000.. 48 8b ec mov %rsp, %rbp
000000.. 66 0f 72 d0 02 psrld \\$0x02, %xmm0
000000.. 5d pop %rbp
`],
['i64x2.shr_u', `
000000.. 48 8b ec mov %rsp, %rbp
000000.. 66 0f 73 d0 02 psrlq \\$0x02, %xmm0
000000.. 5d pop %rbp
`],
] ) {
let ins = new WebAssembly.Instance(new WebAssembly.Module(wasmTextToBinary(`
(module
(func (export "f") (param v128) (result v128)
(${op} (local.get 0) (i32.const 2))))`)));
let output = wasmDis(ins.exports.f, "ion", true);
if (output.indexOf('No disassembly available') >= 0)
continue;
assertEq(output.match(new RegExp(expected)) != null, true);
}

View File

@ -1,133 +0,0 @@
// |jit-test| skip-if: !wasmSimdEnabled() || wasmCompileMode() != "ion" || !getBuildConfiguration().x64
// Test that there are no extraneous moves or fixups for SIMD shuffle
// operations. See README-codegen.md for general information about this type of
// test case.
// Inputs (xmm0, xmm1)
for ( let [op, expected] of [
['i8x16.shuffle 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15',
// Identity op on first argument should generate no code
`
000000.. 48 8b ec mov %rsp, %rbp
000000.. 5d pop %rbp
`],
['i8x16.shuffle 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31',
// Identity op on second argument should generate a move
`
000000.. 48 8b ec mov %rsp, %rbp
000000.. 66 0f 6f c1 movdqa %xmm1, %xmm0
000000.. 5d pop %rbp
`],
['i8x16.shuffle 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5',
// Broadcast a byte from first argument
`
000000.. 48 8b ec mov %rsp, %rbp
000000.. 66 0f 60 c0 punpcklbw %xmm0, %xmm0
000000.. f3 0f 70 c0 55 pshufhw \\$0x55, %xmm0, %xmm0
000000.. 66 0f 70 c0 aa pshufd \\$0xAA, %xmm0, %xmm0
000000.. 5d pop %rbp
`],
['i8x16.shuffle 4 5 4 5 4 5 4 5 4 5 4 5 4 5 4 5',
// Broadcast a word from first argument
`
000000.. 48 8b ec mov %rsp, %rbp
000000.. f2 0f 70 c0 aa pshuflw \\$0xAA, %xmm0, %xmm0
000000.. 66 0f 70 c0 00 pshufd \\$0x00, %xmm0, %xmm0
000000.. 5d pop %rbp
`],
['i8x16.shuffle 2 1 4 3 6 5 8 7 10 9 12 11 14 13 0 15',
// Permute bytes
`
000000.. 48 8b ec mov %rsp, %rbp
000000.. 66 44 0f 6f 3d .. 00 00 00
movdqax 0x0000000000000040, %xmm15
000000.. 66 41 0f 38 00 c7 pshufb %xmm15, %xmm0
000000.. 5d pop %rbp
`],
['i8x16.shuffle 2 3 0 1 6 7 4 5 10 11 8 9 14 15 12 13',
// Permute words
`
000000.. 48 8b ec mov %rsp, %rbp
000000.. f2 0f 70 c0 b1 pshuflw \\$0xB1, %xmm0, %xmm0
000000.. f3 0f 70 c0 b1 pshufhw \\$0xB1, %xmm0, %xmm0
000000.. 5d pop %rbp
`],
['i8x16.shuffle 4 5 6 7 0 1 2 3 12 13 14 15 8 9 10 11',
// Permute doublewords
`
000000.. 48 8b ec mov %rsp, %rbp
000000.. 66 0f 70 c0 b1 pshufd \\$0xB1, %xmm0, %xmm0
000000.. 5d pop %rbp
`],
['i8x16.shuffle 13 14 15 0 1 2 3 4 5 6 7 8 9 10 11 12',
// Rotate right
`
000000.. 48 8b ec mov %rsp, %rbp
000000.. 66 0f 3a 0f c0 0d palignr \\$0x0D, %xmm0, %xmm0
000000.. 5d pop %rbp
`],
['i8x16.shuffle 15 29 0 1 2 1 2 0 3 4 7 8 16 8 17 9',
// General shuffle + blend. The initial movdqa to scratch is
// unavoidable unless we can convince the compiler that it's OK to destroy xmm1.
`
000000.. 48 8b ec mov %rsp, %rbp
000000.. 66 44 0f 6f f9 movdqa %xmm1, %xmm15
000000.. 66 44 0f 38 00 3d .. 00 00 00
pshufbx 0x0000000000000050, %xmm15
000000.. 66 0f 38 00 05 .. 00 00 00
pshufbx 0x0000000000000060, %xmm0
000000.. 66 41 0f eb c7 por %xmm15, %xmm0
000000.. 5d pop %rbp
`],
] ) {
let ins = wasmEvalText(`
(module
(func (export "f") (param v128) (param v128) (result v128)
(${op} (local.get 0) (local.get 1))))
`);
let output = wasmDis(ins.exports.f, "ion", true);
if (output.indexOf('No disassembly available') >= 0)
continue;
assertEq(output.match(new RegExp(expected)) != null, true);
}
// Inputs (xmm0, zero)
for ( let [op, expected] of [
['i8x16.shuffle 16 16 16 0 1 2 3 4 5 6 7 8 9 10 11 12',
// Shift left bytes, shifting in zeroes
//
// Remember the low-order bytes are at the "right" end
//
// The pxor is a code generation bug: the operand is unused, and no
// code should need to be generated for it, and no register should
// be allocated to it. The lowering does not use that operand, but
// code generation still touches it.
`
000000.. 48 8b ec mov %rsp, %rbp
000000.. 66 0f ef c9 pxor %xmm1, %xmm1
000000.. 66 0f 73 f8 03 pslldq \\$0x03, %xmm0
000000.. 5d pop %rbp
`],
['i8x16.shuffle 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18',
// Shift right bytes, shifting in zeroes. See above.
`
000000.. 48 8b ec mov %rsp, %rbp
000000.. 66 0f ef c9 pxor %xmm1, %xmm1
000000.. 66 0f 73 d8 03 psrldq \\$0x03, %xmm0
000000.. 5d pop %rbp
`],
] ) {
let ins = wasmEvalText(`
(module
(func (export "f") (param v128) (result v128)
(${op} (local.get 0) (v128.const i32x4 0 0 0 0))))
`);
let output = wasmDis(ins.exports.f, "ion", true);
if (output.indexOf('No disassembly available') >= 0)
continue;
assertEq(output.match(new RegExp(expected)) != null, true);
}

View File

@ -1,90 +0,0 @@
// |jit-test| skip-if: !wasmSimdEnabled() || wasmCompileMode() != "ion" || !getBuildConfiguration().x64
// Test that there are no extraneous moves or other instructions for splat and
// other splat-like operations that can reuse their input for their output and/or
// have a specializable code path. See README-codegen.md for general information
// about this type of test case.
// Input (xmm0)
for ( let [ simd_type, expected ] of [
['f32x4', `
000000.. 48 8b ec mov %rsp, %rbp
000000.. 0f c6 c0 00 shufps \\$0x00, %xmm0, %xmm0
000000.. 5d pop %rbp
`],
['f64x2', `
000000.. 48 8b ec mov %rsp, %rbp
000000.. 66 0f c6 c0 00 shufpd \\$0x00, %xmm0, %xmm0
000000.. 5d pop %rbp
`],
] ) {
let type = simd_type.substring(0,3);
let ins = new WebAssembly.Instance(new WebAssembly.Module(wasmTextToBinary(`
(module
(func (export "f") (param ${type}) (result v128)
(${simd_type}.splat (local.get 0))))`)));
let output = wasmDis(ins.exports.f, "ion", true);
if (output.indexOf('No disassembly available') >= 0)
continue;
assertEq(output.match(new RegExp(expected)) != null, true);
}
// Input (paramreg0)
for ( let [ op, expected ] of [
['v128.load32_splat', `
000000.. 48 8b ec mov %rsp, %rbp
000000.. f3 41 0f 10 04 3f movssl \\(%r15,%rdi,1\\), %xmm0
000000.. 0f c6 c0 00 shufps \\$0x00, %xmm0, %xmm0
000000.. 5d pop %rbp
`],
['v128.load64_splat', `
000000.. 48 8b ec mov %rsp, %rbp
000000.. f2 41 0f 12 04 3f movddupq \\(%r15,%rdi,1\\), %xmm0
000000.. 5d pop %rbp
`],
['v128.load8x8_s', `
000000.. 48 8b ec mov %rsp, %rbp
000000.. 66 41 0f 38 20 04 3f pmovsxbwq \\(%r15,%rdi,1\\), %xmm0
000000.. 5d pop %rbp
`],
['v128.load8x8_u', `
000000.. 48 8b ec mov %rsp, %rbp
000000.. 66 41 0f 38 30 04 3f pmovzxbwq \\(%r15,%rdi,1\\), %xmm0
000000.. 5d pop %rbp
`],
['v128.load16x4_s', `
000000.. 48 8b ec mov %rsp, %rbp
000000.. 66 41 0f 38 23 04 3f pmovsxwdq \\(%r15,%rdi,1\\), %xmm0
000000.. 5d pop %rbp
`],
['v128.load16x4_u', `
000000.. 48 8b ec mov %rsp, %rbp
000000.. 66 41 0f 38 33 04 3f pmovzxwdq \\(%r15,%rdi,1\\), %xmm0
000000.. 5d pop %rbp
`],
['v128.load32x2_s', `
000000.. 48 8b ec mov %rsp, %rbp
000000.. 66 41 0f 38 25 04 3f pmovsxdqq \\(%r15,%rdi,1\\), %xmm0
000000.. 5d pop %rbp
`],
['v128.load32x2_u', `
000000.. 48 8b ec mov %rsp, %rbp
000000.. 66 41 0f 38 35 04 3f pmovzxdqq \\(%r15,%rdi,1\\), %xmm0
000000.. 5d pop %rbp
`],
] ) {
let ins = new WebAssembly.Instance(new WebAssembly.Module(wasmTextToBinary(`
(module
(memory 1)
(func (export "f") (param i32) (result v128)
(${op} (local.get 0))))`)));
let output = wasmDis(ins.exports.f, "ion", true);
if (output.indexOf('No disassembly available') >= 0)
continue;
assertEq(output.match(new RegExp(expected)) != null, true);
}

View File

@ -552,16 +552,6 @@ class SimdConstant {
return type_;
}
bool isFloatingType() const {
MOZ_ASSERT(defined());
return type_ >= Float32x4;
}
bool isIntegerType() const {
MOZ_ASSERT(defined());
return type_ <= Int64x2;
}
// Get the raw bytes of the constant.
const void* bytes() const { return u.i8x16; }
@ -595,32 +585,28 @@ class SimdConstant {
return u.f64x2;
}
bool bitwiseEqual(const SimdConstant& rhs) const {
bool operator==(const SimdConstant& rhs) const {
MOZ_ASSERT(defined() && rhs.defined());
if (type() != rhs.type()) {
return false;
}
// Takes negative zero into account, as it's a bit comparison.
return memcmp(&u, &rhs.u, sizeof(u)) == 0;
}
bool operator!=(const SimdConstant& rhs) const { return !operator==(rhs); }
bool isZeroBits() const {
MOZ_ASSERT(defined());
return u.i64x2[0] == 0 && u.i64x2[1] == 0;
bool isIntegerZero() const {
return type_ <= Int64x2 && u.i64x2[0] == 0 && u.i64x2[1] == 0;
}
bool isOneBits() const {
MOZ_ASSERT(defined());
return ~u.i64x2[0] == 0 && ~u.i64x2[1] == 0;
}
// SimdConstant is a HashPolicy. Currently we discriminate by type, but it
// may be that we should only be discriminating by int vs float.
// SimdConstant is a HashPolicy
using Lookup = SimdConstant;
static HashNumber hash(const SimdConstant& val) {
uint32_t hash = mozilla::HashBytes(&val.u, sizeof(val.u));
return mozilla::AddToHash(hash, val.type_);
}
static bool match(const SimdConstant& lhs, const SimdConstant& rhs) {
return lhs.type() == rhs.type() && lhs.bitwiseEqual(rhs);
return lhs == rhs;
}
};

View File

@ -5216,7 +5216,7 @@ MDefinition* MWasmReduceSimd128::foldsTo(TempAllocator& alloc) {
case wasm::SimdOp::I8x16AnyTrue:
case wasm::SimdOp::I16x8AnyTrue:
case wasm::SimdOp::I32x4AnyTrue:
i32Result = !c.isZeroBits();
i32Result = !c.isIntegerZero();
break;
case wasm::SimdOp::I8x16AllTrue:
i32Result = AllTrue(

View File

@ -14337,7 +14337,7 @@ class MWasmShuffleSimd128 : public MBinaryInstruction,
AliasSet getAliasSet() const override { return AliasSet::None(); }
bool congruentTo(const MDefinition* ins) const override {
return ins->toWasmShuffleSimd128()->control().bitwiseEqual(control_) &&
return ins->toWasmShuffleSimd128()->control() == control_ &&
congruentIfOperandsEqual(ins);
}

View File

@ -2051,8 +2051,11 @@ class MacroAssembler : public MacroAssemblerSpecific {
// lane values 0..31
inline void shuffleInt8x16(const uint8_t lanes[16], FloatRegister rhs,
FloatRegister lhsDest)
DEFINED_ON(x86_shared, arm64);
FloatRegister lhsDest, FloatRegister temp)
DEFINED_ON(x86_shared);
inline void shuffleInt8x16(const uint8_t lanes[16], FloatRegister rhs,
FloatRegister lhsDest) DEFINED_ON(arm64);
// lane values 0 (select from lhs) or FF (select from rhs).
inline void blendInt8x16(const uint8_t lanes[16], FloatRegister rhs,
@ -2388,9 +2391,6 @@ class MacroAssembler : public MacroAssemblerSpecific {
inline void bitwiseXorSimd128(FloatRegister rhs, FloatRegister lhsDest)
DEFINED_ON(x86_shared, arm64);
inline void bitwiseXorSimd128(const SimdConstant& rhs, FloatRegister lhsDest)
DEFINED_ON(x64, x86);
inline void bitwiseNotSimd128(FloatRegister src, FloatRegister dest)
DEFINED_ON(x86_shared, arm64);
@ -2655,27 +2655,17 @@ class MacroAssembler : public MacroAssemblerSpecific {
DEFINED_ON(x86_shared, arm64);
// Compare-based minimum/maximum
//
// On x86, the signature is (rhsDest, lhs); on arm64 it is (rhs, lhsDest).
//
// The masm preprocessor can't deal with multiple declarations with identical
// signatures even if they are on different platforms, hence the weird
// argument names.
inline void pseudoMinFloat32x4(FloatRegister rhsOrRhsDest,
FloatRegister lhsOrLhsDest)
inline void pseudoMinFloat32x4(FloatRegister rhs, FloatRegister lhsDest)
DEFINED_ON(x86_shared, arm64);
inline void pseudoMinFloat64x2(FloatRegister rhsOrRhsDest,
FloatRegister lhsOrLhsDest)
inline void pseudoMinFloat64x2(FloatRegister rhs, FloatRegister lhsDest)
DEFINED_ON(x86_shared, arm64);
inline void pseudoMaxFloat32x4(FloatRegister rhsOrRhsDest,
FloatRegister lhsOrLhsDest)
inline void pseudoMaxFloat32x4(FloatRegister rhs, FloatRegister lhsDest)
DEFINED_ON(x86_shared, arm64);
inline void pseudoMaxFloat64x2(FloatRegister rhsOrRhsDest,
FloatRegister lhsOrLhsDest)
inline void pseudoMaxFloat64x2(FloatRegister rhs, FloatRegister lhsDest)
DEFINED_ON(x86_shared, arm64);
// Widening/pairwise integer dot product

View File

@ -6008,9 +6008,6 @@ void MacroAssemblerARM::wasmLoadImpl(const wasm::MemoryAccessDesc& access,
Register ptrScratch, AnyRegister output,
Register64 out64) {
MOZ_ASSERT(ptr == ptrScratch);
MOZ_ASSERT(!access.isZeroExtendSimd128Load());
MOZ_ASSERT(!access.isSplatSimd128Load());
MOZ_ASSERT(!access.isWidenSimd128Load());
uint32_t offset = access.offset();
MOZ_ASSERT(offset < wasm::MaxOffsetGuardLimit);
@ -6057,6 +6054,7 @@ void MacroAssemblerARM::wasmLoadImpl(const wasm::MemoryAccessDesc& access,
}
}
} else {
MOZ_ASSERT(!access.isZeroExtendSimd128Load());
bool isFloat = output.isFloat();
if (isFloat) {
MOZ_ASSERT((byteSize == 4) == output.fpu().isSingle());
@ -6153,9 +6151,6 @@ void MacroAssemblerARM::wasmUnalignedLoadImpl(
MOZ_ASSERT(ptr == ptrScratch);
MOZ_ASSERT(tmp != ptr);
MOZ_ASSERT(!Assembler::SupportsFastUnalignedAccesses());
MOZ_ASSERT(!access.isZeroExtendSimd128Load());
MOZ_ASSERT(!access.isSplatSimd128Load());
MOZ_ASSERT(!access.isWidenSimd128Load());
uint32_t offset = access.offset();
MOZ_ASSERT(offset < wasm::MaxOffsetGuardLimit);

View File

@ -2663,42 +2663,32 @@ void MacroAssembler::unsignedWidenLowInt32x4(FloatRegister src,
// Compare-based minimum/maximum (experimental as of August, 2020)
// https://github.com/WebAssembly/simd/pull/122
void MacroAssembler::pseudoMinFloat32x4(FloatRegister rhsOrRhsDest,
FloatRegister lhsOrLhsDest) {
// Shut up the linter by using the same names as in the declaration, then
// aliasing here.
FloatRegister rhs = rhsOrRhsDest;
FloatRegister lhsDest = lhsOrLhsDest;
void MacroAssembler::pseudoMinFloat32x4(FloatRegister rhs,
FloatRegister lhsDest) {
ScratchSimd128Scope scratch(*this);
Fcmgt(Simd4S(scratch), Simd4S(lhsDest), Simd4S(rhs));
Bsl(Simd16B(scratch), Simd16B(rhs), Simd16B(lhsDest));
Mov(SimdReg(lhsDest), scratch);
}
void MacroAssembler::pseudoMinFloat64x2(FloatRegister rhsOrRhsDest,
FloatRegister lhsOrLhsDest) {
FloatRegister rhs = rhsOrRhsDest;
FloatRegister lhsDest = lhsOrLhsDest;
void MacroAssembler::pseudoMinFloat64x2(FloatRegister rhs,
FloatRegister lhsDest) {
ScratchSimd128Scope scratch(*this);
Fcmgt(Simd2D(scratch), Simd2D(lhsDest), Simd2D(rhs));
Bsl(Simd16B(scratch), Simd16B(rhs), Simd16B(lhsDest));
Mov(SimdReg(lhsDest), scratch);
}
void MacroAssembler::pseudoMaxFloat32x4(FloatRegister rhsOrRhsDest,
FloatRegister lhsOrLhsDest) {
FloatRegister rhs = rhsOrRhsDest;
FloatRegister lhsDest = lhsOrLhsDest;
void MacroAssembler::pseudoMaxFloat32x4(FloatRegister rhs,
FloatRegister lhsDest) {
ScratchSimd128Scope scratch(*this);
Fcmgt(Simd4S(scratch), Simd4S(rhs), Simd4S(lhsDest));
Bsl(Simd16B(scratch), Simd16B(rhs), Simd16B(lhsDest));
Mov(SimdReg(lhsDest), scratch);
}
void MacroAssembler::pseudoMaxFloat64x2(FloatRegister rhsOrRhsDest,
FloatRegister lhsOrLhsDest) {
FloatRegister rhs = rhsOrRhsDest;
FloatRegister lhsDest = lhsOrLhsDest;
void MacroAssembler::pseudoMaxFloat64x2(FloatRegister rhs,
FloatRegister lhsDest) {
ScratchSimd128Scope scratch(*this);
Fcmgt(Simd2D(scratch), Simd2D(rhs), Simd2D(lhsDest));
Bsl(Simd16B(scratch), Simd16B(rhs), Simd16B(lhsDest));

View File

@ -337,10 +337,6 @@ void MacroAssemblerCompat::wasmLoadImpl(const wasm::MemoryAccessDesc& access,
uint32_t offset = access.offset();
MOZ_ASSERT(offset < wasm::MaxOffsetGuardLimit);
// Not yet supported: not used by baseline compiler
MOZ_ASSERT(!access.isSplatSimd128Load());
MOZ_ASSERT(!access.isWidenSimd128Load());
MOZ_ASSERT(ptr_ == ptrScratch_);
ARMRegister memoryBase(memoryBase_, 64);
@ -387,11 +383,11 @@ void MacroAssemblerCompat::wasmLoadImpl(const wasm::MemoryAccessDesc& access,
Ldr(SelectGPReg(outany, out64), srcAddr);
break;
case Scalar::Float32:
// LDR does the right thing also for access.isZeroExtendSimd128Load()
// LDR does the right thing also for access.isZeroExtendSimdLoad()
Ldr(SelectFPReg(outany, out64, 32), srcAddr);
break;
case Scalar::Float64:
// LDR does the right thing also for access.isZeroExtendSimd128Load()
// LDR does the right thing also for access.isZeroExtendSimdLoad()
Ldr(SelectFPReg(outany, out64, 64), srcAddr);
break;
case Scalar::Simd128:

View File

@ -2108,9 +2108,6 @@ void MacroAssemblerMIPSShared::wasmLoadImpl(
bool isSigned;
bool isFloat = false;
MOZ_ASSERT(!access.isZeroExtendSimd128Load());
MOZ_ASSERT(!access.isSplatSimd128Load());
MOZ_ASSERT(!access.isWidenSimd128Load());
switch (access.type()) {
case Scalar::Int8:
isSigned = true;
@ -2131,9 +2128,11 @@ void MacroAssemblerMIPSShared::wasmLoadImpl(
isSigned = false;
break;
case Scalar::Float64:
MOZ_ASSERT(!access.isZeroExtendSimd128Load());
isFloat = true;
break;
case Scalar::Float32:
MOZ_ASSERT(!access.isZeroExtendSimd128Load());
isFloat = true;
break;
default:

View File

@ -2380,10 +2380,6 @@ void MacroAssemblerMIPSCompat::wasmLoadI64Impl(
uint32_t offset = access.offset();
MOZ_ASSERT_IF(offset, ptrScratch != InvalidReg);
MOZ_ASSERT(!access.isZeroExtendSimd128Load());
MOZ_ASSERT(!access.isSplatSimd128Load());
MOZ_ASSERT(!access.isWidenSimd128Load());
// Maybe add the offset.
if (offset) {
asMasm().movePtr(ptr, ptrScratch);

View File

@ -2281,10 +2281,6 @@ void MacroAssemblerMIPS64Compat::wasmLoadI64Impl(
MOZ_ASSERT(offset < wasm::MaxOffsetGuardLimit);
MOZ_ASSERT_IF(offset, ptrScratch != InvalidReg);
MOZ_ASSERT(!access.isZeroExtendSimd128Load());
MOZ_ASSERT(!access.isSplatSimd128Load());
MOZ_ASSERT(!access.isWidenSimd128Load());
// Maybe add the offset.
if (offset) {
asMasm().addPtr(Imm32(offset), ptrScratch);

View File

@ -504,8 +504,7 @@ class MemoryAccessDesc {
Scalar::Type type_;
jit::Synchronization sync_;
wasm::BytecodeOffset trapOffset_;
wasm::SimdOp widenOp_;
enum { Plain, ZeroExtend, Splat, Widen } loadOp_;
bool zeroExtendSimd128Load_;
public:
explicit MemoryAccessDesc(
@ -517,8 +516,7 @@ class MemoryAccessDesc {
type_(type),
sync_(sync),
trapOffset_(trapOffset),
widenOp_(wasm::SimdOp::Limit),
loadOp_(Plain) {
zeroExtendSimd128Load_(false) {
MOZ_ASSERT(mozilla::IsPowerOfTwo(align));
}
@ -528,35 +526,13 @@ class MemoryAccessDesc {
unsigned byteSize() const { return Scalar::byteSize(type()); }
const jit::Synchronization& sync() const { return sync_; }
BytecodeOffset trapOffset() const { return trapOffset_; }
wasm::SimdOp widenSimdOp() const {
MOZ_ASSERT(isWidenSimd128Load());
return widenOp_;
}
bool isAtomic() const { return !sync_.isNone(); }
bool isZeroExtendSimd128Load() const { return loadOp_ == ZeroExtend; }
bool isSplatSimd128Load() const { return loadOp_ == Splat; }
bool isWidenSimd128Load() const { return loadOp_ == Widen; }
bool isZeroExtendSimd128Load() const { return zeroExtendSimd128Load_; }
void setZeroExtendSimd128Load() {
MOZ_ASSERT(type() == Scalar::Float32 || type() == Scalar::Float64);
MOZ_ASSERT(!isAtomic());
MOZ_ASSERT(loadOp_ == Plain);
loadOp_ = ZeroExtend;
}
void setSplatSimd128Load() {
MOZ_ASSERT(type() == Scalar::Float64);
MOZ_ASSERT(!isAtomic());
MOZ_ASSERT(loadOp_ == Plain);
loadOp_ = Splat;
}
void setWidenSimd128Load(wasm::SimdOp op) {
MOZ_ASSERT(type() == Scalar::Float64);
MOZ_ASSERT(!isAtomic());
MOZ_ASSERT(loadOp_ == Plain);
widenOp_ = op;
loadOp_ = Widen;
zeroExtendSimd128Load_ = true;
}
void clearOffset() { offset_ = 0; }

View File

@ -911,15 +911,6 @@ class BaseAssemblerX64 : public BaseAssembler {
dst);
}
MOZ_MUST_USE JmpSrc vpxor_ripr(XMMRegisterID dst) {
return twoByteRipOpSimd("vpxor", VEX_PD, OP2_PXORDQ_VdqWdq, invalid_xmm,
dst);
}
MOZ_MUST_USE JmpSrc vpshufb_ripr(XMMRegisterID dst) {
return threeByteRipOpSimd("vpshufb", VEX_PD, OP3_PSHUFB_VdqWdq, ESCAPE_38,
invalid_xmm, dst);
}
private:
MOZ_MUST_USE JmpSrc twoByteRipOpSimd(const char* name, VexOperandType ty,
TwoByteOpcodeID opcode,
@ -1015,20 +1006,6 @@ class BaseAssemblerX64 : public BaseAssembler {
(XMMRegisterID)dst);
}
MOZ_MUST_USE JmpSrc threeByteRipOpSimd(const char* name, VexOperandType ty,
ThreeByteOpcodeID opcode,
ThreeByteEscape escape,
XMMRegisterID src0,
XMMRegisterID dst) {
MOZ_ASSERT(useLegacySSEEncoding(src0, dst));
m_formatter.legacySSEPrefix(ty);
m_formatter.threeByteRipOp(opcode, escape, 0, dst);
JmpSrc label(m_formatter.size());
spew("%-11s" MEM_o32r ", %s", legacySSEOpName(name),
ADDR_o32r(label.offset()), XMMRegName(dst));
return label;
}
void threeByteOpImmSimdInt64(const char* name, VexOperandType ty,
ThreeByteOpcodeID opcode, ThreeByteEscape escape,
uint32_t imm, XMMRegisterID src1,

View File

@ -793,11 +793,6 @@ void MacroAssembler::bitwiseAndSimd128(const SimdConstant& rhs,
vpandSimd128(rhs, lhsDest);
}
void MacroAssembler::bitwiseXorSimd128(const SimdConstant& rhs,
FloatRegister lhsDest) {
vpxorSimd128(rhs, lhsDest);
}
// ========================================================================
// Truncate floating point.

View File

@ -78,32 +78,12 @@ void MacroAssemblerX64::loadConstantSimd128Float(const SimdConstant& v,
}
void MacroAssemblerX64::vpandSimd128(const SimdConstant& v,
FloatRegister srcDest) {
FloatRegister dest) {
SimdData* val = getSimdData(v);
if (!val) {
return;
}
JmpSrc j = masm.vpand_ripr(srcDest.encoding());
propagateOOM(val->uses.append(CodeOffset(j.offset())));
}
void MacroAssemblerX64::vpxorSimd128(const SimdConstant& v,
FloatRegister srcDest) {
SimdData* val = getSimdData(v);
if (!val) {
return;
}
JmpSrc j = masm.vpxor_ripr(srcDest.encoding());
propagateOOM(val->uses.append(CodeOffset(j.offset())));
}
void MacroAssemblerX64::vpshufbSimd128(const SimdConstant& v,
FloatRegister srcDest) {
SimdData* val = getSimdData(v);
if (!val) {
return;
}
JmpSrc j = masm.vpshufb_ripr(srcDest.encoding());
JmpSrc j = masm.vpand_ripr(dest.encoding());
propagateOOM(val->uses.append(CodeOffset(j.offset())));
}
@ -598,12 +578,6 @@ void MacroAssembler::wasmLoad(const wasm::MemoryAccessDesc& access,
Operand srcAddr, AnyRegister out) {
memoryBarrierBefore(access.sync());
MOZ_ASSERT_IF(
access.isZeroExtendSimd128Load(),
access.type() == Scalar::Float32 || access.type() == Scalar::Float64);
MOZ_ASSERT_IF(access.isSplatSimd128Load(), access.type() == Scalar::Float64);
MOZ_ASSERT_IF(access.isWidenSimd128Load(), access.type() == Scalar::Float64);
append(access, size());
switch (access.type()) {
case Scalar::Int8:
@ -623,39 +597,12 @@ void MacroAssembler::wasmLoad(const wasm::MemoryAccessDesc& access,
movl(srcAddr, out.gpr());
break;
case Scalar::Float32:
// vmovss does the right thing also for access.isZeroExtendSimd128Load()
// vmovss does the right thing also for access.isZeroExtendSimdLoad()
vmovss(srcAddr, out.fpu());
break;
case Scalar::Float64:
if (access.isSplatSimd128Load()) {
vmovddup(srcAddr, out.fpu());
} else if (access.isWidenSimd128Load()) {
switch (access.widenSimdOp()) {
case wasm::SimdOp::I16x8LoadS8x8:
vpmovsxbw(srcAddr, out.fpu());
break;
case wasm::SimdOp::I16x8LoadU8x8:
vpmovzxbw(srcAddr, out.fpu());
break;
case wasm::SimdOp::I32x4LoadS16x4:
vpmovsxwd(srcAddr, out.fpu());
break;
case wasm::SimdOp::I32x4LoadU16x4:
vpmovzxwd(srcAddr, out.fpu());
break;
case wasm::SimdOp::I64x2LoadS32x2:
vpmovsxdq(srcAddr, out.fpu());
break;
case wasm::SimdOp::I64x2LoadU32x2:
vpmovzxdq(srcAddr, out.fpu());
break;
default:
MOZ_CRASH("Unexpected widening op for wasmLoad");
}
} else {
// vmovsd does the right thing also for access.isZeroExtendSimd128Load()
vmovsd(srcAddr, out.fpu());
}
// vmovsd does the right thing also for access.isZeroExtendSimdLoad()
vmovsd(srcAddr, out.fpu());
break;
case Scalar::Simd128:
MacroAssemblerX64::loadUnalignedSimd128(srcAddr, out.fpu());

View File

@ -930,8 +930,6 @@ class MacroAssemblerX64 : public MacroAssemblerX86Shared {
void loadConstantSimd128Int(const SimdConstant& v, FloatRegister dest);
void loadConstantSimd128Float(const SimdConstant& v, FloatRegister dest);
void vpandSimd128(const SimdConstant& v, FloatRegister srcDest);
void vpxorSimd128(const SimdConstant& v, FloatRegister srcDest);
void vpshufbSimd128(const SimdConstant& v, FloatRegister srcDest);
void loadWasmGlobalPtr(uint32_t globalDataOffset, Register dest) {
loadPtr(Address(WasmTlsReg,

View File

@ -3368,13 +3368,6 @@ class AssemblerX86Shared : public AssemblerShared {
case Operand::FPREG:
masm.vpmovsxbw_rr(src.fpu(), dest.encoding());
break;
case Operand::MEM_REG_DISP:
masm.vpmovsxbw_mr(src.disp(), src.base(), dest.encoding());
break;
case Operand::MEM_SCALE:
masm.vpmovsxbw_mr(src.disp(), src.base(), src.index(), src.scale(),
dest.encoding());
break;
default:
MOZ_CRASH("unexpected operand kind");
}
@ -3385,13 +3378,6 @@ class AssemblerX86Shared : public AssemblerShared {
case Operand::FPREG:
masm.vpmovzxbw_rr(src.fpu(), dest.encoding());
break;
case Operand::MEM_REG_DISP:
masm.vpmovzxbw_mr(src.disp(), src.base(), dest.encoding());
break;
case Operand::MEM_SCALE:
masm.vpmovzxbw_mr(src.disp(), src.base(), src.index(), src.scale(),
dest.encoding());
break;
default:
MOZ_CRASH("unexpected operand kind");
}
@ -3402,13 +3388,6 @@ class AssemblerX86Shared : public AssemblerShared {
case Operand::FPREG:
masm.vpmovsxwd_rr(src.fpu(), dest.encoding());
break;
case Operand::MEM_REG_DISP:
masm.vpmovsxwd_mr(src.disp(), src.base(), dest.encoding());
break;
case Operand::MEM_SCALE:
masm.vpmovsxwd_mr(src.disp(), src.base(), src.index(), src.scale(),
dest.encoding());
break;
default:
MOZ_CRASH("unexpected operand kind");
}
@ -3419,13 +3398,6 @@ class AssemblerX86Shared : public AssemblerShared {
case Operand::FPREG:
masm.vpmovzxwd_rr(src.fpu(), dest.encoding());
break;
case Operand::MEM_REG_DISP:
masm.vpmovzxwd_mr(src.disp(), src.base(), dest.encoding());
break;
case Operand::MEM_SCALE:
masm.vpmovzxwd_mr(src.disp(), src.base(), src.index(), src.scale(),
dest.encoding());
break;
default:
MOZ_CRASH("unexpected operand kind");
}
@ -3436,13 +3408,6 @@ class AssemblerX86Shared : public AssemblerShared {
case Operand::FPREG:
masm.vpmovsxdq_rr(src.fpu(), dest.encoding());
break;
case Operand::MEM_REG_DISP:
masm.vpmovsxdq_mr(src.disp(), src.base(), dest.encoding());
break;
case Operand::MEM_SCALE:
masm.vpmovsxdq_mr(src.disp(), src.base(), src.index(), src.scale(),
dest.encoding());
break;
default:
MOZ_CRASH("unexpected operand kind");
}
@ -3453,13 +3418,6 @@ class AssemblerX86Shared : public AssemblerShared {
case Operand::FPREG:
masm.vpmovzxdq_rr(src.fpu(), dest.encoding());
break;
case Operand::MEM_REG_DISP:
masm.vpmovzxdq_mr(src.disp(), src.base(), dest.encoding());
break;
case Operand::MEM_SCALE:
masm.vpmovzxdq_mr(src.disp(), src.base(), src.index(), src.scale(),
dest.encoding());
break;
default:
MOZ_CRASH("unexpected operand kind");
}
@ -3741,22 +3699,9 @@ class AssemblerX86Shared : public AssemblerShared {
MOZ_ASSERT(HasSSSE3());
masm.vpshufb_rr(mask.encoding(), src.encoding(), dest.encoding());
}
void vmovddup(const Operand& src, FloatRegister dest) {
void vmovddup(FloatRegister src, FloatRegister dest) {
MOZ_ASSERT(HasSSE3());
switch (src.kind()) {
case Operand::FPREG:
masm.vmovddup_rr(src.fpu(), dest.encoding());
break;
case Operand::MEM_REG_DISP:
masm.vmovddup_mr(src.disp(), src.base(), dest.encoding());
break;
case Operand::MEM_SCALE:
masm.vmovddup_mr(src.disp(), src.base(), src.index(), src.scale(),
dest.encoding());
break;
default:
MOZ_CRASH("unexpected operand kind");
}
masm.vmovddup_rr(src.encoding(), dest.encoding());
}
void vmovhlps(FloatRegister src1, FloatRegister src0, FloatRegister dest) {
MOZ_ASSERT(HasSSE2());

View File

@ -2860,10 +2860,6 @@ class BaseAssembler : public GenericAssembler {
threeByteOpSimd("vpshufb", VEX_PD, OP3_PSHUFB_VdqWdq, ESCAPE_38, src1, src0,
dst);
}
void vpshufb_mr(const void* address, XMMRegisterID src0, XMMRegisterID dst) {
threeByteOpSimd("vpshufb", VEX_PD, OP3_PSHUFB_VdqWdq, ESCAPE_38, address,
src0, dst);
}
void vshufps_irr(uint32_t mask, XMMRegisterID src1, XMMRegisterID src0,
XMMRegisterID dst) {
@ -2889,15 +2885,6 @@ class BaseAssembler : public GenericAssembler {
void vmovddup_rr(XMMRegisterID src, XMMRegisterID dst) {
twoByteOpSimd("vmovddup", VEX_SD, OP2_MOVDDUP_VqWq, src, invalid_xmm, dst);
}
void vmovddup_mr(int32_t offset, RegisterID base, XMMRegisterID dst) {
twoByteOpSimd("vmovddup", VEX_SD, OP2_MOVDDUP_VqWq, offset, base,
invalid_xmm, dst);
}
void vmovddup_mr(int32_t offset, RegisterID base, RegisterID index,
int32_t scale, XMMRegisterID dst) {
twoByteOpSimd("vmovddup", VEX_SD, OP2_MOVDDUP_VqWq, offset, base, index,
scale, invalid_xmm, dst);
}
void vmovhlps_rr(XMMRegisterID src1, XMMRegisterID src0, XMMRegisterID dst) {
twoByteOpSimd("vmovhlps", VEX_PS, OP2_MOVHLPS_VqUq, src1, src0, dst);
@ -3711,85 +3698,31 @@ class BaseAssembler : public GenericAssembler {
threeByteOpSimd("vpmovsxbw", VEX_PD, OP3_PMOVSXBW_VdqWdq, ESCAPE_38, src,
invalid_xmm, dst);
}
void vpmovsxbw_mr(int32_t offset, RegisterID base, XMMRegisterID dst) {
threeByteOpSimd("vpmovsxbw", VEX_PD, OP3_PMOVSXBW_VdqWdq, ESCAPE_38, offset,
base, invalid_xmm, dst);
}
void vpmovsxbw_mr(int32_t offset, RegisterID base, RegisterID index,
int32_t scale, XMMRegisterID dst) {
threeByteOpSimd("vpmovsxbw", VEX_PD, OP3_PMOVSXBW_VdqWdq, ESCAPE_38, offset,
base, index, scale, invalid_xmm, dst);
}
void vpmovzxbw_rr(XMMRegisterID src, XMMRegisterID dst) {
threeByteOpSimd("vpmovzxbw", VEX_PD, OP3_PMOVZXBW_VdqWdq, ESCAPE_38, src,
invalid_xmm, dst);
}
void vpmovzxbw_mr(int32_t offset, RegisterID base, XMMRegisterID dst) {
threeByteOpSimd("vpmovzxbw", VEX_PD, OP3_PMOVZXBW_VdqWdq, ESCAPE_38, offset,
base, invalid_xmm, dst);
}
void vpmovzxbw_mr(int32_t offset, RegisterID base, RegisterID index,
int32_t scale, XMMRegisterID dst) {
threeByteOpSimd("vpmovzxbw", VEX_PD, OP3_PMOVZXBW_VdqWdq, ESCAPE_38, offset,
base, index, scale, invalid_xmm, dst);
}
void vpmovsxwd_rr(XMMRegisterID src, XMMRegisterID dst) {
threeByteOpSimd("vpmovsxwd", VEX_PD, OP3_PMOVSXWD_VdqWdq, ESCAPE_38, src,
invalid_xmm, dst);
}
void vpmovsxwd_mr(int32_t offset, RegisterID base, XMMRegisterID dst) {
threeByteOpSimd("vpmovsxwd", VEX_PD, OP3_PMOVSXWD_VdqWdq, ESCAPE_38, offset,
base, invalid_xmm, dst);
}
void vpmovsxwd_mr(int32_t offset, RegisterID base, RegisterID index,
int32_t scale, XMMRegisterID dst) {
threeByteOpSimd("vpmovsxwd", VEX_PD, OP3_PMOVSXWD_VdqWdq, ESCAPE_38, offset,
base, index, scale, invalid_xmm, dst);
}
void vpmovzxwd_rr(XMMRegisterID src, XMMRegisterID dst) {
threeByteOpSimd("vpmovzxwd", VEX_PD, OP3_PMOVZXWD_VdqWdq, ESCAPE_38, src,
invalid_xmm, dst);
}
void vpmovzxwd_mr(int32_t offset, RegisterID base, XMMRegisterID dst) {
threeByteOpSimd("vpmovzxwd", VEX_PD, OP3_PMOVZXWD_VdqWdq, ESCAPE_38, offset,
base, invalid_xmm, dst);
}
void vpmovzxwd_mr(int32_t offset, RegisterID base, RegisterID index,
int32_t scale, XMMRegisterID dst) {
threeByteOpSimd("vpmovzxwd", VEX_PD, OP3_PMOVZXWD_VdqWdq, ESCAPE_38, offset,
base, index, scale, invalid_xmm, dst);
}
void vpmovsxdq_rr(XMMRegisterID src, XMMRegisterID dst) {
threeByteOpSimd("vpmovsxwd", VEX_PD, OP3_PMOVSXDQ_VdqWdq, ESCAPE_38, src,
invalid_xmm, dst);
}
void vpmovsxdq_mr(int32_t offset, RegisterID base, XMMRegisterID dst) {
threeByteOpSimd("vpmovsxdq", VEX_PD, OP3_PMOVSXDQ_VdqWdq, ESCAPE_38, offset,
base, invalid_xmm, dst);
}
void vpmovsxdq_mr(int32_t offset, RegisterID base, RegisterID index,
int32_t scale, XMMRegisterID dst) {
threeByteOpSimd("vpmovsxdq", VEX_PD, OP3_PMOVSXDQ_VdqWdq, ESCAPE_38, offset,
base, index, scale, invalid_xmm, dst);
}
void vpmovzxdq_rr(XMMRegisterID src, XMMRegisterID dst) {
threeByteOpSimd("vpmovzxwd", VEX_PD, OP3_PMOVZXDQ_VdqWdq, ESCAPE_38, src,
invalid_xmm, dst);
}
void vpmovzxdq_mr(int32_t offset, RegisterID base, XMMRegisterID dst) {
threeByteOpSimd("vpmovzxdq", VEX_PD, OP3_PMOVZXDQ_VdqWdq, ESCAPE_38, offset,
base, invalid_xmm, dst);
}
void vpmovzxdq_mr(int32_t offset, RegisterID base, RegisterID index,
int32_t scale, XMMRegisterID dst) {
threeByteOpSimd("vpmovzxdq", VEX_PD, OP3_PMOVZXDQ_VdqWdq, ESCAPE_38, offset,
base, index, scale, invalid_xmm, dst);
}
void vpalignr_irr(unsigned imm, XMMRegisterID src, XMMRegisterID dst) {
MOZ_ASSERT(imm < 32);
@ -4487,17 +4420,6 @@ class BaseAssembler : public GenericAssembler {
m_formatter.threeByteOpVex(ty, opcode, escape, offset, base, src0, dst);
}
void threeByteOpSimd(const char* name, VexOperandType ty,
ThreeByteOpcodeID opcode, ThreeByteEscape escape,
int32_t offset, RegisterID base, RegisterID index,
int32_t scale, XMMRegisterID src0, XMMRegisterID dst) {
MOZ_ASSERT(useLegacySSEEncoding(src0, dst));
spew("%-11s" MEM_obs ", %s", legacySSEOpName(name),
ADDR_obs(offset, base, index, scale), XMMRegName(dst));
m_formatter.legacySSEPrefix(ty);
m_formatter.threeByteOp(opcode, escape, offset, base, index, scale, dst);
}
void threeByteOpImmSimd(const char* name, VexOperandType ty,
ThreeByteOpcodeID opcode, ThreeByteEscape escape,
uint32_t imm, int32_t offset, RegisterID base,
@ -4960,17 +4882,6 @@ class BaseAssembler : public GenericAssembler {
memoryModRM(offset, base, reg);
}
void threeByteOp(ThreeByteOpcodeID opcode, ThreeByteEscape escape,
int32_t offset, RegisterID base, RegisterID index,
int32_t scale, int reg) {
m_buffer.ensureSpace(MaxInstructionSize);
emitRexIfNeeded(reg, 0, base);
m_buffer.putByteUnchecked(OP_2BYTE_ESCAPE);
m_buffer.putByteUnchecked(escape);
m_buffer.putByteUnchecked(opcode);
memoryModRM(offset, base, index, scale, reg);
}
void threeByteOpVex(VexOperandType ty, ThreeByteOpcodeID opcode,
ThreeByteEscape escape, int32_t offset, RegisterID base,
XMMRegisterID src0, int reg) {
@ -5000,17 +4911,6 @@ class BaseAssembler : public GenericAssembler {
memoryModRM(address, reg);
}
void threeByteRipOp(ThreeByteOpcodeID opcode, ThreeByteEscape escape,
int ripOffset, int reg) {
m_buffer.ensureSpace(MaxInstructionSize);
emitRexIfNeeded(reg, 0, 0);
m_buffer.putByteUnchecked(OP_2BYTE_ESCAPE);
m_buffer.putByteUnchecked(escape);
m_buffer.putByteUnchecked(opcode);
putModRm(ModRmMemoryNoDisp, noBase, reg);
m_buffer.putIntUnchecked(ripOffset);
}
void threeByteOpVex(VexOperandType ty, ThreeByteOpcodeID opcode,
ThreeByteEscape escape, const void* address,
XMMRegisterID src0, int reg) {

View File

@ -2595,20 +2595,16 @@ void CodeGenerator::visitWasmBinarySimd128(LWasmBinarySimd128* ins) {
masm.compareFloat64x2(Assembler::GreaterThanOrEqual, rhs, lhsDest);
break;
case wasm::SimdOp::F32x4PMax:
// `lhsDest` is actually rhsDest, and `rhs` is actually lhs
masm.pseudoMaxFloat32x4(lhsDest, rhs);
masm.pseudoMaxFloat32x4(rhs, lhsDest);
break;
case wasm::SimdOp::F32x4PMin:
// `lhsDest` is actually rhsDest, and `rhs` is actually lhs
masm.pseudoMinFloat32x4(lhsDest, rhs);
masm.pseudoMinFloat32x4(rhs, lhsDest);
break;
case wasm::SimdOp::F64x2PMax:
// `lhsDest` is actually rhsDest, and `rhs` is actually lhs
masm.pseudoMaxFloat64x2(lhsDest, rhs);
masm.pseudoMaxFloat64x2(rhs, lhsDest);
break;
case wasm::SimdOp::F64x2PMin:
// `lhsDest` is actually rhsDest, and `rhs` is actually lhs
masm.pseudoMinFloat64x2(lhsDest, rhs);
masm.pseudoMinFloat64x2(rhs, lhsDest);
break;
case wasm::SimdOp::I32x4DotSI16x8:
masm.widenDotInt16x8(rhs, lhsDest);
@ -2798,7 +2794,7 @@ void CodeGenerator::visitWasmShuffleSimd128(LWasmShuffleSimd128* ins) {
}
case LWasmShuffleSimd128::SHUFFLE_BLEND_8x16: {
masm.shuffleInt8x16(reinterpret_cast<const uint8_t*>(control.asInt8x16()),
rhs, lhsDest);
rhs, lhsDest, ToFloatRegister(ins->temp()));
break;
}
default: {

View File

@ -340,20 +340,16 @@ class LWasmBitselectSimd128 : public LInstructionHelper<1, 3, 1> {
// (v128, v128) -> v128 effect-free operations
// lhs and dest are the same.
// temps (if in use) are FPR.
// The op may differ from the MIR node's op.
class LWasmBinarySimd128 : public LInstructionHelper<1, 2, 2> {
wasm::SimdOp op_;
public:
LIR_HEADER(WasmBinarySimd128)
static constexpr uint32_t LhsDest = 0;
static constexpr uint32_t Rhs = 1;
LWasmBinarySimd128(wasm::SimdOp op, const LAllocation& lhsDest,
const LAllocation& rhs, const LDefinition& temp0,
const LDefinition& temp1)
: LInstructionHelper(classOpcode), op_(op) {
LWasmBinarySimd128(const LAllocation& lhsDest, const LAllocation& rhs,
const LDefinition& temp0, const LDefinition& temp1)
: LInstructionHelper(classOpcode) {
setOperand(LhsDest, lhsDest);
setOperand(Rhs, rhs);
setTemp(0, temp0);
@ -362,7 +358,7 @@ class LWasmBinarySimd128 : public LInstructionHelper<1, 2, 2> {
const LAllocation* lhsDest() { return getOperand(LhsDest); }
const LAllocation* rhs() { return getOperand(Rhs); }
wasm::SimdOp simdOp() const { return op_; }
wasm::SimdOp simdOp() const { return mir_->toWasmBinarySimd128()->simdOp(); }
};
// (v128, i32) -> v128 effect-free variable-width shift operations

View File

@ -736,11 +736,6 @@ void LIRGenerator::visitWasmBitselectSimd128(MWasmBitselectSimd128* ins) {
MOZ_ASSERT(ins->control()->type() == MIRType::Simd128);
MOZ_ASSERT(ins->type() == MIRType::Simd128);
// Enforcing lhs == output avoids one setup move. We would like to also
// enforce merging the control with the temp (with useRegisterAtStart(control)
// and tempCopy()), but the register allocator ignores those constraints
// at present.
auto* lir = new (alloc()) LWasmBitselectSimd128(
useRegisterAtStart(ins->lhs()), useRegister(ins->rhs()),
useRegister(ins->control()), tempSimd128());
@ -750,7 +745,6 @@ void LIRGenerator::visitWasmBitselectSimd128(MWasmBitselectSimd128* ins) {
void LIRGenerator::visitWasmBinarySimd128(MWasmBinarySimd128* ins) {
MDefinition* lhs = ins->lhs();
MDefinition* rhs = ins->rhs();
wasm::SimdOp op = ins->simdOp();
MOZ_ASSERT(lhs->type() == MIRType::Simd128);
MOZ_ASSERT(rhs->type() == MIRType::Simd128);
@ -760,88 +754,16 @@ void LIRGenerator::visitWasmBinarySimd128(MWasmBinarySimd128* ins) {
ReorderCommutative(&lhs, &rhs, ins);
}
// Swap operands and change the operation if necessary; these are all x86/x64-
// dependent transformations. Except where noted, this is about avoiding
// unnecessary moves and fixups in the code generator macros.
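// For example, SSE has vpcmpgtb but no byte-wide "less than" compare, so
// I8x16LtS below becomes I8x16GtS with the operands exchanged: a < b is the
// same predicate as b > a, and the result can then be produced directly in
// the register holding the (new) lhs.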
bool swap = false;
switch (op) {
case wasm::SimdOp::V128AndNot: {
// Code generation requires the operands to be reversed.
swap = true;
break;
}
case wasm::SimdOp::I8x16LtS: {
swap = true;
op = wasm::SimdOp::I8x16GtS;
break;
}
case wasm::SimdOp::I8x16GeS: {
swap = true;
op = wasm::SimdOp::I8x16LeS;
break;
}
case wasm::SimdOp::I16x8LtS: {
swap = true;
op = wasm::SimdOp::I16x8GtS;
break;
}
case wasm::SimdOp::I16x8GeS: {
swap = true;
op = wasm::SimdOp::I16x8LeS;
break;
}
case wasm::SimdOp::I32x4LtS: {
swap = true;
op = wasm::SimdOp::I32x4GtS;
break;
}
case wasm::SimdOp::I32x4GeS: {
swap = true;
op = wasm::SimdOp::I32x4LeS;
break;
}
case wasm::SimdOp::F32x4Gt: {
swap = true;
op = wasm::SimdOp::F32x4Lt;
break;
}
case wasm::SimdOp::F32x4Ge: {
swap = true;
op = wasm::SimdOp::F32x4Le;
break;
}
case wasm::SimdOp::F64x2Gt: {
swap = true;
op = wasm::SimdOp::F64x2Lt;
break;
}
case wasm::SimdOp::F64x2Ge: {
swap = true;
op = wasm::SimdOp::F64x2Le;
break;
}
case wasm::SimdOp::F32x4PMin:
case wasm::SimdOp::F32x4PMax:
case wasm::SimdOp::F64x2PMin:
case wasm::SimdOp::F64x2PMax: {
// Code generation requires the operands to be reversed (the rhs is the
// output register).
swap = true;
break;
}
default:
break;
}
if (swap) {
MDefinition* tmp = lhs;
lhs = rhs;
rhs = tmp;
}
// Allocate temp registers
LDefinition tempReg0 = LDefinition::BogusTemp();
LDefinition tempReg1 = LDefinition::BogusTemp();
switch (op) {
switch (ins->simdOp()) {
case wasm::SimdOp::V128AndNot: {
// x86/x64 specific: Code generation requires the operands to be reversed.
MDefinition* tmp = lhs;
lhs = rhs;
rhs = tmp;
break;
}
case wasm::SimdOp::I64x2Mul:
case wasm::SimdOp::V8x16Swizzle:
tempReg0 = tempSimd128();
@ -869,26 +791,11 @@ void LIRGenerator::visitWasmBinarySimd128(MWasmBinarySimd128* ins) {
break;
}
// For binary ops, the Masm API is usually (rhs, lhsDest) and requires
// AtStart+ReuseInput for the lhs.
//
// The rhs is tricky due to register allocator restrictions:
// - if lhs == rhs and lhs is AtStart then rhs must be AtStart too
// - if lhs != rhs and lhs is AtStart then rhs must not be AtStart,
//   this appears to have something to do with the risk of the rhs
//   being clobbered. Anyway it doesn't matter much, since the
//   liveness of rhs will not prevent the lhs register from being reused
// for the output.
//
// For a few ops, the API is actually (rhsDest, lhs) and the rules are the
// same but reversed. We swapped operands above; they will be swapped
// again in the code generator to emit the right code.
LAllocation lhsDestAlloc = useRegisterAtStart(lhs);
LAllocation rhsAlloc =
lhs != rhs ? useRegister(rhs) : useRegisterAtStart(rhs);
auto* lir = new (alloc())
LWasmBinarySimd128(op, lhsDestAlloc, rhsAlloc, tempReg0, tempReg1);
LWasmBinarySimd128(lhsDestAlloc, rhsAlloc, tempReg0, tempReg1);
defineReuseInput(lir, ins, LWasmBinarySimd128::LhsDest);
}
@ -933,8 +840,6 @@ void LIRGenerator::visitWasmShiftSimd128(MWasmShiftSimd128* ins) {
# ifdef DEBUG
js::wasm::ReportSimdAnalysis("shift -> constant shift");
# endif
// Almost always beneficial, and never detrimental, to reuse the input if
// possible.
auto* lir = new (alloc())
LWasmConstantShiftSimd128(useRegisterAtStart(lhs), temp, shiftCount);
defineReuseInput(lir, ins, LWasmConstantShiftSimd128::Src);
@ -959,9 +864,8 @@ void LIRGenerator::visitWasmShiftSimd128(MWasmShiftSimd128* ins) {
break;
}
// Reusing the input if possible is never detrimental.
LAllocation lhsDestAlloc = useRegisterAtStart(lhs);
LAllocation rhsAlloc = useRegisterAtStart(rhs);
LAllocation rhsAlloc = useRegister(rhs);
auto* lir = new (alloc())
LWasmVariableShiftSimd128(lhsDestAlloc, rhsAlloc, tempReg0, tempReg1);
defineReuseInput(lir, ins, LWasmVariableShiftSimd128::LhsDest);
@ -1580,7 +1484,7 @@ static Shuffle AnalyzeShuffle(MWasmShuffleSimd128* ins) {
// Deal with constant rhs.
if (rhs->isWasmFloatConstant()) {
SimdConstant rhsConstant = rhs->toWasmFloatConstant()->toSimd128();
if (rhsConstant.isZeroBits()) {
if (rhsConstant.isIntegerZero()) {
Maybe<LWasmPermuteSimd128::Op> op = AnalyzeShuffleWithZero(&control);
if (op) {
return Shuffle::permute(
@ -1696,34 +1600,14 @@ void LIRGenerator::visitWasmShuffleSimd128(MWasmShuffleSimd128* ins) {
case Shuffle::Operand::LEFT:
case Shuffle::Operand::RIGHT: {
LAllocation src;
// All permute operators currently favor reusing the input register so
// we're not currently exercising code paths below that do not reuse.
// Those paths have been exercised in the past however and are believed
// to be correct.
bool useAtStartAndReuse = false;
switch (*s.permuteOp) {
case LWasmPermuteSimd128::MOVE:
case LWasmPermuteSimd128::BROADCAST_8x16:
case LWasmPermuteSimd128::BROADCAST_16x8:
case LWasmPermuteSimd128::PERMUTE_8x16:
case LWasmPermuteSimd128::PERMUTE_16x8:
case LWasmPermuteSimd128::PERMUTE_32x4:
case LWasmPermuteSimd128::ROTATE_RIGHT_8x16:
case LWasmPermuteSimd128::SHIFT_LEFT_8x16:
case LWasmPermuteSimd128::SHIFT_RIGHT_8x16:
useAtStartAndReuse = true;
break;
default:
MOZ_CRASH("Unexpected operator");
}
if (s.opd == Shuffle::Operand::LEFT) {
if (useAtStartAndReuse) {
if (*s.permuteOp == LWasmPermuteSimd128::MOVE) {
src = useRegisterAtStart(ins->lhs());
} else {
src = useRegister(ins->lhs());
}
} else {
if (useAtStartAndReuse) {
if (*s.permuteOp == LWasmPermuteSimd128::MOVE) {
src = useRegisterAtStart(ins->rhs());
} else {
src = useRegister(ins->rhs());
@ -1731,7 +1615,7 @@ void LIRGenerator::visitWasmShuffleSimd128(MWasmShuffleSimd128* ins) {
}
auto* lir =
new (alloc()) LWasmPermuteSimd128(src, *s.permuteOp, s.control);
if (useAtStartAndReuse) {
if (*s.permuteOp == LWasmPermuteSimd128::MOVE) {
defineReuseInput(lir, ins, LWasmPermuteSimd128::Src);
} else {
define(lir, ins);
@ -1742,6 +1626,7 @@ void LIRGenerator::visitWasmShuffleSimd128(MWasmShuffleSimd128* ins) {
case Shuffle::Operand::BOTH_SWAPPED: {
LDefinition temp = LDefinition::BogusTemp();
switch (*s.shuffleOp) {
case LWasmShuffleSimd128::SHUFFLE_BLEND_8x16:
case LWasmShuffleSimd128::BLEND_8x16:
temp = tempSimd128();
break;
@ -1769,10 +1654,6 @@ void LIRGenerator::visitWasmReplaceLaneSimd128(MWasmReplaceLaneSimd128* ins) {
MOZ_ASSERT(ins->lhs()->type() == MIRType::Simd128);
MOZ_ASSERT(ins->type() == MIRType::Simd128);
// The Masm API is (rhs, lhsDest) and requires AtStart+ReuseInput for the lhs.
// For type reasons, the rhs will never be the same as the lhs and is
// therefore a plain Use.
if (ins->rhs()->type() == MIRType::Int64) {
auto* lir = new (alloc()) LWasmReplaceInt64LaneSimd128(
useRegisterAtStart(ins->lhs()), useInt64Register(ins->rhs()));
@ -1784,35 +1665,20 @@ void LIRGenerator::visitWasmReplaceLaneSimd128(MWasmReplaceLaneSimd128* ins) {
}
}
// For unary operations we currently avoid using useRegisterAtStart() and
// reusing the input for the output, as that frequently leads to longer code
// sequences because we end up using scratch to hold an intermediate result.
void LIRGenerator::visitWasmScalarToSimd128(MWasmScalarToSimd128* ins) {
MOZ_ASSERT(ins->type() == MIRType::Simd128);
switch (ins->input()->type()) {
case MIRType::Int64: {
// 64-bit integer splats.
// Load-and-(sign|zero)extend.
auto* lir = new (alloc())
LWasmInt64ToSimd128(useInt64RegisterAtStart(ins->input()));
define(lir, ins);
break;
}
case MIRType::Float32:
case MIRType::Double: {
// Floating-point splats.
// Ideally we save a move on SSE systems by reusing the input register,
// but since the input and output register types differ, we can't.
auto* lir =
new (alloc()) LWasmScalarToSimd128(useRegisterAtStart(ins->input()));
define(lir, ins);
break;
}
default: {
// 32-bit integer splats.
auto* lir =
new (alloc()) LWasmScalarToSimd128(useRegisterAtStart(ins->input()));
define(lir, ins);
break;
}
if (ins->input()->type() == MIRType::Int64) {
auto* lir =
new (alloc()) LWasmInt64ToSimd128(useInt64Register(ins->input()));
define(lir, ins);
} else {
auto* lir = new (alloc()) LWasmScalarToSimd128(useRegister(ins->input()));
define(lir, ins);
}
}
@ -1820,73 +1686,18 @@ void LIRGenerator::visitWasmUnarySimd128(MWasmUnarySimd128* ins) {
MOZ_ASSERT(ins->input()->type() == MIRType::Simd128);
MOZ_ASSERT(ins->type() == MIRType::Simd128);
bool useAtStart = false;
bool reuseInput = false;
LDefinition tempReg = LDefinition::BogusTemp();
switch (ins->simdOp()) {
case wasm::SimdOp::I8x16Neg:
case wasm::SimdOp::I16x8Neg:
case wasm::SimdOp::I32x4Neg:
case wasm::SimdOp::I64x2Neg:
// Prefer src != dest to avoid an unconditional src->temp move.
MOZ_ASSERT(!useAtStart && !reuseInput);
break;
case wasm::SimdOp::F32x4Neg:
case wasm::SimdOp::F64x2Neg:
case wasm::SimdOp::F32x4Abs:
case wasm::SimdOp::F64x2Abs:
case wasm::SimdOp::V128Not:
case wasm::SimdOp::F32x4Sqrt:
case wasm::SimdOp::F64x2Sqrt:
case wasm::SimdOp::I8x16Abs:
case wasm::SimdOp::I16x8Abs:
case wasm::SimdOp::I32x4Abs:
case wasm::SimdOp::I32x4TruncSSatF32x4:
case wasm::SimdOp::F32x4ConvertUI32x4:
// Prefer src == dest to avoid an unconditional src->dest move.
useAtStart = true;
reuseInput = true;
break;
case wasm::SimdOp::I32x4TruncUSatF32x4:
tempReg = tempSimd128();
// Prefer src == dest to avoid an unconditional src->dest move.
useAtStart = true;
reuseInput = true;
break;
case wasm::SimdOp::I16x8WidenLowSI8x16:
case wasm::SimdOp::I16x8WidenHighSI8x16:
case wasm::SimdOp::I16x8WidenLowUI8x16:
case wasm::SimdOp::I16x8WidenHighUI8x16:
case wasm::SimdOp::I32x4WidenLowSI16x8:
case wasm::SimdOp::I32x4WidenHighSI16x8:
case wasm::SimdOp::I32x4WidenLowUI16x8:
case wasm::SimdOp::I32x4WidenHighUI16x8:
case wasm::SimdOp::F32x4ConvertSI32x4:
case wasm::SimdOp::F32x4Ceil:
case wasm::SimdOp::F32x4Floor:
case wasm::SimdOp::F32x4Trunc:
case wasm::SimdOp::F32x4Nearest:
case wasm::SimdOp::F64x2Ceil:
case wasm::SimdOp::F64x2Floor:
case wasm::SimdOp::F64x2Trunc:
case wasm::SimdOp::F64x2Nearest:
// Prefer src == dest to exert the lowest register pressure on the
// surrounding code.
useAtStart = true;
MOZ_ASSERT(!reuseInput);
break;
default:
MOZ_CRASH("Unary SimdOp not implemented");
break;
}
LUse inputUse =
useAtStart ? useRegisterAtStart(ins->input()) : useRegister(ins->input());
LWasmUnarySimd128* lir = new (alloc()) LWasmUnarySimd128(inputUse, tempReg);
if (reuseInput) {
defineReuseInput(lir, ins, LWasmUnarySimd128::Src);
} else {
define(lir, ins);
}
LWasmUnarySimd128* lir =
new (alloc()) LWasmUnarySimd128(useRegister(ins->input()), tempReg);
define(lir, ins);
}
bool LIRGeneratorX86Shared::canFoldReduceSimd128AndBranch(wasm::SimdOp op) {
@ -1936,30 +1747,12 @@ void LIRGenerator::visitWasmReduceSimd128(MWasmReduceSimd128* ins) {
emitAtUses(ins);
return;
}
// Reductions (any_true, all_true, bitmask, extract_lane) uniformly prefer
// useRegisterAtStart:
//
// - In most cases, the input type differs from the output type, so there's no
// conflict and it doesn't really matter.
//
// - For extract_lane(0) on F32x4 and F64x2, input == output results in zero
// code being generated.
//
// - For extract_lane(k > 0) on F32x4 and F64x2, allowing the input register
// to be targeted lowers register pressure if it's the last use of the
// input.
if (ins->type() == MIRType::Int64) {
auto* lir = new (alloc())
LWasmReduceSimd128ToInt64(useRegisterAtStart(ins->input()));
auto* lir =
new (alloc()) LWasmReduceSimd128ToInt64(useRegister(ins->input()));
defineInt64(lir, ins);
} else {
// Ideally we would reuse the input register for floating extract_lane if
// the lane is zero, but constraints in the register allocator require the
// input and output register types to be the same.
auto* lir =
new (alloc()) LWasmReduceSimd128(useRegisterAtStart(ins->input()));
auto* lir = new (alloc()) LWasmReduceSimd128(useRegister(ins->input()));
define(lir, ins);
}
}

View File

@ -36,7 +36,7 @@ void MacroAssemblerX86Shared::checkedConvertFloat32x4ToInt32x4(
ScratchSimd128Scope scratch(asMasm());
asMasm().loadConstantSimd128Int(InvalidResult, scratch);
vpcmpeqd(Operand(dest), scratch, scratch);
packedEqualInt32x4(Operand(dest), scratch);
// TODO (bug 1156228): If we have SSE4.1, we can use PTEST here instead of
// the two following instructions.
vmovmskps(scratch, temp);
@ -115,7 +115,7 @@ void MacroAssemblerX86Shared::checkedConvertFloat32x4ToUint32x4(
// positive in A, and N, B, and V-lanes will be 0x80000000 in A. Compute a
// mask of non-A-lanes into |tempF|.
zeroSimd128Float(tempF);
vpcmpgtd(Operand(out), tempF, tempF);
packedGreaterThanInt32x4(Operand(out), tempF);
// Clear the A-lanes in B.
bitwiseAndSimdInt(scratch, Operand(tempF), scratch);
@ -274,7 +274,7 @@ void MacroAssemblerX86Shared::swizzleFloat32x4(FloatRegister input,
if (LanesMatch(lanes, 0, 1, 0, 1)) {
if (AssemblerX86Shared::HasSSE3() && !AssemblerX86Shared::HasAVX()) {
vmovddup(Operand(input), output);
vmovddup(input, output);
return;
}
FloatRegister inputCopy = reusedInputSimd128Float(input, output);

View File

@ -21,8 +21,19 @@ void MacroAssemblerX86Shared::splatX16(Register input, FloatRegister output) {
ScratchSimd128Scope scratch(asMasm());
vmovd(input, output);
zeroSimd128Int(scratch);
vpshufb(scratch, output, output);
if (AssemblerX86Shared::HasSSSE3()) {
zeroSimd128Int(scratch);
vpshufb(scratch, output, output);
} else {
// Use two shifts to duplicate the low 8 bits into the low 16 bits.
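// E.g. for an input byte 0xAB: vpsllw leaves 0xAB00 in the low word, the
// shifted copy holds 0x00AB, and the OR gives 0xABAB, which the pshuflw and
// pshufd below then broadcast to all sixteen byte lanes.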
vpsllw(Imm32(8), output, output);
vmovdqa(output, scratch);
vpsrlw(Imm32(8), scratch, scratch);
vpor(scratch, output, output);
// Then do an X8 splat.
vpshuflw(0, output, output);
vpshufd(0, output, output);
}
}
void MacroAssemblerX86Shared::splatX8(Register input, FloatRegister output) {
@ -38,16 +49,14 @@ void MacroAssemblerX86Shared::splatX4(Register input, FloatRegister output) {
void MacroAssemblerX86Shared::splatX4(FloatRegister input,
FloatRegister output) {
MOZ_ASSERT(input.isSingle() && output.isSimd128());
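// A vshufps immediate of 0 selects lane 0 for every output lane, broadcasting
// the scalar across the vector.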
asMasm().moveSimd128Float(input.asSimd128(), output);
vshufps(0, output, output, output);
FloatRegister inputCopy = reusedInputSimd128Float(input, output);
vshufps(0, inputCopy, inputCopy, output);
}
void MacroAssemblerX86Shared::splatX2(FloatRegister input,
FloatRegister output) {
MOZ_ASSERT(input.isDouble() && output.isSimd128());
asMasm().moveSimd128Float(input.asSimd128(), output);
vshufpd(0, output, output, output);
FloatRegister inputCopy = reusedInputSimd128Float(input, output);
vshufpd(0, inputCopy, inputCopy, output);
}
void MacroAssemblerX86Shared::extractLaneInt32x4(FloatRegister input,
@ -56,8 +65,13 @@ void MacroAssemblerX86Shared::extractLaneInt32x4(FloatRegister input,
if (lane == 0) {
// The value we want to extract is in the low double-word
moveLowInt32(input, output);
} else {
} else if (AssemblerX86Shared::HasSSE41()) {
vpextrd(lane, input, output);
} else {
uint32_t mask = MacroAssembler::ComputeShuffleMask(lane);
ScratchSimd128Scope scratch(asMasm());
shuffleInt32(mask, input, scratch);
moveLowInt32(scratch, output);
}
}
@ -73,7 +87,7 @@ void MacroAssemblerX86Shared::extractLaneFloat32x4(FloatRegister input,
moveHighPairToLowPairFloat32(input, output);
} else {
uint32_t mask = MacroAssembler::ComputeShuffleMask(lane);
shuffleFloat32(mask, input, output.asSimd128());
shuffleFloat32(mask, input, output);
}
}
@ -93,6 +107,7 @@ void MacroAssemblerX86Shared::extractLaneFloat64x2(FloatRegister input,
void MacroAssemblerX86Shared::extractLaneInt16x8(FloatRegister input,
Register output, unsigned lane,
SimdSign sign) {
// Unlike pextrd and pextrb, this is available in SSE2.
vpextrw(lane, input, output);
if (sign == SimdSign::Signed) {
movswl(output, output);
@ -102,46 +117,148 @@ void MacroAssemblerX86Shared::extractLaneInt16x8(FloatRegister input,
void MacroAssemblerX86Shared::extractLaneInt8x16(FloatRegister input,
Register output, unsigned lane,
SimdSign sign) {
vpextrb(lane, input, output);
if (sign == SimdSign::Signed) {
movsbl(output, output);
}
}
void MacroAssemblerX86Shared::replaceLaneFloat32x4(FloatRegister rhs,
FloatRegister lhsDest,
unsigned lane) {
MOZ_ASSERT(lhsDest.isSimd128() && rhs.isSingle());
if (lane == 0) {
if (rhs.asSimd128() == lhsDest) {
// no-op, although this should not normally happen for type checking
// reasons higher up in the stack.
} else {
// move low dword of value into low dword of output
vmovss(rhs, lhsDest, lhsDest);
if (AssemblerX86Shared::HasSSE41()) {
vpextrb(lane, input, output);
// vpextrb clears the high bits, so no further extension required.
if (sign == SimdSign::Unsigned) {
sign = SimdSign::NotApplicable;
}
} else {
vinsertps(vinsertpsMask(0, lane), rhs, lhsDest, lhsDest);
// Extract the relevant 16 bits containing our lane, then shift the
// right 8 bits into place.
extractLaneInt16x8(input, output, lane / 2, SimdSign::Unsigned);
if (lane % 2) {
shrl(Imm32(8), output);
// The shrl handles the zero-extension. Don't repeat it.
if (sign == SimdSign::Unsigned) {
sign = SimdSign::NotApplicable;
}
}
}
// We have the right low 8 bits in |output|, but we may need to fix the high
// bits. Note that this requires |output| to be one of the %eax-%edx
// registers.
switch (sign) {
case SimdSign::Signed:
movsbl(output, output);
break;
case SimdSign::Unsigned:
movzbl(output, output);
break;
case SimdSign::NotApplicable:
// No adjustment needed.
break;
}
}
void MacroAssemblerX86Shared::replaceLaneFloat64x2(FloatRegister rhs,
FloatRegister lhsDest,
unsigned lane) {
MOZ_ASSERT(lhsDest.isSimd128() && rhs.isDouble());
void MacroAssemblerX86Shared::insertLaneSimdInt(FloatRegister input,
Register value,
FloatRegister output,
unsigned lane,
unsigned numLanes) {
if (numLanes == 8) {
// Available in SSE 2.
vpinsrw(lane, value, input, output);
return;
}
// Note that, contrary to float32x4, we cannot use vmovd if the inserted
// value goes into the first component, as vmovd clears out the higher lanes
// of the output.
if (AssemblerX86Shared::HasSSE41()) {
// TODO: Teach Lowering that we don't need defineReuseInput if we have AVX.
switch (numLanes) {
case 4:
vpinsrd(lane, value, input, output);
return;
case 16:
vpinsrb(lane, value, input, output);
return;
}
}
asMasm().reserveStack(Simd128DataSize);
storeAlignedSimd128Int(input, Address(StackPointer, 0));
switch (numLanes) {
case 4:
store32(value, Address(StackPointer, lane * sizeof(int32_t)));
break;
case 16:
// Note that this requires `value` to be in one of the registers where the
// low 8 bits are addressable (%eax - %edx on x86, all of them on x86-64).
store8(value, Address(StackPointer, lane * sizeof(int8_t)));
break;
default:
MOZ_CRASH("Unsupported SIMD numLanes");
}
loadAlignedSimd128Int(Address(StackPointer, 0), output);
asMasm().freeStack(Simd128DataSize);
}
void MacroAssemblerX86Shared::insertLaneFloat32x4(FloatRegister input,
FloatRegister value,
FloatRegister output,
unsigned lane) {
// This code can't work if this is not true. That's probably a bug.
MOZ_RELEASE_ASSERT(input == output);
if (lane == 0) {
if (rhs.asSimd128() == lhsDest) {
// no-op, although this should not normally happen for type checking
// reasons higher up in the stack.
} else {
if (value != output) {
vmovss(value, input, output);
}
return;
}
if (AssemblerX86Shared::HasSSE41()) {
// The input value is in the low float32 of the 'value' FloatRegister.
vinsertps(vinsertpsMask(0, lane), value, output, output);
return;
}
asMasm().reserveStack(Simd128DataSize);
storeAlignedSimd128Float(input, Address(StackPointer, 0));
asMasm().storeFloat32(value, Address(StackPointer, lane * sizeof(int32_t)));
loadAlignedSimd128Float(Address(StackPointer, 0), output);
asMasm().freeStack(Simd128DataSize);
}
void MacroAssemblerX86Shared::insertLaneFloat64x2(FloatRegister input,
FloatRegister value,
FloatRegister output,
unsigned lane) {
if (input == output && output == value) {
// No-op
return;
}
if (input != output && value != output) {
// Merge input and value into output, so make input==output
vmovapd(input, output);
input = output;
}
if (input == output) {
// Merge value into output
if (lane == 0) {
// move low qword of value into low qword of output
vmovsd(rhs, lhsDest, lhsDest);
vmovsd(value, output, output);
} else {
// move low qword of value into high qword of output
vshufpd(0, value, output, output);
}
} else {
// move low qword of value into high qword of output
vshufpd(0, rhs, lhsDest, lhsDest);
MOZ_ASSERT(value == output);
// Merge input into output
if (lane == 0) {
// move high qword of input into high qword of output
vshufpd(2, input, output, output);
} else {
// move low qword of output into high qword of output
vmovddup(output, output);
// move low qword of input into low qword of output
vmovsd(input, output, output);
}
}
}
@ -149,11 +266,12 @@ void MacroAssemblerX86Shared::blendInt8x16(FloatRegister lhs, FloatRegister rhs,
FloatRegister output,
FloatRegister temp,
const uint8_t lanes[16]) {
MOZ_ASSERT(AssemblerX86Shared::HasSSSE3());
MOZ_ASSERT(lhs == output);
MOZ_ASSERT(lhs == rhs || !temp.isInvalid());
// TODO: Consider whether PBLENDVB would not be better, even if it is variable
// and requires xmm0 to be free and the loading of a mask.
// TODO: For sse4.1, consider whether PBLENDVB would not be better, even if it
// is variable and requires xmm0 to be free and the loading of a mask.
// Set scratch = lanes to select from lhs.
int8_t mask[16];
@ -174,6 +292,7 @@ void MacroAssemblerX86Shared::blendInt8x16(FloatRegister lhs, FloatRegister rhs,
void MacroAssemblerX86Shared::blendInt16x8(FloatRegister lhs, FloatRegister rhs,
FloatRegister output,
const uint16_t lanes[8]) {
MOZ_ASSERT(AssemblerX86Shared::HasSSE41());
MOZ_ASSERT(lhs == output);
uint32_t mask = 0;
@ -185,34 +304,61 @@ void MacroAssemblerX86Shared::blendInt16x8(FloatRegister lhs, FloatRegister rhs,
vpblendw(mask, rhs, lhs, lhs);
}
void MacroAssemblerX86Shared::shuffleInt8x16(FloatRegister lhs,
FloatRegister rhs,
FloatRegister output,
const uint8_t lanes[16]) {
ScratchSimd128Scope scratch(asMasm());
void MacroAssemblerX86Shared::shuffleInt8x16(
FloatRegister lhs, FloatRegister rhs, FloatRegister output,
const Maybe<FloatRegister>& maybeFloatTemp,
const Maybe<Register>& maybeTemp, const uint8_t lanes[16]) {
DebugOnly<bool> hasSSSE3 = AssemblerX86Shared::HasSSSE3();
MOZ_ASSERT(hasSSSE3 == !!maybeFloatTemp);
MOZ_ASSERT(!hasSSSE3 == !!maybeTemp);
// Use pshufb instructions to gather the lanes from each source vector.
// A negative index creates a zero lane, so the two vectors can be combined.
// Use pshufb if it is available.
if (AssemblerX86Shared::HasSSSE3()) {
ScratchSimd128Scope scratch(asMasm());
// Register preference: lhs == output.
// Use pshufb instructions to gather the lanes from each source vector.
// A negative index creates a zero lane, so the two vectors can be combined.
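// E.g. a lane index of 20 selects byte 4 of rhs: the lhs mask entry becomes
// -1 (pshufb then produces 0x00 there) while the rhs mask entry becomes
// 20 - 16 = 4, and the final vpor passes the rhs byte through unchanged.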
// Set scratch = lanes from rhs.
int8_t idx[16];
for (unsigned i = 0; i < 16; i++) {
idx[i] = lanes[i] >= 16 ? lanes[i] - 16 : -1;
// Set scratch = lanes from lhs.
int8_t idx[16];
for (unsigned i = 0; i < 16; i++) {
idx[i] = lanes[i] < 16 ? lanes[i] : -1;
}
asMasm().loadConstantSimd128Int(SimdConstant::CreateX16(idx),
*maybeFloatTemp);
FloatRegister lhsCopy = reusedInputInt32x4(lhs, scratch);
vpshufb(*maybeFloatTemp, lhsCopy, scratch);
// Set output = lanes from rhs.
// TODO: The alternative to loading this constant is to complement
// the one that is already in *maybeFloatTemp, which takes two instructions
// and a temp register: PCMPEQD tmp, tmp; PXOR *maybeFloatTemp, tmp.
// Scratch is available here so that's OK, but it's not a given
// that avoiding the load is a win.
for (unsigned i = 0; i < 16; i++) {
idx[i] = lanes[i] >= 16 ? lanes[i] - 16 : -1;
}
asMasm().loadConstantSimd128Int(SimdConstant::CreateX16(idx),
*maybeFloatTemp);
FloatRegister rhsCopy = reusedInputInt32x4(rhs, output);
vpshufb(*maybeFloatTemp, rhsCopy, output);
// Combine.
vpor(scratch, output, output);
return;
}
moveSimd128Int(rhs, scratch);
asMasm().vpshufbSimd128(SimdConstant::CreateX16(idx), scratch);
// Set output = lanes from lhs.
// Worst-case fallback for pre-SSE3 machines. Bounce through memory.
asMasm().reserveStack(3 * Simd128DataSize);
storeAlignedSimd128Int(lhs, Address(StackPointer, Simd128DataSize));
storeAlignedSimd128Int(rhs, Address(StackPointer, 2 * Simd128DataSize));
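// lhs and rhs now sit back to back on the stack, so a lane index in [0, 31]
// indexes directly into the concatenated 32-byte buffer.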
for (unsigned i = 0; i < 16; i++) {
idx[i] = lanes[i] < 16 ? lanes[i] : -1;
load8ZeroExtend(Address(StackPointer, Simd128DataSize + lanes[i]),
*maybeTemp);
store8(*maybeTemp, Address(StackPointer, i));
}
moveSimd128Int(lhs, output);
asMasm().vpshufbSimd128(SimdConstant::CreateX16(idx), output);
// Combine.
vpor(scratch, output, output);
loadAlignedSimd128Int(Address(StackPointer, 0), output);
asMasm().freeStack(3 * Simd128DataSize);
}
static inline FloatRegister ToSimdFloatRegister(const Operand& op) {
@ -232,23 +378,27 @@ void MacroAssemblerX86Shared::compareInt8x16(FloatRegister lhs, Operand rhs,
vpcmpeqb(rhs, lhs, output);
break;
case Assembler::Condition::LessThan:
// This is bad, but Ion does not use it.
// src := rhs
if (rhs.kind() == Operand::FPREG) {
moveSimd128Int(ToSimdFloatRegister(rhs), scratch);
} else {
loadAlignedSimd128Int(rhs, scratch);
}
// src := src > lhs (i.e. lhs < rhs)
// Improve by doing custom lowering (rhs is tied to the output register)
vpcmpgtb(Operand(lhs), scratch, scratch);
moveSimd128Int(scratch, output);
break;
case Assembler::Condition::NotEqual:
// Ideally for notEqual, greaterThanOrEqual, and lessThanOrEqual, we
// should invert the comparison by, e.g. swapping the arms of a select
// if that's what it's used in.
asMasm().loadConstantSimd128Int(allOnes, scratch);
vpcmpeqb(rhs, lhs, output);
asMasm().bitwiseXorSimd128(allOnes, output);
bitwiseXorSimdInt(output, Operand(scratch), output);
break;
case Assembler::Condition::GreaterThanOrEqual:
// This is bad, but Ion does not use it.
// src := rhs
if (rhs.kind() == Operand::FPREG) {
moveSimd128Int(ToSimdFloatRegister(rhs), scratch);
@ -261,8 +411,9 @@ void MacroAssemblerX86Shared::compareInt8x16(FloatRegister lhs, Operand rhs,
break;
case Assembler::Condition::LessThanOrEqual:
// lhs <= rhs is equivalent to !(rhs < lhs), which we compute here.
asMasm().loadConstantSimd128Int(allOnes, scratch);
vpcmpgtb(rhs, lhs, output);
asMasm().bitwiseXorSimd128(allOnes, output);
bitwiseXorSimdInt(output, Operand(scratch), output);
break;
default:
MOZ_CRASH("unexpected condition op");
@ -280,6 +431,7 @@ void MacroAssemblerX86Shared::unsignedCompareInt8x16(
// TODO? Rhs could be in memory (for Ion, anyway), in which case loading it
// into scratch first would be better than loading it twice from memory.
MOZ_ASSERT(AssemblerX86Shared::HasSSE41()); // PMOVZX, PMOVSX
MOZ_ASSERT(lhs == output);
MOZ_ASSERT(lhs != tmp1 && lhs != tmp2);
MOZ_ASSERT_IF(rhs.kind() == Operand::FPREG,
@ -363,23 +515,27 @@ void MacroAssemblerX86Shared::compareInt16x8(FloatRegister lhs, Operand rhs,
vpcmpeqw(rhs, lhs, output);
break;
case Assembler::Condition::LessThan:
// This is bad, but Ion does not use it.
// src := rhs
if (rhs.kind() == Operand::FPREG) {
moveSimd128Int(ToSimdFloatRegister(rhs), scratch);
} else {
loadAlignedSimd128Int(rhs, scratch);
}
// src := src > lhs (i.e. lhs < rhs)
// Improve by doing custom lowering (rhs is tied to the output register)
vpcmpgtw(Operand(lhs), scratch, scratch);
moveSimd128Int(scratch, output);
break;
case Assembler::Condition::NotEqual:
// Ideally for notEqual, greaterThanOrEqual, and lessThanOrEqual, we
// should invert the comparison by, e.g. swapping the arms of a select
// if that's what it's used in.
asMasm().loadConstantSimd128Int(allOnes, scratch);
vpcmpeqw(rhs, lhs, output);
asMasm().bitwiseXorSimd128(allOnes, output);
bitwiseXorSimdInt(output, Operand(scratch), output);
break;
case Assembler::Condition::GreaterThanOrEqual:
// This is bad, but Ion does not use it.
// src := rhs
if (rhs.kind() == Operand::FPREG) {
moveSimd128Int(ToSimdFloatRegister(rhs), scratch);
@ -392,8 +548,9 @@ void MacroAssemblerX86Shared::compareInt16x8(FloatRegister lhs, Operand rhs,
break;
case Assembler::Condition::LessThanOrEqual:
// lhs <= rhs is equivalent to !(rhs < lhs), which we compute here.
asMasm().loadConstantSimd128Int(allOnes, scratch);
vpcmpgtw(rhs, lhs, output);
asMasm().bitwiseXorSimd128(allOnes, output);
bitwiseXorSimdInt(output, Operand(scratch), output);
break;
default:
MOZ_CRASH("unexpected condition op");
@ -405,6 +562,7 @@ void MacroAssemblerX86Shared::unsignedCompareInt16x8(
FloatRegister output, FloatRegister tmp1, FloatRegister tmp2) {
// See comments at unsignedCompareInt8x16.
MOZ_ASSERT(AssemblerX86Shared::HasSSE41()); // PMOVZX, PMOVSX
MOZ_ASSERT(lhs == output);
bool complement = false;
@ -462,43 +620,48 @@ void MacroAssemblerX86Shared::compareInt32x4(FloatRegister lhs, Operand rhs,
ScratchSimd128Scope scratch(asMasm());
switch (cond) {
case Assembler::Condition::GreaterThan:
vpcmpgtd(rhs, lhs, lhs);
packedGreaterThanInt32x4(rhs, lhs);
break;
case Assembler::Condition::Equal:
vpcmpeqd(rhs, lhs, lhs);
packedEqualInt32x4(rhs, lhs);
break;
case Assembler::Condition::LessThan:
// This is bad, but Ion does not use it.
// src := rhs
if (rhs.kind() == Operand::FPREG) {
moveSimd128Int(ToSimdFloatRegister(rhs), scratch);
} else {
loadAlignedSimd128Int(rhs, scratch);
}
// src := src > lhs (i.e. lhs < rhs)
vpcmpgtd(Operand(lhs), scratch, scratch);
// Improve by doing custom lowering (rhs is tied to the output register)
packedGreaterThanInt32x4(Operand(lhs), scratch);
moveSimd128Int(scratch, lhs);
break;
case Assembler::Condition::NotEqual:
vpcmpeqd(rhs, lhs, lhs);
asMasm().bitwiseXorSimd128(allOnes, lhs);
// Ideally for notEqual, greaterThanOrEqual, and lessThanOrEqual, we
// should invert the comparison by, e.g. swapping the arms of a select
// if that's what it's used in.
asMasm().loadConstantSimd128Int(allOnes, scratch);
packedEqualInt32x4(rhs, lhs);
bitwiseXorSimdInt(lhs, Operand(scratch), lhs);
break;
case Assembler::Condition::GreaterThanOrEqual:
// This is bad, but Ion does not use it.
// src := rhs
if (rhs.kind() == Operand::FPREG) {
moveSimd128Int(ToSimdFloatRegister(rhs), scratch);
} else {
loadAlignedSimd128Int(rhs, scratch);
}
vpcmpgtd(Operand(lhs), scratch, scratch);
packedGreaterThanInt32x4(Operand(lhs), scratch);
asMasm().loadConstantSimd128Int(allOnes, lhs);
bitwiseXorSimdInt(lhs, Operand(scratch), lhs);
break;
case Assembler::Condition::LessThanOrEqual:
// lhs <= rhs is equivalent to !(rhs < lhs), which we compute here.
vpcmpgtd(rhs, lhs, lhs);
asMasm().bitwiseXorSimd128(allOnes, lhs);
asMasm().loadConstantSimd128Int(allOnes, scratch);
packedGreaterThanInt32x4(rhs, lhs);
bitwiseXorSimdInt(lhs, Operand(scratch), lhs);
break;
default:
MOZ_CRASH("unexpected condition op");
@ -512,6 +675,7 @@ void MacroAssemblerX86Shared::unsignedCompareInt32x4(
// only have PCMPGTQ on SSE4.2 or later, so for SSE4.1 we need to use subtract
// to compute the flags.
MOZ_ASSERT(AssemblerX86Shared::HasSSE41()); // PMOVZX, PMOVSX
MOZ_ASSERT(lhs == output);
bool complement = false;
@ -586,15 +750,17 @@ void MacroAssemblerX86Shared::compareFloat32x4(FloatRegister lhs, Operand rhs,
}
// Move lhs to output if lhs!=output; move rhs out of the way if rhs==output.
// This is bad, but Ion does not need this fixup.
//
// TODO: The front end really needs to set things up so that this hack is not
// necessary.
ScratchSimd128Scope scratch(asMasm());
if (!lhs.aliases(output)) {
if (rhs.kind() == Operand::FPREG &&
output.aliases(FloatRegister::FromCode(rhs.fpu()))) {
vmovaps(rhs, scratch);
vmovdqa(rhs, scratch);
rhs = Operand(scratch);
}
vmovaps(lhs, output);
vmovdqa(lhs, output);
}
switch (cond) {
@ -612,7 +778,7 @@ void MacroAssemblerX86Shared::compareFloat32x4(FloatRegister lhs, Operand rhs,
break;
case Assembler::Condition::GreaterThanOrEqual:
case Assembler::Condition::GreaterThan:
// We reverse these operations in the -inl.h file so that we don't have to
// We reverse these before register allocation so that we don't have to
// copy into and out of temporaries after codegen.
MOZ_CRASH("should have reversed this");
default:
@ -628,15 +794,17 @@ void MacroAssemblerX86Shared::compareFloat64x2(FloatRegister lhs, Operand rhs,
}
// Move lhs to output if lhs!=output; move rhs out of the way if rhs==output.
// This is bad, but Ion does not need this fixup.
//
// TODO: The front end really needs to set things up so that this hack is not
// necessary.
ScratchSimd128Scope scratch(asMasm());
if (!lhs.aliases(output)) {
if (rhs.kind() == Operand::FPREG &&
output.aliases(FloatRegister::FromCode(rhs.fpu()))) {
vmovapd(rhs, scratch);
vmovdqa(rhs, scratch);
rhs = Operand(scratch);
}
vmovapd(lhs, output);
vmovdqa(lhs, output);
}
switch (cond) {
@ -654,7 +822,7 @@ void MacroAssemblerX86Shared::compareFloat64x2(FloatRegister lhs, Operand rhs,
break;
case Assembler::Condition::GreaterThanOrEqual:
case Assembler::Condition::GreaterThan:
// We reverse these operations in the -inl.h file so that we don't have to
// We reverse these before register allocation so that we don't have to
// copy into and out of temporaries after codegen.
MOZ_CRASH("should have reversed this");
default:
@ -662,6 +830,30 @@ void MacroAssemblerX86Shared::compareFloat64x2(FloatRegister lhs, Operand rhs,
}
}
void MacroAssemblerX86Shared::mulInt32x4(FloatRegister lhs, Operand rhs,
const Maybe<FloatRegister>& temp,
FloatRegister output) {
if (AssemblerX86Shared::HasSSE41()) {
vpmulld(rhs, lhs, output);
return;
}
ScratchSimd128Scope scratch(asMasm());
loadAlignedSimd128Int(rhs, scratch);
vpmuludq(lhs, scratch, scratch);
// scratch contains (Rx, _, Rz, _) where R is the resulting vector.
MOZ_ASSERT(!!temp);
vpshufd(MacroAssembler::ComputeShuffleMask(1, 1, 3, 3), lhs, lhs);
vpshufd(MacroAssembler::ComputeShuffleMask(1, 1, 3, 3), rhs, *temp);
vpmuludq(*temp, lhs, lhs);
// lhs contains (Ry, _, Rw, _) where R is the resulting vector.
vshufps(MacroAssembler::ComputeShuffleMask(0, 2, 0, 2), scratch, lhs, lhs);
// lhs contains (Ry, Rw, Rx, Rz)
vshufps(MacroAssembler::ComputeShuffleMask(2, 0, 3, 1), lhs, lhs, lhs);
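// lhs now contains (Rx, Ry, Rz, Rw), the product in lane order.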
}
// Semantics of wasm max and min.
//
// * -0 < 0
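// * e.g. min(-0, +0) == -0, and min(x, NaN) is a NaN.
//
// The raw minps/maxps instructions instead return their second operand when
// either input is NaN or when both inputs are zero, hence the fixup sequences
// in minMaxFloat32x4 and minMaxFloat64x2 below.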
@ -734,7 +926,8 @@ void MacroAssemblerX86Shared::minMaxFloat32x4(bool isMin, FloatRegister lhs_,
vmovaps(temp1, temp2); // clear NaN lanes of result
vpandn(output, temp2, temp2); // result now in temp2
asMasm().bitwiseAndSimd128(quietBits, temp1); // setup QNaN bits in NaN lanes
asMasm().loadConstantSimd128Float(quietBits, output);
vandps(output, temp1, temp1); // setup QNaN bits in NaN lanes
vorps(temp1, temp2, temp2); // and OR into result
vmovaps(lhs, temp1); // find NaN lanes
vcmpunordps(Operand(temp1), temp1); // in lhs
@ -787,7 +980,8 @@ void MacroAssemblerX86Shared::minMaxFloat64x2(bool isMin, FloatRegister lhs_,
vmovapd(temp1, temp2); // clear NaN lanes of result
vpandn(output, temp2, temp2); // result now in temp2
asMasm().bitwiseAndSimd128(quietBits, temp1); // setup QNaN bits in NaN lanes
asMasm().loadConstantSimd128Float(quietBits, output);
vandpd(output, temp1, temp1); // setup QNaN bits in NaN lanes
vorpd(temp1, temp2, temp2); // and OR into result
vmovapd(lhs, temp1); // find NaN lanes
vcmpunordpd(Operand(temp1), temp1); // in lhs
@ -832,6 +1026,109 @@ void MacroAssemblerX86Shared::maxFloat64x2(FloatRegister lhs, Operand rhs,
minMaxFloat64x2(/*isMin=*/false, lhs, rhs, temp1, temp2, output);
}
void MacroAssemblerX86Shared::negFloat32x4(Operand in, FloatRegister out) {
ScratchSimd128Scope scratch(asMasm());
FloatRegister result = out;
if (in.kind() == Operand::FPREG && ToSimdFloatRegister(in) == out) {
result = scratch;
}
// All zeros but the sign bit
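// XORing with 0x80000000 in each lane flips only the sign bit, which negates
// every lane, NaNs included.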
static const SimdConstant minusZero = SimdConstant::SplatX4(-0.f);
asMasm().loadConstantSimd128Float(minusZero, result);
bitwiseXorFloat32x4(result, in, result);
if (result == scratch) {
moveSimd128Float(result, out);
}
}
void MacroAssemblerX86Shared::negFloat64x2(Operand in, FloatRegister out) {
ScratchSimd128Scope scratch(asMasm());
FloatRegister result = out;
if (in.kind() == Operand::FPREG && ToSimdFloatRegister(in) == out) {
result = scratch;
}
// All zeros but the sign bit
static const SimdConstant minusZero = SimdConstant::SplatX2(-0.0);
asMasm().loadConstantSimd128Float(minusZero, result);
vxorpd(ToSimdFloatRegister(in), result, result);
if (result == scratch) {
moveSimd128Float(result, out);
}
}
void MacroAssemblerX86Shared::notInt8x16(Operand in, FloatRegister out) {
ScratchSimd128Scope scratch(asMasm());
FloatRegister result = out;
if (in.kind() == Operand::FPREG && ToSimdFloatRegister(in) == out) {
result = scratch;
}
static const SimdConstant allOnes = SimdConstant::SplatX16(-1);
asMasm().loadConstantSimd128Int(allOnes, result);
bitwiseXorSimdInt(result, in, result);
if (result == scratch) {
moveSimd128Float(result, out);
}
}
void MacroAssemblerX86Shared::notInt16x8(Operand in, FloatRegister out) {
// Bug, really
MOZ_ASSERT_IF(in.kind() == Operand::FPREG, in.fpu() != out.encoding());
static const SimdConstant allOnes = SimdConstant::SplatX8(-1);
asMasm().loadConstantSimd128Int(allOnes, out);
bitwiseXorSimdInt(out, in, out);
}
void MacroAssemblerX86Shared::notInt32x4(Operand in, FloatRegister out) {
// Bug, really
MOZ_ASSERT_IF(in.kind() == Operand::FPREG, in.fpu() != out.encoding());
static const SimdConstant allOnes = SimdConstant::SplatX4(-1);
asMasm().loadConstantSimd128Int(allOnes, out);
bitwiseXorSimdInt(out, in, out);
}
void MacroAssemblerX86Shared::notFloat32x4(Operand in, FloatRegister out) {
// Bug, really
MOZ_ASSERT_IF(in.kind() == Operand::FPREG, in.fpu() != out.encoding());
float ones = SpecificNaN<float>(1, FloatingPoint<float>::kSignificandBits);
static const SimdConstant allOnes = SimdConstant::SplatX4(ones);
asMasm().loadConstantSimd128Float(allOnes, out);
bitwiseXorFloat32x4(out, in, out);
}
void MacroAssemblerX86Shared::absFloat32x4(Operand in, FloatRegister out) {
ScratchSimd128Scope scratch(asMasm());
FloatRegister result = out;
if (in.kind() == Operand::FPREG && ToSimdFloatRegister(in) == out) {
result = scratch;
}
// All ones but the sign bit
float signMask =
SpecificNaN<float>(0, FloatingPoint<float>::kSignificandBits);
static const SimdConstant signMasks = SimdConstant::SplatX4(signMask);
asMasm().loadConstantSimd128Float(signMasks, result);
bitwiseAndFloat32x4(result, in, result);
if (result == scratch) {
moveSimd128Float(result, out);
}
}
void MacroAssemblerX86Shared::absFloat64x2(Operand in, FloatRegister out) {
ScratchSimd128Scope scratch(asMasm());
FloatRegister result = out;
if (in.kind() == Operand::FPREG && ToSimdFloatRegister(in) == out) {
result = scratch;
}
// All ones but the sign bit
double signMask =
SpecificNaN<double>(0, FloatingPoint<double>::kSignificandBits);
static const SimdConstant signMasks = SimdConstant::SplatX2(signMask);
asMasm().loadConstantSimd128Float(signMasks, result);
vandpd(ToSimdFloatRegister(in), result, result);
if (result == scratch) {
moveSimd128Float(result, out);
}
}
static inline void MaskSimdShiftCount(MacroAssembler& masm, unsigned shiftmask,
Register count, Register temp,
FloatRegister dest) {
@ -878,7 +1175,9 @@ void MacroAssemblerX86Shared::packedLeftShiftByScalarInt8x16(
void MacroAssemblerX86Shared::packedLeftShiftByScalarInt8x16(
Imm32 count, FloatRegister src, FloatRegister dest) {
MOZ_ASSERT(count.value <= 7);
asMasm().moveSimd128(src, dest);
if (src != dest) {
asMasm().moveSimd128(src, dest);
}
// Use the doubling trick for low shift counts, otherwise mask off the bits
// that are shifted out of the low byte of each word and use word shifts. The
// optimal cutoff remains to be explored.
@ -887,8 +1186,11 @@ void MacroAssemblerX86Shared::packedLeftShiftByScalarInt8x16(
asMasm().addInt8x16(dest, dest);
}
} else {
asMasm().bitwiseAndSimd128(SimdConstant::SplatX16(0xFF >> count.value),
dest);
ScratchSimd128Scope scratch(asMasm());
// Whether SplatX8 or SplatX16 is best probably depends on the constant.
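// E.g. for count == 5 the mask is 0xFF >> 5 == 0x07 per byte, so no bits can
// spill into the neighboring byte, and a single 16-bit vpsllw then shifts all
// byte lanes at once.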
asMasm().loadConstantSimd128Int(SimdConstant::SplatX16(0xFF >> count.value),
scratch);
vpand(Operand(scratch), dest, dest);
vpsllw(count, dest, dest);
}
}
@ -909,8 +1211,7 @@ void MacroAssemblerX86Shared::packedRightShiftByScalarInt8x16(
asMasm().moveSimd128(src, scratch);
vpslldq(Imm32(1), scratch, scratch); // Low bytes -> high bytes
vpsraw(Imm32(count.value + 8), scratch, scratch); // Shift low bytes
asMasm().moveSimd128(src, dest);
vpsraw(count, dest, dest); // Shift high bytes
vpsraw(count, dest, dest); // Shift high bytes
asMasm().loadConstantSimd128Int(SimdConstant::SplatX8(0xFF00), temp);
bitwiseAndSimdInt(dest, Operand(temp), dest); // Keep high bytes
bitwiseAndNotSimdInt(temp, Operand(scratch), temp); // Keep low bytes
@ -928,9 +1229,14 @@ void MacroAssemblerX86Shared::packedUnsignedRightShiftByScalarInt8x16(
void MacroAssemblerX86Shared::packedUnsignedRightShiftByScalarInt8x16(
Imm32 count, FloatRegister src, FloatRegister dest) {
MOZ_ASSERT(count.value <= 7);
asMasm().moveSimd128(src, dest);
asMasm().bitwiseAndSimd128(
SimdConstant::SplatX16((0xFF << count.value) & 0xFF), dest);
if (src != dest) {
asMasm().moveSimd128(src, dest);
}
ScratchSimd128Scope scratch(asMasm());
// Whether SplatX8 or SplatX16 is best probably depends on the constant.
asMasm().loadConstantSimd128Int(
SimdConstant::SplatX16((0xFF << count.value) & 0xFF), scratch);
vpand(Operand(scratch), dest, dest);
vpsrlw(count, dest, dest);
}
@ -1023,7 +1329,9 @@ void MacroAssemblerX86Shared::packedRightShiftByScalarInt64x2(
SimdConstant::SplatX2(int64_t(0xFFFFFFFF00000000LL)), scratch);
// Compute low dwords (high dwords at most have clear high bits where the
// result will have set low high bits)
asMasm().moveSimd128(src, dest);
if (src != dest) {
asMasm().moveSimd128(src, dest);
}
vpsrlq(count, dest, dest);
// Merge the parts
vpor(scratch, dest, dest);
@ -1034,11 +1342,12 @@ void MacroAssemblerX86Shared::selectSimd128(FloatRegister mask,
FloatRegister onFalse,
FloatRegister temp,
FloatRegister output) {
// Normally the codegen will attempt to enforce these register assignments so
// that the moves are avoided.
asMasm().moveSimd128Int(onTrue, output);
asMasm().moveSimd128Int(mask, temp);
if (onTrue != output) {
vmovaps(onTrue, output);
}
if (mask != temp) {
vmovaps(mask, temp);
}
// SSE4.1 has plain blendvps which can do this, but it is awkward
// to use because it requires the mask to be in xmm0.
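// The bitwise fallback implements the v128.bitselect semantics directly:
// output = (onTrue & mask) | (onFalse & ~mask).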
@ -1053,7 +1362,9 @@ void MacroAssemblerX86Shared::selectSimd128(FloatRegister mask,
void MacroAssemblerX86Shared::unsignedConvertInt32x4ToFloat32x4(
FloatRegister src, FloatRegister dest) {
ScratchSimd128Scope scratch(asMasm());
asMasm().moveSimd128Int(src, dest);
if (src != dest) {
vmovaps(src, dest);
}
vpxor(Operand(scratch), scratch, scratch); // extract low bits
vpblendw(0x55, dest, scratch, scratch); // into scratch
vpsubd(Operand(scratch), dest, dest); // and high bits into dest
@ -1067,7 +1378,9 @@ void MacroAssemblerX86Shared::unsignedConvertInt32x4ToFloat32x4(
void MacroAssemblerX86Shared::truncSatFloat32x4ToInt32x4(FloatRegister src,
FloatRegister dest) {
ScratchSimd128Scope scratch(asMasm());
asMasm().moveSimd128Float(src, dest);
if (src != dest) {
vmovaps(src, dest);
}
// The cvttps2dq instruction is the workhorse but does not handle NaN or out
// of range values as we need it to. We want to saturate too-large positive
@ -1101,7 +1414,9 @@ void MacroAssemblerX86Shared::truncSatFloat32x4ToInt32x4(FloatRegister src,
void MacroAssemblerX86Shared::unsignedTruncSatFloat32x4ToInt32x4(
FloatRegister src, FloatRegister temp, FloatRegister dest) {
ScratchSimd128Scope scratch(asMasm());
asMasm().moveSimd128Float(src, dest);
if (src != dest) {
vmovaps(src, dest);
}
// The cvttps2dq instruction is the workhorse but does not handle NaN or out
// of range values as we need it to. We want to saturate too-large positive

View File

@ -1114,11 +1114,7 @@ void MacroAssembler::zeroSimd128(FloatRegister dest) {
void MacroAssembler::loadConstantSimd128(const SimdConstant& v,
FloatRegister dest) {
if (v.isFloatingType()) {
loadConstantSimd128Float(v, dest);
} else {
loadConstantSimd128Int(v, dest);
}
loadConstantSimd128Int(v, dest);
}
// Splat
@ -1190,34 +1186,35 @@ void MacroAssembler::extractLaneFloat64x2(uint32_t lane, FloatRegister src,
void MacroAssembler::replaceLaneInt8x16(unsigned lane, Register rhs,
FloatRegister lhsDest) {
vpinsrb(lane, rhs, lhsDest, lhsDest);
MacroAssemblerX86Shared::insertLaneSimdInt(lhsDest, rhs, lhsDest, lane, 16);
}
void MacroAssembler::replaceLaneInt16x8(unsigned lane, Register rhs,
FloatRegister lhsDest) {
vpinsrw(lane, rhs, lhsDest, lhsDest);
MacroAssemblerX86Shared::insertLaneSimdInt(lhsDest, rhs, lhsDest, lane, 8);
}
void MacroAssembler::replaceLaneInt32x4(unsigned lane, Register rhs,
FloatRegister lhsDest) {
vpinsrd(lane, rhs, lhsDest, lhsDest);
MacroAssemblerX86Shared::insertLaneSimdInt(lhsDest, rhs, lhsDest, lane, 4);
}
void MacroAssembler::replaceLaneFloat32x4(unsigned lane, FloatRegister rhs,
FloatRegister lhsDest) {
MacroAssemblerX86Shared::replaceLaneFloat32x4(rhs, lhsDest, lane);
MacroAssemblerX86Shared::insertLaneFloat32x4(lhsDest, rhs, lhsDest, lane);
}
void MacroAssembler::replaceLaneFloat64x2(unsigned lane, FloatRegister rhs,
FloatRegister lhsDest) {
MacroAssemblerX86Shared::replaceLaneFloat64x2(rhs, lhsDest, lane);
MacroAssemblerX86Shared::insertLaneFloat64x2(lhsDest, rhs, lhsDest, lane);
}
// Shuffle - permute with immediate indices
void MacroAssembler::shuffleInt8x16(const uint8_t lanes[16], FloatRegister rhs,
FloatRegister lhsDest) {
MacroAssemblerX86Shared::shuffleInt8x16(lhsDest, rhs, lhsDest, lanes);
FloatRegister lhsDest, FloatRegister temp) {
MacroAssemblerX86Shared::shuffleInt8x16(
lhsDest, rhs, lhsDest, mozilla::Some(temp), mozilla::Nothing(), lanes);
}
void MacroAssembler::blendInt8x16(const uint8_t lanes[16], FloatRegister rhs,
@ -1265,7 +1262,9 @@ void MacroAssembler::permuteInt8x16(const uint8_t lanes[16], FloatRegister src,
ScratchSimd128Scope scratch(*this);
loadConstantSimd128Int(SimdConstant::CreateX16((const int8_t*)lanes),
scratch);
moveSimd128Int(src, dest);
if (src != dest) {
vmovaps(src, dest);
}
vpshufb(scratch, dest, dest);
}
@ -1297,13 +1296,17 @@ void MacroAssembler::concatAndRightShiftInt8x16(FloatRegister rhs,
void MacroAssembler::leftShiftSimd128(Imm32 count, FloatRegister src,
FloatRegister dest) {
moveSimd128(src, dest);
if (src != dest) {
moveSimd128(src, dest);
}
vpslldq(count, dest, dest);
}
void MacroAssembler::rightShiftSimd128(Imm32 count, FloatRegister src,
FloatRegister dest) {
moveSimd128(src, dest);
if (src != dest) {
moveSimd128(src, dest);
}
vpsrldq(count, dest, dest);
}
@ -1382,7 +1385,7 @@ void MacroAssembler::swizzleInt8x16(FloatRegister rhs, FloatRegister lhsDest,
FloatRegister temp) {
ScratchSimd128Scope scratch(*this);
loadConstantSimd128Int(SimdConstant::SplatX16(15), scratch);
moveSimd128Int(rhs, temp);
vmovapd(rhs, temp);
vpcmpgtb(Operand(scratch), temp, temp); // set high bit
vpor(Operand(rhs), temp, temp); // for values > 15
vpshufb(temp, lhsDest, lhsDest); // permute
@ -1461,7 +1464,7 @@ void MacroAssembler::mulInt64x2(FloatRegister rhs, FloatRegister lhsDest,
void MacroAssembler::negInt8x16(FloatRegister src, FloatRegister dest) {
ScratchSimd128Scope scratch(*this);
if (src == dest) {
moveSimd128Int(src, scratch);
vmovaps(src, scratch);
src = scratch;
}
vpxor(Operand(dest), dest, dest);
@ -1471,7 +1474,7 @@ void MacroAssembler::negInt8x16(FloatRegister src, FloatRegister dest) {
void MacroAssembler::negInt16x8(FloatRegister src, FloatRegister dest) {
ScratchSimd128Scope scratch(*this);
if (src == dest) {
moveSimd128Int(src, scratch);
vmovaps(src, scratch);
src = scratch;
}
vpxor(Operand(dest), dest, dest);
@ -1481,7 +1484,7 @@ void MacroAssembler::negInt16x8(FloatRegister src, FloatRegister dest) {
void MacroAssembler::negInt32x4(FloatRegister src, FloatRegister dest) {
ScratchSimd128Scope scratch(*this);
if (src == dest) {
moveSimd128Int(src, scratch);
vmovaps(src, scratch);
src = scratch;
}
vpxor(Operand(dest), dest, dest);
@ -1491,7 +1494,7 @@ void MacroAssembler::negInt32x4(FloatRegister src, FloatRegister dest) {
void MacroAssembler::negInt64x2(FloatRegister src, FloatRegister dest) {
ScratchSimd128Scope scratch(*this);
if (src == dest) {
moveSimd128Int(src, scratch);
vmovaps(src, scratch);
src = scratch;
}
vpxor(Operand(dest), dest, dest);
@ -1643,7 +1646,9 @@ void MacroAssembler::leftShiftInt16x8(Register rhs, FloatRegister lhsDest,
void MacroAssembler::leftShiftInt16x8(Imm32 count, FloatRegister src,
FloatRegister dest) {
moveSimd128(src, dest);
if (src != dest) {
moveSimd128(src, dest);
}
vpsllw(count, src, dest);
}
@ -1655,7 +1660,9 @@ void MacroAssembler::leftShiftInt32x4(Register rhs, FloatRegister lhsDest,
void MacroAssembler::leftShiftInt32x4(Imm32 count, FloatRegister src,
FloatRegister dest) {
moveSimd128(src, dest);
if (src != dest) {
moveSimd128(src, dest);
}
vpslld(count, src, dest);
}
@ -1667,7 +1674,9 @@ void MacroAssembler::leftShiftInt64x2(Register rhs, FloatRegister lhsDest,
void MacroAssembler::leftShiftInt64x2(Imm32 count, FloatRegister src,
FloatRegister dest) {
moveSimd128(src, dest);
if (src != dest) {
moveSimd128(src, dest);
}
vpsllq(count, src, dest);
}
@ -1707,7 +1716,9 @@ void MacroAssembler::rightShiftInt16x8(Register rhs, FloatRegister lhsDest,
void MacroAssembler::rightShiftInt16x8(Imm32 count, FloatRegister src,
FloatRegister dest) {
moveSimd128(src, dest);
if (src != dest) {
moveSimd128(src, dest);
}
vpsraw(count, src, dest);
}
@ -1720,7 +1731,9 @@ void MacroAssembler::unsignedRightShiftInt16x8(Register rhs,
void MacroAssembler::unsignedRightShiftInt16x8(Imm32 count, FloatRegister src,
FloatRegister dest) {
moveSimd128(src, dest);
if (src != dest) {
moveSimd128(src, dest);
}
vpsrlw(count, src, dest);
}
@ -1732,7 +1745,9 @@ void MacroAssembler::rightShiftInt32x4(Register rhs, FloatRegister lhsDest,
void MacroAssembler::rightShiftInt32x4(Imm32 count, FloatRegister src,
FloatRegister dest) {
moveSimd128(src, dest);
if (src != dest) {
moveSimd128(src, dest);
}
vpsrad(count, src, dest);
}
@ -1745,7 +1760,9 @@ void MacroAssembler::unsignedRightShiftInt32x4(Register rhs,
void MacroAssembler::unsignedRightShiftInt32x4(Imm32 count, FloatRegister src,
FloatRegister dest) {
moveSimd128(src, dest);
if (src != dest) {
moveSimd128(src, dest);
}
vpsrld(count, src, dest);
}
@ -1763,7 +1780,9 @@ void MacroAssembler::unsignedRightShiftInt64x2(Register rhs,
void MacroAssembler::unsignedRightShiftInt64x2(Imm32 count, FloatRegister src,
FloatRegister dest) {
moveSimd128(src, dest);
if (src != dest) {
moveSimd128(src, dest);
}
vpsrlq(count, src, dest);
}
@ -1785,8 +1804,7 @@ void MacroAssembler::bitwiseXorSimd128(FloatRegister rhs,
}
void MacroAssembler::bitwiseNotSimd128(FloatRegister src, FloatRegister dest) {
moveSimd128(src, dest);
bitwiseXorSimd128(SimdConstant::SplatX16(-1), dest);
MacroAssemblerX86Shared::notInt8x16(Operand(src), dest);
}
// Bitwise and-not
@ -1914,25 +1932,21 @@ void MacroAssembler::storeUnalignedSimd128(FloatRegister src,
// Floating point negation
void MacroAssembler::negFloat32x4(FloatRegister src, FloatRegister dest) {
moveSimd128(src, dest);
bitwiseXorSimd128(SimdConstant::SplatX4(-0.f), dest);
MacroAssemblerX86Shared::negFloat32x4(Operand(src), dest);
}
void MacroAssembler::negFloat64x2(FloatRegister src, FloatRegister dest) {
moveSimd128(src, dest);
bitwiseXorSimd128(SimdConstant::SplatX2(-0.0), dest);
MacroAssemblerX86Shared::negFloat64x2(Operand(src), dest);
}
// Floating point absolute value
void MacroAssembler::absFloat32x4(FloatRegister src, FloatRegister dest) {
moveSimd128(src, dest);
bitwiseAndSimd128(SimdConstant::SplatX4(0x7FFFFFFF), dest);
MacroAssemblerX86Shared::absFloat32x4(Operand(src), dest);
}
void MacroAssembler::absFloat64x2(FloatRegister src, FloatRegister dest) {
moveSimd128(src, dest);
bitwiseAndSimd128(SimdConstant::SplatX2(int64_t(0x7FFFFFFFFFFFFFFFll)), dest);
MacroAssemblerX86Shared::absFloat64x2(Operand(src), dest);
}
// NaN-propagating minimum
@ -1965,36 +1979,38 @@ void MacroAssembler::maxFloat64x2(FloatRegister rhs, FloatRegister lhsDest,
// Compare-based minimum
void MacroAssembler::pseudoMinFloat32x4(FloatRegister rhsOrRhsDest,
FloatRegister lhsOrLhsDest) {
// Shut up the linter by using the same names as in the declaration, then
// aliasing here.
FloatRegister rhsDest = rhsOrRhsDest;
FloatRegister lhs = lhsOrLhsDest;
vminps(Operand(lhs), rhsDest, rhsDest);
void MacroAssembler::pseudoMinFloat32x4(FloatRegister rhs,
FloatRegister lhsDest) {
ScratchSimd128Scope scratch(*this);
vmovaps(rhs, scratch);
vminps(Operand(lhsDest), scratch, scratch);
vmovaps(scratch, lhsDest);
}
void MacroAssembler::pseudoMinFloat64x2(FloatRegister rhsOrRhsDest,
FloatRegister lhsOrLhsDest) {
FloatRegister rhsDest = rhsOrRhsDest;
FloatRegister lhs = lhsOrLhsDest;
vminpd(Operand(lhs), rhsDest, rhsDest);
void MacroAssembler::pseudoMinFloat64x2(FloatRegister rhs,
FloatRegister lhsDest) {
ScratchSimd128Scope scratch(*this);
vmovapd(rhs, scratch);
vminpd(Operand(lhsDest), scratch, scratch);
vmovapd(scratch, lhsDest);
}
// Compare-based maximum
void MacroAssembler::pseudoMaxFloat32x4(FloatRegister rhsOrRhsDest,
FloatRegister lhsOrLhsDest) {
FloatRegister rhsDest = rhsOrRhsDest;
FloatRegister lhs = lhsOrLhsDest;
vmaxps(Operand(lhs), rhsDest, rhsDest);
void MacroAssembler::pseudoMaxFloat32x4(FloatRegister rhs,
FloatRegister lhsDest) {
ScratchSimd128Scope scratch(*this);
vmovaps(rhs, scratch);
vmaxps(Operand(lhsDest), scratch, scratch);
vmovaps(scratch, lhsDest);
}
void MacroAssembler::pseudoMaxFloat64x2(FloatRegister rhsOrRhsDest,
FloatRegister lhsOrLhsDest) {
FloatRegister rhsDest = rhsOrRhsDest;
FloatRegister lhs = lhsOrLhsDest;
vmaxpd(Operand(lhs), rhsDest, rhsDest);
void MacroAssembler::pseudoMaxFloat64x2(FloatRegister rhs,
FloatRegister lhsDest) {
ScratchSimd128Scope scratch(*this);
vmovapd(rhs, scratch);
vmaxpd(Operand(lhsDest), scratch, scratch);
vmovapd(scratch, lhsDest);
}
// Widening/pairwise integer dot product

View File

@ -408,13 +408,19 @@ class MacroAssemblerX86Shared : public Assembler {
void extractLaneInt8x16(FloatRegister input, Register output, unsigned lane,
SimdSign sign);
void replaceLaneFloat32x4(FloatRegister rhs, FloatRegister lhsDest,
unsigned lane);
void replaceLaneFloat64x2(FloatRegister rhs, FloatRegister lhsDest,
unsigned lane);
void insertLaneSimdInt(FloatRegister input, Register value,
FloatRegister output, unsigned lane,
unsigned numLanes);
void insertLaneFloat32x4(FloatRegister input, FloatRegister value,
FloatRegister output, unsigned lane);
void insertLaneFloat64x2(FloatRegister input, FloatRegister value,
FloatRegister output, unsigned lane);
void shuffleInt8x16(FloatRegister lhs, FloatRegister rhs,
FloatRegister output, const uint8_t lanes[16]);
FloatRegister output,
const mozilla::Maybe<FloatRegister>& maybeFloatTemp,
const mozilla::Maybe<Register>& maybeTemp,
const uint8_t lanes[16]);
void blendInt8x16(FloatRegister lhs, FloatRegister rhs, FloatRegister output,
FloatRegister temp, const uint8_t lanes[16]);
void blendInt16x8(FloatRegister lhs, FloatRegister rhs, FloatRegister output,
@ -440,6 +446,18 @@ class MacroAssemblerX86Shared : public Assembler {
void compareFloat64x2(FloatRegister lhs, Operand rhs,
Assembler::Condition cond, FloatRegister output);
void mulInt32x4(FloatRegister lhs, Operand rhs,
const mozilla::Maybe<FloatRegister>& temp,
FloatRegister output);
void negFloat32x4(Operand in, FloatRegister out);
void negFloat64x2(Operand in, FloatRegister out);
void notInt8x16(Operand in, FloatRegister out);
void notInt16x8(Operand in, FloatRegister out);
void notInt32x4(Operand in, FloatRegister out);
void notFloat32x4(Operand in, FloatRegister out);
void minMaxFloat32x4(bool isMin, FloatRegister lhs, Operand rhs,
FloatRegister temp1, FloatRegister temp2,
FloatRegister output);
@ -456,6 +474,9 @@ class MacroAssemblerX86Shared : public Assembler {
void maxFloat64x2(FloatRegister lhs, Operand rhs, FloatRegister temp1,
FloatRegister temp2, FloatRegister output);
void absFloat32x4(Operand in, FloatRegister out);
void absFloat64x2(Operand in, FloatRegister out);
void packedShiftByScalarInt8x16(
FloatRegister in, Register count, Register temp, FloatRegister xtmp,
FloatRegister dest,
@ -558,12 +579,9 @@ class MacroAssemblerX86Shared : public Assembler {
vmovdqa(src, Operand(dest));
}
void moveSimd128Int(FloatRegister src, FloatRegister dest) {
if (src != dest) {
vmovdqa(src, dest);
}
vmovdqa(src, dest);
}
FloatRegister reusedInputInt32x4(FloatRegister src, FloatRegister dest) {
MOZ_ASSERT(src.isSimd128() && dest.isSimd128());
if (HasAVX()) {
return src;
}
@ -588,6 +606,12 @@ class MacroAssemblerX86Shared : public Assembler {
void storeUnalignedSimd128Int(FloatRegister src, const Operand& dest) {
vmovdqu(src, dest);
}
void packedEqualInt32x4(const Operand& src, FloatRegister dest) {
vpcmpeqd(src, dest, dest);
}
void packedGreaterThanInt32x4(const Operand& src, FloatRegister dest) {
vpcmpgtd(src, dest, dest);
}
void packedLeftShiftByScalarInt16x8(Imm32 count, FloatRegister dest) {
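    // Wasm defines vector shift counts modulo the lane width, so masking
    // the immediate with 15 gives exactly the i16x8 shift semantics.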
count.value &= 15;
vpsllw(count, dest, dest);
@ -624,12 +648,9 @@ class MacroAssemblerX86Shared : public Assembler {
vmovaps(src, Operand(dest));
}
void moveSimd128Float(FloatRegister src, FloatRegister dest) {
if (src != dest) {
vmovaps(src, dest);
}
vmovaps(src, dest);
}
FloatRegister reusedInputSimd128Float(FloatRegister src, FloatRegister dest) {
MOZ_ASSERT(src.isSimd128() && dest.isSimd128());
if (HasAVX()) {
return src;
}
@ -719,7 +740,6 @@ class MacroAssemblerX86Shared : public Assembler {
}
FloatRegister reusedInputAlignedInt32x4(const Operand& src,
FloatRegister dest) {
MOZ_ASSERT(dest.isSimd128());
if (HasAVX() && src.kind() == Operand::FPREG) {
return FloatRegister::FromCode(src.fpu());
}
@ -757,7 +777,6 @@ class MacroAssemblerX86Shared : public Assembler {
}
FloatRegister reusedInputAlignedSimd128Float(const Operand& src,
FloatRegister dest) {
MOZ_ASSERT(dest.isSimd128());
if (HasAVX() && src.kind() == Operand::FPREG) {
return FloatRegister::FromCode(src.fpu());
}
@ -913,11 +932,13 @@ class MacroAssemblerX86Shared : public Assembler {
}
bool maybeInlineSimd128Int(const SimdConstant& v, const FloatRegister& dest) {
if (v.isZeroBits()) {
static const SimdConstant zero = SimdConstant::SplatX4(0);
static const SimdConstant minusOne = SimdConstant::SplatX4(-1);
if (v == zero) {
zeroSimd128Int(dest);
return true;
}
if (v.isOneBits()) {
if (v == minusOne) {
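      // Comparing dest with itself makes every 16-bit lane compare equal,
      // setting all 128 bits; this materializes the all-ones constant
      // without a memory load.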
vpcmpeqw(Operand(dest), dest, dest);
return true;
}
@ -925,7 +946,10 @@ class MacroAssemblerX86Shared : public Assembler {
}
bool maybeInlineSimd128Float(const SimdConstant& v,
const FloatRegister& dest) {
if (v.isZeroBits()) {
static const SimdConstant zero = SimdConstant::SplatX4(0.f);
if (v == zero) {
// This won't get inlined if the SimdConstant v contains -0 in any
// lane, as operator== here does a memcmp.
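      // (0.0f has bit pattern 0x00000000 while -0.0f is 0x80000000, so the
      // bitwise equality check correctly rejects constants with a -0 lane;
      // zeroing the register always yields +0 in every lane.)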
zeroSimd128Float(dest);
return true;
}

View File

@ -1019,11 +1019,6 @@ void MacroAssembler::bitwiseAndSimd128(const SimdConstant& rhs,
vpandSimd128(rhs, lhsDest);
}
void MacroAssembler::bitwiseXorSimd128(const SimdConstant& rhs,
FloatRegister lhsDest) {
vpxorSimd128(rhs, lhsDest);
}
// ========================================================================
// Truncate floating point.

View File

@ -86,26 +86,6 @@ void MacroAssemblerX86::vpandSimd128(const SimdConstant& v,
propagateOOM(val->uses.append(CodeOffset(masm.size())));
}
void MacroAssemblerX86::vpxorSimd128(const SimdConstant& v,
FloatRegister srcDest) {
SimdData* val = getSimdData(v);
if (!val) {
return;
}
masm.vpxor_mr(nullptr, srcDest.encoding(), srcDest.encoding());
propagateOOM(val->uses.append(CodeOffset(masm.size())));
}
void MacroAssemblerX86::vpshufbSimd128(const SimdConstant& v,
FloatRegister srcDest) {
SimdData* val = getSimdData(v);
if (!val) {
return;
}
masm.vpshufb_mr(nullptr, srcDest.encoding(), srcDest.encoding());
propagateOOM(val->uses.append(CodeOffset(masm.size())));
}
void MacroAssemblerX86::finish() {
// Last instruction may be an indirect jump so eagerly insert an undefined
// instruction byte to prevent processors from decoding data values into
@ -615,12 +595,6 @@ void MacroAssembler::wasmLoad(const wasm::MemoryAccessDesc& access,
MOZ_ASSERT(srcAddr.kind() == Operand::MEM_REG_DISP ||
srcAddr.kind() == Operand::MEM_SCALE);
MOZ_ASSERT_IF(
access.isZeroExtendSimd128Load(),
access.type() == Scalar::Float32 || access.type() == Scalar::Float64);
MOZ_ASSERT_IF(access.isSplatSimd128Load(), access.type() == Scalar::Float64);
MOZ_ASSERT_IF(access.isWidenSimd128Load(), access.type() == Scalar::Float64);
memoryBarrierBefore(access.sync());
append(access, size());
@ -642,39 +616,12 @@ void MacroAssembler::wasmLoad(const wasm::MemoryAccessDesc& access,
movl(srcAddr, out.gpr());
break;
case Scalar::Float32:
// vmovss does the right thing also for access.isZeroExtendSimd128Load()
// vmovss does the right thing also for access.isZeroExtendSimdLoad()
vmovss(srcAddr, out.fpu());
break;
case Scalar::Float64:
if (access.isSplatSimd128Load()) {
vmovddup(srcAddr, out.fpu());
} else if (access.isWidenSimd128Load()) {
switch (access.widenSimdOp()) {
case wasm::SimdOp::I16x8LoadS8x8:
vpmovsxbw(srcAddr, out.fpu());
break;
case wasm::SimdOp::I16x8LoadU8x8:
vpmovzxbw(srcAddr, out.fpu());
break;
case wasm::SimdOp::I32x4LoadS16x4:
vpmovsxwd(srcAddr, out.fpu());
break;
case wasm::SimdOp::I32x4LoadU16x4:
vpmovzxwd(srcAddr, out.fpu());
break;
case wasm::SimdOp::I64x2LoadS32x2:
vpmovsxdq(srcAddr, out.fpu());
break;
case wasm::SimdOp::I64x2LoadU32x2:
vpmovzxdq(srcAddr, out.fpu());
break;
default:
MOZ_CRASH("Unexpected widening op for wasmLoad");
}
} else {
// vmovsd does the right thing also for access.isZeroExtendSimd128Load()
vmovsd(srcAddr, out.fpu());
}
// vmovsd does the right thing also for access.isZeroExtendSimdLoad()
vmovsd(srcAddr, out.fpu());
break;
case Scalar::Simd128:
vmovups(srcAddr, out.fpu());
@ -696,9 +643,6 @@ void MacroAssembler::wasmLoadI64(const wasm::MemoryAccessDesc& access,
MOZ_ASSERT_IF(access.isAtomic(), access.byteSize() <= 4);
MOZ_ASSERT(srcAddr.kind() == Operand::MEM_REG_DISP ||
srcAddr.kind() == Operand::MEM_SCALE);
MOZ_ASSERT(!access.isZeroExtendSimd128Load()); // Use wasmLoad()
MOZ_ASSERT(!access.isSplatSimd128Load()); // Use wasmLoad()
MOZ_ASSERT(!access.isWidenSimd128Load()); // Use wasmLoad()
memoryBarrierBefore(access.sync());

View File

@ -876,8 +876,6 @@ class MacroAssemblerX86 : public MacroAssemblerX86Shared {
void loadConstantSimd128Int(const SimdConstant& v, FloatRegister dest);
void loadConstantSimd128Float(const SimdConstant& v, FloatRegister dest);
void vpandSimd128(const SimdConstant& v, FloatRegister srcDest);
void vpxorSimd128(const SimdConstant& v, FloatRegister srcDest);
void vpshufbSimd128(const SimdConstant& v, FloatRegister srcDest);
Condition testInt32Truthy(bool truthy, const ValueOperand& operand) {
test32(operand.payloadReg(), operand.payloadReg());

View File

@ -192,7 +192,6 @@ class BaseStackFrame;
enum class UseABI { Wasm, Builtin, System };
enum class InterModule { False = false, True = true };
enum class RhsDestOp { True = true };
#if defined(JS_CODEGEN_NONE)
# define RABALDR_SCRATCH_I32
@ -8240,10 +8239,6 @@ class BaseCompiler final : public BaseCompilerInterface {
void emitVectorBinop(void (*op)(MacroAssembler& masm, RhsType src,
LhsDestType srcDest));
template <typename RhsDestType, typename LhsType>
void emitVectorBinop(void (*op)(MacroAssembler& masm, RhsDestType src,
LhsType srcDest, RhsDestOp));
template <typename RhsType, typename LhsDestType, typename TempType>
void emitVectorBinop(void (*)(MacroAssembler& masm, RhsType rs,
LhsDestType rsd, TempType temp));
@ -12978,25 +12973,6 @@ static void MaxF64x2(MacroAssembler& masm, RegV128 rs, RegV128 rsd,
masm.maxFloat64x2(rs, rsd, temp1, temp2);
}
static void PMinF32x4(MacroAssembler& masm, RegV128 rsd, RegV128 rs,
RhsDestOp) {
masm.pseudoMinFloat32x4(rsd, rs);
}
static void PMinF64x2(MacroAssembler& masm, RegV128 rsd, RegV128 rs,
RhsDestOp) {
masm.pseudoMinFloat64x2(rsd, rs);
}
static void PMaxF32x4(MacroAssembler& masm, RegV128 rsd, RegV128 rs,
RhsDestOp) {
masm.pseudoMaxFloat32x4(rsd, rs);
}
static void PMaxF64x2(MacroAssembler& masm, RegV128 rsd, RegV128 rs,
RhsDestOp) {
masm.pseudoMaxFloat64x2(rsd, rs);
}
# elif defined(JS_CODEGEN_ARM64)
static void MinF32x4(MacroAssembler& masm, RegV128 rs, RegV128 rsd) {
masm.minFloat32x4(rs, rsd);
@ -13013,6 +12989,7 @@ static void MaxF32x4(MacroAssembler& masm, RegV128 rs, RegV128 rsd) {
static void MaxF64x2(MacroAssembler& masm, RegV128 rs, RegV128 rsd) {
masm.maxFloat64x2(rs, rsd);
}
# endif
static void PMinF32x4(MacroAssembler& masm, RegV128 rs, RegV128 rsd) {
masm.pseudoMinFloat32x4(rs, rsd);
@ -13029,7 +13006,6 @@ static void PMaxF32x4(MacroAssembler& masm, RegV128 rs, RegV128 rsd) {
static void PMaxF64x2(MacroAssembler& masm, RegV128 rs, RegV128 rsd) {
masm.pseudoMaxFloat64x2(rs, rsd);
}
# endif
static void DotI16x8(MacroAssembler& masm, RegV128 rs, RegV128 rsd) {
masm.widenDotInt16x8(rs, rsd);
@ -13597,17 +13573,6 @@ void BaseCompiler::emitVectorBinop(void (*op)(MacroAssembler& masm, RhsType src,
push(rsd);
}
template <typename RhsDestType, typename LhsType>
void BaseCompiler::emitVectorBinop(void (*op)(MacroAssembler& masm,
RhsDestType src, LhsType srcDest,
RhsDestOp)) {
RhsDestType rsd = pop<RhsDestType>();
LhsType rs = pop<LhsType>();
op(masm, rsd, rs, RhsDestOp::True);
free(rs);
push(rsd);
}
template <typename RhsType, typename LhsDestType, typename TempType>
void BaseCompiler::emitVectorBinop(void (*op)(MacroAssembler& masm, RhsType rs,
LhsDestType rsd, TempType temp)) {
@ -13838,7 +13803,15 @@ bool BaseCompiler::emitVectorShuffle() {
RegV128 rd, rs;
pop2xV128(&rd, &rs);
# if defined(JS_CODEGEN_X86) || defined(JS_CODEGEN_X64)
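    // The x86/x64 lowering presumably needs a spare vector register to
    // combine the two shuffle operands (e.g. via pshufb), so a temp is
    // allocated here; the ARM64 path below manages without one.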
RegV128 temp = needV128();
masm.shuffleInt8x16(shuffleMask.bytes, rs, rd, temp);
freeV128(temp);
# elif defined(JS_CODEGEN_ARM64)
masm.shuffleInt8x16(shuffleMask.bytes, rs, rd);
# else
MOZ_CRASH("NYI");
# endif
freeV128(rs);
pushV128(rd);

View File

@ -846,20 +846,11 @@ class FunctionCompiler {
return nullptr;
}
// Expand load-and-splat as integer load followed by splat.
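    // For example, v32x4.load_splat becomes a plain i32 load of the address
    // followed by an i32x4.splat of the loaded scalar.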
MemoryAccessDesc access(viewType, addr.align, addr.offset,
bytecodeIfNotAsmJS());
// Generate better code (on x86)
if (viewType == Scalar::Float64) {
access.setSplatSimd128Load();
return load(addr.base, &access, ValType::V128);
}
ValType resultType = ValType::I32;
if (viewType == Scalar::Float32) {
resultType = ValType::F32;
splatOp = wasm::SimdOp::F32x4Splat;
}
ValType resultType =
viewType == Scalar::Int64 ? ValType::I64 : ValType::I32;
auto* scalar = load(addr.base, &access, resultType);
if (!inDeadCode() && !scalar) {
return nullptr;
@ -873,12 +864,14 @@ class FunctionCompiler {
return nullptr;
}
// Generate better code (on x86) by loading as a double with an
// operation that sign extends directly.
MemoryAccessDesc access(Scalar::Float64, addr.align, addr.offset,
MemoryAccessDesc access(Scalar::Int64, addr.align, addr.offset,
bytecodeIfNotAsmJS());
access.setWidenSimd128Load(op);
return load(addr.base, &access, ValType::V128);
// Expand load-and-extend as integer load followed by widen.
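    // For example, i16x8.load8x8_s reads 8 bytes as an i64 and then
    // sign-extends each byte into a 16-bit lane.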
auto* scalar = load(addr.base, &access, ValType::I64);
if (!inDeadCode() && !scalar) {
return nullptr;
}
return scalarToSimd128(scalar, op);
}
MDefinition* loadZeroSimd128(Scalar::Type viewType, size_t numBytes,
@ -5020,9 +5013,9 @@ static bool EmitBodyExprs(FunctionCompiler& f) {
case uint32_t(SimdOp::V16x8LoadSplat):
CHECK(EmitLoadSplatSimd128(f, Scalar::Uint16, SimdOp::I16x8Splat));
case uint32_t(SimdOp::V32x4LoadSplat):
CHECK(EmitLoadSplatSimd128(f, Scalar::Float32, SimdOp::I32x4Splat));
CHECK(EmitLoadSplatSimd128(f, Scalar::Uint32, SimdOp::I32x4Splat));
case uint32_t(SimdOp::V64x2LoadSplat):
CHECK(EmitLoadSplatSimd128(f, Scalar::Float64, SimdOp::I64x2Splat));
CHECK(EmitLoadSplatSimd128(f, Scalar::Int64, SimdOp::I64x2Splat));
case uint32_t(SimdOp::I16x8LoadS8x8):
case uint32_t(SimdOp::I16x8LoadU8x8):
case uint32_t(SimdOp::I32x4LoadS16x4):