Bug 1690492 - Use PBLENDVB for blend operations. r=lth

Differential Revision: https://phabricator.services.mozilla.com/D113582
Yury Delendik 2021-05-27 22:04:57 +00:00
parent d85b55ac8e
commit 7a3361b3fd
15 changed files with 126 additions and 30 deletions

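For context: wasm's v128.bitselect(v1, v2, c) computes (v1 & c) | (v2 & ~c) bit by bit. When the control c is a constant whose bytes are each all-zeros or all-ones, the bit-level select degenerates into a byte-level select, which is what x86's PBLENDVB performs in a single instruction. A minimal scalar model of the operation (an illustrative sketch, not SpiderMonkey code):

#include <cstdint>

// Reference semantics of v128.bitselect: each result bit comes from v1
// where the control bit is 1 and from v2 where it is 0.
void bitselect(uint8_t out[16], const uint8_t v1[16], const uint8_t v2[16],
               const uint8_t c[16]) {
  for (int i = 0; i < 16; i++) {
    out[i] = uint8_t((v1[i] & c[i]) | (v2[i] & ~c[i]));
  }
}
// When every c[i] is 0x00 or 0xFF this is a per-byte blend of v1 and v2.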

@@ -22,3 +22,12 @@ codegenTestX64_adhoc(
66 0f df d9 pandn %xmm1, %xmm3
66 0f eb c3 por %xmm3, %xmm0`);
// Blend constant optimizations
codegenTestX64_adhoc(
`(module
(func (export "f") (param v128) (param v128) (param v128) (result v128)
(v128.bitselect (local.get 0) (local.get 1) (v128.const i32x4 -1 0 0 -1))))`,
'f',
`66 0f 3a 0e c1 c3 pblendw \\$0xC3, %xmm1, %xmm0`);

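How the 0xC3 immediate falls out: PBLENDW selects whole 16-bit words, each set bit of the immediate taking the corresponding word from the source operand. The i32x4 constant -1 0 0 -1 covers words 0, 1, 6 and 7, i.e. 0b11000011 = 0xC3. A hypothetical helper (mine, not the routine the compiler uses) that derives the immediate from the 16 mask bytes:

#include <cstdint>

// Turn a byte mask whose bytes are all 0x00 or 0xFF into a PBLENDW
// immediate; return -1 when the mask is not expressible as a word blend.
int pblendwImmediate(const uint8_t mask[16]) {
  int imm = 0;
  for (int w = 0; w < 8; w++) {
    uint8_t lo = mask[2 * w];
    if (lo != mask[2 * w + 1]) {
      return -1;  // both bytes of a word must agree
    }
    if (lo == 0xFF) {
      imm |= 1 << w;  // take this word from the blend source
    } else if (lo != 0x00) {
      return -1;  // not a lane-wise 0/-1 mask
    }
  }
  return imm;  // i32x4 -1 0 0 -1: words 0,1,6,7 -> 0xC3
}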

@@ -822,6 +822,16 @@ for ( let [ty128,size] of [['i8x16',1], ['i16x8',2], ['i32x4',4]] ) {
assertEq(wasmSimdAnalysis(), "shuffle -> permute 32x4");
}
// Bitselect with constant mask folded into shuffle operation
if (!isArm64) {
wasmCompile(`
(module (func (param v128) (param v128) (result v128)
(v128.bitselect (local.get 0) (local.get 1) (v128.const i8x16 0 -1 -1 0 0 0 0 0 -1 -1 -1 -1 -1 -1 0 0))))
`);
assertEq(wasmSimdAnalysis(), "shuffle -> blend 8x16");
}
// Library
function wasmCompile(text) {

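wasmSimdAnalysis() is a shell testing hook that reports how Ion handled the most recent compilation. The expected string "shuffle -> blend 8x16" confirms the two-step path this patch adds: the constant-mask bitselect is first folded into a shuffle, and the shuffle analysis then recognizes that shuffle as a byte blend.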

@@ -76,3 +76,15 @@ codegenTestX64_v128xLITERAL_v128(
66 0f ef c9 pxor %xmm1, %xmm1
66 0f 73 d8 03 psrldq \\$0x03, %xmm0`]]);
// The SSE4.1 PBLENDVB instruction uses XMM0 implicitly; check that the
// blend operation is generated as expected.
codegenTestX64_adhoc(
`(func (export "f") (param v128 v128 v128 v128) (result v128)
(i8x16.shuffle 0 17 2 3 4 5 6 7 24 25 26 11 12 13 30 15
(local.get 2)(local.get 3)))`,
'f',
`
66 0f 6f ca movdqa %xmm2, %xmm1
66 0f 6f 05 ${RIPRADDR} movdqax ${RIPR}, %xmm0
66 0f 38 10 cb pblendvb %xmm3, %xmm1
66 0f 6f c1 movdqa %xmm1, %xmm0`);

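The register traffic in the expected code exists because the legacy (non-VEX) PBLENDVB encoding reads its mask implicitly from XMM0 and overwrites its destination: the mask constant is loaded RIP-relative into xmm0, the first blend input (xmm2) is copied to xmm1 so it can be clobbered, and the blended result is moved into xmm0 for the return. A scalar model of the instruction (illustrative):

#include <cstdint>

// Legacy PBLENDVB: byte i of dst is replaced by byte i of src when the
// top bit of byte i of the implicit mask register (XMM0) is set.
void pblendvbModel(uint8_t dst[16], const uint8_t src[16],
                   const uint8_t xmm0[16]) {
  for (int i = 0; i < 16; i++) {
    if (xmm0[i] & 0x80) {
      dst[i] = src[i];
    }
  }
}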

@@ -4450,6 +4450,17 @@ bool MWasmLoadGlobalCell::congruentTo(const MDefinition* ins) const {
}
#ifdef ENABLE_WASM_SIMD
MDefinition* MWasmBitselectSimd128::foldsTo(TempAllocator& alloc) {
if (control()->op() == MDefinition::Opcode::WasmFloatConstant) {
int8_t shuffle[16];
if (specializeConstantMaskAsShuffle(shuffle)) {
return MWasmShuffleSimd128::New(alloc, lhs(), rhs(),
SimdConstant::CreateX16(shuffle));
}
}
return this;
}
MDefinition* MWasmBinarySimd128::foldsTo(TempAllocator& alloc) {
if (simdOp() == wasm::SimdOp::V8x16Swizzle && rhs()->isWasmFloatConstant()) {
// Specialize swizzle(v, constant) as shuffle(mask, v, zero) to trigger all


@@ -13430,6 +13430,14 @@ class MWasmBitselectSimd128 : public MTernaryInstruction,
bool congruentTo(const MDefinition* ins) const override {
return congruentIfOperandsEqual(ins);
}
#ifdef ENABLE_WASM_SIMD
MDefinition* foldsTo(TempAllocator& alloc) override;
// If the control mask allows the operation to be specialized as a shuffle
// and it is profitable to specialize it on this platform, return true and
// the appropriate shuffle mask.
bool specializeConstantMaskAsShuffle(int8_t shuffle[16]);
#endif
ALLOW_CLONE(MWasmBitselectSimd128)
};


@@ -1173,6 +1173,13 @@ void LIRGenerator::visitWasmBinarySimd128(MWasmBinarySimd128* ins) {
MOZ_CRASH("binary SIMD NYI");
}
#ifdef ENABLE_WASM_SIMD
bool MWasmBitselectSimd128::specializeConstantMaskAsShuffle(
int8_t shuffle[16]) {
return false;
}
#endif
bool MWasmBinarySimd128::specializeForConstantRhs() {
// There are probably many operations we want to specialize here.
return false;

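This stub, like the identical ones in the next two files, belongs to a backend that does not implement the constant-mask specialization (presumably the ARM/ARM64 and dummy ports): returning false leaves the ordinary bitselect lowering in place, which is why the jit-test above is guarded by if (!isArm64).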

@@ -956,6 +956,13 @@ void LIRGenerator::visitWasmBinarySimd128(MWasmBinarySimd128* ins) {
#endif
}
#ifdef ENABLE_WASM_SIMD
bool MWasmBitselectSimd128::specializeConstantMaskAsShuffle(
int8_t shuffle[16]) {
return false;
}
#endif
bool MWasmBinarySimd128::specializeForConstantRhs() {
// There are probably many operations we want to specialize here.
return false;


@@ -904,6 +904,13 @@ void LIRGenerator::visitWasmBinarySimd128(MWasmBinarySimd128* ins) {
MOZ_CRASH("binary SIMD NYI");
}
#ifdef ENABLE_WASM_SIMD
bool MWasmBitselectSimd128::specializeConstantMaskAsShuffle(
int8_t shuffle[16]) {
return false;
}
#endif
bool MWasmBinarySimd128::specializeForConstantRhs() {
// There are probably many operations we want to specialize here.
return false;


@@ -2214,6 +2214,15 @@ class AssemblerX86Shared : public AssemblerShared {
masm.vpblendw_irr(mask, src1.encoding(), src0.encoding(), dest.encoding());
}
void vpblendvb(FloatRegister mask, FloatRegister src1, FloatRegister src0,
FloatRegister dest) {
MOZ_ASSERT(HasSSE41());
MOZ_ASSERT(mask.encoding() == X86Encoding::xmm0 &&
src0.encoding() == dest.encoding(),
"only legacy encoding is supported");
masm.pblendvb_rr(src1.encoding(), dest.encoding());
}
void vpinsrb(unsigned lane, const Operand& src1, FloatRegister src0,
FloatRegister dest) {
MOZ_ASSERT(HasSSE41());


@@ -3775,6 +3775,13 @@ class BaseAssembler : public GenericAssembler {
mask, src1, src0, dst);
}
void pblendvb_rr(XMMRegisterID other, XMMRegisterID dst) {
spew("%-11s%s, %s", "pblendvb", XMMRegName(other), XMMRegName(dst));
m_formatter.legacySSEPrefix(VEX_PD);
m_formatter.threeByteOp(OP3_PBLENDVB_VdqWdq, ESCAPE_38, (RegisterID)other,
dst);
}
void vpinsrb_irr(unsigned lane, RegisterID src1, XMMRegisterID src0,
XMMRegisterID dst) {
MOZ_ASSERT(lane < 16);

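pblendvb_rr emits the legacy encoding 66 0F 38 10 /r: operand-size prefix, 0F 38 escape, opcode 0x10, then a ModRM byte; the mask never appears because it is implicit in XMM0. A quick sanity check (illustrative) against the bytes the codegen test above expects:

#include <cstdint>
#include <cstdio>

// Register-to-register ModRM byte: mod = 11, reg = dst, rm = src.
uint8_t modrm(unsigned reg, unsigned rm) {
  return uint8_t(0xC0 | (reg << 3) | rm);
}

int main() {
  // pblendvb %xmm3, %xmm1 (AT&T): dst is xmm1 (reg = 1), src is xmm3 (rm = 3).
  printf("66 0f 38 10 %02x\n", modrm(1, 3));  // prints: 66 0f 38 10 cb
  return 0;
}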

@@ -352,6 +352,7 @@ enum ThreeByteOpcodeID {
OP3_BLENDPS_VpsWpsIb = 0x0C,
OP3_PBLENDW_VdqWdqIb = 0x0E,
OP3_PALIGNR_VdqWdqIb = 0x0F,
OP3_PBLENDVB_VdqWdq = 0x10,
OP3_BLENDVPS_VdqWdq = 0x14,
OP3_PEXTRB_EvVdqIb = 0x14,
OP3_PEXTRW_EwVdqIb = 0x15,

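The repeated value just below the new entry is intentional: OP3_BLENDVPS_VdqWdq and OP3_PEXTRB_EvVdqIb can both be 0x14 because three-byte opcodes are disambiguated by their escape bytes (0F 38 for BLENDVPS, 0F 3A for PEXTRB), so the new OP3_PBLENDVB_VdqWdq = 0x10 only needs to be unique under ESCAPE_38.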

@@ -947,6 +947,27 @@ void LIRGenerator::visitWasmBinarySimd128(MWasmBinarySimd128* ins) {
#endif
}
#ifdef ENABLE_WASM_SIMD
bool MWasmBitselectSimd128::specializeConstantMaskAsShuffle(
int8_t shuffle[16]) {
// Optimize the case where the control vector is a mask with all 0s or
// all 1s per lane: x86 has no bitselect instruction, so a blend
// operation such as PBLENDVB or PBLENDW is a win.
MWasmFloatConstant* constant = static_cast<MWasmFloatConstant*>(control());
const auto& bytes = constant->toSimd128().asInt8x16();
for (int8_t i = 0; i < 16; i++) {
if (bytes[i] == -1) {
shuffle[i] = i + 16;
} else if (bytes[i] == 0) {
shuffle[i] = i;
} else {
return false;
}
}
return true;
}
#endif
bool MWasmBinarySimd128::specializeForConstantRhs() {
// The order generally follows MacroAssembler.h
switch (simdOp()) {
@@ -1218,7 +1239,7 @@ void LIRGenerator::visitWasmShuffleSimd128(MWasmShuffleSimd128* ins) {
LDefinition temp = LDefinition::BogusTemp();
switch (*s.shuffleOp) {
case LWasmShuffleSimd128::BLEND_8x16:
temp = tempSimd128();
temp = tempFixed(xmm0);
break;
default:
break;

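The mapping in specializeConstantMaskAsShuffle follows the i8x16.shuffle index convention, where indices 0-15 address lanes of the first operand and 16-31 lanes of the second: a 0x00 control byte becomes index i and a 0xFF byte becomes i + 16. The tempFixed(xmm0) change pins the BLEND_8x16 temporary to xmm0 because the legacy PBLENDVB it expands to reads its mask there. A standalone model of the mapping (names are mine), run on the mask from the jit-test earlier in this commit:

#include <cstdint>
#include <cstdio>

// Model of the mask-to-shuffle mapping: 0 -> i, -1 -> i + 16; any other
// byte value means the constant is not a lane-wise mask.
bool maskToShuffle(const int8_t bytes[16], int8_t shuffle[16]) {
  for (int i = 0; i < 16; i++) {
    if (bytes[i] == -1) {
      shuffle[i] = int8_t(i + 16);
    } else if (bytes[i] == 0) {
      shuffle[i] = int8_t(i);
    } else {
      return false;
    }
  }
  return true;
}

int main() {
  const int8_t mask[16] = {0, -1, -1, 0, 0, 0, 0, 0,
                           -1, -1, -1, -1, -1, -1, 0, 0};
  int8_t shuffle[16];
  if (maskToShuffle(mask, shuffle)) {
    for (int i = 0; i < 16; i++) {
      printf("%d ", shuffle[i]);  // 0 17 18 3 4 5 6 7 24 25 26 27 28 29 14 15
    }
  }
  return 0;
}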

@@ -614,3 +614,14 @@ void MacroAssemblerX86Shared::maxNumFloat32x4(FloatRegister lhs, Operand rhs,
vorps(Operand(mask), output, output);
}
}
void MacroAssemblerX86Shared::selectX4(FloatRegister mask, FloatRegister onTrue,
FloatRegister onFalse,
FloatRegister temp,
FloatRegister output) {
if (AssemblerX86Shared::HasAVX()) {
vblendvps(mask, onTrue, onFalse, output);
} else {
selectSimd128(mask, onTrue, onFalse, temp, output);
}
}

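The AVX branch can use vblendvps directly because VEX-encoded blends name the mask as an explicit extra register operand; only the legacy SSE4.1 encodings pin the mask to xmm0 the way the new vpblendvb wrapper does.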

@@ -158,25 +158,11 @@ void MacroAssemblerX86Shared::blendInt8x16(FloatRegister lhs, FloatRegister rhs,
FloatRegister temp,
const uint8_t lanes[16]) {
MOZ_ASSERT(lhs == output);
MOZ_ASSERT(lhs == rhs || !temp.isInvalid());
MOZ_ASSERT(temp.encoding() == X86Encoding::xmm0, "pblendvb needs xmm0");
// TODO: Consider whether PBLENDVB would not be better, even if it is variable
// and requires xmm0 to be free and the loading of a mask.
// Set scratch = lanes to select from lhs.
int8_t mask[16];
for (unsigned i = 0; i < 16; i++) {
mask[i] = ~lanes[i];
}
ScratchSimd128Scope scratch(asMasm());
asMasm().loadConstantSimd128Int(SimdConstant::CreateX16(mask), scratch);
if (lhs == rhs) {
asMasm().moveSimd128Int(rhs, temp);
rhs = temp;
}
vpand(Operand(scratch), lhs, lhs);
vpandn(Operand(rhs), scratch, scratch);
vpor(scratch, lhs, lhs);
asMasm().loadConstantSimd128Int(
SimdConstant::CreateX16(reinterpret_cast<const int8_t*>(lanes)), temp);
vpblendvb(temp, rhs, lhs, output);
}
void MacroAssemblerX86Shared::blendInt16x8(FloatRegister lhs, FloatRegister rhs,
@@ -1095,9 +1081,6 @@ void MacroAssemblerX86Shared::selectSimd128(FloatRegister mask,
asMasm().moveSimd128Int(onTrue, output);
asMasm().moveSimd128Int(mask, temp);
// SSE4.1 has plain blendvps which can do this, but it is awkward
// to use because it requires the mask to be in xmm0.
vpand(Operand(temp), output, output);
vpandn(Operand(onFalse), temp, temp);
vpor(Operand(temp), output, output);

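Net effect of the blendInt8x16 rewrite: the old sequence of negating the mask, loading it, and combining with pand/pandn/por (five or six instructions plus a scratch register) collapses into one constant load and a single PBLENDVB, at the cost of fixing the temp register to xmm0, which the lowering change earlier in this commit provides.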

@ -799,14 +799,7 @@ class MacroAssemblerX86Shared : public Assembler {
vshufps(mask, src, dest, dest);
}
void selectX4(FloatRegister mask, FloatRegister onTrue, FloatRegister onFalse,
FloatRegister temp, FloatRegister output) {
if (AssemblerX86Shared::HasAVX()) {
vblendvps(mask, onTrue, onFalse, output);
} else {
selectSimd128(mask, onTrue, onFalse, temp, output);
}
}
FloatRegister temp, FloatRegister output);
// End unused SIMD.
void moveFloatAsDouble(Register src, FloatRegister dest) {