Merge pull request #3736 from Sonicadvance1/avx_9

AVX128: Some pun pickles, moves and conversions
2024-12-13 17:15:41 +00:00 · 2024-06-21 10:55:19 -07:00 · 2024-06-21 10:55:19 -07:00 · 7bbbd95775
commit 7bbbd95775
parent 2da1e90dd5 903d6a742e
5 changed files with 168 additions and 42 deletions
--- a/FEXCore/Source/Interface/Core/CPUBackend.h
+++ b/FEXCore/Source/Interface/Core/CPUBackend.h
@ -36,7 +36,6 @@ namespace CodeSerialize {
 namespace CPU {
  struct CPUBackendFeatures { // Capability flags describing a CPU backend; every flag defaults to false.
    bool SupportsFlags = false;
-    bool SupportsSaturatingRoundingShifts = false;
    bool SupportsVTBL2 = false;
  };

--- a/FEXCore/Source/Interface/Core/JIT/Arm64/JIT.cpp
+++ b/FEXCore/Source/Interface/Core/JIT/Arm64/JIT.cpp
@ -887,7 +887,6 @@ fextl::unique_ptr<CPUBackend> CreateArm64JITCore(FEXCore::Context::ContextImpl*
 CPUBackendFeatures GetArm64JITBackendFeatures() { // Returns the feature flags of the Arm64 JIT backend.
  return CPUBackendFeatures {
    .SupportsFlags = true,
-    .SupportsSaturatingRoundingShifts = true,
    .SupportsVTBL2 = true,
  };
 }
--- a/FEXCore/Source/Interface/Core/OpcodeDispatcher.h
+++ b/FEXCore/Source/Interface/Core/OpcodeDispatcher.h
@ -1012,6 +1012,23 @@ public:
  void AVX128_VMOVSHDUP(OpcodeArgs);
  template<size_t ElementSize>
  void AVX128_VBROADCAST(OpcodeArgs);
+  template<size_t ElementSize>
+  void AVX128_VPUNPCKL(OpcodeArgs); // Definition builds its result with _VZip.
+  template<size_t ElementSize>
+  void AVX128_VPUNPCKH(OpcodeArgs); // Definition builds its result with _VZip2.
+  void AVX128_MOVVectorUnaligned(OpcodeArgs); // Registered for OPD(1, 0b11, 0xF0).
+  template<size_t DstElementSize>
+  void AVX128_InsertCVTGPR_To_FPR(OpcodeArgs); // Instantiated with DstElementSize 4 and 8 in the handler table.
+  template<size_t SrcElementSize, bool HostRoundingMode>
+  void AVX128_CVTFPR_To_GPR(OpcodeArgs); // HostRoundingMode selects _Float_ToGPR_S (true) or _Float_ToGPR_ZS (false).
+  void AVX128_VANDN(OpcodeArgs); // Definition builds its result with _VAndn, operands swapped.
+  template<size_t ElementSize>
+  void AVX128_VPACKSS(OpcodeArgs); // Definition builds its result with _VSQXTNPair.
+  template<size_t ElementSize>
+  void AVX128_VPACKUS(OpcodeArgs); // Definition builds its result with _VSQXTUNPair.
+  Ref AVX128_PSIGNImpl(size_t ElementSize, Ref Src1, Ref Src2); // Helper called by AVX128_VPSIGN.
+  template<size_t ElementSize>
+  void AVX128_VPSIGN(OpcodeArgs); // Instantiated with ElementSize 1, 2 and 4 in the handler table.

  // End of AVX 128-bit implementation

--- a/FEXCore/Source/Interface/Core/OpcodeDispatcher/AVX_128.cpp
+++ b/FEXCore/Source/Interface/Core/OpcodeDispatcher/AVX_128.cpp
@ -40,11 +40,11 @@ void OpDispatchBuilder::InstallAVX128Handlers() {
    {OPD(1, 0b00, 0x13), 1, &OpDispatchBuilder::AVX128_VMOVLP},
    {OPD(1, 0b01, 0x13), 1, &OpDispatchBuilder::AVX128_VMOVLP},

-    // TODO: {OPD(1, 0b00, 0x14), 1, &OpDispatchBuilder::VPUNPCKLOp<4>},
-    // TODO: {OPD(1, 0b01, 0x14), 1, &OpDispatchBuilder::VPUNPCKLOp<8>},
+    {OPD(1, 0b00, 0x14), 1, &OpDispatchBuilder::AVX128_VPUNPCKL<4>},
+    {OPD(1, 0b01, 0x14), 1, &OpDispatchBuilder::AVX128_VPUNPCKL<8>},

-    // TODO: {OPD(1, 0b00, 0x15), 1, &OpDispatchBuilder::VPUNPCKHOp<4>},
-    // TODO: {OPD(1, 0b01, 0x15), 1, &OpDispatchBuilder::VPUNPCKHOp<8>},
+    {OPD(1, 0b00, 0x15), 1, &OpDispatchBuilder::AVX128_VPUNPCKH<4>},
+    {OPD(1, 0b01, 0x15), 1, &OpDispatchBuilder::AVX128_VPUNPCKH<8>},

    {OPD(1, 0b00, 0x16), 1, &OpDispatchBuilder::AVX128_VMOVHP},
    {OPD(1, 0b01, 0x16), 1, &OpDispatchBuilder::AVX128_VMOVHP},
@ -57,17 +57,17 @@ void OpDispatchBuilder::InstallAVX128Handlers() {
    {OPD(1, 0b00, 0x29), 1, &OpDispatchBuilder::AVX128_VMOVAPS},
    {OPD(1, 0b01, 0x29), 1, &OpDispatchBuilder::AVX128_VMOVAPS},

-    // TODO: {OPD(1, 0b10, 0x2A), 1, &OpDispatchBuilder::AVXInsertCVTGPR_To_FPR<4>},
-    // TODO: {OPD(1, 0b11, 0x2A), 1, &OpDispatchBuilder::AVXInsertCVTGPR_To_FPR<8>},
+    {OPD(1, 0b10, 0x2A), 1, &OpDispatchBuilder::AVX128_InsertCVTGPR_To_FPR<4>},
+    {OPD(1, 0b11, 0x2A), 1, &OpDispatchBuilder::AVX128_InsertCVTGPR_To_FPR<8>},

    {OPD(1, 0b00, 0x2B), 1, &OpDispatchBuilder::AVX128_MOVVectorNT},
    {OPD(1, 0b01, 0x2B), 1, &OpDispatchBuilder::AVX128_MOVVectorNT},

-    // TODO: {OPD(1, 0b10, 0x2C), 1, &OpDispatchBuilder::CVTFPR_To_GPR<4, false>},
-    // TODO: {OPD(1, 0b11, 0x2C), 1, &OpDispatchBuilder::CVTFPR_To_GPR<8, false>},
+    {OPD(1, 0b10, 0x2C), 1, &OpDispatchBuilder::AVX128_CVTFPR_To_GPR<4, false>},
+    {OPD(1, 0b11, 0x2C), 1, &OpDispatchBuilder::AVX128_CVTFPR_To_GPR<8, false>},

-    // TODO: {OPD(1, 0b10, 0x2D), 1, &OpDispatchBuilder::CVTFPR_To_GPR<4, true>},
-    // TODO: {OPD(1, 0b11, 0x2D), 1, &OpDispatchBuilder::CVTFPR_To_GPR<8, true>},
+    {OPD(1, 0b10, 0x2D), 1, &OpDispatchBuilder::AVX128_CVTFPR_To_GPR<4, true>},
+    {OPD(1, 0b11, 0x2D), 1, &OpDispatchBuilder::AVX128_CVTFPR_To_GPR<8, true>},

    // TODO: {OPD(1, 0b00, 0x2E), 1, &OpDispatchBuilder::UCOMISxOp<4>},
    // TODO: {OPD(1, 0b01, 0x2E), 1, &OpDispatchBuilder::UCOMISxOp<8>},
@ -91,8 +91,8 @@ void OpDispatchBuilder::InstallAVX128Handlers() {
    {OPD(1, 0b00, 0x54), 1, &OpDispatchBuilder::AVX128_VectorALU<IR::OP_VAND, 16>},
    {OPD(1, 0b01, 0x54), 1, &OpDispatchBuilder::AVX128_VectorALU<IR::OP_VAND, 16>},

-    // TODO: {OPD(1, 0b00, 0x55), 1, &OpDispatchBuilder::VANDNOp},
-    // TODO: {OPD(1, 0b01, 0x55), 1, &OpDispatchBuilder::VANDNOp},
+    {OPD(1, 0b00, 0x55), 1, &OpDispatchBuilder::AVX128_VANDN},
+    {OPD(1, 0b01, 0x55), 1, &OpDispatchBuilder::AVX128_VANDN},

    {OPD(1, 0b00, 0x56), 1, &OpDispatchBuilder::AVX128_VectorALU<IR::OP_VOR, 16>},
    {OPD(1, 0b01, 0x56), 1, &OpDispatchBuilder::AVX128_VectorALU<IR::OP_VOR, 16>},
@ -139,20 +139,20 @@ void OpDispatchBuilder::InstallAVX128Handlers() {
    // TODO: {OPD(1, 0b10, 0x5F), 1, &OpDispatchBuilder::AVXVectorScalarInsertALUOp<IR::OP_VFMAXSCALARINSERT, 4>},
    // TODO: {OPD(1, 0b11, 0x5F), 1, &OpDispatchBuilder::AVXVectorScalarInsertALUOp<IR::OP_VFMAXSCALARINSERT, 8>},

-    // TODO: {OPD(1, 0b01, 0x60), 1, &OpDispatchBuilder::VPUNPCKLOp<1>},
-    // TODO: {OPD(1, 0b01, 0x61), 1, &OpDispatchBuilder::VPUNPCKLOp<2>},
-    // TODO: {OPD(1, 0b01, 0x62), 1, &OpDispatchBuilder::VPUNPCKLOp<4>},
-    // TODO: {OPD(1, 0b01, 0x63), 1, &OpDispatchBuilder::VPACKSSOp<2>},
+    {OPD(1, 0b01, 0x60), 1, &OpDispatchBuilder::AVX128_VPUNPCKL<1>},
+    {OPD(1, 0b01, 0x61), 1, &OpDispatchBuilder::AVX128_VPUNPCKL<2>},
+    {OPD(1, 0b01, 0x62), 1, &OpDispatchBuilder::AVX128_VPUNPCKL<4>},
+    {OPD(1, 0b01, 0x63), 1, &OpDispatchBuilder::AVX128_VPACKSS<2>},
    {OPD(1, 0b01, 0x64), 1, &OpDispatchBuilder::AVX128_VectorALU<IR::OP_VCMPGT, 1>},
    {OPD(1, 0b01, 0x65), 1, &OpDispatchBuilder::AVX128_VectorALU<IR::OP_VCMPGT, 2>},
    {OPD(1, 0b01, 0x66), 1, &OpDispatchBuilder::AVX128_VectorALU<IR::OP_VCMPGT, 4>},
-    // TODO: {OPD(1, 0b01, 0x67), 1, &OpDispatchBuilder::VPACKUSOp<2>},
-    // TODO: {OPD(1, 0b01, 0x68), 1, &OpDispatchBuilder::VPUNPCKHOp<1>},
-    // TODO: {OPD(1, 0b01, 0x69), 1, &OpDispatchBuilder::VPUNPCKHOp<2>},
-    // TODO: {OPD(1, 0b01, 0x6A), 1, &OpDispatchBuilder::VPUNPCKHOp<4>},
-    // TODO: {OPD(1, 0b01, 0x6B), 1, &OpDispatchBuilder::VPACKSSOp<4>},
-    // TODO: {OPD(1, 0b01, 0x6C), 1, &OpDispatchBuilder::VPUNPCKLOp<8>},
-    // TODO: {OPD(1, 0b01, 0x6D), 1, &OpDispatchBuilder::VPUNPCKHOp<8>},
+    {OPD(1, 0b01, 0x67), 1, &OpDispatchBuilder::AVX128_VPACKUS<2>},
+    {OPD(1, 0b01, 0x68), 1, &OpDispatchBuilder::AVX128_VPUNPCKH<1>},
+    {OPD(1, 0b01, 0x69), 1, &OpDispatchBuilder::AVX128_VPUNPCKH<2>},
+    {OPD(1, 0b01, 0x6A), 1, &OpDispatchBuilder::AVX128_VPUNPCKH<4>},
+    {OPD(1, 0b01, 0x6B), 1, &OpDispatchBuilder::AVX128_VPACKSS<4>},
+    {OPD(1, 0b01, 0x6C), 1, &OpDispatchBuilder::AVX128_VPUNPCKL<8>},
+    {OPD(1, 0b01, 0x6D), 1, &OpDispatchBuilder::AVX128_VPUNPCKH<8>},
    // TODO: {OPD(1, 0b01, 0x6E), 1, &OpDispatchBuilder::MOVBetweenGPR_FPR},

    {OPD(1, 0b01, 0x6F), 1, &OpDispatchBuilder::AVX128_VMOVAPS},
@ -208,7 +208,7 @@ void OpDispatchBuilder::InstallAVX128Handlers() {
    {OPD(1, 0b01, 0xDC), 1, &OpDispatchBuilder::AVX128_VectorALU<IR::OP_VUQADD, 1>},
    {OPD(1, 0b01, 0xDD), 1, &OpDispatchBuilder::AVX128_VectorALU<IR::OP_VUQADD, 2>},
    {OPD(1, 0b01, 0xDE), 1, &OpDispatchBuilder::AVX128_VectorALU<IR::OP_VUMAX, 1>},
-    // TODO: {OPD(1, 0b01, 0xDF), 1, &OpDispatchBuilder::VANDNOp},
+    {OPD(1, 0b01, 0xDF), 1, &OpDispatchBuilder::AVX128_VANDN},

    {OPD(1, 0b01, 0xE0), 1, &OpDispatchBuilder::AVX128_VectorALU<IR::OP_VURAVG, 1>},
    // TODO: {OPD(1, 0b01, 0xE1), 1, &OpDispatchBuilder::VPSRAOp<2>},
@ -232,7 +232,7 @@ void OpDispatchBuilder::InstallAVX128Handlers() {
    {OPD(1, 0b01, 0xEE), 1, &OpDispatchBuilder::AVX128_VectorALU<IR::OP_VSMAX, 2>},
    {OPD(1, 0b01, 0xEF), 1, &OpDispatchBuilder::AVX128_VectorALU<IR::OP_VXOR, 16>},

-    // TODO: {OPD(1, 0b11, 0xF0), 1, &OpDispatchBuilder::MOVVectorUnalignedOp},
+    {OPD(1, 0b11, 0xF0), 1, &OpDispatchBuilder::AVX128_MOVVectorUnaligned},
    // TODO: {OPD(1, 0b01, 0xF1), 1, &OpDispatchBuilder::VPSLLOp<2>},
    // TODO: {OPD(1, 0b01, 0xF2), 1, &OpDispatchBuilder::VPSLLOp<4>},
    // TODO: {OPD(1, 0b01, 0xF3), 1, &OpDispatchBuilder::VPSLLOp<8>},
@ -259,9 +259,9 @@ void OpDispatchBuilder::InstallAVX128Handlers() {
    // TODO: {OPD(2, 0b01, 0x06), 1, &OpDispatchBuilder::VPHSUBOp<4>},
    // TODO: {OPD(2, 0b01, 0x07), 1, &OpDispatchBuilder::VPHSUBSWOp},

-    // TODO: {OPD(2, 0b01, 0x08), 1, &OpDispatchBuilder::VPSIGN<1>},
-    // TODO: {OPD(2, 0b01, 0x09), 1, &OpDispatchBuilder::VPSIGN<2>},
-    // TODO: {OPD(2, 0b01, 0x0A), 1, &OpDispatchBuilder::VPSIGN<4>},
+    {OPD(2, 0b01, 0x08), 1, &OpDispatchBuilder::AVX128_VPSIGN<1>},
+    {OPD(2, 0b01, 0x09), 1, &OpDispatchBuilder::AVX128_VPSIGN<2>},
+    {OPD(2, 0b01, 0x0A), 1, &OpDispatchBuilder::AVX128_VPSIGN<4>},
    // TODO: {OPD(2, 0b01, 0x0B), 1, &OpDispatchBuilder::VPMULHRSWOp},
    // TODO: {OPD(2, 0b01, 0x0C), 1, &OpDispatchBuilder::VPERMILRegOp<4>},
    // TODO: {OPD(2, 0b01, 0x0D), 1, &OpDispatchBuilder::VPERMILRegOp<8>},
@ -287,7 +287,7 @@ void OpDispatchBuilder::InstallAVX128Handlers() {
    // TODO: {OPD(2, 0b01, 0x28), 1, &OpDispatchBuilder::VPMULLOp<4, true>},
    {OPD(2, 0b01, 0x29), 1, &OpDispatchBuilder::AVX128_VectorALU<IR::OP_VCMPEQ, 8>},
    {OPD(2, 0b01, 0x2A), 1, &OpDispatchBuilder::AVX128_MOVVectorNT},
-    // TODO: {OPD(2, 0b01, 0x2B), 1, &OpDispatchBuilder::VPACKUSOp<4>},
+    {OPD(2, 0b01, 0x2B), 1, &OpDispatchBuilder::AVX128_VPACKUS<4>},
    // TODO: {OPD(2, 0b01, 0x2C), 1, &OpDispatchBuilder::VMASKMOVOp<4, false>},
    // TODO: {OPD(2, 0b01, 0x2D), 1, &OpDispatchBuilder::VMASKMOVOp<8, false>},
    // TODO: {OPD(2, 0b01, 0x2E), 1, &OpDispatchBuilder::VMASKMOVOp<4, true>},
@ -782,4 +782,123 @@ void OpDispatchBuilder::AVX128_VBROADCAST(OpcodeArgs) {
  AVX128_StoreResult_WithOpSize(Op, Op->Dest, Src);
 }

+template<size_t ElementSize>
+void OpDispatchBuilder::AVX128_VPUNPCKL(OpcodeArgs) { // Handler for opcodes 0x14, 0x60-0x62 and 0x6C of the AVX128 table.
+  AVX128_VectorBinaryImpl(Op, GetSrcSize(Op), ElementSize, // ElementSize is 1, 2, 4 or 8 depending on the table entry.
+                          [this](size_t _ElementSize, Ref Src1, Ref Src2) { return _VZip(OpSize::i128Bit, _ElementSize, Src1, Src2); }); // Each callback invocation emits one 128-bit _VZip.
+}
+
+template<size_t ElementSize>
+void OpDispatchBuilder::AVX128_VPUNPCKH(OpcodeArgs) { // Handler for opcodes 0x15, 0x68-0x6A and 0x6D of the AVX128 table.
+  AVX128_VectorBinaryImpl(Op, GetSrcSize(Op), ElementSize, // ElementSize is 1, 2, 4 or 8 depending on the table entry.
+                          [this](size_t _ElementSize, Ref Src1, Ref Src2) { return _VZip2(OpSize::i128Bit, _ElementSize, Src1, Src2); }); // Same shape as AVX128_VPUNPCKL, but emits _VZip2.
+}
+
+void OpDispatchBuilder::AVX128_MOVVectorUnaligned(OpcodeArgs) { // Vector move handler registered for OPD(1, 0b11, 0xF0).
+  const auto SrcSize = GetSrcSize(Op);
+  const auto Is128Bit = SrcSize == Core::CPUState::XMM_SSE_REG_SIZE;
+
+  if (!Is128Bit && Op->Dest.IsGPR() && Op->Src[0].IsGPR() && Op->Dest.Data.GPR.GPR == Op->Src[0].Data.GPR.GPR) {
+    // Non-128-bit move whose destination and source are the same register: nothing to emit.
+    return;
+  }
+
+  auto Src = AVX128_LoadSource_WithOpSize(Op, Op->Src[0], Op->Flags, !Is128Bit); // Last argument is true only for the non-128-bit form (presumably "load the high half too" - TODO confirm).
+
+  if (Is128Bit) {
+    Src.High = LoadZeroVector(OpSize::i128Bit); // 128-bit form: the stored pair gets an explicit zero vector as its high half.
+  }
+
+  AVX128_StoreResult_WithOpSize(Op, Op->Dest, Src);
+}
+
+template<size_t DstElementSize>
+void OpDispatchBuilder::AVX128_InsertCVTGPR_To_FPR(OpcodeArgs) { // Low vector: _VSToFGPRInsert/_VSToFVectorInsert on top of Src[0]; high vector: zero.
+  const auto SrcSize = GetSrcSize(Op);
+  const auto DstSize = GetDstSize(Op);
+  [[maybe_unused]] const auto Is128Bit = DstSize == Core::CPUState::XMM_SSE_REG_SIZE; // Only read by the assertion on the next line.
+  LOGMAN_THROW_A_FMT(Is128Bit, "Programming Error: This should never occur!"); // Checked up front, before any IR is emitted.
+
+  auto Src1 = AVX128_LoadSource_WithOpSize(Op, Op->Src[0], Op->Flags, false);
+
+  RefPair Result {};
+
+  if (Op->Src[1].IsGPR()) {
+    // If the source is a GPR then convert directly from the GPR.
+    auto Src2 = LoadSource_WithOpSize(GPRClass, Op, Op->Src[1], CTX->GetGPRSize(), Op->Flags);
+    Result.Low = _VSToFGPRInsert(OpSize::i128Bit, DstElementSize, SrcSize, Src1.Low, Src2, false);
+  } else if (SrcSize != DstElementSize) {
+    // If the source is from memory but the source size and destination element size differ,
+    // then it is more optimal to load into a GPR and convert between GPR->FPR.
+    // ARM GPR->FPR conversion supports different size source and destinations while FPR->FPR doesn't.
+    auto Src2 = LoadSource(GPRClass, Op, Op->Src[1], Op->Flags);
+    Result.Low = _VSToFGPRInsert(IR::SizeToOpSize(DstSize), DstElementSize, SrcSize, Src1.Low, Src2, false);
+  } else {
+    // In the case of cvtsi2s{s,d} where the source and destination are the same size,
+    // then it is more optimal to load into the FPR register directly and convert there.
+    auto Src2 = AVX128_LoadSource_WithOpSize(Op, Op->Src[1], Op->Flags, false);
+    // Always signed
+    Result.Low = _VSToFVectorInsert(IR::SizeToOpSize(DstSize), DstElementSize, DstElementSize, Src1.Low, Src2.Low, false, false);
+  }
+
+  Result.High = LoadZeroVector(OpSize::i128Bit); // The high half of the stored pair is always an explicit zero vector.
+
+  AVX128_StoreResult_WithOpSize(Op, Op->Dest, Result);
+}
+
+template<size_t SrcElementSize, bool HostRoundingMode>
+void OpDispatchBuilder::AVX128_CVTFPR_To_GPR(OpcodeArgs) { // Handler for opcodes 0x2C and 0x2D: float element in, GPR result out.
+  // A register source is loaded through the AVX128 path at full size, so the
+  // vector is not unnecessarily zero extended. Any other source (memory) is
+  // loaded with exactly GetSrcSize(Op), and only Src.Low is populated.
+  RefPair Src {};
+  if (Op->Src[0].IsGPR()) {
+    Src = AVX128_LoadSource_WithOpSize(Op, Op->Src[0], Op->Flags, false);
+  } else {
+    Src.Low = LoadSource_WithOpSize(FPRClass, Op, Op->Src[0], GetSrcSize(Op), Op->Flags);
+  }
+
+  // GPR size is determined by REX.W
+  // Source Element size is determined by instruction
+  size_t GPRSize = GetDstSize(Op);
+
+  Ref Result {};
+  if constexpr (HostRoundingMode) { // true for the 0x2D table entries, false for the 0x2C ones.
+    Result = _Float_ToGPR_S(GPRSize, SrcElementSize, Src.Low);
+  } else {
+    Result = _Float_ToGPR_ZS(GPRSize, SrcElementSize, Src.Low);
+  }
+
+  StoreResult_WithOpSize(GPRClass, Op, Op->Dest, Result, GPRSize, -1); // Only Src.Low was consumed; the result is stored as a GPR of GPRSize.
+}
+
+void OpDispatchBuilder::AVX128_VANDN(OpcodeArgs) { // Handler for opcodes 0x55 and 0xDF of the AVX128 table.
+  AVX128_VectorBinaryImpl(Op, GetSrcSize(Op), OpSize::i128Bit, // The full 128-bit size is passed in the element-size slot.
+                          [this](size_t _ElementSize, Ref Src1, Ref Src2) { return _VAndn(OpSize::i128Bit, _ElementSize, Src2, Src1); }); // Operands are deliberately swapped: Src2 first, Src1 second.
+}
+
+template<size_t ElementSize>
+void OpDispatchBuilder::AVX128_VPACKSS(OpcodeArgs) { // Handler for opcodes 0x63 (ElementSize 2) and 0x6B (ElementSize 4).
+  AVX128_VectorBinaryImpl(Op, GetSrcSize(Op), ElementSize,
+                          [this](size_t _ElementSize, Ref Src1, Ref Src2) { return _VSQXTNPair(OpSize::i128Bit, _ElementSize, Src1, Src2); }); // Each callback invocation emits one 128-bit _VSQXTNPair.
+}
+
+template<size_t ElementSize>
+void OpDispatchBuilder::AVX128_VPACKUS(OpcodeArgs) { // Handler for OPD(1, 0b01, 0x67) (ElementSize 2) and OPD(2, 0b01, 0x2B) (ElementSize 4).
+  AVX128_VectorBinaryImpl(Op, GetSrcSize(Op), ElementSize,
+                          [this](size_t _ElementSize, Ref Src1, Ref Src2) { return _VSQXTUNPair(OpSize::i128Bit, _ElementSize, Src1, Src2); }); // Same shape as AVX128_VPACKSS, but emits _VSQXTUNPair.
+}
+
+Ref OpDispatchBuilder::AVX128_PSIGNImpl(size_t ElementSize, Ref Src1, Ref Src2) { // Returns Src1 multiplied by a control vector derived from Src2.
+  Ref Control = _VSQSHL(OpSize::i128Bit, ElementSize, Src2, (ElementSize * 8) - 1); // Shift amount is ElementSize * 8 - 1 (presumably the element's bit width minus one, with ElementSize in bytes).
+  Control = _VSRSHR(OpSize::i128Bit, ElementSize, Control, (ElementSize * 8) - 1); // Shifted back by the same amount; presumably leaves -1/0/+1 per element - TODO confirm against the IR op definitions.
+  return _VMul(OpSize::i128Bit, ElementSize, Src1, Control);
+}
+
+template<size_t ElementSize>
+void OpDispatchBuilder::AVX128_VPSIGN(OpcodeArgs) { // Handler for OPD(2, 0b01, 0x08..0x0A) with ElementSize 1, 2 and 4.
+  AVX128_VectorBinaryImpl(Op, GetSrcSize(Op), ElementSize,
+                          [this](size_t _ElementSize, Ref Src1, Ref Src2) { return AVX128_PSIGNImpl(_ElementSize, Src1, Src2); }); // The per-call work is delegated to AVX128_PSIGNImpl.
+}
+
 } // namespace FEXCore::IR
--- a/FEXCore/Source/Interface/Core/OpcodeDispatcher/Vector.cpp
+++ b/FEXCore/Source/Interface/Core/OpcodeDispatcher/Vector.cpp
@ -1723,17 +1723,9 @@ void OpDispatchBuilder::VEXTRACT128Op(OpcodeArgs) {
 Ref OpDispatchBuilder::PSIGNImpl(OpcodeArgs, size_t ElementSize, Ref Src1, Ref Src2) { // Returns Src1 multiplied by a control vector derived from Src2.
  const auto Size = GetSrcSize(Op); // All three emitted ops use the instruction's source size.

-  if (CTX->BackendFeatures.SupportsSaturatingRoundingShifts) {
-    Ref Control = _VSQSHL(Size, ElementSize, Src2, (ElementSize * 8) - 1);
-    Control = _VSRSHR(Size, ElementSize, Control, (ElementSize * 8) - 1);
-    return _VMul(Size, ElementSize, Src1, Control);
-  } else {
-    auto NegVec = _VNeg(Size, ElementSize, Src1);
-    Ref CmpLT = _VCMPLTZ(Size, ElementSize, Src2);
-    Ref CmpEQ = _VCMPEQZ(Size, ElementSize, Src2);
-    auto BSLResult = _VBSL(Size, CmpLT, NegVec, Src1);
-    return _VAndn(Size, Size, BSLResult, CmpEQ);
-  }
+  Ref Control = _VSQSHL(Size, ElementSize, Src2, (ElementSize * 8) - 1); // Now unconditional: no backend-feature check guards this path any more.
+  Control = _VSRSHR(Size, ElementSize, Control, (ElementSize * 8) - 1); // Shifted back by the same amount (ElementSize * 8 - 1).
+  return _VMul(Size, ElementSize, Src1, Control);
 }

 template<size_t ElementSize>