From b2b91caa03a6f2861a3a80f919fbfc5f7f061668 Mon Sep 17 00:00:00 2001
From: Benjamin Bouvier <benj@benj.me>
Date: Wed, 27 Aug 2014 19:24:41 +0200
Subject: [PATCH] Bug 1021716: SIMD x86-x64: Implement MSimdShuffleMix;
 r=sunfish

---
 js/src/jit/LIR-Common.h                       |  29 ++++-
 js/src/jit/LOpcodes.h                         |   1 +
 js/src/jit/Lowering.cpp                       |  18 +++
 js/src/jit/Lowering.h                         |   1 +
 js/src/jit/MIR.h                              | 121 ++++++++++++------
 js/src/jit/MOpcodes.h                         |   1 +
 js/src/jit/ParallelSafetyAnalysis.cpp         |   1 +
 js/src/jit/shared/Assembler-x86-shared.h      |  16 +++
 js/src/jit/shared/BaseAssembler-x86-shared.h  |  20 ++-
 .../jit/shared/CodeGenerator-x86-shared.cpp   |  13 ++
 js/src/jit/shared/CodeGenerator-x86-shared.h  |   1 +
 js/src/jit/shared/MacroAssembler-x86-shared.h |  16 ++-
 12 files changed, 191 insertions(+), 47 deletions(-)

diff --git a/js/src/jit/LIR-Common.h b/js/src/jit/LIR-Common.h
index a8c427bf5436..9b5a7f6d1926 100644
--- a/js/src/jit/LIR-Common.h
+++ b/js/src/jit/LIR-Common.h
@@ -242,10 +242,10 @@ class LSimdSwizzleBase : public LInstructionHelper<1, 1, 0>
         return getOperand(0);
     }
 
-    SimdLane laneX() const { return mir_->toSimdSwizzle()->laneX(); }
-    SimdLane laneY() const { return mir_->toSimdSwizzle()->laneY(); }
-    SimdLane laneZ() const { return mir_->toSimdSwizzle()->laneZ(); }
-    SimdLane laneW() const { return mir_->toSimdSwizzle()->laneW(); }
+    int32_t laneX() const { return mir_->toSimdSwizzle()->laneX(); }
+    int32_t laneY() const { return mir_->toSimdSwizzle()->laneY(); }
+    int32_t laneZ() const { return mir_->toSimdSwizzle()->laneZ(); }
+    int32_t laneW() const { return mir_->toSimdSwizzle()->laneW(); }
 };
 
 // Shuffles a int32x4 into another int32x4 vector.
@@ -265,6 +265,27 @@ class LSimdSwizzleF : public LSimdSwizzleBase
     {}
 };
 
+// Two-operand shuffle shared by int32x4 and float32x4; lanes are read from the MSimdShuffle.
+class LSimdShuffle : public LInstructionHelper<1, 2, 0>
+{
+  public:
+    LIR_HEADER(SimdShuffle);
+    LSimdShuffle()
+    {}
+
+    const LAllocation *lhs() {
+        return getOperand(0);
+    }
+    const LAllocation *rhs() {
+        return getOperand(1);
+    }
+
+    int32_t laneX() const { return mir_->toSimdShuffle()->laneX(); }
+    int32_t laneY() const { return mir_->toSimdShuffle()->laneY(); }
+    int32_t laneZ() const { return mir_->toSimdShuffle()->laneZ(); }
+    int32_t laneW() const { return mir_->toSimdShuffle()->laneW(); }
+};
+
 // Binary SIMD comparison operation between two SIMD operands
 class LSimdBinaryComp: public LInstructionHelper<1, 2, 0>
 {
diff --git a/js/src/jit/LOpcodes.h b/js/src/jit/LOpcodes.h
index 2bd8fcf2f2a6..84cc5ce9e157 100644
--- a/js/src/jit/LOpcodes.h
+++ b/js/src/jit/LOpcodes.h
@@ -26,6 +26,7 @@
     _(SimdSignMaskX4)               \
     _(SimdSwizzleI)                 \
     _(SimdSwizzleF)                 \
+    _(SimdShuffle)                  \
     _(SimdUnaryArithIx4)            \
     _(SimdUnaryArithFx4)            \
     _(SimdBinaryCompIx4)            \
diff --git a/js/src/jit/Lowering.cpp b/js/src/jit/Lowering.cpp
index 17dc591cb414..35d2a01256cc 100644
--- a/js/src/jit/Lowering.cpp
+++ b/js/src/jit/Lowering.cpp
@@ -3830,6 +3830,24 @@ LIRGenerator::visitSimdSwizzle(MSimdSwizzle *ins)
     return false;
 }
 
+bool
+LIRGenerator::visitSimdShuffle(MSimdShuffle *ins)
+{
+    // Lowers a two-input SIMD shuffle; only Int32x4 and Float32x4 results are supported.
+    MOZ_ASSERT(IsSimdType(ins->lhs()->type()));
+    MOZ_ASSERT(IsSimdType(ins->rhs()->type()));
+    MOZ_ASSERT(IsSimdType(ins->type()));
+
+    if (ins->type() != MIRType_Int32x4 && ins->type() != MIRType_Float32x4) {
+        MOZ_CRASH("Unknown SIMD kind when shuffling");
+        return false;
+    }
+
+    // Both x4 kinds share a single LIR node.
+    LSimdShuffle *lir = new (alloc()) LSimdShuffle;
+    return lowerForFPU(lir, ins, ins->lhs(), ins->rhs());
+}
+
 bool
 LIRGenerator::visitSimdUnaryArith(MSimdUnaryArith *ins)
 {
diff --git a/js/src/jit/Lowering.h b/js/src/jit/Lowering.h
index 65ae87665f3a..06b738ff4464 100644
--- a/js/src/jit/Lowering.h
+++ b/js/src/jit/Lowering.h
@@ -273,6 +273,7 @@ class LIRGenerator : public LIRGeneratorSpecific
     bool visitSimdInsertElement(MSimdInsertElement *ins);
     bool visitSimdSignMask(MSimdSignMask *ins);
     bool visitSimdSwizzle(MSimdSwizzle *ins);
+    bool visitSimdShuffle(MSimdShuffle *ins);
     bool visitSimdUnaryArith(MSimdUnaryArith *ins);
     bool visitSimdBinaryComp(MSimdBinaryComp *ins);
     bool visitSimdBinaryArith(MSimdBinaryArith *ins);
diff --git a/js/src/jit/MIR.h b/js/src/jit/MIR.h
index 62b11508740d..e57c8d4d63ea 100644
--- a/js/src/jit/MIR.h
+++ b/js/src/jit/MIR.h
@@ -1575,34 +1575,51 @@ class MSimdSignMask : public MUnaryInstruction
     ALLOW_CLONE(MSimdSignMask)
 };
 
+// Base for the MSimdSwizzle and MSimdShuffle classes.
+class MSimdShuffleBase
+{
+  protected:
+    // As of now, there are at most 4 lanes. For each lane, we need to know
+    // which input we choose and which of the 4 lanes we choose; that is a
+    // value in [0, 8) packed in 3 bits per lane, so 12 bits in total.
+    uint32_t laneMask_;
+    uint32_t arity_;
+
+    MSimdShuffleBase(int32_t laneX, int32_t laneY, int32_t laneZ, int32_t laneW, MIRType type)
+    {
+        MOZ_ASSERT(IsSimdType(type) && SimdTypeToLength(type) == 4);
+        MOZ_ASSERT(uint32_t(laneX | laneY | laneZ | laneW) < 8); // else fields would overlap
+        laneMask_ = (laneX << 0) | (laneY << 3) | (laneZ << 6) | (laneW << 9);
+        arity_ = 4;
+    }
+
+    bool sameLanes(const MSimdShuffleBase *other) const {
+        return laneMask_ == other->laneMask_;
+    }
+
+  public:
+    // For now, these formulas are fine for x4 types. They'll need to be
+    // generalized for other SIMD type lengths.
+    int32_t laneX() const { MOZ_ASSERT(arity_ == 4); return laneMask_ & 7; }
+    int32_t laneY() const { MOZ_ASSERT(arity_ == 4); return (laneMask_ >> 3) & 7; }
+    int32_t laneZ() const { MOZ_ASSERT(arity_ == 4); return (laneMask_ >> 6) & 7; }
+    int32_t laneW() const { MOZ_ASSERT(arity_ == 4); return (laneMask_ >> 9) & 7; }
+};
+
 // Applies a shuffle operation to the input, putting the input lanes as
 // indicated in the output register's lanes. This implements the SIMD.js
 // "shuffle" function, that takes one vector and one mask.
-class MSimdSwizzle : public MUnaryInstruction
+class MSimdSwizzle : public MUnaryInstruction, public MSimdShuffleBase
 {
   protected:
-    // As of now, there are at most 4 lanes.
-    SimdLane laneX_;
-    SimdLane laneY_;
-    SimdLane laneZ_;
-    SimdLane laneW_;
-
     MSimdSwizzle(MDefinition *obj, MIRType type,
-                 SimdLane laneX, SimdLane laneY, SimdLane laneZ, SimdLane laneW)
-      : MUnaryInstruction(obj),
-        laneX_(laneX), laneY_(laneY), laneZ_(laneZ), laneW_(laneW)
+                 int32_t laneX, int32_t laneY, int32_t laneZ, int32_t laneW)
+      : MUnaryInstruction(obj), MSimdShuffleBase(laneX, laneY, laneZ, laneW, type)
     {
+        MOZ_ASSERT(laneX < 4 && laneY < 4 && laneZ < 4 && laneW < 4);
         MOZ_ASSERT(IsSimdType(obj->type()));
-        // Returned value needs to be in a vector too
         MOZ_ASSERT(IsSimdType(type));
-        MOZ_ASSERT(SimdTypeToScalarType(obj->type()) == type);
-
-        mozilla::DebugOnly<uint32_t> expectedLength = SimdTypeToLength(obj->type());
-        MOZ_ASSERT(uint32_t(laneX_) < expectedLength);
-        MOZ_ASSERT(uint32_t(laneY_) < expectedLength);
-        MOZ_ASSERT(uint32_t(laneZ_) < expectedLength);
-        MOZ_ASSERT(uint32_t(laneW_) < expectedLength);
-
+        MOZ_ASSERT(obj->type() == type);
         setResultType(type);
         setMovable();
     }
@@ -1611,36 +1628,68 @@ class MSimdSwizzle : public MUnaryInstruction
     INSTRUCTION_HEADER(SimdSwizzle);
 
     static MSimdSwizzle *NewAsmJS(TempAllocator &alloc, MDefinition *obj, MIRType type,
-                                  SimdLane laneX, SimdLane laneY, SimdLane laneZ, SimdLane laneW)
+                                  int32_t laneX, int32_t laneY, int32_t laneZ, int32_t laneW)
     {
         return new(alloc) MSimdSwizzle(obj, type, laneX, laneY, laneZ, laneW);
     }
 
-    SimdLane laneX() const { return laneX_; }
-    SimdLane laneY() const { return laneY_; }
-    SimdLane laneZ() const { return laneZ_; }
-    SimdLane laneW() const { return laneW_; }
-
-    AliasSet getAliasSet() const {
-        return AliasSet::None();
-    }
     bool congruentTo(const MDefinition *ins) const {
         if (!ins->isSimdSwizzle())
             return false;
         const MSimdSwizzle *other = ins->toSimdSwizzle();
-        if (other->laneX_ != laneX_ ||
-            other->laneY_ != laneY_ ||
-            other->laneZ_ != laneZ_ ||
-            other->laneW_ != laneW_)
-        {
-            return false;
-        }
-        return congruentIfOperandsEqual(other);
+        return sameLanes(other) && congruentIfOperandsEqual(other);
+    }
+
+    AliasSet getAliasSet() const {
+        return AliasSet::None();
     }
 
     ALLOW_CLONE(MSimdSwizzle)
 };
 
+// Applies a shuffle operation to the inputs, selecting the 2 first lanes of the
+// output from lanes of the first input, and the 2 last lanes of the output from
+// lanes of the second input.
+class MSimdShuffle : public MBinaryInstruction, public MSimdShuffleBase
+{
+    MSimdShuffle(MDefinition *lhs, MDefinition *rhs, MIRType type,
+                 int32_t laneX, int32_t laneY, int32_t laneZ, int32_t laneW)
+      : MBinaryInstruction(lhs, rhs), MSimdShuffleBase(laneX, laneY, laneZ, laneW, lhs->type())
+    {
+        // Lanes 0-3 name lhs lanes and 4-7 name rhs lanes: X/Y must pick from lhs
+        // and Z/W from rhs, which the x86 lowering relies on (it passes Z-4, W-4).
+        MOZ_ASSERT(0 <= laneX && laneX < 4 && 0 <= laneY && laneY < 4);
+        MOZ_ASSERT(4 <= laneZ && laneZ < 8 && 4 <= laneW && laneW < 8);
+        MOZ_ASSERT(IsSimdType(type));
+        MOZ_ASSERT(lhs->type() == type && rhs->type() == type);
+        setResultType(type);
+        setMovable();
+    }
+
+  public:
+    INSTRUCTION_HEADER(SimdShuffle);
+
+    static MSimdShuffle *NewAsmJS(TempAllocator &alloc, MDefinition *lhs, MDefinition *rhs,
+                                  MIRType type, int32_t laneX, int32_t laneY, int32_t laneZ,
+                                  int32_t laneW)
+    {
+        return new(alloc) MSimdShuffle(lhs, rhs, type, laneX, laneY, laneZ, laneW);
+    }
+
+    bool congruentTo(const MDefinition *ins) const {
+        if (!ins->isSimdShuffle())
+            return false;
+        const MSimdShuffle *other = ins->toSimdShuffle();
+        return sameLanes(other) && binaryCongruentTo(other);
+    }
+
+    AliasSet getAliasSet() const {
+        return AliasSet::None();
+    }
+
+    ALLOW_CLONE(MSimdShuffle)
+};
+
 class MSimdUnaryArith : public MUnaryInstruction
 {
   public:
diff --git a/js/src/jit/MOpcodes.h b/js/src/jit/MOpcodes.h
index 27a95dbc7f25..5a8b28e2d8f9 100644
--- a/js/src/jit/MOpcodes.h
+++ b/js/src/jit/MOpcodes.h
@@ -21,6 +21,7 @@ namespace jit {
     _(SimdInsertElement)                                                    \
     _(SimdSignMask)                                                         \
     _(SimdSwizzle)                                                          \
+    _(SimdShuffle)                                                          \
     _(SimdUnaryArith)                                                       \
     _(SimdBinaryComp)                                                       \
     _(SimdBinaryArith)                                                      \
diff --git a/js/src/jit/ParallelSafetyAnalysis.cpp b/js/src/jit/ParallelSafetyAnalysis.cpp
index e2b06894b6be..95be08c268f6 100644
--- a/js/src/jit/ParallelSafetyAnalysis.cpp
+++ b/js/src/jit/ParallelSafetyAnalysis.cpp
@@ -120,6 +120,7 @@ class ParallelSafetyVisitor : public MDefinitionVisitor
     SAFE_OP(SimdInsertElement)
     SAFE_OP(SimdSignMask)
     SAFE_OP(SimdSwizzle)
+    SAFE_OP(SimdShuffle)
     SAFE_OP(SimdUnaryArith)
     SAFE_OP(SimdBinaryComp)
     SAFE_OP(SimdBinaryArith)
diff --git a/js/src/jit/shared/Assembler-x86-shared.h b/js/src/jit/shared/Assembler-x86-shared.h
index 1ff589e26091..2e09a51d4f42 100644
--- a/js/src/jit/shared/Assembler-x86-shared.h
+++ b/js/src/jit/shared/Assembler-x86-shared.h
@@ -1864,6 +1864,22 @@ class AssemblerX86Shared : public AssemblerShared
         MOZ_ASSERT(HasSSE2());
         masm.shufps_irr(mask, src.code(), dest.code());
     }
+    void shufps(uint32_t mask, const Operand &src, FloatRegister dest) {
+        MOZ_ASSERT(HasSSE2());
+        switch (src.kind()) { // pick the encoder overload matching the source operand's kind
+          case Operand::FPREG:
+            masm.shufps_irr(mask, src.fpu(), dest.code());
+            break;
+          case Operand::MEM_REG_DISP:
+            masm.shufps_imr(mask, src.disp(), src.base(), dest.code());
+            break;
+          case Operand::MEM_ADDRESS32:
+            masm.shufps_imr(mask, src.address(), dest.code());
+            break;
+          default:
+            MOZ_CRASH("unexpected operand kind");
+        }
+    }
     void addsd(FloatRegister src, FloatRegister dest) {
         MOZ_ASSERT(HasSSE2());
         masm.addsd_rr(src.code(), dest.code());
diff --git a/js/src/jit/shared/BaseAssembler-x86-shared.h b/js/src/jit/shared/BaseAssembler-x86-shared.h
index ee82fcb02978..b46cadfa6d82 100644
--- a/js/src/jit/shared/BaseAssembler-x86-shared.h
+++ b/js/src/jit/shared/BaseAssembler-x86-shared.h
@@ -2940,7 +2940,7 @@ public:
     void pshufd_irr(uint32_t mask, XMMRegisterID src, XMMRegisterID dst)
     {
         MOZ_ASSERT(mask < 256);
-        spew("pshufd      0x%x, %s, %s",
+        spew("pshufd     0x%x, %s, %s",
              mask, nameFPReg(src), nameFPReg(dst));
         m_formatter.prefix(PRE_SSE_66);
         m_formatter.twoByteOp(OP2_PSHUFD_VdqWdqIb, (RegisterID)dst, (RegisterID)src);
@@ -2956,6 +2956,24 @@ public:
         m_formatter.immediate8(uint8_t(mask));
     }
 
+    void shufps_imr(uint32_t mask, int offset, RegisterID base, XMMRegisterID dst)
+    {
+        MOZ_ASSERT(mask < 256); // the shuffle selector is emitted below as one immediate byte
+        spew("shufps     0x%x, %s0x%x(%s), %s",
+             mask, PRETTY_PRINT_OFFSET(offset), nameIReg(base), nameFPReg(dst));
+        m_formatter.twoByteOp(OP2_SHUFPS_VpsWpsIb, (RegisterID)dst, base, offset);
+        m_formatter.immediate8(uint8_t(mask));
+    }
+
+    void shufps_imr(uint32_t mask, const void* address, XMMRegisterID dst)
+    {
+        // Like the other shufps encoders, emit no SSE prefix: shufps has none.
+        MOZ_ASSERT(mask < 256);
+        spew("shufps     %x, %p, %s", mask, address, nameFPReg(dst));
+        m_formatter.twoByteOp(OP2_SHUFPS_VpsWpsIb, (RegisterID)dst, address);
+        m_formatter.immediate8(uint8_t(mask));
+    }
+
     void movhlps_rr(XMMRegisterID src, XMMRegisterID dst)
     {
         spew("movhlps     %s, %s",
diff --git a/js/src/jit/shared/CodeGenerator-x86-shared.cpp b/js/src/jit/shared/CodeGenerator-x86-shared.cpp
index ab2d847740ed..6d44607233f5 100644
--- a/js/src/jit/shared/CodeGenerator-x86-shared.cpp
+++ b/js/src/jit/shared/CodeGenerator-x86-shared.cpp
@@ -2412,6 +2412,19 @@ CodeGeneratorX86Shared::visitSimdSwizzleF(LSimdSwizzleF *ins)
     return true;
 }
 
+bool
+CodeGeneratorX86Shared::visitSimdShuffle(LSimdShuffle *ins)
+{
+    FloatRegister lhs = ToFloatRegister(ins->lhs());
+    Operand rhs = ToOperand(ins->rhs());
+    MOZ_ASSERT(ToFloatRegister(ins->output()) == lhs); // lhs is also shuffleMix's dest below
+
+    uint32_t mask = MacroAssembler::ComputeShuffleMask(ins->laneX(), ins->laneY(), ins->laneZ() - 4,
+                                                       ins->laneW() - 4);
+    masm.shuffleMix(mask, rhs, lhs);
+    return true;
+}
+
 bool
 CodeGeneratorX86Shared::visitSimdBinaryCompIx4(LSimdBinaryCompIx4 *ins)
 {
diff --git a/js/src/jit/shared/CodeGenerator-x86-shared.h b/js/src/jit/shared/CodeGenerator-x86-shared.h
index daca5c6e0c95..8ac2d9f79653 100644
--- a/js/src/jit/shared/CodeGenerator-x86-shared.h
+++ b/js/src/jit/shared/CodeGenerator-x86-shared.h
@@ -221,6 +221,7 @@ class CodeGeneratorX86Shared : public CodeGeneratorShared
     bool visitSimdSignMaskX4(LSimdSignMaskX4 *ins);
     bool visitSimdSwizzleI(LSimdSwizzleI *lir);
     bool visitSimdSwizzleF(LSimdSwizzleF *lir);
+    bool visitSimdShuffle(LSimdShuffle *lir);
     bool visitSimdUnaryArithIx4(LSimdUnaryArithIx4 *lir);
     bool visitSimdUnaryArithFx4(LSimdUnaryArithFx4 *lir);
     bool visitSimdBinaryCompIx4(LSimdBinaryCompIx4 *lir);
diff --git a/js/src/jit/shared/MacroAssembler-x86-shared.h b/js/src/jit/shared/MacroAssembler-x86-shared.h
index 31b29c70fd1e..c7dcf026ca39 100644
--- a/js/src/jit/shared/MacroAssembler-x86-shared.h
+++ b/js/src/jit/shared/MacroAssembler-x86-shared.h
@@ -595,13 +595,12 @@ class MacroAssemblerX86Shared : public Assembler
     void packedDivFloat32(const Operand &src, FloatRegister dest) {
         divps(src, dest);
     }
-    static uint32_t ComputeShuffleMask(SimdLane x, SimdLane y = LaneX,
-                                       SimdLane z = LaneX, SimdLane w = LaneX)
+
+    static uint32_t ComputeShuffleMask(uint32_t x = LaneX, uint32_t y = LaneY,
+                                       uint32_t z = LaneZ, uint32_t w = LaneW)
     {
-        uint32_t r = (uint32_t(w) << 6) |
-                     (uint32_t(z) << 4) |
-                     (uint32_t(y) << 2) |
-                     uint32_t(x);
+        MOZ_ASSERT(x < 4 && y < 4 && z < 4 && w < 4);
+        uint32_t r = (w << 6) | (z << 4) | (y << 2) | (x << 0);
         MOZ_ASSERT(r < 256);
         return r;
     }
@@ -626,6 +625,11 @@ class MacroAssemblerX86Shared : public Assembler
             moveAlignedFloat32x4(src, dest);
         shufps(mask, dest, dest);
     }
+    void shuffleMix(uint32_t mask, const Operand &src, FloatRegister dest) {
+        // Note this uses shufps, which incurs a cross-domain penalty on CPUs
+        // where that applies, but that's the way clang and gcc do it.
+        shufps(mask, src, dest);
+    }
 
     void moveFloatAsDouble(Register src, FloatRegister dest) {
         movd(src, dest);