From b2b91caa03a6f2861a3a80f919fbfc5f7f061668 Mon Sep 17 00:00:00 2001 From: Benjamin Bouvier Date: Wed, 27 Aug 2014 19:24:41 +0200 Subject: [PATCH] Bug 1021716: SIMD x86-x64: Implement MSimdShuffleMix; r=sunfish --- js/src/jit/LIR-Common.h | 29 ++++- js/src/jit/LOpcodes.h | 1 + js/src/jit/Lowering.cpp | 18 +++ js/src/jit/Lowering.h | 1 + js/src/jit/MIR.h | 121 ++++++++++++------ js/src/jit/MOpcodes.h | 1 + js/src/jit/ParallelSafetyAnalysis.cpp | 1 + js/src/jit/shared/Assembler-x86-shared.h | 16 +++ js/src/jit/shared/BaseAssembler-x86-shared.h | 20 ++- .../jit/shared/CodeGenerator-x86-shared.cpp | 13 ++ js/src/jit/shared/CodeGenerator-x86-shared.h | 1 + js/src/jit/shared/MacroAssembler-x86-shared.h | 16 ++- 12 files changed, 191 insertions(+), 47 deletions(-) diff --git a/js/src/jit/LIR-Common.h b/js/src/jit/LIR-Common.h index a8c427bf5436..9b5a7f6d1926 100644 --- a/js/src/jit/LIR-Common.h +++ b/js/src/jit/LIR-Common.h @@ -242,10 +242,10 @@ class LSimdSwizzleBase : public LInstructionHelper<1, 1, 0> return getOperand(0); } - SimdLane laneX() const { return mir_->toSimdSwizzle()->laneX(); } - SimdLane laneY() const { return mir_->toSimdSwizzle()->laneY(); } - SimdLane laneZ() const { return mir_->toSimdSwizzle()->laneZ(); } - SimdLane laneW() const { return mir_->toSimdSwizzle()->laneW(); } + int32_t laneX() const { return mir_->toSimdSwizzle()->laneX(); } + int32_t laneY() const { return mir_->toSimdSwizzle()->laneY(); } + int32_t laneZ() const { return mir_->toSimdSwizzle()->laneZ(); } + int32_t laneW() const { return mir_->toSimdSwizzle()->laneW(); } }; // Shuffles a int32x4 into another int32x4 vector. @@ -265,6 +265,27 @@ class LSimdSwizzleF : public LSimdSwizzleBase {} }; +// LIR instruction for both int32x4 and float32x4 shuffle operations. 
+class LSimdShuffle : public LInstructionHelper<1, 2, 0> +{ + public: + LIR_HEADER(SimdShuffle); + LSimdShuffle() + {} + + const LAllocation *lhs() { + return getOperand(0); + } + const LAllocation *rhs() { + return getOperand(1); + } + + int32_t laneX() const { return mir_->toSimdShuffle()->laneX(); } + int32_t laneY() const { return mir_->toSimdShuffle()->laneY(); } + int32_t laneZ() const { return mir_->toSimdShuffle()->laneZ(); } + int32_t laneW() const { return mir_->toSimdShuffle()->laneW(); } +}; + // Binary SIMD comparison operation between two SIMD operands class LSimdBinaryComp: public LInstructionHelper<1, 2, 0> { diff --git a/js/src/jit/LOpcodes.h b/js/src/jit/LOpcodes.h index 2bd8fcf2f2a6..84cc5ce9e157 100644 --- a/js/src/jit/LOpcodes.h +++ b/js/src/jit/LOpcodes.h @@ -26,6 +26,7 @@ _(SimdSignMaskX4) \ _(SimdSwizzleI) \ _(SimdSwizzleF) \ + _(SimdShuffle) \ _(SimdUnaryArithIx4) \ _(SimdUnaryArithFx4) \ _(SimdBinaryCompIx4) \ diff --git a/js/src/jit/Lowering.cpp b/js/src/jit/Lowering.cpp index 17dc591cb414..35d2a01256cc 100644 --- a/js/src/jit/Lowering.cpp +++ b/js/src/jit/Lowering.cpp @@ -3830,6 +3830,24 @@ LIRGenerator::visitSimdSwizzle(MSimdSwizzle *ins) return false; } +bool +LIRGenerator::visitSimdShuffle(MSimdShuffle *ins) +{ + MOZ_ASSERT(IsSimdType(ins->lhs()->type())); + MOZ_ASSERT(IsSimdType(ins->rhs()->type())); + MOZ_ASSERT(IsSimdType(ins->type())); + + if (ins->type() == MIRType_Int32x4 || ins->type() == MIRType_Float32x4) { + MDefinition *lhs = ins->lhs(); + MDefinition *rhs = ins->rhs(); + LSimdShuffle *lir = new (alloc()) LSimdShuffle; + return lowerForFPU(lir, ins, lhs, rhs); + } + + MOZ_CRASH("Unknown SIMD kind when lowering shuffle"); + return false; +} + bool LIRGenerator::visitSimdUnaryArith(MSimdUnaryArith *ins) { diff --git a/js/src/jit/Lowering.h b/js/src/jit/Lowering.h index 65ae87665f3a..06b738ff4464 100644 --- a/js/src/jit/Lowering.h +++ b/js/src/jit/Lowering.h @@ -273,6 +273,7 @@ class LIRGenerator : public LIRGeneratorSpecific bool 
visitSimdInsertElement(MSimdInsertElement *ins); bool visitSimdSignMask(MSimdSignMask *ins); bool visitSimdSwizzle(MSimdSwizzle *ins); + bool visitSimdShuffle(MSimdShuffle *ins); bool visitSimdUnaryArith(MSimdUnaryArith *ins); bool visitSimdBinaryComp(MSimdBinaryComp *ins); bool visitSimdBinaryArith(MSimdBinaryArith *ins); diff --git a/js/src/jit/MIR.h b/js/src/jit/MIR.h index 62b11508740d..e57c8d4d63ea 100644 --- a/js/src/jit/MIR.h +++ b/js/src/jit/MIR.h @@ -1575,34 +1575,51 @@ class MSimdSignMask : public MUnaryInstruction ALLOW_CLONE(MSimdSignMask) }; +// Base for the MSimdSwizzle and MSimdShuffle classes. +class MSimdShuffleBase +{ + protected: + // As of now, there are at most 4 lanes. For each lane, we need to know + // which input we choose and which of that input's 4 lanes we choose; that + // is a value in [0, 8), i.e. 3 bits for each lane, so 12 bits in total. + uint32_t laneMask_; + uint32_t arity_; + + MSimdShuffleBase(int32_t laneX, int32_t laneY, int32_t laneZ, int32_t laneW, MIRType type) + { + MOZ_ASSERT(SimdTypeToLength(type) == 4); + MOZ_ASSERT(IsSimdType(type)); + laneMask_ = (laneX << 0) | (laneY << 3) | (laneZ << 6) | (laneW << 9); + arity_ = 4; + } + + bool sameLanes(const MSimdShuffleBase *other) const { + return laneMask_ == other->laneMask_; + } + + public: + // For now, these formulas are fine for x4 types. They'll need to be + // generalized for other SIMD type lengths. + int32_t laneX() const { MOZ_ASSERT(arity_ == 4); return laneMask_ & 7; } + int32_t laneY() const { MOZ_ASSERT(arity_ == 4); return (laneMask_ >> 3) & 7; } + int32_t laneZ() const { MOZ_ASSERT(arity_ == 4); return (laneMask_ >> 6) & 7; } + int32_t laneW() const { MOZ_ASSERT(arity_ == 4); return (laneMask_ >> 9) & 7; } +}; + // Applies a shuffle operation to the input, putting the input lanes as // indicated in the output register's lanes. This implements the SIMD.js // "shuffle" function, which takes one vector and one mask. 
-class MSimdSwizzle : public MUnaryInstruction +class MSimdSwizzle : public MUnaryInstruction, public MSimdShuffleBase { protected: - // As of now, there are at most 4 lanes. - SimdLane laneX_; - SimdLane laneY_; - SimdLane laneZ_; - SimdLane laneW_; - MSimdSwizzle(MDefinition *obj, MIRType type, - SimdLane laneX, SimdLane laneY, SimdLane laneZ, SimdLane laneW) - : MUnaryInstruction(obj), - laneX_(laneX), laneY_(laneY), laneZ_(laneZ), laneW_(laneW) + int32_t laneX, int32_t laneY, int32_t laneZ, int32_t laneW) + : MUnaryInstruction(obj), MSimdShuffleBase(laneX, laneY, laneZ, laneW, type) { + MOZ_ASSERT(uint32_t(laneX | laneY | laneZ | laneW) < 4); /* unsigned compare: also rejects negative lanes */ MOZ_ASSERT(IsSimdType(obj->type())); - // Returned value needs to be in a vector too MOZ_ASSERT(IsSimdType(type)); - MOZ_ASSERT(SimdTypeToScalarType(obj->type()) == type); - - mozilla::DebugOnly expectedLength = SimdTypeToLength(obj->type()); - MOZ_ASSERT(uint32_t(laneX_) < expectedLength); - MOZ_ASSERT(uint32_t(laneY_) < expectedLength); - MOZ_ASSERT(uint32_t(laneZ_) < expectedLength); - MOZ_ASSERT(uint32_t(laneW_) < expectedLength); - + MOZ_ASSERT(obj->type() == type); setResultType(type); setMovable(); } @@ -1611,36 +1628,68 @@ class MSimdSwizzle : public MUnaryInstruction INSTRUCTION_HEADER(SimdSwizzle); static MSimdSwizzle *NewAsmJS(TempAllocator &alloc, MDefinition *obj, MIRType type, - SimdLane laneX, SimdLane laneY, SimdLane laneZ, SimdLane laneW) + int32_t laneX, int32_t laneY, int32_t laneZ, int32_t laneW) { return new(alloc) MSimdSwizzle(obj, type, laneX, laneY, laneZ, laneW); } - SimdLane laneX() const { return laneX_; } - SimdLane laneY() const { return laneY_; } - SimdLane laneZ() const { return laneZ_; } - SimdLane laneW() const { return laneW_; } - - AliasSet getAliasSet() const { - return AliasSet::None(); - } bool congruentTo(const MDefinition *ins) const { if (!ins->isSimdSwizzle()) return false; const MSimdSwizzle *other = ins->toSimdSwizzle(); - if (other->laneX_ != laneX_ || - other->laneY_ != 
laneY_ || - other->laneZ_ != laneZ_ || - other->laneW_ != laneW_) - { - return false; - } - return congruentIfOperandsEqual(other); + return sameLanes(other) && congruentIfOperandsEqual(other); + } + + AliasSet getAliasSet() const { + return AliasSet::None(); } ALLOW_CLONE(MSimdSwizzle) }; +// Applies a shuffle operation to the inputs, selecting the 2 first lanes of the +// output from lanes of the first input, and the 2 last lanes of the output from +// lanes of the second input. +class MSimdShuffle : public MBinaryInstruction, public MSimdShuffleBase +{ + MSimdShuffle(MDefinition *lhs, MDefinition *rhs, MIRType type, + int32_t laneX, int32_t laneY, int32_t laneZ, int32_t laneW) + : MBinaryInstruction(lhs, rhs), MSimdShuffleBase(laneX, laneY, laneZ, laneW, lhs->type()) + { + MOZ_ASSERT(uint32_t(laneX | laneY | laneZ | laneW) < 8); /* unsigned compare: also rejects negative lanes */ + MOZ_ASSERT(IsSimdType(lhs->type())); + MOZ_ASSERT(IsSimdType(rhs->type())); + MOZ_ASSERT(lhs->type() == rhs->type()); + MOZ_ASSERT(IsSimdType(type)); + MOZ_ASSERT(lhs->type() == type); + setResultType(type); + setMovable(); + } + + public: + INSTRUCTION_HEADER(SimdShuffle); + + static MSimdShuffle *NewAsmJS(TempAllocator &alloc, MDefinition *lhs, MDefinition *rhs, + MIRType type, int32_t laneX, int32_t laneY, int32_t laneZ, + int32_t laneW) + { + return new(alloc) MSimdShuffle(lhs, rhs, type, laneX, laneY, laneZ, laneW); + } + + bool congruentTo(const MDefinition *ins) const { + if (!ins->isSimdShuffle()) + return false; + const MSimdShuffle *other = ins->toSimdShuffle(); + return sameLanes(other) && binaryCongruentTo(other); + } + + AliasSet getAliasSet() const { + return AliasSet::None(); + } + + ALLOW_CLONE(MSimdShuffle) +}; + class MSimdUnaryArith : public MUnaryInstruction { public: diff --git a/js/src/jit/MOpcodes.h b/js/src/jit/MOpcodes.h index 27a95dbc7f25..5a8b28e2d8f9 100644 --- a/js/src/jit/MOpcodes.h +++ b/js/src/jit/MOpcodes.h @@ -21,6 +21,7 @@ namespace jit { _(SimdInsertElement) \ _(SimdSignMask) \ _(SimdSwizzle) \ 
+ _(SimdShuffle) \ _(SimdUnaryArith) \ _(SimdBinaryComp) \ _(SimdBinaryArith) \ diff --git a/js/src/jit/ParallelSafetyAnalysis.cpp b/js/src/jit/ParallelSafetyAnalysis.cpp index e2b06894b6be..95be08c268f6 100644 --- a/js/src/jit/ParallelSafetyAnalysis.cpp +++ b/js/src/jit/ParallelSafetyAnalysis.cpp @@ -120,6 +120,7 @@ class ParallelSafetyVisitor : public MDefinitionVisitor SAFE_OP(SimdInsertElement) SAFE_OP(SimdSignMask) SAFE_OP(SimdSwizzle) + SAFE_OP(SimdShuffle) SAFE_OP(SimdUnaryArith) SAFE_OP(SimdBinaryComp) SAFE_OP(SimdBinaryArith) diff --git a/js/src/jit/shared/Assembler-x86-shared.h b/js/src/jit/shared/Assembler-x86-shared.h index 1ff589e26091..2e09a51d4f42 100644 --- a/js/src/jit/shared/Assembler-x86-shared.h +++ b/js/src/jit/shared/Assembler-x86-shared.h @@ -1864,6 +1864,22 @@ class AssemblerX86Shared : public AssemblerShared MOZ_ASSERT(HasSSE2()); masm.shufps_irr(mask, src.code(), dest.code()); } + void shufps(uint32_t mask, const Operand &src, FloatRegister dest) { + MOZ_ASSERT(HasSSE2()); + switch (src.kind()) { /* pick the encoder matching the source operand form */ + case Operand::FPREG: /* source is a float register */ + masm.shufps_irr(mask, src.fpu(), dest.code()); + break; + case Operand::MEM_REG_DISP: /* source is memory at base register + displacement */ + masm.shufps_imr(mask, src.disp(), src.base(), dest.code()); + break; + case Operand::MEM_ADDRESS32: /* source is memory at an absolute address */ + masm.shufps_imr(mask, src.address(), dest.code()); + break; + default: + MOZ_CRASH("unexpected operand kind"); + } + } void addsd(FloatRegister src, FloatRegister dest) { MOZ_ASSERT(HasSSE2()); masm.addsd_rr(src.code(), dest.code()); diff --git a/js/src/jit/shared/BaseAssembler-x86-shared.h b/js/src/jit/shared/BaseAssembler-x86-shared.h index ee82fcb02978..b46cadfa6d82 100644 --- a/js/src/jit/shared/BaseAssembler-x86-shared.h +++ b/js/src/jit/shared/BaseAssembler-x86-shared.h @@ -2940,7 +2940,7 @@ public: void pshufd_irr(uint32_t mask, XMMRegisterID src, XMMRegisterID dst) { MOZ_ASSERT(mask < 256); - spew("pshufd 0x%x, %s, %s", + spew("pshufd 0x%x, %s, %s", mask, nameFPReg(src), nameFPReg(dst)); m_formatter.prefix(PRE_SSE_66); 
m_formatter.twoByteOp(OP2_PSHUFD_VdqWdqIb, (RegisterID)dst, (RegisterID)src); @@ -2956,6 +2956,24 @@ public: m_formatter.immediate8(uint8_t(mask)); } + void shufps_imr(uint32_t mask, int offset, RegisterID base, XMMRegisterID dst) + { + MOZ_ASSERT(mask < 256); + spew("shufps 0x%x, %s0x%x(%s), %s", + mask, PRETTY_PRINT_OFFSET(offset), nameIReg(base), nameFPReg(dst)); + m_formatter.twoByteOp(OP2_SHUFPS_VpsWpsIb, (RegisterID)dst, base, offset); + m_formatter.immediate8(uint8_t(mask)); + } + + void shufps_imr(uint32_t mask, const void* address, XMMRegisterID dst) + { + MOZ_ASSERT(mask < 256); + spew("shufps 0x%x, %p, %s", + mask, address, nameFPReg(dst)); + m_formatter.twoByteOp(OP2_SHUFPS_VpsWpsIb, (RegisterID)dst, address); /* no SSE prefix, same as the base+displacement overload */ + m_formatter.immediate8(uint8_t(mask)); + } + void movhlps_rr(XMMRegisterID src, XMMRegisterID dst) { spew("movhlps %s, %s", diff --git a/js/src/jit/shared/CodeGenerator-x86-shared.cpp b/js/src/jit/shared/CodeGenerator-x86-shared.cpp index ab2d847740ed..6d44607233f5 100644 --- a/js/src/jit/shared/CodeGenerator-x86-shared.cpp +++ b/js/src/jit/shared/CodeGenerator-x86-shared.cpp @@ -2412,6 +2412,19 @@ CodeGeneratorX86Shared::visitSimdSwizzleF(LSimdSwizzleF *ins) return true; } +bool +CodeGeneratorX86Shared::visitSimdShuffle(LSimdShuffle *ins) +{ + FloatRegister lhs = ToFloatRegister(ins->lhs()); + Operand rhs = ToOperand(ins->rhs()); + MOZ_ASSERT(ToFloatRegister(ins->output()) == lhs); + + uint32_t mask = MacroAssembler::ComputeShuffleMask(ins->laneX(), ins->laneY(), ins->laneZ() - 4, + ins->laneW() - 4); + masm.shuffleMix(mask, rhs, lhs); + return true; +} + bool CodeGeneratorX86Shared::visitSimdBinaryCompIx4(LSimdBinaryCompIx4 *ins) { diff --git a/js/src/jit/shared/CodeGenerator-x86-shared.h b/js/src/jit/shared/CodeGenerator-x86-shared.h index daca5c6e0c95..8ac2d9f79653 100644 --- a/js/src/jit/shared/CodeGenerator-x86-shared.h +++ b/js/src/jit/shared/CodeGenerator-x86-shared.h @@ -221,6 +221,7 @@ class CodeGeneratorX86Shared : public 
CodeGeneratorShared bool visitSimdSignMaskX4(LSimdSignMaskX4 *ins); bool visitSimdSwizzleI(LSimdSwizzleI *lir); bool visitSimdSwizzleF(LSimdSwizzleF *lir); + bool visitSimdShuffle(LSimdShuffle *lir); bool visitSimdUnaryArithIx4(LSimdUnaryArithIx4 *lir); bool visitSimdUnaryArithFx4(LSimdUnaryArithFx4 *lir); bool visitSimdBinaryCompIx4(LSimdBinaryCompIx4 *lir); diff --git a/js/src/jit/shared/MacroAssembler-x86-shared.h b/js/src/jit/shared/MacroAssembler-x86-shared.h index 31b29c70fd1e..c7dcf026ca39 100644 --- a/js/src/jit/shared/MacroAssembler-x86-shared.h +++ b/js/src/jit/shared/MacroAssembler-x86-shared.h @@ -595,13 +595,12 @@ class MacroAssemblerX86Shared : public Assembler void packedDivFloat32(const Operand &src, FloatRegister dest) { divps(src, dest); } - static uint32_t ComputeShuffleMask(SimdLane x, SimdLane y = LaneX, - SimdLane z = LaneX, SimdLane w = LaneX) + + static uint32_t ComputeShuffleMask(uint32_t x = LaneX, uint32_t y = LaneY, + uint32_t z = LaneZ, uint32_t w = LaneW) { - uint32_t r = (uint32_t(w) << 6) | - (uint32_t(z) << 4) | - (uint32_t(y) << 2) | - uint32_t(x); + MOZ_ASSERT(x < 4 && y < 4 && z < 4 && w < 4); + uint32_t r = (w << 6) | (z << 4) | (y << 2) | (x << 0); MOZ_ASSERT(r < 256); return r; } @@ -626,6 +625,11 @@ class MacroAssemblerX86Shared : public Assembler moveAlignedFloat32x4(src, dest); shufps(mask, dest, dest); } + void shuffleMix(uint32_t mask, const Operand &src, FloatRegister dest) { + // Note this uses shufps, which is a cross-domain penalty on CPUs where it + // applies, but that's the way clang and gcc do it. + shufps(mask, src, dest); + } void moveFloatAsDouble(Register src, FloatRegister dest) { movd(src, dest);