Bug 1136226 - Unary functions for small integer SIMD types. r=bbouvier

- Implement 'not' and 'neg' for 8x16 and 16x8 types.
- Rename some 'bitwiseFooX4' masm functions to 'bitwiseFooSimd128'.
- Rename the zeroInt32x4 and zeroFloat32x4 to zeroSimd128{Int,Float}.
- Add support for the paddb/paddw and psubb/psubw SSE2 instructions in the
  assembler.
Jakob Stoklund Olesen 2016-05-31 09:00:18 -07:00
parent 43fd82ad3e
commit 62a36df31c
12 changed files with 260 additions and 34 deletions
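For context before reading the diffs: both new operations are lane-wise, and 'neg' wraps in two's complement. A scalar model of a single 8-bit lane (an illustrative sketch, not code from this patch; the 16x8 variants are identical with int16_t lanes):

    #include <cstdint>

    // neg: each lane becomes 0 - lane, wrapping, so lane_neg(INT8_MIN) == INT8_MIN.
    int8_t lane_neg(int8_t x) { return int8_t(0u - uint8_t(x)); }

    // not: each lane is complemented bit by bit.
    int8_t lane_not(int8_t x) { return int8_t(~uint8_t(x)); }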


@@ -4483,13 +4483,23 @@ LIRGenerator::visitSimdUnaryArith(MSimdUnaryArith* ins)
// Cannot be at start, as the output is used as a temporary to store values.
LUse in = use(ins->input());
-if (ins->type() == MIRType::Int32x4 || ins->type() == MIRType::Bool32x4) {
-LSimdUnaryArithIx4* lir = new(alloc()) LSimdUnaryArithIx4(in);
-define(lir, ins);
-} else if (ins->type() == MIRType::Float32x4) {
-LSimdUnaryArithFx4* lir = new(alloc()) LSimdUnaryArithFx4(in);
-define(lir, ins);
-} else {
+switch (ins->type()) {
+case MIRType::Int8x16:
+case MIRType::Bool8x16:
+define(new (alloc()) LSimdUnaryArithIx16(in), ins);
+break;
+case MIRType::Int16x8:
+case MIRType::Bool16x8:
+define(new (alloc()) LSimdUnaryArithIx8(in), ins);
+break;
+case MIRType::Int32x4:
+case MIRType::Bool32x4:
+define(new (alloc()) LSimdUnaryArithIx4(in), ins);
+break;
+case MIRType::Float32x4:
+define(new (alloc()) LSimdUnaryArithFx4(in), ins);
+break;
+default:
MOZ_CRASH("Unknown SIMD kind for unary operation");
}
}


@@ -525,6 +525,22 @@ class LSimdUnaryArith : public LInstructionHelper<1, 1, 0>
}
};
// Unary SIMD arithmetic operation on an Int8x16 operand
class LSimdUnaryArithIx16 : public LSimdUnaryArith
{
public:
LIR_HEADER(SimdUnaryArithIx16);
explicit LSimdUnaryArithIx16(const LAllocation& in) : LSimdUnaryArith(in) {}
};
// Unary SIMD arithmetic operation on an Int16x8 operand
class LSimdUnaryArithIx8 : public LSimdUnaryArith
{
public:
LIR_HEADER(SimdUnaryArithIx8);
explicit LSimdUnaryArithIx8(const LAllocation& in) : LSimdUnaryArith(in) {}
};
// Unary SIMD arithmetic operation on an Int32x4 operand
class LSimdUnaryArithIx4 : public LSimdUnaryArith
{


@@ -37,6 +37,8 @@
_(SimdSwizzleI) \
_(SimdSwizzleF) \
_(SimdShuffle) \
_(SimdUnaryArithIx16) \
_(SimdUnaryArithIx8) \
_(SimdUnaryArithIx4) \
_(SimdUnaryArithFx4) \
_(SimdBinaryCompIx4) \


@@ -55,9 +55,13 @@ ABIArgGenerator::next(MIRType type)
case MIRType::Double:
current_ = ABIArg(FloatArgRegs[regIndex_++]);
break;
-case MIRType::Bool32x4:
+case MIRType::Int8x16:
+case MIRType::Int16x8:
case MIRType::Int32x4:
case MIRType::Float32x4:
+case MIRType::Bool8x16:
+case MIRType::Bool16x8:
+case MIRType::Bool32x4:
// On Win64, >64 bit args need to be passed by reference, but asm.js
// doesn't allow passing SIMD values to FFIs. The only way to reach
// here is asm to asm calls, so we can break the ABI here.
@@ -91,9 +95,13 @@ ABIArgGenerator::next(MIRType type)
else
current_ = ABIArg(FloatArgRegs[floatRegIndex_++]);
break;
-case MIRType::Bool32x4:
+case MIRType::Int8x16:
+case MIRType::Int16x8:
case MIRType::Int32x4:
case MIRType::Float32x4:
+case MIRType::Bool8x16:
+case MIRType::Bool16x8:
+case MIRType::Bool32x4:
if (floatRegIndex_ == NumFloatArgRegs) {
stackOffset_ = AlignBytes(stackOffset_, SimdMemoryAlignment);
current_ = ABIArg(stackOffset_);


@@ -2439,6 +2439,70 @@ class AssemblerX86Shared : public AssemblerShared
MOZ_CRASH("unexpected operand kind");
}
}
void vpaddb(const Operand& src1, FloatRegister src0, FloatRegister dest) {
MOZ_ASSERT(HasSSE2());
switch (src1.kind()) {
case Operand::FPREG:
masm.vpaddb_rr(src1.fpu(), src0.encoding(), dest.encoding());
break;
case Operand::MEM_REG_DISP:
masm.vpaddb_mr(src1.disp(), src1.base(), src0.encoding(), dest.encoding());
break;
case Operand::MEM_ADDRESS32:
masm.vpaddb_mr(src1.address(), src0.encoding(), dest.encoding());
break;
default:
MOZ_CRASH("unexpected operand kind");
}
}
void vpsubb(const Operand& src1, FloatRegister src0, FloatRegister dest) {
MOZ_ASSERT(HasSSE2());
switch (src1.kind()) {
case Operand::FPREG:
masm.vpsubb_rr(src1.fpu(), src0.encoding(), dest.encoding());
break;
case Operand::MEM_REG_DISP:
masm.vpsubb_mr(src1.disp(), src1.base(), src0.encoding(), dest.encoding());
break;
case Operand::MEM_ADDRESS32:
masm.vpsubb_mr(src1.address(), src0.encoding(), dest.encoding());
break;
default:
MOZ_CRASH("unexpected operand kind");
}
}
void vpaddw(const Operand& src1, FloatRegister src0, FloatRegister dest) {
MOZ_ASSERT(HasSSE2());
switch (src1.kind()) {
case Operand::FPREG:
masm.vpaddw_rr(src1.fpu(), src0.encoding(), dest.encoding());
break;
case Operand::MEM_REG_DISP:
masm.vpaddw_mr(src1.disp(), src1.base(), src0.encoding(), dest.encoding());
break;
case Operand::MEM_ADDRESS32:
masm.vpaddw_mr(src1.address(), src0.encoding(), dest.encoding());
break;
default:
MOZ_CRASH("unexpected operand kind");
}
}
void vpsubw(const Operand& src1, FloatRegister src0, FloatRegister dest) {
MOZ_ASSERT(HasSSE2());
switch (src1.kind()) {
case Operand::FPREG:
masm.vpsubw_rr(src1.fpu(), src0.encoding(), dest.encoding());
break;
case Operand::MEM_REG_DISP:
masm.vpsubw_mr(src1.disp(), src1.base(), src0.encoding(), dest.encoding());
break;
case Operand::MEM_ADDRESS32:
masm.vpsubw_mr(src1.address(), src0.encoding(), dest.encoding());
break;
default:
MOZ_CRASH("unexpected operand kind");
}
}
void vpaddd(const Operand& src1, FloatRegister src0, FloatRegister dest) {
MOZ_ASSERT(HasSSE2());
switch (src1.kind()) {

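The three Operand kinds handled by these wrappers correspond to the usual call shapes; for illustration (hypothetical FloatRegister/Register names, not code from this patch):

    masm.vpaddb(Operand(rhs), lhs, lhs);          // xmm source: lhs = lhs + rhs, per byte lane
    masm.vpaddb(Operand(base, disp), lhs, lhs);   // memory source at [base + disp]

Passing src0 and dest separately lets the same wrapper serve both the two-operand legacy SSE encoding (where dest must equal src0) and the three-operand VEX/AVX encoding.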

@@ -583,6 +583,32 @@ public:
m_formatter.twoByteOp(OP2_XADD_EvGv, offset, base, index, scale, srcdest);
}
void vpaddb_rr(XMMRegisterID src1, XMMRegisterID src0, XMMRegisterID dst)
{
twoByteOpSimd("vpaddb", VEX_PD, OP2_PADDB_VdqWdq, src1, src0, dst);
}
void vpaddb_mr(int32_t offset, RegisterID base, XMMRegisterID src0, XMMRegisterID dst)
{
twoByteOpSimd("vpaddb", VEX_PD, OP2_PADDB_VdqWdq, offset, base, src0, dst);
}
void vpaddb_mr(const void* address, XMMRegisterID src0, XMMRegisterID dst)
{
twoByteOpSimd("vpaddb", VEX_PD, OP2_PADDB_VdqWdq, address, src0, dst);
}
void vpaddw_rr(XMMRegisterID src1, XMMRegisterID src0, XMMRegisterID dst)
{
twoByteOpSimd("vpaddw", VEX_PD, OP2_PADDW_VdqWdq, src1, src0, dst);
}
void vpaddw_mr(int32_t offset, RegisterID base, XMMRegisterID src0, XMMRegisterID dst)
{
twoByteOpSimd("vpaddw", VEX_PD, OP2_PADDW_VdqWdq, offset, base, src0, dst);
}
void vpaddw_mr(const void* address, XMMRegisterID src0, XMMRegisterID dst)
{
twoByteOpSimd("vpaddw", VEX_PD, OP2_PADDW_VdqWdq, address, src0, dst);
}
void vpaddd_rr(XMMRegisterID src1, XMMRegisterID src0, XMMRegisterID dst)
{
twoByteOpSimd("vpaddd", VEX_PD, OP2_PADDD_VdqWdq, src1, src0, dst);
@@ -596,6 +622,32 @@ public:
twoByteOpSimd("vpaddd", VEX_PD, OP2_PADDD_VdqWdq, address, src0, dst);
}
void vpsubb_rr(XMMRegisterID src1, XMMRegisterID src0, XMMRegisterID dst)
{
twoByteOpSimd("vpsubb", VEX_PD, OP2_PSUBB_VdqWdq, src1, src0, dst);
}
void vpsubb_mr(int32_t offset, RegisterID base, XMMRegisterID src0, XMMRegisterID dst)
{
twoByteOpSimd("vpsubb", VEX_PD, OP2_PSUBB_VdqWdq, offset, base, src0, dst);
}
void vpsubb_mr(const void* address, XMMRegisterID src0, XMMRegisterID dst)
{
twoByteOpSimd("vpsubb", VEX_PD, OP2_PSUBB_VdqWdq, address, src0, dst);
}
void vpsubw_rr(XMMRegisterID src1, XMMRegisterID src0, XMMRegisterID dst)
{
twoByteOpSimd("vpsubw", VEX_PD, OP2_PSUBW_VdqWdq, src1, src0, dst);
}
void vpsubw_mr(int32_t offset, RegisterID base, XMMRegisterID src0, XMMRegisterID dst)
{
twoByteOpSimd("vpsubw", VEX_PD, OP2_PSUBW_VdqWdq, offset, base, src0, dst);
}
void vpsubw_mr(const void* address, XMMRegisterID src0, XMMRegisterID dst)
{
twoByteOpSimd("vpsubw", VEX_PD, OP2_PSUBW_VdqWdq, address, src0, dst);
}
void vpsubd_rr(XMMRegisterID src1, XMMRegisterID src0, XMMRegisterID dst)
{
twoByteOpSimd("vpsubd", VEX_PD, OP2_PSUBD_VdqWdq, src1, src0, dst);


@@ -2533,14 +2533,14 @@ CodeGeneratorX86Shared::visitFloat32x4ToUint32x4(LFloat32x4ToUint32x4* ins)
// We can identify A-lanes by the sign bits in A: Any A-lanes will be
// positive in A, and N, B, and V-lanes will be 0x80000000 in A. Compute a
// mask of non-A-lanes into |tempF|.
-masm.zeroFloat32x4(tempF);
+masm.zeroSimd128Float(tempF);
masm.packedGreaterThanInt32x4(Operand(out), tempF);
// Clear the A-lanes in B.
-masm.bitwiseAndX4(Operand(tempF), scratch);
+masm.bitwiseAndSimd128(Operand(tempF), scratch);
// Compute the final result: A for A-lanes, A|B for B-lanes.
-masm.bitwiseOrX4(Operand(scratch), out);
+masm.bitwiseOrSimd128(Operand(scratch), out);
// We still need to filter out the V-lanes. They would show up as 0x80000000
// in both A and B. Since we cleared the valid A-lanes in B, the V-lanes are
@@ -3315,7 +3315,7 @@ CodeGeneratorX86Shared::visitSimdBinaryCompIx4(LSimdBinaryCompIx4* ins)
// if that's what it's used in.
masm.loadConstantSimd128Int(allOnes, scratch);
masm.packedEqualInt32x4(rhs, lhs);
-masm.bitwiseXorX4(Operand(scratch), lhs);
+masm.bitwiseXorSimd128(Operand(scratch), lhs);
return;
case MSimdBinaryComp::greaterThanOrEqual:
// src := rhs
@@ -3325,13 +3325,13 @@ CodeGeneratorX86Shared::visitSimdBinaryCompIx4(LSimdBinaryCompIx4* ins)
masm.loadAlignedSimd128Int(rhs, scratch);
masm.packedGreaterThanInt32x4(ToOperand(ins->lhs()), scratch);
masm.loadConstantSimd128Int(allOnes, lhs);
-masm.bitwiseXorX4(Operand(scratch), lhs);
+masm.bitwiseXorSimd128(Operand(scratch), lhs);
return;
case MSimdBinaryComp::lessThanOrEqual:
// lhs <= rhs is equivalent to !(rhs < lhs), which we compute here.
masm.loadConstantSimd128Int(allOnes, scratch);
masm.packedGreaterThanInt32x4(rhs, lhs);
-masm.bitwiseXorX4(Operand(scratch), lhs);
+masm.bitwiseXorSimd128(Operand(scratch), lhs);
return;
}
MOZ_CRASH("unexpected SIMD op");
@@ -3534,6 +3534,58 @@ CodeGeneratorX86Shared::visitSimdBinaryArithFx4(LSimdBinaryArithFx4* ins)
MOZ_CRASH("unexpected SIMD op");
}
void
CodeGeneratorX86Shared::visitSimdUnaryArithIx16(LSimdUnaryArithIx16* ins)
{
Operand in = ToOperand(ins->input());
FloatRegister out = ToFloatRegister(ins->output());
static const SimdConstant allOnes = SimdConstant::SplatX16(-1);
switch (ins->operation()) {
case MSimdUnaryArith::neg:
masm.zeroSimd128Int(out);
masm.packedSubInt8(in, out);
return;
case MSimdUnaryArith::not_:
masm.loadConstantSimd128Int(allOnes, out);
masm.bitwiseXorSimd128(in, out);
return;
case MSimdUnaryArith::abs:
case MSimdUnaryArith::reciprocalApproximation:
case MSimdUnaryArith::reciprocalSqrtApproximation:
case MSimdUnaryArith::sqrt:
break;
}
MOZ_CRASH("unexpected SIMD op");
}
void
CodeGeneratorX86Shared::visitSimdUnaryArithIx8(LSimdUnaryArithIx8* ins)
{
Operand in = ToOperand(ins->input());
FloatRegister out = ToFloatRegister(ins->output());
static const SimdConstant allOnes = SimdConstant::SplatX8(-1);
switch (ins->operation()) {
case MSimdUnaryArith::neg:
masm.zeroSimd128Int(out);
masm.packedSubInt16(in, out);
return;
case MSimdUnaryArith::not_:
masm.loadConstantSimd128Int(allOnes, out);
masm.bitwiseXorSimd128(in, out);
return;
case MSimdUnaryArith::abs:
case MSimdUnaryArith::reciprocalApproximation:
case MSimdUnaryArith::reciprocalSqrtApproximation:
case MSimdUnaryArith::sqrt:
break;
}
MOZ_CRASH("unexpected SIMD op");
}
void
CodeGeneratorX86Shared::visitSimdUnaryArithIx4(LSimdUnaryArithIx4* ins)
{
@@ -3544,12 +3596,12 @@ CodeGeneratorX86Shared::visitSimdUnaryArithIx4(LSimdUnaryArithIx4* ins)
switch (ins->operation()) {
case MSimdUnaryArith::neg:
-masm.zeroInt32x4(out);
+masm.zeroSimd128Int(out);
masm.packedSubInt32(in, out);
return;
case MSimdUnaryArith::not_:
masm.loadConstantSimd128Int(allOnes, out);
-masm.bitwiseXorX4(in, out);
+masm.bitwiseXorSimd128(in, out);
return;
case MSimdUnaryArith::abs:
case MSimdUnaryArith::reciprocalApproximation:
@@ -3580,15 +3632,15 @@ CodeGeneratorX86Shared::visitSimdUnaryArithFx4(LSimdUnaryArithFx4* ins)
switch (ins->operation()) {
case MSimdUnaryArith::abs:
masm.loadConstantSimd128Float(signMasks, out);
-masm.bitwiseAndX4(in, out);
+masm.bitwiseAndSimd128(in, out);
return;
case MSimdUnaryArith::neg:
masm.loadConstantSimd128Float(minusZero, out);
-masm.bitwiseXorX4(in, out);
+masm.bitwiseXorSimd128(in, out);
return;
case MSimdUnaryArith::not_:
masm.loadConstantSimd128Float(allOnes, out);
-masm.bitwiseXorX4(in, out);
+masm.bitwiseXorSimd128(in, out);
return;
case MSimdUnaryArith::reciprocalApproximation:
masm.packedRcpApproximationFloat32x4(in, out);
@@ -3709,9 +3761,9 @@ CodeGeneratorX86Shared::visitSimdSelect(LSimdSelect* ins)
if (!mir->mask()->isSimdBinaryComp())
masm.packedRightShiftByScalar(Imm32(31), temp);
-masm.bitwiseAndX4(Operand(temp), output);
-masm.bitwiseAndNotX4(Operand(onFalse), temp);
-masm.bitwiseOrX4(Operand(temp), output);
+masm.bitwiseAndSimd128(Operand(temp), output);
+masm.bitwiseAndNotSimd128(Operand(onFalse), temp);
+masm.bitwiseOrSimd128(Operand(temp), output);
}
void

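A note on the two new visitors above: SSE2 has no packed integer negate or NOT instruction, so — exactly as in the existing Int32x4 case — 'neg' is materialized as 0 - x with a packed subtract, and 'not_' as XOR with an all-ones constant. An equivalent formulation with SSE2 intrinsics, as an illustration only (not patch code):

    #include <emmintrin.h>

    // neg for Int8x16: psubb from a zeroed register computes 0 - x per lane.
    __m128i neg_i8x16(__m128i x) {
        return _mm_sub_epi8(_mm_setzero_si128(), x);
    }

    // not for Int8x16: pxor with an all-ones mask complements every bit.
    __m128i not_i8x16(__m128i x) {
        return _mm_xor_si128(x, _mm_set1_epi8(-1));
    }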

@@ -307,6 +307,8 @@ class CodeGeneratorX86Shared : public CodeGeneratorShared
void visitSimdSwizzleI(LSimdSwizzleI* lir);
void visitSimdSwizzleF(LSimdSwizzleF* lir);
void visitSimdShuffle(LSimdShuffle* lir);
void visitSimdUnaryArithIx16(LSimdUnaryArithIx16* lir);
void visitSimdUnaryArithIx8(LSimdUnaryArithIx8* lir);
void visitSimdUnaryArithIx4(LSimdUnaryArithIx4* lir);
void visitSimdUnaryArithFx4(LSimdUnaryArithFx4* lir);
void visitSimdBinaryCompIx4(LSimdBinaryCompIx4* lir);


@@ -259,7 +259,11 @@ enum TwoByteOpcodeID {
OP2_PXORDQ_VdqWdq = 0xEF,
OP2_PSLLD_VdqWdq = 0xF2,
OP2_PMULUDQ_VdqWdq = 0xF4,
OP2_PSUBB_VdqWdq = 0xF8,
OP2_PSUBW_VdqWdq = 0xF9,
OP2_PSUBD_VdqWdq = 0xFA,
OP2_PADDB_VdqWdq = 0xFC,
OP2_PADDW_VdqWdq = 0xFD,
OP2_PADDD_VdqWdq = 0xFE
};

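For reference, these enum values are the standard SSE2 opcode-map bytes; combined with the mandatory 66 prefix (which VEX_PD selects in the assembler wrappers above), they decode as follows (per the Intel SDM opcode map; listed for orientation, not part of the patch):

    66 0F F8 /r   psubb xmm, xmm/m128   (OP2_PSUBB_VdqWdq = 0xF8)
    66 0F F9 /r   psubw xmm, xmm/m128   (OP2_PSUBW_VdqWdq = 0xF9)
    66 0F FA /r   psubd xmm, xmm/m128   (OP2_PSUBD_VdqWdq = 0xFA)
    66 0F FC /r   paddb xmm, xmm/m128   (OP2_PADDB_VdqWdq = 0xFC)
    66 0F FD /r   paddw xmm, xmm/m128   (OP2_PADDW_VdqWdq = 0xFD)
    66 0F FE /r   paddd xmm, xmm/m128   (OP2_PADDD_VdqWdq = 0xFE)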

@@ -928,9 +928,9 @@ MacroAssembler::canonicalizeFloat32x4(FloatRegister reg, FloatRegister scratch)
float nanf = float(JS::GenericNaN());
loadConstantSimd128Float(SimdConstant::SplatX4(nanf), ifFalse);
-bitwiseAndX4(Operand(mask), reg);
-bitwiseAndNotX4(Operand(ifFalse), mask);
-bitwiseOrX4(Operand(mask), reg);
+bitwiseAndSimd128(Operand(mask), reg);
+bitwiseAndNotSimd128(Operand(ifFalse), mask);
+bitwiseOrSimd128(Operand(mask), reg);
}
// ========================================================================


@@ -804,24 +804,24 @@ class MacroAssemblerX86Shared : public Assembler
vcvtdq2ps(src, dest);
}
-void bitwiseAndX4(const Operand& src, FloatRegister dest) {
+void bitwiseAndSimd128(const Operand& src, FloatRegister dest) {
// TODO Using the "ps" variant for all types incurs a domain crossing
// penalty for integer types and double.
vandps(src, dest, dest);
}
-void bitwiseAndNotX4(const Operand& src, FloatRegister dest) {
+void bitwiseAndNotSimd128(const Operand& src, FloatRegister dest) {
vandnps(src, dest, dest);
}
-void bitwiseOrX4(const Operand& src, FloatRegister dest) {
+void bitwiseOrSimd128(const Operand& src, FloatRegister dest) {
vorps(src, dest, dest);
}
-void bitwiseXorX4(const Operand& src, FloatRegister dest) {
+void bitwiseXorSimd128(const Operand& src, FloatRegister dest) {
vxorps(src, dest, dest);
}
-void zeroFloat32x4(FloatRegister dest) {
+void zeroSimd128Float(FloatRegister dest) {
vxorps(dest, dest, dest);
}
-void zeroInt32x4(FloatRegister dest) {
+void zeroSimd128Int(FloatRegister dest) {
vpxor(dest, dest, dest);
}
@@ -939,6 +939,18 @@ class MacroAssemblerX86Shared : public Assembler
void packedGreaterThanInt32x4(const Operand& src, FloatRegister dest) {
vpcmpgtd(src, dest, dest);
}
void packedAddInt8(const Operand& src, FloatRegister dest) {
vpaddb(src, dest, dest);
}
void packedSubInt8(const Operand& src, FloatRegister dest) {
vpsubb(src, dest, dest);
}
void packedAddInt16(const Operand& src, FloatRegister dest) {
vpaddw(src, dest, dest);
}
void packedSubInt16(const Operand& src, FloatRegister dest) {
vpsubw(src, dest, dest);
}
void packedAddInt32(const Operand& src, FloatRegister dest) {
vpaddd(src, dest, dest);
}
@@ -1197,7 +1209,7 @@ class MacroAssemblerX86Shared : public Assembler
static const SimdConstant zero = SimdConstant::SplatX4(0);
static const SimdConstant minusOne = SimdConstant::SplatX4(-1);
if (v == zero) {
-zeroInt32x4(dest);
+zeroSimd128Int(dest);
return true;
}
if (v == minusOne) {
@@ -1211,7 +1223,7 @@ class MacroAssemblerX86Shared : public Assembler
if (v == zero) {
// This won't get inlined if the SimdConstant v contains -0 in any
// lane, as operator== here does a memcmp.
-zeroFloat32x4(dest);
+zeroSimd128Float(dest);
return true;
}
return false;

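Worth noting in the renamed helpers above: zeroSimd128Float zeroes with vxorps while zeroSimd128Int uses vpxor, keeping each self-xor zeroing idiom in its matching execution domain (the TODO on bitwiseAndSimd128 records that the bitwise helpers don't yet make that distinction). The code generator composes these with the new packed ops; a minimal sketch mirroring visitSimdUnaryArithIx16 above, with hypothetical register names:

    // neg for Int8x16: out = 0 - in, lane-wise.
    masm.zeroSimd128Int(out);      // vpxor out, out, out
    masm.packedSubInt8(in, out);   // vpsubb: out = out - in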

@@ -30,8 +30,12 @@ ABIArgGenerator::next(MIRType type)
current_ = ABIArg(stackOffset_);
stackOffset_ += sizeof(uint64_t);
break;
case MIRType::Int8x16:
case MIRType::Int16x8:
case MIRType::Int32x4:
case MIRType::Float32x4:
case MIRType::Bool8x16:
case MIRType::Bool16x8:
case MIRType::Bool32x4:
// SIMD values aren't passed in or out of C++, so we can make up
// whatever internal ABI we like. visitAsmJSPassArg assumes