Bug 1690492 - Use PBLENDVB for blend operations. r=lth

Differential Revision: https://phabricator.services.mozilla.com/D113582
Yury Delendik 2021-05-27 22:04:57 +00:00
parent d85b55ac8e
commit 7a3361b3fd
15 changed files with 126 additions and 30 deletions

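For context: wasm's v128.bitselect(v1, v2, c) computes (v1 & c) | (v2 & ~c) bit by bit. When the control c is a constant whose bytes are each all-zeros or all-ones, the bit-level select degenerates into a byte-level select, which is what x86's PBLENDVB performs in a single instruction. A minimal scalar model of the operation (an illustrative sketch, not SpiderMonkey code):

#include <cstdint>

// Reference semantics of v128.bitselect: each result bit comes from v1
// where the control bit is 1 and from v2 where it is 0.
void bitselect(uint8_t out[16], const uint8_t v1[16], const uint8_t v2[16],
               const uint8_t c[16]) {
  for (int i = 0; i < 16; i++) {
    out[i] = uint8_t((v1[i] & c[i]) | (v2[i] & ~c[i]));
  }
}
// When every c[i] is 0x00 or 0xFF this is a per-byte blend of v1 and v2.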

@@ -22,3 +22,12 @@ codegenTestX64_adhoc(
66 0f df d9 pandn %xmm1, %xmm3
66 0f eb c3 por %xmm3, %xmm0`);
// Blend constant optimizations
codegenTestX64_adhoc(
`(module
(func (export "f") (param v128) (param v128) (param v128) (result v128)
(v128.bitselect (local.get 0) (local.get 1) (v128.const i32x4 -1 0 0 -1))))`,
'f',
`66 0f 3a 0e c1 c3 pblendw \\$0xC3, %xmm1, %xmm0`);

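How the 0xC3 immediate falls out: PBLENDW selects whole 16-bit words, each set bit of the immediate taking the corresponding word from the source operand. The i32x4 constant -1 0 0 -1 covers words 0, 1, 6 and 7, i.e. 0b11000011 = 0xC3. A hypothetical helper (mine, not the routine the compiler uses) that derives the immediate from the 16 mask bytes:

#include <cstdint>

// Turn a byte mask whose bytes are all 0x00 or 0xFF into a PBLENDW
// immediate; return -1 when the mask is not expressible as a word blend.
int pblendwImmediate(const uint8_t mask[16]) {
  int imm = 0;
  for (int w = 0; w < 8; w++) {
    uint8_t lo = mask[2 * w];
    if (lo != mask[2 * w + 1]) {
      return -1;  // both bytes of a word must agree
    }
    if (lo == 0xFF) {
      imm |= 1 << w;  // take this word from the blend source
    } else if (lo != 0x00) {
      return -1;  // not a lane-wise 0/-1 mask
    }
  }
  return imm;  // i32x4 -1 0 0 -1: words 0,1,6,7 -> 0xC3
}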

@@ -822,6 +822,16 @@ for ( let [ty128,size] of [['i8x16',1], ['i16x8',2], ['i32x4',4]] ) {
assertEq(wasmSimdAnalysis(), "shuffle -> permute 32x4");
}
// Bitselect with constant mask folded into shuffle operation
if (!isArm64) {
wasmCompile(`
(module (func (param v128) (param v128) (result v128)
(v128.bitselect (local.get 0) (local.get 1) (v128.const i8x16 0 -1 -1 0 0 0 0 0 -1 -1 -1 -1 -1 -1 0 0))))
`);
assertEq(wasmSimdAnalysis(), "shuffle -> blend 8x16");
}
// Library
function wasmCompile(text) {

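wasmSimdAnalysis() is a shell testing hook that reports how Ion handled the most recent compilation. The expected string "shuffle -> blend 8x16" confirms the two-step path this patch adds: the constant-mask bitselect is first folded into a shuffle, and the shuffle analysis then recognizes that shuffle as a byte blend.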

@@ -76,3 +76,15 @@ codegenTestX64_v128xLITERAL_v128(
66 0f ef c9 pxor %xmm1, %xmm1
66 0f 73 d8 03 psrldq \\$0x03, %xmm0`]]);
// The SSE4.1 PBLENDVB instruction uses XMM0 implicitly; check that the
// blend operation is generated as expected.
codegenTestX64_adhoc(
`(func (export "f") (param v128 v128 v128 v128) (result v128)
(i8x16.shuffle 0 17 2 3 4 5 6 7 24 25 26 11 12 13 30 15
(local.get 2)(local.get 3)))`,
'f',
`
66 0f 6f ca movdqa %xmm2, %xmm1
66 0f 6f 05 ${RIPRADDR} movdqax ${RIPR}, %xmm0
66 0f 38 10 cb pblendvb %xmm3, %xmm1
66 0f 6f c1 movdqa %xmm1, %xmm0`);

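The register traffic in the expected code exists because the legacy (non-VEX) PBLENDVB encoding reads its mask implicitly from XMM0 and overwrites its destination: the mask constant is loaded RIP-relative into xmm0, the first blend input (xmm2) is copied to xmm1 so it can be clobbered, and the blended result is moved into xmm0 for the return. A scalar model of the instruction (illustrative):

#include <cstdint>

// Legacy PBLENDVB: byte i of dst is replaced by byte i of src when the
// top bit of byte i of the implicit mask register (XMM0) is set.
void pblendvbModel(uint8_t dst[16], const uint8_t src[16],
                   const uint8_t xmm0[16]) {
  for (int i = 0; i < 16; i++) {
    if (xmm0[i] & 0x80) {
      dst[i] = src[i];
    }
  }
}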

@@ -4450,6 +4450,17 @@ bool MWasmLoadGlobalCell::congruentTo(const MDefinition* ins) const {
}
#ifdef ENABLE_WASM_SIMD
MDefinition* MWasmBitselectSimd128::foldsTo(TempAllocator& alloc) {
if (control()->op() == MDefinition::Opcode::WasmFloatConstant) {
int8_t shuffle[16];
if (specializeConstantMaskAsShuffle(shuffle)) {
return MWasmShuffleSimd128::New(alloc, lhs(), rhs(),
SimdConstant::CreateX16(shuffle));
}
}
return this;
}
MDefinition* MWasmBinarySimd128::foldsTo(TempAllocator& alloc) {
if (simdOp() == wasm::SimdOp::V8x16Swizzle && rhs()->isWasmFloatConstant()) {
// Specialize swizzle(v, constant) as shuffle(mask, v, zero) to trigger all


@@ -13430,6 +13430,14 @@ class MWasmBitselectSimd128 : public MTernaryInstruction,
bool congruentTo(const MDefinition* ins) const override {
return congruentIfOperandsEqual(ins);
}
#ifdef ENABLE_WASM_SIMD
MDefinition* foldsTo(TempAllocator& alloc) override;
// If the control mask allows the operation to be specialized as a shuffle
// and it is profitable to specialize it on this platform, return true and
// the appropriate shuffle mask.
bool specializeConstantMaskAsShuffle(int8_t shuffle[16]);
#endif
ALLOW_CLONE(MWasmBitselectSimd128)
};


@@ -1173,6 +1173,13 @@ void LIRGenerator::visitWasmBinarySimd128(MWasmBinarySimd128* ins) {
MOZ_CRASH("binary SIMD NYI");
}
#ifdef ENABLE_WASM_SIMD
bool MWasmBitselectSimd128::specializeConstantMaskAsShuffle(
int8_t shuffle[16]) {
return false;
}
#endif
bool MWasmBinarySimd128::specializeForConstantRhs() {
// There are probably many operations we want to specialize here.
return false;

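This stub, like the identical ones in the next two files, belongs to a backend that does not implement the constant-mask specialization (presumably the ARM/ARM64 and dummy ports): returning false leaves the ordinary bitselect lowering in place, which is why the jit-test above is guarded by if (!isArm64).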

@@ -956,6 +956,13 @@ void LIRGenerator::visitWasmBinarySimd128(MWasmBinarySimd128* ins) {
#endif
}
#ifdef ENABLE_WASM_SIMD
bool MWasmBitselectSimd128::specializeConstantMaskAsShuffle(
int8_t shuffle[16]) {
return false;
}
#endif
bool MWasmBinarySimd128::specializeForConstantRhs() {
// There are probably many operations we want to specialize here.
return false;


@@ -904,6 +904,13 @@ void LIRGenerator::visitWasmBinarySimd128(MWasmBinarySimd128* ins) {
MOZ_CRASH("binary SIMD NYI");
}
#ifdef ENABLE_WASM_SIMD
bool MWasmBitselectSimd128::specializeConstantMaskAsShuffle(
int8_t shuffle[16]) {
return false;
}
#endif
bool MWasmBinarySimd128::specializeForConstantRhs() {
// There are probably many operations we want to specialize here.
return false;


@@ -2214,6 +2214,15 @@ class AssemblerX86Shared : public AssemblerShared {
masm.vpblendw_irr(mask, src1.encoding(), src0.encoding(), dest.encoding());
}
void vpblendvb(FloatRegister mask, FloatRegister src1, FloatRegister src0,
FloatRegister dest) {
MOZ_ASSERT(HasSSE41());
MOZ_ASSERT(mask.encoding() == X86Encoding::xmm0 &&
src0.encoding() == dest.encoding(),
"only legacy encoding is supported");
masm.pblendvb_rr(src1.encoding(), dest.encoding());
}
void vpinsrb(unsigned lane, const Operand& src1, FloatRegister src0,
FloatRegister dest) {
MOZ_ASSERT(HasSSE41());


@@ -3775,6 +3775,13 @@ class BaseAssembler : public GenericAssembler {
mask, src1, src0, dst);
}
void pblendvb_rr(XMMRegisterID other, XMMRegisterID dst) {
spew("%-11s%s, %s", "pblendvb", XMMRegName(other), XMMRegName(dst));
m_formatter.legacySSEPrefix(VEX_PD);
m_formatter.threeByteOp(OP3_PBLENDVB_VdqWdq, ESCAPE_38, (RegisterID)other,
dst);
}
void vpinsrb_irr(unsigned lane, RegisterID src1, XMMRegisterID src0,
XMMRegisterID dst) {
MOZ_ASSERT(lane < 16);

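pblendvb_rr emits the legacy encoding 66 0F 38 10 /r: operand-size prefix, 0F 38 escape, opcode 0x10, then a ModRM byte; the mask never appears because it is implicit in XMM0. A quick sanity check (illustrative) against the bytes the codegen test above expects:

#include <cstdint>
#include <cstdio>

// Register-to-register ModRM byte: mod = 11, reg = dst, rm = src.
uint8_t modrm(unsigned reg, unsigned rm) {
  return uint8_t(0xC0 | (reg << 3) | rm);
}

int main() {
  // pblendvb %xmm3, %xmm1 (AT&T): dst is xmm1 (reg = 1), src is xmm3 (rm = 3).
  printf("66 0f 38 10 %02x\n", modrm(1, 3));  // prints: 66 0f 38 10 cb
  return 0;
}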

@@ -352,6 +352,7 @@ enum ThreeByteOpcodeID {
OP3_BLENDPS_VpsWpsIb = 0x0C,
OP3_PBLENDW_VdqWdqIb = 0x0E,
OP3_PALIGNR_VdqWdqIb = 0x0F,
OP3_PBLENDVB_VdqWdq = 0x10,
OP3_BLENDVPS_VdqWdq = 0x14,
OP3_PEXTRB_EvVdqIb = 0x14,
OP3_PEXTRW_EwVdqIb = 0x15,

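The repeated value just below the new entry is intentional: OP3_BLENDVPS_VdqWdq and OP3_PEXTRB_EvVdqIb can both be 0x14 because three-byte opcodes are disambiguated by their escape bytes (0F 38 for BLENDVPS, 0F 3A for PEXTRB), so the new OP3_PBLENDVB_VdqWdq = 0x10 only needs to be unique under ESCAPE_38.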

@@ -947,6 +947,27 @@ void LIRGenerator::visitWasmBinarySimd128(MWasmBinarySimd128* ins) {
#endif
}
#ifdef ENABLE_WASM_SIMD
bool MWasmBitselectSimd128::specializeConstantMaskAsShuffle(
int8_t shuffle[16]) {
// Optimize the case where the control vector is a mask with all 0s or
// all 1s per lane: x86 has no bitselect instruction, so a blend
// operation such as PBLENDVB or PBLENDW is a win.
MWasmFloatConstant* constant = static_cast<MWasmFloatConstant*>(control());
const auto& bytes = constant->toSimd128().asInt8x16();
for (int8_t i = 0; i < 16; i++) {
if (bytes[i] == -1) {
shuffle[i] = i + 16;
} else if (bytes[i] == 0) {
shuffle[i] = i;
} else {
return false;
}
}
return true;
}
#endif
bool MWasmBinarySimd128::specializeForConstantRhs() {
// The order generally follows MacroAssembler.h
switch (simdOp()) {
@@ -1218,7 +1239,7 @@ void LIRGenerator::visitWasmShuffleSimd128(MWasmShuffleSimd128* ins) {
LDefinition temp = LDefinition::BogusTemp();
switch (*s.shuffleOp) {
case LWasmShuffleSimd128::BLEND_8x16:
temp = tempSimd128();
temp = tempFixed(xmm0);
break;
default:
break;

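The mapping in specializeConstantMaskAsShuffle follows the i8x16.shuffle index convention, where indices 0-15 address lanes of the first operand and 16-31 lanes of the second: a 0x00 control byte becomes index i and a 0xFF byte becomes i + 16. The tempFixed(xmm0) change pins the BLEND_8x16 temporary to xmm0 because the legacy PBLENDVB it expands to reads its mask there. A standalone model of the mapping (names are mine), run on the mask from the jit-test earlier in this commit:

#include <cstdint>
#include <cstdio>

// Model of the mask-to-shuffle mapping: 0 -> i, -1 -> i + 16; any other
// byte value means the constant is not a lane-wise mask.
bool maskToShuffle(const int8_t bytes[16], int8_t shuffle[16]) {
  for (int i = 0; i < 16; i++) {
    if (bytes[i] == -1) {
      shuffle[i] = int8_t(i + 16);
    } else if (bytes[i] == 0) {
      shuffle[i] = int8_t(i);
    } else {
      return false;
    }
  }
  return true;
}

int main() {
  const int8_t mask[16] = {0, -1, -1, 0, 0, 0, 0, 0,
                           -1, -1, -1, -1, -1, -1, 0, 0};
  int8_t shuffle[16];
  if (maskToShuffle(mask, shuffle)) {
    for (int i = 0; i < 16; i++) {
      printf("%d ", shuffle[i]);  // 0 17 18 3 4 5 6 7 24 25 26 27 28 29 14 15
    }
  }
  return 0;
}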

@@ -614,3 +614,14 @@ void MacroAssemblerX86Shared::maxNumFloat32x4(FloatRegister lhs, Operand rhs,
vorps(Operand(mask), output, output);
}
}
void MacroAssemblerX86Shared::selectX4(FloatRegister mask, FloatRegister onTrue,
FloatRegister onFalse,
FloatRegister temp,
FloatRegister output) {
if (AssemblerX86Shared::HasAVX()) {
vblendvps(mask, onTrue, onFalse, output);
} else {
selectSimd128(mask, onTrue, onFalse, temp, output);
}
}

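The AVX branch can use vblendvps directly because VEX-encoded blends name the mask as an explicit extra register operand; only the legacy SSE4.1 encodings pin the mask to xmm0 the way the new vpblendvb wrapper does.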

@@ -158,25 +158,11 @@ void MacroAssemblerX86Shared::blendInt8x16(FloatRegister lhs, FloatRegister rhs,
FloatRegister temp,
const uint8_t lanes[16]) {
MOZ_ASSERT(lhs == output);
MOZ_ASSERT(lhs == rhs || !temp.isInvalid());
MOZ_ASSERT(temp.encoding() == X86Encoding::xmm0, "pblendvb needs xmm0");
// TODO: Consider whether PBLENDVB would not be better, even if it is variable
// and requires xmm0 to be free and the loading of a mask.
// Set scratch = lanes to select from lhs.
int8_t mask[16];
for (unsigned i = 0; i < 16; i++) {
mask[i] = ~lanes[i];
}
ScratchSimd128Scope scratch(asMasm());
asMasm().loadConstantSimd128Int(SimdConstant::CreateX16(mask), scratch);
if (lhs == rhs) {
asMasm().moveSimd128Int(rhs, temp);
rhs = temp;
}
vpand(Operand(scratch), lhs, lhs);
vpandn(Operand(rhs), scratch, scratch);
vpor(scratch, lhs, lhs);
asMasm().loadConstantSimd128Int(
SimdConstant::CreateX16(reinterpret_cast<const int8_t*>(lanes)), temp);
vpblendvb(temp, rhs, lhs, output);
}
void MacroAssemblerX86Shared::blendInt16x8(FloatRegister lhs, FloatRegister rhs,
@@ -1095,9 +1081,6 @@ void MacroAssemblerX86Shared::selectSimd128(FloatRegister mask,
asMasm().moveSimd128Int(onTrue, output);
asMasm().moveSimd128Int(mask, temp);
// SSE4.1 has plain blendvps which can do this, but it is awkward
// to use because it requires the mask to be in xmm0.
vpand(Operand(temp), output, output);
vpandn(Operand(onFalse), temp, temp);
vpor(Operand(temp), output, output);

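Net effect of the blendInt8x16 rewrite: the old sequence of negating the mask, loading it, and combining with pand/pandn/por (five or six instructions plus a scratch register) collapses into one constant load and a single PBLENDVB, at the cost of fixing the temp register to xmm0, which the lowering change earlier in this commit provides.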

@ -799,14 +799,7 @@ class MacroAssemblerX86Shared : public Assembler {
vshufps(mask, src, dest, dest);
}
void selectX4(FloatRegister mask, FloatRegister onTrue, FloatRegister onFalse,
FloatRegister temp, FloatRegister output) {
if (AssemblerX86Shared::HasAVX()) {
vblendvps(mask, onTrue, onFalse, output);
} else {
selectSimd128(mask, onTrue, onFalse, temp, output);
}
}
FloatRegister temp, FloatRegister output);
// End unused SIMD.
void moveFloatAsDouble(Register src, FloatRegister dest) {