mirror of
https://github.com/mozilla/gecko-dev.git
synced 2024-10-07 18:04:46 +00:00
Bug 1690492 - Use PBLENDVB for blend operations. r=lth
Differential Revision: https://phabricator.services.mozilla.com/D113582
This commit is contained in:
parent
d85b55ac8e
commit
7a3361b3fd
@ -22,3 +22,12 @@ codegenTestX64_adhoc(
|
||||
66 0f df d9 pandn %xmm1, %xmm3
|
||||
66 0f eb c3 por %xmm3, %xmm0`);
|
||||
|
||||
// Blend constant optimizations.
//
// The control operand of v128.bitselect below is a constant in which every
// 32-bit lane is either all ones or all zeroes. The expected output is a
// single PBLENDW with immediate 0xC3 rather than an and/andn/or sequence.
|
||||
|
||||
codegenTestX64_adhoc(
|
||||
`(module
|
||||
(func (export "f") (param v128) (param v128) (param v128) (result v128)
|
||||
(v128.bitselect (local.get 0) (local.get 1) (v128.const i32x4 -1 0 0 -1))))`,
|
||||
'f',
|
||||
`66 0f 3a 0e c1 c3 pblendw \\$0xC3, %xmm1, %xmm0`);
|
||||
|
||||
|
@ -822,6 +822,16 @@ for ( let [ty128,size] of [['i8x16',1], ['i16x8',2], ['i32x4',4]] ) {
|
||||
assertEq(wasmSimdAnalysis(), "shuffle -> permute 32x4");
|
||||
}
|
||||
|
||||
// Bitselect with constant mask folded into shuffle operation.
//
// Every byte of the constant control mask is 0 or -1, so the SIMD analysis is
// expected to report the bitselect as a byte-wise blend shuffle.
// NOTE(review): skipped when isArm64 is set, presumably because the fold is
// only implemented for x86 -- confirm.
|
||||
|
||||
if (!isArm64) {
|
||||
wasmCompile(`
|
||||
(module (func (param v128) (param v128) (result v128)
|
||||
(v128.bitselect (local.get 0) (local.get 1) (v128.const i8x16 0 -1 -1 0 0 0 0 0 -1 -1 -1 -1 -1 -1 0 0))))
|
||||
`);
|
||||
assertEq(wasmSimdAnalysis(), "shuffle -> blend 8x16");
|
||||
}
|
||||
|
||||
// Library
|
||||
|
||||
function wasmCompile(text) {
|
||||
|
@ -76,3 +76,15 @@ codegenTestX64_v128xLITERAL_v128(
|
||||
66 0f ef c9 pxor %xmm1, %xmm1
|
||||
66 0f 73 d8 03 psrldq \\$0x03, %xmm0`]]);
|
||||
|
||||
// The SSE4.1 PBLENDVB instruction uses XMM0 as an implicit operand. Check that
|
||||
// a byte-wise blend shuffle of two arguments that do not live in XMM0 is
// generated as expected: the blend constant is loaded into %xmm0, the blend is
// done in %xmm1, and the result is moved back to %xmm0.
|
||||
codegenTestX64_adhoc(
|
||||
`(func (export "f") (param v128 v128 v128 v128) (result v128)
|
||||
(i8x16.shuffle 0 17 2 3 4 5 6 7 24 25 26 11 12 13 30 15
|
||||
(local.get 2)(local.get 3)))`,
|
||||
'f',
|
||||
`
|
||||
66 0f 6f ca movdqa %xmm2, %xmm1
|
||||
66 0f 6f 05 ${RIPRADDR} movdqax ${RIPR}, %xmm0
|
||||
66 0f 38 10 cb pblendvb %xmm3, %xmm1
|
||||
66 0f 6f c1 movdqa %xmm1, %xmm0`);
|
||||
|
@ -4450,6 +4450,17 @@ bool MWasmLoadGlobalCell::congruentTo(const MDefinition* ins) const {
|
||||
}
|
||||
|
||||
#ifdef ENABLE_WASM_SIMD
|
||||
// Try to replace a bitselect whose control operand is a constant with an
// equivalent shuffle of lhs() and rhs().
//
// Returns a freshly allocated MWasmShuffleSimd128 when
// specializeConstantMaskAsShuffle() accepts the mask, and `this` (no folding)
// in every other case.
MDefinition* MWasmBitselectSimd128::foldsTo(TempAllocator& alloc) {
  // Only a constant control mask can be inspected at compile time.
  if (control()->op() != MDefinition::Opcode::WasmFloatConstant) {
    return this;
  }

  // Byte-lane selectors filled in by the platform hook on success.
  int8_t lanes[16];
  if (!specializeConstantMaskAsShuffle(lanes)) {
    return this;
  }

  return MWasmShuffleSimd128::New(alloc, lhs(), rhs(),
                                  SimdConstant::CreateX16(lanes));
}
|
||||
|
||||
MDefinition* MWasmBinarySimd128::foldsTo(TempAllocator& alloc) {
|
||||
if (simdOp() == wasm::SimdOp::V8x16Swizzle && rhs()->isWasmFloatConstant()) {
|
||||
// Specialize swizzle(v, constant) as shuffle(mask, v, zero) to trigger all
|
||||
|
@ -13430,6 +13430,14 @@ class MWasmBitselectSimd128 : public MTernaryInstruction,
|
||||
bool congruentTo(const MDefinition* ins) const override {
|
||||
return congruentIfOperandsEqual(ins);
|
||||
}
|
||||
#ifdef ENABLE_WASM_SIMD
|
||||
// Folds a bitselect with a constant control operand into a shuffle when
// specializeConstantMaskAsShuffle() reports that this is possible; otherwise
// the instruction is left as is.
MDefinition* foldsTo(TempAllocator& alloc) override;
|
||||
|
||||
// If the control mask allows the operation to be specialized as a shuffle
|
||||
// and it is profitable to specialize it on this platform, return true and
|
||||
// store the appropriate shuffle mask in `shuffle`.
|
||||
bool specializeConstantMaskAsShuffle(int8_t shuffle[16]);
|
||||
#endif
|
||||
|
||||
ALLOW_CLONE(MWasmBitselectSimd128)
|
||||
};
|
||||
|
@ -1173,6 +1173,13 @@ void LIRGenerator::visitWasmBinarySimd128(MWasmBinarySimd128* ins) {
|
||||
MOZ_CRASH("binary SIMD NYI");
|
||||
}
|
||||
|
||||
#ifdef ENABLE_WASM_SIMD
|
||||
// Stub: no constant-mask bitselect specialization is provided in this
// lowering. Always reports failure without writing to `shuffle`, so
// MWasmBitselectSimd128::foldsTo() keeps the bitselect unchanged.
bool MWasmBitselectSimd128::specializeConstantMaskAsShuffle(
    int8_t shuffle[16]) {
  return false;
}
|
||||
#endif
|
||||
|
||||
bool MWasmBinarySimd128::specializeForConstantRhs() {
|
||||
// Probably many we want to do here
|
||||
return false;
|
||||
|
@ -956,6 +956,13 @@ void LIRGenerator::visitWasmBinarySimd128(MWasmBinarySimd128* ins) {
|
||||
#endif
|
||||
}
|
||||
|
||||
#ifdef ENABLE_WASM_SIMD
|
||||
// Stub: no constant-mask bitselect specialization is provided in this
// lowering. Always reports failure without writing to `shuffle`, so
// MWasmBitselectSimd128::foldsTo() keeps the bitselect unchanged.
bool MWasmBitselectSimd128::specializeConstantMaskAsShuffle(
    int8_t shuffle[16]) {
  return false;
}
|
||||
#endif
|
||||
|
||||
bool MWasmBinarySimd128::specializeForConstantRhs() {
|
||||
// Probably many we want to do here
|
||||
return false;
|
||||
|
@ -904,6 +904,13 @@ void LIRGenerator::visitWasmBinarySimd128(MWasmBinarySimd128* ins) {
|
||||
MOZ_CRASH("binary SIMD NYI");
|
||||
}
|
||||
|
||||
#ifdef ENABLE_WASM_SIMD
|
||||
// Stub: no constant-mask bitselect specialization is provided in this
// lowering. Always reports failure without writing to `shuffle`, so
// MWasmBitselectSimd128::foldsTo() keeps the bitselect unchanged.
bool MWasmBitselectSimd128::specializeConstantMaskAsShuffle(
    int8_t shuffle[16]) {
  return false;
}
|
||||
#endif
|
||||
|
||||
bool MWasmBinarySimd128::specializeForConstantRhs() {
|
||||
// Probably many we want to do here
|
||||
return false;
|
||||
|
@ -2214,6 +2214,15 @@ class AssemblerX86Shared : public AssemblerShared {
|
||||
masm.vpblendw_irr(mask, src1.encoding(), src0.encoding(), dest.encoding());
|
||||
}
|
||||
|
||||
// Emit PBLENDVB (requires SSE4.1).
//
// Only the legacy two-operand form is emitted, which constrains the
// operands: `mask` must be xmm0 and `src0` must be the same register as
// `dest`. Just `src1` and `dest` are passed on to the encoder.
void vpblendvb(FloatRegister mask, FloatRegister src1, FloatRegister src0,
               FloatRegister dest) {
  MOZ_ASSERT(HasSSE41());
  // The mask register is fixed by the legacy encoding.
  MOZ_ASSERT(mask.encoding() == X86Encoding::xmm0,
             "only legacy encoding is supported");
  // The legacy encoding overwrites its first source in place.
  MOZ_ASSERT(src0.encoding() == dest.encoding(),
             "only legacy encoding is supported");
  masm.pblendvb_rr(src1.encoding(), dest.encoding());
}
|
||||
|
||||
void vpinsrb(unsigned lane, const Operand& src1, FloatRegister src0,
|
||||
FloatRegister dest) {
|
||||
MOZ_ASSERT(HasSSE41());
|
||||
|
@ -3775,6 +3775,13 @@ class BaseAssembler : public GenericAssembler {
|
||||
mask, src1, src0, dst);
|
||||
}
|
||||
|
||||
// Encode the register-register legacy form of PBLENDVB: the VEX_PD legacy
// prefix followed by opcode OP3_PBLENDVB_VdqWdq in the ESCAPE_38 map, with
// `other` and `dst` as the two register operands. The encoding carries no
// explicit mask operand.
void pblendvb_rr(XMMRegisterID other, XMMRegisterID dst) {
  spew("%-11s%s, %s", "pblendvb", XMMRegName(other), XMMRegName(dst));
  m_formatter.legacySSEPrefix(VEX_PD);
  m_formatter.threeByteOp(OP3_PBLENDVB_VdqWdq, ESCAPE_38,
                          static_cast<RegisterID>(other), dst);
}
|
||||
|
||||
void vpinsrb_irr(unsigned lane, RegisterID src1, XMMRegisterID src0,
|
||||
XMMRegisterID dst) {
|
||||
MOZ_ASSERT(lane < 16);
|
||||
|
@ -352,6 +352,7 @@ enum ThreeByteOpcodeID {
|
||||
OP3_BLENDPS_VpsWpsIb = 0x0C,
|
||||
OP3_PBLENDW_VdqWdqIb = 0x0E,
|
||||
OP3_PALIGNR_VdqWdqIb = 0x0F,
|
||||
OP3_PBLENDVB_VdqWdq = 0x10,
|
||||
OP3_BLENDVPS_VdqWdq = 0x14,
|
||||
OP3_PEXTRB_EvVdqIb = 0x14,
|
||||
OP3_PEXTRW_EwVdqIb = 0x15,
|
||||
|
@ -947,6 +947,27 @@ void LIRGenerator::visitWasmBinarySimd128(MWasmBinarySimd128* ins) {
|
||||
#endif
|
||||
}
|
||||
|
||||
#ifdef ENABLE_WASM_SIMD
|
||||
bool MWasmBitselectSimd128::specializeConstantMaskAsShuffle(
|
||||
int8_t shuffle[16]) {
|
||||
// Optimization when control vector is a mask with all 0 or all 1 per lane.
|
||||
// On x86, there is no bitselect, blend operations will be a win,
|
||||
// e.g. via PBLENDVB or PBLENDW.
|
||||
MWasmFloatConstant* constant = static_cast<MWasmFloatConstant*>(control());
|
||||
const auto& bytes = constant->toSimd128().asInt8x16();
|
||||
for (int8_t i = 0; i < 16; i++) {
|
||||
if (bytes[i] == -1) {
|
||||
shuffle[i] = i + 16;
|
||||
} else if (bytes[i] == 0) {
|
||||
shuffle[i] = i;
|
||||
} else {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
#endif
|
||||
|
||||
bool MWasmBinarySimd128::specializeForConstantRhs() {
|
||||
// The order follows MacroAssembler.h, generally
|
||||
switch (simdOp()) {
|
||||
@ -1218,7 +1239,7 @@ void LIRGenerator::visitWasmShuffleSimd128(MWasmShuffleSimd128* ins) {
|
||||
LDefinition temp = LDefinition::BogusTemp();
|
||||
switch (*s.shuffleOp) {
|
||||
case LWasmShuffleSimd128::BLEND_8x16:
|
||||
temp = tempSimd128();
|
||||
temp = tempFixed(xmm0);
|
||||
break;
|
||||
default:
|
||||
break;
|
||||
|
@ -614,3 +614,14 @@ void MacroAssemblerX86Shared::maxNumFloat32x4(FloatRegister lhs, Operand rhs,
|
||||
vorps(Operand(mask), output, output);
|
||||
}
|
||||
}
|
||||
|
||||
// Select between `onTrue` and `onFalse` under control of `mask`, writing the
// result to `output`.
//
// When AVX is available a single vblendvps is emitted and `temp` is not
// used. Otherwise the work is delegated to selectSimd128(), which receives
// `temp` as well.
void MacroAssemblerX86Shared::selectX4(FloatRegister mask, FloatRegister onTrue,
                                       FloatRegister onFalse,
                                       FloatRegister temp,
                                       FloatRegister output) {
  if (!AssemblerX86Shared::HasAVX()) {
    // No AVX: use the generic select sequence.
    selectSimd128(mask, onTrue, onFalse, temp, output);
    return;
  }
  vblendvps(mask, onTrue, onFalse, output);
}
|
||||
|
@ -158,25 +158,11 @@ void MacroAssemblerX86Shared::blendInt8x16(FloatRegister lhs, FloatRegister rhs,
|
||||
FloatRegister temp,
|
||||
const uint8_t lanes[16]) {
|
||||
MOZ_ASSERT(lhs == output);
|
||||
MOZ_ASSERT(lhs == rhs || !temp.isInvalid());
|
||||
MOZ_ASSERT(temp.encoding() == X86Encoding::xmm0, "pblendvb needs xmm0");
|
||||
|
||||
// TODO: Consider whether PBLENDVB would not be better, even if it is variable
|
||||
// and requires xmm0 to be free and the loading of a mask.
|
||||
|
||||
// Set scratch = lanes to select from lhs.
|
||||
int8_t mask[16];
|
||||
for (unsigned i = 0; i < 16; i++) {
|
||||
mask[i] = ~lanes[i];
|
||||
}
|
||||
ScratchSimd128Scope scratch(asMasm());
|
||||
asMasm().loadConstantSimd128Int(SimdConstant::CreateX16(mask), scratch);
|
||||
if (lhs == rhs) {
|
||||
asMasm().moveSimd128Int(rhs, temp);
|
||||
rhs = temp;
|
||||
}
|
||||
vpand(Operand(scratch), lhs, lhs);
|
||||
vpandn(Operand(rhs), scratch, scratch);
|
||||
vpor(scratch, lhs, lhs);
|
||||
asMasm().loadConstantSimd128Int(
|
||||
SimdConstant::CreateX16(reinterpret_cast<const int8_t*>(lanes)), temp);
|
||||
vpblendvb(temp, rhs, lhs, output);
|
||||
}
|
||||
|
||||
void MacroAssemblerX86Shared::blendInt16x8(FloatRegister lhs, FloatRegister rhs,
|
||||
@ -1095,9 +1081,6 @@ void MacroAssemblerX86Shared::selectSimd128(FloatRegister mask,
|
||||
asMasm().moveSimd128Int(onTrue, output);
|
||||
asMasm().moveSimd128Int(mask, temp);
|
||||
|
||||
// SSE4.1 has plain blendvps which can do this, but it is awkward
|
||||
// to use because it requires the mask to be in xmm0.
|
||||
|
||||
vpand(Operand(temp), output, output);
|
||||
vpandn(Operand(onFalse), temp, temp);
|
||||
vpor(Operand(temp), output, output);
|
||||
|
@ -799,14 +799,7 @@ class MacroAssemblerX86Shared : public Assembler {
|
||||
vshufps(mask, src, dest, dest);
|
||||
}
|
||||
void selectX4(FloatRegister mask, FloatRegister onTrue, FloatRegister onFalse,
|
||||
FloatRegister temp, FloatRegister output) {
|
||||
if (AssemblerX86Shared::HasAVX()) {
|
||||
vblendvps(mask, onTrue, onFalse, output);
|
||||
} else {
|
||||
selectSimd128(mask, onTrue, onFalse, temp, output);
|
||||
}
|
||||
}
|
||||
|
||||
FloatRegister temp, FloatRegister output);
|
||||
// End unused SIMD.
|
||||
|
||||
void moveFloatAsDouble(Register src, FloatRegister dest) {
|
||||
|
Loading…
Reference in New Issue
Block a user