Merge pull request #3380 from alyssarosenzweig/opt/pdep

Optimize PDEP
This commit is contained in:
Mai 2024-01-29 13:27:15 -05:00 committed by GitHub
commit 58f3d3caf5
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
4 changed files with 102 additions and 163 deletions

View File

@ -713,64 +713,58 @@ DEF_OP(PDep) {
LOGMAN_THROW_AA_FMT(OpSize == 4 || OpSize == 8, "Unsupported {} size: {}", __func__, OpSize);
const auto EmitSize = OpSize == 8 ? ARMEmitter::Size::i64Bit : ARMEmitter::Size::i32Bit;
const auto Input = GetReg(Op->Input.ID());
const auto Mask = GetReg(Op->Mask.ID());
const auto Dest = GetReg(Node);
const auto ShiftedBitReg = TMP1.R();
const auto BitReg = TMP2.R();
const auto SubMaskReg = TMP3.R();
const auto IndexReg = TMP4.R();
const auto ZeroReg = ARMEmitter::Reg::zr;
// PDep implementation follows the ideas from
// http://0x80.pl/articles/pdep-soft-emu.html ... Basically, iterate the *set*
// bits only, which will be faster than the naive implementation as long as
// there are enough holes in the mask.
//
// The specific arm64 assembly used is based on the sequence that clang
// generates for the C code, giving context to the scheduling yielding better
// ILP than I would do by hand. The registers are allocated by hand however,
// to fit within the tight constraints we have here without spilling. Also, we
// use cbz/cbnz for conditional branching to avoid clobbering NZCV.
const auto InputReg = StaticRegisters[0];
const auto MaskReg = StaticRegisters[1];
const auto DestReg = StaticRegisters[2];
// We can't clobber these
const auto OrigInput = GetReg(Op->Input.ID());
const auto OrigMask = GetReg(Op->Mask.ID());
const auto SpillCode = 1U << InputReg.Idx() |
1U << MaskReg.Idx() |
1U << DestReg.Idx();
// So we have shadow as temporaries
const auto Input = TMP1.R();
const auto Mask = TMP2.R();
// these get used variously as scratch
const auto T0 = TMP3.R();
const auto T1 = TMP4.R();
ARMEmitter::SingleUseForwardLabel EarlyExit;
ARMEmitter::BackwardLabel NextBit;
ARMEmitter::SingleUseForwardLabel Done;
cbz(EmitSize, Mask, &EarlyExit);
mov(EmitSize, IndexReg, ZeroReg);
// We sadly need to spill regs for this for the time being
// TODO: Remove when scratch registers can be allocated
// explicitly.
SpillStaticRegs(TMP1, false, SpillCode);
// First, copy the input/mask, since we'll be clobbering. Copy as 64-bit to
// make this 0-uop on Firestorm.
mov(ARMEmitter::Size::i64Bit, Input, OrigInput);
mov(ARMEmitter::Size::i64Bit, Mask, OrigMask);
// Now, they're copied, so we can start setting Dest (even if it overlaps with
// one of them). Handle early exit case
mov(EmitSize, Dest, 0);
cbz(EmitSize, OrigMask, &Done);
mov(EmitSize, InputReg, Input);
mov(EmitSize, MaskReg, Mask);
mov(EmitSize, DestReg, ZeroReg);
// Setup for first iteration
neg(EmitSize, T0, Mask);
and_(EmitSize, T0, T0, Mask);
// Main loop
Bind(&NextBit);
rbit(EmitSize, ShiftedBitReg, MaskReg);
clz(EmitSize, ShiftedBitReg, ShiftedBitReg);
lsrv(EmitSize, BitReg, InputReg, IndexReg);
and_(EmitSize, BitReg, BitReg, 1);
sub(EmitSize, SubMaskReg, MaskReg, 1);
add(EmitSize, IndexReg, IndexReg, 1);
ands(EmitSize, MaskReg, MaskReg, SubMaskReg);
lslv(EmitSize, ShiftedBitReg, BitReg, ShiftedBitReg);
orr(EmitSize, DestReg, DestReg, ShiftedBitReg);
b(ARMEmitter::Condition::CC_NE, &NextBit);
// Store result in a temp so it doesn't get clobbered.
// and restore it after the re-fill below.
mov(EmitSize, IndexReg, DestReg);
// Restore our registers before leaving
// TODO: Also remove along with above TODO.
FillStaticRegs(false, SpillCode);
mov(EmitSize, Dest, IndexReg);
b(&Done);
// Early exit
Bind(&EarlyExit);
mov(EmitSize, Dest, ZeroReg);
sbfx(EmitSize, T1, Input, 0, 1);
eor(EmitSize, Mask, Mask, T0);
and_(EmitSize, T0, T1, T0);
neg(EmitSize, T1, Mask);
orr(EmitSize, Dest, Dest, T0);
lsr(EmitSize, Input, Input, 1);
and_(EmitSize, T0, Mask, T1);
cbnz(EmitSize, T0, &NextBit);
// All done with nothing to do.
Bind(&Done);

View File

@ -2307,8 +2307,9 @@ void OpDispatchBuilder::MULX(OpcodeArgs) {
}
void OpDispatchBuilder::PDEP(OpcodeArgs) {
auto* Input = LoadSource(GPRClass, Op, Op->Src[0], Op->Flags);
auto* Mask = LoadSource(GPRClass, Op, Op->Src[1], Op->Flags);
LOGMAN_THROW_A_FMT(Op->InstSize >= 4, "No masking needed");
auto* Input = LoadSource(GPRClass, Op, Op->Src[0], Op->Flags, {.AllowUpperGarbage = true});
auto* Mask = LoadSource(GPRClass, Op, Op->Src[1], Op->Flags, {.AllowUpperGarbage = true});
auto Result = _PDep(OpSizeFromSrc(Op), Input, Mask);
StoreResult(GPRClass, Op, Op->Dest, Result, -1);

View File

@ -472,75 +472,47 @@
]
},
"pdep eax, ebx, ecx": {
"ExpectedInstructionCount": 29,
"ExpectedInstructionCount": 14,
"Comment": [
"Map 2 0b11 0xf5 32-bit"
],
"ExpectedArm64ASM": [
"mov w20, w7",
"mov w21, w5",
"cbz w21, #+0x68",
"mov w3, wzr",
"mrs x0, nzcv",
"str w0, [x28, #728]",
"stp x4, x5, [x28, #8]",
"str x6, [x28, #24]",
"mov w4, w20",
"mov w5, w21",
"mov w6, wzr",
"rbit w0, w5",
"clz w0, w0",
"lsr w1, w4, w3",
"and w1, w1, #0x1",
"sub w2, w5, #0x1 (1)",
"add w3, w3, #0x1 (1)",
"ands w5, w5, w2",
"lsl w0, w1, w0",
"orr w6, w6, w0",
"b.ne #-0x24",
"mov w3, w6",
"ldr w4, [x28, #728]",
"msr nzcv, x4",
"ldp x4, x5, [x28, #8]",
"ldr x6, [x28, #24]",
"mov w4, w3",
"b #+0x8",
"mov w4, wzr"
"mov x0, x7",
"mov x1, x5",
"mov w4, #0x0",
"cbz w5, #+0x2c",
"neg w2, w1",
"and w2, w2, w1",
"sbfx w3, w0, #0, #1",
"eor w1, w1, w2",
"and w2, w3, w2",
"neg w3, w1",
"orr w4, w4, w2",
"lsr w0, w0, #1",
"and w2, w1, w3",
"cbnz w2, #-0x1c"
]
},
"pdep rax, rbx, rcx": {
"ExpectedInstructionCount": 27,
"ExpectedInstructionCount": 14,
"Comment": [
"Map 2 0b11 0xf5 64-bit"
],
"ExpectedArm64ASM": [
"cbz x5, #+0x68",
"mov x3, xzr",
"mrs x0, nzcv",
"str w0, [x28, #728]",
"stp x4, x5, [x28, #8]",
"str x6, [x28, #24]",
"mov x4, x7",
"mov x5, x5",
"mov x6, xzr",
"rbit x0, x5",
"clz x0, x0",
"lsr x1, x4, x3",
"and x1, x1, #0x1",
"sub x2, x5, #0x1 (1)",
"add x3, x3, #0x1 (1)",
"ands x5, x5, x2",
"lsl x0, x1, x0",
"orr x6, x6, x0",
"b.ne #-0x24",
"mov x3, x6",
"ldr w4, [x28, #728]",
"msr nzcv, x4",
"ldp x4, x5, [x28, #8]",
"ldr x6, [x28, #24]",
"mov x4, x3",
"b #+0x8",
"mov x4, xzr"
"mov x0, x7",
"mov x1, x5",
"mov x4, #0x0",
"cbz x5, #+0x2c",
"neg x2, x1",
"and x2, x2, x1",
"sbfx x3, x0, #0, #1",
"eor x1, x1, x2",
"and x2, x3, x2",
"neg x3, x1",
"orr x4, x4, x2",
"lsr x0, x0, #1",
"and x2, x1, x3",
"cbnz x2, #-0x1c"
]
},
"bextr eax, ebx, ecx": {

View File

@ -3329,75 +3329,47 @@
]
},
"pdep eax, ebx, ecx": {
"ExpectedInstructionCount": 29,
"ExpectedInstructionCount": 14,
"Comment": [
"Map 2 0b11 0xf5 32-bit"
],
"ExpectedArm64ASM": [
"mov w20, w7",
"mov w21, w5",
"cbz w21, #+0x68",
"mov w3, wzr",
"mrs x0, nzcv",
"str w0, [x28, #728]",
"stp x4, x5, [x28, #8]",
"str x6, [x28, #24]",
"mov w4, w20",
"mov w5, w21",
"mov w6, wzr",
"rbit w0, w5",
"clz w0, w0",
"lsr w1, w4, w3",
"and w1, w1, #0x1",
"sub w2, w5, #0x1 (1)",
"add w3, w3, #0x1 (1)",
"ands w5, w5, w2",
"lsl w0, w1, w0",
"orr w6, w6, w0",
"b.ne #-0x24",
"mov w3, w6",
"ldr w4, [x28, #728]",
"msr nzcv, x4",
"ldp x4, x5, [x28, #8]",
"ldr x6, [x28, #24]",
"mov w4, w3",
"b #+0x8",
"mov w4, wzr"
"mov x0, x7",
"mov x1, x5",
"mov w4, #0x0",
"cbz w5, #+0x2c",
"neg w2, w1",
"and w2, w2, w1",
"sbfx w3, w0, #0, #1",
"eor w1, w1, w2",
"and w2, w3, w2",
"neg w3, w1",
"orr w4, w4, w2",
"lsr w0, w0, #1",
"and w2, w1, w3",
"cbnz w2, #-0x1c"
]
},
"pdep rax, rbx, rcx": {
"ExpectedInstructionCount": 27,
"ExpectedInstructionCount": 14,
"Comment": [
"Map 2 0b11 0xf5 64-bit"
],
"ExpectedArm64ASM": [
"cbz x5, #+0x68",
"mov x3, xzr",
"mrs x0, nzcv",
"str w0, [x28, #728]",
"stp x4, x5, [x28, #8]",
"str x6, [x28, #24]",
"mov x4, x7",
"mov x5, x5",
"mov x6, xzr",
"rbit x0, x5",
"clz x0, x0",
"lsr x1, x4, x3",
"and x1, x1, #0x1",
"sub x2, x5, #0x1 (1)",
"add x3, x3, #0x1 (1)",
"ands x5, x5, x2",
"lsl x0, x1, x0",
"orr x6, x6, x0",
"b.ne #-0x24",
"mov x3, x6",
"ldr w4, [x28, #728]",
"msr nzcv, x4",
"ldp x4, x5, [x28, #8]",
"ldr x6, [x28, #24]",
"mov x4, x3",
"b #+0x8",
"mov x4, xzr"
"mov x0, x7",
"mov x1, x5",
"mov x4, #0x0",
"cbz x5, #+0x2c",
"neg x2, x1",
"and x2, x2, x1",
"sbfx x3, x0, #0, #1",
"eor x1, x1, x2",
"and x2, x3, x2",
"neg x3, x1",
"orr x4, x4, x2",
"lsr x0, x0, #1",
"and x2, x1, x3",
"cbnz x2, #-0x1c"
]
},
"mulx eax, ebx, ecx": {