mirror of
https://github.com/FEX-Emu/FEX.git
synced 2025-03-05 04:57:12 +00:00
commit
58f3d3caf5
@ -713,64 +713,58 @@ DEF_OP(PDep) {
|
||||
LOGMAN_THROW_AA_FMT(OpSize == 4 || OpSize == 8, "Unsupported {} size: {}", __func__, OpSize);
|
||||
const auto EmitSize = OpSize == 8 ? ARMEmitter::Size::i64Bit : ARMEmitter::Size::i32Bit;
|
||||
|
||||
const auto Input = GetReg(Op->Input.ID());
|
||||
const auto Mask = GetReg(Op->Mask.ID());
|
||||
const auto Dest = GetReg(Node);
|
||||
|
||||
const auto ShiftedBitReg = TMP1.R();
|
||||
const auto BitReg = TMP2.R();
|
||||
const auto SubMaskReg = TMP3.R();
|
||||
const auto IndexReg = TMP4.R();
|
||||
const auto ZeroReg = ARMEmitter::Reg::zr;
|
||||
// PDep implementation follows the ideas from
|
||||
// http://0x80.pl/articles/pdep-soft-emu.html ... Basically, iterate the *set*
|
||||
// bits only, which will be faster than the naive implementation as long as
|
||||
// there are enough holes in the mask.
|
||||
//
|
||||
// The specific arm64 assembly used is based on the sequence that clang
|
||||
// generates for the C code, giving context to the scheduling yielding better
|
||||
// ILP than I would do by hand. The registers are allocated by hand however,
|
||||
// to fit within the tight constraints we have here without spilling. Also, we
|
||||
// use cbz/cbnz for conditional branching to avoid clobbering NZCV.
|
||||
|
||||
const auto InputReg = StaticRegisters[0];
|
||||
const auto MaskReg = StaticRegisters[1];
|
||||
const auto DestReg = StaticRegisters[2];
|
||||
// We can't clobber these
|
||||
const auto OrigInput = GetReg(Op->Input.ID());
|
||||
const auto OrigMask = GetReg(Op->Mask.ID());
|
||||
|
||||
const auto SpillCode = 1U << InputReg.Idx() |
|
||||
1U << MaskReg.Idx() |
|
||||
1U << DestReg.Idx();
|
||||
// So we keep shadow copies of them in temporaries
|
||||
const auto Input = TMP1.R();
|
||||
const auto Mask = TMP2.R();
|
||||
|
||||
// these get used variously as scratch
|
||||
const auto T0 = TMP3.R();
|
||||
const auto T1 = TMP4.R();
|
||||
|
||||
ARMEmitter::SingleUseForwardLabel EarlyExit;
|
||||
ARMEmitter::BackwardLabel NextBit;
|
||||
ARMEmitter::SingleUseForwardLabel Done;
|
||||
cbz(EmitSize, Mask, &EarlyExit);
|
||||
mov(EmitSize, IndexReg, ZeroReg);
|
||||
|
||||
// We sadly need to spill regs for this for the time being
|
||||
// TODO: Remove when scratch registers can be allocated
|
||||
// explicitly.
|
||||
SpillStaticRegs(TMP1, false, SpillCode);
|
||||
// First, copy the input/mask, since we'll be clobbering. Copy as 64-bit to
|
||||
// make this 0-uop on Firestorm.
|
||||
mov(ARMEmitter::Size::i64Bit, Input, OrigInput);
|
||||
mov(ARMEmitter::Size::i64Bit, Mask, OrigMask);
|
||||
|
||||
// Now, they're copied, so we can start setting Dest (even if it overlaps with
|
||||
// one of them). Handle early exit case
|
||||
mov(EmitSize, Dest, 0);
|
||||
cbz(EmitSize, OrigMask, &Done);
|
||||
|
||||
mov(EmitSize, InputReg, Input);
|
||||
mov(EmitSize, MaskReg, Mask);
|
||||
mov(EmitSize, DestReg, ZeroReg);
|
||||
// Setup for first iteration
|
||||
neg(EmitSize, T0, Mask);
|
||||
and_(EmitSize, T0, T0, Mask);
|
||||
|
||||
// Main loop
|
||||
Bind(&NextBit);
|
||||
rbit(EmitSize, ShiftedBitReg, MaskReg);
|
||||
clz(EmitSize, ShiftedBitReg, ShiftedBitReg);
|
||||
lsrv(EmitSize, BitReg, InputReg, IndexReg);
|
||||
and_(EmitSize, BitReg, BitReg, 1);
|
||||
sub(EmitSize, SubMaskReg, MaskReg, 1);
|
||||
add(EmitSize, IndexReg, IndexReg, 1);
|
||||
ands(EmitSize, MaskReg, MaskReg, SubMaskReg);
|
||||
lslv(EmitSize, ShiftedBitReg, BitReg, ShiftedBitReg);
|
||||
orr(EmitSize, DestReg, DestReg, ShiftedBitReg);
|
||||
b(ARMEmitter::Condition::CC_NE, &NextBit);
|
||||
// Store result in a temp so it doesn't get clobbered,
|
||||
// and restore it after the re-fill below.
|
||||
mov(EmitSize, IndexReg, DestReg);
|
||||
// Restore our registers before leaving
|
||||
// TODO: Also remove along with above TODO.
|
||||
FillStaticRegs(false, SpillCode);
|
||||
mov(EmitSize, Dest, IndexReg);
|
||||
b(&Done);
|
||||
|
||||
// Early exit
|
||||
Bind(&EarlyExit);
|
||||
mov(EmitSize, Dest, ZeroReg);
|
||||
sbfx(EmitSize, T1, Input, 0, 1);
|
||||
eor(EmitSize, Mask, Mask, T0);
|
||||
and_(EmitSize, T0, T1, T0);
|
||||
neg(EmitSize, T1, Mask);
|
||||
orr(EmitSize, Dest, Dest, T0);
|
||||
lsr(EmitSize, Input, Input, 1);
|
||||
and_(EmitSize, T0, Mask, T1);
|
||||
cbnz(EmitSize, T0, &NextBit);
|
||||
|
||||
// All done with nothing to do.
|
||||
Bind(&Done);
|
||||
|
@ -2307,8 +2307,9 @@ void OpDispatchBuilder::MULX(OpcodeArgs) {
|
||||
}
|
||||
|
||||
void OpDispatchBuilder::PDEP(OpcodeArgs) {
|
||||
auto* Input = LoadSource(GPRClass, Op, Op->Src[0], Op->Flags);
|
||||
auto* Mask = LoadSource(GPRClass, Op, Op->Src[1], Op->Flags);
|
||||
LOGMAN_THROW_A_FMT(Op->InstSize >= 4, "No masking needed");
|
||||
auto* Input = LoadSource(GPRClass, Op, Op->Src[0], Op->Flags, {.AllowUpperGarbage = true});
|
||||
auto* Mask = LoadSource(GPRClass, Op, Op->Src[1], Op->Flags, {.AllowUpperGarbage = true});
|
||||
auto Result = _PDep(OpSizeFromSrc(Op), Input, Mask);
|
||||
|
||||
StoreResult(GPRClass, Op, Op->Dest, Result, -1);
|
||||
|
@ -472,75 +472,47 @@
|
||||
]
|
||||
},
|
||||
"pdep eax, ebx, ecx": {
|
||||
"ExpectedInstructionCount": 29,
|
||||
"ExpectedInstructionCount": 14,
|
||||
"Comment": [
|
||||
"Map 2 0b11 0xf5 32-bit"
|
||||
],
|
||||
"ExpectedArm64ASM": [
|
||||
"mov w20, w7",
|
||||
"mov w21, w5",
|
||||
"cbz w21, #+0x68",
|
||||
"mov w3, wzr",
|
||||
"mrs x0, nzcv",
|
||||
"str w0, [x28, #728]",
|
||||
"stp x4, x5, [x28, #8]",
|
||||
"str x6, [x28, #24]",
|
||||
"mov w4, w20",
|
||||
"mov w5, w21",
|
||||
"mov w6, wzr",
|
||||
"rbit w0, w5",
|
||||
"clz w0, w0",
|
||||
"lsr w1, w4, w3",
|
||||
"and w1, w1, #0x1",
|
||||
"sub w2, w5, #0x1 (1)",
|
||||
"add w3, w3, #0x1 (1)",
|
||||
"ands w5, w5, w2",
|
||||
"lsl w0, w1, w0",
|
||||
"orr w6, w6, w0",
|
||||
"b.ne #-0x24",
|
||||
"mov w3, w6",
|
||||
"ldr w4, [x28, #728]",
|
||||
"msr nzcv, x4",
|
||||
"ldp x4, x5, [x28, #8]",
|
||||
"ldr x6, [x28, #24]",
|
||||
"mov w4, w3",
|
||||
"b #+0x8",
|
||||
"mov w4, wzr"
|
||||
"mov x0, x7",
|
||||
"mov x1, x5",
|
||||
"mov w4, #0x0",
|
||||
"cbz w5, #+0x2c",
|
||||
"neg w2, w1",
|
||||
"and w2, w2, w1",
|
||||
"sbfx w3, w0, #0, #1",
|
||||
"eor w1, w1, w2",
|
||||
"and w2, w3, w2",
|
||||
"neg w3, w1",
|
||||
"orr w4, w4, w2",
|
||||
"lsr w0, w0, #1",
|
||||
"and w2, w1, w3",
|
||||
"cbnz w2, #-0x1c"
|
||||
]
|
||||
},
|
||||
"pdep rax, rbx, rcx": {
|
||||
"ExpectedInstructionCount": 27,
|
||||
"ExpectedInstructionCount": 14,
|
||||
"Comment": [
|
||||
"Map 2 0b11 0xf5 64-bit"
|
||||
],
|
||||
"ExpectedArm64ASM": [
|
||||
"cbz x5, #+0x68",
|
||||
"mov x3, xzr",
|
||||
"mrs x0, nzcv",
|
||||
"str w0, [x28, #728]",
|
||||
"stp x4, x5, [x28, #8]",
|
||||
"str x6, [x28, #24]",
|
||||
"mov x4, x7",
|
||||
"mov x5, x5",
|
||||
"mov x6, xzr",
|
||||
"rbit x0, x5",
|
||||
"clz x0, x0",
|
||||
"lsr x1, x4, x3",
|
||||
"and x1, x1, #0x1",
|
||||
"sub x2, x5, #0x1 (1)",
|
||||
"add x3, x3, #0x1 (1)",
|
||||
"ands x5, x5, x2",
|
||||
"lsl x0, x1, x0",
|
||||
"orr x6, x6, x0",
|
||||
"b.ne #-0x24",
|
||||
"mov x3, x6",
|
||||
"ldr w4, [x28, #728]",
|
||||
"msr nzcv, x4",
|
||||
"ldp x4, x5, [x28, #8]",
|
||||
"ldr x6, [x28, #24]",
|
||||
"mov x4, x3",
|
||||
"b #+0x8",
|
||||
"mov x4, xzr"
|
||||
"mov x0, x7",
|
||||
"mov x1, x5",
|
||||
"mov x4, #0x0",
|
||||
"cbz x5, #+0x2c",
|
||||
"neg x2, x1",
|
||||
"and x2, x2, x1",
|
||||
"sbfx x3, x0, #0, #1",
|
||||
"eor x1, x1, x2",
|
||||
"and x2, x3, x2",
|
||||
"neg x3, x1",
|
||||
"orr x4, x4, x2",
|
||||
"lsr x0, x0, #1",
|
||||
"and x2, x1, x3",
|
||||
"cbnz x2, #-0x1c"
|
||||
]
|
||||
},
|
||||
"bextr eax, ebx, ecx": {
|
||||
|
@ -3329,75 +3329,47 @@
|
||||
]
|
||||
},
|
||||
"pdep eax, ebx, ecx": {
|
||||
"ExpectedInstructionCount": 29,
|
||||
"ExpectedInstructionCount": 14,
|
||||
"Comment": [
|
||||
"Map 2 0b11 0xf5 32-bit"
|
||||
],
|
||||
"ExpectedArm64ASM": [
|
||||
"mov w20, w7",
|
||||
"mov w21, w5",
|
||||
"cbz w21, #+0x68",
|
||||
"mov w3, wzr",
|
||||
"mrs x0, nzcv",
|
||||
"str w0, [x28, #728]",
|
||||
"stp x4, x5, [x28, #8]",
|
||||
"str x6, [x28, #24]",
|
||||
"mov w4, w20",
|
||||
"mov w5, w21",
|
||||
"mov w6, wzr",
|
||||
"rbit w0, w5",
|
||||
"clz w0, w0",
|
||||
"lsr w1, w4, w3",
|
||||
"and w1, w1, #0x1",
|
||||
"sub w2, w5, #0x1 (1)",
|
||||
"add w3, w3, #0x1 (1)",
|
||||
"ands w5, w5, w2",
|
||||
"lsl w0, w1, w0",
|
||||
"orr w6, w6, w0",
|
||||
"b.ne #-0x24",
|
||||
"mov w3, w6",
|
||||
"ldr w4, [x28, #728]",
|
||||
"msr nzcv, x4",
|
||||
"ldp x4, x5, [x28, #8]",
|
||||
"ldr x6, [x28, #24]",
|
||||
"mov w4, w3",
|
||||
"b #+0x8",
|
||||
"mov w4, wzr"
|
||||
"mov x0, x7",
|
||||
"mov x1, x5",
|
||||
"mov w4, #0x0",
|
||||
"cbz w5, #+0x2c",
|
||||
"neg w2, w1",
|
||||
"and w2, w2, w1",
|
||||
"sbfx w3, w0, #0, #1",
|
||||
"eor w1, w1, w2",
|
||||
"and w2, w3, w2",
|
||||
"neg w3, w1",
|
||||
"orr w4, w4, w2",
|
||||
"lsr w0, w0, #1",
|
||||
"and w2, w1, w3",
|
||||
"cbnz w2, #-0x1c"
|
||||
]
|
||||
},
|
||||
"pdep rax, rbx, rcx": {
|
||||
"ExpectedInstructionCount": 27,
|
||||
"ExpectedInstructionCount": 14,
|
||||
"Comment": [
|
||||
"Map 2 0b11 0xf5 64-bit"
|
||||
],
|
||||
"ExpectedArm64ASM": [
|
||||
"cbz x5, #+0x68",
|
||||
"mov x3, xzr",
|
||||
"mrs x0, nzcv",
|
||||
"str w0, [x28, #728]",
|
||||
"stp x4, x5, [x28, #8]",
|
||||
"str x6, [x28, #24]",
|
||||
"mov x4, x7",
|
||||
"mov x5, x5",
|
||||
"mov x6, xzr",
|
||||
"rbit x0, x5",
|
||||
"clz x0, x0",
|
||||
"lsr x1, x4, x3",
|
||||
"and x1, x1, #0x1",
|
||||
"sub x2, x5, #0x1 (1)",
|
||||
"add x3, x3, #0x1 (1)",
|
||||
"ands x5, x5, x2",
|
||||
"lsl x0, x1, x0",
|
||||
"orr x6, x6, x0",
|
||||
"b.ne #-0x24",
|
||||
"mov x3, x6",
|
||||
"ldr w4, [x28, #728]",
|
||||
"msr nzcv, x4",
|
||||
"ldp x4, x5, [x28, #8]",
|
||||
"ldr x6, [x28, #24]",
|
||||
"mov x4, x3",
|
||||
"b #+0x8",
|
||||
"mov x4, xzr"
|
||||
"mov x0, x7",
|
||||
"mov x1, x5",
|
||||
"mov x4, #0x0",
|
||||
"cbz x5, #+0x2c",
|
||||
"neg x2, x1",
|
||||
"and x2, x2, x1",
|
||||
"sbfx x3, x0, #0, #1",
|
||||
"eor x1, x1, x2",
|
||||
"and x2, x3, x2",
|
||||
"neg x3, x1",
|
||||
"orr x4, x4, x2",
|
||||
"lsr x0, x0, #1",
|
||||
"and x2, x1, x3",
|
||||
"cbnz x2, #-0x1c"
|
||||
]
|
||||
},
|
||||
"mulx eax, ebx, ecx": {
|
||||
|
Loading…
x
Reference in New Issue
Block a user