Merge pull request #3530 from alyssarosenzweig/opt/cmpxchg-flags2

Optimize cmpxchg with flagm
This commit is contained in:
Alyssa Rosenzweig 2024-03-30 15:22:40 -04:00 committed by GitHub
commit 2a625a467b
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
9 changed files with 139 additions and 74 deletions

View File

@ -5,6 +5,7 @@ tags: backend|arm64
$end_info$
*/
#include "FEXCore/IR/IR.h"
#include "Interface/Context/Context.h"
#include "Interface/Core/ArchHelpers/CodeEmitter/Emitter.h"
#include "Interface/Core/ArchHelpers/CodeEmitter/Registers.h"
@ -299,6 +300,31 @@ DEF_OP(SubNZCV) {
}
}
// Emits host code for the CmpPairZ IR op: compares two register pairs and
// leaves the equality result in the Z flag, while the N, C and V flags keep
// the values they had before the op.
//
// NOTE(review): the compare width is chosen purely from IROp->Size; anything
// other than IR::i64Bit falls back to 32-bit compares. Confirm that the IR
// definition of this op actually populates the header size for the 64-bit
// element case.
DEF_OP(CmpPairZ) {
  const auto PairCmp = IROp->C<IR::IROp_CmpPairZ>();

  const uint8_t IRSize = IROp->Size;
  const auto CompareSize = (IRSize == IR::i64Bit) ? ARMEmitter::Size::i64Bit : ARMEmitter::Size::i32Bit;

  // Stash the incoming NZCV in TMP1; the compare sequence below overwrites all
  // four flags.
  mrs(TMP1, ARMEmitter::SystemRegister::NZCV);

  const auto Lhs = GetRegPair(PairCmp->Src1.ID());
  const auto Rhs = GetRegPair(PairCmp->Src2.ID());

  // Z ends up set only if both halves match: the conditional compare of the
  // second halves runs only when the first halves were equal, otherwise it
  // loads an all-clear flag value (so Z is clear).
  cmp(CompareSize, Lhs.first, Rhs.first);
  ccmp(CompareSize, Lhs.second, Rhs.second, ARMEmitter::StatusFlags::None, ARMEmitter::Condition::CC_EQ);

  if (!CTX->HostFeatures.SupportsFlagM) {
    // No FlagM: materialize the fresh Z as 0/1, splice it into bit 30 (Z) of
    // the saved flags word, then write the merged word back to NZCV.
    cset(ARMEmitter::Size::i32Bit, TMP2, ARMEmitter::Condition::CC_EQ);
    bfi(ARMEmitter::Size::i32Bit, TMP1, TMP2, 30 /* lsb: Z */, 1);
    msr(ARMEmitter::SystemRegister::NZCV, TMP1);
  } else {
    // FlagM: a single rmif rewrites only the flags selected by the mask
    // (N, C and V) from the saved value, leaving the new Z untouched.
    rmif(TMP1, 0, 0xb /* NzCV */);
  }
}
DEF_OP(CarryInvert) {
LOGMAN_THROW_A_FMT(CTX->HostFeatures.SupportsFlagM, "Unsupported flagm op");
cfinv();

View File

@ -31,11 +31,19 @@ DEF_OP(CASPair) {
mov(EmitSize, Dst.second, TMP4.R());
}
else {
// Save NZCV so we don't have to mark this op as clobbering NZCV (the
// SupportsAtomics path does not clobber NZCV, and this !SupportsAtomics path
// is so slow it's not worth the complexity of splitting the IR op). We
// clobber NZCV inside the hot loop and we can't replace cmp/ccmp/b.ne with
// something NZCV-preserving without requiring an extra instruction.
mrs(TMP1, ARMEmitter::SystemRegister::NZCV);
ARMEmitter::BackwardLabel LoopTop;
ARMEmitter::SingleUseForwardLabel LoopNotExpected;
ARMEmitter::SingleUseForwardLabel LoopExpected;
Bind(&LoopTop);
// This instruction sequence must be synced with HandleCASPAL_Armv8.
ldaxp(EmitSize, TMP2, TMP3, MemSrc);
cmp(EmitSize, TMP2, Expected.first);
ccmp(EmitSize, TMP3, Expected.second, ARMEmitter::StatusFlags::None, ARMEmitter::Condition::CC_EQ);
@ -54,6 +62,9 @@ DEF_OP(CASPair) {
// Might have hit the case where ldaxr was hit but stlxr wasn't
clrex();
Bind(&LoopExpected);
// Restore
msr(ARMEmitter::SystemRegister::NZCV, TMP1);
}
}

View File

@ -4258,16 +4258,8 @@ void OpDispatchBuilder::CMPXCHGPairOp(OpcodeArgs) {
OrderedNode *Result_Lower = _ExtractElementPair(IR::SizeToOpSize(Size), CASResult, 0);
OrderedNode *Result_Upper = _ExtractElementPair(IR::SizeToOpSize(Size), CASResult, 1);
// Set ZF if memory result was expected
auto OneConst = _Constant(1);
auto ZeroConst = _Constant(0);
OrderedNode *ZFResult = _Select(FEXCore::IR::COND_EQ,
CASResult, Expected,
OneConst, ZeroConst);
// Set ZF
SetRFLAG<FEXCore::X86State::RFLAG_ZF_RAW_LOC>(ZFResult);
HandleNZCV_RMW();
_CmpPairZ(IR::SizeToOpSize(Size), CASResult, Expected);
CalculateDeferredFlags();
auto UpdateIfNotZF = [this](auto Reg, auto Value) {

View File

@ -642,7 +642,6 @@
],
"HasDest": true,
"DestSize": "Size",
"ImplicitFlagClobber": true,
"NumElements": "2",
"EmitValidation": [
"Size == FEXCore::IR::OpSize::i64Bit || Size == FEXCore::IR::OpSize::i128Bit"
@ -1095,6 +1094,11 @@
"Size == FEXCore::IR::OpSize::i32Bit || Size == FEXCore::IR::OpSize::i64Bit"
]
},
"CmpPairZ OpSize:#Size, GPRPair:$Src1, GPRPair:$Src2": {
  "Desc": ["Compares register pairs and sets Z accordingly, preserving N/C/V.",
           "This accelerates cmpxchg."],
  "HasSideEffects": true,
  "DestSize": "Size"
},
"SubNZCV OpSize:#Size, GPR:$Src1, GPR:$Src2": {
"Desc": ["Set NZCV for the difference of two GPRs. ",
"Carry flag uses arm64 definition, inverted x86.",

View File

@ -176,6 +176,12 @@ DeadFlagCalculationEliminination::Classify(IROp_Header *IROp)
.CanEliminate = true,
};
case OP_CMPPAIRZ:
return {
.Write = FLAG_Z,
.CanEliminate = true,
};
case OP_CARRYINVERT:
return {
.Read = FLAG_C,

View File

@ -284,6 +284,38 @@
"mov x26, x5",
"cmn wzr, w26, lsl #24"
]
},
"Dead cmpxchg flags": {
"ExpectedInstructionCount": 23,
"x86Insts": [
"cmpxchg8b [rbp]",
"test rax, rax"
],
"ExpectedArm64ASM": [
"add x20, x9, #0x0 (0)",
"mov w21, w4",
"mov w22, w6",
"mov w23, w22",
"mov w22, w21",
"mov w21, w7",
"mov w24, w5",
"mov w25, w24",
"mov w24, w21",
"mov w2, w22",
"mov w3, w23",
"caspal w2, w3, w24, w25, [x20]",
"mov w20, w2",
"mov w21, w3",
"mov w24, w20",
"mov w25, w21",
"mrs x0, nzcv",
"cmp w20, w22",
"ccmp w21, w23, #nzcv, eq",
"rmif x0, #0, #NzCV",
"csel x4, x24, x4, ne",
"csel x6, x25, x6, ne",
"ands x26, x4, x4"
]
}
}
}

View File

@ -80,7 +80,7 @@
]
},
"dxvk hotblock from MGRR": {
"ExpectedInstructionCount": 42,
"ExpectedInstructionCount": 40,
"Comment": [
"Hottest block in Metal Gear Rising: Revengeance render thread"
],
@ -128,21 +128,19 @@
"mov w23, w6",
"mov w24, w21",
"mov w25, w5",
"mrs x21, nzcv",
"mov w2, w22",
"mov w3, w23",
"caspal w2, w3, w24, w25, [x20]",
"mov w24, w2",
"mov w25, w3",
"mov w20, w24",
"mov w12, w25",
"cmp x24, x22",
"ccmp x25, x23, #nzcv, eq",
"cset x22, eq",
"msr nzcv, x21",
"rmif x22, #62, #nZcv",
"csel x4, x20, x4, ne",
"csel x6, x12, x6, ne"
"mov w20, w2",
"mov w21, w3",
"mov w24, w20",
"mov w25, w21",
"mrs x0, nzcv",
"cmp w20, w22",
"ccmp w21, w23, #nzcv, eq",
"rmif x0, #0, #NzCV",
"csel x4, x24, x4, ne",
"csel x6, x25, x6, ne"
]
},
"Psychonauts matrix swizzle": {

View File

@ -644,7 +644,7 @@
]
},
"cmpxchg8b [rbp]": {
"ExpectedInstructionCount": 24,
"ExpectedInstructionCount": 22,
"Comment": "GROUP9 0x0F 0xC7 /1",
"ExpectedArm64ASM": [
"add x20, x9, #0x0 (0)",
@ -656,25 +656,23 @@
"mov w24, w5",
"mov w25, w24",
"mov w24, w21",
"mrs x21, nzcv",
"mov w2, w22",
"mov w3, w23",
"caspal w2, w3, w24, w25, [x20]",
"mov w24, w2",
"mov w25, w3",
"mov w20, w24",
"mov w30, w25",
"cmp x24, x22",
"ccmp x25, x23, #nzcv, eq",
"cset x22, eq",
"msr nzcv, x21",
"rmif x22, #62, #nZcv",
"csel x4, x20, x4, ne",
"csel x6, x30, x6, ne"
"mov w20, w2",
"mov w21, w3",
"mov w24, w20",
"mov w25, w21",
"mrs x0, nzcv",
"cmp w20, w22",
"ccmp w21, w23, #nzcv, eq",
"rmif x0, #0, #NzCV",
"csel x4, x24, x4, ne",
"csel x6, x25, x6, ne"
]
},
"cmpxchg16b [rbp]": {
"ExpectedInstructionCount": 20,
"ExpectedInstructionCount": 18,
"Comment": "GROUP9 0x0F 0xC7 /1",
"ExpectedArm64ASM": [
"add x20, x9, #0x0 (0)",
@ -682,21 +680,19 @@
"mov x23, x6",
"mov x24, x7",
"mov x25, x5",
"mrs x21, nzcv",
"mov x2, x22",
"mov x3, x23",
"caspal x2, x3, x24, x25, [x20]",
"mov x24, x2",
"mov x25, x3",
"mov x20, x24",
"mov x30, x25",
"cmp x24, x22",
"ccmp x25, x23, #nzcv, eq",
"cset x22, eq",
"msr nzcv, x21",
"rmif x22, #62, #nZcv",
"csel x4, x20, x4, ne",
"csel x6, x30, x6, ne"
"mov x20, x2",
"mov x21, x3",
"mov x24, x20",
"mov x25, x21",
"mrs x0, nzcv",
"cmp w20, w22",
"ccmp w21, w23, #nzcv, eq",
"rmif x0, #0, #NzCV",
"csel x4, x24, x4, ne",
"csel x6, x25, x6, ne"
]
},
"rdrand ax": {

View File

@ -788,21 +788,21 @@
"mov w24, w5",
"mov w25, w24",
"mov w24, w21",
"mrs x21, nzcv",
"mov w2, w22",
"mov w3, w23",
"caspal w2, w3, w24, w25, [x20]",
"mov w24, w2",
"mov w25, w3",
"mov w20, w24",
"mov w30, w25",
"cmp x24, x22",
"ccmp x25, x23, #nzcv, eq",
"cset x22, eq",
"bfi w21, w22, #30, #1",
"msr nzcv, x21",
"csel x4, x20, x4, ne",
"csel x6, x30, x6, ne"
"mov w20, w2",
"mov w21, w3",
"mov w24, w20",
"mov w25, w21",
"mrs x0, nzcv",
"cmp w20, w22",
"ccmp w21, w23, #nzcv, eq",
"cset w1, eq",
"bfi w0, w1, #30, #1",
"msr nzcv, x0",
"csel x4, x24, x4, ne",
"csel x6, x25, x6, ne"
]
},
"cmpxchg16b [rbp]": {
@ -814,21 +814,21 @@
"mov x23, x6",
"mov x24, x7",
"mov x25, x5",
"mrs x21, nzcv",
"mov x2, x22",
"mov x3, x23",
"caspal x2, x3, x24, x25, [x20]",
"mov x24, x2",
"mov x25, x3",
"mov x20, x24",
"mov x30, x25",
"cmp x24, x22",
"ccmp x25, x23, #nzcv, eq",
"cset x22, eq",
"bfi w21, w22, #30, #1",
"msr nzcv, x21",
"csel x4, x20, x4, ne",
"csel x6, x30, x6, ne"
"mov x20, x2",
"mov x21, x3",
"mov x24, x20",
"mov x25, x21",
"mrs x0, nzcv",
"cmp w20, w22",
"ccmp w21, w23, #nzcv, eq",
"cset w1, eq",
"bfi w0, w1, #30, #1",
"msr nzcv, x0",
"csel x4, x24, x4, ne",
"csel x6, x25, x6, ne"
]
},
"rdrand ax": {