From 6866c3d0aceb2928f26f48814d9acf8b757c2769 Mon Sep 17 00:00:00 2001 From: Alyssa Rosenzweig Date: Fri, 29 Mar 2024 11:03:43 -0400 Subject: [PATCH 1/5] InstCountCI: add dead cmpxchg case not super optimizable but worth tracking. Signed-off-by: Alyssa Rosenzweig --- .../InstructionCountCI/FlagM/FlagOpts.json | 32 +++++++++++++++++++ 1 file changed, 32 insertions(+) diff --git a/unittests/InstructionCountCI/FlagM/FlagOpts.json b/unittests/InstructionCountCI/FlagM/FlagOpts.json index a1e03dab2..7a66fc927 100644 --- a/unittests/InstructionCountCI/FlagM/FlagOpts.json +++ b/unittests/InstructionCountCI/FlagM/FlagOpts.json @@ -284,6 +284,38 @@ "mov x26, x5", "cmn wzr, w26, lsl #24" ] + }, + "Dead cmpxchg flags": { + "ExpectedInstructionCount": 23, + "x86Insts": [ + "cmpxchg8b [rbp]", + "test rax, rax" + ], + "ExpectedArm64ASM": [ + "add x20, x9, #0x0 (0)", + "mov w21, w4", + "mov w22, w6", + "mov w23, w22", + "mov w22, w21", + "mov w21, w7", + "mov w24, w5", + "mov w25, w24", + "mov w24, w21", + "mov w2, w22", + "mov w3, w23", + "caspal w2, w3, w24, w25, [x20]", + "mov w20, w2", + "mov w21, w3", + "mov w24, w20", + "mov w25, w21", + "cmp x20, x22", + "ccmp x21, x23, #nzcv, eq", + "cset x20, eq", + "rmif x20, #62, #nZcv", + "csel x4, x24, x4, ne", + "csel x6, x25, x6, ne", + "ands x26, x4, x4" + ] } } } From deba6a1b76dfab264c12d0289e7b9f32b397ae5b Mon Sep 17 00:00:00 2001 From: Alyssa Rosenzweig Date: Fri, 29 Mar 2024 14:12:14 -0400 Subject: [PATCH 2/5] JIT: add comment about unaligned backpatching save future me some grief. 
Signed-off-by: Alyssa Rosenzweig --- FEXCore/Source/Interface/Core/JIT/Arm64/AtomicOps.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/FEXCore/Source/Interface/Core/JIT/Arm64/AtomicOps.cpp b/FEXCore/Source/Interface/Core/JIT/Arm64/AtomicOps.cpp index d0d7df656..976336389 100644 --- a/FEXCore/Source/Interface/Core/JIT/Arm64/AtomicOps.cpp +++ b/FEXCore/Source/Interface/Core/JIT/Arm64/AtomicOps.cpp @@ -36,6 +36,7 @@ DEF_OP(CASPair) { ARMEmitter::SingleUseForwardLabel LoopExpected; Bind(&LoopTop); + // This instruction sequence must be synced with HandleCASPAL_Armv8. ldaxp(EmitSize, TMP2, TMP3, MemSrc); cmp(EmitSize, TMP2, Expected.first); ccmp(EmitSize, TMP3, Expected.second, ARMEmitter::StatusFlags::None, ARMEmitter::Condition::CC_EQ); From 9fd32f07cbea87852f01d2cbc23efd299e202040 Mon Sep 17 00:00:00 2001 From: Alyssa Rosenzweig Date: Fri, 29 Mar 2024 14:12:27 -0400 Subject: [PATCH 3/5] JIT: preserve nzcv for the slow atomic path Signed-off-by: Alyssa Rosenzweig --- FEXCore/Source/Interface/Core/JIT/Arm64/AtomicOps.cpp | 10 ++++++++++ FEXCore/Source/Interface/IR/IR.json | 1 - 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/FEXCore/Source/Interface/Core/JIT/Arm64/AtomicOps.cpp b/FEXCore/Source/Interface/Core/JIT/Arm64/AtomicOps.cpp index 976336389..8cf864c7f 100644 --- a/FEXCore/Source/Interface/Core/JIT/Arm64/AtomicOps.cpp +++ b/FEXCore/Source/Interface/Core/JIT/Arm64/AtomicOps.cpp @@ -31,6 +31,13 @@ DEF_OP(CASPair) { mov(EmitSize, Dst.second, TMP4.R()); } else { + // Save NZCV so we don't have to mark this op as clobbering NZCV (the + // SupportsAtomics path does not clobber NZCV and this !SupportsAtomics path + // is so slow it's not worth the complexity of splitting the IR op.). We + // clobber NZCV inside the hot loop and we can't replace cmp/ccmp/b.ne with + // something NZCV-preserving without requiring an extra instruction. 
+ mrs(TMP1, ARMEmitter::SystemRegister::NZCV); + ARMEmitter::BackwardLabel LoopTop; ARMEmitter::SingleUseForwardLabel LoopNotExpected; ARMEmitter::SingleUseForwardLabel LoopExpected; @@ -55,6 +62,9 @@ DEF_OP(CASPair) { // Might have hit the case where ldaxr was hit but stlxr wasn't clrex(); Bind(&LoopExpected); + + // Restore + msr(ARMEmitter::SystemRegister::NZCV, TMP1); } } diff --git a/FEXCore/Source/Interface/IR/IR.json b/FEXCore/Source/Interface/IR/IR.json index 16bacee11..f3e8f640f 100644 --- a/FEXCore/Source/Interface/IR/IR.json +++ b/FEXCore/Source/Interface/IR/IR.json @@ -642,7 +642,6 @@ ], "HasDest": true, "DestSize": "Size", - "ImplicitFlagClobber": true, "NumElements": "2", "EmitValidation": [ "Size == FEXCore::IR::OpSize::i64Bit || Size == FEXCore::IR::OpSize::i128Bit" From 706065b0e23fa69f9dd590b4c7e97114aaaf9449 Mon Sep 17 00:00:00 2001 From: Alyssa Rosenzweig Date: Fri, 29 Mar 2024 10:42:01 -0400 Subject: [PATCH 4/5] OpcodeDispatcher: accelerate cmpxchg with flagm Signed-off-by: Alyssa Rosenzweig --- .../Interface/Core/JIT/Arm64/ALUOps.cpp | 26 +++++++++++++++++++ .../Interface/Core/OpcodeDispatcher.cpp | 12 ++------- FEXCore/Source/Interface/IR/IR.json | 5 ++++ .../RedundantFlagCalculationElimination.cpp | 6 +++++ 4 files changed, 39 insertions(+), 10 deletions(-) diff --git a/FEXCore/Source/Interface/Core/JIT/Arm64/ALUOps.cpp b/FEXCore/Source/Interface/Core/JIT/Arm64/ALUOps.cpp index 0055812e8..129d05a4f 100644 --- a/FEXCore/Source/Interface/Core/JIT/Arm64/ALUOps.cpp +++ b/FEXCore/Source/Interface/Core/JIT/Arm64/ALUOps.cpp @@ -5,6 +5,7 @@ tags: backend|arm64 $end_info$ */ +#include "FEXCore/IR/IR.h" #include "Interface/Context/Context.h" #include "Interface/Core/ArchHelpers/CodeEmitter/Emitter.h" #include "Interface/Core/ArchHelpers/CodeEmitter/Registers.h" @@ -299,6 +300,31 @@ DEF_OP(SubNZCV) { } } +DEF_OP(CmpPairZ) { + auto Op = IROp->C(); + const uint8_t OpSize = IROp->Size; + + const auto EmitSize = OpSize == IR::i64Bit ? 
ARMEmitter::Size::i64Bit : ARMEmitter::Size::i32Bit; + + // Save NZCV + mrs(TMP1, ARMEmitter::SystemRegister::NZCV); + + // Compare, setting Z and clobbering NzCV + const auto Src1 = GetRegPair(Op->Src1.ID()); + const auto Src2 = GetRegPair(Op->Src2.ID()); + cmp(EmitSize, Src1.first, Src2.first); + ccmp(EmitSize, Src1.second, Src2.second, ARMEmitter::StatusFlags::None, ARMEmitter::Condition::CC_EQ); + + // Restore NzCV + if (CTX->HostFeatures.SupportsFlagM) { + rmif(TMP1, 0, 0xb /* NzCV */); + } else { + cset(ARMEmitter::Size::i32Bit, TMP2, ARMEmitter::Condition::CC_EQ); + bfi(ARMEmitter::Size::i32Bit, TMP1, TMP2, 30 /* lsb: Z */, 1); + msr(ARMEmitter::SystemRegister::NZCV, TMP1); + } +} + DEF_OP(CarryInvert) { LOGMAN_THROW_A_FMT(CTX->HostFeatures.SupportsFlagM, "Unsupported flagm op"); cfinv(); diff --git a/FEXCore/Source/Interface/Core/OpcodeDispatcher.cpp b/FEXCore/Source/Interface/Core/OpcodeDispatcher.cpp index 12091359d..2214f4c92 100644 --- a/FEXCore/Source/Interface/Core/OpcodeDispatcher.cpp +++ b/FEXCore/Source/Interface/Core/OpcodeDispatcher.cpp @@ -4258,16 +4258,8 @@ void OpDispatchBuilder::CMPXCHGPairOp(OpcodeArgs) { OrderedNode *Result_Lower = _ExtractElementPair(IR::SizeToOpSize(Size), CASResult, 0); OrderedNode *Result_Upper = _ExtractElementPair(IR::SizeToOpSize(Size), CASResult, 1); - // Set ZF if memory result was expected - auto OneConst = _Constant(1); - auto ZeroConst = _Constant(0); - - OrderedNode *ZFResult = _Select(FEXCore::IR::COND_EQ, - CASResult, Expected, - OneConst, ZeroConst); - - // Set ZF - SetRFLAG(ZFResult); + HandleNZCV_RMW(); + _CmpPairZ(IR::SizeToOpSize(Size), CASResult, Expected); CalculateDeferredFlags(); auto UpdateIfNotZF = [this](auto Reg, auto Value) { diff --git a/FEXCore/Source/Interface/IR/IR.json b/FEXCore/Source/Interface/IR/IR.json index f3e8f640f..abf9b8586 100644 --- a/FEXCore/Source/Interface/IR/IR.json +++ b/FEXCore/Source/Interface/IR/IR.json @@ -1094,6 +1094,11 @@ "Size == FEXCore::IR::OpSize::i32Bit || Size 
== FEXCore::IR::OpSize::i64Bit" ] }, + "CmpPairZ OpSize:#Size, GPRPair:$Src1, GPRPair:$Src2": { + "Desc": ["Compares register pairs and sets Z accordingly, preserving N/C/V.", + "This accelerates cmpxchg."], + "HasSideEffects": true + }, "SubNZCV OpSize:#Size, GPR:$Src1, GPR:$Src2": { "Desc": ["Set NZCV for the difference of two GPRs. ", "Carry flag uses arm64 definition, inverted x86.", diff --git a/FEXCore/Source/Interface/IR/Passes/RedundantFlagCalculationElimination.cpp b/FEXCore/Source/Interface/IR/Passes/RedundantFlagCalculationElimination.cpp index 4a0fde231..2df6270d8 100644 --- a/FEXCore/Source/Interface/IR/Passes/RedundantFlagCalculationElimination.cpp +++ b/FEXCore/Source/Interface/IR/Passes/RedundantFlagCalculationElimination.cpp @@ -176,6 +176,12 @@ DeadFlagCalculationEliminination::Classify(IROp_Header *IROp) .CanEliminate = true, }; + case OP_CMPPAIRZ: + return { + .Write = FLAG_Z, + .CanEliminate = true, + }; + case OP_CARRYINVERT: return { .Read = FLAG_C, From 9bca0521465353520d06fa853a12d0b919b797e9 Mon Sep 17 00:00:00 2001 From: Alyssa Rosenzweig Date: Fri, 29 Mar 2024 14:13:52 -0400 Subject: [PATCH 5/5] InstCountCI: Update Signed-off-by: Alyssa Rosenzweig --- .../InstructionCountCI/FlagM/FlagOpts.json | 8 ++-- .../FlagM/HotBlocks_32Bit.json | 24 +++++----- .../FlagM/SecondaryGroup.json | 48 +++++++++---------- .../InstructionCountCI/SecondaryGroup.json | 48 +++++++++---------- 4 files changed, 61 insertions(+), 67 deletions(-) diff --git a/unittests/InstructionCountCI/FlagM/FlagOpts.json b/unittests/InstructionCountCI/FlagM/FlagOpts.json index 7a66fc927..d2c0c60de 100644 --- a/unittests/InstructionCountCI/FlagM/FlagOpts.json +++ b/unittests/InstructionCountCI/FlagM/FlagOpts.json @@ -308,10 +308,10 @@ "mov w21, w3", "mov w24, w20", "mov w25, w21", - "cmp x20, x22", - "ccmp x21, x23, #nzcv, eq", - "cset x20, eq", - "rmif x20, #62, #nZcv", + "mrs x0, nzcv", + "cmp w20, w22", + "ccmp w21, w23, #nzcv, eq", + "rmif x0, #0, #NzCV", "csel x4, x24, x4, 
ne", "csel x6, x25, x6, ne", "ands x26, x4, x4" diff --git a/unittests/InstructionCountCI/FlagM/HotBlocks_32Bit.json b/unittests/InstructionCountCI/FlagM/HotBlocks_32Bit.json index e85848d4e..797d1d4a0 100644 --- a/unittests/InstructionCountCI/FlagM/HotBlocks_32Bit.json +++ b/unittests/InstructionCountCI/FlagM/HotBlocks_32Bit.json @@ -80,7 +80,7 @@ ] }, "dxvk hotblock from MGRR": { - "ExpectedInstructionCount": 42, + "ExpectedInstructionCount": 40, "Comment": [ "Hottest block in Metal Gear Rising: Revengeance render thread" ], @@ -128,21 +128,19 @@ "mov w23, w6", "mov w24, w21", "mov w25, w5", - "mrs x21, nzcv", "mov w2, w22", "mov w3, w23", "caspal w2, w3, w24, w25, [x20]", - "mov w24, w2", - "mov w25, w3", - "mov w20, w24", - "mov w12, w25", - "cmp x24, x22", - "ccmp x25, x23, #nzcv, eq", - "cset x22, eq", - "msr nzcv, x21", - "rmif x22, #62, #nZcv", - "csel x4, x20, x4, ne", - "csel x6, x12, x6, ne" + "mov w20, w2", + "mov w21, w3", + "mov w24, w20", + "mov w25, w21", + "mrs x0, nzcv", + "cmp w20, w22", + "ccmp w21, w23, #nzcv, eq", + "rmif x0, #0, #NzCV", + "csel x4, x24, x4, ne", + "csel x6, x25, x6, ne" ] }, "Psychonauts matrix swizzle": { diff --git a/unittests/InstructionCountCI/FlagM/SecondaryGroup.json b/unittests/InstructionCountCI/FlagM/SecondaryGroup.json index 76f03aa25..de0bf4368 100644 --- a/unittests/InstructionCountCI/FlagM/SecondaryGroup.json +++ b/unittests/InstructionCountCI/FlagM/SecondaryGroup.json @@ -644,7 +644,7 @@ ] }, "cmpxchg8b [rbp]": { - "ExpectedInstructionCount": 24, + "ExpectedInstructionCount": 22, "Comment": "GROUP9 0x0F 0xC7 /1", "ExpectedArm64ASM": [ "add x20, x9, #0x0 (0)", @@ -656,25 +656,23 @@ "mov w24, w5", "mov w25, w24", "mov w24, w21", - "mrs x21, nzcv", "mov w2, w22", "mov w3, w23", "caspal w2, w3, w24, w25, [x20]", - "mov w24, w2", - "mov w25, w3", - "mov w20, w24", - "mov w30, w25", - "cmp x24, x22", - "ccmp x25, x23, #nzcv, eq", - "cset x22, eq", - "msr nzcv, x21", - "rmif x22, #62, #nZcv", - "csel x4, x20, x4, ne", 
- "csel x6, x30, x6, ne" + "mov w20, w2", + "mov w21, w3", + "mov w24, w20", + "mov w25, w21", + "mrs x0, nzcv", + "cmp w20, w22", + "ccmp w21, w23, #nzcv, eq", + "rmif x0, #0, #NzCV", + "csel x4, x24, x4, ne", + "csel x6, x25, x6, ne" ] }, "cmpxchg16b [rbp]": { - "ExpectedInstructionCount": 20, + "ExpectedInstructionCount": 18, "Comment": "GROUP9 0x0F 0xC7 /1", "ExpectedArm64ASM": [ "add x20, x9, #0x0 (0)", @@ -682,21 +680,19 @@ "mov x23, x6", "mov x24, x7", "mov x25, x5", - "mrs x21, nzcv", "mov x2, x22", "mov x3, x23", "caspal x2, x3, x24, x25, [x20]", - "mov x24, x2", - "mov x25, x3", - "mov x20, x24", - "mov x30, x25", - "cmp x24, x22", - "ccmp x25, x23, #nzcv, eq", - "cset x22, eq", - "msr nzcv, x21", - "rmif x22, #62, #nZcv", - "csel x4, x20, x4, ne", - "csel x6, x30, x6, ne" + "mov x20, x2", + "mov x21, x3", + "mov x24, x20", + "mov x25, x21", + "mrs x0, nzcv", + "cmp w20, w22", + "ccmp w21, w23, #nzcv, eq", + "rmif x0, #0, #NzCV", + "csel x4, x24, x4, ne", + "csel x6, x25, x6, ne" ] }, "rdrand ax": { diff --git a/unittests/InstructionCountCI/SecondaryGroup.json b/unittests/InstructionCountCI/SecondaryGroup.json index 825100170..3f8802609 100644 --- a/unittests/InstructionCountCI/SecondaryGroup.json +++ b/unittests/InstructionCountCI/SecondaryGroup.json @@ -788,21 +788,21 @@ "mov w24, w5", "mov w25, w24", "mov w24, w21", - "mrs x21, nzcv", "mov w2, w22", "mov w3, w23", "caspal w2, w3, w24, w25, [x20]", - "mov w24, w2", - "mov w25, w3", - "mov w20, w24", - "mov w30, w25", - "cmp x24, x22", - "ccmp x25, x23, #nzcv, eq", - "cset x22, eq", - "bfi w21, w22, #30, #1", - "msr nzcv, x21", - "csel x4, x20, x4, ne", - "csel x6, x30, x6, ne" + "mov w20, w2", + "mov w21, w3", + "mov w24, w20", + "mov w25, w21", + "mrs x0, nzcv", + "cmp w20, w22", + "ccmp w21, w23, #nzcv, eq", + "cset w1, eq", + "bfi w0, w1, #30, #1", + "msr nzcv, x0", + "csel x4, x24, x4, ne", + "csel x6, x25, x6, ne" ] }, "cmpxchg16b [rbp]": { @@ -814,21 +814,21 @@ "mov x23, x6", "mov x24, x7", "mov 
x25, x5", - "mrs x21, nzcv", "mov x2, x22", "mov x3, x23", "caspal x2, x3, x24, x25, [x20]", - "mov x24, x2", - "mov x25, x3", - "mov x20, x24", - "mov x30, x25", - "cmp x24, x22", - "ccmp x25, x23, #nzcv, eq", - "cset x22, eq", - "bfi w21, w22, #30, #1", - "msr nzcv, x21", - "csel x4, x20, x4, ne", - "csel x6, x30, x6, ne" + "mov x20, x2", + "mov x21, x3", + "mov x24, x20", + "mov x25, x21", + "mrs x0, nzcv", + "cmp w20, w22", + "ccmp w21, w23, #nzcv, eq", + "cset w1, eq", + "bfi w0, w1, #30, #1", + "msr nzcv, x0", + "csel x4, x24, x4, ne", + "csel x6, x25, x6, ne" ] }, "rdrand ax": {