From 9baf5adf4a12542e8d82234615adb7a8e1237c81 Mon Sep 17 00:00:00 2001 From: Esther1024 Date: Sat, 17 May 2025 18:52:36 +0000 Subject: [PATCH] basic opcode matcher optimizations (#1) Matcher logic now uses a function pointer array to match faster (cache aligned too); based on the idea that the CPU will keep it in a hot section (maybe even mark with __attribute__((hot))?); profiling seems to measure some small gains (codegen is also better), Opcodes re-grouped so checks aren't that expensive (i.e. no holes), Do not create vectors in ABI push/pop (compiler fails to elide?!); explicitly use a hardcoded array and "copy" it over Reviewed-on: https://git.eden-emu.dev/eden-emu/dynarmic/pulls/1 Co-authored-by: Esther1024 Co-committed-by: Esther1024 --- src/dynarmic/backend/x64/a64_emit_x64.cpp | 26 ++-- src/dynarmic/backend/x64/abi.cpp | 56 +++++++- src/dynarmic/backend/x64/emit_x64.h | 4 + src/dynarmic/backend/x64/reg_alloc.h | 19 +-- src/dynarmic/frontend/A64/decoder/a64.h | 7 +- src/dynarmic/ir/microinstruction.h | 14 +- src/dynarmic/ir/opcodes.inc | 162 +++++++++++----------- 7 files changed, 175 insertions(+), 113 deletions(-) diff --git a/src/dynarmic/backend/x64/a64_emit_x64.cpp b/src/dynarmic/backend/x64/a64_emit_x64.cpp index d4c15e89..fa2cb86e 100644 --- a/src/dynarmic/backend/x64/a64_emit_x64.cpp +++ b/src/dynarmic/backend/x64/a64_emit_x64.cpp @@ -95,30 +95,40 @@ A64EmitX64::BlockDescriptor A64EmitX64::Emit(IR::Block& block) { ASSERT(block.GetCondition() == IR::Cond::AL); + static void (EmitX64::*opcode_handlers[])(EmitContext& context, IR::Inst* inst) = { +#define OPCODE(name, type, ...) &EmitX64::Emit##name, +#define A32OPC(...) +#define A64OPC(...) +#include "dynarmic/ir/opcodes.inc" +#undef OPCODE +#undef A32OPC +#undef A64OPC + }; + for (auto iter = block.begin(); iter != block.end(); ++iter) { IR::Inst* inst = &*iter; // Call the relevant Emit* member function. switch (inst->GetOpcode()) { #define OPCODE(name, type, ...) 
\ - case IR::Opcode::name: \ - A64EmitX64::Emit##name(ctx, inst); \ - break; + case IR::Opcode::name: goto true_opcode_branch; #define A32OPC(...) #define A64OPC(name, type, ...) \ - case IR::Opcode::A64##name: \ - A64EmitX64::EmitA64##name(ctx, inst); \ - break; + case IR::Opcode::A64##name: \ + A64EmitX64::EmitA64##name(ctx, inst); \ + break; #include "dynarmic/ir/opcodes.inc" #undef OPCODE #undef A32OPC #undef A64OPC - default: ASSERT_MSG(false, "Invalid opcode: {}", inst->GetOpcode()); break; } - + goto false_opcode_branch; +true_opcode_branch: + (this->*opcode_handlers[size_t(inst->GetOpcode())])(ctx, inst); +false_opcode_branch: ctx.reg_alloc.EndOfAllocScope(); if (conf.very_verbose_debugging_output) { diff --git a/src/dynarmic/backend/x64/abi.cpp b/src/dynarmic/backend/x64/abi.cpp index d6a83b65..f7488ea8 100644 --- a/src/dynarmic/backend/x64/abi.cpp +++ b/src/dynarmic/backend/x64/abi.cpp @@ -120,16 +120,60 @@ void ABI_PopCallerSaveRegistersAndAdjustStack(BlockOfCode& code, size_t frame_si ABI_PopRegistersAndAdjustStack(code, frame_size, ABI_ALL_CALLER_SAVE); } +static consteval size_t ABI_AllCallerSaveSize() noexcept { + return ABI_ALL_CALLER_SAVE.max_size(); +} +static consteval std::array ABI_AllCallerSaveExcept(std::size_t except) noexcept { + std::array arr; + for(std::size_t i = 0; i < arr.size(); ++i) { + arr[i] = static_cast(i + (i >= except ? 
1 : 0)); + } + return arr; +} + +alignas(64) static constinit std::array reg_table[] = { + ABI_AllCallerSaveExcept(0), + ABI_AllCallerSaveExcept(1), + ABI_AllCallerSaveExcept(2), + ABI_AllCallerSaveExcept(3), + ABI_AllCallerSaveExcept(4), + ABI_AllCallerSaveExcept(5), + ABI_AllCallerSaveExcept(6), + ABI_AllCallerSaveExcept(7), + ABI_AllCallerSaveExcept(8), + ABI_AllCallerSaveExcept(9), + ABI_AllCallerSaveExcept(10), + ABI_AllCallerSaveExcept(11), + ABI_AllCallerSaveExcept(12), + ABI_AllCallerSaveExcept(13), + ABI_AllCallerSaveExcept(14), + ABI_AllCallerSaveExcept(15), + ABI_AllCallerSaveExcept(16), + ABI_AllCallerSaveExcept(17), + ABI_AllCallerSaveExcept(18), + ABI_AllCallerSaveExcept(19), + ABI_AllCallerSaveExcept(20), + ABI_AllCallerSaveExcept(21), + ABI_AllCallerSaveExcept(22), + ABI_AllCallerSaveExcept(23), + ABI_AllCallerSaveExcept(24), + ABI_AllCallerSaveExcept(25), + ABI_AllCallerSaveExcept(26), + ABI_AllCallerSaveExcept(27), + ABI_AllCallerSaveExcept(28), + ABI_AllCallerSaveExcept(29), + ABI_AllCallerSaveExcept(30), + ABI_AllCallerSaveExcept(31), +}; + void ABI_PushCallerSaveRegistersAndAdjustStackExcept(BlockOfCode& code, HostLoc exception) { - std::vector regs; - std::remove_copy(ABI_ALL_CALLER_SAVE.begin(), ABI_ALL_CALLER_SAVE.end(), std::back_inserter(regs), exception); - ABI_PushRegistersAndAdjustStack(code, 0, regs); + assert(size_t(exception) < 32); + ABI_PushRegistersAndAdjustStack(code, 0, reg_table[size_t(exception) % 32]); } void ABI_PopCallerSaveRegistersAndAdjustStackExcept(BlockOfCode& code, HostLoc exception) { - std::vector regs; - std::remove_copy(ABI_ALL_CALLER_SAVE.begin(), ABI_ALL_CALLER_SAVE.end(), std::back_inserter(regs), exception); - ABI_PopRegistersAndAdjustStack(code, 0, regs); + assert(size_t(exception) < 32); + ABI_PopRegistersAndAdjustStack(code, 0, reg_table[size_t(exception) % 32]); } } // namespace Dynarmic::Backend::X64 diff --git a/src/dynarmic/backend/x64/emit_x64.h b/src/dynarmic/backend/x64/emit_x64.h index 
2d3945f6..f9fe1a36 100644 --- a/src/dynarmic/backend/x64/emit_x64.h +++ b/src/dynarmic/backend/x64/emit_x64.h @@ -35,6 +35,7 @@ enum class OptimizationFlag : u32; namespace Dynarmic::Backend::X64 { +class A64EmitX64; class BlockOfCode; using A64FullVectorWidth = std::integral_constant; @@ -139,6 +140,9 @@ protected: ExceptionHandler exception_handler; ankerl::unordered_dense::map block_descriptors; ankerl::unordered_dense::map patch_information; + + // We need materialized protected members + friend class A64EmitX64; }; } // namespace Dynarmic::Backend::X64 diff --git a/src/dynarmic/backend/x64/reg_alloc.h b/src/dynarmic/backend/x64/reg_alloc.h index f7441835..ddbaad98 100644 --- a/src/dynarmic/backend/x64/reg_alloc.h +++ b/src/dynarmic/backend/x64/reg_alloc.h @@ -52,19 +52,20 @@ public: void EmitVerboseDebuggingOutput(BlockOfCode& code, size_t host_loc_index) const; private: +//non trivial + std::vector values; +//sometimes zeroed + size_t accumulated_uses = 0; + // Block state + size_t total_uses = 0; + // Value state + size_t max_bit_width = 0; +//always zeroed // Current instruction state size_t is_being_used_count = 0; + size_t current_references = 0; bool is_scratch = false; bool is_set_last_use = false; - - // Block state - size_t current_references = 0; - size_t accumulated_uses = 0; - size_t total_uses = 0; - - // Value state - std::vector values; - size_t max_bit_width = 0; }; struct Argument { diff --git a/src/dynarmic/frontend/A64/decoder/a64.h b/src/dynarmic/frontend/A64/decoder/a64.h index d6f447e5..f2648935 100644 --- a/src/dynarmic/frontend/A64/decoder/a64.h +++ b/src/dynarmic/frontend/A64/decoder/a64.h @@ -71,9 +71,10 @@ DecodeTable GetDecodeTable() { template std::optional>> Decode(u32 instruction) { - static const auto table = GetDecodeTable(); - - const auto matches_instruction = [instruction](const auto& matcher) { return matcher.Matches(instruction); }; + alignas(64) static const auto table = GetDecodeTable(); + const auto matches_instruction 
= [instruction](const auto& matcher) { + return matcher.Matches(instruction); + }; const auto& subtable = table[detail::ToFastLookupIndex(instruction)]; auto iter = std::find_if(subtable.begin(), subtable.end(), matches_instruction); diff --git a/src/dynarmic/ir/microinstruction.h b/src/dynarmic/ir/microinstruction.h index 26b6b899..8bd13437 100644 --- a/src/dynarmic/ir/microinstruction.h +++ b/src/dynarmic/ir/microinstruction.h @@ -150,13 +150,15 @@ private: void Use(const Value& value); void UndoUse(const Value& value); - Opcode op; - unsigned use_count = 0; - unsigned name = 0; - std::array args; - + // TODO: so much padding wasted with mcl::intrusive_node + // 16 + 1, 24 + Opcode op; //2 (6) // Linked list of pseudooperations associated with this instruction. - Inst* next_pseudoop = nullptr; + Inst* next_pseudoop = nullptr; //8 (14) + unsigned use_count = 0; //4 (0) + unsigned name = 0; //4 (4) + alignas(64) std::array args; //16 * 4 = 64 (1 cache line) }; +static_assert(sizeof(Inst) == 128); } // namespace Dynarmic::IR diff --git a/src/dynarmic/ir/opcodes.inc b/src/dynarmic/ir/opcodes.inc index dd060e0a..0a29db6b 100644 --- a/src/dynarmic/ir/opcodes.inc +++ b/src/dynarmic/ir/opcodes.inc @@ -7,78 +7,6 @@ OPCODE(Identity, Opaque, Opaq OPCODE(Breakpoint, Void, ) OPCODE(CallHostFunction, Void, U64, Opaque, Opaque, Opaque ) -// A32 Context getters/setters -A32OPC(SetCheckBit, Void, U1 ) -A32OPC(GetRegister, U32, A32Reg ) -A32OPC(GetExtendedRegister32, U32, A32ExtReg ) -A32OPC(GetExtendedRegister64, U64, A32ExtReg ) -A32OPC(GetVector, U128, A32ExtReg ) -A32OPC(SetRegister, Void, A32Reg, U32 ) -A32OPC(SetExtendedRegister32, Void, A32ExtReg, U32 ) -A32OPC(SetExtendedRegister64, Void, A32ExtReg, U64 ) -A32OPC(SetVector, Void, A32ExtReg, U128 ) -A32OPC(GetCpsr, U32, ) -A32OPC(SetCpsr, Void, U32 ) -A32OPC(SetCpsrNZCV, Void, NZCV ) -A32OPC(SetCpsrNZCVRaw, Void, U32 ) -A32OPC(SetCpsrNZCVQ, Void, U32 ) -A32OPC(SetCpsrNZ, Void, NZCV ) -A32OPC(SetCpsrNZC, Void, NZCV, U1 ) 
-A32OPC(GetCFlag, U1, ) -A32OPC(OrQFlag, Void, U1 ) -A32OPC(GetGEFlags, U32, ) -A32OPC(SetGEFlags, Void, U32 ) -A32OPC(SetGEFlagsCompressed, Void, U32 ) -A32OPC(BXWritePC, Void, U32 ) -A32OPC(UpdateUpperLocationDescriptor, Void, ) -A32OPC(CallSupervisor, Void, U32 ) -A32OPC(ExceptionRaised, Void, U32, U64 ) -A32OPC(DataSynchronizationBarrier, Void, ) -A32OPC(DataMemoryBarrier, Void, ) -A32OPC(InstructionSynchronizationBarrier, Void, ) -A32OPC(GetFpscr, U32, ) -A32OPC(SetFpscr, Void, U32, ) -A32OPC(GetFpscrNZCV, U32, ) -A32OPC(SetFpscrNZCV, Void, NZCV ) - -// A64 Context getters/setters -A64OPC(SetCheckBit, Void, U1 ) -A64OPC(GetCFlag, U1, ) -A64OPC(GetNZCVRaw, U32, ) -A64OPC(SetNZCVRaw, Void, U32 ) -A64OPC(SetNZCV, Void, NZCV ) -A64OPC(GetW, U32, A64Reg ) -A64OPC(GetX, U64, A64Reg ) -A64OPC(GetS, U128, A64Vec ) -A64OPC(GetD, U128, A64Vec ) -A64OPC(GetQ, U128, A64Vec ) -A64OPC(GetSP, U64, ) -A64OPC(GetFPCR, U32, ) -A64OPC(GetFPSR, U32, ) -A64OPC(SetW, Void, A64Reg, U32 ) -A64OPC(SetX, Void, A64Reg, U64 ) -A64OPC(SetS, Void, A64Vec, U128 ) -A64OPC(SetD, Void, A64Vec, U128 ) -A64OPC(SetQ, Void, A64Vec, U128 ) -A64OPC(SetSP, Void, U64 ) -A64OPC(SetFPCR, Void, U32 ) -A64OPC(SetFPSR, Void, U32 ) -A64OPC(SetPC, Void, U64 ) -A64OPC(CallSupervisor, Void, U32 ) -A64OPC(ExceptionRaised, Void, U64, U64 ) -A64OPC(DataCacheOperationRaised, Void, U64, U64, U64 ) -A64OPC(InstructionCacheOperationRaised, Void, U64, U64 ) -A64OPC(DataSynchronizationBarrier, Void, ) -A64OPC(DataMemoryBarrier, Void, ) -A64OPC(InstructionSynchronizationBarrier, Void, ) -A64OPC(GetCNTFRQ, U32, ) -A64OPC(GetCNTPCT, U64, ) -A64OPC(GetCTR, U32, ) -A64OPC(GetDCZID, U32, ) -A64OPC(GetTPIDR, U64, ) -A64OPC(GetTPIDRRO, U64, ) -A64OPC(SetTPIDR, Void, U64 ) - // Hints OPCODE(PushRSB, Void, U64 ) @@ -716,6 +644,40 @@ OPCODE(FPVectorToUnsignedFixed16, U128, U128 OPCODE(FPVectorToUnsignedFixed32, U128, U128, U8, U8, U1 ) OPCODE(FPVectorToUnsignedFixed64, U128, U128, U8, U8, U1 ) +// A32 Context getters/setters 
+A32OPC(SetCheckBit, Void, U1 ) +A32OPC(GetRegister, U32, A32Reg ) +A32OPC(GetExtendedRegister32, U32, A32ExtReg ) +A32OPC(GetExtendedRegister64, U64, A32ExtReg ) +A32OPC(GetVector, U128, A32ExtReg ) +A32OPC(SetRegister, Void, A32Reg, U32 ) +A32OPC(SetExtendedRegister32, Void, A32ExtReg, U32 ) +A32OPC(SetExtendedRegister64, Void, A32ExtReg, U64 ) +A32OPC(SetVector, Void, A32ExtReg, U128 ) +A32OPC(GetCpsr, U32, ) +A32OPC(SetCpsr, Void, U32 ) +A32OPC(SetCpsrNZCV, Void, NZCV ) +A32OPC(SetCpsrNZCVRaw, Void, U32 ) +A32OPC(SetCpsrNZCVQ, Void, U32 ) +A32OPC(SetCpsrNZ, Void, NZCV ) +A32OPC(SetCpsrNZC, Void, NZCV, U1 ) +A32OPC(GetCFlag, U1, ) +A32OPC(OrQFlag, Void, U1 ) +A32OPC(GetGEFlags, U32, ) +A32OPC(SetGEFlags, Void, U32 ) +A32OPC(SetGEFlagsCompressed, Void, U32 ) +A32OPC(BXWritePC, Void, U32 ) +A32OPC(UpdateUpperLocationDescriptor, Void, ) +A32OPC(CallSupervisor, Void, U32 ) +A32OPC(ExceptionRaised, Void, U32, U64 ) +A32OPC(DataSynchronizationBarrier, Void, ) +A32OPC(DataMemoryBarrier, Void, ) +A32OPC(InstructionSynchronizationBarrier, Void, ) +A32OPC(GetFpscr, U32, ) +A32OPC(SetFpscr, Void, U32, ) +A32OPC(GetFpscrNZCV, U32, ) +A32OPC(SetFpscrNZCV, Void, NZCV ) + // A32 Memory access A32OPC(ClearExclusive, Void, ) A32OPC(ReadMemory8, U8, U64, U32, AccType ) @@ -735,6 +697,53 @@ A32OPC(ExclusiveWriteMemory16, U32, U64, A32OPC(ExclusiveWriteMemory32, U32, U64, U32, U32, AccType ) A32OPC(ExclusiveWriteMemory64, U32, U64, U32, U64, AccType ) +// Coprocessor +A32OPC(CoprocInternalOperation, Void, CoprocInfo ) +A32OPC(CoprocSendOneWord, Void, CoprocInfo, U32 ) +A32OPC(CoprocSendTwoWords, Void, CoprocInfo, U32, U32 ) +A32OPC(CoprocGetOneWord, U32, CoprocInfo ) +A32OPC(CoprocGetTwoWords, U64, CoprocInfo ) +A32OPC(CoprocLoadWords, Void, CoprocInfo, U32 ) +A32OPC(CoprocStoreWords, Void, CoprocInfo, U32 ) + +// A64 Context getters/setters +A64OPC(SetCheckBit, Void, U1 ) +A64OPC(GetCFlag, U1, ) +A64OPC(GetNZCVRaw, U32, ) +A64OPC(SetNZCVRaw, Void, U32 ) +A64OPC(SetNZCV, Void, NZCV 
) +A64OPC(GetW, U32, A64Reg ) +A64OPC(GetX, U64, A64Reg ) +A64OPC(GetS, U128, A64Vec ) +A64OPC(GetD, U128, A64Vec ) +A64OPC(GetQ, U128, A64Vec ) +A64OPC(GetSP, U64, ) +A64OPC(GetFPCR, U32, ) +A64OPC(GetFPSR, U32, ) +A64OPC(SetW, Void, A64Reg, U32 ) +A64OPC(SetX, Void, A64Reg, U64 ) +A64OPC(SetS, Void, A64Vec, U128 ) +A64OPC(SetD, Void, A64Vec, U128 ) +A64OPC(SetQ, Void, A64Vec, U128 ) +A64OPC(SetSP, Void, U64 ) +A64OPC(SetFPCR, Void, U32 ) +A64OPC(SetFPSR, Void, U32 ) +A64OPC(SetPC, Void, U64 ) +A64OPC(CallSupervisor, Void, U32 ) +A64OPC(ExceptionRaised, Void, U64, U64 ) +A64OPC(DataCacheOperationRaised, Void, U64, U64, U64 ) +A64OPC(InstructionCacheOperationRaised, Void, U64, U64 ) +A64OPC(DataSynchronizationBarrier, Void, ) +A64OPC(DataMemoryBarrier, Void, ) +A64OPC(InstructionSynchronizationBarrier, Void, ) +A64OPC(GetCNTFRQ, U32, ) +A64OPC(GetCNTPCT, U64, ) +A64OPC(GetCTR, U32, ) +A64OPC(GetDCZID, U32, ) +A64OPC(GetTPIDR, U64, ) +A64OPC(GetTPIDRRO, U64, ) +A64OPC(SetTPIDR, Void, U64 ) + // A64 Memory access A64OPC(ClearExclusive, Void, ) A64OPC(ReadMemory8, U8, U64, U64, AccType ) @@ -758,13 +767,4 @@ A64OPC(ExclusiveWriteMemory32, U32, U64, A64OPC(ExclusiveWriteMemory64, U32, U64, U64, U64, AccType ) A64OPC(ExclusiveWriteMemory128, U32, U64, U64, U128, AccType ) -// Coprocessor -A32OPC(CoprocInternalOperation, Void, CoprocInfo ) -A32OPC(CoprocSendOneWord, Void, CoprocInfo, U32 ) -A32OPC(CoprocSendTwoWords, Void, CoprocInfo, U32, U32 ) -A32OPC(CoprocGetOneWord, U32, CoprocInfo ) -A32OPC(CoprocGetTwoWords, U64, CoprocInfo ) -A32OPC(CoprocLoadWords, Void, CoprocInfo, U32 ) -A32OPC(CoprocStoreWords, Void, CoprocInfo, U32 ) - // clang-format on