basic opcode matcher optimizations (#1)

Matcher logic now use a function pointer array to match faster (cache aligned too); based on the idea that the cpu will keep it in hot section (maybe even mark with __attribute__((hot))?); profiling seems to measure some small gains (codegen is also better),
Opcodes re-grouped so checks aren't that expensive (i.e no holes),
Do not create vectors in ABI push/pop (compiler fails to ellide?!); explicitly use a hardcoded array and "copy" it over

Reviewed-on: https://git.eden-emu.dev/eden-emu/dynarmic/pulls/1
Co-authored-by: Esther1024 <danishreyjavik@outlook.com>
Co-committed-by: Esther1024 <danishreyjavik@outlook.com>
This commit is contained in:
Esther1024
2025-05-17 18:52:36 +00:00
committed by crueter
parent 6583fdb214
commit 9baf5adf4a
7 changed files with 175 additions and 113 deletions

View File

@@ -95,30 +95,40 @@ A64EmitX64::BlockDescriptor A64EmitX64::Emit(IR::Block& block) {
ASSERT(block.GetCondition() == IR::Cond::AL);
static void (EmitX64::*opcode_handlers[])(EmitContext& context, IR::Inst* inst) = {
#define OPCODE(name, type, ...) &EmitX64::Emit##name,
#define A32OPC(...)
#define A64OPC(...)
#include "dynarmic/ir/opcodes.inc"
#undef OPCODE
#undef A32OPC
#undef A64OPC
};
for (auto iter = block.begin(); iter != block.end(); ++iter) {
IR::Inst* inst = &*iter;
// Call the relevant Emit* member function.
switch (inst->GetOpcode()) {
#define OPCODE(name, type, ...) \
case IR::Opcode::name: \
A64EmitX64::Emit##name(ctx, inst); \
break;
case IR::Opcode::name: goto true_opcode_branch;
#define A32OPC(...)
#define A64OPC(name, type, ...) \
case IR::Opcode::A64##name: \
A64EmitX64::EmitA64##name(ctx, inst); \
break;
case IR::Opcode::A64##name: \
A64EmitX64::EmitA64##name(ctx, inst); \
break;
#include "dynarmic/ir/opcodes.inc"
#undef OPCODE
#undef A32OPC
#undef A64OPC
default:
ASSERT_MSG(false, "Invalid opcode: {}", inst->GetOpcode());
break;
}
goto false_opcode_branch;
true_opcode_branch:
(this->*opcode_handlers[size_t(inst->GetOpcode())])(ctx, inst);
false_opcode_branch:
ctx.reg_alloc.EndOfAllocScope();
if (conf.very_verbose_debugging_output) {

View File

@@ -120,16 +120,60 @@ void ABI_PopCallerSaveRegistersAndAdjustStack(BlockOfCode& code, size_t frame_si
ABI_PopRegistersAndAdjustStack(code, frame_size, ABI_ALL_CALLER_SAVE);
}
static consteval size_t ABI_AllCallerSaveSize() noexcept {
return ABI_ALL_CALLER_SAVE.max_size();
}
static consteval std::array<HostLoc, ABI_AllCallerSaveSize() - 1> ABI_AllCallerSaveExcept(std::size_t except) noexcept {
std::array<HostLoc, ABI_AllCallerSaveSize() - 1> arr;
for(std::size_t i = 0; i < arr.size(); ++i) {
arr[i] = static_cast<HostLoc>(i + (i >= except ? 1 : 0));
}
return arr;
}
alignas(64) static constinit std::array<HostLoc, ABI_AllCallerSaveSize() - 1> reg_table[] = {
ABI_AllCallerSaveExcept(0),
ABI_AllCallerSaveExcept(1),
ABI_AllCallerSaveExcept(2),
ABI_AllCallerSaveExcept(3),
ABI_AllCallerSaveExcept(4),
ABI_AllCallerSaveExcept(5),
ABI_AllCallerSaveExcept(6),
ABI_AllCallerSaveExcept(7),
ABI_AllCallerSaveExcept(8),
ABI_AllCallerSaveExcept(9),
ABI_AllCallerSaveExcept(10),
ABI_AllCallerSaveExcept(11),
ABI_AllCallerSaveExcept(12),
ABI_AllCallerSaveExcept(13),
ABI_AllCallerSaveExcept(14),
ABI_AllCallerSaveExcept(15),
ABI_AllCallerSaveExcept(16),
ABI_AllCallerSaveExcept(17),
ABI_AllCallerSaveExcept(18),
ABI_AllCallerSaveExcept(19),
ABI_AllCallerSaveExcept(20),
ABI_AllCallerSaveExcept(21),
ABI_AllCallerSaveExcept(22),
ABI_AllCallerSaveExcept(23),
ABI_AllCallerSaveExcept(24),
ABI_AllCallerSaveExcept(25),
ABI_AllCallerSaveExcept(26),
ABI_AllCallerSaveExcept(27),
ABI_AllCallerSaveExcept(28),
ABI_AllCallerSaveExcept(29),
ABI_AllCallerSaveExcept(30),
ABI_AllCallerSaveExcept(31),
};
void ABI_PushCallerSaveRegistersAndAdjustStackExcept(BlockOfCode& code, HostLoc exception) {
std::vector<HostLoc> regs;
std::remove_copy(ABI_ALL_CALLER_SAVE.begin(), ABI_ALL_CALLER_SAVE.end(), std::back_inserter(regs), exception);
ABI_PushRegistersAndAdjustStack(code, 0, regs);
assert(size_t(exception) < 32);
ABI_PushRegistersAndAdjustStack(code, 0, reg_table[size_t(exception) % 32]);
}
void ABI_PopCallerSaveRegistersAndAdjustStackExcept(BlockOfCode& code, HostLoc exception) {
std::vector<HostLoc> regs;
std::remove_copy(ABI_ALL_CALLER_SAVE.begin(), ABI_ALL_CALLER_SAVE.end(), std::back_inserter(regs), exception);
ABI_PopRegistersAndAdjustStack(code, 0, regs);
assert(size_t(exception) < 32);
ABI_PopRegistersAndAdjustStack(code, 0, reg_table[size_t(exception) % 32]);
}
} // namespace Dynarmic::Backend::X64

View File

@@ -35,6 +35,7 @@ enum class OptimizationFlag : u32;
namespace Dynarmic::Backend::X64 {
class A64EmitX64;
class BlockOfCode;
using A64FullVectorWidth = std::integral_constant<size_t, 128>;
@@ -139,6 +140,9 @@ protected:
ExceptionHandler exception_handler;
ankerl::unordered_dense::map<IR::LocationDescriptor, BlockDescriptor> block_descriptors;
ankerl::unordered_dense::map<IR::LocationDescriptor, PatchInformation> patch_information;
// We need materialized protected members
friend class A64EmitX64;
};
} // namespace Dynarmic::Backend::X64

View File

@@ -52,19 +52,20 @@ public:
void EmitVerboseDebuggingOutput(BlockOfCode& code, size_t host_loc_index) const;
private:
//non trivial
std::vector<IR::Inst*> values;
//sometimes zeroed
size_t accumulated_uses = 0;
// Block state
size_t total_uses = 0;
// Value state
size_t max_bit_width = 0;
//always zeroed
// Current instruction state
size_t is_being_used_count = 0;
size_t current_references = 0;
bool is_scratch = false;
bool is_set_last_use = false;
// Block state
size_t current_references = 0;
size_t accumulated_uses = 0;
size_t total_uses = 0;
// Value state
std::vector<IR::Inst*> values;
size_t max_bit_width = 0;
};
struct Argument {

View File

@@ -71,9 +71,10 @@ DecodeTable<V> GetDecodeTable() {
template<typename V>
std::optional<std::reference_wrapper<const Matcher<V>>> Decode(u32 instruction) {
static const auto table = GetDecodeTable<V>();
const auto matches_instruction = [instruction](const auto& matcher) { return matcher.Matches(instruction); };
alignas(64) static const auto table = GetDecodeTable<V>();
const auto matches_instruction = [instruction](const auto& matcher) {
return matcher.Matches(instruction);
};
const auto& subtable = table[detail::ToFastLookupIndex(instruction)];
auto iter = std::find_if(subtable.begin(), subtable.end(), matches_instruction);

View File

@@ -150,13 +150,15 @@ private:
void Use(const Value& value);
void UndoUse(const Value& value);
Opcode op;
unsigned use_count = 0;
unsigned name = 0;
std::array<Value, max_arg_count> args;
// TODO: so much padding wasted with mcl::intrusive_node
// 16 + 1, 24
Opcode op; //2 (6)
// Linked list of pseudooperations associated with this instruction.
Inst* next_pseudoop = nullptr;
Inst* next_pseudoop = nullptr; //8 (14)
unsigned use_count = 0; //4 (0)
unsigned name = 0; //4 (4)
alignas(64) std::array<Value, max_arg_count> args; //16 * 4 = 64 (1 cache line)
};
static_assert(sizeof(Inst) == 128);
} // namespace Dynarmic::IR

View File

@@ -7,78 +7,6 @@ OPCODE(Identity, Opaque, Opaq
OPCODE(Breakpoint, Void, )
OPCODE(CallHostFunction, Void, U64, Opaque, Opaque, Opaque )
// A32 Context getters/setters
A32OPC(SetCheckBit, Void, U1 )
A32OPC(GetRegister, U32, A32Reg )
A32OPC(GetExtendedRegister32, U32, A32ExtReg )
A32OPC(GetExtendedRegister64, U64, A32ExtReg )
A32OPC(GetVector, U128, A32ExtReg )
A32OPC(SetRegister, Void, A32Reg, U32 )
A32OPC(SetExtendedRegister32, Void, A32ExtReg, U32 )
A32OPC(SetExtendedRegister64, Void, A32ExtReg, U64 )
A32OPC(SetVector, Void, A32ExtReg, U128 )
A32OPC(GetCpsr, U32, )
A32OPC(SetCpsr, Void, U32 )
A32OPC(SetCpsrNZCV, Void, NZCV )
A32OPC(SetCpsrNZCVRaw, Void, U32 )
A32OPC(SetCpsrNZCVQ, Void, U32 )
A32OPC(SetCpsrNZ, Void, NZCV )
A32OPC(SetCpsrNZC, Void, NZCV, U1 )
A32OPC(GetCFlag, U1, )
A32OPC(OrQFlag, Void, U1 )
A32OPC(GetGEFlags, U32, )
A32OPC(SetGEFlags, Void, U32 )
A32OPC(SetGEFlagsCompressed, Void, U32 )
A32OPC(BXWritePC, Void, U32 )
A32OPC(UpdateUpperLocationDescriptor, Void, )
A32OPC(CallSupervisor, Void, U32 )
A32OPC(ExceptionRaised, Void, U32, U64 )
A32OPC(DataSynchronizationBarrier, Void, )
A32OPC(DataMemoryBarrier, Void, )
A32OPC(InstructionSynchronizationBarrier, Void, )
A32OPC(GetFpscr, U32, )
A32OPC(SetFpscr, Void, U32, )
A32OPC(GetFpscrNZCV, U32, )
A32OPC(SetFpscrNZCV, Void, NZCV )
// A64 Context getters/setters
A64OPC(SetCheckBit, Void, U1 )
A64OPC(GetCFlag, U1, )
A64OPC(GetNZCVRaw, U32, )
A64OPC(SetNZCVRaw, Void, U32 )
A64OPC(SetNZCV, Void, NZCV )
A64OPC(GetW, U32, A64Reg )
A64OPC(GetX, U64, A64Reg )
A64OPC(GetS, U128, A64Vec )
A64OPC(GetD, U128, A64Vec )
A64OPC(GetQ, U128, A64Vec )
A64OPC(GetSP, U64, )
A64OPC(GetFPCR, U32, )
A64OPC(GetFPSR, U32, )
A64OPC(SetW, Void, A64Reg, U32 )
A64OPC(SetX, Void, A64Reg, U64 )
A64OPC(SetS, Void, A64Vec, U128 )
A64OPC(SetD, Void, A64Vec, U128 )
A64OPC(SetQ, Void, A64Vec, U128 )
A64OPC(SetSP, Void, U64 )
A64OPC(SetFPCR, Void, U32 )
A64OPC(SetFPSR, Void, U32 )
A64OPC(SetPC, Void, U64 )
A64OPC(CallSupervisor, Void, U32 )
A64OPC(ExceptionRaised, Void, U64, U64 )
A64OPC(DataCacheOperationRaised, Void, U64, U64, U64 )
A64OPC(InstructionCacheOperationRaised, Void, U64, U64 )
A64OPC(DataSynchronizationBarrier, Void, )
A64OPC(DataMemoryBarrier, Void, )
A64OPC(InstructionSynchronizationBarrier, Void, )
A64OPC(GetCNTFRQ, U32, )
A64OPC(GetCNTPCT, U64, )
A64OPC(GetCTR, U32, )
A64OPC(GetDCZID, U32, )
A64OPC(GetTPIDR, U64, )
A64OPC(GetTPIDRRO, U64, )
A64OPC(SetTPIDR, Void, U64 )
// Hints
OPCODE(PushRSB, Void, U64 )
@@ -716,6 +644,40 @@ OPCODE(FPVectorToUnsignedFixed16, U128, U128
OPCODE(FPVectorToUnsignedFixed32, U128, U128, U8, U8, U1 )
OPCODE(FPVectorToUnsignedFixed64, U128, U128, U8, U8, U1 )
// A32 Context getters/setters
A32OPC(SetCheckBit, Void, U1 )
A32OPC(GetRegister, U32, A32Reg )
A32OPC(GetExtendedRegister32, U32, A32ExtReg )
A32OPC(GetExtendedRegister64, U64, A32ExtReg )
A32OPC(GetVector, U128, A32ExtReg )
A32OPC(SetRegister, Void, A32Reg, U32 )
A32OPC(SetExtendedRegister32, Void, A32ExtReg, U32 )
A32OPC(SetExtendedRegister64, Void, A32ExtReg, U64 )
A32OPC(SetVector, Void, A32ExtReg, U128 )
A32OPC(GetCpsr, U32, )
A32OPC(SetCpsr, Void, U32 )
A32OPC(SetCpsrNZCV, Void, NZCV )
A32OPC(SetCpsrNZCVRaw, Void, U32 )
A32OPC(SetCpsrNZCVQ, Void, U32 )
A32OPC(SetCpsrNZ, Void, NZCV )
A32OPC(SetCpsrNZC, Void, NZCV, U1 )
A32OPC(GetCFlag, U1, )
A32OPC(OrQFlag, Void, U1 )
A32OPC(GetGEFlags, U32, )
A32OPC(SetGEFlags, Void, U32 )
A32OPC(SetGEFlagsCompressed, Void, U32 )
A32OPC(BXWritePC, Void, U32 )
A32OPC(UpdateUpperLocationDescriptor, Void, )
A32OPC(CallSupervisor, Void, U32 )
A32OPC(ExceptionRaised, Void, U32, U64 )
A32OPC(DataSynchronizationBarrier, Void, )
A32OPC(DataMemoryBarrier, Void, )
A32OPC(InstructionSynchronizationBarrier, Void, )
A32OPC(GetFpscr, U32, )
A32OPC(SetFpscr, Void, U32, )
A32OPC(GetFpscrNZCV, U32, )
A32OPC(SetFpscrNZCV, Void, NZCV )
// A32 Memory access
A32OPC(ClearExclusive, Void, )
A32OPC(ReadMemory8, U8, U64, U32, AccType )
@@ -735,6 +697,53 @@ A32OPC(ExclusiveWriteMemory16, U32, U64,
A32OPC(ExclusiveWriteMemory32, U32, U64, U32, U32, AccType )
A32OPC(ExclusiveWriteMemory64, U32, U64, U32, U64, AccType )
// Coprocessor
A32OPC(CoprocInternalOperation, Void, CoprocInfo )
A32OPC(CoprocSendOneWord, Void, CoprocInfo, U32 )
A32OPC(CoprocSendTwoWords, Void, CoprocInfo, U32, U32 )
A32OPC(CoprocGetOneWord, U32, CoprocInfo )
A32OPC(CoprocGetTwoWords, U64, CoprocInfo )
A32OPC(CoprocLoadWords, Void, CoprocInfo, U32 )
A32OPC(CoprocStoreWords, Void, CoprocInfo, U32 )
// A64 Context getters/setters
A64OPC(SetCheckBit, Void, U1 )
A64OPC(GetCFlag, U1, )
A64OPC(GetNZCVRaw, U32, )
A64OPC(SetNZCVRaw, Void, U32 )
A64OPC(SetNZCV, Void, NZCV )
A64OPC(GetW, U32, A64Reg )
A64OPC(GetX, U64, A64Reg )
A64OPC(GetS, U128, A64Vec )
A64OPC(GetD, U128, A64Vec )
A64OPC(GetQ, U128, A64Vec )
A64OPC(GetSP, U64, )
A64OPC(GetFPCR, U32, )
A64OPC(GetFPSR, U32, )
A64OPC(SetW, Void, A64Reg, U32 )
A64OPC(SetX, Void, A64Reg, U64 )
A64OPC(SetS, Void, A64Vec, U128 )
A64OPC(SetD, Void, A64Vec, U128 )
A64OPC(SetQ, Void, A64Vec, U128 )
A64OPC(SetSP, Void, U64 )
A64OPC(SetFPCR, Void, U32 )
A64OPC(SetFPSR, Void, U32 )
A64OPC(SetPC, Void, U64 )
A64OPC(CallSupervisor, Void, U32 )
A64OPC(ExceptionRaised, Void, U64, U64 )
A64OPC(DataCacheOperationRaised, Void, U64, U64, U64 )
A64OPC(InstructionCacheOperationRaised, Void, U64, U64 )
A64OPC(DataSynchronizationBarrier, Void, )
A64OPC(DataMemoryBarrier, Void, )
A64OPC(InstructionSynchronizationBarrier, Void, )
A64OPC(GetCNTFRQ, U32, )
A64OPC(GetCNTPCT, U64, )
A64OPC(GetCTR, U32, )
A64OPC(GetDCZID, U32, )
A64OPC(GetTPIDR, U64, )
A64OPC(GetTPIDRRO, U64, )
A64OPC(SetTPIDR, Void, U64 )
// A64 Memory access
A64OPC(ClearExclusive, Void, )
A64OPC(ReadMemory8, U8, U64, U64, AccType )
@@ -758,13 +767,4 @@ A64OPC(ExclusiveWriteMemory32, U32, U64,
A64OPC(ExclusiveWriteMemory64, U32, U64, U64, U64, AccType )
A64OPC(ExclusiveWriteMemory128, U32, U64, U64, U128, AccType )
// Coprocessor
A32OPC(CoprocInternalOperation, Void, CoprocInfo )
A32OPC(CoprocSendOneWord, Void, CoprocInfo, U32 )
A32OPC(CoprocSendTwoWords, Void, CoprocInfo, U32, U32 )
A32OPC(CoprocGetOneWord, U32, CoprocInfo )
A32OPC(CoprocGetTwoWords, U64, CoprocInfo )
A32OPC(CoprocLoadWords, Void, CoprocInfo, U32 )
A32OPC(CoprocStoreWords, Void, CoprocInfo, U32 )
// clang-format on