Merge pull request #3956 from alyssarosenzweig/opt/pop-return

small optimizations for returns
This commit is contained in:
Ryan Houdek 2024-08-15 03:30:23 -07:00 committed by GitHub
commit 933c65d805
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
5 changed files with 46 additions and 55 deletions

View File

@ -78,8 +78,12 @@ DEF_OP(ExitFunction) {
// L1 Cache
ldr(TMP1, STATE, offsetof(FEXCore::Core::CpuStateFrame, Pointers.Common.L1Pointer));
and_(ARMEmitter::Size::i64Bit, TMP4, RipReg, LookupCache::L1_ENTRIES_MASK);
add(TMP1, TMP1, TMP4, ARMEmitter::ShiftType::LSL, 4);
// Calculate (tmp1 + ((ripreg & L1_ENTRIES_MASK) << 4)) for the address
// arithmetic. ubfiz+add is marginally faster on Firestorm than
// and+add(shift). Same performance on Cortex.
static_assert(LookupCache::L1_ENTRIES_MASK == ((1u << 20) - 1));
ubfiz(ARMEmitter::Size::i64Bit, TMP4, RipReg, 4, 20);
add(TMP1, TMP1, TMP4);
// Note: sub+cbnz used over cmp+br to preserve flags.
ldp<ARMEmitter::IndexType::OFFSET>(TMP2, TMP1, TMP1, 0);

View File

@ -162,20 +162,16 @@ void OpDispatchBuilder::RETOp(OpcodeArgs) {
InvalidateDeferredFlags();
}
auto Constant = _Constant(GPRSize);
auto OldSP = LoadGPRRegister(X86State::REG_RSP);
auto NewRIP = _LoadMem(GPRClass, GPRSize, OldSP, GPRSize);
Ref SP = _RMWHandle(LoadGPRRegister(X86State::REG_RSP));
Ref NewRIP = Pop(GPRSize, SP);
Ref NewSP;
if (Op->OP == 0xC2) {
auto Offset = LoadSource(GPRClass, Op, Op->Src[0], Op->Flags);
NewSP = _Add(IR::SizeToOpSize(GPRSize), _Add(IR::SizeToOpSize(GPRSize), OldSP, Constant), Offset);
} else {
NewSP = _Add(IR::SizeToOpSize(GPRSize), OldSP, Constant);
SP = _Add(IR::SizeToOpSize(GPRSize), SP, Offset);
}
// Store the new stack pointer
StoreGPRRegister(X86State::REG_RSP, NewSP);
StoreGPRRegister(X86State::REG_RSP, SP);
// Store the new RIP
ExitFunction(NewRIP);
@ -201,36 +197,27 @@ void OpDispatchBuilder::IRETOp(OpcodeArgs) {
const uint8_t GPRSize = CTX->GetGPRSize();
auto Constant = _Constant(GPRSize);
auto SP = LoadGPRRegister(X86State::REG_RSP);
Ref SP = _RMWHandle(LoadGPRRegister(X86State::REG_RSP));
// RIP (64/32/16 bits)
auto NewRIP = _LoadMem(GPRClass, GPRSize, SP, GPRSize);
SP = _Add(IR::SizeToOpSize(GPRSize), SP, Constant);
auto NewRIP = Pop(GPRSize, SP);
// CS (lower 16 used)
auto NewSegmentCS = _LoadMem(GPRClass, GPRSize, SP, GPRSize);
auto NewSegmentCS = Pop(GPRSize, SP);
_StoreContext(2, GPRClass, NewSegmentCS, offsetof(FEXCore::Core::CPUState, cs_idx));
UpdatePrefixFromSegment(NewSegmentCS, FEXCore::X86Tables::DecodeFlags::FLAG_CS_PREFIX);
SP = _Add(IR::SizeToOpSize(GPRSize), SP, Constant);
// eflags (lower 16 used)
auto eflags = _LoadMem(GPRClass, GPRSize, SP, GPRSize);
SetPackedRFLAG(false, eflags);
SP = _Add(IR::SizeToOpSize(GPRSize), SP, Constant);
SetPackedRFLAG(false, Pop(GPRSize, SP));
if (CTX->Config.Is64BitMode) {
// RSP and SS are only popped in 64-bit mode or if this is a CPL mode jump!
// FEX doesn't support a CPL mode switch, so we don't need to worry about this on 32-bit.
StoreGPRRegister(X86State::REG_RSP, _LoadMem(GPRClass, GPRSize, SP, GPRSize));
StoreGPRRegister(X86State::REG_RSP, Pop(GPRSize, SP));
SP = _Add(IR::SizeToOpSize(GPRSize), SP, Constant);
// ss
auto NewSegmentSS = _LoadMem(GPRClass, GPRSize, SP, GPRSize);
auto NewSegmentSS = Pop(GPRSize, SP);
_StoreContext(2, GPRClass, NewSegmentSS, offsetof(FEXCore::Core::CPUState, ss_idx));
UpdatePrefixFromSegment(NewSegmentSS, FEXCore::X86Tables::DecodeFlags::FLAG_SS_PREFIX);
_Add(IR::SizeToOpSize(GPRSize), SP, Constant);
} else {
// Store the stack in 32-bit mode
StoreGPRRegister(X86State::REG_RSP, SP);

View File

@ -3161,8 +3161,8 @@
"bic w20, w22, w20",
"strb w20, [x28, #1298]",
"ldr x0, [x28, #2272]",
"and x3, x21, #0xfffff",
"add x0, x0, x3, lsl #4",
"ubfiz x3, x21, #4, #20",
"add x0, x0, x3",
"ldp x1, x0, [x0]"
]
},
@ -4187,8 +4187,8 @@
"bic w20, w23, w20",
"strb w20, [x28, #1298]",
"ldr x0, [x28, #2272]",
"and x3, x22, #0xfffff",
"add x0, x0, x3, lsl #4",
"ubfiz x3, x22, #4, #20",
"add x0, x0, x3",
"ldp x1, x0, [x0]"
]
},
@ -6327,8 +6327,8 @@
"add w21, w20, w21",
"str w20, [x8, #-4]!",
"ldr x0, [x28, #2272]",
"and x3, x21, #0xfffff",
"add x0, x0, x3, lsl #4",
"ubfiz x3, x21, #4, #20",
"add x0, x0, x3",
"ldp x1, x0, [x0]"
]
},
@ -6613,8 +6613,8 @@
"add w21, w20, w21",
"str w20, [x8, #-4]!",
"ldr x0, [x28, #2272]",
"and x3, x21, #0xfffff",
"add x0, x0, x3, lsl #4",
"ubfiz x3, x21, #4, #20",
"add x0, x0, x3",
"ldp x1, x0, [x0]"
]
},
@ -6771,8 +6771,8 @@
"add w21, w20, w21",
"str w20, [x8, #-4]!",
"ldr x0, [x28, #2272]",
"and x3, x21, #0xfffff",
"add x0, x0, x3, lsl #4",
"ubfiz x3, x21, #4, #20",
"add x0, x0, x3",
"ldp x1, x0, [x0]"
]
}

View File

@ -56171,8 +56171,8 @@
"bic w20, w23, w20",
"strb w20, [x28, #1298]",
"ldr x0, [x28, #2272]",
"and x3, x21, #0xfffff",
"add x0, x0, x3, lsl #4",
"ubfiz x3, x21, #4, #20",
"add x0, x0, x3",
"ldp x1, x0, [x0]"
]
},
@ -58291,8 +58291,8 @@
"bic w20, w22, w20",
"strb w20, [x28, #1298]",
"ldr x0, [x28, #2272]",
"and x3, x21, #0xfffff",
"add x0, x0, x3, lsl #4",
"ubfiz x3, x21, #4, #20",
"add x0, x0, x3",
"ldp x1, x0, [x0]"
]
},
@ -58949,8 +58949,8 @@
"bic w20, w22, w20",
"strb w20, [x28, #1298]",
"ldr x0, [x28, #2272]",
"and x3, x21, #0xfffff",
"add x0, x0, x3, lsl #4",
"ubfiz x3, x21, #4, #20",
"add x0, x0, x3",
"ldp x1, x0, [x0]"
]
},
@ -59329,8 +59329,8 @@
"bic w20, w22, w20",
"strb w20, [x28, #1298]",
"ldr x0, [x28, #2272]",
"and x3, x21, #0xfffff",
"add x0, x0, x3, lsl #4",
"ubfiz x3, x21, #4, #20",
"add x0, x0, x3",
"ldp x1, x0, [x0]"
]
},
@ -74263,8 +74263,8 @@
"orr w20, w23, w20",
"strb w20, [x28, #1298]",
"ldr x0, [x28, #2272]",
"and x3, x21, #0xfffff",
"add x0, x0, x3, lsl #4",
"ubfiz x3, x21, #4, #20",
"add x0, x0, x3",
"ldp x1, x0, [x0]"
]
},
@ -74926,8 +74926,8 @@
"add w21, w20, w21",
"str w20, [x8, #-4]!",
"ldr x0, [x28, #2272]",
"and x3, x21, #0xfffff",
"add x0, x0, x3, lsl #4",
"ubfiz x3, x21, #4, #20",
"add x0, x0, x3",
"ldp x1, x0, [x0]"
]
}

View File

@ -32172,8 +32172,8 @@
"str w20, [x8, #-4]!",
"cfinv",
"ldr x0, [x28, #2272]",
"and x3, x21, #0xfffff",
"add x0, x0, x3, lsl #4",
"ubfiz x3, x21, #4, #20",
"add x0, x0, x3",
"ldp x1, x0, [x0]"
]
},
@ -65419,8 +65419,8 @@
"str w20, [x8, #-4]!",
"strb wzr, [x28, #1298]",
"ldr x0, [x28, #2272]",
"and x3, x21, #0xfffff",
"add x0, x0, x3, lsl #4",
"ubfiz x3, x21, #4, #20",
"add x0, x0, x3",
"ldp x1, x0, [x0]"
]
},
@ -73416,8 +73416,8 @@
"str w20, [x8, #-4]!",
"strb wzr, [x28, #1298]",
"ldr x0, [x28, #2272]",
"and x3, x21, #0xfffff",
"add x0, x0, x3, lsl #4",
"ubfiz x3, x21, #4, #20",
"add x0, x0, x3",
"ldp x1, x0, [x0]"
]
},
@ -97572,8 +97572,8 @@
"bic w20, w22, w20",
"strb w20, [x28, #1298]",
"ldr x0, [x28, #2272]",
"and x3, x21, #0xfffff",
"add x0, x0, x3, lsl #4",
"ubfiz x3, x21, #4, #20",
"add x0, x0, x3",
"ldp x1, x0, [x0]"
]
}