OpCodeDispatcher: Optimize a case of GOT calculation

32-bit GOT calculation needs to do a call+pop to get the EIP. LEA
doesn't work because there are no EIP-relative ops like there are on
x86-64.

This causes a terrible block split on every GOT calculation without the
optimization in place.

Now the block can continue through this weird GOT calculation.

This will be worthwhile for our 32-bit thunks where for some reason the
GOT calculation can't be removed. The GOT is calculated even though it
isn't used.
This commit is contained in:
Ryan Houdek 2022-12-09 09:15:41 -08:00
parent 2123868a42
commit a6b0181cd4
6 changed files with 88 additions and 26 deletions

View File

@ -856,22 +856,25 @@ namespace FEXCore::Context {
Thread->OpDispatcher->_ExitFunction(Thread->OpDispatcher->_EntrypointOffset(Block.Entry - GuestRIP, GPRSize));
}
// If we had a dispatch error then leave early
if (HadDispatchError) {
if (TotalInstructions == 0) {
// Couldn't handle any instruction in op dispatcher
Thread->OpDispatcher->ResetWorkingList();
return { nullptr, nullptr, 0, 0, 0, 0 };
}
else {
const uint8_t GPRSize = GetGPRSize();
const bool NeedsBlockEnd = (HadDispatchError && TotalInstructions > 0) ||
(Thread->OpDispatcher->NeedsBlockEnder() && i + 1 == InstsInBlock);
// We had some instructions. Early exit
Thread->OpDispatcher->_ExitFunction(Thread->OpDispatcher->_EntrypointOffset(Block.Entry + BlockInstructionsLength - GuestRIP, GPRSize));
break;
}
// If we had a dispatch error then leave early
if (HadDispatchError && TotalInstructions == 0) {
// Couldn't handle any instruction in op dispatcher
Thread->OpDispatcher->ResetWorkingList();
return { nullptr, nullptr, 0, 0, 0, 0 };
}
if (NeedsBlockEnd) {
const uint8_t GPRSize = GetGPRSize();
// We had some instructions. Early exit
Thread->OpDispatcher->_ExitFunction(Thread->OpDispatcher->_EntrypointOffset(Block.Entry + BlockInstructionsLength - GuestRIP, GPRSize));
break;
}
if (Thread->OpDispatcher->FinishOp(DecodedInfo->PC + DecodedInfo->InstSize, i + 1 == InstsInBlock)) {
break;
}

View File

@ -1127,6 +1127,35 @@ void Decoder::BranchTargetInMultiblockRange() {
}
}
// Reports whether decoding may carry on past the RIP-setting instruction
// currently held in DecodeInst rather than ending the block at it.
//
// Exactly one shape qualifies: a CALL with an immediate displacement (opcode
// 0xE8) whose destination is the address of the instruction directly after it.
// That is the call+pop idiom used for GOT calculation; the matching handling
// lives in the OpDispatcher implementation.
//
// FinalInstruction: when true the answer is always false.
// Returns true only for the call-to-next-instruction case described above.
bool Decoder::BranchTargetCanContinue(bool FinalInstruction) const {
  if (FinalInstruction) {
    return false;
  }

  const uint8_t GPRSize = CTX->GetGPRSize();

  // Anything other than a call with an immediate target never qualifies.
  if (DecodeInst->OP != 0xE8) {
    return false;
  }

  LOGMAN_THROW_A_FMT(DecodeInst->Src[0].IsLiteral(), "Had wrong operand type");

  // Address of the instruction that follows the call.
  const uint64_t FallthroughRIP = DecodeInst->PC + DecodeInst->InstSize;
  const uint64_t UnwrappedTarget = FallthroughRIP + DecodeInst->Src[0].Data.Literal.Value;

  // A 32-bit guest wraps any address that goes above 32 bits.
  const uint64_t CallTarget = (GPRSize == 4) ? (UnwrappedTarget & 0xFFFFFFFFU) : UnwrappedTarget;

  // A call that lands just after itself can be continued through.
  return CallTarget == FallthroughRIP;
}
const uint8_t *Decoder::AdjustAddrForSpecialRegion(uint8_t const* _InstStream, uint64_t EntryPoint, uint64_t RIP) {
constexpr uint64_t VSyscall_Base = 0xFFFF'FFFF'FF60'0000ULL;
constexpr uint64_t VSyscall_End = VSyscall_Base + 0x1000;
@ -1251,23 +1280,21 @@ void Decoder::DecodeInstructionsAtEntry(uint8_t const* _InstStream, uint64_t PC,
CanContinue = true;
}
bool FinalInstruction = DecodedSize >= CTX->Config.MaxInstPerBlock ||
DecodedSize >= DefaultDecodedBufferSize ||
TotalInstructions >= CTX->Config.MaxInstPerBlock;
if (DecodeInst->TableInfo->Flags & FEXCore::X86Tables::InstFlags::FLAGS_SETS_RIP) {
// If we have multiblock enabled
// If the branch target is within our multiblock range then we can keep going on
// We don't want to short circuit this since we want to calculate our ranges still
BranchTargetInMultiblockRange();
// Bypass branches if we can continue through them in some cases.
CanContinue |= BranchTargetCanContinue(FinalInstruction);
}
if (!CanContinue) {
break;
}
if (DecodedSize >= CTX->Config.MaxInstPerBlock ||
DecodedSize >= DefaultDecodedBufferSize) {
break;
}
if (TotalInstructions >= CTX->Config.MaxInstPerBlock) {
if (FinalInstruction || !CanContinue) {
break;
}

View File

@ -58,6 +58,7 @@ private:
bool DecodeInstruction(uint64_t PC);
void BranchTargetInMultiblockRange();
bool BranchTargetCanContinue(bool FinalInstruction) const;
uint8_t ReadByte();
uint8_t PeekByte(uint8_t Offset) const;

View File

@ -783,8 +783,17 @@ void OpDispatchBuilder::CALLOp(OpcodeArgs) {
_StoreMem(GPRClass, GPRSize, NewSP, ConstantPCReturn, GPRSize);
// Store the RIP
_ExitFunction(NewRIP); // If we get here then leave the function now
const uint64_t NextRIP = Op->PC + Op->InstSize;
LOGMAN_THROW_A_FMT(Op->Src[0].IsLiteral(), "Had wrong operand type");
const uint64_t TargetRIP = Op->PC + Op->InstSize + Op->Src[0].Data.Literal.Value;
if (NextRIP != TargetRIP) {
// Store the RIP
_ExitFunction(NewRIP); // If we get here then leave the function now
}
else {
NeedsBlockEnd = true;
}
}
void OpDispatchBuilder::CALLAbsoluteOp(OpcodeArgs) {

View File

@ -153,8 +153,9 @@ public:
OpDispatchBuilder(FEXCore::Utils::IntrusivePooledAllocator &Allocator);
void ResetWorkingList();
void ResetDecodeFailure() { DecodeFailure = false; }
void ResetDecodeFailure() { NeedsBlockEnd = DecodeFailure = false; }
bool HadDecodeFailure() const { return DecodeFailure; }
bool NeedsBlockEnder() const { return NeedsBlockEnd; }
void BeginFunction(uint64_t RIP, std::vector<FEXCore::Frontend::Decoder::DecodedBlocks> const *Blocks);
void Finalize();
@ -659,6 +660,7 @@ public:
bool HandledLock = false;
private:
bool DecodeFailure{false};
bool NeedsBlockEnd{false};
FEXCore::IR::IROp_IRHeader *Current_Header{};
OrderedNode *Current_HeaderNode{};

View File

@ -0,0 +1,20 @@
%ifdef CONFIG
{
"RegData": {
"RAX": "0x10011"
},
"Mode": "32BIT"
}
%endif
mov esp, 0xe0000010
; This is a common pattern in 32-bit PIE code.
; 32-bit GOT calculation needs a call+pop sequence to get the EIP.
; LEA doesn't work because there are no EIP-relative ops like there are on x86-64.
; The call below targets the instruction immediately following it.
call target
target:
; Pop the return address that the call pushed into eax.
pop eax
hlt