Mirror of https://github.com/FEX-Emu/FEX.git, synced 2025-01-31 11:32:07 +00:00
Arm64: Stop abusing orr in LoadConstant
The current implementation uses orr excessively. This has FEX missing hardware optimization opportunities: some CPU cores will zero-cycle constant moves that fit in the 16-bit immediate of movz/movk, but not the equivalent orr. First evaluate up front whether the number of non-zero 16-bit segments is greater than 1; only in that case check whether the constant is a bitmask immediate that can be materialized with a single orr. Past that point, 16-bit constant moves use movz. Additionally, this optimizes the case where a constant of zero is loaded into a `mov <reg>, zr`, which gets renamed on most hardware.
This commit is contained in:
parent 6cb0f52e94
commit 23fd79a3b3
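For illustration, here is a minimal standalone sketch of the selection logic this change introduces. It is not FEX's emitter API: LoadConstantPlan, its textual output, and the IsBitmaskImm flag (standing in for vixl's IsImmLogical check) are invented for this example, and the movn, 32-bit narrowing, ADRP/ADR, and NOP-padding paths of the real LoadConstant are omitted.

#include <cstdint>
#include <cstdio>
#include <string>
#include <vector>

// Sketch: decide which instruction sequence the new strategy would pick for a constant.
std::vector<std::string> LoadConstantPlan(uint64_t Constant, bool Is64Bit, bool IsBitmaskImm) {
  std::vector<std::string> Out;
  const int Segments = Is64Bit ? 4 : 2;
  char Buf[64];

  // Count the non-zero 16-bit segments up front.
  int RequiredMoveSegments = 0;
  for (int i = 0; i < Segments; ++i) {
    if ((Constant >> (i * 16)) & 0xFFFF) {
      ++RequiredMoveSegments;
    }
  }

  // Only fall back to a single orr from zr when more than one segment is set;
  // a single-segment constant is served by one renameable movz instead.
  if (RequiredMoveSegments > 1 && IsBitmaskImm) {
    Out.push_back("orr reg, zr, #<bitmask imm>");
    return Out;
  }

  // movz the first non-zero segment, then movk the remaining ones.
  int CurrentSegment = 0;
  for (; CurrentSegment < Segments; ++CurrentSegment) {
    const uint16_t Part = static_cast<uint16_t>((Constant >> (CurrentSegment * 16)) & 0xFFFF);
    if (Part) {
      std::snprintf(Buf, sizeof(Buf), "movz reg, #0x%x, lsl #%d", static_cast<unsigned>(Part), CurrentSegment * 16);
      Out.push_back(Buf);
      ++CurrentSegment;
      break;
    }
  }
  for (; CurrentSegment < Segments; ++CurrentSegment) {
    const uint16_t Part = static_cast<uint16_t>((Constant >> (CurrentSegment * 16)) & 0xFFFF);
    if (Part) {
      std::snprintf(Buf, sizeof(Buf), "movk reg, #0x%x, lsl #%d", static_cast<unsigned>(Part), CurrentSegment * 16);
      Out.push_back(Buf);
    }
  }

  // Nothing emitted means the constant was zero: special-case a single move.
  if (Out.empty()) {
    Out.push_back("movz reg, #0");
  }
  return Out;
}

int main() {
  for (uint64_t C : {UINT64_C(0), UINT64_C(0x10000), UINT64_C(0x0001000200030004)}) {
    std::printf("0x%016llx:\n", static_cast<unsigned long long>(C));
    for (const std::string& Insn : LoadConstantPlan(C, /*Is64Bit=*/true, /*IsBitmaskImm=*/false)) {
      std::printf("  %s\n", Insn.c_str());
    }
  }
  return 0;
}

Running the sketch prints the single-move zero case, the single movz for 0x10000, and a movz plus movk chain for a general 64-bit constant, mirroring the paths in the diff below.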
@@ -213,17 +213,6 @@ void Arm64Emitter::LoadConstant(ARMEmitter::Size s, ARMEmitter::Register Reg, ui
     Segments = 2;
   }
 
-  // If this can be loaded with a mov bitmask.
-  const auto IsImm = vixl::aarch64::Assembler::IsImmLogical(Constant, RegSizeInBits(s));
-  if (IsImm) {
-    orr(s, Reg, ARMEmitter::Reg::zr, Constant);
-    if (NOPPad) {
-      nop(); nop(); nop();
-    }
-    return;
-  }
-
-  int NumMoves = 1;
   int RequiredMoveSegments{};
 
   // Count the number of move segments
@@ -235,6 +224,20 @@ void Arm64Emitter::LoadConstant(ARMEmitter::Size s, ARMEmitter::Register Reg, ui
     }
   }
 
+  // If this can be loaded with a mov bitmask.
+  if (RequiredMoveSegments > 1) {
+    // Only try to use this path if the number of segments is > 1.
+    // `movz` is better than `orr` since hardware will rename or merge if possible when `movz` is used.
+    const auto IsImm = vixl::aarch64::Assembler::IsImmLogical(Constant, RegSizeInBits(s));
+    if (IsImm) {
+      orr(s, Reg, ARMEmitter::Reg::zr, Constant);
+      if (NOPPad) {
+        nop(); nop(); nop();
+      }
+      return;
+    }
+  }
+
   // ADRP+ADD is specifically optimized in hardware
   // Check if we can use this
   auto PC = GetCursorAddress<uint64_t>();
@@ -245,6 +248,8 @@ void Arm64Emitter::LoadConstant(ARMEmitter::Size s, ARMEmitter::Register Reg, ui
   // Offset from aligned PC
   int64_t AlignedOffset = static_cast<int64_t>(Constant) - static_cast<int64_t>(AlignedPC);
 
+  int NumMoves = 0;
+
   // If the aligned offset is within the 4GB window then we can use ADRP+ADD
   // and the number of move segments more than 1
   if (RequiredMoveSegments > 1 && vixl::IsInt32(AlignedOffset)) {
@@ -268,14 +273,30 @@ void Arm64Emitter::LoadConstant(ARMEmitter::Size s, ARMEmitter::Register Reg, ui
     }
   }
   else {
-    movz(s, Reg, (Constant) & 0xFFFF, 0);
-    for (int i = 1; i < Segments; ++i) {
-      uint16_t Part = (Constant >> (i * 16)) & 0xFFFF;
+    int CurrentSegment = 0;
+    for (; CurrentSegment < Segments; ++CurrentSegment) {
+      uint16_t Part = (Constant >> (CurrentSegment * 16)) & 0xFFFF;
       if (Part) {
-        movk(s, Reg, Part, i * 16);
+        movz(s, Reg, Part, CurrentSegment * 16);
+        ++CurrentSegment;
+        ++NumMoves;
+        break;
       }
     }
 
+    for (; CurrentSegment < Segments; ++CurrentSegment) {
+      uint16_t Part = (Constant >> (CurrentSegment * 16)) & 0xFFFF;
+      if (Part) {
+        movk(s, Reg, Part, CurrentSegment * 16);
+        ++NumMoves;
+      }
+    }
+
+    if (NumMoves == 0) {
+      // If we didn't move anything that means this is a zero move. Special case this.
+      movz(s, Reg, 0);
+      ++NumMoves;
+    }
   }
 
   if (NOPPad) {
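As an illustration of the new selection logic (not part of the commit): loading 0 now emits a single movz of #0, a zero idiom that most cores handle cheaply; 0x10000 becomes one movz with a 16-bit shift instead of an orr from zr; a pattern such as 0x5555555555555555 still takes the single-orr bitmask path because all four segments are populated; and a general 64-bit constant becomes one movz plus up to three movk instructions, with NOP padding appended afterwards when NOPPad is set.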