Arm64: Stop abusing orr in LoadConstant

The current implementation uses orr excessively. This causes FEX to miss
hardware optimization opportunities where some CPU cores will zero-cycle
move constants that fit into the 16-bit immediates of movz/movk.
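
As a rough sketch of the idea (illustrative code only, not the FEX emitter; the helper name and the example constant are made up), a value with at most one non-zero 16-bit segment can always be loaded with a single movz, whereas the old ordering could pick an orr against the zero register for it first:

#include <cstdint>

// Sketch only: true when the constant occupies at most one 16-bit segment
// and can therefore be loaded with a single movz.
//
// With the made-up constant 0xFF:
//   old path:  orr  x0, xzr, #0xff   ; 0xff is a valid bitmask immediate, so orr won
//   new path:  movz x0, #0xff        ; zero-cycle rename candidate on some cores
static bool FitsInSingleMovz(uint64_t Constant) {
  int NonZeroSegments = 0;
  for (int i = 0; i < 4; ++i) {
    if ((Constant >> (i * 16)) & 0xFFFF) {
      ++NonZeroSegments;
    }
  }
  return NonZeroSegments <= 1;
}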

First, evaluate up front whether the number of 16-bit segments is > 1; only
in those cases should we check whether the constant is a bitmask immediate
that can be loaded in one instruction with orr.
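
A minimal sketch of that ordering, assuming hypothetical stand-ins (IsLogicalImm, EmitOrrWithZeroReg, and EmitMovzMovkSequence are placeholder names; the real code uses vixl's IsImmLogical and the emitter calls shown in the diff below):

#include <cstdint>

bool IsLogicalImm(uint64_t Constant);          // placeholder for vixl::aarch64::Assembler::IsImmLogical
void EmitOrrWithZeroReg(uint64_t Constant);    // placeholder: orr <reg>, zr, #imm
void EmitMovzMovkSequence(uint64_t Constant);  // placeholder: movz followed by movk

void LoadConstantSketch(uint64_t Constant, int RequiredMoveSegments) {
  // Only reach for the bitmask-immediate orr when more than one 16-bit
  // segment is populated; single-segment values take the movz path.
  if (RequiredMoveSegments > 1 && IsLogicalImm(Constant)) {
    // e.g. 0x0000'FFFF'FFFF'0000 would need two movz/movk instructions but
    // is a valid bitmask immediate, so a single orr is shorter.
    EmitOrrWithZeroReg(Constant);
    return;
  }
  EmitMovzMovkSequence(Constant);
}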

After that point we will use movz for 16-bit constant moves.

Additionally, this optimizes the case where a constant of zero is loaded:
it becomes a single `mov <reg>, zr`, which gets renamed in most hardware.
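
A standalone sketch of the resulting movz/movk emission, including that zero special case (illustrative only; the register name and output formatting are made up, and the real emitter calls are the movz/movk helpers visible in the diff below):

#include <cstdint>
#include <cstdio>

// Print the instruction sequence the new logic would produce for a 64-bit
// constant: movz for the first non-zero 16-bit segment, movk for the rest,
// and a single clearing move when every segment is zero.
static void PrintMoveSequence(uint64_t Constant) {
  int NumMoves = 0;
  for (int i = 0; i < 4; ++i) {
    const unsigned Part = (Constant >> (i * 16)) & 0xFFFF;
    if (!Part) {
      continue;
    }
    std::printf("%s x0, #0x%x, lsl #%d\n", NumMoves == 0 ? "movz" : "movk", Part, i * 16);
    ++NumMoves;
  }
  if (NumMoves == 0) {
    // The constant was zero; still emit exactly one instruction to clear the register.
    std::printf("movz x0, #0\n");
  }
}

int main() {
  PrintMoveSequence(0);                      // movz x0, #0
  PrintMoveSequence(0x0001'0000'0000'4242);  // movz ... #0x4242, then movk ... #0x1, lsl #48
  return 0;
}
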
Ryan Houdek 2023-08-16 16:49:46 -07:00
parent 6cb0f52e94
commit 23fd79a3b3

@@ -213,17 +213,6 @@ void Arm64Emitter::LoadConstant(ARMEmitter::Size s, ARMEmitter::Register Reg, ui
Segments = 2;
}
// If this can be loaded with a mov bitmask.
const auto IsImm = vixl::aarch64::Assembler::IsImmLogical(Constant, RegSizeInBits(s));
if (IsImm) {
orr(s, Reg, ARMEmitter::Reg::zr, Constant);
if (NOPPad) {
nop(); nop(); nop();
}
return;
}
int NumMoves = 1;
int RequiredMoveSegments{};
// Count the number of move segments
@@ -235,6 +224,20 @@ void Arm64Emitter::LoadConstant(ARMEmitter::Size s, ARMEmitter::Register Reg, ui
}
}
// If this can be loaded with a mov bitmask.
if (RequiredMoveSegments > 1) {
// Only try to use this path if the number of segments is > 1.
// `movz` is better than `orr` since hardware will rename or merge if possible when `movz` is used.
const auto IsImm = vixl::aarch64::Assembler::IsImmLogical(Constant, RegSizeInBits(s));
if (IsImm) {
orr(s, Reg, ARMEmitter::Reg::zr, Constant);
if (NOPPad) {
nop(); nop(); nop();
}
return;
}
}
// ADRP+ADD is specifically optimized in hardware
// Check if we can use this
auto PC = GetCursorAddress<uint64_t>();
@@ -245,6 +248,8 @@ void Arm64Emitter::LoadConstant(ARMEmitter::Size s, ARMEmitter::Register Reg, ui
// Offset from aligned PC
int64_t AlignedOffset = static_cast<int64_t>(Constant) - static_cast<int64_t>(AlignedPC);
int NumMoves = 0;
// If the aligned offset is within the 4GB window and the number of move
// segments is more than 1, then we can use ADRP+ADD
if (RequiredMoveSegments > 1 && vixl::IsInt32(AlignedOffset)) {
@@ -268,14 +273,30 @@ void Arm64Emitter::LoadConstant(ARMEmitter::Size s, ARMEmitter::Register Reg, ui
}
}
else {
movz(s, Reg, (Constant) & 0xFFFF, 0);
for (int i = 1; i < Segments; ++i) {
uint16_t Part = (Constant >> (i * 16)) & 0xFFFF;
int CurrentSegment = 0;
for (; CurrentSegment < Segments; ++CurrentSegment) {
uint16_t Part = (Constant >> (CurrentSegment * 16)) & 0xFFFF;
if (Part) {
movk(s, Reg, Part, i * 16);
movz(s, Reg, Part, CurrentSegment * 16);
++CurrentSegment;
++NumMoves;
break;
}
}
for (; CurrentSegment < Segments; ++CurrentSegment) {
uint16_t Part = (Constant >> (CurrentSegment * 16)) & 0xFFFF;
if (Part) {
movk(s, Reg, Part, CurrentSegment * 16);
++NumMoves;
}
}
if (NumMoves == 0) {
// If we didn't move anything that means this is a zero move. Special case this.
movz(s, Reg, 0);
++NumMoves;
}
}
if (NOPPad) {