Merge pull request #9241 from hrydgard/arm64-abi

Follow the ARM64 ABI better (update the frame pointer).
Henrik Rydgård 2017-01-27 20:11:04 +01:00 committed by GitHub
commit f28fec3fa5
6 changed files with 115 additions and 414 deletions
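For context on "update the frame pointer": AAPCS64 expects X29 to point at a frame record, i.e. the saved FP/LR pair at the base of the current frame, which is what lets debuggers and sampling profilers unwind through JIT-generated code. A standalone sketch of that chain with made-up addresses (illustration only, not PPSSPP code):

#include <cstdint>
#include <cstdio>

// Hypothetical model of an AAPCS64 frame record: each frame stores the
// caller's FP and LR at its base, and X29 points at that pair.
struct FrameRecord {
    const FrameRecord *prev_fp;  // saved X29 (caller's frame record)
    uint64_t lr;                 // saved X30 (return address)
};

// Walk the chain the way a profiler or debugger unwinder would.
static void Backtrace(const FrameRecord *fp) {
    while (fp) {
        printf("return address: 0x%llx\n", (unsigned long long)fp->lr);
        fp = fp->prev_fp;
    }
}

int main() {
    FrameRecord outer{nullptr, 0x1000};
    FrameRecord inner{&outer, 0x2000};
    Backtrace(&inner);  // prints 0x2000, then 0x1000
    return 0;
}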

@@ -1913,17 +1913,25 @@ inline int64_t abs64(int64_t x) {
return x >= 0 ? x : -x;
}
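// Counts how many entries of a 4-element bool array are set; used below by MOVI2R for its 16-bit parts.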
int Count(bool part[4]) {
int cnt = 0;
for (int i = 0; i < 4; i++) {
if (part[i])
cnt++;
}
return cnt;
}
// Wrapper around MOVZ+MOVK (and later MOVN)
void ARM64XEmitter::MOVI2R(ARM64Reg Rd, u64 imm, bool optimize)
{
unsigned int parts = Is64Bit(Rd) ? 4 : 2;
BitSet32 upload_part(0);
bool upload_part[4];
// Always start with a movz! Kills the dependency on the register.
bool use_movz = true;
if (!imm)
{
if (!imm) {
// Zero immediate, just clear the register. EOR is pointless when we have MOVZ, which looks clearer in disasm too.
MOVZ(Rd, 0, SHIFT_0);
return;
@@ -1961,7 +1969,7 @@ void ARM64XEmitter::MOVI2R(ARM64Reg Rd, u64 imm, bool optimize)
u64 aligned_pc = (u64)GetCodePointer() & ~0xFFF;
s64 aligned_offset = (s64)imm - (s64)aligned_pc;
if (upload_part.Count() > 1 && abs64(aligned_offset) < 0xFFFFFFFFLL)
if (Count(upload_part) > 1 && abs64(aligned_offset) < 0xFFFFFFFFLL)
{
// The immediate we are loading is within 4GB of our aligned range
// Most likely an address that we can load in one or two instructions
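The hunk above is the part of MOVI2R that prefers an ADRP-based sequence when a multi-part immediate is an address near the emitted code. A standalone sketch of that decision, with made-up values:

#include <cstdint>
#include <cstdio>
#include <cstdlib>

int main() {
    uint64_t imm = 0x0000000112345678ULL;  // hypothetical target address
    uint64_t pc  = 0x0000000100400000ULL;  // hypothetical code pointer

    // How many MOVZ/MOVK instructions would a plain materialization need?
    int parts = 0;
    for (int i = 0; i < 4; i++) {
        if ((imm >> (i * 16)) & 0xFFFF)
            parts++;
    }

    // ADRP addresses 4KB pages and reaches roughly +/-4GB from the current page.
    int64_t aligned_offset = (int64_t)imm - (int64_t)(pc & ~(uint64_t)0xFFF);
    bool adrp_candidate = parts > 1 && llabs(aligned_offset) < 0xFFFFFFFFLL;

    printf("MOVZ/MOVK parts: %d, ADRP-based load viable: %s\n",
           parts, adrp_candidate ? "yes" : "no");
    return 0;
}
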
@@ -2015,115 +2023,11 @@ void ARM64XEmitter::POP(ARM64Reg Rd) {
void ARM64XEmitter::PUSH2(ARM64Reg Rd, ARM64Reg Rn) {
STP(INDEX_PRE, Rd, Rn, SP, -16);
}
void ARM64XEmitter::POP2(ARM64Reg Rd, ARM64Reg Rn) {
LDP(INDEX_POST, Rd, Rn, SP, 16);
}
void ARM64XEmitter::ABI_PushRegisters(BitSet32 registers)
{
int num_regs = registers.Count();
if (num_regs % 2)
{
bool first = true;
// Stack is required to be quad-word aligned.
u32 stack_size = ROUND_UP(num_regs * 8, 16);
u32 current_offset = 0;
std::vector<ARM64Reg> reg_pair;
for (auto it : registers)
{
if (first)
{
STR(INDEX_PRE, (ARM64Reg)(X0 + it), SP, -(s32)stack_size);
first = false;
current_offset += 16;
}
else
{
reg_pair.push_back((ARM64Reg)(X0 + it));
if (reg_pair.size() == 2)
{
STP(INDEX_UNSIGNED, reg_pair[0], reg_pair[1], SP, current_offset);
reg_pair.clear();
current_offset += 16;
}
}
}
}
else
{
std::vector<ARM64Reg> reg_pair;
for (auto it : registers)
{
reg_pair.push_back((ARM64Reg)(X0 + it));
if (reg_pair.size() == 2)
{
STP(INDEX_PRE, reg_pair[0], reg_pair[1], SP, -16);
reg_pair.clear();
}
}
}
}
void ARM64XEmitter::ABI_PopRegisters(BitSet32 registers, BitSet32 ignore_mask)
{
int num_regs = registers.Count();
if (num_regs % 2)
{
bool first = true;
std::vector<ARM64Reg> reg_pair;
for (auto it : registers)
{
if (ignore_mask[it])
it = WSP;
if (first)
{
LDR(INDEX_POST, (ARM64Reg)(X0 + it), SP, 16);
first = false;
}
else
{
reg_pair.push_back((ARM64Reg)(X0 + it));
if (reg_pair.size() == 2)
{
LDP(INDEX_POST, reg_pair[0], reg_pair[1], SP, 16);
reg_pair.clear();
}
}
}
}
else
{
std::vector<ARM64Reg> reg_pair;
for (int i = 31; i >= 0; --i)
{
if (!registers[i])
continue;
int reg = i;
if (ignore_mask[reg])
reg = WSP;
reg_pair.push_back((ARM64Reg)(X0 + reg));
if (reg_pair.size() == 2)
{
LDP(INDEX_POST, reg_pair[1], reg_pair[0], SP, 16);
reg_pair.clear();
}
}
}
}
// Float Emitter
void ARM64FloatEmitter::EmitLoadStoreImmediate(u8 size, u32 opc, IndexType type, ARM64Reg Rt, ARM64Reg Rn, s32 imm)
{
@@ -3658,161 +3562,95 @@ void ARM64FloatEmitter::FMLA(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, u8
EmitVectorxElement(0, 2 | (size >> 6), L, 1, H, Rd, Rn, Rm);
}
void ARM64FloatEmitter::ABI_PushRegisters(BitSet32 registers, ARM64Reg tmp)
{
bool bundled_loadstore = false;
void ARM64FloatEmitter::ABI_PushRegisters(uint32_t registers, uint32_t fp_registers) {
_assert_msg_(DYNA_REC, (registers & 0x60000000) == 0, "ABI_PushRegisters: Do not include FP and LR, those are handled unconditionally");
for (int i = 0; i < 32; ++i)
{
if (!registers[i])
continue;
int count = 0;
while (++count < 4 && (i + count) < 32 && registers[i + count]) {}
if (count > 1)
{
bundled_loadstore = true;
break;
}
ARM64Reg gprs[32]{}, fprs[32]{};
int num_gprs = 0, num_fprs = 0;
for (int i = 0; i < 29; i++) {
if (registers & (1U << i))
gprs[num_gprs++] = (ARM64Reg)(X0 + i);
}
if (bundled_loadstore && tmp != INVALID_REG)
{
int num_regs = registers.Count();
m_emit->SUB(SP, SP, num_regs * 16);
m_emit->ADD(tmp, SP, 0);
std::vector<ARM64Reg> island_regs;
for (int i = 0; i < 32; ++i)
{
if (!registers[i])
continue;
int count = 0;
// 0 = true
// 1 < 4 && registers[i + 1] true!
// 2 < 4 && registers[i + 2] true!
// 3 < 4 && registers[i + 3] true!
// 4 < 4 && registers[i + 4] false!
while (++count < 4 && (i + count) < 32 && registers[i + count]) {}
if (count == 1)
island_regs.push_back((ARM64Reg)(Q0 + i));
else
ST1(64, count, INDEX_POST, (ARM64Reg)(Q0 + i), tmp);
i += count - 1;
}
// Handle island registers
std::vector<ARM64Reg> pair_regs;
for (auto& it : island_regs)
{
pair_regs.push_back(it);
if (pair_regs.size() == 2)
{
STP(128, INDEX_POST, pair_regs[0], pair_regs[1], tmp, 32);
pair_regs.clear();
}
}
if (pair_regs.size())
STR(128, INDEX_POST, pair_regs[0], tmp, 16);
for (int i = 0; i < 32; i++) {
if (fp_registers & (1U << i))
fprs[num_fprs++] = (ARM64Reg)(D0 + i);
}
else
{
std::vector<ARM64Reg> pair_regs;
for (auto it : registers)
{
pair_regs.push_back((ARM64Reg)(Q0 + it));
if (pair_regs.size() == 2)
{
STP(128, INDEX_PRE, pair_regs[0], pair_regs[1], SP, -32);
pair_regs.clear();
}
}
if (pair_regs.size())
STR(128, INDEX_PRE, pair_regs[0], SP, -16);
u32 stack_size = 16 + ROUND_UP(num_gprs * 8, 16) + ROUND_UP(num_fprs * 8, 16);
// Stack is required to be quad-word aligned.
if (stack_size < 256) {
m_emit->STP(INDEX_PRE, FP, LR, SP, -(s32)stack_size);
} else {
m_emit->SUB(SP, SP, stack_size);
m_emit->STP(INDEX_UNSIGNED, FP, LR, SP, 0);
}
m_emit->MOVfromSP(X29); // Set new frame pointer
int offset = 16;
for (int i = 0; i < num_gprs / 2; i++) {
m_emit->STP(INDEX_SIGNED, gprs[i*2], gprs[i*2+1], X29, offset);
offset += 16;
}
if (num_gprs & 1) {
m_emit->STR(INDEX_UNSIGNED, gprs[num_gprs - 1], X29, offset);
offset += 16;
}
for (int i = 0; i < num_fprs / 2; i++) {
STP(64, INDEX_SIGNED, fprs[i * 2], fprs[i * 2 + 1], SP, offset);
offset += 16;
}
if (num_fprs & 1) {
STR(64, INDEX_UNSIGNED, fprs[num_fprs - 1], X29, offset);
offset += 16;
}
// Now offset should be == stack_size.
}
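Layout-wise, the new push helper places the FP/LR frame record at SP+0, the saved GPR pairs above it, and the FPR pairs after those. A quick standalone check of the frame size this yields for the callee-saved sets used later in this PR (RoundUp16 stands in for the ROUND_UP macro):

#include <bitset>
#include <cstdint>
#include <cstdio>

// Stand-in for the ROUND_UP macro used above.
static uint32_t RoundUp16(uint32_t x) { return (x + 15) & ~15u; }

int main() {
    const uint32_t gpr_mask = 0x1FF80000;   // X19-X28 (ALL_CALLEE_SAVED)
    const uint32_t fpr_mask = 0x0000FF00;   // D8-D15 (ALL_CALLEE_SAVED_FP)
    int num_gprs = (int)std::bitset<32>(gpr_mask).count();
    int num_fprs = (int)std::bitset<32>(fpr_mask).count();

    // 16 bytes for the FP/LR frame record at SP+0, then the GPR pairs,
    // then the FPR pairs, each group padded to keep 16-byte alignment.
    uint32_t stack_size = 16 + RoundUp16(num_gprs * 8) + RoundUp16(num_fprs * 8);
    printf("GPRs: %d, FPRs: %d, frame: %u bytes\n", num_gprs, num_fprs, stack_size);
    // Expected output: GPRs: 10, FPRs: 8, frame: 160 bytes
    return 0;
}
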
void ARM64FloatEmitter::ABI_PopRegisters(BitSet32 registers, ARM64Reg tmp)
{
bool bundled_loadstore = false;
int num_regs = registers.Count();
for (int i = 0; i < 32; ++i)
{
if (!registers[i])
continue;
int count = 0;
while (++count < 4 && (i + count) < 32 && registers[i + count]) {}
if (count > 1)
{
bundled_loadstore = true;
break;
}
void ARM64FloatEmitter::ABI_PopRegisters(uint32_t registers, uint32_t fp_registers) {
ARM64Reg gprs[32]{}, fprs[32]{};
int num_gprs = 0, num_fprs = 0;
for (int i = 0; i < 29; i++) {
if (registers & (1U << i))
gprs[num_gprs++] = (ARM64Reg)(X0 + i);
}
if (bundled_loadstore && tmp != INVALID_REG)
{
// The temporary register is only used to indicate that we can use this code path
std::vector<ARM64Reg> island_regs;
for (int i = 0; i < 32; ++i)
{
if (!registers[i])
continue;
int count = 0;
while (++count < 4 && (i + count) < 32 && registers[i + count]) {}
if (count == 1)
island_regs.push_back((ARM64Reg)(Q0 + i));
else
LD1(64, count, INDEX_POST, (ARM64Reg)(Q0 + i), SP);
i += count - 1;
}
// Handle island registers
std::vector<ARM64Reg> pair_regs;
for (auto& it : island_regs)
{
pair_regs.push_back(it);
if (pair_regs.size() == 2)
{
LDP(128, INDEX_POST, pair_regs[0], pair_regs[1], SP, 32);
pair_regs.clear();
}
}
if (pair_regs.size())
LDR(128, INDEX_POST, pair_regs[0], SP, 16);
for (int i = 0; i < 32; i++) {
if (fp_registers & (1U << i))
fprs[num_fprs++] = (ARM64Reg)(D0 + i);
}
else
{
bool odd = (num_regs % 2) != 0;
std::vector<ARM64Reg> pair_regs;
for (int i = 31; i >= 0; --i)
{
if (!registers[i])
continue;
if (odd)
{
// First load must be a regular LDR if odd
odd = false;
LDR(128, INDEX_POST, (ARM64Reg)(Q0 + i), SP, 16);
}
else
{
pair_regs.push_back((ARM64Reg)(Q0 + i));
if (pair_regs.size() == 2)
{
LDP(128, INDEX_POST, pair_regs[1], pair_regs[0], SP, 32);
pair_regs.clear();
}
}
}
u32 stack_size = 16 + ROUND_UP(num_gprs * 8, 16) + ROUND_UP(num_fprs * 8, 16);
// SP points to the bottom. We're gonna walk it upwards.
// Reload FP, LR.
m_emit->LDP(INDEX_SIGNED, FP, LR, SP, 0);
int offset = 16;
for (int i = 0; i < num_gprs / 2; i++) {
m_emit->LDP(INDEX_SIGNED, gprs[i*2], gprs[i*2+1], SP, offset);
offset += 16;
}
// Do the straggler.
if (num_gprs & 1) {
m_emit->LDR(INDEX_UNSIGNED, gprs[num_gprs-1], SP, offset);
offset += 16;
}
// Time for the FP regs.
for (int i = 0; i < num_fprs / 2; i++) {
LDP(64, INDEX_SIGNED, fprs[i * 2], fprs[i * 2 + 1], SP, offset);
offset += 16;
}
// Do the straggler.
if (num_fprs & 1) {
LDR(64, INDEX_UNSIGNED, fprs[num_fprs-1], SP, offset);
offset += 16;
}
// Now offset should be == stack_size.
// Restore the stack pointer.
m_emit->ADD(SP, SP, stack_size);
}
void ARM64XEmitter::ANDI2R(ARM64Reg Rd, ARM64Reg Rn, u64 imm, ARM64Reg scratch) {

@@ -85,12 +85,14 @@ enum ARM64Reg
WZR = WSP,
ZR = SP,
FP = X29,
LR = X30,
INVALID_REG = 0xFFFFFFFF
};
// R19-R28, R29 (FP), R30 (LR). FP seems questionable?
const u32 ALL_CALLEE_SAVED = 0x7FF80000;
// R19-R28. R29 (FP), R30 (LR) are always saved and FP updated appropriately.
const u32 ALL_CALLEE_SAVED = 0x1FF80000;
const u32 ALL_CALLEE_SAVED_FP = 0x0000FF00; // d8-d15
inline bool Is64Bit(ARM64Reg reg) { return (reg & 0x20) != 0; }
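Because the push/pop helpers now store FP (X29) and LR (X30) themselves, the GPR mask drops bits 29 and 30. A standalone sketch that enumerates what the two masks cover:

#include <cstdint>
#include <cstdio>

int main() {
    const uint32_t ALL_CALLEE_SAVED = 0x1FF80000;     // expected: X19..X28
    const uint32_t ALL_CALLEE_SAVED_FP = 0x0000FF00;  // expected: D8..D15

    printf("GPRs:");
    for (int i = 0; i < 32; i++)
        if (ALL_CALLEE_SAVED & (1u << i))
            printf(" X%d", i);
    printf("\nFPRs:");
    for (int i = 0; i < 32; i++)
        if (ALL_CALLEE_SAVED_FP & (1u << i))
            printf(" D%d", i);
    printf("\n");
    return 0;
}
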
@@ -720,10 +722,6 @@ public:
bool TryORRI2R(ARM64Reg Rd, ARM64Reg Rn, u32 imm);
bool TryEORI2R(ARM64Reg Rd, ARM64Reg Rn, u32 imm);
// ABI related
void ABI_PushRegisters(BitSet32 registers);
void ABI_PopRegisters(BitSet32 registers, BitSet32 ignore_mask = BitSet32(0));
// Pseudo-instruction for convenience. PUSH pushes 16 bytes even though we only push a single register.
// This is so the stack pointer is always 16-byte aligned, which is checked by hardware!
void PUSH(ARM64Reg Rd);
@@ -943,8 +941,8 @@ public:
void MOVI2FDUP(ARM64Reg Rd, float value, ARM64Reg scratch = INVALID_REG);
// ABI related
void ABI_PushRegisters(BitSet32 registers, ARM64Reg tmp = INVALID_REG);
void ABI_PopRegisters(BitSet32 registers, ARM64Reg tmp = INVALID_REG);
void ABI_PushRegisters(uint32_t gpr_registers, uint32_t fp_registers);
void ABI_PopRegisters(uint32_t gpr_registers, uint32_t fp_registers);
private:
ARM64XEmitter* m_emit;

@@ -3,8 +3,6 @@
#pragma once
#include <cstddef>
#include <initializer_list>
#include <type_traits>
#include "CommonTypes.h"
// Helper functions:
@@ -12,8 +10,7 @@
#ifdef _WIN32
#include <intrin.h>
template <typename T>
static inline int CountSetBits(T v)
{
inline int CountSetBits(T v) {
// from https://graphics.stanford.edu/~seander/bithacks.html
// GCC has this built in, but MSVC's intrinsic will only emit the actual
// POPCNT instruction, which we're not depending on
@@ -22,14 +19,14 @@ static inline int CountSetBits(T v)
v = (v + (v >> 4)) & (T)~(T)0/255*15;
return (T)(v * ((T)~(T)0/255)) >> (sizeof(T) - 1) * 8;
}
static inline int LeastSignificantSetBit(u32 val)
inline int LeastSignificantSetBit(u32 val)
{
unsigned long index;
_BitScanForward(&index, val);
return (int)index;
}
#ifdef _M_X64
static inline int LeastSignificantSetBit(u64 val)
inline int LeastSignificantSetBit(u64 val)
{
unsigned long index;
_BitScanForward64(&index, val);
@@ -37,134 +34,8 @@ static inline int LeastSignificantSetBit(u64 val)
}
#endif
#else
static inline int CountSetBits(u32 val) { return __builtin_popcount(val); }
static inline int CountSetBits(u64 val) { return __builtin_popcountll(val); }
static inline int LeastSignificantSetBit(u32 val) { return __builtin_ctz(val); }
static inline int LeastSignificantSetBit(u64 val) { return __builtin_ctzll(val); }
inline int CountSetBits(u32 val) { return __builtin_popcount(val); }
inline int CountSetBits(u64 val) { return __builtin_popcountll(val); }
inline int LeastSignificantSetBit(u32 val) { return __builtin_ctz(val); }
inline int LeastSignificantSetBit(u64 val) { return __builtin_ctzll(val); }
#endif
// namespace avoids conflict with OS X Carbon; don't use BitSet<T> directly
namespace BS
{
// Similar to std::bitset, this is a class which encapsulates a bitset, i.e.
// using the set bits of an integer to represent a set of integers. Like that
// class, it acts like an array of bools:
// BitSet32 bs;
// bs[1] = true;
// but also like the underlying integer ([0] = least significant bit):
// BitSet32 bs2 = ...;
// bs = (bs ^ bs2) & BitSet32(0xffff);
// The following additional functionality is provided:
// - Construction using an initializer list.
// BitSet bs { 1, 2, 4, 8 };
// - Efficiently iterating through the set bits:
// for (int i : bs)
// [i is the *index* of a set bit]
// (This uses the appropriate CPU instruction to find the next set bit in one
// operation.)
// - Counting set bits using .Count() - see comment on that method.
// TODO: use constexpr when MSVC gets out of the Dark Ages
template <typename IntTy>
class BitSet
{
static_assert(!std::is_signed<IntTy>::value, "BitSet should not be used with signed types");
public:
// A reference to a particular bit, returned from operator[].
class Ref
{
public:
Ref(Ref&& other) : m_bs(other.m_bs), m_mask(other.m_mask) {}
Ref(BitSet* bs, IntTy mask) : m_bs(bs), m_mask(mask) {}
operator bool() const { return (m_bs->m_val & m_mask) != 0; }
bool operator=(bool set)
{
m_bs->m_val = (m_bs->m_val & ~m_mask) | (set ? m_mask : 0);
return set;
}
private:
BitSet* m_bs;
IntTy m_mask;
};
// A STL-like iterator is required to be able to use range-based for loops.
class Iterator
{
public:
Iterator(const Iterator& other) : m_val(other.m_val), m_bit(other.m_bit) {}
Iterator(IntTy val, int bit) : m_val(val), m_bit(bit) {}
Iterator& operator=(Iterator other) { new (this) Iterator(other); return *this; }
int operator*() { return m_bit; }
Iterator& operator++()
{
if (m_val == 0)
{
m_bit = -1;
}
else
{
int bit = LeastSignificantSetBit(m_val);
m_val &= ~(1 << bit);
m_bit = bit;
}
return *this;
}
Iterator operator++(int _)
{
Iterator other(*this);
++*this;
return other;
}
bool operator==(Iterator other) const { return m_bit == other.m_bit; }
bool operator!=(Iterator other) const { return m_bit != other.m_bit; }
private:
IntTy m_val;
int m_bit;
};
BitSet() : m_val(0) {}
explicit BitSet(IntTy val) : m_val(val) {}
BitSet(std::initializer_list<int> init)
{
m_val = 0;
for (int bit : init)
m_val |= (IntTy)1 << bit;
}
static BitSet AllTrue(size_t count)
{
return BitSet(count == sizeof(IntTy)*8 ? ~(IntTy)0 : (((IntTy)1 << count) - 1));
}
Ref operator[](size_t bit) { return Ref(this, (IntTy)1 << bit); }
const Ref operator[](size_t bit) const { return (*const_cast<BitSet*>(this))[bit]; }
bool operator==(BitSet other) const { return m_val == other.m_val; }
bool operator!=(BitSet other) const { return m_val != other.m_val; }
BitSet operator|(BitSet other) const { return BitSet(m_val | other.m_val); }
BitSet operator&(BitSet other) const { return BitSet(m_val & other.m_val); }
BitSet operator^(BitSet other) const { return BitSet(m_val ^ other.m_val); }
BitSet operator~() const { return BitSet(~m_val); }
BitSet& operator|=(BitSet other) { return *this = *this | other; }
BitSet& operator&=(BitSet other) { return *this = *this & other; }
BitSet& operator^=(BitSet other) { return *this = *this ^ other; }
operator u32() = delete;
operator bool() { return m_val != 0; }
// Warning: Even though on modern CPUs this is a single fast instruction,
// Dolphin's official builds do not currently assume POPCNT support on x86,
// so slower explicit bit twiddling is generated. Still should generally
// be faster than a loop.
unsigned int Count() const { return CountSetBits(m_val); }
Iterator begin() const { Iterator it(m_val, 0); return ++it; }
Iterator end() const { return Iterator(m_val, -1); }
IntTy m_val;
};
}
typedef BS::BitSet<u32> BitSet32;
typedef BS::BitSet<u64> BitSet64;
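This diff removes the BitSet wrapper from this header, keeping only the free bit helpers; callers that relied on BitSet32's range-for iteration now loop over plain masks. A minimal standalone equivalent of that iteration, using the GCC/Clang forms of the helpers shown above:

#include <cstdint>
#include <cstdio>

// Mirrors the non-Windows helpers this header keeps.
inline int CountSetBits(uint32_t v) { return __builtin_popcount(v); }
inline int LeastSignificantSetBit(uint32_t v) { return __builtin_ctz(v); }

int main() {
    uint32_t mask = 0x1FF80000;  // the callee-saved GPR mask from the emitter header
    printf("%d registers:", CountSetBits(mask));
    while (mask) {
        int bit = LeastSignificantSetBit(mask);
        printf(" X%d", bit);
        mask &= mask - 1;  // clear the lowest set bit
    }
    printf("\n");
    return 0;
}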

@@ -198,10 +198,9 @@ void Arm64Jit::GenerateFixedCode(const JitOptions &jo) {
enterDispatcher = AlignCode16();
BitSet32 regs_to_save(Arm64Gen::ALL_CALLEE_SAVED);
BitSet32 regs_to_save_fp(Arm64Gen::ALL_CALLEE_SAVED_FP);
ABI_PushRegisters(regs_to_save);
fp.ABI_PushRegisters(regs_to_save_fp);
uint32_t regs_to_save = Arm64Gen::ALL_CALLEE_SAVED;
uint32_t regs_to_save_fp = Arm64Gen::ALL_CALLEE_SAVED_FP;
fp.ABI_PushRegisters(regs_to_save, regs_to_save_fp);
// Fixed registers, these are always kept when in Jit context.
MOVP2R(MEMBASEREG, Memory::base);
@@ -290,8 +289,7 @@ void Arm64Jit::GenerateFixedCode(const JitOptions &jo) {
SaveStaticRegisters();
RestoreRoundingMode(true);
fp.ABI_PopRegisters(regs_to_save_fp);
ABI_PopRegisters(regs_to_save);
fp.ABI_PopRegisters(regs_to_save, regs_to_save_fp);
RET();

@@ -175,10 +175,9 @@ JittedVertexDecoder VertexDecoderJitCache::Compile(const VertexDecoder &dec, int
// if (skinning) log = true;
BitSet32 regs_to_save(Arm64Gen::ALL_CALLEE_SAVED);
BitSet32 regs_to_save_fp(Arm64Gen::ALL_CALLEE_SAVED_FP);
ABI_PushRegisters(regs_to_save);
fp.ABI_PushRegisters(regs_to_save_fp);
uint64_t regs_to_save = Arm64Gen::ALL_CALLEE_SAVED;
uint64_t regs_to_save_fp = Arm64Gen::ALL_CALLEE_SAVED_FP;
fp.ABI_PushRegisters(regs_to_save, regs_to_save_fp);
// Keep the scale/offset in a few fp registers if we need it.
if (prescaleStep) {
@@ -279,8 +278,7 @@ JittedVertexDecoder VertexDecoderJitCache::Compile(const VertexDecoder &dec, int
STRH(INDEX_UNSIGNED, boundsMaxVReg, scratchReg64, offsetof(KnownVertexBounds, maxV));
}
fp.ABI_PopRegisters(regs_to_save_fp);
ABI_PopRegisters(regs_to_save);
fp.ABI_PopRegisters(regs_to_save, regs_to_save_fp);
RET();

@@ -39,10 +39,9 @@ void TestCode::Generate()
const u8 *start = AlignCode16();
BitSet32 regs_to_save(Arm64Gen::ALL_CALLEE_SAVED);
BitSet32 regs_to_save_fp(Arm64Gen::ALL_CALLEE_SAVED_FP);
ABI_PushRegisters(regs_to_save);
fp.ABI_PushRegisters(regs_to_save_fp);
uint32_t regs_to_save = Arm64Gen::ALL_CALLEE_SAVED;
uint32_t regs_to_save_fp = Arm64Gen::ALL_CALLEE_SAVED_FP;
fp.ABI_PushRegisters(regs_to_save, regs_to_save_fp);
PUSH(X3);
POP(X3);
@@ -54,8 +53,7 @@ void TestCode::Generate()
fp.SCVTF(S3, W12);
MOVI2R(X0, 1337);
ABI_PopRegisters(regs_to_save);
fp.ABI_PopRegisters(regs_to_save_fp);
fp.ABI_PopRegisters(regs_to_save, regs_to_save_fp);
RET();