Use hardware half-to-float on ARM when available.

This commit is contained in:
Henrik Rydgard 2013-11-17 14:17:13 +01:00
parent a3caefed18
commit 9f5402ce54
2 changed files with 60 additions and 6 deletions

View File

@ -929,7 +929,52 @@ namespace MIPSComp
}
// Emits ARM code for the half-float -> float vector conversion op using the
// hardware NEON half-precision convert instruction.
//
// The op is handed off via DISABLE / CONDITIONAL_DISABLE instead of being
// compiled here when:
//   - the CPU lacks NEON or half-precision conversion support,
//   - the prefix state is unknown or prefixes are disabled,
//   - the input vector size is anything other than single or pair.
// NOTE(review): DISABLE is assumed to leave this function (it is used as an
// early-out in every branch below) - confirm against the macro definition.
//
// Each input element packs two halves, so the output vector has twice as many
// elements as the input: single -> pair, pair -> quad.
void Jit::Comp_Vh2f(MIPSOpcode op) {
	if (!cpu_info.bNEON || !cpu_info.bHalf) {
		// No hardware support for half-to-float, fall back to the interpreter.
		// TODO: Translate the fast SSE solution to NEON.
		DISABLE;
	}
	CONDITIONAL_DISABLE;
	if (js.HasUnknownPrefix() || disablePrefixes)
		DISABLE;

	u8 sregs[4], dregs[4];
	VectorSize sz = GetVecSize(op);
	VectorSize outSz;
	switch (sz) {
	case V_Single:
		outSz = V_Pair;
		break;
	case V_Pair:
		outSz = V_Quad;
		break;
	default:
		// Wider inputs would need more than four output lanes.
		DISABLE;
	}

	int n = GetNumVectorElements(sz);
	int nOut = n * 2;
	GetVectorRegsPrefixS(sregs, sz, _VS);
	GetVectorRegsPrefixD(dregs, outSz, _VD);

	// Scratch singles S0-S3; together they form Q0, and S0-S1 form D0.
	static const ARMReg tmp[4] = { S0, S1, S2, S3 };

	// Gather the packed halves into the low scratch registers. The sources are
	// only read, so they are mapped with no flags (the second argument of
	// MapRegV is a MAP_* flag mask, not a vector size).
	for (int i = 0; i < n; i++) {
		fpr.MapRegV(sregs[i], 0);
		VMOV(tmp[i], fpr.V(sregs[i]));
	}

	// Okay, let's convert! This widens the halves held in D0 into floats in Q0.
	// For a single input only S0 was loaded, so only the first nOut lanes of
	// the result are meaningful; the rest are ignored below.
	VCVTF32F16(Q0, D0);

	// Scatter the converted lanes to the destination registers. The previous
	// contents are fully overwritten, hence MAP_NOINIT.
	for (int i = 0; i < nOut; i++) {
		fpr.MapRegV(dregs[i], MAP_DIRTY | MAP_NOINIT);
		VMOV(fpr.V(dregs[i]), tmp[i]);
	}

	// The destination prefix must cover every written lane, so use the output
	// size (dregs was fetched with outSz), not the input size.
	ApplyPrefixD(dregs, outSz);
	fpr.ReleaseSpillLocksAndDiscardTemps();
}
void Jit::Comp_Vf2i(MIPSOpcode op) {

View File

@ -20,10 +20,8 @@
#include "Common/CPUDetect.h"
#include "Core/MIPS/ARM/ArmRegCacheFPU.h"
using namespace ArmGen;
ArmRegCacheFPU::ArmRegCacheFPU(MIPSState *mips) : mips_(mips), vr(mr + 32) {
if (cpu_info.bNEON) {
numARMFpuReg_ = 32;
@ -52,15 +50,26 @@ void ArmRegCacheFPU::Start(MIPSAnalyst::AnalysisResults &stats) {
static const ARMReg *GetMIPSAllocationOrder(int &count) {
// We reserve S0-S1 as scratch. Can afford two registers. Maybe even four, which could simplify some things.
static const ARMReg allocationOrder[] = {
S2, S3, S4, S5, S6, S7, S8, S9, S10, S11, S12, S13, S14, S15
S2, S3,
S4, S5, S6, S7,
S8, S9, S10, S11,
S12, S13, S14, S15
};
// With NEON, we have many more.
// In the future I plan to use S0-S7 (Q0-Q1) for FPU and S8 forwards (Q2-Q15, yes, 15) for VFPU.
// VFPU will use NEON to do SIMD and it will be awkward to mix with FPU.
static const ARMReg allocationOrderNEON[] = {
S2, S3, S4, S5, S6, S7, S8, S9, S10, S11, S12, S13, S14, S15,
S16, S17, S18, S19, S20, S21, S22, S23, S24, S25, S26, S27, S28, S29, S30, S31
// Reserve four temp registers. Useful when building quads until we really figure out
// how to do that best.
S4, S5, S6, S7, // Q1
S8, S9, S10, S11, // Q2
S12, S13, S14, S15, // Q3
S16, S17, S18, S19, // Q4
S20, S21, S22, S23, // Q5
S24, S25, S26, S27, // Q6
S28, S29, S30, S31, // Q7
// Q8-Q15 free for NEON tricks
};
if (cpu_info.bNEON) {