Use hardware half-to-float on ARM when available.

2024-11-28 10:51:06 +00:00 · 2013-11-17 14:17:13 +01:00 · 2013-11-17 14:17:13 +01:00 · 9f5402ce54
commit 9f5402ce54
parent a3caefed18
2 changed files with 60 additions and 6 deletions
--- a/Core/MIPS/ARM/ArmCompVFPU.cpp
+++ b/Core/MIPS/ARM/ArmCompVFPU.cpp
@@ -929,7 +929,52 @@ namespace MIPSComp
 	}

 	void Jit::Comp_Vh2f(MIPSOpcode op) {
-		DISABLE;
+		if (!cpu_info.bNEON || !cpu_info.bHalf) {
+			// VCVTF32F16 below needs NEON plus the half-precision extension; without both, fall back to the interpreter.
+			// TODO: Translate the fast SSE solution to NEON.
+			DISABLE;
+		}
+		CONDITIONAL_DISABLE;
+
+		if (js.HasUnknownPrefix() || disablePrefixes)
+			DISABLE;
+
+		u8 sregs[4], dregs[4];
+		VectorSize sz = GetVecSize(op);
+		VectorSize outSz;  // The output has twice as many elements as the input (see nOut below).
+
+		switch (sz) {
+		case V_Single:
+			outSz = V_Pair;
+			break;
+		case V_Pair:
+			outSz = V_Quad;
+			break;
+		default:
+			DISABLE;  // Only single and pair inputs are handled by this path.
+		}
+
+		int n = GetNumVectorElements(sz);
+		int nOut = n * 2;
+		GetVectorRegsPrefixS(sregs, sz, _VS);
+		GetVectorRegsPrefixD(dregs, outSz, _VD);
+
+		static const ARMReg tmp[4] = { S0, S1, S2, S3 };  // Scratch: S0-S1 alias D0 (input), S0-S3 alias Q0 (output).
+
+		for (int i = 0; i < n; i++) {
+			fpr.MapRegV(sregs[i], 0);  // Second argument is MAP_* flags, not a VectorSize. NOTE(review): 0 assumed to be a plain read mapping - confirm.
+			VMOV(tmp[i], fpr.V(sregs[i]));
+		}
+
+		// Okay, let's convert! Widens the packed halves in D0 into floats in Q0; only the first nOut lanes are used.
+		VCVTF32F16(Q0, D0);
+		for (int i = 0; i < nOut; i++) {
+			fpr.MapRegV(dregs[i], MAP_DIRTY | MAP_NOINIT);
+			VMOV(fpr.V(dregs[i]), tmp[i]);
+		}
+
+		ApplyPrefixD(dregs, outSz);  // dregs holds outSz elements, so the destination prefix must cover all of them.
+		fpr.ReleaseSpillLocksAndDiscardTemps();
 	}

 	void Jit::Comp_Vf2i(MIPSOpcode op) {
--- a/Core/MIPS/ARM/ArmRegCacheFPU.cpp
+++ b/Core/MIPS/ARM/ArmRegCacheFPU.cpp
@@ -20,10 +20,8 @@
 #include "Common/CPUDetect.h"
 #include "Core/MIPS/ARM/ArmRegCacheFPU.h"

-
 using namespace ArmGen;

-
 ArmRegCacheFPU::ArmRegCacheFPU(MIPSState *mips) : mips_(mips), vr(mr + 32) {
 	if (cpu_info.bNEON) {
 		numARMFpuReg_ = 32;
@@ -52,15 +50,26 @@ void ArmRegCacheFPU::Start(MIPSAnalyst::AnalysisResults &stats) {
 static const ARMReg *GetMIPSAllocationOrder(int &count) {
 	// We reserve S0-S1 as scratch. Can afford two registers. Maybe even four, which could simplify some things.
 	static const ARMReg allocationOrder[] = {
-		S2, S3, S4, S5, S6, S7, S8, S9, S10, S11, S12, S13, S14, S15
+							S2,  S3,
+		S4,  S5,  S6,  S7,
+		S8,  S9,  S10, S11,
+		S12, S13, S14, S15
 	};

 	// With NEON, we have many more.
 	// In the future I plan to use S0-S7 (Q0-Q1) for FPU and S8 forwards (Q2-Q15, yes, 15) for VFPU.
 	// VFPU will use NEON to do SIMD and it will be awkward to mix with FPU.
 	static const ARMReg allocationOrderNEON[] = {
-		S2, S3, S4, S5, S6, S7, S8, S9, S10, S11, S12, S13, S14, S15,
-		S16, S17, S18, S19, S20, S21, S22, S23, S24, S25, S26, S27, S28, S29, S30, S31
+		// Reserve four temp registers. Useful when building quads until we really figure out
+		// how to do that best.
+		S4,  S5,  S6,  S7,   // Q1
+		S8,  S9,  S10, S11,  // Q2
+		S12, S13, S14, S15,  // Q3
+		S16, S17, S18, S19,  // Q4
+		S20, S21, S22, S23,  // Q5
+		S24, S25, S26, S27,  // Q6
+		S28, S29, S30, S31,  // Q7
+		// Q8-Q15 free for NEON tricks
 	};

 	if (cpu_info.bNEON) {