Add back our older VFPU approximations, as fallbacks if files are missing.

PR #16984 added more accurate versions of these functions, but they require large lookup tables stored in assets/. If these files are missing, PPSSPP would simply crash, which isn't good. We should probably try to warn the user somehow that these files are missing, though...
2024-11-23 05:19:56 +00:00 · 2023-04-03 11:33:41 +02:00 · 2023-04-03 11:33:41 +02:00 · aba026f7e9
commit aba026f7e9
parent ecfd4759dd
9 changed files with 379 additions and 23 deletions
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -2067,6 +2067,8 @@ add_library(${CoreLibName} ${CoreLinkType}
 	Core/MIPS/MIPSTables.h
 	Core/MIPS/MIPSVFPUUtils.cpp
 	Core/MIPS/MIPSVFPUUtils.h
+	Core/MIPS/MIPSVFPUFallbacks.cpp
+	Core/MIPS/MIPSVFPUFallbacks.h
 	Core/MIPS/MIPSAsm.cpp
 	Core/MIPS/MIPSAsm.h
 	Core/MemFault.cpp
--- a/Core/Core.vcxproj
+++ b/Core/Core.vcxproj
@ -590,6 +590,7 @@
    <ClCompile Include="MIPS\IR\IRJit.cpp" />
    <ClCompile Include="MIPS\IR\IRPassSimplify.cpp" />
    <ClCompile Include="MIPS\IR\IRRegCache.cpp" />
+    <ClCompile Include="MIPS\MIPSVFPUFallbacks.cpp" />
    <ClCompile Include="Replay.cpp" />
    <ClCompile Include="Compatibility.cpp" />
    <ClCompile Include="Config.cpp" />
@ -1154,6 +1155,7 @@
    <ClInclude Include="MIPS\IR\IRJit.h" />
    <ClInclude Include="MIPS\IR\IRPassSimplify.h" />
    <ClInclude Include="MIPS\IR\IRRegCache.h" />
+    <ClInclude Include="MIPS\MIPSVFPUFallbacks.h" />
    <ClInclude Include="Replay.h" />
    <ClInclude Include="Compatibility.h" />
    <ClInclude Include="Config.h" />
--- a/Core/Core.vcxproj.filters
+++ b/Core/Core.vcxproj.filters
@ -1192,6 +1192,9 @@
    <ClCompile Include="TiltEventProcessor.cpp">
      <Filter>Core</Filter>
    </ClCompile>
+    <ClCompile Include="MIPS\MIPSVFPUFallbacks.cpp">
+      <Filter>MIPS</Filter>
+    </ClCompile>
  </ItemGroup>
  <ItemGroup>
    <ClInclude Include="ELF\ElfReader.h">
@ -1926,6 +1929,9 @@
    <ClInclude Include="TiltEventProcessor.h">
      <Filter>Core</Filter>
    </ClInclude>
+    <ClInclude Include="MIPS\MIPSVFPUFallbacks.h">
+      <Filter>MIPS</Filter>
+    </ClInclude>
  </ItemGroup>
  <ItemGroup>
    <None Include="..\LICENSE.TXT" />
--- a/Core/MIPS/MIPSVFPUFallbacks.cpp
+++ b/Core/MIPS/MIPSVFPUFallbacks.cpp
@ -0,0 +1,312 @@
+#include <cmath>
+
+#include "Common/BitScan.h"
+
+#include "Core/MIPS/MIPSVFPUFallbacks.h"
+#include "Core/MIPS/MIPSVFPUUtils.h"
+
+// MIPSVFPUUtils now has the high precision instructions implemented by fp64
+// in https://github.com/hrydgard/ppsspp/pull/16984 .
+//
+// These are our older approximations, which are quite good but have flaws;
+// we need them to fall back on if the table files are missing.
+//
+// Note that currently, some of the new functions are not integrated in the JIT
+// and are thus not normally used anyway.
+
+// First the "trivial" fallbacks where we haven't done any accuracy work previously.
+
+float vfpu_asin_fallback(float angle) {
+	return (float)(asinf(angle) / M_PI_2);  // Radians divided by pi/2, so the result lies in [-1, 1].
+}
+
+float vfpu_rcp_fallback(float x) {
+	return 1.0f / x;  // Plain host float division; no table-based correction is applied.
+}
+
+float vfpu_log2_fallback(float x) {
+	return log2f(x);  // Defers entirely to the host C library.
+}
+
+float vfpu_exp2_fallback(float x) {
+	return exp2f(x);  // Defers entirely to the host C library.
+}
+
+// Flushes the angle to 0 if exponent smaller than this in vfpu_sin/vfpu_cos/vfpu_sincos.
+// Was measured to be around 0x68, but GTA on Mac is somehow super sensitive
+// to the shape of the sine curve, which seems to be very slightly different.
+//
+// So setting a lower value.
+#define PRECISION_EXP_THRESHOLD 0x65
+
+union float2int {
+	uint32_t i;  // Raw 32-bit pattern, used for the bit masks and shifts below.
+	float f;     // The same storage viewed as a float.
+};
+
+float vfpu_sqrt_fallback(float a) {  // Square root computed with integer arithmetic on the raw float bits.
+	float2int val;
+	val.f = a;
+
+	if ((val.i & 0xff800000) == 0x7f800000) {  // Sign clear and exponent all ones: +inf or a positive NaN.
+		if ((val.i & 0x007fffff) != 0) {  // Non-zero mantissa means NaN.
+			val.i = 0x7f800001;  // Replace the payload with a fixed NaN pattern.
+		}
+		return val.f;  // +inf is returned unchanged.
+	}
+	if ((val.i & 0x7f800000) == 0) {  // Exponent field is zero: zero or denormal input.
+		// Kill any sign.
+		val.i = 0;
+		return val.f;
+	}
+	if (val.i & 0x80000000) {  // Every remaining negative input, including -inf and negative NaN.
+		val.i = 0x7f800001;
+		return val.f;
+	}
+
+	int k = get_exp(val.i);
+	uint32_t sp = get_mant(val.i);
+	int less_bits = k & 1;  // 1 when the unbiased exponent is odd.
+	k >>= 1;  // The result's exponent is half the input's.
+
+	uint32_t z = 0x00C00000 >> less_bits;  // Initial guess for the iteration.
+	int64_t halfsp = sp >> 1;
+	halfsp <<= 23 - less_bits;  // Held in 64 bits because the widened dividend exceeds 32 bits.
+	for (int i = 0; i < 6; ++i) {
+		z = (z >> 1) + (uint32_t)(halfsp / z);  // z = z/2 + (s/2)/z, i.e. the average of z and s/z.
+	}
+
+	val.i = ((k + 127) << 23) | ((z << less_bits) & 0x007FFFFF);  // Re-bias the exponent and mask off the hidden bit.
+	// The lower two bits never end up set on the PSP, it seems like.
+	val.i &= 0xFFFFFFFC;
+
+	return val.f;
+}
+
+static inline uint32_t mant_mul(uint32_t a, uint32_t b) {  // Multiply via a 64-bit product, then shift right by 23.
+	uint64_t m = (uint64_t)a * (uint64_t)b;
+	if (m & 0x007FFFFF) {  // Some of the 23 bits about to be shifted out are set.
+		m += 0x01437000;  // NOTE(review): rounding bias whose derivation is not visible here - confirm before changing.
+	}
+	return (uint32_t)(m >> 23);
+}
+
+float vfpu_rsqrt_fallback(float a) {  // Reciprocal square root computed with integer arithmetic on the raw float bits.
+	float2int val;
+	val.f = a;
+
+	if (val.i == 0x7f800000) {  // +inf maps to +0.
+		return 0.0f;
+	}
+	if ((val.i & 0x7fffffff) > 0x7f800000) {  // NaN of either sign.
+		val.i = (val.i & 0x80000000) | 0x7f800001;  // Keep the sign, use a fixed payload.
+		return val.f;
+	}
+	if ((val.i & 0x7f800000) == 0) {  // Exponent field is zero: zero or denormal input.
+		val.i = (val.i & 0x80000000) | 0x7f800000;  // Infinity carrying the input's sign.
+		return val.f;
+	}
+	if (val.i & 0x80000000) {  // Every remaining negative input, including -inf.
+		val.i = 0xff800001;  // Negative NaN pattern.
+		return val.f;
+	}
+
+	int k = get_exp(val.i);
+	uint32_t sp = get_mant(val.i);
+	int less_bits = k & 1;  // 1 when the unbiased exponent is odd.
+	k = -(k >> 1);  // Reciprocal root: halve and negate the exponent.
+
+	uint32_t z = 0x00800000 >> less_bits;  // Initial guess for the iteration.
+	uint32_t halfsp = sp >> (1 + less_bits);
+	for (int i = 0; i < 6; ++i) {  // Six rounds of z = z * (0x00C00000 - halfsp * z * z), using mant_mul for each product.
+		uint32_t zsq = mant_mul(z, z);
+		uint32_t correction = 0x00C00000 - mant_mul(halfsp, zsq);
+		z = mant_mul(z, correction);
+	}
+
+	int8_t shift = (int8_t)clz32_nonzero(z) - 8 + less_bits;
+	if (shift < 1) {  // Also taken for shift == 0, where both statements change nothing.
+		z >>= -shift;
+		k += -shift;
+	} else if (shift > 0) {  // Always true when reached.
+		z <<= shift;
+		k -= shift;
+	}
+
+	z >>= less_bits;
+
+	val.i = ((k + 127) << 23) | (z & 0x007FFFFF);  // Re-bias the exponent and mask off the hidden bit.
+	val.i &= 0xFFFFFFFC;  // Clear the two lowest mantissa bits.
+
+	return val.f;
+}
+
+float vfpu_sin_fallback(float a) {  // sin(a * pi/2), with the argument reduced on the raw float bits first.
+	float2int val;
+	val.f = a;
+
+	int32_t k = get_uexp(val.i);
+	if (k == 255) {  // Inf or NaN input: return a NaN that keeps the input's sign.
+		val.i = (val.i & 0xFF800001) | 1;
+		return val.f;
+	}
+
+	if (k < PRECISION_EXP_THRESHOLD) {  // Tiny angles flush to a zero carrying the input's sign.
+		val.i &= 0x80000000;
+		return val.f;
+	}
+
+	// Okay, now modulus by 4 to begin with (identical wave every 4.)
+	int32_t mantissa = get_mant(val.i);
+	if (k > 0x80) {
+		const uint8_t over = k & 0x1F;
+		mantissa = (int32_t)(((uint32_t)mantissa << over) & 0x00FFFFFF);  // Shift as unsigned: the signed shift could overflow (UB).
+		k = 0x80;
+	}
+	// This subtracts off the 2.  If we do, flip sign to inverse the wave.
+	if (k == 0x80 && mantissa >= (1 << 23)) {
+		val.i ^= 0x80000000;
+		mantissa -= 1 << 23;
+	}
+
+	int8_t norm_shift = mantissa == 0 ? 0 : (int8_t)clz32_nonzero(mantissa) - 8;  // No shift for 0: a 32-bit shift by 32 is UB.
+	mantissa <<= norm_shift;
+	k -= norm_shift;
+
+	if (k <= 0 || mantissa == 0) {  // Reduced angle is zero or too small for a normal float: signed zero.
+		val.i &= 0x80000000;
+		return val.f;
+	}
+
+	// This is the value with modulus applied.
+	val.i = (val.i & 0x80000000) | (k << 23) | (mantissa & ~(1 << 23));
+	val.f = (float)sin((double)val.f * M_PI_2);
+	val.i &= 0xFFFFFFFC;  // Clear the two lowest mantissa bits.
+	return val.f;
+}
+
+float vfpu_cos_fallback(float a) {  // cos(a * pi/2), with the argument reduced on the raw float bits first.
+	float2int val;
+	val.f = a;
+	bool negate = false;
+
+	int32_t k = get_uexp(val.i);
+	if (k == 255) {
+		// Note: unlike sin, cos always returns +NAN.
+		val.i = (val.i & 0x7F800001) | 1;
+		return val.f;
+	}
+
+	if (k < PRECISION_EXP_THRESHOLD)  // Tiny angles are treated as exactly zero.
+		return 1.0f;
+
+	// Okay, now modulus by 4 to begin with (identical wave every 4.)
+	int32_t mantissa = get_mant(val.i);
+	if (k > 0x80) {
+		const uint8_t over = k & 0x1F;
+		mantissa = (int32_t)(((uint32_t)mantissa << over) & 0x00FFFFFF);  // Shift as unsigned: the signed shift could overflow (UB).
+		k = 0x80;
+	}
+	// This subtracts off the 2.  If we do, negate the result value.
+	if (k == 0x80 && mantissa >= (1 << 23)) {
+		mantissa -= 1 << 23;
+		negate = true;
+	}
+
+	int8_t norm_shift = mantissa == 0 ? 0 : (int8_t)clz32_nonzero(mantissa) - 8;  // No shift for 0: a 32-bit shift by 32 is UB.
+	mantissa <<= norm_shift;
+	k -= norm_shift;
+
+	if (k <= 0 || mantissa == 0)  // Reduced angle is zero or too small for a normal float.
+		return negate ? -1.0f : 1.0f;
+
+	// This is the value with modulus applied.
+	val.i = (val.i & 0x80000000) | (k << 23) | (mantissa & ~(1 << 23));
+	if (val.f == 1.0f || val.f == -1.0f) {  // Reduced angle of exactly +/-1: return a signed zero without calling cos().
+		return negate ? 0.0f : -0.0f;
+	}
+	val.f = (float)cos((double)val.f * M_PI_2);
+	val.i &= 0xFFFFFFFC;  // Clear the two lowest mantissa bits.
+	return negate ? -val.f : val.f;
+}
+
+void vfpu_sincos_fallback(float a, float &s, float &c) {  // Writes sin(a * pi/2) to s and cos(a * pi/2) to c.
+	float2int val;
+	val.f = a;
+	// For sin, negate the input, for cos negate the output.
+	bool negate = false;
+
+	int32_t k = get_uexp(val.i);
+	if (k == 255) {  // Inf or NaN input: s gets a NaN with the input's sign, c a positive NaN.
+		val.i = (val.i & 0xFF800001) | 1;
+		s = val.f;
+		val.i &= 0x7F800001;
+		c = val.f;
+		return;
+	}
+
+	if (k < PRECISION_EXP_THRESHOLD) {  // Tiny angles: sine is a signed zero, cosine is exactly 1.
+		val.i &= 0x80000000;
+		s = val.f;
+		c = 1.0f;
+		return;
+	}
+
+	// Okay, now modulus by 4 to begin with (identical wave every 4.)
+	int32_t mantissa = get_mant(val.i);
+	if (k > 0x80) {
+		const uint8_t over = k & 0x1F;
+		mantissa = (int32_t)(((uint32_t)mantissa << over) & 0x00FFFFFF);  // Shift as unsigned: the signed shift could overflow (UB).
+		k = 0x80;
+	}
+	// This subtracts off the 2.  If we do, flip signs.
+	if (k == 0x80 && mantissa >= (1 << 23)) {
+		mantissa -= 1 << 23;
+		negate = true;
+	}
+
+	int8_t norm_shift = mantissa == 0 ? 0 : (int8_t)clz32_nonzero(mantissa) - 8;  // No shift for 0: a 32-bit shift by 32 is UB.
+	mantissa <<= norm_shift;
+	k -= norm_shift;
+
+	if (k <= 0 || mantissa == 0) {  // Reduced angle is zero or too small for a normal float.
+		val.i &= 0x80000000;
+		if (negate)
+			val.i ^= 0x80000000;
+		s = val.f;
+		c = negate ? -1.0f : 1.0f;
+		return;
+	}
+
+	// This is the value with modulus applied.
+	val.i = (val.i & 0x80000000) | (k << 23) | (mantissa & ~(1 << 23));
+	float2int i_sine, i_cosine;
+	if (val.f == 1.0f) {  // Reduced angle of exactly +1: fixed results, no libm call.
+		i_sine.f = negate ? -1.0f : 1.0f;
+		i_cosine.f = negate ? 0.0f : -0.0f;
+	} else if (val.f == -1.0f) {  // Reduced angle of exactly -1: sine mirrored, cosine as above.
+		i_sine.f = negate ? 1.0f : -1.0f;
+		i_cosine.f = negate ? 0.0f : -0.0f;
+	} else if (negate) {
+		i_sine.f = (float)sin((double)-val.f * M_PI_2);
+		i_cosine.f = -(float)cos((double)val.f * M_PI_2);
+	} else {
+		double angle = (double)val.f * M_PI_2;
+#if defined(__linux__)
+		double d_sine;
+		double d_cosine;
+		sincos(angle, &d_sine, &d_cosine);  // Combined call, only used on Linux builds.
+		i_sine.f = (float)d_sine;
+		i_cosine.f = (float)d_cosine;
+#else
+		i_sine.f = (float)sin(angle);
+		i_cosine.f = (float)cos(angle);
+#endif
+	}
+
+	i_sine.i &= 0xFFFFFFFC;  // Clear the two lowest mantissa bits of both results.
+	i_cosine.i &= 0xFFFFFFFC;
+	s = i_sine.f;
+	c = i_cosine.f;
+	return;
+}
--- a/Core/MIPS/MIPSVFPUFallbacks.h
+++ b/Core/MIPS/MIPSVFPUFallbacks.h
@ -0,0 +1,14 @@
+#pragma once
+
+// These are our old implementations of the VFPU math functions, which don't make use of the
+// accuracy-improving tables from #16984.
+
+float vfpu_asin_fallback(float angle);  // asinf(angle) divided by pi/2.
+float vfpu_sqrt_fallback(float a);  // Square root computed on the raw float bits.
+float vfpu_rsqrt_fallback(float a);  // Reciprocal square root computed on the raw float bits.
+float vfpu_sin_fallback(float a);  // sin(a * pi/2).
+float vfpu_cos_fallback(float a);  // cos(a * pi/2).
+void vfpu_sincos_fallback(float a, float &s, float &c);  // Writes sin(a * pi/2) to s and cos(a * pi/2) to c.
+float vfpu_rcp_fallback(float x);  // 1.0f / x.
+float vfpu_log2_fallback(float x);  // log2f(x).
+float vfpu_exp2_fallback(float x);  // exp2f(x).
--- a/Core/MIPS/MIPSVFPUUtils.cpp
+++ b/Core/MIPS/MIPSVFPUUtils.cpp
@ -26,6 +26,7 @@
 #include "Core/Reporting.h"
 #include "Core/MIPS/MIPS.h"
 #include "Core/MIPS/MIPSVFPUUtils.h"
+#include "Core/MIPS/MIPSVFPUFallbacks.h"

 #ifdef _MSC_VER
 #pragma warning(disable: 4146)
@ -666,23 +667,6 @@ float Float16ToFloat32(unsigned short l)
 	return f;
 }

-static uint32_t get_uexp(uint32_t x) {
-	return (x >> 23) & 0xFF;
-}
-
-int32_t get_exp(uint32_t x) {
-	return get_uexp(x) - 127;
-}
-
-static int32_t get_mant(uint32_t x) {
-	// Note: this returns the hidden 1.
-	return (x & 0x007FFFFF) | 0x00800000;
-}
-
-static int32_t get_sign(uint32_t x) {
-	return x & 0x80000000;
-}
-
 float vfpu_dot(const float a[4], const float b[4]) {
 	static const int EXTRA_BITS = 2;
 	float2int result;
@ -830,12 +814,12 @@ static uint16_t (*vfpu_asin_lut_indices)=nullptr;
 static  int8_t  (*vfpu_rcp_lut)[2]=nullptr;

 template<typename T>
-static inline bool load_vfpu_table(T *&ptr,const char *filename, size_t expected_size) {
+static inline bool load_vfpu_table(T *&ptr, const char *filename, size_t expected_size) {
 #if COMMON_BIG_ENDIAN
 	// Tables are little-endian.
 #error Byteswap for VFPU tables not implemented
 #endif
-	if(ptr) return true; // Already loaded.
+	if (ptr) return true; // Already loaded.
 	size_t size = 0u;
 	INFO_LOG(CPU, "Loading '%s'...", filename);
 	ptr = reinterpret_cast<decltype(&*ptr)>(g_VFS.ReadFile(filename, &size));
@ -911,6 +895,8 @@ float vfpu_sin(float x) {
 		LOAD_TABLE(vfpu_sin_lut_delta,          262144)&&
 		LOAD_TABLE(vfpu_sin_lut_interval_delta, 131074)&&
 		LOAD_TABLE(vfpu_sin_lut_exceptions,      86938);
+	if (!loaded)
+		return vfpu_sin_fallback(x);
 	uint32_t bits;
 	memcpy(&bits, &x, sizeof(x));
 	uint32_t sign = bits & 0x80000000u;
@ -948,6 +934,8 @@ float vfpu_cos(float x) {
 		LOAD_TABLE(vfpu_sin_lut_delta,          262144)&&
 		LOAD_TABLE(vfpu_sin_lut_interval_delta, 131074)&&
 		LOAD_TABLE(vfpu_sin_lut_exceptions,      86938);
+	if (!loaded)
+		return vfpu_cos_fallback(x);
 	uint32_t bits;
 	memcpy(&bits, &x, sizeof(x));
 	bits &= 0x7FFFFFFFu;
@ -1053,6 +1041,8 @@ static inline uint32_t vfpu_sqrt_fixed(uint32_t x) {
 float vfpu_sqrt(float x) {
 	static bool loaded =
 		LOAD_TABLE(vfpu_sqrt_lut, 262144);
+	if (!loaded)
+		return vfpu_sqrt_fallback(x);
 	uint32_t bits;
 	memcpy(&bits, &x, sizeof(bits));
 	if((bits & 0x7FFFFFFFu) <= 0x007FFFFFu) {
@ -1139,6 +1129,8 @@ static inline uint32_t vfpu_rsqrt_fixed(uint32_t x) {
 float vfpu_rsqrt(float x) {
 	static bool loaded =
 		LOAD_TABLE(vfpu_rsqrt_lut, 262144);
+	if (!loaded)
+		return vfpu_rsqrt_fallback(x);
 	uint32_t bits;
 	memcpy(&bits, &x, sizeof(bits));
 	if((bits & 0x7FFFFFFFu) <= 0x007FFFFFu) {
@ -1200,6 +1192,9 @@ float vfpu_asin(float x) {
 		LOAD_TABLE(vfpu_asin_lut65536,      1536)&&
 		LOAD_TABLE(vfpu_asin_lut_indices, 798916)&&
 		LOAD_TABLE(vfpu_asin_lut_deltas,  517448);
+	if (!loaded)
+		return vfpu_asin_fallback(x);
+
 	uint32_t bits;
 	memcpy(&bits, &x, sizeof(x));
 	uint32_t sign = bits & 0x80000000u;
@ -1238,8 +1233,10 @@ static inline uint32_t vfpu_exp2_fixed(uint32_t x) {

 float vfpu_exp2(float x) {
 	static bool loaded =
-	        LOAD_TABLE(vfpu_exp2_lut65536,    512)&&
+		LOAD_TABLE(vfpu_exp2_lut65536,    512)&&
 		LOAD_TABLE(vfpu_exp2_lut,      262144);
+	if (!loaded)
+		return vfpu_exp2_fallback(x);
 	int32_t bits;
 	memcpy(&bits, &x, sizeof(bits));
 	if((bits & 0x7FFFFFFF) <= 0x007FFFFF) {
@ -1277,9 +1274,9 @@ float vfpu_rexp2(float x) {
 // Input fixed 9.23, output fixed 10.22.
 // Returns log2(1+x).
 static inline uint32_t vfpu_log2_approx(uint32_t x) {
-	uint32_t a=vfpu_log2_lut65536[(x >> 16) + 0];
-	uint32_t b=vfpu_log2_lut65536[(x >> 16) + 1];
-	uint32_t c=vfpu_log2_lut65536_quadratic[x >> 16];
+	uint32_t a = vfpu_log2_lut65536[(x >> 16) + 0];
+	uint32_t b = vfpu_log2_lut65536[(x >> 16) + 1];
+	uint32_t c = vfpu_log2_lut65536_quadratic[x >> 16];
 	x &= 0xFFFFu;
 	uint64_t ret = uint64_t(a) * (0x10000u - x) + uint64_t(b) * x;
 	uint64_t d = (uint64_t(c) * x * (0x10000u-x)) >> 40;
@ -1293,6 +1290,8 @@ float vfpu_log2(float x) {
 		LOAD_TABLE(vfpu_log2_lut65536,               516)&&
 		LOAD_TABLE(vfpu_log2_lut65536_quadratic,     512)&&
 		LOAD_TABLE(vfpu_log2_lut,                2097152);
+	if (!loaded)
+		return vfpu_log2_fallback(x);
 	uint32_t bits;
 	memcpy(&bits, &x, sizeof(bits));
 	if((bits & 0x7FFFFFFFu) <= 0x007FFFFFu) {
@ -1345,6 +1344,8 @@ static inline uint32_t vfpu_rcp_approx(uint32_t i) {
 float vfpu_rcp(float x) {
 	static bool loaded =
 		LOAD_TABLE(vfpu_rcp_lut, 262144);
+	if (!loaded)
+		return vfpu_rcp_fallback(x);
 	uint32_t bits;
 	memcpy(&bits, &x, sizeof(bits));
 	uint32_t s = bits & 0x80000000u;
--- a/Core/MIPS/MIPSVFPUUtils.h
+++ b/Core/MIPS/MIPSVFPUUtils.h
@ -67,6 +67,23 @@ extern float vfpu_rexp2(float);
 extern float vfpu_log2(float);
 extern float vfpu_rcp(float);

+inline uint32_t get_uexp(uint32_t x) {
+	return (x >> 23) & 0xFF;  // Bits 23..30: the 8-bit exponent field, still biased.
+}
+
+inline int32_t get_exp(uint32_t x) {
+	return get_uexp(x) - 127;  // Exponent field minus 127; the subtraction is unsigned and converts to int32_t on return.
+}
+
+inline int32_t get_mant(uint32_t x) {
+	// Note: the result includes the hidden 1, set explicitly at bit 23.
+	return (x & 0x007FFFFF) | 0x00800000;
+}
+
+inline int32_t get_sign(uint32_t x) {
+	return x & 0x80000000;  // Only bit 31 is kept: the result is 0 or the bit pattern 0x80000000 as an int32_t.
+}
+
 #define VFPU_FLOAT16_EXP_MAX    0x1f
 #define VFPU_SH_FLOAT16_SIGN    15
 #define VFPU_MASK_FLOAT16_SIGN  0x1
--- a/android/jni/Android.mk
+++ b/android/jni/Android.mk
@ -347,6 +347,7 @@ EXEC_AND_LIB_FILES := \
  $(SRC)/Core/MIPS/MIPSStackWalk.cpp \
  $(SRC)/Core/MIPS/MIPSTables.cpp \
  $(SRC)/Core/MIPS/MIPSVFPUUtils.cpp.arm \
+  $(SRC)/Core/MIPS/MIPSVFPUFallbacks.cpp.arm \
  $(SRC)/Core/MIPS/MIPSCodeUtils.cpp.arm \
  $(SRC)/Core/MIPS/MIPSDebugInterface.cpp \
  $(SRC)/Core/MIPS/IR/IRFrontend.cpp \
--- a/libretro/Makefile.common
+++ b/libretro/Makefile.common
@ -652,6 +652,7 @@ SOURCES_CXX += \
 	       $(COREDIR)/MIPS/MIPSTables.cpp \
 	       $(COREDIR)/MIPS/MIPSStackWalk.cpp \
 	       $(COREDIR)/MIPS/MIPSVFPUUtils.cpp \
+	       $(COREDIR)/MIPS/MIPSVFPUFallbacks.cpp \
 	       $(COREDIR)/MemFault.cpp \
 	       $(COREDIR)/MemMap.cpp \
 	       $(COREDIR)/MemMapFunctions.cpp \