Add back our older VFPU approximations, as fallbacks if files are missing.

PR #16984 added more accurate versions of these functions, but they require
large lookup tables stored in assets/.

If these files are missing, PPSSPP would simply crash, which isn't good.

We should probably try to warn the user somehow that these files are
missing, though...
This commit is contained in:
Henrik Rydgård 2023-04-03 11:33:41 +02:00
parent ecfd4759dd
commit aba026f7e9
9 changed files with 379 additions and 23 deletions

View File

@ -2067,6 +2067,8 @@ add_library(${CoreLibName} ${CoreLinkType}
Core/MIPS/MIPSTables.h
Core/MIPS/MIPSVFPUUtils.cpp
Core/MIPS/MIPSVFPUUtils.h
Core/MIPS/MIPSVFPUFallbacks.cpp
Core/MIPS/MIPSVFPUFallbacks.h
Core/MIPS/MIPSAsm.cpp
Core/MIPS/MIPSAsm.h
Core/MemFault.cpp

View File

@ -590,6 +590,7 @@
<ClCompile Include="MIPS\IR\IRJit.cpp" />
<ClCompile Include="MIPS\IR\IRPassSimplify.cpp" />
<ClCompile Include="MIPS\IR\IRRegCache.cpp" />
<ClCompile Include="MIPS\MIPSVFPUFallbacks.cpp" />
<ClCompile Include="Replay.cpp" />
<ClCompile Include="Compatibility.cpp" />
<ClCompile Include="Config.cpp" />
@ -1154,6 +1155,7 @@
<ClInclude Include="MIPS\IR\IRJit.h" />
<ClInclude Include="MIPS\IR\IRPassSimplify.h" />
<ClInclude Include="MIPS\IR\IRRegCache.h" />
<ClInclude Include="MIPS\MIPSVFPUFallbacks.h" />
<ClInclude Include="Replay.h" />
<ClInclude Include="Compatibility.h" />
<ClInclude Include="Config.h" />

View File

@ -1192,6 +1192,9 @@
<ClCompile Include="TiltEventProcessor.cpp">
<Filter>Core</Filter>
</ClCompile>
<ClCompile Include="MIPS\MIPSVFPUFallbacks.cpp">
<Filter>MIPS</Filter>
</ClCompile>
</ItemGroup>
<ItemGroup>
<ClInclude Include="ELF\ElfReader.h">
@ -1926,6 +1929,9 @@
<ClInclude Include="TiltEventProcessor.h">
<Filter>Core</Filter>
</ClInclude>
<ClInclude Include="MIPS\MIPSVFPUFallbacks.h">
<Filter>MIPS</Filter>
</ClInclude>
</ItemGroup>
<ItemGroup>
<None Include="..\LICENSE.TXT" />

View File

@ -0,0 +1,312 @@
#include <cmath>
#include "Common/BitScan.h"
#include "Core/MIPS/MIPSVFPUFallbacks.h"
#include "Core/MIPS/MIPSVFPUUtils.h"
// MIPSVFPUUtils now has the high precision instructions implemented by fp64
// in https://github.com/hrydgard/ppsspp/pull/16984 .
//
// These are our older approximations, which are quite good but have flaws;
// we need them to fall back to if the table files are missing.
//
// Note that currently, some of the new functions are not integrated in the JIT
// and are thus not normally used anyway.
// First the "trivial" fallbacks where we haven't done any accuracy work previously.
// Fallback for vfpu_asin: takes asinf() of the input and divides the result by M_PI_2,
// so an input of 1.0 maps to roughly 1.0 instead of pi/2.
float vfpu_asin_fallback(float angle) {
	const float radians = asinf(angle);
	// The division is carried out in double (M_PI_2 is a double constant) before narrowing.
	return (float)(radians / M_PI_2);
}
// Fallback for vfpu_rcp: a plain single-precision reciprocal, with no special-case handling.
float vfpu_rcp_fallback(float x) {
	const float numerator = 1.0f;
	return numerator / x;
}
// Fallback for vfpu_log2: defers entirely to the C library's log2f().
float vfpu_log2_fallback(float x) {
	const float result = log2f(x);
	return result;
}
// Fallback for vfpu_exp2: defers entirely to the C library's exp2f().
float vfpu_exp2_fallback(float x) {
	const float result = exp2f(x);
	return result;
}
// Flushes the angle to 0 if exponent smaller than this in vfpu_sin/vfpu_cos/vfpu_sincos.
// Was measured to be around 0x68, but GTA on Mac is somehow super sensitive
// to the shape of the sine curve, which seems to be very slightly different.
//
// So setting a lower value.
#define PRECISION_EXP_THRESHOLD 0x65
// Overlays a float with a 32-bit integer so the functions in this file can read and
// rewrite the raw bit pattern (sign, exponent and mantissa fields) of a value.
union float2int {
	uint32_t i;  // The value's raw 32-bit pattern.
	float f;     // The same storage viewed as a float.
};
float vfpu_sqrt_fallback(float a) {
float2int val;
val.f = a;
if ((val.i & 0xff800000) == 0x7f800000) {
if ((val.i & 0x007fffff) != 0) {
val.i = 0x7f800001;
}
return val.f;
}
if ((val.i & 0x7f800000) == 0) {
// Kill any sign.
val.i = 0;
return val.f;
}
if (val.i & 0x80000000) {
val.i = 0x7f800001;
return val.f;
}
int k = get_exp(val.i);
uint32_t sp = get_mant(val.i);
int less_bits = k & 1;
k >>= 1;
uint32_t z = 0x00C00000 >> less_bits;
int64_t halfsp = sp >> 1;
halfsp <<= 23 - less_bits;
for (int i = 0; i < 6; ++i) {
z = (z >> 1) + (uint32_t)(halfsp / z);
}
val.i = ((k + 127) << 23) | ((z << less_bits) & 0x007FFFFF);
// The lower two bits never end up set on the PSP, it seems like.
val.i &= 0xFFFFFFFC;
return val.f;
}
// Multiplies two fixed-point values carrying 23 fractional bits and returns the product
// in the same format. When any of the 23 bits about to be discarded is set, the constant
// 0x01437000 is added to the 64-bit product before the shift.
// NOTE(review): the origin of that bias constant is not visible here - presumably tuned
// against hardware results; confirm before changing it.
static inline uint32_t mant_mul(uint32_t a, uint32_t b) {
	uint64_t product = (uint64_t)a * (uint64_t)b;
	product += (product & 0x007FFFFF) ? 0x01437000 : 0;
	return (uint32_t)(product >> 23);
}
float vfpu_rsqrt_fallback(float a) {
float2int val;
val.f = a;
if (val.i == 0x7f800000) {
return 0.0f;
}
if ((val.i & 0x7fffffff) > 0x7f800000) {
val.i = (val.i & 0x80000000) | 0x7f800001;
return val.f;
}
if ((val.i & 0x7f800000) == 0) {
val.i = (val.i & 0x80000000) | 0x7f800000;
return val.f;
}
if (val.i & 0x80000000) {
val.i = 0xff800001;
return val.f;
}
int k = get_exp(val.i);
uint32_t sp = get_mant(val.i);
int less_bits = k & 1;
k = -(k >> 1);
uint32_t z = 0x00800000 >> less_bits;
uint32_t halfsp = sp >> (1 + less_bits);
for (int i = 0; i < 6; ++i) {
uint32_t zsq = mant_mul(z, z);
uint32_t correction = 0x00C00000 - mant_mul(halfsp, zsq);
z = mant_mul(z, correction);
}
int8_t shift = (int8_t)clz32_nonzero(z) - 8 + less_bits;
if (shift < 1) {
z >>= -shift;
k += -shift;
} else if (shift > 0) {
z <<= shift;
k -= shift;
}
z >>= less_bits;
val.i = ((k + 127) << 23) | (z & 0x007FFFFF);
val.i &= 0xFFFFFFFC;
return val.f;
}
// Fallback for vfpu_sin. The reduced input is multiplied by M_PI_2 before sin() is
// called, so the wave repeats every 4.0 input units.
//
// Special cases:
//  - Exponent field 255 (infinity or NaN): returns a NaN that keeps the input's sign.
//  - Exponent field below PRECISION_EXP_THRESHOLD: returns a zero with the input's sign.
//  - The range-reduced value is zero, or too small to keep a positive exponent:
//    returns a signed zero.
// The two lowest bits of a computed result are cleared.
float vfpu_sin_fallback(float a) {
	float2int val;
	val.f = a;

	int32_t k = get_uexp(val.i);
	if (k == 255) {
		val.i = (val.i & 0xFF800001) | 1;
		return val.f;
	}

	if (k < PRECISION_EXP_THRESHOLD) {
		val.i &= 0x80000000;
		return val.f;
	}

	// Okay, now modulus by 4 to begin with (identical wave every 4.)
	// The mantissa is kept unsigned so the left shifts below are well defined even when
	// bits are pushed past bit 31 (a signed shift would overflow).
	uint32_t mantissa = get_mant(val.i);
	if (k > 0x80) {
		const uint8_t over = k & 0x1F;
		mantissa = (mantissa << over) & 0x00FFFFFF;
		k = 0x80;
	}
	// This subtracts off the 2.  If we do, flip sign to inverse the wave.
	if (k == 0x80 && mantissa >= (1 << 23)) {
		val.i ^= 0x80000000;
		mantissa -= 1 << 23;
	}

	// A zero mantissa has no leading one to normalize. Skipping the normalization for
	// it avoids shifting a 32-bit value by 32, which is undefined.
	bool vanished = mantissa == 0;
	if (!vanished) {
		const int8_t norm_shift = (int8_t)clz32_nonzero(mantissa) - 8;
		mantissa <<= norm_shift;
		k -= norm_shift;
		vanished = k <= 0;
	}

	if (vanished) {
		val.i &= 0x80000000;
		return val.f;
	}

	// This is the value with modulus applied.
	val.i = (val.i & 0x80000000) | (k << 23) | (mantissa & ~(1 << 23));
	val.f = (float)sin((double)val.f * M_PI_2);
	val.i &= 0xFFFFFFFC;
	return val.f;
}
// Fallback for vfpu_cos. The reduced input is multiplied by M_PI_2 before cos() is
// called, so the wave repeats every 4.0 input units.
//
// Special cases:
//  - Exponent field 255 (infinity or NaN): returns a NaN with the sign bit clear.
//  - Exponent field below PRECISION_EXP_THRESHOLD: returns 1.0f.
//  - The range-reduced value is zero, or too small to keep a positive exponent:
//    returns 1.0f, or -1.0f when 2 was subtracted during reduction.
//  - The range-reduced value is exactly +/-1.0: returns a zero whose sign depends on
//    whether 2 was subtracted.
// The two lowest bits of a computed result are cleared.
float vfpu_cos_fallback(float a) {
	float2int val;
	val.f = a;
	bool negate = false;

	int32_t k = get_uexp(val.i);
	if (k == 255) {
		// Note: unlike sin, cos always returns +NAN.
		val.i = (val.i & 0x7F800001) | 1;
		return val.f;
	}

	if (k < PRECISION_EXP_THRESHOLD)
		return 1.0f;

	// Okay, now modulus by 4 to begin with (identical wave every 4.)
	// The mantissa is kept unsigned so the left shifts below are well defined even when
	// bits are pushed past bit 31 (a signed shift would overflow).
	uint32_t mantissa = get_mant(val.i);
	if (k > 0x80) {
		const uint8_t over = k & 0x1F;
		mantissa = (mantissa << over) & 0x00FFFFFF;
		k = 0x80;
	}
	// This subtracts off the 2.  If we do, negate the result value.
	if (k == 0x80 && mantissa >= (1 << 23)) {
		mantissa -= 1 << 23;
		negate = true;
	}

	// A zero mantissa has no leading one to normalize. Skipping the normalization for
	// it avoids shifting a 32-bit value by 32, which is undefined.
	bool vanished = mantissa == 0;
	if (!vanished) {
		const int8_t norm_shift = (int8_t)clz32_nonzero(mantissa) - 8;
		mantissa <<= norm_shift;
		k -= norm_shift;
		vanished = k <= 0;
	}

	if (vanished)
		return negate ? -1.0f : 1.0f;

	// This is the value with modulus applied.
	val.i = (val.i & 0x80000000) | (k << 23) | (mantissa & ~(1 << 23));
	if (val.f == 1.0f || val.f == -1.0f) {
		return negate ? 0.0f : -0.0f;
	}
	val.f = (float)cos((double)val.f * M_PI_2);
	val.i &= 0xFFFFFFFC;
	return negate ? -val.f : val.f;
}
// Fallback that produces sine and cosine of the same input in one call. The reduced
// input is multiplied by M_PI_2 before sin()/cos() are called, so both waves repeat
// every 4.0 input units.
//
//   a: input value.
//   s: receives the sine result.
//   c: receives the cosine result.
//
// Special cases:
//  - Exponent field 255 (infinity or NaN): s is a NaN keeping the input's sign, c is
//    the same NaN with the sign bit clear.
//  - Exponent field below PRECISION_EXP_THRESHOLD: s is a zero with the input's sign,
//    c is 1.0f.
//  - The range-reduced value is zero, or too small to keep a positive exponent: s is a
//    signed zero and c is +/-1.0f, both flipped when 2 was subtracted during reduction.
//  - The range-reduced value is exactly +/-1.0: s is +/-1.0f and c is a signed zero.
// The two lowest bits of both computed results are cleared.
void vfpu_sincos_fallback(float a, float &s, float &c) {
	float2int val;
	val.f = a;
	// For sin, negate the input, for cos negate the output.
	bool negate = false;

	int32_t k = get_uexp(val.i);
	if (k == 255) {
		val.i = (val.i & 0xFF800001) | 1;
		s = val.f;
		val.i &= 0x7F800001;
		c = val.f;
		return;
	}

	if (k < PRECISION_EXP_THRESHOLD) {
		val.i &= 0x80000000;
		s = val.f;
		c = 1.0f;
		return;
	}

	// Okay, now modulus by 4 to begin with (identical wave every 4.)
	// The mantissa is kept unsigned so the left shifts below are well defined even when
	// bits are pushed past bit 31 (a signed shift would overflow).
	uint32_t mantissa = get_mant(val.i);
	if (k > 0x80) {
		const uint8_t over = k & 0x1F;
		mantissa = (mantissa << over) & 0x00FFFFFF;
		k = 0x80;
	}
	// This subtracts off the 2.  If we do, flip signs.
	if (k == 0x80 && mantissa >= (1 << 23)) {
		mantissa -= 1 << 23;
		negate = true;
	}

	// A zero mantissa has no leading one to normalize. Skipping the normalization for
	// it avoids shifting a 32-bit value by 32, which is undefined.
	bool vanished = mantissa == 0;
	if (!vanished) {
		const int8_t norm_shift = (int8_t)clz32_nonzero(mantissa) - 8;
		mantissa <<= norm_shift;
		k -= norm_shift;
		vanished = k <= 0;
	}

	if (vanished) {
		val.i &= 0x80000000;
		if (negate)
			val.i ^= 0x80000000;
		s = val.f;
		c = negate ? -1.0f : 1.0f;
		return;
	}

	// This is the value with modulus applied.
	val.i = (val.i & 0x80000000) | (k << 23) | (mantissa & ~(1 << 23));
	float2int i_sine, i_cosine;
	if (val.f == 1.0f) {
		i_sine.f = negate ? -1.0f : 1.0f;
		i_cosine.f = negate ? 0.0f : -0.0f;
	} else if (val.f == -1.0f) {
		i_sine.f = negate ? 1.0f : -1.0f;
		i_cosine.f = negate ? 0.0f : -0.0f;
	} else if (negate) {
		i_sine.f = (float)sin((double)-val.f * M_PI_2);
		i_cosine.f = -(float)cos((double)val.f * M_PI_2);
	} else {
		double angle = (double)val.f * M_PI_2;
#if defined(__linux__)
		// Linux offers a combined call that yields both results at once.
		double d_sine;
		double d_cosine;
		sincos(angle, &d_sine, &d_cosine);
		i_sine.f = (float)d_sine;
		i_cosine.f = (float)d_cosine;
#else
		i_sine.f = (float)sin(angle);
		i_cosine.f = (float)cos(angle);
#endif
	}

	i_sine.i &= 0xFFFFFFFC;
	i_cosine.i &= 0xFFFFFFFC;
	s = i_sine.f;
	c = i_cosine.f;
	return;
}

View File

@ -0,0 +1,14 @@
#pragma once
// These are our old implementations of the VFPU math functions, which don't make use of
// the accuracy-improving tables from #16984.

// asinf() of the input, divided by M_PI_2.
float vfpu_asin_fallback(float angle);
// Integer-based square root on the raw float bits; the two lowest result bits are cleared.
float vfpu_sqrt_fallback(float a);
// Integer-based reciprocal square root on the raw float bits; the two lowest result bits
// are cleared.
float vfpu_rsqrt_fallback(float a);
// Sine of the input after it has been range-reduced and multiplied by M_PI_2.
float vfpu_sin_fallback(float a);
// Cosine of the input after it has been range-reduced and multiplied by M_PI_2.
float vfpu_cos_fallback(float a);
// Computes both of the above in one call, writing the sine to s and the cosine to c.
void vfpu_sincos_fallback(float a, float &s, float &c);
// Plain 1.0f / x.
float vfpu_rcp_fallback(float x);
// Plain log2f(x).
float vfpu_log2_fallback(float x);
// Plain exp2f(x).
float vfpu_exp2_fallback(float x);

View File

@ -26,6 +26,7 @@
#include "Core/Reporting.h"
#include "Core/MIPS/MIPS.h"
#include "Core/MIPS/MIPSVFPUUtils.h"
#include "Core/MIPS/MIPSVFPUFallbacks.h"
#ifdef _MSC_VER
#pragma warning(disable: 4146)
@ -666,23 +667,6 @@ float Float16ToFloat32(unsigned short l)
return f;
}
static uint32_t get_uexp(uint32_t x) {
return (x >> 23) & 0xFF;
}
int32_t get_exp(uint32_t x) {
return get_uexp(x) - 127;
}
static int32_t get_mant(uint32_t x) {
// Note: this returns the hidden 1.
return (x & 0x007FFFFF) | 0x00800000;
}
static int32_t get_sign(uint32_t x) {
return x & 0x80000000;
}
float vfpu_dot(const float a[4], const float b[4]) {
static const int EXTRA_BITS = 2;
float2int result;
@ -830,12 +814,12 @@ static uint16_t (*vfpu_asin_lut_indices)=nullptr;
static int8_t (*vfpu_rcp_lut)[2]=nullptr;
template<typename T>
static inline bool load_vfpu_table(T *&ptr,const char *filename, size_t expected_size) {
static inline bool load_vfpu_table(T *&ptr, const char *filename, size_t expected_size) {
#if COMMON_BIG_ENDIAN
// Tables are little-endian.
#error Byteswap for VFPU tables not implemented
#endif
if(ptr) return true; // Already loaded.
if (ptr) return true; // Already loaded.
size_t size = 0u;
INFO_LOG(CPU, "Loading '%s'...", filename);
ptr = reinterpret_cast<decltype(&*ptr)>(g_VFS.ReadFile(filename, &size));
@ -911,6 +895,8 @@ float vfpu_sin(float x) {
LOAD_TABLE(vfpu_sin_lut_delta, 262144)&&
LOAD_TABLE(vfpu_sin_lut_interval_delta, 131074)&&
LOAD_TABLE(vfpu_sin_lut_exceptions, 86938);
if (!loaded)
return vfpu_sin_fallback(x);
uint32_t bits;
memcpy(&bits, &x, sizeof(x));
uint32_t sign = bits & 0x80000000u;
@ -948,6 +934,8 @@ float vfpu_cos(float x) {
LOAD_TABLE(vfpu_sin_lut_delta, 262144)&&
LOAD_TABLE(vfpu_sin_lut_interval_delta, 131074)&&
LOAD_TABLE(vfpu_sin_lut_exceptions, 86938);
if (!loaded)
return vfpu_cos_fallback(x);
uint32_t bits;
memcpy(&bits, &x, sizeof(x));
bits &= 0x7FFFFFFFu;
@ -1053,6 +1041,8 @@ static inline uint32_t vfpu_sqrt_fixed(uint32_t x) {
float vfpu_sqrt(float x) {
static bool loaded =
LOAD_TABLE(vfpu_sqrt_lut, 262144);
if (!loaded)
return vfpu_sqrt_fallback(x);
uint32_t bits;
memcpy(&bits, &x, sizeof(bits));
if((bits & 0x7FFFFFFFu) <= 0x007FFFFFu) {
@ -1139,6 +1129,8 @@ static inline uint32_t vfpu_rsqrt_fixed(uint32_t x) {
float vfpu_rsqrt(float x) {
static bool loaded =
LOAD_TABLE(vfpu_rsqrt_lut, 262144);
if (!loaded)
return vfpu_rsqrt_fallback(x);
uint32_t bits;
memcpy(&bits, &x, sizeof(bits));
if((bits & 0x7FFFFFFFu) <= 0x007FFFFFu) {
@ -1200,6 +1192,9 @@ float vfpu_asin(float x) {
LOAD_TABLE(vfpu_asin_lut65536, 1536)&&
LOAD_TABLE(vfpu_asin_lut_indices, 798916)&&
LOAD_TABLE(vfpu_asin_lut_deltas, 517448);
if (!loaded)
return vfpu_asin_fallback(x);
uint32_t bits;
memcpy(&bits, &x, sizeof(x));
uint32_t sign = bits & 0x80000000u;
@ -1238,8 +1233,10 @@ static inline uint32_t vfpu_exp2_fixed(uint32_t x) {
float vfpu_exp2(float x) {
static bool loaded =
LOAD_TABLE(vfpu_exp2_lut65536, 512)&&
LOAD_TABLE(vfpu_exp2_lut65536, 512)&&
LOAD_TABLE(vfpu_exp2_lut, 262144);
if (!loaded)
return vfpu_exp2_fallback(x);
int32_t bits;
memcpy(&bits, &x, sizeof(bits));
if((bits & 0x7FFFFFFF) <= 0x007FFFFF) {
@ -1277,9 +1274,9 @@ float vfpu_rexp2(float x) {
// Input fixed 9.23, output fixed 10.22.
// Returns log2(1+x).
static inline uint32_t vfpu_log2_approx(uint32_t x) {
uint32_t a=vfpu_log2_lut65536[(x >> 16) + 0];
uint32_t b=vfpu_log2_lut65536[(x >> 16) + 1];
uint32_t c=vfpu_log2_lut65536_quadratic[x >> 16];
uint32_t a = vfpu_log2_lut65536[(x >> 16) + 0];
uint32_t b = vfpu_log2_lut65536[(x >> 16) + 1];
uint32_t c = vfpu_log2_lut65536_quadratic[x >> 16];
x &= 0xFFFFu;
uint64_t ret = uint64_t(a) * (0x10000u - x) + uint64_t(b) * x;
uint64_t d = (uint64_t(c) * x * (0x10000u-x)) >> 40;
@ -1293,6 +1290,8 @@ float vfpu_log2(float x) {
LOAD_TABLE(vfpu_log2_lut65536, 516)&&
LOAD_TABLE(vfpu_log2_lut65536_quadratic, 512)&&
LOAD_TABLE(vfpu_log2_lut, 2097152);
if (!loaded)
return vfpu_log2_fallback(x);
uint32_t bits;
memcpy(&bits, &x, sizeof(bits));
if((bits & 0x7FFFFFFFu) <= 0x007FFFFFu) {
@ -1345,6 +1344,8 @@ static inline uint32_t vfpu_rcp_approx(uint32_t i) {
float vfpu_rcp(float x) {
static bool loaded =
LOAD_TABLE(vfpu_rcp_lut, 262144);
if (!loaded)
return vfpu_rcp_fallback(x);
uint32_t bits;
memcpy(&bits, &x, sizeof(bits));
uint32_t s = bits & 0x80000000u;

View File

@ -67,6 +67,23 @@ extern float vfpu_rexp2(float);
extern float vfpu_log2(float);
extern float vfpu_rcp(float);
// Returns the 8-bit exponent field (bits 23..30) of a float bit pattern, still biased.
inline uint32_t get_uexp(uint32_t x) {
	const uint32_t above_mantissa = x >> 23;
	return above_mantissa & 0xFF;
}
// Returns the exponent of a float bit pattern with the bias of 127 removed.
inline int32_t get_exp(uint32_t x) {
	const uint32_t biased = get_uexp(x);
	return biased - 127;
}
// Returns the 23 mantissa bits of a float bit pattern with bit 23 set on top of them.
inline int32_t get_mant(uint32_t x) {
	const uint32_t fraction = x & 0x007FFFFF;
	// Note: this returns the hidden 1.
	return fraction | 0x00800000;
}
// Returns the input with everything but the sign bit (bit 31) cleared, so the result is
// zero exactly when the sign bit is clear.
inline int32_t get_sign(uint32_t x) {
	const uint32_t sign_mask = 0x80000000;
	return x & sign_mask;
}
#define VFPU_FLOAT16_EXP_MAX 0x1f
#define VFPU_SH_FLOAT16_SIGN 15
#define VFPU_MASK_FLOAT16_SIGN 0x1

View File

@ -347,6 +347,7 @@ EXEC_AND_LIB_FILES := \
$(SRC)/Core/MIPS/MIPSStackWalk.cpp \
$(SRC)/Core/MIPS/MIPSTables.cpp \
$(SRC)/Core/MIPS/MIPSVFPUUtils.cpp.arm \
$(SRC)/Core/MIPS/MIPSVFPUFallbacks.cpp.arm \
$(SRC)/Core/MIPS/MIPSCodeUtils.cpp.arm \
$(SRC)/Core/MIPS/MIPSDebugInterface.cpp \
$(SRC)/Core/MIPS/IR/IRFrontend.cpp \

View File

@ -652,6 +652,7 @@ SOURCES_CXX += \
$(COREDIR)/MIPS/MIPSTables.cpp \
$(COREDIR)/MIPS/MIPSStackWalk.cpp \
$(COREDIR)/MIPS/MIPSVFPUUtils.cpp \
$(COREDIR)/MIPS/MIPSVFPUFallbacks.cpp \
$(COREDIR)/MemFault.cpp \
$(COREDIR)/MemMap.cpp \
$(COREDIR)/MemMapFunctions.cpp \