Merge pull request #16984 from fp64/vfpu-sincos

VFPU sin/cos
This commit is contained in:
Henrik Rydgård 2023-03-28 16:36:51 +02:00 committed by GitHub
commit 98d9a9c8ca
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
22 changed files with 633 additions and 279 deletions

View File

@ -2388,6 +2388,7 @@ set(NativeAssets
assets/lang
assets/shaders
assets/themes
assets/vfpu
assets/Roboto-Condensed.ttf
assets/7z.png
assets/compat.ini
@ -2498,6 +2499,7 @@ if(TargetBin)
file(GLOB_RECURSE SHADER_FILES assets/shaders/*)
file(GLOB_RECURSE THEME_FILE assets/themes/*)
file(GLOB_RECURSE DEBUGGER_FILES assets/debugger/*)
file(GLOB_RECURSE VFPU_FILES assets/vfpu/*)
if(NOT IOS)
set_source_files_properties(${BigFontAssets} PROPERTIES MACOSX_PACKAGE_LOCATION "Resources/assets")
@ -2507,6 +2509,7 @@ if(TargetBin)
set_source_files_properties(${SHADER_FILES} PROPERTIES MACOSX_PACKAGE_LOCATION "Resources/assets/shaders")
set_source_files_properties(${THEME_FILE} PROPERTIES MACOSX_PACKAGE_LOCATION "Resources/assets/themes")
set_source_files_properties(${DEBUGGER_FILES} PROPERTIES MACOSX_PACKAGE_LOCATION "Resources/assets/debugger")
set_source_files_properties(${VFPU_FILES} PROPERTIES MACOSX_PACKAGE_LOCATION "Resources/assets/vfpu")
endif()
if(IOS)

View File

@ -629,18 +629,18 @@ namespace MIPSInt
// vsat0 changes -0.0 to +0.0, both retain NAN.
case 4: if (s[i] <= 0) d[i] = 0; else {if(s[i] > 1.0f) d[i] = 1.0f; else d[i] = s[i];} break; // vsat0
case 5: if (s[i] < -1.0f) d[i] = -1.0f; else {if(s[i] > 1.0f) d[i] = 1.0f; else d[i] = s[i];} break; // vsat1
case 16: d[i] = 1.0f / s[i]; break; //vrcp
case 16: { d[i] = vfpu_rcp(s[i]); } break; //vrcp
case 17: d[i] = USE_VFPU_SQRT ? vfpu_rsqrt(s[i]) : 1.0f / sqrtf(s[i]); break; //vrsq
case 18: { d[i] = vfpu_sin(s[i]); } break; //vsin
case 19: { d[i] = vfpu_cos(s[i]); } break; //vcos
case 20: d[i] = powf(2.0f, s[i]); break; //vexp2
case 21: d[i] = logf(s[i])/log(2.0f); break; //vlog2
case 20: { d[i] = vfpu_exp2(s[i]); } break; //vexp2
case 21: { d[i] = vfpu_log2(s[i]); } break; //vlog2
case 22: d[i] = USE_VFPU_SQRT ? vfpu_sqrt(s[i]) : fabsf(sqrtf(s[i])); break; //vsqrt
case 23: d[i] = (float)(asinf(s[i]) / M_PI_2); break; //vasin
case 24: d[i] = -1.0f / s[i]; break; // vnrcp
case 23: { d[i] = vfpu_asin(s[i]); } break; //vasin
case 24: { d[i] = -vfpu_rcp(s[i]); } break; // vnrcp
case 26: { d[i] = -vfpu_sin(s[i]); } break; // vnsin
case 28: d[i] = 1.0f / powf(2.0, s[i]); break; // vrexp2
case 28: { d[i] = vfpu_rexp2(s[i]); } break; // vrexp2
default:
_dbg_assert_msg_( false, "Invalid VV2Op op type %d", optype);
break;

View File

@ -22,6 +22,7 @@
#include "Common/BitScan.h"
#include "Common/CommonFuncs.h"
#include "Common/File/VFS/VFS.h"
#include "Core/Reporting.h"
#include "Core/MIPS/MIPS.h"
#include "Core/MIPS/MIPSVFPUUtils.h"
@ -29,13 +30,6 @@
#define V(i) (currentMIPS->v[voffset[i]])
#define VI(i) (currentMIPS->vi[voffset[i]])
// Flushes the angle to 0 if exponent smaller than this in vfpu_sin/vfpu_cos/vfpu_sincos.
// Was measured to be around 0x68, but GTA on Mac is somehow super sensitive
// to the shape of the sine curve which seem to be very slightly different.
//
// So setting a lower value.
#define PRECISION_EXP_THRESHOLD 0x65
union float2int {
uint32_t i;
float f;
@ -798,274 +792,600 @@ float vfpu_dot(const float a[4], const float b[4]) {
return result.f;
}
// TODO: This is still not completely accurate compared to the PSP's vsqrt.
float vfpu_sqrt(float a) {
float2int val;
val.f = a;
//==============================================================================
// The code below attempts to exactly match the output of
// several PSP's VFPU functions. For the sake of
// making lookup tables smaller the code is
// somewhat gnarly.
// Lookup tables sometimes store deltas from (explicitly computable)
// estimations, to allow to store them in smaller types.
// See https://github.com/hrydgard/ppsspp/issues/16946 for details.
if ((val.i & 0xff800000) == 0x7f800000) {
if ((val.i & 0x007fffff) != 0) {
val.i = 0x7f800001;
// Lookup tables.
// Note: these are never unloaded, and stay till program termination.
static uint32_t (*vfpu_sin_lut8192)=nullptr;
static int8_t (*vfpu_sin_lut_delta)[2]=nullptr;
static int16_t (*vfpu_sin_lut_interval_delta)=nullptr;
static uint8_t (*vfpu_sin_lut_exceptions)=nullptr;
static int8_t (*vfpu_sqrt_lut)[2]=nullptr;
static int8_t (*vfpu_rsqrt_lut)[2]=nullptr;
static uint32_t (*vfpu_exp2_lut65536)=nullptr;
static uint8_t (*vfpu_exp2_lut)[2]=nullptr;
static uint32_t (*vfpu_log2_lut65536)=nullptr;
static uint32_t (*vfpu_log2_lut65536_quadratic)=nullptr;
static uint8_t (*vfpu_log2_lut)[131072][2]=nullptr;
static int32_t (*vfpu_asin_lut65536)[3]=nullptr;
static uint64_t (*vfpu_asin_lut_deltas)=nullptr;
static uint16_t (*vfpu_asin_lut_indices)=nullptr;
static int8_t (*vfpu_rcp_lut)[2]=nullptr;
template<typename T>
static inline bool load_vfpu_table(T *&ptr,const char *filename, size_t expected_size) {
#if COMMON_BIG_ENDIAN
// Tables are little-endian.
#error Byteswap for VFPU tables not implemented
#endif
if(ptr) return true; // Already loaded.
size_t size = 0u;
INFO_LOG(CPU, "Loading '%s'...", filename);
ptr = reinterpret_cast<decltype(&*ptr)>(g_VFS.ReadFile(filename, &size));
if(!ptr || size != expected_size)
{
ERROR_LOG(CPU, "Error loading '%s' (size=%u, expected: %u)", filename, (unsigned)size, (unsigned)expected_size);
if(ptr) delete[] ptr;
ptr = nullptr;
return false;
}
INFO_LOG(CPU, "Successfully loaded '%s'", filename);
return true;
}
#define LOAD_TABLE(name, expected_size)\
load_vfpu_table(name,"vfpu/" #name ".dat",expected_size)
// Note: PSP sin/cos output only has 22 significant
// binary digits.
static inline uint32_t vfpu_sin_quantum(uint32_t x) {
return x < 1u << 22?
1u:
1u << (32 - 22 - clz32_nonzero(x));
}
static inline uint32_t vfpu_sin_truncate_bits(u32 x) {
return x & -vfpu_sin_quantum(x);
}
static inline uint32_t vfpu_sin_fixed(uint32_t arg) {
// Handle endpoints.
if(arg == 0u) return 0u;
if(arg == 0x00800000) return 0x10000000;
// Get endpoints for 8192-wide interval.
uint32_t L = vfpu_sin_lut8192[(arg >> 13) + 0];
uint32_t H = vfpu_sin_lut8192[(arg >> 13) + 1];
// Approximate endpoints for 64-wide interval via lerp.
uint32_t A = L+(((H - L)*(((arg >> 6) & 127) + 0)) >> 7);
uint32_t B = L+(((H - L)*(((arg >> 6) & 127) + 1)) >> 7);
// Adjust endpoints from deltas, and increase working precision.
uint64_t a = (uint64_t(A) << 5) + uint64_t(vfpu_sin_lut_delta[arg >> 6][0]) * vfpu_sin_quantum(A);
uint64_t b = (uint64_t(B) << 5) + uint64_t(vfpu_sin_lut_delta[arg >> 6][1]) * vfpu_sin_quantum(B);
// Compute approximation via lerp. Is off by at most 1 quantum.
uint32_t v = uint32_t(((a * (64 - (arg & 63)) + b * (arg & 63)) >> 6) >> 5);
v=vfpu_sin_truncate_bits(v);
// Look up exceptions via binary search.
// Note: vfpu_sin_lut_interval_delta stores
// deltas from interval estimation.
uint32_t lo = ((169u * ((arg >> 7) + 0)) >> 7)+uint32_t(vfpu_sin_lut_interval_delta[(arg >> 7) + 0]) + 16384u;
uint32_t hi = ((169u * ((arg >> 7) + 1)) >> 7)+uint32_t(vfpu_sin_lut_interval_delta[(arg >> 7) + 1]) + 16384u;
while(lo < hi) {
uint32_t m = (lo + hi) / 2;
// Note: vfpu_sin_lut_exceptions stores
// index&127 (for each initial interval the
// upper bits of index are the same, namely
// arg&-128), plus direction (0 for +1, and
// 128 for -1).
uint32_t b = vfpu_sin_lut_exceptions[m];
uint32_t e = (arg & -128u)+(b & 127u);
if(e == arg) {
v += vfpu_sin_quantum(v) * (b >> 7 ? -1u : +1u);
break;
}
return val.f;
else if(e < arg) lo = m + 1;
else hi = m;
}
if ((val.i & 0x7f800000) == 0) {
// Kill any sign.
val.i = 0;
return val.f;
}
if (val.i & 0x80000000) {
val.i = 0x7f800001;
return val.f;
}
int k = get_exp(val.i);
uint32_t sp = get_mant(val.i);
int less_bits = k & 1;
k >>= 1;
uint32_t z = 0x00C00000 >> less_bits;
int64_t halfsp = sp >> 1;
halfsp <<= 23 - less_bits;
for (int i = 0; i < 6; ++i) {
z = (z >> 1) + (uint32_t)(halfsp / z);
}
val.i = ((k + 127) << 23) | ((z << less_bits) & 0x007FFFFF);
// The lower two bits never end up set on the PSP, it seems like.
val.i &= 0xFFFFFFFC;
return val.f;
return v;
}
static inline uint32_t mant_mul(uint32_t a, uint32_t b) {
uint64_t m = (uint64_t)a * (uint64_t)b;
if (m & 0x007FFFFF) {
m += 0x01437000;
float vfpu_sin(float x) {
static bool loaded =
LOAD_TABLE(vfpu_sin_lut8192, 4100)&&
LOAD_TABLE(vfpu_sin_lut_delta, 262144)&&
LOAD_TABLE(vfpu_sin_lut_interval_delta, 131074)&&
LOAD_TABLE(vfpu_sin_lut_exceptions, 86938);
uint32_t bits;
memcpy(&bits, &x, sizeof(x));
uint32_t sign = bits & 0x80000000u;
uint32_t exponent = (bits >> 23) & 0xFFu;
uint32_t significand = (bits & 0x007FFFFFu) | 0x00800000u;
if(exponent == 0xFFu) {
// NOTE: this bitpattern is a signaling
// NaN on x86, so maybe just return
// a normal qNaN?
float y;
bits=sign ^ 0x7F800001u;
memcpy(&y, &bits, sizeof(y));
return y;
}
return (uint32_t)(m >> 23);
if(exponent < 0x7Fu) {
if(exponent < 0x7Fu-23u) significand = 0u;
else significand >>= (0x7F - exponent);
}
else if(exponent > 0x7Fu) {
// There is weirdness for large exponents.
if(exponent - 0x7Fu >= 25u && exponent - 0x7Fu < 32u) significand = 0u;
else if((exponent & 0x9Fu) == 0x9Fu) significand = 0u;
else significand <<= (exponent - 0x7Fu);
}
sign ^= ((significand << 7) & 0x80000000u);
significand &= 0x00FFFFFFu;
if(significand > 0x00800000u) significand = 0x01000000u - significand;
uint32_t ret = vfpu_sin_fixed(significand);
return (sign ? -1.0f : +1.0f) * float(int32_t(ret)) * 3.7252903e-09f; // 0x1p-28f
}
float vfpu_rsqrt(float a) {
float2int val;
val.f = a;
if (val.i == 0x7f800000) {
return 0.0f;
float vfpu_cos(float x) {
static bool loaded =
LOAD_TABLE(vfpu_sin_lut8192, 4100)&&
LOAD_TABLE(vfpu_sin_lut_delta, 262144)&&
LOAD_TABLE(vfpu_sin_lut_interval_delta, 131074)&&
LOAD_TABLE(vfpu_sin_lut_exceptions, 86938);
uint32_t bits;
memcpy(&bits, &x, sizeof(x));
bits &= 0x7FFFFFFFu;
uint32_t sign = 0u;
uint32_t exponent = (bits >> 23) & 0xFFu;
uint32_t significand = (bits & 0x007FFFFFu) | 0x00800000u;
if(exponent == 0xFFu) {
// NOTE: this bitpattern is a signaling
// NaN on x86, so maybe just return
// a normal qNaN?
float y;
bits = sign ^ 0x7F800001u;
memcpy(&y, &bits, sizeof(y));
return y;
}
if ((val.i & 0x7fffffff) > 0x7f800000) {
val.i = (val.i & 0x80000000) | 0x7f800001;
return val.f;
if(exponent < 0x7Fu) {
if(exponent < 0x7Fu - 23u) significand = 0u;
else significand >>= (0x7F - exponent);
}
if ((val.i & 0x7f800000) == 0) {
val.i = (val.i & 0x80000000) | 0x7f800000;
return val.f;
else if(exponent > 0x7Fu) {
// There is weirdness for large exponents.
if(exponent - 0x7Fu >= 25u && exponent - 0x7Fu < 32u) significand = 0u;
else if((exponent & 0x9Fu) == 0x9Fu) significand = 0u;
else significand <<= (exponent - 0x7Fu);
}
if (val.i & 0x80000000) {
val.i = 0xff800001;
return val.f;
sign ^= ((significand << 7) & 0x80000000u);
significand &= 0x00FFFFFFu;
if(significand >= 0x00800000u) {
significand = 0x01000000u - significand;
sign ^= 0x80000000u;
}
int k = get_exp(val.i);
uint32_t sp = get_mant(val.i);
int less_bits = k & 1;
k = -(k >> 1);
uint32_t z = 0x00800000 >> less_bits;
uint32_t halfsp = sp >> (1 + less_bits);
for (int i = 0; i < 6; ++i) {
uint32_t zsq = mant_mul(z, z);
uint32_t correction = 0x00C00000 - mant_mul(halfsp, zsq);
z = mant_mul(z, correction);
}
int8_t shift = (int8_t)clz32_nonzero(z) - 8 + less_bits;
if (shift < 1) {
z >>= -shift;
k += -shift;
} else if (shift > 0) {
z <<= shift;
k -= shift;
}
z >>= less_bits;
val.i = ((k + 127) << 23) | (z & 0x007FFFFF);
val.i &= 0xFFFFFFFC;
return val.f;
}
float vfpu_sin(float a) {
float2int val;
val.f = a;
int32_t k = get_uexp(val.i);
if (k == 255) {
val.i = (val.i & 0xFF800001) | 1;
return val.f;
}
if (k < PRECISION_EXP_THRESHOLD) {
val.i &= 0x80000000;
return val.f;
}
// Okay, now modulus by 4 to begin with (identical wave every 4.)
int32_t mantissa = get_mant(val.i);
if (k > 0x80) {
const uint8_t over = k & 0x1F;
mantissa = (mantissa << over) & 0x00FFFFFF;
k = 0x80;
}
// This subtracts off the 2. If we do, flip sign to inverse the wave.
if (k == 0x80 && mantissa >= (1 << 23)) {
val.i ^= 0x80000000;
mantissa -= 1 << 23;
}
int8_t norm_shift = mantissa == 0 ? 32 : (int8_t)clz32_nonzero(mantissa) - 8;
mantissa <<= norm_shift;
k -= norm_shift;
if (k <= 0 || mantissa == 0) {
val.i &= 0x80000000;
return val.f;
}
// This is the value with modulus applied.
val.i = (val.i & 0x80000000) | (k << 23) | (mantissa & ~(1 << 23));
val.f = (float)sin((double)val.f * M_PI_2);
val.i &= 0xFFFFFFFC;
return val.f;
}
float vfpu_cos(float a) {
float2int val;
val.f = a;
bool negate = false;
int32_t k = get_uexp(val.i);
if (k == 255) {
// Note: unlike sin, cos always returns +NAN.
val.i = (val.i & 0x7F800001) | 1;
return val.f;
}
if (k < PRECISION_EXP_THRESHOLD)
return 1.0f;
// Okay, now modulus by 4 to begin with (identical wave every 4.)
int32_t mantissa = get_mant(val.i);
if (k > 0x80) {
const uint8_t over = k & 0x1F;
mantissa = (mantissa << over) & 0x00FFFFFF;
k = 0x80;
}
// This subtracts off the 2. If we do, negate the result value.
if (k == 0x80 && mantissa >= (1 << 23)) {
mantissa -= 1 << 23;
negate = true;
}
int8_t norm_shift = mantissa == 0 ? 32 : (int8_t)clz32_nonzero(mantissa) - 8;
mantissa <<= norm_shift;
k -= norm_shift;
if (k <= 0 || mantissa == 0)
return negate ? -1.0f : 1.0f;
// This is the value with modulus applied.
val.i = (val.i & 0x80000000) | (k << 23) | (mantissa & ~(1 << 23));
if (val.f == 1.0f || val.f == -1.0f) {
return negate ? 0.0f : -0.0f;
}
val.f = (float)cos((double)val.f * M_PI_2);
val.i &= 0xFFFFFFFC;
return negate ? -val.f : val.f;
uint32_t ret = vfpu_sin_fixed(0x00800000u - significand);
return (sign ? -1.0f : +1.0f) * float(int32_t(ret)) * 3.7252903e-09f; // 0x1p-28f
}
void vfpu_sincos(float a, float &s, float &c) {
float2int val;
val.f = a;
// For sin, negate the input, for cos negate the output.
bool negate = false;
// Just invoke both sin and cos.
// Suboptimal but whatever.
s = vfpu_sin(a);
c = vfpu_cos(a);
}
int32_t k = get_uexp(val.i);
if (k == 255) {
val.i = (val.i & 0xFF800001) | 1;
s = val.f;
val.i &= 0x7F800001;
c = val.f;
return;
}
if (k < PRECISION_EXP_THRESHOLD) {
val.i &= 0x80000000;
s = val.f;
c = 1.0f;
return;
}
// Okay, now modulus by 4 to begin with (identical wave every 4.)
int32_t mantissa = get_mant(val.i);
if (k > 0x80) {
const uint8_t over = k & 0x1F;
mantissa = (mantissa << over) & 0x00FFFFFF;
k = 0x80;
}
// This subtracts off the 2. If we do, flip signs.
if (k == 0x80 && mantissa >= (1 << 23)) {
mantissa -= 1 << 23;
negate = true;
}
int8_t norm_shift = mantissa == 0 ? 32 : (int8_t)clz32_nonzero(mantissa) - 8;
mantissa <<= norm_shift;
k -= norm_shift;
if (k <= 0 || mantissa == 0) {
val.i &= 0x80000000;
if (negate)
val.i ^= 0x80000000;
s = val.f;
c = negate ? -1.0f : 1.0f;
return;
}
// This is the value with modulus applied.
val.i = (val.i & 0x80000000) | (k << 23) | (mantissa & ~(1 << 23));
float2int i_sine, i_cosine;
if (val.f == 1.0f) {
i_sine.f = negate ? -1.0f : 1.0f;
i_cosine.f = negate ? 0.0f : -0.0f;
} else if (val.f == -1.0f) {
i_sine.f = negate ? 1.0f : -1.0f;
i_cosine.f = negate ? 0.0f : -0.0f;
} else if (negate) {
i_sine.f = (float)sin((double)-val.f * M_PI_2);
i_cosine.f = -(float)cos((double)val.f * M_PI_2);
} else {
double angle = (double)val.f * M_PI_2;
#if defined(__linux__)
double d_sine;
double d_cosine;
sincos(angle, &d_sine, &d_cosine);
i_sine.f = (float)d_sine;
i_cosine.f = (float)d_cosine;
// Integer square root of 2^23*x (rounded to zero).
// Input is in 2^23 <= x < 2^25, and representable in float.
static inline uint32_t isqrt23(uint32_t x) {
#if 0
// Reference version.
int dir=fesetround(FE_TOWARDZERO);
uint32_t ret=uint32_t(int32_t(sqrtf(float(int32_t(x)) * 8388608.0f)));
fesetround(dir);
return ret;
#elif 1
// Double version.
// Verified to produce correct result for all valid inputs,
// in all rounding modes, both in double and double-extended (x87)
// precision.
// Requires correctly-rounded sqrt (which on any IEEE-754 system
// it should be).
return uint32_t(int32_t(sqrt(double(x) * 8388608.0)));
#else
i_sine.f = (float)sin(angle);
i_cosine.f = (float)cos(angle);
// Pure integer version, if you don't like floating point.
// Based on code from Hacker's Delight. See isqrt4 in
// https://github.com/hcs0/Hackers-Delight/blob/master/isqrt.c.txt
// Relatively slow.
uint64_t t=uint64_t(x) << 23, m, y, b;
m=0x4000000000000000ull;
y=0;
while(m != 0) // Do 32 times.
{
b=y | m;
y=y >> 1;
if(t >= b)
{
t = t - b;
y = y | m;
}
m = m >> 2;
}
return y;
#endif
}
// Returns floating-point bitpattern.
static inline uint32_t vfpu_sqrt_fixed(uint32_t x) {
// Endpoints of input.
uint32_t lo =(x + 0u) & -64u;
uint32_t hi = (x + 64u) & -64u;
// Convert input to 9.23 fixed-point.
lo = (lo >= 0x00400000u ? 4u * lo : 0x00800000u + 2u * lo);
hi = (hi >= 0x00400000u ? 4u * hi : 0x00800000u + 2u * hi);
// Estimate endpoints of output.
uint32_t A = 0x3F000000u + isqrt23(lo);
uint32_t B = 0x3F000000u + isqrt23(hi);
// Apply deltas, and increase the working precision.
uint64_t a = (uint64_t(A) << 4) + uint64_t(vfpu_sqrt_lut[x >> 6][0]);
uint64_t b = (uint64_t(B) << 4) + uint64_t(vfpu_sqrt_lut[x >> 6][1]);
uint32_t ret = uint32_t((a + (((b - a) * (x & 63)) >> 6)) >> 4);
// Truncate lower 2 bits.
ret &= -4u;
return ret;
}
float vfpu_sqrt(float x) {
static bool loaded =
LOAD_TABLE(vfpu_sqrt_lut, 262144);
uint32_t bits;
memcpy(&bits, &x, sizeof(bits));
if((bits & 0x7FFFFFFFu) <= 0x007FFFFFu) {
// Denormals (and zeroes) get +0, regardless
// of sign.
return +0.0f;
}
if(bits >> 31) {
// Other negatives get NaN.
bits = 0x7F800001u;
memcpy(&x, &bits, sizeof(x));
return x;
}
if((bits >> 23) == 255u) {
// Inf/NaN gets Inf/NaN.
bits = 0x7F800000u + ((bits & 0x007FFFFFu) != 0u);
memcpy(&x, &bits, sizeof(bits));
return x;
}
int32_t exponent = int32_t(bits >> 23) - 127;
// Bottom bit of exponent (inverted) + significand (except bottom bit).
uint32_t index = ((bits + 0x00800000u) >> 1) & 0x007FFFFFu;
bits = vfpu_sqrt_fixed(index);
bits += uint32_t(exponent >> 1) << 23;
memcpy(&x, &bits, sizeof(bits));
return x;
}
// Returns floor(2^33/sqrt(x)), for 2^22 <= x < 2^24.
static inline uint32_t rsqrt_floor22(uint32_t x) {
#if 1
// Verified correct in all rounding directions,
// by exhaustive search.
return uint32_t(8589934592.0 / sqrt(double(x))); // 0x1p33
#else
// Pure integer version, if you don't like floating point.
// Based on code from Hacker's Delight. See isqrt4 in
// https://github.com/hcs0/Hackers-Delight/blob/master/isqrt.c.txt
// Relatively slow.
uint64_t t=uint64_t(x) << 22, m, y, b;
m = 0x4000000000000000ull;
y = 0;
while(m != 0) // Do 32 times.
{
b = y | m;
y = y >> 1;
if(t >= b)
{
t = t - b;
y = y | m;
}
m = m >> 2;
}
y = (1ull << 44) / y;
// Decrement if y > floor(2^33 / sqrt(x)).
// This hack works because exhaustive
// search (on [2^22;2^24]) says it does.
if((y * y >> 3) * x > (1ull << 63) - 3ull * (((y & 7) == 6) << 21)) --y;
return uint32_t(y);
#endif
}
// Returns floating-point bitpattern.
static inline uint32_t vfpu_rsqrt_fixed(uint32_t x) {
// Endpoints of input.
uint32_t lo = (x + 0u) & -64u;
uint32_t hi = (x + 64u) & -64u;
// Convert input to 10.22 fixed-point.
lo = (lo >= 0x00400000u ? 2u * lo : 0x00400000u + lo);
hi = (hi >= 0x00400000u ? 2u * hi : 0x00400000u + hi);
// Estimate endpoints of output.
uint32_t A = 0x3E800000u + 4u * rsqrt_floor22(lo);
uint32_t B = 0x3E800000u + 4u * rsqrt_floor22(hi);
// Apply deltas, and increase the working precision.
uint64_t a = (uint64_t(A) << 4) + uint64_t(vfpu_rsqrt_lut[x >> 6][0]);
uint64_t b = (uint64_t(B) << 4) + uint64_t(vfpu_rsqrt_lut[x >> 6][1]);
// Evaluate via lerp.
uint32_t ret = uint32_t((a + (((b - a) * (x & 63)) >> 6)) >> 4);
// Truncate lower 2 bits.
ret &= -4u;
return ret;
}
float vfpu_rsqrt(float x) {
static bool loaded =
LOAD_TABLE(vfpu_rsqrt_lut, 262144);
uint32_t bits;
memcpy(&bits, &x, sizeof(bits));
if((bits & 0x7FFFFFFFu) <= 0x007FFFFFu) {
// Denormals (and zeroes) get inf of the same sign.
bits = 0x7F800000u | (bits & 0x80000000u);
memcpy(&x, &bits, sizeof(x));
return x;
}
if(bits >> 31) {
// Other negatives get negative NaN.
bits = 0xFF800001u;
memcpy(&x, &bits, sizeof(x));
return x;
}
if((bits >> 23) == 255u) {
// inf gets 0, NaN gets NaN.
bits = ((bits & 0x007FFFFFu) ? 0x7F800001u : 0u);
memcpy(&x, &bits, sizeof(bits));
return x;
}
int32_t exponent = int32_t(bits >> 23) - 127;
// Bottom bit of exponent (inverted) + significand (except bottom bit).
uint32_t index = ((bits + 0x00800000u) >> 1) & 0x007FFFFFu;
bits = vfpu_rsqrt_fixed(index);
bits -= uint32_t(exponent >> 1) << 23;
memcpy(&x, &bits, sizeof(bits));
return x;
}
static inline uint32_t vfpu_asin_quantum(uint32_t x) {
return x<1u<<23?
1u:
1u<<(32-23-clz32_nonzero(x));
}
static inline uint32_t vfpu_asin_truncate_bits(uint32_t x) {
return x & -vfpu_asin_quantum(x);
}
// Input is fixed 9.23, output is fixed 2.30.
static inline uint32_t vfpu_asin_approx(uint32_t x) {
const int32_t *C = vfpu_asin_lut65536[x >> 16];
x &= 0xFFFFu;
return vfpu_asin_truncate_bits(uint32_t((((((int64_t(C[2]) * x) >> 16) + int64_t(C[1])) * x) >> 16) + C[0]));
}
// Input is fixed 9.23, output is fixed 2.30.
static uint32_t vfpu_asin_fixed(uint32_t x) {
if(x == 0u) return 0u;
if(x == 1u << 23) return 1u << 30;
uint32_t ret = vfpu_asin_approx(x);
uint32_t index = vfpu_asin_lut_indices[x / 21u];
uint64_t deltas = vfpu_asin_lut_deltas[index];
return ret + (3u - uint32_t((deltas >> (3u * (x % 21u))) & 7u)) * vfpu_asin_quantum(ret);
}
float vfpu_asin(float x) {
static bool loaded =
LOAD_TABLE(vfpu_asin_lut65536, 1536)&&
LOAD_TABLE(vfpu_asin_lut_indices, 798916)&&
LOAD_TABLE(vfpu_asin_lut_deltas, 517448);
uint32_t bits;
memcpy(&bits, &x, sizeof(x));
uint32_t sign = bits & 0x80000000u;
bits = bits & 0x7FFFFFFFu;
if(bits > 0x3F800000u) {
bits = 0x7F800001u ^ sign;
memcpy(&x, &bits, sizeof(x));
return x;
}
i_sine.i &= 0xFFFFFFFC;
i_cosine.i &= 0xFFFFFFFC;
s = i_sine.f;
c = i_cosine.f;
return ;
bits = vfpu_asin_fixed(uint32_t(int32_t(fabsf(x) * 8388608.0f))); // 0x1p23
x=float(int32_t(bits)) * 9.31322574615478515625e-10f; // 0x1p-30
if(sign) x = -x;
return x;
}
void InitVFPUSinCos() {
// TODO: Could prepare a CORDIC table here.
static inline uint32_t vfpu_exp2_approx(uint32_t x) {
if(x == 0x00800000u) return 0x00800000u;
uint32_t a=vfpu_exp2_lut65536[x >> 16];
x &= 0x0000FFFFu;
uint32_t b = uint32_t(((2977151143ull * x) >> 23) + ((1032119999ull * (x * x)) >> 46));
return (a + uint32_t((uint64_t(a + (1u << 23)) * uint64_t(b)) >> 32)) & -4u;
}
static inline uint32_t vfpu_exp2_fixed(uint32_t x) {
if(x == 0u) return 0u;
if(x == 0x00800000u) return 0x00800000u;
uint32_t A = vfpu_exp2_approx((x ) & -64u);
uint32_t B = vfpu_exp2_approx((x + 64) & -64u);
uint64_t a = (A<<4)+vfpu_exp2_lut[x >> 6][0]-64u;
uint64_t b = (B<<4)+vfpu_exp2_lut[x >> 6][1]-64u;
uint32_t y = uint32_t((a + (((b - a) * (x & 63)) >> 6)) >> 4);
y &= -4u;
return y;
}
float vfpu_exp2(float x) {
static bool loaded =
LOAD_TABLE(vfpu_exp2_lut65536, 512)&&
LOAD_TABLE(vfpu_exp2_lut, 262144);
int32_t bits;
memcpy(&bits, &x, sizeof(bits));
if((bits & 0x7FFFFFFF) <= 0x007FFFFF) {
// Denormals are treated as 0.
return 1.0f;
}
if(x != x) {
// NaN gets NaN.
bits = 0x7F800001u;
memcpy(&x, &bits, sizeof(x));
return x;
}
if(x <= -126.0f) {
// Small numbers get 0 (exp2(-126) is smallest positive non-denormal).
// But yes, -126.0f produces +0.0f.
return 0.0f;
}
if(x >= +128.0f) {
// Large numbers get infinity.
bits = 0x7F800000u;
memcpy(&x, &bits, sizeof(x));
return x;
}
bits = int32_t(x * 0x1p23f);
if(x < 0.0f) --bits; // Yes, really.
bits = int32_t(0x3F800000) + (bits & int32_t(0xFF800000)) + int32_t(vfpu_exp2_fixed(bits & int32_t(0x007FFFFF)));
memcpy(&x, &bits, sizeof(bits));
return x;
}
float vfpu_rexp2(float x) {
return vfpu_exp2(-x);
}
// Input fixed 9.23, output fixed 10.22.
// Returns log2(1+x).
static inline uint32_t vfpu_log2_approx(uint32_t x) {
uint32_t a=vfpu_log2_lut65536[(x >> 16) + 0];
uint32_t b=vfpu_log2_lut65536[(x >> 16) + 1];
uint32_t c=vfpu_log2_lut65536_quadratic[x >> 16];
x &= 0xFFFFu;
uint64_t ret = uint64_t(a) * (0x10000u - x) + uint64_t(b) * x;
uint64_t d = (uint64_t(c) * x * (0x10000u-x)) >> 40;
ret += d;
return uint32_t(ret >> 16);
}
// Matches PSP output on all known values.
float vfpu_log2(float x) {
static bool loaded =
LOAD_TABLE(vfpu_log2_lut65536, 516)&&
LOAD_TABLE(vfpu_log2_lut65536_quadratic, 512)&&
LOAD_TABLE(vfpu_log2_lut, 2097152);
uint32_t bits;
memcpy(&bits, &x, sizeof(bits));
if((bits & 0x7FFFFFFFu) <= 0x007FFFFFu) {
// Denormals (and zeroes) get -inf.
bits = 0xFF800000u;
memcpy(&x, &bits, sizeof(x));
return x;
}
if(bits & 0x80000000u) {
// Other negatives get NaN.
bits = 0x7F800001u;
memcpy(&x, &bits, sizeof(x));
return x;
}
if((bits >> 23) == 255u) {
// NaN gets NaN, +inf gets +inf.
bits = 0x7F800000u + ((bits & 0x007FFFFFu) != 0);
memcpy(&x, &bits, sizeof(x));
return x;
}
uint32_t e = (bits & 0x7F800000u) - 0x3F800000u;
uint32_t i = bits & 0x007FFFFFu;
if(e >> 31 && i >= 0x007FFE00u) {
// Process 1-2^{-14}<=x*2^n<1 (for n>0) separately,
// since the table doesn't give the right answer.
float c = float(int32_t(~e) >> 23);
// Note: if c is 0 the sign of -0 output is correct.
return i < 0x007FFEF7u ? // 1-265*2^{-24}
-3.05175781e-05f - c:
-0.0f - c;
}
int d = (e < 0x01000000u ? 0 : 8 - clz32_nonzero(e) - int(e >> 31));
//assert(d >= 0 && d < 8);
uint32_t q = 1u << d;
uint32_t A = vfpu_log2_approx((i ) & -64u) & -q;
uint32_t B = vfpu_log2_approx((i + 64) & -64u) & -q;
uint64_t a = (A << 6)+(uint64_t(vfpu_log2_lut[d][i >> 6][0]) - 80ull) * q;
uint64_t b = (B << 6)+(uint64_t(vfpu_log2_lut[d][i >> 6][1]) - 80ull) * q;
uint32_t v = uint32_t((a +(((b - a) * (i & 63)) >> 6)) >> 6);
v &= -q;
bits = e ^ (2u * v);
x = float(int32_t(bits)) * 1.1920928955e-7f; // 0x1p-23f
return x;
}
static inline uint32_t vfpu_rcp_approx(uint32_t i) {
return 0x3E800000u + (uint32_t((1ull << 47) / ((1ull << 23) + i)) & -4u);
}
float vfpu_rcp(float x) {
static bool loaded =
LOAD_TABLE(vfpu_rcp_lut, 262144);
uint32_t bits;
memcpy(&bits, &x, sizeof(bits));
uint32_t s = bits & 0x80000000u;
uint32_t e = bits & 0x7F800000u;
uint32_t i = bits & 0x007FFFFFu;
if((bits & 0x7FFFFFFFu) > 0x7E800000u) {
bits = (e == 0x7F800000u && i ? s ^ 0x7F800001u : s);
memcpy(&x, &bits, sizeof(x));
return x;
}
if(e==0u) {
bits = s^0x7F800000u;
memcpy(&x, &bits, sizeof(x));
return x;
}
uint32_t A = vfpu_rcp_approx((i ) & -64u);
uint32_t B = vfpu_rcp_approx((i + 64) & -64u);
uint64_t a = (uint64_t(A) << 6) + uint64_t(vfpu_rcp_lut[i >> 6][0]) * 4u;
uint64_t b = (uint64_t(B) << 6) + uint64_t(vfpu_rcp_lut[i >> 6][1]) * 4u;
uint32_t v = uint32_t((a+(((b-a)*(i&63))>>6))>>6);
v &= -4u;
bits = s + (0x3F800000u - e) + v;
memcpy(&x, &bits, sizeof(x));
return x;
}
//==============================================================================
void InitVFPU() {
#if 0
// Load all in advance.
LOAD_TABLE(vfpu_asin_lut65536 , 1536);
LOAD_TABLE(vfpu_asin_lut_deltas , 517448);
LOAD_TABLE(vfpu_asin_lut_indices , 798916);
LOAD_TABLE(vfpu_exp2_lut65536 , 512);
LOAD_TABLE(vfpu_exp2_lut , 262144);
LOAD_TABLE(vfpu_log2_lut65536 , 516);
LOAD_TABLE(vfpu_log2_lut65536_quadratic, 512);
LOAD_TABLE(vfpu_log2_lut , 2097152);
LOAD_TABLE(vfpu_rcp_lut , 262144);
LOAD_TABLE(vfpu_rsqrt_lut , 262144);
LOAD_TABLE(vfpu_sin_lut8192 , 4100);
LOAD_TABLE(vfpu_sin_lut_delta , 262144);
LOAD_TABLE(vfpu_sin_lut_exceptions , 86938);
LOAD_TABLE(vfpu_sin_lut_interval_delta , 131074);
LOAD_TABLE(vfpu_sqrt_lut , 262144);
#endif
}

View File

@ -35,18 +35,15 @@ inline int Xpose(int v) {
#endif
// The VFPU uses weird angles where 4.0 represents a full circle. This makes it possible to return
// exact 1.0/-1.0 values at certain angles. We currently just scale, and special case the cardinal directions.
// exact 1.0/-1.0 values at certain angles.
//
// Stepping down to [0, 2pi) helps, but we also check common exact-result values.
// TODO: cos(1) and sin(2) should be -0.0, but doing that gives wrong results (possibly from floorf.)
//
// We also try an alternative solution, computing things in double precision, multiplying the input by pi/2.
// This fixes #12900 (Hitman Reborn 2) but breaks #13705 (Cho Aniki Zero) and #13671 (Hajime no Ippo).
// #2921 is still fine. So the alt solution (vfpu_sin_double etc) are behind a compat flag.
//
// A better solution would be to tailor some sine approximation for the 0..90 degrees range, compute
// modulo manually and mirror that around the circle. Also correctly special casing for inf/nan inputs
// and just trying to match it as closely as possible to the real PSP.
// The current code attempts to match VFPU sin/cos exactly.
// Possibly affected games:
// Final Fantasy III (#2921 )
// Hitman Reborn 2 (#12900)
// Cho Aniki Zero (#13705)
// Hajime no Ippo (#13671)
// Dissidia Duodecim Final Fantasy (#6710 )
//
// Messing around with the modulo functions? try https://www.desmos.com/calculator.
@ -54,9 +51,7 @@ extern float vfpu_sin(float);
extern float vfpu_cos(float);
extern void vfpu_sincos(float, float&, float&);
inline float vfpu_asin(float angle) {
return (float)(asinf(angle) / M_PI_2);
}
extern float vfpu_asin(float);
inline float vfpu_clamp(float v, float min, float max) {
// Note: NAN is preserved, and -0.0 becomes +0.0 if min=+0.0.
@ -67,6 +62,11 @@ float vfpu_dot(const float a[4], const float b[4]);
float vfpu_sqrt(float a);
float vfpu_rsqrt(float a);
extern float vfpu_exp2(float);
extern float vfpu_rexp2(float);
extern float vfpu_log2(float);
extern float vfpu_rcp(float);
#define VFPU_FLOAT16_EXP_MAX 0x1f
#define VFPU_SH_FLOAT16_SIGN 15
#define VFPU_MASK_FLOAT16_SIGN 0x1
@ -215,4 +215,4 @@ int GetVectorOverlap(int reg1, VectorSize size1, int reg2, VectorSize size2);
bool GetVFPUCtrlMask(int reg, u32 *mask);
float Float16ToFloat32(unsigned short l);
void InitVFPUSinCos();
void InitVFPU();

View File

@ -306,7 +306,7 @@ bool CPU_Init(std::string *errorString) {
// likely to collide with any commercial ones.
g_CoreParameter.compat.Load(g_paramSFO.GetDiscID());
InitVFPUSinCos();
InitVFPU();
if (allowPlugins)
HLEPlugins::Init();

Binary file not shown.

Binary file not shown.

Binary file not shown.

File diff suppressed because one or more lines are too long

Binary file not shown.

File diff suppressed because one or more lines are too long

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

View File

@ -92,7 +92,7 @@ static void SetupJitHarness() {
Memory::Init();
mipsr4k.Reset();
CoreTiming::Init();
InitVFPUSinCos();
InitVFPU();
}
static void DestroyJitHarness() {

View File

@ -58,6 +58,8 @@
#include "Common/CPUDetect.h"
#include "Common/Log.h"
#include "Core/Config.h"
#include "Common/File/VFS/VFS.h"
#include "Common/File/VFS/DirectoryReader.h"
#include "Core/FileSystems/ISOFileSystem.h"
#include "Core/MemMap.h"
#include "Core/MIPS/MIPSVFPUUtils.h"
@ -358,7 +360,10 @@ bool TestTinySet() {
bool TestVFPUSinCos() {
float sine, cosine;
InitVFPUSinCos();
// Needed for VFPU tables.
// There might be a better place to invoke it, but whatever.
g_VFS.Register("", new DirectoryReader(Path("assets")));
InitVFPU();
vfpu_sincos(0.0f, sine, cosine);
EXPECT_EQ_FLOAT(sine, 0.0f);
EXPECT_EQ_FLOAT(cosine, 1.0f);