mirror of
https://github.com/hrydgard/ppsspp.git
synced 2024-11-23 05:19:56 +00:00
commit
98d9a9c8ca
@ -2388,6 +2388,7 @@ set(NativeAssets
|
||||
assets/lang
|
||||
assets/shaders
|
||||
assets/themes
|
||||
assets/vfpu
|
||||
assets/Roboto-Condensed.ttf
|
||||
assets/7z.png
|
||||
assets/compat.ini
|
||||
@ -2498,6 +2499,7 @@ if(TargetBin)
|
||||
file(GLOB_RECURSE SHADER_FILES assets/shaders/*)
|
||||
file(GLOB_RECURSE THEME_FILE assets/themes/*)
|
||||
file(GLOB_RECURSE DEBUGGER_FILES assets/debugger/*)
|
||||
file(GLOB_RECURSE VFPU_FILES assets/vfpu/*)
|
||||
|
||||
if(NOT IOS)
|
||||
set_source_files_properties(${BigFontAssets} PROPERTIES MACOSX_PACKAGE_LOCATION "Resources/assets")
|
||||
@ -2507,6 +2509,7 @@ if(TargetBin)
|
||||
set_source_files_properties(${SHADER_FILES} PROPERTIES MACOSX_PACKAGE_LOCATION "Resources/assets/shaders")
|
||||
set_source_files_properties(${THEME_FILE} PROPERTIES MACOSX_PACKAGE_LOCATION "Resources/assets/themes")
|
||||
set_source_files_properties(${DEBUGGER_FILES} PROPERTIES MACOSX_PACKAGE_LOCATION "Resources/assets/debugger")
|
||||
set_source_files_properties(${VFPU_FILES} PROPERTIES MACOSX_PACKAGE_LOCATION "Resources/assets/vfpu")
|
||||
endif()
|
||||
|
||||
if(IOS)
|
||||
|
@ -629,18 +629,18 @@ namespace MIPSInt
|
||||
// vsat0 changes -0.0 to +0.0, both retain NAN.
|
||||
case 4: if (s[i] <= 0) d[i] = 0; else {if(s[i] > 1.0f) d[i] = 1.0f; else d[i] = s[i];} break; // vsat0
|
||||
case 5: if (s[i] < -1.0f) d[i] = -1.0f; else {if(s[i] > 1.0f) d[i] = 1.0f; else d[i] = s[i];} break; // vsat1
|
||||
case 16: d[i] = 1.0f / s[i]; break; //vrcp
|
||||
case 16: { d[i] = vfpu_rcp(s[i]); } break; //vrcp
|
||||
case 17: d[i] = USE_VFPU_SQRT ? vfpu_rsqrt(s[i]) : 1.0f / sqrtf(s[i]); break; //vrsq
|
||||
|
||||
case 18: { d[i] = vfpu_sin(s[i]); } break; //vsin
|
||||
case 19: { d[i] = vfpu_cos(s[i]); } break; //vcos
|
||||
case 20: d[i] = powf(2.0f, s[i]); break; //vexp2
|
||||
case 21: d[i] = logf(s[i])/log(2.0f); break; //vlog2
|
||||
case 20: { d[i] = vfpu_exp2(s[i]); } break; //vexp2
|
||||
case 21: { d[i] = vfpu_log2(s[i]); } break; //vlog2
|
||||
case 22: d[i] = USE_VFPU_SQRT ? vfpu_sqrt(s[i]) : fabsf(sqrtf(s[i])); break; //vsqrt
|
||||
case 23: d[i] = (float)(asinf(s[i]) / M_PI_2); break; //vasin
|
||||
case 24: d[i] = -1.0f / s[i]; break; // vnrcp
|
||||
case 23: { d[i] = vfpu_asin(s[i]); } break; //vasin
|
||||
case 24: { d[i] = -vfpu_rcp(s[i]); } break; // vnrcp
|
||||
case 26: { d[i] = -vfpu_sin(s[i]); } break; // vnsin
|
||||
case 28: d[i] = 1.0f / powf(2.0, s[i]); break; // vrexp2
|
||||
case 28: { d[i] = vfpu_rexp2(s[i]); } break; // vrexp2
|
||||
default:
|
||||
_dbg_assert_msg_( false, "Invalid VV2Op op type %d", optype);
|
||||
break;
|
||||
|
@ -22,6 +22,7 @@
|
||||
|
||||
#include "Common/BitScan.h"
|
||||
#include "Common/CommonFuncs.h"
|
||||
#include "Common/File/VFS/VFS.h"
|
||||
#include "Core/Reporting.h"
|
||||
#include "Core/MIPS/MIPS.h"
|
||||
#include "Core/MIPS/MIPSVFPUUtils.h"
|
||||
@ -29,13 +30,6 @@
|
||||
#define V(i) (currentMIPS->v[voffset[i]])
|
||||
#define VI(i) (currentMIPS->vi[voffset[i]])
|
||||
|
||||
// Flushes the angle to 0 if exponent smaller than this in vfpu_sin/vfpu_cos/vfpu_sincos.
|
||||
// Was measured to be around 0x68, but GTA on Mac is somehow super sensitive
|
||||
// to the shape of the sine curve which seem to be very slightly different.
|
||||
//
|
||||
// So setting a lower value.
|
||||
#define PRECISION_EXP_THRESHOLD 0x65
|
||||
|
||||
union float2int {
|
||||
uint32_t i;
|
||||
float f;
|
||||
@ -798,274 +792,600 @@ float vfpu_dot(const float a[4], const float b[4]) {
|
||||
return result.f;
|
||||
}
|
||||
|
||||
// TODO: This is still not completely accurate compared to the PSP's vsqrt.
|
||||
float vfpu_sqrt(float a) {
|
||||
float2int val;
|
||||
val.f = a;
|
||||
//==============================================================================
|
||||
// The code below attempts to exactly match the output of
|
||||
// several PSP's VFPU functions. For the sake of
|
||||
// making lookup tables smaller the code is
|
||||
// somewhat gnarly.
|
||||
// Lookup tables sometimes store deltas from (explicitly computable)
|
||||
// estimations, to allow to store them in smaller types.
|
||||
// See https://github.com/hrydgard/ppsspp/issues/16946 for details.
|
||||
|
||||
if ((val.i & 0xff800000) == 0x7f800000) {
|
||||
if ((val.i & 0x007fffff) != 0) {
|
||||
val.i = 0x7f800001;
|
||||
// Lookup tables.
|
||||
// Note: these are never unloaded, and stay till program termination.
|
||||
static uint32_t (*vfpu_sin_lut8192)=nullptr;
|
||||
static int8_t (*vfpu_sin_lut_delta)[2]=nullptr;
|
||||
static int16_t (*vfpu_sin_lut_interval_delta)=nullptr;
|
||||
static uint8_t (*vfpu_sin_lut_exceptions)=nullptr;
|
||||
|
||||
static int8_t (*vfpu_sqrt_lut)[2]=nullptr;
|
||||
|
||||
static int8_t (*vfpu_rsqrt_lut)[2]=nullptr;
|
||||
|
||||
static uint32_t (*vfpu_exp2_lut65536)=nullptr;
|
||||
static uint8_t (*vfpu_exp2_lut)[2]=nullptr;
|
||||
|
||||
static uint32_t (*vfpu_log2_lut65536)=nullptr;
|
||||
static uint32_t (*vfpu_log2_lut65536_quadratic)=nullptr;
|
||||
static uint8_t (*vfpu_log2_lut)[131072][2]=nullptr;
|
||||
|
||||
static int32_t (*vfpu_asin_lut65536)[3]=nullptr;
|
||||
static uint64_t (*vfpu_asin_lut_deltas)=nullptr;
|
||||
static uint16_t (*vfpu_asin_lut_indices)=nullptr;
|
||||
|
||||
static int8_t (*vfpu_rcp_lut)[2]=nullptr;
|
||||
|
||||
template<typename T>
|
||||
static inline bool load_vfpu_table(T *&ptr,const char *filename, size_t expected_size) {
|
||||
#if COMMON_BIG_ENDIAN
|
||||
// Tables are little-endian.
|
||||
#error Byteswap for VFPU tables not implemented
|
||||
#endif
|
||||
if(ptr) return true; // Already loaded.
|
||||
size_t size = 0u;
|
||||
INFO_LOG(CPU, "Loading '%s'...", filename);
|
||||
ptr = reinterpret_cast<decltype(&*ptr)>(g_VFS.ReadFile(filename, &size));
|
||||
if(!ptr || size != expected_size)
|
||||
{
|
||||
ERROR_LOG(CPU, "Error loading '%s' (size=%u, expected: %u)", filename, (unsigned)size, (unsigned)expected_size);
|
||||
if(ptr) delete[] ptr;
|
||||
ptr = nullptr;
|
||||
return false;
|
||||
}
|
||||
INFO_LOG(CPU, "Successfully loaded '%s'", filename);
|
||||
return true;
|
||||
}
|
||||
|
||||
#define LOAD_TABLE(name, expected_size)\
|
||||
load_vfpu_table(name,"vfpu/" #name ".dat",expected_size)
|
||||
|
||||
// Note: PSP sin/cos output only has 22 significant
|
||||
// binary digits.
|
||||
static inline uint32_t vfpu_sin_quantum(uint32_t x) {
|
||||
return x < 1u << 22?
|
||||
1u:
|
||||
1u << (32 - 22 - clz32_nonzero(x));
|
||||
}
|
||||
|
||||
static inline uint32_t vfpu_sin_truncate_bits(u32 x) {
|
||||
return x & -vfpu_sin_quantum(x);
|
||||
}
|
||||
|
||||
static inline uint32_t vfpu_sin_fixed(uint32_t arg) {
|
||||
// Handle endpoints.
|
||||
if(arg == 0u) return 0u;
|
||||
if(arg == 0x00800000) return 0x10000000;
|
||||
// Get endpoints for 8192-wide interval.
|
||||
uint32_t L = vfpu_sin_lut8192[(arg >> 13) + 0];
|
||||
uint32_t H = vfpu_sin_lut8192[(arg >> 13) + 1];
|
||||
// Approximate endpoints for 64-wide interval via lerp.
|
||||
uint32_t A = L+(((H - L)*(((arg >> 6) & 127) + 0)) >> 7);
|
||||
uint32_t B = L+(((H - L)*(((arg >> 6) & 127) + 1)) >> 7);
|
||||
// Adjust endpoints from deltas, and increase working precision.
|
||||
uint64_t a = (uint64_t(A) << 5) + uint64_t(vfpu_sin_lut_delta[arg >> 6][0]) * vfpu_sin_quantum(A);
|
||||
uint64_t b = (uint64_t(B) << 5) + uint64_t(vfpu_sin_lut_delta[arg >> 6][1]) * vfpu_sin_quantum(B);
|
||||
// Compute approximation via lerp. Is off by at most 1 quantum.
|
||||
uint32_t v = uint32_t(((a * (64 - (arg & 63)) + b * (arg & 63)) >> 6) >> 5);
|
||||
v=vfpu_sin_truncate_bits(v);
|
||||
// Look up exceptions via binary search.
|
||||
// Note: vfpu_sin_lut_interval_delta stores
|
||||
// deltas from interval estimation.
|
||||
uint32_t lo = ((169u * ((arg >> 7) + 0)) >> 7)+uint32_t(vfpu_sin_lut_interval_delta[(arg >> 7) + 0]) + 16384u;
|
||||
uint32_t hi = ((169u * ((arg >> 7) + 1)) >> 7)+uint32_t(vfpu_sin_lut_interval_delta[(arg >> 7) + 1]) + 16384u;
|
||||
while(lo < hi) {
|
||||
uint32_t m = (lo + hi) / 2;
|
||||
// Note: vfpu_sin_lut_exceptions stores
|
||||
// index&127 (for each initial interval the
|
||||
// upper bits of index are the same, namely
|
||||
// arg&-128), plus direction (0 for +1, and
|
||||
// 128 for -1).
|
||||
uint32_t b = vfpu_sin_lut_exceptions[m];
|
||||
uint32_t e = (arg & -128u)+(b & 127u);
|
||||
if(e == arg) {
|
||||
v += vfpu_sin_quantum(v) * (b >> 7 ? -1u : +1u);
|
||||
break;
|
||||
}
|
||||
return val.f;
|
||||
else if(e < arg) lo = m + 1;
|
||||
else hi = m;
|
||||
}
|
||||
if ((val.i & 0x7f800000) == 0) {
|
||||
// Kill any sign.
|
||||
val.i = 0;
|
||||
return val.f;
|
||||
}
|
||||
if (val.i & 0x80000000) {
|
||||
val.i = 0x7f800001;
|
||||
return val.f;
|
||||
}
|
||||
|
||||
int k = get_exp(val.i);
|
||||
uint32_t sp = get_mant(val.i);
|
||||
int less_bits = k & 1;
|
||||
k >>= 1;
|
||||
|
||||
uint32_t z = 0x00C00000 >> less_bits;
|
||||
int64_t halfsp = sp >> 1;
|
||||
halfsp <<= 23 - less_bits;
|
||||
for (int i = 0; i < 6; ++i) {
|
||||
z = (z >> 1) + (uint32_t)(halfsp / z);
|
||||
}
|
||||
|
||||
val.i = ((k + 127) << 23) | ((z << less_bits) & 0x007FFFFF);
|
||||
// The lower two bits never end up set on the PSP, it seems like.
|
||||
val.i &= 0xFFFFFFFC;
|
||||
|
||||
return val.f;
|
||||
return v;
|
||||
}
|
||||
|
||||
static inline uint32_t mant_mul(uint32_t a, uint32_t b) {
|
||||
uint64_t m = (uint64_t)a * (uint64_t)b;
|
||||
if (m & 0x007FFFFF) {
|
||||
m += 0x01437000;
|
||||
float vfpu_sin(float x) {
|
||||
static bool loaded =
|
||||
LOAD_TABLE(vfpu_sin_lut8192, 4100)&&
|
||||
LOAD_TABLE(vfpu_sin_lut_delta, 262144)&&
|
||||
LOAD_TABLE(vfpu_sin_lut_interval_delta, 131074)&&
|
||||
LOAD_TABLE(vfpu_sin_lut_exceptions, 86938);
|
||||
uint32_t bits;
|
||||
memcpy(&bits, &x, sizeof(x));
|
||||
uint32_t sign = bits & 0x80000000u;
|
||||
uint32_t exponent = (bits >> 23) & 0xFFu;
|
||||
uint32_t significand = (bits & 0x007FFFFFu) | 0x00800000u;
|
||||
if(exponent == 0xFFu) {
|
||||
// NOTE: this bitpattern is a signaling
|
||||
// NaN on x86, so maybe just return
|
||||
// a normal qNaN?
|
||||
float y;
|
||||
bits=sign ^ 0x7F800001u;
|
||||
memcpy(&y, &bits, sizeof(y));
|
||||
return y;
|
||||
}
|
||||
return (uint32_t)(m >> 23);
|
||||
if(exponent < 0x7Fu) {
|
||||
if(exponent < 0x7Fu-23u) significand = 0u;
|
||||
else significand >>= (0x7F - exponent);
|
||||
}
|
||||
else if(exponent > 0x7Fu) {
|
||||
// There is weirdness for large exponents.
|
||||
if(exponent - 0x7Fu >= 25u && exponent - 0x7Fu < 32u) significand = 0u;
|
||||
else if((exponent & 0x9Fu) == 0x9Fu) significand = 0u;
|
||||
else significand <<= (exponent - 0x7Fu);
|
||||
}
|
||||
sign ^= ((significand << 7) & 0x80000000u);
|
||||
significand &= 0x00FFFFFFu;
|
||||
if(significand > 0x00800000u) significand = 0x01000000u - significand;
|
||||
uint32_t ret = vfpu_sin_fixed(significand);
|
||||
return (sign ? -1.0f : +1.0f) * float(int32_t(ret)) * 3.7252903e-09f; // 0x1p-28f
|
||||
}
|
||||
|
||||
float vfpu_rsqrt(float a) {
|
||||
float2int val;
|
||||
val.f = a;
|
||||
|
||||
if (val.i == 0x7f800000) {
|
||||
return 0.0f;
|
||||
float vfpu_cos(float x) {
|
||||
static bool loaded =
|
||||
LOAD_TABLE(vfpu_sin_lut8192, 4100)&&
|
||||
LOAD_TABLE(vfpu_sin_lut_delta, 262144)&&
|
||||
LOAD_TABLE(vfpu_sin_lut_interval_delta, 131074)&&
|
||||
LOAD_TABLE(vfpu_sin_lut_exceptions, 86938);
|
||||
uint32_t bits;
|
||||
memcpy(&bits, &x, sizeof(x));
|
||||
bits &= 0x7FFFFFFFu;
|
||||
uint32_t sign = 0u;
|
||||
uint32_t exponent = (bits >> 23) & 0xFFu;
|
||||
uint32_t significand = (bits & 0x007FFFFFu) | 0x00800000u;
|
||||
if(exponent == 0xFFu) {
|
||||
// NOTE: this bitpattern is a signaling
|
||||
// NaN on x86, so maybe just return
|
||||
// a normal qNaN?
|
||||
float y;
|
||||
bits = sign ^ 0x7F800001u;
|
||||
memcpy(&y, &bits, sizeof(y));
|
||||
return y;
|
||||
}
|
||||
if ((val.i & 0x7fffffff) > 0x7f800000) {
|
||||
val.i = (val.i & 0x80000000) | 0x7f800001;
|
||||
return val.f;
|
||||
if(exponent < 0x7Fu) {
|
||||
if(exponent < 0x7Fu - 23u) significand = 0u;
|
||||
else significand >>= (0x7F - exponent);
|
||||
}
|
||||
if ((val.i & 0x7f800000) == 0) {
|
||||
val.i = (val.i & 0x80000000) | 0x7f800000;
|
||||
return val.f;
|
||||
else if(exponent > 0x7Fu) {
|
||||
// There is weirdness for large exponents.
|
||||
if(exponent - 0x7Fu >= 25u && exponent - 0x7Fu < 32u) significand = 0u;
|
||||
else if((exponent & 0x9Fu) == 0x9Fu) significand = 0u;
|
||||
else significand <<= (exponent - 0x7Fu);
|
||||
}
|
||||
if (val.i & 0x80000000) {
|
||||
val.i = 0xff800001;
|
||||
return val.f;
|
||||
sign ^= ((significand << 7) & 0x80000000u);
|
||||
significand &= 0x00FFFFFFu;
|
||||
if(significand >= 0x00800000u) {
|
||||
significand = 0x01000000u - significand;
|
||||
sign ^= 0x80000000u;
|
||||
}
|
||||
|
||||
int k = get_exp(val.i);
|
||||
uint32_t sp = get_mant(val.i);
|
||||
int less_bits = k & 1;
|
||||
k = -(k >> 1);
|
||||
|
||||
uint32_t z = 0x00800000 >> less_bits;
|
||||
uint32_t halfsp = sp >> (1 + less_bits);
|
||||
for (int i = 0; i < 6; ++i) {
|
||||
uint32_t zsq = mant_mul(z, z);
|
||||
uint32_t correction = 0x00C00000 - mant_mul(halfsp, zsq);
|
||||
z = mant_mul(z, correction);
|
||||
}
|
||||
|
||||
int8_t shift = (int8_t)clz32_nonzero(z) - 8 + less_bits;
|
||||
if (shift < 1) {
|
||||
z >>= -shift;
|
||||
k += -shift;
|
||||
} else if (shift > 0) {
|
||||
z <<= shift;
|
||||
k -= shift;
|
||||
}
|
||||
|
||||
z >>= less_bits;
|
||||
|
||||
val.i = ((k + 127) << 23) | (z & 0x007FFFFF);
|
||||
val.i &= 0xFFFFFFFC;
|
||||
|
||||
return val.f;
|
||||
}
|
||||
|
||||
float vfpu_sin(float a) {
|
||||
float2int val;
|
||||
val.f = a;
|
||||
|
||||
int32_t k = get_uexp(val.i);
|
||||
if (k == 255) {
|
||||
val.i = (val.i & 0xFF800001) | 1;
|
||||
return val.f;
|
||||
}
|
||||
|
||||
if (k < PRECISION_EXP_THRESHOLD) {
|
||||
val.i &= 0x80000000;
|
||||
return val.f;
|
||||
}
|
||||
|
||||
// Okay, now modulus by 4 to begin with (identical wave every 4.)
|
||||
int32_t mantissa = get_mant(val.i);
|
||||
if (k > 0x80) {
|
||||
const uint8_t over = k & 0x1F;
|
||||
mantissa = (mantissa << over) & 0x00FFFFFF;
|
||||
k = 0x80;
|
||||
}
|
||||
// This subtracts off the 2. If we do, flip sign to inverse the wave.
|
||||
if (k == 0x80 && mantissa >= (1 << 23)) {
|
||||
val.i ^= 0x80000000;
|
||||
mantissa -= 1 << 23;
|
||||
}
|
||||
|
||||
int8_t norm_shift = mantissa == 0 ? 32 : (int8_t)clz32_nonzero(mantissa) - 8;
|
||||
mantissa <<= norm_shift;
|
||||
k -= norm_shift;
|
||||
|
||||
if (k <= 0 || mantissa == 0) {
|
||||
val.i &= 0x80000000;
|
||||
return val.f;
|
||||
}
|
||||
|
||||
// This is the value with modulus applied.
|
||||
val.i = (val.i & 0x80000000) | (k << 23) | (mantissa & ~(1 << 23));
|
||||
val.f = (float)sin((double)val.f * M_PI_2);
|
||||
val.i &= 0xFFFFFFFC;
|
||||
return val.f;
|
||||
}
|
||||
|
||||
float vfpu_cos(float a) {
|
||||
float2int val;
|
||||
val.f = a;
|
||||
bool negate = false;
|
||||
|
||||
int32_t k = get_uexp(val.i);
|
||||
if (k == 255) {
|
||||
// Note: unlike sin, cos always returns +NAN.
|
||||
val.i = (val.i & 0x7F800001) | 1;
|
||||
return val.f;
|
||||
}
|
||||
|
||||
if (k < PRECISION_EXP_THRESHOLD)
|
||||
return 1.0f;
|
||||
|
||||
// Okay, now modulus by 4 to begin with (identical wave every 4.)
|
||||
int32_t mantissa = get_mant(val.i);
|
||||
if (k > 0x80) {
|
||||
const uint8_t over = k & 0x1F;
|
||||
mantissa = (mantissa << over) & 0x00FFFFFF;
|
||||
k = 0x80;
|
||||
}
|
||||
// This subtracts off the 2. If we do, negate the result value.
|
||||
if (k == 0x80 && mantissa >= (1 << 23)) {
|
||||
mantissa -= 1 << 23;
|
||||
negate = true;
|
||||
}
|
||||
|
||||
int8_t norm_shift = mantissa == 0 ? 32 : (int8_t)clz32_nonzero(mantissa) - 8;
|
||||
mantissa <<= norm_shift;
|
||||
k -= norm_shift;
|
||||
|
||||
if (k <= 0 || mantissa == 0)
|
||||
return negate ? -1.0f : 1.0f;
|
||||
|
||||
// This is the value with modulus applied.
|
||||
val.i = (val.i & 0x80000000) | (k << 23) | (mantissa & ~(1 << 23));
|
||||
if (val.f == 1.0f || val.f == -1.0f) {
|
||||
return negate ? 0.0f : -0.0f;
|
||||
}
|
||||
val.f = (float)cos((double)val.f * M_PI_2);
|
||||
val.i &= 0xFFFFFFFC;
|
||||
return negate ? -val.f : val.f;
|
||||
uint32_t ret = vfpu_sin_fixed(0x00800000u - significand);
|
||||
return (sign ? -1.0f : +1.0f) * float(int32_t(ret)) * 3.7252903e-09f; // 0x1p-28f
|
||||
}
|
||||
|
||||
void vfpu_sincos(float a, float &s, float &c) {
|
||||
float2int val;
|
||||
val.f = a;
|
||||
// For sin, negate the input, for cos negate the output.
|
||||
bool negate = false;
|
||||
// Just invoke both sin and cos.
|
||||
// Suboptimal but whatever.
|
||||
s = vfpu_sin(a);
|
||||
c = vfpu_cos(a);
|
||||
}
|
||||
|
||||
int32_t k = get_uexp(val.i);
|
||||
if (k == 255) {
|
||||
val.i = (val.i & 0xFF800001) | 1;
|
||||
s = val.f;
|
||||
val.i &= 0x7F800001;
|
||||
c = val.f;
|
||||
return;
|
||||
}
|
||||
|
||||
if (k < PRECISION_EXP_THRESHOLD) {
|
||||
val.i &= 0x80000000;
|
||||
s = val.f;
|
||||
c = 1.0f;
|
||||
return;
|
||||
}
|
||||
|
||||
// Okay, now modulus by 4 to begin with (identical wave every 4.)
|
||||
int32_t mantissa = get_mant(val.i);
|
||||
if (k > 0x80) {
|
||||
const uint8_t over = k & 0x1F;
|
||||
mantissa = (mantissa << over) & 0x00FFFFFF;
|
||||
k = 0x80;
|
||||
}
|
||||
// This subtracts off the 2. If we do, flip signs.
|
||||
if (k == 0x80 && mantissa >= (1 << 23)) {
|
||||
mantissa -= 1 << 23;
|
||||
negate = true;
|
||||
}
|
||||
|
||||
int8_t norm_shift = mantissa == 0 ? 32 : (int8_t)clz32_nonzero(mantissa) - 8;
|
||||
mantissa <<= norm_shift;
|
||||
k -= norm_shift;
|
||||
|
||||
if (k <= 0 || mantissa == 0) {
|
||||
val.i &= 0x80000000;
|
||||
if (negate)
|
||||
val.i ^= 0x80000000;
|
||||
s = val.f;
|
||||
c = negate ? -1.0f : 1.0f;
|
||||
return;
|
||||
}
|
||||
|
||||
// This is the value with modulus applied.
|
||||
val.i = (val.i & 0x80000000) | (k << 23) | (mantissa & ~(1 << 23));
|
||||
float2int i_sine, i_cosine;
|
||||
if (val.f == 1.0f) {
|
||||
i_sine.f = negate ? -1.0f : 1.0f;
|
||||
i_cosine.f = negate ? 0.0f : -0.0f;
|
||||
} else if (val.f == -1.0f) {
|
||||
i_sine.f = negate ? 1.0f : -1.0f;
|
||||
i_cosine.f = negate ? 0.0f : -0.0f;
|
||||
} else if (negate) {
|
||||
i_sine.f = (float)sin((double)-val.f * M_PI_2);
|
||||
i_cosine.f = -(float)cos((double)val.f * M_PI_2);
|
||||
} else {
|
||||
double angle = (double)val.f * M_PI_2;
|
||||
#if defined(__linux__)
|
||||
double d_sine;
|
||||
double d_cosine;
|
||||
sincos(angle, &d_sine, &d_cosine);
|
||||
i_sine.f = (float)d_sine;
|
||||
i_cosine.f = (float)d_cosine;
|
||||
// Integer square root of 2^23*x (rounded to zero).
|
||||
// Input is in 2^23 <= x < 2^25, and representable in float.
|
||||
static inline uint32_t isqrt23(uint32_t x) {
|
||||
#if 0
|
||||
// Reference version.
|
||||
int dir=fesetround(FE_TOWARDZERO);
|
||||
uint32_t ret=uint32_t(int32_t(sqrtf(float(int32_t(x)) * 8388608.0f)));
|
||||
fesetround(dir);
|
||||
return ret;
|
||||
#elif 1
|
||||
// Double version.
|
||||
// Verified to produce correct result for all valid inputs,
|
||||
// in all rounding modes, both in double and double-extended (x87)
|
||||
// precision.
|
||||
// Requires correctly-rounded sqrt (which on any IEEE-754 system
|
||||
// it should be).
|
||||
return uint32_t(int32_t(sqrt(double(x) * 8388608.0)));
|
||||
#else
|
||||
i_sine.f = (float)sin(angle);
|
||||
i_cosine.f = (float)cos(angle);
|
||||
// Pure integer version, if you don't like floating point.
|
||||
// Based on code from Hacker's Delight. See isqrt4 in
|
||||
// https://github.com/hcs0/Hackers-Delight/blob/master/isqrt.c.txt
|
||||
// Relatively slow.
|
||||
uint64_t t=uint64_t(x) << 23, m, y, b;
|
||||
m=0x4000000000000000ull;
|
||||
y=0;
|
||||
while(m != 0) // Do 32 times.
|
||||
{
|
||||
b=y | m;
|
||||
y=y >> 1;
|
||||
if(t >= b)
|
||||
{
|
||||
t = t - b;
|
||||
y = y | m;
|
||||
}
|
||||
m = m >> 2;
|
||||
}
|
||||
return y;
|
||||
#endif
|
||||
}
|
||||
|
||||
// Returns floating-point bitpattern.
|
||||
static inline uint32_t vfpu_sqrt_fixed(uint32_t x) {
|
||||
// Endpoints of input.
|
||||
uint32_t lo =(x + 0u) & -64u;
|
||||
uint32_t hi = (x + 64u) & -64u;
|
||||
// Convert input to 9.23 fixed-point.
|
||||
lo = (lo >= 0x00400000u ? 4u * lo : 0x00800000u + 2u * lo);
|
||||
hi = (hi >= 0x00400000u ? 4u * hi : 0x00800000u + 2u * hi);
|
||||
// Estimate endpoints of output.
|
||||
uint32_t A = 0x3F000000u + isqrt23(lo);
|
||||
uint32_t B = 0x3F000000u + isqrt23(hi);
|
||||
// Apply deltas, and increase the working precision.
|
||||
uint64_t a = (uint64_t(A) << 4) + uint64_t(vfpu_sqrt_lut[x >> 6][0]);
|
||||
uint64_t b = (uint64_t(B) << 4) + uint64_t(vfpu_sqrt_lut[x >> 6][1]);
|
||||
uint32_t ret = uint32_t((a + (((b - a) * (x & 63)) >> 6)) >> 4);
|
||||
// Truncate lower 2 bits.
|
||||
ret &= -4u;
|
||||
return ret;
|
||||
}
|
||||
|
||||
float vfpu_sqrt(float x) {
|
||||
static bool loaded =
|
||||
LOAD_TABLE(vfpu_sqrt_lut, 262144);
|
||||
uint32_t bits;
|
||||
memcpy(&bits, &x, sizeof(bits));
|
||||
if((bits & 0x7FFFFFFFu) <= 0x007FFFFFu) {
|
||||
// Denormals (and zeroes) get +0, regardless
|
||||
// of sign.
|
||||
return +0.0f;
|
||||
}
|
||||
if(bits >> 31) {
|
||||
// Other negatives get NaN.
|
||||
bits = 0x7F800001u;
|
||||
memcpy(&x, &bits, sizeof(x));
|
||||
return x;
|
||||
}
|
||||
if((bits >> 23) == 255u) {
|
||||
// Inf/NaN gets Inf/NaN.
|
||||
bits = 0x7F800000u + ((bits & 0x007FFFFFu) != 0u);
|
||||
memcpy(&x, &bits, sizeof(bits));
|
||||
return x;
|
||||
}
|
||||
int32_t exponent = int32_t(bits >> 23) - 127;
|
||||
// Bottom bit of exponent (inverted) + significand (except bottom bit).
|
||||
uint32_t index = ((bits + 0x00800000u) >> 1) & 0x007FFFFFu;
|
||||
bits = vfpu_sqrt_fixed(index);
|
||||
bits += uint32_t(exponent >> 1) << 23;
|
||||
memcpy(&x, &bits, sizeof(bits));
|
||||
return x;
|
||||
}
|
||||
|
||||
// Returns floor(2^33/sqrt(x)), for 2^22 <= x < 2^24.
|
||||
static inline uint32_t rsqrt_floor22(uint32_t x) {
|
||||
#if 1
|
||||
// Verified correct in all rounding directions,
|
||||
// by exhaustive search.
|
||||
return uint32_t(8589934592.0 / sqrt(double(x))); // 0x1p33
|
||||
#else
|
||||
// Pure integer version, if you don't like floating point.
|
||||
// Based on code from Hacker's Delight. See isqrt4 in
|
||||
// https://github.com/hcs0/Hackers-Delight/blob/master/isqrt.c.txt
|
||||
// Relatively slow.
|
||||
uint64_t t=uint64_t(x) << 22, m, y, b;
|
||||
m = 0x4000000000000000ull;
|
||||
y = 0;
|
||||
while(m != 0) // Do 32 times.
|
||||
{
|
||||
b = y | m;
|
||||
y = y >> 1;
|
||||
if(t >= b)
|
||||
{
|
||||
t = t - b;
|
||||
y = y | m;
|
||||
}
|
||||
m = m >> 2;
|
||||
}
|
||||
y = (1ull << 44) / y;
|
||||
// Decrement if y > floor(2^33 / sqrt(x)).
|
||||
// This hack works because exhaustive
|
||||
// search (on [2^22;2^24]) says it does.
|
||||
if((y * y >> 3) * x > (1ull << 63) - 3ull * (((y & 7) == 6) << 21)) --y;
|
||||
return uint32_t(y);
|
||||
#endif
|
||||
}
|
||||
|
||||
// Returns floating-point bitpattern.
|
||||
static inline uint32_t vfpu_rsqrt_fixed(uint32_t x) {
|
||||
// Endpoints of input.
|
||||
uint32_t lo = (x + 0u) & -64u;
|
||||
uint32_t hi = (x + 64u) & -64u;
|
||||
// Convert input to 10.22 fixed-point.
|
||||
lo = (lo >= 0x00400000u ? 2u * lo : 0x00400000u + lo);
|
||||
hi = (hi >= 0x00400000u ? 2u * hi : 0x00400000u + hi);
|
||||
// Estimate endpoints of output.
|
||||
uint32_t A = 0x3E800000u + 4u * rsqrt_floor22(lo);
|
||||
uint32_t B = 0x3E800000u + 4u * rsqrt_floor22(hi);
|
||||
// Apply deltas, and increase the working precision.
|
||||
uint64_t a = (uint64_t(A) << 4) + uint64_t(vfpu_rsqrt_lut[x >> 6][0]);
|
||||
uint64_t b = (uint64_t(B) << 4) + uint64_t(vfpu_rsqrt_lut[x >> 6][1]);
|
||||
// Evaluate via lerp.
|
||||
uint32_t ret = uint32_t((a + (((b - a) * (x & 63)) >> 6)) >> 4);
|
||||
// Truncate lower 2 bits.
|
||||
ret &= -4u;
|
||||
return ret;
|
||||
}
|
||||
|
||||
float vfpu_rsqrt(float x) {
|
||||
static bool loaded =
|
||||
LOAD_TABLE(vfpu_rsqrt_lut, 262144);
|
||||
uint32_t bits;
|
||||
memcpy(&bits, &x, sizeof(bits));
|
||||
if((bits & 0x7FFFFFFFu) <= 0x007FFFFFu) {
|
||||
// Denormals (and zeroes) get inf of the same sign.
|
||||
bits = 0x7F800000u | (bits & 0x80000000u);
|
||||
memcpy(&x, &bits, sizeof(x));
|
||||
return x;
|
||||
}
|
||||
if(bits >> 31) {
|
||||
// Other negatives get negative NaN.
|
||||
bits = 0xFF800001u;
|
||||
memcpy(&x, &bits, sizeof(x));
|
||||
return x;
|
||||
}
|
||||
if((bits >> 23) == 255u) {
|
||||
// inf gets 0, NaN gets NaN.
|
||||
bits = ((bits & 0x007FFFFFu) ? 0x7F800001u : 0u);
|
||||
memcpy(&x, &bits, sizeof(bits));
|
||||
return x;
|
||||
}
|
||||
int32_t exponent = int32_t(bits >> 23) - 127;
|
||||
// Bottom bit of exponent (inverted) + significand (except bottom bit).
|
||||
uint32_t index = ((bits + 0x00800000u) >> 1) & 0x007FFFFFu;
|
||||
bits = vfpu_rsqrt_fixed(index);
|
||||
bits -= uint32_t(exponent >> 1) << 23;
|
||||
memcpy(&x, &bits, sizeof(bits));
|
||||
return x;
|
||||
}
|
||||
|
||||
static inline uint32_t vfpu_asin_quantum(uint32_t x) {
|
||||
return x<1u<<23?
|
||||
1u:
|
||||
1u<<(32-23-clz32_nonzero(x));
|
||||
}
|
||||
|
||||
static inline uint32_t vfpu_asin_truncate_bits(uint32_t x) {
|
||||
return x & -vfpu_asin_quantum(x);
|
||||
}
|
||||
|
||||
// Input is fixed 9.23, output is fixed 2.30.
|
||||
static inline uint32_t vfpu_asin_approx(uint32_t x) {
|
||||
const int32_t *C = vfpu_asin_lut65536[x >> 16];
|
||||
x &= 0xFFFFu;
|
||||
return vfpu_asin_truncate_bits(uint32_t((((((int64_t(C[2]) * x) >> 16) + int64_t(C[1])) * x) >> 16) + C[0]));
|
||||
}
|
||||
|
||||
// Input is fixed 9.23, output is fixed 2.30.
|
||||
static uint32_t vfpu_asin_fixed(uint32_t x) {
|
||||
if(x == 0u) return 0u;
|
||||
if(x == 1u << 23) return 1u << 30;
|
||||
uint32_t ret = vfpu_asin_approx(x);
|
||||
uint32_t index = vfpu_asin_lut_indices[x / 21u];
|
||||
uint64_t deltas = vfpu_asin_lut_deltas[index];
|
||||
return ret + (3u - uint32_t((deltas >> (3u * (x % 21u))) & 7u)) * vfpu_asin_quantum(ret);
|
||||
}
|
||||
|
||||
float vfpu_asin(float x) {
|
||||
static bool loaded =
|
||||
LOAD_TABLE(vfpu_asin_lut65536, 1536)&&
|
||||
LOAD_TABLE(vfpu_asin_lut_indices, 798916)&&
|
||||
LOAD_TABLE(vfpu_asin_lut_deltas, 517448);
|
||||
uint32_t bits;
|
||||
memcpy(&bits, &x, sizeof(x));
|
||||
uint32_t sign = bits & 0x80000000u;
|
||||
bits = bits & 0x7FFFFFFFu;
|
||||
if(bits > 0x3F800000u) {
|
||||
bits = 0x7F800001u ^ sign;
|
||||
memcpy(&x, &bits, sizeof(x));
|
||||
return x;
|
||||
}
|
||||
|
||||
i_sine.i &= 0xFFFFFFFC;
|
||||
i_cosine.i &= 0xFFFFFFFC;
|
||||
s = i_sine.f;
|
||||
c = i_cosine.f;
|
||||
return ;
|
||||
bits = vfpu_asin_fixed(uint32_t(int32_t(fabsf(x) * 8388608.0f))); // 0x1p23
|
||||
x=float(int32_t(bits)) * 9.31322574615478515625e-10f; // 0x1p-30
|
||||
if(sign) x = -x;
|
||||
return x;
|
||||
}
|
||||
|
||||
void InitVFPUSinCos() {
|
||||
// TODO: Could prepare a CORDIC table here.
|
||||
static inline uint32_t vfpu_exp2_approx(uint32_t x) {
|
||||
if(x == 0x00800000u) return 0x00800000u;
|
||||
uint32_t a=vfpu_exp2_lut65536[x >> 16];
|
||||
x &= 0x0000FFFFu;
|
||||
uint32_t b = uint32_t(((2977151143ull * x) >> 23) + ((1032119999ull * (x * x)) >> 46));
|
||||
return (a + uint32_t((uint64_t(a + (1u << 23)) * uint64_t(b)) >> 32)) & -4u;
|
||||
}
|
||||
|
||||
static inline uint32_t vfpu_exp2_fixed(uint32_t x) {
|
||||
if(x == 0u) return 0u;
|
||||
if(x == 0x00800000u) return 0x00800000u;
|
||||
uint32_t A = vfpu_exp2_approx((x ) & -64u);
|
||||
uint32_t B = vfpu_exp2_approx((x + 64) & -64u);
|
||||
uint64_t a = (A<<4)+vfpu_exp2_lut[x >> 6][0]-64u;
|
||||
uint64_t b = (B<<4)+vfpu_exp2_lut[x >> 6][1]-64u;
|
||||
uint32_t y = uint32_t((a + (((b - a) * (x & 63)) >> 6)) >> 4);
|
||||
y &= -4u;
|
||||
return y;
|
||||
}
|
||||
|
||||
float vfpu_exp2(float x) {
|
||||
static bool loaded =
|
||||
LOAD_TABLE(vfpu_exp2_lut65536, 512)&&
|
||||
LOAD_TABLE(vfpu_exp2_lut, 262144);
|
||||
int32_t bits;
|
||||
memcpy(&bits, &x, sizeof(bits));
|
||||
if((bits & 0x7FFFFFFF) <= 0x007FFFFF) {
|
||||
// Denormals are treated as 0.
|
||||
return 1.0f;
|
||||
}
|
||||
if(x != x) {
|
||||
// NaN gets NaN.
|
||||
bits = 0x7F800001u;
|
||||
memcpy(&x, &bits, sizeof(x));
|
||||
return x;
|
||||
}
|
||||
if(x <= -126.0f) {
|
||||
// Small numbers get 0 (exp2(-126) is smallest positive non-denormal).
|
||||
// But yes, -126.0f produces +0.0f.
|
||||
return 0.0f;
|
||||
}
|
||||
if(x >= +128.0f) {
|
||||
// Large numbers get infinity.
|
||||
bits = 0x7F800000u;
|
||||
memcpy(&x, &bits, sizeof(x));
|
||||
return x;
|
||||
}
|
||||
bits = int32_t(x * 0x1p23f);
|
||||
if(x < 0.0f) --bits; // Yes, really.
|
||||
bits = int32_t(0x3F800000) + (bits & int32_t(0xFF800000)) + int32_t(vfpu_exp2_fixed(bits & int32_t(0x007FFFFF)));
|
||||
memcpy(&x, &bits, sizeof(bits));
|
||||
return x;
|
||||
}
|
||||
|
||||
float vfpu_rexp2(float x) {
|
||||
return vfpu_exp2(-x);
|
||||
}
|
||||
|
||||
// Input fixed 9.23, output fixed 10.22.
|
||||
// Returns log2(1+x).
|
||||
static inline uint32_t vfpu_log2_approx(uint32_t x) {
|
||||
uint32_t a=vfpu_log2_lut65536[(x >> 16) + 0];
|
||||
uint32_t b=vfpu_log2_lut65536[(x >> 16) + 1];
|
||||
uint32_t c=vfpu_log2_lut65536_quadratic[x >> 16];
|
||||
x &= 0xFFFFu;
|
||||
uint64_t ret = uint64_t(a) * (0x10000u - x) + uint64_t(b) * x;
|
||||
uint64_t d = (uint64_t(c) * x * (0x10000u-x)) >> 40;
|
||||
ret += d;
|
||||
return uint32_t(ret >> 16);
|
||||
}
|
||||
|
||||
// Matches PSP output on all known values.
|
||||
float vfpu_log2(float x) {
|
||||
static bool loaded =
|
||||
LOAD_TABLE(vfpu_log2_lut65536, 516)&&
|
||||
LOAD_TABLE(vfpu_log2_lut65536_quadratic, 512)&&
|
||||
LOAD_TABLE(vfpu_log2_lut, 2097152);
|
||||
uint32_t bits;
|
||||
memcpy(&bits, &x, sizeof(bits));
|
||||
if((bits & 0x7FFFFFFFu) <= 0x007FFFFFu) {
|
||||
// Denormals (and zeroes) get -inf.
|
||||
bits = 0xFF800000u;
|
||||
memcpy(&x, &bits, sizeof(x));
|
||||
return x;
|
||||
}
|
||||
if(bits & 0x80000000u) {
|
||||
// Other negatives get NaN.
|
||||
bits = 0x7F800001u;
|
||||
memcpy(&x, &bits, sizeof(x));
|
||||
return x;
|
||||
}
|
||||
if((bits >> 23) == 255u) {
|
||||
// NaN gets NaN, +inf gets +inf.
|
||||
bits = 0x7F800000u + ((bits & 0x007FFFFFu) != 0);
|
||||
memcpy(&x, &bits, sizeof(x));
|
||||
return x;
|
||||
}
|
||||
uint32_t e = (bits & 0x7F800000u) - 0x3F800000u;
|
||||
uint32_t i = bits & 0x007FFFFFu;
|
||||
if(e >> 31 && i >= 0x007FFE00u) {
|
||||
// Process 1-2^{-14}<=x*2^n<1 (for n>0) separately,
|
||||
// since the table doesn't give the right answer.
|
||||
float c = float(int32_t(~e) >> 23);
|
||||
// Note: if c is 0 the sign of -0 output is correct.
|
||||
return i < 0x007FFEF7u ? // 1-265*2^{-24}
|
||||
-3.05175781e-05f - c:
|
||||
-0.0f - c;
|
||||
}
|
||||
int d = (e < 0x01000000u ? 0 : 8 - clz32_nonzero(e) - int(e >> 31));
|
||||
//assert(d >= 0 && d < 8);
|
||||
uint32_t q = 1u << d;
|
||||
uint32_t A = vfpu_log2_approx((i ) & -64u) & -q;
|
||||
uint32_t B = vfpu_log2_approx((i + 64) & -64u) & -q;
|
||||
uint64_t a = (A << 6)+(uint64_t(vfpu_log2_lut[d][i >> 6][0]) - 80ull) * q;
|
||||
uint64_t b = (B << 6)+(uint64_t(vfpu_log2_lut[d][i >> 6][1]) - 80ull) * q;
|
||||
uint32_t v = uint32_t((a +(((b - a) * (i & 63)) >> 6)) >> 6);
|
||||
v &= -q;
|
||||
bits = e ^ (2u * v);
|
||||
x = float(int32_t(bits)) * 1.1920928955e-7f; // 0x1p-23f
|
||||
return x;
|
||||
}
|
||||
|
||||
static inline uint32_t vfpu_rcp_approx(uint32_t i) {
|
||||
return 0x3E800000u + (uint32_t((1ull << 47) / ((1ull << 23) + i)) & -4u);
|
||||
}
|
||||
|
||||
float vfpu_rcp(float x) {
|
||||
static bool loaded =
|
||||
LOAD_TABLE(vfpu_rcp_lut, 262144);
|
||||
uint32_t bits;
|
||||
memcpy(&bits, &x, sizeof(bits));
|
||||
uint32_t s = bits & 0x80000000u;
|
||||
uint32_t e = bits & 0x7F800000u;
|
||||
uint32_t i = bits & 0x007FFFFFu;
|
||||
if((bits & 0x7FFFFFFFu) > 0x7E800000u) {
|
||||
bits = (e == 0x7F800000u && i ? s ^ 0x7F800001u : s);
|
||||
memcpy(&x, &bits, sizeof(x));
|
||||
return x;
|
||||
}
|
||||
if(e==0u) {
|
||||
bits = s^0x7F800000u;
|
||||
memcpy(&x, &bits, sizeof(x));
|
||||
return x;
|
||||
}
|
||||
uint32_t A = vfpu_rcp_approx((i ) & -64u);
|
||||
uint32_t B = vfpu_rcp_approx((i + 64) & -64u);
|
||||
uint64_t a = (uint64_t(A) << 6) + uint64_t(vfpu_rcp_lut[i >> 6][0]) * 4u;
|
||||
uint64_t b = (uint64_t(B) << 6) + uint64_t(vfpu_rcp_lut[i >> 6][1]) * 4u;
|
||||
uint32_t v = uint32_t((a+(((b-a)*(i&63))>>6))>>6);
|
||||
v &= -4u;
|
||||
bits = s + (0x3F800000u - e) + v;
|
||||
memcpy(&x, &bits, sizeof(x));
|
||||
return x;
|
||||
}
|
||||
|
||||
//==============================================================================
|
||||
|
||||
void InitVFPU() {
|
||||
#if 0
|
||||
// Load all in advance.
|
||||
LOAD_TABLE(vfpu_asin_lut65536 , 1536);
|
||||
LOAD_TABLE(vfpu_asin_lut_deltas , 517448);
|
||||
LOAD_TABLE(vfpu_asin_lut_indices , 798916);
|
||||
LOAD_TABLE(vfpu_exp2_lut65536 , 512);
|
||||
LOAD_TABLE(vfpu_exp2_lut , 262144);
|
||||
LOAD_TABLE(vfpu_log2_lut65536 , 516);
|
||||
LOAD_TABLE(vfpu_log2_lut65536_quadratic, 512);
|
||||
LOAD_TABLE(vfpu_log2_lut , 2097152);
|
||||
LOAD_TABLE(vfpu_rcp_lut , 262144);
|
||||
LOAD_TABLE(vfpu_rsqrt_lut , 262144);
|
||||
LOAD_TABLE(vfpu_sin_lut8192 , 4100);
|
||||
LOAD_TABLE(vfpu_sin_lut_delta , 262144);
|
||||
LOAD_TABLE(vfpu_sin_lut_exceptions , 86938);
|
||||
LOAD_TABLE(vfpu_sin_lut_interval_delta , 131074);
|
||||
LOAD_TABLE(vfpu_sqrt_lut , 262144);
|
||||
#endif
|
||||
}
|
||||
|
@ -35,18 +35,15 @@ inline int Xpose(int v) {
|
||||
#endif
|
||||
|
||||
// The VFPU uses weird angles where 4.0 represents a full circle. This makes it possible to return
|
||||
// exact 1.0/-1.0 values at certain angles. We currently just scale, and special case the cardinal directions.
|
||||
// exact 1.0/-1.0 values at certain angles.
|
||||
//
|
||||
// Stepping down to [0, 2pi) helps, but we also check common exact-result values.
|
||||
// TODO: cos(1) and sin(2) should be -0.0, but doing that gives wrong results (possibly from floorf.)
|
||||
//
|
||||
// We also try an alternative solution, computing things in double precision, multiplying the input by pi/2.
|
||||
// This fixes #12900 (Hitman Reborn 2) but breaks #13705 (Cho Aniki Zero) and #13671 (Hajime no Ippo).
|
||||
// #2921 is still fine. So the alt solution (vfpu_sin_double etc) are behind a compat flag.
|
||||
//
|
||||
// A better solution would be to tailor some sine approximation for the 0..90 degrees range, compute
|
||||
// modulo manually and mirror that around the circle. Also correctly special casing for inf/nan inputs
|
||||
// and just trying to match it as closely as possible to the real PSP.
|
||||
// The current code attempts to match VFPU sin/cos exactly.
|
||||
// Possibly affected games:
|
||||
// Final Fantasy III (#2921 )
|
||||
// Hitman Reborn 2 (#12900)
|
||||
// Cho Aniki Zero (#13705)
|
||||
// Hajime no Ippo (#13671)
|
||||
// Dissidia Duodecim Final Fantasy (#6710 )
|
||||
//
|
||||
// Messing around with the modulo functions? try https://www.desmos.com/calculator.
|
||||
|
||||
@ -54,9 +51,7 @@ extern float vfpu_sin(float);
|
||||
extern float vfpu_cos(float);
|
||||
extern void vfpu_sincos(float, float&, float&);
|
||||
|
||||
inline float vfpu_asin(float angle) {
|
||||
return (float)(asinf(angle) / M_PI_2);
|
||||
}
|
||||
extern float vfpu_asin(float);
|
||||
|
||||
inline float vfpu_clamp(float v, float min, float max) {
|
||||
// Note: NAN is preserved, and -0.0 becomes +0.0 if min=+0.0.
|
||||
@ -67,6 +62,11 @@ float vfpu_dot(const float a[4], const float b[4]);
|
||||
float vfpu_sqrt(float a);
|
||||
float vfpu_rsqrt(float a);
|
||||
|
||||
extern float vfpu_exp2(float);
|
||||
extern float vfpu_rexp2(float);
|
||||
extern float vfpu_log2(float);
|
||||
extern float vfpu_rcp(float);
|
||||
|
||||
#define VFPU_FLOAT16_EXP_MAX 0x1f
|
||||
#define VFPU_SH_FLOAT16_SIGN 15
|
||||
#define VFPU_MASK_FLOAT16_SIGN 0x1
|
||||
@ -215,4 +215,4 @@ int GetVectorOverlap(int reg1, VectorSize size1, int reg2, VectorSize size2);
|
||||
bool GetVFPUCtrlMask(int reg, u32 *mask);
|
||||
|
||||
float Float16ToFloat32(unsigned short l);
|
||||
void InitVFPUSinCos();
|
||||
void InitVFPU();
|
||||
|
@ -306,7 +306,7 @@ bool CPU_Init(std::string *errorString) {
|
||||
// likely to collide with any commercial ones.
|
||||
g_CoreParameter.compat.Load(g_paramSFO.GetDiscID());
|
||||
|
||||
InitVFPUSinCos();
|
||||
InitVFPU();
|
||||
|
||||
if (allowPlugins)
|
||||
HLEPlugins::Init();
|
||||
|
BIN
assets/vfpu/vfpu_asin_lut65536.dat
Normal file
BIN
assets/vfpu/vfpu_asin_lut65536.dat
Normal file
Binary file not shown.
BIN
assets/vfpu/vfpu_asin_lut_deltas.dat
Normal file
BIN
assets/vfpu/vfpu_asin_lut_deltas.dat
Normal file
Binary file not shown.
BIN
assets/vfpu/vfpu_asin_lut_indices.dat
Normal file
BIN
assets/vfpu/vfpu_asin_lut_indices.dat
Normal file
Binary file not shown.
4
assets/vfpu/vfpu_exp2_lut.dat
Normal file
4
assets/vfpu/vfpu_exp2_lut.dat
Normal file
File diff suppressed because one or more lines are too long
BIN
assets/vfpu/vfpu_exp2_lut65536.dat
Normal file
BIN
assets/vfpu/vfpu_exp2_lut65536.dat
Normal file
Binary file not shown.
22
assets/vfpu/vfpu_log2_lut.dat
Normal file
22
assets/vfpu/vfpu_log2_lut.dat
Normal file
File diff suppressed because one or more lines are too long
BIN
assets/vfpu/vfpu_log2_lut65536.dat
Normal file
BIN
assets/vfpu/vfpu_log2_lut65536.dat
Normal file
Binary file not shown.
BIN
assets/vfpu/vfpu_log2_lut65536_quadratic.dat
Normal file
BIN
assets/vfpu/vfpu_log2_lut65536_quadratic.dat
Normal file
Binary file not shown.
BIN
assets/vfpu/vfpu_rcp_lut.dat
Normal file
BIN
assets/vfpu/vfpu_rcp_lut.dat
Normal file
Binary file not shown.
BIN
assets/vfpu/vfpu_rsqrt_lut.dat
Normal file
BIN
assets/vfpu/vfpu_rsqrt_lut.dat
Normal file
Binary file not shown.
BIN
assets/vfpu/vfpu_sin_lut8192.dat
Normal file
BIN
assets/vfpu/vfpu_sin_lut8192.dat
Normal file
Binary file not shown.
BIN
assets/vfpu/vfpu_sin_lut_delta.dat
Normal file
BIN
assets/vfpu/vfpu_sin_lut_delta.dat
Normal file
Binary file not shown.
BIN
assets/vfpu/vfpu_sin_lut_exceptions.dat
Normal file
BIN
assets/vfpu/vfpu_sin_lut_exceptions.dat
Normal file
Binary file not shown.
BIN
assets/vfpu/vfpu_sin_lut_interval_delta.dat
Normal file
BIN
assets/vfpu/vfpu_sin_lut_interval_delta.dat
Normal file
Binary file not shown.
BIN
assets/vfpu/vfpu_sqrt_lut.dat
Normal file
BIN
assets/vfpu/vfpu_sqrt_lut.dat
Normal file
Binary file not shown.
@ -92,7 +92,7 @@ static void SetupJitHarness() {
|
||||
Memory::Init();
|
||||
mipsr4k.Reset();
|
||||
CoreTiming::Init();
|
||||
InitVFPUSinCos();
|
||||
InitVFPU();
|
||||
}
|
||||
|
||||
static void DestroyJitHarness() {
|
||||
|
@ -58,6 +58,8 @@
|
||||
#include "Common/CPUDetect.h"
|
||||
#include "Common/Log.h"
|
||||
#include "Core/Config.h"
|
||||
#include "Common/File/VFS/VFS.h"
|
||||
#include "Common/File/VFS/DirectoryReader.h"
|
||||
#include "Core/FileSystems/ISOFileSystem.h"
|
||||
#include "Core/MemMap.h"
|
||||
#include "Core/MIPS/MIPSVFPUUtils.h"
|
||||
@ -358,7 +360,10 @@ bool TestTinySet() {
|
||||
|
||||
bool TestVFPUSinCos() {
|
||||
float sine, cosine;
|
||||
InitVFPUSinCos();
|
||||
// Needed for VFPU tables.
|
||||
// There might be a better place to invoke it, but whatever.
|
||||
g_VFS.Register("", new DirectoryReader(Path("assets")));
|
||||
InitVFPU();
|
||||
vfpu_sincos(0.0f, sine, cosine);
|
||||
EXPECT_EQ_FLOAT(sine, 0.0f);
|
||||
EXPECT_EQ_FLOAT(cosine, 1.0f);
|
||||
|
Loading…
Reference in New Issue
Block a user