interp: Implement software inverse square root.

This commit is contained in:
Unknown W. Brackets 2019-07-14 14:09:20 -07:00
parent 6028b79895
commit c1c869df27
3 changed files with 77 additions and 16 deletions

View File

@ -618,13 +618,13 @@ namespace MIPSInt
case 4: if (s[i] <= 0) d[i] = 0; else {if(s[i] > 1.0f) d[i] = 1.0f; else d[i] = s[i];} break; // vsat0
case 5: if (s[i] < -1.0f) d[i] = -1.0f; else {if(s[i] > 1.0f) d[i] = 1.0f; else d[i] = s[i];} break; // vsat1
case 16: d[i] = 1.0f / s[i]; break; //vrcp
case 17: d[i] = USE_VPFU_SQRT ? 1.0f / vfpu_sqrt(s[i]) : 1.0f / sqrtf(s[i]); break; //vrsq
case 17: d[i] = USE_VPFU_SQRT ? vfpu_rsqrt(s[i]) : 1.0f / sqrtf(s[i]); break; //vrsq
case 18: { d[i] = vfpu_sin(s[i]); } break; //vsin
case 19: { d[i] = vfpu_cos(s[i]); } break; //vcos
case 20: d[i] = powf(2.0f, s[i]); break; //vexp2
case 21: d[i] = logf(s[i])/log(2.0f); break; //vlog2
case 22: d[i] = USE_VPFU_SQRT ? fabsf(vfpu_sqrt(s[i])) : fabsf(sqrtf(s[i])); break; //vsqrt
case 22: d[i] = USE_VPFU_SQRT ? vfpu_sqrt(s[i]) : fabsf(sqrtf(s[i])); break; //vsqrt
case 23: d[i] = asinf(s[i]) / M_PI_2; break; //vasin
case 24: d[i] = -1.0f / s[i]; break; // vnrcp
case 26: { d[i] = -vfpu_sin(s[i]); } break; // vnsin

View File

@ -717,10 +717,13 @@ float vfpu_dot(float a[4], float b[4]) {
int8_t shift = (int8_t)clz32_nonzero(mant_sum) - 8;
if (shift < 0) {
// Round to even if we'd shift away a 0.5.
uint32_t round_bit = 1 << (-shift - 1);
const uint32_t round_bit = 1 << (-shift - 1);
if ((mant_sum & round_bit) && (mant_sum & (round_bit << 1))) {
mant_sum += round_bit;
shift = (int8_t)clz32_nonzero(mant_sum) - 8;
} else if ((mant_sum & round_bit) && (mant_sum & (round_bit - 1))) {
mant_sum += round_bit;
shift = (int8_t)clz32_nonzero(mant_sum) - 8;
}
mant_sum >>= -shift;
max_exp += -shift;
@ -756,7 +759,7 @@ float vfpu_sqrt(float a) {
}
return val.f;
}
if ((val.u & 0x7fffffff) == 0) {
if ((val.u & 0x7f800000) == 0) {
// Kill any sign.
val.u = 0;
return val.f;
@ -767,26 +770,83 @@ float vfpu_sqrt(float a) {
}
int k = get_exp(val.u);
int sp = get_mant(val.u);
int less_bits;
uint32_t sp = get_mant(val.u);
int less_bits = k & 1;
k >>= 1;
if (k & 1) {
less_bits = 1;
k /= 2;
} else {
less_bits = 0;
k /= 2;
}
int z = 0x00800000 >> less_bits;
uint32_t z = 0x00C00000 >> less_bits;
int64_t halfsp = sp >> 1;
halfsp <<= 23 - less_bits;
for (int i = 0; i < 6; ++i) {
z = (z >> 1) + (int)(halfsp / z);
z = (z >> 1) + (uint32_t)(halfsp / z);
}
val.u = ((k + 127) << 23) | ((z << less_bits) & 0x007FFFFF);
// The lower two bits never end up set on the PSP, it seems like.
val.u &= 0xFFFFFFFC;
return val.f;
}
static inline uint32_t mant_mul(uint32_t a, uint32_t b) {
uint64_t m = (uint64_t)a * (uint64_t)b;
if (m & 0x007FFFFF) {
m += 0x01437000;
}
return m >> 23;
}
float vfpu_rsqrt(float a) {
union float2int {
uint32_t u;
float f;
};
float2int val;
val.f = a;
if (val.u == 0x7f800000) {
return 0.0f;
}
if ((val.u & 0x7fffffff) > 0x7f800000) {
val.u = (val.u & 0x80000000) | 0x7f800001;
return val.f;
}
if ((val.u & 0x7f800000) == 0) {
val.u = (val.u & 0x80000000) | 0x7f800000;
return val.f;
}
if (val.u & 0x80000000) {
val.u = 0xff800001;
return val.f;
}
int k = get_exp(val.u);
uint32_t sp = get_mant(val.u);
int less_bits = k & 1;
k = -(k >> 1);
uint32_t z = 0x00800000 >> less_bits;
uint32_t halfsp = sp >> (1 + less_bits);
for (int i = 0; i < 6; ++i) {
uint32_t oldz = z;
uint32_t zsq = mant_mul(z, z);
uint32_t correction = 0x00C00000 - mant_mul(halfsp, zsq);
z = mant_mul(z, correction);
}
int8_t shift = (int8_t)clz32_nonzero(z) - 8 + less_bits;
if (shift < 1) {
z >>= -shift;
k += -shift;
} else if (shift > 0) {
z <<= shift;
k -= shift;
}
z >>= less_bits;
val.u = ((k + 127) << 23) | (z & 0x007FFFFF);
val.u &= 0xFFFFFFFC;
return val.f;
}

View File

@ -99,6 +99,7 @@ inline float vfpu_clamp(float v, float min, float max) {
float vfpu_dot(float a[4], float b[4]);
float vfpu_sqrt(float a);
float vfpu_rsqrt(float a);
#define VFPU_FLOAT16_EXP_MAX 0x1f
#define VFPU_SH_FLOAT16_SIGN 15