GPU: Use NEON for vector length/normalize.

This is a bit less accurate for normalize, but it's faster.
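
For reference, the accuracy difference comes from the normalize path below using vrsqrte_f32, a coarse hardware reciprocal square root estimate, refined by two vrsqrts_f32 Newton-Raphson steps (each computes (3 - s*r*r) / 2), instead of a full-precision square root and divide. A minimal scalar sketch of that refinement, not part of the commit; RSqrtStep is a hypothetical helper and the starting guess is an arbitrary stand-in for the hardware estimate:

#include <cmath>
#include <cstdio>

// One Newton-Raphson step for r ~ 1/sqrt(s); this is what the
// vmul_f32(vrsqrts_f32(vmul_f32(e, e), summed), e) pattern computes.
static float RSqrtStep(float s, float r) {
	return r * (3.0f - s * r * r) * 0.5f;
}

int main() {
	float s = 2.0f;   // squared length
	float r = 0.7f;   // stand-in for the coarse vrsqrte_f32 estimate
	r = RSqrtStep(s, r);   // first refinement step
	r = RSqrtStep(s, r);   // second refinement step
	printf("refined: %.9g  exact: %.9g\n", r, 1.0f / sqrtf(s));
	return 0;
}

Each step roughly squares the relative error of the estimate, so two steps land close to, but not exactly at, single-precision accuracy; hence "a bit less accurate".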
Unknown W. Brackets 2023-01-08 12:32:55 -08:00
parent 8acd855eac
commit d90aec6f63


@@ -32,6 +32,12 @@ float Vec2<float>::Length() const
 	const __m128 res = _mm_add_ss(sq, r2);
 	_mm_store_ss(&ret, _mm_sqrt_ss(res));
 	return ret;
+#elif PPSSPP_ARCH(ARM64_NEON)
+	float32x2_t vec = vld1_f32(&x);
+	float32x2_t sq = vmul_f32(vec, vec);
+	float32x2_t add2 = vpadd_f32(sq, sq);
+	float32x2_t res = vsqrt_f32(add2);
+	return vget_lane_f32(res, 0);
 #else
 	return sqrtf(Length2());
 #endif
@@ -80,6 +86,12 @@ float Vec3<float>::Length() const
 	const __m128 res = _mm_add_ss(sq, _mm_add_ss(r2, r3));
 	_mm_store_ss(&ret, _mm_sqrt_ss(res));
 	return ret;
+#elif PPSSPP_ARCH(ARM64_NEON)
+	float32x4_t sq = vsetq_lane_f32(0.0f, vmulq_f32(vec, vec), 3);
+	float32x2_t add1 = vget_low_f32(vpaddq_f32(sq, sq));
+	float32x2_t add2 = vpadd_f32(add1, add1);
+	float32x2_t res = vsqrt_f32(add2);
+	return vget_lane_f32(res, 0);
 #else
 	return sqrtf(Length2());
 #endif
@@ -146,6 +158,37 @@ Vec3<float> Vec3<float>::NormalizedOr001(bool useSSE4) const {
 	// Replace with the constant if the mask matched.
 	return _mm_or_ps(_mm_andnot_ps(mask, result), replace);
 }
+#elif PPSSPP_ARCH(ARM64_NEON)
+template<>
+Vec3<float> Vec3<float>::Normalized(bool useSSE4) const {
+	float32x4_t sq = vsetq_lane_f32(0.0f, vmulq_f32(vec, vec), 3);
+	float32x2_t add1 = vget_low_f32(vpaddq_f32(sq, sq));
+	float32x2_t summed = vpadd_f32(add1, add1);
+
+	float32x2_t e = vrsqrte_f32(summed);
+	e = vmul_f32(vrsqrts_f32(vmul_f32(e, e), summed), e);
+	e = vmul_f32(vrsqrts_f32(vmul_f32(e, e), summed), e);
+
+	float32x4_t factor = vdupq_lane_f32(e, 0);
+	return Vec3<float>(vmulq_f32(vec, factor));
+}
+
+template<>
+Vec3<float> Vec3<float>::NormalizedOr001(bool useSSE4) const {
+	float32x4_t sq = vsetq_lane_f32(0.0f, vmulq_f32(vec, vec), 3);
+	float32x2_t add1 = vget_low_f32(vpaddq_f32(sq, sq));
+	float32x2_t summed = vpadd_f32(add1, add1);
+	if (vget_lane_f32(summed, 0) == 0.0f) {
+		return Vec3<float>(vsetq_lane_f32(1.0f, vdupq_lane_f32(summed, 0), 2));
+	}
+
+	float32x2_t e = vrsqrte_f32(summed);
+	e = vmul_f32(vrsqrts_f32(vmul_f32(e, e), summed), e);
+	e = vmul_f32(vrsqrts_f32(vmul_f32(e, e), summed), e);
+
+	float32x4_t factor = vdupq_lane_f32(e, 0);
+	return Vec3<float>(vmulq_f32(vec, factor));
+}
 #else
 template<>
 Vec3<float> Vec3<float>::Normalized(bool useSSE4) const
@@ -258,6 +301,12 @@ float Vec4<float>::Length() const
 	const __m128 res = _mm_add_ss(r2, _mm_shuffle_ps(r2, r2, _MM_SHUFFLE(0, 0, 0, 1)));
 	_mm_store_ss(&ret, _mm_sqrt_ss(res));
 	return ret;
+#elif PPSSPP_ARCH(ARM64_NEON)
+	float32x4_t sq = vmulq_f32(vec, vec);
+	float32x2_t add1 = vget_low_f32(vpaddq_f32(sq, sq));
+	float32x2_t add2 = vpadd_f32(add1, add1);
+	float32x2_t res = vsqrt_f32(add2);
+	return vget_lane_f32(res, 0);
 #else
 	return sqrtf(Length2());
 #endif
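
If you want to quantify the accuracy tradeoff, a small standalone harness like this (not part of the commit; assumes an ARM64 toolchain, and the sample range is arbitrary) compares the refined-estimate factor against a full-precision one:

#include <arm_neon.h>
#include <cmath>
#include <cstdio>

int main() {
	float maxRelErr = 0.0f;
	for (float s = 0.25f; s < 1000.0f; s += 0.37f) {
		// Same vrsqrte + two vrsqrts refinement steps as the NEON normalize above.
		float32x2_t summed = vdup_n_f32(s);
		float32x2_t e = vrsqrte_f32(summed);
		e = vmul_f32(vrsqrts_f32(vmul_f32(e, e), summed), e);
		e = vmul_f32(vrsqrts_f32(vmul_f32(e, e), summed), e);
		float approx = vget_lane_f32(e, 0);
		float exact = 1.0f / sqrtf(s);
		float relErr = fabsf(approx - exact) / exact;
		if (relErr > maxRelErr)
			maxRelErr = relErr;
	}
	printf("max relative error: %g\n", maxRelErr);
	return 0;
}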