From d90aec6f63acca69f8806a685e9fb0aa26f26ff1 Mon Sep 17 00:00:00 2001 From: "Unknown W. Brackets" Date: Sun, 8 Jan 2023 12:32:55 -0800 Subject: [PATCH] GPU: Use NEON for vector length/normalize. This is a bit less accurate for normalize, but it's faster. --- GPU/Math3D.cpp | 49 +++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 49 insertions(+) diff --git a/GPU/Math3D.cpp b/GPU/Math3D.cpp index fec4c887bd..22ad0f86d8 100644 --- a/GPU/Math3D.cpp +++ b/GPU/Math3D.cpp @@ -32,6 +32,12 @@ float Vec2::Length() const const __m128 res = _mm_add_ss(sq, r2); _mm_store_ss(&ret, _mm_sqrt_ss(res)); return ret; +#elif PPSSPP_ARCH(ARM64_NEON) + float32x2_t vec = vld1_f32(&x); + float32x2_t sq = vmul_f32(vec, vec); + float32x2_t add2 = vpadd_f32(sq, sq); + float32x2_t res = vsqrt_f32(add2); + return vget_lane_f32(res, 0); #else return sqrtf(Length2()); #endif @@ -80,6 +86,12 @@ float Vec3::Length() const const __m128 res = _mm_add_ss(sq, _mm_add_ss(r2, r3)); _mm_store_ss(&ret, _mm_sqrt_ss(res)); return ret; +#elif PPSSPP_ARCH(ARM64_NEON) + float32x4_t sq = vsetq_lane_f32(0.0f, vmulq_f32(vec, vec), 3); + float32x2_t add1 = vget_low_f32(vpaddq_f32(sq, sq)); + float32x2_t add2 = vpadd_f32(add1, add1); + float32x2_t res = vsqrt_f32(add2); + return vget_lane_f32(res, 0); #else return sqrtf(Length2()); #endif @@ -146,6 +158,37 @@ Vec3 Vec3::NormalizedOr001(bool useSSE4) const { // Replace with the constant if the mask matched. 
return _mm_or_ps(_mm_andnot_ps(mask, result), replace);
 }
+#elif PPSSPP_ARCH(ARM64_NEON) // ARM64 NEON versions of the two Vec3 normalize specializations.
+template<> // Returns vec scaled by an approximate 1/sqrt(sum of squares of lanes 0-2); no zero-length check here.
+Vec3 Vec3::Normalized(bool useSSE4) const { // useSSE4 is not read on this path.
+	float32x4_t sq = vsetq_lane_f32(0.0f, vmulq_f32(vec, vec), 3); // Square every lane, then zero lane 3 so it adds nothing to the sum.
+	float32x2_t add1 = vget_low_f32(vpaddq_f32(sq, sq)); // Pairwise add: (sq0 + sq1, sq2 + sq3).
+	float32x2_t summed = vpadd_f32(add1, add1); // Both lanes now hold sq0 + sq1 + sq2 + sq3.
+
+	float32x2_t e = vrsqrte_f32(summed); // Hardware estimate of 1/sqrt(summed).
+	e = vmul_f32(vrsqrts_f32(vmul_f32(e, e), summed), e); // Newton-Raphson step: e *= (3 - summed * e * e) / 2.
+	e = vmul_f32(vrsqrts_f32(vmul_f32(e, e), summed), e); // Second, identical refinement step.
+
+	float32x4_t factor = vdupq_lane_f32(e, 0); // Broadcast lane 0 of the refined estimate to all four lanes.
+	return Vec3(vmulq_f32(vec, factor)); // Multiply every lane of vec by the factor.
+}
+
+template<> // Same as Normalized above, except a sum of squares equal to 0 returns lanes (0, 0, 1, 0).
+Vec3 Vec3::NormalizedOr001(bool useSSE4) const { // useSSE4 is not read on this path.
+	float32x4_t sq = vsetq_lane_f32(0.0f, vmulq_f32(vec, vec), 3); // Square every lane, then zero lane 3 so it adds nothing to the sum.
+	float32x2_t add1 = vget_low_f32(vpaddq_f32(sq, sq)); // Pairwise add: (sq0 + sq1, sq2 + sq3).
+	float32x2_t summed = vpadd_f32(add1, add1); // Both lanes now hold sq0 + sq1 + sq2 + sq3.
+	if (vget_lane_f32(summed, 0) == 0.0f) { // Checked before the estimate so a zero sum never reaches vrsqrte.
+		return Vec3(vsetq_lane_f32(1.0f, vdupq_lane_f32(summed, 0), 2)); // Broadcast the zero sum to all lanes, then set lane 2 to 1.0f.
+	}
+
+	float32x2_t e = vrsqrte_f32(summed); // Hardware estimate of 1/sqrt(summed).
+	e = vmul_f32(vrsqrts_f32(vmul_f32(e, e), summed), e); // Newton-Raphson step: e *= (3 - summed * e * e) / 2.
+	e = vmul_f32(vrsqrts_f32(vmul_f32(e, e), summed), e); // Second, identical refinement step.
+
+	float32x4_t factor = vdupq_lane_f32(e, 0); // Broadcast lane 0 of the refined estimate to all four lanes.
+	return Vec3(vmulq_f32(vec, factor)); // Multiply every lane of vec by the factor.
+}
 #else
 template<>
 Vec3 Vec3::Normalized(bool useSSE4) const
@@ -258,6 +301,12 @@ float Vec4::Length() const
 	const __m128 res = _mm_add_ss(r2, _mm_shuffle_ps(r2, r2, _MM_SHUFFLE(0, 0, 0, 1)));
 	_mm_store_ss(&ret, _mm_sqrt_ss(res));
 	return ret;
+#elif PPSSPP_ARCH(ARM64_NEON) // ARM64 NEON branch of this length computation.
+	float32x4_t sq = vmulq_f32(vec, vec); // Square all four lanes; unlike the Vec3 code, lane 3 is kept.
+	float32x2_t add1 = vget_low_f32(vpaddq_f32(sq, sq)); // Pairwise add: (sq0 + sq1, sq2 + sq3).
+	float32x2_t add2 = vpadd_f32(add1, add1); // Both lanes now hold the sum of all four squares.
+	float32x2_t res = vsqrt_f32(add2); // Full square root, not the reciprocal estimate.
+	return vget_lane_f32(res, 0); // Extract the scalar result from lane 0.
 #else
 	return sqrtf(Length2());
 #endif