mirror of https://github.com/hrydgard/ppsspp.git
GPU: Use NEON for vector length/normalize.
This is a bit less accurate for normalize, but it's faster.
commit d90aec6f63
parent 8acd855eac
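The accuracy caveat in the message comes from the normalize paths below: instead of dividing by a true vsqrt, they seed with vrsqrte_f32 and refine the reciprocal square root with two vrsqrts_f32 Newton-Raphson steps. A minimal standalone sketch of that pattern; the scalar wrapper and test values are illustrative, not part of the commit (AArch64 only):

#include <arm_neon.h>
#include <cmath>
#include <cstdio>

// Approximate 1/sqrt(x) the way the normalize paths below do:
// a coarse estimate from vrsqrte_f32, then two Newton-Raphson
// refinements via vrsqrts_f32, each roughly doubling the precision.
static float approx_rsqrt(float x) {
	float32x2_t v = vdup_n_f32(x);
	float32x2_t e = vrsqrte_f32(v);                  // coarse estimate
	e = vmul_f32(vrsqrts_f32(vmul_f32(e, e), v), e); // 1st refinement
	e = vmul_f32(vrsqrts_f32(vmul_f32(e, e), v), e); // 2nd refinement
	return vget_lane_f32(e, 0);
}

int main() {
	float x = 3.0f;
	// After two steps the result is close to, but not bit-exact with,
	// 1.0f / sqrtf(x) -- the "less accurate but faster" tradeoff.
	printf("approx=%.9f exact=%.9f\n", approx_rsqrt(x), 1.0f / sqrtf(x));
}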
@@ -32,6 +32,12 @@ float Vec2<float>::Length() const
 	const __m128 res = _mm_add_ss(sq, r2);
 	_mm_store_ss(&ret, _mm_sqrt_ss(res));
 	return ret;
+#elif PPSSPP_ARCH(ARM64_NEON)
+	float32x2_t vec = vld1_f32(&x);
+	float32x2_t sq = vmul_f32(vec, vec);
+	float32x2_t add2 = vpadd_f32(sq, sq);
+	float32x2_t res = vsqrt_f32(add2);
+	return vget_lane_f32(res, 0);
 #else
 	return sqrtf(Length2());
 #endif
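In the Vec2 hunk, the horizontal sum comes from vpadd_f32, which adds adjacent lane pairs; passing the same vector as both operands leaves x*x + y*y in both lanes. A self-contained sketch of the same sequence (the free function is illustrative; the real code is a Vec2<float> member):

#include <arm_neon.h>
#include <cstdio>

// Mirrors the Vec2 path above: load {x, y}, square, pairwise-add,
// then take an exact square root (vsqrt_f32 is IEEE-correct, unlike
// the estimate-based normalize path).
static float vec2_length_neon(const float *xy) {
	float32x2_t v = vld1_f32(xy);        // {x, y}
	float32x2_t sq = vmul_f32(v, v);     // {x*x, y*y}
	float32x2_t sum = vpadd_f32(sq, sq); // both lanes hold x*x + y*y
	return vget_lane_f32(vsqrt_f32(sum), 0);
}

int main() {
	float xy[2] = {3.0f, 4.0f};
	printf("length = %f\n", vec2_length_neon(xy)); // 5.0, the 3-4-5 triangle
}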
@@ -80,6 +86,12 @@ float Vec3<float>::Length() const
 	const __m128 res = _mm_add_ss(sq, _mm_add_ss(r2, r3));
 	_mm_store_ss(&ret, _mm_sqrt_ss(res));
 	return ret;
+#elif PPSSPP_ARCH(ARM64_NEON)
+	float32x4_t sq = vsetq_lane_f32(0.0f, vmulq_f32(vec, vec), 3);
+	float32x2_t add1 = vget_low_f32(vpaddq_f32(sq, sq));
+	float32x2_t add2 = vpadd_f32(add1, add1);
+	float32x2_t res = vsqrt_f32(add2);
+	return vget_lane_f32(res, 0);
 #else
 	return sqrtf(Length2());
 #endif
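The Vec3 variant squares all four lanes of the backing register, then zeroes lane 3 with vsetq_lane_f32 so that whatever padding sits in the unused fourth float cannot leak into the sum. A small demonstration of why the masking step matters (the test values are made up):

#include <arm_neon.h>
#include <cstdio>

int main() {
	// Simulate a Vec3 whose pad lane holds garbage.
	float data[4] = {1.0f, 2.0f, 2.0f, 999.0f};
	float32x4_t v = vld1q_f32(data);

	// Square first, then zero lane 3 -- without this, 999^2 would
	// dominate the horizontal sum.
	float32x4_t sq = vsetq_lane_f32(0.0f, vmulq_f32(v, v), 3);

	// A64 pairwise add: {x2+y2, z2+0, ...}, then fold once more.
	float32x2_t add1 = vget_low_f32(vpaddq_f32(sq, sq));
	float32x2_t sum = vpadd_f32(add1, add1);
	printf("len = %f\n", vget_lane_f32(vsqrt_f32(sum), 0)); // 3.0
}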
@@ -146,6 +158,37 @@ Vec3<float> Vec3<float>::NormalizedOr001(bool useSSE4) const {
 	// Replace with the constant if the mask matched.
 	return _mm_or_ps(_mm_andnot_ps(mask, result), replace);
 }
+#elif PPSSPP_ARCH(ARM64_NEON)
+template<>
+Vec3<float> Vec3<float>::Normalized(bool useSSE4) const {
+	float32x4_t sq = vsetq_lane_f32(0.0f, vmulq_f32(vec, vec), 3);
+	float32x2_t add1 = vget_low_f32(vpaddq_f32(sq, sq));
+	float32x2_t summed = vpadd_f32(add1, add1);
+
+	float32x2_t e = vrsqrte_f32(summed);
+	e = vmul_f32(vrsqrts_f32(vmul_f32(e, e), summed), e);
+	e = vmul_f32(vrsqrts_f32(vmul_f32(e, e), summed), e);
+
+	float32x4_t factor = vdupq_lane_f32(e, 0);
+	return Vec3<float>(vmulq_f32(vec, factor));
+}
+
+template<>
+Vec3<float> Vec3<float>::NormalizedOr001(bool useSSE4) const {
+	float32x4_t sq = vsetq_lane_f32(0.0f, vmulq_f32(vec, vec), 3);
+	float32x2_t add1 = vget_low_f32(vpaddq_f32(sq, sq));
+	float32x2_t summed = vpadd_f32(add1, add1);
+	if (vget_lane_f32(summed, 0) == 0.0f) {
+		return Vec3<float>(vsetq_lane_f32(1.0f, vdupq_lane_f32(summed, 0), 2));
+	}
+
+	float32x2_t e = vrsqrte_f32(summed);
+	e = vmul_f32(vrsqrts_f32(vmul_f32(e, e), summed), e);
+	e = vmul_f32(vrsqrts_f32(vmul_f32(e, e), summed), e);
+
+	float32x4_t factor = vdupq_lane_f32(e, 0);
+	return Vec3<float>(vmulq_f32(vec, factor));
+}
 #else
 template<>
 Vec3<float> Vec3<float>::Normalized(bool useSSE4) const
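NormalizedOr001 guards the reciprocal square root: if the squared length is exactly zero, vrsqrte_f32 would return infinity, so the function instead builds the unit vector (0, 0, 1) by broadcasting the zero sum and writing 1.0 into lane 2. A sketch of just that fallback construction, outside the Vec3 class:

#include <arm_neon.h>
#include <cstdio>

int main() {
	// Squared length came out exactly zero.
	float32x2_t summed = vdup_n_f32(0.0f);

	// Broadcast the zero, then set lane 2 (z) to 1.0f, yielding
	// the vector (0, 0, 1, 0) -- the same trick as the diff above.
	float32x4_t fallback =
		vsetq_lane_f32(1.0f, vdupq_lane_f32(summed, 0), 2);

	float out[4];
	vst1q_f32(out, fallback);
	printf("(%g, %g, %g)\n", out[0], out[1], out[2]); // (0, 0, 1)
}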
@@ -258,6 +301,12 @@ float Vec4<float>::Length() const
 	const __m128 res = _mm_add_ss(r2, _mm_shuffle_ps(r2, r2, _MM_SHUFFLE(0, 0, 0, 1)));
 	_mm_store_ss(&ret, _mm_sqrt_ss(res));
 	return ret;
+#elif PPSSPP_ARCH(ARM64_NEON)
+	float32x4_t sq = vmulq_f32(vec, vec);
+	float32x2_t add1 = vget_low_f32(vpaddq_f32(sq, sq));
+	float32x2_t add2 = vpadd_f32(add1, add1);
+	float32x2_t res = vsqrt_f32(add2);
+	return vget_lane_f32(res, 0);
 #else
 	return sqrtf(Length2());
 #endif
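All four hunks keep sqrtf(Length2()) as the scalar fallback, which doubles as a reference for checking the SIMD paths. A free-standing cross-check of the Vec4 variant against that reference might look like this (both function names are illustrative; the real code is a Vec4<float> member):

#include <arm_neon.h>
#include <cmath>
#include <cstdio>

// Free-standing copy of the Vec4 NEON path, for comparison only.
static float vec4_length_neon(const float *v4) {
	float32x4_t v = vld1q_f32(v4);
	float32x4_t sq = vmulq_f32(v, v);
	float32x2_t add1 = vget_low_f32(vpaddq_f32(sq, sq));
	float32x2_t add2 = vpadd_f32(add1, add1);
	return vget_lane_f32(vsqrt_f32(add2), 0);
}

// Scalar reference, equivalent to the #else branch.
static float vec4_length_scalar(const float *v) {
	return sqrtf(v[0] * v[0] + v[1] * v[1] + v[2] * v[2] + v[3] * v[3]);
}

int main() {
	float v[4] = {1.0f, 2.0f, 3.0f, 4.0f};
	// vsqrt_f32 is IEEE-correct, so any difference comes only from
	// the order of the additions, not from the square root.
	printf("neon=%f scalar=%f\n", vec4_length_neon(v), vec4_length_scalar(v));
}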