Use SSE4.1, if available, for spline normals.

2024-12-14 04:28:42 +00:00 · 2015-02-25 19:24:12 -08:00 · 2015-02-25 19:24:12 -08:00 · 588efa5a71
commit 588efa5a71
parent f070d6f5ed
1 changed files with 32 additions and 6 deletions
--- a/GPU/Common/SplineCommon.cpp
+++ b/GPU/Common/SplineCommon.cpp
@ -18,6 +18,7 @@
 #include <string.h>
 #include <algorithm>

+#include "Common/CPUDetect.h"
 #include "Core/Config.h"

 #include "GPU/Common/SplineCommon.h"
@ -34,16 +35,28 @@ inline __m128 SSECrossProduct(__m128 a, __m128 b)
 	return _mm_sub_ps(left, right);
 }

-inline __m128 SSENormalizeMultiplier(__m128 v)
+inline __m128 SSENormalizeMultiplierSSE2(__m128 v)  // Baseline path (no SSE4.1 instructions): 1/sqrt(x*x + y*y + z*z) copied to all four lanes; lane 3 of v is not summed.
 {
 	const __m128 sq = _mm_mul_ps(v, v);
 	const __m128 r2 = _mm_shuffle_ps(sq, sq, _MM_SHUFFLE(0, 0, 0, 1));
 	const __m128 r3 = _mm_shuffle_ps(sq, sq, _MM_SHUFFLE(0, 0, 0, 2));
 	const __m128 res = _mm_add_ss(r3, _mm_add_ss(r2, sq));

-	__m128 rt = _mm_rsqrt_ss(res);
+	const __m128 rt = _mm_rsqrt_ss(res);  // Approximate reciprocal square root, computed on lane 0 only; broadcast by the shuffle below.
 	return _mm_shuffle_ps(rt, rt, _MM_SHUFFLE(0, 0, 0, 0));
 }
+
+inline __m128 SSENormalizeMultiplierSSE4(__m128 v)  // SSE4.1 path: approximate 1/sqrt(x*x + y*y + z*z), returned in all four lanes.
+{  // NOTE(review): _mm_dp_ps needs <smmintrin.h> and SSE4.1 codegen enabled on GCC/Clang -- confirm the build covers this.
+	return _mm_rsqrt_ps(_mm_dp_ps(v, v, 0x7F));  // 0x7F: sum lanes 0-2 only, broadcast to every lane. 0xFF also summed lane 3, disagreeing with the SSE2 path.
+}
+
+inline __m128 SSENormalizeMultiplier(bool useSSE4, __m128 v)  // Forwards v to the SSE4 kernel when useSSE4 is set, otherwise to the SSE2 kernel.
+{
+	return useSSE4
+		? SSENormalizeMultiplierSSE4(v)
+		: SSENormalizeMultiplierSSE2(v);
+}
 #endif


@ -238,7 +251,7 @@ static inline void AccumulateWeighted(Vec4f &out, const Vec4f &in, const Vec4f &
 #endif
 }

-template <bool origNrm, bool origCol, bool origTc>
+template <bool origNrm, bool origCol, bool origTc, bool useSSE4>
 static void SplinePatchFullQuality(u8 *&dest, u16 *indices, int &count, const SplinePatchLocal &spatch, u32 origVertType, int quality, int maxVertices) {
 	// Full (mostly) correct tessellation of spline patches.
 	// Not very fast.
@ -351,7 +364,12 @@ static void SplinePatchFullQuality(u8 *&dest, u16 *indices, int &count, const Sp
 			}
 			vert->pos = vert_pos;
 			if (origNrm) {
+#ifdef _M_SSE
+				const __m128 normalize = SSENormalizeMultiplier(useSSE4, vert_nrm.vec);
+				vert_nrm.vec = _mm_mul_ps(vert_nrm.vec, normalize);
+#else
 				vert_nrm.Normalize();
+#endif
 				vert->nrm = vert_nrm;
 			} else {
 				vert->nrm.SetZero();
@ -392,7 +410,7 @@ static void SplinePatchFullQuality(u8 *&dest, u16 *indices, int &count, const Sp
 				const __m128 down = _mm_sub_ps(vb_pos.vec, vt_pos.vec);

 				const __m128 crossed = SSECrossProduct(right, down);
-				const __m128 normalize = SSENormalizeMultiplier(crossed);
+				const __m128 normalize = SSENormalizeMultiplier(useSSE4, crossed);

 				Vec3f finalNrm = _mm_mul_ps(normalize, _mm_mul_ps(crossed, facing));
 				vertices[v * (patch_div_s + 1) + u].nrm = finalNrm;
@ -428,14 +446,22 @@ static void SplinePatchFullQuality(u8 *&dest, u16 *indices, int &count, const Sp
 	}
 }

+template <bool origNrm, bool origCol, bool origTc>
+static inline void SplinePatchFullQualityDispatch4(u8 *&dest, u16 *indices, int &count, const SplinePatchLocal &spatch, u32 origVertType, int quality, int maxVertices) {  // Turns the runtime cpu_info.bSSE4_1 flag into the compile-time useSSE4 template argument.
+	if (!cpu_info.bSSE4_1)
+		SplinePatchFullQuality<origNrm, origCol, origTc, false>(dest, indices, count, spatch, origVertType, quality, maxVertices);
+	else
+		SplinePatchFullQuality<origNrm, origCol, origTc, true>(dest, indices, count, spatch, origVertType, quality, maxVertices);
+}
+
 template <bool origNrm, bool origCol>
 static inline void SplinePatchFullQualityDispatch3(u8 *&dest, u16 *indices, int &count, const SplinePatchLocal &spatch, u32 origVertType, int quality, int maxVertices) {
 	bool origTc = (origVertType & GE_VTYPE_TC_MASK) != 0;

 	if (origTc)
-		SplinePatchFullQuality<origNrm, origCol, true>(dest, indices, count, spatch, origVertType, quality, maxVertices);
+		SplinePatchFullQualityDispatch4<origNrm, origCol, true>(dest, indices, count, spatch, origVertType, quality, maxVertices);  // origTc = true; Dispatch4 then selects the useSSE4 template argument.
 	else
-		SplinePatchFullQuality<origNrm, origCol, false>(dest, indices, count, spatch, origVertType, quality, maxVertices);
+		SplinePatchFullQualityDispatch4<origNrm, origCol, false>(dest, indices, count, spatch, origVertType, quality, maxVertices);  // origTc = false; same forwarding.
 }

 template <bool origNrm>