From f070d6f5ed7f49545f2064bbfb9fd6492056b4ff Mon Sep 17 00:00:00 2001
From: "Unknown W. Brackets" <checkins@unknownbrackets.org>
Date: Wed, 25 Feb 2015 19:22:48 -0800
Subject: [PATCH] Use SSE when generating spline normals.

---
 GPU/Common/SplineCommon.cpp | 67 +++++++++++++++++++++++++++++++------
 GPU/Math3D.h                |  6 ++--
 2 files changed, 60 insertions(+), 13 deletions(-)
diff --git a/GPU/Common/SplineCommon.cpp b/GPU/Common/SplineCommon.cpp
index e000e4c6f..e4762df7f 100644
--- a/GPU/Common/SplineCommon.cpp
+++ b/GPU/Common/SplineCommon.cpp
@@ -18,16 +18,34 @@
 #include <string.h>
 #include <algorithm>
 
-#if defined(_M_SSE)
-#include <emmintrin.h>
-#endif
-
 #include "Core/Config.h"
 
 #include "GPU/Common/SplineCommon.h"
 #include "GPU/ge_constants.h"
 #include "GPU/GPUState.h"
 
+#if defined(_M_SSE)
+#include <emmintrin.h>
+
+inline __m128 SSECrossProduct(__m128 a, __m128 b)  // Cross product of the x/y/z lanes of a and b.
+{
+	const __m128 left = _mm_mul_ps(_mm_shuffle_ps(a, a, _MM_SHUFFLE(3, 0, 2, 1)), _mm_shuffle_ps(b, b, _MM_SHUFFLE(3, 1, 0, 2)));  // (a.y*b.z, a.z*b.x, a.x*b.y, a.w*b.w)
+	const __m128 right = _mm_mul_ps(_mm_shuffle_ps(a, a, _MM_SHUFFLE(3, 1, 0, 2)), _mm_shuffle_ps(b, b, _MM_SHUFFLE(3, 0, 2, 1)));  // (a.z*b.y, a.x*b.z, a.y*b.x, a.w*b.w)
+	return _mm_sub_ps(left, right);  // Lane 3 subtracts two identical products, so it carries no information.
+}
+
+inline __m128 SSENormalizeMultiplier(__m128 v)  // Approximate 1/length of v's x/y/z lanes, broadcast to all four lanes.
+{
+	const __m128 sq = _mm_mul_ps(v, v);  // (x*x, y*y, z*z, w*w)
+	const __m128 r2 = _mm_shuffle_ps(sq, sq, _MM_SHUFFLE(0, 0, 0, 1));  // Lane 0 = y*y.
+	const __m128 r3 = _mm_shuffle_ps(sq, sq, _MM_SHUFFLE(0, 0, 0, 2));  // Lane 0 = z*z.
+	const __m128 res = _mm_add_ss(r3, _mm_add_ss(r2, sq));  // Lane 0 = x*x + y*y + z*z; the upper lanes are not meaningful.
+
+	__m128 rt = _mm_rsqrt_ss(res);  // Approximate reciprocal square root of lane 0 only.
+	return _mm_shuffle_ps(rt, rt, _MM_SHUFFLE(0, 0, 0, 0));  // Broadcast lane 0 so the result can scale a whole vector.
+}
+#endif
+
 
 #define START_OPEN 1
 #define END_OPEN 2
@@ -350,20 +368,47 @@ static void SplinePatchFullQuality(u8 *&dest, u16 *indices, int &count, const Sp
 
 	// Hacky normal generation through central difference.
 	if (gstate.isLightingEnabled() && !origNrm) {
-		for (int v = 0; v < patch_div_t + 1; v++) {
-			for (int u = 0; u < patch_div_s + 1; u++) {
-				int l = std::max(0, u - 1);
-				int t = std::max(0, v - 1);
-				int r = std::min(patch_div_s, u + 1);
-				int b = std::min(patch_div_t, v + 1);
+#ifdef _M_SSE
+		const __m128 facing = (gstate.patchfacing & 1) != 0 ? _mm_set_ps1(-1.0f) : _mm_set_ps1(1.0f);
+#endif
 
-				const Vec3Packedf &right = vertices[v * (patch_div_s + 1) + r].pos - vertices[v * (patch_div_s + 1) + l].pos;
+		for (int v = 0; v < patch_div_t + 1; v++) {
+			Vec3f vl_pos = vertices[v * (patch_div_s + 1)].pos;
+			Vec3f vc_pos = vertices[v * (patch_div_s + 1)].pos;
+
+			for (int u = 0; u < patch_div_s + 1; u++) {
+				const int l = std::max(0, u - 1);
+				const int t = std::max(0, v - 1);
+				const int r = std::min(patch_div_s, u + 1);
+				const int b = std::min(patch_div_t, v + 1);
+
+				const Vec3f vr_pos = vertices[v * (patch_div_s + 1) + r].pos;
+
+#ifdef _M_SSE
+				const __m128 right = _mm_sub_ps(vr_pos.vec, vl_pos.vec);
+
+				const Vec3f vb_pos = vertices[b * (patch_div_s + 1) + u].pos;
+				const Vec3f vt_pos = vertices[t * (patch_div_s + 1) + u].pos;
+				const __m128 down = _mm_sub_ps(vb_pos.vec, vt_pos.vec);
+
+				const __m128 crossed = SSECrossProduct(right, down);
+				const __m128 normalize = SSENormalizeMultiplier(crossed);
+
+				Vec3f finalNrm = _mm_mul_ps(normalize, _mm_mul_ps(crossed, facing));
+				vertices[v * (patch_div_s + 1) + u].nrm = finalNrm;
+#else
+				const Vec3Packedf &right = vr_pos - vl_pos;
 				const Vec3Packedf &down = vertices[b * (patch_div_s + 1) + u].pos - vertices[t * (patch_div_s + 1) + u].pos;
 
 				vertices[v * (patch_div_s + 1) + u].nrm = Cross(right, down).Normalized();
 				if (gstate.patchfacing & 1) {
 					vertices[v * (patch_div_s + 1) + u].nrm *= -1.0f;
 				}
+#endif
+
+				// Rotate for the next one to the right.
+				vl_pos = vc_pos;
+				vc_pos = vr_pos;
 			}
 		}
 	}
diff --git a/GPU/Math3D.h b/GPU/Math3D.h
index 72dd29e2a..921786ba5 100644
--- a/GPU/Math3D.h
+++ b/GPU/Math3D.h
@@ -209,7 +209,7 @@ public:
 	Vec3(const __m128 &_vec) : vec(_vec) {}
 	Vec3(const __m128i &_ivec) : ivec(_ivec) {}
 	Vec3(const Vec3Packed<T> &_xyz) {
-		ivec = _mm_loadu_ps(_xyz.AsArray());
+		vec = _mm_loadu_ps(_xyz.AsArray());  // Unaligned 4-float load. NOTE(review): reads one float past x/y/z - confirm that memory is always readable.
 	}
 #else
 	Vec3(const Vec3Packed<T> &_xyz) : x(_xyz.x), y(_xyz.y), z(_xyz.z) {}
@@ -377,7 +377,9 @@ public:
 	Vec3Packed(const T a[3]) : x(a[0]), y(a[1]), z(a[2]) {}
 	Vec3Packed(const T& _x, const T& _y, const T& _z) : x(_x), y(_y), z(_z) {}
 	Vec3Packed(const Vec2<T>& _xy, const T& _z) : x(_xy.x), y(_xy.y), z(_z) {}
-	Vec3Packed(const Vec3<T>& _xyz) : x(_xyz.x), y(_xyz.y), z(_xyz.z) {}
+	Vec3Packed(const Vec3<T>& _xyz) {  // Copies x, y, z out of the unpacked vector's component array.
+		memcpy(&x, _xyz.AsArray(), sizeof(T) * 3);  // Size by T, not float: this template is also instantiated via Cast<T2>().
+	}
 
 	template<typename T2>
 	Vec3Packed<T2> Cast() const