From f070d6f5ed7f49545f2064bbfb9fd6492056b4ff Mon Sep 17 00:00:00 2001 From: "Unknown W. Brackets" Date: Wed, 25 Feb 2015 19:22:48 -0800 Subject: [PATCH] Use SSE when generating spline normals. --- GPU/Common/SplineCommon.cpp | 67 +++++++++++++++++++++++++++++++------ GPU/Math3D.h | 6 ++-- 2 files changed, 60 insertions(+), 13 deletions(-) diff --git a/GPU/Common/SplineCommon.cpp b/GPU/Common/SplineCommon.cpp index e000e4c6f..e4762df7f 100644 --- a/GPU/Common/SplineCommon.cpp +++ b/GPU/Common/SplineCommon.cpp @@ -18,16 +18,34 @@ #include #include -#if defined(_M_SSE) -#include -#endif - #include "Core/Config.h" #include "GPU/Common/SplineCommon.h" #include "GPU/ge_constants.h" #include "GPU/GPUState.h" +#if defined(_M_SSE) +#include + +inline __m128 SSECrossProduct(__m128 a, __m128 b) +{ + const __m128 left = _mm_mul_ps(_mm_shuffle_ps(a, a, _MM_SHUFFLE(3, 0, 2, 1)), _mm_shuffle_ps(b, b, _MM_SHUFFLE(3, 1, 0, 2))); + const __m128 right = _mm_mul_ps(_mm_shuffle_ps(a, a, _MM_SHUFFLE(3, 1, 0, 2)), _mm_shuffle_ps(b, b, _MM_SHUFFLE(3, 0, 2, 1))); + return _mm_sub_ps(left, right); +} + +inline __m128 SSENormalizeMultiplier(__m128 v) +{ + const __m128 sq = _mm_mul_ps(v, v); + const __m128 r2 = _mm_shuffle_ps(sq, sq, _MM_SHUFFLE(0, 0, 0, 1)); + const __m128 r3 = _mm_shuffle_ps(sq, sq, _MM_SHUFFLE(0, 0, 0, 2)); + const __m128 res = _mm_add_ss(r3, _mm_add_ss(r2, sq)); + + __m128 rt = _mm_rsqrt_ss(res); + return _mm_shuffle_ps(rt, rt, _MM_SHUFFLE(0, 0, 0, 0)); +} +#endif + #define START_OPEN 1 #define END_OPEN 2 @@ -350,20 +368,47 @@ static void SplinePatchFullQuality(u8 *&dest, u16 *indices, int &count, const Sp // Hacky normal generation through central difference. if (gstate.isLightingEnabled() && !origNrm) { - for (int v = 0; v < patch_div_t + 1; v++) { - for (int u = 0; u < patch_div_s + 1; u++) { - int l = std::max(0, u - 1); - int t = std::max(0, v - 1); - int r = std::min(patch_div_s, u + 1); - int b = std::min(patch_div_t, v + 1); +#ifdef _M_SSE + const __m128 facing = (gstate.patchfacing & 1) != 0 ? _mm_set_ps1(-1.0f) : _mm_set_ps1(1.0f); +#endif - const Vec3Packedf &right = vertices[v * (patch_div_s + 1) + r].pos - vertices[v * (patch_div_s + 1) + l].pos; + for (int v = 0; v < patch_div_t + 1; v++) { + Vec3f vl_pos = vertices[v * (patch_div_s + 1)].pos; + Vec3f vc_pos = vertices[v * (patch_div_s + 1)].pos; + + for (int u = 0; u < patch_div_s + 1; u++) { + const int l = std::max(0, u - 1); + const int t = std::max(0, v - 1); + const int r = std::min(patch_div_s, u + 1); + const int b = std::min(patch_div_t, v + 1); + + const Vec3f vr_pos = vertices[v * (patch_div_s + 1) + r].pos; + +#ifdef _M_SSE + const __m128 right = _mm_sub_ps(vr_pos.vec, vl_pos.vec); + + const Vec3f vb_pos = vertices[b * (patch_div_s + 1) + u].pos; + const Vec3f vt_pos = vertices[t * (patch_div_s + 1) + u].pos; + const __m128 down = _mm_sub_ps(vb_pos.vec, vt_pos.vec); + + const __m128 crossed = SSECrossProduct(right, down); + const __m128 normalize = SSENormalizeMultiplier(crossed); + + Vec3f finalNrm = _mm_mul_ps(normalize, _mm_mul_ps(crossed, facing)); + vertices[v * (patch_div_s + 1) + u].nrm = finalNrm; +#else + const Vec3Packedf &right = vr_pos - vl_pos; const Vec3Packedf &down = vertices[b * (patch_div_s + 1) + u].pos - vertices[t * (patch_div_s + 1) + u].pos; vertices[v * (patch_div_s + 1) + u].nrm = Cross(right, down).Normalized(); if (gstate.patchfacing & 1) { vertices[v * (patch_div_s + 1) + u].nrm *= -1.0f; } +#endif + + // Rotate for the next one to the right. + vl_pos = vc_pos; + vc_pos = vr_pos; } } } diff --git a/GPU/Math3D.h b/GPU/Math3D.h index 72dd29e2a..921786ba5 100644 --- a/GPU/Math3D.h +++ b/GPU/Math3D.h @@ -209,7 +209,7 @@ public: Vec3(const __m128 &_vec) : vec(_vec) {} Vec3(const __m128i &_ivec) : ivec(_ivec) {} Vec3(const Vec3Packed &_xyz) { - ivec = _mm_loadu_ps(_xyz.AsArray()); + vec = _mm_loadu_ps(_xyz.AsArray()); } #else Vec3(const Vec3Packed &_xyz) : x(_xyz.x), y(_xyz.y), z(_xyz.z) {} @@ -377,7 +377,9 @@ public: Vec3Packed(const T a[3]) : x(a[0]), y(a[1]), z(a[2]) {} Vec3Packed(const T& _x, const T& _y, const T& _z) : x(_x), y(_y), z(_z) {} Vec3Packed(const Vec2& _xy, const T& _z) : x(_xy.x), y(_xy.y), z(_z) {} - Vec3Packed(const Vec3& _xyz) : x(_xyz.x), y(_xyz.y), z(_xyz.z) {} + Vec3Packed(const Vec3& _xyz) { + memcpy(&x, _xyz.AsArray(), sizeof(float) * 3); + } template Vec3Packed Cast() const