Use SSE when generating spline normals.

This commit is contained in:
Unknown W. Brackets 2015-02-25 19:22:48 -08:00
parent 8e19f568d7
commit f070d6f5ed
2 changed files with 60 additions and 13 deletions

View File

@ -18,16 +18,34 @@
#include <string.h>
#include <algorithm>
#if defined(_M_SSE)
#include <emmintrin.h>
#endif
#include "Core/Config.h"
#include "GPU/Common/SplineCommon.h"
#include "GPU/ge_constants.h"
#include "GPU/GPUState.h"
#if defined(_M_SSE)
#include <emmintrin.h>
// Computes the 3D cross product a x b in the x/y/z lanes of an SSE register.
// The fourth lane evaluates to a.w * b.w - a.w * b.w.
inline __m128 SSECrossProduct(__m128 a, __m128 b)
{
	// Lane rotations: yzx = (y, z, x, w) and zxy = (z, x, y, w).
	const __m128 a_yzx = _mm_shuffle_ps(a, a, _MM_SHUFFLE(3, 0, 2, 1));
	const __m128 a_zxy = _mm_shuffle_ps(a, a, _MM_SHUFFLE(3, 1, 0, 2));
	const __m128 b_yzx = _mm_shuffle_ps(b, b, _MM_SHUFFLE(3, 0, 2, 1));
	const __m128 b_zxy = _mm_shuffle_ps(b, b, _MM_SHUFFLE(3, 1, 0, 2));

	// (ay*bz, az*bx, ax*by) - (az*by, ax*bz, ay*bx)
	const __m128 positiveTerms = _mm_mul_ps(a_yzx, b_zxy);
	const __m128 negativeTerms = _mm_mul_ps(a_zxy, b_yzx);
	return _mm_sub_ps(positiveTerms, negativeTerms);
}
// Returns an approximate 1 / length(v.xyz), replicated into all four lanes.
// The w lane of v does not contribute. The value comes from the hardware
// reciprocal square root estimate (_mm_rsqrt_ss), so it is not exact.
inline __m128 SSENormalizeMultiplier(__m128 v)
{
	const __m128 squared = _mm_mul_ps(v, v);

	// Bring y*y and z*z down into lane 0 so scalar adds can sum them with x*x.
	const __m128 yy = _mm_shuffle_ps(squared, squared, _MM_SHUFFLE(0, 0, 0, 1));
	const __m128 zz = _mm_shuffle_ps(squared, squared, _MM_SHUFFLE(0, 0, 0, 2));
	const __m128 xxPlusYY = _mm_add_ss(yy, squared);
	const __m128 lengthSquared = _mm_add_ss(zz, xxPlusYY);

	// Only lane 0 holds the sum; estimate its rsqrt and broadcast it.
	const __m128 invLength = _mm_rsqrt_ss(lengthSquared);
	return _mm_shuffle_ps(invLength, invLength, _MM_SHUFFLE(0, 0, 0, 0));
}
#endif
#define START_OPEN 1
#define END_OPEN 2
@ -350,20 +368,47 @@ static void SplinePatchFullQuality(u8 *&dest, u16 *indices, int &count, const Sp
// Hacky normal generation through central difference.
if (gstate.isLightingEnabled() && !origNrm) {
for (int v = 0; v < patch_div_t + 1; v++) {
for (int u = 0; u < patch_div_s + 1; u++) {
int l = std::max(0, u - 1);
int t = std::max(0, v - 1);
int r = std::min(patch_div_s, u + 1);
int b = std::min(patch_div_t, v + 1);
#ifdef _M_SSE
const __m128 facing = (gstate.patchfacing & 1) != 0 ? _mm_set_ps1(-1.0f) : _mm_set_ps1(1.0f);
#endif
const Vec3Packedf &right = vertices[v * (patch_div_s + 1) + r].pos - vertices[v * (patch_div_s + 1) + l].pos;
for (int v = 0; v < patch_div_t + 1; v++) {
Vec3f vl_pos = vertices[v * (patch_div_s + 1)].pos;
Vec3f vc_pos = vertices[v * (patch_div_s + 1)].pos;
for (int u = 0; u < patch_div_s + 1; u++) {
const int l = std::max(0, u - 1);
const int t = std::max(0, v - 1);
const int r = std::min(patch_div_s, u + 1);
const int b = std::min(patch_div_t, v + 1);
const Vec3f vr_pos = vertices[v * (patch_div_s + 1) + r].pos;
#ifdef _M_SSE
const __m128 right = _mm_sub_ps(vr_pos.vec, vl_pos.vec);
const Vec3f vb_pos = vertices[b * (patch_div_s + 1) + u].pos;
const Vec3f vt_pos = vertices[t * (patch_div_s + 1) + u].pos;
const __m128 down = _mm_sub_ps(vb_pos.vec, vt_pos.vec);
const __m128 crossed = SSECrossProduct(right, down);
const __m128 normalize = SSENormalizeMultiplier(crossed);
Vec3f finalNrm = _mm_mul_ps(normalize, _mm_mul_ps(crossed, facing));
vertices[v * (patch_div_s + 1) + u].nrm = finalNrm;
#else
const Vec3Packedf &right = vr_pos - vl_pos;
const Vec3Packedf &down = vertices[b * (patch_div_s + 1) + u].pos - vertices[t * (patch_div_s + 1) + u].pos;
vertices[v * (patch_div_s + 1) + u].nrm = Cross(right, down).Normalized();
if (gstate.patchfacing & 1) {
vertices[v * (patch_div_s + 1) + u].nrm *= -1.0f;
}
#endif
// Rotate for the next one to the right.
vl_pos = vc_pos;
vc_pos = vr_pos;
}
}
}

View File

@ -209,7 +209,7 @@ public:
// Adopts an SSE float register as this vector's storage.
Vec3(const __m128 &_vec) : vec(_vec) {}
// Adopts an SSE integer register as this vector's storage.
Vec3(const __m128i &_ivec) : ivec(_ivec) {}
// Builds from the packed representation using a single unaligned load.
// NOTE(review): _mm_loadu_ps reads four floats starting at AsArray(); the
// packed type presumably holds only three - confirm the over-read is safe.
Vec3(const Vec3Packed<T> &_xyz) {
	// _mm_loadu_ps produces a __m128, so it is stored through the float view
	// (vec) only, never through the __m128i view (ivec).
	vec = _mm_loadu_ps(_xyz.AsArray());
}
#else
// Fallback for the #else branch: copies x, y and z from the packed vector one member at a time.
Vec3(const Vec3Packed<T> &_xyz) : x(_xyz.x), y(_xyz.y), z(_xyz.z) {}
@ -377,7 +377,9 @@ public:
Vec3Packed(const T a[3]) : x(a[0]), y(a[1]), z(a[2]) {}
Vec3Packed(const T& _x, const T& _y, const T& _z) : x(_x), y(_y), z(_z) {}
Vec3Packed(const Vec2<T>& _xy, const T& _z) : x(_xy.x), y(_xy.y), z(_z) {}
Vec3Packed(const Vec3<T>& _xyz) : x(_xyz.x), y(_xyz.y), z(_xyz.z) {}
Vec3Packed(const Vec3<T>& _xyz) {
memcpy(&x, _xyz.AsArray(), sizeof(float) * 3);
}
template<typename T2>
Vec3Packed<T2> Cast() const