softgpu: Use SSE for gouraud shading.

2024-11-28 10:51:06 +00:00 · 2014-03-16 14:29:22 -07:00 · 2014-03-16 14:29:22 -07:00 · dd140b73bb
commit dd140b73bb
parent 743854afc8
2 changed files with 92 additions and 15 deletions
--- a/GPU/Math3D.h
+++ b/GPU/Math3D.h
@ -18,6 +18,11 @@
 #pragma once

 #include <cmath>
+#include "Common/Common.h"
+
+#if defined(_M_SSE)
+#include <emmintrin.h>
+#endif

 namespace Math3D {

@ -36,9 +41,16 @@ template<typename T>
 class Vec2
 {
 public:
-	struct
+	union
 	{
-		T x,y;
+		struct
+		{
+			T x,y;
+		};
+#if defined(_M_SSE)
+		__m128i ivec;
+		__m128 vec;
+#endif
 	};

 	T* AsArray() { return &x; }
@ -47,6 +59,10 @@ public:
 	Vec2() {}
 	Vec2(const T a[2]) : x(a[0]), y(a[1]) {}
 	Vec2(const T& _x, const T& _y) : x(_x), y(_y) {}
+#if defined(_M_SSE)
+	Vec2(const __m128 &_vec) : vec(_vec) {}
+	Vec2(const __m128i &_ivec) : ivec(_ivec) {}
+#endif

 	template<typename T2>
 	Vec2<T2> Cast() const
@ -164,9 +180,16 @@ template<typename T>
 class Vec3
 {
 public:
-	struct
+	union
 	{
-		T x,y,z;
+		struct
+		{
+			T x,y,z;
+		};
+#if defined(_M_SSE)
+		__m128i ivec;
+		__m128 vec;
+#endif
 	};

 	T* AsArray() { return &x; }
@ -176,6 +199,10 @@ public:
 	Vec3(const T a[3]) : x(a[0]), y(a[1]), z(a[2]) {}
 	Vec3(const T& _x, const T& _y, const T& _z) : x(_x), y(_y), z(_z) {}
 	Vec3(const Vec2<T>& _xy, const T& _z) : x(_xy.x), y(_xy.y), z(_z) {}
+#if defined(_M_SSE)
+	Vec3(const __m128 &_vec) : vec(_vec) {}
+	Vec3(const __m128i &_ivec) : ivec(_ivec) {}
+#endif

 	template<typename T2>
 	Vec3<T2> Cast() const
@ -324,9 +351,16 @@ template<typename T>
 class Vec4
 {
 public:
-	struct
+	union
 	{
-		T x,y,z,w;
+		struct
+		{
+			T x,y,z,w;
+		};
+#if defined(_M_SSE)
+		__m128i ivec;
+		__m128 vec;
+#endif
 	};

 	T* AsArray() { return &x; }
@ -337,6 +371,10 @@ public:
 	Vec4(const T& _x, const T& _y, const T& _z, const T& _w) : x(_x), y(_y), z(_z), w(_w) {}
 	Vec4(const Vec2<T>& _xy, const T& _z, const T& _w) : x(_xy.x), y(_xy.y), z(_z), w(_w) {}
 	Vec4(const Vec3<T>& _xyz, const T& _w) : x(_xyz.x), y(_xyz.y), z(_xyz.z), w(_w) {}
+#if defined(_M_SSE)
+	Vec4(const __m128 &_vec) : vec(_vec) {}
+	Vec4(const __m128i &_ivec) : ivec(_ivec) {}
+#endif

 	template<typename T2>
 	Vec4<T2> Cast() const
--- a/GPU/Software/Rasterizer.cpp
+++ b/GPU/Software/Rasterizer.cpp
@ -30,6 +30,10 @@

 #include <algorithm>

+#if defined(_M_SSE)
+#include <emmintrin.h>
+#endif
+
 extern FormatBuffer fb;
 extern FormatBuffer depthbuf;

@ -978,6 +982,46 @@ inline void ApplyTexturing(Vec3<int> &prim_color_rgb, int &prim_color_a, float s
 	prim_color_a = out.a();
 }

+#if defined(_M_SSE)
+static inline __m128 Interpolate(const __m128 &c0, const __m128 &c1, const __m128 &c2, int w0, int w1, int w2, float wsum) { // Per lane: (c0 * w0 + c1 * w1 + c2 * w2) * wsum.
+	__m128 v = _mm_mul_ps(c0, _mm_cvtepi32_ps(_mm_set1_epi32(w0))); // Broadcast the integer weight to all four lanes, then convert it to float.
+	v = _mm_add_ps(v, _mm_mul_ps(c1, _mm_cvtepi32_ps(_mm_set1_epi32(w1))));
+	v = _mm_add_ps(v, _mm_mul_ps(c2, _mm_cvtepi32_ps(_mm_set1_epi32(w2))));
+	return _mm_mul_ps(v, _mm_set_ps1(wsum)); // Scale every lane of the weighted sum by wsum.
+}
+
+static inline __m128i Interpolate(const __m128i &c0, const __m128i &c1, const __m128i &c2, int w0, int w1, int w2, float wsum) { // Integer lanes: convert to float, interpolate, convert back.
+	return _mm_cvttps_epi32(Interpolate(_mm_cvtepi32_ps(c0), _mm_cvtepi32_ps(c1), _mm_cvtepi32_ps(c2), w0, w1, w2, wsum)); // Truncate like the (int) cast this code replaces; _mm_cvtps_epi32 would round to nearest instead.
+}
+#endif
+
+// NOTE: The integer overloads cast their color inputs to float vectors first; without that cast, this code suffers from severe overflow issues.
+// Not sure if that should be regarded as a bug or if casting to float is a valid fix.
+
+static inline Vec4<int> Interpolate(const Vec4<int> &c0, const Vec4<int> &c1, const Vec4<int> &c2, int w0, int w1, int w2, float wsum) { // Per component: (c0 * w0 + c1 * w1 + c2 * w2) * wsum, computed in float and converted back to int.
+#if defined(_M_SSE)
+	return Vec4<int>(Interpolate(c0.ivec, c1.ivec, c2.ivec, w0, w1, w2, wsum)); // ivec is a __m128i, so this resolves to the __m128i overload and handles all four components at once.
+#else
+	return ((c0.Cast<float>() * w0 + c1.Cast<float>() * w1 + c2.Cast<float>() * w2) * wsum).Cast<int>(); // Fallback without SSE: same formula through the float vector type.
+#endif
+}
+
+static inline Vec3<int> Interpolate(const Vec3<int> &c0, const Vec3<int> &c1, const Vec3<int> &c2, int w0, int w1, int w2, float wsum) { // Per component: (c0 * w0 + c1 * w1 + c2 * w2) * wsum, computed in float and converted back to int.
+#if defined(_M_SSE)
+	return Vec3<int>(Interpolate(c0.ivec, c1.ivec, c2.ivec, w0, w1, w2, wsum)); // NOTE(review): ivec spans four lanes but only x/y/z back it, so the fourth lane carries unspecified data through the math - confirm this is acceptable.
+#else
+	return ((c0.Cast<float>() * w0 + c1.Cast<float>() * w1 + c2.Cast<float>() * w2) * wsum).Cast<int>(); // Fallback without SSE: same formula through the float vector type.
+#endif
+}
+
+static inline Vec2<float> Interpolate(const Vec2<float> &c0, const Vec2<float> &c1, const Vec2<float> &c2, int w0, int w1, int w2, float wsum) { // Per component: (c0 * w0 + c1 * w1 + c2 * w2) * wsum, staying in float throughout.
+#if defined(_M_SSE)
+	return Vec2<float>(Interpolate(c0.vec, c1.vec, c2.vec, w0, w1, w2, wsum)); // NOTE(review): vec spans four lanes but only x/y back it, so the upper two lanes carry unspecified floats through the math - confirm this is acceptable.
+#else
+	return (c0 * w0 + c1 * w1 + c2 * w2) * wsum; // Fallback without SSE: plain vector arithmetic.
+#endif
+}
+
 template <bool clearMode>
 void DrawTriangleSlice(
 	const VertexData& v0, const VertexData& v1, const VertexData& v2,
@ -1067,16 +1111,11 @@ void DrawTriangleSlice(
 				int prim_color_a = 0;
 				Vec3<int> sec_color(0, 0, 0);
 				if (gstate.getShadeMode() == GE_SHADE_GOURAUD && !clearMode) {
-					// NOTE: When not casting color0 and color1 to float vectors, this code suffers from severe overflow issues.
-					// Not sure if that should be regarded as a bug or if casting to float is a valid fix.
 					// TODO: Is that the correct way to interpolate?
-					prim_color_rgb = ((v0.color0.rgb().Cast<float>() * w0 +
-									v1.color0.rgb().Cast<float>() * w1 +
-									v2.color0.rgb().Cast<float>() * w2) * wsum).Cast<int>();
-					prim_color_a = (int)(((float)v0.color0.a() * w0 + (float)v1.color0.a() * w1 + (float)v2.color0.a() * w2) * wsum);
-					sec_color = ((v0.color1.Cast<float>() * w0 +
-									v1.color1.Cast<float>() * w1 +
-									v2.color1.Cast<float>() * w2) * wsum).Cast<int>();
+					const Vec4<int> prim_color = Interpolate(v0.color0, v1.color0, v2.color0, w0, w1, w2, wsum);
+					prim_color_rgb = prim_color.rgb();
+					prim_color_a = prim_color.a();
+					sec_color = Interpolate(v0.color1, v1.color1, v2.color1, w0, w1, w2, wsum);
 				} else {
 					prim_color_rgb = v2.color0.rgb();
 					prim_color_a = v2.color0.a();