Just add a packed version of Vec3f.

This gives us a vector type whose in-memory layout is just x, y, z, for the places that need that layout (e.g. SimpleVertex). I think it would be better to avoid it where possible, so that we can actually vectorize the spline and similar code. Fixes #5673.
2024-11-23 05:19:56 +00:00 · 2014-03-17 06:58:50 -07:00 · 2014-03-17 06:58:50 -07:00 · 6630e45eff
commit 6630e45eff
parent 38d0bac1df
4 changed files with 262 additions and 26 deletions
--- a/GPU/Common/SplineCommon.h
+++ b/GPU/Common/SplineCommon.h
@ -24,6 +24,6 @@
 struct SimpleVertex {
 	float uv[2];
 	u8 color[4];
-	Vec3f nrm;
-	Vec3f pos;
+	Vec3Packedf nrm;
+	Vec3Packedf pos;
 };
--- a/GPU/GLES/Spline.cpp
+++ b/GPU/GLES/Spline.cpp
@ -80,16 +80,16 @@ u32 TransformDrawEngine::NormalizeVertices(u8 *outPtr, u8 *bufPtr, const u8 *inP
 			float weights[8];
 			reader.ReadWeights(weights);
 			// Skinning
-			Vec3f psum(0,0,0);
-			Vec3f nsum(0,0,0);
+			Vec3Packedf psum(0,0,0);
+			Vec3Packedf nsum(0,0,0);
 			for (int w = 0; w < numBoneWeights; w++) {
 				if (weights[w] != 0.0f) {
 					Vec3ByMatrix43(bpos, pos, gstate.boneMatrix+w*12);
-					Vec3f tpos(bpos);
+					Vec3Packedf tpos(bpos);
 					psum += tpos * weights[w];

 					Norm3ByMatrix43(bnrm, nrm, gstate.boneMatrix+w*12);
-					Vec3f tnorm(bnrm);
+					Vec3Packedf tnorm(bnrm);
 					nsum += tnorm * weights[w];
 				}
 			}
@ -288,11 +288,11 @@ inline float bern2deriv(float x) { return 3 * (2 - 3 * x) * x; }
 inline float bern3deriv(float x) { return 3 * x * x; }

 // http://en.wikipedia.org/wiki/Bernstein_polynomial
-Vec3f Bernstein3D(const Vec3f p0, const Vec3f p1, const Vec3f p2, const Vec3f p3, float x) {
+Vec3Packedf Bernstein3D(const Vec3Packedf p0, const Vec3Packedf p1, const Vec3Packedf p2, const Vec3Packedf p3, float x) {
 	return p0 * bern0(x) + p1 * bern1(x) + p2 * bern2(x) + p3 * bern3(x);
 }

-Vec3f Bernstein3DDerivative(const Vec3f p0, const Vec3f p1, const Vec3f p2, const Vec3f p3, float x) {
+Vec3Packedf Bernstein3DDerivative(const Vec3Packedf p0, const Vec3Packedf p1, const Vec3Packedf p2, const Vec3Packedf p3, float x) {
 	return p0 * bern0deriv(x) + p1 * bern1deriv(x) + p2 * bern2deriv(x) + p3 * bern3deriv(x);
 }

@ -379,7 +379,7 @@ void TesselateSplinePatch(u8 *&dest, int &count, const SplinePatch &spatch, u32
 				// Generate normal if lighting is enabled (otherwise there's no point).
 				// This is a really poor quality algorithm, we get facet normals.
 				if (gstate.isLightingEnabled()) {
-					Vec3f norm = Cross(v1.pos - v0.pos, v2.pos - v0.pos);
+					Vec3Packedf norm = Cross(v1.pos - v0.pos, v2.pos - v0.pos);
 					norm.Normalize();
 					if (gstate.patchfacing & 1)
 						norm *= -1.0f;
@ -503,8 +503,8 @@ void TesselateSplinePatch(u8 *&dest, int &count, const SplinePatch &spatch, u32
 					int r = std::min(patch_div_s, u + 1);
 					int b = std::min(patch_div_t, v + 1);

-					const Vec3f &right = vertices[v * (patch_div_s + 1) + r].pos - vertices[v * (patch_div_s + 1) + l].pos;
-					const Vec3f &down = vertices[b * (patch_div_s + 1) + u].pos - vertices[t * (patch_div_s + 1) + u].pos;
+					const Vec3Packedf &right = vertices[v * (patch_div_s + 1) + r].pos - vertices[v * (patch_div_s + 1) + l].pos;
+					const Vec3Packedf &down = vertices[b * (patch_div_s + 1) + u].pos - vertices[t * (patch_div_s + 1) + u].pos;

 					vertices[v * (patch_div_s + 1) + u].nrm = Cross(right, down).Normalized();
 					if (gstate.patchfacing & 1) {
@ -570,7 +570,7 @@ void TesselateBezierPatch(u8 *&dest, int &count, int tess_u, int tess_v, const B
 				// Generate normal if lighting is enabled (otherwise there's no point).
 				// This is a really poor quality algorithm, we get facet normals.
 				if (gstate.isLightingEnabled()) {
-					Vec3f norm = Cross(v1.pos - v0.pos, v2.pos - v0.pos);
+					Vec3Packedf norm = Cross(v1.pos - v0.pos, v2.pos - v0.pos);
 					norm.Normalize();
 					if (gstate.patchfacing & 1)
 						norm *= -1.0f;
@ -591,10 +591,10 @@ void TesselateBezierPatch(u8 *&dest, int &count, int tess_u, int tess_v, const B
 		// First compute all the vertices and put them in an array
 		SimpleVertex *vertices = new SimpleVertex[(tess_u + 1) * (tess_v + 1)];

-		Vec3f *horiz = new Vec3f[(tess_u + 1) * 4];
-		Vec3f *horiz2 = horiz + (tess_u + 1) * 1;
-		Vec3f *horiz3 = horiz + (tess_u + 1) * 2;
-		Vec3f *horiz4 = horiz + (tess_u + 1) * 3;
+		Vec3Packedf *horiz = new Vec3Packedf[(tess_u + 1) * 4];
+		Vec3Packedf *horiz2 = horiz + (tess_u + 1) * 1;
+		Vec3Packedf *horiz3 = horiz + (tess_u + 1) * 2;
+		Vec3Packedf *horiz4 = horiz + (tess_u + 1) * 3;

 		// Precompute the horizontal curves so we only have to evaluate the vertical ones.
 		for (int i = 0; i < tess_u + 1; i++) {
@ -615,20 +615,20 @@ void TesselateBezierPatch(u8 *&dest, int &count, int tess_u, int tess_v, const B
 				float bv = v;

 				// TODO: Should be able to precompute the four curves per U, then just Bernstein per V. Will benefit large tesselation factors.
-				const Vec3f &pos1 = horiz[tile_u];
-				const Vec3f &pos2 = horiz2[tile_u];
-				const Vec3f &pos3 = horiz3[tile_u];
-				const Vec3f &pos4 = horiz4[tile_u];
+				const Vec3Packedf &pos1 = horiz[tile_u];
+				const Vec3Packedf &pos2 = horiz2[tile_u];
+				const Vec3Packedf &pos3 = horiz3[tile_u];
+				const Vec3Packedf &pos4 = horiz4[tile_u];

 				SimpleVertex &vert = vertices[tile_v * (tess_u + 1) + tile_u];

 				if (computeNormals) {
-					Vec3f derivU1 = Bernstein3DDerivative(patch.points[0]->pos, patch.points[1]->pos, patch.points[2]->pos, patch.points[3]->pos, bu);
-					Vec3f derivU2 = Bernstein3DDerivative(patch.points[4]->pos, patch.points[5]->pos, patch.points[6]->pos, patch.points[7]->pos, bu);
-					Vec3f derivU3 = Bernstein3DDerivative(patch.points[8]->pos, patch.points[9]->pos, patch.points[10]->pos, patch.points[11]->pos, bu);
-					Vec3f derivU4 = Bernstein3DDerivative(patch.points[12]->pos, patch.points[13]->pos, patch.points[14]->pos, patch.points[15]->pos, bu);
-					Vec3f derivU = Bernstein3D(derivU1, derivU2, derivU3, derivU4, bv);
-					Vec3f derivV = Bernstein3DDerivative(pos1, pos2, pos3, pos4, bv);
+					Vec3Packedf derivU1 = Bernstein3DDerivative(patch.points[0]->pos, patch.points[1]->pos, patch.points[2]->pos, patch.points[3]->pos, bu);
+					Vec3Packedf derivU2 = Bernstein3DDerivative(patch.points[4]->pos, patch.points[5]->pos, patch.points[6]->pos, patch.points[7]->pos, bu);
+					Vec3Packedf derivU3 = Bernstein3DDerivative(patch.points[8]->pos, patch.points[9]->pos, patch.points[10]->pos, patch.points[11]->pos, bu);
+					Vec3Packedf derivU4 = Bernstein3DDerivative(patch.points[12]->pos, patch.points[13]->pos, patch.points[14]->pos, patch.points[15]->pos, bu);
+					Vec3Packedf derivU = Bernstein3D(derivU1, derivU2, derivU3, derivU4, bv);
+					Vec3Packedf derivV = Bernstein3DDerivative(pos1, pos2, pos3, pos4, bv);

 					// TODO: Interpolate normals instead of generating them, if available?
 					vert.nrm = Cross(derivU, derivV).Normalized();
--- a/GPU/Math3D.cpp
+++ b/GPU/Math3D.cpp
@ -167,6 +167,72 @@ float Vec3<float>::Normalize()
 	return len;
 }

+// Unpacks a 0x00BBGGRR color: the low byte goes to x (red), the next to y (green),
+// the third to z (blue), each scaled by 1/255.
+template<>
+Vec3Packed<float> Vec3Packed<float>::FromRGB(unsigned int rgb)
+{
+	const float scale = 1.0f/255.0f;
+	const unsigned int red = rgb & 0xFF;
+	const unsigned int green = (rgb >> 8) & 0xFF;
+	const unsigned int blue = (rgb >> 16) & 0xFF;
+	return Vec3Packed(red * scale, green * scale, blue * scale);
+}
+
+// Unpacks a 0x00BBGGRR color into integer channels in the range 0..255
+// (low byte -> x, next byte -> y, third byte -> z).
+template<>
+Vec3Packed<int> Vec3Packed<int>::FromRGB(unsigned int rgb)
+{
+	const int red = rgb & 0xFF;
+	const int green = (rgb >> 8) & 0xFF;
+	const int blue = (rgb >> 16) & 0xFF;
+	return Vec3Packed(red, green, blue);
+}
+
+// Packs the float channels into 0x00BBGGRR; the alpha bits are always zero.
+// Each channel is scaled by 255 and truncated to an integer on its own, then
+// masked to 8 bits and shifted into place. Scaling by 256 / 65536 before the
+// truncation would let fractional bits of green/blue spill into the byte
+// below (e.g. g = 0.5 would produce 0x7F80 instead of 0x7F00).
+// Channels are expected to be in [0, 1]; out-of-range values are not saturated.
+template<>
+unsigned int Vec3Packed<float>::ToRGB() const
+{
+	const unsigned int red = ((unsigned int)(r()*255.f)) & 0xFF;
+	const unsigned int green = ((unsigned int)(g()*255.f)) & 0xFF;
+	const unsigned int blue = ((unsigned int)(b()*255.f)) & 0xFF;
+	return red | (green << 8) | (blue << 16);
+}
+
+// Packs the low 8 bits of each integer channel into 0x00BBGGRR.
+template<>
+unsigned int Vec3Packed<int>::ToRGB() const
+{
+	const int red = r() & 0xFF;
+	const int green = g() & 0xFF;
+	const int blue = b() & 0xFF;
+	return red | (green << 8) | (blue << 16);
+}
+
+// Euclidean length: the square root of Length2().
+template<>
+float Vec3Packed<float>::Length() const
+{
+	const float lengthSquared = Length2();
+	return sqrtf(lengthSquared);
+}
+
+// Rescales this vector in place by l / Length().
+// No guard for a zero-length vector: the division is performed as-is.
+template<>
+void Vec3Packed<float>::SetLength(const float l)
+{
+	const float scale = l / Length();
+	(*this) *= scale;
+}
+
+// Returns a copy multiplied by l and then divided by the current length
+// (in that order); this vector itself is not modified.
+template<>
+Vec3Packed<float> Vec3Packed<float>::WithLength(const float l) const
+{
+	const Vec3Packed<float> scaled = (*this) * l;
+	return scaled / Length();
+}
+
+// Squared distance between this vector and other (no square root taken).
+template<>
+float Vec3Packed<float>::Distance2To(Vec3Packed<float> &other)
+{
+	const Vec3Packed<float> delta = other - (*this);
+	return delta.Length2();
+}
+
+// Returns a copy divided by its own length; this vector is left unchanged.
+// No guard for a zero-length vector: the division is performed as-is.
+template<>
+Vec3Packed<float> Vec3Packed<float>::Normalized() const
+{
+	const float len = Length();
+	return (*this) / len;
+}
+
+// Divides this vector by its length in place and returns the length it had
+// before the division.
+template<>
+float Vec3Packed<float>::Normalize()
+{
+	const float previousLength = Length();
+	(*this) /= previousLength;
+	return previousLength;
+}
+
 template<>
 Vec4<float> Vec4<float>::FromRGBA(unsigned int rgba)
 {
--- a/GPU/Math3D.h
+++ b/GPU/Math3D.h
@ -347,6 +347,169 @@ public:
 #undef _DEFINE_SWIZZLER2
 };

+// Three-component vector whose only data members are x, y and z.
+// Provides component-wise arithmetic, color/texcoord aliases for the
+// components, and two-component swizzles returning Vec2<T>.
+template<typename T>
+class Vec3Packed
+{
+public:
+	// Declared as plain members (no anonymous struct/union wrapper) so the
+	// class stays within ISO C++; anonymous structs are a compiler extension.
+	T x,y,z;
+
+	// Pointer to x; AsArray() and operator[] index y and z relative to it.
+	T* AsArray() { return &x; }
+	const T* AsArray() const { return &x; }
+
+	// The default constructor leaves the components uninitialized.
+	Vec3Packed() {}
+	// Copies a[0], a[1], a[2] into x, y, z.
+	Vec3Packed(const T a[3]) : x(a[0]), y(a[1]), z(a[2]) {}
+	Vec3Packed(const T& _x, const T& _y, const T& _z) : x(_x), y(_y), z(_z) {}
+	Vec3Packed(const Vec2<T>& _xy, const T& _z) : x(_xy.x), y(_xy.y), z(_z) {}
+
+	// Converts each component to T2 with a plain cast.
+	template<typename T2>
+	Vec3Packed<T2> Cast() const
+	{
+		return Vec3Packed<T2>((T2)x, (T2)y, (T2)z);
+	}
+
+	// Only implemented for T=int and T=float
+	static Vec3Packed FromRGB(unsigned int rgb);
+	unsigned int ToRGB() const; // alpha bits set to zero
+
+	// Builds a vector with all three components set to f.
+	static Vec3Packed AssignToAll(const T& f)
+	{
+		return Vec3Packed<T>(f, f, f);
+	}
+
+	// Copies x, y, z out to a[0], a[1], a[2]. Const because it only reads
+	// from this vector.
+	void Write(T a[3]) const
+	{
+		a[0] = x; a[1] = y; a[2] = z;
+	}
+
+	// Component-wise arithmetic. The compound assignments return void.
+	Vec3Packed operator +(const Vec3Packed &other) const
+	{
+		return Vec3Packed(x+other.x, y+other.y, z+other.z);
+	}
+	void operator += (const Vec3Packed &other)
+	{
+		x+=other.x; y+=other.y; z+=other.z;
+	}
+	Vec3Packed operator -(const Vec3Packed &other) const
+	{
+		return Vec3Packed(x-other.x, y-other.y, z-other.z);
+	}
+	void operator -= (const Vec3Packed &other)
+	{
+		x-=other.x; y-=other.y; z-=other.z;
+	}
+	Vec3Packed operator -() const
+	{
+		return Vec3Packed(-x,-y,-z);
+	}
+	// Component-wise product (not a dot or cross product).
+	Vec3Packed operator * (const Vec3Packed &other) const
+	{
+		return Vec3Packed(x*other.x, y*other.y, z*other.z);
+	}
+	// Scaling by a scalar of any type V that can multiply/divide T.
+	template<typename V>
+	Vec3Packed operator * (const V& f) const
+	{
+		return Vec3Packed(x*f,y*f,z*f);
+	}
+	template<typename V>
+	void operator *= (const V& f)
+	{
+		x*=f; y*=f; z*=f;
+	}
+	template<typename V>
+	Vec3Packed operator / (const V& f) const
+	{
+		return Vec3Packed(x/f,y/f,z/f);
+	}
+	template<typename V>
+	void operator /= (const V& f)
+	{
+		*this = *this / f;
+	}
+
+	// Squared length: x*x + y*y + z*z.
+	T Length2() const
+	{
+		return x*x + y*y + z*z;
+	}
+
+	// Returns a copy with every component passed through VecClamp(c, l, h).
+	Vec3Packed Clamp(const T &l, const T &h) const
+	{
+		return Vec3Packed(VecClamp(x, l, h), VecClamp(y, l, h), VecClamp(z, l, h));
+	}
+
+	// Only implemented for T=float
+	float Length() const;
+	void SetLength(const float l);
+	Vec3Packed WithLength(const float l) const;
+	float Distance2To(Vec3Packed &other);
+	Vec3Packed Normalized() const;
+	float Normalize(); // returns the previous length, which is often useful
+
+	// Indexed access: 0 -> x, 1 -> y, 2 -> z. The index is not range-checked.
+	T& operator [] (int i) //allow vector[2] = 3   (vector.z=3)
+	{
+		return *((&x) + i);
+	}
+	T operator [] (const int i) const
+	{
+		return *((&x) + i);
+	}
+
+	void SetZero()
+	{
+		x=0; y=0; z=0;
+	}
+
+	// Common aliases: UVW (texel coordinates), RGB (colors), STQ (texture coordinates)
+	T& u() { return x; }
+	T& v() { return y; }
+	T& w() { return z; }
+
+	T& r() { return x; }
+	T& g() { return y; }
+	T& b() { return z; }
+
+	T& s() { return x; }
+	T& t() { return y; }
+	T& q() { return z; }
+
+	const T& u() const { return x; }
+	const T& v() const { return y; }
+	const T& w() const { return z; }
+
+	const T& r() const { return x; }
+	const T& g() const { return y; }
+	const T& b() const { return z; }
+
+	const T& s() const { return x; }
+	const T& t() const { return y; }
+	const T& q() const { return z; }
+
+	// swizzlers - create a subvector of specific components
+	// e.g. Vec2 uv() { return Vec2(x,y); }
+	// _DEFINE_SWIZZLER2 defines a single such function, DEFINE_SWIZZLER2 defines all of them for all component names (x<->r) and permutations (xy<->yx)
+#define _DEFINE_SWIZZLER2(a, b, name) Vec2<T> name() const { return Vec2<T>(a, b); }
+#define DEFINE_SWIZZLER2(a, b, a2, b2, a3, b3, a4, b4) \
+	_DEFINE_SWIZZLER2(a, b, a##b); \
+	_DEFINE_SWIZZLER2(a, b, a2##b2); \
+	_DEFINE_SWIZZLER2(a, b, a3##b3); \
+	_DEFINE_SWIZZLER2(a, b, a4##b4); \
+	_DEFINE_SWIZZLER2(b, a, b##a); \
+	_DEFINE_SWIZZLER2(b, a, b2##a2); \
+	_DEFINE_SWIZZLER2(b, a, b3##a3); \
+	_DEFINE_SWIZZLER2(b, a, b4##a4);
+
+	DEFINE_SWIZZLER2(x, y, r, g, u, v, s, t);
+	DEFINE_SWIZZLER2(x, z, r, b, u, w, s, q);
+	DEFINE_SWIZZLER2(y, z, g, b, v, w, t, q);
+	// The helper macros are only meant for this class body.
+#undef DEFINE_SWIZZLER2
+#undef _DEFINE_SWIZZLER2
+};
+
 template<typename T>
 class Vec4
 {
@ -629,6 +792,7 @@ private:
 }; // namespace Math3D

 typedef Math3D::Vec3<float> Vec3f;
+typedef Math3D::Vec3Packed<float> Vec3Packedf;
 typedef Math3D::Vec4<float> Vec4f;


@ -721,6 +885,12 @@ inline Vec3<T> Cross(const Vec3<T>& a, const Vec3<T>& b)
 	return Vec3<T>(a.y*b.z-a.z*b.y, a.z*b.x-a.x*b.z, a.x*b.y-a.y*b.x);
 }

+// Cross product a x b of two packed vectors, computed component by component.
+template<typename T>
+inline Vec3Packed<T> Cross(const Vec3Packed<T>& a, const Vec3Packed<T>& b)
+{
+	const T cx = a.y*b.z-a.z*b.y;
+	const T cy = a.z*b.x-a.x*b.z;
+	const T cz = a.x*b.y-a.y*b.x;
+	return Vec3Packed<T>(cx, cy, cz);
+}
+
 }; // namespace Math3D

 // linear interpolation via float: 0.0=begin, 1.0=end