[spline/bezier]Get rid of divisions in spline weights calculation.

2025-02-27 01:17:09 +00:00 · 2018-01-29 18:25:38 +09:00 · 2018-01-29 18:25:38 +09:00 · 5c7284f8f2
commit 5c7284f8f2
parent 6fd1c0e3d9
1 changed files with 69 additions and 53 deletions
--- a/GPU/Common/SplineCommon.cpp
+++ b/GPU/Common/SplineCommon.cpp
@ -151,50 +151,56 @@ static Vec3f Bernstein3DDerivative(const Vec3f& p0, const Vec3f& p1, const Vec3f
 	return p0 * bern0deriv(x) + p1 * bern1deriv(x) + p2 * bern2deriv(x) + p3 * bern3deriv(x);
 }

-static void spline_n_4(int i, float t, float *knot, float *splineVal) {
-	knot += i + 1;
+struct KnotDiv {
+	float _3_0 = 1.0f / 3.0f;
+	float _4_1 = 1.0f / 3.0f;
+	float _5_2 = 1.0f / 3.0f;
+	float _3_1 = 1.0f / 2.0f;
+	float _4_2 = 1.0f / 2.0f;
+	float _3_2 = 1.0f; // Always 1
+};
+
+static void spline_n_4(int i, float t, float *knot, const KnotDiv &div, float *splineVal) {
+	knot += i;

 #ifdef _M_SSE
-	const __m128 knot012 = _mm_loadu_ps(&knot[0]);
-	const __m128 knot345 = _mm_loadu_ps(&knot[3]);
-	const __m128 t012 = _mm_sub_ps(_mm_set_ps1(t), knot012);
-	const __m128 f30_41_52 = _mm_div_ps(t012, _mm_sub_ps(knot345, knot012));
+		const __m128 knot012 = _mm_loadu_ps(knot);
+		const __m128 t012 = _mm_sub_ps(_mm_set_ps1(t), knot012);
+		const __m128 f30_41_52 = _mm_mul_ps(t012, _mm_loadu_ps(&div._3_0));
+		const __m128 f52_31_42 = _mm_mul_ps(t012, _mm_loadu_ps(&div._5_2));
+		const float &f32 = t012.m128_f32[2];

-	const __m128 knot343 = _mm_shuffle_ps(knot345, knot345, _MM_SHUFFLE(3, 0, 1, 0));
-	const __m128 knot122 = _mm_shuffle_ps(knot012, knot012, _MM_SHUFFLE(3, 2, 2, 1));
-	const __m128 t122 = _mm_shuffle_ps(t012, t012, _MM_SHUFFLE(3, 2, 2, 1));
-	const __m128 f31_42_32 = _mm_div_ps(t122, _mm_sub_ps(knot343, knot122));
-
-	// It's still faster to use SSE, even with this.
-	alignas(16) float ff30_41_52[4];
-	alignas(16) float ff31_42_32[4];
-	_mm_store_ps(ff30_41_52, f30_41_52);
-	_mm_store_ps(ff31_42_32, f31_42_32);
-
-	const float &f30 = ff30_41_52[0];
-	const float &f41 = ff30_41_52[1];
-	const float &f52 = ff30_41_52[2];
-	const float &f31 = ff31_42_32[0];
-	const float &f42 = ff31_42_32[1];
-	const float &f32 = ff31_42_32[2];
+		// Following comments are for explains order of the multiply.
+	//	float a = (1-f30)*(1-f31);
+	//	float c = (1-f41)*(1-f42);
+	//	float b = (  f31 *   f41);
+	//	float d = (  f42 *   f52);
+		const __m128 f30_41_31_42 = _mm_shuffle_ps(f30_41_52, f52_31_42, _MM_SHUFFLE(2, 1, 1, 0));
+		const __m128 f31_42_41_52 = _mm_shuffle_ps(f52_31_42, f30_41_52, _MM_SHUFFLE(2, 1, 2, 1));
+		const __m128 c1_1_0_0 = { 1, 1, 0, 0 };
+		const __m128 acbd = _mm_mul_ps(_mm_sub_ps(c1_1_0_0, f30_41_31_42), _mm_sub_ps(c1_1_0_0, f31_42_41_52));
+		const float &a = acbd.m128_f32[0];
+		const float &b = acbd.m128_f32[2];
+		const float &c = acbd.m128_f32[1];
+		const float &d = acbd.m128_f32[3];
 #else
 	// TODO: Maybe compilers could be coaxed into vectorizing this code without the above explicitly...
 	float t0 = (t - knot[0]);
 	float t1 = (t - knot[1]);
 	float t2 = (t - knot[2]);
-	// TODO: All our knots are integers so we should be able to get rid of these divisions (How?)
-	float f30 = t0/(knot[3]-knot[0]);
-	float f41 = t1/(knot[4]-knot[1]);
-	float f52 = t2/(knot[5]-knot[2]);
-	float f31 = t1/(knot[3]-knot[1]);
-	float f42 = t2/(knot[4]-knot[2]);
-	float f32 = t2/(knot[3]-knot[2]);
-#endif
+
+	float f30 = t0 * div._3_0;
+	float f41 = t1 * div._4_1;
+	float f52 = t2 * div._5_2;
+	float f31 = t1 * div._3_1;
+	float f42 = t2 * div._4_2;
+	float f32 = t2 * div._3_2;

 	float a = (1-f30)*(1-f31);
 	float b = (f31*f41);
 	float c = (1-f41)*(1-f42);
 	float d = (f42*f52);
+#endif

 	splineVal[0] = a-(a*f32);
 	splineVal[1] = 1-a-b+((a+b+c-1)*f32);
@ -203,25 +209,33 @@ static void spline_n_4(int i, float t, float *knot, float *splineVal) {
 }

 // knot should be an array sized n + 5  (n + 1 + 1 + degree (cubic))
-static void spline_knot(int n, int type, float *knot) {
-	memset(knot, 0, sizeof(float) * (n + 5));
-	for (int i = 0; i < n - 1; ++i)
-		knot[i + 3] = (float)i;
+static void spline_knot(int n, int type, float *knots, KnotDiv *divs) {
+		// Basic theory (-2 to +3), optimized with KnotDiv (-2 to +0) 
+	//	for (int i = 0; i < n + 5; ++i) {
+		for (int i = 0; i < n + 2; ++i) {
+			knots[i] = (float)i - 2;
+		}

-	if ((type & 1) == 0) {
-		knot[0] = -3;
-		knot[1] = -2;
-		knot[2] = -1;
-	}
-	if ((type & 2) == 0) {
-		knot[n + 2] = (float)(n - 1);
-		knot[n + 3] = (float)(n);
-		knot[n + 4] = (float)(n + 1);
-	} else {
-		knot[n + 2] = (float)(n - 2);
-		knot[n + 3] = (float)(n - 2);
-		knot[n + 4] = (float)(n - 2);
-	}
+		if ((type & 1) != 0) {
+			knots[0] = 0;
+			knots[1] = 0;
+
+			divs[0]._3_0 = 1.0f;
+			divs[0]._4_1 = 1.0f / 2.0f;
+			divs[0]._3_1 = 1.0f;
+			if (n > 1)
+				divs[1]._3_0 = 1.0f / 2.0f;
+		}
+		if ((type & 2) != 0) {
+		//	knots[n + 2] = (float)n; // Got rid of this line optimized with KnotDiv
+		//	knots[n + 3] = (float)n; // Got rid of this line optimized with KnotDiv
+		//	knots[n + 4] = (float)n; // Got rid of this line optimized with KnotDiv
+			divs[n - 1]._4_1 = 1.0f / 2.0f;
+			divs[n - 1]._5_2 = 1.0f;
+			divs[n - 1]._4_2 = 1.0f;
+			if (n > 1)
+				divs[n - 2]._5_2 = 1.0f / 2.0f;
+		}
 }

 bool CanUseHardwareTessellation(GEPatchPrimType prim) {
@ -359,8 +373,10 @@ static void SplinePatchFullQuality(u8 *&dest, u16 *indices, int &count, const Sp

 	float *knot_u = new float[spatch.count_u + 4];
 	float *knot_v = new float[spatch.count_v + 4];
-	spline_knot(spatch.count_u - 1, spatch.type_u, knot_u);
-	spline_knot(spatch.count_v - 1, spatch.type_v, knot_v);
+	KnotDiv *divs_u = new KnotDiv[spatch.count_u - 3];
+	KnotDiv *divs_v = new KnotDiv[spatch.count_v - 3];
+	spline_knot(spatch.count_u - 3, spatch.type_u, knot_u, divs_u);
+	spline_knot(spatch.count_v - 3, spatch.type_v, knot_v, divs_v);

 	// Increase tessellation based on the size. Should be approximately right?
 	int patch_div_s = (spatch.count_u - 3) * spatch.tess_u;
@ -439,8 +455,8 @@ static void SplinePatchFullQuality(u8 *&dest, u16 *indices, int &count, const Sp
 			if (iu >= spatch.count_u - 3) iu = spatch.count_u - 4;
 			if (iv >= spatch.count_v - 3) iv = spatch.count_v - 4;

-			spline_n_4(iu, u, knot_u, u_weights);
-			spline_n_4(iv, v, knot_v, v_weights);
+			spline_n_4(iu, u, knot_u, divs_u[iu], u_weights);
+			spline_n_4(iv, v, knot_v, divs_v[iv], v_weights);

 			// Handle degenerate patches. without this, spatch.points[] may read outside the number of initialized points.
 			int patch_w = std::min(spatch.count_u - iu, 4);