mirror of
https://github.com/hrydgard/ppsspp.git
synced 2025-02-27 01:17:09 +00:00
[spline/bezier]Get rid of divisions in spline weights calculation.
This commit is contained in:
parent
6fd1c0e3d9
commit
5c7284f8f2
@ -151,50 +151,56 @@ static Vec3f Bernstein3DDerivative(const Vec3f& p0, const Vec3f& p1, const Vec3f
|
||||
return p0 * bern0deriv(x) + p1 * bern1deriv(x) + p2 * bern2deriv(x) + p3 * bern3deriv(x);
|
||||
}
|
||||
|
||||
static void spline_n_4(int i, float t, float *knot, float *splineVal) {
|
||||
knot += i + 1;
|
||||
struct KnotDiv {
|
||||
float _3_0 = 1.0f / 3.0f;
|
||||
float _4_1 = 1.0f / 3.0f;
|
||||
float _5_2 = 1.0f / 3.0f;
|
||||
float _3_1 = 1.0f / 2.0f;
|
||||
float _4_2 = 1.0f / 2.0f;
|
||||
float _3_2 = 1.0f; // Always 1
|
||||
};
|
||||
|
||||
static void spline_n_4(int i, float t, float *knot, const KnotDiv &div, float *splineVal) {
|
||||
knot += i;
|
||||
|
||||
#ifdef _M_SSE
|
||||
const __m128 knot012 = _mm_loadu_ps(&knot[0]);
|
||||
const __m128 knot345 = _mm_loadu_ps(&knot[3]);
|
||||
const __m128 t012 = _mm_sub_ps(_mm_set_ps1(t), knot012);
|
||||
const __m128 f30_41_52 = _mm_div_ps(t012, _mm_sub_ps(knot345, knot012));
|
||||
const __m128 knot012 = _mm_loadu_ps(knot);
|
||||
const __m128 t012 = _mm_sub_ps(_mm_set_ps1(t), knot012);
|
||||
const __m128 f30_41_52 = _mm_mul_ps(t012, _mm_loadu_ps(&div._3_0));
|
||||
const __m128 f52_31_42 = _mm_mul_ps(t012, _mm_loadu_ps(&div._5_2));
|
||||
const float &f32 = t012.m128_f32[2];
|
||||
|
||||
const __m128 knot343 = _mm_shuffle_ps(knot345, knot345, _MM_SHUFFLE(3, 0, 1, 0));
|
||||
const __m128 knot122 = _mm_shuffle_ps(knot012, knot012, _MM_SHUFFLE(3, 2, 2, 1));
|
||||
const __m128 t122 = _mm_shuffle_ps(t012, t012, _MM_SHUFFLE(3, 2, 2, 1));
|
||||
const __m128 f31_42_32 = _mm_div_ps(t122, _mm_sub_ps(knot343, knot122));
|
||||
|
||||
// It's still faster to use SSE, even with this.
|
||||
alignas(16) float ff30_41_52[4];
|
||||
alignas(16) float ff31_42_32[4];
|
||||
_mm_store_ps(ff30_41_52, f30_41_52);
|
||||
_mm_store_ps(ff31_42_32, f31_42_32);
|
||||
|
||||
const float &f30 = ff30_41_52[0];
|
||||
const float &f41 = ff30_41_52[1];
|
||||
const float &f52 = ff30_41_52[2];
|
||||
const float &f31 = ff31_42_32[0];
|
||||
const float &f42 = ff31_42_32[1];
|
||||
const float &f32 = ff31_42_32[2];
|
||||
// Following comments are for explains order of the multiply.
|
||||
// float a = (1-f30)*(1-f31);
|
||||
// float c = (1-f41)*(1-f42);
|
||||
// float b = ( f31 * f41);
|
||||
// float d = ( f42 * f52);
|
||||
const __m128 f30_41_31_42 = _mm_shuffle_ps(f30_41_52, f52_31_42, _MM_SHUFFLE(2, 1, 1, 0));
|
||||
const __m128 f31_42_41_52 = _mm_shuffle_ps(f52_31_42, f30_41_52, _MM_SHUFFLE(2, 1, 2, 1));
|
||||
const __m128 c1_1_0_0 = { 1, 1, 0, 0 };
|
||||
const __m128 acbd = _mm_mul_ps(_mm_sub_ps(c1_1_0_0, f30_41_31_42), _mm_sub_ps(c1_1_0_0, f31_42_41_52));
|
||||
const float &a = acbd.m128_f32[0];
|
||||
const float &b = acbd.m128_f32[2];
|
||||
const float &c = acbd.m128_f32[1];
|
||||
const float &d = acbd.m128_f32[3];
|
||||
#else
|
||||
// TODO: Maybe compilers could be coaxed into vectorizing this code without the above explicitly...
|
||||
float t0 = (t - knot[0]);
|
||||
float t1 = (t - knot[1]);
|
||||
float t2 = (t - knot[2]);
|
||||
// TODO: All our knots are integers so we should be able to get rid of these divisions (How?)
|
||||
float f30 = t0/(knot[3]-knot[0]);
|
||||
float f41 = t1/(knot[4]-knot[1]);
|
||||
float f52 = t2/(knot[5]-knot[2]);
|
||||
float f31 = t1/(knot[3]-knot[1]);
|
||||
float f42 = t2/(knot[4]-knot[2]);
|
||||
float f32 = t2/(knot[3]-knot[2]);
|
||||
#endif
|
||||
|
||||
float f30 = t0 * div._3_0;
|
||||
float f41 = t1 * div._4_1;
|
||||
float f52 = t2 * div._5_2;
|
||||
float f31 = t1 * div._3_1;
|
||||
float f42 = t2 * div._4_2;
|
||||
float f32 = t2 * div._3_2;
|
||||
|
||||
float a = (1-f30)*(1-f31);
|
||||
float b = (f31*f41);
|
||||
float c = (1-f41)*(1-f42);
|
||||
float d = (f42*f52);
|
||||
#endif
|
||||
|
||||
splineVal[0] = a-(a*f32);
|
||||
splineVal[1] = 1-a-b+((a+b+c-1)*f32);
|
||||
@ -203,25 +209,33 @@ static void spline_n_4(int i, float t, float *knot, float *splineVal) {
|
||||
}
|
||||
|
||||
// knot should be an array sized n + 5 (n + 1 + 1 + degree (cubic))
|
||||
static void spline_knot(int n, int type, float *knot) {
|
||||
memset(knot, 0, sizeof(float) * (n + 5));
|
||||
for (int i = 0; i < n - 1; ++i)
|
||||
knot[i + 3] = (float)i;
|
||||
static void spline_knot(int n, int type, float *knots, KnotDiv *divs) {
|
||||
// Basic theory (-2 to +3), optimized with KnotDiv (-2 to +0)
|
||||
// for (int i = 0; i < n + 5; ++i) {
|
||||
for (int i = 0; i < n + 2; ++i) {
|
||||
knots[i] = (float)i - 2;
|
||||
}
|
||||
|
||||
if ((type & 1) == 0) {
|
||||
knot[0] = -3;
|
||||
knot[1] = -2;
|
||||
knot[2] = -1;
|
||||
}
|
||||
if ((type & 2) == 0) {
|
||||
knot[n + 2] = (float)(n - 1);
|
||||
knot[n + 3] = (float)(n);
|
||||
knot[n + 4] = (float)(n + 1);
|
||||
} else {
|
||||
knot[n + 2] = (float)(n - 2);
|
||||
knot[n + 3] = (float)(n - 2);
|
||||
knot[n + 4] = (float)(n - 2);
|
||||
}
|
||||
if ((type & 1) != 0) {
|
||||
knots[0] = 0;
|
||||
knots[1] = 0;
|
||||
|
||||
divs[0]._3_0 = 1.0f;
|
||||
divs[0]._4_1 = 1.0f / 2.0f;
|
||||
divs[0]._3_1 = 1.0f;
|
||||
if (n > 1)
|
||||
divs[1]._3_0 = 1.0f / 2.0f;
|
||||
}
|
||||
if ((type & 2) != 0) {
|
||||
// knots[n + 2] = (float)n; // Got rid of this line optimized with KnotDiv
|
||||
// knots[n + 3] = (float)n; // Got rid of this line optimized with KnotDiv
|
||||
// knots[n + 4] = (float)n; // Got rid of this line optimized with KnotDiv
|
||||
divs[n - 1]._4_1 = 1.0f / 2.0f;
|
||||
divs[n - 1]._5_2 = 1.0f;
|
||||
divs[n - 1]._4_2 = 1.0f;
|
||||
if (n > 1)
|
||||
divs[n - 2]._5_2 = 1.0f / 2.0f;
|
||||
}
|
||||
}
|
||||
|
||||
bool CanUseHardwareTessellation(GEPatchPrimType prim) {
|
||||
@ -359,8 +373,10 @@ static void SplinePatchFullQuality(u8 *&dest, u16 *indices, int &count, const Sp
|
||||
|
||||
float *knot_u = new float[spatch.count_u + 4];
|
||||
float *knot_v = new float[spatch.count_v + 4];
|
||||
spline_knot(spatch.count_u - 1, spatch.type_u, knot_u);
|
||||
spline_knot(spatch.count_v - 1, spatch.type_v, knot_v);
|
||||
KnotDiv *divs_u = new KnotDiv[spatch.count_u - 3];
|
||||
KnotDiv *divs_v = new KnotDiv[spatch.count_v - 3];
|
||||
spline_knot(spatch.count_u - 3, spatch.type_u, knot_u, divs_u);
|
||||
spline_knot(spatch.count_v - 3, spatch.type_v, knot_v, divs_v);
|
||||
|
||||
// Increase tessellation based on the size. Should be approximately right?
|
||||
int patch_div_s = (spatch.count_u - 3) * spatch.tess_u;
|
||||
@ -439,8 +455,8 @@ static void SplinePatchFullQuality(u8 *&dest, u16 *indices, int &count, const Sp
|
||||
if (iu >= spatch.count_u - 3) iu = spatch.count_u - 4;
|
||||
if (iv >= spatch.count_v - 3) iv = spatch.count_v - 4;
|
||||
|
||||
spline_n_4(iu, u, knot_u, u_weights);
|
||||
spline_n_4(iv, v, knot_v, v_weights);
|
||||
spline_n_4(iu, u, knot_u, divs_u[iu], u_weights);
|
||||
spline_n_4(iv, v, knot_v, divs_v[iv], v_weights);
|
||||
|
||||
// Handle degenerate patches. without this, spatch.points[] may read outside the number of initialized points.
|
||||
int patch_w = std::min(spatch.count_u - iu, 4);
|
||||
|
Loading…
x
Reference in New Issue
Block a user