softgpu: Use common SIMD matrix multiplies.

This commit is contained in:
Unknown W. Brackets 2022-01-04 09:00:50 -08:00
parent cba2374abd
commit 079b67e7ed
2 changed files with 124 additions and 38 deletions

View File

@ -38,6 +38,12 @@
#endif
#endif
#if PPSSPP_PLATFORM(WINDOWS) && (defined(_MSC_VER) || defined(__clang__) || defined(__INTEL_COMPILER))
#define MATH3D_CALL __vectorcall
#else
#define MATH3D_CALL
#endif
namespace Math3D {
// Helper for Vec classes to clamp values.
@ -913,6 +919,38 @@ inline void Vec3ByMatrix43(float vecOut[3], const float v[3], const float m[12])
#endif
}
inline Vec3f MATH3D_CALL Vec3ByMatrix43(const Vec3f v, const float m[12]) {
#if defined(_M_SSE)
__m128 col0 = _mm_loadu_ps(m);
__m128 col1 = _mm_loadu_ps(m + 3);
__m128 col2 = _mm_loadu_ps(m + 6);
__m128 col3 = _mm_loadu_ps(m + 9);
__m128 x = _mm_shuffle_ps(v.vec, v.vec, _MM_SHUFFLE(0, 0, 0, 0));
__m128 y = _mm_shuffle_ps(v.vec, v.vec, _MM_SHUFFLE(1, 1, 1, 1));
__m128 z = _mm_shuffle_ps(v.vec, v.vec, _MM_SHUFFLE(2, 2, 2, 2));
__m128 sum = _mm_add_ps(
_mm_add_ps(_mm_mul_ps(col0, x), _mm_mul_ps(col1, y)),
_mm_add_ps(_mm_mul_ps(col2, z), col3));
return sum;
#elif PPSSPP_ARCH(ARM_NEON) && PPSSPP_ARCH(ARM64)
float32x4_t col0 = vld1q_f32(m);
float32x4_t col1 = vld1q_f32(m + 3);
float32x4_t col2 = vld1q_f32(m + 6);
float32x4_t col3 = vld1q_f32(m + 9);
float32x4_t vec = v.vec;
float32x4_t sum = vaddq_f32(
vaddq_f32(vmulq_laneq_f32(col0, vec, 0), vmulq_laneq_f32(col1, vec, 1)),
vaddq_f32(vmulq_laneq_f32(col2, vec, 2), col3));
return sum;
#else
Vec3f vecOut;
vecOut[0] = v[0] * m[0] + v[1] * m[3] + v[2] * m[6] + m[9];
vecOut[1] = v[0] * m[1] + v[1] * m[4] + v[2] * m[7] + m[10];
vecOut[2] = v[0] * m[2] + v[1] * m[5] + v[2] * m[8] + m[11];
return vecOut;
#endif
}
inline void Vec3ByMatrix44(float vecOut[4], const float v[3], const float m[16])
{
#if defined(_M_SSE)
@ -945,6 +983,39 @@ inline void Vec3ByMatrix44(float vecOut[4], const float v[3], const float m[16])
#endif
}
inline Vec4f MATH3D_CALL Vec3ByMatrix44(const Vec3f v, const float m[16]) {
#if defined(_M_SSE)
__m128 col0 = _mm_loadu_ps(m);
__m128 col1 = _mm_loadu_ps(m + 4);
__m128 col2 = _mm_loadu_ps(m + 8);
__m128 col3 = _mm_loadu_ps(m + 12);
__m128 x = _mm_set1_ps(v[0]);
__m128 y = _mm_set1_ps(v[1]);
__m128 z = _mm_set1_ps(v[2]);
__m128 sum = _mm_add_ps(
_mm_add_ps(_mm_mul_ps(col0, x), _mm_mul_ps(col1, y)),
_mm_add_ps(_mm_mul_ps(col2, z), col3));
return sum;
#elif PPSSPP_ARCH(ARM_NEON) && PPSSPP_ARCH(ARM64)
float32x4_t col0 = vld1q_f32(m);
float32x4_t col1 = vld1q_f32(m + 4);
float32x4_t col2 = vld1q_f32(m + 8);
float32x4_t col3 = vld1q_f32(m + 12);
float32x4_t vec = v.vec;
float32x4_t sum = vaddq_f32(
vaddq_f32(vmulq_laneq_f32(col0, vec, 0), vmulq_laneq_f32(col1, vec, 1)),
vaddq_f32(vmulq_laneq_f32(col2, vec, 2), col3));
return sum;
#else
Vec4f vecOut;
vecOut[0] = v[0] * m[0] + v[1] * m[4] + v[2] * m[8] + m[12];
vecOut[1] = v[0] * m[1] + v[1] * m[5] + v[2] * m[9] + m[13];
vecOut[2] = v[0] * m[2] + v[1] * m[6] + v[2] * m[10] + m[14];
vecOut[3] = v[0] * m[3] + v[1] * m[7] + v[2] * m[11] + m[15];
return vecOut;
#endif
}
inline void Norm3ByMatrix43(float vecOut[3], const float v[3], const float m[12])
{
vecOut[0] = v[0] * m[0] + v[1] * m[3] + v[2] * m[6];
@ -952,6 +1023,36 @@ inline void Norm3ByMatrix43(float vecOut[3], const float v[3], const float m[12]
vecOut[2] = v[0] * m[2] + v[1] * m[5] + v[2] * m[8];
}
inline Vec3f MATH3D_CALL Norm3ByMatrix43(const Vec3f v, const float m[12]) {
#if defined(_M_SSE)
__m128 col0 = _mm_loadu_ps(m);
__m128 col1 = _mm_loadu_ps(m + 3);
__m128 col2 = _mm_loadu_ps(m + 6);
__m128 x = _mm_shuffle_ps(v.vec, v.vec, _MM_SHUFFLE(0, 0, 0, 0));
__m128 y = _mm_shuffle_ps(v.vec, v.vec, _MM_SHUFFLE(1, 1, 1, 1));
__m128 z = _mm_shuffle_ps(v.vec, v.vec, _MM_SHUFFLE(2, 2, 2, 2));
__m128 sum = _mm_add_ps(
_mm_add_ps(_mm_mul_ps(col0, x), _mm_mul_ps(col1, y)),
_mm_mul_ps(col2, z));
return sum;
#elif PPSSPP_ARCH(ARM_NEON) && PPSSPP_ARCH(ARM64)
float32x4_t col0 = vld1q_f32(m);
float32x4_t col1 = vld1q_f32(m + 3);
float32x4_t col2 = vld1q_f32(m + 6);
float32x4_t vec = v.vec;
float32x4_t sum = vaddq_f32(
vaddq_f32(vmulq_laneq_f32(col0, vec, 0), vmulq_laneq_f32(col1, vec, 1)),
vmulq_laneq_f32(col2, vec, 2));
return sum;
#else
Vec3f vecOut;
vecOut[0] = v[0] * m[0] + v[1] * m[3] + v[2] * m[6];
vecOut[1] = v[0] * m[1] + v[1] * m[4] + v[2] * m[7];
vecOut[2] = v[0] * m[2] + v[1] * m[5] + v[2] * m[8];
return vecOut;
#endif
}
inline void Matrix4ByMatrix4(float out[16], const float a[16], const float b[16]) {
fast_matrix_mul_4x4(out, b, a);
}

View File

@ -67,29 +67,20 @@ VertexDecoder *SoftwareDrawEngine::FindVertexDecoder(u32 vtype) {
return DrawEngineCommon::GetVertexDecoder(vertTypeID);
}
WorldCoords TransformUnit::ModelToWorld(const ModelCoords& coords)
{
Mat3x3<float> world_matrix(gstate.worldMatrix);
return WorldCoords(world_matrix * coords) + Vec3<float>(gstate.worldMatrix[9], gstate.worldMatrix[10], gstate.worldMatrix[11]);
WorldCoords TransformUnit::ModelToWorld(const ModelCoords &coords) {
return Vec3ByMatrix43(coords, gstate.worldMatrix);
}
WorldCoords TransformUnit::ModelToWorldNormal(const ModelCoords& coords)
{
Mat3x3<float> world_matrix(gstate.worldMatrix);
return WorldCoords(world_matrix * coords);
WorldCoords TransformUnit::ModelToWorldNormal(const ModelCoords &coords) {
return Norm3ByMatrix43(coords, gstate.worldMatrix);
}
ViewCoords TransformUnit::WorldToView(const WorldCoords& coords)
{
Mat3x3<float> view_matrix(gstate.viewMatrix);
return ViewCoords(view_matrix * coords) + Vec3<float>(gstate.viewMatrix[9], gstate.viewMatrix[10], gstate.viewMatrix[11]);
ViewCoords TransformUnit::WorldToView(const WorldCoords &coords) {
return Vec3ByMatrix43(coords, gstate.viewMatrix);
}
ClipCoords TransformUnit::ViewToClip(const ViewCoords& coords)
{
Vec4<float> coords4(coords.x, coords.y, coords.z, 1.0f);
Mat4x4<float> projection_matrix(gstate.projMatrix);
return ClipCoords(projection_matrix * coords4);
ClipCoords TransformUnit::ViewToClip(const ViewCoords &coords) {
return Vec3ByMatrix44(coords, gstate.projMatrix);
}
static inline ScreenCoords ClipToScreenInternal(const ClipCoords& coords, bool *outside_range_flag) {
@ -161,20 +152,16 @@ VertexData TransformUnit::ReadVertex(VertexReader &vreader, bool &outside_range_
PROFILE_THIS_SCOPE("read_vert");
VertexData vertex;
float pos[3];
ModelCoords pos;
// VertexDecoder normally scales z, but we want it unscaled.
vreader.ReadPosThroughZ16(pos);
vreader.ReadPosThroughZ16(pos.AsArray());
if (!gstate.isModeClear() && gstate.isTextureMapEnabled() && vreader.hasUV()) {
float uv[2];
vreader.ReadUV(uv);
vertex.texturecoords = Vec2<float>(uv[0], uv[1]);
vreader.ReadUV(vertex.texturecoords.AsArray());
}
if (vreader.hasNormal()) {
float normal[3];
vreader.ReadNrm(normal);
vertex.normal = Vec3<float>(normal[0], normal[1], normal[2]);
vreader.ReadNrm(vertex.normal.AsArray());
if (gstate.areNormalsReversed())
vertex.normal = -vertex.normal;
@ -188,15 +175,15 @@ VertexData TransformUnit::ReadVertex(VertexReader &vreader, bool &outside_range_
Vec3<float> tmpnrm(0.f, 0.f, 0.f);
for (int i = 0; i < vertTypeGetNumBoneWeights(gstate.vertType); ++i) {
Mat3x3<float> bone(&gstate.boneMatrix[12*i]);
tmppos += (bone * ModelCoords(pos[0], pos[1], pos[2]) + Vec3<float>(gstate.boneMatrix[12*i+9], gstate.boneMatrix[12*i+10], gstate.boneMatrix[12*i+11])) * W[i];
if (vreader.hasNormal())
tmpnrm += (bone * vertex.normal) * W[i];
Vec3<float> step = Vec3ByMatrix43(pos, gstate.boneMatrix + i * 12);
tmppos += step * W[i];
if (vreader.hasNormal()) {
step = Norm3ByMatrix43(vertex.normal, gstate.boneMatrix + i * 12);
tmpnrm += step * W[i];
}
}
pos[0] = tmppos.x;
pos[1] = tmppos.y;
pos[2] = tmppos.z;
pos = tmppos;
if (vreader.hasNormal())
vertex.normal = tmpnrm;
}
@ -206,7 +193,7 @@ VertexData TransformUnit::ReadVertex(VertexReader &vreader, bool &outside_range_
vreader.ReadColor0(col);
vertex.color0 = Vec4<int>(col[0]*255, col[1]*255, col[2]*255, col[3]*255);
} else {
vertex.color0 = Vec4<int>(gstate.getMaterialAmbientR(), gstate.getMaterialAmbientG(), gstate.getMaterialAmbientB(), gstate.getMaterialAmbientA());
vertex.color0 = Vec4<int>::FromRGBA(gstate.getMaterialAmbientRGBA());
}
if (vreader.hasColor1()) {
@ -218,7 +205,7 @@ VertexData TransformUnit::ReadVertex(VertexReader &vreader, bool &outside_range_
}
if (!gstate.isModeThrough()) {
vertex.modelpos = ModelCoords(pos[0], pos[1], pos[2]);
vertex.modelpos = pos;
vertex.worldpos = WorldCoords(TransformUnit::ModelToWorld(vertex.modelpos));
ModelCoords viewpos = TransformUnit::WorldToView(vertex.worldpos);
vertex.clippos = ClipCoords(TransformUnit::ViewToClip(viewpos));
@ -240,8 +227,7 @@ VertexData TransformUnit::ReadVertex(VertexReader &vreader, bool &outside_range_
vertex.screenpos = ClipToScreenInternal(vertex.clippos, &outside_range_flag);
if (vreader.hasNormal()) {
vertex.worldnormal = TransformUnit::ModelToWorldNormal(vertex.normal);
vertex.worldnormal /= vertex.worldnormal.Length();
vertex.worldnormal = TransformUnit::ModelToWorldNormal(vertex.normal).Normalized(cpu_info.bSSE4_1);
} else {
vertex.worldnormal = Vec3<float>(0.0f, 0.0f, 1.0f);
}
@ -273,8 +259,7 @@ VertexData TransformUnit::ReadVertex(VertexReader &vreader, bool &outside_range_
}
// TODO: What about uv scale and offset?
Mat3x3<float> tgen(gstate.tgenMatrix);
Vec3<float> stq = tgen * source + Vec3<float>(gstate.tgenMatrix[9], gstate.tgenMatrix[10], gstate.tgenMatrix[11]);
Vec3<float> stq = Vec3ByMatrix43(source, gstate.tgenMatrix);
float z_recip = 1.0f / stq.z;
vertex.texturecoords = Vec2f(stq.x * z_recip, stq.y * z_recip);
} else if (gstate.getUVGenMode() == GE_TEXMAP_ENVIRONMENT_MAP) {