NEON-optimize matrix transposes

2024-11-23 05:19:56 +00:00 · 2023-11-27 23:21:47 +01:00 · 2023-11-27 23:21:47 +01:00 · 4ec2d76bc9
commit 4ec2d76bc9
parent 45aae7b9da
2 changed files with 17 additions and 1 deletions
--- a/GPU/GPUCommon.cpp
+++ b/GPU/GPUCommon.cpp
@ -1944,7 +1944,7 @@ bool GPUCommon::DescribeCodePtr(const u8 *ptr, std::string &name) {
 }

 void GPUCommon::UpdateUVScaleOffset() {
-#ifdef _M_SSE
+#if defined(_M_SSE)
 	__m128i values = _mm_slli_epi32(_mm_load_si128((const __m128i *)&gstate.texscaleu), 8);
 	_mm_storeu_si128((__m128i *)&gstate_c.uv, values);
 #elif PPSSPP_ARCH(ARM_NEON)
--- a/GPU/Math3D.h
+++ b/GPU/Math3D.h
@ -1152,6 +1152,13 @@ inline void ConvertMatrix4x3To4x4(float *m4x4, const float *m4x3) {
 }

 inline void ConvertMatrix4x3To4x4Transposed(float *m4x4, const float *m4x3) {
+#if PPSSPP_ARCH(ARM_NEON)
+	// vld3q is a perfect match here!
+	float32x4x3_t packed = vld3q_f32(m4x3);
+	vst1q_f32(m4x4, packed.val[0]);
+	vst1q_f32(m4x4 + 4, packed.val[1]);
+	vst1q_f32(m4x4 + 8, packed.val[2]);
+#else
 	m4x4[0] = m4x3[0];
 	m4x4[1] = m4x3[3];
 	m4x4[2] = m4x3[6];
@ -1164,6 +1171,7 @@ inline void ConvertMatrix4x3To4x4Transposed(float *m4x4, const float *m4x3) {
 	m4x4[9] = m4x3[5];
 	m4x4[10] = m4x3[8];
 	m4x4[11] = m4x3[11];
+#endif
 	m4x4[12] = 0.0f;
 	m4x4[13] = 0.0f;
 	m4x4[14] = 0.0f;
@ -1179,6 +1187,13 @@ inline void ConvertMatrix4x3To4x4Transposed(float *m4x4, const float *m4x3) {
 // 89AB
 // Don't see a way to SIMD that. Should be pretty fast anyway.
 inline void ConvertMatrix4x3To3x4Transposed(float *m4x4, const float *m4x3) {
+#if PPSSPP_ARCH(ARM_NEON)
+	// vld3q is a perfect match here!
+	float32x4x3_t packed = vld3q_f32(m4x3);
+	vst1q_f32(m4x4, packed.val[0]);
+	vst1q_f32(m4x4 + 4, packed.val[1]);
+	vst1q_f32(m4x4 + 8, packed.val[2]);
+#else
 	m4x4[0] = m4x3[0];
 	m4x4[1] = m4x3[3];
 	m4x4[2] = m4x3[6];
@ -1191,6 +1206,7 @@ inline void ConvertMatrix4x3To3x4Transposed(float *m4x4, const float *m4x3) {
 	m4x4[9] = m4x3[5];
 	m4x4[10] = m4x3[8];
 	m4x4[11] = m4x3[11];
+#endif
 }

 inline void Transpose4x4(float out[16], const float in[16]) {