Remove the assembly implementation of 4x4 matrix mul, in favor of intrinsics.

2025-02-18 21:27:52 +00:00 · 2022-09-05 10:20:35 +02:00 · 2022-09-05 10:20:35 +02:00 · ed8c4e8758
commit ed8c4e8758
parent 0126bc181c
16 changed files with 103 additions and 165 deletions
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -626,10 +626,7 @@ add_library(Common STATIC
 	Common/Input/KeyCodes.h
 	Common/Input/InputState.cpp
 	Common/Input/InputState.h
 	Common/Math/fast/fast_math.c
 	Common/Math/fast/fast_matrix.c
 	Common/Math/fast/fast_matrix_neon.S
 	Common/Math/fast/fast_matrix_sse.c
 	Common/Math/curves.cpp
 	Common/Math/curves.h
 	Common/Math/expression_parser.cpp
--- a/Common/Common.vcxproj
+++ b/Common/Common.vcxproj
@ -452,7 +452,6 @@
    <ClInclude Include="Input\KeyCodes.h" />
    <ClInclude Include="Math\curves.h" />
    <ClInclude Include="Math\expression_parser.h" />
    <ClInclude Include="Math\fast\fast_math.h" />
    <ClInclude Include="Math\fast\fast_matrix.h" />
    <ClInclude Include="Math\geom2d.h" />
    <ClInclude Include="Math\lin\matrix4x4.h" />
@ -873,9 +872,7 @@
    <ClCompile Include="Log.cpp" />
    <ClCompile Include="Math\curves.cpp" />
    <ClCompile Include="Math\expression_parser.cpp" />
    <ClCompile Include="Math\fast\fast_math.c" />
    <ClCompile Include="Math\fast\fast_matrix.c" />
    <ClCompile Include="Math\fast\fast_matrix_sse.c" />
    <ClCompile Include="Math\lin\matrix4x4.cpp" />
    <ClCompile Include="Math\lin\vec3.cpp" />
    <ClCompile Include="Math\math_util.cpp" />
@ -992,7 +989,6 @@
    </ProjectReference>
  </ItemGroup>
  <ItemGroup>
    <None Include="Math\fast\fast_matrix_neon.S" />
    <None Include="Math\lin\matrix_neon.s" />
  </ItemGroup>
  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
--- a/Common/Common.vcxproj.filters
+++ b/Common/Common.vcxproj.filters
@ -161,9 +161,6 @@
    <ClInclude Include="Math\lin\vec3.h">
      <Filter>Math\lin</Filter>
    </ClInclude>
    <ClInclude Include="Math\fast\fast_math.h">
      <Filter>Math\fast</Filter>
    </ClInclude>
    <ClInclude Include="Math\fast\fast_matrix.h">
      <Filter>Math\fast</Filter>
    </ClInclude>
@ -568,15 +565,9 @@
    <ClCompile Include="Math\lin\vec3.cpp">
      <Filter>Math\lin</Filter>
    </ClCompile>
    <ClCompile Include="Math\fast\fast_math.c">
      <Filter>Math\fast</Filter>
    </ClCompile>
    <ClCompile Include="Math\fast\fast_matrix.c">
      <Filter>Math\fast</Filter>
    </ClCompile>
    <ClCompile Include="Math\fast\fast_matrix_sse.c">
      <Filter>Math\fast</Filter>
    </ClCompile>
    <ClCompile Include="Data\Format\RIFF.cpp">
      <Filter>Data\Format</Filter>
    </ClCompile>
@ -914,8 +905,5 @@
    <None Include="Math\lin\matrix_neon.s">
      <Filter>Math\lin</Filter>
    </None>
    <None Include="Math\fast\fast_matrix_neon.S">
      <Filter>Math\fast</Filter>
    </None>
  </ItemGroup>
 </Project>
--- a/Common/Math/fast/fast_math.c
+++ b/Common/Math/fast/fast_math.c
@ -1,12 +0,0 @@
 #include "ppsspp_config.h"
 #include "fast_math.h"
 #include "fast_matrix.h"
 void InitFastMath() {
 #ifndef _MSC_VER
 #if PPSSPP_ARCH(ARM_NEON) && !PPSSPP_ARCH(ARM64)
 		fast_matrix_mul_4x4 = &fast_matrix_mul_4x4_neon;
 #endif
 #endif
 }
--- a/Common/Math/fast/fast_math.h
+++ b/Common/Math/fast/fast_math.h
@ -1,21 +0,0 @@
 #pragma once
 #ifdef __cplusplus
 extern "C" {
 #endif
 // Fast Math
 // A mini library of math kernels. These should be large enough to be worth calling
 // as functions, and generic enough to fit in the "native" library (not PSP specific stuff).
 // NEON versions are dynamically selected at runtime, when you call InitFastMath.
 // SSE versions are hard linked at compile time.
 // See fast_matrix.h for the first set of functions.
 void InitFastMath();
 #ifdef __cplusplus
 }
 #endif
--- a/Common/Math/fast/fast_matrix.c
+++ b/Common/Math/fast/fast_matrix.c
@ -1,6 +1,100 @@
-#include "fast_math.h"
+#include "ppsspp_config.h"
 #include "fast_matrix.h"
 #if PPSSPP_ARCH(X86) || PPSSPP_ARCH(AMD64)
 #include <emmintrin.h>
 #include "fast_matrix.h"
 void fast_matrix_mul_4x4_sse(float *dest, const float *a, const float *b) {
 	int i;
 	__m128 a_col_1 = _mm_loadu_ps(a);
 	__m128 a_col_2 = _mm_loadu_ps(&a[4]);
 	__m128 a_col_3 = _mm_loadu_ps(&a[8]);
 	__m128 a_col_4 = _mm_loadu_ps(&a[12]);
 	for (i = 0; i < 16; i += 4) {
 		__m128 r_col = _mm_mul_ps(a_col_1, _mm_set1_ps(b[i]));
 		r_col = _mm_add_ps(r_col, _mm_mul_ps(a_col_2, _mm_set1_ps(b[i + 1])));
 		r_col = _mm_add_ps(r_col, _mm_mul_ps(a_col_3, _mm_set1_ps(b[i + 2])));
 		r_col = _mm_add_ps(r_col, _mm_mul_ps(a_col_4, _mm_set1_ps(b[i + 3])));
 		_mm_storeu_ps(&dest[i], r_col);
 	}
 }
 #elif PPSSPP_ARCH(ARM_NEON)
 #if defined(_MSC_VER) && PPSSPP_ARCH(ARM64)
 #include <arm64_neon.h>
 #else
 #include <arm_neon.h>
 #endif
 // From https://developer.arm.com/documentation/102467/0100/Matrix-multiplication-example
 void fast_matrix_mul_4x4_neon(float *C, const float *A, const float *B) {
 	// these are the columns A
 	float32x4_t A0;
 	float32x4_t A1;
 	float32x4_t A2;
 	float32x4_t A3;
 	// these are the columns B
 	float32x4_t B0;
 	float32x4_t B1;
 	float32x4_t B2;
 	float32x4_t B3;
 	// these are the columns C
 	float32x4_t C0;
 	float32x4_t C1;
 	float32x4_t C2;
 	float32x4_t C3;
 	A0 = vld1q_f32(A);
 	A1 = vld1q_f32(A + 4);
 	A2 = vld1q_f32(A + 8);
 	A3 = vld1q_f32(A + 12);
 	// Zero accumulators for C values
 	C0 = vmovq_n_f32(0);
 	C1 = vmovq_n_f32(0);
 	C2 = vmovq_n_f32(0);
 	C3 = vmovq_n_f32(0);
 	// Multiply accumulate in 4x1 blocks, i.e. each column in C
 	B0 = vld1q_f32(B);
 	C0 = vfmaq_laneq_f32(C0, A0, B0, 0);
 	C0 = vfmaq_laneq_f32(C0, A1, B0, 1);
 	C0 = vfmaq_laneq_f32(C0, A2, B0, 2);
 	C0 = vfmaq_laneq_f32(C0, A3, B0, 3);
 	vst1q_f32(C, C0);
 	B1 = vld1q_f32(B + 4);
 	C1 = vfmaq_laneq_f32(C1, A0, B1, 0);
 	C1 = vfmaq_laneq_f32(C1, A1, B1, 1);
 	C1 = vfmaq_laneq_f32(C1, A2, B1, 2);
 	C1 = vfmaq_laneq_f32(C1, A3, B1, 3);
 	vst1q_f32(C + 4, C1);
 	B2 = vld1q_f32(B + 8);
 	C2 = vfmaq_laneq_f32(C2, A0, B2, 0);
 	C2 = vfmaq_laneq_f32(C2, A1, B2, 1);
 	C2 = vfmaq_laneq_f32(C2, A2, B2, 2);
 	C2 = vfmaq_laneq_f32(C2, A3, B2, 3);
 	vst1q_f32(C + 8, C2);
 	B3 = vld1q_f32(B + 12);
 	C3 = vfmaq_laneq_f32(C3, A0, B3, 0);
 	C3 = vfmaq_laneq_f32(C3, A1, B3, 1);
 	C3 = vfmaq_laneq_f32(C3, A2, B3, 2);
 	C3 = vfmaq_laneq_f32(C3, A3, B3, 3);
 	vst1q_f32(C + 12, C3);
 }
 #else
 #define xx 0
 #define xy 1
 #define xz 2
@ -40,6 +134,4 @@ void fast_matrix_mul_4x4_c(float *dest, const float *a, const float *b) {
 	dest[ww] = b[wx] * a[xw] + b[wy] * a[yw] + b[wz] * a[zw] + b[ww] * a[ww];
 }
-#ifndef fast_matrix_mul_4x4
+#endif
 fptr_fast_matrix_mul_4x4 fast_matrix_mul_4x4 = &fast_matrix_mul_4x4_c;
 #endif
--- a/Common/Math/fast/fast_matrix.h
+++ b/Common/Math/fast/fast_matrix.h
@ -6,11 +6,8 @@
 extern "C" {
 #endif
-// A mini library of matrix math kernels.
+// A mini library of 4x4 matrix muls.
 // TODO: Really need to wrap this block in a macro or something, will get repetitive.
 typedef void(*fptr_fast_matrix_mul_4x4)(float *dest, const float *a, const float *b);
 extern void fast_matrix_mul_4x4_c(float *dest, const float *a, const float *b);
 extern void fast_matrix_mul_4x4_neon(float *dest, const float *a, const float *b);
 extern void fast_matrix_mul_4x4_sse(float *dest, const float *a, const float *b);
@ -18,13 +15,12 @@ extern void fast_matrix_mul_4x4_sse(float *dest, const float *a, const float *b)
 #if PPSSPP_ARCH(X86) || PPSSPP_ARCH(AMD64)
 // Hard link to SSE implementations on x86/amd64
 #define fast_matrix_mul_4x4 fast_matrix_mul_4x4_sse
-#elif PPSSPP_ARCH(ARM64)
+#elif PPSSPP_ARCH(ARM_NEON)
-#define fast_matrix_mul_4x4 fast_matrix_mul_4x4_c
+#define fast_matrix_mul_4x4 fast_matrix_mul_4x4_neon
 #else
-extern fptr_fast_matrix_mul_4x4 fast_matrix_mul_4x4;
+#define fast_matrix_mul_4x4 fast_matrix_mul_4x4_c
 #endif
 #ifdef __cplusplus
 }  // extern "C"
 #endif
--- a/Common/Math/fast/fast_matrix_neon.S
+++ b/Common/Math/fast/fast_matrix_neon.S
@ -1,54 +0,0 @@
 #include "ppsspp_config.h"
 #if PPSSPP_ARCH(ARM_NEON) && !PPSSPP_ARCH(ARM64)
 	.syntax unified		// Allow both ARM and Thumb-2 instructions
 	.text
 	.align 2		// Align the function code to a 4-byte (2^n) word boundary.
 	.arm			// Use ARM instructions instead of Thumb.
@ From ARM samples
@
@ matrix_mul_float:
@ Calculate 4x4 (matrix 0) * (matrix 1) and store to result 4x4 matrix.
@  matrix 0, matrix 1 and result pointers can be the same,
@  ie. my_matrix = my_matrix * my_matrix is possible.
@
@ r0 = pointer to 4x4 result matrix, single precision floats, column major order
@ r1 = pointer to 4x4 matrix 0, single precision floats, column major order
@ r2 = pointer to 4x4 matrix 1, single precision floats, column major order
@
 	.globl _fast_matrix_mul_4x4_neon
 _fast_matrix_mul_4x4_neon:
 	.globl fast_matrix_mul_4x4_neon
 fast_matrix_mul_4x4_neon:
    vld1.32     {d16-d19}, [r1]!            @ load first eight elements of matrix 0
    vld1.32     {d20-d23}, [r1]!            @ load second eight elements of matrix 0
    vld1.32     {d0-d3}, [r2]!              @ load first eight elements of matrix 1
    vld1.32     {d4-d7}, [r2]!              @ load second eight elements of matrix 1
    vmul.f32    q12, q8, d0[0]              @ rslt col0  = (mat0 col0) * (mat1 col0 elt0)
    vmul.f32    q13, q8, d2[0]              @ rslt col1  = (mat0 col0) * (mat1 col1 elt0)
    vmul.f32    q14, q8, d4[0]              @ rslt col2  = (mat0 col0) * (mat1 col2 elt0)
    vmul.f32    q15, q8, d6[0]              @ rslt col3  = (mat0 col0) * (mat1 col3 elt0)
    vmla.f32    q12, q9, d0[1]              @ rslt col0 += (mat0 col1) * (mat1 col0 elt1)
    vmla.f32    q13, q9, d2[1]              @ rslt col1 += (mat0 col1) * (mat1 col1 elt1)
    vmla.f32    q14, q9, d4[1]              @ rslt col2 += (mat0 col1) * (mat1 col2 elt1)
    vmla.f32    q15, q9, d6[1]              @ rslt col3 += (mat0 col1) * (mat1 col3 elt1)
    vmla.f32    q12, q10, d1[0]             @ rslt col0 += (mat0 col2) * (mat1 col0 elt2)
    vmla.f32    q13, q10, d3[0]             @ rslt col1 += (mat0 col2) * (mat1 col1 elt2)
    vmla.f32    q14, q10, d5[0]             @ rslt col2 += (mat0 col2) * (mat1 col2 elt2)
    vmla.f32    q15, q10, d7[0]             @ rslt col3 += (mat0 col2) * (mat1 col2 elt2)
    vmla.f32    q12, q11, d1[1]             @ rslt col0 += (mat0 col3) * (mat1 col0 elt3)
    vmla.f32    q13, q11, d3[1]             @ rslt col1 += (mat0 col3) * (mat1 col1 elt3)
    vmla.f32    q14, q11, d5[1]             @ rslt col2 += (mat0 col3) * (mat1 col2 elt3)
    vmla.f32    q15, q11, d7[1]             @ rslt col3 += (mat0 col3) * (mat1 col3 elt3)
    vst1.32     {d24-d27}, [r0]!            @ store first eight elements of result
    vst1.32     {d28-d31}, [r0]!            @ store second eight elements of result
 	bx lr
 #endif
--- a/Common/Math/fast/fast_matrix_sse.c
+++ b/Common/Math/fast/fast_matrix_sse.c
@ -1,25 +0,0 @@
 #include "ppsspp_config.h"
 #if PPSSPP_ARCH(X86) || PPSSPP_ARCH(AMD64)
 #include <emmintrin.h>
 #include "fast_matrix.h"
 void fast_matrix_mul_4x4_sse(float *dest, const float *a, const float *b) {
 	int i;
 	__m128 a_col_1 = _mm_loadu_ps(a);
 	__m128 a_col_2 = _mm_loadu_ps(&a[4]);
 	__m128 a_col_3 = _mm_loadu_ps(&a[8]);
 	__m128 a_col_4 = _mm_loadu_ps(&a[12]);
 	for (i = 0; i < 16; i += 4) {
 		__m128 r_col = _mm_mul_ps(a_col_1, _mm_set1_ps(b[i]));
 		r_col = _mm_add_ps(r_col, _mm_mul_ps(a_col_2, _mm_set1_ps(b[i + 1])));
 		r_col = _mm_add_ps(r_col, _mm_mul_ps(a_col_3, _mm_set1_ps(b[i + 2])));
 		r_col = _mm_add_ps(r_col, _mm_mul_ps(a_col_4, _mm_set1_ps(b[i + 3])));
 		_mm_storeu_ps(&dest[i], r_col);
 	}
 }
 #endif
--- a/GPU/Common/DrawEngineCommon.cpp
+++ b/GPU/Common/DrawEngineCommon.cpp
@ -234,6 +234,7 @@ bool DrawEngineCommon::TestBoundingBox(const void* control_points, int vertexCou
 	float worldviewproj[16];
 	ConvertMatrix4x3To4x4(world, gstate.worldMatrix);
 	ConvertMatrix4x3To4x4(view, gstate.viewMatrix);
 	// TODO: Create a Matrix4x3ByMatrix4x3, and Matrix4x4ByMatrix4x3?
 	Matrix4ByMatrix4(worldview, world, view);
 	Matrix4ByMatrix4(worldviewproj, worldview, gstate.projMatrix);
 	PlanesFromMatrix(worldviewproj, planes);
--- a/GPU/Common/GPUStateUtils.cpp
+++ b/GPU/Common/GPUStateUtils.cpp
@ -1559,7 +1559,7 @@ void ConvertStencilFuncState(GenericStencilFuncState &state) {
 }
 void GenericMaskState::Log() {
-	WARN_LOG(G3D, "Mask: %01X readfb=%d", uniformMask, channelMask, applyFramebufferRead);
+	WARN_LOG(G3D, "Mask: %08x %01X readfb=%d", uniformMask, channelMask, applyFramebufferRead);
 }
 void GenericBlendState::Log() {
--- a/UI/NativeApp.cpp
+++ b/UI/NativeApp.cpp
@ -62,7 +62,6 @@
 #include "Common/System/NativeApp.h"
 #include "Common/Data/Text/I18n.h"
 #include "Common/Input/InputState.h"
 #include "Common/Math/fast/fast_math.h"
 #include "Common/Math/math_util.h"
 #include "Common/Math/lin/matrix4x4.h"
 #include "Common/Profiler/Profiler.h"
@ -463,7 +462,6 @@ void NativeInit(int argc, const char *argv[], const char *savegame_dir, const ch
 	ShaderTranslationInit();
 	InitFastMath();
 	g_threadManager.Init(cpu_info.num_cores, cpu_info.logical_cpu_count);
 	g_Discord.SetPresenceMenu();
--- a/UWP/CommonUWP/CommonUWP.vcxproj
+++ b/UWP/CommonUWP/CommonUWP.vcxproj
@ -432,7 +432,6 @@
    <ClInclude Include="..\..\Common\Input\KeyCodes.h" />
    <ClInclude Include="..\..\Common\Math\curves.h" />
    <ClInclude Include="..\..\Common\Math\expression_parser.h" />
    <ClInclude Include="..\..\Common\Math\fast\fast_math.h" />
    <ClInclude Include="..\..\Common\Math\fast\fast_matrix.h" />
    <ClInclude Include="..\..\Common\Math\geom2d.h" />
    <ClInclude Include="..\..\Common\Math\lin\matrix4x4.h" />
@ -555,9 +554,7 @@
    <ClCompile Include="..\..\Common\Input\InputState.cpp" />
    <ClCompile Include="..\..\Common\Math\curves.cpp" />
    <ClCompile Include="..\..\Common\Math\expression_parser.cpp" />
    <ClCompile Include="..\..\Common\Math\fast\fast_math.c" />
    <ClCompile Include="..\..\Common\Math\fast\fast_matrix.c" />
    <ClCompile Include="..\..\Common\Math\fast\fast_matrix_sse.c" />
    <ClCompile Include="..\..\Common\Math\lin\matrix4x4.cpp" />
    <ClCompile Include="..\..\Common\Math\lin\vec3.cpp" />
    <ClCompile Include="..\..\Common\Math\math_util.cpp" />
--- a/UWP/CommonUWP/CommonUWP.vcxproj.filters
+++ b/UWP/CommonUWP/CommonUWP.vcxproj.filters
@ -225,15 +225,9 @@
    <ClCompile Include="..\..\Common\Math\math_util.cpp">
      <Filter>Math</Filter>
    </ClCompile>
    <ClCompile Include="..\..\Common\Math\fast\fast_math.c">
      <Filter>Math\fast</Filter>
    </ClCompile>
    <ClCompile Include="..\..\Common\Math\fast\fast_matrix.c">
      <Filter>Math\fast</Filter>
    </ClCompile>
    <ClCompile Include="..\..\Common\Math\fast\fast_matrix_sse.c">
      <Filter>Math\fast</Filter>
    </ClCompile>
    <ClCompile Include="..\..\Common\Math\lin\matrix4x4.cpp">
      <Filter>Math\lin</Filter>
    </ClCompile>
@ -529,9 +523,6 @@
    <ClInclude Include="..\..\Common\Math\math_util.h">
      <Filter>Math</Filter>
    </ClInclude>
    <ClInclude Include="..\..\Common\Math\fast\fast_math.h">
      <Filter>Math\fast</Filter>
    </ClInclude>
    <ClInclude Include="..\..\Common\Math\fast\fast_matrix.h">
      <Filter>Math\fast</Filter>
    </ClInclude>
--- a/android/jni/Android.mk
+++ b/android/jni/Android.mk
@ -11,19 +11,16 @@ ARCH_FILES := \
  $(SRC)/Common/ABI.cpp \
  $(SRC)/Common/x64Emitter.cpp \
  $(SRC)/Common/x64Analyzer.cpp \
  $(SRC)/Common/Math/fast/fast_matrix_sse.c \
  $(SRC)/Common/Thunk.cpp
 else ifeq ($(TARGET_ARCH_ABI),x86_64)
 ARCH_FILES := \
  $(SRC)/Common/ABI.cpp \
  $(SRC)/Common/x64Emitter.cpp \
  $(SRC)/Common/x64Analyzer.cpp \
  $(SRC)/Common/Math/fast/fast_matrix_sse.c \
  $(SRC)/Common/Thunk.cpp
 else ifeq ($(findstring armeabi-v7a,$(TARGET_ARCH_ABI)),armeabi-v7a)
 ARCH_FILES := \
  $(SRC)/Common/ArmEmitter.cpp \
  $(SRC)/Common/Math/fast/fast_matrix_neon.S.neon \
  $(SRC)/ext/disarm.cpp \
  $(SRC)/ext/libpng17/arm/arm_init.c \
  $(SRC)/ext/libpng17/arm/filter_neon_intrinsics.c \
@ -164,7 +161,6 @@ EXEC_AND_LIB_FILES := \
  $(SRC)/Common/Render/Text/draw_text_android.cpp \
  $(SRC)/Common/Input/GestureDetector.cpp \
  $(SRC)/Common/Input/InputState.cpp \
  $(SRC)/Common/Math/fast/fast_math.c \
  $(SRC)/Common/Math/fast/fast_matrix.c \
  $(SRC)/Common/Math/math_util.cpp \
  $(SRC)/Common/Math/curves.cpp \
--- a/libretro/Makefile.common
+++ b/libretro/Makefile.common
@ -163,7 +163,6 @@ ifeq ($(PLATFORM_EXT), android)
 	endif
 	SOURCES_C +=\
 		$(COMMONDIR)/Math/fast/fast_math.c \
 		$(COMMONDIR)/Math/fast/fast_matrix.c
 endif
@ -633,8 +632,7 @@ ifeq ($(WITH_DYNAREC),1)
 			SOURCES_C += $(EXTDIR)/libpng17/arm/arm_init.c \
 				     $(EXTDIR)/libpng17/arm/filter_neon_intrinsics.c
-			ASMFILES   += $(COMMONDIR)/Math/fast/fast_matrix_neon.S \
+			ASMFILES   += $(EXTDIR)/libpng17/arm/filter_neon.S
 				      $(EXTDIR)/libpng17/arm/filter_neon.S
 		endif
 	else ifeq ($(TARGET_ARCH),arm64)
      DYNAFLAGS += -D_ARCH_64