Remove the assembly implementation of 4x4 matrix mul, in favor of intrinsics.

2024-11-26 23:10:38 +00:00 · 2022-09-05 10:20:35 +02:00 · 2022-09-05 10:20:35 +02:00 · ed8c4e8758
commit ed8c4e8758
parent 0126bc181c
16 changed files with 103 additions and 165 deletions
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -626,10 +626,7 @@ add_library(Common STATIC
 	Common/Input/KeyCodes.h
 	Common/Input/InputState.cpp
 	Common/Input/InputState.h
-	Common/Math/fast/fast_math.c
 	Common/Math/fast/fast_matrix.c
-	Common/Math/fast/fast_matrix_neon.S
-	Common/Math/fast/fast_matrix_sse.c
 	Common/Math/curves.cpp
 	Common/Math/curves.h
 	Common/Math/expression_parser.cpp
--- a/Common/Common.vcxproj
+++ b/Common/Common.vcxproj
@ -452,7 +452,6 @@
    <ClInclude Include="Input\KeyCodes.h" />
    <ClInclude Include="Math\curves.h" />
    <ClInclude Include="Math\expression_parser.h" />
-    <ClInclude Include="Math\fast\fast_math.h" />
    <ClInclude Include="Math\fast\fast_matrix.h" />
    <ClInclude Include="Math\geom2d.h" />
    <ClInclude Include="Math\lin\matrix4x4.h" />
@ -873,9 +872,7 @@
    <ClCompile Include="Log.cpp" />
    <ClCompile Include="Math\curves.cpp" />
    <ClCompile Include="Math\expression_parser.cpp" />
-    <ClCompile Include="Math\fast\fast_math.c" />
    <ClCompile Include="Math\fast\fast_matrix.c" />
-    <ClCompile Include="Math\fast\fast_matrix_sse.c" />
    <ClCompile Include="Math\lin\matrix4x4.cpp" />
    <ClCompile Include="Math\lin\vec3.cpp" />
    <ClCompile Include="Math\math_util.cpp" />
@ -992,7 +989,6 @@
    </ProjectReference>
  </ItemGroup>
  <ItemGroup>
-    <None Include="Math\fast\fast_matrix_neon.S" />
    <None Include="Math\lin\matrix_neon.s" />
  </ItemGroup>
  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
--- a/Common/Common.vcxproj.filters
+++ b/Common/Common.vcxproj.filters
@ -161,9 +161,6 @@
    <ClInclude Include="Math\lin\vec3.h">
      <Filter>Math\lin</Filter>
    </ClInclude>
-    <ClInclude Include="Math\fast\fast_math.h">
-      <Filter>Math\fast</Filter>
-    </ClInclude>
    <ClInclude Include="Math\fast\fast_matrix.h">
      <Filter>Math\fast</Filter>
    </ClInclude>
@ -568,15 +565,9 @@
    <ClCompile Include="Math\lin\vec3.cpp">
      <Filter>Math\lin</Filter>
    </ClCompile>
-    <ClCompile Include="Math\fast\fast_math.c">
-      <Filter>Math\fast</Filter>
-    </ClCompile>
    <ClCompile Include="Math\fast\fast_matrix.c">
      <Filter>Math\fast</Filter>
    </ClCompile>
-    <ClCompile Include="Math\fast\fast_matrix_sse.c">
-      <Filter>Math\fast</Filter>
-    </ClCompile>
    <ClCompile Include="Data\Format\RIFF.cpp">
      <Filter>Data\Format</Filter>
    </ClCompile>
@ -914,8 +905,5 @@
    <None Include="Math\lin\matrix_neon.s">
      <Filter>Math\lin</Filter>
    </None>
-    <None Include="Math\fast\fast_matrix_neon.S">
-      <Filter>Math\fast</Filter>
-    </None>
  </ItemGroup>
 </Project>
--- a/Common/Math/fast/fast_math.c
+++ b/Common/Math/fast/fast_math.c
@ -1,12 +0,0 @@
-#include "ppsspp_config.h"
-
-#include "fast_math.h"
-#include "fast_matrix.h"
-
-void InitFastMath() {
-#ifndef _MSC_VER
-#if PPSSPP_ARCH(ARM_NEON) && !PPSSPP_ARCH(ARM64)
-		fast_matrix_mul_4x4 = &fast_matrix_mul_4x4_neon;
-#endif
-#endif
-}
--- a/Common/Math/fast/fast_math.h
+++ b/Common/Math/fast/fast_math.h
@ -1,21 +0,0 @@
-#pragma once
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-// Fast Math
-// A mini library of math kernels. These should be large enough to be worth calling
-// as functions, and generic enough to fit in the "native" library (not PSP specific stuff).
-
-// NEON versions are dynamically selected at runtime, when you call InitFastMath.
-
-// SSE versions are hard linked at compile time.
-
-// See fast_matrix.h for the first set of functions.
-
-void InitFastMath();
-
-#ifdef __cplusplus
-}
-#endif
--- a/Common/Math/fast/fast_matrix.c
+++ b/Common/Math/fast/fast_matrix.c
@ -1,6 +1,100 @@
-#include "fast_math.h"
+#include "ppsspp_config.h"
+
 #include "fast_matrix.h"

+#if PPSSPP_ARCH(X86) || PPSSPP_ARCH(AMD64)
+
+#include <emmintrin.h>
+
+#include "fast_matrix.h"
+
+void fast_matrix_mul_4x4_sse(float *dest, const float *a, const float *b) {
+	const __m128 col0 = _mm_loadu_ps(a);       // the four 4-float groups of a are
+	const __m128 col1 = _mm_loadu_ps(a + 4);   // read once, before anything is
+	const __m128 col2 = _mm_loadu_ps(a + 8);   // written to dest
+	const __m128 col3 = _mm_loadu_ps(a + 12);
+	int c;
+	for (c = 0; c < 4; c++) {
+		const float *bc = b + 4 * c;  // 4-float group c of b, scales the groups of a
+		__m128 acc = _mm_mul_ps(col0, _mm_set1_ps(bc[0]));
+		acc = _mm_add_ps(acc, _mm_mul_ps(col1, _mm_set1_ps(bc[1])));
+		acc = _mm_add_ps(acc, _mm_mul_ps(col2, _mm_set1_ps(bc[2])));
+		acc = _mm_add_ps(acc, _mm_mul_ps(col3, _mm_set1_ps(bc[3])));
+		_mm_storeu_ps(dest + 4 * c, acc);
+	}
+}
+
+#elif PPSSPP_ARCH(ARM_NEON)
+
+#if defined(_MSC_VER) && PPSSPP_ARCH(ARM64)
+#include <arm64_neon.h>
+#else
+#include <arm_neon.h>
+#endif
+
+// From https://developer.arm.com/documentation/102467/0100/Matrix-multiplication-example
+// Computes C = A * B for 4x4 float matrices laid out as four consecutive
+// 4-float columns. A is fully loaded up front and each column of B is loaded
+// before the matching column of C is stored.
+//
+// vfmaq_laneq_f32 is only provided on AArch64, while fast_matrix.h selects this
+// function for every PPSSPP_ARCH(ARM_NEON) target. 32-bit ARM therefore uses
+// vmlaq_n_f32 on the extracted lane instead (plain multiply-accumulate).
+#if PPSSPP_ARCH(ARM64)
+#define FAST_MATRIX_MLA_LANE(acc, col, vec, lane) vfmaq_laneq_f32((acc), (col), (vec), lane)
+#else
+#define FAST_MATRIX_MLA_LANE(acc, col, vec, lane) vmlaq_n_f32((acc), (col), vgetq_lane_f32((vec), lane))
+#endif
+
+void fast_matrix_mul_4x4_neon(float *C, const float *A, const float *B) {
+	// Columns of A, reused for every column of the result.
+	const float32x4_t A0 = vld1q_f32(A);
+	const float32x4_t A1 = vld1q_f32(A + 4);
+	const float32x4_t A2 = vld1q_f32(A + 8);
+	const float32x4_t A3 = vld1q_f32(A + 12);
+
+	float32x4_t Bn;  // current column of B
+	float32x4_t Cn;  // accumulator for the matching column of C
+
+	// Column 0: C0 = A0*B0[0] + A1*B0[1] + A2*B0[2] + A3*B0[3]
+	Bn = vld1q_f32(B);
+	Cn = vmovq_n_f32(0);
+	Cn = FAST_MATRIX_MLA_LANE(Cn, A0, Bn, 0);
+	Cn = FAST_MATRIX_MLA_LANE(Cn, A1, Bn, 1);
+	Cn = FAST_MATRIX_MLA_LANE(Cn, A2, Bn, 2);
+	Cn = FAST_MATRIX_MLA_LANE(Cn, A3, Bn, 3);
+	vst1q_f32(C, Cn);
+
+	// Column 1
+	Bn = vld1q_f32(B + 4);
+	Cn = vmovq_n_f32(0);
+	Cn = FAST_MATRIX_MLA_LANE(Cn, A0, Bn, 0);
+	Cn = FAST_MATRIX_MLA_LANE(Cn, A1, Bn, 1);
+	Cn = FAST_MATRIX_MLA_LANE(Cn, A2, Bn, 2);
+	Cn = FAST_MATRIX_MLA_LANE(Cn, A3, Bn, 3);
+	vst1q_f32(C + 4, Cn);
+
+	// Column 2
+	Bn = vld1q_f32(B + 8);
+	Cn = vmovq_n_f32(0);
+	Cn = FAST_MATRIX_MLA_LANE(Cn, A0, Bn, 0);
+	Cn = FAST_MATRIX_MLA_LANE(Cn, A1, Bn, 1);
+	Cn = FAST_MATRIX_MLA_LANE(Cn, A2, Bn, 2);
+	Cn = FAST_MATRIX_MLA_LANE(Cn, A3, Bn, 3);
+	vst1q_f32(C + 8, Cn);
+
+	// Column 3
+	Bn = vld1q_f32(B + 12);
+	Cn = vmovq_n_f32(0);
+	Cn = FAST_MATRIX_MLA_LANE(Cn, A0, Bn, 0);
+	Cn = FAST_MATRIX_MLA_LANE(Cn, A1, Bn, 1);
+	Cn = FAST_MATRIX_MLA_LANE(Cn, A2, Bn, 2);
+	Cn = FAST_MATRIX_MLA_LANE(Cn, A3, Bn, 3);
+	vst1q_f32(C + 12, Cn);
+}
+
+#else
+
 #define xx 0
 #define xy 1
 #define xz 2
@ -40,6 +134,4 @@ void fast_matrix_mul_4x4_c(float *dest, const float *a, const float *b) {
 	dest[ww] = b[wx] * a[xw] + b[wy] * a[yw] + b[wz] * a[zw] + b[ww] * a[ww];
 }

-#ifndef fast_matrix_mul_4x4
-fptr_fast_matrix_mul_4x4 fast_matrix_mul_4x4 = &fast_matrix_mul_4x4_c;
-#endif
+#endif
--- a/Common/Math/fast/fast_matrix.h
+++ b/Common/Math/fast/fast_matrix.h
@ -6,11 +6,8 @@
 extern "C" {
 #endif

-// A mini library of matrix math kernels.
+// A mini library of 4x4 matrix muls.

-// TODO: Really need to wrap this block in a macro or something, will get repetitive.
-
-typedef void(*fptr_fast_matrix_mul_4x4)(float *dest, const float *a, const float *b);
 extern void fast_matrix_mul_4x4_c(float *dest, const float *a, const float *b);
 extern void fast_matrix_mul_4x4_neon(float *dest, const float *a, const float *b);
 extern void fast_matrix_mul_4x4_sse(float *dest, const float *a, const float *b);
@ -18,13 +15,12 @@ extern void fast_matrix_mul_4x4_sse(float *dest, const float *a, const float *b)
 #if PPSSPP_ARCH(X86) || PPSSPP_ARCH(AMD64)
 // Hard link to SSE implementations on x86/amd64
 #define fast_matrix_mul_4x4 fast_matrix_mul_4x4_sse
-#elif PPSSPP_ARCH(ARM64)
-#define fast_matrix_mul_4x4 fast_matrix_mul_4x4_c
+#elif PPSSPP_ARCH(ARM_NEON)
+#define fast_matrix_mul_4x4 fast_matrix_mul_4x4_neon
 #else
-extern fptr_fast_matrix_mul_4x4 fast_matrix_mul_4x4;
+#define fast_matrix_mul_4x4 fast_matrix_mul_4x4_c
 #endif

-
 #ifdef __cplusplus
 }  // extern "C"
 #endif
--- a/Common/Math/fast/fast_matrix_neon.S
+++ b/Common/Math/fast/fast_matrix_neon.S
@ -1,54 +0,0 @@
-#include "ppsspp_config.h"
-#if PPSSPP_ARCH(ARM_NEON) && !PPSSPP_ARCH(ARM64)
-
-	.syntax unified		// Allow both ARM and Thumb-2 instructions
-	.text
-	.align 2		// Align the function code to a 4-byte (2^n) word boundary.
-	.arm			// Use ARM instructions instead of Thumb.
-
-@ From ARM samples
-@
-@ matrix_mul_float:
-@ Calculate 4x4 (matrix 0) * (matrix 1) and store to result 4x4 matrix.
-@  matrix 0, matrix 1 and result pointers can be the same,
-@  ie. my_matrix = my_matrix * my_matrix is possible.
-@
-@ r0 = pointer to 4x4 result matrix, single precision floats, column major order
-@ r1 = pointer to 4x4 matrix 0, single precision floats, column major order
-@ r2 = pointer to 4x4 matrix 1, single precision floats, column major order
-@
-
-	.globl _fast_matrix_mul_4x4_neon
-_fast_matrix_mul_4x4_neon:
-	.globl fast_matrix_mul_4x4_neon
-fast_matrix_mul_4x4_neon:
-    vld1.32     {d16-d19}, [r1]!            @ load first eight elements of matrix 0
-    vld1.32     {d20-d23}, [r1]!            @ load second eight elements of matrix 0
-    vld1.32     {d0-d3}, [r2]!              @ load first eight elements of matrix 1
-    vld1.32     {d4-d7}, [r2]!              @ load second eight elements of matrix 1
-
-    vmul.f32    q12, q8, d0[0]              @ rslt col0  = (mat0 col0) * (mat1 col0 elt0)
-    vmul.f32    q13, q8, d2[0]              @ rslt col1  = (mat0 col0) * (mat1 col1 elt0)
-    vmul.f32    q14, q8, d4[0]              @ rslt col2  = (mat0 col0) * (mat1 col2 elt0)
-    vmul.f32    q15, q8, d6[0]              @ rslt col3  = (mat0 col0) * (mat1 col3 elt0)
-
-    vmla.f32    q12, q9, d0[1]              @ rslt col0 += (mat0 col1) * (mat1 col0 elt1)
-    vmla.f32    q13, q9, d2[1]              @ rslt col1 += (mat0 col1) * (mat1 col1 elt1)
-    vmla.f32    q14, q9, d4[1]              @ rslt col2 += (mat0 col1) * (mat1 col2 elt1)
-    vmla.f32    q15, q9, d6[1]              @ rslt col3 += (mat0 col1) * (mat1 col3 elt1)
-
-    vmla.f32    q12, q10, d1[0]             @ rslt col0 += (mat0 col2) * (mat1 col0 elt2)
-    vmla.f32    q13, q10, d3[0]             @ rslt col1 += (mat0 col2) * (mat1 col1 elt2)
-    vmla.f32    q14, q10, d5[0]             @ rslt col2 += (mat0 col2) * (mat1 col2 elt2)
-    vmla.f32    q15, q10, d7[0]             @ rslt col3 += (mat0 col2) * (mat1 col2 elt2)
-
-    vmla.f32    q12, q11, d1[1]             @ rslt col0 += (mat0 col3) * (mat1 col0 elt3)
-    vmla.f32    q13, q11, d3[1]             @ rslt col1 += (mat0 col3) * (mat1 col1 elt3)
-    vmla.f32    q14, q11, d5[1]             @ rslt col2 += (mat0 col3) * (mat1 col2 elt3)
-    vmla.f32    q15, q11, d7[1]             @ rslt col3 += (mat0 col3) * (mat1 col3 elt3)
-
-    vst1.32     {d24-d27}, [r0]!            @ store first eight elements of result
-    vst1.32     {d28-d31}, [r0]!            @ store second eight elements of result
-	bx lr
-
-#endif
--- a/Common/Math/fast/fast_matrix_sse.c
+++ b/Common/Math/fast/fast_matrix_sse.c
@ -1,25 +0,0 @@
-#include "ppsspp_config.h"
-
-#if PPSSPP_ARCH(X86) || PPSSPP_ARCH(AMD64)
-
-#include <emmintrin.h>
-
-#include "fast_matrix.h"
-
-void fast_matrix_mul_4x4_sse(float *dest, const float *a, const float *b) {
-	int i;
-	__m128 a_col_1 = _mm_loadu_ps(a);
-	__m128 a_col_2 = _mm_loadu_ps(&a[4]);
-	__m128 a_col_3 = _mm_loadu_ps(&a[8]);
-	__m128 a_col_4 = _mm_loadu_ps(&a[12]);
-
-	for (i = 0; i < 16; i += 4) {
-		__m128 r_col = _mm_mul_ps(a_col_1, _mm_set1_ps(b[i]));
-		r_col = _mm_add_ps(r_col, _mm_mul_ps(a_col_2, _mm_set1_ps(b[i + 1])));
-		r_col = _mm_add_ps(r_col, _mm_mul_ps(a_col_3, _mm_set1_ps(b[i + 2])));
-		r_col = _mm_add_ps(r_col, _mm_mul_ps(a_col_4, _mm_set1_ps(b[i + 3])));
-		_mm_storeu_ps(&dest[i], r_col);
-	}
-}
-
-#endif
--- a/GPU/Common/DrawEngineCommon.cpp
+++ b/GPU/Common/DrawEngineCommon.cpp
@ -234,6 +234,7 @@ bool DrawEngineCommon::TestBoundingBox(const void* control_points, int vertexCou
 	float worldviewproj[16];
 	ConvertMatrix4x3To4x4(world, gstate.worldMatrix);
 	ConvertMatrix4x3To4x4(view, gstate.viewMatrix);
+	// TODO: Create a Matrix4x3ByMatrix4x3, and Matrix4x4ByMatrix4x3?
 	Matrix4ByMatrix4(worldview, world, view);
 	Matrix4ByMatrix4(worldviewproj, worldview, gstate.projMatrix);
 	PlanesFromMatrix(worldviewproj, planes);
--- a/GPU/Common/GPUStateUtils.cpp
+++ b/GPU/Common/GPUStateUtils.cpp
@ -1559,7 +1559,7 @@ void ConvertStencilFuncState(GenericStencilFuncState &state) {
 }

 void GenericMaskState::Log() {
-	WARN_LOG(G3D, "Mask: %01X readfb=%d", uniformMask, channelMask, applyFramebufferRead);
+	WARN_LOG(G3D, "Mask: %08x %01X readfb=%d", uniformMask, channelMask, applyFramebufferRead);
 }

 void GenericBlendState::Log() {
--- a/UI/NativeApp.cpp
+++ b/UI/NativeApp.cpp
@ -62,7 +62,6 @@
 #include "Common/System/NativeApp.h"
 #include "Common/Data/Text/I18n.h"
 #include "Common/Input/InputState.h"
-#include "Common/Math/fast/fast_math.h"
 #include "Common/Math/math_util.h"
 #include "Common/Math/lin/matrix4x4.h"
 #include "Common/Profiler/Profiler.h"
@ -463,7 +462,6 @@ void NativeInit(int argc, const char *argv[], const char *savegame_dir, const ch

 	ShaderTranslationInit();

-	InitFastMath();
 	g_threadManager.Init(cpu_info.num_cores, cpu_info.logical_cpu_count);

 	g_Discord.SetPresenceMenu();
--- a/UWP/CommonUWP/CommonUWP.vcxproj
+++ b/UWP/CommonUWP/CommonUWP.vcxproj
@ -432,7 +432,6 @@
    <ClInclude Include="..\..\Common\Input\KeyCodes.h" />
    <ClInclude Include="..\..\Common\Math\curves.h" />
    <ClInclude Include="..\..\Common\Math\expression_parser.h" />
-    <ClInclude Include="..\..\Common\Math\fast\fast_math.h" />
    <ClInclude Include="..\..\Common\Math\fast\fast_matrix.h" />
    <ClInclude Include="..\..\Common\Math\geom2d.h" />
    <ClInclude Include="..\..\Common\Math\lin\matrix4x4.h" />
@ -555,9 +554,7 @@
    <ClCompile Include="..\..\Common\Input\InputState.cpp" />
    <ClCompile Include="..\..\Common\Math\curves.cpp" />
    <ClCompile Include="..\..\Common\Math\expression_parser.cpp" />
-    <ClCompile Include="..\..\Common\Math\fast\fast_math.c" />
    <ClCompile Include="..\..\Common\Math\fast\fast_matrix.c" />
-    <ClCompile Include="..\..\Common\Math\fast\fast_matrix_sse.c" />
    <ClCompile Include="..\..\Common\Math\lin\matrix4x4.cpp" />
    <ClCompile Include="..\..\Common\Math\lin\vec3.cpp" />
    <ClCompile Include="..\..\Common\Math\math_util.cpp" />
--- a/UWP/CommonUWP/CommonUWP.vcxproj.filters
+++ b/UWP/CommonUWP/CommonUWP.vcxproj.filters
@ -225,15 +225,9 @@
    <ClCompile Include="..\..\Common\Math\math_util.cpp">
      <Filter>Math</Filter>
    </ClCompile>
-    <ClCompile Include="..\..\Common\Math\fast\fast_math.c">
-      <Filter>Math\fast</Filter>
-    </ClCompile>
    <ClCompile Include="..\..\Common\Math\fast\fast_matrix.c">
      <Filter>Math\fast</Filter>
    </ClCompile>
-    <ClCompile Include="..\..\Common\Math\fast\fast_matrix_sse.c">
-      <Filter>Math\fast</Filter>
-    </ClCompile>
    <ClCompile Include="..\..\Common\Math\lin\matrix4x4.cpp">
      <Filter>Math\lin</Filter>
    </ClCompile>
@ -529,9 +523,6 @@
    <ClInclude Include="..\..\Common\Math\math_util.h">
      <Filter>Math</Filter>
    </ClInclude>
-    <ClInclude Include="..\..\Common\Math\fast\fast_math.h">
-      <Filter>Math\fast</Filter>
-    </ClInclude>
    <ClInclude Include="..\..\Common\Math\fast\fast_matrix.h">
      <Filter>Math\fast</Filter>
    </ClInclude>
--- a/android/jni/Android.mk
+++ b/android/jni/Android.mk
@ -11,19 +11,16 @@ ARCH_FILES := \
  $(SRC)/Common/ABI.cpp \
  $(SRC)/Common/x64Emitter.cpp \
  $(SRC)/Common/x64Analyzer.cpp \
-  $(SRC)/Common/Math/fast/fast_matrix_sse.c \
  $(SRC)/Common/Thunk.cpp
 else ifeq ($(TARGET_ARCH_ABI),x86_64)
 ARCH_FILES := \
  $(SRC)/Common/ABI.cpp \
  $(SRC)/Common/x64Emitter.cpp \
  $(SRC)/Common/x64Analyzer.cpp \
-  $(SRC)/Common/Math/fast/fast_matrix_sse.c \
  $(SRC)/Common/Thunk.cpp
 else ifeq ($(findstring armeabi-v7a,$(TARGET_ARCH_ABI)),armeabi-v7a)
 ARCH_FILES := \
  $(SRC)/Common/ArmEmitter.cpp \
-  $(SRC)/Common/Math/fast/fast_matrix_neon.S.neon \
  $(SRC)/ext/disarm.cpp \
  $(SRC)/ext/libpng17/arm/arm_init.c \
  $(SRC)/ext/libpng17/arm/filter_neon_intrinsics.c \
@ -164,7 +161,6 @@ EXEC_AND_LIB_FILES := \
  $(SRC)/Common/Render/Text/draw_text_android.cpp \
  $(SRC)/Common/Input/GestureDetector.cpp \
  $(SRC)/Common/Input/InputState.cpp \
-  $(SRC)/Common/Math/fast/fast_math.c \
  $(SRC)/Common/Math/fast/fast_matrix.c \
  $(SRC)/Common/Math/math_util.cpp \
  $(SRC)/Common/Math/curves.cpp \
--- a/libretro/Makefile.common
+++ b/libretro/Makefile.common
@ -163,7 +163,6 @@ ifeq ($(PLATFORM_EXT), android)
 	endif

 	SOURCES_C +=\
-		$(COMMONDIR)/Math/fast/fast_math.c \
 		$(COMMONDIR)/Math/fast/fast_matrix.c
 endif

@ -633,8 +632,7 @@ ifeq ($(WITH_DYNAREC),1)

 			SOURCES_C += $(EXTDIR)/libpng17/arm/arm_init.c \
 				     $(EXTDIR)/libpng17/arm/filter_neon_intrinsics.c
-			ASMFILES   += $(COMMONDIR)/Math/fast/fast_matrix_neon.S \
-				      $(EXTDIR)/libpng17/arm/filter_neon.S
+			ASMFILES   += $(EXTDIR)/libpng17/arm/filter_neon.S
 		endif
 	else ifeq ($(TARGET_ARCH),arm64)
      DYNAFLAGS += -D_ARCH_64