Remove the assembly implementation of 4x4 matrix mul, in favor of intrinsics.

This commit is contained in:
Henrik Rydgård 2022-09-05 10:20:35 +02:00
parent 0126bc181c
commit ed8c4e8758
16 changed files with 103 additions and 165 deletions

View File

@ -626,10 +626,7 @@ add_library(Common STATIC
Common/Input/KeyCodes.h
Common/Input/InputState.cpp
Common/Input/InputState.h
Common/Math/fast/fast_math.c
Common/Math/fast/fast_matrix.c
Common/Math/fast/fast_matrix_neon.S
Common/Math/fast/fast_matrix_sse.c
Common/Math/curves.cpp
Common/Math/curves.h
Common/Math/expression_parser.cpp

View File

@ -452,7 +452,6 @@
<ClInclude Include="Input\KeyCodes.h" />
<ClInclude Include="Math\curves.h" />
<ClInclude Include="Math\expression_parser.h" />
<ClInclude Include="Math\fast\fast_math.h" />
<ClInclude Include="Math\fast\fast_matrix.h" />
<ClInclude Include="Math\geom2d.h" />
<ClInclude Include="Math\lin\matrix4x4.h" />
@ -873,9 +872,7 @@
<ClCompile Include="Log.cpp" />
<ClCompile Include="Math\curves.cpp" />
<ClCompile Include="Math\expression_parser.cpp" />
<ClCompile Include="Math\fast\fast_math.c" />
<ClCompile Include="Math\fast\fast_matrix.c" />
<ClCompile Include="Math\fast\fast_matrix_sse.c" />
<ClCompile Include="Math\lin\matrix4x4.cpp" />
<ClCompile Include="Math\lin\vec3.cpp" />
<ClCompile Include="Math\math_util.cpp" />
@ -992,7 +989,6 @@
</ProjectReference>
</ItemGroup>
<ItemGroup>
<None Include="Math\fast\fast_matrix_neon.S" />
<None Include="Math\lin\matrix_neon.s" />
</ItemGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />

View File

@ -161,9 +161,6 @@
<ClInclude Include="Math\lin\vec3.h">
<Filter>Math\lin</Filter>
</ClInclude>
<ClInclude Include="Math\fast\fast_math.h">
<Filter>Math\fast</Filter>
</ClInclude>
<ClInclude Include="Math\fast\fast_matrix.h">
<Filter>Math\fast</Filter>
</ClInclude>
@ -568,15 +565,9 @@
<ClCompile Include="Math\lin\vec3.cpp">
<Filter>Math\lin</Filter>
</ClCompile>
<ClCompile Include="Math\fast\fast_math.c">
<Filter>Math\fast</Filter>
</ClCompile>
<ClCompile Include="Math\fast\fast_matrix.c">
<Filter>Math\fast</Filter>
</ClCompile>
<ClCompile Include="Math\fast\fast_matrix_sse.c">
<Filter>Math\fast</Filter>
</ClCompile>
<ClCompile Include="Data\Format\RIFF.cpp">
<Filter>Data\Format</Filter>
</ClCompile>
@ -914,8 +905,5 @@
<None Include="Math\lin\matrix_neon.s">
<Filter>Math\lin</Filter>
</None>
<None Include="Math\fast\fast_matrix_neon.S">
<Filter>Math\fast</Filter>
</None>
</ItemGroup>
</Project>

View File

@ -1,12 +0,0 @@
#include "ppsspp_config.h"
#include "fast_math.h"
#include "fast_matrix.h"
void InitFastMath() {
#ifndef _MSC_VER
#if PPSSPP_ARCH(ARM_NEON) && !PPSSPP_ARCH(ARM64)
fast_matrix_mul_4x4 = &fast_matrix_mul_4x4_neon;
#endif
#endif
}

View File

@ -1,21 +0,0 @@
#pragma once
#ifdef __cplusplus
extern "C" {
#endif
// Fast Math
// A mini library of math kernels. These should be large enough to be worth calling
// as functions, and generic enough to fit in the "native" library (not PSP specific stuff).
// NEON versions are dynamically selected at runtime, when you call InitFastMath.
// SSE versions are hard linked at compile time.
// See fast_matrix.h for the first set of functions.
void InitFastMath();
#ifdef __cplusplus
}
#endif

View File

@ -1,6 +1,100 @@
#include "fast_math.h"
#include "ppsspp_config.h"
#include "fast_matrix.h"
#if PPSSPP_ARCH(X86) || PPSSPP_ARCH(AMD64)
#include <emmintrin.h>
#include "fast_matrix.h"
void fast_matrix_mul_4x4_sse(float *dest, const float *a, const float *b) {
int i;
__m128 a_col_1 = _mm_loadu_ps(a);
__m128 a_col_2 = _mm_loadu_ps(&a[4]);
__m128 a_col_3 = _mm_loadu_ps(&a[8]);
__m128 a_col_4 = _mm_loadu_ps(&a[12]);
for (i = 0; i < 16; i += 4) {
__m128 r_col = _mm_mul_ps(a_col_1, _mm_set1_ps(b[i]));
r_col = _mm_add_ps(r_col, _mm_mul_ps(a_col_2, _mm_set1_ps(b[i + 1])));
r_col = _mm_add_ps(r_col, _mm_mul_ps(a_col_3, _mm_set1_ps(b[i + 2])));
r_col = _mm_add_ps(r_col, _mm_mul_ps(a_col_4, _mm_set1_ps(b[i + 3])));
_mm_storeu_ps(&dest[i], r_col);
}
}
#elif PPSSPP_ARCH(ARM_NEON)
#if defined(_MSC_VER) && PPSSPP_ARCH(ARM64)
#include <arm64_neon.h>
#else
#include <arm_neon.h>
#endif
// From https://developer.arm.com/documentation/102467/0100/Matrix-multiplication-example
void fast_matrix_mul_4x4_neon(float *C, const float *A, const float *B) {
// these are the columns A
float32x4_t A0;
float32x4_t A1;
float32x4_t A2;
float32x4_t A3;
// these are the columns B
float32x4_t B0;
float32x4_t B1;
float32x4_t B2;
float32x4_t B3;
// these are the columns C
float32x4_t C0;
float32x4_t C1;
float32x4_t C2;
float32x4_t C3;
A0 = vld1q_f32(A);
A1 = vld1q_f32(A + 4);
A2 = vld1q_f32(A + 8);
A3 = vld1q_f32(A + 12);
// Zero accumulators for C values
C0 = vmovq_n_f32(0);
C1 = vmovq_n_f32(0);
C2 = vmovq_n_f32(0);
C3 = vmovq_n_f32(0);
// Multiply accumulate in 4x1 blocks, i.e. each column in C
B0 = vld1q_f32(B);
C0 = vfmaq_laneq_f32(C0, A0, B0, 0);
C0 = vfmaq_laneq_f32(C0, A1, B0, 1);
C0 = vfmaq_laneq_f32(C0, A2, B0, 2);
C0 = vfmaq_laneq_f32(C0, A3, B0, 3);
vst1q_f32(C, C0);
B1 = vld1q_f32(B + 4);
C1 = vfmaq_laneq_f32(C1, A0, B1, 0);
C1 = vfmaq_laneq_f32(C1, A1, B1, 1);
C1 = vfmaq_laneq_f32(C1, A2, B1, 2);
C1 = vfmaq_laneq_f32(C1, A3, B1, 3);
vst1q_f32(C + 4, C1);
B2 = vld1q_f32(B + 8);
C2 = vfmaq_laneq_f32(C2, A0, B2, 0);
C2 = vfmaq_laneq_f32(C2, A1, B2, 1);
C2 = vfmaq_laneq_f32(C2, A2, B2, 2);
C2 = vfmaq_laneq_f32(C2, A3, B2, 3);
vst1q_f32(C + 8, C2);
B3 = vld1q_f32(B + 12);
C3 = vfmaq_laneq_f32(C3, A0, B3, 0);
C3 = vfmaq_laneq_f32(C3, A1, B3, 1);
C3 = vfmaq_laneq_f32(C3, A2, B3, 2);
C3 = vfmaq_laneq_f32(C3, A3, B3, 3);
vst1q_f32(C + 12, C3);
}
#else
#define xx 0
#define xy 1
#define xz 2
@ -40,6 +134,4 @@ void fast_matrix_mul_4x4_c(float *dest, const float *a, const float *b) {
dest[ww] = b[wx] * a[xw] + b[wy] * a[yw] + b[wz] * a[zw] + b[ww] * a[ww];
}
#ifndef fast_matrix_mul_4x4
fptr_fast_matrix_mul_4x4 fast_matrix_mul_4x4 = &fast_matrix_mul_4x4_c;
#endif
#endif

View File

@ -6,11 +6,8 @@
extern "C" {
#endif
// A mini library of matrix math kernels.
// A mini library of 4x4 matrix muls.
// TODO: Really need to wrap this block in a macro or something, will get repetitive.
typedef void(*fptr_fast_matrix_mul_4x4)(float *dest, const float *a, const float *b);
extern void fast_matrix_mul_4x4_c(float *dest, const float *a, const float *b);
extern void fast_matrix_mul_4x4_neon(float *dest, const float *a, const float *b);
extern void fast_matrix_mul_4x4_sse(float *dest, const float *a, const float *b);
@ -18,13 +15,12 @@ extern void fast_matrix_mul_4x4_sse(float *dest, const float *a, const float *b)
#if PPSSPP_ARCH(X86) || PPSSPP_ARCH(AMD64)
// Hard link to SSE implementations on x86/amd64
#define fast_matrix_mul_4x4 fast_matrix_mul_4x4_sse
#elif PPSSPP_ARCH(ARM64)
#define fast_matrix_mul_4x4 fast_matrix_mul_4x4_c
#elif PPSSPP_ARCH(ARM_NEON)
#define fast_matrix_mul_4x4 fast_matrix_mul_4x4_neon
#else
extern fptr_fast_matrix_mul_4x4 fast_matrix_mul_4x4;
#define fast_matrix_mul_4x4 fast_matrix_mul_4x4_c
#endif
#ifdef __cplusplus
} // extern "C"
#endif

View File

@ -1,54 +0,0 @@
#include "ppsspp_config.h"
#if PPSSPP_ARCH(ARM_NEON) && !PPSSPP_ARCH(ARM64)
.syntax unified // Allow both ARM and Thumb-2 instructions
.text
.align 2 // Align the function code to a 4-byte (2^n) word boundary.
.arm // Use ARM instructions instead of Thumb.
@ From ARM samples
@
@ matrix_mul_float:
@ Calculate 4x4 (matrix 0) * (matrix 1) and store to result 4x4 matrix.
@ matrix 0, matrix 1 and result pointers can be the same,
@ ie. my_matrix = my_matrix * my_matrix is possible.
@
@ r0 = pointer to 4x4 result matrix, single precision floats, column major order
@ r1 = pointer to 4x4 matrix 0, single precision floats, column major order
@ r2 = pointer to 4x4 matrix 1, single precision floats, column major order
@
.globl _fast_matrix_mul_4x4_neon
_fast_matrix_mul_4x4_neon:
.globl fast_matrix_mul_4x4_neon
fast_matrix_mul_4x4_neon:
vld1.32 {d16-d19}, [r1]! @ load first eight elements of matrix 0
vld1.32 {d20-d23}, [r1]! @ load second eight elements of matrix 0
vld1.32 {d0-d3}, [r2]! @ load first eight elements of matrix 1
vld1.32 {d4-d7}, [r2]! @ load second eight elements of matrix 1
vmul.f32 q12, q8, d0[0] @ rslt col0 = (mat0 col0) * (mat1 col0 elt0)
vmul.f32 q13, q8, d2[0] @ rslt col1 = (mat0 col0) * (mat1 col1 elt0)
vmul.f32 q14, q8, d4[0] @ rslt col2 = (mat0 col0) * (mat1 col2 elt0)
vmul.f32 q15, q8, d6[0] @ rslt col3 = (mat0 col0) * (mat1 col3 elt0)
vmla.f32 q12, q9, d0[1] @ rslt col0 += (mat0 col1) * (mat1 col0 elt1)
vmla.f32 q13, q9, d2[1] @ rslt col1 += (mat0 col1) * (mat1 col1 elt1)
vmla.f32 q14, q9, d4[1] @ rslt col2 += (mat0 col1) * (mat1 col2 elt1)
vmla.f32 q15, q9, d6[1] @ rslt col3 += (mat0 col1) * (mat1 col3 elt1)
vmla.f32 q12, q10, d1[0] @ rslt col0 += (mat0 col2) * (mat1 col0 elt2)
vmla.f32 q13, q10, d3[0] @ rslt col1 += (mat0 col2) * (mat1 col1 elt2)
vmla.f32 q14, q10, d5[0] @ rslt col2 += (mat0 col2) * (mat1 col2 elt2)
vmla.f32 q15, q10, d7[0] @ rslt col3 += (mat0 col2) * (mat1 col2 elt2)
vmla.f32 q12, q11, d1[1] @ rslt col0 += (mat0 col3) * (mat1 col0 elt3)
vmla.f32 q13, q11, d3[1] @ rslt col1 += (mat0 col3) * (mat1 col1 elt3)
vmla.f32 q14, q11, d5[1] @ rslt col2 += (mat0 col3) * (mat1 col2 elt3)
vmla.f32 q15, q11, d7[1] @ rslt col3 += (mat0 col3) * (mat1 col3 elt3)
vst1.32 {d24-d27}, [r0]! @ store first eight elements of result
vst1.32 {d28-d31}, [r0]! @ store second eight elements of result
bx lr
#endif

View File

@ -1,25 +0,0 @@
#include "ppsspp_config.h"
#if PPSSPP_ARCH(X86) || PPSSPP_ARCH(AMD64)
#include <emmintrin.h>
#include "fast_matrix.h"
void fast_matrix_mul_4x4_sse(float *dest, const float *a, const float *b) {
int i;
__m128 a_col_1 = _mm_loadu_ps(a);
__m128 a_col_2 = _mm_loadu_ps(&a[4]);
__m128 a_col_3 = _mm_loadu_ps(&a[8]);
__m128 a_col_4 = _mm_loadu_ps(&a[12]);
for (i = 0; i < 16; i += 4) {
__m128 r_col = _mm_mul_ps(a_col_1, _mm_set1_ps(b[i]));
r_col = _mm_add_ps(r_col, _mm_mul_ps(a_col_2, _mm_set1_ps(b[i + 1])));
r_col = _mm_add_ps(r_col, _mm_mul_ps(a_col_3, _mm_set1_ps(b[i + 2])));
r_col = _mm_add_ps(r_col, _mm_mul_ps(a_col_4, _mm_set1_ps(b[i + 3])));
_mm_storeu_ps(&dest[i], r_col);
}
}
#endif

View File

@ -234,6 +234,7 @@ bool DrawEngineCommon::TestBoundingBox(const void* control_points, int vertexCou
float worldviewproj[16];
ConvertMatrix4x3To4x4(world, gstate.worldMatrix);
ConvertMatrix4x3To4x4(view, gstate.viewMatrix);
// TODO: Create a Matrix4x3ByMatrix4x3, and Matrix4x4ByMatrix4x3?
Matrix4ByMatrix4(worldview, world, view);
Matrix4ByMatrix4(worldviewproj, worldview, gstate.projMatrix);
PlanesFromMatrix(worldviewproj, planes);

View File

@ -1559,7 +1559,7 @@ void ConvertStencilFuncState(GenericStencilFuncState &state) {
}
void GenericMaskState::Log() {
WARN_LOG(G3D, "Mask: %01X readfb=%d", uniformMask, channelMask, applyFramebufferRead);
WARN_LOG(G3D, "Mask: %08x %01X readfb=%d", uniformMask, channelMask, applyFramebufferRead);
}
void GenericBlendState::Log() {

View File

@ -62,7 +62,6 @@
#include "Common/System/NativeApp.h"
#include "Common/Data/Text/I18n.h"
#include "Common/Input/InputState.h"
#include "Common/Math/fast/fast_math.h"
#include "Common/Math/math_util.h"
#include "Common/Math/lin/matrix4x4.h"
#include "Common/Profiler/Profiler.h"
@ -463,7 +462,6 @@ void NativeInit(int argc, const char *argv[], const char *savegame_dir, const ch
ShaderTranslationInit();
InitFastMath();
g_threadManager.Init(cpu_info.num_cores, cpu_info.logical_cpu_count);
g_Discord.SetPresenceMenu();

View File

@ -432,7 +432,6 @@
<ClInclude Include="..\..\Common\Input\KeyCodes.h" />
<ClInclude Include="..\..\Common\Math\curves.h" />
<ClInclude Include="..\..\Common\Math\expression_parser.h" />
<ClInclude Include="..\..\Common\Math\fast\fast_math.h" />
<ClInclude Include="..\..\Common\Math\fast\fast_matrix.h" />
<ClInclude Include="..\..\Common\Math\geom2d.h" />
<ClInclude Include="..\..\Common\Math\lin\matrix4x4.h" />
@ -555,9 +554,7 @@
<ClCompile Include="..\..\Common\Input\InputState.cpp" />
<ClCompile Include="..\..\Common\Math\curves.cpp" />
<ClCompile Include="..\..\Common\Math\expression_parser.cpp" />
<ClCompile Include="..\..\Common\Math\fast\fast_math.c" />
<ClCompile Include="..\..\Common\Math\fast\fast_matrix.c" />
<ClCompile Include="..\..\Common\Math\fast\fast_matrix_sse.c" />
<ClCompile Include="..\..\Common\Math\lin\matrix4x4.cpp" />
<ClCompile Include="..\..\Common\Math\lin\vec3.cpp" />
<ClCompile Include="..\..\Common\Math\math_util.cpp" />

View File

@ -225,15 +225,9 @@
<ClCompile Include="..\..\Common\Math\math_util.cpp">
<Filter>Math</Filter>
</ClCompile>
<ClCompile Include="..\..\Common\Math\fast\fast_math.c">
<Filter>Math\fast</Filter>
</ClCompile>
<ClCompile Include="..\..\Common\Math\fast\fast_matrix.c">
<Filter>Math\fast</Filter>
</ClCompile>
<ClCompile Include="..\..\Common\Math\fast\fast_matrix_sse.c">
<Filter>Math\fast</Filter>
</ClCompile>
<ClCompile Include="..\..\Common\Math\lin\matrix4x4.cpp">
<Filter>Math\lin</Filter>
</ClCompile>
@ -529,9 +523,6 @@
<ClInclude Include="..\..\Common\Math\math_util.h">
<Filter>Math</Filter>
</ClInclude>
<ClInclude Include="..\..\Common\Math\fast\fast_math.h">
<Filter>Math\fast</Filter>
</ClInclude>
<ClInclude Include="..\..\Common\Math\fast\fast_matrix.h">
<Filter>Math\fast</Filter>
</ClInclude>

View File

@ -11,19 +11,16 @@ ARCH_FILES := \
$(SRC)/Common/ABI.cpp \
$(SRC)/Common/x64Emitter.cpp \
$(SRC)/Common/x64Analyzer.cpp \
$(SRC)/Common/Math/fast/fast_matrix_sse.c \
$(SRC)/Common/Thunk.cpp
else ifeq ($(TARGET_ARCH_ABI),x86_64)
ARCH_FILES := \
$(SRC)/Common/ABI.cpp \
$(SRC)/Common/x64Emitter.cpp \
$(SRC)/Common/x64Analyzer.cpp \
$(SRC)/Common/Math/fast/fast_matrix_sse.c \
$(SRC)/Common/Thunk.cpp
else ifeq ($(findstring armeabi-v7a,$(TARGET_ARCH_ABI)),armeabi-v7a)
ARCH_FILES := \
$(SRC)/Common/ArmEmitter.cpp \
$(SRC)/Common/Math/fast/fast_matrix_neon.S.neon \
$(SRC)/ext/disarm.cpp \
$(SRC)/ext/libpng17/arm/arm_init.c \
$(SRC)/ext/libpng17/arm/filter_neon_intrinsics.c \
@ -164,7 +161,6 @@ EXEC_AND_LIB_FILES := \
$(SRC)/Common/Render/Text/draw_text_android.cpp \
$(SRC)/Common/Input/GestureDetector.cpp \
$(SRC)/Common/Input/InputState.cpp \
$(SRC)/Common/Math/fast/fast_math.c \
$(SRC)/Common/Math/fast/fast_matrix.c \
$(SRC)/Common/Math/math_util.cpp \
$(SRC)/Common/Math/curves.cpp \

View File

@ -163,7 +163,6 @@ ifeq ($(PLATFORM_EXT), android)
endif
SOURCES_C +=\
$(COMMONDIR)/Math/fast/fast_math.c \
$(COMMONDIR)/Math/fast/fast_matrix.c
endif
@ -633,8 +632,7 @@ ifeq ($(WITH_DYNAREC),1)
SOURCES_C += $(EXTDIR)/libpng17/arm/arm_init.c \
$(EXTDIR)/libpng17/arm/filter_neon_intrinsics.c
ASMFILES += $(COMMONDIR)/Math/fast/fast_matrix_neon.S \
$(EXTDIR)/libpng17/arm/filter_neon.S
ASMFILES += $(EXTDIR)/libpng17/arm/filter_neon.S
endif
else ifeq ($(TARGET_ARCH),arm64)
DYNAFLAGS += -D_ARCH_64