From 73cd690753f9109e4e92cba4eb8826d6bc90998f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Henrik=20Rydg=C3=A5rd?= Date: Tue, 28 Nov 2023 00:25:04 +0100 Subject: [PATCH] Optimize NEON matrix multiplication slightly --- Common/Math/fast/fast_matrix.c | 14 ++++---------- 1 file changed, 4 insertions(+), 10 deletions(-) diff --git a/Common/Math/fast/fast_matrix.c b/Common/Math/fast/fast_matrix.c index 614a552b39..7e142c84f6 100644 --- a/Common/Math/fast/fast_matrix.c +++ b/Common/Math/fast/fast_matrix.c @@ -67,36 +67,30 @@ void fast_matrix_mul_4x4_neon(float *C, const float *A, const float *B) { A2 = vld1q_f32(A + 8); A3 = vld1q_f32(A + 12); - // Zero accumulators for C values - C0 = vmovq_n_f32(0); - C1 = vmovq_n_f32(0); - C2 = vmovq_n_f32(0); - C3 = vmovq_n_f32(0); - // Multiply accumulate in 4x1 blocks, i.e. each column in C B0 = vld1q_f32(B); - C0 = vfmaq_laneq_f32(C0, A0, B0, 0); + C0 = vmulq_laneq_f32(A0, B0, 0); C0 = vfmaq_laneq_f32(C0, A1, B0, 1); C0 = vfmaq_laneq_f32(C0, A2, B0, 2); C0 = vfmaq_laneq_f32(C0, A3, B0, 3); vst1q_f32(C, C0); B1 = vld1q_f32(B + 4); - C1 = vfmaq_laneq_f32(C1, A0, B1, 0); + C1 = vmulq_laneq_f32(A0, B1, 0); C1 = vfmaq_laneq_f32(C1, A1, B1, 1); C1 = vfmaq_laneq_f32(C1, A2, B1, 2); C1 = vfmaq_laneq_f32(C1, A3, B1, 3); vst1q_f32(C + 4, C1); B2 = vld1q_f32(B + 8); - C2 = vfmaq_laneq_f32(C2, A0, B2, 0); + C2 = vmulq_laneq_f32(A0, B2, 0); C2 = vfmaq_laneq_f32(C2, A1, B2, 1); C2 = vfmaq_laneq_f32(C2, A2, B2, 2); C2 = vfmaq_laneq_f32(C2, A3, B2, 3); vst1q_f32(C + 8, C2); B3 = vld1q_f32(B + 12); - C3 = vfmaq_laneq_f32(C3, A0, B3, 0); + C3 = vmulq_laneq_f32(A0, B3, 0); C3 = vfmaq_laneq_f32(C3, A1, B3, 1); C3 = vfmaq_laneq_f32(C3, A2, B3, 2); C3 = vfmaq_laneq_f32(C3, A3, B3, 3);