From 090d1f8d0e207dd1927b272823ea53242dc65f2c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Henrik=20Rydg=C3=A5rd?= <hrydgard@gmail.com>
Date: Thu, 2 May 2024 16:56:32 +0200
Subject: [PATCH] atrac3: Add a cross platform restrict modifier, use it for
 the vector math functions

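Also remove the redundant source parameters: every call site touched here
passed the destination buffer as the first source, so the helpers now
operate in place on dst.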

There are a bunch of other places where we could use this, so just
experimenting here.

This enables autovectorization to work here.
---
 ext/at3_standalone/atrac3.cpp        |  2 +-
 ext/at3_standalone/atrac3plusdsp.cpp | 18 ++++++++--------
 ext/at3_standalone/compat.h          | 10 +++++++--
 ext/at3_standalone/float_dsp.h       | 31 +++++++++++++---------------
 4 files changed, 32 insertions(+), 29 deletions(-)

diff --git a/ext/at3_standalone/atrac3.cpp b/ext/at3_standalone/atrac3.cpp
index fe0e71c841..6c1afcb166 100644
--- a/ext/at3_standalone/atrac3.cpp
+++ b/ext/at3_standalone/atrac3.cpp
@@ -143,7 +143,7 @@ static void imlt(ATRAC3Context *q, float *input, float *output, int odd_band)
     imdct_calc(&q->mdct_ctx, output, input);
 
     /* Perform windowing on the output. */
-    vector_fmul(output, output, mdct_window, MDCT_SIZE);
+    vector_fmul(output, mdct_window, MDCT_SIZE);
 }
 
 /*
diff --git a/ext/at3_standalone/atrac3plusdsp.cpp b/ext/at3_standalone/atrac3plusdsp.cpp
index 1cf345ced6..a070148e0e 100644
--- a/ext/at3_standalone/atrac3plusdsp.cpp
+++ b/ext/at3_standalone/atrac3plusdsp.cpp
@@ -174,7 +174,7 @@ static void waves_synth(Atrac3pWaveSynthParams *synth_param,
 
     /* invert phase if requested */
     if (invert_phase)
-        vector_fmul_scalar(out, out, -1.0f, 128);
+        vector_fmul_scalar(out, -1.0f, 128);
 
     /* fade in with steep Hann window if requested */
     if (envelope->has_start_point) {
@@ -255,14 +255,14 @@ void ff_atrac3p_generate_tones(Atrac3pChanUnitCtx *ch_unit, int ch_num, int sb,
     /* Hann windowing for non-faded wave signals */
     if (tones_now->num_wavs && tones_next->num_wavs &&
         reg1_env_nonzero && reg2_env_nonzero) {
-        vector_fmul(wavreg1, wavreg1, &hann_window[128], 128);
-        vector_fmul(wavreg2, wavreg2,  hann_window,      128);
+        vector_fmul(wavreg1, &hann_window[128], 128);
+        vector_fmul(wavreg2,  hann_window,      128);
     } else {
         if (tones_now->num_wavs && !tones_now->curr_env.has_stop_point)
-            vector_fmul(wavreg1, wavreg1, &hann_window[128], 128);
+            vector_fmul(wavreg1, &hann_window[128], 128);
 
         if (tones_next->num_wavs && !tones_next->curr_env.has_start_point)
-            vector_fmul(wavreg2, wavreg2, hann_window, 128);
+            vector_fmul(wavreg2, hann_window, 128);
     }
 
     /* Overlap and add to residual */
@@ -502,15 +502,15 @@ void ff_atrac3p_imdct(FFTContext *mdct_ctx, float *pIn,
      *   Both regions are 32 samples long. */
     if (wind_id & 2) { /* 1st half: steep window */
         memset(pOut, 0, sizeof(float) * 32);
-        vector_fmul(&pOut[32], &pOut[32], av_sine_64, 64);
+        vector_fmul(&pOut[32], av_sine_64, 64);
     } else /* 1st half: simple sine window */
-        vector_fmul(pOut, pOut, av_sine_128, ATRAC3P_MDCT_SIZE / 2);
+        vector_fmul(pOut, av_sine_128, ATRAC3P_MDCT_SIZE / 2);
 
     if (wind_id & 1) { /* 2nd half: steep window */
-        vector_fmul_reverse(&pOut[160], &pOut[160], av_sine_64, 64);
+        vector_fmul_reverse(&pOut[160], av_sine_64, 64);
         memset(&pOut[224], 0, sizeof(float) * 32);
     } else /* 2nd half: simple sine window */
-        vector_fmul_reverse(&pOut[128], &pOut[128], av_sine_128, ATRAC3P_MDCT_SIZE / 2);
+        vector_fmul_reverse(&pOut[128], av_sine_128, ATRAC3P_MDCT_SIZE / 2);
 }
 
 /* lookup table for fast modulo 23 op required for cyclic buffers of the IPQF */
diff --git a/ext/at3_standalone/compat.h b/ext/at3_standalone/compat.h
index bd3e398b32..57b0244de4 100644
--- a/ext/at3_standalone/compat.h
+++ b/ext/at3_standalone/compat.h
@@ -4,15 +4,22 @@
 
 // Compat hacks to make an FFMPEG-like environment, so we can keep the core code mostly unchanged.
 
-#if defined(__GNUC__)
+#if defined(__clang__)
+#define DECLARE_ALIGNED(n, t, v)      t __attribute__((aligned(n))) v
+#define DECLARE_ASM_CONST(n, t, v)    static const t av_used __attribute__((aligned(n))) v
+#define av_restrict __restrict
+#elif defined(__GNUC__)
 #define DECLARE_ALIGNED(n,t,v)      t __attribute__ ((aligned (n))) v
 #define DECLARE_ASM_CONST(n,t,v)    static const t av_used __attribute__ ((aligned (n))) v
+#define av_restrict __restrict__
 #elif defined(_MSC_VER)
 #define DECLARE_ALIGNED(n,t,v)      __declspec(align(n)) t v
 #define DECLARE_ASM_CONST(n,t,v)    __declspec(align(n)) static const t v
+#define av_restrict __restrict
 #else
 #define DECLARE_ALIGNED(n,t,v)      t v
 #define DECLARE_ASM_CONST(n,t,v)    static const t v
+#define av_restrict
 #endif
 
 #define AV_HAVE_FAST_UNALIGNED 0
@@ -22,7 +29,6 @@
 // #define BITSTREAM_READER_LE
 
 #define LOCAL_ALIGNED(bits, type, name, subscript) type name subscript
-#define av_restrict
 #define av_alias
 #define av_unused
 #define av_assert0(cond)
diff --git a/ext/at3_standalone/float_dsp.h b/ext/at3_standalone/float_dsp.h
index f02764382f..c9664a814b 100644
--- a/ext/at3_standalone/float_dsp.h
+++ b/ext/at3_standalone/float_dsp.h
@@ -18,20 +18,20 @@
 
 #pragma once
 
-inline void vector_fmul(float *dst, const float *src0, const float *src1, int len) {
-    int i;
-    for (i = 0; i < len; i++)
-        dst[i] = src0[i] * src1[i];
+#include "compat.h"
+
+inline void vector_fmul(float * av_restrict dst, const float * av_restrict src, int len) {
+    for (int i = 0; i < len; i++)
+        dst[i] = dst[i] * src[i];
 }
 
 /**
 * Multiply a vector of floats by a scalar float.  Source and
 * destination vectors must overlap exactly or not at all.
 */
-inline void vector_fmul_scalar(float *dst, const float *src, float mul, int len) {
-    int i;
-    for (i = 0; i < len; i++)
-        dst[i] = src[i] * mul;
+inline void vector_fmul_scalar(float *dst, float mul, int len) {
+    for (int i = 0; i < len; i++)
+        dst[i] *= mul;
 }
 
 /**
@@ -39,18 +39,15 @@ inline void vector_fmul_scalar(float *dst, const float *src, float mul, int len)
 * in a vector of floats. The second vector of floats is iterated over
 * in reverse order.
 *
-* @param dst  output vector
+* @param dst  output and first input vector
 *             constraints: 32-byte aligned
-* @param src0 first input vector
-*             constraints: 32-byte aligned
-* @param src1 second input vector
+* @param src  second input vector
 *             constraints: 32-byte aligned
 * @param len  number of elements in the input
 *             constraints: multiple of 16
 */
-inline void vector_fmul_reverse(float *dst, const float *src0, const float *src1, int len) {
-    int i;
-    src1 += len - 1;
-    for (i = 0; i < len; i++)
-        dst[i] = src0[i] * src1[-i];
+inline void vector_fmul_reverse(float * av_restrict dst, const float * av_restrict src, int len) {
+    src += len - 1;
+    for (int i = 0; i < len; i++)
+        dst[i] *= src[-i];
 }