Add some SSE code for an atrac hotspot just for fun, remove function pointers

2025-02-19 21:52:45 +00:00 · 2024-04-13 10:10:31 +02:00 · 2024-04-13 10:10:31 +02:00 · 7b6b7ebe3c
commit 7b6b7ebe3c
parent 52111103b8
5 changed files with 72 additions and 38 deletions
--- a/ext/at3_standalone/atrac3.cpp
+++ b/ext/at3_standalone/atrac3.cpp
@ -140,7 +140,7 @@ static void imlt(ATRAC3Context *q, float *input, float *output, int odd_band)
            FFSWAP(float, input[i], input[255 - i]);
    }

-    q->mdct_ctx.imdct_calc(&q->mdct_ctx, output, input);
+    imdct_calc(&q->mdct_ctx, output, input);

    /* Perform windowing on the output. */
    vector_fmul(output, output, mdct_window, MDCT_SIZE);
--- a/ext/at3_standalone/atrac3plusdsp.cpp
+++ b/ext/at3_standalone/atrac3plusdsp.cpp
@ -25,6 +25,22 @@
 *  DSP functions for ATRAC3+ decoder.
 */

+#include "ppsspp_config.h"
+
+#if PPSSPP_ARCH(X86) || PPSSPP_ARCH(AMD64)
+
+#include <emmintrin.h>
+
+#elif PPSSPP_ARCH(ARM_NEON)
+
+#if defined(_MSC_VER) && PPSSPP_ARCH(ARM64)
+#include <arm64_neon.h>
+#else
+#include <arm_neon.h>
+#endif
+
+#endif
+
 #include <math.h>
 #include <string.h>

@ -466,7 +482,7 @@ void ff_atrac3p_imdct(FFTContext *mdct_ctx, float *pIn,
        for (i = 0; i < ATRAC3P_SUBBAND_SAMPLES / 2; i++)
            FFSWAP(float, pIn[i], pIn[ATRAC3P_SUBBAND_SAMPLES - 1 - i]);

-    mdct_ctx->imdct_calc(mdct_ctx, pOut, pIn);
+    imdct_calc(mdct_ctx, pOut, pIn);

    /* Perform windowing on the output.
     * ATRAC3+ uses two different MDCT windows:
@ -612,7 +628,7 @@ void ff_atrac3p_ipqf(FFTContext *dct_ctx, Atrac3pIPQFChannelCtx *hist,
            idct_in[sb] = in[sb * ATRAC3P_SUBBAND_SAMPLES + s];

        /* Calculate the sine and cosine part of the PQF using IDCT-IV */
-        dct_ctx->imdct_half(dct_ctx, idct_out, idct_in);
+        imdct_half(dct_ctx, idct_out, idct_in);

        /* append the result to the history */
        for (i = 0; i < 8; i++) {
@ -629,13 +645,32 @@ void ff_atrac3p_ipqf(FFTContext *dct_ctx, Atrac3pIPQFChannelCtx *hist,
            const float *coeffs1 = ipqf_coeffs1[t];
            const float *coeffs2 = ipqf_coeffs2[t];

-			float *outp = out + s * 16;
+            float *outp = out + s * 16;
+#if PPSSPP_ARCH(X86) || PPSSPP_ARCH(AMD64)
+            auto _mm_reverse = [](__m128 x) -> __m128 {
+                return _mm_shuffle_ps(x, x, _MM_SHUFFLE(0, 1, 2, 3));
+            };
+            _mm_storeu_ps(outp, _mm_add_ps(_mm_loadu_ps(outp), _mm_add_ps(
+                _mm_mul_ps(_mm_loadu_ps(buf1), _mm_loadu_ps(coeffs1)),
+                _mm_mul_ps(_mm_loadu_ps(buf2), _mm_loadu_ps(coeffs2)))));
+            _mm_storeu_ps(outp + 4, _mm_add_ps(_mm_loadu_ps(outp + 4), _mm_add_ps(
+                _mm_mul_ps(_mm_loadu_ps(buf1 + 4), _mm_loadu_ps(coeffs1 + 4)),
+                _mm_mul_ps(_mm_loadu_ps(buf2 + 4), _mm_loadu_ps(coeffs2 + 4)))));
+
+            _mm_storeu_ps(outp + 8, _mm_add_ps(_mm_loadu_ps(outp + 8), _mm_add_ps(
+                _mm_mul_ps(_mm_reverse(_mm_loadu_ps(buf1 + 4)), _mm_loadu_ps(coeffs1 + 8)),
+                _mm_mul_ps(_mm_reverse(_mm_loadu_ps(buf2 + 4)), _mm_loadu_ps(coeffs2 + 8)))));
+            _mm_storeu_ps(outp + 12, _mm_add_ps(_mm_loadu_ps(outp + 12), _mm_add_ps(
+                _mm_mul_ps(_mm_reverse(_mm_loadu_ps(buf1)), _mm_loadu_ps(coeffs1 + 12)),
+                _mm_mul_ps(_mm_reverse(_mm_loadu_ps(buf2)), _mm_loadu_ps(coeffs2 + 12)))));
+#else
            for (i = 0; i < 8; i++) {
                outp[i] += buf1[i] * coeffs1[i] + buf2[i] * coeffs2[i];
            }
-			for (i = 0; i < 8; i++) {
-				outp[i + 8] += buf1[7 - i] * coeffs1[i + 8] + buf2[7 - i] * coeffs2[i + 8];
-			}
+            for (i = 0; i < 8; i++) {
+                outp[i + 8] += buf1[7 - i] * coeffs1[i + 8] + buf2[7 - i] * coeffs2[i + 8];
+            }
+#endif

            pos_now  = mod23_lut[pos_next + 2]; // pos_now  = (pos_now  + 2) % 23;
            pos_next = mod23_lut[pos_now + 2];  // pos_next = (pos_next + 2) % 23;
--- a/ext/at3_standalone/compat.h
+++ b/ext/at3_standalone/compat.h
@ -18,6 +18,9 @@
 #define AV_HAVE_FAST_UNALIGNED 0
 #define AV_INPUT_BUFFER_PADDING_SIZE 32

+// TODO: Defining BITSTREAM_READER_LE should work but currently doesn't; investigate why.
+// #define BITSTREAM_READER_LE
+
 #define LOCAL_ALIGNED(bits, type, name, subscript) type name subscript
 #define av_restrict
 #define av_alias
--- a/ext/at3_standalone/fft.cpp
+++ b/ext/at3_standalone/fft.cpp
@ -35,9 +35,9 @@

 #define sqrthalf (float)M_SQRT1_2

-void ff_imdct_calc_c(FFTContext *s, FFTSample *output, const FFTSample *input);
-void ff_imdct_half_c(FFTContext *s, FFTSample *output, const FFTSample *input);
-void ff_mdct_calc_c(FFTContext *s, FFTSample *output, const FFTSample *input);
+void imdct_calc(FFTContext *s, FFTSample *output, const FFTSample *input);
+void imdct_half(FFTContext *s, FFTSample *output, const FFTSample *input);
+void mdct_calc(FFTContext *s, FFTSample *output, const FFTSample *input);

 /* cos(2*pi*x/n) for 0<=x<=n/4, followed by its reverse */
 COSTABLE(16);
@ -71,8 +71,8 @@ static FFTSample * const av_cos_tabs[] = {
    av_cos_65536,
 };

-static void fft_permute_c(FFTContext *s, FFTComplex *z);
-static void fft_calc_c(FFTContext *s, FFTComplex *z);
+void fft_permute(FFTContext *s, FFTComplex *z);
+void fft_calc(FFTContext *s, FFTComplex *z);

 static int split_radix_permutation(int i, int n, int inverse)
 {
@ -130,11 +130,6 @@ int ff_fft_init(FFTContext *s, int nbits, int inverse)
        goto fail;
    s->inverse = inverse;

-    s->fft_permute = fft_permute_c;
-    s->fft_calc    = fft_calc_c;
-    s->imdct_calc  = ff_imdct_calc_c;
-    s->imdct_half  = ff_imdct_half_c;
-    s->mdct_calc   = ff_mdct_calc_c;
    for(j=4; j<=nbits; j++) {
        ff_init_ff_cos_tabs(j);
    }
@ -152,7 +147,7 @@ int ff_fft_init(FFTContext *s, int nbits, int inverse)
    return -1;
 }

-static void fft_permute_c(FFTContext *s, FFTComplex *z)
+void fft_permute(FFTContext *s, FFTComplex *z)
 {
    int j, np;
    const uint16_t *revtab = s->revtab;
@ -315,12 +310,11 @@ static void (* const fft_dispatch[])(FFTComplex*) = {
    fft2048, fft4096, fft8192, fft16384, fft32768, fft65536,
 };

-static void fft_calc_c(FFTContext *s, FFTComplex *z)
+void fft_calc(FFTContext *s, FFTComplex *z)
 {
    fft_dispatch[s->nbits-2](z);
 }

-
 #include <stdlib.h>
 #include <string.h>

@ -383,7 +377,7 @@ fail:
 * @param output N/2 samples
 * @param input N/2 samples
 */
-void ff_imdct_half_c(FFTContext *s, FFTSample *output, const FFTSample *input)
+void imdct_half(FFTContext *s, FFTSample *output, const FFTSample *input)
 {
 	int k, n8, n4, n2, n, j;
 	const uint16_t *revtab = s->revtab;
@ -406,7 +400,7 @@ void ff_imdct_half_c(FFTContext *s, FFTSample *output, const FFTSample *input)
 		in1 += 2;
 		in2 -= 2;
 	}
-	s->fft_calc(s, z);
+	fft_calc(s, z);

 	/* post rotation + reordering */
 	for (k = 0; k < n8; k++) {
@ -425,14 +419,14 @@ void ff_imdct_half_c(FFTContext *s, FFTSample *output, const FFTSample *input)
 * @param output N samples
 * @param input N/2 samples
 */
-void ff_imdct_calc_c(FFTContext *s, FFTSample *output, const FFTSample *input)
+void imdct_calc(FFTContext *s, FFTSample *output, const FFTSample *input)
 {
 	int k;
 	int n = 1 << s->mdct_bits;
 	int n2 = n >> 1;
 	int n4 = n >> 2;

-	ff_imdct_half_c(s, output + n4, input);
+	imdct_half(s, output + n4, input);

 	for (k = 0; k < n4; k++) {
 		output[k] = -output[n2 - k - 1];
@ -445,7 +439,7 @@ void ff_imdct_calc_c(FFTContext *s, FFTSample *output, const FFTSample *input)
 * @param input N samples
 * @param out N/2 samples
 */
-void ff_mdct_calc_c(FFTContext *s, FFTSample *out, const FFTSample *input)
+void mdct_calc(FFTContext *s, FFTSample *out, const FFTSample *input)
 {
 	int i, j, n, n8, n4, n2, n3;
 	FFTDouble re, im;
@ -473,7 +467,7 @@ void ff_mdct_calc_c(FFTContext *s, FFTSample *out, const FFTSample *input)
 		CMUL(x[j].re, x[j].im, re, im, -tcos[n8 + i], tsin[n8 + i]);
 	}

-	s->fft_calc(s, x);
+	fft_calc(s, x);

 	/* post rotation */
 	for (i = 0; i < n8; i++) {
--- a/ext/at3_standalone/fft.h
+++ b/ext/at3_standalone/fft.h
@ -57,21 +57,23 @@ struct FFTContext {
    /* pre/post rotation tables */
    FFTSample *tcos;
    FFTSample *tsin;
-    /**
-     * Do the permutation needed BEFORE calling fft_calc().
-     */
-    void (*fft_permute)(struct FFTContext *s, FFTComplex *z);
-    /**
-     * Do a complex FFT with the parameters defined in ff_fft_init(). The
-     * input data must be permuted before. No 1.0/sqrt(n) normalization is done.
-     */
-    void (*fft_calc)(struct FFTContext *s, FFTComplex *z);
-    void (*imdct_calc)(struct FFTContext *s, FFTSample *output, const FFTSample *input);
-    void (*imdct_half)(struct FFTContext *s, FFTSample *output, const FFTSample *input);
-    void (*mdct_calc)(struct FFTContext *s, FFTSample *output, const FFTSample *input);
+
    enum mdct_permutation_type mdct_permutation;
 };

+/**
+ * Do the permutation needed BEFORE calling fft_calc().
+ */
+void fft_permute(struct FFTContext *s, FFTComplex *z);
+/**
+ * Do a complex FFT with the parameters defined in ff_fft_init(). The
+ * input data must be permuted before. No 1.0/sqrt(n) normalization is done.
+ */
+void fft_calc(struct FFTContext *s, FFTComplex *z);
+void imdct_calc(struct FFTContext *s, FFTSample *output, const FFTSample *input);
+void imdct_half(struct FFTContext *s, FFTSample *output, const FFTSample *input);
+void mdct_calc(struct FFTContext *s, FFTSample *output, const FFTSample *input);
+
 #define COSTABLE(size) \
     DECLARE_ALIGNED(32, FFTSample, av_cos_##size)[size/2]