diff --git a/ext/at3_standalone/atrac3.cpp b/ext/at3_standalone/atrac3.cpp
index 7b2ca124eb..fe0e71c841 100644
--- a/ext/at3_standalone/atrac3.cpp
+++ b/ext/at3_standalone/atrac3.cpp
@@ -140,7 +140,7 @@ static void imlt(ATRAC3Context *q, float *input, float *output, int odd_band)
             FFSWAP(float, input[i], input[255 - i]);
     }
 
-    q->mdct_ctx.imdct_calc(&q->mdct_ctx, output, input);
+    imdct_calc(&q->mdct_ctx, output, input);
 
     /* Perform windowing on the output. */
     vector_fmul(output, output, mdct_window, MDCT_SIZE);
diff --git a/ext/at3_standalone/atrac3plusdsp.cpp b/ext/at3_standalone/atrac3plusdsp.cpp
index 27a296a937..690f93fc9d 100644
--- a/ext/at3_standalone/atrac3plusdsp.cpp
+++ b/ext/at3_standalone/atrac3plusdsp.cpp
@@ -25,6 +25,22 @@
  *  DSP functions for ATRAC3+ decoder.
  */
 
+#include "ppsspp_config.h"
+
+#if PPSSPP_ARCH(X86) || PPSSPP_ARCH(AMD64)
+
+#include <emmintrin.h>
+
+#elif PPSSPP_ARCH(ARM_NEON)
+
+#if defined(_MSC_VER) && PPSSPP_ARCH(ARM64)
+#include <arm64_neon.h>
+#else
+#include <arm_neon.h>
+#endif
+
+#endif
+
 #include <math.h>
 #include <string.h>
 
@@ -466,7 +482,7 @@ void ff_atrac3p_imdct(FFTContext *mdct_ctx, float *pIn,
         for (i = 0; i < ATRAC3P_SUBBAND_SAMPLES / 2; i++)
             FFSWAP(float, pIn[i], pIn[ATRAC3P_SUBBAND_SAMPLES - 1 - i]);
 
-    mdct_ctx->imdct_calc(mdct_ctx, pOut, pIn);
+    imdct_calc(mdct_ctx, pOut, pIn);
 
     /* Perform windowing on the output.
      * ATRAC3+ uses two different MDCT windows:
@@ -612,7 +628,7 @@ void ff_atrac3p_ipqf(FFTContext *dct_ctx, Atrac3pIPQFChannelCtx *hist,
             idct_in[sb] = in[sb * ATRAC3P_SUBBAND_SAMPLES + s];
 
         /* Calculate the sine and cosine part of the PQF using IDCT-IV */
-        dct_ctx->imdct_half(dct_ctx, idct_out, idct_in);
+        imdct_half(dct_ctx, idct_out, idct_in);
 
         /* append the result to the history */
         for (i = 0; i < 8; i++) {
@@ -629,13 +645,32 @@ void ff_atrac3p_ipqf(FFTContext *dct_ctx, Atrac3pIPQFChannelCtx *hist,
             const float *coeffs1 = ipqf_coeffs1[t];
             const float *coeffs2 = ipqf_coeffs2[t];
 
-			float *outp = out + s * 16;
+            float *outp = out + s * 16;
+#if PPSSPP_ARCH(X86) || PPSSPP_ARCH(AMD64)
+            auto _mm_reverse = [](__m128 x) -> __m128 {
+                return _mm_shuffle_ps(x, x, _MM_SHUFFLE(0, 1, 2, 3));
+            };
+            _mm_storeu_ps(outp, _mm_add_ps(_mm_loadu_ps(outp), _mm_add_ps(
+                _mm_mul_ps(_mm_loadu_ps(buf1), _mm_loadu_ps(coeffs1)),
+                _mm_mul_ps(_mm_loadu_ps(buf2), _mm_loadu_ps(coeffs2)))));
+            _mm_storeu_ps(outp + 4, _mm_add_ps(_mm_loadu_ps(outp + 4), _mm_add_ps(
+                _mm_mul_ps(_mm_loadu_ps(buf1 + 4), _mm_loadu_ps(coeffs1 + 4)),
+                _mm_mul_ps(_mm_loadu_ps(buf2 + 4), _mm_loadu_ps(coeffs2 + 4)))));
+
+            _mm_storeu_ps(outp + 8, _mm_add_ps(_mm_loadu_ps(outp + 8), _mm_add_ps(
+                _mm_mul_ps(_mm_reverse(_mm_loadu_ps(buf1 + 4)), _mm_loadu_ps(coeffs1 + 8)),
+                _mm_mul_ps(_mm_reverse(_mm_loadu_ps(buf2 + 4)), _mm_loadu_ps(coeffs2 + 8)))));
+            _mm_storeu_ps(outp + 12, _mm_add_ps(_mm_loadu_ps(outp + 12), _mm_add_ps(
+                _mm_mul_ps(_mm_reverse(_mm_loadu_ps(buf1)), _mm_loadu_ps(coeffs1 + 12)),
+                _mm_mul_ps(_mm_reverse(_mm_loadu_ps(buf2)), _mm_loadu_ps(coeffs2 + 12)))));
+#else
             for (i = 0; i < 8; i++) {
                 outp[i] += buf1[i] * coeffs1[i] + buf2[i] * coeffs2[i];
             }
-			for (i = 0; i < 8; i++) {
-				outp[i + 8] += buf1[7 - i] * coeffs1[i + 8] + buf2[7 - i] * coeffs2[i + 8];
-			}
+            for (i = 0; i < 8; i++) {
+                outp[i + 8] += buf1[7 - i] * coeffs1[i + 8] + buf2[7 - i] * coeffs2[i + 8];
+            }
+#endif
 
             pos_now  = mod23_lut[pos_next + 2]; // pos_now  = (pos_now  + 2) % 23;
             pos_next = mod23_lut[pos_now + 2];  // pos_next = (pos_next + 2) % 23;
diff --git a/ext/at3_standalone/compat.h b/ext/at3_standalone/compat.h
index 6c4a4ac45b..bd3e398b32 100644
--- a/ext/at3_standalone/compat.h
+++ b/ext/at3_standalone/compat.h
@@ -18,6 +18,9 @@
 #define AV_HAVE_FAST_UNALIGNED 0
 #define AV_INPUT_BUFFER_PADDING_SIZE 32
 
+// TODO: Defining this should work, but it currently doesn't. Needs investigation.
+// #define BITSTREAM_READER_LE
+
 #define LOCAL_ALIGNED(bits, type, name, subscript) type name subscript
 #define av_restrict
 #define av_alias
diff --git a/ext/at3_standalone/fft.cpp b/ext/at3_standalone/fft.cpp
index 961c43b55d..184275f870 100644
--- a/ext/at3_standalone/fft.cpp
+++ b/ext/at3_standalone/fft.cpp
@@ -35,9 +35,9 @@
 
 #define sqrthalf (float)M_SQRT1_2
 
-void ff_imdct_calc_c(FFTContext *s, FFTSample *output, const FFTSample *input);
-void ff_imdct_half_c(FFTContext *s, FFTSample *output, const FFTSample *input);
-void ff_mdct_calc_c(FFTContext *s, FFTSample *output, const FFTSample *input);
+void imdct_calc(FFTContext *s, FFTSample *output, const FFTSample *input);
+void imdct_half(FFTContext *s, FFTSample *output, const FFTSample *input);
+void mdct_calc(FFTContext *s, FFTSample *output, const FFTSample *input);
 
 /* cos(2*pi*x/n) for 0<=x<=n/4, followed by its reverse */
 COSTABLE(16);
@@ -71,8 +71,8 @@ static FFTSample * const av_cos_tabs[] = {
     av_cos_65536,
 };
 
-static void fft_permute_c(FFTContext *s, FFTComplex *z);
-static void fft_calc_c(FFTContext *s, FFTComplex *z);
+void fft_permute(FFTContext *s, FFTComplex *z);
+void fft_calc(FFTContext *s, FFTComplex *z);
 
 static int split_radix_permutation(int i, int n, int inverse)
 {
@@ -130,11 +130,6 @@ int ff_fft_init(FFTContext *s, int nbits, int inverse)
         goto fail;
     s->inverse = inverse;
 
-    s->fft_permute = fft_permute_c;
-    s->fft_calc    = fft_calc_c;
-    s->imdct_calc  = ff_imdct_calc_c;
-    s->imdct_half  = ff_imdct_half_c;
-    s->mdct_calc   = ff_mdct_calc_c;
     for(j=4; j<=nbits; j++) {
         ff_init_ff_cos_tabs(j);
     }
@@ -152,7 +147,7 @@ int ff_fft_init(FFTContext *s, int nbits, int inverse)
     return -1;
 }
 
-static void fft_permute_c(FFTContext *s, FFTComplex *z)
+void fft_permute(FFTContext *s, FFTComplex *z)
 {
     int j, np;
     const uint16_t *revtab = s->revtab;
@@ -315,12 +310,11 @@ static void (* const fft_dispatch[])(FFTComplex*) = {
     fft2048, fft4096, fft8192, fft16384, fft32768, fft65536,
 };
 
-static void fft_calc_c(FFTContext *s, FFTComplex *z)
+void fft_calc(FFTContext *s, FFTComplex *z)
 {
     fft_dispatch[s->nbits-2](z);
 }
 
-
 #include <stdlib.h>
 #include <string.h>
 
@@ -383,7 +377,7 @@ fail:
  * @param output N/2 samples
  * @param input N/2 samples
  */
-void ff_imdct_half_c(FFTContext *s, FFTSample *output, const FFTSample *input)
+void imdct_half(FFTContext *s, FFTSample *output, const FFTSample *input)
 {
 	int k, n8, n4, n2, n, j;
 	const uint16_t *revtab = s->revtab;
@@ -406,7 +400,7 @@ void ff_imdct_half_c(FFTContext *s, FFTSample *output, const FFTSample *input)
 		in1 += 2;
 		in2 -= 2;
 	}
-	s->fft_calc(s, z);
+	fft_calc(s, z);
 
 	/* post rotation + reordering */
 	for (k = 0; k < n8; k++) {
@@ -425,14 +419,14 @@ void ff_imdct_half_c(FFTContext *s, FFTSample *output, const FFTSample *input)
  * @param output N samples
  * @param input N/2 samples
  */
-void ff_imdct_calc_c(FFTContext *s, FFTSample *output, const FFTSample *input)
+void imdct_calc(FFTContext *s, FFTSample *output, const FFTSample *input)
 {
 	int k;
 	int n = 1 << s->mdct_bits;
 	int n2 = n >> 1;
 	int n4 = n >> 2;
 
-	ff_imdct_half_c(s, output + n4, input);
+	imdct_half(s, output + n4, input);
 
 	for (k = 0; k < n4; k++) {
 		output[k] = -output[n2 - k - 1];
@@ -445,7 +439,7 @@ void ff_imdct_calc_c(FFTContext *s, FFTSample *output, const FFTSample *input)
  * @param input N samples
  * @param out N/2 samples
  */
-void ff_mdct_calc_c(FFTContext *s, FFTSample *out, const FFTSample *input)
+void mdct_calc(FFTContext *s, FFTSample *out, const FFTSample *input)
 {
 	int i, j, n, n8, n4, n2, n3;
 	FFTDouble re, im;
@@ -473,7 +467,7 @@ void ff_mdct_calc_c(FFTContext *s, FFTSample *out, const FFTSample *input)
 		CMUL(x[j].re, x[j].im, re, im, -tcos[n8 + i], tsin[n8 + i]);
 	}
 
-	s->fft_calc(s, x);
+	fft_calc(s, x);
 
 	/* post rotation */
 	for (i = 0; i < n8; i++) {
diff --git a/ext/at3_standalone/fft.h b/ext/at3_standalone/fft.h
index 3c9e413bb1..d4dfd940ec 100644
--- a/ext/at3_standalone/fft.h
+++ b/ext/at3_standalone/fft.h
@@ -57,21 +57,23 @@ struct FFTContext {
     /* pre/post rotation tables */
     FFTSample *tcos;
     FFTSample *tsin;
-    /**
-     * Do the permutation needed BEFORE calling fft_calc().
-     */
-    void (*fft_permute)(struct FFTContext *s, FFTComplex *z);
-    /**
-     * Do a complex FFT with the parameters defined in ff_fft_init(). The
-     * input data must be permuted before. No 1.0/sqrt(n) normalization is done.
-     */
-    void (*fft_calc)(struct FFTContext *s, FFTComplex *z);
-    void (*imdct_calc)(struct FFTContext *s, FFTSample *output, const FFTSample *input);
-    void (*imdct_half)(struct FFTContext *s, FFTSample *output, const FFTSample *input);
-    void (*mdct_calc)(struct FFTContext *s, FFTSample *output, const FFTSample *input);
+
     enum mdct_permutation_type mdct_permutation;
 };
 
+/**
+ * Do the permutation needed BEFORE calling fft_calc().
+ */
+void fft_permute(struct FFTContext *s, FFTComplex *z);
+/**
+ * Do a complex FFT with the parameters defined in ff_fft_init(). The
+ * input data must be permuted before. No 1.0/sqrt(n) normalization is done.
+ */
+void fft_calc(struct FFTContext *s, FFTComplex *z);
+void imdct_calc(struct FFTContext *s, FFTSample *output, const FFTSample *input);
+void imdct_half(struct FFTContext *s, FFTSample *output, const FFTSample *input);
+void mdct_calc(struct FFTContext *s, FFTSample *output, const FFTSample *input);
+
 #define COSTABLE(size) \
      DECLARE_ALIGNED(32, FFTSample, av_cos_##size)[size/2]