diff --git a/ext/at3_standalone/atrac3.cpp b/ext/at3_standalone/atrac3.cpp index 7b2ca124eb..fe0e71c841 100644 --- a/ext/at3_standalone/atrac3.cpp +++ b/ext/at3_standalone/atrac3.cpp @@ -140,7 +140,7 @@ static void imlt(ATRAC3Context *q, float *input, float *output, int odd_band) FFSWAP(float, input[i], input[255 - i]); } - q->mdct_ctx.imdct_calc(&q->mdct_ctx, output, input); + imdct_calc(&q->mdct_ctx, output, input); /* Perform windowing on the output. */ vector_fmul(output, output, mdct_window, MDCT_SIZE); diff --git a/ext/at3_standalone/atrac3plusdsp.cpp b/ext/at3_standalone/atrac3plusdsp.cpp index 27a296a937..690f93fc9d 100644 --- a/ext/at3_standalone/atrac3plusdsp.cpp +++ b/ext/at3_standalone/atrac3plusdsp.cpp @@ -25,6 +25,22 @@ * DSP functions for ATRAC3+ decoder. */ +#include "ppsspp_config.h" + +#if PPSSPP_ARCH(X86) || PPSSPP_ARCH(AMD64) + +#include <emmintrin.h> + +#elif PPSSPP_ARCH(ARM_NEON) + +#if defined(_MSC_VER) && PPSSPP_ARCH(ARM64) +#include <arm64_neon.h> +#else +#include <arm_neon.h> +#endif + +#endif + #include #include @@ -466,7 +482,7 @@ void ff_atrac3p_imdct(FFTContext *mdct_ctx, float *pIn, for (i = 0; i < ATRAC3P_SUBBAND_SAMPLES / 2; i++) FFSWAP(float, pIn[i], pIn[ATRAC3P_SUBBAND_SAMPLES - 1 - i]); - mdct_ctx->imdct_calc(mdct_ctx, pOut, pIn); + imdct_calc(mdct_ctx, pOut, pIn); /* Perform windowing on the output. 
* ATRAC3+ uses two different MDCT windows: @@ -612,7 +628,7 @@ void ff_atrac3p_ipqf(FFTContext *dct_ctx, Atrac3pIPQFChannelCtx *hist, idct_in[sb] = in[sb * ATRAC3P_SUBBAND_SAMPLES + s]; /* Calculate the sine and cosine part of the PQF using IDCT-IV */ - dct_ctx->imdct_half(dct_ctx, idct_out, idct_in); + imdct_half(dct_ctx, idct_out, idct_in); /* append the result to the history */ for (i = 0; i < 8; i++) { @@ -629,13 +645,32 @@ void ff_atrac3p_ipqf(FFTContext *dct_ctx, Atrac3pIPQFChannelCtx *hist, const float *coeffs1 = ipqf_coeffs1[t]; const float *coeffs2 = ipqf_coeffs2[t]; - float *outp = out + s * 16; + float *outp = out + s * 16; +#if PPSSPP_ARCH(X86) || PPSSPP_ARCH(AMD64) + auto _mm_reverse = [](__m128 x) -> __m128 { + return _mm_shuffle_ps(x, x, _MM_SHUFFLE(0, 1, 2, 3)); + }; + _mm_storeu_ps(outp, _mm_add_ps(_mm_loadu_ps(outp), _mm_add_ps( + _mm_mul_ps(_mm_loadu_ps(buf1), _mm_loadu_ps(coeffs1)), + _mm_mul_ps(_mm_loadu_ps(buf2), _mm_loadu_ps(coeffs2))))); + _mm_storeu_ps(outp + 4, _mm_add_ps(_mm_loadu_ps(outp + 4), _mm_add_ps( + _mm_mul_ps(_mm_loadu_ps(buf1 + 4), _mm_loadu_ps(coeffs1 + 4)), + _mm_mul_ps(_mm_loadu_ps(buf2 + 4), _mm_loadu_ps(coeffs2 + 4))))); + + _mm_storeu_ps(outp + 8, _mm_add_ps(_mm_loadu_ps(outp + 8), _mm_add_ps( + _mm_mul_ps(_mm_reverse(_mm_loadu_ps(buf1 + 4)), _mm_loadu_ps(coeffs1 + 8)), + _mm_mul_ps(_mm_reverse(_mm_loadu_ps(buf2 + 4)), _mm_loadu_ps(coeffs2 + 8))))); + _mm_storeu_ps(outp + 12, _mm_add_ps(_mm_loadu_ps(outp + 12), _mm_add_ps( + _mm_mul_ps(_mm_reverse(_mm_loadu_ps(buf1)), _mm_loadu_ps(coeffs1 + 12)), + _mm_mul_ps(_mm_reverse(_mm_loadu_ps(buf2)), _mm_loadu_ps(coeffs2 + 12))))); +#else for (i = 0; i < 8; i++) { outp[i] += buf1[i] * coeffs1[i] + buf2[i] * coeffs2[i]; } - for (i = 0; i < 8; i++) { - outp[i + 8] += buf1[7 - i] * coeffs1[i + 8] + buf2[7 - i] * coeffs2[i + 8]; - } + for (i = 0; i < 8; i++) { + outp[i + 8] += buf1[7 - i] * coeffs1[i + 8] + buf2[7 - i] * coeffs2[i + 8]; + } +#endif pos_now = mod23_lut[pos_next + 2]; 
// pos_now = (pos_now + 2) % 23; pos_next = mod23_lut[pos_now + 2]; // pos_next = (pos_next + 2) % 23; diff --git a/ext/at3_standalone/compat.h b/ext/at3_standalone/compat.h index 6c4a4ac45b..bd3e398b32 100644 --- a/ext/at3_standalone/compat.h +++ b/ext/at3_standalone/compat.h @@ -18,6 +18,9 @@ #define AV_HAVE_FAST_UNALIGNED 0 #define AV_INPUT_BUFFER_PADDING_SIZE 32 +// TODO: This should work but doesn't?? +// #define BITSTREAM_READER_LE + #define LOCAL_ALIGNED(bits, type, name, subscript) type name subscript #define av_restrict #define av_alias diff --git a/ext/at3_standalone/fft.cpp b/ext/at3_standalone/fft.cpp index 961c43b55d..184275f870 100644 --- a/ext/at3_standalone/fft.cpp +++ b/ext/at3_standalone/fft.cpp @@ -35,9 +35,9 @@ #define sqrthalf (float)M_SQRT1_2 -void ff_imdct_calc_c(FFTContext *s, FFTSample *output, const FFTSample *input); -void ff_imdct_half_c(FFTContext *s, FFTSample *output, const FFTSample *input); -void ff_mdct_calc_c(FFTContext *s, FFTSample *output, const FFTSample *input); +void imdct_calc(FFTContext *s, FFTSample *output, const FFTSample *input); +void imdct_half(FFTContext *s, FFTSample *output, const FFTSample *input); +void mdct_calc(FFTContext *s, FFTSample *output, const FFTSample *input); /* cos(2*pi*x/n) for 0<=x<=n/4, followed by its reverse */ COSTABLE(16); @@ -71,8 +71,8 @@ static FFTSample * const av_cos_tabs[] = { av_cos_65536, }; -static void fft_permute_c(FFTContext *s, FFTComplex *z); -static void fft_calc_c(FFTContext *s, FFTComplex *z); +void fft_permute(FFTContext *s, FFTComplex *z); +void fft_calc(FFTContext *s, FFTComplex *z); static int split_radix_permutation(int i, int n, int inverse) { @@ -130,11 +130,6 @@ int ff_fft_init(FFTContext *s, int nbits, int inverse) goto fail; s->inverse = inverse; - s->fft_permute = fft_permute_c; - s->fft_calc = fft_calc_c; - s->imdct_calc = ff_imdct_calc_c; - s->imdct_half = ff_imdct_half_c; - s->mdct_calc = ff_mdct_calc_c; for(j=4; j<=nbits; j++) { ff_init_ff_cos_tabs(j); } @@ 
-152,7 +147,7 @@ int ff_fft_init(FFTContext *s, int nbits, int inverse) return -1; } -static void fft_permute_c(FFTContext *s, FFTComplex *z) +void fft_permute(FFTContext *s, FFTComplex *z) { int j, np; const uint16_t *revtab = s->revtab; @@ -315,12 +310,11 @@ static void (* const fft_dispatch[])(FFTComplex*) = { fft2048, fft4096, fft8192, fft16384, fft32768, fft65536, }; -static void fft_calc_c(FFTContext *s, FFTComplex *z) +void fft_calc(FFTContext *s, FFTComplex *z) { fft_dispatch[s->nbits-2](z); } - #include #include @@ -383,7 +377,7 @@ fail: * @param output N/2 samples * @param input N/2 samples */ -void ff_imdct_half_c(FFTContext *s, FFTSample *output, const FFTSample *input) +void imdct_half(FFTContext *s, FFTSample *output, const FFTSample *input) { int k, n8, n4, n2, n, j; const uint16_t *revtab = s->revtab; @@ -406,7 +400,7 @@ void ff_imdct_half_c(FFTContext *s, FFTSample *output, const FFTSample *input) in1 += 2; in2 -= 2; } - s->fft_calc(s, z); + fft_calc(s, z); /* post rotation + reordering */ for (k = 0; k < n8; k++) { @@ -425,14 +419,14 @@ void ff_imdct_half_c(FFTContext *s, FFTSample *output, const FFTSample *input) * @param output N samples * @param input N/2 samples */ -void ff_imdct_calc_c(FFTContext *s, FFTSample *output, const FFTSample *input) +void imdct_calc(FFTContext *s, FFTSample *output, const FFTSample *input) { int k; int n = 1 << s->mdct_bits; int n2 = n >> 1; int n4 = n >> 2; - ff_imdct_half_c(s, output + n4, input); + imdct_half(s, output + n4, input); for (k = 0; k < n4; k++) { output[k] = -output[n2 - k - 1]; @@ -445,7 +439,7 @@ void ff_imdct_calc_c(FFTContext *s, FFTSample *output, const FFTSample *input) * @param input N samples * @param out N/2 samples */ -void ff_mdct_calc_c(FFTContext *s, FFTSample *out, const FFTSample *input) +void mdct_calc(FFTContext *s, FFTSample *out, const FFTSample *input) { int i, j, n, n8, n4, n2, n3; FFTDouble re, im; @@ -473,7 +467,7 @@ void ff_mdct_calc_c(FFTContext *s, FFTSample *out, const 
FFTSample *input) CMUL(x[j].re, x[j].im, re, im, -tcos[n8 + i], tsin[n8 + i]); } - s->fft_calc(s, x); + fft_calc(s, x); /* post rotation */ for (i = 0; i < n8; i++) { diff --git a/ext/at3_standalone/fft.h b/ext/at3_standalone/fft.h index 3c9e413bb1..d4dfd940ec 100644 --- a/ext/at3_standalone/fft.h +++ b/ext/at3_standalone/fft.h @@ -57,21 +57,23 @@ struct FFTContext { /* pre/post rotation tables */ FFTSample *tcos; FFTSample *tsin; - /** - * Do the permutation needed BEFORE calling fft_calc(). - */ - void (*fft_permute)(struct FFTContext *s, FFTComplex *z); - /** - * Do a complex FFT with the parameters defined in ff_fft_init(). The - * input data must be permuted before. No 1.0/sqrt(n) normalization is done. - */ - void (*fft_calc)(struct FFTContext *s, FFTComplex *z); - void (*imdct_calc)(struct FFTContext *s, FFTSample *output, const FFTSample *input); - void (*imdct_half)(struct FFTContext *s, FFTSample *output, const FFTSample *input); - void (*mdct_calc)(struct FFTContext *s, FFTSample *output, const FFTSample *input); + enum mdct_permutation_type mdct_permutation; }; +/** + * Do the permutation needed BEFORE calling fft_calc(). + */ +void fft_permute(struct FFTContext *s, FFTComplex *z); +/** + * Do a complex FFT with the parameters defined in ff_fft_init(). The + * input data must be permuted before. No 1.0/sqrt(n) normalization is done. + */ +void fft_calc(struct FFTContext *s, FFTComplex *z); +void imdct_calc(struct FFTContext *s, FFTSample *output, const FFTSample *input); +void imdct_half(struct FFTContext *s, FFTSample *output, const FFTSample *input); +void mdct_calc(struct FFTContext *s, FFTSample *output, const FFTSample *input); + #define COSTABLE(size) \ DECLARE_ALIGNED(32, FFTSample, av_cos_##size)[size/2]