Add some SSE code for an atrac hotspot just for fun, remove function pointers

This commit is contained in:
Henrik Rydgård 2024-04-13 10:10:31 +02:00
parent 52111103b8
commit 7b6b7ebe3c
5 changed files with 72 additions and 38 deletions

View File

@ -140,7 +140,7 @@ static void imlt(ATRAC3Context *q, float *input, float *output, int odd_band)
FFSWAP(float, input[i], input[255 - i]);
}
q->mdct_ctx.imdct_calc(&q->mdct_ctx, output, input);
imdct_calc(&q->mdct_ctx, output, input);
/* Perform windowing on the output. */
vector_fmul(output, output, mdct_window, MDCT_SIZE);

View File

@ -25,6 +25,22 @@
* DSP functions for ATRAC3+ decoder.
*/
#include "ppsspp_config.h"
#if PPSSPP_ARCH(X86) || PPSSPP_ARCH(AMD64)
#include <emmintrin.h>
#elif PPSSPP_ARCH(ARM_NEON)
#if defined(_MSC_VER) && PPSSPP_ARCH(ARM64)
#include <arm64_neon.h>
#else
#include <arm_neon.h>
#endif
#endif
#include <math.h>
#include <string.h>
@ -466,7 +482,7 @@ void ff_atrac3p_imdct(FFTContext *mdct_ctx, float *pIn,
for (i = 0; i < ATRAC3P_SUBBAND_SAMPLES / 2; i++)
FFSWAP(float, pIn[i], pIn[ATRAC3P_SUBBAND_SAMPLES - 1 - i]);
mdct_ctx->imdct_calc(mdct_ctx, pOut, pIn);
imdct_calc(mdct_ctx, pOut, pIn);
/* Perform windowing on the output.
* ATRAC3+ uses two different MDCT windows:
@ -612,7 +628,7 @@ void ff_atrac3p_ipqf(FFTContext *dct_ctx, Atrac3pIPQFChannelCtx *hist,
idct_in[sb] = in[sb * ATRAC3P_SUBBAND_SAMPLES + s];
/* Calculate the sine and cosine part of the PQF using IDCT-IV */
dct_ctx->imdct_half(dct_ctx, idct_out, idct_in);
imdct_half(dct_ctx, idct_out, idct_in);
/* append the result to the history */
for (i = 0; i < 8; i++) {
@ -629,13 +645,32 @@ void ff_atrac3p_ipqf(FFTContext *dct_ctx, Atrac3pIPQFChannelCtx *hist,
const float *coeffs1 = ipqf_coeffs1[t];
const float *coeffs2 = ipqf_coeffs2[t];
float *outp = out + s * 16;
float *outp = out + s * 16;
#if PPSSPP_ARCH(X86) || PPSSPP_ARCH(AMD64)
// Reverse the order of the four float lanes: (v0, v1, v2, v3) -> (v3, v2, v1, v0).
// Both shuffle operands are the same register, so the mask alone picks the order.
const auto _mm_reverse = [](__m128 lanes) -> __m128 {
    return _mm_shuffle_ps(lanes, lanes, _MM_SHUFFLE(0, 1, 2, 3));
};
_mm_storeu_ps(outp, _mm_add_ps(_mm_loadu_ps(outp), _mm_add_ps(
_mm_mul_ps(_mm_loadu_ps(buf1), _mm_loadu_ps(coeffs1)),
_mm_mul_ps(_mm_loadu_ps(buf2), _mm_loadu_ps(coeffs2)))));
_mm_storeu_ps(outp + 4, _mm_add_ps(_mm_loadu_ps(outp + 4), _mm_add_ps(
_mm_mul_ps(_mm_loadu_ps(buf1 + 4), _mm_loadu_ps(coeffs1 + 4)),
_mm_mul_ps(_mm_loadu_ps(buf2 + 4), _mm_loadu_ps(coeffs2 + 4)))));
_mm_storeu_ps(outp + 8, _mm_add_ps(_mm_loadu_ps(outp + 8), _mm_add_ps(
_mm_mul_ps(_mm_reverse(_mm_loadu_ps(buf1 + 4)), _mm_loadu_ps(coeffs1 + 8)),
_mm_mul_ps(_mm_reverse(_mm_loadu_ps(buf2 + 4)), _mm_loadu_ps(coeffs2 + 8)))));
_mm_storeu_ps(outp + 12, _mm_add_ps(_mm_loadu_ps(outp + 12), _mm_add_ps(
_mm_mul_ps(_mm_reverse(_mm_loadu_ps(buf1)), _mm_loadu_ps(coeffs1 + 12)),
_mm_mul_ps(_mm_reverse(_mm_loadu_ps(buf2)), _mm_loadu_ps(coeffs2 + 12)))));
#else
for (i = 0; i < 8; i++) {
outp[i] += buf1[i] * coeffs1[i] + buf2[i] * coeffs2[i];
}
for (i = 0; i < 8; i++) {
outp[i + 8] += buf1[7 - i] * coeffs1[i + 8] + buf2[7 - i] * coeffs2[i + 8];
}
for (i = 0; i < 8; i++) {
outp[i + 8] += buf1[7 - i] * coeffs1[i + 8] + buf2[7 - i] * coeffs2[i + 8];
}
#endif
pos_now = mod23_lut[pos_next + 2]; // pos_now = (pos_now + 2) % 23;
pos_next = mod23_lut[pos_now + 2]; // pos_next = (pos_next + 2) % 23;

View File

@ -18,6 +18,9 @@
#define AV_HAVE_FAST_UNALIGNED 0
#define AV_INPUT_BUFFER_PADDING_SIZE 32
// TODO: Defining BITSTREAM_READER_LE here should work, but currently doesn't. Needs investigation.
// #define BITSTREAM_READER_LE
#define LOCAL_ALIGNED(bits, type, name, subscript) type name subscript
#define av_restrict
#define av_alias

View File

@ -35,9 +35,9 @@
#define sqrthalf (float)M_SQRT1_2
void ff_imdct_calc_c(FFTContext *s, FFTSample *output, const FFTSample *input);
void ff_imdct_half_c(FFTContext *s, FFTSample *output, const FFTSample *input);
void ff_mdct_calc_c(FFTContext *s, FFTSample *output, const FFTSample *input);
void imdct_calc(FFTContext *s, FFTSample *output, const FFTSample *input);
void imdct_half(FFTContext *s, FFTSample *output, const FFTSample *input);
void mdct_calc(FFTContext *s, FFTSample *output, const FFTSample *input);
/* cos(2*pi*x/n) for 0<=x<=n/4, followed by its reverse */
COSTABLE(16);
@ -71,8 +71,8 @@ static FFTSample * const av_cos_tabs[] = {
av_cos_65536,
};
static void fft_permute_c(FFTContext *s, FFTComplex *z);
static void fft_calc_c(FFTContext *s, FFTComplex *z);
void fft_permute(FFTContext *s, FFTComplex *z);
void fft_calc(FFTContext *s, FFTComplex *z);
static int split_radix_permutation(int i, int n, int inverse)
{
@ -130,11 +130,6 @@ int ff_fft_init(FFTContext *s, int nbits, int inverse)
goto fail;
s->inverse = inverse;
s->fft_permute = fft_permute_c;
s->fft_calc = fft_calc_c;
s->imdct_calc = ff_imdct_calc_c;
s->imdct_half = ff_imdct_half_c;
s->mdct_calc = ff_mdct_calc_c;
for(j=4; j<=nbits; j++) {
ff_init_ff_cos_tabs(j);
}
@ -152,7 +147,7 @@ int ff_fft_init(FFTContext *s, int nbits, int inverse)
return -1;
}
static void fft_permute_c(FFTContext *s, FFTComplex *z)
void fft_permute(FFTContext *s, FFTComplex *z)
{
int j, np;
const uint16_t *revtab = s->revtab;
@ -315,12 +310,11 @@ static void (* const fft_dispatch[])(FFTComplex*) = {
fft2048, fft4096, fft8192, fft16384, fft32768, fft65536,
};
static void fft_calc_c(FFTContext *s, FFTComplex *z)
/**
 * Run the complex FFT on z by calling the fft_dispatch entry selected by
 * the context's nbits; the table is indexed by nbits - 2.
 */
void fft_calc(FFTContext *s, FFTComplex *z)
{
    (*fft_dispatch[s->nbits - 2])(z);
}
#include <stdlib.h>
#include <string.h>
@ -383,7 +377,7 @@ fail:
* @param output N/2 samples
* @param input N/2 samples
*/
void ff_imdct_half_c(FFTContext *s, FFTSample *output, const FFTSample *input)
void imdct_half(FFTContext *s, FFTSample *output, const FFTSample *input)
{
int k, n8, n4, n2, n, j;
const uint16_t *revtab = s->revtab;
@ -406,7 +400,7 @@ void ff_imdct_half_c(FFTContext *s, FFTSample *output, const FFTSample *input)
in1 += 2;
in2 -= 2;
}
s->fft_calc(s, z);
fft_calc(s, z);
/* post rotation + reordering */
for (k = 0; k < n8; k++) {
@ -425,14 +419,14 @@ void ff_imdct_half_c(FFTContext *s, FFTSample *output, const FFTSample *input)
* @param output N samples
* @param input N/2 samples
*/
void ff_imdct_calc_c(FFTContext *s, FFTSample *output, const FFTSample *input)
void imdct_calc(FFTContext *s, FFTSample *output, const FFTSample *input)
{
int k;
int n = 1 << s->mdct_bits;
int n2 = n >> 1;
int n4 = n >> 2;
ff_imdct_half_c(s, output + n4, input);
imdct_half(s, output + n4, input);
for (k = 0; k < n4; k++) {
output[k] = -output[n2 - k - 1];
@ -445,7 +439,7 @@ void ff_imdct_calc_c(FFTContext *s, FFTSample *output, const FFTSample *input)
* @param input N samples
* @param out N/2 samples
*/
void ff_mdct_calc_c(FFTContext *s, FFTSample *out, const FFTSample *input)
void mdct_calc(FFTContext *s, FFTSample *out, const FFTSample *input)
{
int i, j, n, n8, n4, n2, n3;
FFTDouble re, im;
@ -473,7 +467,7 @@ void ff_mdct_calc_c(FFTContext *s, FFTSample *out, const FFTSample *input)
CMUL(x[j].re, x[j].im, re, im, -tcos[n8 + i], tsin[n8 + i]);
}
s->fft_calc(s, x);
fft_calc(s, x);
/* post rotation */
for (i = 0; i < n8; i++) {

View File

@ -57,21 +57,23 @@ struct FFTContext {
/* pre/post rotation tables */
FFTSample *tcos;
FFTSample *tsin;
/**
* Do the permutation needed BEFORE calling fft_calc().
*/
void (*fft_permute)(struct FFTContext *s, FFTComplex *z);
/**
* Do a complex FFT with the parameters defined in ff_fft_init(). The
* input data must be permuted before. No 1.0/sqrt(n) normalization is done.
*/
void (*fft_calc)(struct FFTContext *s, FFTComplex *z);
void (*imdct_calc)(struct FFTContext *s, FFTSample *output, const FFTSample *input);
void (*imdct_half)(struct FFTContext *s, FFTSample *output, const FFTSample *input);
void (*mdct_calc)(struct FFTContext *s, FFTSample *output, const FFTSample *input);
enum mdct_permutation_type mdct_permutation;
};
/**
 * Do the permutation needed BEFORE calling fft_calc().
 */
void fft_permute(struct FFTContext *s, FFTComplex *z);
/**
 * Do a complex FFT with the parameters defined in ff_fft_init(). The
 * input data must be permuted before. No 1.0/sqrt(n) normalization is done.
 */
void fft_calc(struct FFTContext *s, FFTComplex *z);
/**
 * Full-length transform built on imdct_half(), with N = 1 << s->mdct_bits.
 * @param output N samples
 * @param input N/2 samples
 */
void imdct_calc(struct FFTContext *s, FFTSample *output, const FFTSample *input);
/**
 * NOTE(review): N is presumably 1 << s->mdct_bits, as in imdct_calc() - confirm.
 * @param output N/2 samples
 * @param input N/2 samples
 */
void imdct_half(struct FFTContext *s, FFTSample *output, const FFTSample *input);
/**
 * NOTE(review): N is presumably 1 << s->mdct_bits, as in imdct_calc() - confirm.
 * @param output N/2 samples
 * @param input N samples
 */
void mdct_calc(struct FFTContext *s, FFTSample *output, const FFTSample *input);
#define COSTABLE(size) \
DECLARE_ALIGNED(32, FFTSample, av_cos_##size)[size/2]