mirror of
https://github.com/hrydgard/ppsspp.git
synced 2024-11-23 13:30:02 +00:00
Add some SSE code for an ATRAC hotspot just for fun; remove function pointers
This commit is contained in:
parent
52111103b8
commit
7b6b7ebe3c
@ -140,7 +140,7 @@ static void imlt(ATRAC3Context *q, float *input, float *output, int odd_band)
|
||||
FFSWAP(float, input[i], input[255 - i]);
|
||||
}
|
||||
|
||||
q->mdct_ctx.imdct_calc(&q->mdct_ctx, output, input);
|
||||
imdct_calc(&q->mdct_ctx, output, input);
|
||||
|
||||
/* Perform windowing on the output. */
|
||||
vector_fmul(output, output, mdct_window, MDCT_SIZE);
|
||||
|
@ -25,6 +25,22 @@
|
||||
* DSP functions for ATRAC3+ decoder.
|
||||
*/
|
||||
|
||||
#include "ppsspp_config.h"
|
||||
|
||||
#if PPSSPP_ARCH(X86) || PPSSPP_ARCH(AMD64)
|
||||
|
||||
#include <emmintrin.h>
|
||||
|
||||
#elif PPSSPP_ARCH(ARM_NEON)
|
||||
|
||||
#if defined(_MSC_VER) && PPSSPP_ARCH(ARM64)
|
||||
#include <arm64_neon.h>
|
||||
#else
|
||||
#include <arm_neon.h>
|
||||
#endif
|
||||
|
||||
#endif
|
||||
|
||||
#include <math.h>
|
||||
#include <string.h>
|
||||
|
||||
@ -466,7 +482,7 @@ void ff_atrac3p_imdct(FFTContext *mdct_ctx, float *pIn,
|
||||
for (i = 0; i < ATRAC3P_SUBBAND_SAMPLES / 2; i++)
|
||||
FFSWAP(float, pIn[i], pIn[ATRAC3P_SUBBAND_SAMPLES - 1 - i]);
|
||||
|
||||
mdct_ctx->imdct_calc(mdct_ctx, pOut, pIn);
|
||||
imdct_calc(mdct_ctx, pOut, pIn);
|
||||
|
||||
/* Perform windowing on the output.
|
||||
* ATRAC3+ uses two different MDCT windows:
|
||||
@ -612,7 +628,7 @@ void ff_atrac3p_ipqf(FFTContext *dct_ctx, Atrac3pIPQFChannelCtx *hist,
|
||||
idct_in[sb] = in[sb * ATRAC3P_SUBBAND_SAMPLES + s];
|
||||
|
||||
/* Calculate the sine and cosine part of the PQF using IDCT-IV */
|
||||
dct_ctx->imdct_half(dct_ctx, idct_out, idct_in);
|
||||
imdct_half(dct_ctx, idct_out, idct_in);
|
||||
|
||||
/* append the result to the history */
|
||||
for (i = 0; i < 8; i++) {
|
||||
@ -629,13 +645,32 @@ void ff_atrac3p_ipqf(FFTContext *dct_ctx, Atrac3pIPQFChannelCtx *hist,
|
||||
const float *coeffs1 = ipqf_coeffs1[t];
|
||||
const float *coeffs2 = ipqf_coeffs2[t];
|
||||
|
||||
float *outp = out + s * 16;
|
||||
float *outp = out + s * 16;
|
||||
#if PPSSPP_ARCH(X86) || PPSSPP_ARCH(AMD64)
|
||||
auto _mm_reverse = [](__m128 x) -> __m128 {
|
||||
return _mm_shuffle_ps(x, x, _MM_SHUFFLE(0, 1, 2, 3));
|
||||
};
|
||||
_mm_storeu_ps(outp, _mm_add_ps(_mm_loadu_ps(outp), _mm_add_ps(
|
||||
_mm_mul_ps(_mm_loadu_ps(buf1), _mm_loadu_ps(coeffs1)),
|
||||
_mm_mul_ps(_mm_loadu_ps(buf2), _mm_loadu_ps(coeffs2)))));
|
||||
_mm_storeu_ps(outp + 4, _mm_add_ps(_mm_loadu_ps(outp + 4), _mm_add_ps(
|
||||
_mm_mul_ps(_mm_loadu_ps(buf1 + 4), _mm_loadu_ps(coeffs1 + 4)),
|
||||
_mm_mul_ps(_mm_loadu_ps(buf2 + 4), _mm_loadu_ps(coeffs2 + 4)))));
|
||||
|
||||
_mm_storeu_ps(outp + 8, _mm_add_ps(_mm_loadu_ps(outp + 8), _mm_add_ps(
|
||||
_mm_mul_ps(_mm_reverse(_mm_loadu_ps(buf1 + 4)), _mm_loadu_ps(coeffs1 + 8)),
|
||||
_mm_mul_ps(_mm_reverse(_mm_loadu_ps(buf2 + 4)), _mm_loadu_ps(coeffs2 + 8)))));
|
||||
_mm_storeu_ps(outp + 12, _mm_add_ps(_mm_loadu_ps(outp + 12), _mm_add_ps(
|
||||
_mm_mul_ps(_mm_reverse(_mm_loadu_ps(buf1)), _mm_loadu_ps(coeffs1 + 12)),
|
||||
_mm_mul_ps(_mm_reverse(_mm_loadu_ps(buf2)), _mm_loadu_ps(coeffs2 + 12)))));
|
||||
#else
|
||||
for (i = 0; i < 8; i++) {
|
||||
outp[i] += buf1[i] * coeffs1[i] + buf2[i] * coeffs2[i];
|
||||
}
|
||||
for (i = 0; i < 8; i++) {
|
||||
outp[i + 8] += buf1[7 - i] * coeffs1[i + 8] + buf2[7 - i] * coeffs2[i + 8];
|
||||
}
|
||||
for (i = 0; i < 8; i++) {
|
||||
outp[i + 8] += buf1[7 - i] * coeffs1[i + 8] + buf2[7 - i] * coeffs2[i + 8];
|
||||
}
|
||||
#endif
|
||||
|
||||
pos_now = mod23_lut[pos_next + 2]; // pos_now = (pos_now + 2) % 23;
|
||||
pos_next = mod23_lut[pos_now + 2]; // pos_next = (pos_next + 2) % 23;
|
||||
|
@ -18,6 +18,9 @@
|
||||
#define AV_HAVE_FAST_UNALIGNED 0
|
||||
#define AV_INPUT_BUFFER_PADDING_SIZE 32
|
||||
|
||||
// TODO: Defining BITSTREAM_READER_LE (below) should work, but it doesn't. Needs investigation.
|
||||
// #define BITSTREAM_READER_LE
|
||||
|
||||
#define LOCAL_ALIGNED(bits, type, name, subscript) type name subscript
|
||||
#define av_restrict
|
||||
#define av_alias
|
||||
|
@ -35,9 +35,9 @@
|
||||
|
||||
#define sqrthalf (float)M_SQRT1_2
|
||||
|
||||
void ff_imdct_calc_c(FFTContext *s, FFTSample *output, const FFTSample *input);
|
||||
void ff_imdct_half_c(FFTContext *s, FFTSample *output, const FFTSample *input);
|
||||
void ff_mdct_calc_c(FFTContext *s, FFTSample *output, const FFTSample *input);
|
||||
void imdct_calc(FFTContext *s, FFTSample *output, const FFTSample *input);
|
||||
void imdct_half(FFTContext *s, FFTSample *output, const FFTSample *input);
|
||||
void mdct_calc(FFTContext *s, FFTSample *output, const FFTSample *input);
|
||||
|
||||
/* cos(2*pi*x/n) for 0<=x<=n/4, followed by its reverse */
|
||||
COSTABLE(16);
|
||||
@ -71,8 +71,8 @@ static FFTSample * const av_cos_tabs[] = {
|
||||
av_cos_65536,
|
||||
};
|
||||
|
||||
static void fft_permute_c(FFTContext *s, FFTComplex *z);
|
||||
static void fft_calc_c(FFTContext *s, FFTComplex *z);
|
||||
void fft_permute(FFTContext *s, FFTComplex *z);
|
||||
void fft_calc(FFTContext *s, FFTComplex *z);
|
||||
|
||||
static int split_radix_permutation(int i, int n, int inverse)
|
||||
{
|
||||
@ -130,11 +130,6 @@ int ff_fft_init(FFTContext *s, int nbits, int inverse)
|
||||
goto fail;
|
||||
s->inverse = inverse;
|
||||
|
||||
s->fft_permute = fft_permute_c;
|
||||
s->fft_calc = fft_calc_c;
|
||||
s->imdct_calc = ff_imdct_calc_c;
|
||||
s->imdct_half = ff_imdct_half_c;
|
||||
s->mdct_calc = ff_mdct_calc_c;
|
||||
for(j=4; j<=nbits; j++) {
|
||||
ff_init_ff_cos_tabs(j);
|
||||
}
|
||||
@ -152,7 +147,7 @@ int ff_fft_init(FFTContext *s, int nbits, int inverse)
|
||||
return -1;
|
||||
}
|
||||
|
||||
static void fft_permute_c(FFTContext *s, FFTComplex *z)
|
||||
void fft_permute(FFTContext *s, FFTComplex *z)
|
||||
{
|
||||
int j, np;
|
||||
const uint16_t *revtab = s->revtab;
|
||||
@ -315,12 +310,11 @@ static void (* const fft_dispatch[])(FFTComplex*) = {
|
||||
fft2048, fft4096, fft8192, fft16384, fft32768, fft65536,
|
||||
};
|
||||
|
||||
static void fft_calc_c(FFTContext *s, FFTComplex *z)
|
||||
void fft_calc(FFTContext *s, FFTComplex *z)
|
||||
{
|
||||
fft_dispatch[s->nbits-2](z);
|
||||
}
|
||||
|
||||
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
|
||||
@ -383,7 +377,7 @@ fail:
|
||||
* @param output N/2 samples
|
||||
* @param input N/2 samples
|
||||
*/
|
||||
void ff_imdct_half_c(FFTContext *s, FFTSample *output, const FFTSample *input)
|
||||
void imdct_half(FFTContext *s, FFTSample *output, const FFTSample *input)
|
||||
{
|
||||
int k, n8, n4, n2, n, j;
|
||||
const uint16_t *revtab = s->revtab;
|
||||
@ -406,7 +400,7 @@ void ff_imdct_half_c(FFTContext *s, FFTSample *output, const FFTSample *input)
|
||||
in1 += 2;
|
||||
in2 -= 2;
|
||||
}
|
||||
s->fft_calc(s, z);
|
||||
fft_calc(s, z);
|
||||
|
||||
/* post rotation + reordering */
|
||||
for (k = 0; k < n8; k++) {
|
||||
@ -425,14 +419,14 @@ void ff_imdct_half_c(FFTContext *s, FFTSample *output, const FFTSample *input)
|
||||
* @param output N samples
|
||||
* @param input N/2 samples
|
||||
*/
|
||||
void ff_imdct_calc_c(FFTContext *s, FFTSample *output, const FFTSample *input)
|
||||
void imdct_calc(FFTContext *s, FFTSample *output, const FFTSample *input)
|
||||
{
|
||||
int k;
|
||||
int n = 1 << s->mdct_bits;
|
||||
int n2 = n >> 1;
|
||||
int n4 = n >> 2;
|
||||
|
||||
ff_imdct_half_c(s, output + n4, input);
|
||||
imdct_half(s, output + n4, input);
|
||||
|
||||
for (k = 0; k < n4; k++) {
|
||||
output[k] = -output[n2 - k - 1];
|
||||
@ -445,7 +439,7 @@ void ff_imdct_calc_c(FFTContext *s, FFTSample *output, const FFTSample *input)
|
||||
* @param input N samples
|
||||
* @param out N/2 samples
|
||||
*/
|
||||
void ff_mdct_calc_c(FFTContext *s, FFTSample *out, const FFTSample *input)
|
||||
void mdct_calc(FFTContext *s, FFTSample *out, const FFTSample *input)
|
||||
{
|
||||
int i, j, n, n8, n4, n2, n3;
|
||||
FFTDouble re, im;
|
||||
@ -473,7 +467,7 @@ void ff_mdct_calc_c(FFTContext *s, FFTSample *out, const FFTSample *input)
|
||||
CMUL(x[j].re, x[j].im, re, im, -tcos[n8 + i], tsin[n8 + i]);
|
||||
}
|
||||
|
||||
s->fft_calc(s, x);
|
||||
fft_calc(s, x);
|
||||
|
||||
/* post rotation */
|
||||
for (i = 0; i < n8; i++) {
|
||||
|
@ -57,21 +57,23 @@ struct FFTContext {
|
||||
/* pre/post rotation tables */
|
||||
FFTSample *tcos;
|
||||
FFTSample *tsin;
|
||||
/**
|
||||
* Do the permutation needed BEFORE calling fft_calc().
|
||||
*/
|
||||
void (*fft_permute)(struct FFTContext *s, FFTComplex *z);
|
||||
/**
|
||||
* Do a complex FFT with the parameters defined in ff_fft_init(). The
|
||||
* input data must be permuted before. No 1.0/sqrt(n) normalization is done.
|
||||
*/
|
||||
void (*fft_calc)(struct FFTContext *s, FFTComplex *z);
|
||||
void (*imdct_calc)(struct FFTContext *s, FFTSample *output, const FFTSample *input);
|
||||
void (*imdct_half)(struct FFTContext *s, FFTSample *output, const FFTSample *input);
|
||||
void (*mdct_calc)(struct FFTContext *s, FFTSample *output, const FFTSample *input);
|
||||
|
||||
enum mdct_permutation_type mdct_permutation;
|
||||
};
|
||||
|
||||
/**
|
||||
* Do the permutation needed BEFORE calling fft_calc().
|
||||
*/
|
||||
void fft_permute(struct FFTContext *s, FFTComplex *z);
|
||||
/**
|
||||
* Do a complex FFT with the parameters defined in ff_fft_init(). The
|
||||
* input data must be permuted before. No 1.0/sqrt(n) normalization is done.
|
||||
*/
|
||||
void fft_calc(struct FFTContext *s, FFTComplex *z);
|
||||
void imdct_calc(struct FFTContext *s, FFTSample *output, const FFTSample *input);
|
||||
void imdct_half(struct FFTContext *s, FFTSample *output, const FFTSample *input);
|
||||
void mdct_calc(struct FFTContext *s, FFTSample *output, const FFTSample *input);
|
||||
|
||||
#define COSTABLE(size) \
|
||||
DECLARE_ALIGNED(32, FFTSample, av_cos_##size)[size/2]
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user