mirror of
https://github.com/CTCaer/RetroArch.git
synced 2025-01-19 01:12:33 +00:00
Optimize sinc resampler a bit.
This commit is contained in:
parent
bb824b5679
commit
0496ffc007
155
audio/sinc.c
155
audio/sinc.c
@ -19,6 +19,7 @@
|
||||
// Only suitable as an upsampler, as there is no low-pass filter stage.
|
||||
|
||||
#include "resampler.h"
|
||||
#include "../general.h"
|
||||
#include <math.h>
|
||||
#include <stdint.h>
|
||||
#include <stdlib.h>
|
||||
@ -29,6 +30,10 @@
|
||||
#include <xmmintrin.h>
|
||||
#endif
|
||||
|
||||
#if __SSE3__
|
||||
#include <pmmintrin.h>
|
||||
#endif
|
||||
|
||||
#define PHASE_BITS 8
|
||||
#define SUBPHASE_BITS 16
|
||||
|
||||
@ -41,11 +46,14 @@
|
||||
#define FRAMES_SHIFT (PHASE_BITS + SUBPHASE_BITS)
|
||||
|
||||
#define SIDELOBES 8
|
||||
#define TAPS (SIDELOBES * 2)
|
||||
|
||||
#define PHASE_INDEX 0
|
||||
#define DELTA_INDEX 1
|
||||
|
||||
struct ssnes_resampler
|
||||
{
|
||||
float phase_table[PHASES + 1][SIDELOBES];
|
||||
float delta_table[PHASES + 1][SIDELOBES];
|
||||
float phase_table[PHASES][2][2 * SIDELOBES];
|
||||
float buffer_l[2 * SIDELOBES];
|
||||
float buffer_r[2 * SIDELOBES];
|
||||
|
||||
@ -75,19 +83,26 @@ static inline double blackman(double index)
|
||||
|
||||
static void init_sinc_table(ssnes_resampler_t *resamp)
|
||||
{
|
||||
for (unsigned i = 0; i <= PHASES; i++)
|
||||
// Sinc phases: [..., p + 3, p + 2, p + 1, p + 0, p - 1, p - 2, p - 3, p - 4, ...]
|
||||
for (int i = 0; i < PHASES; i++)
|
||||
{
|
||||
for (unsigned j = 0; j < SIDELOBES; j++)
|
||||
for (int j = 0; j < 2 * SIDELOBES; j++)
|
||||
{
|
||||
double sinc_phase = M_PI * ((double)i / PHASES + (double)j);
|
||||
resamp->phase_table[i][j] = sinc(sinc_phase) * blackman(sinc_phase / SIDELOBES);
|
||||
double p = (double)i / PHASES;
|
||||
double sinc_phase = M_PI * (p + (SIDELOBES - 1 - j));
|
||||
resamp->phase_table[i][PHASE_INDEX][j] = sinc(sinc_phase) * blackman(sinc_phase / SIDELOBES);
|
||||
}
|
||||
}
|
||||
|
||||
// Optimize linear interpolation.
|
||||
for (unsigned i = 0; i < PHASES; i++)
|
||||
for (unsigned j = 0; j < SIDELOBES; j++)
|
||||
resamp->delta_table[i][j] = resamp->phase_table[i + 1][j] - resamp->phase_table[i][j];
|
||||
for (int i = 0; i < PHASES - 1; i++)
|
||||
{
|
||||
for (int j = 0; j < 2 * SIDELOBES; j++)
|
||||
{
|
||||
resamp->phase_table[i][DELTA_INDEX][j] =
|
||||
(resamp->phase_table[i + 1][PHASE_INDEX][j] - resamp->phase_table[i][PHASE_INDEX][j]) / SUBPHASES;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
ssnes_resampler_t *resampler_new(void)
|
||||
@ -99,6 +114,15 @@ ssnes_resampler_t *resampler_new(void)
|
||||
memset(re, 0, sizeof(*re));
|
||||
|
||||
init_sinc_table(re);
|
||||
|
||||
#if __SSE3__
|
||||
SSNES_LOG("Sinc resampler [SSE3]\n");
|
||||
#elif __SSE__
|
||||
SSNES_LOG("Sinc resampler [SSE]\n");
|
||||
#else
|
||||
SSNES_LOG("Sinc resampler [C]\n");
|
||||
#endif
|
||||
|
||||
return re;
|
||||
}
|
||||
|
||||
@ -113,68 +137,41 @@ static void process_sinc(ssnes_resampler_t *resamp, float *out_buffer)
|
||||
|
||||
unsigned phase = resamp->time >> PHASES_SHIFT;
|
||||
unsigned delta = (resamp->time >> SUBPHASES_SHIFT) & SUBPHASES_MASK;
|
||||
__m128 delta_f = _mm_set1_ps((float)delta / SUBPHASES);
|
||||
__m128 delta_f = _mm_set1_ps(delta);
|
||||
|
||||
const float *phase_table = resamp->phase_table[phase];
|
||||
const float *delta_table = resamp->delta_table[phase];
|
||||
const float *phase_table = resamp->phase_table[phase][PHASE_INDEX];
|
||||
const float *delta_table = resamp->phase_table[phase][DELTA_INDEX];
|
||||
|
||||
__m128 sinc_vals[SIDELOBES / 4];
|
||||
for (unsigned i = 0; i < SIDELOBES; i += 4)
|
||||
for (unsigned i = 0; i < TAPS; i += 4)
|
||||
{
|
||||
__m128 phases = _mm_load_ps(phase_table + i);
|
||||
__m128 deltas = _mm_load_ps(delta_table + i);
|
||||
sinc_vals[i / 4] = _mm_add_ps(phases, _mm_mul_ps(deltas, delta_f));
|
||||
__m128 buf_l = _mm_load_ps(buffer_l + i);
|
||||
__m128 buf_r = _mm_load_ps(buffer_r + i);
|
||||
|
||||
__m128 phases = _mm_load_ps(phase_table + i);
|
||||
__m128 deltas = _mm_load_ps(delta_table + i);
|
||||
|
||||
__m128 sinc = _mm_add_ps(phases, _mm_mul_ps(deltas, delta_f));
|
||||
|
||||
sum_l = _mm_add_ps(sum_l, _mm_mul_ps(buf_l, sinc));
|
||||
sum_r = _mm_add_ps(sum_r, _mm_mul_ps(buf_r, sinc));
|
||||
}
|
||||
|
||||
// Older data.
|
||||
for (unsigned i = 0; i < SIDELOBES; i += 4)
|
||||
{
|
||||
__m128 buf_l = _mm_loadr_ps(buffer_l + SIDELOBES - 4 - i);
|
||||
sum_l = _mm_add_ps(sum_l, _mm_mul_ps(buf_l, sinc_vals[i / 4]));
|
||||
|
||||
__m128 buf_r = _mm_loadr_ps(buffer_r + SIDELOBES - 4 - i);
|
||||
sum_r = _mm_add_ps(sum_r, _mm_mul_ps(buf_r, sinc_vals[i / 4]));
|
||||
}
|
||||
|
||||
// Newer data.
|
||||
unsigned reverse_phase = PHASES_WRAP - resamp->time;
|
||||
phase = reverse_phase >> PHASES_SHIFT;
|
||||
delta = (reverse_phase >> SUBPHASES_SHIFT) & SUBPHASES_MASK;
|
||||
delta_f = _mm_set1_ps((float)delta / SUBPHASES);
|
||||
|
||||
phase_table = resamp->phase_table[phase];
|
||||
delta_table = resamp->delta_table[phase];
|
||||
|
||||
for (unsigned i = 0; i < SIDELOBES; i += 4)
|
||||
{
|
||||
__m128 phases = _mm_load_ps(phase_table + i);
|
||||
__m128 deltas = _mm_load_ps(delta_table + i);
|
||||
sinc_vals[i / 4] = _mm_add_ps(phases, _mm_mul_ps(deltas, delta_f));
|
||||
}
|
||||
|
||||
for (unsigned i = 0; i < SIDELOBES; i += 4)
|
||||
{
|
||||
__m128 buf_l = _mm_load_ps(buffer_l + SIDELOBES + i);
|
||||
sum_l = _mm_add_ps(sum_l, _mm_mul_ps(buf_l, sinc_vals[i / 4]));
|
||||
|
||||
__m128 buf_r = _mm_load_ps(buffer_r + SIDELOBES + i);
|
||||
sum_r = _mm_add_ps(sum_r, _mm_mul_ps(buf_r, sinc_vals[i / 4]));
|
||||
}
|
||||
|
||||
// This can be done faster with _mm_hadd_ps(), but it's SSE3 :(
|
||||
__m128 sum_shuf_l = _mm_shuffle_ps(sum_l, sum_r, _MM_SHUFFLE(1, 0, 1, 0));
|
||||
__m128 sum_shuf_r = _mm_shuffle_ps(sum_l, sum_r, _MM_SHUFFLE(3, 2, 3, 2));
|
||||
__m128 sum = _mm_add_ps(sum_shuf_l, sum_shuf_r);
|
||||
|
||||
#if __SSE3__
|
||||
__m128 res = _mm_hadd_ps(_mm_hadd_ps(sum_l, sum_r), _mm_setzero_ps());
|
||||
_mm_storeu_ps(out_buffer, res); // Overwriting, but this is safe.
|
||||
#else // Meh, compiler should optimize this to something sane.
|
||||
union
|
||||
{
|
||||
float f[4];
|
||||
__m128 v;
|
||||
} u;
|
||||
u.v = sum;
|
||||
} u[2] = {
|
||||
[0] = { .v = sum_l },
|
||||
[1] = { .v = sum_r },
|
||||
};
|
||||
|
||||
out_buffer[0] = u.f[0] + u.f[1];
|
||||
out_buffer[1] = u.f[2] + u.f[3];
|
||||
out_buffer[0] = u[0].f[0] + u[0].f[1] + u[0].f[2] + u[0].f[3];
|
||||
out_buffer[1] = u[1].f[0] + u[1].f[1] + u[1].f[2] + u[1].f[3];
|
||||
#endif
|
||||
}
|
||||
#else // Plain ol' C99
|
||||
static void process_sinc(ssnes_resampler_t *resamp, float *out_buffer)
|
||||
@ -186,38 +183,16 @@ static void process_sinc(ssnes_resampler_t *resamp, float *out_buffer)
|
||||
|
||||
unsigned phase = resamp->time >> PHASES_SHIFT;
|
||||
unsigned delta = (resamp->time >> SUBPHASES_SHIFT) & SUBPHASES_MASK;
|
||||
float delta_f = (float)delta / SUBPHASES;
|
||||
float delta_f = (float)delta;
|
||||
|
||||
const float *phase_table = resamp->phase_table[phase];
|
||||
const float *delta_table = resamp->delta_table[phase];
|
||||
const float *phase_table = resamp->phase_table[phase][PHASE_INDEX];
|
||||
const float *delta_table = resamp->phase_table[phase][DELTA_INDEX];
|
||||
|
||||
float sinc_vals[SIDELOBES];
|
||||
for (unsigned i = 0; i < SIDELOBES; i++)
|
||||
sinc_vals[i] = phase_table[i] + delta_f * delta_table[i];
|
||||
|
||||
// Older data.
|
||||
for (unsigned i = 0; i < SIDELOBES; i++)
|
||||
for (unsigned i = 0; i < TAPS; i++)
|
||||
{
|
||||
sum_l += buffer_l[SIDELOBES - 1 - i] * sinc_vals[i];
|
||||
sum_r += buffer_r[SIDELOBES - 1 - i] * sinc_vals[i];
|
||||
}
|
||||
|
||||
// Newer data.
|
||||
unsigned reverse_phase = PHASES_WRAP - resamp->time;
|
||||
phase = reverse_phase >> PHASES_SHIFT;
|
||||
delta = (reverse_phase >> SUBPHASES_SHIFT) & SUBPHASES_MASK;
|
||||
delta_f = (float)delta / SUBPHASES;
|
||||
|
||||
phase_table = resamp->phase_table[phase];
|
||||
delta_table = resamp->delta_table[phase];
|
||||
|
||||
for (unsigned i = 0; i < SIDELOBES; i++)
|
||||
sinc_vals[i] = phase_table[i] + delta_f * delta_table[i];
|
||||
|
||||
for (unsigned i = 0; i < SIDELOBES; i++)
|
||||
{
|
||||
sum_l += buffer_l[SIDELOBES + i] * sinc_vals[i];
|
||||
sum_r += buffer_r[SIDELOBES + i] * sinc_vals[i];
|
||||
float sinc_val = phase_table[i] + delta_f * delta_table[i];
|
||||
sum_l += buffer_l[i] * sinc_val;
|
||||
sum_r += buffer_r[i] * sinc_val;
|
||||
}
|
||||
|
||||
out_buffer[0] = sum_l;
|
||||
|
Loading…
x
Reference in New Issue
Block a user