Optimize sinc resampler a bit.

This commit is contained in:
Themaister 2012-02-24 21:01:29 +01:00
parent bb824b5679
commit 0496ffc007

View File

@ -19,6 +19,7 @@
// Only suitable as an upsampler, as there is no low-pass filter stage.
#include "resampler.h"
#include "../general.h"
#include <math.h>
#include <stdint.h>
#include <stdlib.h>
@ -29,6 +30,10 @@
#include <xmmintrin.h>
#endif
#if __SSE3__
#include <pmmintrin.h>
#endif
#define PHASE_BITS 8
#define SUBPHASE_BITS 16
@ -41,11 +46,14 @@
#define FRAMES_SHIFT (PHASE_BITS + SUBPHASE_BITS)
#define SIDELOBES 8
#define TAPS (SIDELOBES * 2)
#define PHASE_INDEX 0
#define DELTA_INDEX 1
struct ssnes_resampler
{
float phase_table[PHASES + 1][SIDELOBES];
float delta_table[PHASES + 1][SIDELOBES];
float phase_table[PHASES][2][2 * SIDELOBES];
float buffer_l[2 * SIDELOBES];
float buffer_r[2 * SIDELOBES];
@ -75,19 +83,26 @@ static inline double blackman(double index)
static void init_sinc_table(ssnes_resampler_t *resamp)
{
for (unsigned i = 0; i <= PHASES; i++)
// Sinc phases: [..., p + 3, p + 2, p + 1, p + 0, p - 1, p - 2, p - 3, p - 4, ...]
for (int i = 0; i < PHASES; i++)
{
for (unsigned j = 0; j < SIDELOBES; j++)
for (int j = 0; j < 2 * SIDELOBES; j++)
{
double sinc_phase = M_PI * ((double)i / PHASES + (double)j);
resamp->phase_table[i][j] = sinc(sinc_phase) * blackman(sinc_phase / SIDELOBES);
double p = (double)i / PHASES;
double sinc_phase = M_PI * (p + (SIDELOBES - 1 - j));
resamp->phase_table[i][PHASE_INDEX][j] = sinc(sinc_phase) * blackman(sinc_phase / SIDELOBES);
}
}
// Optimize linear interpolation.
for (unsigned i = 0; i < PHASES; i++)
for (unsigned j = 0; j < SIDELOBES; j++)
resamp->delta_table[i][j] = resamp->phase_table[i + 1][j] - resamp->phase_table[i][j];
for (int i = 0; i < PHASES - 1; i++)
{
for (int j = 0; j < 2 * SIDELOBES; j++)
{
resamp->phase_table[i][DELTA_INDEX][j] =
(resamp->phase_table[i + 1][PHASE_INDEX][j] - resamp->phase_table[i][PHASE_INDEX][j]) / SUBPHASES;
}
}
}
ssnes_resampler_t *resampler_new(void)
@ -99,6 +114,15 @@ ssnes_resampler_t *resampler_new(void)
memset(re, 0, sizeof(*re));
init_sinc_table(re);
#if __SSE3__
SSNES_LOG("Sinc resampler [SSE3]\n");
#elif __SSE__
SSNES_LOG("Sinc resampler [SSE]\n");
#else
SSNES_LOG("Sinc resampler [C]\n");
#endif
return re;
}
@ -113,68 +137,41 @@ static void process_sinc(ssnes_resampler_t *resamp, float *out_buffer)
unsigned phase = resamp->time >> PHASES_SHIFT;
unsigned delta = (resamp->time >> SUBPHASES_SHIFT) & SUBPHASES_MASK;
__m128 delta_f = _mm_set1_ps((float)delta / SUBPHASES);
__m128 delta_f = _mm_set1_ps(delta);
const float *phase_table = resamp->phase_table[phase];
const float *delta_table = resamp->delta_table[phase];
const float *phase_table = resamp->phase_table[phase][PHASE_INDEX];
const float *delta_table = resamp->phase_table[phase][DELTA_INDEX];
__m128 sinc_vals[SIDELOBES / 4];
for (unsigned i = 0; i < SIDELOBES; i += 4)
for (unsigned i = 0; i < TAPS; i += 4)
{
__m128 phases = _mm_load_ps(phase_table + i);
__m128 deltas = _mm_load_ps(delta_table + i);
sinc_vals[i / 4] = _mm_add_ps(phases, _mm_mul_ps(deltas, delta_f));
__m128 buf_l = _mm_load_ps(buffer_l + i);
__m128 buf_r = _mm_load_ps(buffer_r + i);
__m128 phases = _mm_load_ps(phase_table + i);
__m128 deltas = _mm_load_ps(delta_table + i);
__m128 sinc = _mm_add_ps(phases, _mm_mul_ps(deltas, delta_f));
sum_l = _mm_add_ps(sum_l, _mm_mul_ps(buf_l, sinc));
sum_r = _mm_add_ps(sum_r, _mm_mul_ps(buf_r, sinc));
}
// Older data.
for (unsigned i = 0; i < SIDELOBES; i += 4)
{
__m128 buf_l = _mm_loadr_ps(buffer_l + SIDELOBES - 4 - i);
sum_l = _mm_add_ps(sum_l, _mm_mul_ps(buf_l, sinc_vals[i / 4]));
__m128 buf_r = _mm_loadr_ps(buffer_r + SIDELOBES - 4 - i);
sum_r = _mm_add_ps(sum_r, _mm_mul_ps(buf_r, sinc_vals[i / 4]));
}
// Newer data.
unsigned reverse_phase = PHASES_WRAP - resamp->time;
phase = reverse_phase >> PHASES_SHIFT;
delta = (reverse_phase >> SUBPHASES_SHIFT) & SUBPHASES_MASK;
delta_f = _mm_set1_ps((float)delta / SUBPHASES);
phase_table = resamp->phase_table[phase];
delta_table = resamp->delta_table[phase];
for (unsigned i = 0; i < SIDELOBES; i += 4)
{
__m128 phases = _mm_load_ps(phase_table + i);
__m128 deltas = _mm_load_ps(delta_table + i);
sinc_vals[i / 4] = _mm_add_ps(phases, _mm_mul_ps(deltas, delta_f));
}
for (unsigned i = 0; i < SIDELOBES; i += 4)
{
__m128 buf_l = _mm_load_ps(buffer_l + SIDELOBES + i);
sum_l = _mm_add_ps(sum_l, _mm_mul_ps(buf_l, sinc_vals[i / 4]));
__m128 buf_r = _mm_load_ps(buffer_r + SIDELOBES + i);
sum_r = _mm_add_ps(sum_r, _mm_mul_ps(buf_r, sinc_vals[i / 4]));
}
// This can be done faster with _mm_hadd_ps(), but it's SSE3 :(
__m128 sum_shuf_l = _mm_shuffle_ps(sum_l, sum_r, _MM_SHUFFLE(1, 0, 1, 0));
__m128 sum_shuf_r = _mm_shuffle_ps(sum_l, sum_r, _MM_SHUFFLE(3, 2, 3, 2));
__m128 sum = _mm_add_ps(sum_shuf_l, sum_shuf_r);
#if __SSE3__
__m128 res = _mm_hadd_ps(_mm_hadd_ps(sum_l, sum_r), _mm_setzero_ps());
_mm_storeu_ps(out_buffer, res); // Overwriting, but this is safe.
#else // Meh, compiler should optimize this to something sane.
union
{
float f[4];
__m128 v;
} u;
u.v = sum;
} u[2] = {
[0] = { .v = sum_l },
[1] = { .v = sum_r },
};
out_buffer[0] = u.f[0] + u.f[1];
out_buffer[1] = u.f[2] + u.f[3];
out_buffer[0] = u[0].f[0] + u[0].f[1] + u[0].f[2] + u[0].f[3];
out_buffer[1] = u[1].f[0] + u[1].f[1] + u[1].f[2] + u[1].f[3];
#endif
}
#else // Plain ol' C99
static void process_sinc(ssnes_resampler_t *resamp, float *out_buffer)
@ -186,38 +183,16 @@ static void process_sinc(ssnes_resampler_t *resamp, float *out_buffer)
unsigned phase = resamp->time >> PHASES_SHIFT;
unsigned delta = (resamp->time >> SUBPHASES_SHIFT) & SUBPHASES_MASK;
float delta_f = (float)delta / SUBPHASES;
float delta_f = (float)delta;
const float *phase_table = resamp->phase_table[phase];
const float *delta_table = resamp->delta_table[phase];
const float *phase_table = resamp->phase_table[phase][PHASE_INDEX];
const float *delta_table = resamp->phase_table[phase][DELTA_INDEX];
float sinc_vals[SIDELOBES];
for (unsigned i = 0; i < SIDELOBES; i++)
sinc_vals[i] = phase_table[i] + delta_f * delta_table[i];
// Older data.
for (unsigned i = 0; i < SIDELOBES; i++)
for (unsigned i = 0; i < TAPS; i++)
{
sum_l += buffer_l[SIDELOBES - 1 - i] * sinc_vals[i];
sum_r += buffer_r[SIDELOBES - 1 - i] * sinc_vals[i];
}
// Newer data.
unsigned reverse_phase = PHASES_WRAP - resamp->time;
phase = reverse_phase >> PHASES_SHIFT;
delta = (reverse_phase >> SUBPHASES_SHIFT) & SUBPHASES_MASK;
delta_f = (float)delta / SUBPHASES;
phase_table = resamp->phase_table[phase];
delta_table = resamp->delta_table[phase];
for (unsigned i = 0; i < SIDELOBES; i++)
sinc_vals[i] = phase_table[i] + delta_f * delta_table[i];
for (unsigned i = 0; i < SIDELOBES; i++)
{
sum_l += buffer_l[SIDELOBES + i] * sinc_vals[i];
sum_r += buffer_r[SIDELOBES + i] * sinc_vals[i];
float sinc_val = phase_table[i] + delta_f * delta_table[i];
sum_l += buffer_l[i] * sinc_val;
sum_r += buffer_r[i] * sinc_val;
}
out_buffer[0] = sum_l;