mirror of
https://github.com/CTCaer/RetroArch.git
synced 2024-12-14 22:38:34 +00:00
Optimize sinc for AVX as well.
Disable it for now however, as it's slightly slower than SSE1 for the few taps we're using. From testing, it's 10-20% faster when number of taps are increased. The AVX path might need some more tuning, but it's fair to assume the algorithm is memory bound.
This commit is contained in:
parent
71191c1440
commit
bebe0d78a7
65
audio/sinc.c
65
audio/sinc.c
@ -28,12 +28,22 @@
|
||||
#define RARCH_LOG(...)
|
||||
#endif
|
||||
|
||||
#if __SSE__
|
||||
#ifdef __SSE__
|
||||
#include <xmmintrin.h>
|
||||
#endif
|
||||
|
||||
// For the little amount of taps we're using,
|
||||
// SSE1 is faster than AVX for some reason.
|
||||
// AVX code is kept here though as by increasing number
|
||||
// of sinc taps, the AVX code is clearly faster than SSE1.
|
||||
#define ENABLE_AVX 0
|
||||
|
||||
#if defined(__AVX__) && ENABLE_AVX
|
||||
#include <immintrin.h>
|
||||
#endif
|
||||
|
||||
#define PHASE_BITS 8
|
||||
#define SUBPHASE_BITS 15
|
||||
#define SUBPHASE_BITS 16
|
||||
|
||||
#define PHASES (1 << PHASE_BITS)
|
||||
#define PHASES_SHIFT (SUBPHASE_BITS)
|
||||
@ -133,7 +143,7 @@ static void aligned_free__(void *ptr)
|
||||
|
||||
rarch_resampler_t *resampler_new(void)
|
||||
{
|
||||
rarch_resampler_t *re = (rarch_resampler_t*)aligned_alloc__(16, sizeof(*re));
|
||||
rarch_resampler_t *re = (rarch_resampler_t*)aligned_alloc__(1024, sizeof(*re));
|
||||
if (!re)
|
||||
return NULL;
|
||||
|
||||
@ -141,7 +151,9 @@ rarch_resampler_t *resampler_new(void)
|
||||
|
||||
init_sinc_table(re);
|
||||
|
||||
#ifdef __SSE__
|
||||
#if defined(__AVX__) && ENABLE_AVX
|
||||
RARCH_LOG("Sinc resampler [AVX]\n");
|
||||
#elif defined(__SSE__)
|
||||
RARCH_LOG("Sinc resampler [SSE]\n");
|
||||
#else
|
||||
RARCH_LOG("Sinc resampler [C]\n");
|
||||
@ -150,7 +162,50 @@ rarch_resampler_t *resampler_new(void)
|
||||
return re;
|
||||
}
|
||||
|
||||
#ifdef __SSE__
|
||||
#if defined(__AVX__) && ENABLE_AVX
|
||||
static void process_sinc(rarch_resampler_t *resamp, float *out_buffer)
|
||||
{
|
||||
__m256 sum_l = _mm256_setzero_ps();
|
||||
__m256 sum_r = _mm256_setzero_ps();
|
||||
|
||||
const float *buffer_l = resamp->buffer_l + resamp->ptr;
|
||||
const float *buffer_r = resamp->buffer_r + resamp->ptr;
|
||||
|
||||
unsigned phase = resamp->time >> PHASES_SHIFT;
|
||||
unsigned delta = (resamp->time >> SUBPHASES_SHIFT) & SUBPHASES_MASK;
|
||||
__m256 delta_f = _mm256_set1_ps(delta);
|
||||
|
||||
const float *phase_table = resamp->phase_table[phase][PHASE_INDEX];
|
||||
const float *delta_table = resamp->phase_table[phase][DELTA_INDEX];
|
||||
|
||||
for (unsigned i = 0; i < TAPS; i += 8)
|
||||
{
|
||||
__m256 buf_l = _mm256_loadu_ps(buffer_l + i);
|
||||
__m256 buf_r = _mm256_loadu_ps(buffer_r + i);
|
||||
|
||||
__m256 phases = _mm256_load_ps(phase_table + i);
|
||||
__m256 deltas = _mm256_load_ps(delta_table + i);
|
||||
|
||||
__m256 sinc = _mm256_add_ps(phases, _mm256_mul_ps(deltas, delta_f));
|
||||
|
||||
sum_l = _mm256_add_ps(sum_l, _mm256_mul_ps(buf_l, sinc));
|
||||
sum_r = _mm256_add_ps(sum_r, _mm256_mul_ps(buf_r, sinc));
|
||||
}
|
||||
|
||||
// hadd on AVX is weird, and acts on low-lanes and high-lanes separately.
|
||||
__m256 res_l = _mm256_hadd_ps(sum_l, sum_l);
|
||||
__m256 res_r = _mm256_hadd_ps(sum_r, sum_r);
|
||||
res_l = _mm256_hadd_ps(res_l, res_l);
|
||||
res_r = _mm256_hadd_ps(res_r, res_r);
|
||||
res_l = _mm256_add_ps(_mm256_permute2f128_ps(res_l, res_l, 1), res_l);
|
||||
res_r = _mm256_add_ps(_mm256_permute2f128_ps(res_r, res_r, 1), res_r);
|
||||
|
||||
// This is optimized to mov %xmmN, [mem].
|
||||
// There doesn't seem to be any _mm256_store_ss intrinsic.
|
||||
_mm_store_ss(out_buffer + 0, _mm256_extractf128_ps(res_l, 0));
|
||||
_mm_store_ss(out_buffer + 1, _mm256_extractf128_ps(res_r, 0));
|
||||
}
|
||||
#elif defined(__SSE__)
|
||||
static void process_sinc(rarch_resampler_t *resamp, float *out_buffer)
|
||||
{
|
||||
__m128 sum_l = _mm_setzero_ps();
|
||||
|
Loading…
Reference in New Issue
Block a user