From d1a785823d1872867b8383b73f7cd514e8bffd23 Mon Sep 17 00:00:00 2001
From: aliaspider
Date: Wed, 17 Sep 2014 19:06:53 +0100
Subject: [PATCH] add a faster (approximated) version of the CC resampler using SSE intrinsics

---
 audio/resamplers/cc_resampler.c | 283 ++++++++++++++++++++++++++++++--
 1 file changed, 274 insertions(+), 9 deletions(-)

diff --git a/audio/resamplers/cc_resampler.c b/audio/resamplers/cc_resampler.c
index 35a0488f1f..6910ec1a38 100644
--- a/audio/resamplers/cc_resampler.c
+++ b/audio/resamplers/cc_resampler.c
@@ -28,13 +28,8 @@
 #define RARCH_LOG(...) fprintf(stderr, __VA_ARGS__)
 #endif
 
-typedef struct audio_frame_int16
-{
-   int16_t l;
-   int16_t r;
-} audio_frame_int16_t;
 
-#ifdef _MIPS_ARCH_ALLEGREX1
+#ifdef _MIPS_ARCH_ALLEGREX
 static void resampler_CC_process(void *re_, struct resampler_data *data)
 {
    (void)re_;
@@ -121,7 +116,7 @@ static void resampler_CC_process(void *re_, struct resampler_data *data)
       outp++;
    }
 
-   /* The VFPU state is assumed to remain intact 
+   /* The VFPU state is assumed to remain intact
     * in-between calls to resampler_CC_process. */
 
 done:
@@ -151,6 +146,276 @@ static void *resampler_CC_init(double bandwidth_mod)
    RARCH_LOG("\nConvoluted Cosine resampler (VFPU): \n");
    return (void*)-1;
 }
+#elif defined(__SSE__)
+
+/* uses a fast polynomial approximation
+ * since SSE lacks native support for trigonometric functions
+ * cc_int is approximated with P(X) = X - (3/4)*X^3 + (1/4)*X^5
+ */
+
+
+#include <xmmintrin.h>
+
+#ifndef CC_RESAMPLER_PRECISION
+#define CC_RESAMPLER_PRECISION 1
+#endif
+
+/* Resampler state carried across calls to resampler_CC_process().
+ * previous/current each hold two frames of two floats (l, r, l, r) and
+ * are only accessed through unaligned loads/stores.
+ * distance is the fractional position between input and output frames;
+ * process is the routine selected by resampler_CC_init(). */
+typedef struct rarch_CC_resampler
+{
+   __m128 previous;
+   __m128 current;
+
+   float distance;
+   void (*process)(void *re, struct resampler_data *data);
+} rarch_CC_resampler_t;
+
+
+/* Downsampling path (selected when bandwidth_mod < 0.75).
+ * Each input frame is accumulated into the four pending output frames
+ * kept in previous/current, weighted by the difference of two clamped
+ * kernel evaluations half a sample apart. Once distance exceeds
+ * (1 / data->ratio) + 0.5 the oldest pending frame is written out.
+ * Sets data->output_frames to the number of frames written. */
+static void resampler_CC_downsample(void *re_, struct resampler_data *data)
+{
+   float ratio, b;
+   rarch_CC_resampler_t *re = (rarch_CC_resampler_t*)re_;
+
+   audio_frame_float_t *inp = (audio_frame_float_t*)data->data_in;
+   audio_frame_float_t *inp_max = (audio_frame_float_t*)(inp + data->input_frames);
+   audio_frame_float_t *outp = (audio_frame_float_t*)data->data_out;
+
+   ratio = 1.0 / data->ratio;
+   b = data->ratio; /* cutoff frequency. */
+
+   __m128 vec_previous = _mm_loadu_ps((float*)&re->previous);
+   __m128 vec_current = _mm_loadu_ps((float*)&re->current);
+
+   while (inp != inp_max)
+   {
+      __m128 vec_ratio =
+         _mm_mul_ps(_mm_set_ps1(ratio), _mm_set_ps(3.0, 2.0, 1.0, 0.0));
+      __m128 vec_w = _mm_sub_ps(_mm_set_ps1(re->distance), vec_ratio);
+
+      __m128 vec_w1 = _mm_add_ps(vec_w , _mm_set_ps1(0.5));
+      __m128 vec_w2 = _mm_sub_ps(vec_w , _mm_set_ps1(0.5));
+
+      __m128 vec_b = _mm_set_ps1(b);
+      vec_w1 = _mm_mul_ps(vec_w1, vec_b);
+      vec_w2 = _mm_mul_ps(vec_w2, vec_b);
+
+#if (CC_RESAMPLER_PRECISION > 0)
+      __m128 vec_ww1 = _mm_mul_ps(vec_w1, vec_w1);
+      __m128 vec_ww2 = _mm_mul_ps(vec_w2, vec_w2);
+
+
+      vec_ww1 = _mm_mul_ps(vec_ww1, _mm_sub_ps(_mm_set_ps1(3.0),vec_ww1));
+      vec_ww2 = _mm_mul_ps(vec_ww2, _mm_sub_ps(_mm_set_ps1(3.0),vec_ww2));
+
+      vec_ww1 = _mm_mul_ps(_mm_set_ps1(1.0/4.0), vec_ww1);
+      vec_ww2 = _mm_mul_ps(_mm_set_ps1(1.0/4.0), vec_ww2);
+
+      vec_w1 = _mm_mul_ps(vec_w1, _mm_sub_ps(_mm_set_ps1(1.0), vec_ww1));
+      vec_w2 = _mm_mul_ps(vec_w2, _mm_sub_ps(_mm_set_ps1(1.0), vec_ww2));
+#endif
+
+      vec_w1 = _mm_min_ps(vec_w1, _mm_set_ps1( 0.5));
+      vec_w2 = _mm_min_ps(vec_w2, _mm_set_ps1( 0.5));
+      vec_w1 = _mm_max_ps(vec_w1, _mm_set_ps1(-0.5));
+      vec_w2 = _mm_max_ps(vec_w2, _mm_set_ps1(-0.5));
+
+      vec_w = _mm_sub_ps(vec_w1, vec_w2);
+
+      __m128 vec_w_previous =
+         _mm_shuffle_ps(vec_w,vec_w,_MM_SHUFFLE(1, 1, 0, 0));
+      __m128 vec_w_current =
+         _mm_shuffle_ps(vec_w,vec_w,_MM_SHUFFLE(3, 3, 2, 2));
+
+      /* load one frame into the low half and duplicate it: (l, r, l, r). */
+      __m128 vec_in = _mm_loadl_pi(_mm_setzero_ps(),(__m64*)inp);
+      vec_in = _mm_shuffle_ps(vec_in,vec_in,_MM_SHUFFLE(1, 0, 1, 0));
+
+      vec_previous =
+         _mm_add_ps(vec_previous, _mm_mul_ps(vec_in, vec_w_previous));
+      vec_current =
+         _mm_add_ps(vec_current, _mm_mul_ps(vec_in, vec_w_current));
+
+      re->distance++;
+      inp++;
+
+      if (re->distance > (ratio + 0.5))
+      {
+         /* emit the oldest pending frame, then shift the window by one. */
+         _mm_storel_pi((__m64*)outp, vec_previous);
+         vec_previous =
+            _mm_shuffle_ps(vec_previous,vec_current,_MM_SHUFFLE(1, 0, 3, 2));
+         vec_current =
+            _mm_shuffle_ps(vec_current,_mm_setzero_ps(),_MM_SHUFFLE(1, 0, 3, 2));
+
+         re->distance -= ratio;
+         outp++;
+      }
+   }
+
+   _mm_storeu_ps((float*)&re->previous, vec_previous);
+   _mm_storeu_ps((float*)&re->current, vec_current);
+
+   data->output_frames = outp - (audio_frame_float_t*)data->data_out;
+}
+
+#ifndef min
+#define min(a, b) ((a) < (b) ? (a) : (b))
+#endif
+
+/* Upsampling path (selected when bandwidth_mod >= 0.75).
+ * previous/current form a sliding window over the last four input frames.
+ * For every input frame, output frames are produced while distance < 1.0,
+ * each one a weighted sum of the four windowed frames; distance grows by
+ * 1 / data->ratio per output frame and drops by 1.0 per input frame.
+ * Sets data->output_frames to the number of frames written. */
+static void resampler_CC_upsample(void *re_, struct resampler_data *data)
+{
+   float b, ratio;
+   rarch_CC_resampler_t *re = (rarch_CC_resampler_t*)re_;
+
+   audio_frame_float_t *inp = (audio_frame_float_t*)data->data_in;
+   audio_frame_float_t *inp_max = (audio_frame_float_t*)(inp + data->input_frames);
+   audio_frame_float_t *outp = (audio_frame_float_t*)data->data_out;
+
+   b = min(data->ratio, 1.00); /* cutoff frequency. */
+   ratio = 1.0 / data->ratio;
+
+   __m128 vec_previous = _mm_loadu_ps((float*)&re->previous);
+   __m128 vec_current = _mm_loadu_ps((float*)&re->current);
+
+
+
+   while (inp != inp_max)
+   {
+      /* slide the window: drop the oldest frame, append the new one. */
+      __m128 vec_in = _mm_loadl_pi(_mm_setzero_ps(),(__m64*)inp);
+      vec_previous =
+         _mm_shuffle_ps(vec_previous,vec_current,_MM_SHUFFLE(1, 0, 3, 2));
+      vec_current =
+         _mm_shuffle_ps(vec_current,vec_in,_MM_SHUFFLE(1, 0, 3, 2));
+
+      while (re->distance < 1.0)
+      {
+         __m128 vec_w =
+            _mm_add_ps(_mm_set_ps1(re->distance), _mm_set_ps(-2.0, -1.0, 0.0, 1.0));
+
+         __m128 vec_w1 = _mm_add_ps(vec_w , _mm_set_ps1(0.5));
+         __m128 vec_w2 = _mm_sub_ps(vec_w , _mm_set_ps1(0.5));
+
+         __m128 vec_b = _mm_set_ps1(b);
+         vec_w1 = _mm_mul_ps(vec_w1, vec_b);
+         vec_w2 = _mm_mul_ps(vec_w2, vec_b);
+
+#if (CC_RESAMPLER_PRECISION > 0)
+         __m128 vec_ww1 = _mm_mul_ps(vec_w1, vec_w1);
+         __m128 vec_ww2 = _mm_mul_ps(vec_w2, vec_w2);
+
+
+         vec_ww1 = _mm_mul_ps(vec_ww1,_mm_sub_ps(_mm_set_ps1(3.0),vec_ww1));
+         vec_ww2 = _mm_mul_ps(vec_ww2,_mm_sub_ps(_mm_set_ps1(3.0),vec_ww2));
+
+         vec_ww1 = _mm_mul_ps(_mm_set_ps1(1.0 / 4.0), vec_ww1);
+         vec_ww2 = _mm_mul_ps(_mm_set_ps1(1.0 / 4.0), vec_ww2);
+
+         vec_w1 = _mm_mul_ps(vec_w1, _mm_sub_ps(_mm_set_ps1(1.0), vec_ww1));
+         vec_w2 = _mm_mul_ps(vec_w2, _mm_sub_ps(_mm_set_ps1(1.0), vec_ww2));
+#endif
+
+         vec_w1 = _mm_min_ps(vec_w1, _mm_set_ps1( 0.5));
+         vec_w2 = _mm_min_ps(vec_w2, _mm_set_ps1( 0.5));
+         vec_w1 = _mm_max_ps(vec_w1, _mm_set_ps1(-0.5));
+         vec_w2 = _mm_max_ps(vec_w2, _mm_set_ps1(-0.5));
+
+         vec_w = _mm_sub_ps(vec_w1, vec_w2);
+
+         __m128 vec_w_previous =
+            _mm_shuffle_ps(vec_w,vec_w,_MM_SHUFFLE(1, 1, 0, 0));
+         __m128 vec_w_current =
+            _mm_shuffle_ps(vec_w,vec_w,_MM_SHUFFLE(3, 3, 2, 2));
+
+         __m128 vec_out = _mm_mul_ps(vec_previous, vec_w_previous);
+         vec_out = _mm_add_ps(vec_out, _mm_mul_ps(vec_current, vec_w_current));
+         /* fold the upper frame onto the lower one to finish the sum. */
+         vec_out =
+            _mm_add_ps(vec_out, _mm_shuffle_ps(vec_out,vec_out,_MM_SHUFFLE(3, 2, 3, 2)));
+
+         _mm_storel_pi((__m64*)outp,vec_out);
+
+         re->distance += ratio;
+         outp++;
+      }
+
+      re->distance -= 1.0;
+      inp++;
+   }
+
+   _mm_storeu_ps((float*)&re->previous, vec_previous);
+   _mm_storeu_ps((float*)&re->current, vec_current);
+
+   data->output_frames = outp - (audio_frame_float_t*)data->data_out;
+}
+
+
+/* Dispatches to the routine chosen by resampler_CC_init(). */
+static void resampler_CC_process(void *re_, struct resampler_data *data)
+{
+   rarch_CC_resampler_t *re = (rarch_CC_resampler_t*)re_;
+   re->process(re_, data);
+}
+
+/* Releases the state allocated by resampler_CC_init(). */
+static void resampler_CC_free(void *re_)
+{
+   rarch_CC_resampler_t *re = (rarch_CC_resampler_t*)re_;
+   if (re)
+      free(re);
+}
+
+/* Allocates a zeroed resampler state and picks the processing routine
+ * from bandwidth_mod. Returns NULL if the allocation fails. */
+static void *resampler_CC_init(double bandwidth_mod)
+{
+   rarch_CC_resampler_t *re = (rarch_CC_resampler_t*)
+      calloc(1, sizeof(rarch_CC_resampler_t));
+   if (!re)
+      return NULL;
+
+   /* calloc() is not guaranteed to return 16-byte aligned memory, so the
+    * __m128 members are cleared with unaligned stores, as the processing
+    * routines do; a plain assignment may compile to an aligned store
+    * and fault on a misaligned address. */
+   _mm_storeu_ps((float*)&re->previous, _mm_setzero_ps());
+   _mm_storeu_ps((float*)&re->current, _mm_setzero_ps());
+
+   RARCH_LOG("Convoluted Cosine resampler (SSE) : ");
+
+   /* variations of data->ratio around 0.75 are safer
+    * than around 1.0 for both up/downsampler. */
+   if (bandwidth_mod < 0.75)
+   {
+      RARCH_LOG("CC_downsample @%f \n", bandwidth_mod);
+      re->process = resampler_CC_downsample;
+      re->distance = 0.0;
+   }
+   else
+   {
+      RARCH_LOG("CC_upsample @%f \n", bandwidth_mod);
+      re->process = resampler_CC_upsample;
+      re->distance = 2.0;
+   }
+
+   return re;
+}
 #else
 /* C reference version. Not optimized. */
 
@@ -295,9 +560,9 @@ static void *resampler_CC_init(double bandwidth_mod)
 
    RARCH_LOG("Convoluted Cosine resampler (C) : ");
 
-   /* variations of data->ratio around 0.75 are safer 
+   /* variations of data->ratio around 0.75 are safer
     * than around 1.0 for both up/downsampler. */
-   if (bandwidth_mod < 0.75) 
+   if (bandwidth_mod < 0.75)
    {
       RARCH_LOG("CC_downsample @%f \n", bandwidth_mod);
       re->process = resampler_CC_downsample;