diff --git a/libgambatte/libretro/blipper.c b/libgambatte/libretro/blipper.c index 8ba4586..0a136e9 100644 --- a/libgambatte/libretro/blipper.c +++ b/libgambatte/libretro/blipper.c @@ -31,12 +31,6 @@ #include #include -#if BLIPPER_SIMD -#ifdef __SSE2__ -#include -#endif -#endif - #if BLIPPER_LOG_PERFORMANCE #include static double get_time(void) @@ -49,7 +43,7 @@ static double get_time(void) struct blipper { - blipper_fixed_t *output_buffer; + blipper_long_sample_t *output_buffer; unsigned output_avail; unsigned output_buffer_samples; @@ -60,7 +54,7 @@ struct blipper unsigned phases_log2; unsigned taps; - blipper_fixed_t integrator; + blipper_long_sample_t integrator; blipper_sample_t last_sample; #if BLIPPER_LOG_PERFORMANCE @@ -68,6 +62,8 @@ struct blipper double integrator_time; unsigned long total_samples; #endif + + int owns_filter; }; void blipper_free(blipper_t *blip) @@ -78,7 +74,8 @@ void blipper_free(blipper_t *blip) fprintf(stderr, "[blipper]: Processed %lu samples, using %.6f seconds blipping and %.6f seconds integrating.\n", blip->total_samples, blip->total_time, blip->integrator_time); #endif - free(blip->filter_bank); + if (blip->owns_filter) + free(blip->filter_bank); free(blip->output_buffer); free(blip); } @@ -128,14 +125,14 @@ static double kaiser_window(double index, double beta) #define M_PI 3.14159265358979323846 #endif -static float *blipper_create_sinc(unsigned phases, unsigned taps, +static blipper_real_t *blipper_create_sinc(unsigned phases, unsigned taps, double cutoff, double beta) { unsigned i, filter_len; double sidelobes, window_mod, window_phase, sinc_phase; - float *filter; + blipper_real_t *filter; - filter = malloc(phases * taps * sizeof(*filter)); + filter = (blipper_real_t*)malloc(phases * taps * sizeof(*filter)); if (!filter) return NULL; @@ -166,17 +163,17 @@ static float *blipper_create_sinc(unsigned phases, unsigned taps, * This filtering creates a finite length filter, albeit slightly longer. * * phases is the same as decimation rate. */ -static float *blipper_prefilter_sinc(float *filter, unsigned phases, - unsigned *out_taps) +static blipper_real_t *blipper_prefilter_sinc(blipper_real_t *filter, unsigned phases, + unsigned taps) { unsigned i; - unsigned taps = *out_taps; - float *tmp_filter; - float *new_filter = malloc((phases * taps + phases) * sizeof(*filter)); + float filter_amp = 0.75f / phases; + blipper_real_t *tmp_filter; + blipper_real_t *new_filter = (blipper_real_t*)malloc((phases * taps + phases) * sizeof(*filter)); if (!new_filter) goto error; - tmp_filter = realloc(filter, (phases * taps + phases) * sizeof(*filter)); + tmp_filter = (blipper_real_t*)realloc(filter, (phases * taps + phases) * sizeof(*filter)); if (!tmp_filter) goto error; filter = tmp_filter; @@ -188,12 +185,19 @@ static float *blipper_prefilter_sinc(float *filter, unsigned phases, for (i = phases * taps; i < phases * taps + phases; i++) new_filter[i] = new_filter[phases * taps - 1]; + taps++; + /* Differentiate with offset of D. */ memcpy(filter, new_filter, phases * sizeof(*filter)); - for (i = phases; i < phases * taps + phases; i++) + for (i = phases; i < phases * taps; i++) filter[i] = new_filter[i] - new_filter[i - phases]; - *out_taps = taps + 1; + /* blipper_prefilter_sinc() boosts the gain of the sinc. + * Have to compensate for this. Attenuate a bit more to ensure + * we don't clip, especially in fixed point. */ + for (i = 0; i < phases * taps; i++) + filter[i] *= filter_amp; + free(new_filter); return filter; @@ -206,11 +210,11 @@ error: /* Creates a polyphase filter bank. * Interleaves the filter for cache coherency and possibilities * for SIMD processing. */ -static float *blipper_interleave_sinc(float *filter, unsigned phases, +static blipper_real_t *blipper_interleave_sinc(blipper_real_t *filter, unsigned phases, unsigned taps) { unsigned t, p; - float *new_filter = malloc(phases * taps * sizeof(*filter)); + blipper_real_t *new_filter = (blipper_real_t*)malloc(phases * taps * sizeof(*filter)); if (!new_filter) goto error; @@ -227,15 +231,16 @@ error: return NULL; } -static blipper_sample_t *blipper_quantize_sinc(float *filter, unsigned taps, float amp) +#if BLIPPER_FIXED_POINT +static blipper_sample_t *blipper_quantize_sinc(blipper_real_t *filter, unsigned taps) { unsigned t; - blipper_sample_t *filt = malloc(taps * sizeof(*filt)); + blipper_sample_t *filt = (blipper_sample_t*)malloc(taps * sizeof(*filt)); if (!filt) goto error; for (t = 0; t < taps; t++) - filt[t] = (blipper_sample_t)floor(filter[t] * 0x7fff * amp + 0.5); + filt[t] = (blipper_sample_t)floor(filter[t] * 0x7fff + 0.5); free(filter); return filt; @@ -245,33 +250,38 @@ error: free(filt); return NULL; } +#endif -static int blipper_create_filter_bank(blipper_t *blip, unsigned taps, +blipper_sample_t *blipper_create_filter_bank(unsigned phases, unsigned taps, double cutoff, double beta) { - float *sinc_filter; + blipper_real_t *sinc_filter; + /* blipper_prefilter_sinc() will add one tap. + * To keep number of taps as expected, compensate for it here + * to keep the interface more obvious. */ if (taps <= 1) return 0; taps--; - sinc_filter = blipper_create_sinc(blip->phases, taps, cutoff, beta); + sinc_filter = blipper_create_sinc(phases, taps, cutoff, beta); if (!sinc_filter) return 0; - sinc_filter = blipper_prefilter_sinc(sinc_filter, blip->phases, &taps); + sinc_filter = blipper_prefilter_sinc(sinc_filter, phases, taps); if (!sinc_filter) return 0; - sinc_filter = blipper_interleave_sinc(sinc_filter, blip->phases, taps); + taps++; + + sinc_filter = blipper_interleave_sinc(sinc_filter, phases, taps); if (!sinc_filter) return 0; - blip->filter_bank = blipper_quantize_sinc(sinc_filter, blip->phases * taps, 0.85f / blip->phases); - if (!blip->filter_bank) - return 0; - - blip->taps = taps; - return 1; +#if BLIPPER_FIXED_POINT + return blipper_quantize_sinc(sinc_filter, phases * taps); +#else + return sinc_filter; +#endif } static unsigned log2_int(unsigned v) @@ -282,8 +292,19 @@ static unsigned log2_int(unsigned v) return ret; } +void blipper_reset(blipper_t *blip) +{ + blip->phase = 0; + memset(blip->output_buffer, 0, + (blip->output_avail + blip->taps) * sizeof(*blip->output_buffer)); + blip->output_avail = 0; + blip->last_sample = 0; + blip->integrator = 0; +} + blipper_t *blipper_new(unsigned taps, double cutoff, double beta, - unsigned decimation, unsigned buffer_samples) + unsigned decimation, unsigned buffer_samples, + const blipper_sample_t *filter_bank) { blipper_t *blip = NULL; @@ -300,17 +321,26 @@ blipper_t *blipper_new(unsigned taps, double cutoff, double beta, return NULL; } - blip = calloc(1, sizeof(*blip)); + blip = (blipper_t*)calloc(1, sizeof(*blip)); if (!blip) return NULL; blip->phases = decimation; blip->phases_log2 = log2_int(decimation); - if (!blipper_create_filter_bank(blip, taps, cutoff, beta)) - goto error; + blip->taps = taps; - blip->output_buffer = calloc(buffer_samples + blip->taps, + if (!filter_bank) + { + blip->filter_bank = blipper_create_filter_bank(blip->phases, taps, cutoff, beta); + if (!blip->filter_bank) + goto error; + blip->owns_filter = 1; + } + else + blip->filter_bank = (blipper_sample_t*)filter_bank; + + blip->output_buffer = (blipper_long_sample_t*)calloc(buffer_samples + blip->taps, sizeof(*blip->output_buffer)); if (!blip->output_buffer) goto error; @@ -323,11 +353,11 @@ error: return NULL; } -void blipper_push_delta(blipper_t *blip, blipper_fixed_t delta, unsigned clocks_step) +void blipper_push_delta(blipper_t *blip, blipper_long_sample_t delta, unsigned clocks_step) { unsigned target_output, filter_phase, taps, i; const blipper_sample_t *response; - blipper_fixed_t *target; + blipper_long_sample_t *target; blip->phase += clocks_step; @@ -339,31 +369,8 @@ void blipper_push_delta(blipper_t *blip, blipper_fixed_t delta, unsigned clocks_ target = blip->output_buffer + target_output; taps = blip->taps; - /* Decent SIMD target */ - /* This is extremely unlikely to ever saturate, so don't bother. - * The sinc is attenuated a bit, and positive deltas generally have to be alternating. */ - -#if BLIPPER_SIMD && defined(__SSE2__) - { - __m128i t, t0, t1, res0, res1; - __m128i d = _mm_set1_epi16(delta); - for (i = 0; i + 8 <= taps; i += 8) - { - t = _mm_loadu_si128((__m128i*)(response + i)); - t0 = _mm_unpacklo_epi16(t, _mm_setzero_si128()); - t1 = _mm_unpackhi_epi16(t, _mm_setzero_si128()); - res0 = _mm_add_epi32(_mm_madd_epi16(t0, d), _mm_loadu_si128((__m128i*)(target + i + 0))); - res1 = _mm_add_epi32(_mm_madd_epi16(t1, d), _mm_loadu_si128((__m128i*)(target + i + 4))); - _mm_storeu_si128((__m128i*)(target + i + 0), res0); - _mm_storeu_si128((__m128i*)(target + i + 4), res1); - } - for (; i < taps; i++) - target[i] += delta * response[i]; - } -#else for (i = 0; i < taps; i++) target[i] += delta * response[i]; -#endif blip->output_avail = target_output; } @@ -384,7 +391,7 @@ void blipper_push_samples(blipper_t *blip, const blipper_sample_t *data, blipper_sample_t val = *data; if (val != last) { - blipper_push_delta(blip, (blipper_fixed_t)val - (blipper_fixed_t)last, clocks_skip + 1); + blipper_push_delta(blip, (blipper_long_sample_t)val - (blipper_long_sample_t)last, clocks_skip + 1); clocks_skip = 0; last = val; } @@ -411,38 +418,44 @@ void blipper_read(blipper_t *blip, blipper_sample_t *output, unsigned samples, unsigned stride) { unsigned s; - blipper_sample_t quant; - blipper_fixed_t sum = blip->integrator; - const blipper_fixed_t *out = blip->output_buffer; + blipper_long_sample_t sum = blip->integrator; + const blipper_long_sample_t *out = blip->output_buffer; #if BLIPPER_LOG_PERFORMANCE double t0 = get_time(); #endif +#if BLIPPER_FIXED_POINT for (s = 0; s < samples; s++, output += stride) { - /* Cannot overflow */ - sum += out[s] >> 1; + blipper_long_sample_t quant; - /* Clip sum. Really shoudn't happen though. */ - if (sum > 0x3fff0000l) - { -#if BLIPPER_LOG_CLIPPING - fprintf(stderr, "Positive clipping: 0x%lx -> 0x3fff0000.\n", (unsigned long)sum); -#endif - sum = 0x3fff0000l; - } - else if (sum < -0x3fff0000l) - { -#if BLIPPER_LOG_CLIPPING - fprintf(stderr, "Negative clipping: -0x%lx -> -0x3fff0000.\n", (unsigned long)-sum); -#endif - sum = -0x3fff0000l; - } + /* Cannot overflow. Also add a leaky integrator. + Mitigates DC shift numerical instability which is + inherent for integrators. */ + sum += (out[s] >> 1) - (sum >> 9); + /* Rounded. With leaky integrator, this cannot overflow. */ quant = (sum + 0x4000) >> 15; + + /* Clamp. quant can potentially have range [-0x10000, 0xffff] here. + * In both cases, top 16-bits will have a uniform bit pattern which can be exploited. */ + if ((blipper_sample_t)quant != quant) + { + quant = (quant >> 16) ^ 0x7fff; + sum = quant << 15; + } + *output = quant; } +#else + for (s = 0; s < samples; s++, output += stride) + { + /* Leaky integrator, same as fixed point (1.0f / 512.0f) */ + sum += out[s] - sum * 0.00195f; + *output = sum; + } +#endif /* Don't bother with ring buffering. * The entire buffer should be read out ideally anyways. */ diff --git a/libgambatte/libretro/blipper.h b/libgambatte/libretro/blipper.h index ed49da5..74063bd 100644 --- a/libgambatte/libretro/blipper.h +++ b/libgambatte/libretro/blipper.h @@ -27,17 +27,26 @@ #ifndef BLIPPER_H__ #define BLIPPER_H__ -/* Configurables. */ +/* Compile time configurables. */ #ifndef BLIPPER_LOG_PERFORMANCE #define BLIPPER_LOG_PERFORMANCE 0 #endif -#ifndef BLIPPER_LOG_CLIPPING -#define BLIPPER_LOG_CLIPPING 1 +#ifndef BLIPPER_FIXED_POINT +#define BLIPPER_FIXED_POINT 1 #endif -#ifndef BLIPPER_SIMD -#define BLIPPER_SIMD 1 +/* Set to float or double. + * long double is unlikely to provide any improved precision. */ +#ifndef BLIPPER_REAL_T +#define BLIPPER_REAL_T float +#endif + +/* Allows including several implementations in one lib. */ +#if BLIPPER_FIXED_POINT +#define BLIPPER_MANGLE(x) x##_fixed +#else +#define BLIPPER_MANGLE(x) x##_##BLIPPER_REAL_T #endif #ifdef __cplusplus @@ -47,11 +56,13 @@ extern "C" { #include typedef struct blipper blipper_t; +typedef BLIPPER_REAL_T blipper_real_t; +#if BLIPPER_FIXED_POINT #ifdef HAVE_STDINT_H #include typedef int16_t blipper_sample_t; -typedef int32_t blipper_fixed_t; +typedef int32_t blipper_long_sample_t; #else #if SHRT_MAX == 0x7fff typedef short blipper_sample_t; @@ -62,30 +73,54 @@ typedef int blipper_sample_t; #endif #if INT_MAX == 0x7fffffffl -typedef int blipper_fixed_t; +typedef int blipper_long_sample_t; #elif LONG_MAX == 0x7fffffffl -typedef long blipper_fixed_t; +typedef long blipper_long_sample_t; #else -#error "Cannot find suitable type for blipper_fixed_t." +#error "Cannot find suitable type for blipper_long_sample_t." #endif #endif +#else +typedef BLIPPER_REAL_T blipper_sample_t; +typedef BLIPPER_REAL_T blipper_long_sample_t; /* Meaningless for float version. */ +#endif /* Create a new blipper. * taps: Number of filter taps per impulse. + * * cutoff: Cutoff frequency in the passband. Has a range of [0, 1]. + * * beta: Beta used for Kaiser window. + * * decimation: Sets decimation rate. Must be power-of-two (2^n). * The input sampling rate is then output_rate * 2^decimation. * buffer_samples: The maximum number of processed output samples that can be * buffered up by blipper. * + * filter_bank: An optional filter which has already been created by + * blipper_create_filter_bank(). blipper_new() does not take ownership + * of the buffer and must be freed by caller. + * If non-NULL, cutoff and beta will be ignored. + * * Some sane values: * taps = 64, cutoff = 0.85, beta = 8.0 */ +#define blipper_new BLIPPER_MANGLE(blipper_new) blipper_t *blipper_new(unsigned taps, double cutoff, double beta, - unsigned decimation, unsigned buffer_samples); + unsigned decimation, unsigned buffer_samples, const blipper_sample_t *filter_bank); + +/* Reset the blipper to its initiate state. */ +#define blipper_reset BLIPPER_MANGLE(blipper_reset) +void blipper_reset(blipper_t *blip); + +/* Create a filter which can be passed to blipper_new() in filter_bank. + * Arguments to decimation and taps must match. */ +#define blipper_create_filter_bank BLIPPER_MANGLE(blipper_create_filter_bank) +blipper_sample_t *blipper_create_filter_bank(unsigned decimation, + unsigned taps, double cutoff, double beta); /* Frees the blipper. blip can be NULL (no-op). */ +#define blipper_free BLIPPER_MANGLE(blipper_free) void blipper_free(blipper_t *blip); /* Data pushing interfaces. One of these should be used exclusively. */ @@ -100,19 +135,22 @@ void blipper_free(blipper_t *blip); * The caller must ensure not to push deltas in a way that can destabilize * the final integration. */ -void blipper_push_delta(blipper_t *blip, blipper_fixed_t delta, unsigned clocks_step); +#define blipper_push_delta BLIPPER_MANGLE(blipper_push_delta) +void blipper_push_delta(blipper_t *blip, blipper_long_sample_t delta, unsigned clocks_step); /* Push raw samples. blipper will find the deltas themself and push them. * stride is the number of samples between each sample to be used. * This can be used to push interleaved stereo data to two independent * blippers. */ +#define blipper_push_samples BLIPPER_MANGLE(blipper_push_samples) void blipper_push_samples(blipper_t *blip, const blipper_sample_t *delta, unsigned samples, unsigned stride); /* Returns the number of samples available for reading using * blipper_read(). */ +#define blipper_read_avail BLIPPER_MANGLE(blipper_read_avail) unsigned blipper_read_avail(blipper_t *blip); /* Reads processed samples. The caller must ensure to not read @@ -121,6 +159,7 @@ unsigned blipper_read_avail(blipper_t *blip); * between each output sample in output. * Can be used to write to an interleaved stereo buffer. */ +#define blipper_read BLIPPER_MANGLE(blipper_read) void blipper_read(blipper_t *blip, blipper_sample_t *output, unsigned samples, unsigned stride); diff --git a/libgambatte/libretro/libretro.cpp b/libgambatte/libretro/libretro.cpp index ee64877..fc2e804 100644 --- a/libgambatte/libretro/libretro.cpp +++ b/libgambatte/libretro/libretro.cpp @@ -74,8 +74,8 @@ void retro_init() double fps = 4194304.0 / 70224.0; double sample_rate = fps * 35112; - resampler_l = blipper_new(32, 0.85, 6.5, 64, 1024); - resampler_r = blipper_new(32, 0.85, 6.5, 64, 1024); + resampler_l = blipper_new(32, 0.85, 6.5, 64, 1024, NULL); + resampler_r = blipper_new(32, 0.85, 6.5, 64, 1024, NULL); if (environ_cb) {