Update blipper.

2024-11-22 23:39:58 +00:00 · 2013-11-03 19:22:56 +01:00 · 2013-11-03 19:22:56 +01:00 · 246a88ff40
commit 246a88ff40
parent b2240b89de
3 changed files with 152 additions and 100 deletions
--- a/libgambatte/libretro/blipper.c
+++ b/libgambatte/libretro/blipper.c
@ -31,12 +31,6 @@
 #include <string.h>
 #include <math.h>

-#if BLIPPER_SIMD
-#ifdef __SSE2__
-#include <emmintrin.h>
-#endif
-#endif
-
 #if BLIPPER_LOG_PERFORMANCE
 #include <time.h>
 static double get_time(void)
@ -49,7 +43,7 @@ static double get_time(void)

 struct blipper
 {
-   blipper_fixed_t *output_buffer;
+   blipper_long_sample_t *output_buffer;
   unsigned output_avail;
   unsigned output_buffer_samples;

@ -60,7 +54,7 @@ struct blipper
   unsigned phases_log2;
   unsigned taps;

-   blipper_fixed_t integrator;
+   blipper_long_sample_t integrator;
   blipper_sample_t last_sample;

 #if BLIPPER_LOG_PERFORMANCE
@ -68,6 +62,8 @@ struct blipper
   double integrator_time;
   unsigned long total_samples;
 #endif
+
+   int owns_filter;
 };

 void blipper_free(blipper_t *blip)
@ -78,7 +74,8 @@ void blipper_free(blipper_t *blip)
      fprintf(stderr, "[blipper]: Processed %lu samples, using %.6f seconds blipping and %.6f seconds integrating.\n", blip->total_samples, blip->total_time, blip->integrator_time);
 #endif

-      free(blip->filter_bank);
+      if (blip->owns_filter)
+         free(blip->filter_bank);
      free(blip->output_buffer);
      free(blip);
   }
@ -128,14 +125,14 @@ static double kaiser_window(double index, double beta)
 #define M_PI 3.14159265358979323846
 #endif

-static float *blipper_create_sinc(unsigned phases, unsigned taps,
+static blipper_real_t *blipper_create_sinc(unsigned phases, unsigned taps,
      double cutoff, double beta)
 {
   unsigned i, filter_len;
   double sidelobes, window_mod, window_phase, sinc_phase;
-   float *filter;
+   blipper_real_t *filter;

-   filter = malloc(phases * taps * sizeof(*filter));
+   filter = (blipper_real_t*)malloc(phases * taps * sizeof(*filter));
   if (!filter)
      return NULL;

@ -166,17 +163,17 @@ static float *blipper_create_sinc(unsigned phases, unsigned taps,
 * This filtering creates a finite length filter, albeit slightly longer.
 *
 * phases is the same as decimation rate. */
-static float *blipper_prefilter_sinc(float *filter, unsigned phases,
-      unsigned *out_taps)
+static blipper_real_t *blipper_prefilter_sinc(blipper_real_t *filter, unsigned phases,
+      unsigned taps)
 {
   unsigned i;
-   unsigned taps = *out_taps;
-   float *tmp_filter;
-   float *new_filter = malloc((phases * taps + phases) * sizeof(*filter));
+   float filter_amp = 0.75f / phases;
+   blipper_real_t *tmp_filter;
+   blipper_real_t *new_filter = (blipper_real_t*)malloc((phases * taps + phases) * sizeof(*filter));
   if (!new_filter)
      goto error;

-   tmp_filter = realloc(filter, (phases * taps + phases) * sizeof(*filter));
+   tmp_filter = (blipper_real_t*)realloc(filter, (phases * taps + phases) * sizeof(*filter));
   if (!tmp_filter)
      goto error;
   filter = tmp_filter;
@ -188,12 +185,19 @@ static float *blipper_prefilter_sinc(float *filter, unsigned phases,
   for (i = phases * taps; i < phases * taps + phases; i++)
      new_filter[i] = new_filter[phases * taps - 1];

+   taps++;
+
   /* Differentiate with offset of D. */
   memcpy(filter, new_filter, phases * sizeof(*filter));
-   for (i = phases; i < phases * taps + phases; i++)
+   for (i = phases; i < phases * taps; i++)
      filter[i] = new_filter[i] - new_filter[i - phases];

-   *out_taps = taps + 1;
+   /* blipper_prefilter_sinc() boosts the gain of the sinc.
+    * Have to compensate for this. Attenuate a bit more to ensure
+    * we don't clip, especially in fixed point. */
+   for (i = 0; i < phases * taps; i++)
+      filter[i] *= filter_amp;
+
   free(new_filter);
   return filter;

@ -206,11 +210,11 @@ error:
 /* Creates a polyphase filter bank.
 * Interleaves the filter for cache coherency and possibilities
 * for SIMD processing. */
-static float *blipper_interleave_sinc(float *filter, unsigned phases,
+static blipper_real_t *blipper_interleave_sinc(blipper_real_t *filter, unsigned phases,
      unsigned taps)
 {
   unsigned t, p;
-   float *new_filter = malloc(phases * taps * sizeof(*filter));
+   blipper_real_t *new_filter = (blipper_real_t*)malloc(phases * taps * sizeof(*filter));
   if (!new_filter)
      goto error;

@ -227,15 +231,16 @@ error:
   return NULL;
 }

-static blipper_sample_t *blipper_quantize_sinc(float *filter, unsigned taps, float amp)
+#if BLIPPER_FIXED_POINT
+static blipper_sample_t *blipper_quantize_sinc(blipper_real_t *filter, unsigned taps)
 {
   unsigned t;
-   blipper_sample_t *filt = malloc(taps * sizeof(*filt));
+   blipper_sample_t *filt = (blipper_sample_t*)malloc(taps * sizeof(*filt));
   if (!filt)
      goto error;

   for (t = 0; t < taps; t++)
-      filt[t] = (blipper_sample_t)floor(filter[t] * 0x7fff * amp + 0.5);
+      filt[t] = (blipper_sample_t)floor(filter[t] * 0x7fff + 0.5);

   free(filter);
   return filt;
@ -245,33 +250,38 @@ error:
   free(filt);
   return NULL;
 }
+#endif

-static int blipper_create_filter_bank(blipper_t *blip, unsigned taps,
+blipper_sample_t *blipper_create_filter_bank(unsigned phases, unsigned taps,
      double cutoff, double beta)
 {
-   float *sinc_filter;
+   blipper_real_t *sinc_filter;

+   /* blipper_prefilter_sinc() will add one tap.
+    * To keep number of taps as expected, compensate for it here
+    * to keep the interface more obvious. */
   if (taps <= 1)
      return 0;
   taps--;

-   sinc_filter = blipper_create_sinc(blip->phases, taps, cutoff, beta);
+   sinc_filter = blipper_create_sinc(phases, taps, cutoff, beta);
   if (!sinc_filter)
      return 0;

-   sinc_filter = blipper_prefilter_sinc(sinc_filter, blip->phases, &taps);
+   sinc_filter = blipper_prefilter_sinc(sinc_filter, phases, taps);
   if (!sinc_filter)
      return 0;
-   sinc_filter = blipper_interleave_sinc(sinc_filter, blip->phases, taps);
+   taps++;
+
+   sinc_filter = blipper_interleave_sinc(sinc_filter, phases, taps);
   if (!sinc_filter)
      return 0;

-   blip->filter_bank = blipper_quantize_sinc(sinc_filter, blip->phases * taps, 0.85f / blip->phases);
-   if (!blip->filter_bank)
-      return 0;
-
-   blip->taps = taps;
-   return 1;
+#if BLIPPER_FIXED_POINT
+   return blipper_quantize_sinc(sinc_filter, phases * taps);
+#else
+   return sinc_filter;
+#endif
 }

 static unsigned log2_int(unsigned v)
@ -282,8 +292,19 @@ static unsigned log2_int(unsigned v)
   return ret;
 }

+void blipper_reset(blipper_t *blip)
+{
+   blip->phase = 0;
+   memset(blip->output_buffer, 0,
+         (blip->output_avail + blip->taps) * sizeof(*blip->output_buffer));
+   blip->output_avail = 0;
+   blip->last_sample = 0;
+   blip->integrator = 0;
+}
+
 blipper_t *blipper_new(unsigned taps, double cutoff, double beta,
-      unsigned decimation, unsigned buffer_samples)
+      unsigned decimation, unsigned buffer_samples,
+      const blipper_sample_t *filter_bank)
 {
   blipper_t *blip = NULL;

@ -300,17 +321,26 @@ blipper_t *blipper_new(unsigned taps, double cutoff, double beta,
      return NULL;
   }

-   blip = calloc(1, sizeof(*blip));
+   blip = (blipper_t*)calloc(1, sizeof(*blip));
   if (!blip)
      return NULL;

   blip->phases = decimation;
   blip->phases_log2 = log2_int(decimation);

-   if (!blipper_create_filter_bank(blip, taps, cutoff, beta))
-      goto error;
+   blip->taps = taps;

-   blip->output_buffer = calloc(buffer_samples + blip->taps,
+   if (!filter_bank)
+   {
+      blip->filter_bank = blipper_create_filter_bank(blip->phases, taps, cutoff, beta);
+      if (!blip->filter_bank)
+         goto error;
+      blip->owns_filter = 1;
+   }
+   else
+      blip->filter_bank = (blipper_sample_t*)filter_bank;
+
+   blip->output_buffer = (blipper_long_sample_t*)calloc(buffer_samples + blip->taps,
         sizeof(*blip->output_buffer));
   if (!blip->output_buffer)
      goto error;
@ -323,11 +353,11 @@ error:
   return NULL;
 }

-void blipper_push_delta(blipper_t *blip, blipper_fixed_t delta, unsigned clocks_step)
+void blipper_push_delta(blipper_t *blip, blipper_long_sample_t delta, unsigned clocks_step)
 {
   unsigned target_output, filter_phase, taps, i;
   const blipper_sample_t *response;
-   blipper_fixed_t *target;
+   blipper_long_sample_t *target;

   blip->phase += clocks_step;

@ -339,31 +369,8 @@ void blipper_push_delta(blipper_t *blip, blipper_fixed_t delta, unsigned clocks_
   target = blip->output_buffer + target_output;
   taps = blip->taps;

-   /* Decent SIMD target */
-   /* This is extremely unlikely to ever saturate, so don't bother.
-    * The sinc is attenuated a bit, and positive deltas generally have to be alternating. */
-
-#if BLIPPER_SIMD && defined(__SSE2__)
-   {
-      __m128i t, t0, t1, res0, res1;
-      __m128i d = _mm_set1_epi16(delta);
-      for (i = 0; i + 8 <= taps; i += 8)
-      {
-         t = _mm_loadu_si128((__m128i*)(response + i));
-         t0 = _mm_unpacklo_epi16(t, _mm_setzero_si128());
-         t1 = _mm_unpackhi_epi16(t, _mm_setzero_si128());
-         res0 = _mm_add_epi32(_mm_madd_epi16(t0, d), _mm_loadu_si128((__m128i*)(target + i + 0)));
-         res1 = _mm_add_epi32(_mm_madd_epi16(t1, d), _mm_loadu_si128((__m128i*)(target + i + 4)));
-         _mm_storeu_si128((__m128i*)(target + i + 0), res0);
-         _mm_storeu_si128((__m128i*)(target + i + 4), res1);
-      }
-      for (; i < taps; i++)
-         target[i] += delta * response[i];
-   }
-#else
   for (i = 0; i < taps; i++)
      target[i] += delta * response[i];
-#endif

   blip->output_avail = target_output;
 }
@ -384,7 +391,7 @@ void blipper_push_samples(blipper_t *blip, const blipper_sample_t *data,
      blipper_sample_t val = *data;
      if (val != last)
      {
-         blipper_push_delta(blip, (blipper_fixed_t)val - (blipper_fixed_t)last, clocks_skip + 1);
+         blipper_push_delta(blip, (blipper_long_sample_t)val - (blipper_long_sample_t)last, clocks_skip + 1);
         clocks_skip = 0;
         last = val;
      }
@ -411,38 +418,44 @@ void blipper_read(blipper_t *blip, blipper_sample_t *output, unsigned samples,
      unsigned stride)
 {
   unsigned s;
-   blipper_sample_t quant;
-   blipper_fixed_t sum = blip->integrator;
-   const blipper_fixed_t *out = blip->output_buffer;
+   blipper_long_sample_t sum = blip->integrator;
+   const blipper_long_sample_t *out = blip->output_buffer;

 #if BLIPPER_LOG_PERFORMANCE
   double t0 = get_time();
 #endif

+#if BLIPPER_FIXED_POINT
   for (s = 0; s < samples; s++, output += stride)
   {
-      /* Cannot overflow */
-      sum += out[s] >> 1;
+      blipper_long_sample_t quant;

-      /* Clip sum. Really shoudn't happen though. */
-      if (sum > 0x3fff0000l)
-      {
-#if BLIPPER_LOG_CLIPPING
-         fprintf(stderr, "Positive clipping: 0x%lx -> 0x3fff0000.\n", (unsigned long)sum);
-#endif
-         sum = 0x3fff0000l;
-      }
-      else if (sum < -0x3fff0000l)
-      {
-#if BLIPPER_LOG_CLIPPING
-         fprintf(stderr, "Negative clipping: -0x%lx -> -0x3fff0000.\n", (unsigned long)-sum);
-#endif
-         sum = -0x3fff0000l;
-      }
+      /* Cannot overflow. Also add a leaky integrator.
+         Mitigates DC shift numerical instability which is
+         inherent for integrators. */
+      sum += (out[s] >> 1) - (sum >> 9);

+      /* Rounded. With leaky integrator, this cannot overflow. */
      quant = (sum + 0x4000) >> 15;
+
+      /* Clamp. quant can potentially have range [-0x10000, 0xffff] here.
+       * In both cases, top 16-bits will have a uniform bit pattern which can be exploited. */
+      if ((blipper_sample_t)quant != quant)
+      {
+         quant = (quant >> 16) ^ 0x7fff;
+         sum = quant << 15;
+      }
+
      *output = quant;
   }
+#else
+   for (s = 0; s < samples; s++, output += stride)
+   {
+      /* Leaky integrator, same as fixed point (1.0f / 512.0f) */
+      sum += out[s] - sum * 0.00195f;
+      *output = sum;
+   }
+#endif

   /* Don't bother with ring buffering.
    * The entire buffer should be read out ideally anyways. */
--- a/libgambatte/libretro/blipper.h
+++ b/libgambatte/libretro/blipper.h
@ -27,17 +27,26 @@
 #ifndef BLIPPER_H__
 #define BLIPPER_H__

-/* Configurables. */
+/* Compile time configurables. */
 #ifndef BLIPPER_LOG_PERFORMANCE
 #define BLIPPER_LOG_PERFORMANCE 0
 #endif

-#ifndef BLIPPER_LOG_CLIPPING
-#define BLIPPER_LOG_CLIPPING 1
+#ifndef BLIPPER_FIXED_POINT
+#define BLIPPER_FIXED_POINT 1
 #endif

-#ifndef BLIPPER_SIMD
-#define BLIPPER_SIMD 1
+/* Set to float or double.
+ * long double is unlikely to provide any improved precision. */
+#ifndef BLIPPER_REAL_T
+#define BLIPPER_REAL_T float
+#endif
+
+/* Allows including several implementations in one lib. */
+#if BLIPPER_FIXED_POINT
+#define BLIPPER_MANGLE(x) x##_fixed
+#else
+#define BLIPPER_MANGLE(x) x##_##BLIPPER_REAL_T
 #endif

 #ifdef __cplusplus
@ -47,11 +56,13 @@ extern "C" {
 #include <limits.h>

 typedef struct blipper blipper_t;
+typedef BLIPPER_REAL_T blipper_real_t;

+#if BLIPPER_FIXED_POINT
 #ifdef HAVE_STDINT_H
 #include <stdint.h>
 typedef int16_t blipper_sample_t;
-typedef int32_t blipper_fixed_t;
+typedef int32_t blipper_long_sample_t;
 #else
 #if SHRT_MAX == 0x7fff
 typedef short blipper_sample_t;
@ -62,30 +73,54 @@ typedef int blipper_sample_t;
 #endif

 #if INT_MAX == 0x7fffffffl
-typedef int blipper_fixed_t;
+typedef int blipper_long_sample_t;
 #elif LONG_MAX == 0x7fffffffl
-typedef long blipper_fixed_t;
+typedef long blipper_long_sample_t;
 #else
-#error "Cannot find suitable type for blipper_fixed_t."
+#error "Cannot find suitable type for blipper_long_sample_t."
 #endif
 #endif
+#else
+typedef BLIPPER_REAL_T blipper_sample_t;
+typedef BLIPPER_REAL_T blipper_long_sample_t; /* Meaningless for float version. */
+#endif

 /* Create a new blipper.
 * taps: Number of filter taps per impulse.
+ *
 * cutoff: Cutoff frequency in the passband. Has a range of [0, 1].
+ *
 * beta: Beta used for Kaiser window.
+ *
 * decimation: Sets decimation rate. Must be power-of-two (2^n).
 * The input sampling rate is then output_rate * decimation.
 * buffer_samples: The maximum number of processed output samples that can be
 * buffered up by blipper.
 *
+ * filter_bank: An optional filter which has already been created by
+ * blipper_create_filter_bank(). blipper_new() does not take ownership
+ * of the buffer; the caller remains responsible for freeing it.
+ * If non-NULL, cutoff and beta will be ignored.
+ *
 * Some sane values:
 * taps = 64, cutoff = 0.85, beta = 8.0
 */
+#define blipper_new BLIPPER_MANGLE(blipper_new)
 blipper_t *blipper_new(unsigned taps, double cutoff, double beta,
-      unsigned decimation, unsigned buffer_samples);
+      unsigned decimation, unsigned buffer_samples, const blipper_sample_t *filter_bank);
+
+/* Reset the blipper to its initial state. */
+#define blipper_reset BLIPPER_MANGLE(blipper_reset)
+void blipper_reset(blipper_t *blip);
+
+/* Create a filter which can be passed to blipper_new() in filter_bank.
+ * The decimation and taps arguments must match those given to blipper_new(). */
+#define blipper_create_filter_bank BLIPPER_MANGLE(blipper_create_filter_bank)
+blipper_sample_t *blipper_create_filter_bank(unsigned decimation,
+      unsigned taps, double cutoff, double beta);

 /* Frees the blipper. blip can be NULL (no-op). */
+#define blipper_free BLIPPER_MANGLE(blipper_free)
 void blipper_free(blipper_t *blip);

 /* Data pushing interfaces. One of these should be used exclusively. */
@ -100,19 +135,22 @@ void blipper_free(blipper_t *blip);
 * The caller must ensure not to push deltas in a way that can destabilize
 * the final integration.
 */
-void blipper_push_delta(blipper_t *blip, blipper_fixed_t delta, unsigned clocks_step);
+#define blipper_push_delta BLIPPER_MANGLE(blipper_push_delta)
+void blipper_push_delta(blipper_t *blip, blipper_long_sample_t delta, unsigned clocks_step);

 /* Push raw samples. blipper will find the deltas itself and push them.
 * stride is the number of samples between each sample to be used.
 * This can be used to push interleaved stereo data to two independent
 * blippers.
 */
+#define blipper_push_samples BLIPPER_MANGLE(blipper_push_samples)
 void blipper_push_samples(blipper_t *blip, const blipper_sample_t *delta,
      unsigned samples, unsigned stride);

 /* Returns the number of samples available for reading using
 * blipper_read().
 */
+#define blipper_read_avail BLIPPER_MANGLE(blipper_read_avail)
 unsigned blipper_read_avail(blipper_t *blip);

 /* Reads processed samples. The caller must ensure to not read
@ -121,6 +159,7 @@ unsigned blipper_read_avail(blipper_t *blip);
 * between each output sample in output.
 * Can be used to write to an interleaved stereo buffer.
 */
+#define blipper_read BLIPPER_MANGLE(blipper_read)
 void blipper_read(blipper_t *blip, blipper_sample_t *output, unsigned samples,
      unsigned stride);

--- a/libgambatte/libretro/libretro.cpp
+++ b/libgambatte/libretro/libretro.cpp
@ -74,8 +74,8 @@ void retro_init()
   double fps = 4194304.0 / 70224.0;
   double sample_rate = fps * 35112;

-   resampler_l = blipper_new(32, 0.85, 6.5, 64, 1024);
-   resampler_r = blipper_new(32, 0.85, 6.5, 64, 1024);
+   resampler_l = blipper_new(32, 0.85, 6.5, 64, 1024, NULL);
+   resampler_r = blipper_new(32, 0.85, 6.5, 64, 1024, NULL);

   if (environ_cb)
   {