add NEON optimizations for the CC resampler.

2024-11-26 09:41:15 +00:00 · 2014-09-22 08:28:24 +01:00 · 2014-09-22 08:28:24 +01:00 · 7899dea89f
commit 7899dea89f
parent 40402174d9
4 changed files with 458 additions and 82 deletions
--- a/Makefile.common
+++ b/Makefile.common
@ -232,6 +232,7 @@ endif

 ifeq ($(HAVE_NEON),1)
   OBJ += audio/resamplers/sinc_neon.o
+   # NEON kernels for the CC resampler; list the object file like sinc_neon.o above
+   OBJ += audio/resamplers/cc_resampler_neon.o
   # When compiled without this, tries to attempt to compile sinc lerp,
   # which will error out
 	#
--- a/android/phoenix/jni/Android.mk
+++ b/android/phoenix/jni/Android.mk
@ -28,6 +28,7 @@ ifeq ($(HAVE_NEON),1)
 	LOCAL_CFLAGS += -D__ARM_NEON__
   LOCAL_SRC_FILES += $(RARCH_DIR)/audio/utils_neon.S.neon
   LOCAL_SRC_FILES += $(RARCH_DIR)/audio/resamplers/sinc_neon.S.neon
+   LOCAL_SRC_FILES += $(RARCH_DIR)/audio/resamplers/cc_resampler_neon.S.neon
 endif
 LOCAL_CFLAGS += -DSINC_LOWER_QUALITY 

--- a/audio/resamplers/cc_resampler.c
+++ b/audio/resamplers/cc_resampler.c
@ -29,6 +29,45 @@
 #endif


+/* since SSE and NEON don't provide support for trigonometric functions
+ * we approximate those with polynomials
+ *
+ * CC_RESAMPLER_PRECISION defines how accurate the approximation is
+ * a setting of 5 or more means full precision.
+ * setting 0 doesn't use a polynomial
+ * setting 1 uses P(X) = X - (3/4)*X^3 + (1/4)*X^5
+ *
+ * only 0 and 1 are implemented for SSE and NEON currently
+ *
+ * the MIPS_ARCH_ALLEGREX target doesn't require this setting since it has
+ * native support for the required functions so it will always use full precision.
+ */
+
+#ifndef CC_RESAMPLER_PRECISION
+#define CC_RESAMPLER_PRECISION 1
+#endif
+
+#ifndef min
+#define min(a, b) ((a) < (b) ? (a) : (b))
+#endif
+
+#ifndef ALIGN_MEMORY
+#ifdef _MSC_VER
+# define ALIGN_MEMORY(X) __declspec(align(X))
+#else
+# define ALIGN_MEMORY(X) __attribute__((aligned(X)))
+#endif
+#endif
+
+/* Resampler state used by the SSE, NEON and C implementations below. */
+typedef struct rarch_CC_resampler
+{
+   /* Four buffered audio frames. The SSE code loads buffer[0] and buffer[2]
+    * as two 4-float vectors; the NEON code loads the whole array with a
+    * 256-bit aligned access.
+    * NOTE(review): ALIGN_MEMORY only covers storage laid out by the compiler;
+    * a heap-allocated instance is aligned only if the allocator guarantees
+    * 32 bytes - confirm in resampler_CC_init. */
+   ALIGN_MEMORY(32) audio_frame_float_t buffer[4];
+
+   /* Position accumulator; the resampling loops read it on entry and write
+    * it back on exit so it carries over between calls. */
+   float distance;
+   /* Routine that performs the actual resampling for this instance. */
+   void (*process)(void *re, struct resampler_data *data);
+} rarch_CC_resampler_t;
+
+
 #ifdef _MIPS_ARCH_ALLEGREX
 static void resampler_CC_process(void *re_, struct resampler_data *data)
 {
@ -146,29 +185,14 @@ static void *resampler_CC_init(double bandwidth_mod)
   RARCH_LOG("\nConvoluted Cosine resampler (VFPU): \n");
   return (void*)-1;
 }
-#elif defined(__SSE__)
+#else

-/* uses a fast polynomial approximation
- * since SSE lacks native support for trigonometric functions
- * cc_int is approximated with P(X) = X - (3/4)*X^3 + (1/4)*X^5
- */

+#if defined(__SSE__)

 #include <xmmintrin.h>

-#ifndef CC_RESAMPLER_PRECISION
-#define CC_RESAMPLER_PRECISION 1
-#endif
-
-typedef struct rarch_CC_resampler
-{
-   __m128 previous;
-   __m128 current;
-
-   float distance;
-   void (*process)(void *re, struct resampler_data *data);
-} rarch_CC_resampler_t;
-
+#define CC_RESAMPLER_IDENT "SSE"

 static void resampler_CC_downsample(void *re_, struct resampler_data *data)
 {
@ -182,8 +206,8 @@ static void resampler_CC_downsample(void *re_, struct resampler_data *data)
   ratio = 1.0 / data->ratio;
   b = data->ratio; /* cutoff frequency. */

-   __m128 vec_previous = _mm_loadu_ps((float*)&re->previous);
-   __m128 vec_current  = _mm_loadu_ps((float*)&re->current);
+   __m128 vec_previous = _mm_loadu_ps((float*)&re->buffer[0]);
+   __m128 vec_current  = _mm_loadu_ps((float*)&re->buffer[2]);

   while (inp != inp_max)
   {
@ -249,16 +273,12 @@ static void resampler_CC_downsample(void *re_, struct resampler_data *data)
      }
   }

-   _mm_storeu_ps((float*)&re->previous, vec_previous);
-   _mm_storeu_ps((float*)&re->current,  vec_current);
+   _mm_storeu_ps((float*)&re->buffer[0], vec_previous);
+   _mm_storeu_ps((float*)&re->buffer[2],  vec_current);

   data->output_frames = outp - (audio_frame_float_t*)data->data_out;
 }

-#ifndef min
-#define min(a, b) ((a) < (b) ? (a) : (b))
-#endif
-
 static void resampler_CC_upsample(void *re_, struct resampler_data *data)
 {
   float b, ratio;
@ -271,8 +291,8 @@ static void resampler_CC_upsample(void *re_, struct resampler_data *data)
   b = min(data->ratio, 1.00); /* cutoff frequency. */
   ratio = 1.0 / data->ratio;

-   __m128 vec_previous = _mm_loadu_ps((float*)&re->previous);
-   __m128 vec_current  = _mm_loadu_ps((float*)&re->current);
+   __m128 vec_previous = _mm_loadu_ps((float*)&re->buffer[0]);
+   __m128 vec_current  = _mm_loadu_ps((float*)&re->buffer[2]);



@ -338,69 +358,39 @@ static void resampler_CC_upsample(void *re_, struct resampler_data *data)
      inp++;
   }

-   _mm_storeu_ps((float*)&re->previous, vec_previous);
-   _mm_storeu_ps((float*)&re->current,  vec_current);
+   _mm_storeu_ps((float*)&re->buffer[0], vec_previous);
+   _mm_storeu_ps((float*)&re->buffer[2],  vec_current);

   data->output_frames = outp - (audio_frame_float_t*)data->data_out;
 }


-static void resampler_CC_process(void *re_, struct resampler_data *data)
+#elif defined (__ARM_NEON__)
+
+#define CC_RESAMPLER_IDENT "NEON"
+
+/* Implemented in cc_resampler_neon.S. Both routines take the output and input
+ * sample pointers, the resampler state, the number of input frames and the
+ * resampling ratio, and return the number of frames written to outp. */
+size_t resampler_CC_downsample_neon(float *outp, const float *inp,
+      rarch_CC_resampler_t* re_, size_t input_frames, float ratio);
+size_t resampler_CC_upsample_neon  (float *outp, const float *inp,
+      rarch_CC_resampler_t* re_, size_t input_frames, float ratio);
+
+/* NEON build of the downsampler: hands the buffers, state and ratio from
+ * data to the assembly routine and stores the frame count it returns in
+ * data->output_frames. */
+static void resampler_CC_downsample(void *re_, struct resampler_data *data)
 {
-   rarch_CC_resampler_t *re = (rarch_CC_resampler_t*)re_;
-   re->process(re_, data);
+   data->output_frames = resampler_CC_downsample_neon(data->data_out, data->data_in, re_, data->input_frames, data->ratio);
 }

-static void resampler_CC_free(void *re_)
+/* NEON build of the upsampler: hands the buffers, state and ratio from
+ * data to the assembly routine and stores the frame count it returns in
+ * data->output_frames. */
+static void resampler_CC_upsample(void *re_, struct resampler_data *data)
 {
-   rarch_CC_resampler_t *re = (rarch_CC_resampler_t*)re_;
-   if (re)
-      free(re);
+   data->output_frames = resampler_CC_upsample_neon(data->data_out, data->data_in, re_, data->input_frames, data->ratio);
 }

-static void *resampler_CC_init(double bandwidth_mod)
-{
-   int i;
-   rarch_CC_resampler_t *re = (rarch_CC_resampler_t*)
-      calloc(1, sizeof(rarch_CC_resampler_t));
-   if (!re)
-      return NULL;
-
-   for (i = 0; i < 4; i++)
-   {
-      re->previous = _mm_setzero_ps();
-      re->current    = _mm_setzero_ps();
-   }
-
-   RARCH_LOG("Convoluted Cosine resampler (SSE) : ");
-
-   /* variations of data->ratio around 0.75 are safer
-    * than around 1.0 for both up/downsampler. */
-   if (bandwidth_mod < 0.75)
-   {
-      RARCH_LOG("CC_downsample @%f \n", bandwidth_mod);
-      re->process = resampler_CC_downsample;
-      re->distance = 0.0;
-   }
-   else
-   {
-      RARCH_LOG("CC_upsample @%f \n", bandwidth_mod);
-      re->process = resampler_CC_upsample;
-      re->distance = 2.0;
-   }
-
-   return re;
-}
 #else

 /* C reference version. Not optimized. */
-typedef struct rarch_CC_resampler
-{
-   audio_frame_float_t buffer[4];
-   float distance;
-   void (*process)(void *re, struct resampler_data *data);
-} rarch_CC_resampler_t;

+#define CC_RESAMPLER_IDENT "C"
+
+#if (CC_RESAMPLER_PRECISION > 4)
 static inline float cc_int(float x, float b)
 {
   float val = x * b * M_PI + sinf(x * b * M_PI);
@ -411,6 +401,21 @@ static inline float cc_kernel(float x, float b)
 {
   return (cc_int(x + 0.5, b) - cc_int(x - 0.5, b)) / (2.0 * M_PI);
 }
+#else
+/* Polynomial replacement for the sinf-based cc_int, compiled when
+ * CC_RESAMPLER_PRECISION is 4 or lower.
+ * With a precision above 0 it evaluates v - (3/4)*v^3 + (1/4)*v^5 at
+ * v = x * b; with precision 0 it uses v unchanged.
+ * The result is clamped to [-0.5, 0.5]. */
+static inline float cc_int(float x, float b)
+{
+   float val = x * b;
+#if (CC_RESAMPLER_PRECISION > 0)
+   /* factored form of val - 0.75*val^3 + 0.25*val^5 */
+   val = val*(1 - 0.25 * val * val * (3.0 - val * val));
+#endif
+   return (val > 0.5) ? 0.5 : (val < -0.5) ? -0.5 : val;
+}
+
+/* Kernel weight at offset x: cc_int(x + 0.5) minus cc_int(x - 0.5).
+ * Unlike the full-precision variant above there is no division by 2*pi,
+ * because this cc_int is already clamped to +/-0.5 rather than +/-pi
+ * (NOTE(review): the +/-pi range of the full-precision cc_int is inferred
+ * from its 2*pi divisor - confirm). */
+static inline float cc_kernel(float x, float b)
+{
+   return (cc_int(x + 0.5, b) - cc_int(x - 0.5, b));
+}
+#endif

 static inline void add_to(const audio_frame_float_t *source,
      audio_frame_float_t *target, float ratio)
@ -458,10 +463,6 @@ static void resampler_CC_downsample(void *re_, struct resampler_data *data)
   data->output_frames = outp - (audio_frame_float_t*)data->data_out;
 }

-#ifndef min
-#define min(a, b) ((a) < (b) ? (a) : (b))
-#endif
-
 static void resampler_CC_upsample(void *re_, struct resampler_data *data)
 {
   float b, ratio;
@ -505,6 +506,7 @@ static void resampler_CC_upsample(void *re_, struct resampler_data *data)

   data->output_frames = outp - (audio_frame_float_t*)data->data_out;
 }
+#endif

 static void resampler_CC_process(void *re_, struct resampler_data *data)
 {
@ -533,7 +535,7 @@ static void *resampler_CC_init(double bandwidth_mod)
      re->buffer[i].r = 0.0;
   }

-   RARCH_LOG("Convoluted Cosine resampler (C) : ");
+   RARCH_LOG("Convoluted Cosine resampler (" CC_RESAMPLER_IDENT ") - precision = %i : ", CC_RESAMPLER_PRECISION);

   /* variations of data->ratio around 0.75 are safer
    * than around 1.0 for both up/downsampler. */
--- a/audio/resamplers/cc_resampler_neon.S
+++ b/audio/resamplers/cc_resampler_neon.S
@ -0,0 +1,372 @@
+/*  RetroArch - A frontend for libretro.
+ *  Copyright (C) 2010-2014 - Hans-Kristian Arntzen
+ *  Copyright (C) 2014      - Ali Bouhlel ( aliaspider@gmail.com )
+ *
+ *  RetroArch is free software: you can redistribute it and/or modify it under the terms
+ *  of the GNU General Public License as published by the Free Software Found-
+ *  ation, either version 3 of the License, or (at your option) any later version.
+ *
+ *  RetroArch is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
+ *  without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
+ *  PURPOSE.  See the GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License along with RetroArch.
+ *  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#if defined(__ARM_NEON__)
+
+#ifndef CC_RESAMPLER_PRECISION
+#define CC_RESAMPLER_PRECISION 1
+#endif
+
+#ifndef __MACH__
+.arm
+#endif
+.align 4
+.globl resampler_CC_downsample_neon
+.globl _resampler_CC_downsample_neon
+
+# size_t resampler_CC_downsample_neon(float *outp, const float *inp,
+#       rarch_CC_resampler_t* re_, size_t input_frames, float ratio);
+
+# r0: outp initial (and output_frames return value)
+# r1: inp initial/current
+# r2: re_ [r2+0] --> {q14,q15}=buffer , [r2+32] --> s5=distance
+# r3: input_frames/inp_max
+# r4: outp current
+# r5:
+# r6:
+# r7:
+
+# q0:  d0:  s0:  0.0             # q4:  d8:  s16: min(ratio, 1.0)
+#           s1:  1.0             #           s17: min(ratio, 1.0)
+#      d1:  s2:  2.0             #      d9:  s18: min(ratio, 1.0)
+#           s3:  3.0             #           s19: min(ratio, 1.0)
+# q1:  d2:  s4:  ratio           # q5:  d10: s20: 1.0
+#           s5:  distance        #           s21: 1.0
+#      d3:  s6:  (1.0/ratio)     #      d11: s22: 1.0
+#           s7:  (1.0/ratio)+0.5 #           s23: 1.0
+# q2:  d4:  s8:  0.5             # q6:  d12: s24: 3.0
+#           s9:  0.5             #           s25: 3.0
+#      d5:  s10: 0.5             #      d13: s26: 3.0
+#           s11: 0.5             #           s27: 3.0
+# q3:  d6:  s12: -0.5            # q7:  d14: s28: 0.25
+#           s13: -0.5            #           s29: 0.25
+#      d7:  s14: -0.5            #      d15: s30: 0.25
+#           s15: -0.5            #           s31: 0.25
+
+# q8:  d16: (temp)               # q12: d24: (temp)
+#           (temp)               #           (temp)
+#      d17: (temp)               #      d25: (temp)
+#           (temp)               #           (temp)
+# q9:  d18: (temp)               # q13: d26: (temp)
+#           (temp)               #           (temp)
+#      d19: (temp)               #      d27: (temp)
+#           (temp)               #           (temp)
+# q10: d20: (temp)               # q14: d28: buffer[0]
+#           (temp)               #           buffer[1]
+#      d21: (temp)               #      d29: buffer[2]
+#           (temp)               #           buffer[3]
+# q11: d22: (temp)               # q15: d30: buffer[4]
+#           (temp)               #           buffer[5]
+#      d23: (temp)               #      d31: buffer[6]
+#           (temp)               #           buffer[7]
+
+
+# Downsampling kernel. Every 64-bit input frame is scaled by four kernel
+# weights and accumulated into the four frames kept in the state buffer.
+# Whenever distance exceeds (1.0/ratio)+0.5 the oldest buffered frame is
+# written to outp and the buffer is shifted by one slot.
+# Returns in r0 the number of frames written to outp.
+resampler_CC_downsample_neon:
+_resampler_CC_downsample_neon:
+
+# Load the four buffered frames. The :256 qualifier asserts that r2 is
+# 32-byte aligned.
+# NOTE(review): the allocator used for the state must guarantee that
+# alignment - confirm.
+vld1.f32 {q14-q15}, [r2, :256]
+# Fetch ratio from the first stack word, before anything is pushed.
+# NOTE(review): this assumes the float argument is passed on the stack
+# (soft-float calling convention); with a hard-float ABI it would arrive
+# in s0 - confirm for every target that builds this file.
+vldr s4, [sp]
+# Save q4-q7 and r4; they are restored just before returning.
+vpush {q4,q5,q6,q7}
+push {r4}
+
+# Keep r0 as the start of the output; r4 is the moving write pointer.
+mov r4, r0
+
+
+# Build q0 = {0.0, 1.0, 2.0, 3.0}, one offset per buffered frame.
+veor q0, q0, q0
+vmov.f32 s1, #1.0
+vmov.f32 s2, #2.0
+vmov.f32 s3, #3.0
+
+# Clamp limits for the kernel integral.
+vmov.f32 q2, #0.5
+vmov.f32 q3, #-0.5
+
+# Polynomial constants.
+vmov.f32 q5, #1.0
+vmov.f32 q6, #3.0
+vmov.f32 q7, #0.25
+
+
+# Scalars: s5 = distance (state offset 32), s6 = 1.0/ratio,
+# s7 = (1.0/ratio)+0.5, q4 = min(ratio, 1.0) in all four lanes.
+vldr s5, [r2, #32]
+vdiv.f32 s6, s20, s4
+vadd.f32 s7, s6, s8
+vdup.f32 q4, d2[0]
+vmin.f32 q4, q4, q5
+
+
+# Turn the frame count in r3 into an end pointer: inp + input_frames * 8.
+lsl r3, #3
+add r3, r3, r1
+
+
+# Nothing to do when there are no input frames.
+cmp r3, r1
+beq 3f
+1:
+
+# Per-slot position: q8 = distance - {0,1,2,3} * (1.0/ratio).
+vdup.f32 q8, d3[0]
+vmul.f32 q8, q8, q0
+vdup.f32 q9, d2[1]
+vsub.f32 q8, q9, q8
+
+# Kernel integral arguments: (q8 + 0.5) and (q8 - 0.5), scaled by q4.
+vadd.f32 q10, q8, q2
+vsub.f32 q11, q8, q2
+
+vmul.f32 q10, q10, q4
+vmul.f32 q11, q11, q4
+
+#if (CC_RESAMPLER_PRECISION > 0)
+# Apply v * (1 - 0.25 * v^2 * (3 - v^2)) to both arguments.
+vmul.f32 q8, q10, q10
+vmul.f32 q9, q11, q11
+
+vsub.f32 q12, q6, q8
+vsub.f32 q13, q6, q9
+
+vmul.f32 q12, q12, q8
+vmul.f32 q13, q13, q9
+
+vmul.f32 q12, q12, q7
+vmul.f32 q13, q13, q7
+
+vsub.f32 q12, q5, q12
+vsub.f32 q13, q5, q13
+
+vmul.f32 q10, q10, q12
+vmul.f32 q11, q11, q13
+#endif
+
+# Clamp both results to [-0.5, 0.5].
+vmin.f32 q10, q10, q2
+vmin.f32 q11, q11, q2
+
+vmax.f32 q10, q10, q3
+vmax.f32 q11, q11, q3
+
+# Four kernel weights w0..w3, one per buffered frame.
+vsub.f32 q10, q10, q11
+vmov.f32 q11, q10
+
+# Duplicate each weight for the two samples of a frame:
+# q10 = {w0,w0,w1,w1}, q11 = {w2,w2,w3,w3}.
+vzip.f32 q10, q11
+
+# Read one input frame (advancing r1 by 8) and replicate it across q8.
+vld1.f32 d16, [r1, :64]!
+vmov.f32 d17, d16
+
+# Accumulate the weighted frame into all four buffer slots.
+vmul.f32 q10, q10, q8
+vmul.f32 q11, q11, q8
+
+vadd.f32 q14, q14, q10
+vadd.f32 q15, q15, q11
+
+# distance++
+vadd.f32 s5, s5, s20
+
+# Skip the output step unless distance is greater than (1.0/ratio)+0.5.
+vcmpe.f32	s5, s7
+vmrs	APSR_nzcv, fpscr
+ble 2f
+
+# Emit the oldest slot, shift the remaining slots down, clear the last one.
+vst1.f32 d28, [r4, :64]!
+vmov.f32 d28, d29
+vmov.f32 d29, d30
+vmov.f32 d30, d31
+vmov.f32 d31, #0.0
+
+# Step distance back by 1.0/ratio.
+vsub.f32 s5, s5, s6
+
+2:
+# Continue until the input pointer reaches the end pointer.
+cmp r3, r1
+bne 1b
+
+3:
+# Write the buffer and distance back to the state, then compute the
+# return value: bytes written divided by 8 gives frames written.
+vst1.f32 {q14-q15}, [r2, :256]
+vstr s5, [r2, #32]
+sub r0, r4, r0
+lsr r0, r0, #3
+
+pop  {r4}
+vpop {q4,q5,q6,q7}
+
+bx lr
+
+
+.align 4
+.globl resampler_CC_upsample_neon
+.globl _resampler_CC_upsample_neon
+
+# size_t resampler_CC_upsample_neon(float *outp, const float *inp,
+#       rarch_CC_resampler_t* re_, size_t input_frames, float ratio);
+
+# r0: outp initial (and output_frames return value)
+# r1: inp initial/current
+# r2: re_ [r2+0] --> {q14,q15}=buffer , [r2+32] --> s5=distance
+# r3: input_frames/inp_max
+# r4: outp current
+# r5:
+# r6:
+# r7:
+
+# q0:  d0:  s0:  1.0             # q4:  d8:  s16: min(ratio, 1.0)
+#           s1:  0.0             #           s17: min(ratio, 1.0)
+#      d1:  s2: -1.0             #      d9:  s18: min(ratio, 1.0)
+#           s3: -2.0             #           s19: min(ratio, 1.0)
+# q1:  d2:  s4:  ratio           # q5:  d10: s20: 1.0
+#           s5:  distance        #           s21: 1.0
+#      d3:  s6:  (1.0/ratio)     #      d11: s22: 1.0
+#           s7:  (1.0/ratio)+0.5 #           s23: 1.0
+# q2:  d4:  s8:  0.5             # q6:  d12: s24: 3.0
+#           s9:  0.5             #           s25: 3.0
+#      d5:  s10: 0.5             #      d13: s26: 3.0
+#           s11: 0.5             #           s27: 3.0
+# q3:  d6:  s12: -0.5            # q7:  d14: s28: 0.25
+#           s13: -0.5            #           s29: 0.25
+#      d7:  s14: -0.5            #      d15: s30: 0.25
+#           s15: -0.5            #           s31: 0.25
+
+# q8:  d16: (temp)               # q12: d24: (temp)
+#           (temp)               #           (temp)
+#      d17: (temp)               #      d25: (temp)
+#           (temp)               #           (temp)
+# q9:  d18: (temp)               # q13: d26: (temp)
+#           (temp)               #           (temp)
+#      d19: (temp)               #      d27: (temp)
+#           (temp)               #           (temp)
+# q10: d20: (temp)               # q14: d28: buffer[0]
+#           (temp)               #           buffer[1]
+#      d21: (temp)               #      d29: buffer[2]
+#           (temp)               #           buffer[3]
+# q11: d22: (temp)               # q15: d30: buffer[4]
+#           (temp)               #           buffer[5]
+#      d23: (temp)               #      d31: buffer[6]
+#           (temp)               #           buffer[7]
+
+
+# Upsampling kernel. Every 64-bit input frame is shifted into the four-frame
+# state buffer; while distance is below 1.0 an output frame is produced as
+# the kernel-weighted sum of the four buffered frames and distance advances
+# by 1.0/ratio. After that distance is decremented by 1.0.
+# Returns in r0 the number of frames written to outp.
+resampler_CC_upsample_neon:
+_resampler_CC_upsample_neon:
+
+# Load the four buffered frames. The :256 qualifier asserts that r2 is
+# 32-byte aligned.
+# NOTE(review): the allocator used for the state must guarantee that
+# alignment - confirm.
+vld1.f32 {q14-q15}, [r2, :256]
+# Fetch ratio from the first stack word, before anything is pushed.
+# NOTE(review): this assumes the float argument is passed on the stack
+# (soft-float calling convention); with a hard-float ABI it would arrive
+# in s0 - confirm for every target that builds this file.
+vldr s4, [sp]
+# Save q4-q7 and r4; they are restored just before returning.
+vpush {q4,q5,q6,q7}
+push {r4}
+
+# Keep r0 as the start of the output; r4 is the moving write pointer.
+mov r4, r0
+
+
+# Build q0 = {1.0, 0.0, -1.0, -2.0}, one offset per buffered frame.
+veor q0, q0, q0
+vmov.f32 s0, #1.0
+vmov.f32 s2, #-1.0
+vmov.f32 s3, #-2.0
+
+# Clamp limits for the kernel integral.
+vmov.f32 q2, #0.5
+vmov.f32 q3, #-0.5
+
+# Polynomial constants.
+vmov.f32 q5, #1.0
+vmov.f32 q6, #3.0
+vmov.f32 q7, #0.25
+
+
+# Scalars: s5 = distance (state offset 32), s6 = 1.0/ratio,
+# q4 = min(ratio, 1.0) in all four lanes.
+# s7 = (1.0/ratio)+0.5 is computed as well but not read again in this routine.
+vldr s5, [r2, #32]
+vdiv.f32 s6, s20, s4
+vadd.f32 s7, s6, s8
+vdup.f32 q4, d2[0]
+vmin.f32 q4, q4, q5
+
+
+# Turn the frame count in r3 into an end pointer: inp + input_frames * 8.
+lsl r3, #3
+add r3, r3, r1
+
+
+# Nothing to do when there are no input frames.
+cmp r3, r1
+beq 4f
+1:
+
+# Read one input frame (advancing r1 by 8), drop the oldest buffered frame
+# and append the new one as the last slot.
+vld1.f32 d16, [r1, :64]!
+vmov.f32 d28, d29
+vmov.f32 d29, d30
+vmov.f32 d30, d31
+vmov.f32 d31, d16
+
+# No output for this input frame when distance is already 1.0 or more.
+vcmpe.f32	s5, s20
+vmrs	APSR_nzcv, fpscr
+bge 3f
+2:
+# Per-slot position: q8 = distance + {1, 0, -1, -2}.
+vdup.f32 q8, d2[1]
+vadd.f32 q8, q8, q0
+
+# Kernel integral arguments: (q8 + 0.5) and (q8 - 0.5), scaled by q4.
+vadd.f32 q10, q8, q2
+vsub.f32 q11, q8, q2
+
+vmul.f32 q10, q10, q4
+vmul.f32 q11, q11, q4
+
+#if (CC_RESAMPLER_PRECISION > 0)
+# Apply v * (1 - 0.25 * v^2 * (3 - v^2)) to both arguments.
+vmul.f32 q8, q10, q10
+vmul.f32 q9, q11, q11
+
+vsub.f32 q12, q6, q8
+vsub.f32 q13, q6, q9
+
+vmul.f32 q12, q12, q8
+vmul.f32 q13, q13, q9
+
+vmul.f32 q12, q12, q7
+vmul.f32 q13, q13, q7
+
+vsub.f32 q12, q5, q12
+vsub.f32 q13, q5, q13
+
+vmul.f32 q10, q10, q12
+vmul.f32 q11, q11, q13
+#endif
+
+# Clamp both results to [-0.5, 0.5].
+vmin.f32 q10, q10, q2
+vmin.f32 q11, q11, q2
+
+vmax.f32 q10, q10, q3
+vmax.f32 q11, q11, q3
+
+# Four kernel weights w0..w3, one per buffered frame.
+vsub.f32 q10, q10, q11
+vmov.f32 q11, q10
+
+# Duplicate each weight for the two samples of a frame:
+# q10 = {w0,w0,w1,w1}, q11 = {w2,w2,w3,w3}.
+vzip.f32 q10, q11
+
+# Weight the four buffered frames.
+vmul.f32 q10, q10, q14
+vmul.f32 q11, q11, q15
+
+# Sum the four weighted frames into a single frame in d20.
+vadd.f32 q10, q10, q11
+vadd.f32 d20, d20, d21
+
+# Emit the output frame, advancing r4 by 8.
+vst1.f32 d20, [r4, :64]!
+
+# Advance distance by 1.0/ratio.
+vadd.f32 s5, s5, s6
+
+
+# Keep producing output frames while distance is below 1.0.
+vcmpe.f32	s5, s20
+vmrs	APSR_nzcv, fpscr
+blt 2b
+
+3:
+# distance--
+vsub.f32 s5, s5, s20
+
+# Continue until the input pointer reaches the end pointer.
+cmp r3, r1
+bne 1b
+
+
+4:
+# Write the buffer and distance back to the state, then compute the
+# return value: bytes written divided by 8 gives frames written.
+vst1.f32 {q14-q15}, [r2, :256]
+vstr s5, [r2, #32]
+sub r0, r4, r0
+lsr r0, r0, #3
+
+pop  {r4}
+vpop {q4,q5,q6,q7}
+
+bx lr
+
+#endif