add NEON optimizations for the CC resampler.

This commit is contained in:
aliaspider 2014-09-22 08:28:24 +01:00
parent 40402174d9
commit 7899dea89f
4 changed files with 458 additions and 82 deletions

View File

@ -232,6 +232,7 @@ endif
ifeq ($(HAVE_NEON),1)
OBJ += audio/resamplers/sinc_neon.o
# OBJ collects object files, so the NEON CC resampler is listed by its .o
# name (same form as sinc_neon.o) rather than by its .S source file.
OBJ += audio/resamplers/cc_resampler_neon.o
# When compiled without this, tries to attempt to compile sinc lerp,
# which will error out
#

View File

@ -28,6 +28,7 @@ ifeq ($(HAVE_NEON),1)
LOCAL_CFLAGS += -D__ARM_NEON__
LOCAL_SRC_FILES += $(RARCH_DIR)/audio/utils_neon.S.neon
LOCAL_SRC_FILES += $(RARCH_DIR)/audio/resamplers/sinc_neon.S.neon
LOCAL_SRC_FILES += $(RARCH_DIR)/audio/resamplers/cc_resampler_neon.S.neon
endif
LOCAL_CFLAGS += -DSINC_LOWER_QUALITY

View File

@ -1,7 +1,7 @@
/* RetroArch - A frontend for libretro.
* Copyright (C) 2010-2014 - Hans-Kristian Arntzen
* Copyright (C) 2014 - Ali Bouhlel ( aliaspider@gmail.com )
*
*
* RetroArch is free software: you can redistribute it and/or modify it under the terms
* of the GNU General Public License as published by the Free Software Found-
* ation, either version 3 of the License, or (at your option) any later version.
@ -29,6 +29,45 @@
#endif
/* since SSE and NEON don't provide support for trigonometric functions
* we approximate those with polynomials
*
* CC_RESAMPLER_PRECISION defines how accurate the approximation is
* a setting of 5 or more means full precision.
* setting 0 doesn't use a polynomial
* setting 1 uses P(X) = X - (3/4)*X^3 + (1/4)*X^5
*
* only 0 and 1 are implemented for SSE and NEON currently
*
* the MIPS_ARCH_ALLEGREX target doesn't require this setting since it has
* native support for the required functions so it will always use full precision.
*/
#ifndef CC_RESAMPLER_PRECISION
#define CC_RESAMPLER_PRECISION 1
#endif
/* Fallback two-argument minimum, defined only when no `min` macro exists yet.
 * Either argument may be evaluated twice, so avoid operands with side effects. */
#ifndef min
#define min(a, b) ((a) < (b) ? (a) : (b))
#endif
/* ALIGN_MEMORY(X): declaration prefix requesting X-byte alignment.
 * Expands to __declspec(align(X)) when _MSC_VER is defined and to
 * __attribute__((aligned(X))) otherwise. Skipped if already defined. */
#ifndef ALIGN_MEMORY
#ifdef _MSC_VER
# define ALIGN_MEMORY(X) __declspec(align(X))
#else
# define ALIGN_MEMORY(X) __attribute__((aligned(X)))
#endif
#endif
/* Resampler state shared by the resampler implementations in this file.
 *
 * Layout matters: the NEON assembly (cc_resampler_neon.S) loads 32 bytes from
 * the start of this struct using a 256-bit alignment hint and accesses
 * `distance` at byte offset 32, so field order and alignment must stay in
 * sync with it. */
typedef struct rarch_CC_resampler
{
/* Four frames of filter state, declared with 32-byte alignment. */
ALIGN_MEMORY(32) audio_frame_float_t buffer[4];
/* Position tracker; the processing routines read it on entry and write the
 * updated value back before returning. */
float distance;
/* Routine used to resample a block of audio
 * (resampler_CC_downsample or resampler_CC_upsample). */
void (*process)(void *re, struct resampler_data *data);
} rarch_CC_resampler_t;
#ifdef _MIPS_ARCH_ALLEGREX
static void resampler_CC_process(void *re_, struct resampler_data *data)
{
@ -146,29 +185,14 @@ static void *resampler_CC_init(double bandwidth_mod)
RARCH_LOG("\nConvoluted Cosine resampler (VFPU): \n");
return (void*)-1;
}
#elif defined(__SSE__)
#else
/* uses a fast polynomial approximation
* since SSE lacks native support for trigonometric functions
* cc_int is approximated with P(X) = X - (3/4)*X^3 + (1/4)*X^5
*/
#if defined(__SSE__)
#include <xmmintrin.h>
#ifndef CC_RESAMPLER_PRECISION
#define CC_RESAMPLER_PRECISION 1
#endif
/* SSE flavour of the resampler state. */
typedef struct rarch_CC_resampler
{
/* Two 4-float vectors of filter state; the SSE routines load both with
 * unaligned loads on entry and store them back before returning. */
__m128 previous;
__m128 current;
/* Position tracker (initialised to 0.0 for downsampling, 2.0 for upsampling). */
float distance;
/* Selected routine: resampler_CC_downsample or resampler_CC_upsample. */
void (*process)(void *re, struct resampler_data *data);
} rarch_CC_resampler_t;
#define CC_RESAMPLER_IDENT "SSE"
static void resampler_CC_downsample(void *re_, struct resampler_data *data)
{
@ -182,8 +206,8 @@ static void resampler_CC_downsample(void *re_, struct resampler_data *data)
ratio = 1.0 / data->ratio;
b = data->ratio; /* cutoff frequency. */
__m128 vec_previous = _mm_loadu_ps((float*)&re->previous);
__m128 vec_current = _mm_loadu_ps((float*)&re->current);
__m128 vec_previous = _mm_loadu_ps((float*)&re->buffer[0]);
__m128 vec_current = _mm_loadu_ps((float*)&re->buffer[2]);
while (inp != inp_max)
{
@ -249,16 +273,12 @@ static void resampler_CC_downsample(void *re_, struct resampler_data *data)
}
}
_mm_storeu_ps((float*)&re->previous, vec_previous);
_mm_storeu_ps((float*)&re->current, vec_current);
_mm_storeu_ps((float*)&re->buffer[0], vec_previous);
_mm_storeu_ps((float*)&re->buffer[2], vec_current);
data->output_frames = outp - (audio_frame_float_t*)data->data_out;
}
#ifndef min
#define min(a, b) ((a) < (b) ? (a) : (b))
#endif
static void resampler_CC_upsample(void *re_, struct resampler_data *data)
{
float b, ratio;
@ -271,8 +291,8 @@ static void resampler_CC_upsample(void *re_, struct resampler_data *data)
b = min(data->ratio, 1.00); /* cutoff frequency. */
ratio = 1.0 / data->ratio;
__m128 vec_previous = _mm_loadu_ps((float*)&re->previous);
__m128 vec_current = _mm_loadu_ps((float*)&re->current);
__m128 vec_previous = _mm_loadu_ps((float*)&re->buffer[0]);
__m128 vec_current = _mm_loadu_ps((float*)&re->buffer[2]);
@ -338,69 +358,39 @@ static void resampler_CC_upsample(void *re_, struct resampler_data *data)
inp++;
}
_mm_storeu_ps((float*)&re->previous, vec_previous);
_mm_storeu_ps((float*)&re->current, vec_current);
_mm_storeu_ps((float*)&re->buffer[0], vec_previous);
_mm_storeu_ps((float*)&re->buffer[2], vec_current);
data->output_frames = outp - (audio_frame_float_t*)data->data_out;
}
static void resampler_CC_process(void *re_, struct resampler_data *data)
#elif defined (__ARM_NEON__)
#define CC_RESAMPLER_IDENT "NEON"
/* NEON assembly implementations (see cc_resampler_neon.S).
 *
 * outp          output sample buffer
 * inp           input sample buffer
 * re_           resampler state (buffer + distance are read and written back)
 * input_frames  number of input frames to consume
 * ratio         resampling ratio
 *
 * Each returns the number of frames written to outp; the C wrappers store
 * that value in data->output_frames. */
size_t resampler_CC_downsample_neon(float *outp, const float *inp,
rarch_CC_resampler_t* re_, size_t input_frames, float ratio);
size_t resampler_CC_upsample_neon (float *outp, const float *inp,
rarch_CC_resampler_t* re_, size_t input_frames, float ratio);
/* Downsampling entry point for the NEON build.
 *
 * re_   resampler state, forwarded untouched to the assembly routine
 * data  in/out block descriptor: data_in, input_frames and ratio are read,
 *       data_out is filled and output_frames receives the frame count
 *
 * This is the function installed as re->process, so it must not dispatch
 * through re->process itself (that would re-enter this function). */
static void resampler_CC_downsample(void *re_, struct resampler_data *data)
{
   data->output_frames = resampler_CC_downsample_neon(data->data_out, data->data_in, re_, data->input_frames, data->ratio);
}
static void resampler_CC_free(void *re_)
/* Upsampling entry point for the NEON build.
 *
 * re_   resampler state, forwarded untouched to the assembly routine
 * data  in/out block descriptor: data_in, input_frames and ratio are read,
 *       data_out is filled and output_frames receives the frame count
 *
 * The state stays owned by the caller; it must remain valid here because the
 * assembly routine both reads it and writes the updated state back. */
static void resampler_CC_upsample(void *re_, struct resampler_data *data)
{
   data->output_frames = resampler_CC_upsample_neon(data->data_out, data->data_in, re_, data->input_frames, data->ratio);
}
/* Allocates and initialises the SSE resampler state.
 *
 * bandwidth_mod  selects the routine: values below 0.75 pick the downsampler
 *                (distance starts at 0.0), anything else picks the upsampler
 *                (distance starts at 2.0).
 *
 * Returns the heap-allocated state, or NULL if the allocation fails. */
static void *resampler_CC_init(double bandwidth_mod)
{
   rarch_CC_resampler_t *re = (rarch_CC_resampler_t*)
      calloc(1, sizeof(rarch_CC_resampler_t));

   if (!re)
      return NULL;

   /* Both state vectors start out silent; one store each is sufficient. */
   re->previous = _mm_setzero_ps();
   re->current  = _mm_setzero_ps();

   RARCH_LOG("Convoluted Cosine resampler (SSE) : ");

   /* variations of data->ratio around 0.75 are safer
    * than around 1.0 for both up/downsampler. */
   if (bandwidth_mod < 0.75)
   {
      RARCH_LOG("CC_downsample @%f \n", bandwidth_mod);
      re->process  = resampler_CC_downsample;
      re->distance = 0.0;
   }
   else
   {
      RARCH_LOG("CC_upsample @%f \n", bandwidth_mod);
      re->process  = resampler_CC_upsample;
      re->distance = 2.0;
   }

   return re;
}
#else
/* C reference version. Not optimized. */
/* Resampler state for the plain C implementation. */
typedef struct rarch_CC_resampler
{
/* Four frames of filter state. */
audio_frame_float_t buffer[4];
/* Position tracker kept alongside the buffer. */
float distance;
/* Routine used to resample a block of audio. */
void (*process)(void *re, struct resampler_data *data);
} rarch_CC_resampler_t;
#define CC_RESAMPLER_IDENT "C"
#if (CC_RESAMPLER_PRECISION > 4)
static inline float cc_int(float x, float b)
{
float val = x * b * M_PI + sinf(x * b * M_PI);
@ -411,6 +401,21 @@ static inline float cc_kernel(float x, float b)
{
return (cc_int(x + 0.5, b) - cc_int(x - 0.5, b)) / (2.0 * M_PI);
}
#else
/* Reduced-precision stand-in for the kernel integral.
 *
 * x  position
 * b  cutoff factor
 *
 * Scales x by b, optionally shapes the product with
 * P(t) = t - (3/4)*t^3 + (1/4)*t^5 (when CC_RESAMPLER_PRECISION > 0),
 * and limits the result to the range [-0.5, 0.5]. */
static inline float cc_int(float x, float b)
{
   float t = x * b;

#if (CC_RESAMPLER_PRECISION > 0)
   t = t*(1 - 0.25 * t * t * (3.0 - t * t));
#endif

   if (t > 0.5)
      return 0.5;
   if (t < -0.5)
      return -0.5;
   return t;
}
/* Kernel weight at position x for cutoff b: the difference of cc_int
 * evaluated half a unit above and half a unit below x. */
static inline float cc_kernel(float x, float b)
{
   const float upper = cc_int(x + 0.5, b);
   const float lower = cc_int(x - 0.5, b);

   return upper - lower;
}
#endif
static inline void add_to(const audio_frame_float_t *source,
audio_frame_float_t *target, float ratio)
@ -458,10 +463,6 @@ static void resampler_CC_downsample(void *re_, struct resampler_data *data)
data->output_frames = outp - (audio_frame_float_t*)data->data_out;
}
#ifndef min
#define min(a, b) ((a) < (b) ? (a) : (b))
#endif
static void resampler_CC_upsample(void *re_, struct resampler_data *data)
{
float b, ratio;
@ -505,6 +506,7 @@ static void resampler_CC_upsample(void *re_, struct resampler_data *data)
data->output_frames = outp - (audio_frame_float_t*)data->data_out;
}
#endif
static void resampler_CC_process(void *re_, struct resampler_data *data)
{
@ -533,7 +535,7 @@ static void *resampler_CC_init(double bandwidth_mod)
re->buffer[i].r = 0.0;
}
RARCH_LOG("Convoluted Cosine resampler (C) : ");
RARCH_LOG("Convoluted Cosine resampler (" CC_RESAMPLER_IDENT ") - precision = %i : ", CC_RESAMPLER_PRECISION);
/* variations of data->ratio around 0.75 are safer
* than around 1.0 for both up/downsampler. */

View File

@ -0,0 +1,372 @@
/* RetroArch - A frontend for libretro.
* Copyright (C) 2010-2014 - Hans-Kristian Arntzen
* Copyright (C) 2014 - Ali Bouhlel ( aliaspider@gmail.com )
*
* RetroArch is free software: you can redistribute it and/or modify it under the terms
* of the GNU General Public License as published by the Free Software Found-
* ation, either version 3 of the License, or (at your option) any later version.
*
* RetroArch is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
* without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
* PURPOSE. See the GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License along with RetroArch.
* If not, see <http://www.gnu.org/licenses/>.
*/
#if defined(__ARM_NEON__)
#ifndef CC_RESAMPLER_PRECISION
#define CC_RESAMPLER_PRECISION 1
#endif
#ifndef __MACH__
.arm
#endif
.align 4
.globl resampler_CC_downsample_neon
.globl _resampler_CC_downsample_neon
# size_t resampler_CC_downsample_neon(float *outp, const float *inp,
# rarch_CC_resampler_t* re_, size_t input_frames, float ratio);
# r0: outp initial (and output_frames return value)
# r1: inp initial/current
# r2: re_ [r2+0] --> {q14,q15}=buffer , [r2+32] --> s5=distance
# r3: input_frames/inp_max
# r4: outp current
# r5:
# r6:
# r7:
# q0: d0: s0: 0.0 # q4: d8: s16: min(ratio, 1.0)
# s1: 1.0 # s17: min(ratio, 1.0)
# d1: s2: 2.0 # d9: s18: min(ratio, 1.0)
# s3: 3.0 # s19: min(ratio, 1.0)
# q1: d2: s4: ratio # q5: d10: s20: 1.0
# s5: distance # s21: 1.0
# d3: s6: (1.0/ratio) # d11: s22: 1.0
# s7: (1.0/ratio)+0.5 # s23: 1.0
# q2: d4: s8: 0.5 # q6: d12: s24: 3.0
# s9: 0.5 # s25: 3.0
# d5: s10: 0.5 # d13: s26: 3.0
# s11: 0.5 # s27: 3.0
# q3: d6: s12: -0.5 # q7: d14: s28: 0.25
# s13: -0.5 # s29: 0.25
# d7: s14: -0.5 # d15: s30: 0.25
# s15: -0.5 # s31: 0.25
# q8: d16: (temp) # q12: d24: (temp)
# (temp) # (temp)
# d17: (temp) # d25: (temp)
# (temp) # (temp)
# q9: d18: (temp) # q13: d26: (temp)
# (temp) # (temp)
# d19: (temp) # d27: (temp)
# (temp) # (temp)
# q10: d20: (temp) # q14: d28: buffer[0]
# (temp) # buffer[1]
# d21: (temp) # d29: buffer[2]
# (temp) # buffer[3]
# q11: d22: (temp) # q15: d30: buffer[4]
# (temp) # buffer[5]
# d23: (temp) # d31: buffer[6]
# (temp) # buffer[7]
/* NEON downsampler.
 * Each loop iteration consumes one input frame (8 bytes), spreads it over the
 * four-frame accumulation buffer using four kernel weights, and emits the
 * oldest buffered frame whenever distance exceeds (1.0/ratio)+0.5.
 * Returns in r0 the number of frames written to outp. */
resampler_CC_downsample_neon:
_resampler_CC_downsample_neon:
/* Load the 32-byte accumulation buffer from the start of re_ into q14-q15. */
vld1.f32 {q14-q15}, [r2, :256]
/* ratio is read from the top of the stack as it stands on entry, i.e. before
 * the pushes below move sp. NOTE(review): this presumes the float argument is
 * passed on the stack rather than in a VFP register - confirm against the
 * calling convention the build targets. */
vldr s4, [sp]
/* q4-q7 and r4 are overwritten below; they are restored before returning. */
vpush {q4,q5,q6,q7}
push {r4}
/* r4 = running output pointer; r0 keeps the start for the final frame count. */
mov r4, r0
/* Constants: q0 = {0,1,2,3}, q2 = 0.5, q3 = -0.5, q5 = 1.0, q6 = 3.0, q7 = 0.25. */
veor q0, q0, q0
vmov.f32 s1, #1.0
vmov.f32 s2, #2.0
vmov.f32 s3, #3.0
vmov.f32 q2, #0.5
vmov.f32 q3, #-0.5
vmov.f32 q5, #1.0
vmov.f32 q6, #3.0
vmov.f32 q7, #0.25
/* s5 = re_->distance (byte offset 32). */
vldr s5, [r2, #32]
/* s6 = 1.0/ratio, s7 = 1.0/ratio + 0.5 (output threshold). */
vdiv.f32 s6, s20, s4
vadd.f32 s7, s6, s8
/* q4 = min(ratio, 1.0) in every lane (cutoff factor b). */
vdup.f32 q4, d2[0]
vmin.f32 q4, q4, q5
/* r3 = inp + input_frames*8, the end-of-input pointer; nothing to do if empty. */
lsl r3, #3
add r3, r3, r1
cmp r3, r1
beq 3f
1:
/* q8 = distance - {0,1,2,3}/ratio : kernel position for each buffer slot. */
vdup.f32 q8, d3[0]
vmul.f32 q8, q8, q0
vdup.f32 q9, d2[1]
vsub.f32 q8, q9, q8
/* q10 = (pos + 0.5) * b, q11 = (pos - 0.5) * b. */
vadd.f32 q10, q8, q2
vsub.f32 q11, q8, q2
vmul.f32 q10, q10, q4
vmul.f32 q11, q11, q4
#if (CC_RESAMPLER_PRECISION > 0)
/* Polynomial shaping of both operands: t = t * (1 - 0.25 * t^2 * (3 - t^2)). */
vmul.f32 q8, q10, q10
vmul.f32 q9, q11, q11
vsub.f32 q12, q6, q8
vsub.f32 q13, q6, q9
vmul.f32 q12, q12, q8
vmul.f32 q13, q13, q9
vmul.f32 q12, q12, q7
vmul.f32 q13, q13, q7
vsub.f32 q12, q5, q12
vsub.f32 q13, q5, q13
vmul.f32 q10, q10, q12
vmul.f32 q11, q11, q13
#endif
/* Clamp both operands to [-0.5, 0.5]; their difference gives the four weights. */
vmin.f32 q10, q10, q2
vmin.f32 q11, q11, q2
vmax.f32 q10, q10, q3
vmax.f32 q11, q11, q3
vsub.f32 q10, q10, q11
/* Duplicate each weight into two adjacent lanes:
 * q10 = {w0,w0,w1,w1}, q11 = {w2,w2,w3,w3}. */
vmov.f32 q11, q10
vzip.f32 q10, q11
/* Load one input frame (two floats), post-increment inp, and copy it into
 * both halves of q8. */
vld1.f32 d16, [r1, :64]!
vmov.f32 d17, d16
/* Accumulate weight * frame into all four buffer slots. */
vmul.f32 q10, q10, q8
vmul.f32 q11, q11, q8
vadd.f32 q14, q14, q10
vadd.f32 q15, q15, q11
# distance++
vadd.f32 s5, s5, s20
/* Skip the output step while distance <= (1.0/ratio)+0.5. */
vcmpe.f32 s5, s7
vmrs APSR_nzcv, fpscr
ble 2f
/* Emit the oldest slot, shift the remaining slots down by one frame, clear
 * the newest slot, then distance -= 1.0/ratio. */
vst1.f32 d28, [r4, :64]!
vmov.f32 d28, d29
vmov.f32 d29, d30
vmov.f32 d30, d31
vmov.f32 d31, #0.0
vsub.f32 s5, s5, s6
2:
/* Loop until inp reaches the end-of-input pointer. */
cmp r3, r1
bne 1b
3:
/* Write buffer and distance back into re_. */
vst1.f32 {q14-q15}, [r2, :256]
vstr s5, [r2, #32]
/* r0 = bytes written / 8 = frames written. */
sub r0, r4, r0
lsr r0, r0, #3
pop {r4}
vpop {q4,q5,q6,q7}
bx lr
.align 4
.globl resampler_CC_upsample_neon
.globl _resampler_CC_upsample_neon
# size_t resampler_CC_upsample_neon(float *outp, const float *inp,
# rarch_CC_resampler_t* re_, size_t input_frames, float ratio);
# r0: outp initial (and output_frames return value)
# r1: inp initial/current
# r2: re_ [r2+0] --> {q14,q15}=buffer , [r2+32] --> s5=distance
# r3: input_frames/inp_max
# r4: outp current
# r5:
# r6:
# r7:
# q0: d0: s0: 1.0 # q4: d8: s16: min(ratio, 1.0)
# s1: 0.0 # s17: min(ratio, 1.0)
# d1: s2: -1.0 # d9: s18: min(ratio, 1.0)
# s3: -2.0 # s19: min(ratio, 1.0)
# q1: d2: s4: ratio # q5: d10: s20: 1.0
# s5: distance # s21: 1.0
# d3: s6: (1.0/ratio) # d11: s22: 1.0
# s7: (1.0/ratio)+0.5 # s23: 1.0
# q2: d4: s8: 0.5 # q6: d12: s24: 3.0
# s9: 0.5 # s25: 3.0
# d5: s10: 0.5 # d13: s26: 3.0
# s11: 0.5 # s27: 3.0
# q3: d6: s12: -0.5 # q7: d14: s28: 0.25
# s13: -0.5 # s29: 0.25
# d7: s14: -0.5 # d15: s30: 0.25
# s15: -0.5 # s31: 0.25
# q8: d16: (temp) # q12: d24: (temp)
# (temp) # (temp)
# d17: (temp) # d25: (temp)
# (temp) # (temp)
# q9: d18: (temp) # q13: d26: (temp)
# (temp) # (temp)
# d19: (temp) # d27: (temp)
# (temp) # (temp)
# q10: d20: (temp) # q14: d28: buffer[0]
# (temp) # buffer[1]
# d21: (temp) # d29: buffer[2]
# (temp) # buffer[3]
# q11: d22: (temp) # q15: d30: buffer[4]
# (temp) # buffer[5]
# d23: (temp) # d31: buffer[6]
# (temp) # buffer[7]
/* NEON upsampler.
 * Each outer iteration shifts one new input frame (8 bytes) into the
 * four-frame history buffer; the inner loop then emits output frames, each a
 * weighted sum of the four buffered frames, while distance < 1.0.
 * Returns in r0 the number of frames written to outp. */
resampler_CC_upsample_neon:
_resampler_CC_upsample_neon:
/* Load the 32-byte history buffer from the start of re_ into q14-q15. */
vld1.f32 {q14-q15}, [r2, :256]
/* ratio is read from the top of the stack as it stands on entry, i.e. before
 * the pushes below move sp. NOTE(review): this presumes the float argument is
 * passed on the stack rather than in a VFP register - confirm against the
 * calling convention the build targets. */
vldr s4, [sp]
/* q4-q7 and r4 are overwritten below; they are restored before returning. */
vpush {q4,q5,q6,q7}
push {r4}
/* r4 = running output pointer; r0 keeps the start for the final frame count. */
mov r4, r0
/* Constants: q0 = {1,0,-1,-2}, q2 = 0.5, q3 = -0.5, q5 = 1.0, q6 = 3.0, q7 = 0.25. */
veor q0, q0, q0
vmov.f32 s0, #1.0
vmov.f32 s2, #-1.0
vmov.f32 s3, #-2.0
vmov.f32 q2, #0.5
vmov.f32 q3, #-0.5
vmov.f32 q5, #1.0
vmov.f32 q6, #3.0
vmov.f32 q7, #0.25
/* s5 = re_->distance (byte offset 32). */
vldr s5, [r2, #32]
/* s6 = 1.0/ratio (distance step per output frame).
 * s7 = 1.0/ratio + 0.5 is computed here but not referenced again in this routine. */
vdiv.f32 s6, s20, s4
vadd.f32 s7, s6, s8
/* q4 = min(ratio, 1.0) in every lane (cutoff factor b). */
vdup.f32 q4, d2[0]
vmin.f32 q4, q4, q5
/* r3 = inp + input_frames*8, the end-of-input pointer; nothing to do if empty. */
lsl r3, #3
add r3, r3, r1
cmp r3, r1
beq 4f
1:
/* Load one input frame (post-incrementing inp), shift the history down by
 * one frame and place the new frame in the newest slot. */
vld1.f32 d16, [r1, :64]!
vmov.f32 d28, d29
vmov.f32 d29, d30
vmov.f32 d30, d31
vmov.f32 d31, d16
/* No output for this input frame when distance >= 1.0. */
vcmpe.f32 s5, s20
vmrs APSR_nzcv, fpscr
bge 3f
2:
/* q8 = distance + {1,0,-1,-2} : kernel position for each history slot. */
vdup.f32 q8, d2[1]
vadd.f32 q8, q8, q0
/* q10 = (pos + 0.5) * b, q11 = (pos - 0.5) * b. */
vadd.f32 q10, q8, q2
vsub.f32 q11, q8, q2
vmul.f32 q10, q10, q4
vmul.f32 q11, q11, q4
#if (CC_RESAMPLER_PRECISION > 0)
/* Polynomial shaping of both operands: t = t * (1 - 0.25 * t^2 * (3 - t^2)). */
vmul.f32 q8, q10, q10
vmul.f32 q9, q11, q11
vsub.f32 q12, q6, q8
vsub.f32 q13, q6, q9
vmul.f32 q12, q12, q8
vmul.f32 q13, q13, q9
vmul.f32 q12, q12, q7
vmul.f32 q13, q13, q7
vsub.f32 q12, q5, q12
vsub.f32 q13, q5, q13
vmul.f32 q10, q10, q12
vmul.f32 q11, q11, q13
#endif
/* Clamp both operands to [-0.5, 0.5]; their difference gives the four weights. */
vmin.f32 q10, q10, q2
vmin.f32 q11, q11, q2
vmax.f32 q10, q10, q3
vmax.f32 q11, q11, q3
vsub.f32 q10, q10, q11
/* Duplicate each weight into two adjacent lanes:
 * q10 = {w0,w0,w1,w1}, q11 = {w2,w2,w3,w3}. */
vmov.f32 q11, q10
vzip.f32 q10, q11
/* Weight the four buffered frames and sum them into a single frame in d20. */
vmul.f32 q10, q10, q14
vmul.f32 q11, q11, q15
vadd.f32 q10, q10, q11
vadd.f32 d20, d20, d21
/* Emit the output frame and advance distance by 1.0/ratio; keep emitting
 * while distance < 1.0. */
vst1.f32 d20, [r4, :64]!
vadd.f32 s5, s5, s6
vcmpe.f32 s5, s20
vmrs APSR_nzcv, fpscr
blt 2b
3:
# distance--
vsub.f32 s5, s5, s20
/* Loop until inp reaches the end-of-input pointer. */
cmp r3, r1
bne 1b
4:
/* Write buffer and distance back into re_. */
vst1.f32 {q14-q15}, [r2, :256]
vstr s5, [r2, #32]
/* r0 = bytes written / 8 = frames written. */
sub r0, r4, r0
lsr r0, r0, #3
pop {r4}
vpop {q4,q5,q6,q7}
bx lr
#endif