mirror of
https://github.com/libretro/RetroArch.git
synced 2024-11-26 09:41:15 +00:00
add NEON optimizations for the CC resampler.
This commit is contained in:
parent
40402174d9
commit
7899dea89f
@ -232,6 +232,7 @@ endif
|
||||
|
||||
ifeq ($(HAVE_NEON),1)
|
||||
OBJ += audio/resamplers/sinc_neon.o
|
||||
# NEON build of the CC resampler; OBJ lists object files, so name the .o built from cc_resampler_neon.S
OBJ += audio/resamplers/cc_resampler_neon.o
|
||||
# When compiled without this, tries to attempt to compile sinc lerp,
|
||||
# which will error out
|
||||
#
|
||||
|
@ -28,6 +28,7 @@ ifeq ($(HAVE_NEON),1)
|
||||
LOCAL_CFLAGS += -D__ARM_NEON__
|
||||
LOCAL_SRC_FILES += $(RARCH_DIR)/audio/utils_neon.S.neon
|
||||
LOCAL_SRC_FILES += $(RARCH_DIR)/audio/resamplers/sinc_neon.S.neon
|
||||
LOCAL_SRC_FILES += $(RARCH_DIR)/audio/resamplers/cc_resampler_neon.S.neon
|
||||
endif
|
||||
LOCAL_CFLAGS += -DSINC_LOWER_QUALITY
|
||||
|
||||
|
@ -29,6 +29,45 @@
|
||||
#endif
|
||||
|
||||
|
||||
/* since SSE and NEON don't provide support for trigonometric functions
|
||||
* we approximate those with polynomials
|
||||
*
|
||||
* CC_RESAMPLER_PRECISION defines how accurate the approximation is
|
||||
* a setting of 5 or more means full precision.
|
||||
* setting 0 doesn't use a polynomial
|
||||
* setting 1 uses P(X) = X - (3/4)*X^3 + (1/4)*X^5
|
||||
*
|
||||
* only 0 and 1 are implemented for SSE and NEON currently
|
||||
*
|
||||
* the MIPS_ARCH_ALLEGREX target doesn't require this setting since it has
|
||||
* native support for the required functions so it will always use full precision.
|
||||
*/
|
||||
|
||||
/* Default precision level, used only when the build has not already defined
 * CC_RESAMPLER_PRECISION. The value is tested with preprocessor comparisons
 * (> 0 and > 4) further down to select the approximation that gets compiled. */
#ifndef CC_RESAMPLER_PRECISION
|
||||
#define CC_RESAMPLER_PRECISION 1
|
||||
#endif
|
||||
|
||||
/* Smaller of two values; defined only if no min macro exists yet.
 * The selected argument is expanded twice, so do not pass expressions
 * with side effects. */
#ifndef min
|
||||
#define min(a, b) ((a) < (b) ? (a) : (b))
|
||||
#endif
|
||||
|
||||
/* ALIGN_MEMORY(X): declaration prefix requesting X-byte alignment.
 * Expands to __declspec(align(X)) when _MSC_VER is defined and to
 * __attribute__((aligned(X))) otherwise. Skipped if already defined. */
#ifndef ALIGN_MEMORY
|
||||
#ifdef _MSC_VER
|
||||
# define ALIGN_MEMORY(X) __declspec(align(X))
|
||||
#else
|
||||
# define ALIGN_MEMORY(X) __attribute__((aligned(X)))
|
||||
#endif
|
||||
#endif
|
||||
|
||||
/* Resampler state shared by the C, SSE and NEON code paths.
 *
 * The NEON routines in cc_resampler_neon.S address this struct by fixed
 * offsets (buffer at +0 as eight floats, distance at +32), so the order and
 * size of the first two members must not change without updating them. */
typedef struct rarch_CC_resampler
|
||||
{
|
||||
/* Four-frame working buffer, declared with 32-byte alignment.
 * NOTE(review): the NEON code accesses it with a 256-bit alignment
 * qualifier; confirm that whatever allocates this struct honours the
 * alignment, since the attribute alone does not constrain heap memory. */
ALIGN_MEMORY(32) audio_frame_float_t buffer[4];
|
||||
|
||||
/* Running position counter carried across calls; stepped by 1.0 and by
 * 1.0/ratio in the NEON loops. */
float distance;
|
||||
/* Routine that performs the actual resampling for this instance. */
void (*process)(void *re, struct resampler_data *data);
|
||||
} rarch_CC_resampler_t;
|
||||
|
||||
|
||||
#ifdef _MIPS_ARCH_ALLEGREX
|
||||
static void resampler_CC_process(void *re_, struct resampler_data *data)
|
||||
{
|
||||
@ -146,29 +185,14 @@ static void *resampler_CC_init(double bandwidth_mod)
|
||||
RARCH_LOG("\nConvoluted Cosine resampler (VFPU): \n");
|
||||
return (void*)-1;
|
||||
}
|
||||
#elif defined(__SSE__)
|
||||
#else
|
||||
|
||||
/* uses a fast polynomial approximation
|
||||
* since SSE lacks native support for trigonometric functions
|
||||
* cc_int is approximated with P(X) = X - (3/4)*X^3 + (1/4)*X^5
|
||||
*/
|
||||
|
||||
#if defined(__SSE__)
|
||||
|
||||
#include <xmmintrin.h>
|
||||
|
||||
#ifndef CC_RESAMPLER_PRECISION
|
||||
#define CC_RESAMPLER_PRECISION 1
|
||||
#endif
|
||||
|
||||
typedef struct rarch_CC_resampler
|
||||
{
|
||||
__m128 previous;
|
||||
__m128 current;
|
||||
|
||||
float distance;
|
||||
void (*process)(void *re, struct resampler_data *data);
|
||||
} rarch_CC_resampler_t;
|
||||
|
||||
#define CC_RESAMPLER_IDENT "SSE"
|
||||
|
||||
static void resampler_CC_downsample(void *re_, struct resampler_data *data)
|
||||
{
|
||||
@ -182,8 +206,8 @@ static void resampler_CC_downsample(void *re_, struct resampler_data *data)
|
||||
ratio = 1.0 / data->ratio;
|
||||
b = data->ratio; /* cutoff frequency. */
|
||||
|
||||
__m128 vec_previous = _mm_loadu_ps((float*)&re->previous);
|
||||
__m128 vec_current = _mm_loadu_ps((float*)&re->current);
|
||||
__m128 vec_previous = _mm_loadu_ps((float*)&re->buffer[0]);
|
||||
__m128 vec_current = _mm_loadu_ps((float*)&re->buffer[2]);
|
||||
|
||||
while (inp != inp_max)
|
||||
{
|
||||
@ -249,16 +273,12 @@ static void resampler_CC_downsample(void *re_, struct resampler_data *data)
|
||||
}
|
||||
}
|
||||
|
||||
_mm_storeu_ps((float*)&re->previous, vec_previous);
|
||||
_mm_storeu_ps((float*)&re->current, vec_current);
|
||||
_mm_storeu_ps((float*)&re->buffer[0], vec_previous);
|
||||
_mm_storeu_ps((float*)&re->buffer[2], vec_current);
|
||||
|
||||
data->output_frames = outp - (audio_frame_float_t*)data->data_out;
|
||||
}
|
||||
|
||||
#ifndef min
|
||||
#define min(a, b) ((a) < (b) ? (a) : (b))
|
||||
#endif
|
||||
|
||||
static void resampler_CC_upsample(void *re_, struct resampler_data *data)
|
||||
{
|
||||
float b, ratio;
|
||||
@ -271,8 +291,8 @@ static void resampler_CC_upsample(void *re_, struct resampler_data *data)
|
||||
b = min(data->ratio, 1.00); /* cutoff frequency. */
|
||||
ratio = 1.0 / data->ratio;
|
||||
|
||||
__m128 vec_previous = _mm_loadu_ps((float*)&re->previous);
|
||||
__m128 vec_current = _mm_loadu_ps((float*)&re->current);
|
||||
__m128 vec_previous = _mm_loadu_ps((float*)&re->buffer[0]);
|
||||
__m128 vec_current = _mm_loadu_ps((float*)&re->buffer[2]);
|
||||
|
||||
|
||||
|
||||
@ -338,69 +358,39 @@ static void resampler_CC_upsample(void *re_, struct resampler_data *data)
|
||||
inp++;
|
||||
}
|
||||
|
||||
_mm_storeu_ps((float*)&re->previous, vec_previous);
|
||||
_mm_storeu_ps((float*)&re->current, vec_current);
|
||||
_mm_storeu_ps((float*)&re->buffer[0], vec_previous);
|
||||
_mm_storeu_ps((float*)&re->buffer[2], vec_current);
|
||||
|
||||
data->output_frames = outp - (audio_frame_float_t*)data->data_out;
|
||||
}
|
||||
|
||||
|
||||
static void resampler_CC_process(void *re_, struct resampler_data *data)
|
||||
#elif defined (__ARM_NEON__)
|
||||
|
||||
#define CC_RESAMPLER_IDENT "NEON"
|
||||
|
||||
/* Hand-written NEON routines from audio/resamplers/cc_resampler_neon.S.
 *
 * outp         - destination for interleaved output frames.
 * inp          - source of interleaved input frames.
 * re_          - resampler state; buffer and distance are read on entry and
 *                written back on exit.
 * input_frames - number of frames available at inp.
 * ratio        - resampling ratio.
 *
 * Both return the number of frames written to outp.
 * NOTE(review): the assembly fetches ratio from the stack ([sp]); confirm
 * this matches the calling convention of every target that builds it. */
size_t resampler_CC_downsample_neon(float *outp, const float *inp,
|
||||
rarch_CC_resampler_t* re_, size_t input_frames, float ratio);
|
||||
size_t resampler_CC_upsample_neon (float *outp, const float *inp,
|
||||
rarch_CC_resampler_t* re_, size_t input_frames, float ratio);
|
||||
|
||||
static void resampler_CC_downsample(void *re_, struct resampler_data *data)
|
||||
{
|
||||
rarch_CC_resampler_t *re = (rarch_CC_resampler_t*)re_;
|
||||
re->process(re_, data);
|
||||
data->output_frames = resampler_CC_downsample_neon(data->data_out, data->data_in, re_, data->input_frames, data->ratio);
|
||||
}
|
||||
|
||||
static void resampler_CC_free(void *re_)
|
||||
static void resampler_CC_upsample(void *re_, struct resampler_data *data)
|
||||
{
|
||||
rarch_CC_resampler_t *re = (rarch_CC_resampler_t*)re_;
|
||||
if (re)
|
||||
free(re);
|
||||
data->output_frames = resampler_CC_upsample_neon(data->data_out, data->data_in, re_, data->input_frames, data->ratio);
|
||||
}
|
||||
|
||||
static void *resampler_CC_init(double bandwidth_mod)
|
||||
{
|
||||
int i;
|
||||
rarch_CC_resampler_t *re = (rarch_CC_resampler_t*)
|
||||
calloc(1, sizeof(rarch_CC_resampler_t));
|
||||
if (!re)
|
||||
return NULL;
|
||||
|
||||
for (i = 0; i < 4; i++)
|
||||
{
|
||||
re->previous = _mm_setzero_ps();
|
||||
re->current = _mm_setzero_ps();
|
||||
}
|
||||
|
||||
RARCH_LOG("Convoluted Cosine resampler (SSE) : ");
|
||||
|
||||
/* variations of data->ratio around 0.75 are safer
|
||||
* than around 1.0 for both up/downsampler. */
|
||||
if (bandwidth_mod < 0.75)
|
||||
{
|
||||
RARCH_LOG("CC_downsample @%f \n", bandwidth_mod);
|
||||
re->process = resampler_CC_downsample;
|
||||
re->distance = 0.0;
|
||||
}
|
||||
else
|
||||
{
|
||||
RARCH_LOG("CC_upsample @%f \n", bandwidth_mod);
|
||||
re->process = resampler_CC_upsample;
|
||||
re->distance = 2.0;
|
||||
}
|
||||
|
||||
return re;
|
||||
}
|
||||
#else
|
||||
|
||||
/* C reference version. Not optimized. */
|
||||
typedef struct rarch_CC_resampler
|
||||
{
|
||||
audio_frame_float_t buffer[4];
|
||||
float distance;
|
||||
void (*process)(void *re, struct resampler_data *data);
|
||||
} rarch_CC_resampler_t;
|
||||
|
||||
#define CC_RESAMPLER_IDENT "C"
|
||||
|
||||
#if (CC_RESAMPLER_PRECISION > 4)
|
||||
static inline float cc_int(float x, float b)
|
||||
{
|
||||
float val = x * b * M_PI + sinf(x * b * M_PI);
|
||||
@ -411,6 +401,21 @@ static inline float cc_kernel(float x, float b)
|
||||
{
|
||||
return (cc_int(x + 0.5, b) - cc_int(x - 0.5, b)) / (2.0 * M_PI);
|
||||
}
|
||||
#else
|
||||
/* Approximate variant of cc_int, compiled when CC_RESAMPLER_PRECISION <= 4.
 *
 * Starts from val = x * b. When CC_RESAMPLER_PRECISION > 0 the value is
 * refined with val * (1 - 0.25 * val^2 * (3 - val^2)), which expands to
 * val - (3/4)*val^3 + (1/4)*val^5. The result is clamped to [-0.5, 0.5]. */
static inline float cc_int(float x, float b)
|
||||
{
|
||||
float val = x * b;
|
||||
#if (CC_RESAMPLER_PRECISION > 0)
|
||||
/* Fifth-order odd polynomial written in nested form. */
val = val*(1 - 0.25 * val * val * (3.0 - val * val));
|
||||
#endif
|
||||
/* Clamp to the range [-0.5, 0.5]. */
return (val > 0.5) ? 0.5 : (val < -0.5) ? -0.5 : val;
|
||||
}
|
||||
|
||||
/* Kernel weight at offset x for cutoff b: the difference of cc_int taken
 * half a step above and half a step below x. Because each cc_int result is
 * clamped to [-0.5, 0.5], the weight lies in [-1, 1]. */
static inline float cc_kernel(float x, float b)
|
||||
{
|
||||
return (cc_int(x + 0.5, b) - cc_int(x - 0.5, b));
|
||||
}
|
||||
#endif
|
||||
|
||||
static inline void add_to(const audio_frame_float_t *source,
|
||||
audio_frame_float_t *target, float ratio)
|
||||
@ -458,10 +463,6 @@ static void resampler_CC_downsample(void *re_, struct resampler_data *data)
|
||||
data->output_frames = outp - (audio_frame_float_t*)data->data_out;
|
||||
}
|
||||
|
||||
#ifndef min
|
||||
#define min(a, b) ((a) < (b) ? (a) : (b))
|
||||
#endif
|
||||
|
||||
static void resampler_CC_upsample(void *re_, struct resampler_data *data)
|
||||
{
|
||||
float b, ratio;
|
||||
@ -505,6 +506,7 @@ static void resampler_CC_upsample(void *re_, struct resampler_data *data)
|
||||
|
||||
data->output_frames = outp - (audio_frame_float_t*)data->data_out;
|
||||
}
|
||||
#endif
|
||||
|
||||
static void resampler_CC_process(void *re_, struct resampler_data *data)
|
||||
{
|
||||
@ -533,7 +535,7 @@ static void *resampler_CC_init(double bandwidth_mod)
|
||||
re->buffer[i].r = 0.0;
|
||||
}
|
||||
|
||||
RARCH_LOG("Convoluted Cosine resampler (C) : ");
|
||||
RARCH_LOG("Convoluted Cosine resampler (" CC_RESAMPLER_IDENT ") - precision = %i : ", CC_RESAMPLER_PRECISION);
|
||||
|
||||
/* variations of data->ratio around 0.75 are safer
|
||||
* than around 1.0 for both up/downsampler. */
|
||||
|
372
audio/resamplers/cc_resampler_neon.S
Normal file
372
audio/resamplers/cc_resampler_neon.S
Normal file
@ -0,0 +1,372 @@
|
||||
/* RetroArch - A frontend for libretro.
|
||||
* Copyright (C) 2010-2014 - Hans-Kristian Arntzen
|
||||
* Copyright (C) 2014 - Ali Bouhlel ( aliaspider@gmail.com )
|
||||
*
|
||||
* RetroArch is free software: you can redistribute it and/or modify it under the terms
|
||||
* of the GNU General Public License as published by the Free Software Found-
|
||||
* ation, either version 3 of the License, or (at your option) any later version.
|
||||
*
|
||||
* RetroArch is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
|
||||
* without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
|
||||
* PURPOSE. See the GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License along with RetroArch.
|
||||
* If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
|
||||
#if defined(__ARM_NEON__)
|
||||
|
||||
#ifndef CC_RESAMPLER_PRECISION
|
||||
#define CC_RESAMPLER_PRECISION 1
|
||||
#endif
|
||||
|
||||
#ifndef __MACH__
|
||||
.arm
|
||||
#endif
|
||||
.align 4
|
||||
.globl resampler_CC_downsample_neon
|
||||
.globl _resampler_CC_downsample_neon
|
||||
|
||||
# size_t resampler_CC_downsample_neon(float *outp, const float *inp,
|
||||
# rarch_CC_resampler_t* re_, size_t input_frames, float ratio);
|
||||
|
||||
# r0: outp initial (and output_frames return value)
|
||||
# r1: inp initial/current
|
||||
# r2: re_ [r2+0] --> {q14,q15}=buffer , [r2+32] --> s5=distance
|
||||
# r3: input_frames/inp_max
|
||||
# r4: outp current
|
||||
# r5:
|
||||
# r6:
|
||||
# r7:
|
||||
|
||||
# q0: d0: s0: 0.0 # q4: d8: s16: min(ratio, 1.0)
|
||||
# s1: 1.0 # s17: min(ratio, 1.0)
|
||||
# d1: s2: 2.0 # d9: s18: min(ratio, 1.0)
|
||||
# s3: 3.0 # s19: min(ratio, 1.0)
|
||||
# q1: d2: s4: ratio # q5: d10: s20: 1.0
|
||||
# s5: distance # s21: 1.0
|
||||
# d3: s6: (1.0/ratio) # d11: s22: 1.0
|
||||
# s7: (1.0/ratio)+0.5 # s23: 1.0
|
||||
# q2: d4: s8: 0.5 # q6: d12: s24: 3.0
|
||||
# s9: 0.5 # s25: 3.0
|
||||
# d5: s10: 0.5 # d13: s26: 3.0
|
||||
# s11: 0.5 # s27: 3.0
|
||||
# q3: d6: s12: -0.5 # q7: d14: s28: 0.25
|
||||
# s13: -0.5 # s29: 0.25
|
||||
# d7: s14: -0.5 # d15: s30: 0.25
|
||||
# s15: -0.5 # s31: 0.25
|
||||
|
||||
# q8: d16: (temp) # q12: d24: (temp)
|
||||
# (temp) # (temp)
|
||||
# d17: (temp) # d25: (temp)
|
||||
# (temp) # (temp)
|
||||
# q9: d18: (temp) # q13: d26: (temp)
|
||||
# (temp) # (temp)
|
||||
# d19: (temp) # d27: (temp)
|
||||
# (temp) # (temp)
|
||||
# q10: d20: (temp) # q14: d28: buffer[0]
|
||||
# (temp) # buffer[1]
|
||||
# d21: (temp) # d29: buffer[2]
|
||||
# (temp) # buffer[3]
|
||||
# q11: d22: (temp) # q15: d30: buffer[4]
|
||||
# (temp) # buffer[5]
|
||||
# d23: (temp) # d31: buffer[6]
|
||||
# (temp) # buffer[7]
|
||||
|
||||
|
||||
/* Downsampling loop. Each input frame is weighted by four kernel values and
 * accumulated into the four buffered frames held in q14/q15. Whenever
 * distance exceeds (1.0/ratio)+0.5 the oldest buffered frame is written to
 * outp and the buffer is shifted. Returns the number of frames written in r0.
 * Both symbol spellings (with and without leading underscore) are exported. */
resampler_CC_downsample_neon:
|
||||
_resampler_CC_downsample_neon:
|
||||
|
||||
/* Load the eight buffered floats from re_. The :256 qualifier asserts a
 * 32-byte aligned address.
 * NOTE(review): confirm the struct is allocated with that alignment. */
vld1.f32 {q14-q15}, [r2, :256]
|
||||
/* ratio is taken from the top of the stack (read before the pushes below
 * move sp).
 * NOTE(review): this assumes the float argument is passed on the stack;
 * verify for targets that pass floats in VFP registers. */
vldr s4, [sp]
|
||||
/* Preserve q4-q7 and r4, which are overwritten below and restored on exit. */
vpush {q4,q5,q6,q7}
|
||||
push {r4}
|
||||
|
||||
/* r4 = running output pointer; r0 keeps the start for the final count. */
mov r4, r0
|
||||
|
||||
|
||||
/* q0 = {0.0, 1.0, 2.0, 3.0}: per-tap multipliers for 1.0/ratio. */
veor q0, q0, q0
|
||||
vmov.f32 s1, #1.0
|
||||
vmov.f32 s2, #2.0
|
||||
vmov.f32 s3, #3.0
|
||||
|
||||
/* q2 = +0.5 and q3 = -0.5 in every lane (half-step offsets and clamp bounds). */
vmov.f32 q2, #0.5
|
||||
vmov.f32 q3, #-0.5
|
||||
|
||||
/* Polynomial constants: q5 = 1.0, q6 = 3.0, q7 = 0.25 in every lane. */
vmov.f32 q5, #1.0
|
||||
vmov.f32 q6, #3.0
|
||||
vmov.f32 q7, #0.25
|
||||
|
||||
|
||||
/* s5 = distance, s6 = 1.0/ratio, s7 = 1.0/ratio + 0.5 (output threshold). */
vldr s5, [r2, #32]
|
||||
vdiv.f32 s6, s20, s4
|
||||
vadd.f32 s7, s6, s8
|
||||
/* q4 = min(ratio, 1.0) in every lane. */
vdup.f32 q4, d2[0]
|
||||
vmin.f32 q4, q4, q5
|
||||
|
||||
|
||||
/* r3 = inp + input_frames * 8, the end-of-input pointer (8 bytes per frame). */
lsl r3, #3
|
||||
add r3, r3, r1
|
||||
|
||||
|
||||
/* Nothing to do when there are no input frames. */
cmp r3, r1
|
||||
beq 3f
|
||||
1:
|
||||
|
||||
/* q8 = distance - {0, 1, 2, 3} * (1.0/ratio): one position per tap. */
vdup.f32 q8, d3[0]
|
||||
vmul.f32 q8, q8, q0
|
||||
vdup.f32 q9, d2[1]
|
||||
vsub.f32 q8, q9, q8
|
||||
|
||||
/* q10 = position + 0.5, q11 = position - 0.5. */
vadd.f32 q10, q8, q2
|
||||
vsub.f32 q11, q8, q2
|
||||
|
||||
/* Scale both by min(ratio, 1.0). */
vmul.f32 q10, q10, q4
|
||||
vmul.f32 q11, q11, q4
|
||||
|
||||
#if (CC_RESAMPLER_PRECISION > 0)
|
||||
/* Polynomial refinement: v = v * (1 - 0.25 * v^2 * (3 - v^2)),
 * applied to q10 and q11 in parallel. */
vmul.f32 q8, q10, q10
|
||||
vmul.f32 q9, q11, q11
|
||||
|
||||
vsub.f32 q12, q6, q8
|
||||
vsub.f32 q13, q6, q9
|
||||
|
||||
vmul.f32 q12, q12, q8
|
||||
vmul.f32 q13, q13, q9
|
||||
|
||||
vmul.f32 q12, q12, q7
|
||||
vmul.f32 q13, q13, q7
|
||||
|
||||
vsub.f32 q12, q5, q12
|
||||
vsub.f32 q13, q5, q13
|
||||
|
||||
vmul.f32 q10, q10, q12
|
||||
vmul.f32 q11, q11, q13
|
||||
#endif
|
||||
|
||||
/* Clamp both values to [-0.5, 0.5]. */
vmin.f32 q10, q10, q2
|
||||
vmin.f32 q11, q11, q2
|
||||
|
||||
vmax.f32 q10, q10, q3
|
||||
vmax.f32 q11, q11, q3
|
||||
|
||||
/* q10 = four tap weights {w0, w1, w2, w3}. */
vsub.f32 q10, q10, q11
|
||||
vmov.f32 q11, q10
|
||||
|
||||
/* Zipping the vector with its own copy duplicates each weight for the two
 * channels: q10 = {w0, w0, w1, w1}, q11 = {w2, w2, w3, w3}. */
vzip.f32 q10, q11
|
||||
|
||||
/* Load one input frame (two floats, address asserted 8-byte aligned),
 * advance inp, and replicate the frame into both halves of q8. */
vld1.f32 d16, [r1, :64]!
|
||||
vmov.f32 d17, d16
|
||||
|
||||
/* Weight the frame per tap and accumulate into the four buffered frames. */
vmul.f32 q10, q10, q8
|
||||
vmul.f32 q11, q11, q8
|
||||
|
||||
vadd.f32 q14, q14, q10
|
||||
vadd.f32 q15, q15, q11
|
||||
|
||||
# distance++
|
||||
vadd.f32 s5, s5, s20
|
||||
|
||||
/* Skip the output step while distance <= (1.0/ratio) + 0.5. */
vcmpe.f32 s5, s7
|
||||
vmrs APSR_nzcv, fpscr
|
||||
ble 2f
|
||||
|
||||
/* Emit the oldest buffered frame, shift the remaining three down one slot
 * and clear the last slot. */
vst1.f32 d28, [r4, :64]!
|
||||
vmov.f32 d28, d29
|
||||
vmov.f32 d29, d30
|
||||
vmov.f32 d30, d31
|
||||
vmov.f32 d31, #0.0
|
||||
|
||||
/* distance -= 1.0/ratio */
vsub.f32 s5, s5, s6
|
||||
|
||||
2:
|
||||
/* Loop until inp reaches the end pointer. */
cmp r3, r1
|
||||
bne 1b
|
||||
|
||||
3:
|
||||
/* Write the buffer and distance back to re_. */
vst1.f32 {q14-q15}, [r2, :256]
|
||||
vstr s5, [r2, #32]
|
||||
/* Return value: bytes written / 8 = number of output frames. */
sub r0, r4, r0
|
||||
lsr r0, r0, #3
|
||||
|
||||
/* Restore saved registers in reverse order of the pushes. */
pop {r4}
|
||||
vpop {q4,q5,q6,q7}
|
||||
|
||||
bx lr
|
||||
|
||||
|
||||
.align 4
|
||||
.globl resampler_CC_upsample_neon
|
||||
.globl _resampler_CC_upsample_neon
|
||||
|
||||
# size_t resampler_CC_upsample_neon(float *outp, const float *inp,
|
||||
# rarch_CC_resampler_t* re_, size_t input_frames, float ratio);
|
||||
|
||||
# r0: outp initial (and output_frames return value)
|
||||
# r1: inp initial/current
|
||||
# r2: re_ [r2+0] --> {q14,q15}=buffer , [r2+32] --> s5=distance
|
||||
# r3: input_frames/inp_max
|
||||
# r4: outp current
|
||||
# r5:
|
||||
# r6:
|
||||
# r7:
|
||||
|
||||
# q0: d0: s0: 1.0 # q4: d8: s16: min(ratio, 1.0)
|
||||
# s1: 0.0 # s17: min(ratio, 1.0)
|
||||
# d1: s2: -1.0 # d9: s18: min(ratio, 1.0)
|
||||
# s3: -2.0 # s19: min(ratio, 1.0)
|
||||
# q1: d2: s4: ratio # q5: d10: s20: 1.0
|
||||
# s5: distance # s21: 1.0
|
||||
# d3: s6: (1.0/ratio) # d11: s22: 1.0
|
||||
# s7: (1.0/ratio)+0.5 # s23: 1.0
|
||||
# q2: d4: s8: 0.5 # q6: d12: s24: 3.0
|
||||
# s9: 0.5 # s25: 3.0
|
||||
# d5: s10: 0.5 # d13: s26: 3.0
|
||||
# s11: 0.5 # s27: 3.0
|
||||
# q3: d6: s12: -0.5 # q7: d14: s28: 0.25
|
||||
# s13: -0.5 # s29: 0.25
|
||||
# d7: s14: -0.5 # d15: s30: 0.25
|
||||
# s15: -0.5 # s31: 0.25
|
||||
|
||||
# q8: d16: (temp) # q12: d24: (temp)
|
||||
# (temp) # (temp)
|
||||
# d17: (temp) # d25: (temp)
|
||||
# (temp) # (temp)
|
||||
# q9: d18: (temp) # q13: d26: (temp)
|
||||
# (temp) # (temp)
|
||||
# d19: (temp) # d27: (temp)
|
||||
# (temp) # (temp)
|
||||
# q10: d20: (temp) # q14: d28: buffer[0]
|
||||
# (temp) # buffer[1]
|
||||
# d21: (temp) # d29: buffer[2]
|
||||
# (temp) # buffer[3]
|
||||
# q11: d22: (temp) # q15: d30: buffer[4]
|
||||
# (temp) # buffer[5]
|
||||
# d23: (temp) # d31: buffer[6]
|
||||
# (temp) # buffer[7]
|
||||
|
||||
|
||||
/* Upsampling loop. Each input frame is shifted into the four-frame buffer in
 * q14/q15; while distance < 1.0 an output frame is produced as the weighted
 * sum of the four buffered frames and distance advances by 1.0/ratio.
 * Returns the number of frames written in r0.
 * Both symbol spellings (with and without leading underscore) are exported. */
resampler_CC_upsample_neon:
|
||||
_resampler_CC_upsample_neon:
|
||||
|
||||
/* Load the eight buffered floats from re_. The :256 qualifier asserts a
 * 32-byte aligned address.
 * NOTE(review): confirm the struct is allocated with that alignment. */
vld1.f32 {q14-q15}, [r2, :256]
|
||||
/* ratio is taken from the top of the stack (read before the pushes below
 * move sp).
 * NOTE(review): this assumes the float argument is passed on the stack;
 * verify for targets that pass floats in VFP registers. */
vldr s4, [sp]
|
||||
/* Preserve q4-q7 and r4, which are overwritten below and restored on exit. */
vpush {q4,q5,q6,q7}
|
||||
push {r4}
|
||||
|
||||
/* r4 = running output pointer; r0 keeps the start for the final count. */
mov r4, r0
|
||||
|
||||
|
||||
/* q0 = {1.0, 0.0, -1.0, -2.0}: per-tap offsets added to distance. */
veor q0, q0, q0
|
||||
vmov.f32 s0, #1.0
|
||||
vmov.f32 s2, #-1.0
|
||||
vmov.f32 s3, #-2.0
|
||||
|
||||
/* q2 = +0.5 and q3 = -0.5 in every lane (half-step offsets and clamp bounds). */
vmov.f32 q2, #0.5
|
||||
vmov.f32 q3, #-0.5
|
||||
|
||||
/* Polynomial constants: q5 = 1.0, q6 = 3.0, q7 = 0.25 in every lane. */
vmov.f32 q5, #1.0
|
||||
vmov.f32 q6, #3.0
|
||||
vmov.f32 q7, #0.25
|
||||
|
||||
|
||||
/* s5 = distance, s6 = 1.0/ratio. s7 = 1.0/ratio + 0.5 is computed as in the
 * downsampler but is not read again in this routine. */
vldr s5, [r2, #32]
|
||||
vdiv.f32 s6, s20, s4
|
||||
vadd.f32 s7, s6, s8
|
||||
/* q4 = min(ratio, 1.0) in every lane. */
vdup.f32 q4, d2[0]
|
||||
vmin.f32 q4, q4, q5
|
||||
|
||||
|
||||
/* r3 = inp + input_frames * 8, the end-of-input pointer (8 bytes per frame). */
lsl r3, #3
|
||||
add r3, r3, r1
|
||||
|
||||
|
||||
/* Nothing to do when there are no input frames. */
cmp r3, r1
|
||||
beq 4f
|
||||
1:
|
||||
|
||||
/* Load one input frame (address asserted 8-byte aligned), advance inp, and
 * shift it into the buffer: the oldest frame is dropped, the new one goes
 * into the last slot. */
vld1.f32 d16, [r1, :64]!
|
||||
vmov.f32 d28, d29
|
||||
vmov.f32 d29, d30
|
||||
vmov.f32 d30, d31
|
||||
vmov.f32 d31, d16
|
||||
|
||||
/* No output for this input frame when distance >= 1.0. */
vcmpe.f32 s5, s20
|
||||
vmrs APSR_nzcv, fpscr
|
||||
bge 3f
|
||||
2:
|
||||
/* q8 = distance + {1, 0, -1, -2}: one position per buffered frame. */
vdup.f32 q8, d2[1]
|
||||
vadd.f32 q8, q8, q0
|
||||
|
||||
/* q10 = position + 0.5, q11 = position - 0.5. */
vadd.f32 q10, q8, q2
|
||||
vsub.f32 q11, q8, q2
|
||||
|
||||
/* Scale both by min(ratio, 1.0). */
vmul.f32 q10, q10, q4
|
||||
vmul.f32 q11, q11, q4
|
||||
|
||||
#if (CC_RESAMPLER_PRECISION > 0)
|
||||
/* Polynomial refinement: v = v * (1 - 0.25 * v^2 * (3 - v^2)),
 * applied to q10 and q11 in parallel. */
vmul.f32 q8, q10, q10
|
||||
vmul.f32 q9, q11, q11
|
||||
|
||||
vsub.f32 q12, q6, q8
|
||||
vsub.f32 q13, q6, q9
|
||||
|
||||
vmul.f32 q12, q12, q8
|
||||
vmul.f32 q13, q13, q9
|
||||
|
||||
vmul.f32 q12, q12, q7
|
||||
vmul.f32 q13, q13, q7
|
||||
|
||||
vsub.f32 q12, q5, q12
|
||||
vsub.f32 q13, q5, q13
|
||||
|
||||
vmul.f32 q10, q10, q12
|
||||
vmul.f32 q11, q11, q13
|
||||
#endif
|
||||
|
||||
/* Clamp both values to [-0.5, 0.5]. */
vmin.f32 q10, q10, q2
|
||||
vmin.f32 q11, q11, q2
|
||||
|
||||
vmax.f32 q10, q10, q3
|
||||
vmax.f32 q11, q11, q3
|
||||
|
||||
/* q10 = four tap weights {w0, w1, w2, w3}. */
vsub.f32 q10, q10, q11
|
||||
vmov.f32 q11, q10
|
||||
|
||||
/* Zipping the vector with its own copy duplicates each weight for the two
 * channels: q10 = {w0, w0, w1, w1}, q11 = {w2, w2, w3, w3}. */
vzip.f32 q10, q11
|
||||
|
||||
/* Weight the four buffered frames. */
vmul.f32 q10, q10, q14
|
||||
vmul.f32 q11, q11, q15
|
||||
|
||||
/* Reduce to a single frame: d20 = sum of the four weighted frames. */
vadd.f32 q10, q10, q11
|
||||
vadd.f32 d20, d20, d21
|
||||
|
||||
/* Write the output frame and advance outp. */
vst1.f32 d20, [r4, :64]!
|
||||
|
||||
/* distance += 1.0/ratio */
vadd.f32 s5, s5, s6
|
||||
|
||||
|
||||
/* Keep producing output frames while distance < 1.0. */
vcmpe.f32 s5, s20
|
||||
vmrs APSR_nzcv, fpscr
|
||||
blt 2b
|
||||
|
||||
3:
|
||||
# distance--
|
||||
vsub.f32 s5, s5, s20
|
||||
|
||||
/* Loop until inp reaches the end pointer. */
cmp r3, r1
|
||||
bne 1b
|
||||
|
||||
|
||||
4:
|
||||
/* Write the buffer and distance back to re_. */
vst1.f32 {q14-q15}, [r2, :256]
|
||||
vstr s5, [r2, #32]
|
||||
/* Return value: bytes written / 8 = number of output frames. */
sub r0, r4, r0
|
||||
lsr r0, r0, #3
|
||||
|
||||
/* Restore saved registers in reverse order of the pushes. */
pop {r4}
|
||||
vpop {q4,q5,q6,q7}
|
||||
|
||||
bx lr
|
||||
|
||||
#endif
|
Loading…
Reference in New Issue
Block a user