From 12655c48049f9a52e5504bde90fe738862b0ff08 Mon Sep 17 00:00:00 2001 From: Peter Meerwald Date: Thu, 19 Feb 2015 23:28:26 +0100 Subject: [PATCH] libavresample: NEON optimized FIR audio resampling MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit modelled after aarch64 code on Cortex-A8, s16 and s32 code is about 2x faster, float code about 7x faster Signed-off-by: Peter Meerwald Signed-off-by: Martin Storsjö --- libavresample/arm/Makefile | 6 +- libavresample/arm/asm-offsets.h | 29 +++ libavresample/arm/resample_init.c | 74 ++++++ libavresample/arm/resample_neon.S | 358 ++++++++++++++++++++++++++++++ libavresample/internal.h | 3 + libavresample/resample.c | 2 + 6 files changed, 470 insertions(+), 2 deletions(-) create mode 100644 libavresample/arm/asm-offsets.h create mode 100644 libavresample/arm/resample_init.c create mode 100644 libavresample/arm/resample_neon.S diff --git a/libavresample/arm/Makefile b/libavresample/arm/Makefile index 60f3f6d6b8..affc2bfa45 100644 --- a/libavresample/arm/Makefile +++ b/libavresample/arm/Makefile @@ -1,5 +1,7 @@ -OBJS += arm/audio_convert_init.o +OBJS += arm/audio_convert_init.o \ + arm/resample_init.o OBJS-$(CONFIG_NEON_CLOBBER_TEST) += arm/neontest.o -NEON-OBJS += arm/audio_convert_neon.o +NEON-OBJS += arm/audio_convert_neon.o \ + arm/resample_neon.o diff --git a/libavresample/arm/asm-offsets.h b/libavresample/arm/asm-offsets.h new file mode 100644 index 0000000000..a1623ed68b --- /dev/null +++ b/libavresample/arm/asm-offsets.h @@ -0,0 +1,29 @@ +/* + * This file is part of Libav. + * + * Libav is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * Libav is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with Libav; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef AVRESAMPLE_ARM_ASM_OFFSETS_H +#define AVRESAMPLE_ARM_ASM_OFFSETS_H + +/* struct ResampleContext */ +#define FILTER_BANK 0x08 +#define FILTER_LENGTH 0x0c +#define SRC_INCR 0x20 +#define PHASE_SHIFT 0x28 +#define PHASE_MASK (PHASE_SHIFT + 0x04) + +#endif /* AVRESAMPLE_ARM_ASM_OFFSETS_H */ diff --git a/libavresample/arm/resample_init.c b/libavresample/arm/resample_init.c new file mode 100644 index 0000000000..540177ed0b --- /dev/null +++ b/libavresample/arm/resample_init.c @@ -0,0 +1,74 @@ +/* + * Copyright (c) 2014 Peter Meerwald + * + * This file is part of Libav. + * + * Libav is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * Libav is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with Libav; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "config.h" + +#include "libavutil/cpu.h" +#include "libavutil/arm/cpu.h" +#include "libavutil/internal.h" +#include "libavutil/samplefmt.h" + +#include "libavresample/resample.h" + +#include "asm-offsets.h" + +AV_CHECK_OFFSET(struct ResampleContext, filter_bank, FILTER_BANK); +AV_CHECK_OFFSET(struct ResampleContext, filter_length, FILTER_LENGTH); +AV_CHECK_OFFSET(struct ResampleContext, src_incr, SRC_INCR); +AV_CHECK_OFFSET(struct ResampleContext, phase_shift, PHASE_SHIFT); +AV_CHECK_OFFSET(struct ResampleContext, phase_mask, PHASE_MASK); + +void ff_resample_one_flt_neon(struct ResampleContext *c, void *dst0, + int dst_index, const void *src0, + unsigned int index, int frac); +void ff_resample_one_s16_neon(struct ResampleContext *c, void *dst0, + int dst_index, const void *src0, + unsigned int index, int frac); +void ff_resample_one_s32_neon(struct ResampleContext *c, void *dst0, + int dst_index, const void *src0, + unsigned int index, int frac); + +void ff_resample_linear_flt_neon(struct ResampleContext *c, void *dst0, + int dst_index, const void *src0, + unsigned int index, int frac); + +av_cold void ff_audio_resample_init_arm(ResampleContext *c, + enum AVSampleFormat sample_fmt) +{ + int cpu_flags = av_get_cpu_flags(); + if (have_neon(cpu_flags)) { + switch (sample_fmt) { + case AV_SAMPLE_FMT_FLTP: + if (c->linear) + c->resample_one = ff_resample_linear_flt_neon; + else + c->resample_one = ff_resample_one_flt_neon; + break; + case AV_SAMPLE_FMT_S16P: + if (!c->linear) + c->resample_one = ff_resample_one_s16_neon; + break; + case AV_SAMPLE_FMT_S32P: + if (!c->linear) + c->resample_one = ff_resample_one_s32_neon; + break; + } + } +} diff --git a/libavresample/arm/resample_neon.S b/libavresample/arm/resample_neon.S new file mode 100644 index 0000000000..a8181e55ce --- /dev/null +++ b/libavresample/arm/resample_neon.S @@ -0,0 +1,358 @@ +/* + * Copyright (c) 2014 Peter Meerwald + * + * This file is part of Libav. + * + * Libav is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * Libav is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with Libav; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "libavutil/arm/asm.S" + +#include "asm-offsets.h" + +.macro resample_one fmt, es=2 +function ff_resample_one_\fmt\()_neon, export=1 + push {r4, r5} + add r1, r1, r2, lsl #\es + + ldr r2, [r0, #PHASE_SHIFT+4] /* phase_mask */ + ldr ip, [sp, #8] /* index */ + ldr r5, [r0, #FILTER_LENGTH] + and r2, ip, r2 /* (index & phase_mask) */ + ldr r4, [r0, #PHASE_SHIFT] + lsr r4, ip, r4 /* compute sample_index */ + mul r2, r2, r5 + + ldr ip, [r0, #FILTER_BANK] + add r3, r3, r4, lsl #\es /* &src[sample_index] */ + + cmp r5, #8 + add r0, ip, r2, lsl #\es /* filter = &filter_bank[...] */ + + blt 5f +8: + subs r5, r5, #8 + LOAD4 + MUL4 +7: + LOAD4 + beq 6f + cmp r5, #8 + MLA4 + blt 4f + subs r5, r5, #8 + LOAD4 + MLA4 + b 7b +6: + MLA4 + STORE + pop {r4, r5} + bx lr +5: + INIT4 +4: /* remaining filter_length 1 to 7 */ + cmp r5, #4 + blt 2f + subs r5, r5, #4 + LOAD4 + MLA4 + beq 0f +2: /* remaining filter_length 1 to 3 */ + cmp r5, #2 + blt 1f + subs r5, r5, #2 + LOAD2 + MLA2 + beq 0f +1: /* remaining filter_length 1 */ + LOAD1 + MLA1 +0: + STORE + pop {r4, r5} + bx lr +endfunc + +.purgem LOAD1 +.purgem LOAD2 +.purgem LOAD4 +.purgem MLA1 +.purgem MLA2 +.purgem MLA4 +.purgem MUL4 +.purgem INIT4 +.purgem STORE +.endm + + +/* float32 */ +.macro LOAD1 + veor.32 d0, d0 + vld1.32 {d0[0]}, [r0]! /* load filter */ + vld1.32 {d4[0]}, [r3]! /* load src */ +.endm +.macro LOAD2 + vld1.32 {d0}, [r0]! /* load filter */ + vld1.32 {d4}, [r3]! /* load src */ +.endm +.macro LOAD4 + vld1.32 {d0,d1}, [r0]! /* load filter */ + vld1.32 {d4,d5}, [r3]! /* load src */ +.endm +.macro MLA1 + vmla.f32 d16, d0, d4[0] +.endm +.macro MLA2 + vmla.f32 d16, d0, d4 +.endm +.macro MLA4 + vmla.f32 d16, d0, d4 + vmla.f32 d17, d1, d5 +.endm +.macro MUL4 + vmul.f32 d16, d0, d4 + vmul.f32 d17, d1, d5 +.endm +.macro INIT4 + veor.f32 q8, q8 +.endm +.macro STORE + vpadd.f32 d16, d16, d17 + vpadd.f32 d16, d16, d16 + vst1.32 d16[0], [r1] +.endm + +resample_one flt, 2 + + +/* s32 */ +.macro LOAD1 + veor.32 d0, d0 + vld1.32 {d0[0]}, [r0]! /* load filter */ + vld1.32 {d4[0]}, [r3]! /* load src */ +.endm +.macro LOAD2 + vld1.32 {d0}, [r0]! /* load filter */ + vld1.32 {d4}, [r3]! /* load src */ +.endm +.macro LOAD4 + vld1.32 {d0,d1}, [r0]! /* load filter */ + vld1.32 {d4,d5}, [r3]! /* load src */ +.endm +.macro MLA1 + vmlal.s32 q8, d0, d4[0] +.endm +.macro MLA2 + vmlal.s32 q8, d0, d4 +.endm +.macro MLA4 + vmlal.s32 q8, d0, d4 + vmlal.s32 q9, d1, d5 +.endm +.macro MUL4 + vmull.s32 q8, d0, d4 + vmull.s32 q9, d1, d5 +.endm +.macro INIT4 + veor.s64 q8, q8 + veor.s64 q9, q9 +.endm +.macro STORE + vadd.s64 q8, q8, q9 + vadd.s64 d16, d16, d17 + vqrshrn.s64 d16, q8, #30 + vst1.32 d16[0], [r1] +.endm + +resample_one s32, 2 + + +/* s16 */ +.macro LOAD1 + veor.16 d0, d0 + vld1.16 {d0[0]}, [r0]! /* load filter */ + vld1.16 {d4[0]}, [r3]! /* load src */ +.endm +.macro LOAD2 + veor.16 d0, d0 + vld1.32 {d0[0]}, [r0]! /* load filter */ + veor.16 d4, d4 + vld1.32 {d4[0]}, [r3]! /* load src */ +.endm +.macro LOAD4 + vld1.16 {d0}, [r0]! /* load filter */ + vld1.16 {d4}, [r3]! /* load src */ +.endm +.macro MLA1 + vmlal.s16 q8, d0, d4[0] +.endm +.macro MLA2 + vmlal.s16 q8, d0, d4 +.endm +.macro MLA4 + vmlal.s16 q8, d0, d4 +.endm +.macro MUL4 + vmull.s16 q8, d0, d4 +.endm +.macro INIT4 + veor.s32 q8, q8 +.endm +.macro STORE + vpadd.s32 d16, d16, d17 + vpadd.s32 d16, d16, d16 + vqrshrn.s32 d16, q8, #15 + vst1.16 d16[0], [r1] +.endm + +resample_one s16, 1 + + +.macro resample_linear fmt, es=2 +function ff_resample_linear_\fmt\()_neon, export=1 + push {r4, r5} + add r1, r1, r2, lsl #\es + + ldr r2, [r0, #PHASE_SHIFT+4] /* phase_mask */ + ldr ip, [sp, #8] /* index */ + ldr r5, [r0, #FILTER_LENGTH] + and r2, ip, r2 /* (index & phase_mask) */ + ldr r4, [r0, #PHASE_SHIFT] + lsr r4, ip, r4 /* compute sample_index */ + mul r2, r2, r5 + + ldr ip, [r0, #FILTER_BANK] + add r3, r3, r4, lsl #\es /* &src[sample_index] */ + + cmp r5, #8 + ldr r4, [r0, #SRC_INCR] + add r0, ip, r2, lsl #\es /* filter = &filter_bank[...] */ + add r2, r0, r5, lsl #\es /* filter[... + c->filter_length] */ + + blt 5f +8: + subs r5, r5, #8 + LOAD4 + MUL4 +7: + LOAD4 + beq 6f + cmp r5, #8 + MLA4 + blt 4f + subs r5, r5, #8 + LOAD4 + MLA4 + b 7b +6: + MLA4 + STORE + pop {r4, r5} + bx lr +5: + INIT4 +4: /* remaining filter_length 1 to 7 */ + cmp r5, #4 + blt 2f + subs r5, r5, #4 + LOAD4 + MLA4 + beq 0f +2: /* remaining filter_length 1 to 3 */ + cmp r5, #2 + blt 1f + subs r5, r5, #2 + LOAD2 + MLA2 + beq 0f +1: /* remaining filter_length 1 */ + LOAD1 + MLA1 +0: + STORE + pop {r4, r5} + bx lr +endfunc + +.purgem LOAD1 +.purgem LOAD2 +.purgem LOAD4 +.purgem MLA1 +.purgem MLA2 +.purgem MLA4 +.purgem MUL4 +.purgem INIT4 +.purgem STORE +.endm + + +/* float32 linear */ +.macro LOAD1 + veor.32 d0, d0 + veor.32 d2, d2 + vld1.32 {d0[0]}, [r0]! /* load filter */ + vld1.32 {d2[0]}, [r2]! /* load filter */ + vld1.32 {d4[0]}, [r3]! /* load src */ +.endm +.macro LOAD2 + vld1.32 {d0}, [r0]! /* load filter */ + vld1.32 {d2}, [r2]! /* load filter */ + vld1.32 {d4}, [r3]! /* load src */ +.endm +.macro LOAD4 + vld1.32 {d0,d1}, [r0]! /* load filter */ + vld1.32 {d2,d3}, [r2]! /* load filter */ + vld1.32 {d4,d5}, [r3]! /* load src */ +.endm +.macro MLA1 + vmla.f32 d18, d0, d4[0] + vmla.f32 d16, d2, d4[0] +.endm +.macro MLA2 + vmla.f32 d18, d0, d4 + vmla.f32 d16, d2, d4 +.endm +.macro MLA4 + vmla.f32 q9, q0, q2 + vmla.f32 q8, q1, q2 +.endm +.macro MUL4 + vmul.f32 q9, q0, q2 + vmul.f32 q8, q1, q2 +.endm +.macro INIT4 + veor.f32 q9, q9 + veor.f32 q8, q8 +.endm +.macro STORE + vldr s0, [sp, #12] /* frac */ + vmov s1, r4 + vcvt.f32.s32 d0, d0 + + vsub.f32 q8, q8, q9 /* v2 - val */ + vpadd.f32 d18, d18, d19 + vpadd.f32 d16, d16, d17 + vpadd.f32 d2, d18, d18 + vpadd.f32 d1, d16, d16 + + vmul.f32 s2, s2, s0 /* (v2 - val) * frac */ + vdiv.f32 s2, s2, s1 /* / c->src_incr */ + vadd.f32 s4, s4, s2 + + vstr s4, [r1] +.endm + +resample_linear flt, 2 diff --git a/libavresample/internal.h b/libavresample/internal.h index e59758c29b..b88b7587aa 100644 --- a/libavresample/internal.h +++ b/libavresample/internal.h @@ -110,4 +110,7 @@ struct AVAudioResampleContext { void ff_audio_resample_init_aarch64(ResampleContext *c, enum AVSampleFormat sample_fmt); +void ff_audio_resample_init_arm(ResampleContext *c, + enum AVSampleFormat sample_fmt); + #endif /* AVRESAMPLE_INTERNAL_H */ diff --git a/libavresample/resample.c b/libavresample/resample.c index 4553b2c6eb..dd0ea133f6 100644 --- a/libavresample/resample.c +++ b/libavresample/resample.c @@ -172,6 +172,8 @@ ResampleContext *ff_audio_resample_init(AVAudioResampleContext *avr) if (ARCH_AARCH64) ff_audio_resample_init_aarch64(c, avr->internal_sample_fmt); + if (ARCH_ARM) + ff_audio_resample_init_arm(c, avr->internal_sample_fmt); felem_size = av_get_bytes_per_sample(avr->internal_sample_fmt); c->filter_bank = av_mallocz(c->filter_length * (phase_count + 1) * felem_size);