mirror of
https://github.com/xenia-project/FFmpeg.git
synced 2024-11-24 03:59:43 +00:00
arm: add ff_int32_to_float_fmul_array8_neon
Quite a bit faster than int32_to_float_fmul_array8_c calling ff_int32_to_float_fmul_scalar_neon through FmtConvertContext.
Number of cycles per int32_to_float_fmul_array8 call while decoding padded.dts on exynos5422:
             before  after  change
cortex-a7:     1270    951    -25%
cortex-a15:     434    285    -34%
checkasm --bench cycle counts:
                                    cortex-a15  cortex-a7
int32_to_float_fmul_array8_c:           1730.4     4384.5
int32_to_float_fmul_array8_neon_c:       571.5     1694.3
int32_to_float_fmul_array8_neon:         374.0     1448.8
The interesting differences are those between int32_to_float_fmul_array8_neon_c and int32_to_float_fmul_array8_neon. The former is the current behaviour of calling ff_int32_to_float_fmul_scalar_neon repeatedly from the C function. The raw numbers differ since checkasm uses different lengths than the dca decoder.
This commit is contained in:
parent
a0fc780a20
commit
90b1b9350c
@ -25,6 +25,9 @@
|
||||
#include "libavcodec/avcodec.h"
|
||||
#include "libavcodec/fmtconvert.h"
|
||||
|
||||
/**
 * NEON version of the int32_to_float_fmul_array8 conversion; it is assigned
 * to FmtConvertContext.int32_to_float_fmul_array8 when NEON is available.
 *
 * Converts int32 samples to float in groups of 8 and multiplies every group
 * by the next entry of mul.
 *
 * @param c   context; the NEON code does not read it
 * @param dst output floats; stored with a 128-bit alignment hint
 * @param src input int32 samples; loaded with a 128-bit alignment hint
 * @param mul scale factors, one float consumed per group of 8 samples
 * @param len number of samples; only whole groups of 8 are processed
 *            (NOTE(review): the assembly does not special-case len < 8,
 *            presumably callers always pass at least 8 - confirm)
 */
void ff_int32_to_float_fmul_array8_neon(FmtConvertContext *c, float *dst,
|
||||
const int32_t *src, const float *mul,
|
||||
int len);
|
||||
void ff_int32_to_float_fmul_scalar_neon(float *dst, const int32_t *src,
|
||||
float mul, int len);
|
||||
|
||||
@ -46,6 +49,7 @@ av_cold void ff_fmt_convert_init_arm(FmtConvertContext *c, AVCodecContext *avctx
|
||||
}
|
||||
|
||||
if (have_neon(cpu_flags)) {
|
||||
c->int32_to_float_fmul_array8 = ff_int32_to_float_fmul_array8_neon;
|
||||
c->int32_to_float_fmul_scalar = ff_int32_to_float_fmul_scalar_neon;
|
||||
}
|
||||
}
|
||||
|
@ -1,6 +1,7 @@
|
||||
/*
|
||||
* ARM NEON optimised Format Conversion Utils
|
||||
* Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
|
||||
* Copyright (c) 2015 Janne Grunau <janne-libav@jannau.net>
|
||||
*
|
||||
* This file is part of Libav.
|
||||
*
|
||||
@ -49,3 +50,39 @@ NOVFP len .req r3
|
||||
bx lr
|
||||
.unreq len
|
||||
endfunc
|
||||
|
||||
/*
 * ff_int32_to_float_fmul_array8_neon(c, dst, src, mul, len)
 *
 * Converts int32 samples read from src to float, scales them and stores the
 * result to dst. Samples are handled in groups of 8; each group is
 * multiplied by the next float read from mul.
 *
 * Register use as seen in the code below (the argument mapping assumes the
 * standard ARM calling convention applied to the C prototype):
 *   r0   - c on entry; overwritten at once and used as the group counter
 *   r1   - dst, post-incremented, accessed with a 128-bit alignment hint
 *   r2   - src, post-incremented, accessed with a 128-bit alignment hint
 *   r3   - mul, one float consumed per group of 8 samples
 *   [sp] - len
 *
 * Structure: the main loop at 2: handles two groups (16 samples) per
 * iteration, the tail at 1: handles a single remaining group.
 *
 * NOTE(review): for len < 8 the counter becomes -1, the beq is not taken and
 * the main loop still runs once (16 samples, 2 multipliers). Callers
 * presumably always pass len >= 8 - confirm.
 */
function ff_int32_to_float_fmul_array8_neon, export=1
|
||||
ldr r0, [sp] /* r0 = len, read from the stack */
|
||||
lsr r0, r0, #3 /* r0 = len / 8 = number of 8-sample groups */
|
||||
subs r0, r0, #1 /* r0 = groups - 1, sets flags for the beq */
|
||||
beq 1f /* exactly one group: only the tail is needed */
|
||||
2:
|
||||
vld1.32 {q0-q1}, [r2,:128]! /* load 8 int32 samples of the first group */
|
||||
vld1.32 {q2-q3}, [r2,:128]! /* load 8 int32 samples of the second group */
|
||||
vld1.32 {d20}, [r3]! /* load two multipliers: d20[0] and d20[1] */
|
||||
subs r0, r0, #2 /* two groups consumed; sets the flags tested by bgt/bxlt below */
|
||||
vcvt.f32.s32 q0, q0 /* int32 -> float, first group */
|
||||
vcvt.f32.s32 q1, q1
|
||||
vdup.32 q8, d20[0] /* broadcast the first multiplier to all lanes of q8 */
|
||||
vcvt.f32.s32 q2, q2 /* int32 -> float, second group */
|
||||
vcvt.f32.s32 q3, q3
|
||||
vmul.f32 q0, q0, q8 /* scale the first group */
|
||||
vdup.32 q9, d20[1] /* broadcast the second multiplier to all lanes of q9 */
|
||||
vmul.f32 q1, q1, q8
|
||||
vmul.f32 q2, q2, q9 /* scale the second group */
|
||||
vmul.f32 q3, q3, q9
|
||||
vst1.32 {q0-q1}, [r1,:128]! /* store 8 floats of the first group */
|
||||
vst1.32 {q2-q3}, [r1,:128]! /* store 8 floats of the second group */
|
||||
bgt 2b /* r0 > 0: at least two groups remain, loop again */
|
||||
it lt /* IT block so the conditional bx also assembles for Thumb-2 */
|
||||
bxlt lr /* r0 < 0: nothing remains, return */
|
||||
1:
/* Tail: reached when exactly one group of 8 samples is left (r0 == 0). */
|
||||
vld1.32 {q0-q1}, [r2,:128] /* load the last 8 int32 samples */
|
||||
vld1.32 {d16[],d17[]}, [r3] /* load one multiplier replicated to all lanes of q8 */
|
||||
vcvt.f32.s32 q0, q0 /* int32 -> float */
|
||||
vcvt.f32.s32 q1, q1
|
||||
vmul.f32 q0, q0, q8 /* scale */
|
||||
vmul.f32 q1, q1, q8
|
||||
vst1.32 {q0-q1}, [r1,:128] /* store the last 8 floats */
|
||||
bx lr
|
||||
endfunc
|
||||
|
Loading…
Reference in New Issue
Block a user