mirror of
https://github.com/xenia-project/FFmpeg.git
synced 2025-02-10 14:24:10 +00:00
opus_pvq_search: Restore the proper use of conditional define and simplify the function name suffix handling.
Using named define properly documents the code paths. It also avoids passing additional numbered arguments through multiple levels of macro templates. The suffix handling is done by concatenation, like in other asm functions and avoid having two separate "cglobal" defines. Signed-off-by: Ivan Kalvachev <ikalvachev@gmail.com>
This commit is contained in:
parent
1146133df8
commit
43dab86bcd
@ -82,7 +82,7 @@ SECTION .text
|
|||||||
%endif
|
%endif
|
||||||
%endmacro
|
%endmacro
|
||||||
|
|
||||||
%macro PULSES_SEARCH 2 ; %1 - add or sub, %2 - use approximation
|
%macro PULSES_SEARCH 1
|
||||||
; m6 Syy_norm
|
; m6 Syy_norm
|
||||||
; m7 Sxy_norm
|
; m7 Sxy_norm
|
||||||
addps m6, mm_const_float_0_5 ; Syy_norm += 1.0/2
|
addps m6, mm_const_float_0_5 ; Syy_norm += 1.0/2
|
||||||
@ -96,17 +96,17 @@ align 16
|
|||||||
movaps m4, [tmpY + r4] ; y[i]
|
movaps m4, [tmpY + r4] ; y[i]
|
||||||
movaps m5, [tmpX + r4] ; X[i]
|
movaps m5, [tmpX + r4] ; X[i]
|
||||||
|
|
||||||
%if %2
|
%if USE_APPROXIMATION == 1
|
||||||
xorps m0, m0
|
xorps m0, m0
|
||||||
cmpps m0, m0, m5, 4 ; m0 = (X[i] != 0.0)
|
cmpps m0, m0, m5, 4 ; m0 = (X[i] != 0.0)
|
||||||
%endif
|
%endif
|
||||||
|
|
||||||
addps m4, m6 ; m4 = Syy_new = y[i] + Syy_norm
|
addps m4, m6 ; m4 = Syy_new = y[i] + Syy_norm
|
||||||
addps m5, m7 ; m5 = Sxy_new = X[i] + Sxy_norm
|
addps m5, m7 ; m5 = Sxy_new = X[i] + Sxy_norm
|
||||||
|
|
||||||
%if %2
|
%if USE_APPROXIMATION == 1
|
||||||
andps m5, m0 ; if(X[i] == 0) Sxy_new = 0; Prevent aproximation error from setting pulses in array padding.
|
andps m5, m0 ; if(X[i] == 0) Sxy_new = 0; Prevent aproximation error from setting pulses in array padding.
|
||||||
%endif
|
%endif
|
||||||
|
|
||||||
%else
|
%else
|
||||||
movaps m5, [tmpY + r4] ; m5 = y[i]
|
movaps m5, [tmpY + r4] ; m5 = y[i]
|
||||||
@ -119,7 +119,7 @@ align 16
|
|||||||
andps m5, m0 ; (0<y)?m5:0
|
andps m5, m0 ; (0<y)?m5:0
|
||||||
%endif
|
%endif
|
||||||
|
|
||||||
%if %2
|
%if USE_APPROXIMATION == 1
|
||||||
rsqrtps m4, m4
|
rsqrtps m4, m4
|
||||||
mulps m5, m4 ; m5 = p = Sxy_new*approx(1/sqrt(Syy) )
|
mulps m5, m4 ; m5 = p = Sxy_new*approx(1/sqrt(Syy) )
|
||||||
%else
|
%else
|
||||||
@ -211,13 +211,8 @@ align 16
|
|||||||
; uint32 K - Number of pulses to have after quantizations.
|
; uint32 K - Number of pulses to have after quantizations.
|
||||||
; uint32 N - Number of vector elements. Must be 0 < N < 256
|
; uint32 N - Number of vector elements. Must be 0 < N < 256
|
||||||
;
|
;
|
||||||
%macro PVQ_FAST_SEARCH 1 ; %1 - use approximation
|
%macro PVQ_FAST_SEARCH 1
|
||||||
%if %1
|
cglobal pvq_search%1, 4, 5+num_pic_regs, 11, 256*4, inX, outY, K, N
|
||||||
cglobal pvq_search_approx, 4, 5+num_pic_regs, 11, 256*4, inX, outY, K, N
|
|
||||||
%else
|
|
||||||
cglobal pvq_search_exact, 4, 5+num_pic_regs, 11, 256*4, inX, outY, K, N
|
|
||||||
%endif
|
|
||||||
|
|
||||||
%define tmpX rsp
|
%define tmpX rsp
|
||||||
%define tmpY outYq
|
%define tmpY outYq
|
||||||
|
|
||||||
@ -260,7 +255,7 @@ align 16
|
|||||||
jz %%zero_input ; if (Sx==0) goto zero_input
|
jz %%zero_input ; if (Sx==0) goto zero_input
|
||||||
|
|
||||||
cvtsi2ss xm0, dword Kd ; m0 = K
|
cvtsi2ss xm0, dword Kd ; m0 = K
|
||||||
%if %1
|
%if USE_APPROXIMATION == 1
|
||||||
rcpss xm1, xm1 ; m1 = approx(1/Sx)
|
rcpss xm1, xm1 ; m1 = approx(1/Sx)
|
||||||
mulss xm0, xm1 ; m0 = K*(1/Sx)
|
mulss xm0, xm1 ; m0 = K*(1/Sx)
|
||||||
%else
|
%else
|
||||||
@ -313,7 +308,7 @@ align 16
|
|||||||
align 16 ; K - pulses > 0
|
align 16 ; K - pulses > 0
|
||||||
%%add_pulses_loop:
|
%%add_pulses_loop:
|
||||||
|
|
||||||
PULSES_SEARCH add, %1 ; m6 Syy_norm ; m7 Sxy_norm
|
PULSES_SEARCH add ; m6 Syy_norm ; m7 Sxy_norm
|
||||||
|
|
||||||
sub Kd, 1
|
sub Kd, 1
|
||||||
jnz %%add_pulses_loop
|
jnz %%add_pulses_loop
|
||||||
@ -325,7 +320,7 @@ align 16 ; K - pulses > 0
|
|||||||
align 16
|
align 16
|
||||||
%%remove_pulses_loop:
|
%%remove_pulses_loop:
|
||||||
|
|
||||||
PULSES_SEARCH sub, %1 ; m6 Syy_norm ; m7 Sxy_norm
|
PULSES_SEARCH sub ; m6 Syy_norm ; m7 Sxy_norm
|
||||||
|
|
||||||
add Kd, 1
|
add Kd, 1
|
||||||
jnz %%remove_pulses_loop
|
jnz %%remove_pulses_loop
|
||||||
@ -376,11 +371,15 @@ align 16
|
|||||||
; On Skylake & Ryzen the division is much faster (around 11c/3),
|
; On Skylake & Ryzen the division is much faster (around 11c/3),
|
||||||
; that makes the full precision code about 2% slower.
|
; that makes the full precision code about 2% slower.
|
||||||
; Opus also does use rsqrt approximation in their intrinsics code.
|
; Opus also does use rsqrt approximation in their intrinsics code.
|
||||||
|
%define USE_APPROXIMATION 1
|
||||||
|
|
||||||
INIT_XMM sse2
|
INIT_XMM sse2
|
||||||
PVQ_FAST_SEARCH 1
|
PVQ_FAST_SEARCH _approx
|
||||||
|
|
||||||
INIT_XMM sse4
|
INIT_XMM sse4
|
||||||
PVQ_FAST_SEARCH 1
|
PVQ_FAST_SEARCH _approx
|
||||||
|
|
||||||
|
%define USE_APPROXIMATION 0
|
||||||
|
|
||||||
INIT_XMM avx
|
INIT_XMM avx
|
||||||
PVQ_FAST_SEARCH 0
|
PVQ_FAST_SEARCH _exact
|
||||||
|
Loading…
x
Reference in New Issue
Block a user