mirror of
https://github.com/xenia-project/FFmpeg.git
synced 2024-11-27 21:40:34 +00:00
x86/aacpsdsp: add ff_ps_hybrid_synthesis_deint_{sse,sse4}
About 2x faster than the c version.
This commit is contained in:
parent
3c5a53cdfa
commit
e229df9478
@ -166,6 +166,129 @@ align 16
|
||||
jl .loop
|
||||
REP_RET
|
||||
|
||||
;***********************************************************
|
||||
;void ps_hybrid_synthesis_deint_sse4(float out[2][38][64],
|
||||
; float (*in)[32][2],
|
||||
; int i, int len)
|
||||
;***********************************************************
|
||||
%macro HYBRID_SYNTHESIS_DEINT 0
|
||||
cglobal ps_hybrid_synthesis_deint, 3, 7, 5, out, in, i, len, out0, out1, tmp
|
||||
%if cpuflag(sse4)
|
||||
%define MOVH movsd
|
||||
%else
|
||||
%define MOVH movlps
|
||||
%endif
|
||||
movsxdifnidn iq, id
|
||||
mov lend, 32 << 3
|
||||
lea outq, [outq+iq*4]
|
||||
mov tmpd, id
|
||||
shl tmpd, 8
|
||||
add inq, tmpq
|
||||
mov tmpd, 64
|
||||
sub tmpd, id
|
||||
mov id, tmpd
|
||||
|
||||
test id, 1
|
||||
jne .loop4
|
||||
test id, 2
|
||||
jne .loop8
|
||||
|
||||
align 16
|
||||
.loop16:
|
||||
mov out0q, outq
|
||||
mov out1q, 38*64*4
|
||||
add out1q, out0q
|
||||
mov tmpd, lend
|
||||
|
||||
.inner_loop16:
|
||||
movaps m0, [inq]
|
||||
movaps m1, [inq+lenq]
|
||||
movaps m2, [inq+lenq*2]
|
||||
movaps m3, [inq+3*32*2*4]
|
||||
TRANSPOSE4x4PS 0, 1, 2, 3, 4
|
||||
movaps [out0q], m0
|
||||
movaps [out1q], m1
|
||||
movaps [out0q+lenq], m2
|
||||
movaps [out1q+lenq], m3
|
||||
lea out0q, [out0q+lenq*2]
|
||||
lea out1q, [out1q+lenq*2]
|
||||
add inq, mmsize
|
||||
sub tmpd, mmsize
|
||||
jg .inner_loop16
|
||||
add outq, 16
|
||||
add inq, 3*32*2*4
|
||||
sub id, 4
|
||||
jg .loop16
|
||||
RET
|
||||
|
||||
align 16
|
||||
.loop8:
|
||||
mov out0q, outq
|
||||
mov out1q, 38*64*4
|
||||
add out1q, out0q
|
||||
mov tmpd, lend
|
||||
|
||||
.inner_loop8:
|
||||
movaps m0, [inq]
|
||||
movaps m1, [inq+lenq]
|
||||
SBUTTERFLYPS 0, 1, 2
|
||||
SBUTTERFLYPD 0, 1, 2
|
||||
MOVH [out0q], m0
|
||||
MOVH [out1q], m1
|
||||
movhps [out0q+lenq], m0
|
||||
movhps [out1q+lenq], m1
|
||||
lea out0q, [out0q+lenq*2]
|
||||
lea out1q, [out1q+lenq*2]
|
||||
add inq, mmsize
|
||||
sub tmpd, mmsize
|
||||
jg .inner_loop8
|
||||
add outq, 8
|
||||
add inq, lenq
|
||||
sub id, 2
|
||||
jg .loop16
|
||||
RET
|
||||
|
||||
align 16
|
||||
.loop4:
|
||||
mov out0q, outq
|
||||
mov out1q, 38*64*4
|
||||
add out1q, out0q
|
||||
mov tmpd, lend
|
||||
|
||||
.inner_loop4:
|
||||
movaps m0, [inq]
|
||||
movss [out0q], m0
|
||||
%if cpuflag(sse4)
|
||||
extractps [out1q], m0, 1
|
||||
extractps [out0q+lenq], m0, 2
|
||||
extractps [out1q+lenq], m0, 3
|
||||
%else
|
||||
movhlps m1, m0
|
||||
movss [out0q+lenq], m1
|
||||
shufps m0, m0, 0xb1
|
||||
movss [out1q], m0
|
||||
movhlps m1, m0
|
||||
movss [out1q+lenq], m1
|
||||
%endif
|
||||
lea out0q, [out0q+lenq*2]
|
||||
lea out1q, [out1q+lenq*2]
|
||||
add inq, mmsize
|
||||
sub tmpd, mmsize
|
||||
jg .inner_loop4
|
||||
add outq, 4
|
||||
sub id, 1
|
||||
test id, 2
|
||||
jne .loop8
|
||||
cmp id, 4
|
||||
jge .loop16
|
||||
RET
|
||||
%endmacro
|
||||
|
||||
INIT_XMM sse
|
||||
HYBRID_SYNTHESIS_DEINT
|
||||
INIT_XMM sse4
|
||||
HYBRID_SYNTHESIS_DEINT
|
||||
|
||||
;*******************************************************************
|
||||
;void ff_ps_hybrid_analysis_<opt>(float (*out)[2], float (*in)[2],
|
||||
; const float (*filter)[8][2],
|
||||
|
@ -40,6 +40,10 @@ void ff_ps_stereo_interpolate_sse3(float (*l)[2], float (*r)[2],
|
||||
void ff_ps_stereo_interpolate_ipdopd_sse3(float (*l)[2], float (*r)[2],
|
||||
float h[2][4], float h_step[2][4],
|
||||
int len);
|
||||
void ff_ps_hybrid_synthesis_deint_sse(float out[2][38][64], float (*in)[32][2],
|
||||
int i, int len);
|
||||
void ff_ps_hybrid_synthesis_deint_sse4(float out[2][38][64], float (*in)[32][2],
|
||||
int i, int len);
|
||||
|
||||
av_cold void ff_psdsp_init_x86(PSDSPContext *s)
|
||||
{
|
||||
@ -48,6 +52,7 @@ av_cold void ff_psdsp_init_x86(PSDSPContext *s)
|
||||
if (EXTERNAL_SSE(cpu_flags)) {
|
||||
s->add_squares = ff_ps_add_squares_sse;
|
||||
s->mul_pair_single = ff_ps_mul_pair_single_sse;
|
||||
s->hybrid_synthesis_deint = ff_ps_hybrid_synthesis_deint_sse;
|
||||
s->hybrid_analysis = ff_ps_hybrid_analysis_sse;
|
||||
}
|
||||
if (EXTERNAL_SSE3(cpu_flags)) {
|
||||
@ -56,4 +61,7 @@ av_cold void ff_psdsp_init_x86(PSDSPContext *s)
|
||||
s->stereo_interpolate[1] = ff_ps_stereo_interpolate_ipdopd_sse3;
|
||||
s->hybrid_analysis = ff_ps_hybrid_analysis_sse3;
|
||||
}
|
||||
if (EXTERNAL_SSE4(cpu_flags)) {
|
||||
s->hybrid_synthesis_deint = ff_ps_hybrid_synthesis_deint_sse4;
|
||||
}
|
||||
}
|
||||
|
@ -71,6 +71,12 @@
|
||||
SWAP %1, %3, %2
|
||||
%endmacro
|
||||
|
||||
%macro SBUTTERFLYPD 3
|
||||
movlhps m%3, m%1, m%2
|
||||
movhlps m%2, m%2, m%1
|
||||
SWAP %1, %3
|
||||
%endmacro
|
||||
|
||||
%macro TRANSPOSE4x4B 5
|
||||
SBUTTERFLY bw, %1, %2, %5
|
||||
SBUTTERFLY bw, %3, %4, %5
|
||||
@ -117,12 +123,9 @@
|
||||
%macro TRANSPOSE4x4PS 5
|
||||
SBUTTERFLYPS %1, %2, %5
|
||||
SBUTTERFLYPS %3, %4, %5
|
||||
movlhps m%5, m%1, m%3
|
||||
movhlps m%3, m%1
|
||||
SWAP %5, %1
|
||||
movlhps m%5, m%2, m%4
|
||||
movhlps m%4, m%2
|
||||
SWAP %5, %2, %3
|
||||
SBUTTERFLYPD %1, %3, %5
|
||||
SBUTTERFLYPD %2, %4, %5
|
||||
SWAP %2, %3
|
||||
%endmacro
|
||||
|
||||
%macro TRANSPOSE8x4D 9-11
|
||||
|
Loading…
Reference in New Issue
Block a user