x86/vf_ssim: add ff_ssim_4x4_line_xop

~20% faster than the ssse3 version. Unlike the ssse3 version, it is also enabled for x86_32.

Reviewed-by: Ronald S. Bultje <rsbultje@gmail.com>
Signed-off-by: James Almer <jamrial@gmail.com>
This commit is contained in:
James Almer 2015-07-20 00:50:52 -03:00
parent e1778fb657
commit e3851169ee
2 changed files with 64 additions and 3 deletions

View File

@ -30,16 +30,50 @@ ssim_c2: times 4 dd 235963 ;(.03*.03*255*255*64*63 + .5)
SECTION .text
%macro SSIM_4X4_LINE 1
%if ARCH_X86_64
INIT_XMM ssse3
cglobal ssim_4x4_line, 6, 8, 16, buf, buf_stride, ref, ref_stride, sums, w, buf_stride3, ref_stride3
cglobal ssim_4x4_line, 6, 8, %1, buf, buf_stride, ref, ref_stride, sums, w, buf_stride3, ref_stride3
%else
cglobal ssim_4x4_line, 5, 7, %1, buf, buf_stride, ref, ref_stride, sums, buf_stride3, ref_stride3
%define wd r5mp
%endif
lea ref_stride3q, [ref_strideq*3]
lea buf_stride3q, [buf_strideq*3]
%if notcpuflag(xop)
pxor m7, m7
mova m15, [pw_1]
%endif
.loop:
%if cpuflag(xop)
pmovzxbw m0, [bufq+buf_strideq*0]
pmovzxbw m1, [refq+ref_strideq*0]
pmaddwd m4, m0, m0
pmaddwd m6, m0, m1
pmovzxbw m2, [bufq+buf_strideq*1]
vpmadcswd m4, m1, m1, m4
pmovzxbw m3, [refq+ref_strideq*1]
paddw m0, m2
vpmadcswd m4, m2, m2, m4
vpmadcswd m6, m2, m3, m6
paddw m1, m3
vpmadcswd m4, m3, m3, m4
pmovzxbw m2, [bufq+buf_strideq*2]
pmovzxbw m3, [refq+ref_strideq*2]
vpmadcswd m4, m2, m2, m4
vpmadcswd m6, m2, m3, m6
pmovzxbw m5, [bufq+buf_stride3q]
pmovzxbw m7, [refq+ref_stride3q]
vpmadcswd m4, m3, m3, m4
vpmadcswd m6, m5, m7, m6
paddw m0, m2
paddw m1, m3
vpmadcswd m4, m5, m5, m4
paddw m0, m5
paddw m1, m7
vpmadcswd m4, m7, m7, m4
%else
movh m0, [bufq+buf_strideq*0] ; a1
movh m1, [refq+ref_strideq*0] ; b1
movh m2, [bufq+buf_strideq*1] ; a2
@ -85,12 +119,25 @@ cglobal ssim_4x4_line, 6, 8, 16, buf, buf_stride, ref, ref_stride, sums, w, buf_
paddd m4, m9
paddd m6, m14
paddd m4, m12
%endif
; m0 = [word] s1 a,a,a,a,b,b,b,b
; m1 = [word] s2 a,a,a,a,b,b,b,b
; m4 = [dword] ss a,a,b,b
; m6 = [dword] s12 a,a,b,b
%if cpuflag(xop)
vphaddwq m0, m0 ; [dword] s1 a, 0, b, 0
vphaddwq m1, m1 ; [dword] s2 a, 0, b, 0
vphadddq m4, m4 ; [dword] ss a, 0, b, 0
vphadddq m6, m6 ; [dword] s12 a, 0, b, 0
punpckhdq m2, m0, m1 ; [dword] s1 b, s2 b, 0, 0
punpckldq m0, m1 ; [dword] s1 a, s2 a, 0, 0
punpckhdq m3, m4, m6 ; [dword] ss b, s12 b, 0, 0
punpckldq m4, m6 ; [dword] ss a, s12 a, 0, 0
punpcklqdq m1, m2, m3 ; [dword] b s1, s2, ss, s12
punpcklqdq m0, m4 ; [dword] a s1, s2, ss, s12
%else
pmaddwd m0, m15 ; [dword] s1 a,a,b,b
pmaddwd m1, m15 ; [dword] s2 a,a,b,b
phaddd m0, m4 ; [dword] s1 a, b, ss a, b
@ -99,6 +146,7 @@ cglobal ssim_4x4_line, 6, 8, 16, buf, buf_stride, ref, ref_stride, sums, w, buf_
punpckldq m0, m1 ; [dword] s1 a, s2 a, s1 b, s2 b
punpckhqdq m1, m0, m2 ; [dword] b s1, s2, ss, s12
punpcklqdq m0, m2 ; [dword] a s1, s2, ss, s12
%endif
mova [sumsq+ 0], m0
mova [sumsq+mmsize], m1
@ -109,7 +157,15 @@ cglobal ssim_4x4_line, 6, 8, 16, buf, buf_stride, ref, ref_stride, sums, w, buf_
sub wd, mmsize/8
jg .loop
RET
%endmacro
; Instantiate the ssim_4x4_line function once per instruction set.
; The macro argument is forwarded as the third cglobal parameter
; (presumably the XMM register count, per x86inc convention -- confirm).

; ssse3 version: only assembled on x86-64. Its non-XOP path references m15,
; hence the argument of 16.
%if ARCH_X86_64
INIT_XMM ssse3
SSIM_4X4_LINE 16
%endif
; XOP version: assembled when HAVE_XOP_EXTERNAL is set. There is no
; ARCH_X86_64 guard here, so it is built for x86-32 as well; the XOP path
; visible in the macro only references m0-m7, hence the argument of 8.
%if HAVE_XOP_EXTERNAL
INIT_XMM xop
SSIM_4X4_LINE 8
%endif
INIT_XMM sse4

View File

@ -25,6 +25,9 @@
/*
 * Assembly implementations of ssim_4x4_line (generated by the SSIM_4X4_LINE
 * macro). Both variants share the same signature. Per the comments in the
 * assembly, each iteration stores two groups of four dword sums
 * (s1, s2, ss, s12) to sums.
 */
void ff_ssim_4x4_line_ssse3(const uint8_t *buf, ptrdiff_t buf_stride,
const uint8_t *ref, ptrdiff_t ref_stride,
int (*sums)[4], int w);
/* XOP variant; selected in ff_ssim_init_x86() when EXTERNAL_XOP(cpu_flags) is true. */
void ff_ssim_4x4_line_xop (const uint8_t *buf, ptrdiff_t buf_stride,
const uint8_t *ref, ptrdiff_t ref_stride,
int (*sums)[4], int w);
float ff_ssim_end_line_sse4(const int (*sum0)[4], const int (*sum1)[4], int w);
void ff_ssim_init_x86(SSIMDSPContext *dsp)
@ -35,4 +38,6 @@ void ff_ssim_init_x86(SSIMDSPContext *dsp)
dsp->ssim_4x4_line = ff_ssim_4x4_line_ssse3;
if (EXTERNAL_SSE4(cpu_flags))
dsp->ssim_end_line = ff_ssim_end_line_sse4;
if (EXTERNAL_XOP(cpu_flags))
dsp->ssim_4x4_line = ff_ssim_4x4_line_xop;
}