x86/exrdsp: optimize ff_reorder_pixels_avx2()

Tested with "checkasm --test=exrdsp -bench"

Before:
reorder_pixels_c: 5187.8
reorder_pixels_sse2: 377.0
reorder_pixels_avx2: 331.3

After:
reorder_pixels_c: 5181.5
reorder_pixels_sse2: 377.0
reorder_pixels_avx2: 313.8

Signed-off-by: James Almer <jamrial@gmail.com>
This commit is contained in:
Henrik Gramner 2017-09-17 22:52:13 -03:00 committed by James Almer
parent 3ffd3b7f5f
commit 18821e3ba1

View File

@@ -39,16 +39,15 @@ cglobal reorder_pixels, 3,4,3, dst, src1, size, src2
 neg sizeq ; size = offset for dst, src1, src2
 .loop:
-%if cpuflag(avx2)
-vpermq m0, [src1q + sizeq], 0xd8; load first part
-vpermq m1, [src2q + sizeq], 0xd8; load second part
-%else
 mova m0, [src1q+sizeq] ; load first part
 movu m1, [src2q+sizeq] ; load second part
-%endif
 SBUTTERFLY bw, 0, 1, 2 ; interleaved
-mova [dstq+2*sizeq ], m0 ; copy to dst
-mova [dstq+2*sizeq+mmsize], m1
+mova [dstq+2*sizeq ], xm0 ; copy to dst
+mova [dstq+2*sizeq+16], xm1
+%if cpuflag(avx2)
+vperm2i128 m0, m0, m1, q0301
+mova [dstq+2*sizeq+32], m0
+%endif
 add sizeq, mmsize
 jl .loop
 RET