mirror of
https://github.com/xenia-project/FFmpeg.git
synced 2024-11-24 12:09:55 +00:00
vp9mc/x86: add 16px functions (64bit only).
Signed-off-by: Anton Khirnov <anton@khirnov.net>
This commit is contained in:
parent
89466de4ae
commit
3a09494939
@ -64,6 +64,9 @@ ff_ ## avg ## _8tap_1d_ ## dir ## _ ## sz ## _ ## opt(uint8_t *dst, \
|
||||
|
||||
mc_funcs(4);
|
||||
mc_funcs(8);
|
||||
#if ARCH_X86_64
|
||||
mc_funcs(16);
|
||||
#endif
|
||||
|
||||
#undef mc_funcs
|
||||
#undef mc_func
|
||||
@ -95,7 +98,9 @@ ff_ ## avg ## _8tap_1d_ ## dir ## _ ## sz ## _ ## opt(uint8_t *dst, \
|
||||
mc_rep_func(put, sz, hsz, v, ssse3); \
|
||||
mc_rep_func(avg, sz, hsz, v, ssse3)
|
||||
|
||||
#if ARCH_X86_32
|
||||
mc_rep_funcs(16, 8);
|
||||
#endif
|
||||
mc_rep_funcs(32, 16);
|
||||
mc_rep_funcs(64, 32);
|
||||
|
||||
|
@ -144,6 +144,62 @@ INIT_XMM ssse3
|
||||
filter_h_fn put
|
||||
filter_h_fn avg
|
||||
|
||||
; ---------------------------------------------------------------------------
; filter_hx2_fn <put|avg>
; Emits <%1>_8tap_1d_h_<mmsize>: a horizontal 8-tap filter that writes one
; full vector (mmsize bytes) of output per row, for h rows.
; Named arguments, in cglobal order: dst, src, dstride, sstride, h, filtery.
; Vector registers: m0-m7 samples/products, m8-m11 filter coefficients,
; m12 scratch passed to SBUTTERFLY, m13 rounding constant.
; Only assembled on x86-64 (uses vector registers above m7).
; NOTE(review): dst is read/written with mova, so it presumably has to be
; mmsize-aligned -- confirm against the callers.
; ---------------------------------------------------------------------------
%if ARCH_X86_64
|
||||
%macro filter_hx2_fn 1
|
||||
; %%px becomes the size suffix of the generated function name.
%assign %%px mmsize
|
||||
; 6 arguments, 6 general-purpose registers, 14 vector registers (m0-m13).
cglobal %1_8tap_1d_h_ %+ %%px, 6, 6, 14, dst, src, dstride, sstride, h, filtery
|
||||
; pw_256 is defined outside this view; assuming it holds 256 in every word,
; pmulhrsw by it below is a rounded (x + 64) >> 7.
mova m13, [pw_256]
|
||||
; Four coefficient vectors, 16 bytes apart, loaded once outside the loop.
; NOTE(review): each vector presumably holds one pair of adjacent taps as
; bytes, matching the pmaddubsw pairing below -- confirm the table layout.
mova m8, [filteryq+ 0]
|
||||
mova m9, [filteryq+16]
|
||||
mova m10, [filteryq+32]
|
||||
mova m11, [filteryq+48]
|
||||
; One iteration per output row.
.loop:
|
||||
; Eight unaligned loads at byte offsets -3..+4 around src: one per tap.
movu m0, [srcq-3]
|
||||
movu m1, [srcq-2]
|
||||
movu m2, [srcq-1]
|
||||
movu m3, [srcq+0]
|
||||
movu m4, [srcq+1]
|
||||
movu m5, [srcq+2]
|
||||
movu m6, [srcq+3]
|
||||
movu m7, [srcq+4]
|
||||
; Advance to the next source row now that all loads are done.
add srcq, sstrideq
|
||||
; SBUTTERFLY is defined outside this view; presumably it byte-interleaves
; each register pair (low half -> first reg, high half -> second reg) using
; m12 as scratch, so even regs cover the first half of the row and odd regs
; the second half -- confirm in the macro definition.
SBUTTERFLY bw, 0, 1, 12
|
||||
SBUTTERFLY bw, 2, 3, 12
|
||||
SBUTTERFLY bw, 4, 5, 12
|
||||
SBUTTERFLY bw, 6, 7, 12
|
||||
; Multiply unsigned sample bytes by signed coefficient bytes and sum each
; adjacent pair into a signed word: one two-tap partial sum per register.
pmaddubsw m0, m8
|
||||
pmaddubsw m1, m8
|
||||
pmaddubsw m2, m9
|
||||
pmaddubsw m3, m9
|
||||
pmaddubsw m4, m10
|
||||
pmaddubsw m5, m10
|
||||
pmaddubsw m6, m11
|
||||
pmaddubsw m7, m11
|
||||
; Combine partial sums: first two levels use wrapping word adds.
paddw m0, m2
|
||||
paddw m1, m3
|
||||
paddw m4, m6
|
||||
paddw m5, m7
|
||||
; Final combine uses signed saturating adds.
paddsw m0, m4
|
||||
paddsw m1, m5
|
||||
; Rounded scale-down of the 16-bit sums (see the pw_256 note above).
pmulhrsw m0, m13
|
||||
pmulhrsw m1, m13
|
||||
; Pack both halves to unsigned bytes with saturation: m0 = one full row.
packuswb m0, m1
|
||||
; avg variant only: rounded byte average with what is already in dst.
%ifidn %1, avg
|
||||
pavgb m0, [dstq]
|
||||
%endif
|
||||
; Aligned store of the finished row, then step dst to the next row.
mova [dstq], m0
|
||||
add dstq, dstrideq
|
||||
; Count rows down; keep looping while the remaining count is > 0.
dec hd
|
||||
jg .loop
|
||||
RET
|
||||
%endmacro
|
||||
|
||||
; Instantiate the put and avg variants for XMM registers / SSSE3.
INIT_XMM ssse3
|
||||
filter_hx2_fn put
|
||||
filter_hx2_fn avg
|
||||
|
||||
%endif ; ARCH_X86_64
|
||||
|
||||
%macro filter_v_fn 1
|
||||
%assign %%px mmsize/2
|
||||
%if ARCH_X86_64
|
||||
@ -218,6 +274,72 @@ INIT_XMM ssse3
|
||||
filter_v_fn put
|
||||
filter_v_fn avg
|
||||
|
||||
; ---------------------------------------------------------------------------
; filter_vx2_fn <put|avg>
; Emits <%1>_8tap_1d_v_<mmsize>: a vertical 8-tap filter that writes one
; full vector (mmsize bytes) of output per row, for h rows.
; Named arguments, in cglobal order: dst, src, dstride, sstride, h, filtery;
; src4 and sstride3 are two extra general-purpose registers used as temps.
; Vector registers: m0-m7 samples/products, m8-m11 filter coefficients,
; m12 scratch passed to SBUTTERFLY, m13 rounding constant.
; Only assembled on x86-64 (uses vector registers above m7).
; NOTE(review): dst is read/written with mova, so it presumably has to be
; mmsize-aligned -- confirm against the callers.
; ---------------------------------------------------------------------------
%if ARCH_X86_64
|
||||
|
||||
%macro filter_vx2_fn 1
|
||||
; %%px becomes the size suffix of the generated function name.
%assign %%px mmsize
|
||||
; 6 arguments, 8 general-purpose registers, 14 vector registers (m0-m13).
cglobal %1_8tap_1d_v_ %+ %%px, 6, 8, 14, dst, src, dstride, sstride, h, filtery, src4, sstride3
|
||||
; The three "sub srcq, sstrideq" below move src up three rows in total, so
; the filter window starts three rows above the output row. They are
; interleaved with the independent setup loads.
sub srcq, sstrideq
|
||||
; sstride3 = 3 * sstride, used to address the 4th row of each group.
lea sstride3q, [sstrideq*3]
|
||||
sub srcq, sstrideq
|
||||
; pw_256 is defined outside this view; assuming it holds 256 in every word,
; pmulhrsw by it below is a rounded (x + 64) >> 7.
mova m13, [pw_256]
|
||||
sub srcq, sstrideq
|
||||
; Four coefficient vectors, 16 bytes apart, loaded once outside the loop.
; NOTE(review): each vector presumably holds one pair of adjacent taps as
; bytes, matching the pmaddubsw pairing below -- confirm the table layout.
mova m8, [filteryq+ 0]
|
||||
; src4 = (already adjusted) src + 4 rows: base of the lower four taps.
lea src4q, [srcq+sstrideq*4]
|
||||
mova m9, [filteryq+16]
|
||||
mova m10, [filteryq+32]
|
||||
mova m11, [filteryq+48]
|
||||
; One iteration per output row.
.loop:
|
||||
; FIXME maybe reuse loads from previous rows, or just
|
||||
; more generally unroll this to prevent multiple loads of
|
||||
; the same data?
|
||||
; Rows 0..3 of the 8-row window, addressed from src.
movu m0, [srcq]
|
||||
movu m1, [srcq+sstrideq]
|
||||
movu m2, [srcq+sstrideq*2]
|
||||
movu m3, [srcq+sstride3q]
|
||||
; Rows 4..7 of the window, addressed from src4.
movu m4, [src4q]
|
||||
movu m5, [src4q+sstrideq]
|
||||
movu m6, [src4q+sstrideq*2]
|
||||
movu m7, [src4q+sstride3q]
|
||||
; Slide both row pointers down by one row for the next iteration.
add srcq, sstrideq
|
||||
add src4q, sstrideq
|
||||
; SBUTTERFLY is defined outside this view; presumably it byte-interleaves
; each pair of rows (low half -> first reg, high half -> second reg) using
; m12 as scratch, so even regs cover the first half of the row and odd regs
; the second half -- confirm in the macro definition.
SBUTTERFLY bw, 0, 1, 12
|
||||
SBUTTERFLY bw, 2, 3, 12
|
||||
SBUTTERFLY bw, 4, 5, 12
|
||||
SBUTTERFLY bw, 6, 7, 12
|
||||
; Multiply unsigned sample bytes by signed coefficient bytes and sum each
; adjacent pair into a signed word: one two-tap partial sum per register.
pmaddubsw m0, m8
|
||||
pmaddubsw m1, m8
|
||||
pmaddubsw m2, m9
|
||||
pmaddubsw m3, m9
|
||||
pmaddubsw m4, m10
|
||||
pmaddubsw m5, m10
|
||||
pmaddubsw m6, m11
|
||||
pmaddubsw m7, m11
|
||||
; Combine partial sums: first two levels use wrapping word adds.
paddw m0, m2
|
||||
paddw m1, m3
|
||||
paddw m4, m6
|
||||
paddw m5, m7
|
||||
; Final combine uses signed saturating adds.
paddsw m0, m4
|
||||
paddsw m1, m5
|
||||
; Rounded scale-down of the 16-bit sums (see the pw_256 note above).
pmulhrsw m0, m13
|
||||
pmulhrsw m1, m13
|
||||
; Pack both halves to unsigned bytes with saturation: m0 = one full row.
packuswb m0, m1
|
||||
; avg variant only: rounded byte average with what is already in dst.
%ifidn %1, avg
|
||||
pavgb m0, [dstq]
|
||||
%endif
|
||||
; Aligned store of the finished row, then step dst to the next row.
mova [dstq], m0
|
||||
add dstq, dstrideq
|
||||
; Count rows down; keep looping while the remaining count is > 0.
dec hd
|
||||
jg .loop
|
||||
RET
|
||||
%endmacro
|
||||
|
||||
; Instantiate the put and avg variants for XMM registers / SSSE3.
INIT_XMM ssse3
|
||||
filter_vx2_fn put
|
||||
filter_vx2_fn avg
|
||||
|
||||
%endif ; ARCH_X86_64
|
||||
|
||||
%macro fpel_fn 6
|
||||
%if %2 == 4
|
||||
%define %%srcfn movh
|
||||
|
Loading…
Reference in New Issue
Block a user