vp9: 10/12bpp SIMD (sse2/ssse3/avx) for directional intra prediction.

This commit is contained in:
Ronald S. Bultje 2015-10-02 09:11:55 -04:00
parent 26ece7a511
commit 061b67fb50
6 changed files with 1562 additions and 2 deletions

View File

@ -85,3 +85,5 @@ DECLARE_ALIGNED(32, const ymm_reg, ff_pd_16) = { 0x0000001000000010ULL, 0x000
0x0000001000000010ULL, 0x0000001000000010ULL };
/* Every 32-bit half of each 64-bit initializer below is 0x20 (decimal 32). */
DECLARE_ALIGNED(32, const ymm_reg, ff_pd_32) = { 0x0000002000000020ULL, 0x0000002000000020ULL,
0x0000002000000020ULL, 0x0000002000000020ULL };
/*
 * Every 32-bit half of each 64-bit initializer below is 0x0000ffff
 * (decimal 65535), i.e. a mask that keeps the low 16 bits of a 32-bit value.
 * The four 64-bit words add up to 32 bytes, the alignment requested here.
 */
DECLARE_ALIGNED(32, const ymm_reg, ff_pd_65535)= { 0x0000ffff0000ffffULL, 0x0000ffff0000ffffULL,
0x0000ffff0000ffffULL, 0x0000ffff0000ffffULL };

View File

@ -65,5 +65,6 @@ extern const xmm_reg ff_ps_neg;
/*
 * ymm_reg-typed constants.
 * NOTE(review): the pd_N names presumably mean "N replicated in every
 * 32-bit element" -- confirm against the definitions in the .c file.
 */
extern const ymm_reg ff_pd_1;
extern const ymm_reg ff_pd_16;
extern const ymm_reg ff_pd_32;
extern const ymm_reg ff_pd_65535;
#endif /* AVCODEC_X86_CONSTANTS_H */

View File

@ -26,6 +26,7 @@
SECTION_RODATA 32
; Constants that are defined outside this file and only referenced here.
; NOTE(review): pd_65535 presumably resolves to the C-side ff_pd_65535 via
; the symbol prefix that cextern applies -- confirm in x86inc.
cextern pd_65535
cextern pw_1023
; pw_pixel_max is a plain alias for pw_1023 (1023 = 2^10 - 1).
%define pw_pixel_max pw_1023
cextern pw_16
@ -42,7 +43,6 @@ unpad: times 8 dw 16*1022/32 ; needs to be mod 16
; Each tapN table is one pair of 16-bit words repeated four times
; (8 words = 16 bytes).
; NOTE(review): read in order the pairs spell 1,-5,20,20,-5,1, which looks
; like a 6-tap filter split into three pairs -- confirm against the code
; that loads them.
tap1: times 4 dw 1, -5
tap2: times 4 dw 20, 20
tap3: times 4 dw -5, 1
; Four 32-bit words of 0xffff: a mask for the low 16 bits of each dword.
; NOTE(review): this holds the same per-dword value as the shared pd_65535
; constant; the surrounding hunk shrinks by one line, which suggests this
; local copy is the line being removed -- confirm.
pd_0f: times 4 dd 0xffff
SECTION .text
@ -708,7 +708,7 @@ h%1_loop_op:
psrad m1, 10
psrad m2, 10
pslld m2, 16
pand m1, [pd_0f]
pand m1, [pd_65535]
por m1, m2
%if num_mmregs <= 8
pxor m0, m0

View File

@ -165,6 +165,10 @@ filters_8tap_2d_fn(op, 4, align, bpp, bytes, opt4, f_opt)
init_ipred_func(type, enum, 16, bpp, opt); \
init_ipred_func(type, enum, 32, bpp, opt)
/*
 * Expands to init_ipred_func with 4 as the size argument, followed by
 * init_8_16_32_ipred_funcs with the same type/enum/bpp/opt arguments.
 *
 * The expansion is two separate statements and is not wrapped in
 * do { } while (0), so it must only be used inside a braced block; as the
 * body of an unbraced if/else only the first statement would be guarded.
 */
#define init_ipred_funcs(type, enum, bpp, opt) \
init_ipred_func(type, enum, 4, bpp, opt); \
init_8_16_32_ipred_funcs(type, enum, bpp, opt)
/*
 * x86 initialisation entry points, each taking the VP9DSPContext to set up.
 * NOTE(review): going by the names, the 10bpp and 12bpp variants are
 * bit-depth specific and the 16bpp one is presumably shared between them --
 * confirm against their definitions.
 */
void ff_vp9dsp_init_10bpp_x86(VP9DSPContext *dsp);
void ff_vp9dsp_init_12bpp_x86(VP9DSPContext *dsp);
void ff_vp9dsp_init_16bpp_x86(VP9DSPContext *dsp);

View File

@ -51,6 +51,18 @@ decl_ipred_fns(h, 16, mmxext, sse2);
decl_ipred_fns(dc, 16, mmxext, sse2);
decl_ipred_fns(dc_top, 16, mmxext, sse2);
decl_ipred_fns(dc_left, 16, mmxext, sse2);
/*
 * Invokes decl_ipred_fns three times for one prediction mode `type`,
 * once per instruction-set suffix: sse2, ssse3 and avx.
 * NOTE(review): the literal 16 and the doubled suffix argument are passed
 * straight through to decl_ipred_fns, whose definition is not visible
 * here -- presumably a bpp tag and two per-size opt suffixes; confirm.
 */
#define decl_ipred_dir_funcs(type) \
decl_ipred_fns(type, 16, sse2, sse2); \
decl_ipred_fns(type, 16, ssse3, ssse3); \
decl_ipred_fns(type, 16, avx, avx)
/*
 * One set of declarations per directional mode. The short names pair with
 * the enum values used in ff_vp9dsp_init_16bpp_x86:
 *   dl = DIAG_DOWN_LEFT, dr = DIAG_DOWN_RIGHT,
 *   vl = VERT_LEFT,      vr = VERT_RIGHT,
 *   hu = HOR_UP,         hd = HOR_DOWN.
 */
decl_ipred_dir_funcs(dl);
decl_ipred_dir_funcs(dr);
decl_ipred_dir_funcs(vl);
decl_ipred_dir_funcs(vr);
decl_ipred_dir_funcs(hu);
decl_ipred_dir_funcs(hd);
#endif /* HAVE_YASM */
av_cold void ff_vp9dsp_init_16bpp_x86(VP9DSPContext *dsp)
@ -88,12 +100,33 @@ av_cold void ff_vp9dsp_init_16bpp_x86(VP9DSPContext *dsp)
init_8_16_32_ipred_funcs(dc, DC, 16, sse2);
init_8_16_32_ipred_funcs(dc_top, TOP_DC, 16, sse2);
init_8_16_32_ipred_funcs(dc_left, LEFT_DC, 16, sse2);
init_ipred_funcs(dl, DIAG_DOWN_LEFT, 16, sse2);
init_ipred_funcs(dr, DIAG_DOWN_RIGHT, 16, sse2);
init_ipred_funcs(vl, VERT_LEFT, 16, sse2);
init_ipred_funcs(vr, VERT_RIGHT, 16, sse2);
init_ipred_funcs(hu, HOR_UP, 16, sse2);
init_ipred_funcs(hd, HOR_DOWN, 16, sse2);
}
if (EXTERNAL_SSSE3(cpu_flags)) {
init_ipred_funcs(dl, DIAG_DOWN_LEFT, 16, ssse3);
init_ipred_funcs(dr, DIAG_DOWN_RIGHT, 16, ssse3);
init_ipred_funcs(vl, VERT_LEFT, 16, ssse3);
init_ipred_funcs(vr, VERT_RIGHT, 16, ssse3);
init_ipred_funcs(hu, HOR_UP, 16, ssse3);
init_ipred_funcs(hd, HOR_DOWN, 16, ssse3);
}
if (EXTERNAL_AVX_FAST(cpu_flags)) {
init_fpel_func(2, 0, 32, put, , avx);
init_fpel_func(1, 0, 64, put, , avx);
init_fpel_func(0, 0, 128, put, , avx);
init_ipred_funcs(dl, DIAG_DOWN_LEFT, 16, avx);
init_ipred_funcs(dr, DIAG_DOWN_RIGHT, 16, avx);
init_ipred_funcs(vl, VERT_LEFT, 16, avx);
init_ipred_funcs(vr, VERT_RIGHT, 16, avx);
init_ipred_funcs(hu, HOR_UP, 16, avx);
init_ipred_funcs(hd, HOR_DOWN, 16, avx);
}
if (EXTERNAL_AVX2(cpu_flags)) {

File diff suppressed because it is too large Load Diff