mirror of
https://gitee.com/openharmony/third_party_ffmpeg
synced 2024-11-23 19:30:05 +00:00
vp9: sse2/ssse3/avx 16bpp loopfilter x86 simd.
This commit is contained in:
parent
254c64c574
commit
db7786e8ff
@ -160,6 +160,7 @@ YASM-OBJS-$(CONFIG_VP6_DECODER) += x86/vp6dsp.o
|
||||
YASM-OBJS-$(CONFIG_VP9_DECODER) += x86/vp9intrapred.o \
|
||||
x86/vp9itxfm.o \
|
||||
x86/vp9lpf.o \
|
||||
x86/vp9lpf_16bpp.o \
|
||||
x86/vp9mc.o \
|
||||
x86/vp9mc_16bpp.o
|
||||
YASM-OBJS-$(CONFIG_WEBP_DECODER) += x86/vp8dsp.o
|
||||
|
@ -55,6 +55,8 @@ DECLARE_ALIGNED(32, const ymm_reg, ff_pw_1024) = { 0x0400040004000400ULL, 0x040
|
||||
0x0400040004000400ULL, 0x0400040004000400ULL};
|
||||
DECLARE_ALIGNED(32, const ymm_reg, ff_pw_2048) = { 0x0800080008000800ULL, 0x0800080008000800ULL,
|
||||
0x0800080008000800ULL, 0x0800080008000800ULL };
|
||||
DECLARE_ALIGNED(32, const ymm_reg, ff_pw_4095) = { 0x0fff0fff0fff0fffULL, 0x0fff0fff0fff0fffULL,
|
||||
0x0fff0fff0fff0fffULL, 0x0fff0fff0fff0fffULL };
|
||||
DECLARE_ALIGNED(32, const ymm_reg, ff_pw_4096) = { 0x1000100010001000ULL, 0x1000100010001000ULL,
|
||||
0x1000100010001000ULL, 0x1000100010001000ULL };
|
||||
DECLARE_ALIGNED(32, const ymm_reg, ff_pw_8192) = { 0x2000200020002000ULL, 0x2000200020002000ULL,
|
||||
|
@ -47,6 +47,7 @@ extern const ymm_reg ff_pw_512;
|
||||
extern const ymm_reg ff_pw_1023;
|
||||
extern const ymm_reg ff_pw_1024;
|
||||
extern const ymm_reg ff_pw_2048;
|
||||
extern const ymm_reg ff_pw_4095;
|
||||
extern const ymm_reg ff_pw_4096;
|
||||
extern const ymm_reg ff_pw_8192;
|
||||
extern const ymm_reg ff_pw_m1;
|
||||
|
@ -65,6 +65,62 @@ filters_8tap_1d_fn2(put, 16, BPC, avx2, 16bpp)
|
||||
filters_8tap_1d_fn2(avg, 16, BPC, avx2, 16bpp)
|
||||
#endif
|
||||
|
||||
#define decl_lpf_func(dir, wd, bpp, opt) \
|
||||
void ff_vp9_loop_filter_##dir##_##wd##_##bpp##_##opt(uint8_t *dst, ptrdiff_t stride, \
|
||||
int E, int I, int H)
|
||||
|
||||
#define decl_lpf_funcs(dir, wd, bpp) \
|
||||
decl_lpf_func(dir, wd, bpp, sse2); \
|
||||
decl_lpf_func(dir, wd, bpp, ssse3); \
|
||||
decl_lpf_func(dir, wd, bpp, avx)
|
||||
|
||||
#define decl_lpf_funcs_wd(dir) \
|
||||
decl_lpf_funcs(dir, 4, BPC); \
|
||||
decl_lpf_funcs(dir, 8, BPC); \
|
||||
decl_lpf_funcs(dir, 16, BPC)
|
||||
|
||||
decl_lpf_funcs_wd(h);
|
||||
decl_lpf_funcs_wd(v);
|
||||
|
||||
#define lpf_16_wrapper(dir, off, bpp, opt) \
|
||||
static void loop_filter_##dir##_16_##bpp##_##opt(uint8_t *dst, ptrdiff_t stride, \
|
||||
int E, int I, int H) \
|
||||
{ \
|
||||
ff_vp9_loop_filter_##dir##_16_##bpp##_##opt(dst, stride, E, I, H); \
|
||||
ff_vp9_loop_filter_##dir##_16_##bpp##_##opt(dst + off, stride, E, I, H); \
|
||||
}
|
||||
|
||||
#define lpf_16_wrappers(bpp, opt) \
|
||||
lpf_16_wrapper(h, 8 * stride, bpp, opt); \
|
||||
lpf_16_wrapper(v, 16, bpp, opt)
|
||||
|
||||
lpf_16_wrappers(BPC, sse2);
|
||||
lpf_16_wrappers(BPC, ssse3);
|
||||
lpf_16_wrappers(BPC, avx);
|
||||
|
||||
#define lpf_mix2_wrapper(dir, off, wd1, wd2, bpp, opt) \
|
||||
static void loop_filter_##dir##_##wd1##wd2##_##bpp##_##opt(uint8_t *dst, ptrdiff_t stride, \
|
||||
int E, int I, int H) \
|
||||
{ \
|
||||
ff_vp9_loop_filter_##dir##_##wd1##_##bpp##_##opt(dst, stride, \
|
||||
E & 0xff, I & 0xff, H & 0xff); \
|
||||
ff_vp9_loop_filter_##dir##_##wd2##_##bpp##_##opt(dst + off, stride, \
|
||||
E >> 8, I >> 8, H >> 8); \
|
||||
}
|
||||
|
||||
#define lpf_mix2_wrappers(wd1, wd2, bpp, opt) \
|
||||
lpf_mix2_wrapper(h, 8 * stride, wd1, wd2, bpp, opt); \
|
||||
lpf_mix2_wrapper(v, 16, wd1, wd2, bpp, opt)
|
||||
|
||||
#define lpf_mix2_wrappers_set(bpp, opt) \
|
||||
lpf_mix2_wrappers(4, 4, bpp, opt); \
|
||||
lpf_mix2_wrappers(4, 8, bpp, opt); \
|
||||
lpf_mix2_wrappers(8, 4, bpp, opt); \
|
||||
lpf_mix2_wrappers(8, 8, bpp, opt); \
|
||||
|
||||
lpf_mix2_wrappers_set(BPC, sse2);
|
||||
lpf_mix2_wrappers_set(BPC, ssse3);
|
||||
lpf_mix2_wrappers_set(BPC, avx);
|
||||
#endif /* HAVE_YASM */
|
||||
|
||||
av_cold void INIT_FUNC(VP9DSPContext *dsp)
|
||||
@ -72,9 +128,43 @@ av_cold void INIT_FUNC(VP9DSPContext *dsp)
|
||||
#if HAVE_YASM
|
||||
int cpu_flags = av_get_cpu_flags();
|
||||
|
||||
#define init_lpf_8_func(idx1, idx2, dir, wd, bpp, opt) \
|
||||
dsp->loop_filter_8[idx1][idx2] = ff_vp9_loop_filter_##dir##_##wd##_##bpp##_##opt
|
||||
#define init_lpf_16_func(idx, dir, bpp, opt) \
|
||||
dsp->loop_filter_16[idx] = loop_filter_##dir##_16_##bpp##_##opt
|
||||
#define init_lpf_mix2_func(idx1, idx2, idx3, dir, wd1, wd2, bpp, opt) \
|
||||
dsp->loop_filter_mix2[idx1][idx2][idx3] = loop_filter_##dir##_##wd1##wd2##_##bpp##_##opt
|
||||
|
||||
#define init_lpf_funcs(bpp, opt) \
|
||||
init_lpf_8_func(0, 0, h, 4, bpp, opt); \
|
||||
init_lpf_8_func(0, 1, v, 4, bpp, opt); \
|
||||
init_lpf_8_func(1, 0, h, 8, bpp, opt); \
|
||||
init_lpf_8_func(1, 1, v, 8, bpp, opt); \
|
||||
init_lpf_8_func(2, 0, h, 16, bpp, opt); \
|
||||
init_lpf_8_func(2, 1, v, 16, bpp, opt); \
|
||||
init_lpf_16_func(0, h, bpp, opt); \
|
||||
init_lpf_16_func(1, v, bpp, opt); \
|
||||
init_lpf_mix2_func(0, 0, 0, h, 4, 4, bpp, opt); \
|
||||
init_lpf_mix2_func(0, 1, 0, h, 4, 8, bpp, opt); \
|
||||
init_lpf_mix2_func(1, 0, 0, h, 8, 4, bpp, opt); \
|
||||
init_lpf_mix2_func(1, 1, 0, h, 8, 8, bpp, opt); \
|
||||
init_lpf_mix2_func(0, 0, 1, v, 4, 4, bpp, opt); \
|
||||
init_lpf_mix2_func(0, 1, 1, v, 4, 8, bpp, opt); \
|
||||
init_lpf_mix2_func(1, 0, 1, v, 8, 4, bpp, opt); \
|
||||
init_lpf_mix2_func(1, 1, 1, v, 8, 8, bpp, opt)
|
||||
|
||||
if (EXTERNAL_SSE2(cpu_flags)) {
|
||||
init_subpel3(0, put, BPC, sse2);
|
||||
init_subpel3(1, avg, BPC, sse2);
|
||||
init_lpf_funcs(BPC, sse2);
|
||||
}
|
||||
|
||||
if (EXTERNAL_SSSE3(cpu_flags)) {
|
||||
init_lpf_funcs(BPC, ssse3);
|
||||
}
|
||||
|
||||
if (EXTERNAL_AVX(cpu_flags)) {
|
||||
init_lpf_funcs(BPC, avx);
|
||||
}
|
||||
|
||||
if (EXTERNAL_AVX2(cpu_flags)) {
|
||||
|
823
libavcodec/x86/vp9lpf_16bpp.asm
Normal file
823
libavcodec/x86/vp9lpf_16bpp.asm
Normal file
@ -0,0 +1,823 @@
|
||||
;******************************************************************************
|
||||
;* VP9 loop filter SIMD optimizations
|
||||
;*
|
||||
;* Copyright (C) 2015 Ronald S. Bultje <rsbultje@gmail.com>
|
||||
;*
|
||||
;* This file is part of FFmpeg.
|
||||
;*
|
||||
;* FFmpeg is free software; you can redistribute it and/or
|
||||
;* modify it under the terms of the GNU Lesser General Public
|
||||
;* License as published by the Free Software Foundation; either
|
||||
;* version 2.1 of the License, or (at your option) any later version.
|
||||
;*
|
||||
;* FFmpeg is distributed in the hope that it will be useful,
|
||||
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
;* Lesser General Public License for more details.
|
||||
;*
|
||||
;* You should have received a copy of the GNU Lesser General Public
|
||||
;* License along with FFmpeg; if not, write to the Free Software
|
||||
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
;******************************************************************************
|
||||
|
||||
%include "libavutil/x86/x86util.asm"
|
||||
|
||||
SECTION_RODATA
|
||||
|
||||
pw_511: times 16 dw 511
|
||||
pw_2047: times 16 dw 2047
|
||||
pw_16384: times 16 dw 16384
|
||||
pw_m512: times 16 dw -512
|
||||
pw_m2048: times 16 dw -2048
|
||||
|
||||
cextern pw_1
|
||||
cextern pw_3
|
||||
cextern pw_4
|
||||
cextern pw_8
|
||||
cextern pw_16
|
||||
cextern pw_256
|
||||
cextern pw_1023
|
||||
cextern pw_4095
|
||||
cextern pw_m1
|
||||
|
||||
SECTION .text
|
||||
|
||||
%macro SCRATCH 3-4
|
||||
%if ARCH_X86_64
|
||||
SWAP %1, %2
|
||||
%if %0 == 4
|
||||
%define reg_%4 m%2
|
||||
%endif
|
||||
%else
|
||||
mova [%3], m%1
|
||||
%if %0 == 4
|
||||
%define reg_%4 [%3]
|
||||
%endif
|
||||
%endif
|
||||
%endmacro
|
||||
|
||||
%macro UNSCRATCH 3-4
|
||||
%if ARCH_X86_64
|
||||
SWAP %1, %2
|
||||
%else
|
||||
mova m%1, [%3]
|
||||
%endif
|
||||
%if %0 == 4
|
||||
%undef reg_%4
|
||||
%endif
|
||||
%endmacro
|
||||
|
||||
%macro PRELOAD 2-3
|
||||
%if ARCH_X86_64
|
||||
mova m%1, [%2]
|
||||
%if %0 == 3
|
||||
%define reg_%3 m%1
|
||||
%endif
|
||||
%elif %0 == 3
|
||||
%define reg_%3 [%2]
|
||||
%endif
|
||||
%endmacro
|
||||
|
||||
; calulate p or q portion of flat8out
|
||||
%macro FLAT8OUT_HALF 0
|
||||
psubw m4, m0 ; q4-q0
|
||||
psubw m5, m0 ; q5-q0
|
||||
psubw m6, m0 ; q6-q0
|
||||
psubw m7, m0 ; q7-q0
|
||||
ABS2 m4, m5, m2, m3 ; abs(q4-q0) | abs(q5-q0)
|
||||
ABS2 m6, m7, m2, m3 ; abs(q6-q0) | abs(q7-q0)
|
||||
pcmpgtw m4, reg_F ; abs(q4-q0) > F
|
||||
pcmpgtw m5, reg_F ; abs(q5-q0) > F
|
||||
pcmpgtw m6, reg_F ; abs(q6-q0) > F
|
||||
pcmpgtw m7, reg_F ; abs(q7-q0) > F
|
||||
por m5, m4
|
||||
por m7, m6
|
||||
por m7, m5 ; !flat8out, q portion
|
||||
%endmacro
|
||||
|
||||
; calculate p or q portion of flat8in/hev/fm (excluding mb_edge condition)
|
||||
%macro FLAT8IN_HALF 1
|
||||
%if %1 > 4
|
||||
psubw m4, m3, m0 ; q3-q0
|
||||
psubw m5, m2, m0 ; q2-q0
|
||||
ABS2 m4, m5, m6, m7 ; abs(q3-q0) | abs(q2-q0)
|
||||
pcmpgtw m4, reg_F ; abs(q3-q0) > F
|
||||
pcmpgtw m5, reg_F ; abs(q2-q0) > F
|
||||
%endif
|
||||
psubw m3, m2 ; q3-q2
|
||||
psubw m2, m1 ; q2-q1
|
||||
ABS2 m3, m2, m6, m7 ; abs(q3-q2) | abs(q2-q1)
|
||||
pcmpgtw m3, reg_I ; abs(q3-q2) > I
|
||||
pcmpgtw m2, reg_I ; abs(q2-q1) > I
|
||||
%if %1 > 4
|
||||
por m4, m5
|
||||
%endif
|
||||
por m2, m3
|
||||
psubw m3, m1, m0 ; q1-q0
|
||||
ABS1 m3, m5 ; abs(q1-q0)
|
||||
%if %1 > 4
|
||||
pcmpgtw m6, m3, reg_F ; abs(q1-q0) > F
|
||||
%endif
|
||||
pcmpgtw m7, m3, reg_H ; abs(q1-q0) > H
|
||||
pcmpgtw m3, reg_I ; abs(q1-q0) > I
|
||||
%if %1 > 4
|
||||
por m4, m6
|
||||
%endif
|
||||
por m2, m3
|
||||
%endmacro
|
||||
|
||||
; one step in filter_14/filter_6
|
||||
;
|
||||
; take sum $reg, downshift, apply mask and write into dst
|
||||
;
|
||||
; if sub2/add1-2 are present, add/sub as appropriate to prepare for the next
|
||||
; step's sum $reg. This is omitted for the last row in each filter.
|
||||
;
|
||||
; if dont_store is set, don't write the result into memory, instead keep the
|
||||
; values in register so we can write it out later
|
||||
%macro FILTER_STEP 6-10 "", "", "", 0 ; tmp, reg, mask, shift, dst, \
|
||||
; src/sub1, sub2, add1, add2, dont_store
|
||||
psrlw %1, %2, %4
|
||||
psubw %1, %6 ; abs->delta
|
||||
%ifnidn %7, ""
|
||||
psubw %2, %6
|
||||
psubw %2, %7
|
||||
paddw %2, %8
|
||||
paddw %2, %9
|
||||
%endif
|
||||
pand %1, reg_%3 ; apply mask
|
||||
%if %10 == 1
|
||||
paddw %6, %1 ; delta->abs
|
||||
%else
|
||||
paddw %1, %6 ; delta->abs
|
||||
mova [%5], %1
|
||||
%endif
|
||||
%endmacro
|
||||
|
||||
; FIXME avx2 versions for 16_16 and mix2_{4,8}{4,8}
|
||||
|
||||
%macro LOOP_FILTER 3 ; dir[h/v], wd[4/8/16], bpp[10/12]
|
||||
|
||||
%if ARCH_X86_64
|
||||
%if %2 == 16
|
||||
%assign %%num_xmm_regs 16
|
||||
%elif %2 == 8
|
||||
%assign %%num_xmm_regs 15
|
||||
%else ; %2 == 4
|
||||
%assign %%num_xmm_regs 14
|
||||
%endif ; %2
|
||||
%assign %%bak_mem 0
|
||||
%else ; ARCH_X86_32
|
||||
%assign %%num_xmm_regs 8
|
||||
%if %2 == 16
|
||||
%assign %%bak_mem 7
|
||||
%elif %2 == 8
|
||||
%assign %%bak_mem 6
|
||||
%else ; %2 == 4
|
||||
%assign %%bak_mem 5
|
||||
%endif ; %2
|
||||
%endif ; ARCH_X86_64/32
|
||||
|
||||
%if %2 == 16
|
||||
%ifidn %1, v
|
||||
%assign %%num_gpr_regs 6
|
||||
%else ; %1 == h
|
||||
%assign %%num_gpr_regs 5
|
||||
%endif ; %1
|
||||
%assign %%wd_mem 6
|
||||
%else ; %2 == 8/4
|
||||
%assign %%num_gpr_regs 5
|
||||
%if ARCH_X86_32 && %2 == 8
|
||||
%assign %%wd_mem 2
|
||||
%else ; ARCH_X86_64 || %2 == 4
|
||||
%assign %%wd_mem 0
|
||||
%endif ; ARCH_X86_64/32 etc.
|
||||
%endif ; %2
|
||||
|
||||
%ifidn %1, v
|
||||
%assign %%tsp_mem 0
|
||||
%elif %2 == 16 ; && %1 == h
|
||||
%assign %%tsp_mem 16
|
||||
%else ; %1 == h && %1 == 8/4
|
||||
%assign %%tsp_mem 8
|
||||
%endif ; %1/%2
|
||||
|
||||
%assign %%off %%wd_mem
|
||||
%assign %%tspoff %%bak_mem+%%wd_mem
|
||||
%assign %%stack_mem ((%%bak_mem+%%wd_mem+%%tsp_mem)*mmsize)
|
||||
|
||||
%if %3 == 10
|
||||
%define %%maxsgn 511
|
||||
%define %%minsgn m512
|
||||
%define %%maxusgn 1023
|
||||
%define %%maxf 4
|
||||
%else ; %3 == 12
|
||||
%define %%maxsgn 2047
|
||||
%define %%minsgn m2048
|
||||
%define %%maxusgn 4095
|
||||
%define %%maxf 16
|
||||
%endif ; %3
|
||||
|
||||
cglobal vp9_loop_filter_%1_%2_%3, 5, %%num_gpr_regs, %%num_xmm_regs, %%stack_mem, dst, stride, E, I, H
|
||||
; prepare E, I and H masks
|
||||
shl Ed, %3-8
|
||||
shl Id, %3-8
|
||||
shl Hd, %3-8
|
||||
%if cpuflag(ssse3)
|
||||
mova m0, [pw_256]
|
||||
%endif
|
||||
movd m1, Ed
|
||||
movd m2, Id
|
||||
movd m3, Hd
|
||||
%if cpuflag(ssse3)
|
||||
pshufb m1, m0 ; E << (bit_depth - 8)
|
||||
pshufb m2, m0 ; I << (bit_depth - 8)
|
||||
pshufb m3, m0 ; H << (bit_depth - 8)
|
||||
%else
|
||||
punpcklwd m1, m1
|
||||
punpcklwd m2, m2
|
||||
punpcklwd m3, m3
|
||||
pshufd m1, m1, q0000
|
||||
pshufd m2, m2, q0000
|
||||
pshufd m3, m3, q0000
|
||||
%endif
|
||||
SCRATCH 1, 8, rsp+(%%off+0)*mmsize, E
|
||||
SCRATCH 2, 9, rsp+(%%off+1)*mmsize, I
|
||||
SCRATCH 3, 10, rsp+(%%off+2)*mmsize, H
|
||||
%if %2 > 4
|
||||
PRELOAD 11, pw_ %+ %%maxf, F
|
||||
%endif
|
||||
|
||||
; set up variables to load data
|
||||
%ifidn %1, v
|
||||
DEFINE_ARGS dst8, stride, stride3, dst0, dst4, dst12
|
||||
lea stride3q, [strideq*3]
|
||||
neg strideq
|
||||
%if %2 == 16
|
||||
lea dst0q, [dst8q+strideq*8]
|
||||
%else
|
||||
lea dst4q, [dst8q+strideq*4]
|
||||
%endif
|
||||
neg strideq
|
||||
%if %2 == 16
|
||||
lea dst12q, [dst8q+strideq*4]
|
||||
lea dst4q, [dst0q+strideq*4]
|
||||
%endif
|
||||
|
||||
%if %2 == 16
|
||||
%define %%p7 dst0q
|
||||
%define %%p6 dst0q+strideq
|
||||
%define %%p5 dst0q+strideq*2
|
||||
%define %%p4 dst0q+stride3q
|
||||
%endif
|
||||
%define %%p3 dst4q
|
||||
%define %%p2 dst4q+strideq
|
||||
%define %%p1 dst4q+strideq*2
|
||||
%define %%p0 dst4q+stride3q
|
||||
%define %%q0 dst8q
|
||||
%define %%q1 dst8q+strideq
|
||||
%define %%q2 dst8q+strideq*2
|
||||
%define %%q3 dst8q+stride3q
|
||||
%if %2 == 16
|
||||
%define %%q4 dst12q
|
||||
%define %%q5 dst12q+strideq
|
||||
%define %%q6 dst12q+strideq*2
|
||||
%define %%q7 dst12q+stride3q
|
||||
%endif
|
||||
%else ; %1 == h
|
||||
DEFINE_ARGS dst0, stride, stride3, dst4
|
||||
lea stride3q, [strideq*3]
|
||||
lea dst4q, [dst0q+strideq*4]
|
||||
|
||||
%define %%p3 rsp+(%%tspoff+0)*mmsize
|
||||
%define %%p2 rsp+(%%tspoff+1)*mmsize
|
||||
%define %%p1 rsp+(%%tspoff+2)*mmsize
|
||||
%define %%p0 rsp+(%%tspoff+3)*mmsize
|
||||
%define %%q0 rsp+(%%tspoff+4)*mmsize
|
||||
%define %%q1 rsp+(%%tspoff+5)*mmsize
|
||||
%define %%q2 rsp+(%%tspoff+6)*mmsize
|
||||
%define %%q3 rsp+(%%tspoff+7)*mmsize
|
||||
|
||||
%if %2 < 16
|
||||
movu m0, [dst0q+strideq*0-8]
|
||||
movu m1, [dst0q+strideq*1-8]
|
||||
movu m2, [dst0q+strideq*2-8]
|
||||
movu m3, [dst0q+stride3q -8]
|
||||
movu m4, [dst4q+strideq*0-8]
|
||||
movu m5, [dst4q+strideq*1-8]
|
||||
movu m6, [dst4q+strideq*2-8]
|
||||
movu m7, [dst4q+stride3q -8]
|
||||
|
||||
%if ARCH_X86_64
|
||||
TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, 12
|
||||
%else
|
||||
TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, [%%p0], [%%q0]
|
||||
%endif
|
||||
|
||||
mova [%%p3], m0
|
||||
mova [%%p2], m1
|
||||
mova [%%p1], m2
|
||||
mova [%%p0], m3
|
||||
%if ARCH_X86_64
|
||||
mova [%%q0], m4
|
||||
%endif
|
||||
mova [%%q1], m5
|
||||
mova [%%q2], m6
|
||||
mova [%%q3], m7
|
||||
|
||||
; FIXME investigate if we can _not_ load q0-3 below if h, and adjust register
|
||||
; order here accordingly
|
||||
%else ; %2 == 16
|
||||
|
||||
%define %%p7 rsp+(%%tspoff+ 8)*mmsize
|
||||
%define %%p6 rsp+(%%tspoff+ 9)*mmsize
|
||||
%define %%p5 rsp+(%%tspoff+10)*mmsize
|
||||
%define %%p4 rsp+(%%tspoff+11)*mmsize
|
||||
%define %%q4 rsp+(%%tspoff+12)*mmsize
|
||||
%define %%q5 rsp+(%%tspoff+13)*mmsize
|
||||
%define %%q6 rsp+(%%tspoff+14)*mmsize
|
||||
%define %%q7 rsp+(%%tspoff+15)*mmsize
|
||||
|
||||
mova m0, [dst0q+strideq*0-16]
|
||||
mova m1, [dst0q+strideq*1-16]
|
||||
mova m2, [dst0q+strideq*2-16]
|
||||
mova m3, [dst0q+stride3q -16]
|
||||
mova m4, [dst4q+strideq*0-16]
|
||||
mova m5, [dst4q+strideq*1-16]
|
||||
%if ARCH_X86_64
|
||||
mova m6, [dst4q+strideq*2-16]
|
||||
%endif
|
||||
mova m7, [dst4q+stride3q -16]
|
||||
|
||||
%if ARCH_X86_64
|
||||
TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, 12
|
||||
%else
|
||||
TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, [dst4q+strideq*2-16], [%%p3], 1
|
||||
%endif
|
||||
|
||||
mova [%%p7], m0
|
||||
mova [%%p6], m1
|
||||
mova [%%p5], m2
|
||||
mova [%%p4], m3
|
||||
%if ARCH_X86_64
|
||||
mova [%%p3], m4
|
||||
%endif
|
||||
mova [%%p2], m5
|
||||
mova [%%p1], m6
|
||||
mova [%%p0], m7
|
||||
|
||||
mova m0, [dst0q+strideq*0]
|
||||
mova m1, [dst0q+strideq*1]
|
||||
mova m2, [dst0q+strideq*2]
|
||||
mova m3, [dst0q+stride3q ]
|
||||
mova m4, [dst4q+strideq*0]
|
||||
mova m5, [dst4q+strideq*1]
|
||||
%if ARCH_X86_64
|
||||
mova m6, [dst4q+strideq*2]
|
||||
%endif
|
||||
mova m7, [dst4q+stride3q ]
|
||||
|
||||
%if ARCH_X86_64
|
||||
TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, 12
|
||||
%else
|
||||
TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, [dst4q+strideq*2], [%%q4], 1
|
||||
%endif
|
||||
|
||||
mova [%%q0], m0
|
||||
mova [%%q1], m1
|
||||
mova [%%q2], m2
|
||||
mova [%%q3], m3
|
||||
%if ARCH_X86_64
|
||||
mova [%%q4], m4
|
||||
%endif
|
||||
mova [%%q5], m5
|
||||
mova [%%q6], m6
|
||||
mova [%%q7], m7
|
||||
|
||||
; FIXME investigate if we can _not_ load q0|q4-7 below if h, and adjust register
|
||||
; order here accordingly
|
||||
%endif ; %2
|
||||
%endif ; %1
|
||||
|
||||
; load q0|q4-7 data
|
||||
mova m0, [%%q0]
|
||||
%if %2 == 16
|
||||
mova m4, [%%q4]
|
||||
mova m5, [%%q5]
|
||||
mova m6, [%%q6]
|
||||
mova m7, [%%q7]
|
||||
|
||||
; flat8out q portion
|
||||
FLAT8OUT_HALF
|
||||
SCRATCH 7, 15, rsp+(%%off+6)*mmsize, F8O
|
||||
%endif
|
||||
|
||||
; load q1-3 data
|
||||
mova m1, [%%q1]
|
||||
mova m2, [%%q2]
|
||||
mova m3, [%%q3]
|
||||
|
||||
; r6-8|pw_4[m8-11]=reg_E/I/H/F
|
||||
; r9[m15]=!flatout[q]
|
||||
; m12-14=free
|
||||
; m0-3=q0-q3
|
||||
; m4-7=free
|
||||
|
||||
; flat8in|fm|hev q portion
|
||||
FLAT8IN_HALF %2
|
||||
SCRATCH 7, 13, rsp+(%%off+4)*mmsize, HEV
|
||||
%if %2 > 4
|
||||
SCRATCH 4, 14, rsp+(%%off+5)*mmsize, F8I
|
||||
%endif
|
||||
|
||||
; r6-8|pw_4[m8-11]=reg_E/I/H/F
|
||||
; r9[m15]=!flat8out[q]
|
||||
; r10[m13]=hev[q]
|
||||
; r11[m14]=!flat8in[q]
|
||||
; m2=!fm[q]
|
||||
; m0,1=q0-q1
|
||||
; m2-7=free
|
||||
; m12=free
|
||||
|
||||
; load p0-1
|
||||
mova m3, [%%p0]
|
||||
mova m4, [%%p1]
|
||||
|
||||
; fm mb_edge portion
|
||||
psubw m5, m3, m0 ; q0-p0
|
||||
psubw m6, m4, m1 ; q1-p1
|
||||
%if ARCH_X86_64
|
||||
ABS2 m5, m6, m7, m12 ; abs(q0-p0) | abs(q1-p1)
|
||||
%else
|
||||
ABS1 m5, m7 ; abs(q0-p0)
|
||||
ABS1 m6, m7 ; abs(q1-p1)
|
||||
%endif
|
||||
paddw m5, m5
|
||||
psraw m6, 1
|
||||
paddw m6, m5 ; abs(q0-p0)*2+(abs(q1-p1)>>1)
|
||||
pcmpgtw m6, reg_E
|
||||
por m2, m6
|
||||
SCRATCH 2, 12, rsp+(%%off+3)*mmsize, FM
|
||||
|
||||
; r6-8|pw_4[m8-11]=reg_E/I/H/F
|
||||
; r9[m15]=!flat8out[q]
|
||||
; r10[m13]=hev[q]
|
||||
; r11[m14]=!flat8in[q]
|
||||
; r12[m12]=!fm[q]
|
||||
; m3-4=q0-1
|
||||
; m0-2/5-7=free
|
||||
|
||||
; load p4-7 data
|
||||
SWAP 3, 0 ; p0
|
||||
SWAP 4, 1 ; p1
|
||||
%if %2 == 16
|
||||
mova m7, [%%p7]
|
||||
mova m6, [%%p6]
|
||||
mova m5, [%%p5]
|
||||
mova m4, [%%p4]
|
||||
|
||||
; flat8out p portion
|
||||
FLAT8OUT_HALF
|
||||
por m7, reg_F8O
|
||||
SCRATCH 7, 15, rsp+(%%off+6)*mmsize, F8O
|
||||
%endif
|
||||
|
||||
; r6-8|pw_4[m8-11]=reg_E/I/H/F
|
||||
; r9[m15]=!flat8out
|
||||
; r10[m13]=hev[q]
|
||||
; r11[m14]=!flat8in[q]
|
||||
; r12[m12]=!fm[q]
|
||||
; m0=p0
|
||||
; m1-7=free
|
||||
|
||||
; load p2-3 data
|
||||
mova m2, [%%p2]
|
||||
mova m3, [%%p3]
|
||||
|
||||
; flat8in|fm|hev p portion
|
||||
FLAT8IN_HALF %2
|
||||
por m7, reg_HEV
|
||||
%if %2 > 4
|
||||
por m4, reg_F8I
|
||||
%endif
|
||||
por m2, reg_FM
|
||||
%if %2 > 4
|
||||
por m4, m2 ; !flat8|!fm
|
||||
%if %2 == 16
|
||||
por m5, m4, reg_F8O ; !flat16|!fm
|
||||
pandn m2, m4 ; filter4_mask
|
||||
pandn m4, m5 ; filter8_mask
|
||||
pxor m5, [pw_m1] ; filter16_mask
|
||||
SCRATCH 5, 15, rsp+(%%off+6)*mmsize, F16M
|
||||
%else
|
||||
pandn m2, m4 ; filter4_mask
|
||||
pxor m4, [pw_m1] ; filter8_mask
|
||||
%endif
|
||||
SCRATCH 4, 14, rsp+(%%off+5)*mmsize, F8M
|
||||
%else
|
||||
pxor m2, [pw_m1] ; filter4_mask
|
||||
%endif
|
||||
SCRATCH 7, 13, rsp+(%%off+4)*mmsize, HEV
|
||||
SCRATCH 2, 12, rsp+(%%off+3)*mmsize, F4M
|
||||
|
||||
; r9[m15]=filter16_mask
|
||||
; r10[m13]=hev
|
||||
; r11[m14]=filter8_mask
|
||||
; r12[m12]=filter4_mask
|
||||
; m0,1=p0-p1
|
||||
; m2-7=free
|
||||
; m8-11=free
|
||||
|
||||
%if %2 > 4
|
||||
%if %2 == 16
|
||||
; filter_14
|
||||
mova m2, [%%p7]
|
||||
mova m3, [%%p6]
|
||||
mova m6, [%%p5]
|
||||
mova m7, [%%p4]
|
||||
PRELOAD 8, %%p3, P3
|
||||
PRELOAD 9, %%p2, P2
|
||||
%endif
|
||||
PRELOAD 10, %%q0, Q0
|
||||
PRELOAD 11, %%q1, Q1
|
||||
%if %2 == 16
|
||||
psllw m4, m2, 3
|
||||
paddw m5, m3, m3
|
||||
paddw m4, m6
|
||||
paddw m5, m7
|
||||
paddw m4, reg_P3
|
||||
paddw m5, reg_P2
|
||||
paddw m4, m1
|
||||
paddw m5, m0
|
||||
paddw m4, reg_Q0 ; q0+p1+p3+p5+p7*8
|
||||
psubw m5, m2 ; p0+p2+p4+p6*2-p7
|
||||
paddw m4, [pw_8]
|
||||
paddw m5, m4 ; q0+p0+p1+p2+p3+p4+p5+p6*2+p7*7+8
|
||||
|
||||
; below, we use r0-5 for storing pre-filter pixels for subsequent subtraction
|
||||
; at the end of the filter
|
||||
|
||||
mova [rsp+0*mmsize], m3
|
||||
FILTER_STEP m4, m5, F16M, 4, %%p6, m3, m2, m6, reg_Q1
|
||||
%endif
|
||||
mova m3, [%%q2]
|
||||
%if %2 == 16
|
||||
mova [rsp+1*mmsize], m6
|
||||
FILTER_STEP m4, m5, F16M, 4, %%p5, m6, m2, m7, m3
|
||||
%endif
|
||||
mova m6, [%%q3]
|
||||
%if %2 == 16
|
||||
mova [rsp+2*mmsize], m7
|
||||
FILTER_STEP m4, m5, F16M, 4, %%p4, m7, m2, reg_P3, m6
|
||||
mova m7, [%%q4]
|
||||
%if ARCH_X86_64
|
||||
mova [rsp+3*mmsize], reg_P3
|
||||
%else
|
||||
mova m4, reg_P3
|
||||
mova [rsp+3*mmsize], m4
|
||||
%endif
|
||||
FILTER_STEP m4, m5, F16M, 4, %%p3, reg_P3, m2, reg_P2, m7
|
||||
PRELOAD 8, %%q5, Q5
|
||||
%if ARCH_X86_64
|
||||
mova [rsp+4*mmsize], reg_P2
|
||||
%else
|
||||
mova m4, reg_P2
|
||||
mova [rsp+4*mmsize], m4
|
||||
%endif
|
||||
FILTER_STEP m4, m5, F16M, 4, %%p2, reg_P2, m2, m1, reg_Q5
|
||||
PRELOAD 9, %%q6, Q6
|
||||
mova [rsp+5*mmsize], m1
|
||||
FILTER_STEP m4, m5, F16M, 4, %%p1, m1, m2, m0, reg_Q6
|
||||
mova m1, [%%q7]
|
||||
FILTER_STEP m4, m5, F16M, 4, %%p0, m0, m2, reg_Q0, m1, 1
|
||||
FILTER_STEP m4, m5, F16M, 4, %%q0, reg_Q0, [rsp+0*mmsize], reg_Q1, m1, ARCH_X86_64
|
||||
FILTER_STEP m4, m5, F16M, 4, %%q1, reg_Q1, [rsp+1*mmsize], m3, m1, ARCH_X86_64
|
||||
FILTER_STEP m4, m5, F16M, 4, %%q2, m3, [rsp+2*mmsize], m6, m1, 1
|
||||
FILTER_STEP m4, m5, F16M, 4, %%q3, m6, [rsp+3*mmsize], m7, m1
|
||||
FILTER_STEP m4, m5, F16M, 4, %%q4, m7, [rsp+4*mmsize], reg_Q5, m1
|
||||
FILTER_STEP m4, m5, F16M, 4, %%q5, reg_Q5, [rsp+5*mmsize], reg_Q6, m1
|
||||
FILTER_STEP m4, m5, F16M, 4, %%q6, reg_Q6
|
||||
|
||||
mova m7, [%%p1]
|
||||
%else
|
||||
SWAP 1, 7
|
||||
%endif
|
||||
|
||||
mova m2, [%%p3]
|
||||
mova m1, [%%p2]
|
||||
|
||||
; reg_Q0-1 (m10-m11)
|
||||
; m0=p0
|
||||
; m1=p2
|
||||
; m2=p3
|
||||
; m3=q2
|
||||
; m4-5=free
|
||||
; m6=q3
|
||||
; m7=p1
|
||||
; m8-9 unused
|
||||
|
||||
; filter_6
|
||||
psllw m4, m2, 2
|
||||
paddw m5, m1, m1
|
||||
paddw m4, m7
|
||||
psubw m5, m2
|
||||
paddw m4, m0
|
||||
paddw m5, reg_Q0
|
||||
paddw m4, [pw_4]
|
||||
paddw m5, m4
|
||||
|
||||
%if ARCH_X86_64
|
||||
mova m8, m1
|
||||
mova m9, m7
|
||||
%else
|
||||
mova [rsp+0*mmsize], m1
|
||||
mova [rsp+1*mmsize], m7
|
||||
%endif
|
||||
%ifidn %1, v
|
||||
FILTER_STEP m4, m5, F8M, 3, %%p2, m1, m2, m7, reg_Q1
|
||||
%else
|
||||
FILTER_STEP m4, m5, F8M, 3, %%p2, m1, m2, m7, reg_Q1, 1
|
||||
%endif
|
||||
FILTER_STEP m4, m5, F8M, 3, %%p1, m7, m2, m0, m3, 1
|
||||
FILTER_STEP m4, m5, F8M, 3, %%p0, m0, m2, reg_Q0, m6, 1
|
||||
%if ARCH_X86_64
|
||||
FILTER_STEP m4, m5, F8M, 3, %%q0, reg_Q0, m8, reg_Q1, m6, ARCH_X86_64
|
||||
FILTER_STEP m4, m5, F8M, 3, %%q1, reg_Q1, m9, m3, m6, ARCH_X86_64
|
||||
%else
|
||||
FILTER_STEP m4, m5, F8M, 3, %%q0, reg_Q0, [rsp+0*mmsize], reg_Q1, m6, ARCH_X86_64
|
||||
FILTER_STEP m4, m5, F8M, 3, %%q1, reg_Q1, [rsp+1*mmsize], m3, m6, ARCH_X86_64
|
||||
%endif
|
||||
FILTER_STEP m4, m5, F8M, 3, %%q2, m3
|
||||
|
||||
UNSCRATCH 2, 10, %%q0
|
||||
UNSCRATCH 6, 11, %%q1
|
||||
%else
|
||||
SWAP 1, 7
|
||||
mova m2, [%%q0]
|
||||
mova m6, [%%q1]
|
||||
%endif
|
||||
UNSCRATCH 3, 13, rsp+(%%off+4)*mmsize, HEV
|
||||
|
||||
; m0=p0
|
||||
; m1=p2
|
||||
; m2=q0
|
||||
; m3=hev_mask
|
||||
; m4-5=free
|
||||
; m6=q1
|
||||
; m7=p1
|
||||
|
||||
; filter_4
|
||||
psubw m4, m7, m6 ; p1-q1
|
||||
psubw m5, m2, m0 ; q0-p0
|
||||
pand m4, m3
|
||||
pminsw m4, [pw_ %+ %%maxsgn]
|
||||
pmaxsw m4, [pw_ %+ %%minsgn] ; clip_intp2(p1-q1, 9) -> f
|
||||
paddw m4, m5
|
||||
paddw m5, m5
|
||||
paddw m4, m5 ; 3*(q0-p0)+f
|
||||
pminsw m4, [pw_ %+ %%maxsgn]
|
||||
pmaxsw m4, [pw_ %+ %%minsgn] ; clip_intp2(3*(q0-p0)+f, 9) -> f
|
||||
pand m4, reg_F4M
|
||||
paddw m5, m4, [pw_4]
|
||||
paddw m4, [pw_3]
|
||||
pminsw m5, [pw_ %+ %%maxsgn]
|
||||
pminsw m4, [pw_ %+ %%maxsgn]
|
||||
psraw m5, 3 ; min_intp2(f+4, 9)>>3 -> f1
|
||||
psraw m4, 3 ; min_intp2(f+3, 9)>>3 -> f2
|
||||
psubw m2, m5 ; q0-f1
|
||||
paddw m0, m4 ; p0+f2
|
||||
pandn m3, m5 ; f1 & !hev (for p1/q1 adj)
|
||||
pxor m4, m4
|
||||
mova m5, [pw_ %+ %%maxusgn]
|
||||
pmaxsw m2, m4
|
||||
pmaxsw m0, m4
|
||||
pminsw m2, m5
|
||||
pminsw m0, m5
|
||||
%if cpuflag(ssse3)
|
||||
pmulhrsw m3, [pw_16384] ; (f1+1)>>1
|
||||
%else
|
||||
paddw m3, [pw_1]
|
||||
psraw m3, 1
|
||||
%endif
|
||||
paddw m7, m3 ; p1+f
|
||||
psubw m6, m3 ; q1-f
|
||||
pmaxsw m7, m4
|
||||
pmaxsw m6, m4
|
||||
pminsw m7, m5
|
||||
pminsw m6, m5
|
||||
|
||||
; store
|
||||
%ifidn %1, v
|
||||
mova [%%p1], m7
|
||||
mova [%%p0], m0
|
||||
mova [%%q0], m2
|
||||
mova [%%q1], m6
|
||||
%else ; %1 == h
|
||||
%if %2 == 4
|
||||
TRANSPOSE4x4W 7, 0, 2, 6, 1
|
||||
movh [dst0q+strideq*0-4], m7
|
||||
movhps [dst0q+strideq*1-4], m7
|
||||
movh [dst0q+strideq*2-4], m0
|
||||
movhps [dst0q+stride3q -4], m0
|
||||
movh [dst4q+strideq*0-4], m2
|
||||
movhps [dst4q+strideq*1-4], m2
|
||||
movh [dst4q+strideq*2-4], m6
|
||||
movhps [dst4q+stride3q -4], m6
|
||||
%elif %2 == 8
|
||||
mova m3, [%%p3]
|
||||
mova m4, [%%q2]
|
||||
mova m5, [%%q3]
|
||||
|
||||
%if ARCH_X86_64
|
||||
TRANSPOSE8x8W 3, 1, 7, 0, 2, 6, 4, 5, 8
|
||||
%else
|
||||
TRANSPOSE8x8W 3, 1, 7, 0, 2, 6, 4, 5, [%%q2], [%%q0], 1
|
||||
mova m2, [%%q0]
|
||||
%endif
|
||||
|
||||
movu [dst0q+strideq*0-8], m3
|
||||
movu [dst0q+strideq*1-8], m1
|
||||
movu [dst0q+strideq*2-8], m7
|
||||
movu [dst0q+stride3q -8], m0
|
||||
movu [dst4q+strideq*0-8], m2
|
||||
movu [dst4q+strideq*1-8], m6
|
||||
movu [dst4q+strideq*2-8], m4
|
||||
movu [dst4q+stride3q -8], m5
|
||||
%else ; %2 == 16
|
||||
SCRATCH 2, 8, %%q0
|
||||
SCRATCH 6, 9, %%q1
|
||||
mova m2, [%%p7]
|
||||
mova m3, [%%p6]
|
||||
mova m4, [%%p5]
|
||||
mova m5, [%%p4]
|
||||
mova m6, [%%p3]
|
||||
|
||||
%if ARCH_X86_64
|
||||
TRANSPOSE8x8W 2, 3, 4, 5, 6, 1, 7, 0, 10
|
||||
%else
|
||||
mova [%%p1], m7
|
||||
TRANSPOSE8x8W 2, 3, 4, 5, 6, 1, 7, 0, [%%p1], [dst4q+strideq*0-16], 1
|
||||
%endif
|
||||
|
||||
mova [dst0q+strideq*0-16], m2
|
||||
mova [dst0q+strideq*1-16], m3
|
||||
mova [dst0q+strideq*2-16], m4
|
||||
mova [dst0q+stride3q -16], m5
|
||||
%if ARCH_X86_64
|
||||
mova [dst4q+strideq*0-16], m6
|
||||
%endif
|
||||
mova [dst4q+strideq*1-16], m1
|
||||
mova [dst4q+strideq*2-16], m7
|
||||
mova [dst4q+stride3q -16], m0
|
||||
|
||||
UNSCRATCH 2, 8, %%q0
|
||||
UNSCRATCH 6, 9, %%q1
|
||||
mova m0, [%%q2]
|
||||
mova m1, [%%q3]
|
||||
mova m3, [%%q4]
|
||||
mova m4, [%%q5]
|
||||
%if ARCH_X86_64
|
||||
mova m5, [%%q6]
|
||||
%endif
|
||||
mova m7, [%%q7]
|
||||
|
||||
%if ARCH_X86_64
|
||||
TRANSPOSE8x8W 2, 6, 0, 1, 3, 4, 5, 7, 8
|
||||
%else
|
||||
TRANSPOSE8x8W 2, 6, 0, 1, 3, 4, 5, 7, [%%q6], [dst4q+strideq*0], 1
|
||||
%endif
|
||||
|
||||
mova [dst0q+strideq*0], m2
|
||||
mova [dst0q+strideq*1], m6
|
||||
mova [dst0q+strideq*2], m0
|
||||
mova [dst0q+stride3q ], m1
|
||||
%if ARCH_X86_64
|
||||
mova [dst4q+strideq*0], m3
|
||||
%endif
|
||||
mova [dst4q+strideq*1], m4
|
||||
mova [dst4q+strideq*2], m5
|
||||
mova [dst4q+stride3q ], m7
|
||||
%endif ; %2
|
||||
%endif ; %1
|
||||
RET
|
||||
%endmacro
|
||||
|
||||
%macro LOOP_FILTER_CPUSETS 3
|
||||
INIT_XMM sse2
|
||||
LOOP_FILTER %1, %2, %3
|
||||
INIT_XMM ssse3
|
||||
LOOP_FILTER %1, %2, %3
|
||||
INIT_XMM avx
|
||||
LOOP_FILTER %1, %2, %3
|
||||
%endmacro
|
||||
|
||||
%macro LOOP_FILTER_WDSETS 2
|
||||
LOOP_FILTER_CPUSETS %1, 4, %2
|
||||
LOOP_FILTER_CPUSETS %1, 8, %2
|
||||
LOOP_FILTER_CPUSETS %1, 16, %2
|
||||
%endmacro
|
||||
|
||||
LOOP_FILTER_WDSETS h, 10
|
||||
LOOP_FILTER_WDSETS v, 10
|
||||
LOOP_FILTER_WDSETS h, 12
|
||||
LOOP_FILTER_WDSETS v, 12
|
@ -24,10 +24,10 @@
|
||||
|
||||
SECTION_RODATA 32
|
||||
|
||||
pw_4095: times 16 dw 0xfff
|
||||
pd_64: times 8 dd 64
|
||||
|
||||
cextern pw_1023
|
||||
cextern pw_4095
|
||||
|
||||
SECTION .text
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user