Bug 1657200 - Update libdav1d to d0e50cac for Firefox 81. r=mjf

Differential Revision: https://phabricator.services.mozilla.com/D88182
Jon Bauman 2020-08-26 17:42:07 +00:00
parent 14bfc1e8f4
commit 342f755dce
28 changed files with 2000 additions and 566 deletions

View File

@@ -66,6 +66,8 @@ if stack_alignment == 0:
 DEFINES['STACK_ALIGNMENT'] = stack_alignment
 if CONFIG['CPU_ARCH'] in ('x86', 'x86_64'):
+    ASFLAGS += ['-Dprivate_prefix=dav1d']
     SOURCES += [
         '../../../third_party/dav1d/src/x86/cpu.c',
         '../../../third_party/dav1d/src/x86/msac_init.c',

View File

@@ -20,11 +20,11 @@ origin:
   # Human-readable identifier for this version/release
   # Generally "version NNN", "tag SSS", "bookmark SSS"
-  release: commit 6cf58c8e7deb54e287afeee6710b2a3774eded9c (2020-07-20T15:40:01.000+02:00).
+  release: commit d0e50cacead63e9904dde184580ce9a746374bd5 (2020-08-21T15:13:49.000+02:00).
   # Revision to pull in
   # Must be a long or short commit SHA (long preferred)
-  revision: 6cf58c8e7deb54e287afeee6710b2a3774eded9c
+  revision: d0e50cacead63e9904dde184580ce9a746374bd5
   # The package's license, where possible using the mnemonic from
   # https://spdx.org/licenses/

View File

@@ -1,2 +1,2 @@
 /* auto-generated, do not edit */
-#define DAV1D_VERSION "0.7.1-36-g6cf58c8"
+#define DAV1D_VERSION "0.7.1-49-gd0e50ca"

View File

@@ -351,6 +351,7 @@ cdata.set10('ARCH_X86_64', host_machine.cpu_family() == 'x86_64')
 cdata.set10('ARCH_X86_32', host_machine.cpu_family() == 'x86')
 if host_machine.cpu_family().startswith('x86')
+    cdata_asm.set('private_prefix', 'dav1d')
     cdata_asm.set10('ARCH_X86_64', host_machine.cpu_family() == 'x86_64')
     cdata_asm.set10('ARCH_X86_32', host_machine.cpu_family() == 'x86')
     cdata_asm.set10('PIC', true)

View File

@@ -1,5 +1,5 @@
 ;*****************************************************************************
-;* x86inc.asm: x264asm abstraction layer
+;* x86inc.asm: x86 abstraction layer
 ;*****************************************************************************
 ;* Copyright (C) 2005-2020 x264 project
 ;*
@@ -21,23 +21,14 @@
 ;* OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
 ;*****************************************************************************
-; This is a header file for the x264ASM assembly language, which uses
+; This is a header file for the x86inc.asm assembly language, which uses
 ; NASM/YASM syntax combined with a large number of macros to provide easy
 ; abstraction between different calling conventions (x86_32, win64, linux64).
 ; It also has various other useful features to simplify writing the kind of
-; DSP functions that are most often used in x264.
+; DSP functions that are most often used.
-; Unlike the rest of x264, this file is available under an ISC license, as it
-; has significant usefulness outside of x264 and we want it to be available
-; to the largest audience possible. Of course, if you modify it for your own
-; purposes to add a new feature, we strongly encourage contributing a patch
-; as this feature might be useful for others as well. Send patches or ideas
-; to x264-devel@videolan.org .
-%include "config.asm"
 %ifndef private_prefix
-    %define private_prefix dav1d
+    %error private_prefix not defined
 %endif
 %ifndef public_prefix
@@ -118,7 +109,7 @@
 ; Macros to eliminate most code duplication between x86_32 and x86_64:
 ; Currently this works only for leaf functions which load all their arguments
 ; into registers at the start, and make no other use of the stack. Luckily that
-; covers most of x264's asm.
+; covers most use cases.
 ; PROLOGUE:
 ; %1 = number of arguments. loads them from stack if needed.
@@ -1522,18 +1513,18 @@ AVX_INSTR orps, sse, 1, 0, 1
 AVX_INSTR pabsb, ssse3
 AVX_INSTR pabsd, ssse3
 AVX_INSTR pabsw, ssse3
-AVX_INSTR packsswb, mmx, 0, 0, 0
 AVX_INSTR packssdw, mmx, 0, 0, 0
-AVX_INSTR packuswb, mmx, 0, 0, 0
+AVX_INSTR packsswb, mmx, 0, 0, 0
 AVX_INSTR packusdw, sse4, 0, 0, 0
+AVX_INSTR packuswb, mmx, 0, 0, 0
 AVX_INSTR paddb, mmx, 0, 0, 1
-AVX_INSTR paddw, mmx, 0, 0, 1
 AVX_INSTR paddd, mmx, 0, 0, 1
 AVX_INSTR paddq, sse2, 0, 0, 1
 AVX_INSTR paddsb, mmx, 0, 0, 1
 AVX_INSTR paddsw, mmx, 0, 0, 1
 AVX_INSTR paddusb, mmx, 0, 0, 1
 AVX_INSTR paddusw, mmx, 0, 0, 1
+AVX_INSTR paddw, mmx, 0, 0, 1
 AVX_INSTR palignr, ssse3, 0, 1, 0
 AVX_INSTR pand, mmx, 0, 0, 1
 AVX_INSTR pandn, mmx, 0, 0, 0
@@ -1541,71 +1532,71 @@ AVX_INSTR pavgb, mmx2, 0, 0, 1
 AVX_INSTR pavgw, mmx2, 0, 0, 1
 AVX_INSTR pblendvb, sse4 ; can't be emulated
 AVX_INSTR pblendw, sse4, 0, 1, 0
-AVX_INSTR pclmulqdq, fnord, 0, 1, 0
 AVX_INSTR pclmulhqhqdq, fnord, 0, 0, 0
 AVX_INSTR pclmulhqlqdq, fnord, 0, 0, 0
 AVX_INSTR pclmullqhqdq, fnord, 0, 0, 0
 AVX_INSTR pclmullqlqdq, fnord, 0, 0, 0
-AVX_INSTR pcmpestri, sse42
-AVX_INSTR pcmpestrm, sse42
-AVX_INSTR pcmpistri, sse42
-AVX_INSTR pcmpistrm, sse42
+AVX_INSTR pclmulqdq, fnord, 0, 1, 0
 AVX_INSTR pcmpeqb, mmx, 0, 0, 1
-AVX_INSTR pcmpeqw, mmx, 0, 0, 1
 AVX_INSTR pcmpeqd, mmx, 0, 0, 1
 AVX_INSTR pcmpeqq, sse4, 0, 0, 1
+AVX_INSTR pcmpeqw, mmx, 0, 0, 1
+AVX_INSTR pcmpestri, sse42
+AVX_INSTR pcmpestrm, sse42
 AVX_INSTR pcmpgtb, mmx, 0, 0, 0
-AVX_INSTR pcmpgtw, mmx, 0, 0, 0
 AVX_INSTR pcmpgtd, mmx, 0, 0, 0
 AVX_INSTR pcmpgtq, sse42, 0, 0, 0
+AVX_INSTR pcmpgtw, mmx, 0, 0, 0
+AVX_INSTR pcmpistri, sse42
+AVX_INSTR pcmpistrm, sse42
 AVX_INSTR pextrb, sse4
 AVX_INSTR pextrd, sse4
 AVX_INSTR pextrq, sse4
 AVX_INSTR pextrw, mmx2
-AVX_INSTR phaddw, ssse3, 0, 0, 0
 AVX_INSTR phaddd, ssse3, 0, 0, 0
 AVX_INSTR phaddsw, ssse3, 0, 0, 0
+AVX_INSTR phaddw, ssse3, 0, 0, 0
 AVX_INSTR phminposuw, sse4
-AVX_INSTR phsubw, ssse3, 0, 0, 0
 AVX_INSTR phsubd, ssse3, 0, 0, 0
 AVX_INSTR phsubsw, ssse3, 0, 0, 0
+AVX_INSTR phsubw, ssse3, 0, 0, 0
 AVX_INSTR pinsrb, sse4, 0, 1, 0
 AVX_INSTR pinsrd, sse4, 0, 1, 0
 AVX_INSTR pinsrq, sse4, 0, 1, 0
 AVX_INSTR pinsrw, mmx2, 0, 1, 0
-AVX_INSTR pmaddwd, mmx, 0, 0, 1
 AVX_INSTR pmaddubsw, ssse3, 0, 0, 0
+AVX_INSTR pmaddwd, mmx, 0, 0, 1
 AVX_INSTR pmaxsb, sse4, 0, 0, 1
-AVX_INSTR pmaxsw, mmx2, 0, 0, 1
 AVX_INSTR pmaxsd, sse4, 0, 0, 1
+AVX_INSTR pmaxsw, mmx2, 0, 0, 1
 AVX_INSTR pmaxub, mmx2, 0, 0, 1
-AVX_INSTR pmaxuw, sse4, 0, 0, 1
 AVX_INSTR pmaxud, sse4, 0, 0, 1
+AVX_INSTR pmaxuw, sse4, 0, 0, 1
 AVX_INSTR pminsb, sse4, 0, 0, 1
-AVX_INSTR pminsw, mmx2, 0, 0, 1
 AVX_INSTR pminsd, sse4, 0, 0, 1
+AVX_INSTR pminsw, mmx2, 0, 0, 1
 AVX_INSTR pminub, mmx2, 0, 0, 1
-AVX_INSTR pminuw, sse4, 0, 0, 1
 AVX_INSTR pminud, sse4, 0, 0, 1
+AVX_INSTR pminuw, sse4, 0, 0, 1
 AVX_INSTR pmovmskb, mmx2
-AVX_INSTR pmovsxbw, sse4
 AVX_INSTR pmovsxbd, sse4
 AVX_INSTR pmovsxbq, sse4
+AVX_INSTR pmovsxbw, sse4
+AVX_INSTR pmovsxdq, sse4
 AVX_INSTR pmovsxwd, sse4
 AVX_INSTR pmovsxwq, sse4
-AVX_INSTR pmovsxdq, sse4
-AVX_INSTR pmovzxbw, sse4
 AVX_INSTR pmovzxbd, sse4
 AVX_INSTR pmovzxbq, sse4
+AVX_INSTR pmovzxbw, sse4
+AVX_INSTR pmovzxdq, sse4
 AVX_INSTR pmovzxwd, sse4
 AVX_INSTR pmovzxwq, sse4
-AVX_INSTR pmovzxdq, sse4
 AVX_INSTR pmuldq, sse4, 0, 0, 1
 AVX_INSTR pmulhrsw, ssse3, 0, 0, 1
 AVX_INSTR pmulhuw, mmx2, 0, 0, 1
 AVX_INSTR pmulhw, mmx, 0, 0, 1
-AVX_INSTR pmullw, mmx, 0, 0, 1
 AVX_INSTR pmulld, sse4, 0, 0, 1
+AVX_INSTR pmullw, mmx, 0, 0, 1
 AVX_INSTR pmuludq, sse2, 0, 0, 1
 AVX_INSTR por, mmx, 0, 0, 1
 AVX_INSTR psadbw, mmx2, 0, 0, 1
@@ -1614,35 +1605,35 @@ AVX_INSTR pshufd, sse2
 AVX_INSTR pshufhw, sse2
 AVX_INSTR pshuflw, sse2
 AVX_INSTR psignb, ssse3, 0, 0, 0
-AVX_INSTR psignw, ssse3, 0, 0, 0
 AVX_INSTR psignd, ssse3, 0, 0, 0
-AVX_INSTR psllw, mmx, 0, 0, 0
+AVX_INSTR psignw, ssse3, 0, 0, 0
 AVX_INSTR pslld, mmx, 0, 0, 0
-AVX_INSTR psllq, mmx, 0, 0, 0
 AVX_INSTR pslldq, sse2, 0, 0, 0
-AVX_INSTR psraw, mmx, 0, 0, 0
+AVX_INSTR psllq, mmx, 0, 0, 0
+AVX_INSTR psllw, mmx, 0, 0, 0
 AVX_INSTR psrad, mmx, 0, 0, 0
-AVX_INSTR psrlw, mmx, 0, 0, 0
+AVX_INSTR psraw, mmx, 0, 0, 0
 AVX_INSTR psrld, mmx, 0, 0, 0
-AVX_INSTR psrlq, mmx, 0, 0, 0
 AVX_INSTR psrldq, sse2, 0, 0, 0
+AVX_INSTR psrlq, mmx, 0, 0, 0
+AVX_INSTR psrlw, mmx, 0, 0, 0
 AVX_INSTR psubb, mmx, 0, 0, 0
-AVX_INSTR psubw, mmx, 0, 0, 0
 AVX_INSTR psubd, mmx, 0, 0, 0
 AVX_INSTR psubq, sse2, 0, 0, 0
 AVX_INSTR psubsb, mmx, 0, 0, 0
 AVX_INSTR psubsw, mmx, 0, 0, 0
 AVX_INSTR psubusb, mmx, 0, 0, 0
 AVX_INSTR psubusw, mmx, 0, 0, 0
+AVX_INSTR psubw, mmx, 0, 0, 0
 AVX_INSTR ptest, sse4
 AVX_INSTR punpckhbw, mmx, 0, 0, 0
-AVX_INSTR punpckhwd, mmx, 0, 0, 0
 AVX_INSTR punpckhdq, mmx, 0, 0, 0
 AVX_INSTR punpckhqdq, sse2, 0, 0, 0
+AVX_INSTR punpckhwd, mmx, 0, 0, 0
 AVX_INSTR punpcklbw, mmx, 0, 0, 0
-AVX_INSTR punpcklwd, mmx, 0, 0, 0
 AVX_INSTR punpckldq, mmx, 0, 0, 0
 AVX_INSTR punpcklqdq, sse2, 0, 0, 0
+AVX_INSTR punpcklwd, mmx, 0, 0, 0
 AVX_INSTR pxor, mmx, 0, 0, 1
 AVX_INSTR rcpps, sse, 1
 AVX_INSTR rcpss, sse, 1, 0, 0
@@ -1674,8 +1665,8 @@ AVX_INSTR xorps, sse, 1, 0, 1
 ; 3DNow instructions, for sharing code between AVX, SSE and 3DN
 AVX_INSTR pfadd, 3dnow, 1, 0, 1
-AVX_INSTR pfsub, 3dnow, 1, 0, 0
 AVX_INSTR pfmul, 3dnow, 1, 0, 1
+AVX_INSTR pfsub, 3dnow, 1, 0, 0
 ;%1 == instruction
 ;%2 == minimal instruction set
@@ -1740,9 +1731,9 @@ GPR_INSTR shrx, bmi2
 %endmacro
 %endmacro
-FMA_INSTR pmacsww, pmullw, paddw
 FMA_INSTR pmacsdd, pmulld, paddd ; sse4 emulation
 FMA_INSTR pmacsdql, pmuldq, paddq ; sse4 emulation
+FMA_INSTR pmacsww, pmullw, paddw
 FMA_INSTR pmadcswd, pmaddwd, paddd
 ; Macros for consolidating FMA3 and FMA4 using 4-operand (dst, src1, src2, src3) syntax.
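The net effect of the x86inc.asm changes above: the header no longer includes config.asm itself and no longer falls back to a dav1d symbol prefix; if the build fails to supply one, assembly stops at "%error private_prefix not defined". The rest of this commit wires that up per consumer: dav1d's meson.build writes private_prefix into config.asm (cdata_asm hunk above), Mozilla's moz.build passes -Dprivate_prefix=dav1d through ASFLAGS (first hunk), every dav1d .asm source below gains an explicit %include "config.asm", and checkasm's assembly overrides the prefix after that include (last file in this commit).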

View File

@@ -1198,7 +1198,6 @@ int dav1d_parse_obus(Dav1dContext *const c, Dav1dData *const in, const int globa
     const unsigned init_bit_pos = dav1d_get_bits_pos(&gb);
     const unsigned init_byte_pos = init_bit_pos >> 3;
-    const unsigned pkt_bytelen = init_byte_pos + len;
     // We must have read a whole number of bytes at this point (1 byte
     // for the header and whole bytes at a time when reading the
@@ -1342,6 +1341,7 @@ int dav1d_parse_obus(Dav1dContext *const c, Dav1dData *const in, const int globa
     // The current bit position is a multiple of 8 (because we
     // just aligned it) and less than 8*pkt_bytelen because
     // otherwise the overrun check would have fired.
+    const unsigned pkt_bytelen = init_byte_pos + len;
     const unsigned bit_pos = dav1d_get_bits_pos(&gb);
     assert((bit_pos & 7) == 0);
     assert(pkt_bytelen >= (bit_pos >> 3));
@@ -1368,17 +1368,12 @@ int dav1d_parse_obus(Dav1dContext *const c, Dav1dData *const in, const int globa
         const enum ObuMetaType meta_type = dav1d_get_uleb128(&gb);
         const int meta_type_len = (dav1d_get_bits_pos(&gb) - init_bit_pos) >> 3;
         if (gb.error) goto error;
-        Dav1dRef *ref;
-        Dav1dContentLightLevel *content_light;
-        Dav1dMasteringDisplay *mastering_display;
-        Dav1dITUTT35 *itut_t35_metadata;
         switch (meta_type) {
-        case OBU_META_HDR_CLL:
-            ref = dav1d_ref_create(sizeof(Dav1dContentLightLevel));
+        case OBU_META_HDR_CLL: {
+            Dav1dRef *ref = dav1d_ref_create(sizeof(Dav1dContentLightLevel));
             if (!ref) return DAV1D_ERR(ENOMEM);
-            content_light = ref->data;
-            memset(content_light, 0, sizeof(*content_light));
+            Dav1dContentLightLevel *const content_light = ref->data;
             content_light->max_content_light_level = dav1d_get_bits(&gb, 16);
             content_light->max_frame_average_light_level = dav1d_get_bits(&gb, 16);
@@ -1395,11 +1390,11 @@ int dav1d_parse_obus(Dav1dContext *const c, Dav1dData *const in, const int globa
            c->content_light = content_light;
            c->content_light_ref = ref;
            break;
+        }
         case OBU_META_HDR_MDCV: {
-            ref = dav1d_ref_create(sizeof(Dav1dMasteringDisplay));
+            Dav1dRef *ref = dav1d_ref_create(sizeof(Dav1dMasteringDisplay));
             if (!ref) return DAV1D_ERR(ENOMEM);
-            mastering_display = ref->data;
-            memset(mastering_display, 0, sizeof(*mastering_display));
+            Dav1dMasteringDisplay *const mastering_display = ref->data;
             for (int i = 0; i < 3; i++) {
                 mastering_display->primaries[i][0] = dav1d_get_bits(&gb, 16);
@@ -1447,9 +1442,9 @@ int dav1d_parse_obus(Dav1dContext *const c, Dav1dData *const in, const int globa
                goto error;
            }
-            ref = dav1d_ref_create(sizeof(Dav1dITUTT35) + payload_size * sizeof(uint8_t));
+            Dav1dRef *ref = dav1d_ref_create(sizeof(Dav1dITUTT35) + payload_size * sizeof(uint8_t));
             if (!ref) return DAV1D_ERR(ENOMEM);
-            itut_t35_metadata = ref->data;
+            Dav1dITUTT35 *const itut_t35_metadata = ref->data;
             // We need our public headers to be C++ compatible, so payload can't be
             // a flexible array member
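To make the shape of the obu.c change easier to follow, here is a minimal sketch (names as in the diff; surrounding parsing and error paths elided) of the pattern the metadata parser moves to: each case declares its own typed ref and payload pointer instead of sharing untyped locals hoisted above the switch, and the memsets go away since every field of the payload struct is written before the ref is published:

switch (meta_type) {
case OBU_META_HDR_CLL: {
    /* ref and payload pointer are now scoped to this case */
    Dav1dRef *ref = dav1d_ref_create(sizeof(Dav1dContentLightLevel));
    if (!ref) return DAV1D_ERR(ENOMEM);
    Dav1dContentLightLevel *const content_light = ref->data;
    /* both fields are assigned immediately, so the old
     * memset(content_light, 0, ...) was redundant */
    content_light->max_content_light_level = dav1d_get_bits(&gb, 16);
    content_light->max_frame_average_light_level = dav1d_get_bits(&gb, 16);
    c->content_light = content_light;
    c->content_light_ref = ref;
    break;
}
/* OBU_META_HDR_MDCV and OBU_META_ITUT_T35 follow the same shape */
}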

View File

@@ -23,6 +23,7 @@
 ; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 ; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+%include "config.asm"
 %include "ext/x86/x86inc.asm"
 %if ARCH_X86_64

View File

@@ -23,6 +23,7 @@
 ; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 ; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+%include "config.asm"
 %include "ext/x86/x86inc.asm"
 %if HAVE_AVX512ICL && ARCH_X86_64

View File

@@ -24,6 +24,7 @@
 ; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 ; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+%include "config.asm"
 %include "ext/x86/x86inc.asm"
 SECTION_RODATA 16

View File

@@ -23,6 +23,7 @@
 ; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 ; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+%include "config.asm"
 %include "ext/x86/x86inc.asm"
 SECTION .text

View File

@@ -23,6 +23,7 @@
 ; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 ; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+%include "config.asm"
 %include "ext/x86/x86inc.asm"
 %if ARCH_X86_64

View File

@@ -23,6 +23,7 @@
 ; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 ; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+%include "config.asm"
 %include "ext/x86/x86inc.asm"
 SECTION_RODATA

View File

@@ -23,6 +23,7 @@
 ; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 ; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+%include "config.asm"
 %include "ext/x86/x86inc.asm"
 %if ARCH_X86_64

View File

@@ -23,6 +23,7 @@
 ; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 ; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+%include "config.asm"
 %include "ext/x86/x86inc.asm"
 SECTION_RODATA 16

View File

@@ -23,6 +23,7 @@
 ; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 ; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+%include "config.asm"
 %include "ext/x86/x86inc.asm"
 %if ARCH_X86_64

View File

@@ -23,6 +23,7 @@
 ; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 ; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+%include "config.asm"
 %include "ext/x86/x86inc.asm"

View File

@@ -23,6 +23,7 @@
 ; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 ; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+%include "config.asm"
 %include "ext/x86/x86inc.asm"
 %if ARCH_X86_64

View File

@@ -23,6 +23,7 @@
 ; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 ; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+%include "config.asm"
 %include "ext/x86/x86inc.asm"
 %if ARCH_X86_64

View File

@@ -24,6 +24,7 @@
 ; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 ; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+%include "config.asm"
 %include "ext/x86/x86inc.asm"
 SECTION_RODATA 16

View File

@@ -23,6 +23,7 @@
 ; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 ; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+%include "config.asm"
 %include "ext/x86/x86inc.asm"
 %if ARCH_X86_64
@@ -2766,20 +2767,20 @@ cglobal prep_8tap, 3, 8, 0, tmp, src, stride, w, h, mx, my, stride3
 %ifidn %1, put
 %assign isprep 0
 %if required_stack_alignment <= STACK_ALIGNMENT
-cglobal put_8tap_scaled, 4, 15, 16, 96, dst, ds, src, ss, w, h, mx, my, dx, dy
+cglobal put_8tap_scaled, 4, 15, 16, 112, dst, ds, src, ss, w, h, mx, my, dx, dy
 %else
-cglobal put_8tap_scaled, 4, 14, 16, 112, dst, ds, src, ss, w, h, mx, my, dx, dy
+cglobal put_8tap_scaled, 4, 14, 16, 128, dst, ds, src, ss, w, h, mx, my, dx, dy
 %endif
 %xdefine base_reg r12
 %define rndshift 10
 %else
 %assign isprep 1
 %if required_stack_alignment <= STACK_ALIGNMENT
-cglobal prep_8tap_scaled, 4, 15, 16, 112, tmp, src, ss, w, h, mx, my, dx, dy
+cglobal prep_8tap_scaled, 4, 15, 16, 128, tmp, src, ss, w, h, mx, my, dx, dy
 %xdefine tmp_stridem r14q
 %else
-cglobal prep_8tap_scaled, 4, 14, 16, 112, tmp, src, ss, w, h, mx, my, dx, dy
+cglobal prep_8tap_scaled, 4, 14, 16, 128, tmp, src, ss, w, h, mx, my, dx, dy
-%define tmp_stridem qword [rsp+104]
+%define tmp_stridem qword [rsp+120]
 %endif
 %xdefine base_reg r11
 %define rndshift 6
@@ -2808,7 +2809,7 @@ cglobal prep_8tap_scaled, 4, 14, 16, 112, tmp, src, ss, w, h, mx, my, dx, dy
 %define hm r6m
 %endif
 %if required_stack_alignment > STACK_ALIGNMENT
-%define dsm [rsp+96]
+%define dsm [rsp+112]
 %define rX r1
 %define rXd r1d
 %else
@@ -2824,7 +2825,7 @@ cglobal prep_8tap_scaled, 4, 14, 16, 112, tmp, src, ss, w, h, mx, my, dx, dy
 %define dxm r7m
 %else
 DEFINE_ARGS tmp, src, ss, w, h, _, my, dx, dy, ss3
-%define hm [rsp+96]
+%define hm [rsp+112]
 %endif
 MCT_8TAP_SCALED_REMAP_REGS_TO_PREV
 %define rX r14
@@ -3104,181 +3105,9 @@ cglobal prep_8tap_scaled, 4, 14, 16, 112, tmp, src, ss, w, h, mx, my, dx, dy
 lea srcq, [srcq+ssq*2]
 jmp .w4_loop
 .w8:
-%ifidn %1, put
-movifnidn dsm, dsq
-%endif
-shr t0d, 16
-sub srcq, 3
-movd xm15, t0d
-pmaddwd m8, [base+rescale_mul]
-vpbroadcastq m11, [base+pq_0x40000000]
-vpbroadcastd m15, xm15
-paddd m14, m8 ; mx+dx*[0-7]
-pand m6, m14, m10
-psrld m6, 6
-paddd m15, m6
-pcmpeqd m6, m9
-vextracti128 xm7, m15, 1
-movd r4d, xm15
-pextrd r6d, xm15, 2
-pextrd r7d, xm15, 1
-pextrd r9d, xm15, 3
-movd r10d, xm7
-pextrd r11d, xm7, 2
-pextrd r13d, xm7, 1
-pextrd rXd, xm7, 3
-movq xm15, [base+subpel_filters+r4*8]
-movq xm10, [base+subpel_filters+r6*8]
-movhps xm15, [base+subpel_filters+r7*8]
-movhps xm10, [base+subpel_filters+r9*8]
-vinserti128 m15, [base+subpel_filters+r10*8], 1
-vinserti128 m10, [base+subpel_filters+r11*8], 1
-vpbroadcastq m9, [base+subpel_filters+r13*8]
-vpbroadcastq m8, [base+subpel_filters+rX*8]
-psrld m14, 10
-mova [rsp], xm14
-vextracti128 xm7, m14, 1
-movd r4d, xm14
-pextrd r6d, xm14, 2
-pextrd r7d, xm14, 1
-pextrd r9d, xm14, 3
-movd r10d, xm7
-pextrd r11d, xm7, 2
-pextrd r13d, xm7, 1
-pextrd rXd, xm7, 3
-pshufd m5, m6, q1100
-pshufd m6, m6, q3322
-vpblendd m15, m9, 0xc0
-vpblendd m10, m8, 0xc0
-pblendvb m15, m11, m5
-pblendvb m10, m11, m6
-vbroadcasti128 m14, [base+subpel_s_shuf8]
-MC_8TAP_SCALED_H 0, 1, 2, 3, 4, 5, 6, 7 ; 0a 1a 0b 1b
-MC_8TAP_SCALED_H 1, 2, 3, 4, 5, 6, 7, 8 ; 2a 3a 2b 3b
-MC_8TAP_SCALED_H 2, 3, 4, 5, 6, 7, 8, 9 ; 4a 5a 4b 5b
-MC_8TAP_SCALED_H 3, 4, 5, 6, 7, 8, 9, 11 ; 6a 7a 6b 7b
-mov myd, mym
-mov dyd, dym
-pshufb m0, m14 ; 01a 01b
-pshufb m1, m14 ; 23a 23b
-pshufb m2, m14 ; 45a 45b
-pshufb m3, m14 ; 67a 67b
-vbroadcasti128 m14, [base+wswap]
-.w8_loop:
-and myd, 0x3ff
-mov r6d, 64 << 24
-mov r4d, myd
-shr r4d, 6
-lea r4d, [t1+r4]
-cmovnz r6q, [base+subpel_filters+r4*8]
-movq xm11, r6q
-punpcklbw xm11, xm11
-psraw xm11, 8
-vinserti128 m11, xm11, 1
-pshufd m8, m11, q0000
-pshufd m9, m11, q1111
-pmaddwd m4, m0, m8
-pmaddwd m5, m1, m9
-pshufd m8, m11, q2222
-pshufd m11, m11, q3333
-pmaddwd m6, m2, m8
-pmaddwd m7, m3, m11
-paddd m4, m5
-paddd m6, m7
-paddd m4, m13
-paddd m4, m6
-psrad m4, rndshift
-vextracti128 xm5, m4, 1
-packssdw xm4, xm5
-%ifidn %1, put
-packuswb xm4, xm4
-movq [dstq], xm4
-add dstq, dsm
-%else
-mova [tmpq], xm4
-add tmpq, 16
-%endif
-dec hd
-jz .ret
-add myd, dyd
-test myd, ~0x3ff
-jz .w8_loop
-test myd, 0x400
-mov [rsp+16], myd
-mov r4d, [rsp+ 0]
-mov r6d, [rsp+ 8]
-mov r7d, [rsp+ 4]
-mov r9d, [rsp+12]
-jz .w8_skip_line
-vpbroadcastq m6, [srcq+r13]
-vpbroadcastq m7, [srcq+ rX]
-movq xm4, [srcq+ r4]
-movq xm5, [srcq+ r6]
-movhps xm4, [srcq+ r7]
-movhps xm5, [srcq+ r9]
-vinserti128 m4, [srcq+r10], 1
-vinserti128 m5, [srcq+r11], 1
-add srcq, ssq
-mov myd, [rsp+16]
-mov dyd, dym
-pshufb m0, m14
-pshufb m1, m14
-pshufb m2, m14
-pshufb m3, m14
-vpblendd m4, m6, 0xc0
-vpblendd m5, m7, 0xc0
-pmaddubsw m4, m15
-pmaddubsw m5, m10
-phaddw m4, m5
-pslld m5, m4, 16
-paddw m4, m5
-pmulhrsw m4, m12
-pblendw m0, m1, 0xaa
-pblendw m1, m2, 0xaa
-pblendw m2, m3, 0xaa
-pblendw m3, m4, 0xaa
-jmp .w8_loop
-.w8_skip_line:
-mova m0, m1
-mova m1, m2
-mova m2, m3
-vpbroadcastq m7, [srcq+r13]
-vpbroadcastq m8, [srcq+ rX]
-movq xm3, [srcq+ r4]
-movq xm4, [srcq+ r6]
-movhps xm3, [srcq+ r7]
-movhps xm4, [srcq+ r9]
-vinserti128 m3, [srcq+r10], 1
-vinserti128 m4, [srcq+r11], 1
-add srcq, ssq
-movq xm5, [srcq+ r4]
-movq xm6, [srcq+ r6]
-movhps xm5, [srcq+ r7]
-movhps xm6, [srcq+ r9]
-vinserti128 m5, [srcq+r10], 1
-vinserti128 m6, [srcq+r11], 1
-vpbroadcastq m9, [srcq+r13]
-vpbroadcastq m11, [srcq+ rX]
-add srcq, ssq
-mov myd, [rsp+16]
-mov dyd, dym
-vpblendd m3, m7, 0xc0
-vpblendd m4, m8, 0xc0
-vpblendd m5, m9, 0xc0
-vpblendd m6, m11, 0xc0
-pmaddubsw m3, m15
-pmaddubsw m4, m10
-pmaddubsw m5, m15
-pmaddubsw m6, m10
-phaddw m3, m4
-phaddw m5, m6
-psrld m4, m3, 16
-pslld m6, m5, 16
-paddw m3, m4
-paddw m5, m6
-pblendw m3, m5, 0xaa
-pmulhrsw m3, m12
-jmp .w8_loop
+mov dword [rsp+48], 1
+movifprep tmp_stridem, 16
+jmp .w_start
 .w16:
 mov dword [rsp+48], 2
 movifprep tmp_stridem, 32
@@ -3698,127 +3527,9 @@ cglobal prep_8tap_scaled, 4, 14, 16, 112, tmp, src, ss, w, h, mx, my, dx, dy
 jg .dy1_w4_loop
 MC_8TAP_SCALED_RET
 .dy1_w8:
-%ifidn %1, put
-movifnidn dsm, dsq
-%endif
-shr t0d, 16
-sub srcq, 3
-movd xm15, t0d
-pmaddwd m8, [base+rescale_mul]
-vpbroadcastq m11, [base+pq_0x40000000]
-vpbroadcastd m15, xm15
-paddd m14, m8 ; mx+dx*[0-7]
-pand m6, m14, m10
-psrld m6, 6
-paddd m15, m6
-pcmpeqd m6, m9
-vextracti128 xm7, m15, 1
-movd r4d, xm15
-pextrd r6d, xm15, 2
-pextrd r7d, xm15, 1
-pextrd r9d, xm15, 3
-movd r10d, xm7
-pextrd r11d, xm7, 2
-pextrd r13d, xm7, 1
-pextrd rXd, xm7, 3
-movq xm15, [base+subpel_filters+ r4*8]
-movq xm10, [base+subpel_filters+ r6*8]
-movhps xm15, [base+subpel_filters+ r7*8]
-movhps xm10, [base+subpel_filters+ r9*8]
-vinserti128 m15, [base+subpel_filters+r10*8], 1
-vinserti128 m10, [base+subpel_filters+r11*8], 1
-vpbroadcastq m9, [base+subpel_filters+r13*8]
-vpbroadcastq m8, [base+subpel_filters+ rX*8]
-psrld m14, 10
-vextracti128 xm7, m14, 1
-movd r4d, xm14
-pextrd r6d, xm14, 2
-pextrd r7d, xm14, 1
-pextrd r9d, xm14, 3
-movd r10d, xm7
-pextrd r11d, xm7, 2
-pextrd r13d, xm7, 1
-pextrd rXd, xm7, 3
-mov [rsp+32], r7d
-pshufd m5, m6, q1100
-pshufd m6, m6, q3322
-vpblendd m15, m9, 0xc0
-vpblendd m10, m8, 0xc0
-pblendvb m15, m11, m5
-pblendvb m10, m11, m6
-vbroadcasti128 m14, [base+subpel_s_shuf8]
-MC_8TAP_SCALED_H 0, 1, 2, 3, 4, 5, 6, 7 ; 0a 1a 0b 1b
-MC_8TAP_SCALED_H 1, 2, 3, 4, 5, 6, 7, 8 ; 2a 3a 2b 3b
-MC_8TAP_SCALED_H 2, 3, 4, 5, 6, 7, 8, 9 ; 4a 5a 4b 5b
-MC_8TAP_SCALED_H 3, 4, 5, 6, 7, 8, 9, 11 ; 6a 7a 6b 7b
-mov myd, mym
-movu [rsp], m10
-pshufb m0, m14 ; 01a 01b
-pshufb m1, m14 ; 23a 23b
-pshufb m2, m14 ; 45a 45b
-pshufb m3, m14 ; 67a 67b
-shr myd, 6
-lea myd, [t1+myq]
-mov t1d, 64 << 24
-cmovnz t1q, [base+subpel_filters+myq*8]
-vbroadcasti128 m14, [base+wswap]
-movq xm11, t1q
-punpcklbw xm11, xm11
-psraw xm11, 8
-vinserti128 m11, xm11, 1
-mov r7d, [rsp+32]
-pshufd m8, m11, q0000
-pshufd m9, m11, q1111
-pshufd m10, m11, q2222
-pshufd m11, m11, q3333
-.dy1_w8_loop:
-pmaddwd m4, m0, m8
-pmaddwd m5, m1, m9
-pmaddwd m6, m2, m10
-pmaddwd m7, m3, m11
-paddd m4, m5
-paddd m6, m7
-paddd m4, m13
-paddd m4, m6
-psrad m4, rndshift
-vextracti128 xm5, m4, 1
-packssdw xm4, xm5
-%ifidn %1, put
-packuswb xm4, xm4
-movq [dstq], xm4
-add dstq, dsm
-%else
-mova [tmpq], xm4
-add tmpq, 16
-%endif
-dec hd
-jz .ret
-movq xm4, [srcq+ r4]
-movq xm5, [srcq+ r6]
-movhps xm4, [srcq+ r7]
-movhps xm5, [srcq+ r9]
-vinserti128 m4, [srcq+r10], 1
-vinserti128 m5, [srcq+r11], 1
-vpbroadcastq m6, [srcq+r13]
-vpbroadcastq m7, [srcq+ rX]
-add srcq, ssq
-pshufb m0, m14
-pshufb m1, m14
-pshufb m2, m14
-pshufb m3, m14
-vpblendd m4, m6, 0xc0
-vpblendd m5, m7, 0xc0
-pmaddubsw m4, m15
-pmaddubsw m5, [rsp]
-phaddw m4, m5
-pslld m5, m4, 16
-paddw m4, m5
-pmulhrsw m4, m12
-pblendw m0, m1, 0xaa
-pblendw m1, m2, 0xaa
-pblendw m2, m3, 0xaa
-pblendw m3, m4, 0xaa
-jmp .dy1_w8_loop
+mov dword [rsp+72], 1
+movifprep tmp_stridem, 16
+jmp .dy1_w_start
 .dy1_w16:
 mov dword [rsp+72], 2
 movifprep tmp_stridem, 32
@@ -3835,11 +3546,16 @@ cglobal prep_8tap_scaled, 4, 14, 16, 112, tmp, src, ss, w, h, mx, my, dx, dy
 mov dword [rsp+72], 16
 movifprep tmp_stridem, 256
 .dy1_w_start:
+mov myd, mym
 %ifidn %1, put
 movifnidn dsm, dsq
 %endif
 shr t0d, 16
 sub srcq, 3
+shr myd, 6
+mov r4d, 64 << 24
+lea myd, [t1+myq]
+cmovnz r4q, [base+subpel_filters+myq*8]
 pmaddwd m8, [base+rescale_mul]
 movd xm15, t0d
 mov [rsp+76], t0d
@@ -3851,6 +3567,10 @@ cglobal prep_8tap_scaled, 4, 14, 16, 112, tmp, src, ss, w, h, mx, my, dx, dy
 shl dword dxm, 3 ; dx*8
 vpbroadcastd m15, xm15
 paddd m14, m8 ; mx+dx*[0-7]
+movq xm0, r4q
+punpcklbw xm0, xm0
+psraw xm0, 8
+mova [rsp+96], xm0
 jmp .dy1_hloop
 .dy1_hloop_prep:
 dec dword [rsp+72]
@@ -3910,27 +3630,16 @@ cglobal prep_8tap_scaled, 4, 14, 16, 112, tmp, src, ss, w, h, mx, my, dx, dy
 MC_8TAP_SCALED_H 1, 2, 3, 4, 5, 6, 7, 8 ; 2a 3a 2b 3b
 MC_8TAP_SCALED_H 2, 3, 4, 5, 6, 7, 8, 9 ; 4a 5a 4b 5b
 MC_8TAP_SCALED_H 3, 4, 5, 6, 7, 8, 9, 11 ; 6a 7a 6b 7b
-mov myd, mym
 movu [rsp], m10
+vpbroadcastd m8, [rsp+0x60]
+vpbroadcastd m9, [rsp+0x64]
+vpbroadcastd m10, [rsp+0x68]
+vpbroadcastd m11, [rsp+0x6c]
 pshufb m0, m14 ; 01a 01b
 pshufb m1, m14 ; 23a 23b
 pshufb m2, m14 ; 45a 45b
 pshufb m3, m14 ; 67a 67b
-shr myd, 6
-mov r4d, 64 << 24
-lea myd, [t1+myq]
-cmovnz r4q, [base+subpel_filters+myq*8]
 vbroadcasti128 m14, [base+wswap]
-movq xm11, r4q
-punpcklbw xm11, xm11
-psraw xm11, 8
-vinserti128 m11, xm11, 1
-mov r4d, [rsp+64]
-mov r7d, [rsp+68]
-pshufd m8, m11, q0000
-pshufd m9, m11, q1111
-pshufd m10, m11, q2222
-pshufd m11, m11, q3333
 .dy1_vloop:
 pmaddwd m4, m0, m8
 pmaddwd m5, m1, m9
@@ -4182,137 +3891,9 @@ cglobal prep_8tap_scaled, 4, 14, 16, 112, tmp, src, ss, w, h, mx, my, dx, dy
 jg .dy2_w4_loop
 MC_8TAP_SCALED_RET
 .dy2_w8:
-%ifidn %1, put
-movifnidn dsm, dsq
-%endif
-shr t0d, 16
-sub srcq, 3
-movd xm15, t0d
-pmaddwd m8, [base+rescale_mul]
-vpbroadcastq m11, [base+pq_0x40000000]
-vpbroadcastd m15, xm15
-paddd m14, m8 ; mx+dx*[0-7]
-pand m6, m14, m10
-psrld m6, 6
-paddd m15, m6
-pcmpeqd m6, m9
-vextracti128 xm7, m15, 1
-movd r4d, xm15
-pextrd r6d, xm15, 2
-pextrd r7d, xm15, 1
-pextrd r9d, xm15, 3
-movd r10d, xm7
-pextrd r11d, xm7, 2
-pextrd r13d, xm7, 1
-pextrd rXd, xm7, 3
-movq xm15, [base+subpel_filters+ r4*8]
-movq xm10, [base+subpel_filters+ r6*8]
-movhps xm15, [base+subpel_filters+ r7*8]
-movhps xm10, [base+subpel_filters+ r9*8]
-vinserti128 m15, [base+subpel_filters+r10*8], 1
-vinserti128 m10, [base+subpel_filters+r11*8], 1
-vpbroadcastq m9, [base+subpel_filters+r13*8]
-vpbroadcastq m8, [base+subpel_filters+ rX*8]
-psrld m14, 10
-vextracti128 xm7, m14, 1
-movd r4d, xm14
-pextrd r6d, xm14, 2
-pextrd r7d, xm14, 1
-pextrd r9d, xm14, 3
-movd r10d, xm7
-pextrd r11d, xm7, 2
-pextrd r13d, xm7, 1
-pextrd rXd, xm7, 3
-mov [rsp], r7d
-pshufd m5, m6, q1100
-pshufd m6, m6, q3322
-vpblendd m15, m9, 0xc0
-vpblendd m10, m8, 0xc0
-pblendvb m15, m11, m5
-pblendvb m10, m11, m6
-vbroadcasti128 m14, [base+subpel_s_shuf8]
-MC_8TAP_SCALED_H 0, 1, 2, 3, 4, 5, 6, 7 ; 0a 1a 0b 1b
-MC_8TAP_SCALED_H 1, 2, 3, 4, 5, 6, 7, 8 ; 2a 3a 2b 3b
-MC_8TAP_SCALED_H 2, 3, 4, 5, 6, 7, 8, 9 ; 4a 5a 4b 5b
-MC_8TAP_SCALED_H 3, 4, 5, 6, 7, 8, 9, 11 ; 6a 7a 6b 7b
-mov myd, mym
-pshufb m0, m14 ; 01a 01b
-pshufb m1, m14 ; 23a 23b
-pshufb m2, m14 ; 45a 45b
-pshufb m3, m14 ; 67a 67b
-shr myd, 6
-lea myd, [t1+myq]
-mov t1d, 64 << 24
-cmovnz t1q, [base+subpel_filters+myq*8]
-movq xm11, t1q
-punpcklbw xm11, xm11
-psraw xm11, 8
-vinserti128 m11, xm11, 1
-mov r7d, [rsp]
-pshufd m8, m11, q0000
-pshufd m9, m11, q1111
-pshufd m14, m11, q2222
-pshufd m11, m11, q3333
-.dy2_w8_loop:
-pmaddwd m4, m0, m8
-pmaddwd m5, m1, m9
-pmaddwd m6, m2, m14
-pmaddwd m7, m3, m11
-paddd m4, m5
-paddd m6, m7
-paddd m4, m13
-paddd m4, m6
-psrad m4, rndshift
-vextracti128 xm5, m4, 1
-packssdw xm4, xm5
-%ifidn %1, put
-packuswb xm4, xm4
-movq [dstq], xm4
-add dstq, dsm
-%else
-mova [tmpq], xm4
-add tmpq, 16
-%endif
-dec hd
-jz .ret
-mova m0, m1
-mova m1, m2
-mova m2, m3
-movq xm3, [srcq+ r4]
-movq xm4, [srcq+ r6]
-movhps xm3, [srcq+ r7]
-movhps xm4, [srcq+ r9]
-vinserti128 m3, [srcq+r10], 1
-vinserti128 m4, [srcq+r11], 1
-vpbroadcastq m5, [srcq+r13]
-vpbroadcastq m6, [srcq+ rX]
-add srcq, ssq
-vpblendd m3, m5, 0xc0
-vpblendd m4, m6, 0xc0
-pmaddubsw m3, m15
-pmaddubsw m4, m10
-phaddw m3, m4
-movq xm4, [srcq+ r4]
-movq xm5, [srcq+ r6]
-movhps xm4, [srcq+ r7]
-movhps xm5, [srcq+ r9]
-vinserti128 m4, [srcq+r10], 1
-vinserti128 m5, [srcq+r11], 1
-vpbroadcastq m6, [srcq+r13]
-vpbroadcastq m7, [srcq+ rX]
-add srcq, ssq
-vpblendd m4, m6, 0xc0
-vpblendd m5, m7, 0xc0
-pmaddubsw m4, m15
-pmaddubsw m5, m10
-phaddw m4, m5
-psrld m5, m3, 16
-pslld m6, m4, 16
-paddw m3, m5
-paddw m4, m6
-pblendw m3, m4, 0xaa
-pmulhrsw m3, m12
-jmp .dy2_w8_loop
+mov dword [rsp+40], 1
+movifprep tmp_stridem, 16
+jmp .dy2_w_start
 .dy2_w16:
 mov dword [rsp+40], 2
 movifprep tmp_stridem, 32
@@ -4329,11 +3910,16 @@ cglobal prep_8tap_scaled, 4, 14, 16, 112, tmp, src, ss, w, h, mx, my, dx, dy
 mov dword [rsp+40], 16
 movifprep tmp_stridem, 256
 .dy2_w_start:
+mov myd, mym
 %ifidn %1, put
 movifnidn dsm, dsq
 %endif
 shr t0d, 16
 sub srcq, 3
+shr myd, 6
+mov r4d, 64 << 24
+lea myd, [t1+myq]
+cmovnz r4q, [base+subpel_filters+myq*8]
 pmaddwd m8, [base+rescale_mul]
 movd xm15, t0d
 mov [rsp+64], t0d
@@ -4345,6 +3931,10 @@ cglobal prep_8tap_scaled, 4, 14, 16, 112, tmp, src, ss, w, h, mx, my, dx, dy
 shl dword dxm, 3 ; dx*8
 vpbroadcastd m15, xm15
 paddd m14, m8 ; mx+dx*[0-7]
+movq xm0, r4q
+punpcklbw xm0, xm0
+psraw xm0, 8
+mova [rsp+0x50], xm0
 jmp .dy2_hloop
 .dy2_hloop_prep:
 dec dword [rsp+40]
@@ -4384,7 +3974,6 @@ cglobal prep_8tap_scaled, 4, 14, 16, 112, tmp, src, ss, w, h, mx, my, dx, dy
 vpbroadcastq m8, [base+subpel_filters+ rX*8]
 psrld m14, 10
 vextracti128 xm7, m14, 1
-movq [rsp+32], xm14
 movd r4d, xm14
 pextrd r6d, xm14, 2
 pextrd r7d, xm14, 1
@@ -4404,25 +3993,15 @@ cglobal prep_8tap_scaled, 4, 14, 16, 112, tmp, src, ss, w, h, mx, my, dx, dy
 MC_8TAP_SCALED_H 1, 2, 3, 4, 5, 6, 7, 8 ; 2a 3a 2b 3b
 MC_8TAP_SCALED_H 2, 3, 4, 5, 6, 7, 8, 9 ; 4a 5a 4b 5b
 MC_8TAP_SCALED_H 3, 4, 5, 6, 7, 8, 9, 11 ; 6a 7a 6b 7b
-mov myd, mym
+vpbroadcastd m8, [rsp+0x50]
+vpbroadcastd m9, [rsp+0x54]
+vpbroadcastd m11, [rsp+0x58]
+vpbroadcastd m4, [rsp+0x5c]
 pshufb m0, m14 ; 01a 01b
 pshufb m1, m14 ; 23a 23b
 pshufb m2, m14 ; 45a 45b
 pshufb m3, m14 ; 67a 67b
-shr myd, 6
-mov r4d, 64 << 24
-lea myd, [t1+myq]
-cmovnz r4q, [base+subpel_filters+myq*8]
-movq xm14, r4q
-punpcklbw xm14, xm14
-psraw xm14, 8
-vinserti128 m14, xm14, 1
-mov r4d, [rsp+32]
-mov r7d, [rsp+36]
-pshufd m8, m14, q0000
-pshufd m9, m14, q1111
-pshufd m11, m14, q2222
-pshufd m14, m14, q3333
+SWAP m14, m4
 .dy2_vloop:
 pmaddwd m4, m0, m8
 pmaddwd m5, m1, m9
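Reading note for the large deletions above: the dedicated .w8 code paths of the scaled motion-compensation functions (plain, dy1, and dy2 variants) are removed; .w8 now just sets its block count to 1 and jumps to the shared .w_start/.dy1_w_start/.dy2_w_start code, the same route .w16 and wider already take. The vertical filter coefficients are now computed once up front and parked on the stack ([rsp+96] and [rsp+0x50] in the new code), which is why the cglobal stack allocations grow by 16 bytes and the dsm/hm/tmp_stridem spill offsets shift accordingly.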

View File

@ -23,6 +23,7 @@
; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
%include "config.asm"
%include "ext/x86/x86inc.asm" %include "ext/x86/x86inc.asm"
%if HAVE_AVX512ICL && ARCH_X86_64 %if HAVE_AVX512ICL && ARCH_X86_64

View File

@@ -91,26 +91,46 @@ decl_mct_fn(dav1d_prep_bilin_ssse3);
 decl_mct_fn(dav1d_prep_bilin_sse2);
 decl_mc_scaled_fn(dav1d_put_8tap_scaled_regular_avx2);
+decl_mc_scaled_fn(dav1d_put_8tap_scaled_regular_ssse3);
 decl_mc_scaled_fn(dav1d_put_8tap_scaled_regular_smooth_avx2);
+decl_mc_scaled_fn(dav1d_put_8tap_scaled_regular_smooth_ssse3);
 decl_mc_scaled_fn(dav1d_put_8tap_scaled_regular_sharp_avx2);
+decl_mc_scaled_fn(dav1d_put_8tap_scaled_regular_sharp_ssse3);
 decl_mc_scaled_fn(dav1d_put_8tap_scaled_smooth_avx2);
+decl_mc_scaled_fn(dav1d_put_8tap_scaled_smooth_ssse3);
 decl_mc_scaled_fn(dav1d_put_8tap_scaled_smooth_regular_avx2);
+decl_mc_scaled_fn(dav1d_put_8tap_scaled_smooth_regular_ssse3);
 decl_mc_scaled_fn(dav1d_put_8tap_scaled_smooth_sharp_avx2);
+decl_mc_scaled_fn(dav1d_put_8tap_scaled_smooth_sharp_ssse3);
 decl_mc_scaled_fn(dav1d_put_8tap_scaled_sharp_avx2);
+decl_mc_scaled_fn(dav1d_put_8tap_scaled_sharp_ssse3);
 decl_mc_scaled_fn(dav1d_put_8tap_scaled_sharp_regular_avx2);
+decl_mc_scaled_fn(dav1d_put_8tap_scaled_sharp_regular_ssse3);
 decl_mc_scaled_fn(dav1d_put_8tap_scaled_sharp_smooth_avx2);
+decl_mc_scaled_fn(dav1d_put_8tap_scaled_sharp_smooth_ssse3);
 decl_mc_scaled_fn(dav1d_put_bilin_scaled_avx2);
+decl_mc_scaled_fn(dav1d_put_bilin_scaled_ssse3);
 decl_mct_scaled_fn(dav1d_prep_8tap_scaled_regular_avx2);
+decl_mct_scaled_fn(dav1d_prep_8tap_scaled_regular_ssse3);
 decl_mct_scaled_fn(dav1d_prep_8tap_scaled_regular_smooth_avx2);
+decl_mct_scaled_fn(dav1d_prep_8tap_scaled_regular_smooth_ssse3);
 decl_mct_scaled_fn(dav1d_prep_8tap_scaled_regular_sharp_avx2);
+decl_mct_scaled_fn(dav1d_prep_8tap_scaled_regular_sharp_ssse3);
 decl_mct_scaled_fn(dav1d_prep_8tap_scaled_smooth_avx2);
+decl_mct_scaled_fn(dav1d_prep_8tap_scaled_smooth_ssse3);
 decl_mct_scaled_fn(dav1d_prep_8tap_scaled_smooth_regular_avx2);
+decl_mct_scaled_fn(dav1d_prep_8tap_scaled_smooth_regular_ssse3);
 decl_mct_scaled_fn(dav1d_prep_8tap_scaled_smooth_sharp_avx2);
+decl_mct_scaled_fn(dav1d_prep_8tap_scaled_smooth_sharp_ssse3);
 decl_mct_scaled_fn(dav1d_prep_8tap_scaled_sharp_avx2);
+decl_mct_scaled_fn(dav1d_prep_8tap_scaled_sharp_ssse3);
 decl_mct_scaled_fn(dav1d_prep_8tap_scaled_sharp_regular_avx2);
+decl_mct_scaled_fn(dav1d_prep_8tap_scaled_sharp_regular_ssse3);
 decl_mct_scaled_fn(dav1d_prep_8tap_scaled_sharp_smooth_avx2);
+decl_mct_scaled_fn(dav1d_prep_8tap_scaled_sharp_smooth_ssse3);
 decl_mct_scaled_fn(dav1d_prep_bilin_scaled_avx2);
+decl_mct_scaled_fn(dav1d_prep_bilin_scaled_ssse3);
 decl_avg_fn(dav1d_avg_avx512icl);
 decl_avg_fn(dav1d_avg_avx2);
@@ -207,6 +227,30 @@ COLD void bitfn(dav1d_mc_dsp_init_x86)(Dav1dMCDSPContext *const c) {
     init_mct_fn(FILTER_2D_8TAP_SHARP_SMOOTH, 8tap_sharp_smooth, ssse3);
     init_mct_fn(FILTER_2D_8TAP_SHARP, 8tap_sharp, ssse3);
+#if ARCH_X86_64
+    init_mc_scaled_fn(FILTER_2D_8TAP_REGULAR, 8tap_scaled_regular, ssse3);
+    init_mc_scaled_fn(FILTER_2D_8TAP_REGULAR_SMOOTH, 8tap_scaled_regular_smooth, ssse3);
+    init_mc_scaled_fn(FILTER_2D_8TAP_REGULAR_SHARP, 8tap_scaled_regular_sharp, ssse3);
+    init_mc_scaled_fn(FILTER_2D_8TAP_SMOOTH_REGULAR, 8tap_scaled_smooth_regular, ssse3);
+    init_mc_scaled_fn(FILTER_2D_8TAP_SMOOTH, 8tap_scaled_smooth, ssse3);
+    init_mc_scaled_fn(FILTER_2D_8TAP_SMOOTH_SHARP, 8tap_scaled_smooth_sharp, ssse3);
+    init_mc_scaled_fn(FILTER_2D_8TAP_SHARP_REGULAR, 8tap_scaled_sharp_regular, ssse3);
+    init_mc_scaled_fn(FILTER_2D_8TAP_SHARP_SMOOTH, 8tap_scaled_sharp_smooth, ssse3);
+    init_mc_scaled_fn(FILTER_2D_8TAP_SHARP, 8tap_scaled_sharp, ssse3);
+    init_mc_scaled_fn(FILTER_2D_BILINEAR, bilin_scaled, ssse3);
+    init_mct_scaled_fn(FILTER_2D_8TAP_REGULAR, 8tap_scaled_regular, ssse3);
+    init_mct_scaled_fn(FILTER_2D_8TAP_REGULAR_SMOOTH, 8tap_scaled_regular_smooth, ssse3);
+    init_mct_scaled_fn(FILTER_2D_8TAP_REGULAR_SHARP, 8tap_scaled_regular_sharp, ssse3);
+    init_mct_scaled_fn(FILTER_2D_8TAP_SMOOTH_REGULAR, 8tap_scaled_smooth_regular, ssse3);
+    init_mct_scaled_fn(FILTER_2D_8TAP_SMOOTH, 8tap_scaled_smooth, ssse3);
+    init_mct_scaled_fn(FILTER_2D_8TAP_SMOOTH_SHARP, 8tap_scaled_smooth_sharp, ssse3);
+    init_mct_scaled_fn(FILTER_2D_8TAP_SHARP_REGULAR, 8tap_scaled_sharp_regular, ssse3);
+    init_mct_scaled_fn(FILTER_2D_8TAP_SHARP_SMOOTH, 8tap_scaled_sharp_smooth, ssse3);
+    init_mct_scaled_fn(FILTER_2D_8TAP_SHARP, 8tap_scaled_sharp, ssse3);
+    init_mct_scaled_fn(FILTER_2D_BILINEAR, bilin_scaled, ssse3);
+#endif
     c->avg = dav1d_avg_ssse3;
     c->w_avg = dav1d_w_avg_ssse3;
     c->mask = dav1d_mask_ssse3;
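The new init block is fenced with #if ARCH_X86_64, matching the %if ARCH_X86_64 guards around the scaled assembly itself: the SSSE3 scaled functions, like the AVX2 ones, are presumably only built for 64-bit. As a hedged paraphrase (the real init_mc_scaled_fn/init_mct_scaled_fn macros are defined earlier in this file and are not shown in the diff), the net effect of one such line is simply to install the prefixed symbol into the DSP dispatch table:

/* hedged paraphrase of init_mc_scaled_fn(FILTER_2D_8TAP_REGULAR,
 * 8tap_scaled_regular, ssse3): paste name and ISA suffix together and
 * store the function pointer in the context */
c->mc_scaled[FILTER_2D_8TAP_REGULAR] = dav1d_put_8tap_scaled_regular_ssse3;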

File diff suppressed because it is too large.

View File

@@ -23,6 +23,7 @@
 ; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 ; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+%include "config.asm"
 %include "ext/x86/x86inc.asm"
 SECTION_RODATA 64 ; avoids cacheline splits

View File

@@ -518,9 +518,7 @@ static void print_cpu_name(void) {
 }
 int main(int argc, char *argv[]) {
-    (void)func_new, (void)func_ref;
     state.seed = get_seed();
-    int ret = 0;
     while (argc > 1) {
         if (!strncmp(argv[1], "--help", 6)) {
@@ -568,6 +566,24 @@ int main(int argc, char *argv[]) {
     dav1d_init_cpu();
+#ifdef readtime
+    if (state.bench_pattern) {
+        static int testing = 0;
+        checkasm_save_context();
+        if (!testing) {
+            checkasm_set_signal_handler_state(1);
+            testing = 1;
+            readtime();
+            checkasm_set_signal_handler_state(0);
+        } else {
+            fprintf(stderr, "checkasm: unable to access cycle counter\n");
+            return 1;
+        }
+    }
+#endif
+    int ret = 0;
     if (!state.function_listing) {
         fprintf(stderr, "checkasm: using random seed %u\n", state.seed);
 #if ARCH_X86_64
@@ -672,7 +688,9 @@ int checkasm_bench_func(void) {
 /* Indicate that the current test has failed, return whether verbose printing
  * is requested. */
 int checkasm_fail_func(const char *const msg, ...) {
-    if (state.current_func_ver->cpu && state.current_func_ver->ok) {
+    if (state.current_func_ver && state.current_func_ver->cpu &&
+        state.current_func_ver->ok)
+    {
         va_list arg;
         print_cpu_name();

View File

@@ -86,8 +86,6 @@ int float_near_abs_eps_array(const float *a, const float *b, float eps,
 int float_near_abs_eps_array_ulp(const float *a, const float *b, float eps,
                                  unsigned max_ulp, int len);
-static void *func_ref, *func_new;
 #define BENCH_RUNS (1 << 12) /* Trade-off between accuracy and speed */
 /* Decide whether or not the specified function needs to be tested */
@@ -99,6 +97,7 @@ static void *func_ref, *func_new;
  * is optional. */
 #define declare_func(ret, ...)\
     declare_new(ret, __VA_ARGS__)\
+    void *func_ref, *func_new;\
     typedef ret func_type(__VA_ARGS__);\
     checkasm_save_context()
@@ -127,6 +126,9 @@ static inline uint64_t readtime(void) {
 }
 #define readtime readtime
 #endif
+#elif (ARCH_AARCH64 || ARCH_ARM) && defined(__APPLE__)
+#include <mach/mach_time.h>
+#define readtime() mach_absolute_time()
 #elif ARCH_AARCH64
 #ifdef _MSC_VER
 #include <windows.h>
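For the new Apple ARM branch above: rather than attempting a raw cycle-counter read that may trap in user space, checkasm falls back to mach_absolute_time(), a monotonic tick counter that is sufficient for the relative timings benchmarking needs. A minimal usage sketch (bench_ticks is a hypothetical name, not part of the diff):

#include <mach/mach_time.h>
#include <stdint.h>

/* relative tick counts are enough for benchmarking; converting ticks
 * to nanoseconds would go through mach_timebase_info(), which
 * checkasm does not need */
static inline uint64_t bench_ticks(void) {
    return mach_absolute_time();
}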

View File

@@ -140,11 +140,11 @@ static void check_decode_symbol(MsacDSPContext *const c, uint8_t *const buf) {
     report("decode_symbol");
 }
-static void check_decode_bool(MsacDSPContext *const c, uint8_t *const buf) {
+static void check_decode_bool_adapt(MsacDSPContext *const c, uint8_t *const buf) {
     MsacContext s_c, s_a;
+    declare_func(unsigned, MsacContext *s, uint16_t *cdf);
     if (check_func(c->bool_adapt, "msac_decode_bool_adapt")) {
-        declare_func(unsigned, MsacContext *s, uint16_t *cdf);
         uint16_t cdf[2][2];
         for (int cdf_update = 0; cdf_update <= 1; cdf_update++) {
             dav1d_msac_init(&s_c, buf, BUF_SIZE, !cdf_update);
@@ -165,9 +165,13 @@ static void check_decode_bool(MsacDSPContext *const c, uint8_t *const buf) {
             bench_new(&s_a, cdf[1]);
         }
     }
+}
+static void check_decode_bool_equi(MsacDSPContext *const c, uint8_t *const buf) {
+    MsacContext s_c, s_a;
+    declare_func(unsigned, MsacContext *s);
     if (check_func(c->bool_equi, "msac_decode_bool_equi")) {
-        declare_func(unsigned, MsacContext *s);
         dav1d_msac_init(&s_c, buf, BUF_SIZE, 1);
         s_a = s_c;
         for (int i = 0; i < 64; i++) {
@@ -180,9 +184,13 @@ static void check_decode_bool(MsacDSPContext *const c, uint8_t *const buf) {
         }
         bench_new(&s_a);
     }
+}
+static void check_decode_bool(MsacDSPContext *const c, uint8_t *const buf) {
+    MsacContext s_c, s_a;
+    declare_func(unsigned, MsacContext *s, unsigned f);
     if (check_func(c->bool, "msac_decode_bool")) {
-        declare_func(unsigned, MsacContext *s, unsigned f);
         dav1d_msac_init(&s_c, buf, BUF_SIZE, 1);
         s_a = s_c;
         for (int i = 0; i < 64; i++) {
@@ -197,6 +205,12 @@ static void check_decode_bool(MsacDSPContext *const c, uint8_t *const buf) {
         bench_new(&s_a, 16384);
     }
 }
+static void check_decode_bool_funcs(MsacDSPContext *const c, uint8_t *const buf) {
+    check_decode_bool_adapt(c, buf);
+    check_decode_bool_equi(c, buf);
+    check_decode_bool(c, buf);
     report("decode_bool");
 }
@@ -204,8 +218,8 @@ static void check_decode_hi_tok(MsacDSPContext *const c, uint8_t *const buf) {
     ALIGN_STK_16(uint16_t, cdf, 2, [16]);
     MsacContext s_c, s_a;
+    declare_func(unsigned, MsacContext *s, uint16_t *cdf);
     if (check_func(c->hi_tok, "msac_decode_hi_tok")) {
-        declare_func(unsigned, MsacContext *s, uint16_t *cdf);
         for (int cdf_update = 0; cdf_update <= 1; cdf_update++) {
             dav1d_msac_init(&s_c, buf, BUF_SIZE, !cdf_update);
             s_a = s_c;
@@ -272,6 +286,6 @@ void checkasm_check_msac(void) {
         buf[i] = rnd();
     check_decode_symbol(&c, buf);
-    check_decode_bool(&c, buf);
+    check_decode_bool_funcs(&c, buf);
     check_decode_hi_tok(&c, buf);
 }
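The helper split above pairs with the checkasm.h change earlier in this commit: declare_func now declares func_ref/func_new and the func_type typedef in the enclosing scope, so it can appear only once per function and must come before check_func. Giving msac_decode_bool_adapt, _equi, and plain _bool their own helper functions, each with its own declare_func signature, satisfies that constraint, while check_decode_bool_funcs preserves the single report("decode_bool") entry point.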

View File

@@ -23,8 +23,9 @@
 ; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 ; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-%define private_prefix checkasm
 %include "config.asm"
+%undef private_prefix
+%define private_prefix checkasm
 %include "ext/x86/x86inc.asm"
 SECTION_RODATA 16