Bug 1800912 - Update libdav1d to revision 4b9f5b704. r=media-playback-reviewers,alwu

Differential Revision: https://phabricator.services.mozilla.com/D162211
Paul Adenot 2022-12-01 10:44:20 +00:00
parent 67f82af8fd
commit d17cafec50
39 changed files with 3733 additions and 965 deletions


@ -98,6 +98,7 @@ if CONFIG['CPU_ARCH'] in ('x86', 'x86_64'):
'../../../third_party/dav1d/src/x86/ipred_avx2.asm',
'../../../third_party/dav1d/src/x86/ipred_avx512.asm',
'../../../third_party/dav1d/src/x86/itx16_avx2.asm',
'../../../third_party/dav1d/src/x86/itx16_avx512.asm',
'../../../third_party/dav1d/src/x86/itx_avx2.asm',
'../../../third_party/dav1d/src/x86/itx_avx512.asm',
'../../../third_party/dav1d/src/x86/loopfilter16_avx2.asm',


@ -20,11 +20,11 @@ origin:
# Human-readable identifier for this version/release
# Generally "version NNN", "tag SSS", "bookmark SSS"
release: cd5e415270285a58f48c1e9ec1a2dd024b9acf9f (2022-08-19T13:58:13.000-03:00).
release: 4b9f5b704e299543afcea87f375a308b90ef6c70 (2022-11-10T00:58:40.000+00:00).
# Revision to pull in
# Must be a long or short commit SHA (long preferred)
revision: cd5e415270285a58f48c1e9ec1a2dd024b9acf9f
revision: 4b9f5b704e299543afcea87f375a308b90ef6c70
# The package's license, where possible using the mnemonic from
# https://spdx.org/licenses/


@ -1,2 +1,2 @@
/* auto-generated, do not edit */
#define DAV1D_VERSION "cd5e415270285a58f48c1e9ec1a2dd024b9acf9f"
#define DAV1D_VERSION "4b9f5b704e299543afcea87f375a308b90ef6c70"


@ -1,3 +1,4 @@
exclude = .*/tests/.*
exclude = .*/tools/.*
exclude = .*/include/common/dump.h
gcov-ignore-parse-errors = yes


@ -113,6 +113,12 @@
#define ALWAYS_INLINE __attribute__((always_inline)) inline
#endif
#if (defined(__ELF__) || defined(__MACH__) || (defined(_WIN32) && defined(__clang__))) && __has_attribute(visibility)
#define EXTERN extern __attribute__((visibility("hidden")))
#else
#define EXTERN extern
#endif
#ifdef __clang__
#define NO_SANITIZE(x) __attribute__((no_sanitize(x)))
#else


@ -44,6 +44,7 @@ typedef unsigned int atomic_uint;
#define atomic_fetch_sub(p_a, dec) __atomic_fetch_sub(p_a, dec, __ATOMIC_SEQ_CST)
#define atomic_exchange(p_a, v) __atomic_exchange_n(p_a, v, __ATOMIC_SEQ_CST)
#define atomic_fetch_or(p_a, v) __atomic_fetch_or(p_a, v, __ATOMIC_SEQ_CST)
#define atomic_compare_exchange_strong(p_a, expected, desired) __atomic_compare_exchange_n(p_a, expected, desired, 0, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST)
#endif /* !defined(__cplusplus) */


@ -55,6 +55,15 @@ typedef enum {
#define atomic_exchange(p_a, v) InterlockedExchange(p_a, v)
#define atomic_load_explicit(p_a, mo) atomic_load(p_a)
static inline int atomic_compare_exchange_strong_int(LONG *obj, LONG *expected,
LONG desired)
{
LONG orig = *expected;
*expected = InterlockedCompareExchange(obj, desired, orig);
return *expected == orig;
}
#define atomic_compare_exchange_strong(p_a, expected, desired) atomic_compare_exchange_strong_int((LONG *)p_a, (LONG *)expected, (LONG)desired)
/*
* TODO use a special call to increment/decrement
* using InterlockedIncrement/InterlockedDecrement
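
Both compat headers now provide a C11-style atomic_compare_exchange_strong. The contract, which the InterlockedCompareExchange wrapper above reproduces: if *obj equals *expected, store desired into *obj and return true; otherwise copy the observed value into *expected and return false. A minimal usage sketch (the counter and function names are hypothetical, not from the diff):

#include <stdatomic.h>

atomic_uint counter;

static int try_reset(void) {
    unsigned seen = atomic_load(&counter);
    /* On failure, `seen` is updated to the value actually in `counter`. */
    return atomic_compare_exchange_strong(&counter, &seen, 0u);
}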


@ -126,7 +126,7 @@ DAV1D_API int dav1d_open(Dav1dContext **c_out, const Dav1dSettings *s);
* 0: Success, and out is filled with the parsed Sequence Header
* OBU parameters.
* DAV1D_ERR(ENOENT): No Sequence Header OBUs were found in the buffer.
* other negative DAV1D_ERR codes: Invalid data in the buffer, invalid passed-in
* Other negative DAV1D_ERR codes: Invalid data in the buffer, invalid passed-in
* arguments, and other errors during parsing.
*
* @note It is safe to feed this function data containing other OBUs than a
@ -137,7 +137,8 @@ DAV1D_API int dav1d_parse_sequence_header(Dav1dSequenceHeader *out,
const uint8_t *buf, const size_t sz);
/**
* Feed bitstream data to the decoder.
* Feed bitstream data to the decoder, in the form of one or multiple AV1
* Open Bitstream Units (OBUs).
*
* @param c Input decoder instance.
* @param in Input bitstream data. On success, ownership of the reference is
@ -148,8 +149,9 @@ DAV1D_API int dav1d_parse_sequence_header(Dav1dSequenceHeader *out,
* DAV1D_ERR(EAGAIN): The data can't be consumed. dav1d_get_picture() should
* be called to get one or more frames before the function
* can consume new data.
* other negative DAV1D_ERR codes: Error during decoding or because of invalid
* passed-in arguments.
* Other negative DAV1D_ERR codes: Error during decoding or because of invalid
* passed-in arguments. The reference remains
* owned by the caller.
*/
DAV1D_API int dav1d_send_data(Dav1dContext *c, Dav1dData *in);
@ -164,7 +166,7 @@ DAV1D_API int dav1d_send_data(Dav1dContext *c, Dav1dData *in);
* 0: Success, and a frame is returned.
* DAV1D_ERR(EAGAIN): Not enough data to output a frame. dav1d_send_data()
* should be called with new input.
* other negative DAV1D_ERR codes: Error during decoding or because of invalid
* Other negative DAV1D_ERR codes: Error during decoding or because of invalid
* passed-in arguments.
*
* @note To drain buffered frames from the decoder (i.e. on end of stream),
@ -216,7 +218,7 @@ DAV1D_API int dav1d_get_picture(Dav1dContext *c, Dav1dPicture *out);
*
* @return
* 0: Success, and a frame is returned.
* other negative DAV1D_ERR codes: Error due to lack of memory or because of
* Other negative DAV1D_ERR codes: Error due to lack of memory or because of
* invalid passed-in arguments.
*
* @note If `Dav1dSettings.apply_grain` is true, film grain was already applied
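
The reworded return-value docs pin down ownership: dav1d_send_data takes the Dav1dData reference only on success; on EAGAIN or any other error it stays with the caller. A minimal sketch of a feed/drain loop honoring that contract (error handling reduced to the essentials; not upstream example code):

#include <errno.h>
#include "dav1d/dav1d.h"

/* Sketch only; assumes `c` is an open decoder and `data` holds OBUs. */
static int feed_and_drain(Dav1dContext *c, Dav1dData *data) {
    int res = dav1d_send_data(c, data);
    if (res < 0 && res != DAV1D_ERR(EAGAIN))
        dav1d_data_unref(data);          /* error: reference is still ours */

    Dav1dPicture pic = { 0 };
    while ((res = dav1d_get_picture(c, &pic)) == 0) {
        /* ...consume the frame... */
        dav1d_picture_unref(&pic);
    }
    return res; /* DAV1D_ERR(EAGAIN): send more data (or drain at EOS) */
}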


@ -63,7 +63,7 @@ endforeach
# ASM option
is_asm_enabled = (get_option('enable_asm') == true and
(host_machine.cpu_family() == 'x86' or
(host_machine.cpu_family() == 'x86_64' and cc.get_define('__ILP32__') == '') or
(host_machine.cpu_family() == 'x86_64' and cc.get_define('__ILP32__').strip() == '') or
host_machine.cpu_family() == 'aarch64' or
host_machine.cpu_family().startswith('arm') or
host_machine.cpu() == 'ppc64le'))
@ -87,16 +87,11 @@ cdata.set10('CONFIG_LOG', get_option('logging'))
test_args = []
optional_arguments = []
optional_link_arguments = []
if host_machine.system() == 'linux'
if host_machine.system() in ['linux', 'gnu']
test_args += '-D_GNU_SOURCE'
add_project_arguments('-D_GNU_SOURCE', language: 'c')
elif host_machine.system() == 'darwin'
test_args += '-D_DARWIN_C_SOURCE'
add_project_arguments('-D_DARWIN_C_SOURCE', language: 'c')
else
test_args += '-D_POSIX_C_SOURCE=200112L'
add_project_arguments('-D_POSIX_C_SOURCE=200112L', language: 'c')
endif
if host_machine.system() == 'windows'
@ -112,6 +107,18 @@ if host_machine.system() == 'windows'
cdata.set('ftello', '_ftelli64')
endif
if host_machine.cpu_family() == 'x86_64'
if cc.get_argument_syntax() != 'msvc'
optional_link_arguments += '-Wl,--dynamicbase,--nxcompat,--tsaware,--high-entropy-va'
endif
elif host_machine.cpu_family() == 'x86' or host_machine.cpu_family() == 'arm'
if cc.get_argument_syntax() == 'msvc'
optional_link_arguments += '/largeaddressaware'
else
optional_link_arguments += '-Wl,--dynamicbase,--nxcompat,--tsaware,--large-address-aware'
endif
endif
# On Windows, we use a compatibility layer to emulate pthread
thread_dependency = []
thread_compat_dep = declare_dependency(sources : files('src/win32/thread.c'))
@ -135,7 +142,7 @@ else
rt_dependency = []
if cc.has_function('clock_gettime', prefix : '#include <time.h>', args : test_args)
cdata.set('HAVE_CLOCK_GETTIME', 1)
elif host_machine.system() != 'darwin'
elif host_machine.system() not in ['darwin', 'ios', 'tvos']
rt_dependency = cc.find_library('rt', required: false)
if not cc.has_function('clock_gettime', prefix : '#include <time.h>', args : test_args, dependencies : rt_dependency)
error('clock_gettime not found')
@ -248,6 +255,10 @@ if cc.has_function('pthread_getaffinity_np', prefix : pthread_np_prefix, args :
cdata.set('HAVE_PTHREAD_GETAFFINITY_NP', 1)
endif
if cc.compiles('int x = _Generic(0, default: 0);', name: '_Generic', args: test_args)
cdata.set('HAVE_C11_GENERIC', 1)
endif
# Compiler flag tests
if cc.has_argument('-fvisibility=hidden')
@ -289,13 +300,14 @@ if (get_option('buildtype') != 'debug' and get_option('buildtype') != 'plain')
optional_arguments += '-ffast-math'
endif
if (host_machine.system() == 'darwin' and cc.get_id() == 'clang' and
if (host_machine.system() in ['darwin', 'ios', 'tvos'] and cc.get_id() == 'clang' and
cc.version().startswith('11'))
# Workaround for Xcode 11 -fstack-check bug, see #301
optional_arguments += '-fno-stack-check'
endif
add_project_arguments(cc.get_supported_arguments(optional_arguments), language : 'c')
add_project_link_arguments(cc.get_supported_link_arguments(optional_link_arguments), language : 'c')
# libFuzzer related things
fuzzing_engine = get_option('fuzzing_engine')
@ -330,7 +342,7 @@ if host_machine.cpu_family().startswith('x86')
stack_alignment = 16
endif
else
if host_machine.system() == 'linux' or host_machine.system() == 'darwin'
if host_machine.system() == 'linux' or host_machine.system() in ['darwin', 'ios', 'tvos']
stack_alignment = 16
elif cc.has_argument('-mpreferred-stack-boundary=4')
stackalign_flag = ['-mpreferred-stack-boundary=4']
@ -391,7 +403,7 @@ if host_machine.cpu_family().startswith('x86')
cdata_asm.set10('PIC', true)
# Convert SSE asm into (128-bit) AVX when compiler flags are set to use AVX instructions
cdata_asm.set10('FORCE_VEX_ENCODING', cc.get_define('__AVX__') != '')
cdata_asm.set10('FORCE_VEX_ENCODING', cc.get_define('__AVX__').strip() != '')
endif
cdata.set10('ARCH_PPC64LE', host_machine.cpu() == 'ppc64le')
@ -399,7 +411,7 @@ cdata.set10('ARCH_PPC64LE', host_machine.cpu() == 'ppc64le')
# meson's cc.symbols_have_underscore_prefix() is unfortunately unreliable
# when additional flags like '-fprofile-instr-generate' are passed via CFLAGS
# see following meson issue https://github.com/mesonbuild/meson/issues/5482
if (host_machine.system() == 'darwin' or
if (host_machine.system() in ['darwin', 'ios', 'tvos'] or
(host_machine.system() == 'windows' and host_machine.cpu_family() == 'x86'))
cdata.set10('PREFIX', true)
cdata_asm.set10('PREFIX', true)
@ -433,7 +445,7 @@ if is_asm_enabled and host_machine.cpu_family().startswith('x86')
if host_machine.system() == 'windows'
nasm_format = 'win'
elif host_machine.system() == 'darwin'
elif host_machine.system() in ['darwin', 'ios', 'tvos']
nasm_format = 'macho'
else
nasm_format = 'elf'
@ -462,7 +474,8 @@ use_gaspp = false
if (is_asm_enabled and
(host_machine.cpu_family() == 'aarch64' or
host_machine.cpu_family().startswith('arm')) and
cc.get_argument_syntax() == 'msvc')
cc.get_argument_syntax() == 'msvc' and
(cc.get_id() != 'clang-cl' or meson.version().version_compare('<0.58.0')))
gaspp = find_program('gas-preprocessor.pl')
use_gaspp = true
gaspp_gen = generator(gaspp,


@ -668,12 +668,21 @@ def_fn_4x4 identity, flipadst
.macro idct_4s_x8 r0, r1, r2, r3, r4, r5, r6, r7
idct_4s_x4 \r0, \r2, \r4, \r6
vmov.i32 q5, #0x1ffff // row_clip_max = ~(~bdmax << 7), 0x1ffff
vmvn.i32 q4, #0x1ffff // row_clip_min = (~bdmax << 7), 0xfffe0000
.irp r, \r0, \r2, \r4, \r6
vmin.s32 \r, \r, q5
.endr
.irp r, \r0, \r2, \r4, \r6
vmax.s32 \r, \r, q4
.endr
vmul_vmls q2, \r1, \r7, d2[0], d2[1] // -> t4a
vmul_vmla q4, \r1, \r7, d2[1], d2[0] // -> t7a
vmul_vmla q3, \r1, \r7, d2[1], d2[0] // -> t7a
vmul_vmls q6, \r5, \r3, d3[0], d3[1] // -> t5a
vmul_vmla q7, \r5, \r3, d3[1], d3[0] // -> t6a
vrshr.s32 \r1, q2, #12 // t4a
vrshr.s32 \r7, q4, #12 // t7a
vrshr.s32 \r7, q3, #12 // t7a
vrshr.s32 \r3, q6, #12 // t5a
vrshr.s32 \r5, q7, #12 // t6a
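
The vmov.i32/vmvn.i32 pair added at the top of the macro materializes the intermediate clipping bounds that the following .irp vmin/vmax loops apply lane-wise. For 10-bit content (bdmax = 0x3ff) the constants in the comments fall out directly; a per-lane C sketch of what each vmin/vmax pair computes:

#include <stdint.h>

/* Per-lane equivalent of the vmin/vmax pairs (illustrative sketch). */
static int32_t row_clip(int32_t v) {
    const int32_t bdmax = 0x3ff;                           /* 10-bit max */
    const int32_t max = (int32_t)~(~(uint32_t)bdmax << 7); /* 0x1ffff */
    const int32_t min = ~max;               /* 0xfffe0000, i.e. -(1<<17) */
    return v < min ? min : v > max ? max : v;
}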
@ -682,17 +691,24 @@ def_fn_4x4 identity, flipadst
vqadd.s32 q3, \r7, \r5 // t7
vqsub.s32 \r3, \r7, \r5 // t6a
vmul_vmls q4, \r3, \r1, d0[0], d0[0] // -> t5
.irp r, q2, \r1, q3, \r3
vmin.s32 \r, \r, q5
.endr
.irp r, q2, \r1, q3, \r3
vmax.s32 \r, \r, q4
.endr
vmul_vmls q7, \r3, \r1, d0[0], d0[0] // -> t5
vmul_vmla q6, \r3, \r1, d0[0], d0[0] // -> t6
vrshr.s32 q4, q4, #12 // t5
vrshr.s32 q7, q7, #12 // t5
vrshr.s32 q5, q6, #12 // t6
vqsub.s32 \r7, \r0, q3 // out7
vqadd.s32 \r0, \r0, q3 // out0
vqadd.s32 \r1, \r2, q5 // out1
vqsub.s32 q6, \r2, q5 // out6
vqadd.s32 \r2, \r4, q4 // out2
vqsub.s32 \r5, \r4, q4 // out5
vqadd.s32 \r2, \r4, q7 // out2
vqsub.s32 \r5, \r4, q7 // out5
vqadd.s32 \r3, \r6, q2 // out3
vqsub.s32 \r4, \r6, q2 // out4
vmov \r6, q6 // out6
@ -701,6 +717,15 @@ def_fn_4x4 identity, flipadst
.macro idct_2s_x8 r0, r1, r2, r3, r4, r5, r6, r7
idct_2s_x4 \r0, \r2, \r4, \r6
vmov.i32 d9, #0x1ffff // row_clip_max = ~(~bdmax << 7), 0x1ffff
vmvn.i32 d8, #0x1ffff // row_clip_min = (~bdmax << 7), 0xfffe0000
.irp r, \r0, \r2, \r4, \r6
vmin.s32 \r, \r, d9
.endr
.irp r, \r0, \r2, \r4, \r6
vmax.s32 \r, \r, d8
.endr
vmul_vmls d4, \r1, \r7, d2[0], d2[1] // -> t4a
vmul_vmla d5, \r1, \r7, d2[1], d2[0] // -> t7a
vmul_vmls d6, \r5, \r3, d3[0], d3[1] // -> t5a
@ -715,6 +740,13 @@ def_fn_4x4 identity, flipadst
vqadd.s32 d5, \r7, \r5 // t7
vqsub.s32 \r3, \r7, \r5 // t6a
.irp r, d4, \r1, d5, \r3
vmin.s32 \r, \r, d9
.endr
.irp r, d4, \r1, d5, \r3
vmax.s32 \r, \r, d8
.endr
vmul_vmls d6, \r3, \r1, d0[0], d0[0] // -> t5
vmul_vmla d7, \r3, \r1, d0[0], d0[0] // -> t6
vrshr.s32 d6, d6, #12 // t5
@ -763,19 +795,28 @@ endfunc
vqadd.s32 q2, q8, q12 // t0
vqsub.s32 q3, q8, q12 // t4
vmov.i32 q12, #0x1ffff // row_clip_max = ~(~bdmax << 7), 0x1ffff
vqadd.s32 q4, q15, q11 // t1
vqsub.s32 q5, q15, q11 // t5
vqadd.s32 q6, q10, q14 // t2
vqsub.s32 q7, q10, q14 // t6
vmvn.i32 q14, #0x1ffff // row_clip_min = (~bdmax << 7), 0xfffe0000
vqadd.s32 q10, q13, q9 // t3
vqsub.s32 q11, q13, q9 // t7
.irp r, q2, q3, q4, q5, q6, q7, q10, q11
vmin.s32 \r, \r, q12
.endr
.irp r, q2, q3, q4, q5, q6, q7, q10, q11
vmax.s32 \r, \r, q14
.endr
vmul_vmla q8, q3, q5, d1[1], d1[0]
vmul_vmls q12, q3, q5, d1[0], d1[1]
vmul_vmls q13, q3, q5, d1[0], d1[1]
vmul_vmls q14, q11, q7, d1[1], d1[0]
vrshr.s32 q3, q8, #12 // t4a
vrshr.s32 q5, q12, #12 // t5a
vrshr.s32 q5, q13, #12 // t5a
vmul_vmla q8, q11, q7, d1[0], d1[1]
@ -786,12 +827,24 @@ endfunc
vqsub.s32 q2, q2, q6 // t2
vqadd.s32 \r7, q4, q10 // out7
vqsub.s32 q4, q4, q10 // t3
vqneg.s32 \r7, \r7 // out7
vmvn.i32 q10, #0x1ffff // row_clip_min = (~bdmax << 7), 0xfffe0000
vqadd.s32 \r1, q3, q7 // out1
vqsub.s32 q3, q3, q7 // t6
vqadd.s32 \r6, q5, q11 // out6
vqsub.s32 q5, q5, q11 // t7
// Not clipping the output registers, as they will be downshifted and
// narrowed afterwards anyway.
.irp r, q2, q4, q3, q5
vmin.s32 \r, \r, q12
.endr
.irp r, q2, q4, q3, q5
vmax.s32 \r, \r, q10
.endr
vqneg.s32 \r7, \r7 // out7
vqneg.s32 \r1, \r1 // out1
vmul_vmla q10, q2, q4, d0[0], d0[0] // -> out3 (q11 or q12)
@ -1068,6 +1121,14 @@ function inv_dct_2s_x16_neon
idct_2s_x8 d16, d18, d20, d22, d24, d26, d28, d30
// idct_8 leaves the row_clip_max/min constants in d9 and d8
.irp r, d16, d18, d20, d22, d24, d26, d28, d30
vmin.s32 \r, \r, d9
.endr
.irp r, d16, d18, d20, d22, d24, d26, d28, d30
vmax.s32 \r, \r, d8
.endr
vld1.32 {q0, q1}, [r12, :128]
sub r12, r12, #32
@ -1099,6 +1160,13 @@ function inv_dct_2s_x16_neon
vqadd.s32 d25, d29, d27 // t12
vqsub.s32 d29, d29, d27 // t13
.irp r, d4, d17, d5, d31, d23, d19, d25, d29
vmin.s32 \r, \r, d9
.endr
.irp r, d4, d17, d5, d31, d23, d19, d25, d29
vmax.s32 \r, \r, d8
.endr
vmul_vmls d6, d5, d4, d1[0], d1[1] // -> t9a
vmul_vmla d7, d5, d4, d1[1], d1[0] // -> t14a
vrshr.s32 d21, d6, #12 // t9a
@ -1119,6 +1187,13 @@ function inv_dct_2s_x16_neon
vqsub.s32 d25, d27, d29 // t13
vqadd.s32 d27, d27, d29 // t14
.irp r, d4, d17, d5, d31, d19, d21, d25, d27
vmin.s32 \r, \r, d9
.endr
.irp r, d4, d17, d5, d31, d19, d21, d25, d27
vmax.s32 \r, \r, d8
.endr
vmul_vmls d6, d5, d4, d0[0], d0[0] // -> t11
vmul_vmla d7, d5, d4, d0[0], d0[0] // -> t12
vmul_vmls d4, d25, d21, d0[0], d0[0] // -> t10a
@ -1193,6 +1268,9 @@ endfunc
vld1.32 {q0, q1}, [r12, :128]
vmov.i32 d11, #0x1ffff // row_clip_max = ~(~bdmax << 7), 0x1ffff
vmvn.i32 d10, #0x1ffff // row_clip_min = (~bdmax << 7), 0xfffe0000
vqsub.s32 d5, d16, d23 // t8a
vqadd.s32 d16, d16, d23 // t0a
vqsub.s32 d7, d31, d24 // t9a
@ -1210,6 +1288,13 @@ endfunc
vqadd.s32 d28, d25, d30 // t7a
vqsub.s32 d25, d25, d30 // t15a
.irp r, d5, d16, d7, d31, d23, d18, d24, d29, d21, d20, d26, d27, d19, d22, d28, d25
vmin.s32 \r, \r, d11
.endr
.irp r, d5, d16, d7, d31, d23, d18, d24, d29, d21, d20, d26, d27, d19, d22, d28, d25
vmax.s32 \r, \r, d10
.endr
vmul_vmla d4, d5, d7, d2[1], d2[0] // -> t8
vmul_vmls d6, d5, d7, d2[0], d2[1] // -> t9
vmul_vmla d8, d18, d29, d3[1], d3[0] // -> t10
@ -1244,6 +1329,13 @@ endfunc
vqadd.s32 d20, d29, d22 // t11a
vqsub.s32 d29, d29, d22 // t15a
.irp r, d2, d16, d3, d31, d21, d23, d26, d24, d19, d17, d28, d30, d27, d18, d20, d29
vmin.s32 \r, \r, d11
.endr
.irp r, d2, d16, d3, d31, d21, d23, d26, d24, d19, d17, d28, d30, d27, d18, d20, d29
vmax.s32 \r, \r, d10
.endr
vmul_vmla d4, d2, d3, d1[1], d1[0] // -> t4a
vmul_vmls d6, d2, d3, d1[0], d1[1] // -> t5a
vmul_vmls d8, d24, d23, d1[1], d1[0] // -> t6a
@ -1272,24 +1364,34 @@ endfunc
vqadd.s32 \o15,d31, d26 // out15
vmov \o0, d4
.endif
vqneg.s32 \o15, \o15 // out15
vqsub.s32 d3, d29, d18 // t15a
vqadd.s32 \o13,d29, d18 // out13
vqadd.s32 \o2, d17, d30 // out2
vqsub.s32 d26, d17, d30 // t14a
vqneg.s32 \o13,\o13 // out13
vqadd.s32 \o1, d19, d27 // out1
vqsub.s32 d27, d19, d27 // t10
vqadd.s32 \o14,d28, d20 // out14
vqsub.s32 d20, d28, d20 // t11
vqneg.s32 \o1, \o1 // out1
vqadd.s32 \o3, d22, d24 // out3
vqsub.s32 d22, d22, d24 // t6
vqadd.s32 \o12,d25, d23 // out12
vqsub.s32 d23, d25, d23 // t7
// Not clipping the output registers, as they will be downshifted and
// narrowed afterwards anyway.
.irp r, d2, d21, d3, d26, d27, d20, d22, d23
vmin.s32 \r, \r, d11
.endr
.irp r, d2, d21, d3, d26, d27, d20, d22, d23
vmax.s32 \r, \r, d10
.endr
vqneg.s32 \o15, \o15 // out15
vqneg.s32 \o13,\o13 // out13
vqneg.s32 \o1, \o1 // out1
vqneg.s32 \o3, \o3 // out3
vmul_vmls d24, d2, d21, d0[0], d0[0] // -> out8 (d24 or d23)
@ -1947,6 +2049,9 @@ function inv_dct32_odd_2s_x16_neon
vld1.32 {q0, q1}, [r12, :128]
vmov.i32 d11, #0x1ffff // row_clip_max = ~(~bdmax << 7), 0x1ffff
vmvn.i32 d10, #0x1ffff // row_clip_min = (~bdmax << 7), 0xfffe0000
vqsub.s32 d5, d16, d24 // t17
vqadd.s32 d16, d16, d24 // t16
vqsub.s32 d7, d31, d23 // t30
@ -1964,6 +2069,13 @@ function inv_dct32_odd_2s_x16_neon
vqadd.s32 d25, d19, d27 // t28
vqsub.s32 d19, d19, d27 // t29
.irp r, d5, d16, d7, d31, d24, d28, d23, d18, d20, d30, d26, d17, d22, d29, d25, d19
vmin.s32 \r, \r, d11
.endr
.irp r, d5, d16, d7, d31, d24, d28, d23, d18, d20, d30, d26, d17, d22, d29, d25, d19
vmax.s32 \r, \r, d10
.endr
vmul_vmls d4, d7, d5, d2[0], d2[1] // -> t17a
vmul_vmla d6, d7, d5, d2[1], d2[0] // -> t30a
vmul_vmla d8, d19, d24, d2[1], d2[0] // -> t18a
@ -2000,6 +2112,13 @@ function inv_dct32_odd_2s_x16_neon
vqsub.s32 d29, d31, d25 // t28a
vqadd.s32 d31, d31, d25 // t31a
.irp r, d2, d27, d3, d21, d24, d16, d19, d30, d28, d17, d23, d26, d22, d20, d29, d31
vmin.s32 \r, \r, d11
.endr
.irp r, d2, d27, d3, d21, d24, d16, d19, d30, d28, d17, d23, d26, d22, d20, d29, d31
vmax.s32 \r, \r, d10
.endr
vmul_vmls d4, d2, d3, d1[0], d1[1] // -> t18a
vmul_vmla d6, d2, d3, d1[1], d1[0] // -> t29a
vmul_vmls d8, d29, d24, d1[0], d1[1] // -> t19
@ -2037,6 +2156,13 @@ function inv_dct32_odd_2s_x16_neon
vqsub.s32 d24, d24, d19 // t27a
vmov d19, d4 // out19
.irp r, d2, d16, d3, d31, d23, d17, d30, d21, d27, d18, d19, d26, d29, d25, d28, d24
vmin.s32 \r, \r, d11
.endr
.irp r, d2, d16, d3, d31, d23, d17, d30, d21, d27, d18, d19, d26, d29, d25, d28, d24
vmax.s32 \r, \r, d10
.endr
vmul_vmls d4, d24, d26, d0[0], d0[0] // -> t20
vmul_vmla d6, d24, d26, d0[0], d0[0] // -> t27
vrshr.s32 d20, d4, #12 // t20
@ -2081,6 +2207,18 @@ function inv_txfm_horz\suffix\()_dct_32x2_neon
scale_input d0[0], q8, q9, q10, q11, q12, q13, q14, q15
.endif
bl inv_dct_2s_x16_neon
// idct_16 leaves the row_clip_max/min constants in d9 and d8,
// but here we want to use full q registers for clipping.
vmov.i32 q3, #0x1ffff // row_clip_max = ~(~bdmax << 7), 0x1ffff
vmvn.i32 q2, #0x1ffff // row_clip_min = (~bdmax << 7), 0xfffe0000
.irp r, q8, q9, q10, q11, q12, q13, q14, q15
vmin.s32 \r, \r, q3
.endr
.irp r, q8, q9, q10, q11, q12, q13, q14, q15
vmax.s32 \r, \r, q2
.endr
vtrn.32 d16, d17
vtrn.32 d18, d19
vtrn.32 d20, d21
@ -2745,14 +2883,21 @@ function inv_dct64_step1_neon
vqsub.s32 d30, d23, d22 // t62
vqadd.s32 d31, d23, d22 // t63
.irp r, q12, q13, q14, q15
vmin.s32 \r, \r, q5
.endr
.irp r, q12, q13, q14, q15
vmax.s32 \r, \r, q4
.endr
vmul_vmla d4, d29, d26, d0[0], d0[1] // -> t34a
vmul_vmls d6, d29, d26, d0[1], d0[0] // -> t61a
vneg.s32 d4, d4 // t34a
vmul_vmls d8, d30, d25, d0[1], d0[0] // -> t33a
vmul_vmls d7, d30, d25, d0[1], d0[0] // -> t33a
vrshr.s32 d26, d4, #12 // t34a
vmul_vmla d4, d30, d25, d0[0], d0[1] // -> t62a
vrshr.s32 d29, d6, #12 // t61a
vrshr.s32 d25, d8, #12 // t33a
vrshr.s32 d25, d7, #12 // t33a
vrshr.s32 d30, d4, #12 // t62a
vqadd.s32 d16, d24, d27 // t32a
@ -2764,13 +2909,20 @@ function inv_dct64_step1_neon
vqsub.s32 d21, d30, d29 // t61
vqadd.s32 d22, d30, d29 // t62
.irp r, q8, q9, q10, q11
vmin.s32 \r, \r, q5
.endr
.irp r, q8, q9, q10, q11
vmax.s32 \r, \r, q4
.endr
vmul_vmla d4, d21, d18, d1[0], d1[1] // -> t61a
vmul_vmls d6, d21, d18, d1[1], d1[0] // -> t34a
vmul_vmla d8, d20, d19, d1[0], d1[1] // -> t60
vmul_vmla d7, d20, d19, d1[0], d1[1] // -> t60
vrshr.s32 d21, d4, #12 // t61a
vrshr.s32 d18, d6, #12 // t34a
vmul_vmls d4, d20, d19, d1[1], d1[0] // -> t35
vrshr.s32 d20, d8, #12 // t60
vrshr.s32 d20, d7, #12 // t60
vrshr.s32 d19, d4, #12 // t35
vst1.32 {d16, d17, d18, d19}, [r6, :128]!
@ -2805,14 +2957,21 @@ function inv_dct64_step2_neon
vqadd.s32 d30, d23, d22 // t48
vqsub.s32 d31, d23, d22 // t55
.irp r, q12, q13, q14, q15
vmin.s32 \r, \r, q5
.endr
.irp r, q12, q13, q14, q15
vmax.s32 \r, \r, q4
.endr
vmul_vmla d4, d27, d25, d1[1], d1[0] // -> t56a
vmul_vmls d6, d27, d25, d1[0], d1[1] // -> t39a
vmul_vmla d8, d31, d28, d1[1], d1[0] // -> t40a
vmul_vmla d7, d31, d28, d1[1], d1[0] // -> t40a
vrshr.s32 d25, d4, #12 // t56a
vrshr.s32 d27, d6, #12 // t39a
vneg.s32 d8, d8 // t40a
vneg.s32 d7, d7 // t40a
vmul_vmls d4, d31, d28, d1[0], d1[1] // -> t55a
vrshr.s32 d31, d8, #12 // t40a
vrshr.s32 d31, d7, #12 // t40a
vrshr.s32 d28, d4, #12 // t55a
vqadd.s32 d16, d24, d29 // t32a
@ -2824,13 +2983,20 @@ function inv_dct64_step2_neon
vqsub.s32 d21, d25, d28 // t55
vqadd.s32 d22, d25, d28 // t56
.irp r, q8, q9, q10, q11
vmin.s32 \r, \r, q5
.endr
.irp r, q8, q9, q10, q11
vmax.s32 \r, \r, q4
.endr
vmul_vmls d4, d21, d18, d0[0], d0[0] // -> t40a
vmul_vmla d6, d21, d18, d0[0], d0[0] // -> t55a
vmul_vmls d8, d20, d19, d0[0], d0[0] // -> t47
vmul_vmls d7, d20, d19, d0[0], d0[0] // -> t47
vrshr.s32 d18, d4, #12 // t40a
vrshr.s32 d21, d6, #12 // t55a
vmul_vmla d4, d20, d19, d0[0], d0[0] // -> t48
vrshr.s32 d19, d8, #12 // t47
vrshr.s32 d19, d7, #12 // t47
vrshr.s32 d20, d4, #12 // t48
vstr d16, [r6, #4*2*0] // t32a
@ -2916,6 +3082,17 @@ function inv_txfm_dct\suffix\()_2s_x64_neon
bl inv_dct_2s_x16_neon
// idct_16 leaves the row_clip_max/min constants in d9 and d8,
// but here we want to use full q registers for clipping.
vmov.i32 q3, #0x1ffff // row_clip_max = ~(~bdmax << 7), 0x1ffff
vmvn.i32 q2, #0x1ffff // row_clip_min = (~bdmax << 7), 0xfffe0000
.irp r, q8, q9, q10, q11, q12, q13, q14, q15
vmin.s32 \r, \r, q3
.endr
.irp r, q8, q9, q10, q11, q12, q13, q14, q15
vmax.s32 \r, \r, q2
.endr
store16 r6
movdup_if d0, r12, 2896*8*(1<<16), \scale
@ -2934,6 +3111,8 @@ function inv_txfm_dct\suffix\()_2s_x64_neon
mov r9, #-8
vmov.i32 d1, #0x1ffff // row_clip_max = ~(~bdmax << 7), 0x1ffff
vmvn.i32 d0, #0x1ffff // row_clip_min = (~bdmax << 7), 0xfffe0000
.macro store_addsub r0, r1, r2, r3
vld1.32 {d2}, [r6, :64]!
vld1.32 {d3}, [r6, :64]!
@ -2942,16 +3121,32 @@ function inv_txfm_dct\suffix\()_2s_x64_neon
vld1.32 {d4}, [r6, :64]!
vqadd.s32 d7, d3, \r1
vqsub.s32 \r1, d3, \r1
vmin.s32 d6, d6, d1
vmin.s32 \r0, \r0, d1
vld1.32 {d5}, [r6, :64]!
vqadd.s32 d2, d4, \r2
sub r6, r6, #8*4
vmax.s32 d6, d6, d0
vmax.s32 \r0, \r0, d0
vqsub.s32 \r2, d4, \r2
vmin.s32 d7, d7, d1
vmin.s32 \r1, \r1, d1
vst1.32 {d6}, [r6, :64]!
vst1.32 {\r0}, [r10, :64], r9
vmin.s32 d2, d2, d1
vmin.s32 \r2, \r2, d1
vmax.s32 d7, d7, d0
vmax.s32 \r1, \r1, d0
vqadd.s32 d3, d5, \r3
vqsub.s32 \r3, d5, \r3
vmax.s32 d2, d2, d0
vmax.s32 \r2, \r2, d0
vmin.s32 d3, d3, d1
vmin.s32 \r3, \r3, d1
vst1.32 {d7}, [r6, :64]!
vst1.32 {\r1}, [r10, :64], r9
vmax.s32 d3, d3, d0
vmax.s32 \r3, \r3, d0
vst1.32 {d2}, [r6, :64]!
vst1.32 {\r2}, [r10, :64], r9
vst1.32 {d3}, [r6, :64]!
@ -2966,6 +3161,8 @@ function inv_txfm_dct\suffix\()_2s_x64_neon
add r6, r6, #2*4*16
movrel_local r12, idct64_coeffs
vmov.i32 q5, #0x1ffff // row_clip_max = ~(~bdmax << 7), 0x1ffff
vmvn.i32 q4, #0x1ffff // row_clip_min = (~bdmax << 7), 0xfffe0000
movdup_if d0, lr, 2896*8*(1<<16), \scale
vmov_if d7, #0, \clear
add r9, r7, r8, lsl #4 // offset 16


@ -483,10 +483,10 @@ endfunc
add \o1\().4s, v5.4s, v7.4s
sub \o3\().4s, \o3\().4s, v7.4s
rshrn \o0\().4h, \o0\().4s, #12
rshrn \o2\().4h, \o2\().4s, #12
rshrn \o1\().4h, \o1\().4s, #12
rshrn \o3\().4h, \o3\().4s, #12
sqrshrn \o0\().4h, \o0\().4s, #12
sqrshrn \o2\().4h, \o2\().4s, #12
sqrshrn \o1\().4h, \o1\().4s, #12
sqrshrn \o3\().4h, \o3\().4s, #12
.endm
function inv_adst_4h_x4_neon, export=1
@ -538,21 +538,21 @@ endfunc
sub v4.4s, v4.4s, v2.4s // out3
sub v5.4s, v5.4s, v3.4s
rshrn v18.4h, v18.4s, #12
rshrn2 v18.8h, v19.4s, #12
sqrshrn v18.4h, v18.4s, #12
sqrshrn2 v18.8h, v19.4s, #12
rshrn \o0\().4h, v16.4s, #12
rshrn2 \o0\().8h, v17.4s, #12
sqrshrn \o0\().4h, v16.4s, #12
sqrshrn2 \o0\().8h, v17.4s, #12
.ifc \o2, v17
mov v17.16b, v18.16b
.endif
rshrn \o1\().4h, v6.4s, #12
rshrn2 \o1\().8h, v7.4s, #12
sqrshrn \o1\().4h, v6.4s, #12
sqrshrn2 \o1\().8h, v7.4s, #12
rshrn \o3\().4h, v4.4s, #12
rshrn2 \o3\().8h, v5.4s, #12
sqrshrn \o3\().4h, v4.4s, #12
sqrshrn2 \o3\().8h, v5.4s, #12
.endm
function inv_adst_8h_x4_neon, export=1
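
Replacing rshrn with sqrshrn changes the final narrowing from plain truncation to signed saturation: out-of-range 32-bit results now pin to the 16-bit limits instead of wrapping. Per lane, the two behave roughly as below (C sketch for the #12 shift used here; overflow of the rounding add is ignored for brevity):

#include <stdint.h>

/* rshrn #12: rounding shift right, then truncating narrow to 16 bits. */
static int16_t rshrn12(int32_t v) {
    return (int16_t)((v + (1 << 11)) >> 12);
}

/* sqrshrn #12: same rounding shift, but saturate to [-32768, 32767]. */
static int16_t sqrshrn12(int32_t v) {
    int32_t r = (v + (1 << 11)) >> 12;
    if (r > INT16_MAX) r = INT16_MAX;
    if (r < INT16_MIN) r = INT16_MIN;
    return (int16_t)r;
}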


@ -124,6 +124,13 @@ endconst
.endif
.endm
.macro smin_4s r0, r1, r2
smin \r0\().4s, \r1\().4s, \r2\().4s
.endm
.macro smax_4s r0, r1, r2
smax \r0\().4s, \r1\().4s, \r2\().4s
.endm
.macro load_add_store load, shift, addsrc, adddst, min, store, dst, src, shiftbits=4
.ifnb \load
ld1 {\load}, [\src], x1
@ -599,12 +606,21 @@ def_fn_4x4 identity, flipadst
.macro idct_8 r0, r1, r2, r3, r4, r5, r6, r7
idct_4 \r0, \r2, \r4, \r6
movi v5.4s, #1, msl #16 // row_clip_max = ~(~bdmax << 7), 0x1ffff
mvni v4.4s, #1, msl #16 // row_clip_min = (~bdmax << 7), 0xfffe0000
.irp r, \r0, \r2, \r4, \r6
smin_4s \r, \r, v5
.endr
.irp r, \r0, \r2, \r4, \r6
smax_4s \r, \r, v4
.endr
mul_mls v2, \r1, \r7, v1.s[0], v1.s[1] // -> t4a
mul_mla v4, \r1, \r7, v1.s[1], v1.s[0] // -> t7a
mul_mla v3, \r1, \r7, v1.s[1], v1.s[0] // -> t7a
mul_mls v6, \r5, \r3, v1.s[2], v1.s[3] // -> t5a
mul_mla v7, \r5, \r3, v1.s[3], v1.s[2] // -> t6a
srshr \r1\().4s, v2.4s, #12 // t4a
srshr \r7\().4s, v4.4s, #12 // t7a
srshr \r7\().4s, v3.4s, #12 // t7a
srshr \r3\().4s, v6.4s, #12 // t5a
srshr \r5\().4s, v7.4s, #12 // t6a
@ -613,17 +629,24 @@ def_fn_4x4 identity, flipadst
sqadd v3.4s, \r7\().4s, \r5\().4s // t7
sqsub \r3\().4s, \r7\().4s, \r5\().4s // t6a
mul_mls v4, \r3, \r1, v0.s[0], v0.s[0] // -> t5
.irp r, v2, \r1, v3, \r3
smin_4s \r, \r, v5
.endr
.irp r, v2, \r1, v3, \r3
smax_4s \r, \r, v4
.endr
mul_mls v7, \r3, \r1, v0.s[0], v0.s[0] // -> t5
mul_mla v6, \r3, \r1, v0.s[0], v0.s[0] // -> t6
srshr v4.4s, v4.4s, #12 // t5
srshr v5.4s, v6.4s, #12 // t6
srshr v7.4s, v7.4s, #12 // t5
srshr v6.4s, v6.4s, #12 // t6
sqsub \r7\().4s, \r0\().4s, v3.4s // out7
sqadd \r0\().4s, \r0\().4s, v3.4s // out0
sqadd \r1\().4s, \r2\().4s, v5.4s // out1
sqsub v6.4s, \r2\().4s, v5.4s // out6
sqadd \r2\().4s, \r4\().4s, v4.4s // out2
sqsub \r5\().4s, \r4\().4s, v4.4s // out5
sqadd \r1\().4s, \r2\().4s, v6.4s // out1
sqsub v6.4s, \r2\().4s, v6.4s // out6
sqadd \r2\().4s, \r4\().4s, v7.4s // out2
sqsub \r5\().4s, \r4\().4s, v7.4s // out5
sqadd \r3\().4s, \r6\().4s, v2.4s // out3
sqsub \r4\().4s, \r6\().4s, v2.4s // out4
mov \r6\().16b, v6.16b // out6
@ -660,8 +683,11 @@ endfunc
ld1 {v0.4s}, [x16]
movi v1.4s, #1, msl #16 // row_clip_max = ~(~bdmax << 7), 0x1ffff
sqadd v2.4s, v16.4s, v20.4s // t0
sqsub v3.4s, v16.4s, v20.4s // t4
mvni v20.4s, #1, msl #16 // row_clip_min = (~bdmax << 7), 0xfffe0000
sqadd v4.4s, v23.4s, v19.4s // t1
sqsub v5.4s, v23.4s, v19.4s // t5
sqadd v6.4s, v18.4s, v22.4s // t2
@ -669,6 +695,13 @@ endfunc
sqadd v18.4s, v21.4s, v17.4s // t3
sqsub v19.4s, v21.4s, v17.4s // t7
.irp r, v2, v3, v4, v5, v6, v7, v18, v19
smin_4s \r, \r, v1
.endr
.irp r, v2, v3, v4, v5, v6, v7, v18, v19
smax_4s \r, \r, v20
.endr
mul_mla v16, v3, v5, v0.s[3], v0.s[2]
mul_mls v20, v3, v5, v0.s[2], v0.s[3]
mul_mls v22, v19, v7, v0.s[3], v0.s[2]
@ -685,12 +718,24 @@ endfunc
sqsub v2.4s, v2.4s, v6.4s // t2
sqadd \o7\().4s, v4.4s, v18.4s // out7
sqsub v4.4s, v4.4s, v18.4s // t3
sqneg \o7\().4s, \o7\().4s // out7
mvni v18.4s, #1, msl #16 // row_clip_min = (~bdmax << 7), 0xfffe0000
sqadd \o1\().4s, v3.4s, v7.4s // out1
sqsub v3.4s, v3.4s, v7.4s // t6
sqadd \o6\().4s, v5.4s, v19.4s // out6
sqsub v5.4s, v5.4s, v19.4s // t7
// Not clipping the output registers, as they will be downshifted and
// narrowed afterwards anyway.
.irp r, v2, v4, v3, v5
smin_4s \r, \r, v1
.endr
.irp r, v2, v4, v3, v5
smax_4s \r, \r, v18
.endr
sqneg \o7\().4s, \o7\().4s // out7
sqneg \o1\().4s, \o1\().4s // out1
mul_mla v18, v2, v4, v0.s[0], v0.s[0] // -> out3 (v19 or v20)
@ -959,25 +1004,33 @@ function inv_dct_4s_x16_neon
idct_8 v16, v18, v20, v22, v24, v26, v28, v30
// idct_8 leaves the row_clip_max/min constants in v5 and v4
.irp r, v16, v18, v20, v22, v24, v26, v28, v30
smin \r\().4s, \r\().4s, v5.4s
.endr
.irp r, v16, v18, v20, v22, v24, v26, v28, v30
smax \r\().4s, \r\().4s, v4.4s
.endr
ld1 {v0.4s, v1.4s}, [x16]
sub x16, x16, #32
mul_mls v2, v17, v31, v0.s[0], v0.s[1] // -> t8a
mul_mla v4, v17, v31, v0.s[1], v0.s[0] // -> t15a
mul_mla v3, v17, v31, v0.s[1], v0.s[0] // -> t15a
mul_mls v6, v25, v23, v0.s[2], v0.s[3] // -> t9a
srshr v17.4s, v2.4s, #12 // t8a
srshr v31.4s, v4.4s, #12 // t15a
srshr v31.4s, v3.4s, #12 // t15a
mul_mla v2, v25, v23, v0.s[3], v0.s[2] // -> t14a
mul_mls v4, v21, v27, v1.s[0], v1.s[1] // -> t10a
mul_mls v3, v21, v27, v1.s[0], v1.s[1] // -> t10a
srshr v23.4s, v6.4s, #12 // t9a
srshr v25.4s, v2.4s, #12 // t14a
mul_mla v6, v21, v27, v1.s[1], v1.s[0] // -> t13a
mul_mls v2, v29, v19, v1.s[2], v1.s[3] // -> t11a
srshr v21.4s, v4.4s, #12 // t10a
srshr v21.4s, v3.4s, #12 // t10a
srshr v27.4s, v6.4s, #12 // t13a
mul_mla v4, v29, v19, v1.s[3], v1.s[2] // -> t12a
mul_mla v3, v29, v19, v1.s[3], v1.s[2] // -> t12a
srshr v19.4s, v2.4s, #12 // t11a
srshr v29.4s, v4.4s, #12 // t12a
srshr v29.4s, v3.4s, #12 // t12a
ld1 {v0.4s}, [x16]
@ -990,14 +1043,21 @@ function inv_dct_4s_x16_neon
sqadd v25.4s, v29.4s, v27.4s // t12
sqsub v29.4s, v29.4s, v27.4s // t13
mul_mls v4, v3, v2, v0.s[2], v0.s[3] // -> t9a
.irp r, v2, v17, v3, v31, v23, v19, v25, v29
smin \r\().4s, \r\().4s, v5.4s
.endr
.irp r, v2, v17, v3, v31, v23, v19, v25, v29
smax \r\().4s, \r\().4s, v4.4s
.endr
mul_mls v7, v3, v2, v0.s[2], v0.s[3] // -> t9a
mul_mla v6, v3, v2, v0.s[3], v0.s[2] // -> t14a
srshr v21.4s, v4.4s, #12 // t9a
srshr v21.4s, v7.4s, #12 // t9a
srshr v27.4s, v6.4s, #12 // t14a
mul_mls v4, v29, v23, v0.s[2], v0.s[3] // -> t13a
mul_mls v7, v29, v23, v0.s[2], v0.s[3] // -> t13a
mul_mla v6, v29, v23, v0.s[3], v0.s[2] // -> t10a
srshr v29.4s, v4.4s, #12 // t13a
srshr v29.4s, v7.4s, #12 // t13a
neg v6.4s, v6.4s
srshr v23.4s, v6.4s, #12 // t10a
@ -1010,34 +1070,41 @@ function inv_dct_4s_x16_neon
sqsub v25.4s, v27.4s, v29.4s // t13
sqadd v27.4s, v27.4s, v29.4s // t14
mul_mls v4, v3, v2, v0.s[0], v0.s[0] // -> t11
.irp r, v2, v17, v3, v31, v19, v21, v25, v27
smin \r\().4s, \r\().4s, v5.4s
.endr
.irp r, v2, v17, v3, v31, v19, v21, v25, v27
smax \r\().4s, \r\().4s, v4.4s
.endr
mul_mls v7, v3, v2, v0.s[0], v0.s[0] // -> t11
mul_mla v6, v3, v2, v0.s[0], v0.s[0] // -> t12
mul_mls v2, v25, v21, v0.s[0], v0.s[0] // -> t10a
srshr v4.4s, v4.4s, #12 // t11
srshr v5.4s, v6.4s, #12 // t12
mul_mla v6, v25, v21, v0.s[0], v0.s[0] // -> t13a
srshr v7.4s, v7.4s, #12 // t11
srshr v6.4s, v6.4s, #12 // t12
mul_mla v3, v25, v21, v0.s[0], v0.s[0] // -> t13a
srshr v2.4s, v2.4s, #12 // t10a
srshr v3.4s, v6.4s, #12 // t13a
srshr v3.4s, v3.4s, #12 // t13a
sqadd v6.4s, v16.4s, v31.4s // out0
sqadd v1.4s, v16.4s, v31.4s // out0
sqsub v31.4s, v16.4s, v31.4s // out15
mov v16.16b, v6.16b
mov v16.16b, v1.16b
sqadd v23.4s, v30.4s, v17.4s // out7
sqsub v7.4s, v30.4s, v17.4s // out8
sqsub v1.4s, v30.4s, v17.4s // out8
sqadd v17.4s, v18.4s, v27.4s // out1
sqsub v30.4s, v18.4s, v27.4s // out14
sqadd v18.4s, v20.4s, v3.4s // out2
sqsub v29.4s, v20.4s, v3.4s // out13
sqadd v3.4s, v28.4s, v19.4s // out6
sqsub v25.4s, v28.4s, v19.4s // out9
sqadd v19.4s, v22.4s, v5.4s // out3
sqsub v28.4s, v22.4s, v5.4s // out12
sqadd v20.4s, v24.4s, v4.4s // out4
sqsub v27.4s, v24.4s, v4.4s // out11
sqadd v19.4s, v22.4s, v6.4s // out3
sqsub v28.4s, v22.4s, v6.4s // out12
sqadd v20.4s, v24.4s, v7.4s // out4
sqsub v27.4s, v24.4s, v7.4s // out11
sqadd v21.4s, v26.4s, v2.4s // out5
sqsub v26.4s, v26.4s, v2.4s // out10
mov v24.16b, v7.16b
mov v24.16b, v1.16b
mov v22.16b, v3.16b
ret
@ -1084,6 +1151,9 @@ endfunc
ld1 {v0.4s, v1.4s}, [x16]
movi v5.4s, #1, msl #16 // row_clip_max = ~(~bdmax << 7), 0x1ffff
mvni v7.4s, #1, msl #16 // row_clip_min = (~bdmax << 7), 0xfffe0000
sqsub v2.4s, v16.4s, v23.4s // t8a
sqadd v16.4s, v16.4s, v23.4s // t0a
sqsub v3.4s, v31.4s, v24.4s // t9a
@ -1101,6 +1171,13 @@ endfunc
sqadd v28.4s, v25.4s, v30.4s // t7a
sqsub v25.4s, v25.4s, v30.4s // t15a
.irp r, v2, v16, v3, v31, v23, v18, v24, v29, v21, v20, v26, v27, v19, v22, v28, v25
smin_4s \r, \r, v5
.endr
.irp r, v2, v16, v3, v31, v23, v18, v24, v29, v21, v20, v26, v27, v19, v22, v28, v25
smax_4s \r, \r, v7
.endr
mul_mla v4, v2, v3, v1.s[1], v1.s[0] // -> t8
mul_mls v6, v2, v3, v1.s[0], v1.s[1] // -> t9
mul_mla v2, v18, v29, v1.s[3], v1.s[2] // -> t10
@ -1135,6 +1212,13 @@ endfunc
sqadd v20.4s, v29.4s, v22.4s // t11a
sqsub v29.4s, v29.4s, v22.4s // t15a
.irp r, v2, v16, v3, v31, v21, v23, v26, v24, v19, v17, v28, v30, v27, v18, v20, v29
smin_4s \r, \r, v5
.endr
.irp r, v2, v16, v3, v31, v21, v23, v26, v24, v19, v17, v28, v30, v27, v18, v20, v29
smax_4s \r, \r, v7
.endr
mul_mla v4, v2, v3, v0.s[3], v0.s[2] // -> t4a
mul_mls v6, v2, v3, v0.s[2], v0.s[3] // -> t5a
mul_mls v2, v24, v23, v0.s[3], v0.s[2] // -> t6a
@ -1163,24 +1247,34 @@ endfunc
sqadd \o15\().4s, v31.4s, v26.4s // out15
mov \o0\().16b, v4.16b
.endif
sqneg \o15\().4s, \o15\().4s // out15
sqsub v3.4s, v29.4s, v18.4s // t15a
sqadd \o13\().4s, v29.4s, v18.4s // out13
sqadd \o2\().4s, v17.4s, v30.4s // out2
sqsub v26.4s, v17.4s, v30.4s // t14a
sqneg \o13\().4s, \o13\().4s // out13
sqadd \o1\().4s, v19.4s, v27.4s // out1
sqsub v27.4s, v19.4s, v27.4s // t10
sqadd \o14\().4s, v28.4s, v20.4s // out14
sqsub v20.4s, v28.4s, v20.4s // t11
sqneg \o1\().4s, \o1\().4s // out1
sqadd \o3\().4s, v22.4s, v24.4s // out3
sqsub v22.4s, v22.4s, v24.4s // t6
sqadd \o12\().4s, v25.4s, v23.4s // out12
sqsub v23.4s, v25.4s, v23.4s // t7
// Not clipping the output registers, as they will be downshifted and
// narrowed afterwards anyway.
.irp r, v2, v21, v3, v26, v27, v20, v22, v23
smin_4s \r, \r, v5
.endr
.irp r, v2, v21, v3, v26, v27, v20, v22, v23
smax_4s \r, \r, v7
.endr
sqneg \o15\().4s, \o15\().4s // out15
sqneg \o13\().4s, \o13\().4s // out13
sqneg \o1\().4s, \o1\().4s // out1
sqneg \o3\().4s, \o3\().4s // out3
mul_mls v24, v2, v21, v0.s[0], v0.s[0] // -> out8 (v24 or v23)
@ -1956,6 +2050,9 @@ function inv_dct32_odd_4s_x16_neon
ld1 {v0.4s, v1.4s}, [x16]
movi v5.4s, #1, msl #16 // row_clip_max = ~(~bdmax << 7), 0x1ffff
mvni v4.4s, #1, msl #16 // row_clip_min = (~bdmax << 7), 0xfffe0000
sqsub v2.4s, v16.4s, v24.4s // t17
sqadd v16.4s, v16.4s, v24.4s // t16
sqsub v3.4s, v31.4s, v23.4s // t30
@ -1973,23 +2070,30 @@ function inv_dct32_odd_4s_x16_neon
sqadd v25.4s, v19.4s, v27.4s // t28
sqsub v19.4s, v19.4s, v27.4s // t29
mul_mls v4, v3, v2, v1.s[0], v1.s[1] // -> t17a
.irp r, v2, v16, v3, v31, v24, v28, v23, v18, v20, v30, v26, v17, v22, v29, v25, v19
smin \r\().4s, \r\().4s, v5.4s
.endr
.irp r, v2, v16, v3, v31, v24, v28, v23, v18, v20, v30, v26, v17, v22, v29, v25, v19
smax \r\().4s, \r\().4s, v4.4s
.endr
mul_mls v7, v3, v2, v1.s[0], v1.s[1] // -> t17a
mul_mla v6, v3, v2, v1.s[1], v1.s[0] // -> t30a
mul_mla v2, v19, v24, v1.s[1], v1.s[0] // -> t18a
srshr v21.4s, v4.4s, #12 // t17a
srshr v21.4s, v7.4s, #12 // t17a
srshr v27.4s, v6.4s, #12 // t30a
neg v2.4s, v2.4s // -> t18a
mul_mls v4, v19, v24, v1.s[0], v1.s[1] // -> t29a
mul_mls v7, v19, v24, v1.s[0], v1.s[1] // -> t29a
mul_mls v6, v22, v18, v1.s[2], v1.s[3] // -> t21a
srshr v19.4s, v2.4s, #12 // t18a
srshr v24.4s, v4.4s, #12 // t29a
srshr v24.4s, v7.4s, #12 // t29a
mul_mla v2, v22, v18, v1.s[3], v1.s[2] // -> t26a
mul_mla v4, v17, v20, v1.s[3], v1.s[2] // -> t22a
mul_mla v7, v17, v20, v1.s[3], v1.s[2] // -> t22a
srshr v22.4s, v6.4s, #12 // t21a
srshr v18.4s, v2.4s, #12 // t26a
neg v4.4s, v4.4s // -> t22a
neg v7.4s, v7.4s // -> t22a
mul_mls v6, v17, v20, v1.s[2], v1.s[3] // -> t25a
srshr v17.4s, v4.4s, #12 // t22a
srshr v17.4s, v7.4s, #12 // t22a
srshr v20.4s, v6.4s, #12 // t25a
sqsub v2.4s, v27.4s, v24.4s // t29
@ -2009,23 +2113,30 @@ function inv_dct32_odd_4s_x16_neon
sqsub v29.4s, v31.4s, v25.4s // t28a
sqadd v31.4s, v31.4s, v25.4s // t31a
mul_mls v4, v2, v3, v0.s[2], v0.s[3] // -> t18a
.irp r, v2, v27, v3, v21, v24, v16, v19, v30, v28, v17, v23, v26, v22, v20, v29, v31
smin \r\().4s, \r\().4s, v5.4s
.endr
.irp r, v2, v27, v3, v21, v24, v16, v19, v30, v28, v17, v23, v26, v22, v20, v29, v31
smax \r\().4s, \r\().4s, v4.4s
.endr
mul_mls v7, v2, v3, v0.s[2], v0.s[3] // -> t18a
mul_mla v6, v2, v3, v0.s[3], v0.s[2] // -> t29a
mul_mls v2, v29, v24, v0.s[2], v0.s[3] // -> t19
srshr v18.4s, v4.4s, #12 // t18a
srshr v18.4s, v7.4s, #12 // t18a
srshr v25.4s, v6.4s, #12 // t29a
mul_mla v4, v29, v24, v0.s[3], v0.s[2] // -> t28
mul_mla v7, v29, v24, v0.s[3], v0.s[2] // -> t28
mul_mla v6, v26, v19, v0.s[3], v0.s[2] // -> t20
srshr v29.4s, v2.4s, #12 // t19
srshr v24.4s, v4.4s, #12 // t28
srshr v24.4s, v7.4s, #12 // t28
neg v6.4s, v6.4s // -> t20
mul_mls v2, v26, v19, v0.s[2], v0.s[3] // -> t27
mul_mla v4, v20, v28, v0.s[3], v0.s[2] // -> t21a
mul_mla v7, v20, v28, v0.s[3], v0.s[2] // -> t21a
srshr v26.4s, v6.4s, #12 // t20
srshr v19.4s, v2.4s, #12 // t27
neg v4.4s, v4.4s // -> t21a
neg v7.4s, v7.4s // -> t21a
mul_mls v6, v20, v28, v0.s[2], v0.s[3] // -> t26a
srshr v20.4s, v4.4s, #12 // t21a
srshr v20.4s, v7.4s, #12 // t21a
srshr v28.4s, v6.4s, #12 // t26a
sqsub v2.4s, v16.4s, v30.4s // t23
@ -2038,33 +2149,40 @@ function inv_dct32_odd_4s_x16_neon
sqsub v21.4s, v27.4s, v22.4s // t25a
sqsub v27.4s, v18.4s, v20.4s // t21
sqadd v18.4s, v18.4s, v20.4s // t18 = out18
sqadd v4.4s, v29.4s, v26.4s // t19a = out19
sqadd v7.4s, v29.4s, v26.4s // t19a = out19
sqsub v26.4s, v29.4s, v26.4s // t20a
sqadd v29.4s, v25.4s, v28.4s // t29 = out29
sqsub v25.4s, v25.4s, v28.4s // t26
sqadd v28.4s, v24.4s, v19.4s // t28a = out28
sqsub v24.4s, v24.4s, v19.4s // t27a
mov v19.16b, v4.16b // out19
mov v19.16b, v7.16b // out19
mul_mls v4, v24, v26, v0.s[0], v0.s[0] // -> t20
.irp r, v2, v16, v3, v31, v23, v17, v30, v21, v27, v18, v19, v26, v29, v25, v28, v24
smin \r\().4s, \r\().4s, v5.4s
.endr
.irp r, v2, v16, v3, v31, v23, v17, v30, v21, v27, v18, v19, v26, v29, v25, v28, v24
smax \r\().4s, \r\().4s, v4.4s
.endr
mul_mls v7, v24, v26, v0.s[0], v0.s[0] // -> t20
mul_mla v6, v24, v26, v0.s[0], v0.s[0] // -> t27
srshr v20.4s, v4.4s, #12 // t20
srshr v20.4s, v7.4s, #12 // t20
srshr v22.4s, v6.4s, #12 // t27
mul_mla v4, v25, v27, v0.s[0], v0.s[0] // -> t26a
mul_mla v7, v25, v27, v0.s[0], v0.s[0] // -> t26a
mul_mls v6, v25, v27, v0.s[0], v0.s[0] // -> t21a
mov v27.16b, v22.16b // t27
srshr v26.4s, v4.4s, #12 // t26a
srshr v26.4s, v7.4s, #12 // t26a
mul_mls v24, v21, v23, v0.s[0], v0.s[0] // -> t22
mul_mla v4, v21, v23, v0.s[0], v0.s[0] // -> t25
mul_mla v7, v21, v23, v0.s[0], v0.s[0] // -> t25
srshr v21.4s, v6.4s, #12 // t21a
srshr v22.4s, v24.4s, #12 // t22
srshr v25.4s, v4.4s, #12 // t25
srshr v25.4s, v7.4s, #12 // t25
mul_mls v4, v3, v2, v0.s[0], v0.s[0] // -> t23a
mul_mls v7, v3, v2, v0.s[0], v0.s[0] // -> t23a
mul_mla v6, v3, v2, v0.s[0], v0.s[0] // -> t24a
srshr v23.4s, v4.4s, #12 // t23a
srshr v23.4s, v7.4s, #12 // t23a
srshr v24.4s, v6.4s, #12 // t24a
ret
@ -2091,6 +2209,15 @@ function inv_txfm_horz\suffix\()_dct_32x4_neon
scale_input .4s, v0.s[0], v24, v25, v26, v27, v28, v29, v30, v31
.endif
bl inv_dct_4s_x16_neon
// idct_16 leaves the row_clip_max/min constants in v5 and v4
.irp r, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31
smin_4s \r, \r, v5
.endr
.irp r, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31
smax_4s \r, \r, v4
.endr
transpose_4x4s v16, v17, v18, v19, v2, v3, v4, v5
transpose_4x4s v20, v21, v22, v23, v2, v3, v4, v5
transpose_4x4s v24, v25, v26, v27, v2, v3, v4, v5
@ -2786,13 +2913,20 @@ function inv_dct64_step1_neon
sqsub v30.4s, v23.4s, v22.4s // t62
sqadd v31.4s, v23.4s, v22.4s // t63
.irp r, v24, v25, v26, v27, v28, v29, v30, v31
smin_4s \r, \r, v5
.endr
.irp r, v24, v25, v26, v27, v28, v29, v30, v31
smax_4s \r, \r, v4
.endr
mul_mla v2, v29, v26, v0.s[0], v0.s[1] // -> t34a
mul_mls v4, v29, v26, v0.s[1], v0.s[0] // -> t61a
mul_mls v7, v29, v26, v0.s[1], v0.s[0] // -> t61a
neg v2.4s, v2.4s // t34a
mul_mls v6, v30, v25, v0.s[1], v0.s[0] // -> t33a
srshr v26.4s, v2.4s, #12 // t34a
mul_mla v2, v30, v25, v0.s[0], v0.s[1] // -> t62a
srshr v29.4s, v4.4s, #12 // t61a
srshr v29.4s, v7.4s, #12 // t61a
srshr v25.4s, v6.4s, #12 // t33a
srshr v30.4s, v2.4s, #12 // t62a
@ -2805,11 +2939,18 @@ function inv_dct64_step1_neon
sqsub v21.4s, v30.4s, v29.4s // t61
sqadd v22.4s, v30.4s, v29.4s // t62
.irp r, v16, v19, v17, v18, v20, v23, v21, v22
smin_4s \r, \r, v5
.endr
.irp r, v16, v19, v17, v18, v20, v23, v21, v22
smax_4s \r, \r, v4
.endr
mul_mla v2, v21, v18, v0.s[2], v0.s[3] // -> t61a
mul_mls v4, v21, v18, v0.s[3], v0.s[2] // -> t34a
mul_mls v7, v21, v18, v0.s[3], v0.s[2] // -> t34a
mul_mla v6, v20, v19, v0.s[2], v0.s[3] // -> t60
srshr v21.4s, v2.4s, #12 // t61a
srshr v18.4s, v4.4s, #12 // t34a
srshr v18.4s, v7.4s, #12 // t34a
mul_mls v2, v20, v19, v0.s[3], v0.s[2] // -> t35
srshr v20.4s, v6.4s, #12 // t60
srshr v19.4s, v2.4s, #12 // t35
@ -2846,11 +2987,18 @@ function inv_dct64_step2_neon
sqadd v30.4s, v23.4s, v22.4s // t48
sqsub v31.4s, v23.4s, v22.4s // t55
.irp r, v24, v25, v26, v27, v28, v29, v30, v31
smin_4s \r, \r, v5
.endr
.irp r, v24, v25, v26, v27, v28, v29, v30, v31
smax_4s \r, \r, v4
.endr
mul_mla v2, v27, v25, v0.s[3], v0.s[2] // -> t56a
mul_mls v4, v27, v25, v0.s[2], v0.s[3] // -> t39a
mul_mls v7, v27, v25, v0.s[2], v0.s[3] // -> t39a
mul_mla v6, v31, v28, v0.s[3], v0.s[2] // -> t40a
srshr v25.4s, v2.4s, #12 // t56a
srshr v27.4s, v4.4s, #12 // t39a
srshr v27.4s, v7.4s, #12 // t39a
neg v6.4s, v6.4s // t40a
mul_mls v2, v31, v28, v0.s[2], v0.s[3] // -> t55a
srshr v31.4s, v6.4s, #12 // t40a
@ -2865,11 +3013,18 @@ function inv_dct64_step2_neon
sqsub v21.4s, v25.4s, v28.4s // t55
sqadd v22.4s, v25.4s, v28.4s // t56
.irp r, v16, v19, v17, v18, v20, v23, v21, v22
smin_4s \r, \r, v5
.endr
.irp r, v16, v19, v17, v18, v20, v23, v21, v22
smax_4s \r, \r, v4
.endr
mul_mls v2, v21, v18, v0.s[0], v0.s[0] // -> t40a
mul_mla v4, v21, v18, v0.s[0], v0.s[0] // -> t55a
mul_mla v7, v21, v18, v0.s[0], v0.s[0] // -> t55a
mul_mls v6, v20, v19, v0.s[0], v0.s[0] // -> t47
srshr v18.4s, v2.4s, #12 // t40a
srshr v21.4s, v4.4s, #12 // t55a
srshr v21.4s, v7.4s, #12 // t55a
mul_mla v2, v20, v19, v0.s[0], v0.s[0] // -> t48
srshr v19.4s, v6.4s, #12 // t47
srshr v20.4s, v2.4s, #12 // t48
@ -2966,6 +3121,14 @@ function inv_txfm_dct\suffix\()_4s_x64_neon
bl inv_dct_4s_x16_neon
// idct_16 leaves the row_clip_max/min constants in v5 and v4
.irp r, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31
smin_4s \r, \r, v5
.endr
.irp r, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31
smax_4s \r, \r, v4
.endr
store16 x6
movz16dup_if v0.2s, w16, #2896*8, \scale
@ -2984,6 +3147,9 @@ function inv_txfm_dct\suffix\()_4s_x64_neon
mov x9, #-16
movi v1.4s, #1, msl #16 // row_clip_max = ~(~bdmax << 7), 0x1ffff
mvni v0.4s, #1, msl #16 // row_clip_min = (~bdmax << 7), 0xfffe0000
.macro store_addsub r0, r1, r2, r3
ld1 {v2.4s}, [x6], #16
ld1 {v3.4s}, [x6], #16
@ -2992,16 +3158,32 @@ function inv_txfm_dct\suffix\()_4s_x64_neon
ld1 {v4.4s}, [x6], #16
sqadd v7.4s, v3.4s, \r1
sqsub \r1, v3.4s, \r1
smin v6.4s, v6.4s, v1.4s
smin \r0, \r0, v1.4s
ld1 {v5.4s}, [x6], #16
sqadd v2.4s, v4.4s, \r2
sub x6, x6, #16*4
smax v6.4s, v6.4s, v0.4s
smax \r0, \r0, v0.4s
sqsub \r2, v4.4s, \r2
smin v7.4s, v7.4s, v1.4s
smin \r1, \r1, v1.4s
st1 {v6.4s}, [x6], #16
st1 {\r0}, [x10], x9
smin v2.4s, v2.4s, v1.4s
smin \r2, \r2, v1.4s
smax v7.4s, v7.4s, v0.4s
smax \r1, \r1, v0.4s
sqadd v3.4s, v5.4s, \r3
sqsub \r3, v5.4s, \r3
smax v2.4s, v2.4s, v0.4s
smax \r2, \r2, v0.4s
smin v3.4s, v3.4s, v1.4s
smin \r3, \r3, v1.4s
st1 {v7.4s}, [x6], #16
st1 {\r1}, [x10], x9
smax v3.4s, v3.4s, v0.4s
smax \r3, \r3, v0.4s
st1 {v2.4s}, [x6], #16
st1 {\r2}, [x10], x9
st1 {v3.4s}, [x6], #16
@ -3016,6 +3198,8 @@ function inv_txfm_dct\suffix\()_4s_x64_neon
add x6, x6, #4*4*16
movrel x17, idct64_coeffs
movi v5.4s, #1, msl #16 // row_clip_max = ~(~bdmax << 7), 0x1ffff
mvni v4.4s, #1, msl #16 // row_clip_min = (~bdmax << 7), 0xfffe0000
movz16dup_if v0.2s, w16, #2896*8, \scale
movi_if v7.4s, #0, \clear
add x9, x7, x8, lsl #4 // offset 16


@ -137,7 +137,7 @@ void BF(dav1d_fguv_32x32_##nm, neon)(pixel *const dst, \
static void \
fguv_32x32xn_##nm##_neon(pixel *const dst_row, const pixel *const src_row, \
const ptrdiff_t stride, const Dav1dFilmGrainData *const data, \
const int pw, const uint8_t scaling[SCALING_SIZE], \
const size_t pw, const uint8_t scaling[SCALING_SIZE], \
const entry grain_lut[][GRAIN_WIDTH], const int bh, \
const int row_num, const pixel *const luma_row, \
const ptrdiff_t luma_stride, const int uv, const int is_id \
@ -156,7 +156,7 @@ fguv_32x32xn_##nm##_neon(pixel *const dst_row, const pixel *const src_row, \
int offsets[2 /* col offset */][2 /* row offset */]; \
\
/* process this row in BLOCK_SIZE^2 blocks (subsampled) */ \
for (int bx = 0; bx < pw; bx += BLOCK_SIZE >> sx) { \
for (unsigned bx = 0; bx < pw; bx += BLOCK_SIZE >> sx) { \
if (data->overlap_flag && bx) { \
/* shift previous offsets left */ \
for (int i = 0; i < rows; i++) \


@ -43,8 +43,8 @@
#include "src/x86/cpu.h"
#endif
extern unsigned dav1d_cpu_flags;
extern unsigned dav1d_cpu_flags_mask;
EXTERN unsigned dav1d_cpu_flags;
EXTERN unsigned dav1d_cpu_flags_mask;
void dav1d_init_cpu(void);
DAV1D_API void dav1d_set_cpu_flags_mask(unsigned mask);
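
These declarations move to the EXTERN macro introduced in attributes.h above, so shared globals get hidden visibility where the toolchain supports it (the same change hits dav1d_dq_tbl and dav1d_qm_tbl below). The idiom, sketched with a hypothetical symbol:

/* In a header: one declaration for every includer; EXTERN expands to
 * `extern __attribute__((visibility("hidden")))` where available. */
EXTERN unsigned my_module_state;   /* hypothetical symbol */

/* In exactly one .c file: the actual definition. */
unsigned my_module_state;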


@ -2087,11 +2087,14 @@ static int decode_b(Dav1dTaskContext *const t,
const uint8_t (*const lf_lvls)[8][2] = (const uint8_t (*)[8][2])
&ts->lflvl[b->seg_id][0][b->ref[0] + 1][!is_globalmv];
const uint16_t tx_split[2] = { b->tx_split0, b->tx_split1 };
enum RectTxfmSize ytx = b->max_ytx, uvtx = b->uvtx;
if (f->frame_hdr->segmentation.lossless[b->seg_id]) {
ytx = (enum RectTxfmSize) TX_4X4;
uvtx = (enum RectTxfmSize) TX_4X4;
}
dav1d_create_lf_mask_inter(t->lf_mask, f->lf.level, f->b4_stride, lf_lvls,
t->bx, t->by, f->w4, f->h4, b->skip, bs,
f->frame_hdr->segmentation.lossless[b->seg_id] ?
(enum RectTxfmSize) TX_4X4 : b->max_ytx,
tx_split, b->uvtx, f->cur.p.layout,
ytx, tx_split, uvtx, f->cur.p.layout,
&t->a->tx_lpf_y[bx4], &t->l.tx_lpf_y[by4],
has_chroma ? &t->a->tx_lpf_uv[cbx4] : NULL,
has_chroma ? &t->l.tx_lpf_uv[cby4] : NULL);
@ -3456,11 +3459,12 @@ int dav1d_decode_frame(Dav1dFrameContext *const f) {
// wait until all threads have completed
if (!res) {
if (f->c->n_tc > 1) {
pthread_mutex_lock(&f->task_thread.ttd->lock);
res = dav1d_task_create_tile_sbrow(f, 0, 1);
pthread_mutex_lock(&f->task_thread.ttd->lock);
pthread_cond_signal(&f->task_thread.ttd->cond);
if (!res) {
while (!f->task_thread.done[0] ||
f->task_thread.task_counter > 0)
atomic_load(&f->task_thread.task_counter) > 0)
{
pthread_cond_wait(&f->task_thread.cond,
&f->task_thread.ttd->lock);
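
Two fixes land together in this hunk: ttd->lock is now taken before the tile-sbrow tasks are created (so a worker cannot finish and signal before this thread is ready to wait), and task_counter is read through atomic_load. A sketch of the corrected shape, with stand-in names rather than the real call:

#include <pthread.h>
#include <stdatomic.h>

/* Sketch: publish work, wake workers, then wait, all under the same
 * lock, so no wakeup can be lost in between. */
static void run_tiles(pthread_mutex_t *lock, pthread_cond_t *workers,
                      pthread_cond_t *frame_done, atomic_int *task_counter,
                      const int *done)
{
    pthread_mutex_lock(lock);
    int res = 0;                       /* = create_tile_tasks(), elided */
    pthread_cond_signal(workers);
    if (!res)
        while (!*done || atomic_load(task_counter) > 0)
            pthread_cond_wait(frame_done, lock);
    pthread_mutex_unlock(lock);
}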
@ -3483,7 +3487,7 @@ int dav1d_decode_frame(Dav1dFrameContext *const f) {
static int get_upscale_x0(const int in_w, const int out_w, const int step) {
const int err = out_w * step - (in_w << 14);
const int x0 = (-((out_w - in_w) << 13) + (out_w >> 1)) / out_w + 128 - (err >> 1);
const int x0 = (-((out_w - in_w) << 13) + (out_w >> 1)) / out_w + 128 - (err / 2);
return x0 & 0x3fff;
}
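
The switch from err >> 1 to err / 2 matters because err can be negative here: an arithmetic right shift rounds toward negative infinity, while C integer division truncates toward zero, so odd negative values differ by one. For example:

int err = -3;
int by_shift = err >> 1;  /* -2: shift rounds toward -infinity */
int by_div   = err / 2;   /* -1: division truncates toward zero */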
@ -3505,10 +3509,13 @@ int dav1d_submit_frame(Dav1dContext *const c) {
&c->task_thread.lock);
out_delayed = &c->frame_thread.out_delayed[next];
if (out_delayed->p.data[0] || atomic_load(&f->task_thread.error)) {
if (atomic_load(&c->task_thread.first) + 1U < c->n_fc)
unsigned first = atomic_load(&c->task_thread.first);
if (first + 1U < c->n_fc)
atomic_fetch_add(&c->task_thread.first, 1U);
else
atomic_store(&c->task_thread.first, 0);
atomic_compare_exchange_strong(&c->task_thread.reset_task_cur,
&first, UINT_MAX);
if (c->task_thread.cur && c->task_thread.cur < c->n_fc)
c->task_thread.cur--;
}
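
This block (repeated in lib.c and obu.c below) is where the new atomic_compare_exchange_strong shim is used: the previously observed value of task_thread.first both advances the ring index and conditionally invalidates reset_task_cur, so the reset happens only if no other thread touched it in between. The idiom, condensed into a sketch:

#include <limits.h>
#include <stdatomic.h>

/* Sketch: retire ring slot `first`; invalidate reset_task_cur only if
 * it still names that slot (a concurrent writer wins otherwise). */
static void retire_slot(atomic_uint *first_p, atomic_uint *reset_task_cur,
                        unsigned n_fc)
{
    unsigned first = atomic_load(first_p);
    if (first + 1U < n_fc)
        atomic_fetch_add(first_p, 1U);
    else
        atomic_store(first_p, 0);
    atomic_compare_exchange_strong(reset_task_cur, &first, UINT_MAX);
}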
@ -3720,7 +3727,8 @@ int dav1d_submit_frame(Dav1dContext *const c) {
const int uses_2pass = c->n_fc > 1;
const int cols = f->frame_hdr->tiling.cols;
const int rows = f->frame_hdr->tiling.rows;
f->task_thread.task_counter = (cols * rows + f->sbh) << uses_2pass;
atomic_store(&f->task_thread.task_counter,
(cols * rows + f->sbh) << uses_2pass);
// ref_mvs
if (IS_INTER_OR_SWITCH(f->frame_hdr) || f->frame_hdr->allow_intrabc) {
@ -3740,9 +3748,10 @@ int dav1d_submit_frame(Dav1dContext *const c) {
if (f->frame_hdr->use_ref_frame_mvs) {
for (int i = 0; i < 7; i++) {
const int refidx = f->frame_hdr->refidx[i];
const int ref_w = ((ref_coded_width[i] + 7) >> 3) << 1;
const int ref_h = ((f->refp[i].p.p.h + 7) >> 3) << 1;
if (c->refs[refidx].refmvs != NULL &&
ref_coded_width[i] == f->cur.p.w &&
f->refp[i].p.p.h == f->cur.p.h)
ref_w == f->bw && ref_h == f->bh)
{
f->ref_mvs_ref[i] = c->refs[refidx].refmvs;
dav1d_ref_inc(f->ref_mvs_ref[i]);


@ -32,6 +32,6 @@
#include "src/levels.h"
extern const uint16_t dav1d_dq_tbl[3][QINDEX_RANGE][2];
EXTERN const uint16_t dav1d_dq_tbl[3][QINDEX_RANGE][2];
#endif /* DAV1D_SRC_DEQUANT_TABLES_H */


@ -51,6 +51,11 @@ static void generate_scaling(const int bitdepth,
const int scaling_size = 1 << bitdepth;
#endif
if (num == 0) {
memset(scaling, 0, scaling_size);
return;
}
// Fill up the preceding entries with the initial value
memset(scaling, points[0][1], points[0][0] << shift_x);
@ -113,7 +118,7 @@ void bitfn(dav1d_prep_grain)(const Dav1dFilmGrainDSPContext *const dsp,
data, 1 HIGHBD_TAIL_SUFFIX);
// Generate scaling LUTs as needed
if (data->num_y_points)
if (data->num_y_points || data->chroma_scaling_from_luma)
generate_scaling(in->p.bpc, data->y_points, data->num_y_points, scaling[0]);
if (data->num_uv_points[0])
generate_scaling(in->p.bpc, data->uv_points[0], data->num_uv_points[0], scaling[1]);


@ -64,7 +64,7 @@ typedef decl_fgy_32x32xn_fn(*fgy_32x32xn_fn);
#define decl_fguv_32x32xn_fn(name) \
void (name)(pixel *dst_row, const pixel *src_row, ptrdiff_t stride, \
const Dav1dFilmGrainData *data, int pw, \
const Dav1dFilmGrainData *data, size_t pw, \
const uint8_t scaling[SCALING_SIZE], \
const entry grain_lut[][GRAIN_WIDTH], int bh, int row_num, \
const pixel *luma_row, ptrdiff_t luma_stride, \


@ -278,7 +278,7 @@ static void fgy_32x32xn_c(pixel *const dst_row, const pixel *const src_row,
static NOINLINE void
fguv_32x32xn_c(pixel *const dst_row, const pixel *const src_row,
const ptrdiff_t stride, const Dav1dFilmGrainData *const data,
const int pw, const uint8_t scaling[SCALING_SIZE],
const size_t pw, const uint8_t scaling[SCALING_SIZE],
const entry grain_lut[][GRAIN_WIDTH], const int bh,
const int row_num, const pixel *const luma_row,
const ptrdiff_t luma_stride, const int uv, const int is_id,
@ -311,8 +311,8 @@ fguv_32x32xn_c(pixel *const dst_row, const pixel *const src_row,
int offsets[2 /* col offset */][2 /* row offset */];
// process this row in BLOCK_SIZE^2 blocks (subsampled)
for (int bx = 0; bx < pw; bx += BLOCK_SIZE >> sx) {
const int bw = imin(BLOCK_SIZE >> sx, pw - bx);
for (unsigned bx = 0; bx < pw; bx += BLOCK_SIZE >> sx) {
const int bw = imin(BLOCK_SIZE >> sx, (int)(pw - bx));
if (data->overlap_flag && bx) {
// shift previous offsets left
for (int i = 0; i < rows; i++)
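
Here and in the NEON wrapper above, pw becomes size_t and the block index bx becomes unsigned, so the loop condition compares operands of the same signedness instead of implicitly converting the signed index; that is one plausible reading of the change. Illustration of the new shape:

#include <stddef.h>

void walk(size_t pw) {
    /* Index and bound share signedness, and the tail width pw - bx is
     * well-defined unsigned arithmetic, narrowed back explicitly. */
    for (unsigned bx = 0; bx < pw; bx += 16) {
        unsigned bw = pw - bx < 16 ? (unsigned)(pw - bx) : 16u;
        (void)bw;
    }
}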


@ -275,7 +275,7 @@ struct Dav1dFrameContext {
struct {
int next_tile_row[2 /* 0: reconstruction, 1: entropy */];
int entropy_progress;
atomic_int entropy_progress;
atomic_int deblock_progress; // in sby units
atomic_uint *frame_progress, *copy_lpf_progress;
// indexed using t->by * f->b4_stride + t->bx
@ -324,22 +324,28 @@ struct Dav1dFrameContext {
} lf;
struct {
pthread_mutex_t lock;
pthread_cond_t cond;
struct TaskThreadData *ttd;
struct Dav1dTask *tasks, *tile_tasks[2], init_task;
int num_tasks, num_tile_tasks;
int init_done;
int done[2];
atomic_int init_done;
atomic_int done[2];
int retval;
int update_set; // whether we need to update CDF reference
atomic_int error;
int task_counter;
atomic_int task_counter;
struct Dav1dTask *task_head, *task_tail;
// Points to the task directly before the cur pointer in the queue.
// This cur pointer is theoretical here, we actually keep track of the
// "prev_t" variable. This is needed to not loose the tasks in
// [head;cur-1] when picking one for execution.
struct Dav1dTask *task_cur_prev;
struct { // async task insertion
atomic_int merge;
pthread_mutex_t lock;
Dav1dTask *head, *tail;
} pending_tasks;
} task_thread;
// threading (refer to tc[] for per-thread things)
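
The new pending_tasks sub-struct adds a second, smaller lock so other threads can queue tasks without contending on the main task lock; the merge flag advertises that the pending list is non-empty. A hedged sketch of how such a two-stage queue is typically drained (not the exact upstream code; list_append and merge_into_main_queue are hypothetical helpers):

/* Producer: append under the small pending lock, then raise merge. */
pthread_mutex_lock(&tt->pending_tasks.lock);
list_append(&tt->pending_tasks.head, &tt->pending_tasks.tail, t);
pthread_mutex_unlock(&tt->pending_tasks.lock);
atomic_store(&tt->pending_tasks.merge, 1);

/* Consumer, on its scheduling path: splice pending into the main queue. */
if (atomic_exchange(&tt->pending_tasks.merge, 0)) {
    pthread_mutex_lock(&tt->pending_tasks.lock);
    Dav1dTask *head = tt->pending_tasks.head;
    tt->pending_tasks.head = tt->pending_tasks.tail = NULL;
    pthread_mutex_unlock(&tt->pending_tasks.lock);
    merge_into_main_queue(head);      /* hypothetical helper */
}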


@ -235,8 +235,18 @@ COLD int dav1d_open(Dav1dContext **const c_out, const Dav1dSettings *const s) {
}
for (unsigned n = 0; n < c->n_fc; n++) {
Dav1dFrameContext *const f = &c->fc[n];
if (c->n_tc > 1)
if (pthread_cond_init(&f->task_thread.cond, NULL)) goto error;
if (c->n_tc > 1) {
if (pthread_mutex_init(&f->task_thread.lock, NULL)) goto error;
if (pthread_cond_init(&f->task_thread.cond, NULL)) {
pthread_mutex_destroy(&f->task_thread.lock);
goto error;
}
if (pthread_mutex_init(&f->task_thread.pending_tasks.lock, NULL)) {
pthread_cond_destroy(&f->task_thread.cond);
pthread_mutex_destroy(&f->task_thread.lock);
goto error;
}
}
f->c = c;
f->task_thread.ttd = &c->task_thread;
f->lf.last_sharpness = -1;
@ -335,7 +345,8 @@ static int has_grain(const Dav1dPicture *const pic)
{
const Dav1dFilmGrainData *fgdata = &pic->frame_hdr->film_grain.data;
return fgdata->num_y_points || fgdata->num_uv_points[0] ||
fgdata->num_uv_points[1];
fgdata->num_uv_points[1] || (fgdata->clip_to_restricted_range &&
fgdata->chroma_scaling_from_luma);
}
static int output_image(Dav1dContext *const c, Dav1dPicture *const out)
@ -392,10 +403,13 @@ static int drain_picture(Dav1dContext *const c, Dav1dPicture *const out) {
Dav1dThreadPicture *const out_delayed =
&c->frame_thread.out_delayed[next];
if (out_delayed->p.data[0] || atomic_load(&f->task_thread.error)) {
if (atomic_load(&c->task_thread.first) + 1U < c->n_fc)
unsigned first = atomic_load(&c->task_thread.first);
if (first + 1U < c->n_fc)
atomic_fetch_add(&c->task_thread.first, 1U);
else
atomic_store(&c->task_thread.first, 0);
atomic_compare_exchange_strong(&c->task_thread.reset_task_cur,
&first, UINT_MAX);
if (c->task_thread.cur && c->task_thread.cur < c->n_fc)
c->task_thread.cur--;
}
@ -591,6 +605,9 @@ void dav1d_flush(Dav1dContext *const c) {
c->fc[i].task_thread.task_head = NULL;
c->fc[i].task_thread.task_tail = NULL;
c->fc[i].task_thread.task_cur_prev = NULL;
c->fc[i].task_thread.pending_tasks.head = NULL;
c->fc[i].task_thread.pending_tasks.tail = NULL;
atomic_init(&c->fc[i].task_thread.pending_tasks.merge, 0);
}
atomic_init(&c->task_thread.first, 0);
c->task_thread.cur = c->n_fc;
@ -664,7 +681,9 @@ static COLD void close_internal(Dav1dContext **const c_out, int flush) {
freep(&f->frame_thread.cbi);
}
if (c->n_tc > 1) {
pthread_mutex_destroy(&f->task_thread.pending_tasks.lock);
pthread_cond_destroy(&f->task_thread.cond);
pthread_mutex_destroy(&f->task_thread.lock);
}
freep(&f->frame_thread.frame_progress);
freep(&f->task_thread.tasks);


@ -174,6 +174,7 @@ if is_asm_enabled
'x86/cpuid.asm',
'x86/msac.asm',
'x86/refmvs.asm',
'x86/itx_avx512.asm',
'x86/cdef_avx2.asm',
'x86/itx_avx2.asm',
'x86/looprestoration_avx2.asm',
@ -186,7 +187,6 @@ if is_asm_enabled
'x86/cdef_avx512.asm',
'x86/filmgrain_avx512.asm',
'x86/ipred_avx512.asm',
'x86/itx_avx512.asm',
'x86/loopfilter_avx512.asm',
'x86/looprestoration_avx512.asm',
'x86/mc_avx512.asm',
@ -207,6 +207,7 @@ if is_asm_enabled
'x86/cdef16_avx512.asm',
'x86/filmgrain16_avx512.asm',
'x86/ipred16_avx512.asm',
'x86/itx16_avx512.asm',
'x86/loopfilter16_avx512.asm',
'x86/looprestoration16_avx512.asm',
'x86/mc16_avx512.asm',

View File

@ -1509,7 +1509,7 @@ int dav1d_parse_obus(Dav1dContext *const c, Dav1dData *const in, const int globa
if (payload_size <= 0) {
dav1d_log(c, "Malformed ITU-T T.35 metadata message format\n");
goto error;
break;
}
Dav1dRef *ref = dav1d_ref_create(sizeof(Dav1dITUTT35) + payload_size * sizeof(uint8_t));
@ -1581,10 +1581,13 @@ int dav1d_parse_obus(Dav1dContext *const c, Dav1dData *const in, const int globa
Dav1dThreadPicture *const out_delayed =
&c->frame_thread.out_delayed[next];
if (out_delayed->p.data[0] || atomic_load(&f->task_thread.error)) {
if (atomic_load(&c->task_thread.first) + 1U < c->n_fc)
unsigned first = atomic_load(&c->task_thread.first);
if (first + 1U < c->n_fc)
atomic_fetch_add(&c->task_thread.first, 1U);
else
atomic_store(&c->task_thread.first, 0);
atomic_compare_exchange_strong(&c->task_thread.reset_task_cur,
&first, UINT_MAX);
if (c->task_thread.cur && c->task_thread.cur < c->n_fc)
c->task_thread.cur--;
}

View File

@ -30,7 +30,7 @@
#include "src/levels.h"
extern const uint8_t *dav1d_qm_tbl[16][2][N_RECT_TX_SIZES];
EXTERN const uint8_t *dav1d_qm_tbl[16][2][N_RECT_TX_SIZES];
void dav1d_init_qm_tables(void);

View File

@ -591,7 +591,7 @@ static int decode_coefs(Dav1dTaskContext *const t,
const uint16_t *const dq_tbl = ts->dq[b->seg_id][plane];
const uint8_t *const qm_tbl = *txtp < IDTX ? f->qm[tx][plane] : NULL;
const int dq_shift = imax(0, t_dim->ctx - 2);
const unsigned cf_max = ~(~127U << (BITDEPTH == 8 ? 8 : f->cur.p.bpc));
const int cf_max = ~(~127U << (BITDEPTH == 8 ? 8 : f->cur.p.bpc));
unsigned cul_level, dc_sign_level;
if (!dc_tok) {
@ -608,7 +608,7 @@ static int decode_coefs(Dav1dTaskContext *const t,
printf("Post-dc_sign[%d][%d][%d]: r=%d\n",
chroma, dc_sign_ctx, dc_sign, ts->msac.rng);
unsigned dc_dq = dq_tbl[0];
int dc_dq = dq_tbl[0];
dc_sign_level = (dc_sign - 1) & (2 << 6);
if (qm_tbl) {
@ -628,7 +628,8 @@ static int decode_coefs(Dav1dTaskContext *const t,
}
cul_level = dc_tok;
dc_dq >>= dq_shift;
cf[0] = (coef) (umin(dc_dq - dc_sign, cf_max) ^ -dc_sign);
dc_dq = umin(dc_dq, cf_max + dc_sign);
cf[0] = (coef) (dc_sign ? -dc_dq : dc_dq);
if (rc) ac_qm: {
const unsigned ac_dq = dq_tbl[1];
@ -638,6 +639,7 @@ static int decode_coefs(Dav1dTaskContext *const t,
printf("Post-sign[%d=%d]: r=%d\n", rc, sign, ts->msac.rng);
const unsigned rc_tok = cf[rc];
unsigned tok, dq = (ac_dq * qm_tbl[rc] + 16) >> 5;
int dq_sat;
if (rc_tok >= (15 << 11)) {
tok = read_golomb(&ts->msac) + 15;
@ -654,7 +656,8 @@ static int decode_coefs(Dav1dTaskContext *const t,
}
cul_level += tok;
dq >>= dq_shift;
cf[rc] = (coef) (umin(dq - sign, cf_max) ^ -sign);
dq_sat = umin(dq, cf_max + sign);
cf[rc] = (coef) (sign ? -dq_sat : dq_sat);
rc = rc_tok & 0x3ff;
} while (rc);
@ -669,13 +672,13 @@ static int decode_coefs(Dav1dTaskContext *const t,
dc_tok &= 0xfffff;
dc_dq = ((dc_dq * dc_tok) & 0xffffff) >> dq_shift;
dc_dq = umin(dc_dq - dc_sign, cf_max);
dc_dq = umin(dc_dq, cf_max + dc_sign);
} else {
dc_dq = ((dc_dq * dc_tok) >> dq_shift) - dc_sign;
dc_dq = ((dc_dq * dc_tok) >> dq_shift);
assert(dc_dq <= cf_max);
}
cul_level = dc_tok;
cf[0] = (coef) (dc_dq ^ -dc_sign);
cf[0] = (coef) (dc_sign ? -dc_dq : dc_dq);
if (rc) ac_noqm: {
const unsigned ac_dq = dq_tbl[1];
@ -684,7 +687,8 @@ static int decode_coefs(Dav1dTaskContext *const t,
if (dbg)
printf("Post-sign[%d=%d]: r=%d\n", rc, sign, ts->msac.rng);
const unsigned rc_tok = cf[rc];
unsigned tok, dq;
unsigned tok;
int dq;
// residual
if (rc_tok >= (15 << 11)) {
@ -698,15 +702,15 @@ static int decode_coefs(Dav1dTaskContext *const t,
// dequant, see 7.12.3
dq = ((ac_dq * tok) & 0xffffff) >> dq_shift;
dq = umin(dq - sign, cf_max);
dq = umin(dq, cf_max + sign);
} else {
// cannot exceed cf_max, so we can avoid the clipping
tok = rc_tok >> 11;
dq = ((ac_dq * tok) >> dq_shift) - sign;
dq = ((ac_dq * tok) >> dq_shift);
assert(dq <= cf_max);
}
cul_level += tok;
cf[rc] = (coef) (dq ^ -sign);
cf[rc] = (coef) (sign ? -dq : dq);
rc = rc_tok & 0x3ff; // next non-zero rc, zero if eob
} while (rc);
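As I read the sign-handling rewrite above, it keeps the dequantized value in a signed type and saturates before negating, instead of the old umin(dq - sign, cf_max) ^ -sign idiom that relied on unsigned wraparound. For dq >= sign the two forms are arithmetically identical, which this standalone check illustrates (assuming two's-complement conversion, as the targeted platforms provide):

#include <assert.h>

static unsigned umin(const unsigned a, const unsigned b) {
    return a < b ? a : b;
}

int main(void) {
    const unsigned cf_max = ~(~127U << 10); /* e.g. 10 bpc */
    for (unsigned dq = 1; dq <= 2 * cf_max; dq++)
        for (unsigned sign = 0; sign < 2; sign++) {
            const int old_cf = (int)(umin(dq - sign, cf_max) ^ -sign);
            const int dq_sat = (int)umin(dq, cf_max + sign);
            const int new_cf = sign ? -dq_sat : dq_sat;
            assert(old_cf == new_cf);
        }
    return 0;
}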
@ -1092,9 +1096,10 @@ static int obmc(Dav1dTaskContext *const t,
// only odd blocks are considered for overlap handling, hence +1
const refmvs_block *const a_r = &r[-1][t->bx + x + 1];
const uint8_t *const a_b_dim = dav1d_block_dimensions[a_r->bs];
const int step4 = iclip(a_b_dim[0], 2, 16);
if (a_r->ref.ref[0] > 0) {
const int ow4 = iclip(a_b_dim[0], 2, b_dim[0]);
const int ow4 = imin(step4, b_dim[0]);
const int oh4 = imin(b_dim[1], 16) >> 1;
res = mc(t, lap, NULL, ow4 * h_mul * sizeof(pixel), ow4, (oh4 * 3 + 3) >> 2,
t->bx + x, t->by, pl, a_r->mv.mv[0],
@ -1105,7 +1110,7 @@ static int obmc(Dav1dTaskContext *const t,
h_mul * ow4, v_mul * oh4);
i++;
}
x += imax(a_b_dim[0], 2);
x += step4;
}
}
@ -1114,10 +1119,11 @@ static int obmc(Dav1dTaskContext *const t,
// only odd blocks are considered for overlap handling, hence +1
const refmvs_block *const l_r = &r[y + 1][t->bx - 1];
const uint8_t *const l_b_dim = dav1d_block_dimensions[l_r->bs];
const int step4 = iclip(l_b_dim[1], 2, 16);
if (l_r->ref.ref[0] > 0) {
const int ow4 = imin(b_dim[0], 16) >> 1;
const int oh4 = iclip(l_b_dim[1], 2, b_dim[1]);
const int oh4 = imin(step4, b_dim[1]);
res = mc(t, lap, NULL, h_mul * ow4 * sizeof(pixel), ow4, oh4,
t->bx, t->by + y, pl, l_r->mv.mv[0],
&f->refp[l_r->ref.ref[0] - 1], l_r->ref.ref[0] - 1,
@ -1127,7 +1133,7 @@ static int obmc(Dav1dTaskContext *const t,
dst_stride, lap, h_mul * ow4, v_mul * oh4);
i++;
}
y += imax(l_b_dim[1], 2);
y += step4;
}
return 0;
}

View File

@ -32,6 +32,6 @@
#include "src/levels.h"
extern const uint16_t *const dav1d_scans[N_RECT_TX_SIZES];
EXTERN const uint16_t *const dav1d_scans[N_RECT_TX_SIZES];
#endif /* DAV1D_SRC_SCAN_H */

View File

@ -34,38 +34,38 @@
#include "src/levels.h"
extern const uint8_t dav1d_al_part_ctx[2][N_BL_LEVELS][N_PARTITIONS];
extern const uint8_t /* enum BlockSize */
EXTERN const uint8_t dav1d_al_part_ctx[2][N_BL_LEVELS][N_PARTITIONS];
EXTERN const uint8_t /* enum BlockSize */
dav1d_block_sizes[N_BL_LEVELS][N_PARTITIONS][2];
// width, height (in 4px blocks), log2 versions of these two
extern const uint8_t dav1d_block_dimensions[N_BS_SIZES][4];
EXTERN const uint8_t dav1d_block_dimensions[N_BS_SIZES][4];
typedef struct TxfmInfo {
// width, height (in 4px blocks), log2 of them, min/max of log2, sub, pad
uint8_t w, h, lw, lh, min, max, sub, ctx;
} TxfmInfo;
extern const TxfmInfo dav1d_txfm_dimensions[N_RECT_TX_SIZES];
extern const uint8_t /* enum (Rect)TxfmSize */
EXTERN const TxfmInfo dav1d_txfm_dimensions[N_RECT_TX_SIZES];
EXTERN const uint8_t /* enum (Rect)TxfmSize */
dav1d_max_txfm_size_for_bs[N_BS_SIZES][4 /* y, 420, 422, 444 */];
extern const uint8_t /* enum TxfmType */
EXTERN const uint8_t /* enum TxfmType */
dav1d_txtp_from_uvmode[N_UV_INTRA_PRED_MODES];
extern const uint8_t /* enum InterPredMode */
EXTERN const uint8_t /* enum InterPredMode */
dav1d_comp_inter_pred_modes[N_COMP_INTER_PRED_MODES][2];
extern const uint8_t dav1d_partition_type_count[N_BL_LEVELS];
extern const uint8_t /* enum TxfmType */ dav1d_tx_types_per_set[40];
EXTERN const uint8_t dav1d_partition_type_count[N_BL_LEVELS];
EXTERN const uint8_t /* enum TxfmType */ dav1d_tx_types_per_set[40];
extern const uint8_t dav1d_filter_mode_to_y_mode[5];
extern const uint8_t dav1d_ymode_size_context[N_BS_SIZES];
extern const uint8_t dav1d_lo_ctx_offsets[3][5][5];
extern const uint8_t dav1d_skip_ctx[5][5];
extern const uint8_t /* enum TxClass */
EXTERN const uint8_t dav1d_filter_mode_to_y_mode[5];
EXTERN const uint8_t dav1d_ymode_size_context[N_BS_SIZES];
EXTERN const uint8_t dav1d_lo_ctx_offsets[3][5][5];
EXTERN const uint8_t dav1d_skip_ctx[5][5];
EXTERN const uint8_t /* enum TxClass */
dav1d_tx_type_class[N_TX_TYPES_PLUS_LL];
extern const uint8_t /* enum Filter2d */
EXTERN const uint8_t /* enum Filter2d */
dav1d_filter_2d[DAV1D_N_FILTERS /* h */][DAV1D_N_FILTERS /* v */];
extern const uint8_t /* enum Dav1dFilterMode */ dav1d_filter_dir[N_2D_FILTERS][2];
extern const uint8_t dav1d_intra_mode_context[N_INTRA_PRED_MODES];
extern const uint8_t dav1d_wedge_ctx_lut[N_BS_SIZES];
EXTERN const uint8_t /* enum Dav1dFilterMode */ dav1d_filter_dir[N_2D_FILTERS][2];
EXTERN const uint8_t dav1d_intra_mode_context[N_INTRA_PRED_MODES];
EXTERN const uint8_t dav1d_wedge_ctx_lut[N_BS_SIZES];
static const unsigned cfl_allowed_mask =
(1 << BS_32x32) |
@ -103,23 +103,23 @@ static const unsigned interintra_allowed_mask =
(1 << BS_8x16) |
(1 << BS_8x8);
extern const Dav1dWarpedMotionParams dav1d_default_wm_params;
EXTERN const Dav1dWarpedMotionParams dav1d_default_wm_params;
extern const int8_t dav1d_cdef_directions[12][2];
EXTERN const int8_t dav1d_cdef_directions[12][2];
extern const uint16_t dav1d_sgr_params[16][2];
extern const uint8_t dav1d_sgr_x_by_x[256];
EXTERN const uint16_t dav1d_sgr_params[16][2];
EXTERN const uint8_t dav1d_sgr_x_by_x[256];
extern const int8_t dav1d_mc_subpel_filters[6][15][8];
extern const int8_t dav1d_mc_warp_filter[193][8];
extern const int8_t dav1d_resize_filter[64][8];
EXTERN const int8_t dav1d_mc_subpel_filters[6][15][8];
EXTERN const int8_t dav1d_mc_warp_filter[193][8];
EXTERN const int8_t dav1d_resize_filter[64][8];
extern const uint8_t dav1d_sm_weights[128];
extern const uint16_t dav1d_dr_intra_derivative[44];
extern const int8_t dav1d_filter_intra_taps[5][64];
EXTERN const uint8_t dav1d_sm_weights[128];
EXTERN const uint16_t dav1d_dr_intra_derivative[44];
EXTERN const int8_t dav1d_filter_intra_taps[5][64];
extern const uint8_t dav1d_obmc_masks[64];
EXTERN const uint8_t dav1d_obmc_masks[64];
extern const int16_t dav1d_gaussian_sequence[2048]; // for fgs
EXTERN const int16_t dav1d_gaussian_sequence[2048]; // for fgs
#endif /* DAV1D_SRC_TABLES_H */

View File

@ -49,9 +49,13 @@ static inline int reset_task_cur(const Dav1dContext *const c,
unsigned frame_idx)
{
const unsigned first = atomic_load(&ttd->first);
unsigned reset_frame_idx = atomic_exchange(&ttd->reset_task_cur, UINT_MAX);
if (reset_frame_idx < first) {
if (frame_idx == UINT_MAX) return 0;
reset_frame_idx = UINT_MAX;
}
if (!ttd->cur && c->fc[first].task_thread.task_cur_prev == NULL)
return 0;
unsigned reset_frame_idx = atomic_exchange(&ttd->reset_task_cur, UINT_MAX);
if (reset_frame_idx != UINT_MAX) {
if (frame_idx == UINT_MAX) {
if (reset_frame_idx > first + ttd->cur)
@ -78,12 +82,17 @@ cur_found:
static inline void reset_task_cur_async(struct TaskThreadData *const ttd,
unsigned frame_idx, unsigned n_frames)
{
if (frame_idx < (unsigned)atomic_load(&ttd->first)) frame_idx += n_frames;
const unsigned first = atomic_load(&ttd->first);
if (frame_idx < first) frame_idx += n_frames;
unsigned last_idx = frame_idx;
do {
frame_idx = last_idx;
last_idx = atomic_exchange(&ttd->reset_task_cur, frame_idx);
} while (last_idx < frame_idx);
if (frame_idx == first && atomic_load(&ttd->first) != first) {
unsigned expected = frame_idx;
atomic_compare_exchange_strong(&ttd->reset_task_cur, &expected, UINT_MAX);
}
}
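For intuition, the exchange loop above acts as a lock-free "store the minimum": each iteration swaps the candidate in and, if it displaced a smaller value, swaps that value back. A standalone sketch under that reading (single-threaded assertions only); the compare-exchange that follows in the source then cancels the stored index if first moved concurrently:

#include <assert.h>
#include <stdatomic.h>

static void store_min(atomic_uint *const slot, unsigned idx) {
    unsigned last = idx;
    do { /* retry with whatever smaller value we displaced */
        idx = last;
        last = atomic_exchange(slot, idx);
    } while (last < idx);
}

int main(void) {
    atomic_uint slot = 7;
    store_min(&slot, 9); assert(atomic_load(&slot) == 7);
    store_min(&slot, 3); assert(atomic_load(&slot) == 3);
    return 0;
}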
static void insert_tasks_between(Dav1dFrameContext *const f,
@ -164,6 +173,43 @@ static inline void insert_task(Dav1dFrameContext *const f,
insert_tasks(f, t, t, cond_signal);
}
static inline void add_pending(Dav1dFrameContext *const f, Dav1dTask *const t) {
pthread_mutex_lock(&f->task_thread.pending_tasks.lock);
t->next = NULL;
if (!f->task_thread.pending_tasks.head)
f->task_thread.pending_tasks.head = t;
else
f->task_thread.pending_tasks.tail->next = t;
f->task_thread.pending_tasks.tail = t;
atomic_store(&f->task_thread.pending_tasks.merge, 1);
pthread_mutex_unlock(&f->task_thread.pending_tasks.lock);
}
static inline int merge_pending_frame(Dav1dFrameContext *const f) {
int const merge = atomic_load(&f->task_thread.pending_tasks.merge);
if (merge) {
pthread_mutex_lock(&f->task_thread.pending_tasks.lock);
Dav1dTask *t = f->task_thread.pending_tasks.head;
f->task_thread.pending_tasks.head = NULL;
f->task_thread.pending_tasks.tail = NULL;
atomic_store(&f->task_thread.pending_tasks.merge, 0);
pthread_mutex_unlock(&f->task_thread.pending_tasks.lock);
while (t) {
Dav1dTask *const tmp = t->next;
insert_task(f, t, 0);
t = tmp;
}
}
return merge;
}
static inline int merge_pending(const Dav1dContext *const c) {
int res = 0;
for (unsigned i = 0; i < c->n_fc; i++)
res |= merge_pending_frame(&c->fc[i]);
return res;
}
static int create_filter_sbrow(Dav1dFrameContext *const f,
const int pass, Dav1dTask **res_t)
{
@ -192,13 +238,14 @@ static int create_filter_sbrow(Dav1dFrameContext *const f,
const int prog_sz = ((f->sbh + 31) & ~31) >> 5;
if (prog_sz > f->frame_thread.prog_sz) {
atomic_uint *const prog = realloc(f->frame_thread.frame_progress,
prog_sz * 2 * sizeof(*prog));
2 * prog_sz * sizeof(*prog));
if (!prog) return -1;
f->frame_thread.frame_progress = prog;
f->frame_thread.copy_lpf_progress = prog + prog_sz;
f->frame_thread.prog_sz = prog_sz;
}
memset(f->frame_thread.frame_progress, 0, prog_sz * 2 * sizeof(atomic_uint));
f->frame_thread.prog_sz = prog_sz;
memset(f->frame_thread.frame_progress, 0, prog_sz * sizeof(atomic_uint));
memset(f->frame_thread.copy_lpf_progress, 0, prog_sz * sizeof(atomic_uint));
atomic_store(&f->frame_thread.deblock_progress, 0);
}
f->frame_thread.next_tile_row[pass & 1] = 0;
@ -224,16 +271,18 @@ int dav1d_task_create_tile_sbrow(Dav1dFrameContext *const f, const int pass,
Dav1dTask *tasks = f->task_thread.tile_tasks[0];
const int uses_2pass = f->c->n_fc > 1;
const int num_tasks = f->frame_hdr->tiling.cols * f->frame_hdr->tiling.rows;
int alloc_num_tasks = num_tasks * (1 + uses_2pass);
if (alloc_num_tasks > f->task_thread.num_tile_tasks) {
const size_t size = sizeof(Dav1dTask) * alloc_num_tasks;
tasks = realloc(f->task_thread.tile_tasks[0], size);
if (!tasks) return -1;
memset(tasks, 0, size);
f->task_thread.tile_tasks[0] = tasks;
f->task_thread.num_tile_tasks = alloc_num_tasks;
if (pass < 2) {
int alloc_num_tasks = num_tasks * (1 + uses_2pass);
if (alloc_num_tasks > f->task_thread.num_tile_tasks) {
const size_t size = sizeof(Dav1dTask) * alloc_num_tasks;
tasks = realloc(f->task_thread.tile_tasks[0], size);
if (!tasks) return -1;
memset(tasks, 0, size);
f->task_thread.tile_tasks[0] = tasks;
f->task_thread.num_tile_tasks = alloc_num_tasks;
}
f->task_thread.tile_tasks[1] = tasks + num_tasks;
}
f->task_thread.tile_tasks[1] = tasks + num_tasks;
tasks += num_tasks * (pass & 1);
Dav1dTask *pf_t;
@ -263,8 +312,22 @@ int dav1d_task_create_tile_sbrow(Dav1dFrameContext *const f, const int pass,
prev_t->next = pf_t;
prev_t = pf_t;
}
insert_tasks(f, &tasks[0], prev_t, cond_signal);
f->task_thread.done[pass & 1] = 0;
prev_t->next = NULL;
atomic_store(&f->task_thread.done[pass & 1], 0);
// XXX in theory this could be done locklessly; at this point there are no
// tasks in the frameQ, so no other runner should be using this lock,
// but we must add both passes at once
pthread_mutex_lock(&f->task_thread.pending_tasks.lock);
assert(f->task_thread.pending_tasks.head == NULL || pass == 2);
if (!f->task_thread.pending_tasks.head)
f->task_thread.pending_tasks.head = &tasks[0];
else
f->task_thread.pending_tasks.tail->next = &tasks[0];
f->task_thread.pending_tasks.tail = prev_t;
atomic_store(&f->task_thread.pending_tasks.merge, 1);
pthread_mutex_unlock(&f->task_thread.pending_tasks.lock);
return 0;
}
@ -272,7 +335,7 @@ int dav1d_task_create_tile_sbrow(Dav1dFrameContext *const f, const int pass,
void dav1d_task_frame_init(Dav1dFrameContext *const f) {
const Dav1dContext *const c = f->c;
f->task_thread.init_done = 0;
atomic_store(&f->task_thread.init_done, 0);
// schedule init task, which will schedule the remaining tasks
Dav1dTask *const t = &f->task_thread.init_task;
t->type = DAV1D_TASK_TYPE_INIT;
@ -307,16 +370,12 @@ static inline int ensure_progress(struct TaskThreadData *const ttd,
// so ensure that completed. if not, re-add to task-queue; else, fall-through
int p1 = atomic_load(state);
if (p1 < t->sby) {
t->type = type;
t->recon_progress = t->deblock_progress = 0;
*target = t->sby;
add_pending(f, t);
pthread_mutex_lock(&ttd->lock);
p1 = atomic_load(state);
if (p1 < t->sby) {
t->type = type;
t->recon_progress = t->deblock_progress = 0;
*target = t->sby;
insert_task(f, t, 0);
return 1;
}
pthread_mutex_unlock(&ttd->lock);
return 1;
}
return 0;
}
@ -369,11 +428,29 @@ static inline int check_tile(Dav1dTask *const t, Dav1dFrameContext *const f,
return 0;
}
static inline int get_frame_progress(const Dav1dContext *const c,
const Dav1dFrameContext *const f)
{
unsigned frame_prog = c->n_fc > 1 ? atomic_load(&f->sr_cur.progress[1]) : 0;
if (frame_prog >= FRAME_ERROR)
return f->sbh - 1;
int idx = frame_prog >> (f->sb_shift + 7);
int prog;
do {
atomic_uint *state = &f->frame_thread.frame_progress[idx];
const unsigned val = ~atomic_load(state);
prog = val ? ctz(val) : 32;
if (prog != 32) break;
prog = 0;
} while (++idx < f->frame_thread.prog_sz);
return ((idx << 5) | prog) - 1;
}
static inline void abort_frame(Dav1dFrameContext *const f, const int error) {
atomic_store(&f->task_thread.error, error == DAV1D_ERR(EINVAL) ? 1 : -1);
f->task_thread.task_counter = 0;
f->task_thread.done[0] = 1;
f->task_thread.done[1] = 1;
atomic_store(&f->task_thread.task_counter, 0);
atomic_store(&f->task_thread.done[0], 1);
atomic_store(&f->task_thread.done[1], 1);
atomic_store(&f->sr_cur.progress[0], FRAME_ERROR);
atomic_store(&f->sr_cur.progress[1], FRAME_ERROR);
dav1d_decode_frame_exit(f, error);
@ -478,6 +555,8 @@ void *dav1d_worker_task(void *data) {
for (;;) {
if (tc->task_thread.die) break;
if (atomic_load(c->flush)) goto park;
merge_pending(c);
if (ttd->delayed_fg.exec) { // run delayed film grain first
delayed_fg_task(c, ttd);
continue;
@ -488,11 +567,18 @@ void *dav1d_worker_task(void *data) {
for (unsigned i = 0; i < c->n_fc; i++) {
const unsigned first = atomic_load(&ttd->first);
f = &c->fc[(first + i) % c->n_fc];
if (f->task_thread.init_done) continue;
if (atomic_load(&f->task_thread.init_done)) continue;
t = f->task_thread.task_head;
if (!t) continue;
if (t->type == DAV1D_TASK_TYPE_INIT) goto found;
if (t->type == DAV1D_TASK_TYPE_INIT_CDF) {
// XXX This could be a simple else if tasks of both passes
// were added at once (in dav1d_task_create_tile_sbrow).
// Adding the tasks to the pending Q can result in a
// thread merging them before init_done is set.
// We would need to set init_done before adding to the
// pending Q, so maybe return the tasks, set init_done,
// and only then add to the pending Q.
const int p1 = f->in_cdf.progress ?
atomic_load(f->in_cdf.progress) : 1;
if (p1) {
@ -505,6 +591,7 @@ void *dav1d_worker_task(void *data) {
while (ttd->cur < c->n_fc) { // run decoding tasks last
const unsigned first = atomic_load(&ttd->first);
f = &c->fc[(first + ttd->cur) % c->n_fc];
merge_pending_frame(f);
prev_t = f->task_thread.task_cur_prev;
t = prev_t ? prev_t->next : f->task_thread.task_head;
while (t) {
@ -519,11 +606,12 @@ void *dav1d_worker_task(void *data) {
} else if (t->recon_progress) {
const int p = t->type == DAV1D_TASK_TYPE_ENTROPY_PROGRESS;
int error = atomic_load(&f->task_thread.error);
assert(!f->task_thread.done[p] || error);
assert(!atomic_load(&f->task_thread.done[p]) || error);
const int tile_row_base = f->frame_hdr->tiling.cols *
f->frame_thread.next_tile_row[p];
if (p) {
const int p1 = f->frame_thread.entropy_progress;
atomic_int *const prog = &f->frame_thread.entropy_progress;
const int p1 = atomic_load(prog);
if (p1 < t->sby) goto next;
atomic_fetch_or(&f->task_thread.error, p1 == TILE_ERROR);
}
@ -567,6 +655,7 @@ void *dav1d_worker_task(void *data) {
ttd->cur++;
}
if (reset_task_cur(c, ttd, UINT_MAX)) continue;
if (merge_pending(c)) continue;
park:
tc->task_thread.flushed = 1;
pthread_cond_signal(&tc->task_thread.td.cond);
@ -584,6 +673,7 @@ void *dav1d_worker_task(void *data) {
if (!t->next) f->task_thread.task_tail = prev_t;
if (t->type > DAV1D_TASK_TYPE_INIT_CDF && !f->task_thread.task_head)
ttd->cur++;
t->next = NULL;
// we don't need to check cond_signaled here, since we found a task
// after the last signal so we want to re-signal the next waiting thread
// and again won't need to signal after that
@ -605,13 +695,13 @@ void *dav1d_worker_task(void *data) {
if (res || p1 == TILE_ERROR) {
pthread_mutex_lock(&ttd->lock);
abort_frame(f, res ? res : DAV1D_ERR(EINVAL));
} else if (!res) {
reset_task_cur(c, ttd, t->frame_idx);
} else {
t->type = DAV1D_TASK_TYPE_INIT_CDF;
if (p1) goto found_unlocked;
add_pending(f, t);
pthread_mutex_lock(&ttd->lock);
insert_task(f, t, 0);
}
reset_task_cur(c, ttd, t->frame_idx);
continue;
}
case DAV1D_TASK_TYPE_INIT_CDF: {
@ -619,7 +709,6 @@ void *dav1d_worker_task(void *data) {
int res = DAV1D_ERR(EINVAL);
if (!atomic_load(&f->task_thread.error))
res = dav1d_decode_frame_init_cdf(f);
pthread_mutex_lock(&ttd->lock);
if (f->frame_hdr->refresh_context && !f->task_thread.update_set) {
atomic_store(f->out_cdf.progress, res < 0 ? TILE_ERROR : 1);
}
@ -628,23 +717,34 @@ void *dav1d_worker_task(void *data) {
for (int p = 1; p <= 2; p++) {
const int res = dav1d_task_create_tile_sbrow(f, p, 0);
if (res) {
pthread_mutex_lock(&ttd->lock);
// memory allocation failed
f->task_thread.done[2 - p] = 1;
atomic_store(&f->task_thread.done[2 - p], 1);
atomic_store(&f->task_thread.error, -1);
f->task_thread.task_counter -= f->sbh +
f->frame_hdr->tiling.cols * f->frame_hdr->tiling.rows;
atomic_fetch_sub(&f->task_thread.task_counter,
f->frame_hdr->tiling.cols *
f->frame_hdr->tiling.rows + f->sbh);
atomic_store(&f->sr_cur.progress[p - 1], FRAME_ERROR);
if (p == 2 && f->task_thread.done[1]) {
assert(!f->task_thread.task_counter);
if (p == 2 && atomic_load(&f->task_thread.done[1])) {
assert(!atomic_load(&f->task_thread.task_counter));
dav1d_decode_frame_exit(f, DAV1D_ERR(ENOMEM));
f->n_tile_data = 0;
pthread_cond_signal(&f->task_thread.cond);
atomic_store(&f->task_thread.init_done, 1);
continue;
} else {
pthread_mutex_unlock(&ttd->lock);
}
}
}
} else abort_frame(f, res);
reset_task_cur(c, ttd, t->frame_idx);
f->task_thread.init_done = 1;
atomic_store(&f->task_thread.init_done, 1);
pthread_mutex_lock(&ttd->lock);
} else {
pthread_mutex_lock(&ttd->lock);
abort_frame(f, res);
reset_task_cur(c, ttd, t->frame_idx);
atomic_store(&f->task_thread.init_done, 1);
}
continue;
}
case DAV1D_TASK_TYPE_TILE_ENTROPY:
@ -673,10 +773,9 @@ void *dav1d_worker_task(void *data) {
pthread_cond_signal(&ttd->cond);
goto found_unlocked;
}
pthread_mutex_lock(&ttd->lock);
atomic_store(&ts->progress[p], progress);
reset_task_cur(c, ttd, t->frame_idx);
insert_task(f, t, 0);
add_pending(f, t);
pthread_mutex_lock(&ttd->lock);
} else {
pthread_mutex_lock(&ttd->lock);
atomic_store(&ts->progress[p], progress);
@ -692,15 +791,16 @@ void *dav1d_worker_task(void *data) {
if (c->n_fc > 1)
atomic_store(f->out_cdf.progress, error ? TILE_ERROR : 1);
}
if (!--f->task_thread.task_counter && f->task_thread.done[0] &&
(!uses_2pass || f->task_thread.done[1]))
if (atomic_fetch_sub(&f->task_thread.task_counter, 1) - 1 == 0 &&
atomic_load(&f->task_thread.done[0]) &&
(!uses_2pass || atomic_load(&f->task_thread.done[1])))
{
dav1d_decode_frame_exit(f, error == 1 ? DAV1D_ERR(EINVAL) :
error ? DAV1D_ERR(ENOMEM) : 0);
f->n_tile_data = 0;
pthread_cond_signal(&f->task_thread.cond);
}
assert(f->task_thread.task_counter >= 0);
assert(atomic_load(&f->task_thread.task_counter) >= 0);
if (!atomic_fetch_or(&ttd->cond_signaled, 1))
pthread_cond_signal(&ttd->cond);
}
@ -734,15 +834,11 @@ void *dav1d_worker_task(void *data) {
if (sby) {
int prog = atomic_load(&f->frame_thread.copy_lpf_progress[(sby - 1) >> 5]);
if (~prog & (1U << ((sby - 1) & 31))) {
t->type = DAV1D_TASK_TYPE_CDEF;
t->recon_progress = t->deblock_progress = 0;
add_pending(f, t);
pthread_mutex_lock(&ttd->lock);
prog = atomic_load(&f->frame_thread.copy_lpf_progress[(sby - 1) >> 5]);
if (~prog & (1U << ((sby - 1) & 31))) {
t->type = DAV1D_TASK_TYPE_CDEF;
t->recon_progress = t->deblock_progress = 0;
insert_task(f, t, 0);
continue;
}
pthread_mutex_unlock(&ttd->lock);
continue;
}
}
}
@ -776,40 +872,53 @@ void *dav1d_worker_task(void *data) {
const int uses_2pass = c->n_fc > 1;
const int sbh = f->sbh;
const int sbsz = f->sb_step * 4;
const enum PlaneType progress_plane_type =
t->type == DAV1D_TASK_TYPE_ENTROPY_PROGRESS ? PLANE_TYPE_BLOCK :
c->n_fc > 1 ? PLANE_TYPE_Y : PLANE_TYPE_ALL;
if (t->type != DAV1D_TASK_TYPE_ENTROPY_PROGRESS)
atomic_fetch_or(&f->frame_thread.frame_progress[sby >> 5],
1U << (sby & 31));
pthread_mutex_lock(&ttd->lock);
if (t->type != DAV1D_TASK_TYPE_ENTROPY_PROGRESS) {
unsigned frame_prog = c->n_fc > 1 ? atomic_load(&f->sr_cur.progress[1]) : 0;
if (frame_prog < FRAME_ERROR) {
int idx = frame_prog >> (f->sb_shift + 7);
int prog;
do {
atomic_uint *state = &f->frame_thread.frame_progress[idx];
const unsigned val = ~atomic_load(state);
prog = val ? ctz(val) : 32;
if (prog != 32) break;
prog = 0;
} while (++idx < f->frame_thread.prog_sz);
sby = ((idx << 5) | prog) - 1;
} else sby = sbh - 1;
if (t->type == DAV1D_TASK_TYPE_ENTROPY_PROGRESS) {
error = atomic_load(&f->task_thread.error);
const unsigned y = sby + 1 == sbh ? UINT_MAX : (unsigned)(sby + 1) * sbsz;
assert(c->n_fc > 1);
if (f->sr_cur.p.data[0] /* upon flush, this can be free'ed already */)
atomic_store(&f->sr_cur.progress[0], error ? FRAME_ERROR : y);
atomic_store(&f->frame_thread.entropy_progress,
error ? TILE_ERROR : sby + 1);
if (sby + 1 == sbh)
atomic_store(&f->task_thread.done[1], 1);
pthread_mutex_lock(&ttd->lock);
const int num_tasks = atomic_fetch_sub(&f->task_thread.task_counter, 1) - 1;
if (sby + 1 < sbh && num_tasks) {
reset_task_cur(c, ttd, t->frame_idx);
continue;
}
if (!num_tasks && atomic_load(&f->task_thread.done[0]) &&
atomic_load(&f->task_thread.done[1]))
{
dav1d_decode_frame_exit(f, error == 1 ? DAV1D_ERR(EINVAL) :
error ? DAV1D_ERR(ENOMEM) : 0);
f->n_tile_data = 0;
pthread_cond_signal(&f->task_thread.cond);
}
reset_task_cur(c, ttd, t->frame_idx);
continue;
}
// t->type != DAV1D_TASK_TYPE_ENTROPY_PROGRESS
atomic_fetch_or(&f->frame_thread.frame_progress[sby >> 5],
1U << (sby & 31));
pthread_mutex_lock(&f->task_thread.lock);
sby = get_frame_progress(c, f);
error = atomic_load(&f->task_thread.error);
const unsigned y = sby + 1 == sbh ? UINT_MAX : (unsigned)(sby + 1) * sbsz;
if (c->n_fc > 1 && f->sr_cur.p.data[0] /* upon flush, this can be free'ed already */) {
const int idx = t->type != DAV1D_TASK_TYPE_ENTROPY_PROGRESS;
atomic_store(&f->sr_cur.progress[idx], error ? FRAME_ERROR : y);
}
if (progress_plane_type == PLANE_TYPE_BLOCK)
f->frame_thread.entropy_progress = error ? TILE_ERROR : sby + 1;
if (c->n_fc > 1 && f->sr_cur.p.data[0] /* upon flush, this can be free'ed already */)
atomic_store(&f->sr_cur.progress[1], error ? FRAME_ERROR : y);
pthread_mutex_unlock(&f->task_thread.lock);
if (sby + 1 == sbh)
f->task_thread.done[progress_plane_type == PLANE_TYPE_BLOCK] = 1;
if (!--f->task_thread.task_counter &&
f->task_thread.done[0] && (!uses_2pass || f->task_thread.done[1]))
atomic_store(&f->task_thread.done[0], 1);
pthread_mutex_lock(&ttd->lock);
const int num_tasks = atomic_fetch_sub(&f->task_thread.task_counter, 1) - 1;
if (sby + 1 < sbh && num_tasks) {
reset_task_cur(c, ttd, t->frame_idx);
continue;
}
if (!num_tasks && atomic_load(&f->task_thread.done[0]) &&
(!uses_2pass || atomic_load(&f->task_thread.done[1])))
{
dav1d_decode_frame_exit(f, error == 1 ? DAV1D_ERR(EINVAL) :
error ? DAV1D_ERR(ENOMEM) : 0);

View File

@ -31,11 +31,11 @@
#include "src/levels.h"
void dav1d_init_wedge_masks(void);
extern const uint8_t *dav1d_wedge_masks[N_BS_SIZES][3 /* 444/luma, 422, 420 */]
EXTERN const uint8_t *dav1d_wedge_masks[N_BS_SIZES][3 /* 444/luma, 422, 420 */]
[2 /* sign */][16 /* wedge_idx */];
void dav1d_init_interintra_masks(void);
extern const uint8_t *dav1d_ii_masks[N_BS_SIZES][3 /* 444/luma, 422, 420 */]
EXTERN const uint8_t *dav1d_ii_masks[N_BS_SIZES][3 /* 444/luma, 422, 420 */]
[N_INTER_INTRA_PRED_MODES];
#endif /* DAV1D_SRC_WEDGE_H */

View File

@ -126,6 +126,7 @@ decl_itx_fn(BF_BPC(dav1d_inv_txfm_add_dct_dct_64x32, bpc, ext)); \
decl_itx_fn(BF_BPC(dav1d_inv_txfm_add_dct_dct_64x64, bpc, ext))
decl_itx_fns(avx512icl);
decl_itx_bpc_fns(10, avx512icl);
decl_itx_fns(avx2);
decl_itx_bpc_fns(10, avx2);
decl_itx_bpc_fns(12, avx2);
@ -341,6 +342,13 @@ static ALWAYS_INLINE void itx_dsp_init_x86(Dav1dInvTxfmDSPContext *const c, cons
assign_itx1_fn (R, 64, 16, avx512icl);
assign_itx1_fn (R, 64, 32, avx512icl);
assign_itx1_fn ( , 64, 64, avx512icl);
#else
if (bpc == 10) {
assign_itx16_bpc_fn( , 8, 8, 10, avx512icl);
assign_itx16_bpc_fn(R, 8, 16, 10, avx512icl);
assign_itx16_bpc_fn(R, 16, 8, 10, avx512icl);
assign_itx12_bpc_fn( , 16, 16, 10, avx512icl);
}
#endif
#endif
}

File diff suppressed because it is too large

third_party/dav1d/src/x86/itx16_avx512.asm (vendored, new file, 1979 lines)

File diff suppressed because it is too large

View File

@ -361,18 +361,32 @@ ALIGN function_align
%macro INV_TXFM_4X4_FN 2 ; type1, type2
INV_TXFM_FN %1, %2, 0, 4x4
%ifidn %1_%2, dct_dct
imul r5d, [cq], 2896
movd m1, [o(pw_2896x8)]
imul r5d, [cq], 181
mov [cq], eobd ; 0
add r5d, 2048
sar r5d, 12
mov r3d, 4
.dconly:
add r5d, 128
sar r5d, 8
.dconly2:
imul r5d, 2896
mova m2, [o(pixel_10bpc_max)]
add r5d, 34816
movd m0, r5d
packssdw m0, m0
pmulhrsw m0, m1
pshuflw m0, m0, q0000
pshuflw m0, m0, q1111
pxor m3, m3
punpcklqdq m0, m0
mova m1, m0
TAIL_CALL m(iadst_4x4_internal_16bpc).end
.dconly_loop:
movq m1, [dstq+strideq*0]
movhps m1, [dstq+strideq*1]
paddw m1, m0
pminsw m1, m2
pmaxsw m1, m3
movq [dstq+strideq*0], m1
movhps [dstq+strideq*1], m1
lea dstq, [dstq+strideq*2]
sub r3d, 2
jg .dconly_loop
RET
%endif
%endmacro
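A note on the recurring constant swap in these dc-only paths: since 2896 == 16 * 181, (x * 2896 + 2048) >> 12 and (x * 181 + 128) >> 8 produce identical results under arithmetic right shift (both round x/sqrt(2) to 1/256 precision), while the smaller multiplier leaves more 32-bit headroom for large high-bitdepth DC values. A standalone check of the equivalence (illustrative only; C right-shift of negatives is assumed arithmetic, matching the sar instruction used here):

#include <assert.h>

static int dc_old(const int x) { return (x * 2896 + 2048) >> 12; }
static int dc_new(const int x) { return (x * 181  + 128)  >>  8; }

int main(void) {
    for (int x = -65536; x <= 65536; x++)
        assert(dc_old(x) == dc_new(x));
    return 0;
}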
@ -662,40 +676,13 @@ cglobal iidentity_4x4_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
%macro INV_TXFM_4X8_FN 2-3 0 ; type1, type2, eob_offset
INV_TXFM_FN %1, %2, %3, 4x8
%ifidn %1_%2, dct_dct
imul r5d, [cq], 2896
imul r5d, [cq], 181
mov [cq], eobd ; 0
mov r3d, 2
add r5d, 2048
sar r5d, 12
imul r5d, 2896
add r5d, 2048
sar r5d, 12
.end:
imul r5d, 2896
add r5d, 34816
movd m0, r5d
pshuflw m0, m0, q1111
punpcklqdq m0, m0
pxor m4, m4
mova m3, [o(pixel_10bpc_max)]
lea r2, [strideq*3]
.loop:
movq m1, [dstq+strideq*0]
movq m2, [dstq+strideq*2]
movhps m1, [dstq+strideq*1]
movhps m2, [dstq+r2]
paddw m1, m0
paddw m2, m0
REPX {pminsw x, m3}, m1, m2
REPX {pmaxsw x, m4}, m1, m2
movq [dstq+strideq*0], m1
movhps [dstq+strideq*1], m1
movq [dstq+strideq*2], m2
movhps [dstq+r2 ], m2
lea dstq, [dstq+strideq*4]
dec r3d
jg .loop
RET
mov r3d, 8
add r5d, 128
sar r5d, 8
imul r5d, 181
jmp m(inv_txfm_add_dct_dct_4x4_16bpc).dconly
%endif
%endmacro
@ -944,12 +931,12 @@ cglobal iidentity_4x8_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
%macro INV_TXFM_4X16_FN 2-3 2d ; type1, type2, eob_tbl_suffix
INV_TXFM_FN %1, %2, tbl_4x16_%3, 4x16
%ifidn %1_%2, dct_dct
imul r5d, [cq], 2896
imul r5d, [cq], 181
mov [cq], eobd ; 0
mov r3d, 4
add r5d, 6144
sar r5d, 13
jmp m(inv_txfm_add_dct_dct_4x8_16bpc).end
mov r3d, 16
add r5d, 384
sar r5d, 9
jmp m(inv_txfm_add_dct_dct_4x4_16bpc).dconly2
%endif
%endmacro
@ -1297,13 +1284,13 @@ cglobal iidentity_4x16_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
INV_TXFM_FN %1, %2, 0, 8x4, 8, 0-4*16
%endif
%ifidn %1_%2, dct_dct
imul r5d, [cq], 2896
imul r5d, [cq], 181
mov [cq], eobd ; 0
add r5d, 2048
sar r5d, 12
imul r5d, 2896
add r5d, 2048
sar r5d, 12
add r5d, 128
sar r5d, 8
imul r5d, 181
add r5d, 128
sar r5d, 8
imul r5d, 2896
add r5d, 34816
movd m0, r5d
@ -1783,12 +1770,12 @@ cglobal iidentity_8x4_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
INV_TXFM_FN %1, %2, %3, 8x8, 8, 0-5*16
%endif
%ifidn %1_%2, dct_dct
imul r5d, [cq], 2896
imul r5d, [cq], 181
mov [cq], eobd ; 0
mov r3d, 2
.end:
add r5d, 6144
sar r5d, 13
add r5d, 384
sar r5d, 9
.end2:
imul r5d, 2896
add r5d, 34816
@ -2146,11 +2133,11 @@ cglobal iidentity_8x8_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
INV_TXFM_FN %1, %2, tbl_8x16_%3, 8x16, 8, 0-17*16
%endif
%ifidn %1_%2, dct_dct
imul r5d, [cq], 2896
imul r5d, [cq], 181
mov [cq], eobd ; 0
add r5d, 2048
sar r5d, 12
imul r5d, 2896
add r5d, 128
sar r5d, 8
imul r5d, 181
mov r3d, 4
%if stack_size_padded > 0
; adjust to caller's stack allocation
@ -2477,12 +2464,12 @@ cglobal iidentity_8x16_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
INV_TXFM_FN %1, %2, 0, 16x4, 8, 0-12*16
%endif
%ifidn %1_%2, dct_dct
imul r5d, [cq], 2896
imul r5d, [cq], 181
mov [cq], eobd ; 0
mov r3d, 4
.dconly:
add r5d, 6144
sar r5d, 13
add r5d, 384
sar r5d, 9
.dconly2:
imul r5d, 2896
add r5d, 34816
@ -2755,6 +2742,8 @@ cglobal idct_16x4_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
ret
.round:
%if ARCH_X86_64
REPX {pmaxsd x, m12}, m0, m1, m2, m3, m4, m5, m6, m7
REPX {pminsd x, m13}, m0, m1, m2, m3, m4, m5, m6, m7
pcmpeqd m8, m8
REPX {psubd x, m8}, m0, m1, m2, m3, m4, m5, m6, m7
mova m8, [r3+1*16]
@ -2784,6 +2773,14 @@ cglobal idct_16x4_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
m8, m9, m10, m11, m12, m13, m14, m15
; and out0-15 is now in m0-15
%else
mova [r3+ 0*16], m0
mova m0, [o(clip_18b_min)]
REPX {pmaxsd x, m0}, m1, m2, m3, m4, m5, m6, m7
pmaxsd m0, [r3+ 0*16]
mova [r3+ 0*16], m7
mova m7, [o(clip_18b_max)]
REPX {pminsd x, m7}, m0, m1, m2, m3, m4, m5, m6
pminsd m7, [r3+ 0*16]
mova [r3+ 0*16], m0
pcmpeqd m0, m0
REPX {psubd x, m0}, m1, m2, m3, m4, m5, m6, m7
@ -3472,12 +3469,12 @@ cglobal iidentity_16x4_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
INV_TXFM_FN %1, %2, %3, 16x8, 8, 0-13*16
%endif
%ifidn %1_%2, dct_dct
imul r5d, [cq], 2896
imul r5d, [cq], 181
mov [cq], eobd ; 0
mov r3d, 8
add r5d, 2048
sar r5d, 12
imul r5d, 2896
add r5d, 128
sar r5d, 8
imul r5d, 181
%if ARCH_X86_32
add rsp, 1*16
%endif
@ -3939,11 +3936,11 @@ cglobal iidentity_16x8_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
INV_TXFM_FN %1, %2, tbl_16x16_%3, 16x16, 8, 0-17*16
%endif
%ifidn %1_%2, dct_dct
imul r5d, [cq], 2896
imul r5d, [cq], 181
mov [cq], eobd ; 0
mov r3d, 16
add r5d, 10240
sar r5d, 14
add r5d, 640
sar r5d, 10
add rsp, (5+ARCH_X86_64*3+WIN64)*16
jmp m(inv_txfm_add_dct_dct_16x4_16bpc).dconly2
%endif
@ -4057,6 +4054,8 @@ cglobal idct_16x16_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
ret
.round:
%if ARCH_X86_64
REPX {pmaxsd x, m12}, m0, m1, m2, m3, m4, m5, m6, m7
REPX {pminsd x, m13}, m0, m1, m2, m3, m4, m5, m6, m7
psrld m8, m11, 10 ; 2
REPX {paddd x, m8}, m0, m1, m2, m3, m4, m5, m6, m7
mova m8, [r3+1*16]
@ -4086,6 +4085,14 @@ cglobal idct_16x16_internal_16bpc, 0, 0, 0, dst, stride, c, eob, tx2
m8, m9, m10, m11, m12, m13, m14, m15
; and out0-15 is now in m0-15
%else
mova [r3+ 0*16], m0
mova m0, [o(clip_18b_min)]
REPX {pmaxsd x, m0}, m1, m2, m3, m4, m5, m6, m7
pmaxsd m0, [r3+ 0*16]
mova [r3+ 0*16], m7
mova m7, [o(clip_18b_max)]
REPX {pminsd x, m7}, m0, m1, m2, m3, m4, m5, m6
pminsd m7, [r3+ 0*16]
mova [r3+ 0*16], m0
mova m0, [o(pd_2)]
REPX {paddd x, m0}, m1, m2, m3, m4, m5, m6, m7
@ -5162,11 +5169,11 @@ cglobal inv_txfm_add_dct_dct_8x32_16bpc, 4, 7, 15, 0-36*16, \
call m(idct_8x8_internal_16bpc).round1_and_write_8x8
ret
.dconly:
imul r5d, [cq], 2896
imul r5d, [cq], 181
mov [cq], eobd ; 0
mov r3d, 8
add r5d, 10240
sar r5d, 14
add r5d, 640
sar r5d, 10
add rsp, (31+2*ARCH_X86_64)*16
jmp m(inv_txfm_add_dct_dct_8x8_16bpc).end2
@ -5339,12 +5346,12 @@ cglobal inv_txfm_add_dct_dct_16x32_16bpc, 4, 7, 16, 0-77*16, \
%endif
RET
.dconly:
imul r5d, [cq], 2896
imul r5d, [cq], 181
mov [cq], eobd ; 0
mov r3d, 32
add r5d, 2048
sar r5d, 12
imul r5d, 2896
add r5d, 128
sar r5d, 8
imul r5d, 181
add rsp, (65+4*ARCH_X86_64)*16
jmp m(inv_txfm_add_dct_dct_16x4_16bpc).dconly
@ -5944,6 +5951,8 @@ cglobal inv_txfm_add_dct_dct_32x8_16bpc, 4, 7, 16, 0-(24+8*ARCH_X86_32)*16, \
; final sumsub for idct16 as well as idct32, plus final downshift
%macro IDCT32_END 6 ; in/out1, out2-4, tmp, shift, idx
mova m%4, [r3+16*(23-%1)]
pmaxsd m%1, m12
pminsd m%1, m13
psubd m%3, m%1, m%4 ; idct16 out15 - n
paddd m%1, m%4 ; idct16 out0 + n
pmaxsd m%1, m12
@ -6019,6 +6028,8 @@ cglobal inv_txfm_add_dct_dct_32x8_16bpc, 4, 7, 16, 0-(24+8*ARCH_X86_32)*16, \
.loop_dct32_end:
mova m0, [r3+16*16]
mova m6, [r3+16*24]
pmaxsd m0, m2
pminsd m0, m3
psubd m5, m0, m6 ; idct16 out15 - n
paddd m0, m6 ; idct16 out0 + n
pmaxsd m0, m2
@ -6045,12 +6056,12 @@ cglobal inv_txfm_add_dct_dct_32x8_16bpc, 4, 7, 16, 0-(24+8*ARCH_X86_32)*16, \
%endif
.dconly:
imul r5d, [cq], 2896
imul r5d, [cq], 181
mov [cq], eobd ; 0
mov r3d, 8
.dconly1:
add r5d, 10240
sar r5d, 14
add r5d, 640
sar r5d, 10
.dconly2:
imul r5d, 2896
add r5d, 34816
@ -6344,14 +6355,14 @@ cglobal inv_txfm_add_dct_dct_32x16_16bpc, 4, 7, 16, 0-(24+8*ARCH_X86_32)*16, \
%endif
.dconly:
imul r5d, [cq], 2896
imul r5d, [cq], 181
mov [cq], eobd ; 0
mov r3d, 16
add r5d, 2048
sar r5d, 12
imul r5d, 2896
add r5d, 6144
sar r5d, 13
add r5d, 128
sar r5d, 8
imul r5d, 181
add r5d, 384
sar r5d, 9
jmp m(inv_txfm_add_dct_dct_32x8_16bpc).dconly2
cglobal inv_txfm_add_dct_dct_32x32_16bpc, 4, 7, 16, 0-(5*32+1)*16, \
@ -6565,7 +6576,7 @@ cglobal inv_txfm_add_dct_dct_32x32_16bpc, 4, 7, 16, 0-(5*32+1)*16, \
jmp m(inv_txfm_add_dct_dct_16x32_16bpc).loop_pass2_entry
.dconly:
imul r5d, [cq], 2896
imul r5d, [cq], 181
mov [cq], eobd ; 0
mov r3d, 32
add rsp, (5*32+1-(24+8*ARCH_X86_32))*16
@ -6838,11 +6849,11 @@ cglobal inv_txfm_add_dct_dct_16x64_16bpc, 4, 7, 16, \
ret
.dconly:
imul r5d, [cq], 2896
imul r5d, [cq], 181
mov [cq], eobd ; 0
mov r3d, 64
add r5d, 10240
sar r5d, 14
add r5d, 640
sar r5d, 10
add rsp, (12+2*64)*16+(4+4*ARCH_X86_32)*gprsize-(8+4*ARCH_X86_32)*16
jmp m(inv_txfm_add_dct_dct_16x4_16bpc).dconly2
@ -7098,14 +7109,14 @@ cglobal inv_txfm_add_dct_dct_32x64_16bpc, 4, 7, 16, \
jmp m(inv_txfm_add_dct_dct_16x64_16bpc).loop_pass2
.dconly:
imul r5d, [cq], 2896
imul r5d, [cq], 181
mov [cq], eobd ; 0
mov r3d, 64
add r5d, 2048
sar r5d, 12
imul r5d, 2896
add r5d, 6144
sar r5d, 13
add r5d, 128
sar r5d, 8
imul r5d, 181
add r5d, 384
sar r5d, 9
add rsp, (32+4*64)*16+(4+4*ARCH_X86_32)*gprsize-(24+8*ARCH_X86_32)*16
jmp m(inv_txfm_add_dct_dct_32x8_16bpc).dconly2
@ -7537,6 +7548,8 @@ cglobal inv_txfm_add_dct_dct_64x16_16bpc, 4, 7, 16, 0-(64+8*ARCH_X86_32)*16, \
mova m5, [r3-16* 4] ; idct64 48 + n
mova m6, [r4-16*20] ; idct64 47 - n
mova m7, [r3-16*20] ; idct64 32 + n
pmaxsd m0, m12
pminsd m0, m13
paddd m8, m0, m1 ; idct16 out0 + n
psubd m0, m1 ; idct16 out15 - n
REPX {pmaxsd x, m12}, m8, m0
@ -7565,11 +7578,13 @@ cglobal inv_txfm_add_dct_dct_64x16_16bpc, 4, 7, 16, 0-(64+8*ARCH_X86_32)*16, \
mova [r4-16* 4], m6
mova [r3+16*12], m8
%else
mova m1, [r3+16*44] ; idct16 15 - n
paddd m4, m0, m1 ; idct16 out0 + n
psubd m0, m1 ; idct16 out15 - n
mova m5, [o(clip_18b_min)]
mova m6, [o(clip_18b_max)]
mova m1, [r3+16*44] ; idct16 15 - n
pmaxsd m0, m5
pminsd m0, m6
paddd m4, m0, m1 ; idct16 out0 + n
psubd m0, m1 ; idct16 out15 - n
REPX {pmaxsd x, m5}, m4, m0
REPX {pminsd x, m6}, m4, m0
paddd m1, m4, m3 ; idct32 out0 + n
@ -7632,12 +7647,12 @@ cglobal inv_txfm_add_dct_dct_64x16_16bpc, 4, 7, 16, 0-(64+8*ARCH_X86_32)*16, \
ret
.dconly:
imul r5d, [cq], 2896
imul r5d, [cq], 181
mov [cq], eobd ; 0
mov r3d, 16
.dconly1:
add r5d, 10240
sar r5d, 14
add r5d, 640
sar r5d, 10
.dconly2:
imul r5d, 2896
add r5d, 34816
@ -7876,14 +7891,14 @@ cglobal inv_txfm_add_dct_dct_64x32_16bpc, 4, 7, 16, \
ret
.dconly:
imul r5d, [cq], 2896
imul r5d, [cq], 181
mov [cq], eobd ; 0
mov r3d, 32
add r5d, 2048
sar r5d, 12
imul r5d, 2896
add r5d, 6144
sar r5d, 13
add r5d, 128
sar r5d, 8
imul r5d, 181
add r5d, 384
sar r5d, 9
add rsp, (1+8*32+1*WIN64)*16
jmp m(inv_txfm_add_dct_dct_64x16_16bpc).dconly2
@ -8112,7 +8127,7 @@ cglobal inv_txfm_add_dct_dct_64x64_16bpc, 4, 7, 16, \
ret
.dconly:
imul r5d, [cq], 2896
imul r5d, [cq], 181
mov [cq], eobd ; 0
mov r3d, 64
add rsp, (64+8*ARCH_X86_32+8*64+1*ARCH_X86_64)*16 + \

View File

@ -29,7 +29,8 @@
%if ARCH_X86_64
SECTION_RODATA 64
int8_permA: db 0, 1, 16, 17, 32, 33, 48, 49, 2, 3, 18, 19, 34, 35, 50, 51
const \
int8_permA, db 0, 1, 16, 17, 32, 33, 48, 49, 2, 3, 18, 19, 34, 35, 50, 51
db 4, 5, 20, 21, 36, 37, 52, 53, 6, 7, 22, 23, 38, 39, 54, 55
db 8, 9, 24, 25, 40, 41, 56, 57, 10, 11, 26, 27, 42, 43, 58, 59
db 12, 13, 28, 29, 44, 45, 60, 61, 14, 15, 30, 31, 46, 47, 62, 63
@ -845,7 +846,7 @@ cglobal iidentity_4x8_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2
punpcklwd m3, m5 ; dct8 in3 in5
punpckhwd m5, m2 ; dct16 in11 in5
punpcklwd m6, m2 ; dct4 in3 in1
.main2:
cglobal_label .main2
vpbroadcastd m10, [o(pd_2048)]
.main3:
vpbroadcastq m13, [o(int_mshift)]
@ -1355,7 +1356,7 @@ cglobal idct_8x8_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2
vpermq m3, m3, q2031
jmp m(iadst_8x8_internal_8bpc).end2
ALIGN function_align
.main:
cglobal_label .main
IDCT8_1D_PACKED
ret
@ -1422,7 +1423,7 @@ ALIGN function_align
punpckhqdq m0, m4 ; out0 -out1
ret
ALIGN function_align
.main_pass2:
cglobal_label .main_pass2
IADST8_1D_PACKED 2
ret
@ -1608,7 +1609,7 @@ cglobal idct_8x16_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2
vpscatterdq [r3+ym8]{k2}, m2
RET
ALIGN function_align
.main:
cglobal_label .main
WRAP_YMM IDCT16_1D_PACKED
ret
@ -1685,13 +1686,14 @@ ALIGN function_align
vpermi2q m6, m0, m2 ; in4 in8 in6 in10
vpermt2q m1, m10, m3 ; in11 in7 in9 in5
.main:
vpbroadcastd m9, [o(pd_2048)]
vpbroadcastq m13, [o(int_mshift)]
kxnorb k1, k1, k1
punpcklwd m0, m4, m5 ; in0 in15 in2 in13
punpckhwd m4, m5 ; in12 in3 in14 in1
punpcklwd m5, m6, m1 ; in4 in11 in6 in9
punpckhwd m6, m1 ; in8 in7 in10 in5
cglobal_label .main2
vpbroadcastd m9, [o(pd_2048)]
vpbroadcastq m13, [o(int_mshift)]
kxnorb k1, k1, k1
vpcmpub k7, m13, m9, 6 ; 0x33...
pxor m8, m8
ITX_MUL4X_PACK 0, 1, 2, 3, 7, 9, 201, 4091, 995, 3973, 5
@ -2114,7 +2116,7 @@ cglobal idct_16x8_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2
vextracti32x4 [r3 +r4 ], m1, 3
RET
ALIGN function_align
.main:
cglobal_label .main
IDCT8_1D_PACKED
ret
@ -2168,6 +2170,7 @@ cglobal iadst_16x8_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2
pshufd m4, m0, q1032 ; 1 0
pshufd m5, m1, q1032 ; 3 2
call .main_pass2
movshdup m4, [o(permC)]
pmulhrsw m0, m6
pmulhrsw m1, m6
psrlq m6, m4, 4
@ -2194,9 +2197,8 @@ ALIGN function_align
IADST8_1D_PACKED 1
ret
ALIGN function_align
.main_pass2:
cglobal_label .main_pass2
IADST8_1D_PACKED 2
movshdup m4, [o(permC)]
pxor m5, m5
psubd m5, m6
packssdw m6, m5
@ -2222,6 +2224,7 @@ cglobal iflipadst_16x8_internal_8bpc, 0, 6, 0, dst, stride, c, eob, tx2
pshufd m4, m0, q1032 ; 1 0
pshufd m5, m1, q1032 ; 3 2
call m(iadst_16x8_internal_8bpc).main_pass2
movshdup m4, [o(permC)]
pmulhrsw m5, m6, m0
pmulhrsw m0, m6, m1
psrlq m1, m4, 12
@ -2456,7 +2459,7 @@ ALIGN function_align
pmulhrsw m3, m4 ; t5a t6a
jmp .main4
ALIGN function_align
.main:
cglobal_label .main
IDCT16_1D_PACKED
ret
@ -2562,6 +2565,7 @@ ALIGN function_align
vshufi32x4 m1, m5, q2020 ; 2 3
vshufi32x4 m5, m7, m9, q2020 ; 10 11
vshufi32x4 m7, m9, q3131 ; 14 15
cglobal_label .main_pass2b
REPX {pshufd x, x, q1032}, m1, m3, m5, m7
call .main
vpbroadcastd m8, [o(pw_2896x8)]

View File

@ -329,11 +329,11 @@ ALIGN function_align
packuswb m2, m4
psrlw m2, 8
vpackuswb m2{k2}, m3, m5
mova [dstq+r10], m2
add r10, 64
jl .hv_loop
mov t6, t5
mov t5, t4
movu [dstq+r10], m2 ; We don't have a separate 5-tap version so the 7-tap
add r10, 64 ; function is used for chroma as well, and in some
jl .hv_loop ; esoteric edge cases chroma dst pointers may only
mov t6, t5 ; have a 32-byte alignment despite having a width
mov t5, t4 ; larger than 32, so use an unaligned store here.
mov t4, t3
mov t3, t2
mov t2, t1
@ -379,7 +379,7 @@ ALIGN function_align
packuswb m0, m2
psrlw m0, 8
vpackuswb m0{k2}, m1, m3
mova [dstq+r10], m0
movu [dstq+r10], m0
add r10, 64
jl .v_loop
mov t6, t5

View File

@ -1604,7 +1604,7 @@ cglobal put_8tap_16bpc, 4, 9, 16, dst, ds, src, ss, w, h, mx, my
vpbroadcastd m11, [buf+ 4]
vpbroadcastd m12, [buf+ 8]
vpbroadcastd m13, [buf+12]
cmp wd, 16
sub wd, 16
je .h_w16
jg .h_w32
.h_w8:

View File

@ -121,15 +121,15 @@ static struct {
CheckasmFunc *current_func;
CheckasmFuncVersion *current_func_ver;
const char *current_test_name;
const char *bench_pattern;
size_t bench_pattern_len;
int num_checked;
int num_failed;
int nop_time;
unsigned cpu_flag;
const char *cpu_flag_name;
const char *test_name;
const char *test_pattern;
const char *function_pattern;
unsigned seed;
int bench;
int bench_c;
int verbose;
int function_listing;
@ -489,6 +489,21 @@ static void signal_handler(const int s) {
}
#endif
/* Compares a string with a wildcard pattern. */
static int wildstrcmp(const char *str, const char *pattern) {
const char *wild = strchr(pattern, '*');
if (wild) {
const size_t len = wild - pattern;
if (strncmp(str, pattern, len)) return 1;
while (*++wild == '*');
if (!*wild) return 0;
str += len;
while (*str && wildstrcmp(str, wild)) str++;
return !*str;
}
return strcmp(str, pattern);
}
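Illustrative calls, paired with the function above (like strcmp, a return of 0 means the string matches the pattern):

#include <assert.h>

int main(void) {
    assert(!wildstrcmp("idct_4x4_8bpc_avx2", "idct_4x4_8bpc_avx2"));
    assert(!wildstrcmp("idct_4x4_8bpc_avx2", "idct*avx2"));
    assert(!wildstrcmp("idct_4x4_8bpc_avx2", "*8bpc*"));
    assert( wildstrcmp("idct_4x4_8bpc_avx2", "iadst*"));
    return 0;
}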
/* Perform tests and benchmarks for the specified
* cpu flag if supported by the host */
static void check_cpu_flag(const char *const name, unsigned flag) {
@ -501,7 +516,7 @@ static void check_cpu_flag(const char *const name, unsigned flag) {
if (!flag || state.cpu_flag != old_cpu_flag) {
state.cpu_flag_name = name;
for (int i = 0; tests[i].func; i++) {
if (state.test_name && strcmp(tests[i].name, state.test_name))
if (state.test_pattern && wildstrcmp(tests[i].name, state.test_pattern))
continue;
xor128_srand(state.seed);
state.current_test_name = tests[i].name;
@ -536,33 +551,40 @@ int main(int argc, char *argv[]) {
state.seed = get_seed();
while (argc > 1) {
if (!strncmp(argv[1], "--help", 6)) {
if (!strncmp(argv[1], "--help", 6) || !strcmp(argv[1], "-h")) {
fprintf(stderr,
"checkasm [options] <random seed>\n"
" <random seed> Numeric value to seed the rng\n"
" <random seed> Numeric value to seed the rng\n"
"Options:\n"
" --test=<test_name> Test only <test_name>\n"
" --bench=<pattern> Test and benchmark the functions matching <pattern>\n"
" --list-functions List available functions\n"
" --list-tests List available tests\n"
" --bench-c Benchmark the C-only functions\n"
" --verbose -v Print failures verbosely\n");
" --test=<pattern> Test only <pattern>\n"
" --function=<pattern> -f Test only the functions matching <pattern>\n"
" --bench -b Benchmark the tested functions\n"
" --list-functions List available functions\n"
" --list-tests List available tests\n"
" --bench-c -c Benchmark the C-only functions\n"
" --verbose -v Print failures verbosely\n");
return 0;
} else if (!strncmp(argv[1], "--bench-c", 9)) {
} else if (!strcmp(argv[1], "--bench-c") || !strcmp(argv[1], "-c")) {
state.bench_c = 1;
} else if (!strncmp(argv[1], "--bench", 7)) {
} else if (!strcmp(argv[1], "--bench") || !strcmp(argv[1], "-b")) {
#ifndef readtime
fprintf(stderr,
"checkasm: --bench is not supported on your system\n");
return 1;
#endif
if (argv[1][7] == '=') {
state.bench_pattern = argv[1] + 8;
state.bench_pattern_len = strlen(state.bench_pattern);
} else
state.bench_pattern = "";
state.bench = 1;
} else if (!strncmp(argv[1], "--test=", 7)) {
state.test_name = argv[1] + 7;
state.test_pattern = argv[1] + 7;
} else if (!strcmp(argv[1], "-t")) {
state.test_pattern = argc > 1 ? argv[2] : "";
argc--;
argv++;
} else if (!strncmp(argv[1], "--function=", 11)) {
state.function_pattern = argv[1] + 11;
} else if (!strcmp(argv[1], "-f")) {
state.function_pattern = argc > 1 ? argv[2] : "";
argc--;
argv++;
} else if (!strcmp(argv[1], "--list-functions")) {
state.function_listing = 1;
} else if (!strcmp(argv[1], "--list-tests")) {
@ -602,7 +624,7 @@ int main(int argc, char *argv[]) {
#endif
#ifdef readtime
if (state.bench_pattern) {
if (state.bench) {
static int testing = 0;
checkasm_save_context();
if (!testing) {
@ -658,7 +680,7 @@ int main(int argc, char *argv[]) {
} else {
fprintf(stderr, "checkasm: all %d tests passed\n", state.num_checked);
#ifdef readtime
if (state.bench_pattern) {
if (state.bench) {
state.nop_time = measure_nop_time();
printf("nop: %d.%d\n", state.nop_time/10, state.nop_time%10);
print_benchs(state.funcs);
@ -682,8 +704,11 @@ void *checkasm_check_func(void *const func, const char *const name, ...) {
const int name_length = vsnprintf(name_buf, sizeof(name_buf), name, arg);
va_end(arg);
if (!func || name_length <= 0 || (size_t)name_length >= sizeof(name_buf))
if (!func || name_length <= 0 || (size_t)name_length >= sizeof(name_buf) ||
(state.function_pattern && wildstrcmp(name_buf, state.function_pattern)))
{
return NULL;
}
state.current_func = get_func(&state.funcs, name_buf);
@ -724,9 +749,7 @@ void *checkasm_check_func(void *const func, const char *const name, ...) {
/* Decide whether or not the current function needs to be benchmarked */
int checkasm_bench_func(void) {
return !state.num_failed && state.bench_pattern &&
!strncmp(state.current_func->name, state.bench_pattern,
state.bench_pattern_len);
return !state.num_failed && state.bench;
}
/* Indicate that the current test has failed, return whether verbose printing

View File

@ -185,17 +185,6 @@ static inline uint64_t readtime(void) {
void checkasm_checked_call(void *func, ...);
#if ARCH_X86_64
/* Evil hack: detect incorrect assumptions that 32-bit ints are zero-extended
* to 64-bit. This is done by clobbering the stack with junk around the stack
* pointer and calling the assembly function through checked_call() with added
* dummy arguments which forces all real arguments to be passed on the stack
* and not in registers. For 32-bit arguments the upper half of the 64-bit
* register locations on the stack will now contain junk which will cause
* misbehaving functions to either produce incorrect output or segfault. Note
* that even though this works extremely well in practice, it's technically
* not guaranteed and false negatives is theoretically possible, but there
* can never be any false positives. */
void checkasm_stack_clobber(uint64_t clobber, ...);
/* YMM and ZMM registers on x86 are turned off to save power when they haven't
* been used for some period of time. When they are used there will be a
* "warmup" period during which performance will be reduced and inconsistent
@ -203,24 +192,54 @@ void checkasm_stack_clobber(uint64_t clobber, ...);
* work around this by periodically issuing "dummy" instructions that uses
* those registers to keep them powered on. */
void checkasm_simd_warmup(void);
#define declare_new(ret, ...)\
ret (*checked_call)(void *, int, int, int, int, int, __VA_ARGS__,\
int, int, int, int, int, int, int, int,\
int, int, int, int, int, int, int) =\
(void *)checkasm_checked_call;
#define CLOB (UINT64_C(0xdeadbeefdeadbeef))
/* The upper 32 bits of 32-bit data types are undefined when passed as function
* parameters. In practice those bits usually end up being zero which may hide
* certain bugs, such as using a register containing undefined bits as a pointer
* offset, so we want to intentionally clobber those bits with junk to expose
* any issues. The following set of macros automatically calculates a bitmask
* specifying which parameters should have their upper halves clobbered. */
#ifdef _WIN32
#define STACKARGS 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, 0, 0, 0
/* Integer and floating-point parameters share "register slots". */
#define IGNORED_FP_ARGS 0
#else
#define STACKARGS 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, 0, 0, 0, 0, 0
/* Up to 8 floating-point parameters are passed in XMM registers, which are
* handled orthogonally from integer parameters passed in GPR registers. */
#define IGNORED_FP_ARGS 8
#endif
#ifdef HAVE_C11_GENERIC
#define clobber_type(arg) _Generic((void (*)(void*, arg))NULL,\
void (*)(void*, int32_t ): clobber_mask |= 1 << mpos++,\
void (*)(void*, uint32_t): clobber_mask |= 1 << mpos++,\
void (*)(void*, float ): mpos += (fp_args++ >= IGNORED_FP_ARGS),\
void (*)(void*, double ): mpos += (fp_args++ >= IGNORED_FP_ARGS),\
default: mpos++)
#define init_clobber_mask(a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p, ...)\
unsigned clobber_mask = 0;\
{\
int mpos = 0, fp_args = 0;\
clobber_type(a); clobber_type(b); clobber_type(c); clobber_type(d);\
clobber_type(e); clobber_type(f); clobber_type(g); clobber_type(h);\
clobber_type(i); clobber_type(j); clobber_type(k); clobber_type(l);\
clobber_type(m); clobber_type(n); clobber_type(o); clobber_type(p);\
}
#else
/* Skip parameter clobbering on compilers without support for _Generic() */
#define init_clobber_mask(...) unsigned clobber_mask = 0
#endif
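A worked instance of the mask computation, mirroring the _Generic trick above (hypothetical prototype, not from the patch): for void fn(void *dst, ptrdiff_t stride, int w, int h), the pointer and the 64-bit stride advance the slot counter without setting a bit, while the two 32-bit ints set bits 2 and 3, giving clobber_mask == 0xc, so checked_call junk-fills the upper halves of the third and fourth argument slots.

#include <assert.h>
#include <stdint.h>

int main(void) {
    unsigned clobber_mask = 0;
    int mpos = 0;
#define CLOBBER_TYPE(T) _Generic((T)0,                \
        int32_t:  (clobber_mask |= 1u << mpos++),     \
        uint32_t: (clobber_mask |= 1u << mpos++),     \
        default:  (mpos++))
    mpos++;                  /* void *dst: pointer slot, no bit */
    CLOBBER_TYPE(int64_t);   /* stride: 64-bit slot, no bit */
    CLOBBER_TYPE(int32_t);   /* w: 32-bit, sets bit 2 */
    CLOBBER_TYPE(int32_t);   /* h: 32-bit, sets bit 3 */
    assert(clobber_mask == 0xc);
    return 0;
}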
#define declare_new(ret, ...)\
ret (*checked_call)(__VA_ARGS__, int, int, int, int, int, int, int,\
int, int, int, int, int, int, int, int, int,\
void*, unsigned) =\
(void*)checkasm_checked_call;\
init_clobber_mask(__VA_ARGS__, void*, void*, void*, void*,\
void*, void*, void*, void*, void*, void*,\
void*, void*, void*, void*, void*);
#define call_new(...)\
(checkasm_set_signal_handler_state(1),\
checkasm_simd_warmup(),\
checkasm_stack_clobber(CLOB, CLOB, CLOB, CLOB, CLOB, CLOB, CLOB,\
CLOB, CLOB, CLOB, CLOB, CLOB, CLOB, CLOB,\
CLOB, CLOB, CLOB, CLOB, CLOB, CLOB, CLOB),\
checked_call(func_new, 0, 0, 0, 0, 0, __VA_ARGS__, STACKARGS));\
checked_call(__VA_ARGS__, 16, 15, 14, 13, 12, 11, 10, 9, 8,\
7, 6, 5, 4, 3, 2, 1, func_new, clobber_mask));\
checkasm_set_signal_handler_state(0)
#elif ARCH_X86_32
#define declare_new(ret, ...)\

View File

@ -55,6 +55,7 @@ n14: dq 0x249214109d5d1c88
%endif
errmsg_stack: db "stack corruption", 0
errmsg_register: db "failed to preserve register:%s", 0
errmsg_vzeroupper: db "missing vzeroupper", 0
SECTION .bss
@ -151,56 +152,44 @@ cglobal init_x86, 0, 5
RET
%if ARCH_X86_64
;-----------------------------------------------------------------------------
; int checkasm_stack_clobber(uint64_t clobber, ...)
;-----------------------------------------------------------------------------
cglobal stack_clobber, 1, 2
; Clobber the stack with junk below the stack pointer
%define argsize (max_args+6)*8
SUB rsp, argsize
mov r1, argsize-8
.loop:
mov [rsp+r1], r0
sub r1, 8
jge .loop
ADD rsp, argsize
RET
%if WIN64
%assign free_regs 7
%define stack_param rsp+32 ; shadow space
%define num_stack_params rsp+stack_offset+22*8
%define num_fn_args rsp+stack_offset+17*8
%assign num_reg_args 4
%assign free_regs 7
%assign clobber_mask_stack_bit 16
DECLARE_REG_TMP 4
%else
%assign free_regs 9
%define stack_param rsp
%define num_stack_params rsp+stack_offset+16*8
%define num_fn_args rsp+stack_offset+11*8
%assign num_reg_args 6
%assign free_regs 9
%assign clobber_mask_stack_bit 64
DECLARE_REG_TMP 7
%endif
;-----------------------------------------------------------------------------
; void checkasm_checked_call(void *func, ...)
;-----------------------------------------------------------------------------
%macro CLOBBER_UPPER 2 ; reg, mask_bit
mov r13d, %1d
or r13, r8
test r9b, %2
cmovnz %1, r13
%endmacro
cglobal checked_call, 2, 15, 16, max_args*8+64+8
mov t0, r0
mov r10d, [num_fn_args]
mov r8, 0xdeadbeef00000000
mov r9d, [num_fn_args+r10*8+8] ; clobber_mask
mov t0, [num_fn_args+r10*8] ; func
; All arguments have been pushed on the stack instead of registers in
; order to test for incorrect assumptions that 32-bit ints are
; zero-extended to 64-bit.
mov r0, r6mp
mov r1, r7mp
mov r2, r8mp
mov r3, r9mp
; Clobber the upper halves of 32-bit parameters
CLOBBER_UPPER r0, 1
CLOBBER_UPPER r1, 2
CLOBBER_UPPER r2, 4
CLOBBER_UPPER r3, 8
%if UNIX64
mov r4, r10mp
mov r5, r11mp
CLOBBER_UPPER r4, 16
CLOBBER_UPPER r5, 32
%else ; WIN64
; Move possible floating-point arguments to the correct registers
movq m0, r0
movq m1, r1
movq m2, r2
movq m3, r3
%assign i 6
%rep 16-6
mova m %+ i, [x %+ i]
@ -208,22 +197,29 @@ cglobal checked_call, 2, 15, 16, max_args*8+64+8
%endrep
%endif
xor r11d, r11d
sub r10d, num_reg_args
cmovs r10d, r11d ; num stack args
; write stack canaries to the area above parameters passed on the stack
mov r9d, [num_stack_params]
mov r8, [rsp+stack_offset] ; return address
not r8
mov r12, [rsp+stack_offset] ; return address
not r12
%assign i 0
%rep 8 ; 64 bytes
mov [stack_param+(r9+i)*8], r8
mov [stack_param+(r10+i)*8], r12
%assign i i+1
%endrep
dec r9d
jl .stack_setup_done ; no stack parameters
test r10d, r10d
jz .stack_setup_done ; no stack parameters
.copy_stack_parameter:
mov r8, [stack_param+stack_offset+7*8+r9*8]
mov [stack_param+r9*8], r8
dec r9d
jge .copy_stack_parameter
mov r12, [stack_param+stack_offset+8+r11*8]
CLOBBER_UPPER r12, clobber_mask_stack_bit
shr r9d, 1
mov [stack_param+r11*8], r12
inc r11d
cmp r11d, r10d
jl .copy_stack_parameter
.stack_setup_done:
%assign i 14
@ -234,7 +230,11 @@ cglobal checked_call, 2, 15, 16, max_args*8+64+8
call t0
; check for stack corruption
mov r0d, [num_stack_params]
mov r0d, [num_fn_args]
xor r3d, r3d
sub r0d, num_reg_args
cmovs r0d, r3d ; num stack args
mov r3, [rsp+stack_offset]
mov r4, [stack_param+r0*8]
not r3
@ -247,27 +247,32 @@ cglobal checked_call, 2, 15, 16, max_args*8+64+8
%assign i i+1
%endrep
xor r3, [stack_param+(r0+7)*8]
lea r0, [errmsg_stack]
or r4, r3
jnz .save_retval_and_fail
jz .stack_ok
; Save the return value located in rdx:rax first to prevent clobbering.
mov r10, rax
mov r11, rdx
lea r0, [errmsg_stack]
jmp .fail
.stack_ok:
; check for failure to preserve registers
%assign i 14
%rep 15-free_regs
cmp r %+ i, [r0-errmsg_stack+n %+ i]
cmp r %+ i, [n %+ i]
setne r4b
lea r3d, [r4+r3*2]
%assign i i-1
%endrep
%if WIN64
lea r0, [rsp+60] ; account for shadow space
lea r0, [rsp+32] ; account for shadow space
mov r5, r0
test r3d, r3d
jz .gpr_ok
%else
test r3d, r3d
jz .gpr_xmm_ok
lea r0, [rsp+28]
mov r0, rsp
%endif
%assign i free_regs
%rep 15-free_regs
@ -324,22 +329,15 @@ cglobal checked_call, 2, 15, 16, max_args*8+64+8
cmp r0, r5
je .gpr_xmm_ok
mov byte [r0], 0
lea r0, [r5-28]
mov r11, rdx
mov r1, r5
%else
mov byte [r0], 0
mov r0, rsp
%endif
mov dword [r0+ 0], "fail"
mov dword [r0+ 4], "ed t"
mov dword [r0+ 8], "o pr"
mov dword [r0+12], "eser"
mov dword [r0+16], "ve r"
mov dword [r0+20], "egis"
mov dword [r0+24], "ter:"
.save_retval_and_fail:
; Save the return value located in rdx:rax first to prevent clobbering.
mov r10, rax
mov r11, rdx
mov r1, rsp
%endif
mov r10, rax
lea r0, [errmsg_register]
jmp .fail
.gpr_xmm_ok:
; Check for dirty YMM state, i.e. missing vzeroupper
@ -420,25 +418,19 @@ cglobal checked_call, 1, 7
test r3, r3
jz .gpr_ok
lea r1, [esp+16]
mov dword [r1+ 0], "fail"
mov dword [r1+ 4], "ed t"
mov dword [r1+ 8], "o pr"
mov dword [r1+12], "eser"
mov dword [r1+16], "ve r"
mov dword [r1+20], "egis"
mov dword [r1+24], "ter:"
lea r4, [r1+28]
mov [esp+4], r1
%assign i 3
%rep 4
mov dword [r4], " r0" + (i << 16)
lea r5, [r4+3]
mov dword [r1], " r0" + (i << 16)
lea r4, [r1+3]
test r3, 1 << ((6 - i) * 8)
cmovnz r4, r5
cmovnz r1, r4
%assign i i+1
%endrep
mov byte [r4], 0
mov byte [r1], 0
mov r5, eax
mov r6, edx
LEA r1, errmsg_register
jmp .fail
.gpr_ok:
; check for stack corruption