mirror of
https://github.com/mozilla/gecko-dev.git
synced 2024-10-08 10:44:56 +00:00
Bug 1582743 - Update dav1d from upstream to commit c0865f3. r=TD-Linux
Differential Revision: https://phabricator.services.mozilla.com/D46762 --HG-- extra : moz-landing-system : lando
This commit is contained in:
parent
5be5470fba
commit
ac0da8b368
@ -82,6 +82,7 @@ if CONFIG['CPU_ARCH'] in ('x86', 'x86_64'):
|
||||
# an error when it compiles empty files.
|
||||
SOURCES += [
|
||||
'../../../third_party/dav1d/src/x86/cdef.asm',
|
||||
'../../../third_party/dav1d/src/x86/film_grain.asm',
|
||||
'../../../third_party/dav1d/src/x86/ipred.asm',
|
||||
'../../../third_party/dav1d/src/x86/itx.asm',
|
||||
'../../../third_party/dav1d/src/x86/loopfilter.asm',
|
||||
@ -94,6 +95,7 @@ if CONFIG['CPU_ARCH'] in ('x86', 'x86_64'):
|
||||
'../../../third_party/dav1d/src/x86/cpuid.asm',
|
||||
'../../../third_party/dav1d/src/x86/ipred_ssse3.asm',
|
||||
'../../../third_party/dav1d/src/x86/itx_ssse3.asm',
|
||||
'../../../third_party/dav1d/src/x86/loopfilter_ssse3.asm',
|
||||
'../../../third_party/dav1d/src/x86/looprestoration_ssse3.asm',
|
||||
'../../../third_party/dav1d/src/x86/mc_ssse3.asm',
|
||||
'../../../third_party/dav1d/src/x86/msac.asm',
|
||||
@ -103,6 +105,7 @@ if CONFIG['CPU_ARCH'] in ('x86', 'x86_64'):
|
||||
relative_path = '../../../third_party/dav1d/src/x86/'
|
||||
bitdepth_basenames = [
|
||||
'cdef_init_tmpl.c',
|
||||
'film_grain_init_tmpl.c',
|
||||
'ipred_init_tmpl.c',
|
||||
'itx_init_tmpl.c',
|
||||
'loopfilter_init_tmpl.c',
|
||||
|
@ -1,7 +1,7 @@
|
||||
#define API_VERSION_NUMBER 2,0,0,0
|
||||
#define API_VERSION_NUMBER_STR "2.0.0"
|
||||
#define PROJECT_VERSION_NUMBER 0,3,1,0
|
||||
#define PROJECT_VERSION_NUMBER_STR "0.3.1"
|
||||
#define API_VERSION_NUMBER 3,0,0,0
|
||||
#define API_VERSION_NUMBER_STR "3.0.0"
|
||||
#define PROJECT_VERSION_NUMBER 0,4,0,0
|
||||
#define PROJECT_VERSION_NUMBER_STR "0.4.0"
|
||||
|
||||
#include <windows.h>
|
||||
|
||||
|
@ -124,6 +124,7 @@ relative_path = '../../third_party/dav1d/src/'
|
||||
bitdepth_basenames = [
|
||||
'cdef_apply_tmpl.c',
|
||||
'cdef_tmpl.c',
|
||||
'fg_apply_tmpl.c',
|
||||
'film_grain_tmpl.c',
|
||||
'ipred_prepare_tmpl.c',
|
||||
'ipred_tmpl.c',
|
||||
@ -163,6 +164,7 @@ SOURCES += [
|
||||
EXPORTS.dav1d.src += [
|
||||
'../../third_party/dav1d/src/cdef.h',
|
||||
'../../third_party/dav1d/src/cdef_apply.h',
|
||||
'../../third_party/dav1d/src/fg_apply.h',
|
||||
'../../third_party/dav1d/src/ipred.h',
|
||||
'../../third_party/dav1d/src/ipred_prepare.h',
|
||||
'../../third_party/dav1d/src/itx.h',
|
||||
|
@ -20,7 +20,7 @@ origin:
|
||||
|
||||
# Human-readable identifier for this version/release
|
||||
# Generally "version NNN", "tag SSS", "bookmark SSS"
|
||||
release: commit c138435f5aee794ff9d9ac23c3718017927f2e20 (2019-07-17T12:39:10.000Z).
|
||||
release: commit c0865f35c74bdcc71021630f64dca2db35d2bc8c (2019-09-19T12:07:23.000+02:00).
|
||||
|
||||
# The package's license, where possible using the mnemonic from
|
||||
# https://spdx.org/licenses/
|
||||
|
@ -1,2 +1,2 @@
|
||||
/* auto-generated, do not edit */
|
||||
#define DAV1D_VERSION "0.3.1-69-gc138435"
|
||||
#define DAV1D_VERSION "0.4.0-49-gc0865f3"
|
||||
|
@ -27,7 +27,7 @@
|
||||
#ifndef DAV1D_VERSION_H
|
||||
#define DAV1D_VERSION_H
|
||||
|
||||
#define DAV1D_API_VERSION_MAJOR 2
|
||||
#define DAV1D_API_VERSION_MAJOR 3
|
||||
#define DAV1D_API_VERSION_MINOR 0
|
||||
#define DAV1D_API_VERSION_PATCH 0
|
||||
|
||||
|
9
third_party/dav1d/.gitlab-ci.yml
vendored
9
third_party/dav1d/.gitlab-ci.yml
vendored
@ -269,6 +269,7 @@ build-debian-ppc64le:
|
||||
test-debian:
|
||||
image: registry.videolan.org:5000/dav1d-debian-unstable:20190215130514
|
||||
stage: test
|
||||
needs: ["build-debian"]
|
||||
tags:
|
||||
- debian
|
||||
- amd64
|
||||
@ -289,6 +290,7 @@ test-debian:
|
||||
test-debian-unaligned-stack:
|
||||
image: registry.videolan.org:5000/dav1d-debian-unstable:20190215130514
|
||||
stage: test
|
||||
needs: ["build-debian"]
|
||||
tags:
|
||||
- debian
|
||||
- amd64
|
||||
@ -309,6 +311,7 @@ test-debian-unaligned-stack:
|
||||
test-debian-asan:
|
||||
image: registry.videolan.org:5000/dav1d-debian-unstable:20190215130514
|
||||
stage: test
|
||||
needs: ["build-debian"]
|
||||
tags:
|
||||
- debian
|
||||
- amd64
|
||||
@ -331,6 +334,7 @@ test-debian-asan:
|
||||
test-debian-msan:
|
||||
image: registry.videolan.org:5000/dav1d-debian-unstable:20190215130514
|
||||
stage: test
|
||||
needs: ["build-debian"]
|
||||
tags:
|
||||
- debian
|
||||
- amd64
|
||||
@ -353,6 +357,7 @@ test-debian-msan:
|
||||
test-debian-ubsan:
|
||||
image: registry.videolan.org:5000/dav1d-debian-unstable:20190215130514
|
||||
stage: test
|
||||
needs: ["build-debian"]
|
||||
tags:
|
||||
- debian
|
||||
- amd64
|
||||
@ -375,6 +380,7 @@ test-debian-ubsan:
|
||||
test-win64:
|
||||
image: registry.videolan.org:5000/dav1d-debian-unstable:20190215130514
|
||||
stage: test
|
||||
needs: ["build-win64"]
|
||||
tags:
|
||||
- debian
|
||||
- amd64
|
||||
@ -399,6 +405,7 @@ test-win64:
|
||||
test-debian-aarch64:
|
||||
image: registry.videolan.org:5000/dav1d-debian-unstable-aarch64:20181122182457
|
||||
stage: test
|
||||
needs: ["build-debian-aarch64"]
|
||||
tags:
|
||||
- aarch64
|
||||
- debian
|
||||
@ -421,6 +428,7 @@ test-debian-aarch64:
|
||||
test-debian-ppc64le:
|
||||
image: registry.videolan.org/dav1d-debian-unstable-ppc64le:20190606105121
|
||||
stage: test
|
||||
needs: ["build-debian-ppc64le"]
|
||||
tags:
|
||||
- ppc64le
|
||||
- docker
|
||||
@ -443,6 +451,7 @@ test-debian-ppc64le:
|
||||
test-debian-armv7-clang-5:
|
||||
stage: test
|
||||
image: registry.videolan.org:5000/dav1d-debian-unstable-armv7:20190202101732
|
||||
needs: ["build-debian-armv7-clang-5"]
|
||||
tags:
|
||||
- armv7
|
||||
- debian
|
||||
|
7
third_party/dav1d/NEWS
vendored
7
third_party/dav1d/NEWS
vendored
@ -6,6 +6,13 @@ Changes for 0.4.0 'Cheetah':
|
||||
- SSE2 and ARM64 optimizations for MSAC
|
||||
- Improve speed on 32bits systems
|
||||
- Optimization in obmc blend
|
||||
- Reduce RAM usage significantly
|
||||
- The initial PPC SIMD code, cdef_filter
|
||||
- NEON optimizations for blend functions on ARM
|
||||
- NEON optimizations for w_mask functions on ARM
|
||||
- NEON optimizations for inverse transforms on ARM64
|
||||
- Improve handling of malloc failures
|
||||
- Simple Player example in tools
|
||||
|
||||
|
||||
Changes for 0.3.1 'Sailfish':
|
||||
|
1087
third_party/dav1d/examples/dav1dplay.c
vendored
Normal file
1087
third_party/dav1d/examples/dav1dplay.c
vendored
Normal file
File diff suppressed because it is too large
Load Diff
62
third_party/dav1d/examples/meson.build
vendored
Normal file
62
third_party/dav1d/examples/meson.build
vendored
Normal file
@ -0,0 +1,62 @@
|
||||
# Copyright © 2018, VideoLAN and dav1d authors
|
||||
# All rights reserved.
|
||||
#
|
||||
# Redistribution and use in source and binary forms, with or without
|
||||
# modification, are permitted provided that the following conditions are met:
|
||||
#
|
||||
# 1. Redistributions of source code must retain the above copyright notice, this
|
||||
# list of conditions and the following disclaimer.
|
||||
#
|
||||
# 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
# this list of conditions and the following disclaimer in the documentation
|
||||
# and/or other materials provided with the distribution.
|
||||
#
|
||||
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
|
||||
# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
|
||||
# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
|
||||
# ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
|
||||
# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
|
||||
# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
|
||||
# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
#
|
||||
# Build definition for the dav1d examples
|
||||
#
|
||||
|
||||
# Leave subdir if examples are disabled
|
||||
if not get_option('enable_examples')
|
||||
subdir_done()
|
||||
endif
|
||||
|
||||
|
||||
# dav1d player sources
|
||||
dav1dplay_sources = files(
|
||||
'dav1dplay.c',
|
||||
)
|
||||
|
||||
# SDL2 is looked up as an optional dependency (required: false). The player
# target below is only defined when it is found, so a missing SDL2 silently
# skips dav1dplay instead of failing the configure step.
sdl2_dependency = dependency('sdl2', version: '>= 2.0.1', required: false)
|
||||
|
||||
if sdl2_dependency.found()
|
||||
# libplacebo and Vulkan are optional as well. The HAVE_PLACEBO_VULKAN define
# and the two extra dependencies are added only when both packages are found
# AND the SDL_vulkan.h header is available; otherwise both lists stay empty.
placebo_dependency = dependency('libplacebo', version: '>= 1.18.0', required: false)
|
||||
vulkan_dependency = dependency('vulkan', required: false)
|
||||
sdl_has_vulkan = cc.has_header('SDL_vulkan.h', dependencies: [sdl2_dependency])
|
||||
cflag_placebo = []
|
||||
deps_placebo = []
|
||||
if placebo_dependency.found() and vulkan_dependency.found() and sdl_has_vulkan
|
||||
cflag_placebo += '-DHAVE_PLACEBO_VULKAN=1'
|
||||
deps_placebo = [vulkan_dependency, placebo_dependency]
|
||||
endif
|
||||
# NOTE(review): cc, rev_target, libdav1d, dav1d_input_objs, dav1d_inc_dirs and
# getopt_dependency are not defined in this file; presumably they are set by
# build files processed before this subdir -- confirm the subdir() order.
# The resulting executable is installed (install : true).
dav1dplay = executable('dav1dplay',
|
||||
dav1dplay_sources,
|
||||
rev_target,
|
||||
|
||||
link_with : [libdav1d, dav1d_input_objs],
|
||||
include_directories : [dav1d_inc_dirs],
|
||||
dependencies : [getopt_dependency, sdl2_dependency, deps_placebo],
|
||||
install : true,
|
||||
c_args : cflag_placebo,
|
||||
)
|
||||
endif
|
10
third_party/dav1d/include/common/attributes.h
vendored
10
third_party/dav1d/include/common/attributes.h
vendored
@ -46,7 +46,7 @@
|
||||
/* x86-64 needs 32-byte alignment for AVX2. */
|
||||
#define ALIGN_32_VAL 32
|
||||
#define ALIGN_16_VAL 16
|
||||
#elif ARCH_X86_32 || ARCH_ARM || ARCH_AARCH64
|
||||
#elif ARCH_X86_32 || ARCH_ARM || ARCH_AARCH64 || ARCH_PPC64LE
|
||||
/* ARM doesn't benefit from anything more than 16-byte alignment. */
|
||||
#define ALIGN_32_VAL 16
|
||||
#define ALIGN_16_VAL 16
|
||||
@ -92,6 +92,14 @@
|
||||
#define NOINLINE __attribute__((noinline))
|
||||
#endif /* !_MSC_VER */
|
||||
|
||||
/*
 * In NDEBUG builds assert() is not compiled out; it is redefined as an
 * optimizer hint. With GCC/clang the condition is still evaluated and a
 * false result leads to __builtin_unreachable(); with MSVC the condition is
 * passed to __assume. All other configurations (no NDEBUG, or a different
 * compiler) fall back to the standard <assert.h> macro.
 *
 * NOTE(review): because the GCC/clang form evaluates its argument, asserted
 * expressions should be free of side effects -- confirm at call sites.
 */
#if defined(NDEBUG) && (defined(__GNUC__) || defined(__clang__))
|
||||
#define assert(x) do { if (!(x)) __builtin_unreachable(); } while (0)
|
||||
#elif defined(NDEBUG) && defined(_MSC_VER)
|
||||
#define assert __assume
|
||||
#else
|
||||
#include <assert.h>
|
||||
#endif
|
||||
|
||||
#if defined(__GNUC__) && !defined(__INTEL_COMPILER) && !defined(__clang__)
|
||||
# define dav1d_uninit(x) x=x
|
||||
#else
|
||||
|
8
third_party/dav1d/include/common/intops.h
vendored
8
third_party/dav1d/include/common/intops.h
vendored
@ -40,6 +40,14 @@ static inline int imin(const int a, const int b) {
|
||||
return a < b ? a : b;
|
||||
}
|
||||
|
||||
/* Returns the larger of the two unsigned values a and b. */
static inline unsigned umax(const unsigned a, const unsigned b) {
|
||||
return a > b ? a : b;
|
||||
}
|
||||
|
||||
/* Returns the smaller of the two unsigned values a and b. */
static inline unsigned umin(const unsigned a, const unsigned b) {
|
||||
return a < b ? a : b;
|
||||
}
|
||||
|
||||
/*
 * Clamps v to the inclusive range [min, max]. The lower bound is tested
 * first, so min is returned whenever v < min (even if min > max).
 */
static inline int iclip(const int v, const int min, const int max) {
|
||||
return v < min ? min : v > max ? max : v;
|
||||
}
|
||||
|
3
third_party/dav1d/include/common/mem.h
vendored
3
third_party/dav1d/include/common/mem.h
vendored
@ -28,13 +28,14 @@
|
||||
#ifndef DAV1D_COMMON_MEM_H
|
||||
#define DAV1D_COMMON_MEM_H
|
||||
|
||||
#include <assert.h>
|
||||
#include <stdlib.h>
|
||||
|
||||
#if defined(HAVE_ALIGNED_MALLOC) || defined(HAVE_MEMALIGN)
|
||||
#include <malloc.h>
|
||||
#endif
|
||||
|
||||
#include "common/attributes.h"
|
||||
|
||||
/*
|
||||
* Allocate 32-byte aligned memory. The return value can be released
|
||||
* by calling the standard free() function.
|
||||
|
11
third_party/dav1d/include/dav1d/headers.h
vendored
11
third_party/dav1d/include/dav1d/headers.h
vendored
@ -28,6 +28,8 @@
|
||||
#ifndef DAV1D_HEADERS_H
|
||||
#define DAV1D_HEADERS_H
|
||||
|
||||
#include <stddef.h>
|
||||
|
||||
// Constants from Section 3. "Symbols and abbreviated terms"
|
||||
#define DAV1D_MAX_CDEF_STRENGTHS 8
|
||||
#define DAV1D_MAX_OPERATING_POINTS 32
|
||||
@ -176,6 +178,13 @@ typedef struct Dav1dMasteringDisplay {
|
||||
uint32_t min_luminance;
|
||||
} Dav1dMasteringDisplay;
|
||||
|
||||
/*
 * Container for one ITU-T T.35 metadata message.
 * NOTE(review): field meanings below are inferred from the names and are not
 * established by this header alone -- confirm against the OBU parser.
 */
typedef struct Dav1dITUTT35 {
|
||||
/* presumably the T.35 country code byte -- TODO confirm */
uint8_t country_code;
|
||||
/* presumably only meaningful for some country_code values -- TODO confirm */
uint8_t country_code_extension_byte;
|
||||
/* size of the data referenced by payload; presumably in bytes -- TODO confirm */
size_t payload_size;
|
||||
/* payload data; ownership/lifetime is not visible here -- TODO confirm */
uint8_t *payload;
|
||||
} Dav1dITUTT35;
|
||||
|
||||
typedef struct Dav1dSequenceHeader {
|
||||
/**
|
||||
* Stream profile, 0 for 8-10 bits/component 4:2:0 or monochrome;
|
||||
@ -289,7 +298,7 @@ typedef struct Dav1dLoopfilterModeRefDeltas {
|
||||
} Dav1dLoopfilterModeRefDeltas;
|
||||
|
||||
typedef struct Dav1dFilmGrainData {
|
||||
uint16_t seed;
|
||||
unsigned seed;
|
||||
int num_y_points;
|
||||
uint8_t y_points[14][2 /* value, scaling */];
|
||||
int chroma_scaling_from_luma;
|
||||
|
9
third_party/dav1d/include/dav1d/picture.h
vendored
9
third_party/dav1d/include/dav1d/picture.h
vendored
@ -77,9 +77,16 @@ typedef struct Dav1dPicture {
|
||||
* this picture, as defined in section 5.8.4 and 6.7.4
|
||||
*/
|
||||
Dav1dMasteringDisplay *mastering_display;
|
||||
/**
|
||||
* ITU-T T.35 metadata as defined in section 5.8.2 and 6.7.2
|
||||
*/
|
||||
Dav1dITUTT35 *itut_t35;
|
||||
|
||||
uintptr_t reserved[4]; ///< reserved for future use
|
||||
|
||||
struct Dav1dRef *frame_hdr_ref, *seq_hdr_ref; ///< Frame parameter allocation origins
|
||||
struct Dav1dRef *content_light_ref, *mastering_display_ref; ///< Metadata allocation origins
|
||||
struct Dav1dRef *content_light_ref, *mastering_display_ref, *itut_t35_ref; ///< Metadata allocation origins
|
||||
uintptr_t reserved_ref[4]; ///< reserved for future use
|
||||
struct Dav1dRef *ref; ///< Frame data allocation origin
|
||||
|
||||
void *allocator_data; ///< pointer managed by the allocator
|
||||
|
13
third_party/dav1d/meson.build
vendored
13
third_party/dav1d/meson.build
vendored
@ -23,14 +23,14 @@
|
||||
# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
project('dav1d', ['c'],
|
||||
version: '0.3.1',
|
||||
version: '0.4.0',
|
||||
default_options: ['c_std=c99',
|
||||
'warning_level=2',
|
||||
'buildtype=release',
|
||||
'b_ndebug=if-release'],
|
||||
meson_version: '>= 0.47.0')
|
||||
|
||||
dav1d_soname_version = '2.0.0'
|
||||
dav1d_soname_version = '3.0.0'
|
||||
dav1d_api_version_array = dav1d_soname_version.split('.')
|
||||
dav1d_api_version_major = dav1d_api_version_array[0]
|
||||
dav1d_api_version_minor = dav1d_api_version_array[1]
|
||||
@ -85,9 +85,14 @@ test_args = []
|
||||
optional_arguments = []
|
||||
|
||||
# Define _POSIX_C_SOURCE to POSIX.1–2001 (IEEE Std 1003.1-2001)
|
||||
test_args += '-D_POSIX_C_SOURCE=200112L'
|
||||
test_args += '-D_POSIX_C_SOURCE=200112L'
|
||||
add_project_arguments('-D_POSIX_C_SOURCE=200112L', language: 'c')
|
||||
|
||||
if host_machine.system() == 'darwin'
|
||||
test_args += '-D_DARWIN_C_SOURCE'
|
||||
add_project_arguments('-D_DARWIN_C_SOURCE', language: 'c')
|
||||
endif
|
||||
|
||||
if host_machine.system() == 'windows'
|
||||
cdata.set('_WIN32_WINNT', '0x0601')
|
||||
cdata.set('UNICODE', 1) # Define to 1 for Unicode (Wide Chars) APIs
|
||||
@ -389,4 +394,6 @@ subdir('src')
|
||||
|
||||
subdir('tools')
|
||||
|
||||
subdir('examples')
|
||||
|
||||
subdir('tests')
|
||||
|
5
third_party/dav1d/meson_options.txt
vendored
5
third_party/dav1d/meson_options.txt
vendored
@ -15,6 +15,11 @@ option('enable_tools',
|
||||
value: true,
|
||||
description: 'Build dav1d cli tools')
|
||||
|
||||
# Opt-in switch: defaults to false, so the examples are not built unless
# explicitly enabled.
option('enable_examples',
|
||||
type: 'boolean',
|
||||
value: false,
|
||||
description: 'Build dav1d examples')
|
||||
|
||||
option('enable_tests',
|
||||
type: 'boolean',
|
||||
value: true,
|
||||
|
392
third_party/dav1d/src/arm/32/mc.S
vendored
392
third_party/dav1d/src/arm/32/mc.S
vendored
@ -91,6 +91,7 @@ function \type\()_8bpc_neon, export=1
|
||||
\type d16, d17, q0, q1, q2, q3
|
||||
add r12, r12, r4
|
||||
bx r12
|
||||
|
||||
.align 2
|
||||
L(\type\()_tbl):
|
||||
.word 1280f - L(\type\()_tbl) + CONFIG_THUMB
|
||||
@ -99,6 +100,7 @@ L(\type\()_tbl):
|
||||
.word 160f - L(\type\()_tbl) + CONFIG_THUMB
|
||||
.word 80f - L(\type\()_tbl) + CONFIG_THUMB
|
||||
.word 4f - L(\type\()_tbl) + CONFIG_THUMB
|
||||
|
||||
4:
|
||||
add r6, r0, r1
|
||||
lsl r1, r1, #1
|
||||
@ -217,17 +219,17 @@ bidir_fn mask
|
||||
|
||||
.macro w_mask_fn type
|
||||
function w_mask_\type\()_8bpc_neon, export=1
|
||||
push {r4-r10,lr}
|
||||
ldr r4, [sp, #32]
|
||||
ldr r5, [sp, #36]
|
||||
ldr r6, [sp, #40]
|
||||
ldr r7, [sp, #44]
|
||||
push {r4-r9,lr}
|
||||
ldr r4, [sp, #28]
|
||||
ldr r5, [sp, #32]
|
||||
ldr r6, [sp, #36]
|
||||
ldr r7, [sp, #40]
|
||||
clz r8, r4
|
||||
adr r9, L(w_mask_\type\()_tbl)
|
||||
sub r8, r8, #24
|
||||
ldr r8, [r9, r8, lsl #2]
|
||||
add r9, r9, r8
|
||||
mov r12, #6903
|
||||
movw r12, #6903
|
||||
vdup.16 q14, r12
|
||||
.if \type == 444
|
||||
vmov.i8 q15, #64
|
||||
@ -243,6 +245,7 @@ function w_mask_\type\()_8bpc_neon, export=1
|
||||
add r12, r0, r1
|
||||
lsl r1, r1, #1
|
||||
bx r9
|
||||
|
||||
.align 2
|
||||
L(w_mask_\type\()_tbl):
|
||||
.word 1280f - L(w_mask_\type\()_tbl) + CONFIG_THUMB
|
||||
@ -251,9 +254,10 @@ L(w_mask_\type\()_tbl):
|
||||
.word 160f - L(w_mask_\type\()_tbl) + CONFIG_THUMB
|
||||
.word 8f - L(w_mask_\type\()_tbl) + CONFIG_THUMB
|
||||
.word 4f - L(w_mask_\type\()_tbl) + CONFIG_THUMB
|
||||
|
||||
4:
|
||||
vld1.16 {d0, d1, d2, d3}, [r2]! // tmp1 (four rows at once)
|
||||
vld1.16 {d4, d5, d6, d7}, [r3]! // tmp2 (four rows at once)
|
||||
vld1.16 {d0, d1, d2, d3}, [r2, :128]! // tmp1 (four rows at once)
|
||||
vld1.16 {d4, d5, d6, d7}, [r3, :128]! // tmp2 (four rows at once)
|
||||
subs r5, r5, #4
|
||||
vsub.i16 q8, q2, q0 // tmp2-tmp1
|
||||
vsub.i16 q9, q3, q1
|
||||
@ -275,30 +279,30 @@ L(w_mask_\type\()_tbl):
|
||||
vmovn.u16 d20, q10 // 64 - m
|
||||
vmovn.u16 d21, q11
|
||||
vsub.i8 q10, q15, q10 // m
|
||||
vst1.8 {d20, d21}, [r6]!
|
||||
vst1.8 {d20, d21}, [r6, :128]!
|
||||
.elseif \type == 422
|
||||
vpadd.s16 d20, d20, d21 // (64 - m) + (64 - n) (column wise addition)
|
||||
vpadd.s16 d21, d22, d23
|
||||
vmovn.s16 d6, q10
|
||||
vhsub.u8 d6, d30, d6 // ((129 - sign) - ((64 - m) + (64 - n))) >> 1
|
||||
vst1.8 {d6}, [r6]!
|
||||
vst1.8 {d6}, [r6, :64]!
|
||||
.elseif \type == 420
|
||||
vadd.s16 d20, d20, d21 // (64 - my1) + (64 - my2) (row wise addition)
|
||||
vadd.s16 d21, d22, d23
|
||||
vpadd.s16 d20, d20, d21 // (128 - m) + (128 - n) (column wise addition)
|
||||
vsub.s16 d20, d30, d20 // (256 - sign) - ((128 - m) + (128 - n))
|
||||
vrshrn.u16 d20, q10, #2 // ((256 - sign) - ((128 - m) + (128 - n)) + 2) >> 2
|
||||
vst1.32 {d20[0]}, [r6]!
|
||||
vst1.32 {d20[0]}, [r6, :32]!
|
||||
.endif
|
||||
vst1.32 {d24[0]}, [r0], r1
|
||||
vst1.32 {d24[1]}, [r12], r1
|
||||
vst1.32 {d25[0]}, [r0], r1
|
||||
vst1.32 {d25[1]}, [r12], r1
|
||||
vst1.32 {d24[0]}, [r0, :32], r1
|
||||
vst1.32 {d24[1]}, [r12, :32], r1
|
||||
vst1.32 {d25[0]}, [r0, :32], r1
|
||||
vst1.32 {d25[1]}, [r12, :32], r1
|
||||
bgt 4b
|
||||
pop {r4-r10,pc}
|
||||
pop {r4-r9,pc}
|
||||
8:
|
||||
vld1.16 {d0, d1, d2, d3}, [r2]! // tmp1y1, tmp1y2
|
||||
vld1.16 {d4, d5, d6, d7}, [r3]! // tmp2y1, tmp2y2
|
||||
vld1.16 {d0, d1, d2, d3}, [r2, :128]! // tmp1y1, tmp1y2
|
||||
vld1.16 {d4, d5, d6, d7}, [r3, :128]! // tmp2y1, tmp2y2
|
||||
subs r5, r5, #2
|
||||
vsub.i16 q8, q2, q0 // tmp2y1 - tmp1y1
|
||||
vsub.i16 q9, q3, q1 // tmp2y2 - tmp1y2
|
||||
@ -320,43 +324,42 @@ L(w_mask_\type\()_tbl):
|
||||
vmovn.u16 d20, q10 // 64 - m
|
||||
vmovn.u16 d21, q11
|
||||
vsub.i8 q10, q15, q10 // m
|
||||
vst1.8 {d20, d21}, [r6]!
|
||||
vst1.8 {d20, d21}, [r6, :128]!
|
||||
.elseif \type == 422
|
||||
vpadd.s16 d20, d20, d21 // (64 - my1) + (64 - ny1) (column wise addition)
|
||||
vpadd.s16 d21, d22, d23 // (64 - my2) + (64 - ny2)
|
||||
vmovn.s16 d20, q10
|
||||
vhsub.u8 d20, d30, d20 // ((129 - sign) - ((64 - my1/y2) + (64 - ny1/y2))) >> 1
|
||||
vst1.8 {d20}, [r6]!
|
||||
vst1.8 {d20}, [r6, :64]!
|
||||
.elseif \type == 420
|
||||
vadd.s16 q10, q10, q11 // (64 - my1) + (64 - my2) (row wise addition)
|
||||
vpadd.s16 d20, d20, d21 // (128 - m) + (128 - n) (column wise addition)
|
||||
vsub.s16 d20, d30, d20 // (256 - sign) - ((128 - m) + (128 - n))
|
||||
vrshrn.u16 d20, q10, #2 // ((256 - sign) - ((128 - m) + (128 - n)) + 2) >> 2
|
||||
vst1.32 {d20[0]}, [r6]!
|
||||
vst1.32 {d20[0]}, [r6, :32]!
|
||||
.endif
|
||||
vst1.16 {d24}, [r0], r1
|
||||
vst1.16 {d25}, [r12], r1
|
||||
vst1.16 {d24}, [r0, :64], r1
|
||||
vst1.16 {d25}, [r12, :64], r1
|
||||
bgt 8b
|
||||
pop {r4-r10,pc}
|
||||
pop {r4-r9,pc}
|
||||
1280:
|
||||
640:
|
||||
320:
|
||||
160:
|
||||
sub r1, r1, r4
|
||||
.if \type == 444
|
||||
add r10, r6, r4
|
||||
add lr, r6, r4
|
||||
.elseif \type == 422
|
||||
add r10, r6, r4, lsr #1
|
||||
add lr, r6, r4, lsr #1
|
||||
.endif
|
||||
mov lr, r7
|
||||
add r9, r3, r4, lsl #1
|
||||
add r7, r2, r4, lsl #1
|
||||
161:
|
||||
mov r8, r4
|
||||
16:
|
||||
vld1.16 {d0, d1, d2, d3}, [r2]! // tmp1y1
|
||||
vld1.16 {d4, d5, d6, d7}, [r3]! // tmp2y1
|
||||
vld1.16 {d16, d17, d18, d19}, [r7]! // tmp1y2
|
||||
vld1.16 {d0, d1, d2, d3}, [r2, :128]! // tmp1y1
|
||||
vld1.16 {d4, d5, d6, d7}, [r3, :128]! // tmp2y1
|
||||
vld1.16 {d16, d17, d18, d19}, [r7, :128]! // tmp1y2
|
||||
subs r8, r8, #16
|
||||
vsub.i16 q2, q2, q0 // tmp2y1 - tmp1y1
|
||||
vsub.i16 q3, q3, q1
|
||||
@ -372,24 +375,24 @@ L(w_mask_\type\()_tbl):
|
||||
vqdmulh.s16 q13, q13, q3
|
||||
vadd.i16 q12, q12, q0 // (((tmp2y1 - tmp1y1) * (64 - my1) << 9) >> 15) + tmp1y1
|
||||
vadd.i16 q13, q13, q1
|
||||
vld1.16 {d0, d1, d2, d3}, [r9]! // tmp2h2
|
||||
vld1.16 {d0, d1, d2, d3}, [r9, :128]! // tmp2h2
|
||||
.if \type == 444
|
||||
vmovn.u16 d20, q10 // 64 - my1
|
||||
vmovn.u16 d21, q11
|
||||
vsub.i8 q10, q15, q10 // my1
|
||||
vst1.8 {d20, d21}, [r6]!
|
||||
vst1.8 {d20, d21}, [r6, :128]!
|
||||
.elseif \type == 422
|
||||
vpadd.s16 d20, d20, d21 // (64 - my1) + (64 - ny1) (column wise addition)
|
||||
vpadd.s16 d21, d22, d23
|
||||
vmovn.s16 d20, q10
|
||||
vhsub.u8 d20, d30, d20 // ((129 - sign) - ((64 - my1) + (64 - ny1))) >> 1
|
||||
vst1.8 {d20}, [r6]!
|
||||
vst1.8 {d20}, [r6, :64]!
|
||||
.endif
|
||||
vqrshrun.s16 d24, q12, #4 // (((((tmp2y1 - tmp1y1)*(64 - my1) << 9) >> 15) + tmp1y1) + 8) >> 4
|
||||
vqrshrun.s16 d25, q13, #4
|
||||
vsub.i16 q0, q0, q8 // tmp2y2 - tmp1y2
|
||||
vsub.i16 q1, q1, q9
|
||||
vst1.16 {d24, d25}, [r0]! // store dsty1
|
||||
vst1.16 {d24, d25}, [r0, :128]! // store dsty1
|
||||
vabs.s16 q2, q0 // abs(tmp2y2 - tmp1y2)
|
||||
vabs.s16 q3, q1
|
||||
vqsub.u16 q2, q14, q2 // 6903 - abs(tmp2y2 - tmp1y2)
|
||||
@ -402,13 +405,13 @@ L(w_mask_\type\()_tbl):
|
||||
vmovn.u16 d4, q2 // 64 - my2
|
||||
vmovn.u16 d5, q3
|
||||
vsub.i8 q2, q15, q2 // my2
|
||||
vst1.8 {d4, d5}, [r10]!
|
||||
vst1.8 {d4, d5}, [lr, :128]!
|
||||
.elseif \type == 422
|
||||
vpadd.s16 d4, d4, d5 // (64 - my2) + (64 - ny2) (column wise addition)
|
||||
vpadd.s16 d5, d6, d7
|
||||
vmovn.s16 d4, q2
|
||||
vhsub.u8 d4, d30, d4 // ((129 - sign) - ((64 - my2) + (64 - ny2))) >> 1
|
||||
vst1.8 {d4}, [r10]!
|
||||
vst1.8 {d4}, [lr, :64]!
|
||||
.elseif \type == 420
|
||||
vadd.s16 q10, q10, q2 // (64 - my1) + (64 - my2) (row wise addition)
|
||||
vadd.s16 q11, q11, q3
|
||||
@ -416,7 +419,7 @@ L(w_mask_\type\()_tbl):
|
||||
vpadd.s16 d21, d22, d23
|
||||
vsub.s16 q10, q15, q10 // (256 - sign) - ((128 - m) + (128 - n))
|
||||
vrshrn.u16 d20, q10, #2 // ((256 - sign) - ((128 - m) + (128 - n)) + 2) >> 2
|
||||
vst1.8 {d20}, [r6]!
|
||||
vst1.8 {d20}, [r6, :64]!
|
||||
.endif
|
||||
vqdmulh.s16 q12, q12, q0 // ((tmp2y2 - tmp1y2) * (64 - my2) << 9) >> 15
|
||||
vqdmulh.s16 q13, q13, q1
|
||||
@ -424,7 +427,7 @@ L(w_mask_\type\()_tbl):
|
||||
vadd.i16 q13, q13, q9
|
||||
vqrshrun.s16 d24, q12, #4 // (((((tmp2y2 - tmp1y2)*(64 - my2) << 9) >> 15) + tmp1y2) + 8) >> 4
|
||||
vqrshrun.s16 d25, q13, #4
|
||||
vst1.16 {d24, d25}, [r12]! // store dsty2
|
||||
vst1.16 {d24, d25}, [r12, :128]! // store dsty2
|
||||
bgt 16b
|
||||
subs r5, r5, #2
|
||||
add r2, r2, r4, lsl #1
|
||||
@ -433,15 +436,15 @@ L(w_mask_\type\()_tbl):
|
||||
add r9, r9, r4, lsl #1
|
||||
.if \type == 444
|
||||
add r6, r6, r4
|
||||
add r10, r10, r4
|
||||
add lr, lr, r4
|
||||
.elseif \type == 422
|
||||
add r6, r6, r4, lsr #1
|
||||
add r10, r10, r4, lsr #1
|
||||
add lr, lr, r4, lsr #1
|
||||
.endif
|
||||
add r0, r0, r1
|
||||
add r12, r12, r1
|
||||
bgt 161b
|
||||
pop {r4-r10,pc}
|
||||
pop {r4-r9,pc}
|
||||
endfunc
|
||||
.endm
|
||||
|
||||
@ -451,15 +454,16 @@ w_mask_fn 420
|
||||
|
||||
|
||||
function blend_8bpc_neon, export=1
|
||||
push {r4-r8,lr}
|
||||
ldr r4, [sp, #24]
|
||||
ldr r5, [sp, #28]
|
||||
clz r6, r3
|
||||
adr r7, L(blend_tbl)
|
||||
sub r6, r6, #26
|
||||
ldr r6, [r7, r6, lsl #2]
|
||||
add r7, r7, r6
|
||||
bx r7
|
||||
push {r4-r5,lr}
|
||||
ldr r4, [sp, #12]
|
||||
ldr r5, [sp, #16]
|
||||
clz lr, r3
|
||||
adr r3, L(blend_tbl)
|
||||
sub lr, lr, #26
|
||||
ldr lr, [r3, lr, lsl #2]
|
||||
add r3, r3, lr
|
||||
bx r3
|
||||
|
||||
.align 2
|
||||
L(blend_tbl):
|
||||
.word 320f - L(blend_tbl) + CONFIG_THUMB
|
||||
@ -472,33 +476,29 @@ L(blend_tbl):
|
||||
add r12, r0, r1
|
||||
lsl r1, r1, #1
|
||||
4:
|
||||
vld1.32 {d2[]}, [r5], r3
|
||||
vld1.32 {d1[]}, [r2], r3
|
||||
vld1.32 {d0[]}, [r0]
|
||||
vld1.u8 {d2}, [r5, :64]!
|
||||
vld1.u8 {d1}, [r2, :64]!
|
||||
vld1.32 {d0[]}, [r0, :32]
|
||||
subs r4, r4, #2
|
||||
vld1.32 {d2[1]}, [r5], r3
|
||||
vld1.32 {d1[1]}, [r2], r3
|
||||
vld1.32 {d0[1]}, [r12]
|
||||
vld1.32 {d0[1]}, [r12, :32]
|
||||
vsub.i8 d3, d22, d2
|
||||
vmull.u8 q8, d1, d2
|
||||
vmlal.u8 q8, d0, d3
|
||||
vrshrn.i16 d20, q8, #6
|
||||
vst1.32 {d20[0]}, [r0], r1
|
||||
vst1.32 {d20[1]}, [r12], r1
|
||||
vst1.32 {d20[0]}, [r0, :32], r1
|
||||
vst1.32 {d20[1]}, [r12, :32], r1
|
||||
bgt 4b
|
||||
pop {r4-r8,pc}
|
||||
pop {r4-r5,pc}
|
||||
80:
|
||||
vmov.i8 d16, #64
|
||||
add r12, r0, r1
|
||||
lsl r1, r1, #1
|
||||
8:
|
||||
vld1.u8 {d2}, [r5], r3
|
||||
vld1.u8 {d4}, [r2], r3
|
||||
vld1.u8 {d0}, [r0]
|
||||
vld1.u8 {q1}, [r5, :128]!
|
||||
vld1.u8 {q2}, [r2, :128]!
|
||||
vld1.u8 {d0}, [r0, :64]
|
||||
vsub.i8 d17, d16, d2
|
||||
vld1.u8 {d3}, [r5], r3
|
||||
vld1.u8 {d5}, [r2], r3
|
||||
vld1.u8 {d1}, [r12]
|
||||
vld1.u8 {d1}, [r12, :64]
|
||||
subs r4, r4, #2
|
||||
vsub.i8 d18, d16, d3
|
||||
vmull.u8 q3, d2, d4
|
||||
@ -507,47 +507,44 @@ L(blend_tbl):
|
||||
vmlal.u8 q10, d1, d18
|
||||
vrshrn.i16 d22, q3, #6
|
||||
vrshrn.i16 d23, q10, #6
|
||||
vst1.u8 {d22}, [r0], r1
|
||||
vst1.u8 {d23}, [r12], r1
|
||||
vst1.u8 {d22}, [r0, :64], r1
|
||||
vst1.u8 {d23}, [r12, :64], r1
|
||||
bgt 8b
|
||||
pop {r4-r8,pc}
|
||||
pop {r4-r5,pc}
|
||||
160:
|
||||
vmov.i8 q12, #64
|
||||
add r12, r0, r1
|
||||
lsl r1, r1, #1
|
||||
16:
|
||||
vld1.u8 {q2}, [r5], r3
|
||||
vld1.u8 {q1}, [r2], r3
|
||||
vld1.u8 {q0}, [r0]
|
||||
vld1.u8 {q1, q2}, [r5, :128]!
|
||||
vld1.u8 {q8, q9}, [r2, :128]!
|
||||
vld1.u8 {q0}, [r0, :128]
|
||||
subs r4, r4, #2
|
||||
vsub.i8 q11, q12, q2
|
||||
vld1.u8 {q15}, [r5], r3
|
||||
vld1.u8 {q14}, [r2], r3
|
||||
vld1.u8 {q13}, [r12]
|
||||
vmull.u8 q3, d2, d4
|
||||
vmlal.u8 q3, d0, d22
|
||||
vmull.u8 q8, d3, d5
|
||||
vmlal.u8 q8, d1, d23
|
||||
vsub.i8 q11, q12, q15
|
||||
vrshrn.i16 d18, q3, #6
|
||||
vrshrn.i16 d19, q8, #6
|
||||
vmull.u8 q3, d28, d30
|
||||
vmlal.u8 q3, d26, d22
|
||||
vmull.u8 q8, d29, d31
|
||||
vmlal.u8 q8, d27, d23
|
||||
vsub.i8 q15, q12, q1
|
||||
vld1.u8 {q13}, [r12, :128]
|
||||
vmull.u8 q3, d16, d2
|
||||
vmlal.u8 q3, d0, d30
|
||||
vmull.u8 q14, d17, d3
|
||||
vmlal.u8 q14, d1, d31
|
||||
vsub.i8 q15, q12, q2
|
||||
vrshrn.i16 d20, q3, #6
|
||||
vrshrn.i16 d21, q8, #6
|
||||
vst1.u8 {q9}, [r0], r1
|
||||
vst1.u8 {q10}, [r12], r1
|
||||
vrshrn.i16 d21, q14, #6
|
||||
vmull.u8 q3, d18, d4
|
||||
vmlal.u8 q3, d26, d30
|
||||
vmull.u8 q14, d19, d5
|
||||
vmlal.u8 q14, d27, d31
|
||||
vrshrn.i16 d22, q3, #6
|
||||
vrshrn.i16 d23, q14, #6
|
||||
vst1.u8 {q10}, [r0, :128], r1
|
||||
vst1.u8 {q11}, [r12, :128], r1
|
||||
bgt 16b
|
||||
pop {r4-r8,pc}
|
||||
|
||||
pop {r4-r5,pc}
|
||||
320:
|
||||
vmov.i8 q10, #64
|
||||
32:
|
||||
vld1.u8 {q2, q3}, [r5], r3
|
||||
vld1.u8 {q8, q9}, [r2], r3
|
||||
vld1.u8 {q0, q1}, [r0]
|
||||
vld1.u8 {q2, q3}, [r5, :128]!
|
||||
vld1.u8 {q8, q9}, [r2, :128]!
|
||||
vld1.u8 {q0, q1}, [r0, :128]
|
||||
subs r4, r4, #1
|
||||
vsub.i8 q11, q10, q2
|
||||
vmull.u8 q15, d16, d4
|
||||
@ -563,9 +560,9 @@ L(blend_tbl):
|
||||
vmlal.u8 q14, d3, d23
|
||||
vrshrn.i16 d26, q15, #6
|
||||
vrshrn.i16 d27, q14, #6
|
||||
vst1.u8 {q12, q13}, [r0], r1
|
||||
vst1.u8 {q12, q13}, [r0, :128], r1
|
||||
bgt 32b
|
||||
pop {r4-r8,pc}
|
||||
pop {r4-r5,pc}
|
||||
endfunc
|
||||
|
||||
function blend_h_8bpc_neon, export=1
|
||||
@ -580,6 +577,7 @@ function blend_h_8bpc_neon, export=1
|
||||
ldr r6, [r7, r6, lsl #2]
|
||||
add r7, r7, r6
|
||||
bx r7
|
||||
|
||||
.align 2
|
||||
L(blend_h_tbl):
|
||||
.word 1280f - L(blend_h_tbl) + CONFIG_THUMB
|
||||
@ -595,19 +593,18 @@ L(blend_h_tbl):
|
||||
add r12, r0, r1
|
||||
lsl r1, r1, #1
|
||||
2:
|
||||
vld1.16 {d2[], d3[]}, [r5]!
|
||||
vld1.16 {d1[]}, [r2], r3
|
||||
vld1.16 {d2[], d3[]}, [r5, :16]!
|
||||
vld1.32 {d1[0]}, [r2, :32]!
|
||||
subs r4, r4, #2
|
||||
vld1.16 {d0[]}, [r0]
|
||||
vld1.16 {d0[]}, [r0, :16]
|
||||
vzip.8 d2, d3
|
||||
vld1.16 {d1[1]}, [r2], r3
|
||||
vsub.i8 d4, d22, d2
|
||||
vld1.16 {d0[1]}, [r12]
|
||||
vld1.16 {d0[1]}, [r12, :16]
|
||||
vmull.u8 q8, d1, d2
|
||||
vmlal.u8 q8, d0, d4
|
||||
vrshrn.i16 d20, q8, #6
|
||||
vst1.16 {d20[0]}, [r0], r1
|
||||
vst1.16 {d20[1]}, [r12], r1
|
||||
vst1.16 {d20[0]}, [r0, :16], r1
|
||||
vst1.16 {d20[1]}, [r12, :16], r1
|
||||
bgt 2b
|
||||
pop {r4-r8,pc}
|
||||
40:
|
||||
@ -615,74 +612,66 @@ L(blend_h_tbl):
|
||||
add r12, r0, r1
|
||||
lsl r1, r1, #1
|
||||
4:
|
||||
vld1.u8 {d2[]}, [r5]!
|
||||
vld1.32 {d1[]}, [r2], r3
|
||||
vld2.u8 {d2[], d3[]}, [r5, :16]!
|
||||
vld1.u8 {d1}, [r2, :64]!
|
||||
subs r4, r4, #2
|
||||
vld1.u8 {d6[]}, [r5]!
|
||||
vld1.32 {d1[1]}, [r2], r3
|
||||
vext.u8 d2, d2, d6, #4
|
||||
vld1.32 {d0[]}, [r0]
|
||||
vsub.i8 d3, d22, d2
|
||||
vld1.32 {d0[1]}, [r12]
|
||||
vext.u8 d2, d2, d3, #4
|
||||
vld1.32 {d0[]}, [r0, :32]
|
||||
vsub.i8 d6, d22, d2
|
||||
vld1.32 {d0[1]}, [r12, :32]
|
||||
vmull.u8 q8, d1, d2
|
||||
vmlal.u8 q8, d0, d3
|
||||
vmlal.u8 q8, d0, d6
|
||||
vrshrn.i16 d20, q8, #6
|
||||
vst1.32 {d20[0]}, [r0], r1
|
||||
vst1.32 {d20[1]}, [r12], r1
|
||||
vst1.32 {d20[0]}, [r0, :32], r1
|
||||
vst1.32 {d20[1]}, [r12, :32], r1
|
||||
bgt 4b
|
||||
pop {r4-r8,pc}
|
||||
80:
|
||||
vmov.i8 d16, #64
|
||||
vmov.i8 q8, #64
|
||||
add r12, r0, r1
|
||||
lsl r1, r1, #1
|
||||
8:
|
||||
vld1.u8 {d2[]}, [r5]!
|
||||
vld1.u8 {d4}, [r2], r3
|
||||
vld1.u8 {d0}, [r0]
|
||||
vsub.i8 d17, d16, d2
|
||||
vld1.u8 {d3[]}, [r5]!
|
||||
vld1.u8 {d5}, [r2], r3
|
||||
vld1.u8 {d1}, [r12]
|
||||
vld2.u8 {d2[], d3[]}, [r5, :16]!
|
||||
vld1.u8 {d4, d5}, [r2, :128]!
|
||||
vld1.u8 {d0}, [r0, :64]
|
||||
vsub.i8 q9, q8, q1
|
||||
vld1.u8 {d1}, [r12, :64]
|
||||
subs r4, r4, #2
|
||||
vsub.i8 d18, d16, d3
|
||||
vmull.u8 q3, d2, d4
|
||||
vmlal.u8 q3, d0, d17
|
||||
vmlal.u8 q3, d0, d18
|
||||
vmull.u8 q10, d3, d5
|
||||
vmlal.u8 q10, d1, d18
|
||||
vmlal.u8 q10, d1, d19
|
||||
vrshrn.i16 d22, q3, #6
|
||||
vrshrn.i16 d23, q10, #6
|
||||
vst1.u8 {d22}, [r0], r1
|
||||
vst1.u8 {d23}, [r12], r1
|
||||
vst1.u8 {d22}, [r0, :64], r1
|
||||
vst1.u8 {d23}, [r12, :64], r1
|
||||
bgt 8b
|
||||
pop {r4-r8,pc}
|
||||
160:
|
||||
vmov.i8 d24, #64
|
||||
vmov.i8 q12, #64
|
||||
add r12, r0, r1
|
||||
lsl r1, r1, #1
|
||||
16:
|
||||
vld1.u8 {d4[]}, [r5]!
|
||||
vld1.u8 {q1}, [r2], r3
|
||||
vsub.i8 d5, d24, d4
|
||||
vld1.u8 {q0}, [r0]
|
||||
vld2.u8 {d28[], d29[]}, [r5, :16]!
|
||||
vld1.u8 {d2, d3, d4, d5}, [r2, :128]!
|
||||
vsub.i8 q15, q12, q14
|
||||
vld1.u8 {q0}, [r0, :128]
|
||||
subs r4, r4, #2
|
||||
vld1.u8 {d30[]}, [r5]!
|
||||
vld1.u8 {q14}, [r2], r3
|
||||
vsub.i8 d31, d24, d30
|
||||
vld1.u8 {q13}, [r12]
|
||||
vmull.u8 q3, d2, d4
|
||||
vmlal.u8 q3, d0, d5
|
||||
vmull.u8 q8, d3, d4
|
||||
vmlal.u8 q8, d1, d5
|
||||
vld1.u8 {q13}, [r12, :128]
|
||||
vmull.u8 q3, d2, d28
|
||||
vmlal.u8 q3, d0, d30
|
||||
vmull.u8 q8, d3, d28
|
||||
vmlal.u8 q8, d1, d30
|
||||
vrshrn.i16 d18, q3, #6
|
||||
vrshrn.i16 d19, q8, #6
|
||||
vmull.u8 q3, d28, d30
|
||||
vmull.u8 q3, d4, d29
|
||||
vmlal.u8 q3, d26, d31
|
||||
vmull.u8 q8, d29, d30
|
||||
vmull.u8 q8, d5, d29
|
||||
vmlal.u8 q8, d27, d31
|
||||
vrshrn.i16 d20, q3, #6
|
||||
vrshrn.i16 d21, q8, #6
|
||||
vst1.u8 {q9}, [r0], r1
|
||||
vst1.u8 {q10}, [r12], r1
|
||||
vst1.u8 {q9}, [r0, :128], r1
|
||||
vst1.u8 {q10}, [r12, :128], r1
|
||||
bgt 16b
|
||||
pop {r4-r8,pc}
|
||||
320:
|
||||
@ -691,12 +680,12 @@ L(blend_h_tbl):
|
||||
vmov.i8 d20, #64
|
||||
sub r1, r1, r3
|
||||
321:
|
||||
vld1.u8 {d6[]}, [r5]!
|
||||
vld1.u8 {d6[]}, [r5]!
|
||||
vsub.i8 d7, d20, d6
|
||||
mov r8, r3
|
||||
32:
|
||||
vld1.u8 {q8, q9}, [r2]!
|
||||
vld1.u8 {q0, q1}, [r0]
|
||||
vld1.u8 {q8, q9}, [r2, :128]!
|
||||
vld1.u8 {q0, q1}, [r0, :128]
|
||||
vmull.u8 q15, d16, d6
|
||||
vmlal.u8 q15, d0, d7
|
||||
vmull.u8 q14, d17, d6
|
||||
@ -709,7 +698,7 @@ L(blend_h_tbl):
|
||||
vmlal.u8 q14, d3, d7
|
||||
vrshrn.i16 d2, q15, #6
|
||||
vrshrn.i16 d3, q14, #6
|
||||
vst1.u8 {q0, q1}, [r0]!
|
||||
vst1.u8 {q0, q1}, [r0, :128]!
|
||||
subs r8, r8, #32
|
||||
bgt 32b
|
||||
add r0, r0, r1
|
||||
@ -719,16 +708,17 @@ L(blend_h_tbl):
|
||||
endfunc
|
||||
|
||||
function blend_v_8bpc_neon, export=1
|
||||
push {r4-r8,lr}
|
||||
ldr r4, [sp, #24]
|
||||
push {r4-r5,lr}
|
||||
ldr r4, [sp, #12]
|
||||
movrel r5, X(obmc_masks)
|
||||
add r5, r5, r3
|
||||
clz r8, r3
|
||||
adr r7, L(blend_v_tbl)
|
||||
sub r8, r8, #26
|
||||
ldr r8, [r7, r8, lsl #2]
|
||||
add r7, r7, r8
|
||||
bx r7
|
||||
clz lr, r3
|
||||
adr r3, L(blend_v_tbl)
|
||||
sub lr, lr, #26
|
||||
ldr lr, [r3, lr, lsl #2]
|
||||
add r3, r3, lr
|
||||
bx r3
|
||||
|
||||
.align 2
|
||||
L(blend_v_tbl):
|
||||
.word 320f - L(blend_v_tbl) + CONFIG_THUMB
|
||||
@ -739,59 +729,58 @@ L(blend_v_tbl):
|
||||
|
||||
20:
|
||||
vmov.i8 d22, #64
|
||||
vld1.8 {d2[]}, [r5]
|
||||
vld1.8 {d2[]}, [r5]
|
||||
add r12, r0, r1
|
||||
lsl r1, r1, #1
|
||||
vsub.i8 d3, d22, d2
|
||||
2:
|
||||
vld1.8 {d1[]}, [r2], r3
|
||||
vld1.8 {d0[]}, [r0]
|
||||
vld1.16 {d1[0]}, [r2, :16]!
|
||||
vld1.8 {d0[]}, [r0]
|
||||
subs r4, r4, #2
|
||||
vld1.8 {d1[1]}, [r2], r3
|
||||
vld1.8 {d0[1]}, [r12]
|
||||
vld1.8 {d1[1]}, [r2]
|
||||
vld1.8 {d0[1]}, [r12]
|
||||
vmull.u8 q2, d1, d2
|
||||
vmlal.u8 q2, d0, d3
|
||||
vrshrn.i16 d6, q2, #6
|
||||
vst1.8 {d6[0]}, [r0], r1
|
||||
vst1.8 {d6[1]}, [r12], r1
|
||||
add r2, r2, #2
|
||||
vst1.8 {d6[0]}, [r0], r1
|
||||
vst1.8 {d6[1]}, [r12], r1
|
||||
bgt 2b
|
||||
pop {r4-r8,pc}
|
||||
pop {r4-r5,pc}
|
||||
40:
|
||||
vmov.i8 d22, #64
|
||||
vld1.32 {d4[]}, [r5]
|
||||
vld1.32 {d4[]}, [r5, :32]
|
||||
add r12, r0, r1
|
||||
lsl r1, r1, #1
|
||||
vsub.i8 d5, d22, d4
|
||||
sub r1, r1, #3
|
||||
4:
|
||||
vld1.32 {d2[]}, [r2], r3
|
||||
vld1.32 {d0[]}, [r0]
|
||||
vld1.32 {d2[1]}, [r2], r3
|
||||
vld1.32 {d0[1]}, [r12]
|
||||
vld1.u8 {d2}, [r2, :64]!
|
||||
vld1.32 {d0[]}, [r0, :32]
|
||||
vld1.32 {d0[1]}, [r12, :32]
|
||||
subs r4, r4, #2
|
||||
vmull.u8 q3, d2, d4
|
||||
vmlal.u8 q3, d0, d5
|
||||
vrshrn.i16 d20, q3, #6
|
||||
vst1.16 {d20[0]}, [r0]!
|
||||
vst1.16 {d20[2]}, [r12]!
|
||||
vst1.16 {d20[0]}, [r0, :16]!
|
||||
vst1.16 {d20[2]}, [r12, :16]!
|
||||
vst1.8 {d20[2]}, [r0]!
|
||||
vst1.8 {d20[6]}, [r12]!
|
||||
add r0, r0, r1
|
||||
add r12, r12, r1
|
||||
bgt 4b
|
||||
pop {r4-r8,pc}
|
||||
pop {r4-r5,pc}
|
||||
80:
|
||||
vmov.i8 d16, #64
|
||||
vld1.u8 {d2}, [r5]
|
||||
vld1.u8 {d2}, [r5, :64]
|
||||
add r12, r0, r1
|
||||
lsl r1, r1, #1
|
||||
vsub.i8 d17, d16, d2
|
||||
sub r1, r1, #6
|
||||
8:
|
||||
vld1.u8 {d4}, [r2], r3
|
||||
vld1.u8 {d0}, [r0]
|
||||
vld1.u8 {d5}, [r2], r3
|
||||
vld1.u8 {d1}, [r12]
|
||||
vld1.u8 {d4, d5}, [r2, :128]!
|
||||
vld1.u8 {d0}, [r0, :64]
|
||||
vld1.u8 {d1}, [r12, :64]
|
||||
subs r4, r4, #2
|
||||
vmull.u8 q3, d2, d4
|
||||
vmlal.u8 q3, d0, d17
|
||||
@ -799,55 +788,54 @@ L(blend_v_tbl):
|
||||
vmlal.u8 q10, d1, d17
|
||||
vrshrn.i16 d22, q3, #6
|
||||
vrshrn.i16 d23, q10, #6
|
||||
vst1.32 {d22[0]}, [r0]!
|
||||
vst1.32 {d23[0]}, [r12]!
|
||||
vst1.16 {d22[2]}, [r0]!
|
||||
vst1.16 {d23[2]}, [r12]!
|
||||
vst1.32 {d22[0]}, [r0, :32]!
|
||||
vst1.32 {d23[0]}, [r12, :32]!
|
||||
vst1.16 {d22[2]}, [r0, :16]!
|
||||
vst1.16 {d23[2]}, [r12, :16]!
|
||||
add r0, r0, r1
|
||||
add r12, r12, r1
|
||||
bgt 8b
|
||||
pop {r4-r8,pc}
|
||||
pop {r4-r5,pc}
|
||||
160:
|
||||
vmov.i8 q12, #64
|
||||
vld1.u8 {q2}, [r5]
|
||||
vld1.u8 {q14}, [r5, :128]
|
||||
add r12, r0, r1
|
||||
lsl r1, r1, #1
|
||||
vsub.i8 q11, q12, q2
|
||||
vsub.i8 q11, q12, q14
|
||||
sub r1, r1, #12
|
||||
16:
|
||||
vld1.u8 {q1}, [r2], r3
|
||||
vld1.u8 {q0}, [r0]
|
||||
vld1.u8 {q1, q2}, [r2, :128]!
|
||||
vld1.u8 {q0}, [r0, :128]
|
||||
subs r4, r4, #2
|
||||
vld1.u8 {q14}, [r2], r3
|
||||
vld1.u8 {q13}, [r12]
|
||||
vmull.u8 q3, d2, d4
|
||||
vld1.u8 {q13}, [r12, :128]
|
||||
vmull.u8 q3, d2, d28
|
||||
vmlal.u8 q3, d0, d22
|
||||
vmull.u8 q8, d3, d5
|
||||
vmull.u8 q8, d3, d29
|
||||
vmlal.u8 q8, d1, d23
|
||||
vrshrn.i16 d18, q3, #6
|
||||
vrshrn.i16 d19, q8, #6
|
||||
vmull.u8 q3, d28, d4
|
||||
vmull.u8 q3, d4, d28
|
||||
vmlal.u8 q3, d26, d22
|
||||
vmull.u8 q8, d29, d5
|
||||
vmull.u8 q8, d5, d29
|
||||
vmlal.u8 q8, d27, d23
|
||||
vrshrn.i16 d20, q3, #6
|
||||
vrshrn.i16 d21, q8, #6
|
||||
vst1.u8 {d18}, [r0]!
|
||||
vst1.u8 {d20}, [r12]!
|
||||
vst1.32 {d19[0]}, [r0]!
|
||||
vst1.32 {d21[0]}, [r12]!
|
||||
vst1.u8 {d18}, [r0, :64]!
|
||||
vst1.u8 {d20}, [r12, :64]!
|
||||
vst1.32 {d19[0]}, [r0, :32]!
|
||||
vst1.32 {d21[0]}, [r12, :32]!
|
||||
add r0, r0, r1
|
||||
add r12, r12, r1
|
||||
bgt 16b
|
||||
pop {r4-r8,pc}
|
||||
pop {r4-r5,pc}
|
||||
320:
|
||||
vmov.i8 q10, #64
|
||||
vld1.u8 {q2, q3}, [r5]
|
||||
vld1.u8 {q2, q3}, [r5, :128]
|
||||
vsub.i8 q11, q10, q2
|
||||
vsub.i8 q12, q10, q3
|
||||
32:
|
||||
vld1.u8 {q8, q9}, [r2], r3
|
||||
vld1.u8 {q0, q1}, [r0]
|
||||
vld1.u8 {q8, q9}, [r2, :128]!
|
||||
vld1.u8 {q0, q1}, [r0, :128]
|
||||
subs r4, r4, #1
|
||||
vmull.u8 q15, d16, d4
|
||||
vmlal.u8 q15, d0, d22
|
||||
@ -858,9 +846,9 @@ L(blend_v_tbl):
|
||||
vmull.u8 q15, d18, d6
|
||||
vmlal.u8 q15, d2, d24
|
||||
vrshrn.i16 d2, q15, #6
|
||||
vst1.u8 {d0, d1, d2}, [r0], r1
|
||||
vst1.u8 {d0, d1, d2}, [r0, :64], r1
|
||||
bgt 32b
|
||||
pop {r4-r8,pc}
|
||||
pop {r4-r5,pc}
|
||||
endfunc
|
||||
|
||||
|
||||
|
131
third_party/dav1d/src/arm/64/itx.S
vendored
131
third_party/dav1d/src/arm/64/itx.S
vendored
@ -98,7 +98,8 @@ const idct64_coeffs, align=4
|
||||
endconst
|
||||
|
||||
const iadst4_coeffs, align=4
|
||||
.short 1321, 3803, 2482, 3344, 3344*8
|
||||
// .h[4-5] can be interpreted as .s[2]
|
||||
.short 1321, 3803, 2482, 3344, 3344, 0
|
||||
endconst
|
||||
|
||||
const iadst8_coeffs, align=4
|
||||
@ -147,6 +148,27 @@ endconst
|
||||
.endif
|
||||
.endm
|
||||
|
||||
// Widening signed add of 16-bit lanes into 32-bit lanes.
//   \d0.4s = \s0.4h + \s1.4h          (low four lanes, always)
//   \d1.4s = high four lanes of \s0 + \s1, only when \sz is ".8h"
// When \sz is anything else, \d1 is not written.
.macro saddl_sz d0, d1, s0, s1, sz
|
||||
saddl \d0\().4s, \s0\().4h, \s1\().4h
|
||||
.ifc \sz, .8h
|
||||
saddl2 \d1\().4s, \s0\().8h, \s1\().8h
|
||||
.endif
|
||||
.endm
|
||||
|
||||
// Widening signed subtract of 16-bit lanes into 32-bit lanes.
//   \d0.4s = \s0.4h - \s1.4h          (low four lanes, always)
//   \d1.4s = high four lanes of \s0 - \s1, only when \sz is ".8h"
// When \sz is anything else, \d1 is not written.
.macro ssubl_sz d0, d1, s0, s1, sz
|
||||
ssubl \d0\().4s, \s0\().4h, \s1\().4h
|
||||
.ifc \sz, .8h
|
||||
ssubl2 \d1\().4s, \s0\().8h, \s1\().8h
|
||||
.endif
|
||||
.endm
|
||||
|
||||
// Multiply 32-bit lanes by the operand \c (passed verbatim as the third
// operand of mul, e.g. a by-element register such as v0.s[0]).
//   \d0.4s = \s0.4s * \c              (always)
//   \d1.4s = \s1.4s * \c              (only when \sz is ".8h")
.macro mul_4s_sz d0, d1, s0, s1, c, sz
|
||||
mul \d0\().4s, \s0\().4s, \c
|
||||
.ifc \sz, .8h
|
||||
mul \d1\().4s, \s1\().4s, \c
|
||||
.endif
|
||||
.endm
|
||||
|
||||
.macro scale_input sz, c, r0, r1, r2 r3, r4, r5, r6, r7
|
||||
sqrdmulh \r0\sz, \r0\sz, \c
|
||||
sqrdmulh \r1\sz, \r1\sz, \c
|
||||
@ -499,23 +521,24 @@ endfunc
|
||||
movrel x16, iadst4_coeffs
|
||||
ld1 {v0.8h}, [x16]
|
||||
|
||||
sub v3.4h, v16.4h, v18.4h
|
||||
ssubl v3.4s, v16.4h, v18.4h
|
||||
smull v4.4s, v16.4h, v0.h[0]
|
||||
smlal v4.4s, v18.4h, v0.h[1]
|
||||
smlal v4.4s, v19.4h, v0.h[2]
|
||||
smull v7.4s, v17.4h, v0.h[3]
|
||||
add v3.4h, v3.4h, v19.4h
|
||||
saddw v3.4s, v3.4s, v19.4h
|
||||
smull v5.4s, v16.4h, v0.h[2]
|
||||
smlsl v5.4s, v18.4h, v0.h[0]
|
||||
smlsl v5.4s, v19.4h, v0.h[1]
|
||||
|
||||
add \o3\().4s, v4.4s, v5.4s
|
||||
sqrdmulh \o2\().4h, v3.4h, v0.h[4]
|
||||
mul \o2\().4s, v3.4s, v0.s[2]
|
||||
add \o0\().4s, v4.4s, v7.4s
|
||||
add \o1\().4s, v5.4s, v7.4s
|
||||
sub \o3\().4s, \o3\().4s, v7.4s
|
||||
|
||||
rshrn \o0\().4h, \o0\().4s, #12
|
||||
rshrn \o2\().4h, \o2\().4s, #12
|
||||
rshrn \o1\().4h, \o1\().4s, #12
|
||||
rshrn \o3\().4h, \o3\().4s, #12
|
||||
.endm
|
||||
@ -534,14 +557,16 @@ endfunc
|
||||
movrel x16, iadst4_coeffs
|
||||
ld1 {v0.8h}, [x16]
|
||||
|
||||
sub v3.8h, v16.8h, v18.8h
|
||||
ssubl v2.4s, v16.4h, v18.4h
|
||||
ssubl2 v3.4s, v16.8h, v18.8h
|
||||
smull v4.4s, v16.4h, v0.h[0]
|
||||
smlal v4.4s, v18.4h, v0.h[1]
|
||||
smlal v4.4s, v19.4h, v0.h[2]
|
||||
smull2 v5.4s, v16.8h, v0.h[0]
|
||||
smlal2 v5.4s, v18.8h, v0.h[1]
|
||||
smlal2 v5.4s, v19.8h, v0.h[2]
|
||||
add v3.8h, v3.8h, v19.8h
|
||||
saddw v2.4s, v2.4s, v19.4h
|
||||
saddw2 v3.4s, v3.4s, v19.8h
|
||||
smull v6.4s, v16.4h, v0.h[2]
|
||||
smlsl v6.4s, v18.4h, v0.h[0]
|
||||
smlsl v6.4s, v19.4h, v0.h[1]
|
||||
@ -549,7 +574,8 @@ endfunc
|
||||
smlsl2 v7.4s, v18.8h, v0.h[0]
|
||||
smlsl2 v7.4s, v19.8h, v0.h[1]
|
||||
|
||||
sqrdmulh v18.8h, v3.8h, v0.h[4]
|
||||
mul v18.4s, v2.4s, v0.s[2]
|
||||
mul v19.4s, v3.4s, v0.s[2]
|
||||
|
||||
smull v2.4s, v17.4h, v0.h[3]
|
||||
smull2 v3.4s, v17.8h, v0.h[3]
|
||||
@ -566,6 +592,9 @@ endfunc
|
||||
sub v4.4s, v4.4s, v2.4s // out3
|
||||
sub v5.4s, v5.4s, v3.4s
|
||||
|
||||
rshrn v18.4h, v18.4s, #12
|
||||
rshrn2 v18.8h, v19.4s, #12
|
||||
|
||||
rshrn \o0\().4h, v16.4s, #12
|
||||
rshrn2 \o0\().8h, v17.4s, #12
|
||||
|
||||
@ -836,16 +865,25 @@ endfunc
|
||||
sqsub v5\sz, v5\sz, v19\sz // t7
|
||||
sqneg \o1\()\sz, \o1\()\sz // out1
|
||||
|
||||
add v6\sz, v2\sz, v4\sz
|
||||
sub v7\sz, v2\sz, v4\sz
|
||||
add v4\sz, v3\sz, v5\sz
|
||||
sub v5\sz, v3\sz, v5\sz
|
||||
sqrdmulh \o3\sz, v6\sz, v1.h[1] // out3
|
||||
sqrdmulh \o4\sz, v7\sz, v1.h[1] // out4
|
||||
sqrdmulh \o2\sz, v4\sz, v1.h[1] // out2
|
||||
sqrdmulh \o5\sz, v5\sz, v1.h[1] // out5
|
||||
neg \o3\()\sz, \o3\()\sz // out3
|
||||
neg \o5\()\sz, \o5\()\sz // out5
|
||||
movi v0.4s, #2896>>4
|
||||
|
||||
saddl_sz v18, v19, v2, v4, \sz // -> out3 (v19 or v20)
|
||||
ssubl_sz v6, v7, v2, v4, \sz // -> out4 (v20 or v19)
|
||||
ssubl_sz v20, v21, v3, v5, \sz // -> out5 (v21 or v18)
|
||||
saddl_sz v4, v5, v3, v5, \sz // -> out2 (v18 or v21)
|
||||
|
||||
mul_4s_sz v18, v19, v18, v19, v0.s[0], \sz
|
||||
mul_4s_sz v6, v7, v6, v7, v0.s[0], \sz
|
||||
mul_4s_sz v20, v21, v20, v21, v0.s[0], \sz
|
||||
mul_4s_sz v4, v5, v4, v5, v0.s[0], \sz
|
||||
|
||||
rshrn_sz v2, v18, v19, #8, \sz // out3
|
||||
rshrn_sz v3, v20, v21, #8, \sz // out5
|
||||
rshrn_sz \o2, v4, v5, #8, \sz // out2 (v18 or v21)
|
||||
rshrn_sz \o4, v6, v7, #8, \sz // out4 (v20 or v19)
|
||||
|
||||
sqneg \o3\()\sz, v2\sz // out3
|
||||
sqneg \o5\()\sz, v3\sz // out5
|
||||
.endm
|
||||
|
||||
function inv_adst_8x8_neon
|
||||
@ -1272,28 +1310,47 @@ endfunc
|
||||
sqsub v23\sz, v25\sz, v23\sz // t7
|
||||
sqneg \o3\sz, \o3\sz // out3
|
||||
|
||||
sqsub v24\sz, v2\sz, v21\sz // -> out8
|
||||
sqadd v2\sz, v2\sz, v21\sz // -> out7
|
||||
sqadd v21\sz, v26\sz, v3\sz // -> out5
|
||||
sqsub v26\sz, v26\sz, v3\sz // -> out10
|
||||
sqadd v3\sz, v27\sz, v20\sz // -> out6
|
||||
sqsub v25\sz, v27\sz, v20\sz // -> out9
|
||||
sqadd v20\sz, v22\sz, v23\sz // -> out4
|
||||
sqsub v27\sz, v22\sz, v23\sz // -> out11
|
||||
movi v0.4s, #2896>>4
|
||||
|
||||
sqrdmulh v2\sz, v2\sz, v0.h[1] // out7
|
||||
sqrdmulh v4\sz, v21\sz, v0.h[1] // out5
|
||||
sqrdmulh v5\sz, v25\sz, v0.h[1] // out9
|
||||
sqrdmulh v6\sz, v27\sz, v0.h[1] // out11
|
||||
sqrdmulh \o6\sz, v3\sz, v0.h[1] // out6
|
||||
sqrdmulh \o8\sz, v24\sz, v0.h[1] // out8
|
||||
sqrdmulh \o10\sz, v26\sz, v0.h[1] // out10
|
||||
sqrdmulh \o4\sz, v20\sz, v0.h[1] // out4
|
||||
ssubl_sz v24, v25, v2, v21, \sz // -> out8 (v24 or v23)
|
||||
saddl_sz v4, v5, v2, v21, \sz // -> out7 (v23 or v24)
|
||||
saddl_sz v6, v7, v26, v3, \sz // -> out5 (v21 or v26)
|
||||
ssubl_sz v2, v3, v26, v3, \sz // -> out10 (v26 or v21)
|
||||
|
||||
neg \o7\sz, v2\sz // out7
|
||||
neg \o5\sz, v4\sz // out5
|
||||
neg \o9\sz, v5\sz // out9
|
||||
neg \o11\sz, v6\sz // out11
|
||||
mul_4s_sz v24, v25, v24, v25, v0.s[0], \sz
|
||||
mul_4s_sz v4, v5, v4, v5, v0.s[0], \sz
|
||||
mul_4s_sz v6, v7, v6, v7, v0.s[0], \sz
|
||||
mul_4s_sz v2, v3, v2, v3, v0.s[0], \sz
|
||||
|
||||
rshrn_sz v24, v24, v25, #8, \sz // out8
|
||||
rshrn_sz v4, v4, v5, #8, \sz // out7
|
||||
rshrn_sz v5, v6, v7, #8, \sz // out5
|
||||
rshrn_sz v26, v2, v3, #8, \sz // out10
|
||||
|
||||
saddl_sz v2, v3, v22, v23, \sz // -> out4 (v20 or v27)
|
||||
ssubl_sz v6, v7, v22, v23, \sz // -> out11 (v27 or v20)
|
||||
saddl_sz v22, v23, v27, v20, \sz // -> out6 (v22 or v25)
|
||||
ssubl_sz v21, v25, v27, v20, \sz // -> out9 (v25 or v22)
|
||||
|
||||
mul_4s_sz v2, v3, v2, v3, v0.s[0], \sz
|
||||
mul_4s_sz v6, v7, v6, v7, v0.s[0], \sz
|
||||
mul_4s_sz v22, v23, v22, v23, v0.s[0], \sz
|
||||
mul_4s_sz v21, v25, v21, v25, v0.s[0], \sz
|
||||
|
||||
rshrn_sz \o4, v2, v3, #8, \sz // out4
|
||||
rshrn_sz v6, v6, v7, #8, \sz // out11
|
||||
rshrn_sz v7, v21, v25, #8, \sz // out9
|
||||
rshrn_sz \o6, v22, v23, #8, \sz // out6
|
||||
|
||||
.ifc \o8, v23
|
||||
mov \o8\szb, v24\szb
|
||||
mov \o10\szb, v26\szb
|
||||
.endif
|
||||
|
||||
sqneg \o7\sz, v4\sz // out7
|
||||
sqneg \o5\sz, v5\sz // out5
|
||||
sqneg \o11\sz, v6\sz // out11
|
||||
sqneg \o9\sz, v7\sz // out9
|
||||
.endm
|
||||
|
||||
function inv_adst_8x16_neon
|
||||
|
629
third_party/dav1d/src/arm/64/mc.S
vendored
629
third_party/dav1d/src/arm/64/mc.S
vendored
@ -234,6 +234,635 @@ bidir_fn w_avg
|
||||
bidir_fn mask
|
||||
|
||||
|
||||
// Generates w_mask_<type>_8bpc_neon for \type = 444, 422 or 420.
//
// Registers as used by the code below:
//   x0 = output pixels (8-bit), x1 = byte stride of x0
//   x2 = tmp1, x3 = tmp2 (16-bit intermediates, read contiguously)
//   w4 = width (selects the code path), w5 = row counter
//   x6 = output for the derived per-pixel weights
//   w7 = value subtracted from the rounding constant (422/420 only)
//
// Per pixel:
//   a   = sat_u16(6903 - |tmp1 - tmp2|) >> 8
//   out = sat_u8(round((tmp1 + (((a << 9) * (tmp2 - tmp1) * 2) >> 16)) >> 4))
// and the value written through x6 is derived from a: 64 - a for 444, a
// horizontally pair-summed and halved form for 422, and a 2x2-summed form
// for 420.
.macro w_mask_fn type
|
||||
function w_mask_\type\()_8bpc_neon, export=1
|
||||
clz w8, w4
|
||||
adr x9, L(w_mask_\type\()_tbl)
|
||||
sub w8, w8, #24 // table index: 0 for w=128 ... 5 for w=4
|
||||
ldrh w8, [x9, x8, lsl #1]
|
||||
sub x9, x9, w8, uxtw // entries hold (table - label), so subtract
|
||||
mov w10, #6903
|
||||
dup v0.8h, w10
|
||||
.if \type == 444
|
||||
movi v1.16b, #64
|
||||
.elseif \type == 422
|
||||
dup v2.8b, w7
|
||||
movi v3.8b, #129
|
||||
sub v3.8b, v3.8b, v2.8b // v3 = 129 - w7 in every byte lane
|
||||
.elseif \type == 420
|
||||
dup v2.8h, w7
|
||||
movi v3.8h, #1, lsl #8
|
||||
sub v3.8h, v3.8h, v2.8h // v3 = 256 - w7 in every 16-bit lane
|
||||
.endif
|
||||
add x12, x0, x1 // x12 = second output row
|
||||
lsl x1, x1, #1 // x0 and x12 each advance two rows at a time
|
||||
br x9
|
||||
4:
|
||||
ld1 {v4.8h, v5.8h}, [x2], #32 // tmp1 (four rows at once)
|
||||
ld1 {v6.8h, v7.8h}, [x3], #32 // tmp2 (four rows at once)
|
||||
subs w5, w5, #4
|
||||
sub v16.8h, v6.8h, v4.8h // tmp2 - tmp1
|
||||
sub v17.8h, v7.8h, v5.8h
|
||||
sabd v18.8h, v4.8h, v6.8h // |tmp1 - tmp2|
|
||||
sabd v19.8h, v5.8h, v7.8h
|
||||
uqsub v18.8h, v0.8h, v18.8h // 6903 - |diff|, saturating at 0
|
||||
uqsub v19.8h, v0.8h, v19.8h
|
||||
ushr v18.8h, v18.8h, #8
|
||||
ushr v19.8h, v19.8h, #8
|
||||
shl v20.8h, v18.8h, #9
|
||||
shl v21.8h, v19.8h, #9
|
||||
sqdmulh v20.8h, v20.8h, v16.8h
|
||||
sqdmulh v21.8h, v21.8h, v17.8h
|
||||
add v20.8h, v20.8h, v4.8h
|
||||
add v21.8h, v21.8h, v5.8h
|
||||
sqrshrun v22.8b, v20.8h, #4
|
||||
sqrshrun v23.8b, v21.8h, #4
|
||||
.if \type == 444
|
||||
xtn v18.8b, v18.8h
|
||||
xtn2 v18.16b, v19.8h
|
||||
sub v18.16b, v1.16b, v18.16b // 64 - a
|
||||
st1 {v18.16b}, [x6], #16
|
||||
.elseif \type == 422
|
||||
addp v18.8h, v18.8h, v19.8h // sum horizontal pairs
|
||||
xtn v18.8b, v18.8h
|
||||
uhsub v18.8b, v3.8b, v18.8b // (v3 - sum) >> 1
|
||||
st1 {v18.8b}, [x6], #8
|
||||
.elseif \type == 420
|
||||
trn1 v24.2d, v18.2d, v19.2d
|
||||
trn2 v25.2d, v18.2d, v19.2d
|
||||
add v24.8h, v24.8h, v25.8h // add vertically adjacent rows
|
||||
addp v18.8h, v24.8h, v24.8h // then horizontal pairs
|
||||
sub v18.4h, v3.4h, v18.4h
|
||||
rshrn v18.8b, v18.8h, #2
|
||||
st1 {v18.s}[0], [x6], #4
|
||||
.endif
|
||||
st1 {v22.s}[0], [x0], x1
|
||||
st1 {v22.s}[1], [x12], x1
|
||||
st1 {v23.s}[0], [x0], x1
|
||||
st1 {v23.s}[1], [x12], x1
|
||||
b.gt 4b
|
||||
ret
|
||||
8:
|
||||
ld1 {v4.8h, v5.8h}, [x2], #32 // two rows of 8 from the x2 source
|
||||
ld1 {v6.8h, v7.8h}, [x3], #32 // two rows of 8 from the x3 source
|
||||
subs w5, w5, #2
|
||||
sub v16.8h, v6.8h, v4.8h
|
||||
sub v17.8h, v7.8h, v5.8h
|
||||
sabd v18.8h, v4.8h, v6.8h
|
||||
sabd v19.8h, v5.8h, v7.8h
|
||||
uqsub v18.8h, v0.8h, v18.8h
|
||||
uqsub v19.8h, v0.8h, v19.8h
|
||||
ushr v18.8h, v18.8h, #8
|
||||
ushr v19.8h, v19.8h, #8
|
||||
shl v20.8h, v18.8h, #9
|
||||
shl v21.8h, v19.8h, #9
|
||||
sqdmulh v20.8h, v20.8h, v16.8h
|
||||
sqdmulh v21.8h, v21.8h, v17.8h
|
||||
add v20.8h, v20.8h, v4.8h
|
||||
add v21.8h, v21.8h, v5.8h
|
||||
sqrshrun v22.8b, v20.8h, #4
|
||||
sqrshrun v23.8b, v21.8h, #4
|
||||
.if \type == 444
|
||||
xtn v18.8b, v18.8h
|
||||
xtn2 v18.16b, v19.8h
|
||||
sub v18.16b, v1.16b, v18.16b
|
||||
st1 {v18.16b}, [x6], #16
|
||||
.elseif \type == 422
|
||||
addp v18.8h, v18.8h, v19.8h
|
||||
xtn v18.8b, v18.8h
|
||||
uhsub v18.8b, v3.8b, v18.8b
|
||||
st1 {v18.8b}, [x6], #8
|
||||
.elseif \type == 420
|
||||
add v18.8h, v18.8h, v19.8h // row 0 + row 1
|
||||
addp v18.8h, v18.8h, v18.8h
|
||||
sub v18.4h, v3.4h, v18.4h
|
||||
rshrn v18.8b, v18.8h, #2
|
||||
st1 {v18.s}[0], [x6], #4
|
||||
.endif
|
||||
st1 {v22.8b}, [x0], x1
|
||||
st1 {v23.8b}, [x12], x1
|
||||
b.gt 8b
|
||||
ret
|
||||
1280:
|
||||
640:
|
||||
320:
|
||||
160:
|
||||
mov w11, w4
|
||||
sub x1, x1, w4, uxtw // inner loop already advances x0/x12 by w bytes
|
||||
.if \type == 444
|
||||
add x10, x6, w4, uxtw // x10 = second row of the x6 output
|
||||
.elseif \type == 422
|
||||
add x10, x6, x11, lsr #1 // second row of the x6 output is w/2 bytes on
|
||||
.endif
|
||||
add x9, x3, w4, uxtw #1 // x9 = second row of the x3 source
|
||||
add x7, x2, w4, uxtw #1 // x7 = second row of the x2 source
|
||||
161:
|
||||
mov w8, w4 // w8 = remaining columns in this row pair
|
||||
16:
|
||||
ld1 {v4.8h, v5.8h}, [x2], #32
|
||||
ld1 {v6.8h, v7.8h}, [x3], #32
|
||||
ld1 {v16.8h, v17.8h}, [x7], #32
|
||||
ld1 {v18.8h, v19.8h}, [x9], #32
|
||||
subs w8, w8, #16
|
||||
sub v6.8h, v6.8h, v4.8h
|
||||
sub v7.8h, v7.8h, v5.8h
|
||||
sub v18.8h, v18.8h, v16.8h
|
||||
sub v19.8h, v19.8h, v17.8h
|
||||
abs v20.8h, v6.8h
|
||||
abs v21.8h, v7.8h
|
||||
abs v22.8h, v18.8h
|
||||
abs v23.8h, v19.8h
|
||||
uqsub v20.8h, v0.8h, v20.8h
|
||||
uqsub v21.8h, v0.8h, v21.8h
|
||||
uqsub v22.8h, v0.8h, v22.8h
|
||||
uqsub v23.8h, v0.8h, v23.8h
|
||||
ushr v20.8h, v20.8h, #8
|
||||
ushr v21.8h, v21.8h, #8
|
||||
ushr v22.8h, v22.8h, #8
|
||||
ushr v23.8h, v23.8h, #8
|
||||
shl v24.8h, v20.8h, #9
|
||||
shl v25.8h, v21.8h, #9
|
||||
shl v26.8h, v22.8h, #9
|
||||
shl v27.8h, v23.8h, #9
|
||||
sqdmulh v24.8h, v24.8h, v6.8h
|
||||
sqdmulh v25.8h, v25.8h, v7.8h
|
||||
sqdmulh v26.8h, v26.8h, v18.8h
|
||||
sqdmulh v27.8h, v27.8h, v19.8h
|
||||
add v24.8h, v24.8h, v4.8h
|
||||
add v25.8h, v25.8h, v5.8h
|
||||
add v26.8h, v26.8h, v16.8h
|
||||
add v27.8h, v27.8h, v17.8h
|
||||
sqrshrun v24.8b, v24.8h, #4
|
||||
sqrshrun v25.8b, v25.8h, #4
|
||||
sqrshrun v26.8b, v26.8h, #4
|
||||
sqrshrun v27.8b, v27.8h, #4
|
||||
.if \type == 444
|
||||
xtn v20.8b, v20.8h
|
||||
xtn2 v20.16b, v21.8h
|
||||
xtn v21.8b, v22.8h
|
||||
xtn2 v21.16b, v23.8h
|
||||
sub v20.16b, v1.16b, v20.16b
|
||||
sub v21.16b, v1.16b, v21.16b
|
||||
st1 {v20.16b}, [x6], #16
|
||||
st1 {v21.16b}, [x10], #16
|
||||
.elseif \type == 422
|
||||
addp v20.8h, v20.8h, v21.8h
|
||||
addp v21.8h, v22.8h, v23.8h
|
||||
xtn v20.8b, v20.8h
|
||||
xtn v21.8b, v21.8h
|
||||
uhsub v20.8b, v3.8b, v20.8b
|
||||
uhsub v21.8b, v3.8b, v21.8b
|
||||
st1 {v20.8b}, [x6], #8
|
||||
st1 {v21.8b}, [x10], #8
|
||||
.elseif \type == 420
|
||||
add v20.8h, v20.8h, v22.8h
|
||||
add v21.8h, v21.8h, v23.8h
|
||||
addp v20.8h, v20.8h, v21.8h
|
||||
sub v20.8h, v3.8h, v20.8h
|
||||
rshrn v20.8b, v20.8h, #2
|
||||
st1 {v20.8b}, [x6], #8
|
||||
.endif
|
||||
st1 {v24.8b, v25.8b}, [x0], #16
|
||||
st1 {v26.8b, v27.8b}, [x12], #16
|
||||
b.gt 16b
|
||||
subs w5, w5, #2
|
||||
add x2, x2, w4, uxtw #1 // skip the row the paired pointer just consumed
|
||||
add x3, x3, w4, uxtw #1
|
||||
add x7, x7, w4, uxtw #1
|
||||
add x9, x9, w4, uxtw #1
|
||||
.if \type == 444
|
||||
add x6, x6, w4, uxtw
|
||||
add x10, x10, w4, uxtw
|
||||
.elseif \type == 422
|
||||
add x6, x6, x11, lsr #1
|
||||
add x10, x10, x11, lsr #1
|
||||
.endif
|
||||
add x0, x0, x1
|
||||
add x12, x12, x1
|
||||
b.gt 161b
|
||||
ret
|
||||
L(w_mask_\type\()_tbl):
|
||||
.hword L(w_mask_\type\()_tbl) - 1280b
|
||||
.hword L(w_mask_\type\()_tbl) - 640b
|
||||
.hword L(w_mask_\type\()_tbl) - 320b
|
||||
.hword L(w_mask_\type\()_tbl) - 160b
|
||||
.hword L(w_mask_\type\()_tbl) - 8b
|
||||
.hword L(w_mask_\type\()_tbl) - 4b
|
||||
endfunc
|
||||
.endm
|
||||
|
||||
w_mask_fn 444
|
||||
w_mask_fn 422
|
||||
w_mask_fn 420
|
||||
|
||||
|
||||
// blend_8bpc_neon: per-pixel weighted blend, two rows per iteration.
//
// Registers as used by the code below:
//   x0 = destination (read and written), x1 = byte stride of x0
//   x2 = second source, read contiguously
//   w3 = width (4, 8, 16 or 32; selects the code path)
//   w4 = row counter, decremented by 2 per iteration
//   x5 = per-pixel weights m, read contiguously
//
// Per pixel: dst = (src * m + dst * (64 - m) + 32) >> 6
function blend_8bpc_neon, export=1
|
||||
adr x6, L(blend_tbl)
|
||||
clz w3, w3
|
||||
sub w3, w3, #26 // table index: 0 for w=32 ... 3 for w=4
|
||||
ldrh w3, [x6, x3, lsl #1]
|
||||
sub x6, x6, w3, uxtw // entries hold (table - label), so subtract
|
||||
movi v4.16b, #64
|
||||
add x8, x0, x1 // x8 = second destination row
|
||||
// Double the stride in the full 64-bit register: the stores below
|
||||
// post-index by x1, and writing w1 would zero its upper 32 bits.
lsl x1, x1, #1
|
||||
br x6
|
||||
4:
|
||||
ld1 {v2.d}[0], [x5], #8 // weights for two rows of 4
|
||||
ld1 {v1.d}[0], [x2], #8 // source for two rows of 4
|
||||
ld1 {v0.s}[0], [x0]
|
||||
subs w4, w4, #2
|
||||
ld1 {v0.s}[1], [x8]
|
||||
sub v3.8b, v4.8b, v2.8b // 64 - m
|
||||
umull v5.8h, v1.8b, v2.8b
|
||||
umlal v5.8h, v0.8b, v3.8b
|
||||
rshrn v6.8b, v5.8h, #6
|
||||
st1 {v6.s}[0], [x0], x1
|
||||
st1 {v6.s}[1], [x8], x1
|
||||
b.gt 4b
|
||||
ret
|
||||
8:
|
||||
ld1 {v2.2d}, [x5], #16
|
||||
ld1 {v1.2d}, [x2], #16
|
||||
ld1 {v0.d}[0], [x0]
|
||||
ld1 {v0.d}[1], [x8]
|
||||
sub v3.16b, v4.16b, v2.16b
|
||||
subs w4, w4, #2
|
||||
umull v5.8h, v1.8b, v2.8b
|
||||
umlal v5.8h, v0.8b, v3.8b
|
||||
umull2 v6.8h, v1.16b, v2.16b
|
||||
umlal2 v6.8h, v0.16b, v3.16b
|
||||
rshrn v7.8b, v5.8h, #6
|
||||
rshrn2 v7.16b, v6.8h, #6
|
||||
st1 {v7.d}[0], [x0], x1
|
||||
st1 {v7.d}[1], [x8], x1
|
||||
b.gt 8b
|
||||
ret
|
||||
16:
|
||||
ld1 {v1.2d, v2.2d}, [x5], #32
|
||||
ld1 {v5.2d, v6.2d}, [x2], #32
|
||||
ld1 {v0.2d}, [x0]
|
||||
subs w4, w4, #2
|
||||
sub v7.16b, v4.16b, v1.16b
|
||||
sub v20.16b, v4.16b, v2.16b
|
||||
ld1 {v3.2d}, [x8]
|
||||
umull v16.8h, v5.8b, v1.8b
|
||||
umlal v16.8h, v0.8b, v7.8b
|
||||
umull2 v17.8h, v5.16b, v1.16b
|
||||
umlal2 v17.8h, v0.16b, v7.16b
|
||||
umull v21.8h, v6.8b, v2.8b
|
||||
umlal v21.8h, v3.8b, v20.8b
|
||||
umull2 v22.8h, v6.16b, v2.16b
|
||||
umlal2 v22.8h, v3.16b, v20.16b
|
||||
rshrn v18.8b, v16.8h, #6
|
||||
rshrn2 v18.16b, v17.8h, #6
|
||||
rshrn v19.8b, v21.8h, #6
|
||||
rshrn2 v19.16b, v22.8h, #6
|
||||
st1 {v18.2d}, [x0], x1
|
||||
st1 {v19.2d}, [x8], x1
|
||||
b.gt 16b
|
||||
ret
|
||||
32:
|
||||
ld1 {v0.2d, v1.2d, v2.2d, v3.2d}, [x5], #64
|
||||
ld1 {v16.2d, v17.2d, v18.2d, v19.2d}, [x2], #64
|
||||
ld1 {v20.2d, v21.2d}, [x0]
|
||||
subs w4, w4, #2
|
||||
ld1 {v22.2d, v23.2d}, [x8]
|
||||
sub v5.16b, v4.16b, v0.16b
|
||||
sub v6.16b, v4.16b, v1.16b
|
||||
sub v30.16b, v4.16b, v2.16b
|
||||
sub v31.16b, v4.16b, v3.16b
|
||||
umull v24.8h, v16.8b, v0.8b
|
||||
umlal v24.8h, v20.8b, v5.8b
|
||||
umull2 v26.8h, v16.16b, v0.16b
|
||||
umlal2 v26.8h, v20.16b, v5.16b
|
||||
umull v28.8h, v17.8b, v1.8b
|
||||
umlal v28.8h, v21.8b, v6.8b
|
||||
umull2 v7.8h, v17.16b, v1.16b
|
||||
umlal2 v7.8h, v21.16b, v6.16b
|
||||
umull v27.8h, v18.8b, v2.8b
|
||||
umlal v27.8h, v22.8b, v30.8b
|
||||
umull2 v1.8h, v18.16b, v2.16b
|
||||
umlal2 v1.8h, v22.16b, v30.16b
|
||||
umull v29.8h, v19.8b, v3.8b
|
||||
umlal v29.8h, v23.8b, v31.8b
|
||||
umull2 v21.8h, v19.16b, v3.16b
|
||||
umlal2 v21.8h, v23.16b, v31.16b
|
||||
rshrn v24.8b, v24.8h, #6
|
||||
rshrn2 v24.16b, v26.8h, #6
|
||||
rshrn v25.8b, v28.8h, #6
|
||||
rshrn2 v25.16b, v7.8h, #6
|
||||
rshrn v27.8b, v27.8h, #6
|
||||
rshrn2 v27.16b, v1.8h, #6
|
||||
rshrn v28.8b, v29.8h, #6
|
||||
rshrn2 v28.16b, v21.8h, #6
|
||||
st1 {v24.2d, v25.2d}, [x0], x1
|
||||
st1 {v27.2d, v28.2d}, [x8], x1
|
||||
b.gt 32b
|
||||
ret
|
||||
L(blend_tbl):
|
||||
.hword L(blend_tbl) - 32b
|
||||
.hword L(blend_tbl) - 16b
|
||||
.hword L(blend_tbl) - 8b
|
||||
.hword L(blend_tbl) - 4b
|
||||
endfunc
|
||||
|
||||
// blend_h_8bpc_neon: blend with one weight per row, two rows per iteration.
//
// Registers as used by the code below:
//   x0 = destination (read and written), x1 = byte stride of x0
//   x2 = second source, read contiguously
//   w3 = width (selects the code path), w4 = height
//   x5 = weight pointer, set to obmc_masks + height
//
// Only height - (height >> 2) rows are processed.
// Per pixel: dst = (src * m + dst * (64 - m) + 32) >> 6, m = weight of the row.
function blend_h_8bpc_neon, export=1
|
||||
adr x6, L(blend_h_tbl)
|
||||
movrel x5, X(obmc_masks)
|
||||
add x5, x5, w4, uxtw // weights start at obmc_masks[h]
|
||||
sub w4, w4, w4, lsr #2 // row count = h - h/4
|
||||
clz w7, w3
|
||||
movi v4.16b, #64
|
||||
add x8, x0, x1 // x8 = second destination row
|
||||
lsl x1, x1, #1
|
||||
sub w7, w7, #24 // table index: 0 for w=128 ... 6 for w=2
|
||||
ldrh w7, [x6, x7, lsl #1]
|
||||
sub x6, x6, w7, uxtw // entries hold (table - label), so subtract
|
||||
br x6
|
||||
2:
|
||||
ld1 {v0.h}[0], [x5], #2 // weights for two rows
|
||||
ld1 {v1.s}[0], [x2], #4
|
||||
subs w4, w4, #2
|
||||
ld1 {v2.h}[0], [x0]
|
||||
zip1 v0.8b, v0.8b, v0.8b // lanes become m0 m0 m1 m1
|
||||
sub v3.8b, v4.8b, v0.8b
|
||||
ld1 {v2.h}[1], [x8]
|
||||
umull v5.8h, v1.8b, v0.8b
|
||||
umlal v5.8h, v2.8b, v3.8b
|
||||
rshrn v5.8b, v5.8h, #6
|
||||
st1 {v5.h}[0], [x0], x1
|
||||
st1 {v5.h}[1], [x8], x1
|
||||
b.gt 2b
|
||||
ret
|
||||
4:
|
||||
ld2r {v0.8b, v1.8b}, [x5], #2 // v0 = m0 replicated, v1 = m1 replicated
|
||||
ld1 {v2.2s}, [x2], #8
|
||||
subs w4, w4, #2
|
||||
ext v0.8b, v0.8b, v1.8b, #4 // four lanes of m0, then four of m1
|
||||
ld1 {v3.s}[0], [x0]
|
||||
sub v5.8b, v4.8b, v0.8b
|
||||
ld1 {v3.s}[1], [x8]
|
||||
umull v6.8h, v2.8b, v0.8b
|
||||
umlal v6.8h, v3.8b, v5.8b
|
||||
rshrn v6.8b, v6.8h, #6
|
||||
st1 {v6.s}[0], [x0], x1
|
||||
st1 {v6.s}[1], [x8], x1
|
||||
b.gt 4b
|
||||
ret
|
||||
8:
|
||||
ld2r {v0.16b, v1.16b}, [x5], #2
|
||||
ld1 {v2.16b}, [x2], #16
|
||||
ld1 {v3.d}[0], [x0]
|
||||
ext v0.16b, v0.16b, v1.16b, #8 // eight lanes of m0, then eight of m1
|
||||
sub v5.16b, v4.16b, v0.16b
|
||||
ld1 {v3.d}[1], [x8]
|
||||
subs w4, w4, #2
|
||||
umull v6.8h, v0.8b, v2.8b
|
||||
umlal v6.8h, v3.8b, v5.8b
|
||||
umull2 v7.8h, v0.16b, v2.16b
|
||||
umlal2 v7.8h, v3.16b, v5.16b
|
||||
rshrn v16.8b, v6.8h, #6
|
||||
rshrn2 v16.16b, v7.8h, #6
|
||||
st1 {v16.d}[0], [x0], x1
|
||||
st1 {v16.d}[1], [x8], x1
|
||||
b.gt 8b
|
||||
ret
|
||||
16:
|
||||
ld2r {v0.16b, v1.16b}, [x5], #2 // v0 = row 0 weight, v1 = row 1 weight
|
||||
ld1 {v2.16b, v3.16b}, [x2], #32
|
||||
ld1 {v5.16b}, [x0]
|
||||
sub v7.16b, v4.16b, v0.16b
|
||||
sub v16.16b, v4.16b, v1.16b
|
||||
ld1 {v6.16b}, [x8]
|
||||
subs w4, w4, #2
|
||||
umull v17.8h, v0.8b, v2.8b
|
||||
umlal v17.8h, v5.8b, v7.8b
|
||||
umull2 v18.8h, v0.16b, v2.16b
|
||||
umlal2 v18.8h, v5.16b, v7.16b
|
||||
umull v19.8h, v1.8b, v3.8b
|
||||
umlal v19.8h, v6.8b, v16.8b
|
||||
umull2 v20.8h, v1.16b, v3.16b
|
||||
umlal2 v20.8h, v6.16b, v16.16b
|
||||
rshrn v21.8b, v17.8h, #6
|
||||
rshrn2 v21.16b, v18.8h, #6
|
||||
rshrn v22.8b, v19.8h, #6
|
||||
rshrn2 v22.16b, v20.8h, #6
|
||||
st1 {v21.16b}, [x0], x1
|
||||
st1 {v22.16b}, [x8], x1
|
||||
b.gt 16b
|
||||
ret
|
||||
1280:
|
||||
640:
|
||||
320:
|
||||
sub x1, x1, w3, uxtw // inner loop already advances x0/x8 by w bytes
|
||||
add x7, x2, w3, uxtw // x7 = second row of the x2 source
|
||||
321:
|
||||
ld2r {v0.16b, v1.16b}, [x5], #2
|
||||
mov w6, w3 // w6 = remaining columns in this row pair
|
||||
sub v20.16b, v4.16b, v0.16b
|
||||
sub v21.16b, v4.16b, v1.16b
|
||||
32:
|
||||
ld1 {v16.16b, v17.16b}, [x2], #32
|
||||
ld1 {v2.16b, v3.16b}, [x0]
|
||||
subs w6, w6, #32
|
||||
umull v23.8h, v0.8b, v16.8b
|
||||
umlal v23.8h, v2.8b, v20.8b
|
||||
ld1 {v18.16b, v19.16b}, [x7], #32
|
||||
umull2 v27.8h, v0.16b, v16.16b
|
||||
umlal2 v27.8h, v2.16b, v20.16b
|
||||
ld1 {v6.16b, v7.16b}, [x8]
|
||||
umull v24.8h, v0.8b, v17.8b
|
||||
umlal v24.8h, v3.8b, v20.8b
|
||||
umull2 v28.8h, v0.16b, v17.16b
|
||||
umlal2 v28.8h, v3.16b, v20.16b
|
||||
umull v25.8h, v1.8b, v18.8b
|
||||
umlal v25.8h, v6.8b, v21.8b
|
||||
umull2 v5.8h, v1.16b, v18.16b
|
||||
umlal2 v5.8h, v6.16b, v21.16b
|
||||
rshrn v29.8b, v23.8h, #6
|
||||
rshrn2 v29.16b, v27.8h, #6
|
||||
umull v26.8h, v1.8b, v19.8b
|
||||
umlal v26.8h, v7.8b, v21.8b
|
||||
umull2 v31.8h, v1.16b, v19.16b
|
||||
umlal2 v31.8h, v7.16b, v21.16b
|
||||
rshrn v30.8b, v24.8h, #6
|
||||
rshrn2 v30.16b, v28.8h, #6
|
||||
rshrn v23.8b, v25.8h, #6
|
||||
rshrn2 v23.16b, v5.8h, #6
|
||||
rshrn v24.8b, v26.8h, #6
|
||||
st1 {v29.16b, v30.16b}, [x0], #32
|
||||
rshrn2 v24.16b, v31.8h, #6
|
||||
st1 {v23.16b, v24.16b}, [x8], #32
|
||||
b.gt 32b
|
||||
subs w4, w4, #2
|
||||
add x0, x0, x1
|
||||
add x8, x8, x1
|
||||
add x2, x2, w3, uxtw // skip the row x7 just consumed
|
||||
add x7, x7, w3, uxtw // skip the row x2 will consume next
|
||||
b.gt 321b
|
||||
ret
|
||||
L(blend_h_tbl):
|
||||
.hword L(blend_h_tbl) - 1280b
|
||||
.hword L(blend_h_tbl) - 640b
|
||||
.hword L(blend_h_tbl) - 320b
|
||||
.hword L(blend_h_tbl) - 16b
|
||||
.hword L(blend_h_tbl) - 8b
|
||||
.hword L(blend_h_tbl) - 4b
|
||||
.hword L(blend_h_tbl) - 2b
|
||||
endfunc
|
||||
|
||||
// blend_v_8bpc_neon: blend with one weight per column, two rows per iteration.
//
// Registers as used by the code below:
//   x0 = destination (read and written), x1 = byte stride of x0
//   x2 = second source, read contiguously (full width per row)
//   w3 = width (2, 4, 8, 16 or 32; selects the code path)
//   w4 = row counter, decremented by 2 per iteration
//   x5 = weight pointer, set to obmc_masks + width
//
// Only the first 1, 3, 6, 12 or 24 bytes of each row are stored for
// widths 2, 4, 8, 16 and 32 respectively.
// Per pixel: dst = (src * m + dst * (64 - m) + 32) >> 6, m = weight of the column.
function blend_v_8bpc_neon, export=1
|
||||
adr x6, L(blend_v_tbl)
|
||||
movrel x5, X(obmc_masks)
|
||||
add x5, x5, w3, uxtw // weights start at obmc_masks[w]
|
||||
clz w3, w3
|
||||
movi v4.16b, #64
|
||||
add x8, x0, x1 // x8 = second destination row
|
||||
lsl x1, x1, #1
|
||||
sub w3, w3, #26 // table index: 0 for w=32 ... 4 for w=2
|
||||
ldrh w3, [x6, x3, lsl #1]
|
||||
sub x6, x6, w3, uxtw // entries hold (table - label), so subtract
|
||||
br x6
|
||||
20:
|
||||
ld1r {v0.8b}, [x5] // single weight, replicated
|
||||
sub v1.8b, v4.8b, v0.8b
|
||||
2:
|
||||
ld1 {v2.h}[0], [x2], #2
|
||||
ld1 {v3.b}[0], [x0]
|
||||
subs w4, w4, #2
|
||||
ld1 {v2.b}[1], [x2] // first byte of the next source row
|
||||
ld1 {v3.b}[1], [x8]
|
||||
umull v5.8h, v2.8b, v0.8b
|
||||
umlal v5.8h, v3.8b, v1.8b
|
||||
rshrn v5.8b, v5.8h, #6
|
||||
add x2, x2, #2
|
||||
st1 {v5.b}[0], [x0], x1
|
||||
st1 {v5.b}[1], [x8], x1
|
||||
b.gt 2b
|
||||
ret
|
||||
40:
|
||||
ld1r {v0.2s}, [x5] // four weights, repeated for the second row
|
||||
sub v1.8b, v4.8b, v0.8b
|
||||
sub x1, x1, #3 // the stores below post-increment by 3 bytes
|
||||
4:
|
||||
ld1 {v2.8b}, [x2], #8
|
||||
ld1 {v3.s}[0], [x0]
|
||||
ld1 {v3.s}[1], [x8]
|
||||
subs w4, w4, #2
|
||||
umull v5.8h, v2.8b, v0.8b
|
||||
umlal v5.8h, v3.8b, v1.8b
|
||||
rshrn v5.8b, v5.8h, #6
|
||||
st1 {v5.h}[0], [x0], #2
|
||||
st1 {v5.h}[2], [x8], #2
|
||||
st1 {v5.b}[2], [x0], #1
|
||||
st1 {v5.b}[6], [x8], #1
|
||||
add x0, x0, x1
|
||||
add x8, x8, x1
|
||||
b.gt 4b
|
||||
ret
|
||||
80:
|
||||
ld1r {v0.2d}, [x5] // eight weights, repeated for the second row
|
||||
sub v1.16b, v4.16b, v0.16b
|
||||
sub x1, x1, #6 // the stores below post-increment by 6 bytes
|
||||
8:
|
||||
ld1 {v2.16b}, [x2], #16
|
||||
ld1 {v3.d}[0], [x0]
|
||||
ld1 {v3.d}[1], [x8]
|
||||
subs w4, w4, #2
|
||||
umull v5.8h, v0.8b, v2.8b
|
||||
umlal v5.8h, v3.8b, v1.8b
|
||||
umull2 v6.8h, v0.16b, v2.16b
|
||||
umlal2 v6.8h, v3.16b, v1.16b
|
||||
rshrn v7.8b, v5.8h, #6
|
||||
rshrn2 v7.16b, v6.8h, #6
|
||||
st1 {v7.s}[0], [x0], #4
|
||||
st1 {v7.s}[2], [x8], #4
|
||||
st1 {v7.h}[2], [x0], #2
|
||||
st1 {v7.h}[6], [x8], #2
|
||||
add x0, x0, x1
|
||||
add x8, x8, x1
|
||||
b.gt 8b
|
||||
ret
|
||||
160:
|
||||
ld1 {v0.16b}, [x5] // sixteen weights
|
||||
sub v2.16b, v4.16b, v0.16b
|
||||
sub x1, x1, #12 // the stores below post-increment by 12 bytes
|
||||
16:
|
||||
ld1 {v5.16b, v6.16b}, [x2], #32
|
||||
ld1 {v7.16b}, [x0]
|
||||
subs w4, w4, #2
|
||||
ld1 {v16.16b}, [x8]
|
||||
umull v17.8h, v5.8b, v0.8b
|
||||
umlal v17.8h, v7.8b, v2.8b
|
||||
umull2 v18.8h, v5.16b, v0.16b
|
||||
umlal2 v18.8h, v7.16b, v2.16b
|
||||
umull v20.8h, v6.8b, v0.8b
|
||||
umlal v20.8h, v16.8b, v2.8b
|
||||
umull2 v21.8h, v6.16b, v0.16b
|
||||
umlal2 v21.8h, v16.16b, v2.16b
|
||||
rshrn v19.8b, v17.8h, #6
|
||||
rshrn2 v19.16b, v18.8h, #6
|
||||
rshrn v22.8b, v20.8h, #6
|
||||
rshrn2 v22.16b, v21.8h, #6
|
||||
st1 {v19.8b}, [x0], #8
|
||||
st1 {v22.8b}, [x8], #8
|
||||
st1 {v19.s}[2], [x0], #4
|
||||
st1 {v22.s}[2], [x8], #4
|
||||
add x0, x0, x1
|
||||
add x8, x8, x1
|
||||
b.gt 16b
|
||||
ret
|
||||
320:
|
||||
ld1 {v0.16b, v1.16b}, [x5] // thirty-two weights
|
||||
sub v2.16b, v4.16b, v0.16b
|
||||
sub v3.16b, v4.16b, v1.16b
|
||||
sub x1, x1, #24 // the stores below post-increment by 24 bytes
|
||||
32:
|
||||
ld1 {v16.16b, v17.16b, v18.16b, v19.16b}, [x2], #64
|
||||
ld1 {v5.16b, v6.16b}, [x0]
|
||||
subs w4, w4, #2
|
||||
ld1 {v20.16b, v21.16b}, [x8]
|
||||
umull v22.8h, v16.8b, v0.8b
|
||||
umlal v22.8h, v5.8b, v2.8b
|
||||
umull2 v23.8h, v16.16b, v0.16b
|
||||
umlal2 v23.8h, v5.16b, v2.16b
|
||||
umull v28.8h, v17.8b, v1.8b
|
||||
umlal v28.8h, v6.8b, v3.8b
|
||||
umull2 v29.8h, v17.16b, v1.16b
|
||||
umlal2 v29.8h, v6.16b, v3.16b
|
||||
umull v30.8h, v18.8b, v0.8b
|
||||
umlal v30.8h, v20.8b, v2.8b
|
||||
umull2 v31.8h, v18.16b, v0.16b
|
||||
umlal2 v31.8h, v20.16b, v2.16b
|
||||
umull v25.8h, v19.8b, v1.8b
|
||||
umlal v25.8h, v21.8b, v3.8b
|
||||
umull2 v26.8h, v19.16b, v1.16b
|
||||
umlal2 v26.8h, v21.16b, v3.16b
|
||||
rshrn v24.8b, v22.8h, #6
|
||||
rshrn2 v24.16b, v23.8h, #6
|
||||
rshrn v28.8b, v28.8h, #6
|
||||
rshrn2 v28.16b, v29.8h, #6
|
||||
rshrn v30.8b, v30.8h, #6
|
||||
rshrn2 v30.16b, v31.8h, #6
|
||||
rshrn v27.8b, v25.8h, #6
|
||||
rshrn2 v27.16b, v26.8h, #6
|
||||
st1 {v24.16b}, [x0], #16
|
||||
st1 {v30.16b}, [x8], #16
|
||||
st1 {v28.8b}, [x0], #8
|
||||
st1 {v27.8b}, [x8], #8
|
||||
add x0, x0, x1
|
||||
add x8, x8, x1
|
||||
b.gt 32b
|
||||
ret
|
||||
L(blend_v_tbl):
|
||||
.hword L(blend_v_tbl) - 320b
|
||||
.hword L(blend_v_tbl) - 160b
|
||||
.hword L(blend_v_tbl) - 80b
|
||||
.hword L(blend_v_tbl) - 40b
|
||||
.hword L(blend_v_tbl) - 20b
|
||||
endfunc
|
||||
|
||||
|
||||
// This has got the same signature as the put_8tap functions,
|
||||
// and assumes that x8 is set to (clz(w)-24).
|
||||
function put_neon
|
||||
|
25
third_party/dav1d/src/arm/64/msac.S
vendored
25
third_party/dav1d/src/arm/64/msac.S
vendored
@ -148,7 +148,7 @@ function msac_decode_symbol_adapt4_neon, export=1
|
||||
add x8, x0, #RNG
|
||||
ld1_n v0, v1, x1, \sz, \n // cdf
|
||||
ld1r {v4\sz}, [x8] // rng
|
||||
movrel x9, coeffs, 32
|
||||
movrel x9, coeffs, 30
|
||||
sub x9, x9, x2, lsl #1
|
||||
ushr_n v2, v3, v0, v1, #6, \sz, \n // cdf >> EC_PROB_SHIFT
|
||||
str h4, [sp, #14] // store original u = s->rng
|
||||
@ -183,16 +183,24 @@ function msac_decode_symbol_adapt4_neon, export=1
|
||||
// update_cdf
|
||||
ldrh w3, [x1, x2, lsl #1] // count = cdf[n_symbols]
|
||||
movi v5\szb, #0xff
|
||||
cmp x2, #4 // set C if n_symbols >= 4 (n_symbols > 3)
|
||||
mov w14, #4
|
||||
lsr w4, w3, #4 // count >> 4
|
||||
.if \n == 16
|
||||
mov w4, #-5
|
||||
.else
|
||||
mvn w14, w2
|
||||
mov w4, #-4
|
||||
cmn w14, #3 // set C if n_symbols <= 2
|
||||
.endif
|
||||
urhadd_n v4, v5, v5, v5, v2, v3, \sz, \n // i >= val ? -1 : 32768
|
||||
adc w4, w4, w14 // (count >> 4) + (n_symbols > 3) + 4
|
||||
neg w4, w4 // -rate
|
||||
.if \n == 16
|
||||
sub w4, w4, w3, lsr #4 // -((count >> 4) + 5)
|
||||
.else
|
||||
lsr w14, w3, #4 // count >> 4
|
||||
sbc w4, w4, w14 // -((count >> 4) + (n_symbols > 2) + 4)
|
||||
.endif
|
||||
sub_n v4, v5, v4, v5, v0, v1, \sz, \n // (32768 - cdf[i]) or (-1 - cdf[i])
|
||||
dup v6.8h, w4 // -rate
|
||||
|
||||
sub w3, w3, w3, lsr #5 // count - (count >= 32)
|
||||
sub w3, w3, w3, lsr #5 // count - (count == 32)
|
||||
sub_n v0, v1, v0, v1, v2, v3, \sz, \n // cdf + (i >= val ? 1 : 0)
|
||||
sshl_n v4, v5, v4, v5, v6, v6, \sz, \n // ({32768,-1} - cdf[i]) >> rate
|
||||
add w3, w3, #1 // count + (count < 32)
|
||||
@ -224,8 +232,7 @@ L(renorm2):
|
||||
b.ge 9f
|
||||
|
||||
// refill
|
||||
ldr x3, [x0, #BUF_POS]
|
||||
ldr x4, [x0, #BUF_END]
|
||||
ldp x3, x4, [x0] // BUF_POS, BUF_END
|
||||
add x5, x3, #8
|
||||
cmp x5, x4
|
||||
b.gt 2f
|
||||
|
7
third_party/dav1d/src/arm/mc_init_tmpl.c
vendored
7
third_party/dav1d/src/arm/mc_init_tmpl.c
vendored
@ -101,16 +101,15 @@ void bitfn(dav1d_mc_dsp_init_arm)(Dav1dMCDSPContext *const c) {
|
||||
c->avg = dav1d_avg_8bpc_neon;
|
||||
c->w_avg = dav1d_w_avg_8bpc_neon;
|
||||
c->mask = dav1d_mask_8bpc_neon;
|
||||
#if ARCH_AARCH64
|
||||
c->warp8x8 = dav1d_warp_affine_8x8_8bpc_neon;
|
||||
c->warp8x8t = dav1d_warp_affine_8x8t_8bpc_neon;
|
||||
#elif ARCH_ARM
|
||||
c->blend = dav1d_blend_8bpc_neon;
|
||||
c->blend_h = dav1d_blend_h_8bpc_neon;
|
||||
c->blend_v = dav1d_blend_v_8bpc_neon;
|
||||
c->w_mask[0] = dav1d_w_mask_444_8bpc_neon;
|
||||
c->w_mask[1] = dav1d_w_mask_422_8bpc_neon;
|
||||
c->w_mask[2] = dav1d_w_mask_420_8bpc_neon;
|
||||
#if ARCH_AARCH64
|
||||
c->warp8x8 = dav1d_warp_affine_8x8_8bpc_neon;
|
||||
c->warp8x8t = dav1d_warp_affine_8x8t_8bpc_neon;
|
||||
#endif
|
||||
#endif
|
||||
}
|
||||
|
1
third_party/dav1d/src/cdef.h
vendored
1
third_party/dav1d/src/cdef.h
vendored
@ -67,6 +67,7 @@ typedef struct Dav1dCdefDSPContext {
|
||||
|
||||
bitfn_decls(void dav1d_cdef_dsp_init, Dav1dCdefDSPContext *c);
|
||||
bitfn_decls(void dav1d_cdef_dsp_init_arm, Dav1dCdefDSPContext *c);
|
||||
bitfn_decls(void dav1d_cdef_dsp_init_ppc, Dav1dCdefDSPContext *c);
|
||||
bitfn_decls(void dav1d_cdef_dsp_init_x86, Dav1dCdefDSPContext *c);
|
||||
|
||||
#endif /* DAV1D_SRC_CDEF_H */
|
||||
|
3
third_party/dav1d/src/cdef_tmpl.c
vendored
3
third_party/dav1d/src/cdef_tmpl.c
vendored
@ -27,7 +27,6 @@
|
||||
|
||||
#include "config.h"
|
||||
|
||||
#include <assert.h>
|
||||
#include <stdlib.h>
|
||||
|
||||
#include "common/intops.h"
|
||||
@ -263,6 +262,8 @@ COLD void bitfn(dav1d_cdef_dsp_init)(Dav1dCdefDSPContext *const c) {
|
||||
#if HAVE_ASM
|
||||
#if ARCH_AARCH64 || ARCH_ARM
|
||||
bitfn(dav1d_cdef_dsp_init_arm)(c);
|
||||
#elif ARCH_PPC64LE
|
||||
bitfn(dav1d_cdef_dsp_init_ppc)(c);
|
||||
#elif ARCH_X86
|
||||
bitfn(dav1d_cdef_dsp_init_x86)(c);
|
||||
#endif
|
||||
|
7718
third_party/dav1d/src/cdf.c
vendored
7718
third_party/dav1d/src/cdf.c
vendored
File diff suppressed because it is too large
Load Diff
145
third_party/dav1d/src/cdf.h
vendored
145
third_party/dav1d/src/cdf.h
vendored
@ -37,91 +37,94 @@
|
||||
/* Buffers padded to [8] or [16] for SIMD where needed. */
|
||||
|
||||
typedef struct CdfModeContext {
|
||||
uint16_t y_mode[4][N_INTRA_PRED_MODES + 1 + 2];
|
||||
uint16_t use_filter_intra[N_BS_SIZES][2];
|
||||
uint16_t filter_intra[5 + 1];
|
||||
uint16_t uv_mode[2][N_INTRA_PRED_MODES][N_UV_INTRA_PRED_MODES + 1 + 1];
|
||||
uint16_t angle_delta[8][8];
|
||||
uint16_t filter[2][8][DAV1D_N_SWITCHABLE_FILTERS + 1];
|
||||
uint16_t newmv_mode[6][2];
|
||||
uint16_t globalmv_mode[2][2];
|
||||
uint16_t refmv_mode[6][2];
|
||||
uint16_t drl_bit[3][2];
|
||||
uint16_t comp_inter_mode[8][N_COMP_INTER_PRED_MODES + 1];
|
||||
uint16_t intra[4][2];
|
||||
uint16_t comp[5][2];
|
||||
uint16_t comp_dir[5][2];
|
||||
uint16_t jnt_comp[6][2];
|
||||
uint16_t mask_comp[6][2];
|
||||
uint16_t wedge_comp[9][2];
|
||||
uint16_t wedge_idx[9][16 + 1];
|
||||
uint16_t interintra[7][2];
|
||||
uint16_t interintra_mode[4][5];
|
||||
uint16_t interintra_wedge[7][2];
|
||||
uint16_t ref[6][3][2];
|
||||
uint16_t comp_fwd_ref[3][3][2];
|
||||
uint16_t comp_bwd_ref[2][3][2];
|
||||
uint16_t comp_uni_ref[3][3][2];
|
||||
uint16_t txsz[N_TX_SIZES - 1][3][4];
|
||||
uint16_t txpart[7][3][2];
|
||||
uint16_t txtp_inter[4][N_TX_SIZES][N_TX_TYPES + 1];
|
||||
uint16_t txtp_intra[3][N_TX_SIZES][N_INTRA_PRED_MODES][N_TX_TYPES + 1];
|
||||
uint16_t skip[3][2];
|
||||
uint16_t skip_mode[3][2];
|
||||
uint16_t partition[N_BL_LEVELS][4][N_PARTITIONS + 1 + 5];
|
||||
uint16_t seg_pred[3][2];
|
||||
uint16_t seg_id[3][DAV1D_MAX_SEGMENTS + 1];
|
||||
uint16_t cfl_sign[8 + 1];
|
||||
uint16_t cfl_alpha[6][16 + 1];
|
||||
uint16_t restore_wiener[2];
|
||||
uint16_t restore_sgrproj[2];
|
||||
uint16_t restore_switchable[3 + 1];
|
||||
uint16_t delta_q[4 + 1];
|
||||
uint16_t delta_lf[5][4 + 1];
|
||||
uint16_t obmc[N_BS_SIZES][2];
|
||||
uint16_t motion_mode[N_BS_SIZES][3 + 1];
|
||||
uint16_t pal_y[7][3][2];
|
||||
uint16_t pal_uv[2][2];
|
||||
uint16_t pal_sz[2][7][7 + 1];
|
||||
uint16_t color_map[2][7][5][8 + 1];
|
||||
uint16_t intrabc[2];
|
||||
ALIGN(uint16_t y_mode[4][N_INTRA_PRED_MODES + 3], 32);
|
||||
ALIGN(uint16_t uv_mode[2][N_INTRA_PRED_MODES][N_UV_INTRA_PRED_MODES + 2], 32);
|
||||
ALIGN(uint16_t wedge_idx[9][16], 32);
|
||||
ALIGN(uint16_t partition[N_BL_LEVELS][4][N_PARTITIONS + 6], 32);
|
||||
ALIGN(uint16_t cfl_alpha[6][16], 32);
|
||||
ALIGN(uint16_t txtp_inter1[2][16], 32);
|
||||
ALIGN(uint16_t txtp_inter2[12 + 4], 32);
|
||||
ALIGN(uint16_t txtp_intra1[2][N_INTRA_PRED_MODES][7 + 1], 16);
|
||||
ALIGN(uint16_t txtp_intra2[3][N_INTRA_PRED_MODES][5 + 3], 16);
|
||||
ALIGN(uint16_t cfl_sign[8], 16);
|
||||
ALIGN(uint16_t angle_delta[8][8], 16);
|
||||
ALIGN(uint16_t filter_intra[5 + 3], 16);
|
||||
ALIGN(uint16_t comp_inter_mode[8][N_COMP_INTER_PRED_MODES], 16);
|
||||
ALIGN(uint16_t seg_id[3][DAV1D_MAX_SEGMENTS], 16);
|
||||
ALIGN(uint16_t pal_sz[2][7][7 + 1], 16);
|
||||
ALIGN(uint16_t color_map[2][7][5][8], 16);
|
||||
ALIGN(uint16_t filter[2][8][DAV1D_N_SWITCHABLE_FILTERS + 1], 8);
|
||||
ALIGN(uint16_t txsz[N_TX_SIZES - 1][3][4], 8);
|
||||
ALIGN(uint16_t motion_mode[N_BS_SIZES][3 + 1], 8);
|
||||
ALIGN(uint16_t delta_q[4], 8);
|
||||
ALIGN(uint16_t delta_lf[5][4], 8);
|
||||
ALIGN(uint16_t interintra_mode[4][4], 8);
|
||||
ALIGN(uint16_t restore_switchable[3 + 1], 8);
|
||||
ALIGN(uint16_t restore_wiener[2], 4);
|
||||
ALIGN(uint16_t restore_sgrproj[2], 4);
|
||||
ALIGN(uint16_t interintra[7][2], 4);
|
||||
ALIGN(uint16_t interintra_wedge[7][2], 4);
|
||||
ALIGN(uint16_t txtp_inter3[4][2], 4);
|
||||
ALIGN(uint16_t use_filter_intra[N_BS_SIZES][2], 4);
|
||||
ALIGN(uint16_t newmv_mode[6][2], 4);
|
||||
ALIGN(uint16_t globalmv_mode[2][2], 4);
|
||||
ALIGN(uint16_t refmv_mode[6][2], 4);
|
||||
ALIGN(uint16_t drl_bit[3][2], 4);
|
||||
ALIGN(uint16_t intra[4][2], 4);
|
||||
ALIGN(uint16_t comp[5][2], 4);
|
||||
ALIGN(uint16_t comp_dir[5][2], 4);
|
||||
ALIGN(uint16_t jnt_comp[6][2], 4);
|
||||
ALIGN(uint16_t mask_comp[6][2], 4);
|
||||
ALIGN(uint16_t wedge_comp[9][2], 4);
|
||||
ALIGN(uint16_t ref[6][3][2], 4);
|
||||
ALIGN(uint16_t comp_fwd_ref[3][3][2], 4);
|
||||
ALIGN(uint16_t comp_bwd_ref[2][3][2], 4);
|
||||
ALIGN(uint16_t comp_uni_ref[3][3][2], 4);
|
||||
ALIGN(uint16_t txpart[7][3][2], 4);
|
||||
ALIGN(uint16_t skip[3][2], 4);
|
||||
ALIGN(uint16_t skip_mode[3][2], 4);
|
||||
ALIGN(uint16_t seg_pred[3][2], 4);
|
||||
ALIGN(uint16_t obmc[N_BS_SIZES][2], 4);
|
||||
ALIGN(uint16_t pal_y[7][3][2], 4);
|
||||
ALIGN(uint16_t pal_uv[2][2], 4);
|
||||
ALIGN(uint16_t intrabc[2], 4);
|
||||
} CdfModeContext;
|
||||
|
||||
typedef struct CdfCoefContext {
|
||||
uint16_t skip[N_TX_SIZES][13][2];
|
||||
uint16_t eob_bin_16[2][2][6];
|
||||
uint16_t eob_bin_32[2][2][7 + 1];
|
||||
uint16_t eob_bin_64[2][2][8];
|
||||
uint16_t eob_bin_128[2][2][9];
|
||||
uint16_t eob_bin_256[2][2][10 + 6];
|
||||
uint16_t eob_bin_512[2][2][11 + 5];
|
||||
uint16_t eob_bin_1024[2][2][12 + 4];
|
||||
uint16_t eob_hi_bit[N_TX_SIZES][2][11 /*22*/][2];
|
||||
uint16_t eob_base_tok[N_TX_SIZES][2][4][4];
|
||||
uint16_t base_tok[N_TX_SIZES][2][41][5];
|
||||
uint16_t dc_sign[2][3][2];
|
||||
uint16_t br_tok[4 /*5*/][2][21][5];
|
||||
ALIGN(uint16_t eob_bin_16[2][2][5 + 3], 16);
|
||||
ALIGN(uint16_t eob_bin_32[2][2][6 + 2], 16);
|
||||
ALIGN(uint16_t eob_bin_64[2][2][7 + 1], 16);
|
||||
ALIGN(uint16_t eob_bin_128[2][2][8 + 0], 16);
|
||||
ALIGN(uint16_t eob_bin_256[2][2][9 + 7], 32);
|
||||
ALIGN(uint16_t eob_bin_512[2][10 + 6], 32);
|
||||
ALIGN(uint16_t eob_bin_1024[2][11 + 5], 32);
|
||||
ALIGN(uint16_t eob_base_tok[N_TX_SIZES][2][4][4], 8);
|
||||
ALIGN(uint16_t base_tok[N_TX_SIZES][2][41][4], 8);
|
||||
ALIGN(uint16_t br_tok[4 /*5*/][2][21][4], 8);
|
||||
ALIGN(uint16_t eob_hi_bit[N_TX_SIZES][2][11 /*22*/][2], 4);
|
||||
ALIGN(uint16_t skip[N_TX_SIZES][13][2], 4);
|
||||
ALIGN(uint16_t dc_sign[2][3][2], 4);
|
||||
} CdfCoefContext;
|
||||
|
||||
typedef struct CdfMvComponent {
|
||||
uint16_t classes[11 + 1 + 4];
|
||||
uint16_t class0[2];
|
||||
uint16_t classN[10][2];
|
||||
uint16_t class0_fp[2][4 + 1];
|
||||
uint16_t classN_fp[4 + 1];
|
||||
uint16_t class0_hp[2];
|
||||
uint16_t classN_hp[2];
|
||||
uint16_t sign[2];
|
||||
ALIGN(uint16_t classes[11 + 5], 32);
|
||||
ALIGN(uint16_t class0_fp[2][4], 8);
|
||||
ALIGN(uint16_t classN_fp[4], 8);
|
||||
ALIGN(uint16_t class0_hp[2], 4);
|
||||
ALIGN(uint16_t classN_hp[2], 4);
|
||||
ALIGN(uint16_t class0[2], 4);
|
||||
ALIGN(uint16_t classN[10][2], 4);
|
||||
ALIGN(uint16_t sign[2], 4);
|
||||
} CdfMvComponent;
|
||||
|
||||
typedef struct CdfMvContext {
|
||||
CdfMvComponent comp[2];
|
||||
uint16_t joint[N_MV_JOINTS + 1];
|
||||
ALIGN(uint16_t joint[N_MV_JOINTS], 8);
|
||||
} CdfMvContext;
|
||||
|
||||
typedef struct CdfContext {
|
||||
CdfModeContext m;
|
||||
uint16_t kfym[5][5][N_INTRA_PRED_MODES + 1 + 2];
|
||||
ALIGN(uint16_t kfym[5][5][N_INTRA_PRED_MODES + 3], 32);
|
||||
CdfCoefContext coef;
|
||||
CdfMvContext mv, dmv;
|
||||
} CdfContext;
|
||||
|
2
third_party/dav1d/src/data.c
vendored
2
third_party/dav1d/src/data.c
vendored
@ -27,7 +27,6 @@
|
||||
|
||||
#include "config.h"
|
||||
|
||||
#include <assert.h>
|
||||
#include <errno.h>
|
||||
#include <stdint.h>
|
||||
#include <stdlib.h>
|
||||
@ -35,6 +34,7 @@
|
||||
|
||||
#include "dav1d/data.h"
|
||||
|
||||
#include "common/attributes.h"
|
||||
#include "common/validate.h"
|
||||
|
||||
#include "src/data.h"
|
||||
|
94
third_party/dav1d/src/decode.c
vendored
94
third_party/dav1d/src/decode.c
vendored
@ -42,6 +42,7 @@
|
||||
#include "src/decode.h"
|
||||
#include "src/dequant_tables.h"
|
||||
#include "src/env.h"
|
||||
#include "src/film_grain.h"
|
||||
#include "src/log.h"
|
||||
#include "src/qm.h"
|
||||
#include "src/recon.h"
|
||||
@ -81,14 +82,14 @@ static int read_mv_component_diff(Dav1dTileContext *const t,
|
||||
const int have_hp = f->frame_hdr->hp;
|
||||
const int sign = dav1d_msac_decode_bool_adapt(&ts->msac, mv_comp->sign);
|
||||
const int cl = dav1d_msac_decode_symbol_adapt16(&ts->msac,
|
||||
mv_comp->classes, 11);
|
||||
mv_comp->classes, 10);
|
||||
int up, fp, hp;
|
||||
|
||||
if (!cl) {
|
||||
up = dav1d_msac_decode_bool_adapt(&ts->msac, mv_comp->class0);
|
||||
if (have_fp) {
|
||||
fp = dav1d_msac_decode_symbol_adapt4(&ts->msac,
|
||||
mv_comp->class0_fp[up], 4);
|
||||
mv_comp->class0_fp[up], 3);
|
||||
hp = have_hp ? dav1d_msac_decode_bool_adapt(&ts->msac,
|
||||
mv_comp->class0_hp) : 1;
|
||||
} else {
|
||||
@ -102,7 +103,7 @@ static int read_mv_component_diff(Dav1dTileContext *const t,
|
||||
mv_comp->classN[n]) << n;
|
||||
if (have_fp) {
|
||||
fp = dav1d_msac_decode_symbol_adapt4(&ts->msac,
|
||||
mv_comp->classN_fp, 4);
|
||||
mv_comp->classN_fp, 3);
|
||||
hp = have_hp ? dav1d_msac_decode_bool_adapt(&ts->msac,
|
||||
mv_comp->classN_hp) : 1;
|
||||
} else {
|
||||
@ -120,7 +121,7 @@ static void read_mv_residual(Dav1dTileContext *const t, mv *const ref_mv,
|
||||
CdfMvContext *const mv_cdf, const int have_fp)
|
||||
{
|
||||
switch (dav1d_msac_decode_symbol_adapt4(&t->ts->msac, t->ts->cdf.mv.joint,
|
||||
N_MV_JOINTS))
|
||||
N_MV_JOINTS - 1))
|
||||
{
|
||||
case MV_JOINT_HV:
|
||||
ref_mv->y += read_mv_component_diff(t, &mv_cdf->comp[0], have_fp);
|
||||
@ -380,7 +381,7 @@ static void read_pal_plane(Dav1dTileContext *const t, Av1Block *const b,
|
||||
Dav1dTileState *const ts = t->ts;
|
||||
const Dav1dFrameContext *const f = t->f;
|
||||
const int pal_sz = b->pal_sz[pl] = dav1d_msac_decode_symbol_adapt8(&ts->msac,
|
||||
ts->cdf.m.pal_sz[pl][sz_ctx], 7) + 2;
|
||||
ts->cdf.m.pal_sz[pl][sz_ctx], 6) + 2;
|
||||
uint16_t cache[16], used_cache[8];
|
||||
int l_cache = pl ? t->pal_sz_uv[1][by4] : t->l.pal_sz[by4];
|
||||
int n_cache = 0;
|
||||
@ -586,7 +587,7 @@ static void read_pal_indices(Dav1dTileContext *const t,
|
||||
Dav1dTileState *const ts = t->ts;
|
||||
const ptrdiff_t stride = bw4 * 4;
|
||||
pal_idx[0] = dav1d_msac_decode_uniform(&ts->msac, b->pal_sz[pl]);
|
||||
uint16_t (*const color_map_cdf)[8 + 1] =
|
||||
uint16_t (*const color_map_cdf)[8] =
|
||||
ts->cdf.m.color_map[pl][b->pal_sz[pl] - 2];
|
||||
uint8_t (*const order)[8] = t->scratch.pal_order;
|
||||
uint8_t *const ctx = t->scratch.pal_ctx;
|
||||
@ -597,7 +598,7 @@ static void read_pal_indices(Dav1dTileContext *const t,
|
||||
order_palette(pal_idx, stride, i, first, last, order, ctx);
|
||||
for (int j = first, m = 0; j >= last; j--, m++) {
|
||||
const int color_idx = dav1d_msac_decode_symbol_adapt8(&ts->msac,
|
||||
color_map_cdf[ctx[m]], b->pal_sz[pl]);
|
||||
color_map_cdf[ctx[m]], b->pal_sz[pl] - 1);
|
||||
pal_idx[(i - j) * stride + j] = order[m][color_idx];
|
||||
}
|
||||
}
|
||||
@ -647,7 +648,7 @@ static void read_vartx_tree(Dav1dTileContext *const t,
|
||||
}
|
||||
b->uvtx = dav1d_max_txfm_size_for_bs[bs][f->cur.p.layout];
|
||||
} else {
|
||||
assert(imin(bw4, bh4) <= 16 || b->max_ytx == TX_64X64);
|
||||
assert(bw4 <= 16 || bh4 <= 16 || b->max_ytx == TX_64X64);
|
||||
int y, x, y_off, x_off;
|
||||
const TxfmInfo *const ytx = &dav1d_txfm_dimensions[b->max_ytx];
|
||||
for (y = 0, y_off = 0; y < bh4; y += ytx->h, y_off++) {
|
||||
@ -673,8 +674,6 @@ static inline unsigned get_prev_frame_segid(const Dav1dFrameContext *const f,
|
||||
const uint8_t *ref_seg_map,
|
||||
const ptrdiff_t stride)
|
||||
{
|
||||
unsigned seg_id = 8;
|
||||
|
||||
assert(f->frame_hdr->primary_ref_frame != DAV1D_PRIMARY_REF_NONE);
|
||||
if (dav1d_thread_picture_wait(&f->refp[f->frame_hdr->primary_ref_frame],
|
||||
(by + h4) * 4, PLANE_TYPE_BLOCK))
|
||||
@ -682,12 +681,13 @@ static inline unsigned get_prev_frame_segid(const Dav1dFrameContext *const f,
|
||||
return 8;
|
||||
}
|
||||
|
||||
unsigned seg_id = 8;
|
||||
ref_seg_map += by * stride + bx;
|
||||
do {
|
||||
for (int x = 0; x < w4; x++)
|
||||
seg_id = imin(seg_id, ref_seg_map[x]);
|
||||
ref_seg_map += stride;
|
||||
} while (--h4 > 0);
|
||||
} while (--h4 > 0 && seg_id);
|
||||
assert(seg_id < 8);
|
||||
|
||||
return seg_id;
|
||||
@ -814,7 +814,7 @@ static int decode_b(Dav1dTileContext *const t,
|
||||
&seg_ctx, f->cur_segmap, f->b4_stride);
|
||||
const unsigned diff = dav1d_msac_decode_symbol_adapt8(&ts->msac,
|
||||
ts->cdf.m.seg_id[seg_ctx],
|
||||
DAV1D_MAX_SEGMENTS);
|
||||
DAV1D_MAX_SEGMENTS - 1);
|
||||
const unsigned last_active_seg_id =
|
||||
f->frame_hdr->segmentation.seg_data.last_active_segid;
|
||||
b->seg_id = neg_deinterleave(diff, pred_seg_id,
|
||||
@ -886,7 +886,7 @@ static int decode_b(Dav1dTileContext *const t,
|
||||
} else {
|
||||
const unsigned diff = dav1d_msac_decode_symbol_adapt8(&ts->msac,
|
||||
ts->cdf.m.seg_id[seg_ctx],
|
||||
DAV1D_MAX_SEGMENTS);
|
||||
DAV1D_MAX_SEGMENTS - 1);
|
||||
const unsigned last_active_seg_id =
|
||||
f->frame_hdr->segmentation.seg_data.last_active_segid;
|
||||
b->seg_id = neg_deinterleave(diff, pred_seg_id,
|
||||
@ -934,7 +934,7 @@ static int decode_b(Dav1dTileContext *const t,
|
||||
|
||||
if (have_delta_q) {
|
||||
int delta_q = dav1d_msac_decode_symbol_adapt4(&ts->msac,
|
||||
ts->cdf.m.delta_q, 4);
|
||||
ts->cdf.m.delta_q, 3);
|
||||
if (delta_q == 3) {
|
||||
const int n_bits = 1 + dav1d_msac_decode_bools(&ts->msac, 3);
|
||||
delta_q = dav1d_msac_decode_bools(&ts->msac, n_bits) +
|
||||
@ -955,7 +955,7 @@ static int decode_b(Dav1dTileContext *const t,
|
||||
|
||||
for (int i = 0; i < n_lfs; i++) {
|
||||
int delta_lf = dav1d_msac_decode_symbol_adapt4(&ts->msac,
|
||||
ts->cdf.m.delta_lf[i + f->frame_hdr->delta.lf.multi], 4);
|
||||
ts->cdf.m.delta_lf[i + f->frame_hdr->delta.lf.multi], 3);
|
||||
if (delta_lf == 3) {
|
||||
const int n_bits = 1 + dav1d_msac_decode_bools(&ts->msac, 3);
|
||||
delta_lf = dav1d_msac_decode_bools(&ts->msac, n_bits) +
|
||||
@ -1020,7 +1020,7 @@ static int decode_b(Dav1dTileContext *const t,
|
||||
ts->cdf.kfym[dav1d_intra_mode_context[t->a->mode[bx4]]]
|
||||
[dav1d_intra_mode_context[t->l.mode[by4]]];
|
||||
b->y_mode = dav1d_msac_decode_symbol_adapt16(&ts->msac, ymode_cdf,
|
||||
N_INTRA_PRED_MODES);
|
||||
N_INTRA_PRED_MODES - 1);
|
||||
if (DEBUG_BLOCK_INFO)
|
||||
printf("Post-ymode[%d]: r=%d\n", b->y_mode, ts->msac.rng);
|
||||
|
||||
@ -1029,7 +1029,7 @@ static int decode_b(Dav1dTileContext *const t,
|
||||
b->y_mode <= VERT_LEFT_PRED)
|
||||
{
|
||||
uint16_t *const acdf = ts->cdf.m.angle_delta[b->y_mode - VERT_PRED];
|
||||
const int angle = dav1d_msac_decode_symbol_adapt8(&ts->msac, acdf, 7);
|
||||
const int angle = dav1d_msac_decode_symbol_adapt8(&ts->msac, acdf, 6);
|
||||
b->y_angle = angle - 3;
|
||||
} else {
|
||||
b->y_angle = 0;
|
||||
@ -1040,20 +1040,20 @@ static int decode_b(Dav1dTileContext *const t,
|
||||
cbw4 == 1 && cbh4 == 1 : !!(cfl_allowed_mask & (1 << bs));
|
||||
uint16_t *const uvmode_cdf = ts->cdf.m.uv_mode[cfl_allowed][b->y_mode];
|
||||
b->uv_mode = dav1d_msac_decode_symbol_adapt16(&ts->msac, uvmode_cdf,
|
||||
N_UV_INTRA_PRED_MODES - !cfl_allowed);
|
||||
N_UV_INTRA_PRED_MODES - 1 - !cfl_allowed);
|
||||
if (DEBUG_BLOCK_INFO)
|
||||
printf("Post-uvmode[%d]: r=%d\n", b->uv_mode, ts->msac.rng);
|
||||
|
||||
if (b->uv_mode == CFL_PRED) {
|
||||
#define SIGN(a) (!!(a) + ((a) > 0))
|
||||
const int sign = dav1d_msac_decode_symbol_adapt8(&ts->msac,
|
||||
ts->cdf.m.cfl_sign, 8) + 1;
|
||||
ts->cdf.m.cfl_sign, 7) + 1;
|
||||
const int sign_u = sign * 0x56 >> 8, sign_v = sign - sign_u * 3;
|
||||
assert(sign_u == sign / 3);
|
||||
if (sign_u) {
|
||||
const int ctx = (sign_u == 2) * 3 + sign_v;
|
||||
b->cfl_alpha[0] = dav1d_msac_decode_symbol_adapt16(&ts->msac,
|
||||
ts->cdf.m.cfl_alpha[ctx], 16) + 1;
|
||||
ts->cdf.m.cfl_alpha[ctx], 15) + 1;
|
||||
if (sign_u == 1) b->cfl_alpha[0] = -b->cfl_alpha[0];
|
||||
} else {
|
||||
b->cfl_alpha[0] = 0;
|
||||
@ -1061,7 +1061,7 @@ static int decode_b(Dav1dTileContext *const t,
|
||||
if (sign_v) {
|
||||
const int ctx = (sign_v == 2) * 3 + sign_u;
|
||||
b->cfl_alpha[1] = dav1d_msac_decode_symbol_adapt16(&ts->msac,
|
||||
ts->cdf.m.cfl_alpha[ctx], 16) + 1;
|
||||
ts->cdf.m.cfl_alpha[ctx], 15) + 1;
|
||||
if (sign_v == 1) b->cfl_alpha[1] = -b->cfl_alpha[1];
|
||||
} else {
|
||||
b->cfl_alpha[1] = 0;
|
||||
@ -1074,7 +1074,7 @@ static int decode_b(Dav1dTileContext *const t,
|
||||
b->uv_mode <= VERT_LEFT_PRED)
|
||||
{
|
||||
uint16_t *const acdf = ts->cdf.m.angle_delta[b->uv_mode - VERT_PRED];
|
||||
const int angle = dav1d_msac_decode_symbol_adapt8(&ts->msac, acdf, 7);
|
||||
const int angle = dav1d_msac_decode_symbol_adapt8(&ts->msac, acdf, 6);
|
||||
b->uv_angle = angle - 3;
|
||||
} else {
|
||||
b->uv_angle = 0;
|
||||
@ -1115,7 +1115,7 @@ static int decode_b(Dav1dTileContext *const t,
|
||||
if (is_filter) {
|
||||
b->y_mode = FILTER_PRED;
|
||||
b->y_angle = dav1d_msac_decode_symbol_adapt4(&ts->msac,
|
||||
ts->cdf.m.filter_intra, 5);
|
||||
ts->cdf.m.filter_intra, 4);
|
||||
}
|
||||
if (DEBUG_BLOCK_INFO)
|
||||
printf("Post-filterintramode[%d/%d]: r=%d\n",
|
||||
@ -1158,7 +1158,7 @@ static int decode_b(Dav1dTileContext *const t,
|
||||
const int tctx = get_tx_ctx(t->a, &t->l, t_dim, by4, bx4);
|
||||
uint16_t *const tx_cdf = ts->cdf.m.txsz[t_dim->max - 1][tctx];
|
||||
int depth = dav1d_msac_decode_symbol_adapt4(&ts->msac, tx_cdf,
|
||||
imin(t_dim->max + 1, 3));
|
||||
imin(t_dim->max, 2));
|
||||
|
||||
while (depth--) {
|
||||
b->tx = t_dim->sub;
|
||||
@ -1480,7 +1480,7 @@ static int decode_b(Dav1dTileContext *const t,
|
||||
|
||||
b->inter_mode = dav1d_msac_decode_symbol_adapt8(&ts->msac,
|
||||
ts->cdf.m.comp_inter_mode[ctx],
|
||||
N_COMP_INTER_PRED_MODES);
|
||||
N_COMP_INTER_PRED_MODES - 1);
|
||||
if (DEBUG_BLOCK_INFO)
|
||||
printf("Post-compintermode[%d,ctx=%d,n_mvs=%d]: r=%d\n",
|
||||
b->inter_mode, ctx, n_mvs, ts->msac.rng);
|
||||
@ -1588,7 +1588,7 @@ static int decode_b(Dav1dTileContext *const t,
|
||||
ts->cdf.m.wedge_comp[ctx]);
|
||||
if (b->comp_type == COMP_INTER_WEDGE)
|
||||
b->wedge_idx = dav1d_msac_decode_symbol_adapt16(&ts->msac,
|
||||
ts->cdf.m.wedge_idx[ctx], 16);
|
||||
ts->cdf.m.wedge_idx[ctx], 15);
|
||||
} else {
|
||||
b->comp_type = COMP_INTER_SEG;
|
||||
}
|
||||
@ -1743,14 +1743,14 @@ static int decode_b(Dav1dTileContext *const t,
|
||||
{
|
||||
b->interintra_mode = dav1d_msac_decode_symbol_adapt4(&ts->msac,
|
||||
ts->cdf.m.interintra_mode[ii_sz_grp],
|
||||
N_INTER_INTRA_PRED_MODES);
|
||||
N_INTER_INTRA_PRED_MODES - 1);
|
||||
const int wedge_ctx = dav1d_wedge_ctx_lut[bs];
|
||||
b->interintra_type = INTER_INTRA_BLEND +
|
||||
dav1d_msac_decode_bool_adapt(&ts->msac,
|
||||
ts->cdf.m.interintra_wedge[wedge_ctx]);
|
||||
if (b->interintra_type == INTER_INTRA_WEDGE)
|
||||
b->wedge_idx = dav1d_msac_decode_symbol_adapt16(&ts->msac,
|
||||
ts->cdf.m.wedge_idx[wedge_ctx], 16);
|
||||
ts->cdf.m.wedge_idx[wedge_ctx], 15);
|
||||
} else {
|
||||
b->interintra_type = INTER_INTRA_NONE;
|
||||
}
|
||||
@ -1783,7 +1783,7 @@ static int decode_b(Dav1dTileContext *const t,
|
||||
|
||||
b->motion_mode = allow_warp ?
|
||||
dav1d_msac_decode_symbol_adapt4(&ts->msac,
|
||||
ts->cdf.m.motion_mode[bs], 3) :
|
||||
ts->cdf.m.motion_mode[bs], 2) :
|
||||
dav1d_msac_decode_bool_adapt(&ts->msac, ts->cdf.m.obmc[bs]);
|
||||
if (b->motion_mode == MM_WARP) {
|
||||
has_subpel_filter = 0;
|
||||
@ -1823,7 +1823,7 @@ static int decode_b(Dav1dTileContext *const t,
|
||||
by4, bx4);
|
||||
filter[0] = dav1d_msac_decode_symbol_adapt4(&ts->msac,
|
||||
ts->cdf.m.filter[0][ctx1],
|
||||
DAV1D_N_SWITCHABLE_FILTERS);
|
||||
DAV1D_N_SWITCHABLE_FILTERS - 1);
|
||||
if (f->seq_hdr->dual_filter) {
|
||||
const int ctx2 = get_filter_ctx(t->a, &t->l, comp, 1,
|
||||
b->ref[0], by4, bx4);
|
||||
@ -1832,7 +1832,7 @@ static int decode_b(Dav1dTileContext *const t,
|
||||
filter[0], ctx1, ts->msac.rng);
|
||||
filter[1] = dav1d_msac_decode_symbol_adapt4(&ts->msac,
|
||||
ts->cdf.m.filter[1][ctx2],
|
||||
DAV1D_N_SWITCHABLE_FILTERS);
|
||||
DAV1D_N_SWITCHABLE_FILTERS - 1);
|
||||
if (DEBUG_BLOCK_INFO)
|
||||
printf("Post-subpel_filter2[%d,ctx=%d]: r=%d\n",
|
||||
filter[1], ctx2, ts->msac.rng);
|
||||
@ -2023,9 +2023,8 @@ static int decode_sb(Dav1dTileContext *const t, const enum BlockLevel bl,
|
||||
const Av1Block *const b = &f->frame_thread.b[t->by * f->b4_stride + t->bx];
|
||||
bp = b->bl == bl ? b->bp : PARTITION_SPLIT;
|
||||
} else {
|
||||
const unsigned n_part = bl == BL_8X8 ? N_SUB8X8_PARTITIONS :
|
||||
bl == BL_128X128 ? N_PARTITIONS - 2 : N_PARTITIONS;
|
||||
bp = dav1d_msac_decode_symbol_adapt16(&t->ts->msac, pc, n_part);
|
||||
bp = dav1d_msac_decode_symbol_adapt16(&t->ts->msac, pc,
|
||||
dav1d_partition_type_count[bl]);
|
||||
if (f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I422 &&
|
||||
(bp == PARTITION_V || bp == PARTITION_V4 ||
|
||||
bp == PARTITION_T_LEFT_SPLIT || bp == PARTITION_T_RIGHT_SPLIT))
|
||||
@ -2381,7 +2380,7 @@ static void read_restoration_info(Dav1dTileContext *const t,
|
||||
|
||||
if (frame_type == DAV1D_RESTORATION_SWITCHABLE) {
|
||||
const int filter = dav1d_msac_decode_symbol_adapt4(&ts->msac,
|
||||
ts->cdf.m.restore_switchable, 3);
|
||||
ts->cdf.m.restore_switchable, 2);
|
||||
lr->type = filter ? filter == 2 ? DAV1D_RESTORATION_SGRPROJ :
|
||||
DAV1D_RESTORATION_WIENER :
|
||||
DAV1D_RESTORATION_NONE;
|
||||
@ -2597,8 +2596,10 @@ int dav1d_decode_frame(Dav1dFrameContext *const f) {
|
||||
f->tile_thread.titsati_sz = titsati_sz;
|
||||
}
|
||||
if (f->tile_thread.titsati_init[0] != f->frame_hdr->tiling.cols ||
|
||||
f->tile_thread.titsati_init[1] != f->sbh ||
|
||||
f->tile_thread.titsati_init[2] != f->frame_hdr->tiling.rows)
|
||||
f->tile_thread.titsati_init[1] != f->frame_hdr->tiling.rows ||
|
||||
memcmp(f->frame_hdr->tiling.row_start_sb, f->tile_thread.titsati_index_rows,
|
||||
sizeof(*f->tile_thread.titsati_index_rows) *
|
||||
(f->frame_hdr->tiling.rows + 1)))
|
||||
{
|
||||
for (int tile_row = 0, tile_idx = 0;
|
||||
tile_row < f->frame_hdr->tiling.rows; tile_row++)
|
||||
@ -2616,8 +2617,10 @@ int dav1d_decode_frame(Dav1dFrameContext *const f) {
|
||||
}
|
||||
}
|
||||
f->tile_thread.titsati_init[0] = f->frame_hdr->tiling.cols;
|
||||
f->tile_thread.titsati_init[1] = f->sbh;
|
||||
f->tile_thread.titsati_init[2] = f->frame_hdr->tiling.rows;
|
||||
f->tile_thread.titsati_init[1] = f->frame_hdr->tiling.rows;
|
||||
memcpy(f->tile_thread.titsati_index_rows, f->frame_hdr->tiling.row_start_sb,
|
||||
sizeof(*f->tile_thread.titsati_index_rows) *
|
||||
(f->frame_hdr->tiling.rows + 1));
|
||||
}
|
||||
}
|
||||
|
||||
@ -2637,9 +2640,13 @@ int dav1d_decode_frame(Dav1dFrameContext *const f) {
|
||||
goto error;
|
||||
}
|
||||
}
|
||||
Dav1dTileState *ts_new = dav1d_alloc_aligned(sizeof(*f->ts) * n_ts, 32);
|
||||
if (!ts_new) goto error;
|
||||
if (n_ts > f->n_ts) {
|
||||
Dav1dTileState *ts_new = realloc(f->ts, sizeof(*f->ts) * n_ts);
|
||||
if (!ts_new) goto error;
|
||||
if (f->ts) {
|
||||
memcpy(ts_new, f->ts, sizeof(*f->ts) * f->n_ts);
|
||||
dav1d_free_aligned(f->ts);
|
||||
}
|
||||
f->ts = ts_new;
|
||||
for (int n = f->n_ts; n < n_ts; f->n_ts = ++n) {
|
||||
Dav1dTileState *const ts = &f->ts[n];
|
||||
@ -2655,9 +2662,9 @@ int dav1d_decode_frame(Dav1dFrameContext *const f) {
|
||||
pthread_cond_destroy(&ts->tile_thread.cond);
|
||||
pthread_mutex_destroy(&ts->tile_thread.lock);
|
||||
}
|
||||
memcpy(ts_new, f->ts, sizeof(*f->ts) * n_ts);
|
||||
dav1d_free_aligned(f->ts);
|
||||
f->n_ts = n_ts;
|
||||
Dav1dTileState *ts_new = realloc(f->ts, sizeof(*f->ts) * n_ts);
|
||||
if (!ts_new) goto error;
|
||||
f->ts = ts_new;
|
||||
}
|
||||
}
|
||||
@ -3184,6 +3191,7 @@ int dav1d_submit_frame(Dav1dContext *const c) {
|
||||
dav1d_loop_filter_dsp_init_##bd##bpc(&dsp->lf); \
|
||||
dav1d_loop_restoration_dsp_init_##bd##bpc(&dsp->lr); \
|
||||
dav1d_mc_dsp_init_##bd##bpc(&dsp->mc); \
|
||||
dav1d_film_grain_dsp_init_##bd##bpc(&dsp->fg); \
|
||||
break
|
||||
#if CONFIG_8BPC
|
||||
case 8:
|
||||
|
261
third_party/dav1d/src/env.h
vendored
261
third_party/dav1d/src/env.h
vendored
@ -28,7 +28,6 @@
|
||||
#ifndef DAV1D_SRC_ENV_H
|
||||
#define DAV1D_SRC_ENV_H
|
||||
|
||||
#include <assert.h>
|
||||
#include <stddef.h>
|
||||
#include <stdint.h>
|
||||
#include <stdlib.h>
|
||||
@ -90,95 +89,37 @@ static inline int get_partition_ctx(const BlockContext *const a,
|
||||
(((l->partition[yb8] >> (4 - bl)) & 1) << 1);
|
||||
}
|
||||
|
||||
static inline unsigned cdf_element_prob(const uint16_t *const cdf, const int e) {
|
||||
assert(e > 0);
|
||||
return cdf[e - 1] - cdf[e];
|
||||
}
|
||||
|
||||
static inline unsigned gather_left_partition_prob(const uint16_t *const in,
|
||||
const enum BlockLevel bl)
|
||||
{
|
||||
unsigned out = 0;
|
||||
out += cdf_element_prob(in, PARTITION_H);
|
||||
if (bl != BL_128X128)
|
||||
out += cdf_element_prob(in, PARTITION_H4);
|
||||
unsigned out = in[PARTITION_H - 1] - in[PARTITION_H];
|
||||
// Exploit the fact that cdfs for PARTITION_SPLIT, PARTITION_T_TOP_SPLIT,
|
||||
// PARTITION_T_BOTTOM_SPLIT and PARTITION_T_LEFT_SPLIT are neighbors.
|
||||
// PARTITION_T_BOTTOM_SPLIT and PARTITION_T_LEFT_SPLIT are neighbors.
|
||||
out += in[PARTITION_SPLIT - 1] - in[PARTITION_T_LEFT_SPLIT];
|
||||
if (bl != BL_128X128)
|
||||
out += in[PARTITION_H4 - 1] - in[PARTITION_H4];
|
||||
return out;
|
||||
}
|
||||
|
||||
static inline unsigned gather_top_partition_prob(const uint16_t *const in,
|
||||
const enum BlockLevel bl)
|
||||
{
|
||||
unsigned out = 0;
|
||||
// Exploit the fact that cdfs for PARTITION_V, PARTITION_SPLIT and
|
||||
// PARTITION_T_TOP_SPLIT are neighbors.
|
||||
unsigned out = in[PARTITION_V - 1] - in[PARTITION_T_TOP_SPLIT];
|
||||
// Exploit the facts that cdfs for PARTITION_T_LEFT_SPLIT and
|
||||
// PARTITION_T_RIGHT_SPLIT are neighbors, the probability for
|
||||
// PARTITION_V4 is always zero, and the probability for
|
||||
// PARTITION_T_RIGHT_SPLIT is zero in 128x128 blocks.
|
||||
out += in[PARTITION_T_LEFT_SPLIT - 1];
|
||||
if (bl != BL_128X128)
|
||||
out += cdf_element_prob(in, PARTITION_V4);
|
||||
// Exploit the fact that cdfs for PARTITION_T_LEFT_SPLIT and PARTITION_T_RIGHT_SPLIT,
|
||||
// and PARTITION_V, PARTITION_SPLIT and PARTITION_T_TOP_SPLIT are neighbors.
|
||||
out += in[PARTITION_T_LEFT_SPLIT - 1] - in[PARTITION_T_RIGHT_SPLIT];
|
||||
out += in[PARTITION_V - 1] - in[PARTITION_T_TOP_SPLIT];
|
||||
out += in[PARTITION_V4 - 1] - in[PARTITION_T_RIGHT_SPLIT];
|
||||
return out;
|
||||
}
|
||||
|
||||
static inline enum TxfmTypeSet get_ext_txtp_set(const enum RectTxfmSize tx,
|
||||
const int inter,
|
||||
const Dav1dFrameHeader *const hdr,
|
||||
const int seg_id)
|
||||
{
|
||||
if (!hdr->segmentation.qidx[seg_id]) {
|
||||
if (hdr->segmentation.lossless[seg_id]) {
|
||||
assert(tx == (int) TX_4X4);
|
||||
return TXTP_SET_LOSSLESS;
|
||||
} else {
|
||||
return TXTP_SET_DCT;
|
||||
}
|
||||
}
|
||||
|
||||
const TxfmInfo *const t_dim = &dav1d_txfm_dimensions[tx];
|
||||
|
||||
if (t_dim->max >= TX_64X64)
|
||||
return TXTP_SET_DCT;
|
||||
|
||||
if (t_dim->max == TX_32X32)
|
||||
return inter ? TXTP_SET_DCT_ID : TXTP_SET_DCT;
|
||||
|
||||
if (hdr->reduced_txtp_set)
|
||||
return inter ? TXTP_SET_DCT_ID : TXTP_SET_DT4_ID;
|
||||
|
||||
const enum TxfmSize txsqsz = t_dim->min;
|
||||
|
||||
if (inter)
|
||||
return txsqsz == TX_16X16 ? TXTP_SET_DT9_ID_1D : TXTP_SET_ALL;
|
||||
else
|
||||
return txsqsz == TX_16X16 ? TXTP_SET_DT4_ID : TXTP_SET_DT4_ID_1D;
|
||||
}
|
||||
|
||||
static inline enum TxfmType get_uv_intra_txtp(const enum IntraPredMode uv_mode,
|
||||
const enum RectTxfmSize tx,
|
||||
const Dav1dFrameHeader *const hdr,
|
||||
const int seg_id)
|
||||
{
|
||||
if (hdr->segmentation.lossless[seg_id]) {
|
||||
assert(tx == (int) TX_4X4);
|
||||
return WHT_WHT;
|
||||
}
|
||||
|
||||
const TxfmInfo *const t_dim = &dav1d_txfm_dimensions[tx];
|
||||
|
||||
return t_dim->max == TX_32X32 ? DCT_DCT : dav1d_txtp_from_uvmode[uv_mode];
|
||||
}
|
||||
|
||||
static inline enum TxfmType get_uv_inter_txtp(const TxfmInfo *const uvt_dim,
|
||||
const enum TxfmType ytxtp,
|
||||
const Dav1dFrameHeader *const hdr,
|
||||
const int seg_id)
|
||||
const enum TxfmType ytxtp)
|
||||
{
|
||||
if (hdr->segmentation.lossless[seg_id]) {
|
||||
assert(uvt_dim->max == TX_4X4);
|
||||
return WHT_WHT;
|
||||
}
|
||||
|
||||
if (uvt_dim->max == TX_32X32)
|
||||
return ytxtp == IDTX ? IDTX : DCT_DCT;
|
||||
if (uvt_dim->min == TX_16X16 &&
|
||||
@ -528,180 +469,6 @@ static inline unsigned get_cur_frame_segid(const int by, const int bx,
|
||||
}
|
||||
}
|
||||
|
||||
static inline int get_coef_skip_ctx(const TxfmInfo *const t_dim,
|
||||
const enum BlockSize bs,
|
||||
const uint8_t *const a,
|
||||
const uint8_t *const l,
|
||||
const int chroma,
|
||||
const enum Dav1dPixelLayout layout)
|
||||
{
|
||||
const uint8_t *const b_dim = dav1d_block_dimensions[bs];
|
||||
|
||||
if (chroma) {
|
||||
const int ss_ver = layout == DAV1D_PIXEL_LAYOUT_I420;
|
||||
const int ss_hor = layout != DAV1D_PIXEL_LAYOUT_I444;
|
||||
const int not_one_blk = b_dim[2] - (!!b_dim[2] && ss_hor) > t_dim->lw ||
|
||||
b_dim[3] - (!!b_dim[3] && ss_ver) > t_dim->lh;
|
||||
int ca, cl;
|
||||
|
||||
#define MERGE_CTX(dir, type, mask) \
|
||||
c##dir = !!((*(const type *) dir) & mask); \
|
||||
break
|
||||
switch (t_dim->lw) {
|
||||
case TX_4X4: MERGE_CTX(a, uint8_t, 0x3F);
|
||||
case TX_8X8: MERGE_CTX(a, uint16_t, 0x3F3F);
|
||||
case TX_16X16: MERGE_CTX(a, uint32_t, 0x3F3F3F3FU);
|
||||
case TX_32X32: MERGE_CTX(a, uint64_t, 0x3F3F3F3F3F3F3F3FULL);
|
||||
default: abort();
|
||||
}
|
||||
switch (t_dim->lh) {
|
||||
case TX_4X4: MERGE_CTX(l, uint8_t, 0x3F);
|
||||
case TX_8X8: MERGE_CTX(l, uint16_t, 0x3F3F);
|
||||
case TX_16X16: MERGE_CTX(l, uint32_t, 0x3F3F3F3FU);
|
||||
case TX_32X32: MERGE_CTX(l, uint64_t, 0x3F3F3F3F3F3F3F3FULL);
|
||||
default: abort();
|
||||
}
|
||||
#undef MERGE_CTX
|
||||
|
||||
return 7 + not_one_blk * 3 + ca + cl;
|
||||
} else if (b_dim[2] == t_dim->lw && b_dim[3] == t_dim->lh) {
|
||||
return 0;
|
||||
} else {
|
||||
static const uint8_t skip_contexts[5][5] = {
|
||||
{ 1, 2, 2, 2, 3 },
|
||||
{ 1, 4, 4, 4, 5 },
|
||||
{ 1, 4, 4, 4, 5 },
|
||||
{ 1, 4, 4, 4, 5 },
|
||||
{ 1, 4, 4, 4, 6 }
|
||||
};
|
||||
uint64_t la, ll;
|
||||
|
||||
#define MERGE_CTX(dir, type, tx) do { \
|
||||
l##dir = *(const type *) dir; \
|
||||
if (tx == TX_64X64) \
|
||||
l##dir |= *(const type *) &dir[sizeof(type)]; \
|
||||
if (tx >= TX_32X32) l##dir |= l##dir >> 32; \
|
||||
if (tx >= TX_16X16) l##dir |= l##dir >> 16; \
|
||||
if (tx >= TX_8X8) l##dir |= l##dir >> 8; \
|
||||
l##dir &= 0x3F; \
|
||||
} while (0); \
|
||||
break
|
||||
switch (t_dim->lw) {
|
||||
case TX_4X4: MERGE_CTX(a, uint8_t, TX_4X4);
|
||||
case TX_8X8: MERGE_CTX(a, uint16_t, TX_8X8);
|
||||
case TX_16X16: MERGE_CTX(a, uint32_t, TX_16X16);
|
||||
case TX_32X32: MERGE_CTX(a, uint64_t, TX_32X32);
|
||||
case TX_64X64: MERGE_CTX(a, uint64_t, TX_64X64);
|
||||
}
|
||||
switch (t_dim->lh) {
|
||||
case TX_4X4: MERGE_CTX(l, uint8_t, TX_4X4);
|
||||
case TX_8X8: MERGE_CTX(l, uint16_t, TX_8X8);
|
||||
case TX_16X16: MERGE_CTX(l, uint32_t, TX_16X16);
|
||||
case TX_32X32: MERGE_CTX(l, uint64_t, TX_32X32);
|
||||
case TX_64X64: MERGE_CTX(l, uint64_t, TX_64X64);
|
||||
}
|
||||
#undef MERGE_CTX
|
||||
|
||||
const int max = imin((int) (la | ll), 4);
|
||||
const int min = imin(imin((int) la, (int) ll), 4);
|
||||
|
||||
return skip_contexts[min][max];
|
||||
}
|
||||
}
|
||||
|
||||
static inline int get_coef_nz_ctx(uint8_t *const levels,
|
||||
const enum RectTxfmSize tx,
|
||||
const enum TxClass tx_class,
|
||||
const int x, const int y,
|
||||
const ptrdiff_t stride)
|
||||
{
|
||||
static const uint8_t offsets[3][5][2 /* x, y */] = {
|
||||
[TX_CLASS_2D] = {
|
||||
{ 0, 1 }, { 1, 0 }, { 2, 0 }, { 0, 2 }, { 1, 1 }
|
||||
}, [TX_CLASS_V] = {
|
||||
{ 0, 1 }, { 1, 0 }, { 0, 2 }, { 0, 3 }, { 0, 4 }
|
||||
}, [TX_CLASS_H] = {
|
||||
{ 0, 1 }, { 1, 0 }, { 2, 0 }, { 3, 0 }, { 4, 0 }
|
||||
}
|
||||
};
|
||||
const uint8_t (*const off)[2] = offsets[tx_class];
|
||||
int mag = 0;
|
||||
for (int i = 0; i < 5; i++)
|
||||
mag += imin(levels[(x + off[i][0]) * stride + (y + off[i][1])], 3);
|
||||
const int ctx = imin((mag + 1) >> 1, 4);
|
||||
if (tx_class == TX_CLASS_2D) {
|
||||
return dav1d_nz_map_ctx_offset[tx][imin(y, 4)][imin(x, 4)] + ctx;
|
||||
} else {
|
||||
return 26 + imin((tx_class == TX_CLASS_V) ? y : x, 2) * 5 + ctx;
|
||||
}
|
||||
}
|
||||
|
||||
static inline int get_dc_sign_ctx(const TxfmInfo *const t_dim,
|
||||
const uint8_t *const a,
|
||||
const uint8_t *const l)
|
||||
{
|
||||
uint64_t sa, sl;
|
||||
|
||||
#define MERGE_CTX(dir, type, tx, mask) do { \
|
||||
s##dir = ((*(const type *) dir) >> 6) & mask; \
|
||||
if (tx == TX_64X64) \
|
||||
s##dir += ((*(const type *) &dir[sizeof(type)]) >> 6) & mask; \
|
||||
if (tx >= TX_32X32) s##dir += s##dir >> 32; \
|
||||
if (tx >= TX_16X16) s##dir += s##dir >> 16; \
|
||||
if (tx >= TX_8X8) s##dir += s##dir >> 8; \
|
||||
} while (0); \
|
||||
break
|
||||
switch (t_dim->lw) {
|
||||
case TX_4X4: MERGE_CTX(a, uint8_t, TX_4X4, 0x03);
|
||||
case TX_8X8: MERGE_CTX(a, uint16_t, TX_8X8, 0x0303);
|
||||
case TX_16X16: MERGE_CTX(a, uint32_t, TX_16X16, 0x03030303U);
|
||||
case TX_32X32: MERGE_CTX(a, uint64_t, TX_32X32, 0x0303030303030303ULL);
|
||||
case TX_64X64: MERGE_CTX(a, uint64_t, TX_64X64, 0x0303030303030303ULL);
|
||||
}
|
||||
switch (t_dim->lh) {
|
||||
case TX_4X4: MERGE_CTX(l, uint8_t, TX_4X4, 0x03);
|
||||
case TX_8X8: MERGE_CTX(l, uint16_t, TX_8X8, 0x0303);
|
||||
case TX_16X16: MERGE_CTX(l, uint32_t, TX_16X16, 0x03030303U);
|
||||
case TX_32X32: MERGE_CTX(l, uint64_t, TX_32X32, 0x0303030303030303ULL);
|
||||
case TX_64X64: MERGE_CTX(l, uint64_t, TX_64X64, 0x0303030303030303ULL);
|
||||
}
|
||||
#undef MERGE_CTX
|
||||
const int s = ((int) ((sa + sl) & 0xFF)) - (t_dim->w + t_dim->h);
|
||||
|
||||
return s < 0 ? 1 : s > 0 ? 2 : 0;
|
||||
}
|
||||
|
||||
static inline int get_br_ctx(const uint8_t *const levels,
|
||||
const int ac, const enum TxClass tx_class,
|
||||
const int x, const int y,
|
||||
const ptrdiff_t stride)
|
||||
{
|
||||
int mag = 0;
|
||||
static const uint8_t offsets_from_txclass[3][3][2] = {
|
||||
[TX_CLASS_2D] = { { 0, 1 }, { 1, 0 }, { 1, 1 } },
|
||||
[TX_CLASS_H] = { { 0, 1 }, { 1, 0 }, { 0, 2 } },
|
||||
[TX_CLASS_V] = { { 0, 1 }, { 1, 0 }, { 2, 0 } }
|
||||
};
|
||||
const uint8_t (*const offsets)[2] = offsets_from_txclass[tx_class];
|
||||
for (int i = 0; i < 3; i++)
|
||||
mag += levels[(x + offsets[i][1]) * stride + y + offsets[i][0]];
|
||||
|
||||
mag = imin((mag + 1) >> 1, 6);
|
||||
if (!ac) return mag;
|
||||
switch (tx_class) {
|
||||
case TX_CLASS_2D:
|
||||
if (y < 2 && x < 2) return mag + 7;
|
||||
break;
|
||||
case TX_CLASS_H:
|
||||
if (x == 0) return mag + 7;
|
||||
break;
|
||||
case TX_CLASS_V:
|
||||
if (y == 0) return mag + 7;
|
||||
break;
|
||||
}
|
||||
return mag + 14;
|
||||
}
|
||||
|
||||
static inline mv get_gmv_2d(const Dav1dWarpedMotionParams *const gmv,
|
||||
const int bx4, const int by4,
|
||||
const int bw4, const int bh4,
|
||||
|
41
third_party/dav1d/src/fg_apply.h
vendored
Normal file
41
third_party/dav1d/src/fg_apply.h
vendored
Normal file
@ -0,0 +1,41 @@
|
||||
/*
|
||||
* Copyright © 2018, VideoLAN and dav1d authors
|
||||
* Copyright © 2018, Two Orioles, LLC
|
||||
* All rights reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright notice, this
|
||||
* list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
* this list of conditions and the following disclaimer in the documentation
|
||||
* and/or other materials provided with the distribution.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
|
||||
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
|
||||
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
|
||||
* ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
|
||||
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
|
||||
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
|
||||
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
#ifndef DAV1D_SRC_FG_APPLY_H
|
||||
#define DAV1D_SRC_FG_APPLY_H
|
||||
|
||||
#include "dav1d/picture.h"
|
||||
|
||||
#include "common/bitdepth.h"
|
||||
|
||||
#include "src/film_grain.h"
|
||||
|
||||
bitfn_decls(void dav1d_apply_grain, const Dav1dFilmGrainDSPContext *const dsp,
|
||||
Dav1dPicture *const out,
|
||||
const Dav1dPicture *const in);
|
||||
|
||||
#endif /* DAV1D_SRC_FG_APPLY_H */
|
175
third_party/dav1d/src/fg_apply_tmpl.c
vendored
Normal file
175
third_party/dav1d/src/fg_apply_tmpl.c
vendored
Normal file
@ -0,0 +1,175 @@
|
||||
/*
|
||||
* Copyright © 2018, Niklas Haas
|
||||
* Copyright © 2018, VideoLAN and dav1d authors
|
||||
* Copyright © 2018, Two Orioles, LLC
|
||||
* All rights reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright notice, this
|
||||
* list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
* this list of conditions and the following disclaimer in the documentation
|
||||
* and/or other materials provided with the distribution.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
|
||||
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
|
||||
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
|
||||
* ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
|
||||
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
|
||||
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
|
||||
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
#include "config.h"
|
||||
|
||||
#include <stdint.h>
|
||||
|
||||
#include "dav1d/picture.h"
|
||||
|
||||
#include "common.h"
|
||||
#include "common/intops.h"
|
||||
#include "common/bitdepth.h"
|
||||
|
||||
#include "fg_apply.h"
|
||||
|
||||
static void generate_scaling(const int bitdepth,
|
||||
const uint8_t points[][2], const int num,
|
||||
uint8_t scaling[SCALING_SIZE])
|
||||
{
|
||||
const int shift_x = bitdepth - 8;
|
||||
const int scaling_size = 1 << bitdepth;
|
||||
const int pad = 1 << shift_x;
|
||||
|
||||
// Fill up the preceding entries with the initial value
|
||||
for (int i = 0; i < points[0][0] << shift_x; i++)
|
||||
scaling[i] = points[0][1];
|
||||
|
||||
// Linearly interpolate the values in the middle
|
||||
for (int i = 0; i < num - 1; i++) {
|
||||
const int bx = points[i][0];
|
||||
const int by = points[i][1];
|
||||
const int ex = points[i+1][0];
|
||||
const int ey = points[i+1][1];
|
||||
const int dx = ex - bx;
|
||||
const int dy = ey - by;
|
||||
const int delta = dy * ((0x10000 + (dx >> 1)) / dx);
|
||||
for (int x = 0; x < dx; x++) {
|
||||
const int v = by + ((x * delta + 0x8000) >> 16);
|
||||
scaling[(bx + x) << shift_x] = v;
|
||||
}
|
||||
}
|
||||
|
||||
// Fill up the remaining entries with the final value
|
||||
for (int i = points[num - 1][0] << shift_x; i < scaling_size; i++)
|
||||
scaling[i] = points[num - 1][1];
|
||||
|
||||
if (pad <= 1) return;
|
||||
|
||||
const int rnd = pad >> 1;
|
||||
for (int i = 0; i < num - 1; i++) {
|
||||
const int bx = points[i][0] << shift_x;
|
||||
const int ex = points[i+1][0] << shift_x;
|
||||
const int dx = ex - bx;
|
||||
for (int x = 0; x < dx; x += pad) {
|
||||
const int range = scaling[bx + x + pad] - scaling[bx + x];
|
||||
for (int n = 1; n < pad; n++) {
|
||||
scaling[bx + x + n] = scaling[bx + x] + ((range * n + rnd) >> shift_x);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#ifndef UNIT_TEST
|
||||
void bitfn(dav1d_apply_grain)(const Dav1dFilmGrainDSPContext *const dsp,
|
||||
Dav1dPicture *const out,
|
||||
const Dav1dPicture *const in)
|
||||
{
|
||||
const Dav1dFilmGrainData *const data = &out->frame_hdr->film_grain.data;
|
||||
|
||||
entry grain_lut[3][GRAIN_HEIGHT + 1][GRAIN_WIDTH];
|
||||
uint8_t scaling[3][SCALING_SIZE];
|
||||
#if BITDEPTH != 8
|
||||
const int bitdepth_max = (1 << out->p.bpc) - 1;
|
||||
#endif
|
||||
|
||||
// Generate grain LUTs as needed
|
||||
dsp->generate_grain_y(grain_lut[0], data HIGHBD_TAIL_SUFFIX); // always needed
|
||||
if (data->num_uv_points[0] || data->chroma_scaling_from_luma)
|
||||
dsp->generate_grain_uv[in->p.layout - 1](grain_lut[1], grain_lut[0],
|
||||
data, 0 HIGHBD_TAIL_SUFFIX);
|
||||
if (data->num_uv_points[1] || data->chroma_scaling_from_luma)
|
||||
dsp->generate_grain_uv[in->p.layout - 1](grain_lut[2], grain_lut[0],
|
||||
data, 1 HIGHBD_TAIL_SUFFIX);
|
||||
|
||||
// Generate scaling LUTs as needed
|
||||
if (data->num_y_points)
|
||||
generate_scaling(in->p.bpc, data->y_points, data->num_y_points, scaling[0]);
|
||||
if (data->num_uv_points[0])
|
||||
generate_scaling(in->p.bpc, data->uv_points[0], data->num_uv_points[0], scaling[1]);
|
||||
if (data->num_uv_points[1])
|
||||
generate_scaling(in->p.bpc, data->uv_points[1], data->num_uv_points[1], scaling[2]);
|
||||
|
||||
// Copy over the non-modified planes
|
||||
// TODO: eliminate in favor of per-plane refs
|
||||
assert(out->stride[0] == in->stride[0]);
|
||||
if (!data->num_y_points) {
|
||||
memcpy(out->data[0], in->data[0], out->p.h * out->stride[0]);
|
||||
}
|
||||
|
||||
if (in->p.layout != DAV1D_PIXEL_LAYOUT_I400) {
|
||||
assert(out->stride[1] == in->stride[1]);
|
||||
for (int i = 0; i < 2; i++) {
|
||||
if (!data->num_uv_points[i] && !data->chroma_scaling_from_luma) {
|
||||
const int suby = in->p.layout == DAV1D_PIXEL_LAYOUT_I420;
|
||||
memcpy(out->data[1+i], in->data[1+i],
|
||||
(out->p.h >> suby) * out->stride[1]);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Synthesize grain for the affected planes
|
||||
const int rows = (out->p.h + 31) >> 5;
|
||||
const int ss_y = in->p.layout == DAV1D_PIXEL_LAYOUT_I420;
|
||||
const int ss_x = in->p.layout != DAV1D_PIXEL_LAYOUT_I444;
|
||||
const int cpw = (out->p.w + ss_x) >> ss_x;
|
||||
const int is_id = out->seq_hdr->mtrx == DAV1D_MC_IDENTITY;
|
||||
for (int row = 0; row < rows; row++) {
|
||||
const pixel *const luma_src =
|
||||
((pixel *) in->data[0]) + row * BLOCK_SIZE * PXSTRIDE(in->stride[0]);
|
||||
|
||||
if (data->num_y_points) {
|
||||
const int bh = imin(out->p.h - row * BLOCK_SIZE, BLOCK_SIZE);
|
||||
dsp->fgy_32x32xn(((pixel *) out->data[0]) + row * BLOCK_SIZE * PXSTRIDE(out->stride[0]),
|
||||
luma_src, out->stride[0], data,
|
||||
out->p.w, scaling[0], grain_lut[0], bh, row HIGHBD_TAIL_SUFFIX);
|
||||
}
|
||||
|
||||
const int bh = (imin(out->p.h - row * BLOCK_SIZE, BLOCK_SIZE) + ss_y) >> ss_y;
|
||||
const ptrdiff_t uv_off = row * BLOCK_SIZE * PXSTRIDE(out->stride[1]) >> ss_y;
|
||||
if (data->chroma_scaling_from_luma) {
|
||||
for (int pl = 0; pl < 2; pl++)
|
||||
dsp->fguv_32x32xn[in->p.layout - 1](((pixel *) out->data[1 + pl]) + uv_off,
|
||||
((const pixel *) in->data[1 + pl]) + uv_off,
|
||||
in->stride[1], data, cpw,
|
||||
scaling[0], grain_lut[1 + pl],
|
||||
bh, row, luma_src, in->stride[0],
|
||||
pl, is_id HIGHBD_TAIL_SUFFIX);
|
||||
} else {
|
||||
for (int pl = 0; pl < 2; pl++)
|
||||
if (data->num_uv_points[pl])
|
||||
dsp->fguv_32x32xn[in->p.layout - 1](((pixel *) out->data[1 + pl]) + uv_off,
|
||||
((const pixel *) in->data[1 + pl]) + uv_off,
|
||||
in->stride[1], data, cpw,
|
||||
scaling[1 + pl], grain_lut[1 + pl],
|
||||
bh, row, luma_src, in->stride[0],
|
||||
pl, is_id HIGHBD_TAIL_SUFFIX);
|
||||
}
|
||||
}
|
||||
}
|
||||
#endif
|
55
third_party/dav1d/src/film_grain.h
vendored
55
third_party/dav1d/src/film_grain.h
vendored
@ -28,9 +28,58 @@
|
||||
#ifndef DAV1D_SRC_FILM_GRAIN_H
|
||||
#define DAV1D_SRC_FILM_GRAIN_H
|
||||
|
||||
#include "dav1d/dav1d.h"
|
||||
#include "common/bitdepth.h"
|
||||
|
||||
bitfn_decls(void dav1d_apply_grain, Dav1dPicture *const out,
|
||||
const Dav1dPicture *const in);
|
||||
#include "src/levels.h"
|
||||
|
||||
#define GRAIN_WIDTH 82
|
||||
#define GRAIN_HEIGHT 73
|
||||
#define BLOCK_SIZE 32
|
||||
#if !defined(BITDEPTH) || BITDEPTH == 8
|
||||
#define SCALING_SIZE 256
|
||||
typedef int8_t entry;
|
||||
#else
|
||||
#define SCALING_SIZE 4096
|
||||
typedef int16_t entry;
|
||||
#endif
|
||||
|
||||
#define decl_generate_grain_y_fn(name) \
|
||||
void (name)(entry buf[][GRAIN_WIDTH], \
|
||||
const Dav1dFilmGrainData *const data HIGHBD_DECL_SUFFIX)
|
||||
typedef decl_generate_grain_y_fn(*generate_grain_y_fn);
|
||||
|
||||
#define decl_generate_grain_uv_fn(name) \
|
||||
void (name)(entry buf[][GRAIN_WIDTH], \
|
||||
const entry buf_y[][GRAIN_WIDTH], \
|
||||
const Dav1dFilmGrainData *const data, const int uv HIGHBD_DECL_SUFFIX)
|
||||
typedef decl_generate_grain_uv_fn(*generate_grain_uv_fn);
|
||||
|
||||
#define decl_fgy_32x32xn_fn(name) \
|
||||
void (name)(pixel *dst_row, const pixel *src_row, ptrdiff_t stride, \
|
||||
const Dav1dFilmGrainData *data, \
|
||||
size_t pw, const uint8_t scaling[SCALING_SIZE], \
|
||||
const entry grain_lut[][GRAIN_WIDTH], \
|
||||
int bh, int row_num HIGHBD_DECL_SUFFIX)
|
||||
typedef decl_fgy_32x32xn_fn(*fgy_32x32xn_fn);
|
||||
|
||||
#define decl_fguv_32x32xn_fn(name) \
|
||||
void (name)(pixel *dst_row, const pixel *src_row, ptrdiff_t stride, \
|
||||
const Dav1dFilmGrainData *data, int pw, \
|
||||
const uint8_t scaling[SCALING_SIZE], \
|
||||
const entry grain_lut[][GRAIN_WIDTH], int bh, int row_num, \
|
||||
const pixel *luma_row, ptrdiff_t luma_stride, \
|
||||
int uv_pl, int is_id HIGHBD_DECL_SUFFIX)
|
||||
typedef decl_fguv_32x32xn_fn(*fguv_32x32xn_fn);
|
||||
|
||||
typedef struct Dav1dFilmGrainDSPContext {
|
||||
generate_grain_y_fn generate_grain_y;
|
||||
generate_grain_uv_fn generate_grain_uv[3];
|
||||
|
||||
fgy_32x32xn_fn fgy_32x32xn;
|
||||
fguv_32x32xn_fn fguv_32x32xn[3];
|
||||
} Dav1dFilmGrainDSPContext;
|
||||
|
||||
bitfn_decls(void dav1d_film_grain_dsp_init, Dav1dFilmGrainDSPContext *c);
|
||||
bitfn_decls(void dav1d_film_grain_dsp_init_x86, Dav1dFilmGrainDSPContext *c);
|
||||
|
||||
#endif /* DAV1D_SRC_FILM_GRAIN_H */
|
||||
|
296
third_party/dav1d/src/film_grain_tmpl.c
vendored
296
third_party/dav1d/src/film_grain_tmpl.c
vendored
@ -26,39 +26,16 @@
|
||||
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
#include "config.h"
|
||||
|
||||
#include <assert.h>
|
||||
#include <stdint.h>
|
||||
|
||||
#include "common.h"
|
||||
#include "common/attributes.h"
|
||||
#include "common/intops.h"
|
||||
#include "common/bitdepth.h"
|
||||
#include "tables.h"
|
||||
|
||||
#include "film_grain.h"
|
||||
#include "tables.h"
|
||||
|
||||
#if BITDEPTH == 8
|
||||
typedef int8_t entry;
|
||||
#else
|
||||
typedef int16_t entry;
|
||||
#endif
|
||||
#define SUB_GRAIN_WIDTH 44
|
||||
#define SUB_GRAIN_HEIGHT 38
|
||||
|
||||
enum {
|
||||
GRAIN_WIDTH = 82,
|
||||
GRAIN_HEIGHT = 73,
|
||||
SUB_GRAIN_WIDTH = 44,
|
||||
SUB_GRAIN_HEIGHT = 38,
|
||||
SUB_GRAIN_OFFSET = 6,
|
||||
BLOCK_SIZE = 32,
|
||||
#if BITDEPTH == 8
|
||||
SCALING_SIZE = 256
|
||||
#else
|
||||
SCALING_SIZE = 4096
|
||||
#endif
|
||||
};
|
||||
|
||||
static inline int get_random_number(const int bits, unsigned *state) {
|
||||
static inline int get_random_number(const int bits, unsigned *const state) {
|
||||
const int r = *state;
|
||||
unsigned bit = ((r >> 0) ^ (r >> 1) ^ (r >> 3) ^ (r >> 12)) & 1;
|
||||
*state = (r >> 1) | (bit << 15);
|
||||
@ -70,13 +47,14 @@ static inline int round2(const int x, const int shift) {
|
||||
return (x + ((1 << shift) >> 1)) >> shift;
|
||||
}
|
||||
|
||||
static void generate_grain_y(const Dav1dPicture *const in,
|
||||
entry buf[GRAIN_HEIGHT][GRAIN_WIDTH])
|
||||
static void generate_grain_y_c(entry buf[][GRAIN_WIDTH],
|
||||
const Dav1dFilmGrainData *const data
|
||||
HIGHBD_DECL_SUFFIX)
|
||||
{
|
||||
const Dav1dFilmGrainData *data = &in->frame_hdr->film_grain.data;
|
||||
const int bitdepth_min_8 = bitdepth_from_max(bitdepth_max) - 8;
|
||||
unsigned seed = data->seed;
|
||||
const int shift = 12 - in->p.bpc + data->grain_scale_shift;
|
||||
const int grain_ctr = 128 << (in->p.bpc - 8);
|
||||
const int shift = 4 - bitdepth_min_8 + data->grain_scale_shift;
|
||||
const int grain_ctr = 128 << bitdepth_min_8;
|
||||
const int grain_min = -grain_ctr, grain_max = grain_ctr - 1;
|
||||
|
||||
for (int y = 0; y < GRAIN_HEIGHT; y++) {
|
||||
@ -101,25 +79,24 @@ static void generate_grain_y(const Dav1dPicture *const in,
|
||||
}
|
||||
}
|
||||
|
||||
int grain = buf[y][x] + round2(sum, data->ar_coeff_shift);
|
||||
const int grain = buf[y][x] + round2(sum, data->ar_coeff_shift);
|
||||
buf[y][x] = iclip(grain, grain_min, grain_max);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static void generate_grain_uv(const Dav1dPicture *const in, int uv,
|
||||
entry buf[GRAIN_HEIGHT][GRAIN_WIDTH],
|
||||
entry buf_y[GRAIN_HEIGHT][GRAIN_WIDTH])
|
||||
static NOINLINE void
|
||||
generate_grain_uv_c(entry buf[][GRAIN_WIDTH],
|
||||
const entry buf_y[][GRAIN_WIDTH],
|
||||
const Dav1dFilmGrainData *const data, const int uv,
|
||||
const int subx, const int suby HIGHBD_DECL_SUFFIX)
|
||||
{
|
||||
const Dav1dFilmGrainData *data = &in->frame_hdr->film_grain.data;
|
||||
const int bitdepth_min_8 = bitdepth_from_max(bitdepth_max) - 8;
|
||||
unsigned seed = data->seed ^ (uv ? 0x49d8 : 0xb524);
|
||||
const int shift = 12 - in->p.bpc + data->grain_scale_shift;
|
||||
const int grain_ctr = 128 << (in->p.bpc - 8);
|
||||
const int shift = 4 - bitdepth_min_8 + data->grain_scale_shift;
|
||||
const int grain_ctr = 128 << bitdepth_min_8;
|
||||
const int grain_min = -grain_ctr, grain_max = grain_ctr - 1;
|
||||
|
||||
const int subx = in->p.layout != DAV1D_PIXEL_LAYOUT_I444;
|
||||
const int suby = in->p.layout == DAV1D_PIXEL_LAYOUT_I420;
|
||||
|
||||
const int chromaW = subx ? SUB_GRAIN_WIDTH : GRAIN_WIDTH;
|
||||
const int chromaH = suby ? SUB_GRAIN_HEIGHT : GRAIN_HEIGHT;
|
||||
|
||||
@ -167,40 +144,18 @@ static void generate_grain_uv(const Dav1dPicture *const in, int uv,
|
||||
}
|
||||
}
|
||||
|
||||
static void generate_scaling(const int bitdepth,
|
||||
const uint8_t points[][2], int num,
|
||||
uint8_t scaling[SCALING_SIZE])
|
||||
{
|
||||
const int shift_x = bitdepth - 8;
|
||||
const int scaling_size = 1 << bitdepth;
|
||||
|
||||
// Fill up the preceding entries with the initial value
|
||||
for (int i = 0; i < points[0][0] << shift_x; i++)
|
||||
scaling[i] = points[0][1];
|
||||
|
||||
// Linearly interpolate the values in the middle
|
||||
for (int i = 0; i < num - 1; i++) {
|
||||
const int bx = points[i][0] << shift_x;
|
||||
const int by = points[i][1];
|
||||
const int ex = points[i+1][0] << shift_x;
|
||||
const int ey = points[i+1][1];
|
||||
const int dx = ex - bx;
|
||||
const int dy = ey - by;
|
||||
const int delta = dy * ((0xFFFF + (dx >> 1))) / dx;
|
||||
for (int x = 0; x < dx; x++) {
|
||||
const int v = by + ((x * delta + 0x8000) >> 16);
|
||||
scaling[bx + x] = v;
|
||||
}
|
||||
}
|
||||
|
||||
// Fill up the remaining entries with the final value
|
||||
for (int i = points[num - 1][0] << shift_x; i < scaling_size; i++)
|
||||
scaling[i] = points[num - 1][1];
|
||||
#define gnuv_ss_fn(nm, ss_x, ss_y) \
|
||||
static decl_generate_grain_uv_fn(generate_grain_uv_##nm##_c) { \
|
||||
generate_grain_uv_c(buf, buf_y, data, uv, ss_x, ss_y HIGHBD_TAIL_SUFFIX); \
|
||||
}
|
||||
|
||||
gnuv_ss_fn(420, 1, 1);
|
||||
gnuv_ss_fn(422, 1, 0);
|
||||
gnuv_ss_fn(444, 0, 0);
|
||||
|
||||
// samples from the correct block of a grain LUT, while taking into account the
|
||||
// offsets provided by the offsets cache
|
||||
static inline entry sample_lut(entry grain_lut[GRAIN_HEIGHT][GRAIN_WIDTH],
|
||||
static inline entry sample_lut(const entry grain_lut[][GRAIN_WIDTH],
|
||||
int offsets[2][2], int subx, int suby,
|
||||
int bx, int by, int x, int y)
|
||||
{
|
||||
@ -211,13 +166,15 @@ static inline entry sample_lut(entry grain_lut[GRAIN_HEIGHT][GRAIN_WIDTH],
|
||||
[offx + x + (BLOCK_SIZE >> subx) * bx];
|
||||
}
|
||||
|
||||
static void apply_to_row_y(Dav1dPicture *const out, const Dav1dPicture *const in,
|
||||
entry grain_lut[GRAIN_HEIGHT][GRAIN_WIDTH],
|
||||
uint8_t scaling[SCALING_SIZE], int row_num)
|
||||
static void fgy_32x32xn_c(pixel *const dst_row, const pixel *const src_row,
|
||||
const ptrdiff_t stride,
|
||||
const Dav1dFilmGrainData *const data, const size_t pw,
|
||||
const uint8_t scaling[SCALING_SIZE],
|
||||
const entry grain_lut[][GRAIN_WIDTH],
|
||||
const int bh, const int row_num HIGHBD_DECL_SUFFIX)
|
||||
{
|
||||
const Dav1dFilmGrainData *const data = &out->frame_hdr->film_grain.data;
|
||||
const int rows = 1 + (data->overlap_flag && row_num > 0);
|
||||
const int bitdepth_min_8 = in->p.bpc - 8;
|
||||
const int bitdepth_min_8 = bitdepth_from_max(bitdepth_max) - 8;
|
||||
const int grain_ctr = 128 << bitdepth_min_8;
|
||||
const int grain_min = -grain_ctr, grain_max = grain_ctr - 1;
|
||||
|
||||
@ -227,7 +184,11 @@ static void apply_to_row_y(Dav1dPicture *const out, const Dav1dPicture *const in
|
||||
max_value = 235 << bitdepth_min_8;
|
||||
} else {
|
||||
min_value = 0;
|
||||
max_value = (1U << in->p.bpc) - 1;
|
||||
#if BITDEPTH == 8
|
||||
max_value = 0xff;
|
||||
#else
|
||||
max_value = bitdepth_max;
|
||||
#endif
|
||||
}
|
||||
|
||||
// seed[0] contains the current row, seed[1] contains the previous
|
||||
@ -238,18 +199,13 @@ static void apply_to_row_y(Dav1dPicture *const out, const Dav1dPicture *const in
|
||||
seed[i] ^= (((row_num - i) * 173 + 105) & 0xFF);
|
||||
}
|
||||
|
||||
const ptrdiff_t stride = out->stride[0];
|
||||
assert(stride % (BLOCK_SIZE * sizeof(pixel)) == 0);
|
||||
assert(stride == in->stride[0]);
|
||||
pixel *const src_row = (pixel *) in->data[0] + PXSTRIDE(stride) * row_num * BLOCK_SIZE;
|
||||
pixel *const dst_row = (pixel *) out->data[0] + PXSTRIDE(stride) * row_num * BLOCK_SIZE;
|
||||
|
||||
int offsets[2 /* col offset */][2 /* row offset */];
|
||||
|
||||
// process this row in BLOCK_SIZE^2 blocks
|
||||
const int bh = imin(out->p.h - row_num * BLOCK_SIZE, BLOCK_SIZE);
|
||||
for (int bx = 0; bx < out->p.w; bx += BLOCK_SIZE) {
|
||||
const int bw = imin(BLOCK_SIZE, out->p.w - bx);
|
||||
for (unsigned bx = 0; bx < pw; bx += BLOCK_SIZE) {
|
||||
const int bw = imin(BLOCK_SIZE, (int) pw - bx);
|
||||
|
||||
if (data->overlap_flag && bx) {
|
||||
// shift previous offsets left
|
||||
@ -267,11 +223,11 @@ static void apply_to_row_y(Dav1dPicture *const out, const Dav1dPicture *const in
|
||||
|
||||
static const int w[2][2] = { { 27, 17 }, { 17, 27 } };
|
||||
|
||||
#define add_noise_y(x, y, grain) \
|
||||
pixel *src = src_row + (y) * PXSTRIDE(stride) + (bx + (x)); \
|
||||
pixel *dst = dst_row + (y) * PXSTRIDE(stride) + (bx + (x)); \
|
||||
int noise = round2(scaling[ *src ] * (grain), data->scaling_shift); \
|
||||
*dst = iclip(*src + noise, min_value, max_value);
|
||||
#define add_noise_y(x, y, grain) \
|
||||
const pixel *const src = src_row + (y) * PXSTRIDE(stride) + (x) + bx; \
|
||||
pixel *const dst = dst_row + (y) * PXSTRIDE(stride) + (x) + bx; \
|
||||
const int noise = round2(scaling[ *src ] * (grain), data->scaling_shift); \
|
||||
*dst = iclip(*src + noise, min_value, max_value);
|
||||
|
||||
for (int y = ystart; y < bh; y++) {
|
||||
// Non-overlapped image region (straightforward)
|
||||
@ -323,33 +279,33 @@ static void apply_to_row_y(Dav1dPicture *const out, const Dav1dPicture *const in
|
||||
}
|
||||
}
|
||||
|
||||
static void apply_to_row_uv(Dav1dPicture *const out, const Dav1dPicture *const in,
|
||||
entry grain_lut[GRAIN_HEIGHT][GRAIN_WIDTH],
|
||||
uint8_t scaling[SCALING_SIZE], int uv, int row_num)
|
||||
static NOINLINE void
|
||||
fguv_32x32xn_c(pixel *const dst_row, const pixel *const src_row,
|
||||
const ptrdiff_t stride, const Dav1dFilmGrainData *const data,
|
||||
const int pw, const uint8_t scaling[SCALING_SIZE],
|
||||
const entry grain_lut[][GRAIN_WIDTH], const int bh,
|
||||
const int row_num, const pixel *const luma_row,
|
||||
const ptrdiff_t luma_stride, const int uv, const int is_id,
|
||||
const int sx, const int sy HIGHBD_DECL_SUFFIX)
|
||||
{
|
||||
const Dav1dFilmGrainData *const data = &out->frame_hdr->film_grain.data;
|
||||
const int rows = 1 + (data->overlap_flag && row_num > 0);
|
||||
const int bitdepth_max = (1 << in->p.bpc) - 1;
|
||||
const int bitdepth_min_8 = in->p.bpc - 8;
|
||||
const int bitdepth_min_8 = bitdepth_from_max(bitdepth_max) - 8;
|
||||
const int grain_ctr = 128 << bitdepth_min_8;
|
||||
const int grain_min = -grain_ctr, grain_max = grain_ctr - 1;
|
||||
|
||||
int min_value, max_value;
|
||||
if (data->clip_to_restricted_range) {
|
||||
min_value = 16 << bitdepth_min_8;
|
||||
if (out->seq_hdr->mtrx == DAV1D_MC_IDENTITY) {
|
||||
max_value = 235 << bitdepth_min_8;
|
||||
} else {
|
||||
max_value = 240 << bitdepth_min_8;
|
||||
}
|
||||
max_value = (is_id ? 235 : 240) << bitdepth_min_8;
|
||||
} else {
|
||||
min_value = 0;
|
||||
#if BITDEPTH == 8
|
||||
max_value = 0xff;
|
||||
#else
|
||||
max_value = bitdepth_max;
|
||||
#endif
|
||||
}
|
||||
|
||||
const int sx = in->p.layout != DAV1D_PIXEL_LAYOUT_I444;
|
||||
const int sy = in->p.layout == DAV1D_PIXEL_LAYOUT_I420;
|
||||
|
||||
// seed[0] contains the current row, seed[1] contains the previous
|
||||
unsigned seed[2];
|
||||
for (int i = 0; i < rows; i++) {
|
||||
@ -358,21 +314,13 @@ static void apply_to_row_uv(Dav1dPicture *const out, const Dav1dPicture *const i
|
||||
seed[i] ^= (((row_num - i) * 173 + 105) & 0xFF);
|
||||
}
|
||||
|
||||
const ptrdiff_t stride = out->stride[1];
|
||||
assert(stride % (BLOCK_SIZE * sizeof(pixel)) == 0);
|
||||
assert(stride == in->stride[1]);
|
||||
|
||||
const int by = row_num * (BLOCK_SIZE >> sy);
|
||||
pixel *const dst_row = (pixel *) out->data[1 + uv] + PXSTRIDE(stride) * by;
|
||||
pixel *const src_row = (pixel *) in->data[1 + uv] + PXSTRIDE(stride) * by;
|
||||
pixel *const luma_row = (pixel *) out->data[0] + PXSTRIDE(out->stride[0]) * row_num * BLOCK_SIZE;
|
||||
|
||||
int offsets[2 /* col offset */][2 /* row offset */];
|
||||
|
||||
// process this row in BLOCK_SIZE^2 blocks (subsampled)
|
||||
const int bh = (imin(out->p.h - row_num * BLOCK_SIZE, BLOCK_SIZE) + sy) >> sy;
|
||||
for (int bx = 0; bx < (out->p.w + sx) >> sx; bx += BLOCK_SIZE >> sx) {
|
||||
const int bw = (imin(BLOCK_SIZE, out->p.w - (bx << sx)) + sx) >> sx;
|
||||
for (int bx = 0; bx < pw; bx += BLOCK_SIZE >> sx) {
|
||||
const int bw = imin(BLOCK_SIZE >> sx, pw - bx);
|
||||
if (data->overlap_flag && bx) {
|
||||
// shift previous offsets left
|
||||
for (int i = 0; i < rows; i++)
|
||||
@ -392,25 +340,23 @@ static void apply_to_row_uv(Dav1dPicture *const out, const Dav1dPicture *const i
|
||||
{ { 23, 22 } },
|
||||
};
|
||||
|
||||
#define add_noise_uv(x, y, grain) \
|
||||
const int lx = (bx + x) << sx; \
|
||||
const int ly = y << sy; \
|
||||
pixel *luma = luma_row + ly * PXSTRIDE(out->stride[0]) + lx; \
|
||||
pixel avg = luma[0]; \
|
||||
if (sx && lx + 1 < out->p.w) \
|
||||
avg = (avg + luma[1] + 1) >> 1; \
|
||||
\
|
||||
pixel *src = src_row + (y) * PXSTRIDE(stride) + (bx + (x)); \
|
||||
pixel *dst = dst_row + (y) * PXSTRIDE(stride) + (bx + (x)); \
|
||||
int val = avg; \
|
||||
if (!data->chroma_scaling_from_luma) { \
|
||||
int combined = avg * data->uv_luma_mult[uv] + \
|
||||
*src * data->uv_mult[uv]; \
|
||||
val = iclip_pixel( (combined >> 6) + \
|
||||
(data->uv_offset[uv] * (1 << bitdepth_min_8)) ); \
|
||||
} \
|
||||
\
|
||||
int noise = round2(scaling[ val ] * (grain), data->scaling_shift); \
|
||||
#define add_noise_uv(x, y, grain) \
|
||||
const int lx = (bx + x) << sx; \
|
||||
const int ly = y << sy; \
|
||||
const pixel *const luma = luma_row + ly * PXSTRIDE(luma_stride) + lx; \
|
||||
pixel avg = luma[0]; \
|
||||
if (sx) \
|
||||
avg = (avg + luma[1] + 1) >> 1; \
|
||||
const pixel *const src = src_row + (y) * PXSTRIDE(stride) + (bx + (x)); \
|
||||
pixel *const dst = dst_row + (y) * PXSTRIDE(stride) + (bx + (x)); \
|
||||
int val = avg; \
|
||||
if (!data->chroma_scaling_from_luma) { \
|
||||
const int combined = avg * data->uv_luma_mult[uv] + \
|
||||
*src * data->uv_mult[uv]; \
|
||||
val = iclip_pixel( (combined >> 6) + \
|
||||
(data->uv_offset[uv] * (1 << bitdepth_min_8)) ); \
|
||||
} \
|
||||
const int noise = round2(scaling[ val ] * (grain), data->scaling_shift); \
|
||||
*dst = iclip(*src + noise, min_value, max_value);
|
||||
|
||||
for (int y = ystart; y < bh; y++) {
|
||||
@ -463,61 +409,29 @@ static void apply_to_row_uv(Dav1dPicture *const out, const Dav1dPicture *const i
|
||||
}
|
||||
}
|
||||
|
||||
void bitfn(dav1d_apply_grain)(Dav1dPicture *const out,
|
||||
const Dav1dPicture *const in)
|
||||
{
|
||||
const Dav1dFilmGrainData *const data = &out->frame_hdr->film_grain.data;
|
||||
|
||||
entry grain_lut[3][GRAIN_HEIGHT][GRAIN_WIDTH];
|
||||
uint8_t scaling[3][SCALING_SIZE];
|
||||
|
||||
// Generate grain LUTs as needed
|
||||
generate_grain_y(out, grain_lut[0]); // always needed
|
||||
if (data->num_uv_points[0] || data->chroma_scaling_from_luma)
|
||||
generate_grain_uv(out, 0, grain_lut[1], grain_lut[0]);
|
||||
if (data->num_uv_points[1] || data->chroma_scaling_from_luma)
|
||||
generate_grain_uv(out, 1, grain_lut[2], grain_lut[0]);
|
||||
|
||||
// Generate scaling LUTs as needed
|
||||
if (data->num_y_points)
|
||||
generate_scaling(in->p.bpc, data->y_points, data->num_y_points, scaling[0]);
|
||||
if (data->num_uv_points[0])
|
||||
generate_scaling(in->p.bpc, data->uv_points[0], data->num_uv_points[0], scaling[1]);
|
||||
if (data->num_uv_points[1])
|
||||
generate_scaling(in->p.bpc, data->uv_points[1], data->num_uv_points[1], scaling[2]);
|
||||
|
||||
// Copy over the non-modified planes
|
||||
// TODO: eliminate in favor of per-plane refs
|
||||
if (!data->num_y_points) {
|
||||
assert(out->stride[0] == in->stride[0]);
|
||||
memcpy(out->data[0], in->data[0], out->p.h * out->stride[0]);
|
||||
}
|
||||
|
||||
if (in->p.layout != DAV1D_PIXEL_LAYOUT_I400) {
|
||||
for (int i = 0; i < 2; i++) {
|
||||
if (!data->num_uv_points[i] && !data->chroma_scaling_from_luma) {
|
||||
const int suby = in->p.layout == DAV1D_PIXEL_LAYOUT_I420;
|
||||
assert(out->stride[1] == in->stride[1]);
|
||||
memcpy(out->data[1+i], in->data[1+i],
|
||||
(out->p.h >> suby) * out->stride[1]);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Synthesize grain for the affected planes
|
||||
int rows = (out->p.h + 31) >> 5;
|
||||
for (int row = 0; row < rows; row++) {
|
||||
if (data->num_y_points)
|
||||
apply_to_row_y(out, in, grain_lut[0], scaling[0], row);
|
||||
|
||||
if (data->chroma_scaling_from_luma) {
|
||||
apply_to_row_uv(out, in, grain_lut[1], scaling[0], 0, row);
|
||||
apply_to_row_uv(out, in, grain_lut[2], scaling[0], 1, row);
|
||||
} else {
|
||||
if (data->num_uv_points[0])
|
||||
apply_to_row_uv(out, in, grain_lut[1], scaling[1], 0, row);
|
||||
if (data->num_uv_points[1])
|
||||
apply_to_row_uv(out, in, grain_lut[2], scaling[2], 1, row);
|
||||
}
|
||||
}
|
||||
#define fguv_ss_fn(nm, ss_x, ss_y) \
|
||||
static decl_fguv_32x32xn_fn(fguv_32x32xn_##nm##_c) { \
|
||||
fguv_32x32xn_c(dst_row, src_row, stride, data, pw, scaling, grain_lut, bh, \
|
||||
row_num, luma_row, luma_stride, uv_pl, is_id, ss_x, ss_y \
|
||||
HIGHBD_TAIL_SUFFIX); \
|
||||
}
|
||||
|
||||
fguv_ss_fn(420, 1, 1);
|
||||
fguv_ss_fn(422, 1, 0);
|
||||
fguv_ss_fn(444, 0, 0);
|
||||
|
||||
/*
 * Populates the film-grain DSP function table `c` with the C implementations
 * defined in this file. When built with assembly support on x86, the table
 * is then handed to the x86 init routine (presumably so it can install
 * optimized replacements; confirm against the x86 init file).
 */
COLD void bitfn(dav1d_film_grain_dsp_init)(Dav1dFilmGrainDSPContext *const c) {
    // Luma: grain generation and grain application.
    c->generate_grain_y = generate_grain_y_c;
    c->fgy_32x32xn = fgy_32x32xn_c;

    // Chroma: both tables are indexed by pixel layout minus one.
    c->generate_grain_uv[DAV1D_PIXEL_LAYOUT_I420 - 1] = generate_grain_uv_420_c;
    c->fguv_32x32xn[DAV1D_PIXEL_LAYOUT_I420 - 1] = fguv_32x32xn_420_c;

    c->generate_grain_uv[DAV1D_PIXEL_LAYOUT_I422 - 1] = generate_grain_uv_422_c;
    c->fguv_32x32xn[DAV1D_PIXEL_LAYOUT_I422 - 1] = fguv_32x32xn_422_c;

    c->generate_grain_uv[DAV1D_PIXEL_LAYOUT_I444 - 1] = generate_grain_uv_444_c;
    c->fguv_32x32xn[DAV1D_PIXEL_LAYOUT_I444 - 1] = fguv_32x32xn_444_c;

#if HAVE_ASM && ARCH_X86
    bitfn(dav1d_film_grain_dsp_init_x86)(c);
#endif
}
|
||||
|
2
third_party/dav1d/src/getbits.c
vendored
2
third_party/dav1d/src/getbits.c
vendored
@ -27,8 +27,6 @@
|
||||
|
||||
#include "config.h"
|
||||
|
||||
#include <assert.h>
|
||||
|
||||
#include "common/intops.h"
|
||||
|
||||
#include "src/getbits.h"
|
||||
|
39
third_party/dav1d/src/internal.h
vendored
39
third_party/dav1d/src/internal.h
vendored
@ -42,6 +42,7 @@ typedef struct Dav1dTileContext Dav1dTileContext;
|
||||
#include "src/cdf.h"
|
||||
#include "src/data.h"
|
||||
#include "src/env.h"
|
||||
#include "src/film_grain.h"
|
||||
#include "src/intra_edge.h"
|
||||
#include "src/ipred.h"
|
||||
#include "src/itx.h"
|
||||
@ -57,6 +58,7 @@ typedef struct Dav1dTileContext Dav1dTileContext;
|
||||
#include "src/thread.h"
|
||||
|
||||
typedef struct Dav1dDSPContext {
|
||||
Dav1dFilmGrainDSPContext fg;
|
||||
Dav1dIntraPredDSPContext ipred;
|
||||
Dav1dMCDSPContext mc;
|
||||
Dav1dInvTxfmDSPContext itx;
|
||||
@ -89,6 +91,8 @@ struct Dav1dContext {
|
||||
Dav1dContentLightLevel *content_light;
|
||||
Dav1dRef *mastering_display_ref;
|
||||
Dav1dMasteringDisplay *mastering_display;
|
||||
Dav1dRef *itut_t35_ref;
|
||||
Dav1dITUTT35 *itut_t35;
|
||||
|
||||
// decoded output picture queue
|
||||
Dav1dData in;
|
||||
@ -213,7 +217,7 @@ struct Dav1dFrameContext {
|
||||
Av1Restoration *lr_mask;
|
||||
int top_pre_cdef_toggle;
|
||||
int mask_sz /* w*h */, lr_mask_sz, line_sz /* w */, lr_line_sz, re_sz /* h */;
|
||||
Av1FilterLUT lim_lut;
|
||||
ALIGN(Av1FilterLUT lim_lut, 16);
|
||||
int last_sharpness;
|
||||
uint8_t lvl[8 /* seg_id */][4 /* dir */][8 /* ref */][2 /* is_gmv */];
|
||||
uint8_t *tx_lpf_right_edge[2];
|
||||
@ -233,20 +237,21 @@ struct Dav1dFrameContext {
|
||||
pthread_cond_t cond, icond;
|
||||
int tasks_left, num_tasks;
|
||||
int (*task_idx_to_sby_and_tile_idx)[2];
|
||||
int titsati_sz, titsati_init[3];
|
||||
int titsati_sz, titsati_init[2];
|
||||
uint16_t titsati_index_rows[1 + DAV1D_MAX_TILE_ROWS];
|
||||
int inited;
|
||||
} tile_thread;
|
||||
};
|
||||
|
||||
struct Dav1dTileState {
|
||||
CdfContext cdf;
|
||||
MsacContext msac;
|
||||
|
||||
struct {
|
||||
int col_start, col_end, row_start, row_end; // in 4px units
|
||||
int col, row; // in tile units
|
||||
} tiling;
|
||||
|
||||
CdfContext cdf;
|
||||
MsacContext msac;
|
||||
|
||||
atomic_int progress; // in sby units, TILE_ERROR after a decoding error
|
||||
struct {
|
||||
pthread_mutex_t lock;
|
||||
@ -299,25 +304,27 @@ struct Dav1dTileContext {
|
||||
};
|
||||
};
|
||||
struct {
|
||||
uint8_t interintra_8bpc[64 * 64];
|
||||
uint8_t edge_8bpc[257];
|
||||
};
|
||||
struct {
|
||||
uint16_t interintra_16bpc[64 * 64];
|
||||
uint16_t edge_16bpc[257];
|
||||
};
|
||||
struct {
|
||||
uint8_t pal_idx[2 * 64 * 64];
|
||||
union {
|
||||
uint8_t levels[32 * 34];
|
||||
struct {
|
||||
uint8_t pal_order[64][8];
|
||||
uint8_t pal_ctx[64];
|
||||
};
|
||||
uint8_t levels[36 * 36];
|
||||
};
|
||||
int16_t ac[32 * 32];
|
||||
uint8_t pal_idx[2 * 64 * 64];
|
||||
uint16_t pal[3 /* plane */][8 /* palette_idx */];
|
||||
ALIGN(union, 32) {
|
||||
struct {
|
||||
uint8_t interintra_8bpc[64 * 64];
|
||||
uint8_t edge_8bpc[257];
|
||||
};
|
||||
struct {
|
||||
uint16_t interintra_16bpc[64 * 64];
|
||||
uint16_t edge_16bpc[257];
|
||||
};
|
||||
};
|
||||
};
|
||||
int16_t ac[32 * 32];
|
||||
} scratch;
|
||||
|
||||
Dav1dWarpedMotionParams warpmv;
|
||||
|
3
third_party/dav1d/src/intra_edge.c
vendored
3
third_party/dav1d/src/intra_edge.c
vendored
@ -27,9 +27,10 @@
|
||||
|
||||
#include "config.h"
|
||||
|
||||
#include <assert.h>
|
||||
#include <stdlib.h>
|
||||
|
||||
#include "common/attributes.h"
|
||||
|
||||
#include "src/intra_edge.h"
|
||||
#include "src/levels.h"
|
||||
|
||||
|
1
third_party/dav1d/src/ipred_prepare_tmpl.c
vendored
1
third_party/dav1d/src/ipred_prepare_tmpl.c
vendored
@ -27,7 +27,6 @@
|
||||
|
||||
#include "config.h"
|
||||
|
||||
#include <assert.h>
|
||||
#include <stdint.h>
|
||||
#include <string.h>
|
||||
|
||||
|
1
third_party/dav1d/src/ipred_tmpl.c
vendored
1
third_party/dav1d/src/ipred_tmpl.c
vendored
@ -27,7 +27,6 @@
|
||||
|
||||
#include "config.h"
|
||||
|
||||
#include <assert.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
|
||||
|
1
third_party/dav1d/src/itx_tmpl.c
vendored
1
third_party/dav1d/src/itx_tmpl.c
vendored
@ -27,7 +27,6 @@
|
||||
|
||||
#include "config.h"
|
||||
|
||||
#include <assert.h>
|
||||
#include <stddef.h>
|
||||
#include <stdint.h>
|
||||
#include <string.h>
|
||||
|
11
third_party/dav1d/src/levels.h
vendored
11
third_party/dav1d/src/levels.h
vendored
@ -109,17 +109,6 @@ enum TxfmType {
|
||||
N_TX_TYPES_PLUS_LL,
|
||||
};
|
||||
|
||||
enum TxfmTypeSet {
|
||||
TXTP_SET_DCT,
|
||||
TXTP_SET_DCT_ID,
|
||||
TXTP_SET_DT4_ID,
|
||||
TXTP_SET_DT4_ID_1D,
|
||||
TXTP_SET_DT9_ID_1D,
|
||||
TXTP_SET_ALL,
|
||||
TXTP_SET_LOSSLESS,
|
||||
N_TXTP_SETS
|
||||
};
|
||||
|
||||
enum TxClass {
|
||||
TX_CLASS_2D,
|
||||
TX_CLASS_H,
|
||||
|
1
third_party/dav1d/src/lf_apply_tmpl.c
vendored
1
third_party/dav1d/src/lf_apply_tmpl.c
vendored
@ -27,7 +27,6 @@
|
||||
|
||||
#include "config.h"
|
||||
|
||||
#include <assert.h>
|
||||
#include <string.h>
|
||||
|
||||
#include "common/intops.h"
|
||||
|
1
third_party/dav1d/src/lf_mask.c
vendored
1
third_party/dav1d/src/lf_mask.c
vendored
@ -27,7 +27,6 @@
|
||||
|
||||
#include "config.h"
|
||||
|
||||
#include <assert.h>
|
||||
#include <string.h>
|
||||
|
||||
#include "common/intops.h"
|
||||
|
12
third_party/dav1d/src/lib.c
vendored
12
third_party/dav1d/src/lib.c
vendored
@ -37,6 +37,7 @@
|
||||
#include "common/mem.h"
|
||||
#include "common/validate.h"
|
||||
|
||||
#include "src/fg_apply.h"
|
||||
#include "src/internal.h"
|
||||
#include "src/log.h"
|
||||
#include "src/obu.h"
|
||||
@ -44,12 +45,12 @@
|
||||
#include "src/ref.h"
|
||||
#include "src/thread_task.h"
|
||||
#include "src/wedge.h"
|
||||
#include "src/film_grain.h"
|
||||
|
||||
static COLD void init_internal(void) {
|
||||
dav1d_init_wedge_masks();
|
||||
dav1d_init_interintra_masks();
|
||||
dav1d_init_qm_tables();
|
||||
dav1d_init_thread();
|
||||
}
|
||||
|
||||
COLD const char *dav1d_version(void) {
|
||||
@ -289,13 +290,13 @@ static int output_image(Dav1dContext *const c, Dav1dPicture *const out,
|
||||
switch (out->p.bpc) {
|
||||
#if CONFIG_8BPC
|
||||
case 8:
|
||||
dav1d_apply_grain_8bpc(out, in);
|
||||
dav1d_apply_grain_8bpc(&c->dsp[0].fg, out, in);
|
||||
break;
|
||||
#endif
|
||||
#if CONFIG_16BPC
|
||||
case 10:
|
||||
case 12:
|
||||
dav1d_apply_grain_16bpc(out, in);
|
||||
dav1d_apply_grain_16bpc(&c->dsp[(out->p.bpc >> 1) - 4].fg, out, in);
|
||||
break;
|
||||
#endif
|
||||
default:
|
||||
@ -409,8 +410,10 @@ void dav1d_flush(Dav1dContext *const c) {
|
||||
|
||||
c->mastering_display = NULL;
|
||||
c->content_light = NULL;
|
||||
c->itut_t35 = NULL;
|
||||
dav1d_ref_dec(&c->mastering_display_ref);
|
||||
dav1d_ref_dec(&c->content_light_ref);
|
||||
dav1d_ref_dec(&c->itut_t35_ref);
|
||||
|
||||
if (c->n_fc == 1) return;
|
||||
|
||||
@ -499,7 +502,7 @@ static COLD void close_internal(Dav1dContext **const c_out, int flush) {
|
||||
pthread_cond_destroy(&ts->tile_thread.cond);
|
||||
pthread_mutex_destroy(&ts->tile_thread.lock);
|
||||
}
|
||||
free(f->ts);
|
||||
dav1d_free_aligned(f->ts);
|
||||
dav1d_free_aligned(f->tc);
|
||||
dav1d_free_aligned(f->ipred_edge[0]);
|
||||
free(f->a);
|
||||
@ -535,6 +538,7 @@ static COLD void close_internal(Dav1dContext **const c_out, int flush) {
|
||||
|
||||
dav1d_ref_dec(&c->mastering_display_ref);
|
||||
dav1d_ref_dec(&c->content_light_ref);
|
||||
dav1d_ref_dec(&c->itut_t35_ref);
|
||||
|
||||
dav1d_freep_aligned(c_out);
|
||||
}
|
||||
|
4
third_party/dav1d/src/looprestoration_tmpl.c
vendored
4
third_party/dav1d/src/looprestoration_tmpl.c
vendored
@ -172,8 +172,8 @@ static void wiener_c(pixel *p, const ptrdiff_t p_stride,
|
||||
const int round_bits_v = 11 - (bitdepth == 12) * 2;
|
||||
const int rounding_off_v = 1 << (round_bits_v - 1);
|
||||
const int round_offset = 1 << (bitdepth + (round_bits_v - 1));
|
||||
for (int i = 0; i < w; i++) {
|
||||
for (int j = 0; j < h; j++) {
|
||||
for (int j = 0; j < h; j++) {
|
||||
for (int i = 0; i < w; i++) {
|
||||
int sum = (hor[(j + 3) * REST_UNIT_STRIDE + i] << 7) - round_offset;
|
||||
|
||||
for (int k = 0; k < 7; k++) {
|
||||
|
2
third_party/dav1d/src/mc_tmpl.c
vendored
2
third_party/dav1d/src/mc_tmpl.c
vendored
@ -27,7 +27,6 @@
|
||||
|
||||
#include "config.h"
|
||||
|
||||
#include <assert.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
|
||||
@ -906,6 +905,7 @@ static void resize_c(pixel *dst, const ptrdiff_t dst_stride,
|
||||
src_x += mx >> 14;
|
||||
mx &= 0x3fff;
|
||||
}
|
||||
if (dst_w & 1) dst[dst_w] = dst[dst_w - 1];
|
||||
|
||||
dst += PXSTRIDE(dst_stride);
|
||||
src += PXSTRIDE(src_stride);
|
||||
|
27
third_party/dav1d/src/meson.build
vendored
27
third_party/dav1d/src/meson.build
vendored
@ -55,6 +55,7 @@ libdav1d_sources = files(
|
||||
libdav1d_tmpl_sources = files(
|
||||
'cdef_apply_tmpl.c',
|
||||
'cdef_tmpl.c',
|
||||
'fg_apply_tmpl.c',
|
||||
'film_grain_tmpl.c',
|
||||
'ipred_prepare_tmpl.c',
|
||||
'ipred_tmpl.c',
|
||||
@ -67,6 +68,10 @@ libdav1d_tmpl_sources = files(
|
||||
'recon_tmpl.c',
|
||||
)
|
||||
|
||||
libdav1d_arch_tmpl_sources = []
|
||||
|
||||
libdav1d_bitdepth_objs = []
|
||||
|
||||
# libdav1d entrypoint source files
|
||||
# These source files contain library entry points and are
|
||||
# built with the stack-realign flag set, where necessary.
|
||||
@ -77,6 +82,8 @@ libdav1d_entrypoints_sources = files(
|
||||
|
||||
# ASM specific sources
|
||||
libdav1d_nasm_objs = []
|
||||
# Arch-specific flags
|
||||
arch_flags = []
|
||||
if is_asm_enabled
|
||||
if (host_machine.cpu_family() == 'aarch64' or
|
||||
host_machine.cpu_family().startswith('arm'))
|
||||
@ -114,6 +121,7 @@ if is_asm_enabled
|
||||
|
||||
libdav1d_tmpl_sources += files(
|
||||
'x86/cdef_init_tmpl.c',
|
||||
'x86/film_grain_init_tmpl.c',
|
||||
'x86/ipred_init_tmpl.c',
|
||||
'x86/itx_init_tmpl.c',
|
||||
'x86/loopfilter_init_tmpl.c',
|
||||
@ -130,6 +138,7 @@ if is_asm_enabled
|
||||
if dav1d_bitdepths.contains('8')
|
||||
libdav1d_sources_asm += files(
|
||||
'x86/cdef.asm',
|
||||
'x86/film_grain.asm',
|
||||
'x86/ipred.asm',
|
||||
'x86/itx.asm',
|
||||
'x86/loopfilter.asm',
|
||||
@ -138,6 +147,7 @@ if is_asm_enabled
|
||||
'x86/cdef_sse.asm',
|
||||
'x86/ipred_ssse3.asm',
|
||||
'x86/itx_ssse3.asm',
|
||||
'x86/loopfilter_ssse3.asm',
|
||||
'x86/looprestoration_ssse3.asm',
|
||||
'x86/mc_ssse3.asm',
|
||||
)
|
||||
@ -151,9 +161,13 @@ if is_asm_enabled
|
||||
# Compile the ASM sources with NASM
|
||||
libdav1d_nasm_objs = nasm_gen.process(libdav1d_sources_asm)
|
||||
elif host_machine.cpu() == 'ppc64le'
|
||||
arch_flags = ['-maltivec', '-mvsx']
|
||||
libdav1d_sources += files(
|
||||
'ppc/cpu.c',
|
||||
)
|
||||
libdav1d_arch_tmpl_sources += files(
|
||||
'ppc/cdef_init_tmpl.c',
|
||||
)
|
||||
endif
|
||||
endif
|
||||
|
||||
@ -223,6 +237,19 @@ foreach bitdepth : dav1d_bitdepths
|
||||
).extract_all_objects()
|
||||
endforeach
|
||||
|
||||
# Helper library for each bitdepth and architecture-specific flags
|
||||
foreach bitdepth : dav1d_bitdepths
|
||||
libdav1d_bitdepth_objs += static_library(
|
||||
'dav1d_arch_bitdepth_@0@'.format(bitdepth),
|
||||
libdav1d_arch_tmpl_sources, config_h_target,
|
||||
include_directories: dav1d_inc_dirs,
|
||||
dependencies : [stdatomic_dependency],
|
||||
c_args : ['-DBITDEPTH=@0@'.format(bitdepth)] + stackalign_flag + arch_flags,
|
||||
install : false,
|
||||
build_by_default : false,
|
||||
).extract_all_objects()
|
||||
endforeach
|
||||
|
||||
# The final dav1d library
|
||||
if host_machine.system() == 'windows'
|
||||
dav1d_soversion = ''
|
||||
|
47
third_party/dav1d/src/msac.c
vendored
47
third_party/dav1d/src/msac.c
vendored
@ -116,42 +116,39 @@ int dav1d_msac_decode_subexp(MsacContext *const s, const int ref,
|
||||
|
||||
/* Decodes a symbol given an inverse cumulative distribution function (CDF)
|
||||
* table in Q15. */
|
||||
static unsigned decode_symbol(MsacContext *const s, const uint16_t *const cdf,
|
||||
const size_t n_symbols)
|
||||
unsigned dav1d_msac_decode_symbol_adapt_c(MsacContext *const s,
|
||||
uint16_t *const cdf,
|
||||
const size_t n_symbols)
|
||||
{
|
||||
const unsigned c = s->dif >> (EC_WIN_SIZE - 16);
|
||||
unsigned u, v = s->rng, r = s->rng >> 8, ret = 0;
|
||||
const unsigned c = s->dif >> (EC_WIN_SIZE - 16), r = s->rng >> 8;
|
||||
unsigned u, v = s->rng, val = -1;
|
||||
|
||||
assert(!cdf[n_symbols - 1]);
|
||||
assert(n_symbols <= 15);
|
||||
assert(cdf[n_symbols] <= 32);
|
||||
|
||||
do {
|
||||
val++;
|
||||
u = v;
|
||||
v = r * (cdf[ret++] >> EC_PROB_SHIFT);
|
||||
v = r * (cdf[val] >> EC_PROB_SHIFT);
|
||||
v >>= 7 - EC_PROB_SHIFT;
|
||||
v += EC_MIN_PROB * (int) (n_symbols - ret);
|
||||
v += EC_MIN_PROB * ((unsigned)n_symbols - val);
|
||||
} while (c < v);
|
||||
|
||||
assert(u <= s->rng);
|
||||
|
||||
ctx_norm(s, s->dif - ((ec_win)v << (EC_WIN_SIZE - 16)), u - v);
|
||||
return ret - 1;
|
||||
}
|
||||
|
||||
unsigned dav1d_msac_decode_symbol_adapt_c(MsacContext *const s,
|
||||
uint16_t *const cdf,
|
||||
const size_t n_symbols)
|
||||
{
|
||||
const unsigned val = decode_symbol(s, cdf, n_symbols);
|
||||
if (s->allow_update_cdf) {
|
||||
const unsigned count = cdf[n_symbols];
|
||||
const int rate = ((count >> 4) | 4) + (n_symbols > 3);
|
||||
const unsigned rate = 4 + (count >> 4) + (n_symbols > 2);
|
||||
unsigned i;
|
||||
for (i = 0; i < val; i++)
|
||||
cdf[i] += (32768 - cdf[i]) >> rate;
|
||||
for (; i < n_symbols - 1; i++)
|
||||
for (; i < n_symbols; i++)
|
||||
cdf[i] -= cdf[i] >> rate;
|
||||
cdf[n_symbols] = count + (count < 32);
|
||||
}
|
||||
|
||||
return val;
|
||||
}
|
||||
|
||||
@ -163,7 +160,7 @@ unsigned dav1d_msac_decode_bool_adapt_c(MsacContext *const s,
|
||||
if (s->allow_update_cdf) {
|
||||
// update_cdf() specialized for boolean CDFs
|
||||
const unsigned count = cdf[1];
|
||||
const int rate = (count >> 4) | 4;
|
||||
const int rate = 4 + (count >> 4);
|
||||
if (bit)
|
||||
cdf[0] += (32768 - cdf[0]) >> rate;
|
||||
else
|
||||
@ -174,6 +171,22 @@ unsigned dav1d_msac_decode_bool_adapt_c(MsacContext *const s,
|
||||
return bit;
|
||||
}
|
||||
|
||||
/*
 * Decodes a "high" token as a chain of up to four 4-ary symbols read with
 * the same adaptive CDF.
 *
 * Each symbol contributes a value in 0..3. A value of 3 on any of the first
 * three symbols means "continue": the base grows by 3 and another symbol is
 * read. The fourth symbol is always final. The result is base + last symbol,
 * with base taking the values 3, 6, 9 and 12.
 */
unsigned dav1d_msac_decode_hi_tok_c(MsacContext *const s, uint16_t *const cdf) {
    unsigned base = 3;

    for (;;) {
        const unsigned sym = dav1d_msac_decode_symbol_adapt4(s, cdf, 3);

        // Stop on a non-escape symbol, or unconditionally after the
        // fourth read (base == 12).
        if (sym != 3 || base == 12)
            return base + sym;

        base += 3;
    }
}
|
||||
|
||||
void dav1d_msac_init(MsacContext *const s, const uint8_t *const data,
|
||||
const size_t sz, const int disable_cdf_update_flag)
|
||||
{
|
||||
|
9
third_party/dav1d/src/msac.h
vendored
9
third_party/dav1d/src/msac.h
vendored
@ -28,10 +28,11 @@
|
||||
#ifndef DAV1D_SRC_MSAC_H
|
||||
#define DAV1D_SRC_MSAC_H
|
||||
|
||||
#include <assert.h>
|
||||
#include <stdint.h>
|
||||
#include <stdlib.h>
|
||||
|
||||
#include "common/attributes.h"
|
||||
|
||||
typedef size_t ec_win;
|
||||
|
||||
typedef struct MsacContext {
|
||||
@ -58,9 +59,10 @@ unsigned dav1d_msac_decode_symbol_adapt_c(MsacContext *s, uint16_t *cdf,
|
||||
unsigned dav1d_msac_decode_bool_adapt_c(MsacContext *s, uint16_t *cdf);
|
||||
unsigned dav1d_msac_decode_bool_equi_c(MsacContext *s);
|
||||
unsigned dav1d_msac_decode_bool_c(MsacContext *s, unsigned f);
|
||||
unsigned dav1d_msac_decode_hi_tok_c(MsacContext *s, uint16_t *cdf);
|
||||
int dav1d_msac_decode_subexp(MsacContext *s, int ref, int n, unsigned k);
|
||||
|
||||
/* Supported n_symbols ranges: adapt4: 1-5, adapt8: 1-8, adapt16: 4-16 */
|
||||
/* Supported n_symbols ranges: adapt4: 1-4, adapt8: 1-7, adapt16: 3-15 */
|
||||
#ifndef dav1d_msac_decode_symbol_adapt4
|
||||
#define dav1d_msac_decode_symbol_adapt4 dav1d_msac_decode_symbol_adapt_c
|
||||
#endif
|
||||
@ -79,6 +81,9 @@ int dav1d_msac_decode_subexp(MsacContext *s, int ref, int n, unsigned k);
|
||||
#ifndef dav1d_msac_decode_bool
|
||||
#define dav1d_msac_decode_bool dav1d_msac_decode_bool_c
|
||||
#endif
|
||||
#ifndef dav1d_msac_decode_hi_tok
|
||||
#define dav1d_msac_decode_hi_tok dav1d_msac_decode_hi_tok_c
|
||||
#endif
|
||||
|
||||
static inline unsigned dav1d_msac_decode_bools(MsacContext *const s, unsigned n) {
|
||||
unsigned v = 0;
|
||||
|
54
third_party/dav1d/src/obu.c
vendored
54
third_party/dav1d/src/obu.c
vendored
@ -27,7 +27,6 @@
|
||||
|
||||
#include "config.h"
|
||||
|
||||
#include <assert.h>
|
||||
#include <errno.h>
|
||||
#include <limits.h>
|
||||
#include <stdio.h>
|
||||
@ -299,9 +298,10 @@ static int read_frame_size(Dav1dContext *const c, GetBits *const gb,
|
||||
Dav1dThreadPicture *const ref =
|
||||
&c->refs[c->frame_hdr->refidx[i]].p;
|
||||
if (!ref->p.data[0]) return -1;
|
||||
// FIXME render_* may be wrong
|
||||
hdr->render_width = hdr->width[1] = ref->p.p.w;
|
||||
hdr->render_height = hdr->height = ref->p.p.h;
|
||||
hdr->width[1] = ref->p.p.w;
|
||||
hdr->height = ref->p.p.h;
|
||||
hdr->render_width = ref->p.frame_hdr->render_width;
|
||||
hdr->render_height = ref->p.frame_hdr->render_height;
|
||||
hdr->super_res.enabled = seqhdr->super_res && dav1d_get_bits(gb, 1);
|
||||
if (hdr->super_res.enabled) {
|
||||
const int d = hdr->super_res.width_scale_denominator =
|
||||
@ -1275,8 +1275,10 @@ int dav1d_parse_obus(Dav1dContext *const c, Dav1dData *const in, int global) {
|
||||
c->frame_hdr_ref = dav1d_ref_create(sizeof(Dav1dFrameHeader));
|
||||
if (!c->frame_hdr_ref) return DAV1D_ERR(ENOMEM);
|
||||
}
|
||||
#ifndef NDEBUG
|
||||
// ensure that the reference is writable
|
||||
assert(dav1d_ref_is_writable(c->frame_hdr_ref));
|
||||
#endif
|
||||
c->frame_hdr = c->frame_hdr_ref->data;
|
||||
memset(c->frame_hdr, 0, sizeof(*c->frame_hdr));
|
||||
c->frame_hdr->temporal_id = temporal_id;
|
||||
@ -1364,10 +1366,12 @@ int dav1d_parse_obus(Dav1dContext *const c, Dav1dData *const in, int global) {
|
||||
case OBU_METADATA: {
|
||||
// obu metadata type field
|
||||
const enum ObuMetaType meta_type = dav1d_get_uleb128(&gb);
|
||||
const int meta_type_len = (dav1d_get_bits_pos(&gb) - init_bit_pos) >> 3;
|
||||
if (gb.error) goto error;
|
||||
Dav1dRef *ref;
|
||||
Dav1dContentLightLevel *content_light;
|
||||
Dav1dMasteringDisplay *mastering_display;
|
||||
Dav1dITUTT35 *itut_t35_metadata;
|
||||
|
||||
switch (meta_type) {
|
||||
case OBU_META_HDR_CLL:
|
||||
@ -1420,7 +1424,47 @@ int dav1d_parse_obus(Dav1dContext *const c, Dav1dData *const in, int global) {
|
||||
c->mastering_display_ref = ref;
|
||||
break;
|
||||
}
|
||||
case OBU_META_ITUT_T35:
|
||||
case OBU_META_ITUT_T35: {
|
||||
int payload_size = len;
|
||||
// Don't take into account all the trailing bits for payload_size
|
||||
while (payload_size > 0 && !in->data[init_byte_pos + payload_size - 1])
|
||||
payload_size--; // trailing_zero_bit x 8
|
||||
payload_size--; // trailing_one_bit + trailing_zero_bit x 7
|
||||
|
||||
// Don't take into account meta_type bytes
|
||||
payload_size -= meta_type_len;
|
||||
|
||||
int country_code_extension_byte = 0;
|
||||
int country_code = dav1d_get_bits(&gb, 8);
|
||||
payload_size--;
|
||||
if (country_code == 0xFF) {
|
||||
country_code_extension_byte = dav1d_get_bits(&gb, 8);
|
||||
payload_size--;
|
||||
}
|
||||
|
||||
if (payload_size <= 0) {
|
||||
dav1d_log(c, "Malformed ITU-T T.35 metadata message format\n");
|
||||
goto error;
|
||||
}
|
||||
|
||||
ref = dav1d_ref_create(sizeof(Dav1dITUTT35) + payload_size * sizeof(uint8_t));
|
||||
if (!ref) return DAV1D_ERR(ENOMEM);
|
||||
itut_t35_metadata = ref->data;
|
||||
|
||||
// We need our public headers to be C++ compatible, so payload can't be
|
||||
// a flexible array member
|
||||
itut_t35_metadata->payload = (uint8_t *) &itut_t35_metadata[1];
|
||||
itut_t35_metadata->country_code = country_code;
|
||||
itut_t35_metadata->country_code_extension_byte = country_code_extension_byte;
|
||||
for (int i = 0; i < payload_size; i++)
|
||||
itut_t35_metadata->payload[i] = dav1d_get_bits(&gb, 8);
|
||||
itut_t35_metadata->payload_size = payload_size;
|
||||
|
||||
dav1d_ref_dec(&c->itut_t35_ref);
|
||||
c->itut_t35 = itut_t35_metadata;
|
||||
c->itut_t35_ref = ref;
|
||||
break;
|
||||
}
|
||||
case OBU_META_SCALABILITY:
|
||||
case OBU_META_TIMECODE:
|
||||
// ignore metadata OBUs we don't care about
|
||||
|
14
third_party/dav1d/src/picture.c
vendored
14
third_party/dav1d/src/picture.c
vendored
@ -27,7 +27,6 @@
|
||||
|
||||
#include "config.h"
|
||||
|
||||
#include <assert.h>
|
||||
#include <errno.h>
|
||||
#include <stdint.h>
|
||||
#include <stdio.h>
|
||||
@ -104,6 +103,7 @@ static int picture_alloc_with_edges(Dav1dContext *const c, Dav1dPicture *const p
|
||||
Dav1dFrameHeader *frame_hdr, Dav1dRef *frame_hdr_ref,
|
||||
Dav1dContentLightLevel *content_light, Dav1dRef *content_light_ref,
|
||||
Dav1dMasteringDisplay *mastering_display, Dav1dRef *mastering_display_ref,
|
||||
Dav1dITUTT35 *itut_t35, Dav1dRef *itut_t35_ref,
|
||||
const int bpc, const Dav1dDataProps *props,
|
||||
Dav1dPicAllocator *const p_allocator,
|
||||
const size_t extra, void **const extra_ptr)
|
||||
@ -125,6 +125,7 @@ static int picture_alloc_with_edges(Dav1dContext *const c, Dav1dPicture *const p
|
||||
p->frame_hdr = frame_hdr;
|
||||
p->content_light = content_light;
|
||||
p->mastering_display = mastering_display;
|
||||
p->itut_t35 = itut_t35;
|
||||
p->p.layout = seq_hdr->layout;
|
||||
p->p.bpc = bpc;
|
||||
dav1d_data_props_set_defaults(&p->m);
|
||||
@ -161,6 +162,9 @@ static int picture_alloc_with_edges(Dav1dContext *const c, Dav1dPicture *const p
|
||||
p->mastering_display_ref = mastering_display_ref;
|
||||
if (mastering_display_ref) dav1d_ref_inc(mastering_display_ref);
|
||||
|
||||
p->itut_t35_ref = itut_t35_ref;
|
||||
if (itut_t35_ref) dav1d_ref_inc(itut_t35_ref);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
@ -176,11 +180,16 @@ int dav1d_thread_picture_alloc(Dav1dContext *const c, Dav1dFrameContext *const f
|
||||
f->frame_hdr, f->frame_hdr_ref,
|
||||
c->content_light, c->content_light_ref,
|
||||
c->mastering_display, c->mastering_display_ref,
|
||||
c->itut_t35, c->itut_t35_ref,
|
||||
bpc, &f->tile[0].data.m, &c->allocator,
|
||||
p->t != NULL ? sizeof(atomic_int) * 2 : 0,
|
||||
(void **) &p->progress);
|
||||
if (res) return res;
|
||||
|
||||
// Must be removed from the context after being attached to the frame
|
||||
dav1d_ref_dec(&c->itut_t35_ref);
|
||||
c->itut_t35 = NULL;
|
||||
|
||||
p->visible = f->frame_hdr->show_frame;
|
||||
if (p->t) {
|
||||
atomic_init(&p->progress[0], 0);
|
||||
@ -198,6 +207,7 @@ int dav1d_picture_alloc_copy(Dav1dContext *const c, Dav1dPicture *const dst, con
|
||||
src->frame_hdr, src->frame_hdr_ref,
|
||||
src->content_light, src->content_light_ref,
|
||||
src->mastering_display, src->mastering_display_ref,
|
||||
src->itut_t35, src->itut_t35_ref,
|
||||
src->p.bpc, &src->m, &pic_ctx->allocator,
|
||||
0, NULL);
|
||||
return res;
|
||||
@ -216,6 +226,7 @@ void dav1d_picture_ref(Dav1dPicture *const dst, const Dav1dPicture *const src) {
|
||||
if (src->m.user_data.ref) dav1d_ref_inc(src->m.user_data.ref);
|
||||
if (src->content_light_ref) dav1d_ref_inc(src->content_light_ref);
|
||||
if (src->mastering_display_ref) dav1d_ref_inc(src->mastering_display_ref);
|
||||
if (src->itut_t35_ref) dav1d_ref_inc(src->itut_t35_ref);
|
||||
}
|
||||
*dst = *src;
|
||||
}
|
||||
@ -252,6 +263,7 @@ void dav1d_picture_unref_internal(Dav1dPicture *const p) {
|
||||
dav1d_ref_dec(&p->m.user_data.ref);
|
||||
dav1d_ref_dec(&p->content_light_ref);
|
||||
dav1d_ref_dec(&p->mastering_display_ref);
|
||||
dav1d_ref_dec(&p->itut_t35_ref);
|
||||
}
|
||||
memset(p, 0, sizeof(*p));
|
||||
}
|
||||
|
488
third_party/dav1d/src/ppc/cdef_init_tmpl.c
vendored
Normal file
488
third_party/dav1d/src/ppc/cdef_init_tmpl.c
vendored
Normal file
@ -0,0 +1,488 @@
|
||||
/*
|
||||
* Copyright © 2019, Luca Barbato
|
||||
* All rights reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright notice, this
|
||||
* list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
* this list of conditions and the following disclaimer in the documentation
|
||||
* and/or other materials provided with the distribution.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
|
||||
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
|
||||
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
|
||||
* ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
|
||||
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
|
||||
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
|
||||
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
#include <stdlib.h>
|
||||
|
||||
#include "common/bitdepth.h"
|
||||
#include "common/intops.h"
|
||||
|
||||
#include "src/cdef.h"
|
||||
#include "src/cpu.h"
|
||||
|
||||
#include "src/ppc/types.h"
|
||||
|
||||
#if BITDEPTH == 8
|
||||
static inline i16x8 vconstrain(const i16x8 diff, const int16_t threshold,
|
||||
const int damping)
|
||||
{
|
||||
const i16x8 zero = vec_splat_s16(0);
|
||||
if (!threshold) return zero;
|
||||
const uint16_t shift = imax(0, damping - ulog2(threshold));
|
||||
const i16x8 abs_diff = vec_abs(diff);
|
||||
const b16x8 mask = vec_cmplt(diff, zero);
|
||||
const i16x8 thr = vec_splats(threshold);
|
||||
const i16x8 sub = vec_sub(thr, vec_sra(abs_diff, vec_splats(shift)));
|
||||
const i16x8 max = vec_max(zero, sub);
|
||||
const i16x8 min = vec_min(abs_diff, max);
|
||||
const i16x8 neg = vec_sub(zero, min);
|
||||
return vec_sel(min, neg, mask);
|
||||
}
|
||||
|
||||
static inline void copy4xN(uint16_t *tmp, const ptrdiff_t tmp_stride,
|
||||
const uint8_t *src, const ptrdiff_t src_stride,
|
||||
const uint8_t (*left)[2], uint8_t *const top[2],
|
||||
const int w, const int h,
|
||||
const enum CdefEdgeFlags edges)
|
||||
{
|
||||
const u16x8 fill = vec_splats((uint16_t)INT16_MAX);
|
||||
|
||||
u16x8 l0;
|
||||
u16x8 l1;
|
||||
|
||||
int y_start = -2, y_end = h + 2;
|
||||
|
||||
// Copy top and bottom first
|
||||
if (!(edges & CDEF_HAVE_TOP)) {
|
||||
l0 = fill;
|
||||
l1 = fill;
|
||||
y_start = 0;
|
||||
} else {
|
||||
l0 = u8h_to_u16(vec_vsx_ld(0, top[0] - 2));
|
||||
l1 = u8h_to_u16(vec_vsx_ld(0, top[1] - 2));
|
||||
}
|
||||
|
||||
vec_st(l0, 0, tmp - 2 * 8);
|
||||
vec_st(l1, 0, tmp - 1 * 8);
|
||||
|
||||
if (!(edges & CDEF_HAVE_BOTTOM)) {
|
||||
l0 = fill;
|
||||
l1 = fill;
|
||||
y_end -= 2;
|
||||
} else {
|
||||
l0 = u8h_to_u16(vec_vsx_ld(0, src - 2 + (h + 0) * src_stride));
|
||||
l1 = u8h_to_u16(vec_vsx_ld(0, src - 2 + (h + 1) * src_stride));
|
||||
}
|
||||
|
||||
vec_st(l0, 0, tmp + (h + 0) * 8);
|
||||
vec_st(l1, 0, tmp + (h + 1) * 8);
|
||||
|
||||
for (int y = 0; y < h; y++) {
|
||||
u16x8 l = u8h_to_u16(vec_vsx_ld(0, src - 2 + y * src_stride));
|
||||
vec_st(l, 0, tmp + y * 8);
|
||||
}
|
||||
|
||||
if (!(edges & CDEF_HAVE_LEFT)) {
|
||||
for (int y = y_start; y < y_end; y++) {
|
||||
tmp[y * 8] = INT16_MAX;
|
||||
tmp[1 + y * 8] = INT16_MAX;
|
||||
}
|
||||
} else {
|
||||
for (int y = 0; y < h; y++) {
|
||||
tmp[y * 8] = left[y][0];
|
||||
tmp[1 + y * 8] = left[y][1];
|
||||
}
|
||||
}
|
||||
if (!(edges & CDEF_HAVE_RIGHT)) {
|
||||
for (int y = y_start; y < y_end; y++) {
|
||||
tmp[- 2 + (y + 1) * 8] = INT16_MAX;
|
||||
tmp[- 1 + (y + 1) * 8] = INT16_MAX;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static inline void copy8xN(uint16_t *tmp, const ptrdiff_t tmp_stride,
|
||||
const uint8_t *src, const ptrdiff_t src_stride,
|
||||
const uint8_t (*left)[2], uint8_t *const top[2],
|
||||
const int w, const int h,
|
||||
const enum CdefEdgeFlags edges)
|
||||
{
|
||||
const u16x8 fill = vec_splats((uint16_t)INT16_MAX);
|
||||
|
||||
u16x8 l0h, l0l;
|
||||
u16x8 l1h, l1l;
|
||||
|
||||
int y_start = -2, y_end = h + 2;
|
||||
|
||||
// Copy top and bottom first
|
||||
if (!(edges & CDEF_HAVE_TOP)) {
|
||||
l0h = fill;
|
||||
l0l = fill;
|
||||
l1h = fill;
|
||||
l1l = fill;
|
||||
y_start = 0;
|
||||
} else {
|
||||
u8x16 l0 = vec_vsx_ld(0, top[0] - 2);
|
||||
u8x16 l1 = vec_vsx_ld(0, top[1] - 2);
|
||||
l0h = u8h_to_u16(l0);
|
||||
l0l = u8l_to_u16(l0);
|
||||
l1h = u8h_to_u16(l1);
|
||||
l1l = u8l_to_u16(l1);
|
||||
}
|
||||
|
||||
vec_st(l0h, 0, tmp - 4 * 8);
|
||||
vec_st(l0l, 0, tmp - 3 * 8);
|
||||
vec_st(l1h, 0, tmp - 2 * 8);
|
||||
vec_st(l1l, 0, tmp - 1 * 8);
|
||||
|
||||
if (!(edges & CDEF_HAVE_BOTTOM)) {
|
||||
l0h = fill;
|
||||
l0l = fill;
|
||||
l1h = fill;
|
||||
l1l = fill;
|
||||
y_end -= 2;
|
||||
} else {
|
||||
u8x16 l0 = vec_vsx_ld(0, src - 2 + (h + 0) * src_stride);
|
||||
u8x16 l1 = vec_vsx_ld(0, src - 2 + (h + 1) * src_stride);
|
||||
l0h = u8h_to_u16(l0);
|
||||
l0l = u8l_to_u16(l0);
|
||||
l1h = u8h_to_u16(l1);
|
||||
l1l = u8l_to_u16(l1);
|
||||
}
|
||||
|
||||
vec_st(l0h, 0, tmp + (h + 0) * 16);
|
||||
vec_st(l0l, 0, tmp + (h + 0) * 16 + 8);
|
||||
vec_st(l1h, 0, tmp + (h + 1) * 16);
|
||||
vec_st(l1l, 0, tmp + (h + 1) * 16 + 8);
|
||||
|
||||
for (int y = 0; y < h; y++) {
|
||||
u8x16 l = vec_vsx_ld(0, src - 2 + y * src_stride);
|
||||
u16x8 lh = u8h_to_u16(l);
|
||||
u16x8 ll = u8l_to_u16(l);
|
||||
vec_st(lh, 0, tmp + y * 16);
|
||||
vec_st(ll, 0, tmp + 8 + y * 16);
|
||||
}
|
||||
|
||||
if (!(edges & CDEF_HAVE_LEFT)) {
|
||||
for (int y = y_start; y < y_end; y++) {
|
||||
tmp[y * 16] = INT16_MAX;
|
||||
tmp[1 + y * 16] = INT16_MAX;
|
||||
}
|
||||
} else {
|
||||
for (int y = 0; y < h; y++) {
|
||||
tmp[y * 16] = left[y][0];
|
||||
tmp[1 + y * 16] = left[y][1];
|
||||
}
|
||||
}
|
||||
if (!(edges & CDEF_HAVE_RIGHT)) {
|
||||
for (int y = y_start; y < y_end; y++) {
|
||||
tmp[- 6 + (y + 1) * 16] = INT16_MAX;
|
||||
tmp[- 5 + (y + 1) * 16] = INT16_MAX;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/*
 * Lane-wise maximum of `a` and `b` that ignores lanes of `a` equal to
 * INT16_MAX: such lanes are replaced by the matching lane of `b` before the
 * maximum is taken, so they cannot raise the result.
 */
static inline i16x8 max_mask(i16x8 a, i16x8 b) {
    const b16x8 is_sentinel = vec_cmpeq(a, vec_splats((int16_t)INT16_MAX));
    const i16x8 candidate = vec_sel(a, b, is_sentinel);

    return vec_max(candidate, b);
}
|
||||
|
||||
/*
 * Building blocks for the filter loops below. These macros expand to bare
 * declarations/statements (no do/while wrapper) because later macros and the
 * caller read the variables they introduce: `px`, `max`, `min`, `sum`, and
 * the `<p>0..3`, `<p>_c0..3` families. They also read `tmp_stride`,
 * `damping` and `tap_even` from the expansion site.
 */

/* Load 8 centre pixels and reset the running max/min/sum for this row. */
#define LOAD_PIX(addr) \
    const i16x8 px = (i16x8)vec_vsx_ld(0, (addr)); \
    i16x8 max = px; \
    i16x8 min = px; \
    i16x8 sum = vec_splat_s16(0);

/* Same as LOAD_PIX, but `px` is assembled from one doubleword of each of two
 * consecutive rows (at `addr` and `addr + tmp_stride`). */
#define LOAD_PIX4(addr) \
    const i16x8 a = (i16x8)vec_vsx_ld(0, (addr)); \
    const i16x8 b = (i16x8)vec_vsx_ld(0, (addr) + tmp_stride); \
    const i16x8 px = vec_xxpermdi(a, b, 0); \
    i16x8 max = px; \
    i16x8 min = px; \
    i16x8 sum = vec_splat_s16(0);

/* Load the four taps at +o0, -o0, +o1, -o1 into <p>0 .. <p>3. */
#define LOAD_DIR(p, addr, o0, o1) \
    const i16x8 p ## 0 = (i16x8)vec_vsx_ld(0, (addr) + (o0)); \
    const i16x8 p ## 1 = (i16x8)vec_vsx_ld(0, (addr) - (o0)); \
    const i16x8 p ## 2 = (i16x8)vec_vsx_ld(0, (addr) + (o1)); \
    const i16x8 p ## 3 = (i16x8)vec_vsx_ld(0, (addr) - (o1));

/* Two-row variant of LOAD_DIR: taps for the rows at `addr` and
 * `addr + tmp_stride` are loaded separately (<p>a*, <p>b*) and then merged
 * with the same vec_xxpermdi selection that LOAD_PIX4 uses for `px`. */
#define LOAD_DIR4(p, addr, o0, o1) \
    LOAD_DIR(p ## a, addr, o0, o1) \
    LOAD_DIR(p ## b, (addr) + tmp_stride, o0, o1) \
    const i16x8 p ## 0 = vec_xxpermdi(p ## a ## 0, p ## b ## 0, 0); \
    const i16x8 p ## 1 = vec_xxpermdi(p ## a ## 1, p ## b ## 1, 0); \
    const i16x8 p ## 2 = vec_xxpermdi(p ## a ## 2, p ## b ## 2, 0); \
    const i16x8 p ## 3 = vec_xxpermdi(p ## a ## 3, p ## b ## 3, 0);

/* <p>_d* = tap - px; <p>_c* = vconstrain(<p>_d*, strength, damping). */
#define CONSTRAIN(p, strength) \
    const i16x8 p ## _d0 = vec_sub(p ## 0, px); \
    const i16x8 p ## _d1 = vec_sub(p ## 1, px); \
    const i16x8 p ## _d2 = vec_sub(p ## 2, px); \
    const i16x8 p ## _d3 = vec_sub(p ## 3, px); \
\
    i16x8 p ## _c0 = vconstrain(p ## _d0, (strength), damping); \
    i16x8 p ## _c1 = vconstrain(p ## _d1, (strength), damping); \
    i16x8 p ## _c2 = vconstrain(p ## _d2, (strength), damping); \
    i16x8 p ## _c3 = vconstrain(p ## _d3, (strength), damping);

/* Fold the four taps into the running min/max; max_mask() keeps INT16_MAX
 * taps from raising `max`. */
#define MIN_MAX(p) \
    max = max_mask(p ## 0, max); \
    min = vec_min(p ## 0, min); \
    max = max_mask(p ## 1, max); \
    min = vec_min(p ## 1, min); \
    max = max_mask(p ## 2, max); \
    min = vec_min(p ## 2, min); \
    max = max_mask(p ## 3, max); \
    min = vec_min(p ## 3, min);

/* Weight the +/-o0 taps: c * 2 + (c << tap_even). */
#define PRI_0(p) \
    p ## _c0 = vec_add(vec_sl(p ## _c0, vec_splat_u16(1)), vec_sl(p ## _c0, vec_splats(tap_even))); \
    p ## _c1 = vec_add(vec_sl(p ## _c1, vec_splat_u16(1)), vec_sl(p ## _c1, vec_splats(tap_even)));

/* Weight the +/-o1 taps: c * 4 - (c << tap_even). */
#define PRI_1(p) \
    p ## _c2 = vec_sub(vec_sl(p ## _c2, vec_splat_u16(2)), vec_sl(p ## _c2, vec_splats(tap_even))); \
    p ## _c3 = vec_sub(vec_sl(p ## _c3, vec_splat_u16(2)), vec_sl(p ## _c3, vec_splats(tap_even)));

/* Double all four constrained taps. */
#define SEC_0(p) \
    p ## _c0 = vec_sl(p ## _c0, vec_splat_u16(1)); \
    p ## _c1 = vec_sl(p ## _c1, vec_splat_u16(1)); \
    p ## _c2 = vec_sl(p ## _c2, vec_splat_u16(1)); \
    p ## _c3 = vec_sl(p ## _c3, vec_splat_u16(1));

/* Accumulate the four (already weighted) constrained taps into `sum`. */
#define UPDATE_SUM(p) \
    const i16x8 p ## sum0 = vec_add(p ## _c0, p ## _c1); \
    const i16x8 p ## sum1 = vec_add(p ## _c2, p ## _c3); \
    sum = vec_add(sum, p ## sum0); \
    sum = vec_add(sum, p ## sum1);
|
||||
|
||||
static inline void
|
||||
filter_4xN(pixel *dst, const ptrdiff_t dst_stride,
|
||||
const pixel (*left)[2], /*const*/ pixel *const top[2],
|
||||
const int w, const int h, const int pri_strength,
|
||||
const int sec_strength, const int dir,
|
||||
const int damping, const enum CdefEdgeFlags edges,
|
||||
const ptrdiff_t tmp_stride, uint16_t *tmp)
|
||||
{
|
||||
const int8_t cdef_directions[8 /* dir */][2 /* pass */] = {
|
||||
{ -1 * tmp_stride + 1, -2 * tmp_stride + 2 },
|
||||
{ 0 * tmp_stride + 1, -1 * tmp_stride + 2 },
|
||||
{ 0 * tmp_stride + 1, 0 * tmp_stride + 2 },
|
||||
{ 0 * tmp_stride + 1, 1 * tmp_stride + 2 },
|
||||
{ 1 * tmp_stride + 1, 2 * tmp_stride + 2 },
|
||||
{ 1 * tmp_stride + 0, 2 * tmp_stride + 1 },
|
||||
{ 1 * tmp_stride + 0, 2 * tmp_stride + 0 },
|
||||
{ 1 * tmp_stride + 0, 2 * tmp_stride - 1 }
|
||||
};
|
||||
const int bitdepth_min_8 = bitdepth_from_max(bitdepth_max) - 8;
|
||||
const uint16_t tap_even = !((pri_strength >> bitdepth_min_8) & 1);
|
||||
const int off1 = cdef_directions[dir][0];
|
||||
const int off1_1 = cdef_directions[dir][1];
|
||||
|
||||
const int off2 = cdef_directions[(dir + 2) & 7][0];
|
||||
const int off3 = cdef_directions[(dir + 6) & 7][0];
|
||||
|
||||
const int off2_1 = cdef_directions[(dir + 2) & 7][1];
|
||||
const int off3_1 = cdef_directions[(dir + 6) & 7][1];
|
||||
|
||||
|
||||
copy4xN(tmp - 2, tmp_stride, dst, dst_stride, left, top, w, h, edges);
|
||||
for (int y = 0; y < h / 2; y++) {
|
||||
LOAD_PIX4(tmp)
|
||||
|
||||
// Primary pass
|
||||
LOAD_DIR4(p, tmp, off1, off1_1)
|
||||
|
||||
CONSTRAIN(p, pri_strength)
|
||||
|
||||
MIN_MAX(p)
|
||||
|
||||
PRI_0(p)
|
||||
PRI_1(p)
|
||||
|
||||
UPDATE_SUM(p)
|
||||
|
||||
// Secondary pass 1
|
||||
LOAD_DIR4(s, tmp, off2, off3)
|
||||
|
||||
CONSTRAIN(s, sec_strength)
|
||||
|
||||
MIN_MAX(s)
|
||||
|
||||
SEC_0(s)
|
||||
|
||||
UPDATE_SUM(s)
|
||||
|
||||
// Secondary pass 2
|
||||
LOAD_DIR4(s2, tmp, off2_1, off3_1)
|
||||
|
||||
CONSTRAIN(s2, sec_strength)
|
||||
|
||||
MIN_MAX(s2)
|
||||
|
||||
UPDATE_SUM(s2)
|
||||
|
||||
// Store
|
||||
i16x8 bias = vec_and((i16x8)vec_cmplt(sum, vec_splat_s16(0)), vec_splat_s16(1));
|
||||
bias = vec_sub(vec_splat_s16(8), bias);
|
||||
i16x8 unclamped = vec_add(px, vec_sra(vec_add(sum, bias), vec_splat_u16(4)));
|
||||
i16x8 vdst = vec_max(vec_min(unclamped, max), min);
|
||||
|
||||
dst[0] = vdst[0];
|
||||
dst[1] = vdst[1];
|
||||
dst[2] = vdst[2];
|
||||
dst[3] = vdst[3];
|
||||
|
||||
tmp += tmp_stride;
|
||||
dst += PXSTRIDE(dst_stride);
|
||||
dst[0] = vdst[4];
|
||||
dst[1] = vdst[5];
|
||||
dst[2] = vdst[6];
|
||||
dst[3] = vdst[7];
|
||||
|
||||
tmp += tmp_stride;
|
||||
dst += PXSTRIDE(dst_stride);
|
||||
}
|
||||
}
|
||||
|
||||
static inline void
|
||||
filter_8xN(pixel *dst, const ptrdiff_t dst_stride,
|
||||
const pixel (*left)[2], /*const*/ pixel *const top[2],
|
||||
const int w, const int h, const int pri_strength,
|
||||
const int sec_strength, const int dir,
|
||||
const int damping, const enum CdefEdgeFlags edges,
|
||||
const ptrdiff_t tmp_stride, uint16_t *tmp)
|
||||
{
|
||||
const int8_t cdef_directions[8 /* dir */][2 /* pass */] = {
|
||||
{ -1 * tmp_stride + 1, -2 * tmp_stride + 2 },
|
||||
{ 0 * tmp_stride + 1, -1 * tmp_stride + 2 },
|
||||
{ 0 * tmp_stride + 1, 0 * tmp_stride + 2 },
|
||||
{ 0 * tmp_stride + 1, 1 * tmp_stride + 2 },
|
||||
{ 1 * tmp_stride + 1, 2 * tmp_stride + 2 },
|
||||
{ 1 * tmp_stride + 0, 2 * tmp_stride + 1 },
|
||||
{ 1 * tmp_stride + 0, 2 * tmp_stride + 0 },
|
||||
{ 1 * tmp_stride + 0, 2 * tmp_stride - 1 }
|
||||
};
|
||||
const int bitdepth_min_8 = bitdepth_from_max(bitdepth_max) - 8;
|
||||
|
||||
|
||||
const uint16_t tap_even = !((pri_strength >> bitdepth_min_8) & 1);
|
||||
const int off1 = cdef_directions[dir][0];
|
||||
const int off1_1 = cdef_directions[dir][1];
|
||||
|
||||
const int off2 = cdef_directions[(dir + 2) & 7][0];
|
||||
const int off3 = cdef_directions[(dir + 6) & 7][0];
|
||||
|
||||
const int off2_1 = cdef_directions[(dir + 2) & 7][1];
|
||||
const int off3_1 = cdef_directions[(dir + 6) & 7][1];
|
||||
|
||||
copy8xN(tmp - 2, tmp_stride, dst, dst_stride, left, top, w, h, edges);
|
||||
|
||||
for (int y = 0; y < h; y++) {
|
||||
LOAD_PIX(tmp)
|
||||
|
||||
// Primary pass
|
||||
LOAD_DIR(p, tmp, off1, off1_1)
|
||||
|
||||
CONSTRAIN(p, pri_strength)
|
||||
|
||||
MIN_MAX(p)
|
||||
|
||||
PRI_0(p)
|
||||
PRI_1(p)
|
||||
|
||||
UPDATE_SUM(p)
|
||||
|
||||
// Secondary pass 1
|
||||
LOAD_DIR(s, tmp, off2, off3)
|
||||
|
||||
CONSTRAIN(s, sec_strength)
|
||||
|
||||
MIN_MAX(s)
|
||||
|
||||
SEC_0(s)
|
||||
|
||||
UPDATE_SUM(s)
|
||||
|
||||
// Secondary pass 2
|
||||
LOAD_DIR(s2, tmp, off2_1, off3_1)
|
||||
|
||||
CONSTRAIN(s2, sec_strength)
|
||||
|
||||
MIN_MAX(s2)
|
||||
|
||||
UPDATE_SUM(s2)
|
||||
|
||||
// Store
|
||||
i16x8 bias = vec_and((i16x8)vec_cmplt(sum, vec_splat_s16(0)), vec_splat_s16(1));
|
||||
bias = vec_sub(vec_splat_s16(8), bias);
|
||||
i16x8 unclamped = vec_add(px, vec_sra(vec_add(sum, bias), vec_splat_u16(4)));
|
||||
i16x8 vdst = vec_max(vec_min(unclamped, max), min);
|
||||
|
||||
dst[0] = vdst[0];
|
||||
dst[1] = vdst[1];
|
||||
dst[2] = vdst[2];
|
||||
dst[3] = vdst[3];
|
||||
dst[4] = vdst[4];
|
||||
dst[5] = vdst[5];
|
||||
dst[6] = vdst[6];
|
||||
dst[7] = vdst[7];
|
||||
|
||||
tmp += tmp_stride;
|
||||
dst += PXSTRIDE(dst_stride);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
||||
#define cdef_fn(w, h, tmp_stride) \
|
||||
static void cdef_filter_##w##x##h##_vsx(pixel *const dst, \
|
||||
const ptrdiff_t dst_stride, \
|
||||
const pixel (*left)[2], \
|
||||
/*const*/ pixel *const top[2], \
|
||||
const int pri_strength, \
|
||||
const int sec_strength, \
|
||||
const int dir, \
|
||||
const int damping, \
|
||||
const enum CdefEdgeFlags edges) \
|
||||
{ \
|
||||
ALIGN_STK_16(uint16_t, tmp_buf, 12 * tmp_stride,); \
|
||||
uint16_t *tmp = tmp_buf + 2 * tmp_stride + 2; \
|
||||
filter_##w##xN(dst, dst_stride, left, top, w, h, pri_strength, sec_strength, \
|
||||
dir, damping, edges, tmp_stride, tmp); \
|
||||
}
|
||||
|
||||
cdef_fn(4, 4, 8);
|
||||
cdef_fn(4, 8, 8);
|
||||
cdef_fn(8, 8, 16);
|
||||
#endif
|
||||
|
||||
/*
 * Install the VSX CDEF filter functions into `c` when the running CPU
 * reports VSX support. Only the 8-bit build has VSX implementations; for
 * other bit depths (and on CPUs without VSX) `c` is left untouched.
 */
COLD void bitfn(dav1d_cdef_dsp_init_ppc)(Dav1dCdefDSPContext *const c) {
    const unsigned cpu_flags = dav1d_get_cpu_flags();

    if ((cpu_flags & DAV1D_PPC_CPU_FLAG_VSX) == 0) {
        return;
    }

#if BITDEPTH == 8
    /* No VSX direction search is installed here; c->dir is not modified. */
    // c->dir = dav1d_cdef_find_dir_vsx;
    c->fb[2] = cdef_filter_4x4_vsx;
    c->fb[1] = cdef_filter_4x8_vsx;
    c->fb[0] = cdef_filter_8x8_vsx;
#endif
}
|
52
third_party/dav1d/src/ppc/types.h
vendored
Normal file
52
third_party/dav1d/src/ppc/types.h
vendored
Normal file
@ -0,0 +1,52 @@
|
||||
/*
|
||||
* Copyright © 2019, VideoLAN and dav1d authors
|
||||
* Copyright © 2019, Luca Barbato
|
||||
* All rights reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright notice, this
|
||||
* list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
* this list of conditions and the following disclaimer in the documentation
|
||||
* and/or other materials provided with the distribution.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
|
||||
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
|
||||
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
|
||||
* ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
|
||||
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
|
||||
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
|
||||
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
#ifndef DAV1D_SRC_PPC_TYPES_H
#define DAV1D_SRC_PPC_TYPES_H

#include <altivec.h>
/* NOTE(review): presumably <altivec.h> can define `pixel` as a macro, which
 * would clash with the project's own `pixel` type name - confirm against the
 * toolchain headers. */
#undef pixel

/* Short aliases for the 128-bit vector types: <kind><lane bits>x<lanes>,
 * where kind is u (unsigned), i (signed) or b (bool/mask). */
#define u8x16 vector unsigned char
#define i8x16 vector signed char
#define b8x16 vector bool char
#define u16x8 vector unsigned short
#define i16x8 vector signed short
#define b16x8 vector bool short
#define u32x4 vector unsigned int
#define i32x4 vector signed int
#define b32x4 vector bool int
#define u64x2 vector unsigned long long
#define i64x2 vector signed long long
#define b64x2 vector bool long long

/* Widen one half of a vector by interleaving its elements with zeros
 * (vec_mergeh / vec_mergel) and reinterpreting the result at twice the lane
 * width. The argument is parenthesized so that an expression argument is
 * cast as a whole.
 * NOTE(review): the zero lands in the upper half of each widened lane only
 * for little-endian element order - confirm for the supported targets. */
#define u8h_to_u16(v) ((u16x8) vec_mergeh((u8x16) (v), vec_splat_u8(0)))
#define u8l_to_u16(v) ((u16x8) vec_mergel((u8x16) (v), vec_splat_u8(0)))
#define u16h_to_i32(v) ((i32x4) vec_mergeh((u16x8) (v), vec_splat_u16(0)))
#define u16l_to_i32(v) ((i32x4) vec_mergel((u16x8) (v), vec_splat_u16(0)))

#endif /* DAV1D_SRC_PPC_TYPES_H */
|
586
third_party/dav1d/src/recon_tmpl.c
vendored
586
third_party/dav1d/src/recon_tmpl.c
vendored
@ -46,16 +46,278 @@
|
||||
#include "src/tables.h"
|
||||
#include "src/wedge.h"
|
||||
|
||||
/*
 * Read one Golomb-style value from the arithmetic decoder.
 *
 * Counts equiprobable zero bits until the first one bit (capped at 32), then
 * reads that many further bits below an implicit leading 1 and returns the
 * assembled value minus one.
 */
static inline unsigned read_golomb(MsacContext *const msac) {
    int len = 0;
    unsigned val = 1;

    /* Prefix: number of suffix bits to follow. */
    while (!dav1d_msac_decode_bool_equi(msac) && len < 32) len++;
    /* Suffix: shift each decoded bit in below the leading 1. */
    while (len--) val = (val << 1) + dav1d_msac_decode_bool_equi(msac);

    return val - 1;
}
|
||||
|
||||
static inline unsigned get_skip_ctx(const TxfmInfo *const t_dim,
|
||||
const enum BlockSize bs,
|
||||
const uint8_t *const a,
|
||||
const uint8_t *const l,
|
||||
const int chroma,
|
||||
const enum Dav1dPixelLayout layout)
|
||||
{
|
||||
const uint8_t *const b_dim = dav1d_block_dimensions[bs];
|
||||
|
||||
if (chroma) {
|
||||
const int ss_ver = layout == DAV1D_PIXEL_LAYOUT_I420;
|
||||
const int ss_hor = layout != DAV1D_PIXEL_LAYOUT_I444;
|
||||
const int not_one_blk = b_dim[2] - (!!b_dim[2] && ss_hor) > t_dim->lw ||
|
||||
b_dim[3] - (!!b_dim[3] && ss_ver) > t_dim->lh;
|
||||
int ca, cl;
|
||||
|
||||
#define MERGE_CTX(dir, type, mask) \
|
||||
c##dir = !!((*(const type *) dir) & mask); \
|
||||
break
|
||||
|
||||
switch (t_dim->lw) {
|
||||
/* For some reason the MSVC CRT _wassert() function is not flagged as
|
||||
* __declspec(noreturn), so when using those headers the compiler will
|
||||
* expect execution to continue after an assertion has been triggered
|
||||
* and will therefore complain about the use of uninitialized variables
|
||||
* when compiled in debug mode if we put the default case at the end. */
|
||||
default: assert(0); /* fall-through */
|
||||
case TX_4X4: MERGE_CTX(a, uint8_t, 0x3F);
|
||||
case TX_8X8: MERGE_CTX(a, uint16_t, 0x3F3F);
|
||||
case TX_16X16: MERGE_CTX(a, uint32_t, 0x3F3F3F3FU);
|
||||
case TX_32X32: MERGE_CTX(a, uint64_t, 0x3F3F3F3F3F3F3F3FULL);
|
||||
}
|
||||
switch (t_dim->lh) {
|
||||
default: assert(0); /* fall-through */
|
||||
case TX_4X4: MERGE_CTX(l, uint8_t, 0x3F);
|
||||
case TX_8X8: MERGE_CTX(l, uint16_t, 0x3F3F);
|
||||
case TX_16X16: MERGE_CTX(l, uint32_t, 0x3F3F3F3FU);
|
||||
case TX_32X32: MERGE_CTX(l, uint64_t, 0x3F3F3F3F3F3F3F3FULL);
|
||||
}
|
||||
#undef MERGE_CTX
|
||||
|
||||
return 7 + not_one_blk * 3 + ca + cl;
|
||||
} else if (b_dim[2] == t_dim->lw && b_dim[3] == t_dim->lh) {
|
||||
return 0;
|
||||
} else {
|
||||
unsigned la, ll;
|
||||
|
||||
#define MERGE_CTX(dir, type, tx) \
|
||||
if (tx == TX_64X64) { \
|
||||
uint64_t tmp = *(const uint64_t *) dir; \
|
||||
tmp |= *(const uint64_t *) &dir[8]; \
|
||||
l##dir = (unsigned) (tmp >> 32) | (unsigned) tmp; \
|
||||
} else \
|
||||
l##dir = *(const type *) dir; \
|
||||
if (tx == TX_32X32) l##dir |= *(const type *) &dir[sizeof(type)]; \
|
||||
if (tx >= TX_16X16) l##dir |= l##dir >> 16; \
|
||||
if (tx >= TX_8X8) l##dir |= l##dir >> 8; \
|
||||
break
|
||||
|
||||
switch (t_dim->lw) {
|
||||
default: assert(0); /* fall-through */
|
||||
case TX_4X4: MERGE_CTX(a, uint8_t, TX_4X4);
|
||||
case TX_8X8: MERGE_CTX(a, uint16_t, TX_8X8);
|
||||
case TX_16X16: MERGE_CTX(a, uint32_t, TX_16X16);
|
||||
case TX_32X32: MERGE_CTX(a, uint32_t, TX_32X32);
|
||||
case TX_64X64: MERGE_CTX(a, uint32_t, TX_64X64);
|
||||
}
|
||||
switch (t_dim->lh) {
|
||||
default: assert(0); /* fall-through */
|
||||
case TX_4X4: MERGE_CTX(l, uint8_t, TX_4X4);
|
||||
case TX_8X8: MERGE_CTX(l, uint16_t, TX_8X8);
|
||||
case TX_16X16: MERGE_CTX(l, uint32_t, TX_16X16);
|
||||
case TX_32X32: MERGE_CTX(l, uint32_t, TX_32X32);
|
||||
case TX_64X64: MERGE_CTX(l, uint32_t, TX_64X64);
|
||||
}
|
||||
#undef MERGE_CTX
|
||||
|
||||
return dav1d_skip_ctx[umin(la & 0x3F, 4)][umin(ll & 0x3F, 4)];
|
||||
}
|
||||
}
|
||||
|
||||
static inline unsigned get_dc_sign_ctx(const int /*enum RectTxfmSize*/ tx,
|
||||
const uint8_t *const a,
|
||||
const uint8_t *const l)
|
||||
{
|
||||
uint64_t mask = 0xC0C0C0C0C0C0C0C0ULL, mul = 0x0101010101010101ULL;
|
||||
int s;
|
||||
|
||||
#if ARCH_X86_64 && defined(__GNUC__)
|
||||
/* Coerce compilers into producing better code. For some reason
|
||||
* every x86-64 compiler is awful at handling 64-bit constants. */
|
||||
__asm__("" : "+r"(mask), "+r"(mul));
|
||||
#endif
|
||||
|
||||
switch(tx) {
|
||||
default: assert(0); /* fall-through */
|
||||
case TX_4X4: {
|
||||
int t = *(const uint8_t *) a >> 6;
|
||||
t += *(const uint8_t *) l >> 6;
|
||||
s = t - 1 - 1;
|
||||
break;
|
||||
}
|
||||
case TX_8X8: {
|
||||
uint32_t t = *(const uint16_t *) a & (uint32_t) mask;
|
||||
t += *(const uint16_t *) l & (uint32_t) mask;
|
||||
t *= 0x04040404U;
|
||||
s = (int) (t >> 24) - 2 - 2;
|
||||
break;
|
||||
}
|
||||
case TX_16X16: {
|
||||
uint32_t t = (*(const uint32_t *) a & (uint32_t) mask) >> 6;
|
||||
t += (*(const uint32_t *) l & (uint32_t) mask) >> 6;
|
||||
t *= (uint32_t) mul;
|
||||
s = (int) (t >> 24) - 4 - 4;
|
||||
break;
|
||||
}
|
||||
case TX_32X32: {
|
||||
uint64_t t = (*(const uint64_t *) a & mask) >> 6;
|
||||
t += (*(const uint64_t *) l & mask) >> 6;
|
||||
t *= mul;
|
||||
s = (int) (t >> 56) - 8 - 8;
|
||||
break;
|
||||
}
|
||||
case TX_64X64: {
|
||||
uint64_t t = (*(const uint64_t *) &a[0] & mask) >> 6;
|
||||
t += (*(const uint64_t *) &a[8] & mask) >> 6;
|
||||
t += (*(const uint64_t *) &l[0] & mask) >> 6;
|
||||
t += (*(const uint64_t *) &l[8] & mask) >> 6;
|
||||
t *= mul;
|
||||
s = (int) (t >> 56) - 16 - 16;
|
||||
break;
|
||||
}
|
||||
case RTX_4X8: {
|
||||
uint32_t t = *(const uint8_t *) a & (uint32_t) mask;
|
||||
t += *(const uint16_t *) l & (uint32_t) mask;
|
||||
t *= 0x04040404U;
|
||||
s = (int) (t >> 24) - 1 - 2;
|
||||
break;
|
||||
}
|
||||
case RTX_8X4: {
|
||||
uint32_t t = *(const uint16_t *) a & (uint32_t) mask;
|
||||
t += *(const uint8_t *) l & (uint32_t) mask;
|
||||
t *= 0x04040404U;
|
||||
s = (int) (t >> 24) - 2 - 1;
|
||||
break;
|
||||
}
|
||||
case RTX_8X16: {
|
||||
uint32_t t = *(const uint16_t *) a & (uint32_t) mask;
|
||||
t += *(const uint32_t *) l & (uint32_t) mask;
|
||||
t = (t >> 6) * (uint32_t) mul;
|
||||
s = (int) (t >> 24) - 2 - 4;
|
||||
break;
|
||||
}
|
||||
case RTX_16X8: {
|
||||
uint32_t t = *(const uint32_t *) a & (uint32_t) mask;
|
||||
t += *(const uint16_t *) l & (uint32_t) mask;
|
||||
t = (t >> 6) * (uint32_t) mul;
|
||||
s = (int) (t >> 24) - 4 - 2;
|
||||
break;
|
||||
}
|
||||
case RTX_16X32: {
|
||||
uint64_t t = *(const uint32_t *) a & (uint32_t) mask;
|
||||
t += *(const uint64_t *) l & mask;
|
||||
t = (t >> 6) * mul;
|
||||
s = (int) (t >> 56) - 4 - 8;
|
||||
break;
|
||||
}
|
||||
case RTX_32X16: {
|
||||
uint64_t t = *(const uint64_t *) a & mask;
|
||||
t += *(const uint32_t *) l & (uint32_t) mask;
|
||||
t = (t >> 6) * mul;
|
||||
s = (int) (t >> 56) - 8 - 4;
|
||||
break;
|
||||
}
|
||||
case RTX_32X64: {
|
||||
uint64_t t = (*(const uint64_t *) &a[0] & mask) >> 6;
|
||||
t += (*(const uint64_t *) &l[0] & mask) >> 6;
|
||||
t += (*(const uint64_t *) &l[8] & mask) >> 6;
|
||||
t *= mul;
|
||||
s = (int) (t >> 56) - 8 - 16;
|
||||
break;
|
||||
}
|
||||
case RTX_64X32: {
|
||||
uint64_t t = (*(const uint64_t *) &a[0] & mask) >> 6;
|
||||
t += (*(const uint64_t *) &a[8] & mask) >> 6;
|
||||
t += (*(const uint64_t *) &l[0] & mask) >> 6;
|
||||
t *= mul;
|
||||
s = (int) (t >> 56) - 16 - 8;
|
||||
break;
|
||||
}
|
||||
case RTX_4X16: {
|
||||
uint32_t t = *(const uint8_t *) a & (uint32_t) mask;
|
||||
t += *(const uint32_t *) l & (uint32_t) mask;
|
||||
t = (t >> 6) * (uint32_t) mul;
|
||||
s = (int) (t >> 24) - 1 - 4;
|
||||
break;
|
||||
}
|
||||
case RTX_16X4: {
|
||||
uint32_t t = *(const uint32_t *) a & (uint32_t) mask;
|
||||
t += *(const uint8_t *) l & (uint32_t) mask;
|
||||
t = (t >> 6) * (uint32_t) mul;
|
||||
s = (int) (t >> 24) - 4 - 1;
|
||||
break;
|
||||
}
|
||||
case RTX_8X32: {
|
||||
uint64_t t = *(const uint16_t *) a & (uint32_t) mask;
|
||||
t += *(const uint64_t *) l & mask;
|
||||
t = (t >> 6) * mul;
|
||||
s = (int) (t >> 56) - 2 - 8;
|
||||
break;
|
||||
}
|
||||
case RTX_32X8: {
|
||||
uint64_t t = *(const uint64_t *) a & mask;
|
||||
t += *(const uint16_t *) l & (uint32_t) mask;
|
||||
t = (t >> 6) * mul;
|
||||
s = (int) (t >> 56) - 8 - 2;
|
||||
break;
|
||||
}
|
||||
case RTX_16X64: {
|
||||
uint64_t t = *(const uint32_t *) a & (uint32_t) mask;
|
||||
t += *(const uint64_t *) &l[0] & mask;
|
||||
t = (t >> 6) + ((*(const uint64_t *) &l[8] & mask) >> 6);
|
||||
t *= mul;
|
||||
s = (int) (t >> 56) - 4 - 16;
|
||||
break;
|
||||
}
|
||||
case RTX_64X16: {
|
||||
uint64_t t = *(const uint64_t *) &a[0] & mask;
|
||||
t += *(const uint32_t *) l & (uint32_t) mask;
|
||||
t = (t >> 6) + ((*(const uint64_t *) &a[8] & mask) >> 6);
|
||||
t *= mul;
|
||||
s = (int) (t >> 56) - 16 - 4;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
return (s != 0) + (s > 0);
|
||||
}
|
||||
|
||||
static inline unsigned get_lo_ctx(const uint8_t *const levels,
|
||||
const enum TxClass tx_class,
|
||||
unsigned *const hi_mag,
|
||||
const uint8_t (*const ctx_offsets)[5],
|
||||
const unsigned x, const unsigned y,
|
||||
const ptrdiff_t stride)
|
||||
{
|
||||
unsigned mag = levels[0 * stride + 1] + levels[1 * stride + 0];
|
||||
unsigned offset;
|
||||
if (tx_class == TX_CLASS_2D) {
|
||||
mag += levels[1 * stride + 1];
|
||||
*hi_mag = mag;
|
||||
mag += levels[0 * stride + 2] + levels[2 * stride + 0];
|
||||
offset = ctx_offsets[umin(y, 4)][umin(x, 4)];
|
||||
} else {
|
||||
mag += levels[0 * stride + 2];
|
||||
*hi_mag = mag;
|
||||
mag += levels[0 * stride + 3] + levels[0 * stride + 4];
|
||||
offset = 26 + (y > 1 ? 10 : y * 5);
|
||||
}
|
||||
return offset + (mag > 512 ? 4 : (mag + 64) >> 7);
|
||||
}
|
||||
|
||||
static int decode_coefs(Dav1dTileContext *const t,
|
||||
uint8_t *const a, uint8_t *const l,
|
||||
const enum RectTxfmSize tx, const enum BlockSize bs,
|
||||
@ -66,6 +328,7 @@ static int decode_coefs(Dav1dTileContext *const t,
|
||||
Dav1dTileState *const ts = t->ts;
|
||||
const int chroma = !!plane;
|
||||
const Dav1dFrameContext *const f = t->f;
|
||||
const int lossless = f->frame_hdr->segmentation.lossless[b->seg_id];
|
||||
const TxfmInfo *const t_dim = &dav1d_txfm_dimensions[tx];
|
||||
const int dbg = DEBUG_BLOCK_INFO && plane && 0;
|
||||
|
||||
@ -73,7 +336,7 @@ static int decode_coefs(Dav1dTileContext *const t,
|
||||
printf("Start: r=%d\n", ts->msac.rng);
|
||||
|
||||
// does this block have any non-zero coefficients
|
||||
const int sctx = get_coef_skip_ctx(t_dim, bs, a, l, chroma, f->cur.p.layout);
|
||||
const int sctx = get_skip_ctx(t_dim, bs, a, l, chroma, f->cur.p.layout);
|
||||
const int all_skip = dav1d_msac_decode_bool_adapt(&ts->msac,
|
||||
ts->cdf.coef.skip[t_dim->ctx][sctx]);
|
||||
if (dbg)
|
||||
@ -81,41 +344,56 @@ static int decode_coefs(Dav1dTileContext *const t,
|
||||
t_dim->ctx, sctx, all_skip, ts->msac.rng);
|
||||
if (all_skip) {
|
||||
*res_ctx = 0x40;
|
||||
*txtp = f->frame_hdr->segmentation.lossless[b->seg_id] ? WHT_WHT : DCT_DCT;
|
||||
*txtp = lossless * WHT_WHT; /* lossless ? WHT_WHT : DCT_DCT */
|
||||
return -1;
|
||||
}
|
||||
|
||||
// transform type (chroma: derived, luma: explicitly coded)
|
||||
if (chroma) {
|
||||
if (intra) {
|
||||
*txtp = get_uv_intra_txtp(b->uv_mode, tx, f->frame_hdr, b->seg_id);
|
||||
} else {
|
||||
const enum TxfmType y_txtp = *txtp;
|
||||
*txtp = get_uv_inter_txtp(t_dim, y_txtp, f->frame_hdr, b->seg_id);
|
||||
}
|
||||
if (lossless) {
|
||||
assert(t_dim->max == TX_4X4);
|
||||
*txtp = WHT_WHT;
|
||||
} else if (!f->frame_hdr->segmentation.qidx[b->seg_id] ||
|
||||
t_dim->max + intra >= TX_64X64)
|
||||
{
|
||||
*txtp = DCT_DCT;
|
||||
} else if (chroma) {
|
||||
*txtp = intra ? dav1d_txtp_from_uvmode[b->uv_mode] :
|
||||
get_uv_inter_txtp(t_dim, *txtp);
|
||||
} else {
|
||||
const enum TxfmTypeSet set = get_ext_txtp_set(tx, !intra,
|
||||
f->frame_hdr, b->seg_id);
|
||||
const unsigned set_cnt = dav1d_tx_type_count[set];
|
||||
unsigned idx;
|
||||
if (set_cnt == 1) {
|
||||
idx = 0;
|
||||
} else {
|
||||
const int set_idx = dav1d_tx_type_set_index[!intra][set];
|
||||
const enum IntraPredMode y_mode_nofilt = intra ? b->y_mode == FILTER_PRED ?
|
||||
dav1d_filter_mode_to_y_mode[b->y_angle] : b->y_mode : 0;
|
||||
uint16_t *const txtp_cdf = intra ?
|
||||
ts->cdf.m.txtp_intra[set_idx][t_dim->min][y_mode_nofilt] :
|
||||
ts->cdf.m.txtp_inter[set_idx][t_dim->min];
|
||||
idx = (set_cnt <= 8 ? dav1d_msac_decode_symbol_adapt8 :
|
||||
dav1d_msac_decode_symbol_adapt16)(&ts->msac, txtp_cdf, set_cnt);
|
||||
|
||||
if (intra) {
|
||||
const enum IntraPredMode y_mode_nofilt = b->y_mode == FILTER_PRED ?
|
||||
dav1d_filter_mode_to_y_mode[b->y_angle] : b->y_mode;
|
||||
if (f->frame_hdr->reduced_txtp_set || t_dim->min == TX_16X16) {
|
||||
idx = dav1d_msac_decode_symbol_adapt4(&ts->msac,
|
||||
ts->cdf.m.txtp_intra2[t_dim->min][y_mode_nofilt], 4);
|
||||
*txtp = dav1d_tx_types_per_set[idx + 0];
|
||||
} else {
|
||||
idx = dav1d_msac_decode_symbol_adapt8(&ts->msac,
|
||||
ts->cdf.m.txtp_intra1[t_dim->min][y_mode_nofilt], 6);
|
||||
*txtp = dav1d_tx_types_per_set[idx + 5];
|
||||
}
|
||||
if (dbg)
|
||||
printf("Post-txtp[%d->%d][%d->%d][%d][%d->%d]: r=%d\n",
|
||||
set, set_idx, tx, t_dim->min, intra ? (int)y_mode_nofilt : -1,
|
||||
idx, dav1d_tx_types_per_set[set][idx], ts->msac.rng);
|
||||
printf("Post-txtp-intra[%d->%d][%d][%d->%d]: r=%d\n",
|
||||
tx, t_dim->min, y_mode_nofilt, idx, *txtp, ts->msac.rng);
|
||||
} else {
|
||||
if (f->frame_hdr->reduced_txtp_set || t_dim->max == TX_32X32) {
|
||||
idx = dav1d_msac_decode_bool_adapt(&ts->msac,
|
||||
ts->cdf.m.txtp_inter3[t_dim->min]);
|
||||
*txtp = (idx - 1) & IDTX; /* idx ? DCT_DCT : IDTX */
|
||||
} else if (t_dim->min == TX_16X16) {
|
||||
idx = dav1d_msac_decode_symbol_adapt16(&ts->msac,
|
||||
ts->cdf.m.txtp_inter2, 11);
|
||||
*txtp = dav1d_tx_types_per_set[idx + 12];
|
||||
} else {
|
||||
idx = dav1d_msac_decode_symbol_adapt16(&ts->msac,
|
||||
ts->cdf.m.txtp_inter1[t_dim->min], 15);
|
||||
*txtp = dav1d_tx_types_per_set[idx + 24];
|
||||
}
|
||||
if (dbg)
|
||||
printf("Post-txtp-inter[%d->%d][%d->%d]: r=%d\n",
|
||||
tx, t_dim->min, idx, *txtp, ts->msac.rng);
|
||||
}
|
||||
*txtp = dav1d_tx_types_per_set[set][idx];
|
||||
}
|
||||
|
||||
// find end-of-block (eob)
|
||||
@ -124,19 +402,19 @@ static int decode_coefs(Dav1dTileContext *const t,
|
||||
const enum TxClass tx_class = dav1d_tx_type_class[*txtp];
|
||||
const int is_1d = tx_class != TX_CLASS_2D;
|
||||
switch (tx2dszctx) {
|
||||
#define case_sz(sz, bin, ns) \
|
||||
#define case_sz(sz, bin, ns, is_1d) \
|
||||
case sz: { \
|
||||
uint16_t *const eob_bin_cdf = ts->cdf.coef.eob_bin_##bin[chroma][is_1d]; \
|
||||
eob_bin = dav1d_msac_decode_symbol_adapt##ns(&ts->msac, eob_bin_cdf, 5 + sz); \
|
||||
uint16_t *const eob_bin_cdf = ts->cdf.coef.eob_bin_##bin[chroma]is_1d; \
|
||||
eob_bin = dav1d_msac_decode_symbol_adapt##ns(&ts->msac, eob_bin_cdf, 4 + sz); \
|
||||
break; \
|
||||
}
|
||||
case_sz(0, 16, 4);
|
||||
case_sz(1, 32, 8);
|
||||
case_sz(2, 64, 8);
|
||||
case_sz(3, 128, 8);
|
||||
case_sz(4, 256, 16);
|
||||
case_sz(5, 512, 16);
|
||||
case_sz(6, 1024, 16);
|
||||
case_sz(0, 16, 4, [is_1d]);
|
||||
case_sz(1, 32, 8, [is_1d]);
|
||||
case_sz(2, 64, 8, [is_1d]);
|
||||
case_sz(3, 128, 8, [is_1d]);
|
||||
case_sz(4, 256, 16, [is_1d]);
|
||||
case_sz(5, 512, 16, );
|
||||
case_sz(6, 1024, 16, );
|
||||
#undef case_sz
|
||||
}
|
||||
if (dbg)
|
||||
@ -159,122 +437,134 @@ static int decode_coefs(Dav1dTileContext *const t,
|
||||
}
|
||||
|
||||
// base tokens
|
||||
uint16_t (*const br_cdf)[5] =
|
||||
ts->cdf.coef.br_tok[imin(t_dim->ctx, 3)][chroma];
|
||||
const int16_t *const scan = dav1d_scans[tx][tx_class];
|
||||
uint16_t (*const eob_cdf)[4] = ts->cdf.coef.eob_base_tok[t_dim->ctx][chroma];
|
||||
uint16_t (*const hi_cdf)[4] = ts->cdf.coef.br_tok[imin(t_dim->ctx, 3)][chroma];
|
||||
const uint16_t *const scan = dav1d_scans[tx][tx_class];
|
||||
int dc_tok;
|
||||
|
||||
if (eob) {
|
||||
uint8_t *const levels = t->scratch.levels;
|
||||
uint16_t (*const lo_cdf)[4] = ts->cdf.coef.base_tok[t_dim->ctx][chroma];
|
||||
uint8_t *const levels = t->scratch.levels; // bits 0-5: tok, 6-7: lo_tok
|
||||
const int sw = imin(t_dim->w, 8), sh = imin(t_dim->h, 8);
|
||||
const ptrdiff_t stride = 4 * (sh + 1);
|
||||
memset(levels, 0, stride * 4 * (sw + 1));
|
||||
const int shift = 2 + imin(t_dim->lh, 3), mask = 4 * sh - 1;
|
||||
const unsigned shift = 2 + imin(t_dim->lh, 3), mask = 4 * sh - 1;
|
||||
|
||||
{ // eob
|
||||
const int rc = scan[eob], x = rc >> shift, y = rc & mask;
|
||||
/* eob */
|
||||
unsigned rc = scan[eob], x = rc >> shift, y = rc & mask;
|
||||
unsigned ctx = 1 + (eob > sw * sh * 2) + (eob > sw * sh * 4);
|
||||
int eob_tok = dav1d_msac_decode_symbol_adapt4(&ts->msac, eob_cdf[ctx], 2);
|
||||
int tok = eob_tok + 1;
|
||||
int level_tok = tok * 0x41;
|
||||
unsigned mag;
|
||||
if (dbg)
|
||||
printf("Post-lo_tok[%d][%d][%d][%d=%d=%d]: r=%d\n",
|
||||
t_dim->ctx, chroma, ctx, eob, rc, tok, ts->msac.rng);
|
||||
|
||||
const int ctx = 1 + (eob > sw * sh * 2) + (eob > sw * sh * 4);
|
||||
uint16_t *const lo_cdf = ts->cdf.coef.eob_base_tok[t_dim->ctx][chroma][ctx];
|
||||
int tok = dav1d_msac_decode_symbol_adapt4(&ts->msac, lo_cdf, 3) + 1;
|
||||
if (dbg)
|
||||
printf("Post-lo_tok[%d][%d][%d][%d=%d=%d]: r=%d\n",
|
||||
t_dim->ctx, chroma, ctx, eob, rc, tok, ts->msac.rng);
|
||||
#define DECODE_COEFS_CLASS(tx_class) \
|
||||
if (eob_tok == 2) { \
|
||||
ctx = (tx_class == TX_CLASS_2D ? (x | y) > 1 : \
|
||||
tx_class == TX_CLASS_H ? x != 0 : y != 0) ? 14 : 7; \
|
||||
tok = dav1d_msac_decode_hi_tok(&ts->msac, hi_cdf[ctx]); \
|
||||
level_tok = tok + (3 << 6); \
|
||||
if (dbg) \
|
||||
printf("Post-hi_tok[%d][%d][%d][%d=%d=%d]: r=%d\n", \
|
||||
imin(t_dim->ctx, 3), chroma, ctx, eob, rc, tok, \
|
||||
ts->msac.rng); \
|
||||
} \
|
||||
cf[rc] = tok; \
|
||||
if (tx_class == TX_CLASS_H) \
|
||||
/* Transposing reduces the stride and padding requirements */ \
|
||||
levels[y * stride + x] = (uint8_t) level_tok; \
|
||||
else \
|
||||
levels[x * stride + y] = (uint8_t) level_tok; \
|
||||
for (int i = eob - 1; i > 0; i--) { /* ac */ \
|
||||
if (tx_class == TX_CLASS_H) \
|
||||
rc = i, x = rc & mask, y = rc >> shift; \
|
||||
else \
|
||||
rc = scan[i], x = rc >> shift, y = rc & mask; \
|
||||
assert(x < 32 && y < 32); \
|
||||
uint8_t *const level = levels + x * stride + y; \
|
||||
ctx = get_lo_ctx(level, tx_class, &mag, lo_ctx_offsets, x, y, stride); \
|
||||
if (tx_class == TX_CLASS_2D) \
|
||||
y |= x; \
|
||||
tok = dav1d_msac_decode_symbol_adapt4(&ts->msac, lo_cdf[ctx], 3); \
|
||||
level_tok = tok * 0x41; \
|
||||
if (dbg) \
|
||||
printf("Post-lo_tok[%d][%d][%d][%d=%d=%d]: r=%d\n", \
|
||||
t_dim->ctx, chroma, ctx, i, rc, tok, ts->msac.rng); \
|
||||
if (tok == 3) { \
|
||||
mag &= 63; \
|
||||
ctx = (y > (tx_class == TX_CLASS_2D) ? 14 : 7) + \
|
||||
(mag > 12 ? 6 : (mag + 1) >> 1); \
|
||||
tok = dav1d_msac_decode_hi_tok(&ts->msac, hi_cdf[ctx]); \
|
||||
level_tok = tok + (3 << 6); \
|
||||
if (dbg) \
|
||||
printf("Post-hi_tok[%d][%d][%d][%d=%d=%d]: r=%d\n", \
|
||||
imin(t_dim->ctx, 3), chroma, ctx, i, rc, tok, \
|
||||
ts->msac.rng); \
|
||||
} \
|
||||
cf[rc] = tok; \
|
||||
*level = (uint8_t) level_tok; \
|
||||
} \
|
||||
/* dc */ \
|
||||
ctx = (tx_class == TX_CLASS_2D) ? 0 : \
|
||||
get_lo_ctx(levels, tx_class, &mag, lo_ctx_offsets, 0, 0, stride); \
|
||||
dc_tok = dav1d_msac_decode_symbol_adapt4(&ts->msac, lo_cdf[ctx], 3); \
|
||||
if (dbg) \
|
||||
printf("Post-dc_lo_tok[%d][%d][%d][%d]: r=%d\n", \
|
||||
t_dim->ctx, chroma, ctx, dc_tok, ts->msac.rng); \
|
||||
if (dc_tok == 3) { \
|
||||
if (tx_class == TX_CLASS_2D) \
|
||||
mag = levels[0 * stride + 1] + levels[1 * stride + 0] + \
|
||||
levels[1 * stride + 1]; \
|
||||
mag &= 63; \
|
||||
ctx = mag > 12 ? 6 : (mag + 1) >> 1; \
|
||||
dc_tok = dav1d_msac_decode_hi_tok(&ts->msac, hi_cdf[ctx]); \
|
||||
if (dbg) \
|
||||
printf("Post-dc_hi_tok[%d][%d][0][%d]: r=%d\n", \
|
||||
imin(t_dim->ctx, 3), chroma, dc_tok, ts->msac.rng); \
|
||||
} \
|
||||
break
|
||||
|
||||
if (tok == 3) {
|
||||
const int br_ctx = get_br_ctx(levels, 1, tx_class, x, y, stride);
|
||||
do {
|
||||
const int tok_br = dav1d_msac_decode_symbol_adapt4(&ts->msac,
|
||||
br_cdf[br_ctx], 4);
|
||||
if (dbg)
|
||||
printf("Post-hi_tok[%d][%d][%d][%d=%d=%d->%d]: r=%d\n",
|
||||
imin(t_dim->ctx, 3), chroma, br_ctx,
|
||||
eob, rc, tok_br, tok, ts->msac.rng);
|
||||
tok += tok_br;
|
||||
if (tok_br < 3) break;
|
||||
} while (tok < 15);
|
||||
}
|
||||
|
||||
cf[rc] = tok;
|
||||
levels[x * stride + y] = (uint8_t) tok;
|
||||
switch (tx_class) {
|
||||
case TX_CLASS_2D: {
|
||||
const unsigned nonsquare_tx = tx >= RTX_4X8;
|
||||
const uint8_t (*const lo_ctx_offsets)[5] =
|
||||
dav1d_lo_ctx_offsets[nonsquare_tx + (tx & nonsquare_tx)];
|
||||
const ptrdiff_t stride = 4 * sh;
|
||||
memset(levels, 0, stride * (4 * sw + 2));
|
||||
DECODE_COEFS_CLASS(TX_CLASS_2D);
|
||||
}
|
||||
for (int i = eob - 1; i > 0; i--) { // ac
|
||||
const int rc = scan[i], x = rc >> shift, y = rc & mask;
|
||||
|
||||
// lo tok
|
||||
const int ctx = get_coef_nz_ctx(levels, tx, tx_class, x, y, stride);
|
||||
uint16_t *const lo_cdf = ts->cdf.coef.base_tok[t_dim->ctx][chroma][ctx];
|
||||
int tok = dav1d_msac_decode_symbol_adapt4(&ts->msac, lo_cdf, 4);
|
||||
if (dbg)
|
||||
printf("Post-lo_tok[%d][%d][%d][%d=%d=%d]: r=%d\n",
|
||||
t_dim->ctx, chroma, ctx, i, rc, tok, ts->msac.rng);
|
||||
|
||||
// hi tok
|
||||
if (tok == 3) {
|
||||
const int br_ctx = get_br_ctx(levels, 1, tx_class, x, y, stride);
|
||||
do {
|
||||
const int tok_br = dav1d_msac_decode_symbol_adapt4(&ts->msac,
|
||||
br_cdf[br_ctx], 4);
|
||||
if (dbg)
|
||||
printf("Post-hi_tok[%d][%d][%d][%d=%d=%d->%d]: r=%d\n",
|
||||
imin(t_dim->ctx, 3), chroma, br_ctx,
|
||||
i, rc, tok_br, tok, ts->msac.rng);
|
||||
tok += tok_br;
|
||||
if (tok_br < 3) break;
|
||||
} while (tok < 15);
|
||||
}
|
||||
|
||||
cf[rc] = tok;
|
||||
levels[x * stride + y] = (uint8_t) tok;
|
||||
case TX_CLASS_H: {
|
||||
#define lo_ctx_offsets NULL
|
||||
const ptrdiff_t stride = 16;
|
||||
memset(levels, 0, stride * (4 * sh + 2));
|
||||
DECODE_COEFS_CLASS(TX_CLASS_H);
|
||||
}
|
||||
{ // dc
|
||||
int ctx = 0;
|
||||
if (tx_class != TX_CLASS_2D)
|
||||
ctx = get_coef_nz_ctx(levels, tx, tx_class, 0, 0, stride);
|
||||
uint16_t *const lo_cdf = ts->cdf.coef.base_tok[t_dim->ctx][chroma][ctx];
|
||||
dc_tok = dav1d_msac_decode_symbol_adapt4(&ts->msac, lo_cdf, 4);
|
||||
if (dbg)
|
||||
printf("Post-dc_lo_tok[%d][%d][%d][%d]: r=%d\n",
|
||||
t_dim->ctx, chroma, ctx, dc_tok, ts->msac.rng);
|
||||
|
||||
if (dc_tok == 3) {
|
||||
const int br_ctx = get_br_ctx(levels, 0, tx_class, 0, 0, stride);
|
||||
do {
|
||||
const int tok_br = dav1d_msac_decode_symbol_adapt4(&ts->msac,
|
||||
br_cdf[br_ctx], 4);
|
||||
if (dbg)
|
||||
printf("Post-dc_hi_tok[%d][%d][%d][%d->%d]: r=%d\n",
|
||||
imin(t_dim->ctx, 3), chroma, br_ctx,
|
||||
tok_br, dc_tok, ts->msac.rng);
|
||||
dc_tok += tok_br;
|
||||
if (tok_br < 3) break;
|
||||
} while (dc_tok < 15);
|
||||
}
|
||||
case TX_CLASS_V: {
|
||||
const ptrdiff_t stride = 16;
|
||||
memset(levels, 0, stride * (4 * sw + 2));
|
||||
DECODE_COEFS_CLASS(TX_CLASS_V);
|
||||
}
|
||||
#undef lo_ctx_offsets
|
||||
#undef DECODE_COEFS_CLASS
|
||||
default: assert(0);
|
||||
}
|
||||
} else { // dc-only
|
||||
uint16_t *const lo_cdf = ts->cdf.coef.eob_base_tok[t_dim->ctx][chroma][0];
|
||||
dc_tok = dav1d_msac_decode_symbol_adapt4(&ts->msac, lo_cdf, 3) + 1;
|
||||
int tok_br = dav1d_msac_decode_symbol_adapt4(&ts->msac, eob_cdf[0], 2);
|
||||
dc_tok = 1 + tok_br;
|
||||
if (dbg)
|
||||
printf("Post-dc_lo_tok[%d][%d][%d][%d]: r=%d\n",
|
||||
t_dim->ctx, chroma, 0, dc_tok, ts->msac.rng);
|
||||
|
||||
if (dc_tok == 3) {
|
||||
do {
|
||||
const int tok_br = dav1d_msac_decode_symbol_adapt4(&ts->msac,
|
||||
br_cdf[0], 4);
|
||||
if (dbg)
|
||||
printf("Post-dc_hi_tok[%d][%d][%d][%d->%d]: r=%d\n",
|
||||
imin(t_dim->ctx, 3), chroma, 0,
|
||||
tok_br, dc_tok, ts->msac.rng);
|
||||
dc_tok += tok_br;
|
||||
if (tok_br < 3) break;
|
||||
} while (dc_tok < 15);
|
||||
if (tok_br == 2) {
|
||||
dc_tok = dav1d_msac_decode_hi_tok(&ts->msac, hi_cdf[0]);
|
||||
if (dbg)
|
||||
printf("Post-dc_hi_tok[%d][%d][0][%d]: r=%d\n",
|
||||
imin(t_dim->ctx, 3), chroma, dc_tok, ts->msac.rng);
|
||||
}
|
||||
}
|
||||
|
||||
// residual and sign
|
||||
int dc_sign = 1 << 6;
|
||||
const int lossless = f->frame_hdr->segmentation.lossless[b->seg_id];
|
||||
const uint16_t *const dq_tbl = ts->dq[b->seg_id][plane];
|
||||
const uint8_t *const qm_tbl = f->qm[lossless || is_1d || *txtp == IDTX][tx][plane];
|
||||
const int dq_shift = imax(0, t_dim->ctx - 2);
|
||||
@ -283,7 +573,7 @@ static int decode_coefs(Dav1dTileContext *const t,
|
||||
unsigned cul_level = 0;
|
||||
|
||||
if (dc_tok) { // dc
|
||||
const int dc_sign_ctx = get_dc_sign_ctx(t_dim, a, l);
|
||||
const int dc_sign_ctx = get_dc_sign_ctx(tx, a, l);
|
||||
uint16_t *const dc_sign_cdf =
|
||||
ts->cdf.coef.dc_sign[chroma][dc_sign_ctx];
|
||||
const int sign = dav1d_msac_decode_bool_adapt(&ts->msac, dc_sign_cdf);
|
||||
@ -335,7 +625,7 @@ static int decode_coefs(Dav1dTileContext *const t,
|
||||
}
|
||||
|
||||
// context
|
||||
*res_ctx = imin(cul_level, 63) | dc_sign;
|
||||
*res_ctx = umin(cul_level, 63) | dc_sign;
|
||||
|
||||
return eob;
|
||||
}
|
||||
@ -782,15 +1072,17 @@ static int warp_affine(Dav1dTileContext *const t,
|
||||
// luma pixel units
|
||||
const int src_x = t->bx * 4 + ((x + 4) << ss_hor);
|
||||
const int src_y = t->by * 4 + ((y + 4) << ss_ver);
|
||||
const int mvx = (mat[2] * src_x + mat[3] * src_y + mat[0]) >> ss_hor;
|
||||
const int mvy = (mat[4] * src_x + mat[5] * src_y + mat[1]) >> ss_ver;
|
||||
const int64_t mvx = ((int64_t) mat[2] * src_x +
|
||||
(int64_t) mat[3] * src_y + mat[0]) >> ss_hor;
|
||||
const int64_t mvy = ((int64_t) mat[4] * src_x +
|
||||
(int64_t) mat[5] * src_y + mat[1]) >> ss_ver;
|
||||
|
||||
const int dx = (mvx >> 16) - 4;
|
||||
const int mx = ((mvx & 0xffff) - wmp->alpha * 4 -
|
||||
wmp->beta * 7) & ~0x3f;
|
||||
const int dy = (mvy >> 16) - 4;
|
||||
const int my = ((mvy & 0xffff) - wmp->gamma * 4 -
|
||||
wmp->delta * 4) & ~0x3f;
|
||||
const int dx = (int) (mvx >> 16) - 4;
|
||||
const int mx = (((int) mvx & 0xffff) - wmp->alpha * 4 -
|
||||
wmp->beta * 7) & ~0x3f;
|
||||
const int dy = (int) (mvy >> 16) - 4;
|
||||
const int my = (((int) mvy & 0xffff) - wmp->gamma * 4 -
|
||||
wmp->delta * 4) & ~0x3f;
|
||||
|
||||
const pixel *ref_ptr;
|
||||
ptrdiff_t ref_stride = refp->p.stride[!!pl];
|
||||
|
1
third_party/dav1d/src/ref_mvs.c
vendored
1
third_party/dav1d/src/ref_mvs.c
vendored
@ -47,7 +47,6 @@
|
||||
|
||||
#include "config.h"
|
||||
|
||||
#include <assert.h>
|
||||
#include <errno.h>
|
||||
#include <limits.h>
|
||||
#include <stddef.h>
|
||||
|
150
third_party/dav1d/src/scan.c
vendored
150
third_party/dav1d/src/scan.c
vendored
@ -30,25 +30,19 @@
|
||||
#include "common/attributes.h"
|
||||
#include "src/scan.h"
|
||||
|
||||
static const int16_t ALIGN(av1_default_scan_4x4[], 32) = {
|
||||
static const uint16_t ALIGN(av1_default_scan_4x4[], 32) = {
|
||||
0, 4, 1, 2,
|
||||
5, 8, 12, 9,
|
||||
6, 3, 7, 10,
|
||||
13, 14, 11, 15,
|
||||
};
|
||||
static const int16_t ALIGN(av1_mrow_scan_4x4[], 32) = {
|
||||
static const uint16_t ALIGN(av1_mrow_scan_4x4[], 32) = {
|
||||
0, 4, 8, 12,
|
||||
1, 5, 9, 13,
|
||||
2, 6, 10, 14,
|
||||
3, 7, 11, 15,
|
||||
};
|
||||
static const int16_t ALIGN(av1_mcol_scan_4x4[], 32) = {
|
||||
0, 1, 2, 3,
|
||||
4, 5, 6, 7,
|
||||
8, 9, 10, 11,
|
||||
12, 13, 14, 15,
|
||||
};
|
||||
static const int16_t ALIGN(av1_default_scan_4x8[], 32) = {
|
||||
static const uint16_t ALIGN(av1_default_scan_4x8[], 32) = {
|
||||
0, 8, 1, 16,
|
||||
9, 2, 24, 17,
|
||||
10, 3, 25, 18,
|
||||
@ -58,7 +52,7 @@ static const int16_t ALIGN(av1_default_scan_4x8[], 32) = {
|
||||
14, 7, 29, 22,
|
||||
15, 30, 23, 31,
|
||||
};
|
||||
static const int16_t ALIGN(av1_mrow_scan_4x8[], 32) = {
|
||||
static const uint16_t ALIGN(av1_mrow_scan_4x8[], 32) = {
|
||||
0, 8, 16, 24,
|
||||
1, 9, 17, 25,
|
||||
2, 10, 18, 26,
|
||||
@ -68,17 +62,7 @@ static const int16_t ALIGN(av1_mrow_scan_4x8[], 32) = {
|
||||
6, 14, 22, 30,
|
||||
7, 15, 23, 31,
|
||||
};
|
||||
static const int16_t ALIGN(av1_mcol_scan_4x8[], 32) = {
|
||||
0, 1, 2, 3,
|
||||
4, 5, 6, 7,
|
||||
8, 9, 10, 11,
|
||||
12, 13, 14, 15,
|
||||
16, 17, 18, 19,
|
||||
20, 21, 22, 23,
|
||||
24, 25, 26, 27,
|
||||
28, 29, 30, 31,
|
||||
};
|
||||
static const int16_t ALIGN(av1_default_scan_4x16[], 32) = {
|
||||
static const uint16_t ALIGN(av1_default_scan_4x16[], 32) = {
|
||||
0, 16, 1, 32,
|
||||
17, 2, 48, 33,
|
||||
18, 3, 49, 34,
|
||||
@ -96,7 +80,7 @@ static const int16_t ALIGN(av1_default_scan_4x16[], 32) = {
|
||||
30, 15, 61, 46,
|
||||
31, 62, 47, 63,
|
||||
};
|
||||
static const int16_t ALIGN(av1_mrow_scan_4x16[], 32) = {
|
||||
static const uint16_t ALIGN(av1_mrow_scan_4x16[], 32) = {
|
||||
0, 16, 32, 48,
|
||||
1, 17, 33, 49,
|
||||
2, 18, 34, 50,
|
||||
@ -114,43 +98,19 @@ static const int16_t ALIGN(av1_mrow_scan_4x16[], 32) = {
|
||||
14, 30, 46, 62,
|
||||
15, 31, 47, 63,
|
||||
};
|
||||
static const int16_t ALIGN(av1_mcol_scan_4x16[], 32) = {
|
||||
0, 1, 2, 3,
|
||||
4, 5, 6, 7,
|
||||
8, 9, 10, 11,
|
||||
12, 13, 14, 15,
|
||||
16, 17, 18, 19,
|
||||
20, 21, 22, 23,
|
||||
24, 25, 26, 27,
|
||||
28, 29, 30, 31,
|
||||
32, 33, 34, 35,
|
||||
36, 37, 38, 39,
|
||||
40, 41, 42, 43,
|
||||
44, 45, 46, 47,
|
||||
48, 49, 50, 51,
|
||||
52, 53, 54, 55,
|
||||
56, 57, 58, 59,
|
||||
60, 61, 62, 63,
|
||||
};
|
||||
static const int16_t ALIGN(av1_default_scan_8x4[], 32) = {
|
||||
static const uint16_t ALIGN(av1_default_scan_8x4[], 32) = {
|
||||
0, 1, 4, 2, 5, 8, 3, 6,
|
||||
9, 12, 7, 10, 13, 16, 11, 14,
|
||||
17, 20, 15, 18, 21, 24, 19, 22,
|
||||
25, 28, 23, 26, 29, 27, 30, 31,
|
||||
};
|
||||
static const int16_t ALIGN(av1_mrow_scan_8x4[], 32) = {
|
||||
static const uint16_t ALIGN(av1_mrow_scan_8x4[], 32) = {
|
||||
0, 4, 8, 12, 16, 20, 24, 28,
|
||||
1, 5, 9, 13, 17, 21, 25, 29,
|
||||
2, 6, 10, 14, 18, 22, 26, 30,
|
||||
3, 7, 11, 15, 19, 23, 27, 31,
|
||||
};
|
||||
static const int16_t ALIGN(av1_mcol_scan_8x4[], 32) = {
|
||||
0, 1, 2, 3, 4, 5, 6, 7,
|
||||
8, 9, 10, 11, 12, 13, 14, 15,
|
||||
16, 17, 18, 19, 20, 21, 22, 23,
|
||||
24, 25, 26, 27, 28, 29, 30, 31,
|
||||
};
|
||||
static const int16_t ALIGN(av1_default_scan_8x8[], 32) = {
|
||||
static const uint16_t ALIGN(av1_default_scan_8x8[], 32) = {
|
||||
0, 8, 1, 2, 9, 16, 24, 17,
|
||||
10, 3, 4, 11, 18, 25, 32, 40,
|
||||
33, 26, 19, 12, 5, 6, 13, 20,
|
||||
@ -160,7 +120,7 @@ static const int16_t ALIGN(av1_default_scan_8x8[], 32) = {
|
||||
23, 31, 38, 45, 52, 59, 60, 53,
|
||||
46, 39, 47, 54, 61, 62, 55, 63,
|
||||
};
|
||||
static const int16_t ALIGN(av1_mrow_scan_8x8[], 32) = {
|
||||
static const uint16_t ALIGN(av1_mrow_scan_8x8[], 32) = {
|
||||
0, 8, 16, 24, 32, 40, 48, 56,
|
||||
1, 9, 17, 25, 33, 41, 49, 57,
|
||||
2, 10, 18, 26, 34, 42, 50, 58,
|
||||
@ -170,17 +130,7 @@ static const int16_t ALIGN(av1_mrow_scan_8x8[], 32) = {
|
||||
6, 14, 22, 30, 38, 46, 54, 62,
|
||||
7, 15, 23, 31, 39, 47, 55, 63,
|
||||
};
|
||||
static const int16_t ALIGN(av1_mcol_scan_8x8[], 32) = {
|
||||
0, 1, 2, 3, 4, 5, 6, 7,
|
||||
8, 9, 10, 11, 12, 13, 14, 15,
|
||||
16, 17, 18, 19, 20, 21, 22, 23,
|
||||
24, 25, 26, 27, 28, 29, 30, 31,
|
||||
32, 33, 34, 35, 36, 37, 38, 39,
|
||||
40, 41, 42, 43, 44, 45, 46, 47,
|
||||
48, 49, 50, 51, 52, 53, 54, 55,
|
||||
56, 57, 58, 59, 60, 61, 62, 63,
|
||||
};
|
||||
static const int16_t ALIGN(av1_default_scan_8x16[], 32) = {
|
||||
static const uint16_t ALIGN(av1_default_scan_8x16[], 32) = {
|
||||
0, 16, 1, 32, 17, 2, 48, 33,
|
||||
18, 3, 64, 49, 34, 19, 4, 80,
|
||||
65, 50, 35, 20, 5, 96, 81, 66,
|
||||
@ -198,7 +148,7 @@ static const int16_t ALIGN(av1_default_scan_8x16[], 32) = {
|
||||
47, 123, 108, 93, 78, 63, 124, 109,
|
||||
94, 79, 125, 110, 95, 126, 111, 127,
|
||||
};
|
||||
static const int16_t ALIGN(av1_mrow_scan_8x16[], 32) = {
|
||||
static const uint16_t ALIGN(av1_mrow_scan_8x16[], 32) = {
|
||||
0, 16, 32, 48, 64, 80, 96, 112,
|
||||
1, 17, 33, 49, 65, 81, 97, 113,
|
||||
2, 18, 34, 50, 66, 82, 98, 114,
|
||||
@ -216,25 +166,7 @@ static const int16_t ALIGN(av1_mrow_scan_8x16[], 32) = {
|
||||
14, 30, 46, 62, 78, 94, 110, 126,
|
||||
15, 31, 47, 63, 79, 95, 111, 127,
|
||||
};
|
||||
static const int16_t ALIGN(av1_mcol_scan_8x16[], 32) = {
|
||||
0, 1, 2, 3, 4, 5, 6, 7,
|
||||
8, 9, 10, 11, 12, 13, 14, 15,
|
||||
16, 17, 18, 19, 20, 21, 22, 23,
|
||||
24, 25, 26, 27, 28, 29, 30, 31,
|
||||
32, 33, 34, 35, 36, 37, 38, 39,
|
||||
40, 41, 42, 43, 44, 45, 46, 47,
|
||||
48, 49, 50, 51, 52, 53, 54, 55,
|
||||
56, 57, 58, 59, 60, 61, 62, 63,
|
||||
64, 65, 66, 67, 68, 69, 70, 71,
|
||||
72, 73, 74, 75, 76, 77, 78, 79,
|
||||
80, 81, 82, 83, 84, 85, 86, 87,
|
||||
88, 89, 90, 91, 92, 93, 94, 95,
|
||||
96, 97, 98, 99, 100, 101, 102, 103,
|
||||
104, 105, 106, 107, 108, 109, 110, 111,
|
||||
112, 113, 114, 115, 116, 117, 118, 119,
|
||||
120, 121, 122, 123, 124, 125, 126, 127,
|
||||
};
|
||||
static const int16_t ALIGN(av1_default_scan_8x32[], 32) = {
|
||||
static const uint16_t ALIGN(av1_default_scan_8x32[], 32) = {
|
||||
0, 32, 1, 64, 33, 2, 96, 65,
|
||||
34, 3, 128, 97, 66, 35, 4, 160,
|
||||
129, 98, 67, 36, 5, 192, 161, 130,
|
||||
@ -268,25 +200,19 @@ static const int16_t ALIGN(av1_default_scan_8x32[], 32) = {
|
||||
95, 251, 220, 189, 158, 127, 252, 221,
|
||||
190, 159, 253, 222, 191, 254, 223, 255,
|
||||
};
|
||||
static const int16_t ALIGN(av1_default_scan_16x4[], 32) = {
|
||||
static const uint16_t ALIGN(av1_default_scan_16x4[], 32) = {
|
||||
0, 1, 4, 2, 5, 8, 3, 6, 9, 12, 7, 10, 13, 16, 11, 14,
|
||||
17, 20, 15, 18, 21, 24, 19, 22, 25, 28, 23, 26, 29, 32, 27, 30,
|
||||
33, 36, 31, 34, 37, 40, 35, 38, 41, 44, 39, 42, 45, 48, 43, 46,
|
||||
49, 52, 47, 50, 53, 56, 51, 54, 57, 60, 55, 58, 61, 59, 62, 63,
|
||||
};
|
||||
static const int16_t ALIGN(av1_mrow_scan_16x4[], 32) = {
|
||||
static const uint16_t ALIGN(av1_mrow_scan_16x4[], 32) = {
|
||||
0, 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60,
|
||||
1, 5, 9, 13, 17, 21, 25, 29, 33, 37, 41, 45, 49, 53, 57, 61,
|
||||
2, 6, 10, 14, 18, 22, 26, 30, 34, 38, 42, 46, 50, 54, 58, 62,
|
||||
3, 7, 11, 15, 19, 23, 27, 31, 35, 39, 43, 47, 51, 55, 59, 63,
|
||||
};
|
||||
static const int16_t ALIGN(av1_mcol_scan_16x4[], 32) = {
|
||||
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
|
||||
16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
|
||||
32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
|
||||
48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63,
|
||||
};
|
||||
static const int16_t ALIGN(av1_default_scan_16x8[], 32) = {
|
||||
static const uint16_t ALIGN(av1_default_scan_16x8[], 32) = {
|
||||
0, 1, 8, 2, 9, 16, 3, 10, 17, 24, 4, 11, 18, 25, 32, 5,
|
||||
12, 19, 26, 33, 40, 6, 13, 20, 27, 34, 41, 48, 7, 14, 21, 28,
|
||||
35, 42, 49, 56, 15, 22, 29, 36, 43, 50, 57, 64, 23, 30, 37, 44,
|
||||
@ -296,7 +222,7 @@ static const int16_t ALIGN(av1_default_scan_16x8[], 32) = {
|
||||
99, 106, 113, 120, 79, 86, 93, 100, 107, 114, 121, 87, 94, 101, 108, 115,
|
||||
122, 95, 102, 109, 116, 123, 103, 110, 117, 124, 111, 118, 125, 119, 126, 127,
|
||||
};
|
||||
static const int16_t ALIGN(av1_mrow_scan_16x8[], 32) = {
|
||||
static const uint16_t ALIGN(av1_mrow_scan_16x8[], 32) = {
|
||||
0, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120,
|
||||
1, 9, 17, 25, 33, 41, 49, 57, 65, 73, 81, 89, 97, 105, 113, 121,
|
||||
2, 10, 18, 26, 34, 42, 50, 58, 66, 74, 82, 90, 98, 106, 114, 122,
|
||||
@ -306,17 +232,7 @@ static const int16_t ALIGN(av1_mrow_scan_16x8[], 32) = {
|
||||
6, 14, 22, 30, 38, 46, 54, 62, 70, 78, 86, 94, 102, 110, 118, 126,
|
||||
7, 15, 23, 31, 39, 47, 55, 63, 71, 79, 87, 95, 103, 111, 119, 127,
|
||||
};
|
||||
static const int16_t ALIGN(av1_mcol_scan_16x8[], 32) = {
|
||||
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
|
||||
16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
|
||||
32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
|
||||
48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63,
|
||||
64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79,
|
||||
80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95,
|
||||
96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111,
|
||||
112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127,
|
||||
};
|
||||
static const int16_t ALIGN(av1_default_scan_16x16[], 32) = {
|
||||
static const uint16_t ALIGN(av1_default_scan_16x16[], 32) = {
|
||||
0, 16, 1, 2, 17, 32, 48, 33, 18, 3, 4, 19, 34, 49, 64, 80,
|
||||
65, 50, 35, 20, 5, 6, 21, 36, 51, 66, 81, 96, 112, 97, 82, 67,
|
||||
52, 37, 22, 7, 8, 23, 38, 53, 68, 83, 98, 113, 128, 144, 129, 114,
|
||||
@ -334,7 +250,7 @@ static const int16_t ALIGN(av1_default_scan_16x16[], 32) = {
|
||||
188, 173, 158, 143, 159, 174, 189, 204, 219, 234, 249, 250, 235, 220, 205, 190,
|
||||
175, 191, 206, 221, 236, 251, 252, 237, 222, 207, 223, 238, 253, 254, 239, 255,
|
||||
};
|
||||
static const int16_t ALIGN(av1_mrow_scan_16x16[], 32) = {
|
||||
static const uint16_t ALIGN(av1_mrow_scan_16x16[], 32) = {
|
||||
0, 16, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, 240,
|
||||
1, 17, 33, 49, 65, 81, 97, 113, 129, 145, 161, 177, 193, 209, 225, 241,
|
||||
2, 18, 34, 50, 66, 82, 98, 114, 130, 146, 162, 178, 194, 210, 226, 242,
|
||||
@ -352,7 +268,7 @@ static const int16_t ALIGN(av1_mrow_scan_16x16[], 32) = {
|
||||
14, 30, 46, 62, 78, 94, 110, 126, 142, 158, 174, 190, 206, 222, 238, 254,
|
||||
15, 31, 47, 63, 79, 95, 111, 127, 143, 159, 175, 191, 207, 223, 239, 255,
|
||||
};
|
||||
static const int16_t ALIGN(av1_mcol_scan_16x16[], 32) = {
|
||||
static const uint16_t ALIGN(av1_mcol_scan_16x16[], 32) = {
|
||||
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
|
||||
16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
|
||||
32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
|
||||
@ -370,7 +286,7 @@ static const int16_t ALIGN(av1_mcol_scan_16x16[], 32) = {
|
||||
224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239,
|
||||
240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, 255,
|
||||
};
|
||||
static const int16_t ALIGN(av1_default_scan_16x32[], 32) = {
|
||||
static const uint16_t ALIGN(av1_default_scan_16x32[], 32) = {
|
||||
0, 32, 1, 64, 33, 2, 96, 65, 34, 3, 128, 97, 66, 35, 4, 160,
|
||||
129, 98, 67, 36, 5, 192, 161, 130, 99, 68, 37, 6, 224, 193, 162, 131,
|
||||
100, 69, 38, 7, 256, 225, 194, 163, 132, 101, 70, 39, 8, 288, 257, 226,
|
||||
@ -404,7 +320,7 @@ static const int16_t ALIGN(av1_default_scan_16x32[], 32) = {
|
||||
380, 349, 318, 287, 505, 474, 443, 412, 381, 350, 319, 506, 475, 444, 413, 382,
|
||||
351, 507, 476, 445, 414, 383, 508, 477, 446, 415, 509, 478, 447, 510, 479, 511,
|
||||
};
|
||||
static const int16_t ALIGN(av1_default_scan_32x8[], 32) = {
|
||||
static const uint16_t ALIGN(av1_default_scan_32x8[], 32) = {
|
||||
0, 1, 8, 2, 9, 16, 3, 10, 17, 24, 4, 11, 18, 25, 32, 5, 12, 19, 26, 33, 40, 6, 13, 20, 27, 34, 41, 48, 7, 14, 21, 28,
|
||||
35, 42, 49, 56, 15, 22, 29, 36, 43, 50, 57, 64, 23, 30, 37, 44, 51, 58, 65, 72, 31, 38, 45, 52, 59, 66, 73, 80, 39, 46, 53, 60,
|
||||
67, 74, 81, 88, 47, 54, 61, 68, 75, 82, 89, 96, 55, 62, 69, 76, 83, 90, 97, 104, 63, 70, 77, 84, 91, 98, 105, 112, 71, 78, 85, 92,
|
||||
@ -414,7 +330,7 @@ static const int16_t ALIGN(av1_default_scan_32x8[], 32) = {
|
||||
195, 202, 209, 216, 175, 182, 189, 196, 203, 210, 217, 224, 183, 190, 197, 204, 211, 218, 225, 232, 191, 198, 205, 212, 219, 226, 233, 240, 199, 206, 213, 220,
|
||||
227, 234, 241, 248, 207, 214, 221, 228, 235, 242, 249, 215, 222, 229, 236, 243, 250, 223, 230, 237, 244, 251, 231, 238, 245, 252, 239, 246, 253, 247, 254, 255,
|
||||
};
|
||||
static const int16_t ALIGN(av1_default_scan_32x16[], 32) = {
|
||||
static const uint16_t ALIGN(av1_default_scan_32x16[], 32) = {
|
||||
0, 1, 16, 2, 17, 32, 3, 18, 33, 48, 4, 19, 34, 49, 64, 5, 20, 35, 50, 65, 80, 6, 21, 36, 51, 66, 81, 96, 7, 22, 37, 52,
|
||||
67, 82, 97, 112, 8, 23, 38, 53, 68, 83, 98, 113, 128, 9, 24, 39, 54, 69, 84, 99, 114, 129, 144, 10, 25, 40, 55, 70, 85, 100, 115, 130,
|
||||
145, 160, 11, 26, 41, 56, 71, 86, 101, 116, 131, 146, 161, 176, 12, 27, 42, 57, 72, 87, 102, 117, 132, 147, 162, 177, 192, 13, 28, 43, 58, 73,
|
||||
@ -432,7 +348,7 @@ static const int16_t ALIGN(av1_default_scan_32x16[], 32) = {
|
||||
381, 396, 411, 426, 441, 456, 471, 486, 501, 367, 382, 397, 412, 427, 442, 457, 472, 487, 502, 383, 398, 413, 428, 443, 458, 473, 488, 503, 399, 414, 429, 444,
|
||||
459, 474, 489, 504, 415, 430, 445, 460, 475, 490, 505, 431, 446, 461, 476, 491, 506, 447, 462, 477, 492, 507, 463, 478, 493, 508, 479, 494, 509, 495, 510, 511,
|
||||
};
|
||||
static const int16_t ALIGN(av1_default_scan_32x32[], 32) = {
|
||||
static const uint16_t ALIGN(av1_default_scan_32x32[], 32) = {
|
||||
0, 32, 1, 2, 33, 64, 96, 65, 34, 3, 4, 35, 66, 97, 128, 160, 129, 98, 67, 36, 5, 6, 37, 68, 99, 130, 161, 192, 224, 193, 162, 131,
|
||||
100, 69, 38, 7, 8, 39, 70, 101, 132, 163, 194, 225, 256, 288, 257, 226, 195, 164, 133, 102, 71, 40, 9, 10, 41, 72, 103, 134, 165, 196, 227, 258,
|
||||
289, 320, 352, 321, 290, 259, 228, 197, 166, 135, 104, 73, 42, 11, 12, 43, 74, 105, 136, 167, 198, 229, 260, 291, 322, 353, 384, 416, 385, 354, 323, 292,
|
||||
@ -467,15 +383,15 @@ static const int16_t ALIGN(av1_default_scan_32x32[], 32) = {
|
||||
892, 861, 830, 799, 831, 862, 893, 924, 955, 986, 1017, 1018, 987, 956, 925, 894, 863, 895, 926, 957, 988, 1019, 1020, 989, 958, 927, 959, 990, 1021, 1022, 991, 1023,
|
||||
};
|
||||
|
||||
const int16_t *const dav1d_scans[N_RECT_TX_SIZES][3] = {
|
||||
const uint16_t *const dav1d_scans[N_RECT_TX_SIZES][3] = {
|
||||
[TX_4X4] = {
|
||||
[TX_CLASS_2D] = av1_default_scan_4x4,
|
||||
[TX_CLASS_V] = av1_mrow_scan_4x4,
|
||||
[TX_CLASS_H] = av1_mcol_scan_4x4,
|
||||
[TX_CLASS_H] = av1_mcol_scan_16x16,
|
||||
}, [TX_8X8] = {
|
||||
[TX_CLASS_2D] = av1_default_scan_8x8,
|
||||
[TX_CLASS_V] = av1_mrow_scan_8x8,
|
||||
[TX_CLASS_H] = av1_mcol_scan_8x8,
|
||||
[TX_CLASS_H] = av1_mcol_scan_16x16,
|
||||
}, [TX_16X16] = {
|
||||
[TX_CLASS_2D] = av1_default_scan_16x16,
|
||||
[TX_CLASS_V] = av1_mrow_scan_16x16,
|
||||
@ -487,19 +403,19 @@ const int16_t *const dav1d_scans[N_RECT_TX_SIZES][3] = {
|
||||
}, [RTX_4X8] = {
|
||||
[TX_CLASS_2D] = av1_default_scan_4x8,
|
||||
[TX_CLASS_V] = av1_mrow_scan_4x8,
|
||||
[TX_CLASS_H] = av1_mcol_scan_4x8,
|
||||
[TX_CLASS_H] = av1_mcol_scan_16x16,
|
||||
}, [RTX_8X4] = {
|
||||
[TX_CLASS_2D] = av1_default_scan_8x4,
|
||||
[TX_CLASS_V] = av1_mrow_scan_8x4,
|
||||
[TX_CLASS_H] = av1_mcol_scan_8x4,
|
||||
[TX_CLASS_H] = av1_mcol_scan_16x16,
|
||||
}, [RTX_8X16] = {
|
||||
[TX_CLASS_2D] = av1_default_scan_8x16,
|
||||
[TX_CLASS_V] = av1_mrow_scan_8x16,
|
||||
[TX_CLASS_H] = av1_mcol_scan_8x16,
|
||||
[TX_CLASS_H] = av1_mcol_scan_16x16,
|
||||
}, [RTX_16X8] = {
|
||||
[TX_CLASS_2D] = av1_default_scan_16x8,
|
||||
[TX_CLASS_V] = av1_mrow_scan_16x8,
|
||||
[TX_CLASS_H] = av1_mcol_scan_16x8,
|
||||
[TX_CLASS_H] = av1_mcol_scan_16x16,
|
||||
}, [RTX_16X32] = {
|
||||
[TX_CLASS_2D] = av1_default_scan_16x32,
|
||||
}, [RTX_32X16] = {
|
||||
@ -511,11 +427,11 @@ const int16_t *const dav1d_scans[N_RECT_TX_SIZES][3] = {
|
||||
}, [RTX_4X16] = {
|
||||
[TX_CLASS_2D] = av1_default_scan_4x16,
|
||||
[TX_CLASS_V] = av1_mrow_scan_4x16,
|
||||
[TX_CLASS_H] = av1_mcol_scan_4x16,
|
||||
[TX_CLASS_H] = av1_mcol_scan_16x16,
|
||||
}, [RTX_16X4] = {
|
||||
[TX_CLASS_2D] = av1_default_scan_16x4,
|
||||
[TX_CLASS_V] = av1_mrow_scan_16x4,
|
||||
[TX_CLASS_H] = av1_mcol_scan_16x4,
|
||||
[TX_CLASS_H] = av1_mcol_scan_16x16,
|
||||
}, [RTX_8X32] = {
|
||||
[TX_CLASS_2D] = av1_default_scan_8x32,
|
||||
}, [RTX_32X8] = {
|
||||
|
2
third_party/dav1d/src/scan.h
vendored
2
third_party/dav1d/src/scan.h
vendored
@ -32,6 +32,6 @@
|
||||
|
||||
#include "src/levels.h"
|
||||
|
||||
extern const int16_t *const dav1d_scans[N_RECT_TX_SIZES][3];
|
||||
extern const uint16_t *const dav1d_scans[N_RECT_TX_SIZES][3];
|
||||
|
||||
#endif /* DAV1D_SRC_SCAN_H */
|
||||
|
184
third_party/dav1d/src/tables.c
vendored
184
third_party/dav1d/src/tables.c
vendored
@ -225,37 +225,26 @@ const uint8_t /* enum InterPredMode */
|
||||
[NEARMV_NEWMV] = { NEARMV, NEWMV },
|
||||
};
|
||||
|
||||
const uint8_t dav1d_tx_type_count[N_TXTP_SETS] = {
|
||||
[TXTP_SET_DCT] = 1,
|
||||
[TXTP_SET_DCT_ID] = 2,
|
||||
[TXTP_SET_DT4_ID] = 5,
|
||||
[TXTP_SET_DT4_ID_1D] = 7,
|
||||
[TXTP_SET_DT9_ID_1D] = 12,
|
||||
[TXTP_SET_ALL] = 16,
|
||||
[TXTP_SET_LOSSLESS] = 1,
|
||||
const uint8_t dav1d_partition_type_count[N_BL_LEVELS] = {
|
||||
[BL_128X128] = N_PARTITIONS - 3,
|
||||
[BL_64X64] = N_PARTITIONS - 1,
|
||||
[BL_32X32] = N_PARTITIONS - 1,
|
||||
[BL_16X16] = N_PARTITIONS - 1,
|
||||
[BL_8X8] = N_SUB8X8_PARTITIONS - 1,
|
||||
};
|
||||
|
||||
const uint8_t /* enum TxfmType */
|
||||
dav1d_tx_types_per_set[N_TXTP_SETS][N_TX_TYPES] =
|
||||
{
|
||||
[TXTP_SET_DCT] = { DCT_DCT },
|
||||
[TXTP_SET_DCT_ID] = { IDTX, DCT_DCT },
|
||||
[TXTP_SET_DT4_ID] = { IDTX, DCT_DCT, ADST_ADST, ADST_DCT, DCT_ADST },
|
||||
[TXTP_SET_DT4_ID_1D] = { IDTX, DCT_DCT, V_DCT, H_DCT, ADST_ADST, ADST_DCT,
|
||||
DCT_ADST },
|
||||
[TXTP_SET_DT9_ID_1D] = { IDTX, V_DCT, H_DCT, DCT_DCT, ADST_DCT, DCT_ADST,
|
||||
FLIPADST_DCT, DCT_FLIPADST, ADST_ADST,
|
||||
FLIPADST_FLIPADST, ADST_FLIPADST, FLIPADST_ADST },
|
||||
[TXTP_SET_ALL] = { IDTX, V_DCT, H_DCT, V_ADST, H_ADST, V_FLIPADST,
|
||||
H_FLIPADST, DCT_DCT, ADST_DCT, DCT_ADST,
|
||||
FLIPADST_DCT, DCT_FLIPADST, ADST_ADST,
|
||||
FLIPADST_FLIPADST, ADST_FLIPADST, FLIPADST_ADST },
|
||||
[TXTP_SET_LOSSLESS] = { WHT_WHT },
|
||||
};
|
||||
|
||||
const uint8_t dav1d_tx_type_set_index[2][N_TXTP_SETS] = {
|
||||
{ 0, -1, 2, 1, -1, -1, 3 },
|
||||
{ 0, 3, -1, -1, 2, 1, 4 },
|
||||
const uint8_t /* enum TxfmType */ dav1d_tx_types_per_set[40] = {
|
||||
/* Intra2 */
|
||||
IDTX, DCT_DCT, ADST_ADST, ADST_DCT, DCT_ADST,
|
||||
/* Intra1 */
|
||||
IDTX, DCT_DCT, V_DCT, H_DCT, ADST_ADST, ADST_DCT, DCT_ADST,
|
||||
/* Inter2 */
|
||||
IDTX, V_DCT, H_DCT, DCT_DCT, ADST_DCT, DCT_ADST, FLIPADST_DCT,
|
||||
DCT_FLIPADST, ADST_ADST, FLIPADST_FLIPADST, ADST_FLIPADST, FLIPADST_ADST,
|
||||
/* Inter1 */
|
||||
IDTX, V_DCT, H_DCT, V_ADST, H_ADST, V_FLIPADST, H_FLIPADST,
|
||||
DCT_DCT, ADST_DCT, DCT_ADST, FLIPADST_DCT, DCT_FLIPADST,
|
||||
ADST_ADST, FLIPADST_FLIPADST, ADST_FLIPADST, FLIPADST_ADST,
|
||||
};
|
||||
|
||||
const uint8_t dav1d_ymode_size_context[N_BS_SIZES] = {
|
||||
@ -283,119 +272,34 @@ const uint8_t dav1d_ymode_size_context[N_BS_SIZES] = {
|
||||
[BS_4x4 ] = 0,
|
||||
};
|
||||
|
||||
const uint8_t dav1d_nz_map_ctx_offset[N_RECT_TX_SIZES][5][5] = {
|
||||
[TX_4X4] = {
|
||||
{ 0, 1, 6, 6 },
|
||||
{ 1, 6, 6, 21 },
|
||||
{ 6, 6, 21, 21 },
|
||||
{ 6, 21, 21, 21 },
|
||||
}, [TX_8X8] = {
|
||||
{ 0, 1, 6, 6, 21 },
|
||||
{ 1, 6, 6, 21, 21 },
|
||||
{ 6, 6, 21, 21, 21 },
|
||||
{ 6, 21, 21, 21, 21 },
|
||||
{ 21, 21, 21, 21, 21 }
|
||||
}, [TX_16X16] = {
|
||||
{ 0, 1, 6, 6, 21 },
|
||||
{ 1, 6, 6, 21, 21 },
|
||||
{ 6, 6, 21, 21, 21 },
|
||||
{ 6, 21, 21, 21, 21 },
|
||||
{ 21, 21, 21, 21, 21 }
|
||||
}, [TX_32X32] = {
|
||||
{ 0, 1, 6, 6, 21 },
|
||||
{ 1, 6, 6, 21, 21 },
|
||||
{ 6, 6, 21, 21, 21 },
|
||||
{ 6, 21, 21, 21, 21 },
|
||||
{ 21, 21, 21, 21, 21 }
|
||||
}, [TX_64X64] = {
|
||||
{ 0, 1, 6, 6, 21 },
|
||||
{ 1, 6, 6, 21, 21 },
|
||||
{ 6, 6, 21, 21, 21 },
|
||||
{ 6, 21, 21, 21, 21 },
|
||||
{ 21, 21, 21, 21, 21 }
|
||||
}, [RTX_4X8] = {
|
||||
{ 0, 11, 11, 11 },
|
||||
{ 11, 11, 11, 11 },
|
||||
{ 6, 6, 21, 21 },
|
||||
{ 6, 21, 21, 21 },
|
||||
{ 21, 21, 21, 21 }
|
||||
}, [RTX_8X4] = {
|
||||
{ 0, 16, 6, 6, 21 },
|
||||
{ 16, 16, 6, 21, 21 },
|
||||
const uint8_t dav1d_lo_ctx_offsets[3][5][5] = {
|
||||
{ /* w == h */
|
||||
{ 0, 1, 6, 6, 21 },
|
||||
{ 1, 6, 6, 21, 21 },
|
||||
{ 6, 6, 21, 21, 21 },
|
||||
{ 6, 21, 21, 21, 21 },
|
||||
{ 21, 21, 21, 21, 21 },
|
||||
}, { /* w > h */
|
||||
{ 0, 16, 6, 6, 21 },
|
||||
{ 16, 16, 6, 21, 21 },
|
||||
{ 16, 16, 21, 21, 21 },
|
||||
{ 16, 16, 21, 21, 21 },
|
||||
}, [RTX_8X16] = {
|
||||
{ 0, 11, 11, 11, 11 },
|
||||
{ 16, 16, 21, 21, 21 },
|
||||
}, { /* w < h */
|
||||
{ 0, 11, 11, 11, 11 },
|
||||
{ 11, 11, 11, 11, 11 },
|
||||
{ 6, 6, 21, 21, 21 },
|
||||
{ 6, 21, 21, 21, 21 },
|
||||
{ 21, 21, 21, 21, 21 }
|
||||
}, [RTX_16X8] = {
|
||||
{ 0, 16, 6, 6, 21 },
|
||||
{ 16, 16, 6, 21, 21 },
|
||||
{ 16, 16, 21, 21, 21 },
|
||||
{ 16, 16, 21, 21, 21 },
|
||||
{ 16, 16, 21, 21, 21 }
|
||||
}, [RTX_16X32] = {
|
||||
{ 0, 11, 11, 11, 11 },
|
||||
{ 11, 11, 11, 11, 11 },
|
||||
{ 6, 6, 21, 21, 21 },
|
||||
{ 6, 21, 21, 21, 21 },
|
||||
{ 21, 21, 21, 21, 21 }
|
||||
}, [RTX_32X16] = {
|
||||
{ 0, 16, 6, 6, 21 },
|
||||
{ 16, 16, 6, 21, 21 },
|
||||
{ 16, 16, 21, 21, 21 },
|
||||
{ 16, 16, 21, 21, 21 },
|
||||
{ 16, 16, 21, 21, 21 }
|
||||
}, [RTX_32X64] = {
|
||||
{ 0, 11, 11, 11, 11 },
|
||||
{ 11, 11, 11, 11, 11 },
|
||||
{ 6, 6, 21, 21, 21 },
|
||||
{ 6, 21, 21, 21, 21 },
|
||||
{ 21, 21, 21, 21, 21 }
|
||||
}, [RTX_64X32] = {
|
||||
{ 0, 16, 6, 6, 21 },
|
||||
{ 16, 16, 6, 21, 21 },
|
||||
{ 16, 16, 21, 21, 21 },
|
||||
{ 16, 16, 21, 21, 21 },
|
||||
{ 16, 16, 21, 21, 21 }
|
||||
}, [RTX_4X16] = {
|
||||
{ 0, 11, 11, 11 },
|
||||
{ 11, 11, 11, 11 },
|
||||
{ 6, 6, 21, 21 },
|
||||
{ 6, 21, 21, 21 },
|
||||
{ 21, 21, 21, 21 }
|
||||
}, [RTX_16X4] = {
|
||||
{ 0, 16, 6, 6, 21 },
|
||||
{ 16, 16, 6, 21, 21 },
|
||||
{ 16, 16, 21, 21, 21 },
|
||||
{ 16, 16, 21, 21, 21 },
|
||||
}, [RTX_8X32] = {
|
||||
{ 0, 11, 11, 11, 11 },
|
||||
{ 11, 11, 11, 11, 11 },
|
||||
{ 6, 6, 21, 21, 21 },
|
||||
{ 6, 21, 21, 21, 21 },
|
||||
{ 21, 21, 21, 21, 21 }
|
||||
}, [RTX_32X8] = {
|
||||
{ 0, 16, 6, 6, 21 },
|
||||
{ 16, 16, 6, 21, 21 },
|
||||
{ 16, 16, 21, 21, 21 },
|
||||
{ 16, 16, 21, 21, 21 },
|
||||
{ 16, 16, 21, 21, 21 }
|
||||
}, [RTX_16X64] = {
|
||||
{ 0, 11, 11, 11, 11 },
|
||||
{ 11, 11, 11, 11, 11 },
|
||||
{ 6, 6, 21, 21, 21 },
|
||||
{ 6, 21, 21, 21, 21 },
|
||||
{ 21, 21, 21, 21, 21 }
|
||||
}, [RTX_64X16] = {
|
||||
{ 0, 16, 6, 6, 21 },
|
||||
{ 16, 16, 6, 21, 21 },
|
||||
{ 16, 16, 21, 21, 21 },
|
||||
{ 16, 16, 21, 21, 21 },
|
||||
{ 16, 16, 21, 21, 21 }
|
||||
}
|
||||
{ 6, 6, 21, 21, 21 },
|
||||
{ 6, 21, 21, 21, 21 },
|
||||
{ 21, 21, 21, 21, 21 },
|
||||
},
|
||||
};
|
||||
|
||||
/*
 * 5x5 lookup table of context values in the range 1..6.
 *
 * The table is symmetric (entry [a][b] equals entry [b][a]), and rows and
 * columns 1..3 are identical, so only three distinct index classes exist
 * per axis: 0, 1..3 and 4.
 * NOTE(review): the meaning of the two indices is not visible here;
 * presumably they are derived from the neighbouring blocks' coefficient
 * state - confirm against the code that reads this table.
 */
const uint8_t dav1d_skip_ctx[5][5] = {
|
||||
{ 1, 2, 2, 2, 3 },
|
||||
{ 2, 4, 4, 4, 5 },
|
||||
{ 2, 4, 4, 4, 5 },
|
||||
{ 2, 4, 4, 4, 5 },
|
||||
{ 3, 5, 5, 5, 6 },
|
||||
};
|
||||
|
||||
const uint8_t /* enum TxClass */ dav1d_tx_type_class[N_TX_TYPES_PLUS_LL] = {
|
||||
@ -861,7 +765,7 @@ const int8_t ALIGN(dav1d_filter_intra_taps[5][64], 16) = {
|
||||
}
|
||||
};
|
||||
|
||||
const uint8_t dav1d_obmc_masks[64] = {
|
||||
const uint8_t ALIGN(dav1d_obmc_masks[64], 16) = {
|
||||
/* Unused */
|
||||
0, 0,
|
||||
/* 2 */
|
||||
|
9
third_party/dav1d/src/tables.h
vendored
9
third_party/dav1d/src/tables.h
vendored
@ -52,14 +52,13 @@ extern const uint8_t /* enum TxfmType */
|
||||
extern const uint8_t /* enum InterPredMode */
|
||||
dav1d_comp_inter_pred_modes[N_COMP_INTER_PRED_MODES][2];
|
||||
|
||||
extern const uint8_t dav1d_tx_type_count[N_TXTP_SETS];
|
||||
extern const uint8_t /* enum TxfmType */
|
||||
dav1d_tx_types_per_set[N_TXTP_SETS][N_TX_TYPES];
|
||||
extern const uint8_t dav1d_tx_type_set_index[2][N_TXTP_SETS];
|
||||
extern const uint8_t dav1d_partition_type_count[N_BL_LEVELS];
|
||||
extern const uint8_t /* enum TxfmType */ dav1d_tx_types_per_set[40];
|
||||
|
||||
extern const uint8_t dav1d_filter_mode_to_y_mode[5];
|
||||
extern const uint8_t dav1d_ymode_size_context[N_BS_SIZES];
|
||||
extern const uint8_t dav1d_nz_map_ctx_offset[N_RECT_TX_SIZES][5][5];
|
||||
extern const uint8_t dav1d_lo_ctx_offsets[3][5][5];
|
||||
extern const uint8_t dav1d_skip_ctx[5][5];
|
||||
extern const uint8_t /* enum TxClass */
|
||||
dav1d_tx_type_class[N_TX_TYPES_PLUS_LL];
|
||||
extern const uint8_t /* enum Filter2d */
|
||||
|
37
third_party/dav1d/src/thread.h
vendored
37
third_party/dav1d/src/thread.h
vendored
@ -48,6 +48,10 @@ typedef SRWLOCK pthread_mutex_t;
|
||||
typedef CONDITION_VARIABLE pthread_cond_t;
|
||||
typedef INIT_ONCE pthread_once_t;
|
||||
|
||||
/* One-time setup for the Win32 threading helpers declared below. */
void dav1d_init_thread(void);
|
||||
/* Names a thread; on this platform the name is a wide-character string. */
void dav1d_set_thread_name(const wchar_t *name);
|
||||
/* Pastes an L prefix onto the argument so that call sites can pass an
 * ordinary string literal ("foo") and have it turned into a wide literal
 * (L"foo"). Because of the token paste, the argument must be a literal. */
#define dav1d_set_thread_name(name) dav1d_set_thread_name(L##name)
|
||||
|
||||
int dav1d_pthread_create(pthread_t *thread, const pthread_attr_t *attr,
|
||||
void *(*func)(void*), void *arg);
|
||||
int dav1d_pthread_join(pthread_t *thread, void **res);
|
||||
@ -126,7 +130,7 @@ static inline int pthread_cond_broadcast(pthread_cond_t *const cond) {
|
||||
|
||||
#include <pthread.h>
|
||||
|
||||
#endif
|
||||
#define dav1d_init_thread() do {} while (0)
|
||||
|
||||
/* Thread naming support */
|
||||
|
||||
@ -134,13 +138,40 @@ static inline int pthread_cond_broadcast(pthread_cond_t *const cond) {
|
||||
|
||||
#include <sys/prctl.h>
|
||||
|
||||
static inline void dav1d_set_thread_name(const char* name) {
|
||||
static inline void dav1d_set_thread_name(const char *const name) {
|
||||
prctl(PR_SET_NAME, name);
|
||||
}
|
||||
|
||||
#elif defined(__APPLE__)
|
||||
|
||||
/* __APPLE__ variant: pthread_setname_np() is called with the name only;
 * no thread handle is passed on this platform. */
static inline void dav1d_set_thread_name(const char *const name) {
|
||||
pthread_setname_np(name);
|
||||
}
|
||||
|
||||
#elif defined(__DragonFly__) || defined(__FreeBSD__) || defined(__OpenBSD__)
|
||||
|
||||
#if defined(__FreeBSD__)
|
||||
/* ALIGN from <sys/param.h> conflicts with ALIGN from "common/attributes.h" */
|
||||
#define _SYS_PARAM_H_
|
||||
#include <sys/types.h>
|
||||
#endif
|
||||
#include <pthread_np.h>
|
||||
|
||||
/* DragonFly/FreeBSD/OpenBSD variant: uses pthread_set_name_np() from
 * <pthread_np.h>, passing pthread_self() so the calling thread is named. */
static inline void dav1d_set_thread_name(const char *const name) {
|
||||
pthread_set_name_np(pthread_self(), name);
|
||||
}
|
||||
|
||||
#elif defined(__NetBSD__)
|
||||
|
||||
/* __NetBSD__ variant: pthread_setname_np() here takes a thread, a format
 * string and one argument, so the name is routed through "%s" rather than
 * being used as the format itself. */
static inline void dav1d_set_thread_name(const char *const name) {
|
||||
pthread_setname_np(pthread_self(), "%s", (void*)name);
|
||||
}
|
||||
|
||||
#else
|
||||
|
||||
#define dav1d_set_thread_name(name)
|
||||
#define dav1d_set_thread_name(name) do {} while (0)
|
||||
|
||||
#endif
|
||||
|
||||
#endif
|
||||
|
||||
|
1
third_party/dav1d/src/warpmv.c
vendored
1
third_party/dav1d/src/warpmv.c
vendored
@ -27,7 +27,6 @@
|
||||
|
||||
#include "config.h"
|
||||
|
||||
#include <assert.h>
|
||||
#include <stdlib.h>
|
||||
|
||||
#include "common/intops.h"
|
||||
|
14
third_party/dav1d/src/win32/thread.c
vendored
14
third_party/dav1d/src/win32/thread.c
vendored
@ -37,6 +37,20 @@
|
||||
|
||||
#include "src/thread.h"
|
||||
|
||||
/* Runtime-resolved pointer to kernel32's SetThreadDescription. It is only
 * assigned by dav1d_init_thread(); file-scope static storage means it is
 * null until then. */
static HRESULT (WINAPI *set_thread_description)(HANDLE, PCWSTR);
|
||||
|
||||
/*
 * Looks up "SetThreadDescription" in kernel32.dll by name and caches the
 * result in set_thread_description. Resolving the symbol dynamically, not
 * linking against it, lets callers test the pointer before use.
 */
COLD void dav1d_init_thread(void) {
|
||||
set_thread_description =
|
||||
(void*)GetProcAddress(GetModuleHandleW(L"kernel32.dll"),
|
||||
"SetThreadDescription");
|
||||
}
|
||||
|
||||
/* Drop the function-like macro of the same name so that the definition
 * below is not macro-expanded. */
#undef dav1d_set_thread_name
|
||||
/*
 * Sets the description (name) of the current thread to the wide string
 * `name`. Does nothing when set_thread_description is null, i.e. when the
 * entry point was not resolved.
 */
COLD void dav1d_set_thread_name(const wchar_t *const name) {
|
||||
if (set_thread_description) /* Only available since Windows 10 1607 */
|
||||
set_thread_description(GetCurrentThread(), name);
|
||||
}
|
||||
|
||||
static COLD unsigned __stdcall thread_entrypoint(void *const data) {
|
||||
pthread_t *const t = data;
|
||||
t->arg = t->func(t->arg);
|
||||
|
1582
third_party/dav1d/src/x86/film_grain.asm
vendored
Normal file
1582
third_party/dav1d/src/x86/film_grain.asm
vendored
Normal file
File diff suppressed because it is too large
Load Diff
45
third_party/dav1d/src/x86/film_grain_init_tmpl.c
vendored
Normal file
45
third_party/dav1d/src/x86/film_grain_init_tmpl.c
vendored
Normal file
@ -0,0 +1,45 @@
|
||||
/*
|
||||
* Copyright © 2018, VideoLAN and dav1d authors
|
||||
* Copyright © 2018, Two Orioles, LLC
|
||||
* All rights reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright notice, this
|
||||
* list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
* this list of conditions and the following disclaimer in the documentation
|
||||
* and/or other materials provided with the distribution.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
|
||||
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
|
||||
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
|
||||
* ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
|
||||
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
|
||||
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
|
||||
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
#include "src/cpu.h"
|
||||
#include "src/film_grain.h"
|
||||
|
||||
decl_generate_grain_y_fn(dav1d_generate_grain_y_avx2);
|
||||
decl_fgy_32x32xn_fn(dav1d_fgy_32x32xn_avx2);
|
||||
decl_fguv_32x32xn_fn(dav1d_fguv_32x32xn_i420_avx2);
|
||||
|
||||
/*
 * Installs the x86 AVX2 film-grain routines into the DSP context `c`.
 *
 * Returns without modifying `c` when the runtime CPU flags do not include
 * DAV1D_X86_CPU_FLAG_AVX2. The assignments are compiled in only when
 * BITDEPTH == 8 and ARCH_X86_64 is set; in any other configuration the
 * function leaves `c` untouched even on AVX2-capable CPUs.
 */
COLD void bitfn(dav1d_film_grain_dsp_init_x86)(Dav1dFilmGrainDSPContext *const c) {
|
||||
const unsigned flags = dav1d_get_cpu_flags();
|
||||
|
||||
/* Every routine installed below is AVX2, so bail out early without it. */
if (!(flags & DAV1D_X86_CPU_FLAG_AVX2)) return;
|
||||
|
||||
#if BITDEPTH == 8 && ARCH_X86_64
|
||||
c->generate_grain_y = dav1d_generate_grain_y_avx2;
|
||||
c->fgy_32x32xn = dav1d_fgy_32x32xn_avx2;
|
||||
/* Only the I420 chroma slot is filled in; the array is indexed by
 * pixel layout minus one. */
c->fguv_32x32xn[DAV1D_PIXEL_LAYOUT_I420 - 1] = dav1d_fguv_32x32xn_i420_avx2;
|
||||
#endif
|
||||
}
|
409
third_party/dav1d/src/x86/itx.asm
vendored
409
third_party/dav1d/src/x86/itx.asm
vendored
@ -47,9 +47,11 @@ pw_m%2_%1: dw -%2, %1
|
||||
pw_3803_1321: dw 3803, 1321
|
||||
pw_m1321_2482: dw -1321, 2482
|
||||
pw_2482_3344: dw 2482, 3344
|
||||
pw_m3344_3344: dw -3344, 3344
|
||||
pw_m3803_3344: dw -3803, 3344
|
||||
pw_m3803_m6688: dw -3803, -6688
|
||||
%define pw_3344x8 iadst4_dconly2b
|
||||
COEF_PAIR 2896, 2896
|
||||
pw_2896_m2896: dw 2896, -2896
|
||||
|
||||
pw_5: times 2 dw 5
|
||||
pw_2048: times 2 dw 2048
|
||||
@ -464,13 +466,15 @@ ALIGN function_align
|
||||
%macro IADST4_1D_PACKED 0
|
||||
punpcklwd m2, m1, m0
|
||||
punpckhwd m3, m1, m0
|
||||
psubw m0, m1
|
||||
punpckhqdq m1, m1
|
||||
paddw m1, m0 ; in0 - in2 + in3
|
||||
vpbroadcastd m5, [o(pw_m3344_3344)]
|
||||
vpbroadcastd m0, [o(pw_3803_1321)]
|
||||
vpbroadcastd m4, [o(pw_m1321_2482)]
|
||||
pmaddwd m1, m5, m2 ; 3344*in3 - 3344*in2
|
||||
psrld m5, 16
|
||||
pmaddwd m0, m2
|
||||
pmaddwd m2, m4
|
||||
pmaddwd m5, m3 ; 3344*in0
|
||||
paddd m1, m5 ; 3344*in0 - 3344*in2 + 3344*in3
|
||||
vpbroadcastd m4, [o(pw_2482_3344)]
|
||||
vpbroadcastd m5, [o(pw_m3803_3344)]
|
||||
pmaddwd m4, m3
|
||||
@ -478,19 +482,16 @@ ALIGN function_align
|
||||
paddd m4, m0 ; 1321*in0 + 3344*in1 + 3803*in2 + 2482*in3
|
||||
vpbroadcastd m0, [o(pw_m3803_m6688)]
|
||||
pmaddwd m3, m0
|
||||
vpbroadcastd m0, [o(pw_3344x8)]
|
||||
pmulhrsw m1, m0 ; out2 ____
|
||||
vpbroadcastd m0, [o(pd_2048)]
|
||||
paddd m2, m0
|
||||
paddd m1, m0
|
||||
paddd m0, m4
|
||||
paddd m5, m2 ; 2482*in0 + 3344*in1 - 1321*in2 - 3803*in3
|
||||
paddd m2, m4
|
||||
paddd m2, m3
|
||||
psrad m0, 12
|
||||
psrad m5, 12
|
||||
psrad m2, 12
|
||||
REPX {psrad x, 12}, m1, m2, m0, m5
|
||||
packssdw m0, m5 ; out0 out1
|
||||
packssdw m2, m2 ; out3 out3
|
||||
packssdw m1, m2 ; out2 out3
|
||||
%endmacro
|
||||
|
||||
INV_TXFM_4X4_FN dct, dct, 0
|
||||
@ -524,14 +525,13 @@ cglobal iadst_4x4_internal, 0, 5, 6, dst, stride, c, eob, tx2
|
||||
mova m0, [cq+16*0]
|
||||
mova m1, [cq+16*1]
|
||||
call .main
|
||||
punpckhwd m3, m0, m2
|
||||
punpckhwd m3, m0, m1
|
||||
punpcklwd m0, m1
|
||||
punpckhwd m1, m0, m3
|
||||
punpcklwd m0, m3
|
||||
jmp tx2q
|
||||
.pass2:
|
||||
call .main
|
||||
vpblendd m1, m1, m2, 0x0c ; out2 out3
|
||||
.end:
|
||||
pxor m2, m2
|
||||
mova [cq+16*0], m2
|
||||
@ -552,14 +552,13 @@ cglobal iflipadst_4x4_internal, 0, 5, 6, dst, stride, c, eob, tx2
|
||||
mova m0, [cq+16*0]
|
||||
mova m1, [cq+16*1]
|
||||
call m(iadst_4x4_internal).main
|
||||
punpcklwd m1, m0
|
||||
punpckhwd m2, m0
|
||||
punpcklwd m0, m2, m1
|
||||
punpckhwd m1, m2, m1
|
||||
punpcklwd m2, m1, m0
|
||||
punpckhwd m1, m0
|
||||
punpcklwd m0, m1, m2
|
||||
punpckhwd m1, m2
|
||||
jmp tx2q
|
||||
.pass2:
|
||||
call m(iadst_4x4_internal).main
|
||||
vpblendd m1, m1, m2, 0x0c ; out2 out3
|
||||
.end:
|
||||
pxor m2, m2
|
||||
mova [cq+16*0], m2
|
||||
@ -710,12 +709,55 @@ cglobal iidentity_4x4_internal, 0, 5, 6, dst, stride, c, eob, tx2
|
||||
paddsw m1, m5 ; out3 out2
|
||||
%endmacro
|
||||
|
||||
%macro IADST8_1D_PACKED 0
|
||||
%macro IADST8_1D_PACKED 1 ; pass
|
||||
vpbroadcastd m6, [o(pd_2048)]
|
||||
punpckhwd m0, m4, m3 ; 0 7
|
||||
punpckhwd m1, m5, m2 ; 2 5
|
||||
punpcklwd m2, m5 ; 4 3
|
||||
punpcklwd m3, m4 ; 6 1
|
||||
%if %1 == 1
|
||||
ITX_MUL2X_PACK 0, 4, 5, 6, 401, 4076, 3 ; t1a t0a
|
||||
ITX_MUL2X_PACK 1, 4, 5, 6, 1931, 3612, 2 ; t2a t3a
|
||||
ITX_MUL2X_PACK 2, 4, 5, 6, 3166, 2598, 3 ; t5a t4a
|
||||
ITX_MUL2X_PACK 3, 4, 5, 6, 3920, 1189, 2 ; t6a t7a
|
||||
psubsw m4, m0, m2 ; t5 t4
|
||||
paddsw m0, m2 ; t1 t0
|
||||
psubsw m5, m1, m3 ; t6 t7
|
||||
paddsw m1, m3 ; t2 t3
|
||||
ITX_MUL2X_PACK 4, 2, 3, 6, 1567, 3784, 3 ; t5a t4a
|
||||
ITX_MUL2X_PACK 5, 2, 3, 6, 3784, 1567, 2 ; t7a t6a
|
||||
%if mmsize > 16
|
||||
vbroadcasti128 m2, [o(deint_shuf)]
|
||||
%else
|
||||
mova m2, [o(deint_shuf)]
|
||||
%endif
|
||||
pshuflw m1, m1, q2301
|
||||
pshufhw m1, m1, q2301
|
||||
psubsw m3, m0, m1 ; t3 t2
|
||||
paddsw m0, m1 ; -out7 out0
|
||||
psubsw m1, m4, m5 ; t7 t6
|
||||
paddsw m4, m5 ; out6 -out1
|
||||
pshufb m0, m2
|
||||
pshufb m4, m2
|
||||
vpbroadcastd m5, [o(pw_m2896_2896)]
|
||||
pmaddwd m2, m5, m3
|
||||
pmaddwd m5, m1
|
||||
paddd m2, m6
|
||||
paddd m5, m6
|
||||
psrad m2, 12
|
||||
psrad m5, 12
|
||||
packssdw m2, m5 ; out4 -out5
|
||||
vpbroadcastd m5, [o(pw_2896_2896)]
|
||||
pmaddwd m3, m5
|
||||
pmaddwd m1, m5
|
||||
paddd m3, m6
|
||||
paddd m1, m6
|
||||
psrad m3, 12
|
||||
psrad m1, 12
|
||||
packssdw m1, m3 ; out2 -out3
|
||||
punpcklqdq m3, m4, m0 ; out6 -out7
|
||||
punpckhqdq m0, m4 ; out0 -out1
|
||||
%else
|
||||
ITX_MUL2X_PACK 0, 4, 5, 6, 401, 4076 ; t0a t1a
|
||||
ITX_MUL2X_PACK 1, 4, 5, 6, 1931, 3612 ; t2a t3a
|
||||
ITX_MUL2X_PACK 2, 4, 5, 6, 3166, 2598 ; t4a t5a
|
||||
@ -738,11 +780,12 @@ cglobal iidentity_4x4_internal, 0, 5, 6, dst, stride, c, eob, tx2
|
||||
vpblendd m0, m0, m4, 0xcc ; out0 -out1
|
||||
shufps m4, m2, m1, q1032 ; t3 t7
|
||||
vpblendd m1, m2, m1, 0xcc ; t2 t6
|
||||
psubw m2, m1, m4 ; t2-t3 t6-t7
|
||||
paddw m1, m4 ; t2+t3 t6+t7
|
||||
psubsw m2, m1, m4 ; t2-t3 t6-t7
|
||||
paddsw m1, m4 ; t2+t3 t6+t7
|
||||
pmulhrsw m2, m5 ; out4 -out5
|
||||
pshufd m1, m1, q1032
|
||||
pmulhrsw m1, m5 ; out2 -out3
|
||||
%endif
|
||||
%endmacro
|
||||
|
||||
INIT_YMM avx2
|
||||
@ -790,7 +833,7 @@ cglobal iadst_4x8_internal, 0, 5, 7, dst, stride, c, eob, tx2
|
||||
pmulhrsw m0, m2
|
||||
pmulhrsw m1, m2
|
||||
call m(iadst_8x4_internal).main
|
||||
punpckhwd m3, m0, m2
|
||||
punpckhwd m3, m0, m1
|
||||
punpcklwd m0, m1
|
||||
punpckhwd m1, m0, m3
|
||||
punpcklwd m0, m3
|
||||
@ -800,7 +843,7 @@ cglobal iadst_4x8_internal, 0, 5, 7, dst, stride, c, eob, tx2
|
||||
vextracti128 xm3, m1, 1
|
||||
pshufd xm4, xm0, q1032
|
||||
pshufd xm5, xm1, q1032
|
||||
call .main
|
||||
call .main_pass2
|
||||
vpbroadcastd m4, [o(pw_2048)]
|
||||
vinserti128 m0, m0, xm2, 1
|
||||
vinserti128 m1, m1, xm3, 1
|
||||
@ -822,8 +865,12 @@ cglobal iadst_4x8_internal, 0, 5, 7, dst, stride, c, eob, tx2
|
||||
WRITE_4X8 0, 1
|
||||
RET
|
||||
ALIGN function_align
|
||||
.main:
|
||||
WRAP_XMM IADST8_1D_PACKED
|
||||
.main_pass1:
|
||||
WRAP_XMM IADST8_1D_PACKED 1
|
||||
ret
|
||||
ALIGN function_align
|
||||
.main_pass2:
|
||||
WRAP_XMM IADST8_1D_PACKED 2
|
||||
ret
|
||||
|
||||
INV_TXFM_4X8_FN flipadst, dct, 0
|
||||
@ -839,7 +886,7 @@ cglobal iflipadst_4x8_internal, 0, 5, 7, dst, stride, c, eob, tx2
|
||||
pmulhrsw m1, m2
|
||||
call m(iadst_8x4_internal).main
|
||||
punpcklwd m3, m1, m0
|
||||
punpckhwd m1, m2, m0
|
||||
punpckhwd m1, m0
|
||||
punpcklwd m0, m1, m3
|
||||
punpckhwd m1, m3
|
||||
jmp tx2q
|
||||
@ -848,7 +895,7 @@ cglobal iflipadst_4x8_internal, 0, 5, 7, dst, stride, c, eob, tx2
|
||||
vextracti128 xm3, m1, 1
|
||||
pshufd xm4, xm0, q1032
|
||||
pshufd xm5, xm1, q1032
|
||||
call m(iadst_4x8_internal).main
|
||||
call m(iadst_4x8_internal).main_pass2
|
||||
vpbroadcastd m5, [o(pw_2048)]
|
||||
vinserti128 m3, m3, xm1, 1
|
||||
vinserti128 m2, m2, xm0, 1
|
||||
@ -1099,8 +1146,13 @@ cglobal iadst_4x16_internal, 0, 5, 11, dst, stride, c, eob, tx2
|
||||
jmp tx2q
|
||||
.pass2:
|
||||
call .main
|
||||
pshufd m1, m1, q1032
|
||||
vpbroadcastd m5, [o(pw_2896x8)]
|
||||
paddsw m1, m2, m4
|
||||
psubsw m2, m4
|
||||
pmulhrsw m1, m5 ; -out7 out4 out6 -out5
|
||||
pmulhrsw m2, m5 ; out8 -out11 -out9 out10
|
||||
vpbroadcastd m5, [o(pw_2048)]
|
||||
pshufd m1, m1, q1032
|
||||
vpblendd m4, m1, m0, 0x33
|
||||
vpblendd m0, m0, m2, 0x33
|
||||
vpblendd m2, m2, m3, 0x33
|
||||
@ -1176,7 +1228,6 @@ ALIGN function_align
|
||||
vinserti128 m0, m0, xm2, 1 ; t1 t0 t9a t8a
|
||||
vperm2i128 m2, m4, m1, 0x31 ; t7a t6a t15 t14
|
||||
vinserti128 m4, m4, xm1, 1 ; t4a t5a t12 t13
|
||||
vpbroadcastd m5, [o(pw_2896x8)]
|
||||
pshufd m2, m2, q1032 ; t6a t7a t14 t15
|
||||
psubsw m1, m0, m3 ; t3a t2a t11 t10
|
||||
paddsw m0, m3 ; -out15 out0 out14 -out1
|
||||
@ -1184,10 +1235,21 @@ ALIGN function_align
|
||||
psubsw m4, m2 ; t6 t7 t14a t15a
|
||||
shufps m2, m1, m4, q1032 ; t2a t6 t10 t14a
|
||||
vpblendd m4, m4, m1, 0x33 ; t3a t7 t11 t15a
|
||||
paddw m1, m2, m4
|
||||
psubw m2, m4
|
||||
pmulhrsw m1, m5 ; -out7 out4 out6 -out5
|
||||
pmulhrsw m2, m5 ; out8 -out11 -out9 out10
|
||||
ret
|
||||
ALIGN function_align
|
||||
.main_pass1_end:
|
||||
vpbroadcastd m5, [o(pw_m2896_2896)]
|
||||
vpbroadcastd m6, [o(pw_2896_2896)]
|
||||
punpcklwd m1, m4, m2
|
||||
punpckhwd m4, m2
|
||||
pmaddwd m2, m5, m4
|
||||
pmaddwd m4, m6
|
||||
pmaddwd m5, m1
|
||||
pmaddwd m1, m6
|
||||
REPX {paddd x, m8}, m5, m1, m2, m4
|
||||
REPX {psrad x, 12}, m5, m2, m1, m4
|
||||
packssdw m2, m5 ; -out11 out8 out10 -out9
|
||||
packssdw m1, m4 ; -out7 out4 out6 -out5
|
||||
ret
|
||||
|
||||
INV_TXFM_4X16_FN flipadst, dct, 0
|
||||
@ -1214,8 +1276,13 @@ cglobal iflipadst_4x16_internal, 0, 5, 11, dst, stride, c, eob, tx2
|
||||
jmp tx2q
|
||||
.pass2:
|
||||
call m(iadst_4x16_internal).main
|
||||
pshufd m1, m1, q1032
|
||||
vpbroadcastd m5, [o(pw_2896x8)]
|
||||
paddsw m1, m2, m4
|
||||
psubsw m2, m4
|
||||
pmulhrsw m1, m5 ; -out7 out4 out6 -out5
|
||||
pmulhrsw m2, m5 ; out8 -out11 -out9 out10
|
||||
vpbroadcastd m6, [o(pw_2048)]
|
||||
pshufd m1, m1, q1032
|
||||
vpblendd m4, m0, m2, 0x33
|
||||
vpblendd m0, m0, m1, 0xcc
|
||||
vpblendd m1, m1, m3, 0xcc
|
||||
@ -1381,7 +1448,7 @@ cglobal iadst_8x4_internal, 0, 5, 7, dst, stride, c, eob, tx2
|
||||
pmulhrsw xm2, xm0, [cq+16*2]
|
||||
pmulhrsw xm4, xm0
|
||||
pmulhrsw xm5, xm0
|
||||
call m(iadst_4x8_internal).main
|
||||
call m(iadst_4x8_internal).main_pass1
|
||||
vinserti128 m0, m0, xm2, 1
|
||||
vinserti128 m1, m1, xm3, 1
|
||||
punpckhwd m2, m0, m1
|
||||
@ -1393,7 +1460,6 @@ cglobal iadst_8x4_internal, 0, 5, 7, dst, stride, c, eob, tx2
|
||||
jmp tx2q
|
||||
.pass2:
|
||||
call .main
|
||||
vpblendd m1, m1, m2, 0xcc
|
||||
.end:
|
||||
vpermq m0, m0, q3120
|
||||
vpermq m1, m1, q3120
|
||||
@ -1427,7 +1493,7 @@ cglobal iflipadst_8x4_internal, 0, 5, 7, dst, stride, c, eob, tx2
|
||||
pmulhrsw xm2, xm0, [cq+16*2]
|
||||
pmulhrsw xm4, xm0
|
||||
pmulhrsw xm5, xm0
|
||||
call m(iadst_4x8_internal).main
|
||||
call m(iadst_4x8_internal).main_pass1
|
||||
vinserti128 m3, m3, xm1, 1
|
||||
vinserti128 m2, m2, xm0, 1
|
||||
punpckhwd m1, m3, m2
|
||||
@ -1439,7 +1505,7 @@ cglobal iflipadst_8x4_internal, 0, 5, 7, dst, stride, c, eob, tx2
|
||||
jmp tx2q
|
||||
.pass2:
|
||||
call m(iadst_8x4_internal).main
|
||||
vpblendd m2, m2, m1, 0x33
|
||||
mova m2, m1
|
||||
vpermq m1, m0, q2031
|
||||
vpermq m0, m2, q2031
|
||||
jmp m(iadst_8x4_internal).end2
|
||||
@ -1580,7 +1646,7 @@ cglobal iadst_8x8_internal, 0, 5, 7, dst, stride, c, eob, tx2
|
||||
vpermq m3, [cq+32*3], q3120 ; 6 7
|
||||
vpermq m5, [cq+32*1], q1302 ; 3 2
|
||||
vpermq m2, [cq+32*2], q3120 ; 4 5
|
||||
call .main
|
||||
call .main_pass1
|
||||
vpbroadcastd m5, [o(pw_16384)]
|
||||
punpcklwd m4, m0, m1
|
||||
punpckhwd m0, m1
|
||||
@ -1604,7 +1670,7 @@ cglobal iadst_8x8_internal, 0, 5, 7, dst, stride, c, eob, tx2
|
||||
.pass2:
|
||||
pshufd m4, m0, q1032
|
||||
pshufd m5, m1, q1032
|
||||
call .main
|
||||
call .main_pass2
|
||||
vpbroadcastd m5, [o(pw_2048)]
|
||||
vpbroadcastd xm4, [o(pw_4096)]
|
||||
psubw m4, m5 ; lower half = 2048, upper half = -2048
|
||||
@ -1629,8 +1695,12 @@ cglobal iadst_8x8_internal, 0, 5, 7, dst, stride, c, eob, tx2
|
||||
WRITE_8X4 2, 3, 4, 5
|
||||
RET
|
||||
ALIGN function_align
|
||||
.main:
|
||||
IADST8_1D_PACKED
|
||||
.main_pass1:
|
||||
IADST8_1D_PACKED 1
|
||||
ret
|
||||
ALIGN function_align
|
||||
.main_pass2:
|
||||
IADST8_1D_PACKED 2
|
||||
ret
|
||||
|
||||
INV_TXFM_8X8_FN flipadst, dct
|
||||
@ -1643,7 +1713,7 @@ cglobal iflipadst_8x8_internal, 0, 5, 7, dst, stride, c, eob, tx2
|
||||
vpermq m3, [cq+32*3], q3120 ; 6 7
|
||||
vpermq m5, [cq+32*1], q1302 ; 3 2
|
||||
vpermq m2, [cq+32*2], q3120 ; 4 5
|
||||
call m(iadst_8x8_internal).main
|
||||
call m(iadst_8x8_internal).main_pass1
|
||||
vpbroadcastd m5, [o(pw_16384)]
|
||||
punpckhwd m4, m3, m2
|
||||
punpcklwd m3, m2
|
||||
@ -1667,7 +1737,7 @@ cglobal iflipadst_8x8_internal, 0, 5, 7, dst, stride, c, eob, tx2
|
||||
.pass2:
|
||||
pshufd m4, m0, q1032
|
||||
pshufd m5, m1, q1032
|
||||
call m(iadst_8x8_internal).main
|
||||
call m(iadst_8x8_internal).main_pass2
|
||||
vpbroadcastd m4, [o(pw_2048)]
|
||||
vpbroadcastd xm5, [o(pw_4096)]
|
||||
psubw m4, m5 ; lower half = -2048, upper half = 2048
|
||||
@ -1867,6 +1937,7 @@ INV_TXFM_8X16_FN adst, identity
|
||||
cglobal iadst_8x16_internal, 0, 5, 13, dst, stride, c, eob, tx2
|
||||
ITX_8X16_LOAD_COEFS
|
||||
call m(iadst_16x8_internal).main
|
||||
call m(iadst_16x8_internal).main_pass1_end
|
||||
vpbroadcastd m10, [o(pw_16384)]
|
||||
pslld m9, m10, 17
|
||||
psubw m10, m9 ; 16384, -16384
|
||||
@ -1874,6 +1945,7 @@ cglobal iadst_8x16_internal, 0, 5, 13, dst, stride, c, eob, tx2
|
||||
ALIGN function_align
|
||||
.pass2:
|
||||
call .main
|
||||
call .main_pass2_end
|
||||
vpbroadcastd m9, [o(pw_2048)]
|
||||
vpbroadcastd xm8, [o(pw_4096)]
|
||||
psubw m8, m9
|
||||
@ -1930,38 +2002,72 @@ ALIGN function_align
|
||||
paddsw m4, m6 ; t8a t9a
|
||||
vpbroadcastd m11, [o(pw_m3784_1567)]
|
||||
vpbroadcastd m12, [o(pw_1567_3784)]
|
||||
ITX_MUL2X_PACK 3, 6, _, 10, 11, 12, 4 ; t4a t5a
|
||||
ITX_MUL2X_PACK 3, 6, _, 10, 12, 11, 6 ; t5a t4a
|
||||
psubw m6, m9, m11 ; pw_3784_m1567
|
||||
ITX_MUL2X_PACK 8, 12, _, 10, 12, 6, 4 ; t6a t7a
|
||||
ITX_MUL2X_PACK 8, 6, _, 10, 6, 12, 6 ; t7a t6a
|
||||
vpbroadcastd m11, [o(pw_m1567_3784)]
|
||||
vpbroadcastd m12, [o(pw_3784_1567)]
|
||||
ITX_MUL2X_PACK 2, 6, _, 10, 11, 12, 4 ; t15 t14
|
||||
ITX_MUL2X_PACK 2, 6, _, 10, 11, 12, 6 ; t15 t14
|
||||
psubw m6, m9, m11 ; pw_1567_m3784
|
||||
ITX_MUL2X_PACK 5, 12, _, 10, 12, 6, 4 ; t13 t12
|
||||
vbroadcasti128 m11, [o(deint_shuf)]
|
||||
vpbroadcastd m12, [o(pw_2896x8)]
|
||||
psubsw m6, m0, m1 ; t3a t2a
|
||||
ITX_MUL2X_PACK 5, 12, _, 10, 12, 6, 6 ; t13 t12
|
||||
vbroadcasti128 m12, [o(deint_shuf)]
|
||||
paddsw m6, m4, m7 ; -out1 out14
|
||||
psubsw m4, m7 ; t10 t11
|
||||
psubsw m11, m3, m8 ; t7 t6
|
||||
paddsw m8, m3 ; out12 -out3
|
||||
psubsw m3, m0, m1 ; t3a t2a
|
||||
paddsw m0, m1 ; -out15 out0
|
||||
paddsw m1, m2, m5 ; -out13 out2
|
||||
psubsw m5, m2 ; t15a t14a
|
||||
paddsw m2, m4, m7 ; -out1 out14
|
||||
psubsw m4, m7 ; t10 t11
|
||||
psubsw m7, m3, m8 ; t6 t7
|
||||
paddsw m8, m3 ; -out3 out12
|
||||
REPX {pshufb x, m11}, m6, m4, m0, m2
|
||||
vpblendd m3, m6, m4, 0xcc ; t3a t11
|
||||
shufps m6, m6, m4, q1032 ; t2a t10
|
||||
vpblendd m4, m5, m7, 0xcc ; t15a t7
|
||||
shufps m5, m5, m7, q1032 ; t14a t6
|
||||
shufps m7, m2, m0, q1032 ; out14 -out15
|
||||
vpblendd m0, m0, m2, 0x33 ; -out1 out0
|
||||
paddw m2, m5, m4 ; -out5 out4
|
||||
psubw m5, m4 ; out10 -out11
|
||||
psubw m4, m6, m3 ; out8 -out9
|
||||
paddw m3, m6 ; -out7 out6
|
||||
shufps m6, m8, m1, q1032 ; out12 -out13
|
||||
vpblendd m1, m1, m8, 0x33 ; -out3 out2
|
||||
REPX {pmulhrsw x, m12}, m2, m3, m4, m5
|
||||
pshufb m0, m12
|
||||
pshufb m6, m12
|
||||
pshufb m8, m12
|
||||
pshufb m1, m12
|
||||
shufps m7, m6, m0, q1032 ; out14 -out15
|
||||
vpblendd m0, m6, 0x33 ; -out1 out0
|
||||
punpcklqdq m6, m8, m1 ; out12 -out13
|
||||
punpckhqdq m1, m8, m1 ; -out3 out2
|
||||
ret
|
||||
ALIGN function_align
|
||||
.main_pass1_end:
|
||||
vpbroadcastd m8, [o(pw_m2896_2896)]
|
||||
vpbroadcastd m12, [o(pw_2896_2896)]
|
||||
pmaddwd m9, m8, m11 ; -out11
|
||||
pmaddwd m2, m12, m5 ; -out5
|
||||
pmaddwd m5, m8 ; out10
|
||||
pmaddwd m11, m12 ; out4
|
||||
REPX {paddd x, m10}, m9, m5, m2, m11
|
||||
REPX {psrad x, 12 }, m9, m5, m2, m11
|
||||
packssdw m5, m9 ; out10 -out11
|
||||
packssdw m2, m11 ; -out5 out4
|
||||
pmaddwd m11, m8, m3 ; out8
|
||||
vpbroadcastd m8, [o(pw_2896_m2896)]
|
||||
pmaddwd m3, m12 ; -out7
|
||||
pmaddwd m8, m4 ; -out9
|
||||
pmaddwd m4, m12 ; out6
|
||||
REPX {paddd x, m10}, m11, m3, m8, m4
|
||||
REPX {psrad x, 12 }, m11, m3, m8, m4
|
||||
packssdw m3, m4 ; -out7 out6
|
||||
packssdw m4, m11, m8 ; out8 -out9
|
||||
vpbroadcastd m10, [o(pw_16384)]
|
||||
pxor m9, m9
|
||||
ret
|
||||
ALIGN function_align
|
||||
.main_pass2_end:
|
||||
vpbroadcastd m8, [o(pw_2896x8)]
|
||||
pshufb m2, m11, m12
|
||||
pshufb m5, m12
|
||||
pshufb m3, m12
|
||||
pshufb m4, m12
|
||||
punpcklqdq m11, m5, m2 ; t15a t7
|
||||
punpckhqdq m5, m2 ; t14a t6
|
||||
shufps m2, m3, m4, q1032 ; t2a t10
|
||||
vpblendd m3, m4, 0xcc ; t3a t11
|
||||
psubsw m4, m2, m3 ; out8 -out9
|
||||
paddsw m3, m2 ; -out7 out6
|
||||
paddsw m2, m5, m11 ; -out5 out4
|
||||
psubsw m5, m11 ; out10 -out11
|
||||
REPX {pmulhrsw x, m8}, m2, m3, m4, m5
|
||||
ret
|
||||
|
||||
INV_TXFM_8X16_FN flipadst, dct
|
||||
@ -1972,6 +2078,7 @@ INV_TXFM_8X16_FN flipadst, identity
|
||||
cglobal iflipadst_8x16_internal, 0, 5, 13, dst, stride, c, eob, tx2
|
||||
ITX_8X16_LOAD_COEFS
|
||||
call m(iadst_16x8_internal).main
|
||||
call m(iadst_16x8_internal).main_pass1_end
|
||||
vpbroadcastd m9, [o(pw_16384)]
|
||||
pslld m10, m9, 17
|
||||
psubw m10, m9 ; -16384, 16384
|
||||
@ -1990,6 +2097,7 @@ cglobal iflipadst_8x16_internal, 0, 5, 13, dst, stride, c, eob, tx2
|
||||
jmp m(idct_8x16_internal).pass1_end2
|
||||
.pass2:
|
||||
call m(iadst_8x16_internal).main
|
||||
call m(iadst_8x16_internal).main_pass2_end
|
||||
vpbroadcastd m8, [o(pw_2048)]
|
||||
vpbroadcastd xm9, [o(pw_4096)]
|
||||
psubw m8, m9
|
||||
@ -2232,7 +2340,7 @@ cglobal iadst_16x4_internal, 0, 5, 11, dst, stride, c, eob, tx2
|
||||
vpermq m1, [cq+32*1], q1230
|
||||
vpermq m2, [cq+32*2], q2103
|
||||
call m(iadst_4x16_internal).main2
|
||||
pshufd m2, m2, q1032
|
||||
call m(iadst_4x16_internal).main_pass1_end
|
||||
punpcklwd m4, m3, m1
|
||||
punpcklwd m5, m2, m0
|
||||
punpckhwd m0, m1
|
||||
@ -2276,20 +2384,26 @@ cglobal iadst_16x4_internal, 0, 5, 11, dst, stride, c, eob, tx2
|
||||
RET
|
||||
ALIGN function_align
|
||||
.main:
|
||||
vpbroadcastd m6, [o(pw_m3344_3344)]
|
||||
vpbroadcastd m7, [o(pw_3803_1321)]
|
||||
vpbroadcastd m8, [o(pw_m1321_2482)]
|
||||
vpbroadcastd m9, [o(pw_2482_3344)]
|
||||
punpcklwd m4, m2, m0 ; in2 in0 l
|
||||
psubw m6, m0, m2
|
||||
punpckhwd m2, m0 ; in2 in0 h
|
||||
paddw m6, m3 ; t2
|
||||
psrld m5, m6, 16
|
||||
pmaddwd m10, m6, m4 ; t2:02 l
|
||||
pmaddwd m6, m2 ; t2:02 h
|
||||
pmaddwd m0, m7, m4 ; t0:02 l
|
||||
pmaddwd m7, m2 ; t0:02 h
|
||||
pmaddwd m4, m8 ; t1:02 l
|
||||
pmaddwd m8, m2 ; t1:02 h
|
||||
punpckhwd m2, m3, m1 ; in3 in1 h
|
||||
punpcklwd m3, m1 ; in3 in1 l
|
||||
pmaddwd m1, m5, m2 ; t2:3 h
|
||||
pmaddwd m5, m3 ; t2:3 l
|
||||
paddd m6, m1
|
||||
vpbroadcastd m1, [o(pd_2048)]
|
||||
paddd m10, m5
|
||||
pmaddwd m5, m9, m3
|
||||
pmaddwd m9, m2
|
||||
paddd m0, m1
|
||||
@ -2299,6 +2413,8 @@ ALIGN function_align
|
||||
vpbroadcastd m9, [o(pw_m3803_3344)]
|
||||
pmaddwd m5, m9, m2
|
||||
pmaddwd m9, m3
|
||||
paddd m10, m1 ; t2 + 2048 l
|
||||
paddd m6, m1 ; t2 + 2048 h
|
||||
paddd m5, m1 ; t1:13 + 2048 h
|
||||
paddd m1, m9 ; t1:13 + 2048 l
|
||||
vpbroadcastd m9, [o(pw_m3803_m6688)]
|
||||
@ -2310,12 +2426,11 @@ ALIGN function_align
|
||||
paddd m4, m0
|
||||
paddd m2, m8 ; t0 + t1 - t3 + 2048 h
|
||||
paddd m3, m4 ; t0 + t1 - t3 + 2048 l
|
||||
REPX {psrad x, 12}, m0, m7, m5, m1, m2, m3
|
||||
REPX {psrad x, 12}, m10, m6, m0, m7, m5, m1, m2, m3
|
||||
packssdw m0, m7
|
||||
packssdw m1, m5
|
||||
packssdw m3, m2
|
||||
vpbroadcastd m2, [o(pw_3344x8)]
|
||||
pmulhrsw m2, m6
|
||||
packssdw m2, m10, m6
|
||||
ret
|
||||
|
||||
INV_TXFM_16X4_FN flipadst, dct
|
||||
@ -2329,7 +2444,7 @@ cglobal iflipadst_16x4_internal, 0, 5, 11, dst, stride, c, eob, tx2
|
||||
vpermq m1, [cq+32*1], q1230
|
||||
vpermq m2, [cq+32*2], q2103
|
||||
call m(iadst_4x16_internal).main2
|
||||
pshufd m2, m2, q1032
|
||||
call m(iadst_4x16_internal).main_pass1_end
|
||||
punpckhwd m4, m3, m2
|
||||
punpckhwd m5, m1, m0
|
||||
punpcklwd m0, m2
|
||||
@ -2552,7 +2667,7 @@ INV_TXFM_16X8_FN adst, identity
|
||||
cglobal iadst_16x8_internal, 0, 5, 13, dst, stride, c, eob, tx2
|
||||
ITX_16X8_LOAD_COEFS 1302
|
||||
call m(iadst_8x16_internal).main2
|
||||
vpbroadcastd m10, [o(pw_16384)]
|
||||
call m(iadst_8x16_internal).main_pass1_end
|
||||
psubw m11, m9, m10
|
||||
punpcklwd m8, m0, m2
|
||||
punpckhwd m0, m2
|
||||
@ -2567,7 +2682,7 @@ cglobal iadst_16x8_internal, 0, 5, 13, dst, stride, c, eob, tx2
|
||||
ALIGN function_align
|
||||
.pass2:
|
||||
call .main
|
||||
vpbroadcastd m9, [o(pw_2048)]
|
||||
call .main_pass2_end
|
||||
pxor m8, m8
|
||||
psubw m8, m9
|
||||
REPX {pmulhrsw x, m9}, m0, m2, m4, m6
|
||||
@ -2591,21 +2706,50 @@ ALIGN function_align
|
||||
ITX_MULSUB_2W 4, 8, 3, 9, 10, 3784, 1567 ; t6a, t7a
|
||||
psubsw m9, m6, m8 ; t7
|
||||
paddsw m6, m8 ; out6
|
||||
vpbroadcastd m8, [o(pw_2896x8)]
|
||||
psubsw m3, m7, m5 ; t3
|
||||
paddsw m7, m5 ; -out7
|
||||
psubsw m5, m0, m2 ; t2
|
||||
paddsw m0, m2 ; out0
|
||||
psubsw m2, m1, m4 ; t6
|
||||
paddsw m1, m4 ; -out1
|
||||
psubw m4, m5, m3
|
||||
paddw m3, m5
|
||||
psubw m5, m2, m9
|
||||
paddw m2, m9
|
||||
ret
|
||||
ALIGN function_align
|
||||
.main_pass1_end: ; pass-1 tail: builds out2/-out3/out4/-out5 from t2,t3,t6,t7 with 32-bit pmaddwd intermediates
|
||||
vpbroadcastd m11, [o(pw_m2896_2896)]
|
||||
vpbroadcastd m12, [o(pw_2896_2896)]
|
||||
punpckhwd m4, m3, m5 ; interleave t3/t2, high words
|
||||
punpcklwd m3, m5 ; interleave t3/t2, low words
|
||||
pmaddwd m5, m11, m4
|
||||
pmaddwd m4, m12
|
||||
pmaddwd m8, m11, m3
|
||||
pmaddwd m3, m12
|
||||
REPX {paddd x, m10}, m5, m4, m8, m3 ; m10 is the rounding addend, set by the caller (presumably pd_2048 - confirm)
|
||||
REPX {psrad x, 12 }, m5, m8, m4, m3
|
||||
packssdw m3, m4 ; -out3
|
||||
packssdw m4, m8, m5 ; out4
|
||||
punpcklwd m5, m9, m2 ; interleave t7/t6, low words
|
||||
punpckhwd m9, m2 ; interleave t7/t6, high words
|
||||
pmaddwd m2, m12, m5
|
||||
pmaddwd m5, m11
|
||||
pmaddwd m12, m9
|
||||
pmaddwd m11, m9
|
||||
REPX {paddd x, m10}, m2, m5, m12, m11
|
||||
REPX {psrad x, 12 }, m2, m12, m5, m11
|
||||
packssdw m2, m12 ; out2
|
||||
packssdw m5, m11 ; -out5
|
||||
ret
|
||||
ALIGN function_align
|
||||
.main_pass2_end: ; pass-2 tail: same four outputs as .main_pass1_end, but with saturating 16-bit add/sub and pmulhrsw
|
||||
vpbroadcastd m8, [o(pw_2896x8)]
|
||||
psubsw m4, m5, m3 ; t2 - t3
|
||||
paddsw m3, m5 ; t2 + t3
|
||||
psubsw m5, m2, m9 ; t6 - t7
|
||||
paddsw m2, m9 ; t6 + t7
|
||||
pmulhrsw m2, m8 ; out2
|
||||
pmulhrsw m3, m8 ; -out3
|
||||
pmulhrsw m4, m8 ; out4
|
||||
pmulhrsw m5, m8 ; -out5
|
||||
vpbroadcastd m9, [o(pw_2048)] ; leave the pass-2 scale factor in m9 for the caller
|
||||
ret
|
||||
|
||||
INV_TXFM_16X8_FN flipadst, dct
|
||||
@ -2616,7 +2760,7 @@ INV_TXFM_16X8_FN flipadst, identity
|
||||
cglobal iflipadst_16x8_internal, 0, 5, 13, dst, stride, c, eob, tx2
|
||||
ITX_16X8_LOAD_COEFS 1302
|
||||
call m(iadst_8x16_internal).main2
|
||||
vpbroadcastd m10, [o(pw_16384)]
|
||||
call m(iadst_8x16_internal).main_pass1_end
|
||||
psubw m9, m10
|
||||
punpcklwd m8, m6, m4
|
||||
punpckhwd m6, m4
|
||||
@ -2655,7 +2799,7 @@ cglobal iflipadst_16x8_internal, 0, 5, 13, dst, stride, c, eob, tx2
|
||||
jmp tx2q
|
||||
.pass2:
|
||||
call m(iadst_16x8_internal).main
|
||||
vpbroadcastd m9, [o(pw_2048)]
|
||||
call m(iadst_16x8_internal).main_pass2_end
|
||||
pxor m8, m8
|
||||
psubw m8, m9
|
||||
pmulhrsw m10, m7, m8
|
||||
@ -2986,8 +3130,12 @@ INV_TXFM_16X16_FN adst, flipadst
|
||||
cglobal iadst_16x16_internal, 0, 5, 16, 32*3, dst, stride, c, eob, tx2
|
||||
ITX_16X16_LOAD_COEFS
|
||||
call .main
|
||||
vpbroadcastd m1, [o(pw_8192)]
|
||||
REPX {pmulhrsw x, m1}, m0, m2, m4, m6, m8, m10, m12, m14
|
||||
call .main_pass1_end
|
||||
pmulhrsw m0, m1, [cq+32*0]
|
||||
pmulhrsw m2, m1, [cq+32*1]
|
||||
REPX {pmulhrsw x, m1}, m4, m6, m8, m10
|
||||
pmulhrsw m12, m1, [cq+32*2]
|
||||
pmulhrsw m14, m1, [cq+32*3]
|
||||
vextracti128 [rsp+16*5], m8, 1
|
||||
mova [rsp+16*1], xm8
|
||||
pxor m8, m8
|
||||
@ -2996,7 +3144,7 @@ cglobal iadst_16x16_internal, 0, 5, 16, 32*3, dst, stride, c, eob, tx2
|
||||
ALIGN function_align
|
||||
.pass2:
|
||||
call .main
|
||||
vpbroadcastd m1, [o(pw_2048)]
|
||||
call .main_pass2_end
|
||||
REPX {pmulhrsw x, m1}, m0, m2, m4, m6, m8, m10, m12, m14
|
||||
mova [rsp+32*0], m6
|
||||
pxor m6, m6
|
||||
@ -3081,16 +3229,73 @@ ALIGN function_align
|
||||
paddsw m0, m12 ; out0
|
||||
paddsw m12, m8, m5 ; out12
|
||||
psubsw m8, m5 ; t7
|
||||
paddw m5, m10, m11 ; -out5
|
||||
psubw m10, m11 ; out10
|
||||
psubw m11, m4, m8 ; -out11
|
||||
paddw m4, m8 ; out4
|
||||
psubw m8, m7, m9 ; out8
|
||||
paddw m7, m9 ; -out7
|
||||
psubw m9, m1, m6 ; -out9
|
||||
paddw m6, m1 ; out6
|
||||
ret
|
||||
ALIGN function_align
|
||||
.main_pass1_end: ; pass-1 tail: forms out4..out11 with 32-bit pmaddwd intermediates; returns with m1 = pw_8192
|
||||
mova [cq+32*0], m0 ; spill m0, m2, m12, m14 to the coefficient buffer: all four are reused as scratch below
|
||||
mova [cq+32*1], m2
|
||||
mova [cq+32*2], m12
|
||||
mova [cq+32*3], m14
|
||||
vpbroadcastd m14, [pw_m2896_2896] ; NOTE(review): these three loads do not go through o(...) unlike the rest of the file - confirm intentional
|
||||
vpbroadcastd m12, [pw_2896_2896]
|
||||
vpbroadcastd m2, [pd_2048] ; rounding addend applied before each >> 12
|
||||
punpcklwd m5, m11, m10
|
||||
punpckhwd m11, m10
|
||||
pmaddwd m10, m14, m5
|
||||
pmaddwd m0, m14, m11
|
||||
pmaddwd m5, m12
|
||||
pmaddwd m11, m12
|
||||
REPX {paddd x, m2}, m10, m0, m5, m11
|
||||
REPX {psrad x, 12}, m10, m0, m5, m11
|
||||
packssdw m10, m0 ; out10
|
||||
packssdw m5, m11 ; -out5
|
||||
punpcklwd m11, m8, m4
|
||||
punpckhwd m8, m4
|
||||
pmaddwd m4, m12, m11
|
||||
pmaddwd m0, m12, m8
|
||||
pmaddwd m11, m14
|
||||
pmaddwd m8, m14
|
||||
REPX {paddd x, m2}, m4, m0, m11, m8
|
||||
REPX {psrad x, 12}, m4, m0, m11, m8
|
||||
packssdw m4, m0 ; out4
|
||||
packssdw m11, m8 ; -out11
|
||||
punpcklwd m8, m9, m7
|
||||
punpckhwd m9, m7
|
||||
pmaddwd m7, m12, m8
|
||||
pmaddwd m0, m12, m9
|
||||
pmaddwd m8, m14
|
||||
pmaddwd m9, m14
|
||||
REPX {paddd x, m2}, m7, m0, m8, m9
|
||||
REPX {psrad x, 12}, m7, m0, m8, m9
|
||||
packssdw m7, m0 ; -out7
|
||||
packssdw m8, m9 ; out8
|
||||
punpckhwd m0, m6, m1
|
||||
punpcklwd m6, m1
|
||||
pmaddwd m1, m14, m0
|
||||
pmaddwd m9, m14, m6
|
||||
pmaddwd m0, m12
|
||||
pmaddwd m6, m12
|
||||
REPX {paddd x, m2}, m1, m9, m0, m6
|
||||
REPX {psrad x, 12}, m1, m9, m0, m6
|
||||
packssdw m9, m1 ; -out9
|
||||
packssdw m6, m0 ; out6
|
||||
vpbroadcastd m1, [o(pw_8192)] ; pass-1 scale factor; the caller multiplies by m1 and reloads the spilled rows from cq
|
||||
ret
|
||||
ALIGN function_align
|
||||
.main_pass2_end: ; pass-2 tail: forms out4..out11 with saturating 16-bit add/sub and pmulhrsw; returns with m1 = pw_2048
|
||||
; In pass 2 we're going to clip to pixels afterwards anyway, so clipping to
|
||||
; 16-bit here will produce the same result as using 32-bit intermediates.
|
||||
paddsw m5, m10, m11 ; -out5
|
||||
psubsw m10, m11 ; out10
|
||||
psubsw m11, m4, m8 ; -out11
|
||||
paddsw m4, m8 ; out4
|
||||
psubsw m8, m7, m9 ; out8
|
||||
paddsw m7, m9 ; -out7
|
||||
psubsw m9, m1, m6 ; -out9
|
||||
paddsw m6, m1 ; out6
|
||||
vpbroadcastd m1, [o(pw_2896x8)]
|
||||
REPX {pmulhrsw x, m1}, m4, m5, m6, m7, m8, m9, m10, m11
|
||||
vpbroadcastd m1, [o(pw_2048)] ; pass-2 scale factor left in m1 for the caller
|
||||
ret
|
||||
|
||||
INV_TXFM_16X16_FN flipadst, dct
|
||||
@ -3100,16 +3305,16 @@ INV_TXFM_16X16_FN flipadst, flipadst
|
||||
cglobal iflipadst_16x16_internal, 0, 5, 16, 32*3, dst, stride, c, eob, tx2
|
||||
ITX_16X16_LOAD_COEFS
|
||||
call m(iadst_16x16_internal).main
|
||||
vpbroadcastd m1, [o(pw_8192)]
|
||||
call m(iadst_16x16_internal).main_pass1_end
|
||||
pmulhrsw m6, m1
|
||||
pmulhrsw m2, m1, m8
|
||||
mova [rsp+32*2], m6
|
||||
pmulhrsw m6, m1, m4
|
||||
pmulhrsw m4, m1, m10
|
||||
pmulhrsw m10, m1, m12
|
||||
pmulhrsw m12, m1, m2
|
||||
pmulhrsw m2, m1, m8
|
||||
pmulhrsw m8, m1, m14
|
||||
pmulhrsw m14, m1, m0
|
||||
pmulhrsw m8, m1, [cq+32*3]
|
||||
pmulhrsw m10, m1, [cq+32*2]
|
||||
pmulhrsw m12, m1, [cq+32*1]
|
||||
pmulhrsw m14, m1, [cq+32*0]
|
||||
pxor m0, m0
|
||||
psubw m0, m1
|
||||
REPX {pmulhrsw x, m0}, m3, m5, m7, m11, m15
|
||||
@ -3136,7 +3341,7 @@ cglobal iflipadst_16x16_internal, 0, 5, 16, 32*3, dst, stride, c, eob, tx2
|
||||
jmp m(idct_16x16_internal).pass1_end3
|
||||
.pass2:
|
||||
call m(iadst_16x16_internal).main
|
||||
vpbroadcastd m1, [o(pw_2048)]
|
||||
call m(iadst_16x16_internal).main_pass2_end
|
||||
pmulhrsw m0, m1
|
||||
pmulhrsw m8, m1
|
||||
mova [rsp+32*0], m0
|
||||
|
638
third_party/dav1d/src/x86/itx_ssse3.asm
vendored
638
third_party/dav1d/src/x86/itx_ssse3.asm
vendored
@ -43,8 +43,11 @@ pw_1321_3803: times 4 dw 1321, 3803
|
||||
pw_2482_m1321: times 4 dw 2482, -1321
|
||||
pw_3344_2482: times 4 dw 3344, 2482
|
||||
pw_3344_m3803: times 4 dw 3344, -3803
|
||||
pw_3344_m3344: times 4 dw 3344, -3344
|
||||
pw_0_3344 times 4 dw 0, 3344
|
||||
pw_m6688_m3803: times 4 dw -6688, -3803
|
||||
|
||||
COEF_PAIR 2896, 2896
|
||||
COEF_PAIR 1567, 3784
|
||||
COEF_PAIR 799, 4017
|
||||
COEF_PAIR 3406, 2276
|
||||
@ -126,7 +129,6 @@ pw_2675x8: times 8 dw 2675*8
|
||||
pw_4085x8: times 8 dw 4085*8
|
||||
pw_m301x8: times 8 dw -301*8
|
||||
|
||||
|
||||
iadst4_dconly1a: times 2 dw 10568, 19856, 26752, 30424
|
||||
iadst4_dconly1b: times 2 dw 30424, 26752, 19856, 10568
|
||||
iadst4_dconly2a: dw 10568, 10568, 10568, 10568, 19856, 19856, 19856, 19856
|
||||
@ -200,7 +202,6 @@ SECTION .text
|
||||
ret
|
||||
%endmacro
|
||||
|
||||
|
||||
; flags: 1 = swap, 2: coef_regs
|
||||
%macro ITX_MUL2X_PACK 5-6 0 ; dst/src, tmp[1], rnd, coef[1-2], flags
|
||||
%if %6 & 2
|
||||
@ -239,35 +240,6 @@ SECTION .text
|
||||
paddsw m0, m2 ;high: out1 ;low: out0
|
||||
%endmacro
|
||||
|
||||
|
||||
%macro IADST4_1D_PACKED 0
|
||||
punpcklwd m2, m0, m1 ;unpacked in0 in2
|
||||
punpckhwd m3, m0, m1 ;unpacked in1 in3
|
||||
psubw m0, m1
|
||||
punpckhqdq m1, m1 ;
|
||||
paddw m1, m0 ;low: in0 - in2 + in3
|
||||
|
||||
pmaddwd m0, m2, [o(pw_1321_3803)] ;1321 * in0 + 3803 * in2
|
||||
pmaddwd m2, [o(pw_2482_m1321)] ;2482 * in0 - 1321 * in2
|
||||
pmaddwd m4, m3, [o(pw_3344_2482)] ;3344 * in1 + 2482 * in3
|
||||
pmaddwd m5, m3, [o(pw_3344_m3803)];3344 * in1 - 3803 * in3
|
||||
paddd m4, m0 ;t0 + t3
|
||||
pmaddwd m3, [o(pw_m6688_m3803)] ;-2 * 3344 * in1 - 3803 * in3
|
||||
pmulhrsw m1, [o(pw_3344x8)] ;low: out2
|
||||
mova m0, [o(pd_2048)]
|
||||
paddd m2, m0
|
||||
paddd m0, m4 ;t0 + t3 + 2048
|
||||
paddd m5, m2 ;t1 + t3 + 2048
|
||||
paddd m2, m4
|
||||
paddd m2, m3 ;t0 + t1 - t3 + 2048
|
||||
|
||||
psrad m0, 12 ;out0
|
||||
psrad m5, 12 ;out1
|
||||
psrad m2, 12 ;out3
|
||||
packssdw m0, m5 ;high: out1 ;low: out0
|
||||
packssdw m2, m2 ;high: out3 ;low: out3
|
||||
%endmacro
|
||||
|
||||
%macro INV_TXFM_FN 5+ ; type1, type2, fast_thresh, size, xmm/stack
|
||||
cglobal inv_txfm_add_%1_%2_%4, 4, 6, %5, dst, stride, coeff, eob, tx2
|
||||
%undef cmp
|
||||
@ -392,15 +364,14 @@ cglobal iadst_4x4_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
|
||||
mova m0, [coeffq+16*0]
|
||||
mova m1, [coeffq+16*1]
|
||||
call .main
|
||||
punpckhwd m3, m0, m2
|
||||
punpckhwd m2, m0, m1
|
||||
punpcklwd m0, m1
|
||||
punpckhwd m1, m0, m3 ;high: in3 ;low :in2
|
||||
punpcklwd m0, m3 ;high: in1 ;low: in0
|
||||
punpckhwd m1, m0, m2 ;high: in3 ;low :in2
|
||||
punpcklwd m0, m2 ;high: in1 ;low: in0
|
||||
jmp tx2q
|
||||
|
||||
.pass2:
|
||||
call .main
|
||||
punpcklqdq m1, m2 ;out2 out3
|
||||
|
||||
.end:
|
||||
pxor m2, m2
|
||||
@ -412,7 +383,28 @@ cglobal iadst_4x4_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
|
||||
|
||||
ALIGN function_align
|
||||
.main: ;4-point inverse ADST on packed inputs in m0/m1; returns out0/out1 in m0 and out2/out3 in m1
|
||||
IADST4_1D_PACKED
|
||||
punpcklwd m2, m0, m1 ;unpacked in0 in2
|
||||
punpckhwd m0, m1 ;unpacked in1 in3
|
||||
mova m3, m0 ;keep in1/in3 pairs, m0 is reused below
|
||||
pmaddwd m1, m2, [o(pw_3344_m3344)];3344 * in0 - 3344 * in2
|
||||
pmaddwd m0, [o(pw_0_3344)] ;3344 * in3
|
||||
paddd m1, m0 ;t2
|
||||
pmaddwd m0, m2, [o(pw_1321_3803)] ;1321 * in0 + 3803 * in2
|
||||
pmaddwd m2, [o(pw_2482_m1321)] ;2482 * in0 - 1321 * in2
|
||||
pmaddwd m4, m3, [o(pw_3344_2482)] ;3344 * in1 + 2482 * in3
|
||||
pmaddwd m5, m3, [o(pw_3344_m3803)];3344 * in1 - 3803 * in3
|
||||
paddd m4, m0 ;t0 + t3
|
||||
pmaddwd m3, [o(pw_m6688_m3803)] ;-2 * 3344 * in1 - 3803 * in3
|
||||
mova m0, [o(pd_2048)] ;rounding addend applied before the >> 12
|
||||
paddd m1, m0 ;t2 + 2048
|
||||
paddd m2, m0
|
||||
paddd m0, m4 ;t0 + t3 + 2048
|
||||
paddd m5, m2 ;t1 + t3 + 2048
|
||||
paddd m2, m4
|
||||
paddd m2, m3 ;t0 + t1 - t3 + 2048
|
||||
REPX {psrad x, 12}, m1, m0, m5, m2
|
||||
packssdw m0, m5 ;high: out1 ;low: out0
|
||||
packssdw m1, m2 ;high: out3 ;low: out2
|
||||
ret
|
||||
|
||||
INV_TXFM_4X4_FN flipadst, dct, 0
|
||||
@ -424,16 +416,14 @@ cglobal iflipadst_4x4_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
|
||||
mova m0, [coeffq+16*0]
|
||||
mova m1, [coeffq+16*1]
|
||||
call m(iadst_4x4_internal).main
|
||||
punpcklwd m1, m0
|
||||
punpckhwd m2, m0
|
||||
punpcklwd m0, m2, m1 ;high: in3 ;low :in2
|
||||
punpckhwd m2, m1 ;high: in1 ;low: in0
|
||||
mova m1, m2
|
||||
punpcklwd m2, m1, m0
|
||||
punpckhwd m1, m0
|
||||
punpcklwd m0, m1, m2 ;high: in3 ;low :in2
|
||||
punpckhwd m1, m2 ;high: in1 ;low: in0
|
||||
jmp tx2q
|
||||
|
||||
.pass2:
|
||||
call m(iadst_4x4_internal).main
|
||||
punpcklqdq m1, m2 ;out2 out3
|
||||
|
||||
.end:
|
||||
pxor m2, m2
|
||||
@ -584,99 +574,6 @@ cglobal inv_txfm_add_wht_wht_4x4, 3, 3, 4, dst, stride, coeff
|
||||
mova m%4, m%5
|
||||
%endmacro
|
||||
|
||||
%macro IADST4_1D 0
|
||||
mova m4, m2
|
||||
psubw m2, m0, m4
|
||||
paddw m2, m3 ;low: in0 - in2 + in3
|
||||
|
||||
punpckhwd m6, m0, m4 ;unpacked in0 in2
|
||||
punpckhwd m7, m1, m3 ;unpacked in1 in3
|
||||
punpcklwd m0, m4 ;unpacked in0 in2
|
||||
punpcklwd m1, m3 ;unpacked in1 in3
|
||||
|
||||
pmaddwd m4, m0, [o(pw_1321_3803)] ;1321 * in0 + 3803 * in2
|
||||
pmaddwd m0, [o(pw_2482_m1321)] ;2482 * in0 - 1321 * in2
|
||||
pmaddwd m3, m1, [o(pw_3344_2482)] ;3344 * in1 + 2482 * in3
|
||||
pmaddwd m5, m1, [o(pw_3344_m3803)] ;3344 * in1 - 3803 * in3
|
||||
paddd m3, m4 ;t0 + t3
|
||||
|
||||
pmaddwd m1, [o(pw_m6688_m3803)] ;-2 * 3344 * in1 - 3803 * in3
|
||||
pmulhrsw m2, [o(pw_3344x8)] ;out2
|
||||
mova m4, [o(pd_2048)]
|
||||
paddd m0, m4
|
||||
paddd m4, m3 ;t0 + t3 + 2048
|
||||
paddd m5, m0 ;t1 + t3 + 2048
|
||||
paddd m3, m0
|
||||
paddd m3, m1 ;t0 + t1 - t3 + 2048
|
||||
|
||||
psrad m4, 12 ;out0
|
||||
psrad m5, 12 ;out1
|
||||
psrad m3, 12 ;out3
|
||||
packssdw m0, m4, m5 ;low: out0 high: out1
|
||||
|
||||
pmaddwd m4, m6, [o(pw_1321_3803)] ;1321 * in0 + 3803 * in2
|
||||
pmaddwd m6, [o(pw_2482_m1321)] ;2482 * in0 - 1321 * in2
|
||||
pmaddwd m1, m7, [o(pw_3344_2482)] ;3344 * in1 + 2482 * in3
|
||||
pmaddwd m5, m7, [o(pw_3344_m3803)] ;3344 * in1 - 3803 * in3
|
||||
paddd m1, m4 ;t0 + t3
|
||||
pmaddwd m7, [o(pw_m6688_m3803)] ;-2 * 3344 * in1 - 3803 * in3
|
||||
|
||||
mova m4, [o(pd_2048)]
|
||||
paddd m6, m4
|
||||
paddd m4, m1 ;t0 + t3 + 2048
|
||||
paddd m5, m6 ;t1 + t3 + 2048
|
||||
paddd m1, m6
|
||||
paddd m1, m7 ;t0 + t1 - t3 + 2048
|
||||
|
||||
psrad m4, 12 ;out0
|
||||
psrad m5, 12 ;out1
|
||||
psrad m1, 12 ;out3
|
||||
packssdw m3, m1 ;out3
|
||||
packssdw m4, m5 ;low: out0 high: out1
|
||||
|
||||
punpckhqdq m1, m0, m4 ;out1
|
||||
punpcklqdq m0, m4 ;out0
|
||||
%endmacro
|
||||
|
||||
%macro IADST8_1D_PACKED 0
|
||||
mova m6, [o(pd_2048)]
|
||||
punpckhwd m4, m3, m0 ;unpacked in7 in0
|
||||
punpckhwd m5, m2, m1 ;unpacked in5 in2
|
||||
punpcklwd m1, m2 ;unpacked in3 in4
|
||||
punpcklwd m0, m3 ;unpacked in1 in6
|
||||
ITX_MUL2X_PACK 4, 2, 6, 401, 4076 ;low: t0a high: t1a
|
||||
ITX_MUL2X_PACK 5, 2, 6, 1931, 3612 ;low: t2a high: t3a
|
||||
ITX_MUL2X_PACK 1, 2, 6, 3166, 2598 ;low: t4a high: t5a
|
||||
ITX_MUL2X_PACK 0, 2, 6, 3920, 1189 ;low: t6a high: t7a
|
||||
|
||||
psubsw m3, m4, m1 ;low: t4 high: t5
|
||||
paddsw m4, m1 ;low: t0 high: t1
|
||||
psubsw m2, m5, m0 ;low: t6 high: t7
|
||||
paddsw m5, m0 ;low: t2 high: t3
|
||||
|
||||
shufps m1, m3, m2, q1032
|
||||
punpckhwd m2, m1
|
||||
punpcklwd m3, m1
|
||||
ITX_MUL2X_PACK 3, 0, 6, 1567, 3784, 1 ;low: t5a high: t4a
|
||||
ITX_MUL2X_PACK 2, 0, 6, 3784, 1567 ;low: t7a high: t6a
|
||||
|
||||
psubsw m1, m4, m5 ;low: t2 high: t3
|
||||
paddsw m4, m5 ;low: out0 high: -out7
|
||||
psubsw m5, m3, m2 ;low: t7 high: t6
|
||||
paddsw m3, m2 ;low: out6 high: -out1
|
||||
shufps m0, m4, m3, q3210 ;low: out0 high: -out1
|
||||
shufps m3, m4, q3210 ;low: out6 high: -out7
|
||||
|
||||
shufps m4, m1, m5, q1032 ;low: t3 high: t7
|
||||
shufps m1, m5, q3210 ;low: t2 high: t6
|
||||
mova m5, [o(pw_2896x8)]
|
||||
psubw m2, m1, m4 ;low: t2-t3 high: t6-t7
|
||||
paddw m1, m4 ;low: t2+t3 high: t6+t7
|
||||
pmulhrsw m2, m5 ;low: out4 high: -out5
|
||||
shufps m1, m1, q1032
|
||||
pmulhrsw m1, m5 ;low: out2 high: -out3
|
||||
%endmacro
|
||||
|
||||
%macro WRITE_4X8 4 ;row[1-4]
|
||||
WRITE_4X4 0, 1, 4, 5, 6, %1, %2, %3, %4
|
||||
lea dstq, [dstq+strideq*4]
|
||||
@ -838,7 +735,48 @@ cglobal iadst_4x8_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
|
||||
|
||||
ALIGN function_align
|
||||
.main:
|
||||
IADST8_1D_PACKED
|
||||
mova m6, [o(pd_2048)]
|
||||
punpckhwd m4, m3, m0 ;unpacked in7 in0
|
||||
punpckhwd m5, m2, m1 ;unpacked in5 in2
|
||||
punpcklwd m1, m2 ;unpacked in3 in4
|
||||
punpcklwd m0, m3 ;unpacked in1 in6
|
||||
ITX_MUL2X_PACK 4, 2, 6, 401, 4076 ;low: t0a high: t1a
|
||||
ITX_MUL2X_PACK 5, 2, 6, 1931, 3612 ;low: t2a high: t3a
|
||||
ITX_MUL2X_PACK 1, 2, 6, 3166, 2598 ;low: t4a high: t5a
|
||||
ITX_MUL2X_PACK 0, 2, 6, 3920, 1189 ;low: t6a high: t7a
|
||||
|
||||
psubsw m3, m4, m1 ;low: t4 high: t5
|
||||
paddsw m4, m1 ;low: t0 high: t1
|
||||
psubsw m2, m5, m0 ;low: t6 high: t7
|
||||
paddsw m5, m0 ;low: t2 high: t3
|
||||
|
||||
shufps m1, m3, m2, q1032
|
||||
punpckhwd m2, m1
|
||||
punpcklwd m3, m1
|
||||
ITX_MUL2X_PACK 3, 0, 6, 1567, 3784, 1 ;low: t5a high: t4a
|
||||
ITX_MUL2X_PACK 2, 0, 6, 3784, 1567 ;low: t7a high: t6a
|
||||
|
||||
psubsw m1, m4, m5 ;low: t2 high: t3
|
||||
paddsw m4, m5 ;low: out0 high: -out7
|
||||
psubsw m5, m3, m2 ;low: t7 high: t6
|
||||
paddsw m3, m2 ;low: out6 high: -out1
|
||||
shufps m0, m4, m3, q3210 ;low: out0 high: -out1
|
||||
shufps m3, m4, q3210 ;low: out6 high: -out7
|
||||
|
||||
mova m2, [o(pw_2896_m2896)]
|
||||
mova m7, [o(pw_2896_2896)]
|
||||
shufps m4, m1, m5, q1032 ;low: t3 high: t7
|
||||
shufps m1, m5, q3210 ;low: t2 high: t6
|
||||
punpcklwd m5, m1, m4
|
||||
punpckhwd m1, m4
|
||||
pmaddwd m4, m2, m1 ;-out5
|
||||
pmaddwd m2, m5 ; out4
|
||||
pmaddwd m1, m7 ; out2
|
||||
pmaddwd m5, m7 ;-out3
|
||||
REPX {paddd x, m6}, m4, m2, m1, m5
|
||||
REPX {psrad x, 12}, m4, m2, m1, m5
|
||||
packssdw m1, m5 ;low: out2 high: -out3
|
||||
packssdw m2, m4 ;low: out4 high: -out5
|
||||
ret
|
||||
|
||||
INV_TXFM_4X8_FN flipadst, dct, 0
|
||||
@ -1109,7 +1047,67 @@ cglobal iadst_8x4_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
|
||||
|
||||
ALIGN function_align
|
||||
.main:
|
||||
IADST4_1D
|
||||
punpckhwd m6, m0, m2 ;unpacked in0 in2
|
||||
punpcklwd m0, m2 ;unpacked in0 in2
|
||||
punpckhwd m7, m1, m3 ;unpacked in1 in3
|
||||
punpcklwd m1, m3 ;unpacked in1 in3
|
||||
|
||||
mova m2, [o(pw_3344_m3344)]
|
||||
mova m4, [o(pw_0_3344)]
|
||||
pmaddwd m3, m2, m6 ;3344 * in0 - 3344 * in2
|
||||
pmaddwd m5, m4, m7 ;3344 * in3
|
||||
pmaddwd m2, m0
|
||||
pmaddwd m4, m1
|
||||
paddd m3, m5
|
||||
paddd m2, m4
|
||||
mova m4, [o(pd_2048)]
|
||||
paddd m3, m4 ;t2 + 2048
|
||||
paddd m2, m4
|
||||
psrad m3, 12
|
||||
psrad m2, 12
|
||||
packssdw m2, m3 ;out2
|
||||
|
||||
pmaddwd m4, m0, [o(pw_1321_3803)] ;1321 * in0 + 3803 * in2
|
||||
pmaddwd m0, [o(pw_2482_m1321)] ;2482 * in0 - 1321 * in2
|
||||
pmaddwd m3, m1, [o(pw_3344_2482)] ;3344 * in1 + 2482 * in3
|
||||
pmaddwd m5, m1, [o(pw_3344_m3803)] ;3344 * in1 - 3803 * in3
|
||||
paddd m3, m4 ;t0 + t3
|
||||
|
||||
pmaddwd m1, [o(pw_m6688_m3803)] ;-2 * 3344 * in1 - 3803 * in3
|
||||
mova m4, [o(pd_2048)]
|
||||
paddd m0, m4
|
||||
paddd m4, m3 ;t0 + t3 + 2048
|
||||
paddd m5, m0 ;t1 + t3 + 2048
|
||||
paddd m3, m0
|
||||
paddd m3, m1 ;t0 + t1 - t3 + 2048
|
||||
|
||||
psrad m4, 12 ;out0
|
||||
psrad m5, 12 ;out1
|
||||
psrad m3, 12 ;out3
|
||||
packssdw m0, m4, m5 ;low: out0 high: out1
|
||||
|
||||
pmaddwd m4, m6, [o(pw_1321_3803)] ;1321 * in0 + 3803 * in2
|
||||
pmaddwd m6, [o(pw_2482_m1321)] ;2482 * in0 - 1321 * in2
|
||||
pmaddwd m1, m7, [o(pw_3344_2482)] ;3344 * in1 + 2482 * in3
|
||||
pmaddwd m5, m7, [o(pw_3344_m3803)] ;3344 * in1 - 3803 * in3
|
||||
paddd m1, m4 ;t0 + t3
|
||||
pmaddwd m7, [o(pw_m6688_m3803)] ;-2 * 3344 * in1 - 3803 * in3
|
||||
|
||||
mova m4, [o(pd_2048)]
|
||||
paddd m6, m4
|
||||
paddd m4, m1 ;t0 + t3 + 2048
|
||||
paddd m5, m6 ;t1 + t3 + 2048
|
||||
paddd m1, m6
|
||||
paddd m1, m7 ;t0 + t1 - t3 + 2048
|
||||
|
||||
psrad m4, 12 ;out0
|
||||
psrad m5, 12 ;out1
|
||||
psrad m1, 12 ;out3
|
||||
packssdw m3, m1 ;out3
|
||||
packssdw m4, m5 ;low: out0 high: out1
|
||||
|
||||
punpckhqdq m1, m0, m4 ;out1
|
||||
punpcklqdq m0, m4 ;out0
|
||||
ret
|
||||
|
||||
INV_TXFM_8X4_FN flipadst, dct
|
||||
@ -1423,6 +1421,7 @@ cglobal iadst_8x8_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
|
||||
|
||||
.pass1:
|
||||
call .main
|
||||
call .main_pass1_end
|
||||
|
||||
.pass1_end:
|
||||
mova m7, [o(pw_16384)]
|
||||
@ -1441,6 +1440,7 @@ ALIGN function_align
|
||||
|
||||
.pass2_main:
|
||||
call .main
|
||||
call .main_pass2_end
|
||||
|
||||
.end:
|
||||
mova m7, [o(pw_2048)]
|
||||
@ -1491,10 +1491,57 @@ ALIGN function_align
|
||||
psubsw m5, m6 ;t6
|
||||
paddsw m6, m2, m7 ;out6
|
||||
psubsw m2, m7 ;t7
|
||||
paddw m7, m4, m3 ;t2 + t3
|
||||
psubw m4, m3 ;t2 - t3
|
||||
paddw m3, m5, m2 ;t6 + t7
|
||||
psubw m5, m2 ;t6 - t7
|
||||
ret
|
||||
ALIGN function_align
|
||||
.main_pass1_end: ;pass-1 tail: builds out2/-out3/out4/-out5 from t2,t3,t6,t7 with 32-bit pmaddwd intermediates
|
||||
mova [rsp+gprsize*2+16*1], m1 ;spill m1 and m6, both are used as scratch below
|
||||
mova [rsp+gprsize*2+16*2], m6
|
||||
punpckhwd m1, m4, m3 ;interleave t2/t3, high words
|
||||
punpcklwd m4, m3 ;interleave t2/t3, low words
|
||||
punpckhwd m7, m5, m2 ;interleave t6/t7, high words
|
||||
punpcklwd m5, m2 ;interleave t6/t7, low words
|
||||
mova m2, [o(pw_2896_2896)]
|
||||
mova m6, [o(pd_2048)] ;rounding addend applied before each >> 12
|
||||
pmaddwd m3, m2, m7
|
||||
pmaddwd m2, m5
|
||||
paddd m3, m6
|
||||
paddd m2, m6
|
||||
psrad m3, 12
|
||||
psrad m2, 12
|
||||
packssdw m2, m3 ;out2
|
||||
mova m3, [o(pw_2896_m2896)]
|
||||
pmaddwd m7, m3
|
||||
pmaddwd m5, m3
|
||||
paddd m7, m6
|
||||
paddd m5, m6
|
||||
psrad m7, 12
|
||||
psrad m5, 12
|
||||
packssdw m5, m7 ;-out5
|
||||
mova m3, [o(pw_2896_2896)]
|
||||
pmaddwd m7, m3, m1
|
||||
pmaddwd m3, m4
|
||||
paddd m7, m6
|
||||
paddd m3, m6
|
||||
psrad m7, 12
|
||||
psrad m3, 12
|
||||
packssdw m3, m7 ;-out3
|
||||
mova m7, [o(pw_2896_m2896)]
|
||||
pmaddwd m1, m7
|
||||
pmaddwd m4, m7
|
||||
paddd m1, m6
|
||||
paddd m4, m6
|
||||
psrad m1, 12
|
||||
psrad m4, 12
|
||||
packssdw m4, m1 ;out4
|
||||
mova m1, [rsp+gprsize*2+16*1] ;restore the spilled registers
|
||||
mova m6, [rsp+gprsize*2+16*2]
|
||||
ret
|
||||
ALIGN function_align
|
||||
.main_pass2_end:
|
||||
paddsw m7, m4, m3 ;t2 + t3
|
||||
psubsw m4, m3 ;t2 - t3
|
||||
paddsw m3, m5, m2 ;t6 + t7
|
||||
psubsw m5, m2 ;t6 - t7
|
||||
mova m2, [o(pw_2896x8)]
|
||||
pmulhrsw m4, m2 ;out4
|
||||
pmulhrsw m5, m2 ;-out5
|
||||
@ -1513,6 +1560,7 @@ cglobal iflipadst_8x8_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
|
||||
|
||||
.pass1:
|
||||
call m(iadst_8x8_internal).main
|
||||
call m(iadst_8x8_internal).main_pass1_end
|
||||
|
||||
.pass1_end:
|
||||
mova m7, [o(pw_m16384)]
|
||||
@ -1542,6 +1590,7 @@ ALIGN function_align
|
||||
|
||||
.pass2_main:
|
||||
call m(iadst_8x8_internal).main
|
||||
call m(iadst_8x8_internal).main_pass2_end
|
||||
|
||||
.end:
|
||||
mova m7, [o(pw_2048)]
|
||||
@ -1753,6 +1802,7 @@ cglobal iadst_4x16_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
|
||||
|
||||
.pass2:
|
||||
call m(iadst_16x4_internal).main
|
||||
call m(iadst_16x4_internal).main_pass2_end
|
||||
|
||||
punpcklqdq m6, m5, m4 ;low: -out5 high: -out7
|
||||
punpckhqdq m4, m5 ;low: out8 high: out10
|
||||
@ -1820,6 +1870,7 @@ cglobal iflipadst_4x16_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
|
||||
|
||||
.pass2:
|
||||
call m(iadst_16x4_internal).main
|
||||
call m(iadst_16x4_internal).main_pass2_end
|
||||
|
||||
punpckhqdq m6, m5, m4 ;low: out5 high: out7
|
||||
punpcklqdq m4, m5 ;low: -out8 high: -out10
|
||||
@ -2160,6 +2211,7 @@ INV_TXFM_16X4_FN adst, identity
|
||||
cglobal iadst_16x4_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
|
||||
LOAD_7ROWS coeffq, 16
|
||||
call .main
|
||||
call .main_pass1_end
|
||||
|
||||
punpckhwd m6, m7, m0 ;packed -out11, -out15
|
||||
punpcklwd m0, m7 ;packed out0, out4
|
||||
@ -2193,92 +2245,137 @@ cglobal iadst_16x4_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
|
||||
ALIGN function_align
|
||||
.main:
|
||||
mova [coeffq+16*6], m0
|
||||
pshufd m1, m1, q1032
|
||||
pshufd m0, m1, q1032
|
||||
pshufd m2, m2, q1032
|
||||
punpckhwd m0, m6, m1 ;packed in13, in2
|
||||
punpcklwd m1, m6 ;packed in3, in12
|
||||
punpckhwd m6, m5, m2 ;packed in11, in4
|
||||
punpckhwd m1, m6, m0 ;packed in13, in2
|
||||
punpcklwd m0, m6 ;packed in3, in12
|
||||
punpckhwd m7, m5, m2 ;packed in11, in4
|
||||
punpcklwd m2, m5 ;packed in5, in10
|
||||
mova m7, [o(pd_2048)]
|
||||
ITX_MUL2X_PACK 0, 5, 7, 995, 3973 ;low:t2 high:t3
|
||||
ITX_MUL2X_PACK 6, 5, 7, 1751, 3703 ;low:t4 high:t5
|
||||
ITX_MUL2X_PACK 2, 5, 7, 3513, 2106 ;low:t10 high:t11
|
||||
ITX_MUL2X_PACK 1, 5, 7, 3857, 1380 ;low:t12 high:t13
|
||||
psubsw m5, m0, m2 ;low:t10a high:t11a
|
||||
paddsw m0, m2 ;low:t2a high:t3a
|
||||
psubsw m2, m6, m1 ;low:t12a high:t13a
|
||||
paddsw m6, m1 ;low:t4a high:t5a
|
||||
punpcklqdq m1, m5
|
||||
punpckhwd m1, m5 ;packed t10a, t11a
|
||||
mova m6, [o(pd_2048)]
|
||||
ITX_MUL2X_PACK 1, 5, 6, 995, 3973 ;low:t2 high:t3
|
||||
ITX_MUL2X_PACK 7, 5, 6, 1751, 3703 ;low:t4 high:t5
|
||||
ITX_MUL2X_PACK 2, 5, 6, 3513, 2106 ;low:t10 high:t11
|
||||
ITX_MUL2X_PACK 0, 5, 6, 3857, 1380 ;low:t12 high:t13
|
||||
psubsw m5, m1, m2 ;low:t10a high:t11a
|
||||
paddsw m1, m2 ;low:t2a high:t3a
|
||||
psubsw m2, m7, m0 ;low:t12a high:t13a
|
||||
paddsw m7, m0 ;low:t4a high:t5a
|
||||
punpcklqdq m0, m5
|
||||
punpckhwd m0, m5 ;packed t10a, t11a
|
||||
punpcklqdq m5, m2
|
||||
punpckhwd m2, m5 ;packed t13a, t12a
|
||||
ITX_MUL2X_PACK 1, 5, 7, 3406, 2276 ;low:t10 high:t11
|
||||
ITX_MUL2X_PACK 2, 5, 7, 4017, 799, 1 ;low:t12 high:t13
|
||||
mova [coeffq+16*4], m0
|
||||
mova [coeffq+16*5], m6
|
||||
mova m0, [coeffq+16*6]
|
||||
mova m6, [coeffq+16*7]
|
||||
pshufd m0, m0, q1032
|
||||
ITX_MUL2X_PACK 0, 5, 6, 3406, 2276 ;low:t10 high:t11
|
||||
ITX_MUL2X_PACK 2, 5, 6, 4017, 799, 1 ;low:t12 high:t13
|
||||
mova [coeffq+16*4], m1
|
||||
mova [coeffq+16*5], m7
|
||||
mova m1, [coeffq+16*6]
|
||||
mova m7, [coeffq+16*7]
|
||||
pshufd m1, m1, q1032
|
||||
pshufd m3, m3, q1032
|
||||
punpckhwd m5, m6, m0 ;packed in15, in0
|
||||
punpcklwd m0, m6 ;packed in1, in14
|
||||
punpckhwd m6, m4, m3 ;packed in9, in6
|
||||
punpckhwd m5, m7, m1 ;packed in15, in0
|
||||
punpcklwd m1, m7 ;packed in1, in14
|
||||
punpckhwd m7, m4, m3 ;packed in9, in6
|
||||
punpcklwd m3, m4 ;packed in7, in8
|
||||
ITX_MUL2X_PACK 5, 4, 7, 201, 4091 ;low:t0 high:t1
|
||||
ITX_MUL2X_PACK 6, 4, 7, 2440, 3290 ;low:t6 high:t7
|
||||
ITX_MUL2X_PACK 3, 4, 7, 3035, 2751 ;low:t8 high:t9
|
||||
ITX_MUL2X_PACK 0, 4, 7, 4052, 601 ;low:t14 high:t15
|
||||
ITX_MUL2X_PACK 5, 4, 6, 201, 4091 ;low:t0 high:t1
|
||||
ITX_MUL2X_PACK 7, 4, 6, 2440, 3290 ;low:t6 high:t7
|
||||
ITX_MUL2X_PACK 3, 4, 6, 3035, 2751 ;low:t8 high:t9
|
||||
ITX_MUL2X_PACK 1, 4, 6, 4052, 601 ;low:t14 high:t15
|
||||
psubsw m4, m5, m3 ;low:t8a high:t9a
|
||||
paddsw m5, m3 ;low:t0a high:t1a
|
||||
psubsw m3, m6, m0 ;low:t14a high:t15a
|
||||
paddsw m6, m0 ;low:t6a high:t7a
|
||||
punpcklqdq m0, m4
|
||||
punpckhwd m0, m4 ;packed t8a, t9a
|
||||
psubsw m3, m7, m1 ;low:t14a high:t15a
|
||||
paddsw m7, m1 ;low:t6a high:t7a
|
||||
punpcklqdq m1, m4
|
||||
punpckhwd m1, m4 ;packed t8a, t9a
|
||||
punpcklqdq m4, m3
|
||||
punpckhwd m3, m4 ;packed t15a, t14a
|
||||
ITX_MUL2X_PACK 0, 4, 7, 799, 4017 ;low:t8 high:t9
|
||||
ITX_MUL2X_PACK 3, 4, 7, 2276, 3406, 1 ;low:t14 high:t15
|
||||
psubsw m4, m0, m2 ;low:t12a high:t13a
|
||||
paddsw m0, m2 ;low:t8a high:t9a
|
||||
psubsw m2, m1, m3 ;low:t14a high:t15a
|
||||
paddsw m1, m3 ;low:t10a high:t11a
|
||||
punpcklqdq m3, m4
|
||||
punpckhwd m3, m4 ;packed t12a, t13a
|
||||
punpcklqdq m4, m2
|
||||
punpckhwd m2, m4 ;packed t15a, t14a
|
||||
ITX_MUL2X_PACK 3, 4, 7, 1567, 3784 ;low:t12 high:t13
|
||||
ITX_MUL2X_PACK 2, 4, 7, 3784, 1567, 1 ;low:t14 high:t15
|
||||
psubsw m4, m0, m1 ;low:t10 high:t11
|
||||
paddsw m0, m1 ;low:-out1 high:out14
|
||||
ITX_MUL2X_PACK 1, 4, 6, 799, 4017 ;low:t8 high:t9
|
||||
ITX_MUL2X_PACK 3, 4, 6, 2276, 3406, 1 ;low:t14 high:t15
|
||||
paddsw m4, m1, m2 ;low:t12a high:t13a
|
||||
psubsw m1, m2 ;low:t8a high:t9a
|
||||
psubsw m2, m0, m3 ;low:t14a high:t15a
|
||||
paddsw m0, m3 ;low:t10a high:t11a
|
||||
punpcklqdq m3, m1
|
||||
punpckhwd m3, m1 ;packed t12a, t13a
|
||||
punpcklqdq m1, m2
|
||||
punpckhwd m2, m1 ;packed t15a, t14a
|
||||
ITX_MUL2X_PACK 3, 1, 6, 1567, 3784 ;low:t12 high:t13
|
||||
ITX_MUL2X_PACK 2, 1, 6, 3784, 1567, 1 ;low:t14 high:t15
|
||||
psubsw m1, m3, m2 ;low:t14a high:t15a
|
||||
paddsw m3, m2 ;low:out2 high:-out13
|
||||
punpckhqdq m2, m4, m1 ;low:t11 high:t15a
|
||||
punpcklqdq m4, m1 ;low:t10 high:t14a
|
||||
psubw m1, m4, m2
|
||||
paddw m2, m4
|
||||
psubsw m2, m4, m0 ;low:t10 high:t11
|
||||
paddsw m0, m4 ;low:-out1 high:out14
|
||||
mova [coeffq+16*6], m0
|
||||
mova [coeffq+16*7], m3
|
||||
mova m0, [coeffq+16*4]
|
||||
mova m3, [coeffq+16*5]
|
||||
psubsw m4, m5, m3 ;low:t4 high:t5
|
||||
paddsw m5, m3 ;low:t0 high:t1
|
||||
psubsw m3, m0 ,m6 ;low:t6 high:t7
|
||||
paddsw m0, m6 ;low:t2 high:t3
|
||||
punpcklqdq m6, m4
|
||||
punpckhwd m6, m4 ;packed t4, t5
|
||||
psubsw m3, m0, m7 ;low:t6 high:t7
|
||||
paddsw m0, m7 ;low:t2 high:t3
|
||||
punpcklqdq m7, m4
|
||||
punpckhwd m7, m4 ;packed t4, t5
|
||||
punpcklqdq m4, m3
|
||||
punpckhwd m3, m4 ;packed t7, t6
|
||||
ITX_MUL2X_PACK 6, 4, 7, 1567, 3784 ;low:t4a high:t5a
|
||||
ITX_MUL2X_PACK 3, 4, 7, 3784, 1567, 1 ;low:t6a high:t7a
|
||||
ITX_MUL2X_PACK 7, 4, 6, 1567, 3784 ;low:t4a high:t5a
|
||||
ITX_MUL2X_PACK 3, 4, 6, 3784, 1567, 1 ;low:t6a high:t7a
|
||||
psubsw m4, m5, m0 ;low:t2a high:t3a
|
||||
paddsw m0, m5 ;low:out0 high:-out15
|
||||
psubsw m5, m6, m3 ;low:t6 high:t7
|
||||
paddsw m3, m6 ;low:-out3 high:out12
|
||||
psubsw m5, m7, m3 ;low:t6 high:t7
|
||||
paddsw m3, m7 ;low:-out3 high:out12
|
||||
ret
|
||||
ALIGN function_align
|
||||
.main_pass1_end: ;pass-1 tail: builds out4..out11 (packed two per register) with 32-bit pmaddwd intermediates
|
||||
mova m7, [o(deint_shuf1)]
|
||||
mova [coeffq+16*4], m0 ;spill m0 and m3, they hold the multiplier constants below
|
||||
mova [coeffq+16*5], m3
|
||||
mova m0, [o(pw_2896_m2896)]
|
||||
mova m3, [o(pw_2896_2896)]
|
||||
pshufb m1, m7 ;t14a t15a
|
||||
pshufb m2, m7 ;t10 t11
|
||||
pshufb m4, m7 ;t2a t3a
|
||||
pshufb m5, m7 ;t6 t7
|
||||
pmaddwd m7, m0, m2
|
||||
pmaddwd m2, m3
|
||||
paddd m7, m6 ;m6 is the rounding addend (presumably still pd_2048 from .main - confirm)
|
||||
paddd m2, m6
|
||||
psrad m7, 12
|
||||
psrad m2, 12
|
||||
packssdw m2, m7 ;low:out6 high:-out9
|
||||
pmaddwd m7, m0, m4
|
||||
pmaddwd m4, m3
|
||||
paddd m7, m6
|
||||
paddd m4, m6
|
||||
psrad m7, 12
|
||||
psrad m4, 12
|
||||
packssdw m4, m7 ;low:-out7 high:out8
|
||||
pmaddwd m7, m3, m5
|
||||
pmaddwd m5, m0
|
||||
paddd m7, m6
|
||||
paddd m5, m6
|
||||
psrad m7, 12
|
||||
psrad m5, 12
|
||||
packssdw m7, m5 ;low:out4 high:-out11
|
||||
pmaddwd m5, m3, m1
|
||||
pmaddwd m1, m0
|
||||
paddd m5, m6
|
||||
paddd m1, m6
|
||||
psrad m5, 12
|
||||
psrad m1, 12
|
||||
packssdw m5, m1 ;low:-out5 high:out10
|
||||
mova m0, [coeffq+16*4] ;restore the spilled registers
|
||||
mova m3, [coeffq+16*5]
|
||||
ret
|
||||
ALIGN function_align
|
||||
.main_pass2_end:
|
||||
mova m7, [o(pw_2896x8)]
|
||||
punpckhqdq m6, m2, m1 ;low:t11 high:t15a
|
||||
punpcklqdq m2, m1 ;low:t10 high:t14a
|
||||
psubsw m1, m2, m6
|
||||
paddsw m2, m6
|
||||
punpckhqdq m6, m4, m5 ;low:t3a high:t7
|
||||
punpcklqdq m4, m5 ;low:t2a high:t6
|
||||
psubw m5, m4, m6
|
||||
paddw m4, m6
|
||||
psubsw m5, m4, m6
|
||||
paddsw m4, m6
|
||||
pmulhrsw m1, m7 ;low:-out9 high:out10
|
||||
pmulhrsw m2, m7 ;low:out6 high:-out5
|
||||
pmulhrsw m5, m7 ;low:out8 high:-out11
|
||||
@ -2298,6 +2395,7 @@ INV_TXFM_16X4_FN flipadst, identity
|
||||
cglobal iflipadst_16x4_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
|
||||
LOAD_7ROWS coeffq, 16
|
||||
call m(iadst_16x4_internal).main
|
||||
call m(iadst_16x4_internal).main_pass1_end
|
||||
|
||||
punpcklwd m6, m7, m0 ;packed out11, out15
|
||||
punpckhwd m0, m7 ;packed -out0, -out4
|
||||
@ -2360,7 +2458,7 @@ cglobal iidentity_16x4_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
|
||||
%endmacro
|
||||
|
||||
%macro INV_TXFM_8X16_FN 2-3 -1 ; type1, type2, fast_thresh
|
||||
INV_TXFM_FN %1, %2, %3, 8x16, 8, 16*12
|
||||
INV_TXFM_FN %1, %2, %3, 8x16, 8, 16*16
|
||||
%ifidn %1_%2, dct_dct
|
||||
pshuflw m0, [coeffq], q0000
|
||||
punpcklwd m0, m0
|
||||
@ -2548,6 +2646,7 @@ cglobal iadst_8x16_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
|
||||
mova m7, [coeffq+16*11]
|
||||
|
||||
call m(iadst_16x8_internal).main
|
||||
call m(iadst_16x8_internal).main_pass2_end
|
||||
|
||||
mov r3, dstq
|
||||
lea dstq, [dstq+strideq*8]
|
||||
@ -2599,6 +2698,7 @@ cglobal iflipadst_8x16_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
|
||||
mova m7, [coeffq+16*11]
|
||||
|
||||
call m(iadst_16x8_internal).main
|
||||
call m(iadst_16x8_internal).main_pass2_end
|
||||
jmp m(iflipadst_8x8_internal).end
|
||||
|
||||
.end:
|
||||
@ -2652,7 +2752,7 @@ cglobal iidentity_8x16_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
|
||||
|
||||
|
||||
%macro INV_TXFM_16X8_FN 2-3 -1 ; type1, type2, fast_thresh
|
||||
INV_TXFM_FN %1, %2, %3, 16x8, 8, 16*12
|
||||
INV_TXFM_FN %1, %2, %3, 16x8, 8, 16*16
|
||||
%ifidn %1_%2, dct_dct
|
||||
movd m1, [o(pw_2896x8)]
|
||||
pmulhrsw m0, m1, [coeffq]
|
||||
@ -2893,6 +2993,7 @@ cglobal iadst_16x8_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
|
||||
pmulhrsw m7, [coeffq+16*13]
|
||||
|
||||
call .main
|
||||
call .main_pass1_end
|
||||
mov r3, tx2q
|
||||
lea tx2q, [o(m(iadst_16x8_internal).pass1_end)]
|
||||
jmp m(iadst_8x8_internal).pass1_end
|
||||
@ -2998,23 +3099,15 @@ ALIGN function_align
|
||||
mova [rsp+gprsize*2+16*6], m3 ;-out3
|
||||
psubsw m3, m0, m4 ;t7
|
||||
paddsw m0, m4 ;out12
|
||||
mova m7, [o(pw_2896x8)]
|
||||
psubw m4, m2, m3
|
||||
paddw m2, m3
|
||||
mova [rsp+gprsize*2+16*12], m3
|
||||
mova m3, [rsp+gprsize*2+16*7] ;t3
|
||||
pmulhrsw m4, m7 ;-out11
|
||||
pmulhrsw m2, m7 ;out4
|
||||
mova [rsp+gprsize*2+16*7], m2 ;out4
|
||||
mova [rsp+gprsize*2+16* 7], m2 ;out4
|
||||
psubsw m2, m5, m3 ;t3a
|
||||
paddsw m5, m3 ;-out15
|
||||
psubw m3, m1, m2
|
||||
paddw m1, m2
|
||||
mova [rsp+gprsize*2+16*11], m2
|
||||
mova m2, [rsp+gprsize*2+32*5] ;t15
|
||||
pmulhrsw m3, m7 ;out8
|
||||
pmulhrsw m1, m7 ;-out7
|
||||
mova [rsp+gprsize*2+32*5 ], m1 ;-out7
|
||||
mova [rsp+gprsize*2+16*10], m1 ;-out7
|
||||
mova m1, [rsp+gprsize*2+16*0] ;t11
|
||||
mova [rsp+gprsize*2+16*11], m3 ;out8
|
||||
mova [rsp+gprsize*2+16*0 ], m5 ;-out15
|
||||
mova m3, [rsp+gprsize*2+16*1] ;t10
|
||||
mova [rsp+gprsize*2+16*1 ], m4 ;-out11
|
||||
@ -3044,26 +3137,106 @@ ALIGN function_align
|
||||
paddsw m2, m6 ;-out1
|
||||
paddsw m6, m4, m1 ;out14
|
||||
psubsw m4, m1 ;t11
|
||||
psubw m1, m3, m4
|
||||
paddw m3, m4
|
||||
pmulhrsw m1, m7 ;-out9
|
||||
pmulhrsw m3, m7 ;out6
|
||||
mova [rsp+gprsize*2+16*4], m2 ;-out1
|
||||
mova [rsp+gprsize*2+16*14], m4
|
||||
mova [rsp+gprsize*2+16* 4], m2 ;-out1
|
||||
mova m4, [rsp+gprsize*2+16*8] ;t14
|
||||
mova m2, [rsp+gprsize*2+16*9] ;t15
|
||||
mova [rsp+gprsize*2+16*9], m3 ;out6
|
||||
mova [rsp+gprsize*2+16* 9], m3 ;out6
|
||||
psubsw m3, m0, m4 ;t14a
|
||||
paddsw m0, m4 ;out2
|
||||
psubsw m4, m5, m2 ;t15a
|
||||
paddsw m5, m2 ;-out13
|
||||
psubw m2, m3, m4
|
||||
paddw m3, m4
|
||||
mova [rsp+gprsize*2+16*5], m0 ;out2
|
||||
pmulhrsw m3, m7 ;-out5
|
||||
mova [rsp+gprsize*2+16* 5], m0 ;out2
|
||||
ret
|
||||
ALIGN function_align
|
||||
.main_pass1_end:
|
||||
mova m0, [rsp+gprsize*2+16*14]
|
||||
mova [rsp+gprsize*2+16*14], m5
|
||||
mova [rsp+gprsize*2+16*15], m6
|
||||
mova m5, [o(pw_2896_2896)]
|
||||
mova m6, [o(pw_2896_m2896)]
|
||||
mova m7, [o(pd_2048)]
|
||||
punpcklwd m2, m3, m4
|
||||
punpckhwd m3, m4
|
||||
pmaddwd m4, m5, m2
|
||||
pmaddwd m2, m6
|
||||
pmaddwd m1, m5, m3
|
||||
pmaddwd m3, m6
|
||||
REPX {paddd x, m7}, m4, m2, m1, m3
|
||||
REPX {psrad x, 12}, m4, m1, m2, m3
|
||||
packssdw m4, m1 ;-out5
|
||||
packssdw m2, m3 ;out10
|
||||
mova [rsp+gprsize*2+16* 8], m4
|
||||
mova m3, [rsp+gprsize*2+16* 9]
|
||||
punpcklwd m1, m3, m0
|
||||
punpckhwd m3, m0
|
||||
pmaddwd m0, m5, m1
|
||||
pmaddwd m1, m6
|
||||
pmaddwd m4, m5, m3
|
||||
pmaddwd m3, m6
|
||||
REPX {paddd x, m7}, m0, m1, m4, m3
|
||||
REPX {psrad x, 12}, m0, m4, m1, m3
|
||||
packssdw m0, m4 ;out6
|
||||
packssdw m1, m3 ;-out9
|
||||
mova [rsp+gprsize*2+16* 9], m0
|
||||
mova m0, [rsp+gprsize*2+16* 7]
|
||||
mova m4, [rsp+gprsize*2+16*12]
|
||||
punpcklwd m3, m0, m4
|
||||
punpckhwd m0, m4
|
||||
pmaddwd m4, m5, m3
|
||||
pmaddwd m3, m6
|
||||
pmaddwd m5, m0
|
||||
pmaddwd m0, m6
|
||||
REPX {paddd x, m7}, m4, m3, m5, m0
|
||||
REPX {psrad x, 12}, m4, m5, m3, m0
|
||||
packssdw m4, m5 ;out4
|
||||
packssdw m3, m0 ;-out11
|
||||
mova [rsp+gprsize*2+16* 7], m4
|
||||
mova m4, [rsp+gprsize*2+16*10]
|
||||
mova m5, [rsp+gprsize*2+16*11]
|
||||
punpcklwd m0, m4, m5
|
||||
punpckhwd m4, m5
|
||||
pmaddwd m5, m0, [o(pw_2896_2896)]
|
||||
pmaddwd m0, m6
|
||||
pmaddwd m6, m4
|
||||
pmaddwd m4, [o(pw_2896_2896)]
|
||||
REPX {paddd x, m7}, m5, m0, m6, m4
|
||||
REPX {psrad x, 12}, m0, m6, m5, m4
|
||||
packssdw m0, m6 ;out8
|
||||
packssdw m5, m4 ;-out7
|
||||
mova [rsp+gprsize*2+16*10], m5
|
||||
mova m4, [rsp+gprsize*2+16* 2] ;out12
|
||||
mova m5, [rsp+gprsize*2+16*14] ;-out13
|
||||
mova m6, [rsp+gprsize*2+16*15] ;out14
|
||||
ret
|
||||
ALIGN function_align
|
||||
.main_pass2_end:
|
||||
mova m7, [o(pw_2896x8)]
|
||||
mova m1, [rsp+gprsize*2+16* 9]
|
||||
mova m2, [rsp+gprsize*2+16*14]
|
||||
paddsw m0, m1, m2
|
||||
psubsw m1, m2
|
||||
pmulhrsw m0, m7 ;out6
|
||||
pmulhrsw m1, m7 ;-out9
|
||||
mova [rsp+gprsize*2+16* 9], m0
|
||||
psubsw m2, m3, m4
|
||||
paddsw m3, m4
|
||||
pmulhrsw m2, m7 ;out10
|
||||
mova [rsp+gprsize*2+16*8], m3 ;-out5
|
||||
mova m0, [rsp+gprsize*2+16*11] ;out8
|
||||
mova m3, [rsp+gprsize*2+16*1 ] ;-out11
|
||||
pmulhrsw m3, m7 ;-out5
|
||||
mova [rsp+gprsize*2+16* 8], m3
|
||||
mova m3, [rsp+gprsize*2+16* 7]
|
||||
mova m4, [rsp+gprsize*2+16*12]
|
||||
paddsw m0, m3, m4
|
||||
psubsw m3, m4
|
||||
pmulhrsw m0, m7 ;out4
|
||||
pmulhrsw m3, m7 ;-out11
|
||||
mova [rsp+gprsize*2+16* 7], m0
|
||||
mova m0, [rsp+gprsize*2+16*10]
|
||||
paddsw m4, m0, [rsp+gprsize*2+16*11]
|
||||
psubsw m0, [rsp+gprsize*2+16*11]
|
||||
pmulhrsw m4, m7 ;-out7
|
||||
pmulhrsw m0, m7 ;out8
|
||||
mova [rsp+gprsize*2+16*10], m4
|
||||
mova m4, [rsp+gprsize*2+16*2 ] ;out12
|
||||
ret
|
||||
|
||||
@ -3100,6 +3273,7 @@ cglobal iflipadst_16x8_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
|
||||
pmulhrsw m7, [coeffq+16*13]
|
||||
|
||||
call m(iadst_16x8_internal).main
|
||||
call m(iadst_16x8_internal).main_pass1_end
|
||||
|
||||
mova m7, [rsp+gprsize+16*0]
|
||||
SAVE_8ROWS coeffq+16*0, 32
|
||||
@ -3184,7 +3358,7 @@ cglobal iidentity_16x8_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
|
||||
|
||||
|
||||
%macro INV_TXFM_16X16_FN 2-3 -1 ; type1, type2, fast_thresh
|
||||
INV_TXFM_FN %1, %2, %3, 16x16, 8, 16*12
|
||||
INV_TXFM_FN %1, %2, %3, 16x16, 8, 16*16
|
||||
%ifidn %1_%2, dct_dct
|
||||
movd m1, [o(pw_2896x8)]
|
||||
pmulhrsw m0, m1, [coeffq]
|
||||
@ -3423,6 +3597,7 @@ INV_TXFM_16X16_FN adst, flipadst
|
||||
cglobal iadst_16x16_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
|
||||
ITX_16X16_ADST_LOAD_ODD_COEFS
|
||||
call m(iadst_16x8_internal).main
|
||||
call m(iadst_16x8_internal).main_pass1_end
|
||||
|
||||
mov r3, tx2q
|
||||
lea tx2q, [o(m(iadst_16x16_internal).pass1_end)]
|
||||
@ -3441,6 +3616,7 @@ cglobal iadst_16x16_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
|
||||
SAVE_8ROWS coeffq+16*1, 32
|
||||
ITX_16X16_ADST_LOAD_EVEN_COEFS
|
||||
call m(iadst_16x8_internal).main
|
||||
call m(iadst_16x8_internal).main_pass1_end
|
||||
|
||||
lea tx2q, [o(m(iadst_16x16_internal).pass1_end2)]
|
||||
mova m7, [o(pw_8192)]
|
||||
@ -3496,6 +3672,7 @@ INV_TXFM_16X16_FN flipadst, flipadst
|
||||
cglobal iflipadst_16x16_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
|
||||
ITX_16X16_ADST_LOAD_ODD_COEFS
|
||||
call m(iadst_16x8_internal).main
|
||||
call m(iadst_16x8_internal).main_pass1_end
|
||||
|
||||
mov r3, tx2q
|
||||
lea tx2q, [o(m(iflipadst_16x16_internal).pass1_end)]
|
||||
@ -3514,6 +3691,7 @@ cglobal iflipadst_16x16_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
|
||||
SAVE_8ROWS coeffq+16*17, 32
|
||||
ITX_16X16_ADST_LOAD_EVEN_COEFS
|
||||
call m(iadst_16x8_internal).main
|
||||
call m(iadst_16x8_internal).main_pass1_end
|
||||
|
||||
mova m7, [rsp+gprsize+16*0]
|
||||
SAVE_8ROWS coeffq+16*0, 32
|
||||
|
21
third_party/dav1d/src/x86/loopfilter_init_tmpl.c
vendored
21
third_party/dav1d/src/x86/loopfilter_init_tmpl.c
vendored
@ -28,14 +28,27 @@
|
||||
#include "src/cpu.h"
|
||||
#include "src/loopfilter.h"
|
||||
|
||||
decl_loopfilter_sb_fn(dav1d_lpf_h_sb_y_avx2);
|
||||
decl_loopfilter_sb_fn(dav1d_lpf_v_sb_y_avx2);
|
||||
decl_loopfilter_sb_fn(dav1d_lpf_h_sb_uv_avx2);
|
||||
decl_loopfilter_sb_fn(dav1d_lpf_v_sb_uv_avx2);
|
||||
#define decl_loopfilter_sb_fns(ext) \
|
||||
decl_loopfilter_sb_fn(dav1d_lpf_h_sb_y_##ext); \
|
||||
decl_loopfilter_sb_fn(dav1d_lpf_v_sb_y_##ext); \
|
||||
decl_loopfilter_sb_fn(dav1d_lpf_h_sb_uv_##ext); \
|
||||
decl_loopfilter_sb_fn(dav1d_lpf_v_sb_uv_##ext)
|
||||
|
||||
decl_loopfilter_sb_fns(ssse3);
|
||||
decl_loopfilter_sb_fns(avx2);
|
||||
|
||||
COLD void bitfn(dav1d_loop_filter_dsp_init_x86)(Dav1dLoopFilterDSPContext *const c) {
|
||||
const unsigned flags = dav1d_get_cpu_flags();
|
||||
|
||||
if (!(flags & DAV1D_X86_CPU_FLAG_SSSE3)) return;
|
||||
|
||||
#if BITDEPTH == 8
|
||||
c->loop_filter_sb[0][0] = dav1d_lpf_h_sb_y_ssse3;
|
||||
c->loop_filter_sb[0][1] = dav1d_lpf_v_sb_y_ssse3;
|
||||
c->loop_filter_sb[1][0] = dav1d_lpf_h_sb_uv_ssse3;
|
||||
c->loop_filter_sb[1][1] = dav1d_lpf_v_sb_uv_ssse3;
|
||||
#endif
|
||||
|
||||
if (!(flags & DAV1D_X86_CPU_FLAG_AVX2)) return;
|
||||
|
||||
#if BITDEPTH == 8 && ARCH_X86_64
|
||||
|
2348
third_party/dav1d/src/x86/loopfilter_ssse3.asm
vendored
Normal file
2348
third_party/dav1d/src/x86/loopfilter_ssse3.asm
vendored
Normal file
File diff suppressed because it is too large
Load Diff
30
third_party/dav1d/src/x86/mc.asm
vendored
30
third_party/dav1d/src/x86/mc.asm
vendored
@ -170,8 +170,6 @@ cglobal put_bilin, 4, 8, 0, dst, ds, src, ss, w, h, mxy
|
||||
.put:
|
||||
movzx wd, word [t2+wq*2+table_offset(put,)]
|
||||
add wq, t2
|
||||
lea t1, [ssq*3]
|
||||
lea t2, [dsq*3]
|
||||
jmp wq
|
||||
.put_w2:
|
||||
movzx t0d, word [srcq+ssq*0]
|
||||
@ -194,11 +192,11 @@ cglobal put_bilin, 4, 8, 0, dst, ds, src, ss, w, h, mxy
|
||||
jg .put_w4
|
||||
RET
|
||||
.put_w8:
|
||||
movq m0, [srcq+ssq*0]
|
||||
movq m1, [srcq+ssq*1]
|
||||
mov t0, [srcq+ssq*0]
|
||||
mov t1, [srcq+ssq*1]
|
||||
lea srcq, [srcq+ssq*2]
|
||||
movq [dstq+dsq*0], m0
|
||||
movq [dstq+dsq*1], m1
|
||||
mov [dstq+dsq*0], t0
|
||||
mov [dstq+dsq*1], t1
|
||||
lea dstq, [dstq+dsq*2]
|
||||
sub hd, 2
|
||||
jg .put_w8
|
||||
@ -206,30 +204,22 @@ cglobal put_bilin, 4, 8, 0, dst, ds, src, ss, w, h, mxy
|
||||
.put_w16:
|
||||
movu m0, [srcq+ssq*0]
|
||||
movu m1, [srcq+ssq*1]
|
||||
movu m2, [srcq+ssq*2]
|
||||
movu m3, [srcq+t1 ]
|
||||
lea srcq, [srcq+ssq*4]
|
||||
lea srcq, [srcq+ssq*2]
|
||||
mova [dstq+dsq*0], m0
|
||||
mova [dstq+dsq*1], m1
|
||||
mova [dstq+dsq*2], m2
|
||||
mova [dstq+t2 ], m3
|
||||
lea dstq, [dstq+dsq*4]
|
||||
sub hd, 4
|
||||
lea dstq, [dstq+dsq*2]
|
||||
sub hd, 2
|
||||
jg .put_w16
|
||||
RET
|
||||
INIT_YMM avx2
|
||||
.put_w32:
|
||||
movu m0, [srcq+ssq*0]
|
||||
movu m1, [srcq+ssq*1]
|
||||
movu m2, [srcq+ssq*2]
|
||||
movu m3, [srcq+t1 ]
|
||||
lea srcq, [srcq+ssq*4]
|
||||
lea srcq, [srcq+ssq*2]
|
||||
mova [dstq+dsq*0], m0
|
||||
mova [dstq+dsq*1], m1
|
||||
mova [dstq+dsq*2], m2
|
||||
mova [dstq+t2 ], m3
|
||||
lea dstq, [dstq+dsq*4]
|
||||
sub hd, 4
|
||||
lea dstq, [dstq+dsq*2]
|
||||
sub hd, 2
|
||||
jg .put_w32
|
||||
RET
|
||||
.put_w64:
|
||||
|
15
third_party/dav1d/src/x86/mc_ssse3.asm
vendored
15
third_party/dav1d/src/x86/mc_ssse3.asm
vendored
@ -177,7 +177,6 @@ cglobal put_bilin, 4, 8, 0, dst, ds, src, ss, w, h, mxy, bak
|
||||
.put:
|
||||
movzx wd, word [t0+wq*2+table_offset(put,)]
|
||||
add wq, t0
|
||||
lea r6, [ssq*3]
|
||||
RESTORE_DSQ_32 t0
|
||||
jmp wq
|
||||
.put_w2:
|
||||
@ -211,20 +210,14 @@ cglobal put_bilin, 4, 8, 0, dst, ds, src, ss, w, h, mxy, bak
|
||||
jg .put_w8
|
||||
RET
|
||||
.put_w16:
|
||||
lea r4, [dsq*3]
|
||||
.put_w16_in:
|
||||
movu m0, [srcq+ssq*0]
|
||||
movu m1, [srcq+ssq*1]
|
||||
movu m2, [srcq+ssq*2]
|
||||
movu m3, [srcq+r6 ]
|
||||
lea srcq, [srcq+ssq*4]
|
||||
lea srcq, [srcq+ssq*2]
|
||||
mova [dstq+dsq*0], m0
|
||||
mova [dstq+dsq*1], m1
|
||||
mova [dstq+dsq*2], m2
|
||||
mova [dstq+r4 ], m3
|
||||
lea dstq, [dstq+dsq*4]
|
||||
sub hd, 4
|
||||
jg .put_w16_in
|
||||
lea dstq, [dstq+dsq*2]
|
||||
sub hd, 2
|
||||
jg .put_w16
|
||||
RET
|
||||
.put_w32:
|
||||
movu m0, [srcq+ssq*0+16*0]
|
||||
|
206
third_party/dav1d/src/x86/msac.asm
vendored
206
third_party/dav1d/src/x86/msac.asm
vendored
@ -27,7 +27,7 @@
|
||||
|
||||
SECTION_RODATA 64 ; avoids cacheline splits
|
||||
|
||||
dw 60, 56, 52, 48, 44, 40, 36, 32, 28, 24, 20, 16, 12, 8, 4, 0
|
||||
min_prob: dw 60, 56, 52, 48, 44, 40, 36, 32, 28, 24, 20, 16, 12, 8, 4, 0
|
||||
pw_0xff00: times 8 dw 0xff00
|
||||
pw_32: times 8 dw 32
|
||||
|
||||
@ -35,21 +35,24 @@ pw_32: times 8 dw 32
|
||||
%define resp resq
|
||||
%define movp movq
|
||||
%define c_shuf q3333
|
||||
%define DECODE_SYMBOL_ADAPT_INIT
|
||||
%macro DECODE_SYMBOL_ADAPT_INIT 0-1
|
||||
%endmacro
|
||||
%else
|
||||
%define resp resd
|
||||
%define movp movd
|
||||
%define c_shuf q1111
|
||||
%macro DECODE_SYMBOL_ADAPT_INIT 0
|
||||
%macro DECODE_SYMBOL_ADAPT_INIT 0-1 0 ; hi_tok
|
||||
mov t0, r0m
|
||||
mov t1, r1m
|
||||
%if %1 == 0
|
||||
mov t2, r2m
|
||||
%endif
|
||||
%if STACK_ALIGNMENT >= 16
|
||||
sub esp, 40
|
||||
sub esp, 40-%1*4
|
||||
%else
|
||||
mov eax, esp
|
||||
and esp, ~15
|
||||
sub esp, 40
|
||||
sub esp, 40-%1*4
|
||||
mov [esp], eax
|
||||
%endif
|
||||
%endmacro
|
||||
@ -69,13 +72,13 @@ endstruc
|
||||
SECTION .text
|
||||
|
||||
%if WIN64
|
||||
DECLARE_REG_TMP 0, 1, 2, 3, 4, 5, 7, 3
|
||||
%define buf rsp+8 ; shadow space
|
||||
DECLARE_REG_TMP 0, 1, 2, 3, 4, 5, 7, 3, 8
|
||||
%define buf rsp+stack_offset+8 ; shadow space
|
||||
%elif UNIX64
|
||||
DECLARE_REG_TMP 0, 1, 2, 3, 4, 5, 7, 0
|
||||
DECLARE_REG_TMP 0, 1, 2, 3, 4, 5, 7, 0, 8
|
||||
%define buf rsp-40 ; red zone
|
||||
%else
|
||||
DECLARE_REG_TMP 2, 3, 4, 1, 5, 6, 5, 2
|
||||
DECLARE_REG_TMP 2, 3, 4, 1, 5, 6, 5, 2, 3
|
||||
%define buf esp+8
|
||||
%endif
|
||||
|
||||
@ -88,7 +91,7 @@ cglobal msac_decode_symbol_adapt4, 0, 6, 6
|
||||
movp m3, [t0+msac.dif]
|
||||
mov t3d, [t0+msac.update_cdf]
|
||||
mov t4d, t2d
|
||||
neg t2
|
||||
not t2 ; -(n_symbols + 1)
|
||||
pshuflw m2, m2, q0000
|
||||
movd [buf+12], m2
|
||||
pand m2, [rax]
|
||||
@ -112,15 +115,15 @@ cglobal msac_decode_symbol_adapt4, 0, 6, 6
|
||||
pcmpeqw m2, m2
|
||||
mov t2d, t3d
|
||||
shr t3d, 4
|
||||
cmp t4d, 4
|
||||
sbb t3d, -5 ; (count >> 4) + (n_symbols > 3) + 4
|
||||
cmp t4d, 3
|
||||
sbb t3d, -5 ; (count >> 4) + (n_symbols > 2) + 4
|
||||
cmp t2d, 32
|
||||
adc t2d, 0 ; count + (count < 32)
|
||||
movd m3, t3d
|
||||
pavgw m2, m1 ; i >= val ? -1 : 32768
|
||||
psubw m2, m0 ; for (i = 0; i < val; i++)
|
||||
psubw m0, m1 ; cdf[i] += (32768 - cdf[i]) >> rate;
|
||||
psraw m2, m3 ; for (; i < n_symbols - 1; i++)
|
||||
psraw m2, m3 ; for (; i < n_symbols; i++)
|
||||
paddw m0, m2 ; cdf[i] += (( -1 - cdf[i]) >> rate) + 1;
|
||||
movq [t1], m0
|
||||
mov [t1+t4*2], t2w
|
||||
@ -214,11 +217,11 @@ cglobal msac_decode_symbol_adapt8, 0, 6, 6
|
||||
DECODE_SYMBOL_ADAPT_INIT
|
||||
LEA rax, pw_0xff00
|
||||
movd m2, [t0+msac.rng]
|
||||
movu m1, [t1]
|
||||
mova m1, [t1]
|
||||
movp m3, [t0+msac.dif]
|
||||
mov t3d, [t0+msac.update_cdf]
|
||||
mov t4d, t2d
|
||||
neg t2
|
||||
not t2
|
||||
pshuflw m2, m2, q0000
|
||||
movd [buf+12], m2
|
||||
punpcklqdq m2, m2
|
||||
@ -242,7 +245,7 @@ cglobal msac_decode_symbol_adapt8, 0, 6, 6
|
||||
pcmpeqw m2, m2
|
||||
mov t2d, t3d
|
||||
shr t3d, 4
|
||||
cmp t4d, 4 ; may be called with n_symbols < 4
|
||||
cmp t4d, 3 ; may be called with n_symbols <= 2
|
||||
sbb t3d, -5
|
||||
cmp t2d, 32
|
||||
adc t2d, 0
|
||||
@ -252,7 +255,7 @@ cglobal msac_decode_symbol_adapt8, 0, 6, 6
|
||||
psubw m0, m1
|
||||
psraw m2, m3
|
||||
paddw m0, m2
|
||||
movu [t1], m0
|
||||
mova [t1], m0
|
||||
mov [t1+t4*2], t2w
|
||||
jmp m(msac_decode_symbol_adapt4).renorm
|
||||
|
||||
@ -260,12 +263,12 @@ cglobal msac_decode_symbol_adapt16, 0, 6, 6
|
||||
DECODE_SYMBOL_ADAPT_INIT
|
||||
LEA rax, pw_0xff00
|
||||
movd m4, [t0+msac.rng]
|
||||
movu m2, [t1]
|
||||
movu m3, [t1+16]
|
||||
mova m2, [t1]
|
||||
mova m3, [t1+16]
|
||||
movp m5, [t0+msac.dif]
|
||||
mov t3d, [t0+msac.update_cdf]
|
||||
mov t4d, t2d
|
||||
neg t2
|
||||
not t2
|
||||
%if WIN64
|
||||
sub rsp, 48 ; need 36 bytes, shadow space is only 32
|
||||
%endif
|
||||
@ -288,8 +291,8 @@ cglobal msac_decode_symbol_adapt16, 0, 6, 6
|
||||
punpcklqdq m5, m5
|
||||
paddw m3, m4
|
||||
mova [buf], m2
|
||||
mova [buf+16], m3
|
||||
psubusw m2, m5
|
||||
mova [buf+16], m3
|
||||
psubusw m3, m5
|
||||
pxor m4, m4
|
||||
pcmpeqw m2, m4
|
||||
@ -301,7 +304,7 @@ cglobal msac_decode_symbol_adapt16, 0, 6, 6
|
||||
movzx t3d, word [t1+t4*2]
|
||||
pcmpeqw m4, m4
|
||||
mova m5, m4
|
||||
lea t2d, [t3+80] ; only support n_symbols >= 4
|
||||
lea t2d, [t3+80] ; only support n_symbols > 2
|
||||
shr t2d, 4
|
||||
cmp t3d, 32
|
||||
adc t3d, 0
|
||||
@ -316,8 +319,8 @@ cglobal msac_decode_symbol_adapt16, 0, 6, 6
|
||||
psraw m5, m2
|
||||
paddw m0, m4
|
||||
paddw m1, m5
|
||||
movu [t1], m0
|
||||
movu [t1+16], m1
|
||||
mova [t1], m0
|
||||
mova [t1+16], m1
|
||||
mov [t1+t4*2], t3w
|
||||
.renorm:
|
||||
tzcnt eax, eax
|
||||
@ -440,3 +443,158 @@ cglobal msac_decode_bool, 0, 6, 0
|
||||
movzx eax, al
|
||||
%endif
|
||||
jmp m(msac_decode_symbol_adapt4).renorm3
|
||||
|
||||
%macro HI_TOK 1 ; update_cdf
|
||||
%if ARCH_X86_64 == 0
|
||||
mov eax, -24
|
||||
%endif
|
||||
%%loop:
|
||||
%if %1
|
||||
movzx t2d, word [t1+3*2]
|
||||
%endif
|
||||
mova m1, m0
|
||||
pshuflw m2, m2, q0000
|
||||
psrlw m1, 6
|
||||
movd [buf+12], m2
|
||||
pand m2, m4
|
||||
psllw m1, 7
|
||||
pmulhuw m1, m2
|
||||
%if ARCH_X86_64 == 0
|
||||
add eax, 5
|
||||
mov [buf+8], eax
|
||||
%endif
|
||||
pshuflw m3, m3, c_shuf
|
||||
paddw m1, m5
|
||||
movq [buf+16], m1
|
||||
psubusw m1, m3
|
||||
pxor m2, m2
|
||||
pcmpeqw m1, m2
|
||||
pmovmskb eax, m1
|
||||
%if %1
|
||||
lea ecx, [t2+80]
|
||||
pcmpeqw m2, m2
|
||||
shr ecx, 4
|
||||
cmp t2d, 32
|
||||
adc t2d, 0
|
||||
movd m3, ecx
|
||||
pavgw m2, m1
|
||||
psubw m2, m0
|
||||
psubw m0, m1
|
||||
psraw m2, m3
|
||||
paddw m0, m2
|
||||
movq [t1], m0
|
||||
mov [t1+3*2], t2w
|
||||
%endif
|
||||
tzcnt eax, eax
|
||||
movzx ecx, word [buf+rax+16]
|
||||
movzx t2d, word [buf+rax+14]
|
||||
not t4
|
||||
%if ARCH_X86_64
|
||||
add t6d, 5
|
||||
%endif
|
||||
sub eax, 5 ; setup for merging the tok_br and tok branches
|
||||
sub t2d, ecx
|
||||
shl rcx, gprsize*8-16
|
||||
add t4, rcx
|
||||
bsr ecx, t2d
|
||||
xor ecx, 15
|
||||
shl t2d, cl
|
||||
shl t4, cl
|
||||
movd m2, t2d
|
||||
mov [t7+msac.rng], t2d
|
||||
not t4
|
||||
sub t5d, ecx
|
||||
jge %%end
|
||||
mov t2, [t7+msac.buf]
|
||||
mov rcx, [t7+msac.end]
|
||||
%if UNIX64 == 0
|
||||
push t8
|
||||
%endif
|
||||
lea t8, [t2+gprsize]
|
||||
cmp t8, rcx
|
||||
ja %%refill_eob
|
||||
mov t2, [t2]
|
||||
lea ecx, [t5+23]
|
||||
add t5d, 16
|
||||
shr ecx, 3
|
||||
bswap t2
|
||||
sub t8, rcx
|
||||
shl ecx, 3
|
||||
shr t2, cl
|
||||
sub ecx, t5d
|
||||
mov t5d, gprsize*8-16
|
||||
shl t2, cl
|
||||
mov [t7+msac.buf], t8
|
||||
%if UNIX64 == 0
|
||||
pop t8
|
||||
%endif
|
||||
sub t5d, ecx
|
||||
xor t4, t2
|
||||
%%end:
|
||||
movp m3, t4
|
||||
%if ARCH_X86_64
|
||||
add t6d, eax ; CF = tok_br < 3 || tok == 15
|
||||
jnc %%loop
|
||||
lea eax, [t6+30]
|
||||
%else
|
||||
add eax, [buf+8]
|
||||
jnc %%loop
|
||||
add eax, 30
|
||||
%if STACK_ALIGNMENT >= 16
|
||||
add esp, 36
|
||||
%else
|
||||
mov esp, [esp]
|
||||
%endif
|
||||
%endif
|
||||
mov [t7+msac.dif], t4
|
||||
shr eax, 1
|
||||
mov [t7+msac.cnt], t5d
|
||||
RET
|
||||
%%refill_eob:
|
||||
mov t8, rcx
|
||||
mov ecx, gprsize*8-24
|
||||
sub ecx, t5d
|
||||
%%refill_eob_loop:
|
||||
cmp t2, t8
|
||||
jae %%refill_eob_end
|
||||
movzx t5d, byte [t2]
|
||||
inc t2
|
||||
shl t5, cl
|
||||
xor t4, t5
|
||||
sub ecx, 8
|
||||
jge %%refill_eob_loop
|
||||
%%refill_eob_end:
|
||||
%if UNIX64 == 0
|
||||
pop t8
|
||||
%endif
|
||||
mov t5d, gprsize*8-24
|
||||
mov [t7+msac.buf], t2
|
||||
sub t5d, ecx
|
||||
jmp %%end
|
||||
%endmacro
|
||||
|
||||
cglobal msac_decode_hi_tok, 0, 7 + ARCH_X86_64, 6
|
||||
DECODE_SYMBOL_ADAPT_INIT 1
|
||||
%if ARCH_X86_64 == 0 && PIC
|
||||
LEA t2, min_prob+12*2
|
||||
%define base t2-(min_prob+12*2)
|
||||
%else
|
||||
%define base 0
|
||||
%endif
|
||||
movq m0, [t1]
|
||||
movd m2, [t0+msac.rng]
|
||||
mov eax, [t0+msac.update_cdf]
|
||||
movq m4, [base+pw_0xff00]
|
||||
movp m3, [t0+msac.dif]
|
||||
movq m5, [base+min_prob+12*2]
|
||||
mov t4, [t0+msac.dif]
|
||||
mov t5d, [t0+msac.cnt]
|
||||
%if ARCH_X86_64
|
||||
mov t6d, -24
|
||||
%endif
|
||||
movifnidn t7, t0
|
||||
test eax, eax
|
||||
jz .no_update_cdf
|
||||
HI_TOK 1
|
||||
.no_update_cdf:
|
||||
HI_TOK 0
|
||||
|
2
third_party/dav1d/src/x86/msac.h
vendored
2
third_party/dav1d/src/x86/msac.h
vendored
@ -37,11 +37,13 @@ unsigned dav1d_msac_decode_symbol_adapt16_sse2(MsacContext *s, uint16_t *cdf,
|
||||
unsigned dav1d_msac_decode_bool_adapt_sse2(MsacContext *s, uint16_t *cdf);
|
||||
unsigned dav1d_msac_decode_bool_equi_sse2(MsacContext *s);
|
||||
unsigned dav1d_msac_decode_bool_sse2(MsacContext *s, unsigned f);
|
||||
unsigned dav1d_msac_decode_hi_tok_sse2(MsacContext *s, uint16_t *cdf);
|
||||
|
||||
#if ARCH_X86_64 || defined(__SSE2__) || (defined(_M_IX86_FP) && _M_IX86_FP >= 2)
|
||||
#define dav1d_msac_decode_symbol_adapt4 dav1d_msac_decode_symbol_adapt4_sse2
|
||||
#define dav1d_msac_decode_symbol_adapt8 dav1d_msac_decode_symbol_adapt8_sse2
|
||||
#define dav1d_msac_decode_symbol_adapt16 dav1d_msac_decode_symbol_adapt16_sse2
|
||||
#define dav1d_msac_decode_hi_tok dav1d_msac_decode_hi_tok_sse2
|
||||
#endif
|
||||
|
||||
#define dav1d_msac_decode_bool_adapt dav1d_msac_decode_bool_adapt_sse2
|
||||
|
3
third_party/dav1d/tests/checkasm/checkasm.c
vendored
3
third_party/dav1d/tests/checkasm/checkasm.c
vendored
@ -65,6 +65,7 @@ static const struct {
|
||||
{ "msac", checkasm_check_msac },
|
||||
#if CONFIG_8BPC
|
||||
{ "cdef_8bpc", checkasm_check_cdef_8bpc },
|
||||
{ "filmgrain_8bpc", checkasm_check_filmgrain_8bpc },
|
||||
{ "ipred_8bpc", checkasm_check_ipred_8bpc },
|
||||
{ "itx_8bpc", checkasm_check_itx_8bpc },
|
||||
{ "loopfilter_8bpc", checkasm_check_loopfilter_8bpc },
|
||||
@ -73,6 +74,7 @@ static const struct {
|
||||
#endif
|
||||
#if CONFIG_16BPC
|
||||
{ "cdef_16bpc", checkasm_check_cdef_16bpc },
|
||||
{ "filmgrain_16bpc", checkasm_check_filmgrain_16bpc },
|
||||
{ "ipred_16bpc", checkasm_check_ipred_16bpc },
|
||||
{ "itx_16bpc", checkasm_check_itx_16bpc },
|
||||
{ "loopfilter_16bpc", checkasm_check_loopfilter_16bpc },
|
||||
@ -703,6 +705,7 @@ void checkasm_set_signal_handler_state(const int enabled) {
|
||||
RemoveVectoredExceptionHandler(signal_handler);
|
||||
#else
|
||||
void (*const handler)(int) = enabled ? signal_handler : SIG_DFL;
|
||||
signal(SIGBUS, handler);
|
||||
signal(SIGFPE, handler);
|
||||
signal(SIGILL, handler);
|
||||
signal(SIGSEGV, handler);
|
||||
|
3
third_party/dav1d/tests/checkasm/checkasm.h
vendored
3
third_party/dav1d/tests/checkasm/checkasm.h
vendored
@ -60,6 +60,7 @@ name##_16bpc(void)
|
||||
|
||||
void checkasm_check_msac(void);
|
||||
decl_check_bitfns(void checkasm_check_cdef);
|
||||
decl_check_bitfns(void checkasm_check_filmgrain);
|
||||
decl_check_bitfns(void checkasm_check_ipred);
|
||||
decl_check_bitfns(void checkasm_check_itx);
|
||||
decl_check_bitfns(void checkasm_check_loopfilter);
|
||||
@ -279,7 +280,7 @@ void checkasm_stack_clobber(uint64_t clobber, ...);
|
||||
}\
|
||||
} while (0)
|
||||
#else
|
||||
#define bench_new(...) while (0)
|
||||
#define bench_new(...) do {} while (0)
|
||||
#endif
|
||||
|
||||
#define DECL_CHECKASM_CHECK_FUNC(type) \
|
||||
|
269
third_party/dav1d/tests/checkasm/filmgrain.c
vendored
Normal file
269
third_party/dav1d/tests/checkasm/filmgrain.c
vendored
Normal file
@ -0,0 +1,269 @@
|
||||
/*
|
||||
* Copyright © 2019, VideoLAN and dav1d authors
|
||||
* Copyright © 2019, Two Orioles, LLC
|
||||
* All rights reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright notice, this
|
||||
* list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
* this list of conditions and the following disclaimer in the documentation
|
||||
* and/or other materials provided with the distribution.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
|
||||
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
|
||||
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
|
||||
* ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
|
||||
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
|
||||
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
|
||||
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
#include "tests/checkasm/checkasm.h"
|
||||
|
||||
#include <string.h>
|
||||
|
||||
#include "src/levels.h"
|
||||
#include "src/film_grain.h"
|
||||
#define UNIT_TEST 1
|
||||
#include "src/fg_apply_tmpl.c"
|
||||
|
||||
static void check_gen_grny(const Dav1dFilmGrainDSPContext *const dsp) {
|
||||
entry grain_lut_c[GRAIN_HEIGHT][GRAIN_WIDTH];
|
||||
entry grain_lut_a[GRAIN_HEIGHT + 1][GRAIN_WIDTH];
|
||||
|
||||
declare_func(void, entry grain_lut[][GRAIN_WIDTH],
|
||||
const Dav1dFilmGrainData *data HIGHBD_DECL_SUFFIX);
|
||||
|
||||
for (int i = 0; i < 4; i++) {
|
||||
if (check_func(dsp->generate_grain_y, "gen_grain_y_ar%d_%dbpc", i, BITDEPTH)) {
|
||||
Dav1dFilmGrainData fg_data;
|
||||
fg_data.seed = rnd() & 0xFFFF;
|
||||
|
||||
#if BITDEPTH == 16
|
||||
const int bitdepth_max = rnd() & 1 ? 0x3ff : 0xfff;
|
||||
#endif
|
||||
|
||||
fg_data.grain_scale_shift = rnd() & 3;
|
||||
fg_data.ar_coeff_shift = (rnd() & 3) + 6;
|
||||
fg_data.ar_coeff_lag = i;
|
||||
const int num_y_pos = 2 * fg_data.ar_coeff_lag * (fg_data.ar_coeff_lag + 1);
|
||||
for (int n = 0; n < num_y_pos; n++)
|
||||
fg_data.ar_coeffs_y[n] = (rnd() & 0xff) - 128;
|
||||
|
||||
call_ref(grain_lut_c, &fg_data HIGHBD_TAIL_SUFFIX);
|
||||
call_new(grain_lut_a, &fg_data HIGHBD_TAIL_SUFFIX);
|
||||
if (memcmp(grain_lut_c, grain_lut_a,
|
||||
GRAIN_WIDTH * GRAIN_HEIGHT * sizeof(entry)))
|
||||
{
|
||||
fail();
|
||||
}
|
||||
|
||||
bench_new(grain_lut_a, &fg_data HIGHBD_TAIL_SUFFIX);
|
||||
}
|
||||
}
|
||||
|
||||
report("gen_grain_y");
|
||||
}
|
||||
|
||||
static void check_fgy_sbrow(const Dav1dFilmGrainDSPContext *const dsp) {
|
||||
ALIGN_STK_32(pixel, c_dst, 128 * 32,);
|
||||
ALIGN_STK_32(pixel, a_dst, 128 * 32,);
|
||||
ALIGN_STK_32(pixel, src, 128 * 32,);
|
||||
const ptrdiff_t stride = 128 * sizeof(pixel);
|
||||
|
||||
declare_func(void, pixel *dst_row, const pixel *src_row, ptrdiff_t stride,
|
||||
const Dav1dFilmGrainData *data, size_t pw,
|
||||
const uint8_t scaling[SCALING_SIZE],
|
||||
const entry grain_lut[][GRAIN_WIDTH],
|
||||
int bh, int row_num HIGHBD_DECL_SUFFIX);
|
||||
|
||||
if (check_func(dsp->fgy_32x32xn, "fgy_32x32xn_%dbpc", BITDEPTH)) {
|
||||
Dav1dFilmGrainData fg_data;
|
||||
fg_data.seed = rnd() & 0xFFFF;
|
||||
|
||||
#if BITDEPTH == 16
|
||||
const int bitdepth_max = rnd() & 1 ? 0x3ff : 0xfff;
|
||||
#else
|
||||
const int bitdepth_max = 0xff;
|
||||
#endif
|
||||
|
||||
uint8_t scaling[SCALING_SIZE];
|
||||
entry grain_lut[GRAIN_HEIGHT + 1][GRAIN_WIDTH];
|
||||
fg_data.grain_scale_shift = rnd() & 3;
|
||||
fg_data.ar_coeff_shift = (rnd() & 3) + 6;
|
||||
fg_data.ar_coeff_lag = rnd() & 3;
|
||||
const int num_y_pos = 2 * fg_data.ar_coeff_lag * (fg_data.ar_coeff_lag + 1);
|
||||
for (int n = 0; n < num_y_pos; n++)
|
||||
fg_data.ar_coeffs_y[n] = (rnd() & 0xff) - 128;
|
||||
dsp->generate_grain_y(grain_lut, &fg_data HIGHBD_TAIL_SUFFIX);
|
||||
|
||||
fg_data.num_y_points = 2 + (rnd() % 13);
|
||||
const int pad = 0xff / fg_data.num_y_points;
|
||||
for (int n = 0; n < fg_data.num_y_points; n++) {
|
||||
fg_data.y_points[n][0] = 0xff * n / fg_data.num_y_points;
|
||||
fg_data.y_points[n][0] += rnd() % pad;
|
||||
fg_data.y_points[n][1] = rnd() & 0xff;
|
||||
}
|
||||
generate_scaling(bitdepth_from_max(bitdepth_max), fg_data.y_points,
|
||||
fg_data.num_y_points, scaling);
|
||||
|
||||
const int w = 1 + (rnd() & 127);
|
||||
const int h = 1 + (rnd() & 31);
|
||||
|
||||
for (int y = 0; y < h; y++)
|
||||
for (int x = 0; x < w; x++)
|
||||
src[y * PXSTRIDE(stride) + x] = rnd() & bitdepth_max;
|
||||
const int row_num = rnd() & 1 ? rnd() & 0x7ff : 0;
|
||||
|
||||
fg_data.clip_to_restricted_range = rnd() & 1;
|
||||
fg_data.scaling_shift = (rnd() & 3) + 8;
|
||||
for (fg_data.overlap_flag = 0; fg_data.overlap_flag <= 1;
|
||||
fg_data.overlap_flag++)
|
||||
{
|
||||
call_ref(c_dst, src, stride, &fg_data, w, scaling, grain_lut, h,
|
||||
row_num HIGHBD_TAIL_SUFFIX);
|
||||
call_new(a_dst, src, stride, &fg_data, w, scaling, grain_lut, h,
|
||||
row_num HIGHBD_TAIL_SUFFIX);
|
||||
|
||||
checkasm_check_pixel(c_dst, stride, a_dst, stride, w, h, "dst");
|
||||
}
|
||||
fg_data.overlap_flag = 1;
|
||||
bench_new(a_dst, src, stride, &fg_data, 64, scaling, grain_lut, 32,
|
||||
row_num HIGHBD_TAIL_SUFFIX);
|
||||
}
|
||||
|
||||
report("fgy_32x32xn");
|
||||
}
|
||||
|
||||
static void check_fguv_sbrow(const Dav1dFilmGrainDSPContext *const dsp) {
|
||||
ALIGN_STK_32(pixel, c_dst, 128 * 32,);
|
||||
ALIGN_STK_32(pixel, a_dst, 128 * 32,);
|
||||
ALIGN_STK_32(pixel, src, 128 * 32,);
|
||||
ALIGN_STK_32(pixel, luma_src, 128 * 32,);
|
||||
const ptrdiff_t lstride = 128 * sizeof(pixel);
|
||||
|
||||
declare_func(void, pixel *dst_row, const pixel *src_row, ptrdiff_t stride,
|
||||
const Dav1dFilmGrainData *data, size_t pw,
|
||||
const uint8_t scaling[SCALING_SIZE],
|
||||
const entry grain_lut[][GRAIN_WIDTH], int bh, int row_num,
|
||||
const pixel *luma_row, ptrdiff_t luma_stride, int uv_pl,
|
||||
int is_identity HIGHBD_DECL_SUFFIX);
|
||||
|
||||
for (int layout_idx = 0; layout_idx < 3; layout_idx++) {
|
||||
const char ss_name[][4] = {
|
||||
[DAV1D_PIXEL_LAYOUT_I420 - 1] = "420",
|
||||
[DAV1D_PIXEL_LAYOUT_I422 - 1] = "422",
|
||||
[DAV1D_PIXEL_LAYOUT_I444 - 1] = "444",
|
||||
};
|
||||
const enum Dav1dPixelLayout layout = layout_idx + 1;
|
||||
const int ss_x = layout != DAV1D_PIXEL_LAYOUT_I444;
|
||||
const int ss_y = layout == DAV1D_PIXEL_LAYOUT_I420;
|
||||
const ptrdiff_t stride = (ss_x ? 96 : 128) * sizeof(pixel);
|
||||
|
||||
for (int csfl = 0; csfl <= 1; csfl++) {
|
||||
if (check_func(dsp->fguv_32x32xn[layout_idx],
|
||||
"fguv_32x32xn_%dbpc_%s_csfl%d",
|
||||
BITDEPTH, ss_name[layout_idx], csfl))
|
||||
{
|
||||
Dav1dFilmGrainData fg_data;
|
||||
|
||||
fg_data.seed = rnd() & 0xFFFF;
|
||||
|
||||
#if BITDEPTH == 16
|
||||
const int bitdepth_max = rnd() & 1 ? 0x3ff : 0xfff;
|
||||
#else
|
||||
const int bitdepth_max = 0xff;
|
||||
#endif
|
||||
const int uv_pl = rnd() & 1;
|
||||
const int is_identity = rnd() & 1;
|
||||
|
||||
uint8_t scaling[SCALING_SIZE];
|
||||
entry grain_lut[2][GRAIN_HEIGHT + 1][GRAIN_WIDTH];
|
||||
fg_data.grain_scale_shift = rnd() & 3;
|
||||
fg_data.ar_coeff_shift = (rnd() & 3) + 6;
|
||||
fg_data.ar_coeff_lag = rnd() & 3;
|
||||
const int num_y_pos = 2 * fg_data.ar_coeff_lag * (fg_data.ar_coeff_lag + 1);
|
||||
for (int n = 0; n < num_y_pos; n++)
|
||||
fg_data.ar_coeffs_y[n] = (rnd() & 0xff) - 128;
|
||||
dsp->generate_grain_y(grain_lut[0], &fg_data HIGHBD_TAIL_SUFFIX);
|
||||
dsp->generate_grain_uv[layout_idx](grain_lut[1], grain_lut[0],
|
||||
&fg_data, uv_pl HIGHBD_TAIL_SUFFIX);
|
||||
|
||||
const int w = 1 + (rnd() & (127 >> ss_x));
|
||||
const int h = 1 + (rnd() & (31 >> ss_y));
|
||||
const int lw = w << ss_x, lh = h << ss_y;
|
||||
|
||||
for (int y = 0; y < h; y++)
|
||||
for (int x = 0; x < w; x++)
|
||||
src[y * PXSTRIDE(stride) + x] = rnd() & bitdepth_max;
|
||||
for (int y = 0; y < lh; y++)
|
||||
for (int x = 0; x < lw; x++)
|
||||
luma_src[y * PXSTRIDE(lstride) + x] = rnd() & bitdepth_max;
|
||||
const int row_num = rnd() & 1 ? rnd() & 0x7ff : 0;
|
||||
|
||||
if (csfl) {
|
||||
fg_data.num_y_points = 2 + (rnd() % 13);
|
||||
const int pad = 0xff / fg_data.num_y_points;
|
||||
for (int n = 0; n < fg_data.num_y_points; n++) {
|
||||
fg_data.y_points[n][0] = 0xff * n / fg_data.num_y_points;
|
||||
fg_data.y_points[n][0] += rnd() % pad;
|
||||
fg_data.y_points[n][1] = rnd() & 0xff;
|
||||
}
|
||||
generate_scaling(bitdepth_from_max(bitdepth_max), fg_data.y_points,
|
||||
fg_data.num_y_points, scaling);
|
||||
} else {
|
||||
fg_data.num_uv_points[uv_pl] = 2 + (rnd() % 9);
|
||||
const int pad = 0xff / fg_data.num_uv_points[uv_pl];
|
||||
for (int n = 0; n < fg_data.num_uv_points[uv_pl]; n++) {
|
||||
fg_data.uv_points[uv_pl][n][0] = 0xff * n / fg_data.num_uv_points[uv_pl];
|
||||
fg_data.uv_points[uv_pl][n][0] += rnd() % pad;
|
||||
fg_data.uv_points[uv_pl][n][1] = rnd() & 0xff;
|
||||
}
|
||||
generate_scaling(bitdepth_from_max(bitdepth_max), fg_data.uv_points[uv_pl],
|
||||
fg_data.num_uv_points[uv_pl], scaling);
|
||||
|
||||
fg_data.uv_mult[uv_pl] = (rnd() & 0xff) - 128;
|
||||
fg_data.uv_luma_mult[uv_pl] = (rnd() & 0xff) - 128;
|
||||
fg_data.uv_offset[uv_pl] = (rnd() & 0x1ff) - 256;
|
||||
}
|
||||
|
||||
fg_data.clip_to_restricted_range = rnd() & 1;
|
||||
fg_data.scaling_shift = (rnd() & 3) + 8;
|
||||
fg_data.chroma_scaling_from_luma = csfl;
|
||||
for (fg_data.overlap_flag = 0; fg_data.overlap_flag <= 1;
|
||||
fg_data.overlap_flag++)
|
||||
{
|
||||
call_ref(c_dst, src, stride, &fg_data, w, scaling, grain_lut[1], h,
|
||||
row_num, luma_src, lstride, uv_pl, is_identity HIGHBD_TAIL_SUFFIX);
|
||||
call_new(a_dst, src, stride, &fg_data, w, scaling, grain_lut[1], h,
|
||||
row_num, luma_src, lstride, uv_pl, is_identity HIGHBD_TAIL_SUFFIX);
|
||||
|
||||
checkasm_check_pixel(c_dst, stride, a_dst, stride, w, h, "dst");
|
||||
}
|
||||
|
||||
fg_data.overlap_flag = 1;
|
||||
bench_new(a_dst, src, stride, &fg_data, 32, scaling, grain_lut[1], 16,
|
||||
row_num, luma_src, lstride, uv_pl, is_identity HIGHBD_TAIL_SUFFIX);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
report("fguv_32x32xn");
|
||||
}
|
||||
|
||||
/* Entry point of the film grain checkasm tests.
 * Initialises a local Dav1dFilmGrainDSPContext through
 * dav1d_film_grain_dsp_init() and hands it to each film grain check in turn.
 * NOTE(review): bitfn() presumably expands to the per-bitdepth symbol name;
 * confirm against its definition, which is not visible here. */
void bitfn(checkasm_check_filmgrain)(void) {
|
||||
Dav1dFilmGrainDSPContext c;
|
||||
|
||||
/* Fill the context with the function pointers that will be tested. */
bitfn(dav1d_film_grain_dsp_init)(&c);
|
||||
|
||||
/* Run the individual checks, all sharing the same context. */
check_gen_grny(&c);
|
||||
check_fgy_sbrow(&c);
|
||||
check_fguv_sbrow(&c);
|
||||
}
|
2
third_party/dav1d/tests/checkasm/itx.c
vendored
2
third_party/dav1d/tests/checkasm/itx.c
vendored
@ -138,7 +138,7 @@ static int copy_subcoefs(coef *coeff,
|
||||
* dimensions are non-zero. This leads to branching to specific optimized
|
||||
* simd versions (e.g. dc-only) so that we get full asm coverage in this
|
||||
* test */
|
||||
const int16_t *const scan = dav1d_scans[tx][dav1d_tx_type_class[txtp]];
|
||||
const uint16_t *const scan = dav1d_scans[tx][dav1d_tx_type_class[txtp]];
|
||||
const int sub_high = subsh > 0 ? subsh * 8 - 1 : 0;
|
||||
const int sub_low = subsh > 1 ? sub_high - 8 : 0;
|
||||
int n, eob;
|
||||
|
2
third_party/dav1d/tests/checkasm/mc.c
vendored
2
third_party/dav1d/tests/checkasm/mc.c
vendored
@ -27,8 +27,6 @@
|
||||
|
||||
#include "tests/checkasm/checkasm.h"
|
||||
|
||||
#include <assert.h>
|
||||
|
||||
#include "src/levels.h"
|
||||
#include "src/mc.h"
|
||||
|
||||
|
84
third_party/dav1d/tests/checkasm/msac.c
vendored
84
third_party/dav1d/tests/checkasm/msac.c
vendored
@ -38,7 +38,7 @@
|
||||
/* The normal code doesn't use function pointers */
|
||||
typedef unsigned (*decode_symbol_adapt_fn)(MsacContext *s, uint16_t *cdf,
|
||||
size_t n_symbols);
|
||||
typedef unsigned (*decode_bool_adapt_fn)(MsacContext *s, uint16_t *cdf);
|
||||
typedef unsigned (*decode_adapt_fn)(MsacContext *s, uint16_t *cdf);
|
||||
typedef unsigned (*decode_bool_equi_fn)(MsacContext *s);
|
||||
typedef unsigned (*decode_bool_fn)(MsacContext *s, unsigned f);
|
||||
|
||||
@ -46,17 +46,20 @@ typedef struct {
|
||||
decode_symbol_adapt_fn symbol_adapt4;
|
||||
decode_symbol_adapt_fn symbol_adapt8;
|
||||
decode_symbol_adapt_fn symbol_adapt16;
|
||||
decode_bool_adapt_fn bool_adapt;
|
||||
decode_adapt_fn bool_adapt;
|
||||
decode_bool_equi_fn bool_equi;
|
||||
decode_bool_fn bool;
|
||||
decode_adapt_fn hi_tok;
|
||||
} MsacDSPContext;
|
||||
|
||||
static void randomize_cdf(uint16_t *const cdf, int n) {
|
||||
for (int i = 16; i > n; i--)
|
||||
cdf[i] = rnd(); /* randomize padding */
|
||||
cdf[n] = cdf[n-1] = 0;
|
||||
while (--n > 0)
|
||||
cdf[n-1] = cdf[n] + rnd() % (32768 - cdf[n] - n) + 1;
|
||||
static void randomize_cdf(uint16_t *const cdf, const int n) {
|
||||
int i;
|
||||
for (i = 15; i > n; i--)
|
||||
cdf[i] = rnd(); // padding
|
||||
cdf[i] = 0; // count
|
||||
do {
|
||||
cdf[i - 1] = cdf[i] + rnd() % (32768 - cdf[i] - i) + 1;
|
||||
} while (--i > 0);
|
||||
}
|
||||
|
||||
/* memcmp() on structs can have weird behavior due to padding etc. */
|
||||
@ -69,7 +72,7 @@ static int msac_cmp(const MsacContext *const a, const MsacContext *const b) {
|
||||
static void msac_dump(unsigned c_res, unsigned a_res,
|
||||
const MsacContext *const a, const MsacContext *const b,
|
||||
const uint16_t *const cdf_a, const uint16_t *const cdf_b,
|
||||
int num_cdf)
|
||||
const int num_cdf)
|
||||
{
|
||||
if (c_res != a_res)
|
||||
fprintf(stderr, "c_res %u a_res %u\n", c_res, a_res);
|
||||
@ -86,16 +89,15 @@ static void msac_dump(unsigned c_res, unsigned a_res,
|
||||
if (a->allow_update_cdf)
|
||||
fprintf(stderr, "allow_update_cdf %d vs %d\n",
|
||||
a->allow_update_cdf, b->allow_update_cdf);
|
||||
if (cdf_a != NULL && cdf_b != NULL &&
|
||||
memcmp(cdf_a, cdf_b, sizeof(*cdf_a) * num_cdf)) {
|
||||
if (num_cdf && memcmp(cdf_a, cdf_b, sizeof(*cdf_a) * (num_cdf + 1))) {
|
||||
fprintf(stderr, "cdf:\n");
|
||||
for (int i = 0; i < num_cdf; i++)
|
||||
for (int i = 0; i <= num_cdf; i++)
|
||||
fprintf(stderr, " %5u", cdf_a[i]);
|
||||
fprintf(stderr, "\n");
|
||||
for (int i = 0; i < num_cdf; i++)
|
||||
for (int i = 0; i <= num_cdf; i++)
|
||||
fprintf(stderr, " %5u", cdf_b[i]);
|
||||
fprintf(stderr, "\n");
|
||||
for (int i = 0; i < num_cdf; i++)
|
||||
for (int i = 0; i <= num_cdf; i++)
|
||||
fprintf(stderr, " %c", cdf_a[i] != cdf_b[i] ? 'x' : '.');
|
||||
fprintf(stderr, "\n");
|
||||
}
|
||||
@ -105,7 +107,7 @@ static void msac_dump(unsigned c_res, unsigned a_res,
|
||||
if (check_func(c->symbol_adapt##n, "msac_decode_symbol_adapt%d", n)) { \
|
||||
for (int cdf_update = 0; cdf_update <= 1; cdf_update++) { \
|
||||
for (int ns = n_min; ns <= n_max; ns++) { \
|
||||
dav1d_msac_init(&s_c, buf, BUF_SIZE, !cdf_update); \
|
||||
dav1d_msac_init(&s_c, buf, BUF_SIZE, !cdf_update); \
|
||||
s_a = s_c; \
|
||||
randomize_cdf(cdf[0], ns); \
|
||||
memcpy(cdf[1], cdf[0], sizeof(*cdf)); \
|
||||
@ -117,26 +119,24 @@ static void msac_dump(unsigned c_res, unsigned a_res,
|
||||
{ \
|
||||
if (fail()) \
|
||||
msac_dump(c_res, a_res, &s_c, &s_a, \
|
||||
cdf[0], cdf[1], ns + 1); \
|
||||
cdf[0], cdf[1], ns); \
|
||||
} \
|
||||
} \
|
||||
if (cdf_update && ns == n) \
|
||||
bench_new(&s_a, cdf[0], n); \
|
||||
if (cdf_update && ns == n - 1) \
|
||||
bench_new(&s_a, cdf[1], ns); \
|
||||
} \
|
||||
} \
|
||||
} \
|
||||
} while (0)
|
||||
|
||||
static void check_decode_symbol(MsacDSPContext *const c, uint8_t *const buf) {
|
||||
/* Use an aligned CDF buffer for more consistent benchmark
|
||||
* results, and a misaligned one for checking correctness. */
|
||||
ALIGN_STK_16(uint16_t, cdf, 2, [17]);
|
||||
ALIGN_STK_32(uint16_t, cdf, 2, [16]);
|
||||
MsacContext s_c, s_a;
|
||||
|
||||
declare_func(unsigned, MsacContext *s, uint16_t *cdf, size_t n_symbols);
|
||||
CHECK_SYMBOL_ADAPT( 4, 1, 5);
|
||||
CHECK_SYMBOL_ADAPT( 8, 1, 8);
|
||||
CHECK_SYMBOL_ADAPT(16, 4, 16);
|
||||
CHECK_SYMBOL_ADAPT( 4, 1, 4);
|
||||
CHECK_SYMBOL_ADAPT( 8, 1, 7);
|
||||
CHECK_SYMBOL_ADAPT(16, 3, 15);
|
||||
report("decode_symbol");
|
||||
}
|
||||
|
||||
@ -158,11 +158,11 @@ static void check_decode_bool(MsacDSPContext *const c, uint8_t *const buf) {
|
||||
memcmp(cdf[0], cdf[1], sizeof(*cdf)))
|
||||
{
|
||||
if (fail())
|
||||
msac_dump(c_res, a_res, &s_c, &s_a, cdf[0], cdf[1], 2);
|
||||
msac_dump(c_res, a_res, &s_c, &s_a, cdf[0], cdf[1], 1);
|
||||
}
|
||||
}
|
||||
if (cdf_update)
|
||||
bench_new(&s_a, cdf[0]);
|
||||
bench_new(&s_a, cdf[1]);
|
||||
}
|
||||
}
|
||||
|
||||
@ -200,6 +200,35 @@ static void check_decode_bool(MsacDSPContext *const c, uint8_t *const buf) {
|
||||
report("decode_bool");
|
||||
}
|
||||
|
||||
/* Checks the hi_tok decoder in `c` against the reference implementation.
 *
 * c:   function-pointer table; only c->hi_tok is exercised here.
 * buf: input buffer of BUF_SIZE bytes passed to dav1d_msac_init().
 *
 * For cdf_update = 0 and 1, two identical decoder contexts and two identical
 * CDFs are set up; the reference runs on one pair and the new function on the
 * other. After each of up to 64 calls the return values, the contexts and the
 * CDFs are compared. */
static void check_decode_hi_tok(MsacDSPContext *const c, uint8_t *const buf) {
|
||||
/* cdf[0] is used by the reference call, cdf[1] by the function under test. */
ALIGN_STK_16(uint16_t, cdf, 2, [16]);
|
||||
MsacContext s_c, s_a;
|
||||
|
||||
if (check_func(c->hi_tok, "msac_decode_hi_tok")) {
|
||||
declare_func(unsigned, MsacContext *s, uint16_t *cdf);
|
||||
for (int cdf_update = 0; cdf_update <= 1; cdf_update++) {
|
||||
/* NOTE(review): the last argument is the negated cdf_update, so it
 * presumably acts as a "disable CDF update" flag -- confirm against
 * dav1d_msac_init(). */
dav1d_msac_init(&s_c, buf, BUF_SIZE, !cdf_update);
|
||||
/* Both implementations start from the same decoder state ... */
s_a = s_c;
|
||||
/* ... and from the same randomized CDF. */
randomize_cdf(cdf[0], 3);
|
||||
memcpy(cdf[1], cdf[0], sizeof(*cdf));
|
||||
for (int i = 0; i < 64; i++) {
|
||||
unsigned c_res = call_ref(&s_c, cdf[0]);
|
||||
unsigned a_res = call_new(&s_a, cdf[1]);
|
||||
/* A mismatch in the result, the context or the CDF is a failure. */
if (c_res != a_res || msac_cmp(&s_c, &s_a) ||
|
||||
memcmp(cdf[0], cdf[1], sizeof(*cdf)))
|
||||
{
|
||||
if (fail())
|
||||
msac_dump(c_res, a_res, &s_c, &s_a, cdf[0], cdf[1], 3);
|
||||
/* Stop this iteration loop at the first mismatch. */
break;
|
||||
}
|
||||
}
|
||||
/* Benchmark only in the cdf_update == 1 pass. */
if (cdf_update)
|
||||
bench_new(&s_a, cdf[1]);
|
||||
}
|
||||
}
|
||||
report("decode_hi_tok");
|
||||
}
|
||||
|
||||
void checkasm_check_msac(void) {
|
||||
MsacDSPContext c;
|
||||
c.symbol_adapt4 = dav1d_msac_decode_symbol_adapt_c;
|
||||
@ -208,6 +237,7 @@ void checkasm_check_msac(void) {
|
||||
c.bool_adapt = dav1d_msac_decode_bool_adapt_c;
|
||||
c.bool_equi = dav1d_msac_decode_bool_equi_c;
|
||||
c.bool = dav1d_msac_decode_bool_c;
|
||||
c.hi_tok = dav1d_msac_decode_hi_tok_c;
|
||||
|
||||
#if ARCH_AARCH64 && HAVE_ASM
|
||||
if (dav1d_get_cpu_flags() & DAV1D_ARM_CPU_FLAG_NEON) {
|
||||
@ -226,6 +256,7 @@ void checkasm_check_msac(void) {
|
||||
c.bool_adapt = dav1d_msac_decode_bool_adapt_sse2;
|
||||
c.bool_equi = dav1d_msac_decode_bool_equi_sse2;
|
||||
c.bool = dav1d_msac_decode_bool_sse2;
|
||||
c.hi_tok = dav1d_msac_decode_hi_tok_sse2;
|
||||
}
|
||||
#endif
|
||||
|
||||
@ -235,4 +266,5 @@ void checkasm_check_msac(void) {
|
||||
|
||||
check_decode_symbol(&c, buf);
|
||||
check_decode_bool(&c, buf);
|
||||
check_decode_hi_tok(&c, buf);
|
||||
}
|
||||
|
1
third_party/dav1d/tests/meson.build
vendored
1
third_party/dav1d/tests/meson.build
vendored
@ -41,6 +41,7 @@ if is_asm_enabled
|
||||
|
||||
checkasm_tmpl_sources = files(
|
||||
'checkasm/cdef.c',
|
||||
'checkasm/filmgrain.c',
|
||||
'checkasm/ipred.c',
|
||||
'checkasm/itx.c',
|
||||
'checkasm/loopfilter.c',
|
||||
|
5
third_party/dav1d/tools/dav1d.c
vendored
5
third_party/dav1d/tools/dav1d.c
vendored
@ -29,7 +29,6 @@
|
||||
#include "vcs_version.h"
|
||||
#include "cli_config.h"
|
||||
|
||||
#include <assert.h>
|
||||
#include <errno.h>
|
||||
#include <inttypes.h>
|
||||
#include <math.h>
|
||||
@ -137,7 +136,7 @@ int main(const int argc, char *const *const argv) {
|
||||
Dav1dPicture p;
|
||||
Dav1dContext *c;
|
||||
Dav1dData data;
|
||||
unsigned n_out = 0, total, fps[2];
|
||||
unsigned n_out = 0, total, fps[2], timebase[2];
|
||||
uint64_t nspf, tfirst, elapsed;
|
||||
double i_fps;
|
||||
FILE *frametimes = NULL;
|
||||
@ -155,7 +154,7 @@ int main(const int argc, char *const *const argv) {
|
||||
|
||||
if ((res = input_open(&in, cli_settings.demuxer,
|
||||
cli_settings.inputfile,
|
||||
fps, &total)) < 0)
|
||||
fps, &total, timebase)) < 0)
|
||||
{
|
||||
return res;
|
||||
}
|
||||
|
1
third_party/dav1d/tools/dav1d_cli_parse.c
vendored
1
third_party/dav1d/tools/dav1d_cli_parse.c
vendored
@ -27,7 +27,6 @@
|
||||
|
||||
#include "config.h"
|
||||
|
||||
#include <assert.h>
|
||||
#include <getopt.h>
|
||||
#include <limits.h>
|
||||
#include <math.h>
|
||||
|
5
third_party/dav1d/tools/input/annexb.c
vendored
5
third_party/dav1d/tools/input/annexb.c
vendored
@ -27,7 +27,6 @@
|
||||
|
||||
#include "config.h"
|
||||
|
||||
#include <assert.h>
|
||||
#include <errno.h>
|
||||
#include <stdio.h>
|
||||
#include <stdint.h>
|
||||
@ -60,7 +59,7 @@ static int leb128(AnnexbInputContext *const c, size_t *const len) {
|
||||
}
|
||||
|
||||
static int annexb_open(AnnexbInputContext *const c, const char *const file,
|
||||
unsigned fps[2], unsigned *const num_frames)
|
||||
unsigned fps[2], unsigned *const num_frames, unsigned timebase[2])
|
||||
{
|
||||
int res;
|
||||
size_t len;
|
||||
@ -73,6 +72,8 @@ static int annexb_open(AnnexbInputContext *const c, const char *const file,
|
||||
// TODO: Parse sequence header and read timing info if any.
|
||||
fps[0] = 25;
|
||||
fps[1] = 1;
|
||||
timebase[0] = 25;
|
||||
timebase[1] = 1;
|
||||
for (*num_frames = 0;; (*num_frames)++) {
|
||||
res = leb128(c, &len);
|
||||
if (res < 0)
|
||||
|
2
third_party/dav1d/tools/input/demuxer.h
vendored
2
third_party/dav1d/tools/input/demuxer.h
vendored
@ -36,7 +36,7 @@ typedef struct Demuxer {
|
||||
const char *name;
|
||||
const char *extension;
|
||||
int (*open)(DemuxerPriv *ctx, const char *filename,
|
||||
unsigned fps[2], unsigned *num_frames);
|
||||
unsigned fps[2], unsigned *num_frames, unsigned timebase[2]);
|
||||
int (*read)(DemuxerPriv *ctx, Dav1dData *data);
|
||||
void (*close)(DemuxerPriv *ctx);
|
||||
} Demuxer;
|
||||
|
7
third_party/dav1d/tools/input/input.c
vendored
7
third_party/dav1d/tools/input/input.c
vendored
@ -27,12 +27,13 @@
|
||||
|
||||
#include "config.h"
|
||||
|
||||
#include <assert.h>
|
||||
#include <errno.h>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
|
||||
#include "common/attributes.h"
|
||||
|
||||
#include "input/input.h"
|
||||
#include "input/demuxer.h"
|
||||
|
||||
@ -75,7 +76,7 @@ static const char *find_extension(const char *const f) {
|
||||
|
||||
int input_open(DemuxerContext **const c_out,
|
||||
const char *const name, const char *const filename,
|
||||
unsigned fps[2], unsigned *const num_frames)
|
||||
unsigned fps[2], unsigned *const num_frames, unsigned timebase[2])
|
||||
{
|
||||
const Demuxer *impl;
|
||||
DemuxerContext *c;
|
||||
@ -120,7 +121,7 @@ int input_open(DemuxerContext **const c_out,
|
||||
memset(c, 0, sizeof(DemuxerContext) + impl->priv_data_size);
|
||||
c->impl = impl;
|
||||
c->data = (DemuxerPriv *) &c[1];
|
||||
if ((res = impl->open(c->data, filename, fps, num_frames)) < 0) {
|
||||
if ((res = impl->open(c->data, filename, fps, num_frames, timebase)) < 0) {
|
||||
free(c);
|
||||
return res;
|
||||
}
|
||||
|
2
third_party/dav1d/tools/input/input.h
vendored
2
third_party/dav1d/tools/input/input.h
vendored
@ -35,7 +35,7 @@ typedef struct DemuxerContext DemuxerContext;
|
||||
void init_demuxers(void);
|
||||
int input_open(DemuxerContext **const c_out,
|
||||
const char *const name, const char *const filename,
|
||||
unsigned fps[2], unsigned *num_frames);
|
||||
unsigned fps[2], unsigned *num_frames, unsigned timebase[2]);
|
||||
int input_read(DemuxerContext *ctx, Dav1dData *data);
|
||||
void input_close(DemuxerContext *ctx);
|
||||
|
||||
|
12
third_party/dav1d/tools/input/ivf.c
vendored
12
third_party/dav1d/tools/input/ivf.c
vendored
@ -27,7 +27,6 @@
|
||||
|
||||
#include "config.h"
|
||||
|
||||
#include <assert.h>
|
||||
#include <errno.h>
|
||||
#include <stdio.h>
|
||||
#include <stdint.h>
|
||||
@ -49,7 +48,7 @@ static int64_t rl64(const uint8_t *const p) {
|
||||
}
|
||||
|
||||
static int ivf_open(IvfInputContext *const c, const char *const file,
|
||||
unsigned fps[2], unsigned *const num_frames)
|
||||
unsigned fps[2], unsigned *const num_frames, unsigned timebase[2])
|
||||
{
|
||||
size_t res;
|
||||
uint8_t hdr[32];
|
||||
@ -74,17 +73,18 @@ static int ivf_open(IvfInputContext *const c, const char *const file,
|
||||
return -1;
|
||||
}
|
||||
|
||||
fps[0] = rl32(&hdr[16]);
|
||||
fps[1] = rl32(&hdr[20]);
|
||||
timebase[0] = rl32(&hdr[16]);
|
||||
timebase[1] = rl32(&hdr[20]);
|
||||
const unsigned duration = rl32(&hdr[24]);
|
||||
|
||||
uint8_t data[4];
|
||||
for (*num_frames = 0;; (*num_frames)++) {
|
||||
if ((res = fread(data, 4, 1, c->f)) != 1)
|
||||
break; // EOF
|
||||
fseeko(c->f, rl32(data) + 8, SEEK_CUR);
|
||||
}
|
||||
fps[0] *= *num_frames;
|
||||
fps[1] *= duration;
|
||||
fps[0] = timebase[0] * *num_frames;
|
||||
fps[1] = timebase[1] * duration;
|
||||
fseeko(c->f, 32, SEEK_SET);
|
||||
|
||||
return 0;
|
||||
|
49
third_party/dav1d/tools/meson.build
vendored
49
third_party/dav1d/tools/meson.build
vendored
@ -22,9 +22,38 @@
|
||||
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
#
|
||||
# Build definition for the dav1d tools
|
||||
#
|
||||
# Common source files used by tools and examples
|
||||
|
||||
dav1d_input_sources = files(
|
||||
'input/input.c',
|
||||
'input/annexb.c',
|
||||
'input/ivf.c',
|
||||
)
|
||||
|
||||
dav1d_output_sources = files(
|
||||
'output/md5.c',
|
||||
'output/null.c',
|
||||
'output/output.c',
|
||||
'output/y4m2.c',
|
||||
'output/yuv.c',
|
||||
)
|
||||
|
||||
dav1d_input_objs = static_library('dav1d_input',
|
||||
dav1d_input_sources,
|
||||
|
||||
include_directories : dav1d_inc_dirs,
|
||||
install : false,
|
||||
build_by_default : false,
|
||||
)
|
||||
|
||||
dav1d_output_objs = static_library('dav1d_output',
|
||||
dav1d_output_sources,
|
||||
|
||||
include_directories : dav1d_inc_dirs,
|
||||
install : false,
|
||||
build_by_default : false,
|
||||
)
|
||||
|
||||
|
||||
# Leave subdir if tools are disabled
|
||||
if not get_option('enable_tools')
|
||||
@ -32,6 +61,10 @@ if not get_option('enable_tools')
|
||||
endif
|
||||
|
||||
|
||||
#
|
||||
# Build definition for the dav1d tools
|
||||
#
|
||||
|
||||
# Configuration data for cli_config.h
|
||||
cli_cdata = configuration_data()
|
||||
|
||||
@ -56,21 +89,13 @@ cli_config_h_target = configure_file(output: 'cli_config.h', configuration: cli_
|
||||
dav1d_sources = files(
|
||||
'dav1d.c',
|
||||
'dav1d_cli_parse.c',
|
||||
'input/input.c',
|
||||
'input/annexb.c',
|
||||
'input/ivf.c',
|
||||
'output/md5.c',
|
||||
'output/null.c',
|
||||
'output/output.c',
|
||||
'output/y4m2.c',
|
||||
'output/yuv.c',
|
||||
)
|
||||
|
||||
dav1d = executable('dav1d',
|
||||
dav1d_sources,
|
||||
rev_target, cli_config_h_target,
|
||||
|
||||
link_with : libdav1d,
|
||||
link_with : [libdav1d, dav1d_input_objs, dav1d_output_objs],
|
||||
include_directories : [dav1d_inc_dirs],
|
||||
dependencies : [getopt_dependency, thread_dependency, rt_dependency],
|
||||
install : true,
|
||||
|
3
third_party/dav1d/tools/output/output.c
vendored
3
third_party/dav1d/tools/output/output.c
vendored
@ -27,12 +27,13 @@
|
||||
|
||||
#include "config.h"
|
||||
|
||||
#include <assert.h>
|
||||
#include <errno.h>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
|
||||
#include "common/attributes.h"
|
||||
|
||||
#include "output/output.h"
|
||||
#include "output/muxer.h"
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user