Bug 1582743 - Update dav1d from upstream to commit c0865f3. r=TD-Linux

Differential Revision: https://phabricator.services.mozilla.com/D46762

--HG--
extra : moz-landing-system : lando
This commit is contained in:
Alex Chronopoulos 2019-09-23 18:02:33 +00:00
parent 5be5470fba
commit ac0da8b368
87 changed files with 13062 additions and 5655 deletions

View File

@ -82,6 +82,7 @@ if CONFIG['CPU_ARCH'] in ('x86', 'x86_64'):
# an error when it compiles empty files.
SOURCES += [
'../../../third_party/dav1d/src/x86/cdef.asm',
'../../../third_party/dav1d/src/x86/film_grain.asm',
'../../../third_party/dav1d/src/x86/ipred.asm',
'../../../third_party/dav1d/src/x86/itx.asm',
'../../../third_party/dav1d/src/x86/loopfilter.asm',
@ -94,6 +95,7 @@ if CONFIG['CPU_ARCH'] in ('x86', 'x86_64'):
'../../../third_party/dav1d/src/x86/cpuid.asm',
'../../../third_party/dav1d/src/x86/ipred_ssse3.asm',
'../../../third_party/dav1d/src/x86/itx_ssse3.asm',
'../../../third_party/dav1d/src/x86/loopfilter_ssse3.asm',
'../../../third_party/dav1d/src/x86/looprestoration_ssse3.asm',
'../../../third_party/dav1d/src/x86/mc_ssse3.asm',
'../../../third_party/dav1d/src/x86/msac.asm',
@ -103,6 +105,7 @@ if CONFIG['CPU_ARCH'] in ('x86', 'x86_64'):
relative_path = '../../../third_party/dav1d/src/x86/'
bitdepth_basenames = [
'cdef_init_tmpl.c',
'film_grain_init_tmpl.c',
'ipred_init_tmpl.c',
'itx_init_tmpl.c',
'loopfilter_init_tmpl.c',

View File

@ -1,7 +1,7 @@
#define API_VERSION_NUMBER 2,0,0,0
#define API_VERSION_NUMBER_STR "2.0.0"
#define PROJECT_VERSION_NUMBER 0,3,1,0
#define PROJECT_VERSION_NUMBER_STR "0.3.1"
#define API_VERSION_NUMBER 3,0,0,0
#define API_VERSION_NUMBER_STR "3.0.0"
#define PROJECT_VERSION_NUMBER 0,4,0,0
#define PROJECT_VERSION_NUMBER_STR "0.4.0"
#include <windows.h>

View File

@ -124,6 +124,7 @@ relative_path = '../../third_party/dav1d/src/'
bitdepth_basenames = [
'cdef_apply_tmpl.c',
'cdef_tmpl.c',
'fg_apply_tmpl.c',
'film_grain_tmpl.c',
'ipred_prepare_tmpl.c',
'ipred_tmpl.c',
@ -163,6 +164,7 @@ SOURCES += [
EXPORTS.dav1d.src += [
'../../third_party/dav1d/src/cdef.h',
'../../third_party/dav1d/src/cdef_apply.h',
'../../third_party/dav1d/src/fg_apply.h',
'../../third_party/dav1d/src/ipred.h',
'../../third_party/dav1d/src/ipred_prepare.h',
'../../third_party/dav1d/src/itx.h',

View File

@ -20,7 +20,7 @@ origin:
# Human-readable identifier for this version/release
# Generally "version NNN", "tag SSS", "bookmark SSS"
release: commit c138435f5aee794ff9d9ac23c3718017927f2e20 (2019-07-17T12:39:10.000Z).
release: commit c0865f35c74bdcc71021630f64dca2db35d2bc8c (2019-09-19T12:07:23.000+02:00).
# The package's license, where possible using the mnemonic from
# https://spdx.org/licenses/

View File

@ -1,2 +1,2 @@
/* auto-generated, do not edit */
#define DAV1D_VERSION "0.3.1-69-gc138435"
#define DAV1D_VERSION "0.4.0-49-gc0865f3"

View File

@ -27,7 +27,7 @@
#ifndef DAV1D_VERSION_H
#define DAV1D_VERSION_H
#define DAV1D_API_VERSION_MAJOR 2
#define DAV1D_API_VERSION_MAJOR 3
#define DAV1D_API_VERSION_MINOR 0
#define DAV1D_API_VERSION_PATCH 0

View File

@ -269,6 +269,7 @@ build-debian-ppc64le:
test-debian:
image: registry.videolan.org:5000/dav1d-debian-unstable:20190215130514
stage: test
needs: ["build-debian"]
tags:
- debian
- amd64
@ -289,6 +290,7 @@ test-debian:
test-debian-unaligned-stack:
image: registry.videolan.org:5000/dav1d-debian-unstable:20190215130514
stage: test
needs: ["build-debian"]
tags:
- debian
- amd64
@ -309,6 +311,7 @@ test-debian-unaligned-stack:
test-debian-asan:
image: registry.videolan.org:5000/dav1d-debian-unstable:20190215130514
stage: test
needs: ["build-debian"]
tags:
- debian
- amd64
@ -331,6 +334,7 @@ test-debian-asan:
test-debian-msan:
image: registry.videolan.org:5000/dav1d-debian-unstable:20190215130514
stage: test
needs: ["build-debian"]
tags:
- debian
- amd64
@ -353,6 +357,7 @@ test-debian-msan:
test-debian-ubsan:
image: registry.videolan.org:5000/dav1d-debian-unstable:20190215130514
stage: test
needs: ["build-debian"]
tags:
- debian
- amd64
@ -375,6 +380,7 @@ test-debian-ubsan:
test-win64:
image: registry.videolan.org:5000/dav1d-debian-unstable:20190215130514
stage: test
needs: ["build-win64"]
tags:
- debian
- amd64
@ -399,6 +405,7 @@ test-win64:
test-debian-aarch64:
image: registry.videolan.org:5000/dav1d-debian-unstable-aarch64:20181122182457
stage: test
needs: ["build-debian-aarch64"]
tags:
- aarch64
- debian
@ -421,6 +428,7 @@ test-debian-aarch64:
test-debian-ppc64le:
image: registry.videolan.org/dav1d-debian-unstable-ppc64le:20190606105121
stage: test
needs: ["build-debian-ppc64le"]
tags:
- ppc64le
- docker
@ -443,6 +451,7 @@ test-debian-ppc64le:
test-debian-armv7-clang-5:
stage: test
image: registry.videolan.org:5000/dav1d-debian-unstable-armv7:20190202101732
needs: ["build-debian-armv7-clang-5"]
tags:
- armv7
- debian

View File

@ -6,6 +6,13 @@ Changes for 0.4.0 'Cheetah':
- SSE2 and ARM64 optimizations for MSAC
- Improve speed on 32bits systems
- Optimization in obmc blend
- Reduce RAM usage significantly
- The initial PPC SIMD code, cdef_filter
- NEON optimizations for blend functions on ARM
- NEON optimizations for w_mask functions on ARM
- NEON optimizations for inverse transforms on ARM64
- Improve handling of malloc failures
- Simple Player example in tools
Changes for 0.3.1 'Sailfish':

1087
third_party/dav1d/examples/dav1dplay.c vendored Normal file

File diff suppressed because it is too large Load Diff

62
third_party/dav1d/examples/meson.build vendored Normal file
View File

@ -0,0 +1,62 @@
# Copyright © 2018, VideoLAN and dav1d authors
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
# ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
# Build definition for the dav1d examples
#
# Leave subdir if examples are disabled
if not get_option('enable_examples')
subdir_done()
endif
# dav1d player sources
dav1dplay_sources = files(
'dav1dplay.c',
)
sdl2_dependency = dependency('sdl2', version: '>= 2.0.1', required: false)
if sdl2_dependency.found()
placebo_dependency = dependency('libplacebo', version: '>= 1.18.0', required: false)
vulkan_dependency = dependency('vulkan', required: false)
sdl_has_vulkan = cc.has_header('SDL_vulkan.h', dependencies: [sdl2_dependency])
cflag_placebo = []
deps_placebo = []
if placebo_dependency.found() and vulkan_dependency.found() and sdl_has_vulkan
cflag_placebo += '-DHAVE_PLACEBO_VULKAN=1'
deps_placebo = [vulkan_dependency, placebo_dependency]
endif
dav1dplay = executable('dav1dplay',
dav1dplay_sources,
rev_target,
link_with : [libdav1d, dav1d_input_objs],
include_directories : [dav1d_inc_dirs],
dependencies : [getopt_dependency, sdl2_dependency, deps_placebo],
install : true,
c_args : cflag_placebo,
)
endif

View File

@ -46,7 +46,7 @@
/* x86-64 needs 32-byte alignment for AVX2. */
#define ALIGN_32_VAL 32
#define ALIGN_16_VAL 16
#elif ARCH_X86_32 || ARCH_ARM || ARCH_AARCH64
#elif ARCH_X86_32 || ARCH_ARM || ARCH_AARCH64 || ARCH_PPC64LE
/* ARM doesn't benefit from anything more than 16-byte alignment. */
#define ALIGN_32_VAL 16
#define ALIGN_16_VAL 16
@ -92,6 +92,14 @@
#define NOINLINE __attribute__((noinline))
#endif /* !_MSC_VER */
#if defined(NDEBUG) && (defined(__GNUC__) || defined(__clang__))
#define assert(x) do { if (!(x)) __builtin_unreachable(); } while (0)
#elif defined(NDEBUG) && defined(_MSC_VER)
#define assert __assume
#else
#include <assert.h>
#endif
#if defined(__GNUC__) && !defined(__INTEL_COMPILER) && !defined(__clang__)
# define dav1d_uninit(x) x=x
#else

View File

@ -40,6 +40,14 @@ static inline int imin(const int a, const int b) {
return a < b ? a : b;
}
static inline unsigned umax(const unsigned a, const unsigned b) {
return a > b ? a : b;
}
static inline unsigned umin(const unsigned a, const unsigned b) {
return a < b ? a : b;
}
static inline int iclip(const int v, const int min, const int max) {
return v < min ? min : v > max ? max : v;
}

View File

@ -28,13 +28,14 @@
#ifndef DAV1D_COMMON_MEM_H
#define DAV1D_COMMON_MEM_H
#include <assert.h>
#include <stdlib.h>
#if defined(HAVE_ALIGNED_MALLOC) || defined(HAVE_MEMALIGN)
#include <malloc.h>
#endif
#include "common/attributes.h"
/*
* Allocate 32-byte aligned memory. The return value can be released
* by calling the standard free() function.

View File

@ -28,6 +28,8 @@
#ifndef DAV1D_HEADERS_H
#define DAV1D_HEADERS_H
#include <stddef.h>
// Constants from Section 3. "Symbols and abbreviated terms"
#define DAV1D_MAX_CDEF_STRENGTHS 8
#define DAV1D_MAX_OPERATING_POINTS 32
@ -176,6 +178,13 @@ typedef struct Dav1dMasteringDisplay {
uint32_t min_luminance;
} Dav1dMasteringDisplay;
typedef struct Dav1dITUTT35 {
uint8_t country_code;
uint8_t country_code_extension_byte;
size_t payload_size;
uint8_t *payload;
} Dav1dITUTT35;
typedef struct Dav1dSequenceHeader {
/**
* Stream profile, 0 for 8-10 bits/component 4:2:0 or monochrome;
@ -289,7 +298,7 @@ typedef struct Dav1dLoopfilterModeRefDeltas {
} Dav1dLoopfilterModeRefDeltas;
typedef struct Dav1dFilmGrainData {
uint16_t seed;
unsigned seed;
int num_y_points;
uint8_t y_points[14][2 /* value, scaling */];
int chroma_scaling_from_luma;

View File

@ -77,9 +77,16 @@ typedef struct Dav1dPicture {
* this picture, as defined in section 5.8.4 and 6.7.4
*/
Dav1dMasteringDisplay *mastering_display;
/**
* ITU-T T.35 metadata as defined in section 5.8.2 and 6.7.2
*/
Dav1dITUTT35 *itut_t35;
uintptr_t reserved[4]; ///< reserved for future use
struct Dav1dRef *frame_hdr_ref, *seq_hdr_ref; ///< Frame parameter allocation origins
struct Dav1dRef *content_light_ref, *mastering_display_ref; ///< Metadata allocation origins
struct Dav1dRef *content_light_ref, *mastering_display_ref, *itut_t35_ref; ///< Metadata allocation origins
uintptr_t reserved_ref[4]; ///< reserved for future use
struct Dav1dRef *ref; ///< Frame data allocation origin
void *allocator_data; ///< pointer managed by the allocator

View File

@ -23,14 +23,14 @@
# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
project('dav1d', ['c'],
version: '0.3.1',
version: '0.4.0',
default_options: ['c_std=c99',
'warning_level=2',
'buildtype=release',
'b_ndebug=if-release'],
meson_version: '>= 0.47.0')
dav1d_soname_version = '2.0.0'
dav1d_soname_version = '3.0.0'
dav1d_api_version_array = dav1d_soname_version.split('.')
dav1d_api_version_major = dav1d_api_version_array[0]
dav1d_api_version_minor = dav1d_api_version_array[1]
@ -88,6 +88,11 @@ optional_arguments = []
test_args += '-D_POSIX_C_SOURCE=200112L'
add_project_arguments('-D_POSIX_C_SOURCE=200112L', language: 'c')
if host_machine.system() == 'darwin'
test_args += '-D_DARWIN_C_SOURCE'
add_project_arguments('-D_DARWIN_C_SOURCE', language: 'c')
endif
if host_machine.system() == 'windows'
cdata.set('_WIN32_WINNT', '0x0601')
cdata.set('UNICODE', 1) # Define to 1 for Unicode (Wide Chars) APIs
@ -389,4 +394,6 @@ subdir('src')
subdir('tools')
subdir('examples')
subdir('tests')

View File

@ -15,6 +15,11 @@ option('enable_tools',
value: true,
description: 'Build dav1d cli tools')
option('enable_examples',
type: 'boolean',
value: false,
description: 'Build dav1d examples')
option('enable_tests',
type: 'boolean',
value: true,

View File

@ -91,6 +91,7 @@ function \type\()_8bpc_neon, export=1
\type d16, d17, q0, q1, q2, q3
add r12, r12, r4
bx r12
.align 2
L(\type\()_tbl):
.word 1280f - L(\type\()_tbl) + CONFIG_THUMB
@ -99,6 +100,7 @@ L(\type\()_tbl):
.word 160f - L(\type\()_tbl) + CONFIG_THUMB
.word 80f - L(\type\()_tbl) + CONFIG_THUMB
.word 4f - L(\type\()_tbl) + CONFIG_THUMB
4:
add r6, r0, r1
lsl r1, r1, #1
@ -217,17 +219,17 @@ bidir_fn mask
.macro w_mask_fn type
function w_mask_\type\()_8bpc_neon, export=1
push {r4-r10,lr}
ldr r4, [sp, #32]
ldr r5, [sp, #36]
ldr r6, [sp, #40]
ldr r7, [sp, #44]
push {r4-r9,lr}
ldr r4, [sp, #28]
ldr r5, [sp, #32]
ldr r6, [sp, #36]
ldr r7, [sp, #40]
clz r8, r4
adr r9, L(w_mask_\type\()_tbl)
sub r8, r8, #24
ldr r8, [r9, r8, lsl #2]
add r9, r9, r8
mov r12, #6903
movw r12, #6903
vdup.16 q14, r12
.if \type == 444
vmov.i8 q15, #64
@ -243,6 +245,7 @@ function w_mask_\type\()_8bpc_neon, export=1
add r12, r0, r1
lsl r1, r1, #1
bx r9
.align 2
L(w_mask_\type\()_tbl):
.word 1280f - L(w_mask_\type\()_tbl) + CONFIG_THUMB
@ -251,9 +254,10 @@ L(w_mask_\type\()_tbl):
.word 160f - L(w_mask_\type\()_tbl) + CONFIG_THUMB
.word 8f - L(w_mask_\type\()_tbl) + CONFIG_THUMB
.word 4f - L(w_mask_\type\()_tbl) + CONFIG_THUMB
4:
vld1.16 {d0, d1, d2, d3}, [r2]! // tmp1 (four rows at once)
vld1.16 {d4, d5, d6, d7}, [r3]! // tmp2 (four rows at once)
vld1.16 {d0, d1, d2, d3}, [r2, :128]! // tmp1 (four rows at once)
vld1.16 {d4, d5, d6, d7}, [r3, :128]! // tmp2 (four rows at once)
subs r5, r5, #4
vsub.i16 q8, q2, q0 // tmp2-tmp1
vsub.i16 q9, q3, q1
@ -275,30 +279,30 @@ L(w_mask_\type\()_tbl):
vmovn.u16 d20, q10 // 64 - m
vmovn.u16 d21, q11
vsub.i8 q10, q15, q10 // m
vst1.8 {d20, d21}, [r6]!
vst1.8 {d20, d21}, [r6, :128]!
.elseif \type == 422
vpadd.s16 d20, d20, d21 // (64 - m) + (64 - n) (column wise addition)
vpadd.s16 d21, d22, d23
vmovn.s16 d6, q10
vhsub.u8 d6, d30, d6 // ((129 - sign) - ((64 - m) + (64 - n))) >> 1
vst1.8 {d6}, [r6]!
vst1.8 {d6}, [r6, :64]!
.elseif \type == 420
vadd.s16 d20, d20, d21 // (64 - my1) + (64 - my2) (row wise addition)
vadd.s16 d21, d22, d23
vpadd.s16 d20, d20, d21 // (128 - m) + (128 - n) (column wise addition)
vsub.s16 d20, d30, d20 // (256 - sign) - ((128 - m) + (128 - n))
vrshrn.u16 d20, q10, #2 // ((256 - sign) - ((128 - m) + (128 - n)) + 2) >> 2
vst1.32 {d20[0]}, [r6]!
vst1.32 {d20[0]}, [r6, :32]!
.endif
vst1.32 {d24[0]}, [r0], r1
vst1.32 {d24[1]}, [r12], r1
vst1.32 {d25[0]}, [r0], r1
vst1.32 {d25[1]}, [r12], r1
vst1.32 {d24[0]}, [r0, :32], r1
vst1.32 {d24[1]}, [r12, :32], r1
vst1.32 {d25[0]}, [r0, :32], r1
vst1.32 {d25[1]}, [r12, :32], r1
bgt 4b
pop {r4-r10,pc}
pop {r4-r9,pc}
8:
vld1.16 {d0, d1, d2, d3}, [r2]! // tmp1y1, tmp1y2
vld1.16 {d4, d5, d6, d7}, [r3]! // tmp2y1, tmp2y2
vld1.16 {d0, d1, d2, d3}, [r2, :128]! // tmp1y1, tmp1y2
vld1.16 {d4, d5, d6, d7}, [r3, :128]! // tmp2y1, tmp2y2
subs r5, r5, #2
vsub.i16 q8, q2, q0 // tmp2y1 - tmp1y1
vsub.i16 q9, q3, q1 // tmp2y2 - tmp1y2
@ -320,43 +324,42 @@ L(w_mask_\type\()_tbl):
vmovn.u16 d20, q10 // 64 - m
vmovn.u16 d21, q11
vsub.i8 q10, q15, q10 // m
vst1.8 {d20, d21}, [r6]!
vst1.8 {d20, d21}, [r6, :128]!
.elseif \type == 422
vpadd.s16 d20, d20, d21 // (64 - my1) + (64 - ny1) (column wise addition)
vpadd.s16 d21, d22, d23 // (64 - my2) + (64 - ny2)
vmovn.s16 d20, q10
vhsub.u8 d20, d30, d20 // ((129 - sign) - ((64 - my1/y2) + (64 - ny1/y2))) >> 1
vst1.8 {d20}, [r6]!
vst1.8 {d20}, [r6, :64]!
.elseif \type == 420
vadd.s16 q10, q10, q11 // (64 - my1) + (64 - my2) (row wise addition)
vpadd.s16 d20, d20, d21 // (128 - m) + (128 - n) (column wise addition)
vsub.s16 d20, d30, d20 // (256 - sign) - ((128 - m) + (128 - n))
vrshrn.u16 d20, q10, #2 // ((256 - sign) - ((128 - m) + (128 - n)) + 2) >> 2
vst1.32 {d20[0]}, [r6]!
vst1.32 {d20[0]}, [r6, :32]!
.endif
vst1.16 {d24}, [r0], r1
vst1.16 {d25}, [r12], r1
vst1.16 {d24}, [r0, :64], r1
vst1.16 {d25}, [r12, :64], r1
bgt 8b
pop {r4-r10,pc}
pop {r4-r9,pc}
1280:
640:
320:
160:
sub r1, r1, r4
.if \type == 444
add r10, r6, r4
add lr, r6, r4
.elseif \type == 422
add r10, r6, r4, lsr #1
add lr, r6, r4, lsr #1
.endif
mov lr, r7
add r9, r3, r4, lsl #1
add r7, r2, r4, lsl #1
161:
mov r8, r4
16:
vld1.16 {d0, d1, d2, d3}, [r2]! // tmp1y1
vld1.16 {d4, d5, d6, d7}, [r3]! // tmp2y1
vld1.16 {d16, d17, d18, d19}, [r7]! // tmp1y2
vld1.16 {d0, d1, d2, d3}, [r2, :128]! // tmp1y1
vld1.16 {d4, d5, d6, d7}, [r3, :128]! // tmp2y1
vld1.16 {d16, d17, d18, d19}, [r7, :128]! // tmp1y2
subs r8, r8, #16
vsub.i16 q2, q2, q0 // tmp2y1 - tmp1y1
vsub.i16 q3, q3, q1
@ -372,24 +375,24 @@ L(w_mask_\type\()_tbl):
vqdmulh.s16 q13, q13, q3
vadd.i16 q12, q12, q0 // (((tmp2y1 - tmp1y1) * (64 - my1) << 9) >> 15) + tmp1y1
vadd.i16 q13, q13, q1
vld1.16 {d0, d1, d2, d3}, [r9]! // tmp2h2
vld1.16 {d0, d1, d2, d3}, [r9, :128]! // tmp2h2
.if \type == 444
vmovn.u16 d20, q10 // 64 - my1
vmovn.u16 d21, q11
vsub.i8 q10, q15, q10 // my1
vst1.8 {d20, d21}, [r6]!
vst1.8 {d20, d21}, [r6, :128]!
.elseif \type == 422
vpadd.s16 d20, d20, d21 // (64 - my1) + (64 - ny1) (column wise addition)
vpadd.s16 d21, d22, d23
vmovn.s16 d20, q10
vhsub.u8 d20, d30, d20 // ((129 - sign) - ((64 - my1) + (64 - ny1))) >> 1
vst1.8 {d20}, [r6]!
vst1.8 {d20}, [r6, :64]!
.endif
vqrshrun.s16 d24, q12, #4 // (((((tmp2y1 - tmp1y1)*(64 - my1) << 9) >> 15) + tmp1y1) + 8) >> 4
vqrshrun.s16 d25, q13, #4
vsub.i16 q0, q0, q8 // tmp2y2 - tmp1y2
vsub.i16 q1, q1, q9
vst1.16 {d24, d25}, [r0]! // store dsty1
vst1.16 {d24, d25}, [r0, :128]! // store dsty1
vabs.s16 q2, q0 // abs(tmp2y2 - tmp1y2)
vabs.s16 q3, q1
vqsub.u16 q2, q14, q2 // 6903 - abs(tmp2y2 - tmp1y2)
@ -402,13 +405,13 @@ L(w_mask_\type\()_tbl):
vmovn.u16 d4, q2 // 64 - my2
vmovn.u16 d5, q3
vsub.i8 q2, q15, q2 // my2
vst1.8 {d4, d5}, [r10]!
vst1.8 {d4, d5}, [lr, :128]!
.elseif \type == 422
vpadd.s16 d4, d4, d5 // (64 - my2) + (64 - ny2) (column wise addition)
vpadd.s16 d5, d6, d7
vmovn.s16 d4, q2
vhsub.u8 d4, d30, d4 // ((129 - sign) - ((64 - my2) + (64 - ny2))) >> 1
vst1.8 {d4}, [r10]!
vst1.8 {d4}, [lr, :64]!
.elseif \type == 420
vadd.s16 q10, q10, q2 // (64 - my1) + (64 - my2) (row wise addition)
vadd.s16 q11, q11, q3
@ -416,7 +419,7 @@ L(w_mask_\type\()_tbl):
vpadd.s16 d21, d22, d23
vsub.s16 q10, q15, q10 // (256 - sign) - ((128 - m) + (128 - n))
vrshrn.u16 d20, q10, #2 // ((256 - sign) - ((128 - m) + (128 - n)) + 2) >> 2
vst1.8 {d20}, [r6]!
vst1.8 {d20}, [r6, :64]!
.endif
vqdmulh.s16 q12, q12, q0 // ((tmp2y2 - tmp1y2) * (64 - my2) << 9) >> 15
vqdmulh.s16 q13, q13, q1
@ -424,7 +427,7 @@ L(w_mask_\type\()_tbl):
vadd.i16 q13, q13, q9
vqrshrun.s16 d24, q12, #4 // (((((tmp2y2 - tmp1y2)*(64 - my2) << 9) >> 15) + tmp1y2) + 8) >> 4
vqrshrun.s16 d25, q13, #4
vst1.16 {d24, d25}, [r12]! // store dsty2
vst1.16 {d24, d25}, [r12, :128]! // store dsty2
bgt 16b
subs r5, r5, #2
add r2, r2, r4, lsl #1
@ -433,15 +436,15 @@ L(w_mask_\type\()_tbl):
add r9, r9, r4, lsl #1
.if \type == 444
add r6, r6, r4
add r10, r10, r4
add lr, lr, r4
.elseif \type == 422
add r6, r6, r4, lsr #1
add r10, r10, r4, lsr #1
add lr, lr, r4, lsr #1
.endif
add r0, r0, r1
add r12, r12, r1
bgt 161b
pop {r4-r10,pc}
pop {r4-r9,pc}
endfunc
.endm
@ -451,15 +454,16 @@ w_mask_fn 420
function blend_8bpc_neon, export=1
push {r4-r8,lr}
ldr r4, [sp, #24]
ldr r5, [sp, #28]
clz r6, r3
adr r7, L(blend_tbl)
sub r6, r6, #26
ldr r6, [r7, r6, lsl #2]
add r7, r7, r6
bx r7
push {r4-r5,lr}
ldr r4, [sp, #12]
ldr r5, [sp, #16]
clz lr, r3
adr r3, L(blend_tbl)
sub lr, lr, #26
ldr lr, [r3, lr, lsl #2]
add r3, r3, lr
bx r3
.align 2
L(blend_tbl):
.word 320f - L(blend_tbl) + CONFIG_THUMB
@ -472,33 +476,29 @@ L(blend_tbl):
add r12, r0, r1
lsl r1, r1, #1
4:
vld1.32 {d2[]}, [r5], r3
vld1.32 {d1[]}, [r2], r3
vld1.32 {d0[]}, [r0]
vld1.u8 {d2}, [r5, :64]!
vld1.u8 {d1}, [r2, :64]!
vld1.32 {d0[]}, [r0, :32]
subs r4, r4, #2
vld1.32 {d2[1]}, [r5], r3
vld1.32 {d1[1]}, [r2], r3
vld1.32 {d0[1]}, [r12]
vld1.32 {d0[1]}, [r12, :32]
vsub.i8 d3, d22, d2
vmull.u8 q8, d1, d2
vmlal.u8 q8, d0, d3
vrshrn.i16 d20, q8, #6
vst1.32 {d20[0]}, [r0], r1
vst1.32 {d20[1]}, [r12], r1
vst1.32 {d20[0]}, [r0, :32], r1
vst1.32 {d20[1]}, [r12, :32], r1
bgt 4b
pop {r4-r8,pc}
pop {r4-r5,pc}
80:
vmov.i8 d16, #64
add r12, r0, r1
lsl r1, r1, #1
8:
vld1.u8 {d2}, [r5], r3
vld1.u8 {d4}, [r2], r3
vld1.u8 {d0}, [r0]
vld1.u8 {q1}, [r5, :128]!
vld1.u8 {q2}, [r2, :128]!
vld1.u8 {d0}, [r0, :64]
vsub.i8 d17, d16, d2
vld1.u8 {d3}, [r5], r3
vld1.u8 {d5}, [r2], r3
vld1.u8 {d1}, [r12]
vld1.u8 {d1}, [r12, :64]
subs r4, r4, #2
vsub.i8 d18, d16, d3
vmull.u8 q3, d2, d4
@ -507,47 +507,44 @@ L(blend_tbl):
vmlal.u8 q10, d1, d18
vrshrn.i16 d22, q3, #6
vrshrn.i16 d23, q10, #6
vst1.u8 {d22}, [r0], r1
vst1.u8 {d23}, [r12], r1
vst1.u8 {d22}, [r0, :64], r1
vst1.u8 {d23}, [r12, :64], r1
bgt 8b
pop {r4-r8,pc}
pop {r4-r5,pc}
160:
vmov.i8 q12, #64
add r12, r0, r1
lsl r1, r1, #1
16:
vld1.u8 {q2}, [r5], r3
vld1.u8 {q1}, [r2], r3
vld1.u8 {q0}, [r0]
vld1.u8 {q1, q2}, [r5, :128]!
vld1.u8 {q8, q9}, [r2, :128]!
vld1.u8 {q0}, [r0, :128]
subs r4, r4, #2
vsub.i8 q11, q12, q2
vld1.u8 {q15}, [r5], r3
vld1.u8 {q14}, [r2], r3
vld1.u8 {q13}, [r12]
vmull.u8 q3, d2, d4
vmlal.u8 q3, d0, d22
vmull.u8 q8, d3, d5
vmlal.u8 q8, d1, d23
vsub.i8 q11, q12, q15
vrshrn.i16 d18, q3, #6
vrshrn.i16 d19, q8, #6
vmull.u8 q3, d28, d30
vmlal.u8 q3, d26, d22
vmull.u8 q8, d29, d31
vmlal.u8 q8, d27, d23
vsub.i8 q15, q12, q1
vld1.u8 {q13}, [r12, :128]
vmull.u8 q3, d16, d2
vmlal.u8 q3, d0, d30
vmull.u8 q14, d17, d3
vmlal.u8 q14, d1, d31
vsub.i8 q15, q12, q2
vrshrn.i16 d20, q3, #6
vrshrn.i16 d21, q8, #6
vst1.u8 {q9}, [r0], r1
vst1.u8 {q10}, [r12], r1
vrshrn.i16 d21, q14, #6
vmull.u8 q3, d18, d4
vmlal.u8 q3, d26, d30
vmull.u8 q14, d19, d5
vmlal.u8 q14, d27, d31
vrshrn.i16 d22, q3, #6
vrshrn.i16 d23, q14, #6
vst1.u8 {q10}, [r0, :128], r1
vst1.u8 {q11}, [r12, :128], r1
bgt 16b
pop {r4-r8,pc}
pop {r4-r5,pc}
320:
vmov.i8 q10, #64
32:
vld1.u8 {q2, q3}, [r5], r3
vld1.u8 {q8, q9}, [r2], r3
vld1.u8 {q0, q1}, [r0]
vld1.u8 {q2, q3}, [r5, :128]!
vld1.u8 {q8, q9}, [r2, :128]!
vld1.u8 {q0, q1}, [r0, :128]
subs r4, r4, #1
vsub.i8 q11, q10, q2
vmull.u8 q15, d16, d4
@ -563,9 +560,9 @@ L(blend_tbl):
vmlal.u8 q14, d3, d23
vrshrn.i16 d26, q15, #6
vrshrn.i16 d27, q14, #6
vst1.u8 {q12, q13}, [r0], r1
vst1.u8 {q12, q13}, [r0, :128], r1
bgt 32b
pop {r4-r8,pc}
pop {r4-r5,pc}
endfunc
function blend_h_8bpc_neon, export=1
@ -580,6 +577,7 @@ function blend_h_8bpc_neon, export=1
ldr r6, [r7, r6, lsl #2]
add r7, r7, r6
bx r7
.align 2
L(blend_h_tbl):
.word 1280f - L(blend_h_tbl) + CONFIG_THUMB
@ -595,19 +593,18 @@ L(blend_h_tbl):
add r12, r0, r1
lsl r1, r1, #1
2:
vld1.16 {d2[], d3[]}, [r5]!
vld1.16 {d1[]}, [r2], r3
vld1.16 {d2[], d3[]}, [r5, :16]!
vld1.32 {d1[0]}, [r2, :32]!
subs r4, r4, #2
vld1.16 {d0[]}, [r0]
vld1.16 {d0[]}, [r0, :16]
vzip.8 d2, d3
vld1.16 {d1[1]}, [r2], r3
vsub.i8 d4, d22, d2
vld1.16 {d0[1]}, [r12]
vld1.16 {d0[1]}, [r12, :16]
vmull.u8 q8, d1, d2
vmlal.u8 q8, d0, d4
vrshrn.i16 d20, q8, #6
vst1.16 {d20[0]}, [r0], r1
vst1.16 {d20[1]}, [r12], r1
vst1.16 {d20[0]}, [r0, :16], r1
vst1.16 {d20[1]}, [r12, :16], r1
bgt 2b
pop {r4-r8,pc}
40:
@ -615,74 +612,66 @@ L(blend_h_tbl):
add r12, r0, r1
lsl r1, r1, #1
4:
vld1.u8 {d2[]}, [r5]!
vld1.32 {d1[]}, [r2], r3
vld2.u8 {d2[], d3[]}, [r5, :16]!
vld1.u8 {d1}, [r2, :64]!
subs r4, r4, #2
vld1.u8 {d6[]}, [r5]!
vld1.32 {d1[1]}, [r2], r3
vext.u8 d2, d2, d6, #4
vld1.32 {d0[]}, [r0]
vsub.i8 d3, d22, d2
vld1.32 {d0[1]}, [r12]
vext.u8 d2, d2, d3, #4
vld1.32 {d0[]}, [r0, :32]
vsub.i8 d6, d22, d2
vld1.32 {d0[1]}, [r12, :32]
vmull.u8 q8, d1, d2
vmlal.u8 q8, d0, d3
vmlal.u8 q8, d0, d6
vrshrn.i16 d20, q8, #6
vst1.32 {d20[0]}, [r0], r1
vst1.32 {d20[1]}, [r12], r1
vst1.32 {d20[0]}, [r0, :32], r1
vst1.32 {d20[1]}, [r12, :32], r1
bgt 4b
pop {r4-r8,pc}
80:
vmov.i8 d16, #64
vmov.i8 q8, #64
add r12, r0, r1
lsl r1, r1, #1
8:
vld1.u8 {d2[]}, [r5]!
vld1.u8 {d4}, [r2], r3
vld1.u8 {d0}, [r0]
vsub.i8 d17, d16, d2
vld1.u8 {d3[]}, [r5]!
vld1.u8 {d5}, [r2], r3
vld1.u8 {d1}, [r12]
vld2.u8 {d2[], d3[]}, [r5, :16]!
vld1.u8 {d4, d5}, [r2, :128]!
vld1.u8 {d0}, [r0, :64]
vsub.i8 q9, q8, q1
vld1.u8 {d1}, [r12, :64]
subs r4, r4, #2
vsub.i8 d18, d16, d3
vmull.u8 q3, d2, d4
vmlal.u8 q3, d0, d17
vmlal.u8 q3, d0, d18
vmull.u8 q10, d3, d5
vmlal.u8 q10, d1, d18
vmlal.u8 q10, d1, d19
vrshrn.i16 d22, q3, #6
vrshrn.i16 d23, q10, #6
vst1.u8 {d22}, [r0], r1
vst1.u8 {d23}, [r12], r1
vst1.u8 {d22}, [r0, :64], r1
vst1.u8 {d23}, [r12, :64], r1
bgt 8b
pop {r4-r8,pc}
160:
vmov.i8 d24, #64
vmov.i8 q12, #64
add r12, r0, r1
lsl r1, r1, #1
16:
vld1.u8 {d4[]}, [r5]!
vld1.u8 {q1}, [r2], r3
vsub.i8 d5, d24, d4
vld1.u8 {q0}, [r0]
vld2.u8 {d28[], d29[]}, [r5, :16]!
vld1.u8 {d2, d3, d4, d5}, [r2, :128]!
vsub.i8 q15, q12, q14
vld1.u8 {q0}, [r0, :128]
subs r4, r4, #2
vld1.u8 {d30[]}, [r5]!
vld1.u8 {q14}, [r2], r3
vsub.i8 d31, d24, d30
vld1.u8 {q13}, [r12]
vmull.u8 q3, d2, d4
vmlal.u8 q3, d0, d5
vmull.u8 q8, d3, d4
vmlal.u8 q8, d1, d5
vld1.u8 {q13}, [r12, :128]
vmull.u8 q3, d2, d28
vmlal.u8 q3, d0, d30
vmull.u8 q8, d3, d28
vmlal.u8 q8, d1, d30
vrshrn.i16 d18, q3, #6
vrshrn.i16 d19, q8, #6
vmull.u8 q3, d28, d30
vmull.u8 q3, d4, d29
vmlal.u8 q3, d26, d31
vmull.u8 q8, d29, d30
vmull.u8 q8, d5, d29
vmlal.u8 q8, d27, d31
vrshrn.i16 d20, q3, #6
vrshrn.i16 d21, q8, #6
vst1.u8 {q9}, [r0], r1
vst1.u8 {q10}, [r12], r1
vst1.u8 {q9}, [r0, :128], r1
vst1.u8 {q10}, [r12, :128], r1
bgt 16b
pop {r4-r8,pc}
320:
@ -695,8 +684,8 @@ L(blend_h_tbl):
vsub.i8 d7, d20, d6
mov r8, r3
32:
vld1.u8 {q8, q9}, [r2]!
vld1.u8 {q0, q1}, [r0]
vld1.u8 {q8, q9}, [r2, :128]!
vld1.u8 {q0, q1}, [r0, :128]
vmull.u8 q15, d16, d6
vmlal.u8 q15, d0, d7
vmull.u8 q14, d17, d6
@ -709,7 +698,7 @@ L(blend_h_tbl):
vmlal.u8 q14, d3, d7
vrshrn.i16 d2, q15, #6
vrshrn.i16 d3, q14, #6
vst1.u8 {q0, q1}, [r0]!
vst1.u8 {q0, q1}, [r0, :128]!
subs r8, r8, #32
bgt 32b
add r0, r0, r1
@ -719,16 +708,17 @@ L(blend_h_tbl):
endfunc
function blend_v_8bpc_neon, export=1
push {r4-r8,lr}
ldr r4, [sp, #24]
push {r4-r5,lr}
ldr r4, [sp, #12]
movrel r5, X(obmc_masks)
add r5, r5, r3
clz r8, r3
adr r7, L(blend_v_tbl)
sub r8, r8, #26
ldr r8, [r7, r8, lsl #2]
add r7, r7, r8
bx r7
clz lr, r3
adr r3, L(blend_v_tbl)
sub lr, lr, #26
ldr lr, [r3, lr, lsl #2]
add r3, r3, lr
bx r3
.align 2
L(blend_v_tbl):
.word 320f - L(blend_v_tbl) + CONFIG_THUMB
@ -744,54 +734,53 @@ L(blend_v_tbl):
lsl r1, r1, #1
vsub.i8 d3, d22, d2
2:
vld1.8 {d1[]}, [r2], r3
vld1.16 {d1[0]}, [r2, :16]!
vld1.8 {d0[]}, [r0]
subs r4, r4, #2
vld1.8 {d1[1]}, [r2], r3
vld1.8 {d1[1]}, [r2]
vld1.8 {d0[1]}, [r12]
vmull.u8 q2, d1, d2
vmlal.u8 q2, d0, d3
vrshrn.i16 d6, q2, #6
add r2, r2, #2
vst1.8 {d6[0]}, [r0], r1
vst1.8 {d6[1]}, [r12], r1
bgt 2b
pop {r4-r8,pc}
pop {r4-r5,pc}
40:
vmov.i8 d22, #64
vld1.32 {d4[]}, [r5]
vld1.32 {d4[]}, [r5, :32]
add r12, r0, r1
lsl r1, r1, #1
vsub.i8 d5, d22, d4
sub r1, r1, #3
4:
vld1.32 {d2[]}, [r2], r3
vld1.32 {d0[]}, [r0]
vld1.32 {d2[1]}, [r2], r3
vld1.32 {d0[1]}, [r12]
vld1.u8 {d2}, [r2, :64]!
vld1.32 {d0[]}, [r0, :32]
vld1.32 {d0[1]}, [r12, :32]
subs r4, r4, #2
vmull.u8 q3, d2, d4
vmlal.u8 q3, d0, d5
vrshrn.i16 d20, q3, #6
vst1.16 {d20[0]}, [r0]!
vst1.16 {d20[2]}, [r12]!
vst1.16 {d20[0]}, [r0, :16]!
vst1.16 {d20[2]}, [r12, :16]!
vst1.8 {d20[2]}, [r0]!
vst1.8 {d20[6]}, [r12]!
add r0, r0, r1
add r12, r12, r1
bgt 4b
pop {r4-r8,pc}
pop {r4-r5,pc}
80:
vmov.i8 d16, #64
vld1.u8 {d2}, [r5]
vld1.u8 {d2}, [r5, :64]
add r12, r0, r1
lsl r1, r1, #1
vsub.i8 d17, d16, d2
sub r1, r1, #6
8:
vld1.u8 {d4}, [r2], r3
vld1.u8 {d0}, [r0]
vld1.u8 {d5}, [r2], r3
vld1.u8 {d1}, [r12]
vld1.u8 {d4, d5}, [r2, :128]!
vld1.u8 {d0}, [r0, :64]
vld1.u8 {d1}, [r12, :64]
subs r4, r4, #2
vmull.u8 q3, d2, d4
vmlal.u8 q3, d0, d17
@ -799,55 +788,54 @@ L(blend_v_tbl):
vmlal.u8 q10, d1, d17
vrshrn.i16 d22, q3, #6
vrshrn.i16 d23, q10, #6
vst1.32 {d22[0]}, [r0]!
vst1.32 {d23[0]}, [r12]!
vst1.16 {d22[2]}, [r0]!
vst1.16 {d23[2]}, [r12]!
vst1.32 {d22[0]}, [r0, :32]!
vst1.32 {d23[0]}, [r12, :32]!
vst1.16 {d22[2]}, [r0, :16]!
vst1.16 {d23[2]}, [r12, :16]!
add r0, r0, r1
add r12, r12, r1
bgt 8b
pop {r4-r8,pc}
pop {r4-r5,pc}
160:
vmov.i8 q12, #64
vld1.u8 {q2}, [r5]
vld1.u8 {q14}, [r5, :128]
add r12, r0, r1
lsl r1, r1, #1
vsub.i8 q11, q12, q2
vsub.i8 q11, q12, q14
sub r1, r1, #12
16:
vld1.u8 {q1}, [r2], r3
vld1.u8 {q0}, [r0]
vld1.u8 {q1, q2}, [r2, :128]!
vld1.u8 {q0}, [r0, :128]
subs r4, r4, #2
vld1.u8 {q14}, [r2], r3
vld1.u8 {q13}, [r12]
vmull.u8 q3, d2, d4
vld1.u8 {q13}, [r12, :128]
vmull.u8 q3, d2, d28
vmlal.u8 q3, d0, d22
vmull.u8 q8, d3, d5
vmull.u8 q8, d3, d29
vmlal.u8 q8, d1, d23
vrshrn.i16 d18, q3, #6
vrshrn.i16 d19, q8, #6
vmull.u8 q3, d28, d4
vmull.u8 q3, d4, d28
vmlal.u8 q3, d26, d22
vmull.u8 q8, d29, d5
vmull.u8 q8, d5, d29
vmlal.u8 q8, d27, d23
vrshrn.i16 d20, q3, #6
vrshrn.i16 d21, q8, #6
vst1.u8 {d18}, [r0]!
vst1.u8 {d20}, [r12]!
vst1.32 {d19[0]}, [r0]!
vst1.32 {d21[0]}, [r12]!
vst1.u8 {d18}, [r0, :64]!
vst1.u8 {d20}, [r12, :64]!
vst1.32 {d19[0]}, [r0, :32]!
vst1.32 {d21[0]}, [r12, :32]!
add r0, r0, r1
add r12, r12, r1
bgt 16b
pop {r4-r8,pc}
pop {r4-r5,pc}
320:
vmov.i8 q10, #64
vld1.u8 {q2, q3}, [r5]
vld1.u8 {q2, q3}, [r5, :128]
vsub.i8 q11, q10, q2
vsub.i8 q12, q10, q3
32:
vld1.u8 {q8, q9}, [r2], r3
vld1.u8 {q0, q1}, [r0]
vld1.u8 {q8, q9}, [r2, :128]!
vld1.u8 {q0, q1}, [r0, :128]
subs r4, r4, #1
vmull.u8 q15, d16, d4
vmlal.u8 q15, d0, d22
@ -858,9 +846,9 @@ L(blend_v_tbl):
vmull.u8 q15, d18, d6
vmlal.u8 q15, d2, d24
vrshrn.i16 d2, q15, #6
vst1.u8 {d0, d1, d2}, [r0], r1
vst1.u8 {d0, d1, d2}, [r0, :64], r1
bgt 32b
pop {r4-r8,pc}
pop {r4-r5,pc}
endfunc

View File

@ -98,7 +98,8 @@ const idct64_coeffs, align=4
endconst
const iadst4_coeffs, align=4
.short 1321, 3803, 2482, 3344, 3344*8
// .h[4-5] can be interpreted as .s[2]
.short 1321, 3803, 2482, 3344, 3344, 0
endconst
const iadst8_coeffs, align=4
@ -147,6 +148,27 @@ endconst
.endif
.endm
.macro saddl_sz d0, d1, s0, s1, sz
saddl \d0\().4s, \s0\().4h, \s1\().4h
.ifc \sz, .8h
saddl2 \d1\().4s, \s0\().8h, \s1\().8h
.endif
.endm
.macro ssubl_sz d0, d1, s0, s1, sz
ssubl \d0\().4s, \s0\().4h, \s1\().4h
.ifc \sz, .8h
ssubl2 \d1\().4s, \s0\().8h, \s1\().8h
.endif
.endm
.macro mul_4s_sz d0, d1, s0, s1, c, sz
mul \d0\().4s, \s0\().4s, \c
.ifc \sz, .8h
mul \d1\().4s, \s1\().4s, \c
.endif
.endm
.macro scale_input sz, c, r0, r1, r2 r3, r4, r5, r6, r7
sqrdmulh \r0\sz, \r0\sz, \c
sqrdmulh \r1\sz, \r1\sz, \c
@ -499,23 +521,24 @@ endfunc
movrel x16, iadst4_coeffs
ld1 {v0.8h}, [x16]
sub v3.4h, v16.4h, v18.4h
ssubl v3.4s, v16.4h, v18.4h
smull v4.4s, v16.4h, v0.h[0]
smlal v4.4s, v18.4h, v0.h[1]
smlal v4.4s, v19.4h, v0.h[2]
smull v7.4s, v17.4h, v0.h[3]
add v3.4h, v3.4h, v19.4h
saddw v3.4s, v3.4s, v19.4h
smull v5.4s, v16.4h, v0.h[2]
smlsl v5.4s, v18.4h, v0.h[0]
smlsl v5.4s, v19.4h, v0.h[1]
add \o3\().4s, v4.4s, v5.4s
sqrdmulh \o2\().4h, v3.4h, v0.h[4]
mul \o2\().4s, v3.4s, v0.s[2]
add \o0\().4s, v4.4s, v7.4s
add \o1\().4s, v5.4s, v7.4s
sub \o3\().4s, \o3\().4s, v7.4s
rshrn \o0\().4h, \o0\().4s, #12
rshrn \o2\().4h, \o2\().4s, #12
rshrn \o1\().4h, \o1\().4s, #12
rshrn \o3\().4h, \o3\().4s, #12
.endm
@ -534,14 +557,16 @@ endfunc
movrel x16, iadst4_coeffs
ld1 {v0.8h}, [x16]
sub v3.8h, v16.8h, v18.8h
ssubl v2.4s, v16.4h, v18.4h
ssubl2 v3.4s, v16.8h, v18.8h
smull v4.4s, v16.4h, v0.h[0]
smlal v4.4s, v18.4h, v0.h[1]
smlal v4.4s, v19.4h, v0.h[2]
smull2 v5.4s, v16.8h, v0.h[0]
smlal2 v5.4s, v18.8h, v0.h[1]
smlal2 v5.4s, v19.8h, v0.h[2]
add v3.8h, v3.8h, v19.8h
saddw v2.4s, v2.4s, v19.4h
saddw2 v3.4s, v3.4s, v19.8h
smull v6.4s, v16.4h, v0.h[2]
smlsl v6.4s, v18.4h, v0.h[0]
smlsl v6.4s, v19.4h, v0.h[1]
@ -549,7 +574,8 @@ endfunc
smlsl2 v7.4s, v18.8h, v0.h[0]
smlsl2 v7.4s, v19.8h, v0.h[1]
sqrdmulh v18.8h, v3.8h, v0.h[4]
mul v18.4s, v2.4s, v0.s[2]
mul v19.4s, v3.4s, v0.s[2]
smull v2.4s, v17.4h, v0.h[3]
smull2 v3.4s, v17.8h, v0.h[3]
@ -566,6 +592,9 @@ endfunc
sub v4.4s, v4.4s, v2.4s // out3
sub v5.4s, v5.4s, v3.4s
rshrn v18.4h, v18.4s, #12
rshrn2 v18.8h, v19.4s, #12
rshrn \o0\().4h, v16.4s, #12
rshrn2 \o0\().8h, v17.4s, #12
@ -836,16 +865,25 @@ endfunc
sqsub v5\sz, v5\sz, v19\sz // t7
sqneg \o1\()\sz, \o1\()\sz // out1
add v6\sz, v2\sz, v4\sz
sub v7\sz, v2\sz, v4\sz
add v4\sz, v3\sz, v5\sz
sub v5\sz, v3\sz, v5\sz
sqrdmulh \o3\sz, v6\sz, v1.h[1] // out3
sqrdmulh \o4\sz, v7\sz, v1.h[1] // out4
sqrdmulh \o2\sz, v4\sz, v1.h[1] // out2
sqrdmulh \o5\sz, v5\sz, v1.h[1] // out5
neg \o3\()\sz, \o3\()\sz // out3
neg \o5\()\sz, \o5\()\sz // out5
movi v0.4s, #2896>>4
saddl_sz v18, v19, v2, v4, \sz // -> out3 (v19 or v20)
ssubl_sz v6, v7, v2, v4, \sz // -> out4 (v20 or v19)
ssubl_sz v20, v21, v3, v5, \sz // -> out5 (v21 or v18)
saddl_sz v4, v5, v3, v5, \sz // -> out2 (v18 or v21)
mul_4s_sz v18, v19, v18, v19, v0.s[0], \sz
mul_4s_sz v6, v7, v6, v7, v0.s[0], \sz
mul_4s_sz v20, v21, v20, v21, v0.s[0], \sz
mul_4s_sz v4, v5, v4, v5, v0.s[0], \sz
rshrn_sz v2, v18, v19, #8, \sz // out3
rshrn_sz v3, v20, v21, #8, \sz // out5
rshrn_sz \o2, v4, v5, #8, \sz // out2 (v18 or v21)
rshrn_sz \o4, v6, v7, #8, \sz // out4 (v20 or v19)
sqneg \o3\()\sz, v2\sz // out3
sqneg \o5\()\sz, v3\sz // out5
.endm
function inv_adst_8x8_neon
@ -1272,28 +1310,47 @@ endfunc
sqsub v23\sz, v25\sz, v23\sz // t7
sqneg \o3\sz, \o3\sz // out3
sqsub v24\sz, v2\sz, v21\sz // -> out8
sqadd v2\sz, v2\sz, v21\sz // -> out7
sqadd v21\sz, v26\sz, v3\sz // -> out5
sqsub v26\sz, v26\sz, v3\sz // -> out10
sqadd v3\sz, v27\sz, v20\sz // -> out6
sqsub v25\sz, v27\sz, v20\sz // -> out9
sqadd v20\sz, v22\sz, v23\sz // -> out4
sqsub v27\sz, v22\sz, v23\sz // -> out11
movi v0.4s, #2896>>4
sqrdmulh v2\sz, v2\sz, v0.h[1] // out7
sqrdmulh v4\sz, v21\sz, v0.h[1] // out5
sqrdmulh v5\sz, v25\sz, v0.h[1] // out9
sqrdmulh v6\sz, v27\sz, v0.h[1] // out11
sqrdmulh \o6\sz, v3\sz, v0.h[1] // out6
sqrdmulh \o8\sz, v24\sz, v0.h[1] // out8
sqrdmulh \o10\sz, v26\sz, v0.h[1] // out10
sqrdmulh \o4\sz, v20\sz, v0.h[1] // out4
ssubl_sz v24, v25, v2, v21, \sz // -> out8 (v24 or v23)
saddl_sz v4, v5, v2, v21, \sz // -> out7 (v23 or v24)
saddl_sz v6, v7, v26, v3, \sz // -> out5 (v21 or v26)
ssubl_sz v2, v3, v26, v3, \sz // -> out10 (v26 or v21)
neg \o7\sz, v2\sz // out7
neg \o5\sz, v4\sz // out5
neg \o9\sz, v5\sz // out9
neg \o11\sz, v6\sz // out11
mul_4s_sz v24, v25, v24, v25, v0.s[0], \sz
mul_4s_sz v4, v5, v4, v5, v0.s[0], \sz
mul_4s_sz v6, v7, v6, v7, v0.s[0], \sz
mul_4s_sz v2, v3, v2, v3, v0.s[0], \sz
rshrn_sz v24, v24, v25, #8, \sz // out8
rshrn_sz v4, v4, v5, #8, \sz // out7
rshrn_sz v5, v6, v7, #8, \sz // out5
rshrn_sz v26, v2, v3, #8, \sz // out10
saddl_sz v2, v3, v22, v23, \sz // -> out4 (v20 or v27)
ssubl_sz v6, v7, v22, v23, \sz // -> out11 (v27 or v20)
saddl_sz v22, v23, v27, v20, \sz // -> out6 (v22 or v25)
ssubl_sz v21, v25, v27, v20, \sz // -> out9 (v25 or v22)
mul_4s_sz v2, v3, v2, v3, v0.s[0], \sz
mul_4s_sz v6, v7, v6, v7, v0.s[0], \sz
mul_4s_sz v22, v23, v22, v23, v0.s[0], \sz
mul_4s_sz v21, v25, v21, v25, v0.s[0], \sz
rshrn_sz \o4, v2, v3, #8, \sz // out4
rshrn_sz v6, v6, v7, #8, \sz // out11
rshrn_sz v7, v21, v25, #8, \sz // out9
rshrn_sz \o6, v22, v23, #8, \sz // out6
.ifc \o8, v23
mov \o8\szb, v24\szb
mov \o10\szb, v26\szb
.endif
sqneg \o7\sz, v4\sz // out7
sqneg \o5\sz, v5\sz // out5
sqneg \o11\sz, v6\sz // out11
sqneg \o9\sz, v7\sz // out9
.endm
function inv_adst_8x16_neon

View File

@ -234,6 +234,635 @@ bidir_fn w_avg
bidir_fn mask
// Macro emitting w_mask_{444,422,420}_8bpc_neon: blends two 16-bit
// intermediate buffers (tmp1/tmp2) into 8-bit dst, deriving a per-pixel
// blend weight from |tmp1 - tmp2| and storing the resulting mask at full
// (\type == 444), half-horizontal (422) or quarter (420) resolution.
// Register use follows dav1d's w_mask signature (assumed — confirm
// against src/mc.h): x0 = dst, x1 = dst stride, x2 = tmp1, x3 = tmp2,
// w4 = width, w5 = height, x6 = mask output, w7 = sign.
.macro w_mask_fn type
function w_mask_\type\()_8bpc_neon, export=1
// Dispatch on log2(width) via the offset table at the end.
clz w8, w4
adr x9, L(w_mask_\type\()_tbl)
sub w8, w8, #24
ldrh w8, [x9, x8, lsl #1]
sub x9, x9, w8, uxtw
// v0 = 6903: saturation bound applied to |tmp1 - tmp2| below.
mov w10, #6903
dup v0.8h, w10
.if \type == 444
// v1 = 64: mask is stored as 64 - m.
movi v1.16b, #64
.elseif \type == 422
// v3 = 129 - sign: bias for the horizontally-pairwise-summed mask.
dup v2.8b, w7
movi v3.8b, #129
sub v3.8b, v3.8b, v2.8b
.elseif \type == 420
// v3 = 256 - sign: bias for the 2x2-summed mask.
dup v2.8h, w7
movi v3.8h, #1, lsl #8
sub v3.8h, v3.8h, v2.8h
.endif
// Process two output rows per iteration: x0/x12 point at even/odd rows.
add x12, x0, x1
lsl x1, x1, #1
br x9
// Width 4: four rows (16 pixels) per iteration.
4:
ld1 {v4.8h, v5.8h}, [x2], #32 // tmp1 (four rows at once)
ld1 {v6.8h, v7.8h}, [x3], #32 // tmp2 (four rows at once)
subs w5, w5, #4
sub v16.8h, v6.8h, v4.8h
sub v17.8h, v7.8h, v5.8h
// m = min(|tmp1 - tmp2|, 6903) >> 8 (via saturating subtract-from-bound).
sabd v18.8h, v4.8h, v6.8h
sabd v19.8h, v5.8h, v7.8h
uqsub v18.8h, v0.8h, v18.8h
uqsub v19.8h, v0.8h, v19.8h
ushr v18.8h, v18.8h, #8
ushr v19.8h, v19.8h, #8
// dst = tmp1 + ((tmp2 - tmp1) * m) via sqdmulh on (m << 9).
shl v20.8h, v18.8h, #9
shl v21.8h, v19.8h, #9
sqdmulh v20.8h, v20.8h, v16.8h
sqdmulh v21.8h, v21.8h, v17.8h
add v20.8h, v20.8h, v4.8h
add v21.8h, v21.8h, v5.8h
sqrshrun v22.8b, v20.8h, #4
sqrshrun v23.8b, v21.8h, #4
.if \type == 444
xtn v18.8b, v18.8h
xtn2 v18.16b, v19.8h
sub v18.16b, v1.16b, v18.16b
st1 {v18.16b}, [x6], #16
.elseif \type == 422
// Horizontal pairwise sum, then halving subtract from the bias.
addp v18.8h, v18.8h, v19.8h
xtn v18.8b, v18.8h
uhsub v18.8b, v3.8b, v18.8b
st1 {v18.8b}, [x6], #8
.elseif \type == 420
// 2x2 sum across the four 4-pixel rows, then round-shift by 2.
trn1 v24.2d, v18.2d, v19.2d
trn2 v25.2d, v18.2d, v19.2d
add v24.8h, v24.8h, v25.8h
addp v18.8h, v24.8h, v24.8h
sub v18.4h, v3.4h, v18.4h
rshrn v18.8b, v18.8h, #2
st1 {v18.s}[0], [x6], #4
.endif
st1 {v22.s}[0], [x0], x1
st1 {v22.s}[1], [x12], x1
st1 {v23.s}[0], [x0], x1
st1 {v23.s}[1], [x12], x1
b.gt 4b
ret
// Width 8: two rows per iteration; same arithmetic as the width-4 path.
8:
ld1 {v4.8h, v5.8h}, [x2], #32
ld1 {v6.8h, v7.8h}, [x3], #32
subs w5, w5, #2
sub v16.8h, v6.8h, v4.8h
sub v17.8h, v7.8h, v5.8h
sabd v18.8h, v4.8h, v6.8h
sabd v19.8h, v5.8h, v7.8h
uqsub v18.8h, v0.8h, v18.8h
uqsub v19.8h, v0.8h, v19.8h
ushr v18.8h, v18.8h, #8
ushr v19.8h, v19.8h, #8
shl v20.8h, v18.8h, #9
shl v21.8h, v19.8h, #9
sqdmulh v20.8h, v20.8h, v16.8h
sqdmulh v21.8h, v21.8h, v17.8h
add v20.8h, v20.8h, v4.8h
add v21.8h, v21.8h, v5.8h
sqrshrun v22.8b, v20.8h, #4
sqrshrun v23.8b, v21.8h, #4
.if \type == 444
xtn v18.8b, v18.8h
xtn2 v18.16b, v19.8h
sub v18.16b, v1.16b, v18.16b
st1 {v18.16b}, [x6], #16
.elseif \type == 422
addp v18.8h, v18.8h, v19.8h
xtn v18.8b, v18.8h
uhsub v18.8b, v3.8b, v18.8b
st1 {v18.8b}, [x6], #8
.elseif \type == 420
// Here the two vectors are the two rows, so a plain add is the 2x1 sum.
add v18.8h, v18.8h, v19.8h
addp v18.8h, v18.8h, v18.8h
sub v18.4h, v3.4h, v18.4h
rshrn v18.8b, v18.8h, #2
st1 {v18.s}[0], [x6], #4
.endif
st1 {v22.8b}, [x0], x1
st1 {v23.8b}, [x12], x1
b.gt 8b
ret
// Widths 16..128 share one two-row loop; x7/x9 walk the second row of
// tmp1/tmp2, x10 the second mask row (444/422).
1280:
640:
320:
160:
mov w11, w4
sub x1, x1, w4, uxtw
.if \type == 444
add x10, x6, w4, uxtw
.elseif \type == 422
add x10, x6, x11, lsr #1
.endif
add x9, x3, w4, uxtw #1
add x7, x2, w4, uxtw #1
161:
mov w8, w4
16:
ld1 {v4.8h, v5.8h}, [x2], #32
ld1 {v6.8h, v7.8h}, [x3], #32
ld1 {v16.8h, v17.8h}, [x7], #32
ld1 {v18.8h, v19.8h}, [x9], #32
subs w8, w8, #16
sub v6.8h, v6.8h, v4.8h
sub v7.8h, v7.8h, v5.8h
sub v18.8h, v18.8h, v16.8h
sub v19.8h, v19.8h, v17.8h
abs v20.8h, v6.8h
abs v21.8h, v7.8h
abs v22.8h, v18.8h
abs v23.8h, v19.8h
uqsub v20.8h, v0.8h, v20.8h
uqsub v21.8h, v0.8h, v21.8h
uqsub v22.8h, v0.8h, v22.8h
uqsub v23.8h, v0.8h, v23.8h
ushr v20.8h, v20.8h, #8
ushr v21.8h, v21.8h, #8
ushr v22.8h, v22.8h, #8
ushr v23.8h, v23.8h, #8
shl v24.8h, v20.8h, #9
shl v25.8h, v21.8h, #9
shl v26.8h, v22.8h, #9
shl v27.8h, v23.8h, #9
sqdmulh v24.8h, v24.8h, v6.8h
sqdmulh v25.8h, v25.8h, v7.8h
sqdmulh v26.8h, v26.8h, v18.8h
sqdmulh v27.8h, v27.8h, v19.8h
add v24.8h, v24.8h, v4.8h
add v25.8h, v25.8h, v5.8h
add v26.8h, v26.8h, v16.8h
add v27.8h, v27.8h, v17.8h
sqrshrun v24.8b, v24.8h, #4
sqrshrun v25.8b, v25.8h, #4
sqrshrun v26.8b, v26.8h, #4
sqrshrun v27.8b, v27.8h, #4
.if \type == 444
xtn v20.8b, v20.8h
xtn2 v20.16b, v21.8h
xtn v21.8b, v22.8h
xtn2 v21.16b, v23.8h
sub v20.16b, v1.16b, v20.16b
sub v21.16b, v1.16b, v21.16b
st1 {v20.16b}, [x6], #16
st1 {v21.16b}, [x10], #16
.elseif \type == 422
addp v20.8h, v20.8h, v21.8h
addp v21.8h, v22.8h, v23.8h
xtn v20.8b, v20.8h
xtn v21.8b, v21.8h
uhsub v20.8b, v3.8b, v20.8b
uhsub v21.8b, v3.8b, v21.8b
st1 {v20.8b}, [x6], #8
st1 {v21.8b}, [x10], #8
.elseif \type == 420
add v20.8h, v20.8h, v22.8h
add v21.8h, v21.8h, v23.8h
addp v20.8h, v20.8h, v21.8h
sub v20.8h, v3.8h, v20.8h
rshrn v20.8b, v20.8h, #2
st1 {v20.8b}, [x6], #8
.endif
st1 {v24.8b, v25.8b}, [x0], #16
st1 {v26.8b, v27.8b}, [x12], #16
b.gt 16b
// Advance all row pointers past the second row processed above.
subs w5, w5, #2
add x2, x2, w4, uxtw #1
add x3, x3, w4, uxtw #1
add x7, x7, w4, uxtw #1
add x9, x9, w4, uxtw #1
.if \type == 444
add x6, x6, w4, uxtw
add x10, x10, w4, uxtw
.elseif \type == 422
add x6, x6, x11, lsr #1
add x10, x10, x11, lsr #1
.endif
add x0, x0, x1
add x12, x12, x1
b.gt 161b
ret
// Offset table, indexed by clz(width) - 24 (i.e. by log2(width)).
L(w_mask_\type\()_tbl):
.hword L(w_mask_\type\()_tbl) - 1280b
.hword L(w_mask_\type\()_tbl) - 640b
.hword L(w_mask_\type\()_tbl) - 320b
.hword L(w_mask_\type\()_tbl) - 160b
.hword L(w_mask_\type\()_tbl) - 8b
.hword L(w_mask_\type\()_tbl) - 4b
endfunc
.endm
// Instantiate the three mask-resolution variants (4:4:4, 4:2:2, 4:2:0).
w_mask_fn 444
w_mask_fn 422
w_mask_fn 420
// blend_8bpc_neon: dst = (tmp*mask + dst*(64 - mask) + 32) >> 6, with a
// per-pixel 6-bit mask. Register use (assumed from dav1d's blend
// signature — confirm against src/mc.h): x0 = dst, x1 = dst stride,
// x2 = tmp, w3 = width, w4 = height, x5 = mask. Two rows per iteration
// (x0 = even rows, x8 = odd rows).
function blend_8bpc_neon, export=1
// Dispatch on log2(width) via the offset table at the end.
adr x6, L(blend_tbl)
clz w3, w3
sub w3, w3, #26
ldrh w3, [x6, x3, lsl #1]
sub x6, x6, w3, uxtw
movi v4.16b, #64 // v4 = 64: weight complement
add x8, x0, x1
lsl w1, w1, #1
br x6
// Width 4: two rows per iteration.
4:
ld1 {v2.d}[0], [x5], #8
ld1 {v1.d}[0], [x2], #8
ld1 {v0.s}[0], [x0]
subs w4, w4, #2
ld1 {v0.s}[1], [x8]
sub v3.8b, v4.8b, v2.8b
umull v5.8h, v1.8b, v2.8b
umlal v5.8h, v0.8b, v3.8b
rshrn v6.8b, v5.8h, #6
st1 {v6.s}[0], [x0], x1
st1 {v6.s}[1], [x8], x1
b.gt 4b
ret
// Width 8: two rows per iteration.
8:
ld1 {v2.2d}, [x5], #16
ld1 {v1.2d}, [x2], #16
ld1 {v0.d}[0], [x0]
ld1 {v0.d}[1], [x8]
sub v3.16b, v4.16b, v2.16b
subs w4, w4, #2
umull v5.8h, v1.8b, v2.8b
umlal v5.8h, v0.8b, v3.8b
umull2 v6.8h, v1.16b, v2.16b
umlal2 v6.8h, v0.16b, v3.16b
rshrn v7.8b, v5.8h, #6
rshrn2 v7.16b, v6.8h, #6
st1 {v7.d}[0], [x0], x1
st1 {v7.d}[1], [x8], x1
b.gt 8b
ret
// Width 16: two rows per iteration.
16:
ld1 {v1.2d, v2.2d}, [x5], #32
ld1 {v5.2d, v6.2d}, [x2], #32
ld1 {v0.2d}, [x0]
subs w4, w4, #2
sub v7.16b, v4.16b, v1.16b
sub v20.16b, v4.16b, v2.16b
ld1 {v3.2d}, [x8]
umull v16.8h, v5.8b, v1.8b
umlal v16.8h, v0.8b, v7.8b
umull2 v17.8h, v5.16b, v1.16b
umlal2 v17.8h, v0.16b, v7.16b
umull v21.8h, v6.8b, v2.8b
umlal v21.8h, v3.8b, v20.8b
umull2 v22.8h, v6.16b, v2.16b
umlal2 v22.8h, v3.16b, v20.16b
rshrn v18.8b, v16.8h, #6
rshrn2 v18.16b, v17.8h, #6
rshrn v19.8b, v21.8h, #6
rshrn2 v19.16b, v22.8h, #6
st1 {v18.2d}, [x0], x1
st1 {v19.2d}, [x8], x1
b.gt 16b
ret
// Width 32: two rows per iteration (four 16-byte mask vectors).
32:
ld1 {v0.2d, v1.2d, v2.2d, v3.2d}, [x5], #64
ld1 {v16.2d, v17.2d, v18.2d, v19.2d}, [x2], #64
ld1 {v20.2d, v21.2d}, [x0]
subs w4, w4, #2
ld1 {v22.2d, v23.2d}, [x8]
sub v5.16b, v4.16b, v0.16b
sub v6.16b, v4.16b, v1.16b
sub v30.16b, v4.16b, v2.16b
sub v31.16b, v4.16b, v3.16b
umull v24.8h, v16.8b, v0.8b
umlal v24.8h, v20.8b, v5.8b
umull2 v26.8h, v16.16b, v0.16b
umlal2 v26.8h, v20.16b, v5.16b
umull v28.8h, v17.8b, v1.8b
umlal v28.8h, v21.8b, v6.8b
umull2 v7.8h, v17.16b, v1.16b
umlal2 v7.8h, v21.16b, v6.16b
umull v27.8h, v18.8b, v2.8b
umlal v27.8h, v22.8b, v30.8b
umull2 v1.8h, v18.16b, v2.16b
umlal2 v1.8h, v22.16b, v30.16b
umull v29.8h, v19.8b, v3.8b
umlal v29.8h, v23.8b, v31.8b
umull2 v21.8h, v19.16b, v3.16b
umlal2 v21.8h, v23.16b, v31.16b
rshrn v24.8b, v24.8h, #6
rshrn2 v24.16b, v26.8h, #6
rshrn v25.8b, v28.8h, #6
rshrn2 v25.16b, v7.8h, #6
rshrn v27.8b, v27.8h, #6
rshrn2 v27.16b, v1.8h, #6
rshrn v28.8b, v29.8h, #6
rshrn2 v28.16b, v21.8h, #6
st1 {v24.2d, v25.2d}, [x0], x1
st1 {v27.2d, v28.2d}, [x8], x1
b.gt 32b
ret
// Offset table, indexed by clz(width) - 26.
L(blend_tbl):
.hword L(blend_tbl) - 32b
.hword L(blend_tbl) - 16b
.hword L(blend_tbl) - 8b
.hword L(blend_tbl) - 4b
endfunc
// blend_h_8bpc_neon: horizontal-edge (top) OBMC blend. Blends tmp into
// dst using the per-row weights from the shared obmc_masks table,
// indexed by height; only height - height/4 rows are written.
// Register use (assumed from dav1d's blend_h signature — confirm
// against src/mc.h): x0 = dst, x1 = dst stride, x2 = tmp, w3 = width,
// w4 = height. Two rows per iteration (x0 = even, x8 = odd rows).
function blend_h_8bpc_neon, export=1
adr x6, L(blend_h_tbl)
movrel x5, X(obmc_masks)
add x5, x5, w4, uxtw // x5 -> mask row for this height
sub w4, w4, w4, lsr #2 // rows to process = h - h/4
clz w7, w3
movi v4.16b, #64 // weight complement
add x8, x0, x1
lsl x1, x1, #1
sub w7, w7, #24
ldrh w7, [x6, x7, lsl #1]
sub x6, x6, w7, uxtw
br x6
// Width 2: two rows per iteration; one mask byte per row.
2:
ld1 {v0.h}[0], [x5], #2
ld1 {v1.s}[0], [x2], #4
subs w4, w4, #2
ld1 {v2.h}[0], [x0]
zip1 v0.8b, v0.8b, v0.8b // duplicate each row weight across pixels
sub v3.8b, v4.8b, v0.8b
ld1 {v2.h}[1], [x8]
umull v5.8h, v1.8b, v0.8b
umlal v5.8h, v2.8b, v3.8b
rshrn v5.8b, v5.8h, #6
st1 {v5.h}[0], [x0], x1
st1 {v5.h}[1], [x8], x1
b.gt 2b
ret
// Width 4: two rows per iteration.
4:
ld2r {v0.8b, v1.8b}, [x5], #2 // broadcast the two row weights
ld1 {v2.2s}, [x2], #8
subs w4, w4, #2
ext v0.8b, v0.8b, v1.8b, #4 // v0 = row0 weight x4 | row1 weight x4
ld1 {v3.s}[0], [x0]
sub v5.8b, v4.8b, v0.8b
ld1 {v3.s}[1], [x8]
umull v6.8h, v2.8b, v0.8b
umlal v6.8h, v3.8b, v5.8b
rshrn v6.8b, v6.8h, #6
st1 {v6.s}[0], [x0], x1
st1 {v6.s}[1], [x8], x1
b.gt 4b
ret
// Width 8: two rows per iteration.
8:
ld2r {v0.16b, v1.16b}, [x5], #2
ld1 {v2.16b}, [x2], #16
ld1 {v3.d}[0], [x0]
ext v0.16b, v0.16b, v1.16b, #8 // row0 weight x8 | row1 weight x8
sub v5.16b, v4.16b, v0.16b
ld1 {v3.d}[1], [x8]
subs w4, w4, #2
umull v6.8h, v0.8b, v2.8b
umlal v6.8h, v3.8b, v5.8b
umull2 v7.8h, v0.16b, v2.16b
umlal2 v7.8h, v3.16b, v5.16b
rshrn v16.8b, v6.8h, #6
rshrn2 v16.16b, v7.8h, #6
st1 {v16.d}[0], [x0], x1
st1 {v16.d}[1], [x8], x1
b.gt 8b
ret
// Width 16: two rows per iteration, one weight vector per row.
16:
ld2r {v0.16b, v1.16b}, [x5], #2
ld1 {v2.16b, v3.16b}, [x2], #32
ld1 {v5.16b}, [x0]
sub v7.16b, v4.16b, v0.16b
sub v16.16b, v4.16b, v1.16b
ld1 {v6.16b}, [x8]
subs w4, w4, #2
umull v17.8h, v0.8b, v2.8b
umlal v17.8h, v5.8b, v7.8b
umull2 v18.8h, v0.16b, v2.16b
umlal2 v18.8h, v5.16b, v7.16b
umull v19.8h, v1.8b, v3.8b
umlal v19.8h, v6.8b, v16.8b
umull2 v20.8h, v1.16b, v3.16b
umlal2 v20.8h, v6.16b, v16.16b
rshrn v21.8b, v17.8h, #6
rshrn2 v21.16b, v18.8h, #6
rshrn v22.8b, v19.8h, #6
rshrn2 v22.16b, v20.8h, #6
st1 {v21.16b}, [x0], x1
st1 {v22.16b}, [x8], x1
b.gt 16b
ret
// Widths 32..128 share one loop; x7 walks tmp for the odd row.
1280:
640:
320:
sub x1, x1, w3, uxtw
add x7, x2, w3, uxtw
321:
ld2r {v0.16b, v1.16b}, [x5], #2
mov w6, w3
sub v20.16b, v4.16b, v0.16b
sub v21.16b, v4.16b, v1.16b
32:
ld1 {v16.16b, v17.16b}, [x2], #32
ld1 {v2.16b, v3.16b}, [x0]
subs w6, w6, #32
umull v23.8h, v0.8b, v16.8b
umlal v23.8h, v2.8b, v20.8b
ld1 {v18.16b, v19.16b}, [x7], #32
umull2 v27.8h, v0.16b, v16.16b
umlal2 v27.8h, v2.16b, v20.16b
ld1 {v6.16b, v7.16b}, [x8]
umull v24.8h, v0.8b, v17.8b
umlal v24.8h, v3.8b, v20.8b
umull2 v28.8h, v0.16b, v17.16b
umlal2 v28.8h, v3.16b, v20.16b
umull v25.8h, v1.8b, v18.8b
umlal v25.8h, v6.8b, v21.8b
umull2 v5.8h, v1.16b, v18.16b
umlal2 v5.8h, v6.16b, v21.16b
rshrn v29.8b, v23.8h, #6
rshrn2 v29.16b, v27.8h, #6
umull v26.8h, v1.8b, v19.8b
umlal v26.8h, v7.8b, v21.8b
umull2 v31.8h, v1.16b, v19.16b
umlal2 v31.8h, v7.16b, v21.16b
rshrn v30.8b, v24.8h, #6
rshrn2 v30.16b, v28.8h, #6
rshrn v23.8b, v25.8h, #6
rshrn2 v23.16b, v5.8h, #6
rshrn v24.8b, v26.8h, #6
st1 {v29.16b, v30.16b}, [x0], #32
rshrn2 v24.16b, v31.8h, #6
st1 {v23.16b, v24.16b}, [x8], #32
b.gt 32b
subs w4, w4, #2
add x0, x0, x1
add x8, x8, x1
add x2, x2, w3, uxtw
add x7, x7, w3, uxtw
b.gt 321b
ret
// Offset table, indexed by clz(width) - 24.
L(blend_h_tbl):
.hword L(blend_h_tbl) - 1280b
.hword L(blend_h_tbl) - 640b
.hword L(blend_h_tbl) - 320b
.hword L(blend_h_tbl) - 16b
.hword L(blend_h_tbl) - 8b
.hword L(blend_h_tbl) - 4b
.hword L(blend_h_tbl) - 2b
endfunc
// blend_v_8bpc_neon: vertical-edge (left) OBMC blend. Blends tmp into
// dst using per-column weights from the shared obmc_masks table, indexed
// by width; only the leftmost width - width/4 columns are written back
// (hence the partial stores and the reduced stride adjustments).
// Register use (assumed from dav1d's blend_v signature — confirm
// against src/mc.h): x0 = dst, x1 = dst stride, x2 = tmp, w3 = width,
// w4 = height. Two rows per iteration (x0 = even, x8 = odd rows).
function blend_v_8bpc_neon, export=1
adr x6, L(blend_v_tbl)
movrel x5, X(obmc_masks)
add x5, x5, w3, uxtw // x5 -> mask entries for this width
clz w3, w3
movi v4.16b, #64 // weight complement
add x8, x0, x1
lsl x1, x1, #1
sub w3, w3, #26
ldrh w3, [x6, x3, lsl #1]
sub x6, x6, w3, uxtw
br x6
// Width 2: only column 0 is written (2 - 2/4 = 1.5 -> 1 column).
20:
ld1r {v0.8b}, [x5]
sub v1.8b, v4.8b, v0.8b
2:
ld1 {v2.h}[0], [x2], #2
ld1 {v3.b}[0], [x0]
subs w4, w4, #2
ld1 {v2.b}[1], [x2]
ld1 {v3.b}[1], [x8]
umull v5.8h, v2.8b, v0.8b
umlal v5.8h, v3.8b, v1.8b
rshrn v5.8b, v5.8h, #6
add x2, x2, #2
st1 {v5.b}[0], [x0], x1
st1 {v5.b}[1], [x8], x1
b.gt 2b
ret
// Width 4: 3 columns written per row (2-byte + 1-byte stores).
40:
ld1r {v0.2s}, [x5]
sub v1.8b, v4.8b, v0.8b
sub x1, x1, #3
4:
ld1 {v2.8b}, [x2], #8
ld1 {v3.s}[0], [x0]
ld1 {v3.s}[1], [x8]
subs w4, w4, #2
umull v5.8h, v2.8b, v0.8b
umlal v5.8h, v3.8b, v1.8b
rshrn v5.8b, v5.8h, #6
st1 {v5.h}[0], [x0], #2
st1 {v5.h}[2], [x8], #2
st1 {v5.b}[2], [x0], #1
st1 {v5.b}[6], [x8], #1
add x0, x0, x1
add x8, x8, x1
b.gt 4b
ret
// Width 8: 6 columns written per row (4-byte + 2-byte stores).
80:
ld1r {v0.2d}, [x5]
sub v1.16b, v4.16b, v0.16b
sub x1, x1, #6
8:
ld1 {v2.16b}, [x2], #16
ld1 {v3.d}[0], [x0]
ld1 {v3.d}[1], [x8]
subs w4, w4, #2
umull v5.8h, v0.8b, v2.8b
umlal v5.8h, v3.8b, v1.8b
umull2 v6.8h, v0.16b, v2.16b
umlal2 v6.8h, v3.16b, v1.16b
rshrn v7.8b, v5.8h, #6
rshrn2 v7.16b, v6.8h, #6
st1 {v7.s}[0], [x0], #4
st1 {v7.s}[2], [x8], #4
st1 {v7.h}[2], [x0], #2
st1 {v7.h}[6], [x8], #2
add x0, x0, x1
add x8, x8, x1
b.gt 8b
ret
// Width 16: 12 columns written per row (8-byte + 4-byte stores).
160:
ld1 {v0.16b}, [x5]
sub v2.16b, v4.16b, v0.16b
sub x1, x1, #12
16:
ld1 {v5.16b, v6.16b}, [x2], #32
ld1 {v7.16b}, [x0]
subs w4, w4, #2
ld1 {v16.16b}, [x8]
umull v17.8h, v5.8b, v0.8b
umlal v17.8h, v7.8b, v2.8b
umull2 v18.8h, v5.16b, v0.16b
umlal2 v18.8h, v7.16b, v2.16b
umull v20.8h, v6.8b, v0.8b
umlal v20.8h, v16.8b, v2.8b
umull2 v21.8h, v6.16b, v0.16b
umlal2 v21.8h, v16.16b, v2.16b
rshrn v19.8b, v17.8h, #6
rshrn2 v19.16b, v18.8h, #6
rshrn v22.8b, v20.8h, #6
rshrn2 v22.16b, v21.8h, #6
st1 {v19.8b}, [x0], #8
st1 {v22.8b}, [x8], #8
st1 {v19.s}[2], [x0], #4
st1 {v22.s}[2], [x8], #4
add x0, x0, x1
add x8, x8, x1
b.gt 16b
ret
// Width 32: 24 columns written per row (16-byte + 8-byte stores).
320:
ld1 {v0.16b, v1.16b}, [x5]
sub v2.16b, v4.16b, v0.16b
sub v3.16b, v4.16b, v1.16b
sub x1, x1, #24
32:
ld1 {v16.16b, v17.16b, v18.16b, v19.16b}, [x2], #64
ld1 {v5.16b, v6.16b}, [x0]
subs w4, w4, #2
ld1 {v20.16b, v21.16b}, [x8]
umull v22.8h, v16.8b, v0.8b
umlal v22.8h, v5.8b, v2.8b
umull2 v23.8h, v16.16b, v0.16b
umlal2 v23.8h, v5.16b, v2.16b
umull v28.8h, v17.8b, v1.8b
umlal v28.8h, v6.8b, v3.8b
umull2 v29.8h, v17.16b, v1.16b
umlal2 v29.8h, v6.16b, v3.16b
umull v30.8h, v18.8b, v0.8b
umlal v30.8h, v20.8b, v2.8b
umull2 v31.8h, v18.16b, v0.16b
umlal2 v31.8h, v20.16b, v2.16b
umull v25.8h, v19.8b, v1.8b
umlal v25.8h, v21.8b, v3.8b
umull2 v26.8h, v19.16b, v1.16b
umlal2 v26.8h, v21.16b, v3.16b
rshrn v24.8b, v22.8h, #6
rshrn2 v24.16b, v23.8h, #6
rshrn v28.8b, v28.8h, #6
rshrn2 v28.16b, v29.8h, #6
rshrn v30.8b, v30.8h, #6
rshrn2 v30.16b, v31.8h, #6
rshrn v27.8b, v25.8h, #6
rshrn2 v27.16b, v26.8h, #6
st1 {v24.16b}, [x0], #16
st1 {v30.16b}, [x8], #16
st1 {v28.8b}, [x0], #8
st1 {v27.8b}, [x8], #8
add x0, x0, x1
add x8, x8, x1
b.gt 32b
ret
// Offset table, indexed by clz(width) - 26.
L(blend_v_tbl):
.hword L(blend_v_tbl) - 320b
.hword L(blend_v_tbl) - 160b
.hword L(blend_v_tbl) - 80b
.hword L(blend_v_tbl) - 40b
.hword L(blend_v_tbl) - 20b
endfunc
// This has got the same signature as the put_8tap functions,
// and assumes that x8 is set to (clz(w)-24).
function put_neon

View File

@ -148,7 +148,7 @@ function msac_decode_symbol_adapt4_neon, export=1
add x8, x0, #RNG
ld1_n v0, v1, x1, \sz, \n // cdf
ld1r {v4\sz}, [x8] // rng
movrel x9, coeffs, 32
movrel x9, coeffs, 30
sub x9, x9, x2, lsl #1
ushr_n v2, v3, v0, v1, #6, \sz, \n // cdf >> EC_PROB_SHIFT
str h4, [sp, #14] // store original u = s->rng
@ -183,16 +183,24 @@ function msac_decode_symbol_adapt4_neon, export=1
// update_cdf
ldrh w3, [x1, x2, lsl #1] // count = cdf[n_symbols]
movi v5\szb, #0xff
cmp x2, #4 // set C if n_symbols >= 4 (n_symbols > 3)
mov w14, #4
lsr w4, w3, #4 // count >> 4
.if \n == 16
mov w4, #-5
.else
mvn w14, w2
mov w4, #-4
cmn w14, #3 // set C if n_symbols <= 2
.endif
urhadd_n v4, v5, v5, v5, v2, v3, \sz, \n // i >= val ? -1 : 32768
adc w4, w4, w14 // (count >> 4) + (n_symbols > 3) + 4
neg w4, w4 // -rate
.if \n == 16
sub w4, w4, w3, lsr #4 // -((count >> 4) + 5)
.else
lsr w14, w3, #4 // count >> 4
sbc w4, w4, w14 // -((count >> 4) + (n_symbols > 2) + 4)
.endif
sub_n v4, v5, v4, v5, v0, v1, \sz, \n // (32768 - cdf[i]) or (-1 - cdf[i])
dup v6.8h, w4 // -rate
sub w3, w3, w3, lsr #5 // count - (count >= 32)
sub w3, w3, w3, lsr #5 // count - (count == 32)
sub_n v0, v1, v0, v1, v2, v3, \sz, \n // cdf + (i >= val ? 1 : 0)
sshl_n v4, v5, v4, v5, v6, v6, \sz, \n // ({32768,-1} - cdf[i]) >> rate
add w3, w3, #1 // count + (count < 32)
@ -224,8 +232,7 @@ L(renorm2):
b.ge 9f
// refill
ldr x3, [x0, #BUF_POS]
ldr x4, [x0, #BUF_END]
ldp x3, x4, [x0] // BUF_POS, BUF_END
add x5, x3, #8
cmp x5, x4
b.gt 2f

View File

@ -101,16 +101,15 @@ void bitfn(dav1d_mc_dsp_init_arm)(Dav1dMCDSPContext *const c) {
c->avg = dav1d_avg_8bpc_neon;
c->w_avg = dav1d_w_avg_8bpc_neon;
c->mask = dav1d_mask_8bpc_neon;
#if ARCH_AARCH64
c->warp8x8 = dav1d_warp_affine_8x8_8bpc_neon;
c->warp8x8t = dav1d_warp_affine_8x8t_8bpc_neon;
#elif ARCH_ARM
c->blend = dav1d_blend_8bpc_neon;
c->blend_h = dav1d_blend_h_8bpc_neon;
c->blend_v = dav1d_blend_v_8bpc_neon;
c->w_mask[0] = dav1d_w_mask_444_8bpc_neon;
c->w_mask[1] = dav1d_w_mask_422_8bpc_neon;
c->w_mask[2] = dav1d_w_mask_420_8bpc_neon;
#if ARCH_AARCH64
c->warp8x8 = dav1d_warp_affine_8x8_8bpc_neon;
c->warp8x8t = dav1d_warp_affine_8x8t_8bpc_neon;
#endif
#endif
}

View File

@ -67,6 +67,7 @@ typedef struct Dav1dCdefDSPContext {
bitfn_decls(void dav1d_cdef_dsp_init, Dav1dCdefDSPContext *c);
bitfn_decls(void dav1d_cdef_dsp_init_arm, Dav1dCdefDSPContext *c);
bitfn_decls(void dav1d_cdef_dsp_init_ppc, Dav1dCdefDSPContext *c);
bitfn_decls(void dav1d_cdef_dsp_init_x86, Dav1dCdefDSPContext *c);
#endif /* DAV1D_SRC_CDEF_H */

View File

@ -27,7 +27,6 @@
#include "config.h"
#include <assert.h>
#include <stdlib.h>
#include "common/intops.h"
@ -263,6 +262,8 @@ COLD void bitfn(dav1d_cdef_dsp_init)(Dav1dCdefDSPContext *const c) {
#if HAVE_ASM
#if ARCH_AARCH64 || ARCH_ARM
bitfn(dav1d_cdef_dsp_init_arm)(c);
#elif ARCH_PPC64LE
bitfn(dav1d_cdef_dsp_init_ppc)(c);
#elif ARCH_X86
bitfn(dav1d_cdef_dsp_init_x86)(c);
#endif

File diff suppressed because it is too large Load Diff

View File

@ -37,91 +37,94 @@
/* Buffers padded to [8] or [16] for SIMD where needed. */
typedef struct CdfModeContext {
uint16_t y_mode[4][N_INTRA_PRED_MODES + 1 + 2];
uint16_t use_filter_intra[N_BS_SIZES][2];
uint16_t filter_intra[5 + 1];
uint16_t uv_mode[2][N_INTRA_PRED_MODES][N_UV_INTRA_PRED_MODES + 1 + 1];
uint16_t angle_delta[8][8];
uint16_t filter[2][8][DAV1D_N_SWITCHABLE_FILTERS + 1];
uint16_t newmv_mode[6][2];
uint16_t globalmv_mode[2][2];
uint16_t refmv_mode[6][2];
uint16_t drl_bit[3][2];
uint16_t comp_inter_mode[8][N_COMP_INTER_PRED_MODES + 1];
uint16_t intra[4][2];
uint16_t comp[5][2];
uint16_t comp_dir[5][2];
uint16_t jnt_comp[6][2];
uint16_t mask_comp[6][2];
uint16_t wedge_comp[9][2];
uint16_t wedge_idx[9][16 + 1];
uint16_t interintra[7][2];
uint16_t interintra_mode[4][5];
uint16_t interintra_wedge[7][2];
uint16_t ref[6][3][2];
uint16_t comp_fwd_ref[3][3][2];
uint16_t comp_bwd_ref[2][3][2];
uint16_t comp_uni_ref[3][3][2];
uint16_t txsz[N_TX_SIZES - 1][3][4];
uint16_t txpart[7][3][2];
uint16_t txtp_inter[4][N_TX_SIZES][N_TX_TYPES + 1];
uint16_t txtp_intra[3][N_TX_SIZES][N_INTRA_PRED_MODES][N_TX_TYPES + 1];
uint16_t skip[3][2];
uint16_t skip_mode[3][2];
uint16_t partition[N_BL_LEVELS][4][N_PARTITIONS + 1 + 5];
uint16_t seg_pred[3][2];
uint16_t seg_id[3][DAV1D_MAX_SEGMENTS + 1];
uint16_t cfl_sign[8 + 1];
uint16_t cfl_alpha[6][16 + 1];
uint16_t restore_wiener[2];
uint16_t restore_sgrproj[2];
uint16_t restore_switchable[3 + 1];
uint16_t delta_q[4 + 1];
uint16_t delta_lf[5][4 + 1];
uint16_t obmc[N_BS_SIZES][2];
uint16_t motion_mode[N_BS_SIZES][3 + 1];
uint16_t pal_y[7][3][2];
uint16_t pal_uv[2][2];
uint16_t pal_sz[2][7][7 + 1];
uint16_t color_map[2][7][5][8 + 1];
uint16_t intrabc[2];
ALIGN(uint16_t y_mode[4][N_INTRA_PRED_MODES + 3], 32);
ALIGN(uint16_t uv_mode[2][N_INTRA_PRED_MODES][N_UV_INTRA_PRED_MODES + 2], 32);
ALIGN(uint16_t wedge_idx[9][16], 32);
ALIGN(uint16_t partition[N_BL_LEVELS][4][N_PARTITIONS + 6], 32);
ALIGN(uint16_t cfl_alpha[6][16], 32);
ALIGN(uint16_t txtp_inter1[2][16], 32);
ALIGN(uint16_t txtp_inter2[12 + 4], 32);
ALIGN(uint16_t txtp_intra1[2][N_INTRA_PRED_MODES][7 + 1], 16);
ALIGN(uint16_t txtp_intra2[3][N_INTRA_PRED_MODES][5 + 3], 16);
ALIGN(uint16_t cfl_sign[8], 16);
ALIGN(uint16_t angle_delta[8][8], 16);
ALIGN(uint16_t filter_intra[5 + 3], 16);
ALIGN(uint16_t comp_inter_mode[8][N_COMP_INTER_PRED_MODES], 16);
ALIGN(uint16_t seg_id[3][DAV1D_MAX_SEGMENTS], 16);
ALIGN(uint16_t pal_sz[2][7][7 + 1], 16);
ALIGN(uint16_t color_map[2][7][5][8], 16);
ALIGN(uint16_t filter[2][8][DAV1D_N_SWITCHABLE_FILTERS + 1], 8);
ALIGN(uint16_t txsz[N_TX_SIZES - 1][3][4], 8);
ALIGN(uint16_t motion_mode[N_BS_SIZES][3 + 1], 8);
ALIGN(uint16_t delta_q[4], 8);
ALIGN(uint16_t delta_lf[5][4], 8);
ALIGN(uint16_t interintra_mode[4][4], 8);
ALIGN(uint16_t restore_switchable[3 + 1], 8);
ALIGN(uint16_t restore_wiener[2], 4);
ALIGN(uint16_t restore_sgrproj[2], 4);
ALIGN(uint16_t interintra[7][2], 4);
ALIGN(uint16_t interintra_wedge[7][2], 4);
ALIGN(uint16_t txtp_inter3[4][2], 4);
ALIGN(uint16_t use_filter_intra[N_BS_SIZES][2], 4);
ALIGN(uint16_t newmv_mode[6][2], 4);
ALIGN(uint16_t globalmv_mode[2][2], 4);
ALIGN(uint16_t refmv_mode[6][2], 4);
ALIGN(uint16_t drl_bit[3][2], 4);
ALIGN(uint16_t intra[4][2], 4);
ALIGN(uint16_t comp[5][2], 4);
ALIGN(uint16_t comp_dir[5][2], 4);
ALIGN(uint16_t jnt_comp[6][2], 4);
ALIGN(uint16_t mask_comp[6][2], 4);
ALIGN(uint16_t wedge_comp[9][2], 4);
ALIGN(uint16_t ref[6][3][2], 4);
ALIGN(uint16_t comp_fwd_ref[3][3][2], 4);
ALIGN(uint16_t comp_bwd_ref[2][3][2], 4);
ALIGN(uint16_t comp_uni_ref[3][3][2], 4);
ALIGN(uint16_t txpart[7][3][2], 4);
ALIGN(uint16_t skip[3][2], 4);
ALIGN(uint16_t skip_mode[3][2], 4);
ALIGN(uint16_t seg_pred[3][2], 4);
ALIGN(uint16_t obmc[N_BS_SIZES][2], 4);
ALIGN(uint16_t pal_y[7][3][2], 4);
ALIGN(uint16_t pal_uv[2][2], 4);
ALIGN(uint16_t intrabc[2], 4);
} CdfModeContext;
typedef struct CdfCoefContext {
uint16_t skip[N_TX_SIZES][13][2];
uint16_t eob_bin_16[2][2][6];
uint16_t eob_bin_32[2][2][7 + 1];
uint16_t eob_bin_64[2][2][8];
uint16_t eob_bin_128[2][2][9];
uint16_t eob_bin_256[2][2][10 + 6];
uint16_t eob_bin_512[2][2][11 + 5];
uint16_t eob_bin_1024[2][2][12 + 4];
uint16_t eob_hi_bit[N_TX_SIZES][2][11 /*22*/][2];
uint16_t eob_base_tok[N_TX_SIZES][2][4][4];
uint16_t base_tok[N_TX_SIZES][2][41][5];
uint16_t dc_sign[2][3][2];
uint16_t br_tok[4 /*5*/][2][21][5];
ALIGN(uint16_t eob_bin_16[2][2][5 + 3], 16);
ALIGN(uint16_t eob_bin_32[2][2][6 + 2], 16);
ALIGN(uint16_t eob_bin_64[2][2][7 + 1], 16);
ALIGN(uint16_t eob_bin_128[2][2][8 + 0], 16);
ALIGN(uint16_t eob_bin_256[2][2][9 + 7], 32);
ALIGN(uint16_t eob_bin_512[2][10 + 6], 32);
ALIGN(uint16_t eob_bin_1024[2][11 + 5], 32);
ALIGN(uint16_t eob_base_tok[N_TX_SIZES][2][4][4], 8);
ALIGN(uint16_t base_tok[N_TX_SIZES][2][41][4], 8);
ALIGN(uint16_t br_tok[4 /*5*/][2][21][4], 8);
ALIGN(uint16_t eob_hi_bit[N_TX_SIZES][2][11 /*22*/][2], 4);
ALIGN(uint16_t skip[N_TX_SIZES][13][2], 4);
ALIGN(uint16_t dc_sign[2][3][2], 4);
} CdfCoefContext;
typedef struct CdfMvComponent {
uint16_t classes[11 + 1 + 4];
uint16_t class0[2];
uint16_t classN[10][2];
uint16_t class0_fp[2][4 + 1];
uint16_t classN_fp[4 + 1];
uint16_t class0_hp[2];
uint16_t classN_hp[2];
uint16_t sign[2];
ALIGN(uint16_t classes[11 + 5], 32);
ALIGN(uint16_t class0_fp[2][4], 8);
ALIGN(uint16_t classN_fp[4], 8);
ALIGN(uint16_t class0_hp[2], 4);
ALIGN(uint16_t classN_hp[2], 4);
ALIGN(uint16_t class0[2], 4);
ALIGN(uint16_t classN[10][2], 4);
ALIGN(uint16_t sign[2], 4);
} CdfMvComponent;
typedef struct CdfMvContext {
CdfMvComponent comp[2];
uint16_t joint[N_MV_JOINTS + 1];
ALIGN(uint16_t joint[N_MV_JOINTS], 8);
} CdfMvContext;
typedef struct CdfContext {
CdfModeContext m;
uint16_t kfym[5][5][N_INTRA_PRED_MODES + 1 + 2];
ALIGN(uint16_t kfym[5][5][N_INTRA_PRED_MODES + 3], 32);
CdfCoefContext coef;
CdfMvContext mv, dmv;
} CdfContext;

View File

@ -27,7 +27,6 @@
#include "config.h"
#include <assert.h>
#include <errno.h>
#include <stdint.h>
#include <stdlib.h>
@ -35,6 +34,7 @@
#include "dav1d/data.h"
#include "common/attributes.h"
#include "common/validate.h"
#include "src/data.h"

View File

@ -42,6 +42,7 @@
#include "src/decode.h"
#include "src/dequant_tables.h"
#include "src/env.h"
#include "src/film_grain.h"
#include "src/log.h"
#include "src/qm.h"
#include "src/recon.h"
@ -81,14 +82,14 @@ static int read_mv_component_diff(Dav1dTileContext *const t,
const int have_hp = f->frame_hdr->hp;
const int sign = dav1d_msac_decode_bool_adapt(&ts->msac, mv_comp->sign);
const int cl = dav1d_msac_decode_symbol_adapt16(&ts->msac,
mv_comp->classes, 11);
mv_comp->classes, 10);
int up, fp, hp;
if (!cl) {
up = dav1d_msac_decode_bool_adapt(&ts->msac, mv_comp->class0);
if (have_fp) {
fp = dav1d_msac_decode_symbol_adapt4(&ts->msac,
mv_comp->class0_fp[up], 4);
mv_comp->class0_fp[up], 3);
hp = have_hp ? dav1d_msac_decode_bool_adapt(&ts->msac,
mv_comp->class0_hp) : 1;
} else {
@ -102,7 +103,7 @@ static int read_mv_component_diff(Dav1dTileContext *const t,
mv_comp->classN[n]) << n;
if (have_fp) {
fp = dav1d_msac_decode_symbol_adapt4(&ts->msac,
mv_comp->classN_fp, 4);
mv_comp->classN_fp, 3);
hp = have_hp ? dav1d_msac_decode_bool_adapt(&ts->msac,
mv_comp->classN_hp) : 1;
} else {
@ -120,7 +121,7 @@ static void read_mv_residual(Dav1dTileContext *const t, mv *const ref_mv,
CdfMvContext *const mv_cdf, const int have_fp)
{
switch (dav1d_msac_decode_symbol_adapt4(&t->ts->msac, t->ts->cdf.mv.joint,
N_MV_JOINTS))
N_MV_JOINTS - 1))
{
case MV_JOINT_HV:
ref_mv->y += read_mv_component_diff(t, &mv_cdf->comp[0], have_fp);
@ -380,7 +381,7 @@ static void read_pal_plane(Dav1dTileContext *const t, Av1Block *const b,
Dav1dTileState *const ts = t->ts;
const Dav1dFrameContext *const f = t->f;
const int pal_sz = b->pal_sz[pl] = dav1d_msac_decode_symbol_adapt8(&ts->msac,
ts->cdf.m.pal_sz[pl][sz_ctx], 7) + 2;
ts->cdf.m.pal_sz[pl][sz_ctx], 6) + 2;
uint16_t cache[16], used_cache[8];
int l_cache = pl ? t->pal_sz_uv[1][by4] : t->l.pal_sz[by4];
int n_cache = 0;
@ -586,7 +587,7 @@ static void read_pal_indices(Dav1dTileContext *const t,
Dav1dTileState *const ts = t->ts;
const ptrdiff_t stride = bw4 * 4;
pal_idx[0] = dav1d_msac_decode_uniform(&ts->msac, b->pal_sz[pl]);
uint16_t (*const color_map_cdf)[8 + 1] =
uint16_t (*const color_map_cdf)[8] =
ts->cdf.m.color_map[pl][b->pal_sz[pl] - 2];
uint8_t (*const order)[8] = t->scratch.pal_order;
uint8_t *const ctx = t->scratch.pal_ctx;
@ -597,7 +598,7 @@ static void read_pal_indices(Dav1dTileContext *const t,
order_palette(pal_idx, stride, i, first, last, order, ctx);
for (int j = first, m = 0; j >= last; j--, m++) {
const int color_idx = dav1d_msac_decode_symbol_adapt8(&ts->msac,
color_map_cdf[ctx[m]], b->pal_sz[pl]);
color_map_cdf[ctx[m]], b->pal_sz[pl] - 1);
pal_idx[(i - j) * stride + j] = order[m][color_idx];
}
}
@ -647,7 +648,7 @@ static void read_vartx_tree(Dav1dTileContext *const t,
}
b->uvtx = dav1d_max_txfm_size_for_bs[bs][f->cur.p.layout];
} else {
assert(imin(bw4, bh4) <= 16 || b->max_ytx == TX_64X64);
assert(bw4 <= 16 || bh4 <= 16 || b->max_ytx == TX_64X64);
int y, x, y_off, x_off;
const TxfmInfo *const ytx = &dav1d_txfm_dimensions[b->max_ytx];
for (y = 0, y_off = 0; y < bh4; y += ytx->h, y_off++) {
@ -673,8 +674,6 @@ static inline unsigned get_prev_frame_segid(const Dav1dFrameContext *const f,
const uint8_t *ref_seg_map,
const ptrdiff_t stride)
{
unsigned seg_id = 8;
assert(f->frame_hdr->primary_ref_frame != DAV1D_PRIMARY_REF_NONE);
if (dav1d_thread_picture_wait(&f->refp[f->frame_hdr->primary_ref_frame],
(by + h4) * 4, PLANE_TYPE_BLOCK))
@ -682,12 +681,13 @@ static inline unsigned get_prev_frame_segid(const Dav1dFrameContext *const f,
return 8;
}
unsigned seg_id = 8;
ref_seg_map += by * stride + bx;
do {
for (int x = 0; x < w4; x++)
seg_id = imin(seg_id, ref_seg_map[x]);
ref_seg_map += stride;
} while (--h4 > 0);
} while (--h4 > 0 && seg_id);
assert(seg_id < 8);
return seg_id;
@ -814,7 +814,7 @@ static int decode_b(Dav1dTileContext *const t,
&seg_ctx, f->cur_segmap, f->b4_stride);
const unsigned diff = dav1d_msac_decode_symbol_adapt8(&ts->msac,
ts->cdf.m.seg_id[seg_ctx],
DAV1D_MAX_SEGMENTS);
DAV1D_MAX_SEGMENTS - 1);
const unsigned last_active_seg_id =
f->frame_hdr->segmentation.seg_data.last_active_segid;
b->seg_id = neg_deinterleave(diff, pred_seg_id,
@ -886,7 +886,7 @@ static int decode_b(Dav1dTileContext *const t,
} else {
const unsigned diff = dav1d_msac_decode_symbol_adapt8(&ts->msac,
ts->cdf.m.seg_id[seg_ctx],
DAV1D_MAX_SEGMENTS);
DAV1D_MAX_SEGMENTS - 1);
const unsigned last_active_seg_id =
f->frame_hdr->segmentation.seg_data.last_active_segid;
b->seg_id = neg_deinterleave(diff, pred_seg_id,
@ -934,7 +934,7 @@ static int decode_b(Dav1dTileContext *const t,
if (have_delta_q) {
int delta_q = dav1d_msac_decode_symbol_adapt4(&ts->msac,
ts->cdf.m.delta_q, 4);
ts->cdf.m.delta_q, 3);
if (delta_q == 3) {
const int n_bits = 1 + dav1d_msac_decode_bools(&ts->msac, 3);
delta_q = dav1d_msac_decode_bools(&ts->msac, n_bits) +
@ -955,7 +955,7 @@ static int decode_b(Dav1dTileContext *const t,
for (int i = 0; i < n_lfs; i++) {
int delta_lf = dav1d_msac_decode_symbol_adapt4(&ts->msac,
ts->cdf.m.delta_lf[i + f->frame_hdr->delta.lf.multi], 4);
ts->cdf.m.delta_lf[i + f->frame_hdr->delta.lf.multi], 3);
if (delta_lf == 3) {
const int n_bits = 1 + dav1d_msac_decode_bools(&ts->msac, 3);
delta_lf = dav1d_msac_decode_bools(&ts->msac, n_bits) +
@ -1020,7 +1020,7 @@ static int decode_b(Dav1dTileContext *const t,
ts->cdf.kfym[dav1d_intra_mode_context[t->a->mode[bx4]]]
[dav1d_intra_mode_context[t->l.mode[by4]]];
b->y_mode = dav1d_msac_decode_symbol_adapt16(&ts->msac, ymode_cdf,
N_INTRA_PRED_MODES);
N_INTRA_PRED_MODES - 1);
if (DEBUG_BLOCK_INFO)
printf("Post-ymode[%d]: r=%d\n", b->y_mode, ts->msac.rng);
@ -1029,7 +1029,7 @@ static int decode_b(Dav1dTileContext *const t,
b->y_mode <= VERT_LEFT_PRED)
{
uint16_t *const acdf = ts->cdf.m.angle_delta[b->y_mode - VERT_PRED];
const int angle = dav1d_msac_decode_symbol_adapt8(&ts->msac, acdf, 7);
const int angle = dav1d_msac_decode_symbol_adapt8(&ts->msac, acdf, 6);
b->y_angle = angle - 3;
} else {
b->y_angle = 0;
@ -1040,20 +1040,20 @@ static int decode_b(Dav1dTileContext *const t,
cbw4 == 1 && cbh4 == 1 : !!(cfl_allowed_mask & (1 << bs));
uint16_t *const uvmode_cdf = ts->cdf.m.uv_mode[cfl_allowed][b->y_mode];
b->uv_mode = dav1d_msac_decode_symbol_adapt16(&ts->msac, uvmode_cdf,
N_UV_INTRA_PRED_MODES - !cfl_allowed);
N_UV_INTRA_PRED_MODES - 1 - !cfl_allowed);
if (DEBUG_BLOCK_INFO)
printf("Post-uvmode[%d]: r=%d\n", b->uv_mode, ts->msac.rng);
if (b->uv_mode == CFL_PRED) {
#define SIGN(a) (!!(a) + ((a) > 0))
const int sign = dav1d_msac_decode_symbol_adapt8(&ts->msac,
ts->cdf.m.cfl_sign, 8) + 1;
ts->cdf.m.cfl_sign, 7) + 1;
const int sign_u = sign * 0x56 >> 8, sign_v = sign - sign_u * 3;
assert(sign_u == sign / 3);
if (sign_u) {
const int ctx = (sign_u == 2) * 3 + sign_v;
b->cfl_alpha[0] = dav1d_msac_decode_symbol_adapt16(&ts->msac,
ts->cdf.m.cfl_alpha[ctx], 16) + 1;
ts->cdf.m.cfl_alpha[ctx], 15) + 1;
if (sign_u == 1) b->cfl_alpha[0] = -b->cfl_alpha[0];
} else {
b->cfl_alpha[0] = 0;
@ -1061,7 +1061,7 @@ static int decode_b(Dav1dTileContext *const t,
if (sign_v) {
const int ctx = (sign_v == 2) * 3 + sign_u;
b->cfl_alpha[1] = dav1d_msac_decode_symbol_adapt16(&ts->msac,
ts->cdf.m.cfl_alpha[ctx], 16) + 1;
ts->cdf.m.cfl_alpha[ctx], 15) + 1;
if (sign_v == 1) b->cfl_alpha[1] = -b->cfl_alpha[1];
} else {
b->cfl_alpha[1] = 0;
@ -1074,7 +1074,7 @@ static int decode_b(Dav1dTileContext *const t,
b->uv_mode <= VERT_LEFT_PRED)
{
uint16_t *const acdf = ts->cdf.m.angle_delta[b->uv_mode - VERT_PRED];
const int angle = dav1d_msac_decode_symbol_adapt8(&ts->msac, acdf, 7);
const int angle = dav1d_msac_decode_symbol_adapt8(&ts->msac, acdf, 6);
b->uv_angle = angle - 3;
} else {
b->uv_angle = 0;
@ -1115,7 +1115,7 @@ static int decode_b(Dav1dTileContext *const t,
if (is_filter) {
b->y_mode = FILTER_PRED;
b->y_angle = dav1d_msac_decode_symbol_adapt4(&ts->msac,
ts->cdf.m.filter_intra, 5);
ts->cdf.m.filter_intra, 4);
}
if (DEBUG_BLOCK_INFO)
printf("Post-filterintramode[%d/%d]: r=%d\n",
@ -1158,7 +1158,7 @@ static int decode_b(Dav1dTileContext *const t,
const int tctx = get_tx_ctx(t->a, &t->l, t_dim, by4, bx4);
uint16_t *const tx_cdf = ts->cdf.m.txsz[t_dim->max - 1][tctx];
int depth = dav1d_msac_decode_symbol_adapt4(&ts->msac, tx_cdf,
imin(t_dim->max + 1, 3));
imin(t_dim->max, 2));
while (depth--) {
b->tx = t_dim->sub;
@ -1480,7 +1480,7 @@ static int decode_b(Dav1dTileContext *const t,
b->inter_mode = dav1d_msac_decode_symbol_adapt8(&ts->msac,
ts->cdf.m.comp_inter_mode[ctx],
N_COMP_INTER_PRED_MODES);
N_COMP_INTER_PRED_MODES - 1);
if (DEBUG_BLOCK_INFO)
printf("Post-compintermode[%d,ctx=%d,n_mvs=%d]: r=%d\n",
b->inter_mode, ctx, n_mvs, ts->msac.rng);
@ -1588,7 +1588,7 @@ static int decode_b(Dav1dTileContext *const t,
ts->cdf.m.wedge_comp[ctx]);
if (b->comp_type == COMP_INTER_WEDGE)
b->wedge_idx = dav1d_msac_decode_symbol_adapt16(&ts->msac,
ts->cdf.m.wedge_idx[ctx], 16);
ts->cdf.m.wedge_idx[ctx], 15);
} else {
b->comp_type = COMP_INTER_SEG;
}
@ -1743,14 +1743,14 @@ static int decode_b(Dav1dTileContext *const t,
{
b->interintra_mode = dav1d_msac_decode_symbol_adapt4(&ts->msac,
ts->cdf.m.interintra_mode[ii_sz_grp],
N_INTER_INTRA_PRED_MODES);
N_INTER_INTRA_PRED_MODES - 1);
const int wedge_ctx = dav1d_wedge_ctx_lut[bs];
b->interintra_type = INTER_INTRA_BLEND +
dav1d_msac_decode_bool_adapt(&ts->msac,
ts->cdf.m.interintra_wedge[wedge_ctx]);
if (b->interintra_type == INTER_INTRA_WEDGE)
b->wedge_idx = dav1d_msac_decode_symbol_adapt16(&ts->msac,
ts->cdf.m.wedge_idx[wedge_ctx], 16);
ts->cdf.m.wedge_idx[wedge_ctx], 15);
} else {
b->interintra_type = INTER_INTRA_NONE;
}
@ -1783,7 +1783,7 @@ static int decode_b(Dav1dTileContext *const t,
b->motion_mode = allow_warp ?
dav1d_msac_decode_symbol_adapt4(&ts->msac,
ts->cdf.m.motion_mode[bs], 3) :
ts->cdf.m.motion_mode[bs], 2) :
dav1d_msac_decode_bool_adapt(&ts->msac, ts->cdf.m.obmc[bs]);
if (b->motion_mode == MM_WARP) {
has_subpel_filter = 0;
@ -1823,7 +1823,7 @@ static int decode_b(Dav1dTileContext *const t,
by4, bx4);
filter[0] = dav1d_msac_decode_symbol_adapt4(&ts->msac,
ts->cdf.m.filter[0][ctx1],
DAV1D_N_SWITCHABLE_FILTERS);
DAV1D_N_SWITCHABLE_FILTERS - 1);
if (f->seq_hdr->dual_filter) {
const int ctx2 = get_filter_ctx(t->a, &t->l, comp, 1,
b->ref[0], by4, bx4);
@ -1832,7 +1832,7 @@ static int decode_b(Dav1dTileContext *const t,
filter[0], ctx1, ts->msac.rng);
filter[1] = dav1d_msac_decode_symbol_adapt4(&ts->msac,
ts->cdf.m.filter[1][ctx2],
DAV1D_N_SWITCHABLE_FILTERS);
DAV1D_N_SWITCHABLE_FILTERS - 1);
if (DEBUG_BLOCK_INFO)
printf("Post-subpel_filter2[%d,ctx=%d]: r=%d\n",
filter[1], ctx2, ts->msac.rng);
@ -2023,9 +2023,8 @@ static int decode_sb(Dav1dTileContext *const t, const enum BlockLevel bl,
const Av1Block *const b = &f->frame_thread.b[t->by * f->b4_stride + t->bx];
bp = b->bl == bl ? b->bp : PARTITION_SPLIT;
} else {
const unsigned n_part = bl == BL_8X8 ? N_SUB8X8_PARTITIONS :
bl == BL_128X128 ? N_PARTITIONS - 2 : N_PARTITIONS;
bp = dav1d_msac_decode_symbol_adapt16(&t->ts->msac, pc, n_part);
bp = dav1d_msac_decode_symbol_adapt16(&t->ts->msac, pc,
dav1d_partition_type_count[bl]);
if (f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I422 &&
(bp == PARTITION_V || bp == PARTITION_V4 ||
bp == PARTITION_T_LEFT_SPLIT || bp == PARTITION_T_RIGHT_SPLIT))
@ -2381,7 +2380,7 @@ static void read_restoration_info(Dav1dTileContext *const t,
if (frame_type == DAV1D_RESTORATION_SWITCHABLE) {
const int filter = dav1d_msac_decode_symbol_adapt4(&ts->msac,
ts->cdf.m.restore_switchable, 3);
ts->cdf.m.restore_switchable, 2);
lr->type = filter ? filter == 2 ? DAV1D_RESTORATION_SGRPROJ :
DAV1D_RESTORATION_WIENER :
DAV1D_RESTORATION_NONE;
@ -2597,8 +2596,10 @@ int dav1d_decode_frame(Dav1dFrameContext *const f) {
f->tile_thread.titsati_sz = titsati_sz;
}
if (f->tile_thread.titsati_init[0] != f->frame_hdr->tiling.cols ||
f->tile_thread.titsati_init[1] != f->sbh ||
f->tile_thread.titsati_init[2] != f->frame_hdr->tiling.rows)
f->tile_thread.titsati_init[1] != f->frame_hdr->tiling.rows ||
memcmp(f->frame_hdr->tiling.row_start_sb, f->tile_thread.titsati_index_rows,
sizeof(*f->tile_thread.titsati_index_rows) *
(f->frame_hdr->tiling.rows + 1)))
{
for (int tile_row = 0, tile_idx = 0;
tile_row < f->frame_hdr->tiling.rows; tile_row++)
@ -2616,8 +2617,10 @@ int dav1d_decode_frame(Dav1dFrameContext *const f) {
}
}
f->tile_thread.titsati_init[0] = f->frame_hdr->tiling.cols;
f->tile_thread.titsati_init[1] = f->sbh;
f->tile_thread.titsati_init[2] = f->frame_hdr->tiling.rows;
f->tile_thread.titsati_init[1] = f->frame_hdr->tiling.rows;
memcpy(f->tile_thread.titsati_index_rows, f->frame_hdr->tiling.row_start_sb,
sizeof(*f->tile_thread.titsati_index_rows) *
(f->frame_hdr->tiling.rows + 1));
}
}
@ -2637,9 +2640,13 @@ int dav1d_decode_frame(Dav1dFrameContext *const f) {
goto error;
}
}
if (n_ts > f->n_ts) {
Dav1dTileState *ts_new = realloc(f->ts, sizeof(*f->ts) * n_ts);
Dav1dTileState *ts_new = dav1d_alloc_aligned(sizeof(*f->ts) * n_ts, 32);
if (!ts_new) goto error;
if (n_ts > f->n_ts) {
if (f->ts) {
memcpy(ts_new, f->ts, sizeof(*f->ts) * f->n_ts);
dav1d_free_aligned(f->ts);
}
f->ts = ts_new;
for (int n = f->n_ts; n < n_ts; f->n_ts = ++n) {
Dav1dTileState *const ts = &f->ts[n];
@ -2655,9 +2662,9 @@ int dav1d_decode_frame(Dav1dFrameContext *const f) {
pthread_cond_destroy(&ts->tile_thread.cond);
pthread_mutex_destroy(&ts->tile_thread.lock);
}
memcpy(ts_new, f->ts, sizeof(*f->ts) * n_ts);
dav1d_free_aligned(f->ts);
f->n_ts = n_ts;
Dav1dTileState *ts_new = realloc(f->ts, sizeof(*f->ts) * n_ts);
if (!ts_new) goto error;
f->ts = ts_new;
}
}
@ -3184,6 +3191,7 @@ int dav1d_submit_frame(Dav1dContext *const c) {
dav1d_loop_filter_dsp_init_##bd##bpc(&dsp->lf); \
dav1d_loop_restoration_dsp_init_##bd##bpc(&dsp->lr); \
dav1d_mc_dsp_init_##bd##bpc(&dsp->mc); \
dav1d_film_grain_dsp_init_##bd##bpc(&dsp->fg); \
break
#if CONFIG_8BPC
case 8:

View File

@ -28,7 +28,6 @@
#ifndef DAV1D_SRC_ENV_H
#define DAV1D_SRC_ENV_H
#include <assert.h>
#include <stddef.h>
#include <stdint.h>
#include <stdlib.h>
@ -90,95 +89,37 @@ static inline int get_partition_ctx(const BlockContext *const a,
(((l->partition[yb8] >> (4 - bl)) & 1) << 1);
}
static inline unsigned cdf_element_prob(const uint16_t *const cdf, const int e) {
assert(e > 0);
return cdf[e - 1] - cdf[e];
}
static inline unsigned gather_left_partition_prob(const uint16_t *const in,
const enum BlockLevel bl)
{
unsigned out = 0;
out += cdf_element_prob(in, PARTITION_H);
if (bl != BL_128X128)
out += cdf_element_prob(in, PARTITION_H4);
unsigned out = in[PARTITION_H - 1] - in[PARTITION_H];
// Exploit the fact that cdfs for PARTITION_SPLIT, PARTITION_T_TOP_SPLIT,
// PARTITION_T_BOTTOM_SPLIT and PARTITION_T_LEFT_SPLIT are neighbors.
out += in[PARTITION_SPLIT - 1] - in[PARTITION_T_LEFT_SPLIT];
if (bl != BL_128X128)
out += in[PARTITION_H4 - 1] - in[PARTITION_H4];
return out;
}
static inline unsigned gather_top_partition_prob(const uint16_t *const in,
const enum BlockLevel bl)
{
unsigned out = 0;
// Exploit the fact that cdfs for PARTITION_V, PARTITION_SPLIT and
// PARTITION_T_TOP_SPLIT are neighbors.
unsigned out = in[PARTITION_V - 1] - in[PARTITION_T_TOP_SPLIT];
// Exploit the facts that cdfs for PARTITION_T_LEFT_SPLIT and
// PARTITION_T_RIGHT_SPLIT are neighbors, the probability for
// PARTITION_V4 is always zero, and the probability for
// PARTITION_T_RIGHT_SPLIT is zero in 128x128 blocks.
out += in[PARTITION_T_LEFT_SPLIT - 1];
if (bl != BL_128X128)
out += cdf_element_prob(in, PARTITION_V4);
// Exploit the fact that cdfs for PARTITION_T_LEFT_SPLIT and PARTITION_T_RIGHT_SPLIT,
// and PARTITION_V, PARTITION_SPLIT and PARTITION_T_TOP_SPLIT are neighbors.
out += in[PARTITION_T_LEFT_SPLIT - 1] - in[PARTITION_T_RIGHT_SPLIT];
out += in[PARTITION_V - 1] - in[PARTITION_T_TOP_SPLIT];
out += in[PARTITION_V4 - 1] - in[PARTITION_T_RIGHT_SPLIT];
return out;
}
static inline enum TxfmTypeSet get_ext_txtp_set(const enum RectTxfmSize tx,
const int inter,
const Dav1dFrameHeader *const hdr,
const int seg_id)
{
if (!hdr->segmentation.qidx[seg_id]) {
if (hdr->segmentation.lossless[seg_id]) {
assert(tx == (int) TX_4X4);
return TXTP_SET_LOSSLESS;
} else {
return TXTP_SET_DCT;
}
}
const TxfmInfo *const t_dim = &dav1d_txfm_dimensions[tx];
if (t_dim->max >= TX_64X64)
return TXTP_SET_DCT;
if (t_dim->max == TX_32X32)
return inter ? TXTP_SET_DCT_ID : TXTP_SET_DCT;
if (hdr->reduced_txtp_set)
return inter ? TXTP_SET_DCT_ID : TXTP_SET_DT4_ID;
const enum TxfmSize txsqsz = t_dim->min;
if (inter)
return txsqsz == TX_16X16 ? TXTP_SET_DT9_ID_1D : TXTP_SET_ALL;
else
return txsqsz == TX_16X16 ? TXTP_SET_DT4_ID : TXTP_SET_DT4_ID_1D;
}
static inline enum TxfmType get_uv_intra_txtp(const enum IntraPredMode uv_mode,
const enum RectTxfmSize tx,
const Dav1dFrameHeader *const hdr,
const int seg_id)
{
if (hdr->segmentation.lossless[seg_id]) {
assert(tx == (int) TX_4X4);
return WHT_WHT;
}
const TxfmInfo *const t_dim = &dav1d_txfm_dimensions[tx];
return t_dim->max == TX_32X32 ? DCT_DCT : dav1d_txtp_from_uvmode[uv_mode];
}
static inline enum TxfmType get_uv_inter_txtp(const TxfmInfo *const uvt_dim,
const enum TxfmType ytxtp,
const Dav1dFrameHeader *const hdr,
const int seg_id)
const enum TxfmType ytxtp)
{
if (hdr->segmentation.lossless[seg_id]) {
assert(uvt_dim->max == TX_4X4);
return WHT_WHT;
}
if (uvt_dim->max == TX_32X32)
return ytxtp == IDTX ? IDTX : DCT_DCT;
if (uvt_dim->min == TX_16X16 &&
@ -528,180 +469,6 @@ static inline unsigned get_cur_frame_segid(const int by, const int bx,
}
}
static inline int get_coef_skip_ctx(const TxfmInfo *const t_dim,
const enum BlockSize bs,
const uint8_t *const a,
const uint8_t *const l,
const int chroma,
const enum Dav1dPixelLayout layout)
{
const uint8_t *const b_dim = dav1d_block_dimensions[bs];
if (chroma) {
const int ss_ver = layout == DAV1D_PIXEL_LAYOUT_I420;
const int ss_hor = layout != DAV1D_PIXEL_LAYOUT_I444;
const int not_one_blk = b_dim[2] - (!!b_dim[2] && ss_hor) > t_dim->lw ||
b_dim[3] - (!!b_dim[3] && ss_ver) > t_dim->lh;
int ca, cl;
#define MERGE_CTX(dir, type, mask) \
c##dir = !!((*(const type *) dir) & mask); \
break
switch (t_dim->lw) {
case TX_4X4: MERGE_CTX(a, uint8_t, 0x3F);
case TX_8X8: MERGE_CTX(a, uint16_t, 0x3F3F);
case TX_16X16: MERGE_CTX(a, uint32_t, 0x3F3F3F3FU);
case TX_32X32: MERGE_CTX(a, uint64_t, 0x3F3F3F3F3F3F3F3FULL);
default: abort();
}
switch (t_dim->lh) {
case TX_4X4: MERGE_CTX(l, uint8_t, 0x3F);
case TX_8X8: MERGE_CTX(l, uint16_t, 0x3F3F);
case TX_16X16: MERGE_CTX(l, uint32_t, 0x3F3F3F3FU);
case TX_32X32: MERGE_CTX(l, uint64_t, 0x3F3F3F3F3F3F3F3FULL);
default: abort();
}
#undef MERGE_CTX
return 7 + not_one_blk * 3 + ca + cl;
} else if (b_dim[2] == t_dim->lw && b_dim[3] == t_dim->lh) {
return 0;
} else {
static const uint8_t skip_contexts[5][5] = {
{ 1, 2, 2, 2, 3 },
{ 1, 4, 4, 4, 5 },
{ 1, 4, 4, 4, 5 },
{ 1, 4, 4, 4, 5 },
{ 1, 4, 4, 4, 6 }
};
uint64_t la, ll;
#define MERGE_CTX(dir, type, tx) do { \
l##dir = *(const type *) dir; \
if (tx == TX_64X64) \
l##dir |= *(const type *) &dir[sizeof(type)]; \
if (tx >= TX_32X32) l##dir |= l##dir >> 32; \
if (tx >= TX_16X16) l##dir |= l##dir >> 16; \
if (tx >= TX_8X8) l##dir |= l##dir >> 8; \
l##dir &= 0x3F; \
} while (0); \
break
switch (t_dim->lw) {
case TX_4X4: MERGE_CTX(a, uint8_t, TX_4X4);
case TX_8X8: MERGE_CTX(a, uint16_t, TX_8X8);
case TX_16X16: MERGE_CTX(a, uint32_t, TX_16X16);
case TX_32X32: MERGE_CTX(a, uint64_t, TX_32X32);
case TX_64X64: MERGE_CTX(a, uint64_t, TX_64X64);
}
switch (t_dim->lh) {
case TX_4X4: MERGE_CTX(l, uint8_t, TX_4X4);
case TX_8X8: MERGE_CTX(l, uint16_t, TX_8X8);
case TX_16X16: MERGE_CTX(l, uint32_t, TX_16X16);
case TX_32X32: MERGE_CTX(l, uint64_t, TX_32X32);
case TX_64X64: MERGE_CTX(l, uint64_t, TX_64X64);
}
#undef MERGE_CTX
const int max = imin((int) (la | ll), 4);
const int min = imin(imin((int) la, (int) ll), 4);
return skip_contexts[min][max];
}
}
static inline int get_coef_nz_ctx(uint8_t *const levels,
const enum RectTxfmSize tx,
const enum TxClass tx_class,
const int x, const int y,
const ptrdiff_t stride)
{
static const uint8_t offsets[3][5][2 /* x, y */] = {
[TX_CLASS_2D] = {
{ 0, 1 }, { 1, 0 }, { 2, 0 }, { 0, 2 }, { 1, 1 }
}, [TX_CLASS_V] = {
{ 0, 1 }, { 1, 0 }, { 0, 2 }, { 0, 3 }, { 0, 4 }
}, [TX_CLASS_H] = {
{ 0, 1 }, { 1, 0 }, { 2, 0 }, { 3, 0 }, { 4, 0 }
}
};
const uint8_t (*const off)[2] = offsets[tx_class];
int mag = 0;
for (int i = 0; i < 5; i++)
mag += imin(levels[(x + off[i][0]) * stride + (y + off[i][1])], 3);
const int ctx = imin((mag + 1) >> 1, 4);
if (tx_class == TX_CLASS_2D) {
return dav1d_nz_map_ctx_offset[tx][imin(y, 4)][imin(x, 4)] + ctx;
} else {
return 26 + imin((tx_class == TX_CLASS_V) ? y : x, 2) * 5 + ctx;
}
}
static inline int get_dc_sign_ctx(const TxfmInfo *const t_dim,
const uint8_t *const a,
const uint8_t *const l)
{
uint64_t sa, sl;
#define MERGE_CTX(dir, type, tx, mask) do { \
s##dir = ((*(const type *) dir) >> 6) & mask; \
if (tx == TX_64X64) \
s##dir += ((*(const type *) &dir[sizeof(type)]) >> 6) & mask; \
if (tx >= TX_32X32) s##dir += s##dir >> 32; \
if (tx >= TX_16X16) s##dir += s##dir >> 16; \
if (tx >= TX_8X8) s##dir += s##dir >> 8; \
} while (0); \
break
switch (t_dim->lw) {
case TX_4X4: MERGE_CTX(a, uint8_t, TX_4X4, 0x03);
case TX_8X8: MERGE_CTX(a, uint16_t, TX_8X8, 0x0303);
case TX_16X16: MERGE_CTX(a, uint32_t, TX_16X16, 0x03030303U);
case TX_32X32: MERGE_CTX(a, uint64_t, TX_32X32, 0x0303030303030303ULL);
case TX_64X64: MERGE_CTX(a, uint64_t, TX_64X64, 0x0303030303030303ULL);
}
switch (t_dim->lh) {
case TX_4X4: MERGE_CTX(l, uint8_t, TX_4X4, 0x03);
case TX_8X8: MERGE_CTX(l, uint16_t, TX_8X8, 0x0303);
case TX_16X16: MERGE_CTX(l, uint32_t, TX_16X16, 0x03030303U);
case TX_32X32: MERGE_CTX(l, uint64_t, TX_32X32, 0x0303030303030303ULL);
case TX_64X64: MERGE_CTX(l, uint64_t, TX_64X64, 0x0303030303030303ULL);
}
#undef MERGE_CTX
const int s = ((int) ((sa + sl) & 0xFF)) - (t_dim->w + t_dim->h);
return s < 0 ? 1 : s > 0 ? 2 : 0;
}
static inline int get_br_ctx(const uint8_t *const levels,
const int ac, const enum TxClass tx_class,
const int x, const int y,
const ptrdiff_t stride)
{
int mag = 0;
static const uint8_t offsets_from_txclass[3][3][2] = {
[TX_CLASS_2D] = { { 0, 1 }, { 1, 0 }, { 1, 1 } },
[TX_CLASS_H] = { { 0, 1 }, { 1, 0 }, { 0, 2 } },
[TX_CLASS_V] = { { 0, 1 }, { 1, 0 }, { 2, 0 } }
};
const uint8_t (*const offsets)[2] = offsets_from_txclass[tx_class];
for (int i = 0; i < 3; i++)
mag += levels[(x + offsets[i][1]) * stride + y + offsets[i][0]];
mag = imin((mag + 1) >> 1, 6);
if (!ac) return mag;
switch (tx_class) {
case TX_CLASS_2D:
if (y < 2 && x < 2) return mag + 7;
break;
case TX_CLASS_H:
if (x == 0) return mag + 7;
break;
case TX_CLASS_V:
if (y == 0) return mag + 7;
break;
}
return mag + 14;
}
static inline mv get_gmv_2d(const Dav1dWarpedMotionParams *const gmv,
const int bx4, const int by4,
const int bw4, const int bh4,

41
third_party/dav1d/src/fg_apply.h vendored Normal file
View File

@ -0,0 +1,41 @@
/*
* Copyright © 2018, VideoLAN and dav1d authors
* Copyright © 2018, Two Orioles, LLC
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
* ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef DAV1D_SRC_FG_APPLY_H
#define DAV1D_SRC_FG_APPLY_H
#include "dav1d/picture.h"
#include "common/bitdepth.h"
#include "src/film_grain.h"
bitfn_decls(void dav1d_apply_grain, const Dav1dFilmGrainDSPContext *const dsp,
Dav1dPicture *const out,
const Dav1dPicture *const in);
#endif /* DAV1D_SRC_FG_APPLY_H */

175
third_party/dav1d/src/fg_apply_tmpl.c vendored Normal file
View File

@ -0,0 +1,175 @@
/*
* Copyright © 2018, Niklas Haas
* Copyright © 2018, VideoLAN and dav1d authors
* Copyright © 2018, Two Orioles, LLC
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
* ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "config.h"
#include <stdint.h>
#include "dav1d/picture.h"
#include "common.h"
#include "common/intops.h"
#include "common/bitdepth.h"
#include "fg_apply.h"
// Build the per-intensity film-grain scaling lookup table from the
// piecewise-linear scaling function signalled in the frame header
// (AV1 spec, film grain synthesis process).
//
// bitdepth: source bit depth; the table covers the full 1 << bitdepth
//           intensity range.
// points:   `num` (x, y) control points; x coordinates are 8-bit values
//           regardless of bit depth.
// scaling:  output LUT of SCALING_SIZE entries.
//
// NOTE(review): assumes num >= 1 — callers in dav1d_apply_grain only invoke
// this for planes that actually have scaling points.
static void generate_scaling(const int bitdepth,
                             const uint8_t points[][2], const int num,
                             uint8_t scaling[SCALING_SIZE])
{
    // For bitdepth > 8 each 8-bit control-point x maps to a stride of `pad`
    // table entries; only those coarse entries are written by the loops
    // below, and the gaps are filled in by the final interpolation pass.
    const int shift_x = bitdepth - 8;
    const int scaling_size = 1 << bitdepth;
    const int pad = 1 << shift_x;

    // Fill up the preceding entries with the initial value
    for (int i = 0; i < points[0][0] << shift_x; i++)
        scaling[i] = points[0][1];

    // Linearly interpolate the values in the middle
    for (int i = 0; i < num - 1; i++) {
        const int bx = points[i][0];
        const int by = points[i][1];
        const int ex = points[i+1][0];
        const int ey = points[i+1][1];
        const int dx = ex - bx;
        const int dy = ey - by;
        // Slope in 16.16 fixed point, with the divisor rounded to nearest.
        const int delta = dy * ((0x10000 + (dx >> 1)) / dx);
        for (int x = 0; x < dx; x++) {
            // Round-to-nearest back to integer (+0x8000 before the shift).
            const int v = by + ((x * delta + 0x8000) >> 16);
            scaling[(bx + x) << shift_x] = v;
        }
    }

    // Fill up the remaining entries with the final value
    for (int i = points[num - 1][0] << shift_x; i < scaling_size; i++)
        scaling[i] = points[num - 1][1];

    // 8-bit content: the table is already dense, nothing left to do.
    if (pad <= 1) return;

    // High bit depths: linearly interpolate the `pad - 1` entries between
    // each pair of coarse entries written above, rounding to nearest.
    const int rnd = pad >> 1;
    for (int i = 0; i < num - 1; i++) {
        const int bx = points[i][0] << shift_x;
        const int ex = points[i+1][0] << shift_x;
        const int dx = ex - bx;
        for (int x = 0; x < dx; x += pad) {
            const int range = scaling[bx + x + pad] - scaling[bx + x];
            for (int n = 1; n < pad; n++) {
                scaling[bx + x + n] = scaling[bx + x] + ((range * n + rnd) >> shift_x);
            }
        }
    }
}
#ifndef UNIT_TEST
// Apply film grain synthesis to a decoded picture: generate the grain and
// scaling LUTs required by the frame header's film grain parameters, copy
// any planes that grain is not applied to, then synthesize grain into `out`
// in 32-pixel-high row stripes via the bitdepth-specific DSP functions.
//
// dsp: film grain DSP vtable for the current bit depth.
// out: destination picture (receives grain); must share strides with `in`.
// in:  source (grain-free) decoded picture.
void bitfn(dav1d_apply_grain)(const Dav1dFilmGrainDSPContext *const dsp,
                              Dav1dPicture *const out,
                              const Dav1dPicture *const in)
{
    const Dav1dFilmGrainData *const data = &out->frame_hdr->film_grain.data;

    entry grain_lut[3][GRAIN_HEIGHT + 1][GRAIN_WIDTH];
    uint8_t scaling[3][SCALING_SIZE];
#if BITDEPTH != 8
    // Maximum pixel value for the picture's actual bit depth (10 or 12).
    const int bitdepth_max = (1 << out->p.bpc) - 1;
#endif

    // Generate grain LUTs as needed
    dsp->generate_grain_y(grain_lut[0], data HIGHBD_TAIL_SUFFIX); // always needed
    // Chroma grain is needed either when a plane has its own scaling points
    // or when chroma scaling is derived from luma.
    if (data->num_uv_points[0] || data->chroma_scaling_from_luma)
        dsp->generate_grain_uv[in->p.layout - 1](grain_lut[1], grain_lut[0],
                                                 data, 0 HIGHBD_TAIL_SUFFIX);
    if (data->num_uv_points[1] || data->chroma_scaling_from_luma)
        dsp->generate_grain_uv[in->p.layout - 1](grain_lut[2], grain_lut[0],
                                                 data, 1 HIGHBD_TAIL_SUFFIX);

    // Generate scaling LUTs as needed
    if (data->num_y_points)
        generate_scaling(in->p.bpc, data->y_points, data->num_y_points, scaling[0]);
    if (data->num_uv_points[0])
        generate_scaling(in->p.bpc, data->uv_points[0], data->num_uv_points[0], scaling[1]);
    if (data->num_uv_points[1])
        generate_scaling(in->p.bpc, data->uv_points[1], data->num_uv_points[1], scaling[2]);

    // Copy over the non-modified planes
    // TODO: eliminate in favor of per-plane refs
    assert(out->stride[0] == in->stride[0]);
    if (!data->num_y_points) {
        memcpy(out->data[0], in->data[0], out->p.h * out->stride[0]);
    }

    if (in->p.layout != DAV1D_PIXEL_LAYOUT_I400) {
        assert(out->stride[1] == in->stride[1]);
        for (int i = 0; i < 2; i++) {
            // Copy the chroma plane only if grain is applied to it neither
            // via its own points nor via luma-derived scaling.
            if (!data->num_uv_points[i] && !data->chroma_scaling_from_luma) {
                const int suby = in->p.layout == DAV1D_PIXEL_LAYOUT_I420;
                memcpy(out->data[1+i], in->data[1+i],
                       (out->p.h >> suby) * out->stride[1]);
            }
        }
    }

    // Synthesize grain for the affected planes
    const int rows = (out->p.h + 31) >> 5; // number of 32-row luma stripes
    // Chroma subsampling factors derived from the pixel layout.
    const int ss_y = in->p.layout == DAV1D_PIXEL_LAYOUT_I420;
    const int ss_x = in->p.layout != DAV1D_PIXEL_LAYOUT_I444;
    const int cpw = (out->p.w + ss_x) >> ss_x; // chroma plane width
    // NOTE(review): is_id flags identity matrix coefficients; presumably it
    // selects the value-clipping behavior inside the chroma DSP function —
    // confirm against the DSP implementation.
    const int is_id = out->seq_hdr->mtrx == DAV1D_MC_IDENTITY;
    for (int row = 0; row < rows; row++) {
        // Luma source row for this stripe; also fed to the chroma functions
        // for luma-derived scaling.
        const pixel *const luma_src =
            ((pixel *) in->data[0]) + row * BLOCK_SIZE * PXSTRIDE(in->stride[0]);

        if (data->num_y_points) {
            // Last stripe may be shorter than BLOCK_SIZE rows.
            const int bh = imin(out->p.h - row * BLOCK_SIZE, BLOCK_SIZE);
            dsp->fgy_32x32xn(((pixel *) out->data[0]) + row * BLOCK_SIZE * PXSTRIDE(out->stride[0]),
                             luma_src, out->stride[0], data,
                             out->p.w, scaling[0], grain_lut[0], bh, row HIGHBD_TAIL_SUFFIX);
        }

        // Chroma stripe height and offset, adjusted for vertical subsampling.
        const int bh = (imin(out->p.h - row * BLOCK_SIZE, BLOCK_SIZE) + ss_y) >> ss_y;
        const ptrdiff_t uv_off = row * BLOCK_SIZE * PXSTRIDE(out->stride[1]) >> ss_y;
        if (data->chroma_scaling_from_luma) {
            // Both chroma planes use the luma scaling LUT (scaling[0]).
            for (int pl = 0; pl < 2; pl++)
                dsp->fguv_32x32xn[in->p.layout - 1](((pixel *) out->data[1 + pl]) + uv_off,
                                                    ((const pixel *) in->data[1 + pl]) + uv_off,
                                                    in->stride[1], data, cpw,
                                                    scaling[0], grain_lut[1 + pl],
                                                    bh, row, luma_src, in->stride[0],
                                                    pl, is_id HIGHBD_TAIL_SUFFIX);
        } else {
            // Each chroma plane uses its own scaling LUT, and only if it
            // actually has scaling points.
            for (int pl = 0; pl < 2; pl++)
                if (data->num_uv_points[pl])
                    dsp->fguv_32x32xn[in->p.layout - 1](((pixel *) out->data[1 + pl]) + uv_off,
                                                        ((const pixel *) in->data[1 + pl]) + uv_off,
                                                        in->stride[1], data, cpw,
                                                        scaling[1 + pl], grain_lut[1 + pl],
                                                        bh, row, luma_src, in->stride[0],
                                                        pl, is_id HIGHBD_TAIL_SUFFIX);
        }
    }
}
#endif

View File

@ -28,9 +28,58 @@
#ifndef DAV1D_SRC_FILM_GRAIN_H
#define DAV1D_SRC_FILM_GRAIN_H
#include "dav1d/dav1d.h"
#include "common/bitdepth.h"
bitfn_decls(void dav1d_apply_grain, Dav1dPicture *const out,
const Dav1dPicture *const in);
#include "src/levels.h"
#define GRAIN_WIDTH 82
#define GRAIN_HEIGHT 73
#define BLOCK_SIZE 32
#if !defined(BITDEPTH) || BITDEPTH == 8
#define SCALING_SIZE 256
typedef int8_t entry;
#else
#define SCALING_SIZE 4096
typedef int16_t entry;
#endif
#define decl_generate_grain_y_fn(name) \
void (name)(entry buf[][GRAIN_WIDTH], \
const Dav1dFilmGrainData *const data HIGHBD_DECL_SUFFIX)
typedef decl_generate_grain_y_fn(*generate_grain_y_fn);
#define decl_generate_grain_uv_fn(name) \
void (name)(entry buf[][GRAIN_WIDTH], \
const entry buf_y[][GRAIN_WIDTH], \
const Dav1dFilmGrainData *const data, const int uv HIGHBD_DECL_SUFFIX)
typedef decl_generate_grain_uv_fn(*generate_grain_uv_fn);
#define decl_fgy_32x32xn_fn(name) \
void (name)(pixel *dst_row, const pixel *src_row, ptrdiff_t stride, \
const Dav1dFilmGrainData *data, \
size_t pw, const uint8_t scaling[SCALING_SIZE], \
const entry grain_lut[][GRAIN_WIDTH], \
int bh, int row_num HIGHBD_DECL_SUFFIX)
typedef decl_fgy_32x32xn_fn(*fgy_32x32xn_fn);
#define decl_fguv_32x32xn_fn(name) \
void (name)(pixel *dst_row, const pixel *src_row, ptrdiff_t stride, \
const Dav1dFilmGrainData *data, int pw, \
const uint8_t scaling[SCALING_SIZE], \
const entry grain_lut[][GRAIN_WIDTH], int bh, int row_num, \
const pixel *luma_row, ptrdiff_t luma_stride, \
int uv_pl, int is_id HIGHBD_DECL_SUFFIX)
typedef decl_fguv_32x32xn_fn(*fguv_32x32xn_fn);
typedef struct Dav1dFilmGrainDSPContext {
generate_grain_y_fn generate_grain_y;
generate_grain_uv_fn generate_grain_uv[3];
fgy_32x32xn_fn fgy_32x32xn;
fguv_32x32xn_fn fguv_32x32xn[3];
} Dav1dFilmGrainDSPContext;
bitfn_decls(void dav1d_film_grain_dsp_init, Dav1dFilmGrainDSPContext *c);
bitfn_decls(void dav1d_film_grain_dsp_init_x86, Dav1dFilmGrainDSPContext *c);
#endif /* DAV1D_SRC_FILM_GRAIN_H */

View File

@ -26,39 +26,16 @@
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "config.h"
#include <assert.h>
#include <stdint.h>
#include "common.h"
#include "common/attributes.h"
#include "common/intops.h"
#include "common/bitdepth.h"
#include "tables.h"
#include "film_grain.h"
#include "tables.h"
#if BITDEPTH == 8
typedef int8_t entry;
#else
typedef int16_t entry;
#endif
#define SUB_GRAIN_WIDTH 44
#define SUB_GRAIN_HEIGHT 38
enum {
GRAIN_WIDTH = 82,
GRAIN_HEIGHT = 73,
SUB_GRAIN_WIDTH = 44,
SUB_GRAIN_HEIGHT = 38,
SUB_GRAIN_OFFSET = 6,
BLOCK_SIZE = 32,
#if BITDEPTH == 8
SCALING_SIZE = 256
#else
SCALING_SIZE = 4096
#endif
};
static inline int get_random_number(const int bits, unsigned *state) {
static inline int get_random_number(const int bits, unsigned *const state) {
const int r = *state;
unsigned bit = ((r >> 0) ^ (r >> 1) ^ (r >> 3) ^ (r >> 12)) & 1;
*state = (r >> 1) | (bit << 15);
@ -70,13 +47,14 @@ static inline int round2(const int x, const int shift) {
return (x + ((1 << shift) >> 1)) >> shift;
}
static void generate_grain_y(const Dav1dPicture *const in,
entry buf[GRAIN_HEIGHT][GRAIN_WIDTH])
static void generate_grain_y_c(entry buf[][GRAIN_WIDTH],
const Dav1dFilmGrainData *const data
HIGHBD_DECL_SUFFIX)
{
const Dav1dFilmGrainData *data = &in->frame_hdr->film_grain.data;
const int bitdepth_min_8 = bitdepth_from_max(bitdepth_max) - 8;
unsigned seed = data->seed;
const int shift = 12 - in->p.bpc + data->grain_scale_shift;
const int grain_ctr = 128 << (in->p.bpc - 8);
const int shift = 4 - bitdepth_min_8 + data->grain_scale_shift;
const int grain_ctr = 128 << bitdepth_min_8;
const int grain_min = -grain_ctr, grain_max = grain_ctr - 1;
for (int y = 0; y < GRAIN_HEIGHT; y++) {
@ -101,25 +79,24 @@ static void generate_grain_y(const Dav1dPicture *const in,
}
}
int grain = buf[y][x] + round2(sum, data->ar_coeff_shift);
const int grain = buf[y][x] + round2(sum, data->ar_coeff_shift);
buf[y][x] = iclip(grain, grain_min, grain_max);
}
}
}
static void generate_grain_uv(const Dav1dPicture *const in, int uv,
entry buf[GRAIN_HEIGHT][GRAIN_WIDTH],
entry buf_y[GRAIN_HEIGHT][GRAIN_WIDTH])
static NOINLINE void
generate_grain_uv_c(entry buf[][GRAIN_WIDTH],
const entry buf_y[][GRAIN_WIDTH],
const Dav1dFilmGrainData *const data, const int uv,
const int subx, const int suby HIGHBD_DECL_SUFFIX)
{
const Dav1dFilmGrainData *data = &in->frame_hdr->film_grain.data;
const int bitdepth_min_8 = bitdepth_from_max(bitdepth_max) - 8;
unsigned seed = data->seed ^ (uv ? 0x49d8 : 0xb524);
const int shift = 12 - in->p.bpc + data->grain_scale_shift;
const int grain_ctr = 128 << (in->p.bpc - 8);
const int shift = 4 - bitdepth_min_8 + data->grain_scale_shift;
const int grain_ctr = 128 << bitdepth_min_8;
const int grain_min = -grain_ctr, grain_max = grain_ctr - 1;
const int subx = in->p.layout != DAV1D_PIXEL_LAYOUT_I444;
const int suby = in->p.layout == DAV1D_PIXEL_LAYOUT_I420;
const int chromaW = subx ? SUB_GRAIN_WIDTH : GRAIN_WIDTH;
const int chromaH = suby ? SUB_GRAIN_HEIGHT : GRAIN_HEIGHT;
@ -167,40 +144,18 @@ static void generate_grain_uv(const Dav1dPicture *const in, int uv,
}
}
static void generate_scaling(const int bitdepth,
const uint8_t points[][2], int num,
uint8_t scaling[SCALING_SIZE])
{
const int shift_x = bitdepth - 8;
const int scaling_size = 1 << bitdepth;
// Fill up the preceding entries with the initial value
for (int i = 0; i < points[0][0] << shift_x; i++)
scaling[i] = points[0][1];
// Linearly interpolate the values in the middle
for (int i = 0; i < num - 1; i++) {
const int bx = points[i][0] << shift_x;
const int by = points[i][1];
const int ex = points[i+1][0] << shift_x;
const int ey = points[i+1][1];
const int dx = ex - bx;
const int dy = ey - by;
const int delta = dy * ((0xFFFF + (dx >> 1))) / dx;
for (int x = 0; x < dx; x++) {
const int v = by + ((x * delta + 0x8000) >> 16);
scaling[bx + x] = v;
}
}
// Fill up the remaining entries with the final value
for (int i = points[num - 1][0] << shift_x; i < scaling_size; i++)
scaling[i] = points[num - 1][1];
#define gnuv_ss_fn(nm, ss_x, ss_y) \
static decl_generate_grain_uv_fn(generate_grain_uv_##nm##_c) { \
generate_grain_uv_c(buf, buf_y, data, uv, ss_x, ss_y HIGHBD_TAIL_SUFFIX); \
}
gnuv_ss_fn(420, 1, 1);
gnuv_ss_fn(422, 1, 0);
gnuv_ss_fn(444, 0, 0);
// samples from the correct block of a grain LUT, while taking into account the
// offsets provided by the offsets cache
static inline entry sample_lut(entry grain_lut[GRAIN_HEIGHT][GRAIN_WIDTH],
static inline entry sample_lut(const entry grain_lut[][GRAIN_WIDTH],
int offsets[2][2], int subx, int suby,
int bx, int by, int x, int y)
{
@ -211,13 +166,15 @@ static inline entry sample_lut(entry grain_lut[GRAIN_HEIGHT][GRAIN_WIDTH],
[offx + x + (BLOCK_SIZE >> subx) * bx];
}
static void apply_to_row_y(Dav1dPicture *const out, const Dav1dPicture *const in,
entry grain_lut[GRAIN_HEIGHT][GRAIN_WIDTH],
uint8_t scaling[SCALING_SIZE], int row_num)
static void fgy_32x32xn_c(pixel *const dst_row, const pixel *const src_row,
const ptrdiff_t stride,
const Dav1dFilmGrainData *const data, const size_t pw,
const uint8_t scaling[SCALING_SIZE],
const entry grain_lut[][GRAIN_WIDTH],
const int bh, const int row_num HIGHBD_DECL_SUFFIX)
{
const Dav1dFilmGrainData *const data = &out->frame_hdr->film_grain.data;
const int rows = 1 + (data->overlap_flag && row_num > 0);
const int bitdepth_min_8 = in->p.bpc - 8;
const int bitdepth_min_8 = bitdepth_from_max(bitdepth_max) - 8;
const int grain_ctr = 128 << bitdepth_min_8;
const int grain_min = -grain_ctr, grain_max = grain_ctr - 1;
@ -227,7 +184,11 @@ static void apply_to_row_y(Dav1dPicture *const out, const Dav1dPicture *const in
max_value = 235 << bitdepth_min_8;
} else {
min_value = 0;
max_value = (1U << in->p.bpc) - 1;
#if BITDEPTH == 8
max_value = 0xff;
#else
max_value = bitdepth_max;
#endif
}
// seed[0] contains the current row, seed[1] contains the previous
@ -238,18 +199,13 @@ static void apply_to_row_y(Dav1dPicture *const out, const Dav1dPicture *const in
seed[i] ^= (((row_num - i) * 173 + 105) & 0xFF);
}
const ptrdiff_t stride = out->stride[0];
assert(stride % (BLOCK_SIZE * sizeof(pixel)) == 0);
assert(stride == in->stride[0]);
pixel *const src_row = (pixel *) in->data[0] + PXSTRIDE(stride) * row_num * BLOCK_SIZE;
pixel *const dst_row = (pixel *) out->data[0] + PXSTRIDE(stride) * row_num * BLOCK_SIZE;
int offsets[2 /* col offset */][2 /* row offset */];
// process this row in BLOCK_SIZE^2 blocks
const int bh = imin(out->p.h - row_num * BLOCK_SIZE, BLOCK_SIZE);
for (int bx = 0; bx < out->p.w; bx += BLOCK_SIZE) {
const int bw = imin(BLOCK_SIZE, out->p.w - bx);
for (unsigned bx = 0; bx < pw; bx += BLOCK_SIZE) {
const int bw = imin(BLOCK_SIZE, (int) pw - bx);
if (data->overlap_flag && bx) {
// shift previous offsets left
@ -268,9 +224,9 @@ static void apply_to_row_y(Dav1dPicture *const out, const Dav1dPicture *const in
static const int w[2][2] = { { 27, 17 }, { 17, 27 } };
#define add_noise_y(x, y, grain) \
pixel *src = src_row + (y) * PXSTRIDE(stride) + (bx + (x)); \
pixel *dst = dst_row + (y) * PXSTRIDE(stride) + (bx + (x)); \
int noise = round2(scaling[ *src ] * (grain), data->scaling_shift); \
const pixel *const src = src_row + (y) * PXSTRIDE(stride) + (x) + bx; \
pixel *const dst = dst_row + (y) * PXSTRIDE(stride) + (x) + bx; \
const int noise = round2(scaling[ *src ] * (grain), data->scaling_shift); \
*dst = iclip(*src + noise, min_value, max_value);
for (int y = ystart; y < bh; y++) {
@ -323,33 +279,33 @@ static void apply_to_row_y(Dav1dPicture *const out, const Dav1dPicture *const in
}
}
static void apply_to_row_uv(Dav1dPicture *const out, const Dav1dPicture *const in,
entry grain_lut[GRAIN_HEIGHT][GRAIN_WIDTH],
uint8_t scaling[SCALING_SIZE], int uv, int row_num)
static NOINLINE void
fguv_32x32xn_c(pixel *const dst_row, const pixel *const src_row,
const ptrdiff_t stride, const Dav1dFilmGrainData *const data,
const int pw, const uint8_t scaling[SCALING_SIZE],
const entry grain_lut[][GRAIN_WIDTH], const int bh,
const int row_num, const pixel *const luma_row,
const ptrdiff_t luma_stride, const int uv, const int is_id,
const int sx, const int sy HIGHBD_DECL_SUFFIX)
{
const Dav1dFilmGrainData *const data = &out->frame_hdr->film_grain.data;
const int rows = 1 + (data->overlap_flag && row_num > 0);
const int bitdepth_max = (1 << in->p.bpc) - 1;
const int bitdepth_min_8 = in->p.bpc - 8;
const int bitdepth_min_8 = bitdepth_from_max(bitdepth_max) - 8;
const int grain_ctr = 128 << bitdepth_min_8;
const int grain_min = -grain_ctr, grain_max = grain_ctr - 1;
int min_value, max_value;
if (data->clip_to_restricted_range) {
min_value = 16 << bitdepth_min_8;
if (out->seq_hdr->mtrx == DAV1D_MC_IDENTITY) {
max_value = 235 << bitdepth_min_8;
} else {
max_value = 240 << bitdepth_min_8;
}
max_value = (is_id ? 235 : 240) << bitdepth_min_8;
} else {
min_value = 0;
#if BITDEPTH == 8
max_value = 0xff;
#else
max_value = bitdepth_max;
#endif
}
const int sx = in->p.layout != DAV1D_PIXEL_LAYOUT_I444;
const int sy = in->p.layout == DAV1D_PIXEL_LAYOUT_I420;
// seed[0] contains the current row, seed[1] contains the previous
unsigned seed[2];
for (int i = 0; i < rows; i++) {
@ -358,21 +314,13 @@ static void apply_to_row_uv(Dav1dPicture *const out, const Dav1dPicture *const i
seed[i] ^= (((row_num - i) * 173 + 105) & 0xFF);
}
const ptrdiff_t stride = out->stride[1];
assert(stride % (BLOCK_SIZE * sizeof(pixel)) == 0);
assert(stride == in->stride[1]);
const int by = row_num * (BLOCK_SIZE >> sy);
pixel *const dst_row = (pixel *) out->data[1 + uv] + PXSTRIDE(stride) * by;
pixel *const src_row = (pixel *) in->data[1 + uv] + PXSTRIDE(stride) * by;
pixel *const luma_row = (pixel *) out->data[0] + PXSTRIDE(out->stride[0]) * row_num * BLOCK_SIZE;
int offsets[2 /* col offset */][2 /* row offset */];
// process this row in BLOCK_SIZE^2 blocks (subsampled)
const int bh = (imin(out->p.h - row_num * BLOCK_SIZE, BLOCK_SIZE) + sy) >> sy;
for (int bx = 0; bx < (out->p.w + sx) >> sx; bx += BLOCK_SIZE >> sx) {
const int bw = (imin(BLOCK_SIZE, out->p.w - (bx << sx)) + sx) >> sx;
for (int bx = 0; bx < pw; bx += BLOCK_SIZE >> sx) {
const int bw = imin(BLOCK_SIZE >> sx, pw - bx);
if (data->overlap_flag && bx) {
// shift previous offsets left
for (int i = 0; i < rows; i++)
@ -395,22 +343,20 @@ static void apply_to_row_uv(Dav1dPicture *const out, const Dav1dPicture *const i
#define add_noise_uv(x, y, grain) \
const int lx = (bx + x) << sx; \
const int ly = y << sy; \
pixel *luma = luma_row + ly * PXSTRIDE(out->stride[0]) + lx; \
const pixel *const luma = luma_row + ly * PXSTRIDE(luma_stride) + lx; \
pixel avg = luma[0]; \
if (sx && lx + 1 < out->p.w) \
if (sx) \
avg = (avg + luma[1] + 1) >> 1; \
\
pixel *src = src_row + (y) * PXSTRIDE(stride) + (bx + (x)); \
pixel *dst = dst_row + (y) * PXSTRIDE(stride) + (bx + (x)); \
const pixel *const src = src_row + (y) * PXSTRIDE(stride) + (bx + (x)); \
pixel *const dst = dst_row + (y) * PXSTRIDE(stride) + (bx + (x)); \
int val = avg; \
if (!data->chroma_scaling_from_luma) { \
int combined = avg * data->uv_luma_mult[uv] + \
const int combined = avg * data->uv_luma_mult[uv] + \
*src * data->uv_mult[uv]; \
val = iclip_pixel( (combined >> 6) + \
(data->uv_offset[uv] * (1 << bitdepth_min_8)) ); \
} \
\
int noise = round2(scaling[ val ] * (grain), data->scaling_shift); \
const int noise = round2(scaling[ val ] * (grain), data->scaling_shift); \
*dst = iclip(*src + noise, min_value, max_value);
for (int y = ystart; y < bh; y++) {
@ -463,61 +409,29 @@ static void apply_to_row_uv(Dav1dPicture *const out, const Dav1dPicture *const i
}
}
void bitfn(dav1d_apply_grain)(Dav1dPicture *const out,
const Dav1dPicture *const in)
{
const Dav1dFilmGrainData *const data = &out->frame_hdr->film_grain.data;
entry grain_lut[3][GRAIN_HEIGHT][GRAIN_WIDTH];
uint8_t scaling[3][SCALING_SIZE];
// Generate grain LUTs as needed
generate_grain_y(out, grain_lut[0]); // always needed
if (data->num_uv_points[0] || data->chroma_scaling_from_luma)
generate_grain_uv(out, 0, grain_lut[1], grain_lut[0]);
if (data->num_uv_points[1] || data->chroma_scaling_from_luma)
generate_grain_uv(out, 1, grain_lut[2], grain_lut[0]);
// Generate scaling LUTs as needed
if (data->num_y_points)
generate_scaling(in->p.bpc, data->y_points, data->num_y_points, scaling[0]);
if (data->num_uv_points[0])
generate_scaling(in->p.bpc, data->uv_points[0], data->num_uv_points[0], scaling[1]);
if (data->num_uv_points[1])
generate_scaling(in->p.bpc, data->uv_points[1], data->num_uv_points[1], scaling[2]);
// Copy over the non-modified planes
// TODO: eliminate in favor of per-plane refs
if (!data->num_y_points) {
assert(out->stride[0] == in->stride[0]);
memcpy(out->data[0], in->data[0], out->p.h * out->stride[0]);
}
if (in->p.layout != DAV1D_PIXEL_LAYOUT_I400) {
for (int i = 0; i < 2; i++) {
if (!data->num_uv_points[i] && !data->chroma_scaling_from_luma) {
const int suby = in->p.layout == DAV1D_PIXEL_LAYOUT_I420;
assert(out->stride[1] == in->stride[1]);
memcpy(out->data[1+i], in->data[1+i],
(out->p.h >> suby) * out->stride[1]);
}
}
}
// Synthesize grain for the affected planes
int rows = (out->p.h + 31) >> 5;
for (int row = 0; row < rows; row++) {
if (data->num_y_points)
apply_to_row_y(out, in, grain_lut[0], scaling[0], row);
if (data->chroma_scaling_from_luma) {
apply_to_row_uv(out, in, grain_lut[1], scaling[0], 0, row);
apply_to_row_uv(out, in, grain_lut[2], scaling[0], 1, row);
} else {
if (data->num_uv_points[0])
apply_to_row_uv(out, in, grain_lut[1], scaling[1], 0, row);
if (data->num_uv_points[1])
apply_to_row_uv(out, in, grain_lut[2], scaling[2], 1, row);
}
}
/* Stamps out one chroma film-grain application wrapper per layout around
 * fguv_32x32xn_c, passing the subsampling factors (ss_x, ss_y) as
 * compile-time constants. */
#define fguv_ss_fn(nm, ss_x, ss_y) \
static decl_fguv_32x32xn_fn(fguv_32x32xn_##nm##_c) { \
fguv_32x32xn_c(dst_row, src_row, stride, data, pw, scaling, grain_lut, bh, \
row_num, luma_row, luma_stride, uv_pl, is_id, ss_x, ss_y \
HIGHBD_TAIL_SUFFIX); \
}
// One instantiation per supported chroma layout: 4:2:0, 4:2:2 and 4:4:4.
fguv_ss_fn(420, 1, 1);
fguv_ss_fn(422, 1, 0);
fguv_ss_fn(444, 0, 0);
/* Populates the film-grain DSP function table for the current bitdepth
 * (selected via the bitfn() name-mangling macro) with the C reference
 * implementations, then invokes the x86-specific init — when built with
 * assembly — which may replace entries with optimized versions. */
COLD void bitfn(dav1d_film_grain_dsp_init)(Dav1dFilmGrainDSPContext *const c) {
c->generate_grain_y = generate_grain_y_c;
// Chroma tables are indexed by Dav1dPixelLayout minus 1 (I400 excluded).
c->generate_grain_uv[DAV1D_PIXEL_LAYOUT_I420 - 1] = generate_grain_uv_420_c;
c->generate_grain_uv[DAV1D_PIXEL_LAYOUT_I422 - 1] = generate_grain_uv_422_c;
c->generate_grain_uv[DAV1D_PIXEL_LAYOUT_I444 - 1] = generate_grain_uv_444_c;
c->fgy_32x32xn = fgy_32x32xn_c;
c->fguv_32x32xn[DAV1D_PIXEL_LAYOUT_I420 - 1] = fguv_32x32xn_420_c;
c->fguv_32x32xn[DAV1D_PIXEL_LAYOUT_I422 - 1] = fguv_32x32xn_422_c;
c->fguv_32x32xn[DAV1D_PIXEL_LAYOUT_I444 - 1] = fguv_32x32xn_444_c;
#if HAVE_ASM && ARCH_X86
bitfn(dav1d_film_grain_dsp_init_x86)(c);
#endif
}

View File

@ -27,8 +27,6 @@
#include "config.h"
#include <assert.h>
#include "common/intops.h"
#include "src/getbits.h"

View File

@ -42,6 +42,7 @@ typedef struct Dav1dTileContext Dav1dTileContext;
#include "src/cdf.h"
#include "src/data.h"
#include "src/env.h"
#include "src/film_grain.h"
#include "src/intra_edge.h"
#include "src/ipred.h"
#include "src/itx.h"
@ -57,6 +58,7 @@ typedef struct Dav1dTileContext Dav1dTileContext;
#include "src/thread.h"
typedef struct Dav1dDSPContext {
Dav1dFilmGrainDSPContext fg;
Dav1dIntraPredDSPContext ipred;
Dav1dMCDSPContext mc;
Dav1dInvTxfmDSPContext itx;
@ -89,6 +91,8 @@ struct Dav1dContext {
Dav1dContentLightLevel *content_light;
Dav1dRef *mastering_display_ref;
Dav1dMasteringDisplay *mastering_display;
Dav1dRef *itut_t35_ref;
Dav1dITUTT35 *itut_t35;
// decoded output picture queue
Dav1dData in;
@ -213,7 +217,7 @@ struct Dav1dFrameContext {
Av1Restoration *lr_mask;
int top_pre_cdef_toggle;
int mask_sz /* w*h */, lr_mask_sz, line_sz /* w */, lr_line_sz, re_sz /* h */;
Av1FilterLUT lim_lut;
ALIGN(Av1FilterLUT lim_lut, 16);
int last_sharpness;
uint8_t lvl[8 /* seg_id */][4 /* dir */][8 /* ref */][2 /* is_gmv */];
uint8_t *tx_lpf_right_edge[2];
@ -233,20 +237,21 @@ struct Dav1dFrameContext {
pthread_cond_t cond, icond;
int tasks_left, num_tasks;
int (*task_idx_to_sby_and_tile_idx)[2];
int titsati_sz, titsati_init[3];
int titsati_sz, titsati_init[2];
uint16_t titsati_index_rows[1 + DAV1D_MAX_TILE_ROWS];
int inited;
} tile_thread;
};
struct Dav1dTileState {
CdfContext cdf;
MsacContext msac;
struct {
int col_start, col_end, row_start, row_end; // in 4px units
int col, row; // in tile units
} tiling;
CdfContext cdf;
MsacContext msac;
atomic_int progress; // in sby units, TILE_ERROR after a decoding error
struct {
pthread_mutex_t lock;
@ -298,6 +303,18 @@ struct Dav1dTileContext {
uint16_t emu_edge_16bpc[320 * (256 + 7)];
};
};
struct {
union {
uint8_t levels[32 * 34];
struct {
uint8_t pal_order[64][8];
uint8_t pal_ctx[64];
};
};
int16_t ac[32 * 32];
uint8_t pal_idx[2 * 64 * 64];
uint16_t pal[3 /* plane */][8 /* palette_idx */];
ALIGN(union, 32) {
struct {
uint8_t interintra_8bpc[64 * 64];
uint8_t edge_8bpc[257];
@ -306,18 +323,8 @@ struct Dav1dTileContext {
uint16_t interintra_16bpc[64 * 64];
uint16_t edge_16bpc[257];
};
struct {
uint8_t pal_idx[2 * 64 * 64];
union {
struct {
uint8_t pal_order[64][8];
uint8_t pal_ctx[64];
};
uint8_t levels[36 * 36];
};
uint16_t pal[3 /* plane */][8 /* palette_idx */];
};
int16_t ac[32 * 32];
} scratch;
Dav1dWarpedMotionParams warpmv;

View File

@ -27,9 +27,10 @@
#include "config.h"
#include <assert.h>
#include <stdlib.h>
#include "common/attributes.h"
#include "src/intra_edge.h"
#include "src/levels.h"

View File

@ -27,7 +27,6 @@
#include "config.h"
#include <assert.h>
#include <stdint.h>
#include <string.h>

View File

@ -27,7 +27,6 @@
#include "config.h"
#include <assert.h>
#include <stdlib.h>
#include <string.h>

View File

@ -27,7 +27,6 @@
#include "config.h"
#include <assert.h>
#include <stddef.h>
#include <stdint.h>
#include <string.h>

View File

@ -109,17 +109,6 @@ enum TxfmType {
N_TX_TYPES_PLUS_LL,
};
enum TxfmTypeSet {
TXTP_SET_DCT,
TXTP_SET_DCT_ID,
TXTP_SET_DT4_ID,
TXTP_SET_DT4_ID_1D,
TXTP_SET_DT9_ID_1D,
TXTP_SET_ALL,
TXTP_SET_LOSSLESS,
N_TXTP_SETS
};
enum TxClass {
TX_CLASS_2D,
TX_CLASS_H,

View File

@ -27,7 +27,6 @@
#include "config.h"
#include <assert.h>
#include <string.h>
#include "common/intops.h"

View File

@ -27,7 +27,6 @@
#include "config.h"
#include <assert.h>
#include <string.h>
#include "common/intops.h"

View File

@ -37,6 +37,7 @@
#include "common/mem.h"
#include "common/validate.h"
#include "src/fg_apply.h"
#include "src/internal.h"
#include "src/log.h"
#include "src/obu.h"
@ -44,12 +45,12 @@
#include "src/ref.h"
#include "src/thread_task.h"
#include "src/wedge.h"
#include "src/film_grain.h"
static COLD void init_internal(void) {
dav1d_init_wedge_masks();
dav1d_init_interintra_masks();
dav1d_init_qm_tables();
dav1d_init_thread();
}
COLD const char *dav1d_version(void) {
@ -289,13 +290,13 @@ static int output_image(Dav1dContext *const c, Dav1dPicture *const out,
switch (out->p.bpc) {
#if CONFIG_8BPC
case 8:
dav1d_apply_grain_8bpc(out, in);
dav1d_apply_grain_8bpc(&c->dsp[0].fg, out, in);
break;
#endif
#if CONFIG_16BPC
case 10:
case 12:
dav1d_apply_grain_16bpc(out, in);
dav1d_apply_grain_16bpc(&c->dsp[(out->p.bpc >> 1) - 4].fg, out, in);
break;
#endif
default:
@ -409,8 +410,10 @@ void dav1d_flush(Dav1dContext *const c) {
c->mastering_display = NULL;
c->content_light = NULL;
c->itut_t35 = NULL;
dav1d_ref_dec(&c->mastering_display_ref);
dav1d_ref_dec(&c->content_light_ref);
dav1d_ref_dec(&c->itut_t35_ref);
if (c->n_fc == 1) return;
@ -499,7 +502,7 @@ static COLD void close_internal(Dav1dContext **const c_out, int flush) {
pthread_cond_destroy(&ts->tile_thread.cond);
pthread_mutex_destroy(&ts->tile_thread.lock);
}
free(f->ts);
dav1d_free_aligned(f->ts);
dav1d_free_aligned(f->tc);
dav1d_free_aligned(f->ipred_edge[0]);
free(f->a);
@ -535,6 +538,7 @@ static COLD void close_internal(Dav1dContext **const c_out, int flush) {
dav1d_ref_dec(&c->mastering_display_ref);
dav1d_ref_dec(&c->content_light_ref);
dav1d_ref_dec(&c->itut_t35_ref);
dav1d_freep_aligned(c_out);
}

View File

@ -172,8 +172,8 @@ static void wiener_c(pixel *p, const ptrdiff_t p_stride,
const int round_bits_v = 11 - (bitdepth == 12) * 2;
const int rounding_off_v = 1 << (round_bits_v - 1);
const int round_offset = 1 << (bitdepth + (round_bits_v - 1));
for (int i = 0; i < w; i++) {
for (int j = 0; j < h; j++) {
for (int i = 0; i < w; i++) {
int sum = (hor[(j + 3) * REST_UNIT_STRIDE + i] << 7) - round_offset;
for (int k = 0; k < 7; k++) {

View File

@ -27,7 +27,6 @@
#include "config.h"
#include <assert.h>
#include <stdlib.h>
#include <string.h>
@ -906,6 +905,7 @@ static void resize_c(pixel *dst, const ptrdiff_t dst_stride,
src_x += mx >> 14;
mx &= 0x3fff;
}
if (dst_w & 1) dst[dst_w] = dst[dst_w - 1];
dst += PXSTRIDE(dst_stride);
src += PXSTRIDE(src_stride);

View File

@ -55,6 +55,7 @@ libdav1d_sources = files(
libdav1d_tmpl_sources = files(
'cdef_apply_tmpl.c',
'cdef_tmpl.c',
'fg_apply_tmpl.c',
'film_grain_tmpl.c',
'ipred_prepare_tmpl.c',
'ipred_tmpl.c',
@ -67,6 +68,10 @@ libdav1d_tmpl_sources = files(
'recon_tmpl.c',
)
libdav1d_arch_tmpl_sources = []
libdav1d_bitdepth_objs = []
# libdav1d entrypoint source files
# These source files contain library entry points and are
# built with the stack-realign flag set, where necessary.
@ -77,6 +82,8 @@ libdav1d_entrypoints_sources = files(
# ASM specific sources
libdav1d_nasm_objs = []
# Arch-specific flags
arch_flags = []
if is_asm_enabled
if (host_machine.cpu_family() == 'aarch64' or
host_machine.cpu_family().startswith('arm'))
@ -114,6 +121,7 @@ if is_asm_enabled
libdav1d_tmpl_sources += files(
'x86/cdef_init_tmpl.c',
'x86/film_grain_init_tmpl.c',
'x86/ipred_init_tmpl.c',
'x86/itx_init_tmpl.c',
'x86/loopfilter_init_tmpl.c',
@ -130,6 +138,7 @@ if is_asm_enabled
if dav1d_bitdepths.contains('8')
libdav1d_sources_asm += files(
'x86/cdef.asm',
'x86/film_grain.asm',
'x86/ipred.asm',
'x86/itx.asm',
'x86/loopfilter.asm',
@ -138,6 +147,7 @@ if is_asm_enabled
'x86/cdef_sse.asm',
'x86/ipred_ssse3.asm',
'x86/itx_ssse3.asm',
'x86/loopfilter_ssse3.asm',
'x86/looprestoration_ssse3.asm',
'x86/mc_ssse3.asm',
)
@ -151,9 +161,13 @@ if is_asm_enabled
# Compile the ASM sources with NASM
libdav1d_nasm_objs = nasm_gen.process(libdav1d_sources_asm)
elif host_machine.cpu() == 'ppc64le'
arch_flags = ['-maltivec', '-mvsx']
libdav1d_sources += files(
'ppc/cpu.c',
)
libdav1d_arch_tmpl_sources += files(
'ppc/cdef_init_tmpl.c',
)
endif
endif
@ -223,6 +237,19 @@ foreach bitdepth : dav1d_bitdepths
).extract_all_objects()
endforeach
# Helper library for each bitdepth and architecture-specific flags
foreach bitdepth : dav1d_bitdepths
libdav1d_bitdepth_objs += static_library(
'dav1d_arch_bitdepth_@0@'.format(bitdepth),
libdav1d_arch_tmpl_sources, config_h_target,
include_directories: dav1d_inc_dirs,
dependencies : [stdatomic_dependency],
c_args : ['-DBITDEPTH=@0@'.format(bitdepth)] + stackalign_flag + arch_flags,
install : false,
build_by_default : false,
).extract_all_objects()
endforeach
# The final dav1d library
if host_machine.system() == 'windows'
dav1d_soversion = ''

View File

@ -116,42 +116,39 @@ int dav1d_msac_decode_subexp(MsacContext *const s, const int ref,
/* Decodes a symbol given an inverse cumulative distribution function (CDF)
* table in Q15. */
static unsigned decode_symbol(MsacContext *const s, const uint16_t *const cdf,
unsigned dav1d_msac_decode_symbol_adapt_c(MsacContext *const s,
uint16_t *const cdf,
const size_t n_symbols)
{
const unsigned c = s->dif >> (EC_WIN_SIZE - 16);
unsigned u, v = s->rng, r = s->rng >> 8, ret = 0;
const unsigned c = s->dif >> (EC_WIN_SIZE - 16), r = s->rng >> 8;
unsigned u, v = s->rng, val = -1;
assert(!cdf[n_symbols - 1]);
assert(n_symbols <= 15);
assert(cdf[n_symbols] <= 32);
do {
val++;
u = v;
v = r * (cdf[ret++] >> EC_PROB_SHIFT);
v = r * (cdf[val] >> EC_PROB_SHIFT);
v >>= 7 - EC_PROB_SHIFT;
v += EC_MIN_PROB * (int) (n_symbols - ret);
v += EC_MIN_PROB * ((unsigned)n_symbols - val);
} while (c < v);
assert(u <= s->rng);
ctx_norm(s, s->dif - ((ec_win)v << (EC_WIN_SIZE - 16)), u - v);
return ret - 1;
}
unsigned dav1d_msac_decode_symbol_adapt_c(MsacContext *const s,
uint16_t *const cdf,
const size_t n_symbols)
{
const unsigned val = decode_symbol(s, cdf, n_symbols);
if (s->allow_update_cdf) {
const unsigned count = cdf[n_symbols];
const int rate = ((count >> 4) | 4) + (n_symbols > 3);
const unsigned rate = 4 + (count >> 4) + (n_symbols > 2);
unsigned i;
for (i = 0; i < val; i++)
cdf[i] += (32768 - cdf[i]) >> rate;
for (; i < n_symbols - 1; i++)
for (; i < n_symbols; i++)
cdf[i] -= cdf[i] >> rate;
cdf[n_symbols] = count + (count < 32);
}
return val;
}
@ -163,7 +160,7 @@ unsigned dav1d_msac_decode_bool_adapt_c(MsacContext *const s,
if (s->allow_update_cdf) {
// update_cdf() specialized for boolean CDFs
const unsigned count = cdf[1];
const int rate = (count >> 4) | 4;
const int rate = 4 + (count >> 4);
if (bit)
cdf[0] += (32768 - cdf[0]) >> rate;
else
@ -174,6 +171,22 @@ unsigned dav1d_msac_decode_bool_adapt_c(MsacContext *const s,
return bit;
}
/* Decodes the "high token" (coefficient magnitude continuation) as up to
 * four reads from the same adapted 4-symbol CDF. Each read yields a value
 * in [0, 3]; a 3 means "continue" and triggers another read, capped at
 * four reads total, so the returned token lies in the range [3, 15]. */
unsigned dav1d_msac_decode_hi_tok_c(MsacContext *const s, uint16_t *const cdf) {
    unsigned base = 3, br;
    for (;;) {
        br = dav1d_msac_decode_symbol_adapt4(s, cdf, 3);
        // Stop on a non-continue symbol, or unconditionally after the
        // fourth read (base has reached its 12 cap by then).
        if (br < 3 || base == 12)
            break;
        base += 3;
    }
    return base + br;
}
void dav1d_msac_init(MsacContext *const s, const uint8_t *const data,
const size_t sz, const int disable_cdf_update_flag)
{

View File

@ -28,10 +28,11 @@
#ifndef DAV1D_SRC_MSAC_H
#define DAV1D_SRC_MSAC_H
#include <assert.h>
#include <stdint.h>
#include <stdlib.h>
#include "common/attributes.h"
typedef size_t ec_win;
typedef struct MsacContext {
@ -58,9 +59,10 @@ unsigned dav1d_msac_decode_symbol_adapt_c(MsacContext *s, uint16_t *cdf,
unsigned dav1d_msac_decode_bool_adapt_c(MsacContext *s, uint16_t *cdf);
unsigned dav1d_msac_decode_bool_equi_c(MsacContext *s);
unsigned dav1d_msac_decode_bool_c(MsacContext *s, unsigned f);
unsigned dav1d_msac_decode_hi_tok_c(MsacContext *s, uint16_t *cdf);
int dav1d_msac_decode_subexp(MsacContext *s, int ref, int n, unsigned k);
/* Supported n_symbols ranges: adapt4: 1-5, adapt8: 1-8, adapt16: 4-16 */
/* Supported n_symbols ranges: adapt4: 1-4, adapt8: 1-7, adapt16: 3-15 */
#ifndef dav1d_msac_decode_symbol_adapt4
#define dav1d_msac_decode_symbol_adapt4 dav1d_msac_decode_symbol_adapt_c
#endif
@ -79,6 +81,9 @@ int dav1d_msac_decode_subexp(MsacContext *s, int ref, int n, unsigned k);
#ifndef dav1d_msac_decode_bool
#define dav1d_msac_decode_bool dav1d_msac_decode_bool_c
#endif
#ifndef dav1d_msac_decode_hi_tok
#define dav1d_msac_decode_hi_tok dav1d_msac_decode_hi_tok_c
#endif
static inline unsigned dav1d_msac_decode_bools(MsacContext *const s, unsigned n) {
unsigned v = 0;

View File

@ -27,7 +27,6 @@
#include "config.h"
#include <assert.h>
#include <errno.h>
#include <limits.h>
#include <stdio.h>
@ -299,9 +298,10 @@ static int read_frame_size(Dav1dContext *const c, GetBits *const gb,
Dav1dThreadPicture *const ref =
&c->refs[c->frame_hdr->refidx[i]].p;
if (!ref->p.data[0]) return -1;
// FIXME render_* may be wrong
hdr->render_width = hdr->width[1] = ref->p.p.w;
hdr->render_height = hdr->height = ref->p.p.h;
hdr->width[1] = ref->p.p.w;
hdr->height = ref->p.p.h;
hdr->render_width = ref->p.frame_hdr->render_width;
hdr->render_height = ref->p.frame_hdr->render_height;
hdr->super_res.enabled = seqhdr->super_res && dav1d_get_bits(gb, 1);
if (hdr->super_res.enabled) {
const int d = hdr->super_res.width_scale_denominator =
@ -1275,8 +1275,10 @@ int dav1d_parse_obus(Dav1dContext *const c, Dav1dData *const in, int global) {
c->frame_hdr_ref = dav1d_ref_create(sizeof(Dav1dFrameHeader));
if (!c->frame_hdr_ref) return DAV1D_ERR(ENOMEM);
}
#ifndef NDEBUG
// ensure that the reference is writable
assert(dav1d_ref_is_writable(c->frame_hdr_ref));
#endif
c->frame_hdr = c->frame_hdr_ref->data;
memset(c->frame_hdr, 0, sizeof(*c->frame_hdr));
c->frame_hdr->temporal_id = temporal_id;
@ -1364,10 +1366,12 @@ int dav1d_parse_obus(Dav1dContext *const c, Dav1dData *const in, int global) {
case OBU_METADATA: {
// obu metadta type field
const enum ObuMetaType meta_type = dav1d_get_uleb128(&gb);
const int meta_type_len = (dav1d_get_bits_pos(&gb) - init_bit_pos) >> 3;
if (gb.error) goto error;
Dav1dRef *ref;
Dav1dContentLightLevel *content_light;
Dav1dMasteringDisplay *mastering_display;
Dav1dITUTT35 *itut_t35_metadata;
switch (meta_type) {
case OBU_META_HDR_CLL:
@ -1420,7 +1424,47 @@ int dav1d_parse_obus(Dav1dContext *const c, Dav1dData *const in, int global) {
c->mastering_display_ref = ref;
break;
}
case OBU_META_ITUT_T35:
case OBU_META_ITUT_T35: {
int payload_size = len;
// Don't take into account all the trailing bits for payload_size
while (payload_size > 0 && !in->data[init_byte_pos + payload_size - 1])
payload_size--; // trailing_zero_bit x 8
payload_size--; // trailing_one_bit + trailing_zero_bit x 7
// Don't take into account meta_type bytes
payload_size -= meta_type_len;
int country_code_extension_byte = 0;
int country_code = dav1d_get_bits(&gb, 8);
payload_size--;
if (country_code == 0xFF) {
country_code_extension_byte = dav1d_get_bits(&gb, 8);
payload_size--;
}
if (payload_size <= 0) {
dav1d_log(c, "Malformed ITU-T T.35 metadata message format\n");
goto error;
}
ref = dav1d_ref_create(sizeof(Dav1dITUTT35) + payload_size * sizeof(uint8_t));
if (!ref) return DAV1D_ERR(ENOMEM);
itut_t35_metadata = ref->data;
// We need our public headers to be C++ compatible, so payload can't be
// a flexible array member
itut_t35_metadata->payload = (uint8_t *) &itut_t35_metadata[1];
itut_t35_metadata->country_code = country_code;
itut_t35_metadata->country_code_extension_byte = country_code_extension_byte;
for (int i = 0; i < payload_size; i++)
itut_t35_metadata->payload[i] = dav1d_get_bits(&gb, 8);
itut_t35_metadata->payload_size = payload_size;
dav1d_ref_dec(&c->itut_t35_ref);
c->itut_t35 = itut_t35_metadata;
c->itut_t35_ref = ref;
break;
}
case OBU_META_SCALABILITY:
case OBU_META_TIMECODE:
// ignore metadata OBUs we don't care about

View File

@ -27,7 +27,6 @@
#include "config.h"
#include <assert.h>
#include <errno.h>
#include <stdint.h>
#include <stdio.h>
@ -104,6 +103,7 @@ static int picture_alloc_with_edges(Dav1dContext *const c, Dav1dPicture *const p
Dav1dFrameHeader *frame_hdr, Dav1dRef *frame_hdr_ref,
Dav1dContentLightLevel *content_light, Dav1dRef *content_light_ref,
Dav1dMasteringDisplay *mastering_display, Dav1dRef *mastering_display_ref,
Dav1dITUTT35 *itut_t35, Dav1dRef *itut_t35_ref,
const int bpc, const Dav1dDataProps *props,
Dav1dPicAllocator *const p_allocator,
const size_t extra, void **const extra_ptr)
@ -125,6 +125,7 @@ static int picture_alloc_with_edges(Dav1dContext *const c, Dav1dPicture *const p
p->frame_hdr = frame_hdr;
p->content_light = content_light;
p->mastering_display = mastering_display;
p->itut_t35 = itut_t35;
p->p.layout = seq_hdr->layout;
p->p.bpc = bpc;
dav1d_data_props_set_defaults(&p->m);
@ -161,6 +162,9 @@ static int picture_alloc_with_edges(Dav1dContext *const c, Dav1dPicture *const p
p->mastering_display_ref = mastering_display_ref;
if (mastering_display_ref) dav1d_ref_inc(mastering_display_ref);
p->itut_t35_ref = itut_t35_ref;
if (itut_t35_ref) dav1d_ref_inc(itut_t35_ref);
return 0;
}
@ -176,11 +180,16 @@ int dav1d_thread_picture_alloc(Dav1dContext *const c, Dav1dFrameContext *const f
f->frame_hdr, f->frame_hdr_ref,
c->content_light, c->content_light_ref,
c->mastering_display, c->mastering_display_ref,
c->itut_t35, c->itut_t35_ref,
bpc, &f->tile[0].data.m, &c->allocator,
p->t != NULL ? sizeof(atomic_int) * 2 : 0,
(void **) &p->progress);
if (res) return res;
// Must be removed from the context after being attached to the frame
dav1d_ref_dec(&c->itut_t35_ref);
c->itut_t35 = NULL;
p->visible = f->frame_hdr->show_frame;
if (p->t) {
atomic_init(&p->progress[0], 0);
@ -198,6 +207,7 @@ int dav1d_picture_alloc_copy(Dav1dContext *const c, Dav1dPicture *const dst, con
src->frame_hdr, src->frame_hdr_ref,
src->content_light, src->content_light_ref,
src->mastering_display, src->mastering_display_ref,
src->itut_t35, src->itut_t35_ref,
src->p.bpc, &src->m, &pic_ctx->allocator,
0, NULL);
return res;
@ -216,6 +226,7 @@ void dav1d_picture_ref(Dav1dPicture *const dst, const Dav1dPicture *const src) {
if (src->m.user_data.ref) dav1d_ref_inc(src->m.user_data.ref);
if (src->content_light_ref) dav1d_ref_inc(src->content_light_ref);
if (src->mastering_display_ref) dav1d_ref_inc(src->mastering_display_ref);
if (src->itut_t35_ref) dav1d_ref_inc(src->itut_t35_ref);
}
*dst = *src;
}
@ -252,6 +263,7 @@ void dav1d_picture_unref_internal(Dav1dPicture *const p) {
dav1d_ref_dec(&p->m.user_data.ref);
dav1d_ref_dec(&p->content_light_ref);
dav1d_ref_dec(&p->mastering_display_ref);
dav1d_ref_dec(&p->itut_t35_ref);
}
memset(p, 0, sizeof(*p));
}

View File

@ -0,0 +1,488 @@
/*
* Copyright © 2019, Luca Barbato
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
* ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include <stdlib.h>
#include "common/bitdepth.h"
#include "common/intops.h"
#include "src/cdef.h"
#include "src/cpu.h"
#include "src/ppc/types.h"
#if BITDEPTH == 8
/* Vector (VSX) version of the CDEF constrain() function. For each 16-bit
 * lane it computes sign(diff) * min(|diff|, max(0, threshold - (|diff| >>
 * shift))) with shift = max(0, damping - ulog2(threshold)). A threshold of
 * 0 disables the tap entirely (all-zero result). */
static inline i16x8 vconstrain(const i16x8 diff, const int16_t threshold,
const int damping)
{
const i16x8 zero = vec_splat_s16(0);
if (!threshold) return zero;
const uint16_t shift = imax(0, damping - ulog2(threshold));
const i16x8 abs_diff = vec_abs(diff);
// Lanes where diff is negative; used to restore the sign at the end.
const b16x8 mask = vec_cmplt(diff, zero);
const i16x8 thr = vec_splats(threshold);
// threshold - (|diff| >> shift), clamped below at 0 ...
const i16x8 sub = vec_sub(thr, vec_sra(abs_diff, vec_splats(shift)));
const i16x8 max = vec_max(zero, sub);
// ... then capped at |diff| itself.
const i16x8 min = vec_min(abs_diff, max);
// Negated magnitude, selected on the originally-negative lanes.
const i16x8 neg = vec_sub(zero, min);
return vec_sel(min, neg, mask);
}
/* Gathers a 4-pixel-wide, h-tall block of 8-bit source pixels — plus a
 * 2-pixel border on every side — into the 16-bit tmp buffer, widening each
 * byte and padding with the INT16_MAX sentinel wherever a neighbouring
 * edge is unavailable (per `edges`). Rows in tmp are 8 uint16_t apart and
 * tmp points at the first centre row, so the two rows of top padding live
 * at negative offsets. NOTE(review): `w` and `tmp_stride` are unused here;
 * the width (4) and row stride (8) appear to be hard-coded — confirm
 * against the 8-wide variant below. */
static inline void copy4xN(uint16_t *tmp, const ptrdiff_t tmp_stride,
const uint8_t *src, const ptrdiff_t src_stride,
const uint8_t (*left)[2], uint8_t *const top[2],
const int w, const int h,
const enum CdefEdgeFlags edges)
{
const u16x8 fill = vec_splats((uint16_t)INT16_MAX);
u16x8 l0;
u16x8 l1;
// Row span that still needs left/right sentinel padding below.
int y_start = -2, y_end = h + 2;
// Copy top and bottom first
if (!(edges & CDEF_HAVE_TOP)) {
l0 = fill;
l1 = fill;
y_start = 0;
} else {
// top[0]/top[1] are the two rows above the block; -2 includes the
// left border columns.
l0 = u8h_to_u16(vec_vsx_ld(0, top[0] - 2));
l1 = u8h_to_u16(vec_vsx_ld(0, top[1] - 2));
}
vec_st(l0, 0, tmp - 2 * 8);
vec_st(l1, 0, tmp - 1 * 8);
if (!(edges & CDEF_HAVE_BOTTOM)) {
l0 = fill;
l1 = fill;
y_end -= 2;
} else {
l0 = u8h_to_u16(vec_vsx_ld(0, src - 2 + (h + 0) * src_stride));
l1 = u8h_to_u16(vec_vsx_ld(0, src - 2 + (h + 1) * src_stride));
}
vec_st(l0, 0, tmp + (h + 0) * 8);
vec_st(l1, 0, tmp + (h + 1) * 8);
// Widen and store the h centre rows (src - 2 includes the left border).
for (int y = 0; y < h; y++) {
u16x8 l = u8h_to_u16(vec_vsx_ld(0, src - 2 + y * src_stride));
vec_st(l, 0, tmp + y * 8);
}
// Left border (columns 0 and 1): real pixels from left[][] when
// available, otherwise the sentinel over the full padded row span.
if (!(edges & CDEF_HAVE_LEFT)) {
for (int y = y_start; y < y_end; y++) {
tmp[y * 8] = INT16_MAX;
tmp[1 + y * 8] = INT16_MAX;
}
} else {
for (int y = 0; y < h; y++) {
tmp[y * 8] = left[y][0];
tmp[1 + y * 8] = left[y][1];
}
}
// Right border (columns 6 and 7 of each row) when unavailable; the
// vector stores above already wrote real pixels there otherwise.
if (!(edges & CDEF_HAVE_RIGHT)) {
for (int y = y_start; y < y_end; y++) {
tmp[- 2 + (y + 1) * 8] = INT16_MAX;
tmp[- 1 + (y + 1) * 8] = INT16_MAX;
}
}
}
/* 8-pixel-wide counterpart of copy4xN: gathers an 8-wide, h-tall block of
 * 8-bit source pixels plus a 2-pixel border on every side into the 16-bit
 * tmp buffer, widening bytes via two half-vector stores per row and
 * padding unavailable edges (per `edges`) with the INT16_MAX sentinel.
 * Rows in tmp are 16 uint16_t apart and tmp points at the first centre
 * row, so the top padding lives at negative offsets. NOTE(review): `w`
 * and `tmp_stride` are unused; the width (8) and row stride (16) appear
 * to be hard-coded — confirm. */
static inline void copy8xN(uint16_t *tmp, const ptrdiff_t tmp_stride,
const uint8_t *src, const ptrdiff_t src_stride,
const uint8_t (*left)[2], uint8_t *const top[2],
const int w, const int h,
const enum CdefEdgeFlags edges)
{
const u16x8 fill = vec_splats((uint16_t)INT16_MAX);
u16x8 l0h, l0l;
u16x8 l1h, l1l;
// Row span that still needs left/right sentinel padding below.
int y_start = -2, y_end = h + 2;
// Copy top and bottom first
if (!(edges & CDEF_HAVE_TOP)) {
l0h = fill;
l0l = fill;
l1h = fill;
l1l = fill;
y_start = 0;
} else {
// top[0]/top[1] are the two rows above the block; -2 includes the
// left border; each 16-byte load is split into high/low u16 halves.
u8x16 l0 = vec_vsx_ld(0, top[0] - 2);
u8x16 l1 = vec_vsx_ld(0, top[1] - 2);
l0h = u8h_to_u16(l0);
l0l = u8l_to_u16(l0);
l1h = u8h_to_u16(l1);
l1l = u8l_to_u16(l1);
}
vec_st(l0h, 0, tmp - 4 * 8);
vec_st(l0l, 0, tmp - 3 * 8);
vec_st(l1h, 0, tmp - 2 * 8);
vec_st(l1l, 0, tmp - 1 * 8);
if (!(edges & CDEF_HAVE_BOTTOM)) {
l0h = fill;
l0l = fill;
l1h = fill;
l1l = fill;
y_end -= 2;
} else {
u8x16 l0 = vec_vsx_ld(0, src - 2 + (h + 0) * src_stride);
u8x16 l1 = vec_vsx_ld(0, src - 2 + (h + 1) * src_stride);
l0h = u8h_to_u16(l0);
l0l = u8l_to_u16(l0);
l1h = u8h_to_u16(l1);
l1l = u8l_to_u16(l1);
}
vec_st(l0h, 0, tmp + (h + 0) * 16);
vec_st(l0l, 0, tmp + (h + 0) * 16 + 8);
vec_st(l1h, 0, tmp + (h + 1) * 16);
vec_st(l1l, 0, tmp + (h + 1) * 16 + 8);
// Widen and store the h centre rows (src - 2 includes the left border).
for (int y = 0; y < h; y++) {
u8x16 l = vec_vsx_ld(0, src - 2 + y * src_stride);
u16x8 lh = u8h_to_u16(l);
u16x8 ll = u8l_to_u16(l);
vec_st(lh, 0, tmp + y * 16);
vec_st(ll, 0, tmp + 8 + y * 16);
}
// Left border (columns 0 and 1): real pixels from left[][] when
// available, otherwise the sentinel over the full padded row span.
if (!(edges & CDEF_HAVE_LEFT)) {
for (int y = y_start; y < y_end; y++) {
tmp[y * 16] = INT16_MAX;
tmp[1 + y * 16] = INT16_MAX;
}
} else {
for (int y = 0; y < h; y++) {
tmp[y * 16] = left[y][0];
tmp[1 + y * 16] = left[y][1];
}
}
// Right border (columns 10 and 11 of each row) when unavailable; the
// vector stores above already wrote real pixels there otherwise.
if (!(edges & CDEF_HAVE_RIGHT)) {
for (int y = y_start; y < y_end; y++) {
tmp[- 6 + (y + 1) * 16] = INT16_MAX;
tmp[- 5 + (y + 1) * 16] = INT16_MAX;
}
}
}
/* Lane-wise maximum that ignores lanes of `a` holding the INT16_MAX
 * padding sentinel (written by copy4xN/copy8xN for missing edges):
 * such lanes are substituted with the corresponding lane of `b`
 * before taking the maximum. */
static inline i16x8 max_mask(i16x8 a, i16x8 b) {
    const i16x8 sentinel = vec_splats((int16_t)INT16_MAX);
    const b16x8 is_pad = vec_cmpeq(a, sentinel);
    return vec_max(vec_sel(a, b, is_pad), b);
}
// Load the 8 current pixels (one 8-wide row) and initialize the running
// clamp range (min/max) and the filter accumulator (sum).
#define LOAD_PIX(addr) \
    const i16x8 px = (i16x8)vec_vsx_ld(0, addr); \
    i16x8 max = px; \
    i16x8 min = px; \
    i16x8 sum = vec_splat_s16(0);

// Same for 4-wide blocks: two consecutive 4-pixel rows are combined into
// a single 8-lane vector with vec_xxpermdi.
#define LOAD_PIX4(addr) \
    const i16x8 a = (i16x8)vec_vsx_ld(0, addr); \
    const i16x8 b = (i16x8)vec_vsx_ld(0, addr + tmp_stride); \
    const i16x8 px = vec_xxpermdi(a, b, 0); \
    i16x8 max = px; \
    i16x8 min = px; \
    i16x8 sum = vec_splat_s16(0);

// Load the four tap samples for one filter direction: +/-o0 (first tap
// pair) and +/-o1 (second tap pair), offsets in temp-buffer units.
#define LOAD_DIR(p, addr, o0, o1) \
    const i16x8 p ## 0 = (i16x8)vec_vsx_ld(0, addr + o0); \
    const i16x8 p ## 1 = (i16x8)vec_vsx_ld(0, addr - o0); \
    const i16x8 p ## 2 = (i16x8)vec_vsx_ld(0, addr + o1); \
    const i16x8 p ## 3 = (i16x8)vec_vsx_ld(0, addr - o1);

// Two-row variant of LOAD_DIR for 4-wide blocks (packs row y and y+1).
#define LOAD_DIR4(p, addr, o0, o1) \
    LOAD_DIR(p ## a, addr, o0, o1) \
    LOAD_DIR(p ## b, addr + tmp_stride, o0, o1) \
    const i16x8 p ## 0 = vec_xxpermdi(p ## a ## 0, p ## b ## 0, 0); \
    const i16x8 p ## 1 = vec_xxpermdi(p ## a ## 1, p ## b ## 1, 0); \
    const i16x8 p ## 2 = vec_xxpermdi(p ## a ## 2, p ## b ## 2, 0); \
    const i16x8 p ## 3 = vec_xxpermdi(p ## a ## 3, p ## b ## 3, 0);

// Tap-center differences and their strength/damping-constrained values
// (via vconstrain) for all four taps of a direction.
#define CONSTRAIN(p, strength) \
    const i16x8 p ## _d0 = vec_sub(p ## 0, px); \
    const i16x8 p ## _d1 = vec_sub(p ## 1, px); \
    const i16x8 p ## _d2 = vec_sub(p ## 2, px); \
    const i16x8 p ## _d3 = vec_sub(p ## 3, px); \
\
    i16x8 p ## _c0 = vconstrain(p ## _d0, strength, damping); \
    i16x8 p ## _c1 = vconstrain(p ## _d1, strength, damping); \
    i16x8 p ## _c2 = vconstrain(p ## _d2, strength, damping); \
    i16x8 p ## _c3 = vconstrain(p ## _d3, strength, damping);

// Widen the output clamp range by the raw tap values; max_mask() skips
// lanes that hold the INT16_MAX border sentinel.
#define MIN_MAX(p) \
    max = max_mask(p ## 0, max); \
    min = vec_min(p ## 0, min); \
    max = max_mask(p ## 1, max); \
    min = vec_min(p ## 1, min); \
    max = max_mask(p ## 2, max); \
    min = vec_min(p ## 2, min); \
    max = max_mask(p ## 3, max); \
    min = vec_min(p ## 3, min);

// Primary filter weights for the first tap pair: c*2 + (c << tap_even),
// i.e. weight 4 when tap_even is 1, 3 when it is 0.
#define PRI_0(p) \
    p ## _c0 = vec_add(vec_sl(p ## _c0, vec_splat_u16(1)), vec_sl(p ## _c0, vec_splats(tap_even))); \
    p ## _c1 = vec_add(vec_sl(p ## _c1, vec_splat_u16(1)), vec_sl(p ## _c1, vec_splats(tap_even)));

// Primary filter weights for the second tap pair: c*4 - (c << tap_even),
// i.e. weight 2 when tap_even is 1, 3 when it is 0.
#define PRI_1(p) \
    p ## _c2 = vec_sub(vec_sl(p ## _c2, vec_splat_u16(2)), vec_sl(p ## _c2, vec_splats(tap_even))); \
    p ## _c3 = vec_sub(vec_sl(p ## _c3, vec_splat_u16(2)), vec_sl(p ## _c3, vec_splats(tap_even)));

// Secondary filter first pass: all four taps weighted by 2.
#define SEC_0(p) \
    p ## _c0 = vec_sl(p ## _c0, vec_splat_u16(1)); \
    p ## _c1 = vec_sl(p ## _c1, vec_splat_u16(1)); \
    p ## _c2 = vec_sl(p ## _c2, vec_splat_u16(1)); \
    p ## _c3 = vec_sl(p ## _c3, vec_splat_u16(1));

// Accumulate the four weighted tap contributions into `sum`.
#define UPDATE_SUM(p) \
    const i16x8 p ## sum0 = vec_add(p ## _c0, p ## _c1); \
    const i16x8 p ## sum1 = vec_add(p ## _c2, p ## _c3); \
    sum = vec_add(sum, p ## sum0); \
    sum = vec_add(sum, p ## sum1);
/* Apply the CDEF filter to a 4-pixel-wide block.  Operates on the padded
 * 16-bit temp buffer produced by copy4xN(); two rows are processed per
 * loop iteration by packing them into one 8-lane vector.
 * pri_strength/sec_strength select the primary/secondary filter
 * strengths, dir the primary direction (0-7), damping the constraint
 * shift.  NOTE(review): `w` is only forwarded to copy4xN(). */
static inline void
filter_4xN(pixel *dst, const ptrdiff_t dst_stride,
           const pixel (*left)[2], /*const*/ pixel *const top[2],
           const int w, const int h, const int pri_strength,
           const int sec_strength, const int dir,
           const int damping, const enum CdefEdgeFlags edges,
           const ptrdiff_t tmp_stride, uint16_t *tmp)
{
    // Per-direction tap offsets in temp-buffer units; column [0] is used
    // for the first tap pair, column [1] for the second.
    const int8_t cdef_directions[8 /* dir */][2 /* pass */] = {
        { -1 * tmp_stride + 1, -2 * tmp_stride + 2 },
        {  0 * tmp_stride + 1, -1 * tmp_stride + 2 },
        {  0 * tmp_stride + 1,  0 * tmp_stride + 2 },
        {  0 * tmp_stride + 1,  1 * tmp_stride + 2 },
        {  1 * tmp_stride + 1,  2 * tmp_stride + 2 },
        {  1 * tmp_stride + 0,  2 * tmp_stride + 1 },
        {  1 * tmp_stride + 0,  2 * tmp_stride + 0 },
        {  1 * tmp_stride + 0,  2 * tmp_stride - 1 }
    };
    const int bitdepth_min_8 = bitdepth_from_max(bitdepth_max) - 8;
    // Selects the primary tap weights: (4,2) when the (bitdepth-scaled)
    // strength is even, (3,3) when odd — see PRI_0/PRI_1.
    const uint16_t tap_even = !((pri_strength >> bitdepth_min_8) & 1);
    const int off1 = cdef_directions[dir][0];
    const int off1_1 = cdef_directions[dir][1];
    // Secondary taps use the primary direction rotated by +/-2 steps.
    const int off2 = cdef_directions[(dir + 2) & 7][0];
    const int off3 = cdef_directions[(dir + 6) & 7][0];
    const int off2_1 = cdef_directions[(dir + 2) & 7][1];
    const int off3_1 = cdef_directions[(dir + 6) & 7][1];

    // Widen/pad the source block into tmp (tmp - 2 points at the border).
    copy4xN(tmp - 2, tmp_stride, dst, dst_stride, left, top, w, h, edges);

    for (int y = 0; y < h / 2; y++) {
        LOAD_PIX4(tmp)

        // Primary pass
        LOAD_DIR4(p, tmp, off1, off1_1)

        CONSTRAIN(p, pri_strength)

        MIN_MAX(p)

        PRI_0(p)
        PRI_1(p)

        UPDATE_SUM(p)

        // Secondary pass 1
        LOAD_DIR4(s, tmp, off2, off3)

        CONSTRAIN(s, sec_strength)

        MIN_MAX(s)

        SEC_0(s)

        UPDATE_SUM(s)

        // Secondary pass 2
        LOAD_DIR4(s2, tmp, off2_1, off3_1)

        CONSTRAIN(s2, sec_strength)

        MIN_MAX(s2)

        UPDATE_SUM(s2)

        // Store: round the accumulator as (sum + 8 - (sum < 0)) >> 4,
        // add it to the center pixel, then clamp to the [min, max] range
        // gathered from the taps.
        i16x8 bias = vec_and((i16x8)vec_cmplt(sum, vec_splat_s16(0)), vec_splat_s16(1));
        bias = vec_sub(vec_splat_s16(8), bias);
        i16x8 unclamped = vec_add(px, vec_sra(vec_add(sum, bias), vec_splat_u16(4)));
        i16x8 vdst = vec_max(vec_min(unclamped, max), min);

        // Unpack the two rows held in the vector back to the destination.
        dst[0] = vdst[0];
        dst[1] = vdst[1];
        dst[2] = vdst[2];
        dst[3] = vdst[3];

        tmp += tmp_stride;
        dst += PXSTRIDE(dst_stride);

        dst[0] = vdst[4];
        dst[1] = vdst[5];
        dst[2] = vdst[6];
        dst[3] = vdst[7];

        tmp += tmp_stride;
        dst += PXSTRIDE(dst_stride);
    }
}
/* Apply the CDEF filter to an 8-pixel-wide block.  Same structure as
 * filter_4xN(), but each 8-lane vector covers exactly one row of the
 * padded temp buffer produced by copy8xN().
 * NOTE(review): `w` is only forwarded to copy8xN(). */
static inline void
filter_8xN(pixel *dst, const ptrdiff_t dst_stride,
           const pixel (*left)[2], /*const*/ pixel *const top[2],
           const int w, const int h, const int pri_strength,
           const int sec_strength, const int dir,
           const int damping, const enum CdefEdgeFlags edges,
           const ptrdiff_t tmp_stride, uint16_t *tmp)
{
    // Per-direction tap offsets in temp-buffer units; column [0] is used
    // for the first tap pair, column [1] for the second.
    const int8_t cdef_directions[8 /* dir */][2 /* pass */] = {
        { -1 * tmp_stride + 1, -2 * tmp_stride + 2 },
        {  0 * tmp_stride + 1, -1 * tmp_stride + 2 },
        {  0 * tmp_stride + 1,  0 * tmp_stride + 2 },
        {  0 * tmp_stride + 1,  1 * tmp_stride + 2 },
        {  1 * tmp_stride + 1,  2 * tmp_stride + 2 },
        {  1 * tmp_stride + 0,  2 * tmp_stride + 1 },
        {  1 * tmp_stride + 0,  2 * tmp_stride + 0 },
        {  1 * tmp_stride + 0,  2 * tmp_stride - 1 }
    };

    const int bitdepth_min_8 = bitdepth_from_max(bitdepth_max) - 8;
    // Selects the primary tap weights: (4,2) when the (bitdepth-scaled)
    // strength is even, (3,3) when odd — see PRI_0/PRI_1.
    const uint16_t tap_even = !((pri_strength >> bitdepth_min_8) & 1);
    const int off1 = cdef_directions[dir][0];
    const int off1_1 = cdef_directions[dir][1];
    // Secondary taps use the primary direction rotated by +/-2 steps.
    const int off2 = cdef_directions[(dir + 2) & 7][0];
    const int off3 = cdef_directions[(dir + 6) & 7][0];
    const int off2_1 = cdef_directions[(dir + 2) & 7][1];
    const int off3_1 = cdef_directions[(dir + 6) & 7][1];

    // Widen/pad the source block into tmp (tmp - 2 points at the border).
    copy8xN(tmp - 2, tmp_stride, dst, dst_stride, left, top, w, h, edges);

    for (int y = 0; y < h; y++) {
        LOAD_PIX(tmp)

        // Primary pass
        LOAD_DIR(p, tmp, off1, off1_1)

        CONSTRAIN(p, pri_strength)

        MIN_MAX(p)

        PRI_0(p)
        PRI_1(p)

        UPDATE_SUM(p)

        // Secondary pass 1
        LOAD_DIR(s, tmp, off2, off3)

        CONSTRAIN(s, sec_strength)

        MIN_MAX(s)

        SEC_0(s)

        UPDATE_SUM(s)

        // Secondary pass 2
        LOAD_DIR(s2, tmp, off2_1, off3_1)

        CONSTRAIN(s2, sec_strength)

        MIN_MAX(s2)

        UPDATE_SUM(s2)

        // Store: round the accumulator as (sum + 8 - (sum < 0)) >> 4,
        // add it to the center pixel, then clamp to the [min, max] range
        // gathered from the taps.
        i16x8 bias = vec_and((i16x8)vec_cmplt(sum, vec_splat_s16(0)), vec_splat_s16(1));
        bias = vec_sub(vec_splat_s16(8), bias);
        i16x8 unclamped = vec_add(px, vec_sra(vec_add(sum, bias), vec_splat_u16(4)));
        i16x8 vdst = vec_max(vec_min(unclamped, max), min);

        dst[0] = vdst[0];
        dst[1] = vdst[1];
        dst[2] = vdst[2];
        dst[3] = vdst[3];
        dst[4] = vdst[4];
        dst[5] = vdst[5];
        dst[6] = vdst[6];
        dst[7] = vdst[7];

        tmp += tmp_stride;
        dst += PXSTRIDE(dst_stride);
    }
}
/* Instantiate the public cdef_filter_WxH_vsx() entry point for a given
 * block size.  Allocates a 16-byte-aligned stack temp buffer of
 * 12 * tmp_stride uint16_t and offsets the working pointer by 2 rows and
 * 2 columns so the filter/copy helpers can address negative border
 * offsets, then dispatches to filter_4xN()/filter_8xN(). */
#define cdef_fn(w, h, tmp_stride) \
static void cdef_filter_##w##x##h##_vsx(pixel *const dst, \
                                        const ptrdiff_t dst_stride, \
                                        const pixel (*left)[2], \
                                        /*const*/ pixel *const top[2], \
                                        const int pri_strength, \
                                        const int sec_strength, \
                                        const int dir, \
                                        const int damping, \
                                        const enum CdefEdgeFlags edges) \
{ \
    ALIGN_STK_16(uint16_t, tmp_buf, 12 * tmp_stride,); \
    uint16_t *tmp = tmp_buf + 2 * tmp_stride + 2; \
    filter_##w##xN(dst, dst_stride, left, top, w, h, pri_strength, sec_strength, \
                   dir, damping, edges, tmp_stride, tmp); \
}

cdef_fn(4, 4, 8);
cdef_fn(4, 8, 8);
cdef_fn(8, 8, 16);
#endif
/* Install the PowerPC VSX CDEF entry points into the DSP context.
 * Leaves the context untouched when the CPU lacks VSX support; the
 * SIMD filters are only provided for 8-bit builds. */
COLD void bitfn(dav1d_cdef_dsp_init_ppc)(Dav1dCdefDSPContext *const c) {
    const unsigned cpu_flags = dav1d_get_cpu_flags();

    if (!(cpu_flags & DAV1D_PPC_CPU_FLAG_VSX))
        return;

#if BITDEPTH == 8
    // c->dir = dav1d_cdef_find_dir_vsx;  (direction search not ported yet)
    c->fb[2] = cdef_filter_4x4_vsx;
    c->fb[1] = cdef_filter_4x8_vsx;
    c->fb[0] = cdef_filter_8x8_vsx;
#endif
}

52
third_party/dav1d/src/ppc/types.h vendored Normal file
View File

@ -0,0 +1,52 @@
/*
* Copyright © 2019, VideoLAN and dav1d authors
* Copyright © 2019, Luca Barbato
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
* ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef DAV1D_SRC_PPC_TYPES_H
#define DAV1D_SRC_PPC_TYPES_H

#include <altivec.h>

// <altivec.h> defines a `pixel` keyword which collides with dav1d's own
// per-bitdepth `pixel` type; remove it.
#undef pixel

// Short aliases for the AltiVec/VSX vector types, named <lane type>x<count>.
// The b* variants are the boolean (mask) vector types.
#define u8x16 vector unsigned char
#define i8x16 vector signed char
#define b8x16 vector bool char
#define u16x8 vector unsigned short
#define i16x8 vector signed short
#define b16x8 vector bool short
#define u32x4 vector unsigned int
#define i32x4 vector signed int
#define b32x4 vector bool int
#define u64x2 vector unsigned long long
#define i64x2 vector signed long long
#define b64x2 vector bool long long

// Zero-extend the high (mergeh) or low (mergel) 8 bytes of a u8x16 into
// eight 16-bit lanes, and likewise the high/low 4 u16 lanes into four
// 32-bit lanes.
#define u8h_to_u16(v) ((u16x8) vec_mergeh((u8x16) v, vec_splat_u8(0)))
#define u8l_to_u16(v) ((u16x8) vec_mergel((u8x16) v, vec_splat_u8(0)))
#define u16h_to_i32(v) ((i32x4) vec_mergeh((u16x8) v, vec_splat_u16(0)))
#define u16l_to_i32(v) ((i32x4) vec_mergel((u16x8) v, vec_splat_u16(0)))

#endif /* DAV1D_SRC_PPC_TYPES_H */

View File

@ -46,16 +46,278 @@
#include "src/tables.h"
#include "src/wedge.h"
static unsigned read_golomb(MsacContext *const msac) {
static inline unsigned read_golomb(MsacContext *const msac) {
int len = 0;
unsigned val = 1;
while (!dav1d_msac_decode_bool_equi(msac) && len < 32) len++;
while (len--) val = (val << 1) | dav1d_msac_decode_bool_equi(msac);
while (len--) val = (val << 1) + dav1d_msac_decode_bool_equi(msac);
return val - 1;
}
/* Derive the CDF context index for the "all coefficients skipped" flag of
 * one transform block, from the per-4x4 coefficient contexts stored in
 * the above (`a`) and left (`l`) context arrays.  `t_dim` describes the
 * transform size, `bs`/`layout` the block size and chroma subsampling.
 * The MERGE_CTX macros gather one context byte per covered 4x4 unit in a
 * single (up to 64-bit) load and fold them down; the 0x3F masks strip the
 * upper two bits of each byte (used elsewhere for the DC sign). */
static inline unsigned get_skip_ctx(const TxfmInfo *const t_dim,
                                    const enum BlockSize bs,
                                    const uint8_t *const a,
                                    const uint8_t *const l,
                                    const int chroma,
                                    const enum Dav1dPixelLayout layout)
{
    const uint8_t *const b_dim = dav1d_block_dimensions[bs];

    if (chroma) {
        const int ss_ver = layout == DAV1D_PIXEL_LAYOUT_I420;
        const int ss_hor = layout != DAV1D_PIXEL_LAYOUT_I444;
        // Whether the transform is smaller than the (subsampled) block.
        const int not_one_blk = b_dim[2] - (!!b_dim[2] && ss_hor) > t_dim->lw ||
                                b_dim[3] - (!!b_dim[3] && ss_ver) > t_dim->lh;
        int ca, cl;

        // ca/cl: 1 if any covered above/left 4x4 unit has a nonzero
        // coefficient context.
#define MERGE_CTX(dir, type, mask) \
        c##dir = !!((*(const type *) dir) & mask); \
        break

        switch (t_dim->lw) {
        /* For some reason the MSVC CRT _wassert() function is not flagged as
         * __declspec(noreturn), so when using those headers the compiler will
         * expect execution to continue after an assertion has been triggered
         * and will therefore complain about the use of uninitialized variables
         * when compiled in debug mode if we put the default case at the end. */
        default: assert(0); /* fall-through */
        case TX_4X4:   MERGE_CTX(a, uint8_t,  0x3F);
        case TX_8X8:   MERGE_CTX(a, uint16_t, 0x3F3F);
        case TX_16X16: MERGE_CTX(a, uint32_t, 0x3F3F3F3FU);
        case TX_32X32: MERGE_CTX(a, uint64_t, 0x3F3F3F3F3F3F3F3FULL);
        }
        switch (t_dim->lh) {
        default: assert(0); /* fall-through */
        case TX_4X4:   MERGE_CTX(l, uint8_t,  0x3F);
        case TX_8X8:   MERGE_CTX(l, uint16_t, 0x3F3F);
        case TX_16X16: MERGE_CTX(l, uint32_t, 0x3F3F3F3FU);
        case TX_32X32: MERGE_CTX(l, uint64_t, 0x3F3F3F3F3F3F3F3FULL);
        }
#undef MERGE_CTX

        return 7 + not_one_blk * 3 + ca + cl;
    } else if (b_dim[2] == t_dim->lw && b_dim[3] == t_dim->lh) {
        // Luma transform covering the whole block: fixed context 0.
        return 0;
    } else {
        unsigned la, ll;

        // la/ll: OR of the covered above/left context bytes, folded into
        // the low byte.
#define MERGE_CTX(dir, type, tx) \
        if (tx == TX_64X64) { \
            uint64_t tmp = *(const uint64_t *) dir; \
            tmp |= *(const uint64_t *) &dir[8]; \
            l##dir = (unsigned) (tmp >> 32) | (unsigned) tmp; \
        } else \
            l##dir = *(const type *) dir; \
        if (tx == TX_32X32) l##dir |= *(const type *) &dir[sizeof(type)]; \
        if (tx >= TX_16X16) l##dir |= l##dir >> 16; \
        if (tx >= TX_8X8)   l##dir |= l##dir >> 8; \
        break

        switch (t_dim->lw) {
        default: assert(0); /* fall-through */
        case TX_4X4:   MERGE_CTX(a, uint8_t,  TX_4X4);
        case TX_8X8:   MERGE_CTX(a, uint16_t, TX_8X8);
        case TX_16X16: MERGE_CTX(a, uint32_t, TX_16X16);
        case TX_32X32: MERGE_CTX(a, uint32_t, TX_32X32);
        case TX_64X64: MERGE_CTX(a, uint32_t, TX_64X64);
        }
        switch (t_dim->lh) {
        default: assert(0); /* fall-through */
        case TX_4X4:   MERGE_CTX(l, uint8_t,  TX_4X4);
        case TX_8X8:   MERGE_CTX(l, uint16_t, TX_8X8);
        case TX_16X16: MERGE_CTX(l, uint32_t, TX_16X16);
        case TX_32X32: MERGE_CTX(l, uint32_t, TX_32X32);
        case TX_64X64: MERGE_CTX(l, uint32_t, TX_64X64);
        }
#undef MERGE_CTX

        return dav1d_skip_ctx[umin(la & 0x3F, 4)][umin(ll & 0x3F, 4)];
    }
}
/* Derive the CDF context for the DC coefficient's sign bit.  The sign
 * information of neighboring blocks is packed into the top two bits
 * (mask 0xC0) of each above/left context byte.  For every transform
 * shape the covered bytes are summed with a masked multiply trick, the
 * expected neutral count is subtracted (the literal "- w - h" terms),
 * and the signed balance s is mapped to 0 (s < 0), 1 (s == 0) or
 * 2 (s > 0). */
static inline unsigned get_dc_sign_ctx(const int /*enum RectTxfmSize*/ tx,
                                       const uint8_t *const a,
                                       const uint8_t *const l)
{
    uint64_t mask = 0xC0C0C0C0C0C0C0C0ULL, mul = 0x0101010101010101ULL;
    int s;

#if ARCH_X86_64 && defined(__GNUC__)
    /* Coerce compilers into producing better code. For some reason
     * every x86-64 compiler is awful at handling 64-bit constants. */
    __asm__("" : "+r"(mask), "+r"(mul));
#endif

    switch(tx) {
    default: assert(0); /* fall-through */
    case TX_4X4: {
        int t = *(const uint8_t *) a >> 6;
        t    += *(const uint8_t *) l >> 6;
        s = t - 1 - 1;
        break;
    }
    case TX_8X8: {
        uint32_t t = *(const uint16_t *) a & (uint32_t) mask;
        t         += *(const uint16_t *) l & (uint32_t) mask;
        // Multiplying by 0x04040404 sums the per-byte fields into the
        // top byte (the 0xC0-masked values are pre-shifted by 6 here).
        t *= 0x04040404U;
        s = (int) (t >> 24) - 2 - 2;
        break;
    }
    case TX_16X16: {
        uint32_t t = (*(const uint32_t *) a & (uint32_t) mask) >> 6;
        t         += (*(const uint32_t *) l & (uint32_t) mask) >> 6;
        t *= (uint32_t) mul;
        s = (int) (t >> 24) - 4 - 4;
        break;
    }
    case TX_32X32: {
        uint64_t t = (*(const uint64_t *) a & mask) >> 6;
        t         += (*(const uint64_t *) l & mask) >> 6;
        t *= mul;
        s = (int) (t >> 56) - 8 - 8;
        break;
    }
    case TX_64X64: {
        // 64-wide edges span two 8-byte context loads per side.
        uint64_t t = (*(const uint64_t *) &a[0] & mask) >> 6;
        t         += (*(const uint64_t *) &a[8] & mask) >> 6;
        t         += (*(const uint64_t *) &l[0] & mask) >> 6;
        t         += (*(const uint64_t *) &l[8] & mask) >> 6;
        t *= mul;
        s = (int) (t >> 56) - 16 - 16;
        break;
    }
    case RTX_4X8: {
        uint32_t t = *(const uint8_t *) a & (uint32_t) mask;
        t         += *(const uint16_t *) l & (uint32_t) mask;
        t *= 0x04040404U;
        s = (int) (t >> 24) - 1 - 2;
        break;
    }
    case RTX_8X4: {
        uint32_t t = *(const uint16_t *) a & (uint32_t) mask;
        t         += *(const uint8_t *) l & (uint32_t) mask;
        t *= 0x04040404U;
        s = (int) (t >> 24) - 2 - 1;
        break;
    }
    case RTX_8X16: {
        uint32_t t = *(const uint16_t *) a & (uint32_t) mask;
        t         += *(const uint32_t *) l & (uint32_t) mask;
        t = (t >> 6) * (uint32_t) mul;
        s = (int) (t >> 24) - 2 - 4;
        break;
    }
    case RTX_16X8: {
        uint32_t t = *(const uint32_t *) a & (uint32_t) mask;
        t         += *(const uint16_t *) l & (uint32_t) mask;
        t = (t >> 6) * (uint32_t) mul;
        s = (int) (t >> 24) - 4 - 2;
        break;
    }
    case RTX_16X32: {
        uint64_t t = *(const uint32_t *) a & (uint32_t) mask;
        t         += *(const uint64_t *) l & mask;
        t = (t >> 6) * mul;
        s = (int) (t >> 56) - 4 - 8;
        break;
    }
    case RTX_32X16: {
        uint64_t t = *(const uint64_t *) a & mask;
        t         += *(const uint32_t *) l & (uint32_t) mask;
        t = (t >> 6) * mul;
        s = (int) (t >> 56) - 8 - 4;
        break;
    }
    case RTX_32X64: {
        uint64_t t = (*(const uint64_t *) &a[0] & mask) >> 6;
        t         += (*(const uint64_t *) &l[0] & mask) >> 6;
        t         += (*(const uint64_t *) &l[8] & mask) >> 6;
        t *= mul;
        s = (int) (t >> 56) - 8 - 16;
        break;
    }
    case RTX_64X32: {
        uint64_t t = (*(const uint64_t *) &a[0] & mask) >> 6;
        t         += (*(const uint64_t *) &a[8] & mask) >> 6;
        t         += (*(const uint64_t *) &l[0] & mask) >> 6;
        t *= mul;
        s = (int) (t >> 56) - 16 - 8;
        break;
    }
    case RTX_4X16: {
        uint32_t t = *(const uint8_t *) a & (uint32_t) mask;
        t         += *(const uint32_t *) l & (uint32_t) mask;
        t = (t >> 6) * (uint32_t) mul;
        s = (int) (t >> 24) - 1 - 4;
        break;
    }
    case RTX_16X4: {
        uint32_t t = *(const uint32_t *) a & (uint32_t) mask;
        t         += *(const uint8_t *) l & (uint32_t) mask;
        t = (t >> 6) * (uint32_t) mul;
        s = (int) (t >> 24) - 4 - 1;
        break;
    }
    case RTX_8X32: {
        uint64_t t = *(const uint16_t *) a & (uint32_t) mask;
        t         += *(const uint64_t *) l & mask;
        t = (t >> 6) * mul;
        s = (int) (t >> 56) - 2 - 8;
        break;
    }
    case RTX_32X8: {
        uint64_t t = *(const uint64_t *) a & mask;
        t         += *(const uint16_t *) l & (uint32_t) mask;
        t = (t >> 6) * mul;
        s = (int) (t >> 56) - 8 - 2;
        break;
    }
    case RTX_16X64: {
        uint64_t t = *(const uint32_t *) a & (uint32_t) mask;
        t         += *(const uint64_t *) &l[0] & mask;
        t = (t >> 6) + ((*(const uint64_t *) &l[8] & mask) >> 6);
        t *= mul;
        s = (int) (t >> 56) - 4 - 16;
        break;
    }
    case RTX_64X16: {
        uint64_t t = *(const uint64_t *) &a[0] & mask;
        t         += *(const uint32_t *) l & (uint32_t) mask;
        t = (t >> 6) + ((*(const uint64_t *) &a[8] & mask) >> 6);
        t *= mul;
        s = (int) (t >> 56) - 16 - 4;
        break;
    }
    }

    // 0 when the balance is negative, 1 when zero, 2 when positive.
    return (s != 0) + (s > 0);
}
/* Context for a low-range coefficient token, derived from the magnitudes
 * of already-decoded neighbor levels.  Also writes the partial
 * (high-token) magnitude sum to *hi_mag.  `levels` points at the current
 * position in the level buffer; ctx_offsets supplies the 2D offset table
 * (unused for the 1D classes). */
static inline unsigned get_lo_ctx(const uint8_t *const levels,
                                  const enum TxClass tx_class,
                                  unsigned *const hi_mag,
                                  const uint8_t (*const ctx_offsets)[5],
                                  const unsigned x, const unsigned y,
                                  const ptrdiff_t stride)
{
    // Right and below neighbors contribute for every class.
    unsigned sum = levels[stride] + levels[1];
    unsigned base;

    if (tx_class != TX_CLASS_2D) {
        // 1D classes scan along one axis only (buffer is transposed for
        // TX_CLASS_H, so the scan axis is always the row here).
        sum += levels[2];
        *hi_mag = sum;
        sum += levels[3] + levels[4];
        base = 26 + (y > 1 ? 10 : y * 5);
    } else {
        sum += levels[stride + 1];          // diagonal neighbor
        *hi_mag = sum;
        sum += levels[2] + levels[2 * stride];
        base = ctx_offsets[umin(y, 4)][umin(x, 4)];
    }

    return base + (sum > 512 ? 4 : (sum + 64) >> 7);
}
static int decode_coefs(Dav1dTileContext *const t,
uint8_t *const a, uint8_t *const l,
const enum RectTxfmSize tx, const enum BlockSize bs,
@ -66,6 +328,7 @@ static int decode_coefs(Dav1dTileContext *const t,
Dav1dTileState *const ts = t->ts;
const int chroma = !!plane;
const Dav1dFrameContext *const f = t->f;
const int lossless = f->frame_hdr->segmentation.lossless[b->seg_id];
const TxfmInfo *const t_dim = &dav1d_txfm_dimensions[tx];
const int dbg = DEBUG_BLOCK_INFO && plane && 0;
@ -73,7 +336,7 @@ static int decode_coefs(Dav1dTileContext *const t,
printf("Start: r=%d\n", ts->msac.rng);
// does this block have any non-zero coefficients
const int sctx = get_coef_skip_ctx(t_dim, bs, a, l, chroma, f->cur.p.layout);
const int sctx = get_skip_ctx(t_dim, bs, a, l, chroma, f->cur.p.layout);
const int all_skip = dav1d_msac_decode_bool_adapt(&ts->msac,
ts->cdf.coef.skip[t_dim->ctx][sctx]);
if (dbg)
@ -81,41 +344,56 @@ static int decode_coefs(Dav1dTileContext *const t,
t_dim->ctx, sctx, all_skip, ts->msac.rng);
if (all_skip) {
*res_ctx = 0x40;
*txtp = f->frame_hdr->segmentation.lossless[b->seg_id] ? WHT_WHT : DCT_DCT;
*txtp = lossless * WHT_WHT; /* lossless ? WHT_WHT : DCT_DCT */
return -1;
}
// transform type (chroma: derived, luma: explicitly coded)
if (chroma) {
if (intra) {
*txtp = get_uv_intra_txtp(b->uv_mode, tx, f->frame_hdr, b->seg_id);
if (lossless) {
assert(t_dim->max == TX_4X4);
*txtp = WHT_WHT;
} else if (!f->frame_hdr->segmentation.qidx[b->seg_id] ||
t_dim->max + intra >= TX_64X64)
{
*txtp = DCT_DCT;
} else if (chroma) {
*txtp = intra ? dav1d_txtp_from_uvmode[b->uv_mode] :
get_uv_inter_txtp(t_dim, *txtp);
} else {
const enum TxfmType y_txtp = *txtp;
*txtp = get_uv_inter_txtp(t_dim, y_txtp, f->frame_hdr, b->seg_id);
}
} else {
const enum TxfmTypeSet set = get_ext_txtp_set(tx, !intra,
f->frame_hdr, b->seg_id);
const unsigned set_cnt = dav1d_tx_type_count[set];
unsigned idx;
if (set_cnt == 1) {
idx = 0;
if (intra) {
const enum IntraPredMode y_mode_nofilt = b->y_mode == FILTER_PRED ?
dav1d_filter_mode_to_y_mode[b->y_angle] : b->y_mode;
if (f->frame_hdr->reduced_txtp_set || t_dim->min == TX_16X16) {
idx = dav1d_msac_decode_symbol_adapt4(&ts->msac,
ts->cdf.m.txtp_intra2[t_dim->min][y_mode_nofilt], 4);
*txtp = dav1d_tx_types_per_set[idx + 0];
} else {
const int set_idx = dav1d_tx_type_set_index[!intra][set];
const enum IntraPredMode y_mode_nofilt = intra ? b->y_mode == FILTER_PRED ?
dav1d_filter_mode_to_y_mode[b->y_angle] : b->y_mode : 0;
uint16_t *const txtp_cdf = intra ?
ts->cdf.m.txtp_intra[set_idx][t_dim->min][y_mode_nofilt] :
ts->cdf.m.txtp_inter[set_idx][t_dim->min];
idx = (set_cnt <= 8 ? dav1d_msac_decode_symbol_adapt8 :
dav1d_msac_decode_symbol_adapt16)(&ts->msac, txtp_cdf, set_cnt);
if (dbg)
printf("Post-txtp[%d->%d][%d->%d][%d][%d->%d]: r=%d\n",
set, set_idx, tx, t_dim->min, intra ? (int)y_mode_nofilt : -1,
idx, dav1d_tx_types_per_set[set][idx], ts->msac.rng);
idx = dav1d_msac_decode_symbol_adapt8(&ts->msac,
ts->cdf.m.txtp_intra1[t_dim->min][y_mode_nofilt], 6);
*txtp = dav1d_tx_types_per_set[idx + 5];
}
if (dbg)
printf("Post-txtp-intra[%d->%d][%d][%d->%d]: r=%d\n",
tx, t_dim->min, y_mode_nofilt, idx, *txtp, ts->msac.rng);
} else {
if (f->frame_hdr->reduced_txtp_set || t_dim->max == TX_32X32) {
idx = dav1d_msac_decode_bool_adapt(&ts->msac,
ts->cdf.m.txtp_inter3[t_dim->min]);
*txtp = (idx - 1) & IDTX; /* idx ? DCT_DCT : IDTX */
} else if (t_dim->min == TX_16X16) {
idx = dav1d_msac_decode_symbol_adapt16(&ts->msac,
ts->cdf.m.txtp_inter2, 11);
*txtp = dav1d_tx_types_per_set[idx + 12];
} else {
idx = dav1d_msac_decode_symbol_adapt16(&ts->msac,
ts->cdf.m.txtp_inter1[t_dim->min], 15);
*txtp = dav1d_tx_types_per_set[idx + 24];
}
if (dbg)
printf("Post-txtp-inter[%d->%d][%d->%d]: r=%d\n",
tx, t_dim->min, idx, *txtp, ts->msac.rng);
}
*txtp = dav1d_tx_types_per_set[set][idx];
}
// find end-of-block (eob)
@ -124,19 +402,19 @@ static int decode_coefs(Dav1dTileContext *const t,
const enum TxClass tx_class = dav1d_tx_type_class[*txtp];
const int is_1d = tx_class != TX_CLASS_2D;
switch (tx2dszctx) {
#define case_sz(sz, bin, ns) \
#define case_sz(sz, bin, ns, is_1d) \
case sz: { \
uint16_t *const eob_bin_cdf = ts->cdf.coef.eob_bin_##bin[chroma][is_1d]; \
eob_bin = dav1d_msac_decode_symbol_adapt##ns(&ts->msac, eob_bin_cdf, 5 + sz); \
uint16_t *const eob_bin_cdf = ts->cdf.coef.eob_bin_##bin[chroma]is_1d; \
eob_bin = dav1d_msac_decode_symbol_adapt##ns(&ts->msac, eob_bin_cdf, 4 + sz); \
break; \
}
case_sz(0, 16, 4);
case_sz(1, 32, 8);
case_sz(2, 64, 8);
case_sz(3, 128, 8);
case_sz(4, 256, 16);
case_sz(5, 512, 16);
case_sz(6, 1024, 16);
case_sz(0, 16, 4, [is_1d]);
case_sz(1, 32, 8, [is_1d]);
case_sz(2, 64, 8, [is_1d]);
case_sz(3, 128, 8, [is_1d]);
case_sz(4, 256, 16, [is_1d]);
case_sz(5, 512, 16, );
case_sz(6, 1024, 16, );
#undef case_sz
}
if (dbg)
@ -159,122 +437,134 @@ static int decode_coefs(Dav1dTileContext *const t,
}
// base tokens
uint16_t (*const br_cdf)[5] =
ts->cdf.coef.br_tok[imin(t_dim->ctx, 3)][chroma];
const int16_t *const scan = dav1d_scans[tx][tx_class];
uint16_t (*const eob_cdf)[4] = ts->cdf.coef.eob_base_tok[t_dim->ctx][chroma];
uint16_t (*const hi_cdf)[4] = ts->cdf.coef.br_tok[imin(t_dim->ctx, 3)][chroma];
const uint16_t *const scan = dav1d_scans[tx][tx_class];
int dc_tok;
if (eob) {
uint8_t *const levels = t->scratch.levels;
uint16_t (*const lo_cdf)[4] = ts->cdf.coef.base_tok[t_dim->ctx][chroma];
uint8_t *const levels = t->scratch.levels; // bits 0-5: tok, 6-7: lo_tok
const int sw = imin(t_dim->w, 8), sh = imin(t_dim->h, 8);
const ptrdiff_t stride = 4 * (sh + 1);
memset(levels, 0, stride * 4 * (sw + 1));
const int shift = 2 + imin(t_dim->lh, 3), mask = 4 * sh - 1;
const unsigned shift = 2 + imin(t_dim->lh, 3), mask = 4 * sh - 1;
{ // eob
const int rc = scan[eob], x = rc >> shift, y = rc & mask;
const int ctx = 1 + (eob > sw * sh * 2) + (eob > sw * sh * 4);
uint16_t *const lo_cdf = ts->cdf.coef.eob_base_tok[t_dim->ctx][chroma][ctx];
int tok = dav1d_msac_decode_symbol_adapt4(&ts->msac, lo_cdf, 3) + 1;
/* eob */
unsigned rc = scan[eob], x = rc >> shift, y = rc & mask;
unsigned ctx = 1 + (eob > sw * sh * 2) + (eob > sw * sh * 4);
int eob_tok = dav1d_msac_decode_symbol_adapt4(&ts->msac, eob_cdf[ctx], 2);
int tok = eob_tok + 1;
int level_tok = tok * 0x41;
unsigned mag;
if (dbg)
printf("Post-lo_tok[%d][%d][%d][%d=%d=%d]: r=%d\n",
t_dim->ctx, chroma, ctx, eob, rc, tok, ts->msac.rng);
if (tok == 3) {
const int br_ctx = get_br_ctx(levels, 1, tx_class, x, y, stride);
do {
const int tok_br = dav1d_msac_decode_symbol_adapt4(&ts->msac,
br_cdf[br_ctx], 4);
if (dbg)
printf("Post-hi_tok[%d][%d][%d][%d=%d=%d->%d]: r=%d\n",
imin(t_dim->ctx, 3), chroma, br_ctx,
eob, rc, tok_br, tok, ts->msac.rng);
tok += tok_br;
if (tok_br < 3) break;
} while (tok < 15);
}
#define DECODE_COEFS_CLASS(tx_class) \
if (eob_tok == 2) { \
ctx = (tx_class == TX_CLASS_2D ? (x | y) > 1 : \
tx_class == TX_CLASS_H ? x != 0 : y != 0) ? 14 : 7; \
tok = dav1d_msac_decode_hi_tok(&ts->msac, hi_cdf[ctx]); \
level_tok = tok + (3 << 6); \
if (dbg) \
printf("Post-hi_tok[%d][%d][%d][%d=%d=%d]: r=%d\n", \
imin(t_dim->ctx, 3), chroma, ctx, eob, rc, tok, \
ts->msac.rng); \
} \
cf[rc] = tok; \
if (tx_class == TX_CLASS_H) \
/* Transposing reduces the stride and padding requirements */ \
levels[y * stride + x] = (uint8_t) level_tok; \
else \
levels[x * stride + y] = (uint8_t) level_tok; \
for (int i = eob - 1; i > 0; i--) { /* ac */ \
if (tx_class == TX_CLASS_H) \
rc = i, x = rc & mask, y = rc >> shift; \
else \
rc = scan[i], x = rc >> shift, y = rc & mask; \
assert(x < 32 && y < 32); \
uint8_t *const level = levels + x * stride + y; \
ctx = get_lo_ctx(level, tx_class, &mag, lo_ctx_offsets, x, y, stride); \
if (tx_class == TX_CLASS_2D) \
y |= x; \
tok = dav1d_msac_decode_symbol_adapt4(&ts->msac, lo_cdf[ctx], 3); \
level_tok = tok * 0x41; \
if (dbg) \
printf("Post-lo_tok[%d][%d][%d][%d=%d=%d]: r=%d\n", \
t_dim->ctx, chroma, ctx, i, rc, tok, ts->msac.rng); \
if (tok == 3) { \
mag &= 63; \
ctx = (y > (tx_class == TX_CLASS_2D) ? 14 : 7) + \
(mag > 12 ? 6 : (mag + 1) >> 1); \
tok = dav1d_msac_decode_hi_tok(&ts->msac, hi_cdf[ctx]); \
level_tok = tok + (3 << 6); \
if (dbg) \
printf("Post-hi_tok[%d][%d][%d][%d=%d=%d]: r=%d\n", \
imin(t_dim->ctx, 3), chroma, ctx, i, rc, tok, \
ts->msac.rng); \
} \
cf[rc] = tok; \
*level = (uint8_t) level_tok; \
} \
/* dc */ \
ctx = (tx_class == TX_CLASS_2D) ? 0 : \
get_lo_ctx(levels, tx_class, &mag, lo_ctx_offsets, 0, 0, stride); \
dc_tok = dav1d_msac_decode_symbol_adapt4(&ts->msac, lo_cdf[ctx], 3); \
if (dbg) \
printf("Post-dc_lo_tok[%d][%d][%d][%d]: r=%d\n", \
t_dim->ctx, chroma, ctx, dc_tok, ts->msac.rng); \
if (dc_tok == 3) { \
if (tx_class == TX_CLASS_2D) \
mag = levels[0 * stride + 1] + levels[1 * stride + 0] + \
levels[1 * stride + 1]; \
mag &= 63; \
ctx = mag > 12 ? 6 : (mag + 1) >> 1; \
dc_tok = dav1d_msac_decode_hi_tok(&ts->msac, hi_cdf[ctx]); \
if (dbg) \
printf("Post-dc_hi_tok[%d][%d][0][%d]: r=%d\n", \
imin(t_dim->ctx, 3), chroma, dc_tok, ts->msac.rng); \
} \
break
cf[rc] = tok;
levels[x * stride + y] = (uint8_t) tok;
switch (tx_class) {
case TX_CLASS_2D: {
const unsigned nonsquare_tx = tx >= RTX_4X8;
const uint8_t (*const lo_ctx_offsets)[5] =
dav1d_lo_ctx_offsets[nonsquare_tx + (tx & nonsquare_tx)];
const ptrdiff_t stride = 4 * sh;
memset(levels, 0, stride * (4 * sw + 2));
DECODE_COEFS_CLASS(TX_CLASS_2D);
}
for (int i = eob - 1; i > 0; i--) { // ac
const int rc = scan[i], x = rc >> shift, y = rc & mask;
// lo tok
const int ctx = get_coef_nz_ctx(levels, tx, tx_class, x, y, stride);
uint16_t *const lo_cdf = ts->cdf.coef.base_tok[t_dim->ctx][chroma][ctx];
int tok = dav1d_msac_decode_symbol_adapt4(&ts->msac, lo_cdf, 4);
if (dbg)
printf("Post-lo_tok[%d][%d][%d][%d=%d=%d]: r=%d\n",
t_dim->ctx, chroma, ctx, i, rc, tok, ts->msac.rng);
// hi tok
if (tok == 3) {
const int br_ctx = get_br_ctx(levels, 1, tx_class, x, y, stride);
do {
const int tok_br = dav1d_msac_decode_symbol_adapt4(&ts->msac,
br_cdf[br_ctx], 4);
if (dbg)
printf("Post-hi_tok[%d][%d][%d][%d=%d=%d->%d]: r=%d\n",
imin(t_dim->ctx, 3), chroma, br_ctx,
i, rc, tok_br, tok, ts->msac.rng);
tok += tok_br;
if (tok_br < 3) break;
} while (tok < 15);
case TX_CLASS_H: {
#define lo_ctx_offsets NULL
const ptrdiff_t stride = 16;
memset(levels, 0, stride * (4 * sh + 2));
DECODE_COEFS_CLASS(TX_CLASS_H);
}
cf[rc] = tok;
levels[x * stride + y] = (uint8_t) tok;
}
{ // dc
int ctx = 0;
if (tx_class != TX_CLASS_2D)
ctx = get_coef_nz_ctx(levels, tx, tx_class, 0, 0, stride);
uint16_t *const lo_cdf = ts->cdf.coef.base_tok[t_dim->ctx][chroma][ctx];
dc_tok = dav1d_msac_decode_symbol_adapt4(&ts->msac, lo_cdf, 4);
if (dbg)
printf("Post-dc_lo_tok[%d][%d][%d][%d]: r=%d\n",
t_dim->ctx, chroma, ctx, dc_tok, ts->msac.rng);
if (dc_tok == 3) {
const int br_ctx = get_br_ctx(levels, 0, tx_class, 0, 0, stride);
do {
const int tok_br = dav1d_msac_decode_symbol_adapt4(&ts->msac,
br_cdf[br_ctx], 4);
if (dbg)
printf("Post-dc_hi_tok[%d][%d][%d][%d->%d]: r=%d\n",
imin(t_dim->ctx, 3), chroma, br_ctx,
tok_br, dc_tok, ts->msac.rng);
dc_tok += tok_br;
if (tok_br < 3) break;
} while (dc_tok < 15);
case TX_CLASS_V: {
const ptrdiff_t stride = 16;
memset(levels, 0, stride * (4 * sw + 2));
DECODE_COEFS_CLASS(TX_CLASS_V);
}
#undef lo_ctx_offsets
#undef DECODE_COEFS_CLASS
default: assert(0);
}
} else { // dc-only
uint16_t *const lo_cdf = ts->cdf.coef.eob_base_tok[t_dim->ctx][chroma][0];
dc_tok = dav1d_msac_decode_symbol_adapt4(&ts->msac, lo_cdf, 3) + 1;
int tok_br = dav1d_msac_decode_symbol_adapt4(&ts->msac, eob_cdf[0], 2);
dc_tok = 1 + tok_br;
if (dbg)
printf("Post-dc_lo_tok[%d][%d][%d][%d]: r=%d\n",
t_dim->ctx, chroma, 0, dc_tok, ts->msac.rng);
if (dc_tok == 3) {
do {
const int tok_br = dav1d_msac_decode_symbol_adapt4(&ts->msac,
br_cdf[0], 4);
if (tok_br == 2) {
dc_tok = dav1d_msac_decode_hi_tok(&ts->msac, hi_cdf[0]);
if (dbg)
printf("Post-dc_hi_tok[%d][%d][%d][%d->%d]: r=%d\n",
imin(t_dim->ctx, 3), chroma, 0,
tok_br, dc_tok, ts->msac.rng);
dc_tok += tok_br;
if (tok_br < 3) break;
} while (dc_tok < 15);
printf("Post-dc_hi_tok[%d][%d][0][%d]: r=%d\n",
imin(t_dim->ctx, 3), chroma, dc_tok, ts->msac.rng);
}
}
// residual and sign
int dc_sign = 1 << 6;
const int lossless = f->frame_hdr->segmentation.lossless[b->seg_id];
const uint16_t *const dq_tbl = ts->dq[b->seg_id][plane];
const uint8_t *const qm_tbl = f->qm[lossless || is_1d || *txtp == IDTX][tx][plane];
const int dq_shift = imax(0, t_dim->ctx - 2);
@ -283,7 +573,7 @@ static int decode_coefs(Dav1dTileContext *const t,
unsigned cul_level = 0;
if (dc_tok) { // dc
const int dc_sign_ctx = get_dc_sign_ctx(t_dim, a, l);
const int dc_sign_ctx = get_dc_sign_ctx(tx, a, l);
uint16_t *const dc_sign_cdf =
ts->cdf.coef.dc_sign[chroma][dc_sign_ctx];
const int sign = dav1d_msac_decode_bool_adapt(&ts->msac, dc_sign_cdf);
@ -335,7 +625,7 @@ static int decode_coefs(Dav1dTileContext *const t,
}
// context
*res_ctx = imin(cul_level, 63) | dc_sign;
*res_ctx = umin(cul_level, 63) | dc_sign;
return eob;
}
@ -782,14 +1072,16 @@ static int warp_affine(Dav1dTileContext *const t,
// luma pixel units
const int src_x = t->bx * 4 + ((x + 4) << ss_hor);
const int src_y = t->by * 4 + ((y + 4) << ss_ver);
const int mvx = (mat[2] * src_x + mat[3] * src_y + mat[0]) >> ss_hor;
const int mvy = (mat[4] * src_x + mat[5] * src_y + mat[1]) >> ss_ver;
const int64_t mvx = ((int64_t) mat[2] * src_x +
(int64_t) mat[3] * src_y + mat[0]) >> ss_hor;
const int64_t mvy = ((int64_t) mat[4] * src_x +
(int64_t) mat[5] * src_y + mat[1]) >> ss_ver;
const int dx = (mvx >> 16) - 4;
const int mx = ((mvx & 0xffff) - wmp->alpha * 4 -
const int dx = (int) (mvx >> 16) - 4;
const int mx = (((int) mvx & 0xffff) - wmp->alpha * 4 -
wmp->beta * 7) & ~0x3f;
const int dy = (mvy >> 16) - 4;
const int my = ((mvy & 0xffff) - wmp->gamma * 4 -
const int dy = (int) (mvy >> 16) - 4;
const int my = (((int) mvy & 0xffff) - wmp->gamma * 4 -
wmp->delta * 4) & ~0x3f;
const pixel *ref_ptr;

View File

@ -47,7 +47,6 @@
#include "config.h"
#include <assert.h>
#include <errno.h>
#include <limits.h>
#include <stddef.h>

View File

@ -30,25 +30,19 @@
#include "common/attributes.h"
#include "src/scan.h"
static const int16_t ALIGN(av1_default_scan_4x4[], 32) = {
static const uint16_t ALIGN(av1_default_scan_4x4[], 32) = {
0, 4, 1, 2,
5, 8, 12, 9,
6, 3, 7, 10,
13, 14, 11, 15,
};
static const int16_t ALIGN(av1_mrow_scan_4x4[], 32) = {
static const uint16_t ALIGN(av1_mrow_scan_4x4[], 32) = {
0, 4, 8, 12,
1, 5, 9, 13,
2, 6, 10, 14,
3, 7, 11, 15,
};
static const int16_t ALIGN(av1_mcol_scan_4x4[], 32) = {
0, 1, 2, 3,
4, 5, 6, 7,
8, 9, 10, 11,
12, 13, 14, 15,
};
static const int16_t ALIGN(av1_default_scan_4x8[], 32) = {
static const uint16_t ALIGN(av1_default_scan_4x8[], 32) = {
0, 8, 1, 16,
9, 2, 24, 17,
10, 3, 25, 18,
@ -58,7 +52,7 @@ static const int16_t ALIGN(av1_default_scan_4x8[], 32) = {
14, 7, 29, 22,
15, 30, 23, 31,
};
static const int16_t ALIGN(av1_mrow_scan_4x8[], 32) = {
static const uint16_t ALIGN(av1_mrow_scan_4x8[], 32) = {
0, 8, 16, 24,
1, 9, 17, 25,
2, 10, 18, 26,
@ -68,17 +62,7 @@ static const int16_t ALIGN(av1_mrow_scan_4x8[], 32) = {
6, 14, 22, 30,
7, 15, 23, 31,
};
static const int16_t ALIGN(av1_mcol_scan_4x8[], 32) = {
0, 1, 2, 3,
4, 5, 6, 7,
8, 9, 10, 11,
12, 13, 14, 15,
16, 17, 18, 19,
20, 21, 22, 23,
24, 25, 26, 27,
28, 29, 30, 31,
};
static const int16_t ALIGN(av1_default_scan_4x16[], 32) = {
static const uint16_t ALIGN(av1_default_scan_4x16[], 32) = {
0, 16, 1, 32,
17, 2, 48, 33,
18, 3, 49, 34,
@ -96,7 +80,7 @@ static const int16_t ALIGN(av1_default_scan_4x16[], 32) = {
30, 15, 61, 46,
31, 62, 47, 63,
};
static const int16_t ALIGN(av1_mrow_scan_4x16[], 32) = {
static const uint16_t ALIGN(av1_mrow_scan_4x16[], 32) = {
0, 16, 32, 48,
1, 17, 33, 49,
2, 18, 34, 50,
@ -114,43 +98,19 @@ static const int16_t ALIGN(av1_mrow_scan_4x16[], 32) = {
14, 30, 46, 62,
15, 31, 47, 63,
};
static const int16_t ALIGN(av1_mcol_scan_4x16[], 32) = {
0, 1, 2, 3,
4, 5, 6, 7,
8, 9, 10, 11,
12, 13, 14, 15,
16, 17, 18, 19,
20, 21, 22, 23,
24, 25, 26, 27,
28, 29, 30, 31,
32, 33, 34, 35,
36, 37, 38, 39,
40, 41, 42, 43,
44, 45, 46, 47,
48, 49, 50, 51,
52, 53, 54, 55,
56, 57, 58, 59,
60, 61, 62, 63,
};
static const int16_t ALIGN(av1_default_scan_8x4[], 32) = {
static const uint16_t ALIGN(av1_default_scan_8x4[], 32) = {
0, 1, 4, 2, 5, 8, 3, 6,
9, 12, 7, 10, 13, 16, 11, 14,
17, 20, 15, 18, 21, 24, 19, 22,
25, 28, 23, 26, 29, 27, 30, 31,
};
static const int16_t ALIGN(av1_mrow_scan_8x4[], 32) = {
static const uint16_t ALIGN(av1_mrow_scan_8x4[], 32) = {
0, 4, 8, 12, 16, 20, 24, 28,
1, 5, 9, 13, 17, 21, 25, 29,
2, 6, 10, 14, 18, 22, 26, 30,
3, 7, 11, 15, 19, 23, 27, 31,
};
static const int16_t ALIGN(av1_mcol_scan_8x4[], 32) = {
0, 1, 2, 3, 4, 5, 6, 7,
8, 9, 10, 11, 12, 13, 14, 15,
16, 17, 18, 19, 20, 21, 22, 23,
24, 25, 26, 27, 28, 29, 30, 31,
};
static const int16_t ALIGN(av1_default_scan_8x8[], 32) = {
static const uint16_t ALIGN(av1_default_scan_8x8[], 32) = {
0, 8, 1, 2, 9, 16, 24, 17,
10, 3, 4, 11, 18, 25, 32, 40,
33, 26, 19, 12, 5, 6, 13, 20,
@ -160,7 +120,7 @@ static const int16_t ALIGN(av1_default_scan_8x8[], 32) = {
23, 31, 38, 45, 52, 59, 60, 53,
46, 39, 47, 54, 61, 62, 55, 63,
};
static const int16_t ALIGN(av1_mrow_scan_8x8[], 32) = {
static const uint16_t ALIGN(av1_mrow_scan_8x8[], 32) = {
0, 8, 16, 24, 32, 40, 48, 56,
1, 9, 17, 25, 33, 41, 49, 57,
2, 10, 18, 26, 34, 42, 50, 58,
@ -170,17 +130,7 @@ static const int16_t ALIGN(av1_mrow_scan_8x8[], 32) = {
6, 14, 22, 30, 38, 46, 54, 62,
7, 15, 23, 31, 39, 47, 55, 63,
};
static const int16_t ALIGN(av1_mcol_scan_8x8[], 32) = {
0, 1, 2, 3, 4, 5, 6, 7,
8, 9, 10, 11, 12, 13, 14, 15,
16, 17, 18, 19, 20, 21, 22, 23,
24, 25, 26, 27, 28, 29, 30, 31,
32, 33, 34, 35, 36, 37, 38, 39,
40, 41, 42, 43, 44, 45, 46, 47,
48, 49, 50, 51, 52, 53, 54, 55,
56, 57, 58, 59, 60, 61, 62, 63,
};
static const int16_t ALIGN(av1_default_scan_8x16[], 32) = {
static const uint16_t ALIGN(av1_default_scan_8x16[], 32) = {
0, 16, 1, 32, 17, 2, 48, 33,
18, 3, 64, 49, 34, 19, 4, 80,
65, 50, 35, 20, 5, 96, 81, 66,
@ -198,7 +148,7 @@ static const int16_t ALIGN(av1_default_scan_8x16[], 32) = {
47, 123, 108, 93, 78, 63, 124, 109,
94, 79, 125, 110, 95, 126, 111, 127,
};
static const int16_t ALIGN(av1_mrow_scan_8x16[], 32) = {
static const uint16_t ALIGN(av1_mrow_scan_8x16[], 32) = {
0, 16, 32, 48, 64, 80, 96, 112,
1, 17, 33, 49, 65, 81, 97, 113,
2, 18, 34, 50, 66, 82, 98, 114,
@ -216,25 +166,7 @@ static const int16_t ALIGN(av1_mrow_scan_8x16[], 32) = {
14, 30, 46, 62, 78, 94, 110, 126,
15, 31, 47, 63, 79, 95, 111, 127,
};
static const int16_t ALIGN(av1_mcol_scan_8x16[], 32) = {
0, 1, 2, 3, 4, 5, 6, 7,
8, 9, 10, 11, 12, 13, 14, 15,
16, 17, 18, 19, 20, 21, 22, 23,
24, 25, 26, 27, 28, 29, 30, 31,
32, 33, 34, 35, 36, 37, 38, 39,
40, 41, 42, 43, 44, 45, 46, 47,
48, 49, 50, 51, 52, 53, 54, 55,
56, 57, 58, 59, 60, 61, 62, 63,
64, 65, 66, 67, 68, 69, 70, 71,
72, 73, 74, 75, 76, 77, 78, 79,
80, 81, 82, 83, 84, 85, 86, 87,
88, 89, 90, 91, 92, 93, 94, 95,
96, 97, 98, 99, 100, 101, 102, 103,
104, 105, 106, 107, 108, 109, 110, 111,
112, 113, 114, 115, 116, 117, 118, 119,
120, 121, 122, 123, 124, 125, 126, 127,
};
static const int16_t ALIGN(av1_default_scan_8x32[], 32) = {
static const uint16_t ALIGN(av1_default_scan_8x32[], 32) = {
0, 32, 1, 64, 33, 2, 96, 65,
34, 3, 128, 97, 66, 35, 4, 160,
129, 98, 67, 36, 5, 192, 161, 130,
@ -268,25 +200,19 @@ static const int16_t ALIGN(av1_default_scan_8x32[], 32) = {
95, 251, 220, 189, 158, 127, 252, 221,
190, 159, 253, 222, 191, 254, 223, 255,
};
static const int16_t ALIGN(av1_default_scan_16x4[], 32) = {
static const uint16_t ALIGN(av1_default_scan_16x4[], 32) = {
0, 1, 4, 2, 5, 8, 3, 6, 9, 12, 7, 10, 13, 16, 11, 14,
17, 20, 15, 18, 21, 24, 19, 22, 25, 28, 23, 26, 29, 32, 27, 30,
33, 36, 31, 34, 37, 40, 35, 38, 41, 44, 39, 42, 45, 48, 43, 46,
49, 52, 47, 50, 53, 56, 51, 54, 57, 60, 55, 58, 61, 59, 62, 63,
};
static const int16_t ALIGN(av1_mrow_scan_16x4[], 32) = {
static const uint16_t ALIGN(av1_mrow_scan_16x4[], 32) = {
0, 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60,
1, 5, 9, 13, 17, 21, 25, 29, 33, 37, 41, 45, 49, 53, 57, 61,
2, 6, 10, 14, 18, 22, 26, 30, 34, 38, 42, 46, 50, 54, 58, 62,
3, 7, 11, 15, 19, 23, 27, 31, 35, 39, 43, 47, 51, 55, 59, 63,
};
static const int16_t ALIGN(av1_mcol_scan_16x4[], 32) = {
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63,
};
static const int16_t ALIGN(av1_default_scan_16x8[], 32) = {
static const uint16_t ALIGN(av1_default_scan_16x8[], 32) = {
0, 1, 8, 2, 9, 16, 3, 10, 17, 24, 4, 11, 18, 25, 32, 5,
12, 19, 26, 33, 40, 6, 13, 20, 27, 34, 41, 48, 7, 14, 21, 28,
35, 42, 49, 56, 15, 22, 29, 36, 43, 50, 57, 64, 23, 30, 37, 44,
@ -296,7 +222,7 @@ static const int16_t ALIGN(av1_default_scan_16x8[], 32) = {
99, 106, 113, 120, 79, 86, 93, 100, 107, 114, 121, 87, 94, 101, 108, 115,
122, 95, 102, 109, 116, 123, 103, 110, 117, 124, 111, 118, 125, 119, 126, 127,
};
static const int16_t ALIGN(av1_mrow_scan_16x8[], 32) = {
static const uint16_t ALIGN(av1_mrow_scan_16x8[], 32) = {
0, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120,
1, 9, 17, 25, 33, 41, 49, 57, 65, 73, 81, 89, 97, 105, 113, 121,
2, 10, 18, 26, 34, 42, 50, 58, 66, 74, 82, 90, 98, 106, 114, 122,
@ -306,17 +232,7 @@ static const int16_t ALIGN(av1_mrow_scan_16x8[], 32) = {
6, 14, 22, 30, 38, 46, 54, 62, 70, 78, 86, 94, 102, 110, 118, 126,
7, 15, 23, 31, 39, 47, 55, 63, 71, 79, 87, 95, 103, 111, 119, 127,
};
static const int16_t ALIGN(av1_mcol_scan_16x8[], 32) = {
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63,
64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79,
80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95,
96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111,
112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127,
};
static const int16_t ALIGN(av1_default_scan_16x16[], 32) = {
static const uint16_t ALIGN(av1_default_scan_16x16[], 32) = {
0, 16, 1, 2, 17, 32, 48, 33, 18, 3, 4, 19, 34, 49, 64, 80,
65, 50, 35, 20, 5, 6, 21, 36, 51, 66, 81, 96, 112, 97, 82, 67,
52, 37, 22, 7, 8, 23, 38, 53, 68, 83, 98, 113, 128, 144, 129, 114,
@ -334,7 +250,7 @@ static const int16_t ALIGN(av1_default_scan_16x16[], 32) = {
188, 173, 158, 143, 159, 174, 189, 204, 219, 234, 249, 250, 235, 220, 205, 190,
175, 191, 206, 221, 236, 251, 252, 237, 222, 207, 223, 238, 253, 254, 239, 255,
};
static const int16_t ALIGN(av1_mrow_scan_16x16[], 32) = {
static const uint16_t ALIGN(av1_mrow_scan_16x16[], 32) = {
0, 16, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, 240,
1, 17, 33, 49, 65, 81, 97, 113, 129, 145, 161, 177, 193, 209, 225, 241,
2, 18, 34, 50, 66, 82, 98, 114, 130, 146, 162, 178, 194, 210, 226, 242,
@ -352,7 +268,7 @@ static const int16_t ALIGN(av1_mrow_scan_16x16[], 32) = {
14, 30, 46, 62, 78, 94, 110, 126, 142, 158, 174, 190, 206, 222, 238, 254,
15, 31, 47, 63, 79, 95, 111, 127, 143, 159, 175, 191, 207, 223, 239, 255,
};
static const int16_t ALIGN(av1_mcol_scan_16x16[], 32) = {
static const uint16_t ALIGN(av1_mcol_scan_16x16[], 32) = {
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
@ -370,7 +286,7 @@ static const int16_t ALIGN(av1_mcol_scan_16x16[], 32) = {
224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239,
240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, 255,
};
static const int16_t ALIGN(av1_default_scan_16x32[], 32) = {
static const uint16_t ALIGN(av1_default_scan_16x32[], 32) = {
0, 32, 1, 64, 33, 2, 96, 65, 34, 3, 128, 97, 66, 35, 4, 160,
129, 98, 67, 36, 5, 192, 161, 130, 99, 68, 37, 6, 224, 193, 162, 131,
100, 69, 38, 7, 256, 225, 194, 163, 132, 101, 70, 39, 8, 288, 257, 226,
@ -404,7 +320,7 @@ static const int16_t ALIGN(av1_default_scan_16x32[], 32) = {
380, 349, 318, 287, 505, 474, 443, 412, 381, 350, 319, 506, 475, 444, 413, 382,
351, 507, 476, 445, 414, 383, 508, 477, 446, 415, 509, 478, 447, 510, 479, 511,
};
static const int16_t ALIGN(av1_default_scan_32x8[], 32) = {
static const uint16_t ALIGN(av1_default_scan_32x8[], 32) = {
0, 1, 8, 2, 9, 16, 3, 10, 17, 24, 4, 11, 18, 25, 32, 5, 12, 19, 26, 33, 40, 6, 13, 20, 27, 34, 41, 48, 7, 14, 21, 28,
35, 42, 49, 56, 15, 22, 29, 36, 43, 50, 57, 64, 23, 30, 37, 44, 51, 58, 65, 72, 31, 38, 45, 52, 59, 66, 73, 80, 39, 46, 53, 60,
67, 74, 81, 88, 47, 54, 61, 68, 75, 82, 89, 96, 55, 62, 69, 76, 83, 90, 97, 104, 63, 70, 77, 84, 91, 98, 105, 112, 71, 78, 85, 92,
@ -414,7 +330,7 @@ static const int16_t ALIGN(av1_default_scan_32x8[], 32) = {
195, 202, 209, 216, 175, 182, 189, 196, 203, 210, 217, 224, 183, 190, 197, 204, 211, 218, 225, 232, 191, 198, 205, 212, 219, 226, 233, 240, 199, 206, 213, 220,
227, 234, 241, 248, 207, 214, 221, 228, 235, 242, 249, 215, 222, 229, 236, 243, 250, 223, 230, 237, 244, 251, 231, 238, 245, 252, 239, 246, 253, 247, 254, 255,
};
static const int16_t ALIGN(av1_default_scan_32x16[], 32) = {
static const uint16_t ALIGN(av1_default_scan_32x16[], 32) = {
0, 1, 16, 2, 17, 32, 3, 18, 33, 48, 4, 19, 34, 49, 64, 5, 20, 35, 50, 65, 80, 6, 21, 36, 51, 66, 81, 96, 7, 22, 37, 52,
67, 82, 97, 112, 8, 23, 38, 53, 68, 83, 98, 113, 128, 9, 24, 39, 54, 69, 84, 99, 114, 129, 144, 10, 25, 40, 55, 70, 85, 100, 115, 130,
145, 160, 11, 26, 41, 56, 71, 86, 101, 116, 131, 146, 161, 176, 12, 27, 42, 57, 72, 87, 102, 117, 132, 147, 162, 177, 192, 13, 28, 43, 58, 73,
@ -432,7 +348,7 @@ static const int16_t ALIGN(av1_default_scan_32x16[], 32) = {
381, 396, 411, 426, 441, 456, 471, 486, 501, 367, 382, 397, 412, 427, 442, 457, 472, 487, 502, 383, 398, 413, 428, 443, 458, 473, 488, 503, 399, 414, 429, 444,
459, 474, 489, 504, 415, 430, 445, 460, 475, 490, 505, 431, 446, 461, 476, 491, 506, 447, 462, 477, 492, 507, 463, 478, 493, 508, 479, 494, 509, 495, 510, 511,
};
static const int16_t ALIGN(av1_default_scan_32x32[], 32) = {
static const uint16_t ALIGN(av1_default_scan_32x32[], 32) = {
0, 32, 1, 2, 33, 64, 96, 65, 34, 3, 4, 35, 66, 97, 128, 160, 129, 98, 67, 36, 5, 6, 37, 68, 99, 130, 161, 192, 224, 193, 162, 131,
100, 69, 38, 7, 8, 39, 70, 101, 132, 163, 194, 225, 256, 288, 257, 226, 195, 164, 133, 102, 71, 40, 9, 10, 41, 72, 103, 134, 165, 196, 227, 258,
289, 320, 352, 321, 290, 259, 228, 197, 166, 135, 104, 73, 42, 11, 12, 43, 74, 105, 136, 167, 198, 229, 260, 291, 322, 353, 384, 416, 385, 354, 323, 292,
@ -467,15 +383,15 @@ static const int16_t ALIGN(av1_default_scan_32x32[], 32) = {
892, 861, 830, 799, 831, 862, 893, 924, 955, 986, 1017, 1018, 987, 956, 925, 894, 863, 895, 926, 957, 988, 1019, 1020, 989, 958, 927, 959, 990, 1021, 1022, 991, 1023,
};
const int16_t *const dav1d_scans[N_RECT_TX_SIZES][3] = {
const uint16_t *const dav1d_scans[N_RECT_TX_SIZES][3] = {
[TX_4X4] = {
[TX_CLASS_2D] = av1_default_scan_4x4,
[TX_CLASS_V] = av1_mrow_scan_4x4,
[TX_CLASS_H] = av1_mcol_scan_4x4,
[TX_CLASS_H] = av1_mcol_scan_16x16,
}, [TX_8X8] = {
[TX_CLASS_2D] = av1_default_scan_8x8,
[TX_CLASS_V] = av1_mrow_scan_8x8,
[TX_CLASS_H] = av1_mcol_scan_8x8,
[TX_CLASS_H] = av1_mcol_scan_16x16,
}, [TX_16X16] = {
[TX_CLASS_2D] = av1_default_scan_16x16,
[TX_CLASS_V] = av1_mrow_scan_16x16,
@ -487,19 +403,19 @@ const int16_t *const dav1d_scans[N_RECT_TX_SIZES][3] = {
}, [RTX_4X8] = {
[TX_CLASS_2D] = av1_default_scan_4x8,
[TX_CLASS_V] = av1_mrow_scan_4x8,
[TX_CLASS_H] = av1_mcol_scan_4x8,
[TX_CLASS_H] = av1_mcol_scan_16x16,
}, [RTX_8X4] = {
[TX_CLASS_2D] = av1_default_scan_8x4,
[TX_CLASS_V] = av1_mrow_scan_8x4,
[TX_CLASS_H] = av1_mcol_scan_8x4,
[TX_CLASS_H] = av1_mcol_scan_16x16,
}, [RTX_8X16] = {
[TX_CLASS_2D] = av1_default_scan_8x16,
[TX_CLASS_V] = av1_mrow_scan_8x16,
[TX_CLASS_H] = av1_mcol_scan_8x16,
[TX_CLASS_H] = av1_mcol_scan_16x16,
}, [RTX_16X8] = {
[TX_CLASS_2D] = av1_default_scan_16x8,
[TX_CLASS_V] = av1_mrow_scan_16x8,
[TX_CLASS_H] = av1_mcol_scan_16x8,
[TX_CLASS_H] = av1_mcol_scan_16x16,
}, [RTX_16X32] = {
[TX_CLASS_2D] = av1_default_scan_16x32,
}, [RTX_32X16] = {
@ -511,11 +427,11 @@ const int16_t *const dav1d_scans[N_RECT_TX_SIZES][3] = {
}, [RTX_4X16] = {
[TX_CLASS_2D] = av1_default_scan_4x16,
[TX_CLASS_V] = av1_mrow_scan_4x16,
[TX_CLASS_H] = av1_mcol_scan_4x16,
[TX_CLASS_H] = av1_mcol_scan_16x16,
}, [RTX_16X4] = {
[TX_CLASS_2D] = av1_default_scan_16x4,
[TX_CLASS_V] = av1_mrow_scan_16x4,
[TX_CLASS_H] = av1_mcol_scan_16x4,
[TX_CLASS_H] = av1_mcol_scan_16x16,
}, [RTX_8X32] = {
[TX_CLASS_2D] = av1_default_scan_8x32,
}, [RTX_32X8] = {

View File

@ -32,6 +32,6 @@
#include "src/levels.h"
extern const int16_t *const dav1d_scans[N_RECT_TX_SIZES][3];
extern const uint16_t *const dav1d_scans[N_RECT_TX_SIZES][3];
#endif /* DAV1D_SRC_SCAN_H */

View File

@ -225,37 +225,26 @@ const uint8_t /* enum InterPredMode */
[NEARMV_NEWMV] = { NEARMV, NEWMV },
};
const uint8_t dav1d_tx_type_count[N_TXTP_SETS] = {
[TXTP_SET_DCT] = 1,
[TXTP_SET_DCT_ID] = 2,
[TXTP_SET_DT4_ID] = 5,
[TXTP_SET_DT4_ID_1D] = 7,
[TXTP_SET_DT9_ID_1D] = 12,
[TXTP_SET_ALL] = 16,
[TXTP_SET_LOSSLESS] = 1,
const uint8_t dav1d_partition_type_count[N_BL_LEVELS] = {
[BL_128X128] = N_PARTITIONS - 3,
[BL_64X64] = N_PARTITIONS - 1,
[BL_32X32] = N_PARTITIONS - 1,
[BL_16X16] = N_PARTITIONS - 1,
[BL_8X8] = N_SUB8X8_PARTITIONS - 1,
};
const uint8_t /* enum TxfmType */
dav1d_tx_types_per_set[N_TXTP_SETS][N_TX_TYPES] =
{
[TXTP_SET_DCT] = { DCT_DCT },
[TXTP_SET_DCT_ID] = { IDTX, DCT_DCT },
[TXTP_SET_DT4_ID] = { IDTX, DCT_DCT, ADST_ADST, ADST_DCT, DCT_ADST },
[TXTP_SET_DT4_ID_1D] = { IDTX, DCT_DCT, V_DCT, H_DCT, ADST_ADST, ADST_DCT,
DCT_ADST },
[TXTP_SET_DT9_ID_1D] = { IDTX, V_DCT, H_DCT, DCT_DCT, ADST_DCT, DCT_ADST,
FLIPADST_DCT, DCT_FLIPADST, ADST_ADST,
FLIPADST_FLIPADST, ADST_FLIPADST, FLIPADST_ADST },
[TXTP_SET_ALL] = { IDTX, V_DCT, H_DCT, V_ADST, H_ADST, V_FLIPADST,
H_FLIPADST, DCT_DCT, ADST_DCT, DCT_ADST,
FLIPADST_DCT, DCT_FLIPADST, ADST_ADST,
FLIPADST_FLIPADST, ADST_FLIPADST, FLIPADST_ADST },
[TXTP_SET_LOSSLESS] = { WHT_WHT },
};
const uint8_t dav1d_tx_type_set_index[2][N_TXTP_SETS] = {
{ 0, -1, 2, 1, -1, -1, 3 },
{ 0, 3, -1, -1, 2, 1, 4 },
const uint8_t /* enum TxfmType */ dav1d_tx_types_per_set[40] = {
/* Intra2 */
IDTX, DCT_DCT, ADST_ADST, ADST_DCT, DCT_ADST,
/* Intra1 */
IDTX, DCT_DCT, V_DCT, H_DCT, ADST_ADST, ADST_DCT, DCT_ADST,
/* Inter2 */
IDTX, V_DCT, H_DCT, DCT_DCT, ADST_DCT, DCT_ADST, FLIPADST_DCT,
DCT_FLIPADST, ADST_ADST, FLIPADST_FLIPADST, ADST_FLIPADST, FLIPADST_ADST,
/* Inter1 */
IDTX, V_DCT, H_DCT, V_ADST, H_ADST, V_FLIPADST, H_FLIPADST,
DCT_DCT, ADST_DCT, DCT_ADST, FLIPADST_DCT, DCT_FLIPADST,
ADST_ADST, FLIPADST_FLIPADST, ADST_FLIPADST, FLIPADST_ADST,
};
const uint8_t dav1d_ymode_size_context[N_BS_SIZES] = {
@ -283,119 +272,34 @@ const uint8_t dav1d_ymode_size_context[N_BS_SIZES] = {
[BS_4x4 ] = 0,
};
const uint8_t dav1d_nz_map_ctx_offset[N_RECT_TX_SIZES][5][5] = {
[TX_4X4] = {
{ 0, 1, 6, 6 },
{ 1, 6, 6, 21 },
{ 6, 6, 21, 21 },
{ 6, 21, 21, 21 },
}, [TX_8X8] = {
const uint8_t dav1d_lo_ctx_offsets[3][5][5] = {
{ /* w == h */
{ 0, 1, 6, 6, 21 },
{ 1, 6, 6, 21, 21 },
{ 6, 6, 21, 21, 21 },
{ 6, 21, 21, 21, 21 },
{ 21, 21, 21, 21, 21 }
}, [TX_16X16] = {
{ 0, 1, 6, 6, 21 },
{ 1, 6, 6, 21, 21 },
{ 6, 6, 21, 21, 21 },
{ 6, 21, 21, 21, 21 },
{ 21, 21, 21, 21, 21 }
}, [TX_32X32] = {
{ 0, 1, 6, 6, 21 },
{ 1, 6, 6, 21, 21 },
{ 6, 6, 21, 21, 21 },
{ 6, 21, 21, 21, 21 },
{ 21, 21, 21, 21, 21 }
}, [TX_64X64] = {
{ 0, 1, 6, 6, 21 },
{ 1, 6, 6, 21, 21 },
{ 6, 6, 21, 21, 21 },
{ 6, 21, 21, 21, 21 },
{ 21, 21, 21, 21, 21 }
}, [RTX_4X8] = {
{ 0, 11, 11, 11 },
{ 11, 11, 11, 11 },
{ 6, 6, 21, 21 },
{ 6, 21, 21, 21 },
{ 21, 21, 21, 21 }
}, [RTX_8X4] = {
{ 21, 21, 21, 21, 21 },
}, { /* w > h */
{ 0, 16, 6, 6, 21 },
{ 16, 16, 6, 21, 21 },
{ 16, 16, 21, 21, 21 },
{ 16, 16, 21, 21, 21 },
}, [RTX_8X16] = {
{ 16, 16, 21, 21, 21 },
}, { /* w < h */
{ 0, 11, 11, 11, 11 },
{ 11, 11, 11, 11, 11 },
{ 6, 6, 21, 21, 21 },
{ 6, 21, 21, 21, 21 },
{ 21, 21, 21, 21, 21 }
}, [RTX_16X8] = {
{ 0, 16, 6, 6, 21 },
{ 16, 16, 6, 21, 21 },
{ 16, 16, 21, 21, 21 },
{ 16, 16, 21, 21, 21 },
{ 16, 16, 21, 21, 21 }
}, [RTX_16X32] = {
{ 0, 11, 11, 11, 11 },
{ 11, 11, 11, 11, 11 },
{ 6, 6, 21, 21, 21 },
{ 6, 21, 21, 21, 21 },
{ 21, 21, 21, 21, 21 }
}, [RTX_32X16] = {
{ 0, 16, 6, 6, 21 },
{ 16, 16, 6, 21, 21 },
{ 16, 16, 21, 21, 21 },
{ 16, 16, 21, 21, 21 },
{ 16, 16, 21, 21, 21 }
}, [RTX_32X64] = {
{ 0, 11, 11, 11, 11 },
{ 11, 11, 11, 11, 11 },
{ 6, 6, 21, 21, 21 },
{ 6, 21, 21, 21, 21 },
{ 21, 21, 21, 21, 21 }
}, [RTX_64X32] = {
{ 0, 16, 6, 6, 21 },
{ 16, 16, 6, 21, 21 },
{ 16, 16, 21, 21, 21 },
{ 16, 16, 21, 21, 21 },
{ 16, 16, 21, 21, 21 }
}, [RTX_4X16] = {
{ 0, 11, 11, 11 },
{ 11, 11, 11, 11 },
{ 6, 6, 21, 21 },
{ 6, 21, 21, 21 },
{ 21, 21, 21, 21 }
}, [RTX_16X4] = {
{ 0, 16, 6, 6, 21 },
{ 16, 16, 6, 21, 21 },
{ 16, 16, 21, 21, 21 },
{ 16, 16, 21, 21, 21 },
}, [RTX_8X32] = {
{ 0, 11, 11, 11, 11 },
{ 11, 11, 11, 11, 11 },
{ 6, 6, 21, 21, 21 },
{ 6, 21, 21, 21, 21 },
{ 21, 21, 21, 21, 21 }
}, [RTX_32X8] = {
{ 0, 16, 6, 6, 21 },
{ 16, 16, 6, 21, 21 },
{ 16, 16, 21, 21, 21 },
{ 16, 16, 21, 21, 21 },
{ 16, 16, 21, 21, 21 }
}, [RTX_16X64] = {
{ 0, 11, 11, 11, 11 },
{ 11, 11, 11, 11, 11 },
{ 6, 6, 21, 21, 21 },
{ 6, 21, 21, 21, 21 },
{ 21, 21, 21, 21, 21 }
}, [RTX_64X16] = {
{ 0, 16, 6, 6, 21 },
{ 16, 16, 6, 21, 21 },
{ 16, 16, 21, 21, 21 },
{ 16, 16, 21, 21, 21 },
{ 16, 16, 21, 21, 21 }
}
{ 21, 21, 21, 21, 21 },
},
};
const uint8_t dav1d_skip_ctx[5][5] = {
{ 1, 2, 2, 2, 3 },
{ 2, 4, 4, 4, 5 },
{ 2, 4, 4, 4, 5 },
{ 2, 4, 4, 4, 5 },
{ 3, 5, 5, 5, 6 },
};
const uint8_t /* enum TxClass */ dav1d_tx_type_class[N_TX_TYPES_PLUS_LL] = {
@ -861,7 +765,7 @@ const int8_t ALIGN(dav1d_filter_intra_taps[5][64], 16) = {
}
};
const uint8_t dav1d_obmc_masks[64] = {
const uint8_t ALIGN(dav1d_obmc_masks[64], 16) = {
/* Unused */
0, 0,
/* 2 */

View File

@ -52,14 +52,13 @@ extern const uint8_t /* enum TxfmType */
extern const uint8_t /* enum InterPredMode */
dav1d_comp_inter_pred_modes[N_COMP_INTER_PRED_MODES][2];
extern const uint8_t dav1d_tx_type_count[N_TXTP_SETS];
extern const uint8_t /* enum TxfmType */
dav1d_tx_types_per_set[N_TXTP_SETS][N_TX_TYPES];
extern const uint8_t dav1d_tx_type_set_index[2][N_TXTP_SETS];
extern const uint8_t dav1d_partition_type_count[N_BL_LEVELS];
extern const uint8_t /* enum TxfmType */ dav1d_tx_types_per_set[40];
extern const uint8_t dav1d_filter_mode_to_y_mode[5];
extern const uint8_t dav1d_ymode_size_context[N_BS_SIZES];
extern const uint8_t dav1d_nz_map_ctx_offset[N_RECT_TX_SIZES][5][5];
extern const uint8_t dav1d_lo_ctx_offsets[3][5][5];
extern const uint8_t dav1d_skip_ctx[5][5];
extern const uint8_t /* enum TxClass */
dav1d_tx_type_class[N_TX_TYPES_PLUS_LL];
extern const uint8_t /* enum Filter2d */

View File

@ -48,6 +48,10 @@ typedef SRWLOCK pthread_mutex_t;
typedef CONDITION_VARIABLE pthread_cond_t;
typedef INIT_ONCE pthread_once_t;
void dav1d_init_thread(void);
void dav1d_set_thread_name(const wchar_t *name);
#define dav1d_set_thread_name(name) dav1d_set_thread_name(L##name)
int dav1d_pthread_create(pthread_t *thread, const pthread_attr_t *attr,
void *(*func)(void*), void *arg);
int dav1d_pthread_join(pthread_t *thread, void **res);
@ -126,7 +130,7 @@ static inline int pthread_cond_broadcast(pthread_cond_t *const cond) {
#include <pthread.h>
#endif
#define dav1d_init_thread() do {} while (0)
/* Thread naming support */
@ -134,13 +138,40 @@ static inline int pthread_cond_broadcast(pthread_cond_t *const cond) {
#include <sys/prctl.h>
static inline void dav1d_set_thread_name(const char* name) {
static inline void dav1d_set_thread_name(const char *const name) {
prctl(PR_SET_NAME, name);
}
#elif defined(__APPLE__)
static inline void dav1d_set_thread_name(const char *const name) {
pthread_setname_np(name);
}
#elif defined(__DragonFly__) || defined(__FreeBSD__) || defined(__OpenBSD__)
#if defined(__FreeBSD__)
/* ALIGN from <sys/param.h> conflicts with ALIGN from "common/attributes.h" */
#define _SYS_PARAM_H_
#include <sys/types.h>
#endif
#include <pthread_np.h>
static inline void dav1d_set_thread_name(const char *const name) {
pthread_set_name_np(pthread_self(), name);
}
#elif defined(__NetBSD__)
static inline void dav1d_set_thread_name(const char *const name) {
pthread_setname_np(pthread_self(), "%s", (void*)name);
}
#else
#define dav1d_set_thread_name(name)
#define dav1d_set_thread_name(name) do {} while (0)
#endif
#endif

View File

@ -27,7 +27,6 @@
#include "config.h"
#include <assert.h>
#include <stdlib.h>
#include "common/intops.h"

View File

@ -37,6 +37,20 @@
#include "src/thread.h"
static HRESULT (WINAPI *set_thread_description)(HANDLE, PCWSTR);
COLD void dav1d_init_thread(void) {
set_thread_description =
(void*)GetProcAddress(GetModuleHandleW(L"kernel32.dll"),
"SetThreadDescription");
}
#undef dav1d_set_thread_name
COLD void dav1d_set_thread_name(const wchar_t *const name) {
if (set_thread_description) /* Only available since Windows 10 1607 */
set_thread_description(GetCurrentThread(), name);
}
static COLD unsigned __stdcall thread_entrypoint(void *const data) {
pthread_t *const t = data;
t->arg = t->func(t->arg);

1582
third_party/dav1d/src/x86/film_grain.asm vendored Normal file

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,45 @@
/*
* Copyright © 2018, VideoLAN and dav1d authors
* Copyright © 2018, Two Orioles, LLC
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
* ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "src/cpu.h"
#include "src/film_grain.h"
decl_generate_grain_y_fn(dav1d_generate_grain_y_avx2);
decl_fgy_32x32xn_fn(dav1d_fgy_32x32xn_avx2);
decl_fguv_32x32xn_fn(dav1d_fguv_32x32xn_i420_avx2);
COLD void bitfn(dav1d_film_grain_dsp_init_x86)(Dav1dFilmGrainDSPContext *const c) {
const unsigned flags = dav1d_get_cpu_flags();
if (!(flags & DAV1D_X86_CPU_FLAG_AVX2)) return;
#if BITDEPTH == 8 && ARCH_X86_64
c->generate_grain_y = dav1d_generate_grain_y_avx2;
c->fgy_32x32xn = dav1d_fgy_32x32xn_avx2;
c->fguv_32x32xn[DAV1D_PIXEL_LAYOUT_I420 - 1] = dav1d_fguv_32x32xn_i420_avx2;
#endif
}

View File

@ -47,9 +47,11 @@ pw_m%2_%1: dw -%2, %1
pw_3803_1321: dw 3803, 1321
pw_m1321_2482: dw -1321, 2482
pw_2482_3344: dw 2482, 3344
pw_m3344_3344: dw -3344, 3344
pw_m3803_3344: dw -3803, 3344
pw_m3803_m6688: dw -3803, -6688
%define pw_3344x8 iadst4_dconly2b
COEF_PAIR 2896, 2896
pw_2896_m2896: dw 2896, -2896
pw_5: times 2 dw 5
pw_2048: times 2 dw 2048
@ -464,13 +466,15 @@ ALIGN function_align
%macro IADST4_1D_PACKED 0
punpcklwd m2, m1, m0
punpckhwd m3, m1, m0
psubw m0, m1
punpckhqdq m1, m1
paddw m1, m0 ; in0 - in2 + in3
vpbroadcastd m5, [o(pw_m3344_3344)]
vpbroadcastd m0, [o(pw_3803_1321)]
vpbroadcastd m4, [o(pw_m1321_2482)]
pmaddwd m1, m5, m2 ; 3344*in3 - 3344*in2
psrld m5, 16
pmaddwd m0, m2
pmaddwd m2, m4
pmaddwd m5, m3 ; 3344*in0
paddd m1, m5 ; 3344*in0 - 3344*in2 + 3344*in3
vpbroadcastd m4, [o(pw_2482_3344)]
vpbroadcastd m5, [o(pw_m3803_3344)]
pmaddwd m4, m3
@ -478,19 +482,16 @@ ALIGN function_align
paddd m4, m0 ; 1321*in0 + 3344*in1 + 3803*in2 + 2482*in3
vpbroadcastd m0, [o(pw_m3803_m6688)]
pmaddwd m3, m0
vpbroadcastd m0, [o(pw_3344x8)]
pmulhrsw m1, m0 ; out2 ____
vpbroadcastd m0, [o(pd_2048)]
paddd m2, m0
paddd m1, m0
paddd m0, m4
paddd m5, m2 ; 2482*in0 + 3344*in1 - 1321*in2 - 3803*in3
paddd m2, m4
paddd m2, m3
psrad m0, 12
psrad m5, 12
psrad m2, 12
REPX {psrad x, 12}, m1, m2, m0, m5
packssdw m0, m5 ; out0 out1
packssdw m2, m2 ; out3 out3
packssdw m1, m2 ; out2 out3
%endmacro
INV_TXFM_4X4_FN dct, dct, 0
@ -524,14 +525,13 @@ cglobal iadst_4x4_internal, 0, 5, 6, dst, stride, c, eob, tx2
mova m0, [cq+16*0]
mova m1, [cq+16*1]
call .main
punpckhwd m3, m0, m2
punpckhwd m3, m0, m1
punpcklwd m0, m1
punpckhwd m1, m0, m3
punpcklwd m0, m3
jmp tx2q
.pass2:
call .main
vpblendd m1, m1, m2, 0x0c ; out2 out3
.end:
pxor m2, m2
mova [cq+16*0], m2
@ -552,14 +552,13 @@ cglobal iflipadst_4x4_internal, 0, 5, 6, dst, stride, c, eob, tx2
mova m0, [cq+16*0]
mova m1, [cq+16*1]
call m(iadst_4x4_internal).main
punpcklwd m1, m0
punpckhwd m2, m0
punpcklwd m0, m2, m1
punpckhwd m1, m2, m1
punpcklwd m2, m1, m0
punpckhwd m1, m0
punpcklwd m0, m1, m2
punpckhwd m1, m2
jmp tx2q
.pass2:
call m(iadst_4x4_internal).main
vpblendd m1, m1, m2, 0x0c ; out2 out3
.end:
pxor m2, m2
mova [cq+16*0], m2
@ -710,12 +709,55 @@ cglobal iidentity_4x4_internal, 0, 5, 6, dst, stride, c, eob, tx2
paddsw m1, m5 ; out3 out2
%endmacro
%macro IADST8_1D_PACKED 0
%macro IADST8_1D_PACKED 1 ; pass
vpbroadcastd m6, [o(pd_2048)]
punpckhwd m0, m4, m3 ; 0 7
punpckhwd m1, m5, m2 ; 2 5
punpcklwd m2, m5 ; 4 3
punpcklwd m3, m4 ; 6 1
%if %1 == 1
ITX_MUL2X_PACK 0, 4, 5, 6, 401, 4076, 3 ; t1a t0a
ITX_MUL2X_PACK 1, 4, 5, 6, 1931, 3612, 2 ; t2a t3a
ITX_MUL2X_PACK 2, 4, 5, 6, 3166, 2598, 3 ; t5a t4a
ITX_MUL2X_PACK 3, 4, 5, 6, 3920, 1189, 2 ; t6a t7a
psubsw m4, m0, m2 ; t5 t4
paddsw m0, m2 ; t1 t0
psubsw m5, m1, m3 ; t6 t7
paddsw m1, m3 ; t2 t3
ITX_MUL2X_PACK 4, 2, 3, 6, 1567, 3784, 3 ; t5a t4a
ITX_MUL2X_PACK 5, 2, 3, 6, 3784, 1567, 2 ; t7a t6a
%if mmsize > 16
vbroadcasti128 m2, [o(deint_shuf)]
%else
mova m2, [o(deint_shuf)]
%endif
pshuflw m1, m1, q2301
pshufhw m1, m1, q2301
psubsw m3, m0, m1 ; t3 t2
paddsw m0, m1 ; -out7 out0
psubsw m1, m4, m5 ; t7 t6
paddsw m4, m5 ; out6 -out1
pshufb m0, m2
pshufb m4, m2
vpbroadcastd m5, [o(pw_m2896_2896)]
pmaddwd m2, m5, m3
pmaddwd m5, m1
paddd m2, m6
paddd m5, m6
psrad m2, 12
psrad m5, 12
packssdw m2, m5 ; out4 -out5
vpbroadcastd m5, [o(pw_2896_2896)]
pmaddwd m3, m5
pmaddwd m1, m5
paddd m3, m6
paddd m1, m6
psrad m3, 12
psrad m1, 12
packssdw m1, m3 ; out2 -out3
punpcklqdq m3, m4, m0 ; out6 -out7
punpckhqdq m0, m4 ; out0 -out1
%else
ITX_MUL2X_PACK 0, 4, 5, 6, 401, 4076 ; t0a t1a
ITX_MUL2X_PACK 1, 4, 5, 6, 1931, 3612 ; t2a t3a
ITX_MUL2X_PACK 2, 4, 5, 6, 3166, 2598 ; t4a t5a
@ -738,11 +780,12 @@ cglobal iidentity_4x4_internal, 0, 5, 6, dst, stride, c, eob, tx2
vpblendd m0, m0, m4, 0xcc ; out0 -out1
shufps m4, m2, m1, q1032 ; t3 t7
vpblendd m1, m2, m1, 0xcc ; t2 t6
psubw m2, m1, m4 ; t2-t3 t6-t7
paddw m1, m4 ; t2+t3 t6+t7
psubsw m2, m1, m4 ; t2-t3 t6-t7
paddsw m1, m4 ; t2+t3 t6+t7
pmulhrsw m2, m5 ; out4 -out5
pshufd m1, m1, q1032
pmulhrsw m1, m5 ; out2 -out3
%endif
%endmacro
INIT_YMM avx2
@ -790,7 +833,7 @@ cglobal iadst_4x8_internal, 0, 5, 7, dst, stride, c, eob, tx2
pmulhrsw m0, m2
pmulhrsw m1, m2
call m(iadst_8x4_internal).main
punpckhwd m3, m0, m2
punpckhwd m3, m0, m1
punpcklwd m0, m1
punpckhwd m1, m0, m3
punpcklwd m0, m3
@ -800,7 +843,7 @@ cglobal iadst_4x8_internal, 0, 5, 7, dst, stride, c, eob, tx2
vextracti128 xm3, m1, 1
pshufd xm4, xm0, q1032
pshufd xm5, xm1, q1032
call .main
call .main_pass2
vpbroadcastd m4, [o(pw_2048)]
vinserti128 m0, m0, xm2, 1
vinserti128 m1, m1, xm3, 1
@ -822,8 +865,12 @@ cglobal iadst_4x8_internal, 0, 5, 7, dst, stride, c, eob, tx2
WRITE_4X8 0, 1
RET
ALIGN function_align
.main:
WRAP_XMM IADST8_1D_PACKED
.main_pass1:
WRAP_XMM IADST8_1D_PACKED 1
ret
ALIGN function_align
.main_pass2:
WRAP_XMM IADST8_1D_PACKED 2
ret
INV_TXFM_4X8_FN flipadst, dct, 0
@ -839,7 +886,7 @@ cglobal iflipadst_4x8_internal, 0, 5, 7, dst, stride, c, eob, tx2
pmulhrsw m1, m2
call m(iadst_8x4_internal).main
punpcklwd m3, m1, m0
punpckhwd m1, m2, m0
punpckhwd m1, m0
punpcklwd m0, m1, m3
punpckhwd m1, m3
jmp tx2q
@ -848,7 +895,7 @@ cglobal iflipadst_4x8_internal, 0, 5, 7, dst, stride, c, eob, tx2
vextracti128 xm3, m1, 1
pshufd xm4, xm0, q1032
pshufd xm5, xm1, q1032
call m(iadst_4x8_internal).main
call m(iadst_4x8_internal).main_pass2
vpbroadcastd m5, [o(pw_2048)]
vinserti128 m3, m3, xm1, 1
vinserti128 m2, m2, xm0, 1
@ -1099,8 +1146,13 @@ cglobal iadst_4x16_internal, 0, 5, 11, dst, stride, c, eob, tx2
jmp tx2q
.pass2:
call .main
pshufd m1, m1, q1032
vpbroadcastd m5, [o(pw_2896x8)]
paddsw m1, m2, m4
psubsw m2, m4
pmulhrsw m1, m5 ; -out7 out4 out6 -out5
pmulhrsw m2, m5 ; out8 -out11 -out9 out10
vpbroadcastd m5, [o(pw_2048)]
pshufd m1, m1, q1032
vpblendd m4, m1, m0, 0x33
vpblendd m0, m0, m2, 0x33
vpblendd m2, m2, m3, 0x33
@ -1176,7 +1228,6 @@ ALIGN function_align
vinserti128 m0, m0, xm2, 1 ; t1 t0 t9a t8a
vperm2i128 m2, m4, m1, 0x31 ; t7a t6a t15 t14
vinserti128 m4, m4, xm1, 1 ; t4a t5a t12 t13
vpbroadcastd m5, [o(pw_2896x8)]
pshufd m2, m2, q1032 ; t6a t7a t14 t15
psubsw m1, m0, m3 ; t3a t2a t11 t10
paddsw m0, m3 ; -out15 out0 out14 -out1
@ -1184,10 +1235,21 @@ ALIGN function_align
psubsw m4, m2 ; t6 t7 t14a t15a
shufps m2, m1, m4, q1032 ; t2a t6 t10 t14a
vpblendd m4, m4, m1, 0x33 ; t3a t7 t11 t15a
paddw m1, m2, m4
psubw m2, m4
pmulhrsw m1, m5 ; -out7 out4 out6 -out5
pmulhrsw m2, m5 ; out8 -out11 -out9 out10
ret
ALIGN function_align
.main_pass1_end:
vpbroadcastd m5, [o(pw_m2896_2896)]
vpbroadcastd m6, [o(pw_2896_2896)]
punpcklwd m1, m4, m2
punpckhwd m4, m2
pmaddwd m2, m5, m4
pmaddwd m4, m6
pmaddwd m5, m1
pmaddwd m1, m6
REPX {paddd x, m8}, m5, m1, m2, m4
REPX {psrad x, 12}, m5, m2, m1, m4
packssdw m2, m5 ; -out11 out8 out10 -out9
packssdw m1, m4 ; -out7 out4 out6 -out5
ret
INV_TXFM_4X16_FN flipadst, dct, 0
@ -1214,8 +1276,13 @@ cglobal iflipadst_4x16_internal, 0, 5, 11, dst, stride, c, eob, tx2
jmp tx2q
.pass2:
call m(iadst_4x16_internal).main
pshufd m1, m1, q1032
vpbroadcastd m5, [o(pw_2896x8)]
paddsw m1, m2, m4
psubsw m2, m4
pmulhrsw m1, m5 ; -out7 out4 out6 -out5
pmulhrsw m2, m5 ; out8 -out11 -out9 out10
vpbroadcastd m6, [o(pw_2048)]
pshufd m1, m1, q1032
vpblendd m4, m0, m2, 0x33
vpblendd m0, m0, m1, 0xcc
vpblendd m1, m1, m3, 0xcc
@ -1381,7 +1448,7 @@ cglobal iadst_8x4_internal, 0, 5, 7, dst, stride, c, eob, tx2
pmulhrsw xm2, xm0, [cq+16*2]
pmulhrsw xm4, xm0
pmulhrsw xm5, xm0
call m(iadst_4x8_internal).main
call m(iadst_4x8_internal).main_pass1
vinserti128 m0, m0, xm2, 1
vinserti128 m1, m1, xm3, 1
punpckhwd m2, m0, m1
@ -1393,7 +1460,6 @@ cglobal iadst_8x4_internal, 0, 5, 7, dst, stride, c, eob, tx2
jmp tx2q
.pass2:
call .main
vpblendd m1, m1, m2, 0xcc
.end:
vpermq m0, m0, q3120
vpermq m1, m1, q3120
@ -1427,7 +1493,7 @@ cglobal iflipadst_8x4_internal, 0, 5, 7, dst, stride, c, eob, tx2
pmulhrsw xm2, xm0, [cq+16*2]
pmulhrsw xm4, xm0
pmulhrsw xm5, xm0
call m(iadst_4x8_internal).main
call m(iadst_4x8_internal).main_pass1
vinserti128 m3, m3, xm1, 1
vinserti128 m2, m2, xm0, 1
punpckhwd m1, m3, m2
@ -1439,7 +1505,7 @@ cglobal iflipadst_8x4_internal, 0, 5, 7, dst, stride, c, eob, tx2
jmp tx2q
.pass2:
call m(iadst_8x4_internal).main
vpblendd m2, m2, m1, 0x33
mova m2, m1
vpermq m1, m0, q2031
vpermq m0, m2, q2031
jmp m(iadst_8x4_internal).end2
@ -1580,7 +1646,7 @@ cglobal iadst_8x8_internal, 0, 5, 7, dst, stride, c, eob, tx2
vpermq m3, [cq+32*3], q3120 ; 6 7
vpermq m5, [cq+32*1], q1302 ; 3 2
vpermq m2, [cq+32*2], q3120 ; 4 5
call .main
call .main_pass1
vpbroadcastd m5, [o(pw_16384)]
punpcklwd m4, m0, m1
punpckhwd m0, m1
@ -1604,7 +1670,7 @@ cglobal iadst_8x8_internal, 0, 5, 7, dst, stride, c, eob, tx2
.pass2:
pshufd m4, m0, q1032
pshufd m5, m1, q1032
call .main
call .main_pass2
vpbroadcastd m5, [o(pw_2048)]
vpbroadcastd xm4, [o(pw_4096)]
psubw m4, m5 ; lower half = 2048, upper half = -2048
@ -1629,8 +1695,12 @@ cglobal iadst_8x8_internal, 0, 5, 7, dst, stride, c, eob, tx2
WRITE_8X4 2, 3, 4, 5
RET
ALIGN function_align
.main:
IADST8_1D_PACKED
.main_pass1:
IADST8_1D_PACKED 1
ret
ALIGN function_align
.main_pass2:
IADST8_1D_PACKED 2
ret
INV_TXFM_8X8_FN flipadst, dct
@ -1643,7 +1713,7 @@ cglobal iflipadst_8x8_internal, 0, 5, 7, dst, stride, c, eob, tx2
vpermq m3, [cq+32*3], q3120 ; 6 7
vpermq m5, [cq+32*1], q1302 ; 3 2
vpermq m2, [cq+32*2], q3120 ; 4 5
call m(iadst_8x8_internal).main
call m(iadst_8x8_internal).main_pass1
vpbroadcastd m5, [o(pw_16384)]
punpckhwd m4, m3, m2
punpcklwd m3, m2
@ -1667,7 +1737,7 @@ cglobal iflipadst_8x8_internal, 0, 5, 7, dst, stride, c, eob, tx2
.pass2:
pshufd m4, m0, q1032
pshufd m5, m1, q1032
call m(iadst_8x8_internal).main
call m(iadst_8x8_internal).main_pass2
vpbroadcastd m4, [o(pw_2048)]
vpbroadcastd xm5, [o(pw_4096)]
psubw m4, m5 ; lower half = -2048, upper half = 2048
@ -1867,6 +1937,7 @@ INV_TXFM_8X16_FN adst, identity
cglobal iadst_8x16_internal, 0, 5, 13, dst, stride, c, eob, tx2
ITX_8X16_LOAD_COEFS
call m(iadst_16x8_internal).main
call m(iadst_16x8_internal).main_pass1_end
vpbroadcastd m10, [o(pw_16384)]
pslld m9, m10, 17
psubw m10, m9 ; 16384, -16384
@ -1874,6 +1945,7 @@ cglobal iadst_8x16_internal, 0, 5, 13, dst, stride, c, eob, tx2
ALIGN function_align
.pass2:
call .main
call .main_pass2_end
vpbroadcastd m9, [o(pw_2048)]
vpbroadcastd xm8, [o(pw_4096)]
psubw m8, m9
@ -1930,38 +2002,72 @@ ALIGN function_align
paddsw m4, m6 ; t8a t9a
vpbroadcastd m11, [o(pw_m3784_1567)]
vpbroadcastd m12, [o(pw_1567_3784)]
ITX_MUL2X_PACK 3, 6, _, 10, 11, 12, 4 ; t4a t5a
ITX_MUL2X_PACK 3, 6, _, 10, 12, 11, 6 ; t5a t4a
psubw m6, m9, m11 ; pw_3784_m1567
ITX_MUL2X_PACK 8, 12, _, 10, 12, 6, 4 ; t6a t7a
ITX_MUL2X_PACK 8, 6, _, 10, 6, 12, 6 ; t7a t6a
vpbroadcastd m11, [o(pw_m1567_3784)]
vpbroadcastd m12, [o(pw_3784_1567)]
ITX_MUL2X_PACK 2, 6, _, 10, 11, 12, 4 ; t15 t14
ITX_MUL2X_PACK 2, 6, _, 10, 11, 12, 6 ; t15 t14
psubw m6, m9, m11 ; pw_1567_m3784
ITX_MUL2X_PACK 5, 12, _, 10, 12, 6, 4 ; t13 t12
vbroadcasti128 m11, [o(deint_shuf)]
vpbroadcastd m12, [o(pw_2896x8)]
psubsw m6, m0, m1 ; t3a t2a
ITX_MUL2X_PACK 5, 12, _, 10, 12, 6, 6 ; t13 t12
vbroadcasti128 m12, [o(deint_shuf)]
paddsw m6, m4, m7 ; -out1 out14
psubsw m4, m7 ; t10 t11
psubsw m11, m3, m8 ; t7 t6
paddsw m8, m3 ; out12 -out3
psubsw m3, m0, m1 ; t3a t2a
paddsw m0, m1 ; -out15 out0
paddsw m1, m2, m5 ; -out13 out2
psubsw m5, m2 ; t15a t14a
paddsw m2, m4, m7 ; -out1 out14
psubsw m4, m7 ; t10 t11
psubsw m7, m3, m8 ; t6 t7
paddsw m8, m3 ; -out3 out12
REPX {pshufb x, m11}, m6, m4, m0, m2
vpblendd m3, m6, m4, 0xcc ; t3a t11
shufps m6, m6, m4, q1032 ; t2a t10
vpblendd m4, m5, m7, 0xcc ; t15a t7
shufps m5, m5, m7, q1032 ; t14a t6
shufps m7, m2, m0, q1032 ; out14 -out15
vpblendd m0, m0, m2, 0x33 ; -out1 out0
paddw m2, m5, m4 ; -out5 out4
psubw m5, m4 ; out10 -out11
psubw m4, m6, m3 ; out8 -out9
paddw m3, m6 ; -out7 out6
shufps m6, m8, m1, q1032 ; out12 -out13
vpblendd m1, m1, m8, 0x33 ; -out3 out2
REPX {pmulhrsw x, m12}, m2, m3, m4, m5
pshufb m0, m12
pshufb m6, m12
pshufb m8, m12
pshufb m1, m12
shufps m7, m6, m0, q1032 ; out14 -out15
vpblendd m0, m6, 0x33 ; -out1 out0
punpcklqdq m6, m8, m1 ; out12 -out13
punpckhqdq m1, m8, m1 ; -out3 out2
ret
ALIGN function_align
.main_pass1_end:
vpbroadcastd m8, [o(pw_m2896_2896)]
vpbroadcastd m12, [o(pw_2896_2896)]
pmaddwd m9, m8, m11 ; -out11
pmaddwd m2, m12, m5 ; -out5
pmaddwd m5, m8 ; out10
pmaddwd m11, m12 ; out4
REPX {paddd x, m10}, m9, m5, m2, m11
REPX {psrad x, 12 }, m9, m5, m2, m11
packssdw m5, m9 ; out10 -out11
packssdw m2, m11 ; -out5 out4
pmaddwd m11, m8, m3 ; out8
vpbroadcastd m8, [o(pw_2896_m2896)]
pmaddwd m3, m12 ; -out7
pmaddwd m8, m4 ; -out9
pmaddwd m4, m12 ; out6
REPX {paddd x, m10}, m11, m3, m8, m4
REPX {psrad x, 12 }, m11, m3, m8, m4
packssdw m3, m4 ; -out7 out6
packssdw m4, m11, m8 ; out8 -out9
vpbroadcastd m10, [o(pw_16384)]
pxor m9, m9
ret
ALIGN function_align
.main_pass2_end:
vpbroadcastd m8, [o(pw_2896x8)]
pshufb m2, m11, m12
pshufb m5, m12
pshufb m3, m12
pshufb m4, m12
punpcklqdq m11, m5, m2 ; t15a t7
punpckhqdq m5, m2 ; t14a t6
shufps m2, m3, m4, q1032 ; t2a t10
vpblendd m3, m4, 0xcc ; t3a t11
psubsw m4, m2, m3 ; out8 -out9
paddsw m3, m2 ; -out7 out6
paddsw m2, m5, m11 ; -out5 out4
psubsw m5, m11 ; out10 -out11
REPX {pmulhrsw x, m8}, m2, m3, m4, m5
ret
INV_TXFM_8X16_FN flipadst, dct
@ -1972,6 +2078,7 @@ INV_TXFM_8X16_FN flipadst, identity
cglobal iflipadst_8x16_internal, 0, 5, 13, dst, stride, c, eob, tx2
ITX_8X16_LOAD_COEFS
call m(iadst_16x8_internal).main
call m(iadst_16x8_internal).main_pass1_end
vpbroadcastd m9, [o(pw_16384)]
pslld m10, m9, 17
psubw m10, m9 ; -16384, 16384
@ -1990,6 +2097,7 @@ cglobal iflipadst_8x16_internal, 0, 5, 13, dst, stride, c, eob, tx2
jmp m(idct_8x16_internal).pass1_end2
.pass2:
call m(iadst_8x16_internal).main
call m(iadst_8x16_internal).main_pass2_end
vpbroadcastd m8, [o(pw_2048)]
vpbroadcastd xm9, [o(pw_4096)]
psubw m8, m9
@ -2232,7 +2340,7 @@ cglobal iadst_16x4_internal, 0, 5, 11, dst, stride, c, eob, tx2
vpermq m1, [cq+32*1], q1230
vpermq m2, [cq+32*2], q2103
call m(iadst_4x16_internal).main2
pshufd m2, m2, q1032
call m(iadst_4x16_internal).main_pass1_end
punpcklwd m4, m3, m1
punpcklwd m5, m2, m0
punpckhwd m0, m1
@ -2276,20 +2384,26 @@ cglobal iadst_16x4_internal, 0, 5, 11, dst, stride, c, eob, tx2
RET
ALIGN function_align
.main:
vpbroadcastd m6, [o(pw_m3344_3344)]
vpbroadcastd m7, [o(pw_3803_1321)]
vpbroadcastd m8, [o(pw_m1321_2482)]
vpbroadcastd m9, [o(pw_2482_3344)]
punpcklwd m4, m2, m0 ; in2 in0 l
psubw m6, m0, m2
punpckhwd m2, m0 ; in2 in0 h
paddw m6, m3 ; t2
psrld m5, m6, 16
pmaddwd m10, m6, m4 ; t2:02 l
pmaddwd m6, m2 ; t2:02 h
pmaddwd m0, m7, m4 ; t0:02 l
pmaddwd m7, m2 ; t0:02 h
pmaddwd m4, m8 ; t1:02 l
pmaddwd m8, m2 ; t1:02 h
punpckhwd m2, m3, m1 ; in3 in1 h
punpcklwd m3, m1 ; in3 in1 l
pmaddwd m1, m5, m2 ; t2:3 h
pmaddwd m5, m3 ; t2:3 l
paddd m6, m1
vpbroadcastd m1, [o(pd_2048)]
paddd m10, m5
pmaddwd m5, m9, m3
pmaddwd m9, m2
paddd m0, m1
@ -2299,6 +2413,8 @@ ALIGN function_align
vpbroadcastd m9, [o(pw_m3803_3344)]
pmaddwd m5, m9, m2
pmaddwd m9, m3
paddd m10, m1 ; t2 + 2048 l
paddd m6, m1 ; t2 + 2048 h
paddd m5, m1 ; t1:13 + 2048 h
paddd m1, m9 ; t1:13 + 2048 l
vpbroadcastd m9, [o(pw_m3803_m6688)]
@ -2310,12 +2426,11 @@ ALIGN function_align
paddd m4, m0
paddd m2, m8 ; t0 + t1 - t3 + 2048 h
paddd m3, m4 ; t0 + t1 - t3 + 2048 l
REPX {psrad x, 12}, m0, m7, m5, m1, m2, m3
REPX {psrad x, 12}, m10, m6, m0, m7, m5, m1, m2, m3
packssdw m0, m7
packssdw m1, m5
packssdw m3, m2
vpbroadcastd m2, [o(pw_3344x8)]
pmulhrsw m2, m6
packssdw m2, m10, m6
ret
INV_TXFM_16X4_FN flipadst, dct
@ -2329,7 +2444,7 @@ cglobal iflipadst_16x4_internal, 0, 5, 11, dst, stride, c, eob, tx2
vpermq m1, [cq+32*1], q1230
vpermq m2, [cq+32*2], q2103
call m(iadst_4x16_internal).main2
pshufd m2, m2, q1032
call m(iadst_4x16_internal).main_pass1_end
punpckhwd m4, m3, m2
punpckhwd m5, m1, m0
punpcklwd m0, m2
@ -2552,7 +2667,7 @@ INV_TXFM_16X8_FN adst, identity
cglobal iadst_16x8_internal, 0, 5, 13, dst, stride, c, eob, tx2
ITX_16X8_LOAD_COEFS 1302
call m(iadst_8x16_internal).main2
vpbroadcastd m10, [o(pw_16384)]
call m(iadst_8x16_internal).main_pass1_end
psubw m11, m9, m10
punpcklwd m8, m0, m2
punpckhwd m0, m2
@ -2567,7 +2682,7 @@ cglobal iadst_16x8_internal, 0, 5, 13, dst, stride, c, eob, tx2
ALIGN function_align
.pass2:
call .main
vpbroadcastd m9, [o(pw_2048)]
call .main_pass2_end
pxor m8, m8
psubw m8, m9
REPX {pmulhrsw x, m9}, m0, m2, m4, m6
@ -2591,21 +2706,50 @@ ALIGN function_align
ITX_MULSUB_2W 4, 8, 3, 9, 10, 3784, 1567 ; t6a, t7a
psubsw m9, m6, m8 ; t7
paddsw m6, m8 ; out6
vpbroadcastd m8, [o(pw_2896x8)]
psubsw m3, m7, m5 ; t3
paddsw m7, m5 ; -out7
psubsw m5, m0, m2 ; t2
paddsw m0, m2 ; out0
psubsw m2, m1, m4 ; t6
paddsw m1, m4 ; -out1
psubw m4, m5, m3
paddw m3, m5
psubw m5, m2, m9
paddw m2, m9
ret
ALIGN function_align
.main_pass1_end:
vpbroadcastd m11, [o(pw_m2896_2896)]
vpbroadcastd m12, [o(pw_2896_2896)]
punpckhwd m4, m3, m5
punpcklwd m3, m5
pmaddwd m5, m11, m4
pmaddwd m4, m12
pmaddwd m8, m11, m3
pmaddwd m3, m12
REPX {paddd x, m10}, m5, m4, m8, m3
REPX {psrad x, 12 }, m5, m8, m4, m3
packssdw m3, m4 ; -out3
packssdw m4, m8, m5 ; out4
punpcklwd m5, m9, m2
punpckhwd m9, m2
pmaddwd m2, m12, m5
pmaddwd m5, m11
pmaddwd m12, m9
pmaddwd m11, m9
REPX {paddd x, m10}, m2, m5, m12, m11
REPX {psrad x, 12 }, m2, m12, m5, m11
packssdw m2, m12 ; out2
packssdw m5, m11 ; -out5
ret
ALIGN function_align
.main_pass2_end:
vpbroadcastd m8, [o(pw_2896x8)]
psubsw m4, m5, m3
paddsw m3, m5
psubsw m5, m2, m9
paddsw m2, m9
pmulhrsw m2, m8 ; out2
pmulhrsw m3, m8 ; -out3
pmulhrsw m4, m8 ; out4
pmulhrsw m5, m8 ; -out5
vpbroadcastd m9, [o(pw_2048)]
ret
INV_TXFM_16X8_FN flipadst, dct
@ -2616,7 +2760,7 @@ INV_TXFM_16X8_FN flipadst, identity
cglobal iflipadst_16x8_internal, 0, 5, 13, dst, stride, c, eob, tx2
ITX_16X8_LOAD_COEFS 1302
call m(iadst_8x16_internal).main2
vpbroadcastd m10, [o(pw_16384)]
call m(iadst_8x16_internal).main_pass1_end
psubw m9, m10
punpcklwd m8, m6, m4
punpckhwd m6, m4
@ -2655,7 +2799,7 @@ cglobal iflipadst_16x8_internal, 0, 5, 13, dst, stride, c, eob, tx2
jmp tx2q
.pass2:
call m(iadst_16x8_internal).main
vpbroadcastd m9, [o(pw_2048)]
call m(iadst_16x8_internal).main_pass2_end
pxor m8, m8
psubw m8, m9
pmulhrsw m10, m7, m8
@ -2986,8 +3130,12 @@ INV_TXFM_16X16_FN adst, flipadst
cglobal iadst_16x16_internal, 0, 5, 16, 32*3, dst, stride, c, eob, tx2
ITX_16X16_LOAD_COEFS
call .main
vpbroadcastd m1, [o(pw_8192)]
REPX {pmulhrsw x, m1}, m0, m2, m4, m6, m8, m10, m12, m14
call .main_pass1_end
pmulhrsw m0, m1, [cq+32*0]
pmulhrsw m2, m1, [cq+32*1]
REPX {pmulhrsw x, m1}, m4, m6, m8, m10
pmulhrsw m12, m1, [cq+32*2]
pmulhrsw m14, m1, [cq+32*3]
vextracti128 [rsp+16*5], m8, 1
mova [rsp+16*1], xm8
pxor m8, m8
@ -2996,7 +3144,7 @@ cglobal iadst_16x16_internal, 0, 5, 16, 32*3, dst, stride, c, eob, tx2
ALIGN function_align
.pass2:
call .main
vpbroadcastd m1, [o(pw_2048)]
call .main_pass2_end
REPX {pmulhrsw x, m1}, m0, m2, m4, m6, m8, m10, m12, m14
mova [rsp+32*0], m6
pxor m6, m6
@ -3081,16 +3229,73 @@ ALIGN function_align
paddsw m0, m12 ; out0
paddsw m12, m8, m5 ; out12
psubsw m8, m5 ; t7
paddw m5, m10, m11 ; -out5
psubw m10, m11 ; out10
psubw m11, m4, m8 ; -out11
paddw m4, m8 ; out4
psubw m8, m7, m9 ; out8
paddw m7, m9 ; -out7
psubw m9, m1, m6 ; -out9
paddw m6, m1 ; out6
ret
ALIGN function_align
.main_pass1_end:
mova [cq+32*0], m0
mova [cq+32*1], m2
mova [cq+32*2], m12
mova [cq+32*3], m14
vpbroadcastd m14, [pw_m2896_2896]
vpbroadcastd m12, [pw_2896_2896]
vpbroadcastd m2, [pd_2048]
punpcklwd m5, m11, m10
punpckhwd m11, m10
pmaddwd m10, m14, m5
pmaddwd m0, m14, m11
pmaddwd m5, m12
pmaddwd m11, m12
REPX {paddd x, m2}, m10, m0, m5, m11
REPX {psrad x, 12}, m10, m0, m5, m11
packssdw m10, m0 ; out10
packssdw m5, m11 ; -out5
punpcklwd m11, m8, m4
punpckhwd m8, m4
pmaddwd m4, m12, m11
pmaddwd m0, m12, m8
pmaddwd m11, m14
pmaddwd m8, m14
REPX {paddd x, m2}, m4, m0, m11, m8
REPX {psrad x, 12}, m4, m0, m11, m8
packssdw m4, m0 ; out4
packssdw m11, m8 ; -out11
punpcklwd m8, m9, m7
punpckhwd m9, m7
pmaddwd m7, m12, m8
pmaddwd m0, m12, m9
pmaddwd m8, m14
pmaddwd m9, m14
REPX {paddd x, m2}, m7, m0, m8, m9
REPX {psrad x, 12}, m7, m0, m8, m9
packssdw m7, m0 ; -out7
packssdw m8, m9 ; out8
punpckhwd m0, m6, m1
punpcklwd m6, m1
pmaddwd m1, m14, m0
pmaddwd m9, m14, m6
pmaddwd m0, m12
pmaddwd m6, m12
REPX {paddd x, m2}, m1, m9, m0, m6
REPX {psrad x, 12}, m1, m9, m0, m6
packssdw m9, m1 ; -out7
packssdw m6, m0 ; out8
vpbroadcastd m1, [o(pw_8192)]
ret
ALIGN function_align
.main_pass2_end:
; In pass 2 we're going to clip to pixels afterwards anyway, so clipping to
; 16-bit here will produce the same result as using 32-bit intermediates.
paddsw m5, m10, m11 ; -out5
psubsw m10, m11 ; out10
psubsw m11, m4, m8 ; -out11
paddsw m4, m8 ; out4
psubsw m8, m7, m9 ; out8
paddsw m7, m9 ; -out7
psubsw m9, m1, m6 ; -out9
paddsw m6, m1 ; out6
vpbroadcastd m1, [o(pw_2896x8)]
REPX {pmulhrsw x, m1}, m4, m5, m6, m7, m8, m9, m10, m11
vpbroadcastd m1, [o(pw_2048)]
ret
INV_TXFM_16X16_FN flipadst, dct
@ -3100,16 +3305,16 @@ INV_TXFM_16X16_FN flipadst, flipadst
cglobal iflipadst_16x16_internal, 0, 5, 16, 32*3, dst, stride, c, eob, tx2
ITX_16X16_LOAD_COEFS
call m(iadst_16x16_internal).main
vpbroadcastd m1, [o(pw_8192)]
call m(iadst_16x16_internal).main_pass1_end
pmulhrsw m6, m1
pmulhrsw m2, m1, m8
mova [rsp+32*2], m6
pmulhrsw m6, m1, m4
pmulhrsw m4, m1, m10
pmulhrsw m10, m1, m12
pmulhrsw m12, m1, m2
pmulhrsw m2, m1, m8
pmulhrsw m8, m1, m14
pmulhrsw m14, m1, m0
pmulhrsw m8, m1, [cq+32*3]
pmulhrsw m10, m1, [cq+32*2]
pmulhrsw m12, m1, [cq+32*1]
pmulhrsw m14, m1, [cq+32*0]
pxor m0, m0
psubw m0, m1
REPX {pmulhrsw x, m0}, m3, m5, m7, m11, m15
@ -3136,7 +3341,7 @@ cglobal iflipadst_16x16_internal, 0, 5, 16, 32*3, dst, stride, c, eob, tx2
jmp m(idct_16x16_internal).pass1_end3
.pass2:
call m(iadst_16x16_internal).main
vpbroadcastd m1, [o(pw_2048)]
call m(iadst_16x16_internal).main_pass2_end
pmulhrsw m0, m1
pmulhrsw m8, m1
mova [rsp+32*0], m0

View File

@ -43,8 +43,11 @@ pw_1321_3803: times 4 dw 1321, 3803
pw_2482_m1321: times 4 dw 2482, -1321
pw_3344_2482: times 4 dw 3344, 2482
pw_3344_m3803: times 4 dw 3344, -3803
pw_3344_m3344: times 4 dw 3344, -3344
pw_0_3344 times 4 dw 0, 3344
pw_m6688_m3803: times 4 dw -6688, -3803
COEF_PAIR 2896, 2896
COEF_PAIR 1567, 3784
COEF_PAIR 799, 4017
COEF_PAIR 3406, 2276
@ -126,7 +129,6 @@ pw_2675x8: times 8 dw 2675*8
pw_4085x8: times 8 dw 4085*8
pw_m301x8: times 8 dw -301*8
iadst4_dconly1a: times 2 dw 10568, 19856, 26752, 30424
iadst4_dconly1b: times 2 dw 30424, 26752, 19856, 10568
iadst4_dconly2a: dw 10568, 10568, 10568, 10568, 19856, 19856, 19856, 19856
@ -200,7 +202,6 @@ SECTION .text
ret
%endmacro
; flags: 1 = swap, 2: coef_regs
%macro ITX_MUL2X_PACK 5-6 0 ; dst/src, tmp[1], rnd, coef[1-2], flags
%if %6 & 2
@ -239,35 +240,6 @@ SECTION .text
paddsw m0, m2 ;high: out1 ;low: out0
%endmacro
; 4-point inverse ADST, "packed" layout: two words per lane, with
; m0 = {in1|in0} and m1 = {in3|in2} (row layout established by the
; punpcklwd/punpckhwd comments below).  Products are widened to 32 bit
; with pmaddwd using the AV1 iadst4 constants (1321/2482/3344/3803),
; rounded via +2048 and >>12, then repacked to words.
; Outputs: m0 = {out1|out0}, m1 low half = out2 (from the pmulhrsw),
; m2 = out3 duplicated into both halves.  Clobbers m3-m5.
; NOTE(review): out2 = 3344*(in0 - in2 + in3) is computed with a single
; pmulhrsw by pw_3344x8 rather than the pmaddwd path — presumably an
; exactness-preserving shortcut; confirm against the reference iadst4.
%macro IADST4_1D_PACKED 0
punpcklwd m2, m0, m1 ;unpacked in0 in2
punpckhwd m3, m0, m1 ;unpacked in1 in3
psubw m0, m1 ;{in1 - in3 | in0 - in2}
punpckhqdq m1, m1 ;broadcast the in3 half to both qwords
paddw m1, m0 ;low: in0 - in2 + in3
pmaddwd m0, m2, [o(pw_1321_3803)] ;1321 * in0 + 3803 * in2
pmaddwd m2, [o(pw_2482_m1321)] ;2482 * in0 - 1321 * in2
pmaddwd m4, m3, [o(pw_3344_2482)] ;3344 * in1 + 2482 * in3
pmaddwd m5, m3, [o(pw_3344_m3803)];3344 * in1 - 3803 * in3
paddd m4, m0 ;t0 + t3
pmaddwd m3, [o(pw_m6688_m3803)] ;-2 * 3344 * in1 - 3803 * in3
pmulhrsw m1, [o(pw_3344x8)] ;low: out2
mova m0, [o(pd_2048)]
paddd m2, m0 ;t1 + 2048
paddd m0, m4 ;t0 + t3 + 2048
paddd m5, m2 ;t1 + t3 + 2048
paddd m2, m4 ;t0 + t1 + 2048 (t3 still included)
paddd m2, m3 ;t0 + t1 - t3 + 2048
psrad m0, 12 ;out0
psrad m5, 12 ;out1
psrad m2, 12 ;out3
packssdw m0, m5 ;high: out1 ;low: out0
packssdw m2, m2 ;high: out3 ;low: out3
%endmacro
%macro INV_TXFM_FN 5+ ; type1, type2, fast_thresh, size, xmm/stack
cglobal inv_txfm_add_%1_%2_%4, 4, 6, %5, dst, stride, coeff, eob, tx2
%undef cmp
@ -392,15 +364,14 @@ cglobal iadst_4x4_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
mova m0, [coeffq+16*0]
mova m1, [coeffq+16*1]
call .main
punpckhwd m3, m0, m2
punpckhwd m2, m0, m1
punpcklwd m0, m1
punpckhwd m1, m0, m3 ;high: in3 ;low :in2
punpcklwd m0, m3 ;high: in1 ;low: in0
punpckhwd m1, m0, m2 ;high: in3 ;low :in2
punpcklwd m0, m2 ;high: in1 ;low: in0
jmp tx2q
.pass2:
call .main
punpcklqdq m1, m2 ;out2 out3
.end:
pxor m2, m2
@ -412,7 +383,28 @@ cglobal iadst_4x4_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
ALIGN function_align
.main:
IADST4_1D_PACKED
punpcklwd m2, m0, m1 ;unpacked in0 in2
punpckhwd m0, m1 ;unpacked in1 in3
mova m3, m0
pmaddwd m1, m2, [o(pw_3344_m3344)];3344 * in0 - 3344 * in2
pmaddwd m0, [o(pw_0_3344)] ;3344 * in3
paddd m1, m0 ;t2
pmaddwd m0, m2, [o(pw_1321_3803)] ;1321 * in0 + 3803 * in2
pmaddwd m2, [o(pw_2482_m1321)] ;2482 * in0 - 1321 * in2
pmaddwd m4, m3, [o(pw_3344_2482)] ;3344 * in1 + 2482 * in3
pmaddwd m5, m3, [o(pw_3344_m3803)];3344 * in1 - 3803 * in3
paddd m4, m0 ;t0 + t3
pmaddwd m3, [o(pw_m6688_m3803)] ;-2 * 3344 * in1 - 3803 * in3
mova m0, [o(pd_2048)]
paddd m1, m0 ;t2 + 2048
paddd m2, m0
paddd m0, m4 ;t0 + t3 + 2048
paddd m5, m2 ;t1 + t3 + 2048
paddd m2, m4
paddd m2, m3 ;t0 + t1 - t3 + 2048
REPX {psrad x, 12}, m1, m0, m5, m2
packssdw m0, m5 ;high: out1 ;low: out0
packssdw m1, m2 ;high: out3 ;low: out3
ret
INV_TXFM_4X4_FN flipadst, dct, 0
@ -424,16 +416,14 @@ cglobal iflipadst_4x4_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
mova m0, [coeffq+16*0]
mova m1, [coeffq+16*1]
call m(iadst_4x4_internal).main
punpcklwd m1, m0
punpckhwd m2, m0
punpcklwd m0, m2, m1 ;high: in3 ;low :in2
punpckhwd m2, m1 ;high: in1 ;low: in0
mova m1, m2
punpcklwd m2, m1, m0
punpckhwd m1, m0
punpcklwd m0, m1, m2 ;high: in3 ;low :in2
punpckhwd m1, m2 ;high: in1 ;low: in0
jmp tx2q
.pass2:
call m(iadst_4x4_internal).main
punpcklqdq m1, m2 ;out2 out3
.end:
pxor m2, m2
@ -584,99 +574,6 @@ cglobal inv_txfm_add_wht_wht_4x4, 3, 3, 4, dst, stride, coeff
mova m%4, m%5
%endmacro
; 4-point inverse ADST, full width: m0-m3 hold rows in0..in3, eight word
; columns per register.  The low and high word halves of each row are
; widened separately (punpcklwd / punpckhwd), run through the same
; pmaddwd constant pairs as the packed variant, rounded (+2048, >>12),
; and repacked.  Outputs: m0 = out0, m1 = out1, m2 = out2 (via the
; pmulhrsw by pw_3344x8), m3 = out3.  Clobbers m4-m7.
%macro IADST4_1D 0
mova m4, m2 ;keep in2; m2 is reused as the out2 accumulator
psubw m2, m0, m4
paddw m2, m3 ;in0 - in2 + in3 (all eight columns)
punpckhwd m6, m0, m4 ;unpacked in0 in2
punpckhwd m7, m1, m3 ;unpacked in1 in3
punpcklwd m0, m4 ;unpacked in0 in2
punpcklwd m1, m3 ;unpacked in1 in3
; --- low halves ---
pmaddwd m4, m0, [o(pw_1321_3803)] ;1321 * in0 + 3803 * in2
pmaddwd m0, [o(pw_2482_m1321)] ;2482 * in0 - 1321 * in2
pmaddwd m3, m1, [o(pw_3344_2482)] ;3344 * in1 + 2482 * in3
pmaddwd m5, m1, [o(pw_3344_m3803)] ;3344 * in1 - 3803 * in3
paddd m3, m4 ;t0 + t3
pmaddwd m1, [o(pw_m6688_m3803)] ;-2 * 3344 * in1 - 3803 * in3
pmulhrsw m2, [o(pw_3344x8)] ;out2
mova m4, [o(pd_2048)]
paddd m0, m4
paddd m4, m3 ;t0 + t3 + 2048
paddd m5, m0 ;t1 + t3 + 2048
paddd m3, m0
paddd m3, m1 ;t0 + t1 - t3 + 2048
psrad m4, 12 ;out0
psrad m5, 12 ;out1
psrad m3, 12 ;out3
packssdw m0, m4, m5 ;low: out0 high: out1
; --- high halves (same computation on m6/m7) ---
pmaddwd m4, m6, [o(pw_1321_3803)] ;1321 * in0 + 3803 * in2
pmaddwd m6, [o(pw_2482_m1321)] ;2482 * in0 - 1321 * in2
pmaddwd m1, m7, [o(pw_3344_2482)] ;3344 * in1 + 2482 * in3
pmaddwd m5, m7, [o(pw_3344_m3803)] ;3344 * in1 - 3803 * in3
paddd m1, m4 ;t0 + t3
pmaddwd m7, [o(pw_m6688_m3803)] ;-2 * 3344 * in1 - 3803 * in3
mova m4, [o(pd_2048)]
paddd m6, m4
paddd m4, m1 ;t0 + t3 + 2048
paddd m5, m6 ;t1 + t3 + 2048
paddd m1, m6
paddd m1, m7 ;t0 + t1 - t3 + 2048
psrad m4, 12 ;out0
psrad m5, 12 ;out1
psrad m1, 12 ;out3
packssdw m3, m1 ;out3
packssdw m4, m5 ;low: out0 high: out1
punpckhqdq m1, m0, m4 ;out1
punpcklqdq m0, m4 ;out0
%endmacro
; 8-point inverse ADST, "packed" layout: m0-m3 carry the eight input rows
; two-per-register (the exact pairing is given by the unpack comments
; below: in7/in0, in5/in2, in3/in4, in1/in6).  Butterflies use
; ITX_MUL2X_PACK with pd_2048 rounding in m6; the final stage scales by
; 2896/4096 via pmulhrsw with pw_2896x8.
; Outputs: m0 = {-out1|out0}, m1 = {-out3|out2},
;          m2 = {-out5|out4}, m3 = {-out7|out6}.
; Clobbers m4-m6.
; NOTE(review): the last stage here uses 16-bit pmulhrsw for out2..out5;
; the updated code elsewhere in this patch replaces this with 32-bit
; pmaddwd intermediates in pass 1 — this macro is the pre-change form.
%macro IADST8_1D_PACKED 0
mova m6, [o(pd_2048)]
punpckhwd m4, m3, m0 ;unpacked in7 in0
punpckhwd m5, m2, m1 ;unpacked in5 in2
punpcklwd m1, m2 ;unpacked in3 in4
punpcklwd m0, m3 ;unpacked in1 in6
ITX_MUL2X_PACK 4, 2, 6, 401, 4076 ;low: t0a high: t1a
ITX_MUL2X_PACK 5, 2, 6, 1931, 3612 ;low: t2a high: t3a
ITX_MUL2X_PACK 1, 2, 6, 3166, 2598 ;low: t4a high: t5a
ITX_MUL2X_PACK 0, 2, 6, 3920, 1189 ;low: t6a high: t7a
psubsw m3, m4, m1 ;low: t4 high: t5
paddsw m4, m1 ;low: t0 high: t1
psubsw m2, m5, m0 ;low: t6 high: t7
paddsw m5, m0 ;low: t2 high: t3
shufps m1, m3, m2, q1032 ;pair t4/t5 with t6/t7 for the next rotation
punpckhwd m2, m1
punpcklwd m3, m1
ITX_MUL2X_PACK 3, 0, 6, 1567, 3784, 1 ;low: t5a high: t4a
ITX_MUL2X_PACK 2, 0, 6, 3784, 1567 ;low: t7a high: t6a
psubsw m1, m4, m5 ;low: t2 high: t3
paddsw m4, m5 ;low: out0 high: -out7
psubsw m5, m3, m2 ;low: t7 high: t6
paddsw m3, m2 ;low: out6 high: -out1
shufps m0, m4, m3, q3210 ;low: out0 high: -out1
shufps m3, m4, q3210 ;low: out6 high: -out7
shufps m4, m1, m5, q1032 ;low: t3 high: t7
shufps m1, m5, q3210 ;low: t2 high: t6
mova m5, [o(pw_2896x8)]
psubw m2, m1, m4 ;low: t2-t3 high: t6-t7
paddw m1, m4 ;low: t2+t3 high: t6+t7
pmulhrsw m2, m5 ;low: out4 high: -out5
shufps m1, m1, q1032
pmulhrsw m1, m5 ;low: out2 high: -out3
%endmacro
%macro WRITE_4X8 4 ;row[1-4]
WRITE_4X4 0, 1, 4, 5, 6, %1, %2, %3, %4
lea dstq, [dstq+strideq*4]
@ -838,7 +735,48 @@ cglobal iadst_4x8_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
ALIGN function_align
.main:
IADST8_1D_PACKED
mova m6, [o(pd_2048)]
punpckhwd m4, m3, m0 ;unpacked in7 in0
punpckhwd m5, m2, m1 ;unpacked in5 in2
punpcklwd m1, m2 ;unpacked in3 in4
punpcklwd m0, m3 ;unpacked in1 in6
ITX_MUL2X_PACK 4, 2, 6, 401, 4076 ;low: t0a high: t1a
ITX_MUL2X_PACK 5, 2, 6, 1931, 3612 ;low: t2a high: t3a
ITX_MUL2X_PACK 1, 2, 6, 3166, 2598 ;low: t4a high: t5a
ITX_MUL2X_PACK 0, 2, 6, 3920, 1189 ;low: t6a high: t7a
psubsw m3, m4, m1 ;low: t4 high: t5
paddsw m4, m1 ;low: t0 high: t1
psubsw m2, m5, m0 ;low: t6 high: t7
paddsw m5, m0 ;low: t2 high: t3
shufps m1, m3, m2, q1032
punpckhwd m2, m1
punpcklwd m3, m1
ITX_MUL2X_PACK 3, 0, 6, 1567, 3784, 1 ;low: t5a high: t4a
ITX_MUL2X_PACK 2, 0, 6, 3784, 1567 ;low: t7a high: t6a
psubsw m1, m4, m5 ;low: t2 high: t3
paddsw m4, m5 ;low: out0 high: -out7
psubsw m5, m3, m2 ;low: t7 high: t6
paddsw m3, m2 ;low: out6 high: -out1
shufps m0, m4, m3, q3210 ;low: out0 high: -out1
shufps m3, m4, q3210 ;low: out6 high: -out7
mova m2, [o(pw_2896_m2896)]
mova m7, [o(pw_2896_2896)]
shufps m4, m1, m5, q1032 ;low: t3 high: t7
shufps m1, m5, q3210 ;low: t2 high: t6
punpcklwd m5, m1, m4
punpckhwd m1, m4
pmaddwd m4, m2, m1 ;-out5
pmaddwd m2, m5 ; out4
pmaddwd m1, m7 ; out2
pmaddwd m5, m7 ;-out3
REPX {paddd x, m6}, m4, m2, m1, m5
REPX {psrad x, 12}, m4, m2, m1, m5
packssdw m1, m5 ;low: out2 high: -out3
packssdw m2, m4 ;low: out4 high: -out5
ret
INV_TXFM_4X8_FN flipadst, dct, 0
@ -1109,7 +1047,67 @@ cglobal iadst_8x4_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
ALIGN function_align
.main:
IADST4_1D
punpckhwd m6, m0, m2 ;unpacked in0 in2
punpcklwd m0, m2 ;unpacked in0 in2
punpckhwd m7, m1, m3 ;unpacked in1 in3
punpcklwd m1, m3 ;unpacked in1 in3
mova m2, [o(pw_3344_m3344)]
mova m4, [o(pw_0_3344)]
pmaddwd m3, m2, m6 ;3344 * in0 - 3344 * in2
pmaddwd m5, m4, m7 ;3344 * in3
pmaddwd m2, m0
pmaddwd m4, m1
paddd m3, m5
paddd m2, m4
mova m4, [o(pd_2048)]
paddd m3, m4 ;t2 + 2048
paddd m2, m4
psrad m3, 12
psrad m2, 12
packssdw m2, m3 ;out2
pmaddwd m4, m0, [o(pw_1321_3803)] ;1321 * in0 + 3803 * in2
pmaddwd m0, [o(pw_2482_m1321)] ;2482 * in0 - 1321 * in2
pmaddwd m3, m1, [o(pw_3344_2482)] ;3344 * in1 + 2482 * in3
pmaddwd m5, m1, [o(pw_3344_m3803)] ;3344 * in1 - 3803 * in3
paddd m3, m4 ;t0 + t3
pmaddwd m1, [o(pw_m6688_m3803)] ;-2 * 3344 * in1 - 3803 * in3
mova m4, [o(pd_2048)]
paddd m0, m4
paddd m4, m3 ;t0 + t3 + 2048
paddd m5, m0 ;t1 + t3 + 2048
paddd m3, m0
paddd m3, m1 ;t0 + t1 - t3 + 2048
psrad m4, 12 ;out0
psrad m5, 12 ;out1
psrad m3, 12 ;out3
packssdw m0, m4, m5 ;low: out0 high: out1
pmaddwd m4, m6, [o(pw_1321_3803)] ;1321 * in0 + 3803 * in2
pmaddwd m6, [o(pw_2482_m1321)] ;2482 * in0 - 1321 * in2
pmaddwd m1, m7, [o(pw_3344_2482)] ;3344 * in1 + 2482 * in3
pmaddwd m5, m7, [o(pw_3344_m3803)] ;3344 * in1 - 3803 * in3
paddd m1, m4 ;t0 + t3
pmaddwd m7, [o(pw_m6688_m3803)] ;-2 * 3344 * in1 - 3803 * in3
mova m4, [o(pd_2048)]
paddd m6, m4
paddd m4, m1 ;t0 + t3 + 2048
paddd m5, m6 ;t1 + t3 + 2048
paddd m1, m6
paddd m1, m7 ;t0 + t1 - t3 + 2048
psrad m4, 12 ;out0
psrad m5, 12 ;out1
psrad m1, 12 ;out3
packssdw m3, m1 ;out3
packssdw m4, m5 ;low: out0 high: out1
punpckhqdq m1, m0, m4 ;out1
punpcklqdq m0, m4 ;out0
ret
INV_TXFM_8X4_FN flipadst, dct
@ -1423,6 +1421,7 @@ cglobal iadst_8x8_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
.pass1:
call .main
call .main_pass1_end
.pass1_end:
mova m7, [o(pw_16384)]
@ -1441,6 +1440,7 @@ ALIGN function_align
.pass2_main:
call .main
call .main_pass2_end
.end:
mova m7, [o(pw_2048)]
@ -1491,10 +1491,57 @@ ALIGN function_align
psubsw m5, m6 ;t6
paddsw m6, m2, m7 ;out6
psubsw m2, m7 ;t7
paddw m7, m4, m3 ;t2 + t3
psubw m4, m3 ;t2 - t3
paddw m3, m5, m2 ;t6 + t7
psubw m5, m2 ;t6 - t7
ret
ALIGN function_align
.main_pass1_end:
mova [rsp+gprsize*2+16*1], m1
mova [rsp+gprsize*2+16*2], m6
punpckhwd m1, m4, m3
punpcklwd m4, m3
punpckhwd m7, m5, m2
punpcklwd m5, m2
mova m2, [o(pw_2896_2896)]
mova m6, [o(pd_2048)]
pmaddwd m3, m2, m7
pmaddwd m2, m5
paddd m3, m6
paddd m2, m6
psrad m3, 12
psrad m2, 12
packssdw m2, m3 ;out2
mova m3, [o(pw_2896_m2896)]
pmaddwd m7, m3
pmaddwd m5, m3
paddd m7, m6
paddd m5, m6
psrad m7, 12
psrad m5, 12
packssdw m5, m7 ;-out5
mova m3, [o(pw_2896_2896)]
pmaddwd m7, m3, m1
pmaddwd m3, m4
paddd m7, m6
paddd m3, m6
psrad m7, 12
psrad m3, 12
packssdw m3, m7 ;-out3
mova m7, [o(pw_2896_m2896)]
pmaddwd m1, m7
pmaddwd m4, m7
paddd m1, m6
paddd m4, m6
psrad m1, 12
psrad m4, 12
packssdw m4, m1 ;-out5
mova m1, [rsp+gprsize*2+16*1]
mova m6, [rsp+gprsize*2+16*2]
ret
ALIGN function_align
.main_pass2_end:
paddsw m7, m4, m3 ;t2 + t3
psubsw m4, m3 ;t2 - t3
paddsw m3, m5, m2 ;t6 + t7
psubsw m5, m2 ;t6 - t7
mova m2, [o(pw_2896x8)]
pmulhrsw m4, m2 ;out4
pmulhrsw m5, m2 ;-out5
@ -1513,6 +1560,7 @@ cglobal iflipadst_8x8_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
.pass1:
call m(iadst_8x8_internal).main
call m(iadst_8x8_internal).main_pass1_end
.pass1_end:
mova m7, [o(pw_m16384)]
@ -1542,6 +1590,7 @@ ALIGN function_align
.pass2_main:
call m(iadst_8x8_internal).main
call m(iadst_8x8_internal).main_pass2_end
.end:
mova m7, [o(pw_2048)]
@ -1753,6 +1802,7 @@ cglobal iadst_4x16_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
.pass2:
call m(iadst_16x4_internal).main
call m(iadst_16x4_internal).main_pass2_end
punpcklqdq m6, m5, m4 ;low: -out5 high: -out7
punpckhqdq m4, m5 ;low: out8 high: out10
@ -1820,6 +1870,7 @@ cglobal iflipadst_4x16_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
.pass2:
call m(iadst_16x4_internal).main
call m(iadst_16x4_internal).main_pass2_end
punpckhqdq m6, m5, m4 ;low: out5 high: out7
punpcklqdq m4, m5 ;low: -out8 high: -out10
@ -2160,6 +2211,7 @@ INV_TXFM_16X4_FN adst, identity
cglobal iadst_16x4_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
LOAD_7ROWS coeffq, 16
call .main
call .main_pass1_end
punpckhwd m6, m7, m0 ;packed -out11, -out15
punpcklwd m0, m7 ;packed out0, out4
@ -2193,92 +2245,137 @@ cglobal iadst_16x4_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
ALIGN function_align
.main:
mova [coeffq+16*6], m0
pshufd m1, m1, q1032
pshufd m0, m1, q1032
pshufd m2, m2, q1032
punpckhwd m0, m6, m1 ;packed in13, in2
punpcklwd m1, m6 ;packed in3, in12
punpckhwd m6, m5, m2 ;packed in11, in4
punpckhwd m1, m6, m0 ;packed in13, in2
punpcklwd m0, m6 ;packed in3, in12
punpckhwd m7, m5, m2 ;packed in11, in4
punpcklwd m2, m5 ;packed in5, in10
mova m7, [o(pd_2048)]
ITX_MUL2X_PACK 0, 5, 7, 995, 3973 ;low:t2 high:t3
ITX_MUL2X_PACK 6, 5, 7, 1751, 3703 ;low:t4 high:t5
ITX_MUL2X_PACK 2, 5, 7, 3513, 2106 ;low:t10 high:t11
ITX_MUL2X_PACK 1, 5, 7, 3857, 1380 ;low:t12 high:t13
psubsw m5, m0, m2 ;low:t10a high:t11a
paddsw m0, m2 ;low:t2a high:t3a
psubsw m2, m6, m1 ;low:t12a high:t13a
paddsw m6, m1 ;low:t4a high:t5a
punpcklqdq m1, m5
punpckhwd m1, m5 ;packed t10a, t11a
mova m6, [o(pd_2048)]
ITX_MUL2X_PACK 1, 5, 6, 995, 3973 ;low:t2 high:t3
ITX_MUL2X_PACK 7, 5, 6, 1751, 3703 ;low:t4 high:t5
ITX_MUL2X_PACK 2, 5, 6, 3513, 2106 ;low:t10 high:t11
ITX_MUL2X_PACK 0, 5, 6, 3857, 1380 ;low:t12 high:t13
psubsw m5, m1, m2 ;low:t10a high:t11a
paddsw m1, m2 ;low:t2a high:t3a
psubsw m2, m7, m0 ;low:t12a high:t13a
paddsw m7, m0 ;low:t4a high:t5a
punpcklqdq m0, m5
punpckhwd m0, m5 ;packed t10a, t11a
punpcklqdq m5, m2
punpckhwd m2, m5 ;packed t13a, t12a
ITX_MUL2X_PACK 1, 5, 7, 3406, 2276 ;low:t10 high:t11
ITX_MUL2X_PACK 2, 5, 7, 4017, 799, 1 ;low:t12 high:t13
mova [coeffq+16*4], m0
mova [coeffq+16*5], m6
mova m0, [coeffq+16*6]
mova m6, [coeffq+16*7]
pshufd m0, m0, q1032
ITX_MUL2X_PACK 0, 5, 6, 3406, 2276 ;low:t10 high:t11
ITX_MUL2X_PACK 2, 5, 6, 4017, 799, 1 ;low:t12 high:t13
mova [coeffq+16*4], m1
mova [coeffq+16*5], m7
mova m1, [coeffq+16*6]
mova m7, [coeffq+16*7]
pshufd m1, m1, q1032
pshufd m3, m3, q1032
punpckhwd m5, m6, m0 ;packed in15, in0
punpcklwd m0, m6 ;packed in1, in14
punpckhwd m6, m4, m3 ;packed in9, in6
punpckhwd m5, m7, m1 ;packed in15, in0
punpcklwd m1, m7 ;packed in1, in14
punpckhwd m7, m4, m3 ;packed in9, in6
punpcklwd m3, m4 ;packed in7, in8
ITX_MUL2X_PACK 5, 4, 7, 201, 4091 ;low:t0 high:t1
ITX_MUL2X_PACK 6, 4, 7, 2440, 3290 ;low:t6 high:t7
ITX_MUL2X_PACK 3, 4, 7, 3035, 2751 ;low:t8 high:t9
ITX_MUL2X_PACK 0, 4, 7, 4052, 601 ;low:t14 high:t15
ITX_MUL2X_PACK 5, 4, 6, 201, 4091 ;low:t0 high:t1
ITX_MUL2X_PACK 7, 4, 6, 2440, 3290 ;low:t6 high:t7
ITX_MUL2X_PACK 3, 4, 6, 3035, 2751 ;low:t8 high:t9
ITX_MUL2X_PACK 1, 4, 6, 4052, 601 ;low:t14 high:t15
psubsw m4, m5, m3 ;low:t8a high:t9a
paddsw m5, m3 ;low:t0a high:t1a
psubsw m3, m6, m0 ;low:t14a high:t15a
paddsw m6, m0 ;low:t6a high:t7a
punpcklqdq m0, m4
punpckhwd m0, m4 ;packed t8a, t9a
psubsw m3, m7, m1 ;low:t14a high:t15a
paddsw m7, m1 ;low:t6a high:t7a
punpcklqdq m1, m4
punpckhwd m1, m4 ;packed t8a, t9a
punpcklqdq m4, m3
punpckhwd m3, m4 ;packed t15a, t14a
ITX_MUL2X_PACK 0, 4, 7, 799, 4017 ;low:t8 high:t9
ITX_MUL2X_PACK 3, 4, 7, 2276, 3406, 1 ;low:t14 high:t15
psubsw m4, m0, m2 ;low:t12a high:t13a
paddsw m0, m2 ;low:t8a high:t9a
psubsw m2, m1, m3 ;low:t14a high:t15a
paddsw m1, m3 ;low:t10a high:t11a
punpcklqdq m3, m4
punpckhwd m3, m4 ;packed t12a, t13a
punpcklqdq m4, m2
punpckhwd m2, m4 ;packed t15a, t14a
ITX_MUL2X_PACK 3, 4, 7, 1567, 3784 ;low:t12 high:t13
ITX_MUL2X_PACK 2, 4, 7, 3784, 1567, 1 ;low:t14 high:t15
psubsw m4, m0, m1 ;low:t10 high:t11
paddsw m0, m1 ;low:-out1 high:out14
ITX_MUL2X_PACK 1, 4, 6, 799, 4017 ;low:t8 high:t9
ITX_MUL2X_PACK 3, 4, 6, 2276, 3406, 1 ;low:t14 high:t15
paddsw m4, m1, m2 ;low:t12a high:t13a
psubsw m1, m2 ;low:t8a high:t9a
psubsw m2, m0, m3 ;low:t14a high:t15a
paddsw m0, m3 ;low:t10a high:t11a
punpcklqdq m3, m1
punpckhwd m3, m1 ;packed t12a, t13a
punpcklqdq m1, m2
punpckhwd m2, m1 ;packed t15a, t14a
ITX_MUL2X_PACK 3, 1, 6, 1567, 3784 ;low:t12 high:t13
ITX_MUL2X_PACK 2, 1, 6, 3784, 1567, 1 ;low:t14 high:t15
psubsw m1, m3, m2 ;low:t14a high:t15a
paddsw m3, m2 ;low:out2 high:-out13
punpckhqdq m2, m4, m1 ;low:t11 high:t15a
punpcklqdq m4, m1 ;low:t10 high:t14a
psubw m1, m4, m2
paddw m2, m4
psubsw m2, m4, m0 ;low:t10 high:t11
paddsw m0, m4 ;low:-out1 high:out14
mova [coeffq+16*6], m0
mova [coeffq+16*7], m3
mova m0, [coeffq+16*4]
mova m3, [coeffq+16*5]
psubsw m4, m5, m3 ;low:t4 high:t5
paddsw m5, m3 ;low:t0 high:t1
psubsw m3, m0 ,m6 ;low:t6 high:t7
paddsw m0, m6 ;low:t2 high:t3
punpcklqdq m6, m4
punpckhwd m6, m4 ;packed t4, t5
psubsw m3, m0, m7 ;low:t6 high:t7
paddsw m0, m7 ;low:t2 high:t3
punpcklqdq m7, m4
punpckhwd m7, m4 ;packed t4, t5
punpcklqdq m4, m3
punpckhwd m3, m4 ;packed t7, t6
ITX_MUL2X_PACK 6, 4, 7, 1567, 3784 ;low:t4a high:t5a
ITX_MUL2X_PACK 3, 4, 7, 3784, 1567, 1 ;low:t6a high:t7a
ITX_MUL2X_PACK 7, 4, 6, 1567, 3784 ;low:t4a high:t5a
ITX_MUL2X_PACK 3, 4, 6, 3784, 1567, 1 ;low:t6a high:t7a
psubsw m4, m5, m0 ;low:t2a high:t3a
paddsw m0, m5 ;low:out0 high:-out15
psubsw m5, m6, m3 ;low:t6 high:t7
paddsw m3, m6 ;low:-out3 high:out12
psubsw m5, m7, m3 ;low:t6 high:t7
paddsw m3, m7 ;low:-out3 high:out12
ret
ALIGN function_align
.main_pass1_end:
mova m7, [o(deint_shuf1)]
mova [coeffq+16*4], m0
mova [coeffq+16*5], m3
mova m0, [o(pw_2896_m2896)]
mova m3, [o(pw_2896_2896)]
pshufb m1, m7 ;t14a t15a
pshufb m2, m7 ;t10 t11
pshufb m4, m7 ;t2a t3a
pshufb m5, m7 ;t6 t7
pmaddwd m7, m0, m2
pmaddwd m2, m3
paddd m7, m6
paddd m2, m6
psrad m7, 12
psrad m2, 12
packssdw m2, m7 ;low:out6 high:-out9
pmaddwd m7, m0, m4
pmaddwd m4, m3
paddd m7, m6
paddd m4, m6
psrad m7, 12
psrad m4, 12
packssdw m4, m7 ;low:-out7 high:out8
pmaddwd m7, m3, m5
pmaddwd m5, m0
paddd m7, m6
paddd m5, m6
psrad m7, 12
psrad m5, 12
packssdw m7, m5 ;low:out4 high:-out11
pmaddwd m5, m3, m1
pmaddwd m1, m0
paddd m5, m6
paddd m1, m6
psrad m5, 12
psrad m1, 12
packssdw m5, m1 ;low:-out5 high:out10
mova m0, [coeffq+16*4]
mova m3, [coeffq+16*5]
ret
ALIGN function_align
.main_pass2_end:
mova m7, [o(pw_2896x8)]
punpckhqdq m6, m2, m1 ;low:t11 high:t15a
punpcklqdq m2, m1 ;low:t10 high:t14a
psubsw m1, m2, m6
paddsw m2, m6
punpckhqdq m6, m4, m5 ;low:t3a high:t7
punpcklqdq m4, m5 ;low:t2a high:t6
psubw m5, m4, m6
paddw m4, m6
psubsw m5, m4, m6
paddsw m4, m6
pmulhrsw m1, m7 ;low:-out9 high:out10
pmulhrsw m2, m7 ;low:out6 high:-out5
pmulhrsw m5, m7 ;low:out8 high:-out11
@ -2298,6 +2395,7 @@ INV_TXFM_16X4_FN flipadst, identity
cglobal iflipadst_16x4_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
LOAD_7ROWS coeffq, 16
call m(iadst_16x4_internal).main
call m(iadst_16x4_internal).main_pass1_end
punpcklwd m6, m7, m0 ;packed out11, out15
punpckhwd m0, m7 ;packed -out0, -out4
@ -2360,7 +2458,7 @@ cglobal iidentity_16x4_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
%endmacro
%macro INV_TXFM_8X16_FN 2-3 -1 ; type1, type2, fast_thresh
INV_TXFM_FN %1, %2, %3, 8x16, 8, 16*12
INV_TXFM_FN %1, %2, %3, 8x16, 8, 16*16
%ifidn %1_%2, dct_dct
pshuflw m0, [coeffq], q0000
punpcklwd m0, m0
@ -2548,6 +2646,7 @@ cglobal iadst_8x16_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
mova m7, [coeffq+16*11]
call m(iadst_16x8_internal).main
call m(iadst_16x8_internal).main_pass2_end
mov r3, dstq
lea dstq, [dstq+strideq*8]
@ -2599,6 +2698,7 @@ cglobal iflipadst_8x16_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
mova m7, [coeffq+16*11]
call m(iadst_16x8_internal).main
call m(iadst_16x8_internal).main_pass2_end
jmp m(iflipadst_8x8_internal).end
.end:
@ -2652,7 +2752,7 @@ cglobal iidentity_8x16_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
%macro INV_TXFM_16X8_FN 2-3 -1 ; type1, type2, fast_thresh
INV_TXFM_FN %1, %2, %3, 16x8, 8, 16*12
INV_TXFM_FN %1, %2, %3, 16x8, 8, 16*16
%ifidn %1_%2, dct_dct
movd m1, [o(pw_2896x8)]
pmulhrsw m0, m1, [coeffq]
@ -2893,6 +2993,7 @@ cglobal iadst_16x8_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
pmulhrsw m7, [coeffq+16*13]
call .main
call .main_pass1_end
mov r3, tx2q
lea tx2q, [o(m(iadst_16x8_internal).pass1_end)]
jmp m(iadst_8x8_internal).pass1_end
@ -2998,23 +3099,15 @@ ALIGN function_align
mova [rsp+gprsize*2+16*6], m3 ;-out3
psubsw m3, m0, m4 ;t7
paddsw m0, m4 ;out12
mova m7, [o(pw_2896x8)]
psubw m4, m2, m3
paddw m2, m3
mova [rsp+gprsize*2+16*12], m3
mova m3, [rsp+gprsize*2+16*7] ;t3
pmulhrsw m4, m7 ;-out11
pmulhrsw m2, m7 ;out4
mova [rsp+gprsize*2+16*7], m2 ;out4
mova [rsp+gprsize*2+16* 7], m2 ;out4
psubsw m2, m5, m3 ;t3a
paddsw m5, m3 ;-out15
psubw m3, m1, m2
paddw m1, m2
mova [rsp+gprsize*2+16*11], m2
mova m2, [rsp+gprsize*2+32*5] ;t15
pmulhrsw m3, m7 ;out8
pmulhrsw m1, m7 ;-out7
mova [rsp+gprsize*2+32*5 ], m1 ;-out7
mova [rsp+gprsize*2+16*10], m1 ;-out7
mova m1, [rsp+gprsize*2+16*0] ;t11
mova [rsp+gprsize*2+16*11], m3 ;out8
mova [rsp+gprsize*2+16*0 ], m5 ;-out15
mova m3, [rsp+gprsize*2+16*1] ;t10
mova [rsp+gprsize*2+16*1 ], m4 ;-out11
@ -3044,26 +3137,106 @@ ALIGN function_align
paddsw m2, m6 ;-out1
paddsw m6, m4, m1 ;out14
psubsw m4, m1 ;t11
psubw m1, m3, m4
paddw m3, m4
pmulhrsw m1, m7 ;-out9
pmulhrsw m3, m7 ;out6
mova [rsp+gprsize*2+16*4], m2 ;-out1
mova [rsp+gprsize*2+16*14], m4
mova [rsp+gprsize*2+16* 4], m2 ;-out1
mova m4, [rsp+gprsize*2+16*8] ;t14
mova m2, [rsp+gprsize*2+16*9] ;t15
mova [rsp+gprsize*2+16*9], m3 ;out6
mova [rsp+gprsize*2+16* 9], m3 ;out6
psubsw m3, m0, m4 ;t14a
paddsw m0, m4 ;out2
psubsw m4, m5, m2 ;t15a
paddsw m5, m2 ;-out13
psubw m2, m3, m4
paddw m3, m4
mova [rsp+gprsize*2+16*5], m0 ;out2
pmulhrsw m3, m7 ;-out5
mova [rsp+gprsize*2+16* 5], m0 ;out2
ret
ALIGN function_align
.main_pass1_end:
mova m0, [rsp+gprsize*2+16*14]
mova [rsp+gprsize*2+16*14], m5
mova [rsp+gprsize*2+16*15], m6
mova m5, [o(pw_2896_2896)]
mova m6, [o(pw_2896_m2896)]
mova m7, [o(pd_2048)]
punpcklwd m2, m3, m4
punpckhwd m3, m4
pmaddwd m4, m5, m2
pmaddwd m2, m6
pmaddwd m1, m5, m3
pmaddwd m3, m6
REPX {paddd x, m7}, m4, m2, m1, m3
REPX {psrad x, 12}, m4, m1, m2, m3
packssdw m4, m1 ;-out5
packssdw m2, m3 ;out10
mova [rsp+gprsize*2+16* 8], m4
mova m3, [rsp+gprsize*2+16* 9]
punpcklwd m1, m3, m0
punpckhwd m3, m0
pmaddwd m0, m5, m1
pmaddwd m1, m6
pmaddwd m4, m5, m3
pmaddwd m3, m6
REPX {paddd x, m7}, m0, m1, m4, m3
REPX {psrad x, 12}, m0, m4, m1, m3
packssdw m0, m4 ;out6
packssdw m1, m3 ;-out9
mova [rsp+gprsize*2+16* 9], m0
mova m0, [rsp+gprsize*2+16* 7]
mova m4, [rsp+gprsize*2+16*12]
punpcklwd m3, m0, m4
punpckhwd m0, m4
pmaddwd m4, m5, m3
pmaddwd m3, m6
pmaddwd m5, m0
pmaddwd m0, m6
REPX {paddd x, m7}, m4, m3, m5, m0
REPX {psrad x, 12}, m4, m5, m3, m0
packssdw m4, m5 ;out4
packssdw m3, m0 ;-out11
mova [rsp+gprsize*2+16* 7], m4
mova m4, [rsp+gprsize*2+16*10]
mova m5, [rsp+gprsize*2+16*11]
punpcklwd m0, m4, m5
punpckhwd m4, m5
pmaddwd m5, m0, [o(pw_2896_2896)]
pmaddwd m0, m6
pmaddwd m6, m4
pmaddwd m4, [o(pw_2896_2896)]
REPX {paddd x, m7}, m5, m0, m6, m4
REPX {psrad x, 12}, m0, m6, m5, m4
packssdw m0, m6 ;out8
packssdw m5, m4 ;-out7
mova [rsp+gprsize*2+16*10], m5
mova m4, [rsp+gprsize*2+16* 2] ;out12
mova m5, [rsp+gprsize*2+16*14] ;-out13
mova m6, [rsp+gprsize*2+16*15] ;out14
ret
ALIGN function_align
.main_pass2_end:
mova m7, [o(pw_2896x8)]
mova m1, [rsp+gprsize*2+16* 9]
mova m2, [rsp+gprsize*2+16*14]
paddsw m0, m1, m2
psubsw m1, m2
pmulhrsw m0, m7 ;out6
pmulhrsw m1, m7 ;-out9
mova [rsp+gprsize*2+16* 9], m0
psubsw m2, m3, m4
paddsw m3, m4
pmulhrsw m2, m7 ;out10
mova [rsp+gprsize*2+16*8], m3 ;-out5
mova m0, [rsp+gprsize*2+16*11] ;out8
mova m3, [rsp+gprsize*2+16*1 ] ;-out11
pmulhrsw m3, m7 ;-out5
mova [rsp+gprsize*2+16* 8], m3
mova m3, [rsp+gprsize*2+16* 7]
mova m4, [rsp+gprsize*2+16*12]
paddsw m0, m3, m4
psubsw m3, m4
pmulhrsw m0, m7 ;out4
pmulhrsw m3, m7 ;-out11
mova [rsp+gprsize*2+16* 7], m0
mova m0, [rsp+gprsize*2+16*10]
paddsw m4, m0, [rsp+gprsize*2+16*11]
psubsw m0, [rsp+gprsize*2+16*11]
pmulhrsw m4, m7 ;-out7
pmulhrsw m0, m7 ;out8
mova [rsp+gprsize*2+16*10], m4
mova m4, [rsp+gprsize*2+16*2 ] ;out12
ret
@ -3100,6 +3273,7 @@ cglobal iflipadst_16x8_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
pmulhrsw m7, [coeffq+16*13]
call m(iadst_16x8_internal).main
call m(iadst_16x8_internal).main_pass1_end
mova m7, [rsp+gprsize+16*0]
SAVE_8ROWS coeffq+16*0, 32
@ -3184,7 +3358,7 @@ cglobal iidentity_16x8_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
%macro INV_TXFM_16X16_FN 2-3 -1 ; type1, type2, fast_thresh
INV_TXFM_FN %1, %2, %3, 16x16, 8, 16*12
INV_TXFM_FN %1, %2, %3, 16x16, 8, 16*16
%ifidn %1_%2, dct_dct
movd m1, [o(pw_2896x8)]
pmulhrsw m0, m1, [coeffq]
@ -3423,6 +3597,7 @@ INV_TXFM_16X16_FN adst, flipadst
cglobal iadst_16x16_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
ITX_16X16_ADST_LOAD_ODD_COEFS
call m(iadst_16x8_internal).main
call m(iadst_16x8_internal).main_pass1_end
mov r3, tx2q
lea tx2q, [o(m(iadst_16x16_internal).pass1_end)]
@ -3441,6 +3616,7 @@ cglobal iadst_16x16_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
SAVE_8ROWS coeffq+16*1, 32
ITX_16X16_ADST_LOAD_EVEN_COEFS
call m(iadst_16x8_internal).main
call m(iadst_16x8_internal).main_pass1_end
lea tx2q, [o(m(iadst_16x16_internal).pass1_end2)]
mova m7, [o(pw_8192)]
@ -3496,6 +3672,7 @@ INV_TXFM_16X16_FN flipadst, flipadst
cglobal iflipadst_16x16_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
ITX_16X16_ADST_LOAD_ODD_COEFS
call m(iadst_16x8_internal).main
call m(iadst_16x8_internal).main_pass1_end
mov r3, tx2q
lea tx2q, [o(m(iflipadst_16x16_internal).pass1_end)]
@ -3514,6 +3691,7 @@ cglobal iflipadst_16x16_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
SAVE_8ROWS coeffq+16*17, 32
ITX_16X16_ADST_LOAD_EVEN_COEFS
call m(iadst_16x8_internal).main
call m(iadst_16x8_internal).main_pass1_end
mova m7, [rsp+gprsize+16*0]
SAVE_8ROWS coeffq+16*0, 32

View File

@ -28,14 +28,27 @@
#include "src/cpu.h"
#include "src/loopfilter.h"
decl_loopfilter_sb_fn(dav1d_lpf_h_sb_y_avx2);
decl_loopfilter_sb_fn(dav1d_lpf_v_sb_y_avx2);
decl_loopfilter_sb_fn(dav1d_lpf_h_sb_uv_avx2);
decl_loopfilter_sb_fn(dav1d_lpf_v_sb_uv_avx2);
#define decl_loopfilter_sb_fns(ext) \
decl_loopfilter_sb_fn(dav1d_lpf_h_sb_y_##ext); \
decl_loopfilter_sb_fn(dav1d_lpf_v_sb_y_##ext); \
decl_loopfilter_sb_fn(dav1d_lpf_h_sb_uv_##ext); \
decl_loopfilter_sb_fn(dav1d_lpf_v_sb_uv_##ext)
decl_loopfilter_sb_fns(ssse3);
decl_loopfilter_sb_fns(avx2);
COLD void bitfn(dav1d_loop_filter_dsp_init_x86)(Dav1dLoopFilterDSPContext *const c) {
const unsigned flags = dav1d_get_cpu_flags();
if (!(flags & DAV1D_X86_CPU_FLAG_SSSE3)) return;
#if BITDEPTH == 8
c->loop_filter_sb[0][0] = dav1d_lpf_h_sb_y_ssse3;
c->loop_filter_sb[0][1] = dav1d_lpf_v_sb_y_ssse3;
c->loop_filter_sb[1][0] = dav1d_lpf_h_sb_uv_ssse3;
c->loop_filter_sb[1][1] = dav1d_lpf_v_sb_uv_ssse3;
#endif
if (!(flags & DAV1D_X86_CPU_FLAG_AVX2)) return;
#if BITDEPTH == 8 && ARCH_X86_64

File diff suppressed because it is too large Load Diff

View File

@ -170,8 +170,6 @@ cglobal put_bilin, 4, 8, 0, dst, ds, src, ss, w, h, mxy
.put:
movzx wd, word [t2+wq*2+table_offset(put,)]
add wq, t2
lea t1, [ssq*3]
lea t2, [dsq*3]
jmp wq
.put_w2:
movzx t0d, word [srcq+ssq*0]
@ -194,11 +192,11 @@ cglobal put_bilin, 4, 8, 0, dst, ds, src, ss, w, h, mxy
jg .put_w4
RET
.put_w8:
movq m0, [srcq+ssq*0]
movq m1, [srcq+ssq*1]
mov t0, [srcq+ssq*0]
mov t1, [srcq+ssq*1]
lea srcq, [srcq+ssq*2]
movq [dstq+dsq*0], m0
movq [dstq+dsq*1], m1
mov [dstq+dsq*0], t0
mov [dstq+dsq*1], t1
lea dstq, [dstq+dsq*2]
sub hd, 2
jg .put_w8
@ -206,30 +204,22 @@ cglobal put_bilin, 4, 8, 0, dst, ds, src, ss, w, h, mxy
.put_w16:
movu m0, [srcq+ssq*0]
movu m1, [srcq+ssq*1]
movu m2, [srcq+ssq*2]
movu m3, [srcq+t1 ]
lea srcq, [srcq+ssq*4]
lea srcq, [srcq+ssq*2]
mova [dstq+dsq*0], m0
mova [dstq+dsq*1], m1
mova [dstq+dsq*2], m2
mova [dstq+t2 ], m3
lea dstq, [dstq+dsq*4]
sub hd, 4
lea dstq, [dstq+dsq*2]
sub hd, 2
jg .put_w16
RET
INIT_YMM avx2
.put_w32:
movu m0, [srcq+ssq*0]
movu m1, [srcq+ssq*1]
movu m2, [srcq+ssq*2]
movu m3, [srcq+t1 ]
lea srcq, [srcq+ssq*4]
lea srcq, [srcq+ssq*2]
mova [dstq+dsq*0], m0
mova [dstq+dsq*1], m1
mova [dstq+dsq*2], m2
mova [dstq+t2 ], m3
lea dstq, [dstq+dsq*4]
sub hd, 4
lea dstq, [dstq+dsq*2]
sub hd, 2
jg .put_w32
RET
.put_w64:

View File

@ -177,7 +177,6 @@ cglobal put_bilin, 4, 8, 0, dst, ds, src, ss, w, h, mxy, bak
.put:
movzx wd, word [t0+wq*2+table_offset(put,)]
add wq, t0
lea r6, [ssq*3]
RESTORE_DSQ_32 t0
jmp wq
.put_w2:
@ -211,20 +210,14 @@ cglobal put_bilin, 4, 8, 0, dst, ds, src, ss, w, h, mxy, bak
jg .put_w8
RET
.put_w16:
lea r4, [dsq*3]
.put_w16_in:
movu m0, [srcq+ssq*0]
movu m1, [srcq+ssq*1]
movu m2, [srcq+ssq*2]
movu m3, [srcq+r6 ]
lea srcq, [srcq+ssq*4]
lea srcq, [srcq+ssq*2]
mova [dstq+dsq*0], m0
mova [dstq+dsq*1], m1
mova [dstq+dsq*2], m2
mova [dstq+r4 ], m3
lea dstq, [dstq+dsq*4]
sub hd, 4
jg .put_w16_in
lea dstq, [dstq+dsq*2]
sub hd, 2
jg .put_w16
RET
.put_w32:
movu m0, [srcq+ssq*0+16*0]

View File

@ -27,7 +27,7 @@
SECTION_RODATA 64 ; avoids cacheline splits
dw 60, 56, 52, 48, 44, 40, 36, 32, 28, 24, 20, 16, 12, 8, 4, 0
min_prob: dw 60, 56, 52, 48, 44, 40, 36, 32, 28, 24, 20, 16, 12, 8, 4, 0
pw_0xff00: times 8 dw 0xff00
pw_32: times 8 dw 32
@ -35,21 +35,24 @@ pw_32: times 8 dw 32
%define resp resq
%define movp movq
%define c_shuf q3333
%define DECODE_SYMBOL_ADAPT_INIT
%macro DECODE_SYMBOL_ADAPT_INIT 0-1
%endmacro
%else
%define resp resd
%define movp movd
%define c_shuf q1111
%macro DECODE_SYMBOL_ADAPT_INIT 0
%macro DECODE_SYMBOL_ADAPT_INIT 0-1 0 ; hi_tok
mov t0, r0m
mov t1, r1m
%if %1 == 0
mov t2, r2m
%endif
%if STACK_ALIGNMENT >= 16
sub esp, 40
sub esp, 40-%1*4
%else
mov eax, esp
and esp, ~15
sub esp, 40
sub esp, 40-%1*4
mov [esp], eax
%endif
%endmacro
@ -69,13 +72,13 @@ endstruc
SECTION .text
%if WIN64
DECLARE_REG_TMP 0, 1, 2, 3, 4, 5, 7, 3
%define buf rsp+8 ; shadow space
DECLARE_REG_TMP 0, 1, 2, 3, 4, 5, 7, 3, 8
%define buf rsp+stack_offset+8 ; shadow space
%elif UNIX64
DECLARE_REG_TMP 0, 1, 2, 3, 4, 5, 7, 0
DECLARE_REG_TMP 0, 1, 2, 3, 4, 5, 7, 0, 8
%define buf rsp-40 ; red zone
%else
DECLARE_REG_TMP 2, 3, 4, 1, 5, 6, 5, 2
DECLARE_REG_TMP 2, 3, 4, 1, 5, 6, 5, 2, 3
%define buf esp+8
%endif
@ -88,7 +91,7 @@ cglobal msac_decode_symbol_adapt4, 0, 6, 6
movp m3, [t0+msac.dif]
mov t3d, [t0+msac.update_cdf]
mov t4d, t2d
neg t2
not t2 ; -(n_symbols + 1)
pshuflw m2, m2, q0000
movd [buf+12], m2
pand m2, [rax]
@ -112,15 +115,15 @@ cglobal msac_decode_symbol_adapt4, 0, 6, 6
pcmpeqw m2, m2
mov t2d, t3d
shr t3d, 4
cmp t4d, 4
sbb t3d, -5 ; (count >> 4) + (n_symbols > 3) + 4
cmp t4d, 3
sbb t3d, -5 ; (count >> 4) + (n_symbols > 2) + 4
cmp t2d, 32
adc t2d, 0 ; count + (count < 32)
movd m3, t3d
pavgw m2, m1 ; i >= val ? -1 : 32768
psubw m2, m0 ; for (i = 0; i < val; i++)
psubw m0, m1 ; cdf[i] += (32768 - cdf[i]) >> rate;
psraw m2, m3 ; for (; i < n_symbols - 1; i++)
psraw m2, m3 ; for (; i < n_symbols; i++)
paddw m0, m2 ; cdf[i] += (( -1 - cdf[i]) >> rate) + 1;
movq [t1], m0
mov [t1+t4*2], t2w
@ -214,11 +217,11 @@ cglobal msac_decode_symbol_adapt8, 0, 6, 6
DECODE_SYMBOL_ADAPT_INIT
LEA rax, pw_0xff00
movd m2, [t0+msac.rng]
movu m1, [t1]
mova m1, [t1]
movp m3, [t0+msac.dif]
mov t3d, [t0+msac.update_cdf]
mov t4d, t2d
neg t2
not t2
pshuflw m2, m2, q0000
movd [buf+12], m2
punpcklqdq m2, m2
@ -242,7 +245,7 @@ cglobal msac_decode_symbol_adapt8, 0, 6, 6
pcmpeqw m2, m2
mov t2d, t3d
shr t3d, 4
cmp t4d, 4 ; may be called with n_symbols < 4
cmp t4d, 3 ; may be called with n_symbols <= 2
sbb t3d, -5
cmp t2d, 32
adc t2d, 0
@ -252,7 +255,7 @@ cglobal msac_decode_symbol_adapt8, 0, 6, 6
psubw m0, m1
psraw m2, m3
paddw m0, m2
movu [t1], m0
mova [t1], m0
mov [t1+t4*2], t2w
jmp m(msac_decode_symbol_adapt4).renorm
@ -260,12 +263,12 @@ cglobal msac_decode_symbol_adapt16, 0, 6, 6
DECODE_SYMBOL_ADAPT_INIT
LEA rax, pw_0xff00
movd m4, [t0+msac.rng]
movu m2, [t1]
movu m3, [t1+16]
mova m2, [t1]
mova m3, [t1+16]
movp m5, [t0+msac.dif]
mov t3d, [t0+msac.update_cdf]
mov t4d, t2d
neg t2
not t2
%if WIN64
sub rsp, 48 ; need 36 bytes, shadow space is only 32
%endif
@ -288,8 +291,8 @@ cglobal msac_decode_symbol_adapt16, 0, 6, 6
punpcklqdq m5, m5
paddw m3, m4
mova [buf], m2
mova [buf+16], m3
psubusw m2, m5
mova [buf+16], m3
psubusw m3, m5
pxor m4, m4
pcmpeqw m2, m4
@ -301,7 +304,7 @@ cglobal msac_decode_symbol_adapt16, 0, 6, 6
movzx t3d, word [t1+t4*2]
pcmpeqw m4, m4
mova m5, m4
lea t2d, [t3+80] ; only support n_symbols >= 4
lea t2d, [t3+80] ; only support n_symbols > 2
shr t2d, 4
cmp t3d, 32
adc t3d, 0
@ -316,8 +319,8 @@ cglobal msac_decode_symbol_adapt16, 0, 6, 6
psraw m5, m2
paddw m0, m4
paddw m1, m5
movu [t1], m0
movu [t1+16], m1
mova [t1], m0
mova [t1+16], m1
mov [t1+t4*2], t3w
.renorm:
tzcnt eax, eax
@ -440,3 +443,158 @@ cglobal msac_decode_bool, 0, 6, 0
movzx eax, al
%endif
jmp m(msac_decode_symbol_adapt4).renorm3
; Decode a run of "hi_tok" symbols from the MSAC bitstream, looping until the
; decoded token terminates the sequence.
; %1 (update_cdf): when 1, the CDF at [t1] is adapted after every decoded
; symbol; when 0 the CDF is read-only.
; NOTE(review): the register/stack layout (buf, t0-t8, the msac struct
; offsets, m0-m5 preload) is established by DECODE_SYMBOL_ADAPT_INIT and the
; msac_decode_hi_tok entry code below — comments here only describe what is
; visible in this macro; confirm details against the entry point.
%macro HI_TOK 1 ; update_cdf
 %if ARCH_X86_64 == 0
    mov eax, -24                ; 32-bit: token counter kept in eax (64-bit uses t6d)
 %endif
%%loop:
 %if %1
    movzx t2d, word [t1+3*2]    ; load adaptation count from the cdf array
 %endif
    mova m1, m0
    pshuflw m2, m2, q0000       ; broadcast rng
    psrlw m1, 6
    movd [buf+12], m2
    pand m2, m4                 ; rng & 0xff00
    psllw m1, 7
    pmulhuw m1, m2              ; scale cdf probabilities by rng
 %if ARCH_X86_64 == 0
    add eax, 5
    mov [buf+8], eax            ; 32-bit: spill the counter; no spare GPR
 %endif
    pshuflw m3, m3, c_shuf      ; broadcast the relevant dif word
    paddw m1, m5                ; add min_prob offsets
    movq [buf+16], m1           ; store candidate range values for lookup below
    psubusw m1, m3
    pxor m2, m2
    pcmpeqw m1, m2              ; mask of symbols with v <= dif
    pmovmskb eax, m1
 %if %1
    ; CDF adaptation: rate = 4 + (count >> 4) + 4 (via lea +80 then shr 4),
    ; count saturates at 32.
    lea ecx, [t2+80]
    pcmpeqw m2, m2
    shr ecx, 4
    cmp t2d, 32
    adc t2d, 0                  ; count += (count < 32)
    movd m3, ecx
    pavgw m2, m1                ; i >= val ? -1 : 32768
    psubw m2, m0
    psubw m0, m1
    psraw m2, m3
    paddw m0, m2                ; cdf[i] += (delta >> rate)
    movq [t1], m0
    mov [t1+3*2], t2w           ; store updated count
 %endif
    tzcnt eax, eax              ; index of the decoded symbol (in bytes)
    movzx ecx, word [buf+rax+16] ; new range low bound u
    movzx t2d, word [buf+rax+14] ; previous bound v
    not t4
 %if ARCH_X86_64
    add t6d, 5
 %endif
    sub eax, 5 ; setup for merging the tok_br and tok branches
    sub t2d, ecx                ; rng = v - u
    shl rcx, gprsize*8-16
    add t4, rcx                 ; dif -= u (in one's-complement form)
    ; renormalize: shift rng/dif left so rng's msb is set
    bsr ecx, t2d
    xor ecx, 15
    shl t2d, cl
    shl t4, cl
    movd m2, t2d
    mov [t7+msac.rng], t2d
    not t4
    sub t5d, ecx                ; cnt -= shift
    jge %%end                   ; enough bits left? skip refill
    ; --- refill dif from the input buffer ---
    mov t2, [t7+msac.buf]
    mov rcx, [t7+msac.end]
 %if UNIX64 == 0
    push t8                     ; t8 aliases another reg on win64/x86-32; save it
 %endif
    lea t8, [t2+gprsize]
    cmp t8, rcx
    ja %%refill_eob             ; fewer than gprsize bytes left -> bytewise path
    mov t2, [t2]
    lea ecx, [t5+23]
    add t5d, 16
    shr ecx, 3                  ; number of bytes consumed
    bswap t2                    ; input is big-endian within the word
    sub t8, rcx
    shl ecx, 3
    shr t2, cl
    sub ecx, t5d
    mov t5d, gprsize*8-16
    shl t2, cl
    mov [t7+msac.buf], t8
 %if UNIX64 == 0
    pop t8
 %endif
    sub t5d, ecx
    xor t4, t2                  ; merge new bits into dif
%%end:
    movp m3, t4
 %if ARCH_X86_64
    add t6d, eax ; CF = tok_br < 3 || tok == 15
    jnc %%loop
    lea eax, [t6+30]
 %else
    add eax, [buf+8]            ; 32-bit: reload spilled counter
    jnc %%loop
    add eax, 30
  %if STACK_ALIGNMENT >= 16
    add esp, 36                 ; undo DECODE_SYMBOL_ADAPT_INIT's stack frame
  %else
    mov esp, [esp]
  %endif
 %endif
    ; write back msac state and return the final token value
    mov [t7+msac.dif], t4
    shr eax, 1
    mov [t7+msac.cnt], t5d
    RET
%%refill_eob:                   ; near end-of-buffer: refill one byte at a time
    mov t8, rcx
    mov ecx, gprsize*8-24
    sub ecx, t5d
%%refill_eob_loop:
    cmp t2, t8
    jae %%refill_eob_end        ; buffer exhausted
    movzx t5d, byte [t2]
    inc t2
    shl t5, cl
    xor t4, t5
    sub ecx, 8
    jge %%refill_eob_loop
%%refill_eob_end:
 %if UNIX64 == 0
    pop t8
 %endif
    mov t5d, gprsize*8-24
    mov [t7+msac.buf], t2
    sub t5d, ecx
    jmp %%end
%endmacro
; unsigned dav1d_msac_decode_hi_tok(MsacContext *s, uint16_t *cdf)
; Loads the msac state (rng/dif/cnt) and the cdf into registers, then
; dispatches to the cdf-updating or read-only instantiation of HI_TOK
; depending on s->update_cdf, so neither path pays a per-iteration branch.
cglobal msac_decode_hi_tok, 0, 7 + ARCH_X86_64, 6
    DECODE_SYMBOL_ADAPT_INIT 1
 %if ARCH_X86_64 == 0 && PIC
    ; x86-32 PIC: materialize a base pointer for RODATA accesses
    LEA t2, min_prob+12*2
    %define base t2-(min_prob+12*2)
 %else
    %define base 0
 %endif
    movq m0, [t1]               ; cdf
    movd m2, [t0+msac.rng]
    mov eax, [t0+msac.update_cdf]
    movq m4, [base+pw_0xff00]
    movp m3, [t0+msac.dif]
    movq m5, [base+min_prob+12*2]
    mov t4, [t0+msac.dif]
    mov t5d, [t0+msac.cnt]
 %if ARCH_X86_64
    mov t6d, -24                ; token counter (kept in memory on x86-32)
 %endif
    movifnidn t7, t0            ; keep the MsacContext pointer live across the loop
    test eax, eax
    jz .no_update_cdf
    HI_TOK 1
.no_update_cdf:
    HI_TOK 0

View File

@ -37,11 +37,13 @@ unsigned dav1d_msac_decode_symbol_adapt16_sse2(MsacContext *s, uint16_t *cdf,
unsigned dav1d_msac_decode_bool_adapt_sse2(MsacContext *s, uint16_t *cdf);
unsigned dav1d_msac_decode_bool_equi_sse2(MsacContext *s);
unsigned dav1d_msac_decode_bool_sse2(MsacContext *s, unsigned f);
unsigned dav1d_msac_decode_hi_tok_sse2(MsacContext *s, uint16_t *cdf);
#if ARCH_X86_64 || defined(__SSE2__) || (defined(_M_IX86_FP) && _M_IX86_FP >= 2)
#define dav1d_msac_decode_symbol_adapt4 dav1d_msac_decode_symbol_adapt4_sse2
#define dav1d_msac_decode_symbol_adapt8 dav1d_msac_decode_symbol_adapt8_sse2
#define dav1d_msac_decode_symbol_adapt16 dav1d_msac_decode_symbol_adapt16_sse2
#define dav1d_msac_decode_hi_tok dav1d_msac_decode_hi_tok_sse2
#endif
#define dav1d_msac_decode_bool_adapt dav1d_msac_decode_bool_adapt_sse2

View File

@ -65,6 +65,7 @@ static const struct {
{ "msac", checkasm_check_msac },
#if CONFIG_8BPC
{ "cdef_8bpc", checkasm_check_cdef_8bpc },
{ "filmgrain_8bpc", checkasm_check_filmgrain_8bpc },
{ "ipred_8bpc", checkasm_check_ipred_8bpc },
{ "itx_8bpc", checkasm_check_itx_8bpc },
{ "loopfilter_8bpc", checkasm_check_loopfilter_8bpc },
@ -73,6 +74,7 @@ static const struct {
#endif
#if CONFIG_16BPC
{ "cdef_16bpc", checkasm_check_cdef_16bpc },
{ "filmgrain_16bpc", checkasm_check_filmgrain_16bpc },
{ "ipred_16bpc", checkasm_check_ipred_16bpc },
{ "itx_16bpc", checkasm_check_itx_16bpc },
{ "loopfilter_16bpc", checkasm_check_loopfilter_16bpc },
@ -703,6 +705,7 @@ void checkasm_set_signal_handler_state(const int enabled) {
RemoveVectoredExceptionHandler(signal_handler);
#else
void (*const handler)(int) = enabled ? signal_handler : SIG_DFL;
signal(SIGBUS, handler);
signal(SIGFPE, handler);
signal(SIGILL, handler);
signal(SIGSEGV, handler);

View File

@ -60,6 +60,7 @@ name##_16bpc(void)
void checkasm_check_msac(void);
decl_check_bitfns(void checkasm_check_cdef);
decl_check_bitfns(void checkasm_check_filmgrain);
decl_check_bitfns(void checkasm_check_ipred);
decl_check_bitfns(void checkasm_check_itx);
decl_check_bitfns(void checkasm_check_loopfilter);
@ -279,7 +280,7 @@ void checkasm_stack_clobber(uint64_t clobber, ...);
}\
} while (0)
#else
#define bench_new(...) while (0)
#define bench_new(...) do {} while (0)
#endif
#define DECL_CHECKASM_CHECK_FUNC(type) \

View File

@ -0,0 +1,269 @@
/*
* Copyright © 2019, VideoLAN and dav1d authors
* Copyright © 2019, Two Orioles, LLC
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
* ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "tests/checkasm/checkasm.h"
#include <string.h>
#include "src/levels.h"
#include "src/film_grain.h"
#define UNIT_TEST 1
#include "src/fg_apply_tmpl.c"
/* Checkasm test: compare the DSP implementation of generate_grain_y against
 * the C reference for every AR coefficient lag (0..3) at the current
 * BITDEPTH, with randomized film-grain parameters, then benchmark it. */
static void check_gen_grny(const Dav1dFilmGrainDSPContext *const dsp) {
    entry grain_lut_c[GRAIN_HEIGHT][GRAIN_WIDTH];
    /* NOTE(review): one extra row for the asm output buffer — presumably
     * optimized implementations may scribble one row past the end; the
     * memcmp below only covers GRAIN_HEIGHT rows. Confirm. */
    entry grain_lut_a[GRAIN_HEIGHT + 1][GRAIN_WIDTH];

    declare_func(void, entry grain_lut[][GRAIN_WIDTH],
                 const Dav1dFilmGrainData *data HIGHBD_DECL_SUFFIX);

    for (int i = 0; i < 4; i++) {
        if (check_func(dsp->generate_grain_y, "gen_grain_y_ar%d_%dbpc", i, BITDEPTH)) {
            /* NOTE(review): fg_data is deliberately not zero-initialized;
             * only the fields set below may be read by generate_grain_y —
             * confirm against the implementation. */
            Dav1dFilmGrainData fg_data;
            fg_data.seed = rnd() & 0xFFFF;
#if BITDEPTH == 16
            const int bitdepth_max = rnd() & 1 ? 0x3ff : 0xfff; /* 10- or 12-bit */
#endif

            fg_data.grain_scale_shift = rnd() & 3;
            fg_data.ar_coeff_shift = (rnd() & 3) + 6;
            fg_data.ar_coeff_lag = i;
            /* number of luma AR coefficients for this lag */
            const int num_y_pos = 2 * fg_data.ar_coeff_lag * (fg_data.ar_coeff_lag + 1);
            for (int n = 0; n < num_y_pos; n++)
                fg_data.ar_coeffs_y[n] = (rnd() & 0xff) - 128; /* [-128, 127] */

            call_ref(grain_lut_c, &fg_data HIGHBD_TAIL_SUFFIX);
            call_new(grain_lut_a, &fg_data HIGHBD_TAIL_SUFFIX);
            if (memcmp(grain_lut_c, grain_lut_a,
                       GRAIN_WIDTH * GRAIN_HEIGHT * sizeof(entry)))
            {
                fail();
            }

            bench_new(grain_lut_a, &fg_data HIGHBD_TAIL_SUFFIX);
        }
    }

    report("gen_grain_y");
}
/* Checkasm test: compare the DSP fgy_32x32xn (luma film-grain application)
 * against the C reference for a random width/height, with and without edge
 * overlap, then benchmark the overlapped case at 64x32. */
static void check_fgy_sbrow(const Dav1dFilmGrainDSPContext *const dsp) {
    ALIGN_STK_32(pixel, c_dst, 128 * 32,);
    ALIGN_STK_32(pixel, a_dst, 128 * 32,);
    ALIGN_STK_32(pixel, src, 128 * 32,);
    const ptrdiff_t stride = 128 * sizeof(pixel);

    declare_func(void, pixel *dst_row, const pixel *src_row, ptrdiff_t stride,
                 const Dav1dFilmGrainData *data, size_t pw,
                 const uint8_t scaling[SCALING_SIZE],
                 const entry grain_lut[][GRAIN_WIDTH],
                 int bh, int row_num HIGHBD_DECL_SUFFIX);

    if (check_func(dsp->fgy_32x32xn, "fgy_32x32xn_%dbpc", BITDEPTH)) {
        /* NOTE(review): fg_data is only partially initialized; the callee
         * must read nothing beyond the fields set below — confirm. */
        Dav1dFilmGrainData fg_data;
        fg_data.seed = rnd() & 0xFFFF;
#if BITDEPTH == 16
        const int bitdepth_max = rnd() & 1 ? 0x3ff : 0xfff; /* 10- or 12-bit */
#else
        const int bitdepth_max = 0xff;
#endif

        uint8_t scaling[SCALING_SIZE];
        entry grain_lut[GRAIN_HEIGHT + 1][GRAIN_WIDTH];
        fg_data.grain_scale_shift = rnd() & 3;
        fg_data.ar_coeff_shift = (rnd() & 3) + 6;
        fg_data.ar_coeff_lag = rnd() & 3;
        const int num_y_pos = 2 * fg_data.ar_coeff_lag * (fg_data.ar_coeff_lag + 1);
        for (int n = 0; n < num_y_pos; n++)
            fg_data.ar_coeffs_y[n] = (rnd() & 0xff) - 128;
        dsp->generate_grain_y(grain_lut, &fg_data HIGHBD_TAIL_SUFFIX);

        /* Random strictly-increasing scaling points: each x coordinate gets
         * its own pad-sized slot, so points can never collide. */
        fg_data.num_y_points = 2 + (rnd() % 13);
        const int pad = 0xff / fg_data.num_y_points;
        for (int n = 0; n < fg_data.num_y_points; n++) {
            fg_data.y_points[n][0] = 0xff * n / fg_data.num_y_points;
            fg_data.y_points[n][0] += rnd() % pad;
            fg_data.y_points[n][1] = rnd() & 0xff;
        }
        generate_scaling(bitdepth_from_max(bitdepth_max), fg_data.y_points,
                         fg_data.num_y_points, scaling);

        const int w = 1 + (rnd() & 127); /* 1..128 */
        const int h = 1 + (rnd() & 31);  /* 1..32 */
        for (int y = 0; y < h; y++)
            for (int x = 0; x < w; x++)
                src[y * PXSTRIDE(stride) + x] = rnd() & bitdepth_max;
        /* row_num is 0 half the time — presumably to exercise the
         * top-of-frame code path; confirm against the implementation. */
        const int row_num = rnd() & 1 ? rnd() & 0x7ff : 0;

        fg_data.clip_to_restricted_range = rnd() & 1;
        fg_data.scaling_shift = (rnd() & 3) + 8;
        for (fg_data.overlap_flag = 0; fg_data.overlap_flag <= 1;
             fg_data.overlap_flag++)
        {
            call_ref(c_dst, src, stride, &fg_data, w, scaling, grain_lut, h,
                     row_num HIGHBD_TAIL_SUFFIX);
            call_new(a_dst, src, stride, &fg_data, w, scaling, grain_lut, h,
                     row_num HIGHBD_TAIL_SUFFIX);
            checkasm_check_pixel(c_dst, stride, a_dst, stride, w, h, "dst");
        }

        fg_data.overlap_flag = 1;
        bench_new(a_dst, src, stride, &fg_data, 64, scaling, grain_lut, 32,
                  row_num HIGHBD_TAIL_SUFFIX);
    }
    report("fgy_32x32xn");
}
/* Verify the chroma film grain application function (fguv_32x32xn) against
 * the C reference for every chroma layout (4:2:0, 4:2:2, 4:4:4), both with
 * and without chroma-scaling-from-luma (csfl), using randomized film grain
 * parameters and randomized source/luma planes. */
static void check_fguv_sbrow(const Dav1dFilmGrainDSPContext *const dsp) {
    /* 128x32 pixel planes: reference output, asm output, chroma source and
     * the co-located luma row used for csfl / luma-mult scaling. */
    ALIGN_STK_32(pixel, c_dst, 128 * 32,);
    ALIGN_STK_32(pixel, a_dst, 128 * 32,);
    ALIGN_STK_32(pixel, src, 128 * 32,);
    ALIGN_STK_32(pixel, luma_src, 128 * 32,);
    const ptrdiff_t lstride = 128 * sizeof(pixel);

    declare_func(void, pixel *dst_row, const pixel *src_row, ptrdiff_t stride,
                 const Dav1dFilmGrainData *data, size_t pw,
                 const uint8_t scaling[SCALING_SIZE],
                 const entry grain_lut[][GRAIN_WIDTH], int bh, int row_num,
                 const pixel *luma_row, ptrdiff_t luma_stride, int uv_pl,
                 int is_identity HIGHBD_DECL_SUFFIX);

    for (int layout_idx = 0; layout_idx < 3; layout_idx++) {
        const char ss_name[][4] = {
            [DAV1D_PIXEL_LAYOUT_I420 - 1] = "420",
            [DAV1D_PIXEL_LAYOUT_I422 - 1] = "422",
            [DAV1D_PIXEL_LAYOUT_I444 - 1] = "444",
        };
        const enum Dav1dPixelLayout layout = layout_idx + 1;
        /* Horizontal subsampling for 420/422, vertical only for 420. */
        const int ss_x = layout != DAV1D_PIXEL_LAYOUT_I444;
        const int ss_y = layout == DAV1D_PIXEL_LAYOUT_I420;
        /* Deliberately use a stride that differs from the plane width when
         * subsampled, to catch code that conflates stride with width. */
        const ptrdiff_t stride = (ss_x ? 96 : 128) * sizeof(pixel);

        for (int csfl = 0; csfl <= 1; csfl++) {
            if (check_func(dsp->fguv_32x32xn[layout_idx],
                           "fguv_32x32xn_%dbpc_%s_csfl%d",
                           BITDEPTH, ss_name[layout_idx], csfl))
            {
                /* NOTE(review): fg_data is only partially initialized below;
                 * the csfl branch fills y_points/num_y_points and the other
                 * branch fills the uv_* fields. Presumably the function under
                 * test only reads the fields relevant to the chosen mode —
                 * TODO confirm; also ar_coeffs_uv appears to be read by
                 * generate_grain_uv() without being initialized here. */
                Dav1dFilmGrainData fg_data;
                fg_data.seed = rnd() & 0xFFFF;
#if BITDEPTH == 16
                const int bitdepth_max = rnd() & 1 ? 0x3ff : 0xfff;  /* 10- or 12-bit */
#else
                const int bitdepth_max = 0xff;
#endif

                const int uv_pl = rnd() & 1;       /* which chroma plane (U or V) */
                const int is_identity = rnd() & 1; /* identity matrix coefficients flag */

                uint8_t scaling[SCALING_SIZE];
                entry grain_lut[2][GRAIN_HEIGHT + 1][GRAIN_WIDTH];
                fg_data.grain_scale_shift = rnd() & 3;
                fg_data.ar_coeff_shift = (rnd() & 3) + 6;
                fg_data.ar_coeff_lag = rnd() & 3;
                /* Number of AR coefficients for the given lag (per AV1 spec). */
                const int num_y_pos = 2 * fg_data.ar_coeff_lag * (fg_data.ar_coeff_lag + 1);
                for (int n = 0; n < num_y_pos; n++)
                    fg_data.ar_coeffs_y[n] = (rnd() & 0xff) - 128;
                /* Chroma grain is derived from the luma grain LUT. */
                dsp->generate_grain_y(grain_lut[0], &fg_data HIGHBD_TAIL_SUFFIX);
                dsp->generate_grain_uv[layout_idx](grain_lut[1], grain_lut[0],
                                                   &fg_data, uv_pl HIGHBD_TAIL_SUFFIX);

                /* Random chroma block size within the plane; lw/lh are the
                 * corresponding (possibly upsampled) luma dimensions. */
                const int w = 1 + (rnd() & (127 >> ss_x));
                const int h = 1 + (rnd() & (31 >> ss_y));
                const int lw = w << ss_x, lh = h << ss_y;

                for (int y = 0; y < h; y++)
                    for (int x = 0; x < w; x++)
                        src[y * PXSTRIDE(stride) + x] = rnd() & bitdepth_max;
                for (int y = 0; y < lh; y++)
                    for (int x = 0; x < lw; x++)
                        luma_src[y * PXSTRIDE(lstride) + x] = rnd() & bitdepth_max;
                /* Sometimes force row 0 to exercise the no-top-overlap path. */
                const int row_num = rnd() & 1 ? rnd() & 0x7ff : 0;

                if (csfl) {
                    /* csfl: chroma scaling is derived from the luma points. */
                    fg_data.num_y_points = 2 + (rnd() % 13);
                    /* Spread points so x coordinates stay strictly increasing. */
                    const int pad = 0xff / fg_data.num_y_points;
                    for (int n = 0; n < fg_data.num_y_points; n++) {
                        fg_data.y_points[n][0] = 0xff * n / fg_data.num_y_points;
                        fg_data.y_points[n][0] += rnd() % pad;
                        fg_data.y_points[n][1] = rnd() & 0xff;
                    }
                    generate_scaling(bitdepth_from_max(bitdepth_max), fg_data.y_points,
                                     fg_data.num_y_points, scaling);
                } else {
                    /* Independent chroma scaling points plus the uv mult/offset
                     * parameters used to blend luma into the scaling index. */
                    fg_data.num_uv_points[uv_pl] = 2 + (rnd() % 9);
                    const int pad = 0xff / fg_data.num_uv_points[uv_pl];
                    for (int n = 0; n < fg_data.num_uv_points[uv_pl]; n++) {
                        fg_data.uv_points[uv_pl][n][0] = 0xff * n / fg_data.num_uv_points[uv_pl];
                        fg_data.uv_points[uv_pl][n][0] += rnd() % pad;
                        fg_data.uv_points[uv_pl][n][1] = rnd() & 0xff;
                    }
                    generate_scaling(bitdepth_from_max(bitdepth_max), fg_data.uv_points[uv_pl],
                                     fg_data.num_uv_points[uv_pl], scaling);

                    fg_data.uv_mult[uv_pl] = (rnd() & 0xff) - 128;
                    fg_data.uv_luma_mult[uv_pl] = (rnd() & 0xff) - 128;
                    fg_data.uv_offset[uv_pl] = (rnd() & 0x1ff) - 256;
                }

                fg_data.clip_to_restricted_range = rnd() & 1;
                fg_data.scaling_shift = (rnd() & 3) + 8;
                fg_data.chroma_scaling_from_luma = csfl;
                /* Check both the overlapped and non-overlapped code paths. */
                for (fg_data.overlap_flag = 0; fg_data.overlap_flag <= 1;
                     fg_data.overlap_flag++)
                {
                    call_ref(c_dst, src, stride, &fg_data, w, scaling, grain_lut[1], h,
                             row_num, luma_src, lstride, uv_pl, is_identity HIGHBD_TAIL_SUFFIX);
                    call_new(a_dst, src, stride, &fg_data, w, scaling, grain_lut[1], h,
                             row_num, luma_src, lstride, uv_pl, is_identity HIGHBD_TAIL_SUFFIX);

                    checkasm_check_pixel(c_dst, stride, a_dst, stride, w, h, "dst");
                }

                /* Benchmark with fixed 32x16 dimensions and overlap enabled. */
                fg_data.overlap_flag = 1;
                bench_new(a_dst, src, stride, &fg_data, 32, scaling, grain_lut[1], 16,
                          row_num, luma_src, lstride, uv_pl, is_identity HIGHBD_TAIL_SUFFIX);
            }
        }
    }

    report("fguv_32x32xn");
}
/* Entry point for the film grain checkasm tests: set up the
 * bitdepth-templated DSP context, then run every film grain check
 * (grain generation, luma application, chroma application). */
void bitfn(checkasm_check_filmgrain)(void) {
    Dav1dFilmGrainDSPContext dsp;

    bitfn(dav1d_film_grain_dsp_init)(&dsp);

    check_gen_grny(&dsp);
    check_fgy_sbrow(&dsp);
    check_fguv_sbrow(&dsp);
}

View File

@ -138,7 +138,7 @@ static int copy_subcoefs(coef *coeff,
 * dimensions are non-zero. This leads to branching to specific optimized
* simd versions (e.g. dc-only) so that we get full asm coverage in this
* test */
const int16_t *const scan = dav1d_scans[tx][dav1d_tx_type_class[txtp]];
const uint16_t *const scan = dav1d_scans[tx][dav1d_tx_type_class[txtp]];
const int sub_high = subsh > 0 ? subsh * 8 - 1 : 0;
const int sub_low = subsh > 1 ? sub_high - 8 : 0;
int n, eob;

View File

@ -27,8 +27,6 @@
#include "tests/checkasm/checkasm.h"
#include <assert.h>
#include "src/levels.h"
#include "src/mc.h"

View File

@ -38,7 +38,7 @@
/* The normal code doesn't use function pointers */
typedef unsigned (*decode_symbol_adapt_fn)(MsacContext *s, uint16_t *cdf,
size_t n_symbols);
typedef unsigned (*decode_bool_adapt_fn)(MsacContext *s, uint16_t *cdf);
typedef unsigned (*decode_adapt_fn)(MsacContext *s, uint16_t *cdf);
typedef unsigned (*decode_bool_equi_fn)(MsacContext *s);
typedef unsigned (*decode_bool_fn)(MsacContext *s, unsigned f);
@ -46,17 +46,20 @@ typedef struct {
decode_symbol_adapt_fn symbol_adapt4;
decode_symbol_adapt_fn symbol_adapt8;
decode_symbol_adapt_fn symbol_adapt16;
decode_bool_adapt_fn bool_adapt;
decode_adapt_fn bool_adapt;
decode_bool_equi_fn bool_equi;
decode_bool_fn bool;
decode_adapt_fn hi_tok;
} MsacDSPContext;
static void randomize_cdf(uint16_t *const cdf, int n) {
for (int i = 16; i > n; i--)
cdf[i] = rnd(); /* randomize padding */
cdf[n] = cdf[n-1] = 0;
while (--n > 0)
cdf[n-1] = cdf[n] + rnd() % (32768 - cdf[n] - n) + 1;
static void randomize_cdf(uint16_t *const cdf, const int n) {
int i;
for (i = 15; i > n; i--)
cdf[i] = rnd(); // padding
cdf[i] = 0; // count
do {
cdf[i - 1] = cdf[i] + rnd() % (32768 - cdf[i] - i) + 1;
} while (--i > 0);
}
/* memcmp() on structs can have weird behavior due to padding etc. */
@ -69,7 +72,7 @@ static int msac_cmp(const MsacContext *const a, const MsacContext *const b) {
static void msac_dump(unsigned c_res, unsigned a_res,
const MsacContext *const a, const MsacContext *const b,
const uint16_t *const cdf_a, const uint16_t *const cdf_b,
int num_cdf)
const int num_cdf)
{
if (c_res != a_res)
fprintf(stderr, "c_res %u a_res %u\n", c_res, a_res);
@ -86,16 +89,15 @@ static void msac_dump(unsigned c_res, unsigned a_res,
if (a->allow_update_cdf)
fprintf(stderr, "allow_update_cdf %d vs %d\n",
a->allow_update_cdf, b->allow_update_cdf);
if (cdf_a != NULL && cdf_b != NULL &&
memcmp(cdf_a, cdf_b, sizeof(*cdf_a) * num_cdf)) {
if (num_cdf && memcmp(cdf_a, cdf_b, sizeof(*cdf_a) * (num_cdf + 1))) {
fprintf(stderr, "cdf:\n");
for (int i = 0; i < num_cdf; i++)
for (int i = 0; i <= num_cdf; i++)
fprintf(stderr, " %5u", cdf_a[i]);
fprintf(stderr, "\n");
for (int i = 0; i < num_cdf; i++)
for (int i = 0; i <= num_cdf; i++)
fprintf(stderr, " %5u", cdf_b[i]);
fprintf(stderr, "\n");
for (int i = 0; i < num_cdf; i++)
for (int i = 0; i <= num_cdf; i++)
fprintf(stderr, " %c", cdf_a[i] != cdf_b[i] ? 'x' : '.');
fprintf(stderr, "\n");
}
@ -117,26 +119,24 @@ static void msac_dump(unsigned c_res, unsigned a_res,
{ \
if (fail()) \
msac_dump(c_res, a_res, &s_c, &s_a, \
cdf[0], cdf[1], ns + 1); \
cdf[0], cdf[1], ns); \
} \
} \
if (cdf_update && ns == n) \
bench_new(&s_a, cdf[0], n); \
if (cdf_update && ns == n - 1) \
bench_new(&s_a, cdf[1], ns); \
} \
} \
} \
} while (0)
static void check_decode_symbol(MsacDSPContext *const c, uint8_t *const buf) {
/* Use an aligned CDF buffer for more consistent benchmark
* results, and a misaligned one for checking correctness. */
ALIGN_STK_16(uint16_t, cdf, 2, [17]);
ALIGN_STK_32(uint16_t, cdf, 2, [16]);
MsacContext s_c, s_a;
declare_func(unsigned, MsacContext *s, uint16_t *cdf, size_t n_symbols);
CHECK_SYMBOL_ADAPT( 4, 1, 5);
CHECK_SYMBOL_ADAPT( 8, 1, 8);
CHECK_SYMBOL_ADAPT(16, 4, 16);
CHECK_SYMBOL_ADAPT( 4, 1, 4);
CHECK_SYMBOL_ADAPT( 8, 1, 7);
CHECK_SYMBOL_ADAPT(16, 3, 15);
report("decode_symbol");
}
@ -158,11 +158,11 @@ static void check_decode_bool(MsacDSPContext *const c, uint8_t *const buf) {
memcmp(cdf[0], cdf[1], sizeof(*cdf)))
{
if (fail())
msac_dump(c_res, a_res, &s_c, &s_a, cdf[0], cdf[1], 2);
msac_dump(c_res, a_res, &s_c, &s_a, cdf[0], cdf[1], 1);
}
}
if (cdf_update)
bench_new(&s_a, cdf[0]);
bench_new(&s_a, cdf[1]);
}
}
@ -200,6 +200,35 @@ static void check_decode_bool(MsacDSPContext *const c, uint8_t *const buf) {
report("decode_bool");
}
static void check_decode_hi_tok(MsacDSPContext *const c, uint8_t *const buf) {
ALIGN_STK_16(uint16_t, cdf, 2, [16]);
MsacContext s_c, s_a;
if (check_func(c->hi_tok, "msac_decode_hi_tok")) {
declare_func(unsigned, MsacContext *s, uint16_t *cdf);
for (int cdf_update = 0; cdf_update <= 1; cdf_update++) {
dav1d_msac_init(&s_c, buf, BUF_SIZE, !cdf_update);
s_a = s_c;
randomize_cdf(cdf[0], 3);
memcpy(cdf[1], cdf[0], sizeof(*cdf));
for (int i = 0; i < 64; i++) {
unsigned c_res = call_ref(&s_c, cdf[0]);
unsigned a_res = call_new(&s_a, cdf[1]);
if (c_res != a_res || msac_cmp(&s_c, &s_a) ||
memcmp(cdf[0], cdf[1], sizeof(*cdf)))
{
if (fail())
msac_dump(c_res, a_res, &s_c, &s_a, cdf[0], cdf[1], 3);
break;
}
}
if (cdf_update)
bench_new(&s_a, cdf[1]);
}
}
report("decode_hi_tok");
}
void checkasm_check_msac(void) {
MsacDSPContext c;
c.symbol_adapt4 = dav1d_msac_decode_symbol_adapt_c;
@ -208,6 +237,7 @@ void checkasm_check_msac(void) {
c.bool_adapt = dav1d_msac_decode_bool_adapt_c;
c.bool_equi = dav1d_msac_decode_bool_equi_c;
c.bool = dav1d_msac_decode_bool_c;
c.hi_tok = dav1d_msac_decode_hi_tok_c;
#if ARCH_AARCH64 && HAVE_ASM
if (dav1d_get_cpu_flags() & DAV1D_ARM_CPU_FLAG_NEON) {
@ -226,6 +256,7 @@ void checkasm_check_msac(void) {
c.bool_adapt = dav1d_msac_decode_bool_adapt_sse2;
c.bool_equi = dav1d_msac_decode_bool_equi_sse2;
c.bool = dav1d_msac_decode_bool_sse2;
c.hi_tok = dav1d_msac_decode_hi_tok_sse2;
}
#endif
@ -235,4 +266,5 @@ void checkasm_check_msac(void) {
check_decode_symbol(&c, buf);
check_decode_bool(&c, buf);
check_decode_hi_tok(&c, buf);
}

View File

@ -41,6 +41,7 @@ if is_asm_enabled
checkasm_tmpl_sources = files(
'checkasm/cdef.c',
'checkasm/filmgrain.c',
'checkasm/ipred.c',
'checkasm/itx.c',
'checkasm/loopfilter.c',

View File

@ -29,7 +29,6 @@
#include "vcs_version.h"
#include "cli_config.h"
#include <assert.h>
#include <errno.h>
#include <inttypes.h>
#include <math.h>
@ -137,7 +136,7 @@ int main(const int argc, char *const *const argv) {
Dav1dPicture p;
Dav1dContext *c;
Dav1dData data;
unsigned n_out = 0, total, fps[2];
unsigned n_out = 0, total, fps[2], timebase[2];
uint64_t nspf, tfirst, elapsed;
double i_fps;
FILE *frametimes = NULL;
@ -155,7 +154,7 @@ int main(const int argc, char *const *const argv) {
if ((res = input_open(&in, cli_settings.demuxer,
cli_settings.inputfile,
fps, &total)) < 0)
fps, &total, timebase)) < 0)
{
return res;
}

View File

@ -27,7 +27,6 @@
#include "config.h"
#include <assert.h>
#include <getopt.h>
#include <limits.h>
#include <math.h>

View File

@ -27,7 +27,6 @@
#include "config.h"
#include <assert.h>
#include <errno.h>
#include <stdio.h>
#include <stdint.h>
@ -60,7 +59,7 @@ static int leb128(AnnexbInputContext *const c, size_t *const len) {
}
static int annexb_open(AnnexbInputContext *const c, const char *const file,
unsigned fps[2], unsigned *const num_frames)
unsigned fps[2], unsigned *const num_frames, unsigned timebase[2])
{
int res;
size_t len;
@ -73,6 +72,8 @@ static int annexb_open(AnnexbInputContext *const c, const char *const file,
// TODO: Parse sequence header and read timing info if any.
fps[0] = 25;
fps[1] = 1;
timebase[0] = 25;
timebase[1] = 1;
for (*num_frames = 0;; (*num_frames)++) {
res = leb128(c, &len);
if (res < 0)

View File

@ -36,7 +36,7 @@ typedef struct Demuxer {
const char *name;
const char *extension;
int (*open)(DemuxerPriv *ctx, const char *filename,
unsigned fps[2], unsigned *num_frames);
unsigned fps[2], unsigned *num_frames, unsigned timebase[2]);
int (*read)(DemuxerPriv *ctx, Dav1dData *data);
void (*close)(DemuxerPriv *ctx);
} Demuxer;

View File

@ -27,12 +27,13 @@
#include "config.h"
#include <assert.h>
#include <errno.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "common/attributes.h"
#include "input/input.h"
#include "input/demuxer.h"
@ -75,7 +76,7 @@ static const char *find_extension(const char *const f) {
int input_open(DemuxerContext **const c_out,
const char *const name, const char *const filename,
unsigned fps[2], unsigned *const num_frames)
unsigned fps[2], unsigned *const num_frames, unsigned timebase[2])
{
const Demuxer *impl;
DemuxerContext *c;
@ -120,7 +121,7 @@ int input_open(DemuxerContext **const c_out,
memset(c, 0, sizeof(DemuxerContext) + impl->priv_data_size);
c->impl = impl;
c->data = (DemuxerPriv *) &c[1];
if ((res = impl->open(c->data, filename, fps, num_frames)) < 0) {
if ((res = impl->open(c->data, filename, fps, num_frames, timebase)) < 0) {
free(c);
return res;
}

View File

@ -35,7 +35,7 @@ typedef struct DemuxerContext DemuxerContext;
void init_demuxers(void);
int input_open(DemuxerContext **const c_out,
const char *const name, const char *const filename,
unsigned fps[2], unsigned *num_frames);
unsigned fps[2], unsigned *num_frames, unsigned timebase[2]);
int input_read(DemuxerContext *ctx, Dav1dData *data);
void input_close(DemuxerContext *ctx);

View File

@ -27,7 +27,6 @@
#include "config.h"
#include <assert.h>
#include <errno.h>
#include <stdio.h>
#include <stdint.h>
@ -49,7 +48,7 @@ static int64_t rl64(const uint8_t *const p) {
}
static int ivf_open(IvfInputContext *const c, const char *const file,
unsigned fps[2], unsigned *const num_frames)
unsigned fps[2], unsigned *const num_frames, unsigned timebase[2])
{
size_t res;
uint8_t hdr[32];
@ -74,17 +73,18 @@ static int ivf_open(IvfInputContext *const c, const char *const file,
return -1;
}
fps[0] = rl32(&hdr[16]);
fps[1] = rl32(&hdr[20]);
timebase[0] = rl32(&hdr[16]);
timebase[1] = rl32(&hdr[20]);
const unsigned duration = rl32(&hdr[24]);
uint8_t data[4];
for (*num_frames = 0;; (*num_frames)++) {
if ((res = fread(data, 4, 1, c->f)) != 1)
break; // EOF
fseeko(c->f, rl32(data) + 8, SEEK_CUR);
}
fps[0] *= *num_frames;
fps[1] *= duration;
fps[0] = timebase[0] * *num_frames;
fps[1] = timebase[1] * duration;
fseeko(c->f, 32, SEEK_SET);
return 0;

View File

@ -22,9 +22,38 @@
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
# Build definition for the dav1d tools
#
# Common source files used by tools and examples
dav1d_input_sources = files(
'input/input.c',
'input/annexb.c',
'input/ivf.c',
)
dav1d_output_sources = files(
'output/md5.c',
'output/null.c',
'output/output.c',
'output/y4m2.c',
'output/yuv.c',
)
dav1d_input_objs = static_library('dav1d_input',
dav1d_input_sources,
include_directories : dav1d_inc_dirs,
install : false,
build_by_default : false,
)
dav1d_output_objs = static_library('dav1d_output',
dav1d_output_sources,
include_directories : dav1d_inc_dirs,
install : false,
build_by_default : false,
)
# Leave subdir if tools are disabled
if not get_option('enable_tools')
@ -32,6 +61,10 @@ if not get_option('enable_tools')
endif
#
# Build definition for the dav1d tools
#
# Configuration data for cli_config.h
cli_cdata = configuration_data()
@ -56,21 +89,13 @@ cli_config_h_target = configure_file(output: 'cli_config.h', configuration: cli_
dav1d_sources = files(
'dav1d.c',
'dav1d_cli_parse.c',
'input/input.c',
'input/annexb.c',
'input/ivf.c',
'output/md5.c',
'output/null.c',
'output/output.c',
'output/y4m2.c',
'output/yuv.c',
)
dav1d = executable('dav1d',
dav1d_sources,
rev_target, cli_config_h_target,
link_with : libdav1d,
link_with : [libdav1d, dav1d_input_objs, dav1d_output_objs],
include_directories : [dav1d_inc_dirs],
dependencies : [getopt_dependency, thread_dependency, rt_dependency],
install : true,

View File

@ -27,12 +27,13 @@
#include "config.h"
#include <assert.h>
#include <errno.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "common/attributes.h"
#include "output/output.h"
#include "output/muxer.h"