Mirror of https://github.com/mozilla/gecko-dev.git (synced 2024-11-24 21:31:04 +00:00)

Bug 1612060 - Update libdav1d to 0.5.2; r=achronop

Differential Revision: https://phabricator.services.mozilla.com/D61223

--HG--
extra : moz-landing-system : lando

This commit is contained in:
parent 2dfc28fbbe
commit f28bbffcea
@@ -202,6 +202,8 @@ elif CONFIG['CPU_ARCH'] == 'arm' or CONFIG['CPU_ARCH'] == 'aarch64':
 elif CONFIG['CPU_ARCH'] == 'arm':
     SOURCES += [
         '../../../third_party/dav1d/src/arm/32/cdef.S',
+        '../../../third_party/dav1d/src/arm/32/ipred.S',
+        '../../../third_party/dav1d/src/arm/32/loopfilter.S',
         '../../../third_party/dav1d/src/arm/32/looprestoration.S',
         '../../../third_party/dav1d/src/arm/32/mc.S',
     ]
@@ -1,7 +1,7 @@
-#define API_VERSION_NUMBER 3,0,0,0
-#define API_VERSION_NUMBER_STR "3.0.0"
-#define PROJECT_VERSION_NUMBER 0,4,0,0
-#define PROJECT_VERSION_NUMBER_STR "0.4.0"
+#define API_VERSION_NUMBER 3,1,0,0
+#define API_VERSION_NUMBER_STR "3.1.0"
+#define PROJECT_VERSION_NUMBER 0,5,2,0
+#define PROJECT_VERSION_NUMBER_STR "0.5.2"

 #include <windows.h>
@@ -20,7 +20,7 @@ origin:

 # Human-readable identifier for this version/release
 # Generally "version NNN", "tag SSS", "bookmark SSS"
-release: commit 5595102721d3c298d7cee64e64878486a3b8bdad (2019-10-22T19:50:25.000+02:00).
+release: commit 39667c751d427e447cbe8be783cfecd296659e24 (2019-12-02T18:19:06.000+01:00).

 # The package's license, where possible using the mnemonic from
 # https://spdx.org/licenses/
@@ -1,2 +1,2 @@
 /* auto-generated, do not edit */
-#define DAV1D_VERSION "0.5.0-6-g5595102"
+#define DAV1D_VERSION "0.5.2-0-g39667c7"
@@ -28,7 +28,7 @@
 #define DAV1D_VERSION_H

 #define DAV1D_API_VERSION_MAJOR 3
-#define DAV1D_API_VERSION_MINOR 0
+#define DAV1D_API_VERSION_MINOR 1
 #define DAV1D_API_VERSION_PATCH 0

 #endif /* DAV1D_VERSION_H */
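The API minor bump above (3.0.0 to 3.1.0) is visible to consumers through these macros. Below is a minimal hedged sketch of a compile-time check, assuming the standard dav1d.h entry point dav1d_version() for the matching runtime string; the program itself is illustrative, not part of this update.

```c
/* Hedged sketch: refuse to build against an older dav1d API than the one
 * this update targets, using the macros from include/dav1d/version.h. */
#include <stdio.h>
#include <dav1d/version.h>
#include <dav1d/dav1d.h>   /* assumed to declare dav1d_version() */

#if DAV1D_API_VERSION_MAJOR < 3 || \
    (DAV1D_API_VERSION_MAJOR == 3 && DAV1D_API_VERSION_MINOR < 1)
#error "expected the dav1d 3.1.0 API (libdav1d 0.5.2) or newer"
#endif

int main(void) {
    /* Compare the build-time macros against the runtime library string. */
    printf("built against API %d.%d.%d, running %s\n",
           DAV1D_API_VERSION_MAJOR, DAV1D_API_VERSION_MINOR,
           DAV1D_API_VERSION_PATCH, dav1d_version());
    return 0;
}
```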
third_party/dav1d/NEWS (vendored, 24 changes)

@@ -1,5 +1,27 @@
+Changes for 0.5.2 'Asiatic Cheetah':
+------------------------------------
+
+0.5.2 is a small release improving speed for ARM32 and adding minor features:
+ - ARM32 optimizations for loopfilter, ipred_dc|h|v
+ - Add section-5 raw OBU demuxer
+ - Improve the speed by reducing the L2 cache collisions
+ - Fix minor issues
+
+
+Changes for 0.5.1 'Asiatic Cheetah':
+------------------------------------
+
+0.5.1 is a small release improving speeds and fixing minor issues
+compared to 0.5.0:
+ - SSE2 optimizations for CDEF, wiener and warp_affine
+ - NEON optimizations for SGR on ARM32
+ - Fix mismatch issue in x86 asm in inverse identity transforms
+ - Fix build issue in ARM64 assembly if debug info was enabled
+ - Add a workaround for Xcode 11 -fstack-check bug
+
+
 Changes for 0.5.0 'Asiatic Cheetah':
-----------------------------
+------------------------------------

 0.5.0 is a medium release fixing regressions and minor issues,
 and improving speed significantly:
third_party/dav1d/README.md (vendored, 11 changes)

@@ -31,15 +31,16 @@ The plan is the folllowing:
 2. Provide a usable API,
 3. Port to most platforms,
 4. Make it fast on desktop, by writing asm for AVX-2 chips.
-5. Make it fast on mobile, by writing asm for ARMv8 chips,
-6. Make it fast on older desktop, by writing asm for SSSE3+ chips.

 ### On-going
+5. Make it fast on mobile, by writing asm for ARMv8 chips,
+6. Make it fast on older desktop, by writing asm for SSE chips.
+7. Make it fast on older mobiles, by writing asm for ARMv7 chips,
+8. Improve C code base with [various tweaks](https://code.videolan.org/videolan/dav1d/wikis/task-list),
+9. Accelerate for less common architectures, like PPC, SSE2 or AVX-512.

 ### After
-7. Improve C code base with [various tweaks](https://code.videolan.org/videolan/dav1d/wikis/task-list),
-8. Accelerate for less common architectures,
-9. Use more GPU, when possible.
+10. Use more GPU, when possible.

 # Contribute
third_party/dav1d/include/dav1d/headers.h (vendored, 11 changes)

@@ -41,6 +41,17 @@
 #define DAV1D_REFS_PER_FRAME 7
 #define DAV1D_TOTAL_REFS_PER_FRAME (DAV1D_REFS_PER_FRAME + 1)

+enum Dav1dObuType {
+    DAV1D_OBU_SEQ_HDR   = 1,
+    DAV1D_OBU_TD        = 2,
+    DAV1D_OBU_FRAME_HDR = 3,
+    DAV1D_OBU_TILE_GRP  = 4,
+    DAV1D_OBU_METADATA  = 5,
+    DAV1D_OBU_FRAME     = 6,
+    DAV1D_OBU_REDUNDANT_FRAME_HDR = 7,
+    DAV1D_OBU_PADDING   = 15,
+};
+
 enum Dav1dTxfmMode {
     DAV1D_TX_4X4_ONLY,
     DAV1D_TX_LARGEST,
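The new Dav1dObuType values mirror the OBU type codes of the AV1 bitstream. A small hedged sketch of how a caller might map them to names, for example while walking a section-5 raw OBU stream; only the enum comes from headers.h, the helper is illustrative.

```c
/* Hedged sketch: pretty-print the new Dav1dObuType values for logging. */
#include <dav1d/headers.h>

static const char *obu_type_name(enum Dav1dObuType type) {
    switch (type) {
    case DAV1D_OBU_SEQ_HDR:             return "sequence header";
    case DAV1D_OBU_TD:                  return "temporal delimiter";
    case DAV1D_OBU_FRAME_HDR:           return "frame header";
    case DAV1D_OBU_TILE_GRP:            return "tile group";
    case DAV1D_OBU_METADATA:            return "metadata";
    case DAV1D_OBU_FRAME:               return "frame";
    case DAV1D_OBU_REDUNDANT_FRAME_HDR: return "redundant frame header";
    case DAV1D_OBU_PADDING:             return "padding";
    default:                            return "reserved/unknown";
    }
}
```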
third_party/dav1d/include/dav1d/picture.h (vendored, 2 changes)

@@ -37,7 +37,7 @@
 /* Number of bytes to align AND pad picture memory buffers by, so that SIMD
  * implementations can over-read by a few bytes, and use aligned read/write
  * instructions. */
-#define DAV1D_PICTURE_ALIGNMENT 32
+#define DAV1D_PICTURE_ALIGNMENT 64

 typedef struct Dav1dPictureParameters {
     int w; ///< width (in pixels)
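Raising DAV1D_PICTURE_ALIGNMENT from 32 to 64 matters for callers that hand dav1d their own picture buffers: the base pointer, the stride and the end-of-plane padding all have to respect it. A hedged sketch of a conforming single-plane allocation for an 8-bit plane follows; the helper name and sizing policy are illustrative, only the macro is dav1d's.

```c
/* Hedged sketch: allocate one 8-bit plane whose base pointer, stride and
 * total size are multiples of DAV1D_PICTURE_ALIGNMENT, so SIMD code can use
 * aligned accesses and safely over-read past the nominal width. */
#include <stddef.h>
#include <stdint.h>
#include <stdlib.h>
#include <dav1d/dav1d.h>   /* pulls in dav1d/picture.h */

static uint8_t *alloc_plane(int w, int h, ptrdiff_t *stride_out) {
    const size_t align  = DAV1D_PICTURE_ALIGNMENT;            /* now 64 */
    const size_t stride = ((size_t)w + align - 1) & ~(align - 1);
    const size_t size   = stride * (size_t)h + align;         /* pad the tail too */
    uint8_t *buf = aligned_alloc(align, size);                /* size is a multiple of align */
    if (buf)
        *stride_out = (ptrdiff_t)stride;
    return buf;
}
```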
third_party/dav1d/meson.build (vendored, 25 changes)

@@ -23,14 +23,14 @@
 # SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

 project('dav1d', ['c'],
-    version: '0.5.0',
+    version: '0.5.2',
     default_options: ['c_std=c99',
                       'warning_level=2',
                       'buildtype=release',
                       'b_ndebug=if-release'],
     meson_version: '>= 0.47.0')

-dav1d_soname_version = '3.0.0'
+dav1d_soname_version = '3.1.0'
 dav1d_api_version_array = dav1d_soname_version.split('.')
 dav1d_api_version_major = dav1d_api_version_array[0]
 dav1d_api_version_minor = dav1d_api_version_array[1]

@@ -98,6 +98,7 @@ if host_machine.system() == 'windows'
     cdata.set('UNICODE', 1) # Define to 1 for Unicode (Wide Chars) APIs
     cdata.set('_UNICODE', 1) # Define to 1 for Unicode (Wide Chars) APIs
     cdata.set('__USE_MINGW_ANSI_STDIO', 1) # Define to force use of MinGW printf
+    cdata.set('_CRT_DECLARE_NONSTDC_NAMES', 1) # Define to get off_t from sys/types.h on MSVC
     if cc.has_function('fseeko', prefix : '#include <stdio.h>', args : test_args)
         cdata.set('_FILE_OFFSET_BITS', 64) # Not set by default by Meson on Windows
     else
@@ -112,11 +113,23 @@ if host_machine.system() == 'windows'
# On Windows, we use a compatibility layer to emulate pthread
thread_dependency = []
thread_compat_dep = declare_dependency(sources : files('src/win32/thread.c'))

rt_dependency = []
else
thread_dependency = dependency('threads')
thread_compat_dep = []
endif

rt_dependency = []
if cc.has_function('clock_gettime', prefix : '#include <time.h>', args : test_args)
    cdata.set('HAVE_CLOCK_GETTIME', 1)
elif host_machine.system() != 'darwin'
    rt_dependency = cc.find_library('rt', required: false)
    if not cc.has_function('clock_gettime', prefix : '#include <time.h>', args : test_args, dependencies : rt_dependency)
        error('clock_gettime not found')
    endif
    cdata.set('HAVE_CLOCK_GETTIME', 1)
endif
endif

# Header checks
@@ -215,6 +228,12 @@ if (get_option('buildtype') != 'debug' and get_option('buildtype') != 'plain')
     optional_arguments += '-ffast-math'
 endif

+if (host_machine.system() == 'darwin' and cc.get_id() == 'clang' and
+    cc.version().startswith('11'))
+    # Workaround for Xcode 11 -fstack-check bug, see #301
+    optional_arguments += '-fno-stack-check'
+endif
+
 add_project_arguments(cc.get_supported_arguments(optional_arguments), language : 'c')

 # libFuzzer related things
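The reworked Meson logic only defines HAVE_CLOCK_GETTIME when clock_gettime actually links, pulling in librt where needed and skipping the hard requirement on Darwin. A hedged sketch of the kind of consumer code such a define typically guards; the helper and the fallback branch are illustrative, not dav1d's own timer code.

```c
/* Hedged sketch: monotonic-nanosecond helper guarded by the define that the
 * Meson check above sets; the fallback branch is only a placeholder. */
#include <stdint.h>
#include <time.h>

static uint64_t now_ns(void) {
#ifdef HAVE_CLOCK_GETTIME
    struct timespec ts;
    clock_gettime(CLOCK_MONOTONIC, &ts);
    return (uint64_t)ts.tv_sec * 1000000000u + (uint64_t)ts.tv_nsec;
#else
    return 0;   /* e.g. mach_absolute_time() or QueryPerformanceCounter() instead */
#endif
}
```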
@@ -6,7 +6,7 @@ summary: AV1 decoder from VideoLAN
 description: |
   A small and fast AV1 decoder from the people who brought you VLC.

-grade: devel # must be 'stable' to release into candidate/stable channels
+grade: stable
 confinement: strict # use 'strict' once you have the right plugs and slots

 apps:
third_party/dav1d/src/arm/32/ipred.S (vendored, new file, 825 lines added)

@@ -0,0 +1,825 @@
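The listing that follows adds ARM32 NEON versions of the DC, vertical and horizontal intra predictors. As orientation for reading the assembly, here is a hedged scalar C sketch of what ipred_dc_128, ipred_v and ipred_h compute on the 8-bit path; the signatures are simplified relative to the extra arguments shown in the function comments below.

```c
#include <stddef.h>
#include <stdint.h>
#include <string.h>

/* DC_128: fill the block with the mid-grey value 128. */
static void ipred_dc_128_c(uint8_t *dst, ptrdiff_t stride, int w, int h) {
    for (int y = 0; y < h; y++, dst += stride)
        memset(dst, 128, w);
}

/* V: every row repeats the w pixels directly above the block. */
static void ipred_v_c(uint8_t *dst, ptrdiff_t stride,
                      const uint8_t *topleft, int w, int h) {
    const uint8_t *top = topleft + 1;   /* first pixel above the block */
    for (int y = 0; y < h; y++, dst += stride)
        memcpy(dst, top, w);
}

/* H: every row is filled with the pixel directly to its left. */
static void ipred_h_c(uint8_t *dst, ptrdiff_t stride,
                      const uint8_t *topleft, int w, int h) {
    for (int y = 0; y < h; y++, dst += stride)
        memset(dst, topleft[-(1 + y)], w);
}
```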
/*
|
||||
* Copyright © 2018, VideoLAN and dav1d authors
|
||||
* Copyright © 2019, Martin Storsjo
|
||||
* Copyright © 2019, B Krishnan Iyer
|
||||
* All rights reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright notice, this
|
||||
* list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
* this list of conditions and the following disclaimer in the documentation
|
||||
* and/or other materials provided with the distribution.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
|
||||
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
|
||||
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
|
||||
* ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
|
||||
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
|
||||
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
|
||||
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
#include "src/arm/asm.S"
|
||||
#include "util.S"
|
||||
|
||||
// void ipred_dc_128_neon(pixel *dst, const ptrdiff_t stride,
|
||||
// const pixel *const topleft,
|
||||
// const int width, const int height, const int a,
|
||||
// const int max_width, const int max_height);
|
||||
function ipred_dc_128_neon, export=1
|
||||
push {r4, lr}
|
||||
ldr r4, [sp, #8]
|
||||
clz r3, r3
|
||||
adr r2, L(ipred_dc_128_tbl)
|
||||
sub r3, r3, #25
|
||||
ldr r3, [r2, r3, lsl #2]
|
||||
mov lr, #128
|
||||
vdup.8 q0, lr
|
||||
add r2, r2, r3
|
||||
add r12, r0, r1
|
||||
lsl r1, r1, #1
|
||||
bx r2
|
||||
|
||||
.align 2
|
||||
L(ipred_dc_128_tbl):
|
||||
.word 640f - L(ipred_dc_128_tbl) + CONFIG_THUMB
|
||||
.word 320f - L(ipred_dc_128_tbl) + CONFIG_THUMB
|
||||
.word 16f - L(ipred_dc_128_tbl) + CONFIG_THUMB
|
||||
.word 8f - L(ipred_dc_128_tbl) + CONFIG_THUMB
|
||||
.word 4f - L(ipred_dc_128_tbl) + CONFIG_THUMB
|
||||
4:
|
||||
vst1.32 {d0[0]}, [r0, :32], r1
|
||||
vst1.32 {d0[0]}, [r12, :32], r1
|
||||
subs r4, r4, #4
|
||||
vst1.32 {d0[0]}, [r0, :32], r1
|
||||
vst1.32 {d0[0]}, [r12, :32], r1
|
||||
bgt 4b
|
||||
pop {r4, pc}
|
||||
8:
|
||||
vst1.8 {d0}, [r0, :64], r1
|
||||
vst1.8 {d0}, [r12, :64], r1
|
||||
subs r4, r4, #4
|
||||
vst1.8 {d0}, [r0, :64], r1
|
||||
vst1.8 {d0}, [r12, :64], r1
|
||||
bgt 8b
|
||||
pop {r4, pc}
|
||||
16:
|
||||
vst1.8 {d0, d1}, [r0, :128], r1
|
||||
vst1.8 {d0, d1}, [r12, :128], r1
|
||||
subs r4, r4, #4
|
||||
vst1.8 {d0, d1}, [r0, :128], r1
|
||||
vst1.8 {d0, d1}, [r12, :128], r1
|
||||
bgt 16b
|
||||
pop {r4, pc}
|
||||
320:
|
||||
vdup.8 q1, lr
|
||||
32:
|
||||
vst1.8 {d0, d1, d2, d3}, [r0, :128], r1
|
||||
vst1.8 {d0, d1, d2, d3}, [r12, :128], r1
|
||||
subs r4, r4, #4
|
||||
vst1.8 {d0, d1, d2, d3}, [r0, :128], r1
|
||||
vst1.8 {d0, d1, d2, d3}, [r12, :128], r1
|
||||
bgt 32b
|
||||
pop {r4, pc}
|
||||
640:
|
||||
vdup.8 q1, lr
|
||||
vdup.8 q2, lr
|
||||
vdup.8 q3, lr
|
||||
sub r1, r1, #32
|
||||
64:
|
||||
vst1.8 {d0, d1, d2, d3}, [r0, :128]!
|
||||
vst1.8 {d0, d1, d2, d3}, [r12, :128]!
|
||||
vst1.8 {d4, d5, d6, d7}, [r0, :128], r1
|
||||
vst1.8 {d4, d5, d6, d7}, [r12, :128], r1
|
||||
subs r4, r4, #4
|
||||
vst1.8 {d0, d1, d2, d3}, [r0, :128]!
|
||||
vst1.8 {d0, d1, d2, d3}, [r12, :128]!
|
||||
vst1.8 {d4, d5, d6, d7}, [r0, :128], r1
|
||||
vst1.8 {d4, d5, d6, d7}, [r12, :128], r1
|
||||
bgt 64b
|
||||
pop {r4, pc}
|
||||
endfunc
|
||||
|
||||
// void ipred_v_neon(pixel *dst, const ptrdiff_t stride,
|
||||
// const pixel *const topleft,
|
||||
// const int width, const int height, const int a,
|
||||
// const int max_width, const int max_height);
|
||||
function ipred_v_neon, export=1
|
||||
push {r4, lr}
|
||||
ldr lr, [sp, #8]
|
||||
clz r3, r3
|
||||
adr r4, L(ipred_v_tbl)
|
||||
sub r3, r3, #25
|
||||
ldr r3, [r4, r3, lsl #2]
|
||||
add r2, r2, #1
|
||||
add r4, r4, r3
|
||||
add r12, r0, r1
|
||||
lsl r1, r1, #1
|
||||
bx r4
|
||||
|
||||
.align 2
|
||||
L(ipred_v_tbl):
|
||||
.word 640f - L(ipred_v_tbl) + CONFIG_THUMB
|
||||
.word 320f - L(ipred_v_tbl) + CONFIG_THUMB
|
||||
.word 160f - L(ipred_v_tbl) + CONFIG_THUMB
|
||||
.word 80f - L(ipred_v_tbl) + CONFIG_THUMB
|
||||
.word 40f - L(ipred_v_tbl) + CONFIG_THUMB
|
||||
40:
|
||||
vld1.32 {d0[0]}, [r2]
|
||||
4:
|
||||
vst1.32 {d0[0]}, [r0, :32], r1
|
||||
vst1.32 {d0[0]}, [r12, :32], r1
|
||||
subs lr, lr, #4
|
||||
vst1.32 {d0[0]}, [r0, :32], r1
|
||||
vst1.32 {d0[0]}, [r12, :32], r1
|
||||
bgt 4b
|
||||
pop {r4, pc}
|
||||
80:
|
||||
vld1.8 {d0}, [r2]
|
||||
8:
|
||||
vst1.8 {d0}, [r0, :64], r1
|
||||
vst1.8 {d0}, [r12, :64], r1
|
||||
subs lr, lr, #4
|
||||
vst1.8 {d0}, [r0, :64], r1
|
||||
vst1.8 {d0}, [r12, :64], r1
|
||||
bgt 8b
|
||||
pop {r4, pc}
|
||||
160:
|
||||
vld1.8 {q0}, [r2]
|
||||
16:
|
||||
vst1.8 {d0, d1}, [r0, :128], r1
|
||||
vst1.8 {d0, d1}, [r12, :128], r1
|
||||
subs lr, lr, #4
|
||||
vst1.8 {d0, d1}, [r0, :128], r1
|
||||
vst1.8 {d0, d1}, [r12, :128], r1
|
||||
bgt 16b
|
||||
pop {r4, pc}
|
||||
320:
|
||||
vld1.8 {q0, q1}, [r2]
|
||||
32:
|
||||
vst1.8 {d0, d1, d2, d3}, [r0, :128], r1
|
||||
vst1.8 {d0, d1, d2, d3}, [r12, :128], r1
|
||||
subs lr, lr, #4
|
||||
vst1.8 {d0, d1, d2, d3}, [r0, :128], r1
|
||||
vst1.8 {d0, d1, d2, d3}, [r12, :128], r1
|
||||
bgt 32b
|
||||
pop {r4, pc}
|
||||
640:
|
||||
vld1.8 {q0, q1}, [r2]!
|
||||
sub r1, r1, #32
|
||||
vld1.8 {q2, q3}, [r2]
|
||||
64:
|
||||
vst1.8 {d0, d1, d2, d3}, [r0, :128]!
|
||||
vst1.8 {d0, d1, d2, d3}, [r12, :128]!
|
||||
vst1.8 {d4, d5, d6, d7}, [r0, :128], r1
|
||||
vst1.8 {d4, d5, d6, d7}, [r12, :128], r1
|
||||
subs lr, lr, #4
|
||||
vst1.8 {d0, d1, d2, d3}, [r0, :128]!
|
||||
vst1.8 {d0, d1, d2, d3}, [r12, :128]!
|
||||
vst1.8 {d4, d5, d6, d7}, [r0, :128], r1
|
||||
vst1.8 {d4, d5, d6, d7}, [r12, :128], r1
|
||||
bgt 64b
|
||||
pop {r4, pc}
|
||||
endfunc
|
||||
|
||||
// void ipred_h_neon(pixel *dst, const ptrdiff_t stride,
|
||||
// const pixel *const topleft,
|
||||
// const int width, const int height, const int a,
|
||||
// const int max_width, const int max_height);
|
||||
function ipred_h_neon, export=1
|
||||
push {r4-r5, lr}
|
||||
ldr r4, [sp, #12]
|
||||
clz r3, r3
|
||||
adr r5, L(ipred_h_tbl)
|
||||
sub r3, r3, #25
|
||||
ldr r3, [r5, r3, lsl #2]
|
||||
sub r2, r2, #4
|
||||
mov lr, #-4
|
||||
add r5, r5, r3
|
||||
add r12, r0, r1
|
||||
lsl r1, r1, #1
|
||||
bx r5
|
||||
|
||||
.align 2
|
||||
L(ipred_h_tbl):
|
||||
.word 640f - L(ipred_h_tbl) + CONFIG_THUMB
|
||||
.word 320f - L(ipred_h_tbl) + CONFIG_THUMB
|
||||
.word 160f - L(ipred_h_tbl) + CONFIG_THUMB
|
||||
.word 8f - L(ipred_h_tbl) + CONFIG_THUMB
|
||||
.word 4f - L(ipred_h_tbl) + CONFIG_THUMB
|
||||
4:
|
||||
vld4.8 {d0[], d1[], d2[], d3[]}, [r2], lr
|
||||
vst1.32 {d3[0]}, [r0, :32], r1
|
||||
vst1.32 {d2[0]}, [r12, :32], r1
|
||||
subs r4, r4, #4
|
||||
vst1.32 {d1[0]}, [r0, :32], r1
|
||||
vst1.32 {d0[0]}, [r12, :32], r1
|
||||
bgt 4b
|
||||
pop {r4-r5, pc}
|
||||
8:
|
||||
vld4.8 {d0[], d1[], d2[], d3[]}, [r2], lr
|
||||
vst1.8 {d3}, [r0, :64], r1
|
||||
vst1.8 {d2}, [r12, :64], r1
|
||||
subs r4, r4, #4
|
||||
vst1.8 {d1}, [r0, :64], r1
|
||||
vst1.8 {d0}, [r12, :64], r1
|
||||
bgt 8b
|
||||
pop {r4-r5, pc}
|
||||
160:
|
||||
add r2, r2, #3
|
||||
mov lr, #-1
|
||||
16:
|
||||
vld1.8 {d0[], d1[]}, [r2], lr
|
||||
subs r4, r4, #4
|
||||
vld1.8 {d2[], d3[]}, [r2], lr
|
||||
vst1.8 {q0}, [r0, :128], r1
|
||||
vld1.8 {d4[], d5[]}, [r2], lr
|
||||
vst1.8 {q1}, [r12, :128], r1
|
||||
vld1.8 {d6[], d7[]}, [r2], lr
|
||||
vst1.8 {q2}, [r0, :128], r1
|
||||
vst1.8 {q3}, [r12, :128], r1
|
||||
bgt 16b
|
||||
pop {r4-r5, pc}
|
||||
320:
|
||||
add r2, r2, #3
|
||||
mov lr, #-1
|
||||
sub r1, r1, #16
|
||||
32:
|
||||
vld1.8 {d0[], d1[]}, [r2], lr
|
||||
subs r4, r4, #4
|
||||
vld1.8 {d2[], d3[]}, [r2], lr
|
||||
vst1.8 {q0}, [r0, :128]!
|
||||
vld1.8 {d4[], d5[]}, [r2], lr
|
||||
vst1.8 {q1}, [r12, :128]!
|
||||
vld1.8 {d6[], d7[]}, [r2], lr
|
||||
vst1.8 {q0}, [r0, :128], r1
|
||||
vst1.8 {q1}, [r12, :128], r1
|
||||
vst1.8 {q2}, [r0, :128]!
|
||||
vst1.8 {q3}, [r12, :128]!
|
||||
vst1.8 {q2}, [r0, :128], r1
|
||||
vst1.8 {q3}, [r12, :128], r1
|
||||
bgt 32b
|
||||
pop {r4-r5, pc}
|
||||
640:
|
||||
add r2, r2, #3
|
||||
mov lr, #-1
|
||||
sub r1, r1, #48
|
||||
64:
|
||||
vld1.8 {d0[], d1[]}, [r2], lr
|
||||
subs r4, r4, #4
|
||||
vld1.8 {d2[], d3[]}, [r2], lr
|
||||
vst1.8 {q0}, [r0, :128]!
|
||||
vld1.8 {d4[], d5[]}, [r2], lr
|
||||
vst1.8 {q1}, [r12, :128]!
|
||||
vld1.8 {d6[], d7[]}, [r2], lr
|
||||
vst1.8 {q0}, [r0, :128]!
|
||||
vst1.8 {q1}, [r12, :128]!
|
||||
vst1.8 {q0}, [r0, :128]!
|
||||
vst1.8 {q1}, [r12, :128]!
|
||||
vst1.8 {q0}, [r0, :128], r1
|
||||
vst1.8 {q1}, [r12, :128], r1
|
||||
vst1.8 {q2}, [r0, :128]!
|
||||
vst1.8 {q3}, [r12, :128]!
|
||||
vst1.8 {q2}, [r0, :128]!
|
||||
vst1.8 {q3}, [r12, :128]!
|
||||
vst1.8 {q2}, [r0, :128]!
|
||||
vst1.8 {q3}, [r12, :128]!
|
||||
vst1.8 {q2}, [r0, :128], r1
|
||||
vst1.8 {q3}, [r12, :128], r1
|
||||
bgt 64b
|
||||
pop {r4-r5, pc}
|
||||
endfunc
|
||||
|
||||
// void ipred_dc_top_neon(pixel *dst, const ptrdiff_t stride,
|
||||
// const pixel *const topleft,
|
||||
// const int width, const int height, const int a,
|
||||
// const int max_width, const int max_height);
|
||||
function ipred_dc_top_neon, export=1
|
||||
push {r4-r5, lr}
|
||||
ldr r4, [sp, #12]
|
||||
clz r3, r3
|
||||
adr r5, L(ipred_dc_top_tbl)
|
||||
sub r3, r3, #25
|
||||
ldr r3, [r5, r3, lsl #2]
|
||||
add r2, r2, #1
|
||||
add r5, r5, r3
|
||||
add r12, r0, r1
|
||||
lsl r1, r1, #1
|
||||
bx r5
|
||||
|
||||
.align 2
|
||||
L(ipred_dc_top_tbl):
|
||||
.word 640f - L(ipred_dc_top_tbl) + CONFIG_THUMB
|
||||
.word 320f - L(ipred_dc_top_tbl) + CONFIG_THUMB
|
||||
.word 160f - L(ipred_dc_top_tbl) + CONFIG_THUMB
|
||||
.word 80f - L(ipred_dc_top_tbl) + CONFIG_THUMB
|
||||
.word 40f - L(ipred_dc_top_tbl) + CONFIG_THUMB
|
||||
40:
|
||||
vld1.32 {d0[]}, [r2]
|
||||
vpaddl.u8 d0, d0
|
||||
vpadd.u16 d0, d0
|
||||
vrshrn.u16 d0, q0, #2
|
||||
vdup.8 d0, d0[0]
|
||||
4:
|
||||
vst1.32 {d0[0]}, [r0, :32], r1
|
||||
vst1.32 {d0[0]}, [r12, :32], r1
|
||||
subs r4, r4, #4
|
||||
vst1.32 {d0[0]}, [r0, :32], r1
|
||||
vst1.32 {d0[0]}, [r12, :32], r1
|
||||
bgt 4b
|
||||
pop {r4-r5, pc}
|
||||
80:
|
||||
vld1.8 {d0}, [r2]
|
||||
vpaddl.u8 d0, d0
|
||||
vpadd.u16 d0, d0
|
||||
vpadd.u16 d0, d0
|
||||
vrshrn.u16 d0, q0, #3
|
||||
vdup.8 d0, d0[0]
|
||||
8:
|
||||
vst1.8 {d0}, [r0, :64], r1
|
||||
vst1.8 {d0}, [r12, :64], r1
|
||||
subs r4, r4, #4
|
||||
vst1.8 {d0}, [r0, :64], r1
|
||||
vst1.8 {d0}, [r12, :64], r1
|
||||
bgt 8b
|
||||
pop {r4-r5, pc}
|
||||
160:
|
||||
vld1.8 {d0, d1}, [r2]
|
||||
vaddl.u8 q0, d0, d1
|
||||
vadd.u16 d0, d0, d1
|
||||
vpadd.u16 d0, d0
|
||||
vpadd.u16 d0, d0
|
||||
vrshrn.u16 d0, q0, #4
|
||||
vdup.8 q0, d0[0]
|
||||
16:
|
||||
vst1.8 {d0, d1}, [r0, :128], r1
|
||||
vst1.8 {d0, d1}, [r12, :128], r1
|
||||
subs r4, r4, #4
|
||||
vst1.8 {d0, d1}, [r0, :128], r1
|
||||
vst1.8 {d0, d1}, [r12, :128], r1
|
||||
bgt 16b
|
||||
pop {r4-r5, pc}
|
||||
320:
|
||||
vld1.8 {d0, d1, d2, d3}, [r2]
|
||||
vaddl.u8 q0, d0, d1
|
||||
vaddl.u8 q1, d2, d3
|
||||
vadd.u16 q0, q0, q1
|
||||
vadd.u16 d0, d0, d1
|
||||
vpadd.u16 d0, d0
|
||||
vpadd.u16 d0, d0
|
||||
vrshrn.u16 d4, q0, #5
|
||||
vdup.8 q0, d4[0]
|
||||
vdup.8 q1, d4[0]
|
||||
32:
|
||||
vst1.8 {d0, d1, d2, d3}, [r0, :128], r1
|
||||
vst1.8 {d0, d1, d2, d3}, [r12, :128], r1
|
||||
subs r4, r4, #4
|
||||
vst1.8 {d0, d1, d2, d3}, [r0, :128], r1
|
||||
vst1.8 {d0, d1, d2, d3}, [r12, :128], r1
|
||||
bgt 32b
|
||||
pop {r4-r5, pc}
|
||||
640:
|
||||
vld1.8 {d0, d1, d2, d3}, [r2]!
|
||||
vaddl.u8 q0, d0, d1
|
||||
vld1.8 {d4, d5, d6, d7}, [r2]
|
||||
vaddl.u8 q1, d2, d3
|
||||
vaddl.u8 q2, d4, d5
|
||||
vaddl.u8 q3, d6, d7
|
||||
vadd.u16 q0, q0, q1
|
||||
vadd.u16 q1, q2, q3
|
||||
vadd.u16 q0, q0, q1
|
||||
vadd.u16 d0, d0, d1
|
||||
vpadd.u16 d0, d0
|
||||
vpadd.u16 d0, d0
|
||||
vrshrn.u16 d18, q0, #6
|
||||
vdup.8 q0, d18[0]
|
||||
vdup.8 q1, d18[0]
|
||||
vdup.8 q2, d18[0]
|
||||
vdup.8 q3, d18[0]
|
||||
sub r1, r1, #32
|
||||
64:
|
||||
vst1.8 {d0, d1, d2, d3}, [r0, :128]!
|
||||
vst1.8 {d0, d1, d2, d3}, [r12, :128]!
|
||||
vst1.8 {d4, d5, d6, d7}, [r0, :128], r1
|
||||
vst1.8 {d4, d5, d6, d7}, [r12, :128], r1
|
||||
subs r4, r4, #4
|
||||
vst1.8 {d0, d1, d2, d3}, [r0, :128]!
|
||||
vst1.8 {d0, d1, d2, d3}, [r12, :128]!
|
||||
vst1.8 {d4, d5, d6, d7}, [r0, :128], r1
|
||||
vst1.8 {d4, d5, d6, d7}, [r12, :128], r1
|
||||
bgt 64b
|
||||
pop {r4-r5, pc}
|
||||
endfunc
|
||||
|
||||
// void ipred_dc_left_neon(pixel *dst, const ptrdiff_t stride,
|
||||
// const pixel *const topleft,
|
||||
// const int width, const int height, const int a,
|
||||
// const int max_width, const int max_height);
|
||||
function ipred_dc_left_neon, export=1
|
||||
push {r4-r5, lr}
|
||||
ldr r4, [sp, #12]
|
||||
sub r2, r2, r4
|
||||
clz r3, r3
|
||||
clz lr, r4
|
||||
sub lr, lr, #25
|
||||
adr r5, L(ipred_dc_left_tbl)
|
||||
sub r3, r3, #20
|
||||
ldr r3, [r5, r3, lsl #2]
|
||||
ldr lr, [r5, lr, lsl #2]
|
||||
add r3, r5, r3
|
||||
add r5, r5, lr
|
||||
add r12, r0, r1
|
||||
lsl r1, r1, #1
|
||||
bx r5
|
||||
|
||||
.align 2
|
||||
L(ipred_dc_left_tbl):
|
||||
.word L(ipred_dc_left_h64) - L(ipred_dc_left_tbl) + CONFIG_THUMB
|
||||
.word L(ipred_dc_left_h32) - L(ipred_dc_left_tbl) + CONFIG_THUMB
|
||||
.word L(ipred_dc_left_h16) - L(ipred_dc_left_tbl) + CONFIG_THUMB
|
||||
.word L(ipred_dc_left_h8) - L(ipred_dc_left_tbl) + CONFIG_THUMB
|
||||
.word L(ipred_dc_left_h4) - L(ipred_dc_left_tbl) + CONFIG_THUMB
|
||||
.word L(ipred_dc_left_w64) - L(ipred_dc_left_tbl) + CONFIG_THUMB
|
||||
.word L(ipred_dc_left_w32) - L(ipred_dc_left_tbl) + CONFIG_THUMB
|
||||
.word L(ipred_dc_left_w16) - L(ipred_dc_left_tbl) + CONFIG_THUMB
|
||||
.word L(ipred_dc_left_w8) - L(ipred_dc_left_tbl) + CONFIG_THUMB
|
||||
.word L(ipred_dc_left_w4) - L(ipred_dc_left_tbl) + CONFIG_THUMB
|
||||
|
||||
L(ipred_dc_left_h4):
|
||||
vld1.32 {d0[]}, [r2]
|
||||
vpaddl.u8 d0, d0
|
||||
vpadd.u16 d0, d0
|
||||
vrshrn.u16 d0, q0, #2
|
||||
vdup.8 q0, d0[0]
|
||||
bx r3
|
||||
L(ipred_dc_left_w4):
|
||||
vst1.32 {d0[0]}, [r0, :32], r1
|
||||
vst1.32 {d0[0]}, [r12, :32], r1
|
||||
subs r4, r4, #4
|
||||
vst1.32 {d0[0]}, [r0, :32], r1
|
||||
vst1.32 {d0[0]}, [r12, :32], r1
|
||||
bgt L(ipred_dc_left_w4)
|
||||
pop {r4-r5, pc}
|
||||
L(ipred_dc_left_h8):
|
||||
vld1.8 {d0}, [r2]
|
||||
vpaddl.u8 d0, d0
|
||||
vpadd.u16 d0, d0
|
||||
vpadd.u16 d0, d0
|
||||
vrshrn.u16 d0, q0, #3
|
||||
vdup.8 q0, d0[0]
|
||||
bx r3
|
||||
L(ipred_dc_left_w8):
|
||||
vst1.8 {d0}, [r0, :64], r1
|
||||
vst1.8 {d0}, [r12, :64], r1
|
||||
subs r4, r4, #4
|
||||
vst1.8 {d0}, [r0, :64], r1
|
||||
vst1.8 {d0}, [r12, :64], r1
|
||||
bgt L(ipred_dc_left_w8)
|
||||
pop {r4-r5, pc}
|
||||
L(ipred_dc_left_h16):
|
||||
vld1.8 {d0, d1}, [r2]
|
||||
vaddl.u8 q0, d0, d1
|
||||
vadd.u16 d0, d0, d1
|
||||
vpadd.u16 d0, d0
|
||||
vpadd.u16 d0, d0
|
||||
vrshrn.u16 d0, q0, #4
|
||||
vdup.8 q0, d0[0]
|
||||
bx r3
|
||||
L(ipred_dc_left_w16):
|
||||
vst1.8 {d0, d1}, [r0, :128], r1
|
||||
vst1.8 {d0, d1}, [r12, :128], r1
|
||||
subs r4, r4, #4
|
||||
vst1.8 {d0, d1}, [r0, :128], r1
|
||||
vst1.8 {d0, d1}, [r12, :128], r1
|
||||
bgt L(ipred_dc_left_w16)
|
||||
pop {r4-r5, pc}
|
||||
L(ipred_dc_left_h32):
|
||||
vld1.8 {d0, d1, d2, d3}, [r2]
|
||||
vaddl.u8 q0, d0, d1
|
||||
vaddl.u8 q1, d2, d3
|
||||
vadd.u16 q0, q0, q1
|
||||
vadd.u16 d0, d0, d1
|
||||
vpadd.u16 d0, d0
|
||||
vpadd.u16 d0, d0
|
||||
vrshrn.u16 d0, q0, #5
|
||||
vdup.8 q0, d0[0]
|
||||
bx r3
|
||||
L(ipred_dc_left_w32):
|
||||
vmov.8 q1, q0
|
||||
1:
|
||||
vst1.8 {d0, d1, d2, d3}, [r0, :128], r1
|
||||
vst1.8 {d0, d1, d2, d3}, [r12, :128], r1
|
||||
subs r4, r4, #4
|
||||
vst1.8 {d0, d1, d2, d3}, [r0, :128], r1
|
||||
vst1.8 {d0, d1, d2, d3}, [r12, :128], r1
|
||||
bgt 1b
|
||||
pop {r4-r5, pc}
|
||||
L(ipred_dc_left_h64):
|
||||
vld1.8 {d0, d1, d2, d3}, [r2]!
|
||||
vld1.8 {d4, d5, d6, d7}, [r2]
|
||||
vaddl.u8 q0, d0, d1
|
||||
vaddl.u8 q1, d2, d3
|
||||
vaddl.u8 q2, d4, d5
|
||||
vaddl.u8 q3, d6, d7
|
||||
vadd.u16 q0, q0, q1
|
||||
vadd.u16 q1, q2, q3
|
||||
vadd.u16 q0, q0, q1
|
||||
vadd.u16 d0, d0, d1
|
||||
vpadd.u16 d0, d0
|
||||
vpadd.u16 d0, d0
|
||||
vrshrn.u16 d0, q0, #6
|
||||
vdup.8 q0, d0[0]
|
||||
bx r3
|
||||
L(ipred_dc_left_w64):
|
||||
sub r1, r1, #32
|
||||
vmov.8 q1, q0
|
||||
vmov.8 q2, q0
|
||||
vmov.8 q3, q0
|
||||
1:
|
||||
vst1.8 {d0, d1, d2, d3}, [r0, :128]!
|
||||
vst1.8 {d0, d1, d2, d3}, [r12, :128]!
|
||||
vst1.8 {d4, d5, d6, d7}, [r0, :128], r1
|
||||
vst1.8 {d4, d5, d6, d7}, [r12, :128], r1
|
||||
subs r4, r4, #4
|
||||
vst1.8 {d0, d1, d2, d3}, [r0, :128]!
|
||||
vst1.8 {d0, d1, d2, d3}, [r12, :128]!
|
||||
vst1.8 {d4, d5, d6, d7}, [r0, :128], r1
|
||||
vst1.8 {d4, d5, d6, d7}, [r12, :128], r1
|
||||
bgt 1b
|
||||
pop {r4-r5, pc}
|
||||
endfunc
|
||||
|
||||
// void ipred_dc_neon(pixel *dst, const ptrdiff_t stride,
|
||||
// const pixel *const topleft,
|
||||
// const int width, const int height, const int a,
|
||||
// const int max_width, const int max_height);
|
||||
function ipred_dc_neon, export=1
|
||||
push {r4-r6, lr}
|
||||
ldr r4, [sp, #16]
|
||||
sub r2, r2, r4
|
||||
add lr, r3, r4 // width + height
|
||||
clz r3, r3
|
||||
clz r12, r4
|
||||
vdup.16 q15, lr // width + height
|
||||
mov r6, #0
|
||||
adr r5, L(ipred_dc_tbl)
|
||||
rbit lr, lr // rbit(width + height)
|
||||
sub r3, r3, #20 // 25 leading bits, minus table offset 5
|
||||
sub r12, r12, #25
|
||||
clz lr, lr // ctz(width + height)
|
||||
ldr r3, [r5, r3, lsl #2]
|
||||
ldr r12, [r5, r12, lsl #2]
|
||||
neg lr, lr // -ctz(width + height)
|
||||
add r3, r5, r3
|
||||
add r5, r5, r12
|
||||
vshr.u16 q15, q15, #1 // (width + height) >> 1
|
||||
vdup.16 q14, lr // -ctz(width + height)
|
||||
add r12, r0, r1
|
||||
lsl r1, r1, #1
|
||||
bx r5
|
||||
|
||||
.align 2
|
||||
L(ipred_dc_tbl):
|
||||
.word L(ipred_dc_h64) - L(ipred_dc_tbl) + CONFIG_THUMB
|
||||
.word L(ipred_dc_h32) - L(ipred_dc_tbl) + CONFIG_THUMB
|
||||
.word L(ipred_dc_h16) - L(ipred_dc_tbl) + CONFIG_THUMB
|
||||
.word L(ipred_dc_h8) - L(ipred_dc_tbl) + CONFIG_THUMB
|
||||
.word L(ipred_dc_h4) - L(ipred_dc_tbl) + CONFIG_THUMB
|
||||
.word L(ipred_dc_w64) - L(ipred_dc_tbl) + CONFIG_THUMB
|
||||
.word L(ipred_dc_w32) - L(ipred_dc_tbl) + CONFIG_THUMB
|
||||
.word L(ipred_dc_w16) - L(ipred_dc_tbl) + CONFIG_THUMB
|
||||
.word L(ipred_dc_w8) - L(ipred_dc_tbl) + CONFIG_THUMB
|
||||
.word L(ipred_dc_w4) - L(ipred_dc_tbl) + CONFIG_THUMB
|
||||
|
||||
L(ipred_dc_h4):
|
||||
vld1.32 {d0[0]}, [r2]!
|
||||
vpaddl.u8 d0, d0
|
||||
vpadd.u16 d0, d0
|
||||
bx r3
|
||||
L(ipred_dc_w4):
|
||||
add r2, r2, #1
|
||||
vld1.32 {d1[0]}, [r2]
|
||||
vmov.32 d1[1], r6
|
||||
vadd.s16 d0, d0, d30
|
||||
vpaddl.u8 d1, d1
|
||||
vpadd.u16 d1, d1
|
||||
vpadd.u16 d1, d1
|
||||
cmp r4, #4
|
||||
vadd.s16 d0, d0, d1
|
||||
vshl.u16 d0, d0, d28
|
||||
beq 1f // h = 8/16
|
||||
movw lr, #(0x3334/2)
|
||||
movw r5, #(0x5556/2)
|
||||
cmp r4, #16
|
||||
it ne
|
||||
movne lr, r5
|
||||
vdup.16 d30, lr
|
||||
vqdmulh.s16 d0, d0, d30
|
||||
1:
|
||||
vdup.8 d0, d0[0]
|
||||
2:
|
||||
vst1.32 {d0[0]}, [r0, :32], r1
|
||||
vst1.32 {d0[0]}, [r12, :32], r1
|
||||
subs r4, r4, #4
|
||||
vst1.32 {d0[0]}, [r0, :32], r1
|
||||
vst1.32 {d0[0]}, [r12, :32], r1
|
||||
bgt 2b
|
||||
pop {r4-r6, pc}
|
||||
|
||||
L(ipred_dc_h8):
|
||||
vld1.8 {d0}, [r2]!
|
||||
vpaddl.u8 d0, d0
|
||||
vpadd.u16 d0, d0
|
||||
vpadd.u16 d0, d0
|
||||
bx r3
|
||||
L(ipred_dc_w8):
|
||||
add r2, r2, #1
|
||||
vld1.8 {d2}, [r2]
|
||||
vadd.s16 d0, d0, d30
|
||||
vpaddl.u8 d2, d2
|
||||
vpadd.u16 d2, d2
|
||||
vpadd.u16 d2, d2
|
||||
cmp r4, #8
|
||||
vadd.s16 d0, d0, d2
|
||||
vshl.u16 d0, d0, d28
|
||||
beq 1f // h = 4/16/32
|
||||
cmp r4, #32
|
||||
movw lr, #(0x3334/2)
|
||||
movw r5, #(0x5556/2)
|
||||
it ne
|
||||
movne lr, r5
|
||||
vdup.16 q12, lr
|
||||
vqdmulh.s16 d0, d0, d24
|
||||
1:
|
||||
vdup.8 d0, d0[0]
|
||||
2:
|
||||
vst1.8 {d0}, [r0, :64], r1
|
||||
vst1.8 {d0}, [r12, :64], r1
|
||||
subs r4, r4, #4
|
||||
vst1.8 {d0}, [r0, :64], r1
|
||||
vst1.8 {d0}, [r12, :64], r1
|
||||
bgt 2b
|
||||
pop {r4-r6, pc}
|
||||
|
||||
L(ipred_dc_h16):
|
||||
vld1.8 {d0, d1}, [r2]!
|
||||
vaddl.u8 q0, d0, d1
|
||||
vadd.u16 d0, d0, d1
|
||||
vpadd.u16 d0, d0
|
||||
vpadd.u16 d0, d0
|
||||
bx r3
|
||||
L(ipred_dc_w16):
|
||||
add r2, r2, #1
|
||||
vld1.8 {d2, d3}, [r2]
|
||||
vadd.s16 d0, d0, d30
|
||||
vaddl.u8 q1, d2, d3
|
||||
vadd.u16 d2, d2, d3
|
||||
vpadd.u16 d2, d2
|
||||
vpadd.u16 d2, d2
|
||||
cmp r4, #16
|
||||
vadd.s16 d0, d0, d2
|
||||
vshl.u16 d0, d0, d28
|
||||
beq 1f // h = 4/8/32/64
|
||||
tst r4, #(32+16+8) // 16 added to make a consecutive bitmask
|
||||
movw lr, #(0x3334/2)
|
||||
movw r5, #(0x5556/2)
|
||||
it ne
|
||||
movne lr, r5
|
||||
vdup.16 q12, lr
|
||||
vqdmulh.s16 d0, d0, d24
|
||||
1:
|
||||
vdup.8 q0, d0[0]
|
||||
2:
|
||||
vst1.8 {d0, d1}, [r0, :128], r1
|
||||
vst1.8 {d0, d1}, [r12, :128], r1
|
||||
subs r4, r4, #4
|
||||
vst1.8 {d0, d1}, [r0, :128], r1
|
||||
vst1.8 {d0, d1}, [r12, :128], r1
|
||||
bgt 2b
|
||||
pop {r4-r6, pc}
|
||||
|
||||
L(ipred_dc_h32):
|
||||
vld1.8 {d0, d1, d2, d3}, [r2]!
|
||||
vaddl.u8 q0, d0, d1
|
||||
vaddl.u8 q1, d2, d3
|
||||
vadd.u16 q0, q0, q1
|
||||
vadd.u16 d0, d0, d1
|
||||
vpadd.u16 d0, d0
|
||||
vpadd.u16 d0, d0
|
||||
bx r3
|
||||
L(ipred_dc_w32):
|
||||
add r2, r2, #1
|
||||
vld1.8 {d2, d3, d4, d5}, [r2]
|
||||
vadd.s16 d0, d0, d30
|
||||
vaddl.u8 q2, d4, d5
|
||||
vadd.u16 d4, d4, d5
|
||||
vaddl.u8 q1, d2, d3
|
||||
vadd.u16 d2, d2, d3
|
||||
vpadd.u16 d4, d4
|
||||
vpadd.u16 d2, d2
|
||||
vpadd.u16 d4, d4
|
||||
vpadd.u16 d2, d2
|
||||
cmp r4, #32
|
||||
vadd.s16 d0, d0, d4
|
||||
vadd.s16 d0, d0, d2
|
||||
vshl.u16 d4, d0, d28
|
||||
beq 1f // h = 8/16/64
|
||||
cmp r4, #8
|
||||
movw lr, #(0x3334/2)
|
||||
movw r5, #(0x5556/2)
|
||||
it ne
|
||||
movne lr, r5
|
||||
vdup.16 q12, lr
|
||||
vqdmulh.s16 d4, d4, d24
|
||||
1:
|
||||
vdup.8 q0, d4[0]
|
||||
vdup.8 q1, d4[0]
|
||||
2:
|
||||
vst1.8 {d0, d1, d2, d3}, [r0, :128], r1
|
||||
vst1.8 {d0, d1, d2, d3}, [r12, :128], r1
|
||||
subs r4, r4, #4
|
||||
vst1.8 {d0, d1, d2, d3}, [r0, :128], r1
|
||||
vst1.8 {d0, d1, d2, d3}, [r12, :128], r1
|
||||
bgt 2b
|
||||
pop {r4-r6, pc}
|
||||
|
||||
L(ipred_dc_h64):
|
||||
vld1.8 {d0, d1, d2, d3}, [r2]!
|
||||
vaddl.u8 q0, d0, d1
|
||||
vld1.8 {d4, d5, d6, d7}, [r2]!
|
||||
vaddl.u8 q1, d2, d3
|
||||
vaddl.u8 q2, d4, d5
|
||||
vaddl.u8 q3, d6, d7
|
||||
vadd.u16 q0, q0, q1
|
||||
vadd.u16 q1, q2, q3
|
||||
vadd.u16 q0, q0, q1
|
||||
vadd.u16 d0, d0, d1
|
||||
vpadd.u16 d0, d0
|
||||
vpadd.u16 d0, d0
|
||||
bx r3
|
||||
L(ipred_dc_w64):
|
||||
vmov.8 q1, q0
|
||||
vmov.8 q2, q0
|
||||
vmov.8 q3, q0
|
||||
2:
|
||||
add r2, r2, #1
|
||||
vld1.8 {d2, d3, d4, d5}, [r2]!
|
||||
vadd.s16 d0, d0, d30
|
||||
vaddl.u8 q2, d4, d5
|
||||
vaddl.u8 q1, d2, d3
|
||||
vadd.u16 d4, d4, d5
|
||||
vadd.u16 d2, d2, d3
|
||||
vld1.8 {d16, d17, d18, d19}, [r2]
|
||||
vpadd.u16 d4, d4
|
||||
vpadd.u16 d2, d2
|
||||
vpadd.u16 d4, d4
|
||||
vpadd.u16 d2, d2
|
||||
vaddl.u8 q8, d16, d17
|
||||
vaddl.u8 q9, d18, d19
|
||||
vadd.u16 d16, d16, d17
|
||||
vadd.u16 d18, d18, d19
|
||||
vpadd.u16 d16, d16
|
||||
vpadd.u16 d18, d18
|
||||
vpadd.u16 d16, d16
|
||||
vpadd.u16 d18, d18
|
||||
vadd.u16 d2, d2, d4
|
||||
vadd.u16 d3, d16, d18
|
||||
cmp r4, #64
|
||||
vadd.s16 d0, d0, d2
|
||||
vadd.s16 d0, d0, d3
|
||||
vshl.u16 d18, d0, d28
|
||||
beq 1f // h = 16/32
|
||||
movw lr, #(0x5556/2)
|
||||
movt lr, #(0x3334/2)
|
||||
mov r5, r4
|
||||
and r5, r5, #31
|
||||
lsr lr, lr, r5
|
||||
vdup.16 d30, lr
|
||||
vqdmulh.s16 d18, d18, d30
|
||||
1:
|
||||
sub r1, r1, #32
|
||||
vdup.8 q0, d18[0]
|
||||
vdup.8 q1, d18[0]
|
||||
vdup.8 q2, d18[0]
|
||||
vdup.8 q3, d18[0]
|
||||
2:
|
||||
vst1.8 {d0, d1, d2, d3}, [r0, :128]!
|
||||
vst1.8 {d0, d1, d2, d3}, [r12, :128]!
|
||||
vst1.8 {d4, d5, d6, d7}, [r0, :128], r1
|
||||
vst1.8 {d4, d5, d6, d7}, [r12, :128], r1
|
||||
subs r4, r4, #4
|
||||
vst1.8 {d0, d1, d2, d3}, [r0, :128]!
|
||||
vst1.8 {d0, d1, d2, d3}, [r12, :128]!
|
||||
vst1.8 {d4, d5, d6, d7}, [r0, :128], r1
|
||||
vst1.8 {d4, d5, d6, d7}, [r12, :128], r1
|
||||
bgt 2b
|
||||
pop {r4-r6, pc}
|
||||
endfunc
|
||||
|
third_party/dav1d/src/arm/32/loopfilter.S (vendored, new file, 868 lines added)

@@ -0,0 +1,868 @@
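The loop filter below starts every edge with the standard AV1 filter-mask test before branching into the wd4/wd6/wd8/wd16 variants. Here is a hedged scalar sketch of the wd == 4 mask that the first block of vabd/vcge instructions vectorizes; E and I correspond to the d10 and d11 thresholds set up in the lpf_*_sb_*_neon entry points, and the function name is illustrative.

```c
#include <stdlib.h>

/* Hedged sketch of the per-column filter mask 'fm' computed at the top of
 * lpf_8_wd4_neon: the edge is only filtered when the inner differences stay
 * within I and the across-edge step stays within E. */
static int filter_mask_wd4(int p1, int p0, int q0, int q1, int E, int I) {
    const int inner_ok = abs(p1 - p0) <= I && abs(q1 - q0) <= I;
    const int outer_ok = abs(p0 - q0) * 2 + (abs(p1 - q1) >> 1) <= E;
    return inner_ok && outer_ok;
}
```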
/*
|
||||
* Copyright © 2018, VideoLAN and dav1d authors
|
||||
* Copyright © 2019, Martin Storsjo
|
||||
* All rights reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright notice, this
|
||||
* list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
* this list of conditions and the following disclaimer in the documentation
|
||||
* and/or other materials provided with the distribution.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
|
||||
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
|
||||
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
|
||||
* ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
|
||||
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
|
||||
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
|
||||
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
#include "src/arm/asm.S"
|
||||
#include "util.S"
|
||||
|
||||
.macro loop_filter wd
|
||||
function lpf_8_wd\wd\()_neon
|
||||
vabd.u8 d0, d22, d23 // abs(p1 - p0)
|
||||
vabd.u8 d1, d25, d24 // abs(q1 - q0)
|
||||
vabd.u8 d2, d23, d24 // abs(p0 - q0)
|
||||
vabd.u8 d3, d22, d25 // abs(p1 - q1)
|
||||
.if \wd >= 6
|
||||
vabd.u8 d4, d21, d22 // abs(p2 - p1)
|
||||
vabd.u8 d5, d26, d25 // abs(q2 - q1)
|
||||
.endif
|
||||
.if \wd >= 8
|
||||
vabd.u8 d6, d20, d21 // abs(p3 - p2)
|
||||
vabd.u8 d7, d27, d26 // abs(q3 - q2)
|
||||
.endif
|
||||
.if \wd >= 6
|
||||
vmax.u8 d4, d4, d5
|
||||
.endif
|
||||
vqadd.u8 d2, d2, d2 // abs(p0 - q0) * 2
|
||||
.if \wd >= 8
|
||||
vmax.u8 d6, d6, d7
|
||||
.endif
|
||||
vshr.u8 d3, d3, #1
|
||||
.if \wd >= 8
|
||||
vmax.u8 d4, d4, d6
|
||||
.endif
|
||||
.if \wd >= 6
|
||||
vand d4, d4, d14
|
||||
.endif
|
||||
vmax.u8 d0, d0, d1 // max(abs(p1 - p0), abs(q1 - q0))
|
||||
vqadd.u8 d2, d2, d3 // abs(p0 - q0) * 2 + abs(p1 - q1) >> 1
|
||||
.if \wd >= 6
|
||||
vmax.u8 d4, d0, d4
|
||||
vcge.u8 d1, d11, d4 // max(abs(p1 - p0), abs(q1 - q0), abs(), abs(), ...) <= I
|
||||
.else
|
||||
vcge.u8 d1, d11, d0 // max(abs(p1 - p0), abs(q1 - q0)) <= I
|
||||
.endif
|
||||
vcge.u8 d2, d10, d2 // abs(p0 - q0) * 2 + abs(p1 - q1) >> 1 <= E
|
||||
vand d1, d1, d2 // fm
|
||||
vand d1, d1, d13 // fm && wd >= 4
|
||||
.if \wd >= 6
|
||||
vand d14, d14, d1 // fm && wd > 4
|
||||
.endif
|
||||
.if \wd >= 16
|
||||
vand d15, d15, d1 // fm && wd == 16
|
||||
.endif
|
||||
|
||||
vmov r10, r11, d1
|
||||
orrs r10, r10, r11
|
||||
beq 9f // if (!fm || wd < 4) return;
|
||||
|
||||
.if \wd >= 6
|
||||
vmov.i8 d10, #1
|
||||
vabd.u8 d2, d21, d23 // abs(p2 - p0)
|
||||
vabd.u8 d3, d22, d23 // abs(p1 - p0)
|
||||
vabd.u8 d4, d25, d24 // abs(q1 - q0)
|
||||
vabd.u8 d5, d26, d24 // abs(q2 - q0)
|
||||
.if \wd >= 8
|
||||
vabd.u8 d6, d20, d23 // abs(p3 - p0)
|
||||
vabd.u8 d7, d27, d24 // abs(q3 - q0)
|
||||
.endif
|
||||
vmax.u8 d2, d2, d3
|
||||
vmax.u8 d4, d4, d5
|
||||
.if \wd >= 8
|
||||
vmax.u8 d6, d6, d7
|
||||
.endif
|
||||
vmax.u8 d2, d2, d4
|
||||
.if \wd >= 8
|
||||
vmax.u8 d2, d2, d6
|
||||
.endif
|
||||
|
||||
.if \wd == 16
|
||||
vabd.u8 d3, d17, d23 // abs(p6 - p0)
|
||||
vabd.u8 d4, d18, d23 // abs(p5 - p0)
|
||||
vabd.u8 d5, d19, d23 // abs(p4 - p0)
|
||||
.endif
|
||||
vcge.u8 d2, d10, d2 // flat8in
|
||||
.if \wd == 16
|
||||
vabd.u8 d6, d28, d24 // abs(q4 - q0)
|
||||
vabd.u8 d7, d29, d24 // abs(q5 - q0)
|
||||
vabd.u8 d8, d30, d24 // abs(q6 - q0)
|
||||
.endif
|
||||
vand d14, d2, d14 // flat8in && fm && wd > 4
|
||||
vbic d1, d1, d14 // fm && wd >= 4 && !flat8in
|
||||
.if \wd == 16
|
||||
vmax.u8 d3, d3, d4
|
||||
vmax.u8 d5, d5, d6
|
||||
.endif
|
||||
vmov r10, r11, d1
|
||||
.if \wd == 16
|
||||
vmax.u8 d7, d7, d8
|
||||
vmax.u8 d3, d3, d5
|
||||
vmax.u8 d3, d3, d7
|
||||
vcge.u8 d3, d10, d3 // flat8out
|
||||
.endif
|
||||
orrs r10, r10, r11
|
||||
.if \wd == 16
|
||||
vand d15, d15, d3 // flat8out && fm && wd == 16
|
||||
vand d15, d15, d14 // flat8out && flat8in && fm && wd == 16
|
||||
vbic d14, d14, d15 // flat8in && fm && wd >= 4 && !flat8out
|
||||
.endif
|
||||
beq 1f // skip wd == 4 case
|
||||
.endif
|
||||
|
||||
vsubl.u8 q1, d22, d25 // p1 - q1
|
||||
vcgt.u8 d0, d0, d12 // hev
|
||||
vqmovn.s16 d2, q1
|
||||
vand d4, d2, d0 // if (hev) iclip_diff(p1 - q1)
|
||||
vbic d0, d1, d0 // (fm && wd >= 4 && !hev)
|
||||
vsubl.u8 q1, d24, d23
|
||||
vmov.i16 q3, #3
|
||||
vmul.i16 q1, q1, q3
|
||||
vmov.i8 d6, #4
|
||||
vaddw.s8 q1, q1, d4
|
||||
vmov.i8 d7, #3
|
||||
vqmovn.s16 d2, q1 // f
|
||||
vqadd.s8 d4, d6, d2 // imin(f + 4, 128)
|
||||
vqadd.s8 d5, d7, d2 // imin(f + 3, 128)
|
||||
vshr.s8 d4, d4, #3 // f1
|
||||
vshr.s8 d5, d5, #3 // f2
|
||||
vmovl.u8 q1, d23 // p0
|
||||
vmovl.u8 q3, d24 // q0
|
||||
vaddw.s8 q1, q1, d5
|
||||
vsubw.s8 q3, q3, d4
|
||||
vrshr.s8 d4, d4, #1 // (f1 + 1) >> 1
|
||||
vqmovun.s16 d2, q1 // out p0
|
||||
vqmovun.s16 d6, q3 // out q0
|
||||
vbit d23, d2, d1 // if (fm && wd >= 4)
|
||||
vmovl.u8 q1, d22 // p1
|
||||
vbit d24, d6, d1 // if (fm && wd >= 4)
|
||||
vmovl.u8 q3, d25 // q1
|
||||
vaddw.s8 q1, q1, d4
|
||||
vsubw.s8 q3, q3, d4
|
||||
vqmovun.s16 d2, q1 // out p1
|
||||
vqmovun.s16 d6, q3 // out q1
|
||||
vbit d22, d2, d0 // if (fm && wd >= 4 && !hev)
|
||||
vbit d25, d6, d0 // if (fm && wd >= 4 && !hev)
|
||||
1:
|
||||
|
||||
.if \wd == 6
|
||||
vmov r10, r11, d14
|
||||
orrs r10, r10, r11
|
||||
beq 2f // skip if there's no flat8in
|
||||
|
||||
vaddl.u8 q0, d21, d21 // p2 * 2
|
||||
vaddl.u8 q1, d21, d22 // p2 + p1
|
||||
vaddl.u8 q2, d22, d23 // p1 + p0
|
||||
vaddl.u8 q3, d23, d24 // p0 + q0
|
||||
vadd.i16 q4, q0, q1
|
||||
vadd.i16 q5, q2, q3
|
||||
vaddl.u8 q6, d24, d25 // q0 + q1
|
||||
vadd.i16 q4, q4, q5
|
||||
vsub.i16 q6, q6, q0
|
||||
vaddl.u8 q5, d25, d26 // q1 + q2
|
||||
vrshrn.i16 d0, q4, #3 // out p1
|
||||
|
||||
vadd.i16 q4, q4, q6
|
||||
vsub.i16 q5, q5, q1
|
||||
vaddl.u8 q6, d26, d26 // q2 + q2
|
||||
vrshrn.i16 d1, q4, #3 // out p0
|
||||
|
||||
vadd.i16 q4, q4, q5
|
||||
vsub.i16 q6, q6, q2
|
||||
vrshrn.i16 d2, q4, #3 // out q0
|
||||
|
||||
vbit d22, d0, d14 // p1 if (flat8in)
|
||||
vadd.i16 q4, q4, q6
|
||||
vbit d23, d1, d14 // p0 if (flat8in)
|
||||
vrshrn.i16 d3, q4, #3 // out q1
|
||||
vbit d24, d2, d14 // q0 if (flat8in)
|
||||
vbit d25, d3, d14 // q1 if (flat8in)
|
||||
.elseif \wd >= 8
|
||||
vmov r10, r11, d14
|
||||
orrs r10, r10, r11
|
||||
.if \wd == 8
|
||||
beq 8f // skip if there's no flat8in
|
||||
.else
|
||||
beq 2f // skip if there's no flat8in
|
||||
.endif
|
||||
|
||||
vaddl.u8 q0, d20, d21 // p3 + p2
|
||||
vaddl.u8 q1, d22, d25 // p1 + q1
|
||||
vaddl.u8 q2, d20, d22 // p3 + p1
|
||||
vaddl.u8 q3, d23, d26 // p0 + q2
|
||||
vadd.i16 q4, q0, q0 // 2 * (p3 + p2)
|
||||
vaddw.u8 q4, q4, d23 // + p0
|
||||
vaddw.u8 q4, q4, d24 // + q0
|
||||
vadd.i16 q4, q4, q2 // + p3 + p1
|
||||
vsub.i16 q1, q1, q0 // p1 + q1 - p3 - p2
|
||||
vsub.i16 q3, q3, q2 // p0 + q2 - p3 - p1
|
||||
vrshrn.i16 d10, q4, #3 // out p2
|
||||
|
||||
vadd.i16 q4, q4, q1
|
||||
vaddl.u8 q0, d20, d23 // p3 + p0
|
||||
vaddl.u8 q1, d24, d27 // q0 + q3
|
||||
vrshrn.i16 d11, q4, #3 // out p1
|
||||
|
||||
vadd.i16 q4, q4, q3
|
||||
vsub.i16 q1, q1, q0 // q0 + q3 - p3 - p0
|
||||
vaddl.u8 q2, d21, d24 // p2 + q0
|
||||
vaddl.u8 q3, d25, d27 // q1 + q3
|
||||
vrshrn.i16 d12, q4, #3 // out p0
|
||||
|
||||
vadd.i16 q4, q4, q1
|
||||
vsub.i16 q3, q3, q2 // q1 + q3 - p2 - q0
|
||||
vaddl.u8 q0, d22, d25 // p1 + q1
|
||||
vaddl.u8 q1, d26, d27 // q2 + q3
|
||||
vrshrn.i16 d13, q4, #3 // out q0
|
||||
|
||||
vadd.i16 q4, q4, q3
|
||||
vsub.i16 q1, q1, q0 // q2 + q3 - p1 - q1
|
||||
vrshrn.i16 d0, q4, #3 // out q1
|
||||
|
||||
vadd.i16 q4, q4, q1
|
||||
|
||||
vbit d21, d10, d14
|
||||
vbit d22, d11, d14
|
||||
vbit d23, d12, d14
|
||||
vrshrn.i16 d1, q4, #3 // out q2
|
||||
vbit d24, d13, d14
|
||||
vbit d25, d0, d14
|
||||
vbit d26, d1, d14
|
||||
.endif
|
||||
2:
|
||||
.if \wd == 16
|
||||
vmov r10, r11, d15
|
||||
orrs r10, r10, r11
|
||||
bne 1f // check if flat8out is needed
|
||||
vmov r10, r11, d14
|
||||
orrs r10, r10, r11
|
||||
beq 8f // if there was no flat8in, just write the inner 4 pixels
|
||||
b 7f // if flat8in was used, write the inner 6 pixels
|
||||
1:
|
||||
|
||||
vaddl.u8 q1, d17, d17 // p6 + p6
|
||||
vaddl.u8 q2, d17, d18 // p6 + p5
|
||||
vaddl.u8 q3, d17, d19 // p6 + p4
|
||||
vaddl.u8 q4, d17, d20 // p6 + p3
|
||||
vadd.i16 q6, q1, q2
|
||||
vadd.i16 q5, q3, q4
|
||||
vaddl.u8 q3, d17, d21 // p6 + p2
|
||||
vadd.i16 q6, q6, q5
|
||||
vaddl.u8 q4, d17, d22 // p6 + p1
|
||||
vaddl.u8 q5, d18, d23 // p5 + p0
|
||||
vadd.i16 q3, q3, q4
|
||||
vaddl.u8 q4, d19, d24 // p4 + q0
|
||||
vadd.i16 q6, q6, q3
|
||||
vadd.i16 q5, q5, q4
|
||||
vaddl.u8 q3, d20, d25 // p3 + q1
|
||||
vadd.i16 q6, q6, q5
|
||||
vsub.i16 q3, q3, q1
|
||||
vaddl.u8 q1, d21, d26 // p2 + q2
|
||||
vrshrn.i16 d0, q6, #4 // out p5
|
||||
vadd.i16 q6, q6, q3 // - (p6 + p6) + (p3 + q1)
|
||||
vsub.i16 q1, q1, q2
|
||||
vaddl.u8 q2, d22, d27 // p1 + q3
|
||||
vaddl.u8 q3, d17, d19 // p6 + p4
|
||||
vrshrn.i16 d1, q6, #4 // out p4
|
||||
vadd.i16 q6, q6, q1 // - (p6 + p5) + (p2 + q2)
|
||||
vsub.i16 q2, q2, q3
|
||||
vaddl.u8 q3, d23, d28 // p0 + q4
|
||||
vaddl.u8 q4, d17, d20 // p6 + p3
|
||||
vrshrn.i16 d2, q6, #4 // out p3
|
||||
vadd.i16 q6, q6, q2 // - (p6 + p4) + (p1 + q3)
|
||||
vsub.i16 q3, q3, q4
|
||||
vaddl.u8 q4, d24, d29 // q0 + q5
|
||||
vaddl.u8 q2, d17, d21 // p6 + p2
|
||||
vrshrn.i16 d3, q6, #4 // out p2
|
||||
vadd.i16 q6, q6, q3 // - (p6 + p3) + (p0 + q4)
|
||||
vsub.i16 q4, q4, q2
|
||||
vaddl.u8 q3, d25, d30 // q1 + q6
|
||||
vaddl.u8 q5, d17, d22 // p6 + p1
|
||||
vrshrn.i16 d4, q6, #4 // out p1
|
||||
vadd.i16 q6, q6, q4 // - (p6 + p2) + (q0 + q5)
|
||||
vsub.i16 q3, q3, q5
|
||||
vaddl.u8 q4, d26, d30 // q2 + q6
|
||||
vbif d0, d18, d15 // out p5
|
||||
vaddl.u8 q5, d18, d23 // p5 + p0
|
||||
vrshrn.i16 d5, q6, #4 // out p0
|
||||
vadd.i16 q6, q6, q3 // - (p6 + p1) + (q1 + q6)
|
||||
vsub.i16 q4, q4, q5
|
||||
vaddl.u8 q5, d27, d30 // q3 + q6
|
||||
vbif d1, d19, d15 // out p4
|
||||
vaddl.u8 q9, d19, d24 // p4 + q0
|
||||
vrshrn.i16 d6, q6, #4 // out q0
|
||||
vadd.i16 q6, q6, q4 // - (p5 + p0) + (q2 + q6)
|
||||
vsub.i16 q5, q5, q9
|
||||
vaddl.u8 q4, d28, d30 // q4 + q6
|
||||
vbif d2, d20, d15 // out p3
|
||||
vaddl.u8 q9, d20, d25 // p3 + q1
|
||||
vrshrn.i16 d7, q6, #4 // out q1
|
||||
vadd.i16 q6, q6, q5 // - (p4 + q0) + (q3 + q6)
|
||||
vsub.i16 q9, q4, q9
|
||||
vaddl.u8 q5, d29, d30 // q5 + q6
|
||||
vbif d3, d21, d15 // out p2
|
||||
vaddl.u8 q10, d21, d26 // p2 + q2
|
||||
vrshrn.i16 d8, q6, #4 // out q2
|
||||
vadd.i16 q6, q6, q9 // - (p3 + q1) + (q4 + q6)
|
||||
vsub.i16 q5, q5, q10
|
||||
vaddl.u8 q9, d30, d30 // q6 + q6
|
||||
vbif d4, d22, d15 // out p1
|
||||
vaddl.u8 q10, d22, d27 // p1 + q3
|
||||
vrshrn.i16 d9, q6, #4 // out q3
|
||||
vadd.i16 q6, q6, q5 // - (p2 + q2) + (q5 + q6)
|
||||
vsub.i16 q9, q9, q10
|
||||
vbif d5, d23, d15 // out p0
|
||||
vrshrn.i16 d10, q6, #4 // out q4
|
||||
vadd.i16 q6, q6, q9 // - (p1 + q3) + (q6 + q6)
|
||||
vrshrn.i16 d11, q6, #4 // out q5
|
||||
vbif d6, d24, d15 // out q0
|
||||
vbif d7, d25, d15 // out q1
|
||||
vbif d8, d26, d15 // out q2
|
||||
vbif d9, d27, d15 // out q3
|
||||
vbif d10, d28, d15 // out q4
|
||||
vbif d11, d29, d15 // out q5
|
||||
.endif
|
||||
|
||||
bx lr
|
||||
.if \wd == 16
|
||||
7:
|
||||
// Return to a shorter epilogue, writing only the inner 6 pixels
|
||||
bx r8
|
||||
.endif
|
||||
.if \wd >= 8
|
||||
8:
|
||||
// Return to a shorter epilogue, writing only the inner 4 pixels
|
||||
bx r9
|
||||
.endif
|
||||
9:
|
||||
// Return directly without writing back any pixels
|
||||
bx r12
|
||||
endfunc
|
||||
.endm
|
||||
|
||||
loop_filter 16
|
||||
loop_filter 8
|
||||
loop_filter 6
|
||||
loop_filter 4
|
||||
|
||||
.macro lpf_8_wd16
|
||||
adr r8, 7f + CONFIG_THUMB
|
||||
adr r9, 8f + CONFIG_THUMB
|
||||
bl lpf_8_wd16_neon
|
||||
.endm
|
||||
|
||||
.macro lpf_8_wd8
|
||||
adr r9, 8f + CONFIG_THUMB
|
||||
bl lpf_8_wd8_neon
|
||||
.endm
|
||||
|
||||
.macro lpf_8_wd6
|
||||
bl lpf_8_wd6_neon
|
||||
.endm
|
||||
|
||||
.macro lpf_8_wd4
|
||||
bl lpf_8_wd4_neon
|
||||
.endm
|
||||
|
||||
function lpf_v_4_8_neon
|
||||
mov r12, lr
|
||||
sub r10, r0, r1, lsl #1
|
||||
vld1.8 {d22}, [r10, :64], r1 // p1
|
||||
vld1.8 {d24}, [r0, :64], r1 // q0
|
||||
vld1.8 {d23}, [r10, :64], r1 // p0
|
||||
vld1.8 {d25}, [r0, :64], r1 // q1
|
||||
sub r0, r0, r1, lsl #1
|
||||
|
||||
lpf_8_wd4
|
||||
|
||||
sub r10, r0, r1, lsl #1
|
||||
vst1.8 {d22}, [r10, :64], r1 // p1
|
||||
vst1.8 {d24}, [r0, :64], r1 // q0
|
||||
vst1.8 {d23}, [r10, :64], r1 // p0
|
||||
vst1.8 {d25}, [r0, :64], r1 // q1
|
||||
sub r0, r0, r1, lsl #1
|
||||
bx r12
|
||||
endfunc
|
||||
|
||||
function lpf_h_4_8_neon
|
||||
mov r12, lr
|
||||
sub r10, r0, #2
|
||||
add r0, r10, r1, lsl #2
|
||||
vld1.32 {d22[0]}, [r10], r1
|
||||
vld1.32 {d22[1]}, [r0], r1
|
||||
vld1.32 {d23[0]}, [r10], r1
|
||||
vld1.32 {d23[1]}, [r0], r1
|
||||
vld1.32 {d24[0]}, [r10], r1
|
||||
vld1.32 {d24[1]}, [r0], r1
|
||||
vld1.32 {d25[0]}, [r10], r1
|
||||
vld1.32 {d25[1]}, [r0], r1
|
||||
add r0, r0, #2
|
||||
|
||||
transpose_4x8b q11, q12, d22, d23, d24, d25
|
||||
|
||||
lpf_8_wd4
|
||||
|
||||
sub r10, r0, r1, lsl #3
|
||||
sub r10, r10, #2
|
||||
transpose_4x8b q11, q12, d22, d23, d24, d25
|
||||
add r0, r10, r1, lsl #2
|
||||
|
||||
vst1.32 {d22[0]}, [r10], r1
|
||||
vst1.32 {d22[1]}, [r0], r1
|
||||
vst1.32 {d23[0]}, [r10], r1
|
||||
vst1.32 {d23[1]}, [r0], r1
|
||||
vst1.32 {d24[0]}, [r10], r1
|
||||
vst1.32 {d24[1]}, [r0], r1
|
||||
vst1.32 {d25[0]}, [r10], r1
|
||||
vst1.32 {d25[1]}, [r0], r1
|
||||
add r0, r0, #2
|
||||
bx r12
|
||||
endfunc
|
||||
|
||||
function lpf_v_6_8_neon
|
||||
mov r12, lr
|
||||
sub r10, r0, r1, lsl #1
|
||||
sub r10, r10, r1
|
||||
vld1.8 {d21}, [r10, :64], r1 // p2
|
||||
vld1.8 {d24}, [r0, :64], r1 // q0
|
||||
vld1.8 {d22}, [r10, :64], r1 // p1
|
||||
vld1.8 {d25}, [r0, :64], r1 // q1
|
||||
vld1.8 {d23}, [r10, :64], r1 // p0
|
||||
vld1.8 {d26}, [r0, :64], r1 // q2
|
||||
sub r0, r0, r1, lsl #1
|
||||
sub r0, r0, r1
|
||||
|
||||
lpf_8_wd6
|
||||
|
||||
sub r10, r0, r1, lsl #1
|
||||
vst1.8 {d22}, [r10, :64], r1 // p1
|
||||
vst1.8 {d24}, [r0, :64], r1 // q0
|
||||
vst1.8 {d23}, [r10, :64], r1 // p0
|
||||
vst1.8 {d25}, [r0, :64], r1 // q1
|
||||
sub r0, r0, r1, lsl #1
|
||||
bx r12
|
||||
endfunc
|
||||
|
||||
function lpf_h_6_8_neon
|
||||
mov r12, lr
|
||||
sub r10, r0, #4
|
||||
add r0, r10, r1, lsl #2
|
||||
vld1.8 {d20}, [r10], r1
|
||||
vld1.8 {d24}, [r0], r1
|
||||
vld1.8 {d21}, [r10], r1
|
||||
vld1.8 {d25}, [r0], r1
|
||||
vld1.8 {d22}, [r10], r1
|
||||
vld1.8 {d26}, [r0], r1
|
||||
vld1.8 {d23}, [r10], r1
|
||||
vld1.8 {d27}, [r0], r1
|
||||
add r0, r0, #4
|
||||
|
||||
transpose_8x8b q10, q11, q12, q13, d20, d21, d22, d23, d24, d25, d26, d27
|
||||
|
||||
lpf_8_wd6
|
||||
|
||||
sub r10, r0, r1, lsl #3
|
||||
sub r10, r10, #2
|
||||
transpose_4x8b q11, q12, d22, d23, d24, d25
|
||||
add r0, r10, r1, lsl #2
|
||||
|
||||
vst1.32 {d22[0]}, [r10], r1
|
||||
vst1.32 {d22[1]}, [r0], r1
|
||||
vst1.32 {d23[0]}, [r10], r1
|
||||
vst1.32 {d23[1]}, [r0], r1
|
||||
vst1.32 {d24[0]}, [r10], r1
|
||||
vst1.32 {d24[1]}, [r0], r1
|
||||
vst1.32 {d25[0]}, [r10], r1
|
||||
vst1.32 {d25[1]}, [r0], r1
|
||||
add r0, r0, #2
|
||||
bx r12
|
||||
endfunc
|
||||
|
||||
function lpf_v_8_8_neon
|
||||
mov r12, lr
|
||||
sub r10, r0, r1, lsl #2
|
||||
vld1.8 {d20}, [r10, :64], r1 // p3
|
||||
vld1.8 {d24}, [r0, :64], r1 // q0
|
||||
vld1.8 {d21}, [r10, :64], r1 // p2
|
||||
vld1.8 {d25}, [r0, :64], r1 // q1
|
||||
vld1.8 {d22}, [r10, :64], r1 // p1
|
||||
vld1.8 {d26}, [r0, :64], r1 // q2
|
||||
vld1.8 {d23}, [r10, :64], r1 // p0
|
||||
vld1.8 {d27}, [r0, :64], r1 // q3
|
||||
sub r0, r0, r1, lsl #2
|
||||
|
||||
lpf_8_wd8
|
||||
|
||||
sub r10, r0, r1, lsl #1
|
||||
sub r10, r10, r1
|
||||
vst1.8 {d21}, [r10, :64], r1 // p2
|
||||
vst1.8 {d24}, [r0, :64], r1 // q0
|
||||
vst1.8 {d22}, [r10, :64], r1 // p1
|
||||
vst1.8 {d25}, [r0, :64], r1 // q1
|
||||
vst1.8 {d23}, [r10, :64], r1 // p0
|
||||
vst1.8 {d26}, [r0, :64], r1 // q2
|
||||
sub r0, r0, r1, lsl #1
|
||||
sub r0, r0, r1
|
||||
bx r12
|
||||
|
||||
8:
|
||||
sub r10, r0, r1, lsl #1
|
||||
vst1.8 {d22}, [r10, :64], r1 // p1
|
||||
vst1.8 {d24}, [r0, :64], r1 // q0
|
||||
vst1.8 {d23}, [r10, :64], r1 // p0
|
||||
vst1.8 {d25}, [r0, :64], r1 // q1
|
||||
sub r0, r0, r1, lsl #1
|
||||
bx r12
|
||||
endfunc
|
||||
|
||||
function lpf_h_8_8_neon
|
||||
mov r12, lr
|
||||
sub r10, r0, #4
|
||||
add r0, r10, r1, lsl #2
|
||||
vld1.8 {d20}, [r10], r1
|
||||
vld1.8 {d24}, [r0], r1
|
||||
vld1.8 {d21}, [r10], r1
|
||||
vld1.8 {d25}, [r0], r1
|
||||
vld1.8 {d22}, [r10], r1
|
||||
vld1.8 {d26}, [r0], r1
|
||||
vld1.8 {d23}, [r10], r1
|
||||
vld1.8 {d27}, [r0], r1
|
||||
add r0, r0, #4
|
||||
|
||||
transpose_8x8b q10, q11, q12, q13, d20, d21, d22, d23, d24, d25, d26, d27
|
||||
|
||||
lpf_8_wd8
|
||||
|
||||
sub r10, r0, r1, lsl #3
|
||||
sub r10, r10, #4
|
||||
transpose_8x8b q10, q11, q12, q13, d20, d21, d22, d23, d24, d25, d26, d27
|
||||
add r0, r10, r1, lsl #2
|
||||
|
||||
vst1.8 {d20}, [r10], r1
|
||||
vst1.8 {d24}, [r0], r1
|
||||
vst1.8 {d21}, [r10], r1
|
||||
vst1.8 {d25}, [r0], r1
|
||||
vst1.8 {d22}, [r10], r1
|
||||
vst1.8 {d26}, [r0], r1
|
||||
vst1.8 {d23}, [r10], r1
|
||||
vst1.8 {d27}, [r0], r1
|
||||
add r0, r0, #4
|
||||
bx r12
|
||||
8:
|
||||
sub r10, r0, r1, lsl #3
|
||||
sub r10, r10, #2
|
||||
transpose_4x8b q11, q12, d22, d23, d24, d25
|
||||
add r0, r10, r1, lsl #2
|
||||
|
||||
vst1.32 {d22[0]}, [r10], r1
|
||||
vst1.32 {d22[1]}, [r0], r1
|
||||
vst1.32 {d23[0]}, [r10], r1
|
||||
vst1.32 {d23[1]}, [r0], r1
|
||||
vst1.32 {d24[0]}, [r10], r1
|
||||
vst1.32 {d24[1]}, [r0], r1
|
||||
vst1.32 {d25[0]}, [r10], r1
|
||||
vst1.32 {d25[1]}, [r0], r1
|
||||
add r0, r0, #2
|
||||
bx r12
|
||||
endfunc
|
||||
|
||||
function lpf_v_16_8_neon
|
||||
mov r12, lr
|
||||
|
||||
sub r10, r0, r1, lsl #3
|
||||
add r10, r10, r1
|
||||
vld1.8 {d17}, [r10, :64], r1 // p6
|
||||
vld1.8 {d24}, [r0, :64], r1 // q0
|
||||
vld1.8 {d18}, [r10, :64], r1 // p5
|
||||
vld1.8 {d25}, [r0, :64], r1 // q1
|
||||
vld1.8 {d19}, [r10, :64], r1 // p4
|
||||
vld1.8 {d26}, [r0, :64], r1 // q2
|
||||
vld1.8 {d20}, [r10, :64], r1 // p3
|
||||
vld1.8 {d27}, [r0, :64], r1 // q3
|
||||
vld1.8 {d21}, [r10, :64], r1 // p2
|
||||
vld1.8 {d28}, [r0, :64], r1 // q4
|
||||
vld1.8 {d22}, [r10, :64], r1 // p1
|
||||
vld1.8 {d29}, [r0, :64], r1 // q5
|
||||
vld1.8 {d23}, [r10, :64], r1 // p0
|
||||
vld1.8 {d30}, [r0, :64], r1 // q6
|
||||
sub r0, r0, r1, lsl #3
|
||||
add r0, r0, r1
|
||||
|
||||
lpf_8_wd16
|
||||
|
||||
sub r10, r0, r1, lsl #2
|
||||
sub r10, r10, r1, lsl #1
|
||||
vst1.8 {d0}, [r10, :64], r1 // p5
|
||||
vst1.8 {d6}, [r0, :64], r1 // q0
|
||||
vst1.8 {d1}, [r10, :64], r1 // p4
|
||||
vst1.8 {d7}, [r0, :64], r1 // q1
|
||||
vst1.8 {d2}, [r10, :64], r1 // p3
|
||||
vst1.8 {d8}, [r0, :64], r1 // q2
|
||||
vst1.8 {d3}, [r10, :64], r1 // p2
|
||||
vst1.8 {d9}, [r0, :64], r1 // q3
|
||||
vst1.8 {d4}, [r10, :64], r1 // p1
|
||||
vst1.8 {d10}, [r0, :64], r1 // q4
|
||||
vst1.8 {d5}, [r10, :64], r1 // p0
|
||||
vst1.8 {d11}, [r0, :64], r1 // q5
|
||||
sub r0, r0, r1, lsl #2
|
||||
sub r0, r0, r1, lsl #1
|
||||
bx r12
|
||||
7:
|
||||
sub r10, r0, r1
|
||||
sub r10, r10, r1, lsl #1
|
||||
vst1.8 {d21}, [r10, :64], r1 // p2
|
||||
vst1.8 {d24}, [r0, :64], r1 // q0
|
||||
vst1.8 {d22}, [r10, :64], r1 // p1
|
||||
vst1.8 {d25}, [r0, :64], r1 // q1
|
||||
vst1.8 {d23}, [r10, :64], r1 // p0
|
||||
vst1.8 {d26}, [r0, :64], r1 // q2
|
||||
sub r0, r0, r1, lsl #1
|
||||
sub r0, r0, r1
|
||||
bx r12
|
||||
|
||||
8:
|
||||
sub r10, r0, r1, lsl #1
|
||||
vst1.8 {d22}, [r10, :64], r1 // p1
|
||||
vst1.8 {d24}, [r0, :64], r1 // q0
|
||||
vst1.8 {d23}, [r10, :64], r1 // p0
|
||||
vst1.8 {d25}, [r0, :64], r1 // q1
|
||||
sub r0, r0, r1, lsl #1
|
||||
bx r12
|
||||
endfunc
|
||||
|
||||
function lpf_h_16_8_neon
|
||||
mov r12, lr
|
||||
sub r10, r0, #8
|
||||
vld1.8 {d16}, [r10, :64], r1
|
||||
vld1.8 {d24}, [r0, :64], r1
|
||||
vld1.8 {d17}, [r10, :64], r1
|
||||
vld1.8 {d25}, [r0, :64], r1
|
||||
vld1.8 {d18}, [r10, :64], r1
|
||||
vld1.8 {d26}, [r0, :64], r1
|
||||
vld1.8 {d19}, [r10, :64], r1
|
||||
vld1.8 {d27}, [r0, :64], r1
|
||||
vld1.8 {d20}, [r10, :64], r1
|
||||
vld1.8 {d28}, [r0, :64], r1
|
||||
vld1.8 {d21}, [r10, :64], r1
|
||||
vld1.8 {d29}, [r0, :64], r1
|
||||
vld1.8 {d22}, [r10, :64], r1
|
||||
vld1.8 {d30}, [r0, :64], r1
|
||||
vld1.8 {d23}, [r10, :64], r1
|
||||
vld1.8 {d31}, [r0, :64], r1
|
||||
|
||||
transpose_8x8b q8, q9, q10, q11, d16, d17, d18, d19, d20, d21, d22, d23
|
||||
transpose_8x8b q12, q13, q14, q15, d24, d25, d26, d27, d28, d29, d30, d31
|
||||
|
||||
lpf_8_wd16
|
||||
|
||||
sub r0, r0, r1, lsl #3
|
||||
sub r10, r0, #8
|
||||
|
||||
transpose_8x8b q8, q0, q1, q2, d16, d17, d0, d1, d2, d3, d4, d5
|
||||
transpose_8x8b q3, q4, q5, q15, d6, d7, d8, d9, d10, d11, d30, d31
|
||||
|
||||
vst1.8 {d16}, [r10, :64], r1
|
||||
vst1.8 {d6}, [r0, :64], r1
|
||||
vst1.8 {d17}, [r10, :64], r1
|
||||
vst1.8 {d7}, [r0, :64], r1
|
||||
vst1.8 {d0}, [r10, :64], r1
|
||||
vst1.8 {d8}, [r0, :64], r1
|
||||
vst1.8 {d1}, [r10, :64], r1
|
||||
vst1.8 {d9}, [r0, :64], r1
|
||||
vst1.8 {d2}, [r10, :64], r1
|
||||
vst1.8 {d10}, [r0, :64], r1
|
||||
vst1.8 {d3}, [r10, :64], r1
|
||||
vst1.8 {d11}, [r0, :64], r1
|
||||
vst1.8 {d4}, [r10, :64], r1
|
||||
vst1.8 {d30}, [r0, :64], r1
|
||||
vst1.8 {d5}, [r10, :64], r1
|
||||
vst1.8 {d31}, [r0, :64], r1
|
||||
bx r12
|
||||
|
||||
7:
|
||||
sub r10, r0, r1, lsl #3
|
||||
sub r10, r10, #4
|
||||
transpose_8x8b q10, q11, q12, q13, d20, d21, d22, d23, d24, d25, d26, d27
|
||||
add r0, r10, r1, lsl #2
|
||||
|
||||
vst1.8 {d20}, [r10], r1
|
||||
vst1.8 {d24}, [r0], r1
|
||||
vst1.8 {d21}, [r10], r1
|
||||
vst1.8 {d25}, [r0], r1
|
||||
vst1.8 {d22}, [r10], r1
|
||||
vst1.8 {d26}, [r0], r1
|
||||
vst1.8 {d23}, [r10], r1
|
||||
vst1.8 {d27}, [r0], r1
|
||||
add r0, r0, #4
|
||||
bx r12
|
||||
8:
|
||||
sub r10, r0, r1, lsl #3
|
||||
sub r10, r10, #2
|
||||
transpose_4x8b q11, q12, d22, d23, d24, d25
|
||||
add r0, r10, r1, lsl #2
|
||||
|
||||
vst1.32 {d22[0]}, [r10], r1
|
||||
vst1.32 {d22[1]}, [r0], r1
|
||||
vst1.32 {d23[0]}, [r10], r1
|
||||
vst1.32 {d23[1]}, [r0], r1
|
||||
vst1.32 {d24[0]}, [r10], r1
|
||||
vst1.32 {d24[1]}, [r0], r1
|
||||
vst1.32 {d25[0]}, [r10], r1
|
||||
vst1.32 {d25[1]}, [r0], r1
|
||||
add r0, r0, #2
|
||||
bx r12
|
||||
endfunc
|
||||
|
||||
// void dav1d_lpf_v_sb_y_neon(pixel *dst, const ptrdiff_t stride,
|
||||
// const uint32_t *const vmask,
|
||||
// const uint8_t (*l)[4], ptrdiff_t b4_stride,
|
||||
// const Av1FilterLUT *lut, const int w)
|
||||
|
||||
.macro lpf_func dir, type
|
||||
function lpf_\dir\()_sb_\type\()_neon, export=1
|
||||
push {r4-r11,lr}
|
||||
vpush {q4-q7}
|
||||
ldrd r4, r5, [sp, #100]
|
||||
ldrd r6, r7, [r2] // vmask[0], vmask[1]
|
||||
.ifc \type, y
|
||||
ldr r2, [r2, #8] // vmask[2]
|
||||
.endif
|
||||
add r5, r5, #128 // Move to sharp part of lut
|
||||
.ifc \type, y
|
||||
orr r7, r7, r2 // vmask[1] |= vmask[2]
|
||||
.endif
|
||||
.ifc \dir, v
|
||||
sub r4, r3, r4, lsl #2
|
||||
.else
|
||||
sub r3, r3, #4
|
||||
lsl r4, r4, #2
|
||||
.endif
|
||||
orr r6, r6, r7 // vmask[0] |= vmask[1]
|
||||
|
||||
1:
|
||||
tst r6, #0x03
|
||||
.ifc \dir, v
|
||||
vld1.8 {d0}, [r4]!
|
||||
vld1.8 {d1}, [r3]!
|
||||
.else
|
||||
vld2.32 {d0[0], d1[0]}, [r3], r4
|
||||
vld2.32 {d0[1], d1[1]}, [r3], r4
|
||||
.endif
|
||||
beq 7f // if (!(vm & bits)) continue;
|
||||
|
||||
vld1.8 {d5[]}, [r5] // sharp[0]
|
||||
add r5, r5, #8
|
||||
vmov.i32 d2, #0xff
|
||||
vdup.32 d13, r6 // vmask[0]
|
||||
|
||||
vand d0, d0, d2 // Keep only lowest byte in each 32 bit word
|
||||
vand d1, d1, d2
|
||||
vtst.8 d3, d1, d2 // Check for nonzero values in l[0][0]
|
||||
vmov.i8 d4, #1
|
||||
vld1.8 {d6[]}, [r5] // sharp[1]
|
||||
sub r5, r5, #8
|
||||
vbif d1, d0, d3 // if (!l[0][0]) L = l[offset][0]
|
||||
vmul.i32 d1, d1, d4 // L
|
||||
.ifc \type, y
|
||||
vdup.32 d15, r2 // vmask[2]
|
||||
.endif
|
||||
vtst.32 d2, d1, d2 // L != 0
|
||||
vdup.32 d14, r7 // vmask[1]
|
||||
vmov r10, r11, d2
|
||||
orrs r10, r10, r11
|
||||
beq 7f // if (!L) continue;
|
||||
vneg.s8 d5, d5 // -sharp[0]
|
||||
movrel_local r10, word_12
|
||||
vshr.u8 d12, d1, #4 // H
|
||||
vld1.32 {d16}, [r10, :64]
|
||||
vshl.s8 d3, d1, d5 // L >> sharp[0]
|
||||
.ifc \type, y
|
||||
vtst.32 d15, d15, d16 // if (vmask[2] & bits)
|
||||
.endif
|
||||
vmov.i8 d7, #2
|
||||
vmin.u8 d3, d3, d6 // imin(L >> sharp[0], sharp[1])
|
||||
vadd.i8 d0, d1, d7 // L + 2
|
||||
vmax.u8 d11, d3, d4 // imax(imin(), 1) = limit = I
|
||||
vadd.u8 d0, d0, d0 // 2*(L + 2)
|
||||
vtst.32 d14, d14, d16 // if (vmask[1] & bits)
|
||||
vadd.i8 d10, d0, d11 // 2*(L + 2) + limit = E
|
||||
vtst.32 d13, d13, d16 // if (vmask[0] & bits)
|
||||
vand d13, d13, d2 // vmask[0] &= L != 0
|
||||
|
||||
.ifc \type, y
|
||||
tst r2, #0x03
|
||||
beq 2f
|
||||
// wd16
|
||||
bl lpf_\dir\()_16_8_neon
|
||||
b 8f
|
||||
2:
|
||||
.endif
|
||||
tst r7, #0x03
|
||||
beq 3f
|
||||
.ifc \type, y
|
||||
// wd8
|
||||
bl lpf_\dir\()_8_8_neon
|
||||
.else
|
||||
// wd6
|
||||
bl lpf_\dir\()_6_8_neon
|
||||
.endif
|
||||
b 8f
|
||||
3:
|
||||
// wd4
|
||||
bl lpf_\dir\()_4_8_neon
|
||||
.ifc \dir, h
|
||||
b 8f
|
||||
7:
|
||||
// For dir h, the functions above increment r0.
|
||||
// If the whole function is skipped, increment it here instead.
|
||||
add r0, r0, r1, lsl #3
|
||||
.else
|
||||
7:
|
||||
.endif
|
||||
8:
|
||||
lsrs r6, r6, #2 // vmask[0] >>= 2
|
||||
lsr r7, r7, #2 // vmask[1] >>= 2
|
||||
.ifc \type, y
|
||||
lsr r2, r2, #2 // vmask[2] >>= 2
|
||||
.endif
|
||||
.ifc \dir, v
|
||||
add r0, r0, #8
|
||||
.else
|
||||
// For dir h, r0 is returned incremented
|
||||
.endif
|
||||
bne 1b
|
||||
|
||||
vpop {q4-q7}
|
||||
pop {r4-r11,pc}
|
||||
endfunc
|
||||
.endm
|
||||
|
||||
lpf_func v, y
|
||||
lpf_func h, y
|
||||
lpf_func v, uv
|
||||
lpf_func h, uv
|
||||
|
||||
const word_12, align=4
|
||||
.word 1, 2
|
||||
endconst
|
1427
third_party/dav1d/src/arm/32/looprestoration.S
vendored
File diff suppressed because it is too large
17
third_party/dav1d/src/arm/32/util.S
vendored
@ -34,11 +34,11 @@

.macro movrel_local rd, val, offset=0
#if defined(PIC)
        ldr             \rd,  1f
        b               2f
1:
        .word           \val + \offset - (2f + 8 - 4 * CONFIG_THUMB)
2:
        ldr             \rd,  90001f
        b               90002f
90001:
        .word           \val + \offset - (90002f + 8 - 4 * CONFIG_THUMB)
90002:
        add             \rd,  \rd,  pc
#else
        movw            \rd, #:lower16:\val+\offset
@ -84,4 +84,11 @@
        vtrn.8          \r6,  \r7
.endm

.macro transpose_4x8b q0, q1, r0, r1, r2, r3
        vtrn.16         \q0,  \q1

        vtrn.8          \r0,  \r1
        vtrn.8          \r2,  \r3
.endm

#endif /* DAV1D_SRC_ARM_32_UTIL_S */
63
third_party/dav1d/src/arm/64/loopfilter.S
vendored
@ -37,11 +37,11 @@ function lpf_16_wd\wd\()_neon
|
||||
.if \wd >= 6
|
||||
uabd v4.16b, v21.16b, v22.16b // abs(p2 - p1)
|
||||
uabd v5.16b, v26.16b, v25.16b // abs(q2 - q1)
|
||||
.endif
|
||||
.if \wd >= 8
|
||||
uabd v6.16b, v20.16b, v21.16b // abs(p3 - p2)
|
||||
uabd v7.16b, v27.16b, v26.16b // abs(q3 - q2)
|
||||
.endif
|
||||
.endif
|
||||
.if \wd >= 6
|
||||
umax v4.16b, v4.16b, v5.16b
|
||||
.endif
|
||||
@ -70,7 +70,7 @@ function lpf_16_wd\wd\()_neon
|
||||
.if \wd >= 6
|
||||
and v14.16b, v14.16b, v1.16b // fm && wd > 4
|
||||
.endif
|
||||
.if \wd >= 6
|
||||
.if \wd >= 16
|
||||
and v15.16b, v15.16b, v1.16b // fm && wd == 16
|
||||
.endif
|
||||
|
||||
@ -303,7 +303,6 @@ function lpf_16_wd\wd\()_neon
|
||||
rshrn v13.8b, v8.8h, #3 // out q0
|
||||
rshrn2 v13.16b, v9.8h, #3
|
||||
|
||||
|
||||
add v8.8h, v8.8h, v6.8h
|
||||
add v9.8h, v9.8h, v7.8h
|
||||
sub v2.8h, v2.8h, v0.8h // q2 + q3 - p1 - q1
|
||||
@ -420,6 +419,7 @@ function lpf_16_wd\wd\()_neon
|
||||
sub v7.8h, v7.8h, v11.8h
|
||||
uaddl v8.8h, v26.8b, v30.8b // q2 + q6
|
||||
uaddl2 v9.8h, v26.16b, v30.16b
|
||||
bif v0.16b, v18.16b, v15.16b // out p5
|
||||
uaddl v10.8h, v18.8b, v23.8b // p5 + p0
|
||||
uaddl2 v11.8h, v18.16b, v23.16b
|
||||
rshrn v5.8b, v12.8h, #4 // out p0
|
||||
@ -430,56 +430,55 @@ function lpf_16_wd\wd\()_neon
|
||||
sub v9.8h, v9.8h, v11.8h
|
||||
uaddl v10.8h, v27.8b, v30.8b // q3 + q6
|
||||
uaddl2 v11.8h, v27.16b, v30.16b
|
||||
bif v0.16b, v18.16b, v15.16b // out p5
|
||||
uaddl v14.8h, v19.8b, v24.8b // p4 + q0
|
||||
uaddl2 v18.8h, v19.16b, v24.16b
|
||||
bif v1.16b, v19.16b, v15.16b // out p4
|
||||
uaddl v18.8h, v19.8b, v24.8b // p4 + q0
|
||||
uaddl2 v19.8h, v19.16b, v24.16b
|
||||
rshrn v6.8b, v12.8h, #4 // out q0
|
||||
rshrn2 v6.16b, v13.8h, #4
|
||||
add v12.8h, v12.8h, v8.8h // - (p5 + p0) + (q2 + q6)
|
||||
add v13.8h, v13.8h, v9.8h
|
||||
sub v10.8h, v10.8h, v14.8h
|
||||
sub v11.8h, v11.8h, v18.8h
|
||||
uaddl v14.8h, v28.8b, v30.8b // q4 + q6
|
||||
uaddl2 v18.8h, v28.16b, v30.16b
|
||||
bif v1.16b, v19.16b, v15.16b // out p4
|
||||
uaddl v8.8h, v20.8b, v25.8b // p3 + q1
|
||||
uaddl2 v9.8h, v20.16b, v25.16b
|
||||
sub v10.8h, v10.8h, v18.8h
|
||||
sub v11.8h, v11.8h, v19.8h
|
||||
uaddl v8.8h, v28.8b, v30.8b // q4 + q6
|
||||
uaddl2 v9.8h, v28.16b, v30.16b
|
||||
bif v2.16b, v20.16b, v15.16b // out p3
|
||||
uaddl v18.8h, v20.8b, v25.8b // p3 + q1
|
||||
uaddl2 v19.8h, v20.16b, v25.16b
|
||||
rshrn v7.8b, v12.8h, #4 // out q1
|
||||
rshrn2 v7.16b, v13.8h, #4
|
||||
add v12.8h, v12.8h, v10.8h // - (p4 + q0) + (q3 + q6)
|
||||
add v13.8h, v13.8h, v11.8h
|
||||
sub v14.8h, v14.8h, v8.8h
|
||||
sub v18.8h, v18.8h, v9.8h
|
||||
sub v18.8h, v8.8h, v18.8h
|
||||
sub v19.8h, v9.8h, v19.8h
|
||||
uaddl v10.8h, v29.8b, v30.8b // q5 + q6
|
||||
uaddl2 v11.8h, v29.16b, v30.16b
|
||||
bif v2.16b, v20.16b, v15.16b // out p3
|
||||
uaddl v19.8h, v21.8b, v26.8b // p2 + q2
|
||||
uaddl2 v20.8h, v21.16b, v26.16b
|
||||
bif v3.16b, v21.16b, v15.16b // out p2
|
||||
uaddl v20.8h, v21.8b, v26.8b // p2 + q2
|
||||
uaddl2 v21.8h, v21.16b, v26.16b
|
||||
rshrn v8.8b, v12.8h, #4 // out q2
|
||||
rshrn2 v8.16b, v13.8h, #4
|
||||
add v12.8h, v12.8h, v14.8h // - (p3 + q1) + (q4 + q6)
|
||||
add v13.8h, v13.8h, v18.8h
|
||||
sub v10.8h, v10.8h, v19.8h
|
||||
sub v11.8h, v11.8h, v20.8h
|
||||
uaddl v14.8h, v30.8b, v30.8b // q6 + q6
|
||||
uaddl2 v18.8h, v30.16b, v30.16b
|
||||
bif v3.16b, v21.16b, v15.16b // out p2
|
||||
uaddl v19.8h, v22.8b, v27.8b // p1 + q3
|
||||
uaddl2 v20.8h, v22.16b, v27.16b
|
||||
add v12.8h, v12.8h, v18.8h // - (p3 + q1) + (q4 + q6)
|
||||
add v13.8h, v13.8h, v19.8h
|
||||
sub v10.8h, v10.8h, v20.8h
|
||||
sub v11.8h, v11.8h, v21.8h
|
||||
uaddl v18.8h, v30.8b, v30.8b // q6 + q6
|
||||
uaddl2 v19.8h, v30.16b, v30.16b
|
||||
bif v4.16b, v22.16b, v15.16b // out p1
|
||||
uaddl v20.8h, v22.8b, v27.8b // p1 + q3
|
||||
uaddl2 v21.8h, v22.16b, v27.16b
|
||||
rshrn v9.8b, v12.8h, #4 // out q3
|
||||
rshrn2 v9.16b, v13.8h, #4
|
||||
add v12.8h, v12.8h, v10.8h // - (p2 + q2) + (q5 + q6)
|
||||
add v13.8h, v13.8h, v11.8h
|
||||
sub v14.8h, v14.8h, v19.8h
|
||||
sub v18.8h, v18.8h, v20.8h
|
||||
bif v4.16b, v22.16b, v15.16b // out p1
|
||||
sub v19.8h, v19.8h, v21.8h
|
||||
bif v5.16b, v23.16b, v15.16b // out p0
|
||||
rshrn v10.8b, v12.8h, #4 // out q4
|
||||
rshrn2 v10.16b, v13.8h, #4
|
||||
add v12.8h, v12.8h, v14.8h // - (p1 + q3) + (q6 + q6)
|
||||
add v13.8h, v13.8h, v18.8h
|
||||
add v12.8h, v12.8h, v18.8h // - (p1 + q3) + (q6 + q6)
|
||||
add v13.8h, v13.8h, v19.8h
|
||||
rshrn v11.8b, v12.8h, #4 // out q5
|
||||
rshrn2 v11.16b, v13.8h, #4
|
||||
bif v5.16b, v23.16b, v15.16b // out p0
|
||||
bif v6.16b, v24.16b, v15.16b // out q0
|
||||
bif v7.16b, v25.16b, v15.16b // out q1
|
||||
bif v8.16b, v26.16b, v15.16b // out q2
|
||||
|
24
third_party/dav1d/src/arm/64/looprestoration.S
vendored
@ -949,8 +949,8 @@ function sgr_box5_h_neon, export=1
|
||||
ext v4.16b, v5.16b, v4.16b, #13
|
||||
b 2f
|
||||
0:
|
||||
// !LR_HAVE_LEFT, fill v2 with the leftmost byte
|
||||
// and shift v3 to have 2x the first byte at the front.
|
||||
// !LR_HAVE_LEFT, fill v1 with the leftmost byte
|
||||
// and shift v0 to have 2x the first byte at the front.
|
||||
dup v1.16b, v0.b[0]
|
||||
dup v5.16b, v4.b[0]
|
||||
// Move x3 back to account for the last 3 bytes we loaded before,
|
||||
@ -993,7 +993,7 @@ function sgr_box5_h_neon, export=1
|
||||
ext v20.16b, v4.16b, v4.16b, #1
|
||||
ext v21.16b, v4.16b, v4.16b, #2
|
||||
ext v22.16b, v4.16b, v4.16b, #3
|
||||
ext v23.16b, v4.16b, v5.16b, #4
|
||||
ext v23.16b, v4.16b, v4.16b, #4
|
||||
uaddl v3.8h, v0.8b, v16.8b
|
||||
uaddl v24.8h, v17.8b, v18.8b
|
||||
uaddl v7.8h, v4.8b, v20.8b
|
||||
@ -1053,15 +1053,15 @@ function sgr_box5_h_neon, export=1
|
||||
ext v4.16b, v4.16b, v4.16b, #4
|
||||
|
||||
6: // Pad the right edge and produce the last few pixels.
|
||||
// w < 7, w+1 pixels valid in v3/v5
|
||||
// w < 7, w+1 pixels valid in v0/v4
|
||||
sub w13, w5, #1
|
||||
// w13 = pixels valid - 2
|
||||
adr x14, L(box5_variable_shift_tbl)
|
||||
ldrh w13, [x14, w13, uxtw #1]
|
||||
sub x13, x14, w13, uxth
|
||||
br x13
|
||||
// Shift v3 right, shifting out invalid pixels,
|
||||
// shift v3 left to the original offset, shifting in padding pixels.
|
||||
// Shift v0 right, shifting out invalid pixels,
|
||||
// shift v0 left to the original offset, shifting in padding pixels.
|
||||
22: // 2 pixels valid
|
||||
ext v0.16b, v0.16b, v0.16b, #2
|
||||
ext v4.16b, v4.16b, v4.16b, #2
|
||||
@ -1688,14 +1688,14 @@ function sgr_finish_filter2_neon, export=1
|
||||
|
||||
2:
|
||||
subs x5, x5, #8
|
||||
ext v22.16b, v0.16b, v1.16b, #2 // -stride
|
||||
ext v23.16b, v2.16b, v3.16b, #2 // +stride
|
||||
ext v24.16b, v0.16b, v1.16b, #4 // +1-stride
|
||||
ext v25.16b, v2.16b, v3.16b, #4 // +1+stride
|
||||
ext v22.16b, v0.16b, v1.16b, #2 // -stride
|
||||
ext v23.16b, v2.16b, v3.16b, #2 // +stride
|
||||
add v0.8h, v0.8h, v24.8h // -1-stride, +1-stride
|
||||
add v25.8h, v2.8h, v25.8h // -1+stride, +1+stride
|
||||
add v0.8h, v0.8h, v25.8h
|
||||
add v2.8h, v22.8h, v23.8h // -stride, +stride
|
||||
add v0.8h, v0.8h, v25.8h
|
||||
|
||||
ext v22.16b, v16.16b, v17.16b, #4 // -stride
|
||||
ext v23.16b, v17.16b, v18.16b, #4
|
||||
@ -1760,8 +1760,8 @@ function sgr_finish_filter2_neon, export=1
|
||||
|
||||
4:
|
||||
subs x5, x5, #8
|
||||
ext v22.16b, v0.16b, v1.16b, #2 // 0
|
||||
ext v23.16b, v0.16b, v1.16b, #4 // +1
|
||||
ext v22.16b, v0.16b, v1.16b, #2 // 0
|
||||
add v0.8h, v0.8h, v23.8h // -1, +1
|
||||
|
||||
ext v24.16b, v16.16b, v17.16b, #4 // 0
|
||||
@ -1894,12 +1894,12 @@ endfunc
|
||||
// const int16_t wt[2]);
|
||||
function sgr_weighted2_neon, export=1
|
||||
ldr x8, [sp]
|
||||
ld1 {v31.s}[0], [x8]
|
||||
cmp x7, #2
|
||||
add x10, x0, x1
|
||||
add x11, x2, x3
|
||||
add x12, x4, #2*FILTER_OUT_STRIDE
|
||||
add x13, x5, #2*FILTER_OUT_STRIDE
|
||||
ld2r {v30.8h, v31.8h}, [x8] // wt[0], wt[1]
|
||||
mov x8, #4*FILTER_OUT_STRIDE
|
||||
lsl x1, x1, #1
|
||||
lsl x3, x3, #1
|
||||
@ -1908,8 +1908,6 @@ function sgr_weighted2_neon, export=1
|
||||
sub x1, x1, x9
|
||||
sub x3, x3, x9
|
||||
sub x8, x8, x9, lsl #1
|
||||
dup v30.8h, v31.h[0] // wt[0]
|
||||
dup v31.8h, v31.h[1] // wt[1]
|
||||
mov x9, x6
|
||||
b.lt 2f
|
||||
1:
|
||||
|
4
third_party/dav1d/src/arm/ipred_init_tmpl.c
vendored
@ -54,13 +54,14 @@ COLD void bitfn(dav1d_intra_pred_dsp_init_arm)(Dav1dIntraPredDSPContext *const c

    if (!(flags & DAV1D_ARM_CPU_FLAG_NEON)) return;

#if BITDEPTH == 8 && ARCH_AARCH64
#if BITDEPTH == 8
    c->intra_pred[DC_PRED]       = dav1d_ipred_dc_neon;
    c->intra_pred[DC_128_PRED]   = dav1d_ipred_dc_128_neon;
    c->intra_pred[TOP_DC_PRED]   = dav1d_ipred_dc_top_neon;
    c->intra_pred[LEFT_DC_PRED]  = dav1d_ipred_dc_left_neon;
    c->intra_pred[HOR_PRED]      = dav1d_ipred_h_neon;
    c->intra_pred[VERT_PRED]     = dav1d_ipred_v_neon;
#if ARCH_AARCH64
    c->intra_pred[PAETH_PRED]    = dav1d_ipred_paeth_neon;
    c->intra_pred[SMOOTH_PRED]   = dav1d_ipred_smooth_neon;
    c->intra_pred[SMOOTH_V_PRED] = dav1d_ipred_smooth_v_neon;
@ -77,4 +78,5 @@ COLD void bitfn(dav1d_intra_pred_dsp_init_arm)(Dav1dIntraPredDSPContext *const c

    c->pal_pred = dav1d_pal_pred_neon;
#endif
#endif
}
@ -38,7 +38,7 @@ COLD void bitfn(dav1d_loop_filter_dsp_init_arm)(Dav1dLoopFilterDSPContext *const

    if (!(flags & DAV1D_ARM_CPU_FLAG_NEON)) return;

#if BITDEPTH == 8 && ARCH_AARCH64
#if BITDEPTH == 8
    c->loop_filter_sb[0][0] = dav1d_lpf_h_sb_y_neon;
    c->loop_filter_sb[0][1] = dav1d_lpf_v_sb_y_neon;
    c->loop_filter_sb[1][0] = dav1d_lpf_h_sb_uv_neon;
@ -91,7 +91,6 @@ static void wiener_filter_neon(pixel *const dst, const ptrdiff_t dst_stride,
    }
}

#if ARCH_AARCH64
void dav1d_sgr_box3_h_neon(int32_t *sumsq, int16_t *sum,
                           const pixel (*left)[4],
                           const pixel *src, const ptrdiff_t stride,
@ -123,7 +122,7 @@ static void dav1d_sgr_filter1_neon(coef *tmp,
    dav1d_sgr_box3_h_neon(sumsq, sum, left, src, stride, w, h, edges);
    if (edges & LR_HAVE_TOP)
        dav1d_sgr_box3_h_neon(&sumsq[-2 * (384 + 16)], &sum[-2 * (384 + 16)],
                              NULL, lpf, lpf_stride, w, 1, edges);
                              NULL, lpf, lpf_stride, w, 2, edges);

    if (edges & LR_HAVE_BOTTOM)
        dav1d_sgr_box3_h_neon(&sumsq[h * (384 + 16)], &sum[h * (384 + 16)],
@ -253,7 +252,6 @@ static void sgr_filter_neon(pixel *const dst, const ptrdiff_t dst_stride,
        }
    }
}
#endif // ARCH_AARCH64
#endif // BITDEPTH == 8

COLD void bitfn(dav1d_loop_restoration_dsp_init_arm)(Dav1dLoopRestorationDSPContext *const c) {
@ -263,8 +261,6 @@ COLD void bitfn(dav1d_loop_restoration_dsp_init_arm)(Dav1dLoopRestorationDSPCont

#if BITDEPTH == 8
    c->wiener = wiener_filter_neon;
#if ARCH_AARCH64
    c->selfguided = sgr_filter_neon;
#endif
#endif
}
71
third_party/dav1d/src/decode.c
vendored
@ -524,6 +524,7 @@ static void order_palette(const uint8_t *pal_idx, const ptrdiff_t stride,
|
||||
{
|
||||
int have_top = i > first;
|
||||
|
||||
assert(pal_idx);
|
||||
pal_idx += first + (i - first) * stride;
|
||||
for (int j = first, n = 0; j >= last; have_top = 1, j--, n++, pal_idx += stride - 1) {
|
||||
const int have_left = j > 0;
|
||||
@ -586,6 +587,7 @@ static void read_pal_indices(Dav1dTileContext *const t,
|
||||
{
|
||||
Dav1dTileState *const ts = t->ts;
|
||||
const ptrdiff_t stride = bw4 * 4;
|
||||
assert(pal_idx);
|
||||
pal_idx[0] = dav1d_msac_decode_uniform(&ts->msac, b->pal_sz[pl]);
|
||||
uint16_t (*const color_map_cdf)[8] =
|
||||
ts->cdf.m.color_map[pl][b->pal_sz[pl] - 2];
|
||||
@ -1125,6 +1127,7 @@ static int decode_b(Dav1dTileContext *const t,
|
||||
if (b->pal_sz[0]) {
|
||||
uint8_t *pal_idx;
|
||||
if (f->frame_thread.pass) {
|
||||
assert(ts->frame_thread.pal_idx);
|
||||
pal_idx = ts->frame_thread.pal_idx;
|
||||
ts->frame_thread.pal_idx += bw4 * bh4 * 16;
|
||||
} else
|
||||
@ -1137,6 +1140,7 @@ static int decode_b(Dav1dTileContext *const t,
|
||||
if (has_chroma && b->pal_sz[1]) {
|
||||
uint8_t *pal_idx;
|
||||
if (f->frame_thread.pass) {
|
||||
assert(ts->frame_thread.pal_idx);
|
||||
pal_idx = ts->frame_thread.pal_idx;
|
||||
ts->frame_thread.pal_idx += cbw4 * cbh4 * 16;
|
||||
} else
|
||||
@ -1390,7 +1394,7 @@ static int decode_b(Dav1dTileContext *const t,
|
||||
b->ref[1] = f->frame_hdr->skip_mode_refs[1];
|
||||
b->comp_type = COMP_INTER_AVG;
|
||||
b->inter_mode = NEARESTMV_NEARESTMV;
|
||||
b->drl_idx = 0;
|
||||
b->drl_idx = NEAREST_DRL;
|
||||
has_subpel_filter = 0;
|
||||
|
||||
candidate_mv mvstack[8];
|
||||
@ -1490,13 +1494,13 @@ static int decode_b(Dav1dTileContext *const t,
|
||||
b->inter_mode, ctx, n_mvs, ts->msac.rng);
|
||||
|
||||
const uint8_t *const im = dav1d_comp_inter_pred_modes[b->inter_mode];
|
||||
b->drl_idx = 0;
|
||||
b->drl_idx = NEAREST_DRL;
|
||||
if (b->inter_mode == NEWMV_NEWMV) {
|
||||
if (n_mvs > 1) {
|
||||
if (n_mvs > 1) { // NEARER, NEAR or NEARISH
|
||||
const int drl_ctx_v1 = get_drl_context(mvstack, 0);
|
||||
b->drl_idx += dav1d_msac_decode_bool_adapt(&ts->msac,
|
||||
ts->cdf.m.drl_bit[drl_ctx_v1]);
|
||||
if (b->drl_idx == 1 && n_mvs > 2) {
|
||||
if (b->drl_idx == NEARER_DRL && n_mvs > 2) {
|
||||
const int drl_ctx_v2 = get_drl_context(mvstack, 1);
|
||||
b->drl_idx += dav1d_msac_decode_bool_adapt(&ts->msac,
|
||||
ts->cdf.m.drl_bit[drl_ctx_v2]);
|
||||
@ -1506,12 +1510,12 @@ static int decode_b(Dav1dTileContext *const t,
|
||||
b->drl_idx, n_mvs, ts->msac.rng);
|
||||
}
|
||||
} else if (im[0] == NEARMV || im[1] == NEARMV) {
|
||||
b->drl_idx = 1;
|
||||
if (n_mvs > 2) {
|
||||
b->drl_idx = NEARER_DRL;
|
||||
if (n_mvs > 2) { // NEAR or NEARISH
|
||||
const int drl_ctx_v2 = get_drl_context(mvstack, 1);
|
||||
b->drl_idx += dav1d_msac_decode_bool_adapt(&ts->msac,
|
||||
ts->cdf.m.drl_bit[drl_ctx_v2]);
|
||||
if (b->drl_idx == 2 && n_mvs > 3) {
|
||||
if (b->drl_idx == NEAR_DRL && n_mvs > 3) {
|
||||
const int drl_ctx_v3 = get_drl_context(mvstack, 2);
|
||||
b->drl_idx += dav1d_msac_decode_bool_adapt(&ts->msac,
|
||||
ts->cdf.m.drl_bit[drl_ctx_v3]);
|
||||
@ -1521,6 +1525,7 @@ static int decode_b(Dav1dTileContext *const t,
|
||||
b->drl_idx, n_mvs, ts->msac.rng);
|
||||
}
|
||||
}
|
||||
assert(b->drl_idx >= NEAREST_DRL && b->drl_idx <= NEARISH_DRL);
|
||||
|
||||
#define assign_comp_mv(idx, pfx) \
|
||||
switch (im[idx]) { \
|
||||
@ -1678,14 +1683,14 @@ static int decode_b(Dav1dTileContext *const t,
|
||||
has_subpel_filter = 1;
|
||||
if (dav1d_msac_decode_bool_adapt(&ts->msac,
|
||||
ts->cdf.m.refmv_mode[(ctx >> 4) & 15]))
|
||||
{
|
||||
{ // NEAREST, NEARER, NEAR or NEARISH
|
||||
b->inter_mode = NEARMV;
|
||||
b->drl_idx = 1;
|
||||
if (n_mvs > 2) {
|
||||
b->drl_idx = NEARER_DRL;
|
||||
if (n_mvs > 2) { // NEARER, NEAR or NEARISH
|
||||
const int drl_ctx_v2 = get_drl_context(mvstack, 1);
|
||||
b->drl_idx += dav1d_msac_decode_bool_adapt(&ts->msac,
|
||||
ts->cdf.m.drl_bit[drl_ctx_v2]);
|
||||
if (b->drl_idx == 2 && n_mvs > 3) {
|
||||
if (b->drl_idx == NEAR_DRL && n_mvs > 3) { // NEAR or NEARISH
|
||||
const int drl_ctx_v3 =
|
||||
get_drl_context(mvstack, 2);
|
||||
b->drl_idx += dav1d_msac_decode_bool_adapt(&ts->msac,
|
||||
@ -1694,9 +1699,10 @@ static int decode_b(Dav1dTileContext *const t,
|
||||
}
|
||||
} else {
|
||||
b->inter_mode = NEARESTMV;
|
||||
b->drl_idx = 0;
|
||||
b->drl_idx = NEAREST_DRL;
|
||||
}
|
||||
if (b->drl_idx >= 2) {
|
||||
assert(b->drl_idx >= NEAREST_DRL && b->drl_idx <= NEARISH_DRL);
|
||||
if (b->drl_idx >= NEAR_DRL) {
|
||||
b->mv[0] = mvstack[b->drl_idx].this_mv;
|
||||
} else {
|
||||
b->mv[0] = mvlist[0][b->drl_idx];
|
||||
@ -1711,20 +1717,22 @@ static int decode_b(Dav1dTileContext *const t,
|
||||
} else {
|
||||
has_subpel_filter = 1;
|
||||
b->inter_mode = NEWMV;
|
||||
b->drl_idx = 0;
|
||||
if (n_mvs > 1) {
|
||||
b->drl_idx = NEAREST_DRL;
|
||||
if (n_mvs > 1) { // NEARER, NEAR or NEARISH
|
||||
const int drl_ctx_v1 = get_drl_context(mvstack, 0);
|
||||
b->drl_idx += dav1d_msac_decode_bool_adapt(&ts->msac,
|
||||
ts->cdf.m.drl_bit[drl_ctx_v1]);
|
||||
if (b->drl_idx == 1 && n_mvs > 2) {
|
||||
if (b->drl_idx == NEARER_DRL && n_mvs > 2) { // NEAR or NEARISH
|
||||
const int drl_ctx_v2 = get_drl_context(mvstack, 1);
|
||||
b->drl_idx += dav1d_msac_decode_bool_adapt(&ts->msac,
|
||||
ts->cdf.m.drl_bit[drl_ctx_v2]);
|
||||
}
|
||||
}
|
||||
assert(b->drl_idx >= NEAREST_DRL && b->drl_idx <= NEARISH_DRL);
|
||||
if (n_mvs > 1) {
|
||||
b->mv[0] = mvstack[b->drl_idx].this_mv;
|
||||
} else {
|
||||
assert(!b->drl_idx);
|
||||
b->mv[0] = mvlist[0][0];
|
||||
fix_mv_precision(f->frame_hdr, &b->mv[0]);
|
||||
}
|
||||
@ -1972,7 +1980,7 @@ static int checked_decode_b(Dav1dTileContext *const t,
|
||||
for (int p = 0; p < 1 + 2 * has_chroma; p++) {
|
||||
const int ss_ver = p && f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I420;
|
||||
const int ss_hor = p && f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I444;
|
||||
const int stride = f->cur.stride[!!p];
|
||||
const ptrdiff_t stride = f->cur.stride[!!p];
|
||||
const int bx = t->bx & ~ss_hor;
|
||||
const int by = t->by & ~ss_ver;
|
||||
const int width = w4 << (2 - ss_hor + (bw4 == ss_hor));
|
||||
@ -2318,10 +2326,15 @@ static void setup_tile(Dav1dTileState *const ts,
|
||||
const int sb_shift = f->sb_shift;
|
||||
|
||||
const uint8_t *const size_mul = ss_size_mul[f->cur.p.layout];
|
||||
ts->frame_thread.pal_idx =
|
||||
&f->frame_thread.pal_idx[(size_t)tile_start_off * size_mul[1] / 4];
|
||||
ts->frame_thread.cf = (uint8_t*)f->frame_thread.cf +
|
||||
(((size_t)tile_start_off * size_mul[0]) >> !f->seq_hdr->hbd);
|
||||
ts->frame_thread.pal_idx = f->frame_thread.pal_idx ?
|
||||
&f->frame_thread.pal_idx[(size_t)tile_start_off * size_mul[1] / 4] :
|
||||
NULL;
|
||||
|
||||
ts->frame_thread.cf = f->frame_thread.cf ?
|
||||
(uint8_t*)f->frame_thread.cf +
|
||||
(((size_t)tile_start_off * size_mul[0]) >> !f->seq_hdr->hbd) :
|
||||
NULL;
|
||||
|
||||
dav1d_cdf_thread_copy(&ts->cdf, &f->in_cdf);
|
||||
ts->last_qidx = f->frame_hdr->quant.yac;
|
||||
memset(ts->last_delta_lf, 0, sizeof(ts->last_delta_lf));
|
||||
@ -3106,12 +3119,18 @@ int dav1d_decode_frame(Dav1dFrameContext *const f) {
|
||||
tile_idx++)
|
||||
{
|
||||
Dav1dTileState *const ts = &f->ts[tile_idx];
|
||||
const int tile_start_off = f->frame_thread.tile_start_off[tile_idx];
|
||||
ts->frame_thread.pal_idx = &f->frame_thread.pal_idx[tile_start_off * size_mul[1] / 4];
|
||||
ts->frame_thread.cf = (uint8_t*)f->frame_thread.cf +
|
||||
((tile_start_off * size_mul[0]) >> !f->seq_hdr->hbd);
|
||||
const size_t tile_start_off =
|
||||
(size_t) f->frame_thread.tile_start_off[tile_idx];
|
||||
ts->frame_thread.pal_idx = f->frame_thread.pal_idx ?
|
||||
&f->frame_thread.pal_idx[tile_start_off * size_mul[1] / 4] :
|
||||
NULL;
|
||||
ts->frame_thread.cf = f->frame_thread.cf ?
|
||||
(uint8_t*)f->frame_thread.cf +
|
||||
((tile_start_off * size_mul[0]) >> !f->seq_hdr->hbd) :
|
||||
NULL;
|
||||
if (f->n_tc > 0) {
|
||||
unsigned row_sb_start = f->frame_hdr->tiling.row_start_sb[ts->tiling.row];
|
||||
const unsigned row_sb_start =
|
||||
f->frame_hdr->tiling.row_start_sb[ts->tiling.row];
|
||||
atomic_init(&ts->progress, row_sb_start);
|
||||
}
|
||||
}
|
||||
|
15
third_party/dav1d/src/env.h
vendored
@ -431,19 +431,10 @@ static inline int av1_get_uni_p1_ctx(const BlockContext *const a,
static inline int get_drl_context(const candidate_mv *const ref_mv_stack,
                                  const int ref_idx)
{
    if (ref_mv_stack[ref_idx].weight >= 640 &&
        ref_mv_stack[ref_idx + 1].weight >= 640)
        return 0;
    if (ref_mv_stack[ref_idx].weight >= 640)
        return ref_mv_stack[ref_idx + 1].weight < 640;

    if (ref_mv_stack[ref_idx].weight >= 640 &&
        ref_mv_stack[ref_idx + 1].weight < 640)
        return 1;

    if (ref_mv_stack[ref_idx].weight < 640 &&
        ref_mv_stack[ref_idx + 1].weight < 640)
        return 2;

    return 0;
    return ref_mv_stack[ref_idx + 1].weight < 640 ? 2 : 0;
}

static inline unsigned get_cur_frame_segid(const int by, const int bx,
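Illustrative sketch (editorial, not part of the patch): the compacted get_drl_context() above keeps the original weight-based mapping. A self-contained C version of the same decision table, using a hypothetical candidate type and the 640 weight threshold from the hunk, can be sanity-checked like this:

#include <assert.h>

/* Hypothetical stand-in for candidate_mv; only the weight field matters here. */
struct cand { int weight; };

/* Same mapping as the simplified function above:
 *   weight[i] >= 640 && weight[i+1] >= 640  -> 0
 *   weight[i] >= 640 && weight[i+1] <  640  -> 1
 *   weight[i] <  640                        -> weight[i+1] < 640 ? 2 : 0 */
static int drl_ctx(const struct cand *const stack, const int idx) {
    if (stack[idx].weight >= 640)
        return stack[idx + 1].weight < 640;
    return stack[idx + 1].weight < 640 ? 2 : 0;
}

int main(void) {
    const struct cand hi_hi[2] = {{700}, {700}};
    const struct cand hi_lo[2] = {{700}, {100}};
    const struct cand lo_lo[2] = {{100}, {100}};
    assert(drl_ctx(hi_hi, 0) == 0);
    assert(drl_ctx(hi_lo, 0) == 1);
    assert(drl_ctx(lo_lo, 0) == 2);
    return 0;
}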
2
third_party/dav1d/src/fg_apply.h
vendored
@ -36,6 +36,6 @@

bitfn_decls(void dav1d_apply_grain, const Dav1dFilmGrainDSPContext *const dsp,
            Dav1dPicture *const out,
            const Dav1dPicture *const in);
            Dav1dPicture *const in);

#endif /* DAV1D_SRC_FG_APPLY_H */
20
third_party/dav1d/src/fg_apply_tmpl.c
vendored
@ -91,7 +91,7 @@ static void generate_scaling(const int bitdepth,
#ifndef UNIT_TEST
void bitfn(dav1d_apply_grain)(const Dav1dFilmGrainDSPContext *const dsp,
                              Dav1dPicture *const out,
                              const Dav1dPicture *const in)
                              Dav1dPicture *const in)
{
    const Dav1dFilmGrainData *const data = &out->frame_hdr->film_grain.data;

@ -143,7 +143,7 @@ void bitfn(dav1d_apply_grain)(const Dav1dFilmGrainDSPContext *const dsp,
    const int cpw = (out->p.w + ss_x) >> ss_x;
    const int is_id = out->seq_hdr->mtrx == DAV1D_MC_IDENTITY;
    for (int row = 0; row < rows; row++) {
        const pixel *const luma_src =
        pixel *const luma_src =
            ((pixel *) in->data[0]) + row * BLOCK_SIZE * PXSTRIDE(in->stride[0]);

        if (data->num_y_points) {
@ -153,7 +153,23 @@ void bitfn(dav1d_apply_grain)(const Dav1dFilmGrainDSPContext *const dsp,
                           out->p.w, scaling[0], grain_lut[0], bh, row HIGHBD_TAIL_SUFFIX);
        }

        if (!data->num_uv_points[0] && !data->num_uv_points[1] &&
            !data->chroma_scaling_from_luma)
        {
            continue;
        }

        const int bh = (imin(out->p.h - row * BLOCK_SIZE, BLOCK_SIZE) + ss_y) >> ss_y;

        // extend padding pixels
        if (out->p.w & ss_x) {
            pixel *ptr = luma_src;
            for (int y = 0; y < bh; y++) {
                ptr[out->p.w] = ptr[out->p.w - 1];
                ptr += PXSTRIDE(in->stride[0]) << ss_y;
            }
        }

        const ptrdiff_t uv_off = row * BLOCK_SIZE * PXSTRIDE(out->stride[1]) >> ss_y;
        if (data->chroma_scaling_from_luma) {
            for (int pl = 0; pl < 2; pl++)
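Illustrative sketch (editorial, not from the dav1d sources): the new "extend padding pixels" block above duplicates the rightmost luma column when the frame width is odd, so the chroma grain pass can safely read luma at an even, subsampled width. A minimal standalone version of that idea on a plain 8-bit plane, with hypothetical names:

#include <stdint.h>
#include <stddef.h>

/* Replicate the last valid column into the padding column of an 8-bit luma
 * plane; step two rows at a time when chroma is vertically subsampled
 * (ss_y == 1), mirroring the loop in the hunk above. */
static void pad_last_column(uint8_t *luma, const ptrdiff_t stride,
                            const int width, const int rows, const int ss_y) {
    for (int y = 0; y < rows; y++) {
        luma[width] = luma[width - 1];
        luma += stride << ss_y;
    }
}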
35
third_party/dav1d/src/ipred_tmpl.c
vendored
@ -324,7 +324,9 @@ static void ipred_smooth_h_c(pixel *dst, const ptrdiff_t stride,
|
||||
}
|
||||
}
|
||||
|
||||
static int get_filter_strength(const int wh, const int angle, const int is_sm) {
|
||||
static NOINLINE int get_filter_strength(const int wh, const int angle,
|
||||
const int is_sm)
|
||||
{
|
||||
if (is_sm) {
|
||||
if (wh <= 8) {
|
||||
if (angle >= 64) return 2;
|
||||
@ -357,10 +359,10 @@ static int get_filter_strength(const int wh, const int angle, const int is_sm) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
static void filter_edge(pixel *const out, const int sz,
|
||||
const int lim_from, const int lim_to,
|
||||
const pixel *const in,
|
||||
const int from, const int to, const unsigned strength)
|
||||
static NOINLINE void filter_edge(pixel *const out, const int sz,
|
||||
const int lim_from, const int lim_to,
|
||||
const pixel *const in, const int from,
|
||||
const int to, const int strength)
|
||||
{
|
||||
static const uint8_t kernel[3][5] = {
|
||||
{ 0, 4, 8, 4, 0 },
|
||||
@ -382,14 +384,13 @@ static void filter_edge(pixel *const out, const int sz,
|
||||
out[i] = in[iclip(i, from, to - 1)];
|
||||
}
|
||||
|
||||
static int get_upsample(const int blk_wh, const unsigned d, const int type) {
|
||||
if (d >= 40) return 0;
|
||||
return type ? (blk_wh <= 8) : (blk_wh <= 16);
|
||||
static inline int get_upsample(const int wh, const int angle, const int is_sm) {
|
||||
return angle < 40 && wh <= 16 >> is_sm;
|
||||
}
|
||||
|
||||
static void upsample_edge(pixel *const out, const int hsz,
|
||||
const pixel *const in, const int from, const int to
|
||||
HIGHBD_DECL_SUFFIX)
|
||||
static NOINLINE void upsample_edge(pixel *const out, const int hsz,
|
||||
const pixel *const in, const int from,
|
||||
const int to HIGHBD_DECL_SUFFIX)
|
||||
{
|
||||
static const int8_t kernel[4] = { -1, 9, 9, -1 };
|
||||
int i;
|
||||
@ -415,7 +416,7 @@ static void ipred_z1_c(pixel *dst, const ptrdiff_t stride,
|
||||
angle &= 511;
|
||||
assert(angle < 90);
|
||||
int dx = dav1d_dr_intra_derivative[angle >> 1];
|
||||
pixel top_out[(64 + 64) * 2];
|
||||
pixel top_out[64 + 64];
|
||||
const pixel *top;
|
||||
int max_base_x;
|
||||
const int upsample_above = enable_intra_edge_filter ?
|
||||
@ -474,8 +475,8 @@ static void ipred_z2_c(pixel *dst, const ptrdiff_t stride,
|
||||
get_upsample(width + height, 180 - angle, is_sm) : 0;
|
||||
const int upsample_above = enable_intra_edge_filter ?
|
||||
get_upsample(width + height, angle - 90, is_sm) : 0;
|
||||
pixel edge[64 * 2 + 64 * 2 + 1];
|
||||
pixel *const topleft = &edge[height * 2];
|
||||
pixel edge[64 + 64 + 1];
|
||||
pixel *const topleft = &edge[64];
|
||||
|
||||
if (upsample_above) {
|
||||
upsample_edge(topleft, width + 1, topleft_in, 0, width + 1
|
||||
@ -494,8 +495,8 @@ static void ipred_z2_c(pixel *dst, const ptrdiff_t stride,
|
||||
}
|
||||
}
|
||||
if (upsample_left) {
|
||||
upsample_edge(edge, height + 1, &topleft_in[-height], 0, height + 1
|
||||
HIGHBD_TAIL_SUFFIX);
|
||||
upsample_edge(&topleft[-height * 2], height + 1, &topleft_in[-height],
|
||||
0, height + 1 HIGHBD_TAIL_SUFFIX);
|
||||
dy <<= 1;
|
||||
} else {
|
||||
const int filter_strength = enable_intra_edge_filter ?
|
||||
@ -549,7 +550,7 @@ static void ipred_z3_c(pixel *dst, const ptrdiff_t stride,
|
||||
angle &= 511;
|
||||
assert(angle > 180);
|
||||
int dy = dav1d_dr_intra_derivative[(270 - angle) >> 1];
|
||||
pixel left_out[(64 + 64) * 2];
|
||||
pixel left_out[64 + 64];
|
||||
const pixel *left;
|
||||
int max_base_y;
|
||||
const int upsample_left = enable_intra_edge_filter ?
|
||||
|
18
third_party/dav1d/src/levels.h
vendored
@ -32,17 +32,6 @@

#include "dav1d/headers.h"

enum ObuType {
    OBU_SEQ_HDR = 1,
    OBU_TD = 2,
    OBU_FRAME_HDR = 3,
    OBU_TILE_GRP = 4,
    OBU_METADATA = 5,
    OBU_FRAME = 6,
    OBU_REDUNDANT_FRAME_HDR = 7,
    OBU_PADDING = 15,
};

enum ObuMetaType {
    OBU_META_HDR_CLL = 1,
    OBU_META_HDR_MDCV = 2,
@ -221,6 +210,13 @@ enum InterPredMode {
    N_INTER_PRED_MODES,
};

enum DRL_PROXIMITY {
    NEAREST_DRL,
    NEARER_DRL,
    NEAR_DRL,
    NEARISH_DRL
};

enum CompInterPredMode {
    NEARESTMV_NEARESTMV,
    NEARMV_NEARMV,
1
third_party/dav1d/src/mc_tmpl.c
vendored
@ -905,7 +905,6 @@ static void resize_c(pixel *dst, const ptrdiff_t dst_stride,
            src_x += mx >> 14;
            mx &= 0x3fff;
        }
        if (dst_w & 1) dst[dst_w] = dst[dst_w - 1];

        dst += PXSTRIDE(dst_stride);
        src += PXSTRIDE(src_stride);
2
third_party/dav1d/src/meson.build
vendored
@ -112,6 +112,8 @@ if is_asm_enabled
    elif host_machine.cpu_family().startswith('arm')
        libdav1d_sources += files(
            'arm/32/cdef.S',
            'arm/32/ipred.S',
            'arm/32/loopfilter.S',
            'arm/32/looprestoration.S',
            'arm/32/mc.S',
        )
24
third_party/dav1d/src/obu.c
vendored
@ -1178,7 +1178,7 @@ int dav1d_parse_obus(Dav1dContext *const c, Dav1dData *const in, int global) {
|
||||
|
||||
// obu header
|
||||
dav1d_get_bits(&gb, 1); // obu_forbidden_bit
|
||||
const enum ObuType type = dav1d_get_bits(&gb, 4);
|
||||
const enum Dav1dObuType type = dav1d_get_bits(&gb, 4);
|
||||
const int has_extension = dav1d_get_bits(&gb, 1);
|
||||
const int has_length_field = dav1d_get_bits(&gb, 1);
|
||||
dav1d_get_bits(&gb, 1); // reserved
|
||||
@ -1217,7 +1217,7 @@ int dav1d_parse_obus(Dav1dContext *const c, Dav1dData *const in, int global) {
|
||||
if (len > in->sz - init_byte_pos) goto error;
|
||||
|
||||
// skip obu not belonging to the selected temporal/spatial layer
|
||||
if (type != OBU_SEQ_HDR && type != OBU_TD &&
|
||||
if (type != DAV1D_OBU_SEQ_HDR && type != DAV1D_OBU_TD &&
|
||||
has_extension && c->operating_point_idc != 0)
|
||||
{
|
||||
const int in_temporal_layer = (c->operating_point_idc >> temporal_id) & 1;
|
||||
@ -1227,7 +1227,7 @@ int dav1d_parse_obus(Dav1dContext *const c, Dav1dData *const in, int global) {
|
||||
}
|
||||
|
||||
switch (type) {
|
||||
case OBU_SEQ_HDR: {
|
||||
case DAV1D_OBU_SEQ_HDR: {
|
||||
Dav1dRef *ref = dav1d_ref_create(sizeof(Dav1dSequenceHeader));
|
||||
if (!ref) return DAV1D_ERR(ENOMEM);
|
||||
Dav1dSequenceHeader *seq_hdr = ref->data;
|
||||
@ -1266,11 +1266,11 @@ int dav1d_parse_obus(Dav1dContext *const c, Dav1dData *const in, int global) {
|
||||
c->seq_hdr = seq_hdr;
|
||||
break;
|
||||
}
|
||||
case OBU_REDUNDANT_FRAME_HDR:
|
||||
case DAV1D_OBU_REDUNDANT_FRAME_HDR:
|
||||
if (c->frame_hdr) break;
|
||||
// fall-through
|
||||
case OBU_FRAME:
|
||||
case OBU_FRAME_HDR:
|
||||
case DAV1D_OBU_FRAME:
|
||||
case DAV1D_OBU_FRAME_HDR:
|
||||
if (global) break;
|
||||
if (!c->seq_hdr) goto error;
|
||||
if (!c->frame_hdr_ref) {
|
||||
@ -1293,7 +1293,7 @@ int dav1d_parse_obus(Dav1dContext *const c, Dav1dData *const in, int global) {
|
||||
dav1d_data_unref_internal(&c->tile[n].data);
|
||||
c->n_tile_data = 0;
|
||||
c->n_tiles = 0;
|
||||
if (type != OBU_FRAME) {
|
||||
if (type != DAV1D_OBU_FRAME) {
|
||||
// This is actually a frame header OBU so read the
|
||||
// trailing bit and check for overrun.
|
||||
dav1d_get_bits(&gb, 1);
|
||||
@ -1312,7 +1312,7 @@ int dav1d_parse_obus(Dav1dContext *const c, Dav1dData *const in, int global) {
|
||||
return DAV1D_ERR(ERANGE);
|
||||
}
|
||||
|
||||
if (type != OBU_FRAME)
|
||||
if (type != DAV1D_OBU_FRAME)
|
||||
break;
|
||||
// OBU_FRAMEs shouldn't be signaled with show_existing_frame
|
||||
if (c->frame_hdr->show_existing_frame) {
|
||||
@ -1325,7 +1325,7 @@ int dav1d_parse_obus(Dav1dContext *const c, Dav1dData *const in, int global) {
|
||||
// to align to the next byte.
|
||||
dav1d_bytealign_get_bits(&gb);
|
||||
// fall-through
|
||||
case OBU_TILE_GRP: {
|
||||
case DAV1D_OBU_TILE_GRP: {
|
||||
if (global) break;
|
||||
if (!c->frame_hdr) goto error;
|
||||
if (c->n_tile_data_alloc < c->n_tile_data + 1) {
|
||||
@ -1365,7 +1365,7 @@ int dav1d_parse_obus(Dav1dContext *const c, Dav1dData *const in, int global) {
|
||||
c->n_tile_data++;
|
||||
break;
|
||||
}
|
||||
case OBU_METADATA: {
|
||||
case DAV1D_OBU_METADATA: {
|
||||
// obu metadata type field
|
||||
const enum ObuMetaType meta_type = dav1d_get_uleb128(&gb);
|
||||
const int meta_type_len = (dav1d_get_bits_pos(&gb) - init_bit_pos) >> 3;
|
||||
@ -1479,8 +1479,8 @@ int dav1d_parse_obus(Dav1dContext *const c, Dav1dData *const in, int global) {
|
||||
|
||||
break;
|
||||
}
|
||||
case OBU_PADDING:
|
||||
case OBU_TD:
|
||||
case DAV1D_OBU_PADDING:
|
||||
case DAV1D_OBU_TD:
|
||||
// ignore OBUs we don't care about
|
||||
break;
|
||||
default:
|
||||
|
29
third_party/dav1d/src/picture.c
vendored
@ -52,17 +52,24 @@ int dav1d_default_picture_alloc(Dav1dPicture *const p, void *const cookie) {
    const int has_chroma = p->p.layout != DAV1D_PIXEL_LAYOUT_I400;
    const int ss_ver = p->p.layout == DAV1D_PIXEL_LAYOUT_I420;
    const int ss_hor = p->p.layout != DAV1D_PIXEL_LAYOUT_I444;
    p->stride[0] = aligned_w << hbd;
    p->stride[1] = has_chroma ? (aligned_w >> ss_hor) << hbd : 0;
    const size_t y_sz = p->stride[0] * aligned_h;
    const size_t uv_sz = p->stride[1] * (aligned_h >> ss_ver);
    const size_t pic_size = y_sz + 2 * uv_sz;

    uint8_t *data = dav1d_alloc_aligned(pic_size + DAV1D_PICTURE_ALIGNMENT,
                                        DAV1D_PICTURE_ALIGNMENT);
    if (data == NULL) {
        return DAV1D_ERR(ENOMEM);
    }
    ptrdiff_t y_stride = aligned_w << hbd;
    ptrdiff_t uv_stride = has_chroma ? y_stride >> ss_hor : 0;
    /* Due to how mapping of addresses to sets works in most L1 and L2 cache
     * implementations, strides of multiples of certain power-of-two numbers
     * may cause multiple rows of the same superblock to map to the same set,
     * causing evictions of previous rows resulting in a reduction in cache
     * hit rate. Avoid that by slightly padding the stride when necessary. */
    if (!(y_stride & 1023))
        y_stride += DAV1D_PICTURE_ALIGNMENT;
    if (!(uv_stride & 1023) && has_chroma)
        uv_stride += DAV1D_PICTURE_ALIGNMENT;
    p->stride[0] = y_stride;
    p->stride[1] = uv_stride;
    const size_t y_sz = y_stride * aligned_h;
    const size_t uv_sz = uv_stride * (aligned_h >> ss_ver);
    const size_t pic_size = y_sz + 2 * uv_sz + DAV1D_PICTURE_ALIGNMENT;
    uint8_t *data = dav1d_alloc_aligned(pic_size, DAV1D_PICTURE_ALIGNMENT);
    if (!data) return DAV1D_ERR(ENOMEM);

    p->data[0] = data;
    p->data[1] = has_chroma ? data + y_sz : NULL;
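Illustrative sketch (editorial, not from the dav1d sources): the comment in the hunk above explains why a stride that is an exact multiple of 1024 bytes is nudged upward, so successive rows of a superblock do not all map to the same L1/L2 cache sets. The padding rule in isolation, with a hypothetical alignment constant standing in for DAV1D_PICTURE_ALIGNMENT:

#include <stddef.h>

#define PIC_ALIGNMENT 64 /* hypothetical stand-in for DAV1D_PICTURE_ALIGNMENT */

/* Bump strides that are multiples of 1024 so rows of one superblock land in
 * different cache sets instead of evicting each other. */
static ptrdiff_t pad_stride(ptrdiff_t stride) {
    if (!(stride & 1023))
        stride += PIC_ALIGNMENT;
    return stride;
}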
3
third_party/dav1d/src/recon_tmpl.c
vendored
@ -680,6 +680,7 @@ static void read_coef_tree(Dav1dTileContext *const t,
    struct CodedBlockInfo *cbi;

    if (f->frame_thread.pass) {
        assert(ts->frame_thread.cf);
        cf = ts->frame_thread.cf;
        ts->frame_thread.cf += imin(t_dim->w, 8) * imin(t_dim->h, 8) * 16;
        cbi = &f->frame_thread.cbi[t->by * f->b4_stride + t->bx];
@ -1149,6 +1150,7 @@ void bytefn(dav1d_recon_b_intra)(Dav1dTileContext *const t, const enum BlockSize
            4 * (t->by * PXSTRIDE(f->cur.stride[0]) + t->bx);
        const uint8_t *pal_idx;
        if (f->frame_thread.pass) {
            assert(ts->frame_thread.pal_idx);
            pal_idx = ts->frame_thread.pal_idx;
            ts->frame_thread.pal_idx += bw4 * bh4 * 16;
        } else {
@ -1345,6 +1347,7 @@ void bytefn(dav1d_recon_b_intra)(Dav1dTileContext *const t, const enum BlockSize
        const uint16_t (*pal)[8];
        const uint8_t *pal_idx;
        if (f->frame_thread.pass) {
            assert(ts->frame_thread.pal_idx);
            pal = f->frame_thread.pal[((t->by >> 1) + (t->bx & 1)) * (f->b4_stride >> 1) +
                                      ((t->bx >> 1) + (t->by & 1))];
            pal_idx = ts->frame_thread.pal_idx;
2
third_party/dav1d/src/tables.c
vendored
@ -406,7 +406,7 @@ const int16_t dav1d_sgr_params[16][4] = { // r0, r1, e0, e1
    { 2, 0, 22, -1 },
};

const uint8_t dav1d_sgr_x_by_x[256] = {
const uint8_t ALIGN(dav1d_sgr_x_by_x[256], 16) = {
    255, 128, 85, 64, 51, 43, 37, 32, 28, 26, 23, 21, 20, 18, 17,
    16, 15, 14, 13, 13, 12, 12, 11, 11, 10, 10, 9, 9, 9, 9,
    8, 8, 8, 8, 7, 7, 7, 7, 7, 6, 6, 6, 6, 6, 6,
44
third_party/dav1d/src/x86/film_grain.asm
vendored
@ -27,7 +27,7 @@
|
||||
|
||||
%if ARCH_X86_64
|
||||
|
||||
SECTION_RODATA
|
||||
SECTION_RODATA 32
|
||||
pw_1024: times 16 dw 1024
|
||||
pb_mask: db 0, 0x80, 0x80, 0, 0x80, 0, 0, 0x80, 0x80, 0, 0, 0x80, 0, 0x80, 0x80, 0
|
||||
rnd_next_upperbit_mask: dw 0x100B, 0x2016, 0x402C, 0x8058
|
||||
@ -609,6 +609,8 @@ cglobal generate_grain_uv_420, 4, 10, 16, buf, bufy, fg_data, uv
|
||||
movd xm15, [base+hmul_bits-10+shiftq*2]
|
||||
pmovsxbw xm8, [fg_dataq+FGData.ar_coeffs_uv+uvq+0] ; cf0-7
|
||||
pmovsxbw xm9, [fg_dataq+FGData.ar_coeffs_uv+uvq+8] ; cf8-12
|
||||
vpbroadcastw xm7, [base+hmul_bits+4]
|
||||
vpbroadcastd xm6, [base+pb_1]
|
||||
DEFINE_ARGS buf, bufy, h, x
|
||||
pshufd xm12, xm9, q0000
|
||||
pshufd xm13, xm9, q1111
|
||||
@ -639,31 +641,28 @@ cglobal generate_grain_uv_420, 4, 10, 16, buf, bufy, fg_data, uv
|
||||
|
||||
psrldq xm4, xm0, 4 ; y=-2,x=[+0,+5]
|
||||
psrldq xm5, xm0, 6 ; y=-2,x=[+1,+5]
|
||||
psrldq xm6, xm0, 8 ; y=-2,x=[+2,+5]
|
||||
psrldq xm0, 8 ; y=-2,x=[+2,+5]
|
||||
punpcklwd xm4, xm5
|
||||
punpcklwd xm6, xm1
|
||||
psrldq xm7, xm1, 6 ; y=-1,x=[+1,+5]
|
||||
punpcklwd xm0, xm1
|
||||
psrldq xm3, xm1, 6 ; y=-1,x=[+1,+5]
|
||||
psrldq xm1, xm1, 8 ; y=-1,x=[+2,+5]
|
||||
punpcklwd xm7, xm1
|
||||
punpcklwd xm3, xm1
|
||||
pmaddwd xm4, xm9
|
||||
pmaddwd xm6, xm10
|
||||
pmaddwd xm7, xm12
|
||||
paddd xm4, xm6
|
||||
paddd xm2, xm7
|
||||
pmaddwd xm0, xm10
|
||||
pmaddwd xm3, xm12
|
||||
paddd xm4, xm0
|
||||
paddd xm2, xm3
|
||||
paddd xm2, xm4
|
||||
|
||||
vpbroadcastd xm4, [base+pb_1]
|
||||
movq xm6, [bufyq+xq*2]
|
||||
movq xm7, [bufyq+xq*2+82]
|
||||
pmaddubsw xm6, xm4, xm6
|
||||
pmaddubsw xm7, xm4, xm7
|
||||
vpbroadcastw xm4, [base+hmul_bits+4]
|
||||
paddw xm6, xm7
|
||||
pmulhrsw xm6, xm4
|
||||
pxor xm7, xm7
|
||||
punpcklwd xm6, xm7
|
||||
pmaddwd xm6, xm14
|
||||
paddd xm2, xm6
|
||||
movq xm0, [bufyq+xq*2]
|
||||
movq xm3, [bufyq+xq*2+82]
|
||||
pmaddubsw xm0, xm6, xm0
|
||||
pmaddubsw xm3, xm6, xm3
|
||||
paddw xm0, xm3
|
||||
pmulhrsw xm0, xm7
|
||||
punpcklwd xm0, xm0
|
||||
pmaddwd xm0, xm14
|
||||
paddd xm2, xm0
|
||||
|
||||
movq xm0, [bufq+xq-2] ; y=0,x=[-2,+5]
|
||||
.x_loop_ar2_inner:
|
||||
@ -807,8 +806,7 @@ cglobal generate_grain_uv_420, 4, 10, 16, buf, bufy, fg_data, uv
|
||||
pmaddubsw xm1, xm13, xm1
|
||||
pmaddubsw xm2, xm13, xm2
|
||||
paddw xm1, xm2
|
||||
vpbroadcastw xm3, xm15
|
||||
pmulhrsw xm1, xm3
|
||||
pmulhrsw xm1, xm15
|
||||
|
||||
punpcklwd xm6, xm7
|
||||
punpcklwd xm8, xm9
|
||||
|
187
third_party/dav1d/src/x86/ipred.asm
vendored
@ -1485,7 +1485,7 @@ ALIGN function_align
|
||||
pmaddubsw m0, m1
|
||||
pcmpgtw m1, m9, m6 ; base < max_base_x
|
||||
pmulhrsw m0, m3
|
||||
paddsw m6, m10 ; xpos += dx
|
||||
paddw m6, m10 ; xpos += dx
|
||||
lea r5, [dstq+strideq*2]
|
||||
vpblendvb m0, m7, m0, m1
|
||||
packuswb m0, m0
|
||||
@ -1494,9 +1494,9 @@ ALIGN function_align
|
||||
pextrd [r5 +strideq*1], xm0, 1
|
||||
movd [dstq+strideq*0], xm1
|
||||
pextrd [dstq+strideq*1], xm1, 1
|
||||
lea dstq, [dstq+strideq*4]
|
||||
sub hd, 4
|
||||
jz .w4_end
|
||||
lea dstq, [dstq+strideq*4]
|
||||
cmp r3d, maxbased
|
||||
jb .w4_loop
|
||||
packuswb xm7, xm7
|
||||
@ -1662,16 +1662,16 @@ ALIGN function_align
|
||||
pshufb m0, m8
|
||||
pmaddubsw m0, m1
|
||||
pcmpgtw m1, m9, m2
|
||||
paddsw m2, m6
|
||||
paddw m2, m6
|
||||
pmulhrsw m0, m3
|
||||
vpblendvb m0, m7, m0, m1
|
||||
vextracti128 xm1, m0, 1
|
||||
packuswb xm0, xm1
|
||||
movq [dstq+strideq*0], xm0
|
||||
movhps [dstq+strideq*1], xm0
|
||||
lea dstq, [dstq+strideq*2]
|
||||
sub hd, 2
|
||||
jz .w8_end
|
||||
lea dstq, [dstq+strideq*2]
|
||||
cmp r3d, maxbased
|
||||
jb .w8_loop
|
||||
packuswb xm7, xm7
|
||||
@ -1788,13 +1788,13 @@ ALIGN function_align
|
||||
pcmpgtw m1, m9, m6
|
||||
pcmpgtw m2, m10, m6
|
||||
packsswb m1, m2
|
||||
paddsw m6, m11
|
||||
paddw m6, m11
|
||||
vpblendvb m0, m7, m0, m1
|
||||
mova [dstq+strideq*0], xm0
|
||||
vextracti128 [dstq+strideq*1], m0, 1
|
||||
lea dstq, [dstq+strideq*2]
|
||||
sub hd, 2
|
||||
jz .w16_end
|
||||
lea dstq, [dstq+strideq*2]
|
||||
cmp r3d, maxbased
|
||||
jb .w16_loop
|
||||
.w16_end_loop:
|
||||
@ -1903,20 +1903,20 @@ ALIGN function_align
|
||||
movd xm9, maxbased
|
||||
vbroadcasti128 m8, [z_filter_s+2]
|
||||
vpbroadcastw m9, xm9
|
||||
mov r3d, dxd
|
||||
mov r5d, dxd
|
||||
psubw m9, [z_base_inc]
|
||||
mova m11, m6
|
||||
psubw m10, m9, m3 ; 64*8
|
||||
.w32_loop:
|
||||
mov r5d, r3d
|
||||
shr r5d, 6
|
||||
mov r3d, r5d
|
||||
shr r3d, 6
|
||||
pand m1, m4, m6
|
||||
psubw m2, m5, m1
|
||||
psllw m1, 8
|
||||
por m2, m1
|
||||
movu m0, [tlq+r5+0]
|
||||
movu m1, [tlq+r5+8]
|
||||
add r3d, dxd
|
||||
movu m0, [tlq+r3+0]
|
||||
movu m1, [tlq+r3+8]
|
||||
add r5d, dxd
|
||||
pshufb m0, m8
|
||||
pshufb m1, m8
|
||||
pmaddubsw m0, m2
|
||||
@ -1927,13 +1927,13 @@ ALIGN function_align
|
||||
pcmpgtw m1, m9, m6
|
||||
pcmpgtw m2, m10, m6
|
||||
packsswb m1, m2
|
||||
paddsw m6, m11
|
||||
paddw m6, m11
|
||||
vpblendvb m0, m7, m0, m1
|
||||
mova [dstq], m0
|
||||
add dstq, strideq
|
||||
dec hd
|
||||
jz .w32_end
|
||||
cmp r3d, maxbased
|
||||
add dstq, strideq
|
||||
cmp r5d, maxbased
|
||||
jb .w32_loop
|
||||
test hb, 1
|
||||
jz .w32_end_loop
|
||||
@ -2074,25 +2074,23 @@ ALIGN function_align
|
||||
mova [tlq+32*2], m0
|
||||
mova [tlq+32*3], m1
|
||||
.w64_main:
|
||||
movd xm6, dxd
|
||||
movd xm12, dxd
|
||||
vpbroadcastb m7, [tlq+maxbaseq]
|
||||
lea r3d, [dxq-64]
|
||||
shl maxbased, 6
|
||||
vpbroadcastw m6, xm6
|
||||
movd xm10, maxbased
|
||||
vpbroadcastw m12, xm12
|
||||
sub r3d, maxbased
|
||||
vbroadcasti128 m8, [z_filter_s+2]
|
||||
mov r3d, dxd
|
||||
vpbroadcastw m10, xm10
|
||||
psllw m0, m3, 2 ; 64*32
|
||||
psubw m10, [z_base_inc]
|
||||
mova m14, m6
|
||||
psubw m11, m10, m3 ; 64*8
|
||||
psubw m12, m10, m0
|
||||
psubw m13, m11, m0
|
||||
movd xm6, r3d
|
||||
mov r5d, dxd
|
||||
mova m10, [pb_1to32]
|
||||
vpbroadcastd m11, [pb_32]
|
||||
vpbroadcastw m6, xm6
|
||||
.w64_loop:
|
||||
mov r5d, r3d
|
||||
shr r5d, 6
|
||||
movu m0, [tlq+r5+ 0]
|
||||
movu m1, [tlq+r5+ 8]
|
||||
mov r3d, r5d
|
||||
shr r3d, 6
|
||||
movu m0, [tlq+r3+ 0]
|
||||
movu m1, [tlq+r3+ 8]
|
||||
pand m2, m4, m6
|
||||
psubw m9, m5, m2
|
||||
psllw m2, 8
|
||||
@ -2101,34 +2099,32 @@ ALIGN function_align
|
||||
pshufb m1, m8
|
||||
pmaddubsw m0, m9
|
||||
pmaddubsw m1, m9
|
||||
psraw m2, m6, 6
|
||||
pmulhrsw m0, m3
|
||||
pmulhrsw m1, m3
|
||||
packsswb m2, m2
|
||||
paddb m2, m10
|
||||
packuswb m0, m1
|
||||
pcmpgtw m1, m10, m6
|
||||
pcmpgtw m2, m11, m6
|
||||
packsswb m1, m2
|
||||
vpblendvb m2, m7, m0, m1
|
||||
movu m0, [tlq+r5+32]
|
||||
movu m1, [tlq+r5+40]
|
||||
add r3d, dxd
|
||||
mova [dstq+ 0], m2
|
||||
vpblendvb m0, m7, m0, m2
|
||||
mova [dstq+ 0], m0
|
||||
movu m0, [tlq+r3+32]
|
||||
movu m1, [tlq+r3+40]
|
||||
add r5d, dxd
|
||||
pshufb m0, m8
|
||||
pshufb m1, m8
|
||||
pmaddubsw m0, m9
|
||||
pmaddubsw m1, m9
|
||||
pcmpgtw m9, m12, m6
|
||||
pcmpgtw m2, m13, m6
|
||||
paddb m2, m11
|
||||
pmulhrsw m0, m3
|
||||
pmulhrsw m1, m3
|
||||
paddsw m6, m14
|
||||
packsswb m9, m2
|
||||
paddw m6, m12
|
||||
packuswb m0, m1
|
||||
vpblendvb m0, m7, m0, m9
|
||||
vpblendvb m0, m7, m0, m2
|
||||
mova [dstq+32], m0
|
||||
add dstq, strideq
|
||||
dec hd
|
||||
jz .w64_end
|
||||
cmp r3d, maxbased
|
||||
add dstq, strideq
|
||||
cmp r5d, maxbased
|
||||
jb .w64_loop
|
||||
.w64_end_loop:
|
||||
mova [dstq+ 0], m7
|
||||
@ -2384,7 +2380,7 @@ ALIGN function_align
|
||||
vpblendvb m0, m1, m2
|
||||
.w4_toponly:
|
||||
pmulhrsw m0, m13
|
||||
paddsw m6, m7 ; xpos += dx
|
||||
paddw m6, m7 ; xpos += dx
|
||||
add r5, dyq
|
||||
packuswb m0, m0
|
||||
vextracti128 xm1, m0, 1
|
||||
@ -2392,9 +2388,9 @@ ALIGN function_align
|
||||
pextrd [dstq+r9 ], xm0, 1
|
||||
movd [dstq+strideq*0], xm1
|
||||
pextrd [dstq+strideq*1], xm1, 1
|
||||
lea dstq, [dstq+strideq*4]
|
||||
sub hd, 4
|
||||
jz .w4_end
|
||||
lea dstq, [dstq+strideq*4]
|
||||
cmp r2d, r8d
|
||||
jge .w4_loop
|
||||
.w4_leftonly_loop:
|
||||
@ -2604,7 +2600,7 @@ ALIGN function_align
|
||||
.w8_toponly:
|
||||
pmulhrsw m0, m13
|
||||
pmulhrsw m1, m13
|
||||
paddsw m6, m4, m7 ; xpos += dx
|
||||
paddw m6, m4, m7 ; xpos += dx
|
||||
add r5, dyq
|
||||
packuswb m0, m1
|
||||
vextracti128 xm1, m0, 1
|
||||
@ -2612,9 +2608,9 @@ ALIGN function_align
|
||||
movhps [dstq+strideq*2], xm0
|
||||
movq [dstq+strideq*1], xm1
|
||||
movhps [dstq+r9 ], xm1
|
||||
lea dstq, [dstq+strideq*4]
|
||||
sub hd, 4
|
||||
jz .w8_end
|
||||
lea dstq, [dstq+strideq*4]
|
||||
cmp r2d, r8d
|
||||
jge .w8_loop
|
||||
.w8_leftonly_loop:
|
||||
@ -2841,15 +2837,15 @@ ALIGN function_align
|
||||
.w16_toponly:
|
||||
pmulhrsw m0, m13
|
||||
pmulhrsw m1, m13
|
||||
paddsw m6, m5, m7 ; xpos += dx
|
||||
paddw m6, m5, m7 ; xpos += dx
|
||||
sub r5, 2
|
||||
packuswb m0, m1
|
||||
vpermq m0, m0, q3120
|
||||
mova [dstq+strideq*0], xm0
|
||||
vextracti128 [dstq+strideq*1], m0, 1
|
||||
lea dstq, [dstq+strideq*2]
|
||||
sub hd, 2
|
||||
jz .w16_end
|
||||
lea dstq, [dstq+strideq*2]
|
||||
cmp r2d, (63-16)<<6
|
||||
jge .w16_loop
|
||||
.w16_leftonly_loop:
|
||||
@ -3135,9 +3131,9 @@ ALIGN function_align
|
||||
vpbroadcastb m7, [r4]
|
||||
lea r4, [dyq+63] ; ypos
|
||||
movd xm9, maxbased
|
||||
sub maxbased, 63
|
||||
not maxbased
|
||||
vbroadcasti128 m8, [z3_shuf_w4]
|
||||
neg maxbaseq
|
||||
add maxbased, 64
|
||||
vpbroadcastw m9, xm9
|
||||
psrlw m7, 8 ; top[max_base_y]
|
||||
paddw m10, m6, m6
|
||||
@ -3170,7 +3166,7 @@ ALIGN function_align
|
||||
pmaddubsw m0, m1
|
||||
pcmpgtw m1, m9, m6 ; base < max_base_y
|
||||
pmulhrsw m0, m3
|
||||
paddsw m6, m10 ; ypos += dy
|
||||
paddw m6, m10 ; ypos += dy
|
||||
vpblendvb m0, m7, m0, m1
|
||||
vextracti128 xm1, m0, 1
|
||||
packuswb xm1, xm0
|
||||
@ -3179,9 +3175,9 @@ ALIGN function_align
|
||||
pextrd [dstq+strideq*1], xm1, 1
|
||||
pextrd [dstq+strideq*2], xm1, 2
|
||||
pextrd [dstq+r7 ], xm1, 3
|
||||
add dstq, 4
|
||||
sub wd, 4
|
||||
jz .h4_end
|
||||
add dstq, 4
|
||||
cmp r4d, maxbased
|
||||
jg .h4_loop
|
||||
packuswb xm7, xm7
|
||||
@ -3344,9 +3340,9 @@ ALIGN function_align
|
||||
vpbroadcastb m7, [r4]
|
||||
lea r4, [dyq+63]
|
||||
movd xm9, maxbased
|
||||
sub maxbased, 63
|
||||
not maxbased
|
||||
vbroadcasti128 m8, [z3_shuf]
|
||||
neg maxbaseq
|
||||
add maxbased, 64
|
||||
vpbroadcastw m9, xm9
|
||||
psrlw m7, 8
|
||||
psubw m9, m0
|
||||
@ -3367,7 +3363,7 @@ ALIGN function_align
|
||||
pshufb m0, m8
|
||||
pmaddubsw m0, m1
|
||||
pcmpgtw m1, m9, m2
|
||||
paddsw m2, m6
|
||||
paddw m2, m6
|
||||
pmulhrsw m0, m3
|
||||
vpblendvb m0, m7, m0, m1
|
||||
vextracti128 xm1, m0, 1
|
||||
@ -3516,9 +3512,9 @@ ALIGN function_align
|
||||
vpbroadcastb m7, [r4]
|
||||
lea r4, [dyq+63]
|
||||
movd xm9, maxbased
|
||||
sub maxbased, 63
|
||||
not maxbased
|
||||
vbroadcasti128 m8, [z3_shuf]
|
||||
neg maxbaseq
|
||||
add maxbased, 64
|
||||
vpbroadcastw m9, xm9
|
||||
psubw m9, m0
|
||||
paddw m11, m6, m6
|
||||
@ -3548,7 +3544,7 @@ ALIGN function_align
|
||||
pcmpgtw m1, m9, m6
|
||||
pcmpgtw m2, m10, m6
|
||||
packsswb m1, m2
|
||||
paddsw m6, m11
|
||||
paddw m6, m11
|
||||
vpblendvb m0, m7, m0, m1
|
||||
vpermq m0, m0, q3120
|
||||
mova [rsp], m0
|
||||
@ -3742,9 +3738,9 @@ ALIGN function_align
|
||||
vpbroadcastb m7, [r4]
|
||||
lea r4, [dyq+63]
|
||||
movd xm9, maxbased
|
||||
sub maxbased, 63
|
||||
not maxbased
|
||||
vbroadcasti128 m8, [z3_shuf]
|
||||
neg maxbaseq
|
||||
add maxbased, 64
|
||||
vpbroadcastw m9, xm9
|
||||
psubw m9, [z_base_inc]
|
||||
mova m11, m6
|
||||
@ -3772,7 +3768,7 @@ ALIGN function_align
|
||||
pcmpgtw m1, m9, m6
|
||||
pcmpgtw m2, m10, m6
|
||||
packsswb m1, m2
|
||||
paddsw m6, m11
|
||||
paddw m6, m11
|
||||
vpblendvb m0, m7, m0, m1
|
||||
mova [rsp], m0
|
||||
dec wd
|
||||
@ -3996,33 +3992,26 @@ ALIGN function_align
|
||||
mova [tlq-63], m0
|
||||
mova [tlq-31], m1
|
||||
.h64_main:
|
||||
movd xm6, dyd
|
||||
mov r4, tlq
|
||||
sub tlq, 24
|
||||
neg dyq
|
||||
vpbroadcastw m6, xm6
|
||||
sub r4, maxbaseq
|
||||
shl maxbased, 6
|
||||
vpbroadcastb m7, [r4]
|
||||
lea r4, [dyq+63]
|
||||
movd xm10, maxbased
|
||||
sub maxbased, 63
|
||||
vbroadcasti128 m8, [z3_shuf]
|
||||
movd xm12, dyd
|
||||
neg maxbaseq
|
||||
mova xm1, [z_base_inc+16]
|
||||
vinserti128 m1, [z_base_inc], 1
|
||||
vpbroadcastw m10, xm10
|
||||
psllw m0, m3, 2 ; 64*32
|
||||
psubw m10, m1
|
||||
mova m14, m6
|
||||
psubw m11, m10, m3 ; 64*8
|
||||
psubw m12, m10, m0
|
||||
psubw m13, m11, m0
|
||||
vbroadcasti128 m8, [z3_shuf]
|
||||
vpbroadcastb m7, [tlq+maxbaseq]
|
||||
shl maxbased, 6
|
||||
vpbroadcastw m12, xm12
|
||||
lea r5d, [dyq+maxbaseq-64]
|
||||
neg dyq
|
||||
or maxbased, 63
|
||||
lea r4, [dyq+63]
|
||||
movd xm6, r5d
|
||||
mova xm10, [pb_1to32+16]
|
||||
vinserti128 m10, [pb_1to32], 1
|
||||
vpbroadcastd m11, [pb_32]
|
||||
vpbroadcastw m6, xm6
|
||||
.h64_loop:
|
||||
mov r5, r4
|
||||
sar r5, 6
|
||||
movu m0, [tlq+r5-0]
|
||||
movu m1, [tlq+r5-8]
|
||||
movu m0, [tlq+r5-24]
|
||||
movu m1, [tlq+r5-32]
|
||||
pand m2, m4, m6
|
||||
psubw m9, m5, m2
|
||||
psllw m2, 8
|
||||
@ -4031,30 +4020,28 @@ ALIGN function_align
|
||||
pshufb m1, m8
|
||||
pmaddubsw m0, m9
|
||||
pmaddubsw m1, m9
|
||||
psraw m2, m6, 6
|
||||
sub rsp, 64
|
||||
pmulhrsw m0, m3
|
||||
pmulhrsw m1, m3
|
||||
packsswb m2, m2
|
||||
paddb m2, m10
|
||||
packuswb m0, m1
|
||||
pcmpgtw m1, m10, m6
|
||||
pcmpgtw m2, m11, m6
|
||||
packsswb m1, m2
|
||||
vpblendvb m2, m7, m0, m1
|
||||
movu m0, [tlq+r5-32]
|
||||
movu m1, [tlq+r5-40]
|
||||
vpblendvb m0, m7, m0, m2
|
||||
mova [rsp+32], m0
|
||||
movu m0, [tlq+r5-56]
|
||||
movu m1, [tlq+r5-64]
|
||||
add r4, dyq
|
||||
sub rsp, 64
|
||||
mova [rsp+32], m2
|
||||
pshufb m0, m8
|
||||
pshufb m1, m8
|
||||
pmaddubsw m0, m9
|
||||
pmaddubsw m1, m9
|
||||
pcmpgtw m9, m12, m6
|
||||
pcmpgtw m2, m13, m6
|
||||
paddb m2, m11
|
||||
pmulhrsw m0, m3
|
||||
pmulhrsw m1, m3
|
||||
paddsw m6, m14
|
||||
packsswb m9, m2
|
||||
paddw m6, m12
|
||||
packuswb m0, m1
|
||||
vpblendvb m0, m7, m0, m9
|
||||
vpblendvb m0, m7, m0, m2
|
||||
mova [rsp], m0
|
||||
dec wd
|
||||
jz .h64_transpose
|
||||
|
319
third_party/dav1d/src/x86/itx.asm
vendored
@ -52,13 +52,15 @@ pw_m3803_3344: dw -3803, 3344
|
||||
pw_m3803_m6688: dw -3803, -6688
|
||||
pw_2896_m2896: dw 2896, -2896
|
||||
|
||||
pw_5: times 2 dw 5
|
||||
pw_2048: times 2 dw 2048
|
||||
pw_4096: times 2 dw 4096
|
||||
pw_8192: times 2 dw 8192
|
||||
pw_16384: times 2 dw 16384
|
||||
pw_2896x8: times 2 dw 2896*8
|
||||
pw_5793x4: times 2 dw 5793*4
|
||||
pw_5: times 2 dw 5
|
||||
pw_2048: times 2 dw 2048
|
||||
pw_4096: times 2 dw 4096
|
||||
pw_8192: times 2 dw 8192
|
||||
pw_16384: times 2 dw 16384
|
||||
pw_1697x16: times 2 dw 1697*16
|
||||
pw_1697x8: times 2 dw 1697*8
|
||||
pw_2896x8: times 2 dw 2896*8
|
||||
pw_5793x4: times 2 dw 5793*4
|
||||
|
||||
pd_2048: dd 2048
|
||||
|
||||
@ -389,9 +391,9 @@ ALIGN function_align
|
||||
%ifidn %1_%2, dct_identity
|
||||
vpbroadcastd m0, [o(pw_2896x8)]
|
||||
pmulhrsw m0, [cq]
|
||||
vpbroadcastd m1, [o(pw_5793x4)]
|
||||
paddw m0, m0
|
||||
pmulhrsw m0, m1
|
||||
vpbroadcastd m1, [o(pw_1697x8)]
|
||||
pmulhrsw m1, m0
|
||||
paddw m0, m1
|
||||
punpcklwd m0, m0
|
||||
punpckhdq m1, m0, m0
|
||||
punpckldq m0, m0
|
||||
@ -399,12 +401,12 @@ ALIGN function_align
|
||||
%elifidn %1_%2, identity_dct
|
||||
mova m0, [cq+16*0]
|
||||
packusdw m0, [cq+16*1]
|
||||
vpbroadcastd m2, [o(pw_5793x4)]
|
||||
vpbroadcastd m3, [o(pw_2896x8)]
|
||||
vpbroadcastd m1, [o(pw_1697x8)]
|
||||
vpbroadcastd m2, [o(pw_2896x8)]
|
||||
packusdw m0, m0
|
||||
paddw m0, m0
|
||||
pmulhrsw m1, m0
|
||||
paddw m0, m1
|
||||
pmulhrsw m0, m2
|
||||
pmulhrsw m0, m3
|
||||
mova m1, m0
|
||||
jmp m(iadst_4x4_internal).end
|
||||
%elif %3 >= 0
|
||||
@ -556,22 +558,22 @@ INV_TXFM_4X4_FN identity, identity
|
||||
cglobal iidentity_4x4_internal, 0, 5, 6, dst, stride, c, eob, tx2
|
||||
mova m0, [cq+16*0]
|
||||
mova m1, [cq+16*1]
|
||||
vpbroadcastd m2, [o(pw_5793x4)]
|
||||
paddw m0, m0
|
||||
paddw m1, m1
|
||||
pmulhrsw m0, m2
|
||||
pmulhrsw m1, m2
|
||||
vpbroadcastd m3, [o(pw_1697x8)]
|
||||
pmulhrsw m2, m3, m0
|
||||
pmulhrsw m3, m1
|
||||
paddw m0, m2
|
||||
paddw m1, m3
|
||||
punpckhwd m2, m0, m1
|
||||
punpcklwd m0, m1
|
||||
punpckhwd m1, m0, m2
|
||||
punpcklwd m0, m2
|
||||
jmp tx2q
|
||||
.pass2:
|
||||
vpbroadcastd m2, [o(pw_5793x4)]
|
||||
paddw m0, m0
|
||||
paddw m1, m1
|
||||
pmulhrsw m0, m2
|
||||
pmulhrsw m1, m2
|
||||
vpbroadcastd m3, [o(pw_1697x8)]
|
||||
pmulhrsw m2, m3, m0
|
||||
pmulhrsw m3, m1
|
||||
paddw m0, m2
|
||||
paddw m1, m3
|
||||
jmp m(iadst_4x4_internal).end
|
||||
|
||||
%macro WRITE_4X8 2 ; coefs[1-2]
|
||||
@ -619,12 +621,12 @@ cglobal iidentity_4x4_internal, 0, 5, 6, dst, stride, c, eob, tx2
|
||||
movd xm1, [cq+16*2]
|
||||
punpcklwd xm1, [cq+16*3]
|
||||
vpbroadcastd xm2, [o(pw_2896x8)]
|
||||
vpbroadcastd xm3, [o(pw_5793x4)]
|
||||
vpbroadcastd xm3, [o(pw_1697x8)]
|
||||
vpbroadcastd xm4, [o(pw_2048)]
|
||||
punpckldq xm0, xm1
|
||||
pmulhrsw xm0, xm2
|
||||
paddw xm0, xm0
|
||||
pmulhrsw xm0, xm3
|
||||
pmulhrsw xm3, xm0
|
||||
paddw xm0, xm3
|
||||
pmulhrsw xm0, xm2
|
||||
pmulhrsw xm0, xm4
|
||||
vpbroadcastq m0, xm0
|
||||
@ -896,17 +898,17 @@ cglobal iidentity_4x8_internal, 0, 5, 7, dst, stride, c, eob, tx2
|
||||
vpermq m2, [cq+32*0], q3120
|
||||
vpermq m0, [cq+32*1], q3120
|
||||
vpbroadcastd m3, [o(pw_2896x8)]
|
||||
vpbroadcastd m4, [o(pw_5793x4)]
|
||||
vpbroadcastd m4, [o(pw_1697x8)]
|
||||
punpcklwd m1, m2, m0
|
||||
punpckhwd m2, m0
|
||||
pmulhrsw m1, m3
|
||||
pmulhrsw m2, m3
|
||||
punpcklwd m0, m1, m2
|
||||
punpckhwd m1, m2
|
||||
paddw m0, m0
|
||||
paddw m1, m1
|
||||
pmulhrsw m0, m4
|
||||
pmulhrsw m1, m4
|
||||
pmulhrsw m2, m4, m0
|
||||
pmulhrsw m4, m1
|
||||
paddw m0, m2
|
||||
paddw m1, m4
|
||||
jmp tx2q
|
||||
.pass2:
|
||||
vpbroadcastd m4, [o(pw_4096)]
|
||||
@ -919,11 +921,12 @@ cglobal iidentity_4x8_internal, 0, 5, 7, dst, stride, c, eob, tx2
|
||||
vpbroadcastd m0, [o(pw_2896x8)]
|
||||
pmulhrsw m0, [cq]
|
||||
vpbroadcastd m1, [o(pw_16384)]
|
||||
vpbroadcastd m2, [o(pw_5793x4)]
|
||||
vpbroadcastd m2, [o(pw_1697x16)]
|
||||
vpbroadcastd m3, [o(pw_2048)]
|
||||
pmulhrsw m0, m1
|
||||
psllw m0, 2
|
||||
pmulhrsw m0, m2
|
||||
pmulhrsw m2, m0
|
||||
paddw m0, m0
|
||||
paddw m0, m2
|
||||
pmulhrsw m3, m0
|
||||
punpcklwd m1, m3, m3
|
||||
punpckhwd m3, m3
|
||||
@ -937,12 +940,12 @@ cglobal iidentity_4x8_internal, 0, 5, 7, dst, stride, c, eob, tx2
|
||||
punpcklwd xm0, [cq+32*1]
|
||||
movd xm1, [cq+32*2]
|
||||
punpcklwd xm1, [cq+32*3]
|
||||
vpbroadcastd xm2, [o(pw_5793x4)]
|
||||
vpbroadcastd xm2, [o(pw_1697x8)]
|
||||
vpbroadcastd xm3, [o(pw_16384)]
|
||||
vpbroadcastd xm4, [o(pw_2896x8)]
|
||||
punpckldq xm0, xm1
|
||||
paddw xm0, xm0
|
||||
pmulhrsw xm0, xm2
|
||||
pmulhrsw xm2, xm0
|
||||
paddw xm0, xm2
|
||||
pmulhrsw xm0, xm3
|
||||
psrlw xm3, 3 ; pw_2048
|
||||
pmulhrsw xm0, xm4
|
||||
@ -1281,13 +1284,19 @@ cglobal iidentity_4x16_internal, 0, 5, 11, dst, stride, c, eob, tx2
|
||||
mova m2, [cq+32*1]
|
||||
mova m4, [cq+32*2]
|
||||
mova m0, [cq+32*3]
|
||||
vpbroadcastd m5, [o(pw_5793x4)]
|
||||
vpbroadcastd m5, [o(pw_1697x8)]
|
||||
punpcklwd m1, m3, m2
|
||||
punpckhwd m3, m2
|
||||
punpcklwd m2, m4, m0
|
||||
punpckhwd m4, m0
|
||||
REPX {paddw x, x }, m1, m2, m3, m4
|
||||
REPX {pmulhrsw x, m5}, m1, m2, m3, m4
|
||||
pmulhrsw m0, m5, m1
|
||||
pmulhrsw m6, m5, m2
|
||||
pmulhrsw m7, m5, m3
|
||||
pmulhrsw m5, m4
|
||||
paddw m1, m0
|
||||
paddw m2, m6
|
||||
paddw m3, m7
|
||||
paddw m4, m5
|
||||
vpbroadcastd m5, [o(pw_16384)]
|
||||
punpckldq m0, m1, m2
|
||||
punpckhdq m1, m2
|
||||
@ -1296,10 +1305,17 @@ cglobal iidentity_4x16_internal, 0, 5, 11, dst, stride, c, eob, tx2
|
||||
REPX {pmulhrsw x, m5}, m0, m1, m2, m3
|
||||
jmp tx2q
|
||||
.pass2:
|
||||
vpbroadcastd m4, [o(pw_5793x4)]
|
||||
vpbroadcastd m8, [o(pw_1697x16)]
|
||||
vpbroadcastd m5, [o(pw_2048)]
|
||||
REPX {psllw x, 2 }, m0, m1, m2, m3
|
||||
REPX {pmulhrsw x, m4}, m0, m1, m2, m3
|
||||
pmulhrsw m4, m8, m0
|
||||
pmulhrsw m6, m8, m1
|
||||
pmulhrsw m7, m8, m2
|
||||
pmulhrsw m8, m3
|
||||
REPX {paddw x, x}, m0, m1, m2, m3
|
||||
paddw m0, m4
|
||||
paddw m1, m6
|
||||
paddw m2, m7
|
||||
paddw m3, m8
|
||||
jmp m(iadst_4x16_internal).end2
|
||||
|
||||
%macro WRITE_8X4 4-7 strideq*1, strideq*2, r3, ; coefs[1-2], tmp[1-2], off[1-3]
|
||||
@ -1333,11 +1349,11 @@ cglobal iidentity_4x16_internal, 0, 5, 11, dst, stride, c, eob, tx2
|
||||
%ifidn %1_%2, dct_identity
|
||||
vpbroadcastd xm0, [o(pw_2896x8)]
|
||||
pmulhrsw xm1, xm0, [cq]
|
||||
vpbroadcastd xm2, [o(pw_5793x4)]
|
||||
vpbroadcastd xm2, [o(pw_1697x8)]
|
||||
vpbroadcastd xm3, [o(pw_2048)]
|
||||
pmulhrsw xm1, xm0
|
||||
paddw xm1, xm1
|
||||
pmulhrsw xm1, xm2
|
||||
pmulhrsw xm2, xm1
|
||||
paddw xm1, xm2
|
||||
pmulhrsw xm1, xm3
|
||||
punpcklwd xm1, xm1
|
||||
punpckldq xm0, xm1, xm1
|
||||
@ -1508,11 +1524,11 @@ cglobal iidentity_8x4_internal, 0, 5, 7, dst, stride, c, eob, tx2
|
||||
paddw m1, m1
|
||||
jmp tx2q
|
||||
.pass2:
|
||||
vpbroadcastd m2, [o(pw_5793x4)]
|
||||
paddw m0, m0
|
||||
paddw m1, m1
|
||||
pmulhrsw m0, m2
|
||||
pmulhrsw m1, m2
|
||||
vpbroadcastd m3, [o(pw_1697x8)]
|
||||
pmulhrsw m2, m3, m0
|
||||
pmulhrsw m3, m1
|
||||
paddw m0, m2
|
||||
paddw m1, m3
|
||||
jmp m(iadst_8x4_internal).end
|
||||
|
||||
%macro INV_TXFM_8X8_FN 2-3 -1 ; type1, type2, fast_thresh
|
||||
@ -1773,14 +1789,15 @@ cglobal iidentity_8x8_internal, 0, 5, 7, dst, stride, c, eob, tx2
|
||||
vpbroadcastd m0, [o(pw_2896x8)]
|
||||
pmulhrsw m7, m0, [cq]
|
||||
vpbroadcastd m1, [o(pw_16384)]
|
||||
vpbroadcastd m2, [o(pw_5793x4)]
|
||||
vpbroadcastd m2, [o(pw_1697x16)]
|
||||
pxor m3, m3
|
||||
mova [cq], m3
|
||||
pmulhrsw m7, m0
|
||||
pmulhrsw m7, m1
|
||||
psrlw m1, 3 ; pw_2048
|
||||
psllw m7, 2
|
||||
pmulhrsw m7, m2
|
||||
pmulhrsw m2, m7
|
||||
paddw m7, m7
|
||||
paddw m7, m2
|
||||
pmulhrsw m7, m1
|
||||
punpcklwd m5, m7, m7
|
||||
punpckhwd m7, m7
|
||||
@ -2101,6 +2118,16 @@ INV_TXFM_8X16_FN identity, adst
|
||||
INV_TXFM_8X16_FN identity, flipadst
|
||||
INV_TXFM_8X16_FN identity, identity
|
||||
|
||||
%macro IDTX16 3-4 ; src/dst, tmp, pw_1697x16, [pw_16384]
|
||||
pmulhrsw m%2, m%3, m%1
|
||||
%if %0 == 4 ; if we're going to downshift by 1 doing so here eliminates the paddw
|
||||
pmulhrsw m%2, m%4
|
||||
%else
|
||||
paddw m%1, m%1
|
||||
%endif
|
||||
paddw m%1, m%2
|
||||
%endmacro
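As a reference for the arithmetic above: pmulhrsw is a rounded multiply by n/32768, so pw_1697x16 scales a coefficient by about 1697/2048 (0.829). A scalar C sketch of what IDTX16 computes per 16-bit coefficient is shown below; the helper names are illustrative, not dav1d API, and 16-bit wraparound is left unhandled as in the packed instructions.

#include <stdint.h>

/* Rounded high multiply, as pmulhrsw does per lane: (a*b + 0x4000) >> 15. */
static int16_t mulhrs(int16_t a, int16_t b) {
    return (int16_t)(((int32_t)a * b + 0x4000) >> 15);
}

/* 3-arg form: out = 2*x + round(x*1697*16/32768), i.e. roughly 2*sqrt(2)*x. */
static int16_t idtx16(int16_t x) {
    return (int16_t)(2 * x + mulhrs(x, 1697 * 16));
}

/* 4-arg form folds in a downshift by 1: out = x + round(tmp/2), roughly sqrt(2)*x. */
static int16_t idtx16_half(int16_t x) {
    int16_t tmp = mulhrs(x, 1697 * 16);
    return (int16_t)(x + mulhrs(tmp, 16384));
}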
|
||||
|
||||
cglobal iidentity_8x16_internal, 0, 5, 13, dst, stride, c, eob, tx2
|
||||
mova xm3, [cq+16*0]
|
||||
mova xm2, [cq+16*2]
|
||||
@ -2139,10 +2166,9 @@ cglobal iidentity_8x16_internal, 0, 5, 13, dst, stride, c, eob, tx2
|
||||
punpckhdq m7, m8
|
||||
jmp tx2q
|
||||
.pass2:
|
||||
vpbroadcastd m8, [o(pw_5793x4)]
|
||||
REPX {psllw x, 2 }, m0, m1, m2, m3, m4, m5, m6, m7
|
||||
vpbroadcastd m8, [o(pw_1697x16)]
|
||||
REPX {vpermq x, x, q3120}, m0, m1, m2, m3, m4, m5, m6, m7
|
||||
REPX {pmulhrsw x, m8 }, m0, m1, m2, m3, m4, m5, m6, m7
|
||||
REPX {IDTX16 x, 9, 8}, 0, 1, 2, 3, 4, 5, 6, 7
|
||||
jmp m(idct_8x16_internal).end
|
||||
|
||||
%macro WRITE_16X2 6 ; coefs[1-2], tmp[1-2], offset[1-2]
|
||||
@ -2171,11 +2197,11 @@ cglobal iidentity_8x16_internal, 0, 5, 13, dst, stride, c, eob, tx2
|
||||
vpbroadcastd xm3, [o(pw_2896x8)]
|
||||
pmulhrsw xm3, [cq]
|
||||
vpbroadcastd xm0, [o(pw_16384)]
|
||||
vpbroadcastd xm1, [o(pw_5793x4)]
|
||||
vpbroadcastd xm1, [o(pw_1697x8)]
|
||||
pmulhrsw xm3, xm0
|
||||
psrlw xm0, 3 ; pw_2048
|
||||
paddw xm3, xm3
|
||||
pmulhrsw xm3, xm1
|
||||
pmulhrsw xm1, xm3
|
||||
paddw xm3, xm1
|
||||
pmulhrsw xm3, xm0
|
||||
punpcklwd xm3, xm3
|
||||
punpckldq xm1, xm3, xm3
|
||||
@ -2194,15 +2220,15 @@ cglobal iidentity_8x16_internal, 0, 5, 13, dst, stride, c, eob, tx2
|
||||
mova xm3, [cq+16*3]
|
||||
vinserti128 m1, m1, [cq+16*6], 1
|
||||
vinserti128 m3, m3, [cq+16*7], 1
|
||||
vpbroadcastd m4, [o(pw_5793x4)]
|
||||
vpbroadcastd m4, [o(pw_1697x16)]
|
||||
vpbroadcastd m5, [o(pw_16384)]
|
||||
packusdw m0, m2
|
||||
packusdw m1, m3
|
||||
packusdw m0, m1
|
||||
vpbroadcastd m1, [o(pw_2896x8)]
|
||||
psllw m0, 2
|
||||
pmulhrsw m0, m4
|
||||
pmulhrsw m0, m5
|
||||
pmulhrsw m4, m0
|
||||
pmulhrsw m4, m5
|
||||
paddw m0, m4
|
||||
psrlw m5, 3 ; pw_2048
|
||||
pmulhrsw m0, m1
|
||||
pmulhrsw m0, m5
|
||||
@ -2462,28 +2488,40 @@ cglobal iidentity_16x4_internal, 0, 5, 11, dst, stride, c, eob, tx2
|
||||
mova xm1, [cq+16*3]
|
||||
vinserti128 m0, m0, [cq+16*6], 1
|
||||
vinserti128 m1, m1, [cq+16*7], 1
|
||||
vpbroadcastd m5, [o(pw_5793x4)]
|
||||
vpbroadcastd m7, [o(pw_1697x16)]
|
||||
vpbroadcastd m8, [o(pw_16384)]
|
||||
punpcklwd m3, m2, m4
|
||||
punpckhwd m2, m4
|
||||
punpcklwd m4, m0, m1
|
||||
punpckhwd m0, m1
|
||||
REPX {psllw x, 2}, m3, m2, m4, m0
|
||||
punpcklwd m1, m3, m2
|
||||
punpckhwd m3, m2
|
||||
punpcklwd m2, m4, m0
|
||||
punpckhwd m4, m0
|
||||
REPX {pmulhrsw x, m5}, m1, m3, m2, m4
|
||||
vpbroadcastd m5, [o(pw_16384)]
|
||||
pmulhrsw m0, m7, m1
|
||||
pmulhrsw m5, m7, m2
|
||||
pmulhrsw m6, m7, m3
|
||||
pmulhrsw m7, m4
|
||||
REPX {pmulhrsw x, m8}, m0, m5, m6, m7
|
||||
paddw m1, m0
|
||||
paddw m2, m5
|
||||
paddw m3, m6
|
||||
paddw m4, m7
|
||||
punpcklqdq m0, m1, m2
|
||||
punpckhqdq m1, m2
|
||||
punpcklqdq m2, m3, m4
|
||||
punpckhqdq m3, m4
|
||||
REPX {pmulhrsw x, m5}, m0, m1, m2, m3
|
||||
jmp tx2q
|
||||
.pass2:
|
||||
vpbroadcastd m4, [o(pw_5793x4)]
|
||||
REPX {paddw x, x }, m0, m1, m2, m3
|
||||
REPX {pmulhrsw x, m4}, m0, m1, m2, m3
|
||||
vpbroadcastd m7, [o(pw_1697x8)]
|
||||
pmulhrsw m4, m7, m0
|
||||
pmulhrsw m5, m7, m1
|
||||
pmulhrsw m6, m7, m2
|
||||
pmulhrsw m7, m3
|
||||
paddw m0, m4
|
||||
paddw m1, m5
|
||||
paddw m2, m6
|
||||
paddw m3, m7
|
||||
jmp m(iadst_16x4_internal).end
|
||||
|
||||
%macro INV_TXFM_16X8_FN 2-3 -1 ; type1, type2, fast_thresh
|
||||
@ -2532,7 +2570,7 @@ cglobal iidentity_16x4_internal, 0, 5, 11, dst, stride, c, eob, tx2
|
||||
mova m3, [cq+32*6]
|
||||
packusdw m3, [cq+32*7]
|
||||
vpbroadcastd m4, [o(pw_2896x8)]
|
||||
vpbroadcastd m5, [o(pw_5793x4)]
|
||||
vpbroadcastd m5, [o(pw_1697x16)]
|
||||
packusdw m0, m2
|
||||
packusdw m1, m3
|
||||
vpbroadcastd m2, [o(pw_16384)]
|
||||
@ -2541,9 +2579,9 @@ cglobal iidentity_16x4_internal, 0, 5, 11, dst, stride, c, eob, tx2
|
||||
vpermq m0, m0, q1100
|
||||
punpcklwd m0, m1
|
||||
pmulhrsw m0, m4
|
||||
psllw m0, 2
|
||||
pmulhrsw m0, m5
|
||||
pmulhrsw m0, m2
|
||||
pmulhrsw m5, m0
|
||||
pmulhrsw m5, m2
|
||||
paddw m0, m5
|
||||
psrlw m2, 3 ; pw_2048
|
||||
pmulhrsw m0, m4
|
||||
pmulhrsw m0, m2
|
||||
@ -2816,8 +2854,8 @@ cglobal iidentity_16x8_internal, 0, 5, 13, dst, stride, c, eob, tx2
|
||||
mova xm1, [cq-16*1]
|
||||
vinserti128 m0, m0, [cq+16*6], 1
|
||||
vinserti128 m1, m1, [cq+16*7], 1
|
||||
vpbroadcastd m9, [o(pw_5793x4)]
|
||||
vpbroadcastd m10, [o(pw_16384)]
|
||||
vpbroadcastd m10, [o(pw_1697x16)]
|
||||
vpbroadcastd m11, [o(pw_16384)]
|
||||
REPX {pmulhrsw x, m3}, m7, m2, m6, m4, m8, m5, m0, m1
|
||||
punpcklwd m3, m7, m2
|
||||
punpckhwd m7, m2
|
||||
@ -2827,7 +2865,6 @@ cglobal iidentity_16x8_internal, 0, 5, 13, dst, stride, c, eob, tx2
|
||||
punpckhwd m8, m5
|
||||
punpcklwd m5, m0, m1
|
||||
punpckhwd m0, m1
|
||||
REPX {psllw x, 2}, m3, m7, m2, m6, m4, m8, m5, m0
|
||||
punpckldq m1, m3, m2
|
||||
punpckhdq m3, m2
|
||||
punpckldq m2, m4, m5
|
||||
@ -2836,7 +2873,7 @@ cglobal iidentity_16x8_internal, 0, 5, 13, dst, stride, c, eob, tx2
|
||||
punpckhdq m7, m6
|
||||
punpckldq m6, m8, m0
|
||||
punpckhdq m8, m0
|
||||
REPX {pmulhrsw x, m9}, m1, m3, m2, m4, m5, m7, m6, m8
|
||||
REPX {IDTX16 x, 0, 10, 11}, 1, 3, 2, 4, 5, 7, 6, 8
|
||||
punpcklqdq m0, m1, m2
|
||||
punpckhqdq m1, m2
|
||||
punpcklqdq m2, m3, m4
|
||||
@ -2845,7 +2882,6 @@ cglobal iidentity_16x8_internal, 0, 5, 13, dst, stride, c, eob, tx2
|
||||
punpckhqdq m5, m6
|
||||
punpcklqdq m6, m7, m8
|
||||
punpckhqdq m7, m8
|
||||
REPX {pmulhrsw x, m10}, m0, m1, m2, m3, m4, m5, m6, m7
|
||||
jmp tx2q
|
||||
.pass2:
|
||||
vpbroadcastd m8, [o(pw_4096)]
|
||||
@ -2916,14 +2952,15 @@ cglobal iidentity_16x8_internal, 0, 5, 13, dst, stride, c, eob, tx2
|
||||
vinserti128 m2, m2, [cq+32*14], 1
|
||||
vinserti128 m4, m4, [cq+32*15], 1
|
||||
punpcklwd m1, m3
|
||||
vpbroadcastd m3, [o(pw_5793x4)]
|
||||
vpbroadcastd m3, [o(pw_1697x16)]
|
||||
punpcklwd m2, m4
|
||||
vpbroadcastd m4, [o(pw_8192)]
|
||||
punpckldq m1, m2
|
||||
vpbroadcastd m2, [o(pw_2896x8)]
|
||||
punpcklqdq m0, m1
|
||||
psllw m0, 2
|
||||
pmulhrsw m0, m3
|
||||
pmulhrsw m3, m0
|
||||
paddw m0, m0
|
||||
paddw m0, m3
|
||||
pmulhrsw m0, m4
|
||||
psrlw m4, 2 ; pw_2048
|
||||
pmulhrsw m0, m2
|
||||
@ -3352,47 +3389,47 @@ INV_TXFM_16X16_FN identity, dct, 15
|
||||
INV_TXFM_16X16_FN identity, identity
|
||||
|
||||
cglobal iidentity_16x16_internal, 0, 5, 16, 32*3, dst, stride, c, eob, tx2
|
||||
mova xm0, [cq+16*0]
|
||||
mova xm15, [cq+16*1]
|
||||
mova xm1, [cq+16*2]
|
||||
mova xm8, [cq+16*3]
|
||||
mova xm2, [cq+16*4]
|
||||
mova xm9, [cq+16*5]
|
||||
mova xm3, [cq+16*6]
|
||||
mova xm10, [cq+16*7]
|
||||
vpbroadcastd m7, [o(pw_1697x16)]
|
||||
mova xm0, [cq+16* 0]
|
||||
vinserti128 m0, [cq+16*16], 1
|
||||
mova xm15, [cq+16* 1]
|
||||
vinserti128 m15, [cq+16*17], 1
|
||||
mova xm1, [cq+16* 2]
|
||||
vinserti128 m1, [cq+16*18], 1
|
||||
mova xm8, [cq+16* 3]
|
||||
vinserti128 m8, [cq+16*19], 1
|
||||
mova xm2, [cq+16* 4]
|
||||
vinserti128 m2, [cq+16*20], 1
|
||||
mova xm9, [cq+16* 5]
|
||||
vinserti128 m9, [cq+16*21], 1
|
||||
mova xm3, [cq+16* 6]
|
||||
vinserti128 m3, [cq+16*22], 1
|
||||
mova xm10, [cq+16* 7]
|
||||
add cq, 16*16
|
||||
vinserti128 m0, m0, [cq+16*0], 1
|
||||
vinserti128 m15, m15, [cq+16*1], 1
|
||||
mova xm4, [cq-16*8]
|
||||
mova xm11, [cq-16*7]
|
||||
vinserti128 m1, m1, [cq+16*2], 1
|
||||
vinserti128 m8, m8, [cq+16*3], 1
|
||||
mova xm5, [cq-16*6]
|
||||
mova xm12, [cq-16*5]
|
||||
vinserti128 m2, m2, [cq+16*4], 1
|
||||
vinserti128 m9, m9, [cq+16*5], 1
|
||||
mova xm6, [cq-16*4]
|
||||
mova xm13, [cq-16*3]
|
||||
vinserti128 m3, m3, [cq+16*6], 1
|
||||
vinserti128 m10, m10, [cq+16*7], 1
|
||||
mova xm7, [cq-16*2]
|
||||
mova xm14, [cq-16*1]
|
||||
vinserti128 m4, m4, [cq+16*8], 1
|
||||
vinserti128 m11, m11, [cq+16*9], 1
|
||||
vinserti128 m5, m5, [cq+16*10], 1
|
||||
vinserti128 m12, m12, [cq+16*11], 1
|
||||
vinserti128 m6, m6, [cq+16*12], 1
|
||||
vinserti128 m13, m13, [cq+16*13], 1
|
||||
vinserti128 m7, m7, [cq+16*14], 1
|
||||
vinserti128 m14, m14, [cq+16*15], 1
|
||||
REPX {psllw x, 2 }, m0, m1, m2, m3, m4, m5, m6, m7, \
|
||||
m8, m9, m10, m11, m12, m13, m14, m15
|
||||
mova [rsp], m0
|
||||
vpbroadcastd m0, [o(pw_5793x4)]
|
||||
REPX {pmulhrsw x, m0}, m1, m2, m3, m4, m5, m6, m7, \
|
||||
m8, m9, m10, m11, m12, m13, m14, m15
|
||||
pmulhrsw m0, [rsp]
|
||||
vinserti128 m10, [cq+16* 7], 1
|
||||
mova xm4, [cq-16* 8]
|
||||
vinserti128 m4, [cq+16* 8], 1
|
||||
mova xm11, [cq-16* 7]
|
||||
vinserti128 m11, [cq+16* 9], 1
|
||||
mova xm5, [cq-16* 6]
|
||||
vinserti128 m5, [cq+16*10], 1
|
||||
mova xm12, [cq-16* 5]
|
||||
vinserti128 m12, [cq+16*11], 1
|
||||
mova xm13, [cq-16* 3]
|
||||
vinserti128 m13, [cq+16*13], 1
|
||||
mova xm14, [cq-16* 1]
|
||||
vinserti128 m14, [cq+16*15], 1
|
||||
REPX {IDTX16 x, 6, 7}, 0, 15, 1, 8, 2, 9, 3, \
|
||||
10, 4, 11, 5, 12, 13, 14
|
||||
mova xm6, [cq-16* 4]
|
||||
vinserti128 m6, [cq+16*12], 1
|
||||
mova [rsp], m1
|
||||
IDTX16 6, 1, 7
|
||||
mova xm1, [cq-16* 2]
|
||||
vinserti128 m1, [cq+16*14], 1
|
||||
pmulhrsw m7, m1
|
||||
paddw m1, m1
|
||||
paddw m7, m1
|
||||
vpbroadcastd m1, [o(pw_8192)]
|
||||
REPX {pmulhrsw x, m1}, m0, m2, m3, m4, m5, m6, m7, \
|
||||
m8, m9, m10, m11, m12, m13, m14, m15
|
||||
@ -3401,14 +3438,17 @@ cglobal iidentity_16x16_internal, 0, 5, 16, 32*3, dst, stride, c, eob, tx2
|
||||
jmp m(idct_16x16_internal).pass1_end3
|
||||
ALIGN function_align
|
||||
.pass2:
|
||||
vpbroadcastd m15, [o(pw_5793x4)]
|
||||
REPX {psllw x, 2 }, m0, m1, m2, m3, m4, m5, m6, m7
|
||||
REPX {pmulhrsw x, m15}, m0, m1, m2, m3, m4, m5, m6, m7
|
||||
vpbroadcastd m15, [o(pw_1697x16)]
|
||||
mova [rsp+32*1], m0
|
||||
REPX {IDTX16 x, 0, 15}, 1, 2, 3, 4, 5, 6, 7, \
|
||||
8, 9, 10, 11, 12, 13, 14
|
||||
mova m0, [rsp+32*1]
|
||||
mova [rsp+32*1], m1
|
||||
IDTX16 0, 1, 15
|
||||
mova m1, [rsp+32*0]
|
||||
REPX {psllw x, 2 }, m8, m9, m10, m11, m12, m13, m14, m1
|
||||
REPX {pmulhrsw x, m15}, m8, m9, m10, m11, m12, m13, m14
|
||||
pmulhrsw m15, m1
|
||||
paddw m1, m1
|
||||
paddw m15, m1
|
||||
jmp m(idct_16x16_internal).end
|
||||
|
||||
%define o_base iadst4_dconly2a + 128
|
||||
@ -4606,7 +4646,7 @@ cglobal inv_txfm_add_identity_identity_32x16, 4, 6, 12, dst, stride, c, eob
|
||||
%undef cmp
|
||||
lea rax, [o_base]
|
||||
vpbroadcastd m9, [o(pw_2896x8)]
|
||||
vpbroadcastd m10, [o(pw_5793x4)]
|
||||
vpbroadcastd m10, [o(pw_1697x8)]
|
||||
vpbroadcastd m11, [o(pw_2048)]
|
||||
cmp eobd, 35 ; if (eob > 35)
|
||||
setg r4b ; iteration_count++
|
||||
@ -4634,9 +4674,24 @@ cglobal inv_txfm_add_identity_identity_32x16, 4, 6, 12, dst, stride, c, eob
|
||||
vinserti128 m6, m6, [cq+32*14], 1
|
||||
vinserti128 m7, m7, [cq+32*15], 1
|
||||
REPX {pmulhrsw x, m9 }, m0, m1, m2, m3, m4, m5, m6, m7
|
||||
REPX {psllw x, 3 }, m0, m1, m2, m3, m4, m5, m6, m7
|
||||
REPX {psllw x, 2 }, m0, m1, m2, m3, m4, m5, m6, m7
|
||||
call m(inv_txfm_add_identity_identity_8x32).transpose8x8
|
||||
REPX {pmulhrsw x, m10}, m0, m1, m2, m3, m4, m5, m6, m7
|
||||
pmulhrsw m8, m10, m0
|
||||
paddw m0, m8
|
||||
pmulhrsw m8, m10, m1
|
||||
paddw m1, m8
|
||||
pmulhrsw m8, m10, m2
|
||||
paddw m2, m8
|
||||
pmulhrsw m8, m10, m3
|
||||
paddw m3, m8
|
||||
pmulhrsw m8, m10, m4
|
||||
paddw m4, m8
|
||||
pmulhrsw m8, m10, m5
|
||||
paddw m5, m8
|
||||
pmulhrsw m8, m10, m6
|
||||
paddw m6, m8
|
||||
pmulhrsw m8, m10, m7
|
||||
paddw m7, m8
|
||||
REPX {pmulhrsw x, m11}, m0, m1, m2, m3, m4, m5, m6, m7
|
||||
WRITE_16X2 0, 1, 8, 0, strideq*0, strideq*1
|
||||
WRITE_16X2 2, 3, 0, 1, strideq*2, r3
|
||||
|
438
third_party/dav1d/src/x86/itx_ssse3.asm
vendored
@@ -73,6 +73,8 @@ pw_m2048: times 8 dw -2048
pw_4096: times 8 dw 4096
pw_16384: times 8 dw 16384
pw_m16384: times 8 dw -16384
pw_1697x16: times 8 dw 1697*16
pw_1697x8: times 8 dw 1697*8
pw_2896x8: times 8 dw 2896*8
pw_3344x8: times 8 dw 3344*8
pw_5793x4: times 8 dw 5793*4
@ -273,8 +275,8 @@ ALIGN function_align
|
||||
%ifidn %1_%2, dct_identity
|
||||
mova m0, [o(pw_2896x8)]
|
||||
pmulhrsw m0, [coeffq]
|
||||
paddw m0, m0
|
||||
pmulhrsw m0, [o(pw_5793x4)]
|
||||
pmulhrsw m1, m0, [o(pw_1697x8)]
|
||||
paddw m0, m1
|
||||
punpcklwd m0, m0
|
||||
punpckhdq m1, m0, m0
|
||||
punpckldq m0, m0
|
||||
@ -286,8 +288,8 @@ ALIGN function_align
|
||||
punpckhwd m1, m2
|
||||
punpcklwd m0, m1
|
||||
punpcklqdq m0, m0
|
||||
paddw m0, m0
|
||||
pmulhrsw m0, [o(pw_5793x4)]
|
||||
pmulhrsw m1, m0, [o(pw_1697x8)]
|
||||
paddw m0, m1
|
||||
pmulhrsw m0, [o(pw_2896x8)]
|
||||
mova m1, m0
|
||||
TAIL_CALL m(iadst_4x4_internal).end
|
||||
@ -434,12 +436,11 @@ INV_TXFM_4X4_FN identity, identity
|
||||
cglobal iidentity_4x4_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
|
||||
mova m0, [coeffq+16*0]
|
||||
mova m1, [coeffq+16*1]
|
||||
mova m2, [o(pw_5793x4)]
|
||||
paddw m0, m0
|
||||
paddw m1, m1
|
||||
pmulhrsw m0, m2
|
||||
pmulhrsw m1, m2
|
||||
|
||||
mova m3, [o(pw_1697x8)]
|
||||
pmulhrsw m2, m0, m3
|
||||
pmulhrsw m3, m1
|
||||
paddw m0, m2
|
||||
paddw m1, m3
|
||||
punpckhwd m2, m0, m1
|
||||
punpcklwd m0, m1
|
||||
punpckhwd m1, m0, m2 ;high: in3 ;low :in2
|
||||
@ -447,11 +448,11 @@ cglobal iidentity_4x4_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
|
||||
jmp tx2q
|
||||
|
||||
.pass2:
|
||||
mova m2, [o(pw_5793x4)]
|
||||
paddw m0, m0
|
||||
paddw m1, m1
|
||||
pmulhrsw m0, m2
|
||||
pmulhrsw m1, m2
|
||||
mova m3, [o(pw_1697x8)]
|
||||
pmulhrsw m2, m3, m0
|
||||
pmulhrsw m3, m1
|
||||
paddw m0, m2
|
||||
paddw m1, m3
|
||||
jmp m(iadst_4x4_internal).end
|
||||
|
||||
%macro IWHT4_1D_PACKED 0
|
||||
@ -609,8 +610,8 @@ cglobal inv_txfm_add_wht_wht_4x4, 3, 3, 4, dst, stride, coeff
|
||||
mova m2, [o(pw_2896x8)]
|
||||
punpckldq m0, m1
|
||||
pmulhrsw m0, m2
|
||||
paddw m0, m0
|
||||
pmulhrsw m0, [o(pw_5793x4)]
|
||||
pmulhrsw m1, m0, [o(pw_1697x8)]
|
||||
paddw m0, m1
|
||||
pmulhrsw m0, m2
|
||||
pmulhrsw m0, [o(pw_2048)]
|
||||
punpcklqdq m0, m0
|
||||
@ -828,16 +829,15 @@ cglobal iidentity_4x8_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
|
||||
pmulhrsw m3, [coeffq+16*3]
|
||||
|
||||
.pass1:
|
||||
mova m5, [o(pw_5793x4)]
|
||||
paddw m0, m0
|
||||
paddw m1, m1
|
||||
paddw m2, m2
|
||||
paddw m3, m3
|
||||
pmulhrsw m0, m5
|
||||
pmulhrsw m1, m5
|
||||
pmulhrsw m2, m5
|
||||
pmulhrsw m3, m5
|
||||
|
||||
mova m7, [o(pw_1697x8)]
|
||||
pmulhrsw m4, m7, m0
|
||||
pmulhrsw m5, m7, m1
|
||||
pmulhrsw m6, m7, m2
|
||||
pmulhrsw m7, m3
|
||||
paddw m0, m4
|
||||
paddw m1, m5
|
||||
paddw m2, m6
|
||||
paddw m3, m7
|
||||
jmp m(iadst_4x8_internal).pass1_end
|
||||
|
||||
.pass2:
|
||||
@ -880,8 +880,8 @@ cglobal iidentity_4x8_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
|
||||
mova m0, [o(pw_2896x8)]
|
||||
pmulhrsw m1, m0, [coeffq]
|
||||
pmulhrsw m1, m0
|
||||
paddw m1, m1
|
||||
pmulhrsw m1, [o(pw_5793x4)]
|
||||
pmulhrsw m0, m1, [o(pw_1697x8)]
|
||||
paddw m1, m0
|
||||
pmulhrsw m1, [o(pw_2048)]
|
||||
punpcklwd m1, m1
|
||||
punpckhdq m2, m1, m1
|
||||
@ -1180,15 +1180,15 @@ cglobal iidentity_8x4_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
|
||||
jmp tx2q
|
||||
|
||||
.pass2:
|
||||
mova m4, [o(pw_5793x4)]
|
||||
paddw m0, m0
|
||||
paddw m1, m1
|
||||
paddw m2, m2
|
||||
paddw m3, m3
|
||||
pmulhrsw m0, m4
|
||||
pmulhrsw m1, m4
|
||||
pmulhrsw m2, m4
|
||||
pmulhrsw m3, m4
|
||||
mova m7, [o(pw_1697x8)]
|
||||
pmulhrsw m4, m7, m0
|
||||
pmulhrsw m5, m7, m1
|
||||
pmulhrsw m6, m7, m2
|
||||
pmulhrsw m7, m3
|
||||
paddw m0, m4
|
||||
paddw m1, m5
|
||||
paddw m2, m6
|
||||
paddw m3, m7
|
||||
jmp m(iadst_8x4_internal).end
|
||||
|
||||
%macro INV_TXFM_8X8_FN 2-3 -1 ; type1, type2, fast_thresh
|
||||
@ -1635,14 +1635,16 @@ ALIGN function_align
|
||||
pmulhrsw m0, [coeffq+16*0]
|
||||
pmulhrsw m1, [coeffq+16*1]
|
||||
mova m2, [o(pw_16384)]
|
||||
mova m3, [o(pw_5793x4)]
|
||||
mova m3, [o(pw_1697x16)]
|
||||
mova m4, [o(pw_2048)]
|
||||
pmulhrsw m0, m2
|
||||
pmulhrsw m1, m2
|
||||
psllw m0, 2
|
||||
psllw m1, 2
|
||||
pmulhrsw m0, m3
|
||||
pmulhrsw m1, m3
|
||||
pmulhrsw m2, m3, m0
|
||||
pmulhrsw m3, m1
|
||||
paddw m0, m0
|
||||
paddw m1, m1
|
||||
paddw m0, m2
|
||||
paddw m1, m3
|
||||
pmulhrsw m0, m4
|
||||
pmulhrsw m4, m1
|
||||
punpckhwd m2, m0, m0
|
||||
@ -1664,12 +1666,11 @@ ALIGN function_align
|
||||
punpcklwd m0, [coeffq+32*1]
|
||||
movd m1, [coeffq+32*2]
|
||||
punpcklwd m1, [coeffq+32*3]
|
||||
mova m2, [o(pw_5793x4)]
|
||||
mova m3, [o(pw_16384)]
|
||||
mova m4, [o(pw_2896x8)]
|
||||
punpckldq m0, m1
|
||||
paddw m0, m0
|
||||
pmulhrsw m0, m2
|
||||
pmulhrsw m1, m0, [o(pw_1697x8)]
|
||||
mova m4, [o(pw_2896x8)]
|
||||
paddw m0, m1
|
||||
pmulhrsw m0, m3
|
||||
psrlw m3, 3 ; pw_2048
|
||||
pmulhrsw m0, m4
|
||||
@ -1885,17 +1886,27 @@ INV_TXFM_4X16_FN identity, adst
|
||||
INV_TXFM_4X16_FN identity, flipadst
|
||||
INV_TXFM_4X16_FN identity, identity
|
||||
|
||||
%macro IDTX16 3 ; src/dst, tmp, pw_1697x16
|
||||
pmulhrsw m%2, m%3, m%1
|
||||
paddw m%1, m%1
|
||||
paddw m%1, m%2
|
||||
%endmacro
|
||||
|
||||
cglobal iidentity_4x16_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
|
||||
lea r3, [o(m(iidentity_4x8_internal).pass1)]
|
||||
jmp m(idct_4x16_internal).pass1
|
||||
|
||||
.pass2:
|
||||
mova m7, [o(pw_5793x4)]
|
||||
REPX {psllw x, 2 }, m0, m1, m2, m3, m4, m5, m6
|
||||
REPX {pmulhrsw x, m7}, m0, m1, m2, m3, m4, m5, m6
|
||||
psllw m7, [coeffq+16*7], 2
|
||||
pmulhrsw m7, [o(pw_5793x4)]
|
||||
mova [coeffq+16*7], m7
|
||||
mova m7, [o(pw_1697x16)]
|
||||
mova [coeffq+16*6], m6
|
||||
REPX {IDTX16 x, 6, 7}, 0, 1, 2, 3, 4, 5
|
||||
mova m6, [coeffq+16*7]
|
||||
IDTX16 6, 7, 7
|
||||
mova [coeffq+16*7], m6
|
||||
mova m6, [coeffq+16*6]
|
||||
pmulhrsw m7, m6, [o(pw_1697x16)]
|
||||
paddw m6, m6
|
||||
paddw m6, m7
|
||||
|
||||
mova m7, [o(pw_2048)]
|
||||
REPX {pmulhrsw x, m7}, m0, m1, m2, m3, m4, m5, m6
|
||||
@ -1913,8 +1924,8 @@ cglobal iidentity_4x16_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
|
||||
mova m0, [o(pw_16384)]
|
||||
pmulhrsw m3, m0
|
||||
psrlw m0, 3 ; pw_2048
|
||||
paddw m3, m3
|
||||
pmulhrsw m3, [o(pw_5793x4)]
|
||||
pmulhrsw m1, m3, [o(pw_1697x8)]
|
||||
paddw m3, m1
|
||||
pmulhrsw m3, m0
|
||||
punpcklwd m3, m3
|
||||
pshufd m0, m3, q0000
|
||||
@ -1927,28 +1938,28 @@ cglobal iidentity_4x16_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
|
||||
mov dstq, tx2q
|
||||
TAIL_CALL m(iadst_8x4_internal).end2
|
||||
%elifidn %1_%2, identity_dct
|
||||
mova m4, [o(pw_1697x16)]
|
||||
mova m5, [o(pw_16384)]
|
||||
mova m6, [o(pw_5793x4)]
|
||||
mova m7, [o(pw_2896x8)]
|
||||
mova m6, [o(pw_2896x8)]
|
||||
mov r3d, 2
|
||||
psrlw m7, m5, 3 ; pw_2048
|
||||
.main_loop:
|
||||
mova m0, [coeffq+16*0]
|
||||
mova m1, [coeffq+16*1]
|
||||
mova m2, [coeffq+16*2]
|
||||
mova m3, [coeffq+16*3]
|
||||
punpckhwd m4, m0, m1
|
||||
punpckhwd m2, m0, m1
|
||||
punpcklwd m0, m1
|
||||
punpckhwd m1, m2, m3
|
||||
punpcklwd m2, m3
|
||||
punpcklwd m0, m4
|
||||
punpcklwd m2, m1
|
||||
punpcklqdq m0, m2
|
||||
psllw m0, 2
|
||||
punpcklwd m0, m2
|
||||
mova m1, [coeffq+16*2]
|
||||
mova m2, [coeffq+16*3]
|
||||
punpckhwd m3, m1, m2
|
||||
punpcklwd m1, m2
|
||||
punpcklwd m1, m3
|
||||
punpcklqdq m0, m1
|
||||
pmulhrsw m1, m4, m0
|
||||
pmulhrsw m1, m5
|
||||
paddw m0, m1
|
||||
pmulhrsw m0, m6
|
||||
pmulhrsw m0, m5
|
||||
psrlw m1, m5, 3 ; pw_2048
|
||||
pmulhrsw m0, m7
|
||||
pmulhrsw m0, m1
|
||||
.end:
|
||||
pxor m3, m3
|
||||
mova [coeffq+16*0], m3
|
||||
@ -2412,22 +2423,56 @@ INV_TXFM_16X4_FN identity, flipadst
|
||||
INV_TXFM_16X4_FN identity, identity
|
||||
|
||||
cglobal iidentity_16x4_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
|
||||
LOAD_7ROWS coeffq, 16
|
||||
mova m7, [o(pw_5793x4)]
|
||||
REPX {psllw x, 2 }, m0, m1, m2, m3, m4, m5, m6
|
||||
REPX {pmulhrsw x, m7}, m0, m1, m2, m3, m4, m5, m6
|
||||
mova m1, [coeffq+16*6]
|
||||
mova m0, [coeffq+16*5]
|
||||
mova m2, [coeffq+16*7]
|
||||
mova m6, [o(pw_1697x16)]
|
||||
mova m7, [o(pw_16384)]
|
||||
pmulhrsw m4, m6, m1
|
||||
pmulhrsw m3, m6, m0
|
||||
pmulhrsw m5, m6, m2
|
||||
pmulhrsw m4, m7
|
||||
pmulhrsw m3, m7
|
||||
pmulhrsw m5, m7
|
||||
paddw m1, m4
|
||||
paddw m0, m3
|
||||
paddw m5, m2
|
||||
mova m2, [coeffq+16*2]
|
||||
mova m3, [coeffq+16*3]
|
||||
mova m4, [coeffq+16*4]
|
||||
mova [coeffq+16*6], m1
|
||||
mova [coeffq+16*5], m0
|
||||
mova [coeffq+16*7], m5
|
||||
pmulhrsw m0, m6, m2
|
||||
pmulhrsw m1, m6, m3
|
||||
pmulhrsw m5, m6, m4
|
||||
pmulhrsw m0, m7
|
||||
pmulhrsw m1, m7
|
||||
pmulhrsw m5, m7
|
||||
paddw m2, m0
|
||||
paddw m3, m1
|
||||
paddw m4, m5
|
||||
mova m0, [coeffq+16*0]
|
||||
mova m1, [coeffq+16*1]
|
||||
pmulhrsw m5, m6, m0
|
||||
pmulhrsw m6, m1
|
||||
pmulhrsw m5, m7
|
||||
pmulhrsw m6, m7
|
||||
paddw m0, m5
|
||||
paddw m1, m6
|
||||
mova m6, [coeffq+16*6]
|
||||
mova m5, [coeffq+16*5]
|
||||
punpckhwd m7, m0, m2 ;packed out1, out5
|
||||
punpcklwd m0, m2 ;packed out0, out4
|
||||
punpckhwd m2, m1, m3 ;packed out3, out7
|
||||
punpcklwd m1, m3 ;packed out2, out6
|
||||
mova [coeffq+16*6], m7
|
||||
psllw m7, [coeffq+16*7], 2
|
||||
pmulhrsw m7, [o(pw_5793x4)]
|
||||
mova m7, [coeffq+16*7]
|
||||
punpckhwd m3, m4, m6 ;packed out9, out13
|
||||
punpcklwd m4, m6 ;packed out8, out12
|
||||
punpckhwd m6, m5, m7 ;packed out11, out15
|
||||
punpcklwd m5, m7 ;packed out10, out14
|
||||
jmp m(idct_16x4_internal).pass1_end2
|
||||
jmp m(idct_16x4_internal).pass1_end3
|
||||
|
||||
.pass2:
|
||||
lea tx2q, [o(m(iidentity_8x4_internal).pass2)]
|
||||
@ -2475,8 +2520,9 @@ cglobal iidentity_16x4_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
|
||||
pmulhrsw m7, m0
|
||||
pmulhrsw m7, m1
|
||||
psrlw m1, 3 ; pw_2048
|
||||
psllw m7, 2
|
||||
pmulhrsw m7, [o(pw_5793x4)]
|
||||
pmulhrsw m0, m7, [o(pw_1697x16)]
|
||||
paddw m7, m7
|
||||
paddw m7, m0
|
||||
pmulhrsw m7, m1
|
||||
punpcklwd m0, m7, m7
|
||||
punpckhwd m7, m7
|
||||
@ -2720,16 +2766,21 @@ cglobal iidentity_8x16_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
|
||||
lea tx2q, [o(m(iidentity_8x16_internal).end1)]
|
||||
|
||||
.end:
|
||||
REPX {psllw x, 2 }, m0, m1, m2, m3, m4, m5, m6, m7
|
||||
pmulhrsw m7, [o(pw_5793x4)]
|
||||
pmulhrsw m7, [o(pw_2048)]
|
||||
mova [rsp+gprsize+16*0], m7
|
||||
mova m7, [o(pw_5793x4)]
|
||||
REPX {pmulhrsw x, m7}, m0, m1, m2, m3, m4, m5, m6
|
||||
mova [rsp+gprsize+16*1], m6
|
||||
mova m7, [o(pw_1697x16)]
|
||||
REPX {IDTX16 x, 6, 7}, 0, 1, 2, 3, 4, 5
|
||||
mova m6, [rsp+gprsize+16*1]
|
||||
mova [rsp+gprsize+16*2], m5
|
||||
IDTX16 6, 5, 7
|
||||
mova m5, [rsp+gprsize+16*0]
|
||||
IDTX16 5, 7, 7
|
||||
mova m7, [o(pw_2048)]
|
||||
REPX {pmulhrsw x, m7}, m0, m1, m2, m3, m4, m5, m6
|
||||
pmulhrsw m7, [rsp+gprsize+16*2]
|
||||
mova [rsp+gprsize+16*0], m5
|
||||
mova [rsp+gprsize+16*1], m6
|
||||
mova [rsp+gprsize+16*2], m5
|
||||
mova [rsp+gprsize+16*2], m7
|
||||
jmp m(idct_8x8_internal).end3
|
||||
|
||||
.end1:
|
||||
@ -2787,32 +2838,32 @@ cglobal iidentity_8x16_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
|
||||
mov dstq, tx2q
|
||||
TAIL_CALL m(iadst_8x4_internal).end2
|
||||
%elifidn %1_%2, identity_dct
|
||||
mova m5, [o(pw_16384)]
|
||||
mova m6, [o(pw_5793x4)]
|
||||
mova m7, [o(pw_2896x8)]
|
||||
pxor m4, m4
|
||||
mova m4, [o(pw_2896x8)]
|
||||
mova m5, [o(pw_1697x16)]
|
||||
mova m6, [o(pw_16384)]
|
||||
psrlw m7, m6, 3 ; pw_2048
|
||||
mov r3d, 2
|
||||
.main_loop:
|
||||
mova m0, [coeffq+16*0]
|
||||
punpcklwd m0, [coeffq+16*1]
|
||||
mova m1, [coeffq+16*2]
|
||||
punpcklwd m1, [coeffq+16*3]
|
||||
mova m2, [coeffq+16*4]
|
||||
punpcklwd m2, [coeffq+16*5]
|
||||
mova m3, [coeffq+16*6]
|
||||
punpcklwd m3, [coeffq+16*7]
|
||||
punpckldq m0, m1
|
||||
punpckldq m2, m3
|
||||
punpcklqdq m0, m2
|
||||
mova m1, [coeffq+16*4]
|
||||
punpcklwd m1, [coeffq+16*5]
|
||||
mova m2, [coeffq+16*6]
|
||||
punpcklwd m2, [coeffq+16*7]
|
||||
punpckldq m1, m2
|
||||
punpcklqdq m0, m1
|
||||
pmulhrsw m0, m4
|
||||
pmulhrsw m1, m5, m0
|
||||
pmulhrsw m1, m6
|
||||
paddw m0, m1
|
||||
pmulhrsw m0, m4
|
||||
pmulhrsw m0, m7
|
||||
psllw m0, 2
|
||||
pmulhrsw m0, m6
|
||||
pmulhrsw m0, m5
|
||||
psrlw m1, m5, 3 ; pw_2048
|
||||
pmulhrsw m0, m7
|
||||
pmulhrsw m0, m1
|
||||
.end:
|
||||
REPX {mova [coeffq+16*x], m4}, 0, 1, 2, 3, 4, 5, 6, 7
|
||||
pxor m1, m1
|
||||
REPX {mova [coeffq+16*x], m1}, 0, 1, 2, 3, 4, 5, 6, 7
|
||||
add coeffq, 16*8
|
||||
lea tx2q, [dstq+8]
|
||||
WRITE_8X4 0, 0, 0, 0, 1, 2, 3
|
||||
@ -3292,40 +3343,66 @@ INV_TXFM_16X8_FN identity, flipadst
|
||||
INV_TXFM_16X8_FN identity, identity
|
||||
|
||||
cglobal iidentity_16x8_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
|
||||
LOAD_8ROWS coeffq+16*8, 16, 1
|
||||
|
||||
add coeffq, 16*16
|
||||
mova m4, [coeffq-16*7]
|
||||
mova m5, [coeffq-16*5]
|
||||
mova m6, [coeffq-16*3]
|
||||
mova m7, [coeffq-16*1]
|
||||
mov r3, tx2q
|
||||
lea tx2q, [o(m(iidentity_16x8_internal).pass1_end)]
|
||||
|
||||
.pass1:
|
||||
REPX {psllw x, 2 }, m0, m1, m2, m3, m4, m5, m6, m7
|
||||
pmulhrsw m7, [o(pw_5793x4)]
|
||||
mova [rsp+gprsize+16*0], m7
|
||||
|
||||
mova m7, [o(pw_5793x4)]
|
||||
REPX {pmulhrsw x, m7}, m0, m1, m2, m3, m4, m5, m6
|
||||
|
||||
jmp m(idct_8x8_internal).pass1_end
|
||||
mova m0, [o(pw_2896x8)]
|
||||
mova m2, [o(pw_1697x16)]
|
||||
mova m3, [o(pw_16384)]
|
||||
sub coeffq, 8*16
|
||||
REPX {pmulhrsw x, m0}, m4, m5, m6, m7
|
||||
pmulhrsw m1, m2, m4
|
||||
pmulhrsw m1, m3
|
||||
paddw m1, m4 ; 1
|
||||
pmulhrsw m4, m2, m5
|
||||
pmulhrsw m4, m3
|
||||
paddw m4, m5 ; 3
|
||||
pmulhrsw m5, m2, m6
|
||||
pmulhrsw m5, m3
|
||||
paddw m5, m6 ; 5
|
||||
pmulhrsw m6, m2, m7
|
||||
pmulhrsw m6, m3
|
||||
paddw m7, m6 ; 7
|
||||
pmulhrsw m6, m0, [coeffq+16*6]
|
||||
mova [rsp+gprsize+16*0], m4
|
||||
pmulhrsw m4, m2, m6
|
||||
pmulhrsw m4, m3
|
||||
paddw m6, m4 ; 6
|
||||
pmulhrsw m4, m0, [coeffq+16*4]
|
||||
mova [rsp+gprsize+16*1], m6
|
||||
pmulhrsw m6, m2, m4
|
||||
pmulhrsw m6, m3
|
||||
paddw m4, m6 ; 4
|
||||
pmulhrsw m6, m0, [coeffq+16*2]
|
||||
pmulhrsw m0, [coeffq+16*0]
|
||||
pmulhrsw m2, m6
|
||||
pmulhrsw m2, m3
|
||||
paddw m2, m6 ; 2
|
||||
pmulhrsw m6, m0, [o(pw_1697x16)]
|
||||
pmulhrsw m6, m3
|
||||
mova m3, [rsp+gprsize+16*0]
|
||||
paddw m0, m6
|
||||
jmp m(idct_8x8_internal).pass1_end3
|
||||
|
||||
.pass1_end:
|
||||
mova [coeffq+16*9 ], m4
|
||||
mova [coeffq+16*11], m5
|
||||
mova [coeffq+16*13], m6
|
||||
mova [coeffq+16*15], m7
|
||||
mova m4, [o(pw_2896x8)]
|
||||
pmulhrsw m5, m4, [coeffq+16*5]
|
||||
pmulhrsw m6, m4, [coeffq+16*6]
|
||||
pmulhrsw m7, m4, [coeffq+16*7]
|
||||
mova [coeffq+16*5 ], m2
|
||||
mova [coeffq+16*7 ], m3
|
||||
pmulhrsw m2, m4, [coeffq+16*2]
|
||||
pmulhrsw m3, m4, [coeffq+16*3]
|
||||
mova [coeffq+16*3 ], m1
|
||||
pmulhrsw m1, m4, [coeffq+16*1]
|
||||
mova [coeffq+16*1 ], m0
|
||||
pmulhrsw m0, m4, [coeffq+16*0]
|
||||
pmulhrsw m4, [coeffq+16*4]
|
||||
|
||||
mova [coeffq+16*1], m4
|
||||
mova [coeffq+16*3], m5
|
||||
mova [coeffq+16*5], m6
|
||||
mova [coeffq+16*7], m7
|
||||
mova m4, [coeffq-16*7]
|
||||
mova m5, [coeffq-16*5]
|
||||
mova m6, [coeffq-16*3]
|
||||
mova m7, [coeffq-16*1]
|
||||
mova [coeffq-16*7], m0
|
||||
mova [coeffq-16*5], m1
|
||||
mova [coeffq-16*3], m2
|
||||
mova [coeffq-16*1], m3
|
||||
mov tx2q, r3
|
||||
jmp .pass1
|
||||
|
||||
@ -3399,7 +3476,7 @@ cglobal iidentity_16x8_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
|
||||
jg .loop
|
||||
RET
|
||||
%elifidn %1_%2, identity_dct
|
||||
mova m4, [o(pw_5793x4)]
|
||||
mova m4, [o(pw_1697x16)]
|
||||
mova m5, [o(pw_8192)]
|
||||
mova m6, [o(pw_2896x8)]
|
||||
psrlw m7, m5, 2 ;pw_2048
|
||||
@ -3410,23 +3487,24 @@ cglobal iidentity_16x8_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
|
||||
.main:
|
||||
movd m0, [coeffq+32*0]
|
||||
punpcklwd m0, [coeffq+32*1]
|
||||
movd m2, [coeffq+32*2]
|
||||
punpcklwd m2, [coeffq+32*3]
|
||||
movd m1, [coeffq+32*2]
|
||||
punpcklwd m1, [coeffq+32*3]
|
||||
add coeffq, 32*4
|
||||
punpckldq m0, m1
|
||||
movd m1, [coeffq+32*0]
|
||||
punpcklwd m1, [coeffq+32*1]
|
||||
movd m3, [coeffq+32*2]
|
||||
punpcklwd m3, [coeffq+32*3]
|
||||
movd m2, [coeffq+32*2]
|
||||
punpcklwd m2, [coeffq+32*3]
|
||||
xor eobd, eobd
|
||||
mov [coeffq-32*4], eobd
|
||||
mov [coeffq-32*3], eobd
|
||||
mov [coeffq-32*2], eobd
|
||||
mov [coeffq-32*1], eobd
|
||||
punpckldq m0, m2
|
||||
punpckldq m1, m3
|
||||
punpckldq m1, m2
|
||||
punpcklqdq m0, m1
|
||||
psllw m0, 2
|
||||
pmulhrsw m0, m4
|
||||
pmulhrsw m1, m4, m0
|
||||
paddw m0, m0
|
||||
paddw m0, m1
|
||||
pmulhrsw m0, m5
|
||||
pmulhrsw m0, m6
|
||||
pmulhrsw m0, m7
|
||||
@ -3740,36 +3818,42 @@ INV_TXFM_16X16_FN identity, dct, 15
|
||||
INV_TXFM_16X16_FN identity, identity
|
||||
|
||||
cglobal iidentity_16x16_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
|
||||
LOAD_8ROWS coeffq+16*17, 32
|
||||
add coeffq, 16*17
|
||||
mov r3, tx2q
|
||||
lea tx2q, [o(m(iidentity_16x16_internal).pass1_end)]
|
||||
|
||||
.pass1:
|
||||
REPX {psllw x, 2 }, m0, m1, m2, m3, m4, m5, m6, m7
|
||||
pmulhrsw m7, [o(pw_5793x4)]
|
||||
mova [rsp+gprsize+16*0], m7
|
||||
|
||||
mova m7, [o(pw_5793x4)]
|
||||
REPX {pmulhrsw x, m7}, m0, m1, m2, m3, m4, m5, m6
|
||||
|
||||
mova m7, [o(pw_1697x16)]
|
||||
mova m6, [coeffq+32*7]
|
||||
mova m0, [coeffq+32*0]
|
||||
mova m1, [coeffq+32*1]
|
||||
mova m2, [coeffq+32*2]
|
||||
mova m3, [coeffq+32*3]
|
||||
mova m4, [coeffq+32*4]
|
||||
REPX {IDTX16 x, 5, 7}, 6, 0, 1, 2, 3, 4
|
||||
mova m5, [coeffq+32*5]
|
||||
mova [rsp+gprsize+16*0], m6
|
||||
IDTX16 5, 6, 7
|
||||
mova m6, [coeffq+32*6]
|
||||
IDTX16 6, 7, 7
|
||||
mova m7, [o(pw_8192)]
|
||||
jmp m(idct_8x8_internal).pass1_end1
|
||||
|
||||
.pass1_end:
|
||||
SAVE_8ROWS coeffq+16*17, 32
|
||||
LOAD_8ROWS coeffq+16* 1, 32
|
||||
SAVE_8ROWS coeffq, 32
|
||||
sub coeffq, 16
|
||||
lea tx2q, [o(m(iidentity_16x16_internal).pass1_end1)]
|
||||
jmp .pass1
|
||||
|
||||
.pass1_end1:
|
||||
SAVE_8ROWS coeffq+16* 1, 32
|
||||
LOAD_8ROWS coeffq+16*16, 32
|
||||
SAVE_8ROWS coeffq, 32
|
||||
sub coeffq, 15*16
|
||||
lea tx2q, [o(m(iidentity_16x16_internal).pass1_end2)]
|
||||
jmp .pass1
|
||||
|
||||
.pass1_end2:
|
||||
SAVE_8ROWS coeffq+16*16, 32
|
||||
LOAD_8ROWS coeffq+16* 0, 32
|
||||
SAVE_8ROWS coeffq, 32
|
||||
sub coeffq, 16
|
||||
mov tx2q, r3
|
||||
jmp .pass1
|
||||
|
||||
@ -3778,16 +3862,22 @@ cglobal iidentity_16x16_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
|
||||
lea tx2q, [o(m(iidentity_16x16_internal).end1)]
|
||||
|
||||
.end:
|
||||
REPX {psllw x, 2 }, m0, m1, m2, m3, m4, m5, m6, m7
|
||||
pmulhrsw m7, [o(pw_5793x4)]
|
||||
pmulhrsw m7, [o(pw_2048)]
|
||||
mova [rsp+gprsize+16*0], m7
|
||||
mova m7, [o(pw_5793x4)]
|
||||
REPX {pmulhrsw x, m7}, m0, m1, m2, m3, m4, m5, m6
|
||||
mova m7, [o(pw_2048)]
|
||||
REPX {pmulhrsw x, m7}, m0, m1, m2, m3, m4, m5, m6
|
||||
mova [rsp+gprsize+16*1], m6
|
||||
mova [rsp+gprsize+16*1], m4
|
||||
mova m7, [o(pw_1697x16)]
|
||||
REPX {IDTX16 x, 4, 7}, 5, 6, 0, 1, 2, 3
|
||||
mova m4, [o(pw_2048)]
|
||||
pmulhrsw m5, m4
|
||||
pmulhrsw m6, m4
|
||||
mova [rsp+gprsize+16*2], m5
|
||||
mova m5, [rsp+gprsize+16*1]
|
||||
mova [rsp+gprsize+16*1], m6
|
||||
IDTX16 5, 6, 7
|
||||
mova m6, [rsp+gprsize+16*0]
|
||||
IDTX16 6, 7, 7
|
||||
REPX {pmulhrsw x, m4}, m0, m1, m2, m3, m6
|
||||
pmulhrsw m4, m5
|
||||
mova [rsp+gprsize+16*0], m6
|
||||
jmp m(idct_8x8_internal).end3
|
||||
|
||||
.end1:
|
||||
@ -4991,15 +5081,33 @@ cglobal inv_txfm_add_identity_identity_32x16, 4, 6, 8, 16*4, dst, stride, coeff,
|
||||
|
||||
.loop:
|
||||
LOAD_8ROWS coeffq, 32, 1
|
||||
REPX {psllw x, 3}, m0, m1, m2, m3, m4, m5, m6, m7
|
||||
REPX {psllw x, 2}, m0, m1, m2, m3, m4, m5, m6, m7
|
||||
mova [rsp+16*1], m6
|
||||
lea tx2q, [o(m(idct_32x16_internal).end)]
|
||||
call m(idct_8x8_internal).pass1_end3
|
||||
pmulhrsw m7, [o(pw_5793x4)]
|
||||
mova [rsp+16*2], m5
|
||||
mova [rsp+16*1], m6
|
||||
mova m5, [o(pw_1697x8)]
|
||||
pmulhrsw m6, m5, m7
|
||||
paddw m7, m6
|
||||
pmulhrsw m6, m5, m0
|
||||
paddw m0, m6
|
||||
pmulhrsw m6, m5, m1
|
||||
paddw m1, m6
|
||||
pmulhrsw m6, m5, m2
|
||||
paddw m2, m6
|
||||
pmulhrsw m6, m5, m3
|
||||
paddw m3, m6
|
||||
pmulhrsw m6, m5, m4
|
||||
pmulhrsw m7, [o(pw_2048)]
|
||||
paddw m4, m6
|
||||
mova m6, [rsp+16*1]
|
||||
mova [rsp+16*0], m7
|
||||
mova m7, [o(pw_5793x4)]
|
||||
REPX {pmulhrsw x, m7}, m0, m1, m2, m3, m4, m5, m6
|
||||
pmulhrsw m7, m5, m6
|
||||
paddw m6, m7
|
||||
mova m7, [rsp+16*2]
|
||||
pmulhrsw m5, m7
|
||||
paddw m5, m7
|
||||
mova m7, [o(pw_2048)]
|
||||
REPX {pmulhrsw x, m7}, m0, m1, m2, m3, m4, m5, m6
|
||||
mova [rsp+16*2], m5
|
||||
@ -5008,7 +5116,7 @@ cglobal inv_txfm_add_identity_identity_32x16, 4, 6, 8, 16*4, dst, stride, coeff,
|
||||
lea dstq, [dstq+strideq*2]
|
||||
|
||||
pxor m7, m7
|
||||
REPX {mova [coeffq+32*x], m7}, 0, 1, 2, 3, 4, 5, 6, 7
|
||||
REPX {mova [coeffq+32*x], m7}, 0, 1, 2, 3, 4, 5, 6, 7
|
||||
|
||||
.loop_end:
|
||||
add coeffq, 16
|
||||
|
@@ -204,6 +204,7 @@ WIENER_FILTER(ext) \
SGR_FILTER(ext)

#if BITDEPTH == 8
WIENER_FILTER(sse2)
DEF_LR_FILTERS(ssse3)
# if ARCH_X86_64
DEF_LR_FILTERS(avx2)
@@ -213,6 +214,11 @@ DEF_LR_FILTERS(avx2)
COLD void bitfn(dav1d_loop_restoration_dsp_init_x86)(Dav1dLoopRestorationDSPContext *const c) {
const unsigned flags = dav1d_get_cpu_flags();

if (!(flags & DAV1D_X86_CPU_FLAG_SSE2)) return;
#if BITDEPTH == 8
c->wiener = wiener_filter_sse2;
#endif

if (!(flags & DAV1D_X86_CPU_FLAG_SSSE3)) return;
#if BITDEPTH == 8
c->wiener = wiener_filter_ssse3;
|
264
third_party/dav1d/src/x86/looprestoration_ssse3.asm
vendored
@ -43,8 +43,6 @@ pb_15: times 16 db 15
|
||||
pb_0_1: times 8 db 0, 1
|
||||
pb_6_7: times 8 db 6, 7
|
||||
pb_14_15: times 8 db 14, 15
|
||||
pb_0_1_2_3: times 4 db 0, 1, 2, 3
|
||||
pb_4_5_6_7: times 4 db 4, 5, 6, 7
|
||||
pw_1: times 8 dw 1
|
||||
pw_16: times 8 dw 16
|
||||
pw_128: times 8 dw 128
|
||||
@ -97,58 +95,101 @@ SECTION .text
|
||||
%define PIC_sym(sym) (sym)
|
||||
%endif
|
||||
|
||||
%macro PALIGNR 4 ; dst, src1, src2, shift
|
||||
%if cpuflag(ssse3)
|
||||
palignr %1, %2, %3, %4
|
||||
%else
|
||||
%assign %%i regnumof%+%1 + 1
|
||||
%define %%tmp m %+ %%i
|
||||
psrldq %1, %3, %4
|
||||
pslldq %%tmp, %2, 16-%4
|
||||
por %1, %%tmp
|
||||
%endif
|
||||
%endmacro
|
||||
|
||||
%macro PMADDUBSW 5 ; dst, src, zero, tmp, reset_zero
|
||||
%if cpuflag(ssse3)
|
||||
pmaddubsw %1, %2
|
||||
%else
|
||||
%if %5 == 1
|
||||
pxor %3, %3
|
||||
%endif
|
||||
punpckhbw %4, %1, %3
|
||||
punpcklbw %1, %3
|
||||
pmaddwd %4, %2
|
||||
pmaddwd %1, %2
|
||||
packssdw %1, %4
|
||||
%endif
|
||||
%endmacro
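For clarity, a scalar C sketch of the pmaddubsw behaviour this macro emulates on SSE2 (unsigned bytes from the first operand times signed bytes from the second, adjacent pairs summed with signed saturation) follows; the function names are illustrative only.

#include <stdint.h>

static int16_t sat16(int32_t v) {
    return (int16_t)(v > 32767 ? 32767 : v < -32768 ? -32768 : v);
}

/* One output word per byte pair, as pmaddubsw produces; the SSE2 path above gets
   the same result by zero-extending the byte input and using pmaddwd against
   word-sized coefficients, then packing with signed saturation. */
static void pmaddubsw_sketch(int16_t dst[8], const uint8_t a[16], const int8_t b[16]) {
    for (int i = 0; i < 8; i++)
        dst[i] = sat16((int32_t)a[2 * i] * b[2 * i] + (int32_t)a[2 * i + 1] * b[2 * i + 1]);
}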
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; wiener ;;
|
||||
;;;;;;;;;;;;;;;;;;;;;;
|
||||
|
||||
INIT_XMM ssse3
|
||||
%macro WIENER_H 0
|
||||
%if ARCH_X86_64
|
||||
cglobal wiener_filter_h, 5, 15, 16, dst, left, src, stride, fh, w, h, edge
|
||||
mov edged, edgem
|
||||
movifnidn wd, wm
|
||||
mov hd, hm
|
||||
%else
|
||||
cglobal wiener_filter_h, 5, 7, 8, -84, dst, left, src, stride, fh, w, h, edge
|
||||
mov r5, edgem
|
||||
mov [esp+12], r5
|
||||
mov wd, wm
|
||||
mov hd, hm
|
||||
SETUP_PIC hd
|
||||
%define m15 m0
|
||||
%define m14 m1
|
||||
%define m13 m2
|
||||
%define m12 m3
|
||||
%endif
|
||||
|
||||
movq m15, [fhq]
|
||||
pshufb m12, m15, [pb_6_7]
|
||||
pshufb m13, m15, [pb_4]
|
||||
pshufb m14, m15, [pb_2]
|
||||
pshufb m15, m15, [pb_0]
|
||||
%if cpuflag(ssse3)
|
||||
pshufb m12, m15, [PIC_sym(pb_6_7)]
|
||||
pshufb m13, m15, [PIC_sym(pb_4)]
|
||||
pshufb m14, m15, [PIC_sym(pb_2)]
|
||||
pshufb m15, m15, [PIC_sym(pb_0)]
|
||||
%else
|
||||
pshuflw m12, m15, q3333
|
||||
punpcklbw m15, m15
|
||||
pshufhw m13, m15, q0000
|
||||
pshuflw m14, m15, q2222
|
||||
pshuflw m15, m15, q0000
|
||||
punpcklqdq m12, m12
|
||||
punpckhqdq m13, m13
|
||||
punpcklqdq m14, m14
|
||||
punpcklqdq m15, m15
|
||||
psraw m13, 8
|
||||
psraw m14, 8
|
||||
psraw m15, 8
|
||||
%endif
|
||||
|
||||
%if ARCH_X86_64
|
||||
mova m11, [pw_2048]
|
||||
mova m10, [pw_16380]
|
||||
lea r11, [pb_right_ext_mask]
|
||||
|
||||
DEFINE_ARGS dst, left, src, stride, x, w, h, edge, srcptr, dstptr, xlim
|
||||
%else
|
||||
cglobal wiener_filter_h, 5, 7, 8, -100, dst, left, src, stride, fh, w, h, edge
|
||||
mov wd, edgem
|
||||
mov [esp+12], wd
|
||||
mov wd, wm
|
||||
mov hd, hm
|
||||
SETUP_PIC hd
|
||||
movq m0, [fhq]
|
||||
pshufb m3, m0, [PIC_sym(pb_6_7)]
|
||||
pshufb m2, m0, [PIC_sym(pb_4)]
|
||||
pshufb m1, m0, [PIC_sym(pb_2)]
|
||||
pshufb m0, m0, [PIC_sym(pb_0)]
|
||||
|
||||
DEFINE_ARGS dst, left, src, stride, x, w, h, edge
|
||||
|
||||
%define srcptrq srcq
|
||||
%define dstptrq dstq
|
||||
%define hd dword [esp]
|
||||
%define edged dword [esp+12]
|
||||
%define xlimd dword [esp+16]
|
||||
|
||||
%define m10 [PIC_sym(pw_16380)]
|
||||
%define m11 [PIC_sym(pw_2048)]
|
||||
%define m12 [esp+0x14]
|
||||
%define m13 [esp+0x24]
|
||||
%define m14 [esp+0x34]
|
||||
%define m15 [esp+0x44]
|
||||
|
||||
mova m15, m0
|
||||
mova m14, m1
|
||||
mova m13, m2
|
||||
mova m12, m3
|
||||
mova m13, m2
|
||||
mova m14, m1
|
||||
mova m15, m0
|
||||
|
||||
DEFINE_ARGS dst, left, src, stride, x, w, h, edge
|
||||
%define srcptrq srcq
|
||||
%define dstptrq dstq
|
||||
%define hd dword [esp+ 0]
|
||||
%define edged dword [esp+12]
|
||||
%define xlimd dword [esp+16]
|
||||
%endif
|
||||
|
||||
; if (edge & has_right) align_w_to_16
|
||||
@ -196,7 +237,16 @@ cglobal wiener_filter_h, 5, 7, 8, -100, dst, left, src, stride, fh, w, h, edge
|
||||
jmp .left_load_done
|
||||
.emu_left:
|
||||
movd m0, [srcq]
|
||||
%if cpuflag(ssse3)
|
||||
pshufb m0, [PIC_sym(pb_14x0_1_2)]
|
||||
%else
|
||||
pslldq m1, m0, 13
|
||||
punpcklbw m0, m0
|
||||
pshuflw m0, m0, q0000
|
||||
punpcklqdq m0, m0
|
||||
psrldq m0, 2
|
||||
por m0, m1
|
||||
%endif
|
||||
|
||||
; load right edge pixels
|
||||
.left_load_done:
|
||||
@ -208,19 +258,39 @@ cglobal wiener_filter_h, 5, 7, 8, -100, dst, left, src, stride, fh, w, h, edge
|
||||
|
||||
; for very small images (w=[1-2]), edge-extend the original cache,
|
||||
; ugly, but only runs in very odd cases
|
||||
%if cpuflag(ssse3)
|
||||
add wd, wd
|
||||
%if ARCH_X86_64
|
||||
%if ARCH_X86_64
|
||||
pshufb m0, [r11-pb_right_ext_mask+pb_0_to_15_min_n+wq*8-16]
|
||||
%else
|
||||
%else
|
||||
pshufb m0, [PIC_sym(pb_0_to_15_min_n)+wq*8-16]
|
||||
%endif
|
||||
%endif
|
||||
shr wd, 1
|
||||
%else
|
||||
shl wd, 4
|
||||
pcmpeqd m2, m2
|
||||
movd m3, wd
|
||||
psrldq m2, 2
|
||||
punpckhbw m1, m0, m0
|
||||
pshufhw m1, m1, q1122
|
||||
psllq m1, m3
|
||||
pand m0, m2
|
||||
pandn m2, m1
|
||||
por m0, m2
|
||||
shr wd, 4
|
||||
%endif
|
||||
|
||||
; main x loop, mostly this starts in .main_load
|
||||
.splat_right:
|
||||
; no need to load new pixels, just extend them from the (possibly previously
|
||||
; extended) previous load into m0
|
||||
%if cpuflag(ssse3)
|
||||
pshufb m1, m0, [PIC_sym(pb_15)]
|
||||
%else
|
||||
punpckhbw m1, m0, m0
|
||||
pshufhw m1, m1, q3333
|
||||
punpckhqdq m1, m1
|
||||
%endif
|
||||
jmp .main_loop
|
||||
.load_and_splat:
|
||||
; load new pixels and extend edge for right-most
|
||||
@ -235,7 +305,13 @@ cglobal wiener_filter_h, 5, 7, 8, -100, dst, left, src, stride, fh, w, h, edge
|
||||
add PIC_reg, xd
|
||||
%endif
|
||||
movd m3, [srcptrq+2+xq]
|
||||
%if cpuflag(ssse3)
|
||||
pshufb m3, [PIC_sym(pb_0)]
|
||||
%else
|
||||
punpcklbw m3, m3
|
||||
pshuflw m3, m3, q0000
|
||||
punpcklqdq m3, m3
|
||||
%endif
|
||||
pand m1, m2
|
||||
pxor m2, [PIC_sym(pb_right_ext_mask)]
|
||||
pand m3, m2
|
||||
@ -246,58 +322,98 @@ cglobal wiener_filter_h, 5, 7, 8, -100, dst, left, src, stride, fh, w, h, edge
|
||||
; load subsequent line
|
||||
movu m1, [srcptrq+3]
|
||||
.main_loop:
|
||||
palignr m2, m1, m0, 10
|
||||
palignr m3, m1, m0, 11
|
||||
palignr m4, m1, m0, 12
|
||||
palignr m5, m1, m0, 13
|
||||
palignr m6, m1, m0, 14
|
||||
palignr m7, m1, m0, 15
|
||||
%if ARCH_X86_64
|
||||
PALIGNR m2, m1, m0, 10
|
||||
PALIGNR m3, m1, m0, 11
|
||||
PALIGNR m4, m1, m0, 12
|
||||
PALIGNR m5, m1, m0, 13
|
||||
PALIGNR m6, m1, m0, 14
|
||||
PALIGNR m7, m1, m0, 15
|
||||
|
||||
%if ARCH_X86_32
|
||||
mova [esp+0x54], m1
|
||||
%define m8 m1
|
||||
%endif
|
||||
punpcklbw m0, m2, m1
|
||||
punpckhbw m2, m1
|
||||
punpcklbw m8, m3, m7
|
||||
punpckhbw m3, m7
|
||||
punpcklbw m7, m4, m6
|
||||
punpckhbw m4, m6
|
||||
pmaddubsw m0, m15
|
||||
pmaddubsw m2, m15
|
||||
pmaddubsw m8, m14
|
||||
pmaddubsw m3, m14
|
||||
pmaddubsw m7, m13
|
||||
pmaddubsw m4, m13
|
||||
PMADDUBSW m0, m15, m6, m9, 1
|
||||
PMADDUBSW m2, m15, m6, m9, 0
|
||||
PMADDUBSW m8, m14, m6, m9, 0
|
||||
PMADDUBSW m3, m14, m6, m9, 0
|
||||
PMADDUBSW m7, m13, m6, m9, 0
|
||||
PMADDUBSW m4, m13, m6, m9, 0
|
||||
paddw m0, m8
|
||||
paddw m2, m3
|
||||
pxor m3, m3
|
||||
punpcklbw m6, m5, m3
|
||||
punpckhbw m5, m3
|
||||
psllw m8, m6, 7
|
||||
psllw m3, m5, 7
|
||||
%if cpuflag(ssse3)
|
||||
pxor m6, m6
|
||||
%endif
|
||||
punpcklbw m3, m5, m6
|
||||
punpckhbw m5, m6
|
||||
psllw m8, m3, 7
|
||||
psllw m6, m5, 7
|
||||
psubw m8, m10
|
||||
psubw m3, m10
|
||||
pmullw m6, m12
|
||||
psubw m6, m10
|
||||
pmullw m3, m12
|
||||
pmullw m5, m12
|
||||
paddw m0, m7
|
||||
paddw m2, m4
|
||||
paddw m0, m6
|
||||
paddw m0, m3
|
||||
paddw m2, m5
|
||||
paddsw m0, m8
|
||||
paddsw m2, m3
|
||||
paddsw m2, m6
|
||||
psraw m0, 3
|
||||
psraw m2, 3
|
||||
paddw m0, m11
|
||||
paddw m2, m11
|
||||
mova [dstptrq+ 0], m0
|
||||
mova [dstptrq+16], m2
|
||||
|
||||
%if ARCH_X86_64
|
||||
mova m0, m1
|
||||
%else
|
||||
mova m0, [esp+0x54]
|
||||
PALIGNR m2, m1, m0, 10
|
||||
punpcklbw m3, m2, m1
|
||||
punpckhbw m2, m1
|
||||
PMADDUBSW m3, m15, m4, m5, 1
|
||||
PMADDUBSW m2, m15, m4, m5, 0
|
||||
PALIGNR m4, m1, m0, 11
|
||||
PALIGNR m5, m1, m0, 15
|
||||
punpcklbw m6, m4, m5
|
||||
punpckhbw m4, m5
|
||||
PMADDUBSW m6, m14, m5, m7, 1
|
||||
PMADDUBSW m4, m14, m5, m7, 0
|
||||
paddw m3, m6
|
||||
paddw m2, m4
|
||||
PALIGNR m4, m1, m0, 12
|
||||
PALIGNR m5, m1, m0, 14
|
||||
punpcklbw m6, m4, m5
|
||||
punpckhbw m4, m5
|
||||
PMADDUBSW m6, m13, m5, m7, 1
|
||||
PMADDUBSW m4, m13, m5, m7, 0
|
||||
paddw m3, m6
|
||||
paddw m2, m4
|
||||
PALIGNR m6, m1, m0, 13
|
||||
%if cpuflag(ssse3)
|
||||
pxor m5, m5
|
||||
%endif
|
||||
punpcklbw m4, m6, m5
|
||||
punpckhbw m6, m5
|
||||
psllw m5, m4, 7
|
||||
psllw m7, m6, 7
|
||||
psubw m5, m10
|
||||
psubw m7, m10
|
||||
pmullw m4, m12
|
||||
pmullw m6, m12
|
||||
paddw m3, m4
|
||||
paddw m2, m6
|
||||
paddsw m3, m5
|
||||
paddsw m2, m7
|
||||
psraw m3, 3
|
||||
psraw m2, 3
|
||||
paddw m3, m11
|
||||
paddw m2, m11
|
||||
mova [dstptrq+ 0], m3
|
||||
mova [dstptrq+16], m2
|
||||
%endif
|
||||
|
||||
mova m0, m1
|
||||
add srcptrq, 16
|
||||
add dstptrq, 32
|
||||
sub xd, 16
|
||||
@ -317,18 +433,19 @@ cglobal wiener_filter_h, 5, 7, 8, -100, dst, left, src, stride, fh, w, h, edge
|
||||
dec hd
|
||||
jg .loop
|
||||
RET
|
||||
%endmacro
|
||||
|
||||
%macro WIENER_V 0
|
||||
%if ARCH_X86_64
|
||||
cglobal wiener_filter_v, 4, 10, 16, dst, stride, mid, w, h, fv, edge
|
||||
mov edged, edgem
|
||||
movifnidn fvq, fvmp
|
||||
movifnidn hd, hm
|
||||
movq m15, [fvq]
|
||||
pshufb m14, m15, [pb_4_5_6_7]
|
||||
pshufb m15, m15, [pb_0_1_2_3]
|
||||
pshufd m14, m15, q1111
|
||||
pshufd m15, m15, q0000
|
||||
paddw m14, [pw_0_128]
|
||||
movd m12, [pd_1024]
|
||||
pshufd m12, m12, 0
|
||||
mova m12, [pd_1024]
|
||||
|
||||
DEFINE_ARGS dst, stride, mid, w, h, y, edge, ylim, mptr, dstptr
|
||||
|
||||
@ -351,8 +468,8 @@ cglobal wiener_filter_v, 5, 7, 8, -96, dst, stride, mid, w, h, fv, edge
|
||||
SETUP_PIC edged
|
||||
|
||||
movq m0, [fvq]
|
||||
pshufb m1, m0, [PIC_sym(pb_4_5_6_7)]
|
||||
pshufb m0, m0, [PIC_sym(pb_0_1_2_3)]
|
||||
pshufd m1, m0, q1111
|
||||
pshufd m0, m0, q0000
|
||||
paddw m1, [PIC_sym(pw_0_128)]
|
||||
mova [esp+0x50], m0
|
||||
mova [esp+0x40], m1
|
||||
@ -504,6 +621,15 @@ cglobal wiener_filter_v, 5, 7, 8, -96, dst, stride, mid, w, h, fv, edge
|
||||
sub wd, 8
|
||||
jg .loop_x
|
||||
RET
|
||||
%endmacro
|
||||
|
||||
INIT_XMM sse2
|
||||
WIENER_H
|
||||
WIENER_V
|
||||
|
||||
INIT_XMM ssse3
|
||||
WIENER_H
|
||||
WIENER_V
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; self-guided ;;
|
||||
|
9
third_party/dav1d/src/x86/mc_init_tmpl.c
vendored
@@ -90,9 +90,11 @@ decl_blend_dir_fn(dav1d_blend_h_ssse3);
decl_warp8x8_fn(dav1d_warp_affine_8x8_avx2);
decl_warp8x8_fn(dav1d_warp_affine_8x8_sse4);
decl_warp8x8_fn(dav1d_warp_affine_8x8_ssse3);
decl_warp8x8_fn(dav1d_warp_affine_8x8_sse2);
decl_warp8x8t_fn(dav1d_warp_affine_8x8t_avx2);
decl_warp8x8t_fn(dav1d_warp_affine_8x8t_sse4);
decl_warp8x8t_fn(dav1d_warp_affine_8x8t_ssse3);
decl_warp8x8t_fn(dav1d_warp_affine_8x8t_sse2);

decl_emu_edge_fn(dav1d_emu_edge_avx2);
decl_emu_edge_fn(dav1d_emu_edge_ssse3);
@@ -104,6 +106,13 @@ COLD void bitfn(dav1d_mc_dsp_init_x86)(Dav1dMCDSPContext *const c) {
c->mct[type] = dav1d_prep_##name##_##suffix
const unsigned flags = dav1d_get_cpu_flags();

if(!(flags & DAV1D_X86_CPU_FLAG_SSE2))
return;

#if BITDEPTH == 8
c->warp8x8 = dav1d_warp_affine_8x8_sse2;
c->warp8x8t = dav1d_warp_affine_8x8t_sse2;
#endif

if(!(flags & DAV1D_X86_CPU_FLAG_SSSE3))
return;
||||
|
165
third_party/dav1d/src/x86/mc_ssse3.asm
vendored
@ -68,7 +68,9 @@ pw_6903: times 8 dw 6903
|
||||
pw_8192: times 8 dw 8192
|
||||
pd_32: times 4 dd 32
|
||||
pd_512: times 4 dd 512
|
||||
pd_16384: times 4 dd 16484
|
||||
pd_32768: times 4 dd 32768
|
||||
pd_262144:times 4 dd 262144
|
||||
|
||||
pw_258: times 2 dw 258
|
||||
|
||||
@ -3385,6 +3387,8 @@ cglobal prep_8tap, 1, 9, 0, tmp, src, stride, w, h, mx, my, stride3
|
||||
%define m14 m6
|
||||
%define m15 m7
|
||||
%define m11 m7
|
||||
%endif
|
||||
%if notcpuflag(ssse3) || ARCH_X86_32
|
||||
pxor m11, m11
|
||||
%endif
|
||||
lea tmp1d, [myq+deltaq*4]
|
||||
@ -3483,6 +3487,7 @@ cglobal warp_affine_8x8t, 0, 7, 16, -0x130-copy_args, tmp, ts
|
||||
mova m14, [esp+0xE0]
|
||||
mova m15, [esp+0xF0]
|
||||
%endif
|
||||
%if cpuflag(ssse3)
|
||||
psrad m12, 13
|
||||
psrad m13, 13
|
||||
psrad m14, 13
|
||||
@ -3492,6 +3497,22 @@ cglobal warp_affine_8x8t, 0, 7, 16, -0x130-copy_args, tmp, ts
|
||||
mova m13, [PIC_sym(pw_8192)]
|
||||
pmulhrsw m12, m13 ; (x + (1 << 6)) >> 7
|
||||
pmulhrsw m14, m13
|
||||
%else
|
||||
%if ARCH_X86_32
|
||||
%define m10 m0
|
||||
%endif
|
||||
mova m10, [PIC_sym(pd_16384)]
|
||||
paddd m12, m10
|
||||
paddd m13, m10
|
||||
paddd m14, m10
|
||||
paddd m15, m10
|
||||
psrad m12, 15
|
||||
psrad m13, 15
|
||||
psrad m14, 15
|
||||
psrad m15, 15
|
||||
packssdw m12, m13
|
||||
packssdw m14, m15
|
||||
%endif
|
||||
mova [tmpq+tsq*0], m12
|
||||
mova [tmpq+tsq*2], m14
|
||||
dec counterd
|
||||
@ -3554,11 +3575,16 @@ cglobal warp_affine_8x8, 0, 7, 16, -0x130-copy_args, \
|
||||
call .main2
|
||||
lea dstq, [dstq+dsq*2]
|
||||
.start:
|
||||
%if cpuflag(ssse3)
|
||||
%if ARCH_X86_64
|
||||
mova m10, [PIC_sym(pw_8192)]
|
||||
%if notcpuflag(sse4)
|
||||
%if cpuflag(ssse3)
|
||||
%define roundval pw_8192
|
||||
%else
|
||||
%define m10 [PIC_sym(pw_8192)]
|
||||
%define roundval pd_262144
|
||||
%endif
|
||||
%if ARCH_X86_64
|
||||
mova m10, [PIC_sym(roundval)]
|
||||
%else
|
||||
%define m10 [PIC_sym(roundval)]
|
||||
%endif
|
||||
%endif
|
||||
%if ARCH_X86_32
|
||||
@ -3577,10 +3603,18 @@ cglobal warp_affine_8x8, 0, 7, 16, -0x130-copy_args, \
|
||||
packusdw m12, m13
|
||||
pavgw m12, m11 ; (x + (1 << 10)) >> 11
|
||||
%else
|
||||
%if cpuflag(ssse3)
|
||||
psrad m12, 17
|
||||
psrad m13, 17
|
||||
packssdw m12, m13
|
||||
pmulhrsw m12, m10 ; (x + (1 << 10)) >> 11
|
||||
pmulhrsw m12, m10
|
||||
%else
|
||||
paddd m12, m10
|
||||
paddd m13, m10
|
||||
psrad m12, 19
|
||||
psrad m13, 19
|
||||
packssdw m12, m13
|
||||
%endif
|
||||
%endif
|
||||
%if ARCH_X86_32
|
||||
%define m14 m6
|
||||
@ -3594,10 +3628,18 @@ cglobal warp_affine_8x8, 0, 7, 16, -0x130-copy_args, \
|
||||
packusdw m14, m15
|
||||
pavgw m14, m11 ; (x + (1 << 10)) >> 11
|
||||
%else
|
||||
%if cpuflag(ssse3)
|
||||
psrad m14, 17
|
||||
psrad m15, 17
|
||||
packssdw m14, m15
|
||||
pmulhrsw m14, m10 ; (x + (1 << 10)) >> 11
|
||||
pmulhrsw m14, m10
|
||||
%else
|
||||
paddd m14, m10
|
||||
paddd m15, m10
|
||||
psrad m14, 19
|
||||
psrad m15, 19
|
||||
packssdw m14, m15
|
||||
%endif
|
||||
%endif
|
||||
packuswb m12, m14
|
||||
movq [dstq+dsq*0], m12
|
||||
@ -3647,12 +3689,17 @@ ALIGN function_align
|
||||
lea filterq, [PIC_sym(mc_warp_filter)]
|
||||
%if ARCH_X86_64
|
||||
mov myd, r6m
|
||||
%if cpuflag(ssse3)
|
||||
pxor m11, m11
|
||||
%endif
|
||||
%endif
|
||||
call .h
|
||||
psrld m2, m0, 16
|
||||
psrld m3, m1, 16
|
||||
%if ARCH_X86_32
|
||||
%if notcpuflag(ssse3)
|
||||
mova [esp+gprsize+0x00], m2
|
||||
%endif
|
||||
mova [esp+gprsize+0x10], m3
|
||||
%endif
|
||||
call .h
|
||||
@ -3666,6 +3713,9 @@ ALIGN function_align
|
||||
%if ARCH_X86_64
|
||||
%define blendmask [rsp+gprsize+0x80]
|
||||
%else
|
||||
%if notcpuflag(ssse3)
|
||||
mova m2, [esp+gprsize+0x00]
|
||||
%endif
|
||||
mova m3, [esp+gprsize+0x10]
|
||||
%define blendmask [esp+gprsize+0x120]
|
||||
%define m10 m7
|
||||
@ -3689,6 +3739,9 @@ ALIGN function_align
|
||||
mova [rsp+gprsize+0x30], m5
|
||||
call .h
|
||||
%if ARCH_X86_32
|
||||
%if notcpuflag(ssse3)
|
||||
mova m2, [esp+gprsize+0x00]
|
||||
%endif
|
||||
mova m3, [esp+gprsize+0x10]
|
||||
%define m10 m5
|
||||
%endif
|
||||
@ -3848,6 +3901,7 @@ ALIGN function_align
|
||||
lea tmp2d, [mxq+alphaq*1]
|
||||
shr mxd, 10
|
||||
shr tmp1d, 10
|
||||
%if cpuflag(ssse3)
|
||||
movq m14, [filterq+mxq *8] ; 2 X
|
||||
movq m9, [filterq+tmp1q*8] ; 6 X
|
||||
lea tmp1d, [tmp2q+alphaq*4]
|
||||
@ -3864,10 +3918,99 @@ ALIGN function_align
|
||||
pmaddubsw m15, m14
|
||||
pshufb m10, m10, [PIC_sym(warp_8x8_shufD)]
|
||||
pmaddubsw m10, m9
|
||||
mova m14, [PIC_sym(pw_8192)]
|
||||
mova m9, [PIC_sym(pd_32768)]
|
||||
phaddw m0, m15
|
||||
phaddw m1, m10
|
||||
%else
|
||||
%if ARCH_X86_32
|
||||
%define m11 m2
|
||||
%endif
|
||||
pcmpeqw m0, m0
|
||||
psrlw m14, m0, 8
|
||||
psrlw m15, m10, 8 ; 01 03 05 07 09 11 13 15
|
||||
pand m14, m10 ; 00 02 04 06 08 10 12 14
|
||||
packuswb m14, m15 ; 00 02 04 06 08 10 12 14 01 03 05 07 09 11 13 15
|
||||
psrldq m9, m0, 4
|
||||
pshufd m0, m14, q0220
|
||||
pand m0, m9
|
||||
psrldq m14, 1 ; 02 04 06 08 10 12 14 01 03 05 07 09 11 13 15 __
|
||||
pslldq m15, m14, 12
|
||||
por m0, m15 ; shufA
|
||||
psrlw m15, m0, 8
|
||||
psraw m11, m1, 8
|
||||
psllw m0, 8
|
||||
psllw m1, 8
|
||||
psrlw m0, 8
|
||||
psraw m1, 8
|
||||
pmullw m15, m11
|
||||
pmullw m0, m1
|
||||
paddw m0, m15 ; pmaddubsw m0, m1
|
||||
pshufd m15, m14, q0220
|
||||
pand m15, m9
|
||||
psrldq m14, 1 ; 04 06 08 10 12 14 01 03 05 07 09 11 13 15 __ __
|
||||
pslldq m1, m14, 12
|
||||
por m15, m1 ; shufC
|
||||
pshufd m1, m14, q0220
|
||||
pand m1, m9
|
||||
psrldq m14, 1 ; 06 08 10 12 14 01 03 05 07 09 11 13 15 __ __ __
|
||||
pslldq m11, m14, 12
|
||||
por m1, m11 ; shufB
|
||||
pshufd m10, m14, q0220
|
||||
pand m10, m9
|
||||
psrldq m14, 1 ; 08 10 12 14 01 03 05 07 09 11 13 15 __ __ __ __
|
||||
pslldq m14, m14, 12
|
||||
por m10, m14 ; shufD
|
||||
psrlw m9, m1, 8
|
||||
psraw m11, m8, 8
|
||||
psllw m1, 8
|
||||
psllw m8, 8
|
||||
psrlw m1, 8
|
||||
psraw m8, 8
|
||||
pmullw m9, m11
|
||||
pmullw m1, m8
|
||||
paddw m1, m9 ; pmaddubsw m1, m8
|
||||
movq m14, [filterq+mxq *8] ; 2 X
|
||||
movq m9, [filterq+tmp1q*8] ; 6 X
|
||||
lea tmp1d, [tmp2q+alphaq*4]
|
||||
lea mxd, [tmp2q+betaq] ; mx += beta
|
||||
shr tmp2d, 10
|
||||
shr tmp1d, 10
|
||||
movhps m14, [filterq+tmp2q*8] ; 2 3
|
||||
movhps m9, [filterq+tmp1q*8] ; 6 7
|
||||
psrlw m8, m15, 8
|
||||
psraw m11, m14, 8
|
||||
psllw m15, 8
|
||||
psllw m14, 8
|
||||
psrlw m15, 8
|
||||
psraw m14, 8
|
||||
pmullw m8, m11
|
||||
pmullw m15, m14
|
||||
paddw m15, m8 ; pmaddubsw m15, m14
|
||||
psrlw m8, m10, 8
|
||||
psraw m11, m9, 8
|
||||
psllw m10, 8
|
||||
psllw m9, 8
|
||||
psrlw m10, 8
|
||||
psraw m9, 8
|
||||
pmullw m8, m11
|
||||
pmullw m10, m9
|
||||
paddw m10, m8 ; pmaddubsw m10, m9
|
||||
pslld m8, m0, 16
|
||||
pslld m9, m1, 16
|
||||
pslld m14, m15, 16
|
||||
pslld m11, m10, 16
|
||||
paddw m0, m8
|
||||
paddw m1, m9
|
||||
paddw m15, m14
|
||||
paddw m10, m11
|
||||
psrad m0, 16
|
||||
psrad m1, 16
|
||||
psrad m15, 16
|
||||
psrad m10, 16
|
||||
packssdw m0, m15 ; phaddw m0, m15
|
||||
packssdw m1, m10 ; phaddw m1, m10
|
||||
%endif
|
||||
mova m14, [PIC_sym(pw_8192)]
|
||||
mova m9, [PIC_sym(pd_32768)]
|
||||
pmaddwd m0, m14 ; 17-bit intermediate, upshifted by 13
|
||||
pmaddwd m1, m14
|
||||
paddd m0, m9 ; rounded 14-bit result in upper 16 bits of dword
|
||||
@ -3883,6 +4026,12 @@ INIT_XMM ssse3
|
||||
WARP_AFFINE_8X8
|
||||
WARP_AFFINE_8X8T
|
||||
|
||||
INIT_XMM sse2
|
||||
WARP_AFFINE_8X8
|
||||
WARP_AFFINE_8X8T
|
||||
|
||||
INIT_XMM ssse3
|
||||
|
||||
%if WIN64
|
||||
DECLARE_REG_TMP 6, 4
|
||||
%else
|
||||
|
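An aside on the rounding in the warp_affine hunks above (this note and the snippet are editorial, not part of the commit): the SSSE3 path shifts the 32-bit accumulators right by 17, packs to words and multiplies by pw_8192 with pmulhrsw, while the new SSE2 fallback adds pd_262144 (1 << 18) and shifts right by 19. Both reduce to (x + (1 << 18)) >> 19. The scalar sketch below, with names of my own choosing, checks that the two formulations agree; it assumes arithmetic right shift on signed integers, as psrad provides.

/* Editorial sketch: scalar models of the two rounding paths used above.
 * pmulhrsw(a, 8192) computes (a * 8192 + (1 << 14)) >> 15. */
#include <assert.h>
#include <stdint.h>
#include <stdio.h>

/* SSSE3 path: psrad by 17, then pmulhrsw with pw_8192 (ignoring the int16
 * saturation of packssdw, which the values are assumed to stay inside). */
static int32_t round_ssse3(int32_t x) {
    const int32_t t = x >> 17;                       /* psrad m12, 17 */
    return (t * 8192 + (1 << 14)) >> 15;             /* pmulhrsw m12, m10 */
}

/* SSE2 path: paddd with pd_262144 (1 << 18), then psrad by 19. */
static int32_t round_sse2(int32_t x) {
    return (x + (1 << 18)) >> 19;
}

int main(void) {
    /* Both forms reduce to (x + (1 << 18)) >> 19 over the tested range. */
    for (int32_t x = -(1 << 24); x < (1 << 24); x += 257)
        assert(round_ssse3(x) == round_sse2(x));
    puts("rounding paths match");
    return 0;
}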
15
third_party/dav1d/tests/checkasm/checkasm.c
vendored
@ -45,15 +45,22 @@ static unsigned get_seed(void) {
|
||||
#else
|
||||
#include <unistd.h>
|
||||
#include <signal.h>
|
||||
#include <sys/time.h>
|
||||
#include <time.h>
|
||||
#ifdef __APPLE__
|
||||
#include <mach/mach_time.h>
|
||||
#endif
|
||||
#define COLOR_RED 1
|
||||
#define COLOR_GREEN 2
|
||||
#define COLOR_YELLOW 3
|
||||
|
||||
static unsigned get_seed(void) {
|
||||
struct timeval tv;
|
||||
gettimeofday(&tv, NULL);
|
||||
return (unsigned) (tv.tv_usec + tv.tv_sec * 1000000);
|
||||
#ifdef __APPLE__
|
||||
return (unsigned) mach_absolute_time();
|
||||
#elif defined(HAVE_CLOCK_GETTIME)
|
||||
struct timespec ts;
|
||||
clock_gettime(CLOCK_MONOTONIC, &ts);
|
||||
return (unsigned) (1000000000ULL * ts.tv_sec + ts.tv_nsec);
|
||||
#endif
|
||||
}
|
||||
#endif
|
||||
|
||||
|
2
third_party/dav1d/tests/meson.build
vendored
@ -90,7 +90,7 @@ if is_asm_enabled
|
||||
include_directories: dav1d_inc_dirs,
|
||||
c_args: [stackalign_flag, stackrealign_flag],
|
||||
build_by_default: false,
|
||||
dependencies : [thread_dependency, m_lib],
|
||||
dependencies : [thread_dependency, rt_dependency, m_lib],
|
||||
)
|
||||
|
||||
test('checkasm', checkasm, is_parallel: false)
|
||||
|
21
third_party/dav1d/tools/dav1d.c
vendored
@ -32,6 +32,7 @@
|
||||
#include <errno.h>
|
||||
#include <inttypes.h>
|
||||
#include <math.h>
|
||||
#include <stdlib.h>
|
||||
#include <stdio.h>
|
||||
#include <string.h>
|
||||
#include <time.h>
|
||||
@ -44,7 +45,7 @@
|
||||
#ifdef _WIN32
|
||||
# include <windows.h>
|
||||
#endif
|
||||
#if defined(HAVE_MACH_ABSOLUTE_TIME)
|
||||
#ifdef __APPLE__
|
||||
#include <mach/mach_time.h>
|
||||
#endif
|
||||
|
||||
@ -67,7 +68,7 @@ static uint64_t get_time_nanos(void) {
|
||||
struct timespec ts;
|
||||
clock_gettime(CLOCK_MONOTONIC, &ts);
|
||||
return 1000000000ULL * ts.tv_sec + ts.tv_nsec;
|
||||
#elif defined(HAVE_MACH_ABSOLUTE_TIME)
|
||||
#elif defined(__APPLE__)
|
||||
mach_timebase_info_data_t info;
|
||||
mach_timebase_info(&info);
|
||||
return mach_absolute_time() * info.numer / info.denom;
|
||||
@ -145,7 +146,7 @@ int main(const int argc, char *const *const argv) {
|
||||
if (strcmp(version, DAV1D_VERSION)) {
|
||||
fprintf(stderr, "Version mismatch (library: %s, executable: %s)\n",
|
||||
version, DAV1D_VERSION);
|
||||
return -1;
|
||||
return EXIT_FAILURE;
|
||||
}
|
||||
|
||||
init_demuxers();
|
||||
@ -156,12 +157,12 @@ int main(const int argc, char *const *const argv) {
|
||||
cli_settings.inputfile,
|
||||
fps, &total, timebase)) < 0)
|
||||
{
|
||||
return res;
|
||||
return EXIT_FAILURE;
|
||||
}
|
||||
for (unsigned i = 0; i <= cli_settings.skip; i++) {
|
||||
if ((res = input_read(in, &data)) < 0) {
|
||||
input_close(in);
|
||||
return res;
|
||||
return EXIT_FAILURE;
|
||||
}
|
||||
if (i < cli_settings.skip) dav1d_data_unref(&data);
|
||||
}
|
||||
@ -176,7 +177,7 @@ int main(const int argc, char *const *const argv) {
|
||||
while (dav1d_parse_sequence_header(&seq, data.data, data.sz)) {
|
||||
if ((res = input_read(in, &data)) < 0) {
|
||||
input_close(in);
|
||||
return res;
|
||||
return EXIT_FAILURE;
|
||||
}
|
||||
seq_skip++;
|
||||
}
|
||||
@ -191,7 +192,7 @@ int main(const int argc, char *const *const argv) {
|
||||
total = cli_settings.limit;
|
||||
|
||||
if ((res = dav1d_open(&c, &lib_settings)))
|
||||
return res;
|
||||
return EXIT_FAILURE;
|
||||
|
||||
if (cli_settings.frametimes)
|
||||
frametimes = fopen(cli_settings.frametimes, "w");
|
||||
@ -234,7 +235,7 @@ int main(const int argc, char *const *const argv) {
|
||||
&p.p, fps)) < 0)
|
||||
{
|
||||
if (frametimes) fclose(frametimes);
|
||||
return res;
|
||||
return EXIT_FAILURE;
|
||||
}
|
||||
}
|
||||
if ((res = output_write(out, &p)) < 0)
|
||||
@ -271,7 +272,7 @@ int main(const int argc, char *const *const argv) {
|
||||
&p.p, fps)) < 0)
|
||||
{
|
||||
if (frametimes) fclose(frametimes);
|
||||
return res;
|
||||
return EXIT_FAILURE;
|
||||
}
|
||||
}
|
||||
if ((res = output_write(out, &p)) < 0)
|
||||
@ -302,5 +303,5 @@ int main(const int argc, char *const *const argv) {
|
||||
}
|
||||
dav1d_close(&c);
|
||||
|
||||
return res;
|
||||
return (res == 0) ? EXIT_SUCCESS : EXIT_FAILURE;
|
||||
}
|
||||
|
2
third_party/dav1d/tools/dav1d_cli_parse.c
vendored
@ -104,7 +104,7 @@ static void usage(const char *const app, const char *const reason, ...) {
|
||||
fprintf(stderr, "Supported options:\n"
|
||||
" --input/-i $file: input file\n"
|
||||
" --output/-o $file: output file\n"
|
||||
" --demuxer $name: force demuxer type ('ivf' or 'annexb'; default: detect from extension)\n"
|
||||
" --demuxer $name: force demuxer type ('ivf', 'section5' or 'annexb'; default: detect from extension)\n"
|
||||
" --muxer $name: force muxer type ('md5', 'yuv', 'yuv4mpeg2' or 'null'; default: detect from extension)\n"
|
||||
" --quiet/-q: disable status messages\n"
|
||||
" --frametimes $file: dump frame times to file\n"
|
||||
|
114
third_party/dav1d/tools/input/annexb.c
vendored
@ -1,6 +1,7 @@
|
||||
/*
|
||||
* Copyright © 2018, VideoLAN and dav1d authors
|
||||
* Copyright © 2018, Two Orioles, LLC
|
||||
* Copyright © 2019, James Almer <jamrial@gmail.com>
|
||||
* All rights reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
@ -27,13 +28,96 @@
|
||||
|
||||
#include "config.h"
|
||||
|
||||
#include <errno.h>
|
||||
#include <stdio.h>
|
||||
#include <stdint.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
|
||||
#include "common/intops.h"
|
||||
|
||||
#include "dav1d/headers.h"
|
||||
|
||||
#include "input/demuxer.h"
|
||||
#include "input/parse.h"
|
||||
|
||||
// these functions are based on an implementation from FFmpeg, and relicensed
|
||||
// with author's permission
|
||||
|
||||
#define PROBE_SIZE 1024
|
||||
|
||||
static int annexb_probe(const uint8_t *data) {
|
||||
int ret, cnt = 0;
|
||||
|
||||
size_t temporal_unit_size;
|
||||
ret = leb(data + cnt, PROBE_SIZE - cnt, &temporal_unit_size);
|
||||
if (ret < 0)
|
||||
return 0;
|
||||
cnt += ret;
|
||||
|
||||
size_t frame_unit_size;
|
||||
ret = leb(data + cnt, PROBE_SIZE - cnt, &frame_unit_size);
|
||||
if (ret < 0 || ((uint64_t)frame_unit_size + ret) > temporal_unit_size)
|
||||
return 0;
|
||||
cnt += ret;
|
||||
|
||||
temporal_unit_size -= ret;
|
||||
|
||||
size_t obu_unit_size;
|
||||
ret = leb(data + cnt, PROBE_SIZE - cnt, &obu_unit_size);
|
||||
if (ret < 0 || ((uint64_t)obu_unit_size + ret) >= frame_unit_size)
|
||||
return 0;
|
||||
cnt += ret;
|
||||
|
||||
temporal_unit_size -= obu_unit_size + ret;
|
||||
frame_unit_size -= obu_unit_size + ret;
|
||||
|
||||
// Check that the first OBU is a Temporal Delimiter.
|
||||
size_t obu_size;
|
||||
enum Dav1dObuType type;
|
||||
ret = parse_obu_header(data + cnt, imin(PROBE_SIZE - cnt, (int) obu_unit_size),
|
||||
&obu_size, &type, 1);
|
||||
if (ret < 0 || type != DAV1D_OBU_TD || obu_size > 0)
|
||||
return 0;
|
||||
cnt += (int)obu_unit_size;
|
||||
|
||||
// look for first frame and accompanying sequence header
|
||||
int seq = 0;
|
||||
while (cnt < PROBE_SIZE) {
|
||||
ret = leb(data + cnt, PROBE_SIZE - cnt, &obu_unit_size);
|
||||
if (ret < 0 || ((uint64_t)obu_unit_size + ret) > frame_unit_size)
|
||||
return 0;
|
||||
cnt += ret;
|
||||
temporal_unit_size -= ret;
|
||||
frame_unit_size -= ret;
|
||||
|
||||
ret = parse_obu_header(data + cnt, imin(PROBE_SIZE - cnt, (int) obu_unit_size),
|
||||
&obu_size, &type, 1);
|
||||
if (ret < 0)
|
||||
return 0;
|
||||
cnt += (int)obu_unit_size;
|
||||
|
||||
switch (type) {
|
||||
case DAV1D_OBU_SEQ_HDR:
|
||||
seq = 1;
|
||||
break;
|
||||
case DAV1D_OBU_FRAME:
|
||||
case DAV1D_OBU_FRAME_HDR:
|
||||
return seq;
|
||||
case DAV1D_OBU_TD:
|
||||
case DAV1D_OBU_TILE_GRP:
|
||||
return 0;
|
||||
default:
|
||||
break;
|
||||
}
|
||||
|
||||
temporal_unit_size -= obu_unit_size;
|
||||
frame_unit_size -= obu_unit_size;
|
||||
if (frame_unit_size <= 0)
|
||||
break;
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
typedef struct DemuxerPriv {
|
||||
FILE *f;
|
||||
@ -41,23 +125,6 @@ typedef struct DemuxerPriv {
|
||||
size_t frame_unit_size;
|
||||
} AnnexbInputContext;
|
||||
|
||||
static int leb128(AnnexbInputContext *const c, size_t *const len) {
|
||||
unsigned more, i = 0;
|
||||
uint8_t byte;
|
||||
*len = 0;
|
||||
do {
|
||||
if (fread(&byte, 1, 1, c->f) < 1)
|
||||
return -1;
|
||||
more = byte & 0x80;
|
||||
unsigned bits = byte & 0x7f;
|
||||
if (i <= 3 || (i == 4 && bits < (1 << 4)))
|
||||
*len |= bits << (i * 7);
|
||||
else if (bits) return -1;
|
||||
if (++i == 8 && more) return -1;
|
||||
} while (more);
|
||||
return i;
|
||||
}
|
||||
|
||||
static int annexb_open(AnnexbInputContext *const c, const char *const file,
|
||||
unsigned fps[2], unsigned *const num_frames, unsigned timebase[2])
|
||||
{
|
||||
@ -75,7 +142,7 @@ static int annexb_open(AnnexbInputContext *const c, const char *const file,
|
||||
timebase[0] = 25;
|
||||
timebase[1] = 1;
|
||||
for (*num_frames = 0;; (*num_frames)++) {
|
||||
res = leb128(c, &len);
|
||||
res = leb128(c->f, &len);
|
||||
if (res < 0)
|
||||
break;
|
||||
fseeko(c->f, len, SEEK_CUR);
|
||||
@ -90,15 +157,15 @@ static int annexb_read(AnnexbInputContext *const c, Dav1dData *const data) {
|
||||
int res;
|
||||
|
||||
if (!c->temporal_unit_size) {
|
||||
res = leb128(c, &c->temporal_unit_size);
|
||||
res = leb128(c->f, &c->temporal_unit_size);
|
||||
if (res < 0) return -1;
|
||||
}
|
||||
if (!c->frame_unit_size) {
|
||||
res = leb128(c, &c->frame_unit_size);
|
||||
res = leb128(c->f, &c->frame_unit_size);
|
||||
if (res < 0 || (c->frame_unit_size + res) > c->temporal_unit_size) return -1;
|
||||
c->temporal_unit_size -= res;
|
||||
}
|
||||
res = leb128(c, &len);
|
||||
res = leb128(c->f, &len);
|
||||
if (res < 0 || (len + res) > c->frame_unit_size) return -1;
|
||||
uint8_t *ptr = dav1d_data_create(data, len);
|
||||
if (!ptr) return -1;
|
||||
@ -120,7 +187,8 @@ static void annexb_close(AnnexbInputContext *const c) {
|
||||
const Demuxer annexb_demuxer = {
|
||||
.priv_data_size = sizeof(AnnexbInputContext),
|
||||
.name = "annexb",
|
||||
.extension = "obu",
|
||||
.probe = annexb_probe,
|
||||
.probe_sz = PROBE_SIZE,
|
||||
.open = annexb_open,
|
||||
.read = annexb_read,
|
||||
.close = annexb_close,
|
||||
|
3
third_party/dav1d/tools/input/demuxer.h
vendored
@ -34,7 +34,8 @@ typedef struct DemuxerPriv DemuxerPriv;
|
||||
typedef struct Demuxer {
|
||||
int priv_data_size;
|
||||
const char *name;
|
||||
const char *extension;
|
||||
int probe_sz;
|
||||
int (*probe)(const uint8_t *data);
|
||||
int (*open)(DemuxerPriv *ctx, const char *filename,
|
||||
unsigned fps[2], unsigned *num_frames, unsigned timebase[2]);
|
||||
int (*read)(DemuxerPriv *ctx, Dav1dData *data);
|
||||
|
47
third_party/dav1d/tools/input/input.c
vendored
@ -33,6 +33,7 @@
|
||||
#include <string.h>
|
||||
|
||||
#include "common/attributes.h"
|
||||
#include "common/intops.h"
|
||||
|
||||
#include "input/input.h"
|
||||
#include "input/demuxer.h"
|
||||
@ -42,7 +43,7 @@ struct DemuxerContext {
|
||||
const Demuxer *impl;
|
||||
};
|
||||
|
||||
#define MAX_NUM_DEMUXERS 2
|
||||
#define MAX_NUM_DEMUXERS 3
|
||||
static const Demuxer *demuxers[MAX_NUM_DEMUXERS];
|
||||
static int num_demuxers = 0;
|
||||
|
||||
@ -55,23 +56,7 @@ static int num_demuxers = 0;
|
||||
void init_demuxers(void) {
|
||||
register_demuxer(ivf_demuxer);
|
||||
register_demuxer(annexb_demuxer);
|
||||
}
|
||||
|
||||
static const char *find_extension(const char *const f) {
|
||||
const size_t l = strlen(f);
|
||||
|
||||
if (l == 0) return NULL;
|
||||
|
||||
const char *const end = &f[l - 1], *step = end;
|
||||
while ((*step >= 'a' && *step <= 'z') ||
|
||||
(*step >= 'A' && *step <= 'Z') ||
|
||||
(*step >= '0' && *step <= '9'))
|
||||
{
|
||||
step--;
|
||||
}
|
||||
|
||||
return (step < end && step > f && *step == '.' && step[-1] != '/') ?
|
||||
&step[1] : NULL;
|
||||
register_demuxer(section5_demuxer);
|
||||
}
|
||||
|
||||
int input_open(DemuxerContext **const c_out,
|
||||
@ -94,22 +79,34 @@ int input_open(DemuxerContext **const c_out,
|
||||
return DAV1D_ERR(ENOPROTOOPT);
|
||||
}
|
||||
} else {
|
||||
const char *const ext = find_extension(filename);
|
||||
if (!ext) {
|
||||
fprintf(stderr, "No extension found for file %s\n", filename);
|
||||
return -1;
|
||||
int probe_sz = 0;
|
||||
for (i = 0; i < num_demuxers; i++)
|
||||
probe_sz = imax(probe_sz, demuxers[i]->probe_sz);
|
||||
uint8_t *const probe_data = malloc(probe_sz);
|
||||
if (!probe_data) {
|
||||
fprintf(stderr, "Failed to allocate memory\n");
|
||||
return DAV1D_ERR(ENOMEM);
|
||||
}
|
||||
FILE *f = fopen(filename, "rb");
|
||||
res = !!fread(probe_data, 1, probe_sz, f);
|
||||
fclose(f);
|
||||
if (!res) {
|
||||
free(probe_data);
|
||||
fprintf(stderr, "Failed to read probe data\n");
|
||||
return errno ? DAV1D_ERR(errno) : DAV1D_ERR(EIO);
|
||||
}
|
||||
|
||||
for (i = 0; i < num_demuxers; i++) {
|
||||
if (!strcmp(demuxers[i]->extension, ext)) {
|
||||
if (demuxers[i]->probe(probe_data)) {
|
||||
impl = demuxers[i];
|
||||
break;
|
||||
}
|
||||
}
|
||||
free(probe_data);
|
||||
if (i == num_demuxers) {
|
||||
fprintf(stderr,
|
||||
"Failed to find demuxer for file %s (\"%s\")\n",
|
||||
filename, ext);
|
||||
"Failed to probe demuxer for file %s\n",
|
||||
filename);
|
||||
return DAV1D_ERR(ENOPROTOOPT);
|
||||
}
|
||||
}
|
||||
|
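With this change the command-line tool no longer trusts the file extension: each demuxer now exports probe_sz and a probe() callback, and input_open() reads the largest required prefix once, then asks every registered demuxer in turn. The stand-alone sketch below (simplified stand-in types and toy probe rules of my own, not dav1d's) shows the shape of that selection loop.

/* Editorial sketch: content-based demuxer selection, loosely modelled on
 * input_open() after this change. Types and probes are simplified stand-ins. */
#include <stdint.h>
#include <stdio.h>
#include <string.h>

typedef struct {
    const char *name;
    int probe_sz;                          /* bytes of file prefix needed */
    int (*probe)(const uint8_t *data);     /* nonzero on match */
} FakeDemuxer;

static int probe_ivf(const uint8_t *d) { return !memcmp(d, "DKIF", 4); }
/* Toy rule: first byte looks like a temporal delimiter OBU header. */
static int probe_raw(const uint8_t *d) { return ((d[0] >> 3) & 0xf) == 2 && !(d[0] & 0x80); }

static const FakeDemuxer demuxers[] = {
    { "ivf",      4, probe_ivf },
    { "section5", 1, probe_raw },
};

static const FakeDemuxer *select_demuxer(const uint8_t *prefix) {
    for (size_t i = 0; i < sizeof(demuxers) / sizeof(demuxers[0]); i++)
        if (demuxers[i].probe(prefix))
            return &demuxers[i];
    return NULL;
}

int main(void) {
    const uint8_t ivf_hdr[] = { 'D', 'K', 'I', 'F', 0, 0, 0x20, 0 };
    const FakeDemuxer *d = select_demuxer(ivf_hdr);
    printf("selected: %s\n", d ? d->name : "(none)");
    return 0;
}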
13
third_party/dav1d/tools/input/ivf.c
vendored
@ -39,6 +39,16 @@ typedef struct DemuxerPriv {
|
||||
FILE *f;
|
||||
} IvfInputContext;
|
||||
|
||||
static const uint8_t probe_data[] = {
|
||||
'D', 'K', 'I', 'F',
|
||||
0, 0, 0x20, 0,
|
||||
'A', 'V', '0', '1',
|
||||
};
|
||||
|
||||
static int ivf_probe(const uint8_t *const data) {
|
||||
return !memcmp(data, probe_data, sizeof(probe_data));
|
||||
}
|
||||
|
||||
static unsigned rl32(const uint8_t *const p) {
|
||||
return ((uint32_t)p[3] << 24U) | (p[2] << 16U) | (p[1] << 8U) | p[0];
|
||||
}
|
||||
@ -121,7 +131,8 @@ static void ivf_close(IvfInputContext *const c) {
|
||||
const Demuxer ivf_demuxer = {
|
||||
.priv_data_size = sizeof(IvfInputContext),
|
||||
.name = "ivf",
|
||||
.extension = "ivf",
|
||||
.probe = ivf_probe,
|
||||
.probe_sz = sizeof(probe_data),
|
||||
.open = ivf_open,
|
||||
.read = ivf_read,
|
||||
.close = ivf_close,
|
||||
|
107
third_party/dav1d/tools/input/parse.h
vendored
Normal file
@ -0,0 +1,107 @@
|
||||
/*
|
||||
* Copyright © 2018, VideoLAN and dav1d authors
|
||||
* Copyright © 2018, Two Orioles, LLC
|
||||
* Copyright © 2019, James Almer <jamrial@gmail.com>
|
||||
* All rights reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright notice, this
|
||||
* list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
* this list of conditions and the following disclaimer in the documentation
|
||||
* and/or other materials provided with the distribution.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
|
||||
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
|
||||
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
|
||||
* ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
|
||||
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
|
||||
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
|
||||
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
#ifndef DAV1D_INPUT_PARSE_H
|
||||
#define DAV1D_INPUT_PARSE_H
|
||||
|
||||
#include "dav1d/headers.h"
|
||||
|
||||
static int leb128(FILE *const f, size_t *const len) {
|
||||
unsigned i = 0, more;
|
||||
*len = 0;
|
||||
do {
|
||||
uint8_t byte;
|
||||
if (fread(&byte, 1, 1, f) < 1)
|
||||
return -1;
|
||||
more = byte & 0x80;
|
||||
const unsigned bits = byte & 0x7f;
|
||||
if (i <= 3 || (i == 4 && bits < (1 << 4)))
|
||||
*len |= bits << (i * 7);
|
||||
else if (bits) return -1;
|
||||
if (++i == 8 && more) return -1;
|
||||
} while (more);
|
||||
return i;
|
||||
}
|
||||
|
||||
// these functions are based on an implementation from FFmpeg, and relicensed
|
||||
// with author's permission
|
||||
|
||||
static int leb(const uint8_t *ptr, int sz, size_t *const len) {
|
||||
unsigned i = 0, more;
|
||||
*len = 0;
|
||||
do {
|
||||
if (!sz--) return -1;
|
||||
const int byte = *ptr++;
|
||||
more = byte & 0x80;
|
||||
const unsigned bits = byte & 0x7f;
|
||||
if (i <= 3 || (i == 4 && bits < (1 << 4)))
|
||||
*len |= bits << (i * 7);
|
||||
else if (bits) return -1;
|
||||
if (++i == 8 && more) return -1;
|
||||
} while (more);
|
||||
return i;
|
||||
}
|
||||
|
||||
static inline int parse_obu_header(const uint8_t *buf, int buf_size,
|
||||
size_t *const obu_size,
|
||||
enum Dav1dObuType *const type,
|
||||
const int allow_implicit_size)
|
||||
{
|
||||
int ret, extension_flag, has_size_flag;
|
||||
|
||||
if (!buf_size)
|
||||
return -1;
|
||||
if (*buf & 0x80) // obu_forbidden_bit
|
||||
return -1;
|
||||
|
||||
*type = (*buf & 0x78) >> 3;
|
||||
extension_flag = (*buf & 0x4) >> 2;
|
||||
has_size_flag = (*buf & 0x2) >> 1;
|
||||
// ignore obu_reserved_1bit
|
||||
buf++;
|
||||
buf_size--;
|
||||
|
||||
if (extension_flag) {
|
||||
buf++;
|
||||
buf_size--;
|
||||
// ignore fields
|
||||
}
|
||||
|
||||
if (has_size_flag) {
|
||||
ret = leb(buf, buf_size, obu_size);
|
||||
if (ret < 0)
|
||||
return -1;
|
||||
return (int) *obu_size + ret + 1 + extension_flag;
|
||||
} else if (!allow_implicit_size)
|
||||
return -1;
|
||||
|
||||
*obu_size = buf_size;
|
||||
return buf_size + 1 + extension_flag;
|
||||
}
|
||||
|
||||
#endif /* DAV1D_INPUT_PARSE_H */
|
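The leb()/leb128() helpers above decode the unsigned LEB128 numbers AV1 uses for OBU size fields: each byte carries 7 value bits, least-significant group first, with the top bit set on every byte except the last. As a concrete, stand-alone check (editorial sketch, not part of parse.h; the function name is mine), the bytes 0xE5 0x8E 0x26 decode to 624485.

/* Editorial sketch: a buffer-based twin of leb() above, with the same
 * guards (at most 8 bytes, value must fit the accepted range). */
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

static int leb128_decode(const uint8_t *p, size_t sz, size_t *out) {
    unsigned i = 0, more;
    *out = 0;
    do {
        if (i >= sz) return -1;
        const uint8_t byte = p[i];
        more = byte & 0x80;
        const unsigned bits = byte & 0x7f;
        if (i <= 3 || (i == 4 && bits < (1u << 4)))
            *out |= (size_t)bits << (i * 7);     /* low 7-bit groups first */
        else if (bits) return -1;                /* value too large: reject */
        if (++i == 8 && more) return -1;         /* more than 8 bytes: reject */
    } while (more);
    return (int)i;                               /* bytes consumed */
}

int main(void) {
    const uint8_t enc[] = { 0xE5, 0x8E, 0x26 };
    size_t v;
    const int n = leb128_decode(enc, sizeof(enc), &v);
    printf("consumed %d bytes, value %zu\n", n, v);  /* 3 bytes, 624485 */
    return 0;
}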
185
third_party/dav1d/tools/input/section5.c
vendored
Normal file
@ -0,0 +1,185 @@
|
||||
/*
|
||||
* Copyright © 2019, VideoLAN and dav1d authors
|
||||
* Copyright © 2019, Two Orioles, LLC
|
||||
* Copyright © 2019, James Almer <jamrial@gmail.com>
|
||||
* All rights reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright notice, this
|
||||
* list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
* this list of conditions and the following disclaimer in the documentation
|
||||
* and/or other materials provided with the distribution.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
|
||||
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
|
||||
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
|
||||
* ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
|
||||
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
|
||||
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
|
||||
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
#include "config.h"
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdint.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include <sys/types.h>
|
||||
|
||||
#include "dav1d/headers.h"
|
||||
|
||||
#include "input/demuxer.h"
|
||||
#include "input/parse.h"
|
||||
|
||||
#define PROBE_SIZE 1024
|
||||
|
||||
static int section5_probe(const uint8_t *data) {
|
||||
int ret, cnt = 0;
|
||||
|
||||
// Check that the first OBU is a Temporal Delimiter.
|
||||
size_t obu_size;
|
||||
enum Dav1dObuType type;
|
||||
ret = parse_obu_header(data + cnt, PROBE_SIZE - cnt,
|
||||
&obu_size, &type, 0);
|
||||
if (ret < 0 || type != DAV1D_OBU_TD || obu_size > 0)
|
||||
return 0;
|
||||
cnt += ret;
|
||||
|
||||
// look for first frame and accompanying sequence header
|
||||
int seq = 0;
|
||||
while (cnt < PROBE_SIZE) {
|
||||
ret = parse_obu_header(data + cnt, PROBE_SIZE - cnt,
|
||||
&obu_size, &type, 0);
|
||||
if (ret < 0)
|
||||
return 0;
|
||||
cnt += ret;
|
||||
|
||||
switch (type) {
|
||||
case DAV1D_OBU_SEQ_HDR:
|
||||
seq = 1;
|
||||
break;
|
||||
case DAV1D_OBU_FRAME:
|
||||
case DAV1D_OBU_FRAME_HDR:
|
||||
return seq;
|
||||
case DAV1D_OBU_TD:
|
||||
case DAV1D_OBU_TILE_GRP:
|
||||
return 0;
|
||||
default:
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
typedef struct DemuxerPriv {
|
||||
FILE *f;
|
||||
} Section5InputContext;
|
||||
|
||||
static int section5_open(Section5InputContext *const c, const char *const file,
|
||||
unsigned fps[2], unsigned *const num_frames, unsigned timebase[2])
|
||||
{
|
||||
if (!(c->f = fopen(file, "rb"))) {
|
||||
fprintf(stderr, "Failed to open %s: %s\n", file, strerror(errno));
|
||||
return -1;
|
||||
}
|
||||
|
||||
// TODO: Parse sequence header and read timing info if any.
|
||||
fps[0] = 25;
|
||||
fps[1] = 1;
|
||||
timebase[0] = 25;
|
||||
timebase[1] = 1;
|
||||
*num_frames = 0;
|
||||
for (;;) {
|
||||
uint8_t byte[2];
|
||||
|
||||
if (fread(&byte[0], 1, 1, c->f) < 1)
|
||||
break;
|
||||
const enum Dav1dObuType obu_type = (byte[0] >> 3) & 0xf;
|
||||
if (obu_type == DAV1D_OBU_TD)
|
||||
(*num_frames)++;
|
||||
const int has_length_field = byte[0] & 0x2;
|
||||
if (!has_length_field)
|
||||
return -1;
|
||||
const int has_extension = byte[0] & 0x4;
|
||||
if (has_extension && fread(&byte[1], 1, 1, c->f) < 1)
|
||||
return -1;
|
||||
size_t len;
|
||||
const int res = leb128(c->f, &len);
|
||||
if (res < 0)
|
||||
return -1;
|
||||
fseeko(c->f, len, SEEK_CUR); // skip packet
|
||||
}
|
||||
fseeko(c->f, 0, SEEK_SET);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int section5_read(Section5InputContext *const c, Dav1dData *const data) {
|
||||
size_t total_bytes = 0;
|
||||
|
||||
for (int first = 1;; first = 0) {
|
||||
uint8_t byte[2];
|
||||
|
||||
if (fread(&byte[0], 1, 1, c->f) < 1) {
|
||||
if (!first && feof(c->f)) break;
|
||||
return -1;
|
||||
}
|
||||
const enum Dav1dObuType obu_type = (byte[0] >> 3) & 0xf;
|
||||
if (first) {
|
||||
if (obu_type != DAV1D_OBU_TD)
|
||||
return -1;
|
||||
} else {
|
||||
if (obu_type == DAV1D_OBU_TD) {
|
||||
// include TD in next packet
|
||||
fseeko(c->f, -1, SEEK_CUR);
|
||||
break;
|
||||
}
|
||||
}
|
||||
const int has_length_field = byte[0] & 0x2;
|
||||
if (!has_length_field)
|
||||
return -1;
|
||||
const int has_extension = !!(byte[0] & 0x4);
|
||||
if (has_extension && fread(&byte[1], 1, 1, c->f) < 1)
|
||||
return -1;
|
||||
size_t len;
|
||||
const int res = leb128(c->f, &len);
|
||||
if (res < 0)
|
||||
return -1;
|
||||
total_bytes += 1 + has_extension + res + len;
|
||||
fseeko(c->f, len, SEEK_CUR); // skip packet, we'll read it below
|
||||
}
|
||||
|
||||
fseeko(c->f, -(off_t)total_bytes, SEEK_CUR);
|
||||
uint8_t *ptr = dav1d_data_create(data, total_bytes);
|
||||
if (!ptr) return -1;
|
||||
if (fread(ptr, total_bytes, 1, c->f) != 1) {
|
||||
fprintf(stderr, "Failed to read frame data: %s\n", strerror(errno));
|
||||
dav1d_data_unref(data);
|
||||
return -1;
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static void section5_close(Section5InputContext *const c) {
|
||||
fclose(c->f);
|
||||
}
|
||||
|
||||
const Demuxer section5_demuxer = {
|
||||
.priv_data_size = sizeof(Section5InputContext),
|
||||
.name = "section5",
|
||||
.probe = section5_probe,
|
||||
.probe_sz = PROBE_SIZE,
|
||||
.open = section5_open,
|
||||
.read = section5_read,
|
||||
.close = section5_close,
|
||||
};
|
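The section-5 demuxer above relies on the low-overhead bitstream format from section 5 of the AV1 spec: the file is nothing but OBUs whose headers carry obu_has_size_field=1, with a temporal delimiter OBU opening every temporal unit. The first header byte packs, from the most significant bit down: forbidden bit, 4-bit obu_type, extension flag, has-size flag and a reserved bit. A tiny editorial sketch (not dav1d code) unpacking a hand-written temporal delimiter:

/* Editorial sketch: unpack the first byte of an OBU header the way
 * section5_open() does, for a hand-written temporal delimiter OBU. */
#include <stdint.h>
#include <stdio.h>

int main(void) {
    /* 0x12 = 0001 0010: forbidden=0, obu_type=2 (temporal delimiter),
     * extension=0, has_size_field=1, reserved=0; then leb128 size 0. */
    const uint8_t obu[] = { 0x12, 0x00 };

    const int forbidden = obu[0] >> 7;
    const int obu_type  = (obu[0] >> 3) & 0xf;
    const int extension = (obu[0] >> 2) & 1;
    const int has_size  = (obu[0] >> 1) & 1;

    printf("forbidden=%d type=%d extension=%d has_size=%d size_byte=%d\n",
           forbidden, obu_type, extension, has_size, obu[1]);
    return 0;
}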
16
third_party/dav1d/tools/meson.build
vendored
@ -28,6 +28,7 @@ dav1d_input_sources = files(
|
||||
'input/input.c',
|
||||
'input/annexb.c',
|
||||
'input/ivf.c',
|
||||
'input/section5.c',
|
||||
)
|
||||
|
||||
dav1d_output_sources = files(
|
||||
@ -68,21 +69,6 @@ endif
|
||||
# Configuratin data for cli_config.h
|
||||
cli_cdata = configuration_data()
|
||||
|
||||
rt_dependency = []
|
||||
if host_machine.system() != 'windows'
|
||||
if cc.has_function('clock_gettime', prefix : '#include <time.h>', args : test_args)
|
||||
cli_cdata.set('HAVE_CLOCK_GETTIME', 1)
|
||||
elif host_machine.system() == 'darwin'
|
||||
cli_cdata.set('HAVE_MACH_ABSOLUTE_TIME', 1)
|
||||
else
|
||||
rt_dependency = cc.find_library('rt', required: false)
|
||||
if not cc.has_function('clock_gettime', prefix : '#include <time.h>', args : test_args, dependencies : rt_dependency)
|
||||
error('clock_gettime not found')
|
||||
endif
|
||||
cli_cdata.set('HAVE_CLOCK_GETTIME', 1)
|
||||
endif
|
||||
endif
|
||||
|
||||
cli_config_h_target = configure_file(output: 'cli_config.h', configuration: cli_cdata)
|
||||
|
||||
# dav1d cli tool sources
|
||||
|
4
third_party/dav1d/tools/output/output.c
vendored
@ -146,10 +146,10 @@ void output_close(MuxerContext *const ctx) {
|
||||
free(ctx);
|
||||
}
|
||||
|
||||
int output_verify(MuxerContext *const ctx, const char *const md5_Str) {
|
||||
int output_verify(MuxerContext *const ctx, const char *const md5_str) {
|
||||
int res = 0;
|
||||
if (ctx->impl->verify)
|
||||
res = ctx->impl->verify(ctx->data, md5_Str);
|
||||
res = ctx->impl->verify(ctx->data, md5_str);
|
||||
free(ctx);
|
||||
return res;
|
||||
}
|
||||
|