Bug 1612060 - Update libdav1d to 0.5.2; r=achronop

Differential Revision: https://phabricator.services.mozilla.com/D61223

--HG--
extra : moz-landing-system : lando
This commit is contained in:
Dan Minor 2020-01-31 13:28:26 +00:00
parent 2dfc28fbbe
commit f28bbffcea
52 changed files with 4760 additions and 750 deletions

View File

@ -202,6 +202,8 @@ elif CONFIG['CPU_ARCH'] == 'arm' or CONFIG['CPU_ARCH'] == 'aarch64':
elif CONFIG['CPU_ARCH'] == 'arm':
SOURCES += [
'../../../third_party/dav1d/src/arm/32/cdef.S',
'../../../third_party/dav1d/src/arm/32/ipred.S',
'../../../third_party/dav1d/src/arm/32/loopfilter.S',
'../../../third_party/dav1d/src/arm/32/looprestoration.S',
'../../../third_party/dav1d/src/arm/32/mc.S',
]

View File

@ -1,7 +1,7 @@
#define API_VERSION_NUMBER 3,0,0,0
#define API_VERSION_NUMBER_STR "3.0.0"
#define PROJECT_VERSION_NUMBER 0,4,0,0
#define PROJECT_VERSION_NUMBER_STR "0.4.0"
#define API_VERSION_NUMBER 3,1,0,0
#define API_VERSION_NUMBER_STR "3.1.0"
#define PROJECT_VERSION_NUMBER 0,5,2,0
#define PROJECT_VERSION_NUMBER_STR "0.5.2"
#include <windows.h>

View File

@ -20,7 +20,7 @@ origin:
# Human-readable identifier for this version/release
# Generally "version NNN", "tag SSS", "bookmark SSS"
release: commit 5595102721d3c298d7cee64e64878486a3b8bdad (2019-10-22T19:50:25.000+02:00).
release: commit 39667c751d427e447cbe8be783cfecd296659e24 (2019-12-02T18:19:06.000+01:00).
# The package's license, where possible using the mnemonic from
# https://spdx.org/licenses/

View File

@ -1,2 +1,2 @@
/* auto-generated, do not edit */
#define DAV1D_VERSION "0.5.0-6-g5595102"
#define DAV1D_VERSION "0.5.2-0-g39667c7"

View File

@ -28,7 +28,7 @@
#define DAV1D_VERSION_H
#define DAV1D_API_VERSION_MAJOR 3
#define DAV1D_API_VERSION_MINOR 0
#define DAV1D_API_VERSION_MINOR 1
#define DAV1D_API_VERSION_PATCH 0
#endif /* DAV1D_VERSION_H */

View File

@ -1,5 +1,27 @@
Changes for 0.5.2 'Asiatic Cheetah':
------------------------------------
0.5.2 is a small release improving speed for ARM32 and adding minor features:
- ARM32 optimizations for loopfilter, ipred_dc|h|v
- Add section-5 raw OBU demuxer
- Improve the speed by reducing the L2 cache collisions
- Fix minor issues
Changes for 0.5.1 'Asiatic Cheetah':
------------------------------------
0.5.1 is a small release improving speeds and fixing minor issues
compared to 0.5.0:
- SSE2 optimizations for CDEF, wiener and warp_affine
- NEON optimizations for SGR on ARM32
- Fix mismatch issue in x86 asm in inverse identity transforms
- Fix build issue in ARM64 assembly if debug info was enabled
- Add a workaround for Xcode 11 -fstack-check bug
Changes for 0.5.0 'Asiatic Cheetah':
----------------------------
------------------------------------
0.5.0 is a medium release fixing regressions and minor issues,
and improving speed significantly:

View File

@ -31,15 +31,16 @@ The plan is the following:
2. Provide a usable API,
3. Port to most platforms,
4. Make it fast on desktop, by writing asm for AVX-2 chips.
5. Make it fast on mobile, by writing asm for ARMv8 chips,
6. Make it fast on older desktop, by writing asm for SSSE3+ chips.
### On-going
5. Make it fast on mobile, by writing asm for ARMv8 chips,
6. Make it fast on older desktop, by writing asm for SSE chips.
7. Make it fast on older mobiles, by writing asm for ARMv7 chips,
8. Improve C code base with [various tweaks](https://code.videolan.org/videolan/dav1d/wikis/task-list),
9. Accelerate for less common architectures, like PPC, SSE2 or AVX-512.
### After
7. Improve C code base with [various tweaks](https://code.videolan.org/videolan/dav1d/wikis/task-list),
8. Accelerate for less common architectures,
9. Use more GPU, when possible.
10. Use more GPU, when possible.
# Contribute

View File

@ -41,6 +41,17 @@
#define DAV1D_REFS_PER_FRAME 7
#define DAV1D_TOTAL_REFS_PER_FRAME (DAV1D_REFS_PER_FRAME + 1)
enum Dav1dObuType {
DAV1D_OBU_SEQ_HDR = 1,
DAV1D_OBU_TD = 2,
DAV1D_OBU_FRAME_HDR = 3,
DAV1D_OBU_TILE_GRP = 4,
DAV1D_OBU_METADATA = 5,
DAV1D_OBU_FRAME = 6,
DAV1D_OBU_REDUNDANT_FRAME_HDR = 7,
DAV1D_OBU_PADDING = 15,
};
enum Dav1dTxfmMode {
DAV1D_TX_4X4_ONLY,
DAV1D_TX_LARGEST,

View File

@ -37,7 +37,7 @@
/* Number of bytes to align AND pad picture memory buffers by, so that SIMD
* implementations can over-read by a few bytes, and use aligned read/write
* instructions. */
#define DAV1D_PICTURE_ALIGNMENT 32
#define DAV1D_PICTURE_ALIGNMENT 64
typedef struct Dav1dPictureParameters {
int w; ///< width (in pixels)

View File

@ -23,14 +23,14 @@
# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
project('dav1d', ['c'],
version: '0.5.0',
version: '0.5.2',
default_options: ['c_std=c99',
'warning_level=2',
'buildtype=release',
'b_ndebug=if-release'],
meson_version: '>= 0.47.0')
dav1d_soname_version = '3.0.0'
dav1d_soname_version = '3.1.0'
dav1d_api_version_array = dav1d_soname_version.split('.')
dav1d_api_version_major = dav1d_api_version_array[0]
dav1d_api_version_minor = dav1d_api_version_array[1]
@ -98,6 +98,7 @@ if host_machine.system() == 'windows'
cdata.set('UNICODE', 1) # Define to 1 for Unicode (Wide Chars) APIs
cdata.set('_UNICODE', 1) # Define to 1 for Unicode (Wide Chars) APIs
cdata.set('__USE_MINGW_ANSI_STDIO', 1) # Define to force use of MinGW printf
cdata.set('_CRT_DECLARE_NONSTDC_NAMES', 1) # Define to get off_t from sys/types.h on MSVC
if cc.has_function('fseeko', prefix : '#include <stdio.h>', args : test_args)
cdata.set('_FILE_OFFSET_BITS', 64) # Not set by default by Meson on Windows
else
@ -112,11 +113,23 @@ if host_machine.system() == 'windows'
# On Windows, we use a compatibility layer to emulate pthread
thread_dependency = []
thread_compat_dep = declare_dependency(sources : files('src/win32/thread.c'))
rt_dependency = []
else
thread_dependency = dependency('threads')
thread_compat_dep = []
endif
rt_dependency = []
if cc.has_function('clock_gettime', prefix : '#include <time.h>', args : test_args)
cdata.set('HAVE_CLOCK_GETTIME', 1)
elif host_machine.system() != 'darwin'
rt_dependency = cc.find_library('rt', required: false)
if not cc.has_function('clock_gettime', prefix : '#include <time.h>', args : test_args, dependencies : rt_dependency)
error('clock_gettime not found')
endif
cdata.set('HAVE_CLOCK_GETTIME', 1)
endif
endif
# Header checks
@ -215,6 +228,12 @@ if (get_option('buildtype') != 'debug' and get_option('buildtype') != 'plain')
optional_arguments += '-ffast-math'
endif
if (host_machine.system() == 'darwin' and cc.get_id() == 'clang' and
cc.version().startswith('11'))
# Workaround for Xcode 11 -fstack-check bug, see #301
optional_arguments += '-fno-stack-check'
endif
add_project_arguments(cc.get_supported_arguments(optional_arguments), language : 'c')
# libFuzzer related things

View File

@ -6,7 +6,7 @@ summary: AV1 decoder from VideoLAN
description: |
A small and fast AV1 decoder from the people who brought you VLC.
grade: devel # must be 'stable' to release into candidate/stable channels
grade: stable
confinement: strict # use 'strict' once you have the right plugs and slots
apps:

825
third_party/dav1d/src/arm/32/ipred.S vendored Normal file
View File

@ -0,0 +1,825 @@
/*
* Copyright © 2018, VideoLAN and dav1d authors
* Copyright © 2019, Martin Storsjo
* Copyright © 2019, B Krishnan Iyer
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
* ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "src/arm/asm.S"
#include "util.S"
// void ipred_dc_128_neon(pixel *dst, const ptrdiff_t stride,
// const pixel *const topleft,
// const int width, const int height, const int a,
// const int max_width, const int max_height);
// DC_128 intra prediction: fill the whole width x height block with the
// constant 128 (the 8-bit mid-value), used when no neighbours contribute.
// r0 = dst, r1 = stride, r2 = topleft (unused), r3 = width,
// [sp, #8] = height; the trailing a/max_width/max_height args are unused.
function ipred_dc_128_neon, export=1
push {r4, lr}
ldr r4, [sp, #8]
// clz(width) - 25 maps width 64/32/16/8/4 onto jump-table entries 0..4.
clz r3, r3
adr r2, L(ipred_dc_128_tbl)
sub r3, r3, #25
ldr r3, [r2, r3, lsl #2]
mov lr, #128
vdup.8 q0, lr
add r2, r2, r3
// Two row pointers (r0, r12) one stride apart; r1 becomes 2*stride so
// each loop iteration below writes four rows.
add r12, r0, r1
lsl r1, r1, #1
bx r2
.align 2
L(ipred_dc_128_tbl):
.word 640f - L(ipred_dc_128_tbl) + CONFIG_THUMB
.word 320f - L(ipred_dc_128_tbl) + CONFIG_THUMB
.word 16f - L(ipred_dc_128_tbl) + CONFIG_THUMB
.word 8f - L(ipred_dc_128_tbl) + CONFIG_THUMB
.word 4f - L(ipred_dc_128_tbl) + CONFIG_THUMB
// Width-specialized store loops; each stores 4 rows per iteration.
4:
vst1.32 {d0[0]}, [r0, :32], r1
vst1.32 {d0[0]}, [r12, :32], r1
subs r4, r4, #4
vst1.32 {d0[0]}, [r0, :32], r1
vst1.32 {d0[0]}, [r12, :32], r1
bgt 4b
pop {r4, pc}
8:
vst1.8 {d0}, [r0, :64], r1
vst1.8 {d0}, [r12, :64], r1
subs r4, r4, #4
vst1.8 {d0}, [r0, :64], r1
vst1.8 {d0}, [r12, :64], r1
bgt 8b
pop {r4, pc}
16:
vst1.8 {d0, d1}, [r0, :128], r1
vst1.8 {d0, d1}, [r12, :128], r1
subs r4, r4, #4
vst1.8 {d0, d1}, [r0, :128], r1
vst1.8 {d0, d1}, [r12, :128], r1
bgt 16b
pop {r4, pc}
320:
// Widths >= 32 need more than one q register filled with the constant.
vdup.8 q1, lr
32:
vst1.8 {d0, d1, d2, d3}, [r0, :128], r1
vst1.8 {d0, d1, d2, d3}, [r12, :128], r1
subs r4, r4, #4
vst1.8 {d0, d1, d2, d3}, [r0, :128], r1
vst1.8 {d0, d1, d2, d3}, [r12, :128], r1
bgt 32b
pop {r4, pc}
640:
vdup.8 q1, lr
vdup.8 q2, lr
vdup.8 q3, lr
// Each row is written as two 32-byte halves; compensate the stride for
// the post-increment of the first half.
sub r1, r1, #32
64:
vst1.8 {d0, d1, d2, d3}, [r0, :128]!
vst1.8 {d0, d1, d2, d3}, [r12, :128]!
vst1.8 {d4, d5, d6, d7}, [r0, :128], r1
vst1.8 {d4, d5, d6, d7}, [r12, :128], r1
subs r4, r4, #4
vst1.8 {d0, d1, d2, d3}, [r0, :128]!
vst1.8 {d0, d1, d2, d3}, [r12, :128]!
vst1.8 {d4, d5, d6, d7}, [r0, :128], r1
vst1.8 {d4, d5, d6, d7}, [r12, :128], r1
bgt 64b
pop {r4, pc}
endfunc
// void ipred_v_neon(pixel *dst, const ptrdiff_t stride,
// const pixel *const topleft,
// const int width, const int height, const int a,
// const int max_width, const int max_height);
// Vertical intra prediction: replicate the row of pixels above the block
// (topleft + 1 .. topleft + width) into every output row.
// r0 = dst, r1 = stride, r2 = topleft, r3 = width, [sp, #8] = height.
function ipred_v_neon, export=1
push {r4, lr}
ldr lr, [sp, #8]
// clz(width) - 25 maps width 64/32/16/8/4 onto jump-table entries 0..4.
clz r3, r3
adr r4, L(ipred_v_tbl)
sub r3, r3, #25
ldr r3, [r4, r3, lsl #2]
// Skip the top-left corner pixel; the top row starts at topleft + 1.
add r2, r2, #1
add r4, r4, r3
// Two row pointers one stride apart; four rows stored per iteration.
add r12, r0, r1
lsl r1, r1, #1
bx r4
.align 2
L(ipred_v_tbl):
.word 640f - L(ipred_v_tbl) + CONFIG_THUMB
.word 320f - L(ipred_v_tbl) + CONFIG_THUMB
.word 160f - L(ipred_v_tbl) + CONFIG_THUMB
.word 80f - L(ipred_v_tbl) + CONFIG_THUMB
.word 40f - L(ipred_v_tbl) + CONFIG_THUMB
// Each width case: load the top row once, then store it down the block.
40:
vld1.32 {d0[0]}, [r2]
4:
vst1.32 {d0[0]}, [r0, :32], r1
vst1.32 {d0[0]}, [r12, :32], r1
subs lr, lr, #4
vst1.32 {d0[0]}, [r0, :32], r1
vst1.32 {d0[0]}, [r12, :32], r1
bgt 4b
pop {r4, pc}
80:
vld1.8 {d0}, [r2]
8:
vst1.8 {d0}, [r0, :64], r1
vst1.8 {d0}, [r12, :64], r1
subs lr, lr, #4
vst1.8 {d0}, [r0, :64], r1
vst1.8 {d0}, [r12, :64], r1
bgt 8b
pop {r4, pc}
160:
vld1.8 {q0}, [r2]
16:
vst1.8 {d0, d1}, [r0, :128], r1
vst1.8 {d0, d1}, [r12, :128], r1
subs lr, lr, #4
vst1.8 {d0, d1}, [r0, :128], r1
vst1.8 {d0, d1}, [r12, :128], r1
bgt 16b
pop {r4, pc}
320:
vld1.8 {q0, q1}, [r2]
32:
vst1.8 {d0, d1, d2, d3}, [r0, :128], r1
vst1.8 {d0, d1, d2, d3}, [r12, :128], r1
subs lr, lr, #4
vst1.8 {d0, d1, d2, d3}, [r0, :128], r1
vst1.8 {d0, d1, d2, d3}, [r12, :128], r1
bgt 32b
pop {r4, pc}
640:
// 64 top pixels live in q0-q3; rows are stored as two 32-byte halves,
// so back off the stride by the first half's post-increment.
vld1.8 {q0, q1}, [r2]!
sub r1, r1, #32
vld1.8 {q2, q3}, [r2]
64:
vst1.8 {d0, d1, d2, d3}, [r0, :128]!
vst1.8 {d0, d1, d2, d3}, [r12, :128]!
vst1.8 {d4, d5, d6, d7}, [r0, :128], r1
vst1.8 {d4, d5, d6, d7}, [r12, :128], r1
subs lr, lr, #4
vst1.8 {d0, d1, d2, d3}, [r0, :128]!
vst1.8 {d0, d1, d2, d3}, [r12, :128]!
vst1.8 {d4, d5, d6, d7}, [r0, :128], r1
vst1.8 {d4, d5, d6, d7}, [r12, :128], r1
bgt 64b
pop {r4, pc}
endfunc
// void ipred_h_neon(pixel *dst, const ptrdiff_t stride,
// const pixel *const topleft,
// const int width, const int height, const int a,
// const int max_width, const int max_height);
// Horizontal intra prediction: each output row is filled with the left
// neighbour pixel of that row (pixels below topleft, walked downwards,
// i.e. backwards in memory).
// r0 = dst, r1 = stride, r2 = topleft, r3 = width, [sp, #12] = height.
function ipred_h_neon, export=1
push {r4-r5, lr}
ldr r4, [sp, #12]
// clz(width) - 25 maps width 64/32/16/8/4 onto jump-table entries 0..4.
clz r3, r3
adr r5, L(ipred_h_tbl)
sub r3, r3, #25
ldr r3, [r5, r3, lsl #2]
// For the narrow cases, load 4 left pixels at a time with a replicating
// vld4, stepping r2 backwards by 4 each iteration (lr = -4).
sub r2, r2, #4
mov lr, #-4
add r5, r5, r3
add r12, r0, r1
lsl r1, r1, #1
bx r5
.align 2
L(ipred_h_tbl):
.word 640f - L(ipred_h_tbl) + CONFIG_THUMB
.word 320f - L(ipred_h_tbl) + CONFIG_THUMB
.word 160f - L(ipred_h_tbl) + CONFIG_THUMB
.word 8f - L(ipred_h_tbl) + CONFIG_THUMB
.word 4f - L(ipred_h_tbl) + CONFIG_THUMB
4:
// d0..d3 each hold one left pixel replicated; d3 is the topmost row's
// pixel (highest address), so rows are stored d3, d2, d1, d0.
vld4.8 {d0[], d1[], d2[], d3[]}, [r2], lr
vst1.32 {d3[0]}, [r0, :32], r1
vst1.32 {d2[0]}, [r12, :32], r1
subs r4, r4, #4
vst1.32 {d1[0]}, [r0, :32], r1
vst1.32 {d0[0]}, [r12, :32], r1
bgt 4b
pop {r4-r5, pc}
8:
vld4.8 {d0[], d1[], d2[], d3[]}, [r2], lr
vst1.8 {d3}, [r0, :64], r1
vst1.8 {d2}, [r12, :64], r1
subs r4, r4, #4
vst1.8 {d1}, [r0, :64], r1
vst1.8 {d0}, [r12, :64], r1
bgt 8b
pop {r4-r5, pc}
160:
// Wider cases reload one left pixel per row instead (r2 steps by -1);
// undo the earlier -4 bias so r2 points at the topmost left pixel.
add r2, r2, #3
mov lr, #-1
16:
vld1.8 {d0[], d1[]}, [r2], lr
subs r4, r4, #4
vld1.8 {d2[], d3[]}, [r2], lr
vst1.8 {q0}, [r0, :128], r1
vld1.8 {d4[], d5[]}, [r2], lr
vst1.8 {q1}, [r12, :128], r1
vld1.8 {d6[], d7[]}, [r2], lr
vst1.8 {q2}, [r0, :128], r1
vst1.8 {q3}, [r12, :128], r1
bgt 16b
pop {r4-r5, pc}
320:
add r2, r2, #3
mov lr, #-1
// Rows are written in 16-byte pieces; compensate the stride.
sub r1, r1, #16
32:
vld1.8 {d0[], d1[]}, [r2], lr
subs r4, r4, #4
vld1.8 {d2[], d3[]}, [r2], lr
vst1.8 {q0}, [r0, :128]!
vld1.8 {d4[], d5[]}, [r2], lr
vst1.8 {q1}, [r12, :128]!
vld1.8 {d6[], d7[]}, [r2], lr
vst1.8 {q0}, [r0, :128], r1
vst1.8 {q1}, [r12, :128], r1
vst1.8 {q2}, [r0, :128]!
vst1.8 {q3}, [r12, :128]!
vst1.8 {q2}, [r0, :128], r1
vst1.8 {q3}, [r12, :128], r1
bgt 32b
pop {r4-r5, pc}
640:
add r2, r2, #3
mov lr, #-1
sub r1, r1, #48
64:
vld1.8 {d0[], d1[]}, [r2], lr
subs r4, r4, #4
vld1.8 {d2[], d3[]}, [r2], lr
vst1.8 {q0}, [r0, :128]!
vld1.8 {d4[], d5[]}, [r2], lr
vst1.8 {q1}, [r12, :128]!
vld1.8 {d6[], d7[]}, [r2], lr
vst1.8 {q0}, [r0, :128]!
vst1.8 {q1}, [r12, :128]!
vst1.8 {q0}, [r0, :128]!
vst1.8 {q1}, [r12, :128]!
vst1.8 {q0}, [r0, :128], r1
vst1.8 {q1}, [r12, :128], r1
vst1.8 {q2}, [r0, :128]!
vst1.8 {q3}, [r12, :128]!
vst1.8 {q2}, [r0, :128]!
vst1.8 {q3}, [r12, :128]!
vst1.8 {q2}, [r0, :128]!
vst1.8 {q3}, [r12, :128]!
vst1.8 {q2}, [r0, :128], r1
vst1.8 {q3}, [r12, :128], r1
bgt 64b
pop {r4-r5, pc}
endfunc
// void ipred_dc_top_neon(pixel *dst, const ptrdiff_t stride,
// const pixel *const topleft,
// const int width, const int height, const int a,
// const int max_width, const int max_height);
// DC_TOP intra prediction: average the `width` pixels above the block
// (topleft + 1 ..), with rounding, and fill the block with that value.
// r0 = dst, r1 = stride, r2 = topleft, r3 = width, [sp, #12] = height.
function ipred_dc_top_neon, export=1
push {r4-r5, lr}
ldr r4, [sp, #12]
// clz(width) - 25 maps width 64/32/16/8/4 onto jump-table entries 0..4.
clz r3, r3
adr r5, L(ipred_dc_top_tbl)
sub r3, r3, #25
ldr r3, [r5, r3, lsl #2]
// Top row starts just past the top-left corner pixel.
add r2, r2, #1
add r5, r5, r3
add r12, r0, r1
lsl r1, r1, #1
bx r5
.align 2
L(ipred_dc_top_tbl):
.word 640f - L(ipred_dc_top_tbl) + CONFIG_THUMB
.word 320f - L(ipred_dc_top_tbl) + CONFIG_THUMB
.word 160f - L(ipred_dc_top_tbl) + CONFIG_THUMB
.word 80f - L(ipred_dc_top_tbl) + CONFIG_THUMB
.word 40f - L(ipred_dc_top_tbl) + CONFIG_THUMB
// Each case: horizontal pairwise sums reduce the row to a single u16,
// then a rounding shift by log2(width) yields the DC value.
40:
vld1.32 {d0[]}, [r2]
vpaddl.u8 d0, d0
vpadd.u16 d0, d0
vrshrn.u16 d0, q0, #2
vdup.8 d0, d0[0]
4:
vst1.32 {d0[0]}, [r0, :32], r1
vst1.32 {d0[0]}, [r12, :32], r1
subs r4, r4, #4
vst1.32 {d0[0]}, [r0, :32], r1
vst1.32 {d0[0]}, [r12, :32], r1
bgt 4b
pop {r4-r5, pc}
80:
vld1.8 {d0}, [r2]
vpaddl.u8 d0, d0
vpadd.u16 d0, d0
vpadd.u16 d0, d0
vrshrn.u16 d0, q0, #3
vdup.8 d0, d0[0]
8:
vst1.8 {d0}, [r0, :64], r1
vst1.8 {d0}, [r12, :64], r1
subs r4, r4, #4
vst1.8 {d0}, [r0, :64], r1
vst1.8 {d0}, [r12, :64], r1
bgt 8b
pop {r4-r5, pc}
160:
vld1.8 {d0, d1}, [r2]
vaddl.u8 q0, d0, d1
vadd.u16 d0, d0, d1
vpadd.u16 d0, d0
vpadd.u16 d0, d0
vrshrn.u16 d0, q0, #4
vdup.8 q0, d0[0]
16:
vst1.8 {d0, d1}, [r0, :128], r1
vst1.8 {d0, d1}, [r12, :128], r1
subs r4, r4, #4
vst1.8 {d0, d1}, [r0, :128], r1
vst1.8 {d0, d1}, [r12, :128], r1
bgt 16b
pop {r4-r5, pc}
320:
vld1.8 {d0, d1, d2, d3}, [r2]
vaddl.u8 q0, d0, d1
vaddl.u8 q1, d2, d3
vadd.u16 q0, q0, q1
vadd.u16 d0, d0, d1
vpadd.u16 d0, d0
vpadd.u16 d0, d0
vrshrn.u16 d4, q0, #5
vdup.8 q0, d4[0]
vdup.8 q1, d4[0]
32:
vst1.8 {d0, d1, d2, d3}, [r0, :128], r1
vst1.8 {d0, d1, d2, d3}, [r12, :128], r1
subs r4, r4, #4
vst1.8 {d0, d1, d2, d3}, [r0, :128], r1
vst1.8 {d0, d1, d2, d3}, [r12, :128], r1
bgt 32b
pop {r4-r5, pc}
640:
vld1.8 {d0, d1, d2, d3}, [r2]!
vaddl.u8 q0, d0, d1
vld1.8 {d4, d5, d6, d7}, [r2]
vaddl.u8 q1, d2, d3
vaddl.u8 q2, d4, d5
vaddl.u8 q3, d6, d7
vadd.u16 q0, q0, q1
vadd.u16 q1, q2, q3
vadd.u16 q0, q0, q1
vadd.u16 d0, d0, d1
vpadd.u16 d0, d0
vpadd.u16 d0, d0
vrshrn.u16 d18, q0, #6
vdup.8 q0, d18[0]
vdup.8 q1, d18[0]
vdup.8 q2, d18[0]
vdup.8 q3, d18[0]
// 64-wide rows are stored as two 32-byte halves; adjust the stride.
sub r1, r1, #32
64:
vst1.8 {d0, d1, d2, d3}, [r0, :128]!
vst1.8 {d0, d1, d2, d3}, [r12, :128]!
vst1.8 {d4, d5, d6, d7}, [r0, :128], r1
vst1.8 {d4, d5, d6, d7}, [r12, :128], r1
subs r4, r4, #4
vst1.8 {d0, d1, d2, d3}, [r0, :128]!
vst1.8 {d0, d1, d2, d3}, [r12, :128]!
vst1.8 {d4, d5, d6, d7}, [r0, :128], r1
vst1.8 {d4, d5, d6, d7}, [r12, :128], r1
bgt 64b
pop {r4-r5, pc}
endfunc
// void ipred_dc_left_neon(pixel *dst, const ptrdiff_t stride,
// const pixel *const topleft,
// const int width, const int height, const int a,
// const int max_width, const int max_height);
// DC_LEFT intra prediction: average the `height` pixels to the left of
// the block and fill the block with that value. The left neighbours are
// read from topleft - height .. topleft - 1 (see the `sub r2, r2, r4`).
// r0 = dst, r1 = stride, r2 = topleft, r3 = width, [sp, #12] = height.
function ipred_dc_left_neon, export=1
push {r4-r5, lr}
ldr r4, [sp, #12]
sub r2, r2, r4
// Two jump-table lookups: one indexed by height (sum/average code,
// entries 0..4) and one by width (store loop, entries 5..9). The
// height case computes the DC value, then `bx r3` into the width case.
clz r3, r3
clz lr, r4
sub lr, lr, #25
adr r5, L(ipred_dc_left_tbl)
sub r3, r3, #20
ldr r3, [r5, r3, lsl #2]
ldr lr, [r5, lr, lsl #2]
add r3, r5, r3
add r5, r5, lr
add r12, r0, r1
lsl r1, r1, #1
bx r5
.align 2
L(ipred_dc_left_tbl):
.word L(ipred_dc_left_h64) - L(ipred_dc_left_tbl) + CONFIG_THUMB
.word L(ipred_dc_left_h32) - L(ipred_dc_left_tbl) + CONFIG_THUMB
.word L(ipred_dc_left_h16) - L(ipred_dc_left_tbl) + CONFIG_THUMB
.word L(ipred_dc_left_h8) - L(ipred_dc_left_tbl) + CONFIG_THUMB
.word L(ipred_dc_left_h4) - L(ipred_dc_left_tbl) + CONFIG_THUMB
.word L(ipred_dc_left_w64) - L(ipred_dc_left_tbl) + CONFIG_THUMB
.word L(ipred_dc_left_w32) - L(ipred_dc_left_tbl) + CONFIG_THUMB
.word L(ipred_dc_left_w16) - L(ipred_dc_left_tbl) + CONFIG_THUMB
.word L(ipred_dc_left_w8) - L(ipred_dc_left_tbl) + CONFIG_THUMB
.word L(ipred_dc_left_w4) - L(ipred_dc_left_tbl) + CONFIG_THUMB
// hN: sum N left pixels, rounding-shift by log2(N), splat into q0.
L(ipred_dc_left_h4):
vld1.32 {d0[]}, [r2]
vpaddl.u8 d0, d0
vpadd.u16 d0, d0
vrshrn.u16 d0, q0, #2
vdup.8 q0, d0[0]
bx r3
L(ipred_dc_left_w4):
vst1.32 {d0[0]}, [r0, :32], r1
vst1.32 {d0[0]}, [r12, :32], r1
subs r4, r4, #4
vst1.32 {d0[0]}, [r0, :32], r1
vst1.32 {d0[0]}, [r12, :32], r1
bgt L(ipred_dc_left_w4)
pop {r4-r5, pc}
L(ipred_dc_left_h8):
vld1.8 {d0}, [r2]
vpaddl.u8 d0, d0
vpadd.u16 d0, d0
vpadd.u16 d0, d0
vrshrn.u16 d0, q0, #3
vdup.8 q0, d0[0]
bx r3
L(ipred_dc_left_w8):
vst1.8 {d0}, [r0, :64], r1
vst1.8 {d0}, [r12, :64], r1
subs r4, r4, #4
vst1.8 {d0}, [r0, :64], r1
vst1.8 {d0}, [r12, :64], r1
bgt L(ipred_dc_left_w8)
pop {r4-r5, pc}
L(ipred_dc_left_h16):
vld1.8 {d0, d1}, [r2]
vaddl.u8 q0, d0, d1
vadd.u16 d0, d0, d1
vpadd.u16 d0, d0
vpadd.u16 d0, d0
vrshrn.u16 d0, q0, #4
vdup.8 q0, d0[0]
bx r3
L(ipred_dc_left_w16):
vst1.8 {d0, d1}, [r0, :128], r1
vst1.8 {d0, d1}, [r12, :128], r1
subs r4, r4, #4
vst1.8 {d0, d1}, [r0, :128], r1
vst1.8 {d0, d1}, [r12, :128], r1
bgt L(ipred_dc_left_w16)
pop {r4-r5, pc}
L(ipred_dc_left_h32):
vld1.8 {d0, d1, d2, d3}, [r2]
vaddl.u8 q0, d0, d1
vaddl.u8 q1, d2, d3
vadd.u16 q0, q0, q1
vadd.u16 d0, d0, d1
vpadd.u16 d0, d0
vpadd.u16 d0, d0
vrshrn.u16 d0, q0, #5
vdup.8 q0, d0[0]
bx r3
L(ipred_dc_left_w32):
// Widths >= 32 need the DC value in more than one q register.
vmov.8 q1, q0
1:
vst1.8 {d0, d1, d2, d3}, [r0, :128], r1
vst1.8 {d0, d1, d2, d3}, [r12, :128], r1
subs r4, r4, #4
vst1.8 {d0, d1, d2, d3}, [r0, :128], r1
vst1.8 {d0, d1, d2, d3}, [r12, :128], r1
bgt 1b
pop {r4-r5, pc}
L(ipred_dc_left_h64):
vld1.8 {d0, d1, d2, d3}, [r2]!
vld1.8 {d4, d5, d6, d7}, [r2]
vaddl.u8 q0, d0, d1
vaddl.u8 q1, d2, d3
vaddl.u8 q2, d4, d5
vaddl.u8 q3, d6, d7
vadd.u16 q0, q0, q1
vadd.u16 q1, q2, q3
vadd.u16 q0, q0, q1
vadd.u16 d0, d0, d1
vpadd.u16 d0, d0
vpadd.u16 d0, d0
vrshrn.u16 d0, q0, #6
vdup.8 q0, d0[0]
bx r3
L(ipred_dc_left_w64):
// 64-wide rows are stored as two 32-byte halves; adjust the stride.
sub r1, r1, #32
vmov.8 q1, q0
vmov.8 q2, q0
vmov.8 q3, q0
1:
vst1.8 {d0, d1, d2, d3}, [r0, :128]!
vst1.8 {d0, d1, d2, d3}, [r12, :128]!
vst1.8 {d4, d5, d6, d7}, [r0, :128], r1
vst1.8 {d4, d5, d6, d7}, [r12, :128], r1
subs r4, r4, #4
vst1.8 {d0, d1, d2, d3}, [r0, :128]!
vst1.8 {d0, d1, d2, d3}, [r12, :128]!
vst1.8 {d4, d5, d6, d7}, [r0, :128], r1
vst1.8 {d4, d5, d6, d7}, [r12, :128], r1
bgt 1b
pop {r4-r5, pc}
endfunc
// void ipred_dc_neon(pixel *dst, const ptrdiff_t stride,
// const pixel *const topleft,
// const int width, const int height, const int a,
// const int max_width, const int max_height);
// DC intra prediction: average the `height` left neighbours plus the
// `width` top neighbours, rounding: (sum + (w+h)/2) / (w+h). When w+h is
// not a power of two the division is done as a shift by ctz(w+h) followed
// by a fixed-point multiply by ~1/3 (0x5556 = ~2^16/3) or ~1/5
// (0x3334 = ~2^16/5) via vqdmulh.
// r0 = dst, r1 = stride, r2 = topleft, r3 = width, [sp, #16] = height.
// Like dc_left, two jump-table lookups: hN sums the left column, then
// branches (bx r3) to wN which adds the top row, scales and stores.
function ipred_dc_neon, export=1
push {r4-r6, lr}
ldr r4, [sp, #16]
sub r2, r2, r4
add lr, r3, r4 // width + height
clz r3, r3
clz r12, r4
vdup.16 q15, lr // width + height
mov r6, #0
adr r5, L(ipred_dc_tbl)
rbit lr, lr // rbit(width + height)
sub r3, r3, #20 // 25 leading bits, minus table offset 5
sub r12, r12, #25
clz lr, lr // ctz(width + height)
ldr r3, [r5, r3, lsl #2]
ldr r12, [r5, r12, lsl #2]
neg lr, lr // -ctz(width + height)
add r3, r5, r3
add r5, r5, r12
vshr.u16 q15, q15, #1 // (width + height) >> 1
vdup.16 q14, lr // -ctz(width + height)
add r12, r0, r1
lsl r1, r1, #1
bx r5
.align 2
L(ipred_dc_tbl):
.word L(ipred_dc_h64) - L(ipred_dc_tbl) + CONFIG_THUMB
.word L(ipred_dc_h32) - L(ipred_dc_tbl) + CONFIG_THUMB
.word L(ipred_dc_h16) - L(ipred_dc_tbl) + CONFIG_THUMB
.word L(ipred_dc_h8) - L(ipred_dc_tbl) + CONFIG_THUMB
.word L(ipred_dc_h4) - L(ipred_dc_tbl) + CONFIG_THUMB
.word L(ipred_dc_w64) - L(ipred_dc_tbl) + CONFIG_THUMB
.word L(ipred_dc_w32) - L(ipred_dc_tbl) + CONFIG_THUMB
.word L(ipred_dc_w16) - L(ipred_dc_tbl) + CONFIG_THUMB
.word L(ipred_dc_w8) - L(ipred_dc_tbl) + CONFIG_THUMB
.word L(ipred_dc_w4) - L(ipred_dc_tbl) + CONFIG_THUMB
L(ipred_dc_h4):
// Sum the 4 left pixels into d0[0].
vld1.32 {d0[0]}, [r2]!
vpaddl.u8 d0, d0
vpadd.u16 d0, d0
bx r3
L(ipred_dc_w4):
add r2, r2, #1
vld1.32 {d1[0]}, [r2]
// Clear the upper lane so the pairwise adds below only see the 4
// loaded top pixels (r6 is 0).
vmov.32 d1[1], r6
vadd.s16 d0, d0, d30
vpaddl.u8 d1, d1
vpadd.u16 d1, d1
vpadd.u16 d1, d1
cmp r4, #4
vadd.s16 d0, d0, d1
vshl.u16 d0, d0, d28
// h == 4: w+h is a power of two, the shift alone divided correctly.
beq 1f // h = 8/16
// h == 16 -> w+h = 20, multiply by ~1/5; h == 8 -> w+h = 12, by ~1/3.
movw lr, #(0x3334/2)
movw r5, #(0x5556/2)
cmp r4, #16
it ne
movne lr, r5
vdup.16 d30, lr
vqdmulh.s16 d0, d0, d30
1:
vdup.8 d0, d0[0]
2:
vst1.32 {d0[0]}, [r0, :32], r1
vst1.32 {d0[0]}, [r12, :32], r1
subs r4, r4, #4
vst1.32 {d0[0]}, [r0, :32], r1
vst1.32 {d0[0]}, [r12, :32], r1
bgt 2b
pop {r4-r6, pc}
L(ipred_dc_h8):
vld1.8 {d0}, [r2]!
vpaddl.u8 d0, d0
vpadd.u16 d0, d0
vpadd.u16 d0, d0
bx r3
L(ipred_dc_w8):
add r2, r2, #1
vld1.8 {d2}, [r2]
vadd.s16 d0, d0, d30
vpaddl.u8 d2, d2
vpadd.u16 d2, d2
vpadd.u16 d2, d2
cmp r4, #8
vadd.s16 d0, d0, d2
vshl.u16 d0, d0, d28
beq 1f // h = 4/16/32
// h == 32 -> w+h = 40, multiply by ~1/5; h = 4/16 -> 12/24, by ~1/3.
cmp r4, #32
movw lr, #(0x3334/2)
movw r5, #(0x5556/2)
it ne
movne lr, r5
vdup.16 q12, lr
vqdmulh.s16 d0, d0, d24
1:
vdup.8 d0, d0[0]
2:
vst1.8 {d0}, [r0, :64], r1
vst1.8 {d0}, [r12, :64], r1
subs r4, r4, #4
vst1.8 {d0}, [r0, :64], r1
vst1.8 {d0}, [r12, :64], r1
bgt 2b
pop {r4-r6, pc}
L(ipred_dc_h16):
vld1.8 {d0, d1}, [r2]!
vaddl.u8 q0, d0, d1
vadd.u16 d0, d0, d1
vpadd.u16 d0, d0
vpadd.u16 d0, d0
bx r3
L(ipred_dc_w16):
add r2, r2, #1
vld1.8 {d2, d3}, [r2]
vadd.s16 d0, d0, d30
vaddl.u8 q1, d2, d3
vadd.u16 d2, d2, d3
vpadd.u16 d2, d2
vpadd.u16 d2, d2
cmp r4, #16
vadd.s16 d0, d0, d2
vshl.u16 d0, d0, d28
beq 1f // h = 4/8/32/64
// h = 8/32 -> w+h = 24/48, multiply by ~1/3; h = 4/64 -> 20/80, ~1/5.
tst r4, #(32+16+8) // 16 added to make a consecutive bitmask
movw lr, #(0x3334/2)
movw r5, #(0x5556/2)
it ne
movne lr, r5
vdup.16 q12, lr
vqdmulh.s16 d0, d0, d24
1:
vdup.8 q0, d0[0]
2:
vst1.8 {d0, d1}, [r0, :128], r1
vst1.8 {d0, d1}, [r12, :128], r1
subs r4, r4, #4
vst1.8 {d0, d1}, [r0, :128], r1
vst1.8 {d0, d1}, [r12, :128], r1
bgt 2b
pop {r4-r6, pc}
L(ipred_dc_h32):
vld1.8 {d0, d1, d2, d3}, [r2]!
vaddl.u8 q0, d0, d1
vaddl.u8 q1, d2, d3
vadd.u16 q0, q0, q1
vadd.u16 d0, d0, d1
vpadd.u16 d0, d0
vpadd.u16 d0, d0
bx r3
L(ipred_dc_w32):
add r2, r2, #1
vld1.8 {d2, d3, d4, d5}, [r2]
vadd.s16 d0, d0, d30
vaddl.u8 q2, d4, d5
vadd.u16 d4, d4, d5
vaddl.u8 q1, d2, d3
vadd.u16 d2, d2, d3
vpadd.u16 d4, d4
vpadd.u16 d2, d2
vpadd.u16 d4, d4
vpadd.u16 d2, d2
cmp r4, #32
vadd.s16 d0, d0, d4
vadd.s16 d0, d0, d2
vshl.u16 d4, d0, d28
beq 1f // h = 8/16/64
// h == 8 -> w+h = 40, multiply by ~1/5; h = 16/64 -> 48/96, by ~1/3.
cmp r4, #8
movw lr, #(0x3334/2)
movw r5, #(0x5556/2)
it ne
movne lr, r5
vdup.16 q12, lr
vqdmulh.s16 d4, d4, d24
1:
vdup.8 q0, d4[0]
vdup.8 q1, d4[0]
2:
vst1.8 {d0, d1, d2, d3}, [r0, :128], r1
vst1.8 {d0, d1, d2, d3}, [r12, :128], r1
subs r4, r4, #4
vst1.8 {d0, d1, d2, d3}, [r0, :128], r1
vst1.8 {d0, d1, d2, d3}, [r12, :128], r1
bgt 2b
pop {r4-r6, pc}
L(ipred_dc_h64):
vld1.8 {d0, d1, d2, d3}, [r2]!
vaddl.u8 q0, d0, d1
vld1.8 {d4, d5, d6, d7}, [r2]!
vaddl.u8 q1, d2, d3
vaddl.u8 q2, d4, d5
vaddl.u8 q3, d6, d7
vadd.u16 q0, q0, q1
vadd.u16 q1, q2, q3
vadd.u16 q0, q0, q1
vadd.u16 d0, d0, d1
vpadd.u16 d0, d0
vpadd.u16 d0, d0
bx r3
L(ipred_dc_w64):
vmov.8 q1, q0
vmov.8 q2, q0
vmov.8 q3, q0
2:
add r2, r2, #1
// Sum all 64 top pixels (two 32-byte loads) into d2/d3.
vld1.8 {d2, d3, d4, d5}, [r2]!
vadd.s16 d0, d0, d30
vaddl.u8 q2, d4, d5
vaddl.u8 q1, d2, d3
vadd.u16 d4, d4, d5
vadd.u16 d2, d2, d3
vld1.8 {d16, d17, d18, d19}, [r2]
vpadd.u16 d4, d4
vpadd.u16 d2, d2
vpadd.u16 d4, d4
vpadd.u16 d2, d2
vaddl.u8 q8, d16, d17
vaddl.u8 q9, d18, d19
vadd.u16 d16, d16, d17
vadd.u16 d18, d18, d19
vpadd.u16 d16, d16
vpadd.u16 d18, d18
vpadd.u16 d16, d16
vpadd.u16 d18, d18
vadd.u16 d2, d2, d4
vadd.u16 d3, d16, d18
cmp r4, #64
vadd.s16 d0, d0, d2
vadd.s16 d0, d0, d3
vshl.u16 d18, d0, d28
beq 1f // h = 16/32
// Select the correction factor from one register: low half holds the
// ~1/3 constant (h == 32, w+h = 96), high half the ~1/5 constant
// (h == 16, w+h = 80); h & 31 is 0 or 16, picking the half via lsr.
movw lr, #(0x5556/2)
movt lr, #(0x3334/2)
mov r5, r4
and r5, r5, #31
lsr lr, lr, r5
vdup.16 d30, lr
vqdmulh.s16 d18, d18, d30
1:
sub r1, r1, #32
vdup.8 q0, d18[0]
vdup.8 q1, d18[0]
vdup.8 q2, d18[0]
vdup.8 q3, d18[0]
2:
vst1.8 {d0, d1, d2, d3}, [r0, :128]!
vst1.8 {d0, d1, d2, d3}, [r12, :128]!
vst1.8 {d4, d5, d6, d7}, [r0, :128], r1
vst1.8 {d4, d5, d6, d7}, [r12, :128], r1
subs r4, r4, #4
vst1.8 {d0, d1, d2, d3}, [r0, :128]!
vst1.8 {d0, d1, d2, d3}, [r12, :128]!
vst1.8 {d4, d5, d6, d7}, [r0, :128], r1
vst1.8 {d4, d5, d6, d7}, [r12, :128], r1
bgt 2b
pop {r4-r6, pc}
endfunc

View File

@ -0,0 +1,868 @@
/*
* Copyright © 2018, VideoLAN and dav1d authors
* Copyright © 2019, Martin Storsjo
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
* ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "src/arm/asm.S"
#include "util.S"
// Generates the shared 8-pixel loop-filter core lpf_8_wd{4,6,8,16}_neon.
// Pixel registers on entry (set up by the callers): d16..d30 hold
// p6..p0, q0..q6 for wd == 16; narrower filters use the inner subset
// (d20 = p3 .. d27 = q3 for wd == 8, d21..d26 for wd == 6,
// d22 = p1 .. d25 = q1 for wd == 4). d10 = E, d11 = I, d12 = H
// thresholds; d13/d14/d15 = lane masks for wd >= 4 / > 4 / == 16.
// Exits: bx lr (full write-back), bx r8 (inner 6 px), bx r9 (inner
// 4 px), bx r12 (nothing to write).
.macro loop_filter wd
function lpf_8_wd\wd\()_neon
// Filter-strength decision: compare neighbour deltas against E/I.
vabd.u8 d0, d22, d23 // abs(p1 - p0)
vabd.u8 d1, d25, d24 // abs(q1 - q0)
vabd.u8 d2, d23, d24 // abs(p0 - q0)
vabd.u8 d3, d22, d25 // abs(p1 - q1)
.if \wd >= 6
vabd.u8 d4, d21, d22 // abs(p2 - p1)
vabd.u8 d5, d26, d25 // abs(q2 - q1)
.endif
.if \wd >= 8
vabd.u8 d6, d20, d21 // abs(p3 - p2)
vabd.u8 d7, d27, d26 // abs(q3 - q2)
.endif
.if \wd >= 6
vmax.u8 d4, d4, d5
.endif
vqadd.u8 d2, d2, d2 // abs(p0 - q0) * 2
.if \wd >= 8
vmax.u8 d6, d6, d7
.endif
vshr.u8 d3, d3, #1
.if \wd >= 8
vmax.u8 d4, d4, d6
.endif
.if \wd >= 6
vand d4, d4, d14
.endif
vmax.u8 d0, d0, d1 // max(abs(p1 - p0), abs(q1 - q0))
vqadd.u8 d2, d2, d3 // abs(p0 - q0) * 2 + abs(p1 - q1) >> 1
.if \wd >= 6
vmax.u8 d4, d0, d4
vcge.u8 d1, d11, d4 // max(abs(p1 - p0), abs(q1 - q0), abs(), abs(), ...) <= I
.else
vcge.u8 d1, d11, d0 // max(abs(p1 - p0), abs(q1 - q0)) <= I
.endif
vcge.u8 d2, d10, d2 // abs(p0 - q0) * 2 + abs(p1 - q1) >> 1 <= E
vand d1, d1, d2 // fm
vand d1, d1, d13 // fm && wd >= 4
.if \wd >= 6
vand d14, d14, d1 // fm && wd > 4
.endif
.if \wd >= 16
vand d15, d15, d1 // fm && wd == 16
.endif
vmov r10, r11, d1
orrs r10, r10, r11
beq 9f // if (!fm || wd < 4) return;
.if \wd >= 6
// Flatness check: all inner deltas <= 1 selects the flat filter.
vmov.i8 d10, #1
vabd.u8 d2, d21, d23 // abs(p2 - p0)
vabd.u8 d3, d22, d23 // abs(p1 - p0)
vabd.u8 d4, d25, d24 // abs(q1 - q0)
vabd.u8 d5, d26, d24 // abs(q2 - q0)
.if \wd >= 8
vabd.u8 d6, d20, d23 // abs(p3 - p0)
vabd.u8 d7, d27, d24 // abs(q3 - q0)
.endif
vmax.u8 d2, d2, d3
vmax.u8 d4, d4, d5
.if \wd >= 8
vmax.u8 d6, d6, d7
.endif
vmax.u8 d2, d2, d4
.if \wd >= 8
vmax.u8 d2, d2, d6
.endif
.if \wd == 16
vabd.u8 d3, d17, d23 // abs(p6 - p0)
vabd.u8 d4, d18, d23 // abs(p5 - p0)
vabd.u8 d5, d19, d23 // abs(p4 - p0)
.endif
vcge.u8 d2, d10, d2 // flat8in
.if \wd == 16
vabd.u8 d6, d28, d24 // abs(q4 - q0)
vabd.u8 d7, d29, d24 // abs(q5 - q0)
vabd.u8 d8, d30, d24 // abs(q6 - q0)
.endif
vand d14, d2, d14 // flat8in && fm && wd > 4
vbic d1, d1, d14 // fm && wd >= 4 && !flat8in
.if \wd == 16
vmax.u8 d3, d3, d4
vmax.u8 d5, d5, d6
.endif
vmov r10, r11, d1
.if \wd == 16
vmax.u8 d7, d7, d8
vmax.u8 d3, d3, d5
vmax.u8 d3, d3, d7
vcge.u8 d3, d10, d3 // flat8out
.endif
orrs r10, r10, r11
.if \wd == 16
vand d15, d15, d3 // flat8out && fm && wd == 16
vand d15, d15, d14 // flat8out && flat8in && fm && wd == 16
vbic d14, d14, d15 // flat8in && fm && wd >= 4 && !flat8out
.endif
beq 1f // skip wd == 4 case
.endif
// Narrow (wd == 4) filter: clipped delta applied to p1/p0/q0/q1.
vsubl.u8 q1, d22, d25 // p1 - q1
vcgt.u8 d0, d0, d12 // hev
vqmovn.s16 d2, q1
vand d4, d2, d0 // if (hev) iclip_diff(p1 - q1)
vbic d0, d1, d0 // (fm && wd >= 4 && !hev)
vsubl.u8 q1, d24, d23
vmov.i16 q3, #3
vmul.i16 q1, q1, q3
vmov.i8 d6, #4
vaddw.s8 q1, q1, d4
vmov.i8 d7, #3
vqmovn.s16 d2, q1 // f
vqadd.s8 d4, d6, d2 // imin(f + 4, 128)
vqadd.s8 d5, d7, d2 // imin(f + 3, 128)
vshr.s8 d4, d4, #3 // f1
vshr.s8 d5, d5, #3 // f2
vmovl.u8 q1, d23 // p0
vmovl.u8 q3, d24 // q0
vaddw.s8 q1, q1, d5
vsubw.s8 q3, q3, d4
vrshr.s8 d4, d4, #1 // (f1 + 1) >> 1
vqmovun.s16 d2, q1 // out p0
vqmovun.s16 d6, q3 // out q0
vbit d23, d2, d1 // if (fm && wd >= 4)
vmovl.u8 q1, d22 // p1
vbit d24, d6, d1 // if (fm && wd >= 4)
vmovl.u8 q3, d25 // q1
vaddw.s8 q1, q1, d4
vsubw.s8 q3, q3, d4
vqmovun.s16 d2, q1 // out p1
vqmovun.s16 d6, q3 // out q1
vbit d22, d2, d0 // if (fm && wd >= 4 && !hev)
vbit d25, d6, d0 // if (fm && wd >= 4 && !hev)
1:
.if \wd == 6
// Flat 6-tap filter: running sums, updating p1/p0/q0/q1.
vmov r10, r11, d14
orrs r10, r10, r11
beq 2f // skip if there's no flat8in
vaddl.u8 q0, d21, d21 // p2 * 2
vaddl.u8 q1, d21, d22 // p2 + p1
vaddl.u8 q2, d22, d23 // p1 + p0
vaddl.u8 q3, d23, d24 // p0 + q0
vadd.i16 q4, q0, q1
vadd.i16 q5, q2, q3
vaddl.u8 q6, d24, d25 // q0 + q1
vadd.i16 q4, q4, q5
vsub.i16 q6, q6, q0
vaddl.u8 q5, d25, d26 // q1 + q2
vrshrn.i16 d0, q4, #3 // out p1
vadd.i16 q4, q4, q6
vsub.i16 q5, q5, q1
vaddl.u8 q6, d26, d26 // q2 + q2
vrshrn.i16 d1, q4, #3 // out p0
vadd.i16 q4, q4, q5
vsub.i16 q6, q6, q2
vrshrn.i16 d2, q4, #3 // out q0
vbit d22, d0, d14 // p1 if (flat8in)
vadd.i16 q4, q4, q6
vbit d23, d1, d14 // p0 if (flat8in)
vrshrn.i16 d3, q4, #3 // out q1
vbit d24, d2, d14 // q0 if (flat8in)
vbit d25, d3, d14 // q1 if (flat8in)
.elseif \wd >= 8
// Flat 8-tap filter: sliding-window sums, updating p2..q2.
vmov r10, r11, d14
orrs r10, r10, r11
.if \wd == 8
beq 8f // skip if there's no flat8in
.else
beq 2f // skip if there's no flat8in
.endif
vaddl.u8 q0, d20, d21 // p3 + p2
vaddl.u8 q1, d22, d25 // p1 + q1
vaddl.u8 q2, d20, d22 // p3 + p1
vaddl.u8 q3, d23, d26 // p0 + q2
vadd.i16 q4, q0, q0 // 2 * (p3 + p2)
vaddw.u8 q4, q4, d23 // + p0
vaddw.u8 q4, q4, d24 // + q0
vadd.i16 q4, q4, q2 // + p3 + p1
vsub.i16 q1, q1, q0 // p1 + q1 - p3 - p2
vsub.i16 q3, q3, q2 // p0 + q2 - p3 - p1
vrshrn.i16 d10, q4, #3 // out p2
vadd.i16 q4, q4, q1
vaddl.u8 q0, d20, d23 // p3 + p0
vaddl.u8 q1, d24, d27 // q0 + q3
vrshrn.i16 d11, q4, #3 // out p1
vadd.i16 q4, q4, q3
vsub.i16 q1, q1, q0 // q0 + q3 - p3 - p0
vaddl.u8 q2, d21, d24 // p2 + q0
vaddl.u8 q3, d25, d27 // q1 + q3
vrshrn.i16 d12, q4, #3 // out p0
vadd.i16 q4, q4, q1
vsub.i16 q3, q3, q2 // q1 + q3 - p2 - q0
vaddl.u8 q0, d22, d25 // p1 + q1
vaddl.u8 q1, d26, d27 // q2 + q3
vrshrn.i16 d13, q4, #3 // out q0
vadd.i16 q4, q4, q3
vsub.i16 q1, q1, q0 // q2 + q3 - p1 - q1
vrshrn.i16 d0, q4, #3 // out q1
vadd.i16 q4, q4, q1
vbit d21, d10, d14
vbit d22, d11, d14
vbit d23, d12, d14
vrshrn.i16 d1, q4, #3 // out q2
vbit d24, d13, d14
vbit d25, d0, d14
vbit d26, d1, d14
.endif
2:
.if \wd == 16
// Wide (16) filter: 14-tap smoothing over p6..q6 where flat8out holds.
vmov r10, r11, d15
orrs r10, r10, r11
bne 1f // check if flat8out is needed
vmov r10, r11, d14
orrs r10, r10, r11
beq 8f // if there was no flat8in, just write the inner 4 pixels
b 7f // if flat8in was used, write the inner 6 pixels
1:
vaddl.u8 q1, d17, d17 // p6 + p6
vaddl.u8 q2, d17, d18 // p6 + p5
vaddl.u8 q3, d17, d19 // p6 + p4
vaddl.u8 q4, d17, d20 // p6 + p3
vadd.i16 q6, q1, q2
vadd.i16 q5, q3, q4
vaddl.u8 q3, d17, d21 // p6 + p2
vadd.i16 q6, q6, q5
vaddl.u8 q4, d17, d22 // p6 + p1
vaddl.u8 q5, d18, d23 // p5 + p0
vadd.i16 q3, q3, q4
vaddl.u8 q4, d19, d24 // p4 + q0
vadd.i16 q6, q6, q3
vadd.i16 q5, q5, q4
vaddl.u8 q3, d20, d25 // p3 + q1
vadd.i16 q6, q6, q5
vsub.i16 q3, q3, q1
vaddl.u8 q1, d21, d26 // p2 + q2
vrshrn.i16 d0, q6, #4 // out p5
vadd.i16 q6, q6, q3 // - (p6 + p6) + (p3 + q1)
vsub.i16 q1, q1, q2
vaddl.u8 q2, d22, d27 // p1 + q3
vaddl.u8 q3, d17, d19 // p6 + p4
vrshrn.i16 d1, q6, #4 // out p4
vadd.i16 q6, q6, q1 // - (p6 + p5) + (p2 + q2)
vsub.i16 q2, q2, q3
vaddl.u8 q3, d23, d28 // p0 + q4
vaddl.u8 q4, d17, d20 // p6 + p3
vrshrn.i16 d2, q6, #4 // out p3
vadd.i16 q6, q6, q2 // - (p6 + p4) + (p1 + q3)
vsub.i16 q3, q3, q4
vaddl.u8 q4, d24, d29 // q0 + q5
vaddl.u8 q2, d17, d21 // p6 + p2
vrshrn.i16 d3, q6, #4 // out p2
vadd.i16 q6, q6, q3 // - (p6 + p3) + (p0 + q4)
vsub.i16 q4, q4, q2
vaddl.u8 q3, d25, d30 // q1 + q6
vaddl.u8 q5, d17, d22 // p6 + p1
vrshrn.i16 d4, q6, #4 // out p1
vadd.i16 q6, q6, q4 // - (p6 + p2) + (q0 + q5)
vsub.i16 q3, q3, q5
vaddl.u8 q4, d26, d30 // q2 + q6
vbif d0, d18, d15 // out p5
vaddl.u8 q5, d18, d23 // p5 + p0
vrshrn.i16 d5, q6, #4 // out p0
vadd.i16 q6, q6, q3 // - (p6 + p1) + (q1 + q6)
vsub.i16 q4, q4, q5
vaddl.u8 q5, d27, d30 // q3 + q6
vbif d1, d19, d15 // out p4
vaddl.u8 q9, d19, d24 // p4 + q0
vrshrn.i16 d6, q6, #4 // out q0
vadd.i16 q6, q6, q4 // - (p5 + p0) + (q2 + q6)
vsub.i16 q5, q5, q9
vaddl.u8 q4, d28, d30 // q4 + q6
vbif d2, d20, d15 // out p3
vaddl.u8 q9, d20, d25 // p3 + q1
vrshrn.i16 d7, q6, #4 // out q1
vadd.i16 q6, q6, q5 // - (p4 + q0) + (q3 + q6)
vsub.i16 q9, q4, q9
vaddl.u8 q5, d29, d30 // q5 + q6
vbif d3, d21, d15 // out p2
vaddl.u8 q10, d21, d26 // p2 + q2
vrshrn.i16 d8, q6, #4 // out q2
vadd.i16 q6, q6, q9 // - (p3 + q1) + (q4 + q6)
vsub.i16 q5, q5, q10
vaddl.u8 q9, d30, d30 // q6 + q6
vbif d4, d22, d15 // out p1
vaddl.u8 q10, d22, d27 // p1 + q3
vrshrn.i16 d9, q6, #4 // out q3
vadd.i16 q6, q6, q5 // - (p2 + q2) + (q5 + q6)
vsub.i16 q9, q9, q10
vbif d5, d23, d15 // out p0
vrshrn.i16 d10, q6, #4 // out q4
vadd.i16 q6, q6, q9 // - (p1 + q3) + (q6 + q6)
vrshrn.i16 d11, q6, #4 // out q5
vbif d6, d24, d15 // out q0
vbif d7, d25, d15 // out q1
vbif d8, d26, d15 // out q2
vbif d9, d27, d15 // out q3
vbif d10, d28, d15 // out q4
vbif d11, d29, d15 // out q5
.endif
bx lr
.if \wd == 16
7:
// Return to a shorter epilogue, writing only the inner 6 pixels
bx r8
.endif
.if \wd >= 8
8:
// Return to a shorter epilogue, writing only the inner 4 pixels
bx r9
.endif
9:
// Return directly without writing back any pixels
bx r12
endfunc
.endm
// Instantiate the shared 8-pixel loop filter core (lpf_8_wd\wd\()_neon)
// once for each supported filter width.
loop_filter 16
loop_filter 8
loop_filter 6
loop_filter 4
// Invoke the wd=16 filter core. r8/r9 are loaded with the addresses of
// the caller's local 7:/8: labels so the core can return into a shorter
// epilogue that writes back fewer pixels when only a narrower filter
// applies (adding CONFIG_THUMB presumably sets the Thumb interworking
// bit on the address when assembled as Thumb code — see util header).
.macro lpf_8_wd16
adr r8, 7f + CONFIG_THUMB
adr r9, 8f + CONFIG_THUMB
bl lpf_8_wd16_neon
.endm
// Invoke the wd=8 filter core; r9 holds the address of the caller's
// local 8: label for the shorter (inner 4 pixels only) write-back path.
.macro lpf_8_wd8
adr r9, 8f + CONFIG_THUMB
bl lpf_8_wd8_neon
.endm
// Invoke the wd=6 filter core (no alternate epilogue address needed).
.macro lpf_8_wd6
bl lpf_8_wd6_neon
.endm
// Invoke the wd=4 filter core (no alternate epilogue address needed).
.macro lpf_8_wd4
bl lpf_8_wd4_neon
.endm
// Vertical (dir=v) wd=4 loop filter on an 8-pixel-wide edge.
// r0 = pointer to the first row below the edge (the q0 row), r1 = stride.
// Loads p1..q1, runs the shared wd=4 core, stores the filtered rows back
// and restores r0 to its entry value. lr is saved in r12 because the
// lpf_8_wd4 macro expands to a bl.
function lpf_v_4_8_neon
mov r12, lr
sub r10, r0, r1, lsl #1
vld1.8 {d22}, [r10, :64], r1 // p1
vld1.8 {d24}, [r0, :64], r1 // q0
vld1.8 {d23}, [r10, :64], r1 // p0
vld1.8 {d25}, [r0, :64], r1 // q1
sub r0, r0, r1, lsl #1
lpf_8_wd4
sub r10, r0, r1, lsl #1
vst1.8 {d22}, [r10, :64], r1 // p1
vst1.8 {d24}, [r0, :64], r1 // q0
vst1.8 {d23}, [r10, :64], r1 // p0
vst1.8 {d25}, [r0, :64], r1 // q1
sub r0, r0, r1, lsl #1
bx r12
endfunc
// Horizontal (dir=h) wd=4 loop filter on 8 rows across a vertical edge.
// r0 = pointer to the edge column, r1 = stride. Loads 4 bytes per row
// (2 on each side of the edge), transposes into the register layout the
// shared core expects, filters, transposes back and stores. On return
// r0 has advanced by 8 rows (the lpf_func caller relies on this for
// dir=h).
function lpf_h_4_8_neon
mov r12, lr
sub r10, r0, #2
add r0, r10, r1, lsl #2
vld1.32 {d22[0]}, [r10], r1
vld1.32 {d22[1]}, [r0], r1
vld1.32 {d23[0]}, [r10], r1
vld1.32 {d23[1]}, [r0], r1
vld1.32 {d24[0]}, [r10], r1
vld1.32 {d24[1]}, [r0], r1
vld1.32 {d25[0]}, [r10], r1
vld1.32 {d25[1]}, [r0], r1
add r0, r0, #2
transpose_4x8b q11, q12, d22, d23, d24, d25
lpf_8_wd4
sub r10, r0, r1, lsl #3
sub r10, r10, #2
transpose_4x8b q11, q12, d22, d23, d24, d25
add r0, r10, r1, lsl #2
vst1.32 {d22[0]}, [r10], r1
vst1.32 {d22[1]}, [r0], r1
vst1.32 {d23[0]}, [r10], r1
vst1.32 {d23[1]}, [r0], r1
vst1.32 {d24[0]}, [r10], r1
vst1.32 {d24[1]}, [r0], r1
vst1.32 {d25[0]}, [r10], r1
vst1.32 {d25[1]}, [r0], r1
add r0, r0, #2
bx r12
endfunc
// Vertical wd=6 loop filter: loads p2..q2 (6 rows around the edge at r0),
// runs the shared wd=6 core, and stores back only p1..q1 (the rows the
// wd=6 filter modifies). r0 is restored to its entry value.
function lpf_v_6_8_neon
mov r12, lr
sub r10, r0, r1, lsl #1
sub r10, r10, r1
vld1.8 {d21}, [r10, :64], r1 // p2
vld1.8 {d24}, [r0, :64], r1 // q0
vld1.8 {d22}, [r10, :64], r1 // p1
vld1.8 {d25}, [r0, :64], r1 // q1
vld1.8 {d23}, [r10, :64], r1 // p0
vld1.8 {d26}, [r0, :64], r1 // q2
sub r0, r0, r1, lsl #1
sub r0, r0, r1
lpf_8_wd6
sub r10, r0, r1, lsl #1
vst1.8 {d22}, [r10, :64], r1 // p1
vst1.8 {d24}, [r0, :64], r1 // q0
vst1.8 {d23}, [r10, :64], r1 // p0
vst1.8 {d25}, [r0, :64], r1 // q1
sub r0, r0, r1, lsl #1
bx r12
endfunc
// Horizontal wd=6 loop filter on 8 rows. Loads 8 bytes per row (4 each
// side of the edge at r0), transposes to the core's layout, filters,
// then transposes back and stores only the inner 4 pixels per row
// (p1..q1). On return r0 has advanced by 8 rows.
function lpf_h_6_8_neon
mov r12, lr
sub r10, r0, #4
add r0, r10, r1, lsl #2
vld1.8 {d20}, [r10], r1
vld1.8 {d24}, [r0], r1
vld1.8 {d21}, [r10], r1
vld1.8 {d25}, [r0], r1
vld1.8 {d22}, [r10], r1
vld1.8 {d26}, [r0], r1
vld1.8 {d23}, [r10], r1
vld1.8 {d27}, [r0], r1
add r0, r0, #4
transpose_8x8b q10, q11, q12, q13, d20, d21, d22, d23, d24, d25, d26, d27
lpf_8_wd6
sub r10, r0, r1, lsl #3
sub r10, r10, #2
transpose_4x8b q11, q12, d22, d23, d24, d25
add r0, r10, r1, lsl #2
vst1.32 {d22[0]}, [r10], r1
vst1.32 {d22[1]}, [r0], r1
vst1.32 {d23[0]}, [r10], r1
vst1.32 {d23[1]}, [r0], r1
vst1.32 {d24[0]}, [r10], r1
vst1.32 {d24[1]}, [r0], r1
vst1.32 {d25[0]}, [r10], r1
vst1.32 {d25[1]}, [r0], r1
add r0, r0, #2
bx r12
endfunc
// Vertical wd=8 loop filter: loads p3..q3 (8 rows), runs the shared wd=8
// core, and stores back p2..q2 (the 6 modified rows). The core may
// instead return to the local 8: label (address passed in r9) when the
// edge only qualified for the narrower wd=4 filter, in which case just
// p1..q1 are written. r0 is restored to its entry value on both paths.
function lpf_v_8_8_neon
mov r12, lr
sub r10, r0, r1, lsl #2
vld1.8 {d20}, [r10, :64], r1 // p3
vld1.8 {d24}, [r0, :64], r1 // q0
vld1.8 {d21}, [r10, :64], r1 // p2
vld1.8 {d25}, [r0, :64], r1 // q1
vld1.8 {d22}, [r10, :64], r1 // p1
vld1.8 {d26}, [r0, :64], r1 // q2
vld1.8 {d23}, [r10, :64], r1 // p0
vld1.8 {d27}, [r0, :64], r1 // q3
sub r0, r0, r1, lsl #2
lpf_8_wd8
sub r10, r0, r1, lsl #1
sub r10, r10, r1
vst1.8 {d21}, [r10, :64], r1 // p2
vst1.8 {d24}, [r0, :64], r1 // q0
vst1.8 {d22}, [r10, :64], r1 // p1
vst1.8 {d25}, [r0, :64], r1 // q1
vst1.8 {d23}, [r10, :64], r1 // p0
vst1.8 {d26}, [r0, :64], r1 // q2
sub r0, r0, r1, lsl #1
sub r0, r0, r1
bx r12
8:
// Shorter epilogue: only the wd=4 subset (p1..q1) was filtered.
sub r10, r0, r1, lsl #1
vst1.8 {d22}, [r10, :64], r1 // p1
vst1.8 {d24}, [r0, :64], r1 // q0
vst1.8 {d23}, [r10, :64], r1 // p0
vst1.8 {d25}, [r0, :64], r1 // q1
sub r0, r0, r1, lsl #1
bx r12
endfunc
// Horizontal wd=8 loop filter on 8 rows. Loads 8 bytes per row (4 each
// side of the edge at r0), transposes, filters with the shared wd=8
// core, transposes back and stores all 8 columns. If the core falls
// back to wd=4 it returns to the local 8: label, which writes only the
// inner 4 columns. On return r0 has advanced by 8 rows.
function lpf_h_8_8_neon
mov r12, lr
sub r10, r0, #4
add r0, r10, r1, lsl #2
vld1.8 {d20}, [r10], r1
vld1.8 {d24}, [r0], r1
vld1.8 {d21}, [r10], r1
vld1.8 {d25}, [r0], r1
vld1.8 {d22}, [r10], r1
vld1.8 {d26}, [r0], r1
vld1.8 {d23}, [r10], r1
vld1.8 {d27}, [r0], r1
add r0, r0, #4
transpose_8x8b q10, q11, q12, q13, d20, d21, d22, d23, d24, d25, d26, d27
lpf_8_wd8
sub r10, r0, r1, lsl #3
sub r10, r10, #4
transpose_8x8b q10, q11, q12, q13, d20, d21, d22, d23, d24, d25, d26, d27
add r0, r10, r1, lsl #2
vst1.8 {d20}, [r10], r1
vst1.8 {d24}, [r0], r1
vst1.8 {d21}, [r10], r1
vst1.8 {d25}, [r0], r1
vst1.8 {d22}, [r10], r1
vst1.8 {d26}, [r0], r1
vst1.8 {d23}, [r10], r1
vst1.8 {d27}, [r0], r1
add r0, r0, #4
bx r12
8:
// Shorter epilogue: only the wd=4 subset (inner 4 columns) was filtered.
sub r10, r0, r1, lsl #3
sub r10, r10, #2
transpose_4x8b q11, q12, d22, d23, d24, d25
add r0, r10, r1, lsl #2
vst1.32 {d22[0]}, [r10], r1
vst1.32 {d22[1]}, [r0], r1
vst1.32 {d23[0]}, [r10], r1
vst1.32 {d23[1]}, [r0], r1
vst1.32 {d24[0]}, [r10], r1
vst1.32 {d24[1]}, [r0], r1
vst1.32 {d25[0]}, [r10], r1
vst1.32 {d25[1]}, [r0], r1
add r0, r0, #2
bx r12
endfunc
// Vertical wd=16 loop filter: loads p6..q6 (14 rows around the edge at
// r0) and runs the shared wd=16 core. The full path stores p5..q5
// (12 rows, flat-filter results in d0-d11). The core may instead return
// to 7: (wd=8 result, write p2..q2) or 8: (wd=4 result, write p1..q1).
// r0 is restored to its entry value on all paths.
function lpf_v_16_8_neon
mov r12, lr
sub r10, r0, r1, lsl #3
add r10, r10, r1
vld1.8 {d17}, [r10, :64], r1 // p6
vld1.8 {d24}, [r0, :64], r1 // q0
vld1.8 {d18}, [r10, :64], r1 // p5
vld1.8 {d25}, [r0, :64], r1 // q1
vld1.8 {d19}, [r10, :64], r1 // p4
vld1.8 {d26}, [r0, :64], r1 // q2
vld1.8 {d20}, [r10, :64], r1 // p3
vld1.8 {d27}, [r0, :64], r1 // q3
vld1.8 {d21}, [r10, :64], r1 // p2
vld1.8 {d28}, [r0, :64], r1 // q4
vld1.8 {d22}, [r10, :64], r1 // p1
vld1.8 {d29}, [r0, :64], r1 // q5
vld1.8 {d23}, [r10, :64], r1 // p0
vld1.8 {d30}, [r0, :64], r1 // q6
sub r0, r0, r1, lsl #3
add r0, r0, r1
lpf_8_wd16
sub r10, r0, r1, lsl #2
sub r10, r10, r1, lsl #1
vst1.8 {d0}, [r10, :64], r1 // p5
vst1.8 {d6}, [r0, :64], r1 // q0
vst1.8 {d1}, [r10, :64], r1 // p4
vst1.8 {d7}, [r0, :64], r1 // q1
vst1.8 {d2}, [r10, :64], r1 // p3
vst1.8 {d8}, [r0, :64], r1 // q2
vst1.8 {d3}, [r10, :64], r1 // p2
vst1.8 {d9}, [r0, :64], r1 // q3
vst1.8 {d4}, [r10, :64], r1 // p1
vst1.8 {d10}, [r0, :64], r1 // q4
vst1.8 {d5}, [r10, :64], r1 // p0
vst1.8 {d11}, [r0, :64], r1 // q5
sub r0, r0, r1, lsl #2
sub r0, r0, r1, lsl #1
bx r12
7:
// Shorter epilogue: only the wd=8 subset (p2..q2) was filtered.
sub r10, r0, r1
sub r10, r10, r1, lsl #1
vst1.8 {d21}, [r10, :64], r1 // p2
vst1.8 {d24}, [r0, :64], r1 // q0
vst1.8 {d22}, [r10, :64], r1 // p1
vst1.8 {d25}, [r0, :64], r1 // q1
vst1.8 {d23}, [r10, :64], r1 // p0
vst1.8 {d26}, [r0, :64], r1 // q2
sub r0, r0, r1, lsl #1
sub r0, r0, r1
bx r12
8:
// Shorter epilogue: only the wd=4 subset (p1..q1) was filtered.
sub r10, r0, r1, lsl #1
vst1.8 {d22}, [r10, :64], r1 // p1
vst1.8 {d24}, [r0, :64], r1 // q0
vst1.8 {d23}, [r10, :64], r1 // p0
vst1.8 {d25}, [r0, :64], r1 // q1
sub r0, r0, r1, lsl #1
bx r12
endfunc
// Horizontal wd=16 loop filter on 8 rows. Loads 16 bytes per row
// (8 each side of the edge at r0), transposes into the p/q register
// layout, runs the shared wd=16 core, then transposes the 16 result
// columns back and stores them. Shorter-filter returns land on 7:
// (store inner 8 columns, wd=8 result) or 8: (store inner 4 columns,
// wd=4 result). On return r0 has advanced by 8 rows.
function lpf_h_16_8_neon
mov r12, lr
sub r10, r0, #8
vld1.8 {d16}, [r10, :64], r1
vld1.8 {d24}, [r0, :64], r1
vld1.8 {d17}, [r10, :64], r1
vld1.8 {d25}, [r0, :64], r1
vld1.8 {d18}, [r10, :64], r1
vld1.8 {d26}, [r0, :64], r1
vld1.8 {d19}, [r10, :64], r1
vld1.8 {d27}, [r0, :64], r1
vld1.8 {d20}, [r10, :64], r1
vld1.8 {d28}, [r0, :64], r1
vld1.8 {d21}, [r10, :64], r1
vld1.8 {d29}, [r0, :64], r1
vld1.8 {d22}, [r10, :64], r1
vld1.8 {d30}, [r0, :64], r1
vld1.8 {d23}, [r10, :64], r1
vld1.8 {d31}, [r0, :64], r1
transpose_8x8b q8, q9, q10, q11, d16, d17, d18, d19, d20, d21, d22, d23
transpose_8x8b q12, q13, q14, q15, d24, d25, d26, d27, d28, d29, d30, d31
lpf_8_wd16
sub r0, r0, r1, lsl #3
sub r10, r0, #8
transpose_8x8b q8, q0, q1, q2, d16, d17, d0, d1, d2, d3, d4, d5
transpose_8x8b q3, q4, q5, q15, d6, d7, d8, d9, d10, d11, d30, d31
vst1.8 {d16}, [r10, :64], r1
vst1.8 {d6}, [r0, :64], r1
vst1.8 {d17}, [r10, :64], r1
vst1.8 {d7}, [r0, :64], r1
vst1.8 {d0}, [r10, :64], r1
vst1.8 {d8}, [r0, :64], r1
vst1.8 {d1}, [r10, :64], r1
vst1.8 {d9}, [r0, :64], r1
vst1.8 {d2}, [r10, :64], r1
vst1.8 {d10}, [r0, :64], r1
vst1.8 {d3}, [r10, :64], r1
vst1.8 {d11}, [r0, :64], r1
vst1.8 {d4}, [r10, :64], r1
vst1.8 {d30}, [r0, :64], r1
vst1.8 {d5}, [r10, :64], r1
vst1.8 {d31}, [r0, :64], r1
bx r12
7:
// Shorter epilogue: only the wd=8 subset was filtered; store the inner
// 8 columns per row.
sub r10, r0, r1, lsl #3
sub r10, r10, #4
transpose_8x8b q10, q11, q12, q13, d20, d21, d22, d23, d24, d25, d26, d27
add r0, r10, r1, lsl #2
vst1.8 {d20}, [r10], r1
vst1.8 {d24}, [r0], r1
vst1.8 {d21}, [r10], r1
vst1.8 {d25}, [r0], r1
vst1.8 {d22}, [r10], r1
vst1.8 {d26}, [r0], r1
vst1.8 {d23}, [r10], r1
vst1.8 {d27}, [r0], r1
add r0, r0, #4
bx r12
8:
// Shorter epilogue: only the wd=4 subset was filtered; store the inner
// 4 columns per row.
sub r10, r0, r1, lsl #3
sub r10, r10, #2
transpose_4x8b q11, q12, d22, d23, d24, d25
add r0, r10, r1, lsl #2
vst1.32 {d22[0]}, [r10], r1
vst1.32 {d22[1]}, [r0], r1
vst1.32 {d23[0]}, [r10], r1
vst1.32 {d23[1]}, [r0], r1
vst1.32 {d24[0]}, [r10], r1
vst1.32 {d24[1]}, [r0], r1
vst1.32 {d25[0]}, [r10], r1
vst1.32 {d25[1]}, [r0], r1
add r0, r0, #2
bx r12
endfunc
// void dav1d_lpf_v_sb_y_neon(pixel *dst, const ptrdiff_t stride,
// const uint32_t *const vmask,
// const uint8_t (*l)[4], ptrdiff_t b4_stride,
// const Av1FilterLUT *lut, const int w)
// Generate a superblock edge loop filter entry point (see the C
// prototype comment above). \dir selects vertical or horizontal
// filtering; \type selects the luma (y, which also has a wd16/vmask[2]
// path) or chroma (uv) variant. Walks the edge in 4-pixel units,
// consuming two bits of each vmask word per iteration to pick the
// filter width, and computes the E, I and H thresholds from the filter
// level L and the sharpness values in the lut.
.macro lpf_func dir, type
function lpf_\dir\()_sb_\type\()_neon, export=1
push {r4-r11,lr}
vpush {q4-q7}
ldrd r4, r5, [sp, #100] // b4_stride, lut (stack args; 36B push + 64B vpush)
ldrd r6, r7, [r2] // vmask[0], vmask[1]
.ifc \type, y
ldr r2, [r2, #8] // vmask[2]
.endif
add r5, r5, #128 // Move to sharp part of lut
.ifc \type, y
orr r7, r7, r2 // vmask[1] |= vmask[2]
.endif
.ifc \dir, v
sub r4, r3, r4, lsl #2
.else
sub r3, r3, #4
lsl r4, r4, #2
.endif
orr r6, r6, r7 // vmask[0] |= vmask[1]
// Main loop: one iteration per two 4-pixel units (2 vmask bits).
1:
tst r6, #0x03
.ifc \dir, v
vld1.8 {d0}, [r4]!
vld1.8 {d1}, [r3]!
.else
vld2.32 {d0[0], d1[0]}, [r3], r4
vld2.32 {d0[1], d1[1]}, [r3], r4
.endif
beq 7f // if (!(vm & bits)) continue;
vld1.8 {d5[]}, [r5] // sharp[0]
add r5, r5, #8
vmov.i32 d2, #0xff
vdup.32 d13, r6 // vmask[0]
vand d0, d0, d2 // Keep only lowest byte in each 32 bit word
vand d1, d1, d2
vtst.8 d3, d1, d2 // Check for nonzero values in l[0][0]
vmov.i8 d4, #1
vld1.8 {d6[]}, [r5] // sharp[1]
sub r5, r5, #8
vbif d1, d0, d3 // if (!l[0][0]) L = l[offset][0]
vmul.i32 d1, d1, d4 // L, broadcast to all 4 bytes of each lane
.ifc \type, y
vdup.32 d15, r2 // vmask[2]
.endif
vtst.32 d2, d1, d2 // L != 0
vdup.32 d14, r7 // vmask[1]
vmov r10, r11, d2
orrs r10, r10, r11
beq 7f // if (!L) continue;
vneg.s8 d5, d5 // -sharp[0]
movrel_local r10, word_12
vshr.u8 d12, d1, #4 // H
vld1.32 {d16}, [r10, :64]
vshl.s8 d3, d1, d5 // L >> sharp[0]
.ifc \type, y
vtst.32 d15, d15, d16 // if (vmask[2] & bits)
.endif
vmov.i8 d7, #2
vmin.u8 d3, d3, d6 // imin(L >> sharp[0], sharp[1])
vadd.i8 d0, d1, d7 // L + 2
vmax.u8 d11, d3, d4 // imax(imin(), 1) = limit = I
vadd.u8 d0, d0, d0 // 2*(L + 2)
vtst.32 d14, d14, d16 // if (vmask[1] & bits)
vadd.i8 d10, d0, d11 // 2*(L + 2) + limit = E
vtst.32 d13, d13, d16 // if (vmask[0] & bits)
vand d13, d13, d2 // vmask[0] &= L != 0
// Dispatch to the widest filter whose vmask bits are set.
.ifc \type, y
tst r2, #0x03
beq 2f
// wd16
bl lpf_\dir\()_16_8_neon
b 8f
2:
.endif
tst r7, #0x03
beq 3f
.ifc \type, y
// wd8
bl lpf_\dir\()_8_8_neon
.else
// wd6
bl lpf_\dir\()_6_8_neon
.endif
b 8f
3:
// wd4
bl lpf_\dir\()_4_8_neon
.ifc \dir, h
b 8f
7:
// For dir h, the functions above increment r0.
// If the whole function is skipped, increment it here instead.
add r0, r0, r1, lsl #3
.else
7:
.endif
8:
lsrs r6, r6, #2 // vmask[0] >>= 2
lsr r7, r7, #2 // vmask[1] >>= 2
.ifc \type, y
lsr r2, r2, #2 // vmask[2] >>= 2
.endif
.ifc \dir, v
add r0, r0, #8
.else
// For dir h, r0 is returned incremented
.endif
bne 1b // loop while vmask[0] (merged) has bits left
vpop {q4-q7}
pop {r4-r11,pc}
endfunc
.endm
// Instantiate the four exported entry points: {vertical, horizontal}
// x {luma, chroma}.
lpf_func v, y
lpf_func h, y
lpf_func v, uv
lpf_func h, uv
// Per-lane bit masks {1, 2}, loaded into d16 and used with vtst to test
// the two vmask bits belonging to the two 4-pixel units handled per
// loop iteration in lpf_func.
const word_12, align=4
.word 1, 2
endconst

File diff suppressed because it is too large Load Diff

View File

@ -34,11 +34,11 @@
.macro movrel_local rd, val, offset=0
#if defined(PIC)
ldr \rd, 1f
b 2f
1:
.word \val + \offset - (2f + 8 - 4 * CONFIG_THUMB)
2:
ldr \rd, 90001f
b 90002f
90001:
.word \val + \offset - (90002f + 8 - 4 * CONFIG_THUMB)
90002:
add \rd, \rd, pc
#else
movw \rd, #:lower16:\val+\offset
@ -84,4 +84,11 @@
vtrn.8 \r6, \r7
.endm
.macro transpose_4x8b q0, q1, r0, r1, r2, r3
vtrn.16 \q0, \q1
vtrn.8 \r0, \r1
vtrn.8 \r2, \r3
.endm
#endif /* DAV1D_SRC_ARM_32_UTIL_S */

View File

@ -37,11 +37,11 @@ function lpf_16_wd\wd\()_neon
.if \wd >= 6
uabd v4.16b, v21.16b, v22.16b // abs(p2 - p1)
uabd v5.16b, v26.16b, v25.16b // abs(q2 - q1)
.endif
.if \wd >= 8
uabd v6.16b, v20.16b, v21.16b // abs(p3 - p2)
uabd v7.16b, v27.16b, v26.16b // abs(q3 - q2)
.endif
.endif
.if \wd >= 6
umax v4.16b, v4.16b, v5.16b
.endif
@ -70,7 +70,7 @@ function lpf_16_wd\wd\()_neon
.if \wd >= 6
and v14.16b, v14.16b, v1.16b // fm && wd > 4
.endif
.if \wd >= 6
.if \wd >= 16
and v15.16b, v15.16b, v1.16b // fm && wd == 16
.endif
@ -303,7 +303,6 @@ function lpf_16_wd\wd\()_neon
rshrn v13.8b, v8.8h, #3 // out q0
rshrn2 v13.16b, v9.8h, #3
add v8.8h, v8.8h, v6.8h
add v9.8h, v9.8h, v7.8h
sub v2.8h, v2.8h, v0.8h // q2 + q3 - p1 - q1
@ -420,6 +419,7 @@ function lpf_16_wd\wd\()_neon
sub v7.8h, v7.8h, v11.8h
uaddl v8.8h, v26.8b, v30.8b // q2 + q6
uaddl2 v9.8h, v26.16b, v30.16b
bif v0.16b, v18.16b, v15.16b // out p5
uaddl v10.8h, v18.8b, v23.8b // p5 + p0
uaddl2 v11.8h, v18.16b, v23.16b
rshrn v5.8b, v12.8h, #4 // out p0
@ -430,56 +430,55 @@ function lpf_16_wd\wd\()_neon
sub v9.8h, v9.8h, v11.8h
uaddl v10.8h, v27.8b, v30.8b // q3 + q6
uaddl2 v11.8h, v27.16b, v30.16b
bif v0.16b, v18.16b, v15.16b // out p5
uaddl v14.8h, v19.8b, v24.8b // p4 + q0
uaddl2 v18.8h, v19.16b, v24.16b
bif v1.16b, v19.16b, v15.16b // out p4
uaddl v18.8h, v19.8b, v24.8b // p4 + q0
uaddl2 v19.8h, v19.16b, v24.16b
rshrn v6.8b, v12.8h, #4 // out q0
rshrn2 v6.16b, v13.8h, #4
add v12.8h, v12.8h, v8.8h // - (p5 + p0) + (q2 + q6)
add v13.8h, v13.8h, v9.8h
sub v10.8h, v10.8h, v14.8h
sub v11.8h, v11.8h, v18.8h
uaddl v14.8h, v28.8b, v30.8b // q4 + q6
uaddl2 v18.8h, v28.16b, v30.16b
bif v1.16b, v19.16b, v15.16b // out p4
uaddl v8.8h, v20.8b, v25.8b // p3 + q1
uaddl2 v9.8h, v20.16b, v25.16b
sub v10.8h, v10.8h, v18.8h
sub v11.8h, v11.8h, v19.8h
uaddl v8.8h, v28.8b, v30.8b // q4 + q6
uaddl2 v9.8h, v28.16b, v30.16b
bif v2.16b, v20.16b, v15.16b // out p3
uaddl v18.8h, v20.8b, v25.8b // p3 + q1
uaddl2 v19.8h, v20.16b, v25.16b
rshrn v7.8b, v12.8h, #4 // out q1
rshrn2 v7.16b, v13.8h, #4
add v12.8h, v12.8h, v10.8h // - (p4 + q0) + (q3 + q6)
add v13.8h, v13.8h, v11.8h
sub v14.8h, v14.8h, v8.8h
sub v18.8h, v18.8h, v9.8h
sub v18.8h, v8.8h, v18.8h
sub v19.8h, v9.8h, v19.8h
uaddl v10.8h, v29.8b, v30.8b // q5 + q6
uaddl2 v11.8h, v29.16b, v30.16b
bif v2.16b, v20.16b, v15.16b // out p3
uaddl v19.8h, v21.8b, v26.8b // p2 + q2
uaddl2 v20.8h, v21.16b, v26.16b
bif v3.16b, v21.16b, v15.16b // out p2
uaddl v20.8h, v21.8b, v26.8b // p2 + q2
uaddl2 v21.8h, v21.16b, v26.16b
rshrn v8.8b, v12.8h, #4 // out q2
rshrn2 v8.16b, v13.8h, #4
add v12.8h, v12.8h, v14.8h // - (p3 + q1) + (q4 + q6)
add v13.8h, v13.8h, v18.8h
sub v10.8h, v10.8h, v19.8h
sub v11.8h, v11.8h, v20.8h
uaddl v14.8h, v30.8b, v30.8b // q6 + q6
uaddl2 v18.8h, v30.16b, v30.16b
bif v3.16b, v21.16b, v15.16b // out p2
uaddl v19.8h, v22.8b, v27.8b // p1 + q3
uaddl2 v20.8h, v22.16b, v27.16b
add v12.8h, v12.8h, v18.8h // - (p3 + q1) + (q4 + q6)
add v13.8h, v13.8h, v19.8h
sub v10.8h, v10.8h, v20.8h
sub v11.8h, v11.8h, v21.8h
uaddl v18.8h, v30.8b, v30.8b // q6 + q6
uaddl2 v19.8h, v30.16b, v30.16b
bif v4.16b, v22.16b, v15.16b // out p1
uaddl v20.8h, v22.8b, v27.8b // p1 + q3
uaddl2 v21.8h, v22.16b, v27.16b
rshrn v9.8b, v12.8h, #4 // out q3
rshrn2 v9.16b, v13.8h, #4
add v12.8h, v12.8h, v10.8h // - (p2 + q2) + (q5 + q6)
add v13.8h, v13.8h, v11.8h
sub v14.8h, v14.8h, v19.8h
sub v18.8h, v18.8h, v20.8h
bif v4.16b, v22.16b, v15.16b // out p1
sub v19.8h, v19.8h, v21.8h
bif v5.16b, v23.16b, v15.16b // out p0
rshrn v10.8b, v12.8h, #4 // out q4
rshrn2 v10.16b, v13.8h, #4
add v12.8h, v12.8h, v14.8h // - (p1 + q3) + (q6 + q6)
add v13.8h, v13.8h, v18.8h
add v12.8h, v12.8h, v18.8h // - (p1 + q3) + (q6 + q6)
add v13.8h, v13.8h, v19.8h
rshrn v11.8b, v12.8h, #4 // out q5
rshrn2 v11.16b, v13.8h, #4
bif v5.16b, v23.16b, v15.16b // out p0
bif v6.16b, v24.16b, v15.16b // out q0
bif v7.16b, v25.16b, v15.16b // out q1
bif v8.16b, v26.16b, v15.16b // out q2

View File

@ -949,8 +949,8 @@ function sgr_box5_h_neon, export=1
ext v4.16b, v5.16b, v4.16b, #13
b 2f
0:
// !LR_HAVE_LEFT, fill v2 with the leftmost byte
// and shift v3 to have 2x the first byte at the front.
// !LR_HAVE_LEFT, fill v1 with the leftmost byte
// and shift v0 to have 2x the first byte at the front.
dup v1.16b, v0.b[0]
dup v5.16b, v4.b[0]
// Move x3 back to account for the last 3 bytes we loaded before,
@ -993,7 +993,7 @@ function sgr_box5_h_neon, export=1
ext v20.16b, v4.16b, v4.16b, #1
ext v21.16b, v4.16b, v4.16b, #2
ext v22.16b, v4.16b, v4.16b, #3
ext v23.16b, v4.16b, v5.16b, #4
ext v23.16b, v4.16b, v4.16b, #4
uaddl v3.8h, v0.8b, v16.8b
uaddl v24.8h, v17.8b, v18.8b
uaddl v7.8h, v4.8b, v20.8b
@ -1053,15 +1053,15 @@ function sgr_box5_h_neon, export=1
ext v4.16b, v4.16b, v4.16b, #4
6: // Pad the right edge and produce the last few pixels.
// w < 7, w+1 pixels valid in v3/v5
// w < 7, w+1 pixels valid in v0/v4
sub w13, w5, #1
// w13 = pixels valid - 2
adr x14, L(box5_variable_shift_tbl)
ldrh w13, [x14, w13, uxtw #1]
sub x13, x14, w13, uxth
br x13
// Shift v3 right, shifting out invalid pixels,
// shift v3 left to the original offset, shifting in padding pixels.
// Shift v0 right, shifting out invalid pixels,
// shift v0 left to the original offset, shifting in padding pixels.
22: // 2 pixels valid
ext v0.16b, v0.16b, v0.16b, #2
ext v4.16b, v4.16b, v4.16b, #2
@ -1688,14 +1688,14 @@ function sgr_finish_filter2_neon, export=1
2:
subs x5, x5, #8
ext v22.16b, v0.16b, v1.16b, #2 // -stride
ext v23.16b, v2.16b, v3.16b, #2 // +stride
ext v24.16b, v0.16b, v1.16b, #4 // +1-stride
ext v25.16b, v2.16b, v3.16b, #4 // +1+stride
ext v22.16b, v0.16b, v1.16b, #2 // -stride
ext v23.16b, v2.16b, v3.16b, #2 // +stride
add v0.8h, v0.8h, v24.8h // -1-stride, +1-stride
add v25.8h, v2.8h, v25.8h // -1+stride, +1+stride
add v0.8h, v0.8h, v25.8h
add v2.8h, v22.8h, v23.8h // -stride, +stride
add v0.8h, v0.8h, v25.8h
ext v22.16b, v16.16b, v17.16b, #4 // -stride
ext v23.16b, v17.16b, v18.16b, #4
@ -1760,8 +1760,8 @@ function sgr_finish_filter2_neon, export=1
4:
subs x5, x5, #8
ext v22.16b, v0.16b, v1.16b, #2 // 0
ext v23.16b, v0.16b, v1.16b, #4 // +1
ext v22.16b, v0.16b, v1.16b, #2 // 0
add v0.8h, v0.8h, v23.8h // -1, +1
ext v24.16b, v16.16b, v17.16b, #4 // 0
@ -1894,12 +1894,12 @@ endfunc
// const int16_t wt[2]);
function sgr_weighted2_neon, export=1
ldr x8, [sp]
ld1 {v31.s}[0], [x8]
cmp x7, #2
add x10, x0, x1
add x11, x2, x3
add x12, x4, #2*FILTER_OUT_STRIDE
add x13, x5, #2*FILTER_OUT_STRIDE
ld2r {v30.8h, v31.8h}, [x8] // wt[0], wt[1]
mov x8, #4*FILTER_OUT_STRIDE
lsl x1, x1, #1
lsl x3, x3, #1
@ -1908,8 +1908,6 @@ function sgr_weighted2_neon, export=1
sub x1, x1, x9
sub x3, x3, x9
sub x8, x8, x9, lsl #1
dup v30.8h, v31.h[0] // wt[0]
dup v31.8h, v31.h[1] // wt[1]
mov x9, x6
b.lt 2f
1:

View File

@ -54,13 +54,14 @@ COLD void bitfn(dav1d_intra_pred_dsp_init_arm)(Dav1dIntraPredDSPContext *const c
if (!(flags & DAV1D_ARM_CPU_FLAG_NEON)) return;
#if BITDEPTH == 8 && ARCH_AARCH64
#if BITDEPTH == 8
c->intra_pred[DC_PRED] = dav1d_ipred_dc_neon;
c->intra_pred[DC_128_PRED] = dav1d_ipred_dc_128_neon;
c->intra_pred[TOP_DC_PRED] = dav1d_ipred_dc_top_neon;
c->intra_pred[LEFT_DC_PRED] = dav1d_ipred_dc_left_neon;
c->intra_pred[HOR_PRED] = dav1d_ipred_h_neon;
c->intra_pred[VERT_PRED] = dav1d_ipred_v_neon;
#if ARCH_AARCH64
c->intra_pred[PAETH_PRED] = dav1d_ipred_paeth_neon;
c->intra_pred[SMOOTH_PRED] = dav1d_ipred_smooth_neon;
c->intra_pred[SMOOTH_V_PRED] = dav1d_ipred_smooth_v_neon;
@ -77,4 +78,5 @@ COLD void bitfn(dav1d_intra_pred_dsp_init_arm)(Dav1dIntraPredDSPContext *const c
c->pal_pred = dav1d_pal_pred_neon;
#endif
#endif
}

View File

@ -38,7 +38,7 @@ COLD void bitfn(dav1d_loop_filter_dsp_init_arm)(Dav1dLoopFilterDSPContext *const
if (!(flags & DAV1D_ARM_CPU_FLAG_NEON)) return;
#if BITDEPTH == 8 && ARCH_AARCH64
#if BITDEPTH == 8
c->loop_filter_sb[0][0] = dav1d_lpf_h_sb_y_neon;
c->loop_filter_sb[0][1] = dav1d_lpf_v_sb_y_neon;
c->loop_filter_sb[1][0] = dav1d_lpf_h_sb_uv_neon;

View File

@ -91,7 +91,6 @@ static void wiener_filter_neon(pixel *const dst, const ptrdiff_t dst_stride,
}
}
#if ARCH_AARCH64
void dav1d_sgr_box3_h_neon(int32_t *sumsq, int16_t *sum,
const pixel (*left)[4],
const pixel *src, const ptrdiff_t stride,
@ -123,7 +122,7 @@ static void dav1d_sgr_filter1_neon(coef *tmp,
dav1d_sgr_box3_h_neon(sumsq, sum, left, src, stride, w, h, edges);
if (edges & LR_HAVE_TOP)
dav1d_sgr_box3_h_neon(&sumsq[-2 * (384 + 16)], &sum[-2 * (384 + 16)],
NULL, lpf, lpf_stride, w, 1, edges);
NULL, lpf, lpf_stride, w, 2, edges);
if (edges & LR_HAVE_BOTTOM)
dav1d_sgr_box3_h_neon(&sumsq[h * (384 + 16)], &sum[h * (384 + 16)],
@ -253,7 +252,6 @@ static void sgr_filter_neon(pixel *const dst, const ptrdiff_t dst_stride,
}
}
}
#endif // ARCH_AARCH64
#endif // BITDEPTH == 8
COLD void bitfn(dav1d_loop_restoration_dsp_init_arm)(Dav1dLoopRestorationDSPContext *const c) {
@ -263,8 +261,6 @@ COLD void bitfn(dav1d_loop_restoration_dsp_init_arm)(Dav1dLoopRestorationDSPCont
#if BITDEPTH == 8
c->wiener = wiener_filter_neon;
#if ARCH_AARCH64
c->selfguided = sgr_filter_neon;
#endif
#endif
}

View File

@ -524,6 +524,7 @@ static void order_palette(const uint8_t *pal_idx, const ptrdiff_t stride,
{
int have_top = i > first;
assert(pal_idx);
pal_idx += first + (i - first) * stride;
for (int j = first, n = 0; j >= last; have_top = 1, j--, n++, pal_idx += stride - 1) {
const int have_left = j > 0;
@ -586,6 +587,7 @@ static void read_pal_indices(Dav1dTileContext *const t,
{
Dav1dTileState *const ts = t->ts;
const ptrdiff_t stride = bw4 * 4;
assert(pal_idx);
pal_idx[0] = dav1d_msac_decode_uniform(&ts->msac, b->pal_sz[pl]);
uint16_t (*const color_map_cdf)[8] =
ts->cdf.m.color_map[pl][b->pal_sz[pl] - 2];
@ -1125,6 +1127,7 @@ static int decode_b(Dav1dTileContext *const t,
if (b->pal_sz[0]) {
uint8_t *pal_idx;
if (f->frame_thread.pass) {
assert(ts->frame_thread.pal_idx);
pal_idx = ts->frame_thread.pal_idx;
ts->frame_thread.pal_idx += bw4 * bh4 * 16;
} else
@ -1137,6 +1140,7 @@ static int decode_b(Dav1dTileContext *const t,
if (has_chroma && b->pal_sz[1]) {
uint8_t *pal_idx;
if (f->frame_thread.pass) {
assert(ts->frame_thread.pal_idx);
pal_idx = ts->frame_thread.pal_idx;
ts->frame_thread.pal_idx += cbw4 * cbh4 * 16;
} else
@ -1390,7 +1394,7 @@ static int decode_b(Dav1dTileContext *const t,
b->ref[1] = f->frame_hdr->skip_mode_refs[1];
b->comp_type = COMP_INTER_AVG;
b->inter_mode = NEARESTMV_NEARESTMV;
b->drl_idx = 0;
b->drl_idx = NEAREST_DRL;
has_subpel_filter = 0;
candidate_mv mvstack[8];
@ -1490,13 +1494,13 @@ static int decode_b(Dav1dTileContext *const t,
b->inter_mode, ctx, n_mvs, ts->msac.rng);
const uint8_t *const im = dav1d_comp_inter_pred_modes[b->inter_mode];
b->drl_idx = 0;
b->drl_idx = NEAREST_DRL;
if (b->inter_mode == NEWMV_NEWMV) {
if (n_mvs > 1) {
if (n_mvs > 1) { // NEARER, NEAR or NEARISH
const int drl_ctx_v1 = get_drl_context(mvstack, 0);
b->drl_idx += dav1d_msac_decode_bool_adapt(&ts->msac,
ts->cdf.m.drl_bit[drl_ctx_v1]);
if (b->drl_idx == 1 && n_mvs > 2) {
if (b->drl_idx == NEARER_DRL && n_mvs > 2) {
const int drl_ctx_v2 = get_drl_context(mvstack, 1);
b->drl_idx += dav1d_msac_decode_bool_adapt(&ts->msac,
ts->cdf.m.drl_bit[drl_ctx_v2]);
@ -1506,12 +1510,12 @@ static int decode_b(Dav1dTileContext *const t,
b->drl_idx, n_mvs, ts->msac.rng);
}
} else if (im[0] == NEARMV || im[1] == NEARMV) {
b->drl_idx = 1;
if (n_mvs > 2) {
b->drl_idx = NEARER_DRL;
if (n_mvs > 2) { // NEAR or NEARISH
const int drl_ctx_v2 = get_drl_context(mvstack, 1);
b->drl_idx += dav1d_msac_decode_bool_adapt(&ts->msac,
ts->cdf.m.drl_bit[drl_ctx_v2]);
if (b->drl_idx == 2 && n_mvs > 3) {
if (b->drl_idx == NEAR_DRL && n_mvs > 3) {
const int drl_ctx_v3 = get_drl_context(mvstack, 2);
b->drl_idx += dav1d_msac_decode_bool_adapt(&ts->msac,
ts->cdf.m.drl_bit[drl_ctx_v3]);
@ -1521,6 +1525,7 @@ static int decode_b(Dav1dTileContext *const t,
b->drl_idx, n_mvs, ts->msac.rng);
}
}
assert(b->drl_idx >= NEAREST_DRL && b->drl_idx <= NEARISH_DRL);
#define assign_comp_mv(idx, pfx) \
switch (im[idx]) { \
@ -1678,14 +1683,14 @@ static int decode_b(Dav1dTileContext *const t,
has_subpel_filter = 1;
if (dav1d_msac_decode_bool_adapt(&ts->msac,
ts->cdf.m.refmv_mode[(ctx >> 4) & 15]))
{
{ // NEAREST, NEARER, NEAR or NEARISH
b->inter_mode = NEARMV;
b->drl_idx = 1;
if (n_mvs > 2) {
b->drl_idx = NEARER_DRL;
if (n_mvs > 2) { // NEARER, NEAR or NEARISH
const int drl_ctx_v2 = get_drl_context(mvstack, 1);
b->drl_idx += dav1d_msac_decode_bool_adapt(&ts->msac,
ts->cdf.m.drl_bit[drl_ctx_v2]);
if (b->drl_idx == 2 && n_mvs > 3) {
if (b->drl_idx == NEAR_DRL && n_mvs > 3) { // NEAR or NEARISH
const int drl_ctx_v3 =
get_drl_context(mvstack, 2);
b->drl_idx += dav1d_msac_decode_bool_adapt(&ts->msac,
@ -1694,9 +1699,10 @@ static int decode_b(Dav1dTileContext *const t,
}
} else {
b->inter_mode = NEARESTMV;
b->drl_idx = 0;
b->drl_idx = NEAREST_DRL;
}
if (b->drl_idx >= 2) {
assert(b->drl_idx >= NEAREST_DRL && b->drl_idx <= NEARISH_DRL);
if (b->drl_idx >= NEAR_DRL) {
b->mv[0] = mvstack[b->drl_idx].this_mv;
} else {
b->mv[0] = mvlist[0][b->drl_idx];
@ -1711,20 +1717,22 @@ static int decode_b(Dav1dTileContext *const t,
} else {
has_subpel_filter = 1;
b->inter_mode = NEWMV;
b->drl_idx = 0;
if (n_mvs > 1) {
b->drl_idx = NEAREST_DRL;
if (n_mvs > 1) { // NEARER, NEAR or NEARISH
const int drl_ctx_v1 = get_drl_context(mvstack, 0);
b->drl_idx += dav1d_msac_decode_bool_adapt(&ts->msac,
ts->cdf.m.drl_bit[drl_ctx_v1]);
if (b->drl_idx == 1 && n_mvs > 2) {
if (b->drl_idx == NEARER_DRL && n_mvs > 2) { // NEAR or NEARISH
const int drl_ctx_v2 = get_drl_context(mvstack, 1);
b->drl_idx += dav1d_msac_decode_bool_adapt(&ts->msac,
ts->cdf.m.drl_bit[drl_ctx_v2]);
}
}
assert(b->drl_idx >= NEAREST_DRL && b->drl_idx <= NEARISH_DRL);
if (n_mvs > 1) {
b->mv[0] = mvstack[b->drl_idx].this_mv;
} else {
assert(!b->drl_idx);
b->mv[0] = mvlist[0][0];
fix_mv_precision(f->frame_hdr, &b->mv[0]);
}
@ -1972,7 +1980,7 @@ static int checked_decode_b(Dav1dTileContext *const t,
for (int p = 0; p < 1 + 2 * has_chroma; p++) {
const int ss_ver = p && f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I420;
const int ss_hor = p && f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I444;
const int stride = f->cur.stride[!!p];
const ptrdiff_t stride = f->cur.stride[!!p];
const int bx = t->bx & ~ss_hor;
const int by = t->by & ~ss_ver;
const int width = w4 << (2 - ss_hor + (bw4 == ss_hor));
@ -2318,10 +2326,15 @@ static void setup_tile(Dav1dTileState *const ts,
const int sb_shift = f->sb_shift;
const uint8_t *const size_mul = ss_size_mul[f->cur.p.layout];
ts->frame_thread.pal_idx =
&f->frame_thread.pal_idx[(size_t)tile_start_off * size_mul[1] / 4];
ts->frame_thread.cf = (uint8_t*)f->frame_thread.cf +
(((size_t)tile_start_off * size_mul[0]) >> !f->seq_hdr->hbd);
ts->frame_thread.pal_idx = f->frame_thread.pal_idx ?
&f->frame_thread.pal_idx[(size_t)tile_start_off * size_mul[1] / 4] :
NULL;
ts->frame_thread.cf = f->frame_thread.cf ?
(uint8_t*)f->frame_thread.cf +
(((size_t)tile_start_off * size_mul[0]) >> !f->seq_hdr->hbd) :
NULL;
dav1d_cdf_thread_copy(&ts->cdf, &f->in_cdf);
ts->last_qidx = f->frame_hdr->quant.yac;
memset(ts->last_delta_lf, 0, sizeof(ts->last_delta_lf));
@ -3106,12 +3119,18 @@ int dav1d_decode_frame(Dav1dFrameContext *const f) {
tile_idx++)
{
Dav1dTileState *const ts = &f->ts[tile_idx];
const int tile_start_off = f->frame_thread.tile_start_off[tile_idx];
ts->frame_thread.pal_idx = &f->frame_thread.pal_idx[tile_start_off * size_mul[1] / 4];
ts->frame_thread.cf = (uint8_t*)f->frame_thread.cf +
((tile_start_off * size_mul[0]) >> !f->seq_hdr->hbd);
const size_t tile_start_off =
(size_t) f->frame_thread.tile_start_off[tile_idx];
ts->frame_thread.pal_idx = f->frame_thread.pal_idx ?
&f->frame_thread.pal_idx[tile_start_off * size_mul[1] / 4] :
NULL;
ts->frame_thread.cf = f->frame_thread.cf ?
(uint8_t*)f->frame_thread.cf +
((tile_start_off * size_mul[0]) >> !f->seq_hdr->hbd) :
NULL;
if (f->n_tc > 0) {
unsigned row_sb_start = f->frame_hdr->tiling.row_start_sb[ts->tiling.row];
const unsigned row_sb_start =
f->frame_hdr->tiling.row_start_sb[ts->tiling.row];
atomic_init(&ts->progress, row_sb_start);
}
}

View File

@ -431,19 +431,10 @@ static inline int av1_get_uni_p1_ctx(const BlockContext *const a,
static inline int get_drl_context(const candidate_mv *const ref_mv_stack,
const int ref_idx)
{
if (ref_mv_stack[ref_idx].weight >= 640 &&
ref_mv_stack[ref_idx + 1].weight >= 640)
return 0;
if (ref_mv_stack[ref_idx].weight >= 640)
return ref_mv_stack[ref_idx + 1].weight < 640;
if (ref_mv_stack[ref_idx].weight >= 640 &&
ref_mv_stack[ref_idx + 1].weight < 640)
return 1;
if (ref_mv_stack[ref_idx].weight < 640 &&
ref_mv_stack[ref_idx + 1].weight < 640)
return 2;
return 0;
return ref_mv_stack[ref_idx + 1].weight < 640 ? 2 : 0;
}
static inline unsigned get_cur_frame_segid(const int by, const int bx,

View File

@ -36,6 +36,6 @@
bitfn_decls(void dav1d_apply_grain, const Dav1dFilmGrainDSPContext *const dsp,
Dav1dPicture *const out,
const Dav1dPicture *const in);
Dav1dPicture *const in);
#endif /* DAV1D_SRC_FG_APPLY_H */

View File

@ -91,7 +91,7 @@ static void generate_scaling(const int bitdepth,
#ifndef UNIT_TEST
void bitfn(dav1d_apply_grain)(const Dav1dFilmGrainDSPContext *const dsp,
Dav1dPicture *const out,
const Dav1dPicture *const in)
Dav1dPicture *const in)
{
const Dav1dFilmGrainData *const data = &out->frame_hdr->film_grain.data;
@ -143,7 +143,7 @@ void bitfn(dav1d_apply_grain)(const Dav1dFilmGrainDSPContext *const dsp,
const int cpw = (out->p.w + ss_x) >> ss_x;
const int is_id = out->seq_hdr->mtrx == DAV1D_MC_IDENTITY;
for (int row = 0; row < rows; row++) {
const pixel *const luma_src =
pixel *const luma_src =
((pixel *) in->data[0]) + row * BLOCK_SIZE * PXSTRIDE(in->stride[0]);
if (data->num_y_points) {
@ -153,7 +153,23 @@ void bitfn(dav1d_apply_grain)(const Dav1dFilmGrainDSPContext *const dsp,
out->p.w, scaling[0], grain_lut[0], bh, row HIGHBD_TAIL_SUFFIX);
}
if (!data->num_uv_points[0] && !data->num_uv_points[1] &&
!data->chroma_scaling_from_luma)
{
continue;
}
const int bh = (imin(out->p.h - row * BLOCK_SIZE, BLOCK_SIZE) + ss_y) >> ss_y;
// extend padding pixels
if (out->p.w & ss_x) {
pixel *ptr = luma_src;
for (int y = 0; y < bh; y++) {
ptr[out->p.w] = ptr[out->p.w - 1];
ptr += PXSTRIDE(in->stride[0]) << ss_y;
}
}
const ptrdiff_t uv_off = row * BLOCK_SIZE * PXSTRIDE(out->stride[1]) >> ss_y;
if (data->chroma_scaling_from_luma) {
for (int pl = 0; pl < 2; pl++)

View File

@ -324,7 +324,9 @@ static void ipred_smooth_h_c(pixel *dst, const ptrdiff_t stride,
}
}
static int get_filter_strength(const int wh, const int angle, const int is_sm) {
static NOINLINE int get_filter_strength(const int wh, const int angle,
const int is_sm)
{
if (is_sm) {
if (wh <= 8) {
if (angle >= 64) return 2;
@ -357,10 +359,10 @@ static int get_filter_strength(const int wh, const int angle, const int is_sm) {
return 0;
}
static void filter_edge(pixel *const out, const int sz,
const int lim_from, const int lim_to,
const pixel *const in,
const int from, const int to, const unsigned strength)
static NOINLINE void filter_edge(pixel *const out, const int sz,
const int lim_from, const int lim_to,
const pixel *const in, const int from,
const int to, const int strength)
{
static const uint8_t kernel[3][5] = {
{ 0, 4, 8, 4, 0 },
@ -382,14 +384,13 @@ static void filter_edge(pixel *const out, const int sz,
out[i] = in[iclip(i, from, to - 1)];
}
static int get_upsample(const int blk_wh, const unsigned d, const int type) {
if (d >= 40) return 0;
return type ? (blk_wh <= 8) : (blk_wh <= 16);
static inline int get_upsample(const int wh, const int angle, const int is_sm) {
return angle < 40 && wh <= 16 >> is_sm;
}
static void upsample_edge(pixel *const out, const int hsz,
const pixel *const in, const int from, const int to
HIGHBD_DECL_SUFFIX)
static NOINLINE void upsample_edge(pixel *const out, const int hsz,
const pixel *const in, const int from,
const int to HIGHBD_DECL_SUFFIX)
{
static const int8_t kernel[4] = { -1, 9, 9, -1 };
int i;
@ -415,7 +416,7 @@ static void ipred_z1_c(pixel *dst, const ptrdiff_t stride,
angle &= 511;
assert(angle < 90);
int dx = dav1d_dr_intra_derivative[angle >> 1];
pixel top_out[(64 + 64) * 2];
pixel top_out[64 + 64];
const pixel *top;
int max_base_x;
const int upsample_above = enable_intra_edge_filter ?
@ -474,8 +475,8 @@ static void ipred_z2_c(pixel *dst, const ptrdiff_t stride,
get_upsample(width + height, 180 - angle, is_sm) : 0;
const int upsample_above = enable_intra_edge_filter ?
get_upsample(width + height, angle - 90, is_sm) : 0;
pixel edge[64 * 2 + 64 * 2 + 1];
pixel *const topleft = &edge[height * 2];
pixel edge[64 + 64 + 1];
pixel *const topleft = &edge[64];
if (upsample_above) {
upsample_edge(topleft, width + 1, topleft_in, 0, width + 1
@ -494,8 +495,8 @@ static void ipred_z2_c(pixel *dst, const ptrdiff_t stride,
}
}
if (upsample_left) {
upsample_edge(edge, height + 1, &topleft_in[-height], 0, height + 1
HIGHBD_TAIL_SUFFIX);
upsample_edge(&topleft[-height * 2], height + 1, &topleft_in[-height],
0, height + 1 HIGHBD_TAIL_SUFFIX);
dy <<= 1;
} else {
const int filter_strength = enable_intra_edge_filter ?
@ -549,7 +550,7 @@ static void ipred_z3_c(pixel *dst, const ptrdiff_t stride,
angle &= 511;
assert(angle > 180);
int dy = dav1d_dr_intra_derivative[(270 - angle) >> 1];
pixel left_out[(64 + 64) * 2];
pixel left_out[64 + 64];
const pixel *left;
int max_base_y;
const int upsample_left = enable_intra_edge_filter ?

View File

@ -32,17 +32,6 @@
#include "dav1d/headers.h"
enum ObuType {
OBU_SEQ_HDR = 1,
OBU_TD = 2,
OBU_FRAME_HDR = 3,
OBU_TILE_GRP = 4,
OBU_METADATA = 5,
OBU_FRAME = 6,
OBU_REDUNDANT_FRAME_HDR = 7,
OBU_PADDING = 15,
};
enum ObuMetaType {
OBU_META_HDR_CLL = 1,
OBU_META_HDR_MDCV = 2,
@ -221,6 +210,13 @@ enum InterPredMode {
N_INTER_PRED_MODES,
};
enum DRL_PROXIMITY {
NEAREST_DRL,
NEARER_DRL,
NEAR_DRL,
NEARISH_DRL
};
enum CompInterPredMode {
NEARESTMV_NEARESTMV,
NEARMV_NEARMV,

View File

@ -905,7 +905,6 @@ static void resize_c(pixel *dst, const ptrdiff_t dst_stride,
src_x += mx >> 14;
mx &= 0x3fff;
}
if (dst_w & 1) dst[dst_w] = dst[dst_w - 1];
dst += PXSTRIDE(dst_stride);
src += PXSTRIDE(src_stride);

View File

@ -112,6 +112,8 @@ if is_asm_enabled
elif host_machine.cpu_family().startswith('arm')
libdav1d_sources += files(
'arm/32/cdef.S',
'arm/32/ipred.S',
'arm/32/loopfilter.S',
'arm/32/looprestoration.S',
'arm/32/mc.S',
)

View File

@ -1178,7 +1178,7 @@ int dav1d_parse_obus(Dav1dContext *const c, Dav1dData *const in, int global) {
// obu header
dav1d_get_bits(&gb, 1); // obu_forbidden_bit
const enum ObuType type = dav1d_get_bits(&gb, 4);
const enum Dav1dObuType type = dav1d_get_bits(&gb, 4);
const int has_extension = dav1d_get_bits(&gb, 1);
const int has_length_field = dav1d_get_bits(&gb, 1);
dav1d_get_bits(&gb, 1); // reserved
@ -1217,7 +1217,7 @@ int dav1d_parse_obus(Dav1dContext *const c, Dav1dData *const in, int global) {
if (len > in->sz - init_byte_pos) goto error;
// skip obu not belonging to the selected temporal/spatial layer
if (type != OBU_SEQ_HDR && type != OBU_TD &&
if (type != DAV1D_OBU_SEQ_HDR && type != DAV1D_OBU_TD &&
has_extension && c->operating_point_idc != 0)
{
const int in_temporal_layer = (c->operating_point_idc >> temporal_id) & 1;
@ -1227,7 +1227,7 @@ int dav1d_parse_obus(Dav1dContext *const c, Dav1dData *const in, int global) {
}
switch (type) {
case OBU_SEQ_HDR: {
case DAV1D_OBU_SEQ_HDR: {
Dav1dRef *ref = dav1d_ref_create(sizeof(Dav1dSequenceHeader));
if (!ref) return DAV1D_ERR(ENOMEM);
Dav1dSequenceHeader *seq_hdr = ref->data;
@ -1266,11 +1266,11 @@ int dav1d_parse_obus(Dav1dContext *const c, Dav1dData *const in, int global) {
c->seq_hdr = seq_hdr;
break;
}
case OBU_REDUNDANT_FRAME_HDR:
case DAV1D_OBU_REDUNDANT_FRAME_HDR:
if (c->frame_hdr) break;
// fall-through
case OBU_FRAME:
case OBU_FRAME_HDR:
case DAV1D_OBU_FRAME:
case DAV1D_OBU_FRAME_HDR:
if (global) break;
if (!c->seq_hdr) goto error;
if (!c->frame_hdr_ref) {
@ -1293,7 +1293,7 @@ int dav1d_parse_obus(Dav1dContext *const c, Dav1dData *const in, int global) {
dav1d_data_unref_internal(&c->tile[n].data);
c->n_tile_data = 0;
c->n_tiles = 0;
if (type != OBU_FRAME) {
if (type != DAV1D_OBU_FRAME) {
// This is actually a frame header OBU so read the
// trailing bit and check for overrun.
dav1d_get_bits(&gb, 1);
@ -1312,7 +1312,7 @@ int dav1d_parse_obus(Dav1dContext *const c, Dav1dData *const in, int global) {
return DAV1D_ERR(ERANGE);
}
if (type != OBU_FRAME)
if (type != DAV1D_OBU_FRAME)
break;
// OBU_FRAMEs shouldn't be signaled with show_existing_frame
if (c->frame_hdr->show_existing_frame) {
@ -1325,7 +1325,7 @@ int dav1d_parse_obus(Dav1dContext *const c, Dav1dData *const in, int global) {
// to align to the next byte.
dav1d_bytealign_get_bits(&gb);
// fall-through
case OBU_TILE_GRP: {
case DAV1D_OBU_TILE_GRP: {
if (global) break;
if (!c->frame_hdr) goto error;
if (c->n_tile_data_alloc < c->n_tile_data + 1) {
@ -1365,7 +1365,7 @@ int dav1d_parse_obus(Dav1dContext *const c, Dav1dData *const in, int global) {
c->n_tile_data++;
break;
}
case OBU_METADATA: {
case DAV1D_OBU_METADATA: {
// obu metadta type field
const enum ObuMetaType meta_type = dav1d_get_uleb128(&gb);
const int meta_type_len = (dav1d_get_bits_pos(&gb) - init_bit_pos) >> 3;
@ -1479,8 +1479,8 @@ int dav1d_parse_obus(Dav1dContext *const c, Dav1dData *const in, int global) {
break;
}
case OBU_PADDING:
case OBU_TD:
case DAV1D_OBU_PADDING:
case DAV1D_OBU_TD:
// ignore OBUs we don't care about
break;
default:

View File

@ -52,17 +52,24 @@ int dav1d_default_picture_alloc(Dav1dPicture *const p, void *const cookie) {
const int has_chroma = p->p.layout != DAV1D_PIXEL_LAYOUT_I400;
const int ss_ver = p->p.layout == DAV1D_PIXEL_LAYOUT_I420;
const int ss_hor = p->p.layout != DAV1D_PIXEL_LAYOUT_I444;
p->stride[0] = aligned_w << hbd;
p->stride[1] = has_chroma ? (aligned_w >> ss_hor) << hbd : 0;
const size_t y_sz = p->stride[0] * aligned_h;
const size_t uv_sz = p->stride[1] * (aligned_h >> ss_ver);
const size_t pic_size = y_sz + 2 * uv_sz;
uint8_t *data = dav1d_alloc_aligned(pic_size + DAV1D_PICTURE_ALIGNMENT,
DAV1D_PICTURE_ALIGNMENT);
if (data == NULL) {
return DAV1D_ERR(ENOMEM);
}
ptrdiff_t y_stride = aligned_w << hbd;
ptrdiff_t uv_stride = has_chroma ? y_stride >> ss_hor : 0;
/* Due to how mapping of addresses to sets works in most L1 and L2 cache
* implementations, strides of multiples of certain power-of-two numbers
* may cause multiple rows of the same superblock to map to the same set,
* causing evictions of previous rows resulting in a reduction in cache
* hit rate. Avoid that by slightly padding the stride when necessary. */
if (!(y_stride & 1023))
y_stride += DAV1D_PICTURE_ALIGNMENT;
if (!(uv_stride & 1023) && has_chroma)
uv_stride += DAV1D_PICTURE_ALIGNMENT;
p->stride[0] = y_stride;
p->stride[1] = uv_stride;
const size_t y_sz = y_stride * aligned_h;
const size_t uv_sz = uv_stride * (aligned_h >> ss_ver);
const size_t pic_size = y_sz + 2 * uv_sz + DAV1D_PICTURE_ALIGNMENT;
uint8_t *data = dav1d_alloc_aligned(pic_size, DAV1D_PICTURE_ALIGNMENT);
if (!data) return DAV1D_ERR(ENOMEM);
p->data[0] = data;
p->data[1] = has_chroma ? data + y_sz : NULL;

View File

@ -680,6 +680,7 @@ static void read_coef_tree(Dav1dTileContext *const t,
struct CodedBlockInfo *cbi;
if (f->frame_thread.pass) {
assert(ts->frame_thread.cf);
cf = ts->frame_thread.cf;
ts->frame_thread.cf += imin(t_dim->w, 8) * imin(t_dim->h, 8) * 16;
cbi = &f->frame_thread.cbi[t->by * f->b4_stride + t->bx];
@ -1149,6 +1150,7 @@ void bytefn(dav1d_recon_b_intra)(Dav1dTileContext *const t, const enum BlockSize
4 * (t->by * PXSTRIDE(f->cur.stride[0]) + t->bx);
const uint8_t *pal_idx;
if (f->frame_thread.pass) {
assert(ts->frame_thread.pal_idx);
pal_idx = ts->frame_thread.pal_idx;
ts->frame_thread.pal_idx += bw4 * bh4 * 16;
} else {
@ -1345,6 +1347,7 @@ void bytefn(dav1d_recon_b_intra)(Dav1dTileContext *const t, const enum BlockSize
const uint16_t (*pal)[8];
const uint8_t *pal_idx;
if (f->frame_thread.pass) {
assert(ts->frame_thread.pal_idx);
pal = f->frame_thread.pal[((t->by >> 1) + (t->bx & 1)) * (f->b4_stride >> 1) +
((t->bx >> 1) + (t->by & 1))];
pal_idx = ts->frame_thread.pal_idx;

View File

@ -406,7 +406,7 @@ const int16_t dav1d_sgr_params[16][4] = { // r0, r1, e0, e1
{ 2, 0, 22, -1 },
};
const uint8_t dav1d_sgr_x_by_x[256] = {
const uint8_t ALIGN(dav1d_sgr_x_by_x[256], 16) = {
255, 128, 85, 64, 51, 43, 37, 32, 28, 26, 23, 21, 20, 18, 17,
16, 15, 14, 13, 13, 12, 12, 11, 11, 10, 10, 9, 9, 9, 9,
8, 8, 8, 8, 7, 7, 7, 7, 7, 6, 6, 6, 6, 6, 6,

View File

@ -27,7 +27,7 @@
%if ARCH_X86_64
SECTION_RODATA
SECTION_RODATA 32
pw_1024: times 16 dw 1024
pb_mask: db 0, 0x80, 0x80, 0, 0x80, 0, 0, 0x80, 0x80, 0, 0, 0x80, 0, 0x80, 0x80, 0
rnd_next_upperbit_mask: dw 0x100B, 0x2016, 0x402C, 0x8058
@ -609,6 +609,8 @@ cglobal generate_grain_uv_420, 4, 10, 16, buf, bufy, fg_data, uv
movd xm15, [base+hmul_bits-10+shiftq*2]
pmovsxbw xm8, [fg_dataq+FGData.ar_coeffs_uv+uvq+0] ; cf0-7
pmovsxbw xm9, [fg_dataq+FGData.ar_coeffs_uv+uvq+8] ; cf8-12
vpbroadcastw xm7, [base+hmul_bits+4]
vpbroadcastd xm6, [base+pb_1]
DEFINE_ARGS buf, bufy, h, x
pshufd xm12, xm9, q0000
pshufd xm13, xm9, q1111
@ -639,31 +641,28 @@ cglobal generate_grain_uv_420, 4, 10, 16, buf, bufy, fg_data, uv
psrldq xm4, xm0, 4 ; y=-2,x=[+0,+5]
psrldq xm5, xm0, 6 ; y=-2,x=[+1,+5]
psrldq xm6, xm0, 8 ; y=-2,x=[+2,+5]
psrldq xm0, 8 ; y=-2,x=[+2,+5]
punpcklwd xm4, xm5
punpcklwd xm6, xm1
psrldq xm7, xm1, 6 ; y=-1,x=[+1,+5]
punpcklwd xm0, xm1
psrldq xm3, xm1, 6 ; y=-1,x=[+1,+5]
psrldq xm1, xm1, 8 ; y=-1,x=[+2,+5]
punpcklwd xm7, xm1
punpcklwd xm3, xm1
pmaddwd xm4, xm9
pmaddwd xm6, xm10
pmaddwd xm7, xm12
paddd xm4, xm6
paddd xm2, xm7
pmaddwd xm0, xm10
pmaddwd xm3, xm12
paddd xm4, xm0
paddd xm2, xm3
paddd xm2, xm4
vpbroadcastd xm4, [base+pb_1]
movq xm6, [bufyq+xq*2]
movq xm7, [bufyq+xq*2+82]
pmaddubsw xm6, xm4, xm6
pmaddubsw xm7, xm4, xm7
vpbroadcastw xm4, [base+hmul_bits+4]
paddw xm6, xm7
pmulhrsw xm6, xm4
pxor xm7, xm7
punpcklwd xm6, xm7
pmaddwd xm6, xm14
paddd xm2, xm6
movq xm0, [bufyq+xq*2]
movq xm3, [bufyq+xq*2+82]
pmaddubsw xm0, xm6, xm0
pmaddubsw xm3, xm6, xm3
paddw xm0, xm3
pmulhrsw xm0, xm7
punpcklwd xm0, xm0
pmaddwd xm0, xm14
paddd xm2, xm0
movq xm0, [bufq+xq-2] ; y=0,x=[-2,+5]
.x_loop_ar2_inner:
@ -807,8 +806,7 @@ cglobal generate_grain_uv_420, 4, 10, 16, buf, bufy, fg_data, uv
pmaddubsw xm1, xm13, xm1
pmaddubsw xm2, xm13, xm2
paddw xm1, xm2
vpbroadcastw xm3, xm15
pmulhrsw xm1, xm3
pmulhrsw xm1, xm15
punpcklwd xm6, xm7
punpcklwd xm8, xm9

View File

@ -1485,7 +1485,7 @@ ALIGN function_align
pmaddubsw m0, m1
pcmpgtw m1, m9, m6 ; base < max_base_x
pmulhrsw m0, m3
paddsw m6, m10 ; xpos += dx
paddw m6, m10 ; xpos += dx
lea r5, [dstq+strideq*2]
vpblendvb m0, m7, m0, m1
packuswb m0, m0
@ -1494,9 +1494,9 @@ ALIGN function_align
pextrd [r5 +strideq*1], xm0, 1
movd [dstq+strideq*0], xm1
pextrd [dstq+strideq*1], xm1, 1
lea dstq, [dstq+strideq*4]
sub hd, 4
jz .w4_end
lea dstq, [dstq+strideq*4]
cmp r3d, maxbased
jb .w4_loop
packuswb xm7, xm7
@ -1662,16 +1662,16 @@ ALIGN function_align
pshufb m0, m8
pmaddubsw m0, m1
pcmpgtw m1, m9, m2
paddsw m2, m6
paddw m2, m6
pmulhrsw m0, m3
vpblendvb m0, m7, m0, m1
vextracti128 xm1, m0, 1
packuswb xm0, xm1
movq [dstq+strideq*0], xm0
movhps [dstq+strideq*1], xm0
lea dstq, [dstq+strideq*2]
sub hd, 2
jz .w8_end
lea dstq, [dstq+strideq*2]
cmp r3d, maxbased
jb .w8_loop
packuswb xm7, xm7
@ -1788,13 +1788,13 @@ ALIGN function_align
pcmpgtw m1, m9, m6
pcmpgtw m2, m10, m6
packsswb m1, m2
paddsw m6, m11
paddw m6, m11
vpblendvb m0, m7, m0, m1
mova [dstq+strideq*0], xm0
vextracti128 [dstq+strideq*1], m0, 1
lea dstq, [dstq+strideq*2]
sub hd, 2
jz .w16_end
lea dstq, [dstq+strideq*2]
cmp r3d, maxbased
jb .w16_loop
.w16_end_loop:
@ -1903,20 +1903,20 @@ ALIGN function_align
movd xm9, maxbased
vbroadcasti128 m8, [z_filter_s+2]
vpbroadcastw m9, xm9
mov r3d, dxd
mov r5d, dxd
psubw m9, [z_base_inc]
mova m11, m6
psubw m10, m9, m3 ; 64*8
.w32_loop:
mov r5d, r3d
shr r5d, 6
mov r3d, r5d
shr r3d, 6
pand m1, m4, m6
psubw m2, m5, m1
psllw m1, 8
por m2, m1
movu m0, [tlq+r5+0]
movu m1, [tlq+r5+8]
add r3d, dxd
movu m0, [tlq+r3+0]
movu m1, [tlq+r3+8]
add r5d, dxd
pshufb m0, m8
pshufb m1, m8
pmaddubsw m0, m2
@ -1927,13 +1927,13 @@ ALIGN function_align
pcmpgtw m1, m9, m6
pcmpgtw m2, m10, m6
packsswb m1, m2
paddsw m6, m11
paddw m6, m11
vpblendvb m0, m7, m0, m1
mova [dstq], m0
add dstq, strideq
dec hd
jz .w32_end
cmp r3d, maxbased
add dstq, strideq
cmp r5d, maxbased
jb .w32_loop
test hb, 1
jz .w32_end_loop
@ -2074,25 +2074,23 @@ ALIGN function_align
mova [tlq+32*2], m0
mova [tlq+32*3], m1
.w64_main:
movd xm6, dxd
movd xm12, dxd
vpbroadcastb m7, [tlq+maxbaseq]
lea r3d, [dxq-64]
shl maxbased, 6
vpbroadcastw m6, xm6
movd xm10, maxbased
vpbroadcastw m12, xm12
sub r3d, maxbased
vbroadcasti128 m8, [z_filter_s+2]
mov r3d, dxd
vpbroadcastw m10, xm10
psllw m0, m3, 2 ; 64*32
psubw m10, [z_base_inc]
mova m14, m6
psubw m11, m10, m3 ; 64*8
psubw m12, m10, m0
psubw m13, m11, m0
movd xm6, r3d
mov r5d, dxd
mova m10, [pb_1to32]
vpbroadcastd m11, [pb_32]
vpbroadcastw m6, xm6
.w64_loop:
mov r5d, r3d
shr r5d, 6
movu m0, [tlq+r5+ 0]
movu m1, [tlq+r5+ 8]
mov r3d, r5d
shr r3d, 6
movu m0, [tlq+r3+ 0]
movu m1, [tlq+r3+ 8]
pand m2, m4, m6
psubw m9, m5, m2
psllw m2, 8
@ -2101,34 +2099,32 @@ ALIGN function_align
pshufb m1, m8
pmaddubsw m0, m9
pmaddubsw m1, m9
psraw m2, m6, 6
pmulhrsw m0, m3
pmulhrsw m1, m3
packsswb m2, m2
paddb m2, m10
packuswb m0, m1
pcmpgtw m1, m10, m6
pcmpgtw m2, m11, m6
packsswb m1, m2
vpblendvb m2, m7, m0, m1
movu m0, [tlq+r5+32]
movu m1, [tlq+r5+40]
add r3d, dxd
mova [dstq+ 0], m2
vpblendvb m0, m7, m0, m2
mova [dstq+ 0], m0
movu m0, [tlq+r3+32]
movu m1, [tlq+r3+40]
add r5d, dxd
pshufb m0, m8
pshufb m1, m8
pmaddubsw m0, m9
pmaddubsw m1, m9
pcmpgtw m9, m12, m6
pcmpgtw m2, m13, m6
paddb m2, m11
pmulhrsw m0, m3
pmulhrsw m1, m3
paddsw m6, m14
packsswb m9, m2
paddw m6, m12
packuswb m0, m1
vpblendvb m0, m7, m0, m9
vpblendvb m0, m7, m0, m2
mova [dstq+32], m0
add dstq, strideq
dec hd
jz .w64_end
cmp r3d, maxbased
add dstq, strideq
cmp r5d, maxbased
jb .w64_loop
.w64_end_loop:
mova [dstq+ 0], m7
@ -2384,7 +2380,7 @@ ALIGN function_align
vpblendvb m0, m1, m2
.w4_toponly:
pmulhrsw m0, m13
paddsw m6, m7 ; xpos += dx
paddw m6, m7 ; xpos += dx
add r5, dyq
packuswb m0, m0
vextracti128 xm1, m0, 1
@ -2392,9 +2388,9 @@ ALIGN function_align
pextrd [dstq+r9 ], xm0, 1
movd [dstq+strideq*0], xm1
pextrd [dstq+strideq*1], xm1, 1
lea dstq, [dstq+strideq*4]
sub hd, 4
jz .w4_end
lea dstq, [dstq+strideq*4]
cmp r2d, r8d
jge .w4_loop
.w4_leftonly_loop:
@ -2604,7 +2600,7 @@ ALIGN function_align
.w8_toponly:
pmulhrsw m0, m13
pmulhrsw m1, m13
paddsw m6, m4, m7 ; xpos += dx
paddw m6, m4, m7 ; xpos += dx
add r5, dyq
packuswb m0, m1
vextracti128 xm1, m0, 1
@ -2612,9 +2608,9 @@ ALIGN function_align
movhps [dstq+strideq*2], xm0
movq [dstq+strideq*1], xm1
movhps [dstq+r9 ], xm1
lea dstq, [dstq+strideq*4]
sub hd, 4
jz .w8_end
lea dstq, [dstq+strideq*4]
cmp r2d, r8d
jge .w8_loop
.w8_leftonly_loop:
@ -2841,15 +2837,15 @@ ALIGN function_align
.w16_toponly:
pmulhrsw m0, m13
pmulhrsw m1, m13
paddsw m6, m5, m7 ; xpos += dx
paddw m6, m5, m7 ; xpos += dx
sub r5, 2
packuswb m0, m1
vpermq m0, m0, q3120
mova [dstq+strideq*0], xm0
vextracti128 [dstq+strideq*1], m0, 1
lea dstq, [dstq+strideq*2]
sub hd, 2
jz .w16_end
lea dstq, [dstq+strideq*2]
cmp r2d, (63-16)<<6
jge .w16_loop
.w16_leftonly_loop:
@ -3135,9 +3131,9 @@ ALIGN function_align
vpbroadcastb m7, [r4]
lea r4, [dyq+63] ; ypos
movd xm9, maxbased
sub maxbased, 63
not maxbased
vbroadcasti128 m8, [z3_shuf_w4]
neg maxbaseq
add maxbased, 64
vpbroadcastw m9, xm9
psrlw m7, 8 ; top[max_base_y]
paddw m10, m6, m6
@ -3170,7 +3166,7 @@ ALIGN function_align
pmaddubsw m0, m1
pcmpgtw m1, m9, m6 ; base < max_base_y
pmulhrsw m0, m3
paddsw m6, m10 ; ypos += dy
paddw m6, m10 ; ypos += dy
vpblendvb m0, m7, m0, m1
vextracti128 xm1, m0, 1
packuswb xm1, xm0
@ -3179,9 +3175,9 @@ ALIGN function_align
pextrd [dstq+strideq*1], xm1, 1
pextrd [dstq+strideq*2], xm1, 2
pextrd [dstq+r7 ], xm1, 3
add dstq, 4
sub wd, 4
jz .h4_end
add dstq, 4
cmp r4d, maxbased
jg .h4_loop
packuswb xm7, xm7
@ -3344,9 +3340,9 @@ ALIGN function_align
vpbroadcastb m7, [r4]
lea r4, [dyq+63]
movd xm9, maxbased
sub maxbased, 63
not maxbased
vbroadcasti128 m8, [z3_shuf]
neg maxbaseq
add maxbased, 64
vpbroadcastw m9, xm9
psrlw m7, 8
psubw m9, m0
@ -3367,7 +3363,7 @@ ALIGN function_align
pshufb m0, m8
pmaddubsw m0, m1
pcmpgtw m1, m9, m2
paddsw m2, m6
paddw m2, m6
pmulhrsw m0, m3
vpblendvb m0, m7, m0, m1
vextracti128 xm1, m0, 1
@ -3516,9 +3512,9 @@ ALIGN function_align
vpbroadcastb m7, [r4]
lea r4, [dyq+63]
movd xm9, maxbased
sub maxbased, 63
not maxbased
vbroadcasti128 m8, [z3_shuf]
neg maxbaseq
add maxbased, 64
vpbroadcastw m9, xm9
psubw m9, m0
paddw m11, m6, m6
@ -3548,7 +3544,7 @@ ALIGN function_align
pcmpgtw m1, m9, m6
pcmpgtw m2, m10, m6
packsswb m1, m2
paddsw m6, m11
paddw m6, m11
vpblendvb m0, m7, m0, m1
vpermq m0, m0, q3120
mova [rsp], m0
@ -3742,9 +3738,9 @@ ALIGN function_align
vpbroadcastb m7, [r4]
lea r4, [dyq+63]
movd xm9, maxbased
sub maxbased, 63
not maxbased
vbroadcasti128 m8, [z3_shuf]
neg maxbaseq
add maxbased, 64
vpbroadcastw m9, xm9
psubw m9, [z_base_inc]
mova m11, m6
@ -3772,7 +3768,7 @@ ALIGN function_align
pcmpgtw m1, m9, m6
pcmpgtw m2, m10, m6
packsswb m1, m2
paddsw m6, m11
paddw m6, m11
vpblendvb m0, m7, m0, m1
mova [rsp], m0
dec wd
@ -3996,33 +3992,26 @@ ALIGN function_align
mova [tlq-63], m0
mova [tlq-31], m1
.h64_main:
movd xm6, dyd
mov r4, tlq
sub tlq, 24
neg dyq
vpbroadcastw m6, xm6
sub r4, maxbaseq
shl maxbased, 6
vpbroadcastb m7, [r4]
lea r4, [dyq+63]
movd xm10, maxbased
sub maxbased, 63
vbroadcasti128 m8, [z3_shuf]
movd xm12, dyd
neg maxbaseq
mova xm1, [z_base_inc+16]
vinserti128 m1, [z_base_inc], 1
vpbroadcastw m10, xm10
psllw m0, m3, 2 ; 64*32
psubw m10, m1
mova m14, m6
psubw m11, m10, m3 ; 64*8
psubw m12, m10, m0
psubw m13, m11, m0
vbroadcasti128 m8, [z3_shuf]
vpbroadcastb m7, [tlq+maxbaseq]
shl maxbased, 6
vpbroadcastw m12, xm12
lea r5d, [dyq+maxbaseq-64]
neg dyq
or maxbased, 63
lea r4, [dyq+63]
movd xm6, r5d
mova xm10, [pb_1to32+16]
vinserti128 m10, [pb_1to32], 1
vpbroadcastd m11, [pb_32]
vpbroadcastw m6, xm6
.h64_loop:
mov r5, r4
sar r5, 6
movu m0, [tlq+r5-0]
movu m1, [tlq+r5-8]
movu m0, [tlq+r5-24]
movu m1, [tlq+r5-32]
pand m2, m4, m6
psubw m9, m5, m2
psllw m2, 8
@ -4031,30 +4020,28 @@ ALIGN function_align
pshufb m1, m8
pmaddubsw m0, m9
pmaddubsw m1, m9
psraw m2, m6, 6
sub rsp, 64
pmulhrsw m0, m3
pmulhrsw m1, m3
packsswb m2, m2
paddb m2, m10
packuswb m0, m1
pcmpgtw m1, m10, m6
pcmpgtw m2, m11, m6
packsswb m1, m2
vpblendvb m2, m7, m0, m1
movu m0, [tlq+r5-32]
movu m1, [tlq+r5-40]
vpblendvb m0, m7, m0, m2
mova [rsp+32], m0
movu m0, [tlq+r5-56]
movu m1, [tlq+r5-64]
add r4, dyq
sub rsp, 64
mova [rsp+32], m2
pshufb m0, m8
pshufb m1, m8
pmaddubsw m0, m9
pmaddubsw m1, m9
pcmpgtw m9, m12, m6
pcmpgtw m2, m13, m6
paddb m2, m11
pmulhrsw m0, m3
pmulhrsw m1, m3
paddsw m6, m14
packsswb m9, m2
paddw m6, m12
packuswb m0, m1
vpblendvb m0, m7, m0, m9
vpblendvb m0, m7, m0, m2
mova [rsp], m0
dec wd
jz .h64_transpose

View File

@ -52,13 +52,15 @@ pw_m3803_3344: dw -3803, 3344
pw_m3803_m6688: dw -3803, -6688
pw_2896_m2896: dw 2896, -2896
pw_5: times 2 dw 5
pw_2048: times 2 dw 2048
pw_4096: times 2 dw 4096
pw_8192: times 2 dw 8192
pw_16384: times 2 dw 16384
pw_2896x8: times 2 dw 2896*8
pw_5793x4: times 2 dw 5793*4
pw_5: times 2 dw 5
pw_2048: times 2 dw 2048
pw_4096: times 2 dw 4096
pw_8192: times 2 dw 8192
pw_16384: times 2 dw 16384
pw_1697x16: times 2 dw 1697*16
pw_1697x8: times 2 dw 1697*8
pw_2896x8: times 2 dw 2896*8
pw_5793x4: times 2 dw 5793*4
pd_2048: dd 2048
@ -389,9 +391,9 @@ ALIGN function_align
%ifidn %1_%2, dct_identity
vpbroadcastd m0, [o(pw_2896x8)]
pmulhrsw m0, [cq]
vpbroadcastd m1, [o(pw_5793x4)]
paddw m0, m0
pmulhrsw m0, m1
vpbroadcastd m1, [o(pw_1697x8)]
pmulhrsw m1, m0
paddw m0, m1
punpcklwd m0, m0
punpckhdq m1, m0, m0
punpckldq m0, m0
@ -399,12 +401,12 @@ ALIGN function_align
%elifidn %1_%2, identity_dct
mova m0, [cq+16*0]
packusdw m0, [cq+16*1]
vpbroadcastd m2, [o(pw_5793x4)]
vpbroadcastd m3, [o(pw_2896x8)]
vpbroadcastd m1, [o(pw_1697x8)]
vpbroadcastd m2, [o(pw_2896x8)]
packusdw m0, m0
paddw m0, m0
pmulhrsw m1, m0
paddw m0, m1
pmulhrsw m0, m2
pmulhrsw m0, m3
mova m1, m0
jmp m(iadst_4x4_internal).end
%elif %3 >= 0
@ -556,22 +558,22 @@ INV_TXFM_4X4_FN identity, identity
cglobal iidentity_4x4_internal, 0, 5, 6, dst, stride, c, eob, tx2
mova m0, [cq+16*0]
mova m1, [cq+16*1]
vpbroadcastd m2, [o(pw_5793x4)]
paddw m0, m0
paddw m1, m1
pmulhrsw m0, m2
pmulhrsw m1, m2
vpbroadcastd m3, [o(pw_1697x8)]
pmulhrsw m2, m3, m0
pmulhrsw m3, m1
paddw m0, m2
paddw m1, m3
punpckhwd m2, m0, m1
punpcklwd m0, m1
punpckhwd m1, m0, m2
punpcklwd m0, m2
jmp tx2q
.pass2:
vpbroadcastd m2, [o(pw_5793x4)]
paddw m0, m0
paddw m1, m1
pmulhrsw m0, m2
pmulhrsw m1, m2
vpbroadcastd m3, [o(pw_1697x8)]
pmulhrsw m2, m3, m0
pmulhrsw m3, m1
paddw m0, m2
paddw m1, m3
jmp m(iadst_4x4_internal).end
%macro WRITE_4X8 2 ; coefs[1-2]
@ -619,12 +621,12 @@ cglobal iidentity_4x4_internal, 0, 5, 6, dst, stride, c, eob, tx2
movd xm1, [cq+16*2]
punpcklwd xm1, [cq+16*3]
vpbroadcastd xm2, [o(pw_2896x8)]
vpbroadcastd xm3, [o(pw_5793x4)]
vpbroadcastd xm3, [o(pw_1697x8)]
vpbroadcastd xm4, [o(pw_2048)]
punpckldq xm0, xm1
pmulhrsw xm0, xm2
paddw xm0, xm0
pmulhrsw xm0, xm3
pmulhrsw xm3, xm0
paddw xm0, xm3
pmulhrsw xm0, xm2
pmulhrsw xm0, xm4
vpbroadcastq m0, xm0
@ -896,17 +898,17 @@ cglobal iidentity_4x8_internal, 0, 5, 7, dst, stride, c, eob, tx2
vpermq m2, [cq+32*0], q3120
vpermq m0, [cq+32*1], q3120
vpbroadcastd m3, [o(pw_2896x8)]
vpbroadcastd m4, [o(pw_5793x4)]
vpbroadcastd m4, [o(pw_1697x8)]
punpcklwd m1, m2, m0
punpckhwd m2, m0
pmulhrsw m1, m3
pmulhrsw m2, m3
punpcklwd m0, m1, m2
punpckhwd m1, m2
paddw m0, m0
paddw m1, m1
pmulhrsw m0, m4
pmulhrsw m1, m4
pmulhrsw m2, m4, m0
pmulhrsw m4, m1
paddw m0, m2
paddw m1, m4
jmp tx2q
.pass2:
vpbroadcastd m4, [o(pw_4096)]
@ -919,11 +921,12 @@ cglobal iidentity_4x8_internal, 0, 5, 7, dst, stride, c, eob, tx2
vpbroadcastd m0, [o(pw_2896x8)]
pmulhrsw m0, [cq]
vpbroadcastd m1, [o(pw_16384)]
vpbroadcastd m2, [o(pw_5793x4)]
vpbroadcastd m2, [o(pw_1697x16)]
vpbroadcastd m3, [o(pw_2048)]
pmulhrsw m0, m1
psllw m0, 2
pmulhrsw m0, m2
pmulhrsw m2, m0
paddw m0, m0
paddw m0, m2
pmulhrsw m3, m0
punpcklwd m1, m3, m3
punpckhwd m3, m3
@ -937,12 +940,12 @@ cglobal iidentity_4x8_internal, 0, 5, 7, dst, stride, c, eob, tx2
punpcklwd xm0, [cq+32*1]
movd xm1, [cq+32*2]
punpcklwd xm1, [cq+32*3]
vpbroadcastd xm2, [o(pw_5793x4)]
vpbroadcastd xm2, [o(pw_1697x8)]
vpbroadcastd xm3, [o(pw_16384)]
vpbroadcastd xm4, [o(pw_2896x8)]
punpckldq xm0, xm1
paddw xm0, xm0
pmulhrsw xm0, xm2
pmulhrsw xm2, xm0
paddw xm0, xm2
pmulhrsw xm0, xm3
psrlw xm3, 3 ; pw_2048
pmulhrsw xm0, xm4
@ -1281,13 +1284,19 @@ cglobal iidentity_4x16_internal, 0, 5, 11, dst, stride, c, eob, tx2
mova m2, [cq+32*1]
mova m4, [cq+32*2]
mova m0, [cq+32*3]
vpbroadcastd m5, [o(pw_5793x4)]
vpbroadcastd m5, [o(pw_1697x8)]
punpcklwd m1, m3, m2
punpckhwd m3, m2
punpcklwd m2, m4, m0
punpckhwd m4, m0
REPX {paddw x, x }, m1, m2, m3, m4
REPX {pmulhrsw x, m5}, m1, m2, m3, m4
pmulhrsw m0, m5, m1
pmulhrsw m6, m5, m2
pmulhrsw m7, m5, m3
pmulhrsw m5, m4
paddw m1, m0
paddw m2, m6
paddw m3, m7
paddw m4, m5
vpbroadcastd m5, [o(pw_16384)]
punpckldq m0, m1, m2
punpckhdq m1, m2
@ -1296,10 +1305,17 @@ cglobal iidentity_4x16_internal, 0, 5, 11, dst, stride, c, eob, tx2
REPX {pmulhrsw x, m5}, m0, m1, m2, m3
jmp tx2q
.pass2:
vpbroadcastd m4, [o(pw_5793x4)]
vpbroadcastd m8, [o(pw_1697x16)]
vpbroadcastd m5, [o(pw_2048)]
REPX {psllw x, 2 }, m0, m1, m2, m3
REPX {pmulhrsw x, m4}, m0, m1, m2, m3
pmulhrsw m4, m8, m0
pmulhrsw m6, m8, m1
pmulhrsw m7, m8, m2
pmulhrsw m8, m3
REPX {paddw x, x}, m0, m1, m2, m3
paddw m0, m4
paddw m1, m6
paddw m2, m7
paddw m3, m8
jmp m(iadst_4x16_internal).end2
%macro WRITE_8X4 4-7 strideq*1, strideq*2, r3, ; coefs[1-2], tmp[1-2], off[1-3]
@ -1333,11 +1349,11 @@ cglobal iidentity_4x16_internal, 0, 5, 11, dst, stride, c, eob, tx2
%ifidn %1_%2, dct_identity
vpbroadcastd xm0, [o(pw_2896x8)]
pmulhrsw xm1, xm0, [cq]
vpbroadcastd xm2, [o(pw_5793x4)]
vpbroadcastd xm2, [o(pw_1697x8)]
vpbroadcastd xm3, [o(pw_2048)]
pmulhrsw xm1, xm0
paddw xm1, xm1
pmulhrsw xm1, xm2
pmulhrsw xm2, xm1
paddw xm1, xm2
pmulhrsw xm1, xm3
punpcklwd xm1, xm1
punpckldq xm0, xm1, xm1
@ -1508,11 +1524,11 @@ cglobal iidentity_8x4_internal, 0, 5, 7, dst, stride, c, eob, tx2
paddw m1, m1
jmp tx2q
.pass2:
vpbroadcastd m2, [o(pw_5793x4)]
paddw m0, m0
paddw m1, m1
pmulhrsw m0, m2
pmulhrsw m1, m2
vpbroadcastd m3, [o(pw_1697x8)]
pmulhrsw m2, m3, m0
pmulhrsw m3, m1
paddw m0, m2
paddw m1, m3
jmp m(iadst_8x4_internal).end
%macro INV_TXFM_8X8_FN 2-3 -1 ; type1, type2, fast_thresh
@ -1773,14 +1789,15 @@ cglobal iidentity_8x8_internal, 0, 5, 7, dst, stride, c, eob, tx2
vpbroadcastd m0, [o(pw_2896x8)]
pmulhrsw m7, m0, [cq]
vpbroadcastd m1, [o(pw_16384)]
vpbroadcastd m2, [o(pw_5793x4)]
vpbroadcastd m2, [o(pw_1697x16)]
pxor m3, m3
mova [cq], m3
pmulhrsw m7, m0
pmulhrsw m7, m1
psrlw m1, 3 ; pw_2048
psllw m7, 2
pmulhrsw m7, m2
pmulhrsw m2, m7
paddw m7, m7
paddw m7, m2
pmulhrsw m7, m1
punpcklwd m5, m7, m7
punpckhwd m7, m7
@ -2101,6 +2118,16 @@ INV_TXFM_8X16_FN identity, adst
INV_TXFM_8X16_FN identity, flipadst
INV_TXFM_8X16_FN identity, identity
%macro IDTX16 3-4 ; src/dst, tmp, pw_1697x16, [pw_16394]
pmulhrsw m%2, m%3, m%1
%if %0 == 4 ; if we're going to downshift by 1 doing so here eliminates the paddw
pmulhrsw m%2, m%4
%else
paddw m%1, m%1
%endif
paddw m%1, m%2
%endmacro
cglobal iidentity_8x16_internal, 0, 5, 13, dst, stride, c, eob, tx2
mova xm3, [cq+16*0]
mova xm2, [cq+16*2]
@ -2139,10 +2166,9 @@ cglobal iidentity_8x16_internal, 0, 5, 13, dst, stride, c, eob, tx2
punpckhdq m7, m8
jmp tx2q
.pass2:
vpbroadcastd m8, [o(pw_5793x4)]
REPX {psllw x, 2 }, m0, m1, m2, m3, m4, m5, m6, m7
vpbroadcastd m8, [o(pw_1697x16)]
REPX {vpermq x, x, q3120}, m0, m1, m2, m3, m4, m5, m6, m7
REPX {pmulhrsw x, m8 }, m0, m1, m2, m3, m4, m5, m6, m7
REPX {IDTX16 x, 9, 8}, 0, 1, 2, 3, 4, 5, 6, 7
jmp m(idct_8x16_internal).end
%macro WRITE_16X2 6 ; coefs[1-2], tmp[1-2], offset[1-2]
@ -2171,11 +2197,11 @@ cglobal iidentity_8x16_internal, 0, 5, 13, dst, stride, c, eob, tx2
vpbroadcastd xm3, [o(pw_2896x8)]
pmulhrsw xm3, [cq]
vpbroadcastd xm0, [o(pw_16384)]
vpbroadcastd xm1, [o(pw_5793x4)]
vpbroadcastd xm1, [o(pw_1697x8)]
pmulhrsw xm3, xm0
psrlw xm0, 3 ; pw_2048
paddw xm3, xm3
pmulhrsw xm3, xm1
pmulhrsw xm1, xm3
paddw xm3, xm1
pmulhrsw xm3, xm0
punpcklwd xm3, xm3
punpckldq xm1, xm3, xm3
@ -2194,15 +2220,15 @@ cglobal iidentity_8x16_internal, 0, 5, 13, dst, stride, c, eob, tx2
mova xm3, [cq+16*3]
vinserti128 m1, m1, [cq+16*6], 1
vinserti128 m3, m3, [cq+16*7], 1
vpbroadcastd m4, [o(pw_5793x4)]
vpbroadcastd m4, [o(pw_1697x16)]
vpbroadcastd m5, [o(pw_16384)]
packusdw m0, m2
packusdw m1, m3
packusdw m0, m1
vpbroadcastd m1, [o(pw_2896x8)]
psllw m0, 2
pmulhrsw m0, m4
pmulhrsw m0, m5
pmulhrsw m4, m0
pmulhrsw m4, m5
paddw m0, m4
psrlw m5, 3 ; pw_2048
pmulhrsw m0, m1
pmulhrsw m0, m5
@ -2462,28 +2488,40 @@ cglobal iidentity_16x4_internal, 0, 5, 11, dst, stride, c, eob, tx2
mova xm1, [cq+16*3]
vinserti128 m0, m0, [cq+16*6], 1
vinserti128 m1, m1, [cq+16*7], 1
vpbroadcastd m5, [o(pw_5793x4)]
vpbroadcastd m7, [o(pw_1697x16)]
vpbroadcastd m8, [o(pw_16384)]
punpcklwd m3, m2, m4
punpckhwd m2, m4
punpcklwd m4, m0, m1
punpckhwd m0, m1
REPX {psllw x, 2}, m3, m2, m4, m0
punpcklwd m1, m3, m2
punpckhwd m3, m2
punpcklwd m2, m4, m0
punpckhwd m4, m0
REPX {pmulhrsw x, m5}, m1, m3, m2, m4
vpbroadcastd m5, [o(pw_16384)]
pmulhrsw m0, m7, m1
pmulhrsw m5, m7, m2
pmulhrsw m6, m7, m3
pmulhrsw m7, m4
REPX {pmulhrsw x, m8}, m0, m5, m6, m7
paddw m1, m0
paddw m2, m5
paddw m3, m6
paddw m4, m7
punpcklqdq m0, m1, m2
punpckhqdq m1, m2
punpcklqdq m2, m3, m4
punpckhqdq m3, m4
REPX {pmulhrsw x, m5}, m0, m1, m2, m3
jmp tx2q
.pass2:
vpbroadcastd m4, [o(pw_5793x4)]
REPX {paddw x, x }, m0, m1, m2, m3
REPX {pmulhrsw x, m4}, m0, m1, m2, m3
vpbroadcastd m7, [o(pw_1697x8)]
pmulhrsw m4, m7, m0
pmulhrsw m5, m7, m1
pmulhrsw m6, m7, m2
pmulhrsw m7, m3
paddw m0, m4
paddw m1, m5
paddw m2, m6
paddw m3, m7
jmp m(iadst_16x4_internal).end
%macro INV_TXFM_16X8_FN 2-3 -1 ; type1, type2, fast_thresh
@ -2532,7 +2570,7 @@ cglobal iidentity_16x4_internal, 0, 5, 11, dst, stride, c, eob, tx2
mova m3, [cq+32*6]
packusdw m3, [cq+32*7]
vpbroadcastd m4, [o(pw_2896x8)]
vpbroadcastd m5, [o(pw_5793x4)]
vpbroadcastd m5, [o(pw_1697x16)]
packusdw m0, m2
packusdw m1, m3
vpbroadcastd m2, [o(pw_16384)]
@ -2541,9 +2579,9 @@ cglobal iidentity_16x4_internal, 0, 5, 11, dst, stride, c, eob, tx2
vpermq m0, m0, q1100
punpcklwd m0, m1
pmulhrsw m0, m4
psllw m0, 2
pmulhrsw m0, m5
pmulhrsw m0, m2
pmulhrsw m5, m0
pmulhrsw m5, m2
paddw m0, m5
psrlw m2, 3 ; pw_2048
pmulhrsw m0, m4
pmulhrsw m0, m2
@ -2816,8 +2854,8 @@ cglobal iidentity_16x8_internal, 0, 5, 13, dst, stride, c, eob, tx2
mova xm1, [cq-16*1]
vinserti128 m0, m0, [cq+16*6], 1
vinserti128 m1, m1, [cq+16*7], 1
vpbroadcastd m9, [o(pw_5793x4)]
vpbroadcastd m10, [o(pw_16384)]
vpbroadcastd m10, [o(pw_1697x16)]
vpbroadcastd m11, [o(pw_16384)]
REPX {pmulhrsw x, m3}, m7, m2, m6, m4, m8, m5, m0, m1
punpcklwd m3, m7, m2
punpckhwd m7, m2
@ -2827,7 +2865,6 @@ cglobal iidentity_16x8_internal, 0, 5, 13, dst, stride, c, eob, tx2
punpckhwd m8, m5
punpcklwd m5, m0, m1
punpckhwd m0, m1
REPX {psllw x, 2}, m3, m7, m2, m6, m4, m8, m5, m0
punpckldq m1, m3, m2
punpckhdq m3, m2
punpckldq m2, m4, m5
@ -2836,7 +2873,7 @@ cglobal iidentity_16x8_internal, 0, 5, 13, dst, stride, c, eob, tx2
punpckhdq m7, m6
punpckldq m6, m8, m0
punpckhdq m8, m0
REPX {pmulhrsw x, m9}, m1, m3, m2, m4, m5, m7, m6, m8
REPX {IDTX16 x, 0, 10, 11}, 1, 3, 2, 4, 5, 7, 6, 8
punpcklqdq m0, m1, m2
punpckhqdq m1, m2
punpcklqdq m2, m3, m4
@ -2845,7 +2882,6 @@ cglobal iidentity_16x8_internal, 0, 5, 13, dst, stride, c, eob, tx2
punpckhqdq m5, m6
punpcklqdq m6, m7, m8
punpckhqdq m7, m8
REPX {pmulhrsw x, m10}, m0, m1, m2, m3, m4, m5, m6, m7
jmp tx2q
.pass2:
vpbroadcastd m8, [o(pw_4096)]
@ -2916,14 +2952,15 @@ cglobal iidentity_16x8_internal, 0, 5, 13, dst, stride, c, eob, tx2
vinserti128 m2, m2, [cq+32*14], 1
vinserti128 m4, m4, [cq+32*15], 1
punpcklwd m1, m3
vpbroadcastd m3, [o(pw_5793x4)]
vpbroadcastd m3, [o(pw_1697x16)]
punpcklwd m2, m4
vpbroadcastd m4, [o(pw_8192)]
punpckldq m1, m2
vpbroadcastd m2, [o(pw_2896x8)]
punpcklqdq m0, m1
psllw m0, 2
pmulhrsw m0, m3
pmulhrsw m3, m0
paddw m0, m0
paddw m0, m3
pmulhrsw m0, m4
psrlw m4, 2 ; pw_2048
pmulhrsw m0, m2
@ -3352,47 +3389,47 @@ INV_TXFM_16X16_FN identity, dct, 15
INV_TXFM_16X16_FN identity, identity
cglobal iidentity_16x16_internal, 0, 5, 16, 32*3, dst, stride, c, eob, tx2
mova xm0, [cq+16*0]
mova xm15, [cq+16*1]
mova xm1, [cq+16*2]
mova xm8, [cq+16*3]
mova xm2, [cq+16*4]
mova xm9, [cq+16*5]
mova xm3, [cq+16*6]
mova xm10, [cq+16*7]
vpbroadcastd m7, [o(pw_1697x16)]
mova xm0, [cq+16* 0]
vinserti128 m0, [cq+16*16], 1
mova xm15, [cq+16* 1]
vinserti128 m15, [cq+16*17], 1
mova xm1, [cq+16* 2]
vinserti128 m1, [cq+16*18], 1
mova xm8, [cq+16* 3]
vinserti128 m8, [cq+16*19], 1
mova xm2, [cq+16* 4]
vinserti128 m2, [cq+16*20], 1
mova xm9, [cq+16* 5]
vinserti128 m9, [cq+16*21], 1
mova xm3, [cq+16* 6]
vinserti128 m3, [cq+16*22], 1
mova xm10, [cq+16* 7]
add cq, 16*16
vinserti128 m0, m0, [cq+16*0], 1
vinserti128 m15, m15, [cq+16*1], 1
mova xm4, [cq-16*8]
mova xm11, [cq-16*7]
vinserti128 m1, m1, [cq+16*2], 1
vinserti128 m8, m8, [cq+16*3], 1
mova xm5, [cq-16*6]
mova xm12, [cq-16*5]
vinserti128 m2, m2, [cq+16*4], 1
vinserti128 m9, m9, [cq+16*5], 1
mova xm6, [cq-16*4]
mova xm13, [cq-16*3]
vinserti128 m3, m3, [cq+16*6], 1
vinserti128 m10, m10, [cq+16*7], 1
mova xm7, [cq-16*2]
mova xm14, [cq-16*1]
vinserti128 m4, m4, [cq+16*8], 1
vinserti128 m11, m11, [cq+16*9], 1
vinserti128 m5, m5, [cq+16*10], 1
vinserti128 m12, m12, [cq+16*11], 1
vinserti128 m6, m6, [cq+16*12], 1
vinserti128 m13, m13, [cq+16*13], 1
vinserti128 m7, m7, [cq+16*14], 1
vinserti128 m14, m14, [cq+16*15], 1
REPX {psllw x, 2 }, m0, m1, m2, m3, m4, m5, m6, m7, \
m8, m9, m10, m11, m12, m13, m14, m15
mova [rsp], m0
vpbroadcastd m0, [o(pw_5793x4)]
REPX {pmulhrsw x, m0}, m1, m2, m3, m4, m5, m6, m7, \
m8, m9, m10, m11, m12, m13, m14, m15
pmulhrsw m0, [rsp]
vinserti128 m10, [cq+16* 7], 1
mova xm4, [cq-16* 8]
vinserti128 m4, [cq+16* 8], 1
mova xm11, [cq-16* 7]
vinserti128 m11, [cq+16* 9], 1
mova xm5, [cq-16* 6]
vinserti128 m5, [cq+16*10], 1
mova xm12, [cq-16* 5]
vinserti128 m12, [cq+16*11], 1
mova xm13, [cq-16* 3]
vinserti128 m13, [cq+16*13], 1
mova xm14, [cq-16* 1]
vinserti128 m14, [cq+16*15], 1
REPX {IDTX16 x, 6, 7}, 0, 15, 1, 8, 2, 9, 3, \
10, 4, 11, 5, 12, 13, 14
mova xm6, [cq-16* 4]
vinserti128 m6, [cq+16*12], 1
mova [rsp], m1
IDTX16 6, 1, 7
mova xm1, [cq-16* 2]
vinserti128 m1, [cq+16*14], 1
pmulhrsw m7, m1
paddw m1, m1
paddw m7, m1
vpbroadcastd m1, [o(pw_8192)]
REPX {pmulhrsw x, m1}, m0, m2, m3, m4, m5, m6, m7, \
m8, m9, m10, m11, m12, m13, m14, m15
@ -3401,14 +3438,17 @@ cglobal iidentity_16x16_internal, 0, 5, 16, 32*3, dst, stride, c, eob, tx2
jmp m(idct_16x16_internal).pass1_end3
ALIGN function_align
.pass2:
vpbroadcastd m15, [o(pw_5793x4)]
REPX {psllw x, 2 }, m0, m1, m2, m3, m4, m5, m6, m7
REPX {pmulhrsw x, m15}, m0, m1, m2, m3, m4, m5, m6, m7
vpbroadcastd m15, [o(pw_1697x16)]
mova [rsp+32*1], m0
REPX {IDTX16 x, 0, 15}, 1, 2, 3, 4, 5, 6, 7, \
8, 9, 10, 11, 12, 13, 14
mova m0, [rsp+32*1]
mova [rsp+32*1], m1
IDTX16 0, 1, 15
mova m1, [rsp+32*0]
REPX {psllw x, 2 }, m8, m9, m10, m11, m12, m13, m14, m1
REPX {pmulhrsw x, m15}, m8, m9, m10, m11, m12, m13, m14
pmulhrsw m15, m1
paddw m1, m1
paddw m15, m1
jmp m(idct_16x16_internal).end
%define o_base iadst4_dconly2a + 128
@ -4606,7 +4646,7 @@ cglobal inv_txfm_add_identity_identity_32x16, 4, 6, 12, dst, stride, c, eob
%undef cmp
lea rax, [o_base]
vpbroadcastd m9, [o(pw_2896x8)]
vpbroadcastd m10, [o(pw_5793x4)]
vpbroadcastd m10, [o(pw_1697x8)]
vpbroadcastd m11, [o(pw_2048)]
cmp eobd, 35 ; if (eob > 35)
setg r4b ; iteration_count++
@ -4634,9 +4674,24 @@ cglobal inv_txfm_add_identity_identity_32x16, 4, 6, 12, dst, stride, c, eob
vinserti128 m6, m6, [cq+32*14], 1
vinserti128 m7, m7, [cq+32*15], 1
REPX {pmulhrsw x, m9 }, m0, m1, m2, m3, m4, m5, m6, m7
REPX {psllw x, 3 }, m0, m1, m2, m3, m4, m5, m6, m7
REPX {psllw x, 2 }, m0, m1, m2, m3, m4, m5, m6, m7
call m(inv_txfm_add_identity_identity_8x32).transpose8x8
REPX {pmulhrsw x, m10}, m0, m1, m2, m3, m4, m5, m6, m7
pmulhrsw m8, m10, m0
paddw m0, m8
pmulhrsw m8, m10, m1
paddw m1, m8
pmulhrsw m8, m10, m2
paddw m2, m8
pmulhrsw m8, m10, m3
paddw m3, m8
pmulhrsw m8, m10, m4
paddw m4, m8
pmulhrsw m8, m10, m5
paddw m5, m8
pmulhrsw m8, m10, m6
paddw m6, m8
pmulhrsw m8, m10, m7
paddw m7, m8
REPX {pmulhrsw x, m11}, m0, m1, m2, m3, m4, m5, m6, m7
WRITE_16X2 0, 1, 8, 0, strideq*0, strideq*1
WRITE_16X2 2, 3, 0, 1, strideq*2, r3

View File

@ -73,6 +73,8 @@ pw_m2048: times 8 dw -2048
pw_4096: times 8 dw 4096
pw_16384: times 8 dw 16384
pw_m16384: times 8 dw -16384
pw_1697x16: times 8 dw 1697*16
pw_1697x8: times 8 dw 1697*8
pw_2896x8: times 8 dw 2896*8
pw_3344x8: times 8 dw 3344*8
pw_5793x4: times 8 dw 5793*4
@ -273,8 +275,8 @@ ALIGN function_align
%ifidn %1_%2, dct_identity
mova m0, [o(pw_2896x8)]
pmulhrsw m0, [coeffq]
paddw m0, m0
pmulhrsw m0, [o(pw_5793x4)]
pmulhrsw m1, m0, [o(pw_1697x8)]
paddw m0, m1
punpcklwd m0, m0
punpckhdq m1, m0, m0
punpckldq m0, m0
@ -286,8 +288,8 @@ ALIGN function_align
punpckhwd m1, m2
punpcklwd m0, m1
punpcklqdq m0, m0
paddw m0, m0
pmulhrsw m0, [o(pw_5793x4)]
pmulhrsw m1, m0, [o(pw_1697x8)]
paddw m0, m1
pmulhrsw m0, [o(pw_2896x8)]
mova m1, m0
TAIL_CALL m(iadst_4x4_internal).end
@ -434,12 +436,11 @@ INV_TXFM_4X4_FN identity, identity
cglobal iidentity_4x4_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
mova m0, [coeffq+16*0]
mova m1, [coeffq+16*1]
mova m2, [o(pw_5793x4)]
paddw m0, m0
paddw m1, m1
pmulhrsw m0, m2
pmulhrsw m1, m2
mova m3, [o(pw_1697x8)]
pmulhrsw m2, m0, m3
pmulhrsw m3, m1
paddw m0, m2
paddw m1, m3
punpckhwd m2, m0, m1
punpcklwd m0, m1
punpckhwd m1, m0, m2 ;high: in3 ;low :in2
@ -447,11 +448,11 @@ cglobal iidentity_4x4_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
jmp tx2q
.pass2:
mova m2, [o(pw_5793x4)]
paddw m0, m0
paddw m1, m1
pmulhrsw m0, m2
pmulhrsw m1, m2
mova m3, [o(pw_1697x8)]
pmulhrsw m2, m3, m0
pmulhrsw m3, m1
paddw m0, m2
paddw m1, m3
jmp m(iadst_4x4_internal).end
%macro IWHT4_1D_PACKED 0
@ -609,8 +610,8 @@ cglobal inv_txfm_add_wht_wht_4x4, 3, 3, 4, dst, stride, coeff
mova m2, [o(pw_2896x8)]
punpckldq m0, m1
pmulhrsw m0, m2
paddw m0, m0
pmulhrsw m0, [o(pw_5793x4)]
pmulhrsw m1, m0, [o(pw_1697x8)]
paddw m0, m1
pmulhrsw m0, m2
pmulhrsw m0, [o(pw_2048)]
punpcklqdq m0, m0
@ -828,16 +829,15 @@ cglobal iidentity_4x8_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
pmulhrsw m3, [coeffq+16*3]
.pass1:
mova m5, [o(pw_5793x4)]
paddw m0, m0
paddw m1, m1
paddw m2, m2
paddw m3, m3
pmulhrsw m0, m5
pmulhrsw m1, m5
pmulhrsw m2, m5
pmulhrsw m3, m5
mova m7, [o(pw_1697x8)]
pmulhrsw m4, m7, m0
pmulhrsw m5, m7, m1
pmulhrsw m6, m7, m2
pmulhrsw m7, m3
paddw m0, m4
paddw m1, m5
paddw m2, m6
paddw m3, m7
jmp m(iadst_4x8_internal).pass1_end
.pass2:
@ -880,8 +880,8 @@ cglobal iidentity_4x8_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
mova m0, [o(pw_2896x8)]
pmulhrsw m1, m0, [coeffq]
pmulhrsw m1, m0
paddw m1, m1
pmulhrsw m1, [o(pw_5793x4)]
pmulhrsw m0, m1, [o(pw_1697x8)]
paddw m1, m0
pmulhrsw m1, [o(pw_2048)]
punpcklwd m1, m1
punpckhdq m2, m1, m1
@ -1180,15 +1180,15 @@ cglobal iidentity_8x4_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
jmp tx2q
.pass2:
mova m4, [o(pw_5793x4)]
paddw m0, m0
paddw m1, m1
paddw m2, m2
paddw m3, m3
pmulhrsw m0, m4
pmulhrsw m1, m4
pmulhrsw m2, m4
pmulhrsw m3, m4
mova m7, [o(pw_1697x8)]
pmulhrsw m4, m7, m0
pmulhrsw m5, m7, m1
pmulhrsw m6, m7, m2
pmulhrsw m7, m3
paddw m0, m4
paddw m1, m5
paddw m2, m6
paddw m3, m7
jmp m(iadst_8x4_internal).end
%macro INV_TXFM_8X8_FN 2-3 -1 ; type1, type2, fast_thresh
@ -1635,14 +1635,16 @@ ALIGN function_align
pmulhrsw m0, [coeffq+16*0]
pmulhrsw m1, [coeffq+16*1]
mova m2, [o(pw_16384)]
mova m3, [o(pw_5793x4)]
mova m3, [o(pw_1697x16)]
mova m4, [o(pw_2048)]
pmulhrsw m0, m2
pmulhrsw m1, m2
psllw m0, 2
psllw m1, 2
pmulhrsw m0, m3
pmulhrsw m1, m3
pmulhrsw m2, m3, m0
pmulhrsw m3, m1
paddw m0, m0
paddw m1, m1
paddw m0, m2
paddw m1, m3
pmulhrsw m0, m4
pmulhrsw m4, m1
punpckhwd m2, m0, m0
@ -1664,12 +1666,11 @@ ALIGN function_align
punpcklwd m0, [coeffq+32*1]
movd m1, [coeffq+32*2]
punpcklwd m1, [coeffq+32*3]
mova m2, [o(pw_5793x4)]
mova m3, [o(pw_16384)]
mova m4, [o(pw_2896x8)]
punpckldq m0, m1
paddw m0, m0
pmulhrsw m0, m2
pmulhrsw m1, m0, [o(pw_1697x8)]
mova m4, [o(pw_2896x8)]
paddw m0, m1
pmulhrsw m0, m3
psrlw m3, 3 ; pw_2048
pmulhrsw m0, m4
@ -1885,17 +1886,27 @@ INV_TXFM_4X16_FN identity, adst
INV_TXFM_4X16_FN identity, flipadst
INV_TXFM_4X16_FN identity, identity
%macro IDTX16 3 ; src/dst, tmp, pw_1697x16
pmulhrsw m%2, m%3, m%1
paddw m%1, m%1
paddw m%1, m%2
%endmacro
cglobal iidentity_4x16_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
lea r3, [o(m(iidentity_4x8_internal).pass1)]
jmp m(idct_4x16_internal).pass1
.pass2:
mova m7, [o(pw_5793x4)]
REPX {psllw x, 2 }, m0, m1, m2, m3, m4, m5, m6
REPX {pmulhrsw x, m7}, m0, m1, m2, m3, m4, m5, m6
psllw m7, [coeffq+16*7], 2
pmulhrsw m7, [o(pw_5793x4)]
mova [coeffq+16*7], m7
mova m7, [o(pw_1697x16)]
mova [coeffq+16*6], m6
REPX {IDTX16 x, 6, 7}, 0, 1, 2, 3, 4, 5
mova m6, [coeffq+16*7]
IDTX16 6, 7, 7
mova [coeffq+16*7], m6
mova m6, [coeffq+16*6]
pmulhrsw m7, m6, [o(pw_1697x16)]
paddw m6, m6
paddw m6, m7
mova m7, [o(pw_2048)]
REPX {pmulhrsw x, m7}, m0, m1, m2, m3, m4, m5, m6
@ -1913,8 +1924,8 @@ cglobal iidentity_4x16_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
mova m0, [o(pw_16384)]
pmulhrsw m3, m0
psrlw m0, 3 ; pw_2048
paddw m3, m3
pmulhrsw m3, [o(pw_5793x4)]
pmulhrsw m1, m3, [o(pw_1697x8)]
paddw m3, m1
pmulhrsw m3, m0
punpcklwd m3, m3
pshufd m0, m3, q0000
@ -1927,28 +1938,28 @@ cglobal iidentity_4x16_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
mov dstq, tx2q
TAIL_CALL m(iadst_8x4_internal).end2
%elifidn %1_%2, identity_dct
mova m4, [o(pw_1697x16)]
mova m5, [o(pw_16384)]
mova m6, [o(pw_5793x4)]
mova m7, [o(pw_2896x8)]
mova m6, [o(pw_2896x8)]
mov r3d, 2
psrlw m7, m5, 3 ; pw_2048
.main_loop:
mova m0, [coeffq+16*0]
mova m1, [coeffq+16*1]
mova m2, [coeffq+16*2]
mova m3, [coeffq+16*3]
punpckhwd m4, m0, m1
punpckhwd m2, m0, m1
punpcklwd m0, m1
punpckhwd m1, m2, m3
punpcklwd m2, m3
punpcklwd m0, m4
punpcklwd m2, m1
punpcklqdq m0, m2
psllw m0, 2
punpcklwd m0, m2
mova m1, [coeffq+16*2]
mova m2, [coeffq+16*3]
punpckhwd m3, m1, m2
punpcklwd m1, m2
punpcklwd m1, m3
punpcklqdq m0, m1
pmulhrsw m1, m4, m0
pmulhrsw m1, m5
paddw m0, m1
pmulhrsw m0, m6
pmulhrsw m0, m5
psrlw m1, m5, 3 ; pw_2048
pmulhrsw m0, m7
pmulhrsw m0, m1
.end:
pxor m3, m3
mova [coeffq+16*0], m3
@ -2412,22 +2423,56 @@ INV_TXFM_16X4_FN identity, flipadst
INV_TXFM_16X4_FN identity, identity
cglobal iidentity_16x4_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
LOAD_7ROWS coeffq, 16
mova m7, [o(pw_5793x4)]
REPX {psllw x, 2 }, m0, m1, m2, m3, m4, m5, m6
REPX {pmulhrsw x, m7}, m0, m1, m2, m3, m4, m5, m6
mova m1, [coeffq+16*6]
mova m0, [coeffq+16*5]
mova m2, [coeffq+16*7]
mova m6, [o(pw_1697x16)]
mova m7, [o(pw_16384)]
pmulhrsw m4, m6, m1
pmulhrsw m3, m6, m0
pmulhrsw m5, m6, m2
pmulhrsw m4, m7
pmulhrsw m3, m7
pmulhrsw m5, m7
paddw m1, m4
paddw m0, m3
paddw m5, m2
mova m2, [coeffq+16*2]
mova m3, [coeffq+16*3]
mova m4, [coeffq+16*4]
mova [coeffq+16*6], m1
mova [coeffq+16*5], m0
mova [coeffq+16*7], m5
pmulhrsw m0, m6, m2
pmulhrsw m1, m6, m3
pmulhrsw m5, m6, m4
pmulhrsw m0, m7
pmulhrsw m1, m7
pmulhrsw m5, m7
paddw m2, m0
paddw m3, m1
paddw m4, m5
mova m0, [coeffq+16*0]
mova m1, [coeffq+16*1]
pmulhrsw m5, m6, m0
pmulhrsw m6, m1
pmulhrsw m5, m7
pmulhrsw m6, m7
paddw m0, m5
paddw m1, m6
mova m6, [coeffq+16*6]
mova m5, [coeffq+16*5]
punpckhwd m7, m0, m2 ;packed out1, out5
punpcklwd m0, m2 ;packed out0, out4
punpckhwd m2, m1, m3 ;packed out3, out7
punpcklwd m1, m3 ;packed out2, out6
mova [coeffq+16*6], m7
psllw m7, [coeffq+16*7], 2
pmulhrsw m7, [o(pw_5793x4)]
mova m7, [coeffq+16*7]
punpckhwd m3, m4, m6 ;packed out9, out13
punpcklwd m4, m6 ;packed out8, out12
punpckhwd m6, m5, m7 ;packed out11, out15
punpcklwd m5, m7 ;packed out10, out14
jmp m(idct_16x4_internal).pass1_end2
jmp m(idct_16x4_internal).pass1_end3
.pass2:
lea tx2q, [o(m(iidentity_8x4_internal).pass2)]
@ -2475,8 +2520,9 @@ cglobal iidentity_16x4_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
pmulhrsw m7, m0
pmulhrsw m7, m1
psrlw m1, 3 ; pw_2048
psllw m7, 2
pmulhrsw m7, [o(pw_5793x4)]
pmulhrsw m0, m7, [o(pw_1697x16)]
paddw m7, m7
paddw m7, m0
pmulhrsw m7, m1
punpcklwd m0, m7, m7
punpckhwd m7, m7
@ -2720,16 +2766,21 @@ cglobal iidentity_8x16_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
lea tx2q, [o(m(iidentity_8x16_internal).end1)]
.end:
REPX {psllw x, 2 }, m0, m1, m2, m3, m4, m5, m6, m7
pmulhrsw m7, [o(pw_5793x4)]
pmulhrsw m7, [o(pw_2048)]
mova [rsp+gprsize+16*0], m7
mova m7, [o(pw_5793x4)]
REPX {pmulhrsw x, m7}, m0, m1, m2, m3, m4, m5, m6
mova [rsp+gprsize+16*1], m6
mova m7, [o(pw_1697x16)]
REPX {IDTX16 x, 6, 7}, 0, 1, 2, 3, 4, 5
mova m6, [rsp+gprsize+16*1]
mova [rsp+gprsize+16*2], m5
IDTX16 6, 5, 7
mova m5, [rsp+gprsize+16*0]
IDTX16 5, 7, 7
mova m7, [o(pw_2048)]
REPX {pmulhrsw x, m7}, m0, m1, m2, m3, m4, m5, m6
pmulhrsw m7, [rsp+gprsize+16*2]
mova [rsp+gprsize+16*0], m5
mova [rsp+gprsize+16*1], m6
mova [rsp+gprsize+16*2], m5
mova [rsp+gprsize+16*2], m7
jmp m(idct_8x8_internal).end3
.end1:
@ -2787,32 +2838,32 @@ cglobal iidentity_8x16_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
mov dstq, tx2q
TAIL_CALL m(iadst_8x4_internal).end2
%elifidn %1_%2, identity_dct
mova m5, [o(pw_16384)]
mova m6, [o(pw_5793x4)]
mova m7, [o(pw_2896x8)]
pxor m4, m4
mova m4, [o(pw_2896x8)]
mova m5, [o(pw_1697x16)]
mova m6, [o(pw_16384)]
psrlw m7, m6, 3 ; pw_2048
mov r3d, 2
.main_loop:
mova m0, [coeffq+16*0]
punpcklwd m0, [coeffq+16*1]
mova m1, [coeffq+16*2]
punpcklwd m1, [coeffq+16*3]
mova m2, [coeffq+16*4]
punpcklwd m2, [coeffq+16*5]
mova m3, [coeffq+16*6]
punpcklwd m3, [coeffq+16*7]
punpckldq m0, m1
punpckldq m2, m3
punpcklqdq m0, m2
mova m1, [coeffq+16*4]
punpcklwd m1, [coeffq+16*5]
mova m2, [coeffq+16*6]
punpcklwd m2, [coeffq+16*7]
punpckldq m1, m2
punpcklqdq m0, m1
pmulhrsw m0, m4
pmulhrsw m1, m5, m0
pmulhrsw m1, m6
paddw m0, m1
pmulhrsw m0, m4
pmulhrsw m0, m7
psllw m0, 2
pmulhrsw m0, m6
pmulhrsw m0, m5
psrlw m1, m5, 3 ; pw_2048
pmulhrsw m0, m7
pmulhrsw m0, m1
.end:
REPX {mova [coeffq+16*x], m4}, 0, 1, 2, 3, 4, 5, 6, 7
pxor m1, m1
REPX {mova [coeffq+16*x], m1}, 0, 1, 2, 3, 4, 5, 6, 7
add coeffq, 16*8
lea tx2q, [dstq+8]
WRITE_8X4 0, 0, 0, 0, 1, 2, 3
@ -3292,40 +3343,66 @@ INV_TXFM_16X8_FN identity, flipadst
INV_TXFM_16X8_FN identity, identity
cglobal iidentity_16x8_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
LOAD_8ROWS coeffq+16*8, 16, 1
add coeffq, 16*16
mova m4, [coeffq-16*7]
mova m5, [coeffq-16*5]
mova m6, [coeffq-16*3]
mova m7, [coeffq-16*1]
mov r3, tx2q
lea tx2q, [o(m(iidentity_16x8_internal).pass1_end)]
.pass1:
REPX {psllw x, 2 }, m0, m1, m2, m3, m4, m5, m6, m7
pmulhrsw m7, [o(pw_5793x4)]
mova [rsp+gprsize+16*0], m7
mova m7, [o(pw_5793x4)]
REPX {pmulhrsw x, m7}, m0, m1, m2, m3, m4, m5, m6
jmp m(idct_8x8_internal).pass1_end
mova m0, [o(pw_2896x8)]
mova m2, [o(pw_1697x16)]
mova m3, [o(pw_16384)]
sub coeffq, 8*16
REPX {pmulhrsw x, m0}, m4, m5, m6, m7
pmulhrsw m1, m2, m4
pmulhrsw m1, m3
paddw m1, m4 ; 1
pmulhrsw m4, m2, m5
pmulhrsw m4, m3
paddw m4, m5 ; 3
pmulhrsw m5, m2, m6
pmulhrsw m5, m3
paddw m5, m6 ; 5
pmulhrsw m6, m2, m7
pmulhrsw m6, m3
paddw m7, m6 ; 7
pmulhrsw m6, m0, [coeffq+16*6]
mova [rsp+gprsize+16*0], m4
pmulhrsw m4, m2, m6
pmulhrsw m4, m3
paddw m6, m4 ; 6
pmulhrsw m4, m0, [coeffq+16*4]
mova [rsp+gprsize+16*1], m6
pmulhrsw m6, m2, m4
pmulhrsw m6, m3
paddw m4, m6 ; 4
pmulhrsw m6, m0, [coeffq+16*2]
pmulhrsw m0, [coeffq+16*0]
pmulhrsw m2, m6
pmulhrsw m2, m3
paddw m2, m6 ; 2
pmulhrsw m6, m0, [o(pw_1697x16)]
pmulhrsw m6, m3
mova m3, [rsp+gprsize+16*0]
paddw m0, m6
jmp m(idct_8x8_internal).pass1_end3
.pass1_end:
mova [coeffq+16*9 ], m4
mova [coeffq+16*11], m5
mova [coeffq+16*13], m6
mova [coeffq+16*15], m7
mova m4, [o(pw_2896x8)]
pmulhrsw m5, m4, [coeffq+16*5]
pmulhrsw m6, m4, [coeffq+16*6]
pmulhrsw m7, m4, [coeffq+16*7]
mova [coeffq+16*5 ], m2
mova [coeffq+16*7 ], m3
pmulhrsw m2, m4, [coeffq+16*2]
pmulhrsw m3, m4, [coeffq+16*3]
mova [coeffq+16*3 ], m1
pmulhrsw m1, m4, [coeffq+16*1]
mova [coeffq+16*1 ], m0
pmulhrsw m0, m4, [coeffq+16*0]
pmulhrsw m4, [coeffq+16*4]
mova [coeffq+16*1], m4
mova [coeffq+16*3], m5
mova [coeffq+16*5], m6
mova [coeffq+16*7], m7
mova m4, [coeffq-16*7]
mova m5, [coeffq-16*5]
mova m6, [coeffq-16*3]
mova m7, [coeffq-16*1]
mova [coeffq-16*7], m0
mova [coeffq-16*5], m1
mova [coeffq-16*3], m2
mova [coeffq-16*1], m3
mov tx2q, r3
jmp .pass1
@ -3399,7 +3476,7 @@ cglobal iidentity_16x8_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
jg .loop
RET
%elifidn %1_%2, identity_dct
mova m4, [o(pw_5793x4)]
mova m4, [o(pw_1697x16)]
mova m5, [o(pw_8192)]
mova m6, [o(pw_2896x8)]
psrlw m7, m5, 2 ;pw_2048
@ -3410,23 +3487,24 @@ cglobal iidentity_16x8_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
.main:
movd m0, [coeffq+32*0]
punpcklwd m0, [coeffq+32*1]
movd m2, [coeffq+32*2]
punpcklwd m2, [coeffq+32*3]
movd m1, [coeffq+32*2]
punpcklwd m1, [coeffq+32*3]
add coeffq, 32*4
punpckldq m0, m1
movd m1, [coeffq+32*0]
punpcklwd m1, [coeffq+32*1]
movd m3, [coeffq+32*2]
punpcklwd m3, [coeffq+32*3]
movd m2, [coeffq+32*2]
punpcklwd m2, [coeffq+32*3]
xor eobd, eobd
mov [coeffq-32*4], eobd
mov [coeffq-32*3], eobd
mov [coeffq-32*2], eobd
mov [coeffq-32*1], eobd
punpckldq m0, m2
punpckldq m1, m3
punpckldq m1, m2
punpcklqdq m0, m1
psllw m0, 2
pmulhrsw m0, m4
pmulhrsw m1, m4, m0
paddw m0, m0
paddw m0, m1
pmulhrsw m0, m5
pmulhrsw m0, m6
pmulhrsw m0, m7
@ -3740,36 +3818,42 @@ INV_TXFM_16X16_FN identity, dct, 15
INV_TXFM_16X16_FN identity, identity
cglobal iidentity_16x16_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
LOAD_8ROWS coeffq+16*17, 32
add coeffq, 16*17
mov r3, tx2q
lea tx2q, [o(m(iidentity_16x16_internal).pass1_end)]
.pass1:
REPX {psllw x, 2 }, m0, m1, m2, m3, m4, m5, m6, m7
pmulhrsw m7, [o(pw_5793x4)]
mova [rsp+gprsize+16*0], m7
mova m7, [o(pw_5793x4)]
REPX {pmulhrsw x, m7}, m0, m1, m2, m3, m4, m5, m6
mova m7, [o(pw_1697x16)]
mova m6, [coeffq+32*7]
mova m0, [coeffq+32*0]
mova m1, [coeffq+32*1]
mova m2, [coeffq+32*2]
mova m3, [coeffq+32*3]
mova m4, [coeffq+32*4]
REPX {IDTX16 x, 5, 7}, 6, 0, 1, 2, 3, 4
mova m5, [coeffq+32*5]
mova [rsp+gprsize+16*0], m6
IDTX16 5, 6, 7
mova m6, [coeffq+32*6]
IDTX16 6, 7, 7
mova m7, [o(pw_8192)]
jmp m(idct_8x8_internal).pass1_end1
.pass1_end:
SAVE_8ROWS coeffq+16*17, 32
LOAD_8ROWS coeffq+16* 1, 32
SAVE_8ROWS coeffq, 32
sub coeffq, 16
lea tx2q, [o(m(iidentity_16x16_internal).pass1_end1)]
jmp .pass1
.pass1_end1:
SAVE_8ROWS coeffq+16* 1, 32
LOAD_8ROWS coeffq+16*16, 32
SAVE_8ROWS coeffq, 32
sub coeffq, 15*16
lea tx2q, [o(m(iidentity_16x16_internal).pass1_end2)]
jmp .pass1
.pass1_end2:
SAVE_8ROWS coeffq+16*16, 32
LOAD_8ROWS coeffq+16* 0, 32
SAVE_8ROWS coeffq, 32
sub coeffq, 16
mov tx2q, r3
jmp .pass1
@ -3778,16 +3862,22 @@ cglobal iidentity_16x16_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
lea tx2q, [o(m(iidentity_16x16_internal).end1)]
.end:
REPX {psllw x, 2 }, m0, m1, m2, m3, m4, m5, m6, m7
pmulhrsw m7, [o(pw_5793x4)]
pmulhrsw m7, [o(pw_2048)]
mova [rsp+gprsize+16*0], m7
mova m7, [o(pw_5793x4)]
REPX {pmulhrsw x, m7}, m0, m1, m2, m3, m4, m5, m6
mova m7, [o(pw_2048)]
REPX {pmulhrsw x, m7}, m0, m1, m2, m3, m4, m5, m6
mova [rsp+gprsize+16*1], m6
mova [rsp+gprsize+16*1], m4
mova m7, [o(pw_1697x16)]
REPX {IDTX16 x, 4, 7}, 5, 6, 0, 1, 2, 3
mova m4, [o(pw_2048)]
pmulhrsw m5, m4
pmulhrsw m6, m4
mova [rsp+gprsize+16*2], m5
mova m5, [rsp+gprsize+16*1]
mova [rsp+gprsize+16*1], m6
IDTX16 5, 6, 7
mova m6, [rsp+gprsize+16*0]
IDTX16 6, 7, 7
REPX {pmulhrsw x, m4}, m0, m1, m2, m3, m6
pmulhrsw m4, m5
mova [rsp+gprsize+16*0], m6
jmp m(idct_8x8_internal).end3
.end1:
@ -4991,15 +5081,33 @@ cglobal inv_txfm_add_identity_identity_32x16, 4, 6, 8, 16*4, dst, stride, coeff,
.loop:
LOAD_8ROWS coeffq, 32, 1
REPX {psllw x, 3}, m0, m1, m2, m3, m4, m5, m6, m7
REPX {psllw x, 2}, m0, m1, m2, m3, m4, m5, m6, m7
mova [rsp+16*1], m6
lea tx2q, [o(m(idct_32x16_internal).end)]
call m(idct_8x8_internal).pass1_end3
pmulhrsw m7, [o(pw_5793x4)]
mova [rsp+16*2], m5
mova [rsp+16*1], m6
mova m5, [o(pw_1697x8)]
pmulhrsw m6, m5, m7
paddw m7, m6
pmulhrsw m6, m5, m0
paddw m0, m6
pmulhrsw m6, m5, m1
paddw m1, m6
pmulhrsw m6, m5, m2
paddw m2, m6
pmulhrsw m6, m5, m3
paddw m3, m6
pmulhrsw m6, m5, m4
pmulhrsw m7, [o(pw_2048)]
paddw m4, m6
mova m6, [rsp+16*1]
mova [rsp+16*0], m7
mova m7, [o(pw_5793x4)]
REPX {pmulhrsw x, m7}, m0, m1, m2, m3, m4, m5, m6
pmulhrsw m7, m5, m6
paddw m6, m7
mova m7, [rsp+16*2]
pmulhrsw m5, m7
paddw m5, m7
mova m7, [o(pw_2048)]
REPX {pmulhrsw x, m7}, m0, m1, m2, m3, m4, m5, m6
mova [rsp+16*2], m5
@ -5008,7 +5116,7 @@ cglobal inv_txfm_add_identity_identity_32x16, 4, 6, 8, 16*4, dst, stride, coeff,
lea dstq, [dstq+strideq*2]
pxor m7, m7
REPX {mova [coeffq+32*x], m7}, 0, 1, 2, 3, 4, 5, 6, 7
REPX {mova [coeffq+32*x], m7}, 0, 1, 2, 3, 4, 5, 6, 7
.loop_end:
add coeffq, 16

View File

@ -204,6 +204,7 @@ WIENER_FILTER(ext) \
SGR_FILTER(ext)
#if BITDEPTH == 8
WIENER_FILTER(sse2)
DEF_LR_FILTERS(ssse3)
# if ARCH_X86_64
DEF_LR_FILTERS(avx2)
@ -213,6 +214,11 @@ DEF_LR_FILTERS(avx2)
COLD void bitfn(dav1d_loop_restoration_dsp_init_x86)(Dav1dLoopRestorationDSPContext *const c) {
const unsigned flags = dav1d_get_cpu_flags();
if (!(flags & DAV1D_X86_CPU_FLAG_SSE2)) return;
#if BITDEPTH == 8
c->wiener = wiener_filter_sse2;
#endif
if (!(flags & DAV1D_X86_CPU_FLAG_SSSE3)) return;
#if BITDEPTH == 8
c->wiener = wiener_filter_ssse3;

View File

@ -43,8 +43,6 @@ pb_15: times 16 db 15
pb_0_1: times 8 db 0, 1
pb_6_7: times 8 db 6, 7
pb_14_15: times 8 db 14, 15
pb_0_1_2_3: times 4 db 0, 1, 2, 3
pb_4_5_6_7: times 4 db 4, 5, 6, 7
pw_1: times 8 dw 1
pw_16: times 8 dw 16
pw_128: times 8 dw 128
@ -97,58 +95,101 @@ SECTION .text
%define PIC_sym(sym) (sym)
%endif
%macro PALIGNR 4 ; dst, src1, src2, shift
%if cpuflag(ssse3)
palignr %1, %2, %3, %4
%else
%assign %%i regnumof%+%1 + 1
%define %%tmp m %+ %%i
psrldq %1, %3, %4
pslldq %%tmp, %2, 16-%4
por %1, %%tmp
%endif
%endmacro
%macro PMADDUBSW 5 ; dst, src, zero, tmp, reset_zero
%if cpuflag(ssse3)
pmaddubsw %1, %2
%else
%if %5 == 1
pxor %3, %3
%endif
punpckhbw %4, %1, %3
punpcklbw %1, %3
pmaddwd %4, %2
pmaddwd %1, %2
packssdw %1, %4
%endif
%endmacro
;;;;;;;;;;;;;;;;;;;;;;
;; wiener ;;
;;;;;;;;;;;;;;;;;;;;;;
INIT_XMM ssse3
%macro WIENER_H 0
%if ARCH_X86_64
cglobal wiener_filter_h, 5, 15, 16, dst, left, src, stride, fh, w, h, edge
mov edged, edgem
movifnidn wd, wm
mov hd, hm
%else
cglobal wiener_filter_h, 5, 7, 8, -84, dst, left, src, stride, fh, w, h, edge
mov r5, edgem
mov [esp+12], r5
mov wd, wm
mov hd, hm
SETUP_PIC hd
%define m15 m0
%define m14 m1
%define m13 m2
%define m12 m3
%endif
movq m15, [fhq]
pshufb m12, m15, [pb_6_7]
pshufb m13, m15, [pb_4]
pshufb m14, m15, [pb_2]
pshufb m15, m15, [pb_0]
%if cpuflag(ssse3)
pshufb m12, m15, [PIC_sym(pb_6_7)]
pshufb m13, m15, [PIC_sym(pb_4)]
pshufb m14, m15, [PIC_sym(pb_2)]
pshufb m15, m15, [PIC_sym(pb_0)]
%else
pshuflw m12, m15, q3333
punpcklbw m15, m15
pshufhw m13, m15, q0000
pshuflw m14, m15, q2222
pshuflw m15, m15, q0000
punpcklqdq m12, m12
punpckhqdq m13, m13
punpcklqdq m14, m14
punpcklqdq m15, m15
psraw m13, 8
psraw m14, 8
psraw m15, 8
%endif
%if ARCH_X86_64
mova m11, [pw_2048]
mova m10, [pw_16380]
lea r11, [pb_right_ext_mask]
DEFINE_ARGS dst, left, src, stride, x, w, h, edge, srcptr, dstptr, xlim
%else
cglobal wiener_filter_h, 5, 7, 8, -100, dst, left, src, stride, fh, w, h, edge
mov wd, edgem
mov [esp+12], wd
mov wd, wm
mov hd, hm
SETUP_PIC hd
movq m0, [fhq]
pshufb m3, m0, [PIC_sym(pb_6_7)]
pshufb m2, m0, [PIC_sym(pb_4)]
pshufb m1, m0, [PIC_sym(pb_2)]
pshufb m0, m0, [PIC_sym(pb_0)]
DEFINE_ARGS dst, left, src, stride, x, w, h, edge
%define srcptrq srcq
%define dstptrq dstq
%define hd dword [esp]
%define edged dword [esp+12]
%define xlimd dword [esp+16]
%define m10 [PIC_sym(pw_16380)]
%define m11 [PIC_sym(pw_2048)]
%define m12 [esp+0x14]
%define m13 [esp+0x24]
%define m14 [esp+0x34]
%define m15 [esp+0x44]
mova m15, m0
mova m14, m1
mova m13, m2
mova m12, m3
mova m13, m2
mova m14, m1
mova m15, m0
DEFINE_ARGS dst, left, src, stride, x, w, h, edge
%define srcptrq srcq
%define dstptrq dstq
%define hd dword [esp+ 0]
%define edged dword [esp+12]
%define xlimd dword [esp+16]
%endif
; if (edge & has_right) align_w_to_16
@ -196,7 +237,16 @@ cglobal wiener_filter_h, 5, 7, 8, -100, dst, left, src, stride, fh, w, h, edge
jmp .left_load_done
.emu_left:
movd m0, [srcq]
%if cpuflag(ssse3)
pshufb m0, [PIC_sym(pb_14x0_1_2)]
%else
pslldq m1, m0, 13
punpcklbw m0, m0
pshuflw m0, m0, q0000
punpcklqdq m0, m0
psrldq m0, 2
por m0, m1
%endif
; load right edge pixels
.left_load_done:
@ -208,19 +258,39 @@ cglobal wiener_filter_h, 5, 7, 8, -100, dst, left, src, stride, fh, w, h, edge
; for very small images (w=[1-2]), edge-extend the original cache,
; ugly, but only runs in very odd cases
%if cpuflag(ssse3)
add wd, wd
%if ARCH_X86_64
%if ARCH_X86_64
pshufb m0, [r11-pb_right_ext_mask+pb_0_to_15_min_n+wq*8-16]
%else
%else
pshufb m0, [PIC_sym(pb_0_to_15_min_n)+wq*8-16]
%endif
%endif
shr wd, 1
%else
shl wd, 4
pcmpeqd m2, m2
movd m3, wd
psrldq m2, 2
punpckhbw m1, m0, m0
pshufhw m1, m1, q1122
psllq m1, m3
pand m0, m2
pandn m2, m1
por m0, m2
shr wd, 4
%endif
; main x loop, mostly this starts in .main_load
.splat_right:
; no need to load new pixels, just extend them from the (possibly previously
; extended) previous load into m0
%if cpuflag(ssse3)
pshufb m1, m0, [PIC_sym(pb_15)]
%else
punpckhbw m1, m0, m0
pshufhw m1, m1, q3333
punpckhqdq m1, m1
%endif
jmp .main_loop
.load_and_splat:
; load new pixels and extend edge for right-most
@ -235,7 +305,13 @@ cglobal wiener_filter_h, 5, 7, 8, -100, dst, left, src, stride, fh, w, h, edge
add PIC_reg, xd
%endif
movd m3, [srcptrq+2+xq]
%if cpuflag(ssse3)
pshufb m3, [PIC_sym(pb_0)]
%else
punpcklbw m3, m3
pshuflw m3, m3, q0000
punpcklqdq m3, m3
%endif
pand m1, m2
pxor m2, [PIC_sym(pb_right_ext_mask)]
pand m3, m2
@ -246,58 +322,98 @@ cglobal wiener_filter_h, 5, 7, 8, -100, dst, left, src, stride, fh, w, h, edge
; load subsequent line
movu m1, [srcptrq+3]
.main_loop:
palignr m2, m1, m0, 10
palignr m3, m1, m0, 11
palignr m4, m1, m0, 12
palignr m5, m1, m0, 13
palignr m6, m1, m0, 14
palignr m7, m1, m0, 15
%if ARCH_X86_64
PALIGNR m2, m1, m0, 10
PALIGNR m3, m1, m0, 11
PALIGNR m4, m1, m0, 12
PALIGNR m5, m1, m0, 13
PALIGNR m6, m1, m0, 14
PALIGNR m7, m1, m0, 15
%if ARCH_X86_32
mova [esp+0x54], m1
%define m8 m1
%endif
punpcklbw m0, m2, m1
punpckhbw m2, m1
punpcklbw m8, m3, m7
punpckhbw m3, m7
punpcklbw m7, m4, m6
punpckhbw m4, m6
pmaddubsw m0, m15
pmaddubsw m2, m15
pmaddubsw m8, m14
pmaddubsw m3, m14
pmaddubsw m7, m13
pmaddubsw m4, m13
PMADDUBSW m0, m15, m6, m9, 1
PMADDUBSW m2, m15, m6, m9, 0
PMADDUBSW m8, m14, m6, m9, 0
PMADDUBSW m3, m14, m6, m9, 0
PMADDUBSW m7, m13, m6, m9, 0
PMADDUBSW m4, m13, m6, m9, 0
paddw m0, m8
paddw m2, m3
pxor m3, m3
punpcklbw m6, m5, m3
punpckhbw m5, m3
psllw m8, m6, 7
psllw m3, m5, 7
%if cpuflag(ssse3)
pxor m6, m6
%endif
punpcklbw m3, m5, m6
punpckhbw m5, m6
psllw m8, m3, 7
psllw m6, m5, 7
psubw m8, m10
psubw m3, m10
pmullw m6, m12
psubw m6, m10
pmullw m3, m12
pmullw m5, m12
paddw m0, m7
paddw m2, m4
paddw m0, m6
paddw m0, m3
paddw m2, m5
paddsw m0, m8
paddsw m2, m3
paddsw m2, m6
psraw m0, 3
psraw m2, 3
paddw m0, m11
paddw m2, m11
mova [dstptrq+ 0], m0
mova [dstptrq+16], m2
%if ARCH_X86_64
mova m0, m1
%else
mova m0, [esp+0x54]
PALIGNR m2, m1, m0, 10
punpcklbw m3, m2, m1
punpckhbw m2, m1
PMADDUBSW m3, m15, m4, m5, 1
PMADDUBSW m2, m15, m4, m5, 0
PALIGNR m4, m1, m0, 11
PALIGNR m5, m1, m0, 15
punpcklbw m6, m4, m5
punpckhbw m4, m5
PMADDUBSW m6, m14, m5, m7, 1
PMADDUBSW m4, m14, m5, m7, 0
paddw m3, m6
paddw m2, m4
PALIGNR m4, m1, m0, 12
PALIGNR m5, m1, m0, 14
punpcklbw m6, m4, m5
punpckhbw m4, m5
PMADDUBSW m6, m13, m5, m7, 1
PMADDUBSW m4, m13, m5, m7, 0
paddw m3, m6
paddw m2, m4
PALIGNR m6, m1, m0, 13
%if cpuflag(ssse3)
pxor m5, m5
%endif
punpcklbw m4, m6, m5
punpckhbw m6, m5
psllw m5, m4, 7
psllw m7, m6, 7
psubw m5, m10
psubw m7, m10
pmullw m4, m12
pmullw m6, m12
paddw m3, m4
paddw m2, m6
paddsw m3, m5
paddsw m2, m7
psraw m3, 3
psraw m2, 3
paddw m3, m11
paddw m2, m11
mova [dstptrq+ 0], m3
mova [dstptrq+16], m2
%endif
mova m0, m1
add srcptrq, 16
add dstptrq, 32
sub xd, 16
@ -317,18 +433,19 @@ cglobal wiener_filter_h, 5, 7, 8, -100, dst, left, src, stride, fh, w, h, edge
dec hd
jg .loop
RET
%endmacro
%macro WIENER_V 0
%if ARCH_X86_64
cglobal wiener_filter_v, 4, 10, 16, dst, stride, mid, w, h, fv, edge
mov edged, edgem
movifnidn fvq, fvmp
movifnidn hd, hm
movq m15, [fvq]
pshufb m14, m15, [pb_4_5_6_7]
pshufb m15, m15, [pb_0_1_2_3]
pshufd m14, m15, q1111
pshufd m15, m15, q0000
paddw m14, [pw_0_128]
movd m12, [pd_1024]
pshufd m12, m12, 0
mova m12, [pd_1024]
DEFINE_ARGS dst, stride, mid, w, h, y, edge, ylim, mptr, dstptr
@ -351,8 +468,8 @@ cglobal wiener_filter_v, 5, 7, 8, -96, dst, stride, mid, w, h, fv, edge
SETUP_PIC edged
movq m0, [fvq]
pshufb m1, m0, [PIC_sym(pb_4_5_6_7)]
pshufb m0, m0, [PIC_sym(pb_0_1_2_3)]
pshufd m1, m0, q1111
pshufd m0, m0, q0000
paddw m1, [PIC_sym(pw_0_128)]
mova [esp+0x50], m0
mova [esp+0x40], m1
@ -504,6 +621,15 @@ cglobal wiener_filter_v, 5, 7, 8, -96, dst, stride, mid, w, h, fv, edge
sub wd, 8
jg .loop_x
RET
%endmacro
INIT_XMM sse2
WIENER_H
WIENER_V
INIT_XMM ssse3
WIENER_H
WIENER_V
;;;;;;;;;;;;;;;;;;;;;;;;;;
;; self-guided ;;

View File

@ -90,9 +90,11 @@ decl_blend_dir_fn(dav1d_blend_h_ssse3);
decl_warp8x8_fn(dav1d_warp_affine_8x8_avx2);
decl_warp8x8_fn(dav1d_warp_affine_8x8_sse4);
decl_warp8x8_fn(dav1d_warp_affine_8x8_ssse3);
decl_warp8x8_fn(dav1d_warp_affine_8x8_sse2);
decl_warp8x8t_fn(dav1d_warp_affine_8x8t_avx2);
decl_warp8x8t_fn(dav1d_warp_affine_8x8t_sse4);
decl_warp8x8t_fn(dav1d_warp_affine_8x8t_ssse3);
decl_warp8x8t_fn(dav1d_warp_affine_8x8t_sse2);
decl_emu_edge_fn(dav1d_emu_edge_avx2);
decl_emu_edge_fn(dav1d_emu_edge_ssse3);
@ -104,6 +106,13 @@ COLD void bitfn(dav1d_mc_dsp_init_x86)(Dav1dMCDSPContext *const c) {
c->mct[type] = dav1d_prep_##name##_##suffix
const unsigned flags = dav1d_get_cpu_flags();
if(!(flags & DAV1D_X86_CPU_FLAG_SSE2))
return;
#if BITDEPTH == 8
c->warp8x8 = dav1d_warp_affine_8x8_sse2;
c->warp8x8t = dav1d_warp_affine_8x8t_sse2;
#endif
if(!(flags & DAV1D_X86_CPU_FLAG_SSSE3))
return;

View File

@ -68,7 +68,9 @@ pw_6903: times 8 dw 6903
pw_8192: times 8 dw 8192
pd_32: times 4 dd 32
pd_512: times 4 dd 512
pd_16384: times 4 dd 16484
pd_32768: times 4 dd 32768
pd_262144:times 4 dd 262144
pw_258: times 2 dw 258
@ -3385,6 +3387,8 @@ cglobal prep_8tap, 1, 9, 0, tmp, src, stride, w, h, mx, my, stride3
%define m14 m6
%define m15 m7
%define m11 m7
%endif
%if notcpuflag(ssse3) || ARCH_X86_32
pxor m11, m11
%endif
lea tmp1d, [myq+deltaq*4]
@ -3483,6 +3487,7 @@ cglobal warp_affine_8x8t, 0, 7, 16, -0x130-copy_args, tmp, ts
mova m14, [esp+0xE0]
mova m15, [esp+0xF0]
%endif
%if cpuflag(ssse3)
psrad m12, 13
psrad m13, 13
psrad m14, 13
@ -3492,6 +3497,22 @@ cglobal warp_affine_8x8t, 0, 7, 16, -0x130-copy_args, tmp, ts
mova m13, [PIC_sym(pw_8192)]
pmulhrsw m12, m13 ; (x + (1 << 6)) >> 7
pmulhrsw m14, m13
%else
%if ARCH_X86_32
%define m10 m0
%endif
mova m10, [PIC_sym(pd_16384)]
paddd m12, m10
paddd m13, m10
paddd m14, m10
paddd m15, m10
psrad m12, 15
psrad m13, 15
psrad m14, 15
psrad m15, 15
packssdw m12, m13
packssdw m14, m15
%endif
mova [tmpq+tsq*0], m12
mova [tmpq+tsq*2], m14
dec counterd
@ -3554,11 +3575,16 @@ cglobal warp_affine_8x8, 0, 7, 16, -0x130-copy_args, \
call .main2
lea dstq, [dstq+dsq*2]
.start:
%if cpuflag(ssse3)
%if ARCH_X86_64
mova m10, [PIC_sym(pw_8192)]
%if notcpuflag(sse4)
%if cpuflag(ssse3)
%define roundval pw_8192
%else
%define m10 [PIC_sym(pw_8192)]
%define roundval pd_262144
%endif
%if ARCH_X86_64
mova m10, [PIC_sym(roundval)]
%else
%define m10 [PIC_sym(roundval)]
%endif
%endif
%if ARCH_X86_32
@ -3577,10 +3603,18 @@ cglobal warp_affine_8x8, 0, 7, 16, -0x130-copy_args, \
packusdw m12, m13
pavgw m12, m11 ; (x + (1 << 10)) >> 11
%else
%if cpuflag(ssse3)
psrad m12, 17
psrad m13, 17
packssdw m12, m13
pmulhrsw m12, m10 ; (x + (1 << 10)) >> 11
pmulhrsw m12, m10
%else
paddd m12, m10
paddd m13, m10
psrad m12, 19
psrad m13, 19
packssdw m12, m13
%endif
%endif
%if ARCH_X86_32
%define m14 m6
@ -3594,10 +3628,18 @@ cglobal warp_affine_8x8, 0, 7, 16, -0x130-copy_args, \
packusdw m14, m15
pavgw m14, m11 ; (x + (1 << 10)) >> 11
%else
%if cpuflag(ssse3)
psrad m14, 17
psrad m15, 17
packssdw m14, m15
pmulhrsw m14, m10 ; (x + (1 << 10)) >> 11
pmulhrsw m14, m10
%else
paddd m14, m10
paddd m15, m10
psrad m14, 19
psrad m15, 19
packssdw m14, m15
%endif
%endif
packuswb m12, m14
movq [dstq+dsq*0], m12
@ -3647,12 +3689,17 @@ ALIGN function_align
lea filterq, [PIC_sym(mc_warp_filter)]
%if ARCH_X86_64
mov myd, r6m
%if cpuflag(ssse3)
pxor m11, m11
%endif
%endif
call .h
psrld m2, m0, 16
psrld m3, m1, 16
%if ARCH_X86_32
%if notcpuflag(ssse3)
mova [esp+gprsize+0x00], m2
%endif
mova [esp+gprsize+0x10], m3
%endif
call .h
@ -3666,6 +3713,9 @@ ALIGN function_align
%if ARCH_X86_64
%define blendmask [rsp+gprsize+0x80]
%else
%if notcpuflag(ssse3)
mova m2, [esp+gprsize+0x00]
%endif
mova m3, [esp+gprsize+0x10]
%define blendmask [esp+gprsize+0x120]
%define m10 m7
@ -3689,6 +3739,9 @@ ALIGN function_align
mova [rsp+gprsize+0x30], m5
call .h
%if ARCH_X86_32
%if notcpuflag(ssse3)
mova m2, [esp+gprsize+0x00]
%endif
mova m3, [esp+gprsize+0x10]
%define m10 m5
%endif
@ -3848,6 +3901,7 @@ ALIGN function_align
lea tmp2d, [mxq+alphaq*1]
shr mxd, 10
shr tmp1d, 10
%if cpuflag(ssse3)
movq m14, [filterq+mxq *8] ; 2 X
movq m9, [filterq+tmp1q*8] ; 6 X
lea tmp1d, [tmp2q+alphaq*4]
@ -3864,10 +3918,99 @@ ALIGN function_align
pmaddubsw m15, m14
pshufb m10, m10, [PIC_sym(warp_8x8_shufD)]
pmaddubsw m10, m9
mova m14, [PIC_sym(pw_8192)]
mova m9, [PIC_sym(pd_32768)]
phaddw m0, m15
phaddw m1, m10
%else
%if ARCH_X86_32
%define m11 m2
%endif
pcmpeqw m0, m0
psrlw m14, m0, 8
psrlw m15, m10, 8 ; 01 03 05 07 09 11 13 15
pand m14, m10 ; 00 02 04 06 08 10 12 14
packuswb m14, m15 ; 00 02 04 06 08 10 12 14 01 03 05 07 09 11 13 15
psrldq m9, m0, 4
pshufd m0, m14, q0220
pand m0, m9
psrldq m14, 1 ; 02 04 06 08 10 12 14 01 03 05 07 09 11 13 15 __
pslldq m15, m14, 12
por m0, m15 ; shufA
psrlw m15, m0, 8
psraw m11, m1, 8
psllw m0, 8
psllw m1, 8
psrlw m0, 8
psraw m1, 8
pmullw m15, m11
pmullw m0, m1
paddw m0, m15 ; pmaddubsw m0, m1
pshufd m15, m14, q0220
pand m15, m9
psrldq m14, 1 ; 04 06 08 10 12 14 01 03 05 07 09 11 13 15 __ __
pslldq m1, m14, 12
por m15, m1 ; shufC
pshufd m1, m14, q0220
pand m1, m9
psrldq m14, 1 ; 06 08 10 12 14 01 03 05 07 09 11 13 15 __ __ __
pslldq m11, m14, 12
por m1, m11 ; shufB
pshufd m10, m14, q0220
pand m10, m9
psrldq m14, 1 ; 08 10 12 14 01 03 05 07 09 11 13 15 __ __ __ __
pslldq m14, m14, 12
por m10, m14 ; shufD
psrlw m9, m1, 8
psraw m11, m8, 8
psllw m1, 8
psllw m8, 8
psrlw m1, 8
psraw m8, 8
pmullw m9, m11
pmullw m1, m8
paddw m1, m9 ; pmaddubsw m1, m8
movq m14, [filterq+mxq *8] ; 2 X
movq m9, [filterq+tmp1q*8] ; 6 X
lea tmp1d, [tmp2q+alphaq*4]
lea mxd, [tmp2q+betaq] ; mx += beta
shr tmp2d, 10
shr tmp1d, 10
movhps m14, [filterq+tmp2q*8] ; 2 3
movhps m9, [filterq+tmp1q*8] ; 6 7
psrlw m8, m15, 8
psraw m11, m14, 8
psllw m15, 8
psllw m14, 8
psrlw m15, 8
psraw m14, 8
pmullw m8, m11
pmullw m15, m14
paddw m15, m8 ; pmaddubsw m15, m14
psrlw m8, m10, 8
psraw m11, m9, 8
psllw m10, 8
psllw m9, 8
psrlw m10, 8
psraw m9, 8
pmullw m8, m11
pmullw m10, m9
paddw m10, m8 ; pmaddubsw m10, m9
pslld m8, m0, 16
pslld m9, m1, 16
pslld m14, m15, 16
pslld m11, m10, 16
paddw m0, m8
paddw m1, m9
paddw m15, m14
paddw m10, m11
psrad m0, 16
psrad m1, 16
psrad m15, 16
psrad m10, 16
packssdw m0, m15 ; phaddw m0, m15
packssdw m1, m10 ; phaddw m1, m10
%endif
mova m14, [PIC_sym(pw_8192)]
mova m9, [PIC_sym(pd_32768)]
pmaddwd m0, m14 ; 17-bit intermediate, upshifted by 13
pmaddwd m1, m14
paddd m0, m9 ; rounded 14-bit result in upper 16 bits of dword
@ -3883,6 +4026,12 @@ INIT_XMM ssse3
WARP_AFFINE_8X8
WARP_AFFINE_8X8T
INIT_XMM sse2
WARP_AFFINE_8X8
WARP_AFFINE_8X8T
INIT_XMM ssse3
%if WIN64
DECLARE_REG_TMP 6, 4
%else

View File

@ -45,15 +45,22 @@ static unsigned get_seed(void) {
#else
#include <unistd.h>
#include <signal.h>
#include <sys/time.h>
#include <time.h>
#ifdef __APPLE__
#include <mach/mach_time.h>
#endif
#define COLOR_RED 1
#define COLOR_GREEN 2
#define COLOR_YELLOW 3
static unsigned get_seed(void) {
struct timeval tv;
gettimeofday(&tv, NULL);
return (unsigned) (tv.tv_usec + tv.tv_sec * 1000000);
#ifdef __APPLE__
return (unsigned) mach_absolute_time();
#elif defined(HAVE_CLOCK_GETTIME)
struct timespec ts;
clock_gettime(CLOCK_MONOTONIC, &ts);
return (unsigned) (1000000000ULL * ts.tv_sec + ts.tv_nsec);
#endif
}
#endif

View File

@ -90,7 +90,7 @@ if is_asm_enabled
include_directories: dav1d_inc_dirs,
c_args: [stackalign_flag, stackrealign_flag],
build_by_default: false,
dependencies : [thread_dependency, m_lib],
dependencies : [thread_dependency, rt_dependency, m_lib],
)
test('checkasm', checkasm, is_parallel: false)

View File

@ -32,6 +32,7 @@
#include <errno.h>
#include <inttypes.h>
#include <math.h>
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <time.h>
@ -44,7 +45,7 @@
#ifdef _WIN32
# include <windows.h>
#endif
#if defined(HAVE_MACH_ABSOLUTE_TIME)
#ifdef __APPLE__
#include <mach/mach_time.h>
#endif
@ -67,7 +68,7 @@ static uint64_t get_time_nanos(void) {
struct timespec ts;
clock_gettime(CLOCK_MONOTONIC, &ts);
return 1000000000ULL * ts.tv_sec + ts.tv_nsec;
#elif defined(HAVE_MACH_ABSOLUTE_TIME)
#elif defined(__APPLE__)
mach_timebase_info_data_t info;
mach_timebase_info(&info);
return mach_absolute_time() * info.numer / info.denom;
@ -145,7 +146,7 @@ int main(const int argc, char *const *const argv) {
if (strcmp(version, DAV1D_VERSION)) {
fprintf(stderr, "Version mismatch (library: %s, executable: %s)\n",
version, DAV1D_VERSION);
return -1;
return EXIT_FAILURE;
}
init_demuxers();
@ -156,12 +157,12 @@ int main(const int argc, char *const *const argv) {
cli_settings.inputfile,
fps, &total, timebase)) < 0)
{
return res;
return EXIT_FAILURE;
}
for (unsigned i = 0; i <= cli_settings.skip; i++) {
if ((res = input_read(in, &data)) < 0) {
input_close(in);
return res;
return EXIT_FAILURE;
}
if (i < cli_settings.skip) dav1d_data_unref(&data);
}
@ -176,7 +177,7 @@ int main(const int argc, char *const *const argv) {
while (dav1d_parse_sequence_header(&seq, data.data, data.sz)) {
if ((res = input_read(in, &data)) < 0) {
input_close(in);
return res;
return EXIT_FAILURE;
}
seq_skip++;
}
@ -191,7 +192,7 @@ int main(const int argc, char *const *const argv) {
total = cli_settings.limit;
if ((res = dav1d_open(&c, &lib_settings)))
return res;
return EXIT_FAILURE;
if (cli_settings.frametimes)
frametimes = fopen(cli_settings.frametimes, "w");
@ -234,7 +235,7 @@ int main(const int argc, char *const *const argv) {
&p.p, fps)) < 0)
{
if (frametimes) fclose(frametimes);
return res;
return EXIT_FAILURE;
}
}
if ((res = output_write(out, &p)) < 0)
@ -271,7 +272,7 @@ int main(const int argc, char *const *const argv) {
&p.p, fps)) < 0)
{
if (frametimes) fclose(frametimes);
return res;
return EXIT_FAILURE;
}
}
if ((res = output_write(out, &p)) < 0)
@ -302,5 +303,5 @@ int main(const int argc, char *const *const argv) {
}
dav1d_close(&c);
return res;
return (res == 0) ? EXIT_SUCCESS : EXIT_FAILURE;
}

View File

@ -104,7 +104,7 @@ static void usage(const char *const app, const char *const reason, ...) {
fprintf(stderr, "Supported options:\n"
" --input/-i $file: input file\n"
" --output/-o $file: output file\n"
" --demuxer $name: force demuxer type ('ivf' or 'annexb'; default: detect from extension)\n"
" --demuxer $name: force demuxer type ('ivf', 'section5' or 'annexb'; default: detect from extension)\n"
" --muxer $name: force muxer type ('md5', 'yuv', 'yuv4mpeg2' or 'null'; default: detect from extension)\n"
" --quiet/-q: disable status messages\n"
" --frametimes $file: dump frame times to file\n"

View File

@ -1,6 +1,7 @@
/*
* Copyright © 2018, VideoLAN and dav1d authors
* Copyright © 2018, Two Orioles, LLC
* Copyright © 2019, James Almer <jamrial@gmail.com>
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
@ -27,13 +28,96 @@
#include "config.h"
#include <errno.h>
#include <stdio.h>
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include "common/intops.h"
#include "dav1d/headers.h"
#include "input/demuxer.h"
#include "input/parse.h"
// these functions are based on an implementation from FFmpeg, and relicensed
// with author's permission
#define PROBE_SIZE 1024
// Heuristic probe for length-delimited ("Annex B") AV1 bitstreams: decode the
// leading temporal-unit/frame-unit/OBU-unit length fields, require the first
// OBU to be an empty Temporal Delimiter, then scan for a frame OBU preceded by
// a sequence header within the first PROBE_SIZE bytes.
// Returns non-zero if the buffer looks like Annex B data, 0 otherwise.
static int annexb_probe(const uint8_t *data) {
    int ret, cnt = 0;
    // Bytes remaining in the current temporal unit (per Annex B framing,
    // the frame units and OBUs below must nest inside it).
    size_t temporal_unit_size;
    ret = leb(data + cnt, PROBE_SIZE - cnt, &temporal_unit_size);
    if (ret < 0)
        return 0;
    cnt += ret;
    // Bytes remaining in the current frame unit; together with its own
    // length field it must fit inside the temporal unit.
    size_t frame_unit_size;
    ret = leb(data + cnt, PROBE_SIZE - cnt, &frame_unit_size);
    if (ret < 0 || ((uint64_t)frame_unit_size + ret) > temporal_unit_size)
        return 0;
    cnt += ret;
    temporal_unit_size -= ret;
    // The first OBU (plus its length field) must be strictly smaller than
    // the frame unit, hence ">=" rather than ">".
    size_t obu_unit_size;
    ret = leb(data + cnt, PROBE_SIZE - cnt, &obu_unit_size);
    if (ret < 0 || ((uint64_t)obu_unit_size + ret) >= frame_unit_size)
        return 0;
    cnt += ret;
    temporal_unit_size -= obu_unit_size + ret;
    frame_unit_size -= obu_unit_size + ret;
    // Check that the first OBU is a Temporal Delimiter.
    size_t obu_size;
    enum Dav1dObuType type;
    ret = parse_obu_header(data + cnt, imin(PROBE_SIZE - cnt, (int) obu_unit_size),
                           &obu_size, &type, 1);
    if (ret < 0 || type != DAV1D_OBU_TD || obu_size > 0)
        return 0;
    cnt += (int)obu_unit_size;
    // look for first frame and accompanying sequence header
    int seq = 0;
    while (cnt < PROBE_SIZE) {
        ret = leb(data + cnt, PROBE_SIZE - cnt, &obu_unit_size);
        if (ret < 0 || ((uint64_t)obu_unit_size + ret) > frame_unit_size)
            return 0;
        cnt += ret;
        temporal_unit_size -= ret;
        frame_unit_size -= ret;
        ret = parse_obu_header(data + cnt, imin(PROBE_SIZE - cnt, (int) obu_unit_size),
                               &obu_size, &type, 1);
        if (ret < 0)
            return 0;
        cnt += (int)obu_unit_size;
        switch (type) {
            case DAV1D_OBU_SEQ_HDR:
                // Remember a sequence header was seen before any frame.
                seq = 1;
                break;
            case DAV1D_OBU_FRAME:
            case DAV1D_OBU_FRAME_HDR:
                // Probe succeeds only if a sequence header came first.
                return seq;
            case DAV1D_OBU_TD:
            case DAV1D_OBU_TILE_GRP:
                // Neither may appear here in a valid stream; reject.
                return 0;
            default:
                break;
        }
        temporal_unit_size -= obu_unit_size;
        frame_unit_size -= obu_unit_size;
        // NOTE(review): frame_unit_size is unsigned, so "<= 0" only matches
        // exact exhaustion; an underflow above would wrap rather than stop
        // the loop — presumably caught by the leb() bounds check next pass.
        if (frame_unit_size <= 0)
            break;
    }
    return 0;
}
typedef struct DemuxerPriv {
FILE *f;
@ -41,23 +125,6 @@ typedef struct DemuxerPriv {
size_t frame_unit_size;
} AnnexbInputContext;
static int leb128(AnnexbInputContext *const c, size_t *const len) {
unsigned more, i = 0;
uint8_t byte;
*len = 0;
do {
if (fread(&byte, 1, 1, c->f) < 1)
return -1;
more = byte & 0x80;
unsigned bits = byte & 0x7f;
if (i <= 3 || (i == 4 && bits < (1 << 4)))
*len |= bits << (i * 7);
else if (bits) return -1;
if (++i == 8 && more) return -1;
} while (more);
return i;
}
static int annexb_open(AnnexbInputContext *const c, const char *const file,
unsigned fps[2], unsigned *const num_frames, unsigned timebase[2])
{
@ -75,7 +142,7 @@ static int annexb_open(AnnexbInputContext *const c, const char *const file,
timebase[0] = 25;
timebase[1] = 1;
for (*num_frames = 0;; (*num_frames)++) {
res = leb128(c, &len);
res = leb128(c->f, &len);
if (res < 0)
break;
fseeko(c->f, len, SEEK_CUR);
@ -90,15 +157,15 @@ static int annexb_read(AnnexbInputContext *const c, Dav1dData *const data) {
int res;
if (!c->temporal_unit_size) {
res = leb128(c, &c->temporal_unit_size);
res = leb128(c->f, &c->temporal_unit_size);
if (res < 0) return -1;
}
if (!c->frame_unit_size) {
res = leb128(c, &c->frame_unit_size);
res = leb128(c->f, &c->frame_unit_size);
if (res < 0 || (c->frame_unit_size + res) > c->temporal_unit_size) return -1;
c->temporal_unit_size -= res;
}
res = leb128(c, &len);
res = leb128(c->f, &len);
if (res < 0 || (len + res) > c->frame_unit_size) return -1;
uint8_t *ptr = dav1d_data_create(data, len);
if (!ptr) return -1;
@ -120,7 +187,8 @@ static void annexb_close(AnnexbInputContext *const c) {
const Demuxer annexb_demuxer = {
.priv_data_size = sizeof(AnnexbInputContext),
.name = "annexb",
.extension = "obu",
.probe = annexb_probe,
.probe_sz = PROBE_SIZE,
.open = annexb_open,
.read = annexb_read,
.close = annexb_close,

View File

@ -34,7 +34,8 @@ typedef struct DemuxerPriv DemuxerPriv;
typedef struct Demuxer {
int priv_data_size;
const char *name;
const char *extension;
int probe_sz;
int (*probe)(const uint8_t *data);
int (*open)(DemuxerPriv *ctx, const char *filename,
unsigned fps[2], unsigned *num_frames, unsigned timebase[2]);
int (*read)(DemuxerPriv *ctx, Dav1dData *data);

View File

@ -33,6 +33,7 @@
#include <string.h>
#include "common/attributes.h"
#include "common/intops.h"
#include "input/input.h"
#include "input/demuxer.h"
@ -42,7 +43,7 @@ struct DemuxerContext {
const Demuxer *impl;
};
#define MAX_NUM_DEMUXERS 2
#define MAX_NUM_DEMUXERS 3
static const Demuxer *demuxers[MAX_NUM_DEMUXERS];
static int num_demuxers = 0;
@ -55,23 +56,7 @@ static int num_demuxers = 0;
void init_demuxers(void) {
register_demuxer(ivf_demuxer);
register_demuxer(annexb_demuxer);
}
static const char *find_extension(const char *const f) {
const size_t l = strlen(f);
if (l == 0) return NULL;
const char *const end = &f[l - 1], *step = end;
while ((*step >= 'a' && *step <= 'z') ||
(*step >= 'A' && *step <= 'Z') ||
(*step >= '0' && *step <= '9'))
{
step--;
}
return (step < end && step > f && *step == '.' && step[-1] != '/') ?
&step[1] : NULL;
register_demuxer(section5_demuxer);
}
int input_open(DemuxerContext **const c_out,
@ -94,22 +79,34 @@ int input_open(DemuxerContext **const c_out,
return DAV1D_ERR(ENOPROTOOPT);
}
} else {
const char *const ext = find_extension(filename);
if (!ext) {
fprintf(stderr, "No extension found for file %s\n", filename);
return -1;
int probe_sz = 0;
for (i = 0; i < num_demuxers; i++)
probe_sz = imax(probe_sz, demuxers[i]->probe_sz);
uint8_t *const probe_data = malloc(probe_sz);
if (!probe_data) {
fprintf(stderr, "Failed to allocate memory\n");
return DAV1D_ERR(ENOMEM);
}
FILE *f = fopen(filename, "rb");
res = !!fread(probe_data, 1, probe_sz, f);
fclose(f);
if (!res) {
free(probe_data);
fprintf(stderr, "Failed to read probe data\n");
return errno ? DAV1D_ERR(errno) : DAV1D_ERR(EIO);
}
for (i = 0; i < num_demuxers; i++) {
if (!strcmp(demuxers[i]->extension, ext)) {
if (demuxers[i]->probe(probe_data)) {
impl = demuxers[i];
break;
}
}
free(probe_data);
if (i == num_demuxers) {
fprintf(stderr,
"Failed to find demuxer for file %s (\"%s\")\n",
filename, ext);
"Failed to probe demuxer for file %s\n",
filename);
return DAV1D_ERR(ENOPROTOOPT);
}
}

View File

@ -39,6 +39,16 @@ typedef struct DemuxerPriv {
FILE *f;
} IvfInputContext;
// Magic prefix of an IVF file carrying an AV1 stream: "DKIF" signature,
// version 0, 32-byte (0x20) header length, then the "AV01" FourCC.
static const uint8_t probe_data[] = {
    'D', 'K', 'I', 'F',
    0, 0, 0x20, 0,
    'A', 'V', '0', '1',
};

// Report whether the buffer starts with the IVF/AV1 signature above.
static int ivf_probe(const uint8_t *const data) {
    return memcmp(data, probe_data, sizeof(probe_data)) == 0;
}
// Read a 32-bit unsigned little-endian value from p.
static unsigned rl32(const uint8_t *const p) {
    unsigned v = p[0];
    v |= (unsigned)p[1] << 8;
    v |= (unsigned)p[2] << 16;
    v |= (unsigned)p[3] << 24;
    return v;
}
@ -121,7 +131,8 @@ static void ivf_close(IvfInputContext *const c) {
const Demuxer ivf_demuxer = {
.priv_data_size = sizeof(IvfInputContext),
.name = "ivf",
.extension = "ivf",
.probe = ivf_probe,
.probe_sz = sizeof(probe_data),
.open = ivf_open,
.read = ivf_read,
.close = ivf_close,

107
third_party/dav1d/tools/input/parse.h vendored Normal file
View File

@ -0,0 +1,107 @@
/*
* Copyright © 2018, VideoLAN and dav1d authors
* Copyright © 2018, Two Orioles, LLC
* Copyright © 2019, James Almer <jamrial@gmail.com>
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
* ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef DAV1D_INPUT_PARSE_H
#define DAV1D_INPUT_PARSE_H
#include "dav1d/headers.h"
// Decode an unsigned LEB128 value from stream f into *len.
// Returns the number of bytes consumed, or -1 on read error, an overlong
// encoding (more than 8 bytes), or a value that does not fit in 32 bits.
static int leb128(FILE *const f, size_t *const len) {
    unsigned n = 0;
    *len = 0;
    for (;;) {
        uint8_t byte;
        if (fread(&byte, 1, 1, f) < 1)
            return -1;
        const unsigned payload = byte & 0x7f;
        // Accept at most 32 significant bits: 4 full 7-bit groups plus the
        // low 4 bits of the fifth; any excess non-zero payload is invalid.
        if (n <= 3 || (n == 4 && payload < (1 << 4)))
            *len |= (size_t)payload << (n * 7);
        else if (payload)
            return -1;
        n++;
        if (!(byte & 0x80))
            return n;
        if (n == 8) // continuation bit set on the 8th byte: overlong
            return -1;
    }
}
// these functions are based on an implementation from FFmpeg, and relicensed
// with author's permission
// Decode an unsigned LEB128 value from the first sz bytes of ptr into *len.
// Returns the number of bytes consumed, or -1 on truncation, an overlong
// encoding (more than 8 bytes), or a value that does not fit in 32 bits.
static int leb(const uint8_t *ptr, int sz, size_t *const len) {
    unsigned i = 0, more;
    *len = 0;
    do {
        // Reject sz that is already zero *or negative*: callers such as
        // parse_obu_header() may pass a short count, and the original
        // pre-decrement test (!sz--) let negative counts read out of bounds.
        if (sz-- <= 0) return -1;
        const int byte = *ptr++;
        more = byte & 0x80;
        const unsigned bits = byte & 0x7f;
        // Accept at most 32 significant bits: 4 full 7-bit groups plus the
        // low 4 bits of the fifth; any excess non-zero payload is invalid.
        if (i <= 3 || (i == 4 && bits < (1 << 4)))
            *len |= bits << (i * 7);
        else if (bits) return -1;
        if (++i == 8 && more) return -1;
    } while (more);
    return i;
}
// Parse a single OBU header at buf (buf_size bytes available).
// On success, stores the OBU payload size in *obu_size and the OBU type in
// *type, and returns the total size of the OBU (header + extension byte +
// size field + payload). Returns -1 on malformed or truncated input.
// If allow_implicit_size is non-zero, an OBU without a size field is assumed
// to span the rest of the buffer (as in Annex B framing).
static inline int parse_obu_header(const uint8_t *buf, int buf_size,
                                   size_t *const obu_size,
                                   enum Dav1dObuType *const type,
                                   const int allow_implicit_size)
{
    int ret, extension_flag, has_size_flag;

    if (!buf_size)
        return -1;
    if (*buf & 0x80) // obu_forbidden_bit
        return -1;

    *type = (*buf & 0x78) >> 3;
    extension_flag = (*buf & 0x4) >> 2;
    has_size_flag  = (*buf & 0x2) >> 1;
    // ignore obu_reserved_1bit
    buf++;
    buf_size--;

    if (extension_flag) {
        // Bounds check before consuming the extension byte; without it,
        // buf_size could go negative and be handed to leb() below.
        if (!buf_size)
            return -1;
        buf++;
        buf_size--;
        // ignore temporal_id/spatial_id fields
    }

    if (has_size_flag) {
        ret = leb(buf, buf_size, obu_size);
        if (ret < 0)
            return -1;
        return (int) *obu_size + ret + 1 + extension_flag;
    } else if (!allow_implicit_size)
        return -1;

    // No explicit size: the payload is everything that remains.
    *obu_size = buf_size;
    return buf_size + 1 + extension_flag;
}
#endif /* DAV1D_INPUT_PARSE_H */

185
third_party/dav1d/tools/input/section5.c vendored Normal file
View File

@ -0,0 +1,185 @@
/*
* Copyright © 2019, VideoLAN and dav1d authors
* Copyright © 2019, Two Orioles, LLC
* Copyright © 2019, James Almer <jamrial@gmail.com>
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
* ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "config.h"
#include <stdio.h>
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include <sys/types.h>
#include "dav1d/headers.h"
#include "input/demuxer.h"
#include "input/parse.h"
#define PROBE_SIZE 1024
// Content probe for raw low-overhead ("Section 5") AV1 elementary streams:
// the data must begin with an empty Temporal Delimiter OBU, and a frame (or
// frame header) OBU must be preceded by a sequence header within the first
// PROBE_SIZE bytes. Returns non-zero on a match, 0 otherwise.
static int section5_probe(const uint8_t *data) {
    size_t obu_size;
    enum Dav1dObuType type;
    // The stream has to open with a zero-length Temporal Delimiter.
    int pos = 0;
    int ret = parse_obu_header(data, PROBE_SIZE, &obu_size, &type, 0);
    if (ret < 0 || type != DAV1D_OBU_TD || obu_size > 0)
        return 0;
    pos += ret;
    // Walk subsequent OBUs until we can decide one way or the other.
    int have_seq_hdr = 0;
    while (pos < PROBE_SIZE) {
        ret = parse_obu_header(data + pos, PROBE_SIZE - pos,
                               &obu_size, &type, 0);
        if (ret < 0)
            return 0;
        pos += ret;
        if (type == DAV1D_OBU_SEQ_HDR) {
            have_seq_hdr = 1;
        } else if (type == DAV1D_OBU_FRAME || type == DAV1D_OBU_FRAME_HDR) {
            // Only a frame preceded by a sequence header counts as a match.
            return have_seq_hdr;
        } else if (type == DAV1D_OBU_TD || type == DAV1D_OBU_TILE_GRP) {
            // A second TD or a stray tile group before any frame: reject.
            return 0;
        }
        // All other OBU types are skipped.
    }
    return 0;
}
typedef struct DemuxerPriv {
FILE *f;
} Section5InputContext;
// Open a raw Section 5 AV1 file and count its temporal units so that
// *num_frames can be reported up front. Frame rate and timebase are
// hard-coded to 25 fps pending real timing-info parsing (see TODO).
// Returns 0 on success, -1 on open failure or malformed stream.
static int section5_open(Section5InputContext *const c, const char *const file,
                         unsigned fps[2], unsigned *const num_frames, unsigned timebase[2])
{
    if (!(c->f = fopen(file, "rb"))) {
        fprintf(stderr, "Failed to open %s: %s\n", file, strerror(errno));
        return -1;
    }
    // TODO: Parse sequence header and read timing info if any.
    fps[0] = 25;
    fps[1] = 1;
    timebase[0] = 25;
    timebase[1] = 1;
    *num_frames = 0;
    // Pre-scan the whole file OBU by OBU: each Temporal Delimiter starts a
    // new temporal unit, which is what we count as a "frame" here.
    for (;;) {
        uint8_t byte[2];
        if (fread(&byte[0], 1, 1, c->f) < 1)
            break;
        // OBU type lives in bits 6..3 of the first header byte.
        const enum Dav1dObuType obu_type = (byte[0] >> 3) & 0xf;
        if (obu_type == DAV1D_OBU_TD)
            (*num_frames)++;
        // This demuxer requires the obu_has_size_field flag (bit 1): without
        // an explicit size we cannot skip ahead.
        const int has_length_field = byte[0] & 0x2;
        if (!has_length_field)
            return -1;
        // Skip the optional extension byte (temporal/spatial ids), bit 2.
        const int has_extension = byte[0] & 0x4;
        if (has_extension && fread(&byte[1], 1, 1, c->f) < 1)
            return -1;
        size_t len;
        const int res = leb128(c->f, &len);
        if (res < 0)
            return -1;
        fseeko(c->f, len, SEEK_CUR); // skip packet
        // NOTE(review): the early -1 returns above leave c->f open;
        // presumably the caller closes the demuxer on failure — verify.
    }
    // Rewind so section5_read() starts from the first OBU.
    fseeko(c->f, 0, SEEK_SET);
    return 0;
}
// Read one temporal unit into *data. Works in two passes: first walk the
// OBUs from the current position, summing their on-disk sizes until EOF or
// the next Temporal Delimiter; then seek back and read the whole span as a
// single Dav1dData buffer. Returns 0 on success, -1 on error or EOF.
static int section5_read(Section5InputContext *const c, Dav1dData *const data) {
    size_t total_bytes = 0;
    for (int first = 1;; first = 0) {
        uint8_t byte[2];
        if (fread(&byte[0], 1, 1, c->f) < 1) {
            // Clean EOF after at least one OBU ends the temporal unit;
            // EOF on the very first read (or a read error) is a failure.
            if (!first && feof(c->f)) break;
            return -1;
        }
        // OBU type lives in bits 6..3 of the first header byte.
        const enum Dav1dObuType obu_type = (byte[0] >> 3) & 0xf;
        if (first) {
            // A temporal unit must start with a Temporal Delimiter.
            if (obu_type != DAV1D_OBU_TD)
                return -1;
        } else {
            if (obu_type == DAV1D_OBU_TD) {
                // include TD in next packet
                fseeko(c->f, -1, SEEK_CUR);
                break;
            }
        }
        // Explicit size field (bit 1) is mandatory for this demuxer.
        const int has_length_field = byte[0] & 0x2;
        if (!has_length_field)
            return -1;
        // has_extension is normalized to 0/1 so it can be added to the
        // byte count below.
        const int has_extension = !!(byte[0] & 0x4);
        if (has_extension && fread(&byte[1], 1, 1, c->f) < 1)
            return -1;
        size_t len;
        const int res = leb128(c->f, &len);
        if (res < 0)
            return -1;
        // header byte + optional extension byte + size field + payload.
        total_bytes += 1 + has_extension + res + len;
        fseeko(c->f, len, SEEK_CUR); // skip packet, we'll read it below
    }
    // Second pass: rewind over the scanned span and read it in one go.
    fseeko(c->f, -(off_t)total_bytes, SEEK_CUR);
    uint8_t *ptr = dav1d_data_create(data, total_bytes);
    if (!ptr) return -1;
    if (fread(ptr, total_bytes, 1, c->f) != 1) {
        fprintf(stderr, "Failed to read frame data: %s\n", strerror(errno));
        dav1d_data_unref(data);
        return -1;
    }
    return 0;
}
// Release demuxer resources (closes the input stream opened in
// section5_open()).
static void section5_close(Section5InputContext *const c) {
    fclose(c->f);
}
// Demuxer descriptor for raw low-overhead ("Section 5") AV1 bitstreams;
// registered in init_demuxers() and selected by content probing rather
// than by file extension.
const Demuxer section5_demuxer = {
    .priv_data_size = sizeof(Section5InputContext),
    .name = "section5",
    .probe = section5_probe,
    .probe_sz = PROBE_SIZE,
    .open = section5_open,
    .read = section5_read,
    .close = section5_close,
};

View File

@ -28,6 +28,7 @@ dav1d_input_sources = files(
'input/input.c',
'input/annexb.c',
'input/ivf.c',
'input/section5.c',
)
dav1d_output_sources = files(
@ -68,21 +69,6 @@ endif
# Configuration data for cli_config.h
cli_cdata = configuration_data()
rt_dependency = []
if host_machine.system() != 'windows'
if cc.has_function('clock_gettime', prefix : '#include <time.h>', args : test_args)
cli_cdata.set('HAVE_CLOCK_GETTIME', 1)
elif host_machine.system() == 'darwin'
cli_cdata.set('HAVE_MACH_ABSOLUTE_TIME', 1)
else
rt_dependency = cc.find_library('rt', required: false)
if not cc.has_function('clock_gettime', prefix : '#include <time.h>', args : test_args, dependencies : rt_dependency)
error('clock_gettime not found')
endif
cli_cdata.set('HAVE_CLOCK_GETTIME', 1)
endif
endif
cli_config_h_target = configure_file(output: 'cli_config.h', configuration: cli_cdata)
# dav1d cli tool sources

View File

@ -146,10 +146,10 @@ void output_close(MuxerContext *const ctx) {
free(ctx);
}
int output_verify(MuxerContext *const ctx, const char *const md5_Str) {
int output_verify(MuxerContext *const ctx, const char *const md5_str) {
int res = 0;
if (ctx->impl->verify)
res = ctx->impl->verify(ctx->data, md5_Str);
res = ctx->impl->verify(ctx->data, md5_str);
free(ctx);
return res;
}