Bug 1927529 - Update dav1d to ef4aff75b0b56a8e1af996458ae653c0728a1596 r=chunmin

Differential Revision: https://phabricator.services.mozilla.com/D227072
Updatebot 2024-11-04 06:00:15 +00:00
parent 382f354ebe
commit 387f3edbef
26 changed files with 3853 additions and 1054 deletions


@ -20,11 +20,11 @@ origin:
# Human-readable identifier for this version/release
# Generally "version NNN", "tag SSS", "bookmark SSS"
release: 7072e79faa8b8f387960b4a738139c98c270277f (2024-10-07T13:04:34.000+02:00).
release: ef4aff75b0b56a8e1af996458ae653c0728a1596 (2024-10-22T00:00:32.000+02:00).
# Revision to pull in
# Must be a long or short commit SHA (long preferred)
revision: 7072e79faa8b8f387960b4a738139c98c270277f
revision: ef4aff75b0b56a8e1af996458ae653c0728a1596
# The package's license, where possible using the mnemonic from
# https://spdx.org/licenses/


@ -1,2 +1,2 @@
/* auto-generated, do not edit */
#define DAV1D_VERSION "7072e79faa8b8f387960b4a738139c98c270277f"
#define DAV1D_VERSION "ef4aff75b0b56a8e1af996458ae653c0728a1596"


@ -1,5 +1,5 @@
Changes for 1.5.0 'Road Runner':
--------------------------------
Changes for 1.5.0 'Sonic':
--------------------------
1.5.0 is a major release of dav1d that:
- WARNING: we removed some of the SSE2 optimizations, so if you care about
@ -11,6 +11,9 @@ Changes for 1.5.0 'Road Runner':
- AArch64/SVE: Add HBD subpel filters using 128-bit SVE2
- AArch64: Add USMMLA implementation for 6-tap H/HV
- AArch64: Optimize Armv8.0 NEON for HBD horizontal filters and 6-tap filters
- Power9: Optimized ITX up to 16x4.
- Loongarch: numerous optimizations
- RISC-V optimizations for pal, cdef_filter, ipred, mc_blend, mc_bdir, itx
- Allow playing videos in full-screen mode in dav1dplay


@ -346,6 +346,36 @@ if host_machine.cpu_family().startswith('x86')
cdata_asm.set('STACK_ALIGNMENT', stack_alignment)
endif
#
# ASM specific stuff
#
use_gaspp = false
if (is_asm_enabled and
(host_machine.cpu_family() == 'aarch64' or
host_machine.cpu_family().startswith('arm')) and
cc.get_argument_syntax() == 'msvc' and
(cc.get_id() != 'clang-cl' or meson.version().version_compare('<0.58.0')))
gaspp = find_program('gas-preprocessor.pl')
use_gaspp = true
gaspp_args = [
'-as-type', 'armasm',
'-arch', host_machine.cpu_family(),
'--',
host_machine.cpu_family() == 'aarch64' ? 'armasm64' : 'armasm',
'-nologo',
'-I@0@'.format(dav1d_src_root),
'-I@0@/'.format(meson.current_build_dir()),
]
gaspp_gen = generator(gaspp,
output: '@BASENAME@.obj',
arguments: gaspp_args + [
'@INPUT@',
'-c',
'-o', '@OUTPUT@'
])
endif
cdata.set10('ARCH_AARCH64', host_machine.cpu_family() == 'aarch64' or host_machine.cpu() == 'arm64')
cdata.set10('ARCH_ARM', host_machine.cpu_family().startswith('arm') and host_machine.cpu() != 'arm64')
@ -417,7 +447,21 @@ if (is_asm_enabled and
cdata.set('AS_ARCH_LEVEL', as_arch_level)
as_arch_str = '".arch ' + as_arch_level + '\\n"'
endif
if use_gaspp
python3 = import('python').find_installation()
endif
foreach name, instr : aarch64_extensions
if use_gaspp
f = configure_file(
command: [python3, '-c', 'import sys; print(sys.argv[1])', '@0@'.format(instr)],
output: 'test-@0@.S'.format(name),
capture: true)
r = run_command(gaspp, gaspp_args, f, '-c', '-o', meson.current_build_dir() / 'test-' + name + '.obj', check: false)
message('Checking for gaspp/armasm64 ' + name.to_upper() + ': ' + (r.returncode() == 0 ? 'YES' : 'NO'))
if r.returncode() == 0
supported_aarch64_instructions += name
endif
else
# Test for support for the various extensions. First test if
# the assembler supports the .arch_extension directive for
# enabling/disabling the extension, then separately check whether
@ -438,6 +482,7 @@ if (is_asm_enabled and
if cc.compiles(code, name: name.to_upper())
supported_aarch64_instructions += name
endif
endif
endforeach
endif
endif
@ -482,9 +527,6 @@ if (host_machine.system() in ['darwin', 'ios', 'tvos'] or
cdata_asm.set10('PREFIX', true)
endif
#
# ASM specific stuff
#
if is_asm_enabled and host_machine.cpu_family().startswith('x86')
# NASM compiler support
@ -535,30 +577,6 @@ if is_asm_enabled and host_machine.cpu_family().startswith('x86')
])
endif
use_gaspp = false
if (is_asm_enabled and
(host_machine.cpu_family() == 'aarch64' or
host_machine.cpu_family().startswith('arm')) and
cc.get_argument_syntax() == 'msvc' and
(cc.get_id() != 'clang-cl' or meson.version().version_compare('<0.58.0')))
gaspp = find_program('gas-preprocessor.pl')
use_gaspp = true
gaspp_gen = generator(gaspp,
output: '@BASENAME@.obj',
arguments: [
'-as-type', 'armasm',
'-arch', host_machine.cpu_family(),
'--',
host_machine.cpu_family() == 'aarch64' ? 'armasm64' : 'armasm',
'-nologo',
'-I@0@'.format(dav1d_src_root),
'-I@0@/'.format(meson.current_build_dir()),
'@INPUT@',
'-c',
'-o', '@OUTPUT@'
])
endif
if is_asm_enabled and host_machine.cpu_family().startswith('riscv')
as_option_code = '''__asm__ (
".option arch, +v\n"


@ -308,6 +308,8 @@ static int cdef_find_dir_c(const pixel *img, const ptrdiff_t stride,
#include "src/arm/cdef.h"
#elif ARCH_PPC64LE
#include "src/ppc/cdef.h"
#elif ARCH_RISCV
#include "src/riscv/cdef.h"
#elif ARCH_X86
#include "src/x86/cdef.h"
#elif ARCH_LOONGARCH64
@ -326,6 +328,8 @@ COLD void bitfn(dav1d_cdef_dsp_init)(Dav1dCdefDSPContext *const c) {
cdef_dsp_init_arm(c);
#elif ARCH_PPC64LE
cdef_dsp_init_ppc(c);
#elif ARCH_RISCV
cdef_dsp_init_riscv(c);
#elif ARCH_X86
cdef_dsp_init_x86(c);
#elif ARCH_LOONGARCH64


@ -732,6 +732,8 @@ static void pal_pred_c(pixel *dst, const ptrdiff_t stride,
#if HAVE_ASM
#if ARCH_AARCH64 || ARCH_ARM
#include "src/arm/ipred.h"
#elif ARCH_RISCV
#include "src/riscv/ipred.h"
#elif ARCH_X86
#include "src/x86/ipred.h"
#elif ARCH_LOONGARCH64
@ -769,6 +771,8 @@ COLD void bitfn(dav1d_intra_pred_dsp_init)(Dav1dIntraPredDSPContext *const c) {
#if HAVE_ASM
#if ARCH_AARCH64 || ARCH_ARM
intra_pred_dsp_init_arm(c);
#elif ARCH_RISCV
intra_pred_dsp_init_riscv(c);
#elif ARCH_X86
intra_pred_dsp_init_x86(c);
#elif ARCH_LOONGARCH64


@ -545,15 +545,21 @@ endconst
vmulev_vmaddod_lsx vr5, vr18, vr20, vr20, vr16, \out3, \sz
vneg.h vr21, vr20
vmulev_vmaddod_lsx vr5, vr18, vr20, vr21, vr17, \out4, \sz
vssrarni.h.w \out3, vr16, 12 // out3
vsrari.w vr16, vr16, 12
vsrari.w \out3, \out3, 12
vneg.w vr16, vr16
vneg.w \out3, \out3
vssrarni.h.w \out3, vr16, 0 // out3
vssrarni.h.w \out4, vr17, 12 // out4
vneg.h \out3, \out3
vmulev_vmaddod_lsx vr19, vr23, vr20, vr20, vr16, \out2, \sz
vmulev_vmaddod_lsx vr19, vr23, vr20, vr21, vr17, \out5, \sz
vssrarni.h.w \out5, vr17, 12 // out5
vssrarni.h.w \out2, vr16, 12 // out2
vneg.h \out5, \out5
vsrari.w vr17, vr17, 12
vsrari.w \out5, \out5, 12
vneg.w vr17, vr17
vneg.w \out5, \out5
vssrarni.h.w \out5, vr17, 0 // out5
.endm
functionl inv_adst_8h_x8_lsx
@ -1512,24 +1518,38 @@ endconst
vmulev_vmaddod_lsx vr17, vr7, vr20, vr20, vr6, vr10, \sz
vneg.h vr21, vr20
vmulev_vmaddod_lsx vr17, vr7, vr20, vr21, vr16, vr1, \sz
vssrarni.h.w vr10, vr6, 12 // out[7]
vssrarni.h.w vr1, vr16, 12 // out[8]
vneg.h vr10, vr10
vsrari.w vr6, vr6, 12
vsrari.w vr10, vr10, 12
vneg.w vr6, vr6
vneg.w vr10, vr10
vssrarni.h.w vr10, vr6, 0 // out[7]
vmulev_vmaddod_lsx vr0, vr8, vr20, vr21, vr16, vr17, \sz
vmulev_vmaddod_lsx vr0, vr8, vr20, vr20, vr6, vr7, \sz
vssrarni.h.w vr17, vr16, 12 // out[11]
vssrarni.h.w vr7, vr6, 12 // out[4]
vneg.h vr17, vr17
vsrari.w vr16, vr16, 12
vsrari.w vr17, vr17, 12
vneg.w vr16, vr16
vneg.w vr17, vr17
vssrarni.h.w vr17, vr16, 0 // out[11]
vmulev_vmaddod_lsx vr4, vr19, vr20, vr21, vr16, vr0, \sz
vmulev_vmaddod_lsx vr4, vr19, vr20, vr20, vr6, vr8, \sz
vssrarni.h.w vr0, vr16, 12 // out[9]
vssrarni.h.w vr8, vr6, 12 // out[6]
vneg.h vr0, vr0
vsrari.w vr16, vr16, 12
vsrari.w vr0, vr0, 12
vneg.w vr16, vr16
vneg.w vr0, vr0
vssrarni.h.w vr0, vr16, 0 // out[9]
vmulev_vmaddod_lsx vr11, vr12, vr20, vr20, vr6, vr4, \sz
vmulev_vmaddod_lsx vr11, vr12, vr20, vr21, vr16, vr19, \sz
vssrarni.h.w vr4, vr6, 12 // out[5]
vssrarni.h.w vr19, vr16, 12 // out[10]
vneg.h vr4, vr4
vsrari.w vr6, vr6, 12
vsrari.w vr4, vr4, 12
vneg.w vr6, vr6
vneg.w vr4, vr4
vssrarni.h.w vr4, vr6, 0 // out[5]
.ifc \txfm, adst
vor.v vr12, vr3, vr3
@ -4664,9 +4684,12 @@ endfunc
xvilvl.w xr7, xr1, xr16
xvilvh.w xr10, xr10, xr6
xvilvh.w xr1, xr1, xr16
xvssrarni.h.w xr10, xr17, 12 // out[7]
xvssrarni.h.w xr1, xr7, 12 // out[8]
xvneg.h xr10, xr10
xvsrari.w xr17, xr17, 12
xvsrari.w xr10, xr10, 12
xvneg.w xr17, xr17
xvneg.w xr10, xr10
xvssrarni.h.w xr10, xr17, 0 // out[7]
xvmulev_xvmaddod_lasx xr0, xr8, xr20, xr21, xr16, xr17
xvmulev_xvmaddod_lasx xr0, xr8, xr20, xr20, xr6, xr7
@ -4674,9 +4697,12 @@ endfunc
xvilvl.w xr8, xr7, xr6
xvilvh.w xr17, xr17, xr16
xvilvh.w xr7, xr7, xr6
xvssrarni.h.w xr17, xr0, 12 // out[11]
xvssrarni.h.w xr7, xr8, 12 // out[4]
xvneg.h xr17, xr17
xvsrari.w xr0, xr0, 12
xvsrari.w xr17, xr17, 12
xvneg.w xr0, xr0
xvneg.w xr17, xr17
xvssrarni.h.w xr17, xr0, 0 // out[11]
xvmulev_xvmaddod_lasx xr4, xr19, xr20, xr21, xr16, xr0
xvmulev_xvmaddod_lasx xr4, xr19, xr20, xr20, xr6, xr8
@ -4684,19 +4710,24 @@ endfunc
xvilvl.w xr19, xr8, xr6
xvilvh.w xr0, xr0, xr16
xvilvh.w xr8, xr8, xr6
xvssrarni.h.w xr0, xr4, 12 // out[9]
xvssrarni.h.w xr8, xr19, 12 // out[6]
xvneg.h xr0, xr0
xvsrari.w xr4, xr4, 12
xvsrari.w xr0, xr0, 12
xvneg.w xr4, xr4
xvneg.w xr0, xr0
xvssrarni.h.w xr0, xr4, 0 // out[9]
xvmulev_xvmaddod_lasx xr11, xr12, xr20, xr20, xr6, xr4
xvmulev_xvmaddod_lasx xr11, xr12, xr20, xr21, xr16, xr19
xvilvl.w xr11, xr4, xr6
xvilvl.w xr12, xr19, xr16
xvilvh.w xr4, xr4, xr6
xvilvh.w xr19, xr19, xr16
xvssrarni.h.w xr4, xr11, 12 // out[5]
xvssrarni.h.w xr19, xr12, 12 // out[10]
xvneg.h xr4, xr4
xvsrari.w xr11, xr11, 12
xvsrari.w xr4, xr4, 12
xvneg.w xr11, xr11
xvneg.w xr4, xr4
xvssrarni.h.w xr4, xr11, 0 // out[5]
.endm
function inv_txfm_add_adst_adst_16x16_8bpc_lasx


@ -907,6 +907,8 @@ static void resize_c(pixel *dst, const ptrdiff_t dst_stride,
#include "src/arm/mc.h"
#elif ARCH_LOONGARCH64
#include "src/loongarch/mc.h"
#elif ARCH_RISCV
#include "src/riscv/mc.h"
#elif ARCH_X86
#include "src/x86/mc.h"
#endif
@ -950,6 +952,8 @@ COLD void bitfn(dav1d_mc_dsp_init)(Dav1dMCDSPContext *const c) {
mc_dsp_init_arm(c);
#elif ARCH_LOONGARCH64
mc_dsp_init_loongarch(c);
#elif ARCH_RISCV
mc_dsp_init_riscv(c);
#elif ARCH_X86
mc_dsp_init_x86(c);
#endif


@ -269,9 +269,26 @@ if is_asm_enabled
)
if host_machine.cpu_family() == 'riscv64'
libdav1d_sources += files(
'riscv/64/cdef.S',
'riscv/64/cpu.S',
'riscv/64/itx.S',
'riscv/64/pal.S',
'riscv/64/mc.S',
)
if dav1d_bitdepths.contains('8')
libdav1d_sources += files(
'riscv/64/cdef.S',
'riscv/64/ipred.S',
)
endif
if dav1d_bitdepths.contains('16')
libdav1d_sources += files(
'riscv/64/cdef16.S',
'riscv/64/ipred16.S',
)
endif
endif
endif
endif


@ -61,8 +61,10 @@ static void pal_idx_finish_c(uint8_t *dst, const uint8_t *src,
}
#if HAVE_ASM
#if ARCH_X86
#include "src/x86/pal.h"
#if ARCH_RISCV
#include "riscv/pal.h"
#elif ARCH_X86
#include "x86/pal.h"
#endif
#endif
@ -70,7 +72,9 @@ COLD void dav1d_pal_dsp_init(Dav1dPalDSPContext *const c) {
c->pal_idx_finish = pal_idx_finish_c;
#if HAVE_ASM
#if ARCH_X86
#if ARCH_RISCV
pal_dsp_init_riscv(c);
#elif ARCH_X86
pal_dsp_init_x86(c);
#endif
#endif

third_party/dav1d/src/riscv/64/cdef.S (new vendored file, 703 lines)

@ -0,0 +1,703 @@
/******************************************************************************
* Copyright © 2018, VideoLAN and dav1d authors
* Copyright © 2024, Bogdan Gligorijevic
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
* ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include "src/riscv/asm.S"
.macro constrain_vectors vec1, vec2, vec_sub, strength, shift, vec_tmp1, vec_tmp2
vmslt.vx v0, \vec_tmp1, zero
vneg.v \vec_tmp1, \vec_tmp1, v0.t
vmmv.m v1, v0
vmslt.vx v0, \vec_tmp2, zero
vneg.v \vec_tmp2, \vec_tmp2, v0.t
vsra.vx \vec1, \vec_tmp1, \shift
vsra.vx \vec2, \vec_tmp2, \shift
vrsub.vx \vec1, \vec1, \strength
vrsub.vx \vec2, \vec2, \strength
vmax.vx \vec1, \vec1, zero
vmax.vx \vec2, \vec2, zero
vmin.vv \vec_tmp1, \vec1, \vec_tmp1
vmin.vv \vec_tmp2, \vec2, \vec_tmp2
vneg.v \vec_tmp2, \vec_tmp2, v0.t
vmmv.m v0, v1
vneg.v \vec_tmp1, \vec_tmp1, v0.t
.endm
.macro padding_fn w, h
li t5, -32768 # INT16_MIN
andi t4, a7, 4
li t2, -2 # y_start
.if \w == 4
vsetivli zero, \w + 4, e16, m1, ta, ma
.else
vsetivli zero, \w + 4, e16, m2, ta, ma
.endif
vmv.v.x v0, t5
bnez t4, L(top_done_\w\()x\h)
slli t5, a1, 1
addi t5, t5, 2
slli t5, t5, 1
sub t5, a0, t5
sh1add t4, a1, t5
vse16.v v0, (t5)
vse16.v v0, (t4)
li t2, 0
L(top_done_\w\()x\h):
andi t4, a7, 8
li t3, 2 + \h # y_end
bnez t4, L(bottom_done_\w\()x\h)
li t5, \h
mul t5, a1, t5
addi t5, t5, -2
sh1add t5, t5, a0
sh1add t4, a1, t5
vse16.v v0, (t5)
vse16.v v0, (t4)
addi t3, t3, -2
L(bottom_done_\w\()x\h):
andi t4, a7, 1
li t0, -2 # x_start
.if \w == 4
vsetivli zero, 2, e16, m1, ta, ma
.else
vsetivli zero, 2, e16, m2, ta, ma
.endif
bnez t4, L(left_done_\w\()x\h)
mul t5, a1, t2
addi t5, t5, -2
sh1add t5, t5, a0
sub t0, t3, t2
3:
vse16.v v0, (t5)
sh1add t5, a1, t5
addi t0, t0, -1
bnez t0, 3b
L(left_done_\w\()x\h):
andi t4, a7, 2
li t1, 2 + \w # x_end
bnez t4, L(right_done_\w\()x\h)
mul t5, t2, a1
addi t5, t5, \w
sh1add t5, t5, a0
sub t1, t3, t2
4:
vse16.v v0, (t5)
sh1add t5, a1, t5
addi t1, t1, -1
bnez t1, 4b
li t1, \w
L(right_done_\w\()x\h):
beqz t2, L(top_skip_\w\()x\h)
mul t5, a1, t2
add t5, t0, t5
sh1add a0, t5, a0 # tmp += y_start * tmp_stride + x_start
add a5, a5, t0
sub t5, t1, t0 # x_end - x_start
slli t6, t0, 1
.if \w == 4
vsetvli zero, t5, e16, m1, ta, ma
.else
vsetvli zero, t5, e16, m2, ta, ma
.endif
5:
vle8.v v0, (a5)
addi t2, t2, 1
vzext.vf2 v2, v0
add a5, a3, a5
vse16.v v2, (a0)
sh1add a0, a1, a0
bnez t2, 5b
sub a0, a0, t6 # tmp -= x_start
L(top_skip_\w\()x\h):
li a5, \h
beqz t0, L(left_skip_\w\()x\h)
sh1add a0, t0, a0 # tmp += x_start
7:
.if \w == 4
vsetivli zero, 2, e16, m1, ta, ma
.else
vsetivli zero, 2, e16, m2, ta, ma
.endif
vle8.v v0, (a4)
addi a5, a5, -1
vzext.vf2 v2, v0
addi a4, a4, 2
vse16.v v2, (a0)
sh1add a0, a1, a0
bnez a5, 7b
li a5, \h
mul t5, a1, a5
add t5, t5, t0
slli t5, t5, 1
sub a0, a0, t5 # tmp -= h * tmp_stride + x_start
L(left_skip_\w\()x\h):
8:
.if \w == 4
vsetvli zero, t1, e16, m1, ta, ma
.else
vsetvli zero, t1, e16, m2, ta, ma
.endif
vle8.v v0, (a2)
vzext.vf2 v2, v0
vse16.v v2, (a0)
add a2, a3, a2
sh1add a0, a1, a0
addi a5, a5, -1
bnez a5, 8b
li a5, \h
sh1add a0, t0, a0 # tmp += x_start
add a6, a6, t0 # bottom += x_start
beq a5, t3, L(bottom_skip_\w\()x\h)
sub t5, t1, t0
.if \w == 4
vsetvli zero, t5, e16, m1, ta, ma
.else
vsetvli zero, t5, e16, m2, ta, ma
.endif
9:
vle8.v v0, (a6)
add a6, a3, a6
vzext.vf2 v2, v0
addi a5, a5, 1
vse16.v v2, (a0)
sh1add a0, a1, a0
bne a5, t3, 9b
L(bottom_skip_\w\()x\h):
li t6, \h
mul t6, a3, t6
sub a2, a2, t6 # src -= h * src_stride
mul t5, a1, t3
add t5, t5, t0
slli t5, t5, 1
sub a0, a0, t5 # tmp -= y_end * tmp_stride + x_start
.endm
.macro cdef_fn w, h
function cdef_filter_block_\w\()x\h\()_8bpc_rvv, export=1, ext="v,zba,zbb"
csrw vxrm, zero
addi sp, sp, -32 - 144*2
sd a5, 24(sp) # pri_strength
sd a6, 16(sp) # sec_strength
sd a7, 8(sp) # dir
ld a7, 8 + 32 + 144*2(sp) # edges
mv a6, a4 # bottom
mv a5, a3 # top
mv a4, a2 # left
mv a3, a1 # dst_stride
mv a2, a0 # dst
li a1, 12 # tmp_stride
addi a0, sp, 32 + 2*(2*12+2)
padding_fn \w, \h
ld a4, 32 + 2*144(sp) # damping
ld a5, 24(sp) # pri_strength
ld a6, 16(sp) # sec_strength
ld a7, 8(sp) # dir
beqz a5, cdef_filter_sec_only_\w\()x\h
bnez a6, cdef_filter_pri_sec_\w\()x\h
andi t0, a5, 1
li t1, 4
sub t4, t1, t0
li t1, 63
clz t2, a5
sub t1, t1, t2
sub t1, a4, t1
li t0, \h
la t2, dav1d_cdef_directions
addi t3, a7, 2
sh1add t2, t3, t2
blt zero, t1, 1f
mv t1, zero
1:
vsetivli zero, \w, e16, m1, ta, mu
lb t3, 0(t2)
vle8.v v0, (a2)
vzext.vf2 v2, v0
sh1add t6, t3, a0
slli t3, t3, 1
sub t3, a0, t3
vle16.v v4, (t6)
vle16.v v6, (t3)
vwsub.vv v8, v4, v2
vwsub.vv v16, v6, v2
vsetvli zero, zero, e32, m2, ta, mu
constrain_vectors v4, v6, v12, a5, t1, v8, v16
vmul.vx v28, v16, t4
vmacc.vx v28, t4, v8
lb t3, 1(t2)
andi t5, t4, 3
ori t5, t5, 2
sh1add t6, t3, a0
slli t3, t3, 1
sub t3, a0, t3
vsetvli zero, zero, e16, m1, ta, mu
vle16.v v4, (t6)
vle16.v v6, (t3)
vwsub.vv v8, v4, v2
vwsub.vv v16, v6, v2
vsetvli zero, zero, e32, m2, ta, mu
constrain_vectors v4, v6, v12, a5, t1, v8, v16
vmacc.vx v28, t5, v16
vmacc.vx v28, t5, v8
vmslt.vx v0, v28, zero
vadd.vi v28, v28, -1, v0.t
vsetvli zero, zero, e16, m1, ta, ma
vnclip.wi v24, v28, 4
vadd.vv v28, v2, v24
vsetvli zero, zero, e8, mf2, ta, ma
vnclipu.wi v24, v28, 0
vse8.v v24, (a2)
addi t0, t0, -1
add a2, a2, a3
sh1add a0, a1, a0
bnez t0, 1b
addi sp, sp, 32 + 144*2
ret
cdef_filter_sec_only_\w\()x\h:
li t1, 63
clz t2, a6
sub t1, t1, t2
sub t1, a4, t1
li t0, \h
la t2, dav1d_cdef_directions
addi t3, a7, 4
sh1add t3, t3, t2
sh1add t2, a7, t2
2:
vsetivli zero, \w, e16, m1, ta, mu
lb t4, 0(t3)
lb t5, 0(t2)
vle8.v v0, (a2)
vzext.vf2 v2, v0
sh1add t6, t4, a0
slli t4, t4, 1
sub t4, a0, t4
vle16.v v4, (t6)
vle16.v v6, (t4)
sh1add t4, t5, a0
slli t5, t5, 1
sub t5, a0, t5
vle16.v v8, (t4)
vle16.v v10, (t5)
vwsub.vv v12, v4, v2
vwsub.vv v14, v6, v2
vwsub.vv v16, v8, v2
vwsub.vv v18, v10, v2
vsetvli zero, zero, e32, m2, ta, mu
li t4, 2
constrain_vectors v4, v6, v12, a6, t1, v12, v14
constrain_vectors v8, v10, v14, a6, t1, v16, v18
vmul.vx v28, v18, t4
vmacc.vx v28, t4, v16
vmacc.vx v28, t4, v14
vmacc.vx v28, t4, v12
lb t4, 1(t3)
lb t5, 1(t2)
sh1add t6, t4, a0
slli t4, t4, 1
sub t4, a0, t4
vsetvli zero, zero, e16, m1, ta, mu
vle16.v v4, (t6)
vle16.v v6, (t4)
sh1add t4, t5, a0
slli t5, t5, 1
sub t5, a0, t5
vle16.v v8, (t4)
vle16.v v10, (t5)
vwsub.vv v12, v4, v2
vwsub.vv v14, v6, v2
vwsub.vv v16, v8, v2
vwsub.vv v18, v10, v2
vsetvli zero, zero, e32, m2, ta, mu
constrain_vectors v4, v6, v12, a6, t1, v12, v14
constrain_vectors v8, v10, v14, a6, t1, v16, v18
vadd.vv v4, v28, v12
vadd.vv v28, v4, v14
vadd.vv v4, v28, v16
vadd.vv v28, v4, v18
vmslt.vx v0, v28, zero
vadd.vi v28, v28, -1, v0.t
vsetvli zero, zero, e16, m1, ta, ma
vnclip.wi v24, v28, 4
vadd.vv v28, v2, v24
vsetvli zero, zero, e8, mf2, ta, ma
vnclipu.wi v24, v28, 0
vse8.v v24, (a2)
addi t0, t0, -1
add a2, a2, a3
sh1add a0, a1, a0
bnez t0, 2b
addi sp, sp, 32 + 144*2
ret
cdef_filter_pri_sec_\w\()x\h:
li t1, 63
clz t2, a5
clz t3, a6
sub t2, t1, t2
sub t3, t1, t3
sub t1, a4, t2
sub t2, a4, t3
li t0, \h
la t3, dav1d_cdef_directions
blt zero, t1, 3f
mv t1, zero
3:
vsetivli zero, \w, e16, m1, ta, ma
li t4, 4
andi t6, a5, 1
addi t5, a7, 2
sub t4, t4, t6
sh1add t5, t5, t3
vle8.v v0, (a2)
lb t6, 0(t5)
vzext.vf2 v2, v0
sh1add a4, t6, a0
slli t6, t6, 1
sub t6, a0, t6
vle16.v v4, (a4)
vle16.v v6, (t6)
vminu.vv v20, v4, v2
vmax.vv v24, v4, v2
vminu.vv v20, v6, v20
vmax.vv v24, v6, v24
vwsub.vv v8, v4, v2
vwsub.vv v16, v6, v2
vsetvli zero, zero, e32, m2, ta, mu
constrain_vectors v4, v6, v12, a5, t1, v8, v16
vmul.vx v28, v16, t4
vmacc.vx v28, t4, v8
lb t6, 1(t5)
andi t4, t4, 3
ori t4, t4, 2
sh1add a4, t6, a0
slli t6, t6, 1
sub t6, a0, t6
vsetvli zero, zero, e16, m1, ta, ma
vle16.v v4, (a4)
vle16.v v6, (t6)
vminu.vv v20, v4, v20
vmax.vv v24, v4, v24
vminu.vv v20, v6, v20
vmax.vv v24, v6, v24
vwsub.vv v8, v4, v2
vwsub.vv v16, v6, v2
vsetvli zero, zero, e32, m2, ta, mu
constrain_vectors v4, v6, v12, a5, t1, v8, v16
addi t5, a7, 4
vmacc.vx v28, t4, v16
vmacc.vx v28, t4, v8
sh1add t5, t5, t3
lb t6, 0(t5)
sh1add a4, t6, a0
slli t6, t6, 1
sub t6, a0, t6
vsetvli zero, zero, e16, m1, ta, ma
vle16.v v4, (a4)
vle16.v v6, (t6)
vminu.vv v20, v4, v20
vmax.vv v24, v4, v24
vminu.vv v20, v6, v20
vmax.vv v24, v6, v24
vwsub.vv v8, v4, v2
vwsub.vv v16, v6, v2
vsetvli zero, zero, e32, m2, ta, mu
li t6, 2
constrain_vectors v4, v6, v12, a6, t2, v8, v16
vmacc.vx v28, t6, v16
vmacc.vx v28, t6, v8
lb t6, 1(t5)
sh1add a4, t6, a0
slli t6, t6, 1
sub t6, a0, t6
vsetvli zero, zero, e16, m1, ta, ma
vle16.v v4, (a4)
vle16.v v6, (t6)
vminu.vv v20, v4, v20
vmax.vv v24, v4, v24
vminu.vv v20, v6, v20
vmax.vv v24, v6, v24
vwsub.vv v8, v4, v2
vwsub.vv v16, v6, v2
vsetvli zero, zero, e32, m2, ta, mu
constrain_vectors v4, v6, v12, a6, t2, v8, v16
sh1add t5, a7, t3
vadd.vv v4, v28, v8
vadd.vv v28, v4, v16
vsetvli zero, zero, e16, m1, ta, ma
lb t6, 0(t5)
sh1add a4, t6, a0
slli t6, t6, 1
sub t6, a0, t6
vle16.v v4, (a4)
vle16.v v6, (t6)
vminu.vv v20, v4, v20
vmax.vv v24, v4, v24
vminu.vv v20, v6, v20
vmax.vv v24, v6, v24
vwsub.vv v8, v4, v2
vwsub.vv v16, v6, v2
vsetvli zero, zero, e32, m2, ta, mu
li t6, 2
constrain_vectors v4, v6, v12, a6, t2, v8, v16
vmacc.vx v28, t6, v16
vmacc.vx v28, t6, v8
lb t6, 1(t5)
sh1add a4, t6, a0
slli t6, t6, 1
sub t6, a0, t6
vsetvli zero, zero, e16, m1, ta, ma
vle16.v v4, (a4)
vle16.v v6, (t6)
vminu.vv v20, v4, v20
vmax.vv v24, v4, v24
vminu.vv v20, v6, v20
vmax.vv v24, v6, v24
vwsub.vv v8, v4, v2
vwsub.vv v16, v6, v2
vsetvli zero, zero, e32, m2, ta, mu
constrain_vectors v4, v6, v12, a6, t2, v8, v16
vadd.vv v4, v28, v8
vadd.vv v28, v4, v16
vmslt.vx v0, v28, zero
vadd.vi v28, v28, -1, v0.t
vsetvli zero, zero, e16, m1, ta, mu
vnclip.wi v16, v28, 4
vadd.vv v28, v2, v16
vmslt.vv v0, v20, v28
vmerge.vvm v4, v20, v28, v0
vmslt.vv v0, v4, v24
vmerge.vvm v28, v24, v4, v0
vsetvli zero, zero, e8, mf2, ta, ma
vnclipu.wi v24, v28, 0
vse8.v v24, (a2)
addi t0, t0, -1
add a2, a2, a3
sh1add a0, a1, a0
bnez t0, 3b
addi sp, sp, 32 + 144*2
ret
endfunc
.endm
cdef_fn 4, 4
cdef_fn 4, 8
cdef_fn 8, 8

third_party/dav1d/src/riscv/64/cdef16.S (new vendored file, 689 lines)

@ -0,0 +1,689 @@
/******************************************************************************
* Copyright © 2018, VideoLAN and dav1d authors
* Copyright © 2024, Bogdan Gligorijevic
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
* ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include "src/riscv/asm.S"
.macro constrain_vectors vec1, vec2, vec_sub, strength, shift, vec_tmp1, vec_tmp2
vmslt.vx v0, \vec_tmp1, zero
vneg.v \vec_tmp1, \vec_tmp1, v0.t
vmmv.m v1, v0
vmslt.vx v0, \vec_tmp2, zero
vneg.v \vec_tmp2, \vec_tmp2, v0.t
vsra.vx \vec1, \vec_tmp1, \shift
vsra.vx \vec2, \vec_tmp2, \shift
vrsub.vx \vec1, \vec1, \strength
vrsub.vx \vec2, \vec2, \strength
vmax.vx \vec1, \vec1, zero
vmax.vx \vec2, \vec2, zero
vmin.vv \vec_tmp1, \vec1, \vec_tmp1
vmin.vv \vec_tmp2, \vec2, \vec_tmp2
vneg.v \vec_tmp2, \vec_tmp2, v0.t
vmmv.m v0, v1
vneg.v \vec_tmp1, \vec_tmp1, v0.t
.endm
.macro padding_fn w, h
li t5, -32768 # INT16_MIN
andi t4, a7, 4
li t2, -2 # y_start
.if \w == 4
vsetivli zero, \w + 4, e16, m1, ta, ma
.else
vsetivli zero, \w + 4, e16, m2, ta, ma
.endif
vmv.v.x v0, t5
bnez t4, L(top_done_\w\()x\h)
slli t5, a1, 1
addi t5, t5, 2
slli t5, t5, 1
sub t5, a0, t5
sh1add t4, a1, t5
vse16.v v0, (t5)
vse16.v v0, (t4)
li t2, 0
L(top_done_\w\()x\h):
andi t4, a7, 8
li t3, 2 + \h # y_end
bnez t4, L(bottom_done_\w\()x\h)
li t5, \h
mul t5, a1, t5
addi t5, t5, -2
sh1add t5, t5, a0
sh1add t4, a1, t5
vse16.v v0, (t5)
vse16.v v0, (t4)
addi t3, t3, -2
L(bottom_done_\w\()x\h):
andi t4, a7, 1
li t0, -2 # x_start
.if \w == 4
vsetivli zero, 2, e16, m1, ta, ma
.else
vsetivli zero, 2, e16, m2, ta, ma
.endif
bnez t4, L(left_done_\w\()x\h)
mul t5, a1, t2
addi t5, t5, -2
sh1add t5, t5, a0
sub t0, t3, t2
3:
vse16.v v0, (t5)
sh1add t5, a1, t5
addi t0, t0, -1
bnez t0, 3b
L(left_done_\w\()x\h):
andi t4, a7, 2
li t1, 2 + \w # x_end
bnez t4, L(right_done_\w\()x\h)
mul t5, t2, a1
addi t5, t5, \w
sh1add t5, t5, a0
sub t1, t3, t2
4:
vse16.v v0, (t5)
sh1add t5, a1, t5
addi t1, t1, -1
bnez t1, 4b
li t1, \w
L(right_done_\w\()x\h):
beqz t2, L(top_skip_\w\()x\h)
mul t5, a1, t2
add t5, t0, t5
sh1add a0, t5, a0 # tmp += y_start * tmp_stride + x_start
sh1add a5, t0, a5 # top += x_start
sub t5, t1, t0
slli t6, t0, 1
.if \w == 4
vsetvli zero, t5, e16, m1, ta, ma
.else
vsetvli zero, t5, e16, m2, ta, ma
.endif
5:
vle16.v v2, (a5)
addi t2, t2, 1
add a5, a3, a5
vse16.v v2, (a0)
sh1add a0, a1, a0
bnez t2, 5b
sub a0, a0, t6 # tmp -= x_start
L(top_skip_\w\()x\h):
li a5, \h
beqz t0, L(left_skip_\w\()x\h)
sh1add a0, t0, a0 # tmp += x_start
7:
.if \w == 4
vsetivli zero, 2, e16, m1, ta, ma
.else
vsetivli zero, 2, e16, m2, ta, ma
.endif
vle16.v v2, (a4)
addi a5, a5, -1
addi a4, a4, 4
vse16.v v2, (a0)
sh1add a0, a1, a0
bnez a5, 7b
li a5, \h
mul t5, a1, a5
add t5, t5, t0
slli t5, t5, 1
sub a0, a0, t5 # tmp -= h * tmp_stride + x_start
L(left_skip_\w\()x\h):
8:
.if \w == 4
vsetvli zero, t1, e16, m1, ta, ma
.else
vsetvli zero, t1, e16, m2, ta, ma
.endif
vle16.v v2, (a2)
add a2, a3, a2
vse16.v v2, (a0)
sh1add a0, a1, a0
addi a5, a5, -1
bnez a5, 8b
li a5, \h
sh1add a0, t0, a0 # tmp += x_start
sh1add a6, t0, a6 # bottom += x_start
beq a5, t3, L(bottom_skip_\w\()x\h)
sub t5, t1, t0
.if \w == 4
vsetvli zero, t5, e16, m1, ta, ma
.else
vsetvli zero, t5, e16, m2, ta, ma
.endif
9:
vle16.v v2, (a6)
add a6, a3, a6
addi a5, a5, 1
vse16.v v2, (a0)
sh1add a0, a1, a0
bne a5, t3, 9b
L(bottom_skip_\w\()x\h):
li t6, \h
mul t6, a3, t6
sub a2, a2, t6 # src -= h * PXSTRIDE(src_stride)
mul t5, a1, t3
add t5, t5, t0
slli t5, t5, 1
sub a0, a0, t5 # tmp -= y_end * tmp_stride + x_start
.endm
.macro cdef_fn w, h
function cdef_filter_block_\w\()x\h\()_16bpc_rvv, export=1, ext="v,zba,zbb"
csrw vxrm, zero
addi sp, sp, -32 - 144*2
sd a5, 24(sp) # pri_strength
sd a6, 16(sp) # sec_strength
sd a7, 8(sp) # dir
ld a7, 8 + 32 + 144*2(sp) # edges
mv a6, a4 # bottom
mv a5, a3 # top
mv a4, a2 # left
mv a3, a1 # dst_stride
mv a2, a0 # dst
li a1, 12 # tmp_stride
addi a0, sp, 32 + 2*(2*12+2)
padding_fn \w, \h
ld a4, 32 + 2*144(sp) # damping
ld a5, 24(sp) # pri_strength
ld a6, 16(sp) # sec_strength
ld a7, 8(sp) # dir
beqz a5, cdef_filter_sec_only_\w\()x\h
bnez a6, cdef_filter_pri_sec_\w\()x\h
li t1, 64-8
ld t4, 32 + 2*144 + 16(sp) # bitdepth_max
clz t4, t4
sub t4, t1, t4
sra t4, a5, t4
andi t0, t4, 1
li t1, 4
sub t4, t1, t0
li t1, 63
clz t2, a5
sub t1, t1, t2
sub t1, a4, t1
li t0, \h
la t2, dav1d_cdef_directions
addi t3, a7, 2
sh1add t2, t3, t2
vsetivli zero, \w, e16, m1, ta, ma
blt zero, t1, 1f
mv t1, zero
1:
lb t3, 0(t2)
vle16.v v2, (a2)
sh1add t6, t3, a0
slli t3, t3, 1
sub t3, a0, t3
vle16.v v4, (t6)
vle16.v v6, (t3)
vwsub.vv v8, v4, v2
vwsub.vv v16, v6, v2
vsetvli zero, zero, e32, m2, ta, mu
constrain_vectors v4, v6, v2, a5, t1, v8, v16
vmul.vx v28, v16, t4
vmacc.vx v28, t4, v8
lb t3, 1(t2)
andi t5, t4, 3
ori t5, t5, 2
sh1add t6, t3, a0
slli t3, t3, 1
sub t3, a0, t3
vsetvli zero, zero, e16, m1, ta, ma
vle16.v v4, (t6)
vle16.v v6, (t3)
vwsub.vv v8, v4, v2
vwsub.vv v16, v6, v2
vsetvli zero, zero, e32, m2, ta, mu
constrain_vectors v4, v6, v2, a5, t1, v8, v16
vmacc.vx v28, t5, v16
vmacc.vx v28, t5, v8
vmslt.vx v0, v28, zero
vadd.vi v28, v28, -1, v0.t
vsetvli zero, zero, e16, m1, ta, ma
vnclip.wi v24, v28, 4
vadd.vv v28, v2, v24
vse16.v v28, (a2)
add a2, a2, a3
sh1add a0, a1, a0
addi t0, t0, -1
bnez t0, 1b
addi sp, sp, 32 + 144*2
ret
cdef_filter_sec_only_\w\()x\h:
li t1, 63
clz t2, a6
sub t1, t1, t2
sub t1, a4, t1
li t0, \h
la t2, dav1d_cdef_directions
addi t3, a7, 4
sh1add t3, t3, t2
sh1add t2, a7, t2
vsetivli zero, \w, e16, m1, ta, ma
2:
lb t4, 0(t3)
lb t5, 0(t2)
vle16.v v2, (a2)
sh1add t6, t4, a0
slli t4, t4, 1
sub t4, a0, t4
vle16.v v4, (t6)
vle16.v v6, (t4)
sh1add t4, t5, a0
slli t5, t5, 1
sub t5, a0, t5
vle16.v v8, (t4)
vle16.v v10, (t5)
vwsub.vv v12, v4, v2
vwsub.vv v14, v6, v2
vwsub.vv v16, v8, v2
vwsub.vv v18, v10, v2
vsetvli zero, zero, e32, m2, ta, mu
li t4, 2
constrain_vectors v4, v6, v2, a6, t1, v12, v14
constrain_vectors v8, v10, v2, a6, t1, v16, v18
vmul.vx v28, v18, t4
vmacc.vx v28, t4, v16
vmacc.vx v28, t4, v14
vmacc.vx v28, t4, v12
lb t4, 1(t3)
lb t5, 1(t2)
sh1add t6, t4, a0
slli t4, t4, 1
sub t4, a0, t4
vsetvli zero, zero, e16, m1, ta, ma
vle16.v v4, (t6)
vle16.v v6, (t4)
sh1add t4, t5, a0
slli t5, t5, 1
sub t5, a0, t5
vle16.v v8, (t4)
vle16.v v10, (t5)
vwsub.vv v12, v4, v2
vwsub.vv v14, v6, v2
vwsub.vv v16, v8, v2
vwsub.vv v18, v10, v2
vsetvli zero, zero, e32, m2, ta, mu
constrain_vectors v4, v6, v2, a6, t1, v12, v14
constrain_vectors v8, v10, v2, a6, t1, v16, v18
vadd.vv v4, v28, v12
vadd.vv v28, v4, v14
vadd.vv v4, v28, v16
vadd.vv v28, v4, v18
vmslt.vx v0, v28, zero
vadd.vi v28, v28, -1, v0.t
vsetvli zero, zero, e16, m1, ta, ma
vnclip.wi v24, v28, 4
vadd.vv v28, v2, v24
vse16.v v28, (a2)
add a2, a2, a3
sh1add a0, a1, a0
addi t0, t0, -1
bnez t0, 2b
addi sp, sp, 32 + 144*2
ret
cdef_filter_pri_sec_\w\()x\h:
li t1, 63
clz t2, a5
clz t3, a6
sub t2, t1, t2
sub t3, t1, t3
sub t1, a4, t2
sub t2, a4, t3
li t0, \h
la t3, dav1d_cdef_directions
vsetivli zero, \w, e16, m1, ta, ma
blt zero, t1, 3f
mv t1, zero
3:
li t5, 64-8
ld t4, 32 + 2*144 + 16(sp) # bitdepth_max
clz t4, t4
sub t4, t5, t4
sra t4, a5, t4
li t6, 4
andi t5, t4, 1
sub t4, t6, t5
addi t5, a7, 2
sh1add t5, t5, t3
vle16.v v2, (a2)
lb t6, 0(t5)
sh1add a4, t6, a0
slli t6, t6, 1
sub t6, a0, t6
vle16.v v4, (a4)
vle16.v v6, (t6)
vminu.vv v20, v4, v2
vmax.vv v24, v4, v2
vminu.vv v20, v6, v20
vmax.vv v24, v6, v24
vwsub.vv v8, v4, v2
vwsub.vv v16, v6, v2
vsetvli zero, zero, e32, m2, ta, mu
constrain_vectors v4, v6, v2, a5, t1, v8, v16
vmul.vx v28, v16, t4
vmacc.vx v28, t4, v8
andi t4, t4, 3
ori t4, t4, 2
lb t6, 1(t5)
sh1add a4, t6, a0
slli t6, t6, 1
sub t6, a0, t6
vsetvli zero, zero, e16, m1, ta, ma
vle16.v v4, (a4)
vle16.v v6, (t6)
vminu.vv v20, v4, v20
vmax.vv v24, v4, v24
vminu.vv v20, v6, v20
vmax.vv v24, v6, v24
vwsub.vv v8, v4, v2
vwsub.vv v16, v6, v2
vsetvli zero, zero, e32, m2, ta, mu
constrain_vectors v4, v6, v2, a5, t1, v8, v16
addi t5, a7, 4
vmacc.vx v28, t4, v16
vmacc.vx v28, t4, v8
sh1add t5, t5, t3
lb t6, 0(t5)
sh1add a4, t6, a0
slli t6, t6, 1
sub t6, a0, t6
vsetvli zero, zero, e16, m1, ta, ma
vle16.v v4, (a4)
vle16.v v6, (t6)
vminu.vv v20, v4, v20
vmax.vv v24, v4, v24
vminu.vv v20, v6, v20
vmax.vv v24, v6, v24
vwsub.vv v8, v4, v2
vwsub.vv v16, v6, v2
vsetvli zero, zero, e32, m2, ta, mu
li t6, 2
constrain_vectors v4, v6, v2, a6, t2, v8, v16
vmacc.vx v28, t6, v16
vmacc.vx v28, t6, v8
lb t6, 1(t5)
sh1add a4, t6, a0
slli t6, t6, 1
sub t6, a0, t6
vsetvli zero, zero, e16, m1, ta, ma
vle16.v v4, (a4)
vle16.v v6, (t6)
vminu.vv v20, v4, v20
vmax.vv v24, v4, v24
vminu.vv v20, v6, v20
vmax.vv v24, v6, v24
vwsub.vv v8, v4, v2
vwsub.vv v16, v6, v2
vsetvli zero, zero, e32, m2, ta, mu
constrain_vectors v4, v6, v2, a6, t2, v8, v16
sh1add t5, a7, t3
vadd.vv v4, v28, v8
vadd.vv v28, v4, v16
vsetvli zero, zero, e16, m1, ta, ma
lb t6, 0(t5)
sh1add a4, t6, a0
slli t6, t6, 1
sub t6, a0, t6
vle16.v v4, (a4)
vle16.v v6, (t6)
vminu.vv v20, v4, v20
vmax.vv v24, v4, v24
vminu.vv v20, v6, v20
vmax.vv v24, v6, v24
vwsub.vv v8, v4, v2
vwsub.vv v16, v6, v2
vsetvli zero, zero, e32, m2, ta, mu
li t6, 2
constrain_vectors v4, v6, v2, a6, t2, v8, v16
vmacc.vx v28, t6, v16
vmacc.vx v28, t6, v8
lb t6, 1(t5)
sh1add a4, t6, a0
slli t6, t6, 1
sub t6, a0, t6
vsetvli zero, zero, e16, m1, ta, ma
vle16.v v4, (a4)
vle16.v v6, (t6)
vminu.vv v20, v4, v20
vmax.vv v24, v4, v24
vminu.vv v20, v6, v20
vmax.vv v24, v6, v24
vwsub.vv v8, v4, v2
vwsub.vv v16, v6, v2
vsetvli zero, zero, e32, m2, ta, mu
constrain_vectors v4, v6, v2, a6, t2, v8, v16
vadd.vv v4, v28, v8
vadd.vv v28, v4, v16
vmslt.vx v0, v28, zero
vadd.vi v28, v28, -1, v0.t
vsetvli zero, zero, e16, m1, ta, ma
vnclip.wi v16, v28, 4
vadd.vv v28, v2, v16
vmslt.vv v0, v20, v28
vmerge.vvm v4, v20, v28, v0
vmslt.vv v0, v4, v24
vmerge.vvm v28, v24, v4, v0
vse16.v v28, (a2)
add a2, a2, a3
sh1add a0, a1, a0
addi t0, t0, -1
bnez t0, 3b
addi sp, sp, 32 + 144*2
ret
endfunc
.endm
cdef_fn 4, 4
cdef_fn 4, 8
cdef_fn 8, 8


@ -42,3 +42,8 @@ function has_compliant_rvv, export=1, ext=v
sgtz a0, a0
ret
endfunc
function get_vlenb, export=1
csrr a0, vlenb
ret
endfunc

third_party/dav1d/src/riscv/64/ipred.S (new vendored file, 461 lines)

@ -0,0 +1,461 @@
/******************************************************************************
* Copyright © 2018, VideoLAN and dav1d authors
* Copyright © 2024, Bogdan Gligorijevic
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
* ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include "src/riscv/asm.S"
function dc_gen_8bpc_rvv, export=1, ext="v,zbb"
.variant_cc dav1d_dc_gen_8bpc_rvv
add t1, a1, a2
srli t5, t1, 1
mv t1, a1
addi t2, a0, 1
vsetvli zero, t1, e16, m4, ta, ma
vmv.v.x v0, zero
1:
vsetvli t3, t1, e8, m2, tu, ma
vle8.v v4, (t2)
vwaddu.wv v0, v0, v4
sub t1, t1, t3
add t2, t2, t3
bnez t1, 1b
mv t1, a2
mv t2, a0
vsetvli zero, t1, e16, m4, ta, ma
vmv.v.x v8, zero
2:
vsetvli t3, t1, e8, m2, tu, ma
sub t2, t2, t3
vle8.v v4, (t2)
vwaddu.wv v8, v8, v4
sub t1, t1, t3
bnez t1, 2b
vsetvli zero, zero, e32, m8, ta, ma
vmv.s.x v16, t5
vmv.s.x v12, zero
vsetvli zero, a1, e16, m4, ta, ma
vwredsum.vs v24, v0, v16
vsetvli zero, a2, e16, m4, ta, ma
vwredsum.vs v16, v8, v12
vsetvli zero, zero, e32, m8, ta, ma
vmv.x.s t5, v24
vmv.x.s t1, v16
add t5, t5, t1
add t1, a1, a2
ctz t1, t1
srl a0, t5, t1
beq a1, a2, 5f
slli t1, a1, 1
sltu t2, t1, a2
slli t3, a2, 1
sltu t1, t3, a1
or t1, t1, t2
bnez t1, 3f
li t1, 0x5556
j 4f
3:
li t1, 0x3334
4:
mul a0, a0, t1
srli a0, a0, 16
5:
jr t0
endfunc
function dc_gen_top_8bpc_rvv, export=1, ext="v,zbb"
.variant_cc dav1d_dc_gen_top_8bpc_rvv
mv t1, a1
srli t5, a1, 1
addi a0, a0, 1
vsetvli zero, t1, e16, m4, ta, ma
vmv.v.x v0, zero
1:
vsetvli t3, t1, e8, m2, tu, ma
vle8.v v4, (a0)
vwaddu.wv v0, v0, v4
sub t1, t1, t3
add a0, a0, t3
bnez t1, 1b
j dc_gen_sum_up_8bpc_rvv
endfunc
function dc_gen_left_8bpc_rvv, export=1, ext="v,zbb"
.variant_cc dav1d_dc_gen_left_8bpc_rvv
mv t1, a1
srli t5, a1, 1
vsetvli t2, t1, e16, m4, ta, ma
vmv.v.x v0, zero
1:
vsetvli t3, t1, e8, m2, tu, ma
sub a0, a0, t3
vle8.v v4, (a0)
vwaddu.wv v0, v0, v4
sub t1, t1, t3
bnez t1, 1b
j dc_gen_sum_up_8bpc_rvv
endfunc
function dc_gen_sum_up_8bpc_rvv, export=1, ext="v,zbb"
.variant_cc dav1d_dc_gen_sum_up_8bpc_rvv
vsetvli zero, a1, e32, m8, ta, ma
vmv.s.x v4, t5
vsetvli zero, zero, e16, m4, ta, ma
vwredsum.vs v8, v0, v4
vsetvli zero, zero, e32, m8, ta, ma
vmv.x.s t5, v8
ctz t1, a1
srl a0, t5, t1
jr t0
endfunc
function cfl_pred_8bpc_rvv, export=1, ext="v,zba"
csrw vxrm, zero
1:
li t2, 0
mv t3, a2
2:
vsetvli t0, t3, e16, m2, ta, ma
add t4, a0, t2
vle16.v v0, (a5)
sh1add a5, t0, a5
vwmul.vx v4, v0, a6
vsetvli zero, zero, e32, m4, ta, mu
vneg.v v8, v4
vmslt.vx v0, v4, x0
vmax.vv v12, v8, v4
vssra.vi v16, v12, 6
vneg.v v16, v16, v0.t
vadd.vx v20, v16, a4
vmax.vx v0, v20, zero
vsetvli zero, zero, e16, m2, ta, ma
vnclipu.wi v4, v0, 0
vsetvli zero, zero, e8, m1, ta, ma
vnclipu.wi v0, v4, 0
vse8.v v0, (t4)
add t2, t0, t2
sub t3, t3, t0
bnez t3, 2b
addi a3, a3, -1
add a0, a0, a1
bnez a3, 1b
ret
endfunc
function ipred_cfl_8bpc_rvv, export=1, ext=v
mv t6, a0 # dst
mv a0, a2 # topleft
mv t4, a1 # stride
mv a1, a3 # width
mv a2, a4 # height
jal t0, dc_gen_8bpc_rvv
mv a2, a3 # width
mv a3, a4 # height
mv a4, a0 # dc_get_top
mv a0, t6 # dst
mv a1, t4 # stride
j cfl_pred_8bpc_rvv
endfunc
function ipred_cfl_128_8bpc_rvv, export=1, ext="v,zba"
# dc = 128, then just rearrange registers
mv a2, a3
mv a3, a4
li a4, 128
j cfl_pred_8bpc_rvv
endfunc
function ipred_cfl_top_8bpc_rvv, export=1, ext=v
mv t6, a0 # dst
mv a0, a2 # topleft
mv t4, a1 # stride
mv a1, a3 # width
jal t0, dc_gen_top_8bpc_rvv
mv a3, a4 # height
mv a4, a0 # dc_get_top
mv a0, t6 # dst
mv a2, a1 # width
mv a1, t4 # stride
j cfl_pred_8bpc_rvv
endfunc
function ipred_cfl_left_8bpc_rvv, export=1, ext="v,zba"
mv t6, a0 # dst
mv a0, a2 # topleft
mv t4, a1 # stride
mv a1, a4 # height
mv a2, a3 # width
jal t0, dc_gen_left_8bpc_rvv
mv a3, a4 # height
mv a4, a0 # dc_get_left
mv a1, t4 # stride
mv a0, t6 # dst
j cfl_pred_8bpc_rvv
endfunc
function ipred_paeth_8bpc_rvv, export=1, ext="v,zba"
csrw vxrm, zero
li t0, 0
mv t3, a2
lbu t1, (a2)
addi a6, a2, -1
addi a2, a2, 1
1:
lbu t2, (a6)
mv t3, a3
2:
sub t5, a3, t3
add t5, a2, t5
vsetvli t6, t3, e8, m1, ta, ma
vle8.v v2, (t5)
vwaddu.vx v4, v2, t2
vsetvli zero, zero, e16, m2, ta, ma
vwsub.vx v8, v4, t1
vsetvli zero, zero, e32, m4, ta, mu
vzext.vf4 v24, v2
vsub.vx v12, v8, t1
vmslt.vx v0, v12, zero
vneg.v v12, v12, v0.t
vsub.vx v16, v8, t2
vmslt.vx v0, v16, zero
vneg.v v16, v16, v0.t
vsub.vv v20, v8, v24
vmslt.vx v0, v20, zero
vneg.v v20, v20, v0.t
sub t5, a3, t3
vmsleu.vv v4, v16, v20
vmsleu.vv v5, v16, v12
vmsgtu.vv v0, v20, v12
vmand.mm v6, v4, v5
vsetvli zero, zero, e8, m1, ta, ma
vmerge.vxm v8, v2, t1, v0
vmmv.m v0, v6
add t5, a0, t5
sub t3, t3, t6
vmerge.vxm v4, v8, t2, v0
vse8.v v4, (t5)
bnez t3, 2b
addi a4, a4, -1
addi a6, a6, -1
add a0, a0, a1
bnez a4, 1b
ret
endfunc
function ipred_smooth_8bpc_rvv, export=1, ext="v,zba"
csrw vxrm, zero
la t0, dav1d_sm_weights
add t1, t0, a3
add t2, a2, a3
add t0, t0, a4
lbu t2, (t2)
sub t3, a2, a4
addi a6, a2, -1
addi a2, a2, 1
lbu t3, (t3)
1:
mv t6, a3
lbu a7, (a6)
lbu t4, (t0)
2:
li a5, 256
vsetvli t5, t6, e8, m1, ta, ma
vle8.v v2, (t1)
add t1, t1, t5
vle8.v v4, (a2)
add a2, a2, t5
sub a5, a5, t4
vwmulu.vx v8, v4, t4
vsetvli zero, zero, e16, m2, ta, ma
mul a5, a5, t3
vadd.vx v4, v8, a5
vsetvli zero, zero, e8, m1, ta, ma
vwmulu.vx v8, v2, a7
vneg.v v12, v2
vwmaccu.vx v8, t2, v12
vsetvli zero, zero, e16, m2, ta, ma
vwaddu.vv v12, v4, v8
sub a5, a3, t6
sub t6, t6, t5
add a5, a5, a0
vnclipu.wi v2, v12, 9
vsetvli zero, zero, e8, m1, ta, ma
vnclipu.wi v0, v2, 0
vse8.v v0, (a5)
bnez t6, 2b
sub t1, t1, a3
add a0, a0, a1
sub a2, a2, a3
addi a4, a4, -1
addi t0, t0, 1
addi a6, a6, -1
bnez a4, 1b
ret
endfunc
function ipred_smooth_v_8bpc_rvv, export=1, ext="v,zba"
csrw vxrm, zero
la t0, dav1d_sm_weights
add t2, a2, a3
add t0, t0, a4
sub t3, a2, a4
addi a2, a2, 1
lbu t3, (t3)
1:
mv t6, a3
lbu t4, (t0)
2:
li a5, 256
vsetvli t5, t6, e8, m1, ta, ma
vle8.v v4, (a2)
add a2, a2, t5
sub a5, a5, t4
vwmulu.vx v8, v4, t4
vsetvli zero, zero, e16, m2, ta, ma
mul a5, a5, t3
vwaddu.vx v4, v8, a5
sub a5, a3, t6
sub t6, t6, t5
add a5, a5, a0
vsetvli zero, zero, e16, m2, ta, ma
vnclipu.wi v2, v4, 8
vsetvli zero, zero, e8, m1, ta, ma
vnclipu.wi v0, v2, 0
vse8.v v0, (a5)
bnez t6, 2b
add a0, a0, a1
sub a2, a2, a3
addi a4, a4, -1
addi t0, t0, 1
bnez a4, 1b
ret
endfunc
function ipred_smooth_h_8bpc_rvv, export=1, ext="v,zba"
csrw vxrm, zero
la t0, dav1d_sm_weights
add t1, t0, a3
add t2, a2, a3
lbu t2, (t2)
addi a6, a2, -1
1:
mv t6, a3
lbu a7, (a6)
2:
vsetvli t5, t6, e8, m1, ta, ma
vle8.v v2, (t1)
add t1, t1, t5
vwmulu.vx v8, v2, a7
vneg.v v12, v2
vwmaccu.vx v8, t2, v12
sub a5, a3, t6
sub t6, t6, t5
add a5, a5, a0
vsetvli zero, zero, e8, m1, ta, ma
vnclipu.wi v0, v8, 8
vse8.v v0, (a5)
bnez t6, 2b
sub t1, t1, a3
add a0, a0, a1
addi a4, a4, -1
addi a6, a6, -1
bnez a4, 1b
ret
endfunc
function pal_pred_8bpc_rvv, export=1, ext="v,zba"
csrw vxrm, zero
vsetivli t5, 8, e8, m1, ta, ma
vle8.v v30, (a2)
li t0, 2
srli t1, a4, 1
1:
mv t4, a4
2:
vsetvli t5, t1, e8, m1, ta, ma
vle8.v v0, (a3)
add a3, a3, t5
vsrl.vi v2, v0, 4
sub t6, a4, t4
vand.vi v1, v0, 7
add t6, a0, t6
vrgather.vv v3, v30, v1
addi t2, t6, 1
vrgather.vv v4, v30, v2
slli t5, t5, 1
vsse8.v v3, (t6), t0
sub t4, t4, t5
vsse8.v v4, (t2), t0
bnez t4, 2b
addi a5, a5, -1
add a0, a0, a1
bnez a5, 1b
ret
endfunc

third_party/dav1d/src/riscv/64/ipred16.S (new vendored file, 471 lines)

@ -0,0 +1,471 @@
/******************************************************************************
* Copyright © 2018, VideoLAN and dav1d authors
* Copyright © 2024, Bogdan Gligorijevic
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
* ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include "src/riscv/asm.S"
function dc_gen_16bpc_rvv, export=1, ext="v,zba,zbb"
.variant_cc dav1d_dc_gen_8bpc_rvv
add t1, a1, a2
srli t5, t1, 1
mv t1, a1
addi t2, a0, 2
vsetvli zero, t1, e32, m8, ta, ma
vmv.v.x v0, zero
1:
vsetvli t3, t1, e16, m4, tu, ma
vle16.v v8, (t2)
vwaddu.wv v0, v0, v8
sub t1, t1, t3
sh1add t2, t3, t2
bnez t1, 1b
mv t1, a2
mv t2, a0
vsetvli zero, t1, e32, m8, ta, ma
vmv.v.x v16, zero
2:
vsetvli t3, t1, e16, m4, tu, ma
sub t1, t1, t3
sll t3, t3, 1
sub t2, t2, t3
vle16.v v8, (t2)
vwaddu.wv v16, v16, v8
bnez t1, 2b
vsetvli zero, a1, e32, m8, ta, ma
vmv.s.x v24, t5
vmv.s.x v25, zero
vredsum.vs v8, v0, v24
vsetvli zero, a2, e32, m8, ta, ma
vredsum.vs v0, v16, v25
vmv.x.s t5, v8
vmv.x.s t1, v0
add t5, t5, t1
add t1, a1, a2
ctz t1, t1
srl a0, t5, t1
beq a1, a2, 5f
slli t1, a1, 1
sltu t2, t1, a2
slli t3, a2, 1
sltu t1, t3, a1
or t1, t1, t2
bnez t1, 3f
li t1, 0xAAAB
j 4f
3:
li t1, 0x6667
4:
mul a0, a0, t1
li t1, 17
srl a0, a0, t1
5:
jr t0
endfunc
function dc_gen_top_16bpc_rvv, export=1, ext="v,zba,zbb"
.variant_cc dav1d_dc_gen_top_16bpc_rvv
mv t1, a1
srli t5, a1, 1
addi a0, a0, 2
vsetvli zero, t1, e32, m2, ta, ma
vmv.v.x v0, zero
1:
vsetvli t3, t1, e16, m1, tu, ma
vle16.v v4, (a0)
vwaddu.wv v0, v0, v4
sh1add a0, t3, a0
sub t1, t1, t3
bnez t1, 1b
j dc_gen_sum_up_16bpc_rvv
endfunc
function dc_gen_left_16bpc_rvv, export=1, ext="v,zba,zbb"
.variant_cc dav1d_dc_gen_left_16bpc_rvv
mv t1, a1
srli t5, a1, 1
vsetvli zero, t1, e32, m2, ta, ma
vmv.v.x v0, zero
1:
vsetvli t3, t1, e16, m1, tu, ma
sub t1, t1, t3
slli t3, t3, 1
sub a0, a0, t3
vle16.v v4, (a0)
vwaddu.wv v0, v0, v4
bnez t1, 1b
j dc_gen_sum_up_16bpc_rvv
endfunc
function dc_gen_sum_up_16bpc_rvv, export=1, ext="v,zba,zbb"
.variant_cc dav1d_dc_gen_sum_up_16bpc_rvv
vsetvli zero, a1, e32, m2, ta, ma
vmv.s.x v4, t5
vredsum.vs v8, v0, v4
vmv.x.s t5, v8
ctz t1, a1
srl a0, t5, t1
jr t0
endfunc
function cfl_pred_16bpc_rvv, export=1, ext="v,zba"
csrw vxrm, zero
1:
li t2, 0
mv t3, a2
2:
vsetvli t0, t3, e16, m2, ta, ma
sh1add t4, t2, a0
vle16.v v0, (a5)
sh1add a5, t0, a5
vwmul.vx v4, v0, a6
vsetvli zero, zero, e32, m4, ta, mu
vneg.v v8, v4
vmslt.vx v0, v4, x0
vmax.vv v12, v8, v4
vssra.vi v16, v12, 6
vneg.v v16, v16, v0.t
vadd.vx v20, v16, a4
vmax.vx v0, v20, zero
vmin.vx v0, v0, a7
vsetvli zero, zero, e16, m2, ta, ma
vnclipu.wi v4, v0, 0
vse16.v v4, (t4)
add t2, t0, t2
sub t3, t3, t0
bnez t3, 2b
addi a3, a3, -1
add a0, a0, a1
bnez a3, 1b
ret
endfunc
function ipred_cfl_16bpc_rvv, export=1, ext=v
mv t6, a0 # dst
mv a0, a2 # topleft
mv t4, a1 # stride
mv a1, a3 # width
mv a2, a4 # height
jal t0, dc_gen_16bpc_rvv
mv a2, a3 # width
mv a3, a4 # height
mv a4, a0 # dc_get_top
mv a0, t6 # dst
mv a1, t4 # stride
j cfl_pred_16bpc_rvv
endfunc
function ipred_cfl_128_16bpc_rvv, export=1, ext="v,zba"
# dc = (bitdepth_max + 1) >> 1, then just rearrange registers
mv a2, a3
mv a3, a4
addi a4, a7, 1
srli a4, a4, 1
j cfl_pred_16bpc_rvv
endfunc
function ipred_cfl_top_16bpc_rvv, export=1, ext=v
mv t6, a0 # dst
mv a0, a2 # topleft
mv t4, a1 # stride
mv a1, a3 # width
jal t0, dc_gen_top_16bpc_rvv
mv a3, a4 # height
mv a4, a0 # dc_get_top
mv a0, t6 # dst
mv a2, a1 # width
mv a1, t4 # stride
j cfl_pred_16bpc_rvv
endfunc
function ipred_cfl_left_16bpc_rvv, export=1, ext=v
mv t6, a0 # dst
mv a0, a2 # topleft
mv t4, a1 # stride
mv a1, a4 # height
mv a2, a3 # width
jal t0, dc_gen_left_16bpc_rvv
mv a3, a4 # height
mv a4, a0 # dc_get_top
mv a1, t4 # stride
mv a0, t6 # dst
j cfl_pred_16bpc_rvv
endfunc
function ipred_paeth_16bpc_rvv, export=1, ext="v,zba"
csrw vxrm, zero
li t0, 0
mv t3, a2
lhu t1, (a2)
addi a6, a2, -2
addi a2, a2, 2
1:
lhu t2, (a6)
mv t3, a3
2:
sub t5, a3, t3
sh1add t5, t5, a2
vsetvli t6, t3, e16, m2, ta, ma
vle16.v v2, (t5)
vwaddu.vx v4, v2, t2
vsetvli zero, zero, e32, m4, ta, mu
vsub.vx v8, v4, t1
vzext.vf2 v24, v2
vsub.vx v12, v8, t1
vmslt.vx v0, v12, zero
vneg.v v12, v12, v0.t
vsub.vx v16, v8, t2
vmslt.vx v0, v16, zero
vneg.v v16, v16, v0.t
vsub.vv v20, v8, v24
vmslt.vx v0, v20, zero
vneg.v v20, v20, v0.t
sub t5, a3, t3
vmsleu.vv v4, v16, v20
vmsleu.vv v5, v16, v12
vmsgtu.vv v0, v20, v12
vmand.mm v6, v4, v5
vsetvli zero, zero, e16, m2, ta, ma
vmerge.vxm v8, v2, t1, v0
vmmv.m v0, v6
sh1add t5, t5, a0
sub t3, t3, t6
vmerge.vxm v4, v8, t2, v0
vse16.v v4, (t5)
bnez t3, 2b
addi a4, a4, -1
addi a6, a6, -2
add a0, a0, a1
bnez a4, 1b
ret
endfunc
function ipred_smooth_16bpc_rvv, export=1, ext="v,zba"
csrw vxrm, zero
la t0, dav1d_sm_weights
add t1, t0, a3
sh1add t2, a3, a2
slli t3, a4, 1
add t0, t0, a4
lhu t2, (t2)
sub t3, a2, t3
addi a6, a2, -2
addi a2, a2, 2
lhu t3, (t3)
1:
mv t6, a3
lhu a7, (a6)
lbu t4, (t0)
2:
li a5, 256
vsetvli t5, t6, e16, m2, ta, ma
vle8.v v2, (t1)
add t1, t1, t5
vle16.v v4, (a2)
sh1add a2, t5, a2
sub a5, a5, t4
vwmul.vx v8, v4, t4
mul a5, a5, t3
vsetvli zero, zero, e32, m4, ta, ma
vadd.vx v4, v8, a5
li a5, 256
vzext.vf4 v12, v2
vmul.vx v8, v12, a7
vrsub.vx v12, v12, a5
vmacc.vx v8, t2, v12
vadd.vv v12, v4, v8
vsetvli zero, zero, e32, m4, ta, ma
sub a5, a3, t6
sub t6, t6, t5
sh1add a5, a5, a0
vsetvli zero, zero, e16, m2, ta, ma
vnclipu.wi v2, v12, 9
vse16.v v2, (a5)
bnez t6, 2b
sub t1, t1, a3
slli t6, a3, 1
add a0, a0, a1
sub a2, a2, t6
addi a4, a4, -1
addi t0, t0, 1
addi a6, a6, -2
bnez a4, 1b
ret
endfunc
function ipred_smooth_v_16bpc_rvv, export=1, ext="v,zba"
csrw vxrm, zero
la t0, dav1d_sm_weights
slli t3, a4, 1
add t0, t0, a4
sub t3, a2, t3
addi a2, a2, 2
lhu t3, (t3)
1:
mv t6, a3
lbu t4, (t0)
2:
li a5, 256
vsetvli t5, t6, e16, m2, ta, ma
vle16.v v4, (a2)
sh1add a2, t5, a2
sub a5, a5, t4
vwmul.vx v8, v4, t4
mul a5, a5, t3
vsetvli zero, zero, e32, m4, ta, ma
vadd.vx v4, v8, a5
vsetvli zero, zero, e32, m4, ta, ma
sub a5, a3, t6
sub t6, t6, t5
sh1add a5, a5, a0
vsetvli zero, zero, e16, m2, ta, ma
vnclipu.wi v2, v4, 8
vse16.v v2, (a5)
bnez t6, 2b
slli t6, a3, 1
add a0, a0, a1
sub a2, a2, t6
addi a4, a4, -1
addi t0, t0, 1
bnez a4, 1b
ret
endfunc
function ipred_smooth_h_16bpc_rvv, export=1, ext="v,zba"
csrw vxrm, zero
la t0, dav1d_sm_weights
add t1, t0, a3
sh1add t2, a3, a2
lhu t2, (t2)
addi a6, a2, -2
1:
mv t6, a3
lhu a7, (a6)
2:
vsetvli t5, t6, e16, m2, ta, ma
vle8.v v2, (t1)
add t1, t1, t5
li a5, 256
vsetvli zero, zero, e32, m4, ta, ma
vzext.vf4 v12, v2
vmul.vx v8, v12, a7
vrsub.vx v12, v12, a5
vmacc.vx v8, t2, v12
sub a5, a3, t6
sub t6, t6, t5
sh1add a5, a5, a0
vsetvli zero, zero, e16, m2, ta, ma
vnclipu.wi v2, v8, 8
vse16.v v2, (a5)
bnez t6, 2b
sub t1, t1, a3
add a0, a0, a1
addi a4, a4, -1
addi a6, a6, -2
bnez a4, 1b
ret
endfunc
function pal_pred_16bpc_rvv, export=1, ext="v,zba"
csrw vxrm, zero
vsetivli t5, 8, e16, m1, ta, ma
vle16.v v30, (a2)
li t0, 4
srli t1, a4, 1
li t2, 1
1:
mv t4, a4
2:
vsetvli t5, t1, e8, mf2, ta, ma
vle8.v v0, (a3)
add a3, a3, t5
vand.vi v1, v0, 7
sub t6, a4, t4
vsrl.vi v2, v0, 4
vwmul.vx v4, v1, t2
vwmul.vx v6, v2, t2
vsetvli zero, zero, e16, m1, ta, ma
sh1add t6, t6, a0
vrgather.vv v8, v30, v4
addi t3, t6, 2
vrgather.vv v10, v30, v6
slli t5, t5, 1
vsse16.v v8, (t6), t0
vsse16.v v10, (t3), t0
sub t4, t4, t5
bnez t4, 2b
add a0, a0, a1
addi a5, a5, -1
bnez a5, 1b
ret
endfunc


@ -145,17 +145,10 @@ endfunc
vwmacc.vx v20, t2, \o3
vwmacc.vx v22, t3, \o3
li t1, 2048
vwadd.wx v16, v16, t1
vwadd.wx v18, v18, t1
vwadd.wx v20, v20, t1
vwadd.wx v22, v22, t1
vnsra.wi v16, v16, 12
vnsra.wi v18, v18, 12
vnsra.wi v20, v20, 12
vnsra.wi v22, v22, 12
vnclip.wi v16, v16, 12
vnclip.wi v18, v18, 12
vnclip.wi v20, v20, 12
vnclip.wi v22, v22, 12
vsadd.vv \o0, v16, v20
vsadd.vv \o1, v18, v22
@ -192,19 +185,12 @@ endfunc
vadd.vv v18, v18, v22
vsub.vv v22, v24, v22
li t1, 2048
vadd.vx v16, v16, t1
vadd.vx v18, v18, t1
vadd.vx v20, v20, t1
vadd.vx v22, v22, t1
vsetvli zero, zero, e16, \lm, ta, ma
vnsra.wi \o0, v16, 12
vnsra.wi \o1, v18, 12
vnsra.wi \o2, v20, 12
vnsra.wi \o3, v22, 12
vnclip.wi \o0, v16, 12
vnclip.wi \o1, v18, 12
vnclip.wi \o2, v20, 12
vnclip.wi \o3, v22, 12
.endm
function inv_dct_e16_x4_rvv, export=1, ext=v
@ -491,17 +477,10 @@ endfunc
vwmacc.vx v20, t3, \o3
vwmacc.vx v18, t4, \o3
li t1, 2048
vwadd.wx v16, v16, t1
vwadd.wx v18, v18, t1
vwadd.wx v20, v20, t1
vwadd.wx v22, v22, t1
vnsra.wi v16, v16, 12
vnsra.wi v18, v18, 12
vnsra.wi v20, v20, 12
vnsra.wi v22, v22, 12
vnclip.wi v16, v16, 12
vnclip.wi v18, v18, 12
vnclip.wi v20, v20, 12
vnclip.wi v22, v22, 12
vssub.vv \o7, v22, v20
vsadd.vv v22, v22, v20
@ -516,11 +495,8 @@ endfunc
neg t2, t2
vwmacc.vx v18, t2, \o1
vwadd.wx v18, v18, t1
vwadd.wx v20, v20, t1
vnsra.wi v18, v18, 12
vnsra.wi v20, v20, 12
vnclip.wi v18, v18, 12
vnclip.wi v20, v20, 12
vssub.vv \o7, \o0, v22
vsadd.vv \o0, \o0, v22
@ -559,7 +535,6 @@ endfunc
vwmacc.vx v24, t6, v4
vwmacc.vx v26, t5, v4
li t1, 2048
li t2, 1189
li t3, 3920
li t4, 1567
@ -572,23 +547,14 @@ endfunc
vwmacc.vx v28, t3, v6
vwmacc.vx v30, t2, v6
vwadd.wx v16, v16, t1
vwadd.wx v18, v18, t1
vwadd.wx v20, v20, t1
vwadd.wx v22, v22, t1
vwadd.wx v24, v24, t1
vwadd.wx v26, v26, t1
vwadd.wx v28, v28, t1
vwadd.wx v30, v30, t1
vnsra.wi v16, v16, 12
vnsra.wi v18, v18, 12
vnsra.wi v20, v20, 12
vnsra.wi v22, v22, 12
vnsra.wi v24, v24, 12
vnsra.wi v26, v26, 12
vnsra.wi v28, v28, 12
vnsra.wi v30, v30, 12
vnclip.wi v16, v16, 12
vnclip.wi v18, v18, 12
vnclip.wi v20, v20, 12
vnclip.wi v22, v22, 12
vnclip.wi v24, v24, 12
vnclip.wi v26, v26, 12
vnclip.wi v28, v28, 12
vnclip.wi v30, v30, 12
vssub.vv v4, v16, v24
vsadd.vv v16, v16, v24
@ -615,15 +581,10 @@ endfunc
vwmacc.vx v20, t4, v6
vwmacc.vx v18, t5, v5
vwadd.wx v16, v16, t1
vwadd.wx v18, v18, t1
vwadd.wx v20, v20, t1
vwadd.wx v22, v22, t1
vnsra.wi v16, v16, 12
vnsra.wi v18, v18, 12
vnsra.wi v20, v20, 12
vnsra.wi v22, v22, 12
vnclip.wi v16, v16, 12
vnclip.wi v18, v18, 12
vnclip.wi v20, v20, 12
vnclip.wi v22, v22, 12
vsadd.vv \o1, v16, v20
vsadd.vv \o6, v18, v22
@ -640,15 +601,10 @@ endfunc
vwmacc.vx v20, t6, v3
vwmacc.vx v24, t6, v17
vwadd.wx v18, v18, t1
vwadd.wx v20, v20, t1
vwadd.wx v22, v22, t1
vwadd.wx v24, v24, t1
vnsra.wi \o3, v18, 12
vnsra.wi \o4, v20, 12
vnsra.wi \o2, v22, 12
vnsra.wi \o5, v24, 12
vnclip.wi \o3, v18, 12
vnclip.wi \o4, v20, 12
vnclip.wi \o2, v22, 12
vnclip.wi \o5, v24, 12
vmv.v.x v16, zero
vssub.vv \o1, v16, \o1
@ -972,28 +928,18 @@ function inv_dct_e16_x16_rvv, export=1, ext=v
vwmacc.vx v24, t3, v3
vwmacc.vx v22, t4, v3
li t1, 2048
li t2, 2896
li t3, 1567
li t4, 3784
vwadd.wx v16, v16, t1
vwadd.wx v18, v18, t1
vwadd.wx v20, v20, t1
vwadd.wx v22, v22, t1
vwadd.wx v24, v24, t1
vwadd.wx v26, v26, t1
vwadd.wx v28, v28, t1
vwadd.wx v30, v30, t1
vnsra.wi v16, v16, 12
vnsra.wi v18, v18, 12
vnsra.wi v20, v20, 12
vnsra.wi v22, v22, 12
vnsra.wi v24, v24, 12
vnsra.wi v26, v26, 12
vnsra.wi v28, v28, 12
vnsra.wi v30, v30, 12
vnclip.wi v16, v16, 12
vnclip.wi v18, v18, 12
vnclip.wi v20, v20, 12
vnclip.wi v22, v22, 12
vnclip.wi v24, v24, 12
vnclip.wi v26, v26, 12
vnclip.wi v28, v28, 12
vnclip.wi v30, v30, 12
vssub.vv v3, v16, v18
vsadd.vv v16, v16, v18
@ -1015,15 +961,10 @@ function inv_dct_e16_x16_rvv, export=1, ext=v
vwmacc.vx v20, t3, v5
vwmacc.vx v26, t4, v5
vwadd.wx v18, v18, t1
vwadd.wx v20, v20, t1
vwadd.wx v26, v26, t1
vwadd.wx v28, v28, t1
vnsra.wi v18, v18, 12
vnsra.wi v20, v20, 12
vnsra.wi v26, v26, 12
vnsra.wi v28, v28, 12
vnclip.wi v18, v18, 12
vnclip.wi v20, v20, 12
vnclip.wi v26, v26, 12
vnclip.wi v28, v28, 12
vssub.vv v5, v18, v20
vsadd.vv v18, v18, v20
@ -1045,15 +986,10 @@ function inv_dct_e16_x16_rvv, export=1, ext=v
vwmacc.vx v20, t2, v5
vwmacc.vx v22, t2, v7
vwadd.wx v20, v20, t1
vwadd.wx v22, v22, t1
vwadd.wx v24, v24, t1
vwadd.wx v26, v26, t1
vnsra.wi v20, v20, 12
vnsra.wi v22, v22, 12
vnsra.wi v24, v24, 12
vnsra.wi v26, v26, 12
vnclip.wi v20, v20, 12
vnclip.wi v22, v22, 12
vnclip.wi v24, v24, 12
vnclip.wi v26, v26, 12
vssub.vv v15, v0, v30
vsadd.vv v0, v0, v30
@ -1112,25 +1048,14 @@ endfunc
vwmacc.vx v28, t4, v6
vwmacc.vx v30, t3, v6
li t1, 2048
vwadd.wx v16, v16, t1
vwadd.wx v18, v18, t1
vwadd.wx v20, v20, t1
vwadd.wx v22, v22, t1
vwadd.wx v24, v24, t1
vwadd.wx v26, v26, t1
vwadd.wx v28, v28, t1
vwadd.wx v30, v30, t1
vnsra.wi v0, v16, 12
vnsra.wi v18, v18, 12
vnsra.wi v2, v20, 12
vnsra.wi v22, v22, 12
vnsra.wi v4, v24, 12
vnsra.wi v26, v26, 12
vnsra.wi v6, v28, 12
vnsra.wi v30, v30, 12
vnclip.wi v0, v16, 12
vnclip.wi v18, v18, 12
vnclip.wi v2, v20, 12
vnclip.wi v22, v22, 12
vnclip.wi v4, v24, 12
vnclip.wi v26, v26, 12
vnclip.wi v6, v28, 12
vnclip.wi v30, v30, 12
li t1, 2751
li t2, 3035
@ -1149,17 +1074,10 @@ endfunc
vwmacc.vx v24, t4, v10
vwmacc.vx v28, t3, v10
li t1, 2048
vwadd.wx v16, v16, t1
vwadd.wx v20, v20, t1
vwadd.wx v24, v24, t1
vwadd.wx v28, v28, t1
vnsra.wi v16, v16, 12
vnsra.wi v9, v20, 12
vnsra.wi v24, v24, 12
vnsra.wi v11, v28, 12
vnclip.wi v16, v16, 12
vnclip.wi v9, v20, 12
vnclip.wi v24, v24, 12
vnclip.wi v11, v28, 12
vssub.vv v8, v0, v16
vsadd.vv v0, v0, v16
@ -1183,17 +1101,10 @@ endfunc
vwmacc.vx v24, t4, v14
vwmacc.vx v28, t3, v14
li t1, 2048
vwadd.wx v16, v16, t1
vwadd.wx v20, v20, t1
vwadd.wx v24, v24, t1
vwadd.wx v28, v28, t1
vnsra.wi v16, v16, 12
vnsra.wi v13, v20, 12
vnsra.wi v24, v24, 12
vnsra.wi v15, v28, 12
vnclip.wi v16, v16, 12
vnclip.wi v13, v20, 12
vnclip.wi v24, v24, 12
vnclip.wi v15, v28, 12
vssub.vv v12, v4, v16
vsadd.vv v16, v4, v16
@ -1244,28 +1155,18 @@ endfunc
vwmacc.vx v24, t1, v12
vwmacc.vx v28, t3, v14
li t1, 2048
li t2, 2896
li t3, 1567
li t4, 3784
vwadd.wx v16, v16, t1
vwadd.wx v18, v18, t1
vwadd.wx v20, v20, t1
vwadd.wx v22, v22, t1
vwadd.wx v24, v24, t1
vwadd.wx v26, v26, t1
vwadd.wx v28, v28, t1
vwadd.wx v30, v30, t1
vnsra.wi v16, v16, 12
vnsra.wi v18, v18, 12
vnsra.wi v20, v20, 12
vnsra.wi v22, v22, 12
vnsra.wi v24, v24, 12
vnsra.wi v26, v26, 12
vnsra.wi v28, v28, 12
vnsra.wi v30, v30, 12
vnclip.wi v16, v16, 12
vnclip.wi v18, v18, 12
vnclip.wi v20, v20, 12
vnclip.wi v22, v22, 12
vnclip.wi v24, v24, 12
vnclip.wi v26, v26, 12
vnclip.wi v28, v28, 12
vnclip.wi v30, v30, 12
vsadd.vv v8, v16, v24
vsadd.vv v9, v18, v26
@ -1295,23 +1196,14 @@ endfunc
vwmacc.vx v18, t4, v5
vwmacc.vx v26, t4, v13
vwadd.wx v16, v16, t1
vwadd.wx v18, v18, t1
vwadd.wx v20, v20, t1
vwadd.wx v22, v22, t1
vwadd.wx v24, v24, t1
vwadd.wx v26, v26, t1
vwadd.wx v28, v28, t1
vwadd.wx v30, v30, t1
vnsra.wi v16, v16, 12
vnsra.wi v18, v18, 12
vnsra.wi v20, v20, 12
vnsra.wi v22, v22, 12
vnsra.wi v24, v24, 12
vnsra.wi v26, v26, 12
vnsra.wi v28, v28, 12
vnsra.wi v30, v30, 12
vnclip.wi v16, v16, 12
vnclip.wi v18, v18, 12
vnclip.wi v20, v20, 12
vnclip.wi v22, v22, 12
vnclip.wi v24, v24, 12
vnclip.wi v26, v26, 12
vnclip.wi v28, v28, 12
vnclip.wi v30, v30, 12
.ifc \o0, v0
vsadd.vv \o14, v9, v11
@ -1365,23 +1257,14 @@ endfunc
vwmacc.vx v16, t2, v9
vwmacc.vx v18, t3, v9
vwadd.wx v16, v16, t1
vwadd.wx v18, v18, t1
vwadd.wx v20, v20, t1
vwadd.wx v22, v22, t1
vwadd.wx v24, v24, t1
vwadd.wx v26, v26, t1
vwadd.wx v28, v28, t1
vwadd.wx v30, v30, t1
vnsra.wi \o7, v16, 12
vnsra.wi \o8, v18, 12
vnsra.wi \o4, v20, 12
vnsra.wi \o11, v22, 12
vnsra.wi \o6, v24, 12
vnsra.wi \o9, v26, 12
vnsra.wi \o5, v28, 12
vnsra.wi \o10, v30, 12
vnclip.wi \o7, v16, 12
vnclip.wi \o8, v18, 12
vnclip.wi \o4, v20, 12
vnclip.wi \o11, v22, 12
vnclip.wi \o6, v24, 12
vnclip.wi \o9, v26, 12
vnclip.wi \o5, v28, 12
vnclip.wi \o10, v30, 12
vmv.v.x v16, zero
vssub.vv \o1, v16, \o1
@ -1552,6 +1435,9 @@ endfunc
.macro def_fn_16x16 txfm1, txfm2, eob_half
function inv_txfm_add_\txfm1\()_\txfm2\()_16x16_8bpc_rvv, export=1, ext=v
.ifc \txfm1\()_\txfm2, dct_dct
beqz a3, 1f
.endif
.ifc \txfm1, identity
la a6, inv_txfm_horz_identity_16x8_rvv
.else
@ -1561,6 +1447,75 @@ function inv_txfm_add_\txfm1\()_\txfm2\()_16x16_8bpc_rvv, export=1, ext=v
la a5, inv_\txfm2\()_e16_x16_rvv
li a7, \eob_half
j inv_txfm_add_16x16_rvv
.ifc \txfm1\()_\txfm2, dct_dct
1:
csrw vxrm, zero
vsetivli zero, 16, e16, m2, ta, ma
lh t2, (a2)
li t3, 2896*8
li t4, 1<<14
li t5, 0xFFFF
li t6, -0x10000
sh x0, (a2)
mul t2, t2, t3
add t2, t2, t4
srai t2, t2, 15
ble t2, t5, 3f
mv t2, t5
3:
ble t6, t2, 4f
mv t2, t6
4:
addi t2, t2, 2
srai t2, t2, 2
mul t2, t2, t3
add t2, t2, t4
srai t2, t2, 15
ble t2, t5, 5f
mv t2, t5
5:
ble t6, t2, 6f
mv t2, t6
6:
addi t2, t2, 8
srai t2, t2, 4
vmv.v.x v24, t2
vsetvli zero, zero, e8, m1, ta, ma
add t2, a1, a1
li t3, 16
2:
add t0, a0, a1
vle8.v v16, (a0)
vle8.v v17, (t0)
vwaddu.wv v0, v24, v16
vwaddu.wv v2, v24, v17
addi t3, t3, -2 # loop counter
vsetvli zero, zero, e16, m2, ta, ma
.irp i, 0, 2
vmax.vx v\i, v\i, zero
.endr
vsetvli zero, zero, e8, m1, ta, ma
vnclipu.wi v16, v0, 0
vnclipu.wi v17, v2, 0
add t0, a0, a1
vse8.v v16, (a0)
add a0, a0, t2
vse8.v v17, (t0)
bnez t3, 2b
ret
.endif
endfunc
.endm
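The dct_dct branch above is a DC-only fast path taken when the eob (a3) is zero: the single DC coefficient is scaled twice by 2896/4096 (roughly 1/sqrt(2)) with intermediate rounding and clamping, the coefficient is cleared, and the resulting offset is broadcast and added to the 16x16 destination two rows per iteration with a final clip to [0, 255]. A rough scalar sketch of the offset computation (iclip() is an assumed helper, not taken from the diff):

#include <stdint.h>

static inline int iclip(const int v, const int lo, const int hi) {
    return v < lo ? lo : v > hi ? hi : v;
}

/* Hypothetical scalar sketch of the 16x16 dct_dct DC-only path above. */
static int dc_only_offset_16x16(int dc) {
    dc = (dc * (2896 * 8) + (1 << 14)) >> 15;   /* *= 1/sqrt(2), rounded          */
    dc = iclip(dc, -0x10000, 0xffff);
    dc = (dc + 2) >> 2;
    dc = (dc * (2896 * 8) + (1 << 14)) >> 15;   /* second 1/sqrt(2) pass          */
    dc = iclip(dc, -0x10000, 0xffff);
    return (dc + 8) >> 4;                       /* added to each pixel, then clipped to [0, 255] */
}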

third_party/dav1d/src/riscv/64/mc.S vendored Normal file

@ -0,0 +1,532 @@
/******************************************************************************
* Copyright © 2018, VideoLAN and dav1d authors
* Copyright © 2024, Nathan Egge, Niklas Haas, Bogdan Gligorijevic
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
* ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include "src/riscv/asm.S"
function blend_vl256_8bpc_rvv, export=1, ext=zbb
ctz t0, a3
addi t0, t0, 0xc3
j L(blend_epilog)
endfunc
function blend_8bpc_rvv, export=1, ext="v,zbb"
ctz t0, a3
addi t0, t0, 0xc4
L(blend_epilog):
csrw vxrm, zero
andi t0, t0, 0xc7
vsetvl zero, a3, t0
li t1, 64
1:
addi a4, a4, -2
vle8.v v4, (a2)
add a2, a2, a3
vle8.v v6, (a2)
add a2, a2, a3
vle8.v v8, (a5)
add a5, a5, a3
vle8.v v10, (a5)
add a5, a5, a3
vle8.v v0, (a0)
add t0, a0, a1
vle8.v v2, (t0)
vwmulu.vv v16, v4, v8
vwmulu.vv v20, v6, v10
vrsub.vx v8, v8, t1
vrsub.vx v10, v10, t1
vwmaccu.vv v16, v0, v8
vwmaccu.vv v20, v2, v10
vnclipu.wi v0, v16, 6
vnclipu.wi v2, v20, 6
vse8.v v0, (a0)
vse8.v v2, (t0)
add a0, t0, a1
bnez a4, 1b
ret
endfunc
function blend_h_vl256_8bpc_rvv, export=1, ext=zbb
srai t0, a3, 2
li t2, 64
ctz t0, t0
addi t0, t0, 0xc5
j L(blend_h_epilog)
endfunc
function blend_h_8bpc_rvv, export=1, ext="v,zbb"
li t2, 64
bgt a3, t2, 128f
ctz t0, a3
addi t0, t0, 0xc4
L(blend_h_epilog):
csrw vxrm, zero
andi t0, t0, 0xc7
vsetvl zero, a3, t0
la t1, dav1d_obmc_masks
srai t0, a4, 2
add t1, t1, a4
sub a4, a4, t0
0:
mv t5, ra
1:
addi a4, a4, -2
lbu t3, (t1)
addi t1, t1, 1
lbu t4, (t1)
addi t1, t1, 1
vle8.v v8, (a2)
add a2, a2, a3
vle8.v v12, (a2)
add a2, a2, a3
vle8.v v0, (a0)
add t0, a0, a1
vle8.v v4, (t0)
vwmulu.vx v16, v8, t3
vwmulu.vx v24, v12, t4
sub t3, t2, t3
sub t4, t2, t4
vwmaccu.vx v16, t3, v0
vwmaccu.vx v24, t4, v4
vnclipu.wi v0, v16, 6
vnclipu.wi v4, v24, 6
vse8.v v0, (a0)
vse8.v v4, (t0)
add a0, t0, a1
bgtz a4, 1b
jr t5
128:
csrw vxrm, zero
vsetvli zero, t2, e8, m4, ta, ma
la t1, dav1d_obmc_masks
srai t0, a4, 2
add t1, t1, a4
sub a4, a4, t0
mv a5, a0
mv a6, a2
mv a7, a4
jal t5, 1b
add t1, t1, a4
add a0, a5, t2
add a2, a6, t2
mv a4, a7
sub t1, t1, a4
j 0b
endfunc
function blend_v_vl256_8bpc_rvv, export=1, ext=zbb
srai t0, a3, 2
ctz t0, t0
addi t0, t0, 0xc5
j L(blend_v_epilog)
endfunc
function blend_v_8bpc_rvv, export=1, ext="v,zbb"
ctz t0, a3
addi t0, t0, 0xc4
L(blend_v_epilog):
andi t0, t0, 0xc7
vsetvl zero, a3, t0
csrw vxrm, zero
la t1, dav1d_obmc_masks
add t1, t1, a3
vle8.v v8, (t1)
li t0, 64
vrsub.vx v10, v8, t0
1:
addi a4, a4, -2
vle8.v v4, (a2)
add a2, a2, a3
vle8.v v6, (a2)
add a2, a2, a3
vle8.v v0, (a0)
add t0, a0, a1
vle8.v v2, (t0)
vwmulu.vv v12, v4, v8
vwmulu.vv v16, v6, v8
vwmaccu.vv v12, v0, v10
vwmaccu.vv v16, v2, v10
vnclipu.wi v0, v12, 6
vnclipu.wi v2, v16, 6
vse8.v v0, (a0)
vse8.v v2, (t0)
add a0, t0, a1
bnez a4, 1b
ret
endfunc
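blend, blend_h and blend_v share the same per-pixel arithmetic and differ only in where the 0..64 weight comes from: a per-pixel mask buffer for blend, and per-row or per-column entries of dav1d_obmc_masks for blend_h and blend_v. The inner operation is roughly the following, with the +32 rounding supplied by the vnclipu narrowing under vxrm set to round-to-nearest-up (a sketch, not the dav1d reference code):

#include <stdint.h>

/* Illustrative per-pixel blend, m in 0..64: */
static inline uint8_t blend_px(const uint8_t dst, const uint8_t tmp, const int m) {
    return (uint8_t)((tmp * m + dst * (64 - m) + 32) >> 6);
}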
.macro avg va, vb, vm
vadd.vv \va, \va, \vb
.endm
.macro w_avg va, vb, vm
vwmul.vx v24, \va, a6
vwmacc.vx v24, a7, \vb
vnclip.wi \va, v24, 8
.endm
.macro mask va, vb, vm
vwmul.vv v24, \va, \vm
vrsub.vx \vm, \vm, a7
vwmacc.vv v24, \vb, \vm
vnclip.wi \va, v24, 10
.endm
.macro bidir_fn type, shift
function \type\()_8bpc_rvv, export=1, ext="v,zba,zbb"
.ifc \type, w_avg
li a7, 16
sub a7, a7, a6
.endif
.ifc \type, mask
li a7, 64
.endif
li t0, 4
csrw vxrm, zero
beq t0, a4, 4f
csrr t0, vlenb
ctz t1, a4
ctz t0, t0
li t2, 1
sub t0, t1, t0
li t4, -3
bgt t0, t2, 2f
max t0, t0, t4
andi t1, t0, 0x7
addi t0, t1, 1 # may overflow into E16 bit
ori t0, t0, MA | TA | E16
ori t1, t1, MA | TA | E8
1:
addi a5, a5, -4
.rept 2
vsetvl zero, a4, t0
sh1add t3, a4, a2
vle16.v v0, (a2)
sh1add a2, a4, t3
vle16.v v4, (t3)
sh1add t3, a4, a3
vle16.v v8, (a3)
sh1add a3, a4, t3
vle16.v v12, (t3)
.ifc \type, mask
add t3, a4, a6
vle8.v v24, (a6)
add a6, a4, t3
vle8.v v26, (t3)
vzext.vf2 v16, v24
vzext.vf2 v20, v26
.endif
\type v0, v8, v16
\type v4, v12, v20
vmax.vx v8, v0, zero
vmax.vx v12, v4, zero
vsetvl zero, zero, t1
vnclipu.wi v0, v8, \shift
vnclipu.wi v2, v12, \shift
add t3, a1, a0
vse8.v v0, (a0)
add a0, a1, t3
vse8.v v2, (t3)
.endr
bnez a5, 1b
ret
2:
mv t0, a0
neg t4, a4
add a0, a1, a0
addi a5, a5, -1
20:
vsetvli t2, a4, e16, m4, ta, ma
sh1add t4, t2, t4
sh1add t3, t2, a2
vle16.v v0, (a2)
sh1add a2, t2, t3
vle16.v v4, (t3)
sh1add t3, t2, a3
vle16.v v8, (a3)
sh1add a3, t2, t3
vle16.v v12, (t3)
.ifc \type, mask
add t3, t2, a6
vle8.v v24, (a6)
add a6, t2, t3
vle8.v v26, (t3)
vzext.vf2 v16, v24
vzext.vf2 v20, v26
.endif
\type v0, v8, v16
\type v4, v12, v20
vmax.vx v8, v0, zero
vmax.vx v12, v4, zero
vsetvli zero, zero, e8, m2, ta, ma
vnclipu.wi v0, v8, \shift
vnclipu.wi v2, v12, \shift
add t3, t2, t0
vse8.v v0, (t0)
add t0, t2, t3
vse8.v v2, (t3)
bnez t4, 20b
bnez a5, 2b
ret
4:
slli t0, a5, 2
vsetvli t1, t0, e16, m4, ta, ma
vle16.v v0, (a2)
sh1add a2, t1, a2
vle16.v v4, (a3)
sh1add a3, t1, a3
.ifc \type, mask
vle8.v v16, (a6)
add a6, t1, a6
vzext.vf2 v8, v16
.endif
\type v0, v4, v8
vmax.vx v8, v0, zero
vsetvli zero, zero, e8, m2, ta, ma
vnclipu.wi v0, v8, \shift
vsetvli t1, a5, e32, m2, ta, ma
vsse32.v v0, (a0), a1
ctz t0, t1
sub a5, a5, t1
sll t0, a1, t0
add a0, t0, a0
bnez a5, 4b
ret
endfunc
.endm
bidir_fn avg, 5
bidir_fn w_avg, 0
bidir_fn mask, 0
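The avg, w_avg and mask macros above operate on int16 intermediate (prep) samples and differ only in the weighting and the total downshift; the shared vmax/vnclipu tail clamps the result to an 8-bit pixel. Per pixel they compute roughly the following, assuming 8 bpc intermediates (clip_u8() is an assumed helper, not the dav1d API):

#include <stdint.h>

static inline uint8_t clip_u8(const int v) { return v < 0 ? 0 : v > 255 ? 255 : (uint8_t)v; }

/* Rough scalar equivalents of the three bidir paths above (8 bpc): */
static inline uint8_t avg_px(const int a, const int b) {
    return clip_u8((a + b + 16) >> 5);
}
static inline uint8_t w_avg_px(const int a, const int b, const int w) {  /* w in 0..16 */
    return clip_u8((a * w + b * (16 - w) + 128) >> 8);
}
static inline uint8_t mask_px(const int a, const int b, const int m) {   /* m in 0..64 */
    return clip_u8((a * m + b * (64 - m) + 512) >> 10);
}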
function warp_8x8_8bpc_rvv, export=1, ext="v"
csrw vxrm, zero
vsetivli zero, 8, e16, m1, ta, ma
addi sp, sp, -2*15*8
mv t5, sp
li t0, 3
mul t0, a3, t0
sub a2, a2, t0
addi a2, a2, -3
li t0, 64
addi a3, a3, -8
li t1, 15
la t2, dav1d_mc_warp_filter
lh t6, (a4)
lh t4, 2(a4)
vid.v v30
vwmul.vx v28, v30, t6
1:
addi t1, t1, -1
vsetvli zero, zero, e32, m2, ta, ma
vadd.vx v4, v28, a5
add a5, a5, t4
vssra.vi v2, v4, 10
vadd.vx v2, v2, t0
vsll.vi v24, v2, 3
vsetvli zero, zero, e8, mf2, ta, ma
vluxseg8ei32.v v2, (t2), v24
vsetvli zero, zero, e16, m1, ta, ma
.irp i, 2, 3, 4, 5, 6, 7, 8, 9
vle8.v v10, (a2)
addi a2, a2, 1
vsext.vf2 v14, v\i
vzext.vf2 v16, v10
.if \i == 2
vwmulsu.vv v12, v14, v16
.else
vwmaccsu.vv v12, v14, v16
.endif
.endr
vnclip.wi v10, v12, 3
add a2, a2, a3
vse16.v v10, (t5)
addi t5, t5, 16
bnez t1, 1b
mv t5, sp
li t1, 8
lh t6, 4(a4)
lh t4, 6(a4)
vwmul.vx v28, v30, t6
2:
addi t1, t1, -1
vsetvli zero, zero, e32, m2, ta, ma
vadd.vx v4, v28, a6
add a6, a6, t4
vssra.vi v2, v4, 10
vadd.vx v2, v2, t0
vsll.vi v24, v2, 3
vsetvli zero, zero, e8, mf2, ta, ma
vluxseg8ei32.v v2, (t2), v24
vsetvli zero, zero, e16, m1, ta, ma
.irp i, 2, 3, 4, 5, 6, 7, 8, 9
vle16.v v10, (t5)
addi t5, t5, 16
vsext.vf2 v14, v\i
.if \i == 2
vwmul.vv v12, v14, v10
.else
vwmacc.vv v12, v14, v10
.endif
.endr
addi t5, t5, -16*7
vnclip.wi v10, v12, 11
vmax.vx v10, v10, zero
vsetvli zero, zero, e8, mf2, ta, ma
vnclipu.wi v12, v10, 0
vse8.v v12, (a0)
add a0, a0, a1
bnez t1, 2b
addi sp, sp, 2*15*8
ret
endfunc
function warp_8x8t_8bpc_rvv, export=1, ext="v,zba"
csrw vxrm, zero
vsetivli zero, 8, e16, m1, ta, ma
addi sp, sp, -2*15*8
mv t5, sp
li t0, 3
mul t0, a3, t0
sub a2, a2, t0
addi a2, a2, -3
li t0, 64
addi a3, a3, -8
li t1, 15
la t2, dav1d_mc_warp_filter
lh t6, (a4)
lh t4, 2(a4)
vid.v v30
vwmul.vx v28, v30, t6
1:
addi t1, t1, -1
vsetvli zero, zero, e32, m2, ta, ma
vadd.vx v4, v28, a5
add a5, a5, t4
vssra.vi v2, v4, 10
vadd.vx v2, v2, t0
vsll.vi v24, v2, 3
vsetvli zero, zero, e8, mf2, ta, ma
vluxseg8ei32.v v2, (t2), v24
vsetvli zero, zero, e16, m1, ta, ma
.irp i, 2, 3, 4, 5, 6, 7, 8, 9
vle8.v v10, (a2)
addi a2, a2, 1
vsext.vf2 v14, v\i
vzext.vf2 v16, v10
.if \i == 2
vwmulsu.vv v12, v14, v16
.else
vwmaccsu.vv v12, v14, v16
.endif
.endr
vnclip.wi v10, v12, 3
add a2, a2, a3
vse16.v v10, (t5)
addi t5, t5, 16
bnez t1, 1b
mv t5, sp
li t1, 8
lh t6, 4(a4)
lh t4, 6(a4)
vwmul.vx v28, v30, t6
2:
addi t1, t1, -1
vsetvli zero, zero, e32, m2, ta, ma
vadd.vx v4, v28, a6
add a6, a6, t4
vssra.vi v2, v4, 10
vadd.vx v2, v2, t0
vsll.vi v24, v2, 3
vsetvli zero, zero, e8, mf2, ta, ma
vluxseg8ei32.v v2, (t2), v24
vsetvli zero, zero, e16, m1, ta, ma
.irp i, 2, 3, 4, 5, 6, 7, 8, 9
vle16.v v10, (t5)
addi t5, t5, 16
vsext.vf2 v14, v\i
.if \i == 2
vwmul.vv v12, v14, v10
.else
vwmacc.vv v12, v14, v10
.endif
.endr
addi t5, t5, -16*7
vnclip.wi v10, v12, 7
vse16.v v10, (a0)
sh1add a0, a1, a0
bnez t1, 2b
addi sp, sp, 2*15*8
ret
endfunc
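warp_8x8 and warp_8x8t follow the usual two-pass layout: the horizontal pass filters 15 input rows into a 15x8 int16 scratch buffer on the stack (narrowing shift 3), then the vertical pass filters 8 output rows from it (shift 11 into pixels for warp_8x8, shift 7 into int16 for warp_8x8t). The vid/vwmul/vssra sequence selects the per-column 8-tap filter from dav1d_mc_warp_filter; an illustrative scalar form of that selection, with names and types as assumptions rather than the dav1d prototypes (the vertical pass does the same with my and abcd[2..3]):

#include <stdint.h>

static const int8_t *warp_h_filter(const int8_t (*const filters)[8],
                                   const int mx, const int x, const int16_t *const abcd)
{
    const int tmx = mx + x * abcd[0];            /* vid.v + vwmul.vx + vadd.vx    */
    return filters[64 + ((tmx + 512) >> 10)];    /* vssra.vi ..., 10 then + 64    */
}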

third_party/dav1d/src/riscv/64/pal.S vendored Normal file

@ -0,0 +1,95 @@
/******************************************************************************
* Copyright © 2018, VideoLAN and dav1d authors
* Copyright © 2024, Bogdan Gligorijevic
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
* ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include "src/riscv/asm.S"
function pal_idx_finish_rvv, export=1, ext="v,zba,zbb"
csrw vxrm, zero
srl t0, a2, 1
sub a2, a2, a4
srl t1, a4, 1
mv t2, a5
csrr t6, vlenb
li t4, -3
ctz a6, t0
ctz t6, t6
li a7, 16
sub a6, a6, t6
li t6, 1<<4+1
// a6 is never > 3 for VLEN >=128
// that would've required stripmining with a6 set to 3
max a6, a6, t4
li t5, 2
andi a6, a6, 7
addi t4, a1, 1
ori a6, a6, 0xc0
1:
sub t3, t0, t1
vsetvl zero, t1, a6
vlse8.v v0, (a1), t5
sh1add a1, t1, a1
vlse8.v v8, (t4), t5
sh1add t4, t1, t4
vmacc.vx v0, a7, v8
vse8.v v0, (a0)
add a0, a0, t1
ble t3, zero, 4f
lbu a4, -1(a1)
mul a4, a4, t6
vsetvl zero, t3, a6
vmv.v.x v0, a4
vse8.v v0, (a0)
add a0, a0, t3
4:
addi t2, t2, -1
add a1, a1, a2
add t4, t4, a2
bnez t2, 1b
sub t1, a3, a5
sub t2, a0, t0
ble t1, zero, 7f
vsetvl zero, t0, a6
vle8.v v0, (t2)
add t2, a0, t0
5:
addi t1, t1, -2
vse8.v v0, (a0)
vse8.v v0, (t2)
sh1add a0, t0, a0
sh1add t2, t0, t2
bnez t1, 5b
7:
ret
endfunc
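pal_idx_finish_rvv packs two 4-bit palette indices per output byte (the two strided vlse8 loads pick up the even and odd source indices and the vmacc by 16 merges them), pads each row to bw/2 bytes with the row's last index duplicated into both nibbles (the multiply by 0x11), and finally replicates the last written row down to bh rows. A scalar sketch under assumed argument names (not the dav1d C prototype):

#include <stdint.h>
#include <string.h>

static void pal_idx_finish_sketch(uint8_t *dst, const uint8_t *src,
                                  const int bw, const int bh,
                                  const int w, const int h)
{
    for (int y = 0; y < h; y++, src += bw, dst += bw / 2) {
        for (int x = 0; x < w / 2; x++)
            dst[x] = src[2 * x + 0] | (src[2 * x + 1] << 4);  /* two indices per byte */
        for (int x = w / 2; x < bw / 2; x++)
            dst[x] = src[w - 1] * 0x11;                       /* pad with last index  */
    }
    for (int y = h; y < bh; y++, dst += bw / 2)
        memcpy(dst, dst - bw / 2, bw / 2);                    /* replicate last row   */
}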

third_party/dav1d/src/riscv/asm.S

@ -51,6 +51,13 @@
#define EXTERN PRIVATE_PREFIX
#endif
.macro arch ext:req, more:vararg
.option arch, +\ext
.ifnb \more
arch \more
.endif
.endm
.macro function name, export=0, ext=
.macro endfunc
#ifdef __ELF__
@ -62,7 +69,7 @@
.text
.option push
.ifnb \ext
.option arch, +\ext
arch \ext
.endif
.if \export
.global EXTERN\name
@ -125,4 +132,18 @@ EXTERN\name:
#define L(x) .L ## x
#define MA (1 << 7)
#define TA (1 << 6)
#define E8 (0 << 3)
#define E16 (1 << 3)
#define E32 (2 << 3)
#define E64 (3 << 3)
#define M1 0
#define M2 1
#define M4 2
#define M8 3
#define MF2 7
#define MF4 6
#define MF8 5
#endif /* DAV1D_SRC_RISCV_ASM_S */
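The new MA/TA and E*/M* defines follow the RISC-V vtype CSR layout (vma in bit 7, vta in bit 6, vsew in bits 5:3, vlmul in bits 2:0), so handwritten functions can pass a prebuilt vtype operand to a bare vsetvl. For example, blend_8bpc_rvv in mc.S derives its vtype from the block width with ctz/addi/andi; a hedged illustration of how these bits are assumed to combine (__builtin_ctz is a GCC/Clang builtin used only for this sketch):

/* Illustrative only: ctz(w) + 0xc4, masked with 0xc7, yields MA|TA|E8 plus an
 * LMUL of mf4, mf2, m1 or m2 for w = 4, 8, 16 or 32 respectively. */
static inline unsigned blend_vtype_sketch(const unsigned w) {
    return (__builtin_ctz(w) + 0xc4) & 0xc7;   /* e.g. w == 16 -> 0xc0 == MA|TA|E8|M1 */
}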

third_party/dav1d/src/riscv/cdef.h vendored Normal file

@ -0,0 +1,31 @@
#include "src/cpu.h"
#include "src/cdef.h"
extern void BF(dav1d_cdef_filter_block_4x4, rvv)(pixel *dst, const ptrdiff_t dst_stride,
const pixel (*left)[2],
const pixel *const top, const pixel *const bottom,
const int pri_strength, const int sec_strength, const int dir,
const int damping, const enum CdefEdgeFlags edges HIGHBD_DECL_SUFFIX);
extern void BF(dav1d_cdef_filter_block_4x8, rvv)(pixel *dst, const ptrdiff_t dst_stride,
const pixel (*left)[2],
const pixel *const top, const pixel *const bottom,
const int pri_strength, const int sec_strength, const int dir,
const int damping, const enum CdefEdgeFlags edges HIGHBD_DECL_SUFFIX);
extern void BF(dav1d_cdef_filter_block_8x8, rvv)(pixel *dst, const ptrdiff_t dst_stride,
const pixel (*left)[2],
const pixel *const top, const pixel *const bottom,
const int pri_strength, const int sec_strength, const int dir,
const int damping, const enum CdefEdgeFlags edges HIGHBD_DECL_SUFFIX);
static ALWAYS_INLINE void cdef_dsp_init_riscv(Dav1dCdefDSPContext *const c) {
const unsigned flags = dav1d_get_cpu_flags();
if (!(flags & DAV1D_RISCV_CPU_FLAG_V)) return;
// c->dir = BF(dav1d_cdef_dir, rvv);
c->fb[0] = BF(dav1d_cdef_filter_block_8x8, rvv);
c->fb[1] = BF(dav1d_cdef_filter_block_4x8, rvv);
c->fb[2] = BF(dav1d_cdef_filter_block_4x4, rvv);
}

third_party/dav1d/src/riscv/cpu.h

@ -34,4 +34,8 @@ enum CpuFlags {
unsigned dav1d_get_cpu_flags_riscv(void);
int dav1d_get_vlenb(void);
#define dav1d_get_vlen() (dav1d_get_vlenb()*8)
#endif /* DAV1D_SRC_RISCV_CPU_H */

third_party/dav1d/src/riscv/ipred.h vendored Normal file

@ -0,0 +1,74 @@
/*
* Copyright © 2018, VideoLAN and dav1d authors
* Copyright © 2024, Bogdan Gligorijevic
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
* ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "src/cpu.h"
#include "src/ipred.h"
decl_cfl_pred_fn(BF(dav1d_ipred_cfl, rvv));
decl_cfl_pred_fn(BF(dav1d_ipred_cfl_128, rvv));
decl_cfl_pred_fn(BF(dav1d_ipred_cfl_top, rvv));
decl_cfl_pred_fn(BF(dav1d_ipred_cfl_left, rvv));
decl_angular_ipred_fn(BF(dav1d_ipred_paeth, rvv));
decl_angular_ipred_fn(BF(dav1d_ipred_smooth, rvv));
decl_angular_ipred_fn(BF(dav1d_ipred_smooth_v, rvv));
decl_angular_ipred_fn(BF(dav1d_ipred_smooth_h, rvv));
decl_pal_pred_fn(BF(dav1d_pal_pred, rvv));
static ALWAYS_INLINE void intra_pred_dsp_init_riscv(Dav1dIntraPredDSPContext *const c) {
const unsigned flags = dav1d_get_cpu_flags();
if (!(flags & DAV1D_RISCV_CPU_FLAG_V)) return;
#if BITDEPTH == 8
c->cfl_pred[DC_PRED ] = dav1d_ipred_cfl_8bpc_rvv;
c->cfl_pred[DC_128_PRED ] = dav1d_ipred_cfl_128_8bpc_rvv;
c->cfl_pred[TOP_DC_PRED ] = dav1d_ipred_cfl_top_8bpc_rvv;
c->cfl_pred[LEFT_DC_PRED] = dav1d_ipred_cfl_left_8bpc_rvv;
c->intra_pred[PAETH_PRED ] = dav1d_ipred_paeth_8bpc_rvv;
c->intra_pred[SMOOTH_PRED ] = dav1d_ipred_smooth_8bpc_rvv;
c->intra_pred[SMOOTH_V_PRED] = dav1d_ipred_smooth_v_8bpc_rvv;
c->intra_pred[SMOOTH_H_PRED] = dav1d_ipred_smooth_h_8bpc_rvv;
c->pal_pred = dav1d_pal_pred_8bpc_rvv;
#elif BITDEPTH == 16
c->cfl_pred[DC_PRED ] = dav1d_ipred_cfl_16bpc_rvv;
c->cfl_pred[DC_128_PRED ] = dav1d_ipred_cfl_128_16bpc_rvv;
c->cfl_pred[TOP_DC_PRED ] = dav1d_ipred_cfl_top_16bpc_rvv;
c->cfl_pred[LEFT_DC_PRED] = dav1d_ipred_cfl_left_16bpc_rvv;
c->intra_pred[PAETH_PRED ] = dav1d_ipred_paeth_16bpc_rvv;
c->intra_pred[SMOOTH_PRED ] = dav1d_ipred_smooth_16bpc_rvv;
c->intra_pred[SMOOTH_V_PRED] = dav1d_ipred_smooth_v_16bpc_rvv;
c->intra_pred[SMOOTH_H_PRED] = dav1d_ipred_smooth_h_16bpc_rvv;
c->pal_pred = dav1d_pal_pred_16bpc_rvv;
#endif
}

third_party/dav1d/src/riscv/mc.h vendored Normal file

@ -0,0 +1,69 @@
/*
* Copyright © 2024, VideoLAN and dav1d authors
* Copyright © 2024, Nathan Egge
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
* ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "src/cpu.h"
#include "src/mc.h"
decl_blend_fn(BF(dav1d_blend, rvv));
decl_blend_dir_fn(BF(dav1d_blend_h, rvv));
decl_blend_dir_fn(BF(dav1d_blend_v, rvv));
decl_blend_fn(BF(dav1d_blend_vl256, rvv));
decl_blend_dir_fn(BF(dav1d_blend_h_vl256, rvv));
decl_blend_dir_fn(BF(dav1d_blend_v_vl256, rvv));
decl_avg_fn(BF(dav1d_avg, rvv));
decl_w_avg_fn(BF(dav1d_w_avg, rvv));
decl_mask_fn(BF(dav1d_mask, rvv));
decl_warp8x8_fn(BF(dav1d_warp_8x8, rvv));
decl_warp8x8t_fn(BF(dav1d_warp_8x8t, rvv));
static ALWAYS_INLINE void mc_dsp_init_riscv(Dav1dMCDSPContext *const c) {
const unsigned flags = dav1d_get_cpu_flags();
if (!(flags & DAV1D_RISCV_CPU_FLAG_V)) return;
#if BITDEPTH == 8
c->blend = BF(dav1d_blend, rvv);
c->blend_h = BF(dav1d_blend_h, rvv);
c->blend_v = BF(dav1d_blend_v, rvv);
if (dav1d_get_vlen() >= 256) {
c->blend = BF(dav1d_blend_vl256, rvv);
c->blend_h = BF(dav1d_blend_h_vl256, rvv);
c->blend_v = BF(dav1d_blend_v_vl256, rvv);
}
c->avg = BF(dav1d_avg, rvv);
c->w_avg = BF(dav1d_w_avg, rvv);
c->mask = BF(dav1d_mask, rvv);
c->warp8x8 = BF(dav1d_warp_8x8, rvv);
c->warp8x8t = BF(dav1d_warp_8x8t, rvv);
#endif
}

third_party/dav1d/src/riscv/pal.h vendored Normal file

@ -0,0 +1,39 @@
/*
* Copyright © 2023, VideoLAN and dav1d authors
* Copyright © 2024, Bogdan Gligorijevic
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
* ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "src/cpu.h"
#include "src/pal.h"
decl_pal_idx_finish_fn(dav1d_pal_idx_finish_rvv);
static ALWAYS_INLINE void pal_dsp_init_riscv(Dav1dPalDSPContext *const c) {
const unsigned flags = dav1d_get_cpu_flags();
if (!(flags & DAV1D_RISCV_CPU_FLAG_V)) return;
c->pal_idx_finish = dav1d_pal_idx_finish_rvv;
}

third_party/dav1d/src/x86/looprestoration16_sse.asm

@ -47,20 +47,19 @@ pb_2_3: times 8 db 2, 3
pb_6_7: times 8 db 6, 7
pw_256: times 8 dw 256
pw_1023: times 8 dw 1023
pw_164_24: times 4 dw 164, 24
pw_455_24: times 4 dw 455, 24
pd_8: times 4 dd 8
pd_4096: times 4 dd 4096
pd_34816: times 4 dd 34816
pd_m262128: times 4 dd -262128
pd_0xffff: times 4 dd 0xffff
pd_0xf00800a4: times 4 dd 0xf00800a4
pd_0xf00801c7: times 4 dd 0xf00801c7
pd_0xfffffff0: times 4 dd 0xfffffff0
pf_256: times 4 dd 256.0
wiener_shifts: dw 4, 4, 2048, 2048, 1, 1, 8192, 8192
wiener_round: dd 1049600, 1048832
cextern sgr_x_by_x
SECTION .text
%macro movif64 2 ; dst, src
@ -1048,56 +1047,51 @@ ALIGN function_align
%endif
ret
%macro GATHERDD 3 ; dst, src, tmp
movd %3d, %2
%if ARCH_X86_64
movd %1, [r13+%3]
pextrw %3d, %2, 2
pinsrw %1, [r13+%3+2], 3
pextrw %3d, %2, 4
pinsrw %1, [r13+%3+2], 5
pextrw %3d, %2, 6
pinsrw %1, [r13+%3+2], 7
%else
movd %1, [base+sgr_x_by_x-0xf03+%3]
pextrw %3, %2, 2
pinsrw %1, [base+sgr_x_by_x-0xf03+%3+2], 3
pextrw %3, %2, 4
pinsrw %1, [base+sgr_x_by_x-0xf03+%3+2], 5
pextrw %3, %2, 6
pinsrw %1, [base+sgr_x_by_x-0xf03+%3+2], 7
%endif
%macro MUL_32X16X2 6 ; dst[1-2], src[1-2], tmp[1-2]
pmulhuw %5, %1, %3
pmulhuw %6, %2, %4
pmullw %1, %3
pmullw %2, %4
pslld %5, 16
pslld %6, 16
paddd %1, %5
paddd %2, %6
%endmacro
%macro GATHER_X_BY_X 5 ; dst, src0, src1, tmp32, tmp32_restore
%if ARCH_X86_64
%define tmp r14
%else
%define tmp %4
%endif
GATHERDD %1, %2, tmp
GATHERDD %2, %3, tmp
movif32 %4, %5
psrld %1, 24
psrld %2, 24
packssdw %1, %2
%endmacro
%macro MAXSD 3-4 0 ; dst, src, restore_tmp
pcmpgtd %3, %1, %2
pand %1, %3
pandn %3, %2
por %1, %3
%if %4 == 1
pxor %3, %3
%endif
%endmacro
%macro MULLD 3 ; dst, src, tmp
pmulhuw %3, %1, %2
pmullw %1, %2
pslld %3, 16
paddd %1, %3
%macro SGR_CALC_X 10 ; BB_dst, BB_src, b, tmp, an[1-2], zero, s, b_mul, pf_256
punpcklwd %4, %3, %7
punpckhwd %3, %7
pmaddwd %4, %4 ; b * b
pmaddwd %3, %3
punpcklwd %1, %2, %7 ; BB
punpckhwd %2, %7
psubd %5, %4 ; a * n - b * b
psubd %6, %3
pcmpgtd %4, %5, %7
pcmpgtd %3, %6, %7
pand %5, %4 ; p
pand %6, %3
MUL_32X16X2 %5, %6, %8, %8, %4, %3 ; p * s
paddw %5, %9
paddw %6, %9
psrld %5, 20 ; z + 1
psrld %6, 20
cvtdq2ps %5, %5
cvtdq2ps %6, %6
pmaddwd %1, %9 ; BB * 164
pmaddwd %2, %9
rcpps %3, %5 ; 1 / (z + 1)
rcpps %4, %6
cmpltps %5, %10
cmpltps %6, %10
mulps %3, %10 ; 256 / (z + 1)
mulps %4, %10
packssdw %5, %6
cvtps2dq %3, %3
cvtps2dq %4, %4
psrlw %5, 8 ; z < 255 ? 255 : 0
packssdw %3, %4
pminsw %3, %5 ; x
%endmacro
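SGR_CALC_X (used here and in the 8bpc file further down) replaces the GATHERDD/GATHER_X_BY_X table lookups from sgr_x_by_x with a direct float-reciprocal computation, which is why the r13 table base and the r14 gather temporary disappear and the cglobal register counts drop; the b * 164 (or b * 455 for the 3x3 filter) product is folded into the same packed pw_164_24/pw_455_24 constant. Per 32-bit lane it computes roughly the following (rcpps is only an approximate reciprocal, so the exact division in this sketch is an idealization):

/* Illustrative per-lane outline (names are assumptions, not dav1d code): */
static int sgr_calc_x_sketch(const unsigned a_n, const int b, const unsigned s)
{
    const int d = (int)a_n - b * b;
    const unsigned p = d > 0 ? (unsigned)d : 0;     /* psubd + pcmpgtd/pand      */
    const unsigned z = (p * s + (1 << 19)) >> 20;   /* paddw pw_*_24 + psrld 20  */
    int x = (int)(256.0f / (z + 1) + 0.5f);         /* rcpps + mulps + cvtps2dq  */
    const int cap = z < 255 ? 255 : 0;              /* cmpltps + psrlw mask      */
    return x < cap ? x : cap;                       /* pminsw                    */
}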
%if ARCH_X86_32
@ -1134,12 +1128,13 @@ cglobal sgr_filter_5x5_16bpc, 1, 7, 8, -400*24-16-extra_stack, \
%define m8 [base+pd_8]
%define m9 [base+pd_0xfffffff0]
%define m10 [esp+calloff+16*2]
%define m11 [base+pd_0xf00800a4]
%define m11 [base+pw_164_24]
%define m12 [base+sgr_lshuf5]
%define m13 [base+pd_34816]
%define m14 [base+pw_1023]
%define m15 [base+pf_256]
%define r10 r4
%define base r6-$$
%define base r6-pw_455_24
%assign calloff 0
%if STACK_ALIGNMENT < 16
mov strideq, [rstk+stack_offset+ 8]
@ -1156,7 +1151,7 @@ cglobal sgr_filter_5x5_16bpc, 1, 7, 8, -400*24-16-extra_stack, \
mov edged, r2
%endif
%else
cglobal sgr_filter_5x5_16bpc, 4, 15, 15, -400*24-16, dst, stride, left, lpf, \
cglobal sgr_filter_5x5_16bpc, 4, 13, 16, -400*24-16, dst, stride, left, lpf, \
w, h, edge, params
%endif
%if ARCH_X86_64 || STACK_ALIGNMENT >= 16
@ -1164,7 +1159,6 @@ cglobal sgr_filter_5x5_16bpc, 4, 15, 15, -400*24-16, dst, stride, left, lpf, \
%endif
%if ARCH_X86_64
mov paramsq, r6mp
lea r13, [sgr_x_by_x-0xf03]
movifnidn hd, hm
add wd, wd
mov edged, r7m
@ -1176,7 +1170,7 @@ cglobal sgr_filter_5x5_16bpc, 4, 15, 15, -400*24-16, dst, stride, left, lpf, \
mova m9, [pd_0xfffffff0]
add dstq, wq
lea t3, [rsp+wq*2+400*12+16]
mova m11, [pd_0xf00800a4]
mova m11, [pw_164_24]
lea t4, [rsp+wq+400*20+16]
pshufhw m7, m10, q0000
pshufb m10, [pw_256] ; s0
@ -1186,11 +1180,12 @@ cglobal sgr_filter_5x5_16bpc, 4, 15, 15, -400*24-16, dst, stride, left, lpf, \
pxor m6, m6
mova m14, [pw_1023]
psllw m7, 4
movaps m15, [pf_256]
DEFINE_ARGS dst, stride, left, lpf, _, h, edge, _, _, _, w
%define lpfm [rsp]
%else
mov r1, [rstk+stack_offset+28] ; params
LEA r6, $$
LEA r6, pw_455_24
add wd, wd
movu m1, [r1]
add lpfm, wq
@ -1218,7 +1213,6 @@ cglobal sgr_filter_5x5_16bpc, 4, 15, 15, -400*24-16, dst, stride, left, lpf, \
jz .no_top
call .h_top
add lpfq, stridemp
movif32 t2m, t1
mov t2, t1
call .top_fixup
add t1, 400*6
@ -1298,7 +1292,6 @@ cglobal sgr_filter_5x5_16bpc, 4, 15, 15, -400*24-16, dst, stride, left, lpf, \
mov lpfm, r10
call .h
lea t2, [t1+400*6]
movif32 t2m, t2
call .top_fixup
dec hd
jz .no_top_height1
@ -1526,33 +1519,14 @@ ALIGN function_align
psrld m5, 1
paddd m4, m2 ; a * 25
paddd m5, m0
punpcklwd m2, m3, m6
punpckhwd m3, m6
pmaddwd m2, m2 ; b * b
pmaddwd m3, m3
punpcklwd m0, m1, m6 ; b
punpckhwd m1, m6
MAXSD m4, m2, m6
MAXSD m5, m3, m6, 1
psubd m4, m2 ; p
psubd m5, m3
MULLD m4, m10, m2 ; p * s
MULLD m5, m10, m2
pmaddwd m0, m11 ; b * 164
pmaddwd m1, m11
paddusw m4, m11
paddusw m5, m11
psrld m4, 20 ; min(z, 255)
movif32 t3, t3m
psrld m5, 20
GATHER_X_BY_X m3, m4, m5, t2, t2m
punpcklwd m4, m3, m3
punpckhwd m5, m3, m3
MULLD m0, m4, m2
MULLD m1, m5, m2
SGR_CALC_X m0, m1, m3, m2, m4, m5, m6, m10, m11, m15
punpcklwd m2, m3, m3
mova [t4+wq+4], m3
punpckhwd m3, m3
MUL_32X16X2 m0, m1, m2, m3, m4, m5
paddd m0, m13 ; x * b * 164 + (1 << 11) + (1 << 15)
paddd m1, m13
mova [t4+wq+4], m3
psrld m0, 12 ; b
psrld m1, 12
mova [t3+wq*2+ 8], m0
@ -1562,7 +1536,6 @@ ALIGN function_align
mov t2, t1
mov t1, t0
mov t0, t2
movif32 t2m, t2
movif32 t0m, t0
ret
.hv_last_row: ; esoteric edge case for odd heights
@ -1606,32 +1579,13 @@ ALIGN function_align
psrld m5, 1
paddd m4, m2 ; a * 25
paddd m5, m0
punpcklwd m2, m3, m6
punpckhwd m3, m6
pmaddwd m2, m2 ; b * b
pmaddwd m3, m3
punpcklwd m0, m1, m6 ; b
punpckhwd m1, m6
MAXSD m4, m2, m6
MAXSD m5, m3, m6, 1
psubd m4, m2 ; p
psubd m5, m3
MULLD m4, m10, m2 ; p * s
MULLD m5, m10, m2
pmaddwd m0, m11 ; b * 164
pmaddwd m1, m11
paddusw m4, m11
paddusw m5, m11
psrld m4, 20 ; min(z, 255)
psrld m5, 20
GATHER_X_BY_X m3, m4, m5, t2, t2m
punpcklwd m4, m3, m3
punpckhwd m5, m3, m3
MULLD m0, m4, m2
MULLD m1, m5, m2
SGR_CALC_X m0, m1, m3, m2, m4, m5, m6, m10, m11, m15
punpcklwd m2, m3, m3
mova [t4+wq+4], m3
punpckhwd m3, m3
MUL_32X16X2 m0, m1, m2, m3, m4, m5
paddd m0, m13 ; x * b * 164 + (1 << 11) + (1 << 15)
paddd m1, m13
mova [t4+wq+4], m3
psrld m0, 12 ; b
psrld m1, 12
mova [t3+wq*2+ 8], m0
@ -1784,12 +1738,12 @@ cglobal sgr_filter_3x3_16bpc, 1, 7, 8, -400*42-16-extra_stack, \
%define t4m dword [esp+calloff+4*3]
%define m8 [base+pd_8]
%define m9 [esp+calloff+16*1]
%define m10 [base+pd_0xf00801c7]
%define m10 [base+pw_455_24]
%define m11 [base+pd_34816]
%define m12 [base+sgr_lshuf3]
%define m13 [base+pw_1023]
%define m14 m6
%define base r6-$$
%define m14 [base+pf_256]
%define base r6-pw_455_24
%assign calloff 0
%if STACK_ALIGNMENT < 16
mov strideq, [rstk+stack_offset+ 8]
@ -1806,7 +1760,7 @@ cglobal sgr_filter_3x3_16bpc, 1, 7, 8, -400*42-16-extra_stack, \
mov edged, r2
%endif
%else
cglobal sgr_filter_3x3_16bpc, 4, 15, 15, -400*42-8, dst, stride, left, lpf, \
cglobal sgr_filter_3x3_16bpc, 4, 13, 15, -400*42-8, dst, stride, left, lpf, \
w, h, edge, params
%endif
%if ARCH_X86_64 || STACK_ALIGNMENT >= 16
@ -1814,7 +1768,6 @@ cglobal sgr_filter_3x3_16bpc, 4, 15, 15, -400*42-8, dst, stride, left, lpf, \
%endif
%if ARCH_X86_64
mov paramsq, r6mp
lea r13, [sgr_x_by_x-0xf03]
movifnidn hd, hm
add wd, wd
mov edged, r7m
@ -1824,7 +1777,7 @@ cglobal sgr_filter_3x3_16bpc, 4, 15, 15, -400*42-8, dst, stride, left, lpf, \
mova m8, [pd_8]
add dstq, wq
lea t3, [rsp+wq*2+400*12+8]
mova m10, [pd_0xf00801c7]
mova m10, [pw_455_24]
lea t4, [rsp+wq+400*32+8]
mova m11, [pd_34816]
pshuflw m7, m9, q3333
@ -1835,11 +1788,12 @@ cglobal sgr_filter_3x3_16bpc, 4, 15, 15, -400*42-8, dst, stride, left, lpf, \
mova m13, [pw_1023]
psllw m7, 4
mova m12, [sgr_lshuf3]
movaps m14, [pf_256]
DEFINE_ARGS dst, stride, left, lpf, _, h, edge, _, _, _, w
%define lpfm [rsp]
%else
mov r1, [rstk+stack_offset+28] ; params
LEA r6, $$
LEA r6, pw_455_24
add wd, wd
movq m1, [r1+4]
add lpfm, wq
@ -2113,36 +2067,14 @@ ALIGN function_align
paddd m5, m3
psrlw m3, m1, 1
pavgw m3, m6 ; (b + 2) >> 2
punpcklwd m2, m3, m6
pmaddwd m2, m2
punpckhwd m3, m6
pmaddwd m3, m3
punpcklwd m0, m1, m6 ; b
punpckhwd m1, m6
MAXSD m4, m2, m14
MAXSD m5, m3, m14
psubd m4, m2 ; p
psubd m5, m3
MULLD m4, m9, m14 ; p * s
MULLD m5, m9, m14
pmaddwd m0, m10 ; b * 455
pmaddwd m1, m10
paddusw m4, m10
paddusw m5, m10
psrld m4, 20 ; min(z, 255)
movif32 t3, t3m
psrld m5, 20
GATHER_X_BY_X m3, m4, m5, r0, dstm
punpcklwd m4, m3, m3
punpckhwd m5, m3, m3
MULLD m0, m4, m14
MULLD m1, m5, m14
%if ARCH_X86_32
pxor m6, m6
%endif
SGR_CALC_X m0, m1, m3, m2, m4, m5, m6, m9, m10, m14
punpcklwd m2, m3, m3
mova [t4+wq+4], m3
punpckhwd m3, m3
MUL_32X16X2 m0, m1, m2, m3, m4, m5
paddd m0, m11 ; x * b * 455 + (1 << 11) + (1 << 15)
paddd m1, m11
mova [t4+wq+4], m3
psrld m0, 12
psrld m1, 12
mova [t3+wq*2+ 8], m0
@ -2225,36 +2157,14 @@ ALIGN function_align
paddd m5, m3
psrlw m3, m1, 1
pavgw m3, m6 ; (b + 2) >> 2
punpcklwd m2, m3, m6
pmaddwd m2, m2
punpckhwd m3, m6
pmaddwd m3, m3
punpcklwd m0, m1, m6 ; b
punpckhwd m1, m6
MAXSD m4, m2, m14
MAXSD m5, m3, m14
psubd m4, m2 ; p
psubd m5, m3
MULLD m4, m9, m14 ; p * s
MULLD m5, m9, m14
pmaddwd m0, m10 ; b * 455
pmaddwd m1, m10
paddusw m4, m10
paddusw m5, m10
psrld m4, 20 ; min(z, 255)
movif32 t3, t3m
psrld m5, 20
GATHER_X_BY_X m3, m4, m5, r0, dstm
punpcklwd m4, m3, m3
punpckhwd m5, m3, m3
MULLD m0, m4, m14
MULLD m1, m5, m14
%if ARCH_X86_32
pxor m6, m6
%endif
SGR_CALC_X m0, m1, m3, m2, m4, m5, m6, m9, m10, m14
punpcklwd m2, m3, m3
mova [t4+wq*1+400*2 +4], m3
punpckhwd m3, m3
MUL_32X16X2 m0, m1, m2, m3, m4, m5
paddd m0, m11 ; x * b * 455 + (1 << 11) + (1 << 15)
paddd m1, m11
mova [t4+wq*1+400*2 +4], m3
psrld m0, 12
psrld m1, 12
mova [t3+wq*2+400*4+ 8], m0
@ -2294,35 +2204,13 @@ ALIGN function_align
paddd m5, m3
psrlw m3, m1, 1
pavgw m3, m6 ; (b + 2) >> 2
punpcklwd m2, m3, m6
pmaddwd m2, m2
punpckhwd m3, m6
pmaddwd m3, m3
punpcklwd m0, m1, m6 ; b
punpckhwd m1, m6
MAXSD m4, m2, m14
MAXSD m5, m3, m14
psubd m4, m2 ; p
psubd m5, m3
MULLD m4, m9, m14 ; p * s
MULLD m5, m9, m14
pmaddwd m0, m10 ; b * 455
pmaddwd m1, m10
paddusw m4, m10
paddusw m5, m10
psrld m4, 20 ; min(z, 255)
psrld m5, 20
GATHER_X_BY_X m3, m4, m5, r0, dstm
punpcklwd m4, m3, m3
punpckhwd m5, m3, m3
MULLD m0, m4, m14
MULLD m1, m5, m14
%if ARCH_X86_32
pxor m6, m6
%endif
SGR_CALC_X m0, m1, m3, m2, m4, m5, m6, m9, m10, m14
punpcklwd m2, m3, m3
mova [t4+wq*1+400*0+ 4], m3
punpckhwd m3, m3
MUL_32X16X2 m0, m1, m2, m3, m4, m5
paddd m0, m11 ; x * b * 455 + (1 << 11) + (1 << 15)
paddd m1, m11
mova [t4+wq*1+400*0+ 4], m3
psrld m0, 12
psrld m1, 12
mova [t3+wq*2+400*0+ 8], m0
@ -2356,35 +2244,13 @@ ALIGN function_align
paddd m5, m3
psrlw m3, m1, 1
pavgw m3, m6 ; (b + 2) >> 2
punpcklwd m2, m3, m6
pmaddwd m2, m2
punpckhwd m3, m6
pmaddwd m3, m3
punpcklwd m0, m1, m6 ; b
punpckhwd m1, m6
MAXSD m4, m2, m14
MAXSD m5, m3, m14
psubd m4, m2 ; p
psubd m5, m3
MULLD m4, m9, m14 ; p * s
MULLD m5, m9, m14
pmaddwd m0, m10 ; b * 455
pmaddwd m1, m10
paddusw m4, m10
paddusw m5, m10
psrld m4, 20 ; min(z, 255)
psrld m5, 20
GATHER_X_BY_X m3, m4, m5, r0, dstm
punpcklwd m4, m3, m3
punpckhwd m5, m3, m3
MULLD m0, m4, m14
MULLD m1, m5, m14
%if ARCH_X86_32
pxor m6, m6
%endif
SGR_CALC_X m0, m1, m3, m2, m4, m5, m6, m9, m10, m14
punpcklwd m2, m3, m3
mova [t4+wq*1+400*2+ 4], m3
punpckhwd m3, m3
MUL_32X16X2 m0, m1, m2, m3, m4, m5
paddd m0, m11 ; x * b * 455 + (1 << 11) + (1 << 15)
paddd m1, m11
mova [t4+wq*1+400*2+ 4], m3
psrld m0, 12
psrld m1, 12
mova [t3+wq*2+400*4+ 8], m0
@ -2592,13 +2458,13 @@ cglobal sgr_filter_mix_16bpc, 1, 7, 8, -400*66-48-extra_stack, \
%xdefine m8 m6
%define m9 [base+pd_8]
%define m10 [base+pd_34816]
%define m11 [base+pd_0xf00801c7]
%define m12 [base+pd_0xf00800a4]
%define m11 [base+pw_455_24]
%define m12 [base+pw_164_24]
%define m13 [esp+calloff+16*4]
%define m14 [esp+calloff+16*5]
%define m15 [esp+calloff+16*6]
%define m6 [esp+calloff+16*7]
%define base r6-$$
%define base r6-pw_455_24
%assign calloff 0
%if STACK_ALIGNMENT < 16
mov strideq, [rstk+stack_offset+ 8]
@ -2615,7 +2481,7 @@ cglobal sgr_filter_mix_16bpc, 1, 7, 8, -400*66-48-extra_stack, \
mov edged, r2
%endif
%else
cglobal sgr_filter_mix_16bpc, 4, 15, 16, -400*66-40, dst, stride, left, lpf, \
cglobal sgr_filter_mix_16bpc, 4, 13, 16, -400*66-40, dst, stride, left, lpf, \
w, h, edge, params
%endif
%if ARCH_X86_64 || STACK_ALIGNMENT >= 16
@ -2623,7 +2489,6 @@ cglobal sgr_filter_mix_16bpc, 4, 15, 16, -400*66-40, dst, stride, left, lpf, \
%endif
%if ARCH_X86_64
mov paramsq, r6mp
lea r13, [sgr_x_by_x-0xf03]
movifnidn hd, hm
add wd, wd
mov edged, r7m
@ -2633,9 +2498,9 @@ cglobal sgr_filter_mix_16bpc, 4, 15, 16, -400*66-40, dst, stride, left, lpf, \
lea t1, [rsp+wq+44]
mova m10, [pd_34816]
add dstq, wq
mova m11, [pd_0xf00801c7]
mova m11, [pw_455_24]
lea t3, [rsp+wq*2+400*24+40]
mova m12, [pd_0xf00800a4]
mova m12, [pw_164_24]
lea t4, [rsp+wq+400*52+40]
neg wq
pshufd m15, m14, q2222 ; w0 w1
@ -2648,7 +2513,7 @@ cglobal sgr_filter_mix_16bpc, 4, 15, 16, -400*66-40, dst, stride, left, lpf, \
%define lpfm [rsp]
%else
mov r1, [rstk+stack_offset+28] ; params
LEA r6, $$
LEA r6, pw_455_24
add wd, wd
mova m2, [r1]
add lpfm, wq
@ -2848,18 +2713,18 @@ cglobal sgr_filter_mix_16bpc, 4, 15, 16, -400*66-40, dst, stride, left, lpf, \
paddd m2, m7 ; sumsq3
palignr m5, m4, 8
punpcklwd m7, m5, m4
paddw m8, m4, m5
paddd m3, m0
paddw m0, m4, m5
pmaddwd m7, m7
punpckhwd m5, m4
pmaddwd m5, m5
paddd m3, m0
mova [t1+wq+400* 6], m1
mova [t1+wq+400* 8], m2
mova [t1+wq+400*10], m3
paddw m8, m1 ; sum5
paddw m0, m1 ; sum5
paddd m7, m2 ; sumsq5
paddd m5, m3
mova [t1+wq+400* 0], m8
mova [t1+wq+400* 0], m0
mova [t1+wq+400* 2], m7
mova [t1+wq+400* 4], m5
add wq, 16
@ -2930,21 +2795,21 @@ ALIGN function_align
paddd m2, m7 ; h sumsq3
palignr m5, m4, 8
punpcklwd m7, m5, m4
paddw m8, m4, m5
paddd m3, m0
paddw m0, m4, m5
pmaddwd m7, m7
punpckhwd m5, m4
pmaddwd m5, m5
paddd m3, m0
paddw m8, m1 ; h sum5
paddw m0, m1 ; h sum5
paddd m7, m2 ; h sumsq5
paddd m5, m3
mova [t3+wq*2+400*8+ 8], m8
mova [t3+wq*2+400*8+ 8], m0
mova [t3+wq*2+400*0+ 8], m7
mova [t3+wq*2+400*0+24], m5
paddw m8, [t1+wq+400* 0]
paddw m0, [t1+wq+400* 0]
paddd m7, [t1+wq+400* 2]
paddd m5, [t1+wq+400* 4]
mova [t1+wq+400* 0], m8
mova [t1+wq+400* 0], m0
mova [t1+wq+400* 2], m7
mova [t1+wq+400* 4], m5
paddw m0, m1, [t1+wq+400* 6]
@ -2961,48 +2826,28 @@ ALIGN function_align
mova [t2+wq+400*10], m5
paddd m2, m9
paddd m3, m9
movaps m8, [base+pf_256]
psrld m2, 4 ; (a3 + 8) >> 4
psrld m3, 4
%if ARCH_X86_32
pxor m7, m7
%else
SWAP m7, m6
%endif
pslld m4, m2, 3
pslld m5, m3, 3
paddd m4, m2 ; ((a3 + 8) >> 4) * 9
paddd m5, m3
psrlw m3, m1, 1
pavgw m3, m7 ; (b3 + 2) >> 2
punpcklwd m2, m3, m7
pmaddwd m2, m2
punpckhwd m3, m7
pmaddwd m3, m3
punpcklwd m0, m1, m7 ; b3
punpckhwd m1, m7
%if ARCH_X86_64
SWAP m7, m6
%if ARCH_X86_32
pxor m7, m7
pavgw m3, m7
SGR_CALC_X m0, m1, m3, m2, m4, m5, m7, m14, m11, m8
%else
pavgw m3, m6 ; (b3 + 2) >> 2
SGR_CALC_X m0, m1, m3, m2, m4, m5, m6, m14, m11, m8
%endif
MAXSD m4, m2, m7
MAXSD m5, m3, m7
psubd m4, m2 ; p3
psubd m5, m3
MULLD m4, m14, m7 ; p3 * s1
MULLD m5, m14, m7
pmaddwd m0, m11 ; b3 * 455
pmaddwd m1, m11
paddusw m4, m11
paddusw m5, m11
psrld m4, 20 ; min(z3, 255)
psrld m5, 20
GATHER_X_BY_X m3, m4, m5, r0, dstm
punpcklwd m4, m3, m3
punpckhwd m5, m3, m3
MULLD m0, m4, m7
MULLD m1, m5, m7
punpcklwd m2, m3, m3
mova [t4+wq*1+400*2+ 4], m3
punpckhwd m3, m3
MUL_32X16X2 m0, m1, m2, m3, m4, m5
paddd m0, m10 ; x3 * b3 * 455 + (1 << 11) + (1 << 15)
paddd m1, m10
mova [t4+wq*1+400*2+ 4], m3
psrld m0, 12
psrld m1, 12
mova [t3+wq*2+400*4+ 8], m0
@ -3098,49 +2943,29 @@ ALIGN function_align
paddd m3, m7
psrlw m7, m5, 1
pavgw m7, m6 ; (b3 + 2) >> 2
punpcklwd m0, m7, m6
pmaddwd m0, m0
punpckhwd m7, m6
pmaddwd m7, m7
%if ARCH_X86_32
mova [esp+20], m8
mov t3, t3m
SGR_CALC_X m0, m5, m7, m8, m2, m3, m6, m14, m11, [base+pf_256]
%else
SWAP m8, m6
SGR_CALC_X m0, m5, m7, m12, m2, m3, m6, m14, m11, [base+pf_256]
%endif
punpcklwd m2, m7, m7
mova [t4+wq*1+400*4+4], m7
punpckhwd m7, m7
%if ARCH_X86_32
MUL_32X16X2 m0, m5, m2, m7, m3, m8
mova m8, [esp+20]
%else
MUL_32X16X2 m0, m5, m2, m7, m3, m12
mova m12, [pw_164_24]
%endif
MAXSD m2, m0, m8
MAXSD m3, m7, m8
pxor m8, m8
psubd m2, m0 ; p3
psubd m3, m7
punpcklwd m0, m5, m8 ; b3
punpckhwd m5, m8
MULLD m2, m14, m8 ; p3 * s1
MULLD m3, m14, m8
pmaddwd m0, m11 ; b3 * 455
pmaddwd m5, m11
paddusw m2, m11
paddusw m3, m11
psrld m2, 20 ; min(z3, 255)
movif32 t3, t3m
psrld m3, 20
GATHER_X_BY_X m8, m2, m3, r0, dstm
punpcklwd m2, m8, m8
punpckhwd m3, m8, m8
MULLD m0, m2, m7
MULLD m5, m3, m7
paddd m0, m10 ; x3 * b3 * 455 + (1 << 11) + (1 << 15)
paddd m5, m10
psrld m0, 12
psrld m5, 12
mova [t4+wq*1+400*4+4], m8
mova [t3+wq*2+400*8+ 8], m0
mova [t3+wq*2+400*8+24], m5
%if ARCH_X86_32
mova m8, [esp+20]
%else
SWAP m6, m8
pxor m6, m6
%endif
paddw m5, m8, [t2+wq+400*0]
paddd m2, m4, [t2+wq+400*2]
paddd m3, m1, [t2+wq+400*4]
@ -3162,42 +2987,22 @@ ALIGN function_align
pslld m3, 3
paddd m2, m8 ; ((a5 + 8) >> 4) * 25
paddd m3, m4
psrlw m1, m5, 1
%if ARCH_X86_32
pxor m7, m7
pavgw m1, m7
SGR_CALC_X m0, m5, m1, m4, m2, m3, m7, m13, m12, [base+pf_256]
%else
SWAP m7, m6
movaps m8, [base+pf_256]
pavgw m1, m6 ; (b5 + 2) >> 2
SGR_CALC_X m0, m5, m1, m4, m2, m3, m6, m13, m12, m8
%endif
psrlw m1, m5, 1
pavgw m1, m7 ; (b5 + 2) >> 2
punpcklwd m4, m1, m7
pmaddwd m4, m4
punpckhwd m1, m7
pmaddwd m1, m1
punpcklwd m0, m5, m7 ; b5
punpckhwd m5, m7
%if ARCH_X86_64
SWAP m7, m6
%endif
MAXSD m2, m4, m7
psubd m2, m4 ; p5
MAXSD m3, m1, m7
psubd m3, m1
MULLD m2, m13, m7 ; p5 * s0
MULLD m3, m13, m7
pmaddwd m0, m12 ; b5 * 164
pmaddwd m5, m12
paddusw m2, m12
paddusw m3, m12
psrld m2, 20 ; min(z5, 255)
psrld m3, 20
GATHER_X_BY_X m1, m2, m3, r0, dstm
punpcklwd m2, m1, m1
punpckhwd m3, m1, m1
MULLD m0, m2, m7
MULLD m5, m3, m7
mova [t4+wq*1+400*0+ 4], m1
punpckhwd m1, m1
MUL_32X16X2 m0, m5, m2, m1, m3, m4
paddd m0, m10 ; x5 * b5 * 164 + (1 << 11) + (1 << 15)
paddd m5, m10
mova [t4+wq*1+400*0+ 4], m1
psrld m0, 12
psrld m5, 12
mova [t3+wq*2+400*0+ 8], m0
@ -3214,6 +3019,7 @@ ALIGN function_align
%else
mov wd, w0m
%endif
movaps m8, [base+pf_256]
.v0_loop:
mova m0, [t1+wq+400* 6]
mova m4, [t1+wq+400* 8]
@ -3231,46 +3037,25 @@ ALIGN function_align
paddd m3, m9
psrld m2, 4 ; (a3 + 8) >> 4
psrld m3, 4
%if ARCH_X86_32
pxor m7, m7
%else
SWAP m7, m6
%endif
pslld m4, m2, 3
pslld m5, m3, 3
paddd m4, m2 ; ((a3 + 8) >> 4) * 9
paddd m5, m3
psrlw m3, m1, 1
pavgw m3, m7 ; (b3 + 2) >> 2
punpcklwd m2, m3, m7
pmaddwd m2, m2
punpckhwd m3, m7
pmaddwd m3, m3
punpcklwd m0, m1, m7 ; b3
punpckhwd m1, m7
%if ARCH_X86_64
SWAP m7, m6
%if ARCH_X86_32
pxor m7, m7
pavgw m3, m7
SGR_CALC_X m0, m1, m3, m2, m4, m5, m7, m14, m11, m8
%else
pavgw m3, m6 ; (b3 + 2) >> 2
SGR_CALC_X m0, m1, m3, m2, m4, m5, m6, m14, m11, m8
%endif
MAXSD m4, m2, m7
MAXSD m5, m3, m7
psubd m4, m2 ; p3
psubd m5, m3
MULLD m4, m14, m7 ; p3 * s1
MULLD m5, m14, m7
pmaddwd m0, m11 ; b3 * 455
pmaddwd m1, m11
paddusw m4, m11
paddusw m5, m11
psrld m4, 20 ; min(z3, 255)
psrld m5, 20
GATHER_X_BY_X m3, m4, m5, r0, dstm
punpcklwd m4, m3, m3
punpckhwd m5, m3, m3
MULLD m0, m4, m7
MULLD m1, m5, m7
punpcklwd m2, m3, m3
mova [t4+wq*1+400*2+4], m3
punpckhwd m3, m3
MUL_32X16X2 m0, m1, m2, m3, m4, m5
paddd m0, m10 ; x3 * b3 * 455 + (1 << 11) + (1 << 15)
paddd m1, m10
mova [t4+wq*1+400*2+4], m3
psrld m0, 12
psrld m1, 12
mova m3, [t1+wq+400*0]
@ -3296,6 +3081,7 @@ ALIGN function_align
%else
mov wd, w0m
%endif
movaps m8, [base+pf_256]
.v1_loop:
mova m4, [t1+wq+400* 6]
mova m5, [t1+wq+400* 8]
@ -3310,51 +3096,32 @@ ALIGN function_align
paddd m3, m9
psrld m2, 4 ; (a3 + 8) >> 4
psrld m3, 4
%if ARCH_X86_32
pxor m7, m7
%else
SWAP m7, m6
%endif
pslld m4, m2, 3
pslld m5, m3, 3
paddd m4, m2 ; ((a3 + 8) >> 4) * 9
paddd m5, m3
psrlw m3, m1, 1
pavgw m3, m7 ; (b3 + 2) >> 2
punpcklwd m2, m3, m7
pmaddwd m2, m2
punpckhwd m3, m7
pmaddwd m3, m3
punpcklwd m0, m1, m7 ; b3
punpckhwd m1, m7
%if ARCH_X86_64
SWAP m7, m6
%if ARCH_X86_32
pxor m7, m7
pavgw m3, m7
SGR_CALC_X m0, m1, m3, m2, m4, m5, m7, m14, m11, m8
%else
pavgw m3, m6 ; (b3 + 2) >> 2
SGR_CALC_X m0, m1, m3, m2, m4, m5, m6, m14, m11, m8
%endif
MAXSD m4, m2, m7
MAXSD m5, m3, m7
psubd m4, m2 ; p3
psubd m5, m3
MULLD m4, m14, m7 ; p3 * s1
MULLD m5, m14, m7
pmaddwd m0, m11 ; b3 * 455
pmaddwd m1, m11
paddusw m4, m11
paddusw m5, m11
psrld m4, 20 ; min(z3, 255)
psrld m5, 20
GATHER_X_BY_X m3, m4, m5, r0, dstm
punpcklwd m4, m3, m3
punpckhwd m5, m3, m3
MULLD m0, m4, m7
MULLD m1, m5, m7
punpcklwd m2, m3, m3
mova [t4+wq*1+400*4+4], m3
punpckhwd m3, m3
MUL_32X16X2 m0, m1, m2, m3, m4, m5
paddd m0, m10 ; x3 * b3 * 455 + (1 << 11) + (1 << 15)
paddd m1, m10
mova [t4+wq*1+400*4+4], m3
psrld m0, 12
psrld m8, m1, 12
psrld m1, 12
mova m4, [t3+wq*2+400*8+ 8]
mova m5, [t3+wq*2+400*0+ 8]
mova m7, [t3+wq*2+400*0+24]
mova [t3+wq*2+400*8+ 8], m0
mova [t3+wq*2+400*8+24], m1
paddw m1, m4, [t2+wq+400*0]
paddd m2, m5, [t2+wq+400*2]
paddd m3, m7, [t2+wq+400*4]
@ -3368,9 +3135,7 @@ ALIGN function_align
paddd m3, m9
psrld m2, 4 ; (a5 + 8) >> 4
psrld m3, 4
mova [t3+wq*2+400*8+ 8], m0
pslld m4, m2, 4
mova [t3+wq*2+400*8+24], m8
pslld m5, m3, 4
paddd m4, m2
pslld m2, 3
@ -3378,42 +3143,21 @@ ALIGN function_align
pslld m3, 3
paddd m2, m4
paddd m3, m5
psrlw m5, m1, 1
%if ARCH_X86_32
pxor m7, m7
pavgw m5, m7
SGR_CALC_X m0, m1, m5, m4, m2, m3, m7, m13, m12, m8
%else
SWAP m7, m6
pavgw m5, m6 ; (b5 + 2) >> 2
SGR_CALC_X m0, m1, m5, m4, m2, m3, m6, m13, m12, m8
%endif
psrlw m5, m1, 1
pavgw m5, m7 ; (b5 + 2) >> 2
punpcklwd m4, m5, m7
pmaddwd m4, m4
punpckhwd m5, m7
pmaddwd m5, m5
punpcklwd m0, m1, m7 ; b5
punpckhwd m1, m7
%if ARCH_X86_64
SWAP m7, m6
%endif
MAXSD m2, m4, m7
psubd m2, m4 ; p5
MAXSD m3, m5, m7
psubd m3, m5
MULLD m2, m13, m7 ; p5 * s0
MULLD m3, m13, m7
pmaddwd m0, m12 ; b5 * 164
pmaddwd m1, m12
paddusw m2, m12
paddusw m3, m12
psrld m2, 20 ; min(z5, 255)
psrld m3, 20
GATHER_X_BY_X m4, m2, m3, r0, dstm
punpcklwd m2, m4, m4
punpckhwd m3, m4, m4
MULLD m0, m2, m7
MULLD m1, m3, m7
punpcklwd m4, m5, m5
mova [t4+wq*1+400*0+ 4], m5
punpckhwd m5, m5
MUL_32X16X2 m0, m1, m4, m5, m2, m3
paddd m0, m10 ; x5 * b5 * 164 + (1 << 11) + (1 << 15)
paddd m1, m10
mova [t4+wq*1+400*0+ 4], m4
psrld m0, 12
psrld m1, 12
mova [t3+wq*2+400*0+ 8], m0
@ -3431,20 +3175,20 @@ ALIGN function_align
movu m0, [t4+wq*1+400*0+ 2]
movu m1, [t3+wq*2+400*0+ 4]
movu m2, [t3+wq*2+400*0+20]
movu m7, [t4+wq*1+400*0+ 4]
movu m8, [t3+wq*2+400*0+ 8]
paddw m3, m0, [t4+wq*1+400*0+ 0]
paddd m4, m1, [t3+wq*2+400*0+ 0]
movu m3, [t4+wq*1+400*0+ 4]
movu m4, [t3+wq*2+400*0+ 8]
paddw m3, [t4+wq*1+400*0+ 0]
paddd m4, [t3+wq*2+400*0+ 0]
paddd m5, m2, [t3+wq*2+400*0+16]
paddw m3, m7
paddd m4, m8
movu m7, [t3+wq*2+400*0+24]
paddw m3, m0
paddd m4, m1
paddd m5, m7
paddw m0, m3
paddd m1, m4
paddd m2, m5
psllw m3, 2
pslld m4, 2
paddd m5, m7
paddd m2, m5
pslld m5, 2
paddw m0, m3 ; a5 565
paddd m1, m4 ; b5 565

third_party/dav1d/src/x86/looprestoration_sse.asm

@ -45,13 +45,12 @@ pb_1: times 16 db 1
pw_256: times 8 dw 256
pw_2056: times 8 dw 2056
pw_m16380: times 8 dw -16380
pw_164_24: times 4 dw 164, 24
pw_455_24: times 4 dw 455, 24
pd_4096: times 4 dd 4096
pd_34816: times 4 dd 34816
pd_0xffff: times 4 dd 0xffff
pd_0xf00800a4: times 4 dd 0xf00800a4
pd_0xf00801c7: times 4 dd 0xf00801c7
cextern sgr_x_by_x
pf_256: times 4 dd 256.0
SECTION .text
@ -1125,46 +1124,43 @@ WIENER
;; self-guided ;;
;;;;;;;;;;;;;;;;;;;;;;;;;;
%macro GATHERDD 3 ; dst, src, tmp
movd %3d, %2
%if ARCH_X86_64
movd %1, [r13+%3]
pextrw %3d, %2, 2
pinsrw %1, [r13+%3+2], 3
pextrw %3d, %2, 4
pinsrw %1, [r13+%3+2], 5
pextrw %3d, %2, 6
pinsrw %1, [r13+%3+2], 7
%else
movd %1, [base+sgr_x_by_x-0xf03+%3]
pextrw %3, %2, 2
pinsrw %1, [base+sgr_x_by_x-0xf03+%3+2], 3
pextrw %3, %2, 4
pinsrw %1, [base+sgr_x_by_x-0xf03+%3+2], 5
pextrw %3, %2, 6
pinsrw %1, [base+sgr_x_by_x-0xf03+%3+2], 7
%endif
%macro MUL_32X16X2 6 ; dst[1-2], src[1-2], tmp[1-2]
pmulhuw %5, %1, %3
pmulhuw %6, %2, %4
pmullw %1, %3
pmullw %2, %4
pslld %5, 16
pslld %6, 16
paddd %1, %5
paddd %2, %6
%endmacro
%macro GATHER_X_BY_X 5 ; dst, src0, src1, tmp32, tmp32_restore
%if ARCH_X86_64
%define tmp r14
%else
%define tmp %4
%endif
GATHERDD %1, %2, tmp
GATHERDD %2, %3, tmp
movif32 %4, %5
psrld %1, 24
psrld %2, 24
%macro SGR_CALC_X 9 ; dst, tmp, b[1-2], an[1-2], s, b_mul, pf_256
pmaddwd %1, %3, %3 ; b * b
pmaddwd %2, %4, %4
psubd %5, %1 ; p
psubd %6, %2
MUL_32X16X2 %5, %6, %7, %7, %1, %2 ; p * s
pmaddwd %3, %8 ; b * b_mul
pmaddwd %4, %8
paddw %5, %8
paddw %6, %8
psrld %5, 20 ; z + 1
psrld %6, 20
cvtdq2ps %5, %5
cvtdq2ps %6, %6
rcpps %1, %5 ; 1 / (z + 1)
rcpps %2, %6
cmpltps %5, %9
cmpltps %6, %9
mulps %1, %9 ; 256 / (z + 1)
mulps %2, %9
packssdw %5, %6
cvtps2dq %1, %1
cvtps2dq %2, %2
psrlw %5, 8 ; z < 255 ? 255 : 0
packssdw %1, %2
%endmacro
%macro MULLD 3 ; dst, src, tmp
pmulhuw %3, %1, %2
pmullw %1, %2
pslld %3, 16
paddd %1, %3
pminsw %1, %5 ; x
%endmacro
%if ARCH_X86_32
@ -1200,12 +1196,13 @@ cglobal sgr_filter_5x5_8bpc, 1, 7, 8, -400*24-16-extra_stack, \
%define t4m dword [esp+calloff+4*5]
%define m8 [base+pb_1]
%define m9 [esp+calloff+16*2]
%define m10 [base+pd_0xf00800a4]
%define m10 [base+pw_164_24]
%define m11 [base+sgr_lshuf5]
%define m12 [base+pd_34816]
%define m13 [base+pb_0to15]
%define m14 [base+pf_256]
%define r10 r4
%define base r6-$$
%define base r6-pw_2056
%assign calloff 0
%if STACK_ALIGNMENT < 16
mov strideq, [rstk+stack_offset+ 8]
@ -1223,7 +1220,7 @@ cglobal sgr_filter_5x5_8bpc, 1, 7, 8, -400*24-16-extra_stack, \
%endif
%else
DECLARE_REG_TMP 8, 7, 9, 11, 12
cglobal sgr_filter_5x5_8bpc, 4, 15, 14, -400*24-16, dst, stride, left, lpf, \
cglobal sgr_filter_5x5_8bpc, 4, 13, 15, -400*24-16, dst, stride, left, lpf, \
w, h, edge, params
%endif
%if ARCH_X86_64 || STACK_ALIGNMENT >= 16
@ -1231,14 +1228,13 @@ cglobal sgr_filter_5x5_8bpc, 4, 15, 14, -400*24-16, dst, stride, left, lpf, \
%endif
%if ARCH_X86_64
mov paramsq, r6mp
lea r13, [sgr_x_by_x-0xf03]
movifnidn hd, hm
mov edged, r7m
movu m9, [paramsq]
add lpfq, wq
mova m8, [pb_1]
lea t1, [rsp+wq*2+20]
mova m10, [pd_0xf00800a4]
mova m10, [pw_164_24]
add dstq, wq
lea t3, [rsp+wq*4+400*12+16]
mova m12, [pd_34816] ; (1 << 11) + (1 << 15)
@ -1251,11 +1247,12 @@ cglobal sgr_filter_5x5_8bpc, 4, 15, 14, -400*24-16, dst, stride, left, lpf, \
pxor m6, m6
mova m11, [sgr_lshuf5]
psllw m7, 4
movaps m14, [pf_256]
DEFINE_ARGS dst, stride, left, lpf, _, h, edge, _, _, _, w
%define lpfm [rsp]
%else
mov r1, [rstk+stack_offset+28] ; params
LEA r6, $$
LEA r6, pw_2056
movu m1, [r1]
add lpfm, wq
lea t1, [rsp+extra_stack+wq*2+20]
@ -1282,7 +1279,6 @@ cglobal sgr_filter_5x5_8bpc, 4, 15, 14, -400*24-16, dst, stride, left, lpf, \
jz .no_top
call .h_top
add lpfq, stridemp
movif32 t2m, t1
mov t2, t1
call .top_fixup
add t1, 400*6
@ -1362,7 +1358,6 @@ cglobal sgr_filter_5x5_8bpc, 4, 15, 14, -400*24-16, dst, stride, left, lpf, \
mov lpfm, r10
call .h
lea t2, [t1+400*6]
movif32 t2m, t2
call .top_fixup
dec hd
jz .no_top_height1
@ -1582,27 +1577,14 @@ ALIGN function_align
paddd m5, m2
punpcklwd m0, m1, m6 ; b
punpckhwd m1, m6
pmaddwd m2, m0, m0 ; b * b
pmaddwd m3, m1, m1
psubd m4, m2 ; p
psubd m5, m3
MULLD m4, m9, m2 ; p * s
MULLD m5, m9, m2
pmaddwd m0, m10 ; b * 164
pmaddwd m1, m10
paddusw m4, m10
paddusw m5, m10
psrld m4, 20 ; min(z, 255)
SGR_CALC_X m3, m2, m0, m1, m4, m5, m9, m10, m14
movif32 t3, t3m
psrld m5, 20
GATHER_X_BY_X m3, m4, m5, t2, t2m
punpcklwd m4, m3, m3
punpckhwd m5, m3, m3
MULLD m0, m4, m2
MULLD m1, m5, m2
punpcklwd m2, m3, m3
mova [t4+wq*2+4], m3
punpckhwd m3, m3
MUL_32X16X2 m0, m1, m2, m3, m4, m5
paddd m0, m12 ; x * b * 164 + (1 << 11) + (1 << 15)
paddd m1, m12
mova [t4+wq*2+4], m3
psrld m0, 12 ; b
psrld m1, 12
mova [t3+wq*4+ 8], m0
@ -1612,7 +1594,6 @@ ALIGN function_align
mov t2, t1
mov t1, t0
mov t0, t2
movif32 t2m, t2
movif32 t0m, t0
ret
.hv_last_row: ; esoteric edge case for odd heights
@ -1652,26 +1633,13 @@ ALIGN function_align
paddd m5, m2
punpcklwd m0, m1, m6
punpckhwd m1, m6
pmaddwd m2, m0, m0 ; b * b
pmaddwd m3, m1, m1
psubd m4, m2 ; p
psubd m5, m3
MULLD m4, m9, m2 ; p * s
MULLD m5, m9, m2
pmaddwd m0, m10 ; b * 164
pmaddwd m1, m10
paddusw m4, m10
paddusw m5, m10
psrld m4, 20 ; min(z, 255)
psrld m5, 20
GATHER_X_BY_X m3, m4, m5, t2, t2m
punpcklwd m4, m3, m3
punpckhwd m5, m3, m3
MULLD m0, m4, m2
MULLD m1, m5, m2
SGR_CALC_X m3, m2, m0, m1, m4, m5, m9, m10, m14
punpcklwd m2, m3, m3
mova [t4+wq*2+4], m3
punpckhwd m3, m3
MUL_32X16X2 m0, m1, m2, m3, m4, m5
paddd m0, m12 ; x * b * 164 + (1 << 11) + (1 << 15)
paddd m1, m12
mova [t4+wq*2+4], m3
psrld m0, 12 ; b
psrld m1, 12
mova [t3+wq*4+ 8], m0
@ -1824,11 +1792,12 @@ cglobal sgr_filter_3x3_8bpc, 1, 7, 8, -400*42-16-extra_stack, \
%define t4m dword [esp+calloff+4*3]
%define m8 [base+pb_0to15]
%define m9 [esp+calloff+16*1]
%define m10 [base+pd_0xf00801c7]
%define m10 [base+pw_455_24]
%define m11 [base+pd_34816]
%define m12 m6
%define m13 [base+sgr_lshuf3]
%define base r6-$$
%define m14 [base+pf_256]
%define base r6-pw_2056
%assign calloff 0
%if STACK_ALIGNMENT < 16
mov strideq, [rstk+stack_offset+ 8]
@ -1845,7 +1814,7 @@ cglobal sgr_filter_3x3_8bpc, 1, 7, 8, -400*42-16-extra_stack, \
mov edged, r2
%endif
%else
cglobal sgr_filter_3x3_8bpc, 4, 15, 14, -400*42-8, dst, stride, left, lpf, \
cglobal sgr_filter_3x3_8bpc, 4, 13, 15, -400*42-8, dst, stride, left, lpf, \
w, h, edge, params
%endif
%if ARCH_X86_64 || STACK_ALIGNMENT >= 16
@ -1853,7 +1822,6 @@ cglobal sgr_filter_3x3_8bpc, 4, 15, 14, -400*42-8, dst, stride, left, lpf, \
%endif
%if ARCH_X86_64
mov paramsq, r6mp
lea r13, [sgr_x_by_x-0xf03]
mov hd, hm
mov edged, r7m
movq m9, [paramsq+4]
@ -1862,7 +1830,7 @@ cglobal sgr_filter_3x3_8bpc, 4, 15, 14, -400*42-8, dst, stride, left, lpf, \
mova m8, [pb_0to15]
add dstq, wq
lea t3, [rsp+wq*4+400*12+8]
mova m10, [pd_0xf00801c7]
mova m10, [pw_455_24]
lea t4, [rsp+wq*2+400*32+8]
mova m11, [pd_34816]
pshuflw m7, m9, q3333
@ -1872,11 +1840,12 @@ cglobal sgr_filter_3x3_8bpc, 4, 15, 14, -400*42-8, dst, stride, left, lpf, \
pxor m6, m6
mova m13, [sgr_lshuf3]
psllw m7, 4
movaps m14, [pf_256]
DEFINE_ARGS dst, stride, left, lpf, _, h, edge, _, _, _, w
%define lpfm [rsp]
%else
mov r1, [rstk+stack_offset+28] ; params
LEA r6, $$
LEA r6, pw_2056
movq m1, [r1+4]
add lpfm, wq
lea t1, [rsp+extra_stack+wq*2+20]
@ -2148,31 +2117,15 @@ ALIGN function_align
paddd m4, m2 ; a * 9
paddd m5, m3
punpcklwd m0, m1, m6 ; b
pmaddwd m2, m0, m0 ; b * b
punpckhwd m1, m6
pmaddwd m3, m1, m1
psubd m4, m2 ; p
psubd m5, m3
MULLD m4, m9, m12 ; p * s
MULLD m5, m9, m12
pmaddwd m0, m10 ; b * 455
pmaddwd m1, m10
paddusw m4, m10
paddusw m5, m10
psrld m4, 20 ; min(z, 255)
SGR_CALC_X m3, m2, m0, m1, m4, m5, m9, m10, m14
movif32 t3, t3m
psrld m5, 20
GATHER_X_BY_X m3, m4, m5, r0, dstm
punpcklwd m4, m3, m3
punpckhwd m5, m3, m3
MULLD m0, m4, m12
MULLD m1, m5, m12
%if ARCH_X86_32
pxor m6, m6
%endif
punpcklwd m2, m3, m3
mova [t4+wq*2+4], m3
punpckhwd m3, m3
MUL_32X16X2 m0, m1, m2, m3, m4, m5
paddd m0, m11 ; x * b * 455 + (1 << 11) + (1 << 15)
paddd m1, m11
mova [t4+wq*2+4], m3
psrld m0, 12
psrld m1, 12
mova [t3+wq*4+ 8], m0
@ -2251,31 +2204,15 @@ ALIGN function_align
paddd m4, m2 ; a * 9
paddd m5, m3
punpcklwd m0, m1, m6 ; b
pmaddwd m2, m0, m0 ; b * b
punpckhwd m1, m6
pmaddwd m3, m1, m1
psubd m4, m2 ; p
psubd m5, m3
MULLD m4, m9, m12 ; p * s
MULLD m5, m9, m12
pmaddwd m0, m10 ; b * 455
pmaddwd m1, m10
paddusw m4, m10
paddusw m5, m10
psrld m4, 20 ; min(z, 255)
SGR_CALC_X m3, m2, m0, m1, m4, m5, m9, m10, m14
movif32 t3, t3m
psrld m5, 20
GATHER_X_BY_X m3, m4, m5, r0, dstm
punpcklwd m4, m3, m3
punpckhwd m5, m3, m3
MULLD m0, m4, m12
MULLD m1, m5, m12
%if ARCH_X86_32
pxor m6, m6
%endif
punpcklwd m2, m3, m3
mova [t4+wq*2+400*2 +4], m3
punpckhwd m3, m3
MUL_32X16X2 m0, m1, m2, m3, m4, m5
paddd m0, m11 ; x * b * 455 + (1 << 11) + (1 << 15)
paddd m1, m11
mova [t4+wq*2+400*2 +4], m3
psrld m0, 12
psrld m1, 12
mova [t3+wq*4+400*4+ 8], m0
@ -2310,30 +2247,14 @@ ALIGN function_align
paddd m4, m2 ; a * 9
paddd m5, m3
punpcklwd m0, m1, m6 ; b
pmaddwd m2, m0, m0 ; b * b
punpckhwd m1, m6
pmaddwd m3, m1, m1
psubd m4, m2 ; p
psubd m5, m3
MULLD m4, m9, m12 ; p * s
MULLD m5, m9, m12
pmaddwd m0, m10 ; b * 455
pmaddwd m1, m10
paddusw m4, m10
paddusw m5, m10
psrld m4, 20 ; min(z, 255)
psrld m5, 20
GATHER_X_BY_X m3, m4, m5, r0, dstm
punpcklwd m4, m3, m3
punpckhwd m5, m3, m3
MULLD m0, m4, m12
MULLD m1, m5, m12
%if ARCH_X86_32
pxor m6, m6
%endif
SGR_CALC_X m3, m2, m0, m1, m4, m5, m9, m10, m14
punpcklwd m2, m3, m3
mova [t4+wq*2+4], m3
punpckhwd m3, m3
MUL_32X16X2 m0, m1, m2, m3, m4, m5
paddd m0, m11 ; x * b * 455 + (1 << 11) + (1 << 15)
paddd m1, m11
mova [t4+wq*2+4], m3
psrld m0, 12
psrld m1, 12
mova [t3+wq*4+ 8], m0
@ -2362,30 +2283,14 @@ ALIGN function_align
paddd m4, m2 ; a * 9
paddd m5, m3
punpcklwd m0, m1, m6 ; b
pmaddwd m2, m0, m0 ; b * b
punpckhwd m1, m6
pmaddwd m3, m1, m1
psubd m4, m2 ; p
psubd m5, m3
MULLD m4, m9, m12 ; p * s
MULLD m5, m9, m12
pmaddwd m0, m10 ; b * 455
pmaddwd m1, m10
paddusw m4, m10
paddusw m5, m10
psrld m4, 20 ; min(z, 255)
psrld m5, 20
GATHER_X_BY_X m3, m4, m5, r0, dstm
punpcklwd m4, m3, m3
punpckhwd m5, m3, m3
MULLD m0, m4, m12
MULLD m1, m5, m12
%if ARCH_X86_32
pxor m6, m6
%endif
SGR_CALC_X m3, m2, m0, m1, m4, m5, m9, m10, m14
punpcklwd m2, m3, m3
mova [t4+wq*2+400*2+ 4], m3
punpckhwd m3, m3
MUL_32X16X2 m0, m1, m2, m3, m4, m5
paddd m0, m11 ; x * b * 455 + (1 << 11) + (1 << 15)
paddd m1, m11
mova [t4+wq*2+400*2+ 4], m3
psrld m0, 12
psrld m1, 12
mova [t3+wq*4+400*4+ 8], m0
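; The two blocks above appear to be the paired vertical passes of the 3x3
; filter: identical p/z/x/b math, with one row group written to the base
; t3/t4 slots and the other to the +400*4 (t3) / +400*2 (t4) slots, matching
; the store offsets visible above.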
@ -2593,13 +2498,13 @@ cglobal sgr_filter_mix_8bpc, 1, 7, 8, -400*66-48-extra_stack, \
%xdefine m8 m6
%define m9 [base+pd_0xffff]
%define m10 [base+pd_34816]
%define m11 [base+pd_0xf00801c7]
%define m12 [base+pd_0xf00800a4]
%define m11 [base+pw_455_24]
%define m12 [base+pw_164_24]
%define m13 [esp+calloff+16*4]
%define m14 [esp+calloff+16*5]
%define m15 [esp+calloff+16*6]
%define m6 [esp+calloff+16*7]
%define base r6-$$
%define base r6-pw_2056
%assign calloff 0
%if STACK_ALIGNMENT < 16
mov strideq, [rstk+stack_offset+ 8]
@ -2616,7 +2521,7 @@ cglobal sgr_filter_mix_8bpc, 1, 7, 8, -400*66-48-extra_stack, \
mov edged, r2
%endif
%else
cglobal sgr_filter_mix_8bpc, 4, 15, 16, -400*66-40, dst, stride, left, lpf, \
cglobal sgr_filter_mix_8bpc, 4, 13, 16, -400*66-40, dst, stride, left, lpf, \
w, h, edge, params
%endif
%if ARCH_X86_64 || STACK_ALIGNMENT >= 16
@ -2624,7 +2529,6 @@ cglobal sgr_filter_mix_8bpc, 4, 15, 16, -400*66-40, dst, stride, left, lpf, \
%endif
%if ARCH_X86_64
mov paramsq, r6mp
lea r13, [sgr_x_by_x-0xf03]
movifnidn hd, hm
mov edged, r7m
mova m15, [paramsq]
@ -2634,9 +2538,9 @@ cglobal sgr_filter_mix_8bpc, 4, 15, 16, -400*66-40, dst, stride, left, lpf, \
mova m10, [pd_34816]
add dstq, wq
lea t3, [rsp+wq*4+400*24+40]
mova m11, [pd_0xf00801c7]
mova m11, [pw_455_24]
lea t4, [rsp+wq*2+400*52+40]
mova m12, [base+pd_0xf00800a4]
mova m12, [pw_164_24]
neg wq
pshuflw m13, m15, q0000
pshuflw m14, m15, q2222
@ -2650,7 +2554,7 @@ cglobal sgr_filter_mix_8bpc, 4, 15, 16, -400*66-40, dst, stride, left, lpf, \
%define lpfm [rsp]
%else
mov r1, [rstk+stack_offset+28] ; params
LEA r6, $$
LEA r6, pw_2056
mova m2, [r1]
add lpfm, wq
lea t1, [rsp+extra_stack+wq*2+52]
@ -2991,40 +2895,26 @@ ALIGN function_align
mova [t2+wq*2+400* 6], m0
mova [t2+wq*2+400* 8], m4
mova [t2+wq*2+400*10], m5
%if ARCH_X86_32
pxor m7, m7
%else
SWAP m7, m6
%endif
pslld m4, m2, 3
pslld m5, m3, 3
paddd m4, m2 ; a3 * 9
paddd m5, m3
punpcklwd m0, m1, m7 ; b3
pmaddwd m2, m0, m0
punpckhwd m1, m7
pmaddwd m3, m1, m1
%if ARCH_X86_64
SWAP m7, m6
movaps m7, [base+pf_256]
%if ARCH_X86_32
pxor m2, m2
punpcklwd m0, m1, m2
punpckhwd m1, m2
%else
punpcklwd m0, m1, m6 ; b3
punpckhwd m1, m6
%endif
psubd m4, m2 ; p3
psubd m5, m3
MULLD m4, m14, m7 ; p3 * s1
MULLD m5, m14, m7
pmaddwd m0, m11 ; b3 * 455
pmaddwd m1, m11
paddusw m4, m11
paddusw m5, m11
psrld m4, 20 ; min(z3, 255)
psrld m5, 20
GATHER_X_BY_X m3, m4, m5, r0, dstm
punpcklwd m4, m3, m3
punpckhwd m5, m3, m3
MULLD m0, m4, m7
MULLD m1, m5, m7
SGR_CALC_X m3, m2, m0, m1, m4, m5, m14, m11, m7
punpcklwd m2, m3, m3
mova [t4+wq*2+400*2+ 4], m3
punpckhwd m3, m3
MUL_32X16X2 m0, m1, m2, m3, m4, m5
paddd m0, m10 ; x3 * b3 * 455 + (1 << 11) + (1 << 15)
paddd m1, m10
mova [t4+wq*2+400*2+ 4], m3
psrld m0, 12
psrld m1, 12
mova [t3+wq*4+400*4+ 8], m0
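; sgr_filter_mix interleaves both box sizes, per the inline comments: the
; a3/b3 terms use s1 and the 455 (~4096/9) multiplier, while the a5/b5 terms
; further below use s0 and the 164 (~4096/25) multiplier; the rounding bias
; (1 << 11) + (1 << 15) and the final >> 12 are common to both arms.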
@ -3127,30 +3017,17 @@ ALIGN function_align
SWAP m8, m6
%endif
punpcklwd m0, m5, m8 ; b3
pmaddwd m7, m0, m0
punpckhwd m5, m8
pmaddwd m8, m5, m5
psubd m2, m7 ; p3
psubd m3, m8
MULLD m2, m14, m8 ; p3 * s1
MULLD m3, m14, m8
pmaddwd m0, m11 ; b3 * 455
pmaddwd m5, m11
paddusw m2, m11
paddusw m3, m11
psrld m2, 20 ; min(z3, 255)
SGR_CALC_X m8, m7, m0, m5, m2, m3, m14, m11, [base+pf_256]
movif32 t3, t3m
psrld m3, 20
GATHER_X_BY_X m8, m2, m3, r0, dstm
punpcklwd m2, m8, m8
punpckhwd m3, m8, m8
MULLD m0, m2, m7
MULLD m5, m3, m7
mova [t4+wq*2+400*4+ 4], m8
punpckhwd m8, m8
MUL_32X16X2 m0, m5, m2, m8, m3, m7
paddd m0, m10 ; x3 * b3 * 455 + (1 << 11) + (1 << 15)
paddd m5, m10
psrld m0, 12
psrld m5, 12
mova [t4+wq*2+400*4+ 4], m8
mova [t3+wq*4+400*8+ 8], m0
mova [t3+wq*4+400*8+24], m5
%if ARCH_X86_32
@ -3172,40 +3049,26 @@ ALIGN function_align
mova [t2+wq*2+400*4], m1
pslld m4, m2, 3
paddd m2, m0
pslld m7, m3, 3
pslld m0, m3, 3
paddd m3, m8
paddd m2, m4 ; a5 * 25
paddd m3, m7
paddd m3, m0
%if ARCH_X86_32
pxor m7, m7
%else
SWAP m7, m6
%endif
punpcklwd m0, m5, m7 ; b5
pmaddwd m4, m0, m0
punpcklwd m0, m5, m7
punpckhwd m5, m7
pmaddwd m1, m5, m5
%if ARCH_X86_64
SWAP m7, m6
%else
punpcklwd m0, m5, m6 ; b5
punpckhwd m5, m6
%endif
psubd m2, m4 ; p5
psubd m3, m1
MULLD m2, m13, m7 ; p5 * s0
MULLD m3, m13, m7
pmaddwd m0, m12 ; b5 * 164
pmaddwd m5, m12
paddusw m2, m12
paddusw m3, m12
psrld m2, 20 ; min(z5, 255)
psrld m3, 20
GATHER_X_BY_X m1, m2, m3, r0, dstm
movaps m8, [base+pf_256]
SGR_CALC_X m1, m4, m0, m5, m2, m3, m13, m12, m8
punpcklwd m2, m1, m1
punpckhwd m3, m1, m1
MULLD m0, m2, m7
MULLD m5, m3, m7
mova [t4+wq*2+4], m1
punpckhwd m1, m1
MUL_32X16X2 m0, m5, m2, m1, m3, m4
paddd m0, m10 ; x5 * b5 * 164 + (1 << 11) + (1 << 15)
paddd m5, m10
mova [t4+wq*2+4], m1
psrld m0, 12
psrld m5, 12
mova [t3+wq*4+ 8], m0
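; Buffer layout for the mix filter, as suggested by the stores above: the
; 5x5 (b5) results land in the base t4/t3 slots, while the 3x3 (b3) results
; go to the +400*2 (t4) and +400*4 or +400*8 (t3) slots, presumably so the
; later weighting stage can read both sets per pixel.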
@ -3222,6 +3085,7 @@ ALIGN function_align
%else
mov wd, w0m
%endif
movaps m8, [base+pf_256]
.v0_loop:
mova m0, [t1+wq*2+400* 6]
mova m4, [t1+wq*2+400* 8]
@ -3235,40 +3099,25 @@ ALIGN function_align
mova [t2+wq*2+400* 6], m0
mova [t2+wq*2+400* 8], m4
mova [t2+wq*2+400*10], m5
%if ARCH_X86_32
pxor m7, m7
%else
SWAP m7, m6
%endif
pslld m4, m2, 3
pslld m5, m3, 3
paddd m4, m2 ; a3 * 9
paddd m5, m3
punpcklwd m0, m1, m7 ; b3
pmaddwd m2, m0, m0
%if ARCH_X86_32
pxor m7, m7
punpcklwd m0, m1, m7
punpckhwd m1, m7
pmaddwd m3, m1, m1
psubd m4, m2 ; p3
psubd m5, m3
%if ARCH_X86_64
SWAP m7, m6
%else
punpcklwd m0, m1, m6 ; b3
punpckhwd m1, m6
%endif
MULLD m4, m14, m7 ; p3 * s1
MULLD m5, m14, m7
pmaddwd m0, m11 ; b3 * 455
pmaddwd m1, m11
paddusw m4, m11
paddusw m5, m11
psrld m4, 20 ; min(z3, 255)
psrld m5, 20
GATHER_X_BY_X m3, m4, m5, r0, dstm
punpcklwd m4, m3, m3
punpckhwd m5, m3, m3
MULLD m0, m4, m7
MULLD m1, m5, m7
SGR_CALC_X m3, m2, m0, m1, m4, m5, m14, m11, m8
punpcklwd m2, m3, m3
mova [t4+wq*2+400*2+4], m3
punpckhwd m3, m3
MUL_32X16X2 m0, m1, m2, m3, m4, m5
paddd m0, m10 ; x3 * b3 * 455 + (1 << 11) + (1 << 15)
paddd m1, m10
mova [t4+wq*2+400*2+4], m3
psrld m0, 12
psrld m1, 12
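; .v0 reloads the per-row 3x3 sums from t1 and stashes them in t2 for the
; next row pair before rerunning the same p3/z3/x3/b3 math (see the t1/t2
; moves at the top of the loop above).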
mova m3, [t1+wq*2+400*0]
@ -3298,48 +3147,34 @@ ALIGN function_align
mova m4, [t1+wq*2+400* 6]
mova m5, [t1+wq*2+400* 8]
mova m7, [t1+wq*2+400*10]
paddw m1, m4, [t2+wq*2+400* 6]
paddw m8, m4, [t2+wq*2+400* 6]
paddd m2, m5, [t2+wq*2+400* 8]
paddd m3, m7, [t2+wq*2+400*10]
mova [t2+wq*2+400* 6], m4
mova [t2+wq*2+400* 8], m5
mova [t2+wq*2+400*10], m7
%if ARCH_X86_32
pxor m7, m7
%else
SWAP m7, m6
%endif
pslld m4, m2, 3
pslld m5, m3, 3
paddd m4, m2 ; ((a3 + 8) >> 4) * 9
paddd m5, m3
punpcklwd m0, m1, m7 ; b3
pmaddwd m2, m0, m0
punpckhwd m1, m7
pmaddwd m3, m1, m1
psubd m4, m2 ; p3
psubd m5, m3
%if ARCH_X86_64
SWAP m7, m6
movaps m1, [base+pf_256]
%if ARCH_X86_32
pxor m7, m7
punpcklwd m0, m8, m7
punpckhwd m8, m7
%else
punpcklwd m0, m8, m6 ; b3
punpckhwd m8, m6
%endif
MULLD m4, m14, m7 ; p3 * s1
MULLD m5, m14, m7
pmaddwd m0, m11 ; b3 * 455
pmaddwd m1, m11
paddusw m4, m11
paddusw m5, m11
psrld m4, 20 ; min(z3, 255)
psrld m5, 20
GATHER_X_BY_X m3, m4, m5, r0, dstm
punpcklwd m4, m3, m3
punpckhwd m5, m3, m3
MULLD m0, m4, m7
MULLD m1, m5, m7
paddd m0, m10 ; x3 * b3 * 455 + (1 << 11) + (1 << 15)
paddd m1, m10
SGR_CALC_X m3, m2, m0, m8, m4, m5, m14, m11, m1
punpcklwd m2, m3, m3
mova [t4+wq*2+400*4+4], m3
punpckhwd m3, m3
MUL_32X16X2 m0, m8, m2, m3, m4, m5
paddd m0, m10 ; x3 * b3 * 455 + (1 << 11) + (1 << 15)
paddd m8, m10
psrld m0, 12
psrld m8, m1, 12
psrld m8, 12
mova m4, [t3+wq*4+400*8+ 8]
mova m5, [t3+wq*4+400*0+ 8]
mova m7, [t3+wq*4+400*0+24]
@ -3358,40 +3193,26 @@ ALIGN function_align
mova [t3+wq*4+400*8+24], m8
pslld m7, m2, 3
paddd m2, m4
pslld m8, m3, 3
pslld m4, m3, 3
paddd m3, m5
paddd m2, m7 ; a5 * 25
paddd m3, m8
paddd m3, m4
movaps m8, [base+pf_256]
%if ARCH_X86_32
pxor m7, m7
%else
SWAP m7, m6
%endif
punpcklwd m0, m1, m7 ; b5
pmaddwd m4, m0, m0
punpcklwd m0, m1, m7
punpckhwd m1, m7
pmaddwd m5, m1, m1
psubd m2, m4 ; p5
psubd m3, m5
%if ARCH_X86_64
SWAP m7, m6
%else
punpcklwd m0, m1, m6 ; b5
punpckhwd m1, m6
%endif
MULLD m2, m13, m7 ; p5 * s0
MULLD m3, m13, m7
pmaddwd m0, m12 ; b5 * 164
pmaddwd m1, m12
paddusw m2, m12
paddusw m3, m12
psrld m2, 20 ; min(z5, 255)
psrld m3, 20
GATHER_X_BY_X m4, m2, m3, r0, dstm
punpcklwd m2, m4, m4
punpckhwd m3, m4, m4
MULLD m0, m2, m7
MULLD m1, m3, m7
SGR_CALC_X m5, m4, m0, m1, m2, m3, m13, m12, m8
punpcklwd m4, m5, m5
mova [t4+wq*2+4], m5
punpckhwd m5, m5
MUL_32X16X2 m0, m1, m4, m5, m2, m3
paddd m0, m10 ; x5 * b5 * 164 + (1 << 11) + (1 << 15)
paddd m1, m10
mova [t4+wq*2+4], m4
psrld m0, 12
psrld m1, 12
mova [t3+wq*4+ 8], m0