Bug 1875883 - Update dav1d to a6878be7e07114f5a2915ad46300700f0db55197 r=media-playback-reviewers,padenot

Differential Revision: https://phabricator.services.mozilla.com/D200241
Chun-Min Chang 2024-02-02 18:17:49 +00:00
parent 80236c4ec3
commit 8966466027
41 changed files with 19328 additions and 1499 deletions


@@ -20,11 +20,11 @@ origin:
# Human-readable identifier for this version/release
# Generally "version NNN", "tag SSS", "bookmark SSS"
release: 746ab8b4f3021d7263c64d6b5d6f1e2c281c7acc (2023-12-19T13:15:43.000+01:00).
release: a6878be7e07114f5a2915ad46300700f0db55197 (2024-01-31T06:04:21.000-05:00).
# Revision to pull in
# Must be a long or short commit SHA (long preferred)
revision: 746ab8b4f3021d7263c64d6b5d6f1e2c281c7acc
revision: a6878be7e07114f5a2915ad46300700f0db55197
# The package's license, where possible using the mnemonic from
# https://spdx.org/licenses/


@@ -1,2 +1,2 @@
/* auto-generated, do not edit */
#define DAV1D_VERSION "746ab8b4f3021d7263c64d6b5d6f1e2c281c7acc"
#define DAV1D_VERSION "a6878be7e07114f5a2915ad46300700f0db55197"


@@ -60,7 +60,7 @@
#define ALIGN_64_VAL 64
#define ALIGN_32_VAL 32
#define ALIGN_16_VAL 16
#elif ARCH_X86_32 || ARCH_ARM || ARCH_AARCH64 || ARCH_PPC64LE
#elif ARCH_AARCH64 || ARCH_ARM || ARCH_LOONGARCH || ARCH_PPC64LE || ARCH_X86_32
/* ARM doesn't benefit from anything more than 16-byte alignment. */
#define ALIGN_64_VAL 16
#define ALIGN_32_VAL 16


@@ -62,11 +62,13 @@ endforeach
# ASM option
is_asm_enabled = (get_option('enable_asm') == true and
(host_machine.cpu_family() == 'x86' or
(host_machine.cpu_family() == 'x86_64' and cc.get_define('__ILP32__').strip() == '') or
host_machine.cpu_family() == 'aarch64' or
(host_machine.cpu_family() == 'aarch64' or
host_machine.cpu_family().startswith('arm') or
host_machine.cpu() == 'ppc64le'))
host_machine.cpu() == 'ppc64le' or
host_machine.cpu_family().startswith('riscv') or
host_machine.cpu_family().startswith('loongarch') or
host_machine.cpu_family() == 'x86' or
(host_machine.cpu_family() == 'x86_64' and cc.get_define('__ILP32__').strip() == '')))
cdata.set10('HAVE_ASM', is_asm_enabled)
if is_asm_enabled and get_option('b_sanitize') == 'memory'
@@ -232,7 +234,9 @@ endif
if (host_machine.cpu_family() == 'aarch64' or
host_machine.cpu_family().startswith('arm') or
host_machine.cpu() == 'ppc64le')
host_machine.cpu_family().startswith('loongarch') or
host_machine.cpu() == 'ppc64le' or
host_machine.cpu_family().startswith('riscv'))
if cc.has_function('getauxval', prefix : '#include <sys/auxv.h>', args : test_args)
cdata.set('HAVE_GETAUXVAL', 1)
endif
@@ -379,6 +383,14 @@ endif
cdata.set10('ARCH_PPC64LE', host_machine.cpu() == 'ppc64le')
cdata.set10('ARCH_RISCV', host_machine.cpu_family().startswith('riscv'))
cdata.set10('ARCH_RV32', host_machine.cpu_family() == 'riscv32')
cdata.set10('ARCH_RV64', host_machine.cpu_family() == 'riscv64')
cdata.set10('ARCH_LOONGARCH', host_machine.cpu_family().startswith('loongarch'))
cdata.set10('ARCH_LOONGARCH32', host_machine.cpu_family() == 'loongarch32')
cdata.set10('ARCH_LOONGARCH64', host_machine.cpu_family() == 'loongarch64')
# meson's cc.symbols_have_underscore_prefix() is unfortunately unreliable
# when additional flags like '-fprofile-instr-generate' are passed via CFLAGS
# see following meson issue https://github.com/mesonbuild/meson/issues/5482


@@ -25,6 +25,11 @@ option('enable_tests',
value: true,
description: 'Build dav1d tests')
option('enable_seek_stress',
type: 'boolean',
value: false,
description: 'Build seek_stress test tool')
option('enable_docs',
type: 'boolean',
value: false,


@@ -56,8 +56,12 @@ COLD void dav1d_init_cpu(void) {
// memory sanitizer is inherently incompatible with asm
#if ARCH_AARCH64 || ARCH_ARM
dav1d_cpu_flags = dav1d_get_cpu_flags_arm();
#elif ARCH_LOONGARCH
dav1d_cpu_flags = dav1d_get_cpu_flags_loongarch();
#elif ARCH_PPC64LE
dav1d_cpu_flags = dav1d_get_cpu_flags_ppc();
#elif ARCH_RISCV
dav1d_cpu_flags = dav1d_get_cpu_flags_riscv();
#elif ARCH_X86
dav1d_cpu_flags = dav1d_get_cpu_flags_x86();
#endif


@@ -37,8 +37,12 @@
#if ARCH_AARCH64 || ARCH_ARM
#include "src/arm/cpu.h"
#elif ARCH_LOONGARCH
#include "src/loongarch/cpu.h"
#elif ARCH_PPC64LE
#include "src/ppc/cpu.h"
#elif ARCH_RISCV
#include "src/riscv/cpu.h"
#elif ARCH_X86
#include "src/x86/cpu.h"
#endif
@@ -64,6 +68,10 @@ static ALWAYS_INLINE unsigned dav1d_get_cpu_flags(void) {
#if defined(__VSX__)
flags |= DAV1D_PPC_CPU_FLAG_VSX;
#endif
#elif ARCH_RISCV
#if defined(__riscv_v)
flags |= DAV1D_RISCV_CPU_FLAG_V;
#endif
#elif ARCH_X86
#if defined(__AVX512F__) && defined(__AVX512CD__) && \
defined(__AVX512BW__) && defined(__AVX512DQ__) && \


@@ -183,6 +183,10 @@ static void inv_txfm_add_wht_wht_4x4_c(pixel *dst, const ptrdiff_t stride,
#if HAVE_ASM
#if ARCH_AARCH64 || ARCH_ARM
#include "src/arm/itx.h"
#elif ARCH_LOONGARCH64
#include "src/loongarch/itx.h"
#elif ARCH_RISCV
#include "src/riscv/itx.h"
#elif ARCH_X86
#include "src/x86/itx.h"
#endif
@@ -257,6 +261,12 @@ COLD void bitfn(dav1d_itx_dsp_init)(Dav1dInvTxfmDSPContext *const c, int bpc) {
#if ARCH_AARCH64 || ARCH_ARM
itx_dsp_init_arm(c, bpc);
#endif
#if ARCH_LOONGARCH64
itx_dsp_init_loongarch(c, bpc);
#endif
#if ARCH_RISCV
itx_dsp_init_riscv(c, bpc);
#endif
#if ARCH_X86
itx_dsp_init_x86(c, bpc);
#endif

third_party/dav1d/src/loongarch/cpu.c (new vendored file, 47 lines)

@@ -0,0 +1,47 @@
/*
* Copyright © 2023, VideoLAN and dav1d authors
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
* ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "config.h"
#include "common/attributes.h"
#include "src/loongarch/cpu.h"
#if defined(HAVE_GETAUXVAL)
#include <sys/auxv.h>
#define LA_HWCAP_LSX ( 1 << 4 )
#define LA_HWCAP_LASX ( 1 << 5 )
#endif
COLD unsigned dav1d_get_cpu_flags_loongarch(void) {
unsigned flags = 0;
#if defined(HAVE_GETAUXVAL)
unsigned long hw_cap = getauxval(AT_HWCAP);
flags |= (hw_cap & LA_HWCAP_LSX) ? DAV1D_LOONGARCH_CPU_FLAG_LSX : 0;
flags |= (hw_cap & LA_HWCAP_LASX) ? DAV1D_LOONGARCH_CPU_FLAG_LASX : 0;
#endif
return flags;
}

third_party/dav1d/src/loongarch/cpu.h (new vendored file, 37 lines)

@@ -0,0 +1,37 @@
/*
* Copyright © 2023, VideoLAN and dav1d authors
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
* ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef DAV1D_SRC_LOONGARCH_CPU_H
#define DAV1D_SRC_LOONGARCH_CPU_H
enum CpuFlags {
DAV1D_LOONGARCH_CPU_FLAG_LSX = 1 << 0,
DAV1D_LOONGARCH_CPU_FLAG_LASX = 1 << 1,
};
unsigned dav1d_get_cpu_flags_loongarch(void);
#endif /* DAV1D_SRC_LOONGARCH_CPU_H */

third_party/dav1d/src/loongarch/itx.S (new vendored file, 8104 lines): file diff suppressed because it is too large.

third_party/dav1d/src/loongarch/itx.h (new vendored file, 195 lines)

@@ -0,0 +1,195 @@
/*
* Copyright © 2023, VideoLAN and dav1d authors
* Copyright © 2023, Loongson Technology Corporation Limited
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
* ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef DAV1D_SRC_LOONGARCH_ITX_H
#define DAV1D_SRC_LOONGARCH_ITX_H
#include "src/cpu.h"
#include "src/itx.h"
decl_itx_fn(BF(dav1d_inv_txfm_add_wht_wht_4x4, lsx));
decl_itx_fn(BF(dav1d_inv_txfm_add_dct_dct_4x4, lsx));
decl_itx_fn(BF(dav1d_inv_txfm_add_identity_identity_4x4, lsx));
decl_itx_fn(BF(dav1d_inv_txfm_add_adst_dct_4x4, lsx));
decl_itx_fn(BF(dav1d_inv_txfm_add_dct_adst_4x4, lsx));
decl_itx_fn(BF(dav1d_inv_txfm_add_adst_adst_4x4, lsx));
decl_itx_fn(BF(dav1d_inv_txfm_add_dct_flipadst_4x4, lsx));
decl_itx_fn(BF(dav1d_inv_txfm_add_flipadst_adst_4x4, lsx));
decl_itx_fn(BF(dav1d_inv_txfm_add_adst_flipadst_4x4, lsx));
decl_itx_fn(BF(dav1d_inv_txfm_add_flipadst_dct_4x4, lsx));
decl_itx_fn(BF(dav1d_inv_txfm_add_flipadst_flipadst_4x4, lsx));
decl_itx_fn(BF(dav1d_inv_txfm_add_dct_identity_4x4, lsx));
decl_itx_fn(BF(dav1d_inv_txfm_add_identity_dct_4x4, lsx));
decl_itx_fn(BF(dav1d_inv_txfm_add_flipadst_identity_4x4, lsx));
decl_itx_fn(BF(dav1d_inv_txfm_add_identity_flipadst_4x4, lsx));
decl_itx_fn(BF(dav1d_inv_txfm_add_identity_adst_4x4, lsx));
decl_itx_fn(BF(dav1d_inv_txfm_add_adst_identity_4x4, lsx));
decl_itx_fn(BF(dav1d_inv_txfm_add_dct_dct_4x8, lsx));
decl_itx_fn(BF(dav1d_inv_txfm_add_dct_dct_8x4, lsx));
decl_itx_fn(BF(dav1d_inv_txfm_add_identity_identity_8x4, lsx));
decl_itx_fn(BF(dav1d_inv_txfm_add_adst_dct_8x4, lsx));
decl_itx_fn(BF(dav1d_inv_txfm_add_dct_adst_8x4, lsx));
decl_itx_fn(BF(dav1d_inv_txfm_add_adst_adst_8x4, lsx));
decl_itx_fn(BF(dav1d_inv_txfm_add_flipadst_adst_8x4, lsx));
decl_itx_fn(BF(dav1d_inv_txfm_add_adst_flipadst_8x4, lsx));
decl_itx_fn(BF(dav1d_inv_txfm_add_flipadst_dct_8x4, lsx));
decl_itx_fn(BF(dav1d_inv_txfm_add_dct_flipadst_8x4, lsx));
decl_itx_fn(BF(dav1d_inv_txfm_add_flipadst_flipadst_8x4, lsx));
decl_itx_fn(BF(dav1d_inv_txfm_add_dct_identity_8x4, lsx));
decl_itx_fn(BF(dav1d_inv_txfm_add_identity_dct_8x4, lsx));
decl_itx_fn(BF(dav1d_inv_txfm_add_flipadst_identity_8x4, lsx));
decl_itx_fn(BF(dav1d_inv_txfm_add_identity_flipadst_8x4, lsx));
decl_itx_fn(BF(dav1d_inv_txfm_add_adst_identity_8x4, lsx));
decl_itx_fn(BF(dav1d_inv_txfm_add_identity_adst_8x4, lsx));
decl_itx_fn(BF(dav1d_inv_txfm_add_dct_dct_8x8, lsx));
decl_itx_fn(BF(dav1d_inv_txfm_add_identity_identity_8x8, lsx));
decl_itx_fn(BF(dav1d_inv_txfm_add_adst_dct_8x8, lsx));
decl_itx_fn(BF(dav1d_inv_txfm_add_dct_adst_8x8, lsx));
decl_itx_fn(BF(dav1d_inv_txfm_add_adst_adst_8x8, lsx));
decl_itx_fn(BF(dav1d_inv_txfm_add_flipadst_adst_8x8, lsx));
decl_itx_fn(BF(dav1d_inv_txfm_add_adst_flipadst_8x8, lsx));
decl_itx_fn(BF(dav1d_inv_txfm_add_flipadst_dct_8x8, lsx));
decl_itx_fn(BF(dav1d_inv_txfm_add_dct_flipadst_8x8, lsx));
decl_itx_fn(BF(dav1d_inv_txfm_add_identity_adst_8x8, lsx));
decl_itx_fn(BF(dav1d_inv_txfm_add_flipadst_identity_8x8, lsx));
decl_itx_fn(BF(dav1d_inv_txfm_add_dct_identity_8x8, lsx));
decl_itx_fn(BF(dav1d_inv_txfm_add_identity_dct_8x8, lsx));
decl_itx_fn(BF(dav1d_inv_txfm_add_identity_flipadst_8x8, lsx));
decl_itx_fn(BF(dav1d_inv_txfm_add_adst_identity_8x8, lsx));
decl_itx_fn(BF(dav1d_inv_txfm_add_flipadst_flipadst_8x8, lsx));
decl_itx_fn(BF(dav1d_inv_txfm_add_dct_dct_8x16, lsx));
decl_itx_fn(BF(dav1d_inv_txfm_add_identity_identity_8x16, lsx));
decl_itx_fn(BF(dav1d_inv_txfm_add_adst_dct_8x16, lsx));
decl_itx_fn(BF(dav1d_inv_txfm_add_dct_adst_8x16, lsx));
decl_itx_fn(BF(dav1d_inv_txfm_add_dct_dct_16x8, lsx));
decl_itx_fn(BF(dav1d_inv_txfm_add_adst_dct_16x8, lsx));
decl_itx_fn(BF(dav1d_inv_txfm_add_dct_dct_16x16, lsx));
decl_itx_fn(BF(dav1d_inv_txfm_add_adst_adst_16x16, lsx));
decl_itx_fn(BF(dav1d_inv_txfm_add_adst_dct_16x16, lsx));
decl_itx_fn(BF(dav1d_inv_txfm_add_dct_adst_16x16, lsx));
decl_itx_fn(BF(dav1d_inv_txfm_add_flipadst_dct_16x16, lsx));
decl_itx_fn(BF(dav1d_inv_txfm_add_dct_flipadst_16x16, lsx));
decl_itx_fn(BF(dav1d_inv_txfm_add_dct_dct_8x32, lsx));
decl_itx_fn(BF(dav1d_inv_txfm_add_dct_dct_32x32, lsx));
decl_itx_fn(BF(dav1d_inv_txfm_add_dct_dct_32x32, lsx));
decl_itx_fn(BF(dav1d_inv_txfm_add_dct_dct_64x64, lsx));
static ALWAYS_INLINE void itx_dsp_init_loongarch(Dav1dInvTxfmDSPContext *const c, int bpc) {
#if BITDEPTH == 8
const unsigned flags = dav1d_get_cpu_flags();
if (!(flags & DAV1D_LOONGARCH_CPU_FLAG_LSX)) return;
if (BITDEPTH != 8 ) return;
c->itxfm_add[TX_4X4][WHT_WHT] = dav1d_inv_txfm_add_wht_wht_4x4_8bpc_lsx;
c->itxfm_add[TX_4X4][DCT_DCT] = dav1d_inv_txfm_add_dct_dct_4x4_8bpc_lsx;
c->itxfm_add[TX_4X4][IDTX] = dav1d_inv_txfm_add_identity_identity_4x4_8bpc_lsx;
c->itxfm_add[TX_4X4][DCT_ADST] = dav1d_inv_txfm_add_adst_dct_4x4_8bpc_lsx;
c->itxfm_add[TX_4X4][ADST_DCT] = dav1d_inv_txfm_add_dct_adst_4x4_8bpc_lsx;
c->itxfm_add[TX_4X4][ADST_ADST] = dav1d_inv_txfm_add_adst_adst_4x4_8bpc_lsx;
c->itxfm_add[TX_4X4][FLIPADST_DCT] = dav1d_inv_txfm_add_dct_flipadst_4x4_8bpc_lsx;
c->itxfm_add[TX_4X4][ADST_FLIPADST] = dav1d_inv_txfm_add_flipadst_adst_4x4_8bpc_lsx;
c->itxfm_add[TX_4X4][FLIPADST_ADST] = dav1d_inv_txfm_add_adst_flipadst_4x4_8bpc_lsx;
c->itxfm_add[TX_4X4][DCT_FLIPADST] = dav1d_inv_txfm_add_flipadst_dct_4x4_8bpc_lsx;
c->itxfm_add[TX_4X4][FLIPADST_FLIPADST] = dav1d_inv_txfm_add_flipadst_flipadst_4x4_8bpc_lsx;
c->itxfm_add[TX_4X4][H_DCT] = dav1d_inv_txfm_add_dct_identity_4x4_8bpc_lsx;
c->itxfm_add[TX_4X4][V_DCT] = dav1d_inv_txfm_add_identity_dct_4x4_8bpc_lsx;
c->itxfm_add[TX_4X4][H_FLIPADST] = dav1d_inv_txfm_add_flipadst_identity_4x4_8bpc_lsx;
c->itxfm_add[TX_4X4][V_FLIPADST] = dav1d_inv_txfm_add_identity_flipadst_4x4_8bpc_lsx;
c->itxfm_add[TX_4X4][V_ADST] = dav1d_inv_txfm_add_identity_adst_4x4_8bpc_lsx;
c->itxfm_add[TX_4X4][H_ADST] = dav1d_inv_txfm_add_adst_identity_4x4_8bpc_lsx;
c->itxfm_add[RTX_4X8][DCT_DCT] = dav1d_inv_txfm_add_dct_dct_4x8_8bpc_lsx;
c->itxfm_add[RTX_8X4][DCT_DCT] = dav1d_inv_txfm_add_dct_dct_8x4_8bpc_lsx;
c->itxfm_add[RTX_8X4][IDTX] = dav1d_inv_txfm_add_identity_identity_8x4_8bpc_lsx;
c->itxfm_add[RTX_8X4][DCT_ADST] = dav1d_inv_txfm_add_adst_dct_8x4_8bpc_lsx;
c->itxfm_add[RTX_8X4][ADST_DCT] = dav1d_inv_txfm_add_dct_adst_8x4_8bpc_lsx;
c->itxfm_add[RTX_8X4][ADST_ADST] = dav1d_inv_txfm_add_adst_adst_8x4_8bpc_lsx;
c->itxfm_add[RTX_8X4][ADST_FLIPADST] = dav1d_inv_txfm_add_flipadst_adst_8x4_8bpc_lsx;
c->itxfm_add[RTX_8X4][FLIPADST_ADST] = dav1d_inv_txfm_add_adst_flipadst_8x4_8bpc_lsx;
c->itxfm_add[RTX_8X4][DCT_FLIPADST] = dav1d_inv_txfm_add_flipadst_dct_8x4_8bpc_lsx;
c->itxfm_add[RTX_8X4][FLIPADST_DCT] = dav1d_inv_txfm_add_dct_flipadst_8x4_8bpc_lsx;
c->itxfm_add[RTX_8X4][FLIPADST_FLIPADST] = dav1d_inv_txfm_add_flipadst_flipadst_8x4_8bpc_lsx;
c->itxfm_add[RTX_8X4][H_DCT] = dav1d_inv_txfm_add_dct_identity_8x4_8bpc_lsx;
c->itxfm_add[RTX_8X4][V_DCT] = dav1d_inv_txfm_add_identity_dct_8x4_8bpc_lsx;
c->itxfm_add[RTX_8X4][H_FLIPADST] = dav1d_inv_txfm_add_flipadst_identity_8x4_8bpc_lsx;
c->itxfm_add[RTX_8X4][V_FLIPADST] = dav1d_inv_txfm_add_identity_flipadst_8x4_8bpc_lsx;
c->itxfm_add[RTX_8X4][H_ADST] = dav1d_inv_txfm_add_adst_identity_8x4_8bpc_lsx;
c->itxfm_add[RTX_8X4][V_ADST] = dav1d_inv_txfm_add_identity_adst_8x4_8bpc_lsx;
c->itxfm_add[TX_8X8][DCT_DCT] = dav1d_inv_txfm_add_dct_dct_8x8_8bpc_lsx;
c->itxfm_add[TX_8X8][IDTX] = dav1d_inv_txfm_add_identity_identity_8x8_8bpc_lsx;
c->itxfm_add[TX_8X8][DCT_ADST] = dav1d_inv_txfm_add_adst_dct_8x8_8bpc_lsx;
c->itxfm_add[TX_8X8][ADST_DCT] = dav1d_inv_txfm_add_dct_adst_8x8_8bpc_lsx;
c->itxfm_add[TX_8X8][ADST_ADST] = dav1d_inv_txfm_add_adst_adst_8x8_8bpc_lsx;
c->itxfm_add[TX_8X8][ADST_FLIPADST] = dav1d_inv_txfm_add_flipadst_adst_8x8_8bpc_lsx;
c->itxfm_add[TX_8X8][FLIPADST_ADST] = dav1d_inv_txfm_add_adst_flipadst_8x8_8bpc_lsx;
c->itxfm_add[TX_8X8][DCT_FLIPADST] = dav1d_inv_txfm_add_flipadst_dct_8x8_8bpc_lsx;
c->itxfm_add[TX_8X8][FLIPADST_DCT] = dav1d_inv_txfm_add_dct_flipadst_8x8_8bpc_lsx;
c->itxfm_add[TX_8X8][FLIPADST_FLIPADST] = dav1d_inv_txfm_add_flipadst_flipadst_8x8_8bpc_lsx;
c->itxfm_add[TX_8X8][H_DCT] = dav1d_inv_txfm_add_dct_identity_8x8_8bpc_lsx;
c->itxfm_add[TX_8X8][V_DCT] = dav1d_inv_txfm_add_identity_dct_8x8_8bpc_lsx;
c->itxfm_add[TX_8X8][H_FLIPADST] = dav1d_inv_txfm_add_flipadst_identity_8x8_8bpc_lsx;
c->itxfm_add[TX_8X8][V_FLIPADST] = dav1d_inv_txfm_add_identity_flipadst_8x8_8bpc_lsx;
c->itxfm_add[TX_8X8][H_ADST] = dav1d_inv_txfm_add_adst_identity_8x8_8bpc_lsx;
c->itxfm_add[TX_8X8][V_ADST] = dav1d_inv_txfm_add_identity_adst_8x8_8bpc_lsx;
c->itxfm_add[RTX_8X16][DCT_DCT] = dav1d_inv_txfm_add_dct_dct_8x16_8bpc_lsx;
c->itxfm_add[RTX_8X16][IDTX] = dav1d_inv_txfm_add_identity_identity_8x16_8bpc_lsx;
c->itxfm_add[RTX_8X16][DCT_ADST] = dav1d_inv_txfm_add_adst_dct_8x16_8bpc_lsx;
c->itxfm_add[RTX_8X16][ADST_DCT] = dav1d_inv_txfm_add_dct_adst_8x16_8bpc_lsx;
c->itxfm_add[RTX_16X8][DCT_DCT] = dav1d_inv_txfm_add_dct_dct_16x8_8bpc_lsx;
c->itxfm_add[RTX_16X8][DCT_ADST] = dav1d_inv_txfm_add_adst_dct_16x8_8bpc_lsx;
c->itxfm_add[TX_16X16][DCT_DCT] = dav1d_inv_txfm_add_dct_dct_16x16_8bpc_lsx;
c->itxfm_add[TX_16X16][ADST_ADST] = dav1d_inv_txfm_add_adst_adst_16x16_8bpc_lsx;
c->itxfm_add[TX_16X16][DCT_ADST] = dav1d_inv_txfm_add_adst_dct_16x16_8bpc_lsx;
c->itxfm_add[TX_16X16][ADST_DCT] = dav1d_inv_txfm_add_dct_adst_16x16_8bpc_lsx;
c->itxfm_add[TX_16X16][DCT_FLIPADST] = dav1d_inv_txfm_add_flipadst_dct_16x16_8bpc_lsx;
c->itxfm_add[TX_16X16][FLIPADST_DCT] = dav1d_inv_txfm_add_dct_flipadst_16x16_8bpc_lsx;
c->itxfm_add[RTX_8X32][DCT_DCT] = dav1d_inv_txfm_add_dct_dct_8x32_8bpc_lsx;
c->itxfm_add[TX_32X32][DCT_DCT] = dav1d_inv_txfm_add_dct_dct_32x32_8bpc_lsx;
c->itxfm_add[TX_64X64][DCT_DCT] = dav1d_inv_txfm_add_dct_dct_64x64_8bpc_lsx;
#endif
}
#endif /* DAV1D_SRC_LOONGARCH_ITX_H */


@@ -0,0 +1,776 @@
/*********************************************************************
* Copyright (c) 2022 Loongson Technology Corporation Limited
* Contributed by Gu Xiwei(guxiwei-hf@loongson.cn)
* Shiyou Yin(yinshiyou-hf@loongson.cn)
*
* Permission to use, copy, modify, and/or distribute this software for any
* purpose with or without fee is hereby granted, provided that the above
* copyright notice and this permission notice appear in all copies.
*
* THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
* WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
* MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
* ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
* ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
* OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
*********************************************************************/
/*
* This file is a LoongArch assembly helper file and available under ISC
* license. It provides a large number of macros and aliases to simplify
* writing assembly code, especially for LSX and LASX optimizations.
*
* Anyone can modify it or add new features for his/her own purposes.
* Contributing a patch will be appreciated as it might be useful for
* others as well. Send patches to loongson contributor mentioned above.
*
* MAJOR version: Usage changes, incompatible with previous version.
* MINOR version: Add new macros/functions, or bug fixes.
* MICRO version: Comment changes or implementation changes.
*/
#define LML_VERSION_MAJOR 0
#define LML_VERSION_MINOR 4
#define LML_VERSION_MICRO 0
#define DEFAULT_ALIGN 5
/* Set prefix as needed. */
#ifndef PRIVATE_PREFIX
#define PRIVATE_PREFIX dav1d_
#endif
#define PASTE(a,b) a ## b
#define CONCAT(a,b) PASTE(a,b)
#ifdef PREFIX
#define ASM_PREF CONCAT(_,PRIVATE_PREFIX)
#else
#define ASM_PREF PRIVATE_PREFIX
#endif
.macro function name, align=DEFAULT_ALIGN
.macro endfunc
jirl $r0, $r1, 0x0
.size ASM_PREF\name, . - ASM_PREF\name
.purgem endfunc
.endm
.text ;
.align \align ;
.globl ASM_PREF\name ;
.type ASM_PREF\name, @function ;
ASM_PREF\name: ;
.endm
.macro const name, align=DEFAULT_ALIGN
.macro endconst
.size \name, . - \name
.purgem endconst
.endm
.section .rodata
.align \align
\name:
.endm
/*
*============================================================================
* LoongArch register alias
*============================================================================
*/
#define a0 $a0
#define a1 $a1
#define a2 $a2
#define a3 $a3
#define a4 $a4
#define a5 $a5
#define a6 $a6
#define a7 $a7
#define t0 $t0
#define t1 $t1
#define t2 $t2
#define t3 $t3
#define t4 $t4
#define t5 $t5
#define t6 $t6
#define t7 $t7
#define t8 $t8
#define s0 $s0
#define s1 $s1
#define s2 $s2
#define s3 $s3
#define s4 $s4
#define s5 $s5
#define s6 $s6
#define s7 $s7
#define s8 $s8
#define zero $zero
#define sp $sp
#define ra $ra
#define fa0 $fa0
#define fa1 $fa1
#define fa2 $fa2
#define fa3 $fa3
#define fa4 $fa4
#define fa5 $fa5
#define fa6 $fa6
#define fa7 $fa7
#define ft0 $ft0
#define ft1 $ft1
#define ft2 $ft2
#define ft3 $ft3
#define ft4 $ft4
#define ft5 $ft5
#define ft6 $ft6
#define ft7 $ft7
#define ft8 $ft8
#define ft9 $ft9
#define ft10 $ft10
#define ft11 $ft11
#define ft12 $ft12
#define ft13 $ft13
#define ft14 $ft14
#define ft15 $ft15
#define fs0 $fs0
#define fs1 $fs1
#define fs2 $fs2
#define fs3 $fs3
#define fs4 $fs4
#define fs5 $fs5
#define fs6 $fs6
#define fs7 $fs7
#define f0 $f0
#define f1 $f1
#define f2 $f2
#define f3 $f3
#define f4 $f4
#define f5 $f5
#define f6 $f6
#define f7 $f7
#define f8 $f8
#define f9 $f9
#define f10 $f10
#define f11 $f11
#define f12 $f12
#define f13 $f13
#define f14 $f14
#define f15 $f15
#define f16 $f16
#define f17 $f17
#define f18 $f18
#define f19 $f19
#define f20 $f20
#define f21 $f21
#define f22 $f22
#define f23 $f23
#define f24 $f24
#define f25 $f25
#define f26 $f26
#define f27 $f27
#define f28 $f28
#define f29 $f29
#define f30 $f30
#define f31 $f31
#define vr0 $vr0
#define vr1 $vr1
#define vr2 $vr2
#define vr3 $vr3
#define vr4 $vr4
#define vr5 $vr5
#define vr6 $vr6
#define vr7 $vr7
#define vr8 $vr8
#define vr9 $vr9
#define vr10 $vr10
#define vr11 $vr11
#define vr12 $vr12
#define vr13 $vr13
#define vr14 $vr14
#define vr15 $vr15
#define vr16 $vr16
#define vr17 $vr17
#define vr18 $vr18
#define vr19 $vr19
#define vr20 $vr20
#define vr21 $vr21
#define vr22 $vr22
#define vr23 $vr23
#define vr24 $vr24
#define vr25 $vr25
#define vr26 $vr26
#define vr27 $vr27
#define vr28 $vr28
#define vr29 $vr29
#define vr30 $vr30
#define vr31 $vr31
#define xr0 $xr0
#define xr1 $xr1
#define xr2 $xr2
#define xr3 $xr3
#define xr4 $xr4
#define xr5 $xr5
#define xr6 $xr6
#define xr7 $xr7
#define xr8 $xr8
#define xr9 $xr9
#define xr10 $xr10
#define xr11 $xr11
#define xr12 $xr12
#define xr13 $xr13
#define xr14 $xr14
#define xr15 $xr15
#define xr16 $xr16
#define xr17 $xr17
#define xr18 $xr18
#define xr19 $xr19
#define xr20 $xr20
#define xr21 $xr21
#define xr22 $xr22
#define xr23 $xr23
#define xr24 $xr24
#define xr25 $xr25
#define xr26 $xr26
#define xr27 $xr27
#define xr28 $xr28
#define xr29 $xr29
#define xr30 $xr30
#define xr31 $xr31
/*
*============================================================================
* LSX/LASX synthesize instructions
*============================================================================
*/
/*
* Description : Dot product of byte vector elements
* Arguments : Inputs - vj, vk
* Outputs - vd
* Return Type - halfword
*/
.macro vdp2.h.bu vd, vj, vk
vmulwev.h.bu \vd, \vj, \vk
vmaddwod.h.bu \vd, \vj, \vk
.endm
.macro vdp2.h.bu.b vd, vj, vk
vmulwev.h.bu.b \vd, \vj, \vk
vmaddwod.h.bu.b \vd, \vj, \vk
.endm
.macro vdp2.w.h vd, vj, vk
vmulwev.w.h \vd, \vj, \vk
vmaddwod.w.h \vd, \vj, \vk
.endm
.macro xvdp2.h.bu xd, xj, xk
xvmulwev.h.bu \xd, \xj, \xk
xvmaddwod.h.bu \xd, \xj, \xk
.endm
.macro xvdp2.h.bu.b xd, xj, xk
xvmulwev.h.bu.b \xd, \xj, \xk
xvmaddwod.h.bu.b \xd, \xj, \xk
.endm
.macro xvdp2.w.h xd, xj, xk
xvmulwev.w.h \xd, \xj, \xk
xvmaddwod.w.h \xd, \xj, \xk
.endm
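The even/odd widening pairs above are what make these "dot product" macros work: the vmulwev.* op multiplies the even-indexed lanes into the widened destination, and the vmaddwod.* op accumulates the odd-indexed products on top. As a reading aid, here is a minimal scalar C model of vdp2.h.bu under that interpretation (the model and its names are illustrative and not part of the dav1d sources):

#include <stdint.h>
#include <stdio.h>

/* Scalar model of vdp2.h.bu: each 16-bit output lane i is the dot product of
 * the two unsigned bytes that overlap it, i.e. a[2i]*b[2i] + a[2i+1]*b[2i+1],
 * wrapping modulo 2^16 as the widened vector lanes would. */
static void vdp2_h_bu_model(uint16_t d[8], const uint8_t a[16], const uint8_t b[16]) {
    for (int i = 0; i < 8; i++)
        d[i] = (uint16_t)(a[2 * i] * b[2 * i] + a[2 * i + 1] * b[2 * i + 1]);
}

int main(void) {
    uint8_t a[16], b[16];
    uint16_t d[8];
    for (int i = 0; i < 16; i++) { a[i] = (uint8_t)i; b[i] = 2; }
    vdp2_h_bu_model(d, a, b);
    for (int i = 0; i < 8; i++)
        printf("%u ", d[i]); /* prints: 2 10 18 26 34 42 50 58 */
    printf("\n");
    return 0;
}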
/*
* Description : Dot product & addition of halfword vector elements
* Arguments : Inputs - vj, vk
* Outputs - vd
* Return Type - twice size of input
*/
.macro vdp2add.h.bu vd, vj, vk
vmaddwev.h.bu \vd, \vj, \vk
vmaddwod.h.bu \vd, \vj, \vk
.endm
.macro vdp2add.h.bu.b vd, vj, vk
vmaddwev.h.bu.b \vd, \vj, \vk
vmaddwod.h.bu.b \vd, \vj, \vk
.endm
.macro vdp2add.w.h vd, vj, vk
vmaddwev.w.h \vd, \vj, \vk
vmaddwod.w.h \vd, \vj, \vk
.endm
.macro xvdp2add.h.bu.b xd, xj, xk
xvmaddwev.h.bu.b \xd, \xj, \xk
xvmaddwod.h.bu.b \xd, \xj, \xk
.endm
.macro xvdp2add.w.h xd, xj, xk
xvmaddwev.w.h \xd, \xj, \xk
xvmaddwod.w.h \xd, \xj, \xk
.endm
/*
* Description : Clip each element vj[i] into the range vk[i] ~ va[i]
* clip: tmp = vj > vk ? vj : vk; vd = tmp < va ? tmp : va
*/
.macro vclip.h vd, vj, vk, va
vmax.h \vd, \vj, \vk
vmin.h \vd, \vd, \va
.endm
.macro vclip.w vd, vj, vk, va
vmax.w \vd, \vj, \vk
vmin.w \vd, \vd, \va
.endm
.macro xvclip.h xd, xj, xk, xa
xvmax.h \xd, \xj, \xk
xvmin.h \xd, \xd, \xa
.endm
.macro xvclip.w xd, xj, xk, xa
xvmax.w \xd, \xj, \xk
xvmin.w \xd, \xd, \xa
.endm
/*
* Description : Range element vj[i] to 0 ~ 255
* clip255: vj < 255 ? vj : 255 && vj > 0 ? vj : 0
*/
.macro vclip255.h vd, vj
vmaxi.h \vd, \vj, 0
vsat.hu \vd, \vd, 7
.endm
.macro vclip255.w vd, vj
vmaxi.w \vd, \vj, 0
vsat.wu \vd, \vd, 7
.endm
.macro xvclip255.h xd, xj
xvmaxi.h \xd, \xj, 0
xvsat.hu \xd, \xd, 7
.endm
.macro xvclip255.w xd, xj
xvmaxi.w \xd, \xj, 0
xvsat.wu \xd, \xd, 7
.endm
/*
* Description : Store elements of vector
* vd : Data vector to be stored
* rk : Address of data storage
* ra : Offset of address
* si : Index of data in vd
*/
.macro vstelmx.b vd, rk, ra, si
add.d \rk, \rk, \ra
vstelm.b \vd, \rk, 0, \si
.endm
.macro vstelmx.h vd, rk, ra, si
add.d \rk, \rk, \ra
vstelm.h \vd, \rk, 0, \si
.endm
.macro vstelmx.w vd, rk, ra, si
add.d \rk, \rk, \ra
vstelm.w \vd, \rk, 0, \si
.endm
.macro vstelmx.d vd, rk, ra, si
add.d \rk, \rk, \ra
vstelm.d \vd, \rk, 0, \si
.endm
.macro vmov xd, xj
vor.v \xd, \xj, \xj
.endm
.macro xmov xd, xj
xvor.v \xd, \xj, \xj
.endm
.macro xvstelmx.d xd, rk, ra, si
add.d \rk, \rk, \ra
xvstelm.d \xd, \rk, 0, \si
.endm
/*
*============================================================================
* LSX/LASX custom macros
*============================================================================
*/
/*
* Load 4 float, double, V128, v256 elements with stride.
*/
.macro FLDS_LOADX_4 src, stride, stride2, stride3, out0, out1, out2, out3
fld.s \out0, \src, 0
fldx.s \out1, \src, \stride
fldx.s \out2, \src, \stride2
fldx.s \out3, \src, \stride3
.endm
.macro FLDD_LOADX_4 src, stride, stride2, stride3, out0, out1, out2, out3
fld.d \out0, \src, 0
fldx.d \out1, \src, \stride
fldx.d \out2, \src, \stride2
fldx.d \out3, \src, \stride3
.endm
.macro LSX_LOADX_4 src, stride, stride2, stride3, out0, out1, out2, out3
vld \out0, \src, 0
vldx \out1, \src, \stride
vldx \out2, \src, \stride2
vldx \out3, \src, \stride3
.endm
.macro LASX_LOADX_4 src, stride, stride2, stride3, out0, out1, out2, out3
xvld \out0, \src, 0
xvldx \out1, \src, \stride
xvldx \out2, \src, \stride2
xvldx \out3, \src, \stride3
.endm
/*
* Description : Transpose 4x4 block with half-word elements in vectors
* Arguments : Inputs - in0, in1, in2, in3
* Outputs - out0, out1, out2, out3
*/
.macro LSX_TRANSPOSE4x4_H in0, in1, in2, in3, out0, out1, out2, out3, \
tmp0, tmp1
vilvl.h \tmp0, \in1, \in0
vilvl.h \tmp1, \in3, \in2
vilvl.w \out0, \tmp1, \tmp0
vilvh.w \out2, \tmp1, \tmp0
vilvh.d \out1, \out0, \out0
vilvh.d \out3, \out0, \out2
.endm
/*
* Description : Transpose 4x4 block with word elements in vectors
* Arguments : Inputs - in0, in1, in2, in3
* Outputs - out0, out1, out2, out3
* Details :
* Example :
* 1, 2, 3, 4 1, 5, 9,13
* 5, 6, 7, 8 to 2, 6,10,14
* 9,10,11,12 =====> 3, 7,11,15
* 13,14,15,16 4, 8,12,16
*/
.macro LSX_TRANSPOSE4x4_W in0, in1, in2, in3, out0, out1, out2, out3, \
tmp0, tmp1
vilvl.w \tmp0, \in1, \in0
vilvh.w \out1, \in1, \in0
vilvl.w \tmp1, \in3, \in2
vilvh.w \out3, \in3, \in2
vilvl.d \out0, \tmp1, \tmp0
vilvl.d \out2, \out3, \out1
vilvh.d \out3, \out3, \out1
vilvh.d \out1, \tmp1, \tmp0
.endm
/*
* Description : Transpose 8x8 block with half-word elements in vectors
* Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7
* Outputs - out0, out1, out2, out3, out4, out5, out6, out7
*/
.macro LSX_TRANSPOSE8x8_H in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \
out2, out3, out4, out5, out6, out7, tmp0, tmp1, tmp2, \
tmp3, tmp4, tmp5, tmp6, tmp7
vilvl.h \tmp0, \in6, \in4
vilvl.h \tmp1, \in7, \in5
vilvl.h \tmp2, \in2, \in0
vilvl.h \tmp3, \in3, \in1
vilvl.h \tmp4, \tmp1, \tmp0
vilvh.h \tmp5, \tmp1, \tmp0
vilvl.h \tmp6, \tmp3, \tmp2
vilvh.h \tmp7, \tmp3, \tmp2
vilvh.h \tmp0, \in6, \in4
vilvh.h \tmp1, \in7, \in5
vilvh.h \tmp2, \in2, \in0
vilvh.h \tmp3, \in3, \in1
vpickev.d \out0, \tmp4, \tmp6
vpickod.d \out1, \tmp4, \tmp6
vpickev.d \out2, \tmp5, \tmp7
vpickod.d \out3, \tmp5, \tmp7
vilvl.h \tmp4, \tmp1, \tmp0
vilvh.h \tmp5, \tmp1, \tmp0
vilvl.h \tmp6, \tmp3, \tmp2
vilvh.h \tmp7, \tmp3, \tmp2
vpickev.d \out4, \tmp4, \tmp6
vpickod.d \out5, \tmp4, \tmp6
vpickev.d \out6, \tmp5, \tmp7
vpickod.d \out7, \tmp5, \tmp7
.endm
/*
* Description : Transpose 16x8 block with byte elements in vectors
* Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7
* Outputs - out0, out1, out2, out3, out4, out5, out6, out7
*/
.macro LASX_TRANSPOSE16X8_B in0, in1, in2, in3, in4, in5, in6, in7, \
in8, in9, in10, in11, in12, in13, in14, in15, \
out0, out1, out2, out3, out4, out5, out6, out7,\
tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7
xvilvl.b \tmp0, \in2, \in0
xvilvl.b \tmp1, \in3, \in1
xvilvl.b \tmp2, \in6, \in4
xvilvl.b \tmp3, \in7, \in5
xvilvl.b \tmp4, \in10, \in8
xvilvl.b \tmp5, \in11, \in9
xvilvl.b \tmp6, \in14, \in12
xvilvl.b \tmp7, \in15, \in13
xvilvl.b \out0, \tmp1, \tmp0
xvilvh.b \out1, \tmp1, \tmp0
xvilvl.b \out2, \tmp3, \tmp2
xvilvh.b \out3, \tmp3, \tmp2
xvilvl.b \out4, \tmp5, \tmp4
xvilvh.b \out5, \tmp5, \tmp4
xvilvl.b \out6, \tmp7, \tmp6
xvilvh.b \out7, \tmp7, \tmp6
xvilvl.w \tmp0, \out2, \out0
xvilvh.w \tmp2, \out2, \out0
xvilvl.w \tmp4, \out3, \out1
xvilvh.w \tmp6, \out3, \out1
xvilvl.w \tmp1, \out6, \out4
xvilvh.w \tmp3, \out6, \out4
xvilvl.w \tmp5, \out7, \out5
xvilvh.w \tmp7, \out7, \out5
xvilvl.d \out0, \tmp1, \tmp0
xvilvh.d \out1, \tmp1, \tmp0
xvilvl.d \out2, \tmp3, \tmp2
xvilvh.d \out3, \tmp3, \tmp2
xvilvl.d \out4, \tmp5, \tmp4
xvilvh.d \out5, \tmp5, \tmp4
xvilvl.d \out6, \tmp7, \tmp6
xvilvh.d \out7, \tmp7, \tmp6
.endm
/*
* Description : Transpose 4x4 block with half-word elements in vectors
* Arguments : Inputs - in0, in1, in2, in3
* Outputs - out0, out1, out2, out3
*/
.macro LASX_TRANSPOSE4x4_H in0, in1, in2, in3, out0, out1, out2, out3, \
tmp0, tmp1
xvilvl.h \tmp0, \in1, \in0
xvilvl.h \tmp1, \in3, \in2
xvilvl.w \out0, \tmp1, \tmp0
xvilvh.w \out2, \tmp1, \tmp0
xvilvh.d \out1, \out0, \out0
xvilvh.d \out3, \out0, \out2
.endm
/*
* Description : Transpose 4x8 block with half-word elements in vectors
* Arguments : Inputs - in0, in1, in2, in3
* Outputs - out0, out1, out2, out3
*/
.macro LASX_TRANSPOSE4x8_H in0, in1, in2, in3, out0, out1, out2, out3, \
tmp0, tmp1
xvilvl.h \tmp0, \in2, \in0
xvilvl.h \tmp1, \in3, \in1
xvilvl.h \out2, \tmp1, \tmp0
xvilvh.h \out3, \tmp1, \tmp0
xvilvl.d \out0, \out2, \out2
xvilvh.d \out1, \out2, \out2
xvilvl.d \out2, \out3, \out3
xvilvh.d \out3, \out3, \out3
.endm
/*
* Description : Transpose 8x8 block with half-word elements in vectors
* Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7
* Outputs - out0, out1, out2, out3, out4, out5, out6, out7
*/
.macro LASX_TRANSPOSE8x8_H in0, in1, in2, in3, in4, in5, in6, in7, \
out0, out1, out2, out3, out4, out5, out6, out7, \
tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7
xvilvl.h \tmp0, \in6, \in4
xvilvl.h \tmp1, \in7, \in5
xvilvl.h \tmp2, \in2, \in0
xvilvl.h \tmp3, \in3, \in1
xvilvl.h \tmp4, \tmp1, \tmp0
xvilvh.h \tmp5, \tmp1, \tmp0
xvilvl.h \tmp6, \tmp3, \tmp2
xvilvh.h \tmp7, \tmp3, \tmp2
xvilvh.h \tmp0, \in6, \in4
xvilvh.h \tmp1, \in7, \in5
xvilvh.h \tmp2, \in2, \in0
xvilvh.h \tmp3, \in3, \in1
xvpickev.d \out0, \tmp4, \tmp6
xvpickod.d \out1, \tmp4, \tmp6
xvpickev.d \out2, \tmp5, \tmp7
xvpickod.d \out3, \tmp5, \tmp7
xvilvl.h \tmp4, \tmp1, \tmp0
xvilvh.h \tmp5, \tmp1, \tmp0
xvilvl.h \tmp6, \tmp3, \tmp2
xvilvh.h \tmp7, \tmp3, \tmp2
xvpickev.d \out4, \tmp4, \tmp6
xvpickod.d \out5, \tmp4, \tmp6
xvpickev.d \out6, \tmp5, \tmp7
xvpickod.d \out7, \tmp5, \tmp7
.endm
/*
* Description : Transpose 2x4x4 block with half-word elements in vectors
* Arguments : Inputs - in0, in1, in2, in3
* Outputs - out0, out1, out2, out3
*/
.macro LASX_TRANSPOSE2x4x4_H in0, in1, in2, in3, out0, out1, out2, out3, \
tmp0, tmp1, tmp2
xvilvh.h \tmp1, \in0, \in1
xvilvl.h \out1, \in0, \in1
xvilvh.h \tmp0, \in2, \in3
xvilvl.h \out3, \in2, \in3
xvilvh.w \tmp2, \out3, \out1
xvilvl.w \out3, \out3, \out1
xvilvl.w \out2, \tmp0, \tmp1
xvilvh.w \tmp1, \tmp0, \tmp1
xvilvh.d \out0, \out2, \out3
xvilvl.d \out2, \out2, \out3
xvilvh.d \out1, \tmp1, \tmp2
xvilvl.d \out3, \tmp1, \tmp2
.endm
/*
* Description : Transpose 4x4 block with word elements in vectors
* Arguments : Inputs - in0, in1, in2, in3
* Outputs - out0, out1, out2, out3
* Details :
* Example :
* 1, 2, 3, 4, 1, 2, 3, 4 1,5, 9,13, 1,5, 9,13
* 5, 6, 7, 8, 5, 6, 7, 8 to 2,6,10,14, 2,6,10,14
* 9,10,11,12, 9,10,11,12 =====> 3,7,11,15, 3,7,11,15
* 13,14,15,16, 13,14,15,16 4,8,12,16, 4,8,12,16
*/
.macro LASX_TRANSPOSE4x4_W in0, in1, in2, in3, out0, out1, out2, out3, \
tmp0, tmp1
xvilvl.w \tmp0, \in1, \in0
xvilvh.w \out1, \in1, \in0
xvilvl.w \tmp1, \in3, \in2
xvilvh.w \out3, \in3, \in2
xvilvl.d \out0, \tmp1, \tmp0
xvilvl.d \out2, \out3, \out1
xvilvh.d \out3, \out3, \out1
xvilvh.d \out1, \tmp1, \tmp0
.endm
/*
* Description : Transpose 8x8 block with word elements in vectors
* Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7
* Outputs - out0, out1, out2, out3, out4, out5, out6,
* out7
* Example : LASX_TRANSPOSE8x8_W
* in0 : 1,2,3,4,5,6,7,8
* in1 : 2,2,3,4,5,6,7,8
* in2 : 3,2,3,4,5,6,7,8
* in3 : 4,2,3,4,5,6,7,8
* in4 : 5,2,3,4,5,6,7,8
* in5 : 6,2,3,4,5,6,7,8
* in6 : 7,2,3,4,5,6,7,8
* in7 : 8,2,3,4,5,6,7,8
*
* out0 : 1,2,3,4,5,6,7,8
* out1 : 2,2,2,2,2,2,2,2
* out2 : 3,3,3,3,3,3,3,3
* out3 : 4,4,4,4,4,4,4,4
* out4 : 5,5,5,5,5,5,5,5
* out5 : 6,6,6,6,6,6,6,6
* out6 : 7,7,7,7,7,7,7,7
* out7 : 8,8,8,8,8,8,8,8
*/
.macro LASX_TRANSPOSE8x8_W in0, in1, in2, in3, in4, in5, in6, in7,\
out0, out1, out2, out3, out4, out5, out6, out7,\
tmp0, tmp1, tmp2, tmp3
xvilvl.w \tmp0, \in2, \in0
xvilvl.w \tmp1, \in3, \in1
xvilvh.w \tmp2, \in2, \in0
xvilvh.w \tmp3, \in3, \in1
xvilvl.w \out0, \tmp1, \tmp0
xvilvh.w \out1, \tmp1, \tmp0
xvilvl.w \out2, \tmp3, \tmp2
xvilvh.w \out3, \tmp3, \tmp2
xvilvl.w \tmp0, \in6, \in4
xvilvl.w \tmp1, \in7, \in5
xvilvh.w \tmp2, \in6, \in4
xvilvh.w \tmp3, \in7, \in5
xvilvl.w \out4, \tmp1, \tmp0
xvilvh.w \out5, \tmp1, \tmp0
xvilvl.w \out6, \tmp3, \tmp2
xvilvh.w \out7, \tmp3, \tmp2
xmov \tmp0, \out0
xmov \tmp1, \out1
xmov \tmp2, \out2
xmov \tmp3, \out3
xvpermi.q \out0, \out4, 0x02
xvpermi.q \out1, \out5, 0x02
xvpermi.q \out2, \out6, 0x02
xvpermi.q \out3, \out7, 0x02
xvpermi.q \out4, \tmp0, 0x31
xvpermi.q \out5, \tmp1, 0x31
xvpermi.q \out6, \tmp2, 0x31
xvpermi.q \out7, \tmp3, 0x31
.endm
/*
* Description : Transpose 4x4 block with double-word elements in vectors
* Arguments : Inputs - in0, in1, in2, in3
* Outputs - out0, out1, out2, out3
* Example : LASX_TRANSPOSE4x4_D
* in0 : 1,2,3,4
* in1 : 1,2,3,4
* in2 : 1,2,3,4
* in3 : 1,2,3,4
*
* out0 : 1,1,1,1
* out1 : 2,2,2,2
* out2 : 3,3,3,3
* out3 : 4,4,4,4
*/
.macro LASX_TRANSPOSE4x4_D in0, in1, in2, in3, out0, out1, out2, out3, \
tmp0, tmp1
xvilvl.d \tmp0, \in1, \in0
xvilvh.d \out1, \in1, \in0
xvilvh.d \tmp1, \in3, \in2
xvilvl.d \out2, \in3, \in2
xvor.v \out0, \tmp0, \tmp0
xvor.v \out3, \tmp1, \tmp1
xvpermi.q \out0, \out2, 0x02
xvpermi.q \out2, \tmp0, 0x31
xvpermi.q \out3, \out1, 0x31
xvpermi.q \out1, \tmp1, 0x02
.endm

File diff suppressed because it is too large.


@@ -0,0 +1,52 @@
/*
* Copyright © 2023, VideoLAN and dav1d authors
* Copyright © 2023, Loongson Technology Corporation Limited
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
* ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef DAV1D_SRC_LOONGARCH_LOOPFILTER_H
#define DAV1D_SRC_LOONGARCH_LOOPFILTER_H
#include "src/cpu.h"
#include "src/loopfilter.h"
decl_loopfilter_sb_fn(BF(dav1d_lpf_h_sb_y, lsx));
decl_loopfilter_sb_fn(BF(dav1d_lpf_v_sb_y, lsx));
decl_loopfilter_sb_fn(BF(dav1d_lpf_h_sb_uv, lsx));
decl_loopfilter_sb_fn(BF(dav1d_lpf_v_sb_uv, lsx));
static ALWAYS_INLINE void loop_filter_dsp_init_loongarch(Dav1dLoopFilterDSPContext *const c) {
const unsigned flags = dav1d_get_cpu_flags();
if (!(flags & DAV1D_LOONGARCH_CPU_FLAG_LSX)) return;
#if BITDEPTH == 8
c->loop_filter_sb[0][0] = BF(dav1d_lpf_h_sb_y, lsx);
c->loop_filter_sb[0][1] = BF(dav1d_lpf_v_sb_y, lsx);
c->loop_filter_sb[1][0] = BF(dav1d_lpf_h_sb_uv, lsx);
c->loop_filter_sb[1][1] = BF(dav1d_lpf_v_sb_uv, lsx);
#endif
}
#endif /* DAV1D_SRC_LOONGARCH_LOOPFILTER_H */

File diff suppressed because it is too large.


@@ -0,0 +1,78 @@
/*
* Copyright © 2023, VideoLAN and dav1d authors
* Copyright © 2023, Loongson Technology Corporation Limited
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
* ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef DAV1D_SRC_LOONGARCH_LOOPRESTORATION_H
#define DAV1D_SRC_LOONGARCH_LOOPRESTORATION_H
#include "common/intops.h"
#include "src/cpu.h"
#include "src/looprestoration.h"
void dav1d_wiener_filter_lsx(uint8_t *p, const ptrdiff_t stride,
const uint8_t (*const left)[4],
const uint8_t *lpf,
const int w, const int h,
const LooprestorationParams *const params,
const enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX);
void dav1d_sgr_filter_3x3_lsx(pixel *p, const ptrdiff_t p_stride,
const pixel (*const left)[4],
const pixel *lpf,
const int w, const int h,
const LooprestorationParams *const params,
const enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX);
void dav1d_sgr_filter_5x5_lsx(pixel *p, const ptrdiff_t p_stride,
const pixel (*const left)[4],
const pixel *lpf,
const int w, const int h,
const LooprestorationParams *const params,
const enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX);
void dav1d_sgr_filter_mix_lsx(pixel *p, const ptrdiff_t p_stride,
const pixel (*const left)[4],
const pixel *lpf,
const int w, const int h,
const LooprestorationParams *const params,
const enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX);
static ALWAYS_INLINE void loop_restoration_dsp_init_loongarch(Dav1dLoopRestorationDSPContext *const c, int bpc)
{
const unsigned flags = dav1d_get_cpu_flags();
if (!(flags & DAV1D_LOONGARCH_CPU_FLAG_LSX)) return;
#if BITDEPTH == 8
c->wiener[0] = c->wiener[1] = dav1d_wiener_filter_lsx;
c->sgr[0] = dav1d_sgr_filter_5x5_lsx;
c->sgr[1] = dav1d_sgr_filter_3x3_lsx;
c->sgr[2] = dav1d_sgr_filter_mix_lsx;
#endif
}
#endif /* DAV1D_SRC_LOONGARCH_LOOPRESTORATION_H */


@@ -0,0 +1,274 @@
/*
* Copyright © 2023, VideoLAN and dav1d authors
* Copyright © 2023, Loongson Technology Corporation Limited
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
* ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "src/loongarch/looprestoration.h"
#if BITDEPTH == 8
#define REST_UNIT_STRIDE (400)
void BF(dav1d_wiener_filter_h, lsx)(int32_t *hor_ptr,
uint8_t *tmp_ptr,
const int16_t filterh[8],
const int w, const int h);
void BF(dav1d_wiener_filter_v, lsx)(uint8_t *p,
const ptrdiff_t p_stride,
const int32_t *hor,
const int16_t filterv[8],
const int w, const int h);
// This function refers to the function in the ppc/looprestoration_init_tmpl.c.
static inline void padding(uint8_t *dst, const uint8_t *p,
const ptrdiff_t stride, const uint8_t (*left)[4],
const uint8_t *lpf, int unit_w, const int stripe_h,
const enum LrEdgeFlags edges)
{
const int have_left = !!(edges & LR_HAVE_LEFT);
const int have_right = !!(edges & LR_HAVE_RIGHT);
// Copy more pixels if we don't have to pad them
unit_w += 3 * have_left + 3 * have_right;
uint8_t *dst_l = dst + 3 * !have_left;
p -= 3 * have_left;
lpf -= 3 * have_left;
if (edges & LR_HAVE_TOP) {
// Copy previous loop filtered rows
const uint8_t *const above_1 = lpf;
const uint8_t *const above_2 = above_1 + PXSTRIDE(stride);
pixel_copy(dst_l, above_1, unit_w);
pixel_copy(dst_l + REST_UNIT_STRIDE, above_1, unit_w);
pixel_copy(dst_l + 2 * REST_UNIT_STRIDE, above_2, unit_w);
} else {
// Pad with first row
pixel_copy(dst_l, p, unit_w);
pixel_copy(dst_l + REST_UNIT_STRIDE, p, unit_w);
pixel_copy(dst_l + 2 * REST_UNIT_STRIDE, p, unit_w);
if (have_left) {
pixel_copy(dst_l, &left[0][1], 3);
pixel_copy(dst_l + REST_UNIT_STRIDE, &left[0][1], 3);
pixel_copy(dst_l + 2 * REST_UNIT_STRIDE, &left[0][1], 3);
}
}
uint8_t *dst_tl = dst_l + 3 * REST_UNIT_STRIDE;
if (edges & LR_HAVE_BOTTOM) {
// Copy next loop filtered rows
const uint8_t *const below_1 = lpf + 6 * PXSTRIDE(stride);
const uint8_t *const below_2 = below_1 + PXSTRIDE(stride);
pixel_copy(dst_tl + stripe_h * REST_UNIT_STRIDE, below_1, unit_w);
pixel_copy(dst_tl + (stripe_h + 1) * REST_UNIT_STRIDE, below_2, unit_w);
pixel_copy(dst_tl + (stripe_h + 2) * REST_UNIT_STRIDE, below_2, unit_w);
} else {
// Pad with last row
const uint8_t *const src = p + (stripe_h - 1) * PXSTRIDE(stride);
pixel_copy(dst_tl + stripe_h * REST_UNIT_STRIDE, src, unit_w);
pixel_copy(dst_tl + (stripe_h + 1) * REST_UNIT_STRIDE, src, unit_w);
pixel_copy(dst_tl + (stripe_h + 2) * REST_UNIT_STRIDE, src, unit_w);
if (have_left) {
pixel_copy(dst_tl + stripe_h * REST_UNIT_STRIDE, &left[stripe_h - 1][1], 3);
pixel_copy(dst_tl + (stripe_h + 1) * REST_UNIT_STRIDE, &left[stripe_h - 1][1], 3);
pixel_copy(dst_tl + (stripe_h + 2) * REST_UNIT_STRIDE, &left[stripe_h - 1][1], 3);
}
}
// Inner UNIT_WxSTRIPE_H
for (int j = 0; j < stripe_h; j++) {
pixel_copy(dst_tl + 3 * have_left, p + 3 * have_left, unit_w - 3 * have_left);
dst_tl += REST_UNIT_STRIDE;
p += PXSTRIDE(stride);
}
if (!have_right) {
uint8_t *pad = dst_l + unit_w;
uint8_t *row_last = &dst_l[unit_w - 1];
// Pad 3x(STRIPE_H+6) with last column
for (int j = 0; j < stripe_h + 6; j++) {
pixel_set(pad, *row_last, 3);
pad += REST_UNIT_STRIDE;
row_last += REST_UNIT_STRIDE;
}
}
if (!have_left) {
// Pad 3x(STRIPE_H+6) with first column
for (int j = 0; j < stripe_h + 6; j++) {
pixel_set(dst, *dst_l, 3);
dst += REST_UNIT_STRIDE;
dst_l += REST_UNIT_STRIDE;
}
} else {
dst += 3 * REST_UNIT_STRIDE;
for (int j = 0; j < stripe_h; j++) {
pixel_copy(dst, &left[j][1], 3);
dst += REST_UNIT_STRIDE;
}
}
}
// This function refers to the function in the ppc/looprestoration_init_tmpl.c.
// FIXME Could split into luma and chroma specific functions,
// (since first and last tops are always 0 for chroma)
// FIXME Could implement a version that requires less temporary memory
// (should be possible to implement with only 6 rows of temp storage)
void dav1d_wiener_filter_lsx(uint8_t *p, const ptrdiff_t p_stride,
const uint8_t (*const left)[4],
const uint8_t *lpf,
const int w, const int h,
const LooprestorationParams *const params,
const enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX)
{
const int16_t (*const filter)[8] = params->filter;
// Wiener filtering is applied to a maximum stripe height of 64 + 3 pixels
// of padding above and below
ALIGN_STK_16(uint8_t, tmp, 70 /*(64 + 3 + 3)*/ * REST_UNIT_STRIDE,);
padding(tmp, p, p_stride, left, lpf, w, h, edges);
ALIGN_STK_16(int32_t, hor, 70 /*(64 + 3 + 3)*/ * REST_UNIT_STRIDE + 64,);
BF(dav1d_wiener_filter_h, lsx)(hor, tmp, filter[0], w, h + 6);
BF(dav1d_wiener_filter_v, lsx)(p, p_stride, hor, filter[1], w, h);
}
void BF(dav1d_boxsum3_h, lsx)(int32_t *sumsq, int16_t *sum, pixel *src,
const int w, const int h);
void BF(dav1d_boxsum3_v, lsx)(int32_t *sumsq, int16_t *sum,
const int w, const int h);
void BF(dav1d_boxsum3_sgf_h, lsx)(int32_t *sumsq, int16_t *sum,
const int w, const int h, const int w1);
void BF(dav1d_boxsum3_sgf_v, lsx)(int16_t *dst, uint8_t *tmp,
int32_t *sumsq, int16_t *sum,
const int w, const int h);
void BF(dav1d_sgr_3x3_finish, lsx)(pixel *p, const ptrdiff_t p_stride,
int16_t *dst, int w1,
const int w, const int h);
static inline void boxsum3_lsx(int32_t *sumsq, coef *sum, pixel *src,
const int w, const int h)
{
BF(dav1d_boxsum3_h, lsx)(sumsq, sum, src, w + 6, h + 6);
BF(dav1d_boxsum3_v, lsx)(sumsq, sum, w + 6, h + 6);
}
void dav1d_sgr_filter_3x3_lsx(pixel *p, const ptrdiff_t p_stride,
const pixel (*const left)[4],
const pixel *lpf,
const int w, const int h,
const LooprestorationParams *const params,
const enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX)
{
ALIGN_STK_16(uint8_t, tmp, 70 /*(64 + 3 + 3)*/ * REST_UNIT_STRIDE,);
padding(tmp, p, p_stride, left, lpf, w, h, edges);
coef dst[64 * 384];
ALIGN_STK_16(int32_t, sumsq, 68 * REST_UNIT_STRIDE + 8, );
ALIGN_STK_16(int16_t, sum, 68 * REST_UNIT_STRIDE + 16, );
boxsum3_lsx(sumsq, sum, tmp, w, h);
BF(dav1d_boxsum3_sgf_h, lsx)(sumsq, sum, w, h, params->sgr.s1);
BF(dav1d_boxsum3_sgf_v, lsx)(dst, tmp, sumsq, sum, w, h);
BF(dav1d_sgr_3x3_finish, lsx)(p, p_stride, dst, params->sgr.w1, w, h);
}
void BF(dav1d_boxsum5_h, lsx)(int32_t *sumsq, int16_t *sum,
const uint8_t *const src,
const int w, const int h);
void BF(dav1d_boxsum5_v, lsx)(int32_t *sumsq, int16_t *sum,
const int w, const int h);
void BF(dav1d_boxsum5_sgf_h, lsx)(int32_t *sumsq, int16_t *sum,
const int w, const int h,
const unsigned s);
void BF(dav1d_boxsum5_sgf_v, lsx)(int16_t *dst, uint8_t *src,
int32_t *sumsq, int16_t *sum,
const int w, const int h);
void BF(dav1d_sgr_mix_finish, lsx)(uint8_t *p, const ptrdiff_t stride,
const int16_t *dst0, const int16_t *dst1,
const int w0, const int w1,
const int w, const int h);
static inline void boxsum5_lsx(int32_t *sumsq, coef *sum, pixel *src,
const int w, const int h)
{
BF(dav1d_boxsum5_h, lsx)(sumsq, sum, src, w + 6, h + 6);
BF(dav1d_boxsum5_v, lsx)(sumsq, sum, w + 6, h + 6);
}
void dav1d_sgr_filter_5x5_lsx(pixel *p, const ptrdiff_t p_stride,
const pixel (*const left)[4],
const pixel *lpf,
const int w, const int h,
const LooprestorationParams *const params,
const enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX)
{
ALIGN_STK_16(uint8_t, tmp, 70 /*(64 + 3 + 3)*/ * REST_UNIT_STRIDE,);
padding(tmp, p, p_stride, left, lpf, w, h, edges);
coef dst[64 * 384];
ALIGN_STK_16(int32_t, sumsq, 68 * REST_UNIT_STRIDE + 8, );
ALIGN_STK_16(int16_t, sum, 68 * REST_UNIT_STRIDE + 16, );
boxsum5_lsx(sumsq, sum, tmp, w, h);
BF(dav1d_boxsum5_sgf_h, lsx)(sumsq, sum, w, h, params->sgr.s0);
BF(dav1d_boxsum5_sgf_v, lsx)(dst, tmp, sumsq, sum, w, h);
BF(dav1d_sgr_3x3_finish, lsx)(p, p_stride, dst, params->sgr.w0, w, h);
}
void dav1d_sgr_filter_mix_lsx(pixel *p, const ptrdiff_t p_stride,
const pixel (*const left)[4],
const pixel *lpf,
const int w, const int h,
const LooprestorationParams *const params,
const enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX)
{
ALIGN_STK_16(uint8_t, tmp, 70 /*(64 + 3 + 3)*/ * REST_UNIT_STRIDE,);
padding(tmp, p, p_stride, left, lpf, w, h, edges);
coef dst0[64 * 384];
coef dst1[64 * 384];
ALIGN_STK_16(int32_t, sumsq0, 68 * REST_UNIT_STRIDE + 8, );
ALIGN_STK_16(int16_t, sum0, 68 * REST_UNIT_STRIDE + 16, );
boxsum5_lsx(sumsq0, sum0, tmp, w, h);
BF(dav1d_boxsum5_sgf_h, lsx)(sumsq0, sum0, w, h, params->sgr.s0);
BF(dav1d_boxsum5_sgf_v, lsx)(dst0, tmp, sumsq0, sum0, w, h);
boxsum3_lsx(sumsq0, sum0, tmp, w, h);
BF(dav1d_boxsum3_sgf_h, lsx)(sumsq0, sum0, w, h, params->sgr.s1);
BF(dav1d_boxsum3_sgf_v, lsx)(dst1, tmp, sumsq0, sum0, w, h);
BF(dav1d_sgr_mix_finish, lsx)(p, p_stride, dst0, dst1, params->sgr.w0,
params->sgr.w1, w, h);
}
#endif

third_party/dav1d/src/loongarch/mc.S (new vendored file, 4758 lines): file diff suppressed because it is too large.

third_party/dav1d/src/loongarch/mc.h (new vendored file, 118 lines)

@@ -0,0 +1,118 @@
/*
* Copyright © 2023, VideoLAN and dav1d authors
* Copyright © 2023, Loongson Technology Corporation Limited
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
* ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef DAV1D_SRC_LOONGARCH_MC_H
#define DAV1D_SRC_LOONGARCH_MC_H
#include "config.h"
#include "src/mc.h"
#include "src/cpu.h"
#define init_mc_fn(type, name, suffix) \
c->mc[type] = BF(dav1d_put_##name, suffix)
#define init_mct_fn(type, name, suffix) \
c->mct[type] = BF(dav1d_prep_##name, suffix)
decl_avg_fn(BF(dav1d_avg, lsx));
decl_w_avg_fn(BF(dav1d_w_avg, lsx));
decl_mask_fn(BF(dav1d_mask, lsx));
decl_warp8x8_fn(BF(dav1d_warp_affine_8x8, lsx));
decl_warp8x8t_fn(BF(dav1d_warp_affine_8x8t, lsx));
decl_w_mask_fn(BF(dav1d_w_mask_420, lsx));
decl_mc_fn(BF(dav1d_put_8tap_regular, lsx));
decl_mc_fn(BF(dav1d_put_8tap_regular_smooth, lsx));
decl_mc_fn(BF(dav1d_put_8tap_regular_sharp, lsx));
decl_mc_fn(BF(dav1d_put_8tap_smooth, lsx));
decl_mc_fn(BF(dav1d_put_8tap_smooth_regular, lsx));
decl_mc_fn(BF(dav1d_put_8tap_smooth_sharp, lsx));
decl_mc_fn(BF(dav1d_put_8tap_sharp, lsx));
decl_mc_fn(BF(dav1d_put_8tap_sharp_regular, lsx));
decl_mc_fn(BF(dav1d_put_8tap_sharp_smooth, lsx));
decl_avg_fn(BF(dav1d_avg, lasx));
decl_w_avg_fn(BF(dav1d_w_avg, lasx));
decl_mask_fn(BF(dav1d_mask, lasx));
decl_warp8x8_fn(BF(dav1d_warp_affine_8x8, lasx));
decl_warp8x8t_fn(BF(dav1d_warp_affine_8x8t, lasx));
decl_w_mask_fn(BF(dav1d_w_mask_420, lasx));
decl_mct_fn(BF(dav1d_prep_8tap_regular, lasx));
decl_mct_fn(BF(dav1d_prep_8tap_regular_smooth, lasx));
decl_mct_fn(BF(dav1d_prep_8tap_regular_sharp, lasx));
decl_mct_fn(BF(dav1d_prep_8tap_smooth, lasx));
decl_mct_fn(BF(dav1d_prep_8tap_smooth_regular, lasx));
decl_mct_fn(BF(dav1d_prep_8tap_smooth_sharp, lasx));
decl_mct_fn(BF(dav1d_prep_8tap_sharp, lasx));
decl_mct_fn(BF(dav1d_prep_8tap_sharp_regular, lasx));
decl_mct_fn(BF(dav1d_prep_8tap_sharp_smooth, lasx));
static ALWAYS_INLINE void mc_dsp_init_loongarch(Dav1dMCDSPContext *const c) {
#if BITDEPTH == 8
const unsigned flags = dav1d_get_cpu_flags();
if (!(flags & DAV1D_LOONGARCH_CPU_FLAG_LSX)) return;
c->avg = BF(dav1d_avg, lsx);
c->w_avg = BF(dav1d_w_avg, lsx);
c->mask = BF(dav1d_mask, lsx);
c->warp8x8 = BF(dav1d_warp_affine_8x8, lsx);
c->warp8x8t = BF(dav1d_warp_affine_8x8t, lsx);
c->w_mask[2] = BF(dav1d_w_mask_420, lsx);
init_mc_fn(FILTER_2D_8TAP_REGULAR, 8tap_regular, lsx);
init_mc_fn(FILTER_2D_8TAP_REGULAR_SMOOTH, 8tap_regular_smooth, lsx);
init_mc_fn(FILTER_2D_8TAP_REGULAR_SHARP, 8tap_regular_sharp, lsx);
init_mc_fn(FILTER_2D_8TAP_SMOOTH_REGULAR, 8tap_smooth_regular, lsx);
init_mc_fn(FILTER_2D_8TAP_SMOOTH, 8tap_smooth, lsx);
init_mc_fn(FILTER_2D_8TAP_SMOOTH_SHARP, 8tap_smooth_sharp, lsx);
init_mc_fn(FILTER_2D_8TAP_SHARP_REGULAR, 8tap_sharp_regular, lsx);
init_mc_fn(FILTER_2D_8TAP_SHARP_SMOOTH, 8tap_sharp_smooth, lsx);
init_mc_fn(FILTER_2D_8TAP_SHARP, 8tap_sharp, lsx);
if (!(flags & DAV1D_LOONGARCH_CPU_FLAG_LASX)) return;
c->avg = BF(dav1d_avg, lasx);
c->w_avg = BF(dav1d_w_avg, lasx);
c->mask = BF(dav1d_mask, lasx);
c->warp8x8 = BF(dav1d_warp_affine_8x8, lasx);
c->warp8x8t = BF(dav1d_warp_affine_8x8t, lasx);
c->w_mask[2] = BF(dav1d_w_mask_420, lasx);
init_mct_fn(FILTER_2D_8TAP_REGULAR, 8tap_regular, lasx);
init_mct_fn(FILTER_2D_8TAP_REGULAR_SMOOTH, 8tap_regular_smooth, lasx);
init_mct_fn(FILTER_2D_8TAP_REGULAR_SHARP, 8tap_regular_sharp, lasx);
init_mct_fn(FILTER_2D_8TAP_SMOOTH_REGULAR, 8tap_smooth_regular, lasx);
init_mct_fn(FILTER_2D_8TAP_SMOOTH, 8tap_smooth, lasx);
init_mct_fn(FILTER_2D_8TAP_SMOOTH_SHARP, 8tap_smooth_sharp, lasx);
init_mct_fn(FILTER_2D_8TAP_SHARP_REGULAR, 8tap_sharp_regular, lasx);
init_mct_fn(FILTER_2D_8TAP_SHARP_SMOOTH, 8tap_sharp_smooth, lasx);
init_mct_fn(FILTER_2D_8TAP_SHARP, 8tap_sharp, lasx);
#endif
}
#endif /* DAV1D_SRC_LOONGARCH_MC_H */

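For context, the new mc_dsp_init_loongarch() above follows dav1d's usual DSP-init convention: the generic C pointers are installed first by the caller, LSX variants overwrite them only when the LSX CPU flag is set, and LASX variants then override a subset of those. The following standalone C sketch illustrates that ordering only; all names in it are hypothetical stand-ins, not dav1d API.

#include <stdio.h>

/* Hypothetical stand-ins for the CPU flags and DSP context. */
enum { CPU_FLAG_LSX = 1 << 0, CPU_FLAG_LASX = 1 << 1 };
typedef struct { void (*avg)(void); } ExampleDSPContext;

static void avg_c(void)    { puts("generic C avg"); }
static void avg_lsx(void)  { puts("LSX avg"); }
static void avg_lasx(void) { puts("LASX avg"); }

/* Mirrors the structure of the init hook above: stronger instruction
 * sets override earlier assignments, and the function returns early
 * when a required flag is missing. */
static void example_dsp_init(ExampleDSPContext *const c, const unsigned flags) {
    c->avg = avg_c;                  /* C fallback installed first */
    if (!(flags & CPU_FLAG_LSX)) return;
    c->avg = avg_lsx;                /* 128-bit SIMD */
    if (!(flags & CPU_FLAG_LASX)) return;
    c->avg = avg_lasx;               /* 256-bit SIMD */
}

int main(void) {
    ExampleDSPContext c;
    example_dsp_init(&c, CPU_FLAG_LSX);
    c.avg();                         /* prints "LSX avg" */
    return 0;
}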
third_party/dav1d/src/loongarch/msac.S vendored Normal file
View File

@ -0,0 +1,368 @@
/*
* Copyright © 2023, VideoLAN and dav1d authors
* Copyright © 2023, Loongson Technology Corporation Limited
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
* ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "loongson_asm.S"
const min_prob
.short 60, 56, 52, 48, 44, 40, 36, 32, 28, 24, 20, 16, 12, 8, 4, 0
endconst
.macro decode_symbol_adapt w
addi.d sp, sp, -48
addi.d a4, a0, 24
vldrepl.h vr0, a4, 0 //rng
fst.s f0, sp, 0 //val==0
vld vr1, a1, 0 //cdf
.if \w == 16
li.w t4, 16
vldx vr11, a1, t4
.endif
addi.d a6, a0, 16
vldrepl.d vr2, a6, 0 //dif
addi.d t0, a0, 32
ld.w t1, t0, 0 //allow_update_cdf
la.local t2, min_prob
addi.d t2, t2, 32
addi.w t3, a2, 1
slli.w t3, t3, 1
sub.d t2, t2, t3
vld vr3, t2, 0 //min_prob
.if \w == 16
vldx vr13, t2, t4
.endif
vsrli.h vr4, vr0, 8 //r = s->rng >> 8
vslli.h vr4, vr4, 8 //r << 8
vsrli.h vr5, vr1, 6
vslli.h vr5, vr5, 7
.if \w == 16
vsrli.h vr15, vr11, 6
vslli.h vr15, vr15, 7
.endif
vmuh.hu vr5, vr4, vr5
vadd.h vr5, vr5, vr3 //v
.if \w == 16
vmuh.hu vr15, vr4, vr15
vadd.h vr15, vr15, vr13
.endif
addi.d t8, sp, 4
vst vr5, t8, 0 //store v
.if \w == 16
vstx vr15, t8, t4
.endif
vreplvei.h vr20, vr2, 3 //c
vssub.hu vr6, vr5, vr20 //c >=v
vseqi.h vr6, vr6, 0
.if \w == 16
vssub.hu vr16, vr15, vr20 //c >=v
vseqi.h vr16, vr16, 0
vpickev.b vr21, vr16, vr6
.endif
.if \w <= 8
vmskltz.h vr10, vr6
.else
vmskltz.b vr10, vr21
.endif
beqz t1, .renorm\()\w
// update_cdf
alsl.d t1, a2, a1, 1
ld.h t2, t1, 0 //count
srli.w t3, t2, 4 //count >> 4
addi.w t3, t3, 4
li.w t5, 2
sltu t5, t5, a2
add.w t3, t3, t5 //rate
sltui t5, t2, 32
add.w t2, t2, t5 //count + (count < 32)
vreplgr2vr.h vr9, t3
vseq.h vr7, vr7, vr7
vavgr.hu vr5, vr6, vr7 //i >= val ? -1 : 32768
vsub.h vr5, vr5, vr1
vsub.h vr8, vr1, vr6
.if \w == 16
vavgr.hu vr15, vr16, vr7
vsub.h vr15, vr15, vr11
vsub.h vr18, vr11, vr16
.endif
vsra.h vr5, vr5, vr9
vadd.h vr8, vr8, vr5
.if \w == 4
fst.d f8, a1, 0
.else
vst vr8, a1, 0
.endif
.if \w == 16
vsra.h vr15, vr15, vr9
vadd.h vr18, vr18, vr15
vstx vr18, a1, t4
.endif
st.h t2, t1, 0
.renorm\()\w:
vpickve2gr.h t3, vr10, 0
ctz.w a7, t3 // ret
alsl.d t3, a7, t8, 1
ld.hu t4, t3, 0 // v
addi.d t3, t3, -2
ld.hu t5, t3, 0 // u
sub.w t5, t5, t4 // rng
slli.d t4, t4, 48
vpickve2gr.d t6, vr2, 0
sub.d t6, t6, t4 // dif
addi.d t6, t6, 1
clz.w t4, t5 // d
xori t4, t4, 16 // d
sll.d t6, t6, t4
addi.d t6, t6, -1 // dif
addi.d a5, a0, 28 // cnt
ld.w t7, a5, 0
sub.w t7, t7, t4 // cnt-d
sll.w t5, t5, t4
st.w t5, a4, 0 // store rng
bge t7, zero, 9f
// refill
ld.d t0, a0, 0 // buf_pos
addi.d t1, a0, 8
ld.d t1, t1, 0 // buf_end
addi.d t2, t0, 8
blt t1, t2, 1f
ld.d t0, t0, 0 // next_bits
addi.w t3, t7, 23 // shift_bits = cnt + 23
addi.w t7, t7, 16 // cnt += 16
revb.d t0, t0 // next_bits = bswap(next_bits)
srli.w t4, t3, 3
sub.d t2, t2, t4 // buf_pos -= shift_bits >> 3
st.d t2, a0, 0
andi t3, t3, 24 // shift_bits &= 24
srl.d t0, t0, t3 // next_bits >>= shift_bits
sub.w t3, t3, t7 // shift_bits -= 16 + cnt
sll.d t0, t0, t3 // next_bits <<= shift_bits
li.w t5, 48
sub.w t7, t5, t3 // cnt = cnt + 64 - shift_bits
xor t6, t6, t0 // dif ^= next_bits
b 9f
1:
li.w t4, 40
sub.w t5, t4, t7 // c = 40 - cnt
2:
bge t0, t1, 3f
ld.bu t2, t0, 0
addi.d t0, t0, 1
sll.d t2, t2, t5
xor t6, t6, t2
addi.w t5, t5, -8
bge t5, zero, 2b
// refill_eob_end
3:
st.d t0, a0, 0 // s->buf_pos = buf_pos
sub.w t7, t4, t5 // cnt = 40 - c
9:
st.w t7, a5, 0 // store cnt
st.d t6, a6, 0 // store dif
move a0, a7
addi.d sp, sp, 48
.endm
function msac_decode_symbol_adapt4_lsx
decode_symbol_adapt 4
endfunc
function msac_decode_symbol_adapt8_lsx
decode_symbol_adapt 8
endfunc
function msac_decode_symbol_adapt16_lsx
decode_symbol_adapt 16
endfunc
function msac_decode_bool_lsx
ld.w t0, a0, 24 // rng
srli.w a1, a1, 6
ld.d t1, a0, 16 // dif
srli.w t2, t0, 8 // r >> 8
mul.w t2, t2, a1
ld.w a5, a0, 28 // cnt
addi.d t1, t1, 1 // dif + 1
srli.w t2, t2, 1
addi.w t2, t2, 4 // v
slli.d t3, t2, 48 // vw
sltu t4, t1, t3
move t8, t4 // ret
xori t4, t4, 1
maskeqz t6, t3, t4 // if (ret) vw
sub.d t6, t1, t6 // dif
slli.w t5, t2, 1
sub.w t5, t0, t5 // r - 2v
maskeqz t7, t5, t4 // if (ret) r - 2v
add.w t5, t2, t7 // v(rng)
// renorm
clz.w t4, t5 // d
xori t4, t4, 16 // d
sll.d t6, t6, t4
addi.d t6, t6, -1 // dif
sub.w t7, a5, t4 // cnt-d
sll.w t5, t5, t4
st.w t5, a0, 24 // store rng
bge t7, zero, 9f
// refill
ld.d t0, a0, 0 // buf_pos
addi.d t1, a0, 8
ld.d t1, t1, 0 // buf_end
addi.d t2, t0, 8
blt t1, t2, 1f
ld.d t0, t0, 0 // next_bits
addi.w t3, t7, 23 // shift_bits = cnt + 23
addi.w t7, t7, 16 // cnt += 16
revb.d t0, t0 // next_bits = bswap(next_bits)
srli.w t4, t3, 3
sub.d t2, t2, t4 // buf_pos -= shift_bits >> 3
st.d t2, a0, 0
andi t3, t3, 24 // shift_bits &= 24
srl.d t0, t0, t3 // next_bits >>= shift_bits
sub.w t3, t3, t7 // shift_bits -= 16 + cnt
sll.d t0, t0, t3 // next_bits <<= shift_bits
li.w t5, 48
sub.w t7, t5, t3 // cnt = cnt + 64 - shift_bits
xor t6, t6, t0 // dif ^= next_bits
b 9f
1:
li.w t4, 40
sub.w t5, t4, t7 // c = 40 - cnt
2:
bge t0, t1, 3f
ld.bu t2, t0, 0
addi.d t0, t0, 1
sll.d t2, t2, t5
xor t6, t6, t2
addi.w t5, t5, -8
bge t5, zero, 2b
// refill_eob_end
3:
st.d t0, a0, 0 // s->buf_pos = buf_pos
sub.w t7, t4, t5 // cnt = 40 - c
9:
st.w t7, a0, 28 // store cnt
st.d t6, a0, 16 // store dif
move a0, t8
endfunc
function msac_decode_bool_adapt_lsx
ld.hu a3, a1, 0 // cdf[0] /f
ld.w t0, a0, 24 // rng
ld.d t1, a0, 16 // dif
srli.w t2, t0, 8 // r >> 8
srli.w a7, a3, 6
mul.w t2, t2, a7
ld.w a4, a0, 32 // allow_update_cdf
ld.w a5, a0, 28 // cnt
srli.w t2, t2, 1
addi.w t2, t2, 4 // v
slli.d t3, t2, 48 // vw
sltu t4, t1, t3
move t8, t4 // bit
xori t4, t4, 1
maskeqz t6, t3, t4 // if (ret) vw
sub.d t6, t1, t6 // dif
slli.w t5, t2, 1
sub.w t5, t0, t5 // r - 2v
maskeqz t7, t5, t4 // if (ret) r - 2v
add.w t5, t2, t7 // v(rng)
beqz a4, .renorm
// update_cdf
ld.hu t0, a1, 2 // cdf[1]
srli.w t1, t0, 4
addi.w t1, t1, 4 // rate
sltui t2, t0, 32 // count < 32
add.w t0, t0, t2 // count + (count < 32)
sub.w a3, a3, t8 // cdf[0] -= bit
slli.w t4, t8, 15
sub.w t7, a3, t4 // cdf[0] - bit - 32768
sra.w t7, t7, t1 // (cdf[0] - bit - 32768) >> rate
sub.w t7, a3, t7 // cdf[0]
st.h t7, a1, 0
st.h t0, a1, 2
.renorm:
// renorm
addi.d t6, t6, 1
clz.w t4, t5 // d
xori t4, t4, 16 // d
sll.d t6, t6, t4
addi.d t6, t6, -1 // dif
sub.w t7, a5, t4 // cnt-d
sll.w t5, t5, t4
st.w t5, a0, 24 // store rng
bge t7, zero, 9f
// refill
ld.d t0, a0, 0 // buf_pos
addi.d t1, a0, 8
ld.d t1, t1, 0 // buf_end
addi.d t2, t0, 8
blt t1, t2, 1f
ld.d t0, t0, 0 // next_bits
addi.w t3, t7, 23 // shift_bits = cnt + 23
addi.w t7, t7, 16 // cnt += 16
revb.d t0, t0 // next_bits = bswap(next_bits)
srli.w t4, t3, 3
sub.d t2, t2, t4 // buf_pos -= shift_bits >> 3
st.d t2, a0, 0
andi t3, t3, 24 // shift_bits &= 24
srl.d t0, t0, t3 // next_bits >>= shift_bits
sub.w t3, t3, t7 // shift_bits -= 16 + cnt
sll.d t0, t0, t3 // next_bits <<= shift_bits
li.w t5, 48
sub.w t7, t5, t3 // cnt = cnt + 64 - shift_bits
xor t6, t6, t0 // dif ^= next_bits
b 9f
1:
li.w t4, 40
sub.w t5, t4, t7 // c = 40 - cnt
2:
bge t0, t1, 3f
ld.bu t2, t0, 0
addi.d t0, t0, 1
sll.d t2, t2, t5
xor t6, t6, t2
addi.w t5, t5, -8
bge t5, zero, 2b
// refill_eob_end
3:
st.d t0, a0, 0 // s->buf_pos = buf_pos
sub.w t7, t4, t5 // cnt = 40 - c
9:
st.w t7, a0, 28 // store cnt
st.d t6, a0, 16 // store dif
move a0, t8
endfunc

third_party/dav1d/src/loongarch/msac.h vendored Normal file
View File

@ -0,0 +1,46 @@
/*
* Copyright © 2023, VideoLAN and dav1d authors
* Copyright © 2023, Loongson Technology Corporation Limited
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
* ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef DAV1D_SRC_LOONGARCH_MSAC_H
#define DAV1D_SRC_LOONGARCH_MSAC_H
unsigned dav1d_msac_decode_symbol_adapt4_lsx(MsacContext *s, uint16_t *cdf,
size_t n_symbols);
unsigned dav1d_msac_decode_symbol_adapt8_lsx(MsacContext *s, uint16_t *cdf,
size_t n_symbols);
unsigned dav1d_msac_decode_symbol_adapt16_lsx(MsacContext *s, uint16_t *cdf,
size_t n_symbols);
unsigned dav1d_msac_decode_bool_adapt_lsx(MsacContext *s, uint16_t *cdf);
unsigned dav1d_msac_decode_bool_lsx(MsacContext *s, unsigned f);
#define dav1d_msac_decode_symbol_adapt4 dav1d_msac_decode_symbol_adapt4_lsx
#define dav1d_msac_decode_symbol_adapt8 dav1d_msac_decode_symbol_adapt8_lsx
#define dav1d_msac_decode_symbol_adapt16 dav1d_msac_decode_symbol_adapt16_lsx
#define dav1d_msac_decode_bool_adapt dav1d_msac_decode_bool_adapt_lsx
#define dav1d_msac_decode_bool dav1d_msac_decode_bool_lsx
#endif /* DAV1D_SRC_LOONGARCH_MSAC_H */

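The #define lines above are how the assembly entry points get wired in: the architecture header is pulled in from the generic src/msac.h (see the msac.h hunk later in this commit), and the macros redirect the existing dav1d_msac_decode_* call sites to the _lsx symbols at compile time. A minimal sketch of the same macro-redirection technique, using hypothetical names rather than the real msac functions:

#include <stdio.h>

/* Generic fallback. */
static int decode_bit_c(void) { return 0; }

/* "Optimized" variant, standing in for an assembly implementation. */
static int decode_bit_fast(void) { return 1; }

/* In dav1d the equivalent #define lives in a per-arch header that the
 * generic header includes; unqualified call sites then resolve to the
 * optimized symbol without being edited. */
#define decode_bit decode_bit_fast

int main(void) {
    printf("generic=%d optimized=%d\n", decode_bit_c(), decode_bit());
    return 0;
}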
third_party/dav1d/src/loongarch/refmvs.S vendored Normal file
View File

@ -0,0 +1,152 @@
/*
* Copyright © 2023, VideoLAN and dav1d authors
* Copyright © 2023, Loongson Technology Corporation Limited
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
* ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "src/loongarch/loongson_asm.S"
/*
static void splat_mv_c(refmvs_block **rr, const refmvs_block *const rmv,
const int bx4, const int bw4, int bh4)
*/
function splat_mv_lsx
vld vr0, a1, 0 // 0 1 ... 11 ...
clz.w t4, a3
vaddi.bu vr1, vr0, 0
addi.w t4, t4, -26
vextrins.w vr1, vr0, 0x30 // 0 1 2 ... 11 0 1 2 3
la.local t5, .SPLAT_LSX_JRTABLE
vbsrl.v vr2, vr1, 4 // 4 5 6 7...11 0 1 2 3 0 0 0 0
alsl.d t6, t4, t5, 1
vextrins.w vr2, vr0, 0x31 // 4 5 6 7...11 0 1 2 3 4 5 6 7
ld.h t7, t6, 0
vbsrl.v vr3, vr2, 4 // 8 9 10 11 0 1 2 3 4 5 6 7 0 0 0 0
add.d t8, t5, t7
alsl.d a2, a2, a2, 1
vextrins.w vr3, vr0, 0x32 // 8 9 10 11 0 1 2 3 4 5 6 7 8 9 10 11
slli.w a2, a2, 2
jirl $r0, t8, 0
.SPLAT_LSX_JRTABLE:
.hword .SPLAT_W32_LSX - .SPLAT_LSX_JRTABLE
.hword .SPLAT_W16_LSX - .SPLAT_LSX_JRTABLE
.hword .SPLAT_W8_LSX - .SPLAT_LSX_JRTABLE
.hword .SPLAT_W4_LSX - .SPLAT_LSX_JRTABLE
.hword .SPLAT_W2_LSX - .SPLAT_LSX_JRTABLE
.hword .SPLAT_W1_LSX - .SPLAT_LSX_JRTABLE
.SPLAT_W1_LSX:
ld.d t3, a0, 0
addi.d a0, a0, 8
addi.d a4, a4, -1
add.d t3, t3, a2
fst.d f1, t3, 0
fst.s f3, t3, 8
blt zero, a4, .SPLAT_W1_LSX
b .splat_end
.SPLAT_W2_LSX:
ld.d t3, a0, 0
addi.d a0, a0, 8
addi.d a4, a4, -1
add.d t3, t3, a2
vst vr1, t3, 0
fst.d f2, t3, 16
blt zero, a4, .SPLAT_W2_LSX
b .splat_end
.SPLAT_W4_LSX:
ld.d t3, a0, 0
addi.d a0, a0, 8
addi.d a4, a4, -1
add.d t3, t3, a2
vst vr1, t3, 0
vst vr2, t3, 16
vst vr3, t3, 32
blt zero, a4, .SPLAT_W4_LSX
b .splat_end
.SPLAT_W8_LSX:
ld.d t3, a0, 0
addi.d a0, a0, 8
addi.d a4, a4, -1
add.d t3, t3, a2
vst vr1, t3, 0
vst vr2, t3, 16
vst vr3, t3, 32
vst vr1, t3, 48
vst vr2, t3, 64
vst vr3, t3, 80
blt zero, a4, .SPLAT_W8_LSX
b .splat_end
.SPLAT_W16_LSX:
ld.d t3, a0, 0
addi.d a0, a0, 8
addi.d a4, a4, -1
add.d t3, t3, a2
.rept 2
vst vr1, t3, 0
vst vr2, t3, 16
vst vr3, t3, 32
vst vr1, t3, 48
vst vr2, t3, 64
vst vr3, t3, 80
addi.d t3, t3, 96
.endr
blt zero, a4, .SPLAT_W16_LSX
b .splat_end
.SPLAT_W32_LSX:
ld.d t3, a0, 0
addi.d a0, a0, 8
addi.d a4, a4, -1
add.d t3, t3, a2
.rept 4
vst vr1, t3, 0
vst vr2, t3, 16
vst vr3, t3, 32
vst vr1, t3, 48
vst vr2, t3, 64
vst vr3, t3, 80
addi.d t3, t3, 96
.endr
blt zero, a4, .SPLAT_W32_LSX
.splat_end:
endfunc

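The width dispatch in splat_mv_lsx above turns bw4 (a power of two in 1..32, arriving in a3) into a jump-table index with a count-leading-zeros: clz(bw4) - 26 gives 0 for width 32 down to 5 for width 1, matching the order of .SPLAT_LSX_JRTABLE. A small C model of that index computation, using GCC/Clang's __builtin_clz as a stand-in for clz.w:

#include <stdio.h>

int main(void) {
    /* Table entries in the same order as .SPLAT_LSX_JRTABLE. */
    static const char *const labels[] = {
        "SPLAT_W32", "SPLAT_W16", "SPLAT_W8", "SPLAT_W4", "SPLAT_W2", "SPLAT_W1",
    };
    for (int bw4 = 1; bw4 <= 32; bw4 *= 2) {
        const int idx = __builtin_clz((unsigned)bw4) - 26; /* 32 -> 0, 1 -> 5 */
        printf("bw4=%2d -> table index %d (%s)\n", bw4, idx, labels[idx]);
    }
    return 0;
}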
third_party/dav1d/src/loongarch/refmvs.h vendored Normal file
View File

@ -0,0 +1,44 @@
/*
* Copyright © 2023, VideoLAN and dav1d authors
* Copyright © 2023, Loongson Technology Corporation Limited
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
* ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef DAV1D_SRC_LOONGARCH_REFMVS_H
#define DAV1D_SRC_LOONGARCH_REFMVS_H
#include "src/cpu.h"
#include "src/refmvs.h"
decl_splat_mv_fn(dav1d_splat_mv_lsx);
static ALWAYS_INLINE void refmvs_dsp_init_loongarch(Dav1dRefmvsDSPContext *const c) {
const unsigned flags = dav1d_get_cpu_flags();
if (!(flags & DAV1D_LOONGARCH_CPU_FLAG_LSX)) return;
c->splat_mv = dav1d_splat_mv_lsx;
}
#endif /* DAV1D_SRC_LOONGARCH_REFMVS_H */

third_party/dav1d/src/loopfilter_tmpl.c
View File

@ -247,6 +247,8 @@ static void loop_filter_v_sb128uv_c(pixel *dst, const ptrdiff_t stride,
#if HAVE_ASM
#if ARCH_AARCH64 || ARCH_ARM
#include "src/arm/loopfilter.h"
#elif ARCH_LOONGARCH64
#include "src/loongarch/loopfilter.h"
#elif ARCH_X86
#include "src/x86/loopfilter.h"
#endif
@ -261,6 +263,8 @@ COLD void bitfn(dav1d_loop_filter_dsp_init)(Dav1dLoopFilterDSPContext *const c)
#if HAVE_ASM
#if ARCH_AARCH64 || ARCH_ARM
loop_filter_dsp_init_arm(c);
#elif ARCH_LOONGARCH64
loop_filter_dsp_init_loongarch(c);
#elif ARCH_X86
loop_filter_dsp_init_x86(c);
#endif

third_party/dav1d/src/looprestoration_tmpl.c
View File

@ -527,6 +527,8 @@ static void sgr_mix_c(pixel *p, const ptrdiff_t stride,
#if HAVE_ASM
#if ARCH_AARCH64 || ARCH_ARM
#include "src/arm/looprestoration.h"
#elif ARCH_LOONGARCH64
#include "src/loongarch/looprestoration.h"
#elif ARCH_PPC64LE
#include "src/ppc/looprestoration.h"
#elif ARCH_X86
@ -545,6 +547,8 @@ COLD void bitfn(dav1d_loop_restoration_dsp_init)(Dav1dLoopRestorationDSPContext
#if HAVE_ASM
#if ARCH_AARCH64 || ARCH_ARM
loop_restoration_dsp_init_arm(c, bpc);
#elif ARCH_LOONGARCH64
loop_restoration_dsp_init_loongarch(c, bpc);
#elif ARCH_PPC64LE
loop_restoration_dsp_init_ppc(c, bpc);
#elif ARCH_X86

third_party/dav1d/src/mc_tmpl.c
View File

@ -905,6 +905,8 @@ static void resize_c(pixel *dst, const ptrdiff_t dst_stride,
#if HAVE_ASM
#if ARCH_AARCH64 || ARCH_ARM
#include "src/arm/mc.h"
#elif ARCH_LOONGARCH64
#include "src/loongarch/mc.h"
#elif ARCH_X86
#include "src/x86/mc.h"
#endif
@ -946,6 +948,8 @@ COLD void bitfn(dav1d_mc_dsp_init)(Dav1dMCDSPContext *const c) {
#if HAVE_ASM
#if ARCH_AARCH64 || ARCH_ARM
mc_dsp_init_arm(c);
#elif ARCH_LOONGARCH64
mc_dsp_init_loongarch(c);
#elif ARCH_X86
mc_dsp_init_x86(c);
#endif

third_party/dav1d/src/meson.build
View File

@ -226,6 +226,24 @@ if is_asm_enabled
# Compile the ASM sources with NASM
libdav1d_asm_objs = nasm_gen.process(libdav1d_sources_asm)
elif host_machine.cpu_family().startswith('loongarch')
libdav1d_sources += files(
'loongarch/cpu.c',
)
libdav1d_arch_tmpl_sources += files(
'loongarch/looprestoration_tmpl.c',
)
libdav1d_sources_asm = files(
'loongarch/mc.S',
'loongarch/loopfilter.S',
'loongarch/looprestoration.S',
'loongarch/msac.S',
'loongarch/refmvs.S',
'loongarch/itx.S',
)
libdav1d_asm_objs += libdav1d_sources_asm
elif host_machine.cpu() == 'ppc64le'
arch_flags = ['-maltivec', '-mvsx']
libdav1d_sources += files(
@ -235,6 +253,15 @@ if is_asm_enabled
'ppc/cdef_tmpl.c',
'ppc/looprestoration_tmpl.c',
)
elif host_machine.cpu_family().startswith('riscv')
libdav1d_sources += files(
'riscv/cpu.c',
)
if host_machine.cpu_family() == 'riscv64'
libdav1d_sources += files(
'riscv/64/itx.S',
)
endif
endif
endif

third_party/dav1d/src/msac.h
View File

@ -51,6 +51,8 @@ typedef struct MsacContext {
#if HAVE_ASM
#if ARCH_AARCH64 || ARCH_ARM
#include "src/arm/msac.h"
#elif ARCH_LOONGARCH64
#include "src/loongarch/msac.h"
#elif ARCH_X86
#include "src/x86/msac.h"
#endif

File diff suppressed because it is too large.

third_party/dav1d/src/refmvs.c
View File

@ -919,6 +919,8 @@ static void splat_mv_c(refmvs_block **rr, const refmvs_block *const rmv,
#if HAVE_ASM
#if ARCH_AARCH64 || ARCH_ARM
#include "src/arm/refmvs.h"
#elif ARCH_LOONGARCH64
#include "src/loongarch/refmvs.h"
#elif ARCH_X86
#include "src/x86/refmvs.h"
#endif
@ -933,6 +935,8 @@ COLD void dav1d_refmvs_dsp_init(Dav1dRefmvsDSPContext *const c)
#if HAVE_ASM
#if ARCH_AARCH64 || ARCH_ARM
refmvs_dsp_init_arm(c);
#elif ARCH_LOONGARCH64
refmvs_dsp_init_loongarch(c);
#elif ARCH_X86
refmvs_dsp_init_x86(c);
#endif

third_party/dav1d/src/refmvs.h
View File

@ -171,6 +171,7 @@ void dav1d_refmvs_find(const refmvs_tile *rt,
void dav1d_refmvs_dsp_init(Dav1dRefmvsDSPContext *dsp);
void dav1d_refmvs_dsp_init_arm(Dav1dRefmvsDSPContext *dsp);
void dav1d_refmvs_dsp_init_loongarch(Dav1dRefmvsDSPContext *dsp);
void dav1d_refmvs_dsp_init_x86(Dav1dRefmvsDSPContext *dsp);
#endif /* DAV1D_SRC_REF_MVS_H */

third_party/dav1d/src/riscv/64/itx.S vendored Normal file
View File

@ -0,0 +1,662 @@
/******************************************************************************
* Copyright © 2018, VideoLAN and dav1d authors
* Copyright © 2023, Nathan Egge
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
* ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include "src/riscv/asm.S"
function inv_txfm_add_4x4_rvv, export=1, ext=v
csrw vxrm, zero
vsetivli zero, 4, e16, mf2, ta, ma
vle16.v v0, (a2)
addi t0, a2, 8
vle16.v v1, (t0)
addi t0, t0, 8
vle16.v v2, (t0)
addi t0, t0, 8
vle16.v v3, (t0)
jalr t0, a4
vmv.v.x v4, zero
vsseg4e16.v v0, (a2)
vle16.v v0, (a2)
vse16.v v4, (a2)
addi t0, a2, 8
vle16.v v1, (t0)
vse16.v v4, (t0)
addi t0, t0, 8
vle16.v v2, (t0)
vse16.v v4, (t0)
addi t0, t0, 8
vle16.v v3, (t0)
vse16.v v4, (t0)
jalr t0, a5
vssra.vi v0, v0, 4
vssra.vi v1, v1, 4
vssra.vi v2, v2, 4
vssra.vi v3, v3, 4
itx_4x4_end:
vsetvli zero, zero, e8, mf4, ta, ma
vle8.v v4, (a0)
add t0, a0, a1
vle8.v v5, (t0)
add t0, t0, a1
vle8.v v6, (t0)
add t0, t0, a1
vle8.v v7, (t0)
vwaddu.wv v0, v0, v4
vwaddu.wv v1, v1, v5
vwaddu.wv v2, v2, v6
vwaddu.wv v3, v3, v7
vsetvli zero, zero, e16, mf2, ta, ma
vmax.vx v0, v0, zero
vmax.vx v1, v1, zero
vmax.vx v2, v2, zero
vmax.vx v3, v3, zero
vsetvli zero, zero, e8, mf4, ta, ma
vnclipu.wi v4, v0, 0
vnclipu.wi v5, v1, 0
vnclipu.wi v6, v2, 0
vnclipu.wi v7, v3, 0
vse8.v v4, (a0)
add a0, a0, a1
vse8.v v5, (a0)
add a0, a0, a1
vse8.v v6, (a0)
add a0, a0, a1
vse8.v v7, (a0)
ret
endfunc
function inv_identity_e16_x4_rvv, export=1, ext=v
li t1, (5793-4096)*8
vsmul.vx v4, v0, t1
vsmul.vx v5, v1, t1
vsmul.vx v6, v2, t1
vsmul.vx v7, v3, t1
vsadd.vv v0, v0, v4
vsadd.vv v1, v1, v5
vsadd.vv v2, v2, v6
vsadd.vv v3, v3, v7
jr t0
endfunc
.macro idct_4 o0, o1, o2, o3
li t1, 2896
li t2, 1567
li t3, 3784
vwmul.vx v8, \o0, t1
vwmul.vx v10, \o0, t1
vwmacc.vx v8, t1, \o2
neg t1, t1
vwmacc.vx v10, t1, \o2
vwmul.vx v12, \o1, t3
neg t3, t3
vwmul.vx v14, \o1, t2
vwmacc.vx v12, t2, \o3
vwmacc.vx v14, t3, \o3
li t1, 2048
vwadd.wx v8, v8, t1
vwadd.wx v10, v10, t1
vwadd.wx v12, v12, t1
vwadd.wx v14, v14, t1
vnsra.wi v8, v8, 12
vnsra.wi v10, v10, 12
vnsra.wi v12, v12, 12
vnsra.wi v14, v14, 12
vsadd.vv \o0, v8, v12
vsadd.vv \o1, v10, v14
vssub.vv \o2, v10, v14
vssub.vv \o3, v8, v12
.endm
.macro iadst_4 o0, o1, o2, o3
li t1, 1321
li t2, 3803
li t3, 2482
vwmul.vx v4, v0, t1
vwmul.vx v5, v0, t3
neg t1, t1
vwmacc.vx v4, t2, v2
vwmacc.vx v5, t1, v2
neg t2, t2
vwmacc.vx v4, t3, v3
vwmacc.vx v5, t2, v3
vwsub.vv v6, v0, v2
vwadd.wv v6, v6, v3
li t1, 3344
vwmul.vx v7, v1, t1
vsetvli zero, zero, e32, m1, ta, ma
vmul.vx v6, v6, t1
vadd.vv v8, v4, v5
vadd.vv v4, v4, v7
vadd.vv v5, v5, v7
vsub.vv v7, v8, v7
li t1, 2048
vadd.vx v4, v4, t1
vadd.vx v5, v5, t1
vadd.vx v6, v6, t1
vadd.vx v7, v7, t1
vsetvli zero, zero, e16, mf2, ta, ma
vnsra.wi \o0, v4, 12
vnsra.wi \o1, v5, 12
vnsra.wi \o2, v6, 12
vnsra.wi \o3, v7, 12
.endm
function inv_dct_e16_x4_rvv, export=1, ext=v
idct_4 v0, v1, v2, v3
jr t0
endfunc
function inv_adst_e16_x4_rvv, export=1, ext=v
iadst_4 v0, v1, v2, v3
jr t0
endfunc
function inv_flipadst_e16_x4_rvv, export=1, ext=v
iadst_4 v3, v2, v1, v0
jr t0
endfunc
.macro def_fn_4x4 txfm1, txfm2
function inv_txfm_add_\txfm1\()_\txfm2\()_4x4_8bpc_rvv, export=1, ext=v
.ifc \txfm1\()_\txfm2, dct_dct
beqz a3, 1f
.endif
la a4, inv_\txfm1\()_e16_x4_rvv
la a5, inv_\txfm2\()_e16_x4_rvv
j inv_txfm_add_4x4_rvv
.ifc \txfm1\()_\txfm2, dct_dct
1:
csrw vxrm, zero
vsetivli zero, 4, e16, mf2, ta, ma
ld t2, (a2)
li t1, 2896*8
vmv.v.x v0, t2
vsmul.vx v0, v0, t1
sd x0, (a2)
vsmul.vx v0, v0, t1
vssra.vi v0, v0, 4
vmv.v.v v1, v0
vmv.v.v v2, v0
vmv.v.v v3, v0
j itx_4x4_end
.endif
endfunc
.endm
def_fn_4x4 dct, dct
def_fn_4x4 identity, identity
def_fn_4x4 dct, adst
def_fn_4x4 dct, flipadst
def_fn_4x4 dct, identity
def_fn_4x4 adst, dct
def_fn_4x4 adst, adst
def_fn_4x4 adst, flipadst
def_fn_4x4 flipadst, dct
def_fn_4x4 flipadst, adst
def_fn_4x4 flipadst, flipadst
def_fn_4x4 identity, dct
def_fn_4x4 adst, identity
def_fn_4x4 flipadst, identity
def_fn_4x4 identity, adst
def_fn_4x4 identity, flipadst
.macro def_fn_8x8_base variant
function inv_txfm_\variant\()add_8x8_rvv, export=1, ext=v
csrw vxrm, zero
vsetivli zero, 8, e16, m1, ta, ma
vle16.v v0, (a2)
addi t0, a2, 16
vle16.v v1, (t0)
addi t0, t0, 16
vle16.v v2, (t0)
addi t0, t0, 16
vle16.v v3, (t0)
addi t0, t0, 16
vle16.v v4, (t0)
addi t0, t0, 16
vle16.v v5, (t0)
addi t0, t0, 16
vle16.v v6, (t0)
addi t0, t0, 16
vle16.v v7, (t0)
.ifc \variant, identity_
// The identity vsadd.vv and downshift vssra.vi 1 cancel out
.else
jalr t0, a4
vssra.vi v0, v0, 1
vssra.vi v1, v1, 1
vssra.vi v2, v2, 1
vssra.vi v3, v3, 1
vssra.vi v4, v4, 1
vssra.vi v5, v5, 1
vssra.vi v6, v6, 1
vssra.vi v7, v7, 1
.endif
vsseg8e16.v v0, (a2)
vle16.v v0, (a2)
addi t0, a2, 16
vle16.v v1, (t0)
addi t0, t0, 16
vle16.v v2, (t0)
addi t0, t0, 16
vle16.v v3, (t0)
addi t0, t0, 16
vle16.v v4, (t0)
addi t0, t0, 16
vle16.v v5, (t0)
addi t0, t0, 16
vle16.v v6, (t0)
addi t0, t0, 16
vle16.v v7, (t0)
jalr t0, a5
vssra.vi v0, v0, 4
vssra.vi v1, v1, 4
vssra.vi v2, v2, 4
vssra.vi v3, v3, 4
vssra.vi v4, v4, 4
vssra.vi v5, v5, 4
vssra.vi v6, v6, 4
vssra.vi v7, v7, 4
li t1, 64
vsetvli zero, t1, e16, m8, ta, ma
vmv.v.x v8, zero
vse16.v v8, (a2)
.ifc \variant, identity_
itx_8x8_end:
.endif
vsetivli zero, 8, e8, mf2, ta, ma
vle8.v v8, (a0)
add t0, a0, a1
vle8.v v9, (t0)
add t0, t0, a1
vle8.v v10, (t0)
add t0, t0, a1
vle8.v v11, (t0)
add t0, t0, a1
vle8.v v12, (t0)
add t0, t0, a1
vle8.v v13, (t0)
add t0, t0, a1
vle8.v v14, (t0)
add t0, t0, a1
vle8.v v15, (t0)
vwaddu.wv v0, v0, v8
vwaddu.wv v1, v1, v9
vwaddu.wv v2, v2, v10
vwaddu.wv v3, v3, v11
vwaddu.wv v4, v4, v12
vwaddu.wv v5, v5, v13
vwaddu.wv v6, v6, v14
vwaddu.wv v7, v7, v15
vsetvli zero, zero, e16, m1
vmax.vx v0, v0, zero
vmax.vx v1, v1, zero
vmax.vx v2, v2, zero
vmax.vx v3, v3, zero
vmax.vx v4, v4, zero
vmax.vx v5, v5, zero
vmax.vx v6, v6, zero
vmax.vx v7, v7, zero
vsetvli zero, zero, e8, mf2, ta, ma
vnclipu.wi v8, v0, 0
vnclipu.wi v9, v1, 0
vnclipu.wi v10, v2, 0
vnclipu.wi v11, v3, 0
vnclipu.wi v12, v4, 0
vnclipu.wi v13, v5, 0
vnclipu.wi v14, v6, 0
vnclipu.wi v15, v7, 0
vse8.v v8, (a0)
add a0, a0, a1
vse8.v v9, (a0)
add a0, a0, a1
vse8.v v10, (a0)
add a0, a0, a1
vse8.v v11, (a0)
add a0, a0, a1
vse8.v v12, (a0)
add a0, a0, a1
vse8.v v13, (a0)
add a0, a0, a1
vse8.v v14, (a0)
add a0, a0, a1
vse8.v v15, (a0)
ret
endfunc
.endm
def_fn_8x8_base
def_fn_8x8_base identity_
function inv_identity_e16_x8_rvv, export=1, ext=v
vsadd.vv v0, v0, v0
vsadd.vv v1, v1, v1
vsadd.vv v2, v2, v2
vsadd.vv v3, v3, v3
vsadd.vv v4, v4, v4
vsadd.vv v5, v5, v5
vsadd.vv v6, v6, v6
vsadd.vv v7, v7, v7
jr t0
endfunc
function inv_dct_e16_x8_rvv, export=1, ext=v
idct_4 v0, v2, v4, v6
li t1, 799
li t2, 4017
li t3, 3406
li t4, 2276
vwmul.vx v14, v1, t2
neg t2, t2
vwmul.vx v8, v1, t1
vwmacc.vx v14, t1, v7
vwmacc.vx v8, t2, v7
vwmul.vx v12, v5, t4
neg t4, t4
vwmul.vx v10, v5, t3
vwmacc.vx v12, t3, v3
vwmacc.vx v10, t4, v3
li t1, 2048
vwadd.wx v8, v8, t1
vwadd.wx v10, v10, t1
vwadd.wx v12, v12, t1
vwadd.wx v14, v14, t1
vnsra.wi v8, v8, 12
vnsra.wi v10, v10, 12
vnsra.wi v12, v12, 12
vnsra.wi v14, v14, 12
vssub.vv v7, v14, v12
vsadd.vv v14, v14, v12
vssub.vv v1, v8, v10
vsadd.vv v8, v8, v10
li t2, 2896
vwmul.vx v10, v7, t2
vwmul.vx v12, v7, t2
vwmacc.vx v12, t2, v1
neg t2, t2
vwmacc.vx v10, t2, v1
vwadd.wx v10, v10, t1
vwadd.wx v12, v12, t1
vnsra.wi v10, v10, 12
vnsra.wi v12, v12, 12
vssub.vv v7, v0, v14
vsadd.vv v0, v0, v14
vssub.vv v9, v2, v12
vsadd.vv v1, v2, v12
vssub.vv v5, v4, v10
vsadd.vv v2, v4, v10
vssub.vv v4, v6, v8
vsadd.vv v3, v6, v8
vmv.v.v v6, v9
jr t0
endfunc
.macro iadst_8 o0, o1, o2, o3, o4, o5, o6, o7
li t1, 4076
li t2, 401
li t3, 3612
li t4, 1931
li t5, 2598
li t6, 3166
vwmul.vx v8, v7, t1
neg t1, t1
vwmul.vx v10, v7, t2
vwmacc.vx v8, t2, v0
vwmacc.vx v10, t1, v0
vwmul.vx v12, v5, t3
neg t3, t3
vwmul.vx v14, v5, t4
vwmacc.vx v12, t4, v2
vwmacc.vx v14, t3, v2
vwmul.vx v16, v3, t5
neg t5, t5
vwmul.vx v18, v3, t6
vwmacc.vx v16, t6, v4
vwmacc.vx v18, t5, v4
li t1, 2048
li t2, 1189
li t3, 3920
li t4, 1567
li t5, 3784
li t6, 2896
vwmul.vx v20, v1, t2
neg t2, t2
vwmul.vx v22, v1, t3
vwmacc.vx v20, t3, v6
vwmacc.vx v22, t2, v6
vwadd.wx v8, v8, t1
vwadd.wx v10, v10, t1
vwadd.wx v12, v12, t1
vwadd.wx v14, v14, t1
vwadd.wx v16, v16, t1
vwadd.wx v18, v18, t1
vwadd.wx v20, v20, t1
vwadd.wx v22, v22, t1
vnsra.wi v8, v8, 12
vnsra.wi v10, v10, 12
vnsra.wi v12, v12, 12
vnsra.wi v14, v14, 12
vnsra.wi v16, v16, 12
vnsra.wi v18, v18, 12
vnsra.wi v20, v20, 12
vnsra.wi v22, v22, 12
vssub.vv v4, v8, v16
vsadd.vv v8, v8, v16
vsadd.vv v1, v10, v18
vsadd.vv v2, v12, v20
vsadd.vv v3, v14, v22
vssub.vv v5, v10, v18
vssub.vv v6, v12, v20
vssub.vv v22, v14, v22
vsadd.vv \o0, v8, v2
vsadd.vv \o7, v1, v3
vssub.vv v2, v8, v2
vssub.vv v3, v1, v3
vwmul.vx v8, v4, t5
vwmul.vx v10, v4, t4
vwmul.vx v12, v22, t5
vwmul.vx v14, v22, t4
vwmacc.vx v8, t4, v5
neg t4, t4
vwmacc.vx v14, t5, v6
neg t5, t5
vwmacc.vx v12, t4, v6
vwmacc.vx v10, t5, v5
vwadd.wx v8, v8, t1
vwadd.wx v10, v10, t1
vwadd.wx v12, v12, t1
vwadd.wx v14, v14, t1
vnsra.wi v8, v8, 12
vnsra.wi v10, v10, 12
vnsra.wi v12, v12, 12
vnsra.wi v14, v14, 12
vsadd.vv \o1, v8, v12
vsadd.vv \o6, v10, v14
vssub.vv v8, v8, v12
vssub.vv v9, v10, v14
vwmul.vx v10, v2, t6
vwmul.vx v12, v2, t6
vwmul.vx v14, v8, t6
vwmul.vx v16, v8, t6
vwmacc.vx v10, t6, v3
vwmacc.vx v14, t6, v9
neg t6, t6
vwmacc.vx v12, t6, v3
vwmacc.vx v16, t6, v9
vwadd.wx v10, v10, t1
vwadd.wx v12, v12, t1
vwadd.wx v14, v14, t1
vwadd.wx v16, v16, t1
vnsra.wi \o3, v10, 12
vnsra.wi \o4, v12, 12
vnsra.wi \o2, v14, 12
vnsra.wi \o5, v16, 12
vmv.v.x v8, zero
vssub.vv \o1, v8, \o1
vssub.vv \o3, v8, \o3
vssub.vv \o5, v8, \o5
vssub.vv \o7, v8, \o7
.endm
function inv_adst_e16_x8_rvv, export=1, ext=v
iadst_8 v0, v1, v2, v3, v4, v5, v6, v7
jr t0
endfunc
function inv_flipadst_e16_x8_rvv, export=1, ext=v
iadst_8 v7, v6, v5, v4, v3, v2, v1, v0
jr t0
endfunc
.macro def_fn_8x8 txfm1, txfm2
function inv_txfm_add_\txfm1\()_\txfm2\()_8x8_8bpc_rvv, export=1, ext=v
.ifc \txfm1\()_\txfm2, dct_dct
beqz a3, 1f
.endif
la a5, inv_\txfm2\()_e16_x8_rvv
.ifc \txfm1, identity
j inv_txfm_identity_add_8x8_rvv
.else
la a4, inv_\txfm1\()_e16_x8_rvv
j inv_txfm_add_8x8_rvv
.endif
.ifc \txfm1\()_\txfm2, dct_dct
1:
csrw vxrm, zero
vsetivli zero, 8, e16, m1, ta, ma
ld t2, (a2)
li t1, 2896*8
vmv.v.x v0, t2
vsmul.vx v0, v0, t1
sd x0, (a2)
vssra.vi v0, v0, 1
vsmul.vx v0, v0, t1
vssra.vi v0, v0, 4
vmv.v.v v1, v0
vmv.v.v v2, v0
vmv.v.v v3, v0
vmv.v.v v4, v0
vmv.v.v v5, v0
vmv.v.v v6, v0
vmv.v.v v7, v0
j itx_8x8_end
.endif
endfunc
.endm
def_fn_8x8 dct, dct
def_fn_8x8 identity, identity
def_fn_8x8 dct, adst
def_fn_8x8 dct, flipadst
def_fn_8x8 dct, identity
def_fn_8x8 adst, dct
def_fn_8x8 adst, adst
def_fn_8x8 adst, flipadst
def_fn_8x8 flipadst, dct
def_fn_8x8 flipadst, adst
def_fn_8x8 flipadst, flipadst
def_fn_8x8 identity, dct
def_fn_8x8 adst, identity
def_fn_8x8 flipadst, identity
def_fn_8x8 identity, adst
def_fn_8x8 identity, flipadst

third_party/dav1d/src/riscv/asm.S vendored Normal file
View File

@ -0,0 +1,126 @@
/*
* Copyright © 2018, VideoLAN and dav1d authors
* Copyright © 2023, Nathan Egge
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
* ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef DAV1D_SRC_RISCV_ASM_S
#define DAV1D_SRC_RISCV_ASM_S
#include "config.h"
#if !defined(PIC)
#if defined(__PIC__)
#define PIC __PIC__
#elif defined(__pic__)
#define PIC __pic__
#endif
#endif
#ifndef PRIVATE_PREFIX
#define PRIVATE_PREFIX dav1d_
#endif
#define PASTE(a,b) a ## b
#define CONCAT(a,b) PASTE(a,b)
#ifdef PREFIX
#define EXTERN CONCAT(_,PRIVATE_PREFIX)
#else
#define EXTERN PRIVATE_PREFIX
#endif
.macro function name, export=0, ext=
.macro endfunc
#ifdef __ELF__
.size \name, . - \name
#endif
.option pop
.purgem endfunc
.endm
.text
.option push
.ifnb \ext
.option arch, +\ext
.endif
.if \export
.global EXTERN\name
#ifdef __ELF__
.type EXTERN\name, %function
.hidden EXTERN\name
#elif defined(__MACH__)
.private_extern EXTERN\name
#endif
EXTERN\name:
.else
#ifdef __ELF__
.type \name, %function
#endif
.endif
\name:
.endm
.macro const name, export=0, align=2
.macro endconst
#ifdef __ELF__
.size \name, . - \name
#endif
.purgem endconst
.endm
#if defined(_WIN32)
.section .rdata
#elif !defined(__MACH__)
.section .rodata
#else
.const_data
#endif
.align \align
.if \export
.global EXTERN\name
#ifdef __ELF__
.hidden EXTERN\name
#elif defined(__MACH__)
.private_extern EXTERN\name
#endif
EXTERN\name:
.endif
\name:
.endm
.macro thread_local name, align=3, quads=1
.macro end_thread_local
.size \name, . - \name
.purgem end_thread_local
.endm
.section .tbss, "waT"
.align \align
.hidden \name
\name:
.rept \quads
.quad 0
.endr
end_thread_local
.endm
#endif /* DAV1D_SRC_RISCV_ASM_S */

third_party/dav1d/src/riscv/cpu.c vendored Normal file
View File

@ -0,0 +1,49 @@
/*
* Copyright © 2022, VideoLAN and dav1d authors
* Copyright © 2022, Nathan Egge
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
* ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "config.h"
#include "common/attributes.h"
#include "src/riscv/cpu.h"
#if defined(HAVE_GETAUXVAL)
#include <sys/auxv.h>
#define HWCAP_RVV (1 << ('v' - 'a'))
#endif
COLD unsigned dav1d_get_cpu_flags_riscv(void) {
unsigned flags = 0;
#if defined(HAVE_GETAUXVAL)
unsigned long hw_cap = getauxval(AT_HWCAP);
flags |= (hw_cap & HWCAP_RVV) ? DAV1D_RISCV_CPU_FLAG_V : 0;
#endif
return flags;
}

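The HWCAP probe above relies on the Linux convention that each single-letter RISC-V base extension maps to one bit of AT_HWCAP, so the vector extension 'V' is bit ('v' - 'a') = 21. A minimal standalone check using the same getauxval() mechanism (Linux-only; the bit is only meaningful when running on RISC-V):

#include <stdio.h>
#if defined(__linux__)
#include <sys/auxv.h>
#endif

int main(void) {
#if defined(__linux__)
    /* Same bit computation as dav1d_get_cpu_flags_riscv() above. */
    const unsigned long hwcap   = getauxval(AT_HWCAP);
    const unsigned long rvv_bit = 1ul << ('v' - 'a');
    printf("RVV %s\n", (hwcap & rvv_bit) ? "available" : "not available");
#else
    puts("AT_HWCAP probe is Linux-specific");
#endif
    return 0;
}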
third_party/dav1d/src/riscv/cpu.h vendored Normal file
View File

@ -0,0 +1,37 @@
/*
* Copyright © 2022, VideoLAN and dav1d authors
* Copyright © 2022, Nathan Egge
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
* ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef DAV1D_SRC_RISCV_CPU_H
#define DAV1D_SRC_RISCV_CPU_H
enum CpuFlags {
DAV1D_RISCV_CPU_FLAG_V = 1 << 0,
};
unsigned dav1d_get_cpu_flags_riscv(void);
#endif /* DAV1D_SRC_RISCV_CPU_H */

third_party/dav1d/src/riscv/itx.h vendored Normal file
View File

@ -0,0 +1,109 @@
/*
* Copyright © 2018, VideoLAN and dav1d authors
* Copyright © 2023, Nathan Egge
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
* ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "src/cpu.h"
#include "src/itx.h"
#define decl_itx2_fns(w, h, opt) \
decl_itx_fn(BF(dav1d_inv_txfm_add_dct_dct_##w##x##h, opt)); \
decl_itx_fn(BF(dav1d_inv_txfm_add_identity_identity_##w##x##h, opt))
#define decl_itx12_fns(w, h, opt) \
decl_itx2_fns(w, h, opt); \
decl_itx_fn(BF(dav1d_inv_txfm_add_dct_adst_##w##x##h, opt)); \
decl_itx_fn(BF(dav1d_inv_txfm_add_dct_flipadst_##w##x##h, opt)); \
decl_itx_fn(BF(dav1d_inv_txfm_add_dct_identity_##w##x##h, opt)); \
decl_itx_fn(BF(dav1d_inv_txfm_add_adst_dct_##w##x##h, opt)); \
decl_itx_fn(BF(dav1d_inv_txfm_add_adst_adst_##w##x##h, opt)); \
decl_itx_fn(BF(dav1d_inv_txfm_add_adst_flipadst_##w##x##h, opt)); \
decl_itx_fn(BF(dav1d_inv_txfm_add_flipadst_dct_##w##x##h, opt)); \
decl_itx_fn(BF(dav1d_inv_txfm_add_flipadst_adst_##w##x##h, opt)); \
decl_itx_fn(BF(dav1d_inv_txfm_add_flipadst_flipadst_##w##x##h, opt)); \
decl_itx_fn(BF(dav1d_inv_txfm_add_identity_dct_##w##x##h, opt))
#define decl_itx16_fns(w, h, opt) \
decl_itx12_fns(w, h, opt); \
decl_itx_fn(BF(dav1d_inv_txfm_add_adst_identity_##w##x##h, opt)); \
decl_itx_fn(BF(dav1d_inv_txfm_add_flipadst_identity_##w##x##h, opt)); \
decl_itx_fn(BF(dav1d_inv_txfm_add_identity_adst_##w##x##h, opt)); \
decl_itx_fn(BF(dav1d_inv_txfm_add_identity_flipadst_##w##x##h, opt))
#define decl_itx17_fns(w, h, opt) \
decl_itx16_fns(w, h, opt); \
decl_itx_fn(BF(dav1d_inv_txfm_add_wht_wht_##w##x##h, opt))
#define decl_itx_fns(ext) \
decl_itx17_fns( 4, 4, ext); \
decl_itx16_fns( 8, 8, ext)
decl_itx_fns(rvv);
static ALWAYS_INLINE void itx_dsp_init_riscv(Dav1dInvTxfmDSPContext *const c, int const bpc) {
#define assign_itx_fn(pfx, w, h, type, type_enum, ext) \
c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
#define assign_itx1_fn(pfx, w, h, ext) \
assign_itx_fn(pfx, w, h, dct_dct, DCT_DCT, ext)
#define assign_itx2_fn(pfx, w, h, ext) \
assign_itx1_fn(pfx, w, h, ext); \
assign_itx_fn(pfx, w, h, identity_identity, IDTX, ext)
#define assign_itx12_fn(pfx, w, h, ext) \
assign_itx2_fn(pfx, w, h, ext); \
assign_itx_fn(pfx, w, h, dct_adst, ADST_DCT, ext); \
assign_itx_fn(pfx, w, h, dct_flipadst, FLIPADST_DCT, ext); \
assign_itx_fn(pfx, w, h, dct_identity, H_DCT, ext); \
assign_itx_fn(pfx, w, h, adst_dct, DCT_ADST, ext); \
assign_itx_fn(pfx, w, h, adst_adst, ADST_ADST, ext); \
assign_itx_fn(pfx, w, h, adst_flipadst, FLIPADST_ADST, ext); \
assign_itx_fn(pfx, w, h, flipadst_dct, DCT_FLIPADST, ext); \
assign_itx_fn(pfx, w, h, flipadst_adst, ADST_FLIPADST, ext); \
assign_itx_fn(pfx, w, h, flipadst_flipadst, FLIPADST_FLIPADST, ext); \
assign_itx_fn(pfx, w, h, identity_dct, V_DCT, ext)
#define assign_itx16_fn(pfx, w, h, ext) \
assign_itx12_fn(pfx, w, h, ext); \
assign_itx_fn(pfx, w, h, adst_identity, H_ADST, ext); \
assign_itx_fn(pfx, w, h, flipadst_identity, H_FLIPADST, ext); \
assign_itx_fn(pfx, w, h, identity_adst, V_ADST, ext); \
assign_itx_fn(pfx, w, h, identity_flipadst, V_FLIPADST, ext)
#define assign_itx17_fn(pfx, w, h, ext) \
assign_itx16_fn(pfx, w, h, ext); \
assign_itx_fn(pfx, w, h, wht_wht, WHT_WHT, ext)
const unsigned flags = dav1d_get_cpu_flags();
if (!(flags & DAV1D_RISCV_CPU_FLAG_V)) return;
#if BITDEPTH == 8
assign_itx16_fn( , 4, 4, rvv);
assign_itx16_fn( , 8, 8, rvv);
#endif
}

third_party/dav1d/src/x86/ipred.h
View File

@ -138,13 +138,13 @@ static ALWAYS_INLINE void intra_pred_dsp_init_x86(Dav1dIntraPredDSPContext *cons
init_angular_ipred_fn(HOR_PRED, ipred_h, avx512icl);
init_angular_ipred_fn(VERT_PRED, ipred_v, avx512icl);
init_angular_ipred_fn(Z2_PRED, ipred_z2, avx512icl);
init_angular_ipred_fn(Z3_PRED, ipred_z3, avx512icl);
#endif
init_angular_ipred_fn(PAETH_PRED, ipred_paeth, avx512icl);
init_angular_ipred_fn(SMOOTH_PRED, ipred_smooth, avx512icl);
init_angular_ipred_fn(SMOOTH_H_PRED, ipred_smooth_h, avx512icl);
init_angular_ipred_fn(SMOOTH_V_PRED, ipred_smooth_v, avx512icl);
init_angular_ipred_fn(Z1_PRED, ipred_z1, avx512icl);
init_angular_ipred_fn(Z3_PRED, ipred_z3, avx512icl);
init_angular_ipred_fn(FILTER_PRED, ipred_filter, avx512icl);
c->pal_pred = BF(dav1d_pal_pred, avx512icl);

third_party/dav1d/src/x86/ipred16_avx512.asm
View File

@ -1,5 +1,5 @@
; Copyright © 2022, VideoLAN and dav1d authors
; Copyright © 2022, Two Orioles, LLC
; Copyright © 2022-2024, VideoLAN and dav1d authors
; Copyright © 2022-2024, Two Orioles, LLC
; All rights reserved.
;
; Redistribution and use in source and binary forms, with or without
@ -42,12 +42,16 @@ pal_pred_perm: db 0, 16, 32, 48, 1, 17, 33, 49, 2, 18, 34, 50, 3, 19, 35, 51
db 4, 20, 36, 52, 5, 21, 37, 53, 6, 22, 38, 54, 7, 23, 39, 55
db 8, 24, 40, 56, 9, 25, 41, 57, 10, 26, 42, 58, 11, 27, 43, 59
db 12, 28, 44, 60, 13, 29, 45, 61, 14, 30, 46, 62, 15, 31, 47, 63
pw_0to31: dw 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
dw 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
pw_31to0: dw 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16
dw 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
pw_1to32: dw 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16
dw 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32
z_upsample: dw 0, -1, 1, 0, 2, 1, 3, 2, 4, 3, 5, 4, 6, 5, 7, 6
dw 8, 7, 9, 8, 10, 9, 11, 10, 12, 11, 13, 12, 14, 13, 15, 14
z_xpos_mul: dw 1, 1, 1, 1, 2, 2, 1, 1, 3, 3, 2, 2, 4, 4, 2, 2
dw 5, 5, 3, 3, 6, 6, 3, 3, 7, 7, 4, 4, 8, 8, 4, 4
z_ypos_mul: dw 0, 0, 0, 0, 1, 1, 0, 0, 2, 2, 1, 1, 3, 3, 1, 1
dw 4, 4, 2, 2, 5, 5, 2, 2, 6, 6, 3, 3, 7, 7, 3, 3
z_filter_t0: db 55,127, 39,127, 39,127, 7, 15, 31, 7, 15, 31, 0, 3, 31, 0
z_filter_t1: db 39, 63, 19, 47, 19, 47, 3, 3, 3, 3, 3, 3, 0, 0, 0, 0
z_xpos_off1a: dw 30720, 30784, 30848, 30912, 30976, 31040, 31104, 31168
@ -75,13 +79,25 @@ z_filter_wh: db 7, 7, 11, 11, 15, 15, 19, 19, 19, 23, 23, 23, 31, 31, 31, 39
z_filter_k: dw 8, 8, 6, 6, 4, 4
dw 4, 4, 5, 5, 4, 4
dw 0, 0, 0, 0, 2, 2
pw_15: times 2 dw 15
pw_16: times 2 dw 16
pw_17: times 2 dw 17
pw_24: times 2 dw 24
pw_32: times 2 dw 32
pw_63: times 2 dw 63
pw_64: times 2 dw 64
pw_512: times 2 dw 512
pw_31806: times 2 dw 31806
pw_32640: times 2 dw 32640
pw_32672: times 2 dw 32672
pw_32704: times 2 dw 32704
pw_32735: times 2 dw 32735
pw_32736: times 2 dw 32736
%define pw_2 (z_xpos_mul+4* 2)
%define pw_3 (z_xpos_mul+4* 4)
%define pw_7 (z_xpos_mul+4*12)
%define pw_0to31 (pw_1to32-2)
%macro JMP_TABLE 3-*
%xdefine %1_%2_table (%%table - 2*4)
@ -98,6 +114,7 @@ JMP_TABLE ipred_smooth_16bpc, avx512icl, w4, w8, w16, w32, w64
JMP_TABLE ipred_smooth_h_16bpc, avx512icl, w4, w8, w16, w32, w64
JMP_TABLE ipred_smooth_v_16bpc, avx512icl, w4, w8, w16, w32, w64
JMP_TABLE ipred_z1_16bpc, avx512icl, w4, w8, w16, w32, w64
JMP_TABLE ipred_z3_16bpc, avx512icl, w4, w8, w16, w32, w64
JMP_TABLE pal_pred_16bpc, avx512icl, w4, w8, w16, w32, w64
cextern smooth_weights_1d_16bpc
@ -757,7 +774,7 @@ cglobal ipred_z1_16bpc, 3, 8, 16, dst, stride, tl, w, h, angle, dx
lea r3d, [angleq+216]
movu ym5, [tlq]
mov r3b, hb
mova m10, [base+pw_0to31]
movu m10, [base+pw_0to31]
cmp r3d, 8
ja .w8_no_upsample ; !enable_intra_edge_filter || is_sm || d >= 40 || h > 8
lea r3d, [hq+7]
@ -1157,6 +1174,638 @@ cglobal ipred_z1_16bpc, 3, 8, 16, dst, stride, tl, w, h, angle, dx
mov rsp, r7
RET
cglobal ipred_z3_16bpc, 3, 8, 16, dst, stride, tl, w, h, angle, dy
lea r7, [z_filter_t0]
tzcnt wd, wm
movifnidn angled, anglem
lea t0, [dr_intra_derivative+45*2-1]
movsxd wq, [base+ipred_z3_16bpc_avx512icl_table+wq*4]
sub angled, 180
mov dyd, angled
neg dyd
xor angled, 0x400
or dyq, ~0x7e
mova m0, [base+pw_31to0]
movzx dyd, word [t0+dyq]
lea wq, [base+ipred_z3_16bpc_avx512icl_table+wq]
movifnidn hd, hm
vpbroadcastd m14, [base+pw_31806]
vpbroadcastd m15, [base+pw_1]
jmp wq
.w4:
lea r3d, [hq+3]
xor r3d, 31 ; 32 - (h + imin(w, h))
vpbroadcastw m7, r3d
pmaxuw m7, m0
vpermw m6, m7, [tlq-64*1]
test angled, 0x400 ; !enable_intra_edge_filter
jnz .w4_main
cmp angleb, 40
jae .w4_filter
lea r3d, [angleq-1024]
sar r3d, 7
add r3d, hd
jg .w4_filter ; h > 8 || (h == 8 && is_sm)
call .upsample
movsldup m1, [base+z_ypos_mul]
paddw m1, m1
jmp .w4_main2
.w4_filter:
lea r3d, [hq+3]
call .filter32
.w4_main:
movsldup m1, [base+z_ypos_mul]
.w4_main2:
vpbroadcastq m0, [base+pw_1to32]
vpbroadcastw m4, dyd
lea r2d, [hq+4]
shr r2d, 3
pmullw m4, m0 ; ypos
vpbroadcastw m0, r2d
imul r2, strideq ; stride * imax(height / 8, 1)
pmullw m1, m0
lea r3, [r2*3]
paddd m1, [base+pw_32736] {1to16}
psrlw m2, m4, 6
psllw m4, 9
paddsw m2, m1 ; base+0
vpandd m4, m14 ; frac << 9
vpermw m3, m2, m6 ; left[base+0]
.w4_loop:
paddsw m2, m15 ; base+1
vpermw m1, m2, m6 ; left[base+1]
psubw m0, m1, m3
pmulhrsw m0, m4
paddw m0, m3
movq [dstq+r2*0], xm0
movhps [dstq+r2*1], xm0
vextracti32x4 xm3, ym0, 1
movq [dstq+r2*2], xm3
movhps [dstq+r3 ], xm3
sub hd, 8
jl .w4_end
lea r5, [dstq+r2*4]
vextracti32x8 ym0, m0, 1
mova m3, m1
movq [r5+r2*0], xm0
movhps [r5+r2*1], xm0
vextracti32x4 xm1, ym0, 1
movq [r5+r2*2], xm1
movhps [r5+r3 ], xm1
add dstq, strideq
test hd, hd
jnz .w4_loop
.w4_end:
RET
.upsample:
vinserti32x4 m6, [tlq-14], 3
mova m3, [base+z_upsample]
vpbroadcastd m4, [base+pd_65536]
add dyd, dyd
vpermw m0, m3, m6
paddw m3, m4
vpermw m1, m3, m6
paddw m3, m4
vpermw m2, m3, m6
paddw m3, m4
vpermw m3, m3, m6
vpbroadcastw m6, r9m ; pixel_max
paddw m1, m2 ; b+c
paddw m0, m3 ; a+d
psubw m0, m1, m0
psraw m0, 3
pxor m2, m2
paddw m0, m1
pmaxsw m0, m2
pavgw m0, m2
pminsw m6, m0
ret
.w8:
mova m6, [tlq-64*1]
cmp hd, 32
je .w8_h32
mov r3d, 8
cmp hd, 4
cmove r3d, hd
lea r3d, [r3+hq-1]
xor r3d, 31 ; 32 - (h + imin(w, h))
vpbroadcastw m1, r3d
vpermw m7, m1, m6
pmaxuw m1, m0
vpermw m6, m1, m6
test angled, 0x400
jnz .w8_main
lea r3d, [angleq+216]
mov r3b, hb
cmp r3d, 8
ja .w8_filter ; is_sm || d >= 40 || h > 8
call .upsample
movshdup m1, [base+z_ypos_mul]
paddw m1, m1
call .w8_main_setup
.w8_upsample_loop:
vpermw m3, m2, m6 ; left[base+0]
paddw m2, m15 ; base+1
vpermw m1, m2, m6 ; left[base+1]
psubw m0, m1, m3
pmulhrsw m0, m4
paddw m2, m15 ; base+2
paddw m0, m3
mova m3, m1
mova [dstq+r2*0], xm0
vextracti32x4 [dstq+r2*1], ym0, 1
vextracti32x4 [dstq+r2*2], m0, 2
vextracti32x4 [dstq+r3 ], m0, 3
add dstq, strideq
sub hd, 4
jg .w8_upsample_loop
RET
.w8_main_setup:
vbroadcasti32x4 m0, [base+pw_1to32]
vpbroadcastw m4, dyd
rorx r2d, hd, 2
pmullw m4, m0 ; ypos
vpbroadcastw m0, r2d
imul r2, strideq ; stride * height / 4
lea r3, [r2*3]
pmullw m1, m0 ; 0 1 2 3
paddd m1, [base+pw_32704] {1to16}
psrlw m2, m4, 6
psllw m4, 9
paddsw m2, m1 ; base+0
vpandd m4, m14 ; frac << 9
ret
.w8_h32:
pmaxud m7, m0, [base+pw_24] {1to16}
vpermw m6, m0, m6
vpermw m7, m7, [tlq-64*2]
test angled, 0x400
jnz .w8_main
call .filter64
vpbroadcastd m0, [base+pw_7]
pminuw m0, [base+pw_0to31]
vpermw m7, m0, m7
jmp .w8_main
.w8_filter:
lea r3d, [hq+7]
call .filter32
.w8_main:
movshdup m1, [base+z_ypos_mul]
call .w8_main_setup
mova m3, m6
vpermt2w m3, m2, m7 ; left[base+0]
.w8_loop:
paddsw m2, m15 ; base+1
mova m1, m6
vpermt2w m1, m2, m7 ; left[base+1]
psubw m0, m1, m3
pmulhrsw m0, m4
paddw m0, m3
mova m3, m1
mova [dstq+r2*0], xm0
vextracti32x4 [dstq+r2*1], ym0, 1
vextracti32x4 [dstq+r2*2], m0, 2
vextracti32x4 [dstq+r3 ], m0, 3
add dstq, strideq
sub hd, 4
jg .w8_loop
RET
.filter32:
vpbroadcastb ym10, r3d
vpbroadcastb ym1, angled
shr angled, 8
vpcmpeqb k1, ym10, [base+z_filter_wh]
mova xm2, [base+z_filter_t0+angleq*8]
vpcmpgtb k1{k1}, ym1, ym2
kmovd r5d, k1
test r5d, r5d
jz .filter32_end
vpbroadcastw m2, [tlq]
popcnt r5d, r5d
vpbroadcastd m5, [base+z_filter_k+(r5-1)*4+12*0]
valignq m2, m6, m2, 6
vpbroadcastd m8, [base+z_filter_k+(r5-1)*4+12*1]
valignq m4, m7, m6, 2
vpbroadcastd m9, [base+z_filter_k+(r5-1)*4+12*2]
palignr m1, m6, m2, 14
pmullw m5, m6
palignr m3, m4, m6, 2
paddw m1, m3
palignr m2, m6, m2, 12
pmullw m1, m8
palignr m4, m6, 4
paddw m2, m4
pmullw m2, m9
pmovzxbw m10, ym10
pxor m6, m6
paddw m5, m1
pminuw m1, m10, [base+pw_0to31]
paddw m5, m2
psrlw m5, 3
pavgw m6, m5
vpermw m7, m10, m6
vpermw m6, m1, m6
.filter32_end:
ret
.w16:
mova m6, [tlq-64*1]
cmp hd, 32
jl .w16_h16
pmaxud m8, m0, [base+pw_16] {1to16}
mova m7, [tlq-64*2]
vpermw m6, m0, m6
jg .w16_h64
vpermw m7, m8, m7
test angled, 0x400
jnz .w16_main
call .filter64
vpbroadcastd m0, [base+pw_15]
vinserti32x8 m0, [base+pw_0to31], 0
vpermw m7, m0, m7
jmp .w16_main
.w16_h16:
lea r3d, [hq*2-1]
xor r3d, 31 ; 32 - (h + imin(w, h))
vpbroadcastw m1, r3d
vpermw m7, m1, m6
pmaxuw m1, m0
vpermw m6, m1, m6
test angled, 0x400
jnz .w16_main
lea r3d, [hq+15]
call .filter32
.w16_main:
vbroadcasti32x8 m0, [base+pw_1to32]
vpbroadcastw m4, dyd
rorx r2d, hd, 1
pmullw m4, m0 ; ypos
vpbroadcastw ym1, r2d
imul r2, strideq ; stride * height / 2
paddd m1, [base+pw_32704] {1to16}
lea r3, [r2+strideq]
psrlw m2, m4, 6
psllw m4, 9
paddsw m2, m1 ; base+0
vpandd m4, m14 ; frac << 9
mova m3, m6
vpermt2w m3, m2, m7 ; left[base+0]
.w16_loop:
paddsw m1, m2, m15 ; base+1
paddsw m2, m1, m15 ; base+2
vpermi2w m1, m6, m7 ; left[base+1]
psubw m0, m1, m3
pmulhrsw m0, m4
paddw m0, m3
mova m3, m6
vpermt2w m3, m2, m7 ; left[base+2]
vextracti32x8 [dstq+strideq*0], m0, 1
mova [dstq+r2 ], ym0
psubw m0, m3, m1
pmulhrsw m0, m4
paddw m0, m1
vextracti32x8 [dstq+strideq*1], m0, 1
mova [dstq+r3 ], ym0
lea dstq, [dstq+strideq*2]
sub hd, 4
jg .w16_loop
RET
.w16_h64:
vpermw m7, m0, m7
vpermw m8, m8, [tlq-64*3]
test angled, 0x400
jnz .w16_h64_main
valignq m11, m8, m7, 6
call .filter64
vshufi32x4 m2, m8, m8, q3321
vpbroadcastd m0, [base+pw_15]
palignr ym3, ym8, ym11, 12
vinserti32x8 m0, [base+pw_0to31], 0
palignr ym4, ym8, ym11, 14
palignr ym1, ym2, ym8, 4
paddw ym3, ym5
palignr ym2, ym8, 2
paddw ym8, ym4
pavgw ym3, ym1
paddw ym8, ym2
paddw ym8, ym3
psrlw ym8, 2
vpermw m8, m0, m8
.w16_h64_main:
vbroadcasti32x8 m0, [base+pw_1to32]
vpbroadcastw m4, dyd
pmullw m4, m0 ; ypos
vpbroadcastd ym1, [base+pw_32]
paddd m1, [base+pw_32672] {1to16}
mov r2, strideq
shl r2, 5 ; stride*32
vpbroadcastd m9, [base+pw_32735]
lea r3, [r2+strideq]
psrlw m2, m4, 6
psllw m4, 9
paddsw m2, m1 ; base+0
vpandd m4, m14 ; frac << 9
mova m3, m7
vpermt2w m3, m2, m6
vpcmpgtw k1, m2, m9
vpermw m3{k1}, m2, m8 ; left[base+0]
.w16_h64_loop:
paddsw m2, m15 ; base+1
mova m1, m7
vpermt2w m1, m2, m6
vpcmpgtw k1, m2, m9
vpermw m1{k1}, m2, m8 ; left[base+1]
psubw m0, m1, m3
pmulhrsw m0, m4
paddsw m2, m15 ; base+2
paddw m0, m3
mova m3, m7
vpermt2w m3, m2, m6
vpcmpgtw k1, m2, m9
vpermw m3{k1}, m2, m8 ; left[base+2]
vextracti32x8 [dstq+strideq*0], m0, 1
mova [dstq+r2 ], ym0
psubw m0, m3, m1
pmulhrsw m0, m4
paddw m0, m1
vextracti32x8 [dstq+strideq*1], m0, 1
mova [dstq+r3 ], ym0
lea dstq, [dstq+strideq*2]
sub hd, 4
jg .w16_h64_loop
RET
.filter64:
vpbroadcastw m2, [tlq]
vpbroadcastd m5, [base+pw_3]
valignq m2, m6, m2, 6
valignq m4, m7, m6, 2
valignq m10, m7, m6, 6
palignr m1, m6, m2, 12
palignr m2, m6, m2, 14
palignr m3, m4, m6, 4
paddw m1, m5
palignr m4, m6, 2
paddw m6, m2
valignq m2, m8, m7, 2
pavgw m1, m3
palignr m3, m7, m10, 12
paddw m6, m4
palignr m4, m7, m10, 14
paddw m6, m1
palignr m1, m2, m7, 4
psrlw m6, 2
palignr m2, m7, 2
paddw m3, m5
paddw m7, m4
pavgw m3, m1
paddw m7, m2
paddw m7, m3
psrlw m7, 2
ret
.w32:
mova m6, [tlq-64*1]
cmp hd, 32
jl .w32_h16
mova m8, [tlq-64*2]
vpermw m6, m0, m6
vpermw m7, m0, m8
jg .w32_h64
test angled, 0x400
jnz .w32_main
vpbroadcastw xm8, xm8
jmp .w32_filter
.w32_h16:
lea r3d, [hq*2-1]
xor r3d, 31 ; 32 - (h + imin(w, h))
vpbroadcastw m1, r3d
vpermw m7, m1, m6
pmaxuw m1, m0
vpermw m6, m1, m6
test angled, 0x400
jnz .w32_main
vextracti32x4 xm8, m7, 3
.w32_filter:
call .filter64
.w32_main:
vpbroadcastw m4, dyd
vpbroadcastd m1, [base+pw_32704]
pmullw m4, [base+pw_1to32] ; ypos
psrlw m2, m4, 6
psllw m4, 9
paddsw m2, m1 ; base+0
vpandd m4, m14 ; frac << 9
mova m3, m6
vpermt2w m3, m2, m7 ; left[base+0]
.w32_loop:
paddsw m1, m2, m15 ; base+1
paddsw m2, m1, m15 ; base+2
vpermi2w m1, m6, m7 ; left[base+1]
psubw m0, m1, m3
pmulhrsw m0, m4
paddw m0, m3
mova m3, m6
vpermt2w m3, m2, m7 ; left[base+2]
mova [dstq+strideq*0], m0
psubw m0, m3, m1
pmulhrsw m0, m4
paddw m0, m1
mova [dstq+strideq*1], m0
lea dstq, [dstq+strideq*2]
sub hd, 2
jg .w32_loop
RET
.w32_h64:
mova m9, [tlq-64*3]
vpermw m8, m0, m9
test angled, 0x400
jnz .w32_h64_main
vpbroadcastw xm9, xm9
call .filter96
.w32_h64_main:
vpbroadcastw m4, dyd
vpbroadcastd m1, [base+pw_32672]
pmullw m4, [base+pw_1to32] ; ypos
vpbroadcastd m9, [base+pw_32735]
psrlw m2, m4, 6
psllw m4, 9
paddsw m2, m1 ; base+0
vpandd m4, m14 ; frac << 9
mova m3, m7
vpermt2w m3, m2, m6
vpcmpgtw k1, m2, m9
vpermw m3{k1}, m2, m8 ; left[base+0]
.w32_h64_loop:
paddsw m2, m15 ; base+1
mova m1, m7
vpermt2w m1, m2, m6
vpcmpgtw k1, m2, m9
vpermw m1{k1}, m2, m8 ; left[base+1]
psubw m0, m1, m3
pmulhrsw m0, m4
paddsw m2, m15 ; base+2
paddw m0, m3
mova m3, m7
vpermt2w m3, m2, m6
vpcmpgtw k1, m2, m9
vpermw m3{k1}, m2, m8 ; left[base+2]
mova [dstq+strideq*0], m0
psubw m0, m3, m1
pmulhrsw m0, m4
paddw m0, m1
mova [dstq+strideq*1], m0
lea dstq, [dstq+strideq*2]
sub hd, 2
jg .w32_h64_loop
RET
.filter96:
valignq m11, m8, m7, 6
call .filter64
valignq m2, m9, m8, 2
palignr m3, m8, m11, 12
palignr m4, m8, m11, 14
palignr m1, m2, m8, 4
paddw m3, m5
palignr m2, m8, 2
paddw m8, m4
pavgw m3, m1
paddw m8, m2
paddw m8, m3
psrlw m8, 2
ret
.w64:
mova m7, [tlq-64*1]
vpermw m6, m0, m7
cmp hd, 32
jl .w64_h16
mova m8, [tlq-64*2]
vpermw m7, m0, m8
jg .w64_h64
test angled, 0x400
jnz .w64_main
vpbroadcastw m8, xm8
mova m9, m8
call .filter96
vshufi32x4 m9, m8, m8, q3333
jmp .w64_h64_main
.w64_h16:
vpbroadcastw m7, xm7
test angled, 0x400
jnz .w64_main
mova m8, m7
call .filter64
.w64_main:
vpbroadcastw m11, dyd
vpbroadcastd m1, [base+pw_32704]
pmullw m10, m11, [base+pw_1to32] ; ypos
psllw m11, 5
psrlw m8, m10, 6
paddw m11, m10
psllw m10, 9
psrlw m9, m11, 6
psllw m11, 9
psubw m9, m8
paddsw m8, m1 ; base+0
vpandd m10, m14 ; frac << 9
vpandd m11, m14 ; frac << 9
mova m4, m6
vpermt2w m4, m8, m7 ; left[base+0] ( 0..31)
paddsw m5, m8, m9
vpermi2w m5, m6, m7 ; left[base+0] (32..63)
.w64_loop:
paddsw m8, m15 ; base+1 ( 0..31)
mova m2, m6
vpermt2w m2, m8, m7 ; left[base+1] ( 0..31)
paddsw m3, m8, m9 ; base+1 (32..63)
vpermi2w m3, m6, m7 ; left[base+1] (32..63)
psubw m0, m2, m4
psubw m1, m3, m5
pmulhrsw m0, m10
pmulhrsw m1, m11
paddw m0, m4
paddw m1, m5
mova m4, m2
mova [dstq+64*0], m0
mova m5, m3
mova [dstq+64*1], m1
add dstq, strideq
dec hd
jg .w64_loop
RET
.w64_h64:
vpermw m8, m0, [tlq-64*3]
mova m13, [tlq-64*4]
vpermw m9, m0, m13
test angled, 0x400
jnz .w64_h64_main
valignq m12, m9, m8, 6
call .filter96
vpbroadcastw xm2, xm13
valignq m2, m9, 2
palignr m3, m9, m12, 12
palignr m4, m9, m12, 14
palignr m1, m2, m9, 4
paddw m3, m5
palignr m2, m9, 2
paddw m9, m4
pavgw m3, m1
paddw m9, m2
paddw m9, m3
psrlw m9, 2
.w64_h64_main:
vpbroadcastw m11, dyd
vpbroadcastd m1, [base+pw_32640]
pmullw m10, m11, [base+pw_1to32] ; ypos
psllw m11, 5
psrlw m12, m10, 6
paddw m11, m10
psllw m10, 9
psrlw m13, m11, 6
psllw m11, 9
psubw m13, m12
paddsw m12, m1 ; base+0
vpandd m10, m14 ; frac << 9
vpandd m11, m14 ; frac << 9
vpbroadcastd m14, [base+pw_64]
mova m4, m6
vpermt2w m4, m12, m7
vptestmw k1, m12, m14
mova m0, m8
vpermt2w m0, m12, m9
paddsw m1, m12, m13
mova m5, m6
vpermt2w m5, m1, m7
vptestmw k2, m1, m14
vpermi2w m1, m8, m9
vmovdqu16 m4{k1}, m0 ; left[base+0] ( 0..31)
vmovdqu16 m5{k2}, m1 ; left[base+0] (32..63)
.w64_h64_loop:
paddsw m12, m15 ; base+1
mova m2, m6
vpermt2w m2, m12, m7
vptestmw k1, m12, m14
mova m0, m8
vpermt2w m0, m12, m9
paddsw m1, m12, m13
mova m3, m6
vpermt2w m3, m1, m7
vptestmw k2, m1, m14
vpermi2w m1, m8, m9
vmovdqu16 m2{k1}, m0 ; left[base+1] ( 0..31)
vmovdqu16 m3{k2}, m1 ; left[base+1] (32..63)
psubw m0, m2, m4
psubw m1, m3, m5
pmulhrsw m0, m10
pmulhrsw m1, m11
paddw m0, m4
paddw m1, m5
mova m4, m2
mova [dstq+64*0], m0
mova m5, m3
mova [dstq+64*1], m1
add dstq, strideq
dec hd
jg .w64_h64_loop
RET
cglobal pal_pred_16bpc, 4, 7, 7, dst, stride, pal, idx, w, h, stride3
lea r6, [pal_pred_16bpc_avx512icl_table]
tzcnt wd, wm

@ -132,7 +132,8 @@ for d in "${dirs[@]}"; do
fi
done
if [ ${#files[@]} -eq 0 ]; then
num_files="${#files[@]}"
if [ "$num_files" -eq 0 ]; then
error "Error! No files found at ${dirs[*]}"
fi
@ -148,17 +149,17 @@ for i in "${!files[@]}"; do
md5=$(<"${md5/%obu/md5}") || error "Error! Can't read md5 ${md5} for file ${f}"
md5=${md5/ */}
printf "\033[1K\r[%3d%% %d/%d] Verifying %s" "$(((i+1)*100/${#files[@]}))" "$((i+1))" "${#files[@]}" "$f"
printf '\033[1K\r[%3d%% %*d/%d] Verifying %s' "$(((i+1)*100/num_files))" "${#num_files}" "$((i+1))" "$num_files" "${f#"$ARGON_DIR"/}"
cmd=("$DAV1D" -i "$f" --filmgrain "$FILMGRAIN" --verify "$md5" --cpumask "$CPUMASK" --threads "$THREADS" -q)
if [ "$JOBS" -gt 1 ]; then
"${cmd[@]}" 2>/dev/null &
p=$!
pids+=("$p")
declare "file$p=$f"
declare "file$p=${f#"$ARGON_DIR"/}"
block_pids
else
if ! "${cmd[@]}" 2>/dev/null; then
fail "$f"
fail "${f#"$ARGON_DIR"/}"
fi
fi
done
@ -166,9 +167,9 @@ done
wait_all_pids
if [ "$failed" -ne 0 ]; then
printf "\033[1K\r%d/%d files \033[1;91mfailed\033[0m to verify" "$failed" "${#files[@]}"
printf "\033[1K\r%d/%d files \033[1;91mfailed\033[0m to verify" "$failed" "$num_files"
else
printf "\033[1K\r%d files \033[1;92msuccessfully\033[0m verified" "${#files[@]}"
printf "\033[1K\r%d files \033[1;92msuccessfully\033[0m verified" "$num_files"
fi
printf " in %dm%ds (%s)\n" "$((SECONDS/60))" "$((SECONDS%60))" "$ver_info"

@ -69,6 +69,8 @@ if is_asm_enabled
checkasm_asm_sources += files('checkasm/arm/checkasm_64.S')
elif host_machine.cpu_family().startswith('arm')
checkasm_asm_sources += files('checkasm/arm/checkasm_32.S')
elif host_machine.cpu_family() == 'riscv64'
checkasm_asm_sources += files('checkasm/riscv/checkasm_64.S')
elif host_machine.cpu_family().startswith('x86')
checkasm_asm_objs += nasm_gen.process(files('checkasm/x86/checkasm.asm'))
endif
@ -128,7 +130,7 @@ endforeach
subdir('libfuzzer')
# seek stress test binary, depends on dav1d cli tool
if get_option('enable_tools')
if (get_option('enable_tools') and get_option('enable_seek_stress'))
seek_stress_sources = files('seek_stress.c')
seek_stress = executable('seek_stress',
seek_stress_sources, rev_target,