mirror of
https://github.com/mozilla/gecko-dev.git
synced 2025-02-17 06:09:19 +00:00
Bug 1716453 - Update dav1d to new version ddbbfde for Firefox 91. r=mjf
Differential Revision: https://phabricator.services.mozilla.com/D118295
This commit is contained in:
parent
e7acc848b5
commit
36b612a851
@ -93,8 +93,9 @@ if CONFIG['CPU_ARCH'] in ('x86', 'x86_64'):
|
||||
'../../../third_party/dav1d/src/x86/itx16_avx2.asm',
|
||||
'../../../third_party/dav1d/src/x86/itx_avx2.asm',
|
||||
'../../../third_party/dav1d/src/x86/loopfilter16_avx2.asm',
|
||||
'../../../third_party/dav1d/src/x86/loopfilter16_sse.asm',
|
||||
'../../../third_party/dav1d/src/x86/loopfilter_avx2.asm',
|
||||
'../../../third_party/dav1d/src/x86/looprestoration16_avx2.asm', # moved from autovendored
|
||||
'../../../third_party/dav1d/src/x86/looprestoration16_avx2.asm',
|
||||
'../../../third_party/dav1d/src/x86/looprestoration_avx2.asm',
|
||||
'../../../third_party/dav1d/src/x86/mc16_avx2.asm',
|
||||
'../../../third_party/dav1d/src/x86/mc_avx2.asm',
|
||||
@ -110,7 +111,9 @@ if CONFIG['CPU_ARCH'] in ('x86', 'x86_64'):
|
||||
'../../../third_party/dav1d/src/x86/itx16_sse.asm',
|
||||
'../../../third_party/dav1d/src/x86/itx_sse.asm',
|
||||
'../../../third_party/dav1d/src/x86/loopfilter_sse.asm',
|
||||
'../../../third_party/dav1d/src/x86/looprestoration16_sse.asm', # moved from autovendored
|
||||
'../../../third_party/dav1d/src/x86/looprestoration_sse.asm',
|
||||
'../../../third_party/dav1d/src/x86/mc16_sse.asm',
|
||||
'../../../third_party/dav1d/src/x86/mc_sse.asm',
|
||||
'../../../third_party/dav1d/src/x86/msac.asm',
|
||||
]
|
||||
@ -206,6 +209,8 @@ elif CONFIG['CPU_ARCH'] == 'arm' or CONFIG['CPU_ARCH'] == 'aarch64':
|
||||
'../../../third_party/dav1d/src/arm/32/cdef.S',
|
||||
'../../../third_party/dav1d/src/arm/32/cdef16.S',
|
||||
'../../../third_party/dav1d/src/arm/32/cdef_tmpl.S',
|
||||
'../../../third_party/dav1d/src/arm/32/film_grain.S',
|
||||
'../../../third_party/dav1d/src/arm/32/film_grain16.S',
|
||||
'../../../third_party/dav1d/src/arm/32/ipred.S',
|
||||
'../../../third_party/dav1d/src/arm/32/ipred16.S',
|
||||
'../../../third_party/dav1d/src/arm/32/itx.S',
|
||||
|
@ -20,11 +20,11 @@ origin:
|
||||
|
||||
# Human-readable identifier for this version/release
|
||||
# Generally "version NNN", "tag SSS", "bookmark SSS"
|
||||
release: commit c54add020492e3cca0da5ab90fa69c92ba496384 (2021-05-18T02:50:02.000+02:00).
|
||||
release: commit ddbbfde198aced0d02ea739c320d754d43406f7b (2021-06-12T07:58:29.000+00:00).
|
||||
|
||||
# Revision to pull in
|
||||
# Must be a long or short commit SHA (long preferred)
|
||||
revision: c54add020492e3cca0da5ab90fa69c92ba496384
|
||||
revision: ddbbfde198aced0d02ea739c320d754d43406f7b
|
||||
|
||||
# The package's license, where possible using the mnemonic from
|
||||
# https://spdx.org/licenses/
|
||||
|
@ -1,2 +1,2 @@
|
||||
/* auto-generated, do not edit */
|
||||
#define DAV1D_VERSION "0.9.0-1-gc54add0"
|
||||
#define DAV1D_VERSION "0.9.0-24-gddbbfde"
|
||||
|
2
third_party/dav1d/README.md
vendored
2
third_party/dav1d/README.md
vendored
@ -60,7 +60,7 @@ Our contributions guidelines are quite strict. We want to build a coherent codeb
|
||||
|
||||
Notably, the codebase is in pure C and asm.
|
||||
|
||||
We are on IRC, on the **#dav1d** channel on *Freenode*.
|
||||
We are on IRC, on the **#dav1d** channel on [*Libera.chat*](http://libera.chat/). If you do not have an IRC Client at hand, use [KiwiIRC Web Interface](https://kiwiirc.com/nextclient/#ircs://irc.libera.chat/#dav1d).
|
||||
|
||||
See the [contributions document](CONTRIBUTING.md).
|
||||
|
||||
|
714
third_party/dav1d/src/arm/32/film_grain.S
vendored
Normal file
714
third_party/dav1d/src/arm/32/film_grain.S
vendored
Normal file
@ -0,0 +1,714 @@
|
||||
/*
|
||||
* Copyright © 2021, VideoLAN and dav1d authors
|
||||
* Copyright © 2021, Martin Storsjo
|
||||
* All rights reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright notice, this
|
||||
* list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
* this list of conditions and the following disclaimer in the documentation
|
||||
* and/or other materials provided with the distribution.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
|
||||
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
|
||||
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
|
||||
* ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
|
||||
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
|
||||
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
|
||||
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
#include "src/arm/asm.S"
|
||||
#include "util.S"
|
||||
#include "src/arm/asm-offsets.h"
|
||||
|
||||
#define GRAIN_WIDTH 82
|
||||
|
||||
// Byte-wise table lookup for four lanes of two vectors at a time.
// For each lane n in {0, 2, 4, 6} (plus \off), the byte in lane n of
// \src1 / \src2 is moved to a core register, r3 is added to it, and the
// byte at the resulting address is loaded into lane n of \dst1 / \dst2.
// Invoking it with off=0 and off=1 covers all eight lanes of a d register.
// r3 is only read (it acts as the table base address); r11, r12 and lr are
// used as scratch and are clobbered. The three scratch registers are
// rotated so that index extraction, address computation and the loads of
// different lanes are interleaved.
.macro gather_interleaved dst1, dst2, src1, src2, off
|
||||
vmov.u8 r11, \src1[0+\off]
|
||||
vmov.u8 r12, \src2[0+\off]
|
||||
add r11, r11, r3
|
||||
vmov.u8 lr, \src1[2+\off]
|
||||
add r12, r12, r3
|
||||
vld1.8 {\dst1[0+\off]}, [r11]
|
||||
vmov.u8 r11, \src2[2+\off]
|
||||
add lr, lr, r3
|
||||
vld1.8 {\dst2[0+\off]}, [r12]
|
||||
vmov.u8 r12, \src1[4+\off]
|
||||
add r11, r11, r3
|
||||
vld1.8 {\dst1[2+\off]}, [lr]
|
||||
vmov.u8 lr, \src2[4+\off]
|
||||
add r12, r12, r3
|
||||
vld1.8 {\dst2[2+\off]}, [r11]
|
||||
vmov.u8 r11, \src1[6+\off]
|
||||
add lr, lr, r3
|
||||
vld1.8 {\dst1[4+\off]}, [r12]
|
||||
vmov.u8 r12, \src2[6+\off]
|
||||
add r11, r11, r3
|
||||
vld1.8 {\dst2[4+\off]}, [lr]
|
||||
add r12, r12, r3
|
||||
vld1.8 {\dst1[6+\off]}, [r11]
|
||||
vld1.8 {\dst2[6+\off]}, [r12]
|
||||
.endm
|
||||
|
||||
// Full table lookup for four d registers: every byte lane of \src1..\src4
// is used as an index (relative to r3) and the looked-up byte is written to
// the same lane of \dst1..\dst4 respectively. Registers are processed in
// pairs (1 with 3, 2 with 4), each pair once for the even lanes (off=0) and
// once for the odd lanes (off=1). Clobbers r11, r12 and lr.
.macro gather dst1, dst2, dst3, dst4, src1, src2, src3, src4
|
||||
gather_interleaved \dst1, \dst3, \src1, \src3, 0
|
||||
gather_interleaved \dst1, \dst3, \src1, \src3, 1
|
||||
gather_interleaved \dst2, \dst4, \src2, \src4, 0
|
||||
gather_interleaved \dst2, \dst4, \src2, \src4, 1
|
||||
.endm
|
||||
|
||||
// Looks up 32 bytes: the 32 byte indices in d0-d3 (q0, q1) are resolved
// against the table at r3 and the results are written to d8-d11 (q4, q5).
// r11, r12 and lr are saved and restored around the lookup because the
// gather macro uses them as scratch; the pop into pc also performs the return.
function gather32_neon
|
||||
push {r11-r12,lr}
|
||||
gather d8, d9, d10, d11, d0, d1, d2, d3
|
||||
pop {r11-r12,pc}
|
||||
endfunc
|
||||
|
||||
// Looks up 16 bytes: the 16 byte indices in d0, d1 (q0) are resolved against
// the table at r3 and the results are written to d8, d9 (q4).
// r11, r12 and lr are saved and restored around the lookup because
// gather_interleaved uses them as scratch; the pop into pc also returns.
function gather16_neon
|
||||
push {r11-r12,lr}
|
||||
gather_interleaved d8, d9, d0, d1, 0
|
||||
gather_interleaved d8, d9, d0, d1, 1
|
||||
pop {r11-r12,pc}
|
||||
endfunc
|
||||
|
||||
// Overlap blending weights used when sx == 0 (also used by the luma code).
// The users load the first row into d24 and the second into d25, compute
// old_grain * row0 + grain * row1 per column and narrow with a rounding
// shift right by 5. Columns whose weights are 0 / 32 therefore leave the
// current grain unchanged, so only the first two columns are blended.
// The first two entries of each row are also reused as the per-row weights
// of the vertical (top) overlap.
const overlap_coeffs_0, align=4
|
||||
.byte 27, 17, 0, 0, 0, 0, 0, 0
|
||||
.byte 17, 27, 32, 32, 32, 32, 32, 32
|
||||
endconst
|
||||
|
||||
// Overlap blending weights selected by the chroma code when sx == 1
// (via overlap_coeffs_\sx). Same layout as overlap_coeffs_0: row 0 is
// multiplied with the "old" grain, row 1 with the current grain, and the sum
// is narrowed with a rounding shift right by 5. Only the first column is
// blended here; the remaining 0 / 32 columns pass the current grain through.
const overlap_coeffs_1, align=4
|
||||
.byte 23, 0, 0, 0, 0, 0, 0, 0
|
||||
.byte 22, 32, 32, 32, 32, 32, 32, 32
|
||||
endconst
|
||||
|
||||
// Splits the random value in \src into a vertical and a horizontal offset:
//   \offy = \src & 0xF, doubled when \sy == 0
//   \offx = \src >> 4,  doubled when \sx == 0
// \sx and \sy are assemble-time constants (the subsampling flags).
// \offx may be the same register as \src (the callers rely on this), but
// \offy must not be, since \src is still read after \offy has been written.
.macro calc_offset offx, offy, src, sx, sy
|
||||
and \offy, \src, #0xF // randval & 0xF
|
||||
lsr \offx, \src, #4 // randval >> 4
|
||||
.if \sy == 0
|
||||
add \offy, \offy, \offy // 2 * (randval & 0xF)
|
||||
.endif
|
||||
.if \sx == 0
|
||||
add \offx, \offx, \offx // 2 * (randval >> 4)
|
||||
.endif
|
||||
.endm
|
||||
|
||||
// Computes \dst = \src + \stride * \offy + \offx, i.e. advances a pointer
// into the grain lookup table by \offy rows and \offx one-byte entries.
// \dst may be the same register as \offy or \src, but not \offx, which is
// read after \dst has been written.
.macro add_offset dst, offx, offy, src, stride
|
||||
mla \dst, \stride, \offy, \src // grain_lut += grain_stride * offy
|
||||
add \dst, \dst, \offx // grain_lut += offx
|
||||
.endm
|
||||
|
||||
// void dav1d_fgy_32x32_8bpc_neon(pixel *const dst, const pixel *const src,
|
||||
// const ptrdiff_t stride,
|
||||
// const uint8_t scaling[SCALING_SIZE],
|
||||
// const int scaling_shift,
|
||||
// const entry grain_lut[][GRAIN_WIDTH],
|
||||
// const int offsets[][2],
|
||||
// const int h, const ptrdiff_t clip,
|
||||
// const ptrdiff_t type);
|
||||
// Entry point of the 8 bpc luma film grain routine (C prototype in the
// comment above). It only performs setup: it loads the stack arguments,
// prepares the constant vectors and the grain_lut pointers, selects one of
// the four loop variants in fgy_loop_neon from `type` and jumps to it.
// The registers pushed here are popped by the loop code, which also returns
// to the caller.
// State handed to the loop:
//   r0-r3   untouched register arguments (dst, src, stride, scaling)
//   r4      grain_lut pointer for offsets[1][0] ("old" in the loop)
//   r5      grain_lut pointer for offsets[0][0] (current block)
//   r6      grain_lut pointer for offsets[0][1] ("top" in the loop)
//   r8      grain_lut pointer for offsets[1][1] ("top old" in the loop)
//   r7      number of rows for the first pass, r9 grain_lut stride
//   r10     full h (only set up when y overlap is requested)
//   d24/d25 overlap_coeffs_0, q13 -scaling_shift, q14/q15 min/max clamp
//   d14/d15 y overlap weights for the first row (y overlap only)
function fgy_32x32_8bpc_neon, export=1
|
||||
push {r4-r11,lr}
|
||||
vpush {q4-q7}
|
||||
// 36 bytes of core registers plus 64 bytes of q4-q7 have been pushed, so
// the first stack argument is found at sp + 100.
ldrd r4, r5, [sp, #100] // scaling_shift, grain_lut
|
||||
ldrd r6, r7, [sp, #108] // offsets, h
|
||||
ldr r8, [sp, #116] // clip
|
||||
mov r9, #GRAIN_WIDTH // grain_lut stride
|
||||
|
||||
// Negated so that vrshl in the loop performs a rounding right shift.
neg r4, r4
|
||||
vdup.16 q13, r4 // -scaling_shift
|
||||
// The flags set here are consumed by the "beq 1f" further down.
cmp r8, #0
|
||||
|
||||
movrel_local r12, overlap_coeffs_0
|
||||
|
||||
beq 1f
|
||||
// clip
|
||||
vmov.i8 q14, #16
|
||||
vmov.i8 q15, #235
|
||||
b 2f
|
||||
1:
|
||||
// no clip
|
||||
vmov.i8 q14, #0
|
||||
vmov.i8 q15, #255
|
||||
2:
|
||||
|
||||
vld1.8 {d24, d25}, [r12, :128] // overlap_coeffs
|
||||
|
||||
// Skip the first 9 rows and 9 columns of grain_lut.
add r5, r5, #9 // grain_lut += 9
|
||||
add r5, r5, r9, lsl #3 // grain_lut += 8 * grain_stride
|
||||
add r5, r5, r9 // grain_lut += grain_stride
|
||||
|
||||
// Derive one grain_lut pointer per offsets[][] entry. r5 itself is
// updated last because it is the base for the other three pointers.
ldr r10, [r6, #8] // offsets[1][0]
|
||||
calc_offset r10, r4, r10, 0, 0
|
||||
add_offset r4, r10, r4, r5, r9
|
||||
ldr r10, [r6, #4] // offsets[0][1]
|
||||
calc_offset r10, r11, r10, 0, 0
|
||||
add_offset r11, r10, r11, r5, r9
|
||||
ldr r10, [r6, #12] // offsets[1][1]
|
||||
calc_offset r10, r8, r10, 0, 0
|
||||
add_offset r8, r10, r8, r5, r9
|
||||
ldr r6, [r6] // offsets[0][0]
|
||||
calc_offset r6, lr, r6, 0, 0
|
||||
add_offset r5, r6, lr, r5, r9
|
||||
|
||||
add r4, r4, #32 // grain_lut += BLOCK_SIZE * bx
|
||||
add r6, r11, r9, lsl #5 // grain_lut += grain_stride * BLOCK_SIZE * by
|
||||
|
||||
ldr r10, [sp, #120] // type
|
||||
adr r11, L(fgy_loop_tbl)
|
||||
|
||||
// Bit 0 of type selects y overlap. The flags from this tst stay valid
// until the "beq 1f" below since the instructions in between do not set
// flags. type as a whole indexes the table of loop variants.
tst r10, #1
|
||||
ldr r10, [r11, r10, lsl #2]
|
||||
|
||||
add r8, r8, r9, lsl #5 // grain_lut += grain_stride * BLOCK_SIZE * by
|
||||
add r8, r8, #32 // grain_lut += BLOCK_SIZE * bx
|
||||
|
||||
// Table entries are offsets relative to the table itself.
add r11, r11, r10
|
||||
|
||||
beq 1f
|
||||
// y overlap
|
||||
// The first pass covers only the two rows that overlap the block above.
vdup.8 d14, d24[0]
|
||||
vdup.8 d15, d24[1]
|
||||
mov r10, r7 // backup actual h
|
||||
mov r7, #2
|
||||
1:
|
||||
bx r11
|
||||
endfunc
|
||||
|
||||
// Row loops of the 8 bpc luma film grain routine. This is not called
// directly: fgy_32x32_8bpc_neon jumps to one of the four variants through
// the table below (index = 2 * ox + oy) after setting up all registers, and
// each variant ends by popping the registers pushed by that entry point and
// returning to its caller. One iteration processes one row of 32 pixels.
function fgy_loop_neon
|
||||
// Offsets of the loop variants relative to the table. CONFIG_THUMB is added
// so that the computed address can be used with bx in Thumb builds
// (presumably it sets the Thumb bit - confirm against asm.S).
L(fgy_loop_tbl):
|
||||
.word L(loop_00) - L(fgy_loop_tbl) + CONFIG_THUMB
|
||||
.word L(loop_01) - L(fgy_loop_tbl) + CONFIG_THUMB
|
||||
.word L(loop_10) - L(fgy_loop_tbl) + CONFIG_THUMB
|
||||
.word L(loop_11) - L(fgy_loop_tbl) + CONFIG_THUMB
|
||||
|
||||
// Emits one loop variant. ox / oy are assemble-time flags enabling the
// blend with the "old" grain (first columns) and the "top" grain.
.macro fgy ox, oy
|
||||
L(loop_\ox\oy):
|
||||
1:
|
||||
.if \ox
|
||||
vld1.8 {d8}, [r4], r9 // grain_lut old
|
||||
.endif
|
||||
.if \oy
|
||||
vld1.8 {q2, q3}, [r6], r9 // grain_lut top
|
||||
.endif
|
||||
.if \ox && \oy
|
||||
vld1.8 {d10}, [r8], r9 // grain_lut top old
|
||||
.endif
|
||||
vld1.8 {q0, q1}, [r1, :128], r2 // src
|
||||
vld1.8 {q10, q11}, [r5], r9 // grain_lut
|
||||
|
||||
.if \ox
|
||||
// x overlap: old * d24 + grain * d25 for the first eight columns.
vmull.s8 q4, d8, d24
|
||||
vmlal.s8 q4, d20, d25
|
||||
.endif
|
||||
|
||||
.if \oy
|
||||
.if \ox
|
||||
// The same x overlap blend is applied to the top row before the rows are
// blended vertically.
vmull.s8 q5, d10, d24
|
||||
vmlal.s8 q5, d4, d25
|
||||
vqrshrn.s16 d20, q4, #5
|
||||
vqrshrn.s16 d4, q5, #5
|
||||
.endif
|
||||
|
||||
// y overlap: grain * d15 + top * d14, rounded and narrowed back to 8 bit.
vmull.s8 q4, d20, d15
|
||||
vmull.s8 q5, d21, d15
|
||||
vmull.s8 q8, d22, d15
|
||||
vmull.s8 q9, d23, d15
|
||||
vmlal.s8 q4, d4, d14
|
||||
vmlal.s8 q5, d5, d14
|
||||
vmlal.s8 q8, d6, d14
|
||||
vmlal.s8 q9, d7, d14
|
||||
vqrshrn.s16 d20, q4, #5
|
||||
vqrshrn.s16 d21, q5, #5
|
||||
vqrshrn.s16 d22, q8, #5
|
||||
vqrshrn.s16 d23, q9, #5
|
||||
.elseif \ox
|
||||
vqrshrn.s16 d20, q4, #5
|
||||
.endif
|
||||
|
||||
// Look up one byte per source pixel (q0, q1 are the indices, r3 the table
// base); the results arrive in d8-d11.
bl gather32_neon
|
||||
|
||||
vmovl.s8 q8, d20 // grain
|
||||
vmovl.s8 q9, d21
|
||||
vmovl.s8 q10, d22
|
||||
vmovl.s8 q11, d23
|
||||
|
||||
vmovl.u8 q2, d8 // scaling
|
||||
vmovl.u8 q3, d9
|
||||
vmovl.u8 q4, d10
|
||||
vmovl.u8 q5, d11
|
||||
|
||||
vmul.i16 q8, q8, q2 // scaling * grain
|
||||
vmul.i16 q9, q9, q3
|
||||
vmul.i16 q10, q10, q4
|
||||
vmul.i16 q11, q11, q5
|
||||
|
||||
// q13 holds -scaling_shift, so this is a rounding right shift.
vrshl.s16 q8, q8, q13 // round2(scaling * grain, scaling_shift)
|
||||
vrshl.s16 q9, q9, q13
|
||||
vrshl.s16 q10, q10, q13
|
||||
vrshl.s16 q11, q11, q13
|
||||
|
||||
vaddw.u8 q8, q8, d0 // *src + noise
|
||||
vaddw.u8 q9, q9, d1
|
||||
vaddw.u8 q10, q10, d2
|
||||
vaddw.u8 q11, q11, d3
|
||||
|
||||
// Saturate to 8 bit, then clamp to the [q14, q15] range chosen by clip.
vqmovun.s16 d0, q8
|
||||
vqmovun.s16 d1, q9
|
||||
vqmovun.s16 d2, q10
|
||||
vqmovun.s16 d3, q11
|
||||
|
||||
vmax.u8 q0, q0, q14
|
||||
vmax.u8 q1, q1, q14
|
||||
vmin.u8 q0, q0, q15
|
||||
vmin.u8 q1, q1, q15
|
||||
|
||||
// The flags set here are consumed by "bgt 1b"; the NEON instructions in
// between do not modify them.
subs r7, r7, #1
|
||||
.if \oy
|
||||
// Switch to the weights of the second overlapped row.
vdup.8 d14, d25[0]
|
||||
vdup.8 d15, d25[1]
|
||||
.endif
|
||||
vst1.8 {q0, q1}, [r0, :128], r2 // dst
|
||||
bgt 1b
|
||||
|
||||
.if \oy
|
||||
// The two overlapped rows are done; if h > 2, process the remaining rows
// with the variant that has no y overlap (same ox).
cmp r10, #2
|
||||
sub r7, r10, #2 // restore actual remaining h
|
||||
bgt L(loop_\ox\()0)
|
||||
.endif
|
||||
// Undo the pushes done by the entry point and return to its caller.
vpop {q4-q7}
|
||||
pop {r4-r11,pc}
|
||||
.endm
|
||||
|
||||
fgy 0, 0
|
||||
fgy 0, 1
|
||||
fgy 1, 0
|
||||
fgy 1, 1
|
||||
endfunc
|
||||
|
||||
// void dav1d_fguv_32x32_420_8bpc_neon(pixel *const dst,
|
||||
// const pixel *const src,
|
||||
// const ptrdiff_t stride,
|
||||
// const uint8_t scaling[SCALING_SIZE],
|
||||
// const Dav1dFilmGrainData *const data,
|
||||
// const entry grain_lut[][GRAIN_WIDTH],
|
||||
// const pixel *const luma_row,
|
||||
// const ptrdiff_t luma_stride,
|
||||
// const int offsets[][2],
|
||||
// const ptrdiff_t h, const ptrdiff_t uv,
|
||||
// const ptrdiff_t is_id,
|
||||
// const ptrdiff_t type);
|
||||
// Emits the entry point fguv_32x32_<layout>_8bpc_neon for one chroma layout
// (C prototype in the comment above); sx / sy are the assemble-time
// horizontal / vertical subsampling flags. Like the luma entry point it only
// performs setup and then jumps, via a table indexed by `type`, into
// fguv_loop_sx0_neon or fguv_loop_sx1_neon, which pop the registers pushed
// here and return to the caller.
// State handed to the loop:
//   r0-r3   untouched register arguments (dst, src, stride, scaling)
//   r4 / r5 / r8 / r11  grain_lut pointers for offsets[1][0], [0][0],
//           [0][1], [1][1] ("old", current, "top", "top old" in the loops)
//   r6 / r7 luma_row and luma_stride (doubled when sy is set)
//   r9      number of rows for the first pass, r10 grain_lut stride
//   lr      remaining rows after the first pass (y overlap only)
//   d4      uv_luma_mult, uv_mult, uv_offset in lanes 0, 1, 2
//   d6 / d7 y overlap weights, d24 / d25 overlap_coeffs_<sx>
//   q13     -scaling_shift, q14 / q15 min / max clamp
.macro fguv layout, sx, sy
|
||||
function fguv_32x32_\layout\()_8bpc_neon, export=1
|
||||
push {r4-r11,lr}
|
||||
vpush {q4-q7}
|
||||
// 36 + 64 bytes have been pushed, so the stack arguments start at sp + 100.
ldrd r4, r5, [sp, #100] // data, grain_lut
|
||||
ldrd r6, r7, [sp, #108] // luma_row, luma_stride
|
||||
ldrd r8, r9, [sp, #116] // offsets, h
|
||||
ldrd r10, r11, [sp, #124] // uv, is_id
|
||||
|
||||
// !csfl
|
||||
// These three values are only consumed by the csfl == 0 loop variants.
add r10, r4, r10, lsl #2 // + 4*uv
|
||||
add r12, r10, #FGD_UV_LUMA_MULT
|
||||
add lr, r10, #FGD_UV_MULT
|
||||
add r10, r10, #FGD_UV_OFFSET
|
||||
vld1.16 {d4[]}, [r12] // uv_luma_mult
|
||||
vld1.16 {d4[2]}, [r10] // uv_offset
|
||||
vld1.16 {d4[1]}, [lr] // uv_mult
|
||||
|
||||
ldr lr, [r4, #FGD_SCALING_SHIFT]
|
||||
ldr r12, [r4, #FGD_CLIP_TO_RESTRICTED_RANGE]
|
||||
neg lr, lr // -scaling_shift
|
||||
|
||||
cmp r12, #0
|
||||
vdup.16 q13, lr // -scaling_shift
|
||||
|
||||
beq 1f
|
||||
// clip
|
||||
// The upper limit is 240, or 235 when is_id is nonzero.
cmp r11, #0
|
||||
vmov.i8 q14, #16
|
||||
vmov.i8 q15, #240
|
||||
beq 2f
|
||||
// is_id
|
||||
vmov.i8 q15, #235
|
||||
b 2f
|
||||
1:
|
||||
// no clip
|
||||
vmov.i8 q14, #0
|
||||
vmov.i8 q15, #255
|
||||
2:
|
||||
|
||||
mov r10, #GRAIN_WIDTH // grain_lut stride
|
||||
|
||||
add r5, r5, #(3 + (2 >> \sx)*3) // grain_lut += 9 or 6
|
||||
.if \sy
|
||||
add r5, r5, r10, lsl #2 // grain_lut += 4 * grain_stride
|
||||
add r5, r5, r10, lsl #1 // grain_lut += 2 * grain_stride
|
||||
.else
|
||||
add r5, r5, r10, lsl #3 // grain_lut += 8 * grain_stride
|
||||
add r5, r5, r10 // grain_lut += grain_stride
|
||||
.endif
|
||||
|
||||
// Derive one grain_lut pointer per offsets[][] entry. r5 itself is
// updated last because it is the base for the other three pointers.
ldr r12, [r8, #8] // offsets[1][0]
|
||||
calc_offset r12, r4, r12, \sx, \sy
|
||||
add_offset r4, r12, r4, r5, r10
|
||||
|
||||
ldr r12, [r8, #4] // offsets[0][1]
|
||||
calc_offset r12, lr, r12, \sx, \sy
|
||||
add_offset lr, r12, lr, r5, r10
|
||||
|
||||
ldr r12, [r8, #12] // offsets[1][1]
|
||||
calc_offset r12, r11, r12, \sx, \sy
|
||||
add_offset r11, r12, r11, r5, r10
|
||||
|
||||
ldr r8, [r8] // offsets[0][0]
|
||||
calc_offset r8, r12, r8, \sx, \sy
|
||||
add_offset r5, r8, r12, r5, r10
|
||||
|
||||
add r4, r4, #(32 >> \sx) // grain_lut += BLOCK_SIZE * bx
|
||||
add r8, lr, r10, lsl #(5 - \sy) // grain_lut += grain_stride * BLOCK_SIZE * by
|
||||
add r11, r11, r10, lsl #(5 - \sy) // grain_lut += grain_stride * BLOCK_SIZE * by
|
||||
add r11, r11, #(32 >> \sx) // grain_lut += BLOCK_SIZE * bx
|
||||
|
||||
movrel_local r12, overlap_coeffs_\sx
|
||||
ldr lr, [sp, #132] // type
|
||||
|
||||
vld1.8 {d24, d25}, [r12, :128] // overlap_coeffs
|
||||
|
||||
movrel_local r12, L(fguv_loop_sx\sx\()_tbl)
|
||||
#if CONFIG_THUMB
|
||||
// This uses movrel_local instead of adr above, because the target
|
||||
// can be out of range for adr. But movrel_local leaves the thumb bit
|
||||
// set on COFF (but probably wouldn't if building for thumb on ELF),
|
||||
// thus try to clear the bit for robustness.
|
||||
bic r12, r12, #1
|
||||
#endif
|
||||
|
||||
// Bit 0 of type selects y overlap; the flags from this tst stay valid
// until the "beq 1f" below. type as a whole indexes the eight-entry table
// of loop variants (csfl, ox, oy).
tst lr, #1
|
||||
ldr lr, [r12, lr, lsl #2]
|
||||
|
||||
add r12, r12, lr
|
||||
|
||||
beq 1f
|
||||
// y overlap
|
||||
// The first pass covers only the rows that overlap the block above
// (2, or 1 when vertically subsampled).
sub lr, r9, #(2 >> \sy) // backup remaining h
|
||||
mov r9, #(2 >> \sy)
|
||||
|
||||
1:
|
||||
|
||||
// Weights for the first row of the y overlap: d6 is applied to the top
// grain and d7 to the current grain by the loops.
.if \sy
|
||||
vmov.i8 d6, #23
|
||||
vmov.i8 d7, #22
|
||||
.else
|
||||
vmov.i8 d6, #27
|
||||
vmov.i8 d7, #17
|
||||
.endif
|
||||
|
||||
.if \sy
|
||||
add r7, r7, r7 // luma_stride *= 2
|
||||
.endif
|
||||
|
||||
bx r12
|
||||
endfunc
|
||||
.endm
|
||||
|
||||
fguv 420, 1, 1
|
||||
fguv 422, 1, 0
|
||||
fguv 444, 0, 0
|
||||
|
||||
// Row loops of the 8 bpc chroma film grain routine without horizontal
// subsampling (sx == 0). This is not called directly: the fguv entry point
// jumps to one of the eight variants through the table below
// (index = 4 * csfl + 2 * ox + oy) after setting up all registers. Every
// variant ends at label 9, which pops the registers pushed by the entry
// point and returns to its caller. One iteration processes a row of 32
// pixels.
function fguv_loop_sx0_neon
|
||||
L(fguv_loop_sx0_tbl):
|
||||
.word L(fguv_loop_sx0_csfl0_00) - L(fguv_loop_sx0_tbl) + CONFIG_THUMB
|
||||
.word L(fguv_loop_sx0_csfl0_01) - L(fguv_loop_sx0_tbl) + CONFIG_THUMB
|
||||
.word L(fguv_loop_sx0_csfl0_10) - L(fguv_loop_sx0_tbl) + CONFIG_THUMB
|
||||
.word L(fguv_loop_sx0_csfl0_11) - L(fguv_loop_sx0_tbl) + CONFIG_THUMB
|
||||
.word L(fguv_loop_sx0_csfl1_00) - L(fguv_loop_sx0_tbl) + CONFIG_THUMB
|
||||
.word L(fguv_loop_sx0_csfl1_01) - L(fguv_loop_sx0_tbl) + CONFIG_THUMB
|
||||
.word L(fguv_loop_sx0_csfl1_10) - L(fguv_loop_sx0_tbl) + CONFIG_THUMB
|
||||
.word L(fguv_loop_sx0_csfl1_11) - L(fguv_loop_sx0_tbl) + CONFIG_THUMB
|
||||
|
||||
// Emits one loop variant. csfl selects whether the luma value is used
// directly as the lookup index (1) or first combined with the chroma source
// (0); ox / oy enable the blends with the "old" and "top" grain.
.macro fguv_loop_sx0 csfl, ox, oy
|
||||
L(fguv_loop_sx0_csfl\csfl\()_\ox\oy):
|
||||
.if \oy
|
||||
// lr (remaining rows) is overwritten by the bl below, so keep it in r12,
// which gather32_neon preserves.
mov r12, lr
|
||||
.endif
|
||||
1:
|
||||
.if \ox
|
||||
vld1.8 {d8}, [r4], r10 // grain_lut old
|
||||
.endif
|
||||
.if \oy
|
||||
vld1.8 {q8, q9}, [r8], r10 // grain_lut top
|
||||
.endif
|
||||
.if \ox && \oy
|
||||
vld1.8 {d10}, [r11], r10 // grain_lut top old
|
||||
.endif
|
||||
vld1.8 {q0, q1}, [r6, :128], r7 // luma
|
||||
vld1.8 {q10, q11}, [r5], r10 // grain_lut
|
||||
|
||||
.if \ox
|
||||
// x overlap: old * d24 + grain * d25 for the first eight columns.
vmull.s8 q4, d8, d24
|
||||
vmlal.s8 q4, d20, d25
|
||||
.endif
|
||||
|
||||
.if \oy
|
||||
.if \ox
|
||||
// The same x overlap blend is applied to the top row first.
vmull.s8 q5, d10, d24
|
||||
vmlal.s8 q5, d16, d25
|
||||
vqrshrn.s16 d20, q4, #5
|
||||
vqrshrn.s16 d16, q5, #5
|
||||
.endif
|
||||
|
||||
// y overlap: grain * d7 + top * d6, rounded and narrowed back to 8 bit.
vmull.s8 q4, d20, d7
|
||||
vmull.s8 q5, d21, d7
|
||||
vmull.s8 q6, d22, d7
|
||||
vmull.s8 q7, d23, d7
|
||||
vmlal.s8 q4, d16, d6
|
||||
vmlal.s8 q5, d17, d6
|
||||
vmlal.s8 q6, d18, d6
|
||||
vmlal.s8 q7, d19, d6
|
||||
vqrshrn.s16 d20, q4, #5
|
||||
vqrshrn.s16 d21, q5, #5
|
||||
vqrshrn.s16 d22, q6, #5
|
||||
vqrshrn.s16 d23, q7, #5
|
||||
.elseif \ox
|
||||
vqrshrn.s16 d20, q4, #5
|
||||
.endif
|
||||
.if !\csfl
|
||||
// Lookup index = sat8(((luma * uv_luma_mult + src * uv_mult) >> 6)
//                     + uv_offset), computed in 16 bit lanes.
// src is read here without advancing r1; it is loaded again after the
// lookup.
vld1.8 {q8, q9}, [r1, :128] // src
|
||||
vmovl.u8 q4, d0
|
||||
vmovl.u8 q5, d1
|
||||
vmovl.u8 q6, d2
|
||||
vmovl.u8 q7, d3
|
||||
vmovl.u8 q0, d16
|
||||
vmovl.u8 q1, d17
|
||||
vmovl.u8 q8, d18
|
||||
vmovl.u8 q9, d19
|
||||
vmul.i16 q4, q4, d4[0]
|
||||
vmul.i16 q5, q5, d4[0]
|
||||
vmul.i16 q6, q6, d4[0]
|
||||
vmul.i16 q7, q7, d4[0]
|
||||
vmul.i16 q0, q0, d4[1]
|
||||
vmul.i16 q1, q1, d4[1]
|
||||
vmul.i16 q8, q8, d4[1]
|
||||
vmul.i16 q9, q9, d4[1]
|
||||
vqadd.s16 q4, q4, q0
|
||||
vqadd.s16 q5, q5, q1
|
||||
vqadd.s16 q6, q6, q8
|
||||
vqadd.s16 q7, q7, q9
|
||||
vdup.16 q0, d4[2]
|
||||
vshr.s16 q4, q4, #6
|
||||
vshr.s16 q5, q5, #6
|
||||
vshr.s16 q6, q6, #6
|
||||
vshr.s16 q7, q7, #6
|
||||
vadd.i16 q4, q4, q0
|
||||
vadd.i16 q5, q5, q0
|
||||
vadd.i16 q6, q6, q0
|
||||
vadd.i16 q7, q7, q0
|
||||
vqmovun.s16 d0, q4
|
||||
vqmovun.s16 d1, q5
|
||||
vqmovun.s16 d2, q6
|
||||
vqmovun.s16 d3, q7
|
||||
.endif
|
||||
|
||||
// Look up one byte per index in q0, q1 (r3 is the table base); the
// results arrive in d8-d11.
bl gather32_neon
|
||||
|
||||
vld1.8 {q0, q1}, [r1, :128], r2 // src
|
||||
|
||||
vmovl.s8 q8, d20 // grain
|
||||
vmovl.s8 q9, d21
|
||||
vmovl.s8 q10, d22
|
||||
vmovl.s8 q11, d23
|
||||
|
||||
vmovl.u8 q6, d8 // scaling
|
||||
vmovl.u8 q7, d9
|
||||
vmovl.u8 q4, d10
|
||||
vmovl.u8 q5, d11
|
||||
|
||||
vmul.i16 q8, q8, q6 // scaling * grain
|
||||
vmul.i16 q9, q9, q7
|
||||
vmul.i16 q10, q10, q4
|
||||
vmul.i16 q11, q11, q5
|
||||
|
||||
// q13 holds -scaling_shift, so this is a rounding right shift.
vrshl.s16 q8, q8, q13 // round2(scaling * grain, scaling_shift)
|
||||
vrshl.s16 q9, q9, q13
|
||||
vrshl.s16 q10, q10, q13
|
||||
vrshl.s16 q11, q11, q13
|
||||
|
||||
vaddw.u8 q8, q8, d0 // *src + noise
|
||||
vaddw.u8 q9, q9, d1
|
||||
vaddw.u8 q10, q10, d2
|
||||
vaddw.u8 q11, q11, d3
|
||||
|
||||
// Saturate to 8 bit, then clamp to the [q14, q15] range set up by the
// entry point.
vqmovun.s16 d0, q8
|
||||
vqmovun.s16 d1, q9
|
||||
vqmovun.s16 d2, q10
|
||||
vqmovun.s16 d3, q11
|
||||
|
||||
vmax.u8 q0, q0, q14
|
||||
vmax.u8 q1, q1, q14
|
||||
vmin.u8 q0, q0, q15
|
||||
vmin.u8 q1, q1, q15
|
||||
|
||||
// The flags set here are consumed by "bgt 1b"; the NEON instructions in
// between do not modify them.
subs r9, r9, #1
|
||||
.if \oy
|
||||
// Switch to the weights of the second overlapped row.
vdup.8 d6, d25[0]
|
||||
vdup.8 d7, d25[1]
|
||||
.endif
|
||||
|
||||
vst1.8 {q0, q1}, [r0, :128], r2 // dst
|
||||
bgt 1b
|
||||
|
||||
.if \oy
|
||||
// The overlapped rows are done; if rows remain, process them with the
// variant that has no y overlap (same csfl and ox).
cmp r12, #0
|
||||
mov r9, r12 // restore actual remaining h
|
||||
bgt L(fguv_loop_sx0_csfl\csfl\()_\ox\()0)
|
||||
.endif
|
||||
b 9f
|
||||
.endm
|
||||
fguv_loop_sx0 0, 0, 0
|
||||
fguv_loop_sx0 0, 0, 1
|
||||
fguv_loop_sx0 0, 1, 0
|
||||
fguv_loop_sx0 0, 1, 1
|
||||
fguv_loop_sx0 1, 0, 0
|
||||
fguv_loop_sx0 1, 0, 1
|
||||
fguv_loop_sx0 1, 1, 0
|
||||
fguv_loop_sx0 1, 1, 1
|
||||
|
||||
// Common exit: undo the pushes done by the entry point and return to its
// caller.
9:
|
||||
vpop {q4-q7}
|
||||
pop {r4-r11,pc}
|
||||
endfunc
|
||||
|
||||
// Row loops of the 8 bpc chroma film grain routine with horizontal
// subsampling (sx == 1). This is not called directly: the fguv entry point
// jumps to one of the eight variants through the table below
// (index = 4 * csfl + 2 * ox + oy) after setting up all registers. Every
// variant ends at label 9, which pops the registers pushed by the entry
// point and returns to its caller. One iteration reads 32 luma pixels and
// produces a row of 16 chroma pixels.
function fguv_loop_sx1_neon
|
||||
L(fguv_loop_sx1_tbl):
|
||||
.word L(fguv_loop_sx1_csfl0_00) - L(fguv_loop_sx1_tbl) + CONFIG_THUMB
|
||||
.word L(fguv_loop_sx1_csfl0_01) - L(fguv_loop_sx1_tbl) + CONFIG_THUMB
|
||||
.word L(fguv_loop_sx1_csfl0_10) - L(fguv_loop_sx1_tbl) + CONFIG_THUMB
|
||||
.word L(fguv_loop_sx1_csfl0_11) - L(fguv_loop_sx1_tbl) + CONFIG_THUMB
|
||||
.word L(fguv_loop_sx1_csfl1_00) - L(fguv_loop_sx1_tbl) + CONFIG_THUMB
|
||||
.word L(fguv_loop_sx1_csfl1_01) - L(fguv_loop_sx1_tbl) + CONFIG_THUMB
|
||||
.word L(fguv_loop_sx1_csfl1_10) - L(fguv_loop_sx1_tbl) + CONFIG_THUMB
|
||||
.word L(fguv_loop_sx1_csfl1_11) - L(fguv_loop_sx1_tbl) + CONFIG_THUMB
|
||||
|
||||
// Emits one loop variant. csfl selects whether the averaged luma value is
// used directly as the lookup index (1) or first combined with the chroma
// source (0); ox / oy enable the blends with the "old" and "top" grain.
.macro fguv_loop_sx1 csfl, ox, oy
|
||||
L(fguv_loop_sx1_csfl\csfl\()_\ox\oy):
|
||||
.if \oy
|
||||
// lr (remaining rows) is overwritten by the bl below, so keep it in r12,
// which gather16_neon preserves.
mov r12, lr
|
||||
.endif
|
||||
1:
|
||||
.if \ox
|
||||
vld1.8 {d8}, [r4], r10 // grain_lut old
|
||||
.endif
|
||||
.if \oy
|
||||
vld1.8 {q8}, [r8], r10 // grain_lut top
|
||||
.endif
|
||||
.if \ox && \oy
|
||||
vld1.8 {d10}, [r11], r10 // grain_lut top old
|
||||
.endif
|
||||
vld1.8 {q0, q1}, [r6, :128], r7 // luma
|
||||
vld1.8 {q10}, [r5], r10 // grain_lut
|
||||
vld1.8 {q11}, [r1, :128], r2 // src
|
||||
|
||||
.if \ox
|
||||
// x overlap: old * d24 + grain * d25 for the first eight columns.
vmull.s8 q4, d8, d24
|
||||
vmlal.s8 q4, d20, d25
|
||||
.endif
|
||||
|
||||
// Sum horizontally adjacent luma pixels into 16 bit lanes; the rounding
// shift by 1 further down turns the sums into averages.
vpaddl.u8 q0, q0
|
||||
vpaddl.u8 q1, q1
|
||||
.if \oy
|
||||
.if \ox
|
||||
// The same x overlap blend is applied to the top row first.
vmull.s8 q5, d10, d24
|
||||
vmlal.s8 q5, d16, d25
|
||||
vqrshrn.s16 d20, q4, #5
|
||||
vqrshrn.s16 d16, q5, #5
|
||||
.endif
|
||||
|
||||
// y overlap: grain * d7 + top * d6, rounded and narrowed back to 8 bit.
vmull.s8 q4, d20, d7
|
||||
vmull.s8 q5, d21, d7
|
||||
vmlal.s8 q4, d16, d6
|
||||
vmlal.s8 q5, d17, d6
|
||||
vqrshrn.s16 d20, q4, #5
|
||||
vqrshrn.s16 d21, q5, #5
|
||||
.elseif \ox
|
||||
vqrshrn.s16 d20, q4, #5
|
||||
.endif
|
||||
.if \csfl
|
||||
vrshrn.u16 d0, q0, #1
|
||||
vrshrn.u16 d1, q1, #1
|
||||
.else
|
||||
// Lookup index = sat8(((luma_avg * uv_luma_mult + src * uv_mult) >> 6)
//                     + uv_offset), computed in 16 bit lanes.
vrshr.u16 q4, q0, #1
|
||||
vrshr.u16 q5, q1, #1
|
||||
vmovl.u8 q0, d22
|
||||
vmovl.u8 q1, d23
|
||||
vmul.i16 q4, q4, d4[0]
|
||||
vmul.i16 q5, q5, d4[0]
|
||||
vmul.i16 q0, q0, d4[1]
|
||||
vmul.i16 q1, q1, d4[1]
|
||||
vqadd.s16 q4, q4, q0
|
||||
vqadd.s16 q5, q5, q1
|
||||
vdup.16 q0, d4[2]
|
||||
vshr.s16 q4, q4, #6
|
||||
vshr.s16 q5, q5, #6
|
||||
vadd.i16 q4, q4, q0
|
||||
vadd.i16 q5, q5, q0
|
||||
vqmovun.s16 d0, q4
|
||||
vqmovun.s16 d1, q5
|
||||
.endif
|
||||
|
||||
// Look up one byte per index in q0 (r3 is the table base); the results
// arrive in d8, d9.
bl gather16_neon
|
||||
|
||||
vmovl.s8 q8, d20 // grain
|
||||
vmovl.s8 q9, d21
|
||||
|
||||
vmovl.u8 q6, d8 // scaling
|
||||
vmovl.u8 q7, d9
|
||||
|
||||
vmul.i16 q8, q8, q6 // scaling * grain
|
||||
vmul.i16 q9, q9, q7
|
||||
|
||||
// q13 holds -scaling_shift, so this is a rounding right shift.
vrshl.s16 q8, q8, q13 // round2(scaling * grain, scaling_shift)
|
||||
vrshl.s16 q9, q9, q13
|
||||
|
||||
vaddw.u8 q8, q8, d22 // *src + noise
|
||||
vaddw.u8 q9, q9, d23
|
||||
|
||||
// Saturate to 8 bit, then clamp to the [q14, q15] range set up by the
// entry point.
vqmovun.s16 d0, q8
|
||||
vqmovun.s16 d1, q9
|
||||
|
||||
vmax.u8 q0, q0, q14
|
||||
vmin.u8 q0, q0, q15
|
||||
|
||||
// The flags set here are consumed by "bgt 1b"; the NEON instructions in
// between do not modify them.
subs r9, r9, #1
|
||||
.if \oy
|
||||
// Exchange the top / current weights for the next overlapped row.
vswp d6, d7
|
||||
.endif
|
||||
vst1.8 {q0}, [r0, :128], r2 // dst
|
||||
bgt 1b
|
||||
|
||||
.if \oy
|
||||
// The overlapped rows are done; if rows remain, process them with the
// variant that has no y overlap (same csfl and ox).
cmp r12, #0
|
||||
mov r9, r12 // restore actual remaining h
|
||||
bgt L(fguv_loop_sx1_csfl\csfl\()_\ox\()0)
|
||||
.endif
|
||||
|
||||
b 9f
|
||||
.endm
|
||||
fguv_loop_sx1 0, 0, 0
|
||||
fguv_loop_sx1 0, 0, 1
|
||||
fguv_loop_sx1 0, 1, 0
|
||||
fguv_loop_sx1 0, 1, 1
|
||||
fguv_loop_sx1 1, 0, 0
|
||||
fguv_loop_sx1 1, 0, 1
|
||||
fguv_loop_sx1 1, 1, 0
|
||||
fguv_loop_sx1 1, 1, 1
|
||||
|
||||
// Common exit: undo the pushes done by the entry point and return to its
// caller.
9:
|
||||
vpop {q4-q7}
|
||||
pop {r4-r11,pc}
|
||||
endfunc
|
949
third_party/dav1d/src/arm/32/film_grain16.S
vendored
Normal file
949
third_party/dav1d/src/arm/32/film_grain16.S
vendored
Normal file
@ -0,0 +1,949 @@
|
||||
/*
|
||||
* Copyright © 2021, VideoLAN and dav1d authors
|
||||
* Copyright © 2021, Martin Storsjo
|
||||
* All rights reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright notice, this
|
||||
* list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
* this list of conditions and the following disclaimer in the documentation
|
||||
* and/or other materials provided with the distribution.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
|
||||
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
|
||||
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
|
||||
* ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
|
||||
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
|
||||
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
|
||||
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
#include "src/arm/asm.S"
|
||||
#include "util.S"
|
||||
#include "src/arm/asm-offsets.h"
|
||||
|
||||
#define GRAIN_WIDTH 82
|
||||
|
||||
// Table lookup with 16 bit indices and 8 bit results, for four lanes of two
// destination registers at a time.
// For off in {0, 1}, the 16 bit lanes off and 2+off of \src1 and \src2 are
// used as indices (relative to r3) and the looked-up bytes are written to
// byte lanes off, 2+off, 4+off and 6+off of \dst1; \src3 / \src4 feed \dst2
// in the same way. Invoking the macro with off=0 and off=1 therefore fills
// all eight bytes of \dst1 from the eight halfwords of \src1:\src2.
// r3 is only read (it acts as the table base address); r11, r12 and lr are
// used as scratch and are clobbered. The three scratch registers are
// rotated so that index extraction, address computation and the loads of
// different lanes are interleaved.
.macro gather_interleaved dst1, dst2, src1, src2, src3, src4, off
|
||||
vmov.u16 r11, \src1[0+\off]
|
||||
vmov.u16 r12, \src3[0+\off]
|
||||
add r11, r11, r3
|
||||
vmov.u16 lr, \src1[2+\off]
|
||||
add r12, r12, r3
|
||||
vld1.8 {\dst1[0+\off]}, [r11]
|
||||
vmov.u16 r11, \src3[2+\off]
|
||||
add lr, lr, r3
|
||||
vld1.8 {\dst2[0+\off]}, [r12]
|
||||
vmov.u16 r12, \src2[0+\off]
|
||||
add r11, r11, r3
|
||||
vld1.8 {\dst1[2+\off]}, [lr]
|
||||
vmov.u16 lr, \src4[0+\off]
|
||||
add r12, r12, r3
|
||||
vld1.8 {\dst2[2+\off]}, [r11]
|
||||
vmov.u16 r11, \src2[2+\off]
|
||||
add lr, lr, r3
|
||||
vld1.8 {\dst1[4+\off]}, [r12]
|
||||
vmov.u16 r12, \src4[2+\off]
|
||||
add r11, r11, r3
|
||||
vld1.8 {\dst2[4+\off]}, [lr]
|
||||
add r12, r12, r3
|
||||
vld1.8 {\dst1[6+\off]}, [r11]
|
||||
vld1.8 {\dst2[6+\off]}, [r12]
|
||||
.endm
|
||||
|
||||
// Full table lookup producing four d registers of bytes from eight d
// registers of 16 bit indices (relative to r3):
//   \dst1 <- \src1:\src2, \dst2 <- \src3:\src4,
//   \dst3 <- \src5:\src6, \dst4 <- \src7:\src8.
// Destinations are processed in pairs (1 with 3, 2 with 4), each pair once
// with off=0 and once with off=1. Clobbers r11, r12 and lr.
.macro gather dst1, dst2, dst3, dst4, src1, src2, src3, src4, src5, src6, src7, src8
|
||||
gather_interleaved \dst1, \dst3, \src1, \src2, \src5, \src6, 0
|
||||
gather_interleaved \dst1, \dst3, \src1, \src2, \src5, \src6, 1
|
||||
gather_interleaved \dst2, \dst4, \src3, \src4, \src7, \src8, 0
|
||||
gather_interleaved \dst2, \dst4, \src3, \src4, \src7, \src8, 1
|
||||
.endm
|
||||
|
||||
// Looks up 32 bytes: the 32 halfword indices in d0-d7 (q0-q3) are resolved
// against the table at r3 and the resulting bytes are written to d8-d11
// (q4, q5), in the same element order.
// r11, r12 and lr are saved and restored around the lookup because the
// gather macro uses them as scratch; the pop into pc also performs the return.
function gather32_neon
|
||||
push {r11-r12,lr}
|
||||
gather d8, d9, d10, d11, d0, d1, d2, d3, d4, d5, d6, d7
|
||||
pop {r11-r12,pc}
|
||||
endfunc
|
||||
|
||||
// Looks up 16 bytes: the 16 halfword indices in d0-d3 (q0, q1) are resolved
// against the table at r3 and the resulting bytes are written to d8, d9
// (q4), in the same element order.
// r11, r12 and lr are saved and restored around the lookup because
// gather_interleaved uses them as scratch; the pop into pc also returns.
function gather16_neon
|
||||
push {r11-r12,lr}
|
||||
gather_interleaved d8, d9, d0, d1, d2, d3, 0
|
||||
gather_interleaved d8, d9, d0, d1, d2, d3, 1
|
||||
pop {r11-r12,pc}
|
||||
endfunc
|
||||
|
||||
// Overlap blending weights (16 bit entries) used by the luma code.
// The first row is loaded into d24 and the second into d25; the loop
// computes old_grain * row0 + grain * row1 per column and narrows with a
// rounding shift right by 5. Columns whose weights are 0 / 32 therefore
// leave the current grain unchanged, so only the first two columns are
// blended. The first row's two leading entries are also used as the weights
// of the first row of the vertical (top) overlap.
const overlap_coeffs_0, align=4
|
||||
.short 27, 17, 0, 0
|
||||
.short 17, 27, 32, 32
|
||||
endconst
|
||||
|
||||
// Second set of overlap blending weights (16 bit entries), laid out like
// overlap_coeffs_0 but blending only the first column.
// NOTE(review): its user is not visible in this part of the file; it is
// presumably the horizontally subsampled chroma code - confirm.
const overlap_coeffs_1, align=4
|
||||
.short 23, 0, 0, 0
|
||||
.short 22, 32, 32, 32
|
||||
endconst
|
||||
|
||||
// Splits the random value in \src into a vertical and a horizontal offset:
//   \offy = \src & 0xF, doubled when \sy == 0
//   \offx = \src >> 4,  doubled when \sx == 0
// \sx and \sy are assemble-time constants (the subsampling flags).
// \offx may be the same register as \src (the callers rely on this), but
// \offy must not be, since \src is still read after \offy has been written.
.macro calc_offset offx, offy, src, sx, sy
|
||||
and \offy, \src, #0xF // randval & 0xF
|
||||
lsr \offx, \src, #4 // randval >> 4
|
||||
.if \sy == 0
|
||||
add \offy, \offy, \offy // 2 * (randval & 0xF)
|
||||
.endif
|
||||
.if \sx == 0
|
||||
add \offx, \offx, \offx // 2 * (randval >> 4)
|
||||
.endif
|
||||
.endm
|
||||
|
||||
// Computes \dst = \src + \stride * \offy + 2 * \offx, i.e. advances a
// pointer into the grain lookup table by \offy rows (\stride is in bytes)
// and \offx two-byte entries.
// \dst may be the same register as \offy or \src, but not \offx, which is
// read after \dst has been written.
.macro add_offset dst, offx, offy, src, stride
|
||||
mla \dst, \stride, \offy, \src // grain_lut += grain_stride * offy
|
||||
add \dst, \dst, \offx, lsl #1 // grain_lut += offx
|
||||
.endm
|
||||
|
||||
// void dav1d_fgy_32x32_16bpc_neon(pixel *const dst, const pixel *const src,
|
||||
// const ptrdiff_t stride,
|
||||
// const uint8_t scaling[SCALING_SIZE],
|
||||
// const int scaling_shift,
|
||||
// const entry grain_lut[][GRAIN_WIDTH],
|
||||
// const int offsets[][2],
|
||||
// const int h, const ptrdiff_t clip,
|
||||
// const ptrdiff_t type,
|
||||
// const int bitdepth_max);
|
||||
// Entry point of the 16 bpc luma film grain routine (C prototype in the
// comment above). It only performs setup: it loads the stack arguments,
// prepares the constant vectors and the grain_lut pointers, selects one of
// the four loop variants behind L(fgy_loop_tbl) from `type` and jumps to it.
// State handed to the loop:
//   r0-r3   untouched register arguments (dst, src, stride, scaling), with
//           32 subtracted from the stride in r2
//   r4      grain_lut pointer for offsets[1][0] ("old" in the loop)
//   r5      grain_lut pointer for offsets[0][0] (current block)
//   r6      grain_lut pointer for offsets[0][1] ("top" in the loop)
//   r8      grain_lut pointer for offsets[1][1] ("top old" in the loop)
//   r7      number of rows for the first pass
//   r9      grain_lut stride in bytes, minus 32
//   r10     full h (only set up when y overlap is requested)
//   q6      grain_max, d24/d25 overlap_coeffs_0, q13 15 - scaling_shift,
//           q14/q15 min/max clamp
//   d14/d15 y overlap weights for the first row (y overlap only)
function fgy_32x32_16bpc_neon, export=1
|
||||
push {r4-r11,lr}
|
||||
vpush {q4-q7}
|
||||
// 36 bytes of core registers plus 64 bytes of q4-q7 have been pushed, so
// the first stack argument is found at sp + 100.
ldrd r4, r5, [sp, #100] // scaling_shift, grain_lut
|
||||
ldrd r6, r7, [sp, #108] // offsets, h
|
||||
ldr r8, [sp, #116] // clip
|
||||
mov r9, #GRAIN_WIDTH*2 // grain_lut stride
|
||||
ldr r10, [sp, #124] // bitdepth_max
|
||||
|
||||
// XOR with 15 equals 15 - scaling_shift for shift values in 0..15.
eor r4, r4, #15 // 15 - scaling_shift
|
||||
vdup.16 q6, r10 // bitdepth_max
|
||||
// 24 - clz(bitdepth_max) is the bit depth minus 8.
clz r10, r10
|
||||
vdup.16 q13, r4 // 15 - scaling_shift
|
||||
rsb r10, r10, #24 // bitdepth_min_8
|
||||
// The flags set here are consumed by the "beq 1f" further down.
cmp r8, #0
|
||||
// q12 is only a temporary here; it is overwritten by the overlap_coeffs
// load (d24, d25) below.
vdup.16 q12, r10 // bitdepth_min_8
|
||||
|
||||
movrel_local r12, overlap_coeffs_0
|
||||
|
||||
beq 1f
|
||||
// clip
|
||||
// The 8 bit limits 16 and 235 are scaled up to the actual bit depth.
vmov.i16 q14, #16
|
||||
vmov.i16 q15, #235
|
||||
vshl.s16 q14, q14, q12
|
||||
vshl.s16 q15, q15, q12
|
||||
b 2f
|
||||
1:
|
||||
// no clip
|
||||
vmov.i16 q14, #0
|
||||
vmov q15, q6
|
||||
2:
|
||||
vshr.u16 q6, q6, #1 // grain_max
|
||||
|
||||
vld1.16 {d24, d25}, [r12, :128] // overlap_coeffs
|
||||
|
||||
// Skip the first 9 rows and 9 entries (18 bytes) of grain_lut.
add r5, r5, #18 // grain_lut += 9
|
||||
add r5, r5, r9, lsl #3 // grain_lut += 8 * grain_stride
|
||||
add r5, r5, r9 // grain_lut += grain_stride
|
||||
|
||||
// Derive one grain_lut pointer per offsets[][] entry. r5 itself is
// updated last because it is the base for the other three pointers.
ldr r10, [r6, #8] // offsets[1][0]
|
||||
calc_offset r10, r4, r10, 0, 0
|
||||
add_offset r4, r10, r4, r5, r9
|
||||
ldr r10, [r6, #4] // offsets[0][1]
|
||||
calc_offset r10, r11, r10, 0, 0
|
||||
add_offset r11, r10, r11, r5, r9
|
||||
ldr r10, [r6, #12] // offsets[1][1]
|
||||
calc_offset r10, r8, r10, 0, 0
|
||||
add_offset r8, r10, r8, r5, r9
|
||||
ldr r6, [r6] // offsets[0][0]
|
||||
calc_offset r6, lr, r6, 0, 0
|
||||
add_offset r5, r6, lr, r5, r9
|
||||
|
||||
add r4, r4, #32*2 // grain_lut += BLOCK_SIZE * bx
|
||||
add r6, r11, r9, lsl #5 // grain_lut += grain_stride * BLOCK_SIZE * by
|
||||
|
||||
ldr r10, [sp, #120] // type
|
||||
adr r11, L(fgy_loop_tbl)
|
||||
|
||||
// Bit 0 of type selects y overlap. The flags from this tst stay valid
// until the "beq 1f" below since the instructions in between do not set
// flags. type as a whole indexes the table of loop variants.
tst r10, #1
|
||||
ldr r10, [r11, r10, lsl #2]
|
||||
|
||||
add r8, r8, r9, lsl #5 // grain_lut += grain_stride * BLOCK_SIZE * by
|
||||
add r8, r8, #32*2 // grain_lut += BLOCK_SIZE * bx
|
||||
|
||||
// Table entries are offsets relative to the table itself.
add r11, r11, r10
|
||||
|
||||
beq 1f
|
||||
// y overlap
|
||||
// The first pass covers only the two rows that overlap the block above.
vdup.16 d14, d24[0]
|
||||
vdup.16 d15, d24[1]
|
||||
mov r10, r7 // backup actual h
|
||||
mov r7, #2
|
||||
1:
|
||||
// The loop reads each 64 byte row with two 32 byte loads, the first of
// which post-increments its pointer by 32; the strides are reduced here
// to compensate.
sub r2, r2, #32 // src_stride -= 32
|
||||
sub r9, r9, #32 // grain_stride -= 32
|
||||
bx r11
|
||||
endfunc
|
||||
|
||||
// Row loops of the luma film grain function. They are not called normally:
// the setup code above branches here with "bx r11" through L(fgy_loop_tbl).
// The table is indexed by "type"; bit 0 of the index selects the variants
// with y overlap (oy), bit 1 the variants with x overlap (ox).
//
// Register state expected on entry, as prepared by the setup code:
//   r0 = dst, r1 = src, r2 = src_stride - 32
//   r5 = grain_lut, r4 = grain_lut old, r6 = grain_lut top,
//   r8 = grain_lut top old, r9 = grain_stride - 32
//   r7 = rows to process in this pass, r10 = actual h (y overlap only)
//   q6 = grain_max, d14/d15 = y overlap coefficients (y overlap only)
//   d24/d25 = overlap_coeffs, q13 = 15 - scaling_shift
//   q14/q15 = lower/upper clip bound of the output
// Every iteration handles one row of 32 pixels. Each variant ends by popping
// q4-q7 and r4-r11/pc, i.e. it returns on behalf of the function that entered
// it (presumably that function pushed them in its prologue, which is not
// part of this excerpt).
// gather32_neon is defined elsewhere; judging from its use here it takes the
// pixel values in q0-q3 and returns the matching scaling bytes in d8-d11.
function fgy_loop_neon
|
||||
L(fgy_loop_tbl):
|
||||
.word L(loop_00) - L(fgy_loop_tbl) + CONFIG_THUMB
|
||||
.word L(loop_01) - L(fgy_loop_tbl) + CONFIG_THUMB
|
||||
.word L(loop_10) - L(fgy_loop_tbl) + CONFIG_THUMB
|
||||
.word L(loop_11) - L(fgy_loop_tbl) + CONFIG_THUMB
|
||||
|
||||
// fgy ox, oy: emits the row loop L(loop_<ox><oy>). ox/oy enable blending of
// the current grain with the "old" grain (x overlap) and the "top" grain
// (y overlap) respectively.
.macro fgy ox, oy
|
||||
L(loop_\ox\oy):
|
||||
1:
|
||||
.if \ox
|
||||
vld1.16 {d0}, [r4], r9 // grain_lut old
|
||||
.endif
|
||||
.if \oy
|
||||
vld1.16 {q2, q3}, [r6]! // grain_lut top
|
||||
.endif
|
||||
.if \ox && \oy
|
||||
vld1.16 {d2}, [r8], r9 // grain_lut top old
|
||||
.endif
|
||||
.if \oy
|
||||
vld1.16 {q4, q5}, [r6], r9 // grain_lut top
|
||||
.endif
|
||||
.if !\ox && !\oy
|
||||
vld1.16 {q0, q1}, [r1, :128]! // src
|
||||
.endif
|
||||
vld1.16 {q8, q9}, [r5]! // grain_lut
|
||||
.if !\ox && !\oy
|
||||
vld1.16 {q2, q3}, [r1, :128], r2 // src
|
||||
.endif
|
||||
.if !\oy
|
||||
// q5 = mask applied to the source pixels before the scaling lookup
vmvn.i16 q5, #0xf000 // 0x0fff
|
||||
.endif
|
||||
vld1.16 {q10, q11}, [r5], r9 // grain_lut
|
||||
|
||||
.if \ox
|
||||
// The load from r4 above only post-incremented by r9 (grain_stride - 32);
// add the missing 32 so r4 advances by a full grain_stride per row.
add r4, r4, #32
|
||||
// x overlap: old * d24 + current * d25, per lane
vmull.s16 q0, d0, d24
|
||||
vmlal.s16 q0, d16, d25
|
||||
.endif
|
||||
|
||||
.if \oy
|
||||
.if \ox
|
||||
add r8, r8, #32
|
||||
vmull.s16 q1, d2, d24
|
||||
vmlal.s16 q1, d4, d25
|
||||
vqrshrn.s32 d16, q0, #5
|
||||
vmvn d0, d12 // grain_min
|
||||
vqrshrn.s32 d4, q1, #5
|
||||
vmin.s16 d16, d16, d12
|
||||
vmin.s16 d4, d4, d12
|
||||
vmax.s16 d16, d16, d0
|
||||
vmax.s16 d4, d4, d0
|
||||
.endif
|
||||
|
||||
// y overlap: top * d14 + current * d15, then rounding shift by 5 and
// clamping to [grain_min, grain_max]
vmull.s16 q0, d4, d14
|
||||
vmull.s16 q1, d5, d14
|
||||
vmull.s16 q2, d6, d14
|
||||
vmull.s16 q3, d7, d14
|
||||
vmlal.s16 q0, d16, d15
|
||||
vmlal.s16 q1, d17, d15
|
||||
vmlal.s16 q2, d18, d15
|
||||
vmlal.s16 q3, d19, d15
|
||||
vmull.s16 q8, d20, d15
|
||||
vmull.s16 q9, d21, d15
|
||||
vmull.s16 q10, d22, d15
|
||||
vmull.s16 q11, d23, d15
|
||||
vmlal.s16 q8, d8, d14
|
||||
vmlal.s16 q9, d9, d14
|
||||
vmlal.s16 q10, d10, d14
|
||||
vmlal.s16 q11, d11, d14
|
||||
vmvn q4, q6 // grain_min
|
||||
vqrshrn.s32 d0, q0, #5
|
||||
vqrshrn.s32 d1, q1, #5
|
||||
vqrshrn.s32 d2, q2, #5
|
||||
vqrshrn.s32 d3, q3, #5
|
||||
vqrshrn.s32 d4, q8, #5
|
||||
vqrshrn.s32 d5, q9, #5
|
||||
vqrshrn.s32 d6, q10, #5
|
||||
vqrshrn.s32 d7, q11, #5
|
||||
vmin.s16 q8, q0, q6
|
||||
vmin.s16 q9, q1, q6
|
||||
vld1.16 {q0, q1}, [r1, :128]! // src
|
||||
vmin.s16 q10, q2, q6
|
||||
vmin.s16 q11, q3, q6
|
||||
vmax.s16 q8, q8, q4
|
||||
vmax.s16 q9, q9, q4
|
||||
vld1.16 {q2, q3}, [r1, :128], r2 // src
|
||||
vmvn.i16 q5, #0xf000 // 0x0fff
|
||||
vmax.s16 q10, q10, q4
|
||||
vmax.s16 q11, q11, q4
|
||||
.elseif \ox
|
||||
vmvn d4, d12 // grain_min
|
||||
vqrshrn.s32 d16, q0, #5
|
||||
vld1.16 {q0, q1}, [r1, :128]! // src
|
||||
vmin.s16 d16, d16, d12
|
||||
vmax.s16 d16, d16, d4
|
||||
vld1.16 {q2, q3}, [r1, :128], r2 // src
|
||||
.endif
|
||||
|
||||
// Make sure that uninitialized pixels out of range past the right
|
||||
// edge are in range; their actual values shouldn't matter.
|
||||
vand q0, q0, q5
|
||||
vand q1, q1, q5
|
||||
vand q2, q2, q5
|
||||
vand q3, q3, q5
|
||||
|
||||
bl gather32_neon
|
||||
|
||||
.if \ox || \oy
|
||||
// q6 (grain_max) and q7 (y overlap coefficients, when used) must survive
// for the following rows, but q6/q7 are used as scratch for the scaling.
vpush {q6-q7}
|
||||
.endif
|
||||
|
||||
vmovl.u8 q6, d8 // scaling
|
||||
vmovl.u8 q7, d9
|
||||
vmovl.u8 q4, d10
|
||||
vmovl.u8 q5, d11
|
||||
|
||||
vshl.u16 q6, q6, q13 // scaling << (15 - scaling_shift)
|
||||
vshl.u16 q7, q7, q13
|
||||
vshl.u16 q4, q4, q13
|
||||
vshl.u16 q5, q5, q13
|
||||
|
||||
vqrdmulh.s16 q8, q8, q6 // round2((scaling << (15 - scaling_shift)) * grain, 15)
|
||||
vqrdmulh.s16 q9, q9, q7
|
||||
vqrdmulh.s16 q10, q10, q4
|
||||
vqrdmulh.s16 q11, q11, q5
|
||||
|
||||
.if \ox || \oy
|
||||
vpop {q6-q7}
|
||||
.endif
|
||||
|
||||
vqadd.s16 q0, q0, q8 // *src + noise
|
||||
vqadd.s16 q1, q1, q9
|
||||
vqadd.s16 q2, q2, q10
|
||||
vqadd.s16 q3, q3, q11
|
||||
|
||||
// Clamp the result to the clip bounds in q14 (lower) and q15 (upper)
vmax.s16 q0, q0, q14
|
||||
vmax.s16 q1, q1, q14
|
||||
vmax.s16 q2, q2, q14
|
||||
vmax.s16 q3, q3, q14
|
||||
vmin.s16 q0, q0, q15
|
||||
vmin.s16 q1, q1, q15
|
||||
vmin.s16 q2, q2, q15
|
||||
vmin.s16 q3, q3, q15
|
||||
|
||||
vst1.16 {q0, q1}, [r0, :128]! // dst
|
||||
subs r7, r7, #1
|
||||
.if \oy
|
||||
// Switch to the y overlap coefficients used for the next row
vdup.16 d14, d25[0]
|
||||
vdup.16 d15, d25[1]
|
||||
.endif
|
||||
vst1.16 {q2, q3}, [r0, :128], r2 // dst
|
||||
bgt 1b
|
||||
|
||||
.if \oy
|
||||
// The y overlap pass only covers the first rows (the setup code set r7 to 2
// and kept the real height in r10); any remaining rows are handled by the
// variant without y overlap.
cmp r10, #2
|
||||
sub r7, r10, #2 // restore actual remaining h
|
||||
bgt L(loop_\ox\()0)
|
||||
.endif
|
||||
vpop {q4-q7}
|
||||
pop {r4-r11,pc}
|
||||
.endm
|
||||
|
||||
fgy 0, 0
|
||||
fgy 0, 1
|
||||
fgy 1, 0
|
||||
fgy 1, 1
|
||||
endfunc
|
||||
|
||||
// void dav1d_fguv_32x32_420_16bpc_neon(pixel *const dst,
|
||||
// const pixel *const src,
|
||||
// const ptrdiff_t stride,
|
||||
// const uint8_t scaling[SCALING_SIZE],
|
||||
// const Dav1dFilmGrainData *const data,
|
||||
// const entry grain_lut[][GRAIN_WIDTH],
|
||||
// const pixel *const luma_row,
|
||||
// const ptrdiff_t luma_stride,
|
||||
// const int offsets[][2],
|
||||
// const ptrdiff_t h, const ptrdiff_t uv,
|
||||
// const ptrdiff_t is_id,
|
||||
// const ptrdiff_t type,
|
||||
// const int bitdepth_max);
|
||||
// fguv layout, sx, sy: emits the exported function
// fguv_32x32_<layout>_16bpc_neon. It is expanded below with
// (sx, sy) = (1, 1) for 420, (1, 0) for 422 and (0, 0) for 444.
//
// The emitted function only performs the setup and then branches into one
// of the row loops in fguv_loop_sx<sx>_neon; those loops restore the saved
// registers and return. The setup:
//   - saves r4-r11, lr and q4-q7 (100 bytes in total, which is why the first
//     stack argument, data, is read from [sp, #100]),
//   - packs the scalar parameters into two registers:
//       d30 = { uv_luma_mult, uv_mult, uv_offset << bitdepth_min_8,
//               bitdepth_max }
//       d31 = { clip_min, clip_max, overlap y [0], overlap y [1] }
//     and q13 = 15 - scaling_shift,
//   - computes the grain_lut pointers: r5 = current block, r4 = old,
//     r8 = top, r11 = top old; r10 = grain_lut stride in bytes,
//   - loads overlap_coeffs_<sx> into d24/d25, luma_row/luma_stride into
//     r6/r7, and leaves the row count in r9,
//   - uses "type" as index into L(fguv_loop_sx<sx>_tbl) and jumps there.
.macro fguv layout, sx, sy
|
||||
function fguv_32x32_\layout\()_16bpc_neon, export=1
|
||||
push {r4-r11,lr}
|
||||
vpush {q4-q7}
|
||||
ldrd r4, r5, [sp, #100] // data, grain_lut
|
||||
ldrd r10, r11, [sp, #124] // uv, is_id
|
||||
ldr r6, [sp, #136] // bitdepth_max
|
||||
|
||||
clz r7, r6
|
||||
rsb r7, r7, #24 // bitdepth_min_8
|
||||
|
||||
// !csfl
|
||||
add r10, r4, r10, lsl #2 // + 4*uv
|
||||
add r12, r10, #FGD_UV_LUMA_MULT
|
||||
add lr, r10, #FGD_UV_MULT
|
||||
ldrh r10, [r10, #FGD_UV_OFFSET] // uv_offset
|
||||
vld1.16 {d30[]}, [r12] // uv_luma_mult
|
||||
lsl r10, r10, r7 // uv_offset << bitdepth_min_8
|
||||
vld1.16 {d30[1]}, [lr] // uv_mult
|
||||
|
||||
ldr lr, [r4, #FGD_SCALING_SHIFT]
|
||||
ldr r12, [r4, #FGD_CLIP_TO_RESTRICTED_RANGE]
|
||||
eor lr, lr, #15 // 15 - scaling_shift
|
||||
|
||||
vmov.16 d30[2], r10 // uv_offset << bitdepth_min_8
|
||||
|
||||
cmp r12, #0
|
||||
vdup.16 q13, lr // 15 - scaling_shift
|
||||
|
||||
beq 1f
|
||||
// clip
|
||||
// r8/r9 = clip_min/clip_max: 16 and 240 scaled by bitdepth_min_8, with the
// maximum replaced by 235 (scaled) when is_id is nonzero
cmp r11, #0
|
||||
mov r8, #16
|
||||
mov r9, #240
|
||||
lsl r8, r8, r7
|
||||
lsl r9, r9, r7
|
||||
beq 2f
|
||||
// is_id
|
||||
mov r9, #235
|
||||
lsl r9, r9, r7
|
||||
b 2f
|
||||
1:
|
||||
// no clip
|
||||
mov r8, #0
|
||||
mov r9, r6 // bitdepth_max
|
||||
2:
|
||||
vmov.16 d30[3], r6 // bitdepth_max
|
||||
vdup.16 d31, r8 // clip_min
|
||||
|
||||
mov r10, #GRAIN_WIDTH*2 // grain_lut stride
|
||||
|
||||
// r6/r7 = the two y overlap coefficients for the first overlapped row
.if \sy
|
||||
mov r6, #23
|
||||
mov r7, #22
|
||||
.else
|
||||
mov r6, #27
|
||||
mov r7, #17
|
||||
.endif
|
||||
vmov.16 d31[1], r9 // clip_max
|
||||
|
||||
ldrd r8, r9, [sp, #116] // offsets, h
|
||||
|
||||
add r5, r5, #(2*(3 + (2 >> \sx)*3)) // grain_lut += 9 or 6
|
||||
.if \sy
|
||||
add r5, r5, r10, lsl #2 // grain_lut += 4 * grain_stride
|
||||
add r5, r5, r10, lsl #1 // grain_lut += 2 * grain_stride
|
||||
.else
|
||||
add r5, r5, r10, lsl #3 // grain_lut += 8 * grain_stride
|
||||
add r5, r5, r10 // grain_lut += grain_stride
|
||||
.endif
|
||||
vmov.16 d31[2], r6 // overlap y [0]
|
||||
|
||||
ldr r12, [r8, #8] // offsets[1][0]
|
||||
calc_offset r12, r4, r12, \sx, \sy
|
||||
add_offset r4, r12, r4, r5, r10
|
||||
|
||||
ldr r12, [r8, #4] // offsets[0][1]
|
||||
calc_offset r12, lr, r12, \sx, \sy
|
||||
add_offset lr, r12, lr, r5, r10
|
||||
|
||||
ldr r12, [r8, #12] // offsets[1][1]
|
||||
calc_offset r12, r11, r12, \sx, \sy
|
||||
add_offset r11, r12, r11, r5, r10
|
||||
|
||||
ldr r8, [r8] // offsets[0][0]
|
||||
calc_offset r8, r12, r8, \sx, \sy
|
||||
add_offset r5, r8, r12, r5, r10
|
||||
|
||||
vmov.16 d31[3], r7 // overlap y [1]
|
||||
|
||||
add r4, r4, #2*(32 >> \sx) // grain_lut += BLOCK_SIZE * bx
|
||||
add r8, lr, r10, lsl #(5 - \sy) // grain_lut += grain_stride * BLOCK_SIZE * by
|
||||
add r11, r11, r10, lsl #(5 - \sy) // grain_lut += grain_stride * BLOCK_SIZE * by
|
||||
add r11, r11, #2*(32 >> \sx) // grain_lut += BLOCK_SIZE * bx
|
||||
|
||||
movrel_local r12, overlap_coeffs_\sx
|
||||
ldr lr, [sp, #132] // type
|
||||
ldrd r6, r7, [sp, #108] // luma_row, luma_stride
|
||||
|
||||
vld1.16 {d24, d25}, [r12, :128] // overlap_coeffs
|
||||
|
||||
movrel_local r12, L(fguv_loop_sx\sx\()_tbl)
|
||||
#if CONFIG_THUMB
|
||||
// This uses movrel_local instead of adr above, because the target
|
||||
// can be out of range for adr. But movrel_local leaves the thumb bit
|
||||
// set on COFF (but probably wouldn't if building for thumb on ELF),
|
||||
// thus try to clear the bit for robustness.
|
||||
bic r12, r12, #1
|
||||
#endif
|
||||
|
||||
// Bit 0 of type selects y overlap; the flags set here are consumed by the
// beq below (ldr and add do not modify them). r12 becomes the address of
// the selected loop variant.
tst lr, #1
|
||||
ldr lr, [r12, lr, lsl #2]
|
||||
|
||||
add r12, r12, lr
|
||||
|
||||
beq 1f
|
||||
// y overlap
|
||||
// First pass only covers the overlapped rows (2 >> sy); lr carries the row
// count left for the second pass into the loop code.
sub lr, r9, #(2 >> \sy) // backup remaining h
|
||||
mov r9, #(2 >> \sy)
|
||||
|
||||
1:
|
||||
.if \sy
|
||||
add r7, r7, r7 // luma_stride *= 2
|
||||
.endif
|
||||
sub r7, r7, #32 // luma_stride -= 32
|
||||
|
||||
bx r12
|
||||
endfunc
|
||||
.endm
|
||||
|
||||
fguv 420, 1, 1
|
||||
fguv 422, 1, 0
|
||||
fguv 444, 0, 0
|
||||
|
||||
// Row loops for the chroma film grain functions expanded with sx = 0. They
// are entered with "bx r12" from the fguv setup code through
// L(fguv_loop_sx0_tbl), not called. The table index is "type": bit 0 selects
// y overlap (oy), bit 1 x overlap (ox) and bit 2 the csfl variants.
//
// Register state expected on entry, as prepared by the setup code:
//   r0 = dst, r1 = src, r2 = stride, r6 = luma_row, r7 = luma_stride - 32
//   r5 = grain_lut, r4 = grain_lut old, r8 = grain_lut top,
//   r11 = grain_lut top old, r10 = grain_lut stride
//   r9 = rows to process in this pass, lr = rows left afterwards (oy only)
//   d24/d25 = overlap_coeffs, q13 = 15 - scaling_shift
//   d30 = { uv_luma_mult, uv_mult, uv_offset, bitdepth_max }
//   d31 = { clip_min, clip_max, overlap y [0], overlap y [1] }
// Every iteration handles one row of 32 pixels. All variants leave through
// the shared epilogue at label 9, which pops q4-q7 and r4-r11/pc and thereby
// returns from the fguv function that jumped here.
// gather32_neon is defined elsewhere; judging from its use here it takes the
// values in q0-q3 and returns the matching scaling bytes in d8-d11.
function fguv_loop_sx0_neon
|
||||
L(fguv_loop_sx0_tbl):
|
||||
.word L(fguv_loop_sx0_csfl0_00) - L(fguv_loop_sx0_tbl) + CONFIG_THUMB
|
||||
.word L(fguv_loop_sx0_csfl0_01) - L(fguv_loop_sx0_tbl) + CONFIG_THUMB
|
||||
.word L(fguv_loop_sx0_csfl0_10) - L(fguv_loop_sx0_tbl) + CONFIG_THUMB
|
||||
.word L(fguv_loop_sx0_csfl0_11) - L(fguv_loop_sx0_tbl) + CONFIG_THUMB
|
||||
.word L(fguv_loop_sx0_csfl1_00) - L(fguv_loop_sx0_tbl) + CONFIG_THUMB
|
||||
.word L(fguv_loop_sx0_csfl1_01) - L(fguv_loop_sx0_tbl) + CONFIG_THUMB
|
||||
.word L(fguv_loop_sx0_csfl1_10) - L(fguv_loop_sx0_tbl) + CONFIG_THUMB
|
||||
.word L(fguv_loop_sx0_csfl1_11) - L(fguv_loop_sx0_tbl) + CONFIG_THUMB
|
||||
|
||||
// fguv_loop_sx0 csfl, ox, oy: emits the loop variant
// L(fguv_loop_sx0_csfl<csfl>_<ox><oy>).
.macro fguv_loop_sx0 csfl, ox, oy
|
||||
L(fguv_loop_sx0_csfl\csfl\()_\ox\oy):
|
||||
// Rows are accessed as two 32 byte halves; the first access post-increments
// by 32, so the strides used for the second access are reduced by 32 here.
sub r2, r2, #32 // src_stride -= 32
|
||||
sub r10, r10, #32 // grain_stride -= 32
|
||||
.if \oy
|
||||
// Keep the remaining row count in r12: lr is overwritten by the bl below
// and is also used as scratch for the coefficient update.
mov r12, lr
|
||||
.endif
|
||||
// Re-entry point after the y overlap rows; skips the stride adjustment
// above, which must only happen once.
L(fguv_loop_sx0_csfl\csfl\()_\ox\oy\()_loopstart):
|
||||
1:
|
||||
.if \ox
|
||||
vld1.16 {d0}, [r4], r10 // grain_lut old
|
||||
.endif
|
||||
.if \oy
|
||||
vld1.16 {q2, q3}, [r8]! // grain_lut top
|
||||
.endif
|
||||
.if \ox && \oy
|
||||
vld1.16 {d2}, [r11], r10 // grain_lut top old
|
||||
.endif
|
||||
.if !\ox && !\oy
|
||||
vld1.16 {q0, q1}, [r6, :128]! // luma
|
||||
.endif
|
||||
vld1.16 {q8, q9}, [r5]! // grain_lut
|
||||
.if \oy
|
||||
vld1.16 {q4, q5}, [r8], r10 // grain_lut top
|
||||
.endif
|
||||
.if !\ox && !\oy
|
||||
vld1.16 {q2, q3}, [r6, :128], r7 // luma
|
||||
.endif
|
||||
.if \oy
|
||||
vdup.16 d28, d31[2] // overlap y coeff
|
||||
vdup.16 d29, d31[3] // overlap y coeff
|
||||
.endif
|
||||
vld1.16 {q10, q11}, [r5], r10 // grain_lut
|
||||
|
||||
.if \ox
|
||||
vdup.16 q7, d30[3] // bitdepth_max
|
||||
// r4 was only post-incremented by r10 (grain_stride - 32) above
add r4, r4, #32
|
||||
vmull.s16 q0, d0, d24
|
||||
vshr.u16 q7, q7, #1 // grain_max
|
||||
vmlal.s16 q0, d16, d25
|
||||
vmvn q6, q7 // grain_min
|
||||
.endif
|
||||
|
||||
.if \oy
|
||||
.if \ox
|
||||
add r11, r11, #32
|
||||
vmull.s16 q1, d2, d24
|
||||
vmlal.s16 q1, d4, d25
|
||||
vqrshrn.s32 d16, q0, #5
|
||||
vqrshrn.s32 d4, q1, #5
|
||||
vmin.s16 d4, d4, d14
|
||||
vmin.s16 d16, d16, d14
|
||||
vmax.s16 d4, d4, d12
|
||||
vmax.s16 d16, d16, d12
|
||||
.endif
|
||||
|
||||
// y overlap: top * d28 + current * d29, then rounding shift by 5 and
// clamping to [grain_min, grain_max]
vmull.s16 q0, d4, d28
|
||||
vmull.s16 q1, d5, d28
|
||||
vmull.s16 q2, d6, d28
|
||||
vmull.s16 q3, d7, d28
|
||||
.if !\ox
|
||||
vdup.16 q7, d30[3] // bitdepth_max
|
||||
.endif
|
||||
vmlal.s16 q0, d16, d29
|
||||
vmlal.s16 q1, d17, d29
|
||||
vmlal.s16 q2, d18, d29
|
||||
vmlal.s16 q3, d19, d29
|
||||
.if !\ox
|
||||
vshr.u16 q7, q7, #1 // grain_max
|
||||
.endif
|
||||
vmull.s16 q8, d20, d29
|
||||
vmull.s16 q9, d21, d29
|
||||
vmull.s16 q10, d22, d29
|
||||
vmull.s16 q11, d23, d29
|
||||
.if !\ox
|
||||
vmvn q6, q7 // grain_min
|
||||
.endif
|
||||
vmlal.s16 q8, d8, d28
|
||||
vmlal.s16 q9, d9, d28
|
||||
vmlal.s16 q10, d10, d28
|
||||
vmlal.s16 q11, d11, d28
|
||||
vqrshrn.s32 d0, q0, #5
|
||||
vqrshrn.s32 d1, q1, #5
|
||||
vqrshrn.s32 d2, q2, #5
|
||||
vqrshrn.s32 d3, q3, #5
|
||||
vqrshrn.s32 d4, q8, #5
|
||||
vqrshrn.s32 d5, q9, #5
|
||||
vqrshrn.s32 d6, q10, #5
|
||||
vqrshrn.s32 d7, q11, #5
|
||||
vmin.s16 q8, q0, q7
|
||||
vmin.s16 q9, q1, q7
|
||||
vld1.16 {q0, q1}, [r6, :128]! // luma
|
||||
vmin.s16 q10, q2, q7
|
||||
vmin.s16 q11, q3, q7
|
||||
vmax.s16 q8, q8, q6
|
||||
vmax.s16 q9, q9, q6
|
||||
vld1.16 {q2, q3}, [r6, :128], r7 // luma
|
||||
vmax.s16 q10, q10, q6
|
||||
vmax.s16 q11, q11, q6
|
||||
.elseif \ox
|
||||
vqrshrn.s32 d16, q0, #5
|
||||
vld1.16 {q0, q1}, [r6, :128]! // luma
|
||||
vmin.s16 d16, d16, d14
|
||||
vld1.16 {q2, q3}, [r6, :128], r7 // luma
|
||||
vmax.s16 d16, d16, d12
|
||||
.endif
|
||||
|
||||
.if !\csfl
|
||||
// Value used for the scaling lookup:
// ((luma * uv_luma_mult + src * uv_mult) >> 6) + uv_offset,
// clamped to [0, bitdepth_max]
vdup.16 d28, d30[0] // uv_luma_mult
|
||||
vld1.16 {q4, q5}, [r1, :128]! // src
|
||||
vdup.16 d29, d30[1] // uv_mult
|
||||
vmull.s16 q6, d0, d28
|
||||
vmull.s16 q7, d1, d28
|
||||
vmull.s16 q0, d2, d28
|
||||
vmull.s16 q1, d3, d28
|
||||
vmlal.s16 q6, d8, d29
|
||||
vmlal.s16 q7, d9, d29
|
||||
vmlal.s16 q0, d10, d29
|
||||
vmlal.s16 q1, d11, d29
|
||||
vld1.16 {q4, q5}, [r1, :128] // src
|
||||
// Rewind r1; the same src row is loaded again after the scaling lookup
sub r1, r1, #32
|
||||
vshrn.s32 d12, q6, #6
|
||||
vshrn.s32 d13, q7, #6
|
||||
vshrn.s32 d14, q0, #6
|
||||
vshrn.s32 d15, q1, #6
|
||||
vmull.s16 q0, d4, d28
|
||||
vmull.s16 q1, d5, d28
|
||||
vmull.s16 q2, d6, d28
|
||||
vmull.s16 q3, d7, d28
|
||||
vmlal.s16 q0, d8, d29
|
||||
vmlal.s16 q1, d9, d29
|
||||
vmlal.s16 q2, d10, d29
|
||||
vmlal.s16 q3, d11, d29
|
||||
vdup.16 q14, d30[2] // uv_offset
|
||||
vshrn.s32 d0, q0, #6
|
||||
vshrn.s32 d1, q1, #6
|
||||
vshrn.s32 d2, q2, #6
|
||||
vshrn.s32 d3, q3, #6
|
||||
vdup.16 q4, d30[3] // bitdepth_max
|
||||
vmov.i16 q5, #0
|
||||
vadd.i16 q6, q6, q14
|
||||
vadd.i16 q7, q7, q14
|
||||
vadd.i16 q2, q0, q14
|
||||
vadd.i16 q3, q1, q14
|
||||
vmin.s16 q0, q6, q4
|
||||
vmin.s16 q1, q7, q4
|
||||
vmin.s16 q2, q2, q4
|
||||
vmin.s16 q3, q3, q4
|
||||
vmax.s16 q0, q0, q5
|
||||
vmax.s16 q1, q1, q5
|
||||
vmax.s16 q2, q2, q5
|
||||
vmax.s16 q3, q3, q5
|
||||
.else
|
||||
vdup.16 q14, d30[3] // bitdepth_max
|
||||
// Make sure that uninitialized pixels out of range past the right
|
||||
// edge are in range; their actual values shouldn't matter.
|
||||
vand q0, q0, q14
|
||||
vand q1, q1, q14
|
||||
vand q2, q2, q14
|
||||
vand q3, q3, q14
|
||||
.endif
|
||||
|
||||
bl gather32_neon
|
||||
|
||||
vld1.16 {q0, q1}, [r1, :128]! // src
|
||||
|
||||
vmovl.u8 q6, d8 // scaling
|
||||
vmovl.u8 q7, d9
|
||||
vmovl.u8 q4, d10
|
||||
vmovl.u8 q5, d11
|
||||
|
||||
vld1.16 {q2, q3}, [r1, :128], r2 // src
|
||||
|
||||
vshl.u16 q6, q6, q13 // scaling << (15 - scaling_shift)
|
||||
vshl.u16 q7, q7, q13
|
||||
vshl.u16 q4, q4, q13
|
||||
vshl.u16 q5, q5, q13
|
||||
|
||||
vqrdmulh.s16 q8, q8, q6 // round2((scaling << (15 - scaling_shift)) * grain, 15)
|
||||
vqrdmulh.s16 q9, q9, q7
|
||||
vqrdmulh.s16 q10, q10, q4
|
||||
vqrdmulh.s16 q11, q11, q5
|
||||
|
||||
|
||||
vdup.16 q4, d31[0] // clip_min
|
||||
vdup.16 q5, d31[1] // clip_max
|
||||
|
||||
vqadd.s16 q0, q0, q8 // *src + noise
|
||||
vqadd.s16 q1, q1, q9
|
||||
vqadd.s16 q2, q2, q10
|
||||
vqadd.s16 q3, q3, q11
|
||||
|
||||
.if \oy
|
||||
vmov.32 lr, d25[0] // 2 first 16 bit coeffs from overlap x
|
||||
.endif
|
||||
|
||||
vmax.s16 q0, q0, q4
|
||||
vmax.s16 q1, q1, q4
|
||||
vmax.s16 q2, q2, q4
|
||||
vmax.s16 q3, q3, q4
|
||||
vmin.s16 q0, q0, q5
|
||||
vmin.s16 q1, q1, q5
|
||||
vmin.s16 q2, q2, q5
|
||||
vmin.s16 q3, q3, q5
|
||||
|
||||
vst1.16 {q0, q1}, [r0, :128]! // dst
|
||||
|
||||
subs r9, r9, #1
|
||||
.if \oy
|
||||
vmov.32 d31[1], lr // new coeffs for overlap y
|
||||
.endif
|
||||
|
||||
vst1.16 {q2, q3}, [r0, :128], r2 // dst
|
||||
bgt 1b
|
||||
|
||||
.if \oy
|
||||
// The y overlap rows are done; process the rows that are left (if any) in
// the variant without y overlap, entering at its loopstart label.
cmp r12, #0
|
||||
mov r9, r12 // restore actual remaining h
|
||||
bgt L(fguv_loop_sx0_csfl\csfl\()_\ox\()0_loopstart)
|
||||
.endif
|
||||
b 9f
|
||||
.endm
|
||||
fguv_loop_sx0 0, 0, 0
|
||||
fguv_loop_sx0 0, 0, 1
|
||||
fguv_loop_sx0 0, 1, 0
|
||||
fguv_loop_sx0 0, 1, 1
|
||||
fguv_loop_sx0 1, 0, 0
|
||||
fguv_loop_sx0 1, 0, 1
|
||||
fguv_loop_sx0 1, 1, 0
|
||||
fguv_loop_sx0 1, 1, 1
|
||||
|
||||
// Shared epilogue of all variants above
9:
|
||||
vpop {q4-q7}
|
||||
pop {r4-r11,pc}
|
||||
endfunc
|
||||
|
||||
// Row loops for the chroma film grain functions expanded with sx = 1. They
// are entered with "bx r12" from the fguv setup code through
// L(fguv_loop_sx1_tbl), not called. The table index is "type": bit 0 selects
// y overlap (oy), bit 1 x overlap (ox) and bit 2 the csfl variants.
//
// Register state expected on entry, as prepared by the setup code:
//   r0 = dst, r1 = src, r2 = stride, r6 = luma_row, r7 = luma stride - 32
//   r5 = grain_lut, r4 = grain_lut old, r8 = grain_lut top,
//   r11 = grain_lut top old, r10 = grain_lut stride
//   r9 = rows to process in this pass, lr = rows left afterwards (oy only)
//   d24/d25 = overlap_coeffs, q13 = 15 - scaling_shift
//   d30 = { uv_luma_mult, uv_mult, uv_offset, bitdepth_max }
//   d31 = { clip_min, clip_max, overlap y [0], overlap y [1] }
// Every iteration reads 32 luma pixels and produces one row of 16 output
// pixels. All variants leave through the shared epilogue at label 9, which
// pops q4-q7 and r4-r11/pc and thereby returns from the fguv function that
// jumped here.
// gather16_neon is defined elsewhere; judging from its use here it takes the
// values in q0-q1 and returns the matching scaling bytes in d8-d9.
function fguv_loop_sx1_neon
|
||||
L(fguv_loop_sx1_tbl):
|
||||
.word L(fguv_loop_sx1_csfl0_00) - L(fguv_loop_sx1_tbl) + CONFIG_THUMB
|
||||
.word L(fguv_loop_sx1_csfl0_01) - L(fguv_loop_sx1_tbl) + CONFIG_THUMB
|
||||
.word L(fguv_loop_sx1_csfl0_10) - L(fguv_loop_sx1_tbl) + CONFIG_THUMB
|
||||
.word L(fguv_loop_sx1_csfl0_11) - L(fguv_loop_sx1_tbl) + CONFIG_THUMB
|
||||
.word L(fguv_loop_sx1_csfl1_00) - L(fguv_loop_sx1_tbl) + CONFIG_THUMB
|
||||
.word L(fguv_loop_sx1_csfl1_01) - L(fguv_loop_sx1_tbl) + CONFIG_THUMB
|
||||
.word L(fguv_loop_sx1_csfl1_10) - L(fguv_loop_sx1_tbl) + CONFIG_THUMB
|
||||
.word L(fguv_loop_sx1_csfl1_11) - L(fguv_loop_sx1_tbl) + CONFIG_THUMB
|
||||
|
||||
// fguv_loop_sx1 csfl, ox, oy: emits the loop variant
// L(fguv_loop_sx1_csfl<csfl>_<ox><oy>).
.macro fguv_loop_sx1 csfl, ox, oy
|
||||
L(fguv_loop_sx1_csfl\csfl\()_\ox\oy):
|
||||
.if \oy
|
||||
// Keep the remaining row count in r12; lr is overwritten by the bl below
mov r12, lr
|
||||
.endif
|
||||
1:
|
||||
.if \ox
|
||||
vld1.16 {d0}, [r4], r10 // grain_lut old
|
||||
.endif
|
||||
.if \ox && \oy
|
||||
vld1.16 {d2}, [r11], r10 // grain_lut top old
|
||||
.endif
|
||||
.if \oy
|
||||
vld1.16 {q2, q3}, [r8], r10 // grain_lut top
|
||||
.endif
|
||||
.if !\ox && !\oy
|
||||
vld1.16 {q0, q1}, [r6, :128]! // luma
|
||||
.endif
|
||||
vld1.16 {q8, q9}, [r5], r10 // grain_lut
|
||||
.if \oy
|
||||
vdup.16 d28, d31[2] // overlap y coeff
|
||||
vdup.16 d29, d31[3] // overlap y coeff
|
||||
.endif
|
||||
.if !\ox && !\oy
|
||||
vld1.16 {q2, q3}, [r6, :128], r7 // luma
|
||||
.endif
|
||||
|
||||
.if \ox
|
||||
vdup.16 q7, d30[3] // bitdepth_max
|
||||
vmull.s16 q0, d0, d24
|
||||
vshr.u16 q7, q7, #1 // grain_max
|
||||
vmlal.s16 q0, d16, d25
|
||||
vmvn q6, q7 // grain_min
|
||||
.endif
|
||||
|
||||
.if \oy
|
||||
.if \ox
|
||||
vmull.s16 q1, d2, d24
|
||||
vmlal.s16 q1, d4, d25
|
||||
vqrshrn.s32 d16, q0, #5
|
||||
vqrshrn.s32 d4, q1, #5
|
||||
vmin.s16 d4, d4, d14
|
||||
vmin.s16 d16, d16, d14
|
||||
vmax.s16 d4, d4, d12
|
||||
vmax.s16 d16, d16, d12
|
||||
.endif
|
||||
|
||||
// y overlap: top * d28 + current * d29, then rounding shift by 5 and
// clamping to [grain_min, grain_max]
vmull.s16 q0, d4, d28
|
||||
vmull.s16 q1, d5, d28
|
||||
vmull.s16 q2, d6, d28
|
||||
vmull.s16 q3, d7, d28
|
||||
.if !\ox
|
||||
vdup.16 q7, d30[3] // bitdepth_max
|
||||
.endif
|
||||
vmlal.s16 q0, d16, d29
|
||||
vmlal.s16 q1, d17, d29
|
||||
vmlal.s16 q2, d18, d29
|
||||
vmlal.s16 q3, d19, d29
|
||||
.if !\ox
|
||||
vshr.u16 q7, q7, #1 // grain_max
|
||||
.endif
|
||||
vqrshrn.s32 d16, q0, #5
|
||||
vqrshrn.s32 d17, q1, #5
|
||||
vqrshrn.s32 d18, q2, #5
|
||||
vqrshrn.s32 d19, q3, #5
|
||||
.if !\ox
|
||||
vmvn q6, q7 // grain_min
|
||||
.endif
|
||||
vld1.16 {q0, q1}, [r6, :128]! // luma
|
||||
vmin.s16 q8, q8, q7
|
||||
vmin.s16 q9, q9, q7
|
||||
vmax.s16 q8, q8, q6
|
||||
vmax.s16 q9, q9, q6
|
||||
vld1.16 {q2, q3}, [r6, :128], r7 // luma
|
||||
.elseif \ox
|
||||
vqrshrn.s32 d16, q0, #5
|
||||
vld1.16 {q0, q1}, [r6, :128]! // luma
|
||||
vmin.s16 d16, d16, d14
|
||||
vld1.16 {q2, q3}, [r6, :128], r7 // luma
|
||||
vmax.s16 d16, d16, d12
|
||||
.endif
|
||||
|
||||
// Add horizontally adjacent luma pairs and halve with rounding, reducing
// the 32 luma values in q0-q3 to 16 values in q0-q1
vpadd.i16 d0, d0, d1
|
||||
vpadd.i16 d1, d2, d3
|
||||
vpadd.i16 d2, d4, d5
|
||||
vpadd.i16 d3, d6, d7
|
||||
vrshr.u16 q0, q0, #1
|
||||
vrshr.u16 q1, q1, #1
|
||||
.if !\csfl
|
||||
// Value used for the scaling lookup:
// ((luma * uv_luma_mult + src * uv_mult) >> 6) + uv_offset,
// clamped to [0, bitdepth_max]
vdup.16 d28, d30[0] // uv_luma_mult
|
||||
vld1.16 {q2, q3}, [r1, :128], r2 // src
|
||||
vdup.16 d29, d30[1] // uv_mult
|
||||
vmull.s16 q6, d0, d28
|
||||
vmull.s16 q7, d1, d28
|
||||
vmull.s16 q0, d2, d28
|
||||
vmull.s16 q1, d3, d28
|
||||
vmlal.s16 q6, d4, d29
|
||||
vmlal.s16 q7, d5, d29
|
||||
vmlal.s16 q0, d6, d29
|
||||
vmlal.s16 q1, d7, d29
|
||||
vshrn.s32 d12, q6, #6
|
||||
vshrn.s32 d13, q7, #6
|
||||
vshrn.s32 d14, q0, #6
|
||||
vshrn.s32 d15, q1, #6
|
||||
vdup.16 q14, d30[2] // uv_offset
|
||||
vdup.16 q4, d30[3] // bitdepth_max
|
||||
vmov.i16 q5, #0
|
||||
vadd.i16 q6, q6, q14
|
||||
vadd.i16 q7, q7, q14
|
||||
vmin.s16 q0, q6, q4
|
||||
vmin.s16 q1, q7, q4
|
||||
vmax.s16 q0, q0, q5
|
||||
vmax.s16 q1, q1, q5
|
||||
.else
|
||||
vdup.16 q14, d30[3] // bitdepth_max
|
||||
vld1.16 {q2, q3}, [r1, :128], r2 // src
|
||||
|
||||
// Make sure that uninitialized pixels out of range past the right
|
||||
// edge are in range; their actual values shouldn't matter.
|
||||
vand q0, q0, q14
|
||||
vand q1, q1, q14
|
||||
.endif
|
||||
|
||||
bl gather16_neon
|
||||
|
||||
vmovl.u8 q6, d8 // scaling
|
||||
vmovl.u8 q7, d9
|
||||
|
||||
vshl.u16 q6, q6, q13 // scaling << (15 - scaling_shift)
|
||||
vshl.u16 q7, q7, q13
|
||||
|
||||
vqrdmulh.s16 q8, q8, q6 // round2((scaling << (15 - scaling_shift)) * grain, 15)
|
||||
vqrdmulh.s16 q9, q9, q7
|
||||
|
||||
|
||||
vdup.16 q4, d31[0] // clip_min
|
||||
vdup.16 q5, d31[1] // clip_max
|
||||
|
||||
vqadd.s16 q0, q2, q8 // *src + noise
|
||||
vqadd.s16 q1, q3, q9
|
||||
|
||||
.if \oy
|
||||
// Swap the two last coefficients of d31, place them first in d28
|
||||
vrev64.16 d28, d31
|
||||
.endif
|
||||
|
||||
vmax.s16 q0, q0, q4
|
||||
vmax.s16 q1, q1, q4
|
||||
vmin.s16 q0, q0, q5
|
||||
vmin.s16 q1, q1, q5
|
||||
|
||||
subs r9, r9, #1
|
||||
.if \oy
|
||||
// Take the first two 16 bit coefficients of d28 and place them at the
|
||||
// end of d31
|
||||
vtrn.32 d31, d28
|
||||
.endif
|
||||
|
||||
vst1.16 {q0, q1}, [r0, :128], r2 // dst
|
||||
bgt 1b
|
||||
|
||||
.if \oy
|
||||
// The y overlap rows are done; process the rows that are left (if any) in
// the variant without y overlap.
cmp r12, #0
|
||||
mov r9, r12 // restore actual remaining h
|
||||
bgt L(fguv_loop_sx1_csfl\csfl\()_\ox\()0)
|
||||
.endif
|
||||
|
||||
b 9f
|
||||
.endm
|
||||
fguv_loop_sx1 0, 0, 0
|
||||
fguv_loop_sx1 0, 0, 1
|
||||
fguv_loop_sx1 0, 1, 0
|
||||
fguv_loop_sx1 0, 1, 1
|
||||
fguv_loop_sx1 1, 0, 0
|
||||
fguv_loop_sx1 1, 0, 1
|
||||
fguv_loop_sx1 1, 1, 0
|
||||
fguv_loop_sx1 1, 1, 1
|
||||
|
||||
// Shared epilogue of all variants above
9:
|
||||
vpop {q4-q7}
|
||||
pop {r4-r11,pc}
|
||||
endfunc
|
275
third_party/dav1d/src/arm/64/film_grain.S
vendored
275
third_party/dav1d/src/arm/64/film_grain.S
vendored
@ -232,12 +232,14 @@ function sum_lag1_above_neon
|
||||
smull2 v5.8h, v0.16b, v27.16b
|
||||
smull v6.8h, v1.8b, v29.8b
|
||||
smull2 v7.8h, v1.16b, v29.16b
|
||||
add v2.8h, v2.8h, v4.8h
|
||||
add v3.8h, v3.8h, v5.8h
|
||||
saddl v4.4s, v2.4h, v6.4h
|
||||
saddl2 v5.4s, v2.8h, v6.8h
|
||||
saddl v6.4s, v3.4h, v7.4h
|
||||
saddl2 v7.4s, v3.8h, v7.8h
|
||||
saddl v0.4s, v2.4h, v4.4h
|
||||
saddl2 v1.4s, v2.8h, v4.8h
|
||||
saddl v2.4s, v3.4h, v5.4h
|
||||
saddl2 v3.4s, v3.8h, v5.8h
|
||||
saddw v4.4s, v0.4s, v6.4h
|
||||
saddw2 v5.4s, v1.4s, v6.8h
|
||||
saddw v6.4s, v2.4s, v7.4h
|
||||
saddw2 v7.4s, v3.4s, v7.8h
|
||||
ret
|
||||
endfunc
|
||||
|
||||
@ -450,14 +452,18 @@ function sum_lag2_above_neon
|
||||
smull2 v7.8h, v0.16b, v28.16b
|
||||
smull v0.8h, v1.8b, v29.8b
|
||||
smull2 v1.8h, v1.16b, v29.16b
|
||||
add v2.8h, v2.8h, v4.8h
|
||||
add v3.8h, v3.8h, v5.8h
|
||||
add v0.8h, v0.8h, v6.8h
|
||||
add v1.8h, v1.8h, v7.8h
|
||||
saddl v4.4s, v2.4h, v0.4h
|
||||
saddl2 v5.4s, v2.8h, v0.8h
|
||||
saddl v6.4s, v3.4h, v1.4h
|
||||
saddl2 v7.4s, v3.8h, v1.8h
|
||||
saddl v22.4s, v2.4h, v4.4h
|
||||
saddl2 v23.4s, v2.8h, v4.8h
|
||||
saddl v26.4s, v3.4h, v5.4h
|
||||
saddl2 v27.4s, v3.8h, v5.8h
|
||||
saddl v2.4s, v0.4h, v6.4h
|
||||
saddl2 v3.4s, v0.8h, v6.8h
|
||||
saddl v6.4s, v1.4h, v7.4h
|
||||
saddl2 v7.4s, v1.8h, v7.8h
|
||||
add v4.4s, v22.4s, v2.4s
|
||||
add v5.4s, v23.4s, v3.4s
|
||||
add v6.4s, v26.4s, v6.4s
|
||||
add v7.4s, v27.4s, v7.4s
|
||||
|
||||
ext v22.16b, v19.16b, v20.16b, #14 // top left, top mid
|
||||
dup v26.16b, v30.b[5]
|
||||
@ -476,14 +482,18 @@ function sum_lag2_above_neon
|
||||
smull2 v27.8h, v0.16b, v28.16b
|
||||
smull v28.8h, v1.8b, v29.8b
|
||||
smull2 v29.8h, v1.16b, v29.16b
|
||||
add v2.8h, v2.8h, v22.8h
|
||||
add v3.8h, v3.8h, v23.8h
|
||||
add v26.8h, v26.8h, v28.8h
|
||||
add v27.8h, v27.8h, v29.8h
|
||||
saddl v0.4s, v2.4h, v26.4h
|
||||
saddl2 v1.4s, v2.8h, v26.8h
|
||||
saddl v2.4s, v3.4h, v27.4h
|
||||
saddl2 v3.4s, v3.8h, v27.8h
|
||||
saddl v0.4s, v2.4h, v22.4h
|
||||
saddl2 v1.4s, v2.8h, v22.8h
|
||||
saddl v2.4s, v3.4h, v23.4h
|
||||
saddl2 v3.4s, v3.8h, v23.8h
|
||||
saddl v22.4s, v26.4h, v28.4h
|
||||
saddl2 v23.4s, v26.8h, v28.8h
|
||||
saddl v26.4s, v27.4h, v29.4h
|
||||
saddl2 v27.4s, v27.8h, v29.8h
|
||||
add v0.4s, v0.4s, v22.4s
|
||||
add v1.4s, v1.4s, v23.4s
|
||||
add v2.4s, v2.4s, v26.4s
|
||||
add v3.4s, v3.4s, v27.4s
|
||||
dup v26.16b, v30.b[2]
|
||||
dup v27.16b, v30.b[7]
|
||||
smull v22.8h, v17.8b, v26.8b
|
||||
@ -498,14 +508,16 @@ function sum_lag2_above_neon
|
||||
mov v16.16b, v17.16b
|
||||
mov v17.16b, v18.16b
|
||||
|
||||
add v22.8h, v22.8h, v24.8h
|
||||
add v23.8h, v23.8h, v25.8h
|
||||
saddl v0.4s, v22.4h, v24.4h
|
||||
saddl2 v1.4s, v22.8h, v24.8h
|
||||
saddl v2.4s, v23.4h, v25.4h
|
||||
saddl2 v3.4s, v23.8h, v25.8h
|
||||
mov v19.16b, v20.16b
|
||||
mov v20.16b, v21.16b
|
||||
saddw v4.4s, v4.4s, v22.4h
|
||||
saddw2 v5.4s, v5.4s, v22.8h
|
||||
saddw v6.4s, v6.4s, v23.4h
|
||||
saddw2 v7.4s, v7.4s, v23.8h
|
||||
add v4.4s, v4.4s, v0.4s
|
||||
add v5.4s, v5.4s, v1.4s
|
||||
add v6.4s, v6.4s, v2.4s
|
||||
add v7.4s, v7.4s, v3.4s
|
||||
ret
|
||||
endfunc
|
||||
|
||||
@ -711,32 +723,38 @@ function sum_lag3_above_neon
|
||||
smull2 v3.8h, v9.16b, v23.16b
|
||||
smull v8.8h, v10.8b, v24.8b
|
||||
smull2 v9.8h, v10.16b, v24.16b
|
||||
add v0.8h, v0.8h, v2.8h
|
||||
add v1.8h, v1.8h, v3.8h
|
||||
smull v10.8h, v11.8b, v26.8b
|
||||
smull2 v11.8h, v11.16b, v26.16b
|
||||
smull v2.8h, v12.8b, v27.8b
|
||||
smull2 v3.8h, v12.16b, v27.16b
|
||||
add v8.8h, v8.8h, v10.8h
|
||||
add v9.8h, v9.8h, v11.8h
|
||||
saddl v22.4s, v0.4h, v2.4h
|
||||
saddl2 v23.4s, v0.8h, v2.8h
|
||||
saddl v24.4s, v1.4h, v3.4h
|
||||
saddl2 v26.4s, v1.8h, v3.8h
|
||||
saddl v0.4s, v8.4h, v10.4h
|
||||
saddl2 v1.4s, v8.8h, v10.8h
|
||||
saddl v2.4s, v9.4h, v11.4h
|
||||
saddl2 v3.4s, v9.8h, v11.8h
|
||||
smull v8.8h, v12.8b, v27.8b
|
||||
smull2 v9.8h, v12.16b, v27.16b
|
||||
smull v10.8h, v13.8b, v28.8b
|
||||
smull2 v11.8h, v13.16b, v28.16b
|
||||
saddl v4.4s, v0.4h, v8.4h
|
||||
saddl2 v5.4s, v0.8h, v8.8h
|
||||
saddl v6.4s, v1.4h, v9.4h
|
||||
saddl2 v7.4s, v1.8h, v9.8h
|
||||
smull v8.8h, v14.8b, v25.8b
|
||||
smull2 v9.8h, v14.16b, v25.16b
|
||||
add v2.8h, v2.8h, v10.8h
|
||||
add v3.8h, v3.8h, v11.8h
|
||||
saddl v0.4s, v2.4h, v8.4h
|
||||
saddl2 v1.4s, v2.8h, v8.8h
|
||||
saddl v2.4s, v3.4h, v9.4h
|
||||
saddl2 v3.4s, v3.8h, v9.8h
|
||||
smull v12.8h, v14.8b, v25.8b
|
||||
smull2 v13.8h, v14.16b, v25.16b
|
||||
add v4.4s, v22.4s, v0.4s
|
||||
add v5.4s, v23.4s, v1.4s
|
||||
add v6.4s, v24.4s, v2.4s
|
||||
add v7.4s, v26.4s, v3.4s
|
||||
saddl v0.4s, v8.4h, v10.4h
|
||||
saddl2 v1.4s, v8.8h, v10.8h
|
||||
saddl v2.4s, v9.4h, v11.4h
|
||||
saddl2 v3.4s, v9.8h, v11.8h
|
||||
add v4.4s, v4.4s, v0.4s
|
||||
add v5.4s, v5.4s, v1.4s
|
||||
add v6.4s, v6.4s, v2.4s
|
||||
add v7.4s, v7.4s, v3.4s
|
||||
saddw v4.4s, v4.4s, v12.4h
|
||||
saddw2 v5.4s, v5.4s, v12.8h
|
||||
saddw v6.4s, v6.4s, v13.4h
|
||||
saddw2 v7.4s, v7.4s, v13.8h
|
||||
|
||||
ext v8.16b, v16.16b, v17.16b, #13 // top left, top mid
|
||||
dup v22.16b, v29.b[7]
|
||||
@ -758,36 +776,42 @@ function sum_lag3_above_neon
|
||||
smull2 v3.8h, v9.16b, v23.16b
|
||||
smull v8.8h, v10.8b, v24.8b
|
||||
smull2 v9.8h, v10.16b, v24.16b
|
||||
add v0.8h, v0.8h, v2.8h
|
||||
add v1.8h, v1.8h, v3.8h
|
||||
smull v10.8h, v11.8b, v26.8b
|
||||
smull2 v11.8h, v11.16b, v26.16b
|
||||
smull v2.8h, v12.8b, v27.8b
|
||||
smull2 v3.8h, v12.16b, v27.16b
|
||||
add v8.8h, v8.8h, v10.8h
|
||||
add v9.8h, v9.8h, v11.8h
|
||||
saddl v22.4s, v0.4h, v2.4h
|
||||
saddl2 v23.4s, v0.8h, v2.8h
|
||||
saddl v24.4s, v1.4h, v3.4h
|
||||
saddl2 v26.4s, v1.8h, v3.8h
|
||||
saddl v0.4s, v8.4h, v10.4h
|
||||
saddl2 v1.4s, v8.8h, v10.8h
|
||||
saddl v2.4s, v9.4h, v11.4h
|
||||
saddl2 v3.4s, v9.8h, v11.8h
|
||||
smull v8.8h, v12.8b, v27.8b
|
||||
smull2 v9.8h, v12.16b, v27.16b
|
||||
smull v10.8h, v13.8b, v28.8b
|
||||
smull2 v11.8h, v13.16b, v28.16b
|
||||
saddl v12.4s, v0.4h, v8.4h
|
||||
saddl2 v13.4s, v0.8h, v8.8h
|
||||
saddl v0.4s, v1.4h, v9.4h
|
||||
saddl2 v1.4s, v1.8h, v9.8h
|
||||
smull v8.8h, v17.8b, v25.8b
|
||||
smull2 v9.8h, v17.16b, v25.16b
|
||||
add v2.8h, v2.8h, v10.8h
|
||||
add v3.8h, v3.8h, v11.8h
|
||||
add v4.4s, v4.4s, v12.4s
|
||||
add v5.4s, v5.4s, v13.4s
|
||||
add v6.4s, v6.4s, v0.4s
|
||||
add v7.4s, v7.4s, v1.4s
|
||||
saddl v0.4s, v2.4h, v8.4h
|
||||
saddl2 v1.4s, v2.8h, v8.8h
|
||||
saddl v2.4s, v3.4h, v9.4h
|
||||
saddl2 v3.4s, v3.8h, v9.8h
|
||||
smull v12.8h, v17.8b, v25.8b
|
||||
smull2 v13.8h, v17.16b, v25.16b
|
||||
add v22.4s, v22.4s, v0.4s
|
||||
add v23.4s, v23.4s, v1.4s
|
||||
add v24.4s, v24.4s, v2.4s
|
||||
add v26.4s, v26.4s, v3.4s
|
||||
saddl v0.4s, v8.4h, v10.4h
|
||||
saddl2 v1.4s, v8.8h, v10.8h
|
||||
saddl v2.4s, v9.4h, v11.4h
|
||||
saddl2 v3.4s, v9.8h, v11.8h
|
||||
add v4.4s, v4.4s, v22.4s
|
||||
add v5.4s, v5.4s, v23.4s
|
||||
add v6.4s, v6.4s, v24.4s
|
||||
add v7.4s, v7.4s, v26.4s
|
||||
add v4.4s, v4.4s, v0.4s
|
||||
add v5.4s, v5.4s, v1.4s
|
||||
add v6.4s, v6.4s, v2.4s
|
||||
add v7.4s, v7.4s, v3.4s
|
||||
saddw v4.4s, v4.4s, v12.4h
|
||||
saddw2 v5.4s, v5.4s, v12.8h
|
||||
saddw v6.4s, v6.4s, v13.4h
|
||||
saddw2 v7.4s, v7.4s, v13.8h
|
||||
|
||||
ext v8.16b, v19.16b, v20.16b, #13 // top left, top mid
|
||||
dup v22.16b, v29.b[14]
|
||||
@ -809,42 +833,46 @@ function sum_lag3_above_neon
|
||||
smull2 v3.8h, v9.16b, v23.16b
|
||||
smull v8.8h, v10.8b, v24.8b
|
||||
smull2 v9.8h, v10.16b, v24.16b
|
||||
add v0.8h, v0.8h, v2.8h
|
||||
add v1.8h, v1.8h, v3.8h
|
||||
smull v10.8h, v11.8b, v26.8b
|
||||
smull2 v11.8h, v11.16b, v26.16b
|
||||
smull v2.8h, v12.8b, v27.8b
|
||||
smull2 v3.8h, v12.16b, v27.16b
|
||||
add v8.8h, v8.8h, v10.8h
|
||||
add v9.8h, v9.8h, v11.8h
|
||||
saddl v22.4s, v0.4h, v2.4h
|
||||
saddl2 v23.4s, v0.8h, v2.8h
|
||||
saddl v24.4s, v1.4h, v3.4h
|
||||
saddl2 v26.4s, v1.8h, v3.8h
|
||||
saddl v0.4s, v8.4h, v10.4h
|
||||
saddl2 v1.4s, v8.8h, v10.8h
|
||||
saddl v2.4s, v9.4h, v11.4h
|
||||
saddl2 v3.4s, v9.8h, v11.8h
|
||||
smull v8.8h, v12.8b, v27.8b
|
||||
smull2 v9.8h, v12.16b, v27.16b
|
||||
smull v10.8h, v13.8b, v28.8b
|
||||
smull2 v11.8h, v13.16b, v28.16b
|
||||
saddl v12.4s, v0.4h, v8.4h
|
||||
saddl2 v13.4s, v0.8h, v8.8h
|
||||
saddl v0.4s, v1.4h, v9.4h
|
||||
saddl2 v1.4s, v1.8h, v9.8h
|
||||
smull v8.8h, v20.8b, v25.8b
|
||||
smull2 v9.8h, v20.16b, v25.16b
|
||||
add v2.8h, v2.8h, v10.8h
|
||||
add v3.8h, v3.8h, v11.8h
|
||||
add v4.4s, v4.4s, v12.4s
|
||||
add v5.4s, v5.4s, v13.4s
|
||||
add v6.4s, v6.4s, v0.4s
|
||||
add v7.4s, v7.4s, v1.4s
|
||||
saddl v0.4s, v2.4h, v8.4h
|
||||
saddl2 v1.4s, v2.8h, v8.8h
|
||||
saddl v2.4s, v3.4h, v9.4h
|
||||
saddl2 v3.4s, v3.8h, v9.8h
|
||||
smull v12.8h, v20.8b, v25.8b
|
||||
smull2 v19.8h, v20.16b, v25.16b
|
||||
add v22.4s, v22.4s, v0.4s
|
||||
add v23.4s, v23.4s, v1.4s
|
||||
add v24.4s, v24.4s, v2.4s
|
||||
add v26.4s, v26.4s, v3.4s
|
||||
saddl v0.4s, v8.4h, v10.4h
|
||||
saddl2 v1.4s, v8.8h, v10.8h
|
||||
saddl v2.4s, v9.4h, v11.4h
|
||||
saddl2 v3.4s, v9.8h, v11.8h
|
||||
add v4.4s, v4.4s, v22.4s
|
||||
add v5.4s, v5.4s, v23.4s
|
||||
add v6.4s, v6.4s, v24.4s
|
||||
add v7.4s, v7.4s, v26.4s
|
||||
mov v13.16b, v14.16b
|
||||
mov v14.16b, v15.16b
|
||||
add v4.4s, v4.4s, v0.4s
|
||||
add v5.4s, v5.4s, v1.4s
|
||||
add v6.4s, v6.4s, v2.4s
|
||||
add v7.4s, v7.4s, v3.4s
|
||||
|
||||
mov v13.16b, v14.16b
|
||||
mov v14.16b, v15.16b
|
||||
|
||||
mov v16.16b, v17.16b
|
||||
mov v17.16b, v18.16b
|
||||
saddw v4.4s, v4.4s, v12.4h
|
||||
saddw2 v5.4s, v5.4s, v12.8h
|
||||
saddw v6.4s, v6.4s, v19.4h
|
||||
saddw2 v7.4s, v7.4s, v19.8h
|
||||
|
||||
mov v19.16b, v20.16b
|
||||
mov v20.16b, v21.16b
|
||||
@ -1483,43 +1511,50 @@ gen_grain_44 uv_422
|
||||
|
||||
.macro gather_interleaved dst1, dst2, src1, src2, off
|
||||
umov w14, \src1[0+\off]
|
||||
umov w15, \src2[1+\off]
|
||||
umov w15, \src2[8+\off]
|
||||
umov w16, \src1[2+\off]
|
||||
add x14, x14, x3
|
||||
umov w17, \src2[3+\off]
|
||||
umov w17, \src2[10+\off]
|
||||
add x15, x15, x3
|
||||
ld1 {\dst1}[0+\off], [x14]
|
||||
ld1 {\dst1}[0+\off], [x14]
|
||||
umov w14, \src1[4+\off]
|
||||
add x16, x16, x3
|
||||
ld1 {\dst2}[1+\off], [x15]
|
||||
umov w15, \src2[5+\off]
|
||||
ld1 {\dst2}[8+\off], [x15]
|
||||
umov w15, \src2[12+\off]
|
||||
add x17, x17, x3
|
||||
ld1 {\dst1}[2+\off], [x16]
|
||||
ld1 {\dst1}[2+\off], [x16]
|
||||
umov w16, \src1[6+\off]
|
||||
add x14, x14, x3
|
||||
ld1 {\dst2}[3+\off], [x17]
|
||||
umov w17, \src2[7+\off]
|
||||
ld1 {\dst2}[10+\off], [x17]
|
||||
umov w17, \src2[14+\off]
|
||||
add x15, x15, x3
|
||||
ld1 {\dst1}[4+\off], [x14]
|
||||
ld1 {\dst1}[4+\off], [x14]
|
||||
add x16, x16, x3
|
||||
ld1 {\dst2}[5+\off], [x15]
|
||||
ld1 {\dst2}[12+\off], [x15]
|
||||
add x17, x17, x3
|
||||
ld1 {\dst1}[6+\off], [x16]
|
||||
ld1 {\dst2}[7+\off], [x17]
|
||||
ld1 {\dst1}[6+\off], [x16]
|
||||
ld1 {\dst2}[14+\off], [x17]
|
||||
.endm
|
||||
|
||||
.macro gather dst1, dst2, src1, src2
|
||||
gather_interleaved \dst1, \dst2, \src1, \src2, 0
|
||||
gather_interleaved \dst2, \dst1, \src2, \src1, 0
|
||||
gather_interleaved \dst1, \dst2, \src1, \src2, 8
|
||||
gather_interleaved \dst2, \dst1, \src2, \src1, 8
|
||||
gather_interleaved \dst1, \dst2, \src1, \src2, 1
|
||||
gather_interleaved \dst2, \dst1, \src2, \src1, 1
|
||||
.endm
|
||||
|
||||
function gather_neon
|
||||
function gather32_neon
|
||||
gather v4.b, v5.b, v0.b, v1.b
|
||||
ret
|
||||
endfunc
|
||||
|
||||
function gather16_neon
|
||||
gather_interleaved v4.b, v5.b, v0.b, v0.b, 0
|
||||
gather_interleaved v4.b, v5.b, v0.b, v0.b, 1
|
||||
ins v4.d[1], v5.d[1]
|
||||
ret
|
||||
endfunc
|
||||
|
||||
const overlap_coeffs_0, align=4
|
||||
.byte 27, 17, 0, 0, 0, 0, 0, 0
|
||||
.byte 17, 27, 32, 32, 32, 32, 32, 32
|
||||
@ -1564,7 +1599,7 @@ function fgy_32x32_8bpc_neon, export=1
|
||||
mov x9, #GRAIN_WIDTH // grain_lut stride
|
||||
|
||||
neg w4, w4
|
||||
dup v29.8h, w4 // -scaling_shift
|
||||
dup v29.8h, w4 // -scaling_shift
|
||||
|
||||
movrel x16, overlap_coeffs_0
|
||||
|
||||
@ -1635,7 +1670,7 @@ L(loop_\ox\oy):
|
||||
.endif
|
||||
ld1 {v18.16b, v19.16b}, [x5], x9 // grain_lut
|
||||
|
||||
bl gather_neon
|
||||
bl gather32_neon
|
||||
|
||||
.if \ox
|
||||
smull v20.8h, v20.8b, v27.8b
|
||||
@ -1765,7 +1800,7 @@ endfunc
|
||||
.macro fguv layout, sx, sy
|
||||
function fguv_32x32_\layout\()_8bpc_neon, export=1
|
||||
str x30, [sp, #-32]!
|
||||
stp d8, d9, [sp, #16]
|
||||
str d8, [sp, #16]
|
||||
ldp x8, x9, [sp, #32] // offsets, h
|
||||
ldp x10, x11, [sp, #48] // uv, is_id
|
||||
|
||||
@ -1778,11 +1813,11 @@ function fguv_32x32_\layout\()_8bpc_neon, export=1
|
||||
add x14, x10, #FGD_UV_LUMA_MULT
|
||||
add x15, x10, #FGD_UV_MULT
|
||||
add x10, x10, #FGD_UV_OFFSET
|
||||
ld1 {v8.h}[0], [x14] // uv_luma_mult
|
||||
ld1r {v24.8h}, [x10] // uv_offset
|
||||
ld1 {v8.h}[1], [x15] // uv_mult
|
||||
ld1 {v8.h}[0], [x14] // uv_luma_mult
|
||||
ld1r {v24.8h}, [x10] // uv_offset
|
||||
ld1 {v8.h}[1], [x15] // uv_mult
|
||||
|
||||
dup v29.8h, w13 // -scaling_shift
|
||||
dup v29.8h, w13 // -scaling_shift
|
||||
|
||||
cbz w12, 1f
|
||||
// clip
|
||||
@ -1918,7 +1953,7 @@ L(fguv_loop_sx0_csfl\csfl\()_\ox\oy):
|
||||
sqxtun2 v1.16b, v5.8h
|
||||
.endif
|
||||
|
||||
bl gather_neon
|
||||
bl gather32_neon
|
||||
|
||||
.if \ox
|
||||
smull v20.8h, v20.8b, v27.8b
|
||||
@ -2029,7 +2064,7 @@ L(fguv_loop_sx0_csfl\csfl\()_\ox\oy):
|
||||
fguv_loop_sx0 1, 1, 1
|
||||
|
||||
9:
|
||||
ldp d8, d9, [sp, #16]
|
||||
ldr d8, [sp, #16]
|
||||
ldr x30, [sp], #32
|
||||
ret
|
||||
|
||||
@ -2085,7 +2120,7 @@ L(fguv_loop_sx1_csfl\csfl\()_\ox\oy):
|
||||
sqxtun2 v0.16b, v3.8h
|
||||
.endif
|
||||
|
||||
bl gather_neon
|
||||
bl gather16_neon
|
||||
|
||||
.if \ox
|
||||
smull v20.8h, v20.8b, v27.8b
|
||||
@ -2176,7 +2211,7 @@ L(fguv_loop_sx1_csfl\csfl\()_\ox\oy):
|
||||
fguv_loop_sx1 1, 1, 1
|
||||
|
||||
9:
|
||||
ldp d8, d9, [sp, #16]
|
||||
ldr d8, [sp, #16]
|
||||
ldr x30, [sp], #32
|
||||
ret
|
||||
|
||||
|
558
third_party/dav1d/src/arm/64/film_grain16.S
vendored
558
third_party/dav1d/src/arm/64/film_grain16.S
vendored
@ -27,6 +27,7 @@
|
||||
|
||||
#include "src/arm/asm.S"
|
||||
#include "util.S"
|
||||
#include "src/arm/asm-offsets.h"
|
||||
|
||||
#define GRAIN_WIDTH 82
|
||||
|
||||
@ -64,11 +65,18 @@
|
||||
gather_interleaved \dst2, \dst1, \src4, \src2, 8
|
||||
.endm
|
||||
|
||||
function gather_neon
|
||||
function gather32_neon
|
||||
gather v6.b, v7.b, v0.h, v1.h, v2.h, v3.h
|
||||
ret
|
||||
endfunc
|
||||
|
||||
function gather16_neon
|
||||
gather_interleaved v6.b, v7.b, v0.h, v1.h, 0
|
||||
gather_interleaved v7.b, v6.b, v1.h, v0.h, 0
|
||||
ins v6.d[1], v7.d[0]
|
||||
ret
|
||||
endfunc
|
||||
|
||||
const overlap_coeffs_0, align=4
|
||||
.short 27, 17, 0, 0
|
||||
.short 17, 27, 32, 32
|
||||
@ -110,6 +118,7 @@ function fgy_32x32_16bpc_neon, export=1
|
||||
stp d10, d11, [sp, #32]
|
||||
stp d12, d13, [sp, #48]
|
||||
str d14, [sp, #64]
|
||||
eor w4, w4, #15 // 15 - scaling_shift
|
||||
ldr w11, [x6, #8] // offsets[1][0]
|
||||
ldr w13, [x6, #4] // offsets[0][1]
|
||||
ldr w15, [x6, #12] // offsets[1][1]
|
||||
@ -122,8 +131,7 @@ function fgy_32x32_16bpc_neon, export=1
|
||||
mov x9, #GRAIN_WIDTH*2 // grain_lut stride
|
||||
neg w10, w10 // bitdepth_min_8
|
||||
|
||||
neg w4, w4
|
||||
dup v29.4s, w4 // -scaling_shift
|
||||
dup v29.8h, w4 // 15 - scaling_shift
|
||||
dup v27.8h, w10 // bitdepth_min_8
|
||||
|
||||
movrel x16, overlap_coeffs_0
|
||||
@ -207,7 +215,7 @@ L(loop_\ox\oy):
|
||||
and v1.16b, v1.16b, v4.16b
|
||||
and v2.16b, v2.16b, v4.16b
|
||||
and v3.16b, v3.16b, v4.16b
|
||||
bl gather_neon
|
||||
bl gather32_neon
|
||||
|
||||
.if \ox
|
||||
smull v20.4s, v20.4h, v27.4h
|
||||
@ -268,7 +276,7 @@ L(loop_\ox\oy):
|
||||
smax v19.8h, v19.8h, v25.8h
|
||||
.endif
|
||||
|
||||
uxtl v4.8h, v6.8b // scaling
|
||||
uxtl v4.8h, v6.8b // scaling
|
||||
.if \ox && !\oy
|
||||
sqrshrn v20.4h, v20.4s, #5
|
||||
.endif
|
||||
@ -281,37 +289,18 @@ L(loop_\ox\oy):
|
||||
smax v20.4h, v20.4h, v25.4h
|
||||
.endif
|
||||
uxtl2 v7.8h, v7.16b
|
||||
|
||||
.if \ox && !\oy
|
||||
smull v20.4s, v20.4h, v4.4h // scaling * grain
|
||||
.else
|
||||
smull v20.4s, v16.4h, v4.4h
|
||||
ins v16.d[0], v20.d[0]
|
||||
.endif
|
||||
smull2 v21.4s, v16.8h, v4.8h
|
||||
smull v22.4s, v17.4h, v5.4h
|
||||
smull2 v23.4s, v17.8h, v5.8h
|
||||
smull v16.4s, v18.4h, v6.4h
|
||||
smull2 v17.4s, v18.8h, v6.8h
|
||||
smull v18.4s, v19.4h, v7.4h
|
||||
smull2 v19.4s, v19.8h, v7.8h
|
||||
ushl v4.8h, v4.8h, v29.8h // scaling << (15 - scaling_shift)
|
||||
ushl v5.8h, v5.8h, v29.8h
|
||||
ushl v6.8h, v6.8h, v29.8h
|
||||
ushl v7.8h, v7.8h, v29.8h
|
||||
|
||||
srshl v20.4s, v20.4s, v29.4s // round2(scaling * grain, scaling_shift)
|
||||
srshl v21.4s, v21.4s, v29.4s
|
||||
srshl v22.4s, v22.4s, v29.4s
|
||||
srshl v23.4s, v23.4s, v29.4s
|
||||
srshl v16.4s, v16.4s, v29.4s
|
||||
srshl v17.4s, v17.4s, v29.4s
|
||||
srshl v18.4s, v18.4s, v29.4s
|
||||
srshl v19.4s, v19.4s, v29.4s
|
||||
|
||||
sqxtn v20.4h, v20.4s
|
||||
sqxtn2 v20.8h, v21.4s
|
||||
sqxtn v21.4h, v22.4s
|
||||
sqxtn2 v21.8h, v23.4s
|
||||
sqxtn v22.4h, v16.4s
|
||||
sqxtn2 v22.8h, v17.4s
|
||||
sqxtn v23.4h, v18.4s
|
||||
sqxtn2 v23.8h, v19.4s
|
||||
sqrdmulh v20.8h, v16.8h, v4.8h // round2((scaling << (15 - scaling_shift) * grain, 15)
|
||||
sqrdmulh v21.8h, v17.8h, v5.8h
|
||||
sqrdmulh v22.8h, v18.8h, v6.8h
|
||||
sqrdmulh v23.8h, v19.8h, v7.8h
|
||||
|
||||
usqadd v0.8h, v20.8h // *src + noise
|
||||
usqadd v1.8h, v21.8h
|
||||
@ -359,3 +348,506 @@ L(fgy_loop_tbl):
|
||||
.hword L(fgy_loop_tbl) - L(loop_10)
|
||||
.hword L(fgy_loop_tbl) - L(loop_11)
|
||||
endfunc
|
||||
|
||||
// void dav1d_fguv_32x32_420_16bpc_neon(pixel *const dst,
|
||||
// const pixel *const src,
|
||||
// const ptrdiff_t stride,
|
||||
// const uint8_t scaling[SCALING_SIZE],
|
||||
// const Dav1dFilmGrainData *const data,
|
||||
// const entry grain_lut[][GRAIN_WIDTH],
|
||||
// const pixel *const luma_row,
|
||||
// const ptrdiff_t luma_stride,
|
||||
// const int offsets[][2],
|
||||
// const ptrdiff_t h, const ptrdiff_t uv,
|
||||
// const ptrdiff_t is_id,
|
||||
// const ptrdiff_t type,
|
||||
// const int bitdepth_max);
|
||||
.macro fguv layout, sx, sy
|
||||
function fguv_32x32_\layout\()_16bpc_neon, export=1
|
||||
str x30, [sp, #-80]!
|
||||
stp d8, d9, [sp, #16]
|
||||
stp d10, d11, [sp, #32]
|
||||
stp d12, d13, [sp, #48]
|
||||
stp d14, d15, [sp, #64]
|
||||
|
||||
ldp x8, x9, [sp, #80] // offsets, h
|
||||
ldp x10, x11, [sp, #96] // uv, is_id
|
||||
ldr w16, [sp, #120] // bitdepth_max
|
||||
|
||||
ldr w13, [x4, #FGD_SCALING_SHIFT]
|
||||
ldr w12, [x4, #FGD_CLIP_TO_RESTRICTED_RANGE]
|
||||
dup v23.8h, w16 // bitdepth_max
|
||||
clz w16, w16
|
||||
eor w13, w13, #15 // 15 - scaling_shift
|
||||
sub w16, w16, #24 // -bitdepth_min_8
|
||||
|
||||
// !csfl
|
||||
add x10, x4, x10, lsl #2 // + 4*uv
|
||||
add x14, x10, #FGD_UV_LUMA_MULT
|
||||
add x15, x10, #FGD_UV_MULT
|
||||
add x10, x10, #FGD_UV_OFFSET
|
||||
neg w16, w16 // bitdepth_min_8
|
||||
ld1r {v8.8h}, [x14] // uv_luma_mult
|
||||
ld1r {v24.8h}, [x10] // uv_offset
|
||||
ld1r {v9.8h}, [x15] // uv_mult
|
||||
|
||||
dup v29.8h, w13 // 15 - scaling_shift
|
||||
dup v27.8h, w16 // bitdepth_min_8
|
||||
|
||||
cbz w12, 1f
|
||||
// clip
|
||||
movi v30.8h, #16
|
||||
movi v31.8h, #240
|
||||
sshl v30.8h, v30.8h, v27.8h
|
||||
sshl v31.8h, v31.8h, v27.8h
|
||||
cbz w11, 2f
|
||||
// is_id
|
||||
movi v31.8h, #235
|
||||
sshl v31.8h, v31.8h, v27.8h
|
||||
b 2f
|
||||
1:
|
||||
// no clip
|
||||
movi v30.8h, #0
|
||||
mov v31.16b, v23.16b // bitdepth_max
|
||||
2:
|
||||
|
||||
ushr v15.8h, v23.8h, #1 // grain_max
|
||||
sshl v24.8h, v24.8h, v27.8h // uv_offset << bitdepth_min_8
|
||||
not v14.16b, v15.16b // grain_min
|
||||
|
||||
ldr w12, [x8, #8] // offsets[1][0]
|
||||
ldr w14, [x8, #4] // offsets[0][1]
|
||||
ldr w16, [x8, #12] // offsets[1][1]
|
||||
ldr w8, [x8] // offsets[0][0]
|
||||
|
||||
mov x10, #GRAIN_WIDTH*2 // grain_lut stride
|
||||
|
||||
add x5, x5, #(2*(3 + (2 >> \sx)*3)) // grain_lut += 9 or 6
|
||||
.if \sy
|
||||
add x5, x5, x10, lsl #2 // grain_lut += 4 * grain_stride
|
||||
add x5, x5, x10, lsl #1 // grain_lut += 2 * grain_stride
|
||||
.else
|
||||
add x5, x5, x10, lsl #3 // grain_lut += 8 * grain_stride
|
||||
add x5, x5, x10 // grain_lut += grain_stride
|
||||
.endif
|
||||
|
||||
calc_offset w12, w13, w12, \sx, \sy
|
||||
calc_offset w14, w15, w14, \sx, \sy
|
||||
calc_offset w16, w17, w16, \sx, \sy
|
||||
calc_offset w8, w11, w8, \sx, \sy
|
||||
|
||||
add_offset x13, w12, x13, x5, x10
|
||||
add_offset x15, w14, x15, x5, x10
|
||||
add_offset x17, w16, x17, x5, x10
|
||||
add_offset x5, w8, x11, x5, x10
|
||||
|
||||
add x4, x13, #2*(32 >> \sx) // grain_lut += BLOCK_SIZE * bx
|
||||
add x8, x15, x10, lsl #(5 - \sy) // grain_lut += grain_stride * BLOCK_SIZE * by
|
||||
add x11, x17, x10, lsl #(5 - \sy) // grain_lut += grain_stride * BLOCK_SIZE * by
|
||||
add x11, x11, #2*(32 >> \sx) // grain_lut += BLOCK_SIZE * bx
|
||||
|
||||
ldr w13, [sp, #112] // type
|
||||
|
||||
movrel x16, overlap_coeffs_\sx
|
||||
adr x14, L(fguv_loop_sx\sx\()_tbl)
|
||||
|
||||
ld1 {v27.4h, v28.4h}, [x16] // overlap_coeffs
|
||||
tst w13, #1
|
||||
ldrh w13, [x14, w13, uxtw #1]
|
||||
|
||||
b.eq 1f
|
||||
// y overlap
|
||||
sub w12, w9, #(2 >> \sy) // backup remaining h
|
||||
mov w9, #(2 >> \sy)
|
||||
|
||||
1:
|
||||
sub x13, x14, w13, uxtw
|
||||
|
||||
.if \sy
|
||||
movi v25.8h, #23
|
||||
movi v26.8h, #22
|
||||
.else
|
||||
movi v25.8h, #27
|
||||
movi v26.8h, #17
|
||||
.endif
|
||||
|
||||
.if \sy
|
||||
add x7, x7, x7 // luma_stride *= 2
|
||||
.endif
|
||||
|
||||
br x13
|
||||
endfunc
|
||||
.endm
|
||||
|
||||
fguv 420, 1, 1
|
||||
fguv 422, 1, 0
|
||||
fguv 444, 0, 0
|
||||
|
||||
function fguv_loop_sx0_neon
|
||||
.macro fguv_loop_sx0 csfl, ox, oy
|
||||
L(fguv_loop_sx0_csfl\csfl\()_\ox\oy):
|
||||
1:
|
||||
.if \ox
|
||||
ld1 {v4.4h}, [x4], x10 // grain_lut old
|
||||
.endif
|
||||
.if \oy
|
||||
ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x8], x10 // grain_lut top
|
||||
.endif
|
||||
.if \ox && \oy
|
||||
ld1 {v5.4h}, [x11], x10 // grain_lut top old
|
||||
.endif
|
||||
ld1 {v16.8h, v17.8h, v18.8h, v19.8h}, [x5], x10 // grain_lut
|
||||
|
||||
.if \ox
|
||||
smull v4.4s, v4.4h, v27.4h
|
||||
smlal v4.4s, v16.4h, v28.4h
|
||||
.endif
|
||||
|
||||
.if \oy
|
||||
.if \ox
|
||||
smull v5.4s, v5.4h, v27.4h
|
||||
smlal v5.4s, v0.4h, v28.4h
|
||||
sqrshrn v4.4h, v4.4s, #5
|
||||
sqrshrn v5.4h, v5.4s, #5
|
||||
smin v4.4h, v4.4h, v15.4h
|
||||
smin v5.4h, v5.4h, v15.4h
|
||||
smax v4.4h, v4.4h, v14.4h
|
||||
smax v5.4h, v5.4h, v14.4h
|
||||
ins v16.d[0], v4.d[0]
|
||||
ins v0.d[0], v5.d[0]
|
||||
.endif
|
||||
|
||||
smull v6.4s, v16.4h, v26.4h
|
||||
smull2 v7.4s, v16.8h, v26.8h
|
||||
smull v10.4s, v17.4h, v26.4h
|
||||
smull2 v11.4s, v17.8h, v26.8h
|
||||
smull v16.4s, v18.4h, v26.4h
|
||||
smull2 v17.4s, v18.8h, v26.8h
|
||||
smull v18.4s, v19.4h, v26.4h
|
||||
smull2 v19.4s, v19.8h, v26.8h
|
||||
smlal v6.4s, v0.4h, v25.4h
|
||||
smlal2 v7.4s, v0.8h, v25.8h
|
||||
smlal v10.4s, v1.4h, v25.4h
|
||||
smlal2 v11.4s, v1.8h, v25.8h
|
||||
smlal v16.4s, v2.4h, v25.4h
|
||||
smlal2 v17.4s, v2.8h, v25.8h
|
||||
smlal v18.4s, v3.4h, v25.4h
|
||||
smlal2 v19.4s, v3.8h, v25.8h
|
||||
sqrshrn v6.4h, v6.4s, #5
|
||||
sqrshrn2 v6.8h, v7.4s, #5
|
||||
sqrshrn v7.4h, v10.4s, #5
|
||||
sqrshrn2 v7.8h, v11.4s, #5
|
||||
sqrshrn v10.4h, v16.4s, #5
|
||||
sqrshrn2 v10.8h, v17.4s, #5
|
||||
sqrshrn v11.4h, v18.4s, #5
|
||||
sqrshrn2 v11.8h, v19.4s, #5
|
||||
.endif
|
||||
|
||||
.if \ox && !\oy
|
||||
sqrshrn v4.4h, v4.4s, #5
|
||||
smin v4.4h, v4.4h, v15.4h
|
||||
.endif
|
||||
ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x7 // luma
|
||||
.if \oy
|
||||
smin v16.8h, v6.8h, v15.8h
|
||||
smin v17.8h, v7.8h, v15.8h
|
||||
smin v18.8h, v10.8h, v15.8h
|
||||
smin v19.8h, v11.8h, v15.8h
|
||||
smax v16.8h, v16.8h, v14.8h
|
||||
smax v17.8h, v17.8h, v14.8h
|
||||
smax v18.8h, v18.8h, v14.8h
|
||||
smax v19.8h, v19.8h, v14.8h
|
||||
.endif
|
||||
|
||||
.if \ox && !\oy
|
||||
smax v4.4h, v4.4h, v14.4h
|
||||
.endif
|
||||
ld1 {v10.8h, v11.8h, v12.8h, v13.8h}, [x1], x2 // src
|
||||
.if \ox && !\oy
|
||||
ins v16.d[0], v4.d[0]
|
||||
.endif
|
||||
|
||||
.if !\csfl
|
||||
smull v4.4s, v0.4h, v8.4h
|
||||
smull2 v5.4s, v0.8h, v8.8h
|
||||
smull v6.4s, v1.4h, v8.4h
|
||||
smull2 v7.4s, v1.8h, v8.8h
|
||||
smull v0.4s, v2.4h, v8.4h
|
||||
smull2 v1.4s, v2.8h, v8.8h
|
||||
smull v2.4s, v3.4h, v8.4h
|
||||
smull2 v3.4s, v3.8h, v8.8h
|
||||
smlal v4.4s, v10.4h, v9.4h
|
||||
smlal2 v5.4s, v10.8h, v9.8h
|
||||
smlal v6.4s, v11.4h, v9.4h
|
||||
smlal2 v7.4s, v11.8h, v9.8h
|
||||
smlal v0.4s, v12.4h, v9.4h
|
||||
smlal2 v1.4s, v12.8h, v9.8h
|
||||
smlal v2.4s, v13.4h, v9.4h
|
||||
smlal2 v3.4s, v13.8h, v9.8h
|
||||
shrn v4.4h, v4.4s, #6
|
||||
shrn2 v4.8h, v5.4s, #6
|
||||
shrn v5.4h, v6.4s, #6
|
||||
shrn2 v5.8h, v7.4s, #6
|
||||
shrn v6.4h, v0.4s, #6
|
||||
shrn2 v6.8h, v1.4s, #6
|
||||
shrn v7.4h, v2.4s, #6
|
||||
shrn2 v7.8h, v3.4s, #6
|
||||
add v0.8h, v4.8h, v24.8h
|
||||
add v1.8h, v5.8h, v24.8h
|
||||
add v2.8h, v6.8h, v24.8h
|
||||
add v3.8h, v7.8h, v24.8h
|
||||
movi v20.8h, #0
|
||||
smin v0.8h, v0.8h, v23.8h
|
||||
smin v1.8h, v1.8h, v23.8h
|
||||
smin v2.8h, v2.8h, v23.8h
|
||||
smin v3.8h, v3.8h, v23.8h
|
||||
smax v0.8h, v0.8h, v20.8h
|
||||
smax v1.8h, v1.8h, v20.8h
|
||||
smax v2.8h, v2.8h, v20.8h
|
||||
smax v3.8h, v3.8h, v20.8h
|
||||
.else
|
||||
// Make sure that uninitialized pixels out of range past the right
|
||||
// edge are in range; their actual values shouldn't matter.
|
||||
and v0.16b, v0.16b, v23.16b
|
||||
and v1.16b, v1.16b, v23.16b
|
||||
and v2.16b, v2.16b, v23.16b
|
||||
and v3.16b, v3.16b, v23.16b
|
||||
.endif
|
||||
|
||||
bl gather32_neon
|
||||
|
||||
uxtl v4.8h, v6.8b // scaling
|
||||
uxtl2 v5.8h, v6.16b
|
||||
uxtl v6.8h, v7.8b
|
||||
uxtl2 v7.8h, v7.16b
|
||||
|
||||
ushl v4.8h, v4.8h, v29.8h // scaling << (15 - scaling_shift)
|
||||
ushl v5.8h, v5.8h, v29.8h
|
||||
ushl v6.8h, v6.8h, v29.8h
|
||||
ushl v7.8h, v7.8h, v29.8h
|
||||
|
||||
sqrdmulh v16.8h, v16.8h, v4.8h // round2((scaling << (15 - scaling_shift) * grain, 15)
|
||||
sqrdmulh v17.8h, v17.8h, v5.8h
|
||||
sqrdmulh v18.8h, v18.8h, v6.8h
|
||||
sqrdmulh v19.8h, v19.8h, v7.8h
|
||||
|
||||
usqadd v10.8h, v16.8h // *src + noise
|
||||
usqadd v11.8h, v17.8h
|
||||
usqadd v12.8h, v18.8h
|
||||
usqadd v13.8h, v19.8h
|
||||
|
||||
umax v0.8h, v10.8h, v30.8h
|
||||
umax v1.8h, v11.8h, v30.8h
|
||||
umax v2.8h, v12.8h, v30.8h
|
||||
umax v3.8h, v13.8h, v30.8h
|
||||
umin v0.8h, v0.8h, v31.8h
|
||||
umin v1.8h, v1.8h, v31.8h
|
||||
umin v2.8h, v2.8h, v31.8h
|
||||
umin v3.8h, v3.8h, v31.8h
|
||||
|
||||
subs w9, w9, #1
|
||||
.if \oy
|
||||
dup v25.8h, v28.h[0]
|
||||
dup v26.8h, v28.h[1]
|
||||
.endif
|
||||
st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x2 // dst
|
||||
b.gt 1b
|
||||
|
||||
.if \oy
|
||||
cmp w12, #0
|
||||
mov w9, w12 // restore actual remaining h
|
||||
b.gt L(fguv_loop_sx0_csfl\csfl\()_\ox\()0)
|
||||
.endif
|
||||
b 9f
|
||||
.endm
|
||||
fguv_loop_sx0 0, 0, 0
|
||||
fguv_loop_sx0 0, 0, 1
|
||||
fguv_loop_sx0 0, 1, 0
|
||||
fguv_loop_sx0 0, 1, 1
|
||||
fguv_loop_sx0 1, 0, 0
|
||||
fguv_loop_sx0 1, 0, 1
|
||||
fguv_loop_sx0 1, 1, 0
|
||||
fguv_loop_sx0 1, 1, 1
|
||||
|
||||
9:
|
||||
ldp d14, d15, [sp, #64]
|
||||
ldp d12, d13, [sp, #48]
|
||||
ldp d10, d11, [sp, #32]
|
||||
ldp d8, d9, [sp, #16]
|
||||
ldr x30, [sp], #80
|
||||
ret
|
||||
|
||||
L(fguv_loop_sx0_tbl):
|
||||
.hword L(fguv_loop_sx0_tbl) - L(fguv_loop_sx0_csfl0_00)
|
||||
.hword L(fguv_loop_sx0_tbl) - L(fguv_loop_sx0_csfl0_01)
|
||||
.hword L(fguv_loop_sx0_tbl) - L(fguv_loop_sx0_csfl0_10)
|
||||
.hword L(fguv_loop_sx0_tbl) - L(fguv_loop_sx0_csfl0_11)
|
||||
.hword L(fguv_loop_sx0_tbl) - L(fguv_loop_sx0_csfl1_00)
|
||||
.hword L(fguv_loop_sx0_tbl) - L(fguv_loop_sx0_csfl1_01)
|
||||
.hword L(fguv_loop_sx0_tbl) - L(fguv_loop_sx0_csfl1_10)
|
||||
.hword L(fguv_loop_sx0_tbl) - L(fguv_loop_sx0_csfl1_11)
|
||||
endfunc
|
||||
|
||||
function fguv_loop_sx1_neon
|
||||
.macro fguv_loop_sx1 csfl, ox, oy
|
||||
L(fguv_loop_sx1_csfl\csfl\()_\ox\oy):
|
||||
1:
|
||||
.if \ox
|
||||
ld1 {v18.4h}, [x4], x10 // grain_lut old
|
||||
.endif
|
||||
.if \oy
|
||||
ld1 {v20.8h, v21.8h}, [x8], x10 // grain_lut top
|
||||
.endif
|
||||
.if \ox && \oy
|
||||
ld1 {v19.4h}, [x11], x10 // grain_lut top old
|
||||
.endif
|
||||
ld1 {v16.8h, v17.8h}, [x5], x10 // grain_lut
|
||||
|
||||
.if \ox
|
||||
smull v18.4s, v18.4h, v27.4h
|
||||
smlal v18.4s, v16.4h, v28.4h
|
||||
.endif
|
||||
|
||||
.if \oy
|
||||
.if \ox
|
||||
smull v19.4s, v19.4h, v27.4h
|
||||
smlal v19.4s, v20.4h, v28.4h
|
||||
sqrshrn v18.4h, v18.4s, #5
|
||||
sqrshrn v19.4h, v19.4s, #5
|
||||
smin v18.4h, v18.4h, v15.4h
|
||||
smin v19.4h, v19.4h, v15.4h
|
||||
smax v18.4h, v18.4h, v14.4h
|
||||
smax v19.4h, v19.4h, v14.4h
|
||||
ins v16.d[0], v18.d[0]
|
||||
ins v20.d[0], v19.d[0]
|
||||
.endif
|
||||
|
||||
smull v0.4s, v16.4h, v26.4h
|
||||
smull2 v1.4s, v16.8h, v26.8h
|
||||
smull v2.4s, v17.4h, v26.4h
|
||||
smull2 v3.4s, v17.8h, v26.8h
|
||||
smlal v0.4s, v20.4h, v25.4h
|
||||
smlal2 v1.4s, v20.8h, v25.8h
|
||||
smlal v2.4s, v21.4h, v25.4h
|
||||
smlal2 v3.4s, v21.8h, v25.8h
|
||||
sqrshrn v16.4h, v0.4s, #5
|
||||
sqrshrn2 v16.8h, v1.4s, #5
|
||||
sqrshrn v17.4h, v2.4s, #5
|
||||
sqrshrn2 v17.8h, v3.4s, #5
|
||||
.endif
|
||||
|
||||
.if \ox && !\oy
|
||||
sqrshrn v18.4h, v18.4s, #5
|
||||
smin v18.4h, v18.4h, v15.4h
|
||||
.endif
|
||||
ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x7 // luma
|
||||
.if \oy
|
||||
smin v16.8h, v16.8h, v15.8h
|
||||
smin v17.8h, v17.8h, v15.8h
|
||||
smax v16.8h, v16.8h, v14.8h
|
||||
smax v17.8h, v17.8h, v14.8h
|
||||
.endif
|
||||
|
||||
.if \ox && !\oy
|
||||
smax v18.4h, v18.4h, v14.4h
|
||||
.endif
|
||||
ld1 {v10.8h, v11.8h}, [x1], x2 // src
|
||||
.if \ox && !\oy
|
||||
ins v16.d[0], v18.d[0]
|
||||
.endif
|
||||
addp v0.8h, v0.8h, v1.8h
|
||||
addp v1.8h, v2.8h, v3.8h
|
||||
urshr v0.8h, v0.8h, #1
|
||||
urshr v1.8h, v1.8h, #1
|
||||
.if !\csfl
|
||||
smull v2.4s, v0.4h, v8.4h
|
||||
smull2 v3.4s, v0.8h, v8.8h
|
||||
smull v0.4s, v1.4h, v8.4h
|
||||
smull2 v1.4s, v1.8h, v8.8h
|
||||
smlal v2.4s, v10.4h, v9.4h
|
||||
smlal2 v3.4s, v10.8h, v9.8h
|
||||
smlal v0.4s, v11.4h, v9.4h
|
||||
smlal2 v1.4s, v11.8h, v9.8h
|
||||
shrn v2.4h, v2.4s, #6
|
||||
shrn2 v2.8h, v3.4s, #6
|
||||
shrn v3.4h, v0.4s, #6
|
||||
shrn2 v3.8h, v1.4s, #6
|
||||
add v0.8h, v2.8h, v24.8h
|
||||
add v1.8h, v3.8h, v24.8h
|
||||
movi v2.8h, #0
|
||||
smin v0.8h, v0.8h, v23.8h
|
||||
smin v1.8h, v1.8h, v23.8h
|
||||
smax v0.8h, v0.8h, v2.8h
|
||||
smax v1.8h, v1.8h, v2.8h
|
||||
.else
|
||||
// Make sure that uninitialized pixels out of range past the right
|
||||
// edge are in range; their actual values shouldn't matter.
|
||||
and v0.16b, v0.16b, v23.16b
|
||||
and v1.16b, v1.16b, v23.16b
|
||||
.endif
|
||||
|
||||
bl gather16_neon
|
||||
|
||||
uxtl v4.8h, v6.8b // scaling
|
||||
uxtl2 v5.8h, v6.16b
|
||||
|
||||
ushl v4.8h, v4.8h, v29.8h // scaling << (15 - scaling_shift)
|
||||
ushl v5.8h, v5.8h, v29.8h
|
||||
|
||||
sqrdmulh v16.8h, v16.8h, v4.8h // round2((scaling << (15 - scaling_shift) * grain, 15)
|
||||
sqrdmulh v17.8h, v17.8h, v5.8h
|
||||
|
||||
usqadd v10.8h, v16.8h // *src + noise
|
||||
usqadd v11.8h, v17.8h
|
||||
|
||||
umax v0.8h, v10.8h, v30.8h
|
||||
umax v1.8h, v11.8h, v30.8h
|
||||
umin v0.8h, v0.8h, v31.8h
|
||||
umin v1.8h, v1.8h, v31.8h
|
||||
|
||||
.if \oy
|
||||
mov v16.16b, v25.16b
|
||||
.endif
|
||||
subs w9, w9, #1
|
||||
.if \oy
|
||||
mov v25.16b, v26.16b
|
||||
mov v26.16b, v16.16b
|
||||
.endif
|
||||
st1 {v0.8h, v1.8h}, [x0], x2 // dst
|
||||
b.gt 1b
|
||||
|
||||
.if \oy
|
||||
cmp w12, #0
|
||||
mov w9, w12 // restore actual remaining h
|
||||
b.gt L(fguv_loop_sx1_csfl\csfl\()_\ox\()0)
|
||||
.endif
|
||||
|
||||
b 9f
|
||||
.endm
|
||||
fguv_loop_sx1 0, 0, 0
|
||||
fguv_loop_sx1 0, 0, 1
|
||||
fguv_loop_sx1 0, 1, 0
|
||||
fguv_loop_sx1 0, 1, 1
|
||||
fguv_loop_sx1 1, 0, 0
|
||||
fguv_loop_sx1 1, 0, 1
|
||||
fguv_loop_sx1 1, 1, 0
|
||||
fguv_loop_sx1 1, 1, 1
|
||||
|
||||
9:
|
||||
ldp d14, d15, [sp, #64]
|
||||
ldp d12, d13, [sp, #48]
|
||||
ldp d10, d11, [sp, #32]
|
||||
ldp d8, d9, [sp, #16]
|
||||
ldr x30, [sp], #80
|
||||
ret
|
||||
|
||||
L(fguv_loop_sx1_tbl):
|
||||
.hword L(fguv_loop_sx1_tbl) - L(fguv_loop_sx1_csfl0_00)
|
||||
.hword L(fguv_loop_sx1_tbl) - L(fguv_loop_sx1_csfl0_01)
|
||||
.hword L(fguv_loop_sx1_tbl) - L(fguv_loop_sx1_csfl0_10)
|
||||
.hword L(fguv_loop_sx1_tbl) - L(fguv_loop_sx1_csfl0_11)
|
||||
.hword L(fguv_loop_sx1_tbl) - L(fguv_loop_sx1_csfl1_00)
|
||||
.hword L(fguv_loop_sx1_tbl) - L(fguv_loop_sx1_csfl1_01)
|
||||
.hword L(fguv_loop_sx1_tbl) - L(fguv_loop_sx1_csfl1_10)
|
||||
.hword L(fguv_loop_sx1_tbl) - L(fguv_loop_sx1_csfl1_11)
|
||||
endfunc
|
||||
|
11
third_party/dav1d/src/arm/film_grain_init_tmpl.c
vendored
11
third_party/dav1d/src/arm/film_grain_init_tmpl.c
vendored
@ -60,6 +60,7 @@ void BF(dav1d_generate_grain_uv_ ## suff, neon)(entry buf[][GRAIN_WIDTH], \
|
||||
GEN_GRAIN_UV(420);
|
||||
GEN_GRAIN_UV(422);
|
||||
GEN_GRAIN_UV(444);
|
||||
#endif
|
||||
|
||||
// Use ptrdiff_t instead of int for the last few parameters, to get the
|
||||
// same layout of parameters on the stack across platforms.
|
||||
@ -149,7 +150,6 @@ static void fgy_32x32xn_neon(pixel *const dst_row, const pixel *const src_row,
|
||||
}
|
||||
}
|
||||
|
||||
#if BITDEPTH == 8
|
||||
#define fguv_ss_fn(nm, sx, sy) \
|
||||
static void \
|
||||
fguv_32x32xn_##nm##_neon(pixel *const dst_row, const pixel *const src_row, \
|
||||
@ -204,16 +204,12 @@ fguv_ss_fn(420, 1, 1);
|
||||
fguv_ss_fn(422, 1, 0);
|
||||
fguv_ss_fn(444, 0, 0);
|
||||
|
||||
#endif
|
||||
#endif
|
||||
|
||||
COLD void bitfn(dav1d_film_grain_dsp_init_arm)(Dav1dFilmGrainDSPContext *const c) {
|
||||
const unsigned flags = dav1d_get_cpu_flags();
|
||||
|
||||
if (!(flags & DAV1D_ARM_CPU_FLAG_NEON)) return;
|
||||
|
||||
#if ARCH_AARCH64
|
||||
#if BITDEPTH == 8
|
||||
#if ARCH_AARCH64 && BITDEPTH == 8
|
||||
c->generate_grain_y = BF(dav1d_generate_grain_y, neon);
|
||||
c->generate_grain_uv[DAV1D_PIXEL_LAYOUT_I420 - 1] = BF(dav1d_generate_grain_uv_420, neon);
|
||||
c->generate_grain_uv[DAV1D_PIXEL_LAYOUT_I422 - 1] = BF(dav1d_generate_grain_uv_422, neon);
|
||||
@ -221,10 +217,7 @@ COLD void bitfn(dav1d_film_grain_dsp_init_arm)(Dav1dFilmGrainDSPContext *const c
|
||||
#endif
|
||||
|
||||
c->fgy_32x32xn = fgy_32x32xn_neon;
|
||||
#if BITDEPTH == 8
|
||||
c->fguv_32x32xn[DAV1D_PIXEL_LAYOUT_I420 - 1] = fguv_32x32xn_420_neon;
|
||||
c->fguv_32x32xn[DAV1D_PIXEL_LAYOUT_I422 - 1] = fguv_32x32xn_422_neon;
|
||||
c->fguv_32x32xn[DAV1D_PIXEL_LAYOUT_I444 - 1] = fguv_32x32xn_444_neon;
|
||||
#endif
|
||||
#endif
|
||||
}
|
||||
|
7
third_party/dav1d/src/meson.build
vendored
7
third_party/dav1d/src/meson.build
vendored
@ -144,6 +144,7 @@ if is_asm_enabled
|
||||
if dav1d_bitdepths.contains('8')
|
||||
libdav1d_sources_asm += files(
|
||||
'arm/32/cdef.S',
|
||||
'arm/32/film_grain.S',
|
||||
'arm/32/ipred.S',
|
||||
'arm/32/loopfilter.S',
|
||||
'arm/32/looprestoration.S',
|
||||
@ -154,6 +155,7 @@ if is_asm_enabled
|
||||
if dav1d_bitdepths.contains('16')
|
||||
libdav1d_sources_asm += files(
|
||||
'arm/32/cdef16.S',
|
||||
'arm/32/film_grain16.S',
|
||||
'arm/32/ipred16.S',
|
||||
'arm/32/itx16.S',
|
||||
'arm/32/loopfilter16.S',
|
||||
@ -218,11 +220,14 @@ if is_asm_enabled
|
||||
'x86/film_grain16_avx2.asm',
|
||||
'x86/ipred16_avx2.asm',
|
||||
'x86/itx16_avx2.asm',
|
||||
'x86/itx16_sse.asm',
|
||||
'x86/loopfilter16_avx2.asm',
|
||||
'x86/looprestoration16_avx2.asm',
|
||||
'x86/mc16_avx2.asm',
|
||||
'x86/cdef16_sse.asm',
|
||||
'x86/itx16_sse.asm',
|
||||
'x86/loopfilter16_sse.asm',
|
||||
'x86/looprestoration16_sse.asm',
|
||||
'x86/mc16_sse.asm',
|
||||
)
|
||||
endif
|
||||
|
||||
|
163
third_party/dav1d/src/x86/film_grain16_avx2.asm
vendored
163
third_party/dav1d/src/x86/film_grain16_avx2.asm
vendored
@ -29,7 +29,6 @@
|
||||
%if ARCH_X86_64
|
||||
|
||||
SECTION_RODATA 32
|
||||
pd_0x10000: times 8 dd 0x10000
|
||||
pw_1024: times 16 dw 1024
|
||||
pw_23_22: times 8 dw 23, 22
|
||||
pb_mask: db 0, 0x80, 0x80, 0, 0x80, 0, 0, 0x80, 0x80, 0, 0, 0x80, 0, 0x80, 0x80, 0
|
||||
@ -844,7 +843,7 @@ cglobal fgy_32x32xn_16bpc, 6, 14, 16, dst, src, stride, fg_data, w, scaling, gra
|
||||
mov r7d, [fg_dataq+FGData.scaling_shift]
|
||||
lea r8, [pb_mask]
|
||||
%define base r8-pb_mask
|
||||
vpbroadcastw m11, [base+round_vals+r7*2-12]
|
||||
vpbroadcastw m11, [base+mul_bits+r7*2-14]
|
||||
mov r6d, [fg_dataq+FGData.clip_to_restricted_range]
|
||||
mov r9d, r9m ; bdmax
|
||||
sar r9d, 11 ; is_12bpc
|
||||
@ -854,7 +853,6 @@ cglobal fgy_32x32xn_16bpc, 6, 14, 16, dst, src, stride, fg_data, w, scaling, gra
|
||||
lea r9d, [r6d*2+r9d]
|
||||
vpbroadcastw m12, [base+max+r9*2]
|
||||
vpbroadcastw m10, r9m
|
||||
mov r9mp, r7
|
||||
pxor m2, m2
|
||||
|
||||
DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, unused1, \
|
||||
@ -921,27 +919,17 @@ cglobal fgy_32x32xn_16bpc, 6, 14, 16, dst, src, stride, fg_data, w, scaling, gra
|
||||
vpgatherdd m5, [scalingq+m6-3], m3
|
||||
vpgatherdd m6, [scalingq+m7-3], m9
|
||||
REPX {psrld x, 24}, m8, m4, m5, m6
|
||||
REPX {por x, [pd_0x10000]}, m8, m4, m5, m6
|
||||
packssdw m8, m4
|
||||
packssdw m5, m6
|
||||
|
||||
; grain = grain_lut[offy+y][offx+x]
|
||||
movu m9, [grain_lutq+offxyq*2]
|
||||
movu m3, [grain_lutq+offxyq*2+32]
|
||||
|
||||
; noise = round2(scaling[src] * grain, scaling_shift)
|
||||
; the problem here is that since the grain is 10-bits, the product of
|
||||
; scaling*grain is 17+sign bits, so we need to unfortunately do some
|
||||
; of these steps in 32-bits
|
||||
punpckhwd m7, m9, m11
|
||||
punpcklwd m9, m11
|
||||
pmaddwd m9, m8
|
||||
pmaddwd m7, m4
|
||||
punpckhwd m8, m3, m11
|
||||
punpcklwd m3, m11
|
||||
pmaddwd m3, m5
|
||||
pmaddwd m8, m6
|
||||
REPX {psrad x, r9m}, m9, m7, m3, m8
|
||||
packssdw m9, m7
|
||||
packssdw m3, m8
|
||||
REPX {pmullw x, m11}, m8, m5
|
||||
pmulhrsw m9, m8
|
||||
pmulhrsw m3, m5
|
||||
|
||||
; dst = clip_pixel(src, noise)
|
||||
paddw m0, m9
|
||||
@ -1014,7 +1002,8 @@ cglobal fgy_32x32xn_16bpc, 6, 14, 16, dst, src, stride, fg_data, w, scaling, gra
|
||||
vpgatherdd m5, [scalingq+m6-3], m3
|
||||
vpgatherdd m6, [scalingq+m7-3], m9
|
||||
REPX {psrld x, 24}, m8, m4, m5, m6
|
||||
REPX {por x, [pd_0x10000]}, m8, m4, m5, m6
|
||||
packssdw m8, m4
|
||||
packssdw m5, m6
|
||||
|
||||
; grain = grain_lut[offy+y][offx+x]
|
||||
movu m9, [grain_lutq+offxyq*2]
|
||||
@ -1033,17 +1022,9 @@ cglobal fgy_32x32xn_16bpc, 6, 14, 16, dst, src, stride, fg_data, w, scaling, gra
|
||||
movu m3, [grain_lutq+offxyq*2+32]
|
||||
|
||||
; noise = round2(scaling[src] * grain, scaling_shift)
|
||||
punpckhwd m7, m9, m11
|
||||
punpcklwd m9, m11
|
||||
pmaddwd m9, m8
|
||||
pmaddwd m7, m4
|
||||
punpckhwd m8, m3, m11
|
||||
punpcklwd m3, m11
|
||||
pmaddwd m3, m5
|
||||
pmaddwd m8, m6
|
||||
REPX {psrad x, r9m}, m9, m7, m3, m8
|
||||
packssdw m9, m7
|
||||
packssdw m3, m8
|
||||
REPX {pmullw x, m11}, m8, m5
|
||||
pmulhrsw m9, m8
|
||||
pmulhrsw m3, m5
|
||||
|
||||
; dst = clip_pixel(src, noise)
|
||||
paddw m0, m9
|
||||
@ -1167,16 +1148,11 @@ cglobal fgy_32x32xn_16bpc, 6, 14, 16, dst, src, stride, fg_data, w, scaling, gra
|
||||
vpgatherdd m6, [scalingq+m4-3], m3
|
||||
vpgatherdd m4, [scalingq+m5-3], m9
|
||||
REPX {psrld x, 24}, m6, m4
|
||||
REPX {por x, [pd_0x10000]}, m6, m4
|
||||
packssdw m6, m4
|
||||
|
||||
; noise = round2(scaling[src] * grain, scaling_shift)
|
||||
punpckhwd m9, m7, m11
|
||||
punpcklwd m7, m11
|
||||
pmaddwd m6, m7
|
||||
pmaddwd m4, m9
|
||||
|
||||
REPX {psrad x, r9m}, m6, m4
|
||||
packssdw m6, m4
|
||||
pmullw m6, m11
|
||||
pmulhrsw m6, m7
|
||||
|
||||
; same for the other half
|
||||
pminuw m1, m10, [srcq+32] ; m0-1: src as word
|
||||
@ -1187,16 +1163,11 @@ cglobal fgy_32x32xn_16bpc, 6, 14, 16, dst, src, stride, fg_data, w, scaling, gra
|
||||
vpgatherdd m5, [scalingq+m4-3], m3
|
||||
vpgatherdd m4, [scalingq+m9-3], m7
|
||||
REPX {psrld x, 24}, m5, m4
|
||||
REPX {por x, [pd_0x10000]}, m5, m4
|
||||
|
||||
punpckhwd m9, m8, m11
|
||||
punpcklwd m8, m11
|
||||
pmaddwd m5, m8
|
||||
pmaddwd m4, m9
|
||||
|
||||
REPX {psrad x, r9m}, m5, m4
|
||||
packssdw m5, m4
|
||||
|
||||
pmullw m5, m11
|
||||
pmulhrsw m5, m8
|
||||
|
||||
; dst = clip_pixel(src, noise)
|
||||
paddw m0, m6
|
||||
paddw m1, m5
|
||||
@ -1313,15 +1284,11 @@ cglobal fgy_32x32xn_16bpc, 6, 14, 16, dst, src, stride, fg_data, w, scaling, gra
|
||||
pcmpeqw m9, m9
|
||||
vpgatherdd m4, [scalingq+m5-3], m9
|
||||
REPX {psrld x, 24}, m6, m4
|
||||
REPX {por x, [pd_0x10000]}, m6, m4
|
||||
packssdw m6, m4
|
||||
|
||||
; noise = round2(scaling[src] * grain, scaling_shift)
|
||||
punpckhwd m9, m7, m11
|
||||
punpcklwd m7, m11
|
||||
pmaddwd m9, m4
|
||||
pmaddwd m7, m6
|
||||
REPX {psrad x, r9m}, m9, m7
|
||||
packssdw m7, m9
|
||||
pmullw m6, m11
|
||||
pmulhrsw m7, m6
|
||||
|
||||
; other half
|
||||
punpckhwd m5, m1, m2
|
||||
@ -1333,15 +1300,11 @@ cglobal fgy_32x32xn_16bpc, 6, 14, 16, dst, src, stride, fg_data, w, scaling, gra
|
||||
pcmpeqw m6, m6
|
||||
vpgatherdd m4, [scalingq+m5-3], m6
|
||||
REPX {psrld x, 24}, m9, m4
|
||||
REPX {por x, [pd_0x10000]}, m9, m4
|
||||
packssdw m9, m4
|
||||
|
||||
; noise = round2(scaling[src] * grain, scaling_shift)
|
||||
punpckhwd m6, m3, m11
|
||||
punpcklwd m3, m11
|
||||
pmaddwd m6, m4
|
||||
pmaddwd m3, m9
|
||||
REPX {psrad x, r9m}, m6, m3
|
||||
packssdw m3, m6
|
||||
pmullw m9, m11
|
||||
pmulhrsw m3, m9
|
||||
|
||||
; dst = clip_pixel(src, noise)
|
||||
paddw m0, m7
|
||||
@ -1378,7 +1341,7 @@ cglobal fguv_32x32xn_i420_16bpc, 6, 15, 16, dst, src, stride, fg_data, w, scalin
|
||||
%define base r8-pb_mask
|
||||
lea r8, [pb_mask]
|
||||
mov r7d, [fg_dataq+FGData.scaling_shift]
|
||||
vpbroadcastw m11, [base+round_vals+r7*2-12]
|
||||
vpbroadcastw m11, [base+mul_bits+r7*2-14]
|
||||
mov r6d, [fg_dataq+FGData.clip_to_restricted_range]
|
||||
mov r9d, r13m ; bdmax
|
||||
sar r9d, 11 ; is_12bpc
|
||||
@ -1391,7 +1354,6 @@ cglobal fguv_32x32xn_i420_16bpc, 6, 15, 16, dst, src, stride, fg_data, w, scalin
|
||||
vpbroadcastw m12, [base+max+r10*2]
|
||||
vpbroadcastw m10, r13m
|
||||
pxor m2, m2
|
||||
mov r13mp, r7
|
||||
|
||||
cmp byte [fg_dataq+FGData.chroma_scaling_from_luma], 0
|
||||
jne .csfl
|
||||
@ -1510,24 +1472,17 @@ cglobal fguv_32x32xn_i420_16bpc, 6, 15, 16, dst, src, stride, fg_data, w, scalin
|
||||
vpgatherdd m5, [scalingq+m6-3], m3
|
||||
vpgatherdd m6, [scalingq+m7-3], m9
|
||||
REPX {psrld x, 24}, m8, m4, m5, m6
|
||||
REPX {por x, [pd_0x10000]}, m8, m4, m5, m6
|
||||
packssdw m8, m4
|
||||
packssdw m5, m6
|
||||
|
||||
; grain = grain_lut[offy+y][offx+x]
|
||||
movu m9, [grain_lutq+offxyq*2]
|
||||
movu m3, [grain_lutq+offxyq*2+82*2]
|
||||
|
||||
; noise = round2(scaling[luma_src] * grain, scaling_shift)
|
||||
punpckhwd m7, m9, m11
|
||||
punpcklwd m9, m11
|
||||
pmaddwd m9, m8
|
||||
pmaddwd m7, m4
|
||||
punpckhwd m8, m3, m11
|
||||
punpcklwd m3, m11
|
||||
pmaddwd m3, m5
|
||||
pmaddwd m8, m6
|
||||
REPX {psrad x, r13m}, m9, m7, m3, m8
|
||||
packssdw m9, m7
|
||||
packssdw m3, m8
|
||||
REPX {pmullw x, m11}, m8, m5
|
||||
pmulhrsw m9, m8
|
||||
pmulhrsw m3, m5
|
||||
|
||||
; dst = clip_pixel(src, noise)
|
||||
paddw m0, m9
|
||||
@ -1655,15 +1610,11 @@ cglobal fguv_32x32xn_i420_16bpc, 6, 15, 16, dst, src, stride, fg_data, w, scalin
|
||||
pcmpeqw m7, m7
|
||||
vpgatherdd m4, [scalingq+m5-3], m7
|
||||
REPX {psrld x, 24}, m8, m4
|
||||
REPX {por x, [pd_0x10000]}, m8, m4
|
||||
packssdw m8, m4
|
||||
|
||||
; noise = round2(scaling[luma_src] * grain, scaling_shift)
|
||||
punpckhwd m7, m9, m11
|
||||
punpcklwd m9, m11
|
||||
pmaddwd m9, m8
|
||||
pmaddwd m7, m4
|
||||
REPX {psrad x, r13m}, m9, m7
|
||||
packssdw m9, m7
|
||||
pmullw m8, m11
|
||||
pmulhrsw m9, m8
|
||||
|
||||
; same for the other half
|
||||
punpckhwd m7, m6, m2
|
||||
@ -1673,15 +1624,11 @@ cglobal fguv_32x32xn_i420_16bpc, 6, 15, 16, dst, src, stride, fg_data, w, scalin
|
||||
vpgatherdd m5, [scalingq+m6-3], m8
|
||||
vpgatherdd m6, [scalingq+m7-3], m4
|
||||
REPX {psrld x, 24}, m5, m6
|
||||
REPX {por x, [pd_0x10000]}, m5, m6
|
||||
packssdw m5, m6
|
||||
|
||||
; noise = round2(scaling[luma_src] * grain, scaling_shift)
|
||||
punpckhwd m8, m3, m11
|
||||
punpcklwd m3, m11
|
||||
pmaddwd m3, m5
|
||||
pmaddwd m8, m6
|
||||
REPX {psrad x, r13m}, m3, m8
|
||||
packssdw m3, m8
|
||||
pmullw m5, m11
|
||||
pmulhrsw m3, m5
|
||||
|
||||
; dst = clip_pixel(src, noise)
|
||||
paddw m0, m9
|
||||
@ -1841,15 +1788,11 @@ cglobal fguv_32x32xn_i420_16bpc, 6, 15, 16, dst, src, stride, fg_data, w, scalin
|
||||
pcmpeqw m7, m7
|
||||
vpgatherdd m4, [scalingq+m5-3], m7
|
||||
REPX {psrld x, 24}, m8, m4
|
||||
REPX {por x, [pd_0x10000]}, m8, m4
|
||||
packssdw m8, m4
|
||||
|
||||
; noise = round2(scaling[luma_src] * grain, scaling_shift)
|
||||
punpckhwd m7, m9, m11
|
||||
punpcklwd m9, m11
|
||||
pmaddwd m9, m8
|
||||
pmaddwd m7, m4
|
||||
REPX {psrad x, r13m}, m9, m7
|
||||
packssdw m9, m7
|
||||
pmullw m8, m11
|
||||
pmulhrsw m9, m8
|
||||
|
||||
; same for the other half
|
||||
punpckhwd m7, m6, m2
|
||||
@ -1859,16 +1802,12 @@ cglobal fguv_32x32xn_i420_16bpc, 6, 15, 16, dst, src, stride, fg_data, w, scalin
|
||||
vpgatherdd m5, [scalingq+m6-3], m8
|
||||
vpgatherdd m6, [scalingq+m7-3], m4
|
||||
REPX {psrld x, 24}, m5, m6
|
||||
REPX {por x, [pd_0x10000]}, m5, m6
|
||||
packssdw m5, m6
|
||||
|
||||
; noise = round2(scaling[luma_src] * grain, scaling_shift)
|
||||
movu m3, [grain_lutq+offxyq*2+82*2]
|
||||
punpckhwd m8, m3, m11
|
||||
punpcklwd m3, m11
|
||||
pmaddwd m3, m5
|
||||
pmaddwd m8, m6
|
||||
REPX {psrad x, r13m}, m3, m8
|
||||
packssdw m3, m8
|
||||
pmullw m5, m11
|
||||
pmulhrsw m3, m5
|
||||
|
||||
; dst = clip_pixel(src, noise)
|
||||
paddw m0, m9
|
||||
@ -2025,15 +1964,11 @@ cglobal fguv_32x32xn_i420_16bpc, 6, 15, 16, dst, src, stride, fg_data, w, scalin
|
||||
pcmpeqw m7, m7
|
||||
vpgatherdd m4, [scalingq+m5-3], m7
|
||||
REPX {psrld x, 24}, m8, m4
|
||||
REPX {por x, [pd_0x10000]}, m8, m4
|
||||
packssdw m8, m4
|
||||
|
||||
; noise = round2(scaling[luma_src] * grain, scaling_shift)
|
||||
punpckhwd m7, m9, m11
|
||||
punpcklwd m9, m11
|
||||
pmaddwd m9, m8
|
||||
pmaddwd m7, m4
|
||||
REPX {psrad x, r13m}, m9, m7
|
||||
packssdw m9, m7
|
||||
pmullw m8, m11
|
||||
pmulhrsw m9, m8
|
||||
|
||||
; same for the other half
|
||||
punpckhwd m7, m6, m2
|
||||
@ -2043,15 +1978,11 @@ cglobal fguv_32x32xn_i420_16bpc, 6, 15, 16, dst, src, stride, fg_data, w, scalin
|
||||
vpgatherdd m5, [scalingq+m6-3], m8
|
||||
vpgatherdd m6, [scalingq+m7-3], m4
|
||||
REPX {psrld x, 24}, m5, m6
|
||||
REPX {por x, [pd_0x10000]}, m5, m6
|
||||
packssdw m5, m6
|
||||
|
||||
; noise = round2(scaling[luma_src] * grain, scaling_shift)
|
||||
punpckhwd m8, m3, m11
|
||||
punpcklwd m3, m11
|
||||
pmaddwd m3, m5
|
||||
pmaddwd m8, m6
|
||||
REPX {psrad x, r13m}, m3, m8
|
||||
packssdw m3, m8
|
||||
pmullw m5, m11
|
||||
pmulhrsw m3, m5
|
||||
|
||||
; dst = clip_pixel(src, noise)
|
||||
paddw m0, m9
|
||||
|
1135
third_party/dav1d/src/x86/loopfilter16_sse.asm
vendored
Normal file
1135
third_party/dav1d/src/x86/loopfilter16_sse.asm
vendored
Normal file
File diff suppressed because it is too large
Load Diff
@ -36,6 +36,7 @@ decl_loopfilter_sb_fn(dav1d_lpf_v_sb_uv_##ext)
|
||||
|
||||
decl_loopfilter_sb_fns(ssse3);
|
||||
decl_loopfilter_sb_fns(avx2);
|
||||
decl_loopfilter_sb_fns(16bpc_ssse3);
|
||||
decl_loopfilter_sb_fns(16bpc_avx2);
|
||||
|
||||
COLD void bitfn(dav1d_loop_filter_dsp_init_x86)(Dav1dLoopFilterDSPContext *const c) {
|
||||
@ -48,6 +49,13 @@ COLD void bitfn(dav1d_loop_filter_dsp_init_x86)(Dav1dLoopFilterDSPContext *const
|
||||
c->loop_filter_sb[0][1] = dav1d_lpf_v_sb_y_ssse3;
|
||||
c->loop_filter_sb[1][0] = dav1d_lpf_h_sb_uv_ssse3;
|
||||
c->loop_filter_sb[1][1] = dav1d_lpf_v_sb_uv_ssse3;
|
||||
#else
|
||||
#if ARCH_X86_64
|
||||
c->loop_filter_sb[0][0] = dav1d_lpf_h_sb_y_16bpc_ssse3;
|
||||
c->loop_filter_sb[0][1] = dav1d_lpf_v_sb_y_16bpc_ssse3;
|
||||
c->loop_filter_sb[1][0] = dav1d_lpf_h_sb_uv_16bpc_ssse3;
|
||||
c->loop_filter_sb[1][1] = dav1d_lpf_v_sb_uv_16bpc_ssse3;
|
||||
#endif
|
||||
#endif
|
||||
|
||||
if (!(flags & DAV1D_X86_CPU_FLAG_AVX2)) return;
|
||||
|
@ -56,7 +56,7 @@ pd_8: dd 8
|
||||
pd_25: dd 25
|
||||
pd_4096: dd 4096
|
||||
pd_34816: dd 34816
|
||||
pd_m262128 dd -262128
|
||||
pd_m262128: dd -262128
|
||||
pd_0xf00800a4: dd 0xf00800a4
|
||||
pd_0xf00801c7: dd 0xf00801c7
|
||||
|
||||
|
1125
third_party/dav1d/src/x86/looprestoration16_sse.asm
vendored
Normal file
1125
third_party/dav1d/src/x86/looprestoration16_sse.asm
vendored
Normal file
File diff suppressed because it is too large
Load Diff
@ -197,9 +197,9 @@ COLD void bitfn(dav1d_loop_restoration_dsp_init_x86)(Dav1dLoopRestorationDSPCont
|
||||
#endif
|
||||
|
||||
if (!(flags & DAV1D_X86_CPU_FLAG_SSSE3)) return;
|
||||
#if BITDEPTH == 8
|
||||
c->wiener[0] = BF(dav1d_wiener_filter7, ssse3);
|
||||
c->wiener[1] = BF(dav1d_wiener_filter5, ssse3);
|
||||
#if BITDEPTH == 8
|
||||
c->sgr[0] = BF(sgr_filter_5x5, ssse3);
|
||||
c->sgr[1] = BF(sgr_filter_3x3, ssse3);
|
||||
c->sgr[2] = BF(sgr_filter_mix, ssse3);
|
||||
|
4144
third_party/dav1d/src/x86/mc16_sse.asm
vendored
Normal file
4144
third_party/dav1d/src/x86/mc16_sse.asm
vendored
Normal file
File diff suppressed because it is too large
Load Diff
22
third_party/dav1d/src/x86/mc_init_tmpl.c
vendored
22
third_party/dav1d/src/x86/mc_init_tmpl.c
vendored
@ -47,7 +47,7 @@
|
||||
decl_##type##_fn(name##_16bpc_sse2); \
|
||||
decl_##type##_fn(name##_16bpc_ssse3); \
|
||||
decl_##type##_fn(name##_16bpc_avx2); \
|
||||
decl_##type##_fn(name##_avx512icl);
|
||||
decl_##type##_fn(name##_16bpc_avx512icl);
|
||||
#define init_mc_fn(type, name, suffix) \
|
||||
c->mc[type] = dav1d_put_##name##_16bpc_##suffix
|
||||
#define init_mct_fn(type, name, suffix) \
|
||||
@ -147,8 +147,6 @@ COLD void bitfn(dav1d_mc_dsp_init_x86)(Dav1dMCDSPContext *const c) {
|
||||
if(!(flags & DAV1D_X86_CPU_FLAG_SSSE3))
|
||||
return;
|
||||
|
||||
#if BITDEPTH == 8
|
||||
init_mc_fn(FILTER_2D_BILINEAR, bilin, ssse3);
|
||||
init_mc_fn(FILTER_2D_8TAP_REGULAR, 8tap_regular, ssse3);
|
||||
init_mc_fn(FILTER_2D_8TAP_REGULAR_SMOOTH, 8tap_regular_smooth, ssse3);
|
||||
init_mc_fn(FILTER_2D_8TAP_REGULAR_SHARP, 8tap_regular_sharp, ssse3);
|
||||
@ -158,8 +156,8 @@ COLD void bitfn(dav1d_mc_dsp_init_x86)(Dav1dMCDSPContext *const c) {
|
||||
init_mc_fn(FILTER_2D_8TAP_SHARP_REGULAR, 8tap_sharp_regular, ssse3);
|
||||
init_mc_fn(FILTER_2D_8TAP_SHARP_SMOOTH, 8tap_sharp_smooth, ssse3);
|
||||
init_mc_fn(FILTER_2D_8TAP_SHARP, 8tap_sharp, ssse3);
|
||||
init_mc_fn(FILTER_2D_BILINEAR, bilin, ssse3);
|
||||
|
||||
init_mct_fn(FILTER_2D_BILINEAR, bilin, ssse3);
|
||||
init_mct_fn(FILTER_2D_8TAP_REGULAR, 8tap_regular, ssse3);
|
||||
init_mct_fn(FILTER_2D_8TAP_REGULAR_SMOOTH, 8tap_regular_smooth, ssse3);
|
||||
init_mct_fn(FILTER_2D_8TAP_REGULAR_SHARP, 8tap_regular_sharp, ssse3);
|
||||
@ -169,8 +167,9 @@ COLD void bitfn(dav1d_mc_dsp_init_x86)(Dav1dMCDSPContext *const c) {
|
||||
init_mct_fn(FILTER_2D_8TAP_SHARP_REGULAR, 8tap_sharp_regular, ssse3);
|
||||
init_mct_fn(FILTER_2D_8TAP_SHARP_SMOOTH, 8tap_sharp_smooth, ssse3);
|
||||
init_mct_fn(FILTER_2D_8TAP_SHARP, 8tap_sharp, ssse3);
|
||||
init_mct_fn(FILTER_2D_BILINEAR, bilin, ssse3);
|
||||
|
||||
#if ARCH_X86_64
|
||||
#if BITDEPTH == 8 && ARCH_X86_64
|
||||
init_mc_scaled_fn(FILTER_2D_8TAP_REGULAR, 8tap_scaled_regular, ssse3);
|
||||
init_mc_scaled_fn(FILTER_2D_8TAP_REGULAR_SMOOTH, 8tap_scaled_regular_smooth, ssse3);
|
||||
init_mc_scaled_fn(FILTER_2D_8TAP_REGULAR_SHARP, 8tap_scaled_regular_sharp, ssse3);
|
||||
@ -194,6 +193,7 @@ COLD void bitfn(dav1d_mc_dsp_init_x86)(Dav1dMCDSPContext *const c) {
|
||||
init_mct_scaled_fn(FILTER_2D_BILINEAR, bilin_scaled, ssse3);
|
||||
#endif
|
||||
|
||||
#if BITDEPTH == 8
|
||||
c->avg = dav1d_avg_ssse3;
|
||||
c->w_avg = dav1d_w_avg_ssse3;
|
||||
c->mask = dav1d_mask_ssse3;
|
||||
@ -207,6 +207,18 @@ COLD void bitfn(dav1d_mc_dsp_init_x86)(Dav1dMCDSPContext *const c) {
|
||||
|
||||
c->emu_edge = dav1d_emu_edge_ssse3;
|
||||
c->resize = dav1d_resize_ssse3;
|
||||
#else
|
||||
c->avg = dav1d_avg_16bpc_ssse3;
|
||||
c->w_avg = dav1d_w_avg_16bpc_ssse3;
|
||||
c->mask = dav1d_mask_16bpc_ssse3;
|
||||
c->w_mask[0] = dav1d_w_mask_444_16bpc_ssse3;
|
||||
c->w_mask[1] = dav1d_w_mask_422_16bpc_ssse3;
|
||||
c->w_mask[2] = dav1d_w_mask_420_16bpc_ssse3;
|
||||
c->blend = dav1d_blend_16bpc_ssse3;
|
||||
c->blend_v = dav1d_blend_v_16bpc_ssse3;
|
||||
c->blend_h = dav1d_blend_h_16bpc_ssse3;
|
||||
|
||||
c->emu_edge = dav1d_emu_edge_16bpc_ssse3;
|
||||
#endif
|
||||
|
||||
if(!(flags & DAV1D_X86_CPU_FLAG_SSE41))
|
||||
|
8
third_party/dav1d/tests/checkasm/checkasm.h
vendored
8
third_party/dav1d/tests/checkasm/checkasm.h
vendored
@ -282,9 +282,9 @@ void checkasm_stack_clobber(uint64_t clobber, ...);
|
||||
#ifdef readtime
|
||||
#define bench_new(...)\
|
||||
do {\
|
||||
func_type *tfunc = func_new;\
|
||||
checkasm_set_signal_handler_state(1);\
|
||||
if (checkasm_bench_func()) {\
|
||||
checkasm_set_signal_handler_state(1);\
|
||||
func_type *tfunc = func_new;\
|
||||
uint64_t tsum = 0;\
|
||||
int tcount = 0;\
|
||||
for (int ti = 0; ti < BENCH_RUNS; ti++) {\
|
||||
@ -299,9 +299,11 @@ void checkasm_stack_clobber(uint64_t clobber, ...);
|
||||
tcount++;\
|
||||
}\
|
||||
}\
|
||||
checkasm_set_signal_handler_state(0);\
|
||||
checkasm_update_bench(tcount, tsum);\
|
||||
} else {\
|
||||
tfunc(__VA_ARGS__);\
|
||||
}\
|
||||
checkasm_set_signal_handler_state(0);\
|
||||
} while (0)
|
||||
#else
|
||||
#define bench_new(...) do {} while (0)
|
||||
|
42
third_party/dav1d/tests/checkasm/filmgrain.c
vendored
42
third_party/dav1d/tests/checkasm/filmgrain.c
vendored
@ -188,12 +188,21 @@ static void check_fgy_sbrow(const Dav1dFilmGrainDSPContext *const dsp) {
|
||||
for (fg_data[0].overlap_flag = 0; fg_data[0].overlap_flag <= 1;
|
||||
fg_data[0].overlap_flag++)
|
||||
{
|
||||
for (int i = 0; i <= fg_data[0].overlap_flag; i++) {
|
||||
for (int i = 0; i <= 2 * fg_data[0].overlap_flag; i++) {
|
||||
int w, h, row_num;
|
||||
if (fg_data[0].overlap_flag) {
|
||||
w = 35 + (rnd() % 93);
|
||||
h = 3 + (rnd() % 29);
|
||||
row_num = i ? 1 + (rnd() & 0x7ff) : 0;
|
||||
if (i == 0) {
|
||||
row_num = 0;
|
||||
h = 1 + (rnd() % 31);
|
||||
} else {
|
||||
row_num = 1 + (rnd() & 0x7ff);
|
||||
if (i == 1) {
|
||||
h = 3 + (rnd() % 30);
|
||||
} else {
|
||||
h = 1 + (rnd() & 1);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
w = 1 + (rnd() & 127);
|
||||
h = 1 + (rnd() & 31);
|
||||
@ -220,6 +229,11 @@ static void check_fgy_sbrow(const Dav1dFilmGrainDSPContext *const dsp) {
|
||||
}
|
||||
}
|
||||
fg_data[0].overlap_flag = 1;
|
||||
for (int y = 0; y < 32; y++) {
|
||||
// Make sure all pixels are in range
|
||||
for (int x = 0; x < 128; x++)
|
||||
src[y * PXSTRIDE(stride) + x] &= bitdepth_max;
|
||||
}
|
||||
bench_new(a_dst, src, stride, fg_data, 64, scaling, grain_lut, 32,
|
||||
1 HIGHBD_TAIL_SUFFIX);
|
||||
}
|
||||
@ -311,12 +325,21 @@ static void check_fguv_sbrow(const Dav1dFilmGrainDSPContext *const dsp) {
|
||||
for (fg_data[0].overlap_flag = 0; fg_data[0].overlap_flag <= 1;
|
||||
fg_data[0].overlap_flag++)
|
||||
{
|
||||
for (int i = 0; i <= fg_data[0].overlap_flag; i++) {
|
||||
for (int i = 0; i <= 2 * fg_data[0].overlap_flag; i++) {
|
||||
int w, h, row_num;
|
||||
if (fg_data[0].overlap_flag) {
|
||||
w = (36 >> ss_x) + (rnd() % (92 >> ss_x));
|
||||
h = (4 >> ss_y) + (rnd() % (28 >> ss_y));
|
||||
row_num = i ? 1 + (rnd() & 0x7ff) : 0;
|
||||
if (i == 0) {
|
||||
row_num = 0;
|
||||
h = 1 + (rnd() & (31 >> ss_y));
|
||||
} else {
|
||||
row_num = 1 + (rnd() & 0x7ff);
|
||||
if (i == 1) {
|
||||
h = (ss_y ? 2 : 3) + (rnd() % (ss_y ? 15 : 30));
|
||||
} else {
|
||||
h = ss_y ? 1 : 1 + (rnd() & 1);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
w = 1 + (rnd() & (127 >> ss_x));
|
||||
h = 1 + (rnd() & (31 >> ss_y));
|
||||
@ -350,6 +373,13 @@ static void check_fguv_sbrow(const Dav1dFilmGrainDSPContext *const dsp) {
|
||||
}
|
||||
|
||||
fg_data[0].overlap_flag = 1;
|
||||
for (int y = 0; y < 32; y++) {
|
||||
// Make sure all pixels are in range
|
||||
for (int x = 0; x < 128; x++) {
|
||||
src[y * PXSTRIDE(stride) + x] &= bitdepth_max;
|
||||
luma_src[y * PXSTRIDE(lstride) + x] &= bitdepth_max;
|
||||
}
|
||||
}
|
||||
bench_new(a_dst, src, stride, fg_data, 32, scaling, grain_lut[1], 16,
|
||||
1, luma_src, lstride, uv_pl, is_identity HIGHBD_TAIL_SUFFIX);
|
||||
}
|
||||
|
@ -33,13 +33,12 @@
|
||||
#include "src/loopfilter.h"
|
||||
|
||||
static void init_lpf_border(pixel *const dst, const ptrdiff_t stride,
|
||||
int E, int I, int H, const int bitdepth_max)
|
||||
int E, int I, const int bitdepth_max)
|
||||
{
|
||||
const int bitdepth_min_8 = bitdepth_from_max(bitdepth_max) - 8;
|
||||
const int F = 1 << bitdepth_min_8;
|
||||
E <<= bitdepth_min_8;
|
||||
I <<= bitdepth_min_8;
|
||||
H <<= bitdepth_min_8;
|
||||
|
||||
const int filter_type = rnd() % 4;
|
||||
const int edge_diff = rnd() % ((E + 2) * 4) - 2 * (E + 2);
|
||||
@ -171,7 +170,7 @@ static void check_lpf_sb(loopfilter_sb_fn fn, const char *const name,
|
||||
L = l[2 * x + 1][lf_idx] ? l[2 * x + 1][lf_idx] : l[2 * x][lf_idx];
|
||||
}
|
||||
init_lpf_border(c_dst + i * (dir ? 1 : 16), dir ? 128 : 1,
|
||||
lut.e[L], lut.i[L], L >> 4, bitdepth_max);
|
||||
lut.e[L], lut.i[L], bitdepth_max);
|
||||
}
|
||||
memcpy(a_dst_mem, c_dst_mem, 128 * sizeof(pixel) * 16);
|
||||
|
||||
|
Loading…
x
Reference in New Issue
Block a user