Bug 1716453 - Update dav1d to new version ddbbfde for Firefox 91. r=mjf

Differential Revision: https://phabricator.services.mozilla.com/D118295
Jon Bauman 2021-06-21 21:15:52 +00:00
parent e7acc848b5
commit 36b612a851
21 changed files with 8882 additions and 303 deletions


@@ -93,8 +93,9 @@ if CONFIG['CPU_ARCH'] in ('x86', 'x86_64'):
'../../../third_party/dav1d/src/x86/itx16_avx2.asm',
'../../../third_party/dav1d/src/x86/itx_avx2.asm',
'../../../third_party/dav1d/src/x86/loopfilter16_avx2.asm',
'../../../third_party/dav1d/src/x86/loopfilter16_sse.asm',
'../../../third_party/dav1d/src/x86/loopfilter_avx2.asm',
'../../../third_party/dav1d/src/x86/looprestoration16_avx2.asm', # moved from autovendored
'../../../third_party/dav1d/src/x86/looprestoration16_avx2.asm',
'../../../third_party/dav1d/src/x86/looprestoration_avx2.asm',
'../../../third_party/dav1d/src/x86/mc16_avx2.asm',
'../../../third_party/dav1d/src/x86/mc_avx2.asm',
@@ -110,7 +111,9 @@ if CONFIG['CPU_ARCH'] in ('x86', 'x86_64'):
'../../../third_party/dav1d/src/x86/itx16_sse.asm',
'../../../third_party/dav1d/src/x86/itx_sse.asm',
'../../../third_party/dav1d/src/x86/loopfilter_sse.asm',
'../../../third_party/dav1d/src/x86/looprestoration16_sse.asm', # moved from autovendored
'../../../third_party/dav1d/src/x86/looprestoration_sse.asm',
'../../../third_party/dav1d/src/x86/mc16_sse.asm',
'../../../third_party/dav1d/src/x86/mc_sse.asm',
'../../../third_party/dav1d/src/x86/msac.asm',
]
@@ -206,6 +209,8 @@ elif CONFIG['CPU_ARCH'] == 'arm' or CONFIG['CPU_ARCH'] == 'aarch64':
'../../../third_party/dav1d/src/arm/32/cdef.S',
'../../../third_party/dav1d/src/arm/32/cdef16.S',
'../../../third_party/dav1d/src/arm/32/cdef_tmpl.S',
'../../../third_party/dav1d/src/arm/32/film_grain.S',
'../../../third_party/dav1d/src/arm/32/film_grain16.S',
'../../../third_party/dav1d/src/arm/32/ipred.S',
'../../../third_party/dav1d/src/arm/32/ipred16.S',
'../../../third_party/dav1d/src/arm/32/itx.S',


@@ -20,11 +20,11 @@ origin:
# Human-readable identifier for this version/release
# Generally "version NNN", "tag SSS", "bookmark SSS"
release: commit c54add020492e3cca0da5ab90fa69c92ba496384 (2021-05-18T02:50:02.000+02:00).
release: commit ddbbfde198aced0d02ea739c320d754d43406f7b (2021-06-12T07:58:29.000+00:00).
# Revision to pull in
# Must be a long or short commit SHA (long preferred)
revision: c54add020492e3cca0da5ab90fa69c92ba496384
revision: ddbbfde198aced0d02ea739c320d754d43406f7b
# The package's license, where possible using the mnemonic from
# https://spdx.org/licenses/


@@ -1,2 +1,2 @@
/* auto-generated, do not edit */
#define DAV1D_VERSION "0.9.0-1-gc54add0"
#define DAV1D_VERSION "0.9.0-24-gddbbfde"


@@ -60,7 +60,7 @@ Our contributions guidelines are quite strict. We want to build a coherent codeb
Notably, the codebase is in pure C and asm.
We are on IRC, on the **#dav1d** channel on *Freenode*.
We are on IRC, on the **#dav1d** channel on [*Libera.chat*](http://libera.chat/). If you do not have an IRC Client at hand, use [KiwiIRC Web Interface](https://kiwiirc.com/nextclient/#ircs://irc.libera.chat/#dav1d).
See the [contributions document](CONTRIBUTING.md).


@@ -0,0 +1,714 @@
/*
* Copyright © 2021, VideoLAN and dav1d authors
* Copyright © 2021, Martin Storsjo
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
* ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "src/arm/asm.S"
#include "util.S"
#include "src/arm/asm-offsets.h"
#define GRAIN_WIDTH 82
.macro gather_interleaved dst1, dst2, src1, src2, off
vmov.u8 r11, \src1[0+\off]
vmov.u8 r12, \src2[0+\off]
add r11, r11, r3
vmov.u8 lr, \src1[2+\off]
add r12, r12, r3
vld1.8 {\dst1[0+\off]}, [r11]
vmov.u8 r11, \src2[2+\off]
add lr, lr, r3
vld1.8 {\dst2[0+\off]}, [r12]
vmov.u8 r12, \src1[4+\off]
add r11, r11, r3
vld1.8 {\dst1[2+\off]}, [lr]
vmov.u8 lr, \src2[4+\off]
add r12, r12, r3
vld1.8 {\dst2[2+\off]}, [r11]
vmov.u8 r11, \src1[6+\off]
add lr, lr, r3
vld1.8 {\dst1[4+\off]}, [r12]
vmov.u8 r12, \src2[6+\off]
add r11, r11, r3
vld1.8 {\dst2[4+\off]}, [lr]
add r12, r12, r3
vld1.8 {\dst1[6+\off]}, [r11]
vld1.8 {\dst2[6+\off]}, [r12]
.endm
.macro gather dst1, dst2, dst3, dst4, src1, src2, src3, src4
gather_interleaved \dst1, \dst3, \src1, \src3, 0
gather_interleaved \dst1, \dst3, \src1, \src3, 1
gather_interleaved \dst2, \dst4, \src2, \src4, 0
gather_interleaved \dst2, \dst4, \src2, \src4, 1
.endm
function gather32_neon
push {r11-r12,lr}
gather d8, d9, d10, d11, d0, d1, d2, d3
pop {r11-r12,pc}
endfunc
function gather16_neon
push {r11-r12,lr}
gather_interleaved d8, d9, d0, d1, 0
gather_interleaved d8, d9, d0, d1, 1
pop {r11-r12,pc}
endfunc
const overlap_coeffs_0, align=4
.byte 27, 17, 0, 0, 0, 0, 0, 0
.byte 17, 27, 32, 32, 32, 32, 32, 32
endconst
const overlap_coeffs_1, align=4
.byte 23, 0, 0, 0, 0, 0, 0, 0
.byte 22, 32, 32, 32, 32, 32, 32, 32
endconst
.macro calc_offset offx, offy, src, sx, sy
and \offy, \src, #0xF // randval & 0xF
lsr \offx, \src, #4 // randval >> 4
.if \sy == 0
add \offy, \offy, \offy // 2 * (randval & 0xF)
.endif
.if \sx == 0
add \offx, \offx, \offx // 2 * (randval >> 4)
.endif
.endm
.macro add_offset dst, offx, offy, src, stride
mla \dst, \stride, \offy, \src // grain_lut += grain_stride * offy
add \dst, \dst, \offx // grain_lut += offx
.endm
// void dav1d_fgy_32x32_8bpc_neon(pixel *const dst, const pixel *const src,
// const ptrdiff_t stride,
// const uint8_t scaling[SCALING_SIZE],
// const int scaling_shift,
// const entry grain_lut[][GRAIN_WIDTH],
// const int offsets[][2],
// const int h, const ptrdiff_t clip,
// const ptrdiff_t type);
function fgy_32x32_8bpc_neon, export=1
push {r4-r11,lr}
vpush {q4-q7}
ldrd r4, r5, [sp, #100] // scaling_shift, grain_lut
ldrd r6, r7, [sp, #108] // offsets, h
ldr r8, [sp, #116] // clip
mov r9, #GRAIN_WIDTH // grain_lut stride
neg r4, r4
vdup.16 q13, r4 // -scaling_shift
cmp r8, #0
movrel_local r12, overlap_coeffs_0
beq 1f
// clip
vmov.i8 q14, #16
vmov.i8 q15, #235
b 2f
1:
// no clip
vmov.i8 q14, #0
vmov.i8 q15, #255
2:
vld1.8 {d24, d25}, [r12, :128] // overlap_coeffs
add r5, r5, #9 // grain_lut += 9
add r5, r5, r9, lsl #3 // grain_lut += 8 * grain_stride
add r5, r5, r9 // grain_lut += grain_stride
ldr r10, [r6, #8] // offsets[1][0]
calc_offset r10, r4, r10, 0, 0
add_offset r4, r10, r4, r5, r9
ldr r10, [r6, #4] // offsets[0][1]
calc_offset r10, r11, r10, 0, 0
add_offset r11, r10, r11, r5, r9
ldr r10, [r6, #12] // offsets[1][1]
calc_offset r10, r8, r10, 0, 0
add_offset r8, r10, r8, r5, r9
ldr r6, [r6] // offsets[0][0]
calc_offset r6, lr, r6, 0, 0
add_offset r5, r6, lr, r5, r9
add r4, r4, #32 // grain_lut += BLOCK_SIZE * bx
add r6, r11, r9, lsl #5 // grain_lut += grain_stride * BLOCK_SIZE * by
ldr r10, [sp, #120] // type
adr r11, L(fgy_loop_tbl)
tst r10, #1
ldr r10, [r11, r10, lsl #2]
add r8, r8, r9, lsl #5 // grain_lut += grain_stride * BLOCK_SIZE * by
add r8, r8, #32 // grain_lut += BLOCK_SIZE * bx
add r11, r11, r10
beq 1f
// y overlap
vdup.8 d14, d24[0]
vdup.8 d15, d24[1]
mov r10, r7 // backup actual h
mov r7, #2
1:
bx r11
endfunc
function fgy_loop_neon
L(fgy_loop_tbl):
.word L(loop_00) - L(fgy_loop_tbl) + CONFIG_THUMB
.word L(loop_01) - L(fgy_loop_tbl) + CONFIG_THUMB
.word L(loop_10) - L(fgy_loop_tbl) + CONFIG_THUMB
.word L(loop_11) - L(fgy_loop_tbl) + CONFIG_THUMB
.macro fgy ox, oy
L(loop_\ox\oy):
1:
.if \ox
vld1.8 {d8}, [r4], r9 // grain_lut old
.endif
.if \oy
vld1.8 {q2, q3}, [r6], r9 // grain_lut top
.endif
.if \ox && \oy
vld1.8 {d10}, [r8], r9 // grain_lut top old
.endif
vld1.8 {q0, q1}, [r1, :128], r2 // src
vld1.8 {q10, q11}, [r5], r9 // grain_lut
.if \ox
vmull.s8 q4, d8, d24
vmlal.s8 q4, d20, d25
.endif
.if \oy
.if \ox
vmull.s8 q5, d10, d24
vmlal.s8 q5, d4, d25
vqrshrn.s16 d20, q4, #5
vqrshrn.s16 d4, q5, #5
.endif
vmull.s8 q4, d20, d15
vmull.s8 q5, d21, d15
vmull.s8 q8, d22, d15
vmull.s8 q9, d23, d15
vmlal.s8 q4, d4, d14
vmlal.s8 q5, d5, d14
vmlal.s8 q8, d6, d14
vmlal.s8 q9, d7, d14
vqrshrn.s16 d20, q4, #5
vqrshrn.s16 d21, q5, #5
vqrshrn.s16 d22, q8, #5
vqrshrn.s16 d23, q9, #5
.elseif \ox
vqrshrn.s16 d20, q4, #5
.endif
bl gather32_neon
vmovl.s8 q8, d20 // grain
vmovl.s8 q9, d21
vmovl.s8 q10, d22
vmovl.s8 q11, d23
vmovl.u8 q2, d8 // scaling
vmovl.u8 q3, d9
vmovl.u8 q4, d10
vmovl.u8 q5, d11
vmul.i16 q8, q8, q2 // scaling * grain
vmul.i16 q9, q9, q3
vmul.i16 q10, q10, q4
vmul.i16 q11, q11, q5
vrshl.s16 q8, q8, q13 // round2(scaling * grain, scaling_shift)
vrshl.s16 q9, q9, q13
vrshl.s16 q10, q10, q13
vrshl.s16 q11, q11, q13
vaddw.u8 q8, q8, d0 // *src + noise
vaddw.u8 q9, q9, d1
vaddw.u8 q10, q10, d2
vaddw.u8 q11, q11, d3
vqmovun.s16 d0, q8
vqmovun.s16 d1, q9
vqmovun.s16 d2, q10
vqmovun.s16 d3, q11
vmax.u8 q0, q0, q14
vmax.u8 q1, q1, q14
vmin.u8 q0, q0, q15
vmin.u8 q1, q1, q15
subs r7, r7, #1
.if \oy
vdup.8 d14, d25[0]
vdup.8 d15, d25[1]
.endif
vst1.8 {q0, q1}, [r0, :128], r2 // dst
bgt 1b
.if \oy
cmp r10, #2
sub r7, r10, #2 // restore actual remaining h
bgt L(loop_\ox\()0)
.endif
vpop {q4-q7}
pop {r4-r11,pc}
.endm
fgy 0, 0
fgy 0, 1
fgy 1, 0
fgy 1, 1
endfunc
// void dav1d_fguv_32x32_420_8bpc_neon(pixel *const dst,
// const pixel *const src,
// const ptrdiff_t stride,
// const uint8_t scaling[SCALING_SIZE],
// const Dav1dFilmGrainData *const data,
// const entry grain_lut[][GRAIN_WIDTH],
// const pixel *const luma_row,
// const ptrdiff_t luma_stride,
// const int offsets[][2],
// const ptrdiff_t h, const ptrdiff_t uv,
// const ptrdiff_t is_id,
// const ptrdiff_t type);
.macro fguv layout, sx, sy
function fguv_32x32_\layout\()_8bpc_neon, export=1
push {r4-r11,lr}
vpush {q4-q7}
ldrd r4, r5, [sp, #100] // data, grain_lut
ldrd r6, r7, [sp, #108] // luma_row, luma_stride
ldrd r8, r9, [sp, #116] // offsets, h
ldrd r10, r11, [sp, #124] // uv, is_id
// !csfl
add r10, r4, r10, lsl #2 // + 4*uv
add r12, r10, #FGD_UV_LUMA_MULT
add lr, r10, #FGD_UV_MULT
add r10, r10, #FGD_UV_OFFSET
vld1.16 {d4[]}, [r12] // uv_luma_mult
vld1.16 {d4[2]}, [r10] // uv_offset
vld1.16 {d4[1]}, [lr] // uv_mult
ldr lr, [r4, #FGD_SCALING_SHIFT]
ldr r12, [r4, #FGD_CLIP_TO_RESTRICTED_RANGE]
neg lr, lr // -scaling_shift
cmp r12, #0
vdup.16 q13, lr // -scaling_shift
beq 1f
// clip
cmp r11, #0
vmov.i8 q14, #16
vmov.i8 q15, #240
beq 2f
// is_id
vmov.i8 q15, #235
b 2f
1:
// no clip
vmov.i8 q14, #0
vmov.i8 q15, #255
2:
mov r10, #GRAIN_WIDTH // grain_lut stride
add r5, r5, #(3 + (2 >> \sx)*3) // grain_lut += 9 or 6
.if \sy
add r5, r5, r10, lsl #2 // grain_lut += 4 * grain_stride
add r5, r5, r10, lsl #1 // grain_lut += 2 * grain_stride
.else
add r5, r5, r10, lsl #3 // grain_lut += 8 * grain_stride
add r5, r5, r10 // grain_lut += grain_stride
.endif
ldr r12, [r8, #8] // offsets[1][0]
calc_offset r12, r4, r12, \sx, \sy
add_offset r4, r12, r4, r5, r10
ldr r12, [r8, #4] // offsets[0][1]
calc_offset r12, lr, r12, \sx, \sy
add_offset lr, r12, lr, r5, r10
ldr r12, [r8, #12] // offsets[1][1]
calc_offset r12, r11, r12, \sx, \sy
add_offset r11, r12, r11, r5, r10
ldr r8, [r8] // offsets[0][0]
calc_offset r8, r12, r8, \sx, \sy
add_offset r5, r8, r12, r5, r10
add r4, r4, #(32 >> \sx) // grain_lut += BLOCK_SIZE * bx
add r8, lr, r10, lsl #(5 - \sy) // grain_lut += grain_stride * BLOCK_SIZE * by
add r11, r11, r10, lsl #(5 - \sy) // grain_lut += grain_stride * BLOCK_SIZE * by
add r11, r11, #(32 >> \sx) // grain_lut += BLOCK_SIZE * bx
movrel_local r12, overlap_coeffs_\sx
ldr lr, [sp, #132] // type
vld1.8 {d24, d25}, [r12, :128] // overlap_coeffs
movrel_local r12, L(fguv_loop_sx\sx\()_tbl)
#if CONFIG_THUMB
// This uses movrel_local instead of adr above, because the target
// can be out of range for adr. But movrel_local leaves the thumb bit
// set on COFF (but probably wouldn't if building for thumb on ELF),
// thus try to clear the bit for robustness.
bic r12, r12, #1
#endif
tst lr, #1
ldr lr, [r12, lr, lsl #2]
add r12, r12, lr
beq 1f
// y overlap
sub lr, r9, #(2 >> \sy) // backup remaining h
mov r9, #(2 >> \sy)
1:
.if \sy
vmov.i8 d6, #23
vmov.i8 d7, #22
.else
vmov.i8 d6, #27
vmov.i8 d7, #17
.endif
.if \sy
add r7, r7, r7 // luma_stride *= 2
.endif
bx r12
endfunc
.endm
fguv 420, 1, 1
fguv 422, 1, 0
fguv 444, 0, 0
function fguv_loop_sx0_neon
L(fguv_loop_sx0_tbl):
.word L(fguv_loop_sx0_csfl0_00) - L(fguv_loop_sx0_tbl) + CONFIG_THUMB
.word L(fguv_loop_sx0_csfl0_01) - L(fguv_loop_sx0_tbl) + CONFIG_THUMB
.word L(fguv_loop_sx0_csfl0_10) - L(fguv_loop_sx0_tbl) + CONFIG_THUMB
.word L(fguv_loop_sx0_csfl0_11) - L(fguv_loop_sx0_tbl) + CONFIG_THUMB
.word L(fguv_loop_sx0_csfl1_00) - L(fguv_loop_sx0_tbl) + CONFIG_THUMB
.word L(fguv_loop_sx0_csfl1_01) - L(fguv_loop_sx0_tbl) + CONFIG_THUMB
.word L(fguv_loop_sx0_csfl1_10) - L(fguv_loop_sx0_tbl) + CONFIG_THUMB
.word L(fguv_loop_sx0_csfl1_11) - L(fguv_loop_sx0_tbl) + CONFIG_THUMB
.macro fguv_loop_sx0 csfl, ox, oy
L(fguv_loop_sx0_csfl\csfl\()_\ox\oy):
.if \oy
mov r12, lr
.endif
1:
.if \ox
vld1.8 {d8}, [r4], r10 // grain_lut old
.endif
.if \oy
vld1.8 {q8, q9}, [r8], r10 // grain_lut top
.endif
.if \ox && \oy
vld1.8 {d10}, [r11], r10 // grain_lut top old
.endif
vld1.8 {q0, q1}, [r6, :128], r7 // luma
vld1.8 {q10, q11}, [r5], r10 // grain_lut
.if \ox
vmull.s8 q4, d8, d24
vmlal.s8 q4, d20, d25
.endif
.if \oy
.if \ox
vmull.s8 q5, d10, d24
vmlal.s8 q5, d16, d25
vqrshrn.s16 d20, q4, #5
vqrshrn.s16 d16, q5, #5
.endif
vmull.s8 q4, d20, d7
vmull.s8 q5, d21, d7
vmull.s8 q6, d22, d7
vmull.s8 q7, d23, d7
vmlal.s8 q4, d16, d6
vmlal.s8 q5, d17, d6
vmlal.s8 q6, d18, d6
vmlal.s8 q7, d19, d6
vqrshrn.s16 d20, q4, #5
vqrshrn.s16 d21, q5, #5
vqrshrn.s16 d22, q6, #5
vqrshrn.s16 d23, q7, #5
.elseif \ox
vqrshrn.s16 d20, q4, #5
.endif
.if !\csfl
vld1.8 {q8, q9}, [r1, :128] // src
vmovl.u8 q4, d0
vmovl.u8 q5, d1
vmovl.u8 q6, d2
vmovl.u8 q7, d3
vmovl.u8 q0, d16
vmovl.u8 q1, d17
vmovl.u8 q8, d18
vmovl.u8 q9, d19
vmul.i16 q4, q4, d4[0]
vmul.i16 q5, q5, d4[0]
vmul.i16 q6, q6, d4[0]
vmul.i16 q7, q7, d4[0]
vmul.i16 q0, q0, d4[1]
vmul.i16 q1, q1, d4[1]
vmul.i16 q8, q8, d4[1]
vmul.i16 q9, q9, d4[1]
vqadd.s16 q4, q4, q0
vqadd.s16 q5, q5, q1
vqadd.s16 q6, q6, q8
vqadd.s16 q7, q7, q9
vdup.16 q0, d4[2]
vshr.s16 q4, q4, #6
vshr.s16 q5, q5, #6
vshr.s16 q6, q6, #6
vshr.s16 q7, q7, #6
vadd.i16 q4, q4, q0
vadd.i16 q5, q5, q0
vadd.i16 q6, q6, q0
vadd.i16 q7, q7, q0
vqmovun.s16 d0, q4
vqmovun.s16 d1, q5
vqmovun.s16 d2, q6
vqmovun.s16 d3, q7
.endif
bl gather32_neon
vld1.8 {q0, q1}, [r1, :128], r2 // src
vmovl.s8 q8, d20 // grain
vmovl.s8 q9, d21
vmovl.s8 q10, d22
vmovl.s8 q11, d23
vmovl.u8 q6, d8 // scaling
vmovl.u8 q7, d9
vmovl.u8 q4, d10
vmovl.u8 q5, d11
vmul.i16 q8, q8, q6 // scaling * grain
vmul.i16 q9, q9, q7
vmul.i16 q10, q10, q4
vmul.i16 q11, q11, q5
vrshl.s16 q8, q8, q13 // round2(scaling * grain, scaling_shift)
vrshl.s16 q9, q9, q13
vrshl.s16 q10, q10, q13
vrshl.s16 q11, q11, q13
vaddw.u8 q8, q8, d0 // *src + noise
vaddw.u8 q9, q9, d1
vaddw.u8 q10, q10, d2
vaddw.u8 q11, q11, d3
vqmovun.s16 d0, q8
vqmovun.s16 d1, q9
vqmovun.s16 d2, q10
vqmovun.s16 d3, q11
vmax.u8 q0, q0, q14
vmax.u8 q1, q1, q14
vmin.u8 q0, q0, q15
vmin.u8 q1, q1, q15
subs r9, r9, #1
.if \oy
vdup.8 d6, d25[0]
vdup.8 d7, d25[1]
.endif
vst1.8 {q0, q1}, [r0, :128], r2 // dst
bgt 1b
.if \oy
cmp r12, #0
mov r9, r12 // restore actual remaining h
bgt L(fguv_loop_sx0_csfl\csfl\()_\ox\()0)
.endif
b 9f
.endm
fguv_loop_sx0 0, 0, 0
fguv_loop_sx0 0, 0, 1
fguv_loop_sx0 0, 1, 0
fguv_loop_sx0 0, 1, 1
fguv_loop_sx0 1, 0, 0
fguv_loop_sx0 1, 0, 1
fguv_loop_sx0 1, 1, 0
fguv_loop_sx0 1, 1, 1
9:
vpop {q4-q7}
pop {r4-r11,pc}
endfunc
function fguv_loop_sx1_neon
L(fguv_loop_sx1_tbl):
.word L(fguv_loop_sx1_csfl0_00) - L(fguv_loop_sx1_tbl) + CONFIG_THUMB
.word L(fguv_loop_sx1_csfl0_01) - L(fguv_loop_sx1_tbl) + CONFIG_THUMB
.word L(fguv_loop_sx1_csfl0_10) - L(fguv_loop_sx1_tbl) + CONFIG_THUMB
.word L(fguv_loop_sx1_csfl0_11) - L(fguv_loop_sx1_tbl) + CONFIG_THUMB
.word L(fguv_loop_sx1_csfl1_00) - L(fguv_loop_sx1_tbl) + CONFIG_THUMB
.word L(fguv_loop_sx1_csfl1_01) - L(fguv_loop_sx1_tbl) + CONFIG_THUMB
.word L(fguv_loop_sx1_csfl1_10) - L(fguv_loop_sx1_tbl) + CONFIG_THUMB
.word L(fguv_loop_sx1_csfl1_11) - L(fguv_loop_sx1_tbl) + CONFIG_THUMB
.macro fguv_loop_sx1 csfl, ox, oy
L(fguv_loop_sx1_csfl\csfl\()_\ox\oy):
.if \oy
mov r12, lr
.endif
1:
.if \ox
vld1.8 {d8}, [r4], r10 // grain_lut old
.endif
.if \oy
vld1.8 {q8}, [r8], r10 // grain_lut top
.endif
.if \ox && \oy
vld1.8 {d10}, [r11], r10 // grain_lut top old
.endif
vld1.8 {q0, q1}, [r6, :128], r7 // luma
vld1.8 {q10}, [r5], r10 // grain_lut
vld1.8 {q11}, [r1, :128], r2 // src
.if \ox
vmull.s8 q4, d8, d24
vmlal.s8 q4, d20, d25
.endif
vpaddl.u8 q0, q0
vpaddl.u8 q1, q1
.if \oy
.if \ox
vmull.s8 q5, d10, d24
vmlal.s8 q5, d16, d25
vqrshrn.s16 d20, q4, #5
vqrshrn.s16 d16, q5, #5
.endif
vmull.s8 q4, d20, d7
vmull.s8 q5, d21, d7
vmlal.s8 q4, d16, d6
vmlal.s8 q5, d17, d6
vqrshrn.s16 d20, q4, #5
vqrshrn.s16 d21, q5, #5
.elseif \ox
vqrshrn.s16 d20, q4, #5
.endif
.if \csfl
vrshrn.u16 d0, q0, #1
vrshrn.u16 d1, q1, #1
.else
vrshr.u16 q4, q0, #1
vrshr.u16 q5, q1, #1
vmovl.u8 q0, d22
vmovl.u8 q1, d23
vmul.i16 q4, q4, d4[0]
vmul.i16 q5, q5, d4[0]
vmul.i16 q0, q0, d4[1]
vmul.i16 q1, q1, d4[1]
vqadd.s16 q4, q4, q0
vqadd.s16 q5, q5, q1
vdup.16 q0, d4[2]
vshr.s16 q4, q4, #6
vshr.s16 q5, q5, #6
vadd.i16 q4, q4, q0
vadd.i16 q5, q5, q0
vqmovun.s16 d0, q4
vqmovun.s16 d1, q5
.endif
bl gather16_neon
vmovl.s8 q8, d20 // grain
vmovl.s8 q9, d21
vmovl.u8 q6, d8 // scaling
vmovl.u8 q7, d9
vmul.i16 q8, q8, q6 // scaling * grain
vmul.i16 q9, q9, q7
vrshl.s16 q8, q8, q13 // round2(scaling * grain, scaling_shift)
vrshl.s16 q9, q9, q13
vaddw.u8 q8, q8, d22 // *src + noise
vaddw.u8 q9, q9, d23
vqmovun.s16 d0, q8
vqmovun.s16 d1, q9
vmax.u8 q0, q0, q14
vmin.u8 q0, q0, q15
subs r9, r9, #1
.if \oy
vswp d6, d7
.endif
vst1.8 {q0}, [r0, :128], r2 // dst
bgt 1b
.if \oy
cmp r12, #0
mov r9, r12 // restore actual remaining h
bgt L(fguv_loop_sx1_csfl\csfl\()_\ox\()0)
.endif
b 9f
.endm
fguv_loop_sx1 0, 0, 0
fguv_loop_sx1 0, 0, 1
fguv_loop_sx1 0, 1, 0
fguv_loop_sx1 0, 1, 1
fguv_loop_sx1 1, 0, 0
fguv_loop_sx1 1, 0, 1
fguv_loop_sx1 1, 1, 0
fguv_loop_sx1 1, 1, 1
9:
vpop {q4-q7}
pop {r4-r11,pc}
endfunc
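
For orientation: the calc_offset/add_offset macros above pick where a 32x32 block starts reading inside the 82-wide grain buffer, based on an 8-bit random value, and the block-edge overlap in the same file blends old and new grain as round2(old * w0 + new * w1, 5) with the weight pairs {27, 17} and {23, 22} loaded from overlap_coeffs_0/overlap_coeffs_1. A rough C sketch of the offset arithmetic for the luma (fgy) path, following the inline comments — the helper name and signature are illustrative, not dav1d's actual code:

#include <stddef.h>
#include <stdint.h>

/* Illustrative model of the calc_offset/add_offset setup for fgy;
 * grain_stride is GRAIN_WIDTH (82) entries. */
static const int8_t *grain_block_start(const int8_t *grain_lut,
                                       ptrdiff_t grain_stride,
                                       unsigned randval)
{
    int offy = randval & 0xF;   /* randval & 0xF */
    int offx = randval >> 4;    /* randval >> 4  */
    offy *= 2;                  /* luma is not subsampled: 2 * (randval & 0xF) */
    offx *= 2;                  /* 2 * (randval >> 4) */
    /* Skip the 9-column / 9-row border, as in the "grain_lut += 9",
     * "+= 8 * grain_stride", "+= grain_stride" setup above. */
    grain_lut += 9 * grain_stride + 9;
    return grain_lut + offy * grain_stride + offx;   /* add_offset */
}

Each of the four offsets[·][·] entries goes through the same computation, with #32 (BLOCK_SIZE * bx) and grain_stride << 5 (grain_stride * BLOCK_SIZE * by) added where the register comments indicate.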


@@ -0,0 +1,949 @@
/*
* Copyright © 2021, VideoLAN and dav1d authors
* Copyright © 2021, Martin Storsjo
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
* ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "src/arm/asm.S"
#include "util.S"
#include "src/arm/asm-offsets.h"
#define GRAIN_WIDTH 82
.macro gather_interleaved dst1, dst2, src1, src2, src3, src4, off
vmov.u16 r11, \src1[0+\off]
vmov.u16 r12, \src3[0+\off]
add r11, r11, r3
vmov.u16 lr, \src1[2+\off]
add r12, r12, r3
vld1.8 {\dst1[0+\off]}, [r11]
vmov.u16 r11, \src3[2+\off]
add lr, lr, r3
vld1.8 {\dst2[0+\off]}, [r12]
vmov.u16 r12, \src2[0+\off]
add r11, r11, r3
vld1.8 {\dst1[2+\off]}, [lr]
vmov.u16 lr, \src4[0+\off]
add r12, r12, r3
vld1.8 {\dst2[2+\off]}, [r11]
vmov.u16 r11, \src2[2+\off]
add lr, lr, r3
vld1.8 {\dst1[4+\off]}, [r12]
vmov.u16 r12, \src4[2+\off]
add r11, r11, r3
vld1.8 {\dst2[4+\off]}, [lr]
add r12, r12, r3
vld1.8 {\dst1[6+\off]}, [r11]
vld1.8 {\dst2[6+\off]}, [r12]
.endm
.macro gather dst1, dst2, dst3, dst4, src1, src2, src3, src4, src5, src6, src7, src8
gather_interleaved \dst1, \dst3, \src1, \src2, \src5, \src6, 0
gather_interleaved \dst1, \dst3, \src1, \src2, \src5, \src6, 1
gather_interleaved \dst2, \dst4, \src3, \src4, \src7, \src8, 0
gather_interleaved \dst2, \dst4, \src3, \src4, \src7, \src8, 1
.endm
function gather32_neon
push {r11-r12,lr}
gather d8, d9, d10, d11, d0, d1, d2, d3, d4, d5, d6, d7
pop {r11-r12,pc}
endfunc
function gather16_neon
push {r11-r12,lr}
gather_interleaved d8, d9, d0, d1, d2, d3, 0
gather_interleaved d8, d9, d0, d1, d2, d3, 1
pop {r11-r12,pc}
endfunc
const overlap_coeffs_0, align=4
.short 27, 17, 0, 0
.short 17, 27, 32, 32
endconst
const overlap_coeffs_1, align=4
.short 23, 0, 0, 0
.short 22, 32, 32, 32
endconst
.macro calc_offset offx, offy, src, sx, sy
and \offy, \src, #0xF // randval & 0xF
lsr \offx, \src, #4 // randval >> 4
.if \sy == 0
add \offy, \offy, \offy // 2 * (randval & 0xF)
.endif
.if \sx == 0
add \offx, \offx, \offx // 2 * (randval >> 4)
.endif
.endm
.macro add_offset dst, offx, offy, src, stride
mla \dst, \stride, \offy, \src // grain_lut += grain_stride * offy
add \dst, \dst, \offx, lsl #1 // grain_lut += offx
.endm
// void dav1d_fgy_32x32_16bpc_neon(pixel *const dst, const pixel *const src,
// const ptrdiff_t stride,
// const uint8_t scaling[SCALING_SIZE],
// const int scaling_shift,
// const entry grain_lut[][GRAIN_WIDTH],
// const int offsets[][2],
// const int h, const ptrdiff_t clip,
// const ptrdiff_t type,
// const int bitdepth_max);
function fgy_32x32_16bpc_neon, export=1
push {r4-r11,lr}
vpush {q4-q7}
ldrd r4, r5, [sp, #100] // scaling_shift, grain_lut
ldrd r6, r7, [sp, #108] // offsets, h
ldr r8, [sp, #116] // clip
mov r9, #GRAIN_WIDTH*2 // grain_lut stride
ldr r10, [sp, #124] // bitdepth_max
eor r4, r4, #15 // 15 - scaling_shift
vdup.16 q6, r10 // bitdepth_max
clz r10, r10
vdup.16 q13, r4 // 15 - scaling_shift
rsb r10, r10, #24 // bitdepth_min_8
cmp r8, #0
vdup.16 q12, r10 // bitdepth_min_8
movrel_local r12, overlap_coeffs_0
beq 1f
// clip
vmov.i16 q14, #16
vmov.i16 q15, #235
vshl.s16 q14, q14, q12
vshl.s16 q15, q15, q12
b 2f
1:
// no clip
vmov.i16 q14, #0
vmov q15, q6
2:
vshr.u16 q6, q6, #1 // grain_max
vld1.16 {d24, d25}, [r12, :128] // overlap_coeffs
add r5, r5, #18 // grain_lut += 9
add r5, r5, r9, lsl #3 // grain_lut += 8 * grain_stride
add r5, r5, r9 // grain_lut += grain_stride
ldr r10, [r6, #8] // offsets[1][0]
calc_offset r10, r4, r10, 0, 0
add_offset r4, r10, r4, r5, r9
ldr r10, [r6, #4] // offsets[0][1]
calc_offset r10, r11, r10, 0, 0
add_offset r11, r10, r11, r5, r9
ldr r10, [r6, #12] // offsets[1][1]
calc_offset r10, r8, r10, 0, 0
add_offset r8, r10, r8, r5, r9
ldr r6, [r6] // offsets[0][0]
calc_offset r6, lr, r6, 0, 0
add_offset r5, r6, lr, r5, r9
add r4, r4, #32*2 // grain_lut += BLOCK_SIZE * bx
add r6, r11, r9, lsl #5 // grain_lut += grain_stride * BLOCK_SIZE * by
ldr r10, [sp, #120] // type
adr r11, L(fgy_loop_tbl)
tst r10, #1
ldr r10, [r11, r10, lsl #2]
add r8, r8, r9, lsl #5 // grain_lut += grain_stride * BLOCK_SIZE * by
add r8, r8, #32*2 // grain_lut += BLOCK_SIZE * bx
add r11, r11, r10
beq 1f
// y overlap
vdup.16 d14, d24[0]
vdup.16 d15, d24[1]
mov r10, r7 // backup actual h
mov r7, #2
1:
sub r2, r2, #32 // src_stride -= 32
sub r9, r9, #32 // grain_stride -= 32
bx r11
endfunc
function fgy_loop_neon
L(fgy_loop_tbl):
.word L(loop_00) - L(fgy_loop_tbl) + CONFIG_THUMB
.word L(loop_01) - L(fgy_loop_tbl) + CONFIG_THUMB
.word L(loop_10) - L(fgy_loop_tbl) + CONFIG_THUMB
.word L(loop_11) - L(fgy_loop_tbl) + CONFIG_THUMB
.macro fgy ox, oy
L(loop_\ox\oy):
1:
.if \ox
vld1.16 {d0}, [r4], r9 // grain_lut old
.endif
.if \oy
vld1.16 {q2, q3}, [r6]! // grain_lut top
.endif
.if \ox && \oy
vld1.16 {d2}, [r8], r9 // grain_lut top old
.endif
.if \oy
vld1.16 {q4, q5}, [r6], r9 // grain_lut top
.endif
.if !\ox && !\oy
vld1.16 {q0, q1}, [r1, :128]! // src
.endif
vld1.16 {q8, q9}, [r5]! // grain_lut
.if !\ox && !\oy
vld1.16 {q2, q3}, [r1, :128], r2 // src
.endif
.if !\oy
vmvn.i16 q5, #0xf000 // 0x0fff
.endif
vld1.16 {q10, q11}, [r5], r9 // grain_lut
.if \ox
add r4, r4, #32
vmull.s16 q0, d0, d24
vmlal.s16 q0, d16, d25
.endif
.if \oy
.if \ox
add r8, r8, #32
vmull.s16 q1, d2, d24
vmlal.s16 q1, d4, d25
vqrshrn.s32 d16, q0, #5
vmvn d0, d12 // grain_min
vqrshrn.s32 d4, q1, #5
vmin.s16 d16, d16, d12
vmin.s16 d4, d4, d12
vmax.s16 d16, d16, d0
vmax.s16 d4, d4, d0
.endif
vmull.s16 q0, d4, d14
vmull.s16 q1, d5, d14
vmull.s16 q2, d6, d14
vmull.s16 q3, d7, d14
vmlal.s16 q0, d16, d15
vmlal.s16 q1, d17, d15
vmlal.s16 q2, d18, d15
vmlal.s16 q3, d19, d15
vmull.s16 q8, d20, d15
vmull.s16 q9, d21, d15
vmull.s16 q10, d22, d15
vmull.s16 q11, d23, d15
vmlal.s16 q8, d8, d14
vmlal.s16 q9, d9, d14
vmlal.s16 q10, d10, d14
vmlal.s16 q11, d11, d14
vmvn q4, q6 // grain_min
vqrshrn.s32 d0, q0, #5
vqrshrn.s32 d1, q1, #5
vqrshrn.s32 d2, q2, #5
vqrshrn.s32 d3, q3, #5
vqrshrn.s32 d4, q8, #5
vqrshrn.s32 d5, q9, #5
vqrshrn.s32 d6, q10, #5
vqrshrn.s32 d7, q11, #5
vmin.s16 q8, q0, q6
vmin.s16 q9, q1, q6
vld1.16 {q0, q1}, [r1, :128]! // src
vmin.s16 q10, q2, q6
vmin.s16 q11, q3, q6
vmax.s16 q8, q8, q4
vmax.s16 q9, q9, q4
vld1.16 {q2, q3}, [r1, :128], r2 // src
vmvn.i16 q5, #0xf000 // 0x0fff
vmax.s16 q10, q10, q4
vmax.s16 q11, q11, q4
.elseif \ox
vmvn d4, d12 // grain_min
vqrshrn.s32 d16, q0, #5
vld1.16 {q0, q1}, [r1, :128]! // src
vmin.s16 d16, d16, d12
vmax.s16 d16, d16, d4
vld1.16 {q2, q3}, [r1, :128], r2 // src
.endif
// Make sure that uninitialized pixels out of range past the right
// edge are in range; their actual values shouldn't matter.
vand q0, q0, q5
vand q1, q1, q5
vand q2, q2, q5
vand q3, q3, q5
bl gather32_neon
.if \ox || \oy
vpush {q6-q7}
.endif
vmovl.u8 q6, d8 // scaling
vmovl.u8 q7, d9
vmovl.u8 q4, d10
vmovl.u8 q5, d11
vshl.u16 q6, q6, q13 // scaling << (15 - scaling_shift)
vshl.u16 q7, q7, q13
vshl.u16 q4, q4, q13
vshl.u16 q5, q5, q13
vqrdmulh.s16 q8, q8, q6 // round2((scaling << (15 - scaling_shift) * grain, 15)
vqrdmulh.s16 q9, q9, q7
vqrdmulh.s16 q10, q10, q4
vqrdmulh.s16 q11, q11, q5
.if \ox || \oy
vpop {q6-q7}
.endif
vqadd.s16 q0, q0, q8 // *src + noise
vqadd.s16 q1, q1, q9
vqadd.s16 q2, q2, q10
vqadd.s16 q3, q3, q11
vmax.s16 q0, q0, q14
vmax.s16 q1, q1, q14
vmax.s16 q2, q2, q14
vmax.s16 q3, q3, q14
vmin.s16 q0, q0, q15
vmin.s16 q1, q1, q15
vmin.s16 q2, q2, q15
vmin.s16 q3, q3, q15
vst1.16 {q0, q1}, [r0, :128]! // dst
subs r7, r7, #1
.if \oy
vdup.16 d14, d25[0]
vdup.16 d15, d25[1]
.endif
vst1.16 {q2, q3}, [r0, :128], r2 // dst
bgt 1b
.if \oy
cmp r10, #2
sub r7, r10, #2 // restore actual remaining h
bgt L(loop_\ox\()0)
.endif
vpop {q4-q7}
pop {r4-r11,pc}
.endm
fgy 0, 0
fgy 0, 1
fgy 1, 0
fgy 1, 1
endfunc
// void dav1d_fguv_32x32_420_16bpc_neon(pixel *const dst,
// const pixel *const src,
// const ptrdiff_t stride,
// const uint8_t scaling[SCALING_SIZE],
// const Dav1dFilmGrainData *const data,
// const entry grain_lut[][GRAIN_WIDTH],
// const pixel *const luma_row,
// const ptrdiff_t luma_stride,
// const int offsets[][2],
// const ptrdiff_t h, const ptrdiff_t uv,
// const ptrdiff_t is_id,
// const ptrdiff_t type,
// const int bitdepth_max);
.macro fguv layout, sx, sy
function fguv_32x32_\layout\()_16bpc_neon, export=1
push {r4-r11,lr}
vpush {q4-q7}
ldrd r4, r5, [sp, #100] // data, grain_lut
ldrd r10, r11, [sp, #124] // uv, is_id
ldr r6, [sp, #136] // bitdepth_max
clz r7, r6
rsb r7, r7, #24 // bitdepth_min_8
// !csfl
add r10, r4, r10, lsl #2 // + 4*uv
add r12, r10, #FGD_UV_LUMA_MULT
add lr, r10, #FGD_UV_MULT
ldrh r10, [r10, #FGD_UV_OFFSET] // uv_offset
vld1.16 {d30[]}, [r12] // uv_luma_mult
lsl r10, r10, r7 // uv_offset << bitdepth_min_8
vld1.16 {d30[1]}, [lr] // uv_mult
ldr lr, [r4, #FGD_SCALING_SHIFT]
ldr r12, [r4, #FGD_CLIP_TO_RESTRICTED_RANGE]
eor lr, lr, #15 // 15 - scaling_shift
vmov.16 d30[2], r10 // uv_offset << bitdepth_min_8
cmp r12, #0
vdup.16 q13, lr // 15 - scaling_shift
beq 1f
// clip
cmp r11, #0
mov r8, #16
mov r9, #240
lsl r8, r8, r7
lsl r9, r9, r7
beq 2f
// is_id
mov r9, #235
lsl r9, r9, r7
b 2f
1:
// no clip
mov r8, #0
mov r9, r6 // bitdepth_max
2:
vmov.16 d30[3], r6 // bitdepth_max
vdup.16 d31, r8 // clip_min
mov r10, #GRAIN_WIDTH*2 // grain_lut stride
.if \sy
mov r6, #23
mov r7, #22
.else
mov r6, #27
mov r7, #17
.endif
vmov.16 d31[1], r9 // clip_max
ldrd r8, r9, [sp, #116] // offsets, h
add r5, r5, #(2*(3 + (2 >> \sx)*3)) // grain_lut += 9 or 6
.if \sy
add r5, r5, r10, lsl #2 // grain_lut += 4 * grain_stride
add r5, r5, r10, lsl #1 // grain_lut += 2 * grain_stride
.else
add r5, r5, r10, lsl #3 // grain_lut += 8 * grain_stride
add r5, r5, r10 // grain_lut += grain_stride
.endif
vmov.16 d31[2], r6 // overlap y [0]
ldr r12, [r8, #8] // offsets[1][0]
calc_offset r12, r4, r12, \sx, \sy
add_offset r4, r12, r4, r5, r10
ldr r12, [r8, #4] // offsets[0][1]
calc_offset r12, lr, r12, \sx, \sy
add_offset lr, r12, lr, r5, r10
ldr r12, [r8, #12] // offsets[1][1]
calc_offset r12, r11, r12, \sx, \sy
add_offset r11, r12, r11, r5, r10
ldr r8, [r8] // offsets[0][0]
calc_offset r8, r12, r8, \sx, \sy
add_offset r5, r8, r12, r5, r10
vmov.16 d31[3], r7 // overlap y [1]
add r4, r4, #2*(32 >> \sx) // grain_lut += BLOCK_SIZE * bx
add r8, lr, r10, lsl #(5 - \sy) // grain_lut += grain_stride * BLOCK_SIZE * by
add r11, r11, r10, lsl #(5 - \sy) // grain_lut += grain_stride * BLOCK_SIZE * by
add r11, r11, #2*(32 >> \sx) // grain_lut += BLOCK_SIZE * bx
movrel_local r12, overlap_coeffs_\sx
ldr lr, [sp, #132] // type
ldrd r6, r7, [sp, #108] // luma_row, luma_stride
vld1.16 {d24, d25}, [r12, :128] // overlap_coeffs
movrel_local r12, L(fguv_loop_sx\sx\()_tbl)
#if CONFIG_THUMB
// This uses movrel_local instead of adr above, because the target
// can be out of range for adr. But movrel_local leaves the thumb bit
// set on COFF (but probably wouldn't if building for thumb on ELF),
// thus try to clear the bit for robustness.
bic r12, r12, #1
#endif
tst lr, #1
ldr lr, [r12, lr, lsl #2]
add r12, r12, lr
beq 1f
// y overlap
sub lr, r9, #(2 >> \sy) // backup remaining h
mov r9, #(2 >> \sy)
1:
.if \sy
add r7, r7, r7 // luma_stride *= 2
.endif
sub r7, r7, #32 // luma_stride -= 32
bx r12
endfunc
.endm
fguv 420, 1, 1
fguv 422, 1, 0
fguv 444, 0, 0
function fguv_loop_sx0_neon
L(fguv_loop_sx0_tbl):
.word L(fguv_loop_sx0_csfl0_00) - L(fguv_loop_sx0_tbl) + CONFIG_THUMB
.word L(fguv_loop_sx0_csfl0_01) - L(fguv_loop_sx0_tbl) + CONFIG_THUMB
.word L(fguv_loop_sx0_csfl0_10) - L(fguv_loop_sx0_tbl) + CONFIG_THUMB
.word L(fguv_loop_sx0_csfl0_11) - L(fguv_loop_sx0_tbl) + CONFIG_THUMB
.word L(fguv_loop_sx0_csfl1_00) - L(fguv_loop_sx0_tbl) + CONFIG_THUMB
.word L(fguv_loop_sx0_csfl1_01) - L(fguv_loop_sx0_tbl) + CONFIG_THUMB
.word L(fguv_loop_sx0_csfl1_10) - L(fguv_loop_sx0_tbl) + CONFIG_THUMB
.word L(fguv_loop_sx0_csfl1_11) - L(fguv_loop_sx0_tbl) + CONFIG_THUMB
.macro fguv_loop_sx0 csfl, ox, oy
L(fguv_loop_sx0_csfl\csfl\()_\ox\oy):
sub r2, r2, #32 // src_stride -= 32
sub r10, r10, #32 // grain_stride -= 32
.if \oy
mov r12, lr
.endif
L(fguv_loop_sx0_csfl\csfl\()_\ox\oy\()_loopstart):
1:
.if \ox
vld1.16 {d0}, [r4], r10 // grain_lut old
.endif
.if \oy
vld1.16 {q2, q3}, [r8]! // grain_lut top
.endif
.if \ox && \oy
vld1.16 {d2}, [r11], r10 // grain_lut top old
.endif
.if !\ox && !\oy
vld1.16 {q0, q1}, [r6, :128]! // luma
.endif
vld1.16 {q8, q9}, [r5]! // grain_lut
.if \oy
vld1.16 {q4, q5}, [r8], r10 // grain_lut top
.endif
.if !\ox && !\oy
vld1.16 {q2, q3}, [r6, :128], r7 // luma
.endif
.if \oy
vdup.16 d28, d31[2] // overlap y coeff
vdup.16 d29, d31[3] // overlap y coeff
.endif
vld1.16 {q10, q11}, [r5], r10 // grain_lut
.if \ox
vdup.16 q7, d30[3] // bitdepth_max
add r4, r4, #32
vmull.s16 q0, d0, d24
vshr.u16 q7, q7, #1 // grain_max
vmlal.s16 q0, d16, d25
vmvn q6, q7 // grain_min
.endif
.if \oy
.if \ox
add r11, r11, #32
vmull.s16 q1, d2, d24
vmlal.s16 q1, d4, d25
vqrshrn.s32 d16, q0, #5
vqrshrn.s32 d4, q1, #5
vmin.s16 d4, d4, d14
vmin.s16 d16, d16, d14
vmax.s16 d4, d4, d12
vmax.s16 d16, d16, d12
.endif
vmull.s16 q0, d4, d28
vmull.s16 q1, d5, d28
vmull.s16 q2, d6, d28
vmull.s16 q3, d7, d28
.if !\ox
vdup.16 q7, d30[3] // bitdepth_max
.endif
vmlal.s16 q0, d16, d29
vmlal.s16 q1, d17, d29
vmlal.s16 q2, d18, d29
vmlal.s16 q3, d19, d29
.if !\ox
vshr.u16 q7, q7, #1 // grain_max
.endif
vmull.s16 q8, d20, d29
vmull.s16 q9, d21, d29
vmull.s16 q10, d22, d29
vmull.s16 q11, d23, d29
.if !\ox
vmvn q6, q7 // grain_min
.endif
vmlal.s16 q8, d8, d28
vmlal.s16 q9, d9, d28
vmlal.s16 q10, d10, d28
vmlal.s16 q11, d11, d28
vqrshrn.s32 d0, q0, #5
vqrshrn.s32 d1, q1, #5
vqrshrn.s32 d2, q2, #5
vqrshrn.s32 d3, q3, #5
vqrshrn.s32 d4, q8, #5
vqrshrn.s32 d5, q9, #5
vqrshrn.s32 d6, q10, #5
vqrshrn.s32 d7, q11, #5
vmin.s16 q8, q0, q7
vmin.s16 q9, q1, q7
vld1.16 {q0, q1}, [r6, :128]! // luma
vmin.s16 q10, q2, q7
vmin.s16 q11, q3, q7
vmax.s16 q8, q8, q6
vmax.s16 q9, q9, q6
vld1.16 {q2, q3}, [r6, :128], r7 // luma
vmax.s16 q10, q10, q6
vmax.s16 q11, q11, q6
.elseif \ox
vqrshrn.s32 d16, q0, #5
vld1.16 {q0, q1}, [r6, :128]! // luma
vmin.s16 d16, d16, d14
vld1.16 {q2, q3}, [r6, :128], r7 // luma
vmax.s16 d16, d16, d12
.endif
.if !\csfl
vdup.16 d28, d30[0] // uv_luma_mult
vld1.16 {q4, q5}, [r1, :128]! // src
vdup.16 d29, d30[1] // uv_mult
vmull.s16 q6, d0, d28
vmull.s16 q7, d1, d28
vmull.s16 q0, d2, d28
vmull.s16 q1, d3, d28
vmlal.s16 q6, d8, d29
vmlal.s16 q7, d9, d29
vmlal.s16 q0, d10, d29
vmlal.s16 q1, d11, d29
vld1.16 {q4, q5}, [r1, :128] // src
sub r1, r1, #32
vshrn.s32 d12, q6, #6
vshrn.s32 d13, q7, #6
vshrn.s32 d14, q0, #6
vshrn.s32 d15, q1, #6
vmull.s16 q0, d4, d28
vmull.s16 q1, d5, d28
vmull.s16 q2, d6, d28
vmull.s16 q3, d7, d28
vmlal.s16 q0, d8, d29
vmlal.s16 q1, d9, d29
vmlal.s16 q2, d10, d29
vmlal.s16 q3, d11, d29
vdup.16 q14, d30[2] // uv_offset
vshrn.s32 d0, q0, #6
vshrn.s32 d1, q1, #6
vshrn.s32 d2, q2, #6
vshrn.s32 d3, q3, #6
vdup.16 q4, d30[3] // bitdepth_max
vmov.i16 q5, #0
vadd.i16 q6, q6, q14
vadd.i16 q7, q7, q14
vadd.i16 q2, q0, q14
vadd.i16 q3, q1, q14
vmin.s16 q0, q6, q4
vmin.s16 q1, q7, q4
vmin.s16 q2, q2, q4
vmin.s16 q3, q3, q4
vmax.s16 q0, q0, q5
vmax.s16 q1, q1, q5
vmax.s16 q2, q2, q5
vmax.s16 q3, q3, q5
.else
vdup.16 q14, d30[3] // bitdepth_max
// Make sure that uninitialized pixels out of range past the right
// edge are in range; their actual values shouldn't matter.
vand q0, q0, q14
vand q1, q1, q14
vand q2, q2, q14
vand q3, q3, q14
.endif
bl gather32_neon
vld1.16 {q0, q1}, [r1, :128]! // src
vmovl.u8 q6, d8 // scaling
vmovl.u8 q7, d9
vmovl.u8 q4, d10
vmovl.u8 q5, d11
vld1.16 {q2, q3}, [r1, :128], r2 // src
vshl.u16 q6, q6, q13 // scaling << (15 - scaling_shift)
vshl.u16 q7, q7, q13
vshl.u16 q4, q4, q13
vshl.u16 q5, q5, q13
vqrdmulh.s16 q8, q8, q6 // round2((scaling << (15 - scaling_shift) * grain, 15)
vqrdmulh.s16 q9, q9, q7
vqrdmulh.s16 q10, q10, q4
vqrdmulh.s16 q11, q11, q5
vdup.16 q4, d31[0] // clip_min
vdup.16 q5, d31[1] // clip_max
vqadd.s16 q0, q0, q8 // *src + noise
vqadd.s16 q1, q1, q9
vqadd.s16 q2, q2, q10
vqadd.s16 q3, q3, q11
.if \oy
vmov.32 lr, d25[0] // 2 first 16 bit coeffs from overlap x
.endif
vmax.s16 q0, q0, q4
vmax.s16 q1, q1, q4
vmax.s16 q2, q2, q4
vmax.s16 q3, q3, q4
vmin.s16 q0, q0, q5
vmin.s16 q1, q1, q5
vmin.s16 q2, q2, q5
vmin.s16 q3, q3, q5
vst1.16 {q0, q1}, [r0, :128]! // dst
subs r9, r9, #1
.if \oy
vmov.32 d31[1], lr // new coeffs for overlap y
.endif
vst1.16 {q2, q3}, [r0, :128], r2 // dst
bgt 1b
.if \oy
cmp r12, #0
mov r9, r12 // restore actual remaining h
bgt L(fguv_loop_sx0_csfl\csfl\()_\ox\()0_loopstart)
.endif
b 9f
.endm
fguv_loop_sx0 0, 0, 0
fguv_loop_sx0 0, 0, 1
fguv_loop_sx0 0, 1, 0
fguv_loop_sx0 0, 1, 1
fguv_loop_sx0 1, 0, 0
fguv_loop_sx0 1, 0, 1
fguv_loop_sx0 1, 1, 0
fguv_loop_sx0 1, 1, 1
9:
vpop {q4-q7}
pop {r4-r11,pc}
endfunc
function fguv_loop_sx1_neon
L(fguv_loop_sx1_tbl):
.word L(fguv_loop_sx1_csfl0_00) - L(fguv_loop_sx1_tbl) + CONFIG_THUMB
.word L(fguv_loop_sx1_csfl0_01) - L(fguv_loop_sx1_tbl) + CONFIG_THUMB
.word L(fguv_loop_sx1_csfl0_10) - L(fguv_loop_sx1_tbl) + CONFIG_THUMB
.word L(fguv_loop_sx1_csfl0_11) - L(fguv_loop_sx1_tbl) + CONFIG_THUMB
.word L(fguv_loop_sx1_csfl1_00) - L(fguv_loop_sx1_tbl) + CONFIG_THUMB
.word L(fguv_loop_sx1_csfl1_01) - L(fguv_loop_sx1_tbl) + CONFIG_THUMB
.word L(fguv_loop_sx1_csfl1_10) - L(fguv_loop_sx1_tbl) + CONFIG_THUMB
.word L(fguv_loop_sx1_csfl1_11) - L(fguv_loop_sx1_tbl) + CONFIG_THUMB
.macro fguv_loop_sx1 csfl, ox, oy
L(fguv_loop_sx1_csfl\csfl\()_\ox\oy):
.if \oy
mov r12, lr
.endif
1:
.if \ox
vld1.16 {d0}, [r4], r10 // grain_lut old
.endif
.if \ox && \oy
vld1.16 {d2}, [r11], r10 // grain_lut top old
.endif
.if \oy
vld1.16 {q2, q3}, [r8], r10 // grain_lut top
.endif
.if !\ox && !\oy
vld1.16 {q0, q1}, [r6, :128]! // luma
.endif
vld1.16 {q8, q9}, [r5], r10 // grain_lut
.if \oy
vdup.16 d28, d31[2] // overlap y coeff
vdup.16 d29, d31[3] // overlap y coeff
.endif
.if !\ox && !\oy
vld1.16 {q2, q3}, [r6, :128], r7 // luma
.endif
.if \ox
vdup.16 q7, d30[3] // bitdepth_max
vmull.s16 q0, d0, d24
vshr.u16 q7, q7, #1 // grain_max
vmlal.s16 q0, d16, d25
vmvn q6, q7 // grain_min
.endif
.if \oy
.if \ox
vmull.s16 q1, d2, d24
vmlal.s16 q1, d4, d25
vqrshrn.s32 d16, q0, #5
vqrshrn.s32 d4, q1, #5
vmin.s16 d4, d4, d14
vmin.s16 d16, d16, d14
vmax.s16 d4, d4, d12
vmax.s16 d16, d16, d12
.endif
vmull.s16 q0, d4, d28
vmull.s16 q1, d5, d28
vmull.s16 q2, d6, d28
vmull.s16 q3, d7, d28
.if !\ox
vdup.16 q7, d30[3] // bitdepth_max
.endif
vmlal.s16 q0, d16, d29
vmlal.s16 q1, d17, d29
vmlal.s16 q2, d18, d29
vmlal.s16 q3, d19, d29
.if !\ox
vshr.u16 q7, q7, #1 // grain_max
.endif
vqrshrn.s32 d16, q0, #5
vqrshrn.s32 d17, q1, #5
vqrshrn.s32 d18, q2, #5
vqrshrn.s32 d19, q3, #5
.if !\ox
vmvn q6, q7 // grain_min
.endif
vld1.16 {q0, q1}, [r6, :128]! // luma
vmin.s16 q8, q8, q7
vmin.s16 q9, q9, q7
vmax.s16 q8, q8, q6
vmax.s16 q9, q9, q6
vld1.16 {q2, q3}, [r6, :128], r7 // luma
.elseif \ox
vqrshrn.s32 d16, q0, #5
vld1.16 {q0, q1}, [r6, :128]! // luma
vmin.s16 d16, d16, d14
vld1.16 {q2, q3}, [r6, :128], r7 // luma
vmax.s16 d16, d16, d12
.endif
vpadd.i16 d0, d0, d1
vpadd.i16 d1, d2, d3
vpadd.i16 d2, d4, d5
vpadd.i16 d3, d6, d7
vrshr.u16 q0, q0, #1
vrshr.u16 q1, q1, #1
.if !\csfl
vdup.16 d28, d30[0] // uv_luma_mult
vld1.16 {q2, q3}, [r1, :128], r2 // src
vdup.16 d29, d30[1] // uv_mult
vmull.s16 q6, d0, d28
vmull.s16 q7, d1, d28
vmull.s16 q0, d2, d28
vmull.s16 q1, d3, d28
vmlal.s16 q6, d4, d29
vmlal.s16 q7, d5, d29
vmlal.s16 q0, d6, d29
vmlal.s16 q1, d7, d29
vshrn.s32 d12, q6, #6
vshrn.s32 d13, q7, #6
vshrn.s32 d14, q0, #6
vshrn.s32 d15, q1, #6
vdup.16 q14, d30[2] // uv_offset
vdup.16 q4, d30[3] // bitdepth_max
vmov.i16 q5, #0
vadd.i16 q6, q6, q14
vadd.i16 q7, q7, q14
vmin.s16 q0, q6, q4
vmin.s16 q1, q7, q4
vmax.s16 q0, q0, q5
vmax.s16 q1, q1, q5
.else
vdup.16 q14, d30[3] // bitdepth_max
vld1.16 {q2, q3}, [r1, :128], r2 // src
// Make sure that uninitialized pixels out of range past the right
// edge are in range; their actual values shouldn't matter.
vand q0, q0, q14
vand q1, q1, q14
.endif
bl gather16_neon
vmovl.u8 q6, d8 // scaling
vmovl.u8 q7, d9
vshl.u16 q6, q6, q13 // scaling << (15 - scaling_shift)
vshl.u16 q7, q7, q13
vqrdmulh.s16 q8, q8, q6 // round2((scaling << (15 - scaling_shift) * grain, 15)
vqrdmulh.s16 q9, q9, q7
vdup.16 q4, d31[0] // clip_min
vdup.16 q5, d31[1] // clip_max
vqadd.s16 q0, q2, q8 // *src + noise
vqadd.s16 q1, q3, q9
.if \oy
// Swap the two last coefficients of d31, place them first in d28
vrev64.16 d28, d31
.endif
vmax.s16 q0, q0, q4
vmax.s16 q1, q1, q4
vmin.s16 q0, q0, q5
vmin.s16 q1, q1, q5
subs r9, r9, #1
.if \oy
// Take the first two 16 bit coefficients of d28 and place them at the
// end of d31
vtrn.32 d31, d28
.endif
vst1.16 {q0, q1}, [r0, :128], r2 // dst
bgt 1b
.if \oy
cmp r12, #0
mov r9, r12 // restore actual remaining h
bgt L(fguv_loop_sx1_csfl\csfl\()_\ox\()0)
.endif
b 9f
.endm
fguv_loop_sx1 0, 0, 0
fguv_loop_sx1 0, 0, 1
fguv_loop_sx1 0, 1, 0
fguv_loop_sx1 0, 1, 1
fguv_loop_sx1 1, 0, 0
fguv_loop_sx1 1, 0, 1
fguv_loop_sx1 1, 1, 0
fguv_loop_sx1 1, 1, 1
9:
vpop {q4-q7}
pop {r4-r11,pc}
endfunc
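
One non-obvious trick in this 16 bpc version: the scaling step uses vqrdmulh.s16 on scaling << (15 - scaling_shift) instead of a widening multiply followed by a rounding shift (the "round2((scaling << (15 - scaling_shift) * grain, 15)" comments refer to this). A small C sketch of the identity it relies on — the helper names below are assumptions, not dav1d's:

#include <stdint.h>

/* round2(x, s) = (x + 2^(s-1)) >> s, the rounding shift used throughout. */
static inline int32_t round2(int32_t x, int shift)
{
    return (x + (1 << (shift - 1))) >> shift;
}

/* VQRDMULH.S16 returns a (saturating) round2(a * b, 15) per lane, so
 * pre-shifting the 8-bit scaling value by (15 - scaling_shift) gives
 *   round2(grain * scaling, scaling_shift)
 *     == round2(grain * (scaling << (15 - scaling_shift)), 15)
 * and the result comes back as 16 bits with no separate narrowing step. */
static inline int16_t scaled_grain(int16_t grain, uint8_t scaling, int scaling_shift)
{
    const int16_t s = (int16_t)(scaling << (15 - scaling_shift));
    return (int16_t)round2(grain * s, 15);
}

The same reformulation appears in the aarch64 16 bpc diff further down, where the smull/srshl/sqxtn sequence is replaced by ushl + sqrdmulh.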


@@ -232,12 +232,14 @@ function sum_lag1_above_neon
smull2 v5.8h, v0.16b, v27.16b
smull v6.8h, v1.8b, v29.8b
smull2 v7.8h, v1.16b, v29.16b
add v2.8h, v2.8h, v4.8h
add v3.8h, v3.8h, v5.8h
saddl v4.4s, v2.4h, v6.4h
saddl2 v5.4s, v2.8h, v6.8h
saddl v6.4s, v3.4h, v7.4h
saddl2 v7.4s, v3.8h, v7.8h
saddl v0.4s, v2.4h, v4.4h
saddl2 v1.4s, v2.8h, v4.8h
saddl v2.4s, v3.4h, v5.4h
saddl2 v3.4s, v3.8h, v5.8h
saddw v4.4s, v0.4s, v6.4h
saddw2 v5.4s, v1.4s, v6.8h
saddw v6.4s, v2.4s, v7.4h
saddw2 v7.4s, v3.4s, v7.8h
ret
endfunc
@@ -450,14 +452,18 @@ function sum_lag2_above_neon
smull2 v7.8h, v0.16b, v28.16b
smull v0.8h, v1.8b, v29.8b
smull2 v1.8h, v1.16b, v29.16b
add v2.8h, v2.8h, v4.8h
add v3.8h, v3.8h, v5.8h
add v0.8h, v0.8h, v6.8h
add v1.8h, v1.8h, v7.8h
saddl v4.4s, v2.4h, v0.4h
saddl2 v5.4s, v2.8h, v0.8h
saddl v6.4s, v3.4h, v1.4h
saddl2 v7.4s, v3.8h, v1.8h
saddl v22.4s, v2.4h, v4.4h
saddl2 v23.4s, v2.8h, v4.8h
saddl v26.4s, v3.4h, v5.4h
saddl2 v27.4s, v3.8h, v5.8h
saddl v2.4s, v0.4h, v6.4h
saddl2 v3.4s, v0.8h, v6.8h
saddl v6.4s, v1.4h, v7.4h
saddl2 v7.4s, v1.8h, v7.8h
add v4.4s, v22.4s, v2.4s
add v5.4s, v23.4s, v3.4s
add v6.4s, v26.4s, v6.4s
add v7.4s, v27.4s, v7.4s
ext v22.16b, v19.16b, v20.16b, #14 // top left, top mid
dup v26.16b, v30.b[5]
@@ -476,14 +482,18 @@ function sum_lag2_above_neon
smull2 v27.8h, v0.16b, v28.16b
smull v28.8h, v1.8b, v29.8b
smull2 v29.8h, v1.16b, v29.16b
add v2.8h, v2.8h, v22.8h
add v3.8h, v3.8h, v23.8h
add v26.8h, v26.8h, v28.8h
add v27.8h, v27.8h, v29.8h
saddl v0.4s, v2.4h, v26.4h
saddl2 v1.4s, v2.8h, v26.8h
saddl v2.4s, v3.4h, v27.4h
saddl2 v3.4s, v3.8h, v27.8h
saddl v0.4s, v2.4h, v22.4h
saddl2 v1.4s, v2.8h, v22.8h
saddl v2.4s, v3.4h, v23.4h
saddl2 v3.4s, v3.8h, v23.8h
saddl v22.4s, v26.4h, v28.4h
saddl2 v23.4s, v26.8h, v28.8h
saddl v26.4s, v27.4h, v29.4h
saddl2 v27.4s, v27.8h, v29.8h
add v0.4s, v0.4s, v22.4s
add v1.4s, v1.4s, v23.4s
add v2.4s, v2.4s, v26.4s
add v3.4s, v3.4s, v27.4s
dup v26.16b, v30.b[2]
dup v27.16b, v30.b[7]
smull v22.8h, v17.8b, v26.8b
@@ -498,14 +508,16 @@ function sum_lag2_above_neon
mov v16.16b, v17.16b
mov v17.16b, v18.16b
add v22.8h, v22.8h, v24.8h
add v23.8h, v23.8h, v25.8h
saddl v0.4s, v22.4h, v24.4h
saddl2 v1.4s, v22.8h, v24.8h
saddl v2.4s, v23.4h, v25.4h
saddl2 v3.4s, v23.8h, v25.8h
mov v19.16b, v20.16b
mov v20.16b, v21.16b
saddw v4.4s, v4.4s, v22.4h
saddw2 v5.4s, v5.4s, v22.8h
saddw v6.4s, v6.4s, v23.4h
saddw2 v7.4s, v7.4s, v23.8h
add v4.4s, v4.4s, v0.4s
add v5.4s, v5.4s, v1.4s
add v6.4s, v6.4s, v2.4s
add v7.4s, v7.4s, v3.4s
ret
endfunc
@@ -711,32 +723,38 @@ function sum_lag3_above_neon
smull2 v3.8h, v9.16b, v23.16b
smull v8.8h, v10.8b, v24.8b
smull2 v9.8h, v10.16b, v24.16b
add v0.8h, v0.8h, v2.8h
add v1.8h, v1.8h, v3.8h
smull v10.8h, v11.8b, v26.8b
smull2 v11.8h, v11.16b, v26.16b
smull v2.8h, v12.8b, v27.8b
smull2 v3.8h, v12.16b, v27.16b
add v8.8h, v8.8h, v10.8h
add v9.8h, v9.8h, v11.8h
saddl v22.4s, v0.4h, v2.4h
saddl2 v23.4s, v0.8h, v2.8h
saddl v24.4s, v1.4h, v3.4h
saddl2 v26.4s, v1.8h, v3.8h
saddl v0.4s, v8.4h, v10.4h
saddl2 v1.4s, v8.8h, v10.8h
saddl v2.4s, v9.4h, v11.4h
saddl2 v3.4s, v9.8h, v11.8h
smull v8.8h, v12.8b, v27.8b
smull2 v9.8h, v12.16b, v27.16b
smull v10.8h, v13.8b, v28.8b
smull2 v11.8h, v13.16b, v28.16b
saddl v4.4s, v0.4h, v8.4h
saddl2 v5.4s, v0.8h, v8.8h
saddl v6.4s, v1.4h, v9.4h
saddl2 v7.4s, v1.8h, v9.8h
smull v8.8h, v14.8b, v25.8b
smull2 v9.8h, v14.16b, v25.16b
add v2.8h, v2.8h, v10.8h
add v3.8h, v3.8h, v11.8h
saddl v0.4s, v2.4h, v8.4h
saddl2 v1.4s, v2.8h, v8.8h
saddl v2.4s, v3.4h, v9.4h
saddl2 v3.4s, v3.8h, v9.8h
smull v12.8h, v14.8b, v25.8b
smull2 v13.8h, v14.16b, v25.16b
add v4.4s, v22.4s, v0.4s
add v5.4s, v23.4s, v1.4s
add v6.4s, v24.4s, v2.4s
add v7.4s, v26.4s, v3.4s
saddl v0.4s, v8.4h, v10.4h
saddl2 v1.4s, v8.8h, v10.8h
saddl v2.4s, v9.4h, v11.4h
saddl2 v3.4s, v9.8h, v11.8h
add v4.4s, v4.4s, v0.4s
add v5.4s, v5.4s, v1.4s
add v6.4s, v6.4s, v2.4s
add v7.4s, v7.4s, v3.4s
saddw v4.4s, v4.4s, v12.4h
saddw2 v5.4s, v5.4s, v12.8h
saddw v6.4s, v6.4s, v13.4h
saddw2 v7.4s, v7.4s, v13.8h
ext v8.16b, v16.16b, v17.16b, #13 // top left, top mid
dup v22.16b, v29.b[7]
@@ -758,36 +776,42 @@ function sum_lag3_above_neon
smull2 v3.8h, v9.16b, v23.16b
smull v8.8h, v10.8b, v24.8b
smull2 v9.8h, v10.16b, v24.16b
add v0.8h, v0.8h, v2.8h
add v1.8h, v1.8h, v3.8h
smull v10.8h, v11.8b, v26.8b
smull2 v11.8h, v11.16b, v26.16b
smull v2.8h, v12.8b, v27.8b
smull2 v3.8h, v12.16b, v27.16b
add v8.8h, v8.8h, v10.8h
add v9.8h, v9.8h, v11.8h
saddl v22.4s, v0.4h, v2.4h
saddl2 v23.4s, v0.8h, v2.8h
saddl v24.4s, v1.4h, v3.4h
saddl2 v26.4s, v1.8h, v3.8h
saddl v0.4s, v8.4h, v10.4h
saddl2 v1.4s, v8.8h, v10.8h
saddl v2.4s, v9.4h, v11.4h
saddl2 v3.4s, v9.8h, v11.8h
smull v8.8h, v12.8b, v27.8b
smull2 v9.8h, v12.16b, v27.16b
smull v10.8h, v13.8b, v28.8b
smull2 v11.8h, v13.16b, v28.16b
saddl v12.4s, v0.4h, v8.4h
saddl2 v13.4s, v0.8h, v8.8h
saddl v0.4s, v1.4h, v9.4h
saddl2 v1.4s, v1.8h, v9.8h
smull v8.8h, v17.8b, v25.8b
smull2 v9.8h, v17.16b, v25.16b
add v2.8h, v2.8h, v10.8h
add v3.8h, v3.8h, v11.8h
add v4.4s, v4.4s, v12.4s
add v5.4s, v5.4s, v13.4s
add v6.4s, v6.4s, v0.4s
add v7.4s, v7.4s, v1.4s
saddl v0.4s, v2.4h, v8.4h
saddl2 v1.4s, v2.8h, v8.8h
saddl v2.4s, v3.4h, v9.4h
saddl2 v3.4s, v3.8h, v9.8h
smull v12.8h, v17.8b, v25.8b
smull2 v13.8h, v17.16b, v25.16b
add v22.4s, v22.4s, v0.4s
add v23.4s, v23.4s, v1.4s
add v24.4s, v24.4s, v2.4s
add v26.4s, v26.4s, v3.4s
saddl v0.4s, v8.4h, v10.4h
saddl2 v1.4s, v8.8h, v10.8h
saddl v2.4s, v9.4h, v11.4h
saddl2 v3.4s, v9.8h, v11.8h
add v4.4s, v4.4s, v22.4s
add v5.4s, v5.4s, v23.4s
add v6.4s, v6.4s, v24.4s
add v7.4s, v7.4s, v26.4s
add v4.4s, v4.4s, v0.4s
add v5.4s, v5.4s, v1.4s
add v6.4s, v6.4s, v2.4s
add v7.4s, v7.4s, v3.4s
saddw v4.4s, v4.4s, v12.4h
saddw2 v5.4s, v5.4s, v12.8h
saddw v6.4s, v6.4s, v13.4h
saddw2 v7.4s, v7.4s, v13.8h
ext v8.16b, v19.16b, v20.16b, #13 // top left, top mid
dup v22.16b, v29.b[14]
@@ -809,42 +833,46 @@ function sum_lag3_above_neon
smull2 v3.8h, v9.16b, v23.16b
smull v8.8h, v10.8b, v24.8b
smull2 v9.8h, v10.16b, v24.16b
add v0.8h, v0.8h, v2.8h
add v1.8h, v1.8h, v3.8h
smull v10.8h, v11.8b, v26.8b
smull2 v11.8h, v11.16b, v26.16b
smull v2.8h, v12.8b, v27.8b
smull2 v3.8h, v12.16b, v27.16b
add v8.8h, v8.8h, v10.8h
add v9.8h, v9.8h, v11.8h
saddl v22.4s, v0.4h, v2.4h
saddl2 v23.4s, v0.8h, v2.8h
saddl v24.4s, v1.4h, v3.4h
saddl2 v26.4s, v1.8h, v3.8h
saddl v0.4s, v8.4h, v10.4h
saddl2 v1.4s, v8.8h, v10.8h
saddl v2.4s, v9.4h, v11.4h
saddl2 v3.4s, v9.8h, v11.8h
smull v8.8h, v12.8b, v27.8b
smull2 v9.8h, v12.16b, v27.16b
smull v10.8h, v13.8b, v28.8b
smull2 v11.8h, v13.16b, v28.16b
saddl v12.4s, v0.4h, v8.4h
saddl2 v13.4s, v0.8h, v8.8h
saddl v0.4s, v1.4h, v9.4h
saddl2 v1.4s, v1.8h, v9.8h
smull v8.8h, v20.8b, v25.8b
smull2 v9.8h, v20.16b, v25.16b
add v2.8h, v2.8h, v10.8h
add v3.8h, v3.8h, v11.8h
add v4.4s, v4.4s, v12.4s
add v5.4s, v5.4s, v13.4s
add v6.4s, v6.4s, v0.4s
add v7.4s, v7.4s, v1.4s
saddl v0.4s, v2.4h, v8.4h
saddl2 v1.4s, v2.8h, v8.8h
saddl v2.4s, v3.4h, v9.4h
saddl2 v3.4s, v3.8h, v9.8h
smull v12.8h, v20.8b, v25.8b
smull2 v19.8h, v20.16b, v25.16b
add v22.4s, v22.4s, v0.4s
add v23.4s, v23.4s, v1.4s
add v24.4s, v24.4s, v2.4s
add v26.4s, v26.4s, v3.4s
saddl v0.4s, v8.4h, v10.4h
saddl2 v1.4s, v8.8h, v10.8h
saddl v2.4s, v9.4h, v11.4h
saddl2 v3.4s, v9.8h, v11.8h
add v4.4s, v4.4s, v22.4s
add v5.4s, v5.4s, v23.4s
add v6.4s, v6.4s, v24.4s
add v7.4s, v7.4s, v26.4s
mov v13.16b, v14.16b
mov v14.16b, v15.16b
add v4.4s, v4.4s, v0.4s
add v5.4s, v5.4s, v1.4s
add v6.4s, v6.4s, v2.4s
add v7.4s, v7.4s, v3.4s
mov v13.16b, v14.16b
mov v14.16b, v15.16b
mov v16.16b, v17.16b
mov v17.16b, v18.16b
saddw v4.4s, v4.4s, v12.4h
saddw2 v5.4s, v5.4s, v12.8h
saddw v6.4s, v6.4s, v19.4h
saddw2 v7.4s, v7.4s, v19.8h
mov v19.16b, v20.16b
mov v20.16b, v21.16b
@@ -1483,43 +1511,50 @@ gen_grain_44 uv_422
.macro gather_interleaved dst1, dst2, src1, src2, off
umov w14, \src1[0+\off]
umov w15, \src2[1+\off]
umov w15, \src2[8+\off]
umov w16, \src1[2+\off]
add x14, x14, x3
umov w17, \src2[3+\off]
umov w17, \src2[10+\off]
add x15, x15, x3
ld1 {\dst1}[0+\off], [x14]
ld1 {\dst1}[0+\off], [x14]
umov w14, \src1[4+\off]
add x16, x16, x3
ld1 {\dst2}[1+\off], [x15]
umov w15, \src2[5+\off]
ld1 {\dst2}[8+\off], [x15]
umov w15, \src2[12+\off]
add x17, x17, x3
ld1 {\dst1}[2+\off], [x16]
ld1 {\dst1}[2+\off], [x16]
umov w16, \src1[6+\off]
add x14, x14, x3
ld1 {\dst2}[3+\off], [x17]
umov w17, \src2[7+\off]
ld1 {\dst2}[10+\off], [x17]
umov w17, \src2[14+\off]
add x15, x15, x3
ld1 {\dst1}[4+\off], [x14]
ld1 {\dst1}[4+\off], [x14]
add x16, x16, x3
ld1 {\dst2}[5+\off], [x15]
ld1 {\dst2}[12+\off], [x15]
add x17, x17, x3
ld1 {\dst1}[6+\off], [x16]
ld1 {\dst2}[7+\off], [x17]
ld1 {\dst1}[6+\off], [x16]
ld1 {\dst2}[14+\off], [x17]
.endm
.macro gather dst1, dst2, src1, src2
gather_interleaved \dst1, \dst2, \src1, \src2, 0
gather_interleaved \dst2, \dst1, \src2, \src1, 0
gather_interleaved \dst1, \dst2, \src1, \src2, 8
gather_interleaved \dst2, \dst1, \src2, \src1, 8
gather_interleaved \dst1, \dst2, \src1, \src2, 1
gather_interleaved \dst2, \dst1, \src2, \src1, 1
.endm
function gather_neon
function gather32_neon
gather v4.b, v5.b, v0.b, v1.b
ret
endfunc
function gather16_neon
gather_interleaved v4.b, v5.b, v0.b, v0.b, 0
gather_interleaved v4.b, v5.b, v0.b, v0.b, 1
ins v4.d[1], v5.d[1]
ret
endfunc
const overlap_coeffs_0, align=4
.byte 27, 17, 0, 0, 0, 0, 0, 0
.byte 17, 27, 32, 32, 32, 32, 32, 32
@@ -1564,7 +1599,7 @@ function fgy_32x32_8bpc_neon, export=1
mov x9, #GRAIN_WIDTH // grain_lut stride
neg w4, w4
dup v29.8h, w4 // -scaling_shift
dup v29.8h, w4 // -scaling_shift
movrel x16, overlap_coeffs_0
@@ -1635,7 +1670,7 @@ L(loop_\ox\oy):
.endif
ld1 {v18.16b, v19.16b}, [x5], x9 // grain_lut
bl gather_neon
bl gather32_neon
.if \ox
smull v20.8h, v20.8b, v27.8b
@@ -1765,7 +1800,7 @@ endfunc
.macro fguv layout, sx, sy
function fguv_32x32_\layout\()_8bpc_neon, export=1
str x30, [sp, #-32]!
stp d8, d9, [sp, #16]
str d8, [sp, #16]
ldp x8, x9, [sp, #32] // offsets, h
ldp x10, x11, [sp, #48] // uv, is_id
@@ -1778,11 +1813,11 @@ function fguv_32x32_\layout\()_8bpc_neon, export=1
add x14, x10, #FGD_UV_LUMA_MULT
add x15, x10, #FGD_UV_MULT
add x10, x10, #FGD_UV_OFFSET
ld1 {v8.h}[0], [x14] // uv_luma_mult
ld1r {v24.8h}, [x10] // uv_offset
ld1 {v8.h}[1], [x15] // uv_mult
ld1 {v8.h}[0], [x14] // uv_luma_mult
ld1r {v24.8h}, [x10] // uv_offset
ld1 {v8.h}[1], [x15] // uv_mult
dup v29.8h, w13 // -scaling_shift
dup v29.8h, w13 // -scaling_shift
cbz w12, 1f
// clip
@@ -1918,7 +1953,7 @@ L(fguv_loop_sx0_csfl\csfl\()_\ox\oy):
sqxtun2 v1.16b, v5.8h
.endif
bl gather_neon
bl gather32_neon
.if \ox
smull v20.8h, v20.8b, v27.8b
@@ -2029,7 +2064,7 @@ L(fguv_loop_sx0_csfl\csfl\()_\ox\oy):
fguv_loop_sx0 1, 1, 1
9:
ldp d8, d9, [sp, #16]
ldr d8, [sp, #16]
ldr x30, [sp], #32
ret
@@ -2085,7 +2120,7 @@ L(fguv_loop_sx1_csfl\csfl\()_\ox\oy):
sqxtun2 v0.16b, v3.8h
.endif
bl gather_neon
bl gather16_neon
.if \ox
smull v20.8h, v20.8b, v27.8b
@@ -2176,7 +2211,7 @@ L(fguv_loop_sx1_csfl\csfl\()_\ox\oy):
fguv_loop_sx1 1, 1, 1
9:
ldp d8, d9, [sp, #16]
ldr d8, [sp, #16]
ldr x30, [sp], #32
ret
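
The gather helpers renamed here (gather_neon becoming gather32_neon, plus a new gather16_neon) implement the per-pixel lookup into the scaling table held in r3/x3; NEON has no byte-gather instruction, hence the lane-by-lane loads. In scalar terms the operation is just the following (a sketch, not dav1d's actual reference code; in the 16 bpc files the source pixels are 16-bit indices into the same 8-bit table):

#include <stdint.h>

/* n is 16 or 32 depending on how wide a row the caller processes. */
static void gather_scaling(uint8_t dst[], const uint8_t src[],
                           const uint8_t scaling[], int n)
{
    for (int i = 0; i < n; i++)
        dst[i] = scaling[src[i]];   /* per-pixel scaling factor */
}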


@@ -27,6 +27,7 @@
#include "src/arm/asm.S"
#include "util.S"
#include "src/arm/asm-offsets.h"
#define GRAIN_WIDTH 82
@@ -64,11 +65,18 @@
gather_interleaved \dst2, \dst1, \src4, \src2, 8
.endm
function gather_neon
function gather32_neon
gather v6.b, v7.b, v0.h, v1.h, v2.h, v3.h
ret
endfunc
function gather16_neon
gather_interleaved v6.b, v7.b, v0.h, v1.h, 0
gather_interleaved v7.b, v6.b, v1.h, v0.h, 0
ins v6.d[1], v7.d[0]
ret
endfunc
const overlap_coeffs_0, align=4
.short 27, 17, 0, 0
.short 17, 27, 32, 32
@@ -110,6 +118,7 @@ function fgy_32x32_16bpc_neon, export=1
stp d10, d11, [sp, #32]
stp d12, d13, [sp, #48]
str d14, [sp, #64]
eor w4, w4, #15 // 15 - scaling_shift
ldr w11, [x6, #8] // offsets[1][0]
ldr w13, [x6, #4] // offsets[0][1]
ldr w15, [x6, #12] // offsets[1][1]
@@ -122,8 +131,7 @@ function fgy_32x32_16bpc_neon, export=1
mov x9, #GRAIN_WIDTH*2 // grain_lut stride
neg w10, w10 // bitdepth_min_8
neg w4, w4
dup v29.4s, w4 // -scaling_shift
dup v29.8h, w4 // 15 - scaling_shift
dup v27.8h, w10 // bitdepth_min_8
movrel x16, overlap_coeffs_0
@@ -207,7 +215,7 @@ L(loop_\ox\oy):
and v1.16b, v1.16b, v4.16b
and v2.16b, v2.16b, v4.16b
and v3.16b, v3.16b, v4.16b
bl gather_neon
bl gather32_neon
.if \ox
smull v20.4s, v20.4h, v27.4h
@@ -268,7 +276,7 @@ L(loop_\ox\oy):
smax v19.8h, v19.8h, v25.8h
.endif
uxtl v4.8h, v6.8b // scaling
uxtl v4.8h, v6.8b // scaling
.if \ox && !\oy
sqrshrn v20.4h, v20.4s, #5
.endif
@@ -281,37 +289,18 @@ L(loop_\ox\oy):
smax v20.4h, v20.4h, v25.4h
.endif
uxtl2 v7.8h, v7.16b
.if \ox && !\oy
smull v20.4s, v20.4h, v4.4h // scaling * grain
.else
smull v20.4s, v16.4h, v4.4h
ins v16.d[0], v20.d[0]
.endif
smull2 v21.4s, v16.8h, v4.8h
smull v22.4s, v17.4h, v5.4h
smull2 v23.4s, v17.8h, v5.8h
smull v16.4s, v18.4h, v6.4h
smull2 v17.4s, v18.8h, v6.8h
smull v18.4s, v19.4h, v7.4h
smull2 v19.4s, v19.8h, v7.8h
ushl v4.8h, v4.8h, v29.8h // scaling << (15 - scaling_shift)
ushl v5.8h, v5.8h, v29.8h
ushl v6.8h, v6.8h, v29.8h
ushl v7.8h, v7.8h, v29.8h
srshl v20.4s, v20.4s, v29.4s // round2(scaling * grain, scaling_shift)
srshl v21.4s, v21.4s, v29.4s
srshl v22.4s, v22.4s, v29.4s
srshl v23.4s, v23.4s, v29.4s
srshl v16.4s, v16.4s, v29.4s
srshl v17.4s, v17.4s, v29.4s
srshl v18.4s, v18.4s, v29.4s
srshl v19.4s, v19.4s, v29.4s
sqxtn v20.4h, v20.4s
sqxtn2 v20.8h, v21.4s
sqxtn v21.4h, v22.4s
sqxtn2 v21.8h, v23.4s
sqxtn v22.4h, v16.4s
sqxtn2 v22.8h, v17.4s
sqxtn v23.4h, v18.4s
sqxtn2 v23.8h, v19.4s
sqrdmulh v20.8h, v16.8h, v4.8h // round2((scaling << (15 - scaling_shift) * grain, 15)
sqrdmulh v21.8h, v17.8h, v5.8h
sqrdmulh v22.8h, v18.8h, v6.8h
sqrdmulh v23.8h, v19.8h, v7.8h
usqadd v0.8h, v20.8h // *src + noise
usqadd v1.8h, v21.8h
@@ -359,3 +348,506 @@ L(fgy_loop_tbl):
.hword L(fgy_loop_tbl) - L(loop_10)
.hword L(fgy_loop_tbl) - L(loop_11)
endfunc
// void dav1d_fguv_32x32_420_16bpc_neon(pixel *const dst,
// const pixel *const src,
// const ptrdiff_t stride,
// const uint8_t scaling[SCALING_SIZE],
// const Dav1dFilmGrainData *const data,
// const entry grain_lut[][GRAIN_WIDTH],
// const pixel *const luma_row,
// const ptrdiff_t luma_stride,
// const int offsets[][2],
// const ptrdiff_t h, const ptrdiff_t uv,
// const ptrdiff_t is_id,
// const ptrdiff_t type,
// const int bitdepth_max);
.macro fguv layout, sx, sy
function fguv_32x32_\layout\()_16bpc_neon, export=1
str x30, [sp, #-80]!
stp d8, d9, [sp, #16]
stp d10, d11, [sp, #32]
stp d12, d13, [sp, #48]
stp d14, d15, [sp, #64]
ldp x8, x9, [sp, #80] // offsets, h
ldp x10, x11, [sp, #96] // uv, is_id
ldr w16, [sp, #120] // bitdepth_max
ldr w13, [x4, #FGD_SCALING_SHIFT]
ldr w12, [x4, #FGD_CLIP_TO_RESTRICTED_RANGE]
dup v23.8h, w16 // bitdepth_max
clz w16, w16
eor w13, w13, #15 // 15 - scaling_shift
sub w16, w16, #24 // -bitdepth_min_8
// !csfl
add x10, x4, x10, lsl #2 // + 4*uv
add x14, x10, #FGD_UV_LUMA_MULT
add x15, x10, #FGD_UV_MULT
add x10, x10, #FGD_UV_OFFSET
neg w16, w16 // bitdepth_min_8
ld1r {v8.8h}, [x14] // uv_luma_mult
ld1r {v24.8h}, [x10] // uv_offset
ld1r {v9.8h}, [x15] // uv_mult
dup v29.8h, w13 // 15 - scaling_shift
dup v27.8h, w16 // bitdepth_min_8
cbz w12, 1f
// clip
movi v30.8h, #16
movi v31.8h, #240
sshl v30.8h, v30.8h, v27.8h
sshl v31.8h, v31.8h, v27.8h
cbz w11, 2f
// is_id
movi v31.8h, #235
sshl v31.8h, v31.8h, v27.8h
b 2f
1:
// no clip
movi v30.8h, #0
mov v31.16b, v23.16b // bitdepth_max
2:
ushr v15.8h, v23.8h, #1 // grain_max
sshl v24.8h, v24.8h, v27.8h // uv_offset << bitdepth_min_8
not v14.16b, v15.16b // grain_min
ldr w12, [x8, #8] // offsets[1][0]
ldr w14, [x8, #4] // offsets[0][1]
ldr w16, [x8, #12] // offsets[1][1]
ldr w8, [x8] // offsets[0][0]
mov x10, #GRAIN_WIDTH*2 // grain_lut stride
add x5, x5, #(2*(3 + (2 >> \sx)*3)) // grain_lut += 9 or 6
.if \sy
add x5, x5, x10, lsl #2 // grain_lut += 4 * grain_stride
add x5, x5, x10, lsl #1 // grain_lut += 2 * grain_stride
.else
add x5, x5, x10, lsl #3 // grain_lut += 8 * grain_stride
add x5, x5, x10 // grain_lut += grain_stride
.endif
calc_offset w12, w13, w12, \sx, \sy
calc_offset w14, w15, w14, \sx, \sy
calc_offset w16, w17, w16, \sx, \sy
calc_offset w8, w11, w8, \sx, \sy
add_offset x13, w12, x13, x5, x10
add_offset x15, w14, x15, x5, x10
add_offset x17, w16, x17, x5, x10
add_offset x5, w8, x11, x5, x10
add x4, x13, #2*(32 >> \sx) // grain_lut += BLOCK_SIZE * bx
add x8, x15, x10, lsl #(5 - \sy) // grain_lut += grain_stride * BLOCK_SIZE * by
add x11, x17, x10, lsl #(5 - \sy) // grain_lut += grain_stride * BLOCK_SIZE * by
add x11, x11, #2*(32 >> \sx) // grain_lut += BLOCK_SIZE * bx
ldr w13, [sp, #112] // type
movrel x16, overlap_coeffs_\sx
adr x14, L(fguv_loop_sx\sx\()_tbl)
ld1 {v27.4h, v28.4h}, [x16] // overlap_coeffs
tst w13, #1
ldrh w13, [x14, w13, uxtw #1]
b.eq 1f
// y overlap
sub w12, w9, #(2 >> \sy) // backup remaining h
mov w9, #(2 >> \sy)
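        // Only the first (2 >> sy) rows are processed with vertical overlap;
        // the loop afterwards re-enters the oy=0 variant for the remaining rows.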
1:
sub x13, x14, w13, uxtw
.if \sy
movi v25.8h, #23
movi v26.8h, #22
.else
movi v25.8h, #27
movi v26.8h, #17
.endif
.if \sy
add x7, x7, x7 // luma_stride *= 2
.endif
br x13
endfunc
.endm
fguv 420, 1, 1
fguv 422, 1, 0
fguv 444, 0, 0
function fguv_loop_sx0_neon
.macro fguv_loop_sx0 csfl, ox, oy
L(fguv_loop_sx0_csfl\csfl\()_\ox\oy):
1:
.if \ox
ld1 {v4.4h}, [x4], x10 // grain_lut old
.endif
.if \oy
ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x8], x10 // grain_lut top
.endif
.if \ox && \oy
ld1 {v5.4h}, [x11], x10 // grain_lut top old
.endif
ld1 {v16.8h, v17.8h, v18.8h, v19.8h}, [x5], x10 // grain_lut
.if \ox
smull v4.4s, v4.4h, v27.4h
smlal v4.4s, v16.4h, v28.4h
.endif
.if \oy
.if \ox
smull v5.4s, v5.4h, v27.4h
smlal v5.4s, v0.4h, v28.4h
sqrshrn v4.4h, v4.4s, #5
sqrshrn v5.4h, v5.4s, #5
smin v4.4h, v4.4h, v15.4h
smin v5.4h, v5.4h, v15.4h
smax v4.4h, v4.4h, v14.4h
smax v5.4h, v5.4h, v14.4h
ins v16.d[0], v4.d[0]
ins v0.d[0], v5.d[0]
.endif
smull v6.4s, v16.4h, v26.4h
smull2 v7.4s, v16.8h, v26.8h
smull v10.4s, v17.4h, v26.4h
smull2 v11.4s, v17.8h, v26.8h
smull v16.4s, v18.4h, v26.4h
smull2 v17.4s, v18.8h, v26.8h
smull v18.4s, v19.4h, v26.4h
smull2 v19.4s, v19.8h, v26.8h
smlal v6.4s, v0.4h, v25.4h
smlal2 v7.4s, v0.8h, v25.8h
smlal v10.4s, v1.4h, v25.4h
smlal2 v11.4s, v1.8h, v25.8h
smlal v16.4s, v2.4h, v25.4h
smlal2 v17.4s, v2.8h, v25.8h
smlal v18.4s, v3.4h, v25.4h
smlal2 v19.4s, v3.8h, v25.8h
sqrshrn v6.4h, v6.4s, #5
sqrshrn2 v6.8h, v7.4s, #5
sqrshrn v7.4h, v10.4s, #5
sqrshrn2 v7.8h, v11.4s, #5
sqrshrn v10.4h, v16.4s, #5
sqrshrn2 v10.8h, v17.4s, #5
sqrshrn v11.4h, v18.4s, #5
sqrshrn2 v11.8h, v19.4s, #5
.endif
.if \ox && !\oy
sqrshrn v4.4h, v4.4s, #5
smin v4.4h, v4.4h, v15.4h
.endif
ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x7 // luma
.if \oy
smin v16.8h, v6.8h, v15.8h
smin v17.8h, v7.8h, v15.8h
smin v18.8h, v10.8h, v15.8h
smin v19.8h, v11.8h, v15.8h
smax v16.8h, v16.8h, v14.8h
smax v17.8h, v17.8h, v14.8h
smax v18.8h, v18.8h, v14.8h
smax v19.8h, v19.8h, v14.8h
.endif
.if \ox && !\oy
smax v4.4h, v4.4h, v14.4h
.endif
ld1 {v10.8h, v11.8h, v12.8h, v13.8h}, [x1], x2 // src
.if \ox && !\oy
ins v16.d[0], v4.d[0]
.endif
.if !\csfl
smull v4.4s, v0.4h, v8.4h
smull2 v5.4s, v0.8h, v8.8h
smull v6.4s, v1.4h, v8.4h
smull2 v7.4s, v1.8h, v8.8h
smull v0.4s, v2.4h, v8.4h
smull2 v1.4s, v2.8h, v8.8h
smull v2.4s, v3.4h, v8.4h
smull2 v3.4s, v3.8h, v8.8h
smlal v4.4s, v10.4h, v9.4h
smlal2 v5.4s, v10.8h, v9.8h
smlal v6.4s, v11.4h, v9.4h
smlal2 v7.4s, v11.8h, v9.8h
smlal v0.4s, v12.4h, v9.4h
smlal2 v1.4s, v12.8h, v9.8h
smlal v2.4s, v13.4h, v9.4h
smlal2 v3.4s, v13.8h, v9.8h
shrn v4.4h, v4.4s, #6
shrn2 v4.8h, v5.4s, #6
shrn v5.4h, v6.4s, #6
shrn2 v5.8h, v7.4s, #6
shrn v6.4h, v0.4s, #6
shrn2 v6.8h, v1.4s, #6
shrn v7.4h, v2.4s, #6
shrn2 v7.8h, v3.4s, #6
add v0.8h, v4.8h, v24.8h
add v1.8h, v5.8h, v24.8h
add v2.8h, v6.8h, v24.8h
add v3.8h, v7.8h, v24.8h
movi v20.8h, #0
smin v0.8h, v0.8h, v23.8h
smin v1.8h, v1.8h, v23.8h
smin v2.8h, v2.8h, v23.8h
smin v3.8h, v3.8h, v23.8h
smax v0.8h, v0.8h, v20.8h
smax v1.8h, v1.8h, v20.8h
smax v2.8h, v2.8h, v20.8h
smax v3.8h, v3.8h, v20.8h
.else
// Make sure that any uninitialized pixels past the right edge are
// brought into range; their actual values shouldn't matter.
and v0.16b, v0.16b, v23.16b
and v1.16b, v1.16b, v23.16b
and v2.16b, v2.16b, v23.16b
and v3.16b, v3.16b, v23.16b
.endif
bl gather32_neon
uxtl v4.8h, v6.8b // scaling
uxtl2 v5.8h, v6.16b
uxtl v6.8h, v7.8b
uxtl2 v7.8h, v7.16b
ushl v4.8h, v4.8h, v29.8h // scaling << (15 - scaling_shift)
ushl v5.8h, v5.8h, v29.8h
ushl v6.8h, v6.8h, v29.8h
ushl v7.8h, v7.8h, v29.8h
sqrdmulh v16.8h, v16.8h, v4.8h // round2((scaling << (15 - scaling_shift)) * grain, 15)
sqrdmulh v17.8h, v17.8h, v5.8h
sqrdmulh v18.8h, v18.8h, v6.8h
sqrdmulh v19.8h, v19.8h, v7.8h
usqadd v10.8h, v16.8h // *src + noise
usqadd v11.8h, v17.8h
usqadd v12.8h, v18.8h
usqadd v13.8h, v19.8h
umax v0.8h, v10.8h, v30.8h
umax v1.8h, v11.8h, v30.8h
umax v2.8h, v12.8h, v30.8h
umax v3.8h, v13.8h, v30.8h
umin v0.8h, v0.8h, v31.8h
umin v1.8h, v1.8h, v31.8h
umin v2.8h, v2.8h, v31.8h
umin v3.8h, v3.8h, v31.8h
subs w9, w9, #1
.if \oy
dup v25.8h, v28.h[0]
dup v26.8h, v28.h[1]
.endif
st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x2 // dst
b.gt 1b
.if \oy
cmp w12, #0
mov w9, w12 // restore actual remaining h
b.gt L(fguv_loop_sx0_csfl\csfl\()_\ox\()0)
.endif
b 9f
.endm
fguv_loop_sx0 0, 0, 0
fguv_loop_sx0 0, 0, 1
fguv_loop_sx0 0, 1, 0
fguv_loop_sx0 0, 1, 1
fguv_loop_sx0 1, 0, 0
fguv_loop_sx0 1, 0, 1
fguv_loop_sx0 1, 1, 0
fguv_loop_sx0 1, 1, 1
9:
ldp d14, d15, [sp, #64]
ldp d12, d13, [sp, #48]
ldp d10, d11, [sp, #32]
ldp d8, d9, [sp, #16]
ldr x30, [sp], #80
ret
L(fguv_loop_sx0_tbl):
.hword L(fguv_loop_sx0_tbl) - L(fguv_loop_sx0_csfl0_00)
.hword L(fguv_loop_sx0_tbl) - L(fguv_loop_sx0_csfl0_01)
.hword L(fguv_loop_sx0_tbl) - L(fguv_loop_sx0_csfl0_10)
.hword L(fguv_loop_sx0_tbl) - L(fguv_loop_sx0_csfl0_11)
.hword L(fguv_loop_sx0_tbl) - L(fguv_loop_sx0_csfl1_00)
.hword L(fguv_loop_sx0_tbl) - L(fguv_loop_sx0_csfl1_01)
.hword L(fguv_loop_sx0_tbl) - L(fguv_loop_sx0_csfl1_10)
.hword L(fguv_loop_sx0_tbl) - L(fguv_loop_sx0_csfl1_11)
endfunc
function fguv_loop_sx1_neon
.macro fguv_loop_sx1 csfl, ox, oy
L(fguv_loop_sx1_csfl\csfl\()_\ox\oy):
1:
.if \ox
ld1 {v18.4h}, [x4], x10 // grain_lut old
.endif
.if \oy
ld1 {v20.8h, v21.8h}, [x8], x10 // grain_lut top
.endif
.if \ox && \oy
ld1 {v19.4h}, [x11], x10 // grain_lut top old
.endif
ld1 {v16.8h, v17.8h}, [x5], x10 // grain_lut
.if \ox
smull v18.4s, v18.4h, v27.4h
smlal v18.4s, v16.4h, v28.4h
.endif
.if \oy
.if \ox
smull v19.4s, v19.4h, v27.4h
smlal v19.4s, v20.4h, v28.4h
sqrshrn v18.4h, v18.4s, #5
sqrshrn v19.4h, v19.4s, #5
smin v18.4h, v18.4h, v15.4h
smin v19.4h, v19.4h, v15.4h
smax v18.4h, v18.4h, v14.4h
smax v19.4h, v19.4h, v14.4h
ins v16.d[0], v18.d[0]
ins v20.d[0], v19.d[0]
.endif
smull v0.4s, v16.4h, v26.4h
smull2 v1.4s, v16.8h, v26.8h
smull v2.4s, v17.4h, v26.4h
smull2 v3.4s, v17.8h, v26.8h
smlal v0.4s, v20.4h, v25.4h
smlal2 v1.4s, v20.8h, v25.8h
smlal v2.4s, v21.4h, v25.4h
smlal2 v3.4s, v21.8h, v25.8h
sqrshrn v16.4h, v0.4s, #5
sqrshrn2 v16.8h, v1.4s, #5
sqrshrn v17.4h, v2.4s, #5
sqrshrn2 v17.8h, v3.4s, #5
.endif
.if \ox && !\oy
sqrshrn v18.4h, v18.4s, #5
smin v18.4h, v18.4h, v15.4h
.endif
ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x7 // luma
.if \oy
smin v16.8h, v16.8h, v15.8h
smin v17.8h, v17.8h, v15.8h
smax v16.8h, v16.8h, v14.8h
smax v17.8h, v17.8h, v14.8h
.endif
.if \ox && !\oy
smax v18.4h, v18.4h, v14.4h
.endif
ld1 {v10.8h, v11.8h}, [x1], x2 // src
.if \ox && !\oy
ins v16.d[0], v18.d[0]
.endif
addp v0.8h, v0.8h, v1.8h
addp v1.8h, v2.8h, v3.8h
urshr v0.8h, v0.8h, #1
urshr v1.8h, v1.8h, #1
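        // addp + urshr #1 form the rounded average of horizontally adjacent
        // luma pixels for the horizontally subsampled (sx=1) chroma layouts.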
.if !\csfl
smull v2.4s, v0.4h, v8.4h
smull2 v3.4s, v0.8h, v8.8h
smull v0.4s, v1.4h, v8.4h
smull2 v1.4s, v1.8h, v8.8h
smlal v2.4s, v10.4h, v9.4h
smlal2 v3.4s, v10.8h, v9.8h
smlal v0.4s, v11.4h, v9.4h
smlal2 v1.4s, v11.8h, v9.8h
shrn v2.4h, v2.4s, #6
shrn2 v2.8h, v3.4s, #6
shrn v3.4h, v0.4s, #6
shrn2 v3.8h, v1.4s, #6
add v0.8h, v2.8h, v24.8h
add v1.8h, v3.8h, v24.8h
movi v2.8h, #0
smin v0.8h, v0.8h, v23.8h
smin v1.8h, v1.8h, v23.8h
smax v0.8h, v0.8h, v2.8h
smax v1.8h, v1.8h, v2.8h
.else
// Make sure that any uninitialized pixels past the right edge are
// brought into range; their actual values shouldn't matter.
and v0.16b, v0.16b, v23.16b
and v1.16b, v1.16b, v23.16b
.endif
bl gather16_neon
uxtl v4.8h, v6.8b // scaling
uxtl2 v5.8h, v6.16b
ushl v4.8h, v4.8h, v29.8h // scaling << (15 - scaling_shift)
ushl v5.8h, v5.8h, v29.8h
sqrdmulh v16.8h, v16.8h, v4.8h // round2((scaling << (15 - scaling_shift)) * grain, 15)
sqrdmulh v17.8h, v17.8h, v5.8h
usqadd v10.8h, v16.8h // *src + noise
usqadd v11.8h, v17.8h
umax v0.8h, v10.8h, v30.8h
umax v1.8h, v11.8h, v30.8h
umin v0.8h, v0.8h, v31.8h
umin v1.8h, v1.8h, v31.8h
.if \oy
mov v16.16b, v25.16b
.endif
subs w9, w9, #1
.if \oy
mov v25.16b, v26.16b
mov v26.16b, v16.16b
.endif
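        // Swap the two vertical overlap weight vectors so that the next
        // overlap row applies the (top, current) weight pair in the opposite order.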
st1 {v0.8h, v1.8h}, [x0], x2 // dst
b.gt 1b
.if \oy
cmp w12, #0
mov w9, w12 // restore actual remaining h
b.gt L(fguv_loop_sx1_csfl\csfl\()_\ox\()0)
.endif
b 9f
.endm
fguv_loop_sx1 0, 0, 0
fguv_loop_sx1 0, 0, 1
fguv_loop_sx1 0, 1, 0
fguv_loop_sx1 0, 1, 1
fguv_loop_sx1 1, 0, 0
fguv_loop_sx1 1, 0, 1
fguv_loop_sx1 1, 1, 0
fguv_loop_sx1 1, 1, 1
9:
ldp d14, d15, [sp, #64]
ldp d12, d13, [sp, #48]
ldp d10, d11, [sp, #32]
ldp d8, d9, [sp, #16]
ldr x30, [sp], #80
ret
L(fguv_loop_sx1_tbl):
.hword L(fguv_loop_sx1_tbl) - L(fguv_loop_sx1_csfl0_00)
.hword L(fguv_loop_sx1_tbl) - L(fguv_loop_sx1_csfl0_01)
.hword L(fguv_loop_sx1_tbl) - L(fguv_loop_sx1_csfl0_10)
.hword L(fguv_loop_sx1_tbl) - L(fguv_loop_sx1_csfl0_11)
.hword L(fguv_loop_sx1_tbl) - L(fguv_loop_sx1_csfl1_00)
.hword L(fguv_loop_sx1_tbl) - L(fguv_loop_sx1_csfl1_01)
.hword L(fguv_loop_sx1_tbl) - L(fguv_loop_sx1_csfl1_10)
.hword L(fguv_loop_sx1_tbl) - L(fguv_loop_sx1_csfl1_11)
endfunc

View File

@ -60,6 +60,7 @@ void BF(dav1d_generate_grain_uv_ ## suff, neon)(entry buf[][GRAIN_WIDTH], \
GEN_GRAIN_UV(420);
GEN_GRAIN_UV(422);
GEN_GRAIN_UV(444);
#endif
// Use ptrdiff_t instead of int for the last few parameters, to get the
// same layout of parameters on the stack across platforms.
@ -149,7 +150,6 @@ static void fgy_32x32xn_neon(pixel *const dst_row, const pixel *const src_row,
}
}
#if BITDEPTH == 8
#define fguv_ss_fn(nm, sx, sy) \
static void \
fguv_32x32xn_##nm##_neon(pixel *const dst_row, const pixel *const src_row, \
@ -204,16 +204,12 @@ fguv_ss_fn(420, 1, 1);
fguv_ss_fn(422, 1, 0);
fguv_ss_fn(444, 0, 0);
#endif
#endif
COLD void bitfn(dav1d_film_grain_dsp_init_arm)(Dav1dFilmGrainDSPContext *const c) {
const unsigned flags = dav1d_get_cpu_flags();
if (!(flags & DAV1D_ARM_CPU_FLAG_NEON)) return;
#if ARCH_AARCH64
#if BITDEPTH == 8
#if ARCH_AARCH64 && BITDEPTH == 8
c->generate_grain_y = BF(dav1d_generate_grain_y, neon);
c->generate_grain_uv[DAV1D_PIXEL_LAYOUT_I420 - 1] = BF(dav1d_generate_grain_uv_420, neon);
c->generate_grain_uv[DAV1D_PIXEL_LAYOUT_I422 - 1] = BF(dav1d_generate_grain_uv_422, neon);
@ -221,10 +217,7 @@ COLD void bitfn(dav1d_film_grain_dsp_init_arm)(Dav1dFilmGrainDSPContext *const c
#endif
c->fgy_32x32xn = fgy_32x32xn_neon;
#if BITDEPTH == 8
c->fguv_32x32xn[DAV1D_PIXEL_LAYOUT_I420 - 1] = fguv_32x32xn_420_neon;
c->fguv_32x32xn[DAV1D_PIXEL_LAYOUT_I422 - 1] = fguv_32x32xn_422_neon;
c->fguv_32x32xn[DAV1D_PIXEL_LAYOUT_I444 - 1] = fguv_32x32xn_444_neon;
#endif
#endif
}

View File

@ -144,6 +144,7 @@ if is_asm_enabled
if dav1d_bitdepths.contains('8')
libdav1d_sources_asm += files(
'arm/32/cdef.S',
'arm/32/film_grain.S',
'arm/32/ipred.S',
'arm/32/loopfilter.S',
'arm/32/looprestoration.S',
@ -154,6 +155,7 @@ if is_asm_enabled
if dav1d_bitdepths.contains('16')
libdav1d_sources_asm += files(
'arm/32/cdef16.S',
'arm/32/film_grain16.S',
'arm/32/ipred16.S',
'arm/32/itx16.S',
'arm/32/loopfilter16.S',
@ -218,11 +220,14 @@ if is_asm_enabled
'x86/film_grain16_avx2.asm',
'x86/ipred16_avx2.asm',
'x86/itx16_avx2.asm',
'x86/itx16_sse.asm',
'x86/loopfilter16_avx2.asm',
'x86/looprestoration16_avx2.asm',
'x86/mc16_avx2.asm',
'x86/cdef16_sse.asm',
'x86/itx16_sse.asm',
'x86/loopfilter16_sse.asm',
'x86/looprestoration16_sse.asm',
'x86/mc16_sse.asm',
)
endif

View File

@ -29,7 +29,6 @@
%if ARCH_X86_64
SECTION_RODATA 32
pd_0x10000: times 8 dd 0x10000
pw_1024: times 16 dw 1024
pw_23_22: times 8 dw 23, 22
pb_mask: db 0, 0x80, 0x80, 0, 0x80, 0, 0, 0x80, 0x80, 0, 0, 0x80, 0, 0x80, 0x80, 0
@ -844,7 +843,7 @@ cglobal fgy_32x32xn_16bpc, 6, 14, 16, dst, src, stride, fg_data, w, scaling, gra
mov r7d, [fg_dataq+FGData.scaling_shift]
lea r8, [pb_mask]
%define base r8-pb_mask
vpbroadcastw m11, [base+round_vals+r7*2-12]
vpbroadcastw m11, [base+mul_bits+r7*2-14]
mov r6d, [fg_dataq+FGData.clip_to_restricted_range]
mov r9d, r9m ; bdmax
sar r9d, 11 ; is_12bpc
@ -854,7 +853,6 @@ cglobal fgy_32x32xn_16bpc, 6, 14, 16, dst, src, stride, fg_data, w, scaling, gra
lea r9d, [r6d*2+r9d]
vpbroadcastw m12, [base+max+r9*2]
vpbroadcastw m10, r9m
mov r9mp, r7
pxor m2, m2
DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, unused1, \
@ -921,27 +919,17 @@ cglobal fgy_32x32xn_16bpc, 6, 14, 16, dst, src, stride, fg_data, w, scaling, gra
vpgatherdd m5, [scalingq+m6-3], m3
vpgatherdd m6, [scalingq+m7-3], m9
REPX {psrld x, 24}, m8, m4, m5, m6
REPX {por x, [pd_0x10000]}, m8, m4, m5, m6
packssdw m8, m4
packssdw m5, m6
; grain = grain_lut[offy+y][offx+x]
movu m9, [grain_lutq+offxyq*2]
movu m3, [grain_lutq+offxyq*2+32]
; noise = round2(scaling[src] * grain, scaling_shift)
; the problem here is that since the grain is 10-bits, the product of
; scaling*grain is 17+sign bits, so we need to unfortunately do some
; of these steps in 32-bits
punpckhwd m7, m9, m11
punpcklwd m9, m11
pmaddwd m9, m8
pmaddwd m7, m4
punpckhwd m8, m3, m11
punpcklwd m3, m11
pmaddwd m3, m5
pmaddwd m8, m6
REPX {psrad x, r9m}, m9, m7, m3, m8
packssdw m9, m7
packssdw m3, m8
REPX {pmullw x, m11}, m8, m5
pmulhrsw m9, m8
pmulhrsw m3, m5
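    ; assuming m11 holds 1 << (15 - scaling_shift) (loaded from mul_bits),
    ; pmullw + pmulhrsw compute round2(scaling * grain, scaling_shift) in 16 bits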
; dst = clip_pixel(src, noise)
paddw m0, m9
@ -1014,7 +1002,8 @@ cglobal fgy_32x32xn_16bpc, 6, 14, 16, dst, src, stride, fg_data, w, scaling, gra
vpgatherdd m5, [scalingq+m6-3], m3
vpgatherdd m6, [scalingq+m7-3], m9
REPX {psrld x, 24}, m8, m4, m5, m6
REPX {por x, [pd_0x10000]}, m8, m4, m5, m6
packssdw m8, m4
packssdw m5, m6
; grain = grain_lut[offy+y][offx+x]
movu m9, [grain_lutq+offxyq*2]
@ -1033,17 +1022,9 @@ cglobal fgy_32x32xn_16bpc, 6, 14, 16, dst, src, stride, fg_data, w, scaling, gra
movu m3, [grain_lutq+offxyq*2+32]
; noise = round2(scaling[src] * grain, scaling_shift)
punpckhwd m7, m9, m11
punpcklwd m9, m11
pmaddwd m9, m8
pmaddwd m7, m4
punpckhwd m8, m3, m11
punpcklwd m3, m11
pmaddwd m3, m5
pmaddwd m8, m6
REPX {psrad x, r9m}, m9, m7, m3, m8
packssdw m9, m7
packssdw m3, m8
REPX {pmullw x, m11}, m8, m5
pmulhrsw m9, m8
pmulhrsw m3, m5
; dst = clip_pixel(src, noise)
paddw m0, m9
@ -1167,16 +1148,11 @@ cglobal fgy_32x32xn_16bpc, 6, 14, 16, dst, src, stride, fg_data, w, scaling, gra
vpgatherdd m6, [scalingq+m4-3], m3
vpgatherdd m4, [scalingq+m5-3], m9
REPX {psrld x, 24}, m6, m4
REPX {por x, [pd_0x10000]}, m6, m4
packssdw m6, m4
; noise = round2(scaling[src] * grain, scaling_shift)
punpckhwd m9, m7, m11
punpcklwd m7, m11
pmaddwd m6, m7
pmaddwd m4, m9
REPX {psrad x, r9m}, m6, m4
packssdw m6, m4
pmullw m6, m11
pmulhrsw m6, m7
; same for the other half
pminuw m1, m10, [srcq+32] ; m0-1: src as word
@ -1187,16 +1163,11 @@ cglobal fgy_32x32xn_16bpc, 6, 14, 16, dst, src, stride, fg_data, w, scaling, gra
vpgatherdd m5, [scalingq+m4-3], m3
vpgatherdd m4, [scalingq+m9-3], m7
REPX {psrld x, 24}, m5, m4
REPX {por x, [pd_0x10000]}, m5, m4
punpckhwd m9, m8, m11
punpcklwd m8, m11
pmaddwd m5, m8
pmaddwd m4, m9
REPX {psrad x, r9m}, m5, m4
packssdw m5, m4
pmullw m5, m11
pmulhrsw m5, m8
; dst = clip_pixel(src, noise)
paddw m0, m6
paddw m1, m5
@ -1313,15 +1284,11 @@ cglobal fgy_32x32xn_16bpc, 6, 14, 16, dst, src, stride, fg_data, w, scaling, gra
pcmpeqw m9, m9
vpgatherdd m4, [scalingq+m5-3], m9
REPX {psrld x, 24}, m6, m4
REPX {por x, [pd_0x10000]}, m6, m4
packssdw m6, m4
; noise = round2(scaling[src] * grain, scaling_shift)
punpckhwd m9, m7, m11
punpcklwd m7, m11
pmaddwd m9, m4
pmaddwd m7, m6
REPX {psrad x, r9m}, m9, m7
packssdw m7, m9
pmullw m6, m11
pmulhrsw m7, m6
; other half
punpckhwd m5, m1, m2
@ -1333,15 +1300,11 @@ cglobal fgy_32x32xn_16bpc, 6, 14, 16, dst, src, stride, fg_data, w, scaling, gra
pcmpeqw m6, m6
vpgatherdd m4, [scalingq+m5-3], m6
REPX {psrld x, 24}, m9, m4
REPX {por x, [pd_0x10000]}, m9, m4
packssdw m9, m4
; noise = round2(scaling[src] * grain, scaling_shift)
punpckhwd m6, m3, m11
punpcklwd m3, m11
pmaddwd m6, m4
pmaddwd m3, m9
REPX {psrad x, r9m}, m6, m3
packssdw m3, m6
pmullw m9, m11
pmulhrsw m3, m9
; dst = clip_pixel(src, noise)
paddw m0, m7
@ -1378,7 +1341,7 @@ cglobal fguv_32x32xn_i420_16bpc, 6, 15, 16, dst, src, stride, fg_data, w, scalin
%define base r8-pb_mask
lea r8, [pb_mask]
mov r7d, [fg_dataq+FGData.scaling_shift]
vpbroadcastw m11, [base+round_vals+r7*2-12]
vpbroadcastw m11, [base+mul_bits+r7*2-14]
mov r6d, [fg_dataq+FGData.clip_to_restricted_range]
mov r9d, r13m ; bdmax
sar r9d, 11 ; is_12bpc
@ -1391,7 +1354,6 @@ cglobal fguv_32x32xn_i420_16bpc, 6, 15, 16, dst, src, stride, fg_data, w, scalin
vpbroadcastw m12, [base+max+r10*2]
vpbroadcastw m10, r13m
pxor m2, m2
mov r13mp, r7
cmp byte [fg_dataq+FGData.chroma_scaling_from_luma], 0
jne .csfl
@ -1510,24 +1472,17 @@ cglobal fguv_32x32xn_i420_16bpc, 6, 15, 16, dst, src, stride, fg_data, w, scalin
vpgatherdd m5, [scalingq+m6-3], m3
vpgatherdd m6, [scalingq+m7-3], m9
REPX {psrld x, 24}, m8, m4, m5, m6
REPX {por x, [pd_0x10000]}, m8, m4, m5, m6
packssdw m8, m4
packssdw m5, m6
; grain = grain_lut[offy+y][offx+x]
movu m9, [grain_lutq+offxyq*2]
movu m3, [grain_lutq+offxyq*2+82*2]
; noise = round2(scaling[luma_src] * grain, scaling_shift)
punpckhwd m7, m9, m11
punpcklwd m9, m11
pmaddwd m9, m8
pmaddwd m7, m4
punpckhwd m8, m3, m11
punpcklwd m3, m11
pmaddwd m3, m5
pmaddwd m8, m6
REPX {psrad x, r13m}, m9, m7, m3, m8
packssdw m9, m7
packssdw m3, m8
REPX {pmullw x, m11}, m8, m5
pmulhrsw m9, m8
pmulhrsw m3, m5
; dst = clip_pixel(src, noise)
paddw m0, m9
@ -1655,15 +1610,11 @@ cglobal fguv_32x32xn_i420_16bpc, 6, 15, 16, dst, src, stride, fg_data, w, scalin
pcmpeqw m7, m7
vpgatherdd m4, [scalingq+m5-3], m7
REPX {psrld x, 24}, m8, m4
REPX {por x, [pd_0x10000]}, m8, m4
packssdw m8, m4
; noise = round2(scaling[luma_src] * grain, scaling_shift)
punpckhwd m7, m9, m11
punpcklwd m9, m11
pmaddwd m9, m8
pmaddwd m7, m4
REPX {psrad x, r13m}, m9, m7
packssdw m9, m7
pmullw m8, m11
pmulhrsw m9, m8
; same for the other half
punpckhwd m7, m6, m2
@ -1673,15 +1624,11 @@ cglobal fguv_32x32xn_i420_16bpc, 6, 15, 16, dst, src, stride, fg_data, w, scalin
vpgatherdd m5, [scalingq+m6-3], m8
vpgatherdd m6, [scalingq+m7-3], m4
REPX {psrld x, 24}, m5, m6
REPX {por x, [pd_0x10000]}, m5, m6
packssdw m5, m6
; noise = round2(scaling[luma_src] * grain, scaling_shift)
punpckhwd m8, m3, m11
punpcklwd m3, m11
pmaddwd m3, m5
pmaddwd m8, m6
REPX {psrad x, r13m}, m3, m8
packssdw m3, m8
pmullw m5, m11
pmulhrsw m3, m5
; dst = clip_pixel(src, noise)
paddw m0, m9
@ -1841,15 +1788,11 @@ cglobal fguv_32x32xn_i420_16bpc, 6, 15, 16, dst, src, stride, fg_data, w, scalin
pcmpeqw m7, m7
vpgatherdd m4, [scalingq+m5-3], m7
REPX {psrld x, 24}, m8, m4
REPX {por x, [pd_0x10000]}, m8, m4
packssdw m8, m4
; noise = round2(scaling[luma_src] * grain, scaling_shift)
punpckhwd m7, m9, m11
punpcklwd m9, m11
pmaddwd m9, m8
pmaddwd m7, m4
REPX {psrad x, r13m}, m9, m7
packssdw m9, m7
pmullw m8, m11
pmulhrsw m9, m8
; same for the other half
punpckhwd m7, m6, m2
@ -1859,16 +1802,12 @@ cglobal fguv_32x32xn_i420_16bpc, 6, 15, 16, dst, src, stride, fg_data, w, scalin
vpgatherdd m5, [scalingq+m6-3], m8
vpgatherdd m6, [scalingq+m7-3], m4
REPX {psrld x, 24}, m5, m6
REPX {por x, [pd_0x10000]}, m5, m6
packssdw m5, m6
; noise = round2(scaling[luma_src] * grain, scaling_shift)
movu m3, [grain_lutq+offxyq*2+82*2]
punpckhwd m8, m3, m11
punpcklwd m3, m11
pmaddwd m3, m5
pmaddwd m8, m6
REPX {psrad x, r13m}, m3, m8
packssdw m3, m8
pmullw m5, m11
pmulhrsw m3, m5
; dst = clip_pixel(src, noise)
paddw m0, m9
@ -2025,15 +1964,11 @@ cglobal fguv_32x32xn_i420_16bpc, 6, 15, 16, dst, src, stride, fg_data, w, scalin
pcmpeqw m7, m7
vpgatherdd m4, [scalingq+m5-3], m7
REPX {psrld x, 24}, m8, m4
REPX {por x, [pd_0x10000]}, m8, m4
packssdw m8, m4
; noise = round2(scaling[luma_src] * grain, scaling_shift)
punpckhwd m7, m9, m11
punpcklwd m9, m11
pmaddwd m9, m8
pmaddwd m7, m4
REPX {psrad x, r13m}, m9, m7
packssdw m9, m7
pmullw m8, m11
pmulhrsw m9, m8
; same for the other half
punpckhwd m7, m6, m2
@ -2043,15 +1978,11 @@ cglobal fguv_32x32xn_i420_16bpc, 6, 15, 16, dst, src, stride, fg_data, w, scalin
vpgatherdd m5, [scalingq+m6-3], m8
vpgatherdd m6, [scalingq+m7-3], m4
REPX {psrld x, 24}, m5, m6
REPX {por x, [pd_0x10000]}, m5, m6
packssdw m5, m6
; noise = round2(scaling[luma_src] * grain, scaling_shift)
punpckhwd m8, m3, m11
punpcklwd m3, m11
pmaddwd m3, m5
pmaddwd m8, m6
REPX {psrad x, r13m}, m3, m8
packssdw m3, m8
pmullw m5, m11
pmulhrsw m3, m5
; dst = clip_pixel(src, noise)
paddw m0, m9

File diff suppressed because it is too large

View File

@ -36,6 +36,7 @@ decl_loopfilter_sb_fn(dav1d_lpf_v_sb_uv_##ext)
decl_loopfilter_sb_fns(ssse3);
decl_loopfilter_sb_fns(avx2);
decl_loopfilter_sb_fns(16bpc_ssse3);
decl_loopfilter_sb_fns(16bpc_avx2);
COLD void bitfn(dav1d_loop_filter_dsp_init_x86)(Dav1dLoopFilterDSPContext *const c) {
@ -48,6 +49,13 @@ COLD void bitfn(dav1d_loop_filter_dsp_init_x86)(Dav1dLoopFilterDSPContext *const
c->loop_filter_sb[0][1] = dav1d_lpf_v_sb_y_ssse3;
c->loop_filter_sb[1][0] = dav1d_lpf_h_sb_uv_ssse3;
c->loop_filter_sb[1][1] = dav1d_lpf_v_sb_uv_ssse3;
#else
#if ARCH_X86_64
c->loop_filter_sb[0][0] = dav1d_lpf_h_sb_y_16bpc_ssse3;
c->loop_filter_sb[0][1] = dav1d_lpf_v_sb_y_16bpc_ssse3;
c->loop_filter_sb[1][0] = dav1d_lpf_h_sb_uv_16bpc_ssse3;
c->loop_filter_sb[1][1] = dav1d_lpf_v_sb_uv_16bpc_ssse3;
#endif
#endif
if (!(flags & DAV1D_X86_CPU_FLAG_AVX2)) return;

View File

@ -56,7 +56,7 @@ pd_8: dd 8
pd_25: dd 25
pd_4096: dd 4096
pd_34816: dd 34816
pd_m262128 dd -262128
pd_m262128: dd -262128
pd_0xf00800a4: dd 0xf00800a4
pd_0xf00801c7: dd 0xf00801c7

File diff suppressed because it is too large

View File

@ -197,9 +197,9 @@ COLD void bitfn(dav1d_loop_restoration_dsp_init_x86)(Dav1dLoopRestorationDSPCont
#endif
if (!(flags & DAV1D_X86_CPU_FLAG_SSSE3)) return;
#if BITDEPTH == 8
c->wiener[0] = BF(dav1d_wiener_filter7, ssse3);
c->wiener[1] = BF(dav1d_wiener_filter5, ssse3);
#if BITDEPTH == 8
c->sgr[0] = BF(sgr_filter_5x5, ssse3);
c->sgr[1] = BF(sgr_filter_3x3, ssse3);
c->sgr[2] = BF(sgr_filter_mix, ssse3);

third_party/dav1d/src/x86/mc16_sse.asm (new vendored file, 4144 lines)

File diff suppressed because it is too large

View File

@ -47,7 +47,7 @@
decl_##type##_fn(name##_16bpc_sse2); \
decl_##type##_fn(name##_16bpc_ssse3); \
decl_##type##_fn(name##_16bpc_avx2); \
decl_##type##_fn(name##_avx512icl);
decl_##type##_fn(name##_16bpc_avx512icl);
#define init_mc_fn(type, name, suffix) \
c->mc[type] = dav1d_put_##name##_16bpc_##suffix
#define init_mct_fn(type, name, suffix) \
@ -147,8 +147,6 @@ COLD void bitfn(dav1d_mc_dsp_init_x86)(Dav1dMCDSPContext *const c) {
if(!(flags & DAV1D_X86_CPU_FLAG_SSSE3))
return;
#if BITDEPTH == 8
init_mc_fn(FILTER_2D_BILINEAR, bilin, ssse3);
init_mc_fn(FILTER_2D_8TAP_REGULAR, 8tap_regular, ssse3);
init_mc_fn(FILTER_2D_8TAP_REGULAR_SMOOTH, 8tap_regular_smooth, ssse3);
init_mc_fn(FILTER_2D_8TAP_REGULAR_SHARP, 8tap_regular_sharp, ssse3);
@ -158,8 +156,8 @@ COLD void bitfn(dav1d_mc_dsp_init_x86)(Dav1dMCDSPContext *const c) {
init_mc_fn(FILTER_2D_8TAP_SHARP_REGULAR, 8tap_sharp_regular, ssse3);
init_mc_fn(FILTER_2D_8TAP_SHARP_SMOOTH, 8tap_sharp_smooth, ssse3);
init_mc_fn(FILTER_2D_8TAP_SHARP, 8tap_sharp, ssse3);
init_mc_fn(FILTER_2D_BILINEAR, bilin, ssse3);
init_mct_fn(FILTER_2D_BILINEAR, bilin, ssse3);
init_mct_fn(FILTER_2D_8TAP_REGULAR, 8tap_regular, ssse3);
init_mct_fn(FILTER_2D_8TAP_REGULAR_SMOOTH, 8tap_regular_smooth, ssse3);
init_mct_fn(FILTER_2D_8TAP_REGULAR_SHARP, 8tap_regular_sharp, ssse3);
@ -169,8 +167,9 @@ COLD void bitfn(dav1d_mc_dsp_init_x86)(Dav1dMCDSPContext *const c) {
init_mct_fn(FILTER_2D_8TAP_SHARP_REGULAR, 8tap_sharp_regular, ssse3);
init_mct_fn(FILTER_2D_8TAP_SHARP_SMOOTH, 8tap_sharp_smooth, ssse3);
init_mct_fn(FILTER_2D_8TAP_SHARP, 8tap_sharp, ssse3);
init_mct_fn(FILTER_2D_BILINEAR, bilin, ssse3);
#if ARCH_X86_64
#if BITDEPTH == 8 && ARCH_X86_64
init_mc_scaled_fn(FILTER_2D_8TAP_REGULAR, 8tap_scaled_regular, ssse3);
init_mc_scaled_fn(FILTER_2D_8TAP_REGULAR_SMOOTH, 8tap_scaled_regular_smooth, ssse3);
init_mc_scaled_fn(FILTER_2D_8TAP_REGULAR_SHARP, 8tap_scaled_regular_sharp, ssse3);
@ -194,6 +193,7 @@ COLD void bitfn(dav1d_mc_dsp_init_x86)(Dav1dMCDSPContext *const c) {
init_mct_scaled_fn(FILTER_2D_BILINEAR, bilin_scaled, ssse3);
#endif
#if BITDEPTH == 8
c->avg = dav1d_avg_ssse3;
c->w_avg = dav1d_w_avg_ssse3;
c->mask = dav1d_mask_ssse3;
@ -207,6 +207,18 @@ COLD void bitfn(dav1d_mc_dsp_init_x86)(Dav1dMCDSPContext *const c) {
c->emu_edge = dav1d_emu_edge_ssse3;
c->resize = dav1d_resize_ssse3;
#else
c->avg = dav1d_avg_16bpc_ssse3;
c->w_avg = dav1d_w_avg_16bpc_ssse3;
c->mask = dav1d_mask_16bpc_ssse3;
c->w_mask[0] = dav1d_w_mask_444_16bpc_ssse3;
c->w_mask[1] = dav1d_w_mask_422_16bpc_ssse3;
c->w_mask[2] = dav1d_w_mask_420_16bpc_ssse3;
c->blend = dav1d_blend_16bpc_ssse3;
c->blend_v = dav1d_blend_v_16bpc_ssse3;
c->blend_h = dav1d_blend_h_16bpc_ssse3;
c->emu_edge = dav1d_emu_edge_16bpc_ssse3;
#endif
if(!(flags & DAV1D_X86_CPU_FLAG_SSE41))

View File

@ -282,9 +282,9 @@ void checkasm_stack_clobber(uint64_t clobber, ...);
#ifdef readtime
#define bench_new(...)\
do {\
func_type *tfunc = func_new;\
checkasm_set_signal_handler_state(1);\
if (checkasm_bench_func()) {\
checkasm_set_signal_handler_state(1);\
func_type *tfunc = func_new;\
uint64_t tsum = 0;\
int tcount = 0;\
for (int ti = 0; ti < BENCH_RUNS; ti++) {\
@ -299,9 +299,11 @@ void checkasm_stack_clobber(uint64_t clobber, ...);
tcount++;\
}\
}\
checkasm_set_signal_handler_state(0);\
checkasm_update_bench(tcount, tsum);\
} else {\
tfunc(__VA_ARGS__);\
}\
checkasm_set_signal_handler_state(0);\
} while (0)
#else
#define bench_new(...) do {} while (0)

View File

@ -188,12 +188,21 @@ static void check_fgy_sbrow(const Dav1dFilmGrainDSPContext *const dsp) {
for (fg_data[0].overlap_flag = 0; fg_data[0].overlap_flag <= 1;
fg_data[0].overlap_flag++)
{
for (int i = 0; i <= fg_data[0].overlap_flag; i++) {
for (int i = 0; i <= 2 * fg_data[0].overlap_flag; i++) {
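            // i == 0: first row (row_num == 0); i == 1: later row with a typical
            // height; i == 2: later row only 1-2 rows tall, i.e. lying entirely
            // within the vertical overlap rows.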
int w, h, row_num;
if (fg_data[0].overlap_flag) {
w = 35 + (rnd() % 93);
h = 3 + (rnd() % 29);
row_num = i ? 1 + (rnd() & 0x7ff) : 0;
if (i == 0) {
row_num = 0;
h = 1 + (rnd() % 31);
} else {
row_num = 1 + (rnd() & 0x7ff);
if (i == 1) {
h = 3 + (rnd() % 30);
} else {
h = 1 + (rnd() & 1);
}
}
} else {
w = 1 + (rnd() & 127);
h = 1 + (rnd() & 31);
@ -220,6 +229,11 @@ static void check_fgy_sbrow(const Dav1dFilmGrainDSPContext *const dsp) {
}
}
fg_data[0].overlap_flag = 1;
for (int y = 0; y < 32; y++) {
// Make sure all pixels are in range
for (int x = 0; x < 128; x++)
src[y * PXSTRIDE(stride) + x] &= bitdepth_max;
}
bench_new(a_dst, src, stride, fg_data, 64, scaling, grain_lut, 32,
1 HIGHBD_TAIL_SUFFIX);
}
@ -311,12 +325,21 @@ static void check_fguv_sbrow(const Dav1dFilmGrainDSPContext *const dsp) {
for (fg_data[0].overlap_flag = 0; fg_data[0].overlap_flag <= 1;
fg_data[0].overlap_flag++)
{
for (int i = 0; i <= fg_data[0].overlap_flag; i++) {
for (int i = 0; i <= 2 * fg_data[0].overlap_flag; i++) {
int w, h, row_num;
if (fg_data[0].overlap_flag) {
w = (36 >> ss_x) + (rnd() % (92 >> ss_x));
h = (4 >> ss_y) + (rnd() % (28 >> ss_y));
row_num = i ? 1 + (rnd() & 0x7ff) : 0;
if (i == 0) {
row_num = 0;
h = 1 + (rnd() & (31 >> ss_y));
} else {
row_num = 1 + (rnd() & 0x7ff);
if (i == 1) {
h = (ss_y ? 2 : 3) + (rnd() % (ss_y ? 15 : 30));
} else {
h = ss_y ? 1 : 1 + (rnd() & 1);
}
}
} else {
w = 1 + (rnd() & (127 >> ss_x));
h = 1 + (rnd() & (31 >> ss_y));
@ -350,6 +373,13 @@ static void check_fguv_sbrow(const Dav1dFilmGrainDSPContext *const dsp) {
}
fg_data[0].overlap_flag = 1;
for (int y = 0; y < 32; y++) {
// Make sure all pixels are in range
for (int x = 0; x < 128; x++) {
src[y * PXSTRIDE(stride) + x] &= bitdepth_max;
luma_src[y * PXSTRIDE(lstride) + x] &= bitdepth_max;
}
}
bench_new(a_dst, src, stride, fg_data, 32, scaling, grain_lut[1], 16,
1, luma_src, lstride, uv_pl, is_identity HIGHBD_TAIL_SUFFIX);
}

View File

@ -33,13 +33,12 @@
#include "src/loopfilter.h"
static void init_lpf_border(pixel *const dst, const ptrdiff_t stride,
int E, int I, int H, const int bitdepth_max)
int E, int I, const int bitdepth_max)
{
const int bitdepth_min_8 = bitdepth_from_max(bitdepth_max) - 8;
const int F = 1 << bitdepth_min_8;
E <<= bitdepth_min_8;
I <<= bitdepth_min_8;
H <<= bitdepth_min_8;
const int filter_type = rnd() % 4;
const int edge_diff = rnd() % ((E + 2) * 4) - 2 * (E + 2);
@ -171,7 +170,7 @@ static void check_lpf_sb(loopfilter_sb_fn fn, const char *const name,
L = l[2 * x + 1][lf_idx] ? l[2 * x + 1][lf_idx] : l[2 * x][lf_idx];
}
init_lpf_border(c_dst + i * (dir ? 1 : 16), dir ? 128 : 1,
lut.e[L], lut.i[L], L >> 4, bitdepth_max);
lut.e[L], lut.i[L], bitdepth_max);
}
memcpy(a_dst_mem, c_dst_mem, 128 * sizeof(pixel) * 16);