Bug 1901600 - Update dav1d to 92f592ed104ba92ad35c781ee93f354525eef503 r=chunmin

Differential Revision: https://phabricator.services.mozilla.com/D213129
Updatebot 2024-06-11 18:00:44 +00:00
parent a4f4b59279
commit b50386f0ec
30 changed files with 6789 additions and 2779 deletions

@ -20,11 +20,11 @@ origin:
# Human-readable identifier for this version/release
# Generally "version NNN", "tag SSS", "bookmark SSS"
release: 5b5399911dd24703de641d65eda5b7f1e845d060 (2024-04-15T13:19:42.000+02:00).
release: 92f592ed104ba92ad35c781ee93f354525eef503 (2024-06-05T23:22:36.000+02:00).
# Revision to pull in
# Must be a long or short commit SHA (long preferred)
revision: 5b5399911dd24703de641d65eda5b7f1e845d060
revision: 92f592ed104ba92ad35c781ee93f354525eef503
# The package's license, where possible using the mnemonic from
# https://spdx.org/licenses/

@ -1,2 +1,2 @@
/* auto-generated, do not edit */
#define DAV1D_VERSION "5b5399911dd24703de641d65eda5b7f1e845d060"
#define DAV1D_VERSION "92f592ed104ba92ad35c781ee93f354525eef503"

@ -1,3 +1,18 @@
Changes for 1.4.2 'Road Runner':
--------------------------------
1.4.2 is a small release of dav1d, improving notably ARM, AVX-512 and PowerPC
- AVX2 optimizations for 8-tap and new variants for 6-tap
- AVX-512 optimizations for 8-tap and new variants for 6-tap
- Improve entropy decoding on ARM64
- New ARM64 optimizations for convolutions based on DotProd extension
- New ARM64 optimizations for convolutions based on i8mm extension
- New ARM64 optimizations for subpel and prep filters for i8mm
- Misc improvements on existing ARM64 optimizations, notably for put/prep
- New PowerPC9 optimizations for loopfilter
- Support for macOS kperf API for benchmarking
Changes for 1.4.1 'Road Runner':
--------------------------------
@ -246,7 +261,7 @@ Changes for 0.6.0 'Gyrfalcon':
- New SSSE3 optimizations for film grain
- New AVX2 optimizations for msac_adapt16
- Fix rare mismatches against the reference decoder, notably because of clipping
- Improvements on ARM64 on msac, cdef and looprestoration optimizations
- Improvements on ARM64 on msac, cdef, mc_blend_v and looprestoration optimizations
- Improvements on AVX2 optimizations for cdef_filter
- Improvements in the C version for itxfm, cdef_filter

@ -23,7 +23,7 @@
# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
project('dav1d', ['c'],
version: '1.4.1',
version: '1.4.2',
default_options: ['c_std=c99',
'warning_level=2',
'buildtype=release',

@ -840,100 +840,108 @@ endfunc
function put_neon, export=1
adr x9, L(put_tbl)
ldrh w8, [x9, x8, lsl #1]
sub x9, x9, w8, uxtw
sub x9, x9, x8
br x9
2:
20:
AARCH64_VALID_JUMP_TARGET
ld1 {v0.h}[0], [x2], x3
ld1 {v1.h}[0], [x2], x3
2:
ldrh w9, [x2]
ldrh w10, [x2, x3]
add x2, x2, x3, lsl #1
subs w5, w5, #2
st1 {v0.h}[0], [x0], x1
st1 {v1.h}[0], [x0], x1
strh w9, [x0]
strh w10, [x0, x1]
add x0, x0, x1, lsl #1
b.gt 2b
ret
4:
40:
AARCH64_VALID_JUMP_TARGET
ld1 {v0.s}[0], [x2], x3
ld1 {v1.s}[0], [x2], x3
4:
ldr w9, [x2]
ldr w10, [x2, x3]
add x2, x2, x3, lsl #1
subs w5, w5, #2
st1 {v0.s}[0], [x0], x1
st1 {v1.s}[0], [x0], x1
str w9, [x0]
str w10, [x0, x1]
add x0, x0, x1, lsl #1
b.gt 4b
ret
8:
80:
AARCH64_VALID_JUMP_TARGET
ld1 {v0.8b}, [x2], x3
ld1 {v1.8b}, [x2], x3
8:
ldr x9, [x2]
ldr x10, [x2, x3]
add x2, x2, x3, lsl #1
subs w5, w5, #2
st1 {v0.8b}, [x0], x1
st1 {v1.8b}, [x0], x1
str x9, [x0]
str x10, [x0, x1]
add x0, x0, x1, lsl #1
b.gt 8b
ret
160:
AARCH64_VALID_JUMP_TARGET
add x8, x0, x1
lsl x1, x1, #1
add x9, x2, x3
lsl x3, x3, #1
16:
ld1 {v0.16b}, [x2], x3
ld1 {v1.16b}, [x9], x3
ldr q0, [x2]
ldr q1, [x2, x3]
add x2, x2, x3, lsl #1
subs w5, w5, #2
st1 {v0.16b}, [x0], x1
st1 {v1.16b}, [x8], x1
str q0, [x0]
str q1, [x0, x1]
add x0, x0, x1, lsl #1
b.gt 16b
ret
32:
320:
AARCH64_VALID_JUMP_TARGET
ldp x6, x7, [x2]
ldp x8, x9, [x2, #16]
stp x6, x7, [x0]
subs w5, w5, #1
stp x8, x9, [x0, #16]
32:
ldp q0, q1, [x2]
add x2, x2, x3
stp q0, q1, [x0]
add x0, x0, x1
ldp q2, q3, [x2]
add x2, x2, x3
stp q2, q3, [x0]
subs w5, w5, #2
add x0, x0, x1
b.gt 32b
ret
64:
640:
AARCH64_VALID_JUMP_TARGET
ldp x6, x7, [x2]
ldp x8, x9, [x2, #16]
stp x6, x7, [x0]
ldp x10, x11, [x2, #32]
stp x8, x9, [x0, #16]
subs w5, w5, #1
ldp x12, x13, [x2, #48]
stp x10, x11, [x0, #32]
stp x12, x13, [x0, #48]
64:
ldp q0, q1, [x2]
stp q0, q1, [x0]
ldp q2, q3, [x2, #32]
add x2, x2, x3
stp q2, q3, [x0, #32]
subs w5, w5, #1
add x0, x0, x1
b.gt 64b
ret
128:
1280:
AARCH64_VALID_JUMP_TARGET
128:
ldp q0, q1, [x2]
ldp q2, q3, [x2, #32]
stp q0, q1, [x0]
ldp q4, q5, [x2, #64]
ldp q2, q3, [x2, #32]
stp q2, q3, [x0, #32]
ldp q6, q7, [x2, #96]
subs w5, w5, #1
ldp q4, q5, [x2, #64]
stp q4, q5, [x0, #64]
stp q6, q7, [x0, #96]
ldp q6, q7, [x2, #96]
add x2, x2, x3
stp q6, q7, [x0, #96]
subs w5, w5, #1
add x0, x0, x1
b.gt 128b
ret
L(put_tbl):
.hword L(put_tbl) - 128b
.hword L(put_tbl) - 64b
.hword L(put_tbl) - 32b
.hword L(put_tbl) - 1280b
.hword L(put_tbl) - 640b
.hword L(put_tbl) - 320b
.hword L(put_tbl) - 160b
.hword L(put_tbl) - 8b
.hword L(put_tbl) - 4b
.hword L(put_tbl) - 2b
.hword L(put_tbl) - 80b
.hword L(put_tbl) - 40b
.hword L(put_tbl) - 20b
endfunc
@ -942,119 +950,146 @@ endfunc
function prep_neon, export=1
adr x9, L(prep_tbl)
ldrh w8, [x9, x8, lsl #1]
sub x9, x9, w8, uxtw
movi v24.16b, #16
sub x9, x9, x8
br x9
4:
40:
AARCH64_VALID_JUMP_TARGET
4:
ld1 {v0.s}[0], [x1], x2
ld1 {v0.s}[1], [x1], x2
ld1 {v1.s}[0], [x1], x2
subs w4, w4, #2
ld1 {v1.s}[1], [x1], x2
ushll v0.8h, v0.8b, #4
ushll v1.8h, v1.8b, #4
st1 {v0.4h, v1.4h}, [x0], #16
subs w4, w4, #4
stp q0, q1, [x0], #32
b.gt 4b
ret
8:
80:
AARCH64_VALID_JUMP_TARGET
ld1 {v0.8b}, [x1], x2
ld1 {v1.8b}, [x1], x2
subs w4, w4, #2
8:
ldr d0, [x1]
ldr d1, [x1, x2]
add x1, x1, x2, lsl #1
ldr d2, [x1]
ldr d3, [x1, x2]
add x1, x1, x2, lsl #1
ushll v0.8h, v0.8b, #4
ushll v1.8h, v1.8b, #4
st1 {v0.8h, v1.8h}, [x0], #32
umull v2.8h, v2.8b, v24.8b
umull v3.8h, v3.8b, v24.8b
subs w4, w4, #4
stp q0, q1, [x0]
stp q2, q3, [x0, #32]
add x0, x0, #64
b.gt 8b
ret
160:
AARCH64_VALID_JUMP_TARGET
add x9, x1, x2
lsl x2, x2, #1
16:
ld1 {v0.16b}, [x1], x2
ld1 {v1.16b}, [x9], x2
subs w4, w4, #2
ushll v4.8h, v0.8b, #4
ushll2 v5.8h, v0.16b, #4
ushll v6.8h, v1.8b, #4
ushll2 v7.8h, v1.16b, #4
st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], #64
ldr q1, [x1]
ldr q3, [x1, x2]
add x1, x1, x2, lsl #1
ushll v0.8h, v1.8b, #4
ushll2 v1.8h, v1.16b, #4
ldr q5, [x1]
ldr q7, [x1, x2]
add x1, x1, x2, lsl #1
umull v2.8h, v3.8b, v24.8b
umull2 v3.8h, v3.16b, v24.16b
ushll v4.8h, v5.8b, #4
ushll2 v5.8h, v5.16b, #4
umull v6.8h, v7.8b, v24.8b
umull2 v7.8h, v7.16b, v24.16b
subs w4, w4, #4
stp q0, q1, [x0]
stp q2, q3, [x0, #32]
stp q4, q5, [x0, #64]
stp q6, q7, [x0, #96]
add x0, x0, #128
b.gt 16b
ret
320:
AARCH64_VALID_JUMP_TARGET
add x8, x0, w3, uxtw
32:
ld1 {v0.16b, v1.16b}, [x1], x2
ldp q4, q5, [x1]
add x1, x1, x2
ldp q6, q7, [x1]
add x1, x1, x2
ushll v0.8h, v4.8b, #4
ushll2 v1.8h, v4.16b, #4
umull v2.8h, v5.8b, v24.8b
umull2 v3.8h, v5.16b, v24.16b
ushll v4.8h, v6.8b, #4
ushll2 v5.8h, v6.16b, #4
umull v6.8h, v7.8b, v24.8b
umull2 v7.8h, v7.16b, v24.16b
subs w4, w4, #2
ushll v4.8h, v0.8b, #4
ushll2 v5.8h, v0.16b, #4
ld1 {v2.16b, v3.16b}, [x1], x2
ushll v6.8h, v1.8b, #4
ushll2 v7.8h, v1.16b, #4
ushll v16.8h, v2.8b, #4
st1 {v4.8h, v5.8h}, [x0], x7
ushll2 v17.8h, v2.16b, #4
st1 {v6.8h, v7.8h}, [x8], x7
ushll v18.8h, v3.8b, #4
st1 {v16.8h, v17.8h}, [x0], x7
ushll2 v19.8h, v3.16b, #4
st1 {v18.8h, v19.8h}, [x8], x7
stp q0, q1, [x0]
stp q2, q3, [x0, #32]
stp q4, q5, [x0, #64]
stp q6, q7, [x0, #96]
add x0, x0, #128
b.gt 32b
ret
640:
AARCH64_VALID_JUMP_TARGET
add x8, x0, #32
mov x6, #64
64:
ldp q0, q1, [x1]
subs w4, w4, #1
ushll v4.8h, v0.8b, #4
ushll2 v5.8h, v0.16b, #4
ldp q2, q3, [x1, #32]
ushll v6.8h, v1.8b, #4
ushll2 v7.8h, v1.16b, #4
ldp q4, q5, [x1]
ldp q6, q7, [x1, #32]
add x1, x1, x2
ushll v16.8h, v2.8b, #4
st1 {v4.8h, v5.8h}, [x0], x6
ushll2 v17.8h, v2.16b, #4
ushll v18.8h, v3.8b, #4
st1 {v6.8h, v7.8h}, [x8], x6
ushll2 v19.8h, v3.16b, #4
st1 {v16.8h, v17.8h}, [x0], x6
st1 {v18.8h, v19.8h}, [x8], x6
ushll v0.8h, v4.8b, #4
ushll2 v1.8h, v4.16b, #4
umull v2.8h, v5.8b, v24.8b
umull2 v3.8h, v5.16b, v24.16b
ushll v4.8h, v6.8b, #4
ushll2 v5.8h, v6.16b, #4
umull v6.8h, v7.8b, v24.8b
umull2 v7.8h, v7.16b, v24.16b
subs w4, w4, #1
stp q0, q1, [x0]
stp q2, q3, [x0, #32]
stp q4, q5, [x0, #64]
stp q6, q7, [x0, #96]
add x0, x0, #128
b.gt 64b
ret
1280:
AARCH64_VALID_JUMP_TARGET
add x8, x0, #64
mov x6, #128
128:
ldp q0, q1, [x1]
ldp q2, q3, [x1, #32]
ushll v16.8h, v0.8b, #4
ushll2 v17.8h, v0.16b, #4
ushll v18.8h, v1.8b, #4
ushll2 v19.8h, v1.16b, #4
ushll v20.8h, v2.8b, #4
ushll2 v21.8h, v2.16b, #4
ldp q4, q5, [x1, #64]
st1 {v16.8h, v17.8h, v18.8h, v19.8h}, [x0], x6
ushll v22.8h, v3.8b, #4
ushll2 v23.8h, v3.16b, #4
ushll v24.8h, v4.8b, #4
ushll2 v25.8h, v4.16b, #4
ushll v26.8h, v5.8b, #4
ushll2 v27.8h, v5.16b, #4
ldp q6, q7, [x1, #96]
st1 {v20.8h, v21.8h, v22.8h, v23.8h}, [x8], x6
ushll v28.8h, v6.8b, #4
ushll2 v29.8h, v6.16b, #4
ushll v30.8h, v7.8b, #4
ushll2 v31.8h, v7.16b, #4
subs w4, w4, #1
ldp q28, q29, [x1]
ldp q30, q31, [x1, #32]
ushll v16.8h, v28.8b, #4
ushll2 v17.8h, v28.16b, #4
umull v18.8h, v29.8b, v24.8b
umull2 v19.8h, v29.16b, v24.16b
ushll v20.8h, v30.8b, #4
ushll2 v21.8h, v30.16b, #4
umull v22.8h, v31.8b, v24.8b
umull2 v23.8h, v31.16b, v24.16b
ldp q28, q29, [x1, #64]
ldp q30, q31, [x1, #96]
add x1, x1, x2
st1 {v24.8h, v25.8h, v26.8h, v27.8h}, [x0], x6
st1 {v28.8h, v29.8h, v30.8h, v31.8h}, [x8], x6
stp q16, q17, [x0]
stp q18, q19, [x0, #32]
stp q20, q21, [x0, #64]
stp q22, q23, [x0, #96]
ushll v16.8h, v28.8b, #4
ushll2 v17.8h, v28.16b, #4
umull v18.8h, v29.8b, v24.8b
umull2 v19.8h, v29.16b, v24.16b
ushll v20.8h, v30.8b, #4
ushll2 v21.8h, v30.16b, #4
umull v22.8h, v31.8b, v24.8b
umull2 v23.8h, v31.16b, v24.16b
subs w4, w4, #1
stp q16, q17, [x0, #128]
stp q18, q19, [x0, #160]
stp q20, q21, [x0, #192]
stp q22, q23, [x0, #224]
add x0, x0, #256
b.gt 128b
ret
@ -1063,8 +1098,8 @@ L(prep_tbl):
.hword L(prep_tbl) - 640b
.hword L(prep_tbl) - 320b
.hword L(prep_tbl) - 160b
.hword L(prep_tbl) - 8b
.hword L(prep_tbl) - 4b
.hword L(prep_tbl) - 80b
.hword L(prep_tbl) - 40b
endfunc

File diff suppressed because it is too large

@ -35,14 +35,14 @@
#define CNT 28
#define ALLOW_UPDATE_CDF 32
#define COEFFS_BASE_OFFSET 30
#define MASKS8_OFFSET (64-COEFFS_BASE_OFFSET)
const coeffs
.short 60, 56, 52, 48, 44, 40, 36, 32, 28, 24, 20, 16, 12, 8, 4, 0
.short 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
endconst
const bits
.short 0x1, 0x2, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80
.short 0x100, 0x200, 0x400, 0x800, 0x1000, 0x2000, 0x4000, 0x8000
// masks8
.short -0x202, -0x202, -0x202, -0x202, -0x202, -0x202, -0x202, 0xF0E
endconst
.macro ld1_n d0, d1, src, sz, n
@ -96,13 +96,6 @@ endconst
.endif
.endm
.macro urhadd_n d0, d1, s0, s1, s2, s3, sz, n
urhadd \d0\sz, \s0\sz, \s2\sz
.if \n == 16
urhadd \d1\sz, \s1\sz, \s3\sz
.endif
.endm
.macro sshl_n d0, d1, s0, s1, s2, s3, sz, n
sshl \d0\sz, \s0\sz, \s2\sz
.if \n == 16
@ -129,93 +122,189 @@ endconst
function msac_decode_symbol_adapt4_neon, export=1
.macro decode_update sz, szb, n
.if \n == 16
sub sp, sp, #48
.endif
add x8, x0, #RNG
ld1_n v0, v1, x1, \sz, \n // cdf
ld1r {v4\sz}, [x8] // rng
movrel x9, coeffs, 30
ld1r {v29\sz}, [x8] // rng
movrel x9, coeffs, COEFFS_BASE_OFFSET
movi v31\sz, #0x7f, lsl #8 // 0x7f00
sub x9, x9, x2, lsl #1
sub x10, x9, x2, lsl #1
mvni v30\sz, #0x3f // 0xffc0
and v7\szb, v4\szb, v31\szb // rng & 0x7f00
str h4, [sp, #14] // store original u = s->rng
and v7\szb, v29\szb, v31\szb // rng & 0x7f00
.if \n == 16
str h29, [sp, #14] // store original u = s->rng
.endif
and_n v2, v3, v0, v1, v30, v30, \szb, \n // cdf & 0xffc0
ld1_n v4, v5, x9, \sz, \n // EC_MIN_PROB * (n_symbols - ret)
ld1_n v4, v5, x10, \sz, \n // EC_MIN_PROB * (n_symbols - ret)
sqdmulh_n v6, v7, v2, v3, v7, v7, \sz, \n // ((cdf >> EC_PROB_SHIFT) * (r - 128)) >> 1
add x8, x0, #DIF + 6
ldr d28, [x0, #DIF]
add_n v4, v5, v2, v3, v4, v5, \sz, \n // v = cdf + EC_MIN_PROB * (n_symbols - ret)
add_n v4, v5, v6, v7, v4, v5, \sz, \n // v = ((cdf >> EC_PROB_SHIFT) * r) >> 1 + EC_MIN_PROB * (n_symbols - ret)
ld1r {v6.8h}, [x8] // dif >> (EC_WIN_SIZE - 16)
movrel x8, bits
dup v30\sz, v28.h[3] // dif >> (EC_WIN_SIZE - 16)
.if \n == 8
ldur q31, [x9, #MASKS8_OFFSET]
.elseif \n == 16
str_n q4, q5, sp, #16, \n // store v values to allow indexed access
ld1_n v16, v17, x8, .8h, \n
cmhs_n v2, v3, v6, v6, v4, v5, .8h, \n // c >= v
and_n v6, v7, v2, v3, v16, v17, .16b, \n // One bit per halfword set in the mask
.if \n == 16
add v6.8h, v6.8h, v7.8h
.endif
addv h6, v6.8h // Aggregate mask bits
ldr w4, [x0, #ALLOW_UPDATE_CDF]
umov w3, v6.h[0]
rbit w3, w3
clz w15, w3 // ret
cbz w4, L(renorm)
// After the condition starts being true it continues, such that the vector looks like:
// 0, 0, 0 ... -1, -1
cmhs_n v2, v3, v30, v30, v4, v5, \sz, \n // c >= v
.if \n == 4
ext v29\szb, v29\szb, v4\szb, #6 // u
umov x15, v2.d[0]
ldr w4, [x0, #ALLOW_UPDATE_CDF]
rev x15, x15
sub v29\sz, v29\sz, v4\sz // rng = u-v
// rev + clz = count trailing zeros
clz x15, x15 // 16*ret
.elseif \n == 8
// The final short of the compare is always set.
// Using addv, subtract -0x202*ret from this value to create a lookup table for a short.
// For n == 8:
// -0x202 + -0x202 + ... + 0xF0E
// (0x202*7) | (1 << 8)
// ^-------offset for second byte of the short
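// e.g. ret == 3: lanes 3..7 are set, so addv sums 4*(-0x202) + 0xF0E = 0x0706,
// i.e. byte 0 = 2*ret and byte 1 = 2*ret + 1.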
and v31\szb, v31\szb, v2\szb
ext v29\szb, v29\szb, v4\szb, #14 // u
addv h31, v31\sz // ((2*ret + 1) << 8) | (2*ret)
ldr w4, [x0, #ALLOW_UPDATE_CDF]
sub v30\sz, v30\sz, v4\sz // (dif >> 48) - v
smov w15, v31.b[0] // 2*ret
sub v29\sz, v29\sz, v4\sz // rng = u-v
.elseif \n == 16
add v6\sz, v2\sz, v3\sz
addv h31, v6\sz // -n + ret
ldr w4, [x0, #ALLOW_UPDATE_CDF]
smov w15, v31.h[0]
.endif
cbz w4, 0f
// update_cdf
ldrh w3, [x1, x2, lsl #1] // count = cdf[n_symbols]
movi v5\szb, #0xff
.if \n == 16
// 16 case has a lower bound that guarantees n_symbols > 2
mov w4, #-5
.else
.elseif \n == 8
mvn w14, w2
mov w4, #-4
cmn w14, #3 // set C if n_symbols <= 2
.else
// if n_symbols < 4 (or < 6 even) then
// (1 + n_symbols) >> 2 == n_symbols > 2
add w14, w2, #17 // (1 + n_symbols) + (4 << 2)
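// e.g. n_symbols == 3: (3 + 17) >> 2 == 5 == (3 > 2) + 4, consumed by the
// "neg w4, w14, lsr #2" below.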
.endif
sub_n v16, v17, v0, v1, v2, v3, \sz, \n // cdf + (i >= val ? 1 : 0)
orr v2\sz, #0x80, lsl #8
.if \n == 16
orr v3\sz, #0x80, lsl #8
.endif
urhadd_n v4, v5, v5, v5, v2, v3, \sz, \n // i >= val ? -1 : 32768
.if \n == 16
sub w4, w4, w3, lsr #4 // -((count >> 4) + 5)
.else
.elseif \n == 8
lsr w14, w3, #4 // count >> 4
sbc w4, w4, w14 // -((count >> 4) + (n_symbols > 2) + 4)
.else
neg w4, w14, lsr #2 // -((n_symbols > 2) + 4)
sub w4, w4, w3, lsr #4 // -((count >> 4) + (n_symbols > 2) + 4)
.endif
sub_n v4, v5, v4, v5, v0, v1, \sz, \n // (32768 - cdf[i]) or (-1 - cdf[i])
sub_n v2, v3, v2, v3, v0, v1, \sz, \n // (32768 - cdf[i]) or (-1 - cdf[i])
dup v6\sz, w4 // -rate
sub w3, w3, w3, lsr #5 // count - (count == 32)
sub_n v0, v1, v0, v1, v2, v3, \sz, \n // cdf + (i >= val ? 1 : 0)
sshl_n v4, v5, v4, v5, v6, v6, \sz, \n // ({32768,-1} - cdf[i]) >> rate
sshl_n v2, v3, v2, v3, v6, v6, \sz, \n // ({32768,-1} - cdf[i]) >> rate
add w3, w3, #1 // count + (count < 32)
add_n v0, v1, v0, v1, v4, v5, \sz, \n // cdf + (32768 - cdf[i]) >> rate
add_n v0, v1, v16, v17, v2, v3, \sz, \n // cdf + (32768 - cdf[i]) >> rate
st1_n v0, v1, x1, \sz, \n
strh w3, [x1, x2, lsl #1]
.endm
decode_update .4h, .8b, 4
0:
// renorm
.if \n == 4
ldr w6, [x0, #CNT]
ldr x7, [x0, #DIF]
mov x4, v29.d[0] // rng (packed)
mov x3, v4.d[0] // v (packed)
L(renorm):
add x8, sp, #16
add x8, x8, w15, uxtw #1
ldrh w3, [x8] // v
ldurh w4, [x8, #-2] // u
// Shift 'v'/'rng' for ret into the 16 least sig bits. There is
// garbage in the remaining bits, but we can work around this.
lsr x4, x4, x15 // rng
lsr x3, x3, x15 // v
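// (x15 holds 16*ret, so these shifts leave lane 'ret' of the packed
// rng/v vectors in bits 15:0)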
lsl w5, w4, #16 // rng << 16
sub x7, x7, x3, lsl #48 // dif - (v << 48)
clz w5, w5 // d = clz(rng << 16)
lsl w4, w4, w5 // rng << d
subs w6, w6, w5 // cnt -= d
lsl x7, x7, x5 // (dif - (v << 48)) << d
strh w4, [x0, #RNG]
b.lo 1f
str w6, [x0, #CNT]
str x7, [x0, #DIF]
lsr w0, w15, #4
ret
1:
lsr w15, w15, #4
b L(refill)
.elseif \n == 8
ldr w6, [x0, #CNT]
tbl v30.8b, {v30.16b}, v31.8b
tbl v29.8b, {v29.16b}, v31.8b
ins v28.h[3], v30.h[0] // dif - (v << 48)
clz v0.4h, v29.4h // d = clz(rng)
umov w5, v0.h[0]
ushl v29.4h, v29.4h, v0.4h // rng << d
// The vec for clz(rng) is filled with garbage after the first short,
// but ushl/sshl conveniently uses only the first byte for the shift
// amount.
ushl d28, d28, d0 // (dif - (v << 48)) << d
subs w6, w6, w5 // cnt -= d
str h29, [x0, #RNG]
b.lo 1f
str w6, [x0, #CNT]
str d28, [x0, #DIF]
lsr w0, w15, #1 // ret
ret
1:
lsr w15, w15, #1 // ret
mov x7, v28.d[0]
b L(refill)
.elseif \n == 16
add x8, sp, w15, sxtw #1
ldrh w3, [x8, #48] // v
ldurh w4, [x8, #46] // u
ldr w6, [x0, #CNT]
ldr x7, [x0, #DIF]
sub w4, w4, w3 // rng = u - v
clz w5, w4 // clz(rng)
eor w5, w5, #16 // d = clz(rng) ^ 16
sub x7, x7, x3, lsl #48 // dif - (v << 48)
L(renorm2):
lsl w4, w4, w5 // rng << d
subs w6, w6, w5 // cnt -= d
lsl x7, x7, x5 // (dif - (v << 48)) << d
str w4, [x0, #RNG]
b.hs 4f
add sp, sp, #48
b.lo 1f
str w6, [x0, #CNT]
str x7, [x0, #DIF]
add w0, w15, #\n // ret
ret
1:
add w15, w15, #\n // ret
b L(refill)
.endif
.endm
decode_update .4h, .8b, 4
L(refill):
// refill
ldp x3, x4, [x0] // BUF_POS, BUF_END
add x5, x3, #8
@ -243,7 +332,6 @@ L(renorm2):
str x7, [x0, #DIF]
mov w0, w15
add sp, sp, #48
ret
5: // pad_with_ones
@ -272,29 +360,26 @@ endfunc
function msac_decode_symbol_adapt8_neon, export=1
decode_update .8h, .16b, 8
b L(renorm)
endfunc
function msac_decode_symbol_adapt16_neon, export=1
decode_update .8h, .16b, 16
b L(renorm)
endfunc
function msac_decode_hi_tok_neon, export=1
ld1 {v0.4h}, [x1] // cdf
add x16, x0, #RNG
movi v31.4h, #0x7f, lsl #8 // 0x7f00
movrel x17, coeffs, 30-2*3
movrel x17, coeffs, COEFFS_BASE_OFFSET-2*3
mvni v30.4h, #0x3f // 0xffc0
ldrh w9, [x1, #6] // count = cdf[n_symbols]
ld1r {v3.4h}, [x16] // rng
ld1 {v29.4h}, [x17] // EC_MIN_PROB * (n_symbols - ret)
add x17, x0, #DIF + 6
mov w13, #-24
mov w13, #-24*8
and v17.8b, v0.8b, v30.8b // cdf & 0xffc0
ldr w10, [x0, #ALLOW_UPDATE_CDF]
ld1r {v1.8h}, [x17] // dif >> (EC_WIN_SIZE - 16)
sub sp, sp, #48
ldr w6, [x0, #CNT]
ldr x7, [x0, #DIF]
1:
@ -302,14 +387,14 @@ function msac_decode_hi_tok_neon, export=1
sqdmulh v6.4h, v17.4h, v7.4h // ((cdf >> EC_PROB_SHIFT) * (r - 128)) >> 1
add v4.4h, v17.4h, v29.4h // v = cdf + EC_MIN_PROB * (n_symbols - ret)
add v4.4h, v6.4h, v4.4h // v = ((cdf >> EC_PROB_SHIFT) * r) >> 1 + EC_MIN_PROB * (n_symbols - ret)
str h3, [sp, #14] // store original u = s->rng
cmhs v2.4h, v1.4h, v4.4h // c >= v
str q4, [sp, #16] // store v values to allow indexed access
addv h6, v2.4h // -4 + ret
add w13, w13, #5
smov w15, v6.h[0]
add x8, sp, #16
add w15, w15, #4 // ret
add w13, w13, #5*8
ext v18.8b, v3.8b, v4.8b, #6 // u
umov x15, v2.d[0]
rev x15, x15
sub v18.4h, v18.4h, v4.4h // rng = u-v
// rev + clz = count trailing zeros
clz x15, x15 // 16*ret
cbz w10, 2f
// update_cdf
@ -317,29 +402,32 @@ function msac_decode_hi_tok_neon, export=1
mov w4, #-5
orr v2.4h, #0x80, lsl #8 // i >= val ? -1 : 32768
sub w4, w4, w9, lsr #4 // -((count >> 4) + 5)
sub v4.4h, v2.4h, v0.4h // (32768 - cdf[i]) or (-1 - cdf[i])
sub v2.4h, v2.4h, v0.4h // (32768 - cdf[i]) or (-1 - cdf[i])
dup v6.4h, w4 // -rate
sub w9, w9, w9, lsr #5 // count - (count == 32)
sshl v4.4h, v4.4h, v6.4h // ({32768,-1} - cdf[i]) >> rate
sshl v2.4h, v2.4h, v6.4h // ({32768,-1} - cdf[i]) >> rate
add w9, w9, #1 // count + (count < 32)
add v0.4h, v5.4h, v4.4h // cdf[i] + (32768 - cdf[i]) >> rate
add v0.4h, v5.4h, v2.4h // cdf[i] + (32768 - cdf[i]) >> rate
st1 {v0.4h}, [x1]
and v17.8b, v0.8b, v30.8b // cdf & 0xffc0
strh w9, [x1, #6]
2:
add x8, x8, w15, uxtw #1
ldrh w3, [x8] // v
ldurh w4, [x8, #-2] // u
sub w4, w4, w3 // rng = u - v
clz w5, w4 // clz(rng)
eor w5, w5, #16 // d = clz(rng) ^ 16
mov x4, v18.d[0] // rng (packed)
mov x3, v4.d[0] // v (packed)
// Shift 'v'/'rng' for ret into the 16 least sig bits. There is
// garbage in the remaining bits, but we can work around this.
lsr x4, x4, x15 // rng
lsr x3, x3, x15 // v
lsl w5, w4, #16 // rng << 16
sub x7, x7, x3, lsl #48 // dif - (v << 48)
clz w5, w5 // d = clz(rng << 16)
lsl w4, w4, w5 // rng << d
subs w6, w6, w5 // cnt -= d
lsl x7, x7, x5 // (dif - (v << 48)) << d
str w4, [x0, #RNG]
strh w4, [x0, #RNG]
dup v3.4h, w4
b.hs 5f
@ -366,17 +454,15 @@ function msac_decode_hi_tok_neon, export=1
orr x7, x7, x8 // dif |= next_bits
5: // end
lsl w15, w15, #1
sub w15, w15, #5
sub w15, w15, #5*8
lsr x12, x7, #48
adds w13, w13, w15 // carry = tok_br < 3 || tok == 15
dup v1.8h, w12
b.cc 1b // loop if !carry
add w13, w13, #30
add w13, w13, #30*8
str w6, [x0, #CNT]
add sp, sp, #48
str x7, [x0, #DIF]
lsr w0, w13, #1
lsr w0, w13, #4
ret
6: // pad_with_ones
@ -405,7 +491,6 @@ endfunc
function msac_decode_bool_equi_neon, export=1
ldp w5, w6, [x0, #RNG] // + CNT
sub sp, sp, #48
ldr x7, [x0, #DIF]
bic w4, w5, #0xff // r &= 0xff00
add w4, w4, #8
@ -418,12 +503,20 @@ function msac_decode_bool_equi_neon, export=1
clz w5, w4 // clz(rng)
eor w5, w5, #16 // d = clz(rng) ^ 16
b L(renorm2)
lsl w4, w4, w5 // rng << d
subs w6, w6, w5 // cnt -= d
lsl x7, x7, x5 // (dif - (v << 48)) << d
str w4, [x0, #RNG]
b.lo L(refill)
str w6, [x0, #CNT]
str x7, [x0, #DIF]
mov w0, w15
ret
endfunc
function msac_decode_bool_neon, export=1
ldp w5, w6, [x0, #RNG] // + CNT
sub sp, sp, #48
ldr x7, [x0, #DIF]
lsr w4, w5, #8 // r >> 8
bic w1, w1, #0x3f // f &= ~63
@ -438,13 +531,21 @@ function msac_decode_bool_neon, export=1
clz w5, w4 // clz(rng)
eor w5, w5, #16 // d = clz(rng) ^ 16
b L(renorm2)
lsl w4, w4, w5 // rng << d
subs w6, w6, w5 // cnt -= d
lsl x7, x7, x5 // (dif - (v << 48)) << d
str w4, [x0, #RNG]
b.lo L(refill)
str w6, [x0, #CNT]
str x7, [x0, #DIF]
mov w0, w15
ret
endfunc
function msac_decode_bool_adapt_neon, export=1
ldr w9, [x1] // cdf[0-1]
ldp w5, w6, [x0, #RNG] // + CNT
sub sp, sp, #48
ldr x7, [x0, #DIF]
lsr w4, w5, #8 // r >> 8
and w2, w9, #0xffc0 // f &= ~63
@ -462,7 +563,7 @@ function msac_decode_bool_adapt_neon, export=1
clz w5, w4 // clz(rng)
eor w5, w5, #16 // d = clz(rng) ^ 16
cbz w10, L(renorm2)
cbz w10, 1f
lsr w2, w9, #16 // count = cdf[1]
and w9, w9, #0xffff // cdf[0]
@ -480,5 +581,15 @@ function msac_decode_bool_adapt_neon, export=1
strh w9, [x1]
strh w10, [x1, #2]
b L(renorm2)
1:
lsl w4, w4, w5 // rng << d
subs w6, w6, w5 // cnt -= d
lsl x7, x7, x5 // (dif - (v << 48)) << d
str w4, [x0, #RNG]
b.lo L(refill)
str w6, [x0, #CNT]
str x7, [x0, #DIF]
mov w0, w15
ret
endfunc

@ -62,6 +62,7 @@
decl_8tap_fns(neon);
decl_8tap_fns(neon_dotprod);
decl_8tap_fns(neon_i8mm);
decl_mc_fn(BF(dav1d_put_bilin, neon));
decl_mct_fn(BF(dav1d_prep_bilin, neon));
@ -109,11 +110,17 @@ static ALWAYS_INLINE void mc_dsp_init_arm(Dav1dMCDSPContext *const c) {
c->warp8x8t = BF(dav1d_warp_affine_8x8t, neon);
c->emu_edge = BF(dav1d_emu_edge, neon);
#if ARCH_AARCH64
#if HAVE_DOTPROD && BITDEPTH == 8
#if ARCH_AARCH64 && BITDEPTH == 8
#if HAVE_DOTPROD
if (!(flags & DAV1D_ARM_CPU_FLAG_DOTPROD)) return;
init_8tap_fns(neon_dotprod);
#endif // HAVE_DOTPROD && BITDEPTH == 8
#endif // ARCH_AARCH64
#endif // HAVE_DOTPROD
#if HAVE_I8MM
if (!(flags & DAV1D_ARM_CPU_FLAG_I8MM)) return;
init_8tap_fns(neon_i8mm);
#endif // HAVE_I8MM
#endif // ARCH_AARCH64 && BITDEPTH == 8
}

@ -82,6 +82,9 @@ static ALWAYS_INLINE unsigned dav1d_get_cpu_flags(void) {
#if defined(__VSX__)
flags |= DAV1D_PPC_CPU_FLAG_VSX;
#endif
#if defined(__POWER9_VECTOR__)
flags |= DAV1D_PPC_CPU_FLAG_PWR9;
#endif
#elif ARCH_RISCV
#if defined(__riscv_v)
flags |= DAV1D_RISCV_CPU_FLAG_V;

@ -1162,7 +1162,7 @@ static int decode_b(Dav1dTaskContext *const t,
ts->cdf.m.use_filter_intra[bs]);
if (is_filter) {
b->y_mode = FILTER_PRED;
b->y_angle = dav1d_msac_decode_symbol_adapt4(&ts->msac,
b->y_angle = dav1d_msac_decode_symbol_adapt8(&ts->msac,
ts->cdf.m.filter_intra, 4);
}
if (DEBUG_BLOCK_INFO)

@ -232,7 +232,7 @@ DECLARE_REG_TMP_SIZE 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
%elif PIC
call $+5 ; special-cased to not affect the RSB on most CPU:s
pop %1
add %1, (%2)-$+1
add %1, -$+1+%2
%else
mov %1, %2
%endif
@ -864,16 +864,16 @@ BRANCH_INSTR jz, je, jnz, jne, jl, jle, jnl, jnle, jg, jge, jng, jnge, ja, jae,
%macro cextern 1
%xdefine %1 mangle(private_prefix %+ _ %+ %1)
CAT_XDEFINE cglobaled_, %1, 1
CAT_XDEFINE cglobaled_, %1, 2
extern %1
%endmacro
; like cextern, but without the prefix
; Like cextern, but without the prefix. This should be used for symbols from external libraries.
%macro cextern_naked 1
%ifdef PREFIX
%xdefine %1 mangle(%1)
%endif
CAT_XDEFINE cglobaled_, %1, 1
CAT_XDEFINE cglobaled_, %1, 3
extern %1
%endmacro
@ -1268,12 +1268,27 @@ INIT_XMM
%endmacro
%macro call_internal 2
%xdefine %%i %2
%define %%j %%i
%ifndef cglobaled_%2
%ifdef cglobaled_%1
%xdefine %%i %1
%endif
%elif FORMAT_ELF
%if ARCH_X86_64
%if cglobaled_%2 >= 2
; Always emit PLT relocations when calling external functions;
; the linker will eliminate unnecessary PLT indirections anyway.
%define %%j %%i wrt ..plt
%endif
call %%i
%elif PIC && cglobaled_%2 == 3
; Go through the GOT for functions declared using cextern_naked with
; PIC, as such functions presumably exist in external libraries.
extern _GLOBAL_OFFSET_TABLE_
LEA eax, $$+_GLOBAL_OFFSET_TABLE_ wrt ..gotpc
%define %%j [eax+%%i wrt ..got]
%endif
%endif
call %%j
LOAD_MM_PERMUTATION %%i
%endmacro

@ -263,7 +263,6 @@ COLD int dav1d_open(Dav1dContext **const c_out, const Dav1dSettings *const s) {
f->c = c;
f->task_thread.ttd = &c->task_thread;
f->lf.last_sharpness = -1;
dav1d_refmvs_init(&f->rf);
}
for (unsigned m = 0; m < c->n_tc; m++) {
@ -664,7 +663,7 @@ static COLD void close_internal(Dav1dContext **const c_out, int flush) {
dav1d_free(f->lf.lr_mask);
dav1d_free(f->lf.tx_lpf_right_edge[0]);
dav1d_free(f->lf.start_of_tile_row);
dav1d_refmvs_clear(&f->rf);
dav1d_free_aligned(f->rf.r);
dav1d_free_aligned(f->lf.cdef_line_buf);
dav1d_free_aligned(f->lf.lr_line_buf);
}

@ -249,6 +249,8 @@ static void loop_filter_v_sb128uv_c(pixel *dst, const ptrdiff_t stride,
#include "src/arm/loopfilter.h"
#elif ARCH_LOONGARCH64
#include "src/loongarch/loopfilter.h"
#elif ARCH_PPC64LE
#include "src/ppc/loopfilter.h"
#elif ARCH_X86
#include "src/x86/loopfilter.h"
#endif
@ -265,6 +267,8 @@ COLD void bitfn(dav1d_loop_filter_dsp_init)(Dav1dLoopFilterDSPContext *const c)
loop_filter_dsp_init_arm(c);
#elif ARCH_LOONGARCH64
loop_filter_dsp_init_loongarch(c);
#elif ARCH_PPC64LE
loop_filter_dsp_init_ppc(c);
#elif ARCH_X86
loop_filter_dsp_init_x86(c);
#endif

@ -73,14 +73,14 @@ libdav1d_tmpl_sources = files(
'recon_tmpl.c',
)
libdav1d_arch_tmpl_sources = []
libdav1d_arch_tmpl_sources = {}
libdav1d_bitdepth_objs = []
# ASM specific sources
libdav1d_asm_objs = []
# Arch-specific flags
arch_flags = []
arch_flags = {}
if is_asm_enabled
if (host_machine.cpu_family() == 'aarch64' or
host_machine.cpu_family().startswith('arm'))
@ -232,9 +232,9 @@ if is_asm_enabled
'loongarch/cpu.c',
)
libdav1d_arch_tmpl_sources += files(
libdav1d_arch_tmpl_sources += {'default': files(
'loongarch/looprestoration_tmpl.c',
)
)}
libdav1d_sources_asm = files(
'loongarch/mc.S',
@ -246,20 +246,25 @@ if is_asm_enabled
)
libdav1d_asm_objs += libdav1d_sources_asm
elif host_machine.cpu() == 'ppc64le'
arch_flags = ['-maltivec', '-mvsx']
arch_flags += {'vsx': ['-maltivec', '-mvsx', '-DDAV1D_VSX']}
libdav1d_sources += files(
'ppc/cpu.c',
)
libdav1d_arch_tmpl_sources += files(
libdav1d_arch_tmpl_sources += {'vsx': files(
'ppc/cdef_tmpl.c',
'ppc/looprestoration_tmpl.c',
)
)}
arch_flags += {'pwr9': ['-mcpu=power9', '-DDAV1D_PWR9']}
libdav1d_arch_tmpl_sources += {'pwr9': files(
'ppc/loopfilter_tmpl.c',
)}
elif host_machine.cpu_family().startswith('riscv')
libdav1d_sources += files(
'riscv/cpu.c',
)
if host_machine.cpu_family() == 'riscv64'
libdav1d_sources += files(
'riscv/64/cpu.S',
'riscv/64/itx.S',
)
endif
@ -320,15 +325,17 @@ endforeach
# Helper library for each bitdepth and architecture-specific flags
foreach bitdepth : dav1d_bitdepths
foreach subarch : libdav1d_arch_tmpl_sources.keys()
libdav1d_bitdepth_objs += static_library(
'dav1d_arch_bitdepth_@0@'.format(bitdepth),
libdav1d_arch_tmpl_sources, config_h_target,
'dav1d_arch_bitdepth_@0@_@1@'.format(bitdepth,subarch),
libdav1d_arch_tmpl_sources[subarch], config_h_target,
include_directories: dav1d_inc_dirs,
dependencies : [stdatomic_dependencies],
c_args : ['-DBITDEPTH=@0@'.format(bitdepth)] + libdav1d_flags + arch_flags,
c_args : ['-DBITDEPTH=@0@'.format(bitdepth)] + libdav1d_flags + arch_flags.get(subarch, []),
install : false,
build_by_default : false,
).extract_all_objects(recursive: true)
endforeach
endforeach
# The final dav1d library

@ -68,7 +68,7 @@ unsigned dav1d_msac_decode_bool_c(MsacContext *s, unsigned f);
unsigned dav1d_msac_decode_hi_tok_c(MsacContext *s, uint16_t *cdf);
int dav1d_msac_decode_subexp(MsacContext *s, int ref, int n, unsigned k);
/* Supported n_symbols ranges: adapt4: 1-4, adapt8: 1-7, adapt16: 3-15 */
/* Supported n_symbols ranges: adapt4: 1-3, adapt8: 1-7, adapt16: 3-15 */
#ifndef dav1d_msac_decode_symbol_adapt4
#define dav1d_msac_decode_symbol_adapt4 dav1d_msac_decode_symbol_adapt_c
#endif
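The tightened adapt4 range (1-3, previously 1-4) is what motivates the call-site changes elsewhere in this commit: decode.c and recon_tmpl.c switch to dav1d_msac_decode_symbol_adapt8 where 4 is passed as n_symbols. A minimal sketch of dispatching on the documented ranges, assuming the usual dav1d signature (MsacContext *s, uint16_t *cdf, size_t n_symbols) from this header; the wrapper name msac_decode_symbol_any is hypothetical:

/* Hypothetical dispatcher, for illustration only: pick the variant whose
 * documented n_symbols range covers the request. Assumes "src/msac.h". */
static unsigned msac_decode_symbol_any(MsacContext *const s, uint16_t *const cdf,
                                       const size_t n_symbols)
{
    if (n_symbols <= 3)  /* adapt4 now only guarantees 1-3 */
        return dav1d_msac_decode_symbol_adapt4(s, cdf, n_symbols);
    if (n_symbols <= 7)  /* 4-symbol CDFs such as filter_intra land here */
        return dav1d_msac_decode_symbol_adapt8(s, cdf, n_symbols);
    return dav1d_msac_decode_symbol_adapt16(s, cdf, n_symbols);
}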

@ -40,12 +40,16 @@ COLD unsigned dav1d_get_cpu_flags_ppc(void) {
unsigned flags = 0;
#if defined(HAVE_GETAUXVAL) && ARCH_PPC64LE
unsigned long hw_cap = getauxval(AT_HWCAP);
unsigned long hw_cap2 = getauxval(AT_HWCAP2);
#elif defined(HAVE_ELF_AUX_INFO) && ARCH_PPC64LE
unsigned long hw_cap = 0;
unsigned long hw_cap2 = 0;
elf_aux_info(AT_HWCAP, &hw_cap, sizeof(hw_cap));
elf_aux_info(AT_HWCAP2, &hw_cap2, sizeof(hw_cap2));
#endif
#ifdef HAVE_AUX
flags |= (hw_cap & PPC_FEATURE_HAS_VSX) ? DAV1D_PPC_CPU_FLAG_VSX : 0;
flags |= (hw_cap2 & PPC_FEATURE2_ARCH_3_00) ? DAV1D_PPC_CPU_FLAG_PWR9 : 0;
#endif
return flags;
}

@ -30,6 +30,7 @@
enum CpuFlags {
DAV1D_PPC_CPU_FLAG_VSX = 1 << 0,
DAV1D_PPC_CPU_FLAG_PWR9 = 1 << 1,
};
unsigned dav1d_get_cpu_flags_ppc(void);

@ -44,6 +44,10 @@
#define i64x2 vector signed long long
#define b64x2 vector bool long long
#define i8h_to_i16(v) ((i16x8) vec_unpackh((i8x16)v))
#define i8l_to_i16(v) ((i16x8) vec_unpackl((i8x16)v))
#define u8h_to_i16(v) ((i16x8) vec_mergeh((u8x16) v, vec_splat_u8(0)))
#define u8l_to_i16(v) ((i16x8) vec_mergel((u8x16) v, vec_splat_u8(0)))
#define u8h_to_u16(v) ((u16x8) vec_mergeh((u8x16) v, vec_splat_u8(0)))
#define u8l_to_u16(v) ((u16x8) vec_mergel((u8x16) v, vec_splat_u8(0)))
#define u16h_to_i32(v) ((i32x4) vec_mergeh((u16x8) v, vec_splat_u16(0)))

third_party/dav1d/src/ppc/loopfilter.h vendored Normal file
@ -0,0 +1,47 @@
/*
* Copyright © 2018, VideoLAN and dav1d authors
* Copyright © 2018, Two Orioles, LLC
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
* ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "src/cpu.h"
#include "src/loopfilter.h"
decl_loopfilter_sb_fn(BF(dav1d_lpf_h_sb_y, pwr9));
decl_loopfilter_sb_fn(BF(dav1d_lpf_v_sb_y, pwr9));
decl_loopfilter_sb_fn(BF(dav1d_lpf_h_sb_uv, pwr9));
decl_loopfilter_sb_fn(BF(dav1d_lpf_v_sb_uv, pwr9));
static ALWAYS_INLINE void loop_filter_dsp_init_ppc(Dav1dLoopFilterDSPContext *const c) {
const unsigned flags = dav1d_get_cpu_flags();
if (!(flags & DAV1D_PPC_CPU_FLAG_PWR9)) return;
#if BITDEPTH == 8
c->loop_filter_sb[0][0] = BF(dav1d_lpf_h_sb_y, pwr9);
c->loop_filter_sb[0][1] = BF(dav1d_lpf_v_sb_y, pwr9);
c->loop_filter_sb[1][0] = BF(dav1d_lpf_h_sb_uv, pwr9);
c->loop_filter_sb[1][1] = BF(dav1d_lpf_v_sb_uv, pwr9);
#endif
}

File diff suppressed because it is too large

@ -369,7 +369,7 @@ static int decode_coefs(Dav1dTaskContext *const t,
const enum IntraPredMode y_mode_nofilt = b->y_mode == FILTER_PRED ?
dav1d_filter_mode_to_y_mode[b->y_angle] : b->y_mode;
if (f->frame_hdr->reduced_txtp_set || t_dim->min == TX_16X16) {
idx = dav1d_msac_decode_symbol_adapt4(&ts->msac,
idx = dav1d_msac_decode_symbol_adapt8(&ts->msac,
ts->cdf.m.txtp_intra2[t_dim->min][y_mode_nofilt], 4);
*txtp = dav1d_tx_types_per_set[idx + 0];
} else {
@ -412,7 +412,7 @@ static int decode_coefs(Dav1dTaskContext *const t,
eob_bin = dav1d_msac_decode_symbol_adapt##ns(&ts->msac, eob_bin_cdf, 4 + sz); \
break; \
}
case_sz(0, 16, 4, [is_1d]);
case_sz(0, 16, 8, [is_1d]);
case_sz(1, 32, 8, [is_1d]);
case_sz(2, 64, 8, [is_1d]);
case_sz(3, 128, 8, [is_1d]);

@ -657,19 +657,19 @@ void dav1d_refmvs_tile_sbrow_init(refmvs_tile *const rt, const refmvs_frame *con
{
if (rf->n_tile_threads == 1) tile_row_idx = 0;
rt->rp_proj = &rf->rp_proj[16 * rf->rp_stride * tile_row_idx];
const int uses_2pass = rf->n_tile_threads > 1 && rf->n_frame_threads > 1;
const ptrdiff_t pass_off = (uses_2pass && pass == 2) ?
35 * rf->r_stride * rf->n_tile_rows : 0;
refmvs_block *r = &rf->r[35 * rf->r_stride * tile_row_idx + pass_off];
const ptrdiff_t r_stride = rf->rp_stride * 2;
const ptrdiff_t pass_off = (rf->n_frame_threads > 1 && pass == 2) ?
35 * 2 * rf->n_blocks : 0;
refmvs_block *r = &rf->r[35 * r_stride * tile_row_idx + pass_off];
const int sbsz = rf->sbsz;
const int off = (sbsz * sby) & 16;
for (int i = 0; i < sbsz; i++, r += rf->r_stride)
for (int i = 0; i < sbsz; i++, r += r_stride)
rt->r[off + 5 + i] = r;
rt->r[off + 0] = r;
r += rf->r_stride;
r += r_stride;
rt->r[off + 1] = NULL;
rt->r[off + 2] = r;
r += rf->r_stride;
r += r_stride;
rt->r[off + 3] = NULL;
rt->r[off + 4] = r;
if (sby & 1) {
@ -805,37 +805,37 @@ int dav1d_refmvs_init_frame(refmvs_frame *const rf,
/*const*/ refmvs_temporal_block *const rp_ref[7],
const int n_tile_threads, const int n_frame_threads)
{
const int rp_stride = ((frm_hdr->width[0] + 127) & ~127) >> 3;
const int n_tile_rows = n_tile_threads > 1 ? frm_hdr->tiling.rows : 1;
const int n_blocks = rp_stride * n_tile_rows;
rf->sbsz = 16 << seq_hdr->sb128;
rf->frm_hdr = frm_hdr;
rf->iw8 = (frm_hdr->width[0] + 7) >> 3;
rf->ih8 = (frm_hdr->height + 7) >> 3;
rf->iw4 = rf->iw8 << 1;
rf->ih4 = rf->ih8 << 1;
const ptrdiff_t r_stride = ((frm_hdr->width[0] + 127) & ~127) >> 2;
const int n_tile_rows = n_tile_threads > 1 ? frm_hdr->tiling.rows : 1;
if (r_stride != rf->r_stride || n_tile_rows != rf->n_tile_rows) {
if (rf->r) dav1d_freep_aligned(&rf->r);
const int uses_2pass = n_tile_threads > 1 && n_frame_threads > 1;
/* sizeof(refmvs_block) == 12 but it's accessed using 16-byte loads in asm,
* so add 4 bytes of padding to avoid buffer overreads. */
rf->r = dav1d_alloc_aligned(ALLOC_REFMVS, sizeof(*rf->r) * 35 * r_stride * n_tile_rows * (1 + uses_2pass) + 4, 64);
if (!rf->r) return DAV1D_ERR(ENOMEM);
rf->r_stride = r_stride;
}
const ptrdiff_t rp_stride = r_stride >> 1;
if (rp_stride != rf->rp_stride || n_tile_rows != rf->n_tile_rows) {
if (rf->rp_proj) dav1d_freep_aligned(&rf->rp_proj);
rf->rp_proj = dav1d_alloc_aligned(ALLOC_REFMVS, sizeof(*rf->rp_proj) * 16 * rp_stride * n_tile_rows, 64);
if (!rf->rp_proj) return DAV1D_ERR(ENOMEM);
rf->rp = rp;
rf->rp_stride = rp_stride;
}
rf->n_tile_rows = n_tile_rows;
rf->n_tile_threads = n_tile_threads;
rf->n_frame_threads = n_frame_threads;
rf->rp = rp;
rf->rp_ref = rp_ref;
if (n_blocks != rf->n_blocks) {
const size_t r_sz = sizeof(*rf->r) * 35 * 2 * n_blocks * (1 + (n_frame_threads > 1));
const size_t rp_proj_sz = sizeof(*rf->rp_proj) * 16 * n_blocks;
/* Note that sizeof(*rf->r) == 12, but it's accessed using 16-byte unaligned
* loads in save_tmvs() asm which can overread 4 bytes into rp_proj. */
dav1d_free_aligned(rf->r);
rf->r = dav1d_alloc_aligned(ALLOC_REFMVS, r_sz + rp_proj_sz, 64);
if (!rf->r) {
rf->n_blocks = 0;
return DAV1D_ERR(ENOMEM);
}
rf->rp_proj = (refmvs_temporal_block*)((uintptr_t)rf->r + r_sz);
rf->n_blocks = n_blocks;
}
const unsigned poc = frm_hdr->frame_offset;
for (int i = 0; i < 7; i++) {
const int poc_diff = get_poc_diff(seq_hdr->order_hint_n_bits,
@ -848,6 +848,7 @@ int dav1d_refmvs_init_frame(refmvs_frame *const rf,
// temporal MV setup
rf->n_mfmvs = 0;
rf->rp_ref = rp_ref;
if (frm_hdr->use_ref_frame_mvs && seq_hdr->order_hint_n_bits) {
int total = 2;
if (rp_ref[0] && ref_ref_poc[0][6] != ref_poc[3] /* alt-of-last != gold */) {
@ -896,18 +897,6 @@ int dav1d_refmvs_init_frame(refmvs_frame *const rf,
return 0;
}
void dav1d_refmvs_init(refmvs_frame *const rf) {
rf->r = NULL;
rf->r_stride = 0;
rf->rp_proj = NULL;
rf->rp_stride = 0;
}
void dav1d_refmvs_clear(refmvs_frame *const rf) {
if (rf->r) dav1d_freep_aligned(&rf->r);
if (rf->rp_proj) dav1d_freep_aligned(&rf->rp_proj);
}
static void splat_mv_c(refmvs_block **rr, const refmvs_block *const rmv,
const int bx4, const int bw4, int bh4)
{

@ -72,14 +72,14 @@ typedef struct refmvs_frame {
int mfmv_ref2ref[3][7];
int n_mfmvs;
int n_blocks;
refmvs_temporal_block *rp;
/*const*/ refmvs_temporal_block *const *rp_ref;
refmvs_temporal_block *rp_proj;
ptrdiff_t rp_stride;
refmvs_block *r; // 35 x r_stride memory
ptrdiff_t r_stride;
int n_tile_rows, n_tile_threads, n_frame_threads;
int n_tile_threads, n_frame_threads;
} refmvs_frame;
typedef struct refmvs_tile {
@ -121,10 +121,6 @@ typedef struct Dav1dRefmvsDSPContext {
splat_mv_fn splat_mv;
} Dav1dRefmvsDSPContext;
// call once per frame thread
void dav1d_refmvs_init(refmvs_frame *rf);
void dav1d_refmvs_clear(refmvs_frame *rf);
// call once per frame
int dav1d_refmvs_init_frame(refmvs_frame *rf,
const Dav1dSequenceHeader *seq_hdr,

third_party/dav1d/src/riscv/64/cpu.S vendored Normal file
@ -0,0 +1,44 @@
/******************************************************************************
* Copyright © 2018, VideoLAN and dav1d authors
* Copyright © 2024, Nathan Egge
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
* ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include "src/riscv/asm.S"
// This function detects non-compliant RVV 0.7.1 hardware which reports support
// for the V extension through HWCAP, by intentionally setting tail and mask
// agnostic vector configurations that were only introduced in RVV 0.9 spec.
// Existing non-compliant (pre RVV 1.0) hardware will set the VILL bit in VTYPE
// (indicating an illegal vector configuration) which is stored in the XLEN-1
// bit position, thus a simple sign check is sufficient for detection.
// This workaround is inexpensive and harmless on compliant hardware, but we
// should still consider removing it once all non-compliant RVV 0.7.1 hardware
// is out of service.
function has_compliant_rvv, export=1, ext=v
vsetvli t0, zero, e8, m1, ta, ma
csrr a0, vtype
sgtz a0, a0
ret
endfunc

@ -38,11 +38,13 @@
#endif
int dav1d_has_compliant_rvv(void);
COLD unsigned dav1d_get_cpu_flags_riscv(void) {
unsigned flags = 0;
#if defined(HAVE_GETAUXVAL)
unsigned long hw_cap = getauxval(AT_HWCAP);
flags |= (hw_cap & HWCAP_RVV) ? DAV1D_RISCV_CPU_FLAG_V : 0;
flags |= (hw_cap & HWCAP_RVV) && dav1d_has_compliant_rvv() ? DAV1D_RISCV_CPU_FLAG_V : 0;
#endif
return flags;

@ -29,7 +29,6 @@
#include "src/mc.h"
#define decl_fn(type, name) \
decl_##type##_fn(BF(name, sse2)); \
decl_##type##_fn(BF(name, ssse3)); \
decl_##type##_fn(BF(name, avx2)); \
decl_##type##_fn(BF(name, avx512icl));
@ -108,25 +107,6 @@ decl_fn(resize, dav1d_resize);
static ALWAYS_INLINE void mc_dsp_init_x86(Dav1dMCDSPContext *const c) {
const unsigned flags = dav1d_get_cpu_flags();
if(!(flags & DAV1D_X86_CPU_FLAG_SSE2))
return;
#if BITDEPTH == 8
init_mct_fn(FILTER_2D_BILINEAR, bilin, sse2);
init_mct_fn(FILTER_2D_8TAP_REGULAR, 8tap_regular, sse2);
init_mct_fn(FILTER_2D_8TAP_REGULAR_SMOOTH, 8tap_regular_smooth, sse2);
init_mct_fn(FILTER_2D_8TAP_REGULAR_SHARP, 8tap_regular_sharp, sse2);
init_mct_fn(FILTER_2D_8TAP_SMOOTH_REGULAR, 8tap_smooth_regular, sse2);
init_mct_fn(FILTER_2D_8TAP_SMOOTH, 8tap_smooth, sse2);
init_mct_fn(FILTER_2D_8TAP_SMOOTH_SHARP, 8tap_smooth_sharp, sse2);
init_mct_fn(FILTER_2D_8TAP_SHARP_REGULAR, 8tap_sharp_regular, sse2);
init_mct_fn(FILTER_2D_8TAP_SHARP_SMOOTH, 8tap_sharp_smooth, sse2);
init_mct_fn(FILTER_2D_8TAP_SHARP, 8tap_sharp, sse2);
c->warp8x8 = BF(dav1d_warp_affine_8x8, sse2);
c->warp8x8t = BF(dav1d_warp_affine_8x8t, sse2);
#endif
if(!(flags & DAV1D_X86_CPU_FLAG_SSSE3))
return;

File diff suppressed because it is too large

File diff suppressed because it is too large

@ -92,6 +92,31 @@ JMP_TABLE splat_mv_avx2, 1, 2, 4, 8, 16, 32
JMP_TABLE splat_mv_sse2, 1, 2, 4, 8, 16, 32
struc rf
.frm_hdr: resq 1
.iw4: resd 1
.ih4: resd 1
.iw8: resd 1
.ih8: resd 1
.sbsz: resd 1
.use_rf_mvs: resd 1
.sign_bias: resb 7
.mfmv_sign: resb 7
.pocdiff: resb 7
.mfmv_ref: resb 3
.mfmv_ref2cur: resd 3
.mfmv_ref2ref: resd 3*7
.n_mfmvs: resd 1
.n_blocks: resd 1
.rp: resq 1
.rp_ref: resq 1
.rp_proj: resq 1
.rp_stride: resq 1
.r: resq 1
.n_tile_threads: resd 1
.n_frame_threads: resd 1
endstruc
SECTION .text
%macro movif32 2
@ -341,16 +366,16 @@ cglobal load_tmvs, 6, 15, 4, -0x50, rf, tridx, xstart, xend, ystart, yend, \
stride, rp_proj, roff, troff, \
xendi, xstarti, iw8, ih8, dst
xor r14d, r14d
cmp dword [rfq+212], 1 ; n_tile_threads
mov ih8d, [rfq+20] ; rf->ih8
mov iw8d, [rfq+16] ; rf->iw8
cmp dword [rfq+rf.n_tile_threads], 1
mov ih8d, [rfq+rf.ih8]
mov iw8d, [rfq+rf.iw8]
mov xstartd, xstartd
mov xendd, xendd
cmove tridxd, r14d
lea xstartid, [xstartq-8]
lea xendid, [xendq+8]
mov strideq, [rfq+184]
mov rp_projq, [rfq+176]
mov strideq, [rfq+rf.rp_stride]
mov rp_projq, [rfq+rf.rp_proj]
cmp ih8d, yendd
mov [rsp+0x30], strideq
cmovs yendd, ih8d
@ -397,7 +422,7 @@ cglobal load_tmvs, 6, 15, 4, -0x50, rf, tridx, xstart, xend, ystart, yend, \
jg .init_xloop_start
DEFINE_ARGS rf, _, xstart, xend, ystart, yend, n7, stride, \
_, _, xendi, xstarti, stride5, _, n
mov r13d, [rfq+152] ; rf->n_mfmvs
mov r13d, [rfq+rf.n_mfmvs]
test r13d, r13d
jz .ret
mov [rsp+0x0c], r13d
@ -418,14 +443,14 @@ cglobal load_tmvs, 6, 15, 4, -0x50, rf, tridx, xstart, xend, ystart, yend, \
DEFINE_ARGS y, off, xstart, xend, ystart, rf, n7, refsign, \
ref, rp_ref, xendi, xstarti, _, _, n
mov rfq, [rsp+0x48]
mov refd, [rfq+56+nq*4] ; ref2cur
mov refd, [rfq+rf.mfmv_ref2cur+nq*4]
cmp refd, 0x80000000
je .next_n
mov [rsp+0x40], refd
mov offq, [rsp+0x00] ; ystart * stride * 5
movzx refd, byte [rfq+53+nq] ; rf->mfmv_ref[n]
movzx refd, byte [rfq+rf.mfmv_ref+nq]
lea refsignq, [refq-4]
mov rp_refq, [rfq+168]
mov rp_refq, [rfq+rf.rp_ref]
movq m2, refsignq
add offq, [rp_refq+refq*8] ; r = rp_ref[ref] + row_offset
mov [rsp+0x14], nd
@ -452,8 +477,8 @@ cglobal load_tmvs, 6, 15, 4, -0x50, rf, tridx, xstart, xend, ystart, yend, \
test refd, refd
jz .next_x_bad_ref
mov rfq, [rsp+0x48]
lea r14d, [16+n7q+refq]
mov ref2refd, [rfq+r14*4] ; rf->mfmv_ref2ref[n][b_ref-1]
lea ref2refd, [(rf.mfmv_ref2ref/4)+n7q+refq-1]
mov ref2refd, [rfq+ref2refq*4] ; rf->mfmv_ref2ref[n][b_ref-1]
test ref2refd, ref2refd
jz .next_x_bad_ref
lea fracq, [mv_proj]

@ -131,7 +131,7 @@ else
mapfile -t dirs < <(printf "${ARGON_DIR}/%s\n" "$@" | sort -u)
fi
ver_info="dav1d $("$DAV1D" -v 2>&1) filmgrain=$FILMGRAIN cpumask=$CPUMASK" || error "Error! Can't run $DAV1D"
ver_info="dav1d $("$DAV1D" --filmgrain "$FILMGRAIN" --cpumask "$CPUMASK" --threads "$THREADS" -v 2>&1) filmgrain=$FILMGRAIN cpumask=$CPUMASK" || error "Error! Can't run $DAV1D"
files=()
for d in "${dirs[@]}"; do