refmvs: Shrink mfmv_ref arrays

Includes updates to load_tmvs() asm implementations.
This commit is contained in:
Henrik Gramner
2025-05-26 18:47:16 +02:00
parent d877915ffe
commit 953f746352
7 changed files with 43 additions and 51 deletions
+7 -7
View File
@@ -386,20 +386,20 @@ function load_tmvs_neon, export=1
movrel x1, div_mult_tbl
10: // nloop
ldr w16, [x29, x10, lsl #2] // ref2cur = rf->mfmv_ref2cur[n]
cmp w16, #-32 // instead of INT_MIN, we can use smaller constants
b.lt 9f // if (ref2cur == INT_MIN) continue
ldrsb w16, [x29, x10] // ref2cur = rf->mfmv_ref2cur[n]
cmp w16, #-32
b.eq 9f // if (ref2cur == INVALID_REF2CUR) continue
add x17, x10, #(RMVSF_MFMV_REF - RMVSF_MFMV_REF2CUR) // n + (&rf->mfmv_ref - &rf->mfmv_ref2cur)
mov x20, #4
ldrb w17, [x29, x17] // ref = rf->mfmv_ref[n]
ldr x13, [x29, #(RMVSF_RP_REF - RMVSF_MFMV_REF2CUR)]
mov w28, #28 // 7 * sizeof(int)
sub x21, x10, x10, lsl #3 // -(n * 7)
smaddl x20, row_start8, wstride5, x20 // row_start8 * stride * sizeof(refmvs_temporal_block) + 4
mov w12, row_start8 // y = row_start8
add x21, x29, #(RMVSF_MFMV_REF2REF - RMVSF_MFMV_REF2CUR - 4) // &rf->mfmv_ref2ref - 1
add x28, x29, #(RMVSF_MFMV_REF2REF - RMVSF_MFMV_REF2CUR - 1) // &rf->mfmv_ref2ref - 1
ldr x13, [x13, x17, lsl #3] // rf->rp_ref[ref]
smaddl x28, w28, w10, x21 // rf->mfmv_ref2ref[n] - 1
sub x28, x28, x21 // rf->mfmv_ref2ref[n] - 1
sub w17, w17, #4 // ref_sign = ref - 4
add x13, x13, x20 // r = &rf->rp_ref[ref][row_start8 * stride].ref
dup v0.2s, w17 // ref_sign
@@ -418,7 +418,7 @@ function load_tmvs_neon, export=1
ldrb w22, [x23, x11] // b_ref = rb->ref
cbz w22, 6f // if (!b_ref) continue
ldr w24, [x28, x22, lsl #2] // ref2ref = rf->mfmv_ref2ref[n][b_ref - 1]
ldrb w24, [x28, x22] // ref2ref = rf->mfmv_ref2ref[n][b_ref - 1]
cbz w24, 6f // if (!ref2ref) continue
ldrh w20, [x1, x24, lsl #1] // div_mult[ref2ref]
+6 -6
View File
@@ -47,12 +47,12 @@
#define RMVSF_IH8 20
#define RMVSF_MFMV_REF 53
#define RMVSF_MFMV_REF2CUR 56
#define RMVSF_MFMV_REF2REF 68
#define RMVSF_N_MFMVS 152
#define RMVSF_RP_REF 168
#define RMVSF_RP_PROJ 176
#define RMVSF_RP_STRIDE 184
#define RMVSF_N_TILE_THREADS 200
#define RMVSF_MFMV_REF2REF 59
#define RMVSF_N_MFMVS 80
#define RMVSF_RP_REF 96
#define RMVSF_RP_PROJ 104
#define RMVSF_RP_STRIDE 112
#define RMVSF_N_TILE_THREADS 128
#endif
#endif /* ARM_ASM_OFFSETS_H */
+12 -19
View File
@@ -202,11 +202,11 @@ function load_tmvs_lsx
st.d s8, sp, 64
vld vr16, a0, 16
vld vr0, a0, 52 // rf->mfmv_ref
ld.w s8, a0, 152 // [0] - rf->n_mfmvs
vld vr17, a0, 168 // [0] - rp_ref| [1]- rp_proj
ld.d t1, a0, 184 // stride
ld.w t0, a0, 200
vld vr0, a0, 48 // rf->mfmv_ref, rf->mfmv_ref2cur
ld.w s8, a0, 80 // [0] - rf->n_mfmvs
vld vr17, a0, 96 // [0] - rp_ref | [1] - rp_proj
ld.d t1, a0, 112 // stride
ld.w t0, a0, 128
addi.w t0, t0, -1
bnez t0, 1f
addi.w a1, zero, 0
@@ -248,8 +248,7 @@ function load_tmvs_lsx
mul.w t6, a4, t2
addi.d s7, zero, 40
vpickve2gr.w t1, vr1, 0 // col_end8i
vbsrl.v vr2, vr0, 4 // rf->mfmv_ref2cur
addi.d t5, a0, 64 // rf->mfmv_ref2ref
addi.d t5, a0, 58 // rf->mfmv_ref2ref - 1
la.local t8, la_div_mult
vld vr6, t8, 0
vld vr7, t8, 16
@@ -265,17 +264,11 @@ function load_tmvs_lsx
vpickod.b vr15, vr9, vr8
vpickve2gr.d s6, vr17, 0 // rf->rp_ref
5:
vld vr10, t5, 0
vld vr11, t5, 16
vpickev.h vr10, vr11, vr10
vpickev.b vr10, vr11, vr10 // [1...7]
vld vr10, t5, 0 // ref2ref [1...7]
vpickve2gr.b t8, vr0, 8 // ref2cur
vbsrl.v vr0, vr0, 1
vpickve2gr.wu t8, vr2, 0 // ref2cur
vbsrl.v vr2, vr2, 4
srli.d t4, t8, 24
xori t4, t4, 0x80
beqz t4, 8f
addi.w t4, t8, 32
beqz t4, 8f // INVALID_REF2CUR
vreplgr2vr.h vr23, t8
vshuf.b vr6, vr14, vr12, vr10
@@ -284,7 +277,7 @@ function load_tmvs_lsx
vmulwev.w.h vr6, vr8, vr23
vmulwod.w.h vr7, vr8, vr23
vpickve2gr.b s0, vr0, 0 // ref
vpickve2gr.b s0, vr0, 4 // ref
slli.d t8, s0, 3
ldx.d s1, s6, t8 // rf->rp_ref[ref]
addi.d s0, s0, -4 // ref_sign
@@ -460,7 +453,7 @@ function load_tmvs_lsx
blt s2, a5, 6b
8:
addi.d a6, a6, 1 // n + 1
addi.d t5, t5, 28 // mfmv_ref2ref(offset) + 28
addi.d t5, t5, 7 // mfmv_ref2ref(offset) + 7
blt a6, s8, 5b
.end_load:
+5 -5
View File
@@ -710,7 +710,7 @@ static void load_tmvs_c(const refmvs_frame *const rf, int tile_row_idx,
rp_proj = &rf->rp_proj[16 * stride * tile_row_idx];
for (int n = 0; n < rf->n_mfmvs; n++) {
const int ref2cur = rf->mfmv_ref2cur[n];
if (ref2cur == INT_MIN) continue;
if (ref2cur == INVALID_REF2CUR) continue;
const int ref = rf->mfmv_ref[n];
const int ref_sign = ref - 4;
@@ -836,7 +836,7 @@ int dav1d_refmvs_init_frame(refmvs_frame *const rf,
rf->n_blocks = n_blocks;
}
const unsigned poc = frm_hdr->frame_offset;
const int poc = frm_hdr->frame_offset;
for (int i = 0; i < 7; i++) {
const int poc_diff = get_poc_diff(seq_hdr->order_hint_n_bits,
ref_poc[i], poc);
@@ -875,15 +875,15 @@ int dav1d_refmvs_init_frame(refmvs_frame *const rf,
rf->mfmv_ref[rf->n_mfmvs++] = 1; // last2
for (int n = 0; n < rf->n_mfmvs; n++) {
const unsigned rpoc = ref_poc[rf->mfmv_ref[n]];
const int rpoc = ref_poc[rf->mfmv_ref[n]];
const int diff1 = get_poc_diff(seq_hdr->order_hint_n_bits,
rpoc, frm_hdr->frame_offset);
if (abs(diff1) > 31) {
rf->mfmv_ref2cur[n] = INT_MIN;
rf->mfmv_ref2cur[n] = INVALID_REF2CUR;
} else {
rf->mfmv_ref2cur[n] = rf->mfmv_ref[n] < 4 ? -diff1 : diff1;
for (int m = 0; m < 7; m++) {
const unsigned rrpoc = ref_ref_poc[rf->mfmv_ref[n]][m];
const int rrpoc = ref_ref_poc[rf->mfmv_ref[n]][m];
const int diff2 = get_poc_diff(seq_hdr->order_hint_n_bits,
rpoc, rrpoc);
// unsigned comparison also catches the < 0 case
+4 -3
View File
@@ -38,10 +38,11 @@
#include "src/tables.h"
#define INVALID_MV 0x80008000
#define INVALID_REF2CUR (-32)
PACKED(typedef struct refmvs_temporal_block {
mv mv;
int8_t ref;
uint8_t ref;
}) refmvs_temporal_block;
CHECK_SIZE(refmvs_temporal_block, 5);
@@ -72,8 +73,8 @@ typedef struct refmvs_frame {
uint8_t sign_bias[7], mfmv_sign[7];
int8_t pocdiff[7];
uint8_t mfmv_ref[3];
int mfmv_ref2cur[3];
int mfmv_ref2ref[3][7];
int8_t mfmv_ref2cur[3];
uint8_t mfmv_ref2ref[3][7];
int n_mfmvs;
int n_blocks;
+8 -10
View File
@@ -104,8 +104,8 @@ struc rf
.mfmv_sign: resb 7
.pocdiff: resb 7
.mfmv_ref: resb 3
.mfmv_ref2cur: resd 3
.mfmv_ref2ref: resd 3*7
.mfmv_ref2cur: resb 3
.mfmv_ref2ref: resb 3*7
.n_mfmvs: resd 1
.n_blocks: resd 1
.rp: resq 1
@@ -432,7 +432,7 @@ cglobal load_tmvs, 6, 15, 4, -0x50, rf, tridx, xstart, xend, ystart, yend, \
mov [rsp+0x38], yendd
mov [rsp+0x20], xstartid
xor nd, nd
xor n7d, n7d
lea n7q, [rfq+rf.mfmv_ref2ref-1]
imul r9, strideq ; ystart * stride
mov [rsp+0x48], rfq
mov [rsp+0x18], stride5q
@@ -443,8 +443,8 @@ cglobal load_tmvs, 6, 15, 4, -0x50, rf, tridx, xstart, xend, ystart, yend, \
DEFINE_ARGS y, off, xstart, xend, ystart, rf, n7, refsign, \
ref, rp_ref, xendi, xstarti, _, _, n
mov rfq, [rsp+0x48]
mov refd, [rfq+rf.mfmv_ref2cur+nq*4]
cmp refd, 0x80000000
movsx refd, byte [rfq+rf.mfmv_ref2cur+nq]
cmp refd, -32 ; INVALID_REF2CUR
je .next_n
mov [rsp+0x40], refd
mov offq, [rsp+0x00] ; ystart * stride * 5
@@ -473,12 +473,10 @@ cglobal load_tmvs, 6, 15, 4, -0x50, rf, tridx, xstart, xend, ystart, yend, \
.xloop:
lea rbd, [xq*5]
add rbq, srcq
movsx refd, byte [rbq+4]
movzx refd, byte [rbq+4]
test refd, refd
jz .next_x_bad_ref
mov rfq, [rsp+0x48]
lea ref2refd, [(rf.mfmv_ref2ref/4)+n7q+refq-1]
mov ref2refd, [rfq+ref2refq*4] ; rf->mfmv_ref2ref[n][b_ref-1]
movzx ref2refd, byte [n7q+refq] ; rf->mfmv_ref2ref[n][b_ref-1]
test ref2refd, ref2refd
jz .next_x_bad_ref
lea fracq, [mv_proj]
@@ -554,7 +552,7 @@ cglobal load_tmvs, 6, 15, 4, -0x50, rf, tridx, xstart, xend, ystart, yend, \
mov nd, [rsp+0x14]
mov ystartd, [rsp+0x24]
.next_n:
add n7d, 7
add n7q, 7
inc nd
cmp nd, [rsp+0x0c] ; n_mfmvs
jne .nloop
+1 -1
View File
@@ -49,7 +49,7 @@ static inline int get_min_mv_val(const int idx) {
else return (idx - 36) * 10000;
}
static inline void gen_tmv(refmvs_temporal_block *const rb, const int *ref2ref) {
static inline void gen_tmv(refmvs_temporal_block *const rb, const uint8_t *const ref2ref) {
rb->ref = rnd() % 7;
if (!rb->ref) return;
static const int x_prob[] = {