AArch64: Trim Armv8.0 Neon path of 6-tap and 8-tap MC functions

Some instruction sequences can be merged now that the lane load/store
patch (c28f364cef) has been applied.

This change simplifies the loading of filter weights: each
`add xN, xN, #2` + `ldr sM, [xN]` pair is replaced by a single
`ldur sM, [xN, #2]`. This saves 288 bytes in the Armv8.0 Neon path of
the 6-tap and 8-tap MC functions.
This commit is contained in:
Arpad Panyik
2024-09-10 13:37:55 +02:00
committed by Martin Storsjö
parent 948e739d7d
commit b8110a4c14
2 changed files with 22 additions and 44 deletions
+11 -22
View File
@@ -1475,8 +1475,7 @@ L(\type\()_\taps\()_h):
20: // 2xN h
AARCH64_VALID_JUMP_TARGET
.ifc \type, put
add \xmx, \xmx, #2
ldr s0, [\xmx]
ldur s0, [\xmx, #2]
sub \src, \src, #1
add \ds2, \dst, \d_strd
add \sr2, \src, \s_strd
@@ -1509,8 +1508,7 @@ L(\type\()_\taps\()_h):
40: // 4xN h
AARCH64_VALID_JUMP_TARGET
add \xmx, \xmx, #2
ldr s0, [\xmx]
ldur s0, [\xmx, #2]
sub \src, \src, #1
add \ds2, \dst, \d_strd
add \sr2, \src, \s_strd
@@ -1741,8 +1739,7 @@ function L(\type\()_\taps\()_v)
b.gt 28f
cmp \h, #2
add \xmy, \xmy, #2
ldr s0, [\xmy]
ldur s0, [\xmy, #2]
sub \src, \src, \s_strd
add \ds2, \dst, \d_strd
add \sr2, \src, \s_strd
@@ -1821,8 +1818,7 @@ function L(\type\()_\taps\()_v)
// 4x2, 4x4 v
cmp \h, #2
add \xmy, \xmy, #2
ldr s0, [\xmy]
ldur s0, [\xmy, #2]
sub \src, \src, \s_strd
add \ds2, \dst, \d_strd
add \sr2, \src, \s_strd
@@ -1897,8 +1893,7 @@ function L(\type\()_\taps\()_v)
// 8x2, 8x4 v
cmp \h, #2
add \xmy, \xmy, #2
ldr s0, [\xmy]
ldur s0, [\xmy, #2]
sub \src, \src, \s_strd
add \ds2, \dst, \d_strd
add \sr2, \src, \s_strd
@@ -1996,8 +1991,7 @@ function L(\type\()_\taps\()_v)
b.gt 1680b
// 16x2, 16x4 v
add \xmy, \xmy, #2
ldr s0, [\xmy]
ldur s0, [\xmy, #2]
sub \src, \src, \s_strd
add \ds2, \dst, \d_strd
add \sr2, \src, \s_strd
@@ -2064,11 +2058,9 @@ function L(\type\()_\taps\()_hv)
20:
AARCH64_VALID_JUMP_TARGET
.ifc \type, put
add \xmx, \xmx, #2
ldr s0, [\xmx]
ldur s0, [\xmx, #2]
b.gt 280f
add \xmy, \xmy, #2
ldr s1, [\xmy]
ldur s1, [\xmy, #2]
// 2x2, 2x4 hv
sub \sr2, \src, #1
@@ -2202,11 +2194,9 @@ L(\type\()_\taps\()_filter_2):
40:
AARCH64_VALID_JUMP_TARGET
add \xmx, \xmx, #2
ldr s0, [\xmx]
ldur s0, [\xmx, #2]
b.gt 480f
add \xmy, \xmy, #2
ldr s1, [\xmy]
ldur s1, [\xmy, #2]
sub \sr2, \src, #1
sub \src, \sr2, \s_strd
add \ds2, \dst, \d_strd
@@ -2396,9 +2386,8 @@ L(\type\()_\taps\()_filter_4):
320:
AARCH64_VALID_JUMP_TARGET
b.gt 880f
add \xmy, \xmy, #2
ld1 {v0.8b}, [\xmx]
ldr s1, [\xmy]
ldur s1, [\xmy, #2]
.ifc \taps, 6tap
sub \src, \src, #2
.else
+11 -22
View File
@@ -1618,8 +1618,7 @@ L(\type\()_\taps\()_h):
20: // 2xN h
AARCH64_VALID_JUMP_TARGET
.ifc \type, put
add \xmx, \xmx, #2
ldr s0, [\xmx]
ldur s0, [\xmx, #2]
sub \src, \src, #2
add \ds2, \dst, \d_strd
add \sr2, \src, \s_strd
@@ -1651,8 +1650,7 @@ L(\type\()_\taps\()_h):
40: // 4xN h
AARCH64_VALID_JUMP_TARGET
add \xmx, \xmx, #2
ldr s0, [\xmx]
ldur s0, [\xmx, #2]
sub \src, \src, #2
add \ds2, \dst, \d_strd
add \sr2, \src, \s_strd
@@ -1859,8 +1857,7 @@ function L(\type\()_\taps\()_v)
b.gt 28f
cmp \h, #2
add \xmy, \xmy, #2
ldr s0, [\xmy]
ldur s0, [\xmy, #2]
sub \src, \src, \s_strd
add \ds2, \dst, \d_strd
add \sr2, \src, \s_strd
@@ -1937,8 +1934,7 @@ function L(\type\()_\taps\()_v)
// 4x2, 4x4 v
cmp \h, #2
add \xmy, \xmy, #2
ldr s0, [\xmy]
ldur s0, [\xmy, #2]
sub \src, \src, \s_strd
add \ds2, \dst, \d_strd
add \sr2, \src, \s_strd
@@ -2002,8 +1998,7 @@ function L(\type\()_\taps\()_v)
// 8x2, 8x4 v
cmp \h, #2
add \xmy, \xmy, #2
ldr s0, [\xmy]
ldur s0, [\xmy, #2]
sub \src, \src, \s_strd
add \ds2, \dst, \d_strd
add \sr2, \src, \s_strd
@@ -2091,8 +2086,7 @@ function L(\type\()_\taps\()_v)
b.gt 1680b
// 16x2, 16x4 v
add \xmy, \xmy, #2
ldr s0, [\xmy]
ldur s0, [\xmy, #2]
sub \src, \src, \s_strd
sxtl v0.8h, v0.8b
@@ -2154,11 +2148,9 @@ function L(\type\()_\taps\()_hv)
20:
AARCH64_VALID_JUMP_TARGET
.ifc \type, put
add \xmx, \xmx, #2
ldr s0, [\xmx]
ldur s0, [\xmx, #2]
b.gt 280f
add \xmy, \xmy, #2
ldr s1, [\xmy]
ldur s1, [\xmy, #2]
// 2x2, 2x4 hv
sub \sr2, \src, #2
@@ -2301,11 +2293,9 @@ L(\type\()_\taps\()_filter_2):
40:
AARCH64_VALID_JUMP_TARGET
add \xmx, \xmx, #2
ldr s0, [\xmx]
ldur s0, [\xmx, #2]
b.gt 480f
add \xmy, \xmy, #2
ldr s1, [\xmy]
ldur s1, [\xmy, #2]
sub \sr2, \src, #2
sub \src, \sr2, \s_strd
add \ds2, \dst, \d_strd
@@ -2501,9 +2491,8 @@ L(\type\()_\taps\()_filter_4):
320:
AARCH64_VALID_JUMP_TARGET
b.gt 880f
add \xmy, \xmy, #2
ld1 {v0.8b}, [\xmx]
ldr s1, [\xmy]
ldur s1, [\xmy, #2]
.ifc \taps, 6tap
sub \src, \src, #4
.else