VectorOps: Handle more VUMax SVE cases better

When the destination aliases either source vector, we
can perform the UMAX in place with no movprfx at all.
Otherwise, we movprfx directly into the destination
instead of going through a temporary and a final mov.

Also expands the unsigned max tests to test values with
the sign bit set to ensure all behavior is caught.
This commit is contained in:
Lioncache 2023-10-18 17:14:22 +02:00
parent 65eec673fc
commit f85fae0041
6 changed files with 131 additions and 34 deletions

View File

@ -2255,14 +2255,18 @@ DEF_OP(VUMax) {
if (HostSupportsSVE256 && Is256Bit) {
const auto Pred = PRED_TMP_32B.Merging();
// In any case where the destination aliases one of the source vectors
// then we can just perform the UMAX in place.
if (Dst == Vector1) {
// Trivial case where we can perform the operation in place.
umax(SubRegSize, Dst.Z(), Pred, Dst.Z(), Vector2.Z());
} else if (Dst == Vector2) {
umax(SubRegSize, Dst.Z(), Pred, Dst.Z(), Vector1.Z());
} else {
// SVE UMAX is a destructive operation, so we need a temporary.
movprfx(VTMP1.Z(), Vector1.Z());
umax(SubRegSize, VTMP1.Z(), Pred, VTMP1.Z(), Vector2.Z());
mov(Dst.Z(), VTMP1.Z());
// SVE UMAX is a destructive operation, but we know nothing is
// aliasing the destination by this point, so we can move into
// the destination without needing a temporary.
movprfx(Dst.Z(), Vector1.Z());
umax(SubRegSize, Dst.Z(), Pred, Dst.Z(), Vector2.Z());
}
} else {
switch (ElementSize) {

View File

@ -2,12 +2,14 @@
{
"HostFeatures": ["AVX"],
"RegData": {
"XMM0": ["0x4142434445464748", "0x7172737475767778", "0x4142434445464748", "0x7172737475767778"],
"XMM0": ["0x4142434445464748", "0x7172737475767778", "0x4142434445464748", "0x8182838485868788"],
"XMM1": ["0x6162636465666768", "0x5152535455565758", "0x6162636465666768", "0x5152535455565758"],
"XMM2": ["0x6162636465666768", "0x7172737475767778", "0x0000000000000000", "0x0000000000000000"],
"XMM3": ["0x6162636465666768", "0x7172737475767778", "0x6162636465666768", "0x7172737475767778"],
"XMM3": ["0x6162636465666768", "0x7172737475767778", "0x6162636465666768", "0x8182838485868788"],
"XMM4": ["0x6162636465666768", "0x7172737475767778", "0x0000000000000000", "0x0000000000000000"],
"XMM5": ["0x6162636465666768", "0x7172737475767778", "0x6162636465666768", "0x7172737475767778"]
"XMM5": ["0x6162636465666768", "0x7172737475767778", "0x6162636465666768", "0x8182838485868788"],
"XMM6": ["0x6162636465666768", "0x7172737475767778", "0x6162636465666768", "0x8182838485868788"],
"XMM7": ["0x6162636465666768", "0x7172737475767778", "0x6162636465666768", "0x8182838485868788"]
},
"MemoryRegions": {
"0x100000000": "4096"
@ -26,6 +28,14 @@ vpmaxub ymm3, ymm0, ymm1
vpmaxub xmm4, xmm0, [rdx + 32]
vpmaxub ymm5, ymm0, [rdx + 32]
; Some funky combinations for testing fast paths
; Related to SVE sources aliasing the destination
vmovapd ymm6, ymm0
vpmaxub ymm6, ymm6, ymm5
vmovapd ymm7, ymm0
vpmaxub ymm7, ymm5, ymm7
hlt
align 32
@ -33,7 +43,7 @@ align 32
dq 0x4142434445464748
dq 0x7172737475767778
dq 0x4142434445464748
dq 0x7172737475767778
dq 0x8182838485868788
dq 0x6162636465666768
dq 0x5152535455565758

View File

@ -2,12 +2,14 @@
{
"HostFeatures": ["AVX"],
"RegData": {
"XMM0": ["0x4142434445464748", "0x7172737475767778", "0x4142434445464748", "0x7172737475767778"],
"XMM0": ["0x4142434445464748", "0x7172737475767778", "0x4142434445464748", "0x8172737485767778"],
"XMM1": ["0x6162636465666768", "0x5152535455565758", "0x6162636465666768", "0x5152535455565758"],
"XMM2": ["0x6162636465666768", "0x7172737475767778", "0x0000000000000000", "0x0000000000000000"],
"XMM3": ["0x6162636465666768", "0x7172737475767778", "0x6162636465666768", "0x7172737475767778"],
"XMM3": ["0x6162636465666768", "0x7172737475767778", "0x6162636465666768", "0x8172737485767778"],
"XMM4": ["0x6162636465666768", "0x7172737475767778", "0x0000000000000000", "0x0000000000000000"],
"XMM5": ["0x6162636465666768", "0x7172737475767778", "0x6162636465666768", "0x7172737475767778"]
"XMM5": ["0x6162636465666768", "0x7172737475767778", "0x6162636465666768", "0x8172737485767778"],
"XMM6": ["0x6162636465666768", "0x7172737475767778", "0x6162636465666768", "0x8172737485767778"],
"XMM7": ["0x6162636465666768", "0x7172737475767778", "0x6162636465666768", "0x8172737485767778"]
},
"MemoryRegions": {
"0x100000000": "4096"
@ -26,6 +28,14 @@ vpmaxud ymm3, ymm0, ymm1
vpmaxud xmm4, xmm0, [rdx + 32]
vpmaxud ymm5, ymm0, [rdx + 32]
; Some funky combinations for testing fast paths
; Related to SVE sources aliasing the destination
vmovapd ymm6, ymm0
vpmaxud ymm6, ymm6, ymm5
vmovapd ymm7, ymm0
vpmaxud ymm7, ymm5, ymm7
hlt
align 32
@ -33,7 +43,7 @@ align 32
dq 0x4142434445464748
dq 0x7172737475767778
dq 0x4142434445464748
dq 0x7172737475767778
dq 0x8172737485767778
dq 0x6162636465666768
dq 0x5152535455565758

View File

@ -2,12 +2,14 @@
{
"HostFeatures": ["AVX"],
"RegData": {
"XMM0": ["0x4142434445464748", "0x7172737475767778", "0x4142434445464748", "0x7172737475767778"],
"XMM0": ["0x4142434445464748", "0x7172737475767778", "0x4142434445464748", "0x8172837485768778"],
"XMM1": ["0x6162636465666768", "0x5152535455565758", "0x6162636465666768", "0x5152535455565758"],
"XMM2": ["0x6162636465666768", "0x7172737475767778", "0x0000000000000000", "0x0000000000000000"],
"XMM3": ["0x6162636465666768", "0x7172737475767778", "0x6162636465666768", "0x7172737475767778"],
"XMM3": ["0x6162636465666768", "0x7172737475767778", "0x6162636465666768", "0x8172837485768778"],
"XMM4": ["0x6162636465666768", "0x7172737475767778", "0x0000000000000000", "0x0000000000000000"],
"XMM5": ["0x6162636465666768", "0x7172737475767778", "0x6162636465666768", "0x7172737475767778"]
"XMM5": ["0x6162636465666768", "0x7172737475767778", "0x6162636465666768", "0x8172837485768778"],
"XMM6": ["0x6162636465666768", "0x7172737475767778", "0x6162636465666768", "0x8172837485768778"],
"XMM7": ["0x6162636465666768", "0x7172737475767778", "0x6162636465666768", "0x8172837485768778"]
},
"MemoryRegions": {
"0x100000000": "4096"
@ -26,6 +28,14 @@ vpmaxuw ymm3, ymm0, ymm1
vpmaxuw xmm4, xmm0, [rdx + 32]
vpmaxuw ymm5, ymm0, [rdx + 32]
; Some funky combinations for testing fast paths
; Related to SVE sources aliasing the destination
vmovapd ymm6, ymm0
vpmaxuw ymm6, ymm6, ymm5
vmovapd ymm7, ymm0
vpmaxuw ymm7, ymm5, ymm7
hlt
align 32
@ -33,7 +43,7 @@ align 32
dq 0x4142434445464748
dq 0x7172737475767778
dq 0x4142434445464748
dq 0x7172737475767778
dq 0x8172837485768778
dq 0x6162636465666768
dq 0x5152535455565758

View File

@ -4662,22 +4662,44 @@
"ExpectedInstructionCount": 1,
"Optimal": "Yes",
"Comment": [
"Aliasing source and destination",
"Map 1 0b01 0xdd 128-bit"
],
"ExpectedArm64ASM": [
"umax v16.16b, v17.16b, v18.16b"
]
},
"vpmaxub ymm0, ymm0, ymm2": {
"ExpectedInstructionCount": 1,
"Optimal": "Yes",
"Comment": [
"Aliasing source and destination",
"Map 1 0b01 0xde 256-bit"
],
"ExpectedArm64ASM": [
"umax z16.b, p7/m, z16.b, z18.b"
]
},
"vpmaxub ymm0, ymm1, ymm0": {
"ExpectedInstructionCount": 1,
"Optimal": "Yes",
"Comment": [
"Aliasing source and destination",
"Map 1 0b01 0xde 256-bit"
],
"ExpectedArm64ASM": [
"umax z16.b, p7/m, z16.b, z17.b"
]
},
"vpmaxub ymm0, ymm1, ymm2": {
"ExpectedInstructionCount": 3,
"Optimal": "No",
"ExpectedInstructionCount": 2,
"Optimal": "Yes",
"Comment": [
"Map 1 0b01 0xde 256-bit"
],
"ExpectedArm64ASM": [
"movprfx z0, z17",
"umax z0.b, p7/m, z0.b, z18.b",
"mov z16.d, z0.d"
"movprfx z16, z17",
"umax z16.b, p7/m, z16.b, z18.b"
]
},
"vpandn xmm0, xmm1, xmm2": {

View File

@ -1526,21 +1526,42 @@
"umax v16.8h, v17.8h, v18.8h"
]
},
"vpmaxuw ymm0, ymm1, ymm0": {
"ExpectedInstructionCount": 1,
"Optimal": "Yes",
"Comment": [
"Aliasing source and destination",
"Map 2 0b01 0x3e 256-bit"
],
"ExpectedArm64ASM": [
"umax z16.h, p7/m, z16.h, z17.h"
]
},
"vpmaxuw ymm0, ymm0, ymm2": {
"ExpectedInstructionCount": 1,
"Optimal": "Yes",
"Comment": [
"Aliasing source and destination",
"Map 2 0b01 0x3e 256-bit"
],
"ExpectedArm64ASM": [
"umax z16.h, p7/m, z16.h, z18.h"
]
},
"vpmaxuw ymm0, ymm1, ymm2": {
"ExpectedInstructionCount": 3,
"Optimal": "No",
"ExpectedInstructionCount": 2,
"Optimal": "Yes",
"Comment": [
"Map 2 0b01 0x3e 256-bit"
],
"ExpectedArm64ASM": [
"movprfx z0, z17",
"umax z0.h, p7/m, z0.h, z18.h",
"mov z16.d, z0.d"
"movprfx z16, z17",
"umax z16.h, p7/m, z16.h, z18.h"
]
},
"vpmaxud xmm0, xmm1, xmm2": {
"ExpectedInstructionCount": 1,
"Optimal": "No",
"Optimal": "Yes",
"Comment": [
"Map 2 0b01 0x3f 128-bit"
],
@ -1548,16 +1569,36 @@
"umax v16.4s, v17.4s, v18.4s"
]
},
"vpmaxud ymm0, ymm1, ymm2": {
"ExpectedInstructionCount": 3,
"Optimal": "No",
"vpmaxud ymm0, ymm0, ymm2": {
"ExpectedInstructionCount": 1,
"Optimal": "Yes",
"Comment": [
"Aliasing source and destination",
"Map 2 0b01 0x3f 256-bit"
],
"ExpectedArm64ASM": [
"umax z16.s, p7/m, z16.s, z18.s"
]
},
"vpmaxud ymm0, ymm1, ymm0": {
"ExpectedInstructionCount": 1,
"Optimal": "Yes",
"Comment": [
"Map 2 0b01 0x3f 256-bit"
],
"ExpectedArm64ASM": [
"movprfx z0, z17",
"umax z0.s, p7/m, z0.s, z18.s",
"mov z16.d, z0.d"
"umax z16.s, p7/m, z16.s, z17.s"
]
},
"vpmaxud ymm0, ymm1, ymm2": {
"ExpectedInstructionCount": 2,
"Optimal": "Yes",
"Comment": [
"Map 2 0b01 0x3f 256-bit"
],
"ExpectedArm64ASM": [
"movprfx z16, z17",
"umax z16.s, p7/m, z16.s, z18.s"
]
},
"vpmulld xmm0, xmm1, xmm2": {