mirror of
https://github.com/FEX-Emu/FEX.git
synced 2025-01-08 22:52:51 +00:00
VectorOps: Handle more VUMax SVE cases better
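When the destination aliases one of the source vectors, we can skip movprfx entirely and perform the UMAX in place. Otherwise we movprfx directly into the destination instead of going through a temporary, which removes the trailing mov. Also expands the unsigned max tests to use values with the sign bit set, so that an accidental signed comparison would be caught.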
This commit is contained in:
parent
65eec673fc
commit
f85fae0041
@ -2255,14 +2255,18 @@ DEF_OP(VUMax) {
|
||||
if (HostSupportsSVE256 && Is256Bit) {
|
||||
const auto Pred = PRED_TMP_32B.Merging();
|
||||
|
||||
// In any case where the destination aliases one of the source vectors
|
||||
// then we can just perform the UMAX in place.
|
||||
if (Dst == Vector1) {
|
||||
// Trivial case where we can perform the operation in place.
|
||||
umax(SubRegSize, Dst.Z(), Pred, Dst.Z(), Vector2.Z());
|
||||
} else if (Dst == Vector2) {
|
||||
umax(SubRegSize, Dst.Z(), Pred, Dst.Z(), Vector1.Z());
|
||||
} else {
|
||||
// SVE UMAX is a destructive operation, so we need a temporary.
|
||||
movprfx(VTMP1.Z(), Vector1.Z());
|
||||
umax(SubRegSize, VTMP1.Z(), Pred, VTMP1.Z(), Vector2.Z());
|
||||
mov(Dst.Z(), VTMP1.Z());
|
||||
// SVE UMAX is a destructive operation, but we know nothing is
|
||||
// aliasing the destination by this point, so we can move into
|
||||
// the destination without needing a temporary.
|
||||
movprfx(Dst.Z(), Vector1.Z());
|
||||
umax(SubRegSize, Dst.Z(), Pred, Dst.Z(), Vector2.Z());
|
||||
}
|
||||
} else {
|
||||
switch (ElementSize) {
|
||||
|
@ -2,12 +2,14 @@
|
||||
{
|
||||
"HostFeatures": ["AVX"],
|
||||
"RegData": {
|
||||
"XMM0": ["0x4142434445464748", "0x7172737475767778", "0x4142434445464748", "0x7172737475767778"],
|
||||
"XMM0": ["0x4142434445464748", "0x7172737475767778", "0x4142434445464748", "0x8182838485868788"],
|
||||
"XMM1": ["0x6162636465666768", "0x5152535455565758", "0x6162636465666768", "0x5152535455565758"],
|
||||
"XMM2": ["0x6162636465666768", "0x7172737475767778", "0x0000000000000000", "0x0000000000000000"],
|
||||
"XMM3": ["0x6162636465666768", "0x7172737475767778", "0x6162636465666768", "0x7172737475767778"],
|
||||
"XMM3": ["0x6162636465666768", "0x7172737475767778", "0x6162636465666768", "0x8182838485868788"],
|
||||
"XMM4": ["0x6162636465666768", "0x7172737475767778", "0x0000000000000000", "0x0000000000000000"],
|
||||
"XMM5": ["0x6162636465666768", "0x7172737475767778", "0x6162636465666768", "0x7172737475767778"]
|
||||
"XMM5": ["0x6162636465666768", "0x7172737475767778", "0x6162636465666768", "0x8182838485868788"],
|
||||
"XMM6": ["0x6162636465666768", "0x7172737475767778", "0x6162636465666768", "0x8182838485868788"],
|
||||
"XMM7": ["0x6162636465666768", "0x7172737475767778", "0x6162636465666768", "0x8182838485868788"]
|
||||
},
|
||||
"MemoryRegions": {
|
||||
"0x100000000": "4096"
|
||||
@ -26,6 +28,14 @@ vpmaxub ymm3, ymm0, ymm1
|
||||
vpmaxub xmm4, xmm0, [rdx + 32]
|
||||
vpmaxub ymm5, ymm0, [rdx + 32]
|
||||
|
||||
; Some funky combinations for testing fast paths
|
||||
; Related to SVE sources aliasing the destination
|
||||
vmovapd ymm6, ymm0
|
||||
vpmaxub ymm6, ymm6, ymm5
|
||||
|
||||
vmovapd ymm7, ymm0
|
||||
vpmaxub ymm7, ymm5, ymm7
|
||||
|
||||
hlt
|
||||
|
||||
align 32
|
||||
@ -33,7 +43,7 @@ align 32
|
||||
dq 0x4142434445464748
|
||||
dq 0x7172737475767778
|
||||
dq 0x4142434445464748
|
||||
dq 0x7172737475767778
|
||||
dq 0x8182838485868788
|
||||
|
||||
dq 0x6162636465666768
|
||||
dq 0x5152535455565758
|
||||
|
@ -2,12 +2,14 @@
|
||||
{
|
||||
"HostFeatures": ["AVX"],
|
||||
"RegData": {
|
||||
"XMM0": ["0x4142434445464748", "0x7172737475767778", "0x4142434445464748", "0x7172737475767778"],
|
||||
"XMM0": ["0x4142434445464748", "0x7172737475767778", "0x4142434445464748", "0x8172737485767778"],
|
||||
"XMM1": ["0x6162636465666768", "0x5152535455565758", "0x6162636465666768", "0x5152535455565758"],
|
||||
"XMM2": ["0x6162636465666768", "0x7172737475767778", "0x0000000000000000", "0x0000000000000000"],
|
||||
"XMM3": ["0x6162636465666768", "0x7172737475767778", "0x6162636465666768", "0x7172737475767778"],
|
||||
"XMM3": ["0x6162636465666768", "0x7172737475767778", "0x6162636465666768", "0x8172737485767778"],
|
||||
"XMM4": ["0x6162636465666768", "0x7172737475767778", "0x0000000000000000", "0x0000000000000000"],
|
||||
"XMM5": ["0x6162636465666768", "0x7172737475767778", "0x6162636465666768", "0x7172737475767778"]
|
||||
"XMM5": ["0x6162636465666768", "0x7172737475767778", "0x6162636465666768", "0x8172737485767778"],
|
||||
"XMM6": ["0x6162636465666768", "0x7172737475767778", "0x6162636465666768", "0x8172737485767778"],
|
||||
"XMM7": ["0x6162636465666768", "0x7172737475767778", "0x6162636465666768", "0x8172737485767778"]
|
||||
},
|
||||
"MemoryRegions": {
|
||||
"0x100000000": "4096"
|
||||
@ -26,6 +28,14 @@ vpmaxud ymm3, ymm0, ymm1
|
||||
vpmaxud xmm4, xmm0, [rdx + 32]
|
||||
vpmaxud ymm5, ymm0, [rdx + 32]
|
||||
|
||||
; Some funky combinations for testing fast paths
|
||||
; Related to SVE sources aliasing the destination
|
||||
vmovapd ymm6, ymm0
|
||||
vpmaxud ymm6, ymm6, ymm5
|
||||
|
||||
vmovapd ymm7, ymm0
|
||||
vpmaxud ymm7, ymm5, ymm7
|
||||
|
||||
hlt
|
||||
|
||||
align 32
|
||||
@ -33,7 +43,7 @@ align 32
|
||||
dq 0x4142434445464748
|
||||
dq 0x7172737475767778
|
||||
dq 0x4142434445464748
|
||||
dq 0x7172737475767778
|
||||
dq 0x8172737485767778
|
||||
|
||||
dq 0x6162636465666768
|
||||
dq 0x5152535455565758
|
||||
|
@ -2,12 +2,14 @@
|
||||
{
|
||||
"HostFeatures": ["AVX"],
|
||||
"RegData": {
|
||||
"XMM0": ["0x4142434445464748", "0x7172737475767778", "0x4142434445464748", "0x7172737475767778"],
|
||||
"XMM0": ["0x4142434445464748", "0x7172737475767778", "0x4142434445464748", "0x8172837485768778"],
|
||||
"XMM1": ["0x6162636465666768", "0x5152535455565758", "0x6162636465666768", "0x5152535455565758"],
|
||||
"XMM2": ["0x6162636465666768", "0x7172737475767778", "0x0000000000000000", "0x0000000000000000"],
|
||||
"XMM3": ["0x6162636465666768", "0x7172737475767778", "0x6162636465666768", "0x7172737475767778"],
|
||||
"XMM3": ["0x6162636465666768", "0x7172737475767778", "0x6162636465666768", "0x8172837485768778"],
|
||||
"XMM4": ["0x6162636465666768", "0x7172737475767778", "0x0000000000000000", "0x0000000000000000"],
|
||||
"XMM5": ["0x6162636465666768", "0x7172737475767778", "0x6162636465666768", "0x7172737475767778"]
|
||||
"XMM5": ["0x6162636465666768", "0x7172737475767778", "0x6162636465666768", "0x8172837485768778"],
|
||||
"XMM6": ["0x6162636465666768", "0x7172737475767778", "0x6162636465666768", "0x8172837485768778"],
|
||||
"XMM7": ["0x6162636465666768", "0x7172737475767778", "0x6162636465666768", "0x8172837485768778"]
|
||||
},
|
||||
"MemoryRegions": {
|
||||
"0x100000000": "4096"
|
||||
@ -26,6 +28,14 @@ vpmaxuw ymm3, ymm0, ymm1
|
||||
vpmaxuw xmm4, xmm0, [rdx + 32]
|
||||
vpmaxuw ymm5, ymm0, [rdx + 32]
|
||||
|
||||
; Some funky combinations for testing fast paths
|
||||
; Related to SVE sources aliasing the destination
|
||||
vmovapd ymm6, ymm0
|
||||
vpmaxuw ymm6, ymm6, ymm5
|
||||
|
||||
vmovapd ymm7, ymm0
|
||||
vpmaxuw ymm7, ymm5, ymm7
|
||||
|
||||
hlt
|
||||
|
||||
align 32
|
||||
@ -33,7 +43,7 @@ align 32
|
||||
dq 0x4142434445464748
|
||||
dq 0x7172737475767778
|
||||
dq 0x4142434445464748
|
||||
dq 0x7172737475767778
|
||||
dq 0x8172837485768778
|
||||
|
||||
dq 0x6162636465666768
|
||||
dq 0x5152535455565758
|
||||
|
@ -4662,22 +4662,44 @@
|
||||
"ExpectedInstructionCount": 1,
|
||||
"Optimal": "Yes",
|
||||
"Comment": [
|
||||
"Aliasing source and destination",
|
||||
"Map 1 0b01 0xde 128-bit"
|
||||
],
|
||||
"ExpectedArm64ASM": [
|
||||
"umax v16.16b, v17.16b, v18.16b"
|
||||
]
|
||||
},
|
||||
"vpmaxub ymm0, ymm0, ymm2": {
|
||||
"ExpectedInstructionCount": 1,
|
||||
"Optimal": "Yes",
|
||||
"Comment": [
|
||||
"Aliasing source and destination",
|
||||
"Map 1 0b01 0xde 256-bit"
|
||||
],
|
||||
"ExpectedArm64ASM": [
|
||||
"umax z16.b, p7/m, z16.b, z18.b"
|
||||
]
|
||||
},
|
||||
"vpmaxub ymm0, ymm1, ymm0": {
|
||||
"ExpectedInstructionCount": 1,
|
||||
"Optimal": "Yes",
|
||||
"Comment": [
|
||||
"Aliasing source and destination",
|
||||
"Map 1 0b01 0xde 256-bit"
|
||||
],
|
||||
"ExpectedArm64ASM": [
|
||||
"umax z16.b, p7/m, z16.b, z17.b"
|
||||
]
|
||||
},
|
||||
"vpmaxub ymm0, ymm1, ymm2": {
|
||||
"ExpectedInstructionCount": 3,
|
||||
"Optimal": "No",
|
||||
"ExpectedInstructionCount": 2,
|
||||
"Optimal": "Yes",
|
||||
"Comment": [
|
||||
"Map 1 0b01 0xde 256-bit"
|
||||
],
|
||||
"ExpectedArm64ASM": [
|
||||
"movprfx z0, z17",
|
||||
"umax z0.b, p7/m, z0.b, z18.b",
|
||||
"mov z16.d, z0.d"
|
||||
"movprfx z16, z17",
|
||||
"umax z16.b, p7/m, z16.b, z18.b"
|
||||
]
|
||||
},
|
||||
"vpandn xmm0, xmm1, xmm2": {
|
||||
|
@ -1526,21 +1526,42 @@
|
||||
"umax v16.8h, v17.8h, v18.8h"
|
||||
]
|
||||
},
|
||||
"vpmaxuw ymm0, ymm1, ymm0": {
|
||||
"ExpectedInstructionCount": 1,
|
||||
"Optimal": "Yes",
|
||||
"Comment": [
|
||||
"Aliasing source and destination",
|
||||
"Map 2 0b01 0x3e 256-bit"
|
||||
],
|
||||
"ExpectedArm64ASM": [
|
||||
"umax z16.h, p7/m, z16.h, z17.h"
|
||||
]
|
||||
},
|
||||
"vpmaxuw ymm0, ymm0, ymm2": {
|
||||
"ExpectedInstructionCount": 1,
|
||||
"Optimal": "Yes",
|
||||
"Comment": [
|
||||
"Aliasing source and destination",
|
||||
"Map 2 0b01 0x3e 256-bit"
|
||||
],
|
||||
"ExpectedArm64ASM": [
|
||||
"umax z16.h, p7/m, z16.h, z18.h"
|
||||
]
|
||||
},
|
||||
"vpmaxuw ymm0, ymm1, ymm2": {
|
||||
"ExpectedInstructionCount": 3,
|
||||
"Optimal": "No",
|
||||
"ExpectedInstructionCount": 2,
|
||||
"Optimal": "Yes",
|
||||
"Comment": [
|
||||
"Map 2 0b01 0x3e 256-bit"
|
||||
],
|
||||
"ExpectedArm64ASM": [
|
||||
"movprfx z0, z17",
|
||||
"umax z0.h, p7/m, z0.h, z18.h",
|
||||
"mov z16.d, z0.d"
|
||||
"movprfx z16, z17",
|
||||
"umax z16.h, p7/m, z16.h, z18.h"
|
||||
]
|
||||
},
|
||||
"vpmaxud xmm0, xmm1, xmm2": {
|
||||
"ExpectedInstructionCount": 1,
|
||||
"Optimal": "No",
|
||||
"Optimal": "Yes",
|
||||
"Comment": [
|
||||
"Map 2 0b01 0x3f 128-bit"
|
||||
],
|
||||
@ -1548,16 +1569,36 @@
|
||||
"umax v16.4s, v17.4s, v18.4s"
|
||||
]
|
||||
},
|
||||
"vpmaxud ymm0, ymm1, ymm2": {
|
||||
"ExpectedInstructionCount": 3,
|
||||
"Optimal": "No",
|
||||
"vpmaxud ymm0, ymm0, ymm2": {
|
||||
"ExpectedInstructionCount": 1,
|
||||
"Optimal": "Yes",
|
||||
"Comment": [
|
||||
"Aliasing source and destination",
|
||||
"Map 2 0b01 0x3f 256-bit"
|
||||
],
|
||||
"ExpectedArm64ASM": [
|
||||
"umax z16.s, p7/m, z16.s, z18.s"
|
||||
]
|
||||
},
|
||||
"vpmaxud ymm0, ymm1, ymm0": {
|
||||
"ExpectedInstructionCount": 1,
|
||||
"Optimal": "Yes",
|
||||
"Comment": [
|
||||
"Map 2 0b01 0x3f 256-bit"
|
||||
],
|
||||
"ExpectedArm64ASM": [
|
||||
"movprfx z0, z17",
|
||||
"umax z0.s, p7/m, z0.s, z18.s",
|
||||
"mov z16.d, z0.d"
|
||||
"umax z16.s, p7/m, z16.s, z17.s"
|
||||
]
|
||||
},
|
||||
"vpmaxud ymm0, ymm1, ymm2": {
|
||||
"ExpectedInstructionCount": 2,
|
||||
"Optimal": "Yes",
|
||||
"Comment": [
|
||||
"Map 2 0b01 0x3f 256-bit"
|
||||
],
|
||||
"ExpectedArm64ASM": [
|
||||
"movprfx z16, z17",
|
||||
"umax z16.s, p7/m, z16.s, z18.s"
|
||||
]
|
||||
},
|
||||
"vpmulld xmm0, xmm1, xmm2": {
|
||||
|
Loading…
Reference in New Issue
Block a user