VectorOps: Handle SVE VFCADD a little better

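If the destination register aliases neither source, we can move the first
source directly into the destination (via MOVPRFX) and perform the FCADD
operation in place, rather than computing into a temporary and copying the
result out. A temporary is now only used when the destination aliases the
second source.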
This commit is contained in:
Lioncache 2023-10-19 14:40:25 +02:00
parent 8f246b206b
commit 24f2796141
4 changed files with 92 additions and 25 deletions

View File

@ -4963,13 +4963,16 @@ DEF_OP(VFCADD) {
// Trivial case where we already have first vector in the destination
// register. We can just do the operation in place.
fcadd(SubRegSize, Dst.Z(), Mask, Vector1.Z(), Vector2.Z(), Rotate);
}
else {
} else if (Dst == Vector2) {
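// The destination aliases the second source, so moving Vector1 into Dst
// first would clobber Vector2. SVE FCADD is a destructive operation, so
// we perform it in a temporary and then copy the result to the destination.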
movprfx(VTMP1.Z(), Vector1.Z());
fcadd(SubRegSize, VTMP1.Z(), Mask, VTMP1.Z(), Vector2.Z(), Rotate);
mov(Dst.Z(), VTMP1.Z());
} else {
// We have no source/dest aliasing, so we can move into the destination.
movprfx(Dst.Z(), Vector1.Z());
fcadd(SubRegSize, Dst.Z(), Mask, Dst.Z(), Vector2.Z(), Rotate);
}
} else {
if (OpSize == 8) {

View File

@ -2,17 +2,16 @@
{
"HostFeatures": ["AVX"],
"RegData": {
"XMM2": ["0xBFF0000000000000", "0x4008000000000000", "0x0000000000000000", "0x0000000000000000"],
"XMM3": ["0xBFF0000000000000", "0x4008000000000000", "0x0000000000000000", "0x0000000000000000"],
"XMM4": ["0x3FF0000000000000", "0x4008000000000000", "0x0000000000000000", "0x0000000000000000"],
"XMM5": ["0x3FF0000000000000", "0x4008000000000000", "0x0000000000000000", "0x0000000000000000"],
"XMM6": ["0xBFF0000000000000", "0x4008000000000000", "0xBFF0000000000000", "0x4008000000000000"],
"XMM7": ["0xBFF0000000000000", "0x4008000000000000", "0xBFF0000000000000", "0x4008000000000000"],
"XMM8": ["0x3FF0000000000000", "0x4008000000000000", "0x3FF0000000000000", "0x4008000000000000"],
"XMM9": ["0x3FF0000000000000", "0x4008000000000000", "0x3FF0000000000000", "0x4008000000000000"]
},
"MemoryRegions": {
"0x100000000": "4096"
"XMM2" : ["0xBFF0000000000000", "0x4008000000000000", "0x0000000000000000", "0x0000000000000000"],
"XMM3" : ["0xBFF0000000000000", "0x4008000000000000", "0x0000000000000000", "0x0000000000000000"],
"XMM4" : ["0x3FF0000000000000", "0x4008000000000000", "0x0000000000000000", "0x0000000000000000"],
"XMM5" : ["0x3FF0000000000000", "0x4008000000000000", "0x0000000000000000", "0x0000000000000000"],
"XMM6" : ["0xBFF0000000000000", "0x4008000000000000", "0xBFF0000000000000", "0x4008000000000000"],
"XMM7" : ["0xBFF0000000000000", "0x4008000000000000", "0xBFF0000000000000", "0x4008000000000000"],
"XMM8" : ["0x3FF0000000000000", "0x4008000000000000", "0x3FF0000000000000", "0x4008000000000000"],
"XMM9" : ["0x3FF0000000000000", "0x4008000000000000", "0x3FF0000000000000", "0x4008000000000000"],
"XMM10": ["0xBFF0000000000000", "0x4008000000000000", "0xBFF0000000000000", "0x4008000000000000"],
"XMM11": ["0xBFF0000000000000", "0x4008000000000000", "0xBFF0000000000000", "0x4008000000000000"]
}
}
%endif
@ -34,6 +33,13 @@ vaddsubpd ymm7, ymm0, ymm1
vaddsubpd ymm8, ymm1, [rdx]
vaddsubpd ymm9, ymm1, ymm0
; Aliasing source/destination vectors
vmovapd ymm10, [rdx]
vaddsubpd ymm10, ymm10, ymm1
vmovapd ymm11, [rdx + 32]
vaddsubpd ymm11, ymm0, ymm11
hlt
align 32

View File

@ -5,10 +5,9 @@
"XMM2": ["0x41200000C0000000", "0x41200000C0C00000", "0x0000000000000000", "0x0000000000000000"],
"XMM3": ["0x41200000C0000000", "0x41200000C0C00000", "0x41200000C0000000", "0x41200000C0C00000"],
"XMM4": ["0x41200000C0000000", "0x41200000C0C00000", "0x0000000000000000", "0x0000000000000000"],
"XMM5": ["0x41200000C0000000", "0x41200000C0C00000", "0x41200000C0000000", "0x41200000C0C00000"]
},
"MemoryRegions": {
"0x100000000": "4096"
"XMM5": ["0x41200000C0000000", "0x41200000C0C00000", "0x41200000C0000000", "0x41200000C0C00000"],
"XMM6": ["0x41200000C0000000", "0x41200000C0C00000", "0x41200000C0000000", "0x41200000C0C00000"],
"XMM7": ["0x41200000C0000000", "0x41200000C0C00000", "0x41200000C0000000", "0x41200000C0C00000"]
}
}
%endif
@ -24,6 +23,13 @@ vaddsubps ymm3, ymm0, [rdx + 32]
vaddsubps xmm4, xmm0, xmm1
vaddsubps ymm5, ymm0, ymm1
; Aliasing source/destination vectors
vmovapd ymm6, [rdx]
vaddsubps ymm6, ymm6, ymm1
vmovapd ymm7, [rdx + 32]
vaddsubps ymm7, ymm0, ymm7
hlt
align 32

View File

@ -20,8 +20,37 @@
"fcadd v16.2d, v17.2d, v2.2d, #90"
]
},
"vaddsubpd ymm0, ymm0, ymm2": {
"ExpectedInstructionCount": 4,
"Optimal": "No",
"Comment": [
"Aliasing source and destination",
"Map 1 0b01 0xd0 256-bit"
],
"ExpectedArm64ASM": [
"movprfx z1, z18",
"ext z1.b, z1.b, z18.b, #8",
"mov z2.d, z1.d",
"fcadd z16.d, p7/m, z16.d, z2.d, #90"
]
},
"vaddsubpd ymm0, ymm1, ymm0": {
"ExpectedInstructionCount": 5,
"Optimal": "No",
"Comment": [
"Aliasing source and destination",
"Map 1 0b01 0xd0 256-bit"
],
"ExpectedArm64ASM": [
"movprfx z1, z16",
"ext z1.b, z1.b, z16.b, #8",
"mov z2.d, z1.d",
"movprfx z16, z17",
"fcadd z16.d, p7/m, z16.d, z2.d, #90"
]
},
"vaddsubpd ymm0, ymm1, ymm2": {
"ExpectedInstructionCount": 6,
"ExpectedInstructionCount": 5,
"Optimal": "No",
"Comment": [
"Map 1 0b01 0xd0 256-bit"
@ -30,9 +59,8 @@
"movprfx z1, z18",
"ext z1.b, z1.b, z18.b, #8",
"mov z2.d, z1.d",
"movprfx z0, z17",
"fcadd z0.d, p7/m, z0.d, z2.d, #90",
"mov z16.d, z0.d"
"movprfx z16, z17",
"fcadd z16.d, p7/m, z16.d, z2.d, #90"
]
},
"vaddsubps xmm0, xmm1, xmm2": {
@ -46,17 +74,41 @@
"fcadd v16.4s, v17.4s, v2.4s, #90"
]
},
"vaddsubps ymm0, ymm1, ymm0": {
"ExpectedInstructionCount": 3,
"Optimal": "No",
"Comment": [
"Aliasing source and destination",
"Map 1 0b11 0xd0 256-bit"
],
"ExpectedArm64ASM": [
"revw z2.d, p7/m, z16.d",
"movprfx z16, z17",
"fcadd z16.s, p7/m, z16.s, z2.s, #90"
]
},
"vaddsubps ymm0, ymm0, ymm2": {
"ExpectedInstructionCount": 2,
"Optimal": "No",
"Comment": [
"Aliasing source and destination",
"Map 1 0b11 0xd0 256-bit"
],
"ExpectedArm64ASM": [
"revw z2.d, p7/m, z18.d",
"fcadd z16.s, p7/m, z16.s, z2.s, #90"
]
},
"vaddsubps ymm0, ymm1, ymm2": {
"ExpectedInstructionCount": 4,
"ExpectedInstructionCount": 3,
"Optimal": "No",
"Comment": [
"Map 1 0b11 0xd0 256-bit"
],
"ExpectedArm64ASM": [
"revw z2.d, p7/m, z18.d",
"movprfx z0, z17",
"fcadd z0.s, p7/m, z0.s, z2.s, #90",
"mov z16.d, z0.d"
"movprfx z16, z17",
"fcadd z16.s, p7/m, z16.s, z2.s, #90"
]
}
}