Merge pull request #3825 from Sonicadvance1/scale_64bit_gather

AVX128: Prescale addresses in gathers if possible
This commit is contained in:
Ryan Houdek 2024-07-05 19:10:43 -07:00 committed by GitHub
commit 47d077ff22
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
3 changed files with 181 additions and 392 deletions

View File

@ -2603,6 +2603,19 @@ OpDispatchBuilder::RefPair OpDispatchBuilder::AVX128_VPGatherImpl(OpSize Size, O
BaseAddr = Invalid();
}
if (ElementLoadSize == OpSize::i64Bit && AddrElementSize == OpSize::i64Bit && (VSIB.Scale == 2 || VSIB.Scale == 4) &&
CTX->HostFeatures.SupportsSVE128) {
// SVE gather instructions don't support scaling their vector elements by anything other than 1 or the address element size.
// Pre-scale 64-bit addresses in the case that the scale doesn't match, in order to hit SVE code paths more frequently.
// Only hit this path if the host supports SVE. Otherwise it's a degradation for the ASIMD codepath.
VSIB.Low = _VShlI(OpSize::i128Bit, OpSize::i64Bit, VSIB.Low, FEXCore::ilog2(VSIB.Scale));
if (!Is128Bit) {
VSIB.High = _VShlI(OpSize::i128Bit, OpSize::i64Bit, VSIB.High, FEXCore::ilog2(VSIB.Scale));
}
///< Set the scale to one now that it has been prescaled.
VSIB.Scale = 1;
}
RefPair Result {};
///< Calculate the low-half.
Result.Low = _VLoadVectorGatherMasked(OpSize::i128Bit, ElementLoadSize, Dest.Low, Mask.Low, BaseAddr, VSIB.Low, VSIB.High,

View File

@ -867,22 +867,16 @@
]
},
"vpgatherqq xmm0, [xmm1*2 + rax], xmm2": {
"ExpectedInstructionCount": 15,
"ExpectedInstructionCount": 9,
"Comment": [
"Map 2 0b01 0x91 128-bit"
],
"ExpectedArm64ASM": [
"shl v2.2d, v17.2d, #1",
"mrs x20, nzcv",
"mov x0, v18.d[0]",
"tbz x0, #63, #+0x10",
"mov x0, v17.d[0]",
"add x1, x4, x0, lsl #1",
"ld1 {v16.d}[0], [x1]",
"mov x0, v18.d[1]",
"tbz x0, #63, #+0x10",
"mov x0, v17.d[1]",
"add x1, x4, x0, lsl #1",
"ld1 {v16.d}[1], [x1]",
"cmplt p0.d, p6/z, z18.d, #0",
"ld1d {z0.d}, p0/z, [x4, z2.d]",
"mov z16.d, p0/m, z0.d",
"movi v18.2d, #0x0",
"str q18, [x28, #16]",
"str q18, [x28, #48]",
@ -890,22 +884,16 @@
]
},
"vpgatherqq xmm0, [xmm1*4 + rax], xmm2": {
"ExpectedInstructionCount": 15,
"ExpectedInstructionCount": 9,
"Comment": [
"Map 2 0b01 0x91 128-bit"
],
"ExpectedArm64ASM": [
"shl v2.2d, v17.2d, #2",
"mrs x20, nzcv",
"mov x0, v18.d[0]",
"tbz x0, #63, #+0x10",
"mov x0, v17.d[0]",
"add x1, x4, x0, lsl #2",
"ld1 {v16.d}[0], [x1]",
"mov x0, v18.d[1]",
"tbz x0, #63, #+0x10",
"mov x0, v17.d[1]",
"add x1, x4, x0, lsl #2",
"ld1 {v16.d}[1], [x1]",
"cmplt p0.d, p6/z, z18.d, #0",
"ld1d {z0.d}, p0/z, [x4, z2.d]",
"mov z16.d, p0/m, z0.d",
"movi v18.2d, #0x0",
"str q18, [x28, #16]",
"str q18, [x28, #48]",
@ -951,7 +939,7 @@
]
},
"vpgatherqq ymm0, [ymm1*2 + rax], ymm2": {
"ExpectedInstructionCount": 28,
"ExpectedInstructionCount": 16,
"Comment": [
"Map 2 0b01 0x91 256-bit"
],
@ -959,27 +947,15 @@
"ldr q2, [x28, #16]",
"ldr q3, [x28, #32]",
"ldr q4, [x28, #48]",
"shl v5.2d, v17.2d, #1",
"shl v3.2d, v3.2d, #1",
"mrs x20, nzcv",
"mov x0, v18.d[0]",
"tbz x0, #63, #+0x10",
"mov x0, v17.d[0]",
"add x1, x4, x0, lsl #1",
"ld1 {v16.d}[0], [x1]",
"mov x0, v18.d[1]",
"tbz x0, #63, #+0x10",
"mov x0, v17.d[1]",
"add x1, x4, x0, lsl #1",
"ld1 {v16.d}[1], [x1]",
"mov x0, v4.d[0]",
"tbz x0, #63, #+0x10",
"mov x0, v3.d[0]",
"add x1, x4, x0, lsl #1",
"ld1 {v2.d}[0], [x1]",
"mov x0, v4.d[1]",
"tbz x0, #63, #+0x10",
"mov x0, v3.d[1]",
"add x1, x4, x0, lsl #1",
"ld1 {v2.d}[1], [x1]",
"cmplt p0.d, p6/z, z18.d, #0",
"ld1d {z0.d}, p0/z, [x4, z5.d]",
"mov z16.d, p0/m, z0.d",
"cmplt p0.d, p6/z, z4.d, #0",
"ld1d {z0.d}, p0/z, [x4, z3.d]",
"mov z2.d, p0/m, z0.d",
"str q2, [x28, #16]",
"movi v18.2d, #0x0",
"str q18, [x28, #48]",
@ -987,7 +963,7 @@
]
},
"vpgatherqq ymm0, [ymm1*4 + rax], ymm2": {
"ExpectedInstructionCount": 28,
"ExpectedInstructionCount": 16,
"Comment": [
"Map 2 0b01 0x91 256-bit"
],
@ -995,27 +971,15 @@
"ldr q2, [x28, #16]",
"ldr q3, [x28, #32]",
"ldr q4, [x28, #48]",
"shl v5.2d, v17.2d, #2",
"shl v3.2d, v3.2d, #2",
"mrs x20, nzcv",
"mov x0, v18.d[0]",
"tbz x0, #63, #+0x10",
"mov x0, v17.d[0]",
"add x1, x4, x0, lsl #2",
"ld1 {v16.d}[0], [x1]",
"mov x0, v18.d[1]",
"tbz x0, #63, #+0x10",
"mov x0, v17.d[1]",
"add x1, x4, x0, lsl #2",
"ld1 {v16.d}[1], [x1]",
"mov x0, v4.d[0]",
"tbz x0, #63, #+0x10",
"mov x0, v3.d[0]",
"add x1, x4, x0, lsl #2",
"ld1 {v2.d}[0], [x1]",
"mov x0, v4.d[1]",
"tbz x0, #63, #+0x10",
"mov x0, v3.d[1]",
"add x1, x4, x0, lsl #2",
"ld1 {v2.d}[1], [x1]",
"cmplt p0.d, p6/z, z18.d, #0",
"ld1d {z0.d}, p0/z, [x4, z5.d]",
"mov z16.d, p0/m, z0.d",
"cmplt p0.d, p6/z, z4.d, #0",
"ld1d {z0.d}, p0/z, [x4, z3.d]",
"mov z2.d, p0/m, z0.d",
"str q2, [x28, #16]",
"movi v18.2d, #0x0",
"str q18, [x28, #48]",
@ -1795,22 +1759,16 @@
]
},
"vgatherqpd xmm0, [xmm1*2 + rax], xmm2": {
"ExpectedInstructionCount": 15,
"ExpectedInstructionCount": 9,
"Comment": [
"Map 2 0b01 0x93 128-bit"
],
"ExpectedArm64ASM": [
"shl v2.2d, v17.2d, #1",
"mrs x20, nzcv",
"mov x0, v18.d[0]",
"tbz x0, #63, #+0x10",
"mov x0, v17.d[0]",
"add x1, x4, x0, lsl #1",
"ld1 {v16.d}[0], [x1]",
"mov x0, v18.d[1]",
"tbz x0, #63, #+0x10",
"mov x0, v17.d[1]",
"add x1, x4, x0, lsl #1",
"ld1 {v16.d}[1], [x1]",
"cmplt p0.d, p6/z, z18.d, #0",
"ld1d {z0.d}, p0/z, [x4, z2.d]",
"mov z16.d, p0/m, z0.d",
"movi v18.2d, #0x0",
"str q18, [x28, #16]",
"str q18, [x28, #48]",
@ -1818,22 +1776,16 @@
]
},
"vgatherqpd xmm0, [xmm1*4 + rax], xmm2": {
"ExpectedInstructionCount": 15,
"ExpectedInstructionCount": 9,
"Comment": [
"Map 2 0b01 0x93 128-bit"
],
"ExpectedArm64ASM": [
"shl v2.2d, v17.2d, #2",
"mrs x20, nzcv",
"mov x0, v18.d[0]",
"tbz x0, #63, #+0x10",
"mov x0, v17.d[0]",
"add x1, x4, x0, lsl #2",
"ld1 {v16.d}[0], [x1]",
"mov x0, v18.d[1]",
"tbz x0, #63, #+0x10",
"mov x0, v17.d[1]",
"add x1, x4, x0, lsl #2",
"ld1 {v16.d}[1], [x1]",
"cmplt p0.d, p6/z, z18.d, #0",
"ld1d {z0.d}, p0/z, [x4, z2.d]",
"mov z16.d, p0/m, z0.d",
"movi v18.2d, #0x0",
"str q18, [x28, #16]",
"str q18, [x28, #48]",
@ -1879,7 +1831,7 @@
]
},
"vgatherqpd ymm0, [ymm1*2 + rax], ymm2": {
"ExpectedInstructionCount": 28,
"ExpectedInstructionCount": 16,
"Comment": [
"Map 2 0b01 0x93 256-bit"
],
@ -1887,27 +1839,15 @@
"ldr q2, [x28, #16]",
"ldr q3, [x28, #32]",
"ldr q4, [x28, #48]",
"shl v5.2d, v17.2d, #1",
"shl v3.2d, v3.2d, #1",
"mrs x20, nzcv",
"mov x0, v18.d[0]",
"tbz x0, #63, #+0x10",
"mov x0, v17.d[0]",
"add x1, x4, x0, lsl #1",
"ld1 {v16.d}[0], [x1]",
"mov x0, v18.d[1]",
"tbz x0, #63, #+0x10",
"mov x0, v17.d[1]",
"add x1, x4, x0, lsl #1",
"ld1 {v16.d}[1], [x1]",
"mov x0, v4.d[0]",
"tbz x0, #63, #+0x10",
"mov x0, v3.d[0]",
"add x1, x4, x0, lsl #1",
"ld1 {v2.d}[0], [x1]",
"mov x0, v4.d[1]",
"tbz x0, #63, #+0x10",
"mov x0, v3.d[1]",
"add x1, x4, x0, lsl #1",
"ld1 {v2.d}[1], [x1]",
"cmplt p0.d, p6/z, z18.d, #0",
"ld1d {z0.d}, p0/z, [x4, z5.d]",
"mov z16.d, p0/m, z0.d",
"cmplt p0.d, p6/z, z4.d, #0",
"ld1d {z0.d}, p0/z, [x4, z3.d]",
"mov z2.d, p0/m, z0.d",
"str q2, [x28, #16]",
"movi v18.2d, #0x0",
"str q18, [x28, #48]",
@ -1915,7 +1855,7 @@
]
},
"vgatherqpd ymm0, [ymm1*4 + rax], ymm2": {
"ExpectedInstructionCount": 28,
"ExpectedInstructionCount": 16,
"Comment": [
"Map 2 0b01 0x93 256-bit"
],
@ -1923,27 +1863,15 @@
"ldr q2, [x28, #16]",
"ldr q3, [x28, #32]",
"ldr q4, [x28, #48]",
"shl v5.2d, v17.2d, #2",
"shl v3.2d, v3.2d, #2",
"mrs x20, nzcv",
"mov x0, v18.d[0]",
"tbz x0, #63, #+0x10",
"mov x0, v17.d[0]",
"add x1, x4, x0, lsl #2",
"ld1 {v16.d}[0], [x1]",
"mov x0, v18.d[1]",
"tbz x0, #63, #+0x10",
"mov x0, v17.d[1]",
"add x1, x4, x0, lsl #2",
"ld1 {v16.d}[1], [x1]",
"mov x0, v4.d[0]",
"tbz x0, #63, #+0x10",
"mov x0, v3.d[0]",
"add x1, x4, x0, lsl #2",
"ld1 {v2.d}[0], [x1]",
"mov x0, v4.d[1]",
"tbz x0, #63, #+0x10",
"mov x0, v3.d[1]",
"add x1, x4, x0, lsl #2",
"ld1 {v2.d}[1], [x1]",
"cmplt p0.d, p6/z, z18.d, #0",
"ld1d {z0.d}, p0/z, [x4, z5.d]",
"mov z16.d, p0/m, z0.d",
"cmplt p0.d, p6/z, z4.d, #0",
"ld1d {z0.d}, p0/z, [x4, z3.d]",
"mov z2.d, p0/m, z0.d",
"str q2, [x28, #16]",
"movi v18.2d, #0x0",
"str q18, [x28, #48]",
@ -2710,22 +2638,16 @@
]
},
"vpgatherqq xmm0, [xmm1*2], xmm2": {
"ExpectedInstructionCount": 15,
"ExpectedInstructionCount": 9,
"Comment": [
"Map 2 0b01 0x91 128-bit"
],
"ExpectedArm64ASM": [
"shl v2.2d, v17.2d, #1",
"mrs x20, nzcv",
"mov x0, v18.d[0]",
"tbz x0, #63, #+0x10",
"mov x0, v17.d[0]",
"lsl x1, x0, #1",
"ld1 {v16.d}[0], [x1]",
"mov x0, v18.d[1]",
"tbz x0, #63, #+0x10",
"mov x0, v17.d[1]",
"lsl x1, x0, #1",
"ld1 {v16.d}[1], [x1]",
"cmplt p0.d, p6/z, z18.d, #0",
"ld1d {z0.d}, p0/z, [z2.d]",
"mov z16.d, p0/m, z0.d",
"movi v18.2d, #0x0",
"str q18, [x28, #16]",
"str q18, [x28, #48]",
@ -2733,22 +2655,16 @@
]
},
"vpgatherqq xmm0, [xmm1*4], xmm2": {
"ExpectedInstructionCount": 15,
"ExpectedInstructionCount": 9,
"Comment": [
"Map 2 0b01 0x91 128-bit"
],
"ExpectedArm64ASM": [
"shl v2.2d, v17.2d, #2",
"mrs x20, nzcv",
"mov x0, v18.d[0]",
"tbz x0, #63, #+0x10",
"mov x0, v17.d[0]",
"lsl x1, x0, #2",
"ld1 {v16.d}[0], [x1]",
"mov x0, v18.d[1]",
"tbz x0, #63, #+0x10",
"mov x0, v17.d[1]",
"lsl x1, x0, #2",
"ld1 {v16.d}[1], [x1]",
"cmplt p0.d, p6/z, z18.d, #0",
"ld1d {z0.d}, p0/z, [z2.d]",
"mov z16.d, p0/m, z0.d",
"movi v18.2d, #0x0",
"str q18, [x28, #16]",
"str q18, [x28, #48]",
@ -2795,7 +2711,7 @@
]
},
"vpgatherqq ymm0, [ymm1*2], ymm2": {
"ExpectedInstructionCount": 28,
"ExpectedInstructionCount": 16,
"Comment": [
"Map 2 0b01 0x91 256-bit"
],
@ -2803,27 +2719,15 @@
"ldr q2, [x28, #16]",
"ldr q3, [x28, #32]",
"ldr q4, [x28, #48]",
"shl v5.2d, v17.2d, #1",
"shl v3.2d, v3.2d, #1",
"mrs x20, nzcv",
"mov x0, v18.d[0]",
"tbz x0, #63, #+0x10",
"mov x0, v17.d[0]",
"lsl x1, x0, #1",
"ld1 {v16.d}[0], [x1]",
"mov x0, v18.d[1]",
"tbz x0, #63, #+0x10",
"mov x0, v17.d[1]",
"lsl x1, x0, #1",
"ld1 {v16.d}[1], [x1]",
"mov x0, v4.d[0]",
"tbz x0, #63, #+0x10",
"mov x0, v3.d[0]",
"lsl x1, x0, #1",
"ld1 {v2.d}[0], [x1]",
"mov x0, v4.d[1]",
"tbz x0, #63, #+0x10",
"mov x0, v3.d[1]",
"lsl x1, x0, #1",
"ld1 {v2.d}[1], [x1]",
"cmplt p0.d, p6/z, z18.d, #0",
"ld1d {z0.d}, p0/z, [z5.d]",
"mov z16.d, p0/m, z0.d",
"cmplt p0.d, p6/z, z4.d, #0",
"ld1d {z0.d}, p0/z, [z3.d]",
"mov z2.d, p0/m, z0.d",
"str q2, [x28, #16]",
"movi v18.2d, #0x0",
"str q18, [x28, #48]",
@ -2831,7 +2735,7 @@
]
},
"vpgatherqq ymm0, [ymm1*4], ymm2": {
"ExpectedInstructionCount": 28,
"ExpectedInstructionCount": 16,
"Comment": [
"Map 2 0b01 0x91 256-bit"
],
@ -2839,27 +2743,15 @@
"ldr q2, [x28, #16]",
"ldr q3, [x28, #32]",
"ldr q4, [x28, #48]",
"shl v5.2d, v17.2d, #2",
"shl v3.2d, v3.2d, #2",
"mrs x20, nzcv",
"mov x0, v18.d[0]",
"tbz x0, #63, #+0x10",
"mov x0, v17.d[0]",
"lsl x1, x0, #2",
"ld1 {v16.d}[0], [x1]",
"mov x0, v18.d[1]",
"tbz x0, #63, #+0x10",
"mov x0, v17.d[1]",
"lsl x1, x0, #2",
"ld1 {v16.d}[1], [x1]",
"mov x0, v4.d[0]",
"tbz x0, #63, #+0x10",
"mov x0, v3.d[0]",
"lsl x1, x0, #2",
"ld1 {v2.d}[0], [x1]",
"mov x0, v4.d[1]",
"tbz x0, #63, #+0x10",
"mov x0, v3.d[1]",
"lsl x1, x0, #2",
"ld1 {v2.d}[1], [x1]",
"cmplt p0.d, p6/z, z18.d, #0",
"ld1d {z0.d}, p0/z, [z5.d]",
"mov z16.d, p0/m, z0.d",
"cmplt p0.d, p6/z, z4.d, #0",
"ld1d {z0.d}, p0/z, [z3.d]",
"mov z2.d, p0/m, z0.d",
"str q2, [x28, #16]",
"movi v18.2d, #0x0",
"str q18, [x28, #48]",
@ -3644,22 +3536,16 @@
]
},
"vgatherqpd xmm0, [xmm1*2], xmm2": {
"ExpectedInstructionCount": 15,
"ExpectedInstructionCount": 9,
"Comment": [
"Map 2 0b01 0x93 128-bit"
],
"ExpectedArm64ASM": [
"shl v2.2d, v17.2d, #1",
"mrs x20, nzcv",
"mov x0, v18.d[0]",
"tbz x0, #63, #+0x10",
"mov x0, v17.d[0]",
"lsl x1, x0, #1",
"ld1 {v16.d}[0], [x1]",
"mov x0, v18.d[1]",
"tbz x0, #63, #+0x10",
"mov x0, v17.d[1]",
"lsl x1, x0, #1",
"ld1 {v16.d}[1], [x1]",
"cmplt p0.d, p6/z, z18.d, #0",
"ld1d {z0.d}, p0/z, [z2.d]",
"mov z16.d, p0/m, z0.d",
"movi v18.2d, #0x0",
"str q18, [x28, #16]",
"str q18, [x28, #48]",
@ -3667,22 +3553,16 @@
]
},
"vgatherqpd xmm0, [xmm1*4], xmm2": {
"ExpectedInstructionCount": 15,
"ExpectedInstructionCount": 9,
"Comment": [
"Map 2 0b01 0x93 128-bit"
],
"ExpectedArm64ASM": [
"shl v2.2d, v17.2d, #2",
"mrs x20, nzcv",
"mov x0, v18.d[0]",
"tbz x0, #63, #+0x10",
"mov x0, v17.d[0]",
"lsl x1, x0, #2",
"ld1 {v16.d}[0], [x1]",
"mov x0, v18.d[1]",
"tbz x0, #63, #+0x10",
"mov x0, v17.d[1]",
"lsl x1, x0, #2",
"ld1 {v16.d}[1], [x1]",
"cmplt p0.d, p6/z, z18.d, #0",
"ld1d {z0.d}, p0/z, [z2.d]",
"mov z16.d, p0/m, z0.d",
"movi v18.2d, #0x0",
"str q18, [x28, #16]",
"str q18, [x28, #48]",
@ -3729,7 +3609,7 @@
]
},
"vgatherqpd ymm0, [ymm1*2], ymm2": {
"ExpectedInstructionCount": 28,
"ExpectedInstructionCount": 16,
"Comment": [
"Map 2 0b01 0x93 256-bit"
],
@ -3737,27 +3617,15 @@
"ldr q2, [x28, #16]",
"ldr q3, [x28, #32]",
"ldr q4, [x28, #48]",
"shl v5.2d, v17.2d, #1",
"shl v3.2d, v3.2d, #1",
"mrs x20, nzcv",
"mov x0, v18.d[0]",
"tbz x0, #63, #+0x10",
"mov x0, v17.d[0]",
"lsl x1, x0, #1",
"ld1 {v16.d}[0], [x1]",
"mov x0, v18.d[1]",
"tbz x0, #63, #+0x10",
"mov x0, v17.d[1]",
"lsl x1, x0, #1",
"ld1 {v16.d}[1], [x1]",
"mov x0, v4.d[0]",
"tbz x0, #63, #+0x10",
"mov x0, v3.d[0]",
"lsl x1, x0, #1",
"ld1 {v2.d}[0], [x1]",
"mov x0, v4.d[1]",
"tbz x0, #63, #+0x10",
"mov x0, v3.d[1]",
"lsl x1, x0, #1",
"ld1 {v2.d}[1], [x1]",
"cmplt p0.d, p6/z, z18.d, #0",
"ld1d {z0.d}, p0/z, [z5.d]",
"mov z16.d, p0/m, z0.d",
"cmplt p0.d, p6/z, z4.d, #0",
"ld1d {z0.d}, p0/z, [z3.d]",
"mov z2.d, p0/m, z0.d",
"str q2, [x28, #16]",
"movi v18.2d, #0x0",
"str q18, [x28, #48]",
@ -3765,7 +3633,7 @@
]
},
"vgatherqpd ymm0, [ymm1*4], ymm2": {
"ExpectedInstructionCount": 28,
"ExpectedInstructionCount": 16,
"Comment": [
"Map 2 0b01 0x93 256-bit"
],
@ -3773,27 +3641,15 @@
"ldr q2, [x28, #16]",
"ldr q3, [x28, #32]",
"ldr q4, [x28, #48]",
"shl v5.2d, v17.2d, #2",
"shl v3.2d, v3.2d, #2",
"mrs x20, nzcv",
"mov x0, v18.d[0]",
"tbz x0, #63, #+0x10",
"mov x0, v17.d[0]",
"lsl x1, x0, #2",
"ld1 {v16.d}[0], [x1]",
"mov x0, v18.d[1]",
"tbz x0, #63, #+0x10",
"mov x0, v17.d[1]",
"lsl x1, x0, #2",
"ld1 {v16.d}[1], [x1]",
"mov x0, v4.d[0]",
"tbz x0, #63, #+0x10",
"mov x0, v3.d[0]",
"lsl x1, x0, #2",
"ld1 {v2.d}[0], [x1]",
"mov x0, v4.d[1]",
"tbz x0, #63, #+0x10",
"mov x0, v3.d[1]",
"lsl x1, x0, #2",
"ld1 {v2.d}[1], [x1]",
"cmplt p0.d, p6/z, z18.d, #0",
"ld1d {z0.d}, p0/z, [z5.d]",
"mov z16.d, p0/m, z0.d",
"cmplt p0.d, p6/z, z4.d, #0",
"ld1d {z0.d}, p0/z, [z3.d]",
"mov z2.d, p0/m, z0.d",
"str q2, [x28, #16]",
"movi v18.2d, #0x0",
"str q18, [x28, #48]",

View File

@ -2810,23 +2810,16 @@
]
},
"vpgatherqq xmm0, [xmm1*2 + rax], xmm2": {
"ExpectedInstructionCount": 18,
"ExpectedInstructionCount": 11,
"Comment": [
"Map 2 0b01 0x91 128-bit"
],
"ExpectedArm64ASM": [
"shl v2.2d, v17.2d, #1",
"mrs x20, nzcv",
"mov v2.16b, v16.16b",
"mov x0, v18.d[0]",
"tbz x0, #63, #+0x10",
"mov x0, v17.d[0]",
"add x1, x4, x0, lsl #1",
"ld1 {v2.d}[0], [x1]",
"mov x0, v18.d[1]",
"tbz x0, #63, #+0x10",
"mov x0, v17.d[1]",
"add x1, x4, x0, lsl #1",
"ld1 {v2.d}[1], [x1]",
"cmplt p0.d, p6/z, z18.d, #0",
"ld1d {z0.d}, p0/z, [x4, z2.d]",
"sel z2.d, p0, z0.d, z16.d",
"movi v18.2d, #0x0",
"mov z1.q, q18",
"not p0.b, p7/z, p6.b",
@ -2836,23 +2829,16 @@
]
},
"vpgatherqq xmm0, [xmm1*4 + rax], xmm2": {
"ExpectedInstructionCount": 18,
"ExpectedInstructionCount": 11,
"Comment": [
"Map 2 0b01 0x91 128-bit"
],
"ExpectedArm64ASM": [
"shl v2.2d, v17.2d, #2",
"mrs x20, nzcv",
"mov v2.16b, v16.16b",
"mov x0, v18.d[0]",
"tbz x0, #63, #+0x10",
"mov x0, v17.d[0]",
"add x1, x4, x0, lsl #2",
"ld1 {v2.d}[0], [x1]",
"mov x0, v18.d[1]",
"tbz x0, #63, #+0x10",
"mov x0, v17.d[1]",
"add x1, x4, x0, lsl #2",
"ld1 {v2.d}[1], [x1]",
"cmplt p0.d, p6/z, z18.d, #0",
"ld1d {z0.d}, p0/z, [x4, z2.d]",
"sel z2.d, p0, z0.d, z16.d",
"movi v18.2d, #0x0",
"mov z1.q, q18",
"not p0.b, p7/z, p6.b",
@ -2891,7 +2877,7 @@
]
},
"vpgatherqq ymm0, [ymm1*2 + rax], ymm2": {
"ExpectedInstructionCount": 31,
"ExpectedInstructionCount": 18,
"Comment": [
"Map 2 0b01 0x91 256-bit"
],
@ -2899,28 +2885,15 @@
"mov z2.q, z16.q[1]",
"mov z3.q, z18.q[1]",
"mov z4.q, z17.q[1]",
"shl v5.2d, v17.2d, #1",
"shl v4.2d, v4.2d, #1",
"mrs x20, nzcv",
"mov v5.16b, v16.16b",
"mov x0, v18.d[0]",
"tbz x0, #63, #+0x10",
"mov x0, v17.d[0]",
"add x1, x4, x0, lsl #1",
"ld1 {v5.d}[0], [x1]",
"mov x0, v18.d[1]",
"tbz x0, #63, #+0x10",
"mov x0, v17.d[1]",
"add x1, x4, x0, lsl #1",
"ld1 {v5.d}[1], [x1]",
"mov x0, v3.d[0]",
"tbz x0, #63, #+0x10",
"mov x0, v4.d[0]",
"add x1, x4, x0, lsl #1",
"ld1 {v2.d}[0], [x1]",
"mov x0, v3.d[1]",
"tbz x0, #63, #+0x10",
"mov x0, v4.d[1]",
"add x1, x4, x0, lsl #1",
"ld1 {v2.d}[1], [x1]",
"cmplt p0.d, p6/z, z18.d, #0",
"ld1d {z0.d}, p0/z, [x4, z5.d]",
"sel z5.d, p0, z0.d, z16.d",
"cmplt p0.d, p6/z, z3.d, #0",
"ld1d {z0.d}, p0/z, [x4, z4.d]",
"mov z2.d, p0/m, z0.d",
"mov z1.q, q2",
"mov z16.d, z5.d",
"not p0.b, p7/z, p6.b",
@ -2930,7 +2903,7 @@
]
},
"vpgatherqq ymm0, [ymm1*4 + rax], ymm2": {
"ExpectedInstructionCount": 31,
"ExpectedInstructionCount": 18,
"Comment": [
"Map 2 0b01 0x91 256-bit"
],
@ -2938,28 +2911,15 @@
"mov z2.q, z16.q[1]",
"mov z3.q, z18.q[1]",
"mov z4.q, z17.q[1]",
"shl v5.2d, v17.2d, #2",
"shl v4.2d, v4.2d, #2",
"mrs x20, nzcv",
"mov v5.16b, v16.16b",
"mov x0, v18.d[0]",
"tbz x0, #63, #+0x10",
"mov x0, v17.d[0]",
"add x1, x4, x0, lsl #2",
"ld1 {v5.d}[0], [x1]",
"mov x0, v18.d[1]",
"tbz x0, #63, #+0x10",
"mov x0, v17.d[1]",
"add x1, x4, x0, lsl #2",
"ld1 {v5.d}[1], [x1]",
"mov x0, v3.d[0]",
"tbz x0, #63, #+0x10",
"mov x0, v4.d[0]",
"add x1, x4, x0, lsl #2",
"ld1 {v2.d}[0], [x1]",
"mov x0, v3.d[1]",
"tbz x0, #63, #+0x10",
"mov x0, v4.d[1]",
"add x1, x4, x0, lsl #2",
"ld1 {v2.d}[1], [x1]",
"cmplt p0.d, p6/z, z18.d, #0",
"ld1d {z0.d}, p0/z, [x4, z5.d]",
"sel z5.d, p0, z0.d, z16.d",
"cmplt p0.d, p6/z, z3.d, #0",
"ld1d {z0.d}, p0/z, [x4, z4.d]",
"mov z2.d, p0/m, z0.d",
"mov z1.q, q2",
"mov z16.d, z5.d",
"not p0.b, p7/z, p6.b",
@ -3806,23 +3766,16 @@
]
},
"vgatherqpd xmm0, [xmm1*2 + rax], xmm2": {
"ExpectedInstructionCount": 18,
"ExpectedInstructionCount": 11,
"Comment": [
"Map 2 0b01 0x93 128-bit"
],
"ExpectedArm64ASM": [
"shl v2.2d, v17.2d, #1",
"mrs x20, nzcv",
"mov v2.16b, v16.16b",
"mov x0, v18.d[0]",
"tbz x0, #63, #+0x10",
"mov x0, v17.d[0]",
"add x1, x4, x0, lsl #1",
"ld1 {v2.d}[0], [x1]",
"mov x0, v18.d[1]",
"tbz x0, #63, #+0x10",
"mov x0, v17.d[1]",
"add x1, x4, x0, lsl #1",
"ld1 {v2.d}[1], [x1]",
"cmplt p0.d, p6/z, z18.d, #0",
"ld1d {z0.d}, p0/z, [x4, z2.d]",
"sel z2.d, p0, z0.d, z16.d",
"movi v18.2d, #0x0",
"mov z1.q, q18",
"not p0.b, p7/z, p6.b",
@ -3832,23 +3785,16 @@
]
},
"vgatherqpd xmm0, [xmm1*4 + rax], xmm2": {
"ExpectedInstructionCount": 18,
"ExpectedInstructionCount": 11,
"Comment": [
"Map 2 0b01 0x93 128-bit"
],
"ExpectedArm64ASM": [
"shl v2.2d, v17.2d, #2",
"mrs x20, nzcv",
"mov v2.16b, v16.16b",
"mov x0, v18.d[0]",
"tbz x0, #63, #+0x10",
"mov x0, v17.d[0]",
"add x1, x4, x0, lsl #2",
"ld1 {v2.d}[0], [x1]",
"mov x0, v18.d[1]",
"tbz x0, #63, #+0x10",
"mov x0, v17.d[1]",
"add x1, x4, x0, lsl #2",
"ld1 {v2.d}[1], [x1]",
"cmplt p0.d, p6/z, z18.d, #0",
"ld1d {z0.d}, p0/z, [x4, z2.d]",
"sel z2.d, p0, z0.d, z16.d",
"movi v18.2d, #0x0",
"mov z1.q, q18",
"not p0.b, p7/z, p6.b",
@ -3887,7 +3833,7 @@
]
},
"vgatherqpd ymm0, [ymm1*2 + rax], ymm2": {
"ExpectedInstructionCount": 31,
"ExpectedInstructionCount": 18,
"Comment": [
"Map 2 0b01 0x93 256-bit"
],
@ -3895,28 +3841,15 @@
"mov z2.q, z16.q[1]",
"mov z3.q, z18.q[1]",
"mov z4.q, z17.q[1]",
"shl v5.2d, v17.2d, #1",
"shl v4.2d, v4.2d, #1",
"mrs x20, nzcv",
"mov v5.16b, v16.16b",
"mov x0, v18.d[0]",
"tbz x0, #63, #+0x10",
"mov x0, v17.d[0]",
"add x1, x4, x0, lsl #1",
"ld1 {v5.d}[0], [x1]",
"mov x0, v18.d[1]",
"tbz x0, #63, #+0x10",
"mov x0, v17.d[1]",
"add x1, x4, x0, lsl #1",
"ld1 {v5.d}[1], [x1]",
"mov x0, v3.d[0]",
"tbz x0, #63, #+0x10",
"mov x0, v4.d[0]",
"add x1, x4, x0, lsl #1",
"ld1 {v2.d}[0], [x1]",
"mov x0, v3.d[1]",
"tbz x0, #63, #+0x10",
"mov x0, v4.d[1]",
"add x1, x4, x0, lsl #1",
"ld1 {v2.d}[1], [x1]",
"cmplt p0.d, p6/z, z18.d, #0",
"ld1d {z0.d}, p0/z, [x4, z5.d]",
"sel z5.d, p0, z0.d, z16.d",
"cmplt p0.d, p6/z, z3.d, #0",
"ld1d {z0.d}, p0/z, [x4, z4.d]",
"mov z2.d, p0/m, z0.d",
"mov z1.q, q2",
"mov z16.d, z5.d",
"not p0.b, p7/z, p6.b",
@ -3926,7 +3859,7 @@
]
},
"vgatherqpd ymm0, [ymm1*4 + rax], ymm2": {
"ExpectedInstructionCount": 31,
"ExpectedInstructionCount": 18,
"Comment": [
"Map 2 0b01 0x93 256-bit"
],
@ -3934,28 +3867,15 @@
"mov z2.q, z16.q[1]",
"mov z3.q, z18.q[1]",
"mov z4.q, z17.q[1]",
"shl v5.2d, v17.2d, #2",
"shl v4.2d, v4.2d, #2",
"mrs x20, nzcv",
"mov v5.16b, v16.16b",
"mov x0, v18.d[0]",
"tbz x0, #63, #+0x10",
"mov x0, v17.d[0]",
"add x1, x4, x0, lsl #2",
"ld1 {v5.d}[0], [x1]",
"mov x0, v18.d[1]",
"tbz x0, #63, #+0x10",
"mov x0, v17.d[1]",
"add x1, x4, x0, lsl #2",
"ld1 {v5.d}[1], [x1]",
"mov x0, v3.d[0]",
"tbz x0, #63, #+0x10",
"mov x0, v4.d[0]",
"add x1, x4, x0, lsl #2",
"ld1 {v2.d}[0], [x1]",
"mov x0, v3.d[1]",
"tbz x0, #63, #+0x10",
"mov x0, v4.d[1]",
"add x1, x4, x0, lsl #2",
"ld1 {v2.d}[1], [x1]",
"cmplt p0.d, p6/z, z18.d, #0",
"ld1d {z0.d}, p0/z, [x4, z5.d]",
"sel z5.d, p0, z0.d, z16.d",
"cmplt p0.d, p6/z, z3.d, #0",
"ld1d {z0.d}, p0/z, [x4, z4.d]",
"mov z2.d, p0/m, z0.d",
"mov z1.q, q2",
"mov z16.d, z5.d",
"not p0.b, p7/z, p6.b",