mirror of
https://github.com/FEX-Emu/FEX.git
synced 2024-12-04 20:46:24 +00:00
Merge pull request #3825 from Sonicadvance1/scale_64bit_gather
AVX128: Prescale addresses in gathers if possible
This commit is contained in:
commit
47d077ff22
@ -2603,6 +2603,19 @@ OpDispatchBuilder::RefPair OpDispatchBuilder::AVX128_VPGatherImpl(OpSize Size, O
|
||||
BaseAddr = Invalid();
|
||||
}
|
||||
|
||||
if (ElementLoadSize == OpSize::i64Bit && AddrElementSize == OpSize::i64Bit && (VSIB.Scale == 2 || VSIB.Scale == 4) &&
|
||||
CTX->HostFeatures.SupportsSVE128) {
|
||||
// SVE gather instructions don't support scaling their vector elements by anything other than 1 or the address element size.
|
||||
// Pre-scale 64-bit addresses in the case that scale doesn't match in order to hit SVE code paths more frequently.
|
||||
// Only hit this path if the host supports SVE. Otherwise it's a degradation for the ASIMD codepath.
|
||||
VSIB.Low = _VShlI(OpSize::i128Bit, OpSize::i64Bit, VSIB.Low, FEXCore::ilog2(VSIB.Scale));
|
||||
if (!Is128Bit) {
|
||||
VSIB.High = _VShlI(OpSize::i128Bit, OpSize::i64Bit, VSIB.High, FEXCore::ilog2(VSIB.Scale));
|
||||
}
|
||||
///< Set the scale to one now that it has been prescaled.
|
||||
VSIB.Scale = 1;
|
||||
}
|
||||
|
||||
RefPair Result {};
|
||||
///< Calculate the low-half.
|
||||
Result.Low = _VLoadVectorGatherMasked(OpSize::i128Bit, ElementLoadSize, Dest.Low, Mask.Low, BaseAddr, VSIB.Low, VSIB.High,
|
||||
|
@ -867,22 +867,16 @@
|
||||
]
|
||||
},
|
||||
"vpgatherqq xmm0, [xmm1*2 + rax], xmm2": {
|
||||
"ExpectedInstructionCount": 15,
|
||||
"ExpectedInstructionCount": 9,
|
||||
"Comment": [
|
||||
"Map 2 0b01 0x91 128-bit"
|
||||
],
|
||||
"ExpectedArm64ASM": [
|
||||
"shl v2.2d, v17.2d, #1",
|
||||
"mrs x20, nzcv",
|
||||
"mov x0, v18.d[0]",
|
||||
"tbz x0, #63, #+0x10",
|
||||
"mov x0, v17.d[0]",
|
||||
"add x1, x4, x0, lsl #1",
|
||||
"ld1 {v16.d}[0], [x1]",
|
||||
"mov x0, v18.d[1]",
|
||||
"tbz x0, #63, #+0x10",
|
||||
"mov x0, v17.d[1]",
|
||||
"add x1, x4, x0, lsl #1",
|
||||
"ld1 {v16.d}[1], [x1]",
|
||||
"cmplt p0.d, p6/z, z18.d, #0",
|
||||
"ld1d {z0.d}, p0/z, [x4, z2.d]",
|
||||
"mov z16.d, p0/m, z0.d",
|
||||
"movi v18.2d, #0x0",
|
||||
"str q18, [x28, #16]",
|
||||
"str q18, [x28, #48]",
|
||||
@ -890,22 +884,16 @@
|
||||
]
|
||||
},
|
||||
"vpgatherqq xmm0, [xmm1*4 + rax], xmm2": {
|
||||
"ExpectedInstructionCount": 15,
|
||||
"ExpectedInstructionCount": 9,
|
||||
"Comment": [
|
||||
"Map 2 0b01 0x91 128-bit"
|
||||
],
|
||||
"ExpectedArm64ASM": [
|
||||
"shl v2.2d, v17.2d, #2",
|
||||
"mrs x20, nzcv",
|
||||
"mov x0, v18.d[0]",
|
||||
"tbz x0, #63, #+0x10",
|
||||
"mov x0, v17.d[0]",
|
||||
"add x1, x4, x0, lsl #2",
|
||||
"ld1 {v16.d}[0], [x1]",
|
||||
"mov x0, v18.d[1]",
|
||||
"tbz x0, #63, #+0x10",
|
||||
"mov x0, v17.d[1]",
|
||||
"add x1, x4, x0, lsl #2",
|
||||
"ld1 {v16.d}[1], [x1]",
|
||||
"cmplt p0.d, p6/z, z18.d, #0",
|
||||
"ld1d {z0.d}, p0/z, [x4, z2.d]",
|
||||
"mov z16.d, p0/m, z0.d",
|
||||
"movi v18.2d, #0x0",
|
||||
"str q18, [x28, #16]",
|
||||
"str q18, [x28, #48]",
|
||||
@ -951,7 +939,7 @@
|
||||
]
|
||||
},
|
||||
"vpgatherqq ymm0, [ymm1*2 + rax], ymm2": {
|
||||
"ExpectedInstructionCount": 28,
|
||||
"ExpectedInstructionCount": 16,
|
||||
"Comment": [
|
||||
"Map 2 0b01 0x91 256-bit"
|
||||
],
|
||||
@ -959,27 +947,15 @@
|
||||
"ldr q2, [x28, #16]",
|
||||
"ldr q3, [x28, #32]",
|
||||
"ldr q4, [x28, #48]",
|
||||
"shl v5.2d, v17.2d, #1",
|
||||
"shl v3.2d, v3.2d, #1",
|
||||
"mrs x20, nzcv",
|
||||
"mov x0, v18.d[0]",
|
||||
"tbz x0, #63, #+0x10",
|
||||
"mov x0, v17.d[0]",
|
||||
"add x1, x4, x0, lsl #1",
|
||||
"ld1 {v16.d}[0], [x1]",
|
||||
"mov x0, v18.d[1]",
|
||||
"tbz x0, #63, #+0x10",
|
||||
"mov x0, v17.d[1]",
|
||||
"add x1, x4, x0, lsl #1",
|
||||
"ld1 {v16.d}[1], [x1]",
|
||||
"mov x0, v4.d[0]",
|
||||
"tbz x0, #63, #+0x10",
|
||||
"mov x0, v3.d[0]",
|
||||
"add x1, x4, x0, lsl #1",
|
||||
"ld1 {v2.d}[0], [x1]",
|
||||
"mov x0, v4.d[1]",
|
||||
"tbz x0, #63, #+0x10",
|
||||
"mov x0, v3.d[1]",
|
||||
"add x1, x4, x0, lsl #1",
|
||||
"ld1 {v2.d}[1], [x1]",
|
||||
"cmplt p0.d, p6/z, z18.d, #0",
|
||||
"ld1d {z0.d}, p0/z, [x4, z5.d]",
|
||||
"mov z16.d, p0/m, z0.d",
|
||||
"cmplt p0.d, p6/z, z4.d, #0",
|
||||
"ld1d {z0.d}, p0/z, [x4, z3.d]",
|
||||
"mov z2.d, p0/m, z0.d",
|
||||
"str q2, [x28, #16]",
|
||||
"movi v18.2d, #0x0",
|
||||
"str q18, [x28, #48]",
|
||||
@ -987,7 +963,7 @@
|
||||
]
|
||||
},
|
||||
"vpgatherqq ymm0, [ymm1*4 + rax], ymm2": {
|
||||
"ExpectedInstructionCount": 28,
|
||||
"ExpectedInstructionCount": 16,
|
||||
"Comment": [
|
||||
"Map 2 0b01 0x91 256-bit"
|
||||
],
|
||||
@ -995,27 +971,15 @@
|
||||
"ldr q2, [x28, #16]",
|
||||
"ldr q3, [x28, #32]",
|
||||
"ldr q4, [x28, #48]",
|
||||
"shl v5.2d, v17.2d, #2",
|
||||
"shl v3.2d, v3.2d, #2",
|
||||
"mrs x20, nzcv",
|
||||
"mov x0, v18.d[0]",
|
||||
"tbz x0, #63, #+0x10",
|
||||
"mov x0, v17.d[0]",
|
||||
"add x1, x4, x0, lsl #2",
|
||||
"ld1 {v16.d}[0], [x1]",
|
||||
"mov x0, v18.d[1]",
|
||||
"tbz x0, #63, #+0x10",
|
||||
"mov x0, v17.d[1]",
|
||||
"add x1, x4, x0, lsl #2",
|
||||
"ld1 {v16.d}[1], [x1]",
|
||||
"mov x0, v4.d[0]",
|
||||
"tbz x0, #63, #+0x10",
|
||||
"mov x0, v3.d[0]",
|
||||
"add x1, x4, x0, lsl #2",
|
||||
"ld1 {v2.d}[0], [x1]",
|
||||
"mov x0, v4.d[1]",
|
||||
"tbz x0, #63, #+0x10",
|
||||
"mov x0, v3.d[1]",
|
||||
"add x1, x4, x0, lsl #2",
|
||||
"ld1 {v2.d}[1], [x1]",
|
||||
"cmplt p0.d, p6/z, z18.d, #0",
|
||||
"ld1d {z0.d}, p0/z, [x4, z5.d]",
|
||||
"mov z16.d, p0/m, z0.d",
|
||||
"cmplt p0.d, p6/z, z4.d, #0",
|
||||
"ld1d {z0.d}, p0/z, [x4, z3.d]",
|
||||
"mov z2.d, p0/m, z0.d",
|
||||
"str q2, [x28, #16]",
|
||||
"movi v18.2d, #0x0",
|
||||
"str q18, [x28, #48]",
|
||||
@ -1795,22 +1759,16 @@
|
||||
]
|
||||
},
|
||||
"vgatherqpd xmm0, [xmm1*2 + rax], xmm2": {
|
||||
"ExpectedInstructionCount": 15,
|
||||
"ExpectedInstructionCount": 9,
|
||||
"Comment": [
|
||||
"Map 2 0b01 0x93 128-bit"
|
||||
],
|
||||
"ExpectedArm64ASM": [
|
||||
"shl v2.2d, v17.2d, #1",
|
||||
"mrs x20, nzcv",
|
||||
"mov x0, v18.d[0]",
|
||||
"tbz x0, #63, #+0x10",
|
||||
"mov x0, v17.d[0]",
|
||||
"add x1, x4, x0, lsl #1",
|
||||
"ld1 {v16.d}[0], [x1]",
|
||||
"mov x0, v18.d[1]",
|
||||
"tbz x0, #63, #+0x10",
|
||||
"mov x0, v17.d[1]",
|
||||
"add x1, x4, x0, lsl #1",
|
||||
"ld1 {v16.d}[1], [x1]",
|
||||
"cmplt p0.d, p6/z, z18.d, #0",
|
||||
"ld1d {z0.d}, p0/z, [x4, z2.d]",
|
||||
"mov z16.d, p0/m, z0.d",
|
||||
"movi v18.2d, #0x0",
|
||||
"str q18, [x28, #16]",
|
||||
"str q18, [x28, #48]",
|
||||
@ -1818,22 +1776,16 @@
|
||||
]
|
||||
},
|
||||
"vgatherqpd xmm0, [xmm1*4 + rax], xmm2": {
|
||||
"ExpectedInstructionCount": 15,
|
||||
"ExpectedInstructionCount": 9,
|
||||
"Comment": [
|
||||
"Map 2 0b01 0x93 128-bit"
|
||||
],
|
||||
"ExpectedArm64ASM": [
|
||||
"shl v2.2d, v17.2d, #2",
|
||||
"mrs x20, nzcv",
|
||||
"mov x0, v18.d[0]",
|
||||
"tbz x0, #63, #+0x10",
|
||||
"mov x0, v17.d[0]",
|
||||
"add x1, x4, x0, lsl #2",
|
||||
"ld1 {v16.d}[0], [x1]",
|
||||
"mov x0, v18.d[1]",
|
||||
"tbz x0, #63, #+0x10",
|
||||
"mov x0, v17.d[1]",
|
||||
"add x1, x4, x0, lsl #2",
|
||||
"ld1 {v16.d}[1], [x1]",
|
||||
"cmplt p0.d, p6/z, z18.d, #0",
|
||||
"ld1d {z0.d}, p0/z, [x4, z2.d]",
|
||||
"mov z16.d, p0/m, z0.d",
|
||||
"movi v18.2d, #0x0",
|
||||
"str q18, [x28, #16]",
|
||||
"str q18, [x28, #48]",
|
||||
@ -1879,7 +1831,7 @@
|
||||
]
|
||||
},
|
||||
"vgatherqpd ymm0, [ymm1*2 + rax], ymm2": {
|
||||
"ExpectedInstructionCount": 28,
|
||||
"ExpectedInstructionCount": 16,
|
||||
"Comment": [
|
||||
"Map 2 0b01 0x93 256-bit"
|
||||
],
|
||||
@ -1887,27 +1839,15 @@
|
||||
"ldr q2, [x28, #16]",
|
||||
"ldr q3, [x28, #32]",
|
||||
"ldr q4, [x28, #48]",
|
||||
"shl v5.2d, v17.2d, #1",
|
||||
"shl v3.2d, v3.2d, #1",
|
||||
"mrs x20, nzcv",
|
||||
"mov x0, v18.d[0]",
|
||||
"tbz x0, #63, #+0x10",
|
||||
"mov x0, v17.d[0]",
|
||||
"add x1, x4, x0, lsl #1",
|
||||
"ld1 {v16.d}[0], [x1]",
|
||||
"mov x0, v18.d[1]",
|
||||
"tbz x0, #63, #+0x10",
|
||||
"mov x0, v17.d[1]",
|
||||
"add x1, x4, x0, lsl #1",
|
||||
"ld1 {v16.d}[1], [x1]",
|
||||
"mov x0, v4.d[0]",
|
||||
"tbz x0, #63, #+0x10",
|
||||
"mov x0, v3.d[0]",
|
||||
"add x1, x4, x0, lsl #1",
|
||||
"ld1 {v2.d}[0], [x1]",
|
||||
"mov x0, v4.d[1]",
|
||||
"tbz x0, #63, #+0x10",
|
||||
"mov x0, v3.d[1]",
|
||||
"add x1, x4, x0, lsl #1",
|
||||
"ld1 {v2.d}[1], [x1]",
|
||||
"cmplt p0.d, p6/z, z18.d, #0",
|
||||
"ld1d {z0.d}, p0/z, [x4, z5.d]",
|
||||
"mov z16.d, p0/m, z0.d",
|
||||
"cmplt p0.d, p6/z, z4.d, #0",
|
||||
"ld1d {z0.d}, p0/z, [x4, z3.d]",
|
||||
"mov z2.d, p0/m, z0.d",
|
||||
"str q2, [x28, #16]",
|
||||
"movi v18.2d, #0x0",
|
||||
"str q18, [x28, #48]",
|
||||
@ -1915,7 +1855,7 @@
|
||||
]
|
||||
},
|
||||
"vgatherqpd ymm0, [ymm1*4 + rax], ymm2": {
|
||||
"ExpectedInstructionCount": 28,
|
||||
"ExpectedInstructionCount": 16,
|
||||
"Comment": [
|
||||
"Map 2 0b01 0x93 256-bit"
|
||||
],
|
||||
@ -1923,27 +1863,15 @@
|
||||
"ldr q2, [x28, #16]",
|
||||
"ldr q3, [x28, #32]",
|
||||
"ldr q4, [x28, #48]",
|
||||
"shl v5.2d, v17.2d, #2",
|
||||
"shl v3.2d, v3.2d, #2",
|
||||
"mrs x20, nzcv",
|
||||
"mov x0, v18.d[0]",
|
||||
"tbz x0, #63, #+0x10",
|
||||
"mov x0, v17.d[0]",
|
||||
"add x1, x4, x0, lsl #2",
|
||||
"ld1 {v16.d}[0], [x1]",
|
||||
"mov x0, v18.d[1]",
|
||||
"tbz x0, #63, #+0x10",
|
||||
"mov x0, v17.d[1]",
|
||||
"add x1, x4, x0, lsl #2",
|
||||
"ld1 {v16.d}[1], [x1]",
|
||||
"mov x0, v4.d[0]",
|
||||
"tbz x0, #63, #+0x10",
|
||||
"mov x0, v3.d[0]",
|
||||
"add x1, x4, x0, lsl #2",
|
||||
"ld1 {v2.d}[0], [x1]",
|
||||
"mov x0, v4.d[1]",
|
||||
"tbz x0, #63, #+0x10",
|
||||
"mov x0, v3.d[1]",
|
||||
"add x1, x4, x0, lsl #2",
|
||||
"ld1 {v2.d}[1], [x1]",
|
||||
"cmplt p0.d, p6/z, z18.d, #0",
|
||||
"ld1d {z0.d}, p0/z, [x4, z5.d]",
|
||||
"mov z16.d, p0/m, z0.d",
|
||||
"cmplt p0.d, p6/z, z4.d, #0",
|
||||
"ld1d {z0.d}, p0/z, [x4, z3.d]",
|
||||
"mov z2.d, p0/m, z0.d",
|
||||
"str q2, [x28, #16]",
|
||||
"movi v18.2d, #0x0",
|
||||
"str q18, [x28, #48]",
|
||||
@ -2710,22 +2638,16 @@
|
||||
]
|
||||
},
|
||||
"vpgatherqq xmm0, [xmm1*2], xmm2": {
|
||||
"ExpectedInstructionCount": 15,
|
||||
"ExpectedInstructionCount": 9,
|
||||
"Comment": [
|
||||
"Map 2 0b01 0x91 128-bit"
|
||||
],
|
||||
"ExpectedArm64ASM": [
|
||||
"shl v2.2d, v17.2d, #1",
|
||||
"mrs x20, nzcv",
|
||||
"mov x0, v18.d[0]",
|
||||
"tbz x0, #63, #+0x10",
|
||||
"mov x0, v17.d[0]",
|
||||
"lsl x1, x0, #1",
|
||||
"ld1 {v16.d}[0], [x1]",
|
||||
"mov x0, v18.d[1]",
|
||||
"tbz x0, #63, #+0x10",
|
||||
"mov x0, v17.d[1]",
|
||||
"lsl x1, x0, #1",
|
||||
"ld1 {v16.d}[1], [x1]",
|
||||
"cmplt p0.d, p6/z, z18.d, #0",
|
||||
"ld1d {z0.d}, p0/z, [z2.d]",
|
||||
"mov z16.d, p0/m, z0.d",
|
||||
"movi v18.2d, #0x0",
|
||||
"str q18, [x28, #16]",
|
||||
"str q18, [x28, #48]",
|
||||
@ -2733,22 +2655,16 @@
|
||||
]
|
||||
},
|
||||
"vpgatherqq xmm0, [xmm1*4], xmm2": {
|
||||
"ExpectedInstructionCount": 15,
|
||||
"ExpectedInstructionCount": 9,
|
||||
"Comment": [
|
||||
"Map 2 0b01 0x91 128-bit"
|
||||
],
|
||||
"ExpectedArm64ASM": [
|
||||
"shl v2.2d, v17.2d, #2",
|
||||
"mrs x20, nzcv",
|
||||
"mov x0, v18.d[0]",
|
||||
"tbz x0, #63, #+0x10",
|
||||
"mov x0, v17.d[0]",
|
||||
"lsl x1, x0, #2",
|
||||
"ld1 {v16.d}[0], [x1]",
|
||||
"mov x0, v18.d[1]",
|
||||
"tbz x0, #63, #+0x10",
|
||||
"mov x0, v17.d[1]",
|
||||
"lsl x1, x0, #2",
|
||||
"ld1 {v16.d}[1], [x1]",
|
||||
"cmplt p0.d, p6/z, z18.d, #0",
|
||||
"ld1d {z0.d}, p0/z, [z2.d]",
|
||||
"mov z16.d, p0/m, z0.d",
|
||||
"movi v18.2d, #0x0",
|
||||
"str q18, [x28, #16]",
|
||||
"str q18, [x28, #48]",
|
||||
@ -2795,7 +2711,7 @@
|
||||
]
|
||||
},
|
||||
"vpgatherqq ymm0, [ymm1*2], ymm2": {
|
||||
"ExpectedInstructionCount": 28,
|
||||
"ExpectedInstructionCount": 16,
|
||||
"Comment": [
|
||||
"Map 2 0b01 0x91 256-bit"
|
||||
],
|
||||
@ -2803,27 +2719,15 @@
|
||||
"ldr q2, [x28, #16]",
|
||||
"ldr q3, [x28, #32]",
|
||||
"ldr q4, [x28, #48]",
|
||||
"shl v5.2d, v17.2d, #1",
|
||||
"shl v3.2d, v3.2d, #1",
|
||||
"mrs x20, nzcv",
|
||||
"mov x0, v18.d[0]",
|
||||
"tbz x0, #63, #+0x10",
|
||||
"mov x0, v17.d[0]",
|
||||
"lsl x1, x0, #1",
|
||||
"ld1 {v16.d}[0], [x1]",
|
||||
"mov x0, v18.d[1]",
|
||||
"tbz x0, #63, #+0x10",
|
||||
"mov x0, v17.d[1]",
|
||||
"lsl x1, x0, #1",
|
||||
"ld1 {v16.d}[1], [x1]",
|
||||
"mov x0, v4.d[0]",
|
||||
"tbz x0, #63, #+0x10",
|
||||
"mov x0, v3.d[0]",
|
||||
"lsl x1, x0, #1",
|
||||
"ld1 {v2.d}[0], [x1]",
|
||||
"mov x0, v4.d[1]",
|
||||
"tbz x0, #63, #+0x10",
|
||||
"mov x0, v3.d[1]",
|
||||
"lsl x1, x0, #1",
|
||||
"ld1 {v2.d}[1], [x1]",
|
||||
"cmplt p0.d, p6/z, z18.d, #0",
|
||||
"ld1d {z0.d}, p0/z, [z5.d]",
|
||||
"mov z16.d, p0/m, z0.d",
|
||||
"cmplt p0.d, p6/z, z4.d, #0",
|
||||
"ld1d {z0.d}, p0/z, [z3.d]",
|
||||
"mov z2.d, p0/m, z0.d",
|
||||
"str q2, [x28, #16]",
|
||||
"movi v18.2d, #0x0",
|
||||
"str q18, [x28, #48]",
|
||||
@ -2831,7 +2735,7 @@
|
||||
]
|
||||
},
|
||||
"vpgatherqq ymm0, [ymm1*4], ymm2": {
|
||||
"ExpectedInstructionCount": 28,
|
||||
"ExpectedInstructionCount": 16,
|
||||
"Comment": [
|
||||
"Map 2 0b01 0x91 256-bit"
|
||||
],
|
||||
@ -2839,27 +2743,15 @@
|
||||
"ldr q2, [x28, #16]",
|
||||
"ldr q3, [x28, #32]",
|
||||
"ldr q4, [x28, #48]",
|
||||
"shl v5.2d, v17.2d, #2",
|
||||
"shl v3.2d, v3.2d, #2",
|
||||
"mrs x20, nzcv",
|
||||
"mov x0, v18.d[0]",
|
||||
"tbz x0, #63, #+0x10",
|
||||
"mov x0, v17.d[0]",
|
||||
"lsl x1, x0, #2",
|
||||
"ld1 {v16.d}[0], [x1]",
|
||||
"mov x0, v18.d[1]",
|
||||
"tbz x0, #63, #+0x10",
|
||||
"mov x0, v17.d[1]",
|
||||
"lsl x1, x0, #2",
|
||||
"ld1 {v16.d}[1], [x1]",
|
||||
"mov x0, v4.d[0]",
|
||||
"tbz x0, #63, #+0x10",
|
||||
"mov x0, v3.d[0]",
|
||||
"lsl x1, x0, #2",
|
||||
"ld1 {v2.d}[0], [x1]",
|
||||
"mov x0, v4.d[1]",
|
||||
"tbz x0, #63, #+0x10",
|
||||
"mov x0, v3.d[1]",
|
||||
"lsl x1, x0, #2",
|
||||
"ld1 {v2.d}[1], [x1]",
|
||||
"cmplt p0.d, p6/z, z18.d, #0",
|
||||
"ld1d {z0.d}, p0/z, [z5.d]",
|
||||
"mov z16.d, p0/m, z0.d",
|
||||
"cmplt p0.d, p6/z, z4.d, #0",
|
||||
"ld1d {z0.d}, p0/z, [z3.d]",
|
||||
"mov z2.d, p0/m, z0.d",
|
||||
"str q2, [x28, #16]",
|
||||
"movi v18.2d, #0x0",
|
||||
"str q18, [x28, #48]",
|
||||
@ -3644,22 +3536,16 @@
|
||||
]
|
||||
},
|
||||
"vgatherqpd xmm0, [xmm1*2], xmm2": {
|
||||
"ExpectedInstructionCount": 15,
|
||||
"ExpectedInstructionCount": 9,
|
||||
"Comment": [
|
||||
"Map 2 0b01 0x93 128-bit"
|
||||
],
|
||||
"ExpectedArm64ASM": [
|
||||
"shl v2.2d, v17.2d, #1",
|
||||
"mrs x20, nzcv",
|
||||
"mov x0, v18.d[0]",
|
||||
"tbz x0, #63, #+0x10",
|
||||
"mov x0, v17.d[0]",
|
||||
"lsl x1, x0, #1",
|
||||
"ld1 {v16.d}[0], [x1]",
|
||||
"mov x0, v18.d[1]",
|
||||
"tbz x0, #63, #+0x10",
|
||||
"mov x0, v17.d[1]",
|
||||
"lsl x1, x0, #1",
|
||||
"ld1 {v16.d}[1], [x1]",
|
||||
"cmplt p0.d, p6/z, z18.d, #0",
|
||||
"ld1d {z0.d}, p0/z, [z2.d]",
|
||||
"mov z16.d, p0/m, z0.d",
|
||||
"movi v18.2d, #0x0",
|
||||
"str q18, [x28, #16]",
|
||||
"str q18, [x28, #48]",
|
||||
@ -3667,22 +3553,16 @@
|
||||
]
|
||||
},
|
||||
"vgatherqpd xmm0, [xmm1*4], xmm2": {
|
||||
"ExpectedInstructionCount": 15,
|
||||
"ExpectedInstructionCount": 9,
|
||||
"Comment": [
|
||||
"Map 2 0b01 0x93 128-bit"
|
||||
],
|
||||
"ExpectedArm64ASM": [
|
||||
"shl v2.2d, v17.2d, #2",
|
||||
"mrs x20, nzcv",
|
||||
"mov x0, v18.d[0]",
|
||||
"tbz x0, #63, #+0x10",
|
||||
"mov x0, v17.d[0]",
|
||||
"lsl x1, x0, #2",
|
||||
"ld1 {v16.d}[0], [x1]",
|
||||
"mov x0, v18.d[1]",
|
||||
"tbz x0, #63, #+0x10",
|
||||
"mov x0, v17.d[1]",
|
||||
"lsl x1, x0, #2",
|
||||
"ld1 {v16.d}[1], [x1]",
|
||||
"cmplt p0.d, p6/z, z18.d, #0",
|
||||
"ld1d {z0.d}, p0/z, [z2.d]",
|
||||
"mov z16.d, p0/m, z0.d",
|
||||
"movi v18.2d, #0x0",
|
||||
"str q18, [x28, #16]",
|
||||
"str q18, [x28, #48]",
|
||||
@ -3729,7 +3609,7 @@
|
||||
]
|
||||
},
|
||||
"vgatherqpd ymm0, [ymm1*2], ymm2": {
|
||||
"ExpectedInstructionCount": 28,
|
||||
"ExpectedInstructionCount": 16,
|
||||
"Comment": [
|
||||
"Map 2 0b01 0x93 256-bit"
|
||||
],
|
||||
@ -3737,27 +3617,15 @@
|
||||
"ldr q2, [x28, #16]",
|
||||
"ldr q3, [x28, #32]",
|
||||
"ldr q4, [x28, #48]",
|
||||
"shl v5.2d, v17.2d, #1",
|
||||
"shl v3.2d, v3.2d, #1",
|
||||
"mrs x20, nzcv",
|
||||
"mov x0, v18.d[0]",
|
||||
"tbz x0, #63, #+0x10",
|
||||
"mov x0, v17.d[0]",
|
||||
"lsl x1, x0, #1",
|
||||
"ld1 {v16.d}[0], [x1]",
|
||||
"mov x0, v18.d[1]",
|
||||
"tbz x0, #63, #+0x10",
|
||||
"mov x0, v17.d[1]",
|
||||
"lsl x1, x0, #1",
|
||||
"ld1 {v16.d}[1], [x1]",
|
||||
"mov x0, v4.d[0]",
|
||||
"tbz x0, #63, #+0x10",
|
||||
"mov x0, v3.d[0]",
|
||||
"lsl x1, x0, #1",
|
||||
"ld1 {v2.d}[0], [x1]",
|
||||
"mov x0, v4.d[1]",
|
||||
"tbz x0, #63, #+0x10",
|
||||
"mov x0, v3.d[1]",
|
||||
"lsl x1, x0, #1",
|
||||
"ld1 {v2.d}[1], [x1]",
|
||||
"cmplt p0.d, p6/z, z18.d, #0",
|
||||
"ld1d {z0.d}, p0/z, [z5.d]",
|
||||
"mov z16.d, p0/m, z0.d",
|
||||
"cmplt p0.d, p6/z, z4.d, #0",
|
||||
"ld1d {z0.d}, p0/z, [z3.d]",
|
||||
"mov z2.d, p0/m, z0.d",
|
||||
"str q2, [x28, #16]",
|
||||
"movi v18.2d, #0x0",
|
||||
"str q18, [x28, #48]",
|
||||
@ -3765,7 +3633,7 @@
|
||||
]
|
||||
},
|
||||
"vgatherqpd ymm0, [ymm1*4], ymm2": {
|
||||
"ExpectedInstructionCount": 28,
|
||||
"ExpectedInstructionCount": 16,
|
||||
"Comment": [
|
||||
"Map 2 0b01 0x93 256-bit"
|
||||
],
|
||||
@ -3773,27 +3641,15 @@
|
||||
"ldr q2, [x28, #16]",
|
||||
"ldr q3, [x28, #32]",
|
||||
"ldr q4, [x28, #48]",
|
||||
"shl v5.2d, v17.2d, #2",
|
||||
"shl v3.2d, v3.2d, #2",
|
||||
"mrs x20, nzcv",
|
||||
"mov x0, v18.d[0]",
|
||||
"tbz x0, #63, #+0x10",
|
||||
"mov x0, v17.d[0]",
|
||||
"lsl x1, x0, #2",
|
||||
"ld1 {v16.d}[0], [x1]",
|
||||
"mov x0, v18.d[1]",
|
||||
"tbz x0, #63, #+0x10",
|
||||
"mov x0, v17.d[1]",
|
||||
"lsl x1, x0, #2",
|
||||
"ld1 {v16.d}[1], [x1]",
|
||||
"mov x0, v4.d[0]",
|
||||
"tbz x0, #63, #+0x10",
|
||||
"mov x0, v3.d[0]",
|
||||
"lsl x1, x0, #2",
|
||||
"ld1 {v2.d}[0], [x1]",
|
||||
"mov x0, v4.d[1]",
|
||||
"tbz x0, #63, #+0x10",
|
||||
"mov x0, v3.d[1]",
|
||||
"lsl x1, x0, #2",
|
||||
"ld1 {v2.d}[1], [x1]",
|
||||
"cmplt p0.d, p6/z, z18.d, #0",
|
||||
"ld1d {z0.d}, p0/z, [z5.d]",
|
||||
"mov z16.d, p0/m, z0.d",
|
||||
"cmplt p0.d, p6/z, z4.d, #0",
|
||||
"ld1d {z0.d}, p0/z, [z3.d]",
|
||||
"mov z2.d, p0/m, z0.d",
|
||||
"str q2, [x28, #16]",
|
||||
"movi v18.2d, #0x0",
|
||||
"str q18, [x28, #48]",
|
||||
|
@ -2810,23 +2810,16 @@
|
||||
]
|
||||
},
|
||||
"vpgatherqq xmm0, [xmm1*2 + rax], xmm2": {
|
||||
"ExpectedInstructionCount": 18,
|
||||
"ExpectedInstructionCount": 11,
|
||||
"Comment": [
|
||||
"Map 2 0b01 0x91 128-bit"
|
||||
],
|
||||
"ExpectedArm64ASM": [
|
||||
"shl v2.2d, v17.2d, #1",
|
||||
"mrs x20, nzcv",
|
||||
"mov v2.16b, v16.16b",
|
||||
"mov x0, v18.d[0]",
|
||||
"tbz x0, #63, #+0x10",
|
||||
"mov x0, v17.d[0]",
|
||||
"add x1, x4, x0, lsl #1",
|
||||
"ld1 {v2.d}[0], [x1]",
|
||||
"mov x0, v18.d[1]",
|
||||
"tbz x0, #63, #+0x10",
|
||||
"mov x0, v17.d[1]",
|
||||
"add x1, x4, x0, lsl #1",
|
||||
"ld1 {v2.d}[1], [x1]",
|
||||
"cmplt p0.d, p6/z, z18.d, #0",
|
||||
"ld1d {z0.d}, p0/z, [x4, z2.d]",
|
||||
"sel z2.d, p0, z0.d, z16.d",
|
||||
"movi v18.2d, #0x0",
|
||||
"mov z1.q, q18",
|
||||
"not p0.b, p7/z, p6.b",
|
||||
@ -2836,23 +2829,16 @@
|
||||
]
|
||||
},
|
||||
"vpgatherqq xmm0, [xmm1*4 + rax], xmm2": {
|
||||
"ExpectedInstructionCount": 18,
|
||||
"ExpectedInstructionCount": 11,
|
||||
"Comment": [
|
||||
"Map 2 0b01 0x91 128-bit"
|
||||
],
|
||||
"ExpectedArm64ASM": [
|
||||
"shl v2.2d, v17.2d, #2",
|
||||
"mrs x20, nzcv",
|
||||
"mov v2.16b, v16.16b",
|
||||
"mov x0, v18.d[0]",
|
||||
"tbz x0, #63, #+0x10",
|
||||
"mov x0, v17.d[0]",
|
||||
"add x1, x4, x0, lsl #2",
|
||||
"ld1 {v2.d}[0], [x1]",
|
||||
"mov x0, v18.d[1]",
|
||||
"tbz x0, #63, #+0x10",
|
||||
"mov x0, v17.d[1]",
|
||||
"add x1, x4, x0, lsl #2",
|
||||
"ld1 {v2.d}[1], [x1]",
|
||||
"cmplt p0.d, p6/z, z18.d, #0",
|
||||
"ld1d {z0.d}, p0/z, [x4, z2.d]",
|
||||
"sel z2.d, p0, z0.d, z16.d",
|
||||
"movi v18.2d, #0x0",
|
||||
"mov z1.q, q18",
|
||||
"not p0.b, p7/z, p6.b",
|
||||
@ -2891,7 +2877,7 @@
|
||||
]
|
||||
},
|
||||
"vpgatherqq ymm0, [ymm1*2 + rax], ymm2": {
|
||||
"ExpectedInstructionCount": 31,
|
||||
"ExpectedInstructionCount": 18,
|
||||
"Comment": [
|
||||
"Map 2 0b01 0x91 256-bit"
|
||||
],
|
||||
@ -2899,28 +2885,15 @@
|
||||
"mov z2.q, z16.q[1]",
|
||||
"mov z3.q, z18.q[1]",
|
||||
"mov z4.q, z17.q[1]",
|
||||
"shl v5.2d, v17.2d, #1",
|
||||
"shl v4.2d, v4.2d, #1",
|
||||
"mrs x20, nzcv",
|
||||
"mov v5.16b, v16.16b",
|
||||
"mov x0, v18.d[0]",
|
||||
"tbz x0, #63, #+0x10",
|
||||
"mov x0, v17.d[0]",
|
||||
"add x1, x4, x0, lsl #1",
|
||||
"ld1 {v5.d}[0], [x1]",
|
||||
"mov x0, v18.d[1]",
|
||||
"tbz x0, #63, #+0x10",
|
||||
"mov x0, v17.d[1]",
|
||||
"add x1, x4, x0, lsl #1",
|
||||
"ld1 {v5.d}[1], [x1]",
|
||||
"mov x0, v3.d[0]",
|
||||
"tbz x0, #63, #+0x10",
|
||||
"mov x0, v4.d[0]",
|
||||
"add x1, x4, x0, lsl #1",
|
||||
"ld1 {v2.d}[0], [x1]",
|
||||
"mov x0, v3.d[1]",
|
||||
"tbz x0, #63, #+0x10",
|
||||
"mov x0, v4.d[1]",
|
||||
"add x1, x4, x0, lsl #1",
|
||||
"ld1 {v2.d}[1], [x1]",
|
||||
"cmplt p0.d, p6/z, z18.d, #0",
|
||||
"ld1d {z0.d}, p0/z, [x4, z5.d]",
|
||||
"sel z5.d, p0, z0.d, z16.d",
|
||||
"cmplt p0.d, p6/z, z3.d, #0",
|
||||
"ld1d {z0.d}, p0/z, [x4, z4.d]",
|
||||
"mov z2.d, p0/m, z0.d",
|
||||
"mov z1.q, q2",
|
||||
"mov z16.d, z5.d",
|
||||
"not p0.b, p7/z, p6.b",
|
||||
@ -2930,7 +2903,7 @@
|
||||
]
|
||||
},
|
||||
"vpgatherqq ymm0, [ymm1*4 + rax], ymm2": {
|
||||
"ExpectedInstructionCount": 31,
|
||||
"ExpectedInstructionCount": 18,
|
||||
"Comment": [
|
||||
"Map 2 0b01 0x91 256-bit"
|
||||
],
|
||||
@ -2938,28 +2911,15 @@
|
||||
"mov z2.q, z16.q[1]",
|
||||
"mov z3.q, z18.q[1]",
|
||||
"mov z4.q, z17.q[1]",
|
||||
"shl v5.2d, v17.2d, #2",
|
||||
"shl v4.2d, v4.2d, #2",
|
||||
"mrs x20, nzcv",
|
||||
"mov v5.16b, v16.16b",
|
||||
"mov x0, v18.d[0]",
|
||||
"tbz x0, #63, #+0x10",
|
||||
"mov x0, v17.d[0]",
|
||||
"add x1, x4, x0, lsl #2",
|
||||
"ld1 {v5.d}[0], [x1]",
|
||||
"mov x0, v18.d[1]",
|
||||
"tbz x0, #63, #+0x10",
|
||||
"mov x0, v17.d[1]",
|
||||
"add x1, x4, x0, lsl #2",
|
||||
"ld1 {v5.d}[1], [x1]",
|
||||
"mov x0, v3.d[0]",
|
||||
"tbz x0, #63, #+0x10",
|
||||
"mov x0, v4.d[0]",
|
||||
"add x1, x4, x0, lsl #2",
|
||||
"ld1 {v2.d}[0], [x1]",
|
||||
"mov x0, v3.d[1]",
|
||||
"tbz x0, #63, #+0x10",
|
||||
"mov x0, v4.d[1]",
|
||||
"add x1, x4, x0, lsl #2",
|
||||
"ld1 {v2.d}[1], [x1]",
|
||||
"cmplt p0.d, p6/z, z18.d, #0",
|
||||
"ld1d {z0.d}, p0/z, [x4, z5.d]",
|
||||
"sel z5.d, p0, z0.d, z16.d",
|
||||
"cmplt p0.d, p6/z, z3.d, #0",
|
||||
"ld1d {z0.d}, p0/z, [x4, z4.d]",
|
||||
"mov z2.d, p0/m, z0.d",
|
||||
"mov z1.q, q2",
|
||||
"mov z16.d, z5.d",
|
||||
"not p0.b, p7/z, p6.b",
|
||||
@ -3806,23 +3766,16 @@
|
||||
]
|
||||
},
|
||||
"vgatherqpd xmm0, [xmm1*2 + rax], xmm2": {
|
||||
"ExpectedInstructionCount": 18,
|
||||
"ExpectedInstructionCount": 11,
|
||||
"Comment": [
|
||||
"Map 2 0b01 0x93 128-bit"
|
||||
],
|
||||
"ExpectedArm64ASM": [
|
||||
"shl v2.2d, v17.2d, #1",
|
||||
"mrs x20, nzcv",
|
||||
"mov v2.16b, v16.16b",
|
||||
"mov x0, v18.d[0]",
|
||||
"tbz x0, #63, #+0x10",
|
||||
"mov x0, v17.d[0]",
|
||||
"add x1, x4, x0, lsl #1",
|
||||
"ld1 {v2.d}[0], [x1]",
|
||||
"mov x0, v18.d[1]",
|
||||
"tbz x0, #63, #+0x10",
|
||||
"mov x0, v17.d[1]",
|
||||
"add x1, x4, x0, lsl #1",
|
||||
"ld1 {v2.d}[1], [x1]",
|
||||
"cmplt p0.d, p6/z, z18.d, #0",
|
||||
"ld1d {z0.d}, p0/z, [x4, z2.d]",
|
||||
"sel z2.d, p0, z0.d, z16.d",
|
||||
"movi v18.2d, #0x0",
|
||||
"mov z1.q, q18",
|
||||
"not p0.b, p7/z, p6.b",
|
||||
@ -3832,23 +3785,16 @@
|
||||
]
|
||||
},
|
||||
"vgatherqpd xmm0, [xmm1*4 + rax], xmm2": {
|
||||
"ExpectedInstructionCount": 18,
|
||||
"ExpectedInstructionCount": 11,
|
||||
"Comment": [
|
||||
"Map 2 0b01 0x93 128-bit"
|
||||
],
|
||||
"ExpectedArm64ASM": [
|
||||
"shl v2.2d, v17.2d, #2",
|
||||
"mrs x20, nzcv",
|
||||
"mov v2.16b, v16.16b",
|
||||
"mov x0, v18.d[0]",
|
||||
"tbz x0, #63, #+0x10",
|
||||
"mov x0, v17.d[0]",
|
||||
"add x1, x4, x0, lsl #2",
|
||||
"ld1 {v2.d}[0], [x1]",
|
||||
"mov x0, v18.d[1]",
|
||||
"tbz x0, #63, #+0x10",
|
||||
"mov x0, v17.d[1]",
|
||||
"add x1, x4, x0, lsl #2",
|
||||
"ld1 {v2.d}[1], [x1]",
|
||||
"cmplt p0.d, p6/z, z18.d, #0",
|
||||
"ld1d {z0.d}, p0/z, [x4, z2.d]",
|
||||
"sel z2.d, p0, z0.d, z16.d",
|
||||
"movi v18.2d, #0x0",
|
||||
"mov z1.q, q18",
|
||||
"not p0.b, p7/z, p6.b",
|
||||
@ -3887,7 +3833,7 @@
|
||||
]
|
||||
},
|
||||
"vgatherqpd ymm0, [ymm1*2 + rax], ymm2": {
|
||||
"ExpectedInstructionCount": 31,
|
||||
"ExpectedInstructionCount": 18,
|
||||
"Comment": [
|
||||
"Map 2 0b01 0x93 256-bit"
|
||||
],
|
||||
@ -3895,28 +3841,15 @@
|
||||
"mov z2.q, z16.q[1]",
|
||||
"mov z3.q, z18.q[1]",
|
||||
"mov z4.q, z17.q[1]",
|
||||
"shl v5.2d, v17.2d, #1",
|
||||
"shl v4.2d, v4.2d, #1",
|
||||
"mrs x20, nzcv",
|
||||
"mov v5.16b, v16.16b",
|
||||
"mov x0, v18.d[0]",
|
||||
"tbz x0, #63, #+0x10",
|
||||
"mov x0, v17.d[0]",
|
||||
"add x1, x4, x0, lsl #1",
|
||||
"ld1 {v5.d}[0], [x1]",
|
||||
"mov x0, v18.d[1]",
|
||||
"tbz x0, #63, #+0x10",
|
||||
"mov x0, v17.d[1]",
|
||||
"add x1, x4, x0, lsl #1",
|
||||
"ld1 {v5.d}[1], [x1]",
|
||||
"mov x0, v3.d[0]",
|
||||
"tbz x0, #63, #+0x10",
|
||||
"mov x0, v4.d[0]",
|
||||
"add x1, x4, x0, lsl #1",
|
||||
"ld1 {v2.d}[0], [x1]",
|
||||
"mov x0, v3.d[1]",
|
||||
"tbz x0, #63, #+0x10",
|
||||
"mov x0, v4.d[1]",
|
||||
"add x1, x4, x0, lsl #1",
|
||||
"ld1 {v2.d}[1], [x1]",
|
||||
"cmplt p0.d, p6/z, z18.d, #0",
|
||||
"ld1d {z0.d}, p0/z, [x4, z5.d]",
|
||||
"sel z5.d, p0, z0.d, z16.d",
|
||||
"cmplt p0.d, p6/z, z3.d, #0",
|
||||
"ld1d {z0.d}, p0/z, [x4, z4.d]",
|
||||
"mov z2.d, p0/m, z0.d",
|
||||
"mov z1.q, q2",
|
||||
"mov z16.d, z5.d",
|
||||
"not p0.b, p7/z, p6.b",
|
||||
@ -3926,7 +3859,7 @@
|
||||
]
|
||||
},
|
||||
"vgatherqpd ymm0, [ymm1*4 + rax], ymm2": {
|
||||
"ExpectedInstructionCount": 31,
|
||||
"ExpectedInstructionCount": 18,
|
||||
"Comment": [
|
||||
"Map 2 0b01 0x93 256-bit"
|
||||
],
|
||||
@ -3934,28 +3867,15 @@
|
||||
"mov z2.q, z16.q[1]",
|
||||
"mov z3.q, z18.q[1]",
|
||||
"mov z4.q, z17.q[1]",
|
||||
"shl v5.2d, v17.2d, #2",
|
||||
"shl v4.2d, v4.2d, #2",
|
||||
"mrs x20, nzcv",
|
||||
"mov v5.16b, v16.16b",
|
||||
"mov x0, v18.d[0]",
|
||||
"tbz x0, #63, #+0x10",
|
||||
"mov x0, v17.d[0]",
|
||||
"add x1, x4, x0, lsl #2",
|
||||
"ld1 {v5.d}[0], [x1]",
|
||||
"mov x0, v18.d[1]",
|
||||
"tbz x0, #63, #+0x10",
|
||||
"mov x0, v17.d[1]",
|
||||
"add x1, x4, x0, lsl #2",
|
||||
"ld1 {v5.d}[1], [x1]",
|
||||
"mov x0, v3.d[0]",
|
||||
"tbz x0, #63, #+0x10",
|
||||
"mov x0, v4.d[0]",
|
||||
"add x1, x4, x0, lsl #2",
|
||||
"ld1 {v2.d}[0], [x1]",
|
||||
"mov x0, v3.d[1]",
|
||||
"tbz x0, #63, #+0x10",
|
||||
"mov x0, v4.d[1]",
|
||||
"add x1, x4, x0, lsl #2",
|
||||
"ld1 {v2.d}[1], [x1]",
|
||||
"cmplt p0.d, p6/z, z18.d, #0",
|
||||
"ld1d {z0.d}, p0/z, [x4, z5.d]",
|
||||
"sel z5.d, p0, z0.d, z16.d",
|
||||
"cmplt p0.d, p6/z, z3.d, #0",
|
||||
"ld1d {z0.d}, p0/z, [x4, z4.d]",
|
||||
"mov z2.d, p0/m, z0.d",
|
||||
"mov z1.q, q2",
|
||||
"mov z16.d, z5.d",
|
||||
"not p0.b, p7/z, p6.b",
|
||||
|
Loading…
Reference in New Issue
Block a user