Merge pull request #3836 from Sonicadvance1/optimize_sve_vpgatherdd

AVX128: Optimize the vpgatherdd/vgatherdps cases that would fall back to ASIMD
This commit is contained in:
Mai 2024-07-08 21:43:36 -04:00 committed by GitHub
commit 22b26696ba
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
2 changed files with 275 additions and 505 deletions

View File

@ -2742,8 +2742,42 @@ void OpDispatchBuilder::AVX128_VPGATHER(OpcodeArgs) {
auto VSIB = AVX128_LoadVSIB(Op, Op->Src[0], Op->Flags, NeedsHighAddrBytes);
auto Mask = AVX128_LoadSource_WithOpSize(Op, Op->Src[1], Op->Flags, !Is128Bit);
bool NeedsSVEScale = (VSIB.Scale == 2 || VSIB.Scale == 8) || (VSIB.BaseAddr == Invalid() && VSIB.Scale != 1);
const bool NeedsExplicitSVEPath =
CTX->HostFeatures.SupportsSVE128 && AddrElementSize == OpSize::i32Bit && ElementLoadSize == OpSize::i32Bit && NeedsSVEScale;
RefPair Result {};
if (AddrElementSize == OpSize::i64Bit && ElementLoadSize == OpSize::i32Bit) {
if (NeedsExplicitSVEPath) {
// Special case for VGATHERDPS/VPGATHERDD (32-bit addresses loading 32-bit elements) that can't use the SVE codepath.
// The scale doesn't match what SVE supports, so we need to prescale the addresses and widen them to 64-bit.
auto ScaleVSIBHalf = [this](Ref VSIB, Ref BaseAddr, int32_t Displacement, uint8_t Scale) -> RefVSIB {
  // Prescales one 128-bit half of the index vector: the scale is applied as a
  // left shift by log2(Scale) while the 32-bit indices are lengthened, yielding
  // two result vectors (High and Low) for the 64-bit-address gather that follows.
  const auto ShiftAmount = FEXCore::ilog2(Scale);

  RefVSIB Prescaled {};
  // Emit the "2" (upper-half) variant before the lower-half variant; the
  // expected-assembly tests list sshll2 ahead of sshll, so keep this order.
  Prescaled.High = _VSSHLL2(OpSize::i128Bit, OpSize::i32Bit, VSIB, ShiftAmount);
  Prescaled.Low = _VSSHLL(OpSize::i128Bit, OpSize::i32Bit, VSIB, ShiftAmount);

  // Base address and displacement pass through untouched.
  Prescaled.BaseAddr = BaseAddr;
  Prescaled.Displacement = Displacement;

  ///< The scale is already folded into the indices, so the gather itself uses a scale of one.
  Prescaled.Scale = 1;
  return Prescaled;
};
RefVSIB VSIBLow = ScaleVSIBHalf(VSIB.Low, VSIB.BaseAddr, VSIB.Displacement, VSIB.Scale);
RefVSIB VSIBHigh {};
if (NeedsHighAddrBytes) {
VSIBHigh = ScaleVSIBHalf(VSIB.High, VSIB.BaseAddr, VSIB.Displacement, VSIB.Scale);
}
///< AddressElementSize is now OpSize::i64Bit
Result = AVX128_VPGatherQPSImpl(Dest.Low, Mask.Low, VSIBLow);
if (NeedsHighAddrBytes) {
auto Res = AVX128_VPGatherQPSImpl(Dest.High, Mask.High, VSIBHigh);
Result.High = Res.Low;
}
} else if (AddrElementSize == OpSize::i64Bit && ElementLoadSize == OpSize::i32Bit) {
Result = AVX128_VPGatherQPSImpl(Dest.Low, Mask.Low, VSIB);
} else {
Result = AVX128_VPGatherImpl(SizeToOpSize(Size), ElementLoadSize, AddrElementSize, Dest, Mask, VSIB);

View File

@ -149,32 +149,21 @@
]
},
"vpgatherdd xmm0, [xmm1*2 + rax], xmm2": {
"ExpectedInstructionCount": 25,
"ExpectedInstructionCount": 14,
"Comment": [
"Map 2 0b01 0x90 128-bit"
],
"ExpectedArm64ASM": [
"sshll2 v2.2d, v17.4s, #1",
"sshll v3.2d, v17.2s, #1",
"mrs x20, nzcv",
"mov w0, v18.s[0]",
"tbz w0, #31, #+0x10",
"smov x0, v17.s[0]",
"add x1, x4, w0, sxtw #1",
"ld1 {v16.s}[0], [x1]",
"mov w0, v18.s[1]",
"tbz w0, #31, #+0x10",
"smov x0, v17.s[1]",
"add x1, x4, w0, sxtw #1",
"ld1 {v16.s}[1], [x1]",
"mov w0, v18.s[2]",
"tbz w0, #31, #+0x10",
"smov x0, v17.s[2]",
"add x1, x4, w0, sxtw #1",
"ld1 {v16.s}[2], [x1]",
"mov w0, v18.s[3]",
"tbz w0, #31, #+0x10",
"smov x0, v17.s[3]",
"add x1, x4, w0, sxtw #1",
"ld1 {v16.s}[3], [x1]",
"cmplt p0.s, p6/z, z18.s, #0",
"punpklo p1.h, p0.b",
"ld1w {z0.d}, p1/z, [x4, z3.d]",
"punpkhi p1.h, p0.b",
"ld1w {z1.d}, p1/z, [x4, z2.d]",
"uzp1 v0.4s, v0.4s, v1.4s",
"mov z16.s, p0/m, z0.s",
"movi v18.2d, #0x0",
"str q18, [x28, #16]",
"str q18, [x28, #48]",
@ -198,32 +187,21 @@
]
},
"vpgatherdd xmm0, [xmm1*8 + rax], xmm2": {
"ExpectedInstructionCount": 25,
"ExpectedInstructionCount": 14,
"Comment": [
"Map 2 0b01 0x90 128-bit"
],
"ExpectedArm64ASM": [
"sshll2 v2.2d, v17.4s, #3",
"sshll v3.2d, v17.2s, #3",
"mrs x20, nzcv",
"mov w0, v18.s[0]",
"tbz w0, #31, #+0x10",
"smov x0, v17.s[0]",
"add x1, x4, w0, sxtw #3",
"ld1 {v16.s}[0], [x1]",
"mov w0, v18.s[1]",
"tbz w0, #31, #+0x10",
"smov x0, v17.s[1]",
"add x1, x4, w0, sxtw #3",
"ld1 {v16.s}[1], [x1]",
"mov w0, v18.s[2]",
"tbz w0, #31, #+0x10",
"smov x0, v17.s[2]",
"add x1, x4, w0, sxtw #3",
"ld1 {v16.s}[2], [x1]",
"mov w0, v18.s[3]",
"tbz w0, #31, #+0x10",
"smov x0, v17.s[3]",
"add x1, x4, w0, sxtw #3",
"ld1 {v16.s}[3], [x1]",
"cmplt p0.s, p6/z, z18.s, #0",
"punpklo p1.h, p0.b",
"ld1w {z0.d}, p1/z, [x4, z3.d]",
"punpkhi p1.h, p0.b",
"ld1w {z1.d}, p1/z, [x4, z2.d]",
"uzp1 v0.4s, v0.4s, v1.4s",
"mov z16.s, p0/m, z0.s",
"movi v18.2d, #0x0",
"str q18, [x28, #16]",
"str q18, [x28, #48]",
@ -253,7 +231,7 @@
]
},
"vpgatherdd ymm0, [ymm1*2 + rax], ymm2": {
"ExpectedInstructionCount": 48,
"ExpectedInstructionCount": 26,
"Comment": [
"Map 2 0b01 0x90 256-bit"
],
@ -261,49 +239,27 @@
"ldr q2, [x28, #16]",
"ldr q3, [x28, #32]",
"ldr q4, [x28, #48]",
"sshll2 v5.2d, v17.4s, #1",
"sshll v6.2d, v17.2s, #1",
"sshll2 v7.2d, v3.4s, #1",
"sshll v3.2d, v3.2s, #1",
"mrs x20, nzcv",
"mov w0, v18.s[0]",
"tbz w0, #31, #+0x10",
"smov x0, v17.s[0]",
"add x1, x4, w0, sxtw #1",
"ld1 {v16.s}[0], [x1]",
"mov w0, v18.s[1]",
"tbz w0, #31, #+0x10",
"smov x0, v17.s[1]",
"add x1, x4, w0, sxtw #1",
"ld1 {v16.s}[1], [x1]",
"mov w0, v18.s[2]",
"tbz w0, #31, #+0x10",
"smov x0, v17.s[2]",
"add x1, x4, w0, sxtw #1",
"ld1 {v16.s}[2], [x1]",
"mov w0, v18.s[3]",
"tbz w0, #31, #+0x10",
"smov x0, v17.s[3]",
"add x1, x4, w0, sxtw #1",
"ld1 {v16.s}[3], [x1]",
"mov w0, v4.s[0]",
"tbz w0, #31, #+0x10",
"smov x0, v3.s[0]",
"add x1, x4, w0, sxtw #1",
"ld1 {v2.s}[0], [x1]",
"mov w0, v4.s[1]",
"tbz w0, #31, #+0x10",
"smov x0, v3.s[1]",
"add x1, x4, w0, sxtw #1",
"ld1 {v2.s}[1], [x1]",
"mov w0, v4.s[2]",
"tbz w0, #31, #+0x10",
"smov x0, v3.s[2]",
"add x1, x4, w0, sxtw #1",
"ld1 {v2.s}[2], [x1]",
"mov w0, v4.s[3]",
"tbz w0, #31, #+0x10",
"smov x0, v3.s[3]",
"add x1, x4, w0, sxtw #1",
"ld1 {v2.s}[3], [x1]",
"str q2, [x28, #16]",
"cmplt p0.s, p6/z, z18.s, #0",
"punpklo p1.h, p0.b",
"ld1w {z0.d}, p1/z, [x4, z6.d]",
"punpkhi p1.h, p0.b",
"ld1w {z1.d}, p1/z, [x4, z5.d]",
"uzp1 v0.4s, v0.4s, v1.4s",
"mov z16.s, p0/m, z0.s",
"movi v18.2d, #0x0",
"cmplt p0.s, p6/z, z4.s, #0",
"punpklo p1.h, p0.b",
"ld1w {z0.d}, p1/z, [x4, z3.d]",
"punpkhi p1.h, p0.b",
"ld1w {z1.d}, p1/z, [x4, z7.d]",
"uzp1 v0.4s, v0.4s, v1.4s",
"mov z2.s, p0/m, z0.s",
"str q2, [x28, #16]",
"str q18, [x28, #48]",
"msr nzcv, x20"
]
@ -331,7 +287,7 @@
]
},
"vpgatherdd ymm0, [ymm1*8 + rax], ymm2": {
"ExpectedInstructionCount": 48,
"ExpectedInstructionCount": 26,
"Comment": [
"Map 2 0b01 0x90 256-bit"
],
@ -339,49 +295,27 @@
"ldr q2, [x28, #16]",
"ldr q3, [x28, #32]",
"ldr q4, [x28, #48]",
"sshll2 v5.2d, v17.4s, #3",
"sshll v6.2d, v17.2s, #3",
"sshll2 v7.2d, v3.4s, #3",
"sshll v3.2d, v3.2s, #3",
"mrs x20, nzcv",
"mov w0, v18.s[0]",
"tbz w0, #31, #+0x10",
"smov x0, v17.s[0]",
"add x1, x4, w0, sxtw #3",
"ld1 {v16.s}[0], [x1]",
"mov w0, v18.s[1]",
"tbz w0, #31, #+0x10",
"smov x0, v17.s[1]",
"add x1, x4, w0, sxtw #3",
"ld1 {v16.s}[1], [x1]",
"mov w0, v18.s[2]",
"tbz w0, #31, #+0x10",
"smov x0, v17.s[2]",
"add x1, x4, w0, sxtw #3",
"ld1 {v16.s}[2], [x1]",
"mov w0, v18.s[3]",
"tbz w0, #31, #+0x10",
"smov x0, v17.s[3]",
"add x1, x4, w0, sxtw #3",
"ld1 {v16.s}[3], [x1]",
"mov w0, v4.s[0]",
"tbz w0, #31, #+0x10",
"smov x0, v3.s[0]",
"add x1, x4, w0, sxtw #3",
"ld1 {v2.s}[0], [x1]",
"mov w0, v4.s[1]",
"tbz w0, #31, #+0x10",
"smov x0, v3.s[1]",
"add x1, x4, w0, sxtw #3",
"ld1 {v2.s}[1], [x1]",
"mov w0, v4.s[2]",
"tbz w0, #31, #+0x10",
"smov x0, v3.s[2]",
"add x1, x4, w0, sxtw #3",
"ld1 {v2.s}[2], [x1]",
"mov w0, v4.s[3]",
"tbz w0, #31, #+0x10",
"smov x0, v3.s[3]",
"add x1, x4, w0, sxtw #3",
"ld1 {v2.s}[3], [x1]",
"str q2, [x28, #16]",
"cmplt p0.s, p6/z, z18.s, #0",
"punpklo p1.h, p0.b",
"ld1w {z0.d}, p1/z, [x4, z6.d]",
"punpkhi p1.h, p0.b",
"ld1w {z1.d}, p1/z, [x4, z5.d]",
"uzp1 v0.4s, v0.4s, v1.4s",
"mov z16.s, p0/m, z0.s",
"movi v18.2d, #0x0",
"cmplt p0.s, p6/z, z4.s, #0",
"punpklo p1.h, p0.b",
"ld1w {z0.d}, p1/z, [x4, z3.d]",
"punpkhi p1.h, p0.b",
"ld1w {z1.d}, p1/z, [x4, z7.d]",
"uzp1 v0.4s, v0.4s, v1.4s",
"mov z2.s, p0/m, z0.s",
"str q2, [x28, #16]",
"str q18, [x28, #48]",
"msr nzcv, x20"
]
@ -911,32 +845,21 @@
]
},
"vgatherdps xmm0, [xmm1*2 + rax], xmm2": {
"ExpectedInstructionCount": 25,
"ExpectedInstructionCount": 14,
"Comment": [
"Map 2 0b01 0x92 128-bit"
],
"ExpectedArm64ASM": [
"sshll2 v2.2d, v17.4s, #1",
"sshll v3.2d, v17.2s, #1",
"mrs x20, nzcv",
"mov w0, v18.s[0]",
"tbz w0, #31, #+0x10",
"smov x0, v17.s[0]",
"add x1, x4, w0, sxtw #1",
"ld1 {v16.s}[0], [x1]",
"mov w0, v18.s[1]",
"tbz w0, #31, #+0x10",
"smov x0, v17.s[1]",
"add x1, x4, w0, sxtw #1",
"ld1 {v16.s}[1], [x1]",
"mov w0, v18.s[2]",
"tbz w0, #31, #+0x10",
"smov x0, v17.s[2]",
"add x1, x4, w0, sxtw #1",
"ld1 {v16.s}[2], [x1]",
"mov w0, v18.s[3]",
"tbz w0, #31, #+0x10",
"smov x0, v17.s[3]",
"add x1, x4, w0, sxtw #1",
"ld1 {v16.s}[3], [x1]",
"cmplt p0.s, p6/z, z18.s, #0",
"punpklo p1.h, p0.b",
"ld1w {z0.d}, p1/z, [x4, z3.d]",
"punpkhi p1.h, p0.b",
"ld1w {z1.d}, p1/z, [x4, z2.d]",
"uzp1 v0.4s, v0.4s, v1.4s",
"mov z16.s, p0/m, z0.s",
"movi v18.2d, #0x0",
"str q18, [x28, #16]",
"str q18, [x28, #48]",
@ -960,32 +883,21 @@
]
},
"vgatherdps xmm0, [xmm1*8 + rax], xmm2": {
"ExpectedInstructionCount": 25,
"ExpectedInstructionCount": 14,
"Comment": [
"Map 2 0b01 0x92 128-bit"
],
"ExpectedArm64ASM": [
"sshll2 v2.2d, v17.4s, #3",
"sshll v3.2d, v17.2s, #3",
"mrs x20, nzcv",
"mov w0, v18.s[0]",
"tbz w0, #31, #+0x10",
"smov x0, v17.s[0]",
"add x1, x4, w0, sxtw #3",
"ld1 {v16.s}[0], [x1]",
"mov w0, v18.s[1]",
"tbz w0, #31, #+0x10",
"smov x0, v17.s[1]",
"add x1, x4, w0, sxtw #3",
"ld1 {v16.s}[1], [x1]",
"mov w0, v18.s[2]",
"tbz w0, #31, #+0x10",
"smov x0, v17.s[2]",
"add x1, x4, w0, sxtw #3",
"ld1 {v16.s}[2], [x1]",
"mov w0, v18.s[3]",
"tbz w0, #31, #+0x10",
"smov x0, v17.s[3]",
"add x1, x4, w0, sxtw #3",
"ld1 {v16.s}[3], [x1]",
"cmplt p0.s, p6/z, z18.s, #0",
"punpklo p1.h, p0.b",
"ld1w {z0.d}, p1/z, [x4, z3.d]",
"punpkhi p1.h, p0.b",
"ld1w {z1.d}, p1/z, [x4, z2.d]",
"uzp1 v0.4s, v0.4s, v1.4s",
"mov z16.s, p0/m, z0.s",
"movi v18.2d, #0x0",
"str q18, [x28, #16]",
"str q18, [x28, #48]",
@ -1015,7 +927,7 @@
]
},
"vgatherdps ymm0, [ymm1*2 + rax], ymm2": {
"ExpectedInstructionCount": 48,
"ExpectedInstructionCount": 26,
"Comment": [
"Map 2 0b01 0x92 256-bit"
],
@ -1023,49 +935,27 @@
"ldr q2, [x28, #16]",
"ldr q3, [x28, #32]",
"ldr q4, [x28, #48]",
"sshll2 v5.2d, v17.4s, #1",
"sshll v6.2d, v17.2s, #1",
"sshll2 v7.2d, v3.4s, #1",
"sshll v3.2d, v3.2s, #1",
"mrs x20, nzcv",
"mov w0, v18.s[0]",
"tbz w0, #31, #+0x10",
"smov x0, v17.s[0]",
"add x1, x4, w0, sxtw #1",
"ld1 {v16.s}[0], [x1]",
"mov w0, v18.s[1]",
"tbz w0, #31, #+0x10",
"smov x0, v17.s[1]",
"add x1, x4, w0, sxtw #1",
"ld1 {v16.s}[1], [x1]",
"mov w0, v18.s[2]",
"tbz w0, #31, #+0x10",
"smov x0, v17.s[2]",
"add x1, x4, w0, sxtw #1",
"ld1 {v16.s}[2], [x1]",
"mov w0, v18.s[3]",
"tbz w0, #31, #+0x10",
"smov x0, v17.s[3]",
"add x1, x4, w0, sxtw #1",
"ld1 {v16.s}[3], [x1]",
"mov w0, v4.s[0]",
"tbz w0, #31, #+0x10",
"smov x0, v3.s[0]",
"add x1, x4, w0, sxtw #1",
"ld1 {v2.s}[0], [x1]",
"mov w0, v4.s[1]",
"tbz w0, #31, #+0x10",
"smov x0, v3.s[1]",
"add x1, x4, w0, sxtw #1",
"ld1 {v2.s}[1], [x1]",
"mov w0, v4.s[2]",
"tbz w0, #31, #+0x10",
"smov x0, v3.s[2]",
"add x1, x4, w0, sxtw #1",
"ld1 {v2.s}[2], [x1]",
"mov w0, v4.s[3]",
"tbz w0, #31, #+0x10",
"smov x0, v3.s[3]",
"add x1, x4, w0, sxtw #1",
"ld1 {v2.s}[3], [x1]",
"str q2, [x28, #16]",
"cmplt p0.s, p6/z, z18.s, #0",
"punpklo p1.h, p0.b",
"ld1w {z0.d}, p1/z, [x4, z6.d]",
"punpkhi p1.h, p0.b",
"ld1w {z1.d}, p1/z, [x4, z5.d]",
"uzp1 v0.4s, v0.4s, v1.4s",
"mov z16.s, p0/m, z0.s",
"movi v18.2d, #0x0",
"cmplt p0.s, p6/z, z4.s, #0",
"punpklo p1.h, p0.b",
"ld1w {z0.d}, p1/z, [x4, z3.d]",
"punpkhi p1.h, p0.b",
"ld1w {z1.d}, p1/z, [x4, z7.d]",
"uzp1 v0.4s, v0.4s, v1.4s",
"mov z2.s, p0/m, z0.s",
"str q2, [x28, #16]",
"str q18, [x28, #48]",
"msr nzcv, x20"
]
@ -1093,7 +983,7 @@
]
},
"vgatherdps ymm0, [ymm1*8 + rax], ymm2": {
"ExpectedInstructionCount": 48,
"ExpectedInstructionCount": 26,
"Comment": [
"Map 2 0b01 0x92 256-bit"
],
@ -1101,49 +991,27 @@
"ldr q2, [x28, #16]",
"ldr q3, [x28, #32]",
"ldr q4, [x28, #48]",
"sshll2 v5.2d, v17.4s, #3",
"sshll v6.2d, v17.2s, #3",
"sshll2 v7.2d, v3.4s, #3",
"sshll v3.2d, v3.2s, #3",
"mrs x20, nzcv",
"mov w0, v18.s[0]",
"tbz w0, #31, #+0x10",
"smov x0, v17.s[0]",
"add x1, x4, w0, sxtw #3",
"ld1 {v16.s}[0], [x1]",
"mov w0, v18.s[1]",
"tbz w0, #31, #+0x10",
"smov x0, v17.s[1]",
"add x1, x4, w0, sxtw #3",
"ld1 {v16.s}[1], [x1]",
"mov w0, v18.s[2]",
"tbz w0, #31, #+0x10",
"smov x0, v17.s[2]",
"add x1, x4, w0, sxtw #3",
"ld1 {v16.s}[2], [x1]",
"mov w0, v18.s[3]",
"tbz w0, #31, #+0x10",
"smov x0, v17.s[3]",
"add x1, x4, w0, sxtw #3",
"ld1 {v16.s}[3], [x1]",
"mov w0, v4.s[0]",
"tbz w0, #31, #+0x10",
"smov x0, v3.s[0]",
"add x1, x4, w0, sxtw #3",
"ld1 {v2.s}[0], [x1]",
"mov w0, v4.s[1]",
"tbz w0, #31, #+0x10",
"smov x0, v3.s[1]",
"add x1, x4, w0, sxtw #3",
"ld1 {v2.s}[1], [x1]",
"mov w0, v4.s[2]",
"tbz w0, #31, #+0x10",
"smov x0, v3.s[2]",
"add x1, x4, w0, sxtw #3",
"ld1 {v2.s}[2], [x1]",
"mov w0, v4.s[3]",
"tbz w0, #31, #+0x10",
"smov x0, v3.s[3]",
"add x1, x4, w0, sxtw #3",
"ld1 {v2.s}[3], [x1]",
"str q2, [x28, #16]",
"cmplt p0.s, p6/z, z18.s, #0",
"punpklo p1.h, p0.b",
"ld1w {z0.d}, p1/z, [x4, z6.d]",
"punpkhi p1.h, p0.b",
"ld1w {z1.d}, p1/z, [x4, z5.d]",
"uzp1 v0.4s, v0.4s, v1.4s",
"mov z16.s, p0/m, z0.s",
"movi v18.2d, #0x0",
"cmplt p0.s, p6/z, z4.s, #0",
"punpklo p1.h, p0.b",
"ld1w {z0.d}, p1/z, [x4, z3.d]",
"punpkhi p1.h, p0.b",
"ld1w {z1.d}, p1/z, [x4, z7.d]",
"uzp1 v0.4s, v0.4s, v1.4s",
"mov z2.s, p0/m, z0.s",
"str q2, [x28, #16]",
"str q18, [x28, #48]",
"msr nzcv, x20"
]
@ -1657,32 +1525,21 @@
]
},
"vpgatherdd xmm0, [xmm1*2], xmm2": {
"ExpectedInstructionCount": 25,
"ExpectedInstructionCount": 14,
"Comment": [
"Map 2 0b01 0x90 128-bit"
],
"ExpectedArm64ASM": [
"sshll2 v2.2d, v17.4s, #1",
"sshll v3.2d, v17.2s, #1",
"mrs x20, nzcv",
"mov w0, v18.s[0]",
"tbz w0, #31, #+0x10",
"smov x0, v17.s[0]",
"sbfiz x1, x0, #1, #32",
"ld1 {v16.s}[0], [x1]",
"mov w0, v18.s[1]",
"tbz w0, #31, #+0x10",
"smov x0, v17.s[1]",
"sbfiz x1, x0, #1, #32",
"ld1 {v16.s}[1], [x1]",
"mov w0, v18.s[2]",
"tbz w0, #31, #+0x10",
"smov x0, v17.s[2]",
"sbfiz x1, x0, #1, #32",
"ld1 {v16.s}[2], [x1]",
"mov w0, v18.s[3]",
"tbz w0, #31, #+0x10",
"smov x0, v17.s[3]",
"sbfiz x1, x0, #1, #32",
"ld1 {v16.s}[3], [x1]",
"cmplt p0.s, p6/z, z18.s, #0",
"punpklo p1.h, p0.b",
"ld1w {z0.d}, p1/z, [z3.d]",
"punpkhi p1.h, p0.b",
"ld1w {z1.d}, p1/z, [z2.d]",
"uzp1 v0.4s, v0.4s, v1.4s",
"mov z16.s, p0/m, z0.s",
"movi v18.2d, #0x0",
"str q18, [x28, #16]",
"str q18, [x28, #48]",
@ -1707,32 +1564,21 @@
]
},
"vpgatherdd xmm0, [xmm1*8], xmm2": {
"ExpectedInstructionCount": 25,
"ExpectedInstructionCount": 14,
"Comment": [
"Map 2 0b01 0x90 128-bit"
],
"ExpectedArm64ASM": [
"sshll2 v2.2d, v17.4s, #3",
"sshll v3.2d, v17.2s, #3",
"mrs x20, nzcv",
"mov w0, v18.s[0]",
"tbz w0, #31, #+0x10",
"smov x0, v17.s[0]",
"sbfiz x1, x0, #3, #32",
"ld1 {v16.s}[0], [x1]",
"mov w0, v18.s[1]",
"tbz w0, #31, #+0x10",
"smov x0, v17.s[1]",
"sbfiz x1, x0, #3, #32",
"ld1 {v16.s}[1], [x1]",
"mov w0, v18.s[2]",
"tbz w0, #31, #+0x10",
"smov x0, v17.s[2]",
"sbfiz x1, x0, #3, #32",
"ld1 {v16.s}[2], [x1]",
"mov w0, v18.s[3]",
"tbz w0, #31, #+0x10",
"smov x0, v17.s[3]",
"sbfiz x1, x0, #3, #32",
"ld1 {v16.s}[3], [x1]",
"cmplt p0.s, p6/z, z18.s, #0",
"punpklo p1.h, p0.b",
"ld1w {z0.d}, p1/z, [z3.d]",
"punpkhi p1.h, p0.b",
"ld1w {z1.d}, p1/z, [z2.d]",
"uzp1 v0.4s, v0.4s, v1.4s",
"mov z16.s, p0/m, z0.s",
"movi v18.2d, #0x0",
"str q18, [x28, #16]",
"str q18, [x28, #48]",
@ -1762,7 +1608,7 @@
]
},
"vpgatherdd ymm0, [ymm1*2], ymm2": {
"ExpectedInstructionCount": 48,
"ExpectedInstructionCount": 26,
"Comment": [
"Map 2 0b01 0x90 256-bit"
],
@ -1770,49 +1616,27 @@
"ldr q2, [x28, #16]",
"ldr q3, [x28, #32]",
"ldr q4, [x28, #48]",
"sshll2 v5.2d, v17.4s, #1",
"sshll v6.2d, v17.2s, #1",
"sshll2 v7.2d, v3.4s, #1",
"sshll v3.2d, v3.2s, #1",
"mrs x20, nzcv",
"mov w0, v18.s[0]",
"tbz w0, #31, #+0x10",
"smov x0, v17.s[0]",
"sbfiz x1, x0, #1, #32",
"ld1 {v16.s}[0], [x1]",
"mov w0, v18.s[1]",
"tbz w0, #31, #+0x10",
"smov x0, v17.s[1]",
"sbfiz x1, x0, #1, #32",
"ld1 {v16.s}[1], [x1]",
"mov w0, v18.s[2]",
"tbz w0, #31, #+0x10",
"smov x0, v17.s[2]",
"sbfiz x1, x0, #1, #32",
"ld1 {v16.s}[2], [x1]",
"mov w0, v18.s[3]",
"tbz w0, #31, #+0x10",
"smov x0, v17.s[3]",
"sbfiz x1, x0, #1, #32",
"ld1 {v16.s}[3], [x1]",
"mov w0, v4.s[0]",
"tbz w0, #31, #+0x10",
"smov x0, v3.s[0]",
"sbfiz x1, x0, #1, #32",
"ld1 {v2.s}[0], [x1]",
"mov w0, v4.s[1]",
"tbz w0, #31, #+0x10",
"smov x0, v3.s[1]",
"sbfiz x1, x0, #1, #32",
"ld1 {v2.s}[1], [x1]",
"mov w0, v4.s[2]",
"tbz w0, #31, #+0x10",
"smov x0, v3.s[2]",
"sbfiz x1, x0, #1, #32",
"ld1 {v2.s}[2], [x1]",
"mov w0, v4.s[3]",
"tbz w0, #31, #+0x10",
"smov x0, v3.s[3]",
"sbfiz x1, x0, #1, #32",
"ld1 {v2.s}[3], [x1]",
"str q2, [x28, #16]",
"cmplt p0.s, p6/z, z18.s, #0",
"punpklo p1.h, p0.b",
"ld1w {z0.d}, p1/z, [z6.d]",
"punpkhi p1.h, p0.b",
"ld1w {z1.d}, p1/z, [z5.d]",
"uzp1 v0.4s, v0.4s, v1.4s",
"mov z16.s, p0/m, z0.s",
"movi v18.2d, #0x0",
"cmplt p0.s, p6/z, z4.s, #0",
"punpklo p1.h, p0.b",
"ld1w {z0.d}, p1/z, [z3.d]",
"punpkhi p1.h, p0.b",
"ld1w {z1.d}, p1/z, [z7.d]",
"uzp1 v0.4s, v0.4s, v1.4s",
"mov z2.s, p0/m, z0.s",
"str q2, [x28, #16]",
"str q18, [x28, #48]",
"msr nzcv, x20"
]
@ -1842,7 +1666,7 @@
]
},
"vpgatherdd ymm0, [ymm1*8], ymm2": {
"ExpectedInstructionCount": 48,
"ExpectedInstructionCount": 26,
"Comment": [
"Map 2 0b01 0x90 256-bit"
],
@ -1850,49 +1674,27 @@
"ldr q2, [x28, #16]",
"ldr q3, [x28, #32]",
"ldr q4, [x28, #48]",
"sshll2 v5.2d, v17.4s, #3",
"sshll v6.2d, v17.2s, #3",
"sshll2 v7.2d, v3.4s, #3",
"sshll v3.2d, v3.2s, #3",
"mrs x20, nzcv",
"mov w0, v18.s[0]",
"tbz w0, #31, #+0x10",
"smov x0, v17.s[0]",
"sbfiz x1, x0, #3, #32",
"ld1 {v16.s}[0], [x1]",
"mov w0, v18.s[1]",
"tbz w0, #31, #+0x10",
"smov x0, v17.s[1]",
"sbfiz x1, x0, #3, #32",
"ld1 {v16.s}[1], [x1]",
"mov w0, v18.s[2]",
"tbz w0, #31, #+0x10",
"smov x0, v17.s[2]",
"sbfiz x1, x0, #3, #32",
"ld1 {v16.s}[2], [x1]",
"mov w0, v18.s[3]",
"tbz w0, #31, #+0x10",
"smov x0, v17.s[3]",
"sbfiz x1, x0, #3, #32",
"ld1 {v16.s}[3], [x1]",
"mov w0, v4.s[0]",
"tbz w0, #31, #+0x10",
"smov x0, v3.s[0]",
"sbfiz x1, x0, #3, #32",
"ld1 {v2.s}[0], [x1]",
"mov w0, v4.s[1]",
"tbz w0, #31, #+0x10",
"smov x0, v3.s[1]",
"sbfiz x1, x0, #3, #32",
"ld1 {v2.s}[1], [x1]",
"mov w0, v4.s[2]",
"tbz w0, #31, #+0x10",
"smov x0, v3.s[2]",
"sbfiz x1, x0, #3, #32",
"ld1 {v2.s}[2], [x1]",
"mov w0, v4.s[3]",
"tbz w0, #31, #+0x10",
"smov x0, v3.s[3]",
"sbfiz x1, x0, #3, #32",
"ld1 {v2.s}[3], [x1]",
"str q2, [x28, #16]",
"cmplt p0.s, p6/z, z18.s, #0",
"punpklo p1.h, p0.b",
"ld1w {z0.d}, p1/z, [z6.d]",
"punpkhi p1.h, p0.b",
"ld1w {z1.d}, p1/z, [z5.d]",
"uzp1 v0.4s, v0.4s, v1.4s",
"mov z16.s, p0/m, z0.s",
"movi v18.2d, #0x0",
"cmplt p0.s, p6/z, z4.s, #0",
"punpklo p1.h, p0.b",
"ld1w {z0.d}, p1/z, [z3.d]",
"punpkhi p1.h, p0.b",
"ld1w {z1.d}, p1/z, [z7.d]",
"uzp1 v0.4s, v0.4s, v1.4s",
"mov z2.s, p0/m, z0.s",
"str q2, [x28, #16]",
"str q18, [x28, #48]",
"msr nzcv, x20"
]
@ -2428,32 +2230,21 @@
]
},
"vgatherdps xmm0, [xmm1*2], xmm2": {
"ExpectedInstructionCount": 25,
"ExpectedInstructionCount": 14,
"Comment": [
"Map 2 0b01 0x92 128-bit"
],
"ExpectedArm64ASM": [
"sshll2 v2.2d, v17.4s, #1",
"sshll v3.2d, v17.2s, #1",
"mrs x20, nzcv",
"mov w0, v18.s[0]",
"tbz w0, #31, #+0x10",
"smov x0, v17.s[0]",
"sbfiz x1, x0, #1, #32",
"ld1 {v16.s}[0], [x1]",
"mov w0, v18.s[1]",
"tbz w0, #31, #+0x10",
"smov x0, v17.s[1]",
"sbfiz x1, x0, #1, #32",
"ld1 {v16.s}[1], [x1]",
"mov w0, v18.s[2]",
"tbz w0, #31, #+0x10",
"smov x0, v17.s[2]",
"sbfiz x1, x0, #1, #32",
"ld1 {v16.s}[2], [x1]",
"mov w0, v18.s[3]",
"tbz w0, #31, #+0x10",
"smov x0, v17.s[3]",
"sbfiz x1, x0, #1, #32",
"ld1 {v16.s}[3], [x1]",
"cmplt p0.s, p6/z, z18.s, #0",
"punpklo p1.h, p0.b",
"ld1w {z0.d}, p1/z, [z3.d]",
"punpkhi p1.h, p0.b",
"ld1w {z1.d}, p1/z, [z2.d]",
"uzp1 v0.4s, v0.4s, v1.4s",
"mov z16.s, p0/m, z0.s",
"movi v18.2d, #0x0",
"str q18, [x28, #16]",
"str q18, [x28, #48]",
@ -2478,32 +2269,21 @@
]
},
"vgatherdps xmm0, [xmm1*8], xmm2": {
"ExpectedInstructionCount": 25,
"ExpectedInstructionCount": 14,
"Comment": [
"Map 2 0b01 0x92 128-bit"
],
"ExpectedArm64ASM": [
"sshll2 v2.2d, v17.4s, #3",
"sshll v3.2d, v17.2s, #3",
"mrs x20, nzcv",
"mov w0, v18.s[0]",
"tbz w0, #31, #+0x10",
"smov x0, v17.s[0]",
"sbfiz x1, x0, #3, #32",
"ld1 {v16.s}[0], [x1]",
"mov w0, v18.s[1]",
"tbz w0, #31, #+0x10",
"smov x0, v17.s[1]",
"sbfiz x1, x0, #3, #32",
"ld1 {v16.s}[1], [x1]",
"mov w0, v18.s[2]",
"tbz w0, #31, #+0x10",
"smov x0, v17.s[2]",
"sbfiz x1, x0, #3, #32",
"ld1 {v16.s}[2], [x1]",
"mov w0, v18.s[3]",
"tbz w0, #31, #+0x10",
"smov x0, v17.s[3]",
"sbfiz x1, x0, #3, #32",
"ld1 {v16.s}[3], [x1]",
"cmplt p0.s, p6/z, z18.s, #0",
"punpklo p1.h, p0.b",
"ld1w {z0.d}, p1/z, [z3.d]",
"punpkhi p1.h, p0.b",
"ld1w {z1.d}, p1/z, [z2.d]",
"uzp1 v0.4s, v0.4s, v1.4s",
"mov z16.s, p0/m, z0.s",
"movi v18.2d, #0x0",
"str q18, [x28, #16]",
"str q18, [x28, #48]",
@ -2533,7 +2313,7 @@
]
},
"vgatherdps ymm0, [ymm1*2], ymm2": {
"ExpectedInstructionCount": 48,
"ExpectedInstructionCount": 26,
"Comment": [
"Map 2 0b01 0x92 256-bit"
],
@ -2541,49 +2321,27 @@
"ldr q2, [x28, #16]",
"ldr q3, [x28, #32]",
"ldr q4, [x28, #48]",
"sshll2 v5.2d, v17.4s, #1",
"sshll v6.2d, v17.2s, #1",
"sshll2 v7.2d, v3.4s, #1",
"sshll v3.2d, v3.2s, #1",
"mrs x20, nzcv",
"mov w0, v18.s[0]",
"tbz w0, #31, #+0x10",
"smov x0, v17.s[0]",
"sbfiz x1, x0, #1, #32",
"ld1 {v16.s}[0], [x1]",
"mov w0, v18.s[1]",
"tbz w0, #31, #+0x10",
"smov x0, v17.s[1]",
"sbfiz x1, x0, #1, #32",
"ld1 {v16.s}[1], [x1]",
"mov w0, v18.s[2]",
"tbz w0, #31, #+0x10",
"smov x0, v17.s[2]",
"sbfiz x1, x0, #1, #32",
"ld1 {v16.s}[2], [x1]",
"mov w0, v18.s[3]",
"tbz w0, #31, #+0x10",
"smov x0, v17.s[3]",
"sbfiz x1, x0, #1, #32",
"ld1 {v16.s}[3], [x1]",
"mov w0, v4.s[0]",
"tbz w0, #31, #+0x10",
"smov x0, v3.s[0]",
"sbfiz x1, x0, #1, #32",
"ld1 {v2.s}[0], [x1]",
"mov w0, v4.s[1]",
"tbz w0, #31, #+0x10",
"smov x0, v3.s[1]",
"sbfiz x1, x0, #1, #32",
"ld1 {v2.s}[1], [x1]",
"mov w0, v4.s[2]",
"tbz w0, #31, #+0x10",
"smov x0, v3.s[2]",
"sbfiz x1, x0, #1, #32",
"ld1 {v2.s}[2], [x1]",
"mov w0, v4.s[3]",
"tbz w0, #31, #+0x10",
"smov x0, v3.s[3]",
"sbfiz x1, x0, #1, #32",
"ld1 {v2.s}[3], [x1]",
"str q2, [x28, #16]",
"cmplt p0.s, p6/z, z18.s, #0",
"punpklo p1.h, p0.b",
"ld1w {z0.d}, p1/z, [z6.d]",
"punpkhi p1.h, p0.b",
"ld1w {z1.d}, p1/z, [z5.d]",
"uzp1 v0.4s, v0.4s, v1.4s",
"mov z16.s, p0/m, z0.s",
"movi v18.2d, #0x0",
"cmplt p0.s, p6/z, z4.s, #0",
"punpklo p1.h, p0.b",
"ld1w {z0.d}, p1/z, [z3.d]",
"punpkhi p1.h, p0.b",
"ld1w {z1.d}, p1/z, [z7.d]",
"uzp1 v0.4s, v0.4s, v1.4s",
"mov z2.s, p0/m, z0.s",
"str q2, [x28, #16]",
"str q18, [x28, #48]",
"msr nzcv, x20"
]
@ -2613,7 +2371,7 @@
]
},
"vgatherdps ymm0, [ymm1*8], ymm2": {
"ExpectedInstructionCount": 48,
"ExpectedInstructionCount": 26,
"Comment": [
"Map 2 0b01 0x92 256-bit"
],
@ -2621,49 +2379,27 @@
"ldr q2, [x28, #16]",
"ldr q3, [x28, #32]",
"ldr q4, [x28, #48]",
"sshll2 v5.2d, v17.4s, #3",
"sshll v6.2d, v17.2s, #3",
"sshll2 v7.2d, v3.4s, #3",
"sshll v3.2d, v3.2s, #3",
"mrs x20, nzcv",
"mov w0, v18.s[0]",
"tbz w0, #31, #+0x10",
"smov x0, v17.s[0]",
"sbfiz x1, x0, #3, #32",
"ld1 {v16.s}[0], [x1]",
"mov w0, v18.s[1]",
"tbz w0, #31, #+0x10",
"smov x0, v17.s[1]",
"sbfiz x1, x0, #3, #32",
"ld1 {v16.s}[1], [x1]",
"mov w0, v18.s[2]",
"tbz w0, #31, #+0x10",
"smov x0, v17.s[2]",
"sbfiz x1, x0, #3, #32",
"ld1 {v16.s}[2], [x1]",
"mov w0, v18.s[3]",
"tbz w0, #31, #+0x10",
"smov x0, v17.s[3]",
"sbfiz x1, x0, #3, #32",
"ld1 {v16.s}[3], [x1]",
"mov w0, v4.s[0]",
"tbz w0, #31, #+0x10",
"smov x0, v3.s[0]",
"sbfiz x1, x0, #3, #32",
"ld1 {v2.s}[0], [x1]",
"mov w0, v4.s[1]",
"tbz w0, #31, #+0x10",
"smov x0, v3.s[1]",
"sbfiz x1, x0, #3, #32",
"ld1 {v2.s}[1], [x1]",
"mov w0, v4.s[2]",
"tbz w0, #31, #+0x10",
"smov x0, v3.s[2]",
"sbfiz x1, x0, #3, #32",
"ld1 {v2.s}[2], [x1]",
"mov w0, v4.s[3]",
"tbz w0, #31, #+0x10",
"smov x0, v3.s[3]",
"sbfiz x1, x0, #3, #32",
"ld1 {v2.s}[3], [x1]",
"str q2, [x28, #16]",
"cmplt p0.s, p6/z, z18.s, #0",
"punpklo p1.h, p0.b",
"ld1w {z0.d}, p1/z, [z6.d]",
"punpkhi p1.h, p0.b",
"ld1w {z1.d}, p1/z, [z5.d]",
"uzp1 v0.4s, v0.4s, v1.4s",
"mov z16.s, p0/m, z0.s",
"movi v18.2d, #0x0",
"cmplt p0.s, p6/z, z4.s, #0",
"punpklo p1.h, p0.b",
"ld1w {z0.d}, p1/z, [z3.d]",
"punpkhi p1.h, p0.b",
"ld1w {z1.d}, p1/z, [z7.d]",
"uzp1 v0.4s, v0.4s, v1.4s",
"mov z2.s, p0/m, z0.s",
"str q2, [x28, #16]",
"str q18, [x28, #48]",
"msr nzcv, x20"
]