Merge pull request #3497 from Sonicadvance1/movmaskb_constant

JIT: Optimize pmovmskb with a named vector constant
This commit is contained in:
Ryan Houdek 2024-03-18 16:08:40 -07:00 committed by GitHub
commit ab8ee64352
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
18 changed files with 61 additions and 87 deletions

View File

@ -27,6 +27,8 @@ constexpr static uint64_t NamedVectorConstants[FEXCore::IR::NamedVectorConstant:
{0x0706'0504'0302'0100ULL, 0x0F0E'0D0C'FFFF'FFFFULL}, // NAMED_VECTOR_BLENDPS_1011B
{0xFFFF'FFFF'0302'0100ULL, 0x0F0E'0D0C'0B0A'0908ULL}, // NAMED_VECTOR_BLENDPS_1101B
{0x0706'0504'FFFF'FFFFULL, 0x0F0E'0D0C'0B0A'0908ULL}, // NAMED_VECTOR_BLENDPS_1110B
{0x8040'2010'0804'0201ULL, 0x8040'2010'0804'0201ULL}, // NAMED_VECTOR_MOVMASKB
{0x8040'2010'0804'0201ULL, 0x8040'2010'0804'0201ULL}, // NAMED_VECTOR_MOVMASKB_UPPER
};
constexpr static auto PSHUFLW_LUT {

View File

@ -1104,7 +1104,7 @@ void OpDispatchBuilder::MOVMSKOpOne(OpcodeArgs) {
const auto ExtractSize = Is256Bit ? 4 : 2;
OrderedNode *Src = LoadSource(FPRClass, Op, Op->Src[0], Op->Flags);
OrderedNode *VMask = _VDupFromGPR(SrcSize, 8, _Constant(0x80'40'20'10'08'04'02'01ULL));
OrderedNode *VMask = LoadAndCacheNamedVectorConstant(SrcSize, NAMED_VECTOR_MOVMASKB);
auto VCMP = _VCMPLTZ(SrcSize, 1, Src);
auto VAnd = _VAnd(SrcSize, 1, VCMP, VMask);

View File

@ -533,6 +533,8 @@ enum NamedVectorConstant : uint8_t {
NAMED_VECTOR_BLENDPS_1011B,
NAMED_VECTOR_BLENDPS_1101B,
NAMED_VECTOR_BLENDPS_1110B,
NAMED_VECTOR_MOVMASKB,
NAMED_VECTOR_MOVMASKB_UPPER,
NAMED_VECTOR_CONST_POOL_MAX,
// Beginning of named constants that don't have a constant pool backing.
NAMED_VECTOR_ZERO = NAMED_VECTOR_CONST_POOL_MAX,

View File

@ -55,7 +55,7 @@
"0x66 0x0f 0x3a 0xdf"
],
"ExpectedArm64ASM": [
"ldr q2, [x28, #2080]",
"ldr q2, [x28, #2096]",
"movi v3.2d, #0x0",
"mov v16.16b, v17.16b",
"unimplemented (Unimplemented)",
@ -68,7 +68,7 @@
"0x66 0x0f 0x3a 0xdf"
],
"ExpectedArm64ASM": [
"ldr q2, [x28, #2080]",
"ldr q2, [x28, #2096]",
"movi v3.2d, #0x0",
"mov v16.16b, v17.16b",
"unimplemented (Unimplemented)",

View File

@ -197,10 +197,10 @@
"ldr q3, [x11, #272]",
"ldr q4, [x11]",
"ldr q5, [x11, #16]",
"ldr x0, [x28, #1688]",
"ldr x0, [x28, #1704]",
"ldr q6, [x0, #2832]",
"tbl v2.16b, {v2.16b}, v6.16b",
"ldr x0, [x28, #1688]",
"ldr x0, [x28, #1704]",
"ldr q7, [x0, #432]",
"tbl v3.16b, {v3.16b}, v7.16b",
"ldr q8, [x11, #32]",
@ -281,7 +281,7 @@
"mov v9.s[2], w25",
"mov v9.s[1], w20",
"mov v9.s[0], w22",
"ldr x0, [x28, #1688]",
"ldr x0, [x28, #1704]",
"ldr q10, [x0, #224]",
"tbl v4.16b, {v4.16b}, v10.16b",
"mov w20, v9.s[1]",

View File

@ -1614,15 +1614,11 @@
]
},
"pmovmskb eax, mm0": {
"ExpectedInstructionCount": 12,
"ExpectedInstructionCount": 8,
"Comment": "0x0f 0xd7",
"ExpectedArm64ASM": [
"ldr d2, [x28, #768]",
"mov x20, #0x201",
"movk x20, #0x804, lsl #16",
"movk x20, #0x2010, lsl #32",
"movk x20, #0x8040, lsl #48",
"dup v3.2d, x20",
"ldr d3, [x28, #2208]",
"cmlt v2.16b, v2.16b, #0",
"and v2.16b, v2.16b, v3.16b",
"addp v2.16b, v2.16b, v2.16b",

View File

@ -35,14 +35,10 @@
]
},
"pmovmskb eax, xmm0": {
"ExpectedInstructionCount": 11,
"ExpectedInstructionCount": 7,
"Comment": "0x66 0x0f 0xd7",
"ExpectedArm64ASM": [
"mov x20, #0x201",
"movk x20, #0x804, lsl #16",
"movk x20, #0x2010, lsl #32",
"movk x20, #0x8040, lsl #48",
"dup v2.2d, x20",
"ldr q2, [x28, #2208]",
"cmlt v3.16b, v16.16b, #0",
"and v2.16b, v3.16b, v2.16b",
"addp v2.16b, v2.16b, v2.16b",

View File

@ -67,16 +67,12 @@
]
},
"vpmovmskb rax, xmm0": {
"ExpectedInstructionCount": 11,
"ExpectedInstructionCount": 7,
"Comment": [
"Map 1 0b01 0xd7 256-bit"
],
"ExpectedArm64ASM": [
"mov x20, #0x201",
"movk x20, #0x804, lsl #16",
"movk x20, #0x2010, lsl #32",
"movk x20, #0x8040, lsl #48",
"dup v2.2d, x20",
"ldr q2, [x28, #2208]",
"cmlt v3.16b, v16.16b, #0",
"and v2.16b, v3.16b, v2.16b",
"addp v2.16b, v2.16b, v2.16b",
@ -86,16 +82,13 @@
]
},
"vpmovmskb rax, ymm0": {
"ExpectedInstructionCount": 21,
"ExpectedInstructionCount": 18,
"Comment": [
"Map 1 0b01 0xd7 256-bit"
],
"ExpectedArm64ASM": [
"mov x20, #0x201",
"movk x20, #0x804, lsl #16",
"movk x20, #0x2010, lsl #32",
"movk x20, #0x8040, lsl #48",
"mov z2.d, x20",
"ldr x0, [x28, #1672]",
"ld1b {z2.b}, p7/z, [x0]",
"mrs x0, nzcv",
"mov z0.d, #0",
"cmplt p0.b, p7/z, z16.b, #0",

View File

@ -624,7 +624,7 @@
"0x66 0x0f 0x38 0x41"
],
"ExpectedArm64ASM": [
"ldr q2, [x28, #1968]",
"ldr q2, [x28, #1984]",
"zip1 v3.8h, v2.8h, v17.8h",
"zip2 v2.8h, v2.8h, v17.8h",
"umin v2.4s, v3.4s, v2.4s",

View File

@ -315,7 +315,7 @@
"0x66 0x0f 0x3a 0x0c"
],
"ExpectedArm64ASM": [
"ldr q2, [x28, #2096]",
"ldr q2, [x28, #2112]",
"tbx v16.16b, {v17.16b}, v2.16b"
]
},
@ -325,7 +325,7 @@
"0x66 0x0f 0x3a 0x0c"
],
"ExpectedArm64ASM": [
"ldr q2, [x28, #2112]",
"ldr q2, [x28, #2128]",
"tbx v16.16b, {v17.16b}, v2.16b"
]
},
@ -344,7 +344,7 @@
"0x66 0x0f 0x3a 0x0c"
],
"ExpectedArm64ASM": [
"ldr q2, [x28, #2128]",
"ldr q2, [x28, #2144]",
"tbx v16.16b, {v17.16b}, v2.16b"
]
},
@ -364,7 +364,7 @@
"0x66 0x0f 0x3a 0x0c"
],
"ExpectedArm64ASM": [
"ldr q2, [x28, #2144]",
"ldr q2, [x28, #2160]",
"tbx v16.16b, {v17.16b}, v2.16b"
]
},
@ -383,7 +383,7 @@
"0x66 0x0f 0x3a 0x0c"
],
"ExpectedArm64ASM": [
"ldr q2, [x28, #2160]",
"ldr q2, [x28, #2176]",
"tbx v16.16b, {v17.16b}, v2.16b"
]
},
@ -393,7 +393,7 @@
"0x66 0x0f 0x3a 0x0c"
],
"ExpectedArm64ASM": [
"ldr q2, [x28, #2176]",
"ldr q2, [x28, #2192]",
"tbx v16.16b, {v17.16b}, v2.16b"
]
},
@ -462,7 +462,7 @@
"0x66 0x0f 0x3a 0x0e"
],
"ExpectedArm64ASM": [
"ldr x0, [x28, #1720]",
"ldr x0, [x28, #1736]",
"ldr q2, [x0, #3440]",
"tbx v16.16b, {v17.16b}, v2.16b"
]

View File

@ -2909,7 +2909,7 @@
"mov x0, x6",
"mov x1, x20",
"mov x2, x7",
"ldr x3, [x28, #2224]",
"ldr x3, [x28, #2272]",
"str x30, [sp, #-16]!",
"blr x3",
"ldr x30, [sp], #16",
@ -2920,7 +2920,7 @@
"mov x0, x6",
"mov x1, x20",
"mov x2, x7",
"ldr x3, [x28, #2240]",
"ldr x3, [x28, #2288]",
"str x30, [sp, #-16]!",
"blr x3",
"ldr x30, [sp], #16",
@ -2981,7 +2981,7 @@
"mov x0, x6",
"mov x1, x20",
"mov x2, x7",
"ldr x3, [x28, #2232]",
"ldr x3, [x28, #2280]",
"str x30, [sp, #-16]!",
"blr x3",
"ldr x30, [sp], #16",
@ -2994,7 +2994,7 @@
"mov x0, x6",
"mov x1, x20",
"mov x2, x7",
"ldr x3, [x28, #2248]",
"ldr x3, [x28, #2296]",
"str x30, [sp, #-16]!",
"blr x3",
"ldr x30, [sp], #16",

View File

@ -646,7 +646,7 @@
"Comment": "0x0f 0x50",
"ExpectedArm64ASM": [
"ushr v2.4s, v16.4s, #31",
"ldr q3, [x28, #2064]",
"ldr q3, [x28, #2080]",
"ushl v2.4s, v2.4s, v3.4s",
"addv s2, v2.4s",
"mov w4, v2.s[0]"
@ -657,7 +657,7 @@
"Comment": "0x0f 0x50",
"ExpectedArm64ASM": [
"ushr v2.4s, v16.4s, #31",
"ldr q3, [x28, #2064]",
"ldr q3, [x28, #2080]",
"ushl v2.4s, v2.4s, v3.4s",
"addv s2, v2.4s",
"mov w4, v2.s[0]"
@ -1041,7 +1041,7 @@
"Comment": "0x0f 0x70",
"ExpectedArm64ASM": [
"ldr d2, [x28, #784]",
"ldr x0, [x28, #1672]",
"ldr x0, [x28, #1688]",
"ldr d3, [x0, #16]",
"tbl v2.8b, {v2.16b}, v3.8b",
"str d2, [x28, #768]"
@ -1052,7 +1052,7 @@
"Comment": "0x0f 0x70",
"ExpectedArm64ASM": [
"ldr d2, [x4]",
"ldr x0, [x28, #1672]",
"ldr x0, [x28, #1688]",
"ldr d3, [x0, #16]",
"tbl v2.8b, {v2.16b}, v3.8b",
"str d2, [x28, #768]"
@ -3315,7 +3315,7 @@
"ExpectedInstructionCount": 3,
"Comment": "0x0f 0xc6",
"ExpectedArm64ASM": [
"ldr x0, [x28, #1696]",
"ldr x0, [x28, #1712]",
"ldr q2, [x0, #16]",
"tbl v16.16b, {v16.16b, v17.16b}, v2.16b"
]
@ -3324,7 +3324,7 @@
"ExpectedInstructionCount": 5,
"Comment": "0x0f 0xc6",
"ExpectedArm64ASM": [
"ldr x0, [x28, #1696]",
"ldr x0, [x28, #1712]",
"ldr q2, [x0, #16]",
"mov v0.16b, v17.16b",
"mov v1.16b, v16.16b",
@ -3336,7 +3336,7 @@
"Comment": "0x0f 0xc6",
"ExpectedArm64ASM": [
"ldr q2, [x4]",
"ldr x0, [x28, #1696]",
"ldr x0, [x28, #1712]",
"ldr q3, [x0, #16]",
"mov v0.16b, v16.16b",
"mov v1.16b, v2.16b",
@ -3430,15 +3430,11 @@
]
},
"pmovmskb eax, mm0": {
"ExpectedInstructionCount": 12,
"ExpectedInstructionCount": 8,
"Comment": "0x0f 0xd7",
"ExpectedArm64ASM": [
"ldr d2, [x28, #768]",
"mov x20, #0x201",
"movk x20, #0x804, lsl #16",
"movk x20, #0x2010, lsl #32",
"movk x20, #0x8040, lsl #48",
"dup v3.2d, x20",
"ldr d3, [x28, #2208]",
"cmlt v2.16b, v2.16b, #0",
"and v2.16b, v2.16b, v3.16b",
"addp v2.16b, v2.16b, v2.16b",

View File

@ -522,7 +522,7 @@
"0x66 0x0f 0x70"
],
"ExpectedArm64ASM": [
"ldr x0, [x28, #1688]",
"ldr x0, [x28, #1704]",
"ldr q2, [x0, #16]",
"tbl v16.16b, {v17.16b}, v2.16b"
]
@ -536,7 +536,7 @@
],
"ExpectedArm64ASM": [
"ldr q2, [x4]",
"ldr x0, [x28, #1688]",
"ldr x0, [x28, #1704]",
"ldr q3, [x0, #16]",
"tbl v16.16b, {v2.16b}, v3.16b"
]
@ -1014,7 +1014,7 @@
"ExpectedInstructionCount": 3,
"Comment": "0x66 0x0f 0xd0",
"ExpectedArm64ASM": [
"ldr q2, [x28, #2032]",
"ldr q2, [x28, #2048]",
"eor v2.16b, v17.16b, v2.16b",
"fadd v16.2d, v16.2d, v2.2d"
]
@ -1067,14 +1067,10 @@
]
},
"pmovmskb eax, xmm0": {
"ExpectedInstructionCount": 11,
"ExpectedInstructionCount": 7,
"Comment": "0x66 0x0f 0xd7",
"ExpectedArm64ASM": [
"mov x20, #0x201",
"movk x20, #0x804, lsl #16",
"movk x20, #0x2010, lsl #32",
"movk x20, #0x8040, lsl #48",
"dup v2.2d, x20",
"ldr q2, [x28, #2208]",
"cmlt v3.16b, v16.16b, #0",
"and v2.16b, v3.16b, v2.16b",
"addp v2.16b, v2.16b, v2.16b",

View File

@ -354,7 +354,7 @@
"0xf3 0x0f 0x70"
],
"ExpectedArm64ASM": [
"ldr x0, [x28, #1680]",
"ldr x0, [x28, #1696]",
"ldr q2, [x0, #16]",
"tbl v16.16b, {v17.16b}, v2.16b"
]

View File

@ -296,7 +296,7 @@
"0xf2 0x0f 0x70"
],
"ExpectedArm64ASM": [
"ldr x0, [x28, #1672]",
"ldr x0, [x28, #1688]",
"ldr q2, [x0, #16]",
"tbl v16.16b, {v17.16b}, v2.16b"
]
@ -452,7 +452,7 @@
"ExpectedInstructionCount": 3,
"Comment": "0xf2 0x0f 0xd0",
"ExpectedArm64ASM": [
"ldr q2, [x28, #2000]",
"ldr q2, [x28, #2016]",
"eor v2.16b, v17.16b, v2.16b",
"fadd v16.4s, v16.4s, v2.4s"
]

View File

@ -2755,7 +2755,7 @@
"Map 1 0b00 0xC6 128-bit"
],
"ExpectedArm64ASM": [
"ldr x0, [x28, #1696]",
"ldr x0, [x28, #1712]",
"ldr q2, [x0, #16]",
"tbl v16.16b, {v17.16b, v18.16b}, v2.16b"
]
@ -2824,7 +2824,7 @@
"Map 1 0b00 0xC6 128-bit"
],
"ExpectedArm64ASM": [
"ldr x0, [x28, #1696]",
"ldr x0, [x28, #1712]",
"ldr q2, [x0, #32]",
"tbl v16.16b, {v17.16b, v18.16b}, v2.16b"
]
@ -2893,7 +2893,7 @@
"Map 1 0b00 0xC6 128-bit"
],
"ExpectedArm64ASM": [
"ldr x0, [x28, #1696]",
"ldr x0, [x28, #1712]",
"ldr q2, [x0, #48]",
"tbl v16.16b, {v17.16b, v18.16b}, v2.16b"
]
@ -4338,7 +4338,7 @@
"Map 1 0b01 0xd0 128-bit"
],
"ExpectedArm64ASM": [
"ldr q2, [x28, #2032]",
"ldr q2, [x28, #2048]",
"eor v2.16b, v18.16b, v2.16b",
"fadd v16.2d, v17.2d, v2.2d"
]
@ -4361,7 +4361,7 @@
"Map 1 0b11 0xd0 128-bit"
],
"ExpectedArm64ASM": [
"ldr q2, [x28, #2000]",
"ldr q2, [x28, #2016]",
"eor v2.16b, v18.16b, v2.16b",
"fadd v16.4s, v17.4s, v2.4s"
]
@ -4493,16 +4493,12 @@
]
},
"vpmovmskb rax, xmm0": {
"ExpectedInstructionCount": 11,
"ExpectedInstructionCount": 7,
"Comment": [
"Map 1 0b01 0xd7 256-bit"
],
"ExpectedArm64ASM": [
"mov x20, #0x201",
"movk x20, #0x804, lsl #16",
"movk x20, #0x2010, lsl #32",
"movk x20, #0x8040, lsl #48",
"dup v2.2d, x20",
"ldr q2, [x28, #2208]",
"cmlt v3.16b, v16.16b, #0",
"and v2.16b, v3.16b, v2.16b",
"addp v2.16b, v2.16b, v2.16b",
@ -4512,16 +4508,13 @@
]
},
"vpmovmskb rax, ymm0": {
"ExpectedInstructionCount": 21,
"ExpectedInstructionCount": 18,
"Comment": [
"Map 1 0b01 0xd7 256-bit"
],
"ExpectedArm64ASM": [
"mov x20, #0x201",
"movk x20, #0x804, lsl #16",
"movk x20, #0x2010, lsl #32",
"movk x20, #0x8040, lsl #48",
"mov z2.d, x20",
"ldr x0, [x28, #1672]",
"ld1b {z2.b}, p7/z, [x0]",
"mrs x0, nzcv",
"mov z0.d, #0",
"cmplt p0.b, p7/z, z16.b, #0",

View File

@ -1575,7 +1575,7 @@
"Map 2 0b01 0x41 256-bit"
],
"ExpectedArm64ASM": [
"ldr q2, [x28, #1968]",
"ldr q2, [x28, #1984]",
"zip1 v3.8h, v2.8h, v17.8h",
"zip2 v2.8h, v2.8h, v17.8h",
"umin v2.4s, v3.4s, v2.4s",

View File

@ -4799,7 +4799,7 @@
"Map 3 0b01 0xdf 128-bit"
],
"ExpectedArm64ASM": [
"ldr q2, [x28, #2080]",
"ldr q2, [x28, #2096]",
"movi v3.2d, #0x0",
"mov v16.16b, v17.16b",
"unimplemented (Unimplemented)",
@ -4812,7 +4812,7 @@
"Map 3 0b01 0xdf 128-bit"
],
"ExpectedArm64ASM": [
"ldr q2, [x28, #2080]",
"ldr q2, [x28, #2096]",
"movi v3.2d, #0x0",
"mov v16.16b, v17.16b",
"unimplemented (Unimplemented)",