OpcodeDispatcher: Eliminate unnecessary moves in {AVX}VFCMPOp

When dealing with scalar vector sources, we don't need to zero-extend
the vector; we can just use it as is.
This commit is contained in:
Lioncache 2023-08-19 20:42:42 -04:00
parent db60a2fd4b
commit 343b00818d
4 changed files with 113 additions and 171 deletions

View File

@ -2384,9 +2384,12 @@ OrderedNode* OpDispatchBuilder::VFCMPOpImpl(OpcodeArgs, size_t ElementSize, bool
template<size_t ElementSize, bool Scalar>
void OpDispatchBuilder::VFCMPOp(OpcodeArgs) {
// No need for zero-extending in the scalar case, since
// all we need is an insert at the end of the operation.
const auto SrcSize = Scalar && Op->Src[0].IsGPR() ? 16U : GetSrcSize(Op);
const auto DstSize = GetDstSize(Op);
OrderedNode *Src = LoadSource(FPRClass, Op, Op->Src[0], Op->Flags, -1);
OrderedNode *Src = LoadSource_WithOpSize(FPRClass, Op, Op->Src[0], SrcSize, Op->Flags, -1);
OrderedNode *Dest = LoadSource_WithOpSize(FPRClass, Op, Op->Dest, DstSize, Op->Flags, -1);
const uint8_t CompType = Op->Src[1].Data.Literal.Value;
@ -2406,6 +2409,9 @@ void OpDispatchBuilder::VFCMPOp<8, true>(OpcodeArgs);
template <size_t ElementSize, bool Scalar>
void OpDispatchBuilder::AVXVFCMPOp(OpcodeArgs) {
// No need for zero-extending in the scalar case, since
// all we need is an insert at the end of the operation.
const auto SrcSize = Scalar && Op->Src[1].IsGPR() ? 16U : GetSrcSize(Op);
const auto DstSize = GetDstSize(Op);
const auto Is128Bit = DstSize == Core::CPUState::XMM_SSE_REG_SIZE;
@ -2413,7 +2419,7 @@ void OpDispatchBuilder::AVXVFCMPOp(OpcodeArgs) {
const uint8_t CompType = Op->Src[2].Data.Literal.Value;
OrderedNode *Src1 = LoadSource_WithOpSize(FPRClass, Op, Op->Src[0], DstSize, Op->Flags, -1);
OrderedNode *Src2 = LoadSource(FPRClass, Op, Op->Src[1], Op->Flags, -1);
OrderedNode *Src2 = LoadSource_WithOpSize(FPRClass, Op, Op->Src[1], SrcSize, Op->Flags, -1);
OrderedNode *Result = VFCMPOpImpl(Op, ElementSize, Scalar, Src1, Src2, CompType);
if (Is128Bit) {

View File

@ -600,36 +600,42 @@
]
},
"cmpss xmm0, xmm1, 0": {
"ExpectedInstructionCount": 5,
"ExpectedInstructionCount": 2,
"Optimal": "No",
"Comment": [
"With AFP mode FEX can remove an insert after the operation.",
"0xf3 0x0f 0xc2"
],
"ExpectedArm64ASM": [
"movi v0.2d, #0x0",
"mov v0.s[0], v17.s[0]",
"mov v4.16b, v0.16b",
"fcmeq s4, s16, s4",
"fcmeq s4, s16, s17",
"mov v16.s[0], v4.s[0]"
]
},
"cmpss xmm0, xmm1, 1": {
"ExpectedInstructionCount": 5,
"ExpectedInstructionCount": 2,
"Optimal": "No",
"Comment": [
"With AFP mode FEX can remove an insert after the operation.",
"0xf3 0x0f 0xc2"
],
"ExpectedArm64ASM": [
"movi v0.2d, #0x0",
"mov v0.s[0], v17.s[0]",
"mov v4.16b, v0.16b",
"fcmgt s4, s4, s16",
"fcmgt s4, s17, s16",
"mov v16.s[0], v4.s[0]"
]
},
"cmpss xmm0, xmm1, 2": {
"ExpectedInstructionCount": 2,
"Optimal": "No",
"Comment": [
"With AFP mode FEX can remove an insert after the operation.",
"0xf3 0x0f 0xc2"
],
"ExpectedArm64ASM": [
"fcmge s4, s17, s16",
"mov v16.s[0], v4.s[0]"
]
},
"cmpss xmm0, xmm1, 3": {
"ExpectedInstructionCount": 5,
"Optimal": "No",
"Comment": [
@ -637,92 +643,62 @@
"0xf3 0x0f 0xc2"
],
"ExpectedArm64ASM": [
"movi v0.2d, #0x0",
"mov v0.s[0], v17.s[0]",
"mov v4.16b, v0.16b",
"fcmge s4, s4, s16",
"mov v16.s[0], v4.s[0]"
]
},
"cmpss xmm0, xmm1, 3": {
"ExpectedInstructionCount": 8,
"Optimal": "No",
"Comment": [
"With AFP mode FEX can remove an insert after the operation.",
"0xf3 0x0f 0xc2"
],
"ExpectedArm64ASM": [
"movi v0.2d, #0x0",
"mov v0.s[0], v17.s[0]",
"mov v4.16b, v0.16b",
"fcmge s0, s16, s4",
"fcmgt s1, s4, s16",
"fcmge s0, s16, s17",
"fcmgt s1, s17, s16",
"orr v4.8b, v0.8b, v1.8b",
"mvn v4.8b, v4.8b",
"mov v16.s[0], v4.s[0]"
]
},
"cmpss xmm0, xmm1, 4": {
"ExpectedInstructionCount": 6,
"ExpectedInstructionCount": 3,
"Optimal": "No",
"Comment": [
"With AFP mode FEX can remove an insert after the operation.",
"0xf3 0x0f 0xc2"
],
"ExpectedArm64ASM": [
"movi v0.2d, #0x0",
"mov v0.s[0], v17.s[0]",
"mov v4.16b, v0.16b",
"fcmeq s4, s16, s4",
"fcmeq s4, s16, s17",
"mvn v4.8b, v4.8b",
"mov v16.s[0], v4.s[0]"
]
},
"cmpss xmm0, xmm1, 5": {
"ExpectedInstructionCount": 6,
"ExpectedInstructionCount": 3,
"Optimal": "No",
"Comment": [
"With AFP mode FEX can remove an insert after the operation.",
"0xf3 0x0f 0xc2"
],
"ExpectedArm64ASM": [
"movi v0.2d, #0x0",
"mov v0.s[0], v17.s[0]",
"mov v4.16b, v0.16b",
"fcmgt s4, s4, s16",
"fcmgt s4, s17, s16",
"mvn v4.16b, v4.16b",
"mov v16.s[0], v4.s[0]"
]
},
"cmpss xmm0, xmm1, 6": {
"ExpectedInstructionCount": 6,
"ExpectedInstructionCount": 3,
"Optimal": "No",
"Comment": [
"With AFP mode FEX can remove an insert after the operation.",
"0xf3 0x0f 0xc2"
],
"ExpectedArm64ASM": [
"movi v0.2d, #0x0",
"mov v0.s[0], v17.s[0]",
"mov v4.16b, v0.16b",
"fcmge s4, s4, s16",
"fcmge s4, s17, s16",
"mvn v4.16b, v4.16b",
"mov v16.s[0], v4.s[0]"
]
},
"cmpss xmm0, xmm1, 7": {
"ExpectedInstructionCount": 7,
"ExpectedInstructionCount": 4,
"Optimal": "No",
"Comment": [
"With AFP mode FEX can remove an insert after the operation.",
"0xf3 0x0f 0xc2"
],
"ExpectedArm64ASM": [
"movi v0.2d, #0x0",
"mov v0.s[0], v17.s[0]",
"mov v4.16b, v0.16b",
"fcmge s0, s16, s4",
"fcmgt s1, s4, s16",
"fcmge s0, s16, s17",
"fcmgt s1, s17, s16",
"orr v4.8b, v0.8b, v1.8b",
"mov v16.s[0], v4.s[0]"
]

View File

@ -413,103 +413,42 @@
]
},
"cmpsd xmm0, xmm1, 0": {
"ExpectedInstructionCount": 3,
"ExpectedInstructionCount": 2,
"Optimal": "No",
"Comment": [
"With AFP mode FEX can remove an insert after the operation.",
"0xf2 0x0f 0xc2"
],
"ExpectedArm64ASM": [
"mov v4.8b, v17.8b",
"fcmeq d4, d16, d4",
"fcmeq d4, d16, d17",
"mov v16.d[0], v4.d[0]"
]
},
"cmpsd xmm0, xmm1, 1": {
"ExpectedInstructionCount": 3,
"ExpectedInstructionCount": 2,
"Optimal": "No",
"Comment": [
"With AFP mode FEX can remove an insert after the operation.",
"0xf2 0x0f 0xc2"
],
"ExpectedArm64ASM": [
"mov v4.8b, v17.8b",
"fcmgt d4, d4, d16",
"fcmgt d4, d17, d16",
"mov v16.d[0], v4.d[0]"
]
},
"cmpsd xmm0, xmm1, 2": {
"ExpectedInstructionCount": 3,
"ExpectedInstructionCount": 2,
"Optimal": "No",
"Comment": [
"With AFP mode FEX can remove an insert after the operation.",
"0xf2 0x0f 0xc2"
],
"ExpectedArm64ASM": [
"mov v4.8b, v17.8b",
"fcmge d4, d4, d16",
"fcmge d4, d17, d16",
"mov v16.d[0], v4.d[0]"
]
},
"cmpsd xmm0, xmm1, 3": {
"ExpectedInstructionCount": 6,
"Optimal": "No",
"Comment": [
"With AFP mode FEX can remove an insert after the operation.",
"0xf2 0x0f 0xc2"
],
"ExpectedArm64ASM": [
"mov v4.8b, v17.8b",
"fcmge d0, d16, d4",
"fcmgt d1, d4, d16",
"orr v4.8b, v0.8b, v1.8b",
"mvn v4.8b, v4.8b",
"mov v16.d[0], v4.d[0]"
]
},
"cmpsd xmm0, xmm1, 4": {
"ExpectedInstructionCount": 4,
"Optimal": "No",
"Comment": [
"With AFP mode FEX can remove an insert after the operation.",
"0xf2 0x0f 0xc2"
],
"ExpectedArm64ASM": [
"mov v4.8b, v17.8b",
"fcmeq d4, d16, d4",
"mvn v4.8b, v4.8b",
"mov v16.d[0], v4.d[0]"
]
},
"cmpsd xmm0, xmm1, 5": {
"ExpectedInstructionCount": 4,
"Optimal": "No",
"Comment": [
"With AFP mode FEX can remove an insert after the operation.",
"0xf2 0x0f 0xc2"
],
"ExpectedArm64ASM": [
"mov v4.8b, v17.8b",
"fcmgt d4, d4, d16",
"mvn v4.16b, v4.16b",
"mov v16.d[0], v4.d[0]"
]
},
"cmpsd xmm0, xmm1, 6": {
"ExpectedInstructionCount": 4,
"Optimal": "No",
"Comment": [
"With AFP mode FEX can remove an insert after the operation.",
"0xf2 0x0f 0xc2"
],
"ExpectedArm64ASM": [
"mov v4.8b, v17.8b",
"fcmge d4, d4, d16",
"mvn v4.16b, v4.16b",
"mov v16.d[0], v4.d[0]"
]
},
"cmpsd xmm0, xmm1, 7": {
"ExpectedInstructionCount": 5,
"Optimal": "No",
"Comment": [
@ -517,9 +456,62 @@
"0xf2 0x0f 0xc2"
],
"ExpectedArm64ASM": [
"mov v4.8b, v17.8b",
"fcmge d0, d16, d4",
"fcmgt d1, d4, d16",
"fcmge d0, d16, d17",
"fcmgt d1, d17, d16",
"orr v4.8b, v0.8b, v1.8b",
"mvn v4.8b, v4.8b",
"mov v16.d[0], v4.d[0]"
]
},
"cmpsd xmm0, xmm1, 4": {
"ExpectedInstructionCount": 3,
"Optimal": "No",
"Comment": [
"With AFP mode FEX can remove an insert after the operation.",
"0xf2 0x0f 0xc2"
],
"ExpectedArm64ASM": [
"fcmeq d4, d16, d17",
"mvn v4.8b, v4.8b",
"mov v16.d[0], v4.d[0]"
]
},
"cmpsd xmm0, xmm1, 5": {
"ExpectedInstructionCount": 3,
"Optimal": "No",
"Comment": [
"With AFP mode FEX can remove an insert after the operation.",
"0xf2 0x0f 0xc2"
],
"ExpectedArm64ASM": [
"fcmgt d4, d17, d16",
"mvn v4.16b, v4.16b",
"mov v16.d[0], v4.d[0]"
]
},
"cmpsd xmm0, xmm1, 6": {
"ExpectedInstructionCount": 3,
"Optimal": "No",
"Comment": [
"With AFP mode FEX can remove an insert after the operation.",
"0xf2 0x0f 0xc2"
],
"ExpectedArm64ASM": [
"fcmge d4, d17, d16",
"mvn v4.16b, v4.16b",
"mov v16.d[0], v4.d[0]"
]
},
"cmpsd xmm0, xmm1, 7": {
"ExpectedInstructionCount": 4,
"Optimal": "No",
"Comment": [
"With AFP mode FEX can remove an insert after the operation.",
"0xf2 0x0f 0xc2"
],
"ExpectedArm64ASM": [
"fcmge d0, d16, d17",
"fcmgt d1, d17, d16",
"orr v4.8b, v0.8b, v1.8b",
"mov v16.d[0], v4.d[0]"
]

View File

@ -3136,7 +3136,7 @@
]
},
"vcmpss xmm0, xmm1, xmm2, 0x00": {
"ExpectedInstructionCount": 10,
"ExpectedInstructionCount": 7,
"Optimal": "No",
"Comment": [
"Map 1 0b10 0xC2 128-bit"
@ -3144,9 +3144,6 @@
"ExpectedArm64ASM": [
"mov z4.d, p7/m, z17.d",
"mov z5.d, p7/m, z18.d",
"movi v0.2d, #0x0",
"mov v0.s[0], v5.s[0]",
"mov v5.16b, v0.16b",
"fcmeq s5, s4, s5",
"mov v4.s[0], v5.s[0]",
"mov v4.16b, v4.16b",
@ -3155,7 +3152,7 @@
]
},
"vcmpss xmm0, xmm1, xmm2, 0x01": {
"ExpectedInstructionCount": 10,
"ExpectedInstructionCount": 7,
"Optimal": "No",
"Comment": [
"Map 1 0b10 0xC2 128-bit"
@ -3163,9 +3160,6 @@
"ExpectedArm64ASM": [
"mov z4.d, p7/m, z17.d",
"mov z5.d, p7/m, z18.d",
"movi v0.2d, #0x0",
"mov v0.s[0], v5.s[0]",
"mov v5.16b, v0.16b",
"fcmgt s5, s5, s4",
"mov v4.s[0], v5.s[0]",
"mov v4.16b, v4.16b",
@ -3174,7 +3168,7 @@
]
},
"vcmpss xmm0, xmm1, xmm2, 0x02": {
"ExpectedInstructionCount": 10,
"ExpectedInstructionCount": 7,
"Optimal": "No",
"Comment": [
"Map 1 0b10 0xC2 128-bit"
@ -3182,9 +3176,6 @@
"ExpectedArm64ASM": [
"mov z4.d, p7/m, z17.d",
"mov z5.d, p7/m, z18.d",
"movi v0.2d, #0x0",
"mov v0.s[0], v5.s[0]",
"mov v5.16b, v0.16b",
"fcmge s5, s5, s4",
"mov v4.s[0], v5.s[0]",
"mov v4.16b, v4.16b",
@ -3193,7 +3184,7 @@
]
},
"vcmpss xmm0, xmm1, xmm2, 0x03": {
"ExpectedInstructionCount": 13,
"ExpectedInstructionCount": 10,
"Optimal": "No",
"Comment": [
"Map 1 0b10 0xC2 128-bit"
@ -3201,9 +3192,6 @@
"ExpectedArm64ASM": [
"mov z4.d, p7/m, z17.d",
"mov z5.d, p7/m, z18.d",
"movi v0.2d, #0x0",
"mov v0.s[0], v5.s[0]",
"mov v5.16b, v0.16b",
"fcmge s0, s4, s5",
"fcmgt s1, s5, s4",
"orr v5.8b, v0.8b, v1.8b",
@ -3215,7 +3203,7 @@
]
},
"vcmpss xmm0, xmm1, xmm2, 0x04": {
"ExpectedInstructionCount": 11,
"ExpectedInstructionCount": 8,
"Optimal": "No",
"Comment": [
"Map 1 0b10 0xC2 128-bit"
@ -3223,9 +3211,6 @@
"ExpectedArm64ASM": [
"mov z4.d, p7/m, z17.d",
"mov z5.d, p7/m, z18.d",
"movi v0.2d, #0x0",
"mov v0.s[0], v5.s[0]",
"mov v5.16b, v0.16b",
"fcmeq s5, s4, s5",
"mvn v5.8b, v5.8b",
"mov v4.s[0], v5.s[0]",
@ -3235,7 +3220,7 @@
]
},
"vcmpss xmm0, xmm1, xmm2, 0x05": {
"ExpectedInstructionCount": 11,
"ExpectedInstructionCount": 8,
"Optimal": "No",
"Comment": [
"Map 1 0b10 0xC2 128-bit"
@ -3243,9 +3228,6 @@
"ExpectedArm64ASM": [
"mov z4.d, p7/m, z17.d",
"mov z5.d, p7/m, z18.d",
"movi v0.2d, #0x0",
"mov v0.s[0], v5.s[0]",
"mov v5.16b, v0.16b",
"fcmgt s5, s5, s4",
"mvn v5.16b, v5.16b",
"mov v4.s[0], v5.s[0]",
@ -3255,7 +3237,7 @@
]
},
"vcmpss xmm0, xmm1, xmm2, 0x06": {
"ExpectedInstructionCount": 11,
"ExpectedInstructionCount": 8,
"Optimal": "No",
"Comment": [
"Map 1 0b10 0xC2 128-bit"
@ -3263,9 +3245,6 @@
"ExpectedArm64ASM": [
"mov z4.d, p7/m, z17.d",
"mov z5.d, p7/m, z18.d",
"movi v0.2d, #0x0",
"mov v0.s[0], v5.s[0]",
"mov v5.16b, v0.16b",
"fcmge s5, s5, s4",
"mvn v5.16b, v5.16b",
"mov v4.s[0], v5.s[0]",
@ -3275,7 +3254,7 @@
]
},
"vcmpss xmm0, xmm1, xmm2, 0x07": {
"ExpectedInstructionCount": 12,
"ExpectedInstructionCount": 9,
"Optimal": "No",
"Comment": [
"Map 1 0b10 0xC2 128-bit"
@ -3283,9 +3262,6 @@
"ExpectedArm64ASM": [
"mov z4.d, p7/m, z17.d",
"mov z5.d, p7/m, z18.d",
"movi v0.2d, #0x0",
"mov v0.s[0], v5.s[0]",
"mov v5.16b, v0.16b",
"fcmge s0, s4, s5",
"fcmgt s1, s5, s4",
"orr v5.8b, v0.8b, v1.8b",
@ -3296,7 +3272,7 @@
]
},
"vcmpsd xmm0, xmm1, xmm2, 0x00": {
"ExpectedInstructionCount": 8,
"ExpectedInstructionCount": 7,
"Optimal": "No",
"Comment": [
"Map 1 0b11 0xC2 128-bit"
@ -3304,7 +3280,6 @@
"ExpectedArm64ASM": [
"mov z4.d, p7/m, z17.d",
"mov z5.d, p7/m, z18.d",
"mov v5.8b, v5.8b",
"fcmeq d5, d4, d5",
"mov v4.d[0], v5.d[0]",
"mov v4.16b, v4.16b",
@ -3313,7 +3288,7 @@
]
},
"vcmpsd xmm0, xmm1, xmm2, 0x01": {
"ExpectedInstructionCount": 8,
"ExpectedInstructionCount": 7,
"Optimal": "No",
"Comment": [
"Map 1 0b11 0xC2 128-bit"
@ -3321,7 +3296,6 @@
"ExpectedArm64ASM": [
"mov z4.d, p7/m, z17.d",
"mov z5.d, p7/m, z18.d",
"mov v5.8b, v5.8b",
"fcmgt d5, d5, d4",
"mov v4.d[0], v5.d[0]",
"mov v4.16b, v4.16b",
@ -3330,7 +3304,7 @@
]
},
"vcmpsd xmm0, xmm1, xmm2, 0x02": {
"ExpectedInstructionCount": 8,
"ExpectedInstructionCount": 7,
"Optimal": "No",
"Comment": [
"Map 1 0b11 0xC2 128-bit"
@ -3338,7 +3312,6 @@
"ExpectedArm64ASM": [
"mov z4.d, p7/m, z17.d",
"mov z5.d, p7/m, z18.d",
"mov v5.8b, v5.8b",
"fcmge d5, d5, d4",
"mov v4.d[0], v5.d[0]",
"mov v4.16b, v4.16b",
@ -3347,7 +3320,7 @@
]
},
"vcmpsd xmm0, xmm1, xmm2, 0x03": {
"ExpectedInstructionCount": 11,
"ExpectedInstructionCount": 10,
"Optimal": "No",
"Comment": [
"Map 1 0b11 0xC2 128-bit"
@ -3355,7 +3328,6 @@
"ExpectedArm64ASM": [
"mov z4.d, p7/m, z17.d",
"mov z5.d, p7/m, z18.d",
"mov v5.8b, v5.8b",
"fcmge d0, d4, d5",
"fcmgt d1, d5, d4",
"orr v5.8b, v0.8b, v1.8b",
@ -3367,7 +3339,7 @@
]
},
"vcmpsd xmm0, xmm1, xmm2, 0x04": {
"ExpectedInstructionCount": 9,
"ExpectedInstructionCount": 8,
"Optimal": "No",
"Comment": [
"Map 1 0b11 0xC2 128-bit"
@ -3375,7 +3347,6 @@
"ExpectedArm64ASM": [
"mov z4.d, p7/m, z17.d",
"mov z5.d, p7/m, z18.d",
"mov v5.8b, v5.8b",
"fcmeq d5, d4, d5",
"mvn v5.8b, v5.8b",
"mov v4.d[0], v5.d[0]",
@ -3385,7 +3356,7 @@
]
},
"vcmpsd xmm0, xmm1, xmm2, 0x05": {
"ExpectedInstructionCount": 9,
"ExpectedInstructionCount": 8,
"Optimal": "No",
"Comment": [
"Map 1 0b11 0xC2 128-bit"
@ -3393,7 +3364,6 @@
"ExpectedArm64ASM": [
"mov z4.d, p7/m, z17.d",
"mov z5.d, p7/m, z18.d",
"mov v5.8b, v5.8b",
"fcmgt d5, d5, d4",
"mvn v5.16b, v5.16b",
"mov v4.d[0], v5.d[0]",
@ -3403,7 +3373,7 @@
]
},
"vcmpsd xmm0, xmm1, xmm2, 0x06": {
"ExpectedInstructionCount": 9,
"ExpectedInstructionCount": 8,
"Optimal": "No",
"Comment": [
"Map 1 0b11 0xC2 128-bit"
@ -3411,7 +3381,6 @@
"ExpectedArm64ASM": [
"mov z4.d, p7/m, z17.d",
"mov z5.d, p7/m, z18.d",
"mov v5.8b, v5.8b",
"fcmge d5, d5, d4",
"mvn v5.16b, v5.16b",
"mov v4.d[0], v5.d[0]",
@ -3421,7 +3390,7 @@
]
},
"vcmpsd xmm0, xmm1, xmm2, 0x07": {
"ExpectedInstructionCount": 10,
"ExpectedInstructionCount": 9,
"Optimal": "No",
"Comment": [
"Map 1 0b11 0xC2 128-bit"
@ -3429,7 +3398,6 @@
"ExpectedArm64ASM": [
"mov z4.d, p7/m, z17.d",
"mov z5.d, p7/m, z18.d",
"mov v5.8b, v5.8b",
"fcmge d0, d4, d5",
"fcmgt d1, d5, d4",
"orr v5.8b, v0.8b, v1.8b",