Merge pull request #3667 from alyssarosenzweig/opt/pcmp

Optimize PCMPESTRI flags a bit
This commit is contained in:
Ryan Houdek 2024-05-28 22:06:37 -07:00 committed by GitHub
commit 35ec54f920
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
4 changed files with 20 additions and 81 deletions

View File

@ -50,22 +50,22 @@ struct OpHandlers<IR::OP_VPCMPESTRX> {
// Bits are arranged as:
// Bit #: 3 2 1 0
// [OF | CF | SF | ZF]
// [SF | ZF | CF | OF]
uint32_t flags = 0;
flags |= (valid_rhs < upper_limit) ? 0b01 : 0b00;
flags |= (valid_lhs < upper_limit) ? 0b10 : 0b00;
flags |= (valid_rhs < upper_limit) ? 0b0100 : 0b0000;
flags |= (valid_lhs < upper_limit) ? 0b1000 : 0b0000;
const uint32_t result = HandlePolarity(aggregation, control, upper_limit, valid_rhs);
if (result != 0) {
flags |= 0b0100;
flags |= 0b0010;
}
if ((result & 1) != 0) {
flags |= 0b1000;
flags |= 0b0001;
}
// We tack the flags on top of the result to avoid needing to handle
// multiple return values in the JITs.
return result | (flags << 16);
// We track the flags in the usual NZCV bit position so we can msr them
// later. Avoids handling flags natively in JIT.
return result | (flags << 28);
}
FEXCORE_PRESERVE_ALL_ATTR static int32_t GetExplicitLength(uint64_t reg, uint16_t control) {

View File

@ -5066,33 +5066,15 @@ void OpDispatchBuilder::PCMPXSTRXOpImpl(OpcodeArgs, bool IsExplicit, bool IsMask
OrderedNode* IfZero = _Constant(16 >> (Control & 1));
OrderedNode* IfNotZero = UseMSBIndex ? _FindMSB(IR::OpSize::i32Bit, ResultNoFlags) : _FindLSB(IR::OpSize::i32Bit, ResultNoFlags);
OrderedNode* Result = _Select(IR::COND_EQ, ResultNoFlags, ZeroConst, IfZero, IfNotZero);
const uint8_t GPRSize = CTX->GetGPRSize();
if (GPRSize == 8) {
// If being stored to an 8-byte register, zero extend the 4-byte result.
Result = _Bfe(OpSize::i64Bit, 32, 0, Result);
}
// Store the result; it is already implicitly zero-extended to 64 bits.
StoreGPRRegister(X86State::REG_RCX, Result);
}
// Set all of the necessary flags.
// We use the top 16-bits of the result to store the flags
// in the form:
//
// Bit: 19 18 17 16
// [OF | CF | SF | ZF]
//
const auto GetFlagBit = [this, IntermediateResult](int BitIndex) {
return _Bfe(OpSize::i32Bit, 1, BitIndex, IntermediateResult);
};
SetRFLAG<X86State::RFLAG_ZF_RAW_LOC>(GetFlagBit(16));
SetRFLAG<X86State::RFLAG_SF_RAW_LOC>(GetFlagBit(17));
SetRFLAG<X86State::RFLAG_CF_RAW_LOC>(GetFlagBit(18));
SetRFLAG<X86State::RFLAG_OF_RAW_LOC>(GetFlagBit(19));
// Set all of the necessary flags. NZCV stored in bits 28...31 like the hw op.
SetNZCV(IntermediateResult);
PossiblySetNZCVBits = ~0;
ZeroPF_AF();
}

View File

@ -250,7 +250,7 @@
]
},
"pcmpistri xmm0, xmm1, 0_0_00_11_01b": {
"ExpectedInstructionCount": 49,
"ExpectedInstructionCount": 40,
"Comment": [
"A Hat In Time spends at least 5% CPU time in this instruction",
"Comes from vcruntime140.dll wcsstr"
@ -293,16 +293,7 @@
"clz w23, w0",
"csinv w23, w23, wzr, ne",
"cmp x21, #0x0 (0)",
"csel x21, x22, x23, eq",
"mov w5, w21",
"ubfx w21, w20, #16, #1",
"lsl x21, x21, #30",
"ubfx w22, w20, #17, #1",
"orr w21, w21, w22, lsl #31",
"ubfx w22, w20, #18, #1",
"orr w21, w21, w22, lsl #29",
"ubfx w20, w20, #19, #1",
"orr w20, w21, w20, lsl #28",
"csel x5, x22, x23, eq",
"mov w26, #0x1",
"msr nzcv, x20"
]

View File

@ -35,7 +35,7 @@
},
"Instructions": {
"pcmpestrm xmm0, xmm1, 0_0_00_00_00b": {
"ExpectedInstructionCount": 43,
"ExpectedInstructionCount": 35,
"Comment": [
"0x66 0x0f 0x3A 0x60"
],
@ -73,20 +73,12 @@
"mov w27, #0x0",
"uxth w0, w20",
"fmov s16, w0",
"ubfx w21, w20, #16, #1",
"lsl x21, x21, #30",
"ubfx w22, w20, #17, #1",
"orr w21, w21, w22, lsl #31",
"ubfx w22, w20, #18, #1",
"orr w21, w21, w22, lsl #29",
"ubfx w20, w20, #19, #1",
"orr w20, w21, w20, lsl #28",
"mov w26, #0x1",
"msr nzcv, x20"
]
},
"pcmpestri xmm0, xmm1, 0_0_00_00_00b": {
"ExpectedInstructionCount": 51,
"ExpectedInstructionCount": 42,
"Comment": [
"0x66 0x0f 0x3A 0x61"
],
@ -130,22 +122,13 @@
"clz w23, w0",
"csinv w23, w23, wzr, ne",
"cmp x21, #0x0 (0)",
"csel x21, x22, x23, eq",
"mov w5, w21",
"ubfx w21, w20, #16, #1",
"lsl x21, x21, #30",
"ubfx w22, w20, #17, #1",
"orr w21, w21, w22, lsl #31",
"ubfx w22, w20, #18, #1",
"orr w21, w21, w22, lsl #29",
"ubfx w20, w20, #19, #1",
"orr w20, w21, w20, lsl #28",
"csel x5, x22, x23, eq",
"mov w26, #0x1",
"msr nzcv, x20"
]
},
"pcmpistrm xmm0, xmm1, 0_0_00_00_00b": {
"ExpectedInstructionCount": 41,
"ExpectedInstructionCount": 33,
"Comment": [
"0x66 0x0f 0x3A 0x62"
],
@ -181,20 +164,12 @@
"mov w27, #0x0",
"uxth w0, w20",
"fmov s16, w0",
"ubfx w21, w20, #16, #1",
"lsl x21, x21, #30",
"ubfx w22, w20, #17, #1",
"orr w21, w21, w22, lsl #31",
"ubfx w22, w20, #18, #1",
"orr w21, w21, w22, lsl #29",
"ubfx w20, w20, #19, #1",
"orr w20, w21, w20, lsl #28",
"mov w26, #0x1",
"msr nzcv, x20"
]
},
"pcmpistri xmm0, xmm1, 0_0_00_00_00b": {
"ExpectedInstructionCount": 49,
"ExpectedInstructionCount": 40,
"Comment": [
"0x66 0x0f 0x3A 0x63"
],
@ -236,16 +211,7 @@
"clz w23, w0",
"csinv w23, w23, wzr, ne",
"cmp x21, #0x0 (0)",
"csel x21, x22, x23, eq",
"mov w5, w21",
"ubfx w21, w20, #16, #1",
"lsl x21, x21, #30",
"ubfx w22, w20, #17, #1",
"orr w21, w21, w22, lsl #31",
"ubfx w22, w20, #18, #1",
"orr w21, w21, w22, lsl #29",
"ubfx w20, w20, #19, #1",
"orr w20, w21, w20, lsl #28",
"csel x5, x22, x23, eq",
"mov w26, #0x1",
"msr nzcv, x20"
]