Merge pull request #3667 from alyssarosenzweig/opt/pcmp

Optimize PCMPESTRI flags a bit
This commit is contained in:
Ryan Houdek 2024-05-28 22:06:37 -07:00 committed by GitHub
commit 35ec54f920
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
4 changed files with 20 additions and 81 deletions

View File

@ -50,22 +50,22 @@ struct OpHandlers<IR::OP_VPCMPESTRX> {
// Bits are arranged as: // Bits are arranged as:
// Bit #: 3 2 1 0 // Bit #: 3 2 1 0
// [OF | CF | SF | ZF] // [SF | ZF | CF | OF]
uint32_t flags = 0; uint32_t flags = 0;
flags |= (valid_rhs < upper_limit) ? 0b01 : 0b00; flags |= (valid_rhs < upper_limit) ? 0b0100 : 0b0000;
flags |= (valid_lhs < upper_limit) ? 0b10 : 0b00; flags |= (valid_lhs < upper_limit) ? 0b1000 : 0b0000;
const uint32_t result = HandlePolarity(aggregation, control, upper_limit, valid_rhs); const uint32_t result = HandlePolarity(aggregation, control, upper_limit, valid_rhs);
if (result != 0) { if (result != 0) {
flags |= 0b0100; flags |= 0b0010;
} }
if ((result & 1) != 0) { if ((result & 1) != 0) {
flags |= 0b1000; flags |= 0b0001;
} }
// We tack the flags on top of the result to avoid needing to handle // We place the flags in the usual NZCV bit positions so we can msr them
// multiple return values in the JITs. // later. This avoids handling flags natively in the JIT.
return result | (flags << 16); return result | (flags << 28);
} }
FEXCORE_PRESERVE_ALL_ATTR static int32_t GetExplicitLength(uint64_t reg, uint16_t control) { FEXCORE_PRESERVE_ALL_ATTR static int32_t GetExplicitLength(uint64_t reg, uint16_t control) {

View File

@ -5066,33 +5066,15 @@ void OpDispatchBuilder::PCMPXSTRXOpImpl(OpcodeArgs, bool IsExplicit, bool IsMask
OrderedNode* IfZero = _Constant(16 >> (Control & 1)); OrderedNode* IfZero = _Constant(16 >> (Control & 1));
OrderedNode* IfNotZero = UseMSBIndex ? _FindMSB(IR::OpSize::i32Bit, ResultNoFlags) : _FindLSB(IR::OpSize::i32Bit, ResultNoFlags); OrderedNode* IfNotZero = UseMSBIndex ? _FindMSB(IR::OpSize::i32Bit, ResultNoFlags) : _FindLSB(IR::OpSize::i32Bit, ResultNoFlags);
OrderedNode* Result = _Select(IR::COND_EQ, ResultNoFlags, ZeroConst, IfZero, IfNotZero); OrderedNode* Result = _Select(IR::COND_EQ, ResultNoFlags, ZeroConst, IfZero, IfNotZero);
const uint8_t GPRSize = CTX->GetGPRSize(); // Store the result; it is already implicitly zero-extended to 64 bits.
if (GPRSize == 8) {
// If being stored to an 8-byte register, zero extend the 4-byte result.
Result = _Bfe(OpSize::i64Bit, 32, 0, Result);
}
StoreGPRRegister(X86State::REG_RCX, Result); StoreGPRRegister(X86State::REG_RCX, Result);
} }
// Set all of the necessary flags. // Set all of the necessary flags. NZCV is stored in bits 28...31, like the hw op.
// We use the top 16-bits of the result to store the flags SetNZCV(IntermediateResult);
// in the form: PossiblySetNZCVBits = ~0;
//
// Bit: 19 18 17 16
// [OF | CF | SF | ZF]
//
const auto GetFlagBit = [this, IntermediateResult](int BitIndex) {
return _Bfe(OpSize::i32Bit, 1, BitIndex, IntermediateResult);
};
SetRFLAG<X86State::RFLAG_ZF_RAW_LOC>(GetFlagBit(16));
SetRFLAG<X86State::RFLAG_SF_RAW_LOC>(GetFlagBit(17));
SetRFLAG<X86State::RFLAG_CF_RAW_LOC>(GetFlagBit(18));
SetRFLAG<X86State::RFLAG_OF_RAW_LOC>(GetFlagBit(19));
ZeroPF_AF(); ZeroPF_AF();
} }

View File

@ -250,7 +250,7 @@
] ]
}, },
"pcmpistri xmm0, xmm1, 0_0_00_11_01b": { "pcmpistri xmm0, xmm1, 0_0_00_11_01b": {
"ExpectedInstructionCount": 49, "ExpectedInstructionCount": 40,
"Comment": [ "Comment": [
"A Hat In Time spends at least 5% CPU time in this instruction", "A Hat In Time spends at least 5% CPU time in this instruction",
"Comes from vcruntime140.dll wcsstr" "Comes from vcruntime140.dll wcsstr"
@ -293,16 +293,7 @@
"clz w23, w0", "clz w23, w0",
"csinv w23, w23, wzr, ne", "csinv w23, w23, wzr, ne",
"cmp x21, #0x0 (0)", "cmp x21, #0x0 (0)",
"csel x21, x22, x23, eq", "csel x5, x22, x23, eq",
"mov w5, w21",
"ubfx w21, w20, #16, #1",
"lsl x21, x21, #30",
"ubfx w22, w20, #17, #1",
"orr w21, w21, w22, lsl #31",
"ubfx w22, w20, #18, #1",
"orr w21, w21, w22, lsl #29",
"ubfx w20, w20, #19, #1",
"orr w20, w21, w20, lsl #28",
"mov w26, #0x1", "mov w26, #0x1",
"msr nzcv, x20" "msr nzcv, x20"
] ]

View File

@ -35,7 +35,7 @@
}, },
"Instructions": { "Instructions": {
"pcmpestrm xmm0, xmm1, 0_0_00_00_00b": { "pcmpestrm xmm0, xmm1, 0_0_00_00_00b": {
"ExpectedInstructionCount": 43, "ExpectedInstructionCount": 35,
"Comment": [ "Comment": [
"0x66 0x0f 0x3A 0x60" "0x66 0x0f 0x3A 0x60"
], ],
@ -73,20 +73,12 @@
"mov w27, #0x0", "mov w27, #0x0",
"uxth w0, w20", "uxth w0, w20",
"fmov s16, w0", "fmov s16, w0",
"ubfx w21, w20, #16, #1",
"lsl x21, x21, #30",
"ubfx w22, w20, #17, #1",
"orr w21, w21, w22, lsl #31",
"ubfx w22, w20, #18, #1",
"orr w21, w21, w22, lsl #29",
"ubfx w20, w20, #19, #1",
"orr w20, w21, w20, lsl #28",
"mov w26, #0x1", "mov w26, #0x1",
"msr nzcv, x20" "msr nzcv, x20"
] ]
}, },
"pcmpestri xmm0, xmm1, 0_0_00_00_00b": { "pcmpestri xmm0, xmm1, 0_0_00_00_00b": {
"ExpectedInstructionCount": 51, "ExpectedInstructionCount": 42,
"Comment": [ "Comment": [
"0x66 0x0f 0x3A 0x61" "0x66 0x0f 0x3A 0x61"
], ],
@ -130,22 +122,13 @@
"clz w23, w0", "clz w23, w0",
"csinv w23, w23, wzr, ne", "csinv w23, w23, wzr, ne",
"cmp x21, #0x0 (0)", "cmp x21, #0x0 (0)",
"csel x21, x22, x23, eq", "csel x5, x22, x23, eq",
"mov w5, w21",
"ubfx w21, w20, #16, #1",
"lsl x21, x21, #30",
"ubfx w22, w20, #17, #1",
"orr w21, w21, w22, lsl #31",
"ubfx w22, w20, #18, #1",
"orr w21, w21, w22, lsl #29",
"ubfx w20, w20, #19, #1",
"orr w20, w21, w20, lsl #28",
"mov w26, #0x1", "mov w26, #0x1",
"msr nzcv, x20" "msr nzcv, x20"
] ]
}, },
"pcmpistrm xmm0, xmm1, 0_0_00_00_00b": { "pcmpistrm xmm0, xmm1, 0_0_00_00_00b": {
"ExpectedInstructionCount": 41, "ExpectedInstructionCount": 33,
"Comment": [ "Comment": [
"0x66 0x0f 0x3A 0x62" "0x66 0x0f 0x3A 0x62"
], ],
@ -181,20 +164,12 @@
"mov w27, #0x0", "mov w27, #0x0",
"uxth w0, w20", "uxth w0, w20",
"fmov s16, w0", "fmov s16, w0",
"ubfx w21, w20, #16, #1",
"lsl x21, x21, #30",
"ubfx w22, w20, #17, #1",
"orr w21, w21, w22, lsl #31",
"ubfx w22, w20, #18, #1",
"orr w21, w21, w22, lsl #29",
"ubfx w20, w20, #19, #1",
"orr w20, w21, w20, lsl #28",
"mov w26, #0x1", "mov w26, #0x1",
"msr nzcv, x20" "msr nzcv, x20"
] ]
}, },
"pcmpistri xmm0, xmm1, 0_0_00_00_00b": { "pcmpistri xmm0, xmm1, 0_0_00_00_00b": {
"ExpectedInstructionCount": 49, "ExpectedInstructionCount": 40,
"Comment": [ "Comment": [
"0x66 0x0f 0x3A 0x63" "0x66 0x0f 0x3A 0x63"
], ],
@ -236,16 +211,7 @@
"clz w23, w0", "clz w23, w0",
"csinv w23, w23, wzr, ne", "csinv w23, w23, wzr, ne",
"cmp x21, #0x0 (0)", "cmp x21, #0x0 (0)",
"csel x21, x22, x23, eq", "csel x5, x22, x23, eq",
"mov w5, w21",
"ubfx w21, w20, #16, #1",
"lsl x21, x21, #30",
"ubfx w22, w20, #17, #1",
"orr w21, w21, w22, lsl #31",
"ubfx w22, w20, #18, #1",
"orr w21, w21, w22, lsl #29",
"ubfx w20, w20, #19, #1",
"orr w20, w21, w20, lsl #28",
"mov w26, #0x1", "mov w26, #0x1",
"msr nzcv, x20" "msr nzcv, x20"
] ]