mirror of
https://github.com/FEX-Emu/FEX.git
synced 2025-03-01 11:18:42 +00:00
Merge pull request #3667 from alyssarosenzweig/opt/pcmp
Optimize PCMPESTRI flags a bit
This commit is contained in:
commit
35ec54f920
@ -50,22 +50,22 @@ struct OpHandlers<IR::OP_VPCMPESTRX> {
|
|||||||
|
|
||||||
// Bits are arranged as:
|
// Bits are arranged as:
|
||||||
// Bit #: 3 2 1 0
|
// Bit #: 3 2 1 0
|
||||||
// [OF | CF | SF | ZF]
|
// [SF | ZF | CF | OF]
|
||||||
uint32_t flags = 0;
|
uint32_t flags = 0;
|
||||||
flags |= (valid_rhs < upper_limit) ? 0b01 : 0b00;
|
flags |= (valid_rhs < upper_limit) ? 0b0100 : 0b0000;
|
||||||
flags |= (valid_lhs < upper_limit) ? 0b10 : 0b00;
|
flags |= (valid_lhs < upper_limit) ? 0b1000 : 0b0000;
|
||||||
|
|
||||||
const uint32_t result = HandlePolarity(aggregation, control, upper_limit, valid_rhs);
|
const uint32_t result = HandlePolarity(aggregation, control, upper_limit, valid_rhs);
|
||||||
if (result != 0) {
|
if (result != 0) {
|
||||||
flags |= 0b0100;
|
flags |= 0b0010;
|
||||||
}
|
}
|
||||||
if ((result & 1) != 0) {
|
if ((result & 1) != 0) {
|
||||||
flags |= 0b1000;
|
flags |= 0b0001;
|
||||||
}
|
}
|
||||||
|
|
||||||
// We tack the flags on top of the result to avoid needing to handle
|
// We track the flags in the usual NZCV bit position so we can msr them
|
||||||
// multiple return values in the JITs.
|
// later. Avoids handling flags natively in JIT.
|
||||||
return result | (flags << 16);
|
return result | (flags << 28);
|
||||||
}
|
}
|
||||||
|
|
||||||
FEXCORE_PRESERVE_ALL_ATTR static int32_t GetExplicitLength(uint64_t reg, uint16_t control) {
|
FEXCORE_PRESERVE_ALL_ATTR static int32_t GetExplicitLength(uint64_t reg, uint16_t control) {
|
||||||
|
@ -5066,33 +5066,15 @@ void OpDispatchBuilder::PCMPXSTRXOpImpl(OpcodeArgs, bool IsExplicit, bool IsMask
|
|||||||
|
|
||||||
OrderedNode* IfZero = _Constant(16 >> (Control & 1));
|
OrderedNode* IfZero = _Constant(16 >> (Control & 1));
|
||||||
OrderedNode* IfNotZero = UseMSBIndex ? _FindMSB(IR::OpSize::i32Bit, ResultNoFlags) : _FindLSB(IR::OpSize::i32Bit, ResultNoFlags);
|
OrderedNode* IfNotZero = UseMSBIndex ? _FindMSB(IR::OpSize::i32Bit, ResultNoFlags) : _FindLSB(IR::OpSize::i32Bit, ResultNoFlags);
|
||||||
|
|
||||||
OrderedNode* Result = _Select(IR::COND_EQ, ResultNoFlags, ZeroConst, IfZero, IfNotZero);
|
OrderedNode* Result = _Select(IR::COND_EQ, ResultNoFlags, ZeroConst, IfZero, IfNotZero);
|
||||||
|
|
||||||
const uint8_t GPRSize = CTX->GetGPRSize();
|
// Store the result; it is already implicitly zero-extended to 64 bits.
|
||||||
if (GPRSize == 8) {
|
|
||||||
// If being stored to an 8-byte register, zero extend the 4-byte result.
|
|
||||||
Result = _Bfe(OpSize::i64Bit, 32, 0, Result);
|
|
||||||
}
|
|
||||||
StoreGPRRegister(X86State::REG_RCX, Result);
|
StoreGPRRegister(X86State::REG_RCX, Result);
|
||||||
}
|
}
|
||||||
|
|
||||||
// Set all of the necessary flags.
|
// Set all of the necessary flags. The fallback returns NZCV in bits 28-31, matching the AArch64 NZCV register layout.
|
||||||
// We use the top 16-bits of the result to store the flags
|
SetNZCV(IntermediateResult);
|
||||||
// in the form:
|
PossiblySetNZCVBits = ~0;
|
||||||
//
|
|
||||||
// Bit: 19 18 17 16
|
|
||||||
// [OF | CF | SF | ZF]
|
|
||||||
//
|
|
||||||
const auto GetFlagBit = [this, IntermediateResult](int BitIndex) {
|
|
||||||
return _Bfe(OpSize::i32Bit, 1, BitIndex, IntermediateResult);
|
|
||||||
};
|
|
||||||
|
|
||||||
SetRFLAG<X86State::RFLAG_ZF_RAW_LOC>(GetFlagBit(16));
|
|
||||||
SetRFLAG<X86State::RFLAG_SF_RAW_LOC>(GetFlagBit(17));
|
|
||||||
SetRFLAG<X86State::RFLAG_CF_RAW_LOC>(GetFlagBit(18));
|
|
||||||
SetRFLAG<X86State::RFLAG_OF_RAW_LOC>(GetFlagBit(19));
|
|
||||||
|
|
||||||
ZeroPF_AF();
|
ZeroPF_AF();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -250,7 +250,7 @@
|
|||||||
]
|
]
|
||||||
},
|
},
|
||||||
"pcmpistri xmm0, xmm1, 0_0_00_11_01b": {
|
"pcmpistri xmm0, xmm1, 0_0_00_11_01b": {
|
||||||
"ExpectedInstructionCount": 49,
|
"ExpectedInstructionCount": 40,
|
||||||
"Comment": [
|
"Comment": [
|
||||||
"A Hat In Time spends at least 5% CPU time in this instruction",
|
"A Hat In Time spends at least 5% CPU time in this instruction",
|
||||||
"Comes from vcruntime140.dll wcsstr"
|
"Comes from vcruntime140.dll wcsstr"
|
||||||
@ -293,16 +293,7 @@
|
|||||||
"clz w23, w0",
|
"clz w23, w0",
|
||||||
"csinv w23, w23, wzr, ne",
|
"csinv w23, w23, wzr, ne",
|
||||||
"cmp x21, #0x0 (0)",
|
"cmp x21, #0x0 (0)",
|
||||||
"csel x21, x22, x23, eq",
|
"csel x5, x22, x23, eq",
|
||||||
"mov w5, w21",
|
|
||||||
"ubfx w21, w20, #16, #1",
|
|
||||||
"lsl x21, x21, #30",
|
|
||||||
"ubfx w22, w20, #17, #1",
|
|
||||||
"orr w21, w21, w22, lsl #31",
|
|
||||||
"ubfx w22, w20, #18, #1",
|
|
||||||
"orr w21, w21, w22, lsl #29",
|
|
||||||
"ubfx w20, w20, #19, #1",
|
|
||||||
"orr w20, w21, w20, lsl #28",
|
|
||||||
"mov w26, #0x1",
|
"mov w26, #0x1",
|
||||||
"msr nzcv, x20"
|
"msr nzcv, x20"
|
||||||
]
|
]
|
||||||
|
@ -35,7 +35,7 @@
|
|||||||
},
|
},
|
||||||
"Instructions": {
|
"Instructions": {
|
||||||
"pcmpestrm xmm0, xmm1, 0_0_00_00_00b": {
|
"pcmpestrm xmm0, xmm1, 0_0_00_00_00b": {
|
||||||
"ExpectedInstructionCount": 43,
|
"ExpectedInstructionCount": 35,
|
||||||
"Comment": [
|
"Comment": [
|
||||||
"0x66 0x0f 0x3A 0x60"
|
"0x66 0x0f 0x3A 0x60"
|
||||||
],
|
],
|
||||||
@ -73,20 +73,12 @@
|
|||||||
"mov w27, #0x0",
|
"mov w27, #0x0",
|
||||||
"uxth w0, w20",
|
"uxth w0, w20",
|
||||||
"fmov s16, w0",
|
"fmov s16, w0",
|
||||||
"ubfx w21, w20, #16, #1",
|
|
||||||
"lsl x21, x21, #30",
|
|
||||||
"ubfx w22, w20, #17, #1",
|
|
||||||
"orr w21, w21, w22, lsl #31",
|
|
||||||
"ubfx w22, w20, #18, #1",
|
|
||||||
"orr w21, w21, w22, lsl #29",
|
|
||||||
"ubfx w20, w20, #19, #1",
|
|
||||||
"orr w20, w21, w20, lsl #28",
|
|
||||||
"mov w26, #0x1",
|
"mov w26, #0x1",
|
||||||
"msr nzcv, x20"
|
"msr nzcv, x20"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
"pcmpestri xmm0, xmm1, 0_0_00_00_00b": {
|
"pcmpestri xmm0, xmm1, 0_0_00_00_00b": {
|
||||||
"ExpectedInstructionCount": 51,
|
"ExpectedInstructionCount": 42,
|
||||||
"Comment": [
|
"Comment": [
|
||||||
"0x66 0x0f 0x3A 0x61"
|
"0x66 0x0f 0x3A 0x61"
|
||||||
],
|
],
|
||||||
@ -130,22 +122,13 @@
|
|||||||
"clz w23, w0",
|
"clz w23, w0",
|
||||||
"csinv w23, w23, wzr, ne",
|
"csinv w23, w23, wzr, ne",
|
||||||
"cmp x21, #0x0 (0)",
|
"cmp x21, #0x0 (0)",
|
||||||
"csel x21, x22, x23, eq",
|
"csel x5, x22, x23, eq",
|
||||||
"mov w5, w21",
|
|
||||||
"ubfx w21, w20, #16, #1",
|
|
||||||
"lsl x21, x21, #30",
|
|
||||||
"ubfx w22, w20, #17, #1",
|
|
||||||
"orr w21, w21, w22, lsl #31",
|
|
||||||
"ubfx w22, w20, #18, #1",
|
|
||||||
"orr w21, w21, w22, lsl #29",
|
|
||||||
"ubfx w20, w20, #19, #1",
|
|
||||||
"orr w20, w21, w20, lsl #28",
|
|
||||||
"mov w26, #0x1",
|
"mov w26, #0x1",
|
||||||
"msr nzcv, x20"
|
"msr nzcv, x20"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
"pcmpistrm xmm0, xmm1, 0_0_00_00_00b": {
|
"pcmpistrm xmm0, xmm1, 0_0_00_00_00b": {
|
||||||
"ExpectedInstructionCount": 41,
|
"ExpectedInstructionCount": 33,
|
||||||
"Comment": [
|
"Comment": [
|
||||||
"0x66 0x0f 0x3A 0x62"
|
"0x66 0x0f 0x3A 0x62"
|
||||||
],
|
],
|
||||||
@ -181,20 +164,12 @@
|
|||||||
"mov w27, #0x0",
|
"mov w27, #0x0",
|
||||||
"uxth w0, w20",
|
"uxth w0, w20",
|
||||||
"fmov s16, w0",
|
"fmov s16, w0",
|
||||||
"ubfx w21, w20, #16, #1",
|
|
||||||
"lsl x21, x21, #30",
|
|
||||||
"ubfx w22, w20, #17, #1",
|
|
||||||
"orr w21, w21, w22, lsl #31",
|
|
||||||
"ubfx w22, w20, #18, #1",
|
|
||||||
"orr w21, w21, w22, lsl #29",
|
|
||||||
"ubfx w20, w20, #19, #1",
|
|
||||||
"orr w20, w21, w20, lsl #28",
|
|
||||||
"mov w26, #0x1",
|
"mov w26, #0x1",
|
||||||
"msr nzcv, x20"
|
"msr nzcv, x20"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
"pcmpistri xmm0, xmm1, 0_0_00_00_00b": {
|
"pcmpistri xmm0, xmm1, 0_0_00_00_00b": {
|
||||||
"ExpectedInstructionCount": 49,
|
"ExpectedInstructionCount": 40,
|
||||||
"Comment": [
|
"Comment": [
|
||||||
"0x66 0x0f 0x3A 0x63"
|
"0x66 0x0f 0x3A 0x63"
|
||||||
],
|
],
|
||||||
@ -236,16 +211,7 @@
|
|||||||
"clz w23, w0",
|
"clz w23, w0",
|
||||||
"csinv w23, w23, wzr, ne",
|
"csinv w23, w23, wzr, ne",
|
||||||
"cmp x21, #0x0 (0)",
|
"cmp x21, #0x0 (0)",
|
||||||
"csel x21, x22, x23, eq",
|
"csel x5, x22, x23, eq",
|
||||||
"mov w5, w21",
|
|
||||||
"ubfx w21, w20, #16, #1",
|
|
||||||
"lsl x21, x21, #30",
|
|
||||||
"ubfx w22, w20, #17, #1",
|
|
||||||
"orr w21, w21, w22, lsl #31",
|
|
||||||
"ubfx w22, w20, #18, #1",
|
|
||||||
"orr w21, w21, w22, lsl #29",
|
|
||||||
"ubfx w20, w20, #19, #1",
|
|
||||||
"orr w20, w21, w20, lsl #28",
|
|
||||||
"mov w26, #0x1",
|
"mov w26, #0x1",
|
||||||
"msr nzcv, x20"
|
"msr nzcv, x20"
|
||||||
]
|
]
|
||||||
|
Loading…
x
Reference in New Issue
Block a user