[X86] Improved sched models for X86 BT*rr instructions.

https://reviews.llvm.org/D49243

llvm-svn: 338365
This commit is contained in:
Andrew V. Tischenko 2018-07-31 12:33:48 +00:00
parent c0ea535a72
commit fac48f4efe
13 changed files with 112 additions and 48 deletions

View File

@ -1750,7 +1750,7 @@ def LAHF : I<0x9F, RawFrm, (outs), (ins), "lahf", []>, // AH = flags
// Bit tests instructions: BT, BTS, BTR, BTC.
let Defs = [EFLAGS] in {
let SchedRW = [WriteALU] in {
let SchedRW = [WriteBitTest] in {
def BT16rr : I<0xA3, MRMDestReg, (outs), (ins GR16:$src1, GR16:$src2),
"bt{w}\t{$src2, $src1|$src1, $src2}",
[(set EFLAGS, (X86bt GR16:$src1, GR16:$src2))]>,
@ -1783,7 +1783,7 @@ let mayLoad = 1, hasSideEffects = 0, SchedRW = [WriteALULd] in {
[]>, TB, NotMemoryFoldable;
}
let SchedRW = [WriteALU] in {
let SchedRW = [WriteBitTest] in {
def BT16ri8 : Ii8<0xBA, MRM4r, (outs), (ins GR16:$src1, i16i8imm:$src2),
"bt{w}\t{$src2, $src1|$src1, $src2}",
[(set EFLAGS, (X86bt GR16:$src1, i16immSExt8:$src2))]>,
@ -1818,7 +1818,7 @@ def BT64mi8 : RIi8<0xBA, MRM4m, (outs), (ins i64mem:$src1, i64i8imm:$src2),
} // SchedRW
let hasSideEffects = 0 in {
let SchedRW = [WriteALU], Constraints = "$src1 = $dst" in {
let SchedRW = [WriteBitTest], Constraints = "$src1 = $dst" in {
def BTC16rr : I<0xBB, MRMDestReg, (outs GR16:$dst), (ins GR16:$src1, GR16:$src2),
"btc{w}\t{$src2, $src1|$src1, $src2}", []>,
OpSize16, TB, NotMemoryFoldable;
@ -1842,7 +1842,7 @@ def BTC64mr : RI<0xBB, MRMDestMem, (outs), (ins i64mem:$src1, GR64:$src2),
NotMemoryFoldable;
}
let SchedRW = [WriteALU], Constraints = "$src1 = $dst" in {
let SchedRW = [WriteBitTest], Constraints = "$src1 = $dst" in {
def BTC16ri8 : Ii8<0xBA, MRM7r, (outs GR16:$dst), (ins GR16:$src1, i16i8imm:$src2),
"btc{w}\t{$src2, $src1|$src1, $src2}", []>, OpSize16, TB;
def BTC32ri8 : Ii8<0xBA, MRM7r, (outs GR32:$dst), (ins GR32:$src1, i32i8imm:$src2),
@ -1861,7 +1861,7 @@ def BTC64mi8 : RIi8<0xBA, MRM7m, (outs), (ins i64mem:$src1, i64i8imm:$src2),
Requires<[In64BitMode]>;
}
let SchedRW = [WriteALU], Constraints = "$src1 = $dst" in {
let SchedRW = [WriteBitTest], Constraints = "$src1 = $dst" in {
def BTR16rr : I<0xB3, MRMDestReg, (outs GR16:$dst), (ins GR16:$src1, GR16:$src2),
"btr{w}\t{$src2, $src1|$src1, $src2}", []>,
OpSize16, TB, NotMemoryFoldable;
@ -1885,7 +1885,7 @@ def BTR64mr : RI<0xB3, MRMDestMem, (outs), (ins i64mem:$src1, GR64:$src2),
NotMemoryFoldable;
}
let SchedRW = [WriteALU], Constraints = "$src1 = $dst" in {
let SchedRW = [WriteBitTest], Constraints = "$src1 = $dst" in {
def BTR16ri8 : Ii8<0xBA, MRM6r, (outs GR16:$dst), (ins GR16:$src1, i16i8imm:$src2),
"btr{w}\t{$src2, $src1|$src1, $src2}", []>,
OpSize16, TB;
@ -1908,7 +1908,7 @@ def BTR64mi8 : RIi8<0xBA, MRM6m, (outs), (ins i64mem:$src1, i64i8imm:$src2),
Requires<[In64BitMode]>;
}
let SchedRW = [WriteALU], Constraints = "$src1 = $dst" in {
let SchedRW = [WriteBitTest], Constraints = "$src1 = $dst" in {
def BTS16rr : I<0xAB, MRMDestReg, (outs GR16:$dst), (ins GR16:$src1, GR16:$src2),
"bts{w}\t{$src2, $src1|$src1, $src2}", []>,
OpSize16, TB, NotMemoryFoldable;
@ -1932,7 +1932,7 @@ def BTS64mr : RI<0xAB, MRMDestMem, (outs), (ins i64mem:$src1, GR64:$src2),
NotMemoryFoldable;
}
let SchedRW = [WriteALU], Constraints = "$src1 = $dst" in {
let SchedRW = [WriteBitTest], Constraints = "$src1 = $dst" in {
def BTS16ri8 : Ii8<0xBA, MRM5r, (outs GR16:$dst), (ins GR16:$src1, i16i8imm:$src2),
"bts{w}\t{$src2, $src1|$src1, $src2}", []>, OpSize16, TB;
def BTS32ri8 : Ii8<0xBA, MRM5r, (outs GR32:$dst), (ins GR32:$src1, i32i8imm:$src2),

View File

@ -137,6 +137,7 @@ def : WriteRes<WriteSETCCStore, [BWPort06,BWPort4,BWPort237]> {
let NumMicroOps = 3;
}
def : WriteRes<WriteLAHFSAHF, [BWPort06]>;
def : WriteRes<WriteBitTest,[BWPort06]>; // Bit Test instrs
// Bit counts.
defm : BWWriteResPair<WriteBSF, [BWPort1], 3>;
@ -603,14 +604,6 @@ def BWWriteResGroup6 : SchedWriteRes<[BWPort06]> {
let ResourceCycles = [1];
}
def: InstRW<[BWWriteResGroup6], (instrs CDQ, CQO)>;
def: InstRW<[BWWriteResGroup6], (instregex "BT(16|32|64)ri8",
"BT(16|32|64)rr",
"BTC(16|32|64)ri8",
"BTC(16|32|64)rr",
"BTR(16|32|64)ri8",
"BTR(16|32|64)rr",
"BTS(16|32|64)ri8",
"BTS(16|32|64)rr")>;
def BWWriteResGroup7 : SchedWriteRes<[BWPort15]> {
let Latency = 1;

View File

@ -150,6 +150,7 @@ def : WriteRes<WriteSETCCStore, [HWPort06,HWPort4,HWPort237]> {
let NumMicroOps = 3;
}
def : WriteRes<WriteLAHFSAHF, [HWPort06]>;
def : WriteRes<WriteBitTest,[HWPort06]>;
// This is for simple LEAs with one or two input operands.
// The complex ones can only execute on port 1, and they require two cycles on
@ -895,14 +896,6 @@ def HWWriteResGroup7 : SchedWriteRes<[HWPort06]> {
let ResourceCycles = [1];
}
def: InstRW<[HWWriteResGroup7], (instrs CDQ, CQO)>;
def: InstRW<[HWWriteResGroup7], (instregex "BT(16|32|64)ri8",
"BT(16|32|64)rr",
"BTC(16|32|64)ri8",
"BTC(16|32|64)rr",
"BTR(16|32|64)ri8",
"BTR(16|32|64)rr",
"BTS(16|32|64)ri8",
"BTS(16|32|64)rr")>;
def HWWriteResGroup8 : SchedWriteRes<[HWPort15]> {
let Latency = 1;

View File

@ -145,6 +145,7 @@ def : WriteRes<WriteSETCCStore, [SBPort05,SBPort4,SBPort23]> {
let NumMicroOps = 3;
}
def : WriteRes<WriteLAHFSAHF, [SBPort05]>;
def : WriteRes<WriteBitTest,[SBPort05]>;
// This is for simple LEAs with one or two input operands.
// The complex ones can only execute on port 1, and they require two cycles on
@ -570,14 +571,6 @@ def SBWriteResGroup4 : SchedWriteRes<[SBPort05]> {
let ResourceCycles = [1];
}
def: InstRW<[SBWriteResGroup4], (instrs CDQ, CQO)>;
def: InstRW<[SBWriteResGroup4], (instregex "BT(16|32|64)ri8",
"BT(16|32|64)rr",
"BTC(16|32|64)ri8",
"BTC(16|32|64)rr",
"BTR(16|32|64)ri8",
"BTR(16|32|64)rr",
"BTS(16|32|64)ri8",
"BTS(16|32|64)rr")>;
def SBWriteResGroup5 : SchedWriteRes<[SBPort15]> {
let Latency = 1;

View File

@ -136,6 +136,7 @@ def : WriteRes<WriteSETCCStore, [SKLPort06,SKLPort4,SKLPort237]> {
let NumMicroOps = 3;
}
def : WriteRes<WriteLAHFSAHF, [SKLPort06]>;
def : WriteRes<WriteBitTest,[SKLPort06]>; //
// Bit counts.
defm : SKLWriteResPair<WriteBSF, [SKLPort1], 3>;
@ -605,14 +606,6 @@ def SKLWriteResGroup7 : SchedWriteRes<[SKLPort06]> {
let ResourceCycles = [1];
}
def: InstRW<[SKLWriteResGroup7], (instrs CDQ, CQO, CLAC, STAC)>;
def: InstRW<[SKLWriteResGroup7], (instregex "BT(16|32|64)ri8",
"BT(16|32|64)rr",
"BTC(16|32|64)ri8",
"BTC(16|32|64)rr",
"BTR(16|32|64)ri8",
"BTR(16|32|64)rr",
"BTS(16|32|64)ri8",
"BTS(16|32|64)rr")>;
def SKLWriteResGroup8 : SchedWriteRes<[SKLPort15]> {
let Latency = 1;

View File

@ -136,6 +136,7 @@ def : WriteRes<WriteSETCCStore, [SKXPort06,SKXPort4,SKXPort237]> {
let NumMicroOps = 3;
}
def : WriteRes<WriteLAHFSAHF, [SKXPort06]>;
def : WriteRes<WriteBitTest,[SKXPort06]>; //
// Integer shifts and rotates.
defm : SKXWriteResPair<WriteShift, [SKXPort06], 1>;
@ -618,14 +619,6 @@ def SKXWriteResGroup7 : SchedWriteRes<[SKXPort06]> {
let ResourceCycles = [1];
}
def: InstRW<[SKXWriteResGroup7], (instrs CDQ, CQO, CLAC, STAC)>;
def: InstRW<[SKXWriteResGroup7], (instregex "BT(16|32|64)ri8",
"BT(16|32|64)rr",
"BTC(16|32|64)ri8",
"BTC(16|32|64)rr",
"BTR(16|32|64)ri8",
"BTR(16|32|64)rr",
"BTS(16|32|64)ri8",
"BTS(16|32|64)rr")>;
def SKXWriteResGroup8 : SchedWriteRes<[SKXPort15]> {
let Latency = 1;

View File

@ -142,6 +142,7 @@ def WriteFCMOV : SchedWrite; // X87 conditional move.
def WriteSETCC : SchedWrite; // Set register based on condition code.
def WriteSETCCStore : SchedWrite;
def WriteLAHFSAHF : SchedWrite; // Load/Store flags in AH.
def WriteBitTest : SchedWrite; // Bit Test - TODO add memory folding support
// Integer shifts and rotates.
defm WriteShift : X86SchedWritePair;

View File

@ -108,6 +108,7 @@ def : WriteRes<WriteLAHFSAHF, [AtomPort01]> {
let Latency = 2;
let ResourceCycles = [2];
}
def : WriteRes<WriteBitTest,[AtomPort01]>;
defm : X86WriteResUnsupported<WriteIMulH>;

View File

@ -188,6 +188,7 @@ defm : X86WriteRes<WriteFCMOV, [JFPU0, JFPA], 3, [1,1], 1>; // x87 conditional m
def : WriteRes<WriteSETCC, [JALU01]>; // Setcc.
def : WriteRes<WriteSETCCStore, [JALU01,JSAGU]>;
def : WriteRes<WriteLAHFSAHF, [JALU01]>;
def : WriteRes<WriteBitTest,[JALU01]>;
// This is for simple LEAs with one or two input operands.
def : WriteRes<WriteLEA, [JALU01]>;

View File

@ -120,6 +120,7 @@ def : WriteRes<WriteSETCCStore, [SLM_IEC_RSV01, SLM_MEC_RSV]> {
let ResourceCycles = [2,1];
}
def : WriteRes<WriteLAHFSAHF, [SLM_IEC_RSV01]>;
def : WriteRes<WriteBitTest,[SLM_IEC_RSV01]>;
// This is for simple LEAs with one or two input operands.
// The complex ones can only execute on port 1, and they require two cycles on

View File

@ -198,6 +198,7 @@ defm : ZnWriteResPair<WriteCMOV2, [ZnALU], 1>;
def : WriteRes<WriteSETCC, [ZnALU]>;
def : WriteRes<WriteSETCCStore, [ZnALU, ZnAGU]>;
defm : X86WriteRes<WriteLAHFSAHF, [ZnALU], 2, [1], 2>;
def : WriteRes<WriteBitTest,[ZnALU]>;
// Bit counts.
defm : ZnWriteResPair<WriteBSF, [ZnALU], 3>;

View File

@ -21,6 +21,7 @@
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/Regex.h"
#include "llvm/Support/raw_ostream.h"
@ -33,6 +34,16 @@ using namespace llvm;
#define DEBUG_TYPE "subtarget-emitter"
#ifdef EXPENSIVE_CHECKS
// FIXME: TableGen is failed iff EXPENSIVE_CHECKS defined
static constexpr bool OptCheckSchedClasses = true;
#else
// FIXME: the default value should be false
static cl::opt<bool> OptCheckSchedClasses(
"check-sched-class-table", cl::init(true), cl::Hidden,
cl::desc("Check sched class table on different types of inconsistencies"));
#endif
#ifndef NDEBUG
static void dumpIdxVec(ArrayRef<unsigned> V) {
for (unsigned Idx : V)
@ -223,6 +234,7 @@ CodeGenSchedModels::CodeGenSchedModels(RecordKeeper &RK,
collectOptionalProcessorInfo();
checkCompleteness();
checkSchedClasses();
}
void CodeGenSchedModels::collectRetireControlUnits() {
@ -699,6 +711,86 @@ void CodeGenSchedModels::collectSchedClasses() {
}
}
void CodeGenSchedModels::checkSchedClasses() {
if (!OptCheckSchedClasses)
return;
std::string str;
raw_string_ostream OS(str);
// Check each instruction for each model to see if its overridden too often.
// Iff YES it's a candidate for more fine-grained Sched Class.
for (const CodeGenInstruction *Inst : Target.getInstructionsByEnumValue()) {
StringRef InstName = Inst->TheDef->getName();
unsigned SCIdx = getSchedClassIdx(*Inst);
if (!SCIdx)
continue;
CodeGenSchedClass &SC = getSchedClass(SCIdx);
if (SC.Writes.empty())
continue;
const RecVec &RWDefs = SchedClasses[SCIdx].InstRWs;
if (RWDefs.empty())
continue;
// FIXME: what should be threshold here?
if (RWDefs.size() > (ProcModels.size() / 2)) {
// FIXME: this dump hangs the execution !!!
// SC.dump(&Target.getSchedModels());
OS << "SchedRW machine model for inst '" << InstName << "' (";
for (auto I : SC.Writes)
OS << " " << SchedWrites[I].Name;
for (auto I : SC.Reads)
OS << " " << SchedReads[I].Name;
OS << " ) should be updated /improvedbecause it's overriden " << RWDefs.size()
<< " times out of " << ProcModels.size() << " models:\n\t";
for (Record *RWDef : RWDefs)
OS << " " << getProcModel(RWDef->getValueAsDef("SchedModel")).ModelName;
PrintWarning(OS.str());
str.clear();
}
// TODO: here we should check latency/uop in SC vs. RWDef. Maybe we
// should do it iff RWDefs.size() == 1 only.
// Iff latency/uop are the same then warn about unnecessary redefine.
if (RWDefs.size()) {
for (Record *RWDef : RWDefs) {
IdxVec Writes;
IdxVec Reads;
findRWs(RWDef->getValueAsListOfDefs("OperandReadWrites"), Writes,
Reads);
if ((Writes.size() == SC.Writes.size()) &&
(Reads.size() == SC.Reads.size())) {
// TODO: do we need sorting Write & Reads?
for (unsigned I = 0, S = SC.Writes.size(); I < S; I++) {
auto SCSchedW = SchedWrites[SC.Writes[I]];
auto SchedW = SchedWrites[Writes[I]];
if (!SCSchedW.TheDef || !SchedW.TheDef)
continue;
const RecordVal *R = SCSchedW.TheDef->getValue("Latency");
// FIXME: We should deal with default Latency here
if (!R || !R->getValue())
continue;
auto SCLat = SCSchedW.TheDef->getValueAsInt("Latency");
auto SCuOp = SCSchedW.TheDef->getValueAsInt("NumMicroOps");
auto Lat = SchedW.TheDef->getValueAsInt("Latency");
auto uOp = SchedW.TheDef->getValueAsInt("NumMicroOps");
if ((SCLat == Lat) && (SCuOp == uOp))
OS << "Overridden verion of inst '" << InstName
<< "' has the same latency & uOp values as the original one "
"for model '"
<< getProcModel(RWDef->getValueAsDef("SchedModel")).ModelName
<< "'\n";
}
if (!str.empty()) {
PrintWarning(OS.str());
str.clear();
}
}
}
}
}
}
// Get the SchedClass index for an instruction.
unsigned
CodeGenSchedModels::getSchedClassIdx(const CodeGenInstruction &Inst) const {

View File

@ -443,6 +443,8 @@ private:
void collectSchedClasses();
void checkSchedClasses();
void collectRetireControlUnits();
void collectRegisterFiles();