IR,OpcodeDispatcher,JIT: fuse adcs flags

The usual tricks, also requires introducing a bare adc op to optimize adcs to,
but we wanted that anyway!

Also support a zero source, so we can calculate "foo + CF" in one instruction to
optimize the "lock adc" cases.

Signed-off-by: Alyssa Rosenzweig <alyssa@rosenzweig.io>
This commit is contained in:
Alyssa Rosenzweig 2024-02-25 10:17:22 -04:00
parent 0ef72bf118
commit 6994fc3a01
7 changed files with 81 additions and 39 deletions

View File

@ -143,6 +143,26 @@ DEF_OP(AdcNZCV) {
adcs(EmitSize, ARMEmitter::Reg::zr, GetReg(Op->Src1.ID()), GetReg(Op->Src2.ID()));
}
// Lowers the AdcWithFlags IR op to a single flag-setting add-with-carry (`adcs`).
// The sum is written to the register allocated for this node.
DEF_OP(AdcWithFlags) {
  auto Op = IROp->C<IR::IROp_AdcWithFlags>();
  const auto OpSize = IROp->Size;

  // Only 32-bit and 64-bit operations are accepted here.
  LOGMAN_THROW_AA_FMT(OpSize == IR::i32Bit || OpSize == IR::i64Bit, "Unsupported {} size: {}", __func__, OpSize);

  // Default to the 32-bit form and widen only for 64-bit operations.
  auto Width = ARMEmitter::Size::i32Bit;
  if (OpSize == IR::i64Bit) {
    Width = ARMEmitter::Size::i64Bit;
  }

  const auto Dst = GetReg(Node);
  // NOTE(review): Src1 is fetched with GetZeroableReg rather than GetReg,
  // presumably so an inlined constant zero maps to the zero register — confirm
  // against GetZeroableReg.
  const auto Lhs = GetZeroableReg(Op->Src1);
  const auto Rhs = GetReg(Op->Src2.ID());

  adcs(Width, Dst, Lhs, Rhs);
}
// Lowers the Adc IR op to a single add-with-carry (`adc`), the counterpart of
// AdcWithFlags that uses the non-flag-setting instruction.
// The sum is written to the register allocated for this node.
DEF_OP(Adc) {
  auto Op = IROp->C<IR::IROp_Adc>();
  const auto OpSize = IROp->Size;

  // Only 32-bit and 64-bit operations are accepted here.
  LOGMAN_THROW_AA_FMT(OpSize == IR::i32Bit || OpSize == IR::i64Bit, "Unsupported {} size: {}", __func__, OpSize);

  // Default to the 32-bit form and widen only for 64-bit operations.
  auto Width = ARMEmitter::Size::i32Bit;
  if (OpSize == IR::i64Bit) {
    Width = ARMEmitter::Size::i64Bit;
  }

  const auto Dst = GetReg(Node);
  // NOTE(review): Src1 is fetched with GetZeroableReg rather than GetReg,
  // presumably so an inlined constant zero maps to the zero register — confirm
  // against GetZeroableReg.
  const auto Lhs = GetZeroableReg(Op->Src1);
  const auto Rhs = GetReg(Op->Src2.ID());

  adc(Width, Dst, Lhs, Rhs);
}
DEF_OP(SbbNZCV) {
auto Op = IROp->C<IR::IROp_SbbNZCV>();
const auto OpSize = IROp->Size;

View File

@ -391,27 +391,22 @@ void OpDispatchBuilder::ADCOp(OpcodeArgs) {
uint8_t Size = GetDstSize(Op);
const auto OpSize = IR::SizeToOpSize(std::max<uint8_t>(4u, Size));
auto CF = GetRFLAG(FEXCore::X86State::RFLAG_CF_RAW_LOC);
auto ALUOp = _Add(OpSize, Src, CF);
OrderedNode *Result{};
OrderedNode *Before{};
if (DestIsLockedMem(Op)) {
auto ALUOp = _Adc(OpSize, _Constant(0), Src);
HandledLock = true;
OrderedNode *DestMem = LoadSource(GPRClass, Op, Op->Dest, Op->Flags, {.LoadData = false});
DestMem = AppendSegmentOffset(DestMem, Op->Flags);
Before = _AtomicFetchAdd(IR::SizeToOpSize(Size), ALUOp, DestMem);
Result = _Add(OpSize, Before, ALUOp);
}
else {
Before = LoadSource(GPRClass, Op, Op->Dest, Op->Flags);
Result = _Add(OpSize, Before, ALUOp);
StoreResult(GPRClass, Op, Result, -1);
}
if (Size < 4)
Result = _Bfe(IR::SizeToOpSize(std::max<uint8_t>(4u, Size)), Size * 8, 0, Result);
GenerateFlags_ADC(Op, Result, Before, Src, CF);
OrderedNode *Result = CalculateFlags_ADC(Size, Before, Src);
if (!DestIsLockedMem(Op))
StoreResult(GPRClass, Op, Result, -1);
}
template<uint32_t SrcIndex, bool SetFlags>

View File

@ -78,7 +78,6 @@ friend class FEXCore::IR::PassManager;
public:
enum class FlagsGenerationType : uint8_t {
TYPE_NONE,
TYPE_ADC,
TYPE_SBB,
TYPE_SUB,
TYPE_MUL,
@ -1646,7 +1645,7 @@ private:
OrderedNode *Src2;
} TwoSource;
// ADC, SBB
// SBB
struct {
OrderedNode *Src1;
OrderedNode *Src2;
@ -1746,7 +1745,7 @@ private:
void CalculateAF(OrderedNode *Src1, OrderedNode *Src2);
void CalculateOF(uint8_t SrcSize, OrderedNode *Res, OrderedNode *Src1, OrderedNode *Src2, bool Sub);
void CalculateFlags_ADC(uint8_t SrcSize, OrderedNode *Res, OrderedNode *Src1, OrderedNode *Src2, OrderedNode *CF);
OrderedNode *CalculateFlags_ADC(uint8_t SrcSize, OrderedNode *Src1, OrderedNode *Src2);
void CalculateFlags_SBB(uint8_t SrcSize, OrderedNode *Res, OrderedNode *Src1, OrderedNode *Src2, OrderedNode *CF);
OrderedNode *CalculateFlags_SUB(uint8_t SrcSize, OrderedNode *Src1, OrderedNode *Src2, bool UpdateCF = true);
OrderedNode *CalculateFlags_ADD(uint8_t SrcSize, OrderedNode *Src1, OrderedNode *Src2, bool UpdateCF = true);
@ -1780,21 +1779,6 @@ private:
*
* Depending on the operation it may force a RFLAGs calculation before storing the new deferred state.
* @{ */
// Records a deferred ADC flag calculation instead of computing flags immediately.
//
// Overwrites CurrentDeferredFlags with a TYPE_ADC entry that captures:
//   Op   - decoded instruction; only used to obtain the source size via GetSrcSize(Op)
//   Res  - the already-computed result node
//   Src1 - first addend
//   Src2 - second addend
//   CF   - carry-in node, stored in the third source slot
//
// CalculateDeferredFlags() dispatches on the stored Type and, for TYPE_ADC,
// forwards these fields to CalculateFlags_ADC.
void GenerateFlags_ADC(FEXCore::X86Tables::DecodedOp Op, OrderedNode *Res, OrderedNode *Src1, OrderedNode *Src2, OrderedNode *CF) {
CurrentDeferredFlags = DeferredFlagData {
.Type = FlagsGenerationType::TYPE_ADC,
.SrcSize = GetSrcSize(Op),
.Res = Res,
.Sources = {
// ADC needs three inputs (two addends plus carry-in), so the three-source variant is used.
.ThreeSource = {
.Src1 = Src1,
.Src2 = Src2,
.Src3 = CF,
},
},
};
}
void GenerateFlags_SBB(FEXCore::X86Tables::DecodedOp Op, OrderedNode *Res, OrderedNode *Src1, OrderedNode *Src2, OrderedNode *CF) {
CurrentDeferredFlags = DeferredFlagData {
.Type = FlagsGenerationType::TYPE_SBB,

View File

@ -310,14 +310,6 @@ void OpDispatchBuilder::CalculateDeferredFlags(uint32_t FlagsToCalculateMask) {
}
switch (CurrentDeferredFlags.Type) {
case FlagsGenerationType::TYPE_ADC:
CalculateFlags_ADC(
CurrentDeferredFlags.SrcSize,
CurrentDeferredFlags.Res,
CurrentDeferredFlags.Sources.ThreeSource.Src1,
CurrentDeferredFlags.Sources.ThreeSource.Src2,
CurrentDeferredFlags.Sources.ThreeSource.Src3);
break;
case FlagsGenerationType::TYPE_SBB:
CalculateFlags_SBB(
CurrentDeferredFlags.SrcSize,
@ -477,18 +469,22 @@ void OpDispatchBuilder::CalculateDeferredFlags(uint32_t FlagsToCalculateMask) {
NZCVDirty = false;
}
void OpDispatchBuilder::CalculateFlags_ADC(uint8_t SrcSize, OrderedNode *Res, OrderedNode *Src1, OrderedNode *Src2, OrderedNode *CF) {
OrderedNode *OpDispatchBuilder::CalculateFlags_ADC(uint8_t SrcSize, OrderedNode *Src1, OrderedNode *Src2) {
auto Zero = _Constant(0);
auto One = _Constant(1);
auto OpSize = SrcSize == 8 ? OpSize::i64Bit : OpSize::i32Bit;
OrderedNode *Res;
CalculateAF(Src1, Src2);
CalculatePF(Res);
if (SrcSize >= 4) {
HandleNZCV_RMW();
_AdcNZCV(OpSize, Src1, Src2);
Res = _AdcWithFlags(OpSize, Src1, Src2);
} else {
auto CF = GetRFLAG(FEXCore::X86State::RFLAG_CF_RAW_LOC);
Res = _Add(OpSize, _Add(OpSize, Src1, Src2), CF);
Res = _Bfe(OpSize, SrcSize * 8, 0, Res);
// SF/ZF
SetNZ_ZeroCV(SrcSize, Res);
@ -504,6 +500,9 @@ void OpDispatchBuilder::CalculateFlags_ADC(uint8_t SrcSize, OrderedNode *Res, Or
// Signed
CalculateOF(SrcSize, Res, Src1, Src2, false);
}
CalculatePF(Res);
return Res;
}
void OpDispatchBuilder::CalculateFlags_SBB(uint8_t SrcSize, OrderedNode *Res, OrderedNode *Src1, OrderedNode *Src2, OrderedNode *CF) {

View File

@ -951,6 +951,15 @@
"Size == FEXCore::IR::OpSize::i32Bit || Size == FEXCore::IR::OpSize::i64Bit"
]
},
"GPR = Adc OpSize:#Size, GPR:$Src1, GPR:$Src2": {
"Desc": [ "Integer Add with carry",
"Will truncate to 64 or 32bits"
],
"DestSize": "Size",
"EmitValidation": [
"Size == FEXCore::IR::OpSize::i32Bit || Size == FEXCore::IR::OpSize::i64Bit"
]
},
"GPR = AddShift OpSize:#Size, GPR:$Src1, GPR:$Src2, ShiftType:$Shift{ShiftType::LSL}, u8:$ShiftAmount{0}": {
"Desc": [ "Integer Add with shifted register",
"Will truncate to 64 or 32bits"
@ -994,6 +1003,14 @@
"Size == FEXCore::IR::OpSize::i32Bit || Size == FEXCore::IR::OpSize::i64Bit"
]
},
"GPR = AdcWithFlags OpSize:#Size, GPR:$Src1, GPR:$Src2": {
"Desc": ["Adds and set NZCV for the sum of two GPRs and carry-in given as NZCV"],
"HasSideEffects": true,
"DestSize": "Size",
"EmitValidation": [
"Size == FEXCore::IR::OpSize::i32Bit || Size == FEXCore::IR::OpSize::i64Bit"
]
},
"AdcNZCV OpSize:#Size, GPR:$Src1, GPR:$Src2": {
"Desc": ["Set NZCV for the sum of two GPRs and carry-in given as NZCV"],
"HasSideEffects": true,

View File

@ -995,6 +995,22 @@ bool ConstProp::ConstantInlining(IREmitter *IREmit, const IRListView& CurrentIR)
break;
}
case OP_ADC:
case OP_ADCWITHFLAGS:
{
auto Op = IROp->C<IR::IROp_Adc>();
uint64_t Constant1{};
if (IREmit->IsValueConstant(Op->Header.Args[0], &Constant1)) {
if (Constant1 == 0) {
IREmit->SetWriteCursor(CurrentIR.GetNode(Op->Header.Args[0]));
IREmit->ReplaceNodeArgument(CodeNode, 0, CreateInlineConstant(IREmit, 0));
Changed = true;
}
}
break;
}
case OP_CONDADDNZCV:
{
auto Op = IROp->C<IR::IROp_CondAddNZCV>();

View File

@ -142,6 +142,14 @@ DeadFlagCalculationEliminination::Classify(IROp_Header *IROp)
.Replacement = OP_SUB,
};
case OP_ADCWITHFLAGS:
return {
.Read = FLAG_C,
.Write = FLAG_NZCV,
.CanReplace = true,
.Replacement = OP_ADC,
};
case OP_ADDNZCV:
case OP_SUBNZCV:
case OP_TESTNZ:
@ -170,6 +178,9 @@ DeadFlagCalculationEliminination::Classify(IROp_Header *IROp)
case OP_LOADNZCV:
return {.Read = FLAG_NZCV};
case OP_ADC:
return {.Read = FLAG_C};
case OP_ADCNZCV:
case OP_SBBNZCV:
return {