ConstProp: drop address fusion

Signed-off-by: Alyssa Rosenzweig <alyssa@rosenzweig.io>
Alyssa Rosenzweig 2024-05-31 10:45:27 -04:00
parent 8b5ca303e3
commit 85a69be5b6
3 changed files with 6 additions and 241 deletions

@@ -79,7 +79,7 @@ void PassManager::AddDefaultPasses(FEXCore::Context::ContextImpl* ctx, bool Inli
}
InsertPass(CreateDeadStoreElimination());
InsertPass(CreateConstProp(InlineConstants, ctx->HostFeatures.SupportsTSOImm9, Is64BitMode()));
InsertPass(CreateConstProp(InlineConstants, ctx->HostFeatures.SupportsTSOImm9));
InsertPass(CreateInlineCallOptimization(&ctx->CPUID));
InsertPass(CreateDeadFlagCalculationEliminination());
}

@@ -16,7 +16,7 @@ class Pass;
class RegisterAllocationPass;
class RegisterAllocationData;
fextl::unique_ptr<FEXCore::IR::Pass> CreateConstProp(bool InlineConstants, bool SupportsTSOImm9, bool Is64BitMode);
fextl::unique_ptr<FEXCore::IR::Pass> CreateConstProp(bool InlineConstants, bool SupportsTSOImm9);
fextl::unique_ptr<FEXCore::IR::Pass> CreateContextLoadStoreElimination(bool SupportsAVX);
fextl::unique_ptr<FEXCore::IR::Pass> CreateInlineCallOptimization(const FEXCore::CPUIDEmu* CPUID);
fextl::unique_ptr<FEXCore::IR::Pass> CreateDeadFlagCalculationEliminination();

@@ -59,9 +59,6 @@ static bool IsImmLogical(uint64_t imm, unsigned width) {
static bool IsImmAddSub(uint64_t imm) {
return vixl::aarch64::Assembler::IsImmAddSub(imm);
}
static bool IsMemoryScale(uint64_t Scale, uint8_t AccessSize) {
return Scale == AccessSize;
}
static bool IsSIMM9Range(uint64_t imm) {
// AArch64 signed immediate unscaled 9-bit range.
@@ -89,137 +86,6 @@ static bool IsTSOImm9(uint64_t imm) {
}
}
struct MemExtendedAddrResult {
MemOffsetType OffsetType;
uint8_t OffsetScale;
OrderedNode* Base;
OrderedNode* OffsetReg;
};
static inline std::optional<MemExtendedAddrResult> TryAddShiftScale(IREmitter* IREmit, uint8_t AccessSize, IROp_Header* AddressHeader) {
auto AddShift = AddressHeader->C<IROp_AddShift>();
if (AddShift->Shift == IR::ShiftType::LSL) {
auto Scale = 1U << AddShift->ShiftAmount;
if (IsMemoryScale(Scale, AccessSize)) {
// remove shift as it can be folded to the mem op
return MemExtendedAddrResult {MEM_OFFSET_SXTX, (uint8_t)Scale, IREmit->UnwrapNode(AddShift->Src1), IREmit->UnwrapNode(AddShift->Src2)};
} else if (Scale == 1) {
return MemExtendedAddrResult {MEM_OFFSET_SXTX, 1, IREmit->UnwrapNode(AddShift->Src1), IREmit->UnwrapNode(AddShift->Src2)};
}
}
return std::nullopt;
}
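
As an aside (illustrative only, not part of this diff): the rule TryAddShiftScale applied reduces to a small standalone check. In the sketch below the helper name CanFoldIndexShift is made up, but the condition mirrors IsMemoryScale above: a left-shifted index can be absorbed into an AArch64 load/store addressing mode only when 1 << ShiftAmount equals the access size in bytes, or the shift is a no-op.

#include <cstdint>
#include <cstdio>

// Hypothetical standalone version of the scale check used by the removed fusion code.
static bool CanFoldIndexShift(uint8_t ShiftAmount, uint8_t AccessSize) {
  const uint64_t Scale = 1ULL << ShiftAmount;
  return Scale == AccessSize || Scale == 1;
}

int main() {
  // 4-byte access with index << 2: foldable, roughly `ldr w0, [x1, x2, lsl #2]`.
  printf("%d\n", CanFoldIndexShift(2, 4)); // prints 1
  // 4-byte access with index << 3: scale 8 mismatches the access size, keep the shift.
  printf("%d\n", CanFoldIndexShift(3, 4)); // prints 0
  return 0;
}
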
// If this optimization doesn't succeed, it returns std::nullopt.
static std::optional<MemExtendedAddrResult> MemExtendedAddressing(IREmitter* IREmit, uint8_t AccessSize, IROp_Header* AddressHeader) {
// Try to optimize: AddShift Base, LSHL(Offset, Scale)
if (AddressHeader->Op == OP_ADDSHIFT) {
return TryAddShiftScale(IREmit, AccessSize, AddressHeader);
}
LOGMAN_THROW_A_FMT(AddressHeader->Op == OP_ADD, "Invalid address Op");
auto Src0Header = IREmit->GetOpHeader(AddressHeader->Args[0]);
if (Src0Header->Size == 8) {
// Try to optimize: Base + MUL(Offset, Scale)
if (Src0Header->Op == OP_MUL) {
uint64_t Scale;
if (IREmit->IsValueConstant(Src0Header->Args[1], &Scale)) {
if (IsMemoryScale(Scale, AccessSize)) {
// remove mul as it can be folded to the mem op
return MemExtendedAddrResult {MEM_OFFSET_SXTX, (uint8_t)Scale, IREmit->UnwrapNode(AddressHeader->Args[1]),
IREmit->UnwrapNode(Src0Header->Args[0])};
} else if (Scale == 1) {
// remove nop mul
return MemExtendedAddrResult {MEM_OFFSET_SXTX, 1, IREmit->UnwrapNode(AddressHeader->Args[1]), IREmit->UnwrapNode(Src0Header->Args[0])};
}
}
}
// Try to optimize: Base + LSHL(Offset, Scale)
else if (Src0Header->Op == OP_LSHL) {
uint64_t Constant2;
if (IREmit->IsValueConstant(Src0Header->Args[1], &Constant2)) {
uint8_t Scale = 1 << Constant2;
if (IsMemoryScale(Scale, AccessSize)) {
// remove shift as it can be folded to the mem op
return MemExtendedAddrResult {MEM_OFFSET_SXTX, Scale, IREmit->UnwrapNode(AddressHeader->Args[1]),
IREmit->UnwrapNode(Src0Header->Args[0])};
} else if (Scale == 1) {
// remove nop shift
return MemExtendedAddrResult {MEM_OFFSET_SXTX, 1, IREmit->UnwrapNode(AddressHeader->Args[1]), IREmit->UnwrapNode(Src0Header->Args[0])};
}
}
}
// Try to optimize: Base + (u32)Offset
else if (Src0Header->Op == OP_BFE) {
auto Bfe = Src0Header->C<IROp_Bfe>();
if (Bfe->lsb == 0 && Bfe->Width == 32) {
// todo: arm can also scale here
return MemExtendedAddrResult {MEM_OFFSET_UXTW, 1, IREmit->UnwrapNode(AddressHeader->Args[1]), IREmit->UnwrapNode(Src0Header->Args[0])};
}
}
// Try to optimize: Base + (s32)Offset
else if (Src0Header->Op == OP_SBFE) {
auto Sbfe = Src0Header->C<IROp_Sbfe>();
if (Sbfe->lsb == 0 && Sbfe->Width == 32) {
// todo: arm can also scale here
return MemExtendedAddrResult {MEM_OFFSET_SXTW, 1, IREmit->UnwrapNode(AddressHeader->Args[1]), IREmit->UnwrapNode(Src0Header->Args[0])};
}
}
}
// No match anywhere, just add.
// However, if we have a 32-bit negative constant, we need to sign-extend it.
auto Arg0_ = AddressHeader->Args[0];
auto Arg1_ = AddressHeader->Args[1];
auto Arg1H = IREmit->GetOpHeader(Arg1_);
auto Arg0 = IREmit->UnwrapNode(Arg0_);
auto Arg1 = IREmit->UnwrapNode(Arg1_);
uint64_t ConstVal = 0;
// Only optimize the 32-bit reg+const case where the constant is within 16 KiB.
if (Arg1H->Size == 4 && IREmit->IsValueConstant(Arg1_, &ConstVal)) {
// Base is Arg0, Constant (Displacement in Arg1)
OrderedNode* Base = Arg0;
OrderedNode* Cnt = Arg1;
int32_t Val32 = (int32_t)ConstVal;
if (Val32 > -16384 && Val32 < 0) {
return MemExtendedAddrResult {MEM_OFFSET_SXTW, 1, Base, Cnt};
} else if (Val32 >= 0 && Val32 < 16384) {
return MemExtendedAddrResult {MEM_OFFSET_SXTX, 1, Base, Cnt};
}
} else if (AddressHeader->Size == 4) {
// Do not optimize the 32-bit reg+reg case.
// Something like:
// add w20, w7, w5
// ldr w7, [x20]
//
// cannot be simplified to (or any other single load instruction)
// ldr w7, [x5, w7, sxtx]
return std::nullopt;
} else {
return MemExtendedAddrResult {MEM_OFFSET_SXTX, 1, Arg0, Arg1};
}
return std::nullopt;
}
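
A worked example of the displacement handling above, as an illustrative standalone sketch (the 16 KiB window and the int32_t cast come from the code just shown; nothing here is FEX API): a 32-bit constant such as 0xFFFFFFF0 only folds correctly if it is re-read as the signed value -16 and extended with SXTW; treating it as an unsigned 64-bit offset would add roughly 4 GiB to the base.

#include <cstdint>
#include <cstdio>

int main() {
  const uint64_t ConstVal = 0xFFFFFFF0ull; // 32-bit displacement as stored in the IR constant

  // Zero-extended, this would act as a ~4 GiB positive offset.
  printf("zero-extended: %llu\n", (unsigned long long)ConstVal);

  // The removed code instead looked at the signed 32-bit value...
  const int32_t Val32 = (int32_t)ConstVal;
  printf("as int32_t: %d\n", Val32); // -16

  // ...and only folded small displacements: negative via SXTW, non-negative via SXTX.
  const bool FoldAsSXTW = Val32 > -16384 && Val32 < 0;
  const bool FoldAsSXTX = Val32 >= 0 && Val32 < 16384;
  printf("fold as SXTW: %d, fold as SXTX: %d\n", FoldAsSXTW, FoldAsSXTX); // 1, 0
  return 0;
}
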
static std::optional<MemExtendedAddrResult> MemVectorAtomicExtendedAddressing(IREmitter* IREmit, uint8_t AccessSize, IROp_Header* AddressHeader) {
// Atomic TSO emulation of vectors uses half-barriers, so it gets the full addressing support of vector load-stores.
// Addressing capabilities
// - LDR, [Reg, Reg, LSL <Size>]
// - LDR, [Reg], imm12 Scaled <Size> ///< TODO: Implement this
// - LDUR, [Reg], imm9 (Signed [-256,256)) ///< TODO: Implement this
// TODO: Implement support for FEAT_LRCPC3.
// - LDAPUR [reg], imm9 (Signed [-256,256))
// Try to optimize: AddShift Base, LSHL(Offset, Scale)
if (AddressHeader->Op == OP_ADDSHIFT) {
return TryAddShiftScale(IREmit, AccessSize, AddressHeader);
}
return std::nullopt;
}
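
For reference, the "imm9 (Signed [-256,256))" offsets listed above are the AArch64 unscaled signed 9-bit immediates used by LDUR/STUR-class (and LDAPUR-class) instructions; IsSIMM9Range earlier in this file checks the same window. A minimal illustrative check (the helper name is hypothetical):

#include <cstdint>
#include <cstdio>

// Hypothetical range check for an AArch64 unscaled signed 9-bit offset: [-256, 255].
static bool FitsInSImm9(int64_t imm) {
  return imm >= -256 && imm <= 255;
}

int main() {
  printf("%d %d %d\n", FitsInSImm9(-256), FitsInSImm9(255), FitsInSImm9(256)); // 1 1 0
  return 0;
}
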
static bool IsBfeAlreadyDone(IREmitter* IREmit, OrderedNodeWrapper src, uint64_t Width) {
auto IROp = IREmit->GetOpHeader(src);
if (IROp->Op == OP_BFE) {
@@ -233,10 +99,9 @@ static bool IsBfeAlreadyDone(IREmitter* IREmit, OrderedNodeWrapper src, uint64_t
class ConstProp final : public FEXCore::IR::Pass {
public:
explicit ConstProp(bool DoInlineConstants, bool SupportsTSOImm9, bool Is64BitMode)
explicit ConstProp(bool DoInlineConstants, bool SupportsTSOImm9)
: InlineConstants(DoInlineConstants)
, SupportsTSOImm9 {SupportsTSOImm9}
, Is64BitMode(Is64BitMode) {}
, SupportsTSOImm9 {SupportsTSOImm9} {}
void Run(IREmitter* IREmit) override;
@@ -265,7 +130,6 @@ private:
return Result.first->second;
}
bool SupportsTSOImm9 {};
bool Is64BitMode;
// This is a heuristic to limit constant pool live ranges to reduce RA interference pressure.
// If the range is unbounded then RA interference pressure seems to increase to the point
// that long blocks of constant usage can slow to a crawl.
@@ -339,105 +203,6 @@ doneOp:;
// constprop + some more per instruction logic
void ConstProp::ConstantPropagation(IREmitter* IREmit, const IRListView& CurrentIR, OrderedNode* CodeNode, IROp_Header* IROp) {
switch (IROp->Op) {
case OP_LOADMEMTSO: {
auto Op = IROp->CW<IR::IROp_LoadMemTSO>();
auto AddressHeader = IREmit->GetOpHeader(Op->Addr);
if (Op->Class == FEXCore::IR::FPRClass && AddressHeader->Size == 8) {
// TODO: LRCPC3 supports a vector unscaled offset like LRCPC2.
// Support once hardware is available to use this.
auto MaybeMemAddr = MemVectorAtomicExtendedAddressing(IREmit, IROp->Size, AddressHeader);
if (!MaybeMemAddr) {
break;
}
auto [OffsetType, OffsetScale, Base, OffsetReg] = *MaybeMemAddr;
Op->OffsetType = OffsetType;
Op->OffsetScale = OffsetScale;
IREmit->ReplaceNodeArgument(CodeNode, Op->Addr_Index, Base); // Addr
IREmit->ReplaceNodeArgument(CodeNode, Op->Offset_Index, OffsetReg); // Offset
}
break;
}
case OP_STOREMEMTSO: {
auto Op = IROp->CW<IR::IROp_StoreMemTSO>();
auto AddressHeader = IREmit->GetOpHeader(Op->Addr);
if (Op->Class == FEXCore::IR::FPRClass && AddressHeader->Size == 8) {
// TODO: LRCPC3 supports a vector unscaled offset like LRCPC2.
// Support once hardware is available to use this.
auto MaybeMemAddr = MemVectorAtomicExtendedAddressing(IREmit, IROp->Size, AddressHeader);
if (!MaybeMemAddr) {
break;
}
auto [OffsetType, OffsetScale, Base, OffsetReg] = *MaybeMemAddr;
Op->OffsetType = OffsetType;
Op->OffsetScale = OffsetScale;
IREmit->ReplaceNodeArgument(CodeNode, Op->Addr_Index, Base); // Addr
IREmit->ReplaceNodeArgument(CodeNode, Op->Offset_Index, OffsetReg); // Offset
}
break;
}
case OP_LOADMEM: {
auto Op = IROp->CW<IR::IROp_LoadMem>();
auto AddressHeader = IREmit->GetOpHeader(Op->Addr);
if (AddressHeader->Op == OP_ADD && ((Is64BitMode && AddressHeader->Size == 8) || (!Is64BitMode && AddressHeader->Size == 4))) {
auto MaybeMemAddr = MemExtendedAddressing(IREmit, IROp->Size, AddressHeader);
if (!MaybeMemAddr) {
break;
}
auto [OffsetType, OffsetScale, Base, OffsetReg] = *MaybeMemAddr;
Op->OffsetType = OffsetType;
Op->OffsetScale = OffsetScale;
IREmit->ReplaceNodeArgument(CodeNode, Op->Addr_Index, Base); // Addr
IREmit->ReplaceNodeArgument(CodeNode, Op->Offset_Index, OffsetReg); // Offset
}
break;
}
case OP_STOREMEM: {
auto Op = IROp->CW<IR::IROp_StoreMem>();
auto AddressHeader = IREmit->GetOpHeader(Op->Addr);
if (AddressHeader->Op == OP_ADD && ((Is64BitMode && AddressHeader->Size == 8) || (!Is64BitMode && AddressHeader->Size == 4))) {
auto MaybeMemAddr = MemExtendedAddressing(IREmit, IROp->Size, AddressHeader);
if (!MaybeMemAddr) {
break;
}
auto [OffsetType, OffsetScale, Base, OffsetReg] = *MaybeMemAddr;
Op->OffsetType = OffsetType;
Op->OffsetScale = OffsetScale;
IREmit->ReplaceNodeArgument(CodeNode, Op->Addr_Index, Base); // Addr
IREmit->ReplaceNodeArgument(CodeNode, Op->Offset_Index, OffsetReg); // Offset
}
break;
}
case OP_PREFETCH: {
auto Op = IROp->CW<IR::IROp_Prefetch>();
auto AddressHeader = IREmit->GetOpHeader(Op->Addr);
const bool SupportedOp = AddressHeader->Op == OP_ADD || AddressHeader->Op == OP_ADDSHIFT;
if (SupportedOp && ((Is64BitMode && AddressHeader->Size == 8) || (!Is64BitMode && AddressHeader->Size == 4))) {
auto MaybeMemAddr = MemExtendedAddressing(IREmit, IROp->Size, AddressHeader);
if (!MaybeMemAddr) {
break;
}
auto [OffsetType, OffsetScale, Base, OffsetReg] = *MaybeMemAddr;
Op->OffsetType = OffsetType;
Op->OffsetScale = OffsetScale;
IREmit->ReplaceNodeArgument(CodeNode, Op->Addr_Index, Base); // Addr
IREmit->ReplaceNodeArgument(CodeNode, Op->Offset_Index, OffsetReg); // Offset
}
break;
}
case OP_ADD:
case OP_SUB:
case OP_ADDWITHFLAGS:
@@ -1041,7 +806,7 @@ void ConstProp::Run(IREmitter* IREmit) {
}
}
fextl::unique_ptr<FEXCore::IR::Pass> CreateConstProp(bool InlineConstants, bool SupportsTSOImm9, bool Is64BitMode) {
return fextl::make_unique<ConstProp>(InlineConstants, SupportsTSOImm9, Is64BitMode);
fextl::unique_ptr<FEXCore::IR::Pass> CreateConstProp(bool InlineConstants, bool SupportsTSOImm9) {
return fextl::make_unique<ConstProp>(InlineConstants, SupportsTSOImm9);
}
} // namespace FEXCore::IR
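
To summarize the effect of the memory-op handlers removed above (OP_LOADMEM, OP_STOREMEM, OP_LOADMEMTSO, OP_STOREMEMTSO, OP_PREFETCH): they set OffsetType/OffsetScale on the memory op and rewrote its Addr/Offset arguments so that an address computed as base plus a scaled index could be emitted as a single scaled-register load or store. The sketch below is purely illustrative (register numbers and the EmitLoad helper are made up, not FEX's emitter) and contrasts the fused shape the removed code enabled with the unfused add-then-load shape.

#include <cstdio>

// Illustrative only: print an AArch64-style sequence for a 32-bit load of
// base + (index << Shift), fused when the scale matches the access size.
// (The unshifted register-offset case is ignored for brevity.)
static void EmitLoad(int Base, int Index, unsigned Shift, unsigned AccessSize) {
  if ((1u << Shift) == AccessSize) {
    // What the removed fusion produced: one scaled-register load.
    printf("ldr w0, [x%d, x%d, lsl #%u]\n", Base, Index, Shift);
  } else {
    // Without fusion: materialize the address first, then load.
    printf("add x9, x%d, x%d, lsl #%u\n", Base, Index, Shift);
    printf("ldr w0, [x9]\n");
  }
}

int main() {
  EmitLoad(5, 6, 2, 4); // scale matches the access size: single instruction
  EmitLoad(5, 6, 3, 4); // scale mismatch: add + ldr
  return 0;
}
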