Merge pull request #3669 from Sonicadvance1/fix_addshift_operation

ConstProp fixes for Darwinia
Ryan Houdek 2024-05-29 19:43:13 -07:00 committed by GitHub
commit ab0a6bbe9f
5 changed files with 301 additions and 51 deletions

View File

@@ -1001,7 +1001,8 @@
},
"GPR = AddShift OpSize:#Size, GPR:$Src1, GPR:$Src2, ShiftType:$Shift{ShiftType::LSL}, u8:$ShiftAmount{0}": {
"Desc": [ "Integer Add with shifted register",
"Will truncate to 64 or 32bits"
"Will truncate to 64 or 32bits",
"Dest = Src1 + (Src2 << ShiftAmount)"
],
"DestSize": "Size",
"EmitValidation": [

View File

@@ -89,25 +89,32 @@ static bool IsTSOImm9(uint64_t imm) {
}
}
using MemExtendedAddrResult = std::tuple<MemOffsetType, uint8_t, OrderedNode*, OrderedNode*>;
struct MemExtendedAddrResult {
MemOffsetType OffsetType;
uint8_t OffsetScale;
OrderedNode* Base;
OrderedNode* OffsetReg;
};
static inline std::optional<MemExtendedAddrResult> TryAddShiftScale(IREmitter* IREmit, uint8_t AccessSize, IROp_Header* AddressHeader) {
auto AddShift = AddressHeader->C<IROp_AddShift>();
if (AddShift->Shift == IR::ShiftType::LSL) {
auto Scale = 1U << AddShift->ShiftAmount;
if (IsMemoryScale(Scale, AccessSize)) {
// remove shift as it can be folded to the mem op
return MemExtendedAddrResult {MEM_OFFSET_SXTX, (uint8_t)Scale, IREmit->UnwrapNode(AddShift->Src1), IREmit->UnwrapNode(AddShift->Src2)};
} else if (Scale == 1) {
return MemExtendedAddrResult {MEM_OFFSET_SXTX, 1, IREmit->UnwrapNode(AddShift->Src1), IREmit->UnwrapNode(AddShift->Src2)};
}
}
return std::nullopt;
}
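// Minimal standalone sketch (hypothetical Sketch* names, not FEX APIs) of the
// hazard the named struct above removes: the old std::tuple return carried two
// indistinguishable OrderedNode* slots, so transposing Base and OffsetReg
// compiled without complaint, while designated initializers keep the roles
// visible at the return site.
struct SketchNode {};
struct SketchResult {
  uint8_t OffsetScale;
  SketchNode* Base;
  SketchNode* OffsetReg;
};
static SketchResult SketchAddressing(SketchNode* Base, SketchNode* Offset) {
  return {.OffsetScale = 1, .Base = Base, .OffsetReg = Offset};
}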
// If this optimization doesn't succeed, it returns std::nullopt
static std::optional<MemExtendedAddrResult> MemExtendedAddressing(IREmitter* IREmit, uint8_t AccessSize, IROp_Header* AddressHeader) {
// Try to optimize: AddShift Base, LSHL(Offset, Scale)
if (AddressHeader->Op == OP_ADDSHIFT) {
auto AddShift = AddressHeader->C<IROp_AddShift>();
if (AddShift->Shift == IR::ShiftType::LSL) {
auto Scale = 1U << AddShift->ShiftAmount;
if (IsMemoryScale(Scale, AccessSize)) {
// remove shift as it can be folded to the mem op
return std::make_optional(
std::make_tuple(MEM_OFFSET_SXTX, (uint8_t)Scale, IREmit->UnwrapNode(AddShift->Src2), IREmit->UnwrapNode(AddShift->Src1)));
} else if (Scale == 1) {
return std::make_optional(std::make_tuple(MEM_OFFSET_SXTX, 1, IREmit->UnwrapNode(AddShift->Src2), IREmit->UnwrapNode(AddShift->Src1)));
}
}
return std::nullopt;
return TryAddShiftScale(IREmit, AccessSize, AddressHeader);
}
LOGMAN_THROW_A_FMT(AddressHeader->Op == OP_ADD, "Invalid address Op");
@@ -119,12 +126,11 @@ static std::optional<MemExtendedAddrResult> MemExtendedAddressing(IREmitter* IRE
if (IREmit->IsValueConstant(Src0Header->Args[1], &Scale)) {
if (IsMemoryScale(Scale, AccessSize)) {
// remove mul as it can be folded to the mem op
return std::make_optional(std::make_tuple(MEM_OFFSET_SXTX, (uint8_t)Scale, IREmit->UnwrapNode(AddressHeader->Args[1]),
IREmit->UnwrapNode(Src0Header->Args[0])));
return MemExtendedAddrResult {MEM_OFFSET_SXTX, (uint8_t)Scale, IREmit->UnwrapNode(AddressHeader->Args[1]),
IREmit->UnwrapNode(Src0Header->Args[0])};
} else if (Scale == 1) {
// remove nop mul
return std::make_optional(
std::make_tuple(MEM_OFFSET_SXTX, 1, IREmit->UnwrapNode(AddressHeader->Args[1]), IREmit->UnwrapNode(Src0Header->Args[0])));
return MemExtendedAddrResult {MEM_OFFSET_SXTX, 1, IREmit->UnwrapNode(AddressHeader->Args[1]), IREmit->UnwrapNode(Src0Header->Args[0])};
}
}
}
@@ -132,15 +138,14 @@ static std::optional<MemExtendedAddrResult> MemExtendedAddressing(IREmitter* IRE
else if (Src0Header->Op == OP_LSHL) {
uint64_t Constant2;
if (IREmit->IsValueConstant(Src0Header->Args[1], &Constant2)) {
uint64_t Scale = 1 << Constant2;
uint8_t Scale = 1 << Constant2;
if (IsMemoryScale(Scale, AccessSize)) {
// remove shift as it can be folded to the mem op
return std::make_optional(
std::make_tuple(MEM_OFFSET_SXTX, Scale, IREmit->UnwrapNode(AddressHeader->Args[1]), IREmit->UnwrapNode(Src0Header->Args[0])));
return MemExtendedAddrResult {MEM_OFFSET_SXTX, Scale, IREmit->UnwrapNode(AddressHeader->Args[1]),
IREmit->UnwrapNode(Src0Header->Args[0])};
} else if (Scale == 1) {
// remove nop shift
return std::make_optional(
std::make_tuple(MEM_OFFSET_SXTX, 1, IREmit->UnwrapNode(AddressHeader->Args[1]), IREmit->UnwrapNode(Src0Header->Args[0])));
return MemExtendedAddrResult {MEM_OFFSET_SXTX, 1, IREmit->UnwrapNode(AddressHeader->Args[1]), IREmit->UnwrapNode(Src0Header->Args[0])};
}
}
}
@@ -149,8 +154,7 @@ static std::optional<MemExtendedAddrResult> MemExtendedAddressing(IREmitter* IRE
auto Bfe = Src0Header->C<IROp_Bfe>();
if (Bfe->lsb == 0 && Bfe->Width == 32) {
// todo: arm can also scale here
return std::make_optional(
std::make_tuple(MEM_OFFSET_UXTW, 1, IREmit->UnwrapNode(AddressHeader->Args[1]), IREmit->UnwrapNode(Src0Header->Args[0])));
return MemExtendedAddrResult {MEM_OFFSET_UXTW, 1, IREmit->UnwrapNode(AddressHeader->Args[1]), IREmit->UnwrapNode(Src0Header->Args[0])};
}
}
// Try to optimize: Base + (s32)Offset
@@ -158,8 +162,7 @@ static std::optional<MemExtendedAddrResult> MemExtendedAddressing(IREmitter* IRE
auto Sbfe = Src0Header->C<IROp_Sbfe>();
if (Sbfe->lsb == 0 && Sbfe->Width == 32) {
// todo: arm can also scale here
return std::make_optional(
std::make_tuple(MEM_OFFSET_SXTW, 1, IREmit->UnwrapNode(AddressHeader->Args[1]), IREmit->UnwrapNode(Src0Header->Args[0])));
return MemExtendedAddrResult {MEM_OFFSET_SXTW, 1, IREmit->UnwrapNode(AddressHeader->Args[1]), IREmit->UnwrapNode(Src0Header->Args[0])};
}
}
}
@@ -181,9 +184,9 @@ static std::optional<MemExtendedAddrResult> MemExtendedAddressing(IREmitter* IRE
int32_t Val32 = (int32_t)ConstVal;
if (Val32 > -16384 && Val32 < 0) {
return std::make_optional(std::make_tuple(MEM_OFFSET_SXTW, 1, Base, Cnt));
return MemExtendedAddrResult {MEM_OFFSET_SXTW, 1, Base, Cnt};
} else if (Val32 >= 0 && Val32 < 16384) {
return std::make_optional(std::make_tuple(MEM_OFFSET_SXTX, 1, Base, Cnt));
return MemExtendedAddrResult {MEM_OFFSET_SXTX, 1, Base, Cnt};
}
} else if (AddressHeader->Size == 4) {
// Do not optimize 32bit reg+reg.
@@ -195,11 +198,28 @@ static std::optional<MemExtendedAddrResult> MemExtendedAddressing(IREmitter* IRE
// ldr w7, [x5, w7, sxtx]
return std::nullopt;
} else {
return std::make_optional(std::make_tuple(MEM_OFFSET_SXTX, 1, Arg0, Arg1));
return MemExtendedAddrResult {MEM_OFFSET_SXTX, 1, Arg0, Arg1};
}
return std::nullopt;
}
static std::optional<MemExtendedAddrResult> MemVectorAtomicExtendedAddressing(IREmitter* IREmit, uint8_t AccessSize, IROp_Header* AddressHeader) {
// Atomic TSO emulation of vectors uses half-barriers, so it gets the full addressing support of vector loadstores.
// Addressing capabilities:
// - LDR, [Reg, Reg, LSL <Size>]
// - LDR, [Reg], imm12 Scaled <Size> ///< TODO: Implement this
// - LDUR, [Reg], imm9 (Signed [-256,256)) ///< TODO: Implement this
// TODO: Implement support for FEAT_LRCPC3.
// - LDAPUR [reg], imm9 (Signed [-256,256))
// Try to optimize: AddShift Base, LSHL(Offset, Scale)
if (AddressHeader->Op == OP_ADDSHIFT) {
return TryAddShiftScale(IREmit, AccessSize, AddressHeader);
}
return std::nullopt;
}
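// Minimal sketch (assumed semantics of IsMemoryScale, hypothetical Sketch*
// name) of the scale rule the helpers above depend on: an ARM64
// register-offset load can only scale the index by the access size, so
// LSL #n folds into the load when (1 << n) == AccessSize, and a scale of 1
// is always representable as a plain extended register offset.
static bool SketchIsMemoryScale(uint64_t Scale, uint8_t AccessSize) {
  return Scale == AccessSize;
}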
static bool IsBfeAlreadyDone(IREmitter* IREmit, OrderedNodeWrapper src, uint64_t Width) {
auto IROp = IREmit->GetOpHeader(src);
if (IROp->Op == OP_BFE) {
@@ -323,18 +343,18 @@ void ConstProp::ConstantPropagation(IREmitter* IREmit, const IRListView& Current
auto Op = IROp->CW<IR::IROp_LoadMemTSO>();
auto AddressHeader = IREmit->GetOpHeader(Op->Addr);
if (Op->Class == FEXCore::IR::FPRClass && AddressHeader->Op == OP_ADD && AddressHeader->Size == 8) {
if (Op->Class == FEXCore::IR::FPRClass && AddressHeader->Size == 8) {
// TODO: LRCPC3 supports a vector unscaled offset like LRCPC2.
// Support once hardware is available to use this.
auto MaybeMemAddr = MemExtendedAddressing(IREmit, IROp->Size, AddressHeader);
auto MaybeMemAddr = MemVectorAtomicExtendedAddressing(IREmit, IROp->Size, AddressHeader);
if (!MaybeMemAddr) {
break;
}
auto [OffsetType, OffsetScale, Arg0, Arg1] = *MaybeMemAddr;
auto [OffsetType, OffsetScale, Base, OffsetReg] = *MaybeMemAddr;
Op->OffsetType = OffsetType;
Op->OffsetScale = OffsetScale;
IREmit->ReplaceNodeArgument(CodeNode, Op->Addr_Index, Arg0); // Addr
IREmit->ReplaceNodeArgument(CodeNode, Op->Offset_Index, Arg1); // Offset
IREmit->ReplaceNodeArgument(CodeNode, Op->Addr_Index, Base); // Addr
IREmit->ReplaceNodeArgument(CodeNode, Op->Offset_Index, OffsetReg); // Offset
}
break;
}
@@ -343,18 +363,18 @@ void ConstProp::ConstantPropagation(IREmitter* IREmit, const IRListView& Current
auto Op = IROp->CW<IR::IROp_StoreMemTSO>();
auto AddressHeader = IREmit->GetOpHeader(Op->Addr);
if (Op->Class == FEXCore::IR::FPRClass && AddressHeader->Op == OP_ADD && AddressHeader->Size == 8) {
if (Op->Class == FEXCore::IR::FPRClass && AddressHeader->Size == 8) {
// TODO: LRCPC3 supports a vector unscaled offset like LRCPC2.
// Support once hardware is available to use this.
auto MaybeMemAddr = MemExtendedAddressing(IREmit, IROp->Size, AddressHeader);
auto MaybeMemAddr = MemVectorAtomicExtendedAddressing(IREmit, IROp->Size, AddressHeader);
if (!MaybeMemAddr) {
break;
}
auto [OffsetType, OffsetScale, Arg0, Arg1] = *MaybeMemAddr;
auto [OffsetType, OffsetScale, Base, OffsetReg] = *MaybeMemAddr;
Op->OffsetType = OffsetType;
Op->OffsetScale = OffsetScale;
IREmit->ReplaceNodeArgument(CodeNode, Op->Addr_Index, Arg0); // Addr
IREmit->ReplaceNodeArgument(CodeNode, Op->Offset_Index, Arg1); // Offset
IREmit->ReplaceNodeArgument(CodeNode, Op->Addr_Index, Base); // Addr
IREmit->ReplaceNodeArgument(CodeNode, Op->Offset_Index, OffsetReg); // Offset
}
break;
}
@@ -368,12 +388,12 @@ void ConstProp::ConstantPropagation(IREmitter* IREmit, const IRListView& Current
if (!MaybeMemAddr) {
break;
}
auto [OffsetType, OffsetScale, Arg0, Arg1] = *MaybeMemAddr;
auto [OffsetType, OffsetScale, Base, OffsetReg] = *MaybeMemAddr;
Op->OffsetType = OffsetType;
Op->OffsetScale = OffsetScale;
IREmit->ReplaceNodeArgument(CodeNode, Op->Addr_Index, Arg0); // Addr
IREmit->ReplaceNodeArgument(CodeNode, Op->Offset_Index, Arg1); // Offset
IREmit->ReplaceNodeArgument(CodeNode, Op->Addr_Index, Base); // Addr
IREmit->ReplaceNodeArgument(CodeNode, Op->Offset_Index, OffsetReg); // Offset
}
break;
}
@@ -387,12 +407,12 @@ void ConstProp::ConstantPropagation(IREmitter* IREmit, const IRListView& Current
if (!MaybeMemAddr) {
break;
}
auto [OffsetType, OffsetScale, Arg0, Arg1] = *MaybeMemAddr;
auto [OffsetType, OffsetScale, Base, OffsetReg] = *MaybeMemAddr;
Op->OffsetType = OffsetType;
Op->OffsetScale = OffsetScale;
IREmit->ReplaceNodeArgument(CodeNode, Op->Addr_Index, Arg0); // Addr
IREmit->ReplaceNodeArgument(CodeNode, Op->Offset_Index, Arg1); // Offset
IREmit->ReplaceNodeArgument(CodeNode, Op->Addr_Index, Base); // Addr
IREmit->ReplaceNodeArgument(CodeNode, Op->Offset_Index, OffsetReg); // Offset
}
break;
}
@@ -408,12 +428,12 @@ void ConstProp::ConstantPropagation(IREmitter* IREmit, const IRListView& Current
if (!MaybeMemAddr) {
break;
}
auto [OffsetType, OffsetScale, Arg0, Arg1] = *MaybeMemAddr;
auto [OffsetType, OffsetScale, Base, OffsetReg] = *MaybeMemAddr;
Op->OffsetType = OffsetType;
Op->OffsetScale = OffsetScale;
IREmit->ReplaceNodeArgument(CodeNode, Op->Addr_Index, Arg0); // Addr
IREmit->ReplaceNodeArgument(CodeNode, Op->Offset_Index, Arg1); // Offset
IREmit->ReplaceNodeArgument(CodeNode, Op->Addr_Index, Base); // Addr
IREmit->ReplaceNodeArgument(CodeNode, Op->Offset_Index, OffsetReg); // Offset
}
break;
}

View File

@@ -0,0 +1,95 @@
%ifdef CONFIG
{
"RegData": {
"RAX": "0x5152535455565758",
"RBX": "0x5152535455565758",
"RCX": "0x5152535455565758",
"RDX": "0x5152535455565758",
"RDI": "0x5152535455565758",
"XMM0": ["0x5152535455565758", "0x0"],
"XMM1": ["0x5152535455565758", "0x0"],
"XMM2": ["0x5152535455565758", "0x0"],
"XMM3": ["0x5152535455565758", "0x0"],
"XMM4": ["0x5152535455565758", "0x0"],
"MM0": "0x5152535455565758",
"MM1": "0x5152535455565758",
"MM2": "0x5152535455565758",
"MM3": "0x5152535455565758",
"MM4": "0x5152535455565758"
},
"MemoryRegions": {
"0x00000000a0000000": "4096",
"0x0000000110000000": "4096"
},
"MemoryData": {
"0x00000000a0000000": "0x4142434445464748",
"0x0000000110000000": "0x5152535455565758"
}
}
%endif
; FEX had a bug in its const-prop pass where the x86 SIB scaled index register would accidentally be transposed with the base register.
; This test exercises SIB in a way where a transposed load would read data from the wrong address.
; Basic layout is [r14 + (r15 * 8)]
; r14 will be the base
mov r14, 0x1000_0000
; r15 will be the index
mov r15, 0x2000_0000
; A correct load will read from 0x0000000110000000
; An incorrectly transposed load would read from 0x00000000a0000000
; Break the block
jmp .test
.test:
; Basic GPR SIB test
mov rax, [r14 + (r15 * 8)]
; Basic Vector SIB test
movq xmm0, [r14 + (r15 * 8)]
; Basic MMX SIB test
movq mm0, [r14 + (r15 * 8)]
; Break the block now
jmp .test2
.test2:
; FEX GPR/XMM LoadMem const-prop may only trigger with a disjoint add + mul, so check that case too
; Need to be able to const-prop the multiply
imul r13, r15, 8
; Test base + offset transposed both ways, for all three types
mov rbx, [r14 + r13]
mov rcx, [r13 + r14]
movq xmm1, [r14 + r13]
movq xmm2, [r13 + r14]
movq mm1, [r14 + r13]
movq mm2, [r13 + r14]
; Break the block now
jmp .test3
.test3:
; FEX GPR/XMM LoadMem const-prop may only trigger with a disjoint add + lshl, so check that case too
; Need to be able to const-prop the lshl
mov r13, r15
shl r13, 3
; Test base + offset transposed both ways, for all three types
mov rdx, [r14 + r13]
mov rdi, [r13 + r14]
movq xmm3, [r14 + r13]
movq xmm4, [r13 + r14]
movq mm3, [r14 + r13]
movq mm4, [r13 + r14]
hlt
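
As a quick cross-check of the two addresses this test distinguishes, a minimal C++ sketch of the arithmetic (plain integer math, no FEX types):

#include <cstdint>
#include <cstdio>

int main() {
  const uint64_t Base = 0x10000000;  // r14
  const uint64_t Index = 0x20000000; // r15
  // Correct SIB address: Base + (Index << 3) == 0x0000000110000000
  printf("correct:    0x%llx\n", (unsigned long long)(Base + (Index << 3)));
  // Transposed address: Index + (Base << 3) == 0x00000000a0000000
  printf("transposed: 0x%llx\n", (unsigned long long)(Index + (Base << 3)));
  return 0;
}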

View File

@@ -0,0 +1,26 @@
%ifdef CONFIG
{
"RegData": {
"XMM5": ["0x0000000000000048", "0x0000000000000047"]
}
}
%endif
; FEX-Emu had a bug where a vector load using SIB addressing could overflow to an offset larger than what ARM can encode.
; Test that here.
; The original bug came from the Darwinia Linux binary, in the function `HUF_readDTableX1_wksp`.
mov rbx, 0
lea r15, [rel .data - 0x3d4]
; Break the block
jmp .test
.test:
pmovzxbq xmm5, word [rbx+r15+0x3d4]
hlt
.data:
dq 0x4142434445464748, 0x5152535455565758
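
To see why XMM5 ends up holding [0x48, 0x47]: the -0x3d4 in the lea cancels the +0x3d4 displacement, so the word load reads the first two little-endian bytes of .data, and pmovzxbq zero-extends each byte into its own 64-bit lane. A minimal sketch of that expansion (plain C++, no FEX types):

#include <cstdint>
#include <cstdio>

int main() {
  const uint8_t Src[2] = {0x48, 0x47}; // first two bytes of 0x4142434445464748
  uint64_t Lanes[2];
  for (int i = 0; i < 2; ++i) {
    Lanes[i] = Src[i]; // pmovzxbq: zero-extend each byte to a 64-bit lane
  }
  printf("0x%llx 0x%llx\n", (unsigned long long)Lanes[0], (unsigned long long)Lanes[1]);
  return 0;
}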

View File

@@ -918,7 +918,115 @@
"prefetch [rax + rcx*8]": {
"ExpectedInstructionCount": 1,
"ExpectedArm64ASM": [
"prfm pldl1keep, [x5, x4, sxtx #3]"
"prfm pldl1keep, [x4, x5, sxtx #3]"
]
},
"movzx ebx, byte [rax + rcx*1]": {
"ExpectedInstructionCount": 1,
"ExpectedArm64ASM": [
"ldrb w7, [x4, x5, sxtx]"
]
},
"movzx ebx, byte [rax + rcx*2]": {
"ExpectedInstructionCount": 2,
"ExpectedArm64ASM": [
"add x20, x4, x5, lsl #1",
"ldrb w7, [x20]"
]
},
"movzx ebx, byte [rax + rcx*4]": {
"ExpectedInstructionCount": 2,
"ExpectedArm64ASM": [
"add x20, x4, x5, lsl #2",
"ldrb w7, [x20]"
]
},
"movzx ebx, byte [rax + rcx*8]": {
"ExpectedInstructionCount": 2,
"ExpectedArm64ASM": [
"add x20, x4, x5, lsl #3",
"ldrb w7, [x20]"
]
},
"movzx ebx, word [rax + rcx*1]": {
"ExpectedInstructionCount": 1,
"ExpectedArm64ASM": [
"ldrh w7, [x4, x5, sxtx]"
]
},
"movzx ebx, word [rax + rcx*2]": {
"ExpectedInstructionCount": 2,
"ExpectedArm64ASM": [
"add x20, x4, x5, lsl #1",
"ldrh w7, [x20]"
]
},
"movzx ebx, word [rax + rcx*4]": {
"ExpectedInstructionCount": 2,
"ExpectedArm64ASM": [
"add x20, x4, x5, lsl #2",
"ldrh w7, [x20]"
]
},
"movzx ebx, word [rax + rcx*8]": {
"ExpectedInstructionCount": 2,
"ExpectedArm64ASM": [
"add x20, x4, x5, lsl #3",
"ldrh w7, [x20]"
]
},
"mov ebx, [rax + rcx*1]": {
"ExpectedInstructionCount": 1,
"ExpectedArm64ASM": [
"ldr w7, [x4, x5, sxtx]"
]
},
"mov ebx, [rax + rcx*2]": {
"ExpectedInstructionCount": 2,
"ExpectedArm64ASM": [
"add x20, x4, x5, lsl #1",
"ldr w7, [x20]"
]
},
"mov ebx, [rax + rcx*4]": {
"ExpectedInstructionCount": 2,
"ExpectedArm64ASM": [
"add x20, x4, x5, lsl #2",
"ldr w7, [x20]"
]
},
"mov ebx, [rax + rcx*8]": {
"ExpectedInstructionCount": 2,
"ExpectedArm64ASM": [
"add x20, x4, x5, lsl #3",
"ldr w7, [x20]"
]
},
"mov rbx, [rax + rcx*1]": {
"ExpectedInstructionCount": 1,
"ExpectedArm64ASM": [
"ldr x7, [x4, x5, sxtx]"
]
},
"mov rbx, [rax + rcx*2]": {
"ExpectedInstructionCount": 2,
"ExpectedArm64ASM": [
"add x20, x4, x5, lsl #1",
"ldr x7, [x20]"
]
},
"mov rbx, [rax + rcx*4]": {
"ExpectedInstructionCount": 2,
"ExpectedArm64ASM": [
"add x20, x4, x5, lsl #2",
"ldr x7, [x20]"
]
},
"mov rbx, [rax + rcx*8]": {
"ExpectedInstructionCount": 2,
"ExpectedArm64ASM": [
"add x20, x4, x5, lsl #3",
"ldr x7, [x20]"
]
}
}