mirror of
https://github.com/RPCSX/llvm.git
synced 2024-12-02 16:56:50 +00:00
[CGP / PowerPC] avoid multi-block overhead for simple memcmp expansion
The test diff for PowerPC shows we can better optimize if this case is one block. For x86, there's would be a substantial difference if CGP expansion was enabled because branches are assumed cheap and SDAG can't optimize across blocks. Instead of this: _cmp_eq8: movq (%rdi), %rax cmpq (%rsi), %rax je LBB23_1 ## BB#2: ## %res_block movl $1, %ecx jmp LBB23_3 LBB23_1: xorl %ecx, %ecx LBB23_3: ## %endblock xorl %eax, %eax testl %ecx, %ecx sete %al retq We get this: cmp_eq8: movq (%rdi), %rcx xorl %eax, %eax cmpq (%rsi), %rcx sete %al retq And that matches the optimal codegen that we get from the current expansion in SelectionDAGBuilder::visitMemCmpCall(). If this looks right, then I just need to confirm that vector-sized expansion will work from here, and we can enable CGP memcmp() expansion for x86. Ie, we'll bypass the power-of-2 special cases currently optimized in SDAG because we can lower the IR produced here optimally. Differential Revision: https://reviews.llvm.org/D34005 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@304987 91177308-0d34-0410-b5e6-96231b3b80d8
This commit is contained in:
parent
9a5b056374
commit
94a001edde
@ -1675,6 +1675,7 @@ class MemCmpExpansion {
|
|||||||
void emitLoadCompareByteBlock(unsigned Index, int GEPIndex);
|
void emitLoadCompareByteBlock(unsigned Index, int GEPIndex);
|
||||||
void emitMemCmpResultBlock(bool IsLittleEndian);
|
void emitMemCmpResultBlock(bool IsLittleEndian);
|
||||||
Value *getMemCmpExpansionZeroCase(unsigned Size, bool IsLittleEndian);
|
Value *getMemCmpExpansionZeroCase(unsigned Size, bool IsLittleEndian);
|
||||||
|
Value *getMemCmpEqZeroOneBlock(unsigned Size);
|
||||||
unsigned getLoadSize(unsigned Size);
|
unsigned getLoadSize(unsigned Size);
|
||||||
unsigned getNumLoads(unsigned Size);
|
unsigned getNumLoads(unsigned Size);
|
||||||
|
|
||||||
@ -1699,31 +1700,35 @@ MemCmpExpansion::MemCmpExpansion(CallInst *CI, uint64_t Size,
|
|||||||
unsigned MaxLoadSize, unsigned LoadsPerBlock)
|
unsigned MaxLoadSize, unsigned LoadsPerBlock)
|
||||||
: CI(CI), MaxLoadSize(MaxLoadSize), NumLoadsPerBlock(LoadsPerBlock) {
|
: CI(CI), MaxLoadSize(MaxLoadSize), NumLoadsPerBlock(LoadsPerBlock) {
|
||||||
|
|
||||||
IRBuilder<> Builder(CI->getContext());
|
// A memcmp with zero-comparison with only one block of load and compare does
|
||||||
BasicBlock *StartBlock = CI->getParent();
|
// not need to set up any extra blocks. This case could be handled in the DAG,
|
||||||
EndBlock = StartBlock->splitBasicBlock(CI, "endblock");
|
// but since we have all of the machinery to flexibly expand any memcpy here,
|
||||||
setupEndBlockPHINodes();
|
// we choose to handle this case too to avoid fragmented lowering.
|
||||||
IsUsedForZeroCmp = isOnlyUsedInZeroEqualityComparison(CI);
|
IsUsedForZeroCmp = isOnlyUsedInZeroEqualityComparison(CI);
|
||||||
|
|
||||||
// Calculate how many load compare blocks are required for an expansion of
|
|
||||||
// given Size.
|
|
||||||
NumBlocks = calculateNumBlocks(Size);
|
NumBlocks = calculateNumBlocks(Size);
|
||||||
createResultBlock();
|
if (!IsUsedForZeroCmp || NumBlocks != 1) {
|
||||||
|
BasicBlock *StartBlock = CI->getParent();
|
||||||
|
EndBlock = StartBlock->splitBasicBlock(CI, "endblock");
|
||||||
|
setupEndBlockPHINodes();
|
||||||
|
createResultBlock();
|
||||||
|
|
||||||
// If return value of memcmp is not used in a zero equality, we need to
|
// If return value of memcmp is not used in a zero equality, we need to
|
||||||
// calculate which source was larger. The calculation requires the
|
// calculate which source was larger. The calculation requires the
|
||||||
// two loaded source values of each load compare block.
|
// two loaded source values of each load compare block.
|
||||||
// These will be saved in the phi nodes created by setupResultBlockPHINodes.
|
// These will be saved in the phi nodes created by setupResultBlockPHINodes.
|
||||||
if (!IsUsedForZeroCmp)
|
if (!IsUsedForZeroCmp)
|
||||||
setupResultBlockPHINodes();
|
setupResultBlockPHINodes();
|
||||||
|
|
||||||
// Create the number of required load compare basic blocks.
|
// Create the number of required load compare basic blocks.
|
||||||
createLoadCmpBlocks();
|
createLoadCmpBlocks();
|
||||||
|
|
||||||
// Update the terminator added by splitBasicBlock to branch to the first
|
// Update the terminator added by splitBasicBlock to branch to the first
|
||||||
// LoadCmpBlock.
|
// LoadCmpBlock.
|
||||||
|
StartBlock->getTerminator()->setSuccessor(0, LoadCmpBlocks[0]);
|
||||||
|
}
|
||||||
|
|
||||||
|
IRBuilder<> Builder(CI->getContext());
|
||||||
Builder.SetCurrentDebugLocation(CI->getDebugLoc());
|
Builder.SetCurrentDebugLocation(CI->getDebugLoc());
|
||||||
StartBlock->getTerminator()->setSuccessor(0, LoadCmpBlocks[0]);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
void MemCmpExpansion::createLoadCmpBlocks() {
|
void MemCmpExpansion::createLoadCmpBlocks() {
|
||||||
@ -1810,7 +1815,12 @@ Value *MemCmpExpansion::getCompareLoadPairs(unsigned Index, unsigned Size,
|
|||||||
unsigned NumLoadsRemaining = getNumLoads(RemainingBytes);
|
unsigned NumLoadsRemaining = getNumLoads(RemainingBytes);
|
||||||
unsigned NumLoads = std::min(NumLoadsRemaining, NumLoadsPerBlock);
|
unsigned NumLoads = std::min(NumLoadsRemaining, NumLoadsPerBlock);
|
||||||
|
|
||||||
Builder.SetInsertPoint(LoadCmpBlocks[Index]);
|
// For a single-block expansion, start inserting before the memcmp call.
|
||||||
|
if (LoadCmpBlocks.empty())
|
||||||
|
Builder.SetInsertPoint(CI);
|
||||||
|
else
|
||||||
|
Builder.SetInsertPoint(LoadCmpBlocks[Index]);
|
||||||
|
|
||||||
Value *Cmp = nullptr;
|
Value *Cmp = nullptr;
|
||||||
for (unsigned i = 0; i < NumLoads; ++i) {
|
for (unsigned i = 0; i < NumLoads; ++i) {
|
||||||
unsigned LoadSize = getLoadSize(RemainingBytes);
|
unsigned LoadSize = getLoadSize(RemainingBytes);
|
||||||
@ -2071,11 +2081,22 @@ Value *MemCmpExpansion::getMemCmpExpansionZeroCase(unsigned Size,
|
|||||||
return PhiRes;
|
return PhiRes;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// A memcmp expansion that compares equality with 0 and only has one block of
|
||||||
|
/// load and compare can bypass the compare, branch, and phi IR that is required
|
||||||
|
/// in the general case.
|
||||||
|
Value *MemCmpExpansion::getMemCmpEqZeroOneBlock(unsigned Size) {
|
||||||
|
unsigned NumBytesProcessed = 0;
|
||||||
|
IRBuilder<> Builder(CI->getContext());
|
||||||
|
Value *Cmp = getCompareLoadPairs(0, Size, NumBytesProcessed, Builder);
|
||||||
|
return Builder.CreateZExt(Cmp, Type::getInt32Ty(CI->getContext()));
|
||||||
|
}
|
||||||
|
|
||||||
// This function expands the memcmp call into an inline expansion and returns
|
// This function expands the memcmp call into an inline expansion and returns
|
||||||
// the memcmp result.
|
// the memcmp result.
|
||||||
Value *MemCmpExpansion::getMemCmpExpansion(uint64_t Size, bool IsLittleEndian) {
|
Value *MemCmpExpansion::getMemCmpExpansion(uint64_t Size, bool IsLittleEndian) {
|
||||||
if (IsUsedForZeroCmp)
|
if (IsUsedForZeroCmp)
|
||||||
return getMemCmpExpansionZeroCase(Size, IsLittleEndian);
|
return NumBlocks == 1 ? getMemCmpEqZeroOneBlock(Size) :
|
||||||
|
getMemCmpExpansionZeroCase(Size, IsLittleEndian);
|
||||||
|
|
||||||
// This loop calls emitLoadCompareBlock for comparing Size bytes of the two
|
// This loop calls emitLoadCompareBlock for comparing Size bytes of the two
|
||||||
// memcmp sources. It starts with loading using the maximum load size set by
|
// memcmp sources. It starts with loading using the maximum load size set by
|
||||||
|
@ -17,13 +17,13 @@ declare signext i32 @memcmp(i8* nocapture, i8* nocapture, i64) local_unnamed_add
|
|||||||
; Check 4 bytes - requires 1 load for each param.
|
; Check 4 bytes - requires 1 load for each param.
|
||||||
define signext i32 @zeroEqualityTest02(i8* %x, i8* %y) {
|
define signext i32 @zeroEqualityTest02(i8* %x, i8* %y) {
|
||||||
; CHECK-LABEL: zeroEqualityTest02:
|
; CHECK-LABEL: zeroEqualityTest02:
|
||||||
; CHECK: # BB#0: # %loadbb
|
; CHECK: # BB#0:
|
||||||
; CHECK-NEXT: lwz 3, 0(3)
|
; CHECK-NEXT: lwz 3, 0(3)
|
||||||
; CHECK-NEXT: lwz 4, 0(4)
|
; CHECK-NEXT: lwz 4, 0(4)
|
||||||
; CHECK-NEXT: li 5, 1
|
; CHECK-NEXT: xor 3, 3, 4
|
||||||
; CHECK-NEXT: cmplw 3, 4
|
; CHECK-NEXT: cntlzw 3, 3
|
||||||
; CHECK-NEXT: isel 3, 0, 5, 2
|
; CHECK-NEXT: srwi 3, 3, 5
|
||||||
; CHECK-NEXT: clrldi 3, 3, 32
|
; CHECK-NEXT: xori 3, 3, 1
|
||||||
; CHECK-NEXT: blr
|
; CHECK-NEXT: blr
|
||||||
%call = tail call signext i32 @memcmp(i8* %x, i8* %y, i64 4)
|
%call = tail call signext i32 @memcmp(i8* %x, i8* %y, i64 4)
|
||||||
%not.cmp = icmp ne i32 %call, 0
|
%not.cmp = icmp ne i32 %call, 0
|
||||||
|
Loading…
Reference in New Issue
Block a user