[CGP / PowerPC] avoid multi-block overhead for simple memcmp expansion

The test diff for PowerPC shows that we can optimize this case better when it is expanded as a single block.

For x86, there would be a substantial difference if CGP expansion were enabled, because branches are assumed
cheap and SDAG can't optimize across blocks.

Instead of this:

_cmp_eq8:
  movq  (%rdi), %rax
  cmpq  (%rsi), %rax
  je  LBB23_1
## BB#2:                                ## %res_block
  movl  $1, %ecx
  jmp LBB23_3
LBB23_1:
  xorl  %ecx, %ecx
LBB23_3:                                ## %endblock
  xorl  %eax, %eax
  testl %ecx, %ecx
  sete  %al
  retq

We get this:

cmp_eq8:   
  movq  (%rdi), %rcx
  xorl  %eax, %eax
  cmpq  (%rsi), %rcx
  sete  %al
  retq

And that matches the optimal codegen that we get from the current expansion in SelectionDAGBuilder::visitMemCmpCall().
If this looks right, then I just need to confirm that vector-sized expansion will work from here, and we can enable
CGP memcmp() expansion for x86. That is, we'll bypass the power-of-2 special cases currently optimized in SDAG because
we can lower the IR produced here optimally.
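
For reference, the single-block expansion of the 8-byte zero-equality case produces straight-line IR roughly like
the following (a sketch with illustrative value names, not the exact output of the pass):

define i32 @cmp_eq8(i8* %x, i8* %y) {
  %lhs.addr = bitcast i8* %x to i64*
  %rhs.addr = bitcast i8* %y to i64*
  %lhs = load i64, i64* %lhs.addr
  %rhs = load i64, i64* %rhs.addr
  %ne = icmp ne i64 %lhs, %rhs
  %memcmp.res = zext i1 %ne to i32      ; stands in for the memcmp result
  %cmp = icmp eq i32 %memcmp.res, 0     ; the caller's zero-equality test
  %conv = zext i1 %cmp to i32
  ret i32 %conv
}

Because everything stays in one basic block, SDAG can fold the icmp/zext/icmp/zext chain into the cmpq/sete
sequence shown above.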

Differential Revision: https://reviews.llvm.org/D34005


git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@304987 91177308-0d34-0410-b5e6-96231b3b80d8
Author: Sanjay Patel
Date:   2017-06-08 16:53:18 +00:00
Commit: 94a001edde (parent 9a5b056374)
2 changed files with 47 additions and 26 deletions


@@ -1675,6 +1675,7 @@ class MemCmpExpansion {
   void emitLoadCompareByteBlock(unsigned Index, int GEPIndex);
   void emitMemCmpResultBlock(bool IsLittleEndian);
   Value *getMemCmpExpansionZeroCase(unsigned Size, bool IsLittleEndian);
+  Value *getMemCmpEqZeroOneBlock(unsigned Size);
   unsigned getLoadSize(unsigned Size);
   unsigned getNumLoads(unsigned Size);
@@ -1699,31 +1700,35 @@ MemCmpExpansion::MemCmpExpansion(CallInst *CI, uint64_t Size,
                                  unsigned MaxLoadSize, unsigned LoadsPerBlock)
     : CI(CI), MaxLoadSize(MaxLoadSize), NumLoadsPerBlock(LoadsPerBlock) {
 
-  IRBuilder<> Builder(CI->getContext());
-  BasicBlock *StartBlock = CI->getParent();
-  EndBlock = StartBlock->splitBasicBlock(CI, "endblock");
-  setupEndBlockPHINodes();
+  // A memcmp with zero-comparison with only one block of load and compare does
+  // not need to set up any extra blocks. This case could be handled in the DAG,
+  // but since we have all of the machinery to flexibly expand any memcpy here,
+  // we choose to handle this case too to avoid fragmented lowering.
   IsUsedForZeroCmp = isOnlyUsedInZeroEqualityComparison(CI);
-
-  // Calculate how many load compare blocks are required for an expansion of
-  // given Size.
   NumBlocks = calculateNumBlocks(Size);
-  createResultBlock();
+  if (!IsUsedForZeroCmp || NumBlocks != 1) {
+    BasicBlock *StartBlock = CI->getParent();
+    EndBlock = StartBlock->splitBasicBlock(CI, "endblock");
+    setupEndBlockPHINodes();
+    createResultBlock();
 
-  // If return value of memcmp is not used in a zero equality, we need to
-  // calculate which source was larger. The calculation requires the
-  // two loaded source values of each load compare block.
-  // These will be saved in the phi nodes created by setupResultBlockPHINodes.
-  if (!IsUsedForZeroCmp)
-    setupResultBlockPHINodes();
+    // If return value of memcmp is not used in a zero equality, we need to
+    // calculate which source was larger. The calculation requires the
+    // two loaded source values of each load compare block.
+    // These will be saved in the phi nodes created by setupResultBlockPHINodes.
+    if (!IsUsedForZeroCmp)
+      setupResultBlockPHINodes();
 
-  // Create the number of required load compare basic blocks.
-  createLoadCmpBlocks();
+    // Create the number of required load compare basic blocks.
+    createLoadCmpBlocks();
 
-  // Update the terminator added by splitBasicBlock to branch to the first
-  // LoadCmpBlock.
+    // Update the terminator added by splitBasicBlock to branch to the first
+    // LoadCmpBlock.
+    StartBlock->getTerminator()->setSuccessor(0, LoadCmpBlocks[0]);
+  }
+
+  IRBuilder<> Builder(CI->getContext());
   Builder.SetCurrentDebugLocation(CI->getDebugLoc());
-  StartBlock->getTerminator()->setSuccessor(0, LoadCmpBlocks[0]);
 }
 
 void MemCmpExpansion::createLoadCmpBlocks() {
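
For context, the zero-equality pattern that isOnlyUsedInZeroEqualityComparison() gates on looks like this before
expansion (a hedged sketch; the caller's function name and value names are made up for illustration):

declare i32 @memcmp(i8* nocapture, i8* nocapture, i64)

define zeroext i1 @is_eq4(i8* %x, i8* %y) {
  ; The only use of %call is a comparison against zero, so IsUsedForZeroCmp is
  ; set, and with NumBlocks == 1 the constructor above skips splitting the
  ; block and creating the result/end blocks.
  %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 4)
  %cmp = icmp eq i32 %call, 0
  ret i1 %cmp
}
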
@@ -1810,7 +1815,12 @@ Value *MemCmpExpansion::getCompareLoadPairs(unsigned Index, unsigned Size,
   unsigned NumLoadsRemaining = getNumLoads(RemainingBytes);
   unsigned NumLoads = std::min(NumLoadsRemaining, NumLoadsPerBlock);
-  Builder.SetInsertPoint(LoadCmpBlocks[Index]);
+  // For a single-block expansion, start inserting before the memcmp call.
+  if (LoadCmpBlocks.empty())
+    Builder.SetInsertPoint(CI);
+  else
+    Builder.SetInsertPoint(LoadCmpBlocks[Index]);
+
   Value *Cmp = nullptr;
   for (unsigned i = 0; i < NumLoads; ++i) {
     unsigned LoadSize = getLoadSize(RemainingBytes);
@@ -2071,11 +2081,22 @@ Value *MemCmpExpansion::getMemCmpExpansionZeroCase(unsigned Size,
   return PhiRes;
 }
 
+/// A memcmp expansion that compares equality with 0 and only has one block of
+/// load and compare can bypass the compare, branch, and phi IR that is required
+/// in the general case.
+Value *MemCmpExpansion::getMemCmpEqZeroOneBlock(unsigned Size) {
+  unsigned NumBytesProcessed = 0;
+  IRBuilder<> Builder(CI->getContext());
+  Value *Cmp = getCompareLoadPairs(0, Size, NumBytesProcessed, Builder);
+  return Builder.CreateZExt(Cmp, Type::getInt32Ty(CI->getContext()));
+}
+
 // This function expands the memcmp call into an inline expansion and returns
 // the memcmp result.
 Value *MemCmpExpansion::getMemCmpExpansion(uint64_t Size, bool IsLittleEndian) {
   if (IsUsedForZeroCmp)
-    return getMemCmpExpansionZeroCase(Size, IsLittleEndian);
+    return NumBlocks == 1 ? getMemCmpEqZeroOneBlock(Size) :
+                            getMemCmpExpansionZeroCase(Size, IsLittleEndian);
 
   // This loop calls emitLoadCompareBlock for comparing Size bytes of the two
   // memcmp sources. It starts with loading using the maximum load size set by
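
Tying this to the PowerPC test below: after the one-block expansion, a 4-byte zero-equality memcmp reduces to IR
along these lines (again a sketch with a made-up function name), and the PowerPC backend lowers the compare-and-zext
into the branch-free xor/cntlzw/srwi/xori sequence in the updated CHECK lines:

define i32 @is_ne4(i8* %x, i8* %y) {
  %lhs.addr = bitcast i8* %x to i32*
  %rhs.addr = bitcast i8* %y to i32*
  %lhs = load i32, i32* %lhs.addr
  %rhs = load i32, i32* %rhs.addr
  %ne = icmp ne i32 %lhs, %rhs
  %res = zext i1 %ne to i32     ; value returned by getMemCmpEqZeroOneBlock
  ret i32 %res
}

The trick in the new assembly: x ^ y is zero exactly when the words are equal, cntlzw of zero is 32, so shifting
right by 5 yields 1 for equal inputs and 0 otherwise, and xori with 1 flips that into the not-equal result.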


@@ -17,13 +17,13 @@ declare signext i32 @memcmp(i8* nocapture, i8* nocapture, i64) local_unnamed_add
 ; Check 4 bytes - requires 1 load for each param.
 define signext i32 @zeroEqualityTest02(i8* %x, i8* %y) {
 ; CHECK-LABEL: zeroEqualityTest02:
-; CHECK:       # BB#0: # %loadbb
+; CHECK:       # BB#0:
 ; CHECK-NEXT:    lwz 3, 0(3)
 ; CHECK-NEXT:    lwz 4, 0(4)
-; CHECK-NEXT:    li 5, 1
-; CHECK-NEXT:    cmplw 3, 4
-; CHECK-NEXT:    isel 3, 0, 5, 2
-; CHECK-NEXT:    clrldi 3, 3, 32
+; CHECK-NEXT:    xor 3, 3, 4
+; CHECK-NEXT:    cntlzw 3, 3
+; CHECK-NEXT:    srwi 3, 3, 5
+; CHECK-NEXT:    xori 3, 3, 1
 ; CHECK-NEXT:    blr
   %call = tail call signext i32 @memcmp(i8* %x, i8* %y, i64 4)
   %not.cmp = icmp ne i32 %call, 0