From 94a001edde5fa3ef7cc233799ef0f0ca2cc69e90 Mon Sep 17 00:00:00 2001 From: Sanjay Patel Date: Thu, 8 Jun 2017 16:53:18 +0000 Subject: [PATCH] [CGP / PowerPC] avoid multi-block overhead for simple memcmp expansion The test diff for PowerPC shows we can better optimize if this case is one block. For x86, there's would be a substantial difference if CGP expansion was enabled because branches are assumed cheap and SDAG can't optimize across blocks. Instead of this: _cmp_eq8: movq (%rdi), %rax cmpq (%rsi), %rax je LBB23_1 ## BB#2: ## %res_block movl $1, %ecx jmp LBB23_3 LBB23_1: xorl %ecx, %ecx LBB23_3: ## %endblock xorl %eax, %eax testl %ecx, %ecx sete %al retq We get this: cmp_eq8: movq (%rdi), %rcx xorl %eax, %eax cmpq (%rsi), %rcx sete %al retq And that matches the optimal codegen that we get from the current expansion in SelectionDAGBuilder::visitMemCmpCall(). If this looks right, then I just need to confirm that vector-sized expansion will work from here, and we can enable CGP memcmp() expansion for x86. Ie, we'll bypass the power-of-2 special cases currently optimized in SDAG because we can lower the IR produced here optimally. Differential Revision: https://reviews.llvm.org/D34005 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@304987 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/CodeGen/CodeGenPrepare.cpp | 63 ++++++++++++------- .../memCmpUsedInZeroEqualityComparison.ll | 10 +-- 2 files changed, 47 insertions(+), 26 deletions(-) diff --git a/lib/CodeGen/CodeGenPrepare.cpp b/lib/CodeGen/CodeGenPrepare.cpp index e797f1af70c..b9e0e2198e1 100644 --- a/lib/CodeGen/CodeGenPrepare.cpp +++ b/lib/CodeGen/CodeGenPrepare.cpp @@ -1675,6 +1675,7 @@ class MemCmpExpansion { void emitLoadCompareByteBlock(unsigned Index, int GEPIndex); void emitMemCmpResultBlock(bool IsLittleEndian); Value *getMemCmpExpansionZeroCase(unsigned Size, bool IsLittleEndian); + Value *getMemCmpEqZeroOneBlock(unsigned Size); unsigned getLoadSize(unsigned Size); unsigned getNumLoads(unsigned Size); @@ -1699,31 +1700,35 @@ MemCmpExpansion::MemCmpExpansion(CallInst *CI, uint64_t Size, unsigned MaxLoadSize, unsigned LoadsPerBlock) : CI(CI), MaxLoadSize(MaxLoadSize), NumLoadsPerBlock(LoadsPerBlock) { - IRBuilder<> Builder(CI->getContext()); - BasicBlock *StartBlock = CI->getParent(); - EndBlock = StartBlock->splitBasicBlock(CI, "endblock"); - setupEndBlockPHINodes(); + // A memcmp with zero-comparison with only one block of load and compare does + // not need to set up any extra blocks. This case could be handled in the DAG, + // but since we have all of the machinery to flexibly expand any memcpy here, + // we choose to handle this case too to avoid fragmented lowering. IsUsedForZeroCmp = isOnlyUsedInZeroEqualityComparison(CI); - - // Calculate how many load compare blocks are required for an expansion of - // given Size. NumBlocks = calculateNumBlocks(Size); - createResultBlock(); + if (!IsUsedForZeroCmp || NumBlocks != 1) { + BasicBlock *StartBlock = CI->getParent(); + EndBlock = StartBlock->splitBasicBlock(CI, "endblock"); + setupEndBlockPHINodes(); + createResultBlock(); - // If return value of memcmp is not used in a zero equality, we need to - // calculate which source was larger. The calculation requires the - // two loaded source values of each load compare block. - // These will be saved in the phi nodes created by setupResultBlockPHINodes. - if (!IsUsedForZeroCmp) - setupResultBlockPHINodes(); + // If return value of memcmp is not used in a zero equality, we need to + // calculate which source was larger. The calculation requires the + // two loaded source values of each load compare block. + // These will be saved in the phi nodes created by setupResultBlockPHINodes. + if (!IsUsedForZeroCmp) + setupResultBlockPHINodes(); - // Create the number of required load compare basic blocks. - createLoadCmpBlocks(); + // Create the number of required load compare basic blocks. + createLoadCmpBlocks(); - // Update the terminator added by splitBasicBlock to branch to the first - // LoadCmpBlock. + // Update the terminator added by splitBasicBlock to branch to the first + // LoadCmpBlock. + StartBlock->getTerminator()->setSuccessor(0, LoadCmpBlocks[0]); + } + + IRBuilder<> Builder(CI->getContext()); Builder.SetCurrentDebugLocation(CI->getDebugLoc()); - StartBlock->getTerminator()->setSuccessor(0, LoadCmpBlocks[0]); } void MemCmpExpansion::createLoadCmpBlocks() { @@ -1810,7 +1815,12 @@ Value *MemCmpExpansion::getCompareLoadPairs(unsigned Index, unsigned Size, unsigned NumLoadsRemaining = getNumLoads(RemainingBytes); unsigned NumLoads = std::min(NumLoadsRemaining, NumLoadsPerBlock); - Builder.SetInsertPoint(LoadCmpBlocks[Index]); + // For a single-block expansion, start inserting before the memcmp call. + if (LoadCmpBlocks.empty()) + Builder.SetInsertPoint(CI); + else + Builder.SetInsertPoint(LoadCmpBlocks[Index]); + Value *Cmp = nullptr; for (unsigned i = 0; i < NumLoads; ++i) { unsigned LoadSize = getLoadSize(RemainingBytes); @@ -2071,11 +2081,22 @@ Value *MemCmpExpansion::getMemCmpExpansionZeroCase(unsigned Size, return PhiRes; } +/// A memcmp expansion that compares equality with 0 and only has one block of +/// load and compare can bypass the compare, branch, and phi IR that is required +/// in the general case. +Value *MemCmpExpansion::getMemCmpEqZeroOneBlock(unsigned Size) { + unsigned NumBytesProcessed = 0; + IRBuilder<> Builder(CI->getContext()); + Value *Cmp = getCompareLoadPairs(0, Size, NumBytesProcessed, Builder); + return Builder.CreateZExt(Cmp, Type::getInt32Ty(CI->getContext())); +} + // This function expands the memcmp call into an inline expansion and returns // the memcmp result. Value *MemCmpExpansion::getMemCmpExpansion(uint64_t Size, bool IsLittleEndian) { if (IsUsedForZeroCmp) - return getMemCmpExpansionZeroCase(Size, IsLittleEndian); + return NumBlocks == 1 ? getMemCmpEqZeroOneBlock(Size) : + getMemCmpExpansionZeroCase(Size, IsLittleEndian); // This loop calls emitLoadCompareBlock for comparing Size bytes of the two // memcmp sources. It starts with loading using the maximum load size set by diff --git a/test/CodeGen/PowerPC/memCmpUsedInZeroEqualityComparison.ll b/test/CodeGen/PowerPC/memCmpUsedInZeroEqualityComparison.ll index 84d3e884102..e2c842f25b9 100644 --- a/test/CodeGen/PowerPC/memCmpUsedInZeroEqualityComparison.ll +++ b/test/CodeGen/PowerPC/memCmpUsedInZeroEqualityComparison.ll @@ -17,13 +17,13 @@ declare signext i32 @memcmp(i8* nocapture, i8* nocapture, i64) local_unnamed_add ; Check 4 bytes - requires 1 load for each param. define signext i32 @zeroEqualityTest02(i8* %x, i8* %y) { ; CHECK-LABEL: zeroEqualityTest02: -; CHECK: # BB#0: # %loadbb +; CHECK: # BB#0: ; CHECK-NEXT: lwz 3, 0(3) ; CHECK-NEXT: lwz 4, 0(4) -; CHECK-NEXT: li 5, 1 -; CHECK-NEXT: cmplw 3, 4 -; CHECK-NEXT: isel 3, 0, 5, 2 -; CHECK-NEXT: clrldi 3, 3, 32 +; CHECK-NEXT: xor 3, 3, 4 +; CHECK-NEXT: cntlzw 3, 3 +; CHECK-NEXT: srwi 3, 3, 5 +; CHECK-NEXT: xori 3, 3, 1 ; CHECK-NEXT: blr %call = tail call signext i32 @memcmp(i8* %x, i8* %y, i64 4) %not.cmp = icmp ne i32 %call, 0