From 94a001edde5fa3ef7cc233799ef0f0ca2cc69e90 Mon Sep 17 00:00:00 2001
From: Sanjay Patel <spatel@rotateright.com>
Date: Thu, 8 Jun 2017 16:53:18 +0000
Subject: [PATCH] [CGP / PowerPC] avoid multi-block overhead for simple memcmp
 expansion

The test diff for PowerPC shows we can better optimize if this case is one block.

For x86, there's would be a substantial difference if CGP expansion was enabled because branches are assumed
cheap and SDAG can't optimize across blocks.

Instead of this:

_cmp_eq8:
  movq  (%rdi), %rax
  cmpq  (%rsi), %rax
  je  LBB23_1
## BB#2:                                ## %res_block
  movl  $1, %ecx
  jmp LBB23_3
LBB23_1:
  xorl  %ecx, %ecx
LBB23_3:                                ## %endblock
  xorl  %eax, %eax
  testl %ecx, %ecx
  sete  %al
  retq

We get this:

cmp_eq8:
  movq  (%rdi), %rcx
  xorl  %eax, %eax
  cmpq  (%rsi), %rcx
  sete  %al
  retq

And that matches the optimal codegen that we get from the current expansion in SelectionDAGBuilder::visitMemCmpCall().
If this looks right, then I just need to confirm that vector-sized expansion will work from here, and we can enable
CGP memcmp() expansion for x86. Ie, we'll bypass the power-of-2 special cases currently optimized in SDAG because we
can lower the IR produced here optimally.

Differential Revision: https://reviews.llvm.org/D34005


git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@304987 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/CodeGen/CodeGenPrepare.cpp                | 63 ++++++++++++-------
 .../memCmpUsedInZeroEqualityComparison.ll     | 10 +--
 2 files changed, 47 insertions(+), 26 deletions(-)

diff --git a/lib/CodeGen/CodeGenPrepare.cpp b/lib/CodeGen/CodeGenPrepare.cpp
index e797f1af70c..b9e0e2198e1 100644
--- a/lib/CodeGen/CodeGenPrepare.cpp
+++ b/lib/CodeGen/CodeGenPrepare.cpp
@@ -1675,6 +1675,7 @@ class MemCmpExpansion {
   void emitLoadCompareByteBlock(unsigned Index, int GEPIndex);
   void emitMemCmpResultBlock(bool IsLittleEndian);
   Value *getMemCmpExpansionZeroCase(unsigned Size, bool IsLittleEndian);
+  Value *getMemCmpEqZeroOneBlock(unsigned Size);
   unsigned getLoadSize(unsigned Size);
   unsigned getNumLoads(unsigned Size);
 
@@ -1699,31 +1700,35 @@ MemCmpExpansion::MemCmpExpansion(CallInst *CI, uint64_t Size,
                                  unsigned MaxLoadSize, unsigned LoadsPerBlock)
     : CI(CI), MaxLoadSize(MaxLoadSize), NumLoadsPerBlock(LoadsPerBlock) {
 
-  IRBuilder<> Builder(CI->getContext());
-  BasicBlock *StartBlock = CI->getParent();
-  EndBlock = StartBlock->splitBasicBlock(CI, "endblock");
-  setupEndBlockPHINodes();
+  // A memcmp with zero-comparison with only one block of load and compare does
+  // not need to set up any extra blocks. This case could be handled in the DAG,
+  // but since we have all of the machinery to flexibly expand any memcpy here,
+  // we choose to handle this case too to avoid fragmented lowering.
   IsUsedForZeroCmp = isOnlyUsedInZeroEqualityComparison(CI);
-
-  // Calculate how many load compare blocks are required for an expansion of
-  // given Size.
   NumBlocks = calculateNumBlocks(Size);
-  createResultBlock();
+  if (!IsUsedForZeroCmp || NumBlocks != 1) {
+    BasicBlock *StartBlock = CI->getParent();
+    EndBlock = StartBlock->splitBasicBlock(CI, "endblock");
+    setupEndBlockPHINodes();
+    createResultBlock();
 
-  // If return value of memcmp is not used in a zero equality, we need to
-  // calculate which source was larger. The calculation requires the
-  // two loaded source values of each load compare block.
-  // These will be saved in the phi nodes created by setupResultBlockPHINodes.
-  if (!IsUsedForZeroCmp)
-    setupResultBlockPHINodes();
+    // If return value of memcmp is not used in a zero equality, we need to
+    // calculate which source was larger. The calculation requires the
+    // two loaded source values of each load compare block.
+    // These will be saved in the phi nodes created by setupResultBlockPHINodes.
+    if (!IsUsedForZeroCmp)
+      setupResultBlockPHINodes();
 
-  // Create the number of required load compare basic blocks.
-  createLoadCmpBlocks();
+    // Create the number of required load compare basic blocks.
+    createLoadCmpBlocks();
 
-  // Update the terminator added by splitBasicBlock to branch to the first
-  // LoadCmpBlock.
+    // Update the terminator added by splitBasicBlock to branch to the first
+    // LoadCmpBlock.
+    StartBlock->getTerminator()->setSuccessor(0, LoadCmpBlocks[0]);
+  }
+
+  IRBuilder<> Builder(CI->getContext());
   Builder.SetCurrentDebugLocation(CI->getDebugLoc());
-  StartBlock->getTerminator()->setSuccessor(0, LoadCmpBlocks[0]);
 }
 
 void MemCmpExpansion::createLoadCmpBlocks() {
@@ -1810,7 +1815,12 @@ Value *MemCmpExpansion::getCompareLoadPairs(unsigned Index, unsigned Size,
   unsigned NumLoadsRemaining = getNumLoads(RemainingBytes);
   unsigned NumLoads = std::min(NumLoadsRemaining, NumLoadsPerBlock);
 
-  Builder.SetInsertPoint(LoadCmpBlocks[Index]);
+  // For a single-block expansion, start inserting before the memcmp call.
+  if (LoadCmpBlocks.empty())
+    Builder.SetInsertPoint(CI);
+  else
+    Builder.SetInsertPoint(LoadCmpBlocks[Index]);
+
   Value *Cmp = nullptr;
   for (unsigned i = 0; i < NumLoads; ++i) {
     unsigned LoadSize = getLoadSize(RemainingBytes);
@@ -2071,11 +2081,22 @@ Value *MemCmpExpansion::getMemCmpExpansionZeroCase(unsigned Size,
   return PhiRes;
 }
 
+/// A memcmp expansion that compares equality with 0 and only has one block of
+/// load and compare can bypass the compare, branch, and phi IR that is required
+/// in the general case.
+Value *MemCmpExpansion::getMemCmpEqZeroOneBlock(unsigned Size) {
+  unsigned NumBytesProcessed = 0;
+  IRBuilder<> Builder(CI->getContext());
+  Value *Cmp = getCompareLoadPairs(0, Size, NumBytesProcessed, Builder);
+  return Builder.CreateZExt(Cmp, Type::getInt32Ty(CI->getContext()));
+}
+
 // This function expands the memcmp call into an inline expansion and returns
 // the memcmp result.
 Value *MemCmpExpansion::getMemCmpExpansion(uint64_t Size, bool IsLittleEndian) {
   if (IsUsedForZeroCmp)
-    return getMemCmpExpansionZeroCase(Size, IsLittleEndian);
+    return NumBlocks == 1 ? getMemCmpEqZeroOneBlock(Size) :
+                            getMemCmpExpansionZeroCase(Size, IsLittleEndian);
 
   // This loop calls emitLoadCompareBlock for comparing Size bytes of the two
   // memcmp sources. It starts with loading using the maximum load size set by
diff --git a/test/CodeGen/PowerPC/memCmpUsedInZeroEqualityComparison.ll b/test/CodeGen/PowerPC/memCmpUsedInZeroEqualityComparison.ll
index 84d3e884102..e2c842f25b9 100644
--- a/test/CodeGen/PowerPC/memCmpUsedInZeroEqualityComparison.ll
+++ b/test/CodeGen/PowerPC/memCmpUsedInZeroEqualityComparison.ll
@@ -17,13 +17,13 @@ declare signext i32 @memcmp(i8* nocapture, i8* nocapture, i64) local_unnamed_add
 ; Check 4 bytes - requires 1 load for each param.
 define signext i32 @zeroEqualityTest02(i8* %x, i8* %y) {
 ; CHECK-LABEL: zeroEqualityTest02:
-; CHECK:       # BB#0: # %loadbb
+; CHECK:       # BB#0:
 ; CHECK-NEXT:    lwz 3, 0(3)
 ; CHECK-NEXT:    lwz 4, 0(4)
-; CHECK-NEXT:    li 5, 1
-; CHECK-NEXT:    cmplw 3, 4
-; CHECK-NEXT:    isel 3, 0, 5, 2
-; CHECK-NEXT:    clrldi 3, 3, 32
+; CHECK-NEXT:    xor 3, 3, 4
+; CHECK-NEXT:    cntlzw 3, 3
+; CHECK-NEXT:    srwi 3, 3, 5
+; CHECK-NEXT:    xori 3, 3, 1
 ; CHECK-NEXT:    blr
   %call = tail call signext i32 @memcmp(i8* %x, i8* %y, i64 4)
   %not.cmp = icmp ne i32 %call, 0