Generalize MemCpyOpt's handling of call slot forwarding to function properly when the call slot

forwarding is implemented with a load/store pair rather than a memcpy. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@116637 91177308-0d34-0410-b5e6-96231b3b80d8
2024-11-25 21:00:00 +00:00 · 2010-10-15 22:52:12 +00:00 · 2010-10-15 22:52:12 +00:00 · 6549121c66
commit 6549121c66
parent 00ed59a968
2 changed files with 73 additions and 18 deletions
--- a/lib/Transforms/Scalar/MemCpyOptimizer.cpp
+++ b/lib/Transforms/Scalar/MemCpyOptimizer.cpp
@ -321,7 +321,8 @@ namespace {
    bool processStore(StoreInst *SI, BasicBlock::iterator &BBI);
    bool processMemCpy(MemCpyInst *M);
    bool processMemMove(MemMoveInst *M);
-    bool performCallSlotOptzn(MemCpyInst *cpy, CallInst *C);
+    bool performCallSlotOptzn(Instruction *cpy, Value *cpyDst, Value *cpySrc,
+                              uint64_t cpyLen, CallInst *C);
    bool iterateOnFunction(Function &F);
  };
  
@ -339,7 +340,6 @@ INITIALIZE_AG_DEPENDENCY(AliasAnalysis)
 INITIALIZE_PASS_END(MemCpyOpt, "memcpyopt", "MemCpy Optimization",
                    false, false)

-
 /// processStore - When GVN is scanning forward over instructions, we look for
 /// some other patterns to fold away.  In particular, this looks for stores to
 /// neighboring locations of memory.  If it sees enough consequtive ones
@ -347,6 +347,37 @@ INITIALIZE_PASS_END(MemCpyOpt, "memcpyopt", "MemCpy Optimization",
 bool MemCpyOpt::processStore(StoreInst *SI, BasicBlock::iterator &BBI) {
  if (SI->isVolatile()) return false;
  
+  TargetData *TD = getAnalysisIfAvailable<TargetData>();
+  if (!TD) return false;
+
+  // Detect cases where we're performing call slot forwarding, but
+  // happen to be using a load-store pair to implement it, rather than
+  // a memcpy.
+  if (LoadInst *LI = dyn_cast<LoadInst>(SI->getOperand(0))) {
+    if (!LI->isVolatile() && LI->hasOneUse()) {
+      MemoryDependenceAnalysis &MD = getAnalysis<MemoryDependenceAnalysis>();
+
+      MemDepResult dep = MD.getDependency(LI);
+      CallInst *C = 0;
+      if (dep.isClobber() && !isa<MemCpyInst>(dep.getInst()))
+        C = dyn_cast<CallInst>(dep.getInst());
+      
+      if (C) {
+        bool changed = performCallSlotOptzn(LI,
+                        SI->getPointerOperand()->stripPointerCasts(), 
+                        LI->getPointerOperand()->stripPointerCasts(),
+                        TD->getTypeStoreSize(SI->getOperand(0)->getType()), C);
+        if (changed) {
+          MD.removeInstruction(SI);
+          SI->eraseFromParent();
+          LI->eraseFromParent();
+          ++NumMemCpyInstr;
+          return true;
+        }
+      }
+    }
+  }
+  
  LLVMContext &Context = SI->getContext();

  // There are two cases that are interesting for this code to handle: memcpy
@ -359,8 +390,6 @@ bool MemCpyOpt::processStore(StoreInst *SI, BasicBlock::iterator &BBI) {
  if (!ByteVal)
    return false;

-  TargetData *TD = getAnalysisIfAvailable<TargetData>();
-  if (!TD) return false;
  AliasAnalysis &AA = getAnalysis<AliasAnalysis>();
  Module *M = SI->getParent()->getParent()->getParent();

@ -494,7 +523,9 @@ bool MemCpyOpt::processStore(StoreInst *SI, BasicBlock::iterator &BBI) {
 /// performCallSlotOptzn - takes a memcpy and a call that it depends on,
 /// and checks for the possibility of a call slot optimization by having
 /// the call write its result directly into the destination of the memcpy.
-bool MemCpyOpt::performCallSlotOptzn(MemCpyInst *cpy, CallInst *C) {
+bool MemCpyOpt::performCallSlotOptzn(Instruction *cpy,
+                                     Value *cpyDest, Value *cpySrc,
+                                     uint64_t cpyLen, CallInst *C) {
  // The general transformation to keep in mind is
  //
  //   call @func(..., src, ...)
@ -511,16 +542,8 @@ bool MemCpyOpt::performCallSlotOptzn(MemCpyInst *cpy, CallInst *C) {

  // Deliberately get the source and destination with bitcasts stripped away,
  // because we'll need to do type comparisons based on the underlying type.
-  Value *cpyDest = cpy->getDest();
-  Value *cpySrc = cpy->getSource();
  CallSite CS(C);

-  // We need to be able to reason about the size of the memcpy, so we require
-  // that it be a constant.
-  ConstantInt *cpyLength = dyn_cast<ConstantInt>(cpy->getLength());
-  if (!cpyLength)
-    return false;
-
  // Require that src be an alloca.  This simplifies the reasoning considerably.
  AllocaInst *srcAlloca = dyn_cast<AllocaInst>(cpySrc);
  if (!srcAlloca)
@ -537,7 +560,7 @@ bool MemCpyOpt::performCallSlotOptzn(MemCpyInst *cpy, CallInst *C) {
  uint64_t srcSize = TD->getTypeAllocSize(srcAlloca->getAllocatedType()) *
    srcArraySize->getZExtValue();

-  if (cpyLength->getZExtValue() < srcSize)
+  if (cpyLen < srcSize)
    return false;

  // Check that accessing the first srcSize bytes of dest will not cause a
@ -606,7 +629,7 @@ bool MemCpyOpt::performCallSlotOptzn(MemCpyInst *cpy, CallInst *C) {
  // the use analysis, we also need to know that it does not sneakily
  // access dest.  We rely on AA to figure this out for us.
  AliasAnalysis &AA = getAnalysis<AliasAnalysis>();
-  if (AA.getModRefInfo(C, cpy->getRawDest(), srcSize) !=
+  if (AA.getModRefInfo(C, cpyDest, srcSize) !=
      AliasAnalysis::NoModRef)
    return false;

@ -635,7 +658,6 @@ bool MemCpyOpt::performCallSlotOptzn(MemCpyInst *cpy, CallInst *C) {

  // Remove the memcpy
  MD.removeInstruction(cpy);
-  cpy->eraseFromParent();
  ++NumMemCpyInstr;

  return true;
@ -649,6 +671,10 @@ bool MemCpyOpt::performCallSlotOptzn(MemCpyInst *cpy, CallInst *C) {
 bool MemCpyOpt::processMemCpy(MemCpyInst *M) {
  MemoryDependenceAnalysis &MD = getAnalysis<MemoryDependenceAnalysis>();

+  // We can only optimize statically-sized memcpy's.
+  ConstantInt *cpyLen = dyn_cast<ConstantInt>(M->getLength());
+  if (!cpyLen) return false;
+
  // The are two possible optimizations we can do for memcpy:
  //   a) memcpy-memcpy xform which exposes redundance for DSE.
  //   b) call-memcpy xform for return slot optimization.
@ -656,8 +682,12 @@ bool MemCpyOpt::processMemCpy(MemCpyInst *M) {
  if (!dep.isClobber())
    return false;
  if (!isa<MemCpyInst>(dep.getInst())) {
-    if (CallInst *C = dyn_cast<CallInst>(dep.getInst()))
-      return performCallSlotOptzn(M, C);
+    if (CallInst *C = dyn_cast<CallInst>(dep.getInst())) {
+      bool changed = performCallSlotOptzn(M, M->getDest(), M->getSource(),
+                                  cpyLen->getZExtValue(), C);
+      if (changed) M->eraseFromParent();
+      return changed;
+    }
    return false;
  }
  
--- a/test/Transforms/MemCpyOpt/loadstore-sret.ll
+++ b/test/Transforms/MemCpyOpt/loadstore-sret.ll
@ -0,0 +1,25 @@
+; RUN: opt -S < %s -memcpyopt | FileCheck %s
+; <rdar://problem/8536696>
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
+target triple = "x86_64-apple-darwin10.0.0"
+
+%"class.std::auto_ptr" = type { i32* }
+
+; CHECK: @_Z3foov
+define void @_Z3foov(%"class.std::auto_ptr"* noalias nocapture sret %agg.result) ssp {
+_ZNSt8auto_ptrIiED1Ev.exit:
+  %temp.lvalue = alloca %"class.std::auto_ptr", align 8
+; CHECK: call void @_Z3barv(%"class.std::auto_ptr"* sret %agg.result)
+  call void @_Z3barv(%"class.std::auto_ptr"* sret %temp.lvalue)
+  %tmp.i.i = getelementptr inbounds %"class.std::auto_ptr"* %temp.lvalue, i64 0, i32 0
+; CHECK-NOT: load
+  %tmp2.i.i = load i32** %tmp.i.i, align 8
+  %tmp.i.i4 = getelementptr inbounds %"class.std::auto_ptr"* %agg.result, i64 0, i32 0
+; CHECK-NOT: store
+  store i32* %tmp2.i.i, i32** %tmp.i.i4, align 8
+; CHECK: ret void
+  ret void
+}
+
+declare void @_Z3barv(%"class.std::auto_ptr"* sret)