[BDCE] Add a bit-tracking DCE pass

BDCE is a bit-tracking dead code elimination pass. It is based on ADCE (the "aggressive DCE" pass), with the added capability to track dead bits of integer valued instructions and remove those instructions when all of the bits are dead. Currently, it does not actually do this all-bits-dead removal, but rather replaces the instruction's uses with a constant zero, and lets instcombine (and the later run of ADCE) do the rest. Because we essentially get a run of ADCE "for free" while tracking the dead bits, we also do what ADCE does and removes actually-dead instructions as well (this includes instructions newly trivially dead because all bits were dead, but not all such instructions can be removed). The motivation for this is a case like: int __attribute__((const)) foo(int i); int bar(int x) { x |= (4 & foo(5)); x |= (8 & foo(3)); x |= (16 & foo(2)); x |= (32 & foo(1)); x |= (64 & foo(0)); x |= (128& foo(4)); return x >> 4; } As it turns out, if you order the bit-field insertions so that all of the dead ones come last, then instcombine will remove them. However, if you pick some other order (such as the one above), the fact that some of the calls to foo() are useless is not locally obvious, and we don't remove them (without this pass). I did a quick compile-time overhead check using sqlite from the test suite (Release+Asserts). BDCE took ~0.4% of the compilation time (making it about twice as expensive as ADCE). I've not looked at why yet, but we eliminate instructions due to having all-dead bits in: External/SPEC/CFP2006/447.dealII/447.dealII External/SPEC/CINT2006/400.perlbench/400.perlbench External/SPEC/CINT2006/403.gcc/403.gcc MultiSource/Applications/ClamAV/clamscan MultiSource/Benchmarks/7zip/7zip-benchmark llvm-svn: 229462
2024-11-29 16:41:27 +00:00 · 2015-02-17 01:36:59 +00:00 · 2015-02-17 01:36:59 +00:00 · 2bb61ba2fe
commit 2bb61ba2fe
parent 2754714fb9
10 changed files with 812 additions and 0 deletions
--- a/llvm/include/llvm-c/Transforms/Scalar.h
+++ b/llvm/include/llvm-c/Transforms/Scalar.h
@ -35,6 +35,9 @@ extern "C" {
 /** See llvm::createAggressiveDCEPass function. */
 void LLVMAddAggressiveDCEPass(LLVMPassManagerRef PM);

+/** See llvm::createBitTrackingDCEPass function. */
+void LLVMAddBitTrackingDCEPass(LLVMPassManagerRef PM);
+
 /** See llvm::createAlignmentFromAssumptionsPass function. */
 void LLVMAddAlignmentFromAssumptionsPass(LLVMPassManagerRef PM);

--- a/llvm/include/llvm/InitializePasses.h
+++ b/llvm/include/llvm/InitializePasses.h
@ -65,6 +65,7 @@ void initializeTarget(PassRegistry&);
 void initializeAAEvalPass(PassRegistry&);
 void initializeAddDiscriminatorsPass(PassRegistry&);
 void initializeADCEPass(PassRegistry&);
+void initializeBDCEPass(PassRegistry&);
 void initializeAliasAnalysisAnalysisGroup(PassRegistry&);
 void initializeAliasAnalysisCounterPass(PassRegistry&);
 void initializeAliasDebuggerPass(PassRegistry&);
--- a/llvm/include/llvm/LinkAllPasses.h
+++ b/llvm/include/llvm/LinkAllPasses.h
@ -49,6 +49,7 @@ namespace {

      (void) llvm::createAAEvalPass();
      (void) llvm::createAggressiveDCEPass();
+      (void) llvm::createBitTrackingDCEPass();
      (void) llvm::createAliasAnalysisCounterPass();
      (void) llvm::createAliasDebugger();
      (void) llvm::createArgumentPromotionPass();
--- a/llvm/include/llvm/Transforms/Scalar.h
+++ b/llvm/include/llvm/Transforms/Scalar.h
@ -80,6 +80,13 @@ FunctionPass *createDeadStoreEliminationPass();
 //
 FunctionPass *createAggressiveDCEPass();

+//===----------------------------------------------------------------------===//
+//
+// BitTrackingDCE - This pass uses a bit-tracking DCE algorithm in order to
+// remove computations of dead bits.
+//
+FunctionPass *createBitTrackingDCEPass();
+
 //===----------------------------------------------------------------------===//
 //
 // SROA - Replace aggregates or pieces of aggregates with scalar SSA values.
--- a/llvm/lib/Transforms/IPO/PassManagerBuilder.cpp
+++ b/llvm/lib/Transforms/IPO/PassManagerBuilder.cpp
@ -252,6 +252,11 @@ void PassManagerBuilder::populateModulePassManager(
  MPM.add(createMemCpyOptPass());             // Remove memcpy / form memset
  MPM.add(createSCCPPass());                  // Constant prop with SCCP

+  // Delete dead bit computations (instcombine runs after to fold away the dead
+  // computations, and then ADCE will run later to exploit any new DCE
+  // opportunities that creates).
+  MPM.add(createBitTrackingDCEPass());        // Delete dead bit computations
+
  // Run instcombine after redundancy elimination to exploit opportunities
  // opened up by them.
  MPM.add(createInstructionCombiningPass());
--- a/llvm/lib/Transforms/Scalar/BDCE.cpp
+++ b/llvm/lib/Transforms/Scalar/BDCE.cpp
@ -0,0 +1,408 @@
+//===---- BDCE.cpp - Bit-tracking dead code elimination -------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the Bit-Tracking Dead Code Elimination pass. Some
+// instructions (shifts, some ands, ors, etc.) kill some of their input bits.
+// We track these dead bits and remove instructions that compute only these
+// dead bits.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/DepthFirstIterator.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/AssumptionCache.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/CFG.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/InstIterator.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Operator.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "bdce"
+
+STATISTIC(NumRemoved, "Number of instructions removed (unused)");
+STATISTIC(NumSimplified, "Number of instructions trivialized (dead bits)");
+
+namespace {
+struct BDCE : public FunctionPass {
+  static char ID; // Pass identification, replacement for typeid
+  BDCE() : FunctionPass(ID) {
+    initializeBDCEPass(*PassRegistry::getPassRegistry());
+  }
+
+  bool runOnFunction(Function& F) override;
+
+  void getAnalysisUsage(AnalysisUsage& AU) const override {
+    AU.setPreservesCFG();
+    AU.addRequired<AssumptionCacheTracker>();
+    AU.addRequired<DominatorTreeWrapperPass>();
+  }
+
+  void determineLiveOperandBits(const Instruction *UserI,
+                                const Instruction *I, unsigned OperandNo,
+                                const APInt &AOut, APInt &AB,
+                                APInt &KnownZero, APInt &KnownOne,
+                                APInt &KnownZero2, APInt &KnownOne2);
+
+  AssumptionCache *AC;
+  const DataLayout *DL;
+  DominatorTree *DT;
+};
+}
+
+char BDCE::ID = 0;
+INITIALIZE_PASS_BEGIN(BDCE, "bdce", "Bit-Tracking Dead Code Elimination",
+                      false, false)
+INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_END(BDCE, "bdce", "Bit-Tracking Dead Code Elimination",
+                    false, false)
+
+static bool isAlwaysLive(Instruction *I) {
+  return isa<TerminatorInst>(I) || isa<DbgInfoIntrinsic>(I) ||
+         isa<LandingPadInst>(I) || I->mayHaveSideEffects();
+}
+
+void BDCE::determineLiveOperandBits(const Instruction *UserI,
+                                    const Instruction *I, unsigned OperandNo,
+                                    const APInt &AOut, APInt &AB,
+                                    APInt &KnownZero, APInt &KnownOne,
+                                    APInt &KnownZero2, APInt &KnownOne2) {
+  unsigned BitWidth = AB.getBitWidth();
+
+  // We're called once per operand, but for some instructions, we need to
+  // compute known bits of both operands in order to determine the live bits of
+  // either (when both operands are instructions themselves). We don't,
+  // however, want to do this twice, so we cache the result in APInts that live
+  // in the caller. For the two-relevant-operands case, both operand values are
+  // provided here.
+  auto ComputeKnownBits = [&](unsigned BitWidth, const Value *V1,
+                              const Value *V2) {
+    KnownZero = APInt(BitWidth, 0);
+    KnownOne =  APInt(BitWidth, 0);
+    computeKnownBits(const_cast<Value*>(V1), KnownZero, KnownOne, DL, 0, AC,
+                     UserI, DT);
+
+    if (V2) {
+      KnownZero2 = APInt(BitWidth, 0);
+      KnownOne2 =  APInt(BitWidth, 0);
+      computeKnownBits(const_cast<Value*>(V2), KnownZero2, KnownOne2, DL, 0, AC,
+                       UserI, DT);
+    }
+  };
+
+  switch (UserI->getOpcode()) {
+  default: break;
+  case Instruction::Call:
+  case Instruction::Invoke:
+    if (const IntrinsicInst *II = dyn_cast<IntrinsicInst>(UserI))
+      switch (II->getIntrinsicID()) {
+      default: break;
+      case Intrinsic::bswap:
+        // The alive bits of the input are the swapped alive bits of
+        // the output.
+        AB = AOut.byteSwap();
+        break;
+      case Intrinsic::ctlz:
+        if (OperandNo == 0) {
+          // We need some output bits, so we need all bits of the
+          // input to the left of, and including, the leftmost bit
+          // known to be one.
+          ComputeKnownBits(BitWidth, I, nullptr);
+          AB = APInt::getHighBitsSet(BitWidth,
+                 std::min(BitWidth, KnownOne.countLeadingZeros()+1));
+        }
+        break;
+      case Intrinsic::cttz:
+        if (OperandNo == 0) {
+          // We need some output bits, so we need all bits of the
+          // input to the right of, and including, the rightmost bit
+          // known to be one.
+          ComputeKnownBits(BitWidth, I, nullptr);
+          AB = APInt::getLowBitsSet(BitWidth,
+                 std::min(BitWidth, KnownOne.countTrailingZeros()+1));
+        }
+        break;
+      }
+    break;
+  case Instruction::Add:
+  case Instruction::Sub:
+    // Find the highest live output bit. We don't need any more input
+    // bits than that (adds, and thus subtracts, ripple only to the
+    // left).
+    AB = APInt::getLowBitsSet(BitWidth, AOut.getActiveBits());
+    break;
+  case Instruction::Shl:
+    if (OperandNo == 0)
+      if (ConstantInt *CI =
+            dyn_cast<ConstantInt>(UserI->getOperand(1))) {
+        uint64_t ShiftAmt = CI->getLimitedValue(BitWidth-1);
+        AB = AOut.lshr(ShiftAmt);
+
+        // If the shift is nuw/nsw, then the high bits are not dead
+        // (because we've promised that they *must* be zero).
+        const ShlOperator *S = cast<ShlOperator>(UserI);
+        if (S->hasNoSignedWrap())
+          AB |= APInt::getHighBitsSet(BitWidth, ShiftAmt+1);
+        else if (S->hasNoUnsignedWrap())
+          AB |= APInt::getHighBitsSet(BitWidth, ShiftAmt);
+      }
+    break;
+  case Instruction::LShr:
+    if (OperandNo == 0)
+      if (ConstantInt *CI =
+            dyn_cast<ConstantInt>(UserI->getOperand(1))) {
+        uint64_t ShiftAmt = CI->getLimitedValue(BitWidth-1);
+        AB = AOut.shl(ShiftAmt);
+
+        // If the shift is exact, then the low bits are not dead
+        // (they must be zero).
+        if (cast<LShrOperator>(UserI)->isExact())
+          AB |= APInt::getLowBitsSet(BitWidth, ShiftAmt);
+      }
+    break;
+  case Instruction::AShr:
+    if (OperandNo == 0)
+      if (ConstantInt *CI =
+            dyn_cast<ConstantInt>(UserI->getOperand(1))) {
+        uint64_t ShiftAmt = CI->getLimitedValue(BitWidth-1);
+        AB = AOut.shl(ShiftAmt);
+        // Because the high input bit is replicated into the
+        // high-order bits of the result, if we need any of those
+        // bits, then we must keep the highest input bit.
+        if ((AOut & APInt::getHighBitsSet(BitWidth, ShiftAmt))
+            .getBoolValue())
+          AB.setBit(BitWidth-1);
+
+        // If the shift is exact, then the low bits are not dead
+        // (they must be zero).
+        if (cast<AShrOperator>(UserI)->isExact())
+          AB |= APInt::getLowBitsSet(BitWidth, ShiftAmt);
+      }
+    break;
+  case Instruction::And:
+    AB = AOut;
+
+    // For bits that are known zero, the corresponding bits in the
+    // other operand are dead (unless they're both zero, in which
+    // case they can't both be dead, so just mark the LHS bits as
+    // dead).
+    if (OperandNo == 0) {
+      ComputeKnownBits(BitWidth, I, UserI->getOperand(1));
+      AB &= ~KnownZero2;
+    } else {
+      if (!isa<Instruction>(UserI->getOperand(0)))
+        ComputeKnownBits(BitWidth, UserI->getOperand(0), I);
+      AB &= ~(KnownZero & ~KnownZero2);
+    }
+    break;
+  case Instruction::Or:
+    AB = AOut;
+
+    // For bits that are known one, the corresponding bits in the
+    // other operand are dead (unless they're both one, in which
+    // case they can't both be dead, so just mark the LHS bits as
+    // dead).
+    if (OperandNo == 0) {
+      ComputeKnownBits(BitWidth, I, UserI->getOperand(1));
+      AB &= ~KnownOne2;
+    } else {
+      if (!isa<Instruction>(UserI->getOperand(0)))
+        ComputeKnownBits(BitWidth, UserI->getOperand(0), I);
+      AB &= ~(KnownOne & ~KnownOne2);
+    }
+    break;
+  case Instruction::Xor:
+  case Instruction::PHI:
+    AB = AOut;
+    break;
+  case Instruction::Trunc:
+    AB = AOut.zext(BitWidth);
+    break;
+  case Instruction::ZExt:
+    AB = AOut.trunc(BitWidth);
+    break;
+  case Instruction::SExt:
+    AB = AOut.trunc(BitWidth);
+    // Because the high input bit is replicated into the
+    // high-order bits of the result, if we need any of those
+    // bits, then we must keep the highest input bit.
+    if ((AOut & APInt::getHighBitsSet(AOut.getBitWidth(),
+                                      AOut.getBitWidth() - BitWidth))
+        .getBoolValue())
+      AB.setBit(BitWidth-1);
+    break;
+  case Instruction::Select:
+    if (OperandNo != 0)
+      AB = AOut;
+    break;
+  }
+}
+
+bool BDCE::runOnFunction(Function& F) {
+  if (skipOptnoneFunction(F))
+    return false;
+
+  AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
+  DL = F.getParent()->getDataLayout();
+  DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+
+  DenseMap<Instruction *, APInt> AliveBits;
+  SmallVector<Instruction*, 128> Worklist;
+
+  // The set of visited instructions (non-integer-typed only).
+  SmallPtrSet<Instruction*, 128> Visited;
+
+  // Collect the set of "root" instructions that are known live.
+  for (Instruction &I : inst_range(F)) {
+    if (!isAlwaysLive(&I))
+      continue;
+
+    // For integer-valued instructions, set up an initial empty set of alive
+    // bits and add the instruction to the work list. For other instructions
+    // add their operands to the work list (for integer values operands, mark
+    // all bits as live).
+    if (IntegerType *IT = dyn_cast<IntegerType>(I.getType())) {
+      AliveBits[&I] = APInt(IT->getBitWidth(), 0);
+      Worklist.push_back(&I);
+      continue;
+    }
+
+    // Non-integer-typed instructions...
+    for (Use &OI : I.operands()) {
+      if (Instruction *J = dyn_cast<Instruction>(OI)) {
+        if (IntegerType *IT = dyn_cast<IntegerType>(J->getType()))
+          AliveBits[J] = APInt::getAllOnesValue(IT->getBitWidth());
+        Worklist.push_back(J);
+      }
+    }
+    // To save memory, we don't add I to the Visited set here. Instead, we
+    // check isAlwaysLive on every instruction when searching for dead
+    // instructions later (we need to check isAlwaysLive for the
+    // integer-typed instructions anyway).
+  }
+
+  // Propagate liveness backwards to operands.
+  while (!Worklist.empty()) {
+    Instruction *UserI = Worklist.pop_back_val();
+
+    DEBUG(dbgs() << "BDCE: Visiting: " << *UserI);
+    APInt AOut;
+    if (UserI->getType()->isIntegerTy()) {
+      AOut = AliveBits[UserI];
+      DEBUG(dbgs() << " Alive Out: " << AOut);
+    }
+    DEBUG(dbgs() << "\n");
+
+    if (!UserI->getType()->isIntegerTy())
+      Visited.insert(UserI);
+
+    APInt KnownZero, KnownOne, KnownZero2, KnownOne2;
+    // Compute the set of alive bits for each operand. These are anded into the
+    // existing set, if any, and if that changes the set of alive bits, the
+    // operand is added to the work-list.
+    for (Use &OI : UserI->operands()) {
+      if (Instruction *I = dyn_cast<Instruction>(OI)) {
+        if (IntegerType *IT = dyn_cast<IntegerType>(I->getType())) {
+          unsigned BitWidth = IT->getBitWidth();
+          APInt AB = APInt::getAllOnesValue(BitWidth);
+          if (UserI->getType()->isIntegerTy() && !AOut &&
+              !isAlwaysLive(UserI)) {
+            AB = APInt(BitWidth, 0);
+          } else {
+            // If all bits of the output are dead, then all bits of the input 
+            // Bits of each operand that are used to compute alive bits of the
+            // output are alive, all others are dead.
+            determineLiveOperandBits(UserI, I, OI.getOperandNo(), AOut, AB,
+                                     KnownZero, KnownOne,
+                                     KnownZero2, KnownOne2);
+          }
+
+          // If we've added to the set of alive bits (or the operand has not
+          // been previously visited), then re-queue the operand to be visited
+          // again.
+          APInt ABPrev(BitWidth, 0);
+          auto ABI = AliveBits.find(I);
+          if (ABI != AliveBits.end())
+            ABPrev = ABI->second;
+
+          APInt ABNew = AB | ABPrev;
+          if (ABNew != ABPrev || ABI == AliveBits.end()) {
+            AliveBits[I] = std::move(ABNew);
+            Worklist.push_back(I);
+          }
+        } else if (!Visited.count(I)) {
+          Worklist.push_back(I);
+        }
+      }
+    }
+  }
+
+  bool Changed = false;
+  // The inverse of the live set is the dead set.  These are those instructions
+  // which have no side effects and do not influence the control flow or return
+  // value of the function, and may therefore be deleted safely.
+  // NOTE: We reuse the Worklist vector here for memory efficiency.
+  for (Instruction &I : inst_range(F)) {
+    // For live instructions that have all dead bits, first make them dead by
+    // replacing all uses with something else. Then, if they don't need to
+    // remain live (because they have side effects, etc.) we can remove them.
+    if (I.getType()->isIntegerTy()) {
+      auto ABI = AliveBits.find(&I);
+      if (ABI != AliveBits.end()) {
+        if (ABI->second.getBoolValue())
+          continue;
+
+        DEBUG(dbgs() << "BDCE: Trivializing: " << I << " (all bits dead)\n");
+        // FIXME: In theory we could substitute undef here instead of zero.
+        // This should be reconsidered once we settle on the semantics of
+        // undef, poison, etc.
+        Value *Zero = ConstantInt::get(I.getType(), 0);
+        ++NumSimplified;
+        I.replaceAllUsesWith(Zero);
+        Changed = true;
+      }
+    } else if (Visited.count(&I)) {
+      continue;
+    }
+
+    if (isAlwaysLive(&I))
+      continue;
+
+    DEBUG(dbgs() << "BDCE: Removing: " << I << " (unused)\n");
+    Worklist.push_back(&I);
+    I.dropAllReferences();
+    Changed = true;
+  }
+
+  for (Instruction *&I : Worklist) {
+    ++NumRemoved;
+    I->eraseFromParent();
+  }
+
+  return Changed;
+}
+
+FunctionPass *llvm::createBitTrackingDCEPass() {
+  return new BDCE();
+}
+
--- a/llvm/lib/Transforms/Scalar/CMakeLists.txt
+++ b/llvm/lib/Transforms/Scalar/CMakeLists.txt
@ -1,6 +1,7 @@
 add_llvm_library(LLVMScalarOpts
  ADCE.cpp
  AlignmentFromAssumptions.cpp
+  BDCE.cpp
  ConstantHoisting.cpp
  ConstantProp.cpp
  CorrelatedValuePropagation.cpp
--- a/llvm/lib/Transforms/Scalar/Scalar.cpp
+++ b/llvm/lib/Transforms/Scalar/Scalar.cpp
@ -28,6 +28,7 @@ using namespace llvm;
 /// ScalarOpts library.
 void llvm::initializeScalarOpts(PassRegistry &Registry) {
  initializeADCEPass(Registry);
+  initializeBDCEPass(Registry);
  initializeAlignmentFromAssumptionsPass(Registry);
  initializeSampleProfileLoaderPass(Registry);
  initializeConstantHoistingPass(Registry);
@ -83,6 +84,10 @@ void LLVMAddAggressiveDCEPass(LLVMPassManagerRef PM) {
  unwrap(PM)->add(createAggressiveDCEPass());
 }

+void LLVMAddBitTrackingDCEPass(LLVMPassManagerRef PM) {
+  unwrap(PM)->add(createBitTrackingDCEPass());
+}
+
 void LLVMAddAlignmentFromAssumptionsPass(LLVMPassManagerRef PM) {
  unwrap(PM)->add(createAlignmentFromAssumptionsPass());
 }
--- a/llvm/test/Transforms/BDCE/basic.ll
+++ b/llvm/test/Transforms/BDCE/basic.ll
@ -0,0 +1,348 @@
+; RUN: opt -S -bdce -instsimplify < %s | FileCheck %s
+; RUN: opt -S -instsimplify < %s | FileCheck %s -check-prefix=CHECK-IO
+target datalayout = "E-m:e-i64:64-n32:64"
+target triple = "powerpc64-unknown-linux-gnu"
+
+; Function Attrs: nounwind readnone
+define signext i32 @bar(i32 signext %x) #0 {
+entry:
+  %call = tail call signext i32 @foo(i32 signext 5) #0
+  %and = and i32 %call, 4
+  %or = or i32 %and, %x
+  %call1 = tail call signext i32 @foo(i32 signext 3) #0
+  %and2 = and i32 %call1, 8
+  %or3 = or i32 %or, %and2
+  %call4 = tail call signext i32 @foo(i32 signext 2) #0
+  %and5 = and i32 %call4, 16
+  %or6 = or i32 %or3, %and5
+  %call7 = tail call signext i32 @foo(i32 signext 1) #0
+  %and8 = and i32 %call7, 32
+  %or9 = or i32 %or6, %and8
+  %call10 = tail call signext i32 @foo(i32 signext 0) #0
+  %and11 = and i32 %call10, 64
+  %or12 = or i32 %or9, %and11
+  %call13 = tail call signext i32 @foo(i32 signext 4) #0
+  %and14 = and i32 %call13, 128
+  %or15 = or i32 %or12, %and14
+  %shr = ashr i32 %or15, 4
+  ret i32 %shr
+
+; CHECK-LABEL: @bar
+; CHECK-NOT: tail call signext i32 @foo(i32 signext 5)
+; CHECK-NOT: tail call signext i32 @foo(i32 signext 3)
+; CHECK: tail call signext i32 @foo(i32 signext 2)
+; CHECK: tail call signext i32 @foo(i32 signext 1)
+; CHECK: tail call signext i32 @foo(i32 signext 0)
+; CHECK: tail call signext i32 @foo(i32 signext 4)
+; CHECK: ret i32
+
+; Check that instsimplify is not doing this all on its own.
+; CHECK-IO-LABEL: @bar
+; CHECK-IO: tail call signext i32 @foo(i32 signext 5)
+; CHECK-IO: tail call signext i32 @foo(i32 signext 3)
+; CHECK-IO: tail call signext i32 @foo(i32 signext 2)
+; CHECK-IO: tail call signext i32 @foo(i32 signext 1)
+; CHECK-IO: tail call signext i32 @foo(i32 signext 0)
+; CHECK-IO: tail call signext i32 @foo(i32 signext 4)
+; CHECK-IO: ret i32
+}
+
+; Function Attrs: nounwind readnone
+declare signext i32 @foo(i32 signext) #0
+
+; Function Attrs: nounwind readnone
+define signext i32 @far(i32 signext %x) #1 {
+entry:
+  %call = tail call signext i32 @goo(i32 signext 5) #1
+  %and = and i32 %call, 4
+  %or = or i32 %and, %x
+  %call1 = tail call signext i32 @goo(i32 signext 3) #1
+  %and2 = and i32 %call1, 8
+  %or3 = or i32 %or, %and2
+  %call4 = tail call signext i32 @goo(i32 signext 2) #1
+  %and5 = and i32 %call4, 16
+  %or6 = or i32 %or3, %and5
+  %call7 = tail call signext i32 @goo(i32 signext 1) #1
+  %and8 = and i32 %call7, 32
+  %or9 = or i32 %or6, %and8
+  %call10 = tail call signext i32 @goo(i32 signext 0) #1
+  %and11 = and i32 %call10, 64
+  %or12 = or i32 %or9, %and11
+  %call13 = tail call signext i32 @goo(i32 signext 4) #1
+  %and14 = and i32 %call13, 128
+  %or15 = or i32 %or12, %and14
+  %shr = ashr i32 %or15, 4
+  ret i32 %shr
+
+; CHECK-LABEL: @far
+; Calls to foo(5) and foo(3) are still there, but their results are not used.
+; CHECK: tail call signext i32 @goo(i32 signext 5)
+; CHECK-NEXT: tail call signext i32 @goo(i32 signext 3)
+; CHECK-NEXT: tail call signext i32 @goo(i32 signext 2)
+; CHECK: tail call signext i32 @goo(i32 signext 1)
+; CHECK: tail call signext i32 @goo(i32 signext 0)
+; CHECK: tail call signext i32 @goo(i32 signext 4)
+; CHECK: ret i32
+
+; Check that instsimplify is not doing this all on its own.
+; CHECK-IO-LABEL: @far
+; CHECK-IO: tail call signext i32 @goo(i32 signext 5)
+; CHECK-IO: tail call signext i32 @goo(i32 signext 3)
+; CHECK-IO: tail call signext i32 @goo(i32 signext 2)
+; CHECK-IO: tail call signext i32 @goo(i32 signext 1)
+; CHECK-IO: tail call signext i32 @goo(i32 signext 0)
+; CHECK-IO: tail call signext i32 @goo(i32 signext 4)
+; CHECK-IO: ret i32
+}
+
+declare signext i32 @goo(i32 signext) #1
+
+; Function Attrs: nounwind readnone
+define signext i32 @tar1(i32 signext %x) #0 {
+entry:
+  %call = tail call signext i32 @foo(i32 signext 5) #0
+  %and = and i32 %call, 33554432
+  %or = or i32 %and, %x
+  %call1 = tail call signext i32 @foo(i32 signext 3) #0
+  %and2 = and i32 %call1, 67108864
+  %or3 = or i32 %or, %and2
+  %call4 = tail call signext i32 @foo(i32 signext 2) #0
+  %and5 = and i32 %call4, 16
+  %or6 = or i32 %or3, %and5
+  %call7 = tail call signext i32 @foo(i32 signext 1) #0
+  %and8 = and i32 %call7, 32
+  %or9 = or i32 %or6, %and8
+  %call10 = tail call signext i32 @foo(i32 signext 0) #0
+  %and11 = and i32 %call10, 64
+  %or12 = or i32 %or9, %and11
+  %call13 = tail call signext i32 @foo(i32 signext 4) #0
+  %and14 = and i32 %call13, 128
+  %or15 = or i32 %or12, %and14
+  %bs = tail call i32 @llvm.bswap.i32(i32 %or15) #0
+  %shr = ashr i32 %bs, 4
+  ret i32 %shr
+
+; CHECK-LABEL: @tar1
+; CHECK-NOT: tail call signext i32 @foo(i32 signext 5)
+; CHECK-NOT: tail call signext i32 @foo(i32 signext 3)
+; CHECK: tail call signext i32 @foo(i32 signext 2)
+; CHECK: tail call signext i32 @foo(i32 signext 1)
+; CHECK: tail call signext i32 @foo(i32 signext 0)
+; CHECK: tail call signext i32 @foo(i32 signext 4)
+; CHECK: ret i32
+}
+
+; Function Attrs: nounwind readnone
+declare i32 @llvm.bswap.i32(i32) #0
+
+; Function Attrs: nounwind readnone
+define signext i32 @tar2(i32 signext %x) #0 {
+entry:
+  %call = tail call signext i32 @foo(i32 signext 5) #0
+  %and = and i32 %call, 33554432
+  %or = or i32 %and, %x
+  %call1 = tail call signext i32 @foo(i32 signext 3) #0
+  %and2 = and i32 %call1, 67108864
+  %or3 = or i32 %or, %and2
+  %call4 = tail call signext i32 @foo(i32 signext 2) #0
+  %and5 = and i32 %call4, 16
+  %or6 = or i32 %or3, %and5
+  %call7 = tail call signext i32 @foo(i32 signext 1) #0
+  %and8 = and i32 %call7, 32
+  %or9 = or i32 %or6, %and8
+  %call10 = tail call signext i32 @foo(i32 signext 0) #0
+  %and11 = and i32 %call10, 64
+  %or12 = or i32 %or9, %and11
+  %call13 = tail call signext i32 @foo(i32 signext 4) #0
+  %and14 = and i32 %call13, 128
+  %or15 = or i32 %or12, %and14
+  %shl = shl i32 %or15, 10
+  ret i32 %shl
+
+; CHECK-LABEL: @tar2
+; CHECK-NOT: tail call signext i32 @foo(i32 signext 5)
+; CHECK-NOT: tail call signext i32 @foo(i32 signext 3)
+; CHECK: tail call signext i32 @foo(i32 signext 2)
+; CHECK: tail call signext i32 @foo(i32 signext 1)
+; CHECK: tail call signext i32 @foo(i32 signext 0)
+; CHECK: tail call signext i32 @foo(i32 signext 4)
+; CHECK: ret i32
+}
+
+; Function Attrs: nounwind readnone
+define signext i32 @tar3(i32 signext %x) #0 {
+entry:
+  %call = tail call signext i32 @foo(i32 signext 5) #0
+  %and = and i32 %call, 33554432
+  %or = or i32 %and, %x
+  %call1 = tail call signext i32 @foo(i32 signext 3) #0
+  %and2 = and i32 %call1, 67108864
+  %or3 = or i32 %or, %and2
+  %call4 = tail call signext i32 @foo(i32 signext 2) #0
+  %and5 = and i32 %call4, 16
+  %or6 = or i32 %or3, %and5
+  %call7 = tail call signext i32 @foo(i32 signext 1) #0
+  %and8 = and i32 %call7, 32
+  %or9 = or i32 %or6, %and8
+  %call10 = tail call signext i32 @foo(i32 signext 0) #0
+  %and11 = and i32 %call10, 64
+  %or12 = or i32 %or9, %and11
+  %call13 = tail call signext i32 @foo(i32 signext 4) #0
+  %and14 = and i32 %call13, 128
+  %or15 = or i32 %or12, %and14
+  %add = add i32 %or15, 5
+  %shl = shl i32 %add, 10
+  ret i32 %shl
+
+; CHECK-LABEL: @tar3
+; CHECK-NOT: tail call signext i32 @foo(i32 signext 5)
+; CHECK-NOT: tail call signext i32 @foo(i32 signext 3)
+; CHECK: tail call signext i32 @foo(i32 signext 2)
+; CHECK: tail call signext i32 @foo(i32 signext 1)
+; CHECK: tail call signext i32 @foo(i32 signext 0)
+; CHECK: tail call signext i32 @foo(i32 signext 4)
+; CHECK: ret i32
+}
+
+; Function Attrs: nounwind readnone
+define signext i32 @tar4(i32 signext %x) #0 {
+entry:
+  %call = tail call signext i32 @foo(i32 signext 5) #0
+  %and = and i32 %call, 33554432
+  %or = or i32 %and, %x
+  %call1 = tail call signext i32 @foo(i32 signext 3) #0
+  %and2 = and i32 %call1, 67108864
+  %or3 = or i32 %or, %and2
+  %call4 = tail call signext i32 @foo(i32 signext 2) #0
+  %and5 = and i32 %call4, 16
+  %or6 = or i32 %or3, %and5
+  %call7 = tail call signext i32 @foo(i32 signext 1) #0
+  %and8 = and i32 %call7, 32
+  %or9 = or i32 %or6, %and8
+  %call10 = tail call signext i32 @foo(i32 signext 0) #0
+  %and11 = and i32 %call10, 64
+  %or12 = or i32 %or9, %and11
+  %call13 = tail call signext i32 @foo(i32 signext 4) #0
+  %and14 = and i32 %call13, 128
+  %or15 = or i32 %or12, %and14
+  %sub = sub i32 %or15, 5
+  %shl = shl i32 %sub, 10
+  ret i32 %shl
+
+; CHECK-LABEL: @tar4
+; CHECK-NOT: tail call signext i32 @foo(i32 signext 5)
+; CHECK-NOT: tail call signext i32 @foo(i32 signext 3)
+; CHECK: tail call signext i32 @foo(i32 signext 2)
+; CHECK: tail call signext i32 @foo(i32 signext 1)
+; CHECK: tail call signext i32 @foo(i32 signext 0)
+; CHECK: tail call signext i32 @foo(i32 signext 4)
+; CHECK: ret i32
+}
+
+; Function Attrs: nounwind readnone
+define signext i32 @tar5(i32 signext %x) #0 {
+entry:
+  %call = tail call signext i32 @foo(i32 signext 5) #0
+  %and = and i32 %call, 33554432
+  %or = or i32 %and, %x
+  %call1 = tail call signext i32 @foo(i32 signext 3) #0
+  %and2 = and i32 %call1, 67108864
+  %or3 = or i32 %or, %and2
+  %call4 = tail call signext i32 @foo(i32 signext 2) #0
+  %and5 = and i32 %call4, 16
+  %or6 = or i32 %or3, %and5
+  %call7 = tail call signext i32 @foo(i32 signext 1) #0
+  %and8 = and i32 %call7, 32
+  %or9 = or i32 %or6, %and8
+  %call10 = tail call signext i32 @foo(i32 signext 0) #0
+  %and11 = and i32 %call10, 64
+  %or12 = or i32 %or9, %and11
+  %call13 = tail call signext i32 @foo(i32 signext 4) #0
+  %and14 = and i32 %call13, 128
+  %or15 = or i32 %or12, %and14
+  %xor = xor i32 %or15, 5
+  %shl = shl i32 %xor, 10
+  ret i32 %shl
+
+; CHECK-LABEL: @tar5
+; CHECK-NOT: tail call signext i32 @foo(i32 signext 5)
+; CHECK-NOT: tail call signext i32 @foo(i32 signext 3)
+; CHECK: tail call signext i32 @foo(i32 signext 2)
+; CHECK: tail call signext i32 @foo(i32 signext 1)
+; CHECK: tail call signext i32 @foo(i32 signext 0)
+; CHECK: tail call signext i32 @foo(i32 signext 4)
+; CHECK: ret i32
+}
+
+; Function Attrs: nounwind readnone
+define signext i32 @tar7(i32 signext %x, i1 %b) #0 {
+entry:
+  %call = tail call signext i32 @foo(i32 signext 5) #0
+  %and = and i32 %call, 33554432
+  %or = or i32 %and, %x
+  %call1 = tail call signext i32 @foo(i32 signext 3) #0
+  %and2 = and i32 %call1, 67108864
+  %or3 = or i32 %or, %and2
+  %call4 = tail call signext i32 @foo(i32 signext 2) #0
+  %and5 = and i32 %call4, 16
+  %or6 = or i32 %or3, %and5
+  %call7 = tail call signext i32 @foo(i32 signext 1) #0
+  %and8 = and i32 %call7, 32
+  %or9 = or i32 %or6, %and8
+  %call10 = tail call signext i32 @foo(i32 signext 0) #0
+  %and11 = and i32 %call10, 64
+  %or12 = or i32 %or9, %and11
+  %call13 = tail call signext i32 @foo(i32 signext 4) #0
+  %and14 = and i32 %call13, 128
+  %or15 = or i32 %or12, %and14
+  %v = select i1 %b, i32 %or15, i32 5
+  %shl = shl i32 %v, 10
+  ret i32 %shl
+
+; CHECK-LABEL: @tar7
+; CHECK-NOT: tail call signext i32 @foo(i32 signext 5)
+; CHECK-NOT: tail call signext i32 @foo(i32 signext 3)
+; CHECK: tail call signext i32 @foo(i32 signext 2)
+; CHECK: tail call signext i32 @foo(i32 signext 1)
+; CHECK: tail call signext i32 @foo(i32 signext 0)
+; CHECK: tail call signext i32 @foo(i32 signext 4)
+; CHECK: ret i32
+}
+
+; Function Attrs: nounwind readnone
+define signext i16 @tar8(i32 signext %x) #0 {
+entry:
+  %call = tail call signext i32 @foo(i32 signext 5) #0
+  %and = and i32 %call, 33554432
+  %or = or i32 %and, %x
+  %call1 = tail call signext i32 @foo(i32 signext 3) #0
+  %and2 = and i32 %call1, 67108864
+  %or3 = or i32 %or, %and2
+  %call4 = tail call signext i32 @foo(i32 signext 2) #0
+  %and5 = and i32 %call4, 16
+  %or6 = or i32 %or3, %and5
+  %call7 = tail call signext i32 @foo(i32 signext 1) #0
+  %and8 = and i32 %call7, 32
+  %or9 = or i32 %or6, %and8
+  %call10 = tail call signext i32 @foo(i32 signext 0) #0
+  %and11 = and i32 %call10, 64
+  %or12 = or i32 %or9, %and11
+  %call13 = tail call signext i32 @foo(i32 signext 4) #0
+  %and14 = and i32 %call13, 128
+  %or15 = or i32 %or12, %and14
+  %tr = trunc i32 %or15 to i16
+  ret i16 %tr
+
+; CHECK-LABEL: @tar8
+; CHECK-NOT: tail call signext i32 @foo(i32 signext 5)
+; CHECK-NOT: tail call signext i32 @foo(i32 signext 3)
+; CHECK: tail call signext i32 @foo(i32 signext 2)
+; CHECK: tail call signext i32 @foo(i32 signext 1)
+; CHECK: tail call signext i32 @foo(i32 signext 0)
+; CHECK: tail call signext i32 @foo(i32 signext 4)
+; CHECK: ret i16
+}
+
+attributes #0 = { nounwind readnone }
+attributes #1 = { nounwind }
+
--- a/llvm/test/Transforms/BDCE/dce-pure.ll
+++ b/llvm/test/Transforms/BDCE/dce-pure.ll
@ -0,0 +1,33 @@
+; RUN: opt -bdce -S < %s | FileCheck %s
+
+declare i32 @strlen(i8*) readonly nounwind
+
+define void @test1() {
+  call i32 @strlen( i8* null )
+  ret void
+
+; CHECK-LABEL: @test1
+; CHECK-NOT: call
+; CHECK: ret void
+}
+
+define i32 @test2() {
+  ; invoke of pure function should not be deleted!
+  invoke i32 @strlen( i8* null ) readnone
+                  to label %Cont unwind label %Other
+
+Cont:           ; preds = %0
+  ret i32 0
+
+Other:          ; preds = %0
+   %exn = landingpad {i8*, i32} personality i32 (...)* @__gxx_personality_v0
+            cleanup
+  ret i32 1
+
+; CHECK-LABEL: @test2
+; CHECK: invoke
+; CHECK: ret i32 1
+}
+
+declare i32 @__gxx_personality_v0(...)
+