Commoning of target-specific load/store intrinsics in Early CSE.

Phabricator revision: http://reviews.llvm.org/D7121
Patch by Sanjin Sijaric <ssijaric@codeaurora.org>!

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@227149 91177308-0d34-0410-b5e6-96231b3b80d8
Chad Rosier 2015-01-26 22:51:15 +00:00
parent f96362358f
commit 13faabb6c5
6 changed files with 486 additions and 29 deletions
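For context: before this change, EarlyCSE only understood plain load and store instructions, so target load/store intrinsics such as the AArch64 NEON ld2/ld3/ld4 and st2/st3/st4 could never be commoned, forwarded from, or dead-store-eliminated. The patch adds two TargetTransformInfo hooks (getTgtMemIntrinsic and getOrCreateResultFromMemIntrinsic) and teaches EarlyCSE to consult them. As a rough illustration (not part of the patch; the function name is invented, and the exact lowering depends on the frontend and flags), this is the kind of source pattern the change lets EarlyCSE clean up:

```cpp
// Illustrative sketch only. With clang targeting AArch64, the <arm_neon.h>
// intrinsics below lower to llvm.aarch64.neon.st2 / llvm.aarch64.neon.ld2,
// so after this patch EarlyCSE can forward the stored vectors to the reload
// and delete the redundant ld2 call.
#include <arm_neon.h>

int32x4_t store_then_reload(int32_t *a, int32x4x2_t v) {
  vst2q_s32(a, v);               // becomes @llvm.aarch64.neon.st2
  int32x4x2_t r = vld2q_s32(a);  // reloads exactly what was just stored
  return vaddq_s32(r.val[0], r.val[1]);
}
```

The new AArch64 test at the end of this commit exercises the same patterns directly at the IR level.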


@@ -23,6 +23,7 @@
#define LLVM_ANALYSIS_TARGETTRANSFORMINFO_H
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/Pass.h"
#include "llvm/Support/DataTypes.h"
@@ -35,6 +36,20 @@ class Type;
class User;
class Value;
/// \brief Information about a load/store intrinsic defined by the target.
struct MemIntrinsicInfo {
MemIntrinsicInfo()
: ReadMem(false), WriteMem(false), Vol(false), MatchingId(0),
NumMemRefs(0), PtrVal(nullptr) {}
bool ReadMem;
bool WriteMem;
bool Vol;
// The target sets the same Id for corresponding load/store intrinsics.
unsigned short MatchingId;
int NumMemRefs;
Value *PtrVal;
};
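The fields describe just enough for EarlyCSE to treat a target intrinsic like an ordinary load or store: what it reads/writes, whether it is volatile, the pointer it accesses, and a MatchingId that pairs up corresponding load and store intrinsics (EarlyCSE only uses the information when NumMemRefs is 1). The AArch64 hooks later in this patch are the real in-tree example; purely as a sketch of the contract, a hypothetical out-of-tree target could fill the struct in roughly like this (the intrinsic IDs, the MyTTI class, and MY_PAIR_LDST are invented for illustration and will not compile as written):

```cpp
// Sketch only -- hypothetical intrinsics; see AArch64TTI::getTgtMemIntrinsic
// further down in this patch for the real implementation.
bool MyTTI::getTgtMemIntrinsic(IntrinsicInst *Inst,
                               MemIntrinsicInfo &Info) const {
  switch (Inst->getIntrinsicID()) {
  default:
    return false;                          // not a memory intrinsic we handle
  case Intrinsic::mytgt_load_pair:         // hypothetical paired load
    Info.ReadMem = true;
    Info.WriteMem = false;
    Info.Vol = false;
    Info.NumMemRefs = 1;                   // touches exactly one location
    Info.PtrVal = Inst->getArgOperand(0);  // pointer being loaded from
    Info.MatchingId = MY_PAIR_LDST;        // same Id as the matching store
    return true;
  case Intrinsic::mytgt_store_pair:        // hypothetical paired store
    Info.ReadMem = false;
    Info.WriteMem = true;
    Info.Vol = false;
    Info.NumMemRefs = 1;
    Info.PtrVal = Inst->getArgOperand(Inst->getNumArgOperands() - 1);
    Info.MatchingId = MY_PAIR_LDST;        // lets EarlyCSE pair it with the load
    return true;
  }
}
```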
/// TargetTransformInfo - This pass provides access to the codegen
/// interfaces that are needed for IR-level transformations.
class TargetTransformInfo {
@@ -443,6 +458,20 @@ public:
/// any callee-saved registers, so would require a spill and fill.
virtual unsigned getCostOfKeepingLiveOverCall(ArrayRef<Type*> Tys) const;
/// \returns True if the intrinsic is a supported memory intrinsic. On
/// success, Info contains additional details: whether the intrinsic may
/// read or write memory, whether it is volatile, and the pointer it
/// accesses. Info is undefined if false is returned.
virtual bool getTgtMemIntrinsic(IntrinsicInst *Inst,
MemIntrinsicInfo &Info) const;
/// \returns A value that represents the result of the given memory intrinsic,
/// creating new instructions to extract it from the intrinsic memory
/// operation if necessary. Returns nullptr if the target cannot create a
/// result of the expected type from the given intrinsic.
virtual Value *getOrCreateResultFromMemIntrinsic(IntrinsicInst *Inst,
Type *ExpectedType) const;
/// @}
/// Analysis group identification.


@@ -254,6 +254,16 @@ unsigned TargetTransformInfo::getCostOfKeepingLiveOverCall(ArrayRef<Type*> Tys)
return PrevTTI->getCostOfKeepingLiveOverCall(Tys);
}
Value *TargetTransformInfo::getOrCreateResultFromMemIntrinsic(
IntrinsicInst *Inst, Type *ExpectedType) const {
return PrevTTI->getOrCreateResultFromMemIntrinsic(Inst, ExpectedType);
}
bool TargetTransformInfo::getTgtMemIntrinsic(IntrinsicInst *Inst,
MemIntrinsicInfo &Info) const {
return PrevTTI->getTgtMemIntrinsic(Inst, Info);
}
namespace {
struct NoTTI final : ImmutablePass, TargetTransformInfo {
@@ -656,6 +666,15 @@ struct NoTTI final : ImmutablePass, TargetTransformInfo {
return 0;
}
bool getTgtMemIntrinsic(IntrinsicInst *Inst,
MemIntrinsicInfo &Info) const override {
return false;
}
Value *getOrCreateResultFromMemIntrinsic(IntrinsicInst *Inst,
Type *ExpectedType) const override {
return nullptr;
}
};
} // end anonymous namespace


@@ -44,6 +44,12 @@ class AArch64TTI final : public ImmutablePass, public TargetTransformInfo {
/// are set if the result needs to be inserted and/or extracted from vectors.
unsigned getScalarizationOverhead(Type *Ty, bool Insert, bool Extract) const;
enum MemIntrinsicType {
VECTOR_LDST_TWO_ELEMENTS,
VECTOR_LDST_THREE_ELEMENTS,
VECTOR_LDST_FOUR_ELEMENTS
};
public:
AArch64TTI() : ImmutablePass(ID), TM(nullptr), ST(nullptr), TLI(nullptr) {
llvm_unreachable("This pass cannot be directly constructed");
@@ -131,6 +137,11 @@ public:
void getUnrollingPreferences(const Function *F, Loop *L,
UnrollingPreferences &UP) const override;
Value *getOrCreateResultFromMemIntrinsic(IntrinsicInst *Inst,
Type *ExpectedType) const override;
bool getTgtMemIntrinsic(IntrinsicInst *Inst,
MemIntrinsicInfo &Info) const override;
/// @}
};
@@ -554,3 +565,83 @@ void AArch64TTI::getUnrollingPreferences(const Function *F, Loop *L,
// Disable partial & runtime unrolling on -Os.
UP.PartialOptSizeThreshold = 0;
}
Value *AArch64TTI::getOrCreateResultFromMemIntrinsic(IntrinsicInst *Inst,
Type *ExpectedType) const {
switch (Inst->getIntrinsicID()) {
default:
return nullptr;
case Intrinsic::aarch64_neon_st2:
case Intrinsic::aarch64_neon_st3:
case Intrinsic::aarch64_neon_st4: {
// The expected result must be a struct type whose element types match the
// values being stored.
StructType *ST = dyn_cast<StructType>(ExpectedType);
if (!ST)
return nullptr;
unsigned NumElts = Inst->getNumArgOperands() - 1;
if (ST->getNumElements() != NumElts)
return nullptr;
for (unsigned i = 0, e = NumElts; i != e; ++i) {
if (Inst->getArgOperand(i)->getType() != ST->getElementType(i))
return nullptr;
}
Value *Res = UndefValue::get(ExpectedType);
IRBuilder<> Builder(Inst);
for (unsigned i = 0, e = NumElts; i != e; ++i) {
Value *L = Inst->getArgOperand(i);
Res = Builder.CreateInsertValue(Res, L, i);
}
return Res;
}
case Intrinsic::aarch64_neon_ld2:
case Intrinsic::aarch64_neon_ld3:
case Intrinsic::aarch64_neon_ld4:
if (Inst->getType() == ExpectedType)
return Inst;
return nullptr;
}
}
bool AArch64TTI::getTgtMemIntrinsic(IntrinsicInst *Inst,
MemIntrinsicInfo &Info) const {
switch (Inst->getIntrinsicID()) {
default:
break;
case Intrinsic::aarch64_neon_ld2:
case Intrinsic::aarch64_neon_ld3:
case Intrinsic::aarch64_neon_ld4:
Info.ReadMem = true;
Info.WriteMem = false;
Info.Vol = false;
Info.NumMemRefs = 1;
Info.PtrVal = Inst->getArgOperand(0);
break;
case Intrinsic::aarch64_neon_st2:
case Intrinsic::aarch64_neon_st3:
case Intrinsic::aarch64_neon_st4:
Info.ReadMem = false;
Info.WriteMem = true;
Info.Vol = false;
Info.NumMemRefs = 1;
Info.PtrVal = Inst->getArgOperand(Inst->getNumArgOperands() - 1);
break;
}
switch (Inst->getIntrinsicID()) {
default:
return false;
case Intrinsic::aarch64_neon_ld2:
case Intrinsic::aarch64_neon_st2:
Info.MatchingId = VECTOR_LDST_TWO_ELEMENTS;
break;
case Intrinsic::aarch64_neon_ld3:
case Intrinsic::aarch64_neon_st3:
Info.MatchingId = VECTOR_LDST_THREE_ELEMENTS;
break;
case Intrinsic::aarch64_neon_ld4:
case Intrinsic::aarch64_neon_st4:
Info.MatchingId = VECTOR_LDST_FOUR_ELEMENTS;
break;
}
return true;
}


@@ -18,6 +18,7 @@
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/InstructionSimplify.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Instructions.h"
@@ -273,6 +274,7 @@ class EarlyCSE : public FunctionPass {
public:
const DataLayout *DL;
const TargetLibraryInfo *TLI;
const TargetTransformInfo *TTI;
DominatorTree *DT;
AssumptionCache *AC;
typedef RecyclingAllocator<
@@ -383,14 +385,83 @@ private:
bool Processed;
};
/// \brief Wrapper class to handle memory instructions, including loads,
/// stores and intrinsic loads and stores defined by the target.
class ParseMemoryInst {
public:
ParseMemoryInst(Instruction *Inst, const TargetTransformInfo *TTI)
: Load(false), Store(false), Vol(false), MayReadFromMemory(false),
MayWriteToMemory(false), MatchingId(-1), Ptr(nullptr) {
MayReadFromMemory = Inst->mayReadFromMemory();
MayWriteToMemory = Inst->mayWriteToMemory();
if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(Inst)) {
MemIntrinsicInfo Info;
if (!TTI->getTgtMemIntrinsic(II, Info))
return;
if (Info.NumMemRefs == 1) {
Store = Info.WriteMem;
Load = Info.ReadMem;
MatchingId = Info.MatchingId;
MayReadFromMemory = Info.ReadMem;
MayWriteToMemory = Info.WriteMem;
Vol = Info.Vol;
Ptr = Info.PtrVal;
}
} else if (LoadInst *LI = dyn_cast<LoadInst>(Inst)) {
Load = true;
Vol = !LI->isSimple();
Ptr = LI->getPointerOperand();
} else if (StoreInst *SI = dyn_cast<StoreInst>(Inst)) {
Store = true;
Vol = !SI->isSimple();
Ptr = SI->getPointerOperand();
}
}
bool isLoad() { return Load; }
bool isStore() { return Store; }
bool isVolatile() { return Vol; }
bool isMatchingMemLoc(const ParseMemoryInst &Inst) {
return Ptr == Inst.Ptr && MatchingId == Inst.MatchingId;
}
bool isValid() { return Ptr != nullptr; }
int getMatchingId() { return MatchingId; }
Value *getPtr() { return Ptr; }
bool mayReadFromMemory() { return MayReadFromMemory; }
bool mayWriteToMemory() { return MayWriteToMemory; }
private:
bool Load;
bool Store;
bool Vol;
bool MayReadFromMemory;
bool MayWriteToMemory;
// For regular (non-intrinsic) loads/stores, this is set to -1. For
// intrinsic loads/stores, the id is retrieved from the corresponding
// field in the MemIntrinsicInfo structure. That field contains
// non-negative values only.
int MatchingId;
Value *Ptr;
};
bool processNode(DomTreeNode *Node);
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.addRequired<AssumptionCacheTracker>();
AU.addRequired<DominatorTreeWrapperPass>();
AU.addRequired<TargetLibraryInfoWrapperPass>();
AU.addRequired<TargetTransformInfo>();
AU.setPreservesCFG();
}
Value *getOrCreateResult(Value *Inst, Type *ExpectedType) const {
if (LoadInst *LI = dyn_cast<LoadInst>(Inst))
return LI;
else if (StoreInst *SI = dyn_cast<StoreInst>(Inst))
return SI->getValueOperand();
assert(isa<IntrinsicInst>(Inst) && "Instruction not supported");
return TTI->getOrCreateResultFromMemIntrinsic(cast<IntrinsicInst>(Inst),
ExpectedType);
}
};
}
@@ -420,7 +491,7 @@ bool EarlyCSE::processNode(DomTreeNode *Node) {
/// as long as there is no instruction that reads memory. If we see a store
/// to the same location, we delete the dead store. This zaps trivial dead
/// stores which can occur in bitfield code among other things.
StoreInst *LastStore = nullptr;
Instruction *LastStore = nullptr;
bool Changed = false;
@@ -475,10 +546,11 @@ bool EarlyCSE::processNode(DomTreeNode *Node) {
continue;
}
ParseMemoryInst MemInst(Inst, TTI);
// If this is a non-volatile load, process it.
if (LoadInst *LI = dyn_cast<LoadInst>(Inst)) {
if (MemInst.isValid() && MemInst.isLoad()) {
// Ignore volatile loads.
if (!LI->isSimple()) {
if (MemInst.isVolatile()) {
LastStore = nullptr;
continue;
}
@@ -486,27 +558,35 @@ bool EarlyCSE::processNode(DomTreeNode *Node) {
// If we have an available version of this load, and if it is the right
// generation, replace this instruction.
std::pair<Value *, unsigned> InVal =
AvailableLoads->lookup(Inst->getOperand(0));
AvailableLoads->lookup(MemInst.getPtr());
if (InVal.first != nullptr && InVal.second == CurrentGeneration) {
DEBUG(dbgs() << "EarlyCSE CSE LOAD: " << *Inst
<< " to: " << *InVal.first << '\n');
if (!Inst->use_empty())
Inst->replaceAllUsesWith(InVal.first);
Inst->eraseFromParent();
Changed = true;
++NumCSELoad;
continue;
Value *Op = getOrCreateResult(InVal.first, Inst->getType());
if (Op != nullptr) {
DEBUG(dbgs() << "EarlyCSE CSE LOAD: " << *Inst
<< " to: " << *InVal.first << '\n');
if (!Inst->use_empty())
Inst->replaceAllUsesWith(Op);
Inst->eraseFromParent();
Changed = true;
++NumCSELoad;
continue;
}
}
// Otherwise, remember that we have this instruction.
AvailableLoads->insert(Inst->getOperand(0), std::pair<Value *, unsigned>(
Inst, CurrentGeneration));
AvailableLoads->insert(MemInst.getPtr(), std::pair<Value *, unsigned>(
Inst, CurrentGeneration));
LastStore = nullptr;
continue;
}
// If this instruction may read from memory, forget LastStore.
if (Inst->mayReadFromMemory())
// Load/store intrinsics will indicate both a read and a write to
// memory. The target may override this (e.g. so that a store intrinsic
// does not read from memory, and thus will be treated the same as a
// regular store for commoning purposes).
if (Inst->mayReadFromMemory() &&
!(MemInst.isValid() && !MemInst.mayReadFromMemory()))
LastStore = nullptr;
// If this is a read-only call, process it.
@@ -537,17 +617,19 @@ bool EarlyCSE::processNode(DomTreeNode *Node) {
if (Inst->mayWriteToMemory()) {
++CurrentGeneration;
if (StoreInst *SI = dyn_cast<StoreInst>(Inst)) {
if (MemInst.isValid() && MemInst.isStore()) {
// We do a trivial form of DSE if there are two stores to the same
// location with no intervening loads. Delete the earlier store.
if (LastStore &&
LastStore->getPointerOperand() == SI->getPointerOperand()) {
DEBUG(dbgs() << "EarlyCSE DEAD STORE: " << *LastStore
<< " due to: " << *Inst << '\n');
LastStore->eraseFromParent();
Changed = true;
++NumDSE;
LastStore = nullptr;
if (LastStore) {
ParseMemoryInst LastStoreMemInst(LastStore, TTI);
if (LastStoreMemInst.isMatchingMemLoc(MemInst)) {
DEBUG(dbgs() << "EarlyCSE DEAD STORE: " << *LastStore
<< " due to: " << *Inst << '\n');
LastStore->eraseFromParent();
Changed = true;
++NumDSE;
LastStore = nullptr;
}
// fallthrough - we can exploit information about this store
}
@@ -556,13 +638,12 @@ bool EarlyCSE::processNode(DomTreeNode *Node) {
// version of the pointer. It is safe to forward from volatile stores
// to non-volatile loads, so we don't have to check for volatility of
// the store.
AvailableLoads->insert(SI->getPointerOperand(),
std::pair<Value *, unsigned>(
SI->getValueOperand(), CurrentGeneration));
AvailableLoads->insert(MemInst.getPtr(), std::pair<Value *, unsigned>(
Inst, CurrentGeneration));
// Remember that this was the last store we saw for DSE.
if (SI->isSimple())
LastStore = SI;
if (!MemInst.isVolatile())
LastStore = Inst;
}
}
}
@@ -584,6 +665,7 @@ bool EarlyCSE::runOnFunction(Function &F) {
DataLayoutPass *DLP = getAnalysisIfAvailable<DataLayoutPass>();
DL = DLP ? &DLP->getDataLayout() : nullptr;
TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
TTI = &getAnalysis<TargetTransformInfo>();
DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);


@@ -0,0 +1,231 @@
; RUN: opt < %s -S -mtriple=aarch64-none-linux-gnu -mattr=+neon -early-cse | FileCheck %s
define <4 x i32> @test_cse(i32* %a, [2 x <4 x i32>] %s.coerce, i32 %n) {
entry:
; Check that @llvm.aarch64.neon.ld2 is optimized away by Early CSE.
; CHECK-LABEL: @test_cse
; CHECK-NOT: call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p0i8
%s.coerce.fca.0.extract = extractvalue [2 x <4 x i32>] %s.coerce, 0
%s.coerce.fca.1.extract = extractvalue [2 x <4 x i32>] %s.coerce, 1
br label %for.cond
for.cond: ; preds = %for.body, %entry
%i.0 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
%res.0 = phi <4 x i32> [ undef, %entry ], [ %call, %for.body ]
%cmp = icmp slt i32 %i.0, %n
br i1 %cmp, label %for.body, label %for.end
for.body: ; preds = %for.cond
%0 = bitcast i32* %a to i8*
%1 = bitcast <4 x i32> %s.coerce.fca.0.extract to <16 x i8>
%2 = bitcast <4 x i32> %s.coerce.fca.1.extract to <16 x i8>
%3 = bitcast <16 x i8> %1 to <4 x i32>
%4 = bitcast <16 x i8> %2 to <4 x i32>
call void @llvm.aarch64.neon.st2.v4i32.p0i8(<4 x i32> %3, <4 x i32> %4, i8* %0)
%5 = bitcast i32* %a to i8*
%vld2 = call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p0i8(i8* %5)
%vld2.fca.0.extract = extractvalue { <4 x i32>, <4 x i32> } %vld2, 0
%vld2.fca.1.extract = extractvalue { <4 x i32>, <4 x i32> } %vld2, 1
%call = call <4 x i32> @vaddq_s32(<4 x i32> %vld2.fca.0.extract, <4 x i32> %vld2.fca.0.extract)
%inc = add nsw i32 %i.0, 1
br label %for.cond
for.end: ; preds = %for.cond
ret <4 x i32> %res.0
}
define <4 x i32> @test_cse2(i32* %a, [2 x <4 x i32>] %s.coerce, i32 %n) {
entry:
; Check that the first @llvm.aarch64.neon.st2 is optimized away by Early CSE.
; CHECK-LABEL: @test_cse2
; CHECK-NOT: call void @llvm.aarch64.neon.st2.v4i32.p0i8(<4 x i32> %3, <4 x i32> %3, i8* %0)
; CHECK: call void @llvm.aarch64.neon.st2.v4i32.p0i8(<4 x i32> %3, <4 x i32> %4, i8* %0)
%s.coerce.fca.0.extract = extractvalue [2 x <4 x i32>] %s.coerce, 0
%s.coerce.fca.1.extract = extractvalue [2 x <4 x i32>] %s.coerce, 1
br label %for.cond
for.cond: ; preds = %for.body, %entry
%i.0 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
%res.0 = phi <4 x i32> [ undef, %entry ], [ %call, %for.body ]
%cmp = icmp slt i32 %i.0, %n
br i1 %cmp, label %for.body, label %for.end
for.body: ; preds = %for.cond
%0 = bitcast i32* %a to i8*
%1 = bitcast <4 x i32> %s.coerce.fca.0.extract to <16 x i8>
%2 = bitcast <4 x i32> %s.coerce.fca.1.extract to <16 x i8>
%3 = bitcast <16 x i8> %1 to <4 x i32>
%4 = bitcast <16 x i8> %2 to <4 x i32>
call void @llvm.aarch64.neon.st2.v4i32.p0i8(<4 x i32> %3, <4 x i32> %3, i8* %0)
call void @llvm.aarch64.neon.st2.v4i32.p0i8(<4 x i32> %3, <4 x i32> %4, i8* %0)
%5 = bitcast i32* %a to i8*
%vld2 = call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p0i8(i8* %5)
%vld2.fca.0.extract = extractvalue { <4 x i32>, <4 x i32> } %vld2, 0
%vld2.fca.1.extract = extractvalue { <4 x i32>, <4 x i32> } %vld2, 1
%call = call <4 x i32> @vaddq_s32(<4 x i32> %vld2.fca.0.extract, <4 x i32> %vld2.fca.0.extract)
%inc = add nsw i32 %i.0, 1
br label %for.cond
for.end: ; preds = %for.cond
ret <4 x i32> %res.0
}
define <4 x i32> @test_cse3(i32* %a, [2 x <4 x i32>] %s.coerce, i32 %n) #0 {
entry:
; Check that the second (redundant) @llvm.aarch64.neon.ld2 is optimized away
; by Early CSE, leaving only the first.
; CHECK-LABEL: @test_cse3
; CHECK: call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p0i8
; CHECK-NOT: call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p0i8
%s.coerce.fca.0.extract = extractvalue [2 x <4 x i32>] %s.coerce, 0
%s.coerce.fca.1.extract = extractvalue [2 x <4 x i32>] %s.coerce, 1
br label %for.cond
for.cond: ; preds = %for.body, %entry
%i.0 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
%res.0 = phi <4 x i32> [ undef, %entry ], [ %call, %for.body ]
%cmp = icmp slt i32 %i.0, %n
br i1 %cmp, label %for.body, label %for.end
for.body: ; preds = %for.cond
%0 = bitcast i32* %a to i8*
%vld2 = call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p0i8(i8* %0)
%vld2.fca.0.extract = extractvalue { <4 x i32>, <4 x i32> } %vld2, 0
%vld2.fca.1.extract = extractvalue { <4 x i32>, <4 x i32> } %vld2, 1
%1 = bitcast i32* %a to i8*
%vld22 = call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p0i8(i8* %1)
%vld22.fca.0.extract = extractvalue { <4 x i32>, <4 x i32> } %vld22, 0
%vld22.fca.1.extract = extractvalue { <4 x i32>, <4 x i32> } %vld22, 1
%call = call <4 x i32> @vaddq_s32(<4 x i32> %vld2.fca.0.extract, <4 x i32> %vld22.fca.0.extract)
%inc = add nsw i32 %i.0, 1
br label %for.cond
for.end: ; preds = %for.cond
ret <4 x i32> %res.0
}
define <4 x i32> @test_nocse(i32* %a, i32* %b, [2 x <4 x i32>] %s.coerce, i32 %n) {
entry:
; Check that the store prevents @llvm.aarch64.neon.ld2 from being optimized
; away by Early CSE.
; CHECK-LABEL: @test_nocse
; CHECK: call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p0i8
%s.coerce.fca.0.extract = extractvalue [2 x <4 x i32>] %s.coerce, 0
%s.coerce.fca.1.extract = extractvalue [2 x <4 x i32>] %s.coerce, 1
br label %for.cond
for.cond: ; preds = %for.body, %entry
%i.0 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
%res.0 = phi <4 x i32> [ undef, %entry ], [ %call, %for.body ]
%cmp = icmp slt i32 %i.0, %n
br i1 %cmp, label %for.body, label %for.end
for.body: ; preds = %for.cond
%0 = bitcast i32* %a to i8*
%1 = bitcast <4 x i32> %s.coerce.fca.0.extract to <16 x i8>
%2 = bitcast <4 x i32> %s.coerce.fca.1.extract to <16 x i8>
%3 = bitcast <16 x i8> %1 to <4 x i32>
%4 = bitcast <16 x i8> %2 to <4 x i32>
call void @llvm.aarch64.neon.st2.v4i32.p0i8(<4 x i32> %3, <4 x i32> %4, i8* %0)
store i32 0, i32* %b, align 4
%5 = bitcast i32* %a to i8*
%vld2 = call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p0i8(i8* %5)
%vld2.fca.0.extract = extractvalue { <4 x i32>, <4 x i32> } %vld2, 0
%vld2.fca.1.extract = extractvalue { <4 x i32>, <4 x i32> } %vld2, 1
%call = call <4 x i32> @vaddq_s32(<4 x i32> %vld2.fca.0.extract, <4 x i32> %vld2.fca.0.extract)
%inc = add nsw i32 %i.0, 1
br label %for.cond
for.end: ; preds = %for.cond
ret <4 x i32> %res.0
}
define <4 x i32> @test_nocse2(i32* %a, [2 x <4 x i32>] %s.coerce, i32 %n) {
entry:
; Check that @llvm.aarch64.neon.ld3 is not optimized away by Early CSE due
; to mismatch between st2 and ld3.
; CHECK-LABEL: @test_nocse2
; CHECK: call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld3.v4i32.p0i8
%s.coerce.fca.0.extract = extractvalue [2 x <4 x i32>] %s.coerce, 0
%s.coerce.fca.1.extract = extractvalue [2 x <4 x i32>] %s.coerce, 1
br label %for.cond
for.cond: ; preds = %for.body, %entry
%i.0 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
%res.0 = phi <4 x i32> [ undef, %entry ], [ %call, %for.body ]
%cmp = icmp slt i32 %i.0, %n
br i1 %cmp, label %for.body, label %for.end
for.body: ; preds = %for.cond
%0 = bitcast i32* %a to i8*
%1 = bitcast <4 x i32> %s.coerce.fca.0.extract to <16 x i8>
%2 = bitcast <4 x i32> %s.coerce.fca.1.extract to <16 x i8>
%3 = bitcast <16 x i8> %1 to <4 x i32>
%4 = bitcast <16 x i8> %2 to <4 x i32>
call void @llvm.aarch64.neon.st2.v4i32.p0i8(<4 x i32> %3, <4 x i32> %4, i8* %0)
%5 = bitcast i32* %a to i8*
%vld3 = call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld3.v4i32.p0i8(i8* %5)
%vld3.fca.0.extract = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } %vld3, 0
%vld3.fca.2.extract = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } %vld3, 2
%call = call <4 x i32> @vaddq_s32(<4 x i32> %vld3.fca.0.extract, <4 x i32> %vld3.fca.2.extract)
%inc = add nsw i32 %i.0, 1
br label %for.cond
for.end: ; preds = %for.cond
ret <4 x i32> %res.0
}
define <4 x i32> @test_nocse3(i32* %a, [2 x <4 x i32>] %s.coerce, i32 %n) {
entry:
; Check that @llvm.aarch64.neon.st3 is not optimized away by Early CSE due to
; mismatch between st2 and st3.
; CHECK-LABEL: @test_nocse3
; CHECK: call void @llvm.aarch64.neon.st3.v4i32.p0i8
; CHECK: call void @llvm.aarch64.neon.st2.v4i32.p0i8
%s.coerce.fca.0.extract = extractvalue [2 x <4 x i32>] %s.coerce, 0
%s.coerce.fca.1.extract = extractvalue [2 x <4 x i32>] %s.coerce, 1
br label %for.cond
for.cond: ; preds = %for.body, %entry
%i.0 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
%res.0 = phi <4 x i32> [ undef, %entry ], [ %call, %for.body ]
%cmp = icmp slt i32 %i.0, %n
br i1 %cmp, label %for.body, label %for.end
for.body: ; preds = %for.cond
%0 = bitcast i32* %a to i8*
%1 = bitcast <4 x i32> %s.coerce.fca.0.extract to <16 x i8>
%2 = bitcast <4 x i32> %s.coerce.fca.1.extract to <16 x i8>
%3 = bitcast <16 x i8> %1 to <4 x i32>
%4 = bitcast <16 x i8> %2 to <4 x i32>
call void @llvm.aarch64.neon.st3.v4i32.p0i8(<4 x i32> %4, <4 x i32> %3, <4 x i32> %3, i8* %0)
call void @llvm.aarch64.neon.st2.v4i32.p0i8(<4 x i32> %3, <4 x i32> %3, i8* %0)
%5 = bitcast i32* %a to i8*
%vld3 = call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld3.v4i32.p0i8(i8* %5)
%vld3.fca.0.extract = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } %vld3, 0
%vld3.fca.1.extract = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } %vld3, 1
%call = call <4 x i32> @vaddq_s32(<4 x i32> %vld3.fca.0.extract, <4 x i32> %vld3.fca.0.extract)
%inc = add nsw i32 %i.0, 1
br label %for.cond
for.end: ; preds = %for.cond
ret <4 x i32> %res.0
}
; Function Attrs: nounwind
declare void @llvm.aarch64.neon.st2.v4i32.p0i8(<4 x i32>, <4 x i32>, i8* nocapture)
; Function Attrs: nounwind
declare void @llvm.aarch64.neon.st3.v4i32.p0i8(<4 x i32>, <4 x i32>, <4 x i32>, i8* nocapture)
; Function Attrs: nounwind readonly
declare { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p0i8(i8*)
; Function Attrs: nounwind readonly
declare { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld3.v4i32.p0i8(i8*)
define internal fastcc <4 x i32> @vaddq_s32(<4 x i32> %__p0, <4 x i32> %__p1) {
entry:
%add = add <4 x i32> %__p0, %__p1
ret <4 x i32> %add
}


@@ -0,0 +1,5 @@
config.suffixes = ['.ll']
targets = set(config.root.targets_to_build.split())
if 'AArch64' not in targets:
config.unsupported = True