From 85ee7e948700a10a660236ff7814920fa3fa656d Mon Sep 17 00:00:00 2001 From: Justin Lebar Date: Tue, 19 Jul 2016 23:19:20 +0000 Subject: [PATCH] [LSV] Insert stores at the right point. Summary: Previously, the insertion point for stores was the last instruction in Chain *before calling getVectorizablePrefixEndIdx*. Thus if getVectorizablePrefixEndIdx didn't return Chain.size(), we still would insert at the last instruction in Chain. This patch changes our internal API a bit in an attempt to make it less prone to this sort of error. As a result, we end up recalculating the Chain's boundary instructions, but I think worrying about the speed hit of this is a premature optimization right now. Reviewers: asbirlea, tstellarAMD Subscribers: mzolotukhin, arsenm, llvm-commits Differential Revision: https://reviews.llvm.org/D22534 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@276056 91177308-0d34-0410-b5e6-96231b3b80d8 --- .../Vectorize/LoadStoreVectorizer.cpp | 58 +++++++++---------- .../AMDGPU/insertion-point.ll | 31 +++++++++- 2 files changed, 57 insertions(+), 32 deletions(-) diff --git a/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp b/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp index 290681599c0..1027eacb3ad 100644 --- a/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp +++ b/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp @@ -104,13 +104,12 @@ private: std::pair, ArrayRef> splitOddVectorElts(ArrayRef Chain, unsigned ElementSizeBits); - /// Checks for instructions which may affect the memory accessed - /// in the chain between \p From and \p To. Returns Index, where - /// \p Chain[0, Index) is the largest vectorizable chain prefix. - /// The elements of \p Chain should be all loads or all stores. - unsigned getVectorizablePrefixEndIdx(ArrayRef Chain, - BasicBlock::iterator From, - BasicBlock::iterator To); + /// Finds the largest prefix of Chain that's vectorizable, checking for + /// intervening instructions which may affect the memory accessed by the + /// instructions within Chain. + /// + /// The elements of \p Chain must be all loads or all stores. + ArrayRef getVectorizablePrefix(ArrayRef Chain); /// Collects load and store instructions to vectorize. std::pair collectInstructions(BasicBlock *BB); @@ -424,14 +423,12 @@ Vectorizer::splitOddVectorElts(ArrayRef Chain, return std::make_pair(Chain.slice(0, NumLeft), Chain.slice(NumLeft)); } -unsigned Vectorizer::getVectorizablePrefixEndIdx(ArrayRef Chain, - BasicBlock::iterator From, - BasicBlock::iterator To) { +ArrayRef Vectorizer::getVectorizablePrefix(ArrayRef Chain) { SmallVector, 16> MemoryInstrs; SmallVector, 16> ChainInstrs; unsigned InstrIdx = 0; - for (Instruction &I : make_range(From, To)) { + for (Instruction &I : make_range(getBoundaryInstrs(Chain))) { ++InstrIdx; if (isa(I) || isa(I)) { if (!is_contained(Chain, &I)) @@ -445,7 +442,7 @@ unsigned Vectorizer::getVectorizablePrefixEndIdx(ArrayRef Chain, } assert(Chain.size() == ChainInstrs.size() && - "All instructions in the Chain must exist in [From, To)."); + "All instrs in Chain must be within range getBoundaryInstrs(Chain)."); unsigned ChainIdx = 0; for (auto EntryChain : ChainInstrs) { @@ -487,12 +484,12 @@ unsigned Vectorizer::getVectorizablePrefixEndIdx(ArrayRef Chain, << " " << *Ptr1 << '\n'; }); - return ChainIdx; + return Chain.slice(0, ChainIdx); } } ChainIdx++; } - return Chain.size(); + return Chain; } std::pair @@ -694,22 +691,20 @@ bool Vectorizer::vectorizeStoreChain( return false; } - BasicBlock::iterator First, Last; - std::tie(First, Last) = getBoundaryInstrs(Chain); - unsigned StopChain = getVectorizablePrefixEndIdx(Chain, First, Last); - if (StopChain == 0) { + ArrayRef NewChain = getVectorizablePrefix(Chain); + if (NewChain.empty()) { // There exists a side effect instruction, no vectorization possible. InstructionsProcessed->insert(Chain.begin(), Chain.end()); return false; } - if (StopChain == 1) { + if (NewChain.size() == 1) { // Failed after the first instruction. Discard it and try the smaller chain. - InstructionsProcessed->insert(Chain.front()); + InstructionsProcessed->insert(NewChain.front()); return false; } // Update Chain to the valid vectorizable subchain. - Chain = Chain.slice(0, StopChain); + Chain = NewChain; ChainSize = Chain.size(); // Store size should be 1B, 2B or multiple of 4B. @@ -773,7 +768,8 @@ bool Vectorizer::vectorizeStoreChain( } } - // Set insert point. + BasicBlock::iterator First, Last; + std::tie(First, Last) = getBoundaryInstrs(Chain); Builder.SetInsertPoint(&*Last); Value *Vec = UndefValue::get(VecTy); @@ -849,22 +845,20 @@ bool Vectorizer::vectorizeLoadChain( return false; } - BasicBlock::iterator First, Last; - std::tie(First, Last) = getBoundaryInstrs(Chain); - unsigned StopChain = getVectorizablePrefixEndIdx(Chain, First, Last); - if (StopChain == 0) { + ArrayRef NewChain = getVectorizablePrefix(Chain); + if (NewChain.empty()) { // There exists a side effect instruction, no vectorization possible. InstructionsProcessed->insert(Chain.begin(), Chain.end()); return false; } - if (StopChain == 1) { + if (NewChain.size() == 1) { // Failed after the first instruction. Discard it and try the smaller chain. - InstructionsProcessed->insert(Chain.front()); + InstructionsProcessed->insert(NewChain.front()); return false; } // Update Chain to the valid vectorizable subchain. - Chain = Chain.slice(0, StopChain); + Chain = NewChain; ChainSize = Chain.size(); // Load size should be 1B, 2B or multiple of 4B. @@ -927,7 +921,11 @@ bool Vectorizer::vectorizeLoadChain( V->dump(); }); - // Set insert point. + // getVectorizablePrefix already computed getBoundaryInstrs. The value of + // Last may have changed since then, but the value of First won't have. If it + // matters, we could compute getBoundaryInstrs only once and reuse it here. + BasicBlock::iterator First, Last; + std::tie(First, Last) = getBoundaryInstrs(Chain); Builder.SetInsertPoint(&*First); Value *Bitcast = diff --git a/test/Transforms/LoadStoreVectorizer/AMDGPU/insertion-point.ll b/test/Transforms/LoadStoreVectorizer/AMDGPU/insertion-point.ll index 64a0480d8d3..4672224672f 100644 --- a/test/Transforms/LoadStoreVectorizer/AMDGPU/insertion-point.ll +++ b/test/Transforms/LoadStoreVectorizer/AMDGPU/insertion-point.ll @@ -2,8 +2,9 @@ target datalayout = "e-p:32:32-p1:64:64-p2:64:64-p3:32:32-p4:64:64-p5:32:32-p24:64:64-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64" -; Check relative position of the inserted vector load relative to the existing -; adds. Vectorized loads should be inserted at the position of the first load. +; Check position of the inserted vector load/store. Vectorized loads should be +; inserted at the position of the first load in the chain, and stores should be +; inserted at the position of the last store. ; CHECK-LABEL: @insert_load_point( ; CHECK: %z = add i32 %x, 4 @@ -59,4 +60,30 @@ entry: ret void } +; Here we have four stores, with an aliasing load before the last one. We can +; vectorize the first two stores as <2 x float>, but this vectorized store must +; be inserted at the location of the second scalar store, not the fourth one. +; +; CHECK-LABEL: @insert_store_point_alias +; CHECK: store <2 x float> +; CHECK: store float +; CHECK-SAME: %a.idx.2 +; CHECK: load float, float addrspace(1)* %a.idx.2 +; CHECK: store float +; CHECK-SAME: %a.idx.3 +define float @insert_store_point_alias(float addrspace(1)* nocapture %a, i64 %idx) { + %a.idx = getelementptr inbounds float, float addrspace(1)* %a, i64 %idx + %a.idx.1 = getelementptr inbounds float, float addrspace(1)* %a.idx, i64 1 + %a.idx.2 = getelementptr inbounds float, float addrspace(1)* %a.idx.1, i64 1 + %a.idx.3 = getelementptr inbounds float, float addrspace(1)* %a.idx.2, i64 1 + + store float 0.0, float addrspace(1)* %a.idx, align 4 + store float 0.0, float addrspace(1)* %a.idx.1, align 4 + store float 0.0, float addrspace(1)* %a.idx.2, align 4 + %x = load float, float addrspace(1)* %a.idx.2, align 4 + store float 0.0, float addrspace(1)* %a.idx.3, align 4 + + ret float %x +} + attributes #0 = { nounwind }