From 34c171d3ad7c13feb8342d848d2c1f451f3fd1bd Mon Sep 17 00:00:00 2001
From: Philip Reames <listmail@philipreames.com>
Date: Wed, 9 Mar 2016 22:59:30 +0000
Subject: [PATCH] [LICM] Store promotion when memory is thread local

This patch teaches LICM's implementation of store promotion to exploit
the fact that the memory location being accessed might be provably
thread local. The fact that it's thread local weakens the requirements for
where we can insert stores, since no other thread can observe the write.
This allows us to perform store promotion even in cases where the store
is not guaranteed to execute in the loop.

Two key assumptions are worth drawing out: a) no-capture is strong
enough to imply no-escape, and b) standard allocation functions like
malloc, calloc, and operator new return values which can be assumed not
to have previously escaped.

In future work, it would be nice to generalize this so that it works
without directly seeing the allocation site. I believe that the
nocapture return attribute should be suitable for this purpose, but I
haven't investigated carefully. It's also likely that we could support
unescaped allocas with similar reasoning, but since SROA and Mem2Reg
should destroy those, they're less interesting than they first might
seem.

Differential Revision: http://reviews.llvm.org/D16783

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@263072 91177308-0d34-0410-b5e6-96231b3b80d8
---
 include/llvm/Transforms/Utils/LoopUtils.h |   4 +-
 lib/Transforms/Scalar/LICM.cpp            |  67 +++++++++--
 test/Transforms/LICM/promote-tls.ll       | 133 ++++++++++++++++++++++
 3 files changed, 191 insertions(+), 13 deletions(-)
 create mode 100644 test/Transforms/LICM/promote-tls.ll

diff --git a/include/llvm/Transforms/Utils/LoopUtils.h b/include/llvm/Transforms/Utils/LoopUtils.h
index 222fe5e89d1..9705e5c6872 100644
--- a/include/llvm/Transforms/Utils/LoopUtils.h
+++ b/include/llvm/Transforms/Utils/LoopUtils.h
@@ -373,8 +373,8 @@ bool hoistRegion(DomTreeNode *, AliasAnalysis *, LoopInfo *, DominatorTree *,
 bool promoteLoopAccessesToScalars(AliasSet &, SmallVectorImpl<BasicBlock *> &,
                                   SmallVectorImpl<Instruction *> &,
                                   PredIteratorCache &, LoopInfo *,
-                                  DominatorTree *, Loop *, AliasSetTracker *,
-                                  LICMSafetyInfo *);
+                                  DominatorTree *, const TargetLibraryInfo *,
+                                  Loop *, AliasSetTracker *, LICMSafetyInfo *);
 
 /// \brief Computes safety information for a loop
 /// checks loop body & header for the possibility of may throw
diff --git a/lib/Transforms/Scalar/LICM.cpp b/lib/Transforms/Scalar/LICM.cpp
index 0ac53ba92cb..daf76218e62 100644
--- a/lib/Transforms/Scalar/LICM.cpp
+++ b/lib/Transforms/Scalar/LICM.cpp
@@ -35,10 +35,12 @@
 #include "llvm/Analysis/AliasAnalysis.h"
 #include "llvm/Analysis/AliasSetTracker.h"
 #include "llvm/Analysis/BasicAliasAnalysis.h"
+#include "llvm/Analysis/CaptureTracking.h"
 #include "llvm/Analysis/ConstantFolding.h"
 #include "llvm/Analysis/GlobalsModRef.h"
 #include "llvm/Analysis/LoopInfo.h"
 #include "llvm/Analysis/LoopPass.h"
+#include "llvm/Analysis/MemoryBuiltins.h"
 #include "llvm/Analysis/ScalarEvolution.h"
 #include "llvm/Analysis/ScalarEvolutionAliasAnalysis.h"
 #include "llvm/Analysis/TargetLibraryInfo.h"
@@ -223,7 +225,7 @@ bool LICM::runOnLoop(Loop *L, LPPassManager &LPM) {
     // Loop over all of the alias sets in the tracker object.
     for (AliasSet &AS : *CurAST)
       Changed |= promoteLoopAccessesToScalars(AS, ExitBlocks, InsertPts,
-                                              PIC, LI, DT, CurLoop,
+                                              PIC, LI, DT, TLI, CurLoop,
                                               CurAST, &SafetyInfo);
 
   // Once we have promoted values across the loop body we have to recursively
@@ -846,7 +848,9 @@ bool llvm::promoteLoopAccessesToScalars(AliasSet &AS,
                                         SmallVectorImpl<BasicBlock *> &ExitBlocks,
                                         SmallVectorImpl<Instruction *> &InsertPts,
                                         PredIteratorCache &PIC, LoopInfo *LI,
-                                        DominatorTree *DT, Loop *CurLoop,
+                                        DominatorTree *DT,
+                                        const TargetLibraryInfo *TLI,
+                                        Loop *CurLoop,
                                         AliasSetTracker *CurAST,
                                         LICMSafetyInfo *SafetyInfo) {
   // Verify inputs.
@@ -879,10 +883,25 @@ bool llvm::promoteLoopAccessesToScalars(AliasSet &AS,
   //
   // is not safe, because *P may only be valid to access if 'c' is true.
   //
+  // The safety property divides into two parts:
+  // 1) The memory may not be dereferenceable on entry to the loop. In this
+  //    case, we can't insert the required load in the preheader.
+  // 2) The memory model does not allow us to insert a store along any dynamic
+  //    path which did not originally have one.
+  //
   // It is safe to promote P if all uses are direct load/stores and if at
   // least one is guaranteed to be executed.
   bool GuaranteedToExecute = false;
 
+  // It is also safe to promote P if we can prove that speculating a load into
+  // the preheader is safe (i.e. proving dereferenceability on all
+  // paths through the loop), and that the memory can be proven thread local
+  // (so that the memory model requirement doesn't apply).  We first establish
+  // the former, and then run a capture analysis below to establish the latter.
+  // We can use any access within the alias set to prove dereferenceability,
+  // since they're all must-alias.
+  bool CanSpeculateLoad = false;
+
   SmallVector<Instruction *, 64> LoopUses;
   SmallPtrSet<Value *, 4> PointerMustAliases;
@@ -892,6 +911,16 @@ bool llvm::promoteLoopAccessesToScalars(AliasSet &AS,
   AAMDNodes AATags;
   bool HasDedicatedExits = CurLoop->hasDedicatedExits();
 
+  // Don't sink stores from loops without dedicated block exits. Exits
+  // containing indirect branches are not transformed by loop simplify,
+  // make sure we catch that. An additional load may be generated in the
+  // preheader for SSA updater, so also avoid sinking when no preheader
+  // is available.
+  if (!HasDedicatedExits || !Preheader)
+    return false;
+
+  const DataLayout &MDL = Preheader->getModule()->getDataLayout();
+
   // Check that all of the pointers in the alias set have the same type.  We
   // cannot (yet) promote a memory location that is loaded and stored in
   // different sizes.  While we are at it, collect alignment and AA info.
@@ -918,6 +947,12 @@ bool llvm::promoteLoopAccessesToScalars(AliasSet &AS,
         assert(!Load->isVolatile() && "AST broken");
         if (!Load->isSimple())
           return Changed;
+
+        if (!GuaranteedToExecute && !CanSpeculateLoad)
+          CanSpeculateLoad =
+            isSafeToExecuteUnconditionally(*Load, DT, TLI, CurLoop,
+                                           SafetyInfo,
+                                           Preheader->getTerminator());
       } else if (const StoreInst *Store = dyn_cast<StoreInst>(UI)) {
         // Stores *of* the pointer are not interesting, only stores *to* the
         // pointer.
@@ -926,13 +961,6 @@ bool llvm::promoteLoopAccessesToScalars(AliasSet &AS,
         assert(!Store->isVolatile() && "AST broken");
         if (!Store->isSimple())
           return Changed;
-        // Don't sink stores from loops without dedicated block exits. Exits
-        // containing indirect branches are not transformed by loop simplify,
-        // make sure we catch that. An additional load may be generated in the
-        // preheader for SSA updater, so also avoid sinking when no preheader
-        // is available.
-        if (!HasDedicatedExits || !Preheader)
-          return Changed;
 
         // Note that we only check GuaranteedToExecute inside the store case
         // so that we do not introduce stores where they did not exist before
@@ -953,6 +981,14 @@ bool llvm::promoteLoopAccessesToScalars(AliasSet &AS,
           GuaranteedToExecute = isGuaranteedToExecute(*UI, DT,
                                                       CurLoop, SafetyInfo);
+
+        if (!GuaranteedToExecute && !CanSpeculateLoad) {
+          CanSpeculateLoad =
+            isDereferenceableAndAlignedPointer(Store->getPointerOperand(),
+                                               Store->getAlignment(), MDL,
+                                               Preheader->getTerminator(),
+                                               DT, TLI);
+        }
       } else
         return Changed; // Not a load or store.
 
@@ -968,8 +1004,17 @@ bool llvm::promoteLoopAccessesToScalars(AliasSet &AS,
     }
   }
 
-  // If there isn't a guaranteed-to-execute instruction, we can't promote.
-  if (!GuaranteedToExecute)
+  // Check legality per comment above. Otherwise, we can't promote.
+  bool PromotionIsLegal = GuaranteedToExecute;
+  if (!PromotionIsLegal && CanSpeculateLoad) {
+    // If this is a thread local location, then we can insert stores along
+    // paths which originally didn't have them without violating the memory
+    // model.
+    Value *Object = GetUnderlyingObject(SomePtr, MDL);
+    PromotionIsLegal = isAllocLikeFn(Object, TLI) &&
+                       !PointerMayBeCaptured(Object, true, true);
+  }
+  if (!PromotionIsLegal)
     return Changed;
 
   // Figure out the loop exits and their insertion points, if this is the
diff --git a/test/Transforms/LICM/promote-tls.ll b/test/Transforms/LICM/promote-tls.ll
new file mode 100644
index 00000000000..7763114ef45
--- /dev/null
+++ b/test/Transforms/LICM/promote-tls.ll
@@ -0,0 +1,133 @@
+; RUN: opt -tbaa -basicaa -licm -S < %s | FileCheck %s
+
+; If we can prove a local is thread local, we can insert stores during
+; promotion which wouldn't be legal otherwise.
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-linux-generic"
+
+@p = external global i8*
+
+declare i8* @malloc(i64)
+
+; Exercise the TLS case
+define i32* @test(i32 %n) {
+entry:
+  ;; ignore the required null check for simplicity
+  %mem = call dereferenceable(16) noalias i8* @malloc(i64 16)
+  %addr = bitcast i8* %mem to i32*
+  br label %for.body.lr.ph
+
+for.body.lr.ph:                                   ; preds = %entry
+  br label %for.header
+
+for.header:
+  %i.02 = phi i32 [ 0, %for.body.lr.ph ], [ %inc, %for.body ]
+  %old = load i32, i32* %addr, align 4
+  ; deliberate impossible to analyze branch
+  %guard = load volatile i8*, i8** @p
+  %exitcmp = icmp eq i8* %guard, null
+  br i1 %exitcmp, label %for.body, label %early-exit
+
+early-exit:
+; CHECK-LABEL: early-exit:
+; CHECK: store i32 %new1.lcssa, i32* %addr, align 1
+  ret i32* null
+
+for.body:
+  %new = add i32 %old, 1
+  store i32 %new, i32* %addr, align 4
+  %inc = add nsw i32 %i.02, 1
+  %cmp = icmp slt i32 %inc, %n
+  br i1 %cmp, label %for.header, label %for.cond.for.end_crit_edge
+
+for.cond.for.end_crit_edge:                       ; preds = %for.body
+; CHECK-LABEL: for.cond.for.end_crit_edge:
+; CHECK: store i32 %new.lcssa, i32* %addr, align 1
+  %split = phi i32* [ %addr, %for.body ]
+  ret i32* null
+}
+
+declare i8* @not_malloc(i64)
+
+; Negative test - not TLS
+define i32* @test_neg(i32 %n) {
+entry:
+  ;; ignore the required null check for simplicity
+  %mem = call dereferenceable(16) noalias i8* @not_malloc(i64 16)
+  %addr = bitcast i8* %mem to i32*
+  br label %for.body.lr.ph
+
+for.body.lr.ph:                                   ; preds = %entry
+  br label %for.header
+
+for.header:
+  %i.02 = phi i32 [ 0, %for.body.lr.ph ], [ %inc, %for.body ]
+  %old = load i32, i32* %addr, align 4
+  ; deliberate impossible to analyze branch
+  %guard = load volatile i8*, i8** @p
+  %exitcmp = icmp eq i8* %guard, null
+  br i1 %exitcmp, label %for.body, label %early-exit
+
+early-exit:
+; CHECK-LABEL: early-exit:
+; CHECK-NOT: store
+  ret i32* null
+
+for.body:
+; CHECK-LABEL: for.body:
+; CHECK: store i32 %new, i32* %addr, align 4
+  %new = add i32 %old, 1
+  store i32 %new, i32* %addr, align 4
+  %inc = add nsw i32 %i.02, 1
+  %cmp = icmp slt i32 %inc, %n
+  br i1 %cmp, label %for.header, label %for.cond.for.end_crit_edge
+
+for.cond.for.end_crit_edge:                       ; preds = %for.body
+; CHECK-LABEL: for.cond.for.end_crit_edge:
+; CHECK-NOT: store
+  %split = phi i32* [ %addr, %for.body ]
+  ret i32* null
+}
+
+; Negative test - can't speculate load since branch
+; may control alignment
+define i32* @test_neg2(i32 %n) {
+entry:
+  ;; ignore the required null check for simplicity
+  %mem = call dereferenceable(16) noalias i8* @malloc(i64 16)
+  %addr = bitcast i8* %mem to i32*
+  br label %for.body.lr.ph
+
+for.body.lr.ph:                                   ; preds = %entry
+  br label %for.header
+
+for.header:
+  %i.02 = phi i32 [ 0, %for.body.lr.ph ], [ %inc, %for.body ]
+  ; deliberate impossible to analyze branch
+  %guard = load volatile i8*, i8** @p
+  %exitcmp = icmp eq i8* %guard, null
+  br i1 %exitcmp, label %for.body, label %early-exit
+
+early-exit:
+; CHECK-LABEL: early-exit:
+; CHECK-NOT: store
+  ret i32* null
+
+for.body:
+; CHECK-LABEL: for.body:
+; CHECK: store i32 %new, i32* %addr, align 4
+  %old = load i32, i32* %addr, align 4
+  %new = add i32 %old, 1
+  store i32 %new, i32* %addr, align 4
+  %inc = add nsw i32 %i.02, 1
+  %cmp = icmp slt i32 %inc, %n
+  br i1 %cmp, label %for.header, label %for.cond.for.end_crit_edge
+
+for.cond.for.end_crit_edge:                       ; preds = %for.body
+; CHECK-LABEL: for.cond.for.end_crit_edge:
+; CHECK-NOT: store
+  %split = phi i32* [ %addr, %for.body ]
+  ret i32* null
+}
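
For illustration only, here is a hypothetical C++ source pattern of the kind this
change targets; it is a sketch, not taken from the patch or its tests. The buffer
comes from malloc, its address is never stored or returned, and the only store to
it inside the loop is conditional, so it is not guaranteed to execute.

// Hypothetical example (not from the patch): a conditional store to a
// freshly malloc'd, never-escaping buffer inside a loop.  Before this
// change, LICM could not promote *count because no access to it is
// guaranteed to execute in the loop; with the thread-local reasoning it
// may now be kept in a register and written back on the loop exits.
#include <cstdio>
#include <cstdlib>

void count_positives(const int *a, int n) {
  int *count = static_cast<int *>(std::malloc(sizeof(int)));
  if (!count)
    return;
  *count = 0;
  for (int i = 0; i < n; ++i)
    if (a[i] > 0)
      ++*count;                   // conditional store: not guaranteed to execute
  std::printf("%d\n", *count);    // only the value escapes, not the pointer
  std::free(count);
}

Whether promotion actually fires on this source depends on the IR after inlining
and attribute inference; the promote-tls.ll test above exercises the IR-level
pattern directly.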