[GPGPU] Collect parameter dimension used in MemoryAccesses

When using -polly-ignore-integer-wrapping and -polly-acc-codegen-managed-memory
we add parameter dimensions lazily to the domains, which results in PPCG not
including parameter dimensions that are only used in memory accesses in the
kernel space. To make sure these parameters are still passed to the kernel, we
collect these parameter dimensions and align the kernel's parameter space
before code-generating it.

llvm-svn: 311239
This commit is contained in:
Tobias Grosser 2017-08-19 12:58:28 +00:00
parent ee7d232a41
commit 43df2020e7
4 changed files with 78 additions and 6 deletions

View File

@ -23,6 +23,9 @@
#include "llvm/ADT/SmallVector.h"
#include "isl/ctx.h"
#include "isl/union_map.h"
#include "isl-noexceptions.h"
#include <utility>
#include <vector>
@ -41,6 +44,9 @@ struct SubtreeReferences {
SetVector<Value *> &Values;
SetVector<const SCEV *> &SCEVs;
BlockGenerator &BlockGen;
// In case an (optional) parameter space location is provided, parameter space
// information is collected as well.
isl::space *ParamSpace;
};
/// Extract the out-of-scop values and SCEVs referenced from a ScopStmt.
@ -50,6 +56,10 @@ struct SubtreeReferences {
/// statements we force the generation of alloca memory locations and list
/// these locations in the set of out-of-scop values as well.
///
/// We also collect an isl::space that includes all parameter dimensions
/// used in the statement's memory accesses, in case the ParamSpace pointer
/// is non-null.
///
/// @param Stmt The statement for which to extract the information.
/// @param UserPtr A void pointer that can be cast to a
/// SubtreeReferences structure.

View File

@ -229,6 +229,12 @@ isl_stat addReferencesFromStmt(const ScopStmt *Stmt, void *UserPtr,
}
for (auto &Access : *Stmt) {
if (References.ParamSpace) {
isl::space ParamSpace = Access->getLatestAccessRelation().get_space();
(*References.ParamSpace) =
References.ParamSpace->align_params(ParamSpace);
}
if (Access->isLatestArrayKind()) {
auto *BasePtr = Access->getScopArrayInfo()->getBasePtr();
if (Instruction *OpInst = dyn_cast<Instruction>(BasePtr))
@ -297,7 +303,7 @@ void IslNodeBuilder::getReferencesInSubtree(__isl_keep isl_ast_node *For,
SetVector<const SCEV *> SCEVs;
struct SubtreeReferences References = {
LI, SE, S, ValueMap, Values, SCEVs, getBlockGenerator()};
LI, SE, S, ValueMap, Values, SCEVs, getBlockGenerator(), nullptr};
for (const auto &I : IDToValue)
Values.insert(I.second);

View File

@ -436,7 +436,8 @@ private:
/// in the scop, nor do they immediately surround the Scop.
/// See [Code generation of induction variables of loops outside
/// Scops]
std::tuple<SetVector<Value *>, SetVector<Function *>, SetVector<const Loop *>>
std::tuple<SetVector<Value *>, SetVector<Function *>, SetVector<const Loop *>,
isl::space>
getReferencesInKernel(ppcg_kernel *Kernel);
/// Compute the sizes of the execution grid for a given kernel.
@ -1434,13 +1435,16 @@ getFunctionsFromRawSubtreeValues(SetVector<Value *> RawSubtreeValues,
return SubtreeFunctions;
}
std::tuple<SetVector<Value *>, SetVector<Function *>, SetVector<const Loop *>>
std::tuple<SetVector<Value *>, SetVector<Function *>, SetVector<const Loop *>,
isl::space>
GPUNodeBuilder::getReferencesInKernel(ppcg_kernel *Kernel) {
SetVector<Value *> SubtreeValues;
SetVector<const SCEV *> SCEVs;
SetVector<const Loop *> Loops;
isl::space ParamSpace = isl::space(S.getIslCtx(), 0, 0).params();
SubtreeReferences References = {
LI, SE, S, ValueMap, SubtreeValues, SCEVs, getBlockGenerator()};
LI, SE, S, ValueMap, SubtreeValues, SCEVs, getBlockGenerator(),
&ParamSpace};
for (const auto &I : IDToValue)
SubtreeValues.insert(I.second);
@ -1507,7 +1511,8 @@ GPUNodeBuilder::getReferencesInKernel(ppcg_kernel *Kernel) {
else
ReplacedValues.insert(It->second);
}
return std::make_tuple(ReplacedValues, ValidSubtreeFunctions, Loops);
return std::make_tuple(ReplacedValues, ValidSubtreeFunctions, Loops,
ParamSpace);
}
void GPUNodeBuilder::clearDominators(Function *F) {
@ -1751,9 +1756,16 @@ void GPUNodeBuilder::createKernel(__isl_take isl_ast_node *KernelStmt) {
SetVector<Value *> SubtreeValues;
SetVector<Function *> SubtreeFunctions;
SetVector<const Loop *> Loops;
std::tie(SubtreeValues, SubtreeFunctions, Loops) =
isl::space ParamSpace;
std::tie(SubtreeValues, SubtreeFunctions, Loops, ParamSpace) =
getReferencesInKernel(Kernel);
// Add parameters that appear only in the access function to the kernel
// space. This is important to make sure that all isl_ids are passed as
// parameters to the kernel, even though we may not have all parameters
// in the context to improve compile time.
Kernel->space = isl_space_align_params(Kernel->space, ParamSpace.release());
assert(Kernel->tree && "Device AST of kernel node is empty");
Instruction &HostInsertPoint = *Builder.GetInsertPoint();

View File

@ -0,0 +1,44 @@
; RUN: opt %loadPolly -polly-codegen-ppcg -polly-acc-dump-kernel-ir \
; RUN: -polly-invariant-load-hoisting -polly-ignore-aliasing \
; RUN: -polly-process-unprofitable -polly-ignore-parameter-bounds \
; RUN: -polly-acc-fail-on-verify-module-failure \
; RUN: -polly-acc-codegen-managed-memory \
; RUN: -disable-output < %s | \
; RUN: FileCheck %s
; REQUIRES: pollyacc
; Verify that we correctly generate a kernel even if certain parameters that
; result from invariant load hoisting appear only in memory accesses, but not
; in the iteration domains.
; CHECK: @FUNC_quux_SCOP_0_KERNEL_0(i8 addrspace(1)* %MemRef_tmp4, i32 %tmp3, i32 %tmp, i32 %tmp31, i32 %tmp2)
target datalayout = "e-p:64:64:64-S128-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f16:16:16-f32:32:32-f64:64:64-f128:128:128-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
target triple = "x86_64-unknown-linux-gnu"
%struct.hoge = type { i8*, i64, i64, [1 x %struct.widget] }
%struct.widget = type { i64, i64, i64 }
@global = external unnamed_addr global %struct.hoge, align 32
; Test input: a single loop that stores into an array whose base pointer is
; loaded from @global before the loop is entered.
;   - %tmp (loaded from %arg, sign-extended to %tmp2) is used only to compute
;     the address of the store.
;   - %tmp3 (loaded from %arg1) is used only in the loop exit comparison.
define void @quux(i32* noalias %arg, i32* noalias %arg1) {
bb:
; All three loads below execute once, ahead of the loop.
%tmp = load i32, i32* %arg, align 4
%tmp2 = sext i32 %tmp to i64
%tmp3 = load i32, i32* %arg1, align 4
; The first field of @global is reinterpreted as a pointer to an array of
; doubles and serves as the base pointer of the store in the loop.
%tmp4 = load [0 x double]*, [0 x double]** bitcast (%struct.hoge* @global to [0 x double]**), align 32
br label %bb5
bb5: ; preds = %bb5, %bb
; Induction variable: starts at 0 and is incremented by 1 per iteration.
%tmp6 = phi i32 [ %tmp11, %bb5 ], [ 0, %bb ]
%tmp7 = sext i32 %tmp6 to i64
; Store index is (induction variable - %tmp2); this subtraction is the only
; use of %tmp2 in the function.
%tmp8 = sub nsw i64 %tmp7, %tmp2
%tmp9 = getelementptr [0 x double], [0 x double]* %tmp4, i64 0, i64 %tmp8
store double undef, double* %tmp9, align 8
; Exit test compares the pre-increment induction variable against %tmp3, so
; the iteration with %tmp6 == %tmp3 still performs its store before leaving.
%tmp10 = icmp eq i32 %tmp6, %tmp3
%tmp11 = add i32 %tmp6, 1
br i1 %tmp10, label %bb12, label %bb5
bb12: ; preds = %bb5
ret void
}