[OpenMPOpt][HideMemTransfersLatency] Get values stored in offload arrays

getValuesInOffloadArrays goes through the offload arrays in __tgt_target_data_begin_mapper getting the values stored in them before the call is issued.

call void @__tgt_target_data_begin_mapper(arg0, arg1,
    i8** %offload_baseptrs, i8** %offload_ptrs, i64* %offload_sizes,
...)

Diferential Revision: https://reviews.llvm.org/D86300
This commit is contained in:
Hamilton Tobon Mosquera 2020-08-31 15:29:22 -05:00
parent 8e53f78912
commit b5757ca2c4
2 changed files with 242 additions and 0 deletions

View File

@ -26,6 +26,7 @@
#include "llvm/Transforms/IPO.h"
#include "llvm/Transforms/IPO/Attributor.h"
#include "llvm/Transforms/Utils/CallGraphUpdater.h"
#include "llvm/Analysis/ValueTracking.h"
using namespace llvm;
using namespace omp;
@ -379,6 +380,87 @@ struct OMPInformationCache : public InformationCache {
SmallPtrSetImpl<Kernel> &Kernels;
};
/// Used to map the values physically (in the IR) stored in an offload
/// array, to a vector in memory.
struct OffloadArray {
/// Physical array (in the IR).
AllocaInst *Array = nullptr;
/// Mapped values.
SmallVector<Value *, 8> StoredValues;
/// Last stores made in the offload array.
SmallVector<StoreInst *, 8> LastAccesses;
OffloadArray() = default;
/// Initializes the OffloadArray with the values stored in \p Array before
/// instruction \p Before is reached. Returns false if the initialization
/// fails.
/// This MUST be used immediately after the construction of the object.
bool initialize(AllocaInst &Array, Instruction &Before) {
if (!Array.getAllocatedType()->isArrayTy())
return false;
if (!getValues(Array, Before))
return false;
this->Array = &Array;
return true;
}
private:
/// Traverses the BasicBlock where \p Array is, collecting the stores made to
/// \p Array, leaving StoredValues with the values stored before the
/// instruction \p Before is reached.
bool getValues(AllocaInst &Array, Instruction &Before) {
// Initialize container.
const uint64_t NumValues =
Array.getAllocatedType()->getArrayNumElements();
StoredValues.assign(NumValues, nullptr);
LastAccesses.assign(NumValues, nullptr);
// TODO: This assumes the instruction \p Before is in the same
// BasicBlock as Array. Make it general, for any control flow graph.
BasicBlock *BB = Array.getParent();
if (BB != Before.getParent())
return false;
const DataLayout &DL = Array.getModule()->getDataLayout();
const unsigned int PointerSize = DL.getPointerSize();
for (Instruction &I : *BB) {
if (&I == &Before)
break;
if (!isa<StoreInst>(&I))
continue;
auto *S = cast<StoreInst>(&I);
int64_t Offset = -1;
auto *Dst = GetPointerBaseWithConstantOffset(S->getPointerOperand(),
Offset, DL);
if (Dst == &Array) {
int64_t Idx = Offset / PointerSize;
StoredValues[Idx] = getUnderlyingObject(S->getValueOperand());
LastAccesses[Idx] = S;
}
}
return isFilled();
}
/// Returns true if all values in StoredValues and
/// LastAccesses are not nullptrs.
bool isFilled() {
const unsigned NumValues = StoredValues.size();
for (unsigned I = 0; I < NumValues; ++I) {
if (!StoredValues[I] || !LastAccesses[I])
return false;
}
return true;
}
};
struct OpenMPOpt {
using OptimizationRemarkGetter =
@ -589,6 +671,12 @@ private:
if (!RTCall)
return false;
OffloadArray OffloadArrays[3];
if (!getValuesInOffloadArrays(*RTCall, OffloadArrays))
return false;
LLVM_DEBUG(dumpValuesInOffloadArrays(OffloadArrays));
// TODO: Check if can be moved upwards.
bool WasSplit = false;
Instruction *WaitMovementPoint = canBeMovedDownwards(*RTCall);
@ -603,6 +691,93 @@ private:
return Changed;
}
/// Maps the values stored in the offload arrays passed as arguments to
/// \p RuntimeCall into the offload arrays in \p OAs.
bool getValuesInOffloadArrays(CallInst &RuntimeCall,
MutableArrayRef<OffloadArray> OAs) {
assert(OAs.size() == 3 && "Need space for three offload arrays!");
// A runtime call that involves memory offloading looks something like:
// call void @__tgt_target_data_begin_mapper(arg0, arg1,
// i8** %offload_baseptrs, i8** %offload_ptrs, i64* %offload_sizes,
// ...)
// So, the idea is to access the allocas that allocate space for these
// offload arrays, offload_baseptrs, offload_ptrs, offload_sizes.
// Therefore:
// i8** %offload_baseptrs.
const unsigned BasePtrsArgNum = 2;
Value *BasePtrsArg = RuntimeCall.getArgOperand(BasePtrsArgNum);
// i8** %offload_ptrs.
const unsigned PtrsArgNum = 3;
Value *PtrsArg = RuntimeCall.getArgOperand(PtrsArgNum);
// i8** %offload_sizes.
const unsigned SizesArgNum = 4;
Value *SizesArg = RuntimeCall.getArgOperand(SizesArgNum);
// Get values stored in **offload_baseptrs.
auto *V = getUnderlyingObject(BasePtrsArg);
if (!isa<AllocaInst>(V))
return false;
auto *BasePtrsArray = cast<AllocaInst>(V);
if (!OAs[0].initialize(*BasePtrsArray, RuntimeCall))
return false;
// Get values stored in **offload_baseptrs.
V = getUnderlyingObject(PtrsArg);
if (!isa<AllocaInst>(V))
return false;
auto *PtrsArray = cast<AllocaInst>(V);
if (!OAs[1].initialize(*PtrsArray, RuntimeCall))
return false;
// Get values stored in **offload_sizes.
V = getUnderlyingObject(SizesArg);
// If it's a [constant] global array don't analyze it.
if (isa<GlobalValue>(V))
return isa<Constant>(V);
if (!isa<AllocaInst>(V))
return false;
auto *SizesArray = cast<AllocaInst>(V);
if (!OAs[2].initialize(*SizesArray, RuntimeCall))
return false;
return true;
}
/// Prints the values in the OffloadArrays \p OAs using LLVM_DEBUG.
/// For now this is a way to test that the function getValuesInOffloadArrays
/// is working properly.
/// TODO: Move this to a unittest when unittests are available for OpenMPOpt.
void dumpValuesInOffloadArrays(ArrayRef<OffloadArray> OAs) {
assert(OAs.size() == 3 && "There are three offload arrays to debug!");
LLVM_DEBUG(dbgs() << TAG << " Successfully got offload values:\n");
std::string ValuesStr;
raw_string_ostream Printer(ValuesStr);
std::string Separator = " --- ";
for (auto *BP : OAs[0].StoredValues) {
BP->print(Printer);
Printer << Separator;
}
LLVM_DEBUG(dbgs() << "\t\toffload_baseptrs: " << Printer.str() << "\n");
ValuesStr.clear();
for (auto *P : OAs[1].StoredValues) {
P->print(Printer);
Printer << Separator;
}
LLVM_DEBUG(dbgs() << "\t\toffload_ptrs: " << Printer.str() << "\n");
ValuesStr.clear();
for (auto *S : OAs[2].StoredValues) {
S->print(Printer);
Printer << Separator;
}
LLVM_DEBUG(dbgs() << "\t\toffload_sizes: " << Printer.str() << "\n");
}
/// Returns the instruction where the "wait" counterpart \p RuntimeCall can be
/// moved. Returns nullptr if the movement is not possible, or not worth it.
Instruction *canBeMovedDownwards(CallInst &RuntimeCall) {

View File

@ -0,0 +1,67 @@
; RUN: opt -S -passes=openmpopt -aa-pipeline=basic-aa -openmp-hide-memory-transfer-latency -debug-only=openmp-opt < %s 2>&1 | FileCheck %s
; REQUIRES: asserts
target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
@.__omp_offloading_heavyComputation.region_id = weak constant i8 0
@.offload_maptypes. = private unnamed_addr constant [2 x i64] [i64 35, i64 35]
; CHECK-LABEL: {{[^@]+}}Successfully got offload values:
; CHECK-NEXT: offload_baseptrs: double* %a --- %size.addr = alloca i32, align 4 ---
; CHECK-NEXT: offload_ptrs: double* %a --- %size.addr = alloca i32, align 4 ---
; CHECK-NEXT: offload_sizes: %0 = shl nuw nsw i64 %conv, 3 --- i64 4 ---
;int heavyComputation(double* a, unsigned size) {
; int random = rand() % 7;
;
; //#pragma omp target data map(a[0:size], size)
; void* args[2];
; args[0] = &a;
; args[1] = &size;
; __tgt_target_data_begin(..., args, ...)
;
; #pragma omp target teams
; for (int i = 0; i < size; ++i) {
; a[i] = ++a[i] * 3.141624;
; }
;
; return random;
;}
define dso_local i32 @heavyComputation(double* %a, i32 %size) {
entry:
%size.addr = alloca i32, align 4
%.offload_baseptrs = alloca [2 x i8*], align 8
%.offload_ptrs = alloca [2 x i8*], align 8
%.offload_sizes = alloca [2 x i64], align 8
store i32 %size, i32* %size.addr, align 4
%call = tail call i32 (...) @rand()
%conv = zext i32 %size to i64
%0 = shl nuw nsw i64 %conv, 3
%1 = getelementptr inbounds [2 x i8*], [2 x i8*]* %.offload_baseptrs, i64 0, i64 0
%2 = bitcast [2 x i8*]* %.offload_baseptrs to double**
store double* %a, double** %2, align 8
%3 = getelementptr inbounds [2 x i8*], [2 x i8*]* %.offload_ptrs, i64 0, i64 0
%4 = bitcast [2 x i8*]* %.offload_ptrs to double**
store double* %a, double** %4, align 8
%5 = getelementptr inbounds [2 x i64], [2 x i64]* %.offload_sizes, i64 0, i64 0
store i64 %0, i64* %5, align 8
%6 = getelementptr inbounds [2 x i8*], [2 x i8*]* %.offload_baseptrs, i64 0, i64 1
%7 = bitcast i8** %6 to i32**
store i32* %size.addr, i32** %7, align 8
%8 = getelementptr inbounds [2 x i8*], [2 x i8*]* %.offload_ptrs, i64 0, i64 1
%9 = bitcast i8** %8 to i32**
store i32* %size.addr, i32** %9, align 8
%10 = getelementptr inbounds [2 x i64], [2 x i64]* %.offload_sizes, i64 0, i64 1
store i64 4, i64* %10, align 8
call void @__tgt_target_data_begin_mapper(i64 -1, i32 2, i8** nonnull %1, i8** nonnull %3, i64* nonnull %5, i64* getelementptr inbounds ([2 x i64], [2 x i64]* @.offload_maptypes., i64 0, i64 0), i8** null)
%rem = srem i32 %call, 7
call void @__tgt_target_data_end_mapper(i64 -1, i32 2, i8** nonnull %1, i8** nonnull %3, i64* nonnull %5, i64* getelementptr inbounds ([2 x i64], [2 x i64]* @.offload_maptypes., i64 0, i64 0), i8** null)
ret i32 %rem
}
declare void @__tgt_target_data_begin_mapper(i64, i32, i8**, i8**, i64*, i64*, i8**)
declare void @__tgt_target_data_end_mapper(i64, i32, i8**, i8**, i64*, i64*, i8**)
declare dso_local i32 @rand(...)