mirror of
https://github.com/capstone-engine/llvm-capstone.git
synced 2024-11-30 17:21:10 +00:00
Revert "[mlir][gpu] Refactor ConvertGpuLaunchFuncToCudaCalls pass."
This reverts commit cdb6f05e2d.
The build is broken with:
You have called ADD_LIBRARY for library obj.MLIRGPUtoCUDATransforms without any source files. This typically indicates a problem with your CMakeLists.txt file
This commit is contained in:
parent
c32d695b09
commit
5c3ebd7725
@ -1,36 +0,0 @@
|
||||
//===- GPUCommonPass.h - MLIR GPU runtime support -------------------------===//
|
||||
//
|
||||
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
||||
// See https://llvm.org/LICENSE.txt for license information.
|
||||
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
#ifndef MLIR_CONVERSION_GPUCOMMON_GPUCOMMONPASS_H_
|
||||
#define MLIR_CONVERSION_GPUCOMMON_GPUCOMMONPASS_H_
|
||||
|
||||
#include "mlir/Support/LLVM.h"
|
||||
#include <functional>
|
||||
#include <memory>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
namespace mlir {
|
||||
|
||||
class Location;
|
||||
class ModuleOp;
|
||||
|
||||
template <typename T>
|
||||
class OperationPass;
|
||||
|
||||
/// Creates a pass to convert a gpu.launch_func operation into a sequence of
|
||||
/// GPU runtime calls.
|
||||
///
|
||||
/// This pass does not generate code to call GPU runtime APIs directly but
|
||||
/// instead uses a small wrapper library that exports a stable and conveniently
|
||||
/// typed ABI on top of GPU runtimes such as CUDA or ROCm (HIP).
|
||||
std::unique_ptr<OperationPass<ModuleOp>>
|
||||
createConvertGpuLaunchFuncToGpuRuntimeCallsPass();
|
||||
|
||||
} // namespace mlir
|
||||
|
||||
#endif // MLIR_CONVERSION_GPUCOMMON_GPUCOMMONPASS_H_
|
@ -45,6 +45,15 @@ using CubinGenerator =
|
||||
std::unique_ptr<OperationPass<gpu::GPUModuleOp>>
|
||||
createConvertGPUKernelToCubinPass(CubinGenerator cubinGenerator);
|
||||
|
||||
/// Creates a pass to convert a gpu.launch_func operation into a sequence of
|
||||
/// CUDA calls.
|
||||
///
|
||||
/// This pass does not generate code to call CUDA directly but instead uses a
|
||||
/// small wrapper library that exports a stable and conveniently typed ABI
|
||||
/// on top of CUDA.
|
||||
std::unique_ptr<OperationPass<ModuleOp>>
|
||||
createConvertGpuLaunchFuncToCudaCallsPass();
|
||||
|
||||
} // namespace mlir
|
||||
|
||||
#endif // MLIR_CONVERSION_GPUTOCUDA_GPUTOCUDAPASS_H_
|
||||
|
@ -79,18 +79,12 @@ def ConvertAVX512ToLLVM : Pass<"convert-avx512-to-llvm", "ModuleOp"> {
|
||||
}
|
||||
|
||||
//===----------------------------------------------------------------------===//
|
||||
// GPUCommon
|
||||
// GPUToCUDA
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
def ConvertGpuLaunchFuncToGpuRuntimeCalls : Pass<"launch-func-to-gpu-runtime",
|
||||
"ModuleOp"> {
|
||||
let summary = "Convert all launch_func ops to GPU runtime calls";
|
||||
let constructor = "mlir::createConvertGpuLaunchFuncToGpuRuntimeCallsPass()";
|
||||
let options = [
|
||||
Option<"gpuBinaryAnnotation", "gpu-binary-annotation", "std::string",
|
||||
"\"nvvm.cubin\"",
|
||||
"Annotation attribute string for GPU binary">,
|
||||
];
|
||||
def ConvertGpuLaunchFuncToCudaCalls : Pass<"launch-func-to-cuda", "ModuleOp"> {
|
||||
let summary = "Convert all launch_func ops to CUDA runtime calls";
|
||||
let constructor = "mlir::createConvertGpuLaunchFuncToCudaCallsPass()";
|
||||
}
|
||||
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
@ -15,7 +15,6 @@
|
||||
#define MLIR_INITALLPASSES_H_
|
||||
|
||||
#include "mlir/Conversion/AVX512ToLLVM/ConvertAVX512ToLLVM.h"
|
||||
#include "mlir/Conversion/GPUCommon/GPUCommonPass.h"
|
||||
#include "mlir/Conversion/GPUToCUDA/GPUToCUDAPass.h"
|
||||
#include "mlir/Conversion/GPUToNVVM/GPUToNVVMPass.h"
|
||||
#include "mlir/Conversion/GPUToROCDL/GPUToROCDLPass.h"
|
||||
|
@ -1,6 +1,5 @@
|
||||
add_subdirectory(AffineToStandard)
|
||||
add_subdirectory(AVX512ToLLVM)
|
||||
add_subdirectory(GPUCommon)
|
||||
add_subdirectory(GPUToCUDA)
|
||||
add_subdirectory(GPUToNVVM)
|
||||
add_subdirectory(GPUToROCDL)
|
||||
|
@ -1,21 +0,0 @@
|
||||
set(SOURCES
|
||||
ConvertLaunchFuncToRuntimeCalls.cpp
|
||||
)
|
||||
|
||||
add_mlir_conversion_library(MLIRGPUtoGPURuntimeTransforms
|
||||
${SOURCES}
|
||||
|
||||
DEPENDS
|
||||
MLIRConversionPassIncGen
|
||||
intrinsics_gen
|
||||
|
||||
LINK_COMPONENTS
|
||||
Core
|
||||
|
||||
LINK_LIBS PUBLIC
|
||||
MLIRGPU
|
||||
MLIRIR
|
||||
MLIRLLVMIR
|
||||
MLIRPass
|
||||
MLIRSupport
|
||||
)
|
@ -2,6 +2,10 @@ set(LLVM_OPTIONAL_SOURCES
|
||||
ConvertKernelFuncToCubin.cpp
|
||||
)
|
||||
|
||||
set(SOURCES
|
||||
ConvertLaunchFuncToCudaCalls.cpp
|
||||
)
|
||||
|
||||
if (MLIR_CUDA_CONVERSIONS_ENABLED)
|
||||
list(APPEND SOURCES "ConvertKernelFuncToCubin.cpp")
|
||||
set(NVPTX_LIBS
|
||||
|
@ -1,4 +1,4 @@
|
||||
//===- ConvertLaunchFuncToGpuRuntimeCalls.cpp - MLIR GPU lowering passes --===//
|
||||
//===- ConvertLaunchFuncToCudaCalls.cpp - MLIR CUDA lowering passes -------===//
|
||||
//
|
||||
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
||||
// See https://llvm.org/LICENSE.txt for license information.
|
||||
@ -7,13 +7,13 @@
|
||||
//===----------------------------------------------------------------------===//
|
||||
//
|
||||
// This file implements a pass to convert gpu.launch_func op into a sequence of
|
||||
// GPU runtime calls. As most GPU runtimes do not have a stable published
|
||||
// ABI, this pass uses a slim runtime layer that builds on top of the public
|
||||
// API from GPU runtime headers.
|
||||
// CUDA runtime calls. As the CUDA runtime does not have a stable published ABI,
|
||||
// this pass uses a slim runtime layer that builds on top of the public API from
|
||||
// the CUDA headers.
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
#include "mlir/Conversion/GPUCommon/GPUCommonPass.h"
|
||||
#include "mlir/Conversion/GPUToCUDA/GPUToCUDAPass.h"
|
||||
|
||||
#include "../PassDetail.h"
|
||||
#include "mlir/Dialect/GPU/GPUDialect.h"
|
||||
@ -35,34 +35,33 @@
|
||||
using namespace mlir;
|
||||
|
||||
// To avoid name mangling, these are defined in the mini-runtime file.
|
||||
static constexpr const char *kGpuModuleLoadName = "mgpuModuleLoad";
|
||||
static constexpr const char *kGpuModuleGetFunctionName =
|
||||
"mgpuModuleGetFunction";
|
||||
static constexpr const char *kGpuLaunchKernelName = "mgpuLaunchKernel";
|
||||
static constexpr const char *kGpuGetStreamHelperName = "mgpuGetStreamHelper";
|
||||
static constexpr const char *kGpuStreamSynchronizeName =
|
||||
"mgpuStreamSynchronize";
|
||||
static constexpr const char *kGpuMemHostRegisterName = "mgpuMemHostRegister";
|
||||
static constexpr const char *kGpuBinaryStorageSuffix = "_gpubin_cst";
|
||||
static constexpr const char *cuModuleLoadName = "mcuModuleLoad";
|
||||
static constexpr const char *cuModuleGetFunctionName = "mcuModuleGetFunction";
|
||||
static constexpr const char *cuLaunchKernelName = "mcuLaunchKernel";
|
||||
static constexpr const char *cuGetStreamHelperName = "mcuGetStreamHelper";
|
||||
static constexpr const char *cuStreamSynchronizeName = "mcuStreamSynchronize";
|
||||
static constexpr const char *kMcuMemHostRegister = "mcuMemHostRegister";
|
||||
|
||||
static constexpr const char *kCubinAnnotation = "nvvm.cubin";
|
||||
static constexpr const char *kCubinStorageSuffix = "_cubin_cst";
|
||||
|
||||
namespace {
|
||||
|
||||
/// A pass to convert gpu.launch_func operations into a sequence of GPU
|
||||
/// runtime calls. Currently it supports CUDA and ROCm (HIP).
|
||||
/// A pass to convert gpu.launch_func operations into a sequence of CUDA
|
||||
/// runtime calls.
|
||||
///
|
||||
/// In essence, a gpu.launch_func operation gets compiled into the following
|
||||
/// sequence of runtime calls:
|
||||
///
|
||||
/// * moduleLoad -- loads the module given the cubin / hsaco data
|
||||
/// * moduleGetFunction -- gets a handle to the actual kernel function
|
||||
/// * getStreamHelper -- initializes a new compute stream on GPU
|
||||
/// * launchKernel -- launches the kernel on a stream
|
||||
/// * streamSynchronize -- waits for operations on the stream to finish
|
||||
/// * mcuModuleLoad -- loads the module given the cubin data
|
||||
/// * mcuModuleGetFunction -- gets a handle to the actual kernel function
|
||||
/// * mcuGetStreamHelper -- initializes a new CUDA stream
|
||||
/// * mcuLaunchKernel -- launches the kernel on a stream
|
||||
/// * mcuStreamSynchronize -- waits for operations on the stream to finish
|
||||
///
|
||||
/// Intermediate data structures are allocated on the stack.
|
||||
class GpuLaunchFuncToGpuRuntimeCallsPass
|
||||
: public ConvertGpuLaunchFuncToGpuRuntimeCallsBase<
|
||||
GpuLaunchFuncToGpuRuntimeCallsPass> {
|
||||
class GpuLaunchFuncToCudaCallsPass
|
||||
: public ConvertGpuLaunchFuncToCudaCallsBase<GpuLaunchFuncToCudaCallsPass> {
|
||||
private:
|
||||
LLVM::LLVMDialect *getLLVMDialect() { return llvmDialect; }
|
||||
|
||||
@ -100,9 +99,8 @@ private:
|
||||
getLLVMDialect(), module.getDataLayout().getPointerSizeInBits());
|
||||
}
|
||||
|
||||
LLVM::LLVMType getGpuRuntimeResultType() {
|
||||
// This is declared as an enum in both CUDA and ROCm (HIP), but helpers
|
||||
// use i32.
|
||||
LLVM::LLVMType getCUResultType() {
|
||||
// This is declared as an enum in CUDA but helpers use i32.
|
||||
return getInt32Type();
|
||||
}
|
||||
|
||||
@ -114,7 +112,7 @@ private:
|
||||
/*alignment=*/0);
|
||||
}
|
||||
|
||||
void declareGpuRuntimeFunctions(Location loc);
|
||||
void declareCudaFunctions(Location loc);
|
||||
void addParamToList(OpBuilder &builder, Location loc, Value param, Value list,
|
||||
unsigned pos, Value one);
|
||||
Value setupParamsArray(gpu::LaunchFuncOp launchOp, OpBuilder &builder);
|
||||
@ -134,7 +132,7 @@ public:
|
||||
[this](mlir::gpu::LaunchFuncOp op) { translateGpuLaunchCalls(op); });
|
||||
|
||||
// GPU kernel modules are no longer necessary since we have a global
|
||||
// constant with the CUBIN, or HSACO data.
|
||||
// constant with the CUBIN data.
|
||||
for (auto m :
|
||||
llvm::make_early_inc_range(getOperation().getOps<gpu::GPUModuleOp>()))
|
||||
m.erase();
|
||||
@ -153,31 +151,30 @@ private:
|
||||
|
||||
} // anonymous namespace
|
||||
|
||||
// Adds declarations for the needed helper functions from the runtime wrappers.
|
||||
// Adds declarations for the needed helper functions from the CUDA wrapper.
|
||||
// The types in comments give the actual types expected/returned but the API
|
||||
// uses void pointers. This is fine as they have the same linkage in C.
|
||||
void GpuLaunchFuncToGpuRuntimeCallsPass::declareGpuRuntimeFunctions(
|
||||
Location loc) {
|
||||
void GpuLaunchFuncToCudaCallsPass::declareCudaFunctions(Location loc) {
|
||||
ModuleOp module = getOperation();
|
||||
OpBuilder builder(module.getBody()->getTerminator());
|
||||
if (!module.lookupSymbol(kGpuModuleLoadName)) {
|
||||
if (!module.lookupSymbol(cuModuleLoadName)) {
|
||||
builder.create<LLVM::LLVMFuncOp>(
|
||||
loc, kGpuModuleLoadName,
|
||||
loc, cuModuleLoadName,
|
||||
LLVM::LLVMType::getFunctionTy(
|
||||
getGpuRuntimeResultType(),
|
||||
getCUResultType(),
|
||||
{
|
||||
getPointerPointerType(), /* CUmodule *module */
|
||||
getPointerType() /* void *cubin */
|
||||
},
|
||||
/*isVarArg=*/false));
|
||||
}
|
||||
if (!module.lookupSymbol(kGpuModuleGetFunctionName)) {
|
||||
if (!module.lookupSymbol(cuModuleGetFunctionName)) {
|
||||
// The helper uses void* instead of CUDA's opaque CUmodule and
|
||||
// CUfunction, or ROCm (HIP)'s opaque hipModule_t and hipFunction_t.
|
||||
// CUfunction.
|
||||
builder.create<LLVM::LLVMFuncOp>(
|
||||
loc, kGpuModuleGetFunctionName,
|
||||
loc, cuModuleGetFunctionName,
|
||||
LLVM::LLVMType::getFunctionTy(
|
||||
getGpuRuntimeResultType(),
|
||||
getCUResultType(),
|
||||
{
|
||||
getPointerPointerType(), /* void **function */
|
||||
getPointerType(), /* void *module */
|
||||
@ -185,15 +182,15 @@ void GpuLaunchFuncToGpuRuntimeCallsPass::declareGpuRuntimeFunctions(
|
||||
},
|
||||
/*isVarArg=*/false));
|
||||
}
|
||||
if (!module.lookupSymbol(kGpuLaunchKernelName)) {
|
||||
// Unlike the CUDA or ROCm (HIP) API, the wrappers use intptr_t to
|
||||
// match the LLVM type of MLIR's index type, which the GPU dialect uses.
|
||||
if (!module.lookupSymbol(cuLaunchKernelName)) {
|
||||
// Unlike the CUDA API, the wrappers use intptr_t to match the
|
||||
// LLVM type of MLIR's index type, which the GPU dialect uses.
|
||||
// Furthermore, they use void* instead of CUDA's opaque CUfunction and
|
||||
// CUstream, or ROCm (HIP)'s opaque hipFunction_t and hipStream_t.
|
||||
// CUstream.
|
||||
builder.create<LLVM::LLVMFuncOp>(
|
||||
loc, kGpuLaunchKernelName,
|
||||
loc, cuLaunchKernelName,
|
||||
LLVM::LLVMType::getFunctionTy(
|
||||
getGpuRuntimeResultType(),
|
||||
getCUResultType(),
|
||||
{
|
||||
getPointerType(), /* void* f */
|
||||
getIntPtrType(), /* intptr_t gridXDim */
|
||||
@ -209,23 +206,23 @@ void GpuLaunchFuncToGpuRuntimeCallsPass::declareGpuRuntimeFunctions(
|
||||
},
|
||||
/*isVarArg=*/false));
|
||||
}
|
||||
if (!module.lookupSymbol(kGpuGetStreamHelperName)) {
|
||||
// Helper function to get the current GPU compute stream. Uses void*
|
||||
// instead of CUDA's opaque CUstream, or ROCm (HIP)'s opaque hipStream_t.
|
||||
if (!module.lookupSymbol(cuGetStreamHelperName)) {
|
||||
// Helper function to get the current CUDA stream. Uses void* instead of
|
||||
// CUDA's opaque CUstream.
|
||||
builder.create<LLVM::LLVMFuncOp>(
|
||||
loc, kGpuGetStreamHelperName,
|
||||
loc, cuGetStreamHelperName,
|
||||
LLVM::LLVMType::getFunctionTy(getPointerType(), /*isVarArg=*/false));
|
||||
}
|
||||
if (!module.lookupSymbol(kGpuStreamSynchronizeName)) {
|
||||
if (!module.lookupSymbol(cuStreamSynchronizeName)) {
|
||||
builder.create<LLVM::LLVMFuncOp>(
|
||||
loc, kGpuStreamSynchronizeName,
|
||||
LLVM::LLVMType::getFunctionTy(getGpuRuntimeResultType(),
|
||||
loc, cuStreamSynchronizeName,
|
||||
LLVM::LLVMType::getFunctionTy(getCUResultType(),
|
||||
getPointerType() /* CUstream stream */,
|
||||
/*isVarArg=*/false));
|
||||
}
|
||||
if (!module.lookupSymbol(kGpuMemHostRegisterName)) {
|
||||
if (!module.lookupSymbol(kMcuMemHostRegister)) {
|
||||
builder.create<LLVM::LLVMFuncOp>(
|
||||
loc, kGpuMemHostRegisterName,
|
||||
loc, kMcuMemHostRegister,
|
||||
LLVM::LLVMType::getFunctionTy(getVoidType(),
|
||||
{
|
||||
getPointerType(), /* void *ptr */
|
||||
@ -246,11 +243,10 @@ void GpuLaunchFuncToGpuRuntimeCallsPass::declareGpuRuntimeFunctions(
|
||||
/// This is necessary to construct the list of arguments passed to the kernel
|
||||
/// function as accepted by cuLaunchKernel, i.e. as a void** that points to list
|
||||
/// of stack-allocated type-erased pointers to the actual arguments.
|
||||
void GpuLaunchFuncToGpuRuntimeCallsPass::addParamToList(OpBuilder &builder,
|
||||
Location loc,
|
||||
Value param, Value list,
|
||||
unsigned pos,
|
||||
Value one) {
|
||||
void GpuLaunchFuncToCudaCallsPass::addParamToList(OpBuilder &builder,
|
||||
Location loc, Value param,
|
||||
Value list, unsigned pos,
|
||||
Value one) {
|
||||
auto memLocation = builder.create<LLVM::AllocaOp>(
|
||||
loc, param.getType().cast<LLVM::LLVMType>().getPointerTo(), one,
|
||||
/*alignment=*/1);
|
||||
@ -265,16 +261,16 @@ void GpuLaunchFuncToGpuRuntimeCallsPass::addParamToList(OpBuilder &builder,
|
||||
builder.create<LLVM::StoreOp>(loc, casted, gep);
|
||||
}
|
||||
|
||||
// Generates a parameters array to be used with a CUDA / ROCm (HIP) kernel
|
||||
// launch call. The arguments are extracted from the launchOp.
|
||||
// Generates a parameters array to be used with a CUDA kernel launch call. The
|
||||
// arguments are extracted from the launchOp.
|
||||
// The generated code is essentially as follows:
|
||||
//
|
||||
// %array = alloca(numparams * sizeof(void *))
|
||||
// for (i : [0, NumKernelOperands))
|
||||
// %array[i] = cast<void*>(KernelOperand[i])
|
||||
// return %array
|
||||
Value GpuLaunchFuncToGpuRuntimeCallsPass::setupParamsArray(
|
||||
gpu::LaunchFuncOp launchOp, OpBuilder &builder) {
|
||||
Value GpuLaunchFuncToCudaCallsPass::setupParamsArray(gpu::LaunchFuncOp launchOp,
|
||||
OpBuilder &builder) {
|
||||
|
||||
// Get the launch target.
|
||||
auto gpuFunc = SymbolTable::lookupNearestSymbolFrom<LLVM::LLVMFuncOp>(
|
||||
@ -342,7 +338,7 @@ Value GpuLaunchFuncToGpuRuntimeCallsPass::setupParamsArray(
|
||||
// %1 = llvm.constant (0 : index)
|
||||
// %2 = llvm.getelementptr %0[%1, %1] : !llvm<"i8*">
|
||||
// }
|
||||
Value GpuLaunchFuncToGpuRuntimeCallsPass::generateKernelNameConstant(
|
||||
Value GpuLaunchFuncToCudaCallsPass::generateKernelNameConstant(
|
||||
StringRef moduleName, StringRef name, Location loc, OpBuilder &builder) {
|
||||
// Make sure the trailing zero is included in the constant.
|
||||
std::vector<char> kernelName(name.begin(), name.end());
|
||||
@ -356,26 +352,30 @@ Value GpuLaunchFuncToGpuRuntimeCallsPass::generateKernelNameConstant(
|
||||
}
|
||||
|
||||
// Emits LLVM IR to launch a kernel function. Expects the module that contains
|
||||
// the compiled kernel function as a cubin in the 'nvvm.cubin' attribute, or a
|
||||
// hsaco in the 'rocdl.hsaco' attribute of the kernel function in the IR.
|
||||
// the compiled kernel function as a cubin in the 'nvvm.cubin' attribute of the
|
||||
// kernel function in the IR.
|
||||
// While MLIR has no global constants, also expects a cubin getter function in
|
||||
// an 'nvvm.cubingetter' attribute. Such function is expected to return a
|
||||
// pointer to the cubin blob when invoked.
|
||||
// With these given, the generated code in essence is
|
||||
//
|
||||
// %0 = call %binarygetter
|
||||
// %0 = call %cubingetter
|
||||
// %1 = alloca sizeof(void*)
|
||||
// call %moduleLoad(%2, %1)
|
||||
// call %mcuModuleLoad(%2, %1)
|
||||
// %2 = alloca sizeof(void*)
|
||||
// %3 = load %1
|
||||
// %4 = <see generateKernelNameConstant>
|
||||
// call %moduleGetFunction(%2, %3, %4)
|
||||
// %5 = call %getStreamHelper()
|
||||
// call %mcuModuleGetFunction(%2, %3, %4)
|
||||
// %5 = call %mcuGetStreamHelper()
|
||||
// %6 = load %2
|
||||
// %7 = <see setupParamsArray>
|
||||
// call %launchKernel(%6, <launchOp operands 0..5>, 0, %5, %7, nullptr)
|
||||
// call %streamSynchronize(%5)
|
||||
void GpuLaunchFuncToGpuRuntimeCallsPass::translateGpuLaunchCalls(
|
||||
// call %mcuLaunchKernel(%6, <launchOp operands 0..5>, 0, %5, %7, nullptr)
|
||||
// call %mcuStreamSynchronize(%5)
|
||||
void GpuLaunchFuncToCudaCallsPass::translateGpuLaunchCalls(
|
||||
mlir::gpu::LaunchFuncOp launchOp) {
|
||||
OpBuilder builder(launchOp);
|
||||
Location loc = launchOp.getLoc();
|
||||
declareGpuRuntimeFunctions(loc);
|
||||
declareCudaFunctions(loc);
|
||||
|
||||
auto zero = builder.create<LLVM::ConstantOp>(loc, getInt32Type(),
|
||||
builder.getI32IntegerAttr(0));
|
||||
@ -385,51 +385,51 @@ void GpuLaunchFuncToGpuRuntimeCallsPass::translateGpuLaunchCalls(
|
||||
launchOp.getKernelModuleName());
|
||||
assert(kernelModule && "expected a kernel module");
|
||||
|
||||
auto binaryAttr = kernelModule.getAttrOfType<StringAttr>(gpuBinaryAnnotation);
|
||||
if (!binaryAttr) {
|
||||
auto cubinAttr = kernelModule.getAttrOfType<StringAttr>(kCubinAnnotation);
|
||||
if (!cubinAttr) {
|
||||
kernelModule.emitOpError()
|
||||
<< "missing " << gpuBinaryAnnotation << " attribute";
|
||||
<< "missing " << kCubinAnnotation << " attribute";
|
||||
return signalPassFailure();
|
||||
}
|
||||
|
||||
SmallString<128> nameBuffer(kernelModule.getName());
|
||||
nameBuffer.append(kGpuBinaryStorageSuffix);
|
||||
nameBuffer.append(kCubinStorageSuffix);
|
||||
Value data = LLVM::createGlobalString(
|
||||
loc, builder, nameBuffer.str(), binaryAttr.getValue(),
|
||||
loc, builder, nameBuffer.str(), cubinAttr.getValue(),
|
||||
LLVM::Linkage::Internal, getLLVMDialect());
|
||||
|
||||
// Emit the load module call to load the module data. Error checking is done
|
||||
// in the called helper function.
|
||||
auto gpuModule = allocatePointer(builder, loc);
|
||||
auto gpuModuleLoad =
|
||||
getOperation().lookupSymbol<LLVM::LLVMFuncOp>(kGpuModuleLoadName);
|
||||
builder.create<LLVM::CallOp>(loc, ArrayRef<Type>{getGpuRuntimeResultType()},
|
||||
builder.getSymbolRefAttr(gpuModuleLoad),
|
||||
ArrayRef<Value>{gpuModule, data});
|
||||
auto cuModule = allocatePointer(builder, loc);
|
||||
auto cuModuleLoad =
|
||||
getOperation().lookupSymbol<LLVM::LLVMFuncOp>(cuModuleLoadName);
|
||||
builder.create<LLVM::CallOp>(loc, ArrayRef<Type>{getCUResultType()},
|
||||
builder.getSymbolRefAttr(cuModuleLoad),
|
||||
ArrayRef<Value>{cuModule, data});
|
||||
// Get the function from the module. The name corresponds to the name of
|
||||
// the kernel function.
|
||||
auto gpuOwningModuleRef =
|
||||
builder.create<LLVM::LoadOp>(loc, getPointerType(), gpuModule);
|
||||
auto cuOwningModuleRef =
|
||||
builder.create<LLVM::LoadOp>(loc, getPointerType(), cuModule);
|
||||
auto kernelName = generateKernelNameConstant(
|
||||
launchOp.getKernelModuleName(), launchOp.getKernelName(), loc, builder);
|
||||
auto gpuFunction = allocatePointer(builder, loc);
|
||||
auto gpuModuleGetFunction =
|
||||
getOperation().lookupSymbol<LLVM::LLVMFuncOp>(kGpuModuleGetFunctionName);
|
||||
auto cuFunction = allocatePointer(builder, loc);
|
||||
auto cuModuleGetFunction =
|
||||
getOperation().lookupSymbol<LLVM::LLVMFuncOp>(cuModuleGetFunctionName);
|
||||
builder.create<LLVM::CallOp>(
|
||||
loc, ArrayRef<Type>{getGpuRuntimeResultType()},
|
||||
builder.getSymbolRefAttr(gpuModuleGetFunction),
|
||||
ArrayRef<Value>{gpuFunction, gpuOwningModuleRef, kernelName});
|
||||
loc, ArrayRef<Type>{getCUResultType()},
|
||||
builder.getSymbolRefAttr(cuModuleGetFunction),
|
||||
ArrayRef<Value>{cuFunction, cuOwningModuleRef, kernelName});
|
||||
// Grab the global stream needed for execution.
|
||||
auto gpuGetStreamHelper =
|
||||
getOperation().lookupSymbol<LLVM::LLVMFuncOp>(kGpuGetStreamHelperName);
|
||||
auto gpuStream = builder.create<LLVM::CallOp>(
|
||||
auto cuGetStreamHelper =
|
||||
getOperation().lookupSymbol<LLVM::LLVMFuncOp>(cuGetStreamHelperName);
|
||||
auto cuStream = builder.create<LLVM::CallOp>(
|
||||
loc, ArrayRef<Type>{getPointerType()},
|
||||
builder.getSymbolRefAttr(gpuGetStreamHelper), ArrayRef<Value>{});
|
||||
builder.getSymbolRefAttr(cuGetStreamHelper), ArrayRef<Value>{});
|
||||
// Invoke the function with required arguments.
|
||||
auto gpuLaunchKernel =
|
||||
getOperation().lookupSymbol<LLVM::LLVMFuncOp>(kGpuLaunchKernelName);
|
||||
auto gpuFunctionRef =
|
||||
builder.create<LLVM::LoadOp>(loc, getPointerType(), gpuFunction);
|
||||
auto cuLaunchKernel =
|
||||
getOperation().lookupSymbol<LLVM::LLVMFuncOp>(cuLaunchKernelName);
|
||||
auto cuFunctionRef =
|
||||
builder.create<LLVM::LoadOp>(loc, getPointerType(), cuFunction);
|
||||
auto paramsArray = setupParamsArray(launchOp, builder);
|
||||
if (!paramsArray) {
|
||||
launchOp.emitOpError() << "cannot pass given parameters to the kernel";
|
||||
@ -438,25 +438,25 @@ void GpuLaunchFuncToGpuRuntimeCallsPass::translateGpuLaunchCalls(
|
||||
auto nullpointer =
|
||||
builder.create<LLVM::IntToPtrOp>(loc, getPointerPointerType(), zero);
|
||||
builder.create<LLVM::CallOp>(
|
||||
loc, ArrayRef<Type>{getGpuRuntimeResultType()},
|
||||
builder.getSymbolRefAttr(gpuLaunchKernel),
|
||||
ArrayRef<Value>{gpuFunctionRef, launchOp.getOperand(0),
|
||||
loc, ArrayRef<Type>{getCUResultType()},
|
||||
builder.getSymbolRefAttr(cuLaunchKernel),
|
||||
ArrayRef<Value>{cuFunctionRef, launchOp.getOperand(0),
|
||||
launchOp.getOperand(1), launchOp.getOperand(2),
|
||||
launchOp.getOperand(3), launchOp.getOperand(4),
|
||||
launchOp.getOperand(5), zero, /* sharedMemBytes */
|
||||
gpuStream.getResult(0), /* stream */
|
||||
cuStream.getResult(0), /* stream */
|
||||
paramsArray, /* kernel params */
|
||||
nullpointer /* extra */});
|
||||
// Sync on the stream to make it synchronous.
|
||||
auto gpuStreamSync =
|
||||
getOperation().lookupSymbol<LLVM::LLVMFuncOp>(kGpuStreamSynchronizeName);
|
||||
builder.create<LLVM::CallOp>(loc, ArrayRef<Type>{getGpuRuntimeResultType()},
|
||||
builder.getSymbolRefAttr(gpuStreamSync),
|
||||
ArrayRef<Value>(gpuStream.getResult(0)));
|
||||
auto cuStreamSync =
|
||||
getOperation().lookupSymbol<LLVM::LLVMFuncOp>(cuStreamSynchronizeName);
|
||||
builder.create<LLVM::CallOp>(loc, ArrayRef<Type>{getCUResultType()},
|
||||
builder.getSymbolRefAttr(cuStreamSync),
|
||||
ArrayRef<Value>(cuStream.getResult(0)));
|
||||
launchOp.erase();
|
||||
}
|
||||
|
||||
std::unique_ptr<mlir::OperationPass<mlir::ModuleOp>>
|
||||
mlir::createConvertGpuLaunchFuncToGpuRuntimeCallsPass() {
|
||||
return std::make_unique<GpuLaunchFuncToGpuRuntimeCallsPass>();
|
||||
mlir::createConvertGpuLaunchFuncToCudaCallsPass() {
|
||||
return std::make_unique<GpuLaunchFuncToCudaCallsPass>();
|
||||
}
|
@ -1,13 +1,11 @@
|
||||
// RUN: mlir-opt -allow-unregistered-dialect %s --launch-func-to-gpu-runtime="gpu-binary-annotation=nvvm.cubin" | FileCheck %s
|
||||
// RUN: mlir-opt -allow-unregistered-dialect %s --launch-func-to-gpu-runtime="gpu-binary-annotation=rocdl.hsaco" | FileCheck %s --check-prefix=ROCDL
|
||||
// RUN: mlir-opt -allow-unregistered-dialect %s --launch-func-to-cuda | FileCheck %s
|
||||
|
||||
module attributes {gpu.container_module} {
|
||||
|
||||
// CHECK: llvm.mlir.global internal constant @[[kernel_name:.*]]("kernel\00")
|
||||
// CHECK: llvm.mlir.global internal constant @[[global:.*]]("CUBIN")
|
||||
// ROCDL: llvm.mlir.global internal constant @[[global:.*]]("HSACO")
|
||||
|
||||
gpu.module @kernel_module attributes {nvvm.cubin = "CUBIN", rocdl.hsaco = "HSACO"} {
|
||||
gpu.module @kernel_module attributes {nvvm.cubin = "CUBIN"} {
|
||||
llvm.func @kernel(%arg0: !llvm.float, %arg1: !llvm<"float*">) attributes {gpu.kernel} {
|
||||
llvm.return
|
||||
}
|
||||
@ -20,15 +18,15 @@ module attributes {gpu.container_module} {
|
||||
|
||||
// CHECK: %[[addressof:.*]] = llvm.mlir.addressof @[[global]]
|
||||
// CHECK: %[[c0:.*]] = llvm.mlir.constant(0 : index)
|
||||
// CHECK: %[[binary_ptr:.*]] = llvm.getelementptr %[[addressof]][%[[c0]], %[[c0]]]
|
||||
// CHECK: %[[cubin_ptr:.*]] = llvm.getelementptr %[[addressof]][%[[c0]], %[[c0]]]
|
||||
// CHECK-SAME: -> !llvm<"i8*">
|
||||
// CHECK: %[[module_ptr:.*]] = llvm.alloca {{.*}} x !llvm<"i8*"> : (!llvm.i32) -> !llvm<"i8**">
|
||||
// CHECK: llvm.call @mgpuModuleLoad(%[[module_ptr]], %[[binary_ptr]]) : (!llvm<"i8**">, !llvm<"i8*">) -> !llvm.i32
|
||||
// CHECK: llvm.call @mcuModuleLoad(%[[module_ptr]], %[[cubin_ptr]]) : (!llvm<"i8**">, !llvm<"i8*">) -> !llvm.i32
|
||||
// CHECK: %[[func_ptr:.*]] = llvm.alloca {{.*}} x !llvm<"i8*"> : (!llvm.i32) -> !llvm<"i8**">
|
||||
// CHECK: llvm.call @mgpuModuleGetFunction(%[[func_ptr]], {{.*}}, {{.*}}) : (!llvm<"i8**">, !llvm<"i8*">, !llvm<"i8*">) -> !llvm.i32
|
||||
// CHECK: llvm.call @mgpuGetStreamHelper
|
||||
// CHECK: llvm.call @mgpuLaunchKernel
|
||||
// CHECK: llvm.call @mgpuStreamSynchronize
|
||||
// CHECK: llvm.call @mcuModuleGetFunction(%[[func_ptr]], {{.*}}, {{.*}}) : (!llvm<"i8**">, !llvm<"i8*">, !llvm<"i8*">) -> !llvm.i32
|
||||
// CHECK: llvm.call @mcuGetStreamHelper
|
||||
// CHECK: llvm.call @mcuLaunchKernel
|
||||
// CHECK: llvm.call @mcuStreamSynchronize
|
||||
"gpu.launch_func"(%cst, %cst, %cst, %cst, %cst, %cst, %0, %1) { kernel = @kernel_module::@kernel }
|
||||
: (!llvm.i64, !llvm.i64, !llvm.i64, !llvm.i64, !llvm.i64, !llvm.i64, !llvm.float, !llvm<"float*">) -> ()
|
||||
|
@ -30,7 +30,7 @@ int32_t reportErrorIfAny(CUresult result, const char *where) {
|
||||
}
|
||||
} // anonymous namespace
|
||||
|
||||
extern "C" int32_t mgpuModuleLoad(void **module, void *data) {
|
||||
extern "C" int32_t mcuModuleLoad(void **module, void *data) {
|
||||
int32_t err = reportErrorIfAny(
|
||||
cuModuleLoadData(reinterpret_cast<CUmodule *>(module), data),
|
||||
"ModuleLoad");
|
||||
@ -48,11 +48,11 @@ extern "C" int32_t mcuModuleGetFunction(void **function, void *module,
|
||||
// The wrapper uses intptr_t instead of CUDA's unsigned int to match
|
||||
// the type of MLIR's index type. This avoids the need for casts in the
|
||||
// generated MLIR code.
|
||||
extern "C" int32_t mgpuLaunchKernel(void *function, intptr_t gridX,
|
||||
intptr_t gridY, intptr_t gridZ,
|
||||
intptr_t blockX, intptr_t blockY,
|
||||
intptr_t blockZ, int32_t smem, void *stream,
|
||||
void **params, void **extra) {
|
||||
extern "C" int32_t mcuLaunchKernel(void *function, intptr_t gridX,
|
||||
intptr_t gridY, intptr_t gridZ,
|
||||
intptr_t blockX, intptr_t blockY,
|
||||
intptr_t blockZ, int32_t smem, void *stream,
|
||||
void **params, void **extra) {
|
||||
return reportErrorIfAny(
|
||||
cuLaunchKernel(reinterpret_cast<CUfunction>(function), gridX, gridY,
|
||||
gridZ, blockX, blockY, blockZ, smem,
|
||||
@ -60,13 +60,13 @@ extern "C" int32_t mgpuLaunchKernel(void *function, intptr_t gridX,
|
||||
"LaunchKernel");
|
||||
}
|
||||
|
||||
extern "C" void *mgpuGetStreamHelper() {
|
||||
extern "C" void *mcuGetStreamHelper() {
|
||||
CUstream stream;
|
||||
reportErrorIfAny(cuStreamCreate(&stream, CU_STREAM_DEFAULT), "StreamCreate");
|
||||
return stream;
|
||||
}
|
||||
|
||||
extern "C" int32_t mgpuStreamSynchronize(void *stream) {
|
||||
extern "C" int32_t mcuStreamSynchronize(void *stream) {
|
||||
return reportErrorIfAny(
|
||||
cuStreamSynchronize(reinterpret_cast<CUstream>(stream)), "StreamSync");
|
||||
}
|
||||
@ -75,7 +75,7 @@ extern "C" int32_t mgpuStreamSynchronize(void *stream) {
|
||||
|
||||
// Allows registering a byte array with the CUDA runtime. Helpful until we have
|
||||
// transfer functions implemented.
|
||||
extern "C" void mgpuMemHostRegister(void *ptr, uint64_t sizeBytes) {
|
||||
extern "C" void mcuMemHostRegister(void *ptr, uint64_t sizeBytes) {
|
||||
reportErrorIfAny(cuMemHostRegister(ptr, sizeBytes, /*flags=*/0),
|
||||
"MemHostRegister");
|
||||
}
|
||||
@ -99,7 +99,7 @@ void mcuMemHostRegisterMemRef(T *pointer, llvm::ArrayRef<int64_t> sizes,
|
||||
assert(strides == llvm::makeArrayRef(denseStrides));
|
||||
|
||||
std::fill_n(pointer, count, value);
|
||||
mgpuMemHostRegister(pointer, count * sizeof(T));
|
||||
mcuMemHostRegister(pointer, count * sizeof(T));
|
||||
}
|
||||
|
||||
extern "C" void mcuMemHostRegisterFloat(int64_t rank, void *ptr) {
|
||||
|
@ -14,7 +14,6 @@
|
||||
|
||||
#include "llvm/ADT/STLExtras.h"
|
||||
|
||||
#include "mlir/Conversion/GPUCommon/GPUCommonPass.h"
|
||||
#include "mlir/Conversion/GPUToCUDA/GPUToCUDAPass.h"
|
||||
#include "mlir/Conversion/GPUToNVVM/GPUToNVVMPass.h"
|
||||
#include "mlir/Conversion/StandardToLLVM/ConvertStandardToLLVM.h"
|
||||
@ -116,7 +115,7 @@ static LogicalResult runMLIRPasses(ModuleOp m) {
|
||||
kernelPm.addPass(createLowerGpuOpsToNVVMOpsPass());
|
||||
kernelPm.addPass(createConvertGPUKernelToCubinPass(&compilePtxToCubin));
|
||||
pm.addPass(createLowerToLLVMPass());
|
||||
pm.addPass(createConvertGpuLaunchFuncToGpuRuntimeCallsPass());
|
||||
pm.addPass(createConvertGpuLaunchFuncToCudaCallsPass());
|
||||
|
||||
return pm.run(m);
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user