Mirror of https://github.com/capstone-engine/llvm-capstone.git, synced 2025-03-05 08:58:13 +00:00
Reland [mlir][test][gpu] Migrate CUDA tests to the TargetAttr compilation workflow (llvm#65768)
The revert happened because of a buildbot failure that threw 'CUDA_ERROR_UNSUPPORTED_PTX_VERSION'. The root cause was the pass compiling with "+ptx76" while the bot ran an old CUDA driver. This commit relands the patch with "+ptx60". Original GitHub PR: #65768

Original commit message: Migrate tests referencing `gpu-to-cubin` to the new compilation workflow using `TargetAttrs`. The `test-lower-to-nvvm` pass pipeline was modified to use the new compilation workflow so that future tests are simpler to introduce. The `createLowerGpuOpsToNVVMOpsPass` function was removed, as it did not allow passing all of the options available on the `ConvertGpuOpsToNVVMOps` pass.
This commit is contained in:
parent 2374ae4362
commit 119c489cc1
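For context, the sketch below is not part of the commit; it shows roughly how the relanded TargetAttr-based flow is assembled, using the pass-creation functions that appear in this diff (createGpuNVVMAttachTarget, createConvertGpuOpsToNVVMOps, createGpuToLLVMConversionPass, createGpuModuleToBinaryPass). The header paths, the helper name buildExampleNVVMPipeline, and the hard-coded "sm_50"/"+ptx60" values are assumptions, taken from the new defaults in TestLowerToNVVMOptions further down in this diff.

#include "mlir/Conversion/Passes.h"             // ConvertGpuOpsToNVVMOps, GpuToLLVMConversionPass (assumed header)
#include "mlir/Dialect/GPU/Transforms/Passes.h" // GpuNVVMAttachTarget, GpuModuleToBinary (assumed header)
#include "mlir/Pass/PassManager.h"

// Minimal sketch (hypothetical helper), assuming the passes above are
// registered as in this diff.
static void buildExampleNVVMPipeline(mlir::OpPassManager &pm) {
  using namespace mlir;

  // Attach an NVVM target attribute to every gpu.module; this replaces the
  // old per-module gpu-to-cubin serialization pass used by the migrated tests.
  GpuNVVMAttachTargetOptions targetOptions;
  targetOptions.triple = "nvptx64-nvidia-cuda";
  targetOptions.chip = "sm_50";      // new default in TestLowerToNVVMOptions
  targetOptions.features = "+ptx60"; // relanded with +ptx60 instead of +ptx76
  pm.addPass(createGpuNVVMAttachTarget(targetOptions));

  // Lower GPU ops to NVVM inside each gpu.module. This takes over from the
  // removed createLowerGpuOpsToNVVMOpsPass() and exposes all pass options.
  ConvertGpuOpsToNVVMOpsOptions nvvmOptions;
  nvvmOptions.indexBitwidth = 64; // example value; any supported width works
  pm.addNestedPass<gpu::GPUModuleOp>(createConvertGpuOpsToNVVMOps(nvvmOptions));

  // Lower host-side GPU ops to LLVM, then serialize every gpu.module to a
  // binary using its attached target attribute.
  pm.addPass(createGpuToLLVMConversionPass());
  pm.addPass(createGpuModuleToBinaryPass());
}

This mirrors what the modified test-lower-to-nvvm pipeline in TestLowerToNVVM.cpp does in the hunks near the end of this diff.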
@@ -16,9 +16,7 @@ namespace mlir {
class LLVMTypeConverter;
class ConversionTarget;
class RewritePatternSet;

template <typename OpT>
class OperationPass;
class Pass;

namespace gpu {
class GPUModuleOp;

@@ -45,14 +43,6 @@ void populateGpuSubgroupReduceOpLoweringPattern(LLVMTypeConverter &converter,
/// Collect a set of patterns to convert WMMA ops from GPU dialect to NVVM.
void populateGpuWMMAToNVVMConversionPatterns(LLVMTypeConverter &converter,
                                             RewritePatternSet &patterns);

/// Creates a pass that lowers GPU dialect operations to NVVM counterparts. The
/// index bitwidth used for the lowering of the device side index computations
/// is configurable.
std::unique_ptr<OperationPass<gpu::GPUModuleOp>> createLowerGpuOpsToNVVMOpsPass(
    unsigned indexBitwidth = kDeriveIndexBitwidthFromDataLayout,
    bool hasRedux = false);

} // namespace mlir

#endif // MLIR_CONVERSION_GPUTONVVM_GPUTONVVMPASS_H_
@@ -486,7 +486,6 @@ def LowerHostCodeToLLVMPass : Pass<"lower-host-to-llvm", "ModuleOp"> {

def ConvertGpuOpsToNVVMOps : Pass<"convert-gpu-to-nvvm", "gpu::GPUModuleOp"> {
  let summary = "Generate NVVM operations for gpu operations";
  let constructor = "mlir::createLowerGpuOpsToNVVMOpsPass()";
  let dependentDialects = [
    "cf::ControlFlowDialect",
    "memref::MemRefDialect",
@@ -210,11 +210,7 @@ struct GPULaneIdOpToNVVM : ConvertOpToLLVMPattern<gpu::LaneIdOp> {
/// code.
struct LowerGpuOpsToNVVMOpsPass
    : public impl::ConvertGpuOpsToNVVMOpsBase<LowerGpuOpsToNVVMOpsPass> {
  LowerGpuOpsToNVVMOpsPass() = default;
  LowerGpuOpsToNVVMOpsPass(unsigned indexBitwidth, bool hasRedux = false) {
    this->indexBitwidth = indexBitwidth;
    this->hasRedux = hasRedux;
  }
  using Base::Base;

  void runOnOperation() override {
    gpu::GPUModuleOp m = getOperation();

@@ -378,8 +374,3 @@ void mlir::populateGpuToNVVMConversionPatterns(LLVMTypeConverter &converter,
                                          "__nv_tanh");
  populateOpPatterns<math::TanOp>(converter, patterns, "__nv_tanf", "__nv_tan");
}

std::unique_ptr<OperationPass<gpu::GPUModuleOp>>
mlir::createLowerGpuOpsToNVVMOpsPass(unsigned indexBitwidth, bool hasRedux) {
  return std::make_unique<LowerGpuOpsToNVVMOpsPass>(indexBitwidth, hasRedux);
}
@@ -52,7 +52,7 @@ void mlir::sparse_tensor::buildSparseCompiler(
    pm.addPass(createSparseGPUCodegenPass());
    pm.addNestedPass<gpu::GPUModuleOp>(createStripDebugInfoPass());
    pm.addNestedPass<gpu::GPUModuleOp>(createConvertSCFToCFPass());
    pm.addNestedPass<gpu::GPUModuleOp>(createLowerGpuOpsToNVVMOpsPass());
    pm.addNestedPass<gpu::GPUModuleOp>(createConvertGpuOpsToNVVMOps());
  }

  // TODO(springerm): Add sparse support to the BufferDeallocation pass and add
@@ -1,6 +1,5 @@
// RUN: mlir-opt %s \
// RUN: | mlir-opt -gpu-kernel-outlining \
// RUN: | mlir-opt -pass-pipeline='builtin.module(gpu.module(strip-debuginfo,convert-gpu-to-nvvm,gpu-to-cubin{dump-ptx}))' \
// RUN: | mlir-opt -test-lower-to-nvvm -debug-only=serialize-to-isa \
// RUN: 2>&1 | FileCheck %s

// CHECK: Generated by LLVM NVPTX Back-End

@@ -2,10 +2,9 @@
// NOTE: this test requires gpu-sm80
//
// RUN: mlir-opt \
// RUN: --pass-pipeline="builtin.module(gpu.module(strip-debuginfo,convert-gpu-to-nvvm,convert-nvgpu-to-nvvm,affine-expand-index-ops,lower-affine,convert-arith-to-llvm),convert-vector-to-llvm,canonicalize,cse,gpu.module(gpu-to-cubin{chip=sm_80 features=+ptx71}))" \
// RUN: --pass-pipeline="builtin.module(gpu.module(strip-debuginfo,convert-gpu-to-nvvm,convert-nvgpu-to-nvvm,affine-expand-index-ops,lower-affine,convert-arith-to-llvm),convert-vector-to-llvm,canonicalize,cse)" \
// RUN: %s \
// RUN: | mlir-opt --convert-vector-to-scf --convert-scf-to-cf -convert-cf-to-llvm --convert-vector-to-llvm \
// RUN: --convert-arith-to-llvm --gpu-to-llvm --reconcile-unrealized-casts \
// RUN: | mlir-opt --test-lower-to-nvvm="cubin-chip=sm_80 cubin-features=+ptx71" \
// RUN: | mlir-cpu-runner \
// RUN: --shared-libs=%mlir_cuda_runtime \
// RUN: --shared-libs=%mlir_c_runner_utils \

@@ -1,9 +1,7 @@
// RUN: mlir-opt %s -test-vector-warp-distribute="hoist-uniform distribute-transfer-write propagate-distribution" -canonicalize |\
// RUN: mlir-opt -test-vector-warp-distribute=rewrite-warp-ops-to-scf-if |\
// RUN: mlir-opt -lower-affine -convert-vector-to-scf -convert-scf-to-cf -convert-vector-to-llvm \
// RUN: -convert-arith-to-llvm -gpu-kernel-outlining |\
// RUN: mlir-opt -pass-pipeline='builtin.module(gpu.module(strip-debuginfo,convert-gpu-to-nvvm,reconcile-unrealized-casts,gpu-to-cubin))' |\
// RUN: mlir-opt -gpu-to-llvm -reconcile-unrealized-casts |\
// RUN: -convert-arith-to-llvm -test-lower-to-nvvm | \
// RUN: mlir-cpu-runner -e main -entry-point-result=void \
// RUN: -shared-libs=%mlir_cuda_runtime \
// RUN: -shared-libs=%mlir_c_runner_utils \
@@ -2,9 +2,7 @@
// everything on the same thread.
// RUN: mlir-opt %s -test-vector-warp-distribute=rewrite-warp-ops-to-scf-if -canonicalize | \
// RUN: mlir-opt -convert-vector-to-scf -convert-scf-to-cf -convert-cf-to-llvm -convert-vector-to-llvm -convert-arith-to-llvm \
// RUN: -gpu-kernel-outlining |\
// RUN: mlir-opt -pass-pipeline='builtin.module(gpu.module(strip-debuginfo,convert-gpu-to-nvvm,reconcile-unrealized-casts,gpu-to-cubin))' |\
// RUN: mlir-opt -gpu-to-llvm -reconcile-unrealized-casts |\
// RUN: -test-lower-to-nvvm | \
// RUN: mlir-cpu-runner -e main -entry-point-result=void \
// RUN: -shared-libs=%mlir_cuda_runtime \
// RUN: -shared-libs=%mlir_c_runner_utils \

@@ -15,9 +13,7 @@
// RUN: mlir-opt %s -test-vector-warp-distribute="hoist-uniform distribute-transfer-write" \
// RUN: -test-vector-warp-distribute=rewrite-warp-ops-to-scf-if -canonicalize | \
// RUN: mlir-opt -convert-vector-to-scf -convert-scf-to-cf -convert-cf-to-llvm -convert-vector-to-llvm -convert-arith-to-llvm \
// RUN: -gpu-kernel-outlining |\
// RUN: mlir-opt -pass-pipeline='builtin.module(gpu.module(strip-debuginfo,convert-gpu-to-nvvm,reconcile-unrealized-casts,gpu-to-cubin))' |\
// RUN: mlir-opt -gpu-to-llvm -reconcile-unrealized-casts |\
// RUN: -test-lower-to-nvvm | \
// RUN: mlir-cpu-runner -e main -entry-point-result=void \
// RUN: -shared-libs=%mlir_cuda_runtime \
// RUN: -shared-libs=%mlir_c_runner_utils \

@@ -27,9 +23,7 @@
// RUN: mlir-opt %s -test-vector-warp-distribute="hoist-uniform distribute-transfer-write propagate-distribution" \
// RUN: -test-vector-warp-distribute=rewrite-warp-ops-to-scf-if -canonicalize | \
// RUN: mlir-opt -convert-vector-to-scf -convert-scf-to-cf -convert-cf-to-llvm -convert-vector-to-llvm -convert-arith-to-llvm \
// RUN: -gpu-kernel-outlining |\
// RUN: mlir-opt -pass-pipeline='builtin.module(gpu.module(strip-debuginfo,convert-gpu-to-nvvm,reconcile-unrealized-casts,gpu-to-cubin))' |\
// RUN: mlir-opt -gpu-to-llvm -reconcile-unrealized-casts |\
// RUN: -test-lower-to-nvvm | \
// RUN: mlir-cpu-runner -e main -entry-point-result=void \
// RUN: -shared-libs=%mlir_cuda_runtime \
// RUN: -shared-libs=%mlir_c_runner_utils \
@@ -1,7 +1,5 @@
// RUN: mlir-opt %s \
// RUN: | mlir-opt -gpu-kernel-outlining \
// RUN: | mlir-opt -pass-pipeline='builtin.module(gpu.module(strip-debuginfo,convert-gpu-to-nvvm,gpu-to-cubin{chip=sm_70}))' \
// RUN: | mlir-opt --convert-scf-to-cf -gpu-to-llvm \
// RUN: | mlir-opt -test-lower-to-nvvm="cubin-chip=sm_70" \
// RUN: | mlir-cpu-runner \
// RUN: --shared-libs=%mlir_cuda_runtime \
// RUN: --shared-libs=%mlir_runner_utils \
@@ -3,9 +3,7 @@
// Similar to the wmma-matmul-f32 but with the memref bare pointer lowering convention.
// This test also uses gpu.memcpy operations (instead of gpu.host_register).
// RUN: mlir-opt %s \
// RUN: | mlir-opt -gpu-kernel-outlining \
// RUN: | mlir-opt -pass-pipeline='builtin.module(gpu.module(strip-debuginfo,convert-gpu-to-nvvm{use-bare-ptr-memref-call-conv=1},gpu-to-cubin{chip=sm_70}))' \
// RUN: | mlir-opt --convert-scf-to-cf -gpu-to-llvm="use-bare-pointers-for-host=1 use-bare-pointers-for-kernels=1" \
// RUN: | mlir-opt -test-lower-to-nvvm="host-bare-ptr-calling-convention=1 kernel-bare-ptr-calling-convention=1 cubin-chip=sm_70" \
// RUN: | mlir-cpu-runner \
// RUN: --shared-libs=%mlir_cuda_runtime \
// RUN: --entry-point-result=void \
@@ -1,7 +1,5 @@
// RUN: mlir-opt %s \
// RUN: | mlir-opt -gpu-kernel-outlining \
// RUN: | mlir-opt -pass-pipeline='builtin.module(gpu.module(strip-debuginfo,convert-gpu-to-nvvm,gpu-to-cubin{chip=sm_70}))' \
// RUN: | mlir-opt --convert-scf-to-cf -gpu-to-llvm \
// RUN: | mlir-opt -test-lower-to-nvvm="cubin-chip=sm_70" \
// RUN: | mlir-cpu-runner \
// RUN: --shared-libs=%mlir_cuda_runtime \
// RUN: --shared-libs=%mlir_runner_utils \

@@ -1,7 +1,5 @@
// RUN: mlir-opt %s \
// RUN: | mlir-opt -gpu-kernel-outlining \
// RUN: | mlir-opt -pass-pipeline='builtin.module(gpu.module(strip-debuginfo,convert-gpu-to-nvvm,gpu-to-cubin))' \
// RUN: | mlir-opt -gpu-to-llvm \
// RUN: | mlir-opt -test-lower-to-nvvm \
// RUN: | mlir-cpu-runner \
// RUN: --shared-libs=%mlir_cuda_runtime \
// RUN: --shared-libs=%mlir_runner_utils \

@@ -10,9 +8,7 @@

// Same as above but with the memref bare pointer lowering convention.
// RUN: mlir-opt %s \
// RUN: | mlir-opt -gpu-kernel-outlining \
// RUN: | mlir-opt -pass-pipeline='builtin.module(gpu.module(strip-debuginfo,convert-gpu-to-nvvm{use-bare-ptr-memref-call-conv=1},gpu-to-cubin))' \
// RUN: | mlir-opt -gpu-to-llvm="use-bare-pointers-for-kernels=1" \
// RUN: | mlir-opt -test-lower-to-nvvm="kernel-bare-ptr-calling-convention=1" \
// RUN: | mlir-cpu-runner \
// RUN: --shared-libs=%mlir_cuda_runtime \
// RUN: --shared-libs=%mlir_runner_utils \
@@ -1,7 +1,5 @@
// RUN: mlir-opt %s \
// RUN: | mlir-opt -gpu-kernel-outlining \
// RUN: | mlir-opt -pass-pipeline='builtin.module(gpu.module(strip-debuginfo,convert-gpu-to-nvvm,gpu-to-cubin))' \
// RUN: | mlir-opt -gpu-to-llvm \
// RUN: | mlir-opt -test-lower-to-nvvm \
// RUN: | mlir-cpu-runner \
// RUN: --shared-libs=%mlir_cuda_runtime \
// RUN: --shared-libs=%mlir_runner_utils \

@@ -1,7 +1,5 @@
// RUN: mlir-opt %s \
// RUN: | mlir-opt -gpu-kernel-outlining \
// RUN: | mlir-opt -pass-pipeline='builtin.module(gpu.module(strip-debuginfo,convert-gpu-to-nvvm,gpu-to-cubin))' \
// RUN: | mlir-opt -gpu-to-llvm \
// RUN: | mlir-opt -test-lower-to-nvvm \
// RUN: | mlir-cpu-runner \
// RUN: --shared-libs=%mlir_cuda_runtime \
// RUN: --shared-libs=%mlir_runner_utils \

@@ -1,7 +1,5 @@
// RUN: mlir-opt %s \
// RUN: | mlir-opt -gpu-kernel-outlining \
// RUN: | mlir-opt -pass-pipeline='builtin.module(gpu.module(strip-debuginfo,convert-gpu-to-nvvm,gpu-to-cubin))' \
// RUN: | mlir-opt -gpu-to-llvm \
// RUN: | mlir-opt -test-lower-to-nvvm \
// RUN: | mlir-cpu-runner \
// RUN: --shared-libs=%mlir_cuda_runtime \
// RUN: --shared-libs=%mlir_runner_utils \
@@ -1,7 +1,5 @@
// RUN: mlir-opt %s \
// RUN: | mlir-opt -gpu-kernel-outlining \
// RUN: | mlir-opt -pass-pipeline='builtin.module(gpu.module(strip-debuginfo,convert-gpu-to-nvvm,gpu-to-cubin))' \
// RUN: | mlir-opt -gpu-to-llvm \
// RUN: | mlir-opt -test-lower-to-nvvm \
// RUN: | mlir-cpu-runner \
// RUN: --shared-libs=%mlir_cuda_runtime \
// RUN: --shared-libs=%mlir_runner_utils \

@@ -1,7 +1,5 @@
// RUN: mlir-opt %s \
// RUN: | mlir-opt -gpu-kernel-outlining \
// RUN: | mlir-opt -pass-pipeline='builtin.module(gpu.module(strip-debuginfo,convert-gpu-to-nvvm,gpu-to-cubin))' \
// RUN: | mlir-opt -gpu-to-llvm \
// RUN: | mlir-opt -test-lower-to-nvvm \
// RUN: | mlir-cpu-runner \
// RUN: --shared-libs=%mlir_cuda_runtime \
// RUN: --shared-libs=%mlir_runner_utils \

@@ -1,7 +1,5 @@
// RUN: mlir-opt %s \
// RUN: | mlir-opt -gpu-kernel-outlining \
// RUN: | mlir-opt -pass-pipeline='builtin.module(gpu.module(strip-debuginfo,convert-gpu-to-nvvm,gpu-to-cubin))' \
// RUN: | mlir-opt -gpu-to-llvm \
// RUN: | mlir-opt -test-lower-to-nvvm \
// RUN: | mlir-cpu-runner \
// RUN: --shared-libs=%mlir_cuda_runtime \
// RUN: --shared-libs=%mlir_runner_utils \
@@ -1,7 +1,7 @@
// RUN: mlir-opt %s \
// RUN: | mlir-opt -gpu-kernel-outlining \
// RUN: | mlir-opt -pass-pipeline='builtin.module(gpu.module(strip-debuginfo,convert-gpu-to-nvvm,gpu-to-cubin))' \
// RUN: | mlir-opt -gpu-async-region -gpu-to-llvm \
// RUN: | mlir-opt -pass-pipeline='builtin.module(gpu.module(strip-debuginfo,convert-gpu-to-nvvm),nvvm-attach-target)' \
// RUN: | mlir-opt -gpu-async-region -gpu-to-llvm -gpu-module-to-binary \
// RUN: | mlir-opt -async-to-async-runtime -async-runtime-ref-counting \
// RUN: | mlir-opt -convert-async-to-llvm -convert-func-to-llvm \
// RUN: | mlir-cpu-runner \

@@ -1,8 +1,5 @@
// RUN: mlir-opt %s \
// RUN: | mlir-opt -gpu-kernel-outlining \
// RUN: | mlir-opt -convert-vector-to-scf -convert-scf-to-cf -convert-cf-to-llvm -convert-vector-to-llvm -convert-arith-to-llvm \
// RUN: | mlir-opt -pass-pipeline='builtin.module(gpu.module(strip-debuginfo,convert-gpu-to-nvvm,gpu-to-cubin))' \
// RUN: | mlir-opt -gpu-to-llvm -reconcile-unrealized-casts \
// RUN: | mlir-opt -test-lower-to-nvvm \
// RUN: | mlir-cpu-runner \
// RUN: --shared-libs=%mlir_cuda_runtime \
// RUN: --shared-libs=%mlir_runner_utils \
@@ -1,7 +1,5 @@
// RUN: mlir-opt %s \
// RUN: | mlir-opt -gpu-kernel-outlining \
// RUN: | mlir-opt -pass-pipeline='builtin.module(gpu.module(strip-debuginfo,convert-gpu-to-nvvm,gpu-to-cubin))' \
// RUN: | mlir-opt -gpu-to-llvm \
// RUN: | mlir-opt -test-lower-to-nvvm \
// RUN: | mlir-cpu-runner \
// RUN: --shared-libs=%mlir_cuda_runtime \
// RUN: --shared-libs=%mlir_runner_utils \

@@ -1,6 +1,5 @@
// RUN: mlir-opt %s \
// RUN: | mlir-opt -pass-pipeline='builtin.module(gpu.module(strip-debuginfo,convert-gpu-to-nvvm,gpu-to-cubin))' \
// RUN: | mlir-opt -gpu-to-llvm \
// RUN: | mlir-opt -test-lower-to-nvvm \
// RUN: | mlir-cpu-runner \
// RUN: --shared-libs=%mlir_cuda_runtime \
// RUN: --shared-libs=%mlir_runner_utils \
@@ -1,7 +1,5 @@
// RUN: mlir-opt %s \
// RUN: | mlir-opt -gpu-kernel-outlining \
// RUN: | mlir-opt -pass-pipeline='builtin.module(gpu.module(strip-debuginfo,convert-gpu-to-nvvm,gpu-to-cubin))' \
// RUN: | mlir-opt -gpu-to-llvm \
// RUN: | mlir-opt -test-lower-to-nvvm \
// RUN: | mlir-cpu-runner \
// RUN: --shared-libs=%mlir_cuda_runtime \
// RUN: --shared-libs=%mlir_runner_utils \

@@ -1,7 +1,5 @@
// RUN: mlir-opt %s \
// RUN: | mlir-opt -gpu-kernel-outlining \
// RUN: | mlir-opt -pass-pipeline='builtin.module(gpu.module(strip-debuginfo,convert-gpu-to-nvvm,gpu-to-cubin))' \
// RUN: | mlir-opt -gpu-to-llvm \
// RUN: | mlir-opt -test-lower-to-nvvm \
// RUN: | mlir-cpu-runner \
// RUN: --shared-libs=%mlir_cuda_runtime \
// RUN: --shared-libs=%mlir_runner_utils \
@@ -65,11 +65,11 @@ struct TestLowerToNVVMOptions
      llvm::cl::init("nvptx64-nvidia-cuda")};
  PassOptions::Option<std::string> cubinChip{
      *this, "cubin-chip", llvm::cl::desc("Chip to use to serialize to cubin."),
      llvm::cl::init("sm_80")};
      llvm::cl::init("sm_50")};
  PassOptions::Option<std::string> cubinFeatures{
      *this, "cubin-features",
      llvm::cl::desc("Features to use to serialize to cubin."),
      llvm::cl::init("+ptx76")};
      llvm::cl::init("+ptx60")};
};

//===----------------------------------------------------------------------===//

@@ -126,13 +126,14 @@ void buildGpuPassPipeline(OpPassManager &pm,

  // TODO: C++20 designated initializers.
  // The following pass is inconsistent.
  // ConvertGpuOpsToNVVMOpsOptions convertGpuOpsToNVVMOpsOptions;
  // convertGpuOpsToNVVMOpsOptions.indexBitwidth =
  // options.kernelIndexBitWidth;
  // TODO: fix inconsistence.
  ConvertGpuOpsToNVVMOpsOptions convertGpuOpsToNVVMOpsOptions;
  convertGpuOpsToNVVMOpsOptions.useBarePtrCallConv =
      options.kernelUseBarePtrCallConv;
  convertGpuOpsToNVVMOpsOptions.indexBitwidth = options.kernelIndexBitWidth;
  convertGpuOpsToNVVMOpsOptions.useOpaquePointers = true;
  pm.addNestedPass<gpu::GPUModuleOp>(
      // TODO: fix inconsistence.
      createLowerGpuOpsToNVVMOpsPass(/*indexBitWidth=*/
                                     options.kernelIndexBitWidth));
      createConvertGpuOpsToNVVMOps(convertGpuOpsToNVVMOpsOptions));

  // TODO: C++20 designated initializers.
  ConvertNVGPUToNVVMPassOptions convertNVGPUToNVVMPassOptions;
@@ -141,22 +142,6 @@ void buildGpuPassPipeline(OpPassManager &pm,
      createConvertNVGPUToNVVMPass(convertNVGPUToNVVMPassOptions));
  pm.addNestedPass<gpu::GPUModuleOp>(createConvertSCFToCFPass());

  // TODO: C++20 designated initializers.
  GpuToLLVMConversionPassOptions gpuToLLVMConversionOptions;
  // Note: hostBarePtrCallConv must be false for now otherwise
  // gpu::HostRegister is ill-defined: it wants unranked memrefs but can't
  // lower the to bare ptr.
  gpuToLLVMConversionOptions.hostBarePtrCallConv =
      options.hostUseBarePtrCallConv;
  gpuToLLVMConversionOptions.kernelBarePtrCallConv =
      options.kernelUseBarePtrCallConv;
  gpuToLLVMConversionOptions.useOpaquePointers = true;

  // TODO: something useful here.
  // gpuToLLVMConversionOptions.gpuBinaryAnnotation = "";
  pm.addNestedPass<gpu::GPUModuleOp>(
      createGpuToLLVMConversionPass(gpuToLLVMConversionOptions));

  // Convert vector to LLVM (always needed).
  // TODO: C++20 designated initializers.
  ConvertVectorToLLVMPassOptions convertVectorToLLVMPassOptions;

@@ -170,11 +155,6 @@ void buildGpuPassPipeline(OpPassManager &pm,

  // Finally we can reconcile unrealized casts.
  pm.addNestedPass<gpu::GPUModuleOp>(createReconcileUnrealizedCastsPass());

#if MLIR_GPU_TO_CUBIN_PASS_ENABLE
  pm.addNestedPass<gpu::GPUModuleOp>(createGpuSerializeToCubinPass(
      options.cubinTriple, options.cubinChip, options.cubinFeatures));
#endif // MLIR_GPU_TO_CUBIN_PASS_ENABLE
}

void buildLowerToNVVMPassPipeline(OpPassManager &pm,
@@ -251,6 +231,34 @@ void buildLowerToNVVMPassPipeline(OpPassManager &pm,
  //===----------------------------------------------------------------------===//
  // Host post-GPUModule-specific stuff.
  //===----------------------------------------------------------------------===//
  // Attach an NVVM target to all the GPU modules with the provided target
  // options.
  // TODO: C++20 designated initializers.
  GpuNVVMAttachTargetOptions nvvmTargetOptions;
  nvvmTargetOptions.triple = options.cubinTriple;
  nvvmTargetOptions.chip = options.cubinChip;
  nvvmTargetOptions.features = options.cubinFeatures;
  pm.addPass(createGpuNVVMAttachTarget(nvvmTargetOptions));

  // Convert GPU to LLVM.
  // TODO: C++20 designated initializers.
  GpuToLLVMConversionPassOptions gpuToLLVMConversionOptions;
  // Note: hostBarePtrCallConv must be false for now otherwise
  // gpu::HostRegister is ill-defined: it wants unranked memrefs but can't
  // lower the to bare ptr.
  gpuToLLVMConversionOptions.hostBarePtrCallConv =
      options.hostUseBarePtrCallConv;
  gpuToLLVMConversionOptions.kernelBarePtrCallConv =
      options.kernelUseBarePtrCallConv;
  gpuToLLVMConversionOptions.useOpaquePointers = true;

  // TODO: something useful here.
  // gpuToLLVMConversionOptions.gpuBinaryAnnotation = "";
  pm.addPass(createGpuToLLVMConversionPass(gpuToLLVMConversionOptions));

  // Serialize all GPU modules to binaries.
  pm.addPass(createGpuModuleToBinaryPass());

  // Convert vector to LLVM (always needed).
  // TODO: C++20 designated initializers.
  ConvertVectorToLLVMPassOptions convertVectorToLLVMPassOptions;

@@ -265,22 +273,6 @@ void buildLowerToNVVMPassPipeline(OpPassManager &pm,
  convertIndexToLLVMPassOpt3.indexBitwidth = options.hostIndexBitWidth;
  pm.addPass(createConvertIndexToLLVMPass(convertIndexToLLVMPassOpt3));

  // This must happen after cubin translation otherwise gpu.launch_func is
  // illegal if no cubin annotation is present.
  // TODO: C++20 designated initializers.
  GpuToLLVMConversionPassOptions gpuToLLVMConversionOptions;
  // Note: hostBarePtrCallConv must be false for now otherwise
  // gpu::HostRegister is ill-defined: it wants unranked memrefs but can't
  // lower the to bare ptr.
  gpuToLLVMConversionOptions.hostBarePtrCallConv =
      options.hostUseBarePtrCallConv;
  gpuToLLVMConversionOptions.kernelBarePtrCallConv =
      options.kernelUseBarePtrCallConv;
  gpuToLLVMConversionOptions.useOpaquePointers = true;
  // TODO: something useful here.
  // gpuToLLVMConversionOptions.gpuBinaryAnnotation = "";
  pm.addPass(createGpuToLLVMConversionPass(gpuToLLVMConversionOptions));

  // Convert Func to LLVM (always needed).
  // TODO: C++20 designated initializers.
  ConvertFuncToLLVMPassOptions convertFuncToLLVMPassOptions2;