[mlir][GPU] Extend GPU kernel outlining to generate DL specification

This patch extends the GPU kernel outlining pass so that it can take an optional data layout specification, which is attached to the generated GPU module operation. If no data layout specification is provided, the default data layout is used instead. Reviewed By: herhut, mehdi_amini. Differential Revision: https://reviews.llvm.org/D115722
2024-11-27 07:31:28 +00:00 · 2021-12-16 09:47:41 +00:00 · 2021-12-16 09:47:41 +00:00 · 32fe1a8a25
commit 32fe1a8a25
parent 59a85a7a52
6 changed files with 67 additions and 5 deletions
--- a/mlir/include/mlir/Dialect/GPU/Passes.h
+++ b/mlir/include/mlir/Dialect/GPU/Passes.h
@ -25,7 +25,8 @@ class Module;
 namespace mlir {
 /// Replaces `gpu.launch` with `gpu.launch_func` by moving the region into
 /// a separate kernel function.
-std::unique_ptr<OperationPass<ModuleOp>> createGpuKernelOutliningPass();
+std::unique_ptr<OperationPass<ModuleOp>>
+createGpuKernelOutliningPass(StringRef dataLayoutStr = StringRef());

 /// Rewrites a function region so that GPU ops execute asynchronously.
 std::unique_ptr<OperationPass<FuncOp>> createGpuAsyncRegionPass();
--- a/mlir/include/mlir/Dialect/GPU/Passes.td
+++ b/mlir/include/mlir/Dialect/GPU/Passes.td
@ -14,6 +14,7 @@ include "mlir/Pass/PassBase.td"
 def GpuKernelOutlining : Pass<"gpu-kernel-outlining", "ModuleOp"> {
  let summary = "Outline gpu.launch bodies to kernel functions";
  let constructor = "mlir::createGpuKernelOutliningPass()";
+  let dependentDialects = ["mlir::DLTIDialect"];
 }

 def GpuAsyncRegionPass : FunctionPass<"gpu-async-region"> {
--- a/mlir/lib/Dialect/GPU/Transforms/KernelOutlining.cpp
+++ b/mlir/lib/Dialect/GPU/Transforms/KernelOutlining.cpp
@ -12,6 +12,7 @@

 #include "PassDetail.h"
 #include "mlir/Dialect/Arithmetic/IR/Arithmetic.h"
+#include "mlir/Dialect/DLTI/DLTI.h"
 #include "mlir/Dialect/GPU/GPUDialect.h"
 #include "mlir/Dialect/GPU/Passes.h"
 #include "mlir/Dialect/GPU/Utils.h"
@ -20,6 +21,7 @@
 #include "mlir/IR/BlockAndValueMapping.h"
 #include "mlir/IR/Builders.h"
 #include "mlir/IR/SymbolTable.h"
+#include "mlir/Parser.h"
 #include "mlir/Support/LLVM.h"
 #include "mlir/Transforms/RegionUtils.h"

@ -239,6 +241,31 @@ namespace {
 class GpuKernelOutliningPass
    : public GpuKernelOutliningBase<GpuKernelOutliningPass> {
 public:
+  /// Seeds the `data-layout-str` option from `dlStr`. The option is only
+  /// written when `dlStr` is non-empty and the option holds no value yet.
+  GpuKernelOutliningPass(StringRef dlStr) {
+    if (dlStr.empty() || dataLayoutStr.hasValue())
+      return;
+    dataLayoutStr = dlStr.str();
+  }
+
+  /// Copy constructor. Forwards `other` to the base-class copy constructor
+  /// explicitly (without this the base would be default-constructed), then
+  /// copies both the textual `data-layout-str` option and the parsed data
+  /// layout specification.
+  GpuKernelOutliningPass(const GpuKernelOutliningPass &other)
+      : GpuKernelOutliningBase(other), dataLayoutSpec(other.dataLayoutSpec) {
+    dataLayoutStr = other.dataLayoutStr;
+  }
+
+  /// Parses the `data-layout-str` option, when non-empty, into
+  /// `dataLayoutSpec`.
+  ///
+  /// Returns failure if the string cannot be parsed as an attribute, or if the
+  /// parsed attribute does not implement `DataLayoutSpecInterface`; the latter
+  /// case reports an error so the failure is not silent.
+  LogicalResult initialize(MLIRContext *context) override {
+    // No data layout string given: leave `dataLayoutSpec` untouched.
+    if (dataLayoutStr.empty())
+      return success();
+
+    Attribute resultAttr = mlir::parseAttribute(dataLayoutStr, context);
+    if (!resultAttr)
+      return failure();
+
+    dataLayoutSpec = resultAttr.dyn_cast<DataLayoutSpecInterface>();
+    if (!dataLayoutSpec)
+      return emitError(UnknownLoc::get(context))
+             << "'data-layout-str' must be a data layout specification "
+                "attribute, but got: "
+             << resultAttr;
+
+    return success();
+  }
+
  void runOnOperation() override {
    SymbolTable symbolTable(getOperation());
    bool modified = false;
@ -290,6 +317,12 @@ private:
    OpBuilder builder(context);
    auto kernelModule = builder.create<gpu::GPUModuleOp>(kernelFunc.getLoc(),
                                                         kernelFunc.getName());
+
+    // If a valid data layout spec was provided, attach it to the kernel module.
+    // Otherwise, the default data layout will be used.
+    if (dataLayoutSpec)
+      kernelModule->setAttr("dlspec", dataLayoutSpec);
+
    SymbolTable symbolTable(kernelModule);
    symbolTable.insert(kernelFunc);

@ -313,10 +346,18 @@ private:

    return kernelModule;
  }
+
+  Option<std::string> dataLayoutStr{
+      *this, "data-layout-str",
+      llvm::cl::desc("String containing the data layout specification to be "
+                     "attached to the GPU kernel module")};
+
+  DataLayoutSpecInterface dataLayoutSpec;
 };

 } // namespace

-std::unique_ptr<OperationPass<ModuleOp>> mlir::createGpuKernelOutliningPass() {
-  return std::make_unique<GpuKernelOutliningPass>();
+/// Creates the GPU kernel outlining pass, forwarding `dataLayoutStr` to the
+/// pass constructor.
+std::unique_ptr<OperationPass<ModuleOp>>
+mlir::createGpuKernelOutliningPass(StringRef dataLayoutStr) {
+  std::unique_ptr<OperationPass<ModuleOp>> pass =
+      std::make_unique<GpuKernelOutliningPass>(dataLayoutStr);
+  return pass;
+}
--- a/mlir/lib/Dialect/GPU/Transforms/PassDetail.h
+++ b/mlir/lib/Dialect/GPU/Transforms/PassDetail.h
@ -10,6 +10,7 @@
 #define DIALECT_GPU_TRANSFORMS_PASSDETAIL_H_

 #include "mlir/Dialect/Async/IR/Async.h"
+#include "mlir/Dialect/DLTI/DLTI.h"
 #include "mlir/Pass/Pass.h"

 namespace mlir {
--- a/mlir/test/Dialect/GPU/outlining.mlir
+++ b/mlir/test/Dialect/GPU/outlining.mlir
@ -1,4 +1,5 @@
 // RUN: mlir-opt -allow-unregistered-dialect -gpu-kernel-outlining -split-input-file -verify-diagnostics %s | FileCheck %s
+// RUN: mlir-opt -allow-unregistered-dialect -gpu-kernel-outlining=data-layout-str='#dlti.dl_spec<#dlti.dl_entry<index,32:i32>>' -split-input-file %s | FileCheck --check-prefix CHECK-DL %s

 // CHECK: module attributes {gpu.container_module}

@ -35,8 +36,9 @@ func @launch() {
  return
 }

+// CHECK-DL-LABEL: gpu.module @launch_kernel attributes {dlspec = #dlti.dl_spec<#dlti.dl_entry<index, 32 : i32>>}

-// CHECK-LABEL: module @launch_kernel
+// CHECK-LABEL: gpu.module @launch_kernel
 // CHECK-NEXT: gpu.func @launch_kernel
 // CHECK-SAME: (%[[KERNEL_ARG0:.*]]: f32, %[[KERNEL_ARG1:.*]]: memref<?xf32, 1>)
 // CHECK-NEXT: %[[BID:.*]] = "gpu.block_id"() {dimension = "x"} : () -> index
@ -81,7 +83,10 @@ func @multiple_launches() {
  return
 }

-// CHECK: module @multiple_launches_kernel
+// CHECK-DL-LABEL: gpu.module @multiple_launches_kernel attributes {dlspec = #dlti.dl_spec<#dlti.dl_entry<index, 32 : i32>>}
+// CHECK-DL-LABEL: gpu.module @multiple_launches_kernel_0 attributes {dlspec = #dlti.dl_spec<#dlti.dl_entry<index, 32 : i32>>}
+
+// CHECK: gpu.module @multiple_launches_kernel
 // CHECK: func @multiple_launches_kernel
 // CHECK: module @multiple_launches_kernel_0
 // CHECK: func @multiple_launches_kernel
@ -106,6 +111,8 @@ func @extra_constants_not_inlined(%arg0: memref<?xf32>) {
  return
 }

+// CHECK-DL-LABEL: gpu.module @extra_constants_not_inlined_kernel attributes {dlspec = #dlti.dl_spec<#dlti.dl_entry<index, 32 : i32>>}
+
 // CHECK-LABEL: func @extra_constants_not_inlined_kernel(%{{.*}}: memref<?xf32>, %{{.*}}: index)
 // CHECK: arith.constant 2

@ -130,6 +137,8 @@ func @extra_constants(%arg0: memref<?xf32>) {
  return
 }

+// CHECK-DL-LABEL: gpu.module @extra_constants_kernel attributes {dlspec = #dlti.dl_spec<#dlti.dl_entry<index, 32 : i32>>}
+
 // CHECK-LABEL: func @extra_constants_kernel(
 // CHECK-SAME: %[[KARG0:.*]]: memref<?xf32>
 // CHECK: arith.constant 2
@ -158,6 +167,8 @@ func @extra_constants_noarg(%arg0: memref<?xf32>, %arg1: memref<?xf32>) {
  return
 }

+// CHECK-DL-LABEL: gpu.module @extra_constants_noarg_kernel attributes {dlspec = #dlti.dl_spec<#dlti.dl_entry<index, 32 : i32>>}
+
 // CHECK-LABEL: func @extra_constants_noarg_kernel(
 // CHECK-SAME: %[[KARG0:.*]]: memref<?xf32>, %[[KARG1:.*]]: index
 // CHECK: %[[KCST:.*]] = arith.constant 2
@ -186,6 +197,8 @@ func @multiple_uses(%arg0 : memref<?xf32>) {
  return
 }

+// CHECK-DL-LABEL: gpu.module @multiple_uses_kernel attributes {dlspec = #dlti.dl_spec<#dlti.dl_entry<index, 32 : i32>>}
+
 // -----

 // CHECK-LABEL: @multiple_uses2
@ -213,6 +226,8 @@ func @multiple_uses2(%arg0 : memref<*xf32>) {
  return
 }

+// CHECK-DL-LABEL: gpu.module @multiple_uses2_kernel attributes {dlspec = #dlti.dl_spec<#dlti.dl_entry<index, 32 : i32>>}
+
 // -----

 llvm.mlir.global internal @global(42 : i64) : i64
@ -242,6 +257,8 @@ func @recursive_device_function() {
  return
 }

+// CHECK-DL-LABEL: gpu.module @function_call_kernel attributes {dlspec = #dlti.dl_spec<#dlti.dl_entry<index, 32 : i32>>}
+
 // CHECK: gpu.module @function_call_kernel {
 // CHECK:   gpu.func @function_call_kernel()
 // CHECK:     call @device_function() : () -> ()
--- a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel
@ -2999,6 +2999,7 @@ cc_library(
    deps = [
        ":ArithmeticDialect",
        ":Async",
+        ":DLTIDialect",
        ":GPUDialect",
        ":GPUPassIncGen",
        ":MemRefDialect",