mirror of
https://github.com/capstone-engine/llvm-capstone.git
synced 2025-01-10 10:01:42 +00:00
[MLIR] Parallelize affine.for op to 1-D affine.parallel op
Introduce pass to convert parallel affine.for op into 1-D affine.parallel op. Run using --affine-parallelize. Removes test-detect-parallel: pass for checking parallel affine.for ops. Differential Revision: https://reviews.llvm.org/D82672
This commit is contained in:
parent
71f342d6c3
commit
5f2843857f
@ -36,6 +36,10 @@ std::unique_ptr<OperationPass<FuncOp>> createSimplifyAffineStructuresPass();
|
||||
std::unique_ptr<OperationPass<FuncOp>>
|
||||
createAffineLoopInvariantCodeMotionPass();
|
||||
|
||||
/// Creates a pass to convert all parallel affine.for's into 1-d affine.parallel
|
||||
/// ops.
|
||||
std::unique_ptr<OperationPass<FuncOp>> createAffineParallelizePass();
|
||||
|
||||
/// Performs packing (or explicit copying) of accessed memref regions into
|
||||
/// buffers in the specified faster memory space through either pointwise copies
|
||||
/// or DMA operations.
|
||||
|
@ -112,6 +112,11 @@ def AffineVectorize : FunctionPass<"affine-super-vectorize"> {
|
||||
];
|
||||
}
|
||||
|
||||
def AffineParallelize : FunctionPass<"affine-parallelize"> {
|
||||
let summary = "Convert affine.for ops into 1-D affine.parallel";
|
||||
let constructor = "mlir::createAffineParallelizePass()";
|
||||
}
|
||||
|
||||
def SimplifyAffineStructures : FunctionPass<"simplify-affine-structures"> {
|
||||
let summary = "Simplify affine expressions in maps/sets and normalize "
|
||||
"memrefs";
|
||||
|
@ -15,9 +15,16 @@
|
||||
|
||||
namespace mlir {
|
||||
|
||||
class AffineForOp;
|
||||
class AffineIfOp;
|
||||
class AffineParallelOp;
|
||||
struct LogicalResult;
|
||||
|
||||
/// Replaces parallel affine.for op with 1-d affine.parallel op.
|
||||
/// mlir::isLoopParallel detects the parallel affine.for ops.
|
||||
/// There is no cost model currently used to drive this parallelization.
|
||||
void affineParallelize(AffineForOp forOp);
|
||||
|
||||
/// Hoists out affine.if/else to as high as possible, i.e., past all invariant
|
||||
/// affine.fors/parallel's. Returns success if any hoisting happened; `folded` is
|
||||
/// set to true if the op was folded or erased. This hoisting could lead to
|
||||
|
50
mlir/lib/Dialect/Affine/Transforms/AffineParallelize.cpp
Normal file
50
mlir/lib/Dialect/Affine/Transforms/AffineParallelize.cpp
Normal file
@ -0,0 +1,50 @@
|
||||
//===- AffineParallelize.cpp - AffineParallelize Pass---------------------===//
|
||||
//
|
||||
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
||||
// See https://llvm.org/LICENSE.txt for license information.
|
||||
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
//
|
||||
// This file implements a parallelizer for affine loop nests that is able to
|
||||
// perform inner or outer loop parallelization.
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
#include "PassDetail.h"
|
||||
#include "mlir/Analysis/AffineStructures.h"
|
||||
#include "mlir/Analysis/LoopAnalysis.h"
|
||||
#include "mlir/Analysis/Utils.h"
|
||||
#include "mlir/Dialect/Affine/IR/AffineOps.h"
|
||||
#include "mlir/Dialect/Affine/IR/AffineValueMap.h"
|
||||
#include "mlir/Dialect/Affine/Passes.h"
|
||||
#include "mlir/Dialect/Affine/Passes.h.inc"
|
||||
#include "mlir/Dialect/Affine/Utils.h"
|
||||
#include "mlir/Transforms/LoopUtils.h"
|
||||
#include "llvm/Support/Debug.h"
|
||||
|
||||
#define DEBUG_TYPE "affine-parallel"
|
||||
|
||||
using namespace mlir;
|
||||
|
||||
namespace {
|
||||
/// Convert all parallel affine.for op into 1-D affine.parallel op.
|
||||
struct AffineParallelize : public AffineParallelizeBase<AffineParallelize> {
|
||||
void runOnFunction() override;
|
||||
};
|
||||
} // namespace
|
||||
|
||||
void AffineParallelize::runOnFunction() {
|
||||
FuncOp f = getFunction();
|
||||
SmallVector<AffineForOp, 8> parallelizableLoops;
|
||||
f.walk([&](AffineForOp loop) {
|
||||
if (isLoopParallel(loop))
|
||||
parallelizableLoops.push_back(loop);
|
||||
});
|
||||
for (AffineForOp loop : parallelizableLoops)
|
||||
affineParallelize(loop);
|
||||
}
|
||||
|
||||
/// Factory for the affine parallelization pass; returns a freshly allocated
/// AffineParallelize instance owned by the caller.
std::unique_ptr<OperationPass<FuncOp>> mlir::createAffineParallelizePass() {
  std::unique_ptr<OperationPass<FuncOp>> pass =
      std::make_unique<AffineParallelize>();
  return pass;
}
|
@ -1,6 +1,7 @@
|
||||
add_mlir_dialect_library(MLIRAffineTransforms
|
||||
AffineDataCopyGeneration.cpp
|
||||
AffineLoopInvariantCodeMotion.cpp
|
||||
AffineParallelize.cpp
|
||||
LoopTiling.cpp
|
||||
LoopUnroll.cpp
|
||||
LoopUnrollAndJam.cpp
|
||||
|
@ -129,6 +129,20 @@ static AffineIfOp hoistAffineIfOp(AffineIfOp ifOp, Operation *hoistOverOp) {
|
||||
return hoistedIfOp;
|
||||
}
|
||||
|
||||
/// Replace affine.for with a 1-d affine.parallel by moving the former's body
|
||||
/// into the latter one.
|
||||
void mlir::affineParallelize(AffineForOp forOp) {
|
||||
Location loc = forOp.getLoc();
|
||||
OpBuilder outsideBuilder(forOp);
|
||||
// Create empty 1-D affine.parallel op.
|
||||
AffineParallelOp newPloop = outsideBuilder.create<AffineParallelOp>(
|
||||
loc, forOp.getLowerBoundMap(), forOp.getLowerBoundOperands(),
|
||||
forOp.getUpperBoundMap(), forOp.getUpperBoundOperands());
|
||||
// Steal the body of the old affine for op and erase it.
|
||||
newPloop.region().takeBody(forOp.region());
|
||||
forOp.erase();
|
||||
}
|
||||
|
||||
// Returns success if any hoisting happened.
|
||||
LogicalResult mlir::hoistAffineIfOp(AffineIfOp ifOp, bool *folded) {
|
||||
// Apply canonicalization patterns and folding - this is necessary for the
|
||||
|
@ -1,47 +0,0 @@
|
||||
// RUN: mlir-opt -allow-unregistered-dialect %s -test-detect-parallel -split-input-file -verify-diagnostics | FileCheck %s
|
||||
|
||||
// CHECK-LABEL: func @loop_nest_3d_outer_two_parallel
|
||||
func @loop_nest_3d_outer_two_parallel(%N : index) {
|
||||
%0 = alloc() : memref<1024 x 1024 x vector<64xf32>>
|
||||
%1 = alloc() : memref<1024 x 1024 x vector<64xf32>>
|
||||
%2 = alloc() : memref<1024 x 1024 x vector<64xf32>>
|
||||
affine.for %i = 0 to %N {
|
||||
// expected-remark@-1 {{parallel loop}}
|
||||
affine.for %j = 0 to %N {
|
||||
// expected-remark@-1 {{parallel loop}}
|
||||
affine.for %k = 0 to %N {
|
||||
// expected-remark@-1 {{sequential loop}}
|
||||
%5 = affine.load %0[%i, %k] : memref<1024x1024xvector<64xf32>>
|
||||
%6 = affine.load %1[%k, %j] : memref<1024x1024xvector<64xf32>>
|
||||
%7 = affine.load %2[%i, %j] : memref<1024x1024xvector<64xf32>>
|
||||
%8 = mulf %5, %6 : vector<64xf32>
|
||||
%9 = addf %7, %8 : vector<64xf32>
|
||||
affine.store %9, %2[%i, %j] : memref<1024x1024xvector<64xf32>>
|
||||
}
|
||||
}
|
||||
}
|
||||
return
|
||||
}
|
||||
|
||||
// -----
|
||||
|
||||
// CHECK-LABEL: unknown_op_conservative
|
||||
func @unknown_op_conservative() {
|
||||
affine.for %i = 0 to 10 {
|
||||
// expected-remark@-1 {{sequential loop}}
|
||||
"unknown"() : () -> ()
|
||||
}
|
||||
return
|
||||
}
|
||||
|
||||
// -----
|
||||
|
||||
// CHECK-LABEL: non_affine_load
|
||||
func @non_affine_load() {
|
||||
%0 = alloc() : memref<100 x f32>
|
||||
affine.for %i = 0 to 100 {
|
||||
// expected-remark@-1 {{sequential loop}}
|
||||
load %0[%i] : memref<100 x f32>
|
||||
}
|
||||
return
|
||||
}
|
118
mlir/test/Dialect/Affine/parallelize.mlir
Normal file
118
mlir/test/Dialect/Affine/parallelize.mlir
Normal file
@ -0,0 +1,118 @@
|
||||
// RUN: mlir-opt %s -allow-unregistered-dialect -affine-parallelize| FileCheck %s
|
||||
|
||||
// For multiple nested for-loops.
|
||||
// CHECK-DAG: [[MAP5:#map[0-9]+]] = affine_map<(d0, d1, d2, d3, d4, d5, d6, d7) -> (d0 + d1, d2 * 2 + d3, d4 * 2 + d5, d6 + d7)>
|
||||
// CHECK-LABEL: func @reduce_window_max() {
|
||||
func @reduce_window_max() {
|
||||
%cst = constant 0.000000e+00 : f32
|
||||
%0 = alloc() : memref<1x8x8x64xf32>
|
||||
%1 = alloc() : memref<1x18x18x64xf32>
|
||||
affine.for %arg0 = 0 to 1 {
|
||||
affine.for %arg1 = 0 to 8 {
|
||||
affine.for %arg2 = 0 to 8 {
|
||||
affine.for %arg3 = 0 to 64 {
|
||||
affine.store %cst, %0[%arg0, %arg1, %arg2, %arg3] : memref<1x8x8x64xf32>
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
affine.for %arg0 = 0 to 1 {
|
||||
affine.for %arg1 = 0 to 8 {
|
||||
affine.for %arg2 = 0 to 8 {
|
||||
affine.for %arg3 = 0 to 64 {
|
||||
affine.for %arg4 = 0 to 1 {
|
||||
affine.for %arg5 = 0 to 3 {
|
||||
affine.for %arg6 = 0 to 3 {
|
||||
affine.for %arg7 = 0 to 1 {
|
||||
%2 = affine.load %0[%arg0, %arg1, %arg2, %arg3] : memref<1x8x8x64xf32>
|
||||
%3 = affine.load %1[%arg0 + %arg4, %arg1 * 2 + %arg5, %arg2 * 2 + %arg6, %arg3 + %arg7] : memref<1x18x18x64xf32>
|
||||
%4 = cmpf "ogt", %2, %3 : f32
|
||||
%5 = select %4, %2, %3 : f32
|
||||
affine.store %5, %0[%arg0, %arg1, %arg2, %arg3] : memref<1x8x8x64xf32>
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
return
|
||||
}
|
||||
|
||||
// CHECK: %[[cst:.*]] = constant 0.000000e+00 : f32
|
||||
// CHECK: %[[v0:.*]] = alloc() : memref<1x8x8x64xf32>
|
||||
// CHECK: %[[v1:.*]] = alloc() : memref<1x18x18x64xf32>
|
||||
// CHECK: affine.parallel (%[[arg0:.*]]) = (0) to (1) {
|
||||
// CHECK: affine.parallel (%[[arg1:.*]]) = (0) to (8) {
|
||||
// CHECK: affine.parallel (%[[arg2:.*]]) = (0) to (8) {
|
||||
// CHECK: affine.parallel (%[[arg3:.*]]) = (0) to (64) {
|
||||
// CHECK: affine.store %[[cst]], %[[v0]][%[[arg0]], %[[arg1]], %[[arg2]], %[[arg3]]] : memref<1x8x8x64xf32>
|
||||
// CHECK: }
|
||||
// CHECK: }
|
||||
// CHECK: }
|
||||
// CHECK: }
|
||||
// CHECK: affine.parallel (%[[a0:.*]]) = (0) to (1) {
|
||||
// CHECK: affine.parallel (%[[a1:.*]]) = (0) to (8) {
|
||||
// CHECK: affine.parallel (%[[a2:.*]]) = (0) to (8) {
|
||||
// CHECK: affine.parallel (%[[a3:.*]]) = (0) to (64) {
|
||||
// CHECK: affine.parallel (%[[a4:.*]]) = (0) to (1) {
|
||||
// CHECK: affine.for %[[a5:.*]] = 0 to 3 {
|
||||
// CHECK: affine.for %[[a6:.*]] = 0 to 3 {
|
||||
// CHECK: affine.parallel (%[[a7:.*]]) = (0) to (1) {
|
||||
// CHECK: %[[lhs:.*]] = affine.load %[[v0]][%[[a0]], %[[a1]], %[[a2]], %[[a3]]] : memref<1x8x8x64xf32>
|
||||
// CHECK: %[[rhs:.*]] = affine.load %[[v1]][%[[a0]] + %[[a4]], %[[a1]] * 2 + %[[a5]], %[[a2]] * 2 + %[[a6]], %[[a3]] + %[[a7]]] : memref<1x18x18x64xf32>
|
||||
// CHECK: %[[res:.*]] = cmpf "ogt", %[[lhs]], %[[rhs]] : f32
|
||||
// CHECK: %[[sel:.*]] = select %[[res]], %[[lhs]], %[[rhs]] : f32
|
||||
// CHECK: affine.store %[[sel]], %[[v0]][%[[a0]], %[[a1]], %[[a2]], %[[a3]]] : memref<1x8x8x64xf32>
|
||||
// CHECK: }
|
||||
// CHECK: }
|
||||
// CHECK: }
|
||||
// CHECK: }
|
||||
// CHECK: }
|
||||
// CHECK: }
|
||||
// CHECK: }
|
||||
// CHECK: }
|
||||
// CHECK: }
|
||||
|
||||
func @loop_nest_3d_outer_two_parallel(%N : index) {
|
||||
%0 = alloc() : memref<1024 x 1024 x vector<64xf32>>
|
||||
%1 = alloc() : memref<1024 x 1024 x vector<64xf32>>
|
||||
%2 = alloc() : memref<1024 x 1024 x vector<64xf32>>
|
||||
affine.for %i = 0 to %N {
|
||||
affine.for %j = 0 to %N {
|
||||
%7 = affine.load %2[%i, %j] : memref<1024x1024xvector<64xf32>>
|
||||
affine.for %k = 0 to %N {
|
||||
%5 = affine.load %0[%i, %k] : memref<1024x1024xvector<64xf32>>
|
||||
%6 = affine.load %1[%k, %j] : memref<1024x1024xvector<64xf32>>
|
||||
%8 = mulf %5, %6 : vector<64xf32>
|
||||
%9 = addf %7, %8 : vector<64xf32>
|
||||
affine.store %9, %2[%i, %j] : memref<1024x1024xvector<64xf32>>
|
||||
}
|
||||
}
|
||||
}
|
||||
return
|
||||
}
|
||||
|
||||
// CHECK: affine.parallel (%[[arg1:.*]]) = (0) to (symbol(%arg0)) {
|
||||
// CHECK-NEXT: affine.parallel (%[[arg2:.*]]) = (0) to (symbol(%arg0)) {
|
||||
// CHECK: affine.for %[[arg3:.*]] = 0 to %arg0 {
|
||||
|
||||
// CHECK-LABEL: unknown_op_conservative
|
||||
func @unknown_op_conservative() {
|
||||
affine.for %i = 0 to 10 {
|
||||
// CHECK: affine.for %[[arg1:.*]] = 0 to 10 {
|
||||
"unknown"() : () -> ()
|
||||
}
|
||||
return
|
||||
}
|
||||
|
||||
// CHECK-LABEL: non_affine_load
|
||||
func @non_affine_load() {
|
||||
%0 = alloc() : memref<100 x f32>
|
||||
affine.for %i = 0 to 100 {
|
||||
// CHECK: affine.for %{{.*}} = 0 to 100 {
|
||||
load %0[%i] : memref<100 x f32>
|
||||
}
|
||||
return
|
||||
}
|
@ -3,7 +3,6 @@ add_mlir_library(MLIRAffineTransformsTestPasses
|
||||
TestAffineDataCopy.cpp
|
||||
TestAffineLoopUnswitching.cpp
|
||||
TestLoopPermutation.cpp
|
||||
TestParallelismDetection.cpp
|
||||
TestVectorizationUtils.cpp
|
||||
|
||||
EXCLUDE_FROM_LIBMLIR
|
||||
|
@ -1,47 +0,0 @@
|
||||
//===- ParallelismDetection.cpp - Parallelism Detection pass ------------*-===//
|
||||
//
|
||||
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
||||
// See https://llvm.org/LICENSE.txt for license information.
|
||||
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
//
|
||||
// This file implements a pass to detect parallel affine 'affine.for' ops.
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
#include "mlir/Analysis/Utils.h"
|
||||
#include "mlir/Dialect/Affine/IR/AffineOps.h"
|
||||
#include "mlir/IR/Builders.h"
|
||||
#include "mlir/Pass/Pass.h"
|
||||
|
||||
using namespace mlir;
|
||||
|
||||
namespace {
|
||||
|
||||
struct TestParallelismDetection
|
||||
: public PassWrapper<TestParallelismDetection, FunctionPass> {
|
||||
void runOnFunction() override;
|
||||
};
|
||||
|
||||
} // end anonymous namespace
|
||||
|
||||
// Walks the function and emits a note for all 'affine.for' ops detected as
|
||||
// parallel.
|
||||
void TestParallelismDetection::runOnFunction() {
|
||||
FuncOp f = getFunction();
|
||||
OpBuilder b(f.getBody());
|
||||
f.walk([&](AffineForOp forOp) {
|
||||
if (isLoopParallel(forOp))
|
||||
forOp.emitRemark("parallel loop");
|
||||
else
|
||||
forOp.emitRemark("sequential loop");
|
||||
});
|
||||
}
|
||||
|
||||
namespace mlir {
|
||||
void registerTestParallelismDetection() {
|
||||
PassRegistration<TestParallelismDetection> pass(
|
||||
"test-detect-parallel", "Test parallelism detection ");
|
||||
}
|
||||
} // namespace mlir
|
@ -62,7 +62,6 @@ void registerTestMatchers();
|
||||
void registerTestMemRefDependenceCheck();
|
||||
void registerTestMemRefStrideCalculation();
|
||||
void registerTestOpaqueLoc();
|
||||
void registerTestParallelismDetection();
|
||||
void registerTestPreparationPassWithAllowedMemrefResults();
|
||||
void registerTestGpuParallelLoopMappingPass();
|
||||
void registerTestSCFUtilsPass();
|
||||
@ -137,7 +136,6 @@ void registerTestPasses() {
|
||||
registerTestMemRefDependenceCheck();
|
||||
registerTestMemRefStrideCalculation();
|
||||
registerTestOpaqueLoc();
|
||||
registerTestParallelismDetection();
|
||||
registerTestPreparationPassWithAllowedMemrefResults();
|
||||
registerTestGpuParallelLoopMappingPass();
|
||||
registerTestSCFUtilsPass();
|
||||
|
Loading…
Reference in New Issue
Block a user