[mlir][linalg] Padding transformation: Write back result to original destination

Copy the result of the padded computation back to the original destination of the computation. This is important for bufferization: it ensures that the result of the computation does not suddenly materialize in a different buffer because of the padding.

A `bufferization.copy_tensor` op is inserted for every (unpadded) result. Such ops bufferize to memcpys, but they fold away if the padding folds away.
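For illustration, a minimal sketch of the rewritten IR, with hypothetical value names and static shapes; the `bufferization.copy_tensor` line is shown as it appears in the CHECK lines of the new test at the end of this commit:

// Padded computation: %pad_a, %pad_b, %pad_dest are results of tensor.pad.
%padded = linalg.matmul ins(%pad_a, %pad_b : tensor<4x7xf32>, tensor<7x5xf32>)
                        outs(%pad_dest : tensor<4x5xf32>) -> tensor<4x5xf32>
// Extract the unpadded result from the padded computation.
%unpadded = tensor.extract_slice %padded[0, 0] [4, 5] [1, 1]
    : tensor<4x5xf32> to tensor<4x5xf32>
// New in this patch: copy the unpadded result back into the original init
// %dest, so that bufferization keeps writing to the original buffer.
%result = bufferization.copy_tensor %unpadded, %dest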

Differential Revision: https://reviews.llvm.org/D153554
Matthias Springer, 2023-06-27 14:29:34 +02:00
parent d31a6dfbc0
commit 431c49d6b6
6 changed files with 87 additions and 13 deletions


@@ -848,7 +848,8 @@ def PadOp : Op<Transform_Dialect, "structured.pad",
        DefaultValuedAttr<I64ArrayAttr, "{}">:$pack_paddings,
        DefaultValuedAttr<
          TypedArrayAttrBase<I64ArrayAttr, "array of arrays of i64">,
-         "{}">:$transpose_paddings);
+         "{}">:$transpose_paddings,
+        DefaultValuedAttr<BoolAttr, "true">:$copy_back);
   let results = (outs TransformHandleTypeInterface:$transformed);
 
   let assemblyFormat =
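With the new attribute, clients can opt out of the write-back. A minimal usage sketch in transform IR, mirroring the updated tests below (the surrounding sequence and handle names are hypothetical); `copy_back` defaults to true, so omitting it keeps the new write-back behavior:

transform.sequence failures(propagate) {
^bb1(%arg0: !transform.any_op):
  %matmul = transform.structured.match ops{["linalg.matmul"]} in %arg0
    : (!transform.any_op) -> !transform.any_op
  // copy_back = false restores the previous behavior of returning the
  // unpadded slice without copying it into the original destination.
  %padded = transform.structured.pad %matmul {
    padding_values = [0.0 : f32, 0.0 : f32, 0.0 : f32],
    padding_dimensions = [0, 1, 2],
    copy_back = false
  } : (!transform.any_op) -> !transform.any_op
}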


@@ -370,10 +370,12 @@ void peelLoops(RewriterBase &rewriter, ArrayRef<scf::ForOp> loops);
 /// and `packPaddings` to set padding value and nofold attribute of the created
 /// tensor::PadOps, respectively. Update `paddedOp` to the cloned operation with
 /// statically shaped `paddingDimensions` and return the extracted dynamically
-/// shaped results. If padding fails, return failure.
+/// shaped results. If padding fails, return failure. If `copyBack` is set, the
+/// unpadded result is copied back into the original destination tensor.
 FailureOr<SmallVector<Value>>
 rewriteAsPaddedOp(RewriterBase &rewriter, LinalgOp opToPad,
-                  const LinalgPaddingOptions &options, LinalgOp &paddedOp);
+                  const LinalgPaddingOptions &options, LinalgOp &paddedOp,
+                  bool copyBack);
 
 namespace detail {


@@ -1601,7 +1601,7 @@ transform::PadOp::applyToOne(transform::TransformRewriter &rewriter,
   options.paddingValues = paddingValues;
   options.packPaddings = packPaddings;
   FailureOr<SmallVector<Value>> result =
-      rewriteAsPaddedOp(rewriter, target, options, paddedOp);
+      rewriteAsPaddedOp(rewriter, target, options, paddedOp, getCopyBack());
   if (succeeded(result)) {
     // We need to perform our own replacement here because this API is still
     // used in patterns that "pad and hoist", for which the replacement values


@@ -8,6 +8,7 @@
 #include "mlir/Dialect/Linalg/Transforms/Transforms.h"
 
+#include "mlir/Dialect/Bufferization/IR/Bufferization.h"
 #include "mlir/Dialect/Linalg/IR/Linalg.h"
 #include "mlir/Dialect/Tensor/IR/Tensor.h"
 #include "mlir/Interfaces/ValueBoundsOpInterface.h"
@@ -138,7 +139,7 @@ static FailureOr<Value> padOperandToSmallestStaticBoundingBox(
 FailureOr<SmallVector<Value>>
 linalg::rewriteAsPaddedOp(RewriterBase &rewriter, LinalgOp opToPad,
                           const LinalgPaddingOptions &options,
-                          LinalgOp &paddedOp) {
+                          LinalgOp &paddedOp, bool copyBack) {
   LLVM_DEBUG(DBGS() << "Start rewriteAsPaddedOp : " << opToPad << "\n");
   Location loc = opToPad->getLoc();
@@ -197,7 +198,21 @@ linalg::rewriteAsPaddedOp(RewriterBase &rewriter, LinalgOp opToPad,
         loc, paddedResult, offsets, reifiedResultShapes[resultNumber],
         strides));
   }
-  return paddedSubtensorResults;
+
+  if (!copyBack)
+    return paddedSubtensorResults;
+
+  // Copy back unpadded results to the original destination (i.e., inits of the
+  // linalg op), so that the destination buffer of the computation does not
+  // change. If the padding folds away, this will materialize as a memcpy
+  // between two identical buffers, which will then also fold away.
+  SmallVector<Value> copiedBack;
+  for (auto it :
+       llvm::zip(paddedSubtensorResults, opToPad.getDpsInitOperands())) {
+    copiedBack.push_back(rewriter.create<bufferization::CopyTensorOp>(
+        loc, std::get<0>(it), std::get<1>(it)->get()));
+  }
+  return copiedBack;
 }
 
 FailureOr<LinalgOp>
@@ -209,8 +224,8 @@ mlir::linalg::padAndHoistLinalgOp(RewriterBase &rewriter, LinalgOp linalgOp,
 
   // Pad the operation.
   LinalgOp paddedOp;
-  FailureOr<SmallVector<Value>> newResults =
-      rewriteAsPaddedOp(rewriter, linalgOp, options, paddedOp);
+  FailureOr<SmallVector<Value>> newResults = rewriteAsPaddedOp(
+      rewriter, linalgOp, options, paddedOp, /*copyBack=*/false);
   if (failed(newResults))
     return rewriter.notifyMatchFailure(linalgOp,
                                        "failed to rewrite as a padded op");


@@ -19,7 +19,8 @@ transform.sequence failures(propagate) {
   %matmul_padded = transform.structured.pad %matmul_l1 {
     padding_values=[0.0: f32, 0.0 : f32, 0.0 : f32],
-    padding_dimensions=[0, 1, 2]
+    padding_dimensions=[0, 1, 2],
+    copy_back = false
   } : (!transform.any_op) -> !transform.any_op
 
 // In this case, the pad op is actually empty: we only tile the first dimension
@@ -54,7 +55,8 @@ transform.sequence failures(propagate) {
   %matmul_padded = transform.structured.pad %matmul_l1 {
     padding_values=[0.0: f32, 0.0 : f32, 0.0 : f32],
-    padding_dimensions=[0, 1, 2]
+    padding_dimensions=[0, 1, 2],
+    copy_back = false
   } : (!transform.any_op) -> !transform.any_op
 
   %pad = transform.get_producer_of_operand %matmul_padded[2]
@@ -96,7 +98,8 @@ transform.sequence failures(propagate) {
   %matmul_padded = transform.structured.pad %matmul_l1 {
     padding_values=[0.0: f32, 0.0 : f32, 0.0 : f32],
-    padding_dimensions=[0, 1, 2]
+    padding_dimensions=[0, 1, 2],
+    copy_back = false
   } : (!transform.any_op) -> !transform.any_op
 
   %pad = transform.get_producer_of_operand %matmul_padded[0]
@@ -140,7 +143,8 @@ transform.sequence failures(propagate) {
   %matmul_padded = transform.structured.pad %matmul_l1 {
     padding_values=[0.0: f32, 0.0 : f32, 0.0 : f32],
-    padding_dimensions=[0, 1, 2]
+    padding_dimensions=[0, 1, 2],
+    copy_back = false
   } : (!transform.any_op) -> !transform.any_op
 
   %pad = transform.get_producer_of_operand %matmul_padded[0]
@@ -183,7 +187,8 @@ transform.sequence failures(propagate) {
   %matmul_padded = transform.structured.pad %matmul_l1 {
     padding_values=[0.0: f32, 0.0 : f32, 0.0 : f32],
-    padding_dimensions=[0, 1, 2]
+    padding_dimensions=[0, 1, 2],
+    copy_back = false
   } : (!transform.any_op) -> !transform.any_op
 
   %pad = transform.get_producer_of_operand %matmul_padded[2]


@@ -241,3 +241,54 @@ transform.sequence failures(propagate) {
     pack_paddings=[1, 1, 1]
   } : (!transform.any_op) -> !transform.any_op
 }
+
+// -----
+
+#map = affine_map<()[s0] -> (-s0 + 12, 7)>
+
+// CHECK-LABEL: @pack_everything
+func.func @pack_everything(%arg0: tensor<24x12xf32>,
+                           %arg1: tensor<12x25xf32>,
+                           %arg2: tensor<24x25xf32>,
+                           %iv0 : index, %iv1 : index, %iv2 : index) -> tensor<24x25xf32> {
+  %0 = affine.min #map()[%iv2]
+
+  // CHECK: %[[T0:.*]] = tensor.extract_slice %
+  // CHECK: %[[T1:.*]] = tensor.extract_slice %
+  // CHECK: %[[T2:.*]] = tensor.extract_slice %
+  %1 = tensor.extract_slice %arg0[%iv0, %iv2] [4, %0] [1, 1] : tensor<24x12xf32> to tensor<4x?xf32>
+  %2 = tensor.extract_slice %arg1[%iv2, %iv1] [%0, 5] [1, 1] : tensor<12x25xf32> to tensor<?x5xf32>
+  %3 = tensor.extract_slice %arg2[%iv0, %iv1] [4, 5] [1, 1] : tensor<24x25xf32> to tensor<4x5xf32>
+
+  // CHECK-DAG: %[[CST:.*]] = arith.constant 0.
+  // CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index
+  // CHECK: %[[PAD0:.*]] = tensor.pad %[[T0]] nofold
+  // CHECK: %[[PAD1:.*]] = tensor.pad %[[T1]] nofold
+  // CHECK: %[[PAD2:.*]] = tensor.pad %[[T2]] nofold
+  // CHECK: %[[T5:.*]] = linalg.matmul
+  // CHECK-SAME:   ins(%[[PAD0]], %[[PAD1]] : tensor<4x7xf32>, tensor<7x5xf32>)
+  // CHECK-SAME:   outs(%[[PAD2]] : tensor<4x5xf32>)
+
+  // Get unpadded result (no-op in this example).
+  // CHECK: %[[T6:.*]] = tensor.extract_slice %[[T5]]
+
+  // Copy back result to the original buffer, so that the destination of the
+  // computation does not change.
+  // CHECK: %[[T7:.*]] = bufferization.copy_tensor %[[T6]], %[[T2]]
+  %4 = linalg.matmul ins(%1, %2 : tensor<4x?xf32>, tensor<?x5xf32>) outs(%3 : tensor<4x5xf32>) -> tensor<4x5xf32>
+
+  // CHECK: %[[T8:.*]] = tensor.insert_slice %[[T7]] into %{{.*}}
+  %5 = tensor.insert_slice %4 into %arg2[%iv0, %iv1] [4, 5] [1, 1] : tensor<4x5xf32> into tensor<24x25xf32>
+  func.return %5 : tensor<24x25xf32>
+}
+
+transform.sequence failures(propagate) {
+^bb1(%arg1: !transform.any_op):
+  %0 = transform.structured.match ops{["linalg.matmul"]} in %arg1 : (!transform.any_op) -> !transform.any_op
+  %1 = transform.structured.pad %0 {
+    padding_values=[0.0 : f32, 0.0 : f32, 0.0 : f32],
+    padding_dimensions=[0, 1, 2],
+    pack_paddings=[1, 1, 1]
+  } : (!transform.any_op) -> !transform.any_op
+}