
[mlir][Affine] affine-super-vectorize transform op #126522

@@ -63,4 +63,35 @@ def SimplifyBoundedAffineOpsOp
}];
}

def SuperVectorizeOp
    : Op<Transform_Dialect, "affine.super_vectorize",
         [DeclareOpInterfaceMethods<MemoryEffectsOpInterface>,
          DeclareOpInterfaceMethods<TransformOpInterface>]> {
  let description = [{
    Vectorize to a target-independent n-D vector abstraction.
    This operation exposes the affine-super-vectorize pass to the transform
    dialect. To keep matchers simple, it ignores payload ops that are nested
    inside another affine.for op (the payload op itself excluded); matching
    every affine.for therefore vectorizes only the outermost loop nests.

    Example:
    ```
    %0 = transform.structured.match ops{["affine.for"]} in %arg1 : (!transform.any_op) -> !transform.any_op
    transform.affine.super_vectorize %0 [8, 16] fastest_varying_pattern=[1,0] vectorize_reductions=true : !transform.any_op
    ```
  }];

  let arguments = (ins TransformHandleTypeInterface:$target,
                   DenseI64ArrayAttr:$vector_sizes,
                   OptionalAttr<DenseI64ArrayAttr>:$fastest_varying_pattern,
                   DefaultValuedAttr<BoolAttr, "false">:$vectorize_reductions);
  let results = (outs);

  let assemblyFormat = [{
    $target $vector_sizes
    (`fastest_varying_pattern` `=` $fastest_varying_pattern^)?
    (`vectorize_reductions` `=` $vectorize_reductions^)?
    attr-dict `:` type($target)
  }];
}

#endif // Affine_TRANSFORM_OPS
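
The op mirrors the options of the existing pass one-for-one. For comparison, a roughly equivalent standalone invocation of the pass (a sketch, assuming the current affine-super-vectorize option names) would be:

```
mlir-opt input.mlir \
  --affine-super-vectorize="virtual-vector-size=8,16 test-fastest-varying=1,0 vectorize-reductions=true"
```
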
mlir/include/mlir/Dialect/Affine/Utils.h (6 additions, 0 deletions)
@@ -102,6 +102,12 @@ struct VectorizationStrategy {
ReductionLoopMap reductionLoops;
};

/// Vectorize the affine loops nested under `parentOp` (including `parentOp`
/// itself, if it is an affine.for op).
void vectorizeChildAffineLoops(Operation *parentOp, bool vectorizeReductions,
                               ArrayRef<int64_t> vectorSizes,
                               ArrayRef<int64_t> fastestVaryingPattern);
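
A minimal call-site sketch for the new utility (a hypothetical helper, not part of this patch; assumes the usual Affine and Func includes are present):

```cpp
// Sketch: vectorize every affine loop nest under `funcOp` with 4x8 virtual
// vectors and no reduction vectorization. `funcOp` converts implicitly to
// `Operation *`, just as the updated pass's own call does below.
static void vectorizeAllLoopNests(func::FuncOp funcOp) {
  affine::vectorizeChildAffineLoops(funcOp, /*vectorizeReductions=*/false,
                                    /*vectorSizes=*/{4, 8},
                                    /*fastestVaryingPattern=*/{});
}
```
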

/// Replace affine store and load accesses by scalars by forwarding stores to
/// loads and eliminate invariant affine loads; consequently, eliminate dead
/// allocs.
mlir/lib/Dialect/Affine/TransformOps/AffineTransformOps.cpp (32 additions, 0 deletions)
@@ -12,9 +12,12 @@
#include "mlir/Dialect/Affine/IR/AffineOps.h"
#include "mlir/Dialect/Affine/IR/AffineValueMap.h"
#include "mlir/Dialect/Affine/LoopUtils.h"
#include "mlir/Dialect/Affine/Utils.h"
#include "mlir/Dialect/Transform/IR/TransformDialect.h"
#include "mlir/Dialect/Transform/Interfaces/TransformInterfaces.h"
#include "mlir/Transforms/GreedyPatternRewriteDriver.h"
#include "llvm/ADT/ArrayRef.h"
#include <cstdint>

using namespace mlir;
using namespace mlir::affine;
@@ -148,6 +151,35 @@ void SimplifyBoundedAffineOpsOp::getEffects(
modifiesPayload(effects);
}

DiagnosedSilenceableFailure
SuperVectorizeOp::apply(transform::TransformRewriter &rewriter,
                        TransformResults &results, TransformState &state) {
  ArrayRef<int64_t> fastestVaryingPattern;
  if (getFastestVaryingPattern().has_value()) {
    if (getFastestVaryingPattern()->size() != getVectorSizes().size())
      return emitSilenceableFailure(
          getLoc(), "Fastest varying pattern specified with different size "
                    "than the vector size.");
    fastestVaryingPattern = getFastestVaryingPattern().value();
  }

  for (Operation *target : state.getPayloadOps(getTarget()))
    if (!target->getParentOfType<affine::AffineForOp>())
      vectorizeChildAffineLoops(target, getVectorizeReductions(),
                                getVectorSizes(), fastestVaryingPattern);

  return DiagnosedSilenceableFailure::success();
}
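
To illustrate the ancestor filter (a sketch, not one of the tests below): if the matcher yields both loops here, apply() treats only the outer one as a root, since the inner one has an affine.for ancestor. The inner loop is still visited by the walk that starts at the outer op, so it is not skipped outright, just not used as a separate root:

```mlir
affine.for %i = 0 to 128 {   // root: no affine.for ancestor
  affine.for %j = 0 to 128 { // filtered out as a root; reached via the walk
  }
}
```
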

void SuperVectorizeOp::getEffects(
    SmallVectorImpl<MemoryEffects::EffectInstance> &effects) {
  consumesHandle(getTargetMutable(), effects);
  modifiesPayload(effects);
}
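
Because apply() consumes the target handle (see consumesHandle above), the handle is invalidated once the op runs; a sequence that needs the loops again has to re-match. A sketch of that usage pattern (an assumption, not taken from this patch):

```mlir
// %0 is consumed here and must not be reused afterwards.
transform.affine.super_vectorize %0 [128] : !transform.any_op
// Re-match if the rewritten loops are needed again:
%1 = transform.structured.match ops{["affine.for"]} in %arg1
    : (!transform.any_op) -> !transform.any_op
```
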

//===----------------------------------------------------------------------===//
// Transform op registration
//===----------------------------------------------------------------------===//
mlir/lib/Dialect/Affine/Transforms/SuperVectorize.cpp (32 additions, 24 deletions)
@@ -26,6 +26,7 @@
#include "mlir/IR/IRMapping.h"
#include "mlir/Pass/Pass.h"
#include "mlir/Support/LLVM.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/Support/Debug.h"
#include <optional>
@@ -1741,34 +1742,17 @@ static void vectorizeLoops(Operation *parentOp, DenseSet<Operation *> &loops,
LLVM_DEBUG(dbgs() << "\n");
}

/// Applies vectorization to the current function by searching over a bunch of
/// predetermined patterns.
void Vectorize::runOnOperation() {
func::FuncOp f = getOperation();
if (!fastestVaryingPattern.empty() &&
fastestVaryingPattern.size() != vectorSizes.size()) {
f.emitRemark("Fastest varying pattern specified with different size than "
"the vector size.");
return signalPassFailure();
}

if (vectorizeReductions && vectorSizes.size() != 1) {
f.emitError("Vectorizing reductions is supported only for 1-D vectors.");
return signalPassFailure();
}

if (llvm::any_of(vectorSizes, [](int64_t size) { return size <= 0; })) {
f.emitError("Vectorization factor must be greater than zero.");
return signalPassFailure();
}

void affine::vectorizeChildAffineLoops(
    Operation *parentOp, bool vectorizeReductions,
    ArrayRef<int64_t> vectorSizes, ArrayRef<int64_t> fastestVaryingPattern) {
DenseSet<Operation *> parallelLoops;
ReductionLoopMap reductionLoops;

// If 'vectorize-reduction=true' is provided, we also populate the
// `reductionLoops` map.
if (vectorizeReductions) {
f.walk([&parallelLoops, &reductionLoops](AffineForOp loop) {
parentOp->walk([&parallelLoops, &reductionLoops](AffineForOp loop) {
SmallVector<LoopReduction, 2> reductions;
if (isLoopParallel(loop, &reductions)) {
parallelLoops.insert(loop);
@@ -1778,18 +1762,42 @@ void Vectorize::runOnOperation() {
}
});
} else {
f.walk([&parallelLoops](AffineForOp loop) {
parentOp->walk([&parallelLoops](AffineForOp loop) {
if (isLoopParallel(loop))
parallelLoops.insert(loop);
});
}

// Thread-safe RAII local context, BumpPtrAllocator freed on exit.
NestedPatternContext mlContext;
vectorizeLoops(f, parallelLoops, vectorSizes, fastestVaryingPattern,
vectorizeLoops(parentOp, parallelLoops, vectorSizes, fastestVaryingPattern,
reductionLoops);
}

/// Applies vectorization to the current function by searching over a bunch of
/// predetermined patterns.
void Vectorize::runOnOperation() {
func::FuncOp f = getOperation();
if (!fastestVaryingPattern.empty() &&
fastestVaryingPattern.size() != vectorSizes.size()) {
f.emitRemark("Fastest varying pattern specified with different size than "
"the vector size.");
return signalPassFailure();
}

if (vectorizeReductions && vectorSizes.size() != 1) {
f.emitError("Vectorizing reductions is supported only for 1-D vectors.");
return signalPassFailure();
}

if (llvm::any_of(vectorSizes, [](int64_t size) { return size <= 0; })) {
f.emitError("Vectorization factor must be greater than zero.");
return signalPassFailure();
}

vectorizeChildAffineLoops(f, vectorizeReductions, vectorSizes,
                          fastestVaryingPattern);
}
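
Note that the 1-D-reduction and positive-size checks above remain pass-only: the transform op currently validates only that the fastest-varying pattern matches the vector rank. If the same guards were wanted in SuperVectorizeOp::apply, a sketch (an assumption, not part of this patch) could reuse its existing diagnostics machinery:

```cpp
// Hypothetical extra validation mirroring the pass checks above.
if (getVectorizeReductions() && getVectorSizes().size() != 1)
  return emitSilenceableFailure(
      getLoc(), "Vectorizing reductions is supported only for 1-D vectors.");
if (llvm::any_of(getVectorSizes(), [](int64_t size) { return size <= 0; }))
  return emitSilenceableFailure(
      getLoc(), "Vectorization factor must be greater than zero.");
```
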

/// Verify that affine loops in 'loops' meet the nesting criteria expected by
/// SuperVectorizer:
/// * There must be at least one loop.
mlir/test/Dialect/Affine/SuperVectorize/transform_op.mlir (169 additions, 0 deletions)
@@ -0,0 +1,169 @@
// RUN: mlir-opt %s --transform-interpreter -split-input-file | FileCheck %s

// CHECK-LABEL: func @vec1d_3
func.func @vec1d_3(%A : memref<?x?xf32>, %B : memref<?x?x?xf32>) {
// CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index
// CHECK-DAG: %[[C1:.*]] = arith.constant 1 : index
// CHECK-DAG: %[[C2:.*]] = arith.constant 2 : index
// CHECK-DAG: [[ARG_M:%[0-9a-zA-Z_]+]] = memref.dim %arg0, %[[C0]] : memref<?x?xf32>
// CHECK-DAG: [[ARG_N:%[0-9a-zA-Z_]+]] = memref.dim %arg0, %[[C1]] : memref<?x?xf32>
// CHECK-DAG: [[ARG_P:%[0-9a-zA-Z_]+]] = memref.dim %arg1, %[[C2]] : memref<?x?x?xf32>
%c0 = arith.constant 0 : index
%c1 = arith.constant 1 : index
%c2 = arith.constant 2 : index
%M = memref.dim %A, %c0 : memref<?x?xf32>
%N = memref.dim %A, %c1 : memref<?x?xf32>
%P = memref.dim %B, %c2 : memref<?x?x?xf32>

// CHECK: for [[IV8:%[0-9a-zA-Z_]+]] = 0 to [[ARG_M]] step 128
// CHECK-NEXT: for [[IV9:%[0-9a-zA-Z_]*]] = 0 to [[ARG_N]] {
// CHECK-NEXT: %[[APP9_0:[0-9a-zA-Z_]+]] = affine.apply {{.*}}([[IV9]], [[IV8]])
// CHECK-NEXT: %[[APP9_1:[0-9a-zA-Z_]+]] = affine.apply {{.*}}([[IV9]], [[IV8]])
// CHECK-NEXT: %[[CST:.*]] = arith.constant 0.0{{.*}}: f32
// CHECK-NEXT: {{.*}} = vector.transfer_read %{{.*}}[%[[APP9_0]], %[[APP9_1]]], %[[CST]] : memref<?x?xf32>, vector<128xf32>
affine.for %i8 = 0 to %M { // vectorized
affine.for %i9 = 0 to %N {
%a9 = affine.load %A[%i9, %i8 + %i9] : memref<?x?xf32>
}
}
return
}

module attributes {transform.with_named_sequence} {
transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
%0 = transform.structured.match ops{["affine.for"]} in %arg1 : (!transform.any_op) -> !transform.any_op
transform.affine.super_vectorize %0 [128] : !transform.any_op
transform.yield
}
}

// -----

// CHECK-DAG: #[[$map_id1:map[0-9]*]] = affine_map<(d0) -> (d0)>
// CHECK-DAG: #[[$map_proj_d0d1_zerod1:map[0-9]*]] = affine_map<(d0, d1) -> (0, d1)>
// CHECK-DAG: #[[$map_proj_d0d1_d0zero:map[0-9]*]] = affine_map<(d0, d1) -> (d0, 0)>
// CHECK-LABEL: func @vectorize_matmul
func.func @vectorize_matmul(%arg0: memref<?x?xf32>, %arg1: memref<?x?xf32>, %arg2: memref<?x?xf32>) {
%c0 = arith.constant 0 : index
%c1 = arith.constant 1 : index
%M = memref.dim %arg0, %c0 : memref<?x?xf32>
%K = memref.dim %arg0, %c1 : memref<?x?xf32>
%N = memref.dim %arg2, %c1 : memref<?x?xf32>
// CHECK: %[[C0:.*]] = arith.constant 0 : index
// CHECK-NEXT: %[[C1:.*]] = arith.constant 1 : index
// CHECK-NEXT: %[[M:.*]] = memref.dim %{{.*}}, %[[C0]] : memref<?x?xf32>
// CHECK-NEXT: %[[K:.*]] = memref.dim %{{.*}}, %[[C1]] : memref<?x?xf32>
// CHECK-NEXT: %[[N:.*]] = memref.dim %{{.*}}, %[[C1]] : memref<?x?xf32>
// CHECK: {{.*}} #[[$map_id1]](%[[M]]) step 4 {
// CHECK-NEXT: {{.*}} #[[$map_id1]](%[[N]]) step 8 {
// CHECK: %[[VC0:.*]] = arith.constant dense<0.000000e+00> : vector<4x8xf32>
// CHECK-NEXT: vector.transfer_write %[[VC0]], %{{.*}}[%{{.*}}, %{{.*}}] : vector<4x8xf32>, memref<?x?xf32>
affine.for %i0 = affine_map<(d0) -> (d0)>(%c0) to affine_map<(d0) -> (d0)>(%M) {
affine.for %i1 = affine_map<(d0) -> (d0)>(%c0) to affine_map<(d0) -> (d0)>(%N) {
%cst = arith.constant 0.000000e+00 : f32
affine.store %cst, %arg2[%i0, %i1] : memref<?x?xf32>
}
}
// CHECK: affine.for %[[I2:.*]] = #[[$map_id1]](%[[C0]]) to #[[$map_id1]](%[[M]]) step 4 {
// CHECK-NEXT: affine.for %[[I3:.*]] = #[[$map_id1]](%[[C0]]) to #[[$map_id1]](%[[N]]) step 8 {
// CHECK-NEXT: affine.for %[[I4:.*]] = #[[$map_id1]](%[[C0]]) to #[[$map_id1]](%[[K]]) {
// CHECK: %[[A:.*]] = vector.transfer_read %{{.*}}[%[[I4]], %[[I3]]], %{{.*}} {permutation_map = #[[$map_proj_d0d1_zerod1]]} : memref<?x?xf32>, vector<4x8xf32>
// CHECK: %[[B:.*]] = vector.transfer_read %{{.*}}[%[[I2]], %[[I4]]], %{{.*}} {permutation_map = #[[$map_proj_d0d1_d0zero]]} : memref<?x?xf32>, vector<4x8xf32>
// CHECK-NEXT: %[[C:.*]] = arith.mulf %[[B]], %[[A]] : vector<4x8xf32>
// CHECK: %[[D:.*]] = vector.transfer_read %{{.*}}[%[[I2]], %[[I3]]], %{{.*}} : memref<?x?xf32>, vector<4x8xf32>
// CHECK-NEXT: %[[E:.*]] = arith.addf %[[D]], %[[C]] : vector<4x8xf32>
// CHECK: vector.transfer_write %[[E]], %{{.*}}[%[[I2]], %[[I3]]] : vector<4x8xf32>, memref<?x?xf32>
affine.for %i2 = affine_map<(d0) -> (d0)>(%c0) to affine_map<(d0) -> (d0)>(%M) {
affine.for %i3 = affine_map<(d0) -> (d0)>(%c0) to affine_map<(d0) -> (d0)>(%N) {
affine.for %i4 = affine_map<(d0) -> (d0)>(%c0) to affine_map<(d0) -> (d0)>(%K) {
%6 = affine.load %arg1[%i4, %i3] : memref<?x?xf32>
%7 = affine.load %arg0[%i2, %i4] : memref<?x?xf32>
%8 = arith.mulf %7, %6 : f32
%9 = affine.load %arg2[%i2, %i3] : memref<?x?xf32>
%10 = arith.addf %9, %8 : f32
affine.store %10, %arg2[%i2, %i3] : memref<?x?xf32>
}
}
}
return
}

module attributes {transform.with_named_sequence} {
transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
%0 = transform.structured.match ops{["affine.for"]} in %arg1 : (!transform.any_op) -> !transform.any_op
transform.affine.super_vectorize %0 [4, 8] : !transform.any_op
transform.yield
}
}

// -----

// CHECK-LABEL: func @vec3d
func.func @vec3d(%A : memref<?x?x?xf32>) {
%c0 = arith.constant 0 : index
%c1 = arith.constant 1 : index
%c2 = arith.constant 2 : index
%0 = memref.dim %A, %c0 : memref<?x?x?xf32>
%1 = memref.dim %A, %c1 : memref<?x?x?xf32>
%2 = memref.dim %A, %c2 : memref<?x?x?xf32>
// CHECK: affine.for %{{.*}} = 0 to %{{.*}} {
// CHECK: affine.for %{{.*}} = 0 to %{{.*}} {
// CHECK: affine.for %{{.*}} = 0 to %{{.*}} step 32 {
// CHECK: affine.for %{{.*}} = 0 to %{{.*}} step 64 {
// CHECK: affine.for %{{.*}} = 0 to %{{.*}} step 256 {
// CHECK: %{{.*}} = vector.transfer_read %{{.*}}[%{{.*}}, %{{.*}}, %{{.*}}], %{{.*}} : memref<?x?x?xf32>, vector<32x64x256xf32>
affine.for %t0 = 0 to %0 {
affine.for %t1 = 0 to %0 {
affine.for %i0 = 0 to %0 {
affine.for %i1 = 0 to %1 {
affine.for %i2 = 0 to %2 {
%a2 = affine.load %A[%i0, %i1, %i2] : memref<?x?x?xf32>
}
}
}
}
}
return
}

module attributes {transform.with_named_sequence} {
transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
%0 = transform.structured.match ops{["affine.for"]} in %arg1 : (!transform.any_op) -> !transform.any_op
transform.affine.super_vectorize %0 [32, 64, 256] fastest_varying_pattern=[2,1,0] : !transform.any_op
transform.yield
}
}

// -----

// CHECK-LABEL: @vecdim_reduction_minf
// CHECK: affine.for %{{.*}} = 0 to 256 {
// CHECK: %[[vmax:.*]] = arith.constant dense<0x7F800000> : vector<128xf32>
// CHECK: %[[vred:.*]] = affine.for %{{.*}} = 0 to 512 step 128 iter_args(%[[red_iter:.*]] = %[[vmax]]) -> (vector<128xf32>) {
// CHECK: %[[ld:.*]] = vector.transfer_read %{{.*}} : memref<256x512xf32>, vector<128xf32>
// CHECK: %[[min:.*]] = arith.minimumf %[[red_iter]], %[[ld]] : vector<128xf32>
// CHECK: affine.yield %[[min]] : vector<128xf32>
// CHECK: }
// CHECK: %[[final_min:.*]] = vector.reduction <minimumf>, %[[vred:.*]] : vector<128xf32> into f32
// CHECK: affine.store %[[final_min]], %{{.*}} : memref<256xf32>
// CHECK: }

func.func @vecdim_reduction_minf(%in: memref<256x512xf32>, %out: memref<256xf32>) {
%cst = arith.constant 0x7F800000 : f32
affine.for %i = 0 to 256 {
%final_red = affine.for %j = 0 to 512 iter_args(%red_iter = %cst) -> (f32) {
%ld = affine.load %in[%i, %j] : memref<256x512xf32>
%min = arith.minimumf %red_iter, %ld : f32
affine.yield %min : f32
}
affine.store %final_red, %out[%i] : memref<256xf32>
}
return
}

module attributes {transform.with_named_sequence} {
transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
%0 = transform.structured.match ops{["affine.for"]} in %arg1 : (!transform.any_op) -> !transform.any_op
transform.affine.super_vectorize %0 [128] vectorize_reductions=true : !transform.any_op
transform.yield
}
}