Skip to content

[mlir][affine] Use value bound inference to determine minimum/maximum trip counts in loop analysis #128113

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 8 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions mlir/include/mlir/Dialect/Affine/Analysis/LoopAnalysis.h
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,11 @@ void getTripCountMapAndOperands(AffineForOp forOp, AffineMap *map,
/// constant trip count in non-trivial cases.
std::optional<uint64_t> getConstantTripCount(AffineForOp forOp);

/// Returns a constant upper bound on the trip count of `forOp`, or
/// std::nullopt if no such bound can be determined. This is intended for loops
/// whose bound operands are not constants but are known to lie in a range. If
/// the operands of `forOp` are constants, the return value is the same as
/// `getConstantTripCount`.
std::optional<uint64_t> getUpperBoundOnTripCount(AffineForOp forOp);

/// Returns the greatest known integral divisor of the trip count. Affine
/// expression analysis is used (indirectly through getTripCount), and
/// this method is thus able to determine non-trivial divisors.
Expand Down
80 changes: 62 additions & 18 deletions mlir/lib/Dialect/Affine/Analysis/LoopAnalysis.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,8 @@
#include "mlir/Dialect/Affine/Analysis/NestedMatcher.h"
#include "mlir/Dialect/Affine/Analysis/Utils.h"
#include "mlir/Dialect/Affine/IR/AffineValueMap.h"
#include "mlir/Interfaces/FunctionInterfaces.h"
#include "mlir/Interfaces/ValueBoundsOpInterface.h"
#include "llvm/Support/MathExtras.h"

#include "llvm/ADT/DenseSet.h"
Expand Down Expand Up @@ -212,31 +214,68 @@ void mlir::affine::getTripCountMapAndOperands(
tripCountValueMap.getOperands().end());
}

/// Computes a constant bound on the trip count described by `map` applied to
/// `operands`.
///
/// Every result of `map` is treated as one trip-count expression. For each of
/// them a constant lower bound and a constant closed upper bound are inferred
/// with `ValueBoundsConstraintSet::computeConstantBound`, so the operands may
/// be values that are only known to lie in a range. The per-result bounds are
/// then combined according to `type`:
///   * `BoundType::LB`: the minimum of the per-result lower bounds.
///   * `BoundType::UB`: the maximum of the per-result upper bounds.
///
/// A negative bound is clamped to zero before it is combined: a loop cannot
/// execute a negative number of times, and converting a negative value to
/// `uint64_t` would otherwise wrap around to a huge trip count.
///
/// Returns std::nullopt if `type` is neither LB nor UB, if `map` has no
/// results, or if either bound of any result cannot be inferred. Both bounds
/// are required even though only one is returned, so that the LB and UB
/// queries on the same map either both succeed or both fail.
static std::optional<uint64_t>
getKnownTripCountBound(AffineMap map, SmallVectorImpl<Value> &operands,
                       presburger::BoundType type) {
  // Reject unsupported bound kinds up front instead of after doing the
  // (potentially costly) bound inference.
  const bool wantLowerBound = type == presburger::BoundType::LB;
  if (!wantLowerBound && type != presburger::BoundType::UB)
    return std::nullopt;

  std::optional<uint64_t> tripCount;
  for (unsigned i = 0, e = map.getResults().size(); i < e; ++i) {
    AffineMap subMap = map.getSubMap(i);
    ValueBoundsConstraintSet::Variable var(subMap, operands);
    auto lbBound = ValueBoundsConstraintSet::computeConstantBound(
        mlir::presburger::BoundType::LB, var);
    auto ubBound = ValueBoundsConstraintSet::computeConstantBound(
        mlir::presburger::BoundType::UB, var, nullptr, /*closedUB*/ true);
    if (failed(lbBound) || failed(ubBound))
      return std::nullopt;

    // Clamp before the signed -> unsigned conversion so that a negative bound
    // means "zero iterations" rather than wrapping around.
    const int64_t signedBound = wantLowerBound ? *lbBound : *ubBound;
    const uint64_t bound =
        signedBound < 0 ? 0 : static_cast<uint64_t>(signedBound);

    if (!tripCount.has_value())
      tripCount = bound;
    else if (wantLowerBound)
      tripCount = std::min(*tripCount, bound);
    else
      tripCount = std::max(*tripCount, bound);
  }
  return tripCount;
}

/// Returns the known minimum trip count of the loop, std::nullopt if it cannot
/// be determined. When the trip count is a constant this is exactly that
/// constant. When the loop bounds depend on values that are only known to lie
/// in a range, this is the constant lower bound inferred for the trip count;
/// use `getUpperBoundOnTripCount` to obtain the matching upper bound.
/// This method uses affine expression analysis (through
/// `getTripCountMapAndOperands`) together with value bound inference, and is
/// able to determine the trip count in non-trivial cases.
std::optional<uint64_t> mlir::affine::getConstantTripCount(AffineForOp forOp) {
  SmallVector<Value> operands;
  AffineMap map;
  getTripCountMapAndOperands(forOp, &map, &operands);

  // A null map means no trip count expression could be built for this loop.
  if (!map)
    return std::nullopt;
  return getKnownTripCountBound(map, operands, presburger::BoundType::LB);
}

/// Returns a constant upper bound on the trip count of `forOp`, or
/// std::nullopt if none can be determined. This covers loops whose bound
/// operands are only known to lie in a range. If the operands of `forOp` are
/// constants, the return value is the same as `getConstantTripCount`.
std::optional<uint64_t>
mlir::affine::getUpperBoundOnTripCount(AffineForOp forOp) {
  AffineMap tripCountMap;
  SmallVector<Value> tripCountOperands;
  getTripCountMapAndOperands(forOp, &tripCountMap, &tripCountOperands);

  // Only a non-null trip count map can be bounded.
  if (tripCountMap)
    return getKnownTripCountBound(tripCountMap, tripCountOperands,
                                  presburger::BoundType::UB);
  return std::nullopt;
}

/// Returns the greatest known integral divisor of the trip count. Affine
Expand All @@ -254,10 +293,14 @@ uint64_t mlir::affine::getLargestDivisorOfTripCount(AffineForOp forOp) {
// divisors.
assert(map.getNumResults() >= 1 && "expected one or more results");
std::optional<uint64_t> gcd;
for (auto resultExpr : map.getResults()) {
for (unsigned i = 0, e = map.getResults().size(); i < e; ++i) {
uint64_t thisGcd;
if (auto constExpr = dyn_cast<AffineConstantExpr>(resultExpr)) {
uint64_t tripCount = constExpr.getValue();
AffineMap subMap = map.getSubMap(i);
ValueBoundsConstraintSet::Variable var(subMap, operands);
auto lbBound = ValueBoundsConstraintSet::computeConstantBound(
mlir::presburger::BoundType::LB, var);
if (!failed(lbBound)) {
uint64_t tripCount = lbBound.value();
// 0 iteration loops (greatest divisor is 2^64 - 1).
if (tripCount == 0)
thisGcd = std::numeric_limits<uint64_t>::max();
Expand All @@ -266,7 +309,8 @@ uint64_t mlir::affine::getLargestDivisorOfTripCount(AffineForOp forOp) {
thisGcd = tripCount;
} else {
// Trip count is not a known constant; return its largest known divisor.
thisGcd = resultExpr.getLargestKnownDivisor();
thisGcd = map.getResult(i).getLargestKnownDivisor();
;
}
if (gcd.has_value())
gcd = std::gcd(*gcd, thisGcd);
Expand Down
40 changes: 28 additions & 12 deletions mlir/lib/Dialect/Affine/Utils/LoopUtils.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -116,8 +116,10 @@ static void replaceIterArgsAndYieldResults(AffineForOp forOp) {
/// Promotes the loop body of a forOp to its containing block if the forOp
/// was known to have a single iteration.
LogicalResult mlir::affine::promoteIfSingleIteration(AffineForOp forOp) {
std::optional<uint64_t> tripCount = getConstantTripCount(forOp);
if (!tripCount || *tripCount != 1)
std::optional<uint64_t> minTripCount = getConstantTripCount(forOp);
std::optional<uint64_t> maxTripCount = getUpperBoundOnTripCount(forOp);
if (!minTripCount || *minTripCount != 1 || !maxTripCount ||
*maxTripCount != 1)
return failure();

// TODO: extend this for arbitrary affine bounds.
Expand Down Expand Up @@ -884,15 +886,23 @@ void mlir::affine::getTileableBands(
/// Unrolls this loop completely.
///
/// The unroll factor is the trip count reported by `getConstantTripCount`.
/// Returns failure if that trip count is unknown. Returns success without
/// touching the loop if the trip count is zero. Otherwise the loop is first
/// offered to `promoteIfSingleIteration`; if that does not succeed, the result
/// of `loopUnrollByFactor` with the trip count as factor is returned.
LogicalResult mlir::affine::loopUnrollFull(AffineForOp forOp) {
  std::optional<uint64_t> mayBeConstantTripCount = getConstantTripCount(forOp);

  // This is the optional dereferenced below, so it is the one that has to be
  // checked; testing it only in conjunction with another optional would allow
  // an empty value to be dereferenced.
  if (!mayBeConstantTripCount.has_value())
    return failure();

  const uint64_t tripCount = *mayBeConstantTripCount;

  // A zero-trip loop has nothing to unroll.
  if (tripCount == 0)
    return success();

  if (succeeded(promoteIfSingleIteration(forOp)))
    return success();

  return loopUnrollByFactor(forOp, tripCount);
}

/// Unrolls this loop by the specified factor or by the trip count (if constant)
Expand Down Expand Up @@ -1013,8 +1023,11 @@ LogicalResult mlir::affine::loopUnrollByFactor(
assert(unrollFactor > 0 && "unroll factor should be positive");

std::optional<uint64_t> mayBeConstantTripCount = getConstantTripCount(forOp);
std::optional<uint64_t> maxMayBeConstantTripCount =
getUpperBoundOnTripCount(forOp);
if (unrollFactor == 1) {
if (mayBeConstantTripCount && *mayBeConstantTripCount == 1 &&
maxMayBeConstantTripCount && *maxMayBeConstantTripCount == 1 &&
failed(promoteIfSingleIteration(forOp)))
return failure();
return success();
Expand All @@ -1035,7 +1048,10 @@ LogicalResult mlir::affine::loopUnrollByFactor(
}

// Generate the cleanup loop if trip count isn't a multiple of unrollFactor.
if (getLargestDivisorOfTripCount(forOp) % unrollFactor != 0) {
// If the trip count has a range, a clean up loop needs to be generated.
if ((mayBeConstantTripCount && maxMayBeConstantTripCount &&
*mayBeConstantTripCount != *maxMayBeConstantTripCount) ||
getLargestDivisorOfTripCount(forOp) % unrollFactor != 0) {
// Loops where the lower bound is a max expression or the upper bound is
// a min expression and the trip count doesn't divide the unroll factor
// can't be unrolled since the lower bound of the cleanup loop in such cases
Expand Down
3 changes: 2 additions & 1 deletion mlir/lib/Interfaces/ValueBoundsOpInterface.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -646,7 +646,8 @@ FailureOr<int64_t> ValueBoundsConstraintSet::computeConstantBound(
// Compute constant bound for `valueDim`.
int64_t ubAdjustment = closedUB ? 0 : 1;
if (auto bound = cstr.cstr.getConstantBound64(type, pos))
return type == BoundType::UB ? *bound + ubAdjustment : *bound;
if (bound.has_value())
return type == BoundType::UB ? *bound + ubAdjustment : *bound;
return failure();
}

Expand Down
100 changes: 97 additions & 3 deletions mlir/test/Dialect/Affine/unroll.mlir
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
// UNROLL-FULL-DAG: [[$MAP4:#map[0-9]*]] = affine_map<(d0, d1) -> (d0 + 1)>
// UNROLL-FULL-DAG: [[$MAP5:#map[0-9]*]] = affine_map<(d0, d1) -> (d0 + 3)>
// UNROLL-FULL-DAG: [[$MAP6:#map[0-9]*]] = affine_map<(d0)[s0] -> (d0 + s0 + 1)>
// UNROLL-FULL-DAG: [[$MAP7:#map[0-9]*]] = affine_map<()[s0] -> (s0 + (((-s0 + 9) ceildiv 2) floordiv 4) * 8)>

// SHORT-DAG: [[$MAP0:#map[0-9]*]] = affine_map<(d0) -> (d0 + 1)>

Expand All @@ -22,7 +23,8 @@
// UNROLL-BY-4-DAG: [[$MAP4:#map[0-9]*]] = affine_map<(d0, d1) -> (d0 + 3)>
// UNROLL-BY-4-DAG: [[$MAP5:#map[0-9]*]] = affine_map<(d0)[s0] -> (d0 + s0 + 1)>
// UNROLL-BY-4-DAG: [[$MAP6:#map[0-9]*]] = affine_map<(d0, d1) -> (d0 * 16 + d1)>
// UNROLL-BY-4-DAG: [[$MAP11:#map[0-9]*]] = affine_map<(d0) -> (d0)>
// UNROLL-BY-4-DAG: [[$MAP7:#map[0-9]*]] = affine_map<(d0) -> (d0)>
// UNROLL-BY-4-DAG: [[$MAP8:#map[0-9]*]] = affine_map<()[s0] -> (s0 + (((-s0 + 11) ceildiv 2) floordiv 4) * 8)>

// UNROLL-FULL-LABEL: func @loop_nest_simplest() {
func.func @loop_nest_simplest() {
Expand Down Expand Up @@ -258,6 +260,72 @@ gpu.module @unroll_full {
}
}

// UNROLL-FULL-LABEL: func @thread_partial_execution
func.func @thread_partial_execution() {
%c0 = arith.constant 0 :index
%c2 = arith.constant 2 : index
// UNROLL-FULL: %[[C0:.*]] = arith.constant 0 : index
gpu.launch blocks(%bx, %by, %bz) in (%sz_bx = %c2, %sz_by = %c2, %sz_bz = %c2)
threads(%tx, %ty, %tz) in (%sz_tx = %c2, %sz_ty = %c2, %sz_tz = %c2) {
affine.for %iv = %tx to 3 step 2 iter_args(%arg = %c0) -> index {
%sum = arith.addi %arg, %c0 : index
affine.yield %sum : index
}
// UNROLL-FULL: affine.for %{{.*}} = %{{.*}} to 3 step 2 iter_args(%[[ARG:.*]] = %[[C0]]) -> (index) {
// UNROLL-FULL-NEXT: %[[SUM:.*]] = arith.addi %[[ARG]], %[[C0]] : index
// UNROLL-FULL-NEXT: affine.yield %[[SUM]] : index
// UNROLL-FULL-NEXT: }
gpu.terminator
}
return
}

// UNROLL-FULL-LABEL: func @unroll_all_thread
func.func @unroll_all_thread() {
%c0 = arith.constant 0 :index
%c2 = arith.constant 2 : index
// UNROLL-FULL: %[[C0:.*]] = arith.constant 0 : index
gpu.launch blocks(%bx, %by, %bz) in (%sz_bx = %c2, %sz_by = %c2, %sz_bz = %c2)
threads(%tx, %ty, %tz) in (%sz_tx = %c2, %sz_ty = %c2, %sz_tz = %c2) {
%threadid = gpu.thread_id x
affine.for %iv = %threadid to 6 step 2 iter_args(%arg = %c0) -> index {
%sum = arith.addi %arg, %c0 : index
affine.yield %sum : index
}
// UNROLL-FULL: %[[SUM_0:.*]] = arith.addi %[[C0]], %[[C0]] : index
// UNROLL-FULL-NEXT: %[[SUM_1:.*]] = arith.addi %[[SUM_0]], %[[C0]] : index
// UNROLL-FULL-NEXT: %[[SUM_2:.*]] = arith.addi %[[SUM_1]], %[[C0]] : index
gpu.terminator
}
return
}

// UNROLL-FULL-LABEL: func.func @partial_unroll_factor_4
func.func @partial_unroll_factor_4() {
%c0 = arith.constant 0 :index
%c2 = arith.constant 2 : index
// UNROLL-FULL: %[[C0:.*]] = arith.constant 0 : index
gpu.launch blocks(%bx, %by, %bz) in (%sz_bx = %c2, %sz_by = %c2, %sz_bz = %c2)
threads(%tx, %ty, %tz) in (%sz_tx = %c2, %sz_ty = %c2, %sz_tz = %c2) {
%threadid = gpu.thread_id x
affine.for %iv = %threadid to 9 step 2 iter_args(%arg = %c0) -> index {
%sum = arith.addi %arg, %c0 : index
affine.yield %sum : index
}
gpu.terminator
}
// UNROLL-FULL: %[[ID:.*]] = gpu.thread_id x
// UNROLL-FULL-NEXT: %[[SUM_0:.*]] = arith.addi %[[C0]], %[[C0]] : index
// UNROLL-FULL-NEXT: %[[SUM_1:.*]] = arith.addi %[[SUM_0]], %[[C0]] : index
// UNROLL-FULL-NEXT: %[[SUM_2:.*]] = arith.addi %[[SUM_1]], %[[C0]] : index
// UNROLL-FULL-NEXT: %[[SUM_3:.*]] = arith.addi %[[SUM_2]], %[[C0]] : index
// UNROLL-FULL-NEXT: affine.for %{{.*}} = [[$MAP7]]()[%[[ID]]] to 9 step 2 iter_args(%[[ARG:.*]] = %[[SUM_3]]) -> (index) {
// UNROLL-FULL-NEXT: %[[SUM_4:.*]] = arith.addi %[[ARG]], %[[C0]] : index
// UNROLL-FULL-NEXT: affine.yield %[[SUM_4]] : index
// UNROLL-FULL-NEXT: }
return
}

// SHORT-LABEL: func @loop_nest_outer_unroll() {
func.func @loop_nest_outer_unroll() {
// SHORT: affine.for %arg0 = 0 to 4 {
Expand Down Expand Up @@ -470,7 +538,7 @@ func.func @loop_nest_operand1() {
// UNROLL-BY-4-LABEL: func @loop_nest_operand2() {
func.func @loop_nest_operand2() {
// UNROLL-BY-4: affine.for %arg0 = 0 to 100 step 2 {
// UNROLL-BY-4-NEXT: affine.for %arg1 = [[$MAP11]](%arg0) to #map{{[0-9]*}}(%arg0) step 4 {
// UNROLL-BY-4-NEXT: affine.for %arg1 = [[$MAP7]](%arg0) to #map{{[0-9]*}}(%arg0) step 4 {
// UNROLL-BY-4-NEXT: %0 = "foo"() : () -> i32
// UNROLL-BY-4-NEXT: %1 = "foo"() : () -> i32
// UNROLL-BY-4-NEXT: %2 = "foo"() : () -> i32
Expand Down Expand Up @@ -516,7 +584,7 @@ func.func @floordiv_mod_ub(%M : index, %N : index) {
func.func @loop_nest_operand3() {
// UNROLL-BY-4: affine.for %arg0 = 0 to 100 step 2 {
affine.for %i = 0 to 100 step 2 {
// UNROLL-BY-4: affine.for %arg1 = [[$MAP11]](%arg0) to #map{{[0-9]*}}(%arg0) step 4 {
// UNROLL-BY-4: affine.for %arg1 = [[$MAP7]](%arg0) to #map{{[0-9]*}}(%arg0) step 4 {
// UNROLL-BY-4-NEXT: %1 = "foo"() : () -> i32
// UNROLL-BY-4-NEXT: %2 = "foo"() : () -> i32
// UNROLL-BY-4-NEXT: %3 = "foo"() : () -> i32
Expand Down Expand Up @@ -701,6 +769,32 @@ func.func @unroll_with_iter_args_and_promotion(%arg0 : f32, %arg1 : f32) -> f32
return %sum : f32
}

// UNROLL-BY-4-LABEL: func @gpu_launch_unroll_by_factor_4
func.func @gpu_launch_unroll_by_factor_4() {
%c0 = arith.constant 0 :index
%c2 = arith.constant 2 : index
// UNROLL-BY-4: %[[C0:.*]] = arith.constant 0 : index
gpu.launch blocks(%bx, %by, %bz) in (%sz_bx = %c2, %sz_by = %c2, %sz_bz = %c2)
threads(%tx, %ty, %tz) in (%sz_tx = %c2, %sz_ty = %c2, %sz_tz = %c2) {
%threadid = gpu.thread_id x
affine.for %iv = %threadid to 11 step 2 iter_args(%arg = %c0) -> index {
%sum = arith.addi %arg, %c0 : index
affine.yield %sum : index
}
gpu.terminator
}
// UNROLL-BY-4: %[[ID:.*]] = gpu.thread_id x
// UNROLL-BY-4-NEXT: %[[SUM_0:.*]] = arith.addi %[[C0]], %[[C0]] : index
// UNROLL-BY-4-NEXT: %[[SUM_1:.*]] = arith.addi %[[SUM_0]], %[[C0]] : index
// UNROLL-BY-4-NEXT: %[[SUM_2:.*]] = arith.addi %[[SUM_1]], %[[C0]] : index
// UNROLL-BY-4-NEXT: %[[SUM_3:.*]] = arith.addi %[[SUM_2]], %[[C0]] : index
// UNROLL-BY-4-NEXT: affine.for %[[VAL_20:.*]] = [[$MAP8]](){{\[}}%[[ID]]] to 11 step 2 iter_args(%[[ARG:.*]] = %[[SUM_3]]) -> (index) {
// UNROLL-BY-4-NEXT: %[[SUM_4:.*]] = arith.addi %[[ARG]], %[[C0]] : index
// UNROLL-BY-4-NEXT: affine.yield %[[SUM_4]] : index
// UNROLL-BY-4-NEXT: }
return
}

// UNROLL-FULL: func @unroll_zero_trip_count_case
func.func @unroll_zero_trip_count_case() {
// CHECK-NEXT: affine.for %{{.*}} = 0 to 0
Expand Down