
Commit da5cb46

Check for parallel IV in affineMapToSlice (EnzymeAD#658)
* Check for parallel IV in affineMapToSlice
* remove log
* fmt
* fmt2
* Update test
* remove duplicate test
* literal
1 parent fbb7296 commit da5cb46
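
Summary of the change: affineMapToSlice previously treated every affine.for induction variable as a unit-stride dimension. It now also receives the ParallelContext and, for IVs the context marks as parallel, computes the expression range like any other expression, so a negative step is normalized to a positive stride and the dimension is recorded in reverseDims. To take ParallelContext as a parameter, the function definition moves below the struct, and both call sites in tryRaisingOpToStableHLO now pass pc through. The forred2 lit test is updated to expect the resulting stablehlo.reverse operations.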

2 files changed (+62, -58 lines)


src/enzyme_ad/jax/Passes/AffineToStableHLORaising.cpp

Lines changed: 50 additions & 49 deletions
@@ -200,53 +200,6 @@ emitIVToStableHLO(OpBuilder &builder, Value iv, InductionVariableRange range,
   maps[iota] = accessMap;
 }

-// Given an affine map for a load/store operation, compute the startIndices,
-// limitIndices and strides corresponding in the memref based on the loop
-// induction variables.
-//
-// (i) -> (0, i, 10) will give [0:1:1, begin:end:step, 10:11:1]
-// (i) -> (2 * i, i + 2, 10) will give [begin*2:end*2:2*step,
-// begin+2:end+2:step, 10:11:1]
-//
-// with begin:end:step corresponding to the range of the iv i.
-static LogicalResult affineMapToSlice(affine::AffineValueMap accessValueMap,
-                                      SmallVectorImpl<int64_t> &strides,
-                                      SmallVectorImpl<int64_t> &reverseDims) {
-  auto rank = accessValueMap.getNumResults();
-
-  strides.reserve(rank);
-
-  for (unsigned i = 0; i < rank; i++) {
-    auto expr = accessValueMap.getResult(i);
-
-    if (auto constExpr = dyn_cast<AffineConstantExpr>(expr)) {
-      strides.push_back(1);
-      continue;
-    }
-
-    Value iv = getIVForExpr(accessValueMap, expr);
-    if (affine::isAffineForInductionVar(iv)) {
-      strides.push_back(1);
-      continue;
-    }
-
-    auto range = computeExprRange(accessValueMap, expr);
-
-    if (!range.has_value())
-      return failure();
-
-    if (range->step < 0) {
-      // 0:-1:-180 -> -179:1:1
-      strides.push_back(-range->step);
-      reverseDims.push_back(i);
-    } else {
-      strides.push_back(range->step);
-    }
-  }
-
-  return success();
-}
-
 // The name is parallel context but a more accurate description would be
 // LockStepContext
 struct ParallelContext {
@@ -337,6 +290,54 @@ struct ParallelContext {
   }
 };

+// Given an affine map for a load/store operation, compute the startIndices,
+// limitIndices and strides corresponding in the memref based on the loop
+// induction variables.
+//
+// (i) -> (0, i, 10) will give [0:1:1, begin:end:step, 10:11:1]
+// (i) -> (2 * i, i + 2, 10) will give [begin*2:end*2:2*step,
+// begin+2:end+2:step, 10:11:1]
+//
+// with begin:end:step corresponding to the range of the iv i.
+static LogicalResult affineMapToSlice(affine::AffineValueMap accessValueMap,
+                                      SmallVectorImpl<int64_t> &strides,
+                                      SmallVectorImpl<int64_t> &reverseDims,
+                                      ParallelContext pc) {
+  auto rank = accessValueMap.getNumResults();
+
+  strides.reserve(rank);
+
+  for (unsigned i = 0; i < rank; i++) {
+    auto expr = accessValueMap.getResult(i);
+
+    if (auto constExpr = dyn_cast<AffineConstantExpr>(expr)) {
+      strides.push_back(1);
+      continue;
+    }
+
+    Value iv = getIVForExpr(accessValueMap, expr);
+    if (affine::isAffineForInductionVar(iv) && !pc.isParallelIV(iv)) {
+      strides.push_back(1);
+      continue;
+    }
+
+    auto range = computeExprRange(accessValueMap, expr);
+
+    if (!range.has_value())
+      return failure();
+
+    if (range->step < 0) {
+      // 0:-1:-180 -> -179:1:1
+      strides.push_back(-range->step);
+      reverseDims.push_back(i);
+    } else {
+      strides.push_back(range->step);
+    }
+  }
+
+  return success();
+}
+
 static SmallVector<int64_t>
 affineMapShape(affine::AffineValueMap accessValueMap, ParallelContext pc) {
   AffineMap map = accessValueMap.getAffineMap();
@@ -1597,7 +1598,7 @@ tryRaisingOpToStableHLO(Operation *op, IRMapping &mapping, OpBuilder &builder,
   SmallVector<int64_t> strides;
   SmallVector<int64_t> reverseDims;

-  if (affineMapToSlice(accessValueMap, strides, reverseDims).failed()) {
+  if (affineMapToSlice(accessValueMap, strides, reverseDims, pc).failed()) {
     LLVM_DEBUG(llvm::dbgs()
                << "Failed to affine map to slice: " << *op << "\n");
     return failure();
@@ -1733,7 +1734,7 @@ tryRaisingOpToStableHLO(Operation *op, IRMapping &mapping, OpBuilder &builder,
   SmallVector<int64_t> strides;
   SmallVector<int64_t> reverseDims;

-  if (affineMapToSlice(accessValueMap, strides, reverseDims).failed()) {
+  if (affineMapToSlice(accessValueMap, strides, reverseDims, pc).failed()) {
     LLVM_DEBUG(llvm::dbgs()
                << "Failed to affine map to slice: " << *op << "\n");
     return failure();
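
For intuition, here is a minimal standalone sketch of the begin:end:step arithmetic documented in the comment above affineMapToSlice. It is not the pass's API; sliceForAffineExpr and the driver are hypothetical, written only to work through the examples from that comment.

// sketch.cpp -- standalone model of the slice arithmetic (hypothetical helper).
#include <cstdint>
#include <iostream>

struct Slice {
  int64_t start, limit, stride;
};

// For an access expression a*i + b over an induction variable with range
// begin:end:step, the slice triple is begin*a+b : end*a+b : a*step.
Slice sliceForAffineExpr(int64_t a, int64_t b, int64_t begin, int64_t end,
                         int64_t step) {
  return {a * begin + b, a * end + b, a * step};
}

int main() {
  // (i) -> (2 * i, i + 2, 10) with i in 0:8:1 yields [0:16:2, 2:10:1, 10:11:1],
  // matching the comment's [begin*2:end*2:2*step, begin+2:end+2:step, 10:11:1].
  Slice dims[] = {
      sliceForAffineExpr(2, 0, 0, 8, 1), // 2 * i
      sliceForAffineExpr(1, 2, 0, 8, 1), // i + 2
      {10, 11, 1},                       // constant index 10 -> 10:11:1
  };
  for (auto [start, limit, stride] : dims)
    std::cout << start << ":" << limit << ":" << stride << "\n";
}

A negative a*step corresponds to the range->step < 0 branch above: the stride is negated and the dimension recorded in reverseDims, so a stablehlo.reverse can restore the original orientation.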

test/lit_tests/raising/affine_to_stablehlo_forred2.mlir

Lines changed: 12 additions & 9 deletions
@@ -36,15 +36,18 @@ module {
 // CHECK-NEXT: %5 = stablehlo.reshape %4 : (tensor<32x16xf64>) -> tensor<1x32x16xf64>
 // CHECK-NEXT: %6 = stablehlo.slice %arg0 [8:9, 0:32, 0:16] : (tensor<9x32x16xf64>) -> tensor<1x32x16xf64>
 // CHECK-NEXT: %7 = stablehlo.slice %arg1 [0:7, 0:32, 0:16] : (tensor<9x32x16xf64>) -> tensor<7x32x16xf64>
-// CHECK-NEXT: %8 = stablehlo.slice %arg1 [1:8, 0:32, 0:16] : (tensor<9x32x16xf64>) -> tensor<7x32x16xf64>
-// CHECK-NEXT: %9 = arith.addf %8, %7 : tensor<7x32x16xf64>
-// CHECK-NEXT: %10 = stablehlo.broadcast_in_dim %4, dims = [1, 2] : (tensor<32x16xf64>) -> tensor<7x32x16xf64>
-// CHECK-NEXT{LITERAL}: %11 = "stablehlo.reduce_window"(%9, %cst) <{base_dilations = array<i64: 1, 1, 1>, padding = dense<[[6, 0], [0, 0], [0, 0]]> : tensor<3x2xi64>, window_dilations = array<i64: 1, 1, 1>, window_dimensions = array<i64: 7, 1, 1>, window_strides = array<i64: 1, 1, 1>}> ({
+// CHECK-NEXT: %8 = stablehlo.reverse %7, dims = [0] : tensor<7x32x16xf64>
+// CHECK-NEXT: %9 = stablehlo.slice %arg1 [1:8, 0:32, 0:16] : (tensor<9x32x16xf64>) -> tensor<7x32x16xf64>
+// CHECK-NEXT: %10 = stablehlo.reverse %9, dims = [0] : tensor<7x32x16xf64>
+// CHECK-NEXT: %11 = arith.addf %10, %8 : tensor<7x32x16xf64>
+// CHECK-NEXT: %12 = stablehlo.broadcast_in_dim %4, dims = [1, 2] : (tensor<32x16xf64>) -> tensor<7x32x16xf64>
+// CHECK-NEXT{LITERAL}: %13 = "stablehlo.reduce_window"(%11, %cst) <{base_dilations = array<i64: 1, 1, 1>, padding = dense<[[6, 0], [0, 0], [0, 0]]> : tensor<3x2xi64>, window_dilations = array<i64: 1, 1, 1>, window_dimensions = array<i64: 7, 1, 1>, window_strides = array<i64: 1, 1, 1>}> ({
 // CHECK-NEXT: ^bb0(%arg2: tensor<f64>, %arg3: tensor<f64>):
-// CHECK-NEXT: %14 = stablehlo.add %arg2, %arg3 : tensor<f64>
-// CHECK-NEXT: stablehlo.return %14 : tensor<f64>
+// CHECK-NEXT: %17 = stablehlo.add %arg2, %arg3 : tensor<f64>
+// CHECK-NEXT: stablehlo.return %17 : tensor<f64>
 // CHECK-NEXT: }) : (tensor<7x32x16xf64>, tensor<f64>) -> tensor<7x32x16xf64>
-// CHECK-NEXT: %12 = stablehlo.add %11, %10 : tensor<7x32x16xf64>
-// CHECK-NEXT: %13 = stablehlo.concatenate %12, %5, %6, dim = 0 : (tensor<7x32x16xf64>, tensor<1x32x16xf64>, tensor<1x32x16xf64>) -> tensor<9x32x16xf64>
-// CHECK-NEXT: return %13, %arg1 : tensor<9x32x16xf64>, tensor<9x32x16xf64>
+// CHECK-NEXT: %14 = stablehlo.add %13, %12 : tensor<7x32x16xf64>
+// CHECK-NEXT: %15 = stablehlo.reverse %14, dims = [0] : tensor<7x32x16xf64>
+// CHECK-NEXT: %16 = stablehlo.concatenate %15, %5, %6, dim = 0 : (tensor<7x32x16xf64>, tensor<1x32x16xf64>, tensor<1x32x16xf64>) -> tensor<9x32x16xf64>
+// CHECK-NEXT: return %16, %arg1 : tensor<9x32x16xf64>, tensor<9x32x16xf64>
 // CHECK-NEXT: }
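
The updated expectations reflect the reversed parallel dimension: the two slices of %arg1 are each reversed along dim 0 before the arith.addf, and the reduce_window result is reversed back before the concatenate, which shifts the SSA numbering of the remaining values.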
