
Commit c357983

fix: DecomposeGenericByUnfoldingPermutation pass (#19)
This PR does two things:
1. Extends the decompose pass to also reason about scalar operands, which can appear on linalg.generic ops.
2. Rejects any operands that are not RankedTensors, since the pass was written with that assumption, even though it can naturally be called on linalg ops with memref (or other) operand types.
1 parent: 132b0a8
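For illustration, a minimal before/after sketch of the scalar case this commit enables, with operand names invented for exposition (the new @scalar_broadcast test below exercises this exact pattern through the full specialization pipeline):

// Before: a scalar %s feeds the linalg.generic through a rank-0 indexing map.
#id = affine_map<(d0, d1) -> (d0, d1)>
#scalar = affine_map<(d0, d1) -> ()>
%r = linalg.generic
    { indexing_maps = [#id, #scalar, #id], iterator_types = ["parallel", "parallel"] }
    ins(%t, %s : tensor<4x8xf32>, f32)
    outs(%init : tensor<4x8xf32>) {
  ^bb0(%a: f32, %b: f32, %o: f32):
    %sum = arith.addf %a, %b : f32
    linalg.yield %sum : f32
} -> tensor<4x8xf32>

// After: the scalar is materialized with linalg.fill, and the remaining
// identity-map generic can then be specialized into a named op.
%e = tensor.empty() : tensor<4x8xf32>
%f = linalg.fill ins(%s : f32) outs(%e : tensor<4x8xf32>) -> tensor<4x8xf32>
%r2 = linalg.add ins(%t, %f : tensor<4x8xf32>, tensor<4x8xf32>)
                 outs(%init : tensor<4x8xf32>) -> tensor<4x8xf32>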

2 files changed: +101 −10 lines changed

mlir/lib/Dialect/Linalg/Transforms/DecomposeGenericByUnfoldingPermutation.cpp

Lines changed: 44 additions & 10 deletions
@@ -166,7 +166,14 @@ LogicalResult DecomposeProjectedPermutation::matchAndRewrite(
   // out which operand can supply that runtime-value (tensor.dim).
   // Leaving it as a future TODO.
   if (llvm::any_of(op->getOpOperands(), [](OpOperand &oper) {
-        auto opType = cast<RankedTensorType>(oper.get().getType());
+        // Allow scalar values as these can be broadcasted on the input.
+        if (oper.get().getType().isIntOrFloat())
+          return false;
+        // If any of the operands are not a RankedTensorType, then we should
+        // return early. The pattern has been built with RankedTensors in mind.
+        if (!isa<RankedTensorType>(oper.get().getType()))
+          return true;
+        auto opType = cast<ShapedType>(oper.get().getType());
         return ShapedType::isDynamicShape(opType.getShape());
       }))
     return failure();
@@ -181,10 +188,27 @@ LogicalResult DecomposeProjectedPermutation::matchAndRewrite(
   // Walk over each input operand and unfold if it is transposed, broadcast
   // or mix of two via operand's affine-map.
   for (int64_t i = 0; i < op.getNumDpsInputs(); ++i) {
-    auto &map = newMap[i];
-    auto inputRTType = cast<RankedTensorType>(newInitValues[i].getType());
-    auto elType = inputRTType.getElementType();
+    auto inputType = newInitValues[i].getType();
+    SmallVector<int64_t> inputShape =
+        llvm::TypeSwitch<Type, SmallVector<int64_t>>(inputType)
+            .Case([](RankedTensorType tensor) { return tensor.getShape(); })
+            .Case([](FloatType scalar) { return SmallVector<int64_t>({1}); })
+            .Case([](IntegerType scalar) { return SmallVector<int64_t>({1}); })
+            .Default([](Type) { return SmallVector<int64_t>(); });
+
+    Type elType = llvm::TypeSwitch<Type, Type>(inputType)
+                      .Case([](RankedTensorType tensor) {
+                        return tensor.getElementType();
+                      })
+                      .Case([](FloatType scalar) { return scalar; })
+                      .Case([](IntegerType scalar) { return scalar; })
+                      .Default([](Type) { return Type(); });
+
+    // If we were not able to resolve the information, skip.
+    if (inputShape.empty() || !elType)
+      continue;
 
+    auto &map = newMap[i];
     /// Nothing to do if map is already an identity.
     if (map.isIdentity())
       continue;
@@ -197,7 +221,7 @@ LogicalResult DecomposeProjectedPermutation::matchAndRewrite(
       /// rule: dim(result, i) = dim(input, permutation[i])
       SmallVector<int64_t> transposedShape(map.getNumResults());
       for (int64_t i = 0; i < map.getNumResults(); ++i)
-        transposedShape[i] = inputRTType.getShape()[permutation[i]];
+        transposedShape[i] = inputShape[permutation[i]];
 
       Value emptyTensor =
           rewriter.create<tensor::EmptyOp>(loc, transposedShape, elType);
@@ -211,13 +235,23 @@ LogicalResult DecomposeProjectedPermutation::matchAndRewrite(
     // Does it require broadcast?
     if (!broadcastedDims.empty()) {
       assert(broadcastedDims.size() && "should have non size broadcast");
-      Value emptyTensor = rewriter.create<tensor::EmptyOp>(
-          loc, outputShape, inputRTType.getElementType());
+      Value emptyTensor =
+          rewriter.create<tensor::EmptyOp>(loc, outputShape, elType);
 
-      auto broadcastOp = rewriter.create<linalg::BroadcastOp>(
-          loc, newInitValues[i], emptyTensor, broadcastedDims);
+      Value source = newInitValues[i];
+      Value result;
+      // If a scalar is being broadcasted we can simply use a fill operation.
+      if (source.getType().isIntOrFloat()) {
+        result = rewriter.create<linalg::FillOp>(loc, source, emptyTensor)
+                     ->getResult(0);
+      } else {
+        result = rewriter
+                     .create<linalg::BroadcastOp>(loc, source, emptyTensor,
+                                                  broadcastedDims)
+                     ->getResult(0);
+      }
 
-      newInitValues[i] = broadcastOp->getResult(0);
+      newInitValues[i] = result;
       isChanged = true;
     }
     newMap[i] = rewriter.getMultiDimIdentityMap(map.getNumDims());
mlir/test/Dialect/Linalg/decompose-generic-by-unfolding-projected-permutation.mlir

Lines changed: 57 additions & 0 deletions
@@ -69,3 +69,60 @@ func.func @broadcast_only(%x : tensor<2x16x32xf32>, %y: tensor<2x32xf32>, %z :
 // CHECK: %[[X_bc:.+]] = linalg.broadcast ins(%[[Y]] : tensor<2x32xf32>) outs(%[[E0]] : tensor<2x16x32xf32>) dimensions = [1]
 // CHECK: {{.*}} = linalg.div ins(%[[X]], %[[X_bc]] : tensor<2x16x32xf32>, tensor<2x16x32xf32>) outs(%arg2 : tensor<2x16x32xf32>) -> tensor<2x16x32xf32>
 // CHECK-NOT: linalg.generic
+
+
+// -----
+
+#identity = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
+#broadcast = affine_map<(d0, d1, d2) -> ()>
+func.func @scalar_broadcast(%x: tensor<1x8x16xf32>, %y: f32) -> tensor<1x8x16xf32> {
+  %empty = tensor.empty() : tensor<1x8x16xf32>
+  %res = linalg.generic
+    { indexing_maps = [#identity, #broadcast, #identity], iterator_types = ["parallel", "parallel", "parallel"]}
+    ins(%x, %y : tensor<1x8x16xf32>, f32)
+    outs(%empty : tensor<1x8x16xf32>) {
+  ^bb0(%in: f32, %in2: f32, %out: f32):
+    %add = arith.addf %in, %in2 : f32
+    linalg.yield %add : f32
+  } -> tensor<1x8x16xf32>
+  return %res : tensor<1x8x16xf32>
+}
+
+// CHECK-LABEL: scalar_broadcast
+// CHECK-SAME: %[[INPUT:.+]]: tensor<1x8x16xf32>
+// CHECK-SAME: %[[SCALAR:.+]]: f32
+// CHECK-DAG: %[[EMPTY_ADD:.+]] = tensor.empty() : tensor<1x8x16xf32>
+// CHECK-DAG: %[[EMPTY_FILL:.+]] = tensor.empty() : tensor<1x8x16xf32>
+// CHECK-DAG: %[[FILL:.+]] = linalg.fill
+// CHECK-SAME: ins(%[[SCALAR]] : f32)
+// CHECK-SAME: outs(%[[EMPTY_FILL]] : tensor<1x8x16xf32>)
+// CHECK: %[[ADD:.+]] = linalg.add
+// CHECK-SAME: ins(%[[INPUT]], %[[FILL]] : tensor<1x8x16xf32>, tensor<1x8x16xf32>)
+// CHECK-SAME: outs(%[[EMPTY_ADD]] : tensor<1x8x16xf32>)
+// CHECK: return %[[ADD]] : tensor<1x8x16xf32>
+
+// -----
+
+#identity = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
+#broadcast = affine_map<(d0, d1, d2) -> (d2)>
+func.func @ignore_non_ranked_tensor_types(%x: memref<1x8x16xf32>, %y: memref<16xf32>) {
+  %empty = memref.alloc() : memref<1x8x16xf32>
+  linalg.generic
+    { indexing_maps = [#identity, #broadcast, #identity], iterator_types = ["parallel", "parallel", "parallel"]}
+    ins(%x, %y : memref<1x8x16xf32>, memref<16xf32>)
+    outs(%empty : memref<1x8x16xf32>) {
+  ^bb0(%in: f32, %in2: f32, %out: f32):
+    %add = arith.addf %in, %in2 : f32
+    linalg.yield %add : f32
+  }
+  func.return
+}
+
+// CHECK-LABEL: ignore_non_ranked_tensor_types
+// CHECK-SAME: %[[X:.+]]: memref<1x8x16xf32>
+// CHECK-SAME: %[[Y:.+]]: memref<16xf32>
+// CHECK: %[[EMPTY:.+]] = memref.alloc() : memref<1x8x16xf32>
+// CHECK: linalg.generic
+// CHECK-SAME: ins(%[[X]], %[[Y]] : memref<1x8x16xf32>, memref<16xf32>)
+// CHECK-SAME: outs(%[[EMPTY]] : memref<1x8x16xf32>)
+// CHECK: return
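On running these tests: the file's RUN line sits outside this diff, but assuming it follows the upstream convention for the pass that hosts this pattern, it would look roughly like:

// RUN: mlir-opt %s -split-input-file --linalg-specialize-generic-ops | FileCheck %s

This is a sketch of the expected driver, not a line added by this commit; the CHECK lines above (linalg.fill plus linalg.add rather than a generic) are consistent with the decomposition running inside the generic-op specialization pipeline.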
