From 764d8e65917472fed9a260caa1b9422994a40370 Mon Sep 17 00:00:00 2001
From: James Newling <james.newling@gmail.com>
Date: Wed, 25 Sep 2024 13:05:47 -0700
Subject: [PATCH] update

---
 .../Transforms/AMDAIEConvertToDma.cpp         |   2 +-
 .../Transforms/test/convert_to_dma.mlir       | 365 +++++++++---------
 2 files changed, 190 insertions(+), 177 deletions(-)
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEConvertToDma.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEConvertToDma.cpp
index 8a688c11d..4a93ee31e 100644
--- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEConvertToDma.cpp
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEConvertToDma.cpp
@@ -382,7 +382,7 @@ LogicalResult updateFromExpandShape(memref::ExpandShapeOp expandShapeOp,
   // Offsets. For now we don't do any arithmetic to split the offset across
   // dimensions, in theory we need to split the offset amongst the reassociation
   // indices, but for now I'm just putting the offset on the inner most
-  // dimension. 
+  // dimension.
   SmallVector<OpFoldResult> newOffsets(resultShape.size());
   for (int i = 0; i < resultShape.size(); i++) {
     newOffsets[i] = getAsIndexOpFoldResult(ctx, 0);
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/convert_to_dma.mlir b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/convert_to_dma.mlir
index 3f4e2dea4..487c5714a 100644
--- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/convert_to_dma.mlir
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/convert_to_dma.mlir
@@ -16,95 +16,6 @@ func.func @basic_unitdim_pack() {
   return
 }
 
-// -----
-
-// CHECK-LABEL: collapsed_and_expanded_pack_0
-// CHECK: amdaie.dma_cpy_nd
-// CHECK-SAME: [0, 0] [10, 10] [10, 1]
-// CHECK-SAME: [0, 0] [10, 10] [10, 1]
-func.func @collapsed_and_expanded_pack_0() {
-  %alloc0 = memref.alloc() : memref<10x10xf32>
-  %src = memref.collapse_shape %alloc0 [[0, 1]] : memref<10x10xf32> into memref<100xf32>
-  %alloc1 = memref.alloc() : memref<100xf32>
-  %dst = memref.expand_shape %alloc1 [[0, 1]] output_shape [10, 10] : memref<100xf32> into memref<10x10xf32>
-  iree_linalg_ext.pack %src inner_dims_pos = [0] inner_tiles = [10] into %dst : (memref<100xf32> memref<10x10xf32>)
-  return
-}
-
-// -----
-
-
-// This test is essentially the same as the test above, except there is an expand operation on the %src.
-// CHECK-LABEL: @basic_unitdim_pack_expand
-// CHECK-DAG: %[[SRCMEMREF:.*]] = amdaie.logicalobjectfifo.from_memref{{.*}}memref<8x16xi32, 1>
-// CHECK-DAG: %[[DSTMEMREF:.*]] = amdaie.logicalobjectfifo.from_memref{{.*}}memref<8x16xi32, 2>
-// CHECK: amdaie.dma_cpy_nd
-// CHECK-SAME: %[[DSTMEMREF]][0, 0, 0, 0] [1, 1, 8, 16] [128, 128, 16, 1]
-// CHECK-SAME: %[[SRCMEMREF]][0, 0, 0, 0] [1, 1, 8, 16] [128, 16, 16, 1]
-func.func @basic_unitdim_pack_expand() {
-  %src = memref.alloc() : memref<8x16xi32, 1>
-  %dst = memref.alloc() : memref<8x16xi32, 2>
-  %dst_e = memref.expand_shape %dst [[0, 1, 2], [3]] output_shape [1, 1, 8, 16]
-         : memref<8x16xi32, 2> into memref<1x1x8x16xi32, 2>
-  iree_linalg_ext.pack %src inner_dims_pos = [0, 1] inner_tiles = [8, 16] into %dst_e:
-           (memref<8x16xi32, 1> memref<1x1x8x16xi32, 2>)
-  return
-}
-
-// -----
-
-// CHECK-LABEL: @basic_unitdim_unpack_expand
-// CHECK-DAG: %[[SRCMEMREF:.*]] = amdaie.logicalobjectfifo.from_memref{{.*}}memref<8x16xi32, 1>
-// CHECK-DAG: %[[DSTMEMREF:.*]] = amdaie.logicalobjectfifo.from_memref{{.*}}memref<8x16xi32, 2>
-// CHECK: amdaie.dma_cpy_nd
-// CHECK-SAME: %[[DSTMEMREF]][0, 0] [8, 16] [16, 1]
-// CHECK-SAME: %[[SRCMEMREF]][0, 0, 0, 0] [1, 8, 1, 16] [128, 16, 128, 1]
-func.func @basic_unitdim_unpack_expand() {
-  %src = memref.alloc() : memref<8x16xi32, 1>
-  %dst = memref.alloc() : memref<8x16xi32, 2>
-  %src_e = memref.expand_shape %src [[0, 1, 2], [3]] output_shape [1, 1, 8, 16]
-         : memref<8x16xi32, 1> into memref<1x1x8x16xi32, 1>
-  iree_linalg_ext.unpack %src_e inner_dims_pos = [0, 1] inner_tiles = [8, 16] into %dst:
-           (memref<1x1x8x16xi32, 1> memref<8x16xi32, 2>)
-  return
-}
-
-// -----
-
-// CHECK-LABEL: multidim_with_expand
-// CHECK: amdaie.dma_cpy_nd
-// dst of dma cpy:
-// CHECK-SAME: [0, 0, 0, 0] [20, 5, 10, 10] [500, 100, 10, 1]
-// src of dma cpy:
-// CHECK-SAME: [0, 0, 0, 0] [20, 5, 10, 10] [500, 10, 50, 1]
-func.func @multidim_with_expand() {
-  %src = memref.alloc() : memref<200x50xi32, 1>
-  %dst = memref.alloc() : memref<100x100xi32, 2>
-  %dst_e = memref.expand_shape %dst [[0, 1], [2, 3]] output_shape [20, 5, 10, 10]
-         : memref<100x100xi32, 2> into memref<20x5x10x10xi32, 2>
-  iree_linalg_ext.pack %src inner_dims_pos = [0, 1] inner_tiles = [10, 10]
-     into %dst_e: (memref<200x50xi32, 1> memref<20x5x10x10xi32, 2>)
-  return
-}
-
-// -----
-
-// CHECK-LABEL: multidim_without_expand
-// CHECK: amdaie.dma_cpy_nd
-// dst of dma cpy:
-// CHECK-SAME: [0, 0, 0, 0] [20, 5, 10, 10] [500, 100, 10, 1]
-// src of dma cpy:
-// CHECK-SAME: [0, 0, 0, 0] [20, 5, 10, 10] [500, 10, 50, 1]
-func.func @multidim_without_expand() {
-  %src = memref.alloc() : memref<200x50xi32, 1>
-  %dst = memref.alloc() : memref<20x5x10x10xi32, 2>
-  iree_linalg_ext.pack %src inner_dims_pos = [0, 1] inner_tiles = [10, 10]
-     into %dst: (memref<200x50xi32, 1> memref<20x5x10x10xi32, 2>)
-  return
-}
-
-
-
 // -----
 
 // CHECK-LABEL: @multidim_pack
@@ -123,7 +34,6 @@ func.func @multidim_pack() {
   return
 }
 
-
 // -----
 
 // CHECK-LABEL: @permute_pack
@@ -172,67 +82,6 @@ func.func @subview_pack() {
 
 // -----
 
-// CHECK-LABEL: @subview_then_collapse(%arg0: index)
-// CHECK: %[[ALLOC0:.*]] = memref.alloc() : memref<20x10xf32>
-// CHECK: %[[C10:.*]] = arith.constant 10 : index
-// CHECK: %[[MULI:.*]] = arith.muli %arg0, %[[C10]] : index
-// CHECK: amdaie.dma_cpy_nd
-// CHECK-SAME: [0, 0] [5, 20] [20, 1]
-// CHECK-SAME: [0, %[[MULI]]] [5, 20] [20, 1]
-func.func @subview_then_collapse(%arg0 : index) {
-  %src = memref.alloc() : memref<20x10xf32>
-  %subview = memref.subview %src[%arg0, 0] [10, 10] [1, 1] :
-           memref<20x10xf32> to memref<10x10xf32, strided<[10, 1], offset: ?>>
-  %collapsed = memref.collapse_shape %subview [[0, 1]] : memref<10x10xf32, strided<[10, 1], offset: ?>>
-           into memref<100xf32, strided<[1], offset: ?>>
-  %dst = memref.alloc() : memref<5x20xf32>
-  iree_linalg_ext.pack %collapsed inner_dims_pos = [0] inner_tiles = [20] into %dst
-          : (memref<100xf32, strided<[1], offset: ?>> memref<5x20xf32>)
-  return
-}
-
-// -----
-
-// CHECK-LABEL: @expand_shape_after_subview
-// CHECK: amdaie.dma_cpy_nd
-// CHECK-SAME: [0, 0, %arg0, 0, 0] [2, 3, 6, 6, 1] [300, 100, 10, 1, 1]
-// CHECK-SAME: [0, 0, 0, 0, 0] [2, 3, 6, 6, 1] [108, 6, 1, 18, 6]
-module {
-  func.func @expand_shape_after_subview(%arg0: index) {
-    %alloc = memref.alloc() : memref<10x10x10xf32>
-    %subview = memref.subview %alloc[0, %arg0, 0] [6, 6, 6] [1, 1, 1] :
-       memref<10x10x10xf32> to memref<6x6x6xf32, strided<[100, 10, 1], offset: ?>>
-    %expand_shape = memref.expand_shape %subview [[0, 1], [2], [3, 4]]
-       output_shape [2, 3, 6, 6, 1] : memref<6x6x6xf32, strided<[100, 10, 1], offset: ?>>
-       into memref<2x3x6x6x1xf32, strided<[300, 100, 10, 1, 1], offset: ?>>
-    %alloc_0 = memref.alloc() : memref<12x3x6xf32>
-    iree_linalg_ext.pack %alloc_0 inner_dims_pos = [0, 1] inner_tiles = [6, 1]
-       into %expand_shape : (memref<12x3x6xf32> memref<2x3x6x6x1xf32, strided<[300, 100, 10, 1, 1], offset: ?>>)
-    return
-  }
-}
-
-// -----
-
-// CHECK-LABEL: @subview_followed_by_subview(%arg0: index, %arg1: index)
-// CHECK: %[[SUM:.*]] = arith.addi %arg0, %arg1 : index
-// CHECK: amdaie.dma_cpy_nd
-// CHECK-SAME: [0] [100] [1],
-// CHECK-SAME: [%[[SUM]], 5] [10, 10] [20, 1]
-func.func @subview_followed_by_subview(%arg0 : index, %arg1 : index){
-  %src = memref.alloc() : memref<20x20xf32>
-  %subview0 = memref.subview %src[%arg0, 2] [15, 15] [1, 1] :
-           memref<20x20xf32> to memref<15x15xf32, strided<[20, 1], offset: ?>>
-  %subview1 = memref.subview %subview0[%arg1, 3] [10, 10] [1, 1] :
-           memref<15x15xf32, strided<[20, 1], offset: ?>> to memref<10x10xf32, strided<[20, 1], offset: ?>>
-  %dst = memref.alloc() : memref<100xf32>
-  iree_linalg_ext.unpack %subview1 inner_dims_pos = [0] inner_tiles = [10] into %dst
-       : (memref<10x10xf32, strided<[20, 1], offset: ?>> memref<100xf32>)
-  return
-}
-
-// -----
-
 // CHECK-LABEL: @collapsing_subview_pack
 // CHECK: %[[SRC_LOFI:.*]] = amdaie.logicalobjectfifo.from_memref {{.*}} !amdaie.logicalobjectfifo<memref<12x5x2x10x6x8xf32>>
 // CHECK: %[[DST_LOFI:.*]] = amdaie.logicalobjectfifo.from_memref {{.*}} !amdaie.logicalobjectfifo<memref<2x2x3x3xf32, 1>>
@@ -240,11 +89,19 @@ func.func @subview_followed_by_subview(%arg0 : index, %arg1 : index){
 // CHECK-SAME: %[[DST_LOFI]][0, 0, 0, 0] [2, 2, 3, 3] [18, 9, 3, 1]
 // CHECK-SAME: %[[SRC_LOFI]][0, 0, 0, 0] [2, 2, 3, 3] [14400, 480, 8, 4800]
 
+// Note on the source-side strides [14400, 480, 8, 4800]: how are they
+// calculated? The source (%sbv) is rank-3 with strides [4800, 480, 8].
+// The pack is essentially 2 operations:
+// 1) a reshape 6x2x3 -> 2x3x2x3
+// 2) a permute 2x3x2x3 -> 2x2x3x3 (index 1 migrates to end).
+// The reshape makes the strides go from [4800, 480, 8] to [4800*3, 4800, 480, 8]
+// The permute makes the strides go from [4800*3, 4800, 480, 8] to [4800*3, 480, 8, 4800]
+
 func.func @collapsing_subview_pack() {
   %src = memref.alloc() : memref<12x5x2x10x6x8xf32>
-  %sbv = memref.subview %src[0, 0, 0, 0, 0, 0]
-                            [6, 1, 2, 1, 3, 1]
-                            [1, 1, 1, 1, 1, 1] :
+  %sbv = memref.subview %src[0, 0, 0, 0, 0, 0] // offset
+                            [6, 1, 2, 1, 3, 1] // size
+                            [1, 1, 1, 1, 1, 1] : // stride
           memref<12x5x2x10x6x8xf32> to memref<6x2x3xf32, strided<[4800,480,8]>>
   %dst= memref.alloc() : memref<2x2x3x3xf32, 1>
   iree_linalg_ext.pack %sbv inner_dims_pos = [0]
@@ -255,28 +112,6 @@ func.func @collapsing_subview_pack() {
 
 // -----
 
-// CHECK-LABEL: @subview_followed_by_expand(%arg0: index)
-// CHECK: amdaie.dma_cpy_nd
-// CHECK-SAME: [0, 0] [25, 4] [4, 1]
-// We might want to change the offsets to be
-// [%arg0 / 2, 0, %arg0 %2, 0]
-// in the future, but as the offsets ultimately get collapsed into a single
-// global cumulative offset, this would just be undone.
-// CHECK-SAME: [0, 0, %arg0, 2] [5, 5, 2, 2] [40, 2, 20, 1]
-func.func @subview_followed_by_expand(%arg0 : index){
-  %src = memref.alloc() : memref<20x20xf32>
-  %subview = memref.subview %src[%arg0, 2] [10, 10] [1, 1] :
-           memref<20x20xf32> to memref<10x10xf32, strided<[20, 1], offset: ?>>
-  %expanded = memref.expand_shape %subview [[0, 1], [2,3]] output_shape [5, 2, 5, 2] :
-          memref<10x10xf32, strided<[20, 1], offset: ?>> into memref<5x2x5x2xf32, strided<[40, 20, 2, 1], offset: ?>>
-  %dst = memref.alloc() : memref<25x4xf32>
-  iree_linalg_ext.unpack %expanded inner_dims_pos = [0, 1] inner_tiles = [5, 2] into %dst
-        : (memref<5x2x5x2xf32, strided<[40, 20, 2, 1], offset: ?>> memref<25x4xf32>)
-  return
-}
-
-// -----
-
 // CHECK-LABEL: @unitdim_unpack
 // CHECK: %[[ALLOC0:.*]] = memref.alloc() : memref<1x1x8x16xi32, 1>
 // CHECK: %[[FROMMEMREF0:.*]] = amdaie.logicalobjectfifo.from_memref %[[ALLOC0]], {} : memref<1x1x8x16xi32, 1> -> !amdaie.logicalobjectfifo<memref<1x1x8x16xi32, 1>>
@@ -460,5 +295,183 @@ func.func @permute_unpack_tricyle_permute(){
   return
 }
 
+// -----
+
+// The pack operation in the following test does not permute any dimensions,
+// so we expect a contiguous copy on the source and destination sides.
+
+// CHECK-LABEL: collapsed_and_expanded_pack
+// CHECK: amdaie.dma_cpy_nd
+// destination of pack:
+// CHECK-SAME: [0, 0] [10, 10] [10, 1]
+// source of pack:
+// CHECK-SAME: [0, 0] [10, 10] [10, 1]
+func.func @collapsed_and_expanded_pack() {
+  %alloc0 = memref.alloc() : memref<10x10xf32>
+  %src = memref.collapse_shape %alloc0 [[0, 1]] : memref<10x10xf32> into memref<100xf32>
+  %alloc1 = memref.alloc() : memref<100xf32>
+  %dst = memref.expand_shape %alloc1 [[0, 1]] output_shape [10, 10] : memref<100xf32> into memref<10x10xf32>
+  iree_linalg_ext.pack %src inner_dims_pos = [0] inner_tiles = [10] into %dst : (memref<100xf32> memref<10x10xf32>)
+  return
+}
+
+// -----
+
+// The pack operation in the following test does not permute any dimensions,
+// so we expect a contiguous copy on the source and destination sides.
+
+// CHECK-LABEL: @unitdim_pack_expand
+// CHECK-DAG: %[[SRCMEMREF:.*]] = amdaie.logicalobjectfifo.from_memref{{.*}}memref<8x16xi32, 1>
+// CHECK-DAG: %[[DSTMEMREF:.*]] = amdaie.logicalobjectfifo.from_memref{{.*}}memref<8x16xi32, 2>
+// CHECK: amdaie.dma_cpy_nd
+// CHECK-SAME: %[[DSTMEMREF]][0, 0, 0, 0] [1, 1, 8, 16] [128, 128, 16, 1]
+// CHECK-SAME: %[[SRCMEMREF]][0, 0, 0, 0] [1, 1, 8, 16] [128, 16, 16, 1]
+func.func @unitdim_pack_expand() {
+  %src = memref.alloc() : memref<8x16xi32, 1>
+  %dst = memref.alloc() : memref<8x16xi32, 2>
+  %dst_e = memref.expand_shape %dst [[0, 1, 2], [3]] output_shape [1, 1, 8, 16]
+         : memref<8x16xi32, 2> into memref<1x1x8x16xi32, 2>
+  iree_linalg_ext.pack %src inner_dims_pos = [0, 1] inner_tiles = [8, 16] into %dst_e:
+           (memref<8x16xi32, 1> memref<1x1x8x16xi32, 2>)
+  return
+}
+
+// -----
+
+// The unpack operation in the following test does not permute any dimensions,
+// so we expect a contiguous copy on the source and destination sides.
+
+// CHECK-LABEL: @unitdim_unpack_expand
+// CHECK-DAG: %[[SRCMEMREF:.*]] = amdaie.logicalobjectfifo.from_memref{{.*}}memref<8x16xi32, 1>
+// CHECK-DAG: %[[DSTMEMREF:.*]] = amdaie.logicalobjectfifo.from_memref{{.*}}memref<8x16xi32, 2>
+// CHECK: amdaie.dma_cpy_nd
+// CHECK-SAME: %[[DSTMEMREF]][0, 0] [8, 16] [16, 1]
+// CHECK-SAME: %[[SRCMEMREF]][0, 0, 0, 0] [1, 8, 1, 16] [128, 16, 128, 1]
+func.func @unitdim_unpack_expand() {
+  %src = memref.alloc() : memref<8x16xi32, 1>
+  %dst = memref.alloc() : memref<8x16xi32, 2>
+  %src_e = memref.expand_shape %src [[0, 1, 2], [3]] output_shape [1, 1, 8, 16]
+         : memref<8x16xi32, 1> into memref<1x1x8x16xi32, 1>
+  iree_linalg_ext.unpack %src_e inner_dims_pos = [0, 1] inner_tiles = [8, 16] into %dst:
+           (memref<1x1x8x16xi32, 1> memref<8x16xi32, 2>)
+  return
+}
+
+// -----
+
+// CHECK-LABEL: multidim_with_expand
+// CHECK: amdaie.dma_cpy_nd
+// dst of dma cpy:
+// CHECK-SAME: [0, 0, 0, 0] [20, 5, 10, 10] [500, 100, 10, 1]
+// src of dma cpy:
+// CHECK-SAME: [0, 0, 0, 0] [20, 5, 10, 10] [500, 10, 50, 1]
+func.func @multidim_with_expand() {
+  %src = memref.alloc() : memref<200x50xi32, 1>
+  %dst = memref.alloc() : memref<100x100xi32, 2>
+  %dst_e = memref.expand_shape %dst [[0, 1], [2, 3]] output_shape [20, 5, 10, 10]
+         : memref<100x100xi32, 2> into memref<20x5x10x10xi32, 2>
+  iree_linalg_ext.pack %src inner_dims_pos = [0, 1] inner_tiles = [10, 10]
+     into %dst_e: (memref<200x50xi32, 1> memref<20x5x10x10xi32, 2>)
+  return
+}
+
+// -----
+
+// This test is included to illustrate that the dma copy is the same without the
+// expand operation (compare to multidim_with_expand above).
+// CHECK-LABEL: multidim_without_expand
+// CHECK: amdaie.dma_cpy_nd
+// dst of dma cpy:
+// CHECK-SAME: [0, 0, 0, 0] [20, 5, 10, 10] [500, 100, 10, 1]
+// src of dma cpy:
+// CHECK-SAME: [0, 0, 0, 0] [20, 5, 10, 10] [500, 10, 50, 1]
+func.func @multidim_without_expand() {
+  %src = memref.alloc() : memref<200x50xi32, 1>
+  %dst = memref.alloc() : memref<20x5x10x10xi32, 2>
+  iree_linalg_ext.pack %src inner_dims_pos = [0, 1] inner_tiles = [10, 10]
+     into %dst: (memref<200x50xi32, 1> memref<20x5x10x10xi32, 2>)
+  return
+}
+
+// -----
+
+// CHECK-LABEL: @subview_then_collapse(%arg0: index)
+// CHECK: %[[ALLOC0:.*]] = memref.alloc() : memref<20x10xf32>
+// CHECK: %[[C10:.*]] = arith.constant 10 : index
+// CHECK: %[[MULI:.*]] = arith.muli %arg0, %[[C10]] : index
+// CHECK: amdaie.dma_cpy_nd
+// CHECK-SAME: [0, 0] [5, 20] [20, 1]
+// CHECK-SAME: [0, %[[MULI]]] [5, 20] [20, 1]
+func.func @subview_then_collapse(%arg0 : index) {
+  %src = memref.alloc() : memref<20x10xf32>
+  %subview = memref.subview %src[%arg0, 0] [10, 10] [1, 1] :
+           memref<20x10xf32> to memref<10x10xf32, strided<[10, 1], offset: ?>>
+  %collapsed = memref.collapse_shape %subview [[0, 1]] : memref<10x10xf32, strided<[10, 1], offset: ?>>
+           into memref<100xf32, strided<[1], offset: ?>>
+  %dst = memref.alloc() : memref<5x20xf32>
+  iree_linalg_ext.pack %collapsed inner_dims_pos = [0] inner_tiles = [20] into %dst
+          : (memref<100xf32, strided<[1], offset: ?>> memref<5x20xf32>)
+  return
+}
 
+// -----
 
+// CHECK-LABEL: @subview_then_expand
+// CHECK: amdaie.dma_cpy_nd
+// CHECK-SAME: [0, 0, %arg0, 0, 0] [2, 3, 6, 6, 1] [300, 100, 10, 1, 1]
+// CHECK-SAME: [0, 0, 0, 0, 0] [2, 3, 6, 6, 1] [108, 6, 1, 18, 6]
+module {
+  func.func @subview_then_expand(%arg0: index) {
+    %alloc = memref.alloc() : memref<10x10x10xf32>
+    %subview = memref.subview %alloc[0, %arg0, 0] [6, 6, 6] [1, 1, 1] :
+       memref<10x10x10xf32> to memref<6x6x6xf32, strided<[100, 10, 1], offset: ?>>
+    %expand_shape = memref.expand_shape %subview [[0, 1], [2], [3, 4]]
+       output_shape [2, 3, 6, 6, 1] : memref<6x6x6xf32, strided<[100, 10, 1], offset: ?>>
+       into memref<2x3x6x6x1xf32, strided<[300, 100, 10, 1, 1], offset: ?>>
+    %alloc_0 = memref.alloc() : memref<12x3x6xf32>
+    iree_linalg_ext.pack %alloc_0 inner_dims_pos = [0, 1] inner_tiles = [6, 1]
+       into %expand_shape : (memref<12x3x6xf32> memref<2x3x6x6x1xf32, strided<[300, 100, 10, 1, 1], offset: ?>>)
+    return
+  }
+}
+
+// -----
+
+// CHECK-LABEL: @subview_then_subview(%arg0: index, %arg1: index)
+// CHECK: %[[SUM:.*]] = arith.addi %arg0, %arg1 : index
+// CHECK: amdaie.dma_cpy_nd
+// CHECK-SAME: [0] [100] [1],
+// CHECK-SAME: [%[[SUM]], 5] [10, 10] [20, 1]
+func.func @subview_then_subview(%arg0 : index, %arg1 : index){
+  %src = memref.alloc() : memref<20x20xf32>
+  %subview0 = memref.subview %src[%arg0, 2] [15, 15] [1, 1] :
+           memref<20x20xf32> to memref<15x15xf32, strided<[20, 1], offset: ?>>
+  %subview1 = memref.subview %subview0[%arg1, 3] [10, 10] [1, 1] :
+           memref<15x15xf32, strided<[20, 1], offset: ?>> to memref<10x10xf32, strided<[20, 1], offset: ?>>
+  %dst = memref.alloc() : memref<100xf32>
+  iree_linalg_ext.unpack %subview1 inner_dims_pos = [0] inner_tiles = [10] into %dst
+       : (memref<10x10xf32, strided<[20, 1], offset: ?>> memref<100xf32>)
+  return
+}
+
+// -----
+
+// CHECK-LABEL: @subview_then_expand_1(%arg0: index)
+// CHECK: amdaie.dma_cpy_nd
+// CHECK-SAME: [0, 0] [25, 4] [4, 1]
+// We might want to change the offsets to be
+// [%arg0 / 2, 0, %arg0 % 2, 0]
+// in the future, but as the offsets ultimately get collapsed into a single
+// global cumulative offset, this would just be undone.
+// CHECK-SAME: [0, 0, %arg0, 2] [5, 5, 2, 2] [40, 2, 20, 1]
+func.func @subview_then_expand_1(%arg0 : index){
+  %src = memref.alloc() : memref<20x20xf32>
+  %subview = memref.subview %src[%arg0, 2] [10, 10] [1, 1] :
+           memref<20x20xf32> to memref<10x10xf32, strided<[20, 1], offset: ?>>
+  %expanded = memref.expand_shape %subview [[0, 1], [2,3]] output_shape [5, 2, 5, 2] :
+          memref<10x10xf32, strided<[20, 1], offset: ?>> into memref<5x2x5x2xf32, strided<[40, 20, 2, 1], offset: ?>>
+  %dst = memref.alloc() : memref<25x4xf32>
+  iree_linalg_ext.unpack %expanded inner_dims_pos = [0, 1] inner_tiles = [5, 2] into %dst
+        : (memref<5x2x5x2xf32, strided<[40, 20, 2, 1], offset: ?>> memref<25x4xf32>)
+  return
+}