From 4d16a3d9b3193888ab7857d73bf3ddb5dfc7f64b Mon Sep 17 00:00:00 2001
From: Ruizhe Zhao <kumasento@users.noreply.github.com>
Date: Tue, 19 Oct 2021 18:27:19 +0100
Subject: [PATCH] [ExtractScopStmt] add more tests and fix bugs (#103)

* [Pass][ExtractScopStmt] internal index_cast

* Added test cases

* Temporarily added aes.mlir as a dummy test case

* [ExtractScopStmt] fixed issues with write op discovery

* [ExtractScopStmt] make sure the dominating store uses the same memref

* Removed unwanted tests
---
 lib/Transforms/ExtractScopStmt.cc             | 40 +++++++++-----
 .../ExtractScopStmt/no-loop-blockarg.mlir     | 13 +++++
 .../ExtractScopStmt/no-loop-non-affine.mlir   | 14 +++++
 test/polymer-opt/ExtractScopStmt/no-loop.mlir | 14 +++++
 .../scratchpad-dom-store-diff-mem.mlir        | 14 +++++
 .../scratchpad-dom-store-same-mem.mlir        | 23 ++++++++
 .../ExtractScopStmt/scratchpad-dom-store.mlir | 51 ++++++++++++++++++
 test/polymer-opt/Reg2Mem/aes.mlir             | 52 +++++++++++++++++++
 8 files changed, 209 insertions(+), 12 deletions(-)
 create mode 100644 test/polymer-opt/ExtractScopStmt/no-loop-blockarg.mlir
 create mode 100644 test/polymer-opt/ExtractScopStmt/no-loop-non-affine.mlir
 create mode 100644 test/polymer-opt/ExtractScopStmt/no-loop.mlir
 create mode 100644 test/polymer-opt/ExtractScopStmt/scratchpad-dom-store-diff-mem.mlir
 create mode 100644 test/polymer-opt/ExtractScopStmt/scratchpad-dom-store-same-mem.mlir
 create mode 100644 test/polymer-opt/ExtractScopStmt/scratchpad-dom-store.mlir
 create mode 100644 test/polymer-opt/Reg2Mem/aes.mlir
diff --git a/lib/Transforms/ExtractScopStmt.cc b/lib/Transforms/ExtractScopStmt.cc
index 3625b245..3f0687db 100644
--- a/lib/Transforms/ExtractScopStmt.cc
+++ b/lib/Transforms/ExtractScopStmt.cc
@@ -44,16 +44,10 @@ using CalleeToCallersMap =
 /// TODO: support CallOp.
 static void discoverMemWriteOps(mlir::FuncOp f,
                                 SmallVectorImpl<Operation *> &ops) {
-  bool hasAffineScope = false;
   f.getOperation()->walk([&](Operation *op) {
-    if (isa<mlir::AffineForOp>(op))
-      hasAffineScope = true;
-    if (isa<mlir::AffineWriteOpInterface, memref::StoreOp>(op))
+    if (isa<mlir::AffineWriteOpInterface>(op))
       ops.push_back(op);
   });
-
-  if (!hasAffineScope)
-    ops.clear();
 }
 
 /// Returns the newly created scratchpad.
@@ -126,6 +120,15 @@ insertScratchpadForInterprocUses(mlir::Operation *defOp,
   return memref;
 }
 
+static Value getMemRef(Operation *op) {
+  if (isa<mlir::AffineLoadOp, memref::LoadOp>(op))
+    return op->getOperand(0); // Loads: operand 0 is the memref.
+  if (isa<mlir::AffineStoreOp, memref::StoreOp>(op))
+    return op->getOperand(1); // Stores: operand 0 is the value, 1 the memref.
+  // Any other op: no memref operand is recognized, so return a null Value.
+  return nullptr;
+}
+
 /// Check is there any load in the use-def chains of op loads from a memref that
 /// is later updated by a store op that dominates the current op. We should use
 /// a proper RAW checker for this purpose.
@@ -145,14 +148,21 @@ static bool isUpdatedByDominatingStore(Operation *op, Operation *domOp,
 
   while (!worklist.empty()) {
     Operation *currOp = worklist.pop_back_val();
-    if (mlir::AffineLoadOp loadOp = dyn_cast<mlir::AffineLoadOp>(currOp)) {
-      Value memref = loadOp.memref();
 
+    if (Value memref = getMemRef(currOp))
       for (Operation *userOp : memref.getUsers())
-        if (mlir::AffineStoreOp storeOp = dyn_cast<mlir::AffineStoreOp>(userOp))
-          if (dom.dominates(storeOp, domOp))
+        // Both affine.store and memref.store should be counted.
+        if (isa<mlir::AffineStoreOp, memref::StoreOp>(userOp))
+          if (memref == getMemRef(userOp) && userOp != domOp &&
+              dom.dominates(userOp, domOp)) {
+            LLVM_DEBUG(dbgs()
+                       << "The load op:\n\t" << (*currOp)
+                       << "\nThe store op:\n\t" << (*userOp)
+                       << "\naccess to the same memref:\n\t" << memref
+                       << "\nand the store is dominating the final write:\n\t"
+                       << (*domOp));
             return true;
-    }
+          }
 
     for (mlir::Value operand : currOp->getOperands())
       if (Operation *defOp = operand.getDefiningOp()) {
@@ -385,6 +395,12 @@ static unsigned extractScopStmt(mlir::FuncOp f, unsigned numCallees,
   SmallVector<Operation *, 8> writeOps;
   discoverMemWriteOps(f, writeOps);
 
+  LLVM_DEBUG({
+    dbgs() << "Discovered memref write ops:\n";
+    for (Operation *op : writeOps)
+      op->dump();
+  });
+
   llvm::SetVector<Operation *> opsToRemove;
   // Map from an op in the original funcOp to which callee it would belong to.
   OpToCalleeMap opToCallee;
diff --git a/test/polymer-opt/ExtractScopStmt/no-loop-blockarg.mlir b/test/polymer-opt/ExtractScopStmt/no-loop-blockarg.mlir
new file mode 100644
index 00000000..67346a06
--- /dev/null
+++ b/test/polymer-opt/ExtractScopStmt/no-loop-blockarg.mlir
@@ -0,0 +1,13 @@
+// RUN: polymer-opt %s -extract-scop-stmt | FileCheck %s
+
+func @no_loop_blockarg(%A: memref<1xf32>, %a: f32) {
+  affine.store %a, %A[0] : memref<1xf32>
+  return
+}
+
+// CHECK: func private @S0(%[[a:.*]]: f32, %[[A:.*]]: memref<1xf32>) attributes {scop.stmt}
+// CHECK-NEXT: affine.store %[[a]], %[[A]][0]
+
+// CHECK: func @no_loop_blockarg(%[[A:.*]]: memref<1xf32>, %[[a:.*]]: f32) 
+// CHECK-NEXT: call @S0(%[[a]], %[[A]]) : (f32, memref<1xf32>) -> ()
+
diff --git a/test/polymer-opt/ExtractScopStmt/no-loop-non-affine.mlir b/test/polymer-opt/ExtractScopStmt/no-loop-non-affine.mlir
new file mode 100644
index 00000000..7e728092
--- /dev/null
+++ b/test/polymer-opt/ExtractScopStmt/no-loop-non-affine.mlir
@@ -0,0 +1,14 @@
+// RUN: polymer-opt %s -extract-scop-stmt | FileCheck %s
+
+func @foo(%A: memref<1xf32>) {
+  %0 = arith.constant 1.23 : f32
+  %c0 = arith.constant 0 : index
+  memref.store %0, %A[%c0] : memref<1xf32>
+  return
+}
+
+// CHECK-LABEL: func @foo
+// CHECK-NEXT: %{{.*}} = arith.constant
+// CHECK-NEXT: %{{.*}} = arith.constant
+// CHECK-NEXT: memref.store
+// CHECK-NEXT: return
diff --git a/test/polymer-opt/ExtractScopStmt/no-loop.mlir b/test/polymer-opt/ExtractScopStmt/no-loop.mlir
new file mode 100644
index 00000000..c39d8118
--- /dev/null
+++ b/test/polymer-opt/ExtractScopStmt/no-loop.mlir
@@ -0,0 +1,14 @@
+// RUN: polymer-opt %s -extract-scop-stmt | FileCheck %s
+
+func @no_loop(%A: memref<1xf32>) {
+  %0 = arith.constant 1.23 : f32
+  affine.store %0, %A[0] : memref<1xf32>
+  return
+}
+
+// CHECK: func private @S0(%[[A:.*]]: memref<1xf32>) attributes {scop.stmt}
+// CHECK-NEXT: %[[CST:.*]] = arith.constant 1.23
+// CHECK-NEXT: affine.store %[[CST]], %[[A]][0]
+
+// CHECK: func @no_loop(%[[A:.*]]: memref<1xf32>) 
+// CHECK-NEXT: call @S0(%[[A]]) : (memref<1xf32>) -> ()
diff --git a/test/polymer-opt/ExtractScopStmt/scratchpad-dom-store-diff-mem.mlir b/test/polymer-opt/ExtractScopStmt/scratchpad-dom-store-diff-mem.mlir
new file mode 100644
index 00000000..2da95601
--- /dev/null
+++ b/test/polymer-opt/ExtractScopStmt/scratchpad-dom-store-diff-mem.mlir
@@ -0,0 +1,14 @@
+// RUN: polymer-opt %s -extract-scop-stmt | FileCheck %s
+
+// There should be no scratchpad inserted.
+func @foo(%A: memref<1xf32>, %B: memref<1xf32>) {
+  %c0 = arith.constant 0 : index
+  %0 = affine.load %A[0] : memref<1xf32>
+  affine.store %0, %B[0] : memref<1xf32>
+  affine.store %0, %A[0] : memref<1xf32>
+  return
+}
+
+// CHECK: func @foo(%[[A:.*]]: memref<1xf32>, %[[B:.*]]: memref<1xf32>)
+// CHECK-NEXT: call @S0(%[[B]], %[[A]])
+// CHECK-NEXT: call @S1(%[[A]])
diff --git a/test/polymer-opt/ExtractScopStmt/scratchpad-dom-store-same-mem.mlir b/test/polymer-opt/ExtractScopStmt/scratchpad-dom-store-same-mem.mlir
new file mode 100644
index 00000000..e2d16196
--- /dev/null
+++ b/test/polymer-opt/ExtractScopStmt/scratchpad-dom-store-same-mem.mlir
@@ -0,0 +1,23 @@
+// RUN: polymer-opt %s -extract-scop-stmt | FileCheck %s
+
+func @foo(%A: memref<1xf32>) {
+  %c0 = arith.constant 0 : index
+  %0 = affine.load %A[0] : memref<1xf32>
+  affine.store %0, %A[0] : memref<1xf32>
+  affine.store %0, %A[0] : memref<1xf32>
+  return
+}
+
+// CHECK: func private @S0(%[[A:.*]]: memref<1xf32>, %[[SCRATCHPAD:.*]]: memref<1xf32>)
+// CHECK-NEXT: %[[VAL0:.*]] = affine.load %[[A]][0]
+// CHECK-NEXT: affine.store %[[VAL0]], %[[SCRATCHPAD]][0]
+// CHECK-NEXT: affine.store %[[VAL0]], %[[A]][0]
+
+// CHECK: func private @S1(%[[A:.*]]: memref<1xf32>, %[[SCRATCHPAD:.*]]: memref<1xf32>)
+// CHECK-NEXT: %[[VAL0:.*]] = affine.load %[[SCRATCHPAD]][0]
+// CHECK-NEXT: affine.store %[[VAL0]], %[[A]][0]
+
+// CHECK: func @foo(%[[A:.*]]: memref<1xf32>)
+// CHECK-NEXT: %[[SCRATCHPAD:.*]] = memref.alloca() : memref<1xf32>
+// CHECK-NEXT: call @S0(%[[A]], %[[SCRATCHPAD]])
+// CHECK-NEXT: call @S1(%[[A]], %[[SCRATCHPAD]])
diff --git a/test/polymer-opt/ExtractScopStmt/scratchpad-dom-store.mlir b/test/polymer-opt/ExtractScopStmt/scratchpad-dom-store.mlir
new file mode 100644
index 00000000..f1a2384b
--- /dev/null
+++ b/test/polymer-opt/ExtractScopStmt/scratchpad-dom-store.mlir
@@ -0,0 +1,51 @@
+// RUN: polymer-opt %s -extract-scop-stmt | FileCheck %s
+
+
+/// The data flow of the following program is sketched below. Because of the
+/// dominating-store edges, the load edge on the left should be replaced by a scratchpad.
+///
+
+///                   +---load A[i] ---+
+///                   |     |          |
+///                   |     |          |
+///                   |     v       dominating
+///                   |   mulf         |
+///   replace --->    |     |          |
+///                   |     |          v
+///                   |     +----->store A[i]
+///                   |     |          +
+///                   |     |       dominating
+///                   |     v          |
+///                   +-->addf <-------+
+///                         |
+///                         +----->store A[i]
+
+func @foo(%A: memref<10xf32>) {
+  affine.for %i = 0 to 10 {
+    %0 = affine.load %A[%i] : memref<10xf32>
+    %1 = arith.mulf %0, %0 : f32
+    affine.store %1, %A[%i] : memref<10xf32>
+    // Should replace %0 by a load from a scratchpad.
+    %2 = arith.addf %1, %0 : f32
+    affine.store %2, %A[%i] : memref<10xf32>
+  }
+  return
+}
+
+// CHECK: func private @S0(%[[ARG0:.*]]: memref<10xf32>, %[[ARG1:.*]]: index, %[[ARG2:.*]]: memref<1xf32>) attributes {scop.stmt} 
+// CHECK:   %[[VAL0:.*]] = affine.load %[[ARG0]][symbol(%[[ARG1]])]
+// CHECK:   affine.store %[[VAL0]], %[[ARG2]][0]
+// CHECK:   %[[VAL1:.*]] = arith.mulf %[[VAL0]], %[[VAL0]]
+// CHECK:   affine.store %[[VAL1]], %[[ARG0]][symbol(%[[ARG1]])]
+
+// CHECK: func private @S1(%[[ARG0:.*]]: memref<10xf32>, %[[ARG1:.*]]: index, %[[ARG2:.*]]: memref<1xf32>) attributes {scop.stmt} 
+// CHECK:   %[[VAL0:.*]] = affine.load %[[ARG0]][symbol(%[[ARG1]])]
+// CHECK:   %[[VAL1:.*]] = affine.load %[[ARG2]][0]
+// CHECK:   %[[VAL2:.*]] = arith.addf %[[VAL0]], %[[VAL1]]
+// CHECK:   affine.store %[[VAL2]], %[[ARG0]][symbol(%[[ARG1]])]
+
+// CHECK: func @foo(%[[ARG0:.*]]: memref<10xf32>) 
+// CHECK:   %[[VAL0:.*]] = memref.alloca()
+// CHECK:   affine.for %[[ARG1:.*]] = 0 to 10 
+// CHECK:     call @S0(%[[ARG0]], %[[ARG1]], %[[VAL0]])
+// CHECK:     call @S1(%[[ARG0]], %[[ARG1]], %[[VAL0]])
diff --git a/test/polymer-opt/Reg2Mem/aes.mlir b/test/polymer-opt/Reg2Mem/aes.mlir
new file mode 100644
index 00000000..95bfa3e0
--- /dev/null
+++ b/test/polymer-opt/Reg2Mem/aes.mlir
@@ -0,0 +1,52 @@
+// RUN: exit 0
+ 
+func @encrypt(%arg0: memref<?x16xi32>, %arg1: memref<?xi32>) attributes {llvm.linkage = #llvm.linkage<external>} {
+  %c1_i32 = constant 1 : i32
+  %c4_i32 = constant 4 : i32
+  %c15_i32 = constant 15 : i32
+  %c8_i32 = constant 8 : i32
+  %c283_i32 = constant 283 : i32
+  %0 = memref.alloca() : memref<1024xi32>
+  affine.for %arg2 = 1 to 5 {
+    affine.for %arg3 = 0 to 16 {
+      %1 = affine.load %arg1[%arg3 * 4] : memref<?xi32>
+      %2 = shift_right_signed %1, %c4_i32 : i32
+      %3 = index_cast %2 : i32 to index
+      %4 = and %1, %c15_i32 : i32
+      %5 = index_cast %4 : i32 to index
+      %6 = memref.load %arg0[%3, %5] : memref<?x16xi32>
+      affine.store %6, %arg1[%arg3 * 4] : memref<?xi32>
+    }
+    affine.for %arg3 = 0 to 1023 {
+      %1 = affine.load %arg1[%arg3] : memref<?xi32>
+      %2 = shift_left %1, %c1_i32 : i32
+      affine.store %2, %0[%arg3] : memref<1024xi32>
+      %3 = shift_right_signed %2, %c8_i32 : i32
+      %4 = cmpi eq, %3, %c1_i32 : i32
+      scf.if %4 {
+        %10 = xor %2, %c283_i32 : i32
+        affine.store %10, %0[%arg3] : memref<1024xi32>
+      }
+      %5 = affine.load %arg1[%arg3 + 1] : memref<?xi32>
+      %6 = shift_left %5, %c1_i32 : i32
+      %7 = xor %5, %6 : i32
+      %8 = shift_right_signed %7, %c8_i32 : i32
+      %9 = cmpi eq, %8, %c1_i32 : i32
+      scf.if %9 {
+        %10 = xor %7, %c283_i32 : i32
+        %11 = affine.load %0[%arg3] : memref<1024xi32>
+        %12 = xor %11, %10 : i32
+        affine.store %12, %0[%arg3] : memref<1024xi32>
+      } else {
+        %10 = affine.load %0[%arg3] : memref<1024xi32>
+        %11 = xor %10, %7 : i32
+        affine.store %11, %0[%arg3] : memref<1024xi32>
+      }
+    }
+    affine.for %arg3 = 0 to 1024 {
+      %1 = affine.load %0[%arg3] : memref<1024xi32>
+      affine.store %1, %arg1[%arg3] : memref<?xi32>
+    }
+  }
+  return
+}