Skip to content

Commit fed2ec8

Browse files
committed
[CINN] Add method to check applicability of GridReduce
1 parent 951eeee commit fed2ec8

File tree

2 files changed

+103
-4
lines changed

2 files changed

+103
-4
lines changed

paddle/cinn/hlir/framework/pir/trivial_op_impl.cc

Lines changed: 95 additions & 0 deletions
Original file line number · Diff line number · Diff line change
@@ -721,6 +721,97 @@ std::vector<int64_t> GetLoopStrides(const ir::Expr& body,
721721
return loop_strides;
722722
}
723723

724+
// Check whether we can apply grid reduce in this fusion group.
725+
// We can apply grid reduce if there is exactly one reduce, and whose result is
726+
// not broadcasted before output.
727+
bool GetCanApplyGridReduce(const std::vector<ir::Expr>& op_compute_bodies,
728+
const std::vector<int64_t>& reduce_axis) {
729+
using trivial_fusion_detail::GetAllForIters;
730+
using trivial_fusion_detail::IsReduceBody;
731+
using trivial_fusion_detail::ExprSetFinderUtils::ChildScheduleBlockRealizes;
732+
using trivial_fusion_detail::ExprSetFinderUtils::ChildStores;
733+
using trivial_fusion_detail::ExprSetFinderUtils::ChildTensorLoads;
734+
using trivial_fusion_detail::ExprSetFinderUtils::
735+
ScheduleBlockRealizeIsNotInit;
736+
737+
// Names of tensors that are downstream of reduce.
738+
// A tensor is downstream of reduce either if it is produced by a reduce, or
739+
// if it has data dependency on another tensor that is downstream of reduce.
740+
std::unordered_set<std::string> reduce_downstream_tensor_names;
741+
int reduce_count = 0;
742+
743+
const auto IsReduceDownstream = [&](const ir::Expr& expr_block) {
744+
for (auto& expr_load : ChildTensorLoads(expr_block)) {
745+
std::string load_tensor_name = expr_load.As<ir::Load>()->name();
746+
if (reduce_downstream_tensor_names.count(load_tensor_name) > 0) {
747+
return true;
748+
}
749+
}
750+
return false;
751+
};
752+
753+
const auto AddReduceDownstream = [&](const ir::Expr& expr_block) {
754+
auto expr_store = ChildStores.GetSingle(expr_block);
755+
std::string store_tensor_name = expr_store.As<ir::Store>()->name();
756+
reduce_downstream_tensor_names.insert(store_tensor_name);
757+
};
758+
759+
const auto CheckOutputHasReduceAxis = [&](const ir::Expr& body,
760+
const ir::Expr& expr_block) {
761+
std::vector<ir::Var> all_loop_vars = GetAllForIters(body);
762+
std::unordered_set<std::string> reduce_loop_vars;
763+
for (int64_t axis : reduce_axis) {
764+
reduce_loop_vars.insert(all_loop_vars[axis]->name);
765+
}
766+
767+
std::unordered_set<std::string> reduce_iter_vars;
768+
auto* block = expr_block.As<ir::ScheduleBlockRealize>();
769+
auto& iter_vars = block->schedule_block.As<ir::ScheduleBlock>()->iter_vars;
770+
for (int i = 0; i < iter_vars.size(); i++) {
771+
ir::Var loop_var = block->iter_values[i].as_var_ref();
772+
if (reduce_loop_vars.count(loop_var->name) > 0) {
773+
reduce_iter_vars.insert(iter_vars[i]->name);
774+
}
775+
}
776+
777+
// The result is true if the indices of the output tensor contain any
778+
// reduce iter vars.
779+
auto expr_store = ChildStores.GetSingle(expr_block);
780+
for (auto& index_expr : expr_store.As<ir::Store>()->indices) {
781+
if (reduce_iter_vars.count(index_expr.as_var_ref()->name) > 0) {
782+
return true;
783+
}
784+
}
785+
return false;
786+
};
787+
788+
for (const auto& body : op_compute_bodies) {
789+
ir::Expr expr_block =
790+
(ChildScheduleBlockRealizes * ScheduleBlockRealizeIsNotInit)
791+
.GetSingle(body);
792+
bool is_reduce_body = IsReduceBody(body);
793+
bool is_reduce_downstream = IsReduceDownstream(expr_block);
794+
bool output_has_reduce_axis = CheckOutputHasReduceAxis(body, expr_block);
795+
796+
if (is_reduce_body) {
797+
++reduce_count;
798+
}
799+
if (is_reduce_downstream || is_reduce_body) {
800+
AddReduceDownstream(expr_block);
801+
}
802+
803+
// When a block is downstream of reduce, its output shouldn't contain
804+
// reduce axis. Otherwise, it broadcasts the result of reduce. If this
805+
// is the case, we cannot apply grid reduce.
806+
if (is_reduce_downstream && output_has_reduce_axis) {
807+
VLOG(4) << "grid reduce is prohibited by block: " << expr_block;
808+
return false;
809+
}
810+
}
811+
812+
return reduce_count == 1;
813+
}
814+
724815
std::shared_ptr<FusionGroupInfo> GetFusionGroupInfo(
725816
const std::vector<ir::Expr>& op_compute_bodies) {
726817
using trivial_fusion_detail::AppendBound;
@@ -792,6 +883,10 @@ std::shared_ptr<FusionGroupInfo> GetFusionGroupInfo(
792883
}
793884
});
794885
}
886+
887+
group_info->can_apply_grid_reduce =
888+
GetCanApplyGridReduce(op_compute_bodies, group_info->reduce_axis);
889+
795890
VLOG(4) << group_info->DebugPrint();
796891
return group_info;
797892
}

paddle/cinn/hlir/framework/pir/trivial_op_impl.h

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -168,12 +168,16 @@ struct FusionGroupInfo {
168168
std::vector<int64_t> loop_strides;
std::vector<int64_t> reduce_axis;
std::vector<std::string> reduce_var_name;
// Default to false so the flag is never read uninitialized (reading an
// indeterminate bool is UB); GetFusionGroupInfo() overwrites it with the
// result of GetCanApplyGridReduce().
bool can_apply_grid_reduce = false;
171172

172173
std::string DebugPrint() {
173-
return "GroupInfo\nloop_ranges: " + cinn::utils::Join(loop_ranges, " ") +
174-
"\nloop_strides: " + cinn::utils::Join(loop_strides, ", ") +
175-
"\nreduce_axis: " + cinn::utils::Join(reduce_axis, " ") +
176-
"\nreduce_var_name: " + cinn::utils::Join(reduce_var_name, " ");
174+
std::stringstream ss;
175+
ss << "GroupInfo\nloop_ranges: " << cinn::utils::Join(loop_ranges, " ")
176+
<< "\nloop_strides: " << cinn::utils::Join(loop_strides, ", ")
177+
<< "\nreduce_axis: " << cinn::utils::Join(reduce_axis, " ")
178+
<< "\nreduce_var_name: " << cinn::utils::Join(reduce_var_name, " ")
179+
<< "\ncan_apply_grid_reduce: " << can_apply_grid_reduce;
180+
return ss.str();
177181
}
178182
};
179183

0 commit comments

Comments
 (0)