
Commit 8d530db

[PIR+CINN]Add FusionOpInfo to enhance CompilationCache logic (#63615)
* [PIR+CINN]Add FusionOpInfo to enhance CompilationCache logic
* fix UT

1 parent 5152fce · commit 8d530db

8 files changed: +222 −9 lines

paddle/cinn/hlir/framework/pir/compilation_cache.cc

Lines changed: 10 additions & 4 deletions
@@ -24,7 +24,7 @@ void* BackendResource::GetHostFuncPtr() const {
   VLOG(4) << "Lookup kernel name: " << host_fn_name_;
   void* ptr = backend_compiler_->Lookup(host_fn_name_);
   PADDLE_ENFORCE_NOT_NULL(ptr,
-                          phi::errors::InvalidArgument(
+                          ::common::errors::InvalidArgument(
                               "Can't find kernel function %s", host_fn_name_));
   return ptr;
 }
@@ -34,8 +34,8 @@ void* BackendResource::GetInferFuncPtr() const {
   void* ptr = backend_compiler_->Lookup(infer_fn_name_);
   PADDLE_ENFORCE_NOT_NULL(
       ptr,
-      phi::errors::InvalidArgument("Can't find infer shape function %s",
-                                   infer_fn_name_));
+      ::common::errors::InvalidArgument("Can't find infer shape function %s",
+                                        infer_fn_name_));
   return ptr;
 }

@@ -61,7 +61,7 @@ const CompilationCache::CacheValue& CompilationCache::Get(
   PADDLE_ENFORCE_EQ(
       Has(key),
       true,
-      phi::errors::NotFound("%s is not in CompilationCache.", key));
+      ::common::errors::NotFound("%s is not in CompilationCache.", key));
   return cache_.at(key);
 }

@@ -71,6 +71,12 @@ pir::CINNKernelInfo CompilationCache::GetKernelInfo(const CacheKey& key) const {

 void CompilationCache::Insert(const CacheKey& key, const CacheValue& value) {
   VLOG(6) << "Insert CompilationCache for: " << key;
+  PADDLE_ENFORCE_EQ(Has(key),
+                    false,
+                    ::common::errors::PreconditionNotMet(
+                        "%s is already in CompilationCache while calling "
+                        "CompilationCache::Insert().",
+                        key));
   cache_.insert({key, value});
 }

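Taken together, the changes above define a simple contract for the cache: Insert() now refuses a key that is already present, Get() still requires the key to exist, and the new Size() reports how many entries are cached. Below is a minimal standalone sketch of that contract, with placeholder key/value types rather than Paddle's real FusionInfo/CompilationResult, just to make the invariants explicit:

// Minimal sketch of the cache contract enforced above: duplicate Insert is a
// precondition violation, Get requires presence, Size counts cached entries.
// CacheKey/CacheValue here are simplified placeholders, not Paddle's types.
#include <cassert>
#include <cstddef>
#include <memory>
#include <string>
#include <unordered_map>

struct SketchCache {
  using CacheKey = std::string;             // stands in for FusionInfo
  using CacheValue = std::shared_ptr<int>;  // stands in for CompilationResult

  bool Has(const CacheKey& key) const { return cache_.count(key) != 0; }

  void Insert(const CacheKey& key, const CacheValue& value) {
    // mirrors PADDLE_ENFORCE_EQ(Has(key), false, ...)
    assert(!Has(key) && "key is already in the cache");
    cache_.insert({key, value});
  }

  const CacheValue& Get(const CacheKey& key) const {
    // mirrors PADDLE_ENFORCE_EQ(Has(key), true, ...)
    assert(Has(key) && "key is not in the cache");
    return cache_.at(key);
  }

  std::size_t Size() const { return cache_.size(); }

 private:
  std::unordered_map<CacheKey, CacheValue> cache_;
};

int main() {
  SketchCache cache;
  cache.Insert("fusion_a", std::make_shared<int>(1));
  assert(cache.Size() == 1);
  assert(*cache.Get("fusion_a") == 1);
  // A second cache.Insert("fusion_a", ...) would trip the duplicate check.
  return 0;
}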

paddle/cinn/hlir/framework/pir/compilation_cache.h

Lines changed: 1 addition & 0 deletions
@@ -93,6 +93,7 @@ class CompilationCache {
   const CacheValue& Get(const CacheKey& key) const;
   void Insert(const CacheKey& key, const CacheValue& value);
   void Clear();
+  size_t Size() const { return cache_.size(); }

   pir::CINNKernelInfo GetKernelInfo(const CacheKey& key) const;

paddle/cinn/hlir/framework/pir/compilation_task.cc

Lines changed: 1 addition & 0 deletions
@@ -83,6 +83,7 @@ std::shared_ptr<pir::CompilationResult> CompilationTask::BuildPirCINNKernelInfo(
   VLOG(5) << "Start to compile module into cuda kernel...";
   backend_resource->GetBackendCompiler()->Build(module, "");
   compilation_result->SetBackendResource(backend_resource);
+  VLOG(5) << "End to compile module into cuda kernel.";
   return compilation_result;
 }

paddle/cinn/hlir/framework/pir/fusion_info.cc

Lines changed: 54 additions & 2 deletions
@@ -14,7 +14,9 @@

 #include "paddle/cinn/hlir/framework/pir/fusion_info.h"
 #include "paddle/common/enforce.h"
+#include "paddle/common/flags.h"
 #include "paddle/pir/include/core/ir_printer.h"
+PD_DECLARE_bool(enable_cinn_compile_cache);

 namespace cinn::hlir::framework::pir {

@@ -46,10 +48,12 @@ std::ostream& operator<<(std::ostream& os, const ValueInfo& value_info) {

 OperationInfo::OperationInfo(const ::pir::Operation& op) {
   name_ = op.name();
+  input_infos_.reserve(op.num_operands());
   for (const auto value : op.operands_source()) {
     if (!value || !value.type()) continue;
     input_infos_.emplace_back(value);
   }
+  output_infos_.reserve(op.num_results());
   for (const auto value : op.results()) {
     if (!value || !value.type()) continue;
     output_infos_.emplace_back(value);
@@ -58,6 +62,7 @@ OperationInfo::OperationInfo(const ::pir::Operation& op) {
   const auto& attributes = op.attributes();
   std::map<std::string, ::pir::Attribute, std::less<>> order_attributes(
       attributes.begin(), attributes.end());
+  attr_infos_.reserve(attributes.size());
   for (const auto& [attr_name, attr_value] : order_attributes) {
     if (!attr_value || attr_name == kOpCallStack) continue;
     attr_infos_.emplace_back(attr_name, attr_value);
@@ -85,9 +90,53 @@ std::ostream& operator<<(std::ostream& os, const OperationInfo& op_info) {
   return os;
 }

+std::size_t FusionOpInfo::hash() const {
+  std::size_t seed = op_info_.hash();
+  for (const auto& [value_index, op_info_hash] : inner_deps_) {
+    hash_combine(seed, value_index);
+    hash_combine(seed, op_info_hash);
+  }
+  return seed;
+}
+
+std::ostream& operator<<(std::ostream& os, const FusionOpInfo& info) {
+  os << info.op_info_ << ", inner_deps:{";
+  for (const auto& [value_index, op_info_hash] : info.inner_deps_) {
+    os << " (" << value_index << ", " << op_info_hash << ")";
+  }
+  os << "}";
+  return os;
+}
+
 FusionInfo::FusionInfo(const OpLoweringGroup& group) {
-  for (const auto* op : TopologySort(group)) {
-    op_infos_.emplace_back(*op);
+  std::unordered_map<const ::pir::Operation*, size_t> op_mapper;
+  unique_fn_name_ = group.FuncName();
+
+  const auto GetInnerUpstreamOps =
+      [&](const ::pir::Operation* op) -> decltype(auto) {
+    std::unordered_map<size_t, size_t> upstream_ops_index_hash;
+    for (size_t i = 0; i < op->num_operands(); ++i) {
+      const auto value = op->operand_source(i);
+      if (!value || !value.defining_op()) continue;
+      const auto* defining_op = value.defining_op();
+      if (op_mapper.count(defining_op) == 0) continue;
+      PADDLE_ENFORCE_LT(op_mapper[defining_op],
+                        this->op_infos_.size(),
+                        ::common::errors::OutOfRange(
+                            "Required op_mapper[defining_op] < "
+                            "op_infos_.size(), but received index %d",
+                            op_mapper[defining_op]));
+      upstream_ops_index_hash.emplace(
+          i, this->op_infos_[op_mapper[defining_op]].hash());
+    }
+    return upstream_ops_index_hash;
+  };
+
+  const auto sorted_ops = TopologySort(group);
+  for (size_t i = 0; i < sorted_ops.size(); ++i) {
+    const auto& op = sorted_ops[i];
+    op_infos_.emplace_back(*op, GetInnerUpstreamOps(op));
+    op_mapper.insert({op, i});
   }
 }

@@ -97,13 +146,16 @@ std::size_t FusionInfo::hash() const {
   }
   std::size_t seed = 2153;
   for (const auto& info : op_infos_) hash_combine(seed, info);
+  if (!FLAGS_enable_cinn_compile_cache) hash_combine(seed, unique_fn_name_);
   return seed;
 }

 std::ostream& operator<<(std::ostream& os, const FusionInfo& fusion_info) {
   os << "FusionInfo - " << fusion_info.hash();
   if (VLOG_IS_ON(5)) {
     os << "{\n";
+    if (!FLAGS_enable_cinn_compile_cache)
+      os << "fn_name: " << fusion_info.unique_fn_name_;
     for (const auto& op_info : fusion_info.op_infos_) os << op_info << "\n";
     os << "}\n";
   }

paddle/cinn/hlir/framework/pir/fusion_info.h

Lines changed: 22 additions & 1 deletion
@@ -57,6 +57,21 @@ class OperationInfo {
   std::vector<AttributeInfo> attr_infos_;
 };

+class FusionOpInfo {
+ public:
+  FusionOpInfo(const ::pir::Operation &op,
+               const std::unordered_map<size_t, size_t> &deps)
+      : op_info_(op), inner_deps_(deps) {}
+
+  std::size_t hash() const;
+  friend std::ostream &operator<<(std::ostream &os, const FusionOpInfo &info);
+
+ private:
+  OperationInfo op_info_;
+  // operand_source index -> OperationInfo hash
+  std::unordered_map<size_t, size_t> inner_deps_;
+};
+
 class FusionInfo {
   using IntArgsMap = std::map<int, CINNKernelInfo::ArgDimIdx>;

@@ -74,13 +89,18 @@ class FusionInfo {
   friend std::ostream &operator<<(std::ostream &os, const FusionInfo &info);

  private:
-  std::vector<OperationInfo> op_infos_;
+  std::vector<FusionOpInfo> op_infos_;
   std::size_t cached_hash_value_{0};
+
+  // Used to give otherwise identical subgraphs a unique FusionInfo when
+  // FLAGS_enable_cinn_compile_cache = false; empty by default.
+  std::string unique_fn_name_{""};
 };

 std::ostream &operator<<(std::ostream &os, const AttributeInfo &info);
 std::ostream &operator<<(std::ostream &os, const ValueInfo &info);
 std::ostream &operator<<(std::ostream &os, const OperationInfo &info);
+std::ostream &operator<<(std::ostream &os, const FusionOpInfo &info);
 std::ostream &operator<<(std::ostream &os, const FusionInfo &info);

 // See boost.hash_combine for details
@@ -114,5 +134,6 @@ namespace std {
 REGISTER_STD_HASH(AttributeInfo);
 REGISTER_STD_HASH(ValueInfo);
 REGISTER_STD_HASH(OperationInfo);
+REGISTER_STD_HASH(FusionOpInfo);
 REGISTER_STD_HASH(FusionInfo)
 } // namespace std
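The REGISTER_STD_HASH macro itself is not part of this diff, so the following is only an assumed illustration of the pattern it presumably generates: a std::hash specialization that forwards to the class's own hash() method, which is what lets FusionInfo (and now FusionOpInfo) be used as unordered-container keys such as the CompilationCache key.

// Hypothetical sketch of the REGISTER_STD_HASH pattern (assumption, the
// macro's real body is not shown here), applied to a toy class.
#include <cstddef>
#include <string>
#include <unordered_map>

namespace demo {
class Info {
 public:
  explicit Info(std::size_t v) : value_(v) {}
  std::size_t hash() const { return value_; }
  bool operator==(const Info& other) const { return value_ == other.value_; }

 private:
  std::size_t value_;
};
}  // namespace demo

// What REGISTER_STD_HASH(Info) would presumably expand to:
namespace std {
template <>
struct hash<demo::Info> {
  std::size_t operator()(const demo::Info& info) const { return info.hash(); }
};
}  // namespace std

int main() {
  // With the std::hash specialization in place, Info works as a map key,
  // just as FusionInfo serves as the CompilationCache key.
  std::unordered_map<demo::Info, std::string> cache;
  cache.insert({demo::Info(42), "compiled kernel"});
  return cache.count(demo::Info(42)) == 1 ? 0 : 1;
}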

paddle/cinn/hlir/framework/pir_compiler.cc

Lines changed: 6 additions & 2 deletions
@@ -71,6 +71,7 @@ std::vector<pir::CINNKernelInfo> PirCompiler::Build(
                          utils::SequenceDispatcher(0, task_size),
                          /*thread_num=*/thread_size);
   }
+  VLOG(5) << "Finished compiling " << task_size << " Cinn Kernel info.";
   ctx_mapper.SetFinalize(true);
   ctx_mapper.UpdateGlobalCache();
   return ctx_mapper.RecoverKernelInfos();
@@ -115,8 +116,11 @@ CompilationContextMapper::RecoverKernelInfos() {

   std::vector<pir::CINNKernelInfo> kernel_infos(fusion_infos_.size());
   for (size_t i = 0; i < fusion_infos_.size(); ++i) {
-    kernel_infos[i] =
-        CompilationCache::Instance().GetKernelInfo(fusion_infos_[i]);
+    const auto& compilation_result =
+        FLAGS_enable_cinn_compile_cache
+            ? CompilationCache::Instance().Get(fusion_infos_[i])
+            : compilation_results_[i];
+    kernel_infos[i] = compilation_result->GetKernelInfo();
   }
   return kernel_infos;
 }

paddle/fluid/pybind/pir.cc

Lines changed: 8 additions & 0 deletions
@@ -1871,6 +1871,14 @@ void BindUtils(pybind11::module *m) {
     pybind11::gil_scoped_release release;
     VLOG(4) << "clear CINN CompilationCache and free BackendResource.";
     cinn::hlir::framework::CompilationCache::Instance().Clear();
+#endif
+  });
+
+  m->def("cinn_compilation_cache_size", []() {
+#ifdef PADDLE_WITH_CINN
+    pybind11::gil_scoped_release release;
+    VLOG(4) << "query the size of CINN CompilationCache.";
+    return cinn::hlir::framework::CompilationCache::Instance().Size();
 #endif
   });
 }
Lines changed: 120 additions & 0 deletions
@@ -0,0 +1,120 @@
# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import unittest

import numpy as np

import paddle
from paddle.base import core


class LayerCase(paddle.nn.Layer):
    def __init__(self):
        super().__init__()
        self.relu = paddle.nn.functional.relu

    def triple_full(self):
        y1 = paddle.full([4], 1)
        y2 = paddle.full([4], 0)
        y3 = paddle.full([4], 0)
        return y1, y2, y3

    def concat_case_1(self):
        y1, y2, y3 = self.triple_full()
        out = paddle.concat([y1, y2, y3])
        return self.relu(out)

    def concat_case_2(self):
        y1, y2, y3 = self.triple_full()
        out = paddle.concat([y2, y1, y3])
        return self.relu(out)

    def concat_case_3(self):
        y1, y2, y3 = self.triple_full()
        out = paddle.concat([y3, y2, y1])
        return self.relu(out)

    def forward(self, x):
        outs = []
        for fn in [self.concat_case_1, self.concat_case_2, self.concat_case_3]:
            # Run each case several times to trigger duplicate subgraphs and cache them.
            for i in range(3):
                outs.append(self.relu(fn()))
        outs.append(self.relu(x))
        return outs


class TestLayer(unittest.TestCase):
    def setUp(self):
        self.inputs = (paddle.rand(shape=[12], dtype=paddle.float32),)
        self.net = LayerCase()

    def eval(self, net, to_static, with_prim=False, with_cinn=False):
        if to_static:
            paddle.set_flags({'FLAGS_prim_all': with_prim})
            if with_cinn:
                build_strategy = paddle.static.BuildStrategy()
                build_strategy.build_cinn_pass = True
                net = paddle.jit.to_static(
                    net, build_strategy=build_strategy, full_graph=True
                )
            else:
                net = paddle.jit.to_static(net, full_graph=True)
        paddle.seed(123)
        net.eval()
        outs = net(*self.inputs)
        return outs

    def check_with_flag(self, cache_size):
        st_out = self.eval(self.net, to_static=True)
        cinn_out = self.eval(
            self.net, to_static=True, with_prim=True, with_cinn=True
        )
        for st, cinn in zip(
            paddle.utils.flatten(st_out), paddle.utils.flatten(cinn_out)
        ):
            np.testing.assert_allclose(st.numpy(), cinn.numpy(), atol=1e-6)

        # Check cache size
        np.testing.assert_equal(
            core.pir.cinn_compilation_cache_size(), cache_size
        )

    def test_ast_prim_cinn(self):
        # NOTE(Aurelius84): Deny relu to split fused subgraph.
        paddle.set_flags(
            {
                "FLAGS_deny_cinn_ops": "relu",
                "FLAGS_prim_forward_blacklist": "pd_op.relu",
            }
        )
        # The repeated subgraphs are deduplicated by the cache, leaving one
        # entry per structurally distinct concat subgraph.
        self.check_with_flag(cache_size=3)

    def test_ast_prim_cinn_disable_cache(self):
        core.pir.clear_cinn_compilation_cache()
        # NOTE(Aurelius84): Deny relu to split fused subgraph.
        paddle.set_flags(
            {
                "FLAGS_deny_cinn_ops": "relu",
                "FLAGS_prim_forward_blacklist": "pd_op.relu",
                "FLAGS_enable_cinn_compile_cache": False,
            }
        )
        # If cinn_compile_cache is disabled, each subgraph instance is
        # considered unique.
        self.check_with_flag(cache_size=9)


if __name__ == '__main__':
    unittest.main()
