Basic PR on Cost Model #35774
Changes from all commits
paddle/fluid/framework/ir/cost_model.cc (new file) @@ -0,0 +1,256 @@
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "paddle/fluid/framework/ir/cost_model.h"

#include <algorithm>
#include <cctype>
#include <memory>

#include "paddle/fluid/framework/executor.h"
#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/platform/errors.h"
#include "paddle/fluid/platform/place.h"

namespace paddle {
namespace framework {

using ir::Graph;
using platform::Event;
using platform::MemEvent;

const double CostData::NOT_MEASURED = -1;

CostData::~CostData() {
  // TODO(zhhsplendid): when we save a copy of program/graph, we should delete
  // here.
}

double CostData::GetOpTimeMs(int op_id) const { return op_time_ms_.at(op_id); }
double CostData::GetOpMemoryBytes(int op_id) const {
  return op_memory_bytes_.at(op_id);
}
double CostData::GetWholeTimeMs() const { return whole_time_ms_; }
double CostData::GetWholeMemoryBytes() const { return whole_memory_bytes_; }

const Graph* CostData::GetGraph() const { return graph_; }
const ProgramDesc* CostData::GetProgram() const { return program_; }

bool CostData::SetCostData(const ProgramDesc& program,
                           const std::vector<std::vector<Event>>& time_events) {
  // TODO(zhhsplendid): make a copy so that CostData stays valid even if the
  // caller changes the Program; the copy can be saved into the pointer
  // program_
  if (program.Size() == 0) {
    whole_time_ms_ = 0;
    whole_memory_bytes_ = 0;
    return true;
  }

  if (time_events.empty()) {
    LOG(WARNING) << "Input time_events for CostModel is empty";
    return false;
  }

  std::vector<Event> main_thread_events = time_events[0];
  // Support global block only
  // TODO(zhhsplendid): support sub blocks
  const BlockDesc& global_block = program.Block(0);
  size_t op_size = global_block.OpSize();
  if (op_size == 0) {
    whole_time_ms_ = 0;
    whole_memory_bytes_ = 0;
    return true;
  }

  bool event_to_cost_success = true;
  size_t event_index = 0;
  for (size_t i = 0; i < op_size; ++i) {
    const OpDesc* op_desc = global_block.Op(i);
    std::string op_type = op_desc->Type();

    while (event_index < main_thread_events.size()) {
      if (main_thread_events[event_index].name() == op_type &&
          main_thread_events[event_index].type() ==
              platform::EventType::kPushRange) {
        break;
      }
      ++event_index;
    }
    if (event_index >= main_thread_events.size()) {
      LOG(WARNING) << "Input time_events for Op " << i << ", type '" << op_type
                   << "' has the wrong format; skipping this Op.";
      event_to_cost_success = false;
      continue;
    }
    size_t op_push_index = event_index;

    while (event_index < main_thread_events.size()) {
      // Is it possible to push many Ops with the same type and then pop?
      // A ControlFlow Op can behave like that, but this version only supports
      // the global block
      // TODO(zhhsplendid): make a more strict mapping between push and pop
      if (main_thread_events[event_index].name() == op_type &&
          main_thread_events[event_index].type() ==
              platform::EventType::kPopRange) {
        break;
      }
      ++event_index;
    }
    if (event_index >= main_thread_events.size()) {
      LOG(WARNING) << "Input time_events for Op " << i << ", type '" << op_type
                   << "' has the wrong format; skipping this Op.";
      event_to_cost_success = false;
      continue;
    }
    size_t op_pop_index = event_index;
    double cpu_time_ms = main_thread_events[op_push_index].CpuElapsedMs(
        main_thread_events[op_pop_index]);
    double gpu_time_ms = 0;
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
    gpu_time_ms = main_thread_events[op_push_index].CudaElapsedMs(
        main_thread_events[op_pop_index]);
#endif
    double time_ms = gpu_time_ms + cpu_time_ms;
    op_time_ms_[i] = time_ms;
  }

  event_index = 0;
  int start_profiler_idx = -1;
  int stop_profiler_idx = -1;
  while (event_index < main_thread_events.size()) {
    if (main_thread_events[event_index].name() == "_start_profiler_") {
      start_profiler_idx = event_index;
    } else if (main_thread_events[event_index].name() == "_stop_profiler_") {
      stop_profiler_idx = event_index;
      break;
    }
    ++event_index;
  }
  if (start_profiler_idx != -1 && stop_profiler_idx != -1) {
    double cpu_time_ms = main_thread_events[start_profiler_idx].CpuElapsedMs(
        main_thread_events[stop_profiler_idx]);

[Review comment on the CpuElapsedMs call above: Not good now, because we are not sure about the profiler.]
    double gpu_time_ms = 0;
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
    gpu_time_ms = main_thread_events[start_profiler_idx].CudaElapsedMs(
        main_thread_events[stop_profiler_idx]);
#endif
    whole_time_ms_ = gpu_time_ms + cpu_time_ms;
  } else {
    LOG(WARNING) << "Input time_events for whole-program time have the wrong "
                    "format";
    event_to_cost_success = false;
  }

  return event_to_cost_success;
}
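
// Editorial note (not in the PR): for a two-op program, the main-thread event
// stream consumed by SetCostData above is expected to look roughly like
//   _start_profiler_, push(op0), pop(op0), push(op1), pop(op1),
//   _stop_profiler_
// and each op's cost is the CPU (plus, on CUDA/HIP builds, GPU) elapsed time
// between its push and pop events. A standalone sketch of this matching loop
// follows the end of this file.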

void PrintEvents(const std::vector<std::vector<Event>>* time_events,
                 const std::vector<std::vector<MemEvent>>* mem_events) {
  if (time_events != nullptr) {
    for (size_t i = 0; i < time_events->size(); ++i) {
      for (size_t j = 0; j < (*time_events)[i].size(); ++j) {
        VLOG(4) << "Print time event (" << i << ", " << j << ")" << std::endl;
        VLOG(4) << (*time_events)[i][j].name() << " "
                << (*time_events)[i][j].attr() << std::endl;
        VLOG(4) << "This: " << &(*time_events)[i][j]
                << ", Parent: " << (*time_events)[i][j].parent() << std::endl;
        if ((*time_events)[i][j].role() == platform::EventRole::kInnerOp) {
          VLOG(4) << "role kInnerOp" << std::endl;
        } else if ((*time_events)[i][j].role() ==
                   platform::EventRole::kUniqueOp) {
          VLOG(4) << "role kUniqueOp" << std::endl;
        } else if ((*time_events)[i][j].role() ==
                   platform::EventRole::kOrdinary) {
          VLOG(4) << "role kOrdinary" << std::endl;
        } else if ((*time_events)[i][j].role() ==
                   platform::EventRole::kSpecial) {
          VLOG(4) << "role kSpecial" << std::endl;
        }

        if ((*time_events)[i][j].type() == platform::EventType::kPopRange) {
          VLOG(4) << "type kPopRange" << std::endl;
        } else if ((*time_events)[i][j].type() ==
                   platform::EventType::kPushRange) {
          VLOG(4) << "type kPushRange" << std::endl;
        } else if ((*time_events)[i][j].type() == platform::EventType::kMark) {
          VLOG(4) << "type kMark" << std::endl;
        }
        VLOG(4) << std::endl;
      }
    }
  }
  if (mem_events != nullptr) {
    for (size_t i = 0; i < mem_events->size(); ++i) {
      for (size_t j = 0; j < (*mem_events)[i].size(); ++j) {
        VLOG(4) << "Print mem event (" << i << ", " << j << ")" << std::endl;
        VLOG(4) << (*mem_events)[i][j].annotation() << std::endl;
      }
    }
  }
}
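
// Note (editorial assumption, not in the PR): the dumps above go to VLOG
// level 4, so with Paddle's glog-based logging they typically appear only
// when verbose logging is enabled, e.g. by setting GLOG_v=4.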

std::string ToLowerCopy(const std::string& in) {
  std::string out(in);
  std::transform(out.begin(), out.end(), out.begin(),
                 [](unsigned char c) { return std::tolower(c); });
  return out;
}

CostData CostModel::ProfileMeasure(
    const ProgramDesc& main_program, const ProgramDesc& startup_program,
    const std::string& device,
    const std::vector<std::string>& fetch_cost_list) const {
  // Currently fetch_cost_list is unused
  // TODO(zhhsplendid): support different fetch data

  platform::ProfilerState profiler_state;
  platform::Place place;

  std::string device_lower_case = ToLowerCopy(device);
  if (device_lower_case == "cpu") {
    profiler_state = platform::ProfilerState::kCPU;
    place = platform::CPUPlace();
  } else if (device_lower_case == "gpu") {
    profiler_state = platform::ProfilerState::kAll;
    place = platform::CUDAPlace();
  } else {
    PADDLE_THROW(platform::errors::Unimplemented(
        "Device %s is not supported in CostModel now", device));
  }

  Executor executor(place);
  Scope scope;
  executor.Run(startup_program, &scope, /*block_id = */ 0);

  // TODO(zhhsplendid): handle the case that Profiler is already enabled
  SetTracerOption(platform::TracerOption::kAllOpDetail);
  EnableProfiler(profiler_state);
  executor.Run(main_program, &scope, /*block_id = */ 0);

  std::unique_ptr<std::vector<std::vector<Event>>> time_events(
      new std::vector<std::vector<Event>>());
  std::unique_ptr<std::vector<std::vector<MemEvent>>> mem_events(
      new std::vector<std::vector<MemEvent>>());

  CompleteProfilerEvents(/*tracer_profile= */ nullptr, time_events.get(),
                         mem_events.get());

  // TODO(zhhsplendid): remove debug vlog after this series of work
  PrintEvents(time_events.get(), mem_events.get());

  // Convert events to cost data
  CostData cost_data;
  cost_data.SetCostData(main_program, *time_events);

  return cost_data;
}

} // namespace framework
} // namespace paddle
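
As promised above, here is a minimal, self-contained sketch of the push/pop matching idea used in SetCostData. The Evt struct, the timestamps, and the op names are hypothetical stand-ins for platform::Event and a real program; this is a sketch of the technique, not Paddle's API.

// build: g++ -std=c++11 push_pop_sketch.cc
#include <iostream>
#include <string>
#include <vector>

enum class EvtType { kPushRange, kPopRange };
// Reduced to the fields the matching loop needs; not Paddle's real Event.
struct Evt {
  std::string name;
  EvtType type;
  double ts_ms;
};

int main() {
  // Profiler output for two ops on the main thread, in issue order.
  std::vector<Evt> events = {{"relu", EvtType::kPushRange, 0.0},
                             {"relu", EvtType::kPopRange, 1.5},
                             {"matmul", EvtType::kPushRange, 1.5},
                             {"matmul", EvtType::kPopRange, 4.0}};
  // Op types in program order, mirroring global_block.Op(i)->Type().
  std::vector<std::string> op_types = {"relu", "matmul"};

  size_t idx = 0;
  for (size_t i = 0; i < op_types.size(); ++i) {
    // Scan forward to the push event for this op.
    while (idx < events.size() && !(events[idx].name == op_types[i] &&
                                    events[idx].type == EvtType::kPushRange)) {
      ++idx;
    }
    if (idx >= events.size()) break;
    size_t push = idx;
    // Scan forward to the matching pop event.
    while (idx < events.size() && !(events[idx].name == op_types[i] &&
                                    events[idx].type == EvtType::kPopRange)) {
      ++idx;
    }
    if (idx >= events.size()) break;
    std::cout << op_types[i] << ": " << events[idx].ts_ms - events[push].ts_ms
              << " ms\n";
  }
  return 0;
}

Run on the sample data it prints "relu: 1.5 ms" and "matmul: 2.5 ms"; the single forward-scanning cursor is why events with the wrong push/pop ordering make SetCostData skip an op and return false.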
paddle/fluid/framework/ir/cost_model.h (new file) @@ -0,0 +1,85 @@
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#pragma once

#include <functional>
#include <map>
#include <memory>
#include <string>
#include <unordered_map>
#include <unordered_set>
#include <vector>

#include "paddle/fluid/framework/ir/graph.h"
#include "paddle/fluid/framework/ir/node.h"
#include "paddle/fluid/framework/program_desc.h"
#include "paddle/fluid/platform/profiler.h"
#include "paddle/fluid/platform/variant.h"

namespace paddle {
namespace framework {

class CostData {
 public:
  CostData() {}

  ~CostData();

  // Support global block only
  // TODO(zhhsplendid): add support for sub-block
  double GetOpTimeMs(int op_id) const;
  double GetOpMemoryBytes(int op_id) const;
  double GetWholeTimeMs() const;
  double GetWholeMemoryBytes() const;

  const ir::Graph* GetGraph() const;
  const ProgramDesc* GetProgram() const;

  // Support Time Event only
  // TODO(zhhsplendid): add memory
  bool SetCostData(
      const ProgramDesc& program,
      const std::vector<std::vector<platform::Event>>& time_events);

  static const double NOT_MEASURED;

 private:
  ir::Graph* graph_{nullptr};
  ProgramDesc* program_{nullptr};
  std::map<int, double> op_time_ms_;  // from Op Node id to time
  std::map<int, double>
      op_memory_bytes_;  // from Op Node id to total memory bytes
  std::map<int, double> comm_;  // from Op Node id to communicate cost
  double whole_time_ms_{
      NOT_MEASURED};  // time cost of the whole program or graph
  double whole_memory_bytes_{
      NOT_MEASURED};  // memory cost of the whole program or graph
  double whole_comm_{
      NOT_MEASURED};  // communication cost of the whole program or graph
};

class CostModel {
 public:
  CostModel() {}
  ~CostModel() {}

  CostData ProfileMeasure(
      const ProgramDesc& main_program, const ProgramDesc& startup_program,
      const std::string& device,
      const std::vector<std::string>& fetch_cost_list) const;
};

} // namespace framework
} // namespace paddle
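
For orientation, a hedged usage sketch of the API declared above, assuming main_program and startup_program are ProgramDescs built elsewhere; the function name MeasureExample is mine, not part of the PR.

#include <iostream>

#include "paddle/fluid/framework/ir/cost_model.h"

void MeasureExample(const paddle::framework::ProgramDesc& main_program,
                    const paddle::framework::ProgramDesc& startup_program) {
  paddle::framework::CostModel model;
  // fetch_cost_list is accepted but currently unused by ProfileMeasure.
  paddle::framework::CostData data =
      model.ProfileMeasure(main_program, startup_program, "cpu", {});
  // Whole-program metrics default to NOT_MEASURED (-1), so a negative
  // value means the events could not be converted to costs.
  if (data.GetWholeTimeMs() >= 0) {
    std::cout << "whole-program time: " << data.GetWholeTimeMs() << " ms\n";
    std::cout << "first op time: " << data.GetOpTimeMs(0) << " ms\n";
  }
}

Note that the per-op getters go through std::map::at, so querying an op id that was never measured throws std::out_of_range rather than returning NOT_MEASURED.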
Review comment (on the int op_id getters): I think it's better to use size_t to replace int, because global_block.OpSize() returns size_t.

Reply: A little background: we will also support Graph in the future, and then there will be int node ids. We are not sure whether a graph id can be negative.
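
To make that trade-off concrete, here is a small self-contained sketch (mine, not part of the PR): size_t matches what OpSize() returns, while a signed id keeps room for negative sentinels if future graph node ids need them.

#include <cstddef>
#include <map>

int main() {
  std::map<int, double> op_time_ms;  // signed keys, as in the PR
  std::size_t op_size = 3;           // the type OpSize() returns
  for (std::size_t i = 0; i < op_size; ++i) {
    // Narrowing size_t -> int is fine for realistic op counts, but
    // -Wsign-conversion-style warnings will flag it.
    op_time_ms[static_cast<int>(i)] = 0.0;
  }
  const int kUnknownId = -1;  // a sentinel that only a signed id allows
  return op_time_ms.count(kUnknownId) ? 1 : 0;
}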