[PerfModel] Add Profile observer

Adds profile observer to system. This outputs the following information 1) Input tensor sizes 2) Argument list 3) Output tensor sizes 4) Operator run time Example output: I0206 14:00:51.217067 1730559 profile_observer_gpu.cc:53] --------- Starting operator Conv op#0 --------- I0206 14:00:51.217073 1730559 profile_observer_gpu.cc:65] Input 0: Tensor gpu_0/data of type float. Dims: (32,3,227,227,): I0206 14:00:51.217077 1730559 profile_observer_gpu.cc:65] Input 1: Tensor gpu_0/conv1_w of type float. Dims: (64,3,7,7,): I0206 14:00:51.217082 1730559 profile_observer_gpu.cc:71] Argument 0: name: "kernel" i: 7 I0206 14:00:51.217087 1730559 profile_observer_gpu.cc:71] Argument 1: name: "enable_tensor_core" i: 0 I0206 14:00:51.217089 1730559 profile_observer_gpu.cc:71] Argument 2: name: "exhaustive_search" i: 1 I0206 14:00:51.217092 1730559 profile_observer_gpu.cc:71] Argument 3: name: "float16_compute" i: 0 I0206 14:00:51.217095 1730559 profile_observer_gpu.cc:71] Argument 4: name: "stride" i: 2 I0206 14:00:51.217099 1730559 profile_observer_gpu.cc:71] Argument 5: name: "pad" i: 3 I0206 14:00:51.217103 1730559 profile_observer_gpu.cc:71] Argument 6: name: "order" s: "NCHW" I0206 14:00:51.217105 1730559 profile_observer_gpu.cc:71] Argument 7: name: "ws_nbytes_limit" i: 67108864 I0206 14:00:51.217109 1730559 profile_observer_gpu.cc:85] Output 0: Tensor gpu_0/conv1 of type float. Dims: (32,64,114,114,): I0206 14:00:51.217111 1730559 profile_observer_gpu.cc:88] --------- Finished operator Conv in 1.12685 ms --------- Example output for internal RNN op (from seq2seq): I0219 18:57:06.779331 2960991 profile_observer_gpu.cc:52] --------- Starting operator LSTMUnit op#3161697160-7 --------- I0219 18:57:06.779336 2960991 profile_observer_gpu.cc:59] Input 0: Tensor model0/encoder/layer3/lstm/hidden_t_prev of type float. Dims: (1,1,512,): I0219 18:57:06.779340 2960991 profile_observer_gpu.cc:59] Input 1: Tensor model0/encoder/layer3/lstm/cell_t_prev of type float. Dims: (1,1,512,): I0219 18:57:06.779343 2960991 profile_observer_gpu.cc:59] Input 2: Tensor model0/encoder/layer3/lstm/gates_t of type float. Dims: (1,1,2048,): I0219 18:57:06.779346 2960991 profile_observer_gpu.cc:59] Input 3: Tensor encoder_lengths of type int. Dims: (1,): I0219 18:57:06.779350 2960991 profile_observer_gpu.cc:59] Input 4: Tensor timestep_rnnexec_t24 of type int. Dims: (1,): I0219 18:57:06.779353 2960991 profile_observer_gpu.cc:70] Argument 0: name: "no_sequence_lengths" i: 0 I0219 18:57:06.779357 2960991 profile_observer_gpu.cc:70] Argument 1: name: "drop_states" i: 0 I0219 18:57:06.779362 2960991 profile_observer_gpu.cc:70] Argument 2: name: "forget_bias" f: 0 I0219 18:57:06.779366 2960991 profile_observer_gpu.cc:79] Output 0: Tensor model0/encoder/layer3/lstm/hidden_t of type float. Dims: (1,1,512,): I0219 18:57:06.779369 2960991 profile_observer_gpu.cc:79] Output 1: Tensor model0/encoder/layer3/lstm/cell_t of type float. Dims: (1,1,512,): I0219 18:57:06.779372 2960991 profile_observer_gpu.cc:89] RecurrentNetwork 3161697160: order: 7 I0219 18:57:06.779373 2960991 profile_observer_gpu.cc:92] --------- Finished operator LSTMUnit in 0.00153923 ms --------- Existing deficiencies: 1) Need support to create separate CPU and GPU builds Once this is approved, I'll port the changes over to OSS
lcy-seso · Mar 28, 2018 · d5e38a8 · d5e38a8
1 parent 677c8d6
commit d5e38a8
Show file tree

Hide file tree

Showing 3 changed files with 242 additions and 0 deletions.
diff --git a/caffe2/core/operator.h b/caffe2/core/operator.h
@@ -460,6 +460,10 @@ class Operator : public OperatorBase {
     return HasAsyncPart() && context_.SupportsAsyncScheduling();
   }
 
+  const Context* getContext() const {
+    return &context_;
+  }
+
  protected:
   void RecordEvent(const char* err_msg = nullptr) final {
     if (event_) {

diff --git a/caffe2/observers/profile_observer.h b/caffe2/observers/profile_observer.h
@@ -0,0 +1,115 @@
+/**
+ * Copyright (c) 2016-present, Facebook, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <unordered_map>
+
+#include "caffe2/core/common.h"
+#include "caffe2/core/context_gpu.h"
+#include "caffe2/core/event.h"
+#include "caffe2/core/net.h"
+#include "caffe2/core/observer.h"
+#include "caffe2/core/operator.h"
+#include "caffe2/core/timer.h"
+#include "caffe2/observers/operator_attaching_net_observer.h"
+#include "caffe2/operators/rnn/rnn_capable_operator_observer.h"
+
+namespace caffe2 {
+
+/**
+ * This observer displays a description of each operator executed in a network.
+ * This includes input and tensors (name, size, type), arguments, and execution
+ * time. This can be used to analyze different performance characteristics.
+ * NOTE: Currently this observer only supports synchronized computation
+ **/
+
+class ProfileObserver;
+class ProfileCounter {
+ public:
+  explicit ProfileCounter() {}
+
+ protected:
+  Timer timer_;
+  float start_time_ = 0.0f;
+  float run_time_ = 0.0f;
+  cudaEvent_t start_;
+  cudaEvent_t stop_;
+};
+
+class ProfileOperatorObserver : public ProfileCounter,
+                                public RNNCapableOperatorObserver {
+ public:
+  explicit ProfileOperatorObserver(OperatorBase* subject) = delete;
+  explicit ProfileOperatorObserver(
+      OperatorBase* subject,
+      ProfileObserver* netObserver)
+      : RNNCapableOperatorObserver(subject), netObserver_(netObserver) {
+    if (subject) {
+      net_position_ = subject->net_position();
+    }
+  }
+  explicit ProfileOperatorObserver(
+      OperatorBase* subject,
+      ProfileObserver* netObserver,
+      int net_position,
+      int rnn_order)
+      : ProfileOperatorObserver(subject, netObserver) {
+    net_position_ = net_position;
+    rnn_order_ = rnn_order;
+  }
+
+  std::unique_ptr<ObserverBase<OperatorBase>> rnnCopy(
+      OperatorBase* subject,
+      int rnnOrder) const override;
+
+  void Dump() const;
+
+  virtual std::string getId() const {
+    std::stringstream ss;
+    ss << net_position_;
+    if (rnn_order_ != OperatorBase::kNoNetPositionSet) {
+      ss << "-" << rnn_order_;
+    }
+    return ss.str();
+  }
+
+ protected:
+  ProfileObserver* netObserver_;
+  int net_position_; // Needed because this is not visible in RNN Executor
+
+ private:
+  void Start() override;
+  void Stop() override;
+};
+
+class ProfileObserver final : public OperatorAttachingNetObserver<
+                                  ProfileOperatorObserver,
+                                  ProfileObserver> {
+ public:
+  explicit ProfileObserver(NetBase* subject)
+      : OperatorAttachingNetObserver<ProfileOperatorObserver, ProfileObserver>(
+            subject,
+            this) {}
+
+  void Start() override{};
+  void Stop() override{};
+
+ private:
+  vector<const ProfileOperatorObserver*> operator_observers_;
+};
+
+} // namespace caffe2
diff --git a/caffe2/observers/profile_observer_gpu.cc b/caffe2/observers/profile_observer_gpu.cc
@@ -0,0 +1,123 @@
+/**
+ * Copyright (c) 2016-present, Facebook, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "caffe2/core/logging.h"
+#include "profile_observer.h"
+
+namespace caffe2 {
+
+void ProfileOperatorObserver::Dump() const {
+  static std::mutex loggingMutex;
+  std::lock_guard<std::mutex> lock(loggingMutex);
+
+  LOG(INFO) << "--------- Starting operator " << subject_->debug_def().type()
+            << " op#" << getId() << " ---------";
+  for (int i = 0; i < subject_->InputSize(); ++i) {
+    if (subject_->InputIsType<TensorCPU>(i)) {
+      const auto& tensor = subject_->Input<TensorCPU>(i);
+      const auto& name = subject_->debug_def().input(i);
+      TensorPrinter printer(name);
+      LOG(INFO) << "Input " << i << ": " << printer.MetaStr(tensor);
+    } else if (subject_->InputIsType<TensorCUDA>(i)) {
+      const auto& tensor = subject_->Input<TensorCUDA>(i);
+      const auto& name = subject_->debug_def().input(i);
+      TensorPrinter printer(name);
+      LOG(INFO) << "Input " << i << ": " << printer.MetaStr(tensor);
+    }
+  }
+
+  int a = 0;
+  for (const auto& arg : subject_->debug_def().arg()) {
+    LOG(INFO) << "Argument " << a << ": " << arg.ShortDebugString();
+    ++a;
+  }
+
+  for (int o = 0; o < subject_->OutputSize(); ++o) {
+    if (subject_->OutputIsType<TensorCPU>(o)) {
+      auto* tensor = subject_->Output<TensorCPU>(o);
+      const auto& name = subject_->debug_def().output(o);
+      TensorPrinter printer(name);
+      LOG(INFO) << "Output " << o << ": " << printer.MetaStr(*tensor);
+    } else if (subject_->OutputIsType<TensorCUDA>(o)) {
+      auto* tensor = subject_->Output<TensorCUDA>(o);
+      const auto& name = subject_->debug_def().output(o);
+      TensorPrinter printer(name);
+      LOG(INFO) << "Output " << o << ": " << printer.MetaStr(*tensor);
+    }
+  }
+
+  LOG(INFO) << "--------- Finished operator " << subject_->debug_def().type()
+            << " in " << run_time_ << " ms ---------";
+}
+
+void ProfileOperatorObserver::Start() {
+  auto cudaOp = dynamic_cast_if_rtti<const Operator<CUDAContext>*>(subject_);
+  if (cudaOp) {
+    auto context = cudaOp->getContext();
+    int device;
+    cudaGetDevice(&device);
+
+    cudaSetDevice(context->cuda_gpu_id());
+    cudaEventCreate(&start_);
+    cudaEventRecord(start_, context->cuda_stream());
+
+    cudaSetDevice(device);
+
+    cudaError_t error = cudaGetLastError();
+    if (error != cudaSuccess) {
+      CAFFE_THROW("Encountered CUDA error Start: ", cudaGetErrorString(error));
+    }
+  } else {
+    start_time_ = timer_.MilliSeconds();
+  }
+}
+
+void ProfileOperatorObserver::Stop() {
+  auto cudaOp = dynamic_cast_if_rtti<const Operator<CUDAContext>*>(subject_);
+  if (cudaOp) {
+    auto context = cudaOp->getContext();
+    int device;
+    cudaGetDevice(&device);
+
+    cudaSetDevice(context->cuda_gpu_id());
+    cudaEventCreate(&stop_);
+    cudaEventRecord(stop_, context->cuda_stream());
+    cudaEventSynchronize(stop_);
+    cudaEventElapsedTime(&run_time_, start_, stop_);
+    cudaEventDestroy(start_);
+    cudaEventDestroy(stop_);
+
+    cudaSetDevice(device);
+
+    cudaError_t error = cudaGetLastError();
+    if (error != cudaSuccess) {
+      CAFFE_THROW("Encountered CUDA error Stop: ", cudaGetErrorString(error));
+    }
+  } else {
+    run_time_ = timer_.MilliSeconds() - start_time_;
+  }
+
+  Dump();
+}
+
+std::unique_ptr<ObserverBase<OperatorBase>> ProfileOperatorObserver::rnnCopy(
+    OperatorBase* subject,
+    int rnn_order) const {
+  return std::unique_ptr<ObserverBase<OperatorBase>>(
+      new ProfileOperatorObserver(
+          subject, netObserver_, net_position_, rnn_order));
+}
+} // namespace caffe2