Skip to content

Commit

Permalink
[PerfModel] Add Profile observer
Browse files Browse the repository at this point in the history
Adds a profile observer to the system. It outputs the following information:
1) Input tensor sizes
2) Argument list
3) Output tensor sizes
4) Operator run time

Example output:
  I0206 14:00:51.217067 1730559 profile_observer_gpu.cc:53] --------- Starting operator Conv op#0 ---------
  I0206 14:00:51.217073 1730559 profile_observer_gpu.cc:65] Input 0: Tensor gpu_0/data of type float. Dims: (32,3,227,227,):
  I0206 14:00:51.217077 1730559 profile_observer_gpu.cc:65] Input 1: Tensor gpu_0/conv1_w of type float. Dims: (64,3,7,7,):
  I0206 14:00:51.217082 1730559 profile_observer_gpu.cc:71] Argument 0: name: "kernel" i: 7
  I0206 14:00:51.217087 1730559 profile_observer_gpu.cc:71] Argument 1: name: "enable_tensor_core" i: 0
  I0206 14:00:51.217089 1730559 profile_observer_gpu.cc:71] Argument 2: name: "exhaustive_search" i: 1
  I0206 14:00:51.217092 1730559 profile_observer_gpu.cc:71] Argument 3: name: "float16_compute" i: 0
  I0206 14:00:51.217095 1730559 profile_observer_gpu.cc:71] Argument 4: name: "stride" i: 2
  I0206 14:00:51.217099 1730559 profile_observer_gpu.cc:71] Argument 5: name: "pad" i: 3
  I0206 14:00:51.217103 1730559 profile_observer_gpu.cc:71] Argument 6: name: "order" s: "NCHW"
  I0206 14:00:51.217105 1730559 profile_observer_gpu.cc:71] Argument 7: name: "ws_nbytes_limit" i: 67108864
  I0206 14:00:51.217109 1730559 profile_observer_gpu.cc:85] Output 0: Tensor gpu_0/conv1 of type float. Dims: (32,64,114,114,):
  I0206 14:00:51.217111 1730559 profile_observer_gpu.cc:88] --------- Finished operator Conv in 1.12685 ms ---------

Example output for internal RNN op (from seq2seq):
  I0219 18:57:06.779331 2960991 profile_observer_gpu.cc:52] --------- Starting operator LSTMUnit op#3161697160-7 ---------
  I0219 18:57:06.779336 2960991 profile_observer_gpu.cc:59] Input 0: Tensor model0/encoder/layer3/lstm/hidden_t_prev of type float. Dims: (1,1,512,):
  I0219 18:57:06.779340 2960991 profile_observer_gpu.cc:59] Input 1: Tensor model0/encoder/layer3/lstm/cell_t_prev of type float. Dims: (1,1,512,):
  I0219 18:57:06.779343 2960991 profile_observer_gpu.cc:59] Input 2: Tensor model0/encoder/layer3/lstm/gates_t of type float. Dims: (1,1,2048,):
  I0219 18:57:06.779346 2960991 profile_observer_gpu.cc:59] Input 3: Tensor encoder_lengths of type int. Dims: (1,):
  I0219 18:57:06.779350 2960991 profile_observer_gpu.cc:59] Input 4: Tensor timestep_rnnexec_t24 of type int. Dims: (1,):
  I0219 18:57:06.779353 2960991 profile_observer_gpu.cc:70] Argument 0: name: "no_sequence_lengths" i: 0
  I0219 18:57:06.779357 2960991 profile_observer_gpu.cc:70] Argument 1: name: "drop_states" i: 0
  I0219 18:57:06.779362 2960991 profile_observer_gpu.cc:70] Argument 2: name: "forget_bias" f: 0
  I0219 18:57:06.779366 2960991 profile_observer_gpu.cc:79] Output 0: Tensor model0/encoder/layer3/lstm/hidden_t of type float. Dims: (1,1,512,):
  I0219 18:57:06.779369 2960991 profile_observer_gpu.cc:79] Output 1: Tensor model0/encoder/layer3/lstm/cell_t of type float. Dims: (1,1,512,):
  I0219 18:57:06.779372 2960991 profile_observer_gpu.cc:89] RecurrentNetwork 3161697160: order: 7
  I0219 18:57:06.779373 2960991 profile_observer_gpu.cc:92] --------- Finished operator LSTMUnit in 0.00153923 ms ---------

Known limitations:
1) Separate CPU and GPU builds are not yet supported

Once this is approved, I'll port the changes over to OSS
  • Loading branch information
Martin Schatz authored and houseroad committed Mar 28, 2018
1 parent 677c8d6 commit d5e38a8
Show file tree
Hide file tree
Showing 3 changed files with 242 additions and 0 deletions.
4 changes: 4 additions & 0 deletions caffe2/core/operator.h
Original file line number Diff line number Diff line change
Expand Up @@ -460,6 +460,10 @@ class Operator : public OperatorBase {
return HasAsyncPart() && context_.SupportsAsyncScheduling();
}

// Returns a non-owning pointer to this operator's execution context
// (e.g. the CUDAContext for GPU operators). Exposed so observers such as
// the profile observer can query device/stream information; the pointer
// must not outlive the operator.
const Context* getContext() const {
return &context_;
}

protected:
void RecordEvent(const char* err_msg = nullptr) final {
if (event_) {
Expand Down
115 changes: 115 additions & 0 deletions caffe2/observers/profile_observer.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,115 @@
/**
* Copyright (c) 2016-present, Facebook, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#pragma once

#include <unordered_map>

#include "caffe2/core/common.h"
#include "caffe2/core/context_gpu.h"
#include "caffe2/core/event.h"
#include "caffe2/core/net.h"
#include "caffe2/core/observer.h"
#include "caffe2/core/operator.h"
#include "caffe2/core/timer.h"
#include "caffe2/observers/operator_attaching_net_observer.h"
#include "caffe2/operators/rnn/rnn_capable_operator_observer.h"

namespace caffe2 {

/**
* This observer displays a description of each operator executed in a network.
* This includes input and tensors (name, size, type), arguments, and execution
* time. This can be used to analyze different performance characteristics.
* NOTE: Currently this observer only supports synchronized computation
**/

class ProfileObserver;
/**
 * Holds per-operator timing state shared by the profile observers.
 * The CPU path measures wall-clock time via Timer; the GPU path uses a
 * pair of CUDA events recorded on the operator's stream (the events are
 * created in the observer's Start()/Stop() and destroyed in Stop()).
 */
class ProfileCounter {
 public:
  // `explicit` is meaningless on a zero-argument constructor; a defaulted
  // constructor expresses the same intent more idiomatically.
  ProfileCounter() = default;

 protected:
  Timer timer_;              // CPU wall-clock timer
  float start_time_ = 0.0f;  // CPU-path start timestamp, in ms
  float run_time_ = 0.0f;    // measured operator run time, in ms
  cudaEvent_t start_;        // GPU-path start event
  cudaEvent_t stop_;         // GPU-path stop event
};

/**
 * Observer attached to a single operator. On each run it measures the
 * operator's execution time and logs inputs, arguments, outputs, and
 * timing via Dump(). Instances are created by ProfileObserver for every
 * operator of the observed net.
 */
class ProfileOperatorObserver : public ProfileCounter,
                                public RNNCapableOperatorObserver {
 public:
  explicit ProfileOperatorObserver(OperatorBase* subject) = delete;
  explicit ProfileOperatorObserver(
      OperatorBase* subject,
      ProfileObserver* netObserver)
      : RNNCapableOperatorObserver(subject), netObserver_(netObserver) {
    // Cache the net position; subject may be null (guarded here).
    if (subject) {
      net_position_ = subject->net_position();
    }
  }
  // Used when the observer is copied into an RNN executor step: carries an
  // explicit net position plus the RNN timestep order (see rnnCopy()).
  explicit ProfileOperatorObserver(
      OperatorBase* subject,
      ProfileObserver* netObserver,
      int net_position,
      int rnn_order)
      : ProfileOperatorObserver(subject, netObserver) {
    net_position_ = net_position;
    rnn_order_ = rnn_order;
  }

  // Clones this observer for an operator instance inside an RNN executor,
  // tagging the copy with the given timestep order.
  std::unique_ptr<ObserverBase<OperatorBase>> rnnCopy(
      OperatorBase* subject,
      int rnnOrder) const override;

  // Logs the subject operator's inputs, arguments, outputs, and run time.
  void Dump() const;

  // Identifier used in log lines: "<net_position>" for regular ops, or
  // "<net_position>-<rnn_order>" for ops run inside an RNN executor.
  virtual std::string getId() const {
    std::stringstream ss;
    ss << net_position_;
    if (rnn_order_ != OperatorBase::kNoNetPositionSet) {
      ss << "-" << rnn_order_;
    }
    return ss.str();
  }

 protected:
  ProfileObserver* netObserver_;
  int net_position_; // Needed because this is not visible in RNN Executor

 private:
  void Start() override;
  void Stop() override;
};

/**
 * Net-level observer: attaches a ProfileOperatorObserver to every operator
 * of the observed net, so each operator execution is logged with its
 * tensor sizes, arguments, and timing.
 * NOTE: Currently only synchronized computation is supported.
 */
class ProfileObserver final : public OperatorAttachingNetObserver<
                                  ProfileOperatorObserver,
                                  ProfileObserver> {
 public:
  explicit ProfileObserver(NetBase* subject)
      : OperatorAttachingNetObserver<ProfileOperatorObserver, ProfileObserver>(
            subject,
            this) {}

  // All per-operator work happens in the attached operator observers;
  // nothing to do at the net level. (Fixed: stray semicolons after the
  // function bodies, i.e. `override{};`.)
  void Start() override {}
  void Stop() override {}

 private:
  vector<const ProfileOperatorObserver*> operator_observers_;
};

} // namespace caffe2
123 changes: 123 additions & 0 deletions caffe2/observers/profile_observer_gpu.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,123 @@
/**
* Copyright (c) 2016-present, Facebook, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#include "caffe2/core/logging.h"
#include "profile_observer.h"

namespace caffe2 {

void ProfileOperatorObserver::Dump() const {
  // Serialize logging across threads so each operator's report stays
  // contiguous in the log.
  static std::mutex loggingMutex;
  std::lock_guard<std::mutex> lock(loggingMutex);

  const auto& def = subject_->debug_def();
  LOG(INFO) << "--------- Starting operator " << def.type()
            << " op#" << getId() << " ---------";

  // Report every input tensor's name, type, and dimensions. Inputs living
  // on neither CPU nor CUDA are skipped.
  const int numInputs = subject_->InputSize();
  for (int idx = 0; idx < numInputs; ++idx) {
    if (subject_->InputIsType<TensorCPU>(idx)) {
      TensorPrinter printer(def.input(idx));
      LOG(INFO) << "Input " << idx << ": "
                << printer.MetaStr(subject_->Input<TensorCPU>(idx));
    } else if (subject_->InputIsType<TensorCUDA>(idx)) {
      TensorPrinter printer(def.input(idx));
      LOG(INFO) << "Input " << idx << ": "
                << printer.MetaStr(subject_->Input<TensorCUDA>(idx));
    }
  }

  // Report the operator's argument list verbatim from the OperatorDef.
  int argIdx = 0;
  for (const auto& arg : def.arg()) {
    LOG(INFO) << "Argument " << argIdx++ << ": " << arg.ShortDebugString();
  }

  // Report every output tensor's name, type, and dimensions.
  const int numOutputs = subject_->OutputSize();
  for (int idx = 0; idx < numOutputs; ++idx) {
    if (subject_->OutputIsType<TensorCPU>(idx)) {
      TensorPrinter printer(def.output(idx));
      LOG(INFO) << "Output " << idx << ": "
                << printer.MetaStr(*subject_->Output<TensorCPU>(idx));
    } else if (subject_->OutputIsType<TensorCUDA>(idx)) {
      TensorPrinter printer(def.output(idx));
      LOG(INFO) << "Output " << idx << ": "
                << printer.MetaStr(*subject_->Output<TensorCUDA>(idx));
    }
  }

  LOG(INFO) << "--------- Finished operator " << def.type()
            << " in " << run_time_ << " ms ---------";
}

void ProfileOperatorObserver::Start() {
auto cudaOp = dynamic_cast_if_rtti<const Operator<CUDAContext>*>(subject_);
if (cudaOp) {
auto context = cudaOp->getContext();
int device;
cudaGetDevice(&device);

cudaSetDevice(context->cuda_gpu_id());
cudaEventCreate(&start_);
cudaEventRecord(start_, context->cuda_stream());

cudaSetDevice(device);

cudaError_t error = cudaGetLastError();
if (error != cudaSuccess) {
CAFFE_THROW("Encountered CUDA error Start: ", cudaGetErrorString(error));
}
} else {
start_time_ = timer_.MilliSeconds();
}
}

void ProfileOperatorObserver::Stop() {
auto cudaOp = dynamic_cast_if_rtti<const Operator<CUDAContext>*>(subject_);
if (cudaOp) {
auto context = cudaOp->getContext();
int device;
cudaGetDevice(&device);

cudaSetDevice(context->cuda_gpu_id());
cudaEventCreate(&stop_);
cudaEventRecord(stop_, context->cuda_stream());
cudaEventSynchronize(stop_);
cudaEventElapsedTime(&run_time_, start_, stop_);
cudaEventDestroy(start_);
cudaEventDestroy(stop_);

cudaSetDevice(device);

cudaError_t error = cudaGetLastError();
if (error != cudaSuccess) {
CAFFE_THROW("Encountered CUDA error Stop: ", cudaGetErrorString(error));
}
} else {
run_time_ = timer_.MilliSeconds() - start_time_;
}

Dump();
}

// Clones this observer for an operator instance created by the RNN
// executor; the copy keeps the parent's net position and tags itself with
// the RNN timestep order so getId() yields "<position>-<order>".
std::unique_ptr<ObserverBase<OperatorBase>> ProfileOperatorObserver::rnnCopy(
    OperatorBase* subject,
    int rnn_order) const {
  auto* copy = new ProfileOperatorObserver(
      subject, netObserver_, net_position_, rnn_order);
  return std::unique_ptr<ObserverBase<OperatorBase>>(copy);
}
} // namespace caffe2

0 comments on commit d5e38a8

Please sign in to comment.