[Onnxifi] Warmup cache of output shapes (pytorch#48346)
Summary:
Pull Request resolved: pytorch#48346

Onnxifi now accepts output shape info for all possible batch sizes. This is used to avoid doing shape inference inside `OnnxifiOp::extractOutputBatchSizes()`.

FB:
In this diff we pre-calculate output shapes for all possible batch sizes inside `PredictorContainer`, where we have enough data to do so. This data is then passed down to `OnnxifiOp`.
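
The warm-up idea, as a minimal standalone C++ sketch (hypothetical names such as `inferOutputShapes` and `warmupOutputShapes`; not the actual `PredictorContainer` code): shapes are inferred once per batch size up front, so the hot path later reduces to a hash-map lookup.

```cpp
// Standalone sketch of the warm-up: pre-compute output shapes for every
// batch size once, so run time becomes a map lookup instead of a
// shape-inference call.
#include <cstdint>
#include <iostream>
#include <unordered_map>
#include <vector>

using Shape = std::vector<uint64_t>;
// batch size -> one shape per op output
using ShapesPerBatchSize = std::unordered_map<int, std::vector<Shape>>;

// Hypothetical stand-in for bound shape inference at a given batch size.
std::vector<Shape> inferOutputShapes(int batch_size) {
  return {{static_cast<uint64_t>(batch_size), 16}};  // e.g. one output [bs, 16]
}

// Warm-up: done once, where there is enough data to cover all batch sizes.
ShapesPerBatchSize warmupOutputShapes(int max_batch_size) {
  ShapesPerBatchSize cache;
  for (int bs = 1; bs <= max_batch_size; ++bs) {
    cache.emplace(bs, inferOutputShapes(bs));
  }
  return cache;
}

int main() {
  const auto cache = warmupOutputShapes(/*max_batch_size=*/32);
  // Hot path: a lookup instead of a 40-50 ms shape-inference call.
  const auto& shapes = cache.at(17);
  std::cout << "output 0 dims: " << shapes[0][0] << " x " << shapes[0][1] << "\n";
  return 0;
}
```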

Here is the dependency graph that I built manually while trying to understand the entire flow.
https://pxl.cl/1rQRv

Test Plan:
Strobelight data https://fburl.com/strobelight/jlhhgt21 shows that `OnnxifiOp::RunOnDevice()` now takes only 2.17% of CPU instead of ~20% CPU with the current implementation.

Also, the current implementation takes dozens of milliseconds according to ipiszy:
> After adding more logs, I found each shape inference call actually takes 40~50ms.

I also temporarily added time measurements for `OnnxifiOp::extractOutputBatchSizes()`. The new implementation typically takes 1 to 4 microseconds, and, when data for the current batch size is not yet present in `output_reshape_info_`, it takes 20-40 microseconds, which is still much better than the current implementation.
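
The gap between the 1-4 microsecond and 20-40 microsecond cases comes from the per-batch-size cache being filled lazily on first use. A minimal sketch of that pattern, under assumed names (`buildReshapeInfo`, `getReshapeInfo`) rather than the operator's actual members:

```cpp
// Sketch of lazy per-batch-size caching: only the first call for a given
// batch size pays the fill cost; later calls are a plain hash-map lookup.
#include <cstdint>
#include <unordered_map>
#include <vector>

struct ReshapeInfo {
  std::vector<std::vector<int32_t>> begins;
  std::vector<std::vector<int32_t>> ends;
};

std::unordered_map<int, ReshapeInfo> reshape_info_cache;

// Hypothetical fill step using shapes already known for this batch size.
ReshapeInfo buildReshapeInfo(int batch_size) {
  ReshapeInfo info;
  info.begins.push_back({0, 0});
  info.ends.push_back({batch_size, 16});
  return info;
}

const ReshapeInfo& getReshapeInfo(int batch_size) {
  auto it = reshape_info_cache.find(batch_size);
  if (it == reshape_info_cache.end()) {
    // First request at this batch size: fill the cache (the slower path).
    it = reshape_info_cache.emplace(batch_size, buildReshapeInfo(batch_size)).first;
  }
  return it->second;  // Cached path: just the lookup.
}

int main() {
  const auto& first = getReshapeInfo(8);   // fills the cache
  const auto& second = getReshapeInfo(8);  // pure lookup
  return first.ends[0][0] == second.ends[0][0] ? 0 : 1;
}
```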

AF canary https://www.internalfb.com/intern/ads/canary/431357944274985799
AI canary https://www.internalfb.com/intern/ads/canary/431365503038313840

Verifying using test tier https://pxl.cl/1sZ4S

Reviewed By: yinghai, ipiszy

Differential Revision: D25047110

fbshipit-source-id: 872dc1578a1e8e7c3ade5f5e2711e77ba290a671
khabinov authored and facebook-github-bot committed Dec 4, 2020
1 parent 0a42003 commit ba3962f
Showing 6 changed files with 216 additions and 114 deletions.
8 changes: 5 additions & 3 deletions caffe2/opt/glow_net_transform.cc
@@ -108,12 +108,13 @@ void onnxifi(
const std::vector<std::string>& output_names,
const std::vector<std::string>& weight_names,
const std::unordered_set<int>& blacklist,
const ShapeInfoMap& shape_hints,
const ShapeInfoMap& shape_hints_max_bs,
bool use_onnx,
size_t max_batch_size,
size_t max_seq_size,
bool load_model_by_blob,
bool predictor_net_ssa_rewritten) {
bool predictor_net_ssa_rewritten,
const std::unordered_map<int, ShapeInfoMap> &shape_hints_per_bs) {
// Split SparseLengthsSumSparse so that we can lower the SparseLengthsSum part
splitSparseLengthsSumSparse(net, *ws);

@@ -143,8 +144,9 @@ void onnxifi(
opts.merge_fp32_inputs_into_fp16 = FLAGS_merge_fp32_inputs_into_fp16;
opts.predictor_net_ssa_rewritten = predictor_net_ssa_rewritten;
opts.timeout = FLAGS_onnxifi_timeout_ms;
opts.shape_hints_per_bs = shape_hints_per_bs;

ShapeInfoMap more_shape_hints = shape_hints;
ShapeInfoMap more_shape_hints = shape_hints_max_bs;
if (!FLAGS_onnxifi_shape_hints.empty()) {
parseShapeInfoMapFromString(FLAGS_onnxifi_shape_hints, more_shape_hints);
}
5 changes: 3 additions & 2 deletions caffe2/opt/glow_net_transform.h
@@ -26,12 +26,13 @@ void onnxifi(
const std::vector<std::string>& output_names,
const std::vector<std::string>& weight_names,
const std::unordered_set<int>& blacklist,
const ShapeInfoMap& shape_hints,
const ShapeInfoMap& shape_hints_max_bs,
bool use_onnx,
size_t max_batch_size = 0,
size_t max_seq_size = 0,
bool load_model_by_blob = false,
bool predictor_net_ssa_rewritten = false);
bool predictor_net_ssa_rewritten = false,
const std::unordered_map<int, ShapeInfoMap> &shape_hints_per_bs = {});

std::unordered_set<int> ParseNetPositionList(const std::string& str);
std::unordered_set<std::string> ParseBlackListOps(const std::string& str);
152 changes: 85 additions & 67 deletions caffe2/opt/onnxifi_op.cc
@@ -300,6 +300,46 @@ details::OutputReshapeInfo OnnxifiOp<CPUContext>::initOutputReshapeInfo()
return output_reshape_info;
}

template <>
template <typename DimContainer>
void OnnxifiOp<CPUContext>::fillOutputReshapeInfo(
const DimContainer& real_shape,
c10::ArrayRef<uint64_t> max_shape,
details::OutputReshapeInfo &output_reshape_info,
int currentIndex) {
CAFFE_ENFORCE_EQ(real_shape.size(), max_shape.size());
const auto dim_size = real_shape.size();
auto& begin = output_reshape_info.begins[currentIndex];
begin.Resize(dim_size);
int32_t* begin_ptr = begin.template mutable_data<int32_t>();
auto& end = output_reshape_info.ends[currentIndex];
end.Resize(dim_size);
int32_t* end_ptr = end.template mutable_data<int32_t>();
int32_t mismatch = 0;
for (int j = 0; j < dim_size; ++j) {
CAFFE_ENFORCE_GE(
max_shape[j],
real_shape[j],
"It is weird that max shape of ",
output_names_[currentIndex],
" is smaller than real shape at dim ",
j,
" (",
max_shape[j],
" vs ",
real_shape[j],
")");
begin_ptr[j] = 0;
if (max_shape[j] >= real_shape[j]) {
end_ptr[j] = real_shape[j];
mismatch += j;
} else {
end_ptr[j] = -1;
}
}
output_reshape_info.fast_path[currentIndex] = !mismatch;
}

template <>
int OnnxifiOp<CPUContext>::extractOutputBatchSizes() {
if (use_onnx_ || !adjust_output_batch_) {
@@ -337,77 +377,55 @@ int OnnxifiOp<CPUContext>::extractOutputBatchSizes() {
return current_batch_size;
}

auto it =
output_reshape_info_.emplace(current_batch_size, initOutputReshapeInfo());
auto& output_reshape_info = it.first->second;
BoundShapeSpec spec(dims[0], max_seq_size_);
auto bound_shape_inferencer =
BoundShapeInferencerRegistry()->Create("C10", spec);
for (int i = 0; i < InputSize(); ++i) {
at::IntArrayRef dim0;
bool quantized = false;
if (this->template InputIsType<int8::Int8TensorCPU>(i)) {
const auto& input_tensor_int8 =
this->template Input<int8::Int8TensorCPU>(i);
const auto& t0 = input_tensor_int8.t;
dim0 = t0.sizes();
quantized = true;
} else {
const auto& t0 = Input(i);
dim0 = t0.sizes();
}
TensorShape shape;
for (const auto d : dim0) {
shape.add_dims(d);
}
std::vector<TensorBoundShape::DimType> dim_type(
shape.dims_size(), TensorBoundShape_DimType_CONSTANT);
if (dim_type.size()) {
dim_type[0] = TensorBoundShape_DimType_BATCH;
auto& output_reshape_info = output_reshape_info_.emplace(current_batch_size, initOutputReshapeInfo()).first->second;

if (use_passed_output_shapes_) {
auto shape_info_it = output_shapes_per_bs_.find(current_batch_size);
CAFFE_ENFORCE(shape_info_it != output_shapes_per_bs_.end(), "Unable to find outputs shapes for bs=", current_batch_size);
CAFFE_ENFORCE_EQ(shape_info_it->second.size(), OutputSize());

for (int i = 0; i < OutputSize(); ++i) {
fillOutputReshapeInfo(shape_info_it->second[i], output_shapes_max_bs_[i], output_reshape_info, i);
}
input_shape_info_[input_names_[i]] =
ShapeInfo(dim_type, std::move(shape), quantized);
}
bound_shape_inferencer->InferBoundShapeAndType(
netdef_, input_shape_info_, nullptr, false);
const auto& shape_info = bound_shape_inferencer->shape_info();
for (int i = 0; i < OutputSize(); ++i) {
const auto it = shape_info.find(output_names_[i]);
CAFFE_ENFORCE(it != shape_info.end());
const auto& real_shape = it->second.shape;
const auto& max_shape = output_shapes_[i];
CAFFE_ENFORCE_EQ(real_shape.dims_size(), max_shape.size());
const auto dim_size = real_shape.dims_size();
auto& begin = output_reshape_info.begins[i];
begin.Resize(dim_size);
int32_t* begin_ptr = begin.template mutable_data<int32_t>();
auto& end = output_reshape_info.ends[i];
end.Resize(dim_size);
int32_t* end_ptr = end.template mutable_data<int32_t>();
int32_t mismatch = 0;
for (int j = 0; j < dim_size; ++j) {
CAFFE_ENFORCE_GE(
max_shape[j],
real_shape.dims(j),
"It is weird that max shape of ",
output_names_[i],
" is smaller than real shape at dim ",
j,
" (",
max_shape[j],
" vs ",
real_shape.dims(j),
")");
begin_ptr[j] = 0;
if (max_shape[j] >= real_shape.dims(j)) {
end_ptr[j] = real_shape.dims(j);
mismatch += j;
} else {
BoundShapeSpec spec(dims[0], max_seq_size_);
auto bound_shape_inferencer =
BoundShapeInferencerRegistry()->Create("C10", spec);
for (int i = 0; i < InputSize(); ++i) {
at::IntArrayRef dim0;
bool quantized = false;
if (this->template InputIsType<int8::Int8TensorCPU>(i)) {
const auto& input_tensor_int8 =
this->template Input<int8::Int8TensorCPU>(i);
const auto& t0 = input_tensor_int8.t;
dim0 = t0.sizes();
quantized = true;
} else {
end_ptr[j] = -1;
const auto& t0 = Input(i);
dim0 = t0.sizes();
}
TensorShape shape;
for (const auto d : dim0) {
shape.add_dims(d);
}
std::vector<TensorBoundShape::DimType> dim_type(
shape.dims_size(), TensorBoundShape_DimType_CONSTANT);
if (dim_type.size()) {
dim_type[0] = TensorBoundShape_DimType_BATCH;
}
input_shape_info_[input_names_[i]] =
ShapeInfo(dim_type, std::move(shape), quantized);
}
bound_shape_inferencer->InferBoundShapeAndType(
netdef_, input_shape_info_, nullptr, false);
const auto& shape_info = bound_shape_inferencer->shape_info();
for (int i = 0; i < OutputSize(); ++i) {
const auto find_res = shape_info.find(output_names_[i]);
CAFFE_ENFORCE(find_res != shape_info.end());
fillOutputReshapeInfo(find_res->second.shape.dims(), output_shapes_max_bs_[i], output_reshape_info, i);
}
output_reshape_info.fast_path[i] = !mismatch;
}

return current_batch_size;
}

@@ -458,7 +476,7 @@ void OnnxifiOp<CPUContext>::setOutputShapeAndType(int output_idx) {
tensor_descriptor.dimensions = tensor_dims.size();
CAFFE_ENFORCE(
tensor_descriptor.dimensions != 0, tensor_descriptor.name, " has 0 dim");
auto& output_shape = output_shapes_[output_idx];
auto& output_shape = output_shapes_max_bs_[output_idx];
output_shape.clear();
output_shape.insert(
output_shape.begin(), tensor_dims.cbegin(), tensor_dims.cend());
47 changes: 43 additions & 4 deletions caffe2/opt/onnxifi_op.h
@@ -19,7 +19,7 @@ namespace caffe2 {
namespace details {

/// Provides slicing info for the outputs. All the vector members should be of
/// the same size as number of outpus of the Onnxifi op.
/// the same size as number of outputs of the Onnxifi op.
struct OutputReshapeInfo {
std::vector<Tensor> begins;
std::vector<Tensor> ends;
@@ -55,6 +55,7 @@ class OnnxifiOp final : public Operator<Context> {
timeout_(this->template GetSingleArgument<int>("timeout", 0)),
nominal_batch_idx_(
this->template GetSingleArgument<int>("nominal_batch_idx", 0)),
use_passed_output_shapes_(this->template GetSingleArgument<int>("use_passed_output_shapes", 0)),
adjust_quantized_offset_(this->template GetSingleArgument<int>(
"adjust_quantized_offset",
128)) {
@@ -86,7 +87,7 @@ class OnnxifiOp final : public Operator<Context> {
all_offsets_.reserve(ws->Blobs().size());
all_scales_.reserve(ws->Blobs().size());
input_shapes_.resize(input_names_.size());
output_shapes_.resize(output_names_.size());
output_shapes_max_bs_.resize(output_names_.size());
quantized_outputs_.resize(output_names_.size(), false);
int output_idx = 0;
ArgumentHelper helper(operator_def);
@@ -127,6 +128,30 @@ class OnnxifiOp final : public Operator<Context> {
adjust_quantized_offset_ = 0;
}

if (use_passed_output_shapes_) {
// Populate output_shapes_per_bs_
for (int bs = 1; bs < max_batch_size_; ++bs) {
auto output_shapes_tp = helper.GetRepeatedArgument<TensorProto>("output_shapes_bs_" + caffe2::to_string(bs));
auto output_qshapes_tp = helper.GetRepeatedArgument<TensorProto>("output_qshapes_bs_" + caffe2::to_string(bs));
CAFFE_ENFORCE_EQ(output_names_.size(), output_shapes_tp.size() + output_qshapes_tp.size());

std::unordered_map<std::string, details::TensorInfo> name_to_shape;
for (const auto& output_shape_tp : output_shapes_tp) {
name_to_shape.emplace(output_shape_tp.name(), details::TensorInfo{output_shape_tp});
}
for (const auto& output_qshape_tp : output_qshapes_tp) {
name_to_shape.emplace(output_qshape_tp.name(), details::TensorInfo{output_qshape_tp});
}

for (output_idx = 0; output_idx < output_names_.size(); ++output_idx) {
auto it = name_to_shape.find(output_names_[output_idx]);
output_shapes_per_bs_[bs].push_back({});
auto &output_shapes = output_shapes_per_bs_[bs].back();
std::copy(it->second.dims.cbegin(), it->second.dims.cend(), std::back_inserter(output_shapes));
}
}
}

// Get output resizing hints
adjust_output_batch_ =
this->template GetSingleArgument<int>("adjust_output_batch", 0);
@@ -333,6 +358,14 @@ class OnnxifiOp final : public Operator<Context> {
#endif
}

/// Helper method for extractOutputBatchSizes(), used to deduplicate code of populating output reshape infos
template <typename DimContainer>
void fillOutputReshapeInfo(
const DimContainer& real_shape,
c10::ArrayRef<uint64_t> max_shape,
details::OutputReshapeInfo &output_reshape_info,
int index);

/// Extract output batch size. If the output batch size is going to be at
/// max_batch_size_, return true indicating that no output shape adjustment is
/// needed. Otherwise, return false.
@@ -418,7 +451,7 @@ class OnnxifiOp final : public Operator<Context> {
int nominal_batch_idx_{0};

// We bind the op input/output by position while ONNXIFI binds input/output by
// names. In addition, op input/output names can be writtten by, for example,
// names. In addition, op input/output names can be written by, for example,
// memonger. We cache the original input/output name of ONNX object here and
// bind them by position.
std::vector<std::string> input_names_;
@@ -428,7 +461,10 @@ class OnnxifiOp final : public Operator<Context> {
NetDef netdef_;

std::vector<c10::SmallVector<uint64_t, 4>> input_shapes_;
std::vector<c10::SmallVector<uint64_t, 4>> output_shapes_;
std::vector<c10::SmallVector<uint64_t, 4>> output_shapes_max_bs_;

// Mapping of batch sizes to output shapes
std::unordered_map<int, std::vector<c10::SmallVector<uint64_t, 4>>> output_shapes_per_bs_;

// Indicate if i-th output is a quantized tensor
std::vector<bool> quantized_outputs_;
Expand All @@ -449,6 +485,9 @@ class OnnxifiOp final : public Operator<Context> {
// max_batch_size
std::unordered_map<std::string, ShapeInfo> input_shape_info_;

// Whether we should use passed output shape hints or do shape inference
bool use_passed_output_shapes_{false};

// Whether we need to resize outputs or not
bool adjust_output_batch_{false};
