[Onnxifi] Warmup cache of output shapes (pytorch#48346)
Summary:
Pull Request resolved: pytorch#48346

Onnxifi now accepts output shape info for all possible batch sizes. This is used to avoid doing shape inference inside `OnnxifiOp::extractOutputBatchSizes()`.

FB:
In this diff we pre-calculate output shapes for all possible batch sizes inside `PredictorContainer`, where we have enough data to do so. This data is then passed down to `OnnxifiOp`.
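
The warm-up idea, as a minimal standalone C++ sketch (hypothetical names such as `inferOutputShapes` and `warmupOutputShapes`; not the actual `PredictorContainer` code): shapes are inferred once per batch size up front, so the hot path later reduces to a hash-map lookup.

```cpp
// Standalone sketch of the warm-up: pre-compute output shapes for every
// batch size once, so run time becomes a map lookup instead of a
// shape-inference call.
#include <cstdint>
#include <iostream>
#include <unordered_map>
#include <vector>

using Shape = std::vector<uint64_t>;
// batch size -> one shape per op output
using ShapesPerBatchSize = std::unordered_map<int, std::vector<Shape>>;

// Hypothetical stand-in for bound shape inference at a given batch size.
std::vector<Shape> inferOutputShapes(int batch_size) {
  return {{static_cast<uint64_t>(batch_size), 16}};  // e.g. one output [bs, 16]
}

// Warm-up: done once, where there is enough data to cover all batch sizes.
ShapesPerBatchSize warmupOutputShapes(int max_batch_size) {
  ShapesPerBatchSize cache;
  for (int bs = 1; bs <= max_batch_size; ++bs) {
    cache.emplace(bs, inferOutputShapes(bs));
  }
  return cache;
}

int main() {
  const auto cache = warmupOutputShapes(/*max_batch_size=*/32);
  // Hot path: a lookup instead of a 40-50 ms shape-inference call.
  const auto& shapes = cache.at(17);
  std::cout << "output 0 dims: " << shapes[0][0] << " x " << shapes[0][1] << "\n";
  return 0;
}
```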

Here is the dependency graph that I built manually while trying to understand the entire flow.
https://pxl.cl/1rQRv

Test Plan:
Strobelight data https://fburl.com/strobelight/jlhhgt21 shows that `OnnxifiOp::RunOnDevice()` now takes only 2.17% of CPU instead of ~20% CPU with the current implementation.

Also, the current implementation takes dozens of milliseconds according to ipiszy:
> After adding more logs, I found each shape inference call actually takes 40~50ms.

I also temporarily added time measurements for `OnnxifiOp::extractOutputBatchSizes()`. The new implementation typically takes 1 to 4 microseconds, and, when data for the current batch size is not yet present in `output_reshape_info_`, it takes 20-40 microseconds, which is still much better than the current implementation.
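
The gap between the 1-4 microsecond and 20-40 microsecond cases comes from the per-batch-size cache being filled lazily on first use. A minimal sketch of that pattern, under assumed names (`buildReshapeInfo`, `getReshapeInfo`) rather than the operator's actual members:

```cpp
// Sketch of lazy per-batch-size caching: only the first call for a given
// batch size pays the fill cost; later calls are a plain hash-map lookup.
#include <cstdint>
#include <unordered_map>
#include <vector>

struct ReshapeInfo {
  std::vector<std::vector<int32_t>> begins;
  std::vector<std::vector<int32_t>> ends;
};

std::unordered_map<int, ReshapeInfo> reshape_info_cache;

// Hypothetical fill step using shapes already known for this batch size.
ReshapeInfo buildReshapeInfo(int batch_size) {
  ReshapeInfo info;
  info.begins.push_back({0, 0});
  info.ends.push_back({batch_size, 16});
  return info;
}

const ReshapeInfo& getReshapeInfo(int batch_size) {
  auto it = reshape_info_cache.find(batch_size);
  if (it == reshape_info_cache.end()) {
    // First request at this batch size: fill the cache (the slower path).
    it = reshape_info_cache.emplace(batch_size, buildReshapeInfo(batch_size)).first;
  }
  return it->second;  // Cached path: just the lookup.
}

int main() {
  const auto& first = getReshapeInfo(8);   // fills the cache
  const auto& second = getReshapeInfo(8);  // pure lookup
  return first.ends[0][0] == second.ends[0][0] ? 0 : 1;
}
```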

AF canary https://www.internalfb.com/intern/ads/canary/431357944274985799
AI canary https://www.internalfb.com/intern/ads/canary/431365503038313840

Verifying using test tier https://pxl.cl/1sZ4S

Reviewed By: yinghai, ipiszy

Differential Revision: D25047110

fbshipit-source-id: 872dc1578a1e8e7c3ade5f5e2711e77ba290a671
khabinov authored and facebook-github-bot committed Dec 4, 2020
1 parent 0a42003 commit ba3962f
Showing 6 changed files with 216 additions and 114 deletions.
8 changes: 5 additions & 3 deletions caffe2/opt/glow_net_transform.cc
@@ -108,12 +108,13 @@ void onnxifi(
const std::vector<std::string>& output_names,
const std::vector<std::string>& weight_names,
const std::unordered_set<int>& blacklist,
const ShapeInfoMap& shape_hints,
const ShapeInfoMap& shape_hints_max_bs,
bool use_onnx,
size_t max_batch_size,
size_t max_seq_size,
bool load_model_by_blob,
bool predictor_net_ssa_rewritten) {
bool predictor_net_ssa_rewritten,
const std::unordered_map<int, ShapeInfoMap> &shape_hints_per_bs) {
// Split SparseLengthsSumSparse so that we can lower the SparseLengthsSum part
splitSparseLengthsSumSparse(net, *ws);

@@ -143,8 +144,9 @@ void onnxifi(
opts.merge_fp32_inputs_into_fp16 = FLAGS_merge_fp32_inputs_into_fp16;
opts.predictor_net_ssa_rewritten = predictor_net_ssa_rewritten;
opts.timeout = FLAGS_onnxifi_timeout_ms;
opts.shape_hints_per_bs = shape_hints_per_bs;

ShapeInfoMap more_shape_hints = shape_hints;
ShapeInfoMap more_shape_hints = shape_hints_max_bs;
if (!FLAGS_onnxifi_shape_hints.empty()) {
parseShapeInfoMapFromString(FLAGS_onnxifi_shape_hints, more_shape_hints);
}
5 changes: 3 additions & 2 deletions caffe2/opt/glow_net_transform.h
@@ -26,12 +26,13 @@ void onnxifi(
const std::vector<std::string>& output_names,
const std::vector<std::string>& weight_names,
const std::unordered_set<int>& blacklist,
const ShapeInfoMap& shape_hints,
const ShapeInfoMap& shape_hints_max_bs,
bool use_onnx,
size_t max_batch_size = 0,
size_t max_seq_size = 0,
bool load_model_by_blob = false,
bool predictor_net_ssa_rewritten = false);
bool predictor_net_ssa_rewritten = false,
const std::unordered_map<int, ShapeInfoMap> &shape_hints_per_bs = {});

std::unordered_set<int> ParseNetPositionList(const std::string& str);
std::unordered_set<std::string> ParseBlackListOps(const std::string& str);
152 changes: 85 additions & 67 deletions caffe2/opt/onnxifi_op.cc
@@ -300,6 +300,46 @@ details::OutputReshapeInfo OnnxifiOp<CPUContext>::initOutputReshapeInfo()
return output_reshape_info;
}

template <>
template <typename DimContainer>
void OnnxifiOp<CPUContext>::fillOutputReshapeInfo(
const DimContainer& real_shape,
c10::ArrayRef<uint64_t> max_shape,
details::OutputReshapeInfo &output_reshape_info,
int currentIndex) {
CAFFE_ENFORCE_EQ(real_shape.size(), max_shape.size());
const auto dim_size = real_shape.size();
auto& begin = output_reshape_info.begins[currentIndex];
begin.Resize(dim_size);
int32_t* begin_ptr = begin.template mutable_data<int32_t>();
auto& end = output_reshape_info.ends[currentIndex];
end.Resize(dim_size);
int32_t* end_ptr = end.template mutable_data<int32_t>();
int32_t mismatch = 0;
for (int j = 0; j < dim_size; ++j) {
CAFFE_ENFORCE_GE(
max_shape[j],
real_shape[j],
"It is weird that max shape of ",
output_names_[currentIndex],
" is smaller than real shape at dim ",
j,
" (",
max_shape[j],
" vs ",
real_shape[j],
")");
begin_ptr[j] = 0;
if (max_shape[j] >= real_shape[j]) {
end_ptr[j] = real_shape[j];
mismatch += j;
} else {
end_ptr[j] = -1;
}
}
output_reshape_info.fast_path[currentIndex] = !mismatch;
}

template <>
int OnnxifiOp<CPUContext>::extractOutputBatchSizes() {
if (use_onnx_ || !adjust_output_batch_) {
@@ -337,77 +377,55 @@ int OnnxifiOp<CPUContext>::extractOutputBatchSizes() {
return current_batch_size;
}

auto it =
output_reshape_info_.emplace(current_batch_size, initOutputReshapeInfo());
auto& output_reshape_info = it.first->second;
BoundShapeSpec spec(dims[0], max_seq_size_);
auto bound_shape_inferencer =
BoundShapeInferencerRegistry()->Create("C10", spec);
for (int i = 0; i < InputSize(); ++i) {
at::IntArrayRef dim0;
bool quantized = false;
if (this->template InputIsType<int8::Int8TensorCPU>(i)) {
const auto& input_tensor_int8 =
this->template Input<int8::Int8TensorCPU>(i);
const auto& t0 = input_tensor_int8.t;
dim0 = t0.sizes();
quantized = true;
} else {
const auto& t0 = Input(i);
dim0 = t0.sizes();
}
TensorShape shape;
for (const auto d : dim0) {
shape.add_dims(d);
}
std::vector<TensorBoundShape::DimType> dim_type(
shape.dims_size(), TensorBoundShape_DimType_CONSTANT);
if (dim_type.size()) {
dim_type[0] = TensorBoundShape_DimType_BATCH;
auto& output_reshape_info = output_reshape_info_.emplace(current_batch_size, initOutputReshapeInfo()).first->second;

if (use_passed_output_shapes_) {
auto shape_info_it = output_shapes_per_bs_.find(current_batch_size);
CAFFE_ENFORCE(shape_info_it != output_shapes_per_bs_.end(), "Unable to find outputs shapes for bs=", current_batch_size);
CAFFE_ENFORCE_EQ(shape_info_it->second.size(), OutputSize());

for (int i = 0; i < OutputSize(); ++i) {
fillOutputReshapeInfo(shape_info_it->second[i], output_shapes_max_bs_[i], output_reshape_info, i);
}
input_shape_info_[input_names_[i]] =
ShapeInfo(dim_type, std::move(shape), quantized);
}
bound_shape_inferencer->InferBoundShapeAndType(
netdef_, input_shape_info_, nullptr, false);
const auto& shape_info = bound_shape_inferencer->shape_info();
for (int i = 0; i < OutputSize(); ++i) {
const auto it = shape_info.find(output_names_[i]);
CAFFE_ENFORCE(it != shape_info.end());
const auto& real_shape = it->second.shape;
const auto& max_shape = output_shapes_[i];
CAFFE_ENFORCE_EQ(real_shape.dims_size(), max_shape.size());
const auto dim_size = real_shape.dims_size();
auto& begin = output_reshape_info.begins[i];
begin.Resize(dim_size);
int32_t* begin_ptr = begin.template mutable_data<int32_t>();
auto& end = output_reshape_info.ends[i];
end.Resize(dim_size);
int32_t* end_ptr = end.template mutable_data<int32_t>();
int32_t mismatch = 0;
for (int j = 0; j < dim_size; ++j) {
CAFFE_ENFORCE_GE(
max_shape[j],
real_shape.dims(j),
"It is weird that max shape of ",
output_names_[i],
" is smaller than real shape at dim ",
j,
" (",
max_shape[j],
" vs ",
real_shape.dims(j),
")");
begin_ptr[j] = 0;
if (max_shape[j] >= real_shape.dims(j)) {
end_ptr[j] = real_shape.dims(j);
mismatch += j;
} else {
BoundShapeSpec spec(dims[0], max_seq_size_);
auto bound_shape_inferencer =
BoundShapeInferencerRegistry()->Create("C10", spec);
for (int i = 0; i < InputSize(); ++i) {
at::IntArrayRef dim0;
bool quantized = false;
if (this->template InputIsType<int8::Int8TensorCPU>(i)) {
const auto& input_tensor_int8 =
this->template Input<int8::Int8TensorCPU>(i);
const auto& t0 = input_tensor_int8.t;
dim0 = t0.sizes();
quantized = true;
} else {
end_ptr[j] = -1;
const auto& t0 = Input(i);
dim0 = t0.sizes();
}
TensorShape shape;
for (const auto d : dim0) {
shape.add_dims(d);
}
std::vector<TensorBoundShape::DimType> dim_type(
shape.dims_size(), TensorBoundShape_DimType_CONSTANT);
if (dim_type.size()) {
dim_type[0] = TensorBoundShape_DimType_BATCH;
}
input_shape_info_[input_names_[i]] =
ShapeInfo(dim_type, std::move(shape), quantized);
}
bound_shape_inferencer->InferBoundShapeAndType(
netdef_, input_shape_info_, nullptr, false);
const auto& shape_info = bound_shape_inferencer->shape_info();
for (int i = 0; i < OutputSize(); ++i) {
const auto find_res = shape_info.find(output_names_[i]);
CAFFE_ENFORCE(find_res != shape_info.end());
fillOutputReshapeInfo(find_res->second.shape.dims(), output_shapes_max_bs_[i], output_reshape_info, i);
}
output_reshape_info.fast_path[i] = !mismatch;
}

return current_batch_size;
}

@@ -458,7 +476,7 @@ void OnnxifiOp<CPUContext>::setOutputShapeAndType(int output_idx) {
tensor_descriptor.dimensions = tensor_dims.size();
CAFFE_ENFORCE(
tensor_descriptor.dimensions != 0, tensor_descriptor.name, " has 0 dim");
auto& output_shape = output_shapes_[output_idx];
auto& output_shape = output_shapes_max_bs_[output_idx];
output_shape.clear();
output_shape.insert(
output_shape.begin(), tensor_dims.cbegin(), tensor_dims.cend());
47 changes: 43 additions & 4 deletions caffe2/opt/onnxifi_op.h
@@ -19,7 +19,7 @@ namespace caffe2 {
namespace details {

/// Provides slicing info for the outputs. All the vector members should be of
/// the same size as number of outpus of the Onnxifi op.
/// the same size as number of outputs of the Onnxifi op.
struct OutputReshapeInfo {
std::vector<Tensor> begins;
std::vector<Tensor> ends;
@@ -55,6 +55,7 @@ class OnnxifiOp final : public Operator<Context> {
timeout_(this->template GetSingleArgument<int>("timeout", 0)),
nominal_batch_idx_(
this->template GetSingleArgument<int>("nominal_batch_idx", 0)),
use_passed_output_shapes_(this->template GetSingleArgument<int>("use_passed_output_shapes", 0)),
adjust_quantized_offset_(this->template GetSingleArgument<int>(
"adjust_quantized_offset",
128)) {
@@ -86,7 +87,7 @@ class OnnxifiOp final : public Operator<Context> {
all_offsets_.reserve(ws->Blobs().size());
all_scales_.reserve(ws->Blobs().size());
input_shapes_.resize(input_names_.size());
output_shapes_.resize(output_names_.size());
output_shapes_max_bs_.resize(output_names_.size());
quantized_outputs_.resize(output_names_.size(), false);
int output_idx = 0;
ArgumentHelper helper(operator_def);
@@ -127,6 +128,30 @@ class OnnxifiOp final : public Operator<Context> {
adjust_quantized_offset_ = 0;
}

if (use_passed_output_shapes_) {
// Populate output_shapes_per_bs_
for (int bs = 1; bs < max_batch_size_; ++bs) {
auto output_shapes_tp = helper.GetRepeatedArgument<TensorProto>("output_shapes_bs_" + caffe2::to_string(bs));
auto output_qshapes_tp = helper.GetRepeatedArgument<TensorProto>("output_qshapes_bs_" + caffe2::to_string(bs));
CAFFE_ENFORCE_EQ(output_names_.size(), output_shapes_tp.size() + output_qshapes_tp.size());

std::unordered_map<std::string, details::TensorInfo> name_to_shape;
for (const auto& output_shape_tp : output_shapes_tp) {
name_to_shape.emplace(output_shape_tp.name(), details::TensorInfo{output_shape_tp});
}
for (const auto& output_qshape_tp : output_qshapes_tp) {
name_to_shape.emplace(output_qshape_tp.name(), details::TensorInfo{output_qshape_tp});
}

for (output_idx = 0; output_idx < output_names_.size(); ++output_idx) {
auto it = name_to_shape.find(output_names_[output_idx]);
output_shapes_per_bs_[bs].push_back({});
auto &output_shapes = output_shapes_per_bs_[bs].back();
std::copy(it->second.dims.cbegin(), it->second.dims.cend(), std::back_inserter(output_shapes));
}
}
}

// Get output resizing hints
adjust_output_batch_ =
this->template GetSingleArgument<int>("adjust_output_batch", 0);
@@ -333,6 +358,14 @@ class OnnxifiOp final : public Operator<Context> {
#endif
}

/// Helper method for extractOutputBatchSizes(), used to deduplicate code of populating output reshape infos
template <typename DimContainer>
void fillOutputReshapeInfo(
const DimContainer& real_shape,
c10::ArrayRef<uint64_t> max_shape,
details::OutputReshapeInfo &output_reshape_info,
int index);

/// Extract output batch size. If the output batch size is going to be at
/// max_batch_size_, return true indicating that no output shape adjustment is
/// needed. Otherwise, return false.
@@ -418,7 +451,7 @@ class OnnxifiOp final : public Operator<Context> {
int nominal_batch_idx_{0};

// We bind the op input/output by position while ONNXIFI binds input/output by
// names. In addition, op input/output names can be writtten by, for example,
// names. In addition, op input/output names can be written by, for example,
// memonger. We cache the original input/output name of ONNX object here and
// bind them by position.
std::vector<std::string> input_names_;
@@ -428,7 +461,10 @@ class OnnxifiOp final : public Operator<Context> {
NetDef netdef_;

std::vector<c10::SmallVector<uint64_t, 4>> input_shapes_;
std::vector<c10::SmallVector<uint64_t, 4>> output_shapes_;
std::vector<c10::SmallVector<uint64_t, 4>> output_shapes_max_bs_;

// Mapping of batch sizes to output shapes
std::unordered_map<int, std::vector<c10::SmallVector<uint64_t, 4>>> output_shapes_per_bs_;

// Indicate if i-th output is a quantized tensor
std::vector<bool> quantized_outputs_;
Expand All @@ -449,6 +485,9 @@ class OnnxifiOp final : public Operator<Context> {
// max_batch_size
std::unordered_map<std::string, ShapeInfo> input_shape_info_;

// Whether we should use passed output shape hints or do shape inference
bool use_passed_output_shapes_{false};

// Whether we need to resize outputs or not
bool adjust_output_batch_{false};
