[Graph Executor, VM] Add end to end benchmarking of models (apache#8858)
Add benchmarking that includes the overhead of transferring inputs and
outputs to and from the device. This should give an accurate measurement
of the runtime a user would see when using the model. This is
accomplished by adding functions to the graph executor and the VM that
run end to end, from inputs to return values.
Tristan Konolige authored and Andrew Zhao Luo committed Sep 1, 2021
1 parent b3d6d78 commit 2501640
Showing 7 changed files with 229 additions and 41 deletions.
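For context, the end-to-end path added by this commit can be exercised roughly as follows. This is a minimal sketch modeled on the tests at the bottom of this diff; the mlp workload, the input name "data", and the (1, 1, 28, 28) shape are illustrative assumptions.

import numpy as np
import tvm
from tvm import relay
from tvm.contrib import graph_executor
from tvm.relay.testing import mlp

# Build a small model and create a graph executor for it.
mod, params = mlp.get_workload(1)
dev = tvm.cpu()
lib = relay.build(mod, target="llvm", params=params)
exe = graph_executor.create(lib.get_graph_json(), lib.lib, dev)

# With end_to_end=True the reported time includes copying `data` to the device
# and copying the outputs back to the host.
data = tvm.nd.array(np.random.rand(1, 1, 28, 28).astype("float32"))
result = exe.benchmark(dev, data=data, repeat=5, number=1, end_to_end=True)
print(result)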
10 changes: 10 additions & 0 deletions include/tvm/runtime/vm/vm.h
@@ -258,6 +258,16 @@ class VirtualMachine : public runtime::ModuleNode {
*/
void InvokeGlobal(const VMFunction& func, const std::vector<ObjectRef>& args);

/*!
* \brief Set inputs to a function.
* \param name The function name
* \param args args[offset:] are arguments to the
* function. If an argument is not on the correct device for the function,
* it will be copied to that device.
* \param offset Starting offset of the arguments in `args`.
*/
void SetInput(std::string name, TVMArgs args, int offset);

protected:
/*! \brief The virtual machine's packed function table. */
std::vector<PackedFunc> packed_funcs_;
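For reference, this new SetInput overload backs the VM's existing set_input packed function (see src/runtime/vm/vm.cc below); from Python it is reached through VirtualMachine.set_input. A minimal sketch, where vm_exe and data are illustrative assumptions (a compiled VM executable and an input tensor):

# `vm_exe` and `data` are assumptions, not part of this commit.
# set_input copies `data` to whichever device the corresponding parameter of
# "main" is assigned to, converting DLTensors to NDArrays along the way.
vm_exe.set_input("main", data=data)
result = vm_exe.invoke("main")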
34 changes: 31 additions & 3 deletions python/tvm/contrib/graph_executor.py
@@ -321,15 +321,25 @@ def __getitem__(self, key):
"""
return self.module[key]

def benchmark(
self,
device,
func_name="run",
repeat=5,
number=5,
min_repeat_ms=None,
end_to_end=False,
**kwargs,
):
"""Calculate runtime of a function by repeatedly calling it.
Use this function to get an accurate measurement of the runtime of a function. The function
is run multiple times in order to account for variability in measurements, processor speed
or other external factors. Mean, median, standard deviation, min and max runtime are all
reported. On GPUs, CUDA and ROCm specifically, special on-device timers are used so that
synchronization and data transfer operations are not counted towards the runtime. This allows
for fair comparison of runtimes across different functions and models. The `end_to_end` flag
switches this behavior to include data transfer operations in the runtime.
The benchmarking loop looks approximately like so:
@@ -346,7 +356,7 @@ def benchmark(self, device, func_name="run", repeat=5, number=5, min_repeat_ms=N
Parameters
----------
func_name : str
The function to benchmark. This is ignored if `end_to_end` is true.
repeat : int
Number of times to run the outer loop of the timing code (see above). The output will
@@ -363,6 +373,11 @@ def benchmark(self, device, func_name="run", repeat=5, number=5, min_repeat_ms=N
milliseconds. This can be used to ensure that the function is run enough to get an
accurate measurement.
end_to_end : bool
If set, include time to transfer input tensors to the device and time to transfer
returned tensors in the total runtime. This will give accurate timings for end to end
workloads.
kwargs : Dict[str, Object]
Named arguments to the function. These are cached before running timing code, so that
data transfer costs are not counted in the runtime.
@@ -374,6 +389,19 @@ def benchmark(self, device, func_name="run", repeat=5, number=5, min_repeat_ms=N
access the individual runtimes (in seconds).
"""
min_repeat_ms = 0 if min_repeat_ms is None else min_repeat_ms
if end_to_end:
# Have to unpack kwargs into a single list
args = []
for k, v in kwargs.items():
args.append(k)
args.append(v)
return self.module.time_evaluator(
"run_from_inputs",
device,
repeat=repeat,
number=number,
min_repeat_ms=min_repeat_ms,
)(device.device_type, device.device_id, *args)
if kwargs:
self.set_input(**kwargs)
return self.module.time_evaluator(
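Under the hood, the end_to_end branch above amounts to timing the new run_from_inputs packed function, which can also be called directly. A rough sketch, reusing the exe, dev, and data names from the illustrative example near the top of this page:

# `exe`, `dev`, and `data` are carried over from the earlier sketch (assumptions).
# run_from_inputs takes the device the outputs should be copied to, then a flat
# list of (input name, value) pairs, and returns the outputs on that device.
run_from_inputs = exe.module["run_from_inputs"]
outputs = run_from_inputs(dev.device_type, dev.device_id, "data", data)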
37 changes: 34 additions & 3 deletions python/tvm/runtime/vm.py
@@ -509,16 +509,25 @@ def get_input_index(self, input_name, func_name="main"):
return self._get_input_index(input_name, func_name)

def benchmark(
self,
device,
*args,
func_name="main",
repeat=5,
number=5,
min_repeat_ms=None,
end_to_end=False,
**kwargs,
):
"""Calculate runtime of a function by repeatedly calling it.
Use this function to get an accurate measurement of the runtime of a function. The function
is run multiple times in order to account for variability in measurements, processor speed
or other external factors. Mean, median, standard deviation, min and max runtime are all
reported. On GPUs, CUDA and ROCm specifically, special on-device timers are used so that
synchronization and data transfer operations are not counted towards the runtime. This allows
for fair comparison of runtimes across different functions and models. The `end_to_end` flag
switches this behavior to include data transfer operations in the runtime.
The benchmarking loop looks approximately like so:
@@ -552,6 +561,11 @@ def benchmark(
milliseconds. This can be used to ensure that the function is run enough to get an
accurate measurement.
end_to_end : bool
If set, include time to transfer input tensors to the device and time to transfer
returned tensors in the total runtime. This will give accurate timings for end to end
workloads.
args : Sequence[Object]
Arguments to the function. These are cached before running timing code, so that data
transfer costs are not counted in the runtime.
@@ -566,6 +580,23 @@ def benchmark(
access the individual runtimes (in seconds).
"""
min_repeat_ms = 0 if min_repeat_ms is None else min_repeat_ms
if end_to_end:
# We need to unpack keyword arguments into positional arguments
packed_args = list(args)
for k, v in kwargs.items():
i = self.get_input_index(k, func_name)
if i < 0:
raise TypeError(f"{func_name}() got an unexpected keyword argument '{k}'")
while i >= len(packed_args):
packed_args.append(None)
packed_args[i] = v
return self.module.time_evaluator(
"invoke_return_to_device",
device,
repeat=repeat,
number=number,
min_repeat_ms=min_repeat_ms,
)(func_name, device.device_type, device.device_id, *packed_args)
if args or kwargs:
self.set_input(func_name, *args, **kwargs)
return self.module.time_evaluator(
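The VM takes the same approach: with end_to_end=True, benchmark times the new invoke_return_to_device packed function. Roughly equivalent sketch, where vm_exe, dev, and data are assumptions mirroring the tests at the end of this diff:

# `vm_exe`, `dev`, and `data` are illustrative assumptions.
# Arguments are the function name, the device the results should be returned to,
# and then the positional arguments of that function.
invoke_return_to_device = vm_exe.module["invoke_return_to_device"]
out = invoke_return_to_device("main", dev.device_type, dev.device_id, data)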
28 changes: 28 additions & 0 deletions src/runtime/graph_executor/graph_executor.cc
@@ -568,6 +568,34 @@ PackedFunc GraphExecutor::GetFunction(const std::string& name,
[sptr_to_self, this](TVMArgs args, TVMRetValue* rv) { *rv = this->NumInputs(); });
} else if (name == "run") {
return PackedFunc([sptr_to_self, this](TVMArgs args, TVMRetValue* rv) { this->Run(); });
} else if (name == "run_from_inputs") {
return PackedFunc(
[sptr_to_self, this](TVMArgs args, TVMRetValue* rv) {
CHECK(args.size() % 2 == 0)
<< "Number of arguments to run_from_inputs must be an even number of key-value pairs";
Device host{static_cast<DLDeviceType>(args[0].operator int()), args[1].operator int()};
for (int i = 2; i < args.size(); i += 2) {
if (String::CanConvertFrom(args[i])) {
int in_idx = this->GetInputIndex(args[i].operator String());
if (in_idx >= 0) {
this->SetInput(in_idx, args[i + 1]);
} else {
LOG(FATAL) << args[i].operator String() << " is not a valid input name";
}
} else {
this->SetInput(args[i], args[i + 1]);
}
}
this->Run();
Array<NDArray> outputs;
for (int i = 0; i < this->NumOutputs(); i++) {
NDArray out = this->GetOutput(i);
NDArray a = NDArray::Empty(out.Shape(), out.DataType(), host);
a.CopyFrom(out);
outputs.push_back(a);
}
*rv = outputs;
});
} else if (name == "load_params") {
return PackedFunc([sptr_to_self, this](TVMArgs args, TVMRetValue* rv) {
this->LoadParams(args[0].operator std::string());
93 changes: 58 additions & 35 deletions src/runtime/vm/vm.cc
@@ -118,6 +118,7 @@ PackedFunc VirtualMachine::GetFunction(const std::string& name,
if (name == "invoke") {
return PackedFunc([sptr_to_self, this](TVMArgs args, TVMRetValue* rv) {
ICHECK(exec_) << "The executable is not created yet.";

std::string func_name = args[0];
auto git = exec_->global_map.find(func_name);
ICHECK(git != exec_->global_map.end())
@@ -140,6 +141,26 @@ PackedFunc VirtualMachine::GetFunction(const std::string& name,
TVMRetValue rv_;
invoke.CallPacked(args, &rv_);
});
} else if (name == "invoke_return_to_device") {
return PackedFunc([sptr_to_self, this](TVMArgs args, TVMRetValue* rv) {
Device host{static_cast<DLDeviceType>(args[1].operator int()), args[2].operator int()};

SetInput(args[0].operator std::string(), args, 3);
PackedFunc invoke = GetFunction("invoke", sptr_to_self);
TVMRetValue rv_;
invoke.CallPacked(args, &rv_); // Invoke only uses the first arg, so the rest of the args
// should not cause an issue
if (rv_.type_code() == kTVMObjectHandle) {
ADT adt = Downcast<ADT>(rv_.operator ObjectRef());
std::vector<ObjectRef> transfered;
for (size_t i = 0; i < adt.size(); i++) {
transfered.push_back(CopyTo(adt[i], host));
}
*rv = ADT(adt.tag(), transfered);
} else {
*rv = CopyTo(rv_, host);
}
});
} else if (name == "get_output") {
return TypedPackedFunc<NDArray(int64_t)>([this](int64_t index) {
if (this->return_register_.as<ADTObj>()) {
@@ -191,47 +212,49 @@ PackedFunc VirtualMachine::GetFunction(const std::string& name,
this->Init(devices, alloc_types);
});
} else if (name == "set_input") {
return PackedFunc([sptr_to_self, this](TVMArgs args, TVMRetValue* rv) {
ICHECK(exec_) << "The executable is not created yet.";
std::string func_name = args[0];
auto gvit = exec_->global_map.find(func_name);
ICHECK(gvit != exec_->global_map.end()) << "Cannot find function " << func_name;
auto func_index = gvit->second;
const auto& vm_func = exec_->functions[func_index];
const auto& param_names = vm_func.params;
ICHECK_EQ(args.size() - 1, param_names.size())
<< "The number of provided parameters doesn't match the number of arguments";
ICHECK_EQ(param_names.size(), vm_func.params_device_type.size())
<< "The number of provided parameters doesn't match the number of assigned devices";
std::vector<ObjectRef> func_args(param_names.size());
for (int i = 1; i < args.size(); ++i) {
Index device_type = vm_func.params_device_type[i - 1];
Device dev = GetDevice(device_type);

if (args[i].type_code() == kTVMDLTensorHandle) {
// Automatically convert input DLTensors to NDArray
DLTensor* tensor = args[i];
std::vector<int64_t> shape;
for (int64_t i = 0; i < tensor->ndim; i++) {
shape.push_back(tensor->shape[i]);
}
NDArray ary = NDArray::Empty(shape, tensor->dtype, dev);
ary.CopyFrom(tensor);
func_args[i - 1] = ary;
} else {
ObjectRef obj = CopyTo(args[i], dev);
func_args[i - 1] = obj;
}
}
inputs_.erase(func_name);
inputs_.emplace(func_name, func_args);
});
return PackedFunc(
[sptr_to_self, this](TVMArgs args, TVMRetValue* rv) { SetInput(args[0], args, 1); });
} else {
LOG(FATAL) << "Unknown packed function: " << name;
return PackedFunc([sptr_to_self, name](TVMArgs args, TVMRetValue* rv) {});
}
}

void VirtualMachine::SetInput(std::string func_name, TVMArgs args, int offset) {
ICHECK(exec_) << "The executable is not created yet.";
auto gvit = exec_->global_map.find(func_name);
ICHECK(gvit != exec_->global_map.end()) << "Cannot find function " << func_name;
auto func_index = gvit->second;
const auto& vm_func = exec_->functions[func_index];
const auto& param_names = vm_func.params;
ICHECK_EQ(args.size() - offset, param_names.size())
<< "The number of provided parameters doesn't match the number of arguments";
ICHECK_EQ(param_names.size(), vm_func.params_device_type.size())
<< "The number of provided parameters doesn't match the number of assigned devices";
std::vector<ObjectRef> func_args(param_names.size());
for (int i = offset; i < args.size(); ++i) {
Index device_type = vm_func.params_device_type[i - offset];
Device dev = GetDevice(device_type);

if (args[i].type_code() == kTVMDLTensorHandle) {
// Automatically convert input DLTensors to NDArray
DLTensor* tensor = args[i];
std::vector<int64_t> shape;
for (int64_t i = 0; i < tensor->ndim; i++) {
shape.push_back(tensor->shape[i]);
}
NDArray ary = NDArray::Empty(shape, tensor->dtype, dev);
ary.CopyFrom(tensor);
func_args[i - offset] = ary;
} else {
ObjectRef obj = CopyTo(args[i], dev);
func_args[i - offset] = obj;
}
}
inputs_.erase(func_name);
inputs_.emplace(func_name, func_args);
}

inline Device VirtualMachine::GetDevice(Index device_type) const {
ICHECK_GE(devices_.size(), device_type) << "devices_ doesn't contain device:" << device_type;

36 changes: 36 additions & 0 deletions tests/python/relay/test_backend_graph_executor.py
@@ -25,6 +25,8 @@
from tvm.relay.op import add
import tvm.testing
from tvm.relay.testing import mlp
from tvm import rpc
from tvm.contrib import utils

# @tq, @jr should we put this in testing ns?
def check_rts(expr, args, expected_result, mod=None):
@@ -348,5 +350,39 @@ def test_benchmark():
assert result.std == 1.5


@tvm.testing.parametrize_targets("cuda", "llvm")
def test_benchmark_end_to_end(dev, target):
mod, params = mlp.get_workload(1)
lib = relay.build(mod, target=target, params=params)
exe = graph_executor.create(lib.get_graph_json(), lib.lib, dev)
data = tvm.nd.array(np.random.rand(1, 1, 28, 28).astype("float32"))
result = exe.benchmark(dev, data=data, func_name="run", repeat=2, number=1, end_to_end=True)
assert result.mean > 0
assert len(result.results) == 2


@tvm.testing.requires_llvm
def test_benchmark_end_to_end_rpc():
server = rpc.Server("127.0.0.1")
remote = rpc.connect(server.host, server.port)

mod, params = mlp.get_workload(1)
lib = relay.build(mod, target="llvm", params=params)

temp = utils.tempdir()
path = temp.relpath("library.so")
lib.export_library(path)
remote.upload(path)
rlib = remote.load_module("library.so")

dev = remote.cpu()
exe = graph_executor.create(lib.get_graph_json(), rlib, dev)

data = tvm.nd.array(np.random.rand(1, 1, 28, 28).astype("float32"), device=dev)
result = exe.benchmark(dev, data=data, func_name="run", repeat=2, number=1, end_to_end=True)
assert result.mean > 0
assert len(result.results) == 2


if __name__ == "__main__":
pytest.main([__file__])
32 changes: 32 additions & 0 deletions tests/python/relay/test_vm.py
@@ -981,5 +981,37 @@ def test_benchmark():
assert result.std == 1.5


@tvm.testing.parametrize_targets("cuda", "llvm")
def test_benchmark_end_to_end(dev, target):
mod, params = mlp.get_workload(1)
lib = vm.compile(mod, target=target, params=params)
exe = runtime.vm.VirtualMachine(lib, dev)
data = tvm.nd.array(np.random.rand(1, 1, 28, 28).astype("float32"), device=dev)
result = exe.benchmark(dev, data, func_name="main", repeat=2, number=1, end_to_end=True)
assert result.mean > 0


@tvm.testing.requires_llvm
def test_benchmark_end_to_end_rpc():
server = rpc.Server("127.0.0.1")
remote = rpc.connect(server.host, server.port)

mod, params = mlp.get_workload(1)
lib = vm.compile(mod, target="llvm", params=params)

temp = utils.tempdir()
path = temp.relpath("vm_library.so")
lib.mod.export_library(path)
remote.upload(path)
rlib = remote.load_module("vm_library.so")

exe = runtime.vm.VirtualMachine(rlib, remote.cpu())
data = tvm.nd.array(np.random.rand(1, 1, 28, 28).astype("float32"), device=remote.cpu())
result = exe.benchmark(
remote.cpu(), data=data, func_name="main", repeat=2, number=1, end_to_end=True
)
assert result.mean > 0


if __name__ == "__main__":
pytest.main([__file__])
