[Graph Executor, VM] Add end to end benchmarking of models (apache#8858)
Add benchmarking that includes the overhead of transferring inputs and
outputs to and from the device. This should give an accurate measurement
of the runtime a user would see when using the model. This is
accomplished by adding functions to the graph executor and the VM that
run end to end, from inputs to return values.
Tristan Konolige authored and Andrew Zhao Luo committed Sep 1, 2021
1 parent b3d6d78 commit 2501640
Showing 7 changed files with 229 additions and 41 deletions.
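For context, the end-to-end path added by this commit can be exercised roughly as follows. This is a minimal sketch modeled on the tests at the bottom of this diff; the mlp workload, the input name "data", and the (1, 1, 28, 28) shape are illustrative assumptions.

import numpy as np
import tvm
from tvm import relay
from tvm.contrib import graph_executor
from tvm.relay.testing import mlp

# Build a small model and create a graph executor for it.
mod, params = mlp.get_workload(1)
dev = tvm.cpu()
lib = relay.build(mod, target="llvm", params=params)
exe = graph_executor.create(lib.get_graph_json(), lib.lib, dev)

# With end_to_end=True the reported time includes copying `data` to the device
# and copying the outputs back to the host.
data = tvm.nd.array(np.random.rand(1, 1, 28, 28).astype("float32"))
result = exe.benchmark(dev, data=data, repeat=5, number=1, end_to_end=True)
print(result)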
10 changes: 10 additions & 0 deletions include/tvm/runtime/vm/vm.h
@@ -258,6 +258,16 @@ class VirtualMachine : public runtime::ModuleNode {
*/
void InvokeGlobal(const VMFunction& func, const std::vector<ObjectRef>& args);

/*!
* \brief Set inputs to a function.
* \param name The function name
* \param args args[offset:] are arguments to the
* function. If an argument is not on the correct device for the function,
* it will be copied to that device.
* \param offset Starting offset of the arguments in `args`.
*/
void SetInput(std::string name, TVMArgs args, int offset);

protected:
/*! \brief The virtual machine's packed function table. */
std::vector<PackedFunc> packed_funcs_;
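For reference, this new SetInput overload backs the VM's existing set_input packed function (see src/runtime/vm/vm.cc below); from Python it is reached through VirtualMachine.set_input. A minimal sketch, where vm_exe and data are illustrative assumptions (a compiled VM executable and an input tensor):

# `vm_exe` and `data` are assumptions, not part of this commit.
# set_input copies `data` to whichever device the corresponding parameter of
# "main" is assigned to, converting DLTensors to NDArrays along the way.
vm_exe.set_input("main", data=data)
result = vm_exe.invoke("main")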
34 changes: 31 additions & 3 deletions python/tvm/contrib/graph_executor.py
@@ -321,15 +321,25 @@ def __getitem__(self, key):
"""
return self.module[key]

def benchmark(
self,
device,
func_name="run",
repeat=5,
number=5,
min_repeat_ms=None,
end_to_end=False,
**kwargs,
):
"""Calculate runtime of a function by repeatedly calling it.
Use this function to get an accurate measurement of the runtime of a function. The function
is run multiple times in order to account for variability in measurements, processor speed
or other external factors. Mean, median, standard deviation, min and max runtime are all
reported. On GPUs, CUDA and ROCm specifically, special on-device timers are used so that
synchronization and data transfer operations are not counted towards the runtime. This allows
for fair comparison of runtimes across different functions and models. The `end_to_end` flag
switches this behavior to include data transfer operations in the runtime.
The benchmarking loop looks approximately like so:
@@ -346,7 +356,7 @@ def benchmark(self, device, func_name="run", repeat=5, number=5, min_repeat_ms=N
Parameters
----------
func_name : str
The function to benchmark. This is ignored if `end_to_end` is true.
repeat : int
Number of times to run the outer loop of the timing code (see above). The output will
@@ -363,6 +373,11 @@ def benchmark(self, device, func_name="run", repeat=5, number=5, min_repeat_ms=N
milliseconds. This can be used to ensure that the function is run enough to get an
accurate measurement.
end_to_end : bool
If set, include time to transfer input tensors to the device and time to transfer
returned tensors in the total runtime. This will give accurate timings for end to end
workloads.
kwargs : Dict[str, Object]
Named arguments to the function. These are cached before running timing code, so that
data transfer costs are not counted in the runtime.
@@ -374,6 +389,19 @@ def benchmark(self, device, func_name="run", repeat=5, number=5, min_repeat_ms=N
access the individual runtimes (in seconds).
"""
min_repeat_ms = 0 if min_repeat_ms is None else min_repeat_ms
if end_to_end:
# Have to unpack kwargs into a single list
args = []
for k, v in kwargs.items():
args.append(k)
args.append(v)
return self.module.time_evaluator(
"run_from_inputs",
device,
repeat=repeat,
number=number,
min_repeat_ms=min_repeat_ms,
)(device.device_type, device.device_id, *args)
if kwargs:
self.set_input(**kwargs)
return self.module.time_evaluator(
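Under the hood, the end_to_end branch above amounts to timing the new run_from_inputs packed function, which can also be called directly. A rough sketch, reusing the exe, dev, and data names from the illustrative example near the top of this page:

# `exe`, `dev`, and `data` are carried over from the earlier sketch (assumptions).
# run_from_inputs takes the device the outputs should be copied to, then a flat
# list of (input name, value) pairs, and returns the outputs on that device.
run_from_inputs = exe.module["run_from_inputs"]
outputs = run_from_inputs(dev.device_type, dev.device_id, "data", data)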
37 changes: 34 additions & 3 deletions python/tvm/runtime/vm.py
@@ -509,16 +509,25 @@ def get_input_index(self, input_name, func_name="main"):
return self._get_input_index(input_name, func_name)

def benchmark(
self,
device,
*args,
func_name="main",
repeat=5,
number=5,
min_repeat_ms=None,
end_to_end=False,
**kwargs,
):
"""Calculate runtime of a function by repeatedly calling it.
Use this function to get an accurate measurement of the runtime of a function. The function
is run multiple times in order to account for variability in measurements, processor speed
or other external factors. Mean, median, standard deviation, min and max runtime are all
reported. On GPUs, CUDA and ROCm specifically, special on-device timers are used so that
synchronization and data transfer operations are not counted towards the runtime. This allows
for fair comparison of runtimes across different functions and models. The `end_to_end` flag
switches this behavior to include data transfer operations in the runtime.
The benchmarking loop looks approximately like so:
@@ -552,6 +561,11 @@ def benchmark(
milliseconds. This can be used to ensure that the function is run enough to get an
accurate measurement.
end_to_end : bool
If set, include time to transfer input tensors to the device and time to transfer
returned tensors in the total runtime. This will give accurate timings for end to end
workloads.
args : Sequence[Object]
Arguments to the function. These are cached before running timing code, so that data
transfer costs are not counted in the runtime.
@@ -566,6 +580,23 @@ def benchmark(
access the individual runtimes (in seconds).
"""
min_repeat_ms = 0 if min_repeat_ms is None else min_repeat_ms
if end_to_end:
# We need to unpack keyword arguments into positional arguments
packed_args = list(args)
for k, v in kwargs.items():
i = self.get_input_index(k, func_name)
if i < 0:
raise TypeError(f"{func_name}() got an unexpected keyword argument '{k}'")
while i >= len(packed_args):
packed_args.append(None)
packed_args[i] = v
return self.module.time_evaluator(
"invoke_return_to_device",
device,
repeat=repeat,
number=number,
min_repeat_ms=min_repeat_ms,
)(func_name, device.device_type, device.device_id, *packed_args)
if args or kwargs:
self.set_input(func_name, *args, **kwargs)
return self.module.time_evaluator(
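The VM takes the same approach: with end_to_end=True, benchmark times the new invoke_return_to_device packed function. Roughly equivalent sketch, where vm_exe, dev, and data are assumptions mirroring the tests at the end of this diff:

# `vm_exe`, `dev`, and `data` are illustrative assumptions.
# Arguments are the function name, the device the results should be returned to,
# and then the positional arguments of that function.
invoke_return_to_device = vm_exe.module["invoke_return_to_device"]
out = invoke_return_to_device("main", dev.device_type, dev.device_id, data)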
28 changes: 28 additions & 0 deletions src/runtime/graph_executor/graph_executor.cc
@@ -568,6 +568,34 @@ PackedFunc GraphExecutor::GetFunction(const std::string& name,
[sptr_to_self, this](TVMArgs args, TVMRetValue* rv) { *rv = this->NumInputs(); });
} else if (name == "run") {
return PackedFunc([sptr_to_self, this](TVMArgs args, TVMRetValue* rv) { this->Run(); });
} else if (name == "run_from_inputs") {
return PackedFunc(
[sptr_to_self, this](TVMArgs args, TVMRetValue* rv) {
CHECK(args.size() % 2 == 0)
<< "Number of arguments to run_from_inputs must be an even number of key-value pairs";
Device host{static_cast<DLDeviceType>(args[0].operator int()), args[1].operator int()};
for (int i = 2; i < args.size(); i += 2) {
if (String::CanConvertFrom(args[i])) {
int in_idx = this->GetInputIndex(args[i].operator String());
if (in_idx >= 0) {
this->SetInput(in_idx, args[i + 1]);
} else {
LOG(FATAL) << args[i].operator String() << " is not a valid input name";
}
} else {
this->SetInput(args[i], args[i + 1]);
}
}
this->Run();
Array<NDArray> outputs;
for (int i = 0; i < this->NumOutputs(); i++) {
NDArray out = this->GetOutput(i);
NDArray a = NDArray::Empty(out.Shape(), out.DataType(), host);
a.CopyFrom(out);
outputs.push_back(a);
}
*rv = outputs;
});
} else if (name == "load_params") {
return PackedFunc([sptr_to_self, this](TVMArgs args, TVMRetValue* rv) {
this->LoadParams(args[0].operator std::string());
93 changes: 58 additions & 35 deletions src/runtime/vm/vm.cc
@@ -118,6 +118,7 @@ PackedFunc VirtualMachine::GetFunction(const std::string& name,
if (name == "invoke") {
return PackedFunc([sptr_to_self, this](TVMArgs args, TVMRetValue* rv) {
ICHECK(exec_) << "The executable is not created yet.";

std::string func_name = args[0];
auto git = exec_->global_map.find(func_name);
ICHECK(git != exec_->global_map.end())
@@ -140,6 +141,26 @@ PackedFunc VirtualMachine::GetFunction(const std::string& name,
TVMRetValue rv_;
invoke.CallPacked(args, &rv_);
});
} else if (name == "invoke_return_to_device") {
return PackedFunc([sptr_to_self, this](TVMArgs args, TVMRetValue* rv) {
Device host{static_cast<DLDeviceType>(args[1].operator int()), args[2].operator int()};

SetInput(args[0].operator std::string(), args, 3);
PackedFunc invoke = GetFunction("invoke", sptr_to_self);
TVMRetValue rv_;
invoke.CallPacked(args, &rv_); // Invoke only uses the first arg, so the rest of the args
// should not cause an issue
if (rv_.type_code() == kTVMObjectHandle) {
ADT adt = Downcast<ADT>(rv_.operator ObjectRef());
std::vector<ObjectRef> transfered;
for (size_t i = 0; i < adt.size(); i++) {
transfered.push_back(CopyTo(adt[i], host));
}
*rv = ADT(adt.tag(), transfered);
} else {
*rv = CopyTo(rv_, host);
}
});
} else if (name == "get_output") {
return TypedPackedFunc<NDArray(int64_t)>([this](int64_t index) {
if (this->return_register_.as<ADTObj>()) {
@@ -191,47 +212,49 @@ PackedFunc VirtualMachine::GetFunction(const std::string& name,
this->Init(devices, alloc_types);
});
} else if (name == "set_input") {
return PackedFunc([sptr_to_self, this](TVMArgs args, TVMRetValue* rv) {
ICHECK(exec_) << "The executable is not created yet.";
std::string func_name = args[0];
auto gvit = exec_->global_map.find(func_name);
ICHECK(gvit != exec_->global_map.end()) << "Cannot find function " << func_name;
auto func_index = gvit->second;
const auto& vm_func = exec_->functions[func_index];
const auto& param_names = vm_func.params;
ICHECK_EQ(args.size() - 1, param_names.size())
<< "The number of provided parameters doesn't match the number of arguments";
ICHECK_EQ(param_names.size(), vm_func.params_device_type.size())
<< "The number of provided parameters doesn't match the number of assigned devices";
std::vector<ObjectRef> func_args(param_names.size());
for (int i = 1; i < args.size(); ++i) {
Index device_type = vm_func.params_device_type[i - 1];
Device dev = GetDevice(device_type);

if (args[i].type_code() == kTVMDLTensorHandle) {
// Automatically convert input DLTensors to NDArray
DLTensor* tensor = args[i];
std::vector<int64_t> shape;
for (int64_t i = 0; i < tensor->ndim; i++) {
shape.push_back(tensor->shape[i]);
}
NDArray ary = NDArray::Empty(shape, tensor->dtype, dev);
ary.CopyFrom(tensor);
func_args[i - 1] = ary;
} else {
ObjectRef obj = CopyTo(args[i], dev);
func_args[i - 1] = obj;
}
}
inputs_.erase(func_name);
inputs_.emplace(func_name, func_args);
});
return PackedFunc(
[sptr_to_self, this](TVMArgs args, TVMRetValue* rv) { SetInput(args[0], args, 1); });
} else {
LOG(FATAL) << "Unknown packed function: " << name;
return PackedFunc([sptr_to_self, name](TVMArgs args, TVMRetValue* rv) {});
}
}

void VirtualMachine::SetInput(std::string func_name, TVMArgs args, int offset) {
ICHECK(exec_) << "The executable is not created yet.";
auto gvit = exec_->global_map.find(func_name);
ICHECK(gvit != exec_->global_map.end()) << "Cannot find function " << func_name;
auto func_index = gvit->second;
const auto& vm_func = exec_->functions[func_index];
const auto& param_names = vm_func.params;
ICHECK_EQ(args.size() - offset, param_names.size())
<< "The number of provided parameters doesn't match the number of arguments";
ICHECK_EQ(param_names.size(), vm_func.params_device_type.size())
<< "The number of provided parameters doesn't match the number of assigned devices";
std::vector<ObjectRef> func_args(param_names.size());
for (int i = offset; i < args.size(); ++i) {
Index device_type = vm_func.params_device_type[i - offset];
Device dev = GetDevice(device_type);

if (args[i].type_code() == kTVMDLTensorHandle) {
// Automatically convert input DLTensors to NDArray
DLTensor* tensor = args[i];
std::vector<int64_t> shape;
for (int64_t i = 0; i < tensor->ndim; i++) {
shape.push_back(tensor->shape[i]);
}
NDArray ary = NDArray::Empty(shape, tensor->dtype, dev);
ary.CopyFrom(tensor);
func_args[i - offset] = ary;
} else {
ObjectRef obj = CopyTo(args[i], dev);
func_args[i - offset] = obj;
}
}
inputs_.erase(func_name);
inputs_.emplace(func_name, func_args);
}

inline Device VirtualMachine::GetDevice(Index device_type) const {
ICHECK_GE(devices_.size(), device_type) << "devices_ doesn't contain device:" << device_type;

36 changes: 36 additions & 0 deletions tests/python/relay/test_backend_graph_executor.py
@@ -25,6 +25,8 @@
from tvm.relay.op import add
import tvm.testing
from tvm.relay.testing import mlp
from tvm import rpc
from tvm.contrib import utils

# @tq, @jr should we put this in testing ns?
def check_rts(expr, args, expected_result, mod=None):
@@ -348,5 +350,39 @@ def test_benchmark():
assert result.std == 1.5


@tvm.testing.parametrize_targets("cuda", "llvm")
def test_benchmark_end_to_end(dev, target):
mod, params = mlp.get_workload(1)
lib = relay.build(mod, target=target, params=params)
exe = graph_executor.create(lib.get_graph_json(), lib.lib, dev)
data = tvm.nd.array(np.random.rand(1, 1, 28, 28).astype("float32"))
result = exe.benchmark(dev, data=data, func_name="run", repeat=2, number=1, end_to_end=True)
assert result.mean > 0
assert len(result.results) == 2


@tvm.testing.requires_llvm
def test_benchmark_end_to_end_rpc():
server = rpc.Server("127.0.0.1")
remote = rpc.connect(server.host, server.port)

mod, params = mlp.get_workload(1)
lib = relay.build(mod, target="llvm", params=params)

temp = utils.tempdir()
path = temp.relpath("library.so")
lib.export_library(path)
remote.upload(path)
rlib = remote.load_module("library.so")

dev = remote.cpu()
exe = graph_executor.create(lib.get_graph_json(), rlib, dev)

data = tvm.nd.array(np.random.rand(1, 1, 28, 28).astype("float32"), device=dev)
result = exe.benchmark(dev, data=data, func_name="run", repeat=2, number=1, end_to_end=True)
assert result.mean > 0
assert len(result.results) == 2


if __name__ == "__main__":
pytest.main([__file__])
32 changes: 32 additions & 0 deletions tests/python/relay/test_vm.py
@@ -981,5 +981,37 @@ def test_benchmark():
assert result.std == 1.5


@tvm.testing.parametrize_targets("cuda", "llvm")
def test_benchmark_end_to_end(dev, target):
mod, params = mlp.get_workload(1)
lib = vm.compile(mod, target=target, params=params)
exe = runtime.vm.VirtualMachine(lib, dev)
data = tvm.nd.array(np.random.rand(1, 1, 28, 28).astype("float32"), device=dev)
result = exe.benchmark(dev, data, func_name="main", repeat=2, number=1, end_to_end=True)
assert result.mean > 0


@tvm.testing.requires_llvm
def test_benchmark_end_to_end_rpc():
server = rpc.Server("127.0.0.1")
remote = rpc.connect(server.host, server.port)

mod, params = mlp.get_workload(1)
lib = vm.compile(mod, target="llvm", params=params)

temp = utils.tempdir()
path = temp.relpath("vm_library.so")
lib.mod.export_library(path)
remote.upload(path)
rlib = remote.load_module("vm_library.so")

exe = runtime.vm.VirtualMachine(rlib, remote.cpu())
data = tvm.nd.array(np.random.rand(1, 1, 28, 28).astype("float32"), device=remote.cpu())
result = exe.benchmark(
remote.cpu(), data=data, func_name="main", repeat=2, number=1, end_to_end=True
)
assert result.mean > 0


if __name__ == "__main__":
pytest.main([__file__])
