
Commit d7a8da2

add performance section
1 parent 69293b5 commit d7a8da2

1 file changed: 41 additions & 2 deletions

tutorials/frontend/deploy_prequantized.py
@@ -89,7 +89,7 @@ def run_tvm_model(mod, params, input_name, inp, target="llvm"):

     runtime.set_input(input_name, inp)
     runtime.run()
-    return runtime.get_output(0).asnumpy()
+    return runtime.get_output(0).asnumpy(), runtime


 #################################################################################
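
With this change the helper returns the runtime module alongside the output array, so callers can keep the module around for profiling. For orientation only, the full helper presumably looks roughly like the sketch below; the build and module-creation calls are assumptions based on the graph_runtime API of this TVM generation and are not part of the diff.

# Sketch, not part of the commit: the build/create calls are assumed.
import tvm
from tvm import relay
from tvm.contrib import graph_runtime


def run_tvm_model(mod, params, input_name, inp, target="llvm"):
    # Compile the Relay module for the chosen target.
    with relay.build_config(opt_level=3):
        graph, lib, params = relay.build(mod, target=target, params=params)

    # Create a graph runtime, feed the input and run once.
    runtime = graph_runtime.create(graph, lib, tvm.context(target, 0))
    runtime.set_input(**params)
    runtime.set_input(input_name, inp)
    runtime.run()

    # The commit adds `runtime` to the return value so it can be reused,
    # e.g. by time_evaluator in the new performance section below.
    return runtime.get_output(0).asnumpy(), runtime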
@@ -169,7 +169,7 @@ def quantize_model(model, inp):
 #
 # Under the hood, quantization specific operators are lowered to a sequence of
 # standard Relay operators before compilation.
-tvm_result = run_tvm_model(mod, params, input_name, inp, target="llvm")
+tvm_result, rt_mod = run_tvm_model(mod, params, input_name, inp, target="llvm")

 ##########################################################################
 # Compare the output labels
@@ -188,6 +188,45 @@ def quantize_model(model, inp):
 print("%d in 1000 raw floating outputs identical." % np.sum(tvm_result[0] == pt_result[0]))


+##########################################################################
+# Measure performance
+# -------------------------
+# Here we give an example of how to measure the performance of TVM-compiled models.
+n_repeat = 100  # should be bigger to make the measurement more accurate
+ctx = tvm.cpu(0)
+ftimer = rt_mod.module.time_evaluator("run", ctx, number=1,
+                                      repeat=n_repeat)
+prof_res = np.array(ftimer().results) * 1e3
+print("Elapsed ms:", np.mean(prof_res))
+
+######################################################################
+# .. note::
+#
+#   We recommend this method for the following reasons:
+#
+#   * Measurements are done in C++, so there is no Python overhead
+#   * It includes several warm up runs
+#   * The same method can be used to profile on remote devices (android etc.).
+
+
+######################################################################
+# .. note::
+#
+#   Unless the hardware has special support for fast 8 bit instructions, quantized models are
+#   not expected to be any faster than FP32 models. Without fast 8 bit instructions, TVM does
+#   quantized convolution in 16 bit, even if the model itself is 8 bit.
+#
+#   For x86, the best performance can be achieved on CPUs with the AVX512 instruction set.
+#   In this case, TVM utilizes the fastest available 8 bit instructions for the given target.
+#   This includes support for the VNNI 8 bit dot product instruction (CascadeLake or newer).
+#
+#   Moreover, the following general tips for CPU performance apply equally here:
+#
+#   * Set the environment variable TVM_NUM_THREADS to the number of physical cores
+#   * Choose the best target for your hardware, such as "llvm -mcpu=skylake-avx512" or
+#     "llvm -mcpu=cascadelake" (more CPUs with AVX512 will come in the future)
+
+
 ###############################################################################
 # Deploy a quantized MXNet Model
 # ------------------------------
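
The first note above points out that time_evaluator also works on remote devices. As a rough illustration (not part of this commit), profiling over TVM's RPC mechanism looks approximately like the sketch below; the device address, library name and the android-specific ndk export are assumptions, and graph, lib, input_name and inp are assumed to come from the tutorial's earlier build steps.

# Sketch, not part of the commit: assumes a TVM RPC server is already running on the device.
import numpy as np
from tvm import rpc
from tvm.contrib import graph_runtime, ndk

# Export the compiled library for the device and ship it over RPC.
lib.export_library("net.so", ndk.create_shared)  # android toolchain assumed
remote = rpc.connect("192.168.0.10", 9090)       # hypothetical device address/port
remote.upload("net.so")
rlib = remote.load_module("net.so")

# Create the graph runtime on the remote context and feed the input.
ctx = remote.cpu(0)
rt_mod = graph_runtime.create(graph, rlib, ctx)  # graph comes from relay.build, as above
rt_mod.set_input(input_name, inp)

# Same time_evaluator call as in the diff, now timed on the device itself.
ftimer = rt_mod.module.time_evaluator("run", ctx, number=1, repeat=30)
print("Elapsed ms:", np.mean(np.array(ftimer().results) * 1e3))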

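The second note lists two tuning knobs for CPU performance. A minimal sketch of applying them, assuming a CascadeLake machine with four physical cores (both values are illustrative):

# Sketch, not part of the commit: core count and -mcpu value are assumptions.
import os

# TVM reads this when its thread pool starts, so set it before running anything.
os.environ["TVM_NUM_THREADS"] = "4"  # number of physical cores on this machine

# Pick the target that matches the CPU so the fast int8 (VNNI) instructions get used.
target = "llvm -mcpu=cascadelake"
tvm_result, rt_mod = run_tvm_model(mod, params, input_name, inp, target=target)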