@@ -89,7 +89,7 @@ def run_tvm_model(mod, params, input_name, inp, target="llvm"):

    runtime.set_input(input_name, inp)
    runtime.run()
-    return runtime.get_output(0).asnumpy()
+    return runtime.get_output(0).asnumpy(), runtime


#################################################################################
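######################################################################
# For orientation, the hunk above shows only the tail of the helper. A
# minimal standalone helper with the same two-value return might look like
# the sketch below; run_tvm_model_sketch is an illustrative name, and
# everything except the changed return statement is an assumption based on
# a TVM version where relay.build returns a module factory (around 0.7).
import tvm
from tvm import relay
from tvm.contrib import graph_runtime


def run_tvm_model_sketch(mod, params, input_name, inp, target="llvm"):
    # Compile the Relay module for the given target.
    with tvm.transform.PassContext(opt_level=3):
        lib = relay.build(mod, target=target, params=params)

    # Create a graph runtime module on the local CPU.
    ctx = tvm.cpu(0)
    runtime = graph_runtime.GraphModule(lib["default"](ctx))

    # Run one inference and return both the result and the runtime module,
    # so the caller can profile the module afterwards.
    runtime.set_input(input_name, inp)
    runtime.run()
    return runtime.get_output(0).asnumpy(), runtime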
@@ -169,7 +169,7 @@ def quantize_model(model, inp):
#
# Under the hood, quantization specific operators are lowered to a sequence of
# standard Relay operators before compilation.
-tvm_result = run_tvm_model(mod, params, input_name, inp, target="llvm")
+tvm_result, rt_mod = run_tvm_model(mod, params, input_name, inp, target="llvm")

##########################################################################
# Compare the output labels
@@ -188,6 +188,45 @@ def quantize_model(model, inp):
print("%d in 1000 raw floating outputs identical." % np.sum(tvm_result[0] == pt_result[0]))


+##########################################################################
+# Measure performance
+# -------------------------
+# Here we give an example of how to measure the performance of TVM-compiled models.
+n_repeat = 100  # should be bigger to make the measurement more accurate
+ctx = tvm.cpu(0)
+ftimer = rt_mod.module.time_evaluator("run", ctx, number=1,
+                                      repeat=n_repeat)
+prof_res = np.array(ftimer().results) * 1e3  # convert per-repeat times to milliseconds
+print("Elapsed average ms:", np.mean(prof_res))
+
+######################################################################
+# .. note::
+#
+#   We recommend this method for the following reasons:
+#
+#    * Measurements are done in C++, so there is no Python overhead
+#    * It includes several warm-up runs
+#    * The same method can be used to profile on remote devices (Android, etc.); see the RPC sketch below.
+
+
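######################################################################
# A generic sketch of profiling on a remote device over RPC, assuming an RPC
# server is already running there and the module was compiled for the remote
# target; the address, port, and file name are placeholders, and lib is
# assumed to be the module factory returned by relay.build.
import numpy as np
import tvm
from tvm import rpc
from tvm.contrib import graph_runtime

remote = rpc.connect("192.168.0.10", 9090)  # placeholder host and port

# Ship the compiled module to the remote device and load it there.
lib.export_library("net.tar")
remote.upload("net.tar")
rlib = remote.load_module("net.tar")

# Create the graph runtime on the remote CPU and time it with the same API.
remote_ctx = remote.cpu(0)
remote_rt = graph_runtime.GraphModule(rlib["default"](remote_ctx))
remote_rt.set_input(input_name, inp)
ftimer = remote_rt.module.time_evaluator("run", remote_ctx, number=1, repeat=30)
print("Remote elapsed average ms:", np.mean(np.array(ftimer().results) * 1e3))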
+######################################################################
+# .. note::
+#
+#   Unless the hardware has special support for fast 8 bit instructions, quantized models are
+#   not expected to be any faster than FP32 models. Without fast 8 bit instructions, TVM does
+#   quantized convolution in 16 bit, even if the model itself is 8 bit.
+#
+#   For x86, the best performance can be achieved on CPUs with the AVX512 instruction set.
+#   In this case, TVM utilizes the fastest available 8 bit instructions for the given target.
+#   This includes support for the VNNI 8 bit dot product instruction (Cascade Lake or newer).
+#
+#   Moreover, the following general tips for CPU performance apply equally here (see the snippet after this note):
+#
+#    * Set the environment variable TVM_NUM_THREADS to the number of physical cores
+#    * Choose the best target for your hardware, such as "llvm -mcpu=skylake-avx512" or
+#      "llvm -mcpu=cascadelake" (support for more AVX512-capable CPUs will be added in the future)
+
+
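######################################################################
# As a concrete illustration of the tips above: the core count and the target
# string below are placeholders to adapt to your own machine, not values
# taken from this change.
import os

# Assumption: the machine has 4 physical cores; set this before the compiled
# module runs so the TVM thread pool picks it up.
os.environ["TVM_NUM_THREADS"] = "4"

# On an AVX512-capable CPU such as Cascade Lake, pass a more specific target
# when compiling, for example:
#   tvm_result, rt_mod = run_tvm_model(mod, params, input_name, inp,
#                                      target="llvm -mcpu=cascadelake")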
###############################################################################
# Deploy a quantized MXNet Model
# ------------------------------