Add TensorRT uint8 models and tests (triton-inference-server#4946)
* Add trt kUINT8 doc

* Add trt uint8 to L0_infer

* Expand uint8 test with other types
kthui authored Oct 17, 2022
1 parent ac98a02 commit 0301b7a
Showing 7 changed files with 109 additions and 10 deletions.
2 changes: 1 addition & 1 deletion docs/user_guide/model_configuration.md
@@ -357,7 +357,7 @@ library.
|Model Config |TensorRT |TensorFlow |ONNX Runtime |PyTorch |API |NumPy |
|--------------|--------------|--------------|--------------|---------|---------|--------------|
|TYPE_BOOL | kBOOL |DT_BOOL |BOOL |kBool |BOOL |bool |
-|TYPE_UINT8 | |DT_UINT8 |UINT8 |kByte |UINT8 |uint8 |
+|TYPE_UINT8 | kUINT8 |DT_UINT8 |UINT8 |kByte |UINT8 |uint8 |
|TYPE_UINT16 | |DT_UINT16 |UINT16 | |UINT16 |uint16 |
|TYPE_UINT32 | |DT_UINT32 |UINT32 | |UINT32 |uint32 |
|TYPE_UINT64 | |DT_UINT64 |UINT64 | |UINT64 |uint64 |
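With `TYPE_UINT8` now mapped to TensorRT's `kUINT8`, a TensorRT model can carry uint8 tensors end to end: the client sends `np.uint8` arrays and declares the `UINT8` wire type, following the NumPy and API columns of the table above. A minimal sketch using the Python `tritonclient` HTTP API (the model name, tensor names, and shape here are hypothetical):

```python
import numpy as np
import tritonclient.http as httpclient

client = httpclient.InferenceServerClient(url="localhost:8000")

# np.uint8 (NumPy column) pairs with the "UINT8" wire type (API column).
data = np.arange(16, dtype=np.uint8).reshape(1, 16)

# "simple_uint8" is a placeholder for a model whose config declares
# TYPE_UINT8 inputs and outputs.
inputs = [httpclient.InferInput("INPUT0", data.shape, "UINT8")]
inputs[0].set_data_from_numpy(data)

result = client.infer("simple_uint8", inputs)
print(result.as_numpy("OUTPUT0").dtype)  # uint8
```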
34 changes: 34 additions & 0 deletions qa/L0_infer/infer_test.py
@@ -252,6 +252,8 @@ def _infer_exact_helper(tester,
for prefix in ensemble_prefix:
if prefix != "":
continue
if input_dtype == np.uint8 or output0_dtype == np.uint8 or output1_dtype == np.uint8:
continue

if 'python_dlpack' in BACKENDS:
_infer_exact_helper(self,
@@ -274,6 +276,14 @@ def _infer_exact_helper(tester,
output1_raw=output1_raw,
swap=swap)

def test_raw_uuu(self):
self._full_exact(np.uint8,
np.uint8,
np.uint8,
output0_raw=True,
output1_raw=True,
swap=True)

def test_raw_bbb(self):
self._full_exact(np.int8,
np.int8,
@@ -354,6 +364,30 @@ def test_raw_ibs(self):
output1_raw=True,
swap=False)

def test_raw_fuu(self):
self._full_exact(np.float32,
np.uint8,
np.uint8,
output0_raw=True,
output1_raw=True,
swap=False)

def test_raw_uff(self):
self._full_exact(np.uint8,
np.float32,
np.float32,
output0_raw=True,
output1_raw=True,
swap=False)

def test_raw_fuh(self):
self._full_exact(np.float32,
np.uint8,
np.float16,
output0_raw=True,
output1_raw=True,
swap=False)

def test_raw_iff(self):
self._full_exact(np.int32,
np.float32,
4 changes: 2 additions & 2 deletions qa/L0_infer/test.sh
@@ -69,9 +69,9 @@ if [ "$TEST_VALGRIND" -eq 1 ]; then
fi

if [ "$TEST_SYSTEM_SHARED_MEMORY" -eq 1 ] || [ "$TEST_CUDA_SHARED_MEMORY" -eq 1 ]; then
-    EXPECTED_NUM_TESTS=${EXPECTED_NUM_TESTS:="29"}
+    EXPECTED_NUM_TESTS=${EXPECTED_NUM_TESTS:="33"}
else
-    EXPECTED_NUM_TESTS=${EXPECTED_NUM_TESTS:="42"}
+    EXPECTED_NUM_TESTS=${EXPECTED_NUM_TESTS:="46"}
fi

TF_VERSION=${TF_VERSION:=2}
4 changes: 2 additions & 2 deletions qa/L0_server_status/server_status_test.py
@@ -639,8 +639,8 @@ def test_infer_stats_no_model(self):
else:
stats = infer_stats.model_stats
self.assertEqual(
-                len(stats), 207,
-                "expected 207 infer stats for all ready versions of all model"
+                len(stats), 219,
+                "expected 219 infer stats for all ready versions of all model"
)

except InferenceServerException as ex:
40 changes: 37 additions & 3 deletions qa/common/gen_qa_models.py
@@ -94,6 +94,8 @@ def np_to_trt_dtype(np_dtype):
return trt.int8
elif np_dtype == np.int32:
return trt.int32
elif np_dtype == np.uint8:
return trt.uint8
elif np_dtype == np.float16:
return trt.float16
elif np_dtype == np.float32:
@@ -493,12 +495,30 @@ def create_plan_dynamic_rf_modelfile(models_dir, max_batch, model_version,

in0 = network.add_input("INPUT0", trt_input_dtype, input_with_batchsize)
in1 = network.add_input("INPUT1", trt_input_dtype, input_with_batchsize)

# TRT uint8 cannot be used to represent quantized floating-point value yet
# uint8 must be converted to float16 or float32 before any operation
if trt_input_dtype == trt.uint8:
in0_cast = network.add_identity(in0)
in0_cast.set_output_type(0, trt.float32)
in0 = in0_cast.get_output(0)
in1_cast = network.add_identity(in1)
in1_cast.set_output_type(0, trt.float32)
in1 = in1_cast.get_output(0)

add = network.add_elementwise(in0, in1, trt.ElementWiseOperation.SUM)
sub = network.add_elementwise(in0, in1, trt.ElementWiseOperation.SUB)

out0 = add if not swap else sub
out1 = sub if not swap else add

# uint8 conversion after operations
if trt_output0_dtype == trt.uint8:
out0 = network.add_identity(out0.get_output(0))
out0.set_output_type(0, trt.uint8)
if trt_output1_dtype == trt.uint8:
out1 = network.add_identity(out1.get_output(0))
out1.set_output_type(0, trt.uint8)

out0.get_output(0).name = "OUTPUT0"
out1.get_output(0).name = "OUTPUT1"
network.mark_output(out0.get_output(0))
@@ -860,7 +880,16 @@ def create_plan_modelfile(models_dir,
input_shape, output0_shape, output1_shape):
return

-    if input_dtype != np.float32 or output0_dtype != np.float32 or output1_dtype != np.float32:
+    if input_dtype == np.uint8 or output0_dtype == np.uint8 or output1_dtype == np.uint8:
+        # TRT uint8 cannot be used to represent quantized floating-point value yet
+        # EXPLICIT_BATCH network and conversion are required to create models
+        create_plan_dynamic_rf_modelfile(models_dir, max_batch, model_version,
+                                         input_shape, output0_shape,
+                                         output1_shape, input_dtype,
+                                         output0_dtype, output1_dtype, swap,
+                                         min_dim, max_dim)
+
+    elif input_dtype != np.float32 or output0_dtype != np.float32 or output1_dtype != np.float32:
if (not tu.shape_is_fixed(input_shape) or
not tu.shape_is_fixed(output0_shape) or
not tu.shape_is_fixed(output1_shape)):
@@ -1877,6 +1906,8 @@ def create_fixed_models(models_dir,

# Tests with models that accept fixed-shape input/output tensors
if not FLAGS.variable:
create_fixed_models(FLAGS.models_dir, np.uint8, np.uint8, np.uint8,
('latest', 3))
create_fixed_models(FLAGS.models_dir, np.int8, np.int8, np.int8,
('latest', 1))
create_fixed_models(FLAGS.models_dir, np.int16, np.int16, np.int16,
@@ -1895,6 +1926,9 @@ def create_fixed_models(models_dir,
create_fixed_models(FLAGS.models_dir, np.int32, np.int8, np.int8)
create_fixed_models(FLAGS.models_dir, np.int8, np.int32, np.int32)
create_fixed_models(FLAGS.models_dir, np.int32, np.int8, np.int16)
create_fixed_models(FLAGS.models_dir, np.float32, np.uint8, np.uint8)
create_fixed_models(FLAGS.models_dir, np.uint8, np.float32, np.float32)
create_fixed_models(FLAGS.models_dir, np.float32, np.uint8, np.float16)
create_fixed_models(FLAGS.models_dir, np.int32, np.float32, np.float32)
create_fixed_models(FLAGS.models_dir, np.float32, np.int32, np.int32)
create_fixed_models(FLAGS.models_dir, np.int32, np.float16, np.int16)
@@ -1979,7 +2013,7 @@ def create_fixed_models(models_dir,
swap=True)

if FLAGS.tensorrt:
-        for vt in [np.float32, np.float16, np.int32]:
+        for vt in [np.float32, np.float16, np.int32, np.uint8]:
create_plan_modelfile(FLAGS.models_dir,
8,
2, (16,), (16,), (16,),
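The identity casts above are the heart of the change: TensorRT's uint8 type cannot participate directly in arithmetic yet, so the generator casts uint8 inputs to float32, runs the add/sub in float32, and casts back to uint8 at the outputs. Below is a condensed sketch of the same pattern, assuming TensorRT 8.5+ (where `trt.uint8` is available) and an explicit-batch network; it is an illustration, not the generator itself:

```python
import tensorrt as trt

logger = trt.Logger(trt.Logger.WARNING)
builder = trt.Builder(logger)
# uint8 I/O requires an EXPLICIT_BATCH network, as the comment in the diff notes.
network = builder.create_network(
    1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH))

in0 = network.add_input("INPUT0", trt.uint8, (1, 16))
in1 = network.add_input("INPUT1", trt.uint8, (1, 16))

# Identity layers cast uint8 -> float32 before any arithmetic.
cast0 = network.add_identity(in0)
cast0.set_output_type(0, trt.float32)
cast1 = network.add_identity(in1)
cast1.set_output_type(0, trt.float32)

add = network.add_elementwise(cast0.get_output(0), cast1.get_output(0),
                              trt.ElementWiseOperation.SUM)

# ...and cast float32 -> uint8 on the way out.
out = network.add_identity(add.get_output(0))
out.set_output_type(0, trt.uint8)
out.get_output(0).name = "OUTPUT0"
network.mark_output(out.get_output(0))
```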
11 changes: 10 additions & 1 deletion qa/common/infer_util.py
@@ -168,6 +168,15 @@ def infer_exact(tester,
input0_array = input0_array.astype(input_dtype)
input1_array = input1_array.astype(input_dtype)

# for unsigned data type, the value being subtracted must be less than the
# value it is subtracted from, to avoid overflow.
if val_min == 0:
# swap element if the element at input 0 < input 1
tmp = np.where(input0_array < input1_array, input1_array, input0_array)
input1_array = np.where(input0_array < input1_array, input0_array,
input1_array)
input0_array = tmp

if not swap:
output0_array = input0_array + input1_array
output1_array = input0_array - input1_array
@@ -277,7 +286,7 @@ def inferAndCheckResults(tester, configs, pf, batch_size, model_version,
outputs, precreated_shm_regions, input0_list_tmp,
input1_list_tmp, shm_region_names, input0_byte_size,
input1_byte_size, output0_byte_size, output1_byte_size,
                        use_system_shared_memory, use_cuda_shared_memory,
network_timeout, skip_request_id_check):
# Lazy shm imports...
if use_system_shared_memory or use_cuda_shared_memory:
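Because uint8 subtraction wraps around in NumPy, the reference outputs would be wrong whenever an element of input1 exceeds its counterpart in input0. The `np.where` shuffle above guarantees `input0 >= input1` elementwise for any dtype whose minimum is 0, so the `input0 - input1` path can never wrap. A quick illustration on sample uint8 data:

```python
import numpy as np

input0 = np.array([1, 200, 30], dtype=np.uint8)
input1 = np.array([5, 100, 30], dtype=np.uint8)

# Move the elementwise max into input0 and the min into input1;
# tmp is computed first so the second np.where still compares the
# original input0 values.
tmp = np.where(input0 < input1, input1, input0)
input1 = np.where(input0 < input1, input0, input1)
input0 = tmp

print(input0 - input1)  # [  4 100   0] -- no uint8 wraparound
```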
24 changes: 23 additions & 1 deletion qa/common/test_util.py
@@ -75,6 +75,10 @@ def validate_for_tf_model(input_dtype, output0_dtype, output1_dtype,
input_shape, output0_shape, output1_shape):
"""Return True if input and output dtypes are supported by a TF model."""

# Not extending test to uint8 yet
if input_dtype == np.uint8 or output0_dtype == np.uint8 or output1_dtype == np.uint8:
return False

# If the input type is string the output type must be string or
# int32. This is because the QA models we generate convert strings
# internally to int32 for compute.
@@ -89,7 +93,9 @@ def validate_for_tf_model(input_dtype, output0_dtype, output1_dtype,
def validate_for_trt_model(input_dtype, output0_dtype, output1_dtype,
input_shape, output0_shape, output1_shape):
"""Return True if input and output dtypes are supported by a TRT model."""
-    supported_datatypes = [bool, np.int8, np.int32, np.float16, np.float32]
+    supported_datatypes = [
+        bool, np.int8, np.int32, np.uint8, np.float16, np.float32
+    ]
if not input_dtype in supported_datatypes:
return False
if not output0_dtype in supported_datatypes:
@@ -113,6 +119,10 @@ def validate_for_ensemble_model(ensemble_type, input_dtype, output0_dtype,
output1_shape):
"""Return True if input and output dtypes are supported by the ensemble type."""

# Not extending test to uint8 yet
if input_dtype == np.uint8 or output0_dtype == np.uint8 or output1_dtype == np.uint8:
return False

# Those ensemble types contains "identity" model which doesn't allow STRING
# data type
# Test types that use identity for both input and output
@@ -133,6 +143,10 @@ def validate_for_onnx_model(input_dtype, output0_dtype, output1_dtype,
input_shape, output0_shape, output1_shape):
"""Return True if input and output dtypes are supported by a Onnx model."""

# Not extending test to uint8 yet
if input_dtype == np.uint8 or output0_dtype == np.uint8 or output1_dtype == np.uint8:
return False

# If the input type is string the output type must be string or
# int32. This is because the QA models we generate convert strings
# internally to int32 for compute.
@@ -154,6 +168,10 @@ def validate_for_libtorch_model(input_dtype,
reshape=False):
"""Return True if input and output dtypes are supported by a libtorch model."""

# Not extending test to uint8 yet
if input_dtype == np.uint8 or output0_dtype == np.uint8 or output1_dtype == np.uint8:
return False

# STRING data type does not support I/O with more than 1 dims. It supports
# batching when 'reshape' field is set properly to empty shape.
has_string_type = (input_dtype == np.object_) or (
@@ -181,6 +199,10 @@ def validate_for_openvino_model(input_dtype, output0_dtype, output1_dtype,
input_shape, output0_shape, output1_shape):
"""Return True if input and output dtypes are supported by an OpenVino model."""

# Not extending test to uint8 yet
if input_dtype == np.uint8 or output0_dtype == np.uint8 or output1_dtype == np.uint8:
return False

# float16 is not supported on CPU by OpenVino
supported_datatypes = [np.int8, np.int32, np.float32]
if not input_dtype in supported_datatypes:
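Taken together, these gates mean the QA generator emits uint8 models only for TensorRT; every other backend's validator now rejects uint8 combinations. A small sketch of how the validators behave after this change (shapes are arbitrary, and the import path assumes the `qa/common` module shown above is importable):

```python
import numpy as np
import test_util as tu

shape = (16,)
args = (np.uint8, np.uint8, np.uint8, shape, shape, shape)

print(tu.validate_for_trt_model(*args))   # True  -- uint8 is now supported
print(tu.validate_for_tf_model(*args))    # False -- "not extending test to uint8 yet"
print(tu.validate_for_onnx_model(*args))  # False
```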
