Add timeout to client apis and tests #6546
Merged
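As context for the diff below: this PR threads a client_timeout argument (in seconds) through the gRPC client's non-inference APIs. A minimal sketch of the behavior the new tests assert, assuming a Triton server on localhost:8001 that is configured to respond slowly (as the test server appears to be); the interval values are illustrative:

import tritonclient.grpc as grpcclient
from tritonclient.utils import InferenceServerException

# Assumes a Triton server on the default gRPC port whose responses
# are delayed enough for a tiny deadline to expire.
client = grpcclient.InferenceServerClient(url="localhost:8001")

try:
    # client_timeout is in seconds and applies per call.
    client.is_server_live(client_timeout=0.1)
except InferenceServerException as e:
    print(e)  # message contains "Deadline Exceeded"

# A generous timeout behaves like an untimed call.
assert client.is_server_live(client_timeout=5.0)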
Changes from 10 of 15 commits (all by jbkyang-nvi):
8136e11 add grpc test
18a059e update to working python tests
524ce16 temp commit for client api tests
753a71e update tests
491e7f1 fix utils bug
cd6b206 add python test for everything
1d33f34 finalize testing and make utils take float instead of only int
587885d address redundant includes
8963361 changed location of delay
2cefaf6 update test
3ad2472 merge main
12e6497 addressed more comments
5dcec9e Merge branch 'main' into kyang-timeout-to-client-apis
8662299 Merge branch 'main' into kyang-timeout-to-client-apis
7dcfcda fix extra dependencies
@@ -37,9 +37,9 @@
 import numpy as np
 import test_util as tu
-import tritongrpcclient as grpcclient
-import tritonhttpclient as httpclient
-from tritonclientutils import InferenceServerException
+import tritonclient.grpc as grpcclient
+import tritonclient.http as httpclient
+from tritonclient.utils import InferenceServerException


 class UserData:
@@ -58,6 +58,299 @@ class ClientTimeoutTest(tu.TestResultCollector):
     def setUp(self):
         self.model_name_ = "custom_identity_int32"
         self.input0_data_ = np.array([[10]], dtype=np.int32)
+        self.input0_data_byte_size_ = 32
+        self.SMALL_INTERVAL = 0.1  # seconds for a timeout
+        self.INFER_SMALL_INTERVAL = 2.0  # seconds for a timeout
+        self.NORMAL_INTERVAL = 5.0  # seconds for server to load then receive request
+
+    def test_grpc_server_live(self):
[Review comment on this line: Move them to a new non_infer_client_timeout_test.py?]
+        triton_client = grpcclient.InferenceServerClient(
+            url="localhost:8001", verbose=True
+        )
+        with self.assertRaises(InferenceServerException) as cm:
+            _ = triton_client.is_server_live(client_timeout=self.SMALL_INTERVAL)
+        self.assertIn("Deadline Exceeded", str(cm.exception))
+        self.assertTrue(
+            triton_client.is_server_live(client_timeout=self.NORMAL_INTERVAL)
+        )
+
+    def test_grpc_is_server_ready(self):
+        triton_client = grpcclient.InferenceServerClient(
+            url="localhost:8001", verbose=True
+        )
+        with self.assertRaises(InferenceServerException) as cm:
+            _ = triton_client.is_server_ready(client_timeout=self.SMALL_INTERVAL)
+        self.assertIn("Deadline Exceeded", str(cm.exception))
+        self.assertTrue(
+            triton_client.is_server_ready(client_timeout=self.NORMAL_INTERVAL)
+        )
+
+    def test_grpc_is_model_ready(self):
+        triton_client = grpcclient.InferenceServerClient(
+            url="localhost:8001", verbose=True
+        )
+        with self.assertRaises(InferenceServerException) as cm:
+            _ = triton_client.is_model_ready(
+                model_name=self.model_name_, client_timeout=self.SMALL_INTERVAL
+            )
+        self.assertIn("Deadline Exceeded", str(cm.exception))
+        self.assertTrue(
+            triton_client.is_model_ready(
+                model_name=self.model_name_, client_timeout=self.NORMAL_INTERVAL
+            )
+        )
+
+    def test_grpc_get_server_metadata(self):
+        triton_client = grpcclient.InferenceServerClient(
+            url="localhost:8001", verbose=True
+        )
+        with self.assertRaises(InferenceServerException) as cm:
+            _ = triton_client.get_server_metadata(client_timeout=self.SMALL_INTERVAL)
+        self.assertIn("Deadline Exceeded", str(cm.exception))
+
+        triton_client.get_server_metadata(client_timeout=self.NORMAL_INTERVAL)
+
+    def test_grpc_get_model_metadata(self):
+        triton_client = grpcclient.InferenceServerClient(
+            url="localhost:8001", verbose=True
+        )
+        with self.assertRaises(InferenceServerException) as cm:
+            _ = triton_client.get_model_metadata(
+                model_name=self.model_name_, client_timeout=self.SMALL_INTERVAL
+            )
+        self.assertIn("Deadline Exceeded", str(cm.exception))
+        triton_client.get_model_metadata(
+            model_name=self.model_name_, client_timeout=self.NORMAL_INTERVAL
+        )
+
+    def test_grpc_get_model_config(self):
+        triton_client = grpcclient.InferenceServerClient(
+            url="localhost:8001", verbose=True
+        )
+        with self.assertRaises(InferenceServerException) as cm:
+            _ = triton_client.get_model_config(
+                model_name=self.model_name_, client_timeout=self.SMALL_INTERVAL
+            )
+        self.assertIn("Deadline Exceeded", str(cm.exception))
+        triton_client.get_model_config(
+            model_name=self.model_name_, client_timeout=self.NORMAL_INTERVAL
+        )
+
+    def test_grpc_model_repository_index(self):
+        triton_client = grpcclient.InferenceServerClient(
+            url="localhost:8001", verbose=True
+        )
+        with self.assertRaises(InferenceServerException) as cm:
+            _ = triton_client.get_model_repository_index(
+                client_timeout=self.SMALL_INTERVAL
+            )
+        self.assertIn("Deadline Exceeded", str(cm.exception))
+        triton_client.get_model_repository_index(client_timeout=self.NORMAL_INTERVAL)
+
+    def test_grpc_load_model(self):
+        triton_client = grpcclient.InferenceServerClient(
+            url="localhost:8001", verbose=True
+        )
+        triton_client.unload_model(model_name=self.model_name_)
+        with self.assertRaises(InferenceServerException) as cm:
+            _ = triton_client.load_model(
+                model_name=self.model_name_, client_timeout=self.SMALL_INTERVAL
+            )
+        self.assertIn("Deadline Exceeded", str(cm.exception))
+        triton_client.unload_model(
+            model_name=self.model_name_, client_timeout=self.NORMAL_INTERVAL
+        )
+        triton_client.load_model(
+            model_name=self.model_name_, client_timeout=self.NORMAL_INTERVAL
+        )
+
+    def test_grpc_unload_model(self):
+        triton_client = grpcclient.InferenceServerClient(
+            url="localhost:8001", verbose=True
+        )
+        with self.assertRaises(InferenceServerException) as cm:
+            _ = triton_client.unload_model(
+                model_name=self.model_name_, client_timeout=self.SMALL_INTERVAL
+            )
+        self.assertIn("Deadline Exceeded", str(cm.exception))
+        triton_client.load_model(model_name=self.model_name_)
+        triton_client.unload_model(
+            model_name=self.model_name_, client_timeout=self.NORMAL_INTERVAL
+        )
+        triton_client.load_model(model_name=self.model_name_)
+
+    def test_grpc_get_inference_statistics(self):
+        triton_client = grpcclient.InferenceServerClient(
+            url="localhost:8001", verbose=True
+        )
+        with self.assertRaises(InferenceServerException) as cm:
+            _ = triton_client.get_inference_statistics(
+                model_name=self.model_name_, client_timeout=self.SMALL_INTERVAL
+            )
+        self.assertIn("Deadline Exceeded", str(cm.exception))
+        triton_client.get_inference_statistics(
+            model_name=self.model_name_, client_timeout=self.NORMAL_INTERVAL
+        )
+
+    def test_grpc_update_trace_settings(self):
+        triton_client = grpcclient.InferenceServerClient(
+            url="localhost:8001", verbose=True
+        )
+        with self.assertRaises(InferenceServerException) as cm:
+            _ = triton_client.update_trace_settings(
+                model_name=self.model_name_, client_timeout=self.SMALL_INTERVAL
+            )
+        self.assertIn("Deadline Exceeded", str(cm.exception))
+        triton_client.update_trace_settings(
+            model_name=self.model_name_, client_timeout=self.NORMAL_INTERVAL
+        )
+
+    def test_grpc_get_trace_settings(self):
+        triton_client = grpcclient.InferenceServerClient(
+            url="localhost:8001", verbose=True
+        )
+        with self.assertRaises(InferenceServerException) as cm:
+            _ = triton_client.get_trace_settings(
+                model_name=self.model_name_, client_timeout=self.SMALL_INTERVAL
+            )
+        self.assertIn("Deadline Exceeded", str(cm.exception))
+        triton_client.get_trace_settings(
+            model_name=self.model_name_, client_timeout=self.NORMAL_INTERVAL
+        )
+
+    def test_grpc_update_log_settings(self):
+        triton_client = grpcclient.InferenceServerClient(
+            url="localhost:8001", verbose=True
+        )
+        settings = {}
+        with self.assertRaises(InferenceServerException) as cm:
+            _ = triton_client.update_log_settings(
+                settings=settings, client_timeout=self.SMALL_INTERVAL
+            )
+        self.assertIn("Deadline Exceeded", str(cm.exception))
+        triton_client.update_log_settings(
+            settings=settings, client_timeout=self.NORMAL_INTERVAL
+        )
+
+    def test_grpc_get_log_settings(self):
+        triton_client = grpcclient.InferenceServerClient(
+            url="localhost:8001", verbose=True
+        )
+        with self.assertRaises(InferenceServerException) as cm:
+            _ = triton_client.get_log_settings(
+                as_json=True, client_timeout=self.SMALL_INTERVAL
+            )
+        self.assertIn("Deadline Exceeded", str(cm.exception))
+        triton_client.get_log_settings(
+            as_json=True, client_timeout=self.NORMAL_INTERVAL
+        )
+
+    def test_grpc_get_system_shared_memory_status(self):
+        triton_client = grpcclient.InferenceServerClient(
+            url="localhost:8001", verbose=True
+        )
+        with self.assertRaises(InferenceServerException) as cm:
+            _ = triton_client.get_system_shared_memory_status(
+                client_timeout=self.SMALL_INTERVAL
+            )
+        self.assertIn("Deadline Exceeded", str(cm.exception))
+        triton_client.get_system_shared_memory_status(
+            client_timeout=self.NORMAL_INTERVAL
+        )
+
+    def test_grpc_register_system_shared_memory(self):
+        triton_client = grpcclient.InferenceServerClient(
+            url="localhost:8001", verbose=True
+        )
+        triton_client.unregister_system_shared_memory()
+        import tritonclient.utils.shared_memory as shm
+
+        shm_ip0_handle = shm.create_shared_memory_region(
+            "input0_data", "/input_simple", self.input0_data_byte_size_
+        )
+        shm.set_shared_memory_region(shm_ip0_handle, [self.input0_data_])
+        with self.assertRaises(InferenceServerException) as cm:
+            _ = triton_client.register_system_shared_memory(
+                "input0_data",
+                "/input_simple",
+                self.input0_data_byte_size_,
+                client_timeout=self.SMALL_INTERVAL,
+            )
+        self.assertIn("Deadline Exceeded", str(cm.exception))
+        triton_client.unregister_system_shared_memory()
+        triton_client.register_system_shared_memory(
+            "input0_data",
+            "/input_simple",
+            self.input0_data_byte_size_,
+            client_timeout=self.NORMAL_INTERVAL,
+        )
+        triton_client.unregister_system_shared_memory()
+
+    def test_grpc_unregister_system_shared_memory(self):
+        triton_client = grpcclient.InferenceServerClient(
+            url="localhost:8001", verbose=True
+        )
+        with self.assertRaises(InferenceServerException) as cm:
+            _ = triton_client.unregister_system_shared_memory(
+                client_timeout=self.SMALL_INTERVAL
+            )
+        self.assertIn("Deadline Exceeded", str(cm.exception))
+        triton_client.unregister_system_shared_memory(
+            client_timeout=self.NORMAL_INTERVAL
+        )
+
+    def test_grpc_get_cuda_shared_memory_status(self):
+        triton_client = grpcclient.InferenceServerClient(
+            url="localhost:8001", verbose=True
+        )
+        with self.assertRaises(InferenceServerException) as cm:
+            _ = triton_client.get_cuda_shared_memory_status(
+                client_timeout=self.SMALL_INTERVAL
+            )
+        self.assertIn("Deadline Exceeded", str(cm.exception))
+        triton_client.get_cuda_shared_memory_status(client_timeout=self.NORMAL_INTERVAL)
+
+    def test_grpc_register_cuda_shared_memory(self):
+        triton_client = grpcclient.InferenceServerClient(
+            url="localhost:8001", verbose=True
+        )
+        import tritonclient.utils.cuda_shared_memory as cshm
+
+        input_data = np.array([[10]], dtype=np.int32)
+        byteSize = input_data.itemsize * input_data.size
+        shm_op0_handle = cshm.create_shared_memory_region(
+            "dummy_data", byte_size=byteSize, device_id=0
+        )
+        cshm.set_shared_memory_region(shm_op0_handle, [input_data])
+        with self.assertRaises(InferenceServerException) as cm:
+            _ = triton_client.register_cuda_shared_memory(
+                "dummy_data",
+                cshm.get_raw_handle(shm_op0_handle),
+                device_id=0,
+                byte_size=byteSize,
+                client_timeout=self.SMALL_INTERVAL,
+            )
+        self.assertIn("Deadline Exceeded", str(cm.exception))
+        triton_client.unregister_cuda_shared_memory()
+        triton_client.register_cuda_shared_memory(
+            "dummy_data",
+            cshm.get_raw_handle(shm_op0_handle),
+            device_id=0,
+            byte_size=byteSize,
+            client_timeout=self.NORMAL_INTERVAL,
+        )
+        cshm.destroy_shared_memory_region(shm_op0_handle)
+
+    def test_grpc_unregister_cuda_shared_memory(self):
+        triton_client = grpcclient.InferenceServerClient(
+            url="localhost:8001", verbose=True
+        )
+        with self.assertRaises(InferenceServerException) as cm:
+            _ = triton_client.unregister_cuda_shared_memory(
+                client_timeout=self.SMALL_INTERVAL
+            )
+        self.assertIn("Deadline Exceeded", str(cm.exception))
+        triton_client.unregister_cuda_shared_memory(client_timeout=self.NORMAL_INTERVAL)

     def _prepare_request(self, protocol):
         if protocol == "grpc":
@@ -118,7 +411,7 @@ def test_grpc_async_infer(self):
             inputs=self.inputs_,
             callback=partial(callback, user_data),
             outputs=self.outputs_,
-            client_timeout=2,
+            client_timeout=self.INFER_SMALL_INTERVAL,
         )
         data_item = user_data._completed_requests.get()
         if type(data_item) == InferenceServerException:
@@ -190,7 +483,9 @@ def test_http_infer(self):
         # response. Expect an exception for small timeout values.
         with self.assertRaises(socket.timeout) as cm:
             triton_client = httpclient.InferenceServerClient(
-                url="localhost:8000", verbose=True, network_timeout=2.0
+                url="localhost:8000",
+                verbose=True,
+                network_timeout=self.INFER_SMALL_INTERVAL,
             )
             _ = triton_client.infer(
                 model_name=self.model_name_, inputs=self.inputs_, outputs=self.outputs_
@@ -216,7 +511,9 @@ def test_http_async_infer(self):
         # response. Expect an exception for small timeout values.
         with self.assertRaises(socket.timeout) as cm:
             triton_client = httpclient.InferenceServerClient(
-                url="localhost:8000", verbose=True, network_timeout=2.0
+                url="localhost:8000",
+                verbose=True,
+                network_timeout=self.INFER_SMALL_INTERVAL,
             )
             async_request = triton_client.async_infer(
                 model_name=self.model_name_, inputs=self.inputs_, outputs=self.outputs_
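Unlike the gRPC client, the HTTP client takes its timeout at construction time: network_timeout applies to the underlying socket, so a slow response raises socket.timeout rather than an InferenceServerException. A minimal sketch of the pattern the two HTTP tests above exercise (the URL and the metadata call are illustrative stand-ins for the delayed inference used in the tests):

import socket

import tritonclient.http as httpclient

# network_timeout (seconds) is set once on the client, not per call.
client = httpclient.InferenceServerClient(
    url="localhost:8000", network_timeout=2.0
)
try:
    # Any request that outlives the timeout raises socket.timeout.
    client.get_server_metadata()
except socket.timeout as e:
    print("request timed out:", e)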
[Review comment: don't need backend repo in client build]