Merge branch 'main' into ov2024.3

nnshah1 authored Sep 17, 2024
2 parents be6726b + 68d4c01 commit b268ad3
Showing 22 changed files with 1,020 additions and 248 deletions.
11 changes: 7 additions & 4 deletions build.py
@@ -1374,12 +1374,15 @@ def dockerfile_prepare_container_linux(argmap, backends, enable_gpu, target_machine):

    if enable_gpu:
        df += install_dcgm_libraries(argmap["DCGM_VERSION"], target_machine)
-        df += """
+        # This segment will break the RHEL SBSA build. Need to determine whether
+        # this is necessary to incorporate.
+        if target_platform() != "rhel":
+            df += """
# Extra defensive wiring for CUDA Compat lib
RUN ln -sf ${_CUDA_COMPAT_PATH}/lib.real ${_CUDA_COMPAT_PATH}/lib \\
- && echo ${_CUDA_COMPAT_PATH}/lib > /etc/ld.so.conf.d/00-cuda-compat.conf \\
- && ldconfig \\
- && rm -f ${_CUDA_COMPAT_PATH}/lib
+ && echo ${_CUDA_COMPAT_PATH}/lib > /etc/ld.so.conf.d/00-cuda-compat.conf \\
+ && ldconfig \\
+ && rm -f ${_CUDA_COMPAT_PATH}/lib
"""
    else:
        df += add_cpu_libs_to_linux_dockerfile(backends, target_machine)
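For context, build.py assembles the Dockerfile as a plain string (`df`), so the RHEL exclusion above is just a conditional around a string append. A simplified sketch of that pattern, using a hypothetical `emit_cuda_compat_wiring` helper (the real logic is inline in `dockerfile_prepare_container_linux`):

```python
def emit_cuda_compat_wiring(target_platform: str) -> str:
    """Return the CUDA compat ldconfig fragment as Dockerfile text, or an
    empty string on RHEL, where this wiring breaks the SBSA build."""
    if target_platform == "rhel":
        return ""
    # Mirrors the RUN fragment added in the diff above.
    return """
# Extra defensive wiring for CUDA Compat lib
RUN ln -sf ${_CUDA_COMPAT_PATH}/lib.real ${_CUDA_COMPAT_PATH}/lib \\
 && echo ${_CUDA_COMPAT_PATH}/lib > /etc/ld.so.conf.d/00-cuda-compat.conf \\
 && ldconfig \\
 && rm -f ${_CUDA_COMPAT_PATH}/lib
"""


# Usage: the fragment is appended only for non-RHEL targets.
df = ""
df += emit_cuda_compat_wiring("ubuntu")
```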
27 changes: 27 additions & 0 deletions docs/customization_guide/tritonfrontend.md
@@ -1,3 +1,30 @@
<!--
# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
# * Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
# * Neither the name of NVIDIA CORPORATION nor the names of its
# contributors may be used to endorse or promote products derived
# from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-->
### Triton Server (tritonfrontend) Bindings

The `tritonfrontend` Python package is a set of bindings to Triton's existing frontends, which are implemented in C++. Currently, `tritonfrontend` supports starting the `KServeHttp` and `KServeGrpc` frontends. Used in combination with Triton's Python in-process API ([`tritonserver`](https://github.com/triton-inference-server/core/tree/main/python/tritonserver)) and [`tritonclient`](https://github.com/triton-inference-server/client/tree/main/src/python/library), these bindings make Triton's full feature set available from a few lines of Python.
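As a concrete illustration, here is a minimal sketch of that workflow. The model repository path and ports are placeholders, and the `Options`/constructor shapes are assumptions modeled on the package's quickstart rather than a verbatim excerpt of it:

```python
import tritonserver
from tritonfrontend import KServeGrpc, KServeHttp

# Start Triton's in-process core against a local model repository
# (path is a placeholder).
server = tritonserver.Server(model_repository="/path/to/models").start(
    wait_until_ready=True
)

# Attach the KServe HTTP and gRPC frontends to the running core.
# The Options fields shown (e.g. port) are assumed, not verified.
http_service = KServeHttp(server, KServeHttp.Options(port=8000))
grpc_service = KServeGrpc(server, KServeGrpc.Options(port=8001))
http_service.start()
grpc_service.start()

# ...issue requests with tritonclient against localhost:8000 / localhost:8001...

grpc_service.stop()
http_service.stop()
server.stop()
```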
312 changes: 240 additions & 72 deletions qa/L0_cuda_shared_memory/cuda_shared_memory_test.py
@@ -31,18 +31,20 @@
sys.path.append("../common")

import os
+import time
import unittest
+from functools import partial

import infer_util as iu
import numpy as np
import test_util as tu
import tritonclient.grpc as grpcclient
import tritonclient.http as httpclient
-import tritonshmutils.cuda_shared_memory as cshm
+import tritonclient.utils.cuda_shared_memory as cshm
from tritonclient.utils import *


-class CudaSharedMemoryTest(tu.TestResultCollector):
+class CudaSharedMemoryTestBase(tu.TestResultCollector):
    DEFAULT_SHM_BYTE_SIZE = 64

    def setUp(self):
@@ -61,76 +63,6 @@ def _setup_client(self):
            self.url, verbose=True
        )

-    def test_invalid_create_shm(self):
-        # Raises an error since we tried to create an invalid cuda shared memory region
-        try:
-            shm_op0_handle = cshm.create_shared_memory_region("dummy_data", -1, 0)
-            cshm.destroy_shared_memory_region(shm_op0_handle)
-        except Exception as ex:
-            self.assertEqual(str(ex), "unable to create cuda shared memory handle")
-
-    def test_valid_create_set_register(self):
-        # Create a valid cuda shared memory region, fill data in it and register
-        shm_op0_handle = cshm.create_shared_memory_region("dummy_data", 8, 0)
-        cshm.set_shared_memory_region(
-            shm_op0_handle, [np.array([1, 2], dtype=np.float32)]
-        )
-        self.triton_client.register_cuda_shared_memory(
-            "dummy_data", cshm.get_raw_handle(shm_op0_handle), 0, 8
-        )
-        shm_status = self.triton_client.get_cuda_shared_memory_status()
-        if self.protocol == "http":
-            self.assertEqual(len(shm_status), 1)
-        else:
-            self.assertEqual(len(shm_status.regions), 1)
-        cshm.destroy_shared_memory_region(shm_op0_handle)
-
-    def test_unregister_before_register(self):
-        # Create a valid cuda shared memory region and unregister before register
-        shm_op0_handle = cshm.create_shared_memory_region("dummy_data", 8, 0)
-        self.triton_client.unregister_cuda_shared_memory("dummy_data")
-        shm_status = self.triton_client.get_cuda_shared_memory_status()
-        if self.protocol == "http":
-            self.assertEqual(len(shm_status), 0)
-        else:
-            self.assertEqual(len(shm_status.regions), 0)
-        cshm.destroy_shared_memory_region(shm_op0_handle)
-
-    def test_unregister_after_register(self):
-        # Create a valid cuda shared memory region and unregister after register
-        shm_op0_handle = cshm.create_shared_memory_region("dummy_data", 8, 0)
-        self.triton_client.register_cuda_shared_memory(
-            "dummy_data", cshm.get_raw_handle(shm_op0_handle), 0, 8
-        )
-        self.triton_client.unregister_cuda_shared_memory("dummy_data")
-        shm_status = self.triton_client.get_cuda_shared_memory_status()
-        if self.protocol == "http":
-            self.assertEqual(len(shm_status), 0)
-        else:
-            self.assertEqual(len(shm_status.regions), 0)
-        cshm.destroy_shared_memory_region(shm_op0_handle)
-
-    def test_reregister_after_register(self):
-        # Create a valid cuda shared memory region and attempt to register it a second time
-        shm_op0_handle = cshm.create_shared_memory_region("dummy_data", 8, 0)
-        self.triton_client.register_cuda_shared_memory(
-            "dummy_data", cshm.get_raw_handle(shm_op0_handle), 0, 8
-        )
-        try:
-            self.triton_client.register_cuda_shared_memory(
-                "dummy_data", cshm.get_raw_handle(shm_op0_handle), 0, 8
-            )
-        except Exception as ex:
-            self.assertIn(
-                "shared memory region 'dummy_data' already in manager", str(ex)
-            )
-        shm_status = self.triton_client.get_cuda_shared_memory_status()
-        if self.protocol == "http":
-            self.assertEqual(len(shm_status), 1)
-        else:
-            self.assertEqual(len(shm_status.regions), 1)
-        cshm.destroy_shared_memory_region(shm_op0_handle)

    def _configure_server(
        self,
        create_byte_size=DEFAULT_SHM_BYTE_SIZE,
@@ -205,6 +137,78 @@ def _cleanup_server(self, shm_handles):
        for shm_handle in shm_handles:
            cshm.destroy_shared_memory_region(shm_handle)


class CudaSharedMemoryTest(CudaSharedMemoryTestBase):
    def test_invalid_create_shm(self):
        # Raises an error since we tried to create an invalid cuda shared memory region
        try:
            shm_op0_handle = cshm.create_shared_memory_region("dummy_data", -1, 0)
            cshm.destroy_shared_memory_region(shm_op0_handle)
        except Exception as ex:
            self.assertEqual(str(ex), "unable to create cuda shared memory handle")

    def test_valid_create_set_register(self):
        # Create a valid cuda shared memory region, fill data in it and register
        shm_op0_handle = cshm.create_shared_memory_region("dummy_data", 8, 0)
        cshm.set_shared_memory_region(
            shm_op0_handle, [np.array([1, 2], dtype=np.float32)]
        )
        self.triton_client.register_cuda_shared_memory(
            "dummy_data", cshm.get_raw_handle(shm_op0_handle), 0, 8
        )
        shm_status = self.triton_client.get_cuda_shared_memory_status()
        if self.protocol == "http":
            self.assertEqual(len(shm_status), 1)
        else:
            self.assertEqual(len(shm_status.regions), 1)
        cshm.destroy_shared_memory_region(shm_op0_handle)

    def test_unregister_before_register(self):
        # Create a valid cuda shared memory region and unregister before register
        shm_op0_handle = cshm.create_shared_memory_region("dummy_data", 8, 0)
        self.triton_client.unregister_cuda_shared_memory("dummy_data")
        shm_status = self.triton_client.get_cuda_shared_memory_status()
        if self.protocol == "http":
            self.assertEqual(len(shm_status), 0)
        else:
            self.assertEqual(len(shm_status.regions), 0)
        cshm.destroy_shared_memory_region(shm_op0_handle)

    def test_unregister_after_register(self):
        # Create a valid cuda shared memory region and unregister after register
        shm_op0_handle = cshm.create_shared_memory_region("dummy_data", 8, 0)
        self.triton_client.register_cuda_shared_memory(
            "dummy_data", cshm.get_raw_handle(shm_op0_handle), 0, 8
        )
        self.triton_client.unregister_cuda_shared_memory("dummy_data")
        shm_status = self.triton_client.get_cuda_shared_memory_status()
        if self.protocol == "http":
            self.assertEqual(len(shm_status), 0)
        else:
            self.assertEqual(len(shm_status.regions), 0)
        cshm.destroy_shared_memory_region(shm_op0_handle)

    def test_reregister_after_register(self):
        # Create a valid cuda shared memory region and attempt to register it a second time
        shm_op0_handle = cshm.create_shared_memory_region("dummy_data", 8, 0)
        self.triton_client.register_cuda_shared_memory(
            "dummy_data", cshm.get_raw_handle(shm_op0_handle), 0, 8
        )
        try:
            self.triton_client.register_cuda_shared_memory(
                "dummy_data", cshm.get_raw_handle(shm_op0_handle), 0, 8
            )
        except Exception as ex:
            self.assertIn(
                "shared memory region 'dummy_data' already in manager", str(ex)
            )
        shm_status = self.triton_client.get_cuda_shared_memory_status()
        if self.protocol == "http":
            self.assertEqual(len(shm_status), 1)
        else:
            self.assertEqual(len(shm_status.regions), 1)
        cshm.destroy_shared_memory_region(shm_op0_handle)

    def test_unregister_after_inference(self):
        # Unregister after inference
        error_msg = []
@@ -396,5 +400,169 @@ def test_infer_byte_size_out_of_bound(self):
        self._cleanup_server(shm_handles)


class TestCudaSharedMemoryUnregister(CudaSharedMemoryTestBase):
    def _test_unregister_shm_fail(self):
        second_client = httpclient.InferenceServerClient("localhost:8000", verbose=True)

        with self.assertRaises(InferenceServerException) as ex:
            second_client.unregister_cuda_shared_memory()
        self.assertIn(
            "Failed to unregister the following cuda shared memory regions: input0_data ,input1_data ,output0_data ,output1_data",
            str(ex.exception),
        )

        with self.assertRaises(InferenceServerException) as ex:
            second_client.unregister_cuda_shared_memory("input0_data")
        self.assertIn(
            "Cannot unregister shared memory region 'input0_data', it is currently in use.",
            str(ex.exception),
        )

        with self.assertRaises(InferenceServerException) as ex:
            second_client.unregister_cuda_shared_memory("input1_data")
        self.assertIn(
            "Cannot unregister shared memory region 'input1_data', it is currently in use.",
            str(ex.exception),
        )

        with self.assertRaises(InferenceServerException) as ex:
            second_client.unregister_cuda_shared_memory("output0_data")
        self.assertIn(
            "Cannot unregister shared memory region 'output0_data', it is currently in use.",
            str(ex.exception),
        )

        with self.assertRaises(InferenceServerException) as ex:
            second_client.unregister_cuda_shared_memory("output1_data")
        self.assertIn(
            "Cannot unregister shared memory region 'output1_data', it is currently in use.",
            str(ex.exception),
        )

    def _test_shm_not_found(self):
        second_client = httpclient.InferenceServerClient("localhost:8000", verbose=True)

        with self.assertRaises(InferenceServerException) as ex:
            second_client.get_cuda_shared_memory_status("input0_data")
        self.assertIn(
            "Unable to find cuda shared memory region: 'input0_data'",
            str(ex.exception),
        )

        with self.assertRaises(InferenceServerException) as ex:
            second_client.get_cuda_shared_memory_status("input1_data")
        self.assertIn(
            "Unable to find cuda shared memory region: 'input1_data'",
            str(ex.exception),
        )

        with self.assertRaises(InferenceServerException) as ex:
            second_client.get_cuda_shared_memory_status("output0_data")
        self.assertIn(
            "Unable to find cuda shared memory region: 'output0_data'",
            str(ex.exception),
        )

        with self.assertRaises(InferenceServerException) as ex:
            second_client.get_cuda_shared_memory_status("output1_data")
        self.assertIn(
            "Unable to find cuda shared memory region: 'output1_data'",
            str(ex.exception),
        )

    def test_unregister_shm_during_inference_http(self):
        try:
            self.triton_client.unregister_cuda_shared_memory()
            shm_handles = self._configure_server()

            inputs = [
                httpclient.InferInput("INPUT0", [1, 16], "INT32"),
                httpclient.InferInput("INPUT1", [1, 16], "INT32"),
            ]
            outputs = [
                httpclient.InferRequestedOutput("OUTPUT0", binary_data=True),
                httpclient.InferRequestedOutput("OUTPUT1", binary_data=False),
            ]

            inputs[0].set_shared_memory("input0_data", self.DEFAULT_SHM_BYTE_SIZE)
            inputs[1].set_shared_memory("input1_data", self.DEFAULT_SHM_BYTE_SIZE)
            outputs[0].set_shared_memory("output0_data", self.DEFAULT_SHM_BYTE_SIZE)
            outputs[1].set_shared_memory("output1_data", self.DEFAULT_SHM_BYTE_SIZE)

            async_request = self.triton_client.async_infer(
                model_name="simple", inputs=inputs, outputs=outputs
            )

            # Ensure inference has started
            time.sleep(2)

            # Try to unregister shm regions during inference
            self._test_unregister_shm_fail()

            # Blocking call
            async_request.get_result()

            # Try to unregister shm regions after inference
            self.triton_client.unregister_cuda_shared_memory()
            self._test_shm_not_found()

        finally:
            self._cleanup_server(shm_handles)

    def test_unregister_shm_during_inference_grpc(self):
        try:
            self.triton_client.unregister_cuda_shared_memory()
            shm_handles = self._configure_server()

            inputs = [
                grpcclient.InferInput("INPUT0", [1, 16], "INT32"),
                grpcclient.InferInput("INPUT1", [1, 16], "INT32"),
            ]
            outputs = [
                grpcclient.InferRequestedOutput("OUTPUT0"),
                grpcclient.InferRequestedOutput("OUTPUT1"),
            ]

            inputs[0].set_shared_memory("input0_data", self.DEFAULT_SHM_BYTE_SIZE)
            inputs[1].set_shared_memory("input1_data", self.DEFAULT_SHM_BYTE_SIZE)
            outputs[0].set_shared_memory("output0_data", self.DEFAULT_SHM_BYTE_SIZE)
            outputs[1].set_shared_memory("output1_data", self.DEFAULT_SHM_BYTE_SIZE)

            def callback(user_data, result, error):
                if error:
                    user_data.append(error)
                else:
                    user_data.append(result)

            user_data = []

            self.triton_client.async_infer(
                model_name="simple",
                inputs=inputs,
                outputs=outputs,
                callback=partial(callback, user_data),
            )

            # Ensure inference has started
            time.sleep(2)

            # Try to unregister shm regions during inference
            self._test_unregister_shm_fail()

            # Wait until the results are available in user_data
            time_out = 20
            while (len(user_data) == 0) and time_out > 0:
                time_out = time_out - 1
                time.sleep(1)
            time.sleep(2)

            # Try to unregister shm regions after inference
            self.triton_client.unregister_cuda_shared_memory()
            self._test_shm_not_found()

        finally:
            self._cleanup_server(shm_handles)


if __name__ == "__main__":
    unittest.main()
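For readers skimming the new tests, the client-side CUDA shared memory flow they exercise reduces to a handful of `cshm` and client calls. A condensed sketch under stated assumptions: a hypothetical one-input model (`my_model`, `INPUT0`), unlike the two-input `simple` model used above:

```python
import numpy as np
import tritonclient.http as httpclient
import tritonclient.utils.cuda_shared_memory as cshm

# Hypothetical one-input model; names, shape, and dtype are illustrative.
client = httpclient.InferenceServerClient("localhost:8000")
data = np.ones([1, 16], dtype=np.int32)
byte_size = data.size * data.itemsize

# Allocate a region on GPU 0, copy the input into it, and register it
# with the server under a name that requests can refer to.
handle = cshm.create_shared_memory_region("input0_data", byte_size, 0)
cshm.set_shared_memory_region(handle, [data])
client.register_cuda_shared_memory(
    "input0_data", cshm.get_raw_handle(handle), 0, byte_size
)

# Point the request input at the registered region instead of inline bytes.
infer_input = httpclient.InferInput("INPUT0", [1, 16], "INT32")
infer_input.set_shared_memory("input0_data", byte_size)
result = client.infer(model_name="my_model", inputs=[infer_input])

# Unregister and destroy the region once no inference is using it.
client.unregister_cuda_shared_memory("input0_data")
cshm.destroy_shared_memory_region(handle)
```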