triton-inference-server · yinggeh · Feb 14, 2026 · Feb 17, 2026 · Feb 21, 2026 · Copilot
diff --git a/src/c++/library/http_client.cc b/src/c++/library/http_client.cc
@@ -1,4 +1,4 @@
-// Copyright 2020-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// Copyright 2020-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 //
 // Redistribution and use in source and binary forms, with or without
 // modification, are permitted provided that the following conditions
@@ -648,7 +648,6 @@ HttpInferRequest::ConvertBinaryInputToJSON(
     return Error(
         "datatype '" + datatype +
         "' is not supported with JSON. Please use the binary data format");
-
   } else if (datatype == "FP32") {
     for (size_t i = 0; i < element_count; i++) {
       data_json.AppendDouble(reinterpret_cast<const float*>(buf)[i]);

diff --git a/src/python/library/requirements/requirements.txt b/src/python/library/requirements/requirements.txt
@@ -1,4 +1,4 @@
-# Copyright 2020-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright 2020-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 #
 # Redistribution and use in source and binary forms, with or without
 # modification, are permitted provided that the following conditions
@@ -24,6 +24,7 @@
 # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
+ml_dtypes>=0.5.4
 numpy>=1.19.1
 python-rapidjson>=0.9.1
 urllib3>=2.0.7
diff --git a/src/python/library/tritonclient/grpc/_infer_input.py b/src/python/library/tritonclient/grpc/_infer_input.py
@@ -1,6 +1,6 @@
 #!/usr/bin/env python3
 
-# Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright 2023-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 #
 # Redistribution and use in source and binary forms, with or without
 # modification, are permitted provided that the following conditions
@@ -124,22 +124,13 @@ def set_data_from_numpy(self, input_tensor):
         """
         if not isinstance(input_tensor, (np.ndarray,)):
             raise_error("input_tensor must be a numpy array")
-        # DLIS-3986: Special handling for bfloat16 until Numpy officially supports it
-        if self._input.datatype == "BF16":
-            if input_tensor.dtype != triton_to_np_dtype(self._input.datatype):
-                raise_error(
-                    "got unexpected datatype {} from numpy array, expected {} for BF16 type".format(
-                        input_tensor.dtype, triton_to_np_dtype(self._input.datatype)
-                    )
-                )
-        else:
-            dtype = np_to_triton_dtype(input_tensor.dtype)
-            if self._input.datatype != dtype:
-                raise_error(
-                    "got unexpected datatype {} from numpy array, expected {}".format(
-                        dtype, self._input.datatype
-                    )
+        dtype = np_to_triton_dtype(input_tensor.dtype)
+        if self._input.datatype != dtype:
+            raise_error(
+                "got unexpected datatype {} from numpy array, expected {}".format(
+                    dtype, self._input.datatype
                 )
+            )
         valid_shape = True
         if len(self._input.shape) != len(input_tensor.shape):
             valid_shape = False
@@ -163,12 +154,6 @@ def set_data_from_numpy(self, input_tensor):
                 self._raw_content = serialized_output.item()
             else:
                 self._raw_content = b""
-        elif self._input.datatype == "BF16":
-            serialized_output = serialize_bf16_tensor(input_tensor)
-            if serialized_output.size > 0:
-                self._raw_content = serialized_output.item()
-            else:
-                self._raw_content = b""
         else:
             self._raw_content = input_tensor.tobytes()
         return self

diff --git a/src/python/library/tritonclient/grpc/_infer_result.py b/src/python/library/tritonclient/grpc/_infer_result.py
@@ -1,6 +1,6 @@
 #!/usr/bin/env python3
 
-# Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright 2023-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 #
 # Redistribution and use in source and binary forms, with or without
 # modification, are permitted provided that the following conditions
@@ -77,10 +77,6 @@ def as_numpy(self, name):
                         np_array = deserialize_bytes_tensor(
                             self._result.raw_output_contents[index]
                         )
-                    elif datatype == "BF16":
-                        np_array = deserialize_bf16_tensor(
-                            self._result.raw_output_contents[index]
-                        )
                     else:
                         np_array = np.frombuffer(
                             self._result.raw_output_contents[index],

diff --git a/src/python/library/tritonclient/http/_infer_input.py b/src/python/library/tritonclient/http/_infer_input.py
@@ -1,6 +1,6 @@
 #!/usr/bin/env python3
 
-# Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright 2023-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 #
 # Redistribution and use in source and binary forms, with or without
 # modification, are permitted provided that the following conditions
@@ -26,13 +26,7 @@
 # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 import numpy as np
-from tritonclient.utils import (
-    np_to_triton_dtype,
-    raise_error,
-    serialize_bf16_tensor,
-    serialize_byte_tensor,
-    triton_to_np_dtype,
-)
+from tritonclient.utils import np_to_triton_dtype, raise_error, serialize_byte_tensor
 
 
 class InferInput:
@@ -129,22 +123,13 @@ def set_data_from_numpy(self, input_tensor, binary_data=True):
         """
         if not isinstance(input_tensor, (np.ndarray,)):
             raise_error("input_tensor must be a numpy array")
-        # DLIS-3986: Special handling for bfloat16 until Numpy officially supports it
-        if self._datatype == "BF16":
-            if input_tensor.dtype != triton_to_np_dtype(self._datatype):
-                raise_error(
-                    "got unexpected datatype {} from numpy array, expected {} for BF16 type".format(
-                        input_tensor.dtype, triton_to_np_dtype(self._datatype)
-                    )
-                )
-        else:
-            dtype = np_to_triton_dtype(input_tensor.dtype)
-            if self._datatype != dtype:
-                raise_error(
-                    "got unexpected datatype {} from numpy array, expected {}".format(
-                        dtype, self._datatype
-                    )
+        dtype = np_to_triton_dtype(input_tensor.dtype)
+        if self._datatype != dtype:
+            raise_error(
+                "got unexpected datatype {} from numpy array, expected {}".format(
+                    dtype, self._datatype
                 )
+            )
         valid_shape = True
         if len(self._shape) != len(input_tensor.shape):
             valid_shape = False
@@ -202,12 +187,6 @@ def set_data_from_numpy(self, input_tensor, binary_data=True):
                     self._raw_data = serialized_output.item()
                 else:
                     self._raw_data = b""
-            elif self._datatype == "BF16":
-                serialized_output = serialize_bf16_tensor(input_tensor)
-                if serialized_output.size > 0:
-                    self._raw_data = serialized_output.item()
-                else:
-                    self._raw_data = b""
             else:
                 self._raw_data = input_tensor.tobytes()
             self._parameters["binary_data_size"] = len(self._raw_data)

diff --git a/src/python/library/tritonclient/http/_infer_result.py b/src/python/library/tritonclient/http/_infer_result.py
@@ -1,6 +1,6 @@
 #!/usr/bin/env python3
 
-# Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright 2023-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 #
 # Redistribution and use in source and binary forms, with or without
 # modification, are permitted provided that the following conditions
@@ -30,12 +30,7 @@
 
 import numpy as np
 import rapidjson as json
-from tritonclient.utils import (
-    deserialize_bf16_tensor,
-    deserialize_bytes_tensor,
-    raise_error,
-    triton_to_np_dtype,
-)
+from tritonclient.utils import deserialize_bytes_tensor, raise_error, triton_to_np_dtype
 
 
 class InferResult:
@@ -190,10 +185,6 @@ def as_numpy(self, name):
                                     np_array = deserialize_bytes_tensor(
                                         self._buffer[start_index:end_index]
                                     )
-                                elif datatype == "BF16":
-                                    np_array = deserialize_bf16_tensor(
-                                        self._buffer[start_index:end_index]
-                                    )
                                 else:
                                     np_array = np.frombuffer(
                                         self._buffer[start_index:end_index],

diff --git a/src/python/library/tritonclient/utils/__init__.py b/src/python/library/tritonclient/utils/__init__.py
@@ -1,6 +1,6 @@
 #!/usr/bin/env python3
 
-# Copyright 2020-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright 2020-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 #
 # Redistribution and use in source and binary forms, with or without
 # modification, are permitted provided that the following conditions
@@ -28,6 +28,7 @@
 
 import struct
 
+import ml_dtypes
 import numpy as np
 
 from ._shared_memory_tensor import SharedMemoryTensor
@@ -149,6 +150,8 @@ def np_to_triton_dtype(np_dtype):
         return "UINT32"
     elif np_dtype == np.uint64:
         return "UINT64"
+    elif np_dtype == ml_dtypes.bfloat16:
+        return "BF16"
     elif np_dtype == np.float16:
         return "FP16"
     elif np_dtype == np.float32:
@@ -179,9 +182,11 @@ def triton_to_np_dtype(dtype):
         return np.uint32
     elif dtype == "UINT64":
         return np.uint64
+    elif dtype == "BF16":
+        return ml_dtypes.bfloat16
     elif dtype == "FP16":
         return np.float16
-    elif dtype == "FP32" or dtype == "BF16":
+    elif dtype == "FP32":
         return np.float32
     elif dtype == "FP64":
         return np.float64
@@ -274,75 +279,3 @@ def deserialize_bytes_tensor(encoded_tensor):
         offset += l
         strs.append(sb)
     return np.array(strs, dtype=np.object_)
-
-
-def serialize_bf16_tensor(input_tensor):
-    """
-    Serializes a bfloat16 tensor into a flat numpy array of bytes.
-    The numpy array should use dtype of np.float32.
-
-    Parameters
-    ----------
-    input_tensor : np.array
-        The bfloat16 tensor to serialize.
-
-    Returns
-    -------
-    serialized_bf16_tensor : np.array
-        The 1-D numpy array of type uint8 containing the serialized bytes in row-major form.
-
-    Raises
-    ------
-    InferenceServerException
-        If unable to serialize the given tensor.
-    """
-
-    if input_tensor.size == 0:
-        return np.empty([0], dtype=np.object_)
-
-    # If the input is a tensor of float32, then must flatten those into
-    # a 1-dimensional array containing the element bytes. All elements
-    # are concatenated together in row-major order.
-
-    if input_tensor.dtype != np.float32:
-        raise_error("cannot serialize bf16 tensor: invalid datatype")
-
-    flattened_ls = []
-    # 'C' order is row-major.
-    for obj in np.nditer(input_tensor, flags=["refs_ok"], order="C"):
-        # To truncate the float32 to a bfloat16, we need the high-order bits.
-        obj_bytes = struct.pack("<f", obj)[2:4]
-        flattened_ls.append(obj_bytes)
-    flattened = b"".join(flattened_ls)
-    flattened_array = np.asarray(flattened, dtype=np.object_)
-    if not flattened_array.flags["C_CONTIGUOUS"]:
-        flattened_array = np.ascontiguousarray(flattened_array, dtype=np.object_)
-    return flattened_array
-
-
-def deserialize_bf16_tensor(encoded_tensor):
-    """
-    Deserializes an encoded bf16 tensor into a
-    numpy array of dtype of python objects
-
-    Parameters
-    ----------
-    encoded_tensor : bytes
-        The encoded bytes tensor where each element
-        is 2 bytes (size of bfloat16)
-    Returns
-    -------
-    string_tensor : np.array
-        The 1-D numpy array of type float32 containing the
-        deserialized bytes in row-major form.
-
-    """
-    strs = list()
-    offset = 0
-    val_buf = encoded_tensor
-    while offset < len(val_buf):
-        sb = struct.unpack_from("<2s", val_buf, offset)[0]
-        # Bfloat16 contains 2 bytes
-        offset += 2
-        strs.append(struct.unpack("<f", (b"\x00\x00" + sb)))
-    return np.array(strs, dtype=np.float32)