Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 1 addition & 2 deletions src/c++/library/http_client.cc
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
// Copyright 2020-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// Copyright 2020-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
Expand Down Expand Up @@ -648,7 +648,6 @@ HttpInferRequest::ConvertBinaryInputToJSON(
return Error(
"datatype '" + datatype +
"' is not supported with JSON. Please use the binary data format");

} else if (datatype == "FP32") {
for (size_t i = 0; i < element_count; i++) {
data_json.AppendDouble(reinterpret_cast<const float*>(buf)[i]);
Expand Down
3 changes: 2 additions & 1 deletion src/python/library/requirements/requirements.txt
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright 2020-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# Copyright 2020-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
Expand All @@ -24,6 +24,7 @@
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

ml_dtypes>=0.5.4
numpy>=1.19.1
python-rapidjson>=0.9.1
urllib3>=2.0.7
29 changes: 7 additions & 22 deletions src/python/library/tritonclient/grpc/_infer_input.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
#!/usr/bin/env python3

# Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# Copyright 2023-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
Expand Down Expand Up @@ -124,22 +124,13 @@ def set_data_from_numpy(self, input_tensor):
"""
if not isinstance(input_tensor, (np.ndarray,)):
raise_error("input_tensor must be a numpy array")
# DLIS-3986: Special handling for bfloat16 until Numpy officially supports it
if self._input.datatype == "BF16":
if input_tensor.dtype != triton_to_np_dtype(self._input.datatype):
raise_error(
"got unexpected datatype {} from numpy array, expected {} for BF16 type".format(
input_tensor.dtype, triton_to_np_dtype(self._input.datatype)
)
)
else:
dtype = np_to_triton_dtype(input_tensor.dtype)
if self._input.datatype != dtype:
raise_error(
"got unexpected datatype {} from numpy array, expected {}".format(
dtype, self._input.datatype
)
dtype = np_to_triton_dtype(input_tensor.dtype)
if self._input.datatype != dtype:
raise_error(
"got unexpected datatype {} from numpy array, expected {}".format(
dtype, self._input.datatype
)
)
Comment on lines +127 to +133
Copy link

Copilot AI Feb 17, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This change makes BF16 inputs require an actual BF16 numpy dtype (via ml_dtypes.bfloat16 mapping). Previously, BF16 inputs could be provided as float32 (since triton_to_np_dtype("BF16") mapped to np.float32) and the client handled BF16 serialization. If existing users rely on the float32 workaround, consider accepting both float32 and bfloat16 for BF16 inputs (converting float32 to BF16 bytes) or clearly documenting the breaking behavior change.

Copilot uses AI. Check for mistakes.
valid_shape = True
if len(self._input.shape) != len(input_tensor.shape):
valid_shape = False
Expand All @@ -163,12 +154,6 @@ def set_data_from_numpy(self, input_tensor):
self._raw_content = serialized_output.item()
else:
self._raw_content = b""
elif self._input.datatype == "BF16":
serialized_output = serialize_bf16_tensor(input_tensor)
if serialized_output.size > 0:
self._raw_content = serialized_output.item()
else:
self._raw_content = b""
else:
self._raw_content = input_tensor.tobytes()
return self
Expand Down
6 changes: 1 addition & 5 deletions src/python/library/tritonclient/grpc/_infer_result.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
#!/usr/bin/env python3

# Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# Copyright 2023-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
Expand Down Expand Up @@ -77,10 +77,6 @@ def as_numpy(self, name):
np_array = deserialize_bytes_tensor(
self._result.raw_output_contents[index]
)
elif datatype == "BF16":
np_array = deserialize_bf16_tensor(
self._result.raw_output_contents[index]
)
else:
np_array = np.frombuffer(
self._result.raw_output_contents[index],
Expand Down
37 changes: 8 additions & 29 deletions src/python/library/tritonclient/http/_infer_input.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
#!/usr/bin/env python3

# Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# Copyright 2023-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
Expand All @@ -26,13 +26,7 @@
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
import numpy as np
from tritonclient.utils import (
np_to_triton_dtype,
raise_error,
serialize_bf16_tensor,
serialize_byte_tensor,
triton_to_np_dtype,
)
from tritonclient.utils import np_to_triton_dtype, raise_error, serialize_byte_tensor


class InferInput:
Expand Down Expand Up @@ -129,22 +123,13 @@ def set_data_from_numpy(self, input_tensor, binary_data=True):
"""
if not isinstance(input_tensor, (np.ndarray,)):
raise_error("input_tensor must be a numpy array")
# DLIS-3986: Special handling for bfloat16 until Numpy officially supports it
if self._datatype == "BF16":
if input_tensor.dtype != triton_to_np_dtype(self._datatype):
raise_error(
"got unexpected datatype {} from numpy array, expected {} for BF16 type".format(
input_tensor.dtype, triton_to_np_dtype(self._datatype)
)
)
else:
dtype = np_to_triton_dtype(input_tensor.dtype)
if self._datatype != dtype:
raise_error(
"got unexpected datatype {} from numpy array, expected {}".format(
dtype, self._datatype
)
dtype = np_to_triton_dtype(input_tensor.dtype)
if self._datatype != dtype:
raise_error(
"got unexpected datatype {} from numpy array, expected {}".format(
dtype, self._datatype
)
)
Comment on lines +126 to +132
Copy link

Copilot AI Feb 17, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This change makes BF16 inputs require input_tensor.dtype to map to "BF16" (i.e., ml_dtypes.bfloat16). Previously the client accepted float32 tensors for BF16 (via triton_to_np_dtype("BF16") mapping) and handled conversion/serialization. If backward compatibility is desired, consider allowing float32 for BF16 here (and converting to BF16 bytes) or explicitly documenting this breaking change in the client behavior.

Copilot uses AI. Check for mistakes.
valid_shape = True
if len(self._shape) != len(input_tensor.shape):
valid_shape = False
Expand Down Expand Up @@ -202,12 +187,6 @@ def set_data_from_numpy(self, input_tensor, binary_data=True):
self._raw_data = serialized_output.item()
else:
self._raw_data = b""
elif self._datatype == "BF16":
serialized_output = serialize_bf16_tensor(input_tensor)
if serialized_output.size > 0:
self._raw_data = serialized_output.item()
else:
self._raw_data = b""
else:
self._raw_data = input_tensor.tobytes()
self._parameters["binary_data_size"] = len(self._raw_data)
Expand Down
13 changes: 2 additions & 11 deletions src/python/library/tritonclient/http/_infer_result.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
#!/usr/bin/env python3

# Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# Copyright 2023-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
Expand Down Expand Up @@ -30,12 +30,7 @@

import numpy as np
import rapidjson as json
from tritonclient.utils import (
deserialize_bf16_tensor,
deserialize_bytes_tensor,
raise_error,
triton_to_np_dtype,
)
from tritonclient.utils import deserialize_bytes_tensor, raise_error, triton_to_np_dtype


class InferResult:
Expand Down Expand Up @@ -190,10 +185,6 @@ def as_numpy(self, name):
np_array = deserialize_bytes_tensor(
self._buffer[start_index:end_index]
)
elif datatype == "BF16":
np_array = deserialize_bf16_tensor(
self._buffer[start_index:end_index]
)
else:
np_array = np.frombuffer(
self._buffer[start_index:end_index],
Expand Down
81 changes: 7 additions & 74 deletions src/python/library/tritonclient/utils/__init__.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
#!/usr/bin/env python3

# Copyright 2020-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# Copyright 2020-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
Expand Down Expand Up @@ -28,6 +28,7 @@

import struct

import ml_dtypes
import numpy as np

from ._shared_memory_tensor import SharedMemoryTensor
Expand Down Expand Up @@ -149,6 +150,8 @@ def np_to_triton_dtype(np_dtype):
return "UINT32"
elif np_dtype == np.uint64:
return "UINT64"
elif np_dtype == ml_dtypes.bfloat16:
return "BF16"
elif np_dtype == np.float16:
return "FP16"
elif np_dtype == np.float32:
Expand Down Expand Up @@ -179,9 +182,11 @@ def triton_to_np_dtype(dtype):
return np.uint32
elif dtype == "UINT64":
return np.uint64
elif dtype == "BF16":
return ml_dtypes.bfloat16
elif dtype == "FP16":
return np.float16
elif dtype == "FP32" or dtype == "BF16":
elif dtype == "FP32":
return np.float32
elif dtype == "FP64":
return np.float64
Expand Down Expand Up @@ -274,75 +279,3 @@ def deserialize_bytes_tensor(encoded_tensor):
offset += l
strs.append(sb)
return np.array(strs, dtype=np.object_)


def serialize_bf16_tensor(input_tensor):
    """
    Serializes a float32 tensor into the raw bytes of a bfloat16 tensor.

    Each float32 element is truncated (not rounded) to bfloat16 by keeping
    only its two high-order bytes. Elements are emitted in row-major order,
    two little-endian bytes per element.

    Parameters
    ----------
    input_tensor : np.array
        The tensor to serialize. Must use dtype np.float32.

    Returns
    -------
    serialized_bf16_tensor : np.array
        A 0-dimensional numpy array of dtype np.object_ wrapping a single
        bytes object with the serialized elements; retrieve the bytes with
        ``.item()``. For an empty input an empty np.object_ array of shape
        (0,) is returned instead.

    Raises
    ------
    InferenceServerException
        If unable to serialize the given tensor.
    """

    # Empty tensors are accepted regardless of dtype; callers detect this
    # case through ``size == 0`` on the returned array.
    if input_tensor.size == 0:
        return np.empty([0], dtype=np.object_)

    if input_tensor.dtype != np.float32:
        raise_error("cannot serialize bf16 tensor: invalid datatype")

    # Lay the data out as contiguous little-endian float32 in row-major ('C')
    # order, then reinterpret every element as two little-endian 16-bit
    # halves. The second half of each pair holds the high-order bytes, which
    # are exactly the bfloat16 truncation of the float32 value.
    little_endian = np.ascontiguousarray(input_tensor, dtype="<f4")
    high_halves = little_endian.reshape(-1).view("<u2")[1::2]

    # Wrap the bytes in a 0-d object array to keep the return convention that
    # callers rely on (``.size`` / ``.item()``).
    return np.asarray(high_halves.tobytes(), dtype=np.object_)


def deserialize_bf16_tensor(encoded_tensor):
    """
    Deserializes an encoded bf16 tensor into a numpy array of dtype
    np.float32.

    Each bfloat16 element is widened to float32 by using its two bytes as
    the high-order bytes of a little-endian float32 and zero-filling the
    low-order bytes.

    Parameters
    ----------
    encoded_tensor : bytes
        The encoded bytes tensor where each element
        is 2 bytes (size of bfloat16)

    Returns
    -------
    bf16_tensor : np.array
        The 1-D numpy array of type float32 containing the
        deserialized elements in row-major form.

    Raises
    ------
    struct.error
        If the buffer ends with an incomplete 2-byte element.
    """
    values = []
    for offset in range(0, len(encoded_tensor), 2):
        # Bfloat16 contains 2 bytes
        (high_bytes,) = struct.unpack_from("<2s", encoded_tensor, offset)
        # struct.unpack always returns a tuple; unpack the single float so
        # the result is a flat list and the array is 1-D rather than (N, 1).
        (value,) = struct.unpack("<f", b"\x00\x00" + high_bytes)
        values.append(value)
    return np.array(values, dtype=np.float32)
Loading