
fix(llmobs): fix input token counting for bedrock prompt caching #13919

Merged
merged 9 commits on Jul 15, 2025

Changes from all commits
4 changes: 4 additions & 0 deletions ddtrace/llmobs/_integrations/bedrock.py
@@ -26,6 +26,7 @@
from ddtrace.llmobs._integrations.bedrock_agents import _create_or_update_bedrock_trace_step_span
from ddtrace.llmobs._integrations.bedrock_agents import _extract_trace_step_id
from ddtrace.llmobs._integrations.bedrock_agents import translate_bedrock_trace
from ddtrace.llmobs._integrations.bedrock_utils import normalize_input_tokens
from ddtrace.llmobs._integrations.utils import get_final_message_converse_stream_message
from ddtrace.llmobs._integrations.utils import get_messages_from_converse_content
from ddtrace.llmobs._integrations.utils import update_proxy_workflow_input_output_value
@@ -81,6 +82,8 @@ def _llmobs_set_tags(
if ctx.get_item("llmobs.usage"):
usage_metrics = ctx["llmobs.usage"]

normalize_input_tokens(usage_metrics)

if "total_tokens" not in usage_metrics and (
"input_tokens" in usage_metrics or "output_tokens" in usage_metrics
):
@@ -324,6 +327,7 @@ def _converse_output_stream_processor() -> (
if not messages:
messages.append({"role": "assistant", "content": ""})

normalize_input_tokens(usage_metrics)
return messages, metadata, usage_metrics

@staticmethod
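Both call sites run the normalization before anything else consumes `usage_metrics`: in `_llmobs_set_tags` it happens ahead of the `total_tokens` fallback, and in `_converse_output_stream_processor` it happens just before the accumulated usage is returned. The sketch below shows that intended ordering only; `_finalize_usage` is a hypothetical name, the body of the `total_tokens` branch is elided in the hunk above (deriving the total as input plus output is an assumption), and the helper is the one added in `bedrock_utils.py` further down.

    from ddtrace.llmobs._integrations.bedrock_utils import normalize_input_tokens


    def _finalize_usage(usage_metrics: dict) -> dict:
        # Fold cache read/write tokens back into input_tokens first ...
        normalize_input_tokens(usage_metrics)

        # ... so that a total derived afterwards reflects the full prompt size.
        # (Assumed fallback: the branch body is not shown in the diff above.)
        if "total_tokens" not in usage_metrics and (
            "input_tokens" in usage_metrics or "output_tokens" in usage_metrics
        ):
            usage_metrics["total_tokens"] = (
                usage_metrics.get("input_tokens", 0) + usage_metrics.get("output_tokens", 0)
            )
        return usage_metrics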
20 changes: 20 additions & 0 deletions ddtrace/llmobs/_integrations/bedrock_utils.py
@@ -1,3 +1,8 @@
from ddtrace.llmobs._constants import CACHE_READ_INPUT_TOKENS_METRIC_KEY
from ddtrace.llmobs._constants import CACHE_WRITE_INPUT_TOKENS_METRIC_KEY
from ddtrace.llmobs._constants import INPUT_TOKENS_METRIC_KEY


_MODEL_TYPE_IDENTIFIERS = (
"foundation-model/",
"custom-model/",
@@ -42,3 +47,18 @@ def parse_model_id(model_id: str):
return model_meta[-2], model_meta[-1]
return "custom", model_id
return "custom", "custom"


def normalize_input_tokens(usage_metrics: dict) -> None:
"""
`input_tokens` in Bedrock's response usage metadata is the number of non-cached tokens. We normalize it to mean
the total tokens sent to the model to be consistent with other model providers.

Args:
usage_metrics: Dictionary containing token usage metrics that will be modified in-place
"""
if CACHE_READ_INPUT_TOKENS_METRIC_KEY in usage_metrics or CACHE_WRITE_INPUT_TOKENS_METRIC_KEY in usage_metrics:
input_tokens = usage_metrics.get(INPUT_TOKENS_METRIC_KEY, 0)
cache_read_tokens = usage_metrics.get(CACHE_READ_INPUT_TOKENS_METRIC_KEY, 0)
cache_write_tokens = usage_metrics.get(CACHE_WRITE_INPUT_TOKENS_METRIC_KEY, 0)
usage_metrics[INPUT_TOKENS_METRIC_KEY] = input_tokens + cache_read_tokens + cache_write_tokens
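As a quick illustration of the helper's behavior, here is a hedged usage sketch built from the first cached request in the tests below. It assumes the metric-key constants resolve to the literal strings used in the test expectations ("input_tokens", "cache_read_input_tokens", "cache_write_input_tokens"); the cache-read value of 0 is inferred from the totals rather than shown directly in the hunk.

    from ddtrace.llmobs._integrations.bedrock_utils import normalize_input_tokens

    # Usage as Bedrock reports it: 11 non-cached input tokens,
    # 1,028 tokens written to the prompt cache, none read from it.
    usage = {
        "input_tokens": 11,
        "output_tokens": 264,
        "cache_write_input_tokens": 1028,
        "cache_read_input_tokens": 0,
    }

    normalize_input_tokens(usage)

    # input_tokens now counts the full prompt: 11 + 1028 + 0 = 1039.
    assert usage["input_tokens"] == 1039

When neither cache metric is present, the guard leaves `input_tokens` untouched, so requests that do not use prompt caching are unaffected.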
@@ -0,0 +1,4 @@ (new release note file)
---
fixes:
- |
LLM Observability: Addresses an upstream issue with Bedrock prompt caching, where input tokens are reported as the number of non-cached tokens rather than the total tokens sent to the model. With this fix, LLM Observability counts input tokens as that total, including cache read and cache write prompt tokens.
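In other words, whenever either cache metric is present, the reported value is replaced by the sum input_tokens + cache_read_input_tokens + cache_write_input_tokens, so the metric lines up with what other providers report as the prompt size.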
8 changes: 4 additions & 4 deletions tests/contrib/botocore/test_bedrock_llmobs.py
@@ -446,7 +446,7 @@ def test_llmobs_converse_prompt_caching(self, bedrock_client, request_vcr, mock_
"temperature": 0.7,
},
token_metrics={
"input_tokens": 11,
"input_tokens": 1039,
"output_tokens": 264,
"total_tokens": 1303,
"cache_write_input_tokens": 1028,
@@ -470,7 +470,7 @@ def test_llmobs_converse_prompt_caching(self, bedrock_client, request_vcr, mock_
"temperature": 0.7,
},
token_metrics={
"input_tokens": 12,
"input_tokens": 1040,
"output_tokens": 185,
"total_tokens": 1225,
"cache_write_input_tokens": 0,
@@ -526,7 +526,7 @@ def test_llmobs_converse_stream_prompt_caching(self, bedrock_client, request_vcr
"temperature": 0.7,
},
token_metrics={
"input_tokens": 11,
"input_tokens": 1039,
"output_tokens": 236,
"total_tokens": 1275,
"cache_write_input_tokens": 1028,
@@ -549,7 +549,7 @@ def test_llmobs_converse_stream_prompt_caching(self, bedrock_client, request_vcr
"temperature": 0.7,
},
token_metrics={
"input_tokens": 12,
"input_tokens": 1040,
"output_tokens": 250,
"total_tokens": 1290,
"cache_write_input_tokens": 0,
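The updated expectations are exactly what the normalization produces. A quick arithmetic check against the cache metrics shown in the hunks above (the second request's cache-read count is not visible, but 1,028 is implied by its total and output token counts):

    # First (cache-write) request: 11 reported + 1028 written + 0 read = 1039 input tokens.
    assert 11 + 1028 + 0 == 1039
    assert 1039 + 264 == 1303  # matches the expected total_tokens

    # Second (cache-read) request: 12 reported + 0 written + 1028 read = 1040 input tokens.
    # (1028 is implied: total_tokens 1225 - output_tokens 185 = 1040.)
    assert 12 + 0 + 1028 == 1040
    assert 1040 + 185 == 1225

    # The streamed variants normalize the same way.
    assert 1039 + 236 == 1275
    assert 1040 + 250 == 1290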