Bugfix/usage for openrouter (#11627) #11785

Open · wants to merge 2 commits into base: main

Changes from all commits
6 changes: 4 additions & 2 deletions litellm/cost_calculator.py
@@ -1216,7 +1216,7 @@ def combine_usage_objects(usage_objects: List[Usage]) -> Usage:
Combine multiple Usage objects into a single Usage object, checking model keys for nested values.
"""
from litellm.types.utils import (
CompletionTokensDetails,
CompletionTokensDetailsWrapper,
PromptTokensDetailsWrapper,
Usage,
)
@@ -1271,7 +1271,9 @@ def combine_usage_objects(usage_objects: List[Usage]) -> Usage:
not hasattr(combined, "completion_tokens_details")
or not combined.completion_tokens_details
):
combined.completion_tokens_details = CompletionTokensDetails()
combined.completion_tokens_details = (
CompletionTokensDetailsWrapper()
)

# Check what keys exist in the model's completion_tokens_details
for attr in dir(usage.completion_tokens_details):
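A note on this change: `combine_usage_objects` now seeds an empty `completion_tokens_details` with litellm's `CompletionTokensDetailsWrapper` instead of the plain OpenAI `CompletionTokensDetails` type, so the attributes copied afterwards land on the wrapper type the rest of the cost calculator works with. Below is a minimal sketch of that combination step, assuming `reasoning_tokens` is a settable field on the wrapper, as on the OpenAI type it mirrors:

```python
# Illustrative sketch (not part of the PR): combining two streaming Usage
# objects so nested token details end up on litellm's wrapper type.
from litellm.types.utils import CompletionTokensDetailsWrapper, Usage

first = Usage(prompt_tokens=10, completion_tokens=5, total_tokens=15)
second = Usage(prompt_tokens=0, completion_tokens=7, total_tokens=7)
# Assumption: reasoning_tokens is settable on the wrapper, as on the OpenAI type.
second.completion_tokens_details = CompletionTokensDetailsWrapper()
second.completion_tokens_details.reasoning_tokens = 3

combined = Usage()
combined.prompt_tokens = first.prompt_tokens + second.prompt_tokens
combined.completion_tokens = first.completion_tokens + second.completion_tokens
combined.total_tokens = first.total_tokens + second.total_tokens
# Seed with the wrapper type, as the patched code does, before copying details over.
combined.completion_tokens_details = CompletionTokensDetailsWrapper()
combined.completion_tokens_details.reasoning_tokens = (
    second.completion_tokens_details.reasoning_tokens
)
```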
@@ -119,15 +119,8 @@ async def convert_to_streaming_response_async(response_object: Optional[dict] =
model_response_object.choices = choice_list

if "usage" in response_object and response_object["usage"] is not None:
setattr(
model_response_object,
"usage",
Usage(
completion_tokens=response_object["usage"].get("completion_tokens", 0),
prompt_tokens=response_object["usage"].get("prompt_tokens", 0),
total_tokens=response_object["usage"].get("total_tokens", 0),
),
)
usage_object = Usage(**response_object["usage"])
setattr(model_response_object, "usage", usage_object)

if "id" in response_object:
model_response_object.id = response_object["id"]
@@ -172,10 +165,8 @@ def convert_to_streaming_response(response_object: Optional[dict] = None):
model_response_object.choices = choice_list

if "usage" in response_object and response_object["usage"] is not None:
setattr(model_response_object, "usage", Usage())
model_response_object.usage.completion_tokens = response_object["usage"].get("completion_tokens", 0) # type: ignore
model_response_object.usage.prompt_tokens = response_object["usage"].get("prompt_tokens", 0) # type: ignore
model_response_object.usage.total_tokens = response_object["usage"].get("total_tokens", 0) # type: ignore
usage_object = Usage(**response_object["usage"])
setattr(model_response_object, "usage", usage_object)

if "id" in response_object:
model_response_object.id = response_object["id"]
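Both streaming converters now rebuild `Usage` from the full provider payload instead of copying only the three standard counters, which is what lets OpenRouter-specific fields such as `cost` and `is_byok` survive. A rough before/after sketch, assuming `Usage` keeps keys beyond the standard counters (which is what the cost path relies on):

```python
from litellm.types.utils import Usage

# Hypothetical OpenRouter usage payload from a streaming response.
response_usage = {
    "prompt_tokens": 12,
    "completion_tokens": 34,
    "total_tokens": 46,
    "cost": 0.00042,   # OpenRouter-reported upstream cost
    "is_byok": False,  # bring-your-own-key flag
}

# Old behaviour: only the three standard counters were copied over,
# so "cost" and "is_byok" were silently dropped.
old_usage = Usage(
    completion_tokens=response_usage.get("completion_tokens", 0),
    prompt_tokens=response_usage.get("prompt_tokens", 0),
    total_tokens=response_usage.get("total_tokens", 0),
)

# New behaviour: forward the whole payload so provider-specific fields are
# preserved (assumption: Usage tolerates and keeps the extra keys).
new_usage = Usage(**response_usage)
```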
1 change: 1 addition & 0 deletions litellm/litellm_core_utils/logging_utils.py
@@ -89,6 +89,7 @@ def _assemble_complete_response_from_streaming_chunks(
messages=request_kwargs.get("messages", None),
start_time=start_time,
end_time=end_time,
usage=getattr(result, "usage"),
)
except Exception as e:
log_message = (
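The assembled complete response now receives the usage object taken from the final streamed result, so provider-reported values reach logging instead of being recomputed. One caveat: `getattr(result, "usage")` without a default raises `AttributeError` when the attribute is missing; the self-contained sketch below shows the more defensive spelling (illustrative only, not the PR's code):

```python
# Illustrative only: getattr without a default raises when the attribute is
# missing, while a default of None keeps response assembly non-fatal.
class FakeStreamResult:
    """Stand-in for a final streaming result that lacks a usage attribute."""
    pass

result = FakeStreamResult()

usage = getattr(result, "usage", None)  # -> None instead of AttributeError
print(usage)
```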
152 changes: 113 additions & 39 deletions litellm/litellm_core_utils/streaming_chunk_builder_utils.py
@@ -252,6 +252,8 @@ def get_combined_audio_content(
def _usage_chunk_calculation_helper(self, usage_chunk: Usage) -> dict:
prompt_tokens = 0
completion_tokens = 0
cost: Optional[float] = None
is_byok: Optional[bool] = None
## anthropic prompt caching information ##
cache_creation_input_tokens: Optional[int] = None
cache_read_input_tokens: Optional[int] = None
@@ -262,6 +264,10 @@ def _usage_chunk_calculation_helper(self, usage_chunk: Usage) -> dict:
prompt_tokens = usage_chunk.get("prompt_tokens", 0) or 0
if "completion_tokens" in usage_chunk:
completion_tokens = usage_chunk.get("completion_tokens", 0) or 0
if "cost" in usage_chunk:
cost = usage_chunk.get("cost")
if "is_byok" in usage_chunk:
is_byok = usage_chunk.get("is_byok")
if "cache_creation_input_tokens" in usage_chunk:
cache_creation_input_tokens = usage_chunk.get("cache_creation_input_tokens")
if "cache_read_input_tokens" in usage_chunk:
@@ -286,6 +292,8 @@ def _usage_chunk_calculation_helper(self, usage_chunk: Usage) -> dict:
return {
"prompt_tokens": prompt_tokens,
"completion_tokens": completion_tokens,
"cost": cost,
"is_byok": is_byok,
"cache_creation_input_tokens": cache_creation_input_tokens,
"cache_read_input_tokens": cache_read_input_tokens,
"completion_tokens_details": completion_tokens_details,
@@ -306,26 +314,21 @@ def count_reasoning_tokens(self, response: ModelResponse) -> int:

return reasoning_tokens

def calculate_usage(
self,
chunks: List[Union[Dict[str, Any], ModelResponse]],
model: str,
completion_output: str,
messages: Optional[List] = None,
reasoning_tokens: Optional[int] = None,
) -> Usage:
def _process_usage_chunks(
self, chunks: List[Union[Dict[str, Any], ModelResponse]]
) -> Dict[str, Any]:
"""
Calculate usage for the given chunks.
Process chunks to extract usage information.
"""
returned_usage = Usage()
# # Update usage information if needed
prompt_tokens = 0
completion_tokens = 0
## anthropic prompt caching information ##
cost: Optional[float] = None
is_byok: Optional[bool] = None
cache_creation_input_tokens: Optional[int] = None
cache_read_input_tokens: Optional[int] = None
completion_tokens_details: Optional[CompletionTokensDetails] = None
prompt_tokens_details: Optional[PromptTokensDetails] = None

for chunk in chunks:
usage_chunk: Optional[Usage] = None
if "usage" in chunk:
@@ -348,6 +351,14 @@ def calculate_usage(
and usage_chunk_dict["completion_tokens"] > 0
):
completion_tokens = usage_chunk_dict["completion_tokens"]
if usage_chunk_dict["cost"] is not None and (
usage_chunk_dict["cost"] > 0 or cost is None
):
cost = usage_chunk_dict["cost"]
if usage_chunk_dict["is_byok"] is not None and (
usage_chunk_dict["is_byok"] is True or is_byok is None
):
is_byok = usage_chunk_dict["is_byok"]
if usage_chunk_dict["cache_creation_input_tokens"] is not None and (
usage_chunk_dict["cache_creation_input_tokens"] > 0
or cache_creation_input_tokens is None
@@ -367,39 +378,59 @@
"completion_tokens_details"
]
prompt_tokens_details = usage_chunk_dict["prompt_tokens_details"]
try:
returned_usage.prompt_tokens = prompt_tokens or token_counter(
model=model, messages=messages
)
except (
Exception
): # don't allow this failing to block a complete streaming response from being returned
print_verbose("token_counter failed, assuming prompt tokens is 0")
returned_usage.prompt_tokens = 0
returned_usage.completion_tokens = completion_tokens or token_counter(
model=model,
text=completion_output,
count_response_tokens=True, # count_response_tokens is a Flag to tell token counter this is a response, No need to add extra tokens we do for input messages
)
returned_usage.total_tokens = (
returned_usage.prompt_tokens + returned_usage.completion_tokens
)

if cache_creation_input_tokens is not None:
returned_usage._cache_creation_input_tokens = cache_creation_input_tokens
return {
"prompt_tokens": prompt_tokens,
"completion_tokens": completion_tokens,
"cost": cost,
"is_byok": is_byok,
"cache_creation_input_tokens": cache_creation_input_tokens,
"cache_read_input_tokens": cache_read_input_tokens,
"completion_tokens_details": completion_tokens_details,
"prompt_tokens_details": prompt_tokens_details,
}

def _set_additional_usage_properties(
self,
returned_usage: Usage,
usage_data: Dict[str, Any],
reasoning_tokens: Optional[int] = None,
) -> Usage:
"""
Set additional properties on the usage object.
"""
if usage_data["cost"] is not None:
returned_usage.cost = usage_data["cost"]
if usage_data["is_byok"] is not None:
returned_usage.is_byok = usage_data["is_byok"]

# Set cache tokens
if usage_data["cache_creation_input_tokens"] is not None:
returned_usage._cache_creation_input_tokens = usage_data[
"cache_creation_input_tokens"
]
setattr(
returned_usage,
"cache_creation_input_tokens",
cache_creation_input_tokens,
usage_data["cache_creation_input_tokens"],
) # for anthropic
if cache_read_input_tokens is not None:
returned_usage._cache_read_input_tokens = cache_read_input_tokens
if usage_data["cache_read_input_tokens"] is not None:
returned_usage._cache_read_input_tokens = usage_data[
"cache_read_input_tokens"
]
setattr(
returned_usage, "cache_read_input_tokens", cache_read_input_tokens
returned_usage,
"cache_read_input_tokens",
usage_data["cache_read_input_tokens"],
) # for anthropic
if completion_tokens_details is not None:
returned_usage.completion_tokens_details = completion_tokens_details

# Set token details
if usage_data["completion_tokens_details"] is not None:
returned_usage.completion_tokens_details = usage_data[
"completion_tokens_details"
]

# Handle reasoning tokens
if reasoning_tokens is not None:
if returned_usage.completion_tokens_details is None:
returned_usage.completion_tokens_details = (
@@ -412,8 +443,51 @@
returned_usage.completion_tokens_details.reasoning_tokens = (
reasoning_tokens
)
if prompt_tokens_details is not None:
returned_usage.prompt_tokens_details = prompt_tokens_details

if usage_data["prompt_tokens_details"] is not None:
returned_usage.prompt_tokens_details = usage_data["prompt_tokens_details"]

return returned_usage

def calculate_usage(
self,
chunks: List[Union[Dict[str, Any], ModelResponse]],
model: str,
completion_output: str,
messages: Optional[List] = None,
reasoning_tokens: Optional[int] = None,
) -> Usage:
"""
Calculate usage for the given chunks.
"""
returned_usage = Usage()
usage_data = self._process_usage_chunks(chunks)

try:
returned_usage.prompt_tokens = usage_data["prompt_tokens"] or token_counter(
model=model, messages=messages
)
except (
Exception
): # don't allow this failing to block a complete streaming response from being returned
print_verbose("token_counter failed, assuming prompt tokens is 0")
returned_usage.prompt_tokens = 0

returned_usage.completion_tokens = usage_data[
"completion_tokens"
] or token_counter(
model=model,
text=completion_output,
count_response_tokens=True,
)

returned_usage.total_tokens = (
returned_usage.prompt_tokens + returned_usage.completion_tokens
)

returned_usage = self._set_additional_usage_properties(
returned_usage, usage_data, reasoning_tokens
)

return returned_usage

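The remaining hunks split the old `calculate_usage` into `_process_usage_chunks` (aggregation across chunks) and `_set_additional_usage_properties` (copying cost, the BYOK flag, cache tokens, and token details onto the result), with a slimmer `calculate_usage` orchestrating the two. A condensed, runnable sketch of that flow, using a simplified stand-in for `Usage` and only the fields this diff introduces:

```python
# Illustrative end-to-end sketch of the refactored flow (not the PR's code):
# per-chunk usage dicts are aggregated, then extra properties are applied.
from typing import Any, Dict, List, Optional

def process_usage_chunks(chunks: List[Dict[str, Any]]) -> Dict[str, Any]:
    """Aggregate usage across chunks, preferring non-zero / non-None values."""
    data: Dict[str, Any] = {"prompt_tokens": 0, "completion_tokens": 0,
                            "cost": None, "is_byok": None}
    for chunk in chunks:
        usage = chunk.get("usage") or {}
        data["prompt_tokens"] = usage.get("prompt_tokens", 0) or data["prompt_tokens"]
        data["completion_tokens"] = usage.get("completion_tokens", 0) or data["completion_tokens"]
        if usage.get("cost") is not None:
            data["cost"] = usage["cost"]
        if usage.get("is_byok") is not None:
            data["is_byok"] = usage["is_byok"]
    return data

class SimpleUsage:
    """Stand-in for litellm's Usage, which gains cost / is_byok values here."""
    def __init__(self) -> None:
        self.prompt_tokens = 0
        self.completion_tokens = 0
        self.total_tokens = 0
        self.cost: Optional[float] = None
        self.is_byok: Optional[bool] = None

def calculate_usage(chunks: List[Dict[str, Any]]) -> SimpleUsage:
    """Orchestrate aggregation, then set the additional properties."""
    usage_data = process_usage_chunks(chunks)
    usage = SimpleUsage()
    usage.prompt_tokens = usage_data["prompt_tokens"]
    usage.completion_tokens = usage_data["completion_tokens"]
    usage.total_tokens = usage.prompt_tokens + usage.completion_tokens
    if usage_data["cost"] is not None:
        usage.cost = usage_data["cost"]
    if usage_data["is_byok"] is not None:
        usage.is_byok = usage_data["is_byok"]
    return usage

chunks = [
    {"usage": None},
    {"usage": {"prompt_tokens": 12, "completion_tokens": 34,
               "cost": 0.00042, "is_byok": False}},
]
final = calculate_usage(chunks)
print(final.total_tokens, final.cost, final.is_byok)  # 46 0.00042 False
```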