
Commit 377717e

Merge pull request #1316 from julep-ai/f/usage-tracking
feat(agents-api): add usage tracking + usage table
2 parents 09cf589 + 5de4fb2 commit 377717e

8 files changed: +640 −1 lines changed


agents-api/agents_api/clients/litellm.py (+43 −1)
@@ -1,5 +1,6 @@
 from functools import wraps
 from typing import Literal
+from uuid import UUID

 import aiohttp
 from beartype import beartype
@@ -8,6 +9,7 @@
 from litellm import get_supported_openai_params
 from litellm.utils import CustomStreamWrapper, ModelResponse, get_valid_models

+from ..common.utils.usage import track_embedding_usage, track_usage
 from ..env import (
     embedding_dimensions,
     embedding_model_id,
@@ -76,7 +78,26 @@ async def acompletion(
         api_key=custom_api_key or litellm_master_key,
     )

-    return patch_litellm_response(model_response)
+    response = patch_litellm_response(model_response)
+
+    # Track usage in database if we have a user ID (which should be the developer ID)
+    user = settings.get("user")
+    if user and isinstance(response, ModelResponse):
+        try:
+            model = response.model
+            await track_usage(
+                developer_id=UUID(user),
+                model=model,
+                messages=messages,
+                response=response,
+                custom_api_used=custom_api_key is not None,
+                metadata={"tags": kwargs.get("tags", [])},
+            )
+        except Exception as e:
+            # Log error but don't fail the request if usage tracking fails
+            print(f"Error tracking usage: {e}")
+
+    return response


 @wraps(_aembedding)
@@ -114,6 +135,27 @@ async def aembedding(
         **settings,
     )

+    # Track embedding usage if we have a user ID
+    user = settings.get("user")
+    if user:
+        try:
+            model = response.model
+            await track_embedding_usage(
+                developer_id=UUID(user),
+                model=model,
+                inputs=input,
+                response=response,
+                custom_api_used=bool(custom_api_key),
+                metadata={
+                    "request_id": response.id if hasattr(response, "id") else None,
+                    "embedding_count": len(input),
+                    "tags": settings.get("tags", []),
+                },
+            )
+        except Exception as e:
+            # Log error but don't fail the request if usage tracking fails
+            print(f"Error tracking embedding usage: {e}")
+
     embedding_list: list[dict[Literal["embedding"], list[float]]] = response.data

     # Truncate the embedding to the specified dimensions
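
Usage tracking above only fires when settings.get("user") returns the developer ID, so callers must pass it through the OpenAI-style user field. Below is a minimal sketch of what a call site might look like, assuming the user and tags keyword arguments flow into settings/kwargs the way this diff reads them; the model name and call site are illustrative and not part of this commit.

import asyncio
from uuid import uuid4

from agents_api.clients.litellm import acompletion


async def main() -> None:
    # Hypothetical developer ID; in the service it comes from the request's auth context.
    developer_id = uuid4()

    response = await acompletion(
        model="gpt-4o-mini",  # illustrative model name
        messages=[{"role": "user", "content": "Hello"}],
        user=str(developer_id),  # read back via settings.get("user") to attribute usage
        tags=["example"],  # surfaces in the usage record's metadata["tags"]
    )
    print(response.usage)


asyncio.run(main())
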
agents-api/agents_api/common/utils/usage.py (new file, +123)
"""
Utilities for tracking token usage and costs for LLM API calls.
"""

from typing import Any
from uuid import UUID

from beartype import beartype
from litellm.utils import ModelResponse, token_counter

from ...queries.usage.create_usage_record import create_usage_record


@beartype
async def track_usage(
    *,
    developer_id: UUID,
    model: str,
    messages: list[dict],
    response: ModelResponse,
    custom_api_used: bool = False,
    metadata: dict[str, Any] = {},
) -> None:
    """
    Tracks token usage and costs for an LLM API call.

    Parameters:
        developer_id (UUID): The unique identifier for the developer.
        model (str): The model used for the API call.
        messages (list[dict]): The messages sent to the model.
        response (ModelResponse): The response from the LLM API call.
        custom_api_used (bool): Whether a custom API key was used.
        metadata (dict): Additional metadata about the usage.

    Returns:
        None
    """

    # Try to get token counts from response.usage
    if response.usage:
        prompt_tokens = response.usage.prompt_tokens
        completion_tokens = response.usage.completion_tokens
    else:
        # Calculate tokens manually if usage is not available
        prompt_tokens = token_counter(model=model, messages=messages)

        # Calculate completion tokens from the response
        completion_content = [
            {"content": choice.message.content}
            for choice in response.choices
            if hasattr(choice, "message")
            and choice.message
            and hasattr(choice.message, "content")
            and choice.message.content
        ]

        completion_tokens = (
            token_counter(model=model, messages=completion_content) if completion_content else 0
        )

    # Map the model name to the actual model name
    actual_model = model

    # Create usage record
    await create_usage_record(
        developer_id=developer_id,
        model=actual_model,
        prompt_tokens=prompt_tokens,
        completion_tokens=completion_tokens,
        custom_api_used=custom_api_used,
        metadata={
            "request_id": response.id if hasattr(response, "id") else None,
            **metadata,
        },
    )


@beartype
async def track_embedding_usage(
    *,
    developer_id: UUID,
    model: str,
    inputs: list[str],
    response: Any,
    custom_api_used: bool = False,
    metadata: dict[str, Any] = {},
) -> None:
    """
    Tracks token usage and costs for an embedding API call.

    Parameters:
        developer_id (UUID): The unique identifier for the developer.
        model (str): The model used for the embedding.
        inputs (list[str]): The inputs sent for embedding.
        response (Any): The response from the embedding API call.
        custom_api_used (bool): Whether a custom API key was used.
        metadata (dict): Additional metadata about the usage.

    Returns:
        None
    """

    # Try to get token count from response.usage
    if hasattr(response, "usage") and response.usage:
        prompt_tokens = response.usage.prompt_tokens
    else:
        # Calculate tokens manually if usage is not available
        prompt_tokens = sum(
            token_counter(model=model, text=input_text) for input_text in inputs
        )

    # Map the model name to the actual model name
    actual_model = model

    # Create usage record for embeddings (no completion tokens)
    await create_usage_record(
        developer_id=developer_id,
        model=actual_model,
        prompt_tokens=prompt_tokens,
        completion_tokens=0,  # Embeddings don't have completion tokens
        custom_api_used=custom_api_used,
        metadata=metadata,
    )
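
When a provider response carries no usage block, both helpers fall back to litellm's token_counter. A small self-contained sketch of that fallback path, with a made-up model name and messages:

from litellm.utils import token_counter

model = "gpt-4o-mini"  # illustrative only
messages = [{"role": "user", "content": "Summarize the last standup."}]

# Prompt side: count tokens over the request messages, as track_usage does.
prompt_tokens = token_counter(model=model, messages=messages)

# Completion side: wrap each returned message's text in a {"content": ...} dict,
# mirroring how track_usage rebuilds completion_content from response.choices.
completion_content = [{"content": "Here is a short summary of the standup."}]
completion_tokens = token_counter(model=model, messages=completion_content)

print(prompt_tokens, completion_tokens)
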

agents-api/agents_api/queries/__init__.py (+1)
@@ -17,4 +17,5 @@
 from . import sessions as sessions
 from . import tasks as tasks
 from . import tools as tools
+from . import usage as usage
 from . import users as users
agents-api/agents_api/queries/usage/__init__.py (new file, +20)
"""
The `usage` module within the `queries` package provides functionality for tracking token usage
and costs associated with LLM API calls. This includes:

- Recording prompt and completion tokens
- Calculating costs based on model pricing
- Storing usage data with developer attribution
- Supporting both standard and custom API usage

Each function in this module constructs and executes SQL queries for database operations
related to usage tracking and reporting.
"""

# ruff: noqa: F401, F403, F405

from .create_usage_record import create_usage_record

__all__ = [
    "create_usage_record",
]
agents-api/agents_api/queries/usage/create_usage_record.py (new file, +137)
"""
This module contains functionality for creating usage records in the PostgreSQL database.
It tracks token usage and costs for LLM API calls.
"""

from typing import Any
from uuid import UUID

from beartype import beartype
from litellm import cost_per_token

from ...common.utils.db_exceptions import common_db_exceptions
from ...metrics.counters import query_metrics
from ..utils import pg_query, rewrap_exceptions

FALLBACK_PRICING = {
    # Meta Llama models
    "meta-llama/llama-4-scout": {
        "api_request": 0.08 / 1000,
        "api_response": 0.45 / 1000,
    },
    "meta-llama/llama-4-maverick": {
        "api_request": 0.19 / 1000,
        "api_response": 0.85 / 1000,
    },
    "meta-llama/llama-4-maverick:free": {
        "api_request": 0.0 / 1000,
        "api_response": 0.0 / 1000,
    },
    # Qwen model
    "qwen/qwen-2.5-72b-instruct": {
        "api_request": 0.7 / 1000,
        "api_response": 0.7 / 1000,
    },
    # Sao10k model
    "sao10k/l3.3-euryale-70b": {
        "api_request": 0.7 / 1000,
        "api_response": 0.8 / 1000,
    },
    "sao10k/l3.1-euryale-70b": {
        "api_request": 0.7 / 1000,
        "api_response": 0.8 / 1000,
    },
}

# Define the raw SQL query
usage_query = """
INSERT INTO usage (
    developer_id,
    model,
    prompt_tokens,
    completion_tokens,
    cost,
    estimated,
    custom_api_used,
    metadata
)
VALUES (
    $1, -- developer_id
    $2, -- model
    $3, -- prompt_tokens
    $4, -- completion_tokens
    $5, -- cost
    $6, -- estimated
    $7, -- custom_api_used
    $8 -- metadata
)
RETURNING *;
"""


@rewrap_exceptions(common_db_exceptions("usage", ["create"]))
@query_metrics("create_usage_record")
@pg_query
@beartype
async def create_usage_record(
    *,
    developer_id: UUID,
    model: str,
    prompt_tokens: int,
    completion_tokens: int,
    custom_api_used: bool = False,
    estimated: bool = False,
    metadata: dict[str, Any] | None = None,
) -> tuple[str, list]:
    """
    Creates a usage record to track token usage and costs.

    Parameters:
        developer_id (UUID): The unique identifier for the developer.
        model (str): The model used for the API call.
        prompt_tokens (int): Number of tokens in the prompt.
        completion_tokens (int): Number of tokens in the completion.
        custom_api_used (bool): Whether a custom API key was used.
        estimated (bool): Whether the token count is estimated.
        metadata (dict | None): Additional metadata about the usage.

    Returns:
        tuple[str, list]: SQL query and parameters for creating the usage record.
    """
    # Calculate cost based on token usage
    # For custom API keys, we still track usage but mark it as such
    total_cost = 0.0

    if not custom_api_used:
        # Calculate cost using litellm's cost_per_token function
        try:
            prompt_cost, completion_cost = cost_per_token(
                model, prompt_tokens=prompt_tokens, completion_tokens=completion_tokens
            )
            total_cost = prompt_cost + completion_cost
        except Exception:
            estimated = True

            if model in FALLBACK_PRICING:
                total_cost = (
                    FALLBACK_PRICING[model]["api_request"] * prompt_tokens
                    + FALLBACK_PRICING[model]["api_response"] * completion_tokens
                )
            else:
                print(f"No fallback pricing found for model {model}")

    params = [
        developer_id,
        model,
        prompt_tokens,
        completion_tokens,
        total_cost,
        estimated,
        custom_api_used,
        metadata or {},
    ]

    return (
        usage_query,
        params,
    )
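
For non-custom API calls, create_usage_record first asks litellm's cost_per_token for pricing and only falls back to the FALLBACK_PRICING table (marking the row as estimated) when that lookup fails. Below is a rough sketch of that decision factored into a standalone helper, using one entry copied from the table above; it is an illustration, not code from this commit.

from litellm import cost_per_token

# One entry copied from FALLBACK_PRICING above (USD per token).
LLAMA_4_SCOUT = {"api_request": 0.08 / 1000, "api_response": 0.45 / 1000}


def estimate_cost(model: str, prompt_tokens: int, completion_tokens: int) -> tuple[float, bool]:
    """Return (cost, estimated), mirroring the logic in create_usage_record."""
    try:
        prompt_cost, completion_cost = cost_per_token(
            model, prompt_tokens=prompt_tokens, completion_tokens=completion_tokens
        )
        return prompt_cost + completion_cost, False
    except Exception:
        if model == "meta-llama/llama-4-scout":
            cost = (
                LLAMA_4_SCOUT["api_request"] * prompt_tokens
                + LLAMA_4_SCOUT["api_response"] * completion_tokens
            )
            return cost, True
        return 0.0, True


# Fallback example: 1,000 prompt + 500 completion tokens on llama-4-scout
# -> 1000 * 0.00008 + 500 * 0.00045 = 0.08 + 0.225 = 0.305 USD, estimated=True
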

0 commit comments
