Skip to content

[DO NOT MERGE] Chore/update metrics #248

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 2 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 0 additions & 1 deletion src/strands/agent/agent.py
Original file line number Diff line number Diff line change
Expand Up @@ -308,7 +308,6 @@ def __init__(
# Initialize tracer instance (no-op if not configured)
self.tracer = get_tracer()
self.trace_span: Optional[trace.Span] = None

self.tool_caller = Agent.ToolCaller(self)

@property
Expand Down
15 changes: 10 additions & 5 deletions src/strands/event_loop/event_loop.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
from functools import partial
from typing import Any, Callable, Dict, List, Optional, Tuple, cast

from ..telemetry import MetricsClient
from ..telemetry.metrics import EventLoopMetrics, Trace
from ..telemetry.tracer import get_tracer
from ..tools.executor import run_tools, validate_and_prepare_tools
Expand Down Expand Up @@ -105,10 +106,14 @@ def event_loop_cycle(
kwargs["event_loop_cycle_id"] = uuid.uuid4()

event_loop_metrics: EventLoopMetrics = kwargs.get("event_loop_metrics", EventLoopMetrics())

metrics_client = MetricsClient()
# Initialize state and get cycle trace
kwargs = initialize_state(**kwargs)
cycle_start_time, cycle_trace = event_loop_metrics.start_cycle()

attributes = {"event_loop_cycle_id": str(kwargs.get("event_loop_cycle_id"))}
cycle_start_time, cycle_trace = event_loop_metrics.start_cycle(metrics_client)
metrics_client.event_loop_cycle_count.add(1, attributes=attributes)
metrics_client.event_loop_start_cycle.add(1, attributes=attributes)
kwargs["event_loop_cycle_trace"] = cycle_trace

callback_handler(start=True)
Expand Down Expand Up @@ -227,7 +232,7 @@ def event_loop_cycle(
)

# End the cycle and return results
event_loop_metrics.end_cycle(cycle_start_time, cycle_trace)
event_loop_metrics.end_cycle(cycle_start_time, cycle_trace, metrics_client)
if cycle_span:
tracer.end_event_loop_cycle_span(
span=cycle_span,
Expand Down Expand Up @@ -380,7 +385,7 @@ def _handle_tool_execution(

if not tool_uses:
return stop_reason, message, event_loop_metrics, kwargs["request_state"]

metrics_client = MetricsClient()
tool_handler_process = partial(
tool_handler.process,
messages=messages,
Expand Down Expand Up @@ -418,7 +423,7 @@ def _handle_tool_execution(
tracer.end_event_loop_cycle_span(span=cycle_span, message=message, tool_result_message=tool_result_message)

if kwargs["request_state"].get("stop_event_loop", False):
event_loop_metrics.end_cycle(cycle_start_time, cycle_trace)
event_loop_metrics.end_cycle(cycle_start_time, cycle_trace, metrics_client)
return stop_reason, message, event_loop_metrics, kwargs["request_state"]

return recurse_event_loop(
Expand Down
5 changes: 4 additions & 1 deletion src/strands/telemetry/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,8 @@
This module provides metrics and tracing functionality.
"""

from .metrics import EventLoopMetrics, Trace, metrics_to_string
from .config import get_otel_resource
from .metrics import EventLoopMetrics, MetricsClient, Trace, metrics_to_string
from .tracer import Tracer, get_tracer

__all__ = [
Expand All @@ -12,4 +13,6 @@
"metrics_to_string",
"Tracer",
"get_tracer",
"MetricsClient",
"get_otel_resource",
]
33 changes: 33 additions & 0 deletions src/strands/telemetry/config.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
"""OpenTelemetry configuration and setup utilities for Strands agents.

This module provides centralized configuration and initialization functionality
for OpenTelemetry components and other telemetry infrastructure shared across Strands applications.
"""

from importlib.metadata import version

from opentelemetry.sdk.resources import Resource


def get_otel_resource(service_name: str = __name__) -> Resource:
    """Create a standard OpenTelemetry resource with service information.

    A fresh Resource is built on every call; this function does not cache or
    share the returned object.

    Args:
        service_name: Value reported as the ``service.name`` attribute. Defaults
            to this module's ``__name__``, which is the value that was reported
            before this argument existed, so existing no-argument callers are
            unaffected.

    Returns:
        Resource object with standard service information.
    """
    return Resource.create(
        {
            "service.name": service_name,
            # Version is read from the installed "strands-agents" distribution metadata.
            "service.version": version("strands-agents"),
            "telemetry.sdk.name": "opentelemetry",
            "telemetry.sdk.language": "python",
        }
    )
118 changes: 111 additions & 7 deletions src/strands/telemetry/metrics.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,10 @@
from dataclasses import dataclass, field
from typing import Any, Dict, Iterable, List, Optional, Set, Tuple

import opentelemetry.metrics as metrics_api
from opentelemetry.metrics import Counter, Histogram, Meter

from ..telemetry import metrics_constants as constants
from ..types.content import Message
from ..types.streaming import Metrics, Usage
from ..types.tools import ToolUse
Expand Down Expand Up @@ -117,22 +121,34 @@ class ToolMetrics:
error_count: int = 0
total_time: float = 0.0

def add_call(self, tool: ToolUse, duration: float, success: bool) -> None:
def add_call(
    self,
    tool: ToolUse,
    duration: float,
    success: bool,
    metrics_client: "MetricsClient",
    attributes: Optional[Dict[str, Any]] = None,
) -> None:
    """Fold one finished tool invocation into the running totals.

    The in-memory counters on this object are updated and the same call is
    reported through the instruments exposed by ``metrics_client``.

    Args:
        tool: The tool that was called; stored as the latest known tool state.
        duration: How long the call took in seconds.
        success: Whether the call was successful.
        metrics_client: Object exposing the ``tool_call_count``, ``tool_duration``,
            ``tool_success_count`` and ``tool_error_count`` instruments.
        attributes: Attributes attached to every measurement recorded here.
    """
    # Keep a reference to the most recent tool state and bump the totals.
    self.tool = tool
    self.call_count += 1
    self.total_time += duration

    # Every call is counted and timed, regardless of outcome.
    metrics_client.tool_call_count.add(1, attributes=attributes)
    metrics_client.tool_duration.record(duration, attributes=attributes)

    # Exactly one of the outcome counters is incremented per call.
    if success:
        self.success_count += 1
        outcome_counter = metrics_client.tool_success_count
    else:
        self.error_count += 1
        outcome_counter = metrics_client.tool_error_count
    outcome_counter.add(1, attributes=attributes)


@dataclass
Expand All @@ -155,32 +171,42 @@ class EventLoopMetrics:
accumulated_usage: Usage = field(default_factory=lambda: Usage(inputTokens=0, outputTokens=0, totalTokens=0))
accumulated_metrics: Metrics = field(default_factory=lambda: Metrics(latencyMs=0))

def start_cycle(self) -> Tuple[float, Trace]:
def start_cycle(self, metrics_client: "MetricsClient") -> Tuple[float, Trace]:
    """Start a new event loop cycle and create a trace for it.

    Increments the local cycle counter, creates a Trace named after the new
    cycle number, and appends it to ``self.traces``.

    Args:
        metrics_client: The metrics client for this run. It is deliberately not
            written to here: event_loop_cycle() increments
            ``event_loop_cycle_count`` itself (with the cycle-id attributes)
            right after calling this method, so incrementing it here as well
            counted every cycle twice on the shared instrument.

    Returns:
        A tuple containing the start time and the cycle trace object.
    """
    self.cycle_count += 1
    start_time = time.time()
    cycle_trace = Trace(f"Cycle {self.cycle_count}", start_time=start_time)
    self.traces.append(cycle_trace)
    return start_time, cycle_trace

def end_cycle(self, start_time: float, cycle_trace: Trace) -> None:
def end_cycle(self, start_time: float, cycle_trace: Trace, metrics_client: "MetricsClient") -> None:
    """Close out an event loop cycle and record how long it ran.

    Args:
        start_time: The timestamp when the cycle started.
        cycle_trace: The trace object for this cycle; it is ended with the
            timestamp taken here.
        metrics_client: Object exposing the ``event_loop_end_cycle`` counter and
            the ``event_loop_cycle_duration`` histogram.
    """
    # Count the end of the cycle before taking the end timestamp.
    metrics_client.event_loop_end_cycle.add(1)

    finished_at = time.time()
    elapsed = finished_at - start_time

    # The same duration value goes to the histogram and to the local history.
    metrics_client.event_loop_cycle_duration.record(elapsed)
    self.cycle_durations.append(elapsed)
    cycle_trace.end(finished_at)

def add_tool_usage(
self, tool: ToolUse, duration: float, tool_trace: Trace, success: bool, message: Message
self,
tool: ToolUse,
duration: float,
tool_trace: Trace,
success: bool,
message: Message,
metrics_client: "MetricsClient",
) -> None:
"""Record metrics for a tool invocation.

Expand All @@ -190,6 +216,7 @@ def add_tool_usage(
tool_trace: The trace object for this tool call.
success: Whether the tool call was successful.
message: The message associated with the tool call.
metrics_client: The metrics client for recording the metrics.
"""
tool_name = tool.get("name", "unknown_tool")
tool_use_id = tool.get("toolUseId", "unknown")
Expand All @@ -203,8 +230,16 @@ def add_tool_usage(
tool_trace.raw_name = f"{tool_name} - {tool_use_id}"
tool_trace.add_message(message)

self.tool_metrics.setdefault(tool_name, ToolMetrics(tool)).add_call(tool, duration, success)

self.tool_metrics.setdefault(tool_name, ToolMetrics(tool)).add_call(
tool,
duration,
success,
metrics_client,
attributes={
"tool_name": tool_name,
"tool_use_id": tool_use_id,
},
)
tool_trace.end()

def update_usage(self, usage: Usage) -> None:
Expand All @@ -213,6 +248,7 @@ def update_usage(self, usage: Usage) -> None:
Args:
usage: The usage data to add to the accumulated totals.
"""
# metrics_client.token_usage.add(usage["totalTokens"])
self.accumulated_usage["inputTokens"] += usage["inputTokens"]
self.accumulated_usage["outputTokens"] += usage["outputTokens"]
self.accumulated_usage["totalTokens"] += usage["totalTokens"]
Expand Down Expand Up @@ -355,3 +391,71 @@ def metrics_to_string(event_loop_metrics: EventLoopMetrics, allowed_names: Optio
A formatted string representation of the metrics.
"""
return "\n".join(_metrics_summary_to_lines(event_loop_metrics, allowed_names or set()))


class MetricsClient:
    """Process-wide holder for the OpenTelemetry metric instruments used by Strands.

    Every ``MetricsClient()`` call yields the same object. Where the recorded
    measurements end up (console, OTLP endpoint, etc.) is decided by the
    OpenTelemetry SDK configuration supplied by the user, not by this class.
    """

    # The one shared instance, created on first construction.
    _instance: Optional["MetricsClient"] = None

    # Annotations only: these attributes are assigned on first initialization.
    meter: Meter
    strands_agent_invocation_count: Counter
    event_loop_cycle_count: Counter
    event_loop_start_cycle: Counter
    event_loop_end_cycle: Counter
    event_loop_cycle_duration: Histogram
    tool_call_count: Counter
    tool_success_count: Counter
    tool_error_count: Counter
    tool_duration: Histogram

    def __new__(cls) -> "MetricsClient":
        """Return the shared MetricsClient, creating it on first use.

        Returns:
            The single MetricsClient instance.
        """
        existing = cls._instance
        if existing is not None:
            return existing

        cls._instance = super().__new__(cls)
        return cls._instance

    def __init__(self) -> None:
        """Set up the meter and instruments the first time the client is built.

        Python calls ``__init__`` on every ``MetricsClient()`` expression, so the
        presence of the ``meter`` attribute is used to make later calls no-ops.
        """
        already_initialized = hasattr(self, "meter")
        if not already_initialized:
            logger.info("Creating Strands MetricsClient")
            provider: metrics_api.MeterProvider = metrics_api.get_meter_provider()
            self.meter = provider.get_meter(__name__)
            self.create_instruments()

    def create_instruments(self) -> None:
        """Create every counter and histogram on ``self.meter``."""
        new_counter = self.meter.create_counter
        new_histogram = self.meter.create_histogram

        # Agent-level instruments.
        self.strands_agent_invocation_count = new_counter(
            name=constants.STRANDS_AGENT_INVOCATION_COUNT, unit="Count"
        )

        # Event loop instruments.
        self.event_loop_cycle_count = new_counter(
            name=constants.STRANDS_AGENT_EVENT_LOOP_CYCLE_COUNT, unit="Count"
        )
        self.event_loop_start_cycle = new_counter(
            name=constants.STRANDS_AGENT_EVENT_LOOP_START_CYCLE, unit="Count"
        )
        self.event_loop_end_cycle = new_counter(
            name=constants.STRANDS_AGENT_EVENT_LOOP_END_CYCLE, unit="Count"
        )
        self.event_loop_cycle_duration = new_histogram(
            name=constants.STRANDS_AGENT_EVENT_LOOP_CYCLE_DURATION, unit="s"
        )

        # Tool instruments.
        self.tool_call_count = new_counter(
            name=constants.STRANDS_AGENT_TOOL_CALL_COUNT, unit="Count"
        )
        self.tool_success_count = new_counter(
            name=constants.STRANDS_AGENT_TOOL_SUCCESS_COUNT, unit="Count"
        )
        self.tool_error_count = new_counter(
            name=constants.STRANDS_AGENT_TOOL_ERROR_COUNT, unit="Count"
        )
        self.tool_duration = new_histogram(
            name=constants.STRANDS_AGENT_TOOL_DURATION, unit="s"
        )
11 changes: 11 additions & 0 deletions src/strands/telemetry/metrics_constants.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
"""Metrics that are emitted in Strands-Agent."""

# Instrument names passed to the meter in MetricsClient.create_instruments().

# Agent-level counter.
# NOTE(review): no increment of this counter is visible in this change — confirm where it is recorded.
STRANDS_AGENT_INVOCATION_COUNT = "strands.agent.invocation_count"

# Event loop instruments: three counters (unit "Count") and one duration histogram (unit "s").
STRANDS_AGENT_EVENT_LOOP_CYCLE_COUNT = "strands.agent.event_loop.cycle_count"
STRANDS_AGENT_EVENT_LOOP_START_CYCLE = "strands.agent.event_loop.start_cycle"
STRANDS_AGENT_EVENT_LOOP_END_CYCLE = "strands.agent.event_loop.end_cycle"
STRANDS_AGENT_EVENT_LOOP_CYCLE_DURATION = "strands.agent.event_loop.cycle_duration"

# Tool instruments: three counters (unit "Count") and one duration histogram (unit "s").
STRANDS_AGENT_TOOL_CALL_COUNT = "strands.agent.tool.call_count"
STRANDS_AGENT_TOOL_SUCCESS_COUNT = "strands.agent.tool.success_count"
STRANDS_AGENT_TOOL_ERROR_COUNT = "strands.agent.tool.error_count"
STRANDS_AGENT_TOOL_DURATION = "strands.agent.tool.duration"
15 changes: 3 additions & 12 deletions src/strands/telemetry/tracer.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,20 +8,19 @@
import logging
import os
from datetime import date, datetime, timezone
from importlib.metadata import version
from typing import Any, Dict, Mapping, Optional

import opentelemetry.trace as trace_api
from opentelemetry import propagate
from opentelemetry.baggage.propagation import W3CBaggagePropagator
from opentelemetry.propagators.composite import CompositePropagator
from opentelemetry.sdk.resources import Resource
from opentelemetry.sdk.trace import TracerProvider as SDKTracerProvider
from opentelemetry.sdk.trace.export import BatchSpanProcessor, ConsoleSpanExporter, SimpleSpanProcessor
from opentelemetry.trace import Span, StatusCode
from opentelemetry.trace.propagation.tracecontext import TraceContextTextMapPropagator

from ..agent.agent_result import AgentResult
from ..telemetry import get_otel_resource
from ..types.content import Message, Messages
from ..types.streaming import Usage
from ..types.tools import ToolResult, ToolUse
Expand Down Expand Up @@ -151,7 +150,6 @@ def __init__(
self.otlp_headers = otlp_headers or {}
self.tracer_provider: Optional[trace_api.TracerProvider] = None
self.tracer: Optional[trace_api.Tracer] = None

propagate.set_global_textmap(
CompositePropagator(
[
Expand All @@ -173,15 +171,7 @@ def _initialize_tracer(self) -> None:
self.tracer = self.tracer_provider.get_tracer(self.service_name)
return

# Create resource with service information
resource = Resource.create(
{
"service.name": self.service_name,
"service.version": version("strands-agents"),
"telemetry.sdk.name": "opentelemetry",
"telemetry.sdk.language": "python",
}
)
resource = get_otel_resource()

# Create tracer provider
self.tracer_provider = SDKTracerProvider(resource=resource)
Expand Down Expand Up @@ -216,6 +206,7 @@ def _initialize_tracer(self) -> None:
batch_processor = BatchSpanProcessor(otlp_exporter)
self.tracer_provider.add_span_processor(batch_processor)
logger.info("endpoint=<%s> | OTLP exporter configured with endpoint", endpoint)

except Exception as e:
logger.exception("error=<%s> | Failed to configure OTLP exporter", e)
elif self.otlp_endpoint and self.tracer_provider:
Expand Down
Loading
Loading