
Clean up SpanQuery and related APIs #1287


Merged · 4 commits · Mar 28, 2025
6 changes: 3 additions & 3 deletions docs/evals.md
@@ -426,7 +426,7 @@ import logfire
from pydantic_evals import Case, Dataset
from pydantic_evals.evaluators import Evaluator
from pydantic_evals.evaluators.context import EvaluatorContext
-from pydantic_evals.otel.span_tree import SpanQuery, as_predicate
+from pydantic_evals.otel.span_tree import SpanQuery

logfire.configure( # ensure that an OpenTelemetry tracer is configured
send_to_logfire='if-token-present'
@@ -443,7 +443,7 @@ class SpanTracingEvaluator(Evaluator[str, str]):
return {'has_spans': False, 'performance_score': 0.0}

# Find all spans with "processing" in the name
-processing_spans = span_tree.find_all(lambda node: 'processing' in node.name)
+processing_spans = span_tree.find(lambda node: 'processing' in node.name)

# Calculate total processing time
total_processing_time = sum(
@@ -452,7 +452,7 @@

# Check for error spans
error_query: SpanQuery = {'name_contains': 'error'}
-has_errors = span_tree.any(as_predicate(error_query))
+has_errors = span_tree.any(error_query)

# Calculate a performance score (lower is better)
performance_score = 1.0 if total_processing_time < 0.5 else 0.5
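For reference, a minimal sketch of a custom evaluator written against the cleaned-up SpanTree API shown above: `find` takes a plain predicate and `any` accepts a `SpanQuery` dict directly, with no `as_predicate` wrapper. The `NoErrorSpans` class and its queries are illustrative only (not part of this PR), and it assumes an OpenTelemetry tracer is configured as in the docs snippet.

```python
from dataclasses import dataclass

from pydantic_evals.evaluators import Evaluator
from pydantic_evals.evaluators.context import EvaluatorContext
from pydantic_evals.otel.span_tree import SpanQuery


@dataclass
class NoErrorSpans(Evaluator[str, str]):
    """Illustrative: pass when processing spans exist and no span name contains 'error'."""

    def evaluate(self, ctx: EvaluatorContext[str, str]) -> bool:
        span_tree = ctx.span_tree
        # Predicate form, as in the docs example above
        processing_spans = list(span_tree.find(lambda node: 'processing' in node.name))
        # Query-dict form: `any` now takes a SpanQuery directly
        error_query: SpanQuery = {'name_contains': 'error'}
        return bool(processing_spans) and not span_tree.any(error_query)
```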
6 changes: 3 additions & 3 deletions pydantic_evals/pydantic_evals/dataset.py
@@ -823,17 +823,17 @@ async def _run_task(
# otherwise, we don't have a great way to get usage data from arbitrary frameworks.
# Ideally we wouldn't need to hard-code the specific logic here, but I'm not sure a great way to expose it to
# users. Maybe via an argument of type Callable[[SpanTree], dict[str, int | float]] or similar?
-for node in span_tree.flattened():
+for node in span_tree:
if node.attributes.get('gen_ai.operation.name') == 'chat':
task_run.increment_metric('requests', 1)
for k, v in node.attributes.items():
if not isinstance(v, (int, float)):
continue
# TODO: Revisit this choice to strip the prefix..
if k.startswith('gen_ai.usage.details.'):
-task_run.increment_metric(k[21:], v)
+task_run.increment_metric(k.removeprefix('gen_ai.usage.details.'), v)
elif k.startswith('gen_ai.usage.'):
-task_run.increment_metric(k[13:], v)
+task_run.increment_metric(k.removeprefix('gen_ai.usage.'), v)

return EvaluatorContext[InputsT, OutputT, MetadataT](
name=case.name,
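The slicing change in the metric loop is behaviour-preserving; `str.removeprefix` (Python 3.9+) just states explicitly what the hard-coded offset left implicit. A quick stdlib-only illustration, with an example key value:

```python
k = 'gen_ai.usage.details.input_tokens'  # example attribute key

assert k[21:] == 'input_tokens'                                   # old: magic offset
assert k.removeprefix('gen_ai.usage.details.') == 'input_tokens'  # new: self-describing

# removeprefix is also a no-op when the prefix is absent, rather than truncating blindly:
assert 'input_tokens'.removeprefix('gen_ai.usage.details.') == 'input_tokens'
```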
4 changes: 2 additions & 2 deletions pydantic_evals/pydantic_evals/evaluators/__init__.py
@@ -1,4 +1,4 @@
-from .common import Contains, Equals, EqualsExpected, IsInstance, LlmJudge, MaxDuration, Python, SpanQuery
+from .common import Contains, Equals, EqualsExpected, HasMatchingSpan, IsInstance, LlmJudge, MaxDuration, Python
from .context import EvaluatorContext
from .evaluator import EvaluationReason, EvaluationResult, EvaluationScalar, Evaluator, EvaluatorOutput, run_evaluator

@@ -10,7 +10,7 @@
'IsInstance',
'MaxDuration',
'LlmJudge',
-'SpanQuery',
+'HasMatchingSpan',
'Python',
# context
'EvaluatorContext',
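For downstream code, the export rename is a one-line import change (the old name clashed with the `SpanQuery` query type in `pydantic_evals.otel.span_tree`, which `common.py` had to alias as `SpanNodeQuery`). A sketch of the before/after:

```python
# Before this PR (no longer resolves):
# from pydantic_evals.evaluators import SpanQuery

# After this PR:
from pydantic_evals.evaluators import HasMatchingSpan
```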
12 changes: 6 additions & 6 deletions pydantic_evals/pydantic_evals/evaluators/common.py
@@ -6,7 +6,7 @@

from pydantic_ai import models

-from ..otel.span_tree import SpanQuery as SpanNodeQuery, as_predicate
+from ..otel.span_tree import SpanQuery as SpanNodeQuery
from .context import EvaluatorContext
from .evaluator import EvaluationReason, Evaluator, EvaluatorOutput

@@ -17,7 +17,7 @@
'IsInstance',
'MaxDuration',
'LlmJudge',
-'SpanQuery',
+'HasMatchingSpan',
'Python',
)

@@ -177,16 +177,16 @@ async def evaluate(


@dataclass
-class SpanQuery(Evaluator[object, object, object]):
-    """Check if the span tree contains a span with the specified name."""
+class HasMatchingSpan(Evaluator[object, object, object]):
+    """Check if the span tree contains a span that matches the specified query."""

query: SpanNodeQuery

def evaluate(
self,
ctx: EvaluatorContext[object, object, object],
) -> bool:
-return ctx.span_tree.find_first(as_predicate(self.query)) is not None
+return ctx.span_tree.any(self.query)


# TODO: Consider moving this to docs rather than providing it with the library, given the security implications
@@ -211,6 +211,6 @@ def evaluate(self, ctx: EvaluatorContext[object, object, object]) -> EvaluatorOutput:
IsInstance,
MaxDuration,
LlmJudge,
-SpanQuery,
+HasMatchingSpan,
# Python, # not included by default for security reasons
)
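Putting the renamed evaluator to work: a hedged sketch of attaching `HasMatchingSpan` to a dataset, assuming the usual `Case`/`Dataset` constructor arguments from the pydantic_evals docs; the case contents and query are illustrative only.

```python
from pydantic_evals import Case, Dataset
from pydantic_evals.evaluators import HasMatchingSpan

# Record, per case, whether the task run produced at least one span whose name
# contains 'processing' (the query dict is the same SpanQuery shape used above).
dataset = Dataset(
    cases=[Case(name='simple', inputs='hello', expected_output='HELLO')],
    evaluators=[HasMatchingSpan(query={'name_contains': 'processing'})],
)
```

Since `HasMatchingSpan.evaluate` returns a plain bool, the report just records whether the query matched for each case; richer checks (durations, attribute values) still belong in a custom evaluator like the docs example.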