11from abc import ABC , abstractmethod
2+ from opentelemetry ._events import Event
3+
24from .types import LLMInvocation
5+ from opentelemetry import trace
6+ from opentelemetry .trace import (
7+ Tracer ,
8+ )
9+ from opentelemetry import _events
10+ from .deepeval import evaluate_answer_relevancy_metric
11+ from opentelemetry .trace import SpanContext , Span
12+ from opentelemetry .trace .span import NonRecordingSpan
313
414
515class EvaluationResult :
@@ -22,20 +32,75 @@ def evaluate(self, invocation: LLMInvocation) -> EvaluationResult:
2232 """
2333 pass
2434
class DeepEvalEvaluator(Evaluator):
    """
    Evaluator backed by the DeepEval library for LLM-as-judge evaluations.

    For each invocation it runs the answer-relevancy metric and emits one
    event plus one span, both linked to the original LLM invocation's
    span/trace ids so results can be correlated with the call they judge.
    """

    def __init__(self, event_logger, tracer: Tracer = None, config: dict = None):
        """
        Args:
            event_logger: OTel event logger; falls back to a module-level one.
            tracer: OTel tracer; falls back to a module-level one.
            config: optional evaluator settings (e.g. models, API keys).
        """
        self.config = config or {}
        self._tracer = tracer or trace.get_tracer(__name__)
        self._event_logger = event_logger or _events.get_event_logger(__name__)

    def evaluate(self, invocation: LLMInvocation):
        """Run the answer-relevancy metric on *invocation* and emit telemetry.

        Skips silently when there is no human message or no generated content.
        NOTE(review): the Evaluator ABC's evaluate() is documented to return an
        EvaluationResult; this implementation returns None — confirm callers
        ignore the return value.
        """
        human_message = next(
            (msg for msg in invocation.messages if msg.type == "human"), None
        )
        # Guard: with no human prompt or no generation there is nothing to
        # judge. (The original raised AttributeError on a missing human
        # message and IndexError on empty chat_generations.)
        if human_message is None or not invocation.chat_generations:
            return
        content = invocation.chat_generations[0].content
        if content is not None and content != "":
            eval_arm = evaluate_answer_relevancy_metric(
                human_message.content, content, []
            )
            # Pass the human message located above — not messages[1] — so the
            # telemetry query matches the text that was actually evaluated.
            self._do_telemetry(
                human_message.content,
                content,
                invocation.span_id,
                invocation.trace_id,
                eval_arm,
            )

    def _do_telemetry(self, query, output, parent_span_id, parent_trace_id, eval_arm):
        """Emit a gen_ai evaluation event and a span linked to the parent LLM span.

        Args:
            query: the human prompt text that was evaluated.
            output: the model generation that was evaluated.
            parent_span_id / parent_trace_id: ids of the invocation being judged.
            eval_arm: DeepEval answer-relevancy result (score/reason/model/cost).
        """
        body = {
            "content": f"query: {query} output: {output}",
        }
        attributes = {
            "gen_ai.evaluation.name": "relevance",
            "gen_ai.evaluation.score": eval_arm.score,
            "gen_ai.evaluation.reasoning": eval_arm.reason,
            "gen_ai.evaluation.cost": eval_arm.evaluation_cost,
        }

        # body is always a non-empty dict here, so pass it directly
        # (the original's `body if body else None` was a no-op).
        event = Event(
            name="gen_ai.evaluation.message",
            attributes=attributes,
            body=body,
            span_id=parent_span_id,
            trace_id=parent_trace_id,
        )
        self._event_logger.emit(event)

        # Context of the invocation's span; used only for the span link.
        # (The original also built a NonRecordingSpan from it that was
        # immediately shadowed by the `with` span — dead code, removed.)
        span_context = SpanContext(
            trace_id=parent_trace_id,
            span_id=parent_span_id,
            is_remote=False,
        )

        # Use the tracer injected at construction time; the original fetched
        # a fresh module tracer here and ignored self._tracer.
        with self._tracer.start_as_current_span("evaluation relevance") as span:
            span.add_link(
                span_context,
                attributes={"gen_ai.operation.name": "evaluation"},
            )
            span.set_attribute("gen_ai.operation.name", "evaluation")
            span.set_attribute("gen_ai.evaluation.name", "relevance")
            span.set_attribute("gen_ai.evaluation.score", eval_arm.score)
            # NOTE(review): label is hard-coded to "Pass" regardless of the
            # score — presumably it should be derived from a threshold on
            # eval_arm.score; TODO confirm intended semantics.
            span.set_attribute("gen_ai.evaluation.label", "Pass")
            span.set_attribute("gen_ai.evaluation.reasoning", eval_arm.reason)
            span.set_attribute("gen_ai.evaluation.model", eval_arm.evaluation_model)
            span.set_attribute("gen_ai.evaluation.cost", eval_arm.evaluation_cost)
            # span.set_attribute("gen_ai.evaluation.verdict", eval_arm.verdicts)
39104
40105
41106class OpenLitEvaluator (Evaluator ):
@@ -54,16 +119,16 @@ def evaluate(self, invocation: LLMInvocation) -> EvaluationResult:
54119
# Registry mapping lowercase evaluator names to their implementing classes,
# consumed by get_evaluator() for case-insensitive lookup.
EVALUATORS = dict(
    deepeval=DeepEvalEvaluator,
    openlit=OpenLitEvaluator,
)
60125
61126
def get_evaluator(name: str, event_logger=None, tracer: Tracer = None, config: dict = None) -> Evaluator:
    """
    Factory: return an evaluator instance by registry name (case-insensitive).

    Args:
        name: registry key, e.g. "deepeval" or "openlit".
        event_logger: optional OTel event logger forwarded to the evaluator.
        tracer: optional OTel tracer forwarded to the evaluator.
        config: optional evaluator-specific settings dict.

    Raises:
        ValueError: if *name* is not a registered evaluator.
    """
    cls = EVALUATORS.get(name.lower())
    if cls is None:
        # List the valid choices so the caller can correct the name.
        raise ValueError(
            f"Unknown evaluator: {name}. Available: {', '.join(sorted(EVALUATORS))}"
        )
    # NOTE(review): arguments are passed positionally; confirm every
    # registered class's __init__ accepts (event_logger, tracer, config)
    # in this order (OpenLitEvaluator's signature is not visible here).
    return cls(event_logger, tracer, config)
0 commit comments