truera · daniel-huang-1230 · Jan 9, 2024 · Jan 9, 2024 · Jan 9, 2024 · Jan 9, 2024
diff --git a/trulens_eval/trulens_eval/feedback/prompts.py b/trulens_eval/trulens_eval/feedback/prompts.py
@@ -73,11 +73,13 @@
 
 %s
 
-The right answer is:
+The expected answer is:
 
 %s
 
-Answer only with an integer from 1 to 10 based on how close the responses are to the right answer.
+Answer only with an integer from 0 to 10 based on how semantically similar the responses are to the expected answer,
+where 0 is no semantic similarity at all and 10 is perfect agreement between the responses and the expected answer.
+Never elaborate.
 """
 
 REMOVE_Y_N = " If so, respond Y. If not, respond N."

diff --git a/trulens_eval/trulens_eval/utils/generated.py b/trulens_eval/trulens_eval/utils/generated.py
@@ -4,9 +4,21 @@
 
 import logging
 import re
+from pydantic import BaseModel, field_validator, ValidationError
+
 
 logger = logging.getLogger(__name__)
 
+
+class Rating(BaseModel):  # pydantic model holding one integer rating so its range can be validated on construction
+    rating: int  # candidate score; check_rating below rejects anything outside 0..10
+
+    @field_validator('rating')  # registers check_rating as the validator for the `rating` field
+    def check_rating(cls, v):  # v is the `rating` value being validated
+        if not (0 <= v <= 10):  # only the inclusive range 0..10 is accepted
+            raise ValueError('Rating must be between 0 and 10')
+        return v  # an in-range value is returned unchanged
+
 # for extracting the 0-10 rating, we are assuming the score will
 # always be the last part of the generated text from LLM - hence we are matching for the last
 # group of digits in the string
@@ -22,4 +34,9 @@ def re_0_10_rating(str_val):
             logger.warning(f"0-10 rating regex failed to match on: '{str_val}'")
             return -10  # so this will be reported as -1 after division by 10
 
-    return int(matches.group())
+    try:
+        rating = Rating(rating=int(matches.group()))
+        return rating.rating
+    except ValidationError as e:
+        logger.warning(f"Validation error: {e}")
+        return -10  # TODO: could consider incorporating re-asking and self-critique here with Instructor https://github.com/jxnl/instructor