Commit 7652365 (1 parent: 157909e)

feat(api): api update

File tree: 5 files changed (+27, −37 lines)

  .stats.yml
  src/codex/resources/projects/projects.py
  src/codex/types/project_validate_params.py
  src/codex/types/project_validate_response.py
  tests/api_resources/test_projects.py

.stats.yml

Lines changed: 1 addition & 1 deletion
@@ -1,3 +1,3 @@
 configured_endpoints: 44
-openapi_spec_hash: 8ffde9b129ffc5edd4c4f8c9d866d869
+openapi_spec_hash: 056bc3805c2373563a6585103edd5cb8
 config_hash: 659f65b6ccf5612986f920f7f9abbcb5

src/codex/resources/projects/projects.py

Lines changed: 14 additions & 8 deletions
@@ -426,8 +426,8 @@ def validate(
         query: str,
         response: str,
         use_llm_matching: bool | NotGiven = NOT_GIVEN,
-        bad_response_thresholds: project_validate_params.BadResponseThresholds | NotGiven = NOT_GIVEN,
         constrain_outputs: Optional[List[str]] | NotGiven = NOT_GIVEN,
+        custom_eval_thresholds: Optional[Dict[str, float]] | NotGiven = NOT_GIVEN,
         custom_metadata: Optional[object] | NotGiven = NOT_GIVEN,
         eval_scores: Optional[Dict[str, float]] | NotGiven = NOT_GIVEN,
         options: Optional[project_validate_params.Options] | NotGiven = NOT_GIVEN,
@@ -451,10 +451,13 @@ def validate(
         query will be recorded in the project for SMEs to answer.

         Args:
+          custom_eval_thresholds: Optional custom thresholds for specific evals. Keys should match with the keys
+              in the `eval_scores` dictionary.
+
           custom_metadata: Arbitrary metadata supplied by the user/system

-          eval_scores: Evaluation scores to use for flagging a response as bad. If not provided, TLM
-              will be used to generate scores.
+          eval_scores: Scores assessing different aspects of the RAG system. If not provided, TLM will
+              be used to generate scores.

           options: Typed dict of advanced configuration options for the Trustworthy Language Model.
               Many of these configurations are determined by the quality preset selected
@@ -575,8 +578,8 @@ def validate(
                     "prompt": prompt,
                     "query": query,
                     "response": response,
-                    "bad_response_thresholds": bad_response_thresholds,
                     "constrain_outputs": constrain_outputs,
+                    "custom_eval_thresholds": custom_eval_thresholds,
                     "custom_metadata": custom_metadata,
                     "eval_scores": eval_scores,
                     "options": options,
@@ -967,8 +970,8 @@ async def validate(
         query: str,
         response: str,
         use_llm_matching: bool | NotGiven = NOT_GIVEN,
-        bad_response_thresholds: project_validate_params.BadResponseThresholds | NotGiven = NOT_GIVEN,
         constrain_outputs: Optional[List[str]] | NotGiven = NOT_GIVEN,
+        custom_eval_thresholds: Optional[Dict[str, float]] | NotGiven = NOT_GIVEN,
         custom_metadata: Optional[object] | NotGiven = NOT_GIVEN,
         eval_scores: Optional[Dict[str, float]] | NotGiven = NOT_GIVEN,
         options: Optional[project_validate_params.Options] | NotGiven = NOT_GIVEN,
@@ -992,10 +995,13 @@ async def validate(
         query will be recorded in the project for SMEs to answer.

         Args:
+          custom_eval_thresholds: Optional custom thresholds for specific evals. Keys should match with the keys
+              in the `eval_scores` dictionary.
+
           custom_metadata: Arbitrary metadata supplied by the user/system

-          eval_scores: Evaluation scores to use for flagging a response as bad. If not provided, TLM
-              will be used to generate scores.
+          eval_scores: Scores assessing different aspects of the RAG system. If not provided, TLM will
+              be used to generate scores.

           options: Typed dict of advanced configuration options for the Trustworthy Language Model.
               Many of these configurations are determined by the quality preset selected
@@ -1116,8 +1122,8 @@ async def validate(
                     "prompt": prompt,
                     "query": query,
                     "response": response,
-                    "bad_response_thresholds": bad_response_thresholds,
                     "constrain_outputs": constrain_outputs,
+                    "custom_eval_thresholds": custom_eval_thresholds,
                     "custom_metadata": custom_metadata,
                     "eval_scores": eval_scores,
                     "options": options,

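For context, a minimal usage sketch of the renamed keyword against the synchronous client. None of this appears in the commit itself: the client construction, project ID, prompt/query/response strings, and the `trustworthiness` eval name are placeholder assumptions; only the `custom_eval_thresholds` argument (replacing the removed `bad_response_thresholds`) comes from this change.

from codex import Codex

client = Codex()  # assumes credentials are configured via environment variables

result = client.projects.validate(
    project_id="00000000-0000-0000-0000-000000000000",  # placeholder project ID
    prompt="Answer the question using the provided context.",
    query="What is the refund policy?",
    response="Refunds are issued within 30 days.",
    eval_scores={"trustworthiness": 0.42},            # optional pre-computed scores
    custom_eval_thresholds={"trustworthiness": 0.7},  # keys should match eval_scores keys
)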
src/codex/types/project_validate_params.py

Lines changed: 8 additions & 14 deletions
@@ -7,7 +7,7 @@
 
 from .._utils import PropertyInfo
 
-__all__ = ["ProjectValidateParams", "BadResponseThresholds", "Options"]
+__all__ = ["ProjectValidateParams", "Options"]
 
 
 class ProjectValidateParams(TypedDict, total=False):
@@ -21,15 +21,19 @@ class ProjectValidateParams(TypedDict, total=False):
 
     use_llm_matching: bool
 
-    bad_response_thresholds: BadResponseThresholds
-
     constrain_outputs: Optional[List[str]]
 
+    custom_eval_thresholds: Optional[Dict[str, float]]
+    """Optional custom thresholds for specific evals.
+
+    Keys should match with the keys in the `eval_scores` dictionary.
+    """
+
     custom_metadata: Optional[object]
     """Arbitrary metadata supplied by the user/system"""
 
     eval_scores: Optional[Dict[str, float]]
-    """Evaluation scores to use for flagging a response as bad.
+    """Scores assessing different aspects of the RAG system.
 
     If not provided, TLM will be used to generate scores.
     """
@@ -139,16 +143,6 @@ class ProjectValidateParams(TypedDict, total=False):
     x_stainless_package_version: Annotated[str, PropertyInfo(alias="x-stainless-package-version")]
 
 
-class BadResponseThresholds(TypedDict, total=False):
-    context_sufficiency: Optional[float]
-
-    query_ease: Optional[float]
-
-    response_helpfulness: Optional[float]
-
-    trustworthiness: Optional[float]
-
-
 class Options(TypedDict, total=False):
     custom_eval_criteria: Iterable[object]
 
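To make the key-matching requirement concrete, a tiny hypothetical pairing; the eval names here are illustrative and not defined by this commit:

# Keys in custom_eval_thresholds are expected to mirror keys in eval_scores.
eval_scores = {"trustworthiness": 0.42, "response_helpfulness": 0.93}
custom_eval_thresholds = {"trustworthiness": 0.7}  # only this eval gets a custom threshold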

src/codex/types/project_validate_response.py

Lines changed: 2 additions & 2 deletions
@@ -8,7 +8,7 @@
 
 
 class EvalScores(BaseModel):
-    is_bad: bool
+    failed: bool
 
     score: Optional[float] = None
 
@@ -18,7 +18,7 @@ class EvalScores(BaseModel):
 class ProjectValidateResponse(BaseModel):
     eval_scores: Dict[str, EvalScores]
     """
-    Evaluation scores for the original response along with a boolean flag, `is_bad`,
+    Evaluation scores for the original response along with a boolean flag, `failed`,
     indicating whether the score is below the threshold.
     """
 
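A short sketch of consuming the renamed flag on the response model, assuming `result` is the ProjectValidateResponse returned by a validate call like the one sketched earlier:

# Each eval_scores entry carries a numeric score plus the boolean `failed`
# (previously `is_bad`), set when the score falls below its threshold.
for eval_name, entry in result.eval_scores.items():
    status = "FAILED" if entry.failed else "ok"
    print(f"{eval_name}: score={entry.score} ({status})")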

tests/api_resources/test_projects.py

Lines changed: 2 additions & 12 deletions
@@ -444,13 +444,8 @@ def test_method_validate_with_all_params(self, client: Codex) -> None:
             query="query",
             response="response",
             use_llm_matching=True,
-            bad_response_thresholds={
-                "context_sufficiency": 0,
-                "query_ease": 0,
-                "response_helpfulness": 0,
-                "trustworthiness": 0,
-            },
             constrain_outputs=["string"],
+            custom_eval_thresholds={"foo": 0},
             custom_metadata={},
             eval_scores={"foo": 0},
             options={
@@ -944,13 +939,8 @@ async def test_method_validate_with_all_params(self, async_client: AsyncCodex) -> None:
             query="query",
             response="response",
             use_llm_matching=True,
-            bad_response_thresholds={
-                "context_sufficiency": 0,
-                "query_ease": 0,
-                "response_helpfulness": 0,
-                "trustworthiness": 0,
-            },
             constrain_outputs=["string"],
+            custom_eval_thresholds={"foo": 0},
             custom_metadata={},
             eval_scores={"foo": 0},
             options={
