Commit a5bbebe

feat(api): add validate endpoint
1 parent a5e9f6e commit a5bbebe

7 files changed: +782 −4 lines changed

.stats.yml

Lines changed: 2 additions & 2 deletions
@@ -1,3 +1,3 @@
-configured_endpoints: 43
+configured_endpoints: 44
 openapi_spec_hash: 97719fe7ae4c641a5a020dd21f2978dd
-config_hash: 5e459b33c53ffa6e554087a779bdb790
+config_hash: 659f65b6ccf5612986f920f7f9abbcb5

api.md

Lines changed: 2 additions & 0 deletions
@@ -142,6 +142,7 @@ from codex.types import (
     ProjectExportResponse,
     ProjectIncrementQueriesResponse,
     ProjectRetrieveAnalyticsResponse,
+    ProjectValidateResponse,
 )
 ```

@@ -155,6 +156,7 @@ Methods:
 - <code title="get /api/projects/{project_id}/export">client.projects.<a href="./src/codex/resources/projects/projects.py">export</a>(project_id) -> <a href="./src/codex/types/project_export_response.py">object</a></code>
 - <code title="post /api/projects/{project_id}/increment_queries">client.projects.<a href="./src/codex/resources/projects/projects.py">increment_queries</a>(project_id, \*\*<a href="src/codex/types/project_increment_queries_params.py">params</a>) -> <a href="./src/codex/types/project_increment_queries_response.py">object</a></code>
 - <code title="get /api/projects/{project_id}/analytics/">client.projects.<a href="./src/codex/resources/projects/projects.py">retrieve_analytics</a>(project_id, \*\*<a href="src/codex/types/project_retrieve_analytics_params.py">params</a>) -> <a href="./src/codex/types/project_retrieve_analytics_response.py">ProjectRetrieveAnalyticsResponse</a></code>
+- <code title="post /api/projects/{project_id}/validate">client.projects.<a href="./src/codex/resources/projects/projects.py">validate</a>(project_id, \*\*<a href="src/codex/types/project_validate_params.py">params</a>) -> <a href="./src/codex/types/project_validate_response.py">ProjectValidateResponse</a></code>

 ## AccessKeys

src/codex/resources/projects/projects.py

Lines changed: 376 additions & 2 deletions
Large diffs are not rendered by default.
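
Since the implementation diff in projects.py is not rendered here, below is a minimal, hedged sketch of how the new validate method might be called from the generated client, based only on the signature listed in api.md above. The client class name (`Codex`), environment-based authentication, and the project_id value are assumptions for illustration, not details taken from this commit.

```python
# Hedged usage sketch, not part of the commit. Assumes the generated client class is
# `Codex` and that API credentials are picked up from the environment.
from codex import Codex

client = Codex()

# "my-project-id" is a placeholder; the four keyword arguments are the required
# fields of ProjectValidateParams.
result = client.projects.validate(
    "my-project-id",
    context="Acme's refund window is 30 days from delivery.",
    prompt="Answer the question using only the provided context.",
    query="How long do I have to return an item?",
    response="You can return items within 30 days of delivery.",
)
print(result.is_bad_response, result.expert_answer)
```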

src/codex/types/__init__.py

Lines changed: 2 additions & 0 deletions
@@ -12,7 +12,9 @@
 from .project_list_response import ProjectListResponse as ProjectListResponse
 from .project_return_schema import ProjectReturnSchema as ProjectReturnSchema
 from .project_update_params import ProjectUpdateParams as ProjectUpdateParams
+from .project_validate_params import ProjectValidateParams as ProjectValidateParams
 from .project_retrieve_response import ProjectRetrieveResponse as ProjectRetrieveResponse
+from .project_validate_response import ProjectValidateResponse as ProjectValidateResponse
 from .organization_schema_public import OrganizationSchemaPublic as OrganizationSchemaPublic
 from .user_activate_account_params import UserActivateAccountParams as UserActivateAccountParams
 from .project_increment_queries_params import ProjectIncrementQueriesParams as ProjectIncrementQueriesParams
src/codex/types/project_validate_params.py (new file)

Lines changed: 169 additions & 0 deletions

# File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details.

from __future__ import annotations

from typing import Dict, List, Iterable, Optional
from typing_extensions import Literal, Required, Annotated, TypedDict

from .._utils import PropertyInfo

__all__ = ["ProjectValidateParams", "BadResponseThresholds", "Options"]


class ProjectValidateParams(TypedDict, total=False):
    context: Required[str]

    prompt: Required[str]

    query: Required[str]

    response: Required[str]

    use_llm_matching: bool

    bad_response_thresholds: BadResponseThresholds

    constrain_outputs: Optional[List[str]]

    custom_metadata: Optional[object]
    """Arbitrary metadata supplied by the user/system"""

    eval_scores: Optional[Dict[str, float]]
    """Evaluation scores to use for flagging a response as bad.

    If not provided, TLM will be used to generate scores.
    """

    options: Optional[Options]
    """
    Typed dict of advanced configuration options for the Trustworthy Language Model.
    Many of these configurations are determined by the quality preset selected
    (learn about quality presets in the TLM [initialization method](./#class-tlm)).
    Specifying TLMOptions values directly overrides any default values set from the
    quality preset.

    For all options described below, higher settings will lead to longer runtimes
    and may consume more tokens internally. You may not be able to run long prompts
    (or prompts with long responses) in your account, unless your token/rate limits
    are increased. If you hit token limit issues, try lower/less expensive
    TLMOptions to be able to run longer prompts/responses, or contact Cleanlab to
    increase your limits.

    The default values corresponding to each quality preset are:

    - **best:** `num_candidate_responses` = 6, `num_consistency_samples` = 8,
      `use_self_reflection` = True. This preset improves LLM responses.
    - **high:** `num_candidate_responses` = 4, `num_consistency_samples` = 8,
      `use_self_reflection` = True. This preset improves LLM responses.
    - **medium:** `num_candidate_responses` = 1, `num_consistency_samples` = 8,
      `use_self_reflection` = True.
    - **low:** `num_candidate_responses` = 1, `num_consistency_samples` = 4,
      `use_self_reflection` = True.
    - **base:** `num_candidate_responses` = 1, `num_consistency_samples` = 0,
      `use_self_reflection` = False. When using `get_trustworthiness_score()` on
      "base" preset, a cheaper self-reflection will be used to compute the
      trustworthiness score.

    By default, the TLM uses the "medium" quality preset. The default base LLM
    `model` used is "gpt-4o-mini", and `max_tokens` is 512 for all quality presets.
    You can set custom values for these arguments regardless of the quality preset
    specified.

    Args: model ({"gpt-4o-mini", "gpt-4o", "gpt-4.1", "gpt-4.1-mini",
    "gpt-4.1-nano", "o4-mini", "o3", "o3-mini", "o1", "o1-mini", "gpt-4",
    "gpt-4.5-preview", "gpt-3.5-turbo-16k", "claude-3.7-sonnet",
    "claude-3.5-sonnet-v2", "claude-3.5-sonnet", "claude-3.5-haiku",
    "claude-3-haiku", "nova-micro", "nova-lite", "nova-pro"}, default =
    "gpt-4o-mini"): Underlying base LLM to use (better models yield better results,
    faster models yield faster/cheaper results). - Models still in beta: "o3", "o1",
    "o4-mini", "o3-mini", "o1-mini", "gpt-4.1", "gpt-4.1-mini", "gpt-4.1-nano",
    "gpt-4.5-preview", "claude-3.7-sonnet", "claude-3.5-sonnet-v2",
    "claude-3.5-haiku", "nova-micro", "nova-lite", "nova-pro". - Recommended models
    for accuracy: "gpt-4.1", "o4-mini", "o3", "claude-3.7-sonnet",
    "claude-3.5-sonnet-v2". - Recommended models for low latency/costs:
    "gpt-4.1-nano", "nova-micro".

    max_tokens (int, default = 512): the maximum number of tokens that can be generated in the TLM response (and in internal trustworthiness scoring).
    Higher values here may produce better (more reliable) TLM responses and trustworthiness scores, but at higher runtimes/costs.
    If you experience token/rate limit errors while using TLM, try lowering this number.
    For OpenAI models, this parameter must be between 64 and 4096. For Claude models, this parameter must be between 64 and 512.

    num_candidate_responses (int, default = 1): how many alternative candidate responses are internally generated in `TLM.prompt()`.
    `TLM.prompt()` scores the trustworthiness of each candidate response, and then returns the most trustworthy one.
    This parameter must be between 1 and 20. It has no effect on `TLM.score()`.
    Higher values here can produce more accurate responses from `TLM.prompt()`, but at higher runtimes/costs.
    When it is 1, `TLM.prompt()` simply returns a standard LLM response and does not attempt to auto-improve it.

    num_consistency_samples (int, default = 8): the amount of internal sampling to measure LLM response consistency, a factor affecting trustworthiness scoring.
    Must be between 0 and 20. Higher values produce more reliable TLM trustworthiness scores, but at higher runtimes/costs.
    Measuring consistency helps quantify the epistemic uncertainty associated with
    strange prompts or prompts that are too vague/open-ended to receive a clearly defined 'good' response.
    TLM measures consistency via the degree of contradiction between sampled responses that the model considers plausible.

    use_self_reflection (bool, default = `True`): whether the LLM is asked to reflect on the given response and directly evaluate correctness/confidence.
    Setting this False disables reflection and will reduce runtimes/costs, but potentially also the reliability of trustworthiness scores.
    Reflection helps quantify aleatoric uncertainty associated with challenging prompts
    and catches responses that are noticeably incorrect/bad upon further analysis.

    similarity_measure ({"semantic", "string", "embedding", "embedding_large", "code", "discrepancy"}, default = "semantic"): how the
    trustworthiness scoring's consistency algorithm measures similarity between alternative responses considered plausible by the model.
    Supported similarity measures include: "semantic" (based on natural language inference),
    "embedding" (based on vector embedding similarity), "embedding_large" (based on a larger embedding model),
    "code" (based on model-based analysis designed to compare code), "discrepancy" (based on model-based analysis of possible discrepancies),
    and "string" (based on character/word overlap). Set this to "string" for minimal runtimes/costs.

    reasoning_effort ({"none", "low", "medium", "high"}, default = "high"): how much internal LLM calls are allowed to reason (number of thinking tokens)
    when generating alternative possible responses and reflecting on responses during trustworthiness scoring.
    Higher reasoning efforts may yield more reliable TLM trustworthiness scores. Reduce this value to reduce runtimes/costs.

    log (list[str], default = []): optionally specify additional logs or metadata that TLM should return.
    For instance, include "explanation" here to get explanations of why a response is scored with low trustworthiness.

    custom_eval_criteria (list[dict[str, Any]], default = []): optionally specify custom evaluation criteria beyond the built-in trustworthiness scoring.
    The expected input format is a list of dictionaries, where each dictionary has the following keys:
    - name: Name of the evaluation criteria.
    - criteria: Instructions specifying the evaluation criteria.
    """

    quality_preset: Literal["best", "high", "medium", "low", "base"]
    """The quality preset to use for the TLM or Trustworthy RAG API."""

    task: Optional[str]

    x_client_library_version: Annotated[str, PropertyInfo(alias="x-client-library-version")]

    x_integration_type: Annotated[str, PropertyInfo(alias="x-integration-type")]

    x_source: Annotated[str, PropertyInfo(alias="x-source")]

    x_stainless_package_version: Annotated[str, PropertyInfo(alias="x-stainless-package-version")]


class BadResponseThresholds(TypedDict, total=False):
    context_sufficiency: Optional[float]

    query_ease: Optional[float]

    response_helpfulness: Optional[float]

    trustworthiness: Optional[float]


class Options(TypedDict, total=False):
    custom_eval_criteria: Iterable[object]

    log: List[str]

    max_tokens: int

    model: str

    num_candidate_responses: int

    num_consistency_samples: int

    reasoning_effort: str

    similarity_measure: str

    use_self_reflection: bool
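
The `bad_response_thresholds`, `options`, and `quality_preset` fields above are optional. As a hedged sketch of how they might be populated, the payload below uses purely illustrative values; none of the thresholds or option choices are recommendations from this commit.

```python
# Illustrative ProjectValidateParams payload; every literal value is an example only.
from codex.types import ProjectValidateParams

params: ProjectValidateParams = {
    "context": "Retrieved context goes here.",
    "prompt": "Full prompt that was sent to the LLM.",
    "query": "The end user's question.",
    "response": "The LLM response to be validated.",
    # BadResponseThresholds: an eval whose score falls below its threshold is treated as bad.
    "bad_response_thresholds": {
        "trustworthiness": 0.7,
        "response_helpfulness": 0.5,
    },
    # Options: TLM configuration; per-preset defaults are documented in the docstring above.
    "options": {
        "model": "gpt-4o-mini",
        "max_tokens": 512,
        "similarity_measure": "string",  # cheapest similarity measure per the docstring
        "log": ["explanation"],
    },
    "quality_preset": "medium",
}

# The dict would then be splatted into the call, e.g.
# client.projects.validate(project_id, **params)
```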
src/codex/types/project_validate_response.py (new file)

Lines changed: 36 additions & 0 deletions

# File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details.

from typing import Dict, Optional

from .._models import BaseModel

__all__ = ["ProjectValidateResponse", "EvalScores"]


class EvalScores(BaseModel):
    is_bad: bool

    score: Optional[float] = None

    log: Optional[object] = None


class ProjectValidateResponse(BaseModel):
    eval_scores: Dict[str, EvalScores]
    """
    Evaluation scores for the original response along with a boolean flag, `is_bad`,
    indicating whether the score is below the threshold.
    """

    expert_answer: Optional[str] = None
    """
    Alternate SME-provided answer from Codex if the response was flagged as bad and
    an answer was found in the Codex Project, or None otherwise.
    """

    is_bad_response: bool
    """True if the response is flagged as potentially bad, False otherwise.

    When True, a lookup is performed, which logs this query in the project for SMEs
    to answer, if it does not already exist.
    """
