# File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details.

from __future__ import annotations

from typing import Dict, List, Iterable, Optional
from typing_extensions import Literal, Required, Annotated, TypedDict

from .._utils import PropertyInfo

__all__ = ["ProjectValidateParams", "BadResponseThresholds", "Options"]


class ProjectValidateParams(TypedDict, total=False):
    context: Required[str]

    prompt: Required[str]

    query: Required[str]

    response: Required[str]

    use_llm_matching: bool

    bad_response_thresholds: BadResponseThresholds

    constrain_outputs: Optional[List[str]]

    custom_metadata: Optional[object]
    """Arbitrary metadata supplied by the user/system"""

    eval_scores: Optional[Dict[str, float]]
    """Evaluation scores to use for flagging a response as bad.

    If not provided, TLM will be used to generate scores.
    """

    options: Optional[Options]
    """
    Typed dict of advanced configuration options for the Trustworthy Language Model.
    Many of these configurations are determined by the quality preset selected
    (learn about quality presets in the TLM [initialization method](./#class-tlm)).
    Specifying TLMOptions values directly overrides any default values set from the
    quality preset.

    For all options described below, higher settings will lead to longer runtimes
    and may consume more tokens internally. You may not be able to run long prompts
    (or prompts with long responses) in your account, unless your token/rate limits
    are increased. If you hit token limit issues, try lower/less expensive
    TLMOptions to be able to run longer prompts/responses, or contact Cleanlab to
    increase your limits.

    The default values corresponding to each quality preset are:

    - **best:** `num_candidate_responses` = 6, `num_consistency_samples` = 8,
      `use_self_reflection` = True. This preset improves LLM responses.
    - **high:** `num_candidate_responses` = 4, `num_consistency_samples` = 8,
      `use_self_reflection` = True. This preset improves LLM responses.
    - **medium:** `num_candidate_responses` = 1, `num_consistency_samples` = 8,
      `use_self_reflection` = True.
    - **low:** `num_candidate_responses` = 1, `num_consistency_samples` = 4,
      `use_self_reflection` = True.
    - **base:** `num_candidate_responses` = 1, `num_consistency_samples` = 0,
      `use_self_reflection` = False. When using `get_trustworthiness_score()` on
      "base" preset, a cheaper self-reflection will be used to compute the
      trustworthiness score.

    By default, the TLM uses the "medium" quality preset. The default base LLM
    `model` used is "gpt-4o-mini", and `max_tokens` is 512 for all quality presets.
    You can set custom values for these arguments regardless of the quality preset
    specified.

    Args: model ({"gpt-4o-mini", "gpt-4o", "gpt-4.1", "gpt-4.1-mini",
    "gpt-4.1-nano", "o4-mini", "o3", "o3-mini", "o1", "o1-mini", "gpt-4",
    "gpt-4.5-preview", "gpt-3.5-turbo-16k", "claude-3.7-sonnet",
    "claude-3.5-sonnet-v2", "claude-3.5-sonnet", "claude-3.5-haiku",
    "claude-3-haiku", "nova-micro", "nova-lite", "nova-pro"}, default =
    "gpt-4o-mini"): Underlying base LLM to use (better models yield better results,
    faster models yield faster/cheaper results).

    - Models still in beta: "o3", "o1", "o4-mini", "o3-mini", "o1-mini", "gpt-4.1",
      "gpt-4.1-mini", "gpt-4.1-nano", "gpt-4.5-preview", "claude-3.7-sonnet",
      "claude-3.5-sonnet-v2", "claude-3.5-haiku", "nova-micro", "nova-lite",
      "nova-pro".
    - Recommended models for accuracy: "gpt-4.1", "o4-mini", "o3",
      "claude-3.7-sonnet", "claude-3.5-sonnet-v2".
    - Recommended models for low latency/costs: "gpt-4.1-nano", "nova-micro".

    max_tokens (int, default = 512): the maximum number of tokens that can be generated in the TLM response (and in internal trustworthiness scoring).
    Higher values here may produce better (more reliable) TLM responses and trustworthiness scores, but at higher runtimes/costs.
    If you experience token/rate limit errors while using TLM, try lowering this number.
    For OpenAI models, this parameter must be between 64 and 4096. For Claude models, this parameter must be between 64 and 512.

    num_candidate_responses (int, default = 1): how many alternative candidate responses are internally generated in `TLM.prompt()`.
    `TLM.prompt()` scores the trustworthiness of each candidate response, and then returns the most trustworthy one.
    This parameter must be between 1 and 20. It has no effect on `TLM.score()`.
    Higher values here can produce more accurate responses from `TLM.prompt()`, but at higher runtimes/costs.
    When it is 1, `TLM.prompt()` simply returns a standard LLM response and does not attempt to auto-improve it.

    num_consistency_samples (int, default = 8): the amount of internal sampling to measure LLM response consistency, a factor affecting trustworthiness scoring.
    Must be between 0 and 20. Higher values produce more reliable TLM trustworthiness scores, but at higher runtimes/costs.
    Measuring consistency helps quantify the epistemic uncertainty associated with
    strange prompts or prompts that are too vague/open-ended to receive a clearly defined 'good' response.
    TLM measures consistency via the degree of contradiction between sampled responses that the model considers plausible.

    use_self_reflection (bool, default = `True`): whether the LLM is asked to reflect on the given response and directly evaluate correctness/confidence.
    Setting this False disables reflection and will reduce runtimes/costs, but potentially also the reliability of trustworthiness scores.
    Reflection helps quantify aleatoric uncertainty associated with challenging prompts
    and catches responses that are noticeably incorrect/bad upon further analysis.

    similarity_measure ({"semantic", "string", "embedding", "embedding_large", "code", "discrepancy"}, default = "semantic"): how the
    trustworthiness scoring's consistency algorithm measures similarity between alternative responses considered plausible by the model.
    Supported similarity measures include: "semantic" (based on natural language inference),
    "embedding" (based on vector embedding similarity), "embedding_large" (based on a larger embedding model),
    "code" (based on model-based analysis designed to compare code), "discrepancy" (based on model-based analysis of possible discrepancies),
    and "string" (based on character/word overlap). Set this to "string" for minimal runtimes/costs.

    reasoning_effort ({"none", "low", "medium", "high"}, default = "high"): how much the internal LLM calls are allowed to reason (number of thinking tokens)
    when generating alternative possible responses and reflecting on responses during trustworthiness scoring.
    Higher reasoning effort may yield more reliable TLM trustworthiness scores. Reduce this value to reduce runtimes/costs.

    log (list[str], default = []): optionally specify additional logs or metadata that TLM should return.
    For instance, include "explanation" here to get explanations of why a response is scored with low trustworthiness.

    custom_eval_criteria (list[dict[str, Any]], default = []): optionally specify custom evaluation criteria beyond the built-in trustworthiness scoring.
    The expected input format is a list of dictionaries, where each dictionary has the following keys:
    - name: Name of the evaluation criteria.
    - criteria: Instructions specifying the evaluation criteria.
    """

    quality_preset: Literal["best", "high", "medium", "low", "base"]
    """The quality preset to use for the TLM or Trustworthy RAG API."""

    task: Optional[str]

    x_client_library_version: Annotated[str, PropertyInfo(alias="x-client-library-version")]

    x_integration_type: Annotated[str, PropertyInfo(alias="x-integration-type")]

    x_source: Annotated[str, PropertyInfo(alias="x-source")]

    x_stainless_package_version: Annotated[str, PropertyInfo(alias="x-stainless-package-version")]


class BadResponseThresholds(TypedDict, total=False):
    context_sufficiency: Optional[float]

    query_ease: Optional[float]

    response_helpfulness: Optional[float]

    trustworthiness: Optional[float]


class Options(TypedDict, total=False):
    custom_eval_criteria: Iterable[object]

    log: List[str]

    max_tokens: int

    model: str

    num_candidate_responses: int

    num_consistency_samples: int

    reasoning_effort: str

    similarity_measure: str

    use_self_reflection: bool
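

# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the generated spec): how a caller might
# assemble these TypedDicts before sending a validate request. The concrete
# values below are assumptions chosen for the example, drawing on the defaults
# described in the `options` docstring above; the resulting dict would
# typically be passed as keyword arguments to the client's project validate
# endpoint.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    example_options: Options = {
        "model": "gpt-4o-mini",  # documented default base LLM
        "max_tokens": 512,  # documented default; 64-4096 for OpenAI, 64-512 for Claude
        "num_consistency_samples": 8,  # "medium" preset default
        "use_self_reflection": True,
        "similarity_measure": "string",  # cheapest similarity measure per the docstring
        "log": ["explanation"],  # ask TLM to explain low trustworthiness scores
    }

    example_thresholds: BadResponseThresholds = {
        # Hypothetical threshold values used when flagging a response as bad.
        "trustworthiness": 0.7,
        "response_helpfulness": 0.5,
    }

    example_params: ProjectValidateParams = {
        "context": "Paris is the capital of France.",
        "prompt": "Using the context, answer: What is the capital of France?",
        "query": "What is the capital of France?",
        "response": "The capital of France is Paris.",
        "options": example_options,
        "bad_response_thresholds": example_thresholds,
        "quality_preset": "medium",
    }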