feat: add label field to provider options (promptfoo#563)
typpo authored Mar 17, 2024
1 parent 47e23e6 commit 5efb6fe
Showing 9 changed files with 112 additions and 87 deletions.
16 changes: 8 additions & 8 deletions examples/gpt-3.5-temperature-comparison/promptfooconfig.yaml
@@ -2,14 +2,14 @@ prompts:
   - 'Respond to the following instruction: {{message}}'
 
 providers:
-  - openai:gpt-3.5-turbo-0613:
-      id: openai-gpt-3.5-turbo-lowtemp
-      config:
-        temperature: 0
-  - openai:gpt-3.5-turbo-0613:
-      id: openai-gpt-3.5-turbo-hightemp
-      config:
-        temperature: 1
+  - id: openai:gpt-3.5-turbo-0613
+    label: openai-gpt-3.5-turbo-lowtemp
+    config:
+      temperature: 0
+  - id: openai:gpt-3.5-turbo-0613
+    label: openai-gpt-3.5-turbo-hightemp
+    config:
+      temperature: 1
 
 tests:
   - vars:
35 changes: 18 additions & 17 deletions examples/llama-gpt-comparison/promptfooconfig.yaml
@@ -3,23 +3,24 @@ prompts:
   prompts/completion_prompt.txt: completion_prompt
 
 providers:
-  - openai:gpt-3.5-turbo-0613:
-      id: openai-gpt-3.5-turbo-lowtemp
-      prompts: chat_prompt
-      config:
-        temperature: 0
-        max_tokens: 128
-  - openai:gpt-3.5-turbo-0613:
-      id: openai-gpt-3.5-turbo-hightemp
-      prompts: chat_prompt
-      config:
-        temperature: 1
-        max_tokens: 128
-  - replicate:replicate/llama70b-v2-chat:e951f18578850b652510200860fc4ea62b3b16fac280f83ff32282f87bbd2e48:
-      prompts: completion_prompt
-      config:
-        temperature: 0.01 # minimum temperature
-        max_length: 128
+  - id: openai:gpt-3.5-turbo-0613
+    label: openai-gpt-3.5-turbo-lowtemp
+    prompts: chat_prompt
+    config:
+      temperature: 0
+      max_tokens: 128
+  - id: openai:gpt-3.5-turbo-0613
+    label: openai-gpt-3.5-turbo-hightemp
+    prompts: chat_prompt
+    config:
+      temperature: 1
+      max_tokens: 128
+  - id: replicate:meta/llama70b-v2-chat:02e509c789964a7ea8736978a43525956ef40397be9033abf9fd2badfe68c9e3
+    label: llama70b-v2-chat
+    prompts: completion_prompt
+    config:
+      temperature: 0.01 # minimum temperature
+      max_length: 128
 
 tests:
   - vars:
13 changes: 7 additions & 6 deletions site/docs/configuration/parameters.md
@@ -131,12 +131,13 @@ prompts:
   prompts/llama_completion_prompt.txt: llama_completion_prompt
 providers:
-  - openai:gpt-3.5-turbo-0613:
-      prompts: gpt_chat_prompt
-  - openai:gpt-4-turbo-0613:
-      prompts: gpt_chat_prompt
-  - replicate:replicate/llama70b-v2-chat:e951f18578850b652510200860fc4ea62b3b16fac280f83ff32282f87bbd2e48:
-      prompts: llama_completion_prompt
+  - id: openai:gpt-3.5-turbo-0613
+    prompts: gpt_chat_prompt
+  - id: openai:gpt-4-turbo-0613
+    prompts: gpt_chat_prompt
+  - id: replicate:meta/llama70b-v2-chat:02e509c789964a7ea8736978a43525956ef40397be9033abf9fd2badfe68c9e3
+    label: llama70b-v2-chat
+    prompts: llama_completion_prompt
 ```
 
 In this configuration, the `gpt_chat_prompt` is used for both GPT-3.5 and GPT-4 models, while the `llama_completion_prompt` is used for the Llama v2 model. The prompts are defined in separate files within the `prompts` directory.
57 changes: 30 additions & 27 deletions site/docs/guides/compare-llama2-vs-gpt.md
@@ -69,12 +69,15 @@ prompts:
   prompts/completion_prompt.txt: completion_prompt
 providers:
-  - openai:gpt-3.5-turbo-0613:
-      prompts: chat_prompt
-  - openai:gpt-4-0613:
-      prompts: chat_prompt
-  - replicate:replicate/llama70b-v2-chat:e951f18578850b652510200860fc4ea62b3b16fac280f83ff32282f87bbd2e48:
-      prompts: completion_prompt
+  - id: openai:gpt-3.5-turbo-0613
+    label: gpt-3.5
+    prompts: chat_prompt
+  - id: openai:gpt-4-0613
+    label: gpt-4
+    prompts: chat_prompt
+  - id: replicate:meta/llama70b-v2-chat:02e509c789964a7ea8736978a43525956ef40397be9033abf9fd2badfe68c9e3
+    label: llama70b-v2-chat
+    prompts: completion_prompt
 ```
 
 :::info
@@ -161,27 +164,27 @@ Each model has a `config` field where you can specify additional parameters. Let
 
 ```yaml title=promptfooconfig.yaml
 providers:
-  - openai:gpt-3.5-turbo-0613:
-      prompts: chat_prompt
-      // highlight-start
-      config:
-        temperature: 0
-        max_tokens: 128
-      // highlight-end
-  - openai:gpt-4-0613:
-      prompts: chat_prompt
-      // highlight-start
-      config:
-        temperature: 0
-        max_tokens: 128
-      // highlight-end
-  - replicate:replicate/llama70b-v2-chat:e951f18578850b652510200860fc4ea62b3b16fac280f83ff32282f87bbd2e48:
-      prompts: completion_prompt
-      // highlight-start
-      config:
-        temperature: 0.01 # minimum temperature
-        max_length: 128
-      // highlight-end
+  - id: openai:gpt-3.5-turbo-0613
+    prompts: chat_prompt
+    // highlight-start
+    config:
+      temperature: 0
+      max_tokens: 128
+    // highlight-end
+  - id: openai:gpt-4-0613
+    prompts: chat_prompt
+    // highlight-start
+    config:
+      temperature: 0
+      max_tokens: 128
+    // highlight-end
+  - id: replicate:meta/llama70b-v2-chat:02e509c789964a7ea8736978a43525956ef40397be9033abf9fd2badfe68c9e3
+    prompts: completion_prompt
+    // highlight-start
+    config:
+      temperature: 0.01 # minimum temperature
+      max_length: 128
+    // highlight-end
 ```
 
 Here's what each parameter means:
40 changes: 20 additions & 20 deletions site/docs/guides/evaluate-llm-temperature.md
@@ -39,14 +39,14 @@ prompts:
   - 'Respond to the following instruction: {{message}}'
 
 providers:
-  - openai:gpt-3.5-turbo-0613:
-      id: openai-gpt-3.5-turbo-lowtemp
-      config:
-        temperature: 0.2
-  - openai:gpt-3.5-turbo-0613:
-      id: openai-gpt-3.5-turbo-hightemp
-      config:
-        temperature: 0.9
+  - id: openai:gpt-3.5-turbo-0613
+    label: openai-gpt-3.5-turbo-lowtemp
+    config:
+      temperature: 0.2
+  - id: openai:gpt-3.5-turbo-0613
+    label: openai-gpt-3.5-turbo-hightemp
+    config:
+      temperature: 0.9
 
 tests:
   - vars:
@@ -130,18 +130,18 @@ Set a constant seed in the provider config:
 
 ```yaml
 providers:
-  - openai:gpt-3.5-turbo-0613:
-      id: openai-gpt-3.5-turbo-lowtemp
-      config:
-        temperature: 0.2
-        // highlight-next-line
-        seed: 0
-  - openai:gpt-3.5-turbo-0613:
-      id: openai-gpt-3.5-turbo-hightemp
-      config:
-        temperature: 0.9
-        // highlight-next-line
-        seed: 0
+  - id: openai:gpt-3.5-turbo-0613
+    label: openai-gpt-3.5-turbo-lowtemp
+    config:
+      temperature: 0.2
+      // highlight-next-line
+      seed: 0
+  - id: openai:gpt-3.5-turbo-0613
+    label: openai-gpt-3.5-turbo-hightemp
+    config:
+      temperature: 0.9
+      // highlight-next-line
+      seed: 0
 ```
 
 The `eval` command also has a parameter, `repeat`, which runs each test multiple times:
14 changes: 8 additions & 6 deletions src/evaluator.ts
@@ -211,7 +211,7 @@ class Evaluator {
 
     // Set up the special _conversation variable
     const vars = test.vars || {};
-    const conversationKey = `${provider.id()}:${prompt.id}`;
+    const conversationKey = `${provider.label || provider.id()}:${prompt.id}`;
     const usesConversation = prompt.raw.includes('_conversation');
     if (
       !process.env.PROMPTFOO_DISABLE_CONVERSATION_VAR &&
@@ -232,6 +232,7 @@ class Evaluator {
     const setup = {
       provider: {
         id: provider.id(),
+        label: provider.label,
       },
       prompt: {
         raw: renderedPrompt,
@@ -416,10 +417,10 @@ class Evaluator {
             continue;
           }
         }
-        prompts.push({
+        const completedPrompt = {
           ...prompt,
           id: sha256(typeof prompt.raw === 'object' ? JSON.stringify(prompt.raw) : prompt.raw),
-          provider: provider.id(),
+          provider: provider.label || provider.id(),
           display: prompt.display,
           metrics: {
             score: 0,
@@ -437,7 +438,8 @@ class Evaluator {
             namedScores: {},
             cost: 0,
           },
-        });
+        };
+        prompts.push(completedPrompt);
       }
     }
 
@@ -624,7 +626,7 @@ class Evaluator {
       numComplete++;
       if (progressbar) {
         progressbar.increment({
-          provider: evalStep.provider.id(),
+          provider: evalStep.provider.label || evalStep.provider.id(),
           prompt: evalStep.prompt.raw.slice(0, 10).replace(/\n/g, ' '),
           vars: Object.entries(evalStep.test.vars || {})
             .map(([k, v]) => `${k}=${v}`)
@@ -686,7 +688,7 @@ class Evaluator {
         namedScores: row.namedScores,
         text: resultText,
         prompt: row.prompt.raw,
-        provider: row.provider.id,
+        provider: row.provider.label || row.provider.id,
         latencyMs: row.latencyMs,
         tokenUsage: row.response?.tokenUsage,
         gradingResult: row.gradingResult,
3 changes: 3 additions & 0 deletions src/prompts.ts
@@ -54,6 +54,9 @@ export function readProviderPromptMap(
         'You must specify an `id` on the Provider when you override options.prompts',
       );
       ret[rawProvider.id] = rawProvider.prompts || allPrompts;
+      if (rawProvider.label) {
+        ret[rawProvider.label] = rawProvider.prompts || allPrompts;
+      }
     } else {
       const rawProvider = provider as ProviderOptionsMap;
       const originalId = Object.keys(rawProvider)[0];
5 changes: 3 additions & 2 deletions src/providers.ts
@@ -112,8 +112,9 @@ export async function loadApiProvider(
   } = {},
 ): Promise<ApiProvider> {
   const { options = {}, basePath, env } = context;
-  const providerOptions = {
-    id: options.id,
+  const providerOptions: ProviderOptions = {
+    // Hack(ian): Override id with label. This makes it so that debug and display info, which rely on id, will use the label instead.
+    id: options.label || options.id,
     config: {
       ...options.config,
       basePath,
16 changes: 15 additions & 1 deletion src/types.ts
@@ -60,8 +60,10 @@ export interface EnvOverrides {
 
 export interface ProviderOptions {
   id?: ProviderId;
+  label?: ProviderLabel;
   config?: any;
   prompts?: string[]; // List of prompt display strings
+  env?: EnvOverrides;
 }
 
 export interface CallApiContextParams {
@@ -73,14 +75,24 @@ export interface CallApiOptionsParams {
 }
 
 export interface ApiProvider {
+  // Unique identifier for the provider
   id: () => string;
+
+  // Text generation function
   callApi: (
     prompt: string,
     context?: CallApiContextParams,
     options?: CallApiOptionsParams,
   ) => Promise<ProviderResponse>;
+
+  // Embedding function
   callEmbeddingApi?: (prompt: string) => Promise<ProviderEmbeddingResponse>;
+
+  // Classification function
   callClassificationApi?: (prompt: string) => Promise<ProviderClassificationResponse>;
+
+  // Shown on output
+  label?: ProviderLabel;
 }
 
 export interface ApiEmbeddingProvider extends ApiProvider {
@@ -216,7 +228,7 @@ export interface PromptWithMetadata {
 }
 
 export interface EvaluateResult {
-  provider: Pick<ProviderOptions, 'id'>;
+  provider: Pick<ProviderOptions, 'id' | 'label'>;
   prompt: Prompt;
   vars: Record<string, string | object>;
   response?: ProviderResponse;
@@ -463,6 +475,8 @@ export interface TestSuite {
 
 export type ProviderId = string;
 
+export type ProviderLabel = string;
+
 export type ProviderFunction = ApiProvider['callApi'];
 
 export type ProviderOptionsMap = Record<ProviderId, ProviderOptions>;
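The updated configs above show the new shape: `id` names the underlying provider implementation, while the optional `label` is the name surfaced in eval output, progress display, and prompt metrics, falling back to the `id` when absent (see `loadApiProvider` in `src/providers.ts`). A minimal sketch, reusing the provider ids and labels from the examples changed in this commit:

```yaml
# Sketch based on the updated example configs: two providers share the same
# underlying model id but are told apart in results by their labels.
providers:
  - id: openai:gpt-3.5-turbo-0613
    label: openai-gpt-3.5-turbo-lowtemp # shown in output instead of the raw id
    config:
      temperature: 0
  - id: openai:gpt-3.5-turbo-0613
    label: openai-gpt-3.5-turbo-hightemp
    config:
      temperature: 1
```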
