Skip to content

Commit

Permalink
formatting
Browse files Browse the repository at this point in the history
  • Loading branch information
typpo committed Oct 10, 2023
1 parent a264792 commit 0604a1a
Show file tree
Hide file tree
Showing 25 changed files with 574 additions and 305 deletions.
4 changes: 2 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -100,8 +100,8 @@ See [Test assertions](https://promptfoo.dev/docs/configuration/expected-outputs)
| `starts-with` | output starts with string |
| `contains-any` | output contains any of the listed substrings |
| `contains-all` | output contains all of the listed substrings |
| `icontains-any` | output contains any of the listed substrings, case insensitive |
| `icontains-all` | output contains all of the listed substrings, case insensitive |
| `icontains-any` | output contains any of the listed substrings, case insensitive |
| `icontains-all` | output contains all of the listed substrings, case insensitive |
| `is-json` | output is valid JSON (optional JSON schema validation) |
| `contains-json` | output contains valid JSON (optional JSON schema validation) |
| `javascript` | provided JavaScript function validates the output |
Expand Down
1 change: 1 addition & 0 deletions examples/mistral-llama-comparison/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ To get started, set your HF_API_TOKEN and REPLICATE_API_KEY environment variable
Next, change a few of the text files in prompts/ and edit promptfooconfig.yaml.

Then run:

```
promptfoo eval
```
Expand Down
1 change: 1 addition & 0 deletions examples/ollama-comparison/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ To get started, set your OPENAI_API_KEY environment variable and start Ollama.
Next, change a few of the prompts in prompts.txt and edit promptfooconfig.yaml.

Then run:

```
promptfoo eval
```
Expand Down
18 changes: 14 additions & 4 deletions src/assertions.ts
Original file line number Diff line number Diff line change
Expand Up @@ -255,7 +255,9 @@ export async function runAssertion(
Array.isArray(renderedValue),
'"icontains-any" assertion type must have an array value',
);
pass = renderedValue.some((value) => output.toLowerCase().includes(String(value).toLowerCase())) !== inverse;
pass =
renderedValue.some((value) => output.toLowerCase().includes(String(value).toLowerCase())) !==
inverse;
return {
pass,
score: pass ? 1 : 0,
Expand Down Expand Up @@ -289,7 +291,9 @@ export async function runAssertion(
Array.isArray(renderedValue),
'"icontains-all" assertion type must have an array value',
);
pass = renderedValue.every((value) => output.toLowerCase().includes(String(value).toLowerCase())) !== inverse;
pass =
renderedValue.every((value) => output.toLowerCase().includes(String(value).toLowerCase())) !==
inverse;
return {
pass,
score: pass ? 1 : 0,
Expand Down Expand Up @@ -391,7 +395,8 @@ export async function runAssertion(
const functionString = assertion.value.toString();
ret.assertion = {
type: 'javascript',
value: functionString.length > 50 ? functionString.slice(0, 50) + '...' : functionString,
value:
functionString.length > 50 ? functionString.slice(0, 50) + '...' : functionString,
};
}
return ret;
Expand Down Expand Up @@ -751,7 +756,12 @@ export function assertionFromString(expected: string): Assertion {
const fullType = notPrefix ? `not-${type}` : type;
const threshold = parseFloat(thresholdStr);

if (type === 'contains-any' || type === 'contains-all' || type === 'icontains-any' || type === 'icontains-all') {
if (
type === 'contains-any' ||
type === 'contains-all' ||
type === 'icontains-any' ||
type === 'icontains-all'
) {
return {
type: fullType as AssertionType,
value: value.split(',').map((s) => s.trim()),
Expand Down
53 changes: 36 additions & 17 deletions src/commands/list.ts
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,8 @@ import telemetry from '../telemetry';
export function listCommand(program: Command) {
const listCommand = program.command('list').description('List various resources');

listCommand.command('evals')
listCommand
.command('evals')
.description('List evaluations.')
.action(async () => {
telemetry.maybeShowNotice();
Expand All @@ -19,21 +20,26 @@ export function listCommand(program: Command) {
await telemetry.send();

const evals = getEvals();
const tableData = evals.map(evl => ({
const tableData = evals.map((evl) => ({
'Eval ID': evl.id.slice(0, 6),
Filename: evl.filePath,
Prompts: evl.results.table.head.prompts.map(p => sha256(p.raw).slice(0, 6)).join(', '),
Vars: evl.results.table.head.vars.map(v => v).join(', '),
Prompts: evl.results.table.head.prompts.map((p) => sha256(p.raw).slice(0, 6)).join(', '),
Vars: evl.results.table.head.vars.map((v) => v).join(', '),
}));

logger.info(wrapTable(tableData));
printBorder();

logger.info(`Run ${chalk.green('promptfoo show eval <id>')} to see details of a specific evaluation.`);
logger.info(`Run ${chalk.green('promptfoo show prompt <id>')} to see details of a specific prompt.`);
logger.info(
`Run ${chalk.green('promptfoo show eval <id>')} to see details of a specific evaluation.`,
);
logger.info(
`Run ${chalk.green('promptfoo show prompt <id>')} to see details of a specific prompt.`,
);
});

listCommand.command('prompts')
listCommand
.command('prompts')
.description('List prompts used')
.action(async () => {
telemetry.maybeShowNotice();
Expand All @@ -43,20 +49,25 @@ export function listCommand(program: Command) {
await telemetry.send();

const prompts = getPrompts().sort((a, b) => b.recentEvalId.localeCompare(a.recentEvalId));
const tableData = prompts.map(prompt => ({
const tableData = prompts.map((prompt) => ({
'Prompt ID': prompt.id.slice(0, 6),
'Raw': prompt.prompt.raw.slice(0, 100) + (prompt.prompt.raw.length > 100 ? '...' : ''),
Raw: prompt.prompt.raw.slice(0, 100) + (prompt.prompt.raw.length > 100 ? '...' : ''),
'# evals': prompt.count,
'Most recent eval': prompt.recentEvalId.slice(0, 6),
}));

logger.info(wrapTable(tableData));
printBorder();
logger.info(`Run ${chalk.green('promptfoo show prompt <id>')} to see details of a specific prompt.`);
logger.info(`Run ${chalk.green('promptfoo show eval <id>')} to see details of a specific evaluation.`);
logger.info(
`Run ${chalk.green('promptfoo show prompt <id>')} to see details of a specific prompt.`,
);
logger.info(
`Run ${chalk.green('promptfoo show eval <id>')} to see details of a specific evaluation.`,
);
});

listCommand.command('datasets')
listCommand
.command('datasets')
.description('List datasets used')
.action(async () => {
telemetry.maybeShowNotice();
Expand All @@ -66,18 +77,26 @@ export function listCommand(program: Command) {
await telemetry.send();

const datasets = getTestCases().sort((a, b) => b.recentEvalId.localeCompare(a.recentEvalId));
const tableData = datasets.map(dataset => ({
const tableData = datasets.map((dataset) => ({
'Dataset ID': dataset.id.slice(0, 6),
'Highest scoring prompt': dataset.prompts.sort((a, b) => (b.prompt.metrics?.score || 0) - (a.prompt.metrics?.score || 0))[0].id.slice(0, 6),
'Highest scoring prompt': dataset.prompts
.sort((a, b) => (b.prompt.metrics?.score || 0) - (a.prompt.metrics?.score || 0))[0]
.id.slice(0, 6),
'# evals': dataset.count,
'# prompts': dataset.prompts.length,
'Most recent eval': dataset.recentEvalId.slice(0, 6),
}));

logger.info(wrapTable(tableData));
printBorder();
logger.info(`Run ${chalk.green('promptfoo show dataset <id>')} to see details of a specific dataset.`);
logger.info(`Run ${chalk.green('promptfoo show prompt <id>')} to see details of a specific prompt.`);
logger.info(`Run ${chalk.green('promptfoo show eval <id>')} to see details of a specific evaluation.`);
logger.info(
`Run ${chalk.green('promptfoo show dataset <id>')} to see details of a specific dataset.`,
);
logger.info(
`Run ${chalk.green('promptfoo show prompt <id>')} to see details of a specific prompt.`,
);
logger.info(
`Run ${chalk.green('promptfoo show eval <id>')} to see details of a specific evaluation.`,
);
});
}
148 changes: 93 additions & 55 deletions src/commands/show.ts
Original file line number Diff line number Diff line change
Expand Up @@ -7,33 +7,42 @@ import logger from '../logger';
import telemetry from '../telemetry';

export function showCommand(program: Command) {
const showCommand = program.command('show <id>').description('Show details of a specific resource').action(async (id: string) => {
const evl = getEvalFromHash(id);
if (evl) {
return handleEval(id);
}

const prompt = getPromptFromHash(id);
if (prompt) {
return handlePrompt(id);
}

const dataset = getDatasetFromHash(id);
if (dataset) {
return handleDataset(id);
}

logger.error(`No resource found with ID ${id}`);
});
const showCommand = program
.command('show <id>')
.description('Show details of a specific resource')
.action(async (id: string) => {
const evl = getEvalFromHash(id);
if (evl) {
return handleEval(id);
}

const prompt = getPromptFromHash(id);
if (prompt) {
return handlePrompt(id);
}

const dataset = getDatasetFromHash(id);
if (dataset) {
return handleDataset(id);
}

logger.error(`No resource found with ID ${id}`);
});

showCommand
.command('eval <id>')
.description('Show details of a specific evaluation')
.action(handleEval);

showCommand.command('prompt <id>').description('Show details of a specific prompt').action(handlePrompt);
showCommand
.command('prompt <id>')
.description('Show details of a specific prompt')
.action(handlePrompt);

showCommand.command('dataset <id>').description('Show details of a specific dataset').action(handleDataset);
showCommand
.command('dataset <id>')
.description('Show details of a specific dataset')
.action(handleDataset);
}

async function handleEval(id: string) {
Expand Down Expand Up @@ -61,7 +70,11 @@ async function handleEval(id: string) {
printBorder();
// TODO(ian): List prompt ids
logger.info(`${prompts.length} prompts`);
logger.info(`${vars.length} variables: ${vars.slice(0, 5).join(', ')}${vars.length > 5 ? ` (and ${vars.length - 5} more...)` : ''}`);
logger.info(
`${vars.length} variables: ${vars.slice(0, 5).join(', ')}${
vars.length > 5 ? ` (and ${vars.length - 5} more...)` : ''
}`,
);
}

async function handlePrompt(id: string) {
Expand Down Expand Up @@ -90,48 +103,73 @@ async function handlePrompt(id: string) {
'Eval ID': evl.id.slice(0, 6),
'Dataset ID': evl.datasetId.slice(0, 6),
'Raw score': evl.metrics?.score.toFixed(2) || '-',
'Pass rate': evl.metrics && evl.metrics.testPassCount + evl.metrics.testFailCount > 0 ? `${(evl.metrics.testPassCount / (evl.metrics.testPassCount + evl.metrics.testFailCount) * 100).toFixed(2)}%` : '-',
'Pass rate':
evl.metrics && evl.metrics.testPassCount + evl.metrics.testFailCount > 0
? `${(
(evl.metrics.testPassCount /
(evl.metrics.testPassCount + evl.metrics.testFailCount)) *
100
).toFixed(2)}%`
: '-',
'Pass count': evl.metrics?.testPassCount || '-',
'Fail count': evl.metrics?.testFailCount || '-',
});
}
logger.info(wrapTable(table));
printBorder();
logger.info(`Run ${chalk.green('promptfoo show eval <id>')} to see details of a specific evaluation.`);
logger.info(`Run ${chalk.green('promptfoo show dataset <id>')} to see details of a specific dataset.`);
logger.info(
`Run ${chalk.green('promptfoo show eval <id>')} to see details of a specific evaluation.`,
);
logger.info(
`Run ${chalk.green('promptfoo show dataset <id>')} to see details of a specific dataset.`,
);
}

async function handleDataset(id: string) {
telemetry.maybeShowNotice();
telemetry.record('command_used', {
name: 'show dataset',
telemetry.maybeShowNotice();
telemetry.record('command_used', {
name: 'show dataset',
});
await telemetry.send();

const dataset = getDatasetFromHash(id);
if (!dataset) {
logger.error(`Dataset with ID ${id} not found.`);
return;
}

printBorder();
logger.info(chalk.bold(`Dataset ${id}`));
printBorder();

logger.info(`This dataset is used in the following evals:`);
const table = [];
for (const prompt of dataset.prompts
.sort((a, b) => b.evalId.localeCompare(a.evalId))
.slice(0, 10)) {
table.push({
'Eval ID': prompt.evalId.slice(0, 6),
'Prompt ID': prompt.id.slice(0, 6),
'Raw score': prompt.prompt.metrics?.score.toFixed(2) || '-',
'Pass rate':
prompt.prompt.metrics &&
prompt.prompt.metrics.testPassCount + prompt.prompt.metrics.testFailCount > 0
? `${(
(prompt.prompt.metrics.testPassCount /
(prompt.prompt.metrics.testPassCount + prompt.prompt.metrics.testFailCount)) *
100
).toFixed(2)}%`
: '-',
'Pass count': prompt.prompt.metrics?.testPassCount || '-',
'Fail count': prompt.prompt.metrics?.testFailCount || '-',
});
await telemetry.send();

const dataset = getDatasetFromHash(id);
if (!dataset) {
logger.error(`Dataset with ID ${id} not found.`);
return;
}

printBorder();
logger.info(chalk.bold(`Dataset ${id}`));
printBorder();

logger.info(`This dataset is used in the following evals:`);
const table = [];
for (const prompt of dataset.prompts.sort((a, b) => b.evalId.localeCompare(a.evalId)).slice(0, 10)) {
table.push({
'Eval ID': prompt.evalId.slice(0, 6),
'Prompt ID': prompt.id.slice(0, 6),
'Raw score': prompt.prompt.metrics?.score.toFixed(2) || '-',
'Pass rate': prompt.prompt.metrics && prompt.prompt.metrics.testPassCount + prompt.prompt.metrics.testFailCount > 0 ? `${(prompt.prompt.metrics.testPassCount / (prompt.prompt.metrics.testPassCount + prompt.prompt.metrics.testFailCount) * 100).toFixed(2)}%` : '-',
'Pass count': prompt.prompt.metrics?.testPassCount || '-',
'Fail count': prompt.prompt.metrics?.testFailCount || '-',
});
}
logger.info(wrapTable(table));
printBorder();
logger.info(`Run ${chalk.green('promptfoo show prompt <id>')} to see details of a specific prompt.`);
logger.info(`Run ${chalk.green('promptfoo show eval <id>')} to see details of a specific evaluation.`);
}
logger.info(wrapTable(table));
printBorder();
logger.info(
`Run ${chalk.green('promptfoo show prompt <id>')} to see details of a specific prompt.`,
);
logger.info(
`Run ${chalk.green('promptfoo show eval <id>')} to see details of a specific evaluation.`,
);
}
6 changes: 4 additions & 2 deletions src/evaluator.ts
Original file line number Diff line number Diff line change
Expand Up @@ -562,8 +562,10 @@ class Evaluator {
metrics.score += row.score;
metrics.testPassCount += row.success ? 1 : 0;
metrics.testFailCount += row.success ? 0 : 1;
metrics.assertPassCount += row.gradingResult?.componentResults?.filter(r => r.pass).length || 0;
metrics.assertFailCount += row.gradingResult?.componentResults?.filter(r => !r.pass).length || 0;
metrics.assertPassCount +=
row.gradingResult?.componentResults?.filter((r) => r.pass).length || 0;
metrics.assertFailCount +=
row.gradingResult?.componentResults?.filter((r) => !r.pass).length || 0;
},
);

Expand Down
5 changes: 4 additions & 1 deletion src/providers.ts
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,10 @@ import {
AzureOpenAiCompletionProvider,
AzureOpenAiEmbeddingProvider,
} from './providers/azureopenai';
import { HuggingfaceFeatureExtractionProvider, HuggingfaceTextGenerationProvider } from './providers/huggingface';
import {
HuggingfaceFeatureExtractionProvider,
HuggingfaceTextGenerationProvider,
} from './providers/huggingface';

import type {
ApiProvider,
Expand Down
3 changes: 2 additions & 1 deletion src/providers/azureopenai.ts
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,8 @@ class AzureOpenAiGenericProvider implements ApiProvider {
this.deploymentName = deploymentName;

this.apiKey = config?.apiKey || env?.AZURE_OPENAI_API_KEY || process.env.AZURE_OPENAI_API_KEY;
this.apiHost = config?.apiHost || env?.AZURE_OPENAI_API_HOST || process.env.AZURE_OPENAI_API_HOST;
this.apiHost =
config?.apiHost || env?.AZURE_OPENAI_API_HOST || process.env.AZURE_OPENAI_API_HOST;

this.config = config || {};
this.id = id ? () => id : this.id;
Expand Down
Loading

0 comments on commit 0604a1a

Please sign in to comment.