6 changes: 6 additions & 0 deletions .changeset/sixty-planets-whisper.md
@@ -0,0 +1,6 @@
---
'@example/ai-functions': patch
'@ai-sdk/openai': patch
---

Support for `gpt-4o-transcribe-diarize` for audio longer than 30 seconds, including a default response format of `diarized_json`
26 changes: 19 additions & 7 deletions content/providers/01-ai-sdk-providers/03-openai.mdx
@@ -2029,8 +2029,8 @@ const result = await transcribe({
audio: new Uint8Array([1, 2, 3, 4]),
providerOptions: {
openai: {
//timestampGranularities: ['word'],
timestampGranularities: ['segment'],
timestampGranularities: ['word'],
//timestampGranularities: ['segment'],
},
},
});
@@ -2063,13 +2063,25 @@ The following provider options are available:
- **include** _string[]_
Additional information to include in the transcription response.

- **chunkingStrategy** _'auto' | { type: 'server_vad', prefixPaddingMs?: number, silenceDurationMs?: number, threshold?: number }_
Controls how the audio is cut into chunks.
Required for `gpt-4o-transcribe-diarize` if the audio is longer than 30 seconds.

- **'auto'**: Automatically sets chunking parameters based on the audio. The server first normalizes loudness and then uses voice activity detection (VAD) to choose boundaries.
- **object**: Manual chunking configuration using server-side VAD (see the sketch after this list).
  - **type** _'server_vad'_: Must be set to `'server_vad'`.
  - **prefixPaddingMs** _number_: Amount of audio to include before the VAD-detected speech, in milliseconds. Defaults to `300`.
  - **silenceDurationMs** _number_: Duration of silence used to detect the end of speech, in milliseconds. Defaults to `200`.
  - **threshold** _number_: Sensitivity threshold (`0.0` to `1.0`) for voice activity detection. Defaults to `0.5`.
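
For example, a manual chunking configuration can be passed through `providerOptions`. This is a sketch; the audio bytes and parameter values are placeholders, not recommendations:

```ts
import { openai } from '@ai-sdk/openai';
import { experimental_transcribe as transcribe } from 'ai';

const result = await transcribe({
  model: openai.transcription('gpt-4o-transcribe-diarize'),
  audio: new Uint8Array([1, 2, 3, 4]), // placeholder audio bytes
  providerOptions: {
    openai: {
      // manual server-side VAD chunking; all values are illustrative
      chunkingStrategy: {
        type: 'server_vad',
        prefixPaddingMs: 300, // audio included before detected speech
        silenceDurationMs: 200, // silence that ends a chunk
        threshold: 0.5, // VAD sensitivity, 0.0 to 1.0
      },
    },
  },
});
```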

### Model Capabilities

| Model | Transcription | Duration | Segments | Language |
| ------------------------ | ------------------- | ------------------- | ------------------- | ------------------- |
| `whisper-1` | <Check size={18} /> | <Check size={18} /> | <Check size={18} /> | <Check size={18} /> |
| `gpt-4o-mini-transcribe` | <Check size={18} /> | <Cross size={18} /> | <Cross size={18} /> | <Cross size={18} /> |
| `gpt-4o-transcribe` | <Check size={18} /> | <Cross size={18} /> | <Cross size={18} /> | <Cross size={18} /> |
| Model | Transcription | Duration | Segments | Language |
| --------------------------- | ------------------- | ------------------- | ------------------- | ------------------- |
| `whisper-1` | <Check size={18} /> | <Check size={18} /> | <Check size={18} /> | <Check size={18} /> |
| `gpt-4o-mini-transcribe` | <Check size={18} /> | <Cross size={18} /> | <Cross size={18} /> | <Cross size={18} /> |
| `gpt-4o-transcribe` | <Check size={18} /> | <Cross size={18} /> | <Cross size={18} /> | <Cross size={18} /> |
| `gpt-4o-transcribe-diarize` | <Check size={18} /> | <Check size={18} /> | <Check size={18} /> | <Cross size={18} /> |

## Speech Models

25 changes: 25 additions & 0 deletions examples/ai-functions/src/transcribe/openai-diarize.ts
@@ -0,0 +1,25 @@
import { openai } from '@ai-sdk/openai';
import { experimental_transcribe as transcribe } from 'ai';
import { readFile } from 'fs/promises';
import { run } from '../lib/run';

run(async () => {
const result = await transcribe({
model: openai.transcription('gpt-4o-transcribe-diarize'),
audio: await readFile('data/galileo.mp3'),
providerOptions: {
openai: {
chunkingStrategy: 'auto',
},
},
});

console.log('Text:', result.text);
console.log('Duration:', result.durationInSeconds);
console.log('Language:', result.language);
console.log('Segments:', result.segments);
console.log('Warnings:', result.warnings);
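// Speaker labels from diarization appear in the raw response segments below,
// not in result.segments.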
// The full response body
// console.log('Responses:', result.responses[0].body.segments);
console.log('Responses:', result.responses);
});
4 changes: 2 additions & 2 deletions examples/ai-functions/src/transcribe/openai-verbose.ts
@@ -9,8 +9,8 @@ run(async () => {
audio: await readFile('data/galileo.mp3'),
providerOptions: {
openai: {
//timestampGranularities: ['word'],
timestampGranularities: ['segment'],
timestampGranularities: ['word'],
//timestampGranularities: ['segment'],
},
},
});
17 changes: 10 additions & 7 deletions packages/openai/src/transcription/openai-transcription-api.ts
@@ -19,16 +19,19 @@ export const openaiTranscriptionResponseSchema = lazySchema(() =>
segments: z
.array(
z.object({
id: z.number(),
seek: z.number(),
id: z.union([z.number(), z.string()]),
seek: z.number().nullish(),
start: z.number(),
end: z.number(),
text: z.string(),
tokens: z.array(z.number()),
temperature: z.number(),
avg_logprob: z.number(),
compression_ratio: z.number(),
no_speech_prob: z.number(),
tokens: z.array(z.number()).nullish(),
temperature: z.number().nullish(),
avg_logprob: z.number().nullish(),
compression_ratio: z.number().nullish(),
no_speech_prob: z.number().nullish(),
// additional properties for diarized_json response format:
type: z.string().nullish(),
speaker: z.string().nullish(),
}),
)
.nullish(),
152 changes: 152 additions & 0 deletions packages/openai/src/transcription/openai-transcription-model.test.ts
@@ -504,4 +504,156 @@ describe('doGenerate', () => {
expect(result.language).toBeUndefined();
expect(result.durationInSeconds).toBeUndefined();
});

it('should set response_format to "diarized_json" when model is "gpt-4o-transcribe-diarize"', async () => {
prepareJsonResponse();

const model = provider.transcription('gpt-4o-transcribe-diarize');
await model.doGenerate({
audio: audioData,
mediaType: 'audio/wav',
providerOptions: {
openai: {
chunkingStrategy: 'auto',
},
},
});

expect(await server.calls[0].requestBodyMultipart).toMatchInlineSnapshot(`
{
"chunking_strategy": "auto",
"file": File {
Symbol(kHandle): Blob {},
Symbol(kLength): 40169,
Symbol(kType): "audio/wav",
},
"model": "gpt-4o-transcribe-diarize",
"response_format": "diarized_json",
"temperature": "0",
"timestamp_granularities[]": "segment",
}
`);
});

it('should pass chunking_strategy object when specified', async () => {
prepareJsonResponse();

await model.doGenerate({
audio: audioData,
mediaType: 'audio/wav',
providerOptions: {
openai: {
chunkingStrategy: {
type: 'server_vad',
prefixPaddingMs: 500,
silenceDurationMs: 300,
threshold: 0.6,
},
},
},
});

expect(await server.calls[0].requestBodyMultipart).toMatchInlineSnapshot(`
{
"chunking_strategy": "{"type":"server_vad","prefix_padding_ms":500,"silence_duration_ms":300,"threshold":0.6}",
"file": File {
Symbol(kHandle): Blob {},
Symbol(kLength): 40169,
Symbol(kType): "audio/wav",
},
"model": "whisper-1",
"response_format": "verbose_json",
"temperature": "0",
"timestamp_granularities[]": "segment",
}
`);
});

it('should parse segments from diarized_json response', async () => {
server.urls['https://api.openai.com/v1/audio/transcriptions'].response = {
type: 'json-value',
body: {
task: 'transcribe',
duration: 10.0,
text: 'Speaker A: Hello. Speaker B: Hi there.',
segments: [
{
type: 'transcript.text.segment',
id: 'seg_001',
start: 0.0,
end: 5.0,
text: 'Hello.',
speaker: 'A',
},
{
type: 'transcript.text.segment',
id: 'seg_002',
start: 5.0,
end: 10.0,
text: 'Hi there.',
speaker: 'B',
},
],
usage: {
type: 'duration',
seconds: 10,
},
},
};

const model = provider.transcription('gpt-4o-transcribe-diarize');
const result = await model.doGenerate({
audio: audioData,
mediaType: 'audio/wav',
providerOptions: {
openai: {
chunkingStrategy: 'auto',
},
},
});

expect(result.segments).toMatchInlineSnapshot(`
[
{
"endSecond": 5,
"startSecond": 0,
"text": "Hello.",
},
{
"endSecond": 10,
"startSecond": 5,
"text": "Hi there.",
},
]
`);

// Also verify that we can access the raw response to see speaker info if needed
expect(result.response.body).toEqual({
task: 'transcribe',
duration: 10.0,
text: 'Speaker A: Hello. Speaker B: Hi there.',
segments: [
{
type: 'transcript.text.segment',
id: 'seg_001',
start: 0.0,
end: 5.0,
text: 'Hello.',
speaker: 'A',
},
{
type: 'transcript.text.segment',
id: 'seg_002',
start: 5.0,
end: 10.0,
text: 'Hi there.',
speaker: 'B',
},
],
usage: {
type: 'duration',
seconds: 10,
},
});
});
});
27 changes: 21 additions & 6 deletions packages/openai/src/transcription/openai-transcription-model.ts
@@ -145,14 +145,27 @@ export class OpenAITranscriptionModel implements TranscriptionModelV3 {
prompt: openAIOptions.prompt,
// https://platform.openai.com/docs/api-reference/audio/createTranscription#audio_createtranscription-response_format
// prefer verbose_json to get segments for models that support it
response_format: [
'gpt-4o-transcribe',
'gpt-4o-mini-transcribe',
].includes(this.modelId)
? 'json'
: 'verbose_json',
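// gpt-4o-transcribe-diarize uses diarized_json, which includes speaker-labeled segments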
response_format:
this.modelId === 'gpt-4o-transcribe-diarize'
? 'diarized_json'
: ['gpt-4o-transcribe', 'gpt-4o-mini-transcribe'].includes(
this.modelId,
)
? 'json'
: 'verbose_json',
temperature: openAIOptions.temperature,
timestamp_granularities: openAIOptions.timestampGranularities,
chunking_strategy:
typeof openAIOptions.chunkingStrategy === 'object'
? {
type: openAIOptions.chunkingStrategy.type,
prefix_padding_ms:
openAIOptions.chunkingStrategy.prefixPaddingMs,
silence_duration_ms:
openAIOptions.chunkingStrategy.silenceDurationMs,
threshold: openAIOptions.chunkingStrategy.threshold,
}
: openAIOptions.chunkingStrategy,
};

for (const [key, value] of Object.entries(transcriptionModelOptions)) {
@@ -161,6 +174,8 @@ export class OpenAITranscriptionModel implements TranscriptionModelV3 {
for (const item of value) {
formData.append(`${key}[]`, String(item));
}
} else if (typeof value === 'object') {
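// object values (e.g. a manual chunking strategy) are sent as JSON strings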
formData.append(key, JSON.stringify(value));
} else {
formData.append(key, String(value));
}
17 changes: 17 additions & 0 deletions packages/openai/src/transcription/openai-transcription-options.ts
@@ -5,6 +5,7 @@ export type OpenAITranscriptionModelId =
| 'whisper-1'
| 'gpt-4o-mini-transcribe'
| 'gpt-4o-transcribe'
| 'gpt-4o-transcribe-diarize'
| (string & {});

// https://platform.openai.com/docs/api-reference/audio/createTranscription
@@ -41,6 +42,22 @@ export const openAITranscriptionProviderOptions = lazySchema(() =>
.array(z.enum(['word', 'segment']))
.default(['segment'])
.optional(),

/**
* The chunking strategy to use for the transcription.
* Required for gpt-4o-transcribe-diarize if the audio is longer than 30 seconds.
*/
chunkingStrategy: z
.union([
z.literal('auto'),
z.object({
type: z.literal('server_vad'),
prefixPaddingMs: z.number().int().optional(),
silenceDurationMs: z.number().int().optional(),
threshold: z.number().min(0).max(1).optional(),
}),
])
.optional(),
}),
),
);