6 changes: 6 additions & 0 deletions .changeset/sixty-planets-whisper.md
@@ -0,0 +1,6 @@
---
'@example/ai-functions': patch
'@ai-sdk/openai': patch
---

Support for `gpt-4o-transcribe-diarize` for audio longer than 30 seconds, including a default response format of `diarized_json`
26 changes: 19 additions & 7 deletions content/providers/01-ai-sdk-providers/03-openai.mdx
@@ -2029,8 +2029,8 @@ const result = await transcribe({
audio: new Uint8Array([1, 2, 3, 4]),
providerOptions: {
openai: {
//timestampGranularities: ['word'],
timestampGranularities: ['segment'],
timestampGranularities: ['word'],
//timestampGranularities: ['segment'],
},
},
});
@@ -2063,13 +2063,25 @@ The following provider options are available:
- **include** _string[]_
Additional information to include in the transcription response.

- **chunkingStrategy** _'auto' | { type: 'server_vad', prefixPaddingMs?: number, silenceDurationMs?: number, threshold?: number }_
Controls how the audio is cut into chunks.
Required for `gpt-4o-transcribe-diarize` if the audio is longer than 30 seconds.

- **'auto'**: Automatically sets chunking parameters based on the audio. The server first normalizes loudness and then uses voice activity detection (VAD) to choose boundaries.
- **object**: Manual chunking configuration using server-side VAD (see the sketch after this list).
  - **type** _'server_vad'_: Must be set to `'server_vad'`.
  - **prefixPaddingMs** _number_: Amount of audio to include before the VAD-detected speech, in milliseconds. Defaults to `300`.
  - **silenceDurationMs** _number_: Duration of silence used to detect the end of speech, in milliseconds. Defaults to `200`.
  - **threshold** _number_: Sensitivity threshold (`0.0` to `1.0`) for voice activity detection. Defaults to `0.5`.
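
For example, a manual chunking configuration can be passed through `providerOptions`. This is a sketch; the audio bytes and parameter values are placeholders, not recommendations:

```ts
import { openai } from '@ai-sdk/openai';
import { experimental_transcribe as transcribe } from 'ai';

const result = await transcribe({
  model: openai.transcription('gpt-4o-transcribe-diarize'),
  audio: new Uint8Array([1, 2, 3, 4]), // placeholder audio bytes
  providerOptions: {
    openai: {
      // manual server-side VAD chunking; all values are illustrative
      chunkingStrategy: {
        type: 'server_vad',
        prefixPaddingMs: 300, // audio included before detected speech
        silenceDurationMs: 200, // silence that ends a chunk
        threshold: 0.5, // VAD sensitivity, 0.0 to 1.0
      },
    },
  },
});
```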

### Model Capabilities

| Model | Transcription | Duration | Segments | Language |
| ------------------------ | ------------------- | ------------------- | ------------------- | ------------------- |
| `whisper-1` | <Check size={18} /> | <Check size={18} /> | <Check size={18} /> | <Check size={18} /> |
| `gpt-4o-mini-transcribe` | <Check size={18} /> | <Cross size={18} /> | <Cross size={18} /> | <Cross size={18} /> |
| `gpt-4o-transcribe` | <Check size={18} /> | <Cross size={18} /> | <Cross size={18} /> | <Cross size={18} /> |
| Model | Transcription | Duration | Segments | Language |
| --------------------------- | ------------------- | ------------------- | ------------------- | ------------------- |
| `whisper-1` | <Check size={18} /> | <Check size={18} /> | <Check size={18} /> | <Check size={18} /> |
| `gpt-4o-mini-transcribe` | <Check size={18} /> | <Cross size={18} /> | <Cross size={18} /> | <Cross size={18} /> |
| `gpt-4o-transcribe` | <Check size={18} /> | <Cross size={18} /> | <Cross size={18} /> | <Cross size={18} /> |
| `gpt-4o-transcribe-diarize` | <Check size={18} /> | <Check size={18} /> | <Check size={18} /> | <Cross size={18} /> |

## Speech Models

25 changes: 25 additions & 0 deletions examples/ai-functions/src/transcribe/openai-diarize.ts
@@ -0,0 +1,25 @@
import { openai } from '@ai-sdk/openai';
import { experimental_transcribe as transcribe } from 'ai';
import { readFile } from 'fs/promises';
import { run } from '../lib/run';

run(async () => {
const result = await transcribe({
model: openai.transcription('gpt-4o-transcribe-diarize'),
audio: await readFile('data/galileo.mp3'),
providerOptions: {
openai: {
chunkingStrategy: 'auto',
},
},
});

console.log('Text:', result.text);
console.log('Duration:', result.durationInSeconds);
console.log('Language:', result.language);
console.log('Segments:', result.segments);
console.log('Warnings:', result.warnings);
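// Speaker labels from diarization appear in the raw response segments below,
// not in result.segments.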
// The full response body
// console.log('Responses:', result.responses[0].body.segments);
console.log('Responses:', result.responses);
});
4 changes: 2 additions & 2 deletions examples/ai-functions/src/transcribe/openai-verbose.ts
@@ -9,8 +9,8 @@ run(async () => {
audio: await readFile('data/galileo.mp3'),
providerOptions: {
openai: {
//timestampGranularities: ['word'],
timestampGranularities: ['segment'],
timestampGranularities: ['word'],
//timestampGranularities: ['segment'],
},
},
});
17 changes: 10 additions & 7 deletions packages/openai/src/transcription/openai-transcription-api.ts
@@ -19,16 +19,19 @@ export const openaiTranscriptionResponseSchema = lazySchema(() =>
segments: z
.array(
z.object({
id: z.number(),
seek: z.number(),
id: z.union([z.number(), z.string()]),
seek: z.number().nullish(),
start: z.number(),
end: z.number(),
text: z.string(),
tokens: z.array(z.number()),
temperature: z.number(),
avg_logprob: z.number(),
compression_ratio: z.number(),
no_speech_prob: z.number(),
tokens: z.array(z.number()).nullish(),
temperature: z.number().nullish(),
avg_logprob: z.number().nullish(),
compression_ratio: z.number().nullish(),
no_speech_prob: z.number().nullish(),
// additional properties for diarized_json response format:
type: z.string().nullish(),
speaker: z.string().nullish(),
}),
)
.nullish(),
152 changes: 152 additions & 0 deletions packages/openai/src/transcription/openai-transcription-model.test.ts
@@ -504,4 +504,156 @@ describe('doGenerate', () => {
expect(result.language).toBeUndefined();
expect(result.durationInSeconds).toBeUndefined();
});

it('should set response_format to "diarized_json" when model is "gpt-4o-transcribe-diarize"', async () => {
prepareJsonResponse();

const model = provider.transcription('gpt-4o-transcribe-diarize');
await model.doGenerate({
audio: audioData,
mediaType: 'audio/wav',
providerOptions: {
openai: {
chunkingStrategy: 'auto',
},
},
});

expect(await server.calls[0].requestBodyMultipart).toMatchInlineSnapshot(`
{
"chunking_strategy": "auto",
"file": File {
Symbol(kHandle): Blob {},
Symbol(kLength): 40169,
Symbol(kType): "audio/wav",
},
"model": "gpt-4o-transcribe-diarize",
"response_format": "diarized_json",
"temperature": "0",
"timestamp_granularities[]": "segment",
}
`);
});

it('should pass chunking_strategy object when specified', async () => {
prepareJsonResponse();

await model.doGenerate({
audio: audioData,
mediaType: 'audio/wav',
providerOptions: {
openai: {
chunkingStrategy: {
type: 'server_vad',
prefixPaddingMs: 500,
silenceDurationMs: 300,
threshold: 0.6,
},
},
},
});

expect(await server.calls[0].requestBodyMultipart).toMatchInlineSnapshot(`
{
"chunking_strategy": "{"type":"server_vad","prefix_padding_ms":500,"silence_duration_ms":300,"threshold":0.6}",
"file": File {
Symbol(kHandle): Blob {},
Symbol(kLength): 40169,
Symbol(kType): "audio/wav",
},
"model": "whisper-1",
"response_format": "verbose_json",
"temperature": "0",
"timestamp_granularities[]": "segment",
}
`);
});

it('should parse segments from diarized_json response', async () => {
server.urls['https://api.openai.com/v1/audio/transcriptions'].response = {
type: 'json-value',
body: {
task: 'transcribe',
duration: 10.0,
text: 'Speaker A: Hello. Speaker B: Hi there.',
segments: [
{
type: 'transcript.text.segment',
id: 'seg_001',
start: 0.0,
end: 5.0,
text: 'Hello.',
speaker: 'A',
},
{
type: 'transcript.text.segment',
id: 'seg_002',
start: 5.0,
end: 10.0,
text: 'Hi there.',
speaker: 'B',
},
],
usage: {
type: 'duration',
seconds: 10,
},
},
};

const model = provider.transcription('gpt-4o-transcribe-diarize');
const result = await model.doGenerate({
audio: audioData,
mediaType: 'audio/wav',
providerOptions: {
openai: {
chunkingStrategy: 'auto',
},
},
});

expect(result.segments).toMatchInlineSnapshot(`
[
{
"endSecond": 5,
"startSecond": 0,
"text": "Hello.",
},
{
"endSecond": 10,
"startSecond": 5,
"text": "Hi there.",
},
]
`);

// Also verify that we can access the raw response to see speaker info if needed
expect(result.response.body).toEqual({
task: 'transcribe',
duration: 10.0,
text: 'Speaker A: Hello. Speaker B: Hi there.',
segments: [
{
type: 'transcript.text.segment',
id: 'seg_001',
start: 0.0,
end: 5.0,
text: 'Hello.',
speaker: 'A',
},
{
type: 'transcript.text.segment',
id: 'seg_002',
start: 5.0,
end: 10.0,
text: 'Hi there.',
speaker: 'B',
},
],
usage: {
type: 'duration',
seconds: 10,
},
});
});
});
27 changes: 21 additions & 6 deletions packages/openai/src/transcription/openai-transcription-model.ts
@@ -145,14 +145,27 @@ export class OpenAITranscriptionModel implements TranscriptionModelV3 {
prompt: openAIOptions.prompt,
// https://platform.openai.com/docs/api-reference/audio/createTranscription#audio_createtranscription-response_format
// prefer verbose_json to get segments for models that support it
response_format: [
'gpt-4o-transcribe',
'gpt-4o-mini-transcribe',
].includes(this.modelId)
? 'json'
: 'verbose_json',
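// gpt-4o-transcribe-diarize uses diarized_json, which includes speaker-labeled segments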
response_format:
this.modelId === 'gpt-4o-transcribe-diarize'
? 'diarized_json'
: ['gpt-4o-transcribe', 'gpt-4o-mini-transcribe'].includes(
this.modelId,
)
? 'json'
: 'verbose_json',
temperature: openAIOptions.temperature,
timestamp_granularities: openAIOptions.timestampGranularities,
chunking_strategy:
typeof openAIOptions.chunkingStrategy === 'object'
? {
type: openAIOptions.chunkingStrategy.type,
prefix_padding_ms:
openAIOptions.chunkingStrategy.prefixPaddingMs,
silence_duration_ms:
openAIOptions.chunkingStrategy.silenceDurationMs,
threshold: openAIOptions.chunkingStrategy.threshold,
}
: openAIOptions.chunkingStrategy,
};

for (const [key, value] of Object.entries(transcriptionModelOptions)) {
@@ -161,6 +174,8 @@ export class OpenAITranscriptionModel implements TranscriptionModelV3 {
for (const item of value) {
formData.append(`${key}[]`, String(item));
}
} else if (typeof value === 'object') {
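// object values (e.g. a manual chunking strategy) are sent as JSON strings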
formData.append(key, JSON.stringify(value));
} else {
formData.append(key, String(value));
}
17 changes: 17 additions & 0 deletions packages/openai/src/transcription/openai-transcription-options.ts
@@ -5,6 +5,7 @@ export type OpenAITranscriptionModelId =
| 'whisper-1'
| 'gpt-4o-mini-transcribe'
| 'gpt-4o-transcribe'
| 'gpt-4o-transcribe-diarize'
| (string & {});

// https://platform.openai.com/docs/api-reference/audio/createTranscription
@@ -41,6 +42,22 @@ export const openAITranscriptionProviderOptions = lazySchema(() =>
.array(z.enum(['word', 'segment']))
.default(['segment'])
.optional(),

/**
* The chunking strategy to use for the transcription.
* Required for gpt-4o-transcribe-diarize if the audio is longer than 30 seconds.
*/
chunkingStrategy: z
.union([
z.literal('auto'),
z.object({
type: z.literal('server_vad'),
prefixPaddingMs: z.number().int().optional(),
silenceDurationMs: z.number().int().optional(),
threshold: z.number().min(0).max(1).optional(),
}),
])
.optional(),
}),
),
);