
Commit d096b25

AssemblyAI he-james authored and committed

Project import generated by Copybara.

GitOrigin-RevId: a297f52dce11955795bb43417787b880f6ad67f5

1 parent c8756b7 · commit d096b25

File tree: 5 files changed, +233 −4 lines

README.md

Lines changed: 3 additions & 3 deletions
````diff
@@ -101,7 +101,7 @@ let transcript = await client.transcripts.transcribe({
 });
 ```
 
-> [!TIP]
+> [!NOTE]
 > You can also pass a local file path, a stream, or a buffer as the `audio` property.
 
 `transcribe` queues a transcription job and polls it until the `status` is `completed` or `error`.
````
````diff
@@ -128,7 +128,7 @@ let transcript = await client.transcripts.transcribe({
 });
 ```
 
-> [!TIP]
+> **Note:**
 > You can also pass a file URL, a stream, or a buffer as the `audio` property.
 
 `transcribe` queues a transcription job and polls it until the `status` is `completed` or `error`.
````
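The two notes retagged above describe input flexibility that is easy to miss in a diff: `audio` accepts a local file path, a stream, or a buffer in addition to a URL. A minimal sketch of what the notes describe (the file path is a placeholder, and this assumes an async Node 18+ context):

```ts
import fs from "fs";
import { AssemblyAI } from "assemblyai";

const client = new AssemblyAI({ apiKey: "YOUR_API_KEY" });

async function run() {
  // Local file path: the SDK uploads the file before transcribing
  const fromPath = await client.transcripts.transcribe({
    audio: "./audio/meeting.mp3", // placeholder path
    speaker_labels: true,
  });

  // Readable stream
  const fromStream = await client.transcripts.transcribe({
    audio: fs.createReadStream("./audio/meeting.mp3"),
  });

  // Buffer
  const fromBuffer = await client.transcripts.transcribe({
    audio: fs.readFileSync("./audio/meeting.mp3"),
  });

  console.log(fromPath.id, fromStream.id, fromBuffer.id);
}

run().catch(console.error);
```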
````diff
@@ -224,7 +224,7 @@ do {
 } while (previousPageUrl !== null);
 ```
 
-> [!TIP]
+> [!NOTE]
 > To paginate over all pages, you need to use the `page.page_details.prev_url`
 > because the transcripts are returned in descending order by creation date and time.
 > The first page contains the most recent transcripts, and each "previous" page contains older transcripts.
````
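The pagination note above is clearer in code. A minimal sketch of the loop the README snippet is describing, assuming `client.transcripts.list()` accepts a page URL (as the surrounding README code implies):

```ts
import { AssemblyAI } from "assemblyai";

const client = new AssemblyAI({ apiKey: "YOUR_API_KEY" });

async function listAllTranscripts() {
  // The first page holds the most recent transcripts, so following
  // prev_url walks backwards through progressively older transcripts.
  let previousPageUrl: string | null = null;
  do {
    const page = await client.transcripts.list(previousPageUrl ?? undefined);
    for (const transcript of page.transcripts) {
      console.log(transcript.id);
    }
    previousPageUrl = page.page_details.prev_url;
  } while (previousPageUrl !== null);
}

listAllTranscripts().catch(console.error);
```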

package.json

Lines changed: 1 addition & 1 deletion
```diff
@@ -1,6 +1,6 @@
 {
   "name": "assemblyai",
-  "version": "4.13.2",
+  "version": "4.13.3",
   "description": "The AssemblyAI JavaScript SDK provides an easy-to-use interface for interacting with the AssemblyAI API, which supports async and real-time transcription, as well as the latest LeMUR models.",
   "engines": {
     "node": ">=18"
```

samples/speaker-diarization.ts

Lines changed: 80 additions & 0 deletions
```ts
/**
 * Example of using speaker diarization with speaker_options
 *
 * Note: speaker_options and speakers_expected are mutually exclusive.
 * Use either speakers_expected for simple guidance OR speaker_options for advanced control.
 */

import { AssemblyAI, SpeakerOptions } from "assemblyai"

// Replace with your API key
const client = new AssemblyAI({
  apiKey: "YOUR_API_KEY",
})

async function transcribeWithSpeakerDiarization() {
  // Example 1: Basic speaker diarization (uses smart defaults)
  // The model automatically detects the optimal number of speakers
  let transcript = await client.transcripts.transcribe({
    audio: "https://example.com/audio.mp3",
    speaker_labels: true,
  })

  console.log("Basic speaker diarization:", transcript.id)

  // Example 2: Provide a hint with speakers_expected (smart default with guidance)
  // Still uses smart defaults but gives the model a hint about expected speakers
  transcript = await client.transcripts.transcribe({
    audio: "https://example.com/audio.mp3",
    speaker_labels: true,
    speakers_expected: 3,
  })

  console.log("With expected speakers:", transcript.id)

  // Example 3: Set boundaries with speaker_options (controlled smart defaults)
  // Constrains the smart defaults to work within specified bounds
  const speakerOptions: SpeakerOptions = {
    min_speakers_expected: 2, // At least 2 speakers (overrides smart default if < 2)
    max_speakers_expected: 4, // At most 4 speakers (overrides smart default if > 4)
  }

  transcript = await client.transcripts.transcribe({
    audio: "https://example.com/audio.mp3",
    speaker_labels: true,
    speaker_options: speakerOptions,
  })

  console.log("With speaker options:", transcript.id)

  // Note: The following would be INVALID since speakers_expected and speaker_options are mutually exclusive:
  // transcript = await client.transcripts.transcribe({
  //   audio: "https://example.com/audio.mp3",
  //   speaker_labels: true,
  //   speakers_expected: 3, // ❌ Cannot use both
  //   speaker_options: { min_speakers_expected: 2 }, // ❌ Cannot use both
  // });

  // Example 4: Edge case handling for challenging audio
  // Use speaker_options when you need precise control over speaker detection
  transcript = await client.transcripts.transcribe({
    audio: "https://example.com/audio.mp3",
    speaker_labels: true,
    speaker_options: {
      min_speakers_expected: 1, // Handle solo speakers or presentations
      max_speakers_expected: 10, // Handle large meetings or conferences
    },
  })

  console.log("Edge case handling:", transcript.id)

  // Access the utterances with speaker labels
  if (transcript.status === "completed" && transcript.utterances) {
    for (const utterance of transcript.utterances) {
      console.log(`Speaker ${utterance.speaker}: ${utterance.text}`)
    }
  }
}

// Run the example
transcribeWithSpeakerDiarization().catch(console.error)
```

src/types/openapi.generated.ts

Lines changed: 22 additions & 0 deletions
```diff
@@ -1413,6 +1413,20 @@ export type SeverityScoreSummary = {
   medium: number;
 };
 
+/**
+ * Advanced options for controlling speaker diarization parameters
+ */
+export type SpeakerOptions = {
+  /**
+   * Minimum number of speakers expected in the audio
+   */
+  min_speakers_expected?: number | null;
+  /**
+   * Maximum number of speakers expected in the audio
+   */
+  max_speakers_expected?: number | null;
+};
+
 /**
  * The speech model to use for the transcription.
  */
@@ -2517,6 +2531,10 @@ export type Transcript = {
    * Tell the speaker label model how many speakers it should attempt to identify, up to 10. See {@link https://www.assemblyai.com/docs/models/speaker-diarization | Speaker diarization } for more details.
    */
   speakers_expected?: number | null;
+  /**
+   * Advanced options for controlling speaker diarization parameters
+   */
+  speaker_options?: SpeakerOptions | null;
   /**
    * The speech model used for the transcription. When `null`, the default model is used.
    * @defaultValue "null
@@ -3039,6 +3057,10 @@ export type TranscriptOptionalParams = {
    * @defaultValue "null
    */
   speakers_expected?: number | null;
+  /**
+   * Advanced options for controlling speaker diarization parameters
+   */
+  speaker_options?: SpeakerOptions | null;
   /**
    * The speech model to use for the transcription. When `null`, the "best" model is used.
    * @defaultValue best
```
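Because `speaker_options` is optional and nullable on both `Transcript` and `TranscriptOptionalParams`, existing request objects keep compiling unchanged. A type-only sketch of the new field in a request payload, assuming `TranscriptOptionalParams` is re-exported from the package root like the other generated types:

```ts
import { SpeakerOptions, TranscriptOptionalParams } from "assemblyai";

// Illustrative bounds: constrain diarization to between 2 and 6 speakers
const options: SpeakerOptions = {
  min_speakers_expected: 2,
  max_speakers_expected: 6,
};

const params: TranscriptOptionalParams = {
  speaker_labels: true,
  speaker_options: options, // omit (or pass null) to keep the smart defaults
};
```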

tests/unit/speaker-options.test.ts

Lines changed: 127 additions & 0 deletions
```ts
import fetchMock from "jest-fetch-mock";
import { SpeakerOptions } from "../../src";
import { createClient, requestMatches } from "./utils";

fetchMock.enableMocks();

const assembly = createClient();
const transcriptId = "transcript_123";
const remoteAudioURL = "https://assembly.ai/espn.m4a";

beforeEach(() => {
  jest.clearAllMocks();
  fetchMock.resetMocks();
  fetchMock.doMock();
});

describe("speaker options", () => {
  it("should create transcript with speaker_options", async () => {
    const speakerOptions: SpeakerOptions = {
      min_speakers_expected: 2,
      max_speakers_expected: 4,
    };

    fetchMock.doMockOnceIf(
      requestMatches({ url: "/v2/transcript", method: "POST" }),
      JSON.stringify({ id: transcriptId, status: "queued" }),
    );

    const transcript = await assembly.transcripts.submit({
      audio_url: remoteAudioURL,
      speaker_labels: true,
      speaker_options: speakerOptions,
    });

    expect(transcript.id).toBe(transcriptId);
    expect(transcript.status).toBe("queued");

    // Verify the request body included speaker_options
    const requestBody = JSON.parse(fetchMock.mock.calls[0][1]?.body as string);
    expect(requestBody.speaker_labels).toBe(true);
    expect(requestBody.speaker_options).toEqual(speakerOptions);
  });

  it("should create transcript with only min_speakers_expected", async () => {
    const speakerOptions: SpeakerOptions = {
      min_speakers_expected: 3,
    };

    fetchMock.doMockOnceIf(
      requestMatches({ url: "/v2/transcript", method: "POST" }),
      JSON.stringify({ id: transcriptId, status: "queued" }),
    );

    const transcript = await assembly.transcripts.submit({
      audio_url: remoteAudioURL,
      speaker_labels: true,
      speaker_options: speakerOptions,
    });

    expect(transcript.id).toBe(transcriptId);

    const requestBody = JSON.parse(fetchMock.mock.calls[0][1]?.body as string);
    expect(requestBody.speaker_options.min_speakers_expected).toBe(3);
    expect(requestBody.speaker_options.max_speakers_expected).toBeUndefined();
  });

  it("should create transcript with only max_speakers_expected", async () => {
    const speakerOptions: SpeakerOptions = {
      max_speakers_expected: 5,
    };

    fetchMock.doMockOnceIf(
      requestMatches({ url: "/v2/transcript", method: "POST" }),
      JSON.stringify({ id: transcriptId, status: "queued" }),
    );

    const transcript = await assembly.transcripts.submit({
      audio_url: remoteAudioURL,
      speaker_labels: true,
      speaker_options: speakerOptions,
    });

    expect(transcript.id).toBe(transcriptId);

    const requestBody = JSON.parse(fetchMock.mock.calls[0][1]?.body as string);
    expect(requestBody.speaker_options.min_speakers_expected).toBeUndefined();
    expect(requestBody.speaker_options.max_speakers_expected).toBe(5);
  });

  it("should create transcript with speakers_expected (without speaker_options)", async () => {
    fetchMock.doMockOnceIf(
      requestMatches({ url: "/v2/transcript", method: "POST" }),
      JSON.stringify({ id: transcriptId, status: "queued" }),
    );

    const transcript = await assembly.transcripts.submit({
      audio_url: remoteAudioURL,
      speaker_labels: true,
      speakers_expected: 3,
    });

    expect(transcript.id).toBe(transcriptId);

    const requestBody = JSON.parse(fetchMock.mock.calls[0][1]?.body as string);
    expect(requestBody.speaker_labels).toBe(true);
    expect(requestBody.speakers_expected).toBe(3);
    expect(requestBody.speaker_options).toBeUndefined();
  });

  it("should handle null speaker_options", async () => {
    fetchMock.doMockOnceIf(
      requestMatches({ url: "/v2/transcript", method: "POST" }),
      JSON.stringify({ id: transcriptId, status: "queued" }),
    );

    const transcript = await assembly.transcripts.submit({
      audio_url: remoteAudioURL,
      speaker_labels: true,
      speaker_options: null,
    });

    expect(transcript.id).toBe(transcriptId);

    const requestBody = JSON.parse(fetchMock.mock.calls[0][1]?.body as string);
    expect(requestBody.speaker_options).toBe(null);
  });
});
```
