
Commit d096b25

AssemblyAI he-james authored and committed

Project import generated by Copybara.

GitOrigin-RevId: a297f52dce11955795bb43417787b880f6ad67f5

1 parent c8756b7 · commit d096b25

File tree: 5 files changed, +233 −4 lines

README.md

Lines changed: 3 additions & 3 deletions
````diff
@@ -101,7 +101,7 @@ let transcript = await client.transcripts.transcribe({
 });
 ```
 
-> [!TIP]
+> [!NOTE]
 > You can also pass a local file path, a stream, or a buffer as the `audio` property.
 
 `transcribe` queues a transcription job and polls it until the `status` is `completed` or `error`.
````
````diff
@@ -128,7 +128,7 @@ let transcript = await client.transcripts.transcribe({
 });
 ```
 
-> [!TIP]
+> **Note:**
 > You can also pass a file URL, a stream, or a buffer as the `audio` property.
 
 `transcribe` queues a transcription job and polls it until the `status` is `completed` or `error`.
````
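The two notes retagged above describe input flexibility that is easy to miss in a diff: `audio` accepts a local file path, a stream, or a buffer in addition to a URL. A minimal sketch of what the notes describe (the file path is a placeholder, and this assumes an async Node 18+ context):

```ts
import fs from "fs";
import { AssemblyAI } from "assemblyai";

const client = new AssemblyAI({ apiKey: "YOUR_API_KEY" });

async function run() {
  // Local file path: the SDK uploads the file before transcribing
  const fromPath = await client.transcripts.transcribe({
    audio: "./audio/meeting.mp3", // placeholder path
    speaker_labels: true,
  });

  // Readable stream
  const fromStream = await client.transcripts.transcribe({
    audio: fs.createReadStream("./audio/meeting.mp3"),
  });

  // Buffer
  const fromBuffer = await client.transcripts.transcribe({
    audio: fs.readFileSync("./audio/meeting.mp3"),
  });

  console.log(fromPath.id, fromStream.id, fromBuffer.id);
}

run().catch(console.error);
```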
````diff
@@ -224,7 +224,7 @@ do {
 } while (previousPageUrl !== null);
 ```
 
-> [!TIP]
+> [!NOTE]
 > To paginate over all pages, you need to use the `page.page_details.prev_url`
 > because the transcripts are returned in descending order by creation date and time.
 > The first page contains the most recent transcripts, and each "previous" page contains older transcripts.
````
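The pagination note above is clearer in code. A minimal sketch of the loop the README snippet is describing, assuming `client.transcripts.list()` accepts a page URL (as the surrounding README code implies):

```ts
import { AssemblyAI } from "assemblyai";

const client = new AssemblyAI({ apiKey: "YOUR_API_KEY" });

async function listAllTranscripts() {
  // The first page holds the most recent transcripts, so following
  // prev_url walks backwards through progressively older transcripts.
  let previousPageUrl: string | null = null;
  do {
    const page = await client.transcripts.list(previousPageUrl ?? undefined);
    for (const transcript of page.transcripts) {
      console.log(transcript.id);
    }
    previousPageUrl = page.page_details.prev_url;
  } while (previousPageUrl !== null);
}

listAllTranscripts().catch(console.error);
```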

package.json

Lines changed: 1 addition & 1 deletion
```diff
@@ -1,6 +1,6 @@
 {
   "name": "assemblyai",
-  "version": "4.13.2",
+  "version": "4.13.3",
   "description": "The AssemblyAI JavaScript SDK provides an easy-to-use interface for interacting with the AssemblyAI API, which supports async and real-time transcription, as well as the latest LeMUR models.",
   "engines": {
     "node": ">=18"
```

samples/speaker-diarization.ts

Lines changed: 80 additions & 0 deletions
```ts
/**
 * Example of using speaker diarization with speaker_options
 *
 * Note: speaker_options and speakers_expected are mutually exclusive.
 * Use either speakers_expected for simple guidance OR speaker_options for advanced control.
 */

import { AssemblyAI, SpeakerOptions } from "assemblyai"

// Replace with your API key
const client = new AssemblyAI({
  apiKey: "YOUR_API_KEY",
})

async function transcribeWithSpeakerDiarization() {
  // Example 1: Basic speaker diarization (uses smart defaults)
  // The model automatically detects the optimal number of speakers
  let transcript = await client.transcripts.transcribe({
    audio: "https://example.com/audio.mp3",
    speaker_labels: true,
  })

  console.log("Basic speaker diarization:", transcript.id)

  // Example 2: Provide a hint with speakers_expected (smart default with guidance)
  // Still uses smart defaults but gives the model a hint about expected speakers
  transcript = await client.transcripts.transcribe({
    audio: "https://example.com/audio.mp3",
    speaker_labels: true,
    speakers_expected: 3,
  })

  console.log("With expected speakers:", transcript.id)

  // Example 3: Set boundaries with speaker_options (controlled smart defaults)
  // Constrains the smart defaults to work within specified bounds
  const speakerOptions: SpeakerOptions = {
    min_speakers_expected: 2, // At least 2 speakers (overrides smart default if < 2)
    max_speakers_expected: 4, // At most 4 speakers (overrides smart default if > 4)
  }

  transcript = await client.transcripts.transcribe({
    audio: "https://example.com/audio.mp3",
    speaker_labels: true,
    speaker_options: speakerOptions,
  })

  console.log("With speaker options:", transcript.id)

  // Note: The following would be INVALID since speakers_expected and speaker_options are mutually exclusive:
  // transcript = await client.transcripts.transcribe({
  //   audio: "https://example.com/audio.mp3",
  //   speaker_labels: true,
  //   speakers_expected: 3, // ❌ Cannot use both
  //   speaker_options: { min_speakers_expected: 2 }, // ❌ Cannot use both
  // });

  // Example 4: Edge case handling for challenging audio
  // Use speaker_options when you need precise control over speaker detection
  transcript = await client.transcripts.transcribe({
    audio: "https://example.com/audio.mp3",
    speaker_labels: true,
    speaker_options: {
      min_speakers_expected: 1, // Handle solo speakers or presentations
      max_speakers_expected: 10, // Handle large meetings or conferences
    },
  })

  console.log("Edge case handling:", transcript.id)

  // Access the utterances with speaker labels
  if (transcript.status === "completed" && transcript.utterances) {
    for (const utterance of transcript.utterances) {
      console.log(`Speaker ${utterance.speaker}: ${utterance.text}`)
    }
  }
}

// Run the example
transcribeWithSpeakerDiarization().catch(console.error)
```

src/types/openapi.generated.ts

Lines changed: 22 additions & 0 deletions
```diff
@@ -1413,6 +1413,20 @@ export type SeverityScoreSummary = {
   medium: number;
 };
 
+/**
+ * Advanced options for controlling speaker diarization parameters
+ */
+export type SpeakerOptions = {
+  /**
+   * Minimum number of speakers expected in the audio
+   */
+  min_speakers_expected?: number | null;
+  /**
+   * Maximum number of speakers expected in the audio
+   */
+  max_speakers_expected?: number | null;
+};
+
 /**
  * The speech model to use for the transcription.
  */
@@ -2517,6 +2531,10 @@ export type Transcript = {
    * Tell the speaker label model how many speakers it should attempt to identify, up to 10. See {@link https://www.assemblyai.com/docs/models/speaker-diarization | Speaker diarization } for more details.
    */
   speakers_expected?: number | null;
+  /**
+   * Advanced options for controlling speaker diarization parameters
+   */
+  speaker_options?: SpeakerOptions | null;
   /**
    * The speech model used for the transcription. When `null`, the default model is used.
    * @defaultValue "null
@@ -3039,6 +3057,10 @@ export type TranscriptOptionalParams = {
    * @defaultValue "null
    */
   speakers_expected?: number | null;
+  /**
+   * Advanced options for controlling speaker diarization parameters
+   */
+  speaker_options?: SpeakerOptions | null;
   /**
    * The speech model to use for the transcription. When `null`, the "best" model is used.
    * @defaultValue best
```
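Because `speaker_options` is optional and nullable on both `Transcript` and `TranscriptOptionalParams`, existing request objects keep compiling unchanged. A type-only sketch of the new field in a request payload, assuming `TranscriptOptionalParams` is re-exported from the package root like the other generated types:

```ts
import { SpeakerOptions, TranscriptOptionalParams } from "assemblyai";

// Illustrative bounds: constrain diarization to between 2 and 6 speakers
const options: SpeakerOptions = {
  min_speakers_expected: 2,
  max_speakers_expected: 6,
};

const params: TranscriptOptionalParams = {
  speaker_labels: true,
  speaker_options: options, // omit (or pass null) to keep the smart defaults
};
```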

tests/unit/speaker-options.test.ts

Lines changed: 127 additions & 0 deletions
```ts
import fetchMock from "jest-fetch-mock";
import { SpeakerOptions } from "../../src";
import { createClient, requestMatches } from "./utils";

fetchMock.enableMocks();

const assembly = createClient();
const transcriptId = "transcript_123";
const remoteAudioURL = "https://assembly.ai/espn.m4a";

beforeEach(() => {
  jest.clearAllMocks();
  fetchMock.resetMocks();
  fetchMock.doMock();
});

describe("speaker options", () => {
  it("should create transcript with speaker_options", async () => {
    const speakerOptions: SpeakerOptions = {
      min_speakers_expected: 2,
      max_speakers_expected: 4,
    };

    fetchMock.doMockOnceIf(
      requestMatches({ url: "/v2/transcript", method: "POST" }),
      JSON.stringify({ id: transcriptId, status: "queued" }),
    );

    const transcript = await assembly.transcripts.submit({
      audio_url: remoteAudioURL,
      speaker_labels: true,
      speaker_options: speakerOptions,
    });

    expect(transcript.id).toBe(transcriptId);
    expect(transcript.status).toBe("queued");

    // Verify the request body included speaker_options
    const requestBody = JSON.parse(fetchMock.mock.calls[0][1]?.body as string);
    expect(requestBody.speaker_labels).toBe(true);
    expect(requestBody.speaker_options).toEqual(speakerOptions);
  });

  it("should create transcript with only min_speakers_expected", async () => {
    const speakerOptions: SpeakerOptions = {
      min_speakers_expected: 3,
    };

    fetchMock.doMockOnceIf(
      requestMatches({ url: "/v2/transcript", method: "POST" }),
      JSON.stringify({ id: transcriptId, status: "queued" }),
    );

    const transcript = await assembly.transcripts.submit({
      audio_url: remoteAudioURL,
      speaker_labels: true,
      speaker_options: speakerOptions,
    });

    expect(transcript.id).toBe(transcriptId);

    const requestBody = JSON.parse(fetchMock.mock.calls[0][1]?.body as string);
    expect(requestBody.speaker_options.min_speakers_expected).toBe(3);
    expect(requestBody.speaker_options.max_speakers_expected).toBeUndefined();
  });

  it("should create transcript with only max_speakers_expected", async () => {
    const speakerOptions: SpeakerOptions = {
      max_speakers_expected: 5,
    };

    fetchMock.doMockOnceIf(
      requestMatches({ url: "/v2/transcript", method: "POST" }),
      JSON.stringify({ id: transcriptId, status: "queued" }),
    );

    const transcript = await assembly.transcripts.submit({
      audio_url: remoteAudioURL,
      speaker_labels: true,
      speaker_options: speakerOptions,
    });

    expect(transcript.id).toBe(transcriptId);

    const requestBody = JSON.parse(fetchMock.mock.calls[0][1]?.body as string);
    expect(requestBody.speaker_options.min_speakers_expected).toBeUndefined();
    expect(requestBody.speaker_options.max_speakers_expected).toBe(5);
  });

  it("should create transcript with speakers_expected (without speaker_options)", async () => {
    fetchMock.doMockOnceIf(
      requestMatches({ url: "/v2/transcript", method: "POST" }),
      JSON.stringify({ id: transcriptId, status: "queued" }),
    );

    const transcript = await assembly.transcripts.submit({
      audio_url: remoteAudioURL,
      speaker_labels: true,
      speakers_expected: 3,
    });

    expect(transcript.id).toBe(transcriptId);

    const requestBody = JSON.parse(fetchMock.mock.calls[0][1]?.body as string);
    expect(requestBody.speaker_labels).toBe(true);
    expect(requestBody.speakers_expected).toBe(3);
    expect(requestBody.speaker_options).toBeUndefined();
  });

  it("should handle null speaker_options", async () => {
    fetchMock.doMockOnceIf(
      requestMatches({ url: "/v2/transcript", method: "POST" }),
      JSON.stringify({ id: transcriptId, status: "queued" }),
    );

    const transcript = await assembly.transcripts.submit({
      audio_url: remoteAudioURL,
      speaker_labels: true,
      speaker_options: null,
    });

    expect(transcript.id).toBe(transcriptId);

    const requestBody = JSON.parse(fetchMock.mock.calls[0][1]?.body as string);
    expect(requestBody.speaker_options).toBe(null);
  });
});
```
