diff --git a/docs/core_docs/docs/integrations/chat/openai.ipynb b/docs/core_docs/docs/integrations/chat/openai.ipynb index 921b21f35330..f44f301dfff0 100644 --- a/docs/core_docs/docs/integrations/chat/openai.ipynb +++ b/docs/core_docs/docs/integrations/chat/openai.ipynb @@ -1028,116 +1028,6 @@ "console.log(\"USAGE:\", resWitCaching.response_metadata.usage);" ] }, - { - "cell_type": "markdown", - "id": "cc8b3c94", - "metadata": {}, - "source": [ - "## Audio output\n", - "\n", - "Some OpenAI models (such as `gpt-4o-audio-preview`) support generating audio output. This example shows how to use that feature:" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "id": "b4d579b7", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "{\n", - " id: 'audio_67117718c6008190a3afad3e3054b9b6',\n", - " data: 'UklGRqYwBgBXQVZFZm10IBAAAAABAAEAwF0AAIC7AAACABAATElTVBoAAABJTkZPSVNGVA4AAABMYXZmNTguMjkuMTAwAGRhdGFg',\n", - " expires_at: 1729201448,\n", - " transcript: 'Sure! Why did the cat sit on the computer? Because it wanted to keep an eye on the mouse!'\n", - "}\n" - ] - } - ], - "source": [ - "import { ChatOpenAI } from \"@langchain/openai\";\n", - "\n", - "const modelWithAudioOutput = new ChatOpenAI({\n", - " model: \"gpt-4o-audio-preview\",\n", - " // You may also pass these fields to `.bind` as a call argument.\n", - " modalities: [\"text\", \"audio\"], // Specifies that the model should output audio.\n", - " audio: {\n", - " voice: \"alloy\",\n", - " format: \"wav\",\n", - " },\n", - "});\n", - "\n", - "const audioOutputResult = await modelWithAudioOutput.invoke(\"Tell me a joke about cats.\");\n", - "const castMessageContent = audioOutputResult.content[0] as Record;\n", - "\n", - "console.log({\n", - " ...castMessageContent,\n", - " data: castMessageContent.data.slice(0, 100) // Sliced for brevity\n", - "})" - ] - }, - { - "cell_type": "markdown", - "id": "bfea3608", - "metadata": {}, - "source": [ - "We see that the audio data is returned inside the `data` field. We are also provided an `expires_at` date field. This field represents the date the audio response will no longer be accessible on the server for use in multi-turn conversations.\n", - "\n", - "### Streaming Audio Output\n", - "\n", - "OpenAI also supports streaming audio output. Here's an example:" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "id": "0fa68183", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "{\n", - " id: 'audio_671177fd836c819099b0110f5180a581audio_671177fd836c819099b0110f5180a581',\n", - " transcript: 'Why was the cat sitting on the computer? 
Because it wanted to keep an eye on the mouse!',\n", - " index: 0,\n", - " data: 'CQAMAAMADwAHAAsADAAKAA8ADQAPAAoABQANAAUAEAAIAA0ABwAHAAoAAAAFAAMABwAJAAQABwAAAAgAAgAFAAMAAwACAAAAAwAB',\n", - " expires_at: 1729201678\n", - "}\n" - ] - } - ], - "source": [ - "import { AIMessageChunk } from \"@langchain/core/messages\";\n", - "import { concat } from \"@langchain/core/utils/stream\"\n", - "import { ChatOpenAI } from \"@langchain/openai\";\n", - "\n", - "const modelWithStreamingAudioOutput = new ChatOpenAI({\n", - " model: \"gpt-4o-audio-preview\",\n", - " modalities: [\"text\", \"audio\"],\n", - " audio: {\n", - " voice: \"alloy\",\n", - " format: \"pcm16\", // Format must be `pcm16` for streaming\n", - " },\n", - "});\n", - "\n", - "const audioOutputStream = await modelWithStreamingAudioOutput.stream(\"Tell me a joke about cats.\");\n", - "let finalAudioOutputMsg: AIMessageChunk | undefined;\n", - "for await (const chunk of audioOutputStream) {\n", - " finalAudioOutputMsg = finalAudioOutputMsg ? concat(finalAudioOutputMsg, chunk) : chunk;\n", - "}\n", - "const castStreamedMessageContent = finalAudioOutputMsg?.content[1] as Record;\n", - "\n", - "console.log({\n", - " ...castStreamedMessageContent,\n", - " data: castStreamedMessageContent.data.slice(0, 100) // Sliced for brevity\n", - "})" - ] - }, { "cell_type": "markdown", "id": "3a5bb5ca-c3ae-4a58-be67-2cd18574b9a3", diff --git a/libs/langchain-openai/package.json b/libs/langchain-openai/package.json index f21c1d1d9c81..e842d776b63c 100644 --- a/libs/langchain-openai/package.json +++ b/libs/langchain-openai/package.json @@ -36,7 +36,7 @@ "license": "MIT", "dependencies": { "js-tiktoken": "^1.0.12", - "openai": "^4.68.0", + "openai": "^4.67.2", "zod": "^3.22.4", "zod-to-json-schema": "^3.22.3" }, diff --git a/libs/langchain-openai/src/chat_models.ts b/libs/langchain-openai/src/chat_models.ts index 914c7135b0ba..728d08eabed0 100644 --- a/libs/langchain-openai/src/chat_models.ts +++ b/libs/langchain-openai/src/chat_models.ts @@ -16,7 +16,6 @@ import { isAIMessage, convertToChunk, UsageMetadata, - MessageContent, } from "@langchain/core/messages"; import { type ChatGeneration, @@ -175,10 +174,8 @@ function openAIResponseToChatMessage( system_fingerprint: rawResponse.system_fingerprint, }; } - const content = message.audio ? [message.audio] : message.content; - return new AIMessage({ - content: content || "", + content: message.content || "", tool_calls: toolCalls, invalid_tool_calls: invalidToolCalls, additional_kwargs, @@ -199,17 +196,7 @@ function _convertDeltaToMessageChunk( includeRawResponse?: boolean ) { const role = delta.role ?? defaultRole; - let content: MessageContent; - if (delta.audio) { - content = [ - { - ...delta.audio, - index: rawResponse.choices[0].index, - }, - ]; - } else { - content = delta.content ?? ""; - } + const content = delta.content ?? ""; let additional_kwargs: Record; if (delta.function_call) { additional_kwargs = { @@ -385,26 +372,6 @@ export interface ChatOpenAICallOptions * @version 0.2.6 */ strict?: boolean; - /** - * Output types that you would like the model to generate for this request. Most - * models are capable of generating text, which is the default: - * - * `["text"]` - * - * The `gpt-4o-audio-preview` model can also be used to - * [generate audio](https://platform.openai.com/docs/guides/audio). To request that - * this model generate both text and audio responses, you can use: - * - * `["text", "audio"]` - */ - modalities?: Array; - - /** - * Parameters for audio output. 
Required when audio output is requested with
-   * `modalities: ["audio"]`.
-   * [Learn more](https://platform.openai.com/docs/guides/audio).
-   */
-  audio?: OpenAIClient.Chat.ChatCompletionAudioParam;
 }
 
 export interface ChatOpenAIFields
@@ -875,43 +842,6 @@ export interface ChatOpenAIFields
  * </details>
  *
  * <br />
- *
- * <details>
- * <summary><strong>Audio Outputs</strong></summary>
- *
- * ```typescript
- * import { ChatOpenAI } from "@langchain/openai";
- *
- * const modelWithAudioOutput = new ChatOpenAI({
- *   model: "gpt-4o-audio-preview",
- *   // You may also pass these fields to `.bind` as a call argument.
- *   modalities: ["text", "audio"], // Specifies that the model should output audio.
- *   audio: {
- *     voice: "alloy",
- *     format: "wav",
- *   },
- * });
- *
- * const audioOutputResult = await modelWithAudioOutput.invoke("Tell me a joke about cats.");
- * const castMessageContent = audioOutputResult.content[0] as Record<string, any>;
- *
- * console.log({
- *   ...castMessageContent,
- *   data: castMessageContent.data.slice(0, 100) // Sliced for brevity
- * })
- * ```
- *
- * ```txt
- * {
- *   id: 'audio_67117718c6008190a3afad3e3054b9b6',
- *   data: 'UklGRqYwBgBXQVZFZm10IBAAAAABAAEAwF0AAIC7AAACABAATElTVBoAAABJTkZPSVNGVA4AAABMYXZmNTguMjkuMTAwAGRhdGFg',
- *   expires_at: 1729201448,
- *   transcript: 'Sure! Why did the cat sit on the computer? Because it wanted to keep an eye on the mouse!'
- * }
- * ```
- * </details>
- *
- * <br />
*/ export class ChatOpenAI< CallOptions extends ChatOpenAICallOptions = ChatOpenAICallOptions @@ -1028,10 +958,6 @@ export class ChatOpenAI< */ supportsStrictToolCalling?: boolean; - audio?: OpenAIClient.Chat.ChatCompletionAudioParam; - - modalities?: Array; - constructor( fields?: ChatOpenAIFields, /** @deprecated */ @@ -1100,8 +1026,6 @@ export class ChatOpenAI< this.stopSequences = this?.stop; this.user = fields?.user; this.__includeRawResponse = fields?.__includeRawResponse; - this.audio = fields?.audio; - this.modalities = fields?.modalities; if (this.azureOpenAIApiKey || this.azureADTokenProvider) { if ( @@ -1266,12 +1190,6 @@ export class ChatOpenAI< seed: options?.seed, ...streamOptionsConfig, parallel_tool_calls: options?.parallel_tool_calls, - ...(this.audio || options?.audio - ? { audio: this.audio || options?.audio } - : {}), - ...(this.modalities || options?.modalities - ? { modalities: this.modalities || options?.modalities } - : {}), ...this.modelKwargs, }; return params; @@ -1323,7 +1241,7 @@ export class ChatOpenAI< const streamIterable = await this.completionWithRetry(params, options); let usage: OpenAIClient.Completions.CompletionUsage | undefined; for await (const data of streamIterable) { - const choice = data?.choices?.[0]; + const choice = data?.choices[0]; if (data.usage) { usage = data.usage; } @@ -1346,6 +1264,12 @@ export class ChatOpenAI< prompt: options.promptIndex ?? 0, completion: choice.index ?? 0, }; + if (typeof chunk.content !== "string") { + console.log( + "[WARNING]: Received non-string content from OpenAI. This is currently not supported." + ); + continue; + } // eslint-disable-next-line @typescript-eslint/no-explicit-any const generationInfo: Record = { ...newTokenIndices }; if (choice.finish_reason != null) { @@ -1359,7 +1283,7 @@ export class ChatOpenAI< } const generationChunk = new ChatGenerationChunk({ message: chunk, - text: typeof chunk.content === "string" ? chunk.content : "", + text: chunk.content, generationInfo, }); yield generationChunk; @@ -1566,8 +1490,9 @@ export class ChatOpenAI< const generations: ChatGeneration[] = []; for (const part of data?.choices ?? []) { + const text = part.message?.content ?? ""; const generation: ChatGeneration = { - text: part.message?.content ?? "", + text, message: openAIResponseToChatMessage( part.message ?? 
{ role: "assistant" }, data, diff --git a/libs/langchain-openai/src/tests/chat_models.int.test.ts b/libs/langchain-openai/src/tests/chat_models.int.test.ts index 71c3bddc0225..7c3510b3543a 100644 --- a/libs/langchain-openai/src/tests/chat_models.int.test.ts +++ b/libs/langchain-openai/src/tests/chat_models.int.test.ts @@ -19,7 +19,6 @@ import { import { CallbackManager } from "@langchain/core/callbacks/manager"; import { NewTokenIndices } from "@langchain/core/callbacks/base"; import { InMemoryCache } from "@langchain/core/caches"; -import { concat } from "@langchain/core/utils/stream"; import { ChatOpenAI } from "../chat_models.js"; // Save the original value of the 'LANGCHAIN_CALLBACKS_BACKGROUND' environment variable @@ -987,78 +986,3 @@ test("Test ChatOpenAI stream method", async () => { } expect(chunks.length).toEqual(1); }); - -describe("Audio output", () => { - test("Audio output", async () => { - const model = new ChatOpenAI({ - model: "gpt-4o-audio-preview", - temperature: 0, - modalities: ["text", "audio"], - audio: { - voice: "alloy", - format: "wav", - }, - }); - - const response = await model.invoke("Make me an audio clip of you yelling"); - expect(Array.isArray(response.content)).toBeTruthy(); - expect(Object.keys(response.content[0]).sort()).toEqual([ - "data", - "expires_at", - "id", - "transcript", - ]); - }); - - test("Audio output can stream", async () => { - const model = new ChatOpenAI({ - model: "gpt-4o-audio-preview", - temperature: 0, - modalities: ["text", "audio"], - audio: { - voice: "alloy", - format: "pcm16", - }, - }); - - const stream = await model.stream("Make me an audio clip of you yelling"); - let finalMsg: AIMessageChunk | undefined; - for await (const chunk of stream) { - finalMsg = finalMsg ? concat(finalMsg, chunk) : chunk; - } - if (!finalMsg) { - throw new Error("No final message found"); - } - console.dir(finalMsg, { depth: null }); - expect(Array.isArray(finalMsg.content)).toBeTruthy(); - expect(Object.keys(finalMsg.content[1]).sort()).toEqual([ - "data", - "expires_at", - "id", - "index", - "transcript", - ]); - }); - - test("Can bind audio output args", async () => { - const model = new ChatOpenAI({ - model: "gpt-4o-audio-preview", - temperature: 0, - }).bind({ - modalities: ["text", "audio"], - audio: { - voice: "alloy", - format: "wav", - }, - }); - - const response = await model.invoke("Make me an audio clip of you yelling"); - expect(Array.isArray(response.content)).toBeTruthy(); - expect(Object.keys(response.content[0]).sort()).toEqual([ - "data", - "expires_at", - "id", - "transcript", - ]); - }); -}); diff --git a/libs/langchain-openai/src/types.ts b/libs/langchain-openai/src/types.ts index 1eb449443791..5f5ca73809a5 100644 --- a/libs/langchain-openai/src/types.ts +++ b/libs/langchain-openai/src/types.ts @@ -167,27 +167,6 @@ export interface OpenAIChatInput extends OpenAIBaseInput { * If `undefined` the `strict` argument will not be passed to OpenAI. */ supportsStrictToolCalling?: boolean; - - /** - * Output types that you would like the model to generate for this request. Most - * models are capable of generating text, which is the default: - * - * `["text"]` - * - * The `gpt-4o-audio-preview` model can also be used to - * [generate audio](https://platform.openai.com/docs/guides/audio). To request that - * this model generate both text and audio responses, you can use: - * - * `["text", "audio"]` - */ - modalities?: Array; - - /** - * Parameters for audio output. Required when audio output is requested with - * `modalities: ["audio"]`. 
- * [Learn more](https://platform.openai.com/docs/guides/audio). - */ - audio?: OpenAIClient.Chat.ChatCompletionAudioParam; } export declare interface AzureOpenAIInput { diff --git a/yarn.lock b/yarn.lock index 8c9766673e28..a349c341e8b6 100644 --- a/yarn.lock +++ b/yarn.lock @@ -12530,7 +12530,7 @@ __metadata: jest: ^29.5.0 jest-environment-node: ^29.6.4 js-tiktoken: ^1.0.12 - openai: ^4.68.0 + openai: ^4.67.2 prettier: ^2.8.3 release-it: ^17.6.0 rimraf: ^5.0.1 @@ -35510,9 +35510,9 @@ __metadata: languageName: node linkType: hard -"openai@npm:^4.68.0": - version: 4.68.0 - resolution: "openai@npm:4.68.0" +"openai@npm:^4.67.2": + version: 4.67.2 + resolution: "openai@npm:4.67.2" dependencies: "@types/node": ^18.11.18 "@types/node-fetch": ^2.6.4 @@ -35528,7 +35528,7 @@ __metadata: optional: true bin: openai: bin/cli - checksum: 2866e54ac1b34e074055dde7cc809bcc33d1172f0ab289dacd54ced04a62ab3c2b9f584fdb84ece981edc5c30939497af4e91fe33646f71d5c6ced5d7106a797 + checksum: 8c83e2632f2c51fea0f9b059026239a46ad171feaedd1456019481136defd468e828b0b091c53da3ebb65da37c4bb76455142c64ea9bc664124c1a341f7f2b78 languageName: node linkType: hard