langchain-ai · bracesproul · May 15, 2024 · May 14, 2024 · May 14, 2024 · May 14, 2024
diff --git a/libs/langchain-google-gauth/src/tests/chat_models.audio.int.test.ts b/libs/langchain-google-gauth/src/tests/chat_models.audio.int.test.ts
diff --git a/libs/langchain-google-genai/src/tests/chat_models.int.test.ts b/libs/langchain-google-genai/src/tests/chat_models.int.test.ts
@@ -4,6 +4,10 @@ import { fileURLToPath } from "node:url";
 import * as path from "node:path";
 import { HumanMessage } from "@langchain/core/messages";
 import { ChatGoogleGenerativeAI } from "../chat_models.js";
+import {
+  ChatPromptTemplate,
+  MessagesPlaceholder,
+} from "@langchain/core/prompts";
 
 test("Test Google AI", async () => {
   const model = new ChatGoogleGenerativeAI({});
@@ -132,3 +136,48 @@ test("Test Google AI in streaming mode", async () => {
   expect(nrNewTokens > 1).toBe(true);
   expect(res.content).toBe(tokens);
 });
+
+/**
+ * Reads a file from disk and returns its contents as a base64 string.
+ *
+ * @param filePath Path of the file to read.
+ * @returns The file's contents, base64-encoded.
+ */
+async function fileToBase64(filePath: string): Promise<string> {
+  // NOTE(review): `fs` is imported outside this hunk; presumably it is the
+  // promise-based Node fs module, since the call is awaited without a callback.
+  return Buffer.from(await fs.readFile(filePath)).toString("base64");
+}
+
+test.skip("Gemini can understand audio", async () => {
+  // Integration test, skipped by default: it needs an audio fixture on disk.
+  // The fixture is resolved relative to this test file (it sits in the
+  // sibling google-gauth package) instead of through an absolute path that
+  // only exists on one developer's machine. Make sure the file is present,
+  // or point `audioPath` at another audio file, before un-skipping.
+  const currentDir = path.dirname(fileURLToPath(import.meta.url));
+  const audioPath = path.resolve(
+    currentDir,
+    "../../../langchain-google-gauth/src/tests/data/audio.mp3"
+  );
+  const audioMimeType = "audio/mp3";
+
+  const model = new ChatGoogleGenerativeAI({
+    model: "gemini-1.5-pro-latest",
+    temperature: 0,
+  });
+
+  const audioBase64 = await fileToBase64(audioPath);
+
+  // The prompt is just a placeholder; the actual message is supplied under
+  // the "audio" key at invoke time.
+  const prompt = ChatPromptTemplate.fromMessages([
+    new MessagesPlaceholder("audio"),
+  ]);
+
+  const chain = prompt.pipe(model);
+  const response = await chain.invoke({
+    audio: new HumanMessage({
+      content: [
+        {
+          // Base64 audio payload plus its MIME type.
+          type: "media",
+          mimeType: audioMimeType,
+          data: audioBase64,
+        },
+        {
+          type: "text",
+          text: "Summarize the content in this audio. ALso, what is the speaker's tone?",
+        },
+      ],
+    }),
+  });
+
+  console.log(response.content);
+  // Only checks that some non-trivial text came back, not what it says.
+  expect(typeof response.content).toBe("string");
+  expect((response.content as string).length).toBeGreaterThan(15);
+});
diff --git a/libs/langchain-google-genai/src/utils.ts b/libs/langchain-google-genai/src/utils.ts
@@ -48,6 +48,22 @@ export function convertAuthorToRole(author: string) {
   }
 }
 
+/**
+ * Converts a `media` message content block into a `Part` carrying
+ * `inlineData`.
+ *
+ * @param content Content block expected to provide a `mimeType` and the
+ *   payload in `data`, both as strings.
+ * @returns A part whose `inlineData` holds the given MIME type and data.
+ * @throws Error if `mimeType` or `data` is missing or is not a string.
+ */
+function messageContentMedia(
+  // eslint-disable-next-line @typescript-eslint/no-explicit-any
+  content: Record<string, any>
+): Part {
+  const { mimeType, data } = content;
+  // Check the value types, not just key presence, so that entries such as
+  // `{ mimeType: undefined, data: undefined }` are rejected here instead of
+  // producing a malformed part.
+  if (typeof mimeType === "string" && typeof data === "string") {
+    return {
+      inlineData: {
+        mimeType,
+        data,
+      },
+    };
+  }
+
+  throw new Error("Invalid media content");
+}
+
 export function convertMessageContentToParts(
   content: MessageContent,
   isMultimodalModel: boolean
@@ -91,6 +107,8 @@ export function convertMessageContentToParts(
           mimeType,
         },
       };
+    } else if (c.type === "media") {
+      return messageContentMedia(c);
     }
     throw new Error(`Unknown content type ${(c as { type: string }).type}`);
   });