From 465bbfb3012022842377196816343911ff9e8404 Mon Sep 17 00:00:00 2001 From: Brace Sproul Date: Wed, 21 Aug 2024 11:30:04 -0700 Subject: [PATCH] core[minor]: Add LangSmith doc loader (#6568) * core[minor]: Add LangSmith doc loader * tests and nits * add test and docs * rename --- .../web_loaders/langsmith.ipynb | 302 ++++++++++++++++++ langchain-core/.gitignore | 4 + langchain-core/langchain.config.js | 1 + langchain-core/package.json | 13 + .../src/document_loaders/langsmith.ts | 182 +++++++++++ .../tests/langsmith.int.test.ts | 67 ++++ 6 files changed, 569 insertions(+) create mode 100644 docs/core_docs/docs/integrations/document_loaders/web_loaders/langsmith.ipynb create mode 100644 langchain-core/src/document_loaders/langsmith.ts create mode 100644 langchain-core/src/document_loaders/tests/langsmith.int.test.ts diff --git a/docs/core_docs/docs/integrations/document_loaders/web_loaders/langsmith.ipynb b/docs/core_docs/docs/integrations/document_loaders/web_loaders/langsmith.ipynb new file mode 100644 index 000000000000..b7e536113685 --- /dev/null +++ b/docs/core_docs/docs/integrations/document_loaders/web_loaders/langsmith.ipynb @@ -0,0 +1,302 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "---\n", + "sidebar_label: LangSmith\n", + "\n", + "---" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# LangSmithLoader\n", + "\n", + "This notebook provides a quick overview for getting started with the [LangSmithLoader](/docs/integrations/document_loaders/). 
For detailed documentation of all `LangSmithLoader` features and configurations head to the [API reference](https://api.js.langchain.com/classes/_langchain_core.document_loaders_langsmith.LangSmithLoader.html).\n", + "\n", + "## Overview\n", + "### Integration details\n", + "\n", + "| Class | Package | Local | Serializable | [PY support](https://python.langchain.com/docs/integrations/document_loaders/langsmith)|\n", + "| :--- | :--- | :---: | :---: | :---: |\n", + "| [LangSmithLoader](https://api.js.langchain.com/classes/_langchain_core.document_loaders_langsmith.LangSmithLoader.html) | [@langchain/core](https://api.js.langchain.com/classes/_langchain_core.html) | ✅ | beta | ✅ | \n", + "### Loader features\n", + "| Source | Web Loader | Node Envs Only |\n", + "| :---: | :---: | :---: | \n", + "| LangSmithLoader | ✅ | ❌ | \n", + "\n", + "The `LangSmithLoader` loads the examples stored in a [LangSmith](https://langsmith.com/) dataset and returns them as LangChain documents.\n", + "\n", + "For each example, the example's inputs (or the value selected from them with `contentKey`) are formatted into the document's `pageContent`, and the example itself is stored as the document's `metadata`.\n", + "\n", + "This guide shows how to create a LangSmith dataset and load its examples using the `LangSmithLoader` in LangChain.\n", + "\n", + "## Setup\n", + "\n", + "To access the LangSmith document loader you'll need to install `@langchain/core`, create a [LangSmith](https://langsmith.com/) account and get an API key.\n", + "\n", + "### Credentials\n", + "\n", + "Sign up at https://langsmith.com and generate an API key. 
Once you've done this set the `LANGSMITH_API_KEY` environment variable:\n", + "\n", + "```bash\n", + "export LANGSMITH_API_KEY=\"your-api-key\"\n", + "```\n", + "\n", + "### Installation\n", + "\n", + "The `LangSmithLoader` integration lives in the `@langchain/core` package:\n", + "\n", + "```{=mdx}\n", + "import IntegrationInstallTooltip from \"@mdx_components/integration_install_tooltip.mdx\";\n", + "import Npm2Yarn from \"@theme/Npm2Yarn\";\n", + "\n", + "\n", + "\n", + "\n", + " @langchain/core\n", + "\n", + "\n", + "```" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Create example dataset\n", + "\n", + "For this example, we'll create a new dataset which we'll use in our document loader." + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "import { Client as LangSmithClient } from 'langsmith';\n", + "import { faker } from \"@faker-js/faker\";\n", + "\n", + "const lsClient = new LangSmithClient();\n", + "\n", + "const datasetName = \"LangSmith Few Shot Datasets Notebook\";\n", + "\n", + "const exampleInputs = Array.from({ length: 10 }, (_, i) => ({\n", + " input: faker.lorem.paragraph(),\n", + "}));\n", + "const exampleOutputs = Array.from({ length: 10 }, (_, i) => ({\n", + " output: faker.lorem.sentence(),\n", + "}));\n", + "const exampleMetadata = Array.from({ length: 10 }, (_, i) => ({\n", + " companyCatchPhrase: faker.company.catchPhrase(),\n", + "}));\n", + "\n", + "await lsClient.deleteDataset({\n", + " datasetName,\n", + "})\n", + "\n", + "const dataset = await lsClient.createDataset(datasetName);\n", + "\n", + "const examples = await lsClient.createExamples({\n", + " inputs: exampleInputs,\n", + " outputs: exampleOutputs,\n", + " metadata: exampleMetadata,\n", + " datasetId: dataset.id,\n", + "});" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [], + "source": [ + "import { LangSmithLoader } from 
\"@langchain/core/document_loaders/langsmith\"\n", + "\n", + "const loader = new LangSmithLoader({\n", + " datasetName: \"LangSmith Few Shot Datasets Notebook\",\n", + " // Instead of a datasetName, you can alternatively provide a datasetId\n", + " // datasetId: dataset.id,\n", + " contentKey: \"input\",\n", + " limit: 5,\n", + " // formatContent: (content) => content,\n", + " // ... other options\n", + "})" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Load" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{\n", + " pageContent: 'Conventus supellex aegrotatio termes. Vapulus abscido ubi vita coadunatio modi crapula comparo caecus. Acervus voluptate tergeo pariatur conor argumentum inventore vomito stella.',\n", + " metadata: {\n", + " id: 'f1a04800-6f7a-4232-9743-fb5d9029bf1f',\n", + " created_at: '2024-08-20T17:01:38.984045+00:00',\n", + " modified_at: '2024-08-20T17:01:38.984045+00:00',\n", + " name: '#f1a0 @ LangSmith Few Shot Datasets Notebook',\n", + " dataset_id: '9ccd66e6-e506-478c-9095-3d9e27575a89',\n", + " source_run_id: null,\n", + " metadata: {\n", + " dataset_split: [Array],\n", + " companyCatchPhrase: 'Integrated solution-oriented secured line'\n", + " },\n", + " inputs: {\n", + " input: 'Conventus supellex aegrotatio termes. Vapulus abscido ubi vita coadunatio modi crapula comparo caecus. 
Acervus voluptate tergeo pariatur conor argumentum inventore vomito stella.'\n", + " },\n", + " outputs: {\n", + " output: 'Excepturi adeptio spectaculum bis volaticus accusamus.'\n", + " }\n", + " }\n", + "}\n" + ] + } + ], + "source": [ + "const docs = await loader.load()\n", + "docs[0]" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{\n", + " id: 'f1a04800-6f7a-4232-9743-fb5d9029bf1f',\n", + " created_at: '2024-08-20T17:01:38.984045+00:00',\n", + " modified_at: '2024-08-20T17:01:38.984045+00:00',\n", + " name: '#f1a0 @ LangSmith Few Shot Datasets Notebook',\n", + " dataset_id: '9ccd66e6-e506-478c-9095-3d9e27575a89',\n", + " source_run_id: null,\n", + " metadata: {\n", + " dataset_split: [ 'base' ],\n", + " companyCatchPhrase: 'Integrated solution-oriented secured line'\n", + " },\n", + " inputs: {\n", + " input: 'Conventus supellex aegrotatio termes. Vapulus abscido ubi vita coadunatio modi crapula comparo caecus. Acervus voluptate tergeo pariatur conor argumentum inventore vomito stella.'\n", + " },\n", + " outputs: { output: 'Excepturi adeptio spectaculum bis volaticus accusamus.' }\n", + "}\n" + ] + } + ], + "source": [ + "console.log(docs[0].metadata)" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{\n", + " input: 'Conventus supellex aegrotatio termes. Vapulus abscido ubi vita coadunatio modi crapula comparo caecus. Acervus voluptate tergeo pariatur conor argumentum inventore vomito stella.'\n", + "}\n" + ] + } + ], + "source": [ + "console.log(docs[0].metadata.inputs)" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{ output: 'Excepturi adeptio spectaculum bis volaticus accusamus.' 
}\n" + ] + } + ], + "source": [ + "console.log(docs[0].metadata.outputs)" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[\n", + " 'id',\n", + " 'created_at',\n", + " 'modified_at',\n", + " 'name',\n", + " 'dataset_id',\n", + " 'source_run_id',\n", + " 'metadata',\n", + " 'inputs',\n", + " 'outputs'\n", + "]\n" + ] + } + ], + "source": [ + "console.log(Object.keys(docs[0].metadata))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## API reference\n", + "\n", + "For detailed documentation of all `LangSmithLoader` features and configurations head to the [API reference](https://api.js.langchain.com/classes/_langchain_core.document_loaders_langsmith.LangSmithLoader.html)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "TypeScript", + "language": "typescript", + "name": "tslab" + }, + "language_info": { + "codemirror_mode": { + "mode": "typescript", + "name": "javascript", + "typescript": true + }, + "file_extension": ".ts", + "mimetype": "text/typescript", + "name": "typescript", + "version": "3.7.2" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/langchain-core/.gitignore b/langchain-core/.gitignore index 4791f5d14dab..efc6a4dc88ba 100644 --- a/langchain-core/.gitignore +++ b/langchain-core/.gitignore @@ -38,6 +38,10 @@ document_loaders/base.cjs document_loaders/base.js document_loaders/base.d.ts document_loaders/base.d.cts +document_loaders/langsmith.cjs +document_loaders/langsmith.js +document_loaders/langsmith.d.ts +document_loaders/langsmith.d.cts embeddings.cjs embeddings.js embeddings.d.ts diff --git a/langchain-core/langchain.config.js b/langchain-core/langchain.config.js index 620f4d3139db..b7fd982357ee 100644 --- a/langchain-core/langchain.config.js +++ b/langchain-core/langchain.config.js @@ -22,6 +22,7 @@ export const config = { chat_history: "chat_history", documents: 
"documents/index", "document_loaders/base": "document_loaders/base", + "document_loaders/langsmith": "document_loaders/langsmith", embeddings: "embeddings", example_selectors: "example_selectors/index", indexing: "indexing/index", diff --git a/langchain-core/package.json b/langchain-core/package.json index ed901e978c36..2768e05daa01 100644 --- a/langchain-core/package.json +++ b/langchain-core/package.json @@ -186,6 +186,15 @@ "import": "./document_loaders/base.js", "require": "./document_loaders/base.cjs" }, + "./document_loaders/langsmith": { + "types": { + "import": "./document_loaders/langsmith.d.ts", + "require": "./document_loaders/langsmith.d.cts", + "default": "./document_loaders/langsmith.d.ts" + }, + "import": "./document_loaders/langsmith.js", + "require": "./document_loaders/langsmith.cjs" + }, "./embeddings": { "types": { "import": "./embeddings.d.ts", @@ -653,6 +662,10 @@ "document_loaders/base.js", "document_loaders/base.d.ts", "document_loaders/base.d.cts", + "document_loaders/langsmith.cjs", + "document_loaders/langsmith.js", + "document_loaders/langsmith.d.ts", + "document_loaders/langsmith.d.cts", "embeddings.cjs", "embeddings.js", "embeddings.d.ts", diff --git a/langchain-core/src/document_loaders/langsmith.ts b/langchain-core/src/document_loaders/langsmith.ts new file mode 100644 index 000000000000..d41d0a2740f3 --- /dev/null +++ b/langchain-core/src/document_loaders/langsmith.ts @@ -0,0 +1,182 @@ +import { KVMap } from "langsmith/schemas"; +import { Client } from "langsmith"; +import { Document, DocumentInterface } from "../documents/document.js"; +import { AsyncCallerParams } from "../utils/async_caller.js"; +import { BaseDocumentLoader } from "./base.js"; + +// TODO: Replace with import from `langsmith` once exposed. 
+/**
+ * Options forwarded to the LangSmith `Client` constructor when the loader
+ * has to create its own client (see `LangSmithLoaderFields.clientConfig`).
+ */
+interface ClientConfig {
+  apiUrl?: string;
+  apiKey?: string;
+  callerOptions?: AsyncCallerParams;
+  timeout_ms?: number;
+  webUrl?: string;
+  anonymizer?: (values: KVMap) => KVMap;
+  hideInputs?: boolean | ((inputs: KVMap) => KVMap);
+  hideOutputs?: boolean | ((outputs: KVMap) => KVMap);
+  autoBatchTracing?: boolean;
+  pendingAutoBatchedRunLimit?: number;
+  fetchOptions?: RequestInit;
+}
+
+/**
+ * Constructor options for `LangSmithLoader`.
+ *
+ * `datasetId` through `filter` are stored on the loader and passed unchanged
+ * to `client.listExamples` when `load()` runs.
+ */
+export interface LangSmithLoaderFields {
+  datasetId?: string;
+  datasetName?: string;
+  exampleIds?: Array<string>;
+  asOf?: Date | string;
+  splits?: string[];
+  inlineS3Urls?: boolean;
+  offset?: number;
+  limit?: number;
+  metadata?: KVMap;
+  filter?: string;
+  /**
+   * Dot-separated path of keys into each example's `inputs`; the value found
+   * there becomes the page content. When omitted, the whole `inputs` object
+   * is used.
+   */
+  contentKey?: string;
+  /**
+   * Converts the selected content into the `pageContent` string. By default
+   * strings are kept as-is and everything else is serialized as JSON.
+   */
+  // eslint-disable-next-line @typescript-eslint/no-explicit-any
+  formatContent?: (content: any) => string;
+  /** Existing client to use. Cannot be combined with `clientConfig`. */
+  client?: Client;
+  /** Config for a newly created client. Cannot be combined with `client`. */
+  clientConfig?: ClientConfig;
+}
+
+/**
+ * Document loader integration with LangSmith.
+ *
+ * ## [Constructor args](https://api.js.langchain.com/interfaces/_langchain_core.document_loaders_langsmith.LangSmithLoaderFields.html)
+ *
+ *
+ * Load
+ *
+ * ```typescript
+ * import { LangSmithLoader } from '@langchain/core/document_loaders/langsmith';
+ * import { Client } from 'langsmith';
+ *
+ * const langSmithClient = new Client({
+ *   apiKey: process.env.LANGSMITH_API_KEY,
+ * })
+ *
+ * const loader = new LangSmithLoader({
+ *   datasetId: "9a3b36f7-b308-40a5-9b46-6613853b6330",
+ *   limit: 1,
+ *   client: langSmithClient,
+ * });
+ *
+ * const docs = await loader.load();
+ * ```
+ *
+ * ```txt
+ * [
+ *   {
+ *     pageContent: '{\n  "input_key_str": "string",\n  "input_key_bool": true\n}',
+ *     metadata: {
+ *       id: '8523d9e9-c123-4b23-9b46-21021nds289e',
+ *       created_at: '2024-08-19T17:09:14.806441+00:00',
+ *       modified_at: '2024-08-19T17:09:14.806441+00:00',
+ *       name: '#8517 @ brace-test-dataset',
+ *       dataset_id: '9a3b36f7-b308-40a5-9b46-6613853b6330',
+ *       source_run_id: null,
+ *       metadata: [Object],
+ *       inputs: [Object],
+ *       outputs: [Object]
+ *     }
+ *   }
+ * ]
+ * ```
+ *
+ */
+export class LangSmithLoader extends BaseDocumentLoader {
+  // `datasetId` through `filter` are passed unchanged to
+  // `client.listExamples` by `load()`.
+  datasetId?: string;
+
+  datasetName?: string;
+
+  exampleIds?: Array<string>;
+
+  asOf?: Date | string;
+
+  splits?: string[];
+
+  inlineS3Urls?: boolean;
+
+  offset?: number;
+
+  limit?: number;
+
+  metadata?: KVMap;
+
+  filter?: string;
+
+  /** Keys to follow into each example's `inputs`; empty selects all of it. */
+  contentKey: string[];
+
+  /** Turns the selected content into the document's `pageContent` string. */
+  // eslint-disable-next-line @typescript-eslint/no-explicit-any
+  formatContent: (content: any) => string;
+
+  /** LangSmith client used to list the examples. */
+  client: Client;
+
+  /**
+   * @param fields Loader options. `client` and `clientConfig` are mutually
+   *   exclusive; when no `client` is given, a new `Client` is constructed
+   *   from `clientConfig` (which may be undefined).
+   * @throws {Error} If both `client` and `clientConfig` are provided.
+   */
+  constructor(fields: LangSmithLoaderFields) {
+    super();
+
+    if (fields.client && fields.clientConfig) {
+      throw new Error("client and clientConfig cannot both be provided.");
+    }
+    this.client = fields.client ?? new Client(fields.clientConfig);
+    // "a.b.c" selects `inputs.a.b.c`; no key selects the whole `inputs`.
+    this.contentKey = fields.contentKey ? fields.contentKey.split(".") : [];
+    this.formatContent = fields.formatContent ?? _stringify;
+    this.datasetId = fields.datasetId;
+    this.datasetName = fields.datasetName;
+    this.exampleIds = fields.exampleIds;
+    this.asOf = fields.asOf;
+    this.splits = fields.splits;
+    this.inlineS3Urls = fields.inlineS3Urls;
+    this.offset = fields.offset;
+    this.limit = fields.limit;
+    this.metadata = fields.metadata;
+    this.filter = fields.filter;
+  }
+
+  /**
+   * Lists the examples matching the configured filters and converts each one
+   * into a document: the content selected from `example.inputs` is formatted
+   * into `pageContent`, and a copy of the example becomes `metadata`.
+   *
+   * @returns One document per example yielded by `client.listExamples`.
+   * @throws {TypeError} If `contentKey` cannot be followed because a value on
+   *   the path is `null` or `undefined`.
+   */
+  async load(): Promise<Document[]> {
+    const documents: DocumentInterface[] = [];
+    for await (const example of this.client.listExamples({
+      datasetId: this.datasetId,
+      datasetName: this.datasetName,
+      exampleIds: this.exampleIds,
+      asOf: this.asOf,
+      splits: this.splits,
+      inlineS3Urls: this.inlineS3Urls,
+      offset: this.offset,
+      limit: this.limit,
+      metadata: this.metadata,
+      filter: this.filter,
+    })) {
+      let content = example.inputs;
+      for (const key of this.contentKey) {
+        if (content === undefined || content === null) {
+          // Same error type the bare property access would raise, but with a
+          // message that names the offending `contentKey`.
+          throw new TypeError(
+            `Cannot resolve contentKey "${this.contentKey.join(".")}": ` +
+              `no value to read "${key}" from.`
+          );
+        }
+        content = content[key];
+      }
+      const contentStr = this.formatContent(content);
+
+      // Copy so the example object returned by the client is not mutated.
+      const metadata: KVMap = { ...example };
+      for (const k of ["created_at", "modified_at"]) {
+        const value = metadata[k];
+        // Dates are of type `object`, we want to convert them to strings.
+        // `typeof null` is also "object" and null has no `toString`; skip it.
+        if (typeof value === "object" && value !== null) {
+          metadata[k] = value.toString();
+        }
+      }
+
+      documents.push({
+        pageContent: contentStr,
+        metadata,
+      });
+    }
+    return documents;
+  }
+}
+
+/**
+ * Default `formatContent`: returns strings unchanged and serializes anything
+ * else as JSON with 2-space indentation.
+ *
+ * Falls back to `String(x)` when `JSON.stringify` throws or yields
+ * `undefined` (as it does for `undefined` input), so the result is always a
+ * string.
+ */
+function _stringify(x: string | KVMap): string {
+  if (typeof x === "string") {
+    return x;
+  }
+  try {
+    return JSON.stringify(x, null, 2) ?? String(x);
+  } catch {
+    // Deliberate best-effort fallback for values JSON cannot serialize.
+    return String(x);
+  }
+}
diff --git a/langchain-core/src/document_loaders/tests/langsmith.int.test.ts b/langchain-core/src/document_loaders/tests/langsmith.int.test.ts
new file mode 100644
index 000000000000..a546bbf56847
--- /dev/null
+++ b/langchain-core/src/document_loaders/tests/langsmith.int.test.ts
@@ -0,0 +1,67 @@
+/* eslint-disable no-process-env */
+import { test, expect } from "@jest/globals";
+import { Client } from "langsmith";
+import { LangSmithLoader } from "../langsmith.js";
+
+const DATASET_NAME = "brace-test-dataset";
+const DATASET_ID = "9a3b36f7-a297-40a5-944d-6613853b6330";
+
+test("LangSmithLoader can load with client passed in", async () => {
+  const lsClient = new Client();
+  const loader = new LangSmithLoader({
+    datasetId: DATASET_ID,
+    client: lsClient,
+  });
+  const docs = await loader.load();
+
+  expect(docs.length).toBeGreaterThanOrEqual(1);
+  const parsedContent = JSON.parse(docs[0].pageContent);
+  expect(parsedContent).toHaveProperty("input_key_str");
+  expect(parsedContent.input_key_str).toBe("string");
+  expect(parsedContent).toHaveProperty("input_key_bool");
+  expect(parsedContent.input_key_bool).toBe(true);
+
+  expect(docs[0].metadata).toHaveProperty("created_at");
+  expect(typeof docs[0].metadata.created_at).toBe("string");
+  expect(docs[0].metadata).toHaveProperty("modified_at");
+  expect(typeof docs[0].metadata.modified_at).toBe("string");
+});
+
+test("LangSmithLoader can load with client options passed in", async () => {
+  const lsApiKey = process.env.LANGCHAIN_API_KEY;
+  // unassign the API key to confirm the client isn't overriding what we passed in.
+  process.env.LANGCHAIN_API_KEY = "";
+  try {
+    const loader = new LangSmithLoader({
+      datasetId: DATASET_ID,
+      clientConfig: { apiKey: lsApiKey },
+    });
+    const docs = await loader.load();
+
+    expect(docs.length).toBeGreaterThanOrEqual(1);
+  } finally {
+    // Assigning undefined to process.env would store the string "undefined".
+    if (lsApiKey === undefined) {
+      delete process.env.LANGCHAIN_API_KEY;
+    } else {
+      process.env.LANGCHAIN_API_KEY = lsApiKey;
+    }
+  }
+});
+
+test("LangSmithLoader can load with dataset name", async () => {
+  const loader = new LangSmithLoader({ datasetName: DATASET_NAME });
+  const docs = await loader.load();
+
+  expect(docs.length).toBeGreaterThanOrEqual(1);
+});
+
+test("Passing content key correctly loads that value", async () => {
+  const loader = new LangSmithLoader({
+    datasetName: DATASET_NAME,
+    contentKey: "input_key_str",
+  });
+  const docs = await loader.load();
+
+  expect(docs[0].pageContent).toBe("string");
+});