From 5922742d565aadfeef58648067b0874c5c57366d Mon Sep 17 00:00:00 2001 From: Harrison Chase Date: Mon, 12 Jun 2023 10:57:31 -0700 Subject: [PATCH 01/11] comment out --- .../examples/mongodb_atlas_vector_search.ipynb | 9 +++------ docs/modules/models/text_embedding/examples/embaas.ipynb | 6 +++--- 2 files changed, 6 insertions(+), 9 deletions(-) diff --git a/docs/modules/indexes/vectorstores/examples/mongodb_atlas_vector_search.ipynb b/docs/modules/indexes/vectorstores/examples/mongodb_atlas_vector_search.ipynb index b41ec8c8a697e..4ebd0a723da77 100644 --- a/docs/modules/indexes/vectorstores/examples/mongodb_atlas_vector_search.ipynb +++ b/docs/modules/indexes/vectorstores/examples/mongodb_atlas_vector_search.ipynb @@ -1,12 +1,12 @@ { "cells": [ { - "attachments": {}, "cell_type": "markdown", "id": "683953b3", "metadata": {}, "source": [ - "#### Commented out until further notice\n", + "Commented out until further notice\n", + "\n", "MongoDB Atlas Vector Search\n", "\n", ">[MongoDB Atlas](https://www.mongodb.com/docs/atlas/) is a fully-managed cloud database available in AWS , Azure, and GCP. It now has support for native Vector Search on your MongoDB document data.\n", @@ -46,7 +46,6 @@ ] }, { - "attachments": {}, "cell_type": "markdown", "id": "457ace44-1d95-4001-9dd5-78811ab208ad", "metadata": {}, @@ -66,7 +65,6 @@ ] }, { - "attachments": {}, "cell_type": "markdown", "id": "1f3ecc42", "metadata": {}, @@ -162,7 +160,6 @@ ] }, { - "attachments": {}, "cell_type": "markdown", "id": "851a2ec9-9390-49a4-8412-3e132c9f789d", "metadata": {}, @@ -220,7 +217,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.11.3" + "version": "3.9.1" } }, "nbformat": 4, diff --git a/docs/modules/models/text_embedding/examples/embaas.ipynb b/docs/modules/models/text_embedding/examples/embaas.ipynb index 5a1350e7638bf..2473fe9045003 100644 --- a/docs/modules/models/text_embedding/examples/embaas.ipynb +++ b/docs/modules/models/text_embedding/examples/embaas.ipynb @@ -129,14 +129,14 @@ "language_info": { "codemirror_mode": { "name": "ipython", - "version": 2 + "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", - "pygments_lexer": "ipython2", - "version": "2.7.6" + "pygments_lexer": "ipython3", + "version": "3.9.1" } }, "nbformat": 4, From 2c91f0d750eb153f72f7d095fadde18f8c683de8 Mon Sep 17 00:00:00 2001 From: Jens Madsen Date: Mon, 12 Jun 2023 22:27:10 +0200 Subject: [PATCH 02/11] chore: spedd up integration test by using smaller model (#6044) Adds a new parameter `relative_chunk_overlap` for the `SentenceTransformersTokenTextSplitter` constructor. The parameter sets the chunk overlap using a relative factor, e.g. 
for a model where the token limit is 100, a `relative_chunk_overlap=0.5` implies that `chunk_overlap=50` Tag maintainers/contributors who might be interested: @hwchase17, @dev2049 --- tests/integration_tests/test_text_splitter.py | 31 ++++++++++++++----- 1 file changed, 23 insertions(+), 8 deletions(-) diff --git a/tests/integration_tests/test_text_splitter.py b/tests/integration_tests/test_text_splitter.py index 3cf78c71a0b71..e27108f98874b 100644 --- a/tests/integration_tests/test_text_splitter.py +++ b/tests/integration_tests/test_text_splitter.py @@ -52,14 +52,14 @@ def test_token_text_splitter_from_tiktoken() -> None: def test_sentence_transformers_count_tokens() -> None: splitter = SentenceTransformersTokenTextSplitter( - model_name="sentence-transformers/paraphrase-multilingual-mpnet-base-v2" + model_name="sentence-transformers/paraphrase-albert-small-v2" ) text = "Lorem ipsum" token_count = splitter.count_tokens(text=text) expected_start_stop_token_count = 2 - expected_text_token_count = 2 + expected_text_token_count = 5 expected_token_count = expected_start_stop_token_count + expected_text_token_count assert expected_token_count == token_count @@ -67,9 +67,9 @@ def test_sentence_transformers_count_tokens() -> None: def test_sentence_transformers_split_text() -> None: splitter = SentenceTransformersTokenTextSplitter( - model_name="sentence-transformers/paraphrase-multilingual-mpnet-base-v2" + model_name="sentence-transformers/paraphrase-albert-small-v2" ) - text = "Lorem ipsum" + text = "lorem ipsum" text_chunks = splitter.split_text(text=text) expected_text_chunks = [text] assert expected_text_chunks == text_chunks @@ -79,14 +79,29 @@ def test_sentence_transformers_multiple_tokens() -> None: splitter = SentenceTransformersTokenTextSplitter(chunk_overlap=0) text = "Lorem " + text_token_count_including_start_and_stop_tokens = splitter.count_tokens(text=text) count_start_and_end_tokens = 2 - text_token_count = splitter.count_tokens(text=text) - count_start_and_end_tokens - token_multiplier = splitter.maximum_tokens_per_chunk // text_token_count + 1 - text_chunks = splitter.split_text(text=text * token_multiplier) + token_multiplier = ( + count_start_and_end_tokens + + (splitter.maximum_tokens_per_chunk - count_start_and_end_tokens) + // ( + text_token_count_including_start_and_stop_tokens + - count_start_and_end_tokens + ) + + 1 + ) + + # `text_to_split` does not fit in a single chunk + text_to_embed = text * token_multiplier + + text_chunks = splitter.split_text(text=text_to_embed) expected_number_of_chunks = 2 assert expected_number_of_chunks == len(text_chunks) actual = splitter.count_tokens(text=text_chunks[1]) - count_start_and_end_tokens - expected = token_multiplier * text_token_count - splitter.maximum_tokens_per_chunk + expected = ( + token_multiplier * (text_token_count_including_start_and_stop_tokens - 2) + - splitter.maximum_tokens_per_chunk + ) assert expected == actual From b023f0c0f244f890c8d9102776b9586eb603cb2a Mon Sep 17 00:00:00 2001 From: Lance Martin <122662504+rlancemartin@users.noreply.github.com> Date: Mon, 12 Jun 2023 15:46:42 -0700 Subject: [PATCH 03/11] Text splitter for Markdown files by header (#5860) This creates a new kind of text splitter for markdown files. The user can supply a set of headers that they want to split the file on. 
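For instance, a minimal usage sketch (mirroring the example notebook added in this PR; the sample document and header labels are illustrative):

```python
from langchain.text_splitter import MarkdownHeaderTextSplitter

# Split on top-level and second-level headers; the second tuple element
# becomes the metadata key attached to each resulting chunk.
headers_to_split_on = [
    ("#", "Header 1"),
    ("##", "Header 2"),
]

markdown_document = "# Foo\n\n## Bar\n\nHi this is Jim\n\nHi this is Joe\n\n## Baz\n\nHi this is Molly"

markdown_splitter = MarkdownHeaderTextSplitter(headers_to_split_on=headers_to_split_on)
for chunk in markdown_splitter.split_text(markdown_document):
    # Each chunk is a dict with "content" and header "metadata"
    print(chunk)
```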
We define a new text splitter class, `MarkdownHeaderTextSplitter`, that does a few things: (1) For each line, it determines the associated set of user-specified headers (2) It groups lines with common headers into splits See notebook for example usage and test cases. --- .../examples/markdown_header_metadata.ipynb | 324 ++++++++++++++++++ langchain/text_splitter.py | 150 ++++++++ 2 files changed, 474 insertions(+) create mode 100644 docs/modules/indexes/text_splitters/examples/markdown_header_metadata.ipynb diff --git a/docs/modules/indexes/text_splitters/examples/markdown_header_metadata.ipynb b/docs/modules/indexes/text_splitters/examples/markdown_header_metadata.ipynb new file mode 100644 index 0000000000000..db300d63075af --- /dev/null +++ b/docs/modules/indexes/text_splitters/examples/markdown_header_metadata.ipynb @@ -0,0 +1,324 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "70e9b619", + "metadata": {}, + "source": [ + "# MarkdownHeaderTextSplitter\n", + "\n", + "The objective is to split a markdown file by a specified set of headers.\n", + " \n", + "**Given this example:**\n", + "\n", + "# Foo\n", + "\n", + "## Bar\n", + "\n", + "Hi this is Jim \n", + "Hi this is Joe\n", + "\n", + "## Baz\n", + "\n", + "Hi this is Molly\n", + " \n", + "**Written as:**\n", + "\n", + "```\n", + "md = '# Foo\\n\\n ## Bar\\n\\nHi this is Jim \\nHi this is Joe\\n\\n ## Baz\\n\\n Hi this is Molly' \n", + "```\n", + "\n", + "**If we want to split on specified headers:**\n", + "```\n", + "[(\"#\", \"Header 1\"),(\"##\", \"Header 2\")]\n", + "```\n", + "\n", + "**Then we expect:** \n", + "```\n", + "{'content': 'Hi this is Jim \\nHi this is Joe', 'metadata': {'Header 1': 'Foo', 'Header 2': 'Bar'}}\n", + "{'content': 'Hi this is Molly', 'metadata': {'Header 1': 'Foo', 'Header 2': 'Baz'}}\n", + "```\n", + "\n", + "**Options:**\n", + " \n", + "This also includes `return_each_line` in case a user want to perform other types of aggregation. \n", + "\n", + "If `return_each_line=True`, each line and associated header metadata are returned. 
" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "19c044f0", + "metadata": {}, + "outputs": [], + "source": [ + "from langchain.text_splitter import MarkdownHeaderTextSplitter" + ] + }, + { + "cell_type": "markdown", + "id": "ec8d8053", + "metadata": {}, + "source": [ + "`Test case 1`" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "5cd0a66c", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'content': 'Hi this is Jim', 'metadata': {'Header 1': 'Foo', 'Header 2': 'Bar'}}\n", + "{'content': 'Hi this is Joe', 'metadata': {'Header 1': 'Foo', 'Header 2': 'Bar'}}\n", + "{'content': 'Hi this is Molly', 'metadata': {'Header 1': 'Foo', 'Header 2': 'Baz'}}\n" + ] + } + ], + "source": [ + "# Doc\n", + "markdown_document = '# Foo\\n\\n ## Bar\\n\\nHi this is Jim\\n\\nHi this is Joe\\n\\n ## Baz\\n\\n Hi this is Molly' \n", + " \n", + "# Test case 1\n", + "headers_to_split_on = [\n", + " (\"#\", \"Header 1\"),\n", + " (\"##\", \"Header 2\"),\n", + "]\n", + "\n", + "markdown_splitter = MarkdownHeaderTextSplitter(headers_to_split_on=headers_to_split_on,return_each_line=True)\n", + "\n", + "chunked_docs = markdown_splitter.split_text(markdown_document)\n", + "for chunk in chunked_docs:\n", + " print(chunk)" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "67d25a1c", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'content': 'Hi this is Jim \\nHi this is Joe', 'metadata': {'Header 1': 'Foo', 'Header 2': 'Bar'}}\n", + "{'content': 'Hi this is Molly', 'metadata': {'Header 1': 'Foo', 'Header 2': 'Baz'}}\n" + ] + } + ], + "source": [ + "markdown_splitter = MarkdownHeaderTextSplitter(headers_to_split_on=headers_to_split_on,return_each_line=False)\n", + "chunked_docs = markdown_splitter.split_text(markdown_document)\n", + "for chunk in chunked_docs:\n", + " print(chunk)" + ] + }, + { + "cell_type": "markdown", + "id": "f1f74dfa", + "metadata": {}, + "source": [ + "`Test case 2`" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "2183c96a", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'content': 'Text under H3.', 'metadata': {'Header 1': 'H1', 'Header 2': 'H2', 'Header 3': 'H3'}}\n", + "{'content': 'Text under H2_2.', 'metadata': {'Header 1': 'H1_2', 'Header 2': 'H2_2'}}\n" + ] + } + ], + "source": [ + "headers_to_split_on = [\n", + " (\"#\", \"Header 1\"),\n", + " (\"##\", \"Header 2\"),\n", + " (\"###\", \"Header 3\"),\n", + "]\n", + "markdown_document = '# H1\\n\\n## H2\\n\\n### H3\\n\\nText under H3.\\n\\n# H1_2\\n\\n## H2_2\\n\\nText under H2_2.'\n", + "markdown_splitter = MarkdownHeaderTextSplitter(headers_to_split_on=headers_to_split_on,return_each_line=False)\n", + "chunked_docs = markdown_splitter.split_text(markdown_document)\n", + "for chunk in chunked_docs:\n", + " print(chunk)" + ] + }, + { + "cell_type": "markdown", + "id": "add24254", + "metadata": {}, + "source": [ + "`Test case 3`" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "c3f4690f", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'content': 'Hi this is Jim \\nHi this is Joe', 'metadata': {'Header 1': 'Foo', 'Header 2': 'Bar'}}\n", + "{'content': 'Hi this is Lance', 'metadata': {'Header 1': 'Foo', 'Header 2': 'Bar', 'Header 3': 'Boo'}}\n", + "{'content': 'Hi this is Molly', 'metadata': {'Header 1': 'Foo', 'Header 2': 'Baz'}}\n" + ] + 
} + ], + "source": [ + "markdown_document = '# Foo\\n\\n ## Bar\\n\\nHi this is Jim\\n\\nHi this is Joe\\n\\n ### Boo \\n\\n Hi this is Lance \\n\\n ## Baz\\n\\n Hi this is Molly' \n", + " \n", + "headers_to_split_on = [\n", + " (\"#\", \"Header 1\"),\n", + " (\"##\", \"Header 2\"),\n", + " (\"###\", \"Header 3\"),\n", + "]\n", + "markdown_splitter = MarkdownHeaderTextSplitter(headers_to_split_on=headers_to_split_on,return_each_line=False)\n", + "chunked_docs = markdown_splitter.split_text(markdown_document)\n", + "for chunk in chunked_docs:\n", + " print(chunk)" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "20907fb7", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'content': 'Hi this is Jim', 'metadata': {'Header 1': 'Foo', 'Header 2': 'Bar'}}\n", + "{'content': 'Hi this is Joe', 'metadata': {'Header 1': 'Foo', 'Header 2': 'Bar'}}\n", + "{'content': 'Hi this is Lance', 'metadata': {'Header 1': 'Foo', 'Header 2': 'Bar', 'Header 3': 'Boo'}}\n", + "{'content': 'Hi this is Molly', 'metadata': {'Header 1': 'Foo', 'Header 2': 'Baz'}}\n" + ] + } + ], + "source": [ + "markdown_splitter = MarkdownHeaderTextSplitter(headers_to_split_on=headers_to_split_on,return_each_line=True)\n", + "chunked_docs = markdown_splitter.split_text(markdown_document)\n", + "for chunk in chunked_docs:\n", + " print(chunk)" + ] + }, + { + "cell_type": "markdown", + "id": "9c448431", + "metadata": {}, + "source": [ + "`Test case 4`" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "9858ea51", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'content': 'Hi this is Jim \\nHi this is Joe', 'metadata': {'Header 1': 'Foo', 'Header 2': 'Bar'}}\n", + "{'content': 'Hi this is Lance', 'metadata': {'Header 1': 'Foo', 'Header 2': 'Bar', 'Header 3': 'Boo'}}\n", + "{'content': 'Hi this is John', 'metadata': {'Header 1': 'Foo', 'Header 2': 'Bar', 'Header 3': 'Boo', 'Header 4': 'Bim'}}\n", + "{'content': 'Hi this is Molly', 'metadata': {'Header 1': 'Foo', 'Header 2': 'Baz'}}\n" + ] + } + ], + "source": [ + "markdown_document = '# Foo\\n\\n ## Bar\\n\\nHi this is Jim\\n\\nHi this is Joe\\n\\n ### Boo \\n\\n Hi this is Lance \\n\\n #### Bim \\n\\n Hi this is John \\n\\n ## Baz\\n\\n Hi this is Molly'\n", + " \n", + "headers_to_split_on = [\n", + " (\"#\", \"Header 1\"),\n", + " (\"##\", \"Header 2\"),\n", + " (\"###\", \"Header 3\"),\n", + " (\"####\", \"Header 4\"),\n", + "]\n", + " \n", + "markdown_splitter = MarkdownHeaderTextSplitter(headers_to_split_on=headers_to_split_on,return_each_line=False)\n", + "chunked_docs = markdown_splitter.split_text(markdown_document)\n", + "for chunk in chunked_docs:\n", + " print(chunk)" + ] + }, + { + "cell_type": "markdown", + "id": "bba6eb9e", + "metadata": {}, + "source": [ + "`Test case 5`" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "8af8f9a2", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'content': 'Markdown[9] is a lightweight markup language for creating formatted text using a plain-text editor. 
John Gruber created Markdown in 2004 as a markup language that is appealing to human readers in its source code form.[9] \\nMarkdown is widely used in blogging, instant messaging, online forums, collaborative software, documentation pages, and readme files.', 'metadata': {'Header 1': 'Intro', 'Header 2': 'History'}}\n", + "{'content': 'As Markdown popularity grew rapidly, many Markdown implementations appeared, driven mostly by the need for \\nadditional features such as tables, footnotes, definition lists,[note 1] and Markdown inside HTML blocks.', 'metadata': {'Header 1': 'Intro', 'Header 2': 'Rise and divergence'}}\n", + "{'content': 'From 2012, a group of people, including Jeff Atwood and John MacFarlane, launched what Atwood characterised as a standardisation effort.', 'metadata': {'Header 1': 'Intro', 'Header 2': 'Rise and divergence', 'Header 4': 'Standardization'}}\n", + "{'content': 'Implementations of Markdown are available for over a dozen programming languages.', 'metadata': {'Header 1': 'Intro', 'Header 2': 'Implementations'}}\n" + ] + } + ], + "source": [ + "markdown_document = '# Intro \\n\\n ## History \\n\\n Markdown[9] is a lightweight markup language for creating formatted text using a plain-text editor. John Gruber created Markdown in 2004 as a markup language that is appealing to human readers in its source code form.[9] \\n\\n Markdown is widely used in blogging, instant messaging, online forums, collaborative software, documentation pages, and readme files. \\n\\n ## Rise and divergence \\n\\n As Markdown popularity grew rapidly, many Markdown implementations appeared, driven mostly by the need for \\n\\n additional features such as tables, footnotes, definition lists,[note 1] and Markdown inside HTML blocks. \\n\\n #### Standardization \\n\\n From 2012, a group of people, including Jeff Atwood and John MacFarlane, launched what Atwood characterised as a standardisation effort. 
\\n\\n ## Implementations \\n\\n Implementations of Markdown are available for over a dozen programming languages.'\n", + " \n", + "headers_to_split_on = [\n", + " (\"#\", \"Header 1\"),\n", + " (\"##\", \"Header 2\"),\n", + " (\"###\", \"Header 3\"),\n", + " (\"####\", \"Header 4\"),\n", + "]\n", + " \n", + "markdown_splitter = MarkdownHeaderTextSplitter(headers_to_split_on=headers_to_split_on,return_each_line=False)\n", + "chunked_docs = markdown_splitter.split_text(markdown_document)\n", + "for chunk in chunked_docs:\n", + " print(chunk)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.16" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/langchain/text_splitter.py b/langchain/text_splitter.py index 89559505583c7..15723b66d0e39 100644 --- a/langchain/text_splitter.py +++ b/langchain/text_splitter.py @@ -12,12 +12,15 @@ Any, Callable, Collection, + Dict, Iterable, List, Literal, Optional, Sequence, + Tuple, Type, + TypedDict, TypeVar, Union, cast, @@ -254,6 +257,153 @@ def split_text(self, text: str) -> List[str]: return self._merge_splits(splits, _separator) +class LineType(TypedDict): + metadata: Dict[str, str] + content: str + + +class HeaderType(TypedDict): + level: int + name: str + data: str + + +class MarkdownHeaderTextSplitter: + """Implementation of splitting markdown files based on specified headers.""" + + def __init__( + self, headers_to_split_on: List[Tuple[str, str]], return_each_line: bool = False + ): + """Create a new MarkdownHeaderTextSplitter. + + Args: + headers_to_split_on: Headers we want to track + return_each_line: Return each line w/ associated headers + """ + # Output line-by-line or aggregated into chunks w/ common headers + self.return_each_line = return_each_line + # Given the headers we want to split on, + # (e.g., "#, ##, etc") order by length + self.headers_to_split_on = sorted( + headers_to_split_on, key=lambda split: len(split[0]), reverse=True + ) + + def aggregate_lines_to_chunks(self, lines: List[LineType]) -> List[LineType]: + """Combine lines with common metadata into chunks + Args: + lines: Line of text / associated header metadata + """ + aggregated_chunks: List[LineType] = [] + + for line in lines: + if ( + aggregated_chunks + and aggregated_chunks[-1]["metadata"] == line["metadata"] + ): + # If the last line in the aggregated list + # has the same metadata as the current line, + # append the current content to the last lines's content + aggregated_chunks[-1]["content"] += " \n" + line["content"] + else: + # Otherwise, append the current line to the aggregated list + aggregated_chunks.append(line) + return aggregated_chunks + + def split_text(self, text: str) -> List[LineType]: + """Split markdown file + Args: + text: Markdown file""" + + # Split the input text by newline character ("\n"). 
+ lines = text.split("\n") + # Final output + lines_with_metadata: List[LineType] = [] + # Content and metadata of the chunk currently being processed + current_content: List[str] = [] + current_metadata: Dict[str, str] = {} + # Keep track of the nested header structure + # header_stack: List[Dict[str, Union[int, str]]] = [] + header_stack: List[HeaderType] = [] + initial_metadata: Dict[str, str] = {} + + for line in lines: + stripped_line = line.strip() + # Check each line against each of the header types (e.g., #, ##) + for sep, name in self.headers_to_split_on: + # Check if line starts with a header that we intend to split on + if stripped_line.startswith(sep) and ( + # Header with no text OR header is followed by space + # Both are valid conditions that sep is being used a header + len(stripped_line) == len(sep) + or stripped_line[len(sep)] == " " + ): + # Ensure we are tracking the header as metadata + if name is not None: + # Get the current header level + current_header_level = sep.count("#") + + # Pop out headers of lower or same level from the stack + while ( + header_stack + and header_stack[-1]["level"] >= current_header_level + ): + # We have encountered a new header + # at the same or higher level + popped_header = header_stack.pop() + # Clear the metadata for the + # popped header in initial_metadata + if popped_header["name"] in initial_metadata: + initial_metadata.pop(popped_header["name"]) + + # Push the current header to the stack + header: HeaderType = { + "level": current_header_level, + "name": name, + "data": stripped_line[len(sep) :].strip(), + } + header_stack.append(header) + # Update initial_metadata with the current header + initial_metadata[name] = header["data"] + + # Add the previous line to the lines_with_metadata + # only if current_content is not empty + if current_content: + lines_with_metadata.append( + { + "content": "\n".join(current_content), + "metadata": current_metadata.copy(), + } + ) + current_content.clear() + + break + else: + if stripped_line: + current_content.append(stripped_line) + elif current_content: + lines_with_metadata.append( + { + "content": "\n".join(current_content), + "metadata": current_metadata.copy(), + } + ) + current_content.clear() + + current_metadata = initial_metadata.copy() + + if current_content: + lines_with_metadata.append( + {"content": "\n".join(current_content), "metadata": current_metadata} + ) + + # lines_with_metadata has each line with associated header metadata + # aggregate these into chunks based on common metadata + if not self.return_each_line: + return self.aggregate_lines_to_chunks(lines_with_metadata) + else: + return lines_with_metadata + + # should be in newer Python versions (3.10+) # @dataclass(frozen=True, kw_only=True, slots=True) @dataclass(frozen=True) From 2f0088039d2b075c838632a5d4ec8cb045f8afa4 Mon Sep 17 00:00:00 2001 From: Zander Chase <130414180+vowelparrot@users.noreply.github.com> Date: Mon, 12 Jun 2023 17:13:49 -0700 Subject: [PATCH 04/11] Log tracer errors (#6066) Example (would log several times if not for the helper fn. 
Would emit no logs due to mulithreading previously) ![image](https://github.com/hwchase17/langchain/assets/130414180/070d25ae-1f06-4487-9617-0a6f66f3f01e) --- langchain/callbacks/tracers/langchain.py | 24 ++++++++++++++++++++++-- 1 file changed, 22 insertions(+), 2 deletions(-) diff --git a/langchain/callbacks/tracers/langchain.py b/langchain/callbacks/tracers/langchain.py index 9f734e2983b5e..1bdeb352b071e 100644 --- a/langchain/callbacks/tracers/langchain.py +++ b/langchain/callbacks/tracers/langchain.py @@ -16,6 +16,16 @@ from langchain.schema import BaseMessage, messages_to_dict logger = logging.getLogger(__name__) +_LOGGED = set() + + +def log_error_once(method: str, exception: Exception) -> None: + """Log an error once.""" + global _LOGGED + if (method, type(exception)) in _LOGGED: + return + _LOGGED.add((method, type(exception))) + logger.error(exception) class LangChainTracer(BaseTracer): @@ -76,11 +86,21 @@ def _persist_run_single(self, run: Run) -> None: extra = run_dict.get("extra", {}) extra["runtime"] = get_runtime_environment() run_dict["extra"] = extra - run = self.client.create_run(**run_dict, session_name=self.session_name) + try: + run = self.client.create_run(**run_dict, session_name=self.session_name) + except Exception as e: + # Errors are swallowed by the thread executor so we need to log them here + log_error_once("post", e) + raise def _update_run_single(self, run: Run) -> None: """Update a run.""" - self.client.update_run(run.id, **run.dict()) + try: + self.client.update_run(run.id, **run.dict()) + except Exception as e: + # Errors are swallowed by the thread executor so we need to log them here + log_error_once("patch", e) + raise def _on_llm_start(self, run: Run) -> None: """Persist an LLM run.""" From 5b6bbf4ab2a33ed0d33ff5d3cb3979a7edc15682 Mon Sep 17 00:00:00 2001 From: Julius Lipp <43986145+juliuslipp@users.noreply.github.com> Date: Tue, 13 Jun 2023 10:13:52 +0800 Subject: [PATCH 05/11] Add embaas document extraction api endpoints (#6048) # Introduces embaas document extraction api endpoints In this PR, we add support for embaas document extraction endpoints to Text Embedding Models (with LLMs, in different PRs coming). We currently offer the MTEB leaderboard top performers, will continue to add top embedding models and soon add support for customers to deploy thier own models. Additional Documentation + Infomation can be found [here](https://embaas.io). While developing this integration, I closely followed the patterns established by other langchain integrations. Nonetheless, if there are any aspects that require adjustments or if there's a better way to present a new integration, let me know! :) Additionally, I fixed some docs in the embeddings integration. Related PR: #5976 #### Who can review? 
DataLoaders - @eyurtsev --- .../document_loaders/examples/embaas.ipynb | 167 +++++++++++++ langchain/document_loaders/__init__.py | 3 + langchain/document_loaders/embaas.py | 234 ++++++++++++++++++ langchain/embeddings/embaas.py | 7 +- .../document_loaders/test_embaas.py | 59 +++++ 5 files changed, 466 insertions(+), 4 deletions(-) create mode 100644 docs/modules/indexes/document_loaders/examples/embaas.ipynb create mode 100644 langchain/document_loaders/embaas.py create mode 100644 tests/integration_tests/document_loaders/test_embaas.py diff --git a/docs/modules/indexes/document_loaders/examples/embaas.ipynb b/docs/modules/indexes/document_loaders/examples/embaas.ipynb new file mode 100644 index 0000000000000..0c8c19d71acf8 --- /dev/null +++ b/docs/modules/indexes/document_loaders/examples/embaas.ipynb @@ -0,0 +1,167 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "source": [ + "# Embaas\n", + "[embaas](https://embaas.io) is a fully managed NLP API service that offers features like embedding generation, document text extraction, document to embeddings and more. You can choose a [variety of pre-trained models](https://embaas.io/docs/models/embeddings).\n", + "\n", + "### Prerequisites\n", + "Create a free embaas account at [https://embaas.io/register](https://embaas.io/register) and generate an [API key](https://embaas.io/dashboard/api-keys)\n", + "\n", + "### Document Text Extraction API\n", + "The document text extraction API allows you to extract the text from a given document. The API supports a variety of document formats, including PDF, mp3, mp4 and more. For a full list of supported formats, check out the API docs (link below)." + ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "code", + "execution_count": null, + "outputs": [], + "source": [ + "# Set API key\n", + "embaas_api_key = \"YOUR_API_KEY\"\n", + "# or set environment variable\n", + "os.environ[\"EMBAAS_API_KEY\"] = \"YOUR_API_KEY\"" + ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "markdown", + "source": [ + "#### Using a blob (bytes)" + ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "code", + "execution_count": null, + "outputs": [], + "source": [ + "from langchain.document_loaders.embaas import EmbaasBlobLoader\n", + "from langchain.document_loaders.blob_loaders import Blob" + ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "code", + "execution_count": null, + "outputs": [], + "source": [ + "blob_loader = EmbaasBlobLoader()\n", + "blob = Blob.from_path(\"example.pdf\")\n", + "documents = blob_loader.load(blob)" + ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "code", + "execution_count": null, + "outputs": [], + "source": [ + "# You can also directly create embeddings with your preferred embeddings model\n", + "blob_loader = EmbaasBlobLoader(params={\"model\": \"e5-large-v2\", \"should_embed\": True})\n", + "blob = Blob.from_path(\"example.pdf\")\n", + "documents = blob_loader.load(blob)\n", + "\n", + "print(documents[0][\"metadata\"][\"embedding\"])" + ], + "metadata": { + "collapsed": false, + "ExecuteTime": { + "start_time": "2023-06-12T22:19:48.366886Z", + "end_time": "2023-06-12T22:19:48.380467Z" + } + } + }, + { + "cell_type": "markdown", + "source": [ + "#### Using a file" + ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "code", + "execution_count": null, + "outputs": [], + "source": [ + "from langchain.document_loaders.embaas import EmbaasLoader" + ], + "metadata": { + "collapsed": false 
+ } + }, + { + "cell_type": "code", + "execution_count": null, + "outputs": [], + "source": [ + "file_loader = EmbaasLoader(file_path=\"example.pdf\")\n", + "documents = file_loader.load()" + ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "code", + "execution_count": 15, + "outputs": [], + "source": [ + "# Disable automatic text splitting\n", + "file_loader = EmbaasLoader(file_path=\"example.mp3\", params={\"should_chunk\": False})\n", + "documents = file_loader.load()" + ], + "metadata": { + "collapsed": false, + "ExecuteTime": { + "start_time": "2023-06-12T22:24:31.880857Z", + "end_time": "2023-06-12T22:24:31.894665Z" + } + } + }, + { + "cell_type": "markdown", + "source": [ + "For more detailed information about the embaas document text extraction API, please refer to [the official embaas API documentation](https://embaas.io/api-reference)." + ], + "metadata": { + "collapsed": false + } + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 2 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython2", + "version": "2.7.6" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} diff --git a/langchain/document_loaders/__init__.py b/langchain/document_loaders/__init__.py index 87d2335e2344c..17f764b1f8e6e 100644 --- a/langchain/document_loaders/__init__.py +++ b/langchain/document_loaders/__init__.py @@ -31,6 +31,7 @@ OutlookMessageLoader, UnstructuredEmailLoader, ) +from langchain.document_loaders.embaas import EmbaasBlobLoader, EmbaasLoader from langchain.document_loaders.epub import UnstructuredEPubLoader from langchain.document_loaders.evernote import EverNoteLoader from langchain.document_loaders.excel import UnstructuredExcelLoader @@ -250,4 +251,6 @@ "WikipediaLoader", "YoutubeLoader", "SnowflakeLoader", + "EmbaasLoader", + "EmbaasBlobLoader", ] diff --git a/langchain/document_loaders/embaas.py b/langchain/document_loaders/embaas.py new file mode 100644 index 0000000000000..5dc4071e86e0a --- /dev/null +++ b/langchain/document_loaders/embaas.py @@ -0,0 +1,234 @@ +import base64 +import warnings +from typing import Any, Dict, Iterator, List, Optional + +import requests +from pydantic import BaseModel, root_validator, validator +from typing_extensions import NotRequired, TypedDict + +from langchain.docstore.document import Document +from langchain.document_loaders.base import BaseBlobParser, BaseLoader +from langchain.document_loaders.blob_loaders import Blob +from langchain.text_splitter import TextSplitter +from langchain.utils import get_from_dict_or_env + +EMBAAS_DOC_API_URL = "https://api.embaas.io/v1/document/extract-text/bytes/" + + +class EmbaasDocumentExtractionParameters(TypedDict): + """Parameters for the embaas document extraction API.""" + + mime_type: NotRequired[str] + """The mime type of the document.""" + file_extension: NotRequired[str] + """The file extension of the document.""" + file_name: NotRequired[str] + """The file name of the document.""" + + should_chunk: NotRequired[bool] + """Whether to chunk the document into pages.""" + chunk_size: NotRequired[int] + """The maximum size of the text chunks.""" + chunk_overlap: NotRequired[int] + """The maximum overlap allowed between chunks.""" + chunk_splitter: NotRequired[str] + """The text splitter class name for creating chunks.""" + separators: NotRequired[List[str]] + 
"""The separators for chunks.""" + + should_embed: NotRequired[bool] + """Whether to create embeddings for the document in the response.""" + model: NotRequired[str] + """The model to pass to the Embaas document extraction API.""" + instruction: NotRequired[str] + """The instruction to pass to the Embaas document extraction API.""" + + +class EmbaasDocumentExtractionPayload(EmbaasDocumentExtractionParameters): + bytes: str + """The base64 encoded bytes of the document to extract text from.""" + + +class BaseEmbaasLoader(BaseModel): + embaas_api_key: Optional[str] = None + api_url: str = EMBAAS_DOC_API_URL + """The URL of the embaas document extraction API.""" + params: EmbaasDocumentExtractionParameters = EmbaasDocumentExtractionParameters() + """Additional parameters to pass to the embaas document extraction API.""" + + @root_validator(pre=True) + def validate_environment(cls, values: Dict) -> Dict: + """Validate that api key and python package exists in environment.""" + embaas_api_key = get_from_dict_or_env( + values, "embaas_api_key", "EMBAAS_API_KEY" + ) + values["embaas_api_key"] = embaas_api_key + return values + + +class EmbaasBlobLoader(BaseEmbaasLoader, BaseBlobParser): + """Wrapper around embaas's document byte loader service. + + To use, you should have the + environment variable ``EMBAAS_API_KEY`` set with your API key, or pass + it as a named parameter to the constructor. + + Example: + .. code-block:: python + + # Default parsing + from langchain.document_loaders.embaas import EmbaasBlobLoader + loader = EmbaasBlobLoader() + blob = Blob.from_path(path="example.mp3") + documents = loader.parse(blob=blob) + + # Custom api parameters (create embeddings automatically) + from langchain.document_loaders.embaas import EmbaasBlobLoader + loader = EmbaasBlobLoader( + params={ + "should_embed": True, + "model": "e5-large-v2", + "chunk_size": 256, + "chunk_splitter": "CharacterTextSplitter" + } + ) + blob = Blob.from_path(path="example.pdf") + documents = loader.parse(blob=blob) + """ + + def lazy_parse(self, blob: Blob) -> Iterator[Document]: + yield from self._get_documents(blob=blob) + + @staticmethod + def _api_response_to_documents(chunks: List[Dict[str, Any]]) -> List[Document]: + """Convert the API response to a list of documents.""" + docs = [] + for chunk in chunks: + metadata = chunk["metadata"] + if chunk.get("embedding", None) is not None: + metadata["embedding"] = chunk["embedding"] + doc = Document(page_content=chunk["text"], metadata=metadata) + docs.append(doc) + + return docs + + def _generate_payload(self, blob: Blob) -> EmbaasDocumentExtractionPayload: + """Generates payload for the API request.""" + base64_byte_str = base64.b64encode(blob.as_bytes()).decode() + payload: EmbaasDocumentExtractionPayload = EmbaasDocumentExtractionPayload( + bytes=base64_byte_str, + # Workaround for mypy issue: https://github.com/python/mypy/issues/9408 + # type: ignore + **self.params, + ) + + if blob.mimetype is not None and payload.get("mime_type", None) is None: + payload["mime_type"] = blob.mimetype + + return payload + + def _handle_request( + self, payload: EmbaasDocumentExtractionPayload + ) -> List[Document]: + """Sends a request to the embaas API and handles the response.""" + headers = { + "Authorization": f"Bearer {self.embaas_api_key}", + "Content-Type": "application/json", + } + + response = requests.post(self.api_url, headers=headers, json=payload) + response.raise_for_status() + + parsed_response = response.json() + return EmbaasBlobLoader._api_response_to_documents( + 
chunks=parsed_response["data"]["chunks"] + ) + + def _get_documents(self, blob: Blob) -> Iterator[Document]: + """Get the documents from the blob.""" + payload = self._generate_payload(blob=blob) + + try: + documents = self._handle_request(payload=payload) + except requests.exceptions.RequestException as e: + if e.response is None or not e.response.text: + raise ValueError( + f"Error raised by embaas document text extraction API: {e}" + ) + + parsed_response = e.response.json() + if "message" in parsed_response: + raise ValueError( + f"Validation Error raised by embaas document text extraction API:" + f" {parsed_response['message']}" + ) + raise + + yield from documents + + +class EmbaasLoader(BaseEmbaasLoader, BaseLoader): + """Wrapper around embaas's document loader service. + + To use, you should have the + environment variable ``EMBAAS_API_KEY`` set with your API key, or pass + it as a named parameter to the constructor. + + Example: + .. code-block:: python + + # Default parsing + from langchain.document_loaders.embaas import EmbaasLoader + loader = EmbaasLoader(file_path="example.mp3") + documents = loader.load() + + # Custom api parameters (create embeddings automatically) + from langchain.document_loaders.embaas import EmbaasBlobLoader + loader = EmbaasBlobLoader( + file_path="example.pdf", + params={ + "should_embed": True, + "model": "e5-large-v2", + "chunk_size": 256, + "chunk_splitter": "CharacterTextSplitter" + } + ) + documents = loader.load() + """ + + file_path: str + """The path to the file to load.""" + blob_loader: Optional[EmbaasBlobLoader] + """The blob loader to use. If not provided, a default one will be created.""" + + @validator("blob_loader", always=True) + def validate_blob_loader( + cls, v: EmbaasBlobLoader, values: Dict + ) -> EmbaasBlobLoader: + return v or EmbaasBlobLoader( + embaas_api_key=values["embaas_api_key"], + api_url=values["api_url"], + params=values["params"], + ) + + def lazy_load(self) -> Iterator[Document]: + """Load the documents from the file path lazily.""" + blob = Blob.from_path(path=self.file_path) + + assert self.blob_loader is not None + # Should never be None, but mypy doesn't know that. + yield from self.blob_loader.lazy_parse(blob=blob) + + def load(self) -> List[Document]: + return list(self.lazy_load()) + + def load_and_split( + self, text_splitter: Optional[TextSplitter] = None + ) -> List[Document]: + if self.params.get("should_embed", False): + warnings.warn( + "Embeddings are not supported with load_and_split." + " Use the API splitter to properly generate embeddings." + " For more information see embaas.io docs." + ) + return super().load_and_split(text_splitter=text_splitter) diff --git a/langchain/embeddings/embaas.py b/langchain/embeddings/embaas.py index 8a9134f711712..e0a42e46d573e 100644 --- a/langchain/embeddings/embaas.py +++ b/langchain/embeddings/embaas.py @@ -32,17 +32,16 @@ class EmbaasEmbeddings(BaseModel, Embeddings): .. 
code-block:: python # Initialise with default model and instruction - from langchain.llms import EmbaasEmbeddings + from langchain.embeddings import EmbaasEmbeddings emb = EmbaasEmbeddings() # Initialise with custom model and instruction - from langchain.llms import EmbaasEmbeddings + from langchain.embeddings import EmbaasEmbeddings emb_model = "instructor-large" emb_inst = "Represent the Wikipedia document for retrieval" emb = EmbaasEmbeddings( model=emb_model, - instruction=emb_inst, - embaas_api_key="your-api-key" + instruction=emb_inst ) """ diff --git a/tests/integration_tests/document_loaders/test_embaas.py b/tests/integration_tests/document_loaders/test_embaas.py new file mode 100644 index 0000000000000..2170a143c66ac --- /dev/null +++ b/tests/integration_tests/document_loaders/test_embaas.py @@ -0,0 +1,59 @@ +from typing import Any +from unittest.mock import MagicMock, patch + +import responses + +from langchain.document_loaders import EmbaasBlobLoader, EmbaasLoader +from langchain.document_loaders.blob_loaders import Blob +from langchain.document_loaders.embaas import EMBAAS_DOC_API_URL + + +@responses.activate +def test_handle_request() -> None: + responses.add( + responses.POST, + EMBAAS_DOC_API_URL, + json={ + "data": { + "chunks": [ + { + "text": "Hello", + "metadata": {"start_page": 1, "end_page": 2}, + "embeddings": [0.0], + } + ] + } + }, + status=200, + ) + + loader = EmbaasBlobLoader(embaas_api_key="api_key", params={"should_embed": True}) + documents = loader.parse(blob=Blob.from_data(data="Hello")) + assert len(documents) == 1 + assert documents[0].page_content == "Hello" + assert documents[0].metadata["start_page"] == 1 + assert documents[0].metadata["end_page"] == 2 + assert documents[0].metadata["embeddings"] == [0.0] + + +@responses.activate +def test_handle_request_exception() -> None: + responses.add( + responses.POST, + EMBAAS_DOC_API_URL, + json={"message": "Invalid request"}, + status=400, + ) + loader = EmbaasBlobLoader(embaas_api_key="api_key") + try: + loader.parse(blob=Blob.from_data(data="Hello")) + except Exception as e: + assert "Invalid request" in str(e) + + +@patch.object(EmbaasBlobLoader, "_handle_request") +def test_load(mock_handle_request: Any) -> None: + mock_handle_request.return_value = [MagicMock()] + loader = EmbaasLoader(file_path="test_embaas.py", embaas_api_key="api_key") + documents = loader.load() + assert len(documents) == 1 From ec1a2adf9ce4f310b53f90602ead72c1f2483f60 Mon Sep 17 00:00:00 2001 From: Harrison Chase Date: Mon, 12 Jun 2023 22:19:03 -0700 Subject: [PATCH 06/11] improve tools (#6062) --- langchain/tools/base.py | 51 ++++++++++++++++++++++------- tests/unit_tests/tools/test_base.py | 38 +++++++++++++++++++-- 2 files changed, 75 insertions(+), 14 deletions(-) diff --git a/langchain/tools/base.py b/langchain/tools/base.py index ea69731b67d0f..ca3bebb041460 100644 --- a/langchain/tools/base.py +++ b/langchain/tools/base.py @@ -68,18 +68,14 @@ def _create_subset_model( name: str, model: BaseModel, field_names: list ) -> Type[BaseModel]: """Create a pydantic model with only a subset of model's fields.""" - fields = { - field_name: ( - model.__fields__[field_name].type_, - model.__fields__[field_name].default, - ) - for field_name in field_names - if field_name in model.__fields__ - } + fields = {} + for field_name in field_names: + field = model.__fields__[field_name] + fields[field_name] = (field.type_, field.field_info) return create_model(name, **fields) # type: ignore -def get_filtered_args( +def _get_filtered_args( 
inferred_model: Type[BaseModel], func: Callable, ) -> dict: @@ -100,15 +96,22 @@ def create_schema_from_function( model_name: str, func: Callable, ) -> Type[BaseModel]: - """Create a pydantic schema from a function's signature.""" + """Create a pydantic schema from a function's signature. + Args: + model_name: Name to assign to the generated pydandic schema + func: Function to generate the schema from + Returns: + A pydantic model with the same arguments as the function + """ + # https://docs.pydantic.dev/latest/usage/validation_decorator/ validated = validate_arguments(func, config=_SchemaConfig) # type: ignore inferred_model = validated.model # type: ignore if "run_manager" in inferred_model.__fields__: del inferred_model.__fields__["run_manager"] # Pydantic adds placeholder virtual fields we need to strip - filtered_args = get_filtered_args(inferred_model, func) + valid_properties = _get_filtered_args(inferred_model, func) return _create_subset_model( - f"{model_name}Schema", inferred_model, list(filtered_args) + f"{model_name}Schema", inferred_model, list(valid_properties) ) @@ -534,6 +537,30 @@ def from_function( infer_schema: bool = True, **kwargs: Any, ) -> StructuredTool: + """Create tool from a given function. + + A classmethod that helps to create a tool from a function. + + Args: + func: The function from which to create a tool + name: The name of the tool. Defaults to the function name + description: The description of the tool. Defaults to the function docstring + return_direct: Whether to return the result directly or as a callback + args_schema: The schema of the tool's input arguments + infer_schema: Whether to infer the schema from the function's signature + **kwargs: Additional arguments to pass to the tool + + Returns: + The tool + + Examples: + ... 
code-block:: python + def add(a: int, b: int) -> int: + \"\"\"Add two numbers\"\"\" + return a + b + tool = StructuredTool.from_function(add) + tool.run(1, 2) # 3 + """ name = name or func.__name__ description = description or func.__doc__ assert ( diff --git a/tests/unit_tests/tools/test_base.py b/tests/unit_tests/tools/test_base.py index cea017d79d271..9f05faa0c3613 100644 --- a/tests/unit_tests/tools/test_base.py +++ b/tests/unit_tests/tools/test_base.py @@ -315,6 +315,39 @@ def test_tool_lambda_args_schema() -> None: assert tool.args == expected_args +def test_structured_tool_from_function_docstring() -> None: + """Test that structured tools can be created from functions.""" + + def foo(bar: int, baz: str) -> str: + """Docstring + Args: + bar: int + baz: str + """ + raise NotImplementedError() + + structured_tool = StructuredTool.from_function(foo) + assert structured_tool.name == "foo" + assert structured_tool.args == { + "bar": {"title": "Bar", "type": "integer"}, + "baz": {"title": "Baz", "type": "string"}, + } + + assert structured_tool.args_schema.schema() == { + "properties": { + "bar": {"title": "Bar", "type": "integer"}, + "baz": {"title": "Baz", "type": "string"}, + }, + "title": "fooSchemaSchema", + "type": "object", + "required": ["bar", "baz"], + } + + prefix = "foo(bar: int, baz: str) -> str - " + assert foo.__doc__ is not None + assert structured_tool.description == prefix + foo.__doc__.strip() + + def test_structured_tool_lambda_multi_args_schema() -> None: """Test args schema inference when the tool argument is a lambda function.""" tool = StructuredTool.from_function( @@ -577,12 +610,13 @@ def foo(bar: int, baz: str) -> str: } assert structured_tool.args_schema.schema() == { + "title": "fooSchemaSchema", + "type": "object", "properties": { "bar": {"title": "Bar", "type": "integer"}, "baz": {"title": "Baz", "type": "string"}, }, - "title": "fooSchemaSchema", - "type": "object", + "required": ["bar", "baz"], } prefix = "foo(bar: int, baz: str) -> str - " From 6ac5d8028621dccc5d7015a56264ad9e3e3686c6 Mon Sep 17 00:00:00 2001 From: Harrison Chase Date: Mon, 12 Jun 2023 22:37:55 -0700 Subject: [PATCH 07/11] propogate kwargs fully (#6076) --- langchain/chat_models/base.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/langchain/chat_models/base.py b/langchain/chat_models/base.py index f3521df6fb17d..440336f275e29 100644 --- a/langchain/chat_models/base.py +++ b/langchain/chat_models/base.py @@ -190,9 +190,10 @@ def __call__( messages: List[BaseMessage], stop: Optional[List[str]] = None, callbacks: Callbacks = None, + **kwargs: Any, ) -> BaseMessage: generation = self.generate( - [messages], stop=stop, callbacks=callbacks + [messages], stop=stop, callbacks=callbacks, **kwargs ).generations[0][0] if isinstance(generation, ChatGeneration): return generation.message @@ -227,7 +228,7 @@ def predict( _stop = None else: _stop = list(stop) - result = self([HumanMessage(content=text)], stop=_stop) + result = self([HumanMessage(content=text)], stop=_stop, **kwargs) return result.content def predict_messages( From a9b3b2e3270aa24cb3cc4680c1e19d6eda86b013 Mon Sep 17 00:00:00 2001 From: Nuno Campos Date: Tue, 13 Jun 2023 06:39:10 +0100 Subject: [PATCH 08/11] Enable serialization for anthropic (#6049) --- langchain/chat_models/anthropic.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/langchain/chat_models/anthropic.py b/langchain/chat_models/anthropic.py index 5f21cdeb3c908..a9f3d0e02dba8 100644 --- a/langchain/chat_models/anthropic.py +++ 
b/langchain/chat_models/anthropic.py @@ -44,6 +44,10 @@ def _llm_type(self) -> str: """Return type of chat model.""" return "anthropic-chat" + @property + def lc_serializable(self) -> bool: + return True + def _convert_one_message_to_text(self, message: BaseMessage) -> str: if isinstance(message, ChatMessage): message_text = f"\n\n{message.role.capitalize()}: {message.content}" From cde1e8739a1f315b90763e9dee0e5022f7d768dd Mon Sep 17 00:00:00 2001 From: Harrison Chase Date: Mon, 12 Jun 2023 22:45:24 -0700 Subject: [PATCH 09/11] turn off repr (#6078) --- langchain/load/serializable.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/langchain/load/serializable.py b/langchain/load/serializable.py index 9c8c60bfe9313..56c97dfae3b32 100644 --- a/langchain/load/serializable.py +++ b/langchain/load/serializable.py @@ -55,7 +55,7 @@ def lc_attributes(self) -> Dict: """ return {} - lc_kwargs: Dict[str, Any] = Field(default_factory=dict, exclude=True) + lc_kwargs: Dict[str, Any] = Field(default_factory=dict, exclude=True, repr=False) def __init__(self, **kwargs: Any) -> None: super().__init__(**kwargs) From 0c52275bdbc4b182fcd1efe20c5ce60a32c44075 Mon Sep 17 00:00:00 2001 From: Zander Chase <130414180+vowelparrot@users.noreply.github.com> Date: Tue, 13 Jun 2023 07:14:11 -0700 Subject: [PATCH 10/11] Use Run object from SDK (#6067) Update the Run object in the tracer to extend that in the SDK to include the parameters necessary for tracking/tracing --- langchain/callbacks/tracers/langchain.py | 6 +- langchain/callbacks/tracers/schemas.py | 70 +++++-------------- poetry.lock | 18 ++--- pyproject.toml | 2 +- .../callbacks/test_callback_manager.py | 4 +- tests/unit_tests/callbacks/test_schemas.py | 27 +++++++ 6 files changed, 64 insertions(+), 63 deletions(-) create mode 100644 tests/unit_tests/callbacks/test_schemas.py diff --git a/langchain/callbacks/tracers/langchain.py b/langchain/callbacks/tracers/langchain.py index 1bdeb352b071e..2dabfcdebea35 100644 --- a/langchain/callbacks/tracers/langchain.py +++ b/langchain/callbacks/tracers/langchain.py @@ -11,7 +11,11 @@ from langchainplus_sdk import LangChainPlusClient from langchain.callbacks.tracers.base import BaseTracer -from langchain.callbacks.tracers.schemas import Run, RunTypeEnum, TracerSession +from langchain.callbacks.tracers.schemas import ( + Run, + RunTypeEnum, + TracerSession, +) from langchain.env import get_runtime_environment from langchain.schema import BaseMessage, messages_to_dict diff --git a/langchain/callbacks/tracers/schemas.py b/langchain/callbacks/tracers/schemas.py index 74e4d66330b6f..1e264e7dd2570 100644 --- a/langchain/callbacks/tracers/schemas.py +++ b/langchain/callbacks/tracers/schemas.py @@ -2,13 +2,13 @@ from __future__ import annotations import datetime -from enum import Enum from typing import Any, Dict, List, Optional from uuid import UUID +from langchainplus_sdk.schemas import RunBase as BaseRunV2 +from langchainplus_sdk.schemas import RunTypeEnum from pydantic import BaseModel, Field, root_validator -from langchain.env import get_runtime_environment from langchain.schema import LLMResult @@ -88,36 +88,11 @@ class ToolRun(BaseRun): # Begin V2 API Schemas -class RunTypeEnum(str, Enum): - """Enum for run types.""" +class Run(BaseRunV2): + """Run schema for the V2 API in the Tracer.""" - tool = "tool" - chain = "chain" - llm = "llm" - - -class RunBase(BaseModel): - """Base Run schema.""" - - id: Optional[UUID] - start_time: datetime.datetime = Field(default_factory=datetime.datetime.utcnow) - end_time: 
datetime.datetime = Field(default_factory=datetime.datetime.utcnow) - extra: Optional[Dict[str, Any]] = None - error: Optional[str] execution_order: int - child_execution_order: Optional[int] - serialized: dict - inputs: dict - outputs: Optional[dict] - reference_example_id: Optional[UUID] - run_type: RunTypeEnum - parent_run_id: Optional[UUID] - - -class Run(RunBase): - """Run schema when loading from the DB.""" - - name: str + child_execution_order: int child_runs: List[Run] = Field(default_factory=list) @root_validator(pre=True) @@ -131,26 +106,19 @@ def assign_name(cls, values: dict) -> dict: return values -class RunCreate(RunBase): - name: str - session_name: Optional[str] = None - - @root_validator(pre=True) - def add_runtime_env(cls, values: Dict[str, Any]) -> Dict[str, Any]: - """Add env info to the run.""" - extra = values.get("extra", {}) - extra["runtime"] = get_runtime_environment() - values["extra"] = extra - return values - - -class RunUpdate(BaseModel): - end_time: Optional[datetime.datetime] - error: Optional[str] - outputs: Optional[dict] - parent_run_id: Optional[UUID] - reference_example_id: Optional[UUID] - - ChainRun.update_forward_refs() ToolRun.update_forward_refs() + +__all__ = [ + "BaseRun", + "ChainRun", + "LLMRun", + "Run", + "RunTypeEnum", + "ToolRun", + "TracerSession", + "TracerSessionBase", + "TracerSessionV1", + "TracerSessionV1Base", + "TracerSessionV1Create", +] diff --git a/poetry.lock b/poetry.lock index 6d5bb1e0c2ba2..92094baaffefb 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1,4 +1,4 @@ -# This file is automatically @generated by Poetry and should not be changed by hand. +# This file is automatically @generated by Poetry 1.4.2 and should not be changed by hand. [[package]] name = "absl-py" @@ -4058,14 +4058,14 @@ tests = ["pytest", "pytest-mock"] [[package]] name = "langchainplus-sdk" -version = "0.0.7" +version = "0.0.9" description = "Client library to connect to the LangChainPlus LLM Tracing and Evaluation Platform." 
category = "main" optional = false python-versions = ">=3.8.1,<4.0" files = [ - {file = "langchainplus_sdk-0.0.7-py3-none-any.whl", hash = "sha256:aefc471058648bf9fc51f659117d33ef905d25a304d5a021f7e32c30f5921076"}, - {file = "langchainplus_sdk-0.0.7.tar.gz", hash = "sha256:b58565bdcaf301d2e6e7dd8898f0b8ccf549a35476258e0c14d871d6de02d210"}, + {file = "langchainplus_sdk-0.0.9-py3-none-any.whl", hash = "sha256:4fe1a60f28c93ae0e145dcd53e4dc5293374ed0a8518abcc51e201081809bf0b"}, + {file = "langchainplus_sdk-0.0.9.tar.gz", hash = "sha256:bbfdc54c64df5ca4334068ab2d7b89d3a894f313b1285939b4c4532fea62eeb7"}, ] [package.dependencies] @@ -11472,13 +11472,13 @@ cffi = {version = ">=1.11", markers = "platform_python_implementation == \"PyPy\ cffi = ["cffi (>=1.11)"] [extras] -all = ["anthropic", "cohere", "openai", "nlpcloud", "huggingface_hub", "jina", "manifest-ml", "elasticsearch", "opensearch-py", "google-search-results", "faiss-cpu", "sentence-transformers", "transformers", "spacy", "nltk", "wikipedia", "beautifulsoup4", "tiktoken", "torch", "jinja2", "pinecone-client", "pinecone-text", "pymongo", "weaviate-client", "redis", "google-api-python-client", "google-auth", "wolframalpha", "qdrant-client", "tensorflow-text", "pypdf", "networkx", "nomic", "aleph-alpha-client", "deeplake", "pgvector", "psycopg2-binary", "pyowm", "pytesseract", "html2text", "atlassian-python-api", "gptcache", "duckduckgo-search", "arxiv", "azure-identity", "clickhouse-connect", "azure-cosmos", "lancedb", "langkit", "lark", "pexpect", "pyvespa", "O365", "jq", "docarray", "steamship", "pdfminer-six", "lxml", "requests-toolbelt", "neo4j", "openlm", "azure-ai-formrecognizer", "azure-ai-vision", "azure-cognitiveservices-speech", "momento", "singlestoredb", "tigrisdb", "nebula3-python", "awadb"] -azure = ["azure-identity", "azure-cosmos", "openai", "azure-core", "azure-ai-formrecognizer", "azure-ai-vision", "azure-cognitiveservices-speech", "azure-search-documents"] +all = ["O365", "aleph-alpha-client", "anthropic", "arxiv", "atlassian-python-api", "awadb", "azure-ai-formrecognizer", "azure-ai-vision", "azure-cognitiveservices-speech", "azure-cosmos", "azure-identity", "beautifulsoup4", "clickhouse-connect", "cohere", "deeplake", "docarray", "duckduckgo-search", "elasticsearch", "faiss-cpu", "google-api-python-client", "google-auth", "google-search-results", "gptcache", "html2text", "huggingface_hub", "jina", "jinja2", "jq", "lancedb", "langkit", "lark", "lxml", "manifest-ml", "momento", "nebula3-python", "neo4j", "networkx", "nlpcloud", "nltk", "nomic", "openai", "openlm", "opensearch-py", "pdfminer-six", "pexpect", "pgvector", "pinecone-client", "pinecone-text", "psycopg2-binary", "pymongo", "pyowm", "pypdf", "pytesseract", "pyvespa", "qdrant-client", "redis", "requests-toolbelt", "sentence-transformers", "singlestoredb", "spacy", "steamship", "tensorflow-text", "tigrisdb", "tiktoken", "torch", "transformers", "weaviate-client", "wikipedia", "wolframalpha"] +azure = ["azure-ai-formrecognizer", "azure-ai-vision", "azure-cognitiveservices-speech", "azure-core", "azure-cosmos", "azure-identity", "azure-search-documents", "openai"] cohere = ["cohere"] docarray = ["docarray"] embeddings = ["sentence-transformers"] -extended-testing = ["beautifulsoup4", "bibtexparser", "chardet", "jq", "pdfminer-six", "pypdf", "pymupdf", "pypdfium2", "tqdm", "lxml", "atlassian-python-api", "beautifulsoup4", "pandas", "telethon", "psychicapi", "zep-python", "gql", "requests-toolbelt", "html2text", "py-trello", "scikit-learn", "pyspark", "openai"] -llms 
= ["anthropic", "cohere", "openai", "openlm", "nlpcloud", "huggingface_hub", "manifest-ml", "torch", "transformers"] +extended-testing = ["atlassian-python-api", "beautifulsoup4", "beautifulsoup4", "bibtexparser", "chardet", "gql", "html2text", "jq", "lxml", "openai", "pandas", "pdfminer-six", "psychicapi", "py-trello", "pymupdf", "pypdf", "pypdfium2", "pyspark", "requests-toolbelt", "scikit-learn", "telethon", "tqdm", "zep-python"] +llms = ["anthropic", "cohere", "huggingface_hub", "manifest-ml", "nlpcloud", "openai", "openlm", "torch", "transformers"] openai = ["openai", "tiktoken"] qdrant = ["qdrant-client"] text-helpers = ["chardet"] @@ -11486,4 +11486,4 @@ text-helpers = ["chardet"] [metadata] lock-version = "2.0" python-versions = ">=3.8.1,<4.0" -content-hash = "17e9c7a2ae2d0ef7cf45bc232ebeb7fd3eee2760bb2a19b34a63dcddafd3e4ad" +content-hash = "b4a782d8223ccc19b2dfb777978c3ad636b11a79cc58a5c45e4dcdb0fe5e29c1" diff --git a/pyproject.toml b/pyproject.toml index 95d40530806a7..5cd8a6dd30e27 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -105,7 +105,7 @@ singlestoredb = {version = "^0.6.1", optional = true} pyspark = {version = "^3.4.0", optional = true} tigrisdb = {version = "^1.0.0b6", optional = true} nebula3-python = {version = "^3.4.0", optional = true} -langchainplus-sdk = ">=0.0.7" +langchainplus-sdk = ">=0.0.9" awadb = {version = "^0.3.2", optional = true} azure-search-documents = {version = "11.4.0a20230509004", source = "azure-sdk-dev", optional = true} diff --git a/tests/unit_tests/callbacks/test_callback_manager.py b/tests/unit_tests/callbacks/test_callback_manager.py index 6a21598574147..2fb52165e79c8 100644 --- a/tests/unit_tests/callbacks/test_callback_manager.py +++ b/tests/unit_tests/callbacks/test_callback_manager.py @@ -203,8 +203,10 @@ def test_callback_manager_inheritance() -> None: assert child_manager2.inheritable_handlers == [handler1] -def test_callback_manager_configure() -> None: +def test_callback_manager_configure(monkeypatch: pytest.MonkeyPatch) -> None: """Test callback manager configuration.""" + monkeypatch.setenv("LANGCHAIN_TRACING_V2", "false") + monkeypatch.setenv("LANGCHAIN_TRACING", "false") handler1, handler2, handler3, handler4 = ( FakeCallbackHandler(), FakeCallbackHandler(), diff --git a/tests/unit_tests/callbacks/test_schemas.py b/tests/unit_tests/callbacks/test_schemas.py new file mode 100644 index 0000000000000..34937d7497162 --- /dev/null +++ b/tests/unit_tests/callbacks/test_schemas.py @@ -0,0 +1,27 @@ +import langchain.callbacks.tracers.schemas as schemas +from langchain.callbacks.tracers.schemas import __all__ as schemas_all + + +def test_public_api() -> None: + """Test for changes in the public API.""" + expected_all = [ + "BaseRun", + "ChainRun", + "LLMRun", + "Run", + "RunTypeEnum", + "ToolRun", + "TracerSession", + "TracerSessionBase", + "TracerSessionV1", + "TracerSessionV1Base", + "TracerSessionV1Create", + ] + + assert sorted(schemas_all) == expected_all + + # Assert that the object is actually present in the schema module + for module_name in expected_all: + assert ( + hasattr(schemas, module_name) and getattr(schemas, module_name) is not None + ) From 8fdf88b8e3da9a5744b7a13afa99b16529438a31 Mon Sep 17 00:00:00 2001 From: Keshav Kumar Date: Tue, 13 Jun 2023 21:07:07 +0530 Subject: [PATCH 11/11] Fix for ModuleNotFoundError while running langchain-server. 
Issue #5833 (#6077) This PR fixes the error `ModuleNotFoundError: No module named 'langchain.cli'` Fixes https://github.com/hwchase17/langchain/issues/5833 (issue) --- langchain/server.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/langchain/server.py b/langchain/server.py index 750567ed4c012..b877f328236f1 100644 --- a/langchain/server.py +++ b/langchain/server.py @@ -2,7 +2,7 @@ import subprocess from pathlib import Path -from langchain.cli.main import get_docker_compose_command +from langchainplus_sdk.cli.main import get_docker_compose_command def main() -> None: