diff --git a/docs/docs/integrations/document_loaders/unstructured_file.ipynb b/docs/docs/integrations/document_loaders/unstructured_file.ipynb index 0692f7a8f75be..7e26874a68edc 100644 --- a/docs/docs/integrations/document_loaders/unstructured_file.ipynb +++ b/docs/docs/integrations/document_loaders/unstructured_file.ipynb @@ -58,29 +58,16 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": null, "id": "79d3e549", "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "'Madam Speaker, Madam Vice President, our First Lady and Second Gentleman. Members of Congress and the Cabinet. Justices of the Supreme Court. My fellow Americans.'" - ] - }, - "execution_count": 4, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "from langchain_unstructured import UnstructuredLoader\n", "\n", "loader = UnstructuredLoader(\"./example_data/state_of_the_union.txt\")\n", "\n", - "docs = loader.load()\n", - "\n", - "docs[0].page_content[:400]" + "docs = loader.load()" ] }, { @@ -254,7 +241,11 @@ "source": [ "# Install package\n", "%pip install \"langchain-unstructured\"\n", - "%pip install \"unstructured-client\"" + "%pip install \"unstructured-client\"\n", + "\n", + "# Set API key\n", + "import os\n", + "os.environ['UNSTRUCTURED_API_KEY'] = 'FAKE_API_KEY'" ] }, { @@ -289,9 +280,8 @@ "\n", "loader = UnstructuredLoader(\n", " file_path=\"example_data/fake.docx\",\n", - " api_key=\"FAKE_API_KEY\",\n", + " api_key=os.getenv('UNSTRUCTURED_API_KEY'),\n", " partition_via_api=True,\n", - " url=\"https://api.unstructuredapp.io/general/v0/general\",\n", ")\n", "\n", "docs = loader.load()\n", @@ -338,9 +328,8 @@ "source": [ "loader = UnstructuredLoader(\n", " file_path=[\"example_data/fake.docx\", \"example_data/fake-email.eml\"],\n", - " api_key=\"FAKE_API_KEY\",\n", + " api_key=os.getenv('UNSTRUCTURED_API_KEY'),\n", " partition_via_api=True,\n", - " url=\"https://api.unstructuredapp.io/general/v0/general\",\n", ")\n", "\n", "docs = loader.load()\n", @@ -359,13 +348,13 @@ "Partitioning with the Unstructured API relies on the [Unstructured SDK\n", "Client](https://docs.unstructured.io/api-reference/api-services/sdk).\n", "\n", - "Below is an example showing how you can customize the client by using your own `requests.Session()`\n", - "and a `RetryConfig` object for more control over how failed requests are handled." + "Below is an example showing how you can customize some features of the client and use your own\n", + "`requests.Session()`, pass in an alternative `server_url`, or customize the `RetryConfig` object for more control over how failed requests are handled." ] }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 11, "id": "58e55264", "metadata": {}, "outputs": [ @@ -403,14 +392,13 @@ "source": [ "import requests\n", "from langchain_unstructured import UnstructuredLoader\n", - "from unstructured_client.utils import BackoffStrategy, RetryConfig\n", + "from unstructured_client import UnstructuredClient\n", + "from unstructured_client.utils import RetryConfig, BackoffStrategy\n", "\n", - "loader = UnstructuredLoader(\n", - " \"./example_data/layout-parser-paper.pdf\",\n", - " api_key=\"FAKE_API_KEY\",\n", - " partition_via_api=True,\n", - " url=\"https://api.unstructuredapp.io/general/v0/general\",\n", + "client = UnstructuredClient(\n", + " api_key_auth=os.getenv('UNSTRUCTURED_API_KEY'), # Note: the client API param is \"api_key_auth\" instead of \"api_key\"\n", " client=requests.Session(),\n", + " server_url=\"https://api.unstructuredapp.io/general/v0/general\",\n", " retry_config=RetryConfig(\n", " strategy=\"backoff\",\n", " retry_connection_errors=True,\n", @@ -420,7 +408,12 @@ " exponent=1.5,\n", " max_elapsed_time=900000,\n", " ),\n", - " ),\n", + " ),)\n", + "\n", + "loader = UnstructuredLoader(\n", + " \"./example_data/layout-parser-paper.pdf\",\n", + " partition_via_api=True,\n", + " client=client,\n", ")\n", "\n", "docs = loader.load()\n", @@ -457,6 +450,13 @@ "id": "e9f1c20d", "metadata": {}, "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "WARNING: Partitioning locally even though api_key is defined since partition_via_api=False.\n" + ] + }, { "name": "stdout", "output_type": "stream", diff --git a/libs/community/langchain_community/document_loaders/unstructured.py b/libs/community/langchain_community/document_loaders/unstructured.py index c8643d0cdeb84..f6c41b1d6640a 100644 --- a/libs/community/langchain_community/document_loaders/unstructured.py +++ b/libs/community/langchain_community/document_loaders/unstructured.py @@ -4,6 +4,7 @@ import logging from abc import ABC, abstractmethod +import os from pathlib import Path from typing import ( IO, @@ -330,7 +331,7 @@ def __init__( self.file_path = file_path self.url = url - self.api_key = api_key + self.api_key = os.getenv("UNSTRUCTURED_API_KEY") or api_key super().__init__(mode=mode, **unstructured_kwargs) @@ -483,7 +484,7 @@ def __init__( self.file = file self.url = url - self.api_key = api_key + self.api_key = os.getenv("UNSTRUCTURED_API_KEY") or api_key super().__init__(mode=mode, **unstructured_kwargs) diff --git a/libs/partners/unstructured/langchain_unstructured/document_loaders.py b/libs/partners/unstructured/langchain_unstructured/document_loaders.py index 347e7a0c0f83c..c40937cf40388 100644 --- a/libs/partners/unstructured/langchain_unstructured/document_loaders.py +++ b/libs/partners/unstructured/langchain_unstructured/document_loaders.py @@ -4,14 +4,16 @@ import json import logging +import os from pathlib import Path -from typing import IO, Any, Callable, Iterator, Optional, cast +from typing import IO, Any, Callable, Iterator, Optional, TypeAlias, cast from langchain_core.document_loaders.base import BaseLoader from langchain_core.documents import Document from unstructured_client import UnstructuredClient # type: ignore from unstructured_client.models import operations, shared # type: ignore -from unstructured_client.utils import RetryConfig # type: ignore + +Element: TypeAlias = Any logger = logging.getLogger(__file__) @@ -83,10 +85,7 @@ def __init__( # SDK parameters api_key: Optional[str] = None, client: Optional[UnstructuredClient] = None, - retry_config: Optional[RetryConfig] = None, - server: Optional[str] = None, url: Optional[str] = "https://api.unstructuredapp.io/general/v0/general", - url_params: Optional[dict[str, str]] = None, **unstructured_kwargs: Any, ): """Initialize loader.""" @@ -95,12 +94,10 @@ def __init__( self.partition_via_api = partition_via_api self.post_processors = post_processors # SDK parameters - self.api_key = api_key + self.api_key = os.getenv("UNSTRUCTURED_API_KEY") or api_key self.client = client - self.retry_config = retry_config - self.server = server self.url = url - self.url_params = url_params + self.unstructured_kwargs = unstructured_kwargs def lazy_load(self) -> Iterator[Document]: @@ -118,10 +115,7 @@ def load_file( # SDK parameters api_key=self.api_key, client=self.client, - retry_config=self.retry_config, - server=self.server, url=self.url, - url_params=self.url_params, **self.unstructured_kwargs, ).lazy_load() @@ -156,10 +150,7 @@ def __init__( # SDK parameters api_key: Optional[str] = None, client: Optional[UnstructuredClient] = None, - retry_config: Optional[RetryConfig] = None, - server: Optional[str] = None, url: Optional[str] = "https://api.unstructuredapp.io/general/v0/general", - url_params: Optional[dict[str, str]] = None, **unstructured_kwargs: Any, ): """Initialize loader.""" @@ -169,21 +160,15 @@ def __init__( self.post_processors = post_processors # SDK parameters self.api_key = api_key - self.client = ( - client - if client is not None - else UnstructuredClient( + self.url = url + if client is not None: + self.client = client + elif self.api_key is not None: + self.client = UnstructuredClient( api_key_auth=self.api_key, - retry_config=self.retry_config, # type: ignore[has-type] - server=self.server, # type: ignore[has-type] - server_url=self.url, # type: ignore[has-type] - url_params=self.url_params, # type: ignore[has-type] + server_url=self.url, ) - ) - self.retry_config = retry_config - self.server = server - self.url = url - self.url_params = url_params + self.unstructured_kwargs = unstructured_kwargs def lazy_load(self) -> Iterator[Document]: @@ -222,7 +207,7 @@ def _elements_json(self) -> list[dict[str, Any]]: return self._convert_elements_to_dicts(self._elements_via_local) @property - def _elements_via_local(self) -> list[Any]: + def _elements_via_local(self) -> list[Element]: try: from unstructured.partition.auto import partition # type: ignore except ImportError: @@ -239,7 +224,7 @@ def _elements_via_local(self) -> list[Any]: return partition( file=self.file, filename=self.file_path, **self.unstructured_kwargs - ) + ) # type: ignore @property def _elements_via_api(self) -> list[dict[str, Any]]: @@ -274,7 +259,9 @@ def _sdk_partition_request(self) -> operations.PartitionRequest: ), ) - def _convert_elements_to_dicts(self, elements: list[Any]) -> list[dict[str, Any]]: + def _convert_elements_to_dicts( + self, elements: list[Element] + ) -> list[dict[str, Any]]: return [element.to_dict() for element in elements] def _get_metadata(self) -> dict[str, Any]: