Skip to content

Commit

Permalink
change client to UnstructuredClient, add os.getenv(), and update jupy…
Browse files Browse the repository at this point in the history
…ter notebook
  • Loading branch information
Coniferish committed Jul 24, 2024
1 parent 26c60a9 commit f7064bc
Show file tree
Hide file tree
Showing 3 changed files with 52 additions and 64 deletions.
62 changes: 31 additions & 31 deletions docs/docs/integrations/document_loaders/unstructured_file.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -58,29 +58,16 @@
},
{
"cell_type": "code",
"execution_count": 4,
"execution_count": null,
"id": "79d3e549",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'Madam Speaker, Madam Vice President, our First Lady and Second Gentleman. Members of Congress and the Cabinet. Justices of the Supreme Court. My fellow Americans.'"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"outputs": [],
"source": [
"from langchain_unstructured import UnstructuredLoader\n",
"\n",
"loader = UnstructuredLoader(\"./example_data/state_of_the_union.txt\")\n",
"\n",
"docs = loader.load()\n",
"\n",
"docs[0].page_content[:400]"
"docs = loader.load()"
]
},
{
Expand Down Expand Up @@ -254,7 +241,11 @@
"source": [
"# Install package\n",
"%pip install \"langchain-unstructured\"\n",
"%pip install \"unstructured-client\""
"%pip install \"unstructured-client\"\n",
"\n",
"# Set API key\n",
"import os\n",
"os.environ['UNSTRUCTURED_API_KEY'] = 'FAKE_API_KEY'"
]
},
{
Expand Down Expand Up @@ -289,9 +280,8 @@
"\n",
"loader = UnstructuredLoader(\n",
" file_path=\"example_data/fake.docx\",\n",
" api_key=\"FAKE_API_KEY\",\n",
" api_key=os.getenv('UNSTRUCTURED_API_KEY'),\n",
" partition_via_api=True,\n",
" url=\"https://api.unstructuredapp.io/general/v0/general\",\n",
")\n",
"\n",
"docs = loader.load()\n",
Expand Down Expand Up @@ -338,9 +328,8 @@
"source": [
"loader = UnstructuredLoader(\n",
" file_path=[\"example_data/fake.docx\", \"example_data/fake-email.eml\"],\n",
" api_key=\"FAKE_API_KEY\",\n",
" api_key=os.getenv('UNSTRUCTURED_API_KEY'),\n",
" partition_via_api=True,\n",
" url=\"https://api.unstructuredapp.io/general/v0/general\",\n",
")\n",
"\n",
"docs = loader.load()\n",
Expand All @@ -359,13 +348,13 @@
"Partitioning with the Unstructured API relies on the [Unstructured SDK\n",
"Client](https://docs.unstructured.io/api-reference/api-services/sdk).\n",
"\n",
"Below is an example showing how you can customize the client by using your own `requests.Session()`\n",
"and a `RetryConfig` object for more control over how failed requests are handled."
"Below is an example showing how you can customize the client: you can use your own\n",
"`requests.Session()`, pass in an alternative `server_url`, or supply a custom `RetryConfig` object for more control over how failed requests are handled."
]
},
{
"cell_type": "code",
"execution_count": 13,
"execution_count": 11,
"id": "58e55264",
"metadata": {},
"outputs": [
Expand Down Expand Up @@ -403,14 +392,13 @@
"source": [
"import requests\n",
"from langchain_unstructured import UnstructuredLoader\n",
"from unstructured_client.utils import BackoffStrategy, RetryConfig\n",
"from unstructured_client import UnstructuredClient\n",
"from unstructured_client.utils import RetryConfig, BackoffStrategy\n",
"\n",
"loader = UnstructuredLoader(\n",
" \"./example_data/layout-parser-paper.pdf\",\n",
" api_key=\"FAKE_API_KEY\",\n",
" partition_via_api=True,\n",
" url=\"https://api.unstructuredapp.io/general/v0/general\",\n",
"client = UnstructuredClient(\n",
" api_key_auth=os.getenv('UNSTRUCTURED_API_KEY'), # Note: the client API param is \"api_key_auth\" instead of \"api_key\"\n",
" client=requests.Session(),\n",
" server_url=\"https://api.unstructuredapp.io/general/v0/general\",\n",
" retry_config=RetryConfig(\n",
" strategy=\"backoff\",\n",
" retry_connection_errors=True,\n",
Expand All @@ -420,7 +408,12 @@
" exponent=1.5,\n",
" max_elapsed_time=900000,\n",
" ),\n",
" ),\n",
" ),)\n",
"\n",
"loader = UnstructuredLoader(\n",
" \"./example_data/layout-parser-paper.pdf\",\n",
" partition_via_api=True,\n",
" client=client,\n",
")\n",
"\n",
"docs = loader.load()\n",
Expand Down Expand Up @@ -457,6 +450,13 @@
"id": "e9f1c20d",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"WARNING: Partitioning locally even though api_key is defined since partition_via_api=False.\n"
]
},
{
"name": "stdout",
"output_type": "stream",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@

import logging
from abc import ABC, abstractmethod
import os
from pathlib import Path
from typing import (
IO,
Expand Down Expand Up @@ -330,7 +331,7 @@ def __init__(

self.file_path = file_path
self.url = url
self.api_key = api_key
self.api_key = os.getenv("UNSTRUCTURED_API_KEY") or api_key

super().__init__(mode=mode, **unstructured_kwargs)

Expand Down Expand Up @@ -483,7 +484,7 @@ def __init__(

self.file = file
self.url = url
self.api_key = api_key
self.api_key = os.getenv("UNSTRUCTURED_API_KEY") or api_key

super().__init__(mode=mode, **unstructured_kwargs)

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4,14 +4,16 @@

import json
import logging
import os
from pathlib import Path
from typing import IO, Any, Callable, Iterator, Optional, cast
from typing import IO, Any, Callable, Iterator, Optional, TypeAlias, cast

from langchain_core.document_loaders.base import BaseLoader
from langchain_core.documents import Document
from unstructured_client import UnstructuredClient # type: ignore
from unstructured_client.models import operations, shared # type: ignore
from unstructured_client.utils import RetryConfig # type: ignore

Element: TypeAlias = Any

logger = logging.getLogger(__file__)

Expand Down Expand Up @@ -83,10 +85,7 @@ def __init__(
# SDK parameters
api_key: Optional[str] = None,
client: Optional[UnstructuredClient] = None,
retry_config: Optional[RetryConfig] = None,
server: Optional[str] = None,
url: Optional[str] = "https://api.unstructuredapp.io/general/v0/general",
url_params: Optional[dict[str, str]] = None,
**unstructured_kwargs: Any,
):
"""Initialize loader."""
Expand All @@ -95,12 +94,10 @@ def __init__(
self.partition_via_api = partition_via_api
self.post_processors = post_processors
# SDK parameters
self.api_key = api_key
self.api_key = os.getenv("UNSTRUCTURED_API_KEY") or api_key
self.client = client
self.retry_config = retry_config
self.server = server
self.url = url
self.url_params = url_params

self.unstructured_kwargs = unstructured_kwargs

def lazy_load(self) -> Iterator[Document]:
Expand All @@ -118,10 +115,7 @@ def load_file(
# SDK parameters
api_key=self.api_key,
client=self.client,
retry_config=self.retry_config,
server=self.server,
url=self.url,
url_params=self.url_params,
**self.unstructured_kwargs,
).lazy_load()

Expand Down Expand Up @@ -156,10 +150,7 @@ def __init__(
# SDK parameters
api_key: Optional[str] = None,
client: Optional[UnstructuredClient] = None,
retry_config: Optional[RetryConfig] = None,
server: Optional[str] = None,
url: Optional[str] = "https://api.unstructuredapp.io/general/v0/general",
url_params: Optional[dict[str, str]] = None,
**unstructured_kwargs: Any,
):
"""Initialize loader."""
Expand All @@ -169,21 +160,15 @@ def __init__(
self.post_processors = post_processors
# SDK parameters
self.api_key = api_key
self.client = (
client
if client is not None
else UnstructuredClient(
self.url = url
if client is not None:
self.client = client
elif self.api_key is not None:
self.client = UnstructuredClient(
api_key_auth=self.api_key,
retry_config=self.retry_config, # type: ignore[has-type]
server=self.server, # type: ignore[has-type]
server_url=self.url, # type: ignore[has-type]
url_params=self.url_params, # type: ignore[has-type]
server_url=self.url,
)
)
self.retry_config = retry_config
self.server = server
self.url = url
self.url_params = url_params

self.unstructured_kwargs = unstructured_kwargs

def lazy_load(self) -> Iterator[Document]:
Expand Down Expand Up @@ -222,7 +207,7 @@ def _elements_json(self) -> list[dict[str, Any]]:
return self._convert_elements_to_dicts(self._elements_via_local)

@property
def _elements_via_local(self) -> list[Any]:
def _elements_via_local(self) -> list[Element]:
try:
from unstructured.partition.auto import partition # type: ignore
except ImportError:
Expand All @@ -239,7 +224,7 @@ def _elements_via_local(self) -> list[Any]:

return partition(
file=self.file, filename=self.file_path, **self.unstructured_kwargs
)
) # type: ignore

@property
def _elements_via_api(self) -> list[dict[str, Any]]:
Expand Down Expand Up @@ -274,7 +259,9 @@ def _sdk_partition_request(self) -> operations.PartitionRequest:
),
)

def _convert_elements_to_dicts(self, elements: list[Any]) -> list[dict[str, Any]]:
def _convert_elements_to_dicts(
self, elements: list[Element]
) -> list[dict[str, Any]]:
return [element.to_dict() for element in elements]

def _get_metadata(self) -> dict[str, Any]:
Expand Down

0 comments on commit f7064bc

Please sign in to comment.