Commit
Introduce Blob and Blob Loader interface (langchain-ai#3603)
This PR introduces a Blob data type and a Blob loader interface. It is the first of a sequence of PRs that follows this proposal: langchain-ai#2833

The primary goals of these abstractions are:

* Decouple content loading from content parsing code.
* Help deduplicate content loading code across document loaders.
* Make lazy loading the default for langchain.
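For orientation, here is a minimal, hypothetical sketch of how the two abstractions introduced below are meant to compose: a loader produces blobs lazily, and parsing code consumes blobs without knowing where the bytes came from. InMemoryBlobLoader and count_words are illustrative names and are not part of this commit; only Blob and BlobLoader are.

from typing import Iterable

from langchain.document_loaders.blob_loaders import Blob, BlobLoader


class InMemoryBlobLoader(BlobLoader):
    """Illustrative loader that hands out pre-built blobs one at a time."""

    def __init__(self, blobs: Iterable[Blob]) -> None:
        self.blobs = list(blobs)

    def yield_blobs(self) -> Iterable[Blob]:
        # Loading stays lazy: blobs are yielded on demand, not read up front.
        yield from self.blobs


def count_words(blob: Blob) -> int:
    """Stand-in for parsing code: it sees only the Blob, never the storage layer."""
    return len(blob.as_string().split())


loader = InMemoryBlobLoader([Blob(data="hello world"), Blob(data="one two three")])
word_counts = [count_words(blob) for blob in loader.yield_blobs()]  # [2, 3]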
Showing 5 changed files with 266 additions and 0 deletions.
3 changes: 3 additions & 0 deletions
langchain/document_loaders/blob_loaders/__init__.py
from langchain.document_loaders.blob_loaders.schema import Blob, BlobLoader

__all__ = ["BlobLoader", "Blob"]
156 changes: 156 additions & 0 deletions
langchain/document_loaders/blob_loaders/schema.py
"""Schema for Blobs and Blob Loaders. | ||
The goal is to facilitate decoupling of content loading from content parsing code. | ||
In addition, content loading code should provide a lazy loading interface by default. | ||
""" | ||
import contextlib | ||
import mimetypes | ||
from abc import ABC, abstractmethod | ||
from io import BufferedReader, BytesIO | ||
from pathlib import PurePath | ||
from typing import Generator, Iterable, Optional, Union | ||
|
||
from pydantic import BaseModel | ||
|
||
PathLike = Union[str, PurePath] | ||
|
||
|
||
class Blob(BaseModel): | ||
"""A blob is used to represent raw data by either reference or value. | ||
Provides an interface to materialize the blob in different representations, and | ||
help to decouple the development of data loaders from the downstream parsing of | ||
the raw data. | ||
Inspired by: https://developer.mozilla.org/en-US/docs/Web/API/Blob | ||
""" | ||
|
||
data: Union[bytes, str, None] # Raw data | ||
mimetype: Optional[str] = None # Not to be confused with a file extension | ||
encoding: str = "utf-8" # Use utf-8 as default encoding, if decoding to string | ||
# Location where the original content was found | ||
# Represent location on the local file system | ||
# Useful for situations where downstream code assumes it must work with file paths | ||
# rather than in-memory content. | ||
path: Optional[PathLike] = None | ||
|
||
class Config: | ||
arbitrary_types_allowed = True | ||
frozen = True | ||
|
||
@property | ||
def source(self) -> Optional[str]: | ||
"""The source location of the blob as string if known otherwise none.""" | ||
return str(self.path) if self.path else None | ||
|
||
def as_string(self) -> str: | ||
"""Read data as a string.""" | ||
if self.data is None and self.path: | ||
with open(str(self.path), "r", encoding=self.encoding) as f: | ||
return f.read() | ||
elif isinstance(self.data, bytes): | ||
return self.data.decode(self.encoding) | ||
elif isinstance(self.data, str): | ||
return self.data | ||
else: | ||
raise ValueError(f"Unable to get string for blob {self}") | ||
|
||
def as_bytes(self) -> bytes: | ||
"""Read data as bytes.""" | ||
if isinstance(self.data, bytes): | ||
return self.data | ||
elif isinstance(self.data, str): | ||
return self.data.encode(self.encoding) | ||
elif self.data is None and self.path: | ||
with open(str(self.path), "rb") as f: | ||
return f.read() | ||
else: | ||
raise ValueError(f"Unable to get bytes for blob {self}") | ||
|
||
@contextlib.contextmanager | ||
def as_bytes_io(self) -> Generator[Union[BytesIO, BufferedReader], None, None]: | ||
"""Read data as a byte stream.""" | ||
if isinstance(self.data, bytes): | ||
yield BytesIO(self.data) | ||
elif self.data is None and self.path: | ||
with open(str(self.path), "rb") as f: | ||
yield f | ||
else: | ||
raise NotImplementedError(f"Unable to convert blob {self}") | ||
|
||
@classmethod | ||
def from_path( | ||
cls, | ||
path: PathLike, | ||
*, | ||
encoding: str = "utf-8", | ||
mime_type: Optional[str] = None, | ||
guess_type: bool = True, | ||
) -> "Blob": | ||
"""Load the blob from a path like object. | ||
Args: | ||
path: path like object to file to be read | ||
encoding: Encoding to use if decoding the bytes into a string | ||
mime_type: if provided, will be set as the mime-type of the data | ||
guess_type: If True, the mimetype will be guessed from the file extension, | ||
if a mime-type was not provided | ||
Returns: | ||
Blob instance | ||
""" | ||
if mime_type is None and guess_type: | ||
_mimetype = mimetypes.guess_type(path)[0] if guess_type else None | ||
else: | ||
_mimetype = mime_type | ||
# We do not load the data immediately, instead we treat the blob as a | ||
# reference to the underlying data. | ||
return cls(data=None, mimetype=_mimetype, encoding=encoding, path=path) | ||
|
||
@classmethod | ||
def from_data( | ||
cls, | ||
data: Union[str, bytes], | ||
*, | ||
encoding: str = "utf-8", | ||
mime_type: Optional[str] = None, | ||
path: Optional[str] = None, | ||
) -> "Blob": | ||
"""Initialize the blob from in-memory data. | ||
Args: | ||
data: the in-memory data associated with the blob | ||
encoding: Encoding to use if decoding the bytes into a string | ||
mime_type: if provided, will be set as the mime-type of the data | ||
path: if provided, will be set as the source from which the data came | ||
Returns: | ||
Blob instance | ||
""" | ||
return cls(data=data, mime_type=mime_type, encoding=encoding, path=path) | ||
|
||
def __repr__(self) -> str: | ||
"""Define the blob representation.""" | ||
str_repr = f"Blob {id(self)}" | ||
if self.source: | ||
str_repr += f" {self.source}" | ||
return str_repr | ||
|
||
|
||
class BlobLoader(ABC): | ||
"""Abstract interface for blob loaders implementation. | ||
Implementer should be able to load raw content from a storage system according | ||
to some criteria and return the raw content lazily as a stream of blobs. | ||
""" | ||
|
||
@abstractmethod | ||
def yield_blobs( | ||
self, | ||
) -> Iterable[Blob]: | ||
"""A lazy loader for raw data represented by LangChain's Blob object. | ||
Returns: | ||
A generator over blobs | ||
""" |
tests/unit_tests/document_loader/blob_loaders/__init__.py
Empty file.
6 changes: 6 additions & 0 deletions
tests/unit_tests/document_loader/blob_loaders/test_public_api.py
from langchain.document_loaders.blob_loaders import __all__


def test_public_api() -> None:
    """Hard-code public API to help determine if we have broken it."""
    assert sorted(__all__) == ["Blob", "BlobLoader"]
101 changes: 101 additions & 0 deletions
tests/unit_tests/document_loader/blob_loaders/test_schema.py
import os
from contextlib import contextmanager
from pathlib import Path
from tempfile import NamedTemporaryFile
from typing import Generator, Iterable, Optional

import pytest

from langchain.document_loaders.blob_loaders.schema import Blob, BlobLoader, PathLike


@contextmanager
def get_temp_file(
    content: bytes, suffix: Optional[str] = None
) -> Generator[Path, None, None]:
    """Yield a temporary file with some content."""
    with NamedTemporaryFile(suffix=suffix, delete=False) as temp_file:
        temp_file.write(content)
        path = Path(temp_file.name)
    try:
        yield path
    finally:
        os.remove(str(path))


def test_blob_initialized_with_binary_data() -> None:
    """Test reading blob IO if blob content hasn't been read yet."""
    data = b"Hello, World!"
    blob = Blob(data=data)
    assert blob.as_string() == "Hello, World!"
    assert blob.as_bytes() == data
    assert blob.source is None
    with blob.as_bytes_io() as bytes_io:
        assert bytes_io.read() == data


def test_blob_from_pure_path() -> None:
    """Test reading blob from a file path."""
    content = b"Hello, World!"

    with get_temp_file(content, suffix=".html") as temp_path:
        assert isinstance(temp_path, Path)
        blob = Blob.from_path(temp_path)
        assert blob.encoding == "utf-8"  # Default encoding
        assert blob.path == temp_path
        assert blob.mimetype == "text/html"
        assert blob.source == str(temp_path)
        assert blob.data is None
        assert blob.as_bytes() == content
        assert blob.as_string() == "Hello, World!"
        with blob.as_bytes_io() as bytes_io:
            assert bytes_io.read() == content


def test_blob_from_str_path() -> None:
    """Test reading blob from a file path given as a string."""
    content = b"Hello, World!"

    with get_temp_file(content) as temp_path:
        str_path = str(temp_path)
        assert isinstance(str_path, str)
        blob = Blob.from_path(str_path)
        assert blob.encoding == "utf-8"  # Default encoding
        assert blob.path == str(temp_path)
        assert blob.source == str(temp_path)
        assert blob.data is None
        assert blob.as_bytes() == content
        assert blob.as_string() == "Hello, World!"
        with blob.as_bytes_io() as bytes_io:
            assert bytes_io.read() == content


@pytest.mark.parametrize(
    "path, mime_type, guess_type, expected_mime_type",
    [
        ("test.txt", None, True, "text/plain"),
        ("test.txt", None, False, None),
        ("test.html", None, True, "text/html"),
        ("test.html", None, False, None),
        ("test.html", "user_forced_value", True, "user_forced_value"),
        (Path("test.html"), "user_forced_value", True, "user_forced_value"),
        (Path("test.html"), None, True, "text/html"),
    ],
)
def test_mime_type_inference(
    path: PathLike, mime_type: str, guess_type: bool, expected_mime_type: Optional[str]
) -> None:
    """Tests mimetype inference based on options and path."""
    blob = Blob.from_path(path, mime_type=mime_type, guess_type=guess_type)
    assert blob.mimetype == expected_mime_type


def test_blob_loader() -> None:
    """Simple test that verifies that we can implement a blob loader."""

    class TestLoader(BlobLoader):
        def yield_blobs(self) -> Iterable[Blob]:
            """Yield blob implementation."""
            yield Blob(data=b"Hello, World!")

    assert list(TestLoader().yield_blobs()) == [Blob(data=b"Hello, World!")]