UnstructuredReader fixes V2. #14946

Merged
Changes from 3 commits
@@ -1,4 +1,5 @@
-"""Unstructured file reader.
+"""
+Unstructured file reader.
 
 A parser for unstructured text files using Unstructured.io.
 Supports .csv, .tsv, .doc, .docx, .odt, .epub, .org, .rst, .rtf,
@@ -31,7 +32,8 @@ def __init__(
         allowed_metadata_types: Optional[Tuple] = None,
         excluded_metadata_keys: Optional[Set] = None,
     ) -> None:
-        """Initialize UnstructuredReader.
+        """
+        Initialize UnstructuredReader.
 
         Args:
             *args (Any): Additional arguments passed to the BaseReader.
@@ -79,7 +81,8 @@ def load_data(
         split_documents: Optional[bool] = False,
         excluded_metadata_keys: Optional[List[str]] = None,
     ) -> List[Document]:
-        """Load data using Unstructured.io.
+        """
+        Load data using Unstructured.io.
 
         Depending on the configuration, if url is set or use_api is True,
         it'll parse the file using an API call, otherwise it parses it locally.
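For context on the local-parsing path described in this docstring, a minimal usage sketch. The import path, file name, and excluded key below are illustrative assumptions, not taken from this diff:

from pathlib import Path

# Assumed import location for the reader; adjust to the installed package.
from llama_index.readers.file import UnstructuredReader

reader = UnstructuredReader()

# Local parsing: pass a path; with split_documents=True each partitioned
# element becomes its own Document.
documents = reader.load_data(
    file=Path("example.rst"),                   # illustrative file name
    split_documents=True,
    excluded_metadata_keys=["orig_elements"],   # illustrative key to drop
)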
@@ -98,6 +101,14 @@
         """
         unstructured_kwargs = unstructured_kwargs.copy() if unstructured_kwargs else {}
 
+        if (
+            unstructured_kwargs.get("file") is not None
+            and unstructured_kwargs.get("metadata_filename") is None
+        ):
+            raise ValueError(
+                "Please provide a 'metadata_filename' as part of the 'unstructured_kwargs' when loading a file stream."
+            )
+
         elements: List[Element] = self._partition_elements(unstructured_kwargs, file)

         return self._create_documents(
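The guard added above rejects in-memory file streams that arrive without a name; the reader's documents also take their doc_id from element.metadata.filename, which would otherwise be empty for a nameless stream. A sketch of the stream path, again with an assumed import path and an illustrative file name:

from llama_index.readers.file import UnstructuredReader  # assumed import path

reader = UnstructuredReader()

with open("example.docx", "rb") as stream:  # illustrative file
    documents = reader.load_data(
        unstructured_kwargs={
            "file": stream,                       # file-like object, no path
            "metadata_filename": "example.docx",  # now required for streams
        }
    )

# Passing "file" without "metadata_filename" raises the ValueError added above.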
@@ -111,7 +122,8 @@ def load_data(
     def _partition_elements(
         self, unstructured_kwargs: Dict, file: Optional[Path] = None
     ) -> List[Element]:
-        """Partition the elements from the file or via API.
+        """
+        Partition the elements from the file or via API.
 
         Args:
             file (Optional[Path]): Path to the file to be loaded.
@@ -144,7 +156,8 @@ def _create_documents(
         split_documents: Optional[bool],
         excluded_metadata_keys: Optional[List[str]],
     ) -> List[Document]:
-        """Create documents from partitioned elements.
+        """
+        Create documents from partitioned elements.
 
         Args:
             elements (List): List of partitioned elements.
@@ -156,47 +169,49 @@ def _create_documents(
         Returns:
             List[Document]: List of parsed documents.
         """
-        document_kwargs = document_kwargs.copy() if document_kwargs else {}
-
-        docs: List[Document] = []
-
-        if split_documents:
-            for sequence_number, element in enumerate(elements):
-                kwargs = document_kwargs.copy()
-                kwargs["text"] = element.text
-                excluded_keys = set(
-                    excluded_metadata_keys or self.excluded_metadata_keys
-                )
-                metadata = extra_info.copy() if extra_info else {}
-                for key, value in element.metadata.to_dict().items():
-                    if key not in excluded_keys:
-                        metadata[key] = (
-                            value
-                            if isinstance(value, self.allowed_metadata_types)
-                            else json.dumps(value)
-                        )
-
-                kwargs["extra_info"] = metadata
-                kwargs["doc_id"] = element.id_to_hash(sequence_number)
-
-                docs.append(Document(**kwargs))
-        else:
-            text_chunks = [" ".join(str(el).split()) for el in elements]
-            text = "\n\n".join(text_chunks)
-
-            kwargs = document_kwargs.copy()
-            kwargs["text"] = text
-
-            metadata = extra_info.copy() if extra_info else {}
-
-            if len(elements) > 0:
-                filename = elements[0].metadata.filename
-                elements[0].id
-                metadata["filename"] = filename
-                kwargs["doc_id"] = f"{filename}"
-
-            kwargs["extra_info"] = metadata
-
-            docs.append(Document(**kwargs))
+        doc_kwargs = document_kwargs or {}
+        doc_extras = extra_info or {}
+        excluded_keys = set(excluded_metadata_keys or self.excluded_metadata_keys)
+        docs: List[Document] = []
+
+        def _merge_metadata(element: Element, sequence_number: Optional[int] = None):
+            candidate_metadata = {**element.metadata.to_dict(), **doc_extras}
+            metadata = {
+                key: (
+                    value
+                    if isinstance(value, self.allowed_metadata_types)
+                    else json.dumps(value)
+                )
+                for key, value in candidate_metadata.items()
+                if key not in excluded_keys
+            }
+            if sequence_number is not None:
+                metadata["sequence_number"] = sequence_number
+            return metadata
+
+        if split_documents:
+            docs = [
+                Document(
+                    text=element.text,
+                    extra_info=_merge_metadata(element, sequence_number),
+                    doc_id=element.metadata.filename,
+                    **doc_kwargs,
+                )
+                for sequence_number, element in enumerate(elements)
+            ]
+        else:
+            if len(elements) == 0:
+                docs = []
+            else:
+                text_chunks = [" ".join(str(el).split()) for el in elements]
+                metadata = _merge_metadata(elements[0])
+                docs = [
+                    Document(
+                        text="\n\n".join(text_chunks),
+                        extra_info=metadata,
+                        doc_id=metadata.get("filename"),
+                        **doc_kwargs,
+                    )
+                ]
 
         return docs
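To make the behaviour of the new _merge_metadata helper concrete, here is a small standalone sketch of the same filter-and-JSON-encode logic. The allowed types, excluded key, and sample metadata are hypothetical stand-ins for the reader's configured values:

import json

ALLOWED_TYPES = (str, int, float, type(None))   # stand-in for allowed_metadata_types
EXCLUDED_KEYS = {"coordinates"}                  # stand-in for excluded_metadata_keys

def merge_metadata(element_metadata: dict, extra_info: dict) -> dict:
    # extra_info wins on key collisions, matching {**element.metadata.to_dict(), **doc_extras}.
    candidate = {**element_metadata, **extra_info}
    return {
        key: value if isinstance(value, ALLOWED_TYPES) else json.dumps(value)
        for key, value in candidate.items()
        if key not in EXCLUDED_KEYS
    }

merged = merge_metadata(
    {"filename": "example.pdf", "coordinates": {"x": 1}, "languages": ["eng"]},
    {"source": "unit-test"},
)
# -> {"filename": "example.pdf", "languages": '["eng"]', "source": "unit-test"}
print(merged)

Values whose type is not in the allowed tuple are serialized with json.dumps, presumably so that document metadata stays limited to flat, storable types.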