Skip to content

Commit

Permalink
feat: add --flatten-metadata to unstructured-ingest (Unstructured-IO#389)
Browse files Browse the repository at this point in the history

* added --flatten-metadata to unstructured-ingest

* added unit tests for process_file()
  • Loading branch information
natygyoon authored Mar 22, 2023
1 parent 66a0369 commit a4394f6
Show file tree
Hide file tree
Showing 11 changed files with 80 additions and 12 deletions.
3 changes: 2 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,11 +1,12 @@
## 0.5.7-dev1
## 0.5.7-dev2

### Enhancements

* Refactored codebase using `exactly_one`

### Features

* Add `--flatten-metadata` parameter to `unstructured-ingest`
* Add `--fields-include` parameter to `unstructured-ingest`

### Fixes
Expand Down
52 changes: 42 additions & 10 deletions test_unstructured_ingest/test_interfaces.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,8 +26,7 @@ def test_process_file_metadata_include_filename(filename: str):
isd_elems = ingest_doc.process_file()

for elem in isd_elems:
for k in elem["metadata"]:
assert k == "filename"
assert set(elem["metadata"].keys()) == {"filename"}


@pytest.mark.parametrize("filename", test_files)
Expand All @@ -42,8 +41,7 @@ def test_process_file_metadata_include_filename_pagenum(filename: str):
isd_elems = ingest_doc.process_file()

for elem in isd_elems:
for k in elem["metadata"]:
assert k in ["filename", "page_number"]
assert set(elem["metadata"].keys()) == {"filename", "page_number"}


@pytest.mark.parametrize("filename", test_files)
Expand All @@ -58,8 +56,7 @@ def test_process_file_metadata_exclude_filename(filename: str):
isd_elems = ingest_doc.process_file()

for elem in isd_elems:
for k in elem["metadata"]:
assert k != "filename"
assert "filename" not in elem["metadata"].keys()


@pytest.mark.parametrize("filename", test_files)
Expand All @@ -74,8 +71,8 @@ def test_process_file_metadata_exclude_filename_pagenum(filename: str):
isd_elems = ingest_doc.process_file()

for elem in isd_elems:
for k in elem["metadata"]:
assert k not in ["filename", "page_number"]
assert "filename" not in elem["metadata"].keys()
assert "page_number" not in elem["metadata"].keys()


@pytest.mark.parametrize("filename", test_files)
Expand All @@ -87,7 +84,9 @@ def test_process_file_fields_include_default(filename: str):
),
)
isd_elems = ingest_doc.process_file()
assert set("element_id", "text", "type", "metadata") == set(elem.keys())

for elem in isd_elems:
assert {"element_id", "text", "type", "metadata"} == set(elem.keys())


@pytest.mark.parametrize("filename", test_files)
Expand All @@ -100,5 +99,38 @@ def test_process_file_fields_include_elementid(filename: str):
),
)
isd_elems = ingest_doc.process_file()
assert set("element_id") == set(elem.keys())

for elem in isd_elems:
assert {"element_id"} == set(elem.keys())


@pytest.mark.parametrize("filename", test_files)
def test_process_file_flatten_metadata_filename(filename: str):
    """With flattening on and only `filename` kept, it becomes a top-level key."""
    config = SimpleGitConfig(
        download_dir=EXAMPLE_DOCS_DIRECTORY,
        metadata_include="filename",
        flatten_metadata=True,
    )
    doc = GitIngestDoc(path=filename, config=config)
    expected_keys = {"element_id", "text", "type", "filename"}

    for element in doc.process_file():
        # No `metadata` key is expected: its entries replace it at the top level.
        assert expected_keys == set(element.keys())


@pytest.mark.parametrize("filename", test_files)
def test_process_file_flatten_metadata_filename_pagenum(filename: str):
    """With flattening on, `filename` and `page_number` both become top-level keys."""
    config = SimpleGitConfig(
        download_dir=EXAMPLE_DOCS_DIRECTORY,
        metadata_include="filename,page_number",
        flatten_metadata=True,
    )
    doc = GitIngestDoc(path=filename, config=config)
    expected_keys = {"element_id", "text", "type", "filename", "page_number"}

    for element in doc.process_file():
        # No `metadata` key is expected: its entries replace it at the top level.
        assert expected_keys == set(element.keys())
2 changes: 1 addition & 1 deletion unstructured/__version__.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = "0.5.7-dev1" # pragma: no cover
__version__ = "0.5.7-dev2" # pragma: no cover
1 change: 1 addition & 0 deletions unstructured/ingest/connector/biomed.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,7 @@ class SimpleBiomedConfig(BaseConnectorConfig):
metadata_include: Optional[str] = None
metadata_exclude: Optional[str] = None
fields_include: str = "element_id,text,type,metadata"
flatten_metadata: bool = False

def _validate_date_args(self, date):
date_formats = ["%Y-%m-%d", "%Y-%m-%d+%H:%M:%S"]
Expand Down
1 change: 1 addition & 0 deletions unstructured/ingest/connector/fsspec.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@ class SimpleFsspecConfig(BaseConnectorConfig):
metadata_include: Optional[str] = None
metadata_exclude: Optional[str] = None
fields_include: str = "element_id,text,type,metadata"
flatten_metadata: bool = False

# fsspec specific options
access_kwargs: dict = field(default_factory=dict)
Expand Down
1 change: 1 addition & 0 deletions unstructured/ingest/connector/git.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@ class SimpleGitConfig(BaseConnectorConfig):
metadata_include: Optional[str] = None
metadata_exclude: Optional[str] = None
fields_include: str = "element_id,text,type,metadata"
flatten_metadata: bool = False

repo_path: str = field(init=False, repr=False)

Expand Down
1 change: 1 addition & 0 deletions unstructured/ingest/connector/google_drive.py
Original file line number Diff line number Diff line change
Expand Up @@ -80,6 +80,7 @@ class SimpleGoogleDriveConfig(BaseConnectorConfig):
metadata_include: Optional[str] = None
metadata_exclude: Optional[str] = None
fields_include: str = "element_id,text,type,metadata"
flatten_metadata: bool = False

recursive: bool = False

Expand Down
1 change: 1 addition & 0 deletions unstructured/ingest/connector/reddit.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@ class SimpleRedditConfig(BaseConnectorConfig):
metadata_include: Optional[str] = None
metadata_exclude: Optional[str] = None
fields_include: str = "element_id,text,type,metadata"
flatten_metadata: bool = False

def __post_init__(self):
if self.num_posts <= 0:
Expand Down
1 change: 1 addition & 0 deletions unstructured/ingest/connector/wikipedia.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@ class SimpleWikipediaConfig(BaseConnectorConfig):
metadata_include: Optional[str] = None
metadata_exclude: Optional[str] = None
fields_include: str = "element_id,text,type,metadata"
flatten_metadata: bool = False


@dataclass
Expand Down
6 changes: 6 additions & 0 deletions unstructured/ingest/interfaces.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,7 @@ class BaseConnectorConfig(ABC):
metadata_include: Optional[str] = None
metadata_exclude: Optional[str] = None
fields_include: str = "element_id,text,type,metadata"
flatten_metadata: bool = False


class BaseIngestDoc(ABC):
Expand Down Expand Up @@ -121,6 +122,11 @@ def process_file(self):
in_list = self.config.fields_include.split(",")
elem = {k: v for k, v in elem.items() if k in in_list}

if self.config.flatten_metadata:
for k, v in elem["metadata"].items(): # type: ignore[attr-defined]
elem[k] = v
elem.pop("metadata") # type: ignore[attr-defined]

self.isd_elems_no_filename.append(elem)

return self.isd_elems_no_filename
23 changes: 23 additions & 0 deletions unstructured/ingest/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -103,6 +103,14 @@ def run(self):


@click.command()
@click.option(
"--flatten-metadata",
is_flag=True,
default=False,
help="Results in flattened json elements. "
"Specifically, the metadata key values are brought to the top-level of the element, "
"and the `metadata` key itself is removed.",
)
@click.option(
"--fields-include",
default="element_id,text,type,metadata",
Expand Down Expand Up @@ -345,7 +353,13 @@ def main(
metadata_include,
metadata_exclude,
fields_include,
flatten_metadata,
):
if flatten_metadata and "metadata" not in fields_include:
logger.warning(
"`--flatten-metadata` is specified, but there is no metadata to flatten, "
"since `metadata` is not specified in `--fields-include`.",
)
if "metadata" not in fields_include and (metadata_include or metadata_exclude):
logger.warning(
"Either `--metadata-include` or `--metadata-exclude` is specified"
Expand Down Expand Up @@ -428,6 +442,7 @@ def main(
metadata_include=metadata_include,
metadata_exclude=metadata_exclude,
fields_include=fields_include,
flatten_metadata=flatten_metadata,
),
)
elif protocol in ("abfs", "az"):
Expand All @@ -451,6 +466,7 @@ def main(
metadata_include=metadata_include,
metadata_exclude=metadata_exclude,
fields_include=fields_include,
flatten_metadata=flatten_metadata,
),
)
else:
Expand All @@ -470,6 +486,7 @@ def main(
metadata_include=metadata_include,
metadata_exclude=metadata_exclude,
fields_include=fields_include,
flatten_metadata=flatten_metadata,
),
)
elif github_url:
Expand All @@ -487,6 +504,7 @@ def main(
metadata_include=metadata_include,
metadata_exclude=metadata_exclude,
fields_include=fields_include,
flatten_metadata=flatten_metadata,
),
)
elif gitlab_url:
Expand All @@ -504,6 +522,7 @@ def main(
metadata_include=metadata_include,
metadata_exclude=metadata_exclude,
fields_include=fields_include,
flatten_metadata=flatten_metadata,
),
)
elif subreddit_name:
Expand All @@ -523,6 +542,7 @@ def main(
metadata_include=metadata_include,
metadata_exclude=metadata_exclude,
fields_include=fields_include,
flatten_metadata=flatten_metadata,
),
)
elif wikipedia_page_title:
Expand All @@ -538,6 +558,7 @@ def main(
metadata_include=metadata_include,
metadata_exclude=metadata_exclude,
fields_include=fields_include,
flatten_metadata=flatten_metadata,
),
)
elif drive_id:
Expand All @@ -555,6 +576,7 @@ def main(
metadata_include=metadata_include,
metadata_exclude=metadata_exclude,
fields_include=fields_include,
flatten_metadata=flatten_metadata,
),
)
elif biomed_path or biomed_api_id or biomed_api_from or biomed_api_until:
Expand All @@ -572,6 +594,7 @@ def main(
metadata_include=metadata_include,
metadata_exclude=metadata_exclude,
fields_include=fields_include,
flatten_metadata=flatten_metadata,
),
)
# Check for other connector-specific options here and define the doc_connector object
Expand Down

0 comments on commit a4394f6

Please sign in to comment.