Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: add --fields-include to unstructured-ingest #376

Merged
merged 52 commits into from
Mar 22, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
52 commits
Select commit Hold shift + click to select a range
3ef6265
added metadata in/exclude params
natygyoon Mar 15, 2023
a084a5c
updated process_file
natygyoon Mar 15, 2023
499280d
existing tests
natygyoon Mar 15, 2023
3b8d369
remove default behavior
natygyoon Mar 15, 2023
e8319f5
changelog and ci
natygyoon Mar 16, 2023
2d01c7a
line length
natygyoon Mar 16, 2023
d6ab499
import
natygyoon Mar 16, 2023
ef788f6
import
natygyoon Mar 16, 2023
297ff5f
import sorted
natygyoon Mar 16, 2023
780f7ff
import
natygyoon Mar 16, 2023
252654b
type
natygyoon Mar 16, 2023
59d7a00
line length
natygyoon Mar 16, 2023
b5393c3
version sync
natygyoon Mar 16, 2023
ef07f40
main
natygyoon Mar 16, 2023
1283a84
ci
natygyoon Mar 16, 2023
97d04e6
json
natygyoon Mar 16, 2023
dd0319a
dict
natygyoon Mar 16, 2023
bdd4a1e
type ignore
natygyoon Mar 16, 2023
79333a5
lint
natygyoon Mar 16, 2023
fc1db61
unit tests for process_file
natygyoon Mar 17, 2023
4f92b68
lint
natygyoon Mar 17, 2023
203e1b2
added --fields-include
natygyoon Mar 17, 2023
53b7aea
lint
natygyoon Mar 17, 2023
e35a04f
line length
natygyoon Mar 17, 2023
b5d7b3e
fix version
natygyoon Mar 17, 2023
3905681
type changed to Optional(str)
natygyoon Mar 20, 2023
c690284
ci
natygyoon Mar 20, 2023
5fab790
line length
natygyoon Mar 20, 2023
46b7550
merge `feat/metadata` into `feat/fields-include`
natygyoon Mar 20, 2023
c0f2ad0
merge conflict
natygyoon Mar 20, 2023
dbf7774
merge conflict
natygyoon Mar 20, 2023
ed5869d
line length
natygyoon Mar 20, 2023
45c9b86
type check
natygyoon Mar 20, 2023
84b831a
line length
natygyoon Mar 20, 2023
c0dbcbe
default
natygyoon Mar 20, 2023
a18d91d
subclass type
natygyoon Mar 20, 2023
7a131b7
fixed dict iter error
natygyoon Mar 20, 2023
054fdc4
Merge branch 'main' into feat/fields-include
natygyoon Mar 20, 2023
bc758bc
Merge branch 'main' into feat/fields-include
natygyoon Mar 21, 2023
6c42cf4
code refactor
natygyoon Mar 21, 2023
f1f90d2
added unit tests for fields_include
natygyoon Mar 21, 2023
245266b
version sync
natygyoon Mar 21, 2023
73f0f6d
Merge branch 'main' into feat/fields-include
natygyoon Mar 21, 2023
509457b
remove custom class
natygyoon Mar 21, 2023
33faf50
unit tests
natygyoon Mar 21, 2023
ec46090
changelog
natygyoon Mar 21, 2023
4e7da4f
version bump
natygyoon Mar 22, 2023
d98c8c2
nit
natygyoon Mar 22, 2023
7fd4ad5
Merge branch 'main' into feat/fields-include
natygyoon Mar 22, 2023
3b46996
remove duplicate
natygyoon Mar 22, 2023
840a1ae
nit
natygyoon Mar 22, 2023
f29bf65
Merge branch 'main' into feat/fields-include
natygyoon Mar 22, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,11 +1,13 @@
## 0.5.7-dev0
## 0.5.7-dev1

### Enhancements

* Refactored codebase using `exactly_one`

### Features

* Add `--fields-include` parameter to `unstructured-ingest`

### Fixes

## 0.5.6
Expand Down
34 changes: 30 additions & 4 deletions test_unstructured_ingest/test_interfaces.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@


@pytest.mark.parametrize("filename", test_files)
def test_process_file_include_filename(filename: str):
def test_process_file_metadata_include_filename(filename: str):
ingest_doc = GitIngestDoc(
path=filename,
config=SimpleGitConfig(
Expand All @@ -31,7 +31,7 @@ def test_process_file_include_filename(filename: str):


@pytest.mark.parametrize("filename", test_files)
def test_process_file_include_filename_pagenum(filename: str):
def test_process_file_metadata_include_filename_pagenum(filename: str):
ingest_doc = GitIngestDoc(
path=filename,
config=SimpleGitConfig(
Expand All @@ -47,7 +47,7 @@ def test_process_file_include_filename_pagenum(filename: str):


@pytest.mark.parametrize("filename", test_files)
def test_process_file_exclude_filename(filename: str):
def test_process_file_metadata_exclude_filename(filename: str):
ingest_doc = GitIngestDoc(
path=filename,
config=SimpleGitConfig(
Expand All @@ -63,7 +63,7 @@ def test_process_file_exclude_filename(filename: str):


@pytest.mark.parametrize("filename", test_files)
def test_process_file_exclude_filename_pagenum(filename: str):
def test_process_file_metadata_exclude_filename_pagenum(filename: str):
ingest_doc = GitIngestDoc(
path=filename,
config=SimpleGitConfig(
Expand All @@ -76,3 +76,29 @@ def test_process_file_exclude_filename_pagenum(filename: str):
for elem in isd_elems:
for k in elem["metadata"]:
assert k not in ["filename", "page_number"]


@pytest.mark.parametrize("filename", test_files)
def test_process_file_fields_include_default(filename: str):
    """With no `fields_include` override, each element keeps the default fields.

    The config is built without `fields_include`, so the default value
    "element_id,text,type,metadata" applies and every processed element
    is expected to expose exactly those four top-level keys.
    """
    ingest_doc = GitIngestDoc(
        path=filename,
        config=SimpleGitConfig(
            download_dir=EXAMPLE_DOCS_DIRECTORY,
        ),
    )
    isd_elems = ingest_doc.process_file()
    # Use a set literal: `set(a, b, c, d)` raises TypeError because `set()`
    # accepts at most one iterable argument.
    expected_fields = {"element_id", "text", "type", "metadata"}
    # Check every element; the original referenced `elem` without ever
    # binding it, which fails with NameError.
    for elem in isd_elems:
        assert expected_fields == set(elem.keys())


@pytest.mark.parametrize("filename", test_files)
def test_process_file_fields_include_elementid(filename: str):
    """With `fields_include="element_id"`, elements keep only that field.

    Every processed element is expected to expose `element_id` as its
    single top-level key.
    """
    ingest_doc = GitIngestDoc(
        path=filename,
        config=SimpleGitConfig(
            download_dir=EXAMPLE_DOCS_DIRECTORY,
            fields_include="element_id",
        ),
    )
    isd_elems = ingest_doc.process_file()
    # Use a set literal: `set("element_id")` iterates the string and yields
    # a set of single characters, not a one-item set holding the field name.
    expected_fields = {"element_id"}
    # Check every element; the original referenced `elem` without ever
    # binding it, which fails with NameError.
    for elem in isd_elems:
        assert expected_fields == set(elem.keys())

2 changes: 1 addition & 1 deletion unstructured/__version__.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = "0.5.7-dev0" # pragma: no cover
__version__ = "0.5.7-dev1" # pragma: no cover
1 change: 1 addition & 0 deletions unstructured/ingest/connector/biomed.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,7 @@ class SimpleBiomedConfig(BaseConnectorConfig):
preserve_downloads: bool = False
metadata_include: Optional[str] = None
metadata_exclude: Optional[str] = None
fields_include: str = "element_id,text,type,metadata"

def _validate_date_args(self, date):
date_formats = ["%Y-%m-%d", "%Y-%m-%d+%H:%M:%S"]
Expand Down
1 change: 1 addition & 0 deletions unstructured/ingest/connector/fsspec.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@ class SimpleFsspecConfig(BaseConnectorConfig):
re_download: bool = False
metadata_include: Optional[str] = None
metadata_exclude: Optional[str] = None
fields_include: str = "element_id,text,type,metadata"

# fsspec specific options
access_kwargs: dict = field(default_factory=dict)
Expand Down
1 change: 1 addition & 0 deletions unstructured/ingest/connector/git.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ class SimpleGitConfig(BaseConnectorConfig):
re_download: bool = False
metadata_include: Optional[str] = None
metadata_exclude: Optional[str] = None
fields_include: str = "element_id,text,type,metadata"

repo_path: str = field(init=False, repr=False)

Expand Down
1 change: 1 addition & 0 deletions unstructured/ingest/connector/google_drive.py
Original file line number Diff line number Diff line change
Expand Up @@ -79,6 +79,7 @@ class SimpleGoogleDriveConfig(BaseConnectorConfig):
preserve_downloads: bool = False
metadata_include: Optional[str] = None
metadata_exclude: Optional[str] = None
fields_include: str = "element_id,text,type,metadata"

recursive: bool = False

Expand Down
1 change: 1 addition & 0 deletions unstructured/ingest/connector/reddit.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ class SimpleRedditConfig(BaseConnectorConfig):
re_download: bool = False
metadata_include: Optional[str] = None
metadata_exclude: Optional[str] = None
fields_include: str = "element_id,text,type,metadata"

def __post_init__(self):
if self.num_posts <= 0:
Expand Down
1 change: 1 addition & 0 deletions unstructured/ingest/connector/wikipedia.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ class SimpleWikipediaConfig(BaseConnectorConfig):
re_download: bool = False
metadata_include: Optional[str] = None
metadata_exclude: Optional[str] = None
fields_include: str = "element_id,text,type,metadata"


@dataclass
Expand Down
5 changes: 4 additions & 1 deletion unstructured/ingest/interfaces.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,7 @@ class BaseConnectorConfig(ABC):
re_download: bool = False
metadata_include: Optional[str] = None
metadata_exclude: Optional[str] = None
fields_include: str = "element_id,text,type,metadata"


class BaseIngestDoc(ABC):
Expand Down Expand Up @@ -117,7 +118,9 @@ def process_file(self):
if k not in in_list:
elem["metadata"].pop(k, None) # type: ignore[attr-defined]

elem.pop("coordinates") # type: ignore[attr-defined]
in_list = self.config.fields_include.split(",")
elem = {k: v for k, v in elem.items() if k in in_list}

self.isd_elems_no_filename.append(elem)

return self.isd_elems_no_filename
21 changes: 21 additions & 0 deletions unstructured/ingest/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -103,6 +103,12 @@ def run(self):


@click.command()
@click.option(
"--fields-include",
default="element_id,text,type,metadata",
help="If set, include the specified top-level fields in an element. "
"Default is `element_id,text,type,metadata`.",
)
@click.option(
"--metadata-include",
default=None,
Expand Down Expand Up @@ -338,7 +344,13 @@ def main(
verbose,
metadata_include,
metadata_exclude,
fields_include,
):
if "metadata" not in fields_include and (metadata_include or metadata_exclude):
logger.warning(
"Either `--metadata-include` or `--metadata-exclude` is specified"
" while metadata is not specified in --fields-include.",
)
if metadata_exclude is not None and metadata_include is not None:
logger.error(
"Arguments `--metadata-include` and `--metadata-exclude` are "
Expand Down Expand Up @@ -415,6 +427,7 @@ def main(
preserve_downloads=preserve_downloads,
metadata_include=metadata_include,
metadata_exclude=metadata_exclude,
fields_include=fields_include,
),
)
elif protocol in ("abfs", "az"):
Expand All @@ -437,6 +450,7 @@ def main(
preserve_downloads=preserve_downloads,
metadata_include=metadata_include,
metadata_exclude=metadata_exclude,
fields_include=fields_include,
),
)
else:
Expand All @@ -455,6 +469,7 @@ def main(
preserve_downloads=preserve_downloads,
metadata_include=metadata_include,
metadata_exclude=metadata_exclude,
fields_include=fields_include,
),
)
elif github_url:
Expand All @@ -471,6 +486,7 @@ def main(
re_download=re_download,
metadata_include=metadata_include,
metadata_exclude=metadata_exclude,
fields_include=fields_include,
),
)
elif gitlab_url:
Expand All @@ -487,6 +503,7 @@ def main(
re_download=re_download,
metadata_include=metadata_include,
metadata_exclude=metadata_exclude,
fields_include=fields_include,
),
)
elif subreddit_name:
Expand All @@ -505,6 +522,7 @@ def main(
re_download=re_download,
metadata_include=metadata_include,
metadata_exclude=metadata_exclude,
fields_include=fields_include,
),
)
elif wikipedia_page_title:
Expand All @@ -519,6 +537,7 @@ def main(
re_download=re_download,
metadata_include=metadata_include,
metadata_exclude=metadata_exclude,
fields_include=fields_include,
),
)
elif drive_id:
Expand All @@ -535,6 +554,7 @@ def main(
re_download=re_download,
metadata_include=metadata_include,
metadata_exclude=metadata_exclude,
fields_include=fields_include,
),
)
elif biomed_path or biomed_api_id or biomed_api_from or biomed_api_until:
Expand All @@ -551,6 +571,7 @@ def main(
re_download=re_download,
metadata_include=metadata_include,
metadata_exclude=metadata_exclude,
fields_include=fields_include,
),
)
# Check for other connector-specific options here and define the doc_connector object
Expand Down