Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: add --fields-include to unstructured-ingest #376

Merged
merged 52 commits into from
Mar 22, 2023
Merged
Changes from 1 commit
Commits
Show all changes
52 commits
Select commit Hold shift + click to select a range
3ef6265
added metadata in/exclude params
natygyoon Mar 15, 2023
a084a5c
updated process_file
natygyoon Mar 15, 2023
499280d
existing tests
natygyoon Mar 15, 2023
3b8d369
remove default behavior
natygyoon Mar 15, 2023
e8319f5
changelog and ci
natygyoon Mar 16, 2023
2d01c7a
line length
natygyoon Mar 16, 2023
d6ab499
import
natygyoon Mar 16, 2023
ef788f6
import
natygyoon Mar 16, 2023
297ff5f
import sorted
natygyoon Mar 16, 2023
780f7ff
import
natygyoon Mar 16, 2023
252654b
type
natygyoon Mar 16, 2023
59d7a00
line length
natygyoon Mar 16, 2023
b5393c3
version sync
natygyoon Mar 16, 2023
ef07f40
main
natygyoon Mar 16, 2023
1283a84
ci
natygyoon Mar 16, 2023
97d04e6
json
natygyoon Mar 16, 2023
dd0319a
dict
natygyoon Mar 16, 2023
bdd4a1e
type ignore
natygyoon Mar 16, 2023
79333a5
lint
natygyoon Mar 16, 2023
fc1db61
unit tests for process_file
natygyoon Mar 17, 2023
4f92b68
lint
natygyoon Mar 17, 2023
203e1b2
added --fields-include
natygyoon Mar 17, 2023
53b7aea
lint
natygyoon Mar 17, 2023
e35a04f
line length
natygyoon Mar 17, 2023
b5d7b3e
fix version
natygyoon Mar 17, 2023
3905681
type changed to Optional(str)
natygyoon Mar 20, 2023
c690284
ci
natygyoon Mar 20, 2023
5fab790
line length
natygyoon Mar 20, 2023
46b7550
merge `feat/metadata` into `feat/fields-include`
natygyoon Mar 20, 2023
c0f2ad0
merge conflict
natygyoon Mar 20, 2023
dbf7774
merge conflict
natygyoon Mar 20, 2023
ed5869d
line length
natygyoon Mar 20, 2023
45c9b86
type check
natygyoon Mar 20, 2023
84b831a
line length
natygyoon Mar 20, 2023
c0dbcbe
default
natygyoon Mar 20, 2023
a18d91d
subclass type
natygyoon Mar 20, 2023
7a131b7
fixed dict iter error
natygyoon Mar 20, 2023
054fdc4
Merge branch 'main' into feat/fields-include
natygyoon Mar 20, 2023
bc758bc
Merge branch 'main' into feat/fields-include
natygyoon Mar 21, 2023
6c42cf4
code refactor
natygyoon Mar 21, 2023
f1f90d2
added unit tests for fields_include
natygyoon Mar 21, 2023
245266b
version sync
natygyoon Mar 21, 2023
73f0f6d
Merge branch 'main' into feat/fields-include
natygyoon Mar 21, 2023
509457b
remove custom class
natygyoon Mar 21, 2023
33faf50
unit tests
natygyoon Mar 21, 2023
ec46090
changelog
natygyoon Mar 21, 2023
4e7da4f
version bump
natygyoon Mar 22, 2023
d98c8c2
nit
natygyoon Mar 22, 2023
7fd4ad5
Merge branch 'main' into feat/fields-include
natygyoon Mar 22, 2023
3b46996
remove duplicate
natygyoon Mar 22, 2023
840a1ae
nit
natygyoon Mar 22, 2023
f29bf65
Merge branch 'main' into feat/fields-include
natygyoon Mar 22, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
unit tests for process_file
  • Loading branch information
natygyoon committed Mar 17, 2023
commit fc1db6170b06a7817bd5d1a9d78767c66335d320
79 changes: 79 additions & 0 deletions test_unstructured_ingest/test_interfaces.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,79 @@
import os
import pathlib

import pytest

from unstructured.ingest.connector.git import GitIngestDoc, SimpleGitConfig


# Resolved path of the directory that contains this test module.
DIRECTORY = pathlib.Path(__file__).parent.resolve()
# The "example-docs" directory located one level above this module's directory.
EXAMPLE_DOCS_DIRECTORY = os.path.join(DIRECTORY, "..", "example-docs")

# File names the tests below are parametrized over; each is passed as ``path``
# to GitIngestDoc with EXAMPLE_DOCS_DIRECTORY as the download directory.
test_files = [
"layout-parser-paper-fast.jpg",
"layout-parser-paper-fast.pdf",
]


@pytest.mark.parametrize("filename", test_files)
def test_process_file_include_filename(filename: str):
    """Check that ``metadata_include="filename"`` leaves no other metadata key.

    Builds a ``GitIngestDoc`` for ``filename`` under the example-docs directory,
    runs ``process_file()``, and verifies that every returned element's
    ``metadata`` contains no key other than ``"filename"``.
    """
    allowed_keys = {"filename"}
    ingest_doc = GitIngestDoc(
        path=filename,
        config=SimpleGitConfig(
            download_dir=EXAMPLE_DOCS_DIRECTORY,
            metadata_include="filename",
        ),
    )
    isd_elems = ingest_doc.process_file()

    # Without this guard an empty result would skip the loop below and the
    # test would pass without having checked anything.
    assert isd_elems, f"process_file() returned no elements for {filename}"

    for elem in isd_elems:
        unexpected_keys = set(elem["metadata"]) - allowed_keys
        assert not unexpected_keys, f"unexpected metadata keys: {unexpected_keys}"


@pytest.mark.parametrize("filename", test_files)
def test_process_file_include_filename_pagenum(filename: str):
    """Check that ``metadata_include="filename,page_number"`` filters metadata.

    Builds a ``GitIngestDoc`` for ``filename`` under the example-docs directory,
    runs ``process_file()``, and verifies that every returned element's
    ``metadata`` contains no key other than ``"filename"`` and
    ``"page_number"``. Either key may be absent; only extra keys fail the test.
    """
    allowed_keys = {"filename", "page_number"}
    ingest_doc = GitIngestDoc(
        path=filename,
        config=SimpleGitConfig(
            download_dir=EXAMPLE_DOCS_DIRECTORY,
            metadata_include="filename,page_number",
        ),
    )
    isd_elems = ingest_doc.process_file()

    # Without this guard an empty result would skip the loop below and the
    # test would pass without having checked anything.
    assert isd_elems, f"process_file() returned no elements for {filename}"

    for elem in isd_elems:
        unexpected_keys = set(elem["metadata"]) - allowed_keys
        assert not unexpected_keys, f"unexpected metadata keys: {unexpected_keys}"


@pytest.mark.parametrize("filename", test_files)
def test_process_file_exclude_filename(filename: str):
    """Check that ``metadata_exclude="filename"`` drops the filename key.

    Builds a ``GitIngestDoc`` for ``filename`` under the example-docs directory,
    runs ``process_file()``, and verifies that no returned element's
    ``metadata`` contains the key ``"filename"``.
    """
    excluded_keys = {"filename"}
    ingest_doc = GitIngestDoc(
        path=filename,
        config=SimpleGitConfig(
            download_dir=EXAMPLE_DOCS_DIRECTORY,
            metadata_exclude="filename",
        ),
    )
    isd_elems = ingest_doc.process_file()

    # Without this guard an empty result would skip the loop below and the
    # test would pass without having checked anything.
    assert isd_elems, f"process_file() returned no elements for {filename}"

    for elem in isd_elems:
        leaked_keys = excluded_keys & set(elem["metadata"])
        assert not leaked_keys, f"excluded metadata keys present: {leaked_keys}"


@pytest.mark.parametrize("filename", test_files)
def test_process_file_exclude_filename_pagenum(filename: str):
    """Check that ``metadata_exclude="filename,page_number"`` drops both keys.

    Builds a ``GitIngestDoc`` for ``filename`` under the example-docs directory,
    runs ``process_file()``, and verifies that no returned element's
    ``metadata`` contains the key ``"filename"`` or ``"page_number"``.
    """
    excluded_keys = {"filename", "page_number"}
    ingest_doc = GitIngestDoc(
        path=filename,
        config=SimpleGitConfig(
            download_dir=EXAMPLE_DOCS_DIRECTORY,
            metadata_exclude="filename,page_number",
        ),
    )
    isd_elems = ingest_doc.process_file()

    # Without this guard an empty result would skip the loop below and the
    # test would pass without having checked anything.
    assert isd_elems, f"process_file() returned no elements for {filename}"

    for elem in isd_elems:
        leaked_keys = excluded_keys & set(elem["metadata"])
        assert not leaked_keys, f"excluded metadata keys present: {leaked_keys}"