Skip to content

Commit

Permalink
Adding content_type and file_filename to autopartition (Unstructured-…
Browse files Browse the repository at this point in the history
…IO#394)

Co-authored-by: cragwolfe <crag@unstructured.io>
  • Loading branch information
amanda103 and cragwolfe authored Mar 24, 2023
1 parent 8ffd310 commit 71e035c
Show file tree
Hide file tree
Showing 6 changed files with 110 additions and 23 deletions.
3 changes: 2 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,9 +1,10 @@
## 0.5.7-dev3
## 0.5.7

### Enhancements

* Refactored codebase using `exactly_one`
* Adds ability to pass headers when passing a url in partition_html()
* Added optional `content_type` and `file_filename` parameters to `partition()` to bypass file detection

### Features

Expand Down
5 changes: 3 additions & 2 deletions docs/source/bricks.rst
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ In cases where ``libmagic`` is not available, filetype detection will fall back
As shown in the examples below, the ``partition`` function accepts both filenames and file-like objects as input.
``partition`` also has some optional kwargs.
For example, if you set ``include_page_breaks=True``, the output will include ``PageBreak`` elements if the filetype supports it.
Additionally, you can bypass the filetype detection logic with the optional ``content_type`` argument, which may be specified with either the ``filename`` or the file-like object, ``file``.
You can find a full listing of optional kwargs in the documentation below.

.. code:: python
Expand All @@ -38,7 +39,7 @@ You can find a full listing of optional kwargs in the documentation below.
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "layout-parser-paper-fast.pdf")
elements = partition(filename=filename)
elements = partition(filename=filename, content_type="application/pdf")
print("\n\n".join([str(el) for el in elements][:10]))
Expand All @@ -57,7 +58,7 @@ The ``unstructured`` library also includes partitioning bricks targeted at speci
The ``partition`` brick uses these document-specific partitioning bricks under the hood.
There are a few reasons you may want to use a document-specific partitioning brick instead of ``partition``:

* If you already know the document type, filetype detection is unnecessary. Using the document-specific brick directly will make your program run faster.
* If you already know the document type, filetype detection is unnecessary. Using the document-specific brick directly, or passing in the ``content_type``, will make your program run faster.
* Fewer dependencies. You don't need to install ``libmagic`` for filetype detection if you're only using document-specific bricks.
* Additional features. The API for partition is the least common denominator for all document types. Certain document-specific bricks include extra features that you may want to take advantage of. For example, ``partition_html`` allows you to pass in a URL so you don't have to store the ``.html`` file locally. See the documentation below to learn about the options available in each partitioning brick.

Expand Down
76 changes: 61 additions & 15 deletions test_unstructured/partition/test_auto.py
Original file line number Diff line number Diff line change
Expand Up @@ -105,13 +105,27 @@ def test_auto_partition_docx_with_file(mock_docx_document, expected_docx_element
assert elements == expected_docx_elements


def test_auto_partition_doc_with_filename(mock_docx_document, expected_docx_elements, tmpdir):
@pytest.mark.parametrize(
("pass_file_filename", "content_type"),
[(False, None), (False, "application/msword"), (True, "application/msword"), (True, None)],
)
def test_auto_partition_doc_with_filename(
mock_docx_document,
expected_docx_elements,
tmpdir,
pass_file_filename,
content_type,
):
docx_filename = os.path.join(tmpdir.dirname, "mock_document.docx")
doc_filename = os.path.join(tmpdir.dirname, "mock_document.doc")
mock_docx_document.save(docx_filename)
convert_office_doc(docx_filename, tmpdir.dirname, "doc")

elements = partition(filename=doc_filename)
file_filename = doc_filename if pass_file_filename else None
elements = partition(
filename=doc_filename,
file_filename=file_filename,
content_type=content_type,
)
assert elements == expected_docx_elements
assert elements[0].metadata.filename == doc_filename

Expand All @@ -130,17 +144,27 @@ def test_auto_partition_doc_with_file(mock_docx_document, expected_docx_elements
assert elements == expected_docx_elements


def test_auto_partition_html_from_filename():
@pytest.mark.parametrize(
("pass_file_filename", "content_type"),
[(False, None), (False, "text/html"), (True, "text/html"), (True, None)],
)
def test_auto_partition_html_from_filename(pass_file_filename, content_type):
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "example-10k.html")
elements = partition(filename=filename)
file_filename = filename if pass_file_filename else None
elements = partition(filename=filename, file_filename=file_filename, content_type=content_type)
assert len(elements) > 0
assert elements[0].metadata.filename == filename


def test_auto_partition_html_from_file():
@pytest.mark.parametrize(
("pass_file_filename", "content_type"),
[(False, None), (False, "text/html"), (True, "text/html"), (True, None)],
)
def test_auto_partition_html_from_file(pass_file_filename, content_type):
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-html.html")
file_filename = filename if pass_file_filename else None
with open(filename) as f:
elements = partition(file=f)
elements = partition(file=f, file_filename=file_filename, content_type=content_type)
assert len(elements) > 0


Expand Down Expand Up @@ -177,9 +201,15 @@ def test_auto_partition_text_from_file():
assert elements == EXPECTED_TEXT_OUTPUT


def test_auto_partition_pdf_from_filename():
@pytest.mark.parametrize(
("pass_file_filename", "content_type"),
[(False, None), (False, "application/pdf"), (True, "application/pdf"), (True, None)],
)
def test_auto_partition_pdf_from_filename(pass_file_filename, content_type):
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "layout-parser-paper-fast.pdf")
elements = partition(filename=filename)
file_filename = filename if pass_file_filename else None

elements = partition(filename=filename, file_filename=file_filename, content_type=content_type)

assert isinstance(elements[0], Title)
assert elements[0].text.startswith("LayoutParser")
Expand Down Expand Up @@ -207,10 +237,16 @@ def test_auto_partition_pdf_with_fast_strategy():
)


def test_auto_partition_pdf_from_file():
@pytest.mark.parametrize(
("pass_file_filename", "content_type"),
[(False, None), (False, "application/pdf"), (True, "application/pdf"), (True, None)],
)
def test_auto_partition_pdf_from_file(pass_file_filename, content_type):
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "layout-parser-paper-fast.pdf")
file_filename = filename if pass_file_filename else None

with open(filename, "rb") as f:
elements = partition(file=f)
elements = partition(file=f, file_filename=file_filename, content_type=content_type)

assert isinstance(elements[0], Title)
assert elements[0].text.startswith("LayoutParser")
Expand All @@ -230,16 +266,26 @@ def test_partition_pdf_doesnt_raise_warning():
partition(filename=filename)


def test_auto_partition_jpg():
@pytest.mark.parametrize(
("pass_file_filename", "content_type"),
[(False, None), (False, "image/jpeg"), (True, "image/jpeg"), (True, None)],
)
def test_auto_partition_jpg(pass_file_filename, content_type):
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "example.jpg")
elements = partition(filename=filename)
file_filename = filename if pass_file_filename else None
elements = partition(filename=filename, file_filename=file_filename, content_type=content_type)
assert len(elements) > 0


def test_auto_partition_jpg_from_file():
@pytest.mark.parametrize(
("pass_file_filename", "content_type"),
[(False, None), (False, "image/jpeg"), (True, "image/jpeg"), (True, None)],
)
def test_auto_partition_jpg_from_file(pass_file_filename, content_type):
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "example.jpg")
file_filename = filename if pass_file_filename else None
with open(filename, "rb") as f:
elements = partition(file=f)
elements = partition(file=f, file_filename=file_filename, content_type=content_type)
assert len(elements) > 0


Expand Down
2 changes: 1 addition & 1 deletion unstructured/__version__.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = "0.5.7-dev3" # pragma: no cover
__version__ = "0.5.7" # pragma: no cover
34 changes: 31 additions & 3 deletions unstructured/file_utils/filetype.py
Original file line number Diff line number Diff line change
Expand Up @@ -111,6 +111,24 @@ def __lt__(self, other):
return self.name < other.name


# Maps a MIME content-type string to the FileType it identifies.
# detect_filetype() looks up a caller-supplied ``content_type`` here and
# returns the match directly; a content type that is not listed yields no
# match, and detection then continues from the filename / file object.
# Several MIME spellings may map to the same FileType (e.g. the two markdown
# and the two epub variants below).
# NOTE(review): there is no entry for the .docx MIME type even though
# FileType.DOCX appears in EXT_TO_FILETYPE -- confirm whether that is intended.
STR_TO_FILETYPE = {
"application/pdf": FileType.PDF,
"application/msword": FileType.DOC,
"image/jpeg": FileType.JPG,
"image/png": FileType.PNG,
"text/markdown": FileType.MD,
"text/x-markdown": FileType.MD,
"application/epub": FileType.EPUB,
"application/epub+zip": FileType.EPUB,
"text/html": FileType.HTML,
"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet": FileType.XLSX,
"application/vnd.ms-excel": FileType.XLS,
"application/vnd.openxmlformats-officedocument.presentationml.presentation": FileType.PPTX,
"application/vnd.ms-powerpoint": FileType.PPT,
"application/xml": FileType.XML,
}


EXT_TO_FILETYPE = {
".pdf": FileType.PDF,
".docx": FileType.DOCX,
Expand Down Expand Up @@ -138,18 +156,26 @@ def __lt__(self, other):

def detect_filetype(
filename: Optional[str] = None,
content_type: Optional[str] = None,
file: Optional[IO] = None,
file_filename: Optional[str] = None,
) -> Optional[FileType]:
"""Use libmagic to determine a file's type. Helps determine which partition brick
to use for a given file. A return value of None indicates a non-supported file type."""
exactly_one(filename=filename, file=file)

if filename:
_, extension = os.path.splitext(filename)
if content_type:
filetype = STR_TO_FILETYPE.get(content_type)
if filetype:
return filetype

if filename or file_filename:
_, extension = os.path.splitext(filename or file_filename or "")
extension = extension.lower()
if LIBMAGIC_AVAILABLE:
mime_type = magic.from_file(filename, mime=True)
mime_type = magic.from_file(filename or file_filename, mime=True) # type: ignore
else:
# might not need this
return EXT_TO_FILETYPE.get(extension.lower(), FileType.UNK)
elif file is not None:
extension = None
Expand All @@ -164,6 +190,8 @@ def detect_filetype(
"Filetype detection on file-like objects requires libmagic. "
"Please install libmagic and try again.",
)
else:
raise ValueError("No filename, file, nor file_filename were specified.")

if mime_type == "application/pdf":
return FileType.PDF
Expand Down
13 changes: 12 additions & 1 deletion unstructured/partition/auto.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,9 @@

def partition(
filename: Optional[str] = None,
content_type: Optional[str] = None,
file: Optional[IO] = None,
file_filename: Optional[str] = None,
include_page_breaks: bool = False,
strategy: str = "hi_res",
encoding: str = "utf-8",
Expand All @@ -31,8 +33,12 @@ def partition(
----------
filename
A string defining the target filename path.
content_type
A string defining the file content in MIME type
file
A file-like object using "rb" mode --> open(filename, "rb").
file_filename
When file is not None, the filename (string) to store in element metadata. E.g. "foo.txt"
include_page_breaks
If True, the output will include page breaks if the filetype supports it
strategy
Expand All @@ -42,7 +48,12 @@ def partition(
encoding
The encoding method used to decode the text input. If None, utf-8 will be used.
"""
filetype = detect_filetype(filename=filename, file=file)
filetype = detect_filetype(
filename=filename,
file=file,
file_filename=file_filename,
content_type=content_type,
)

if file is not None:
file.seek(0)
Expand Down

0 comments on commit 71e035c

Please sign in to comment.