Adding content_type and file_filename to autopartition (Unstructured-IO#394)

Co-authored-by: cragwolfe <crag@unstructured.io>
siddartha-RE · Mar 24, 2023 · 71e035c · 71e035c
1 parent 8ffd310
commit 71e035c
Show file tree

Hide file tree

Showing 6 changed files with 110 additions and 23 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,9 +1,10 @@
-## 0.5.7-dev3
+## 0.5.7
 
 ### Enhancements
 
 * Refactored codebase using `exactly_one`
 * Adds ability to pass headers when passing a url in partition_html()
+* Added optional `content_type` and `file_filename` parameters to `partition()` to bypass file detection
 
 ### Features
 

diff --git a/docs/source/bricks.rst b/docs/source/bricks.rst
@@ -30,6 +30,7 @@ In cases where ``libmagic`` is not available, filetype detection will fall back
 As shown in the examples below, the ``partition`` function accepts both filenames and file-like objects as input.
 ``partition`` also has some optional kwargs.
 For example, if you set ``include_page_breaks=True``, the output will include ``PageBreak`` elements if the filetype supports it.
+Additionally, you can bypass the filetype detection logic with the optional ``content_type`` argument, which may be specified together with either the ``filename`` or the file-like object ``file``.
 You can find a full listing of optional kwargs in the documentation below.
 
 .. code:: python
@@ -38,7 +39,7 @@ You can find a full listing of optional kwargs in the documentation below.
 
 
   filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "layout-parser-paper-fast.pdf")
-  elements = partition(filename=filename)
+  elements = partition(filename=filename, content_type="application/pdf")
   print("\n\n".join([str(el) for el in elements][:10]))
 
 
@@ -57,7 +58,7 @@ The ``unstructured`` library also includes partitioning bricks targeted at speci
 The ``partition`` brick uses these document-specific partitioning bricks under the hood.
 There are a few reasons you may want to use a document-specific partitioning brick instead of ``partition``:
 
-* If you already know the document type, filetype detection is unnecessary. Using the document-specific brick directly will make your program run faster.
+* If you already know the document type, filetype detection is unnecessary. Using the document-specific brick directly, or passing in the ``content_type``, will make your program run faster.
 * Fewer dependencies. You don't need to install ``libmagic`` for filetype detection if you're only using document-specific bricks.
 * Additional features. The API for partition is the least common denominator for all document types. Certain document-specific bricks include extra features that you may want to take advantage of. For example, ``partition_html`` allows you to pass in a URL so you don't have to store the ``.html`` file locally. See the documentation below to learn about the options available in each partitioning brick.
 

diff --git a/test_unstructured/partition/test_auto.py b/test_unstructured/partition/test_auto.py
@@ -105,13 +105,27 @@ def test_auto_partition_docx_with_file(mock_docx_document, expected_docx_element
     assert elements == expected_docx_elements
 
 
-def test_auto_partition_doc_with_filename(mock_docx_document, expected_docx_elements, tmpdir):
+@pytest.mark.parametrize(
+    ("pass_file_filename", "content_type"),
+    [(False, None), (False, "application/msword"), (True, "application/msword"), (True, None)],
+)
+def test_auto_partition_doc_with_filename(
+    mock_docx_document,
+    expected_docx_elements,
+    tmpdir,
+    pass_file_filename,
+    content_type,
+):
     docx_filename = os.path.join(tmpdir.dirname, "mock_document.docx")
     doc_filename = os.path.join(tmpdir.dirname, "mock_document.doc")
     mock_docx_document.save(docx_filename)
     convert_office_doc(docx_filename, tmpdir.dirname, "doc")
-
-    elements = partition(filename=doc_filename)
+    file_filename = doc_filename if pass_file_filename else None
+    elements = partition(
+        filename=doc_filename,
+        file_filename=file_filename,
+        content_type=content_type,
+    )
     assert elements == expected_docx_elements
     assert elements[0].metadata.filename == doc_filename
 
@@ -130,17 +144,27 @@ def test_auto_partition_doc_with_file(mock_docx_document, expected_docx_elements
     assert elements == expected_docx_elements
 
 
-def test_auto_partition_html_from_filename():
+@pytest.mark.parametrize(
+    ("pass_file_filename", "content_type"),
+    [(False, None), (False, "text/html"), (True, "text/html"), (True, None)],
+)
+def test_auto_partition_html_from_filename(pass_file_filename, content_type):
     filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "example-10k.html")
-    elements = partition(filename=filename)
+    file_filename = filename if pass_file_filename else None
+    elements = partition(filename=filename, file_filename=file_filename, content_type=content_type)
     assert len(elements) > 0
     assert elements[0].metadata.filename == filename
 
 
-def test_auto_partition_html_from_file():
+@pytest.mark.parametrize(
+    ("pass_file_filename", "content_type"),
+    [(False, None), (False, "text/html"), (True, "text/html"), (True, None)],
+)
+def test_auto_partition_html_from_file(pass_file_filename, content_type):
     filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-html.html")
+    file_filename = filename if pass_file_filename else None
     with open(filename) as f:
-        elements = partition(file=f)
+        elements = partition(file=f, file_filename=file_filename, content_type=content_type)
     assert len(elements) > 0
 
 
@@ -177,9 +201,15 @@ def test_auto_partition_text_from_file():
     assert elements == EXPECTED_TEXT_OUTPUT
 
 
-def test_auto_partition_pdf_from_filename():
+@pytest.mark.parametrize(
+    ("pass_file_filename", "content_type"),
+    [(False, None), (False, "application/pdf"), (True, "application/pdf"), (True, None)],
+)
+def test_auto_partition_pdf_from_filename(pass_file_filename, content_type):
     filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "layout-parser-paper-fast.pdf")
-    elements = partition(filename=filename)
+    file_filename = filename if pass_file_filename else None
+
+    elements = partition(filename=filename, file_filename=file_filename, content_type=content_type)
 
     assert isinstance(elements[0], Title)
     assert elements[0].text.startswith("LayoutParser")
@@ -207,10 +237,16 @@ def test_auto_partition_pdf_with_fast_strategy():
     )
 
 
-def test_auto_partition_pdf_from_file():
+@pytest.mark.parametrize(
+    ("pass_file_filename", "content_type"),
+    [(False, None), (False, "application/pdf"), (True, "application/pdf"), (True, None)],
+)
+def test_auto_partition_pdf_from_file(pass_file_filename, content_type):
     filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "layout-parser-paper-fast.pdf")
+    file_filename = filename if pass_file_filename else None
+
     with open(filename, "rb") as f:
-        elements = partition(file=f)
+        elements = partition(file=f, file_filename=file_filename, content_type=content_type)
 
     assert isinstance(elements[0], Title)
     assert elements[0].text.startswith("LayoutParser")
@@ -230,16 +266,26 @@ def test_partition_pdf_doesnt_raise_warning():
         partition(filename=filename)
 
 
-def test_auto_partition_jpg():
+@pytest.mark.parametrize(
+    ("pass_file_filename", "content_type"),
+    [(False, None), (False, "image/jpeg"), (True, "image/jpeg"), (True, None)],
+)
+def test_auto_partition_jpg(pass_file_filename, content_type):
     filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "example.jpg")
-    elements = partition(filename=filename)
+    file_filename = filename if pass_file_filename else None
+    elements = partition(filename=filename, file_filename=file_filename, content_type=content_type)
     assert len(elements) > 0
 
 
-def test_auto_partition_jpg_from_file():
+@pytest.mark.parametrize(
+    ("pass_file_filename", "content_type"),
+    [(False, None), (False, "image/jpeg"), (True, "image/jpeg"), (True, None)],
+)
+def test_auto_partition_jpg_from_file(pass_file_filename, content_type):
     filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "example.jpg")
+    file_filename = filename if pass_file_filename else None
     with open(filename, "rb") as f:
-        elements = partition(file=f)
+        elements = partition(file=f, file_filename=file_filename, content_type=content_type)
     assert len(elements) > 0
 
 

diff --git a/unstructured/__version__.py b/unstructured/__version__.py
@@ -1 +1 @@
-__version__ = "0.5.7-dev3"  # pragma: no cover
+__version__ = "0.5.7"  # pragma: no cover
diff --git a/unstructured/file_utils/filetype.py b/unstructured/file_utils/filetype.py
@@ -111,6 +111,24 @@ def __lt__(self, other):
         return self.name < other.name
 
 
+STR_TO_FILETYPE = {
+    "application/pdf": FileType.PDF,
+    "application/msword": FileType.DOC,
+    "image/jpeg": FileType.JPG,
+    "image/png": FileType.PNG,
+    "text/markdown": FileType.MD,
+    "text/x-markdown": FileType.MD,
+    "application/epub": FileType.EPUB,
+    "application/epub+zip": FileType.EPUB,
+    "text/html": FileType.HTML,
+    "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet": FileType.XLSX,
+    "application/vnd.ms-excel": FileType.XLS,
+    "application/vnd.openxmlformats-officedocument.presentationml.presentation": FileType.PPTX,
+    "application/vnd.ms-powerpoint": FileType.PPT,
+    "application/xml": FileType.XML,
+}
+
+
 EXT_TO_FILETYPE = {
     ".pdf": FileType.PDF,
     ".docx": FileType.DOCX,
@@ -138,18 +156,26 @@ def __lt__(self, other):
 
 def detect_filetype(
     filename: Optional[str] = None,
+    content_type: Optional[str] = None,
     file: Optional[IO] = None,
+    file_filename: Optional[str] = None,
 ) -> Optional[FileType]:
     """Use libmagic to determine a file's type. Helps determine which partition brick
     to use for a given file. A return value of None indicates a non-supported file type."""
     exactly_one(filename=filename, file=file)
 
-    if filename:
-        _, extension = os.path.splitext(filename)
+    if content_type:
+        filetype = STR_TO_FILETYPE.get(content_type)
+        if filetype:
+            return filetype
+
+    if filename or file_filename:
+        _, extension = os.path.splitext(filename or file_filename or "")
         extension = extension.lower()
         if LIBMAGIC_AVAILABLE:
-            mime_type = magic.from_file(filename, mime=True)
+            mime_type = magic.from_file(filename or file_filename, mime=True)  # type: ignore
         else:
+            # might not need this
             return EXT_TO_FILETYPE.get(extension.lower(), FileType.UNK)
     elif file is not None:
         extension = None
@@ -164,6 +190,8 @@ def detect_filetype(
                 "Filetype detection on file-like objects requires libmagic. "
                 "Please install libmagic and try again.",
             )
+    else:
+        raise ValueError("No filename, file, nor file_filename were specified.")
 
     if mime_type == "application/pdf":
         return FileType.PDF

diff --git a/unstructured/partition/auto.py b/unstructured/partition/auto.py
@@ -17,7 +17,9 @@
 
 def partition(
     filename: Optional[str] = None,
+    content_type: Optional[str] = None,
     file: Optional[IO] = None,
+    file_filename: Optional[str] = None,
     include_page_breaks: bool = False,
     strategy: str = "hi_res",
     encoding: str = "utf-8",
@@ -31,8 +33,12 @@ def partition(
     ----------
      filename
         A string defining the target filename path.
+    content_type
+        A string giving the MIME type of the file content, e.g. "application/pdf". If it is a recognized type, filetype detection is skipped.
     file
         A file-like object using "rb" mode --> open(filename, "rb").
+    file_filename
+        When file is not None, the filename (string) to store in element metadata. E.g. "foo.txt"
     include_page_breaks
         If True, the output will include page breaks if the filetype supports it
     strategy
@@ -42,7 +48,12 @@ def partition(
     encoding
         The encoding method used to decode the text input. If None, utf-8 will be used.
     """
-    filetype = detect_filetype(filename=filename, file=file)
+    filetype = detect_filetype(
+        filename=filename,
+        file=file,
+        file_filename=file_filename,
+        content_type=content_type,
+    )
 
     if file is not None:
         file.seek(0)
Original file line number	Diff line number	Diff line change
@@ -1 +1 @@
-__version__ = "0.5.7-dev3"  # pragma: no cover
+__version__ = "0.5.7"  # pragma: no cover