diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index ab958231c7..9b37aeb49a 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -105,7 +105,7 @@ jobs: source .venv/bin/activate make install-detectron2 sudo apt-get update - sudo apt-get install -y libmagic-dev poppler-utils tesseract-ocr libreoffice + sudo apt-get install -y libmagic-dev poppler-utils tesseract-ocr libreoffice pandoc make test make check-coverage make install-ingest-s3 diff --git a/CHANGELOG.md b/CHANGELOG.md index 2db473554f..41f1e11518 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,4 +1,4 @@ -## 0.5.4-dev7 +## 0.5.4 ### Enhancements @@ -21,6 +21,7 @@ * Add `AzureBlobStorageConnector` based on its `fsspec` implementation inheriting from `FsspecConnector` +* Add `partition_epub` for partitioning e-books in EPUB3 format. ### Fixes diff --git a/README.md b/README.md index 56cb31f6e4..198544d3b1 100644 --- a/README.md +++ b/README.md @@ -110,7 +110,7 @@ file to ensure your code matches the formatting and linting standards used in `u If you'd prefer not having code changes auto-tidied before every commit, you can use `make check` to see whether any linting or formatting changes should be applied, and `make tidy` to apply them. -If using the optional `pre-commit`, you'll just need to install the hooks with `pre-commit install` since the +If using the optional `pre-commit`, you'll just need to install the hooks with `pre-commit install` since the `pre-commit` package is installed as part of `make install` mentioned above. Finally, if you decided to use `pre-commit` you can also uninstall the hooks with `pre-commit uninstall`. @@ -119,7 +119,7 @@ you can also uninstall the hooks with `pre-commit uninstall`. You can run this [Colab notebook](https://colab.research.google.com/drive/1U8VCjY2-x8c6y5TYMbSFtQGlQVFHCVIW) to run the examples below. The following examples show how to get started with the `unstructured` library. 
-You can parse **TXT**, **HTML**, **PDF**, **EML**, **DOC**, **DOCX**, **PPT**, **PPTX**, **JPG**, +You can parse **TXT**, **HTML**, **PDF**, **EML**, **EPUB**, **DOC**, **DOCX**, **PPT**, **PPTX**, **JPG**, and **PNG** documents with one line of code!

See our [documentation page](https://unstructured-io.github.io/unstructured) for a full description diff --git a/docs/source/bricks.rst b/docs/source/bricks.rst index 32e498bdb8..637379933a 100644 --- a/docs/source/bricks.rst +++ b/docs/source/bricks.rst @@ -82,7 +82,7 @@ If you call the ``partition`` function, ``unstructured`` will attempt to detect file type and route it to the appropriate partitioning brick. All partitioning bricks called within ``partition`` are called using the default kwargs. Use the document-type specific bricks if you need to apply non-default settings. -``partition`` currently supports ``.docx``, ``.doc``, ``.pptx``, ``.ppt``, ``.eml``, ``.html``, ``.pdf``, +``partition`` currently supports ``.docx``, ``.doc``, ``.pptx``, ``.ppt``, ``.eml``, ``.epub``, ``.html``, ``.pdf``, ``.png``, ``.jpg``, and ``.txt`` files. If you set the ``include_page_breaks`` kwarg to ``True``, the output will include page breaks. This is only supported for ``.pptx``, ``.html``, ``.pdf``, ``.png``, and ``.jpg``. @@ -306,6 +306,41 @@ Examples: elements = partition_email(text=text, include_headers=True) +``partition_epub`` +--------------------- + +The ``partition_epub`` function processes e-books in EPUB3 format. The function +first converts the document to HTML using ``pandoc`` and then calls ``partition_html``. +You'll need `pandoc <https://pandoc.org/installing.html>`_ installed on your system +to use ``partition_epub``. + + +Examples: + +.. code:: python + + from unstructured.partition.epub import partition_epub + + elements = partition_epub(filename="example-docs/winter-sports.epub") + + +``partition_md`` +--------------------- + +The ``partition_md`` function provides the ability to parse markdown files. The +following workflow shows how to use ``partition_md``. + + +Examples: + +.. 
code:: python + + from unstructured.partition.md import partition_md + + elements = partition_md(filename="README.md") + + + ``partition_text`` --------------------- diff --git a/docs/source/installing.rst b/docs/source/installing.rst index e96d3e8da0..b859a67e29 100644 --- a/docs/source/installing.rst +++ b/docs/source/installing.rst @@ -15,6 +15,7 @@ installation. * ``poppler-utils`` (images and PDFs) * ``tesseract-ocr`` (images and PDFs) * ``libreoffice`` (MS Office docs) + * ``pandoc`` (EPUBs) * If you are parsing PDFs, run the following to install the ``detectron2`` model, which ``unstructured`` uses for layout detection: * ``pip install "detectron2@git+https://github.com/facebookresearch/detectron2.git@v0.6#egg=detectron2"`` diff --git a/example-docs/winter-sports.epub b/example-docs/winter-sports.epub new file mode 100644 index 0000000000..a434863a8c Binary files /dev/null and b/example-docs/winter-sports.epub differ diff --git a/requirements/base.txt b/requirements/base.txt index 36172b0809..aaf27c4cc7 100644 --- a/requirements/base.txt +++ b/requirements/base.txt @@ -4,6 +4,9 @@ # # pip-compile --output-file=requirements/base.txt # +--extra-index-url https://pypi.ngc.nvidia.com +--trusted-host pypi.ngc.nvidia.com + anyio==3.6.2 # via httpcore argilla==1.4.0 @@ -72,6 +75,8 @@ pydantic==1.10.6 # via argilla pygments==2.14.0 # via rich +pypandoc==1.11 + # via unstructured (setup.py) python-dateutil==2.8.2 # via pandas python-docx==0.8.11 diff --git a/scripts/setup_ubuntu.sh b/scripts/setup_ubuntu.sh index a7829722b8..1de99d69ba 100755 --- a/scripts/setup_ubuntu.sh +++ b/scripts/setup_ubuntu.sh @@ -84,7 +84,7 @@ $sudo $pac install -y poppler-utils #### Tesseract # Install tesseract as well as Russian language -$sudo $pac install -y tesseract-ocr libtesseract-dev tesseract-ocr-rus libreoffice +$sudo $pac install -y tesseract-ocr libtesseract-dev tesseract-ocr-rus libreoffice pandoc #### libmagic $sudo $pac install -y libmagic-dev diff --git a/setup.py 
b/setup.py
index c0b34bc563..e57221beb9 100644
--- a/setup.py
+++ b/setup.py
@@ -56,6 +56,7 @@
         "openpyxl",
         "pandas",
         "pillow",
+        "pypandoc",
         "python-docx",
         "python-pptx",
         "python-magic",
diff --git a/test_unstructured/file_utils/test_file_conversion.py b/test_unstructured/file_utils/test_file_conversion.py
new file mode 100644
index 0000000000..5ebca00843
--- /dev/null
+++ b/test_unstructured/file_utils/test_file_conversion.py
@@ -0,0 +1,25 @@
+import os
+import pathlib
+from unittest.mock import patch
+
+import pypandoc
+import pytest
+
+from unstructured.file_utils.file_conversion import convert_file_to_text
+
+DIRECTORY = pathlib.Path(__file__).parent.resolve()
+
+
+def test_convert_file_to_text():
+    filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "winter-sports.epub")
+    html_text = convert_file_to_text(filename, source_format="epub", target_format="html")
+    # NOTE - the converted document is expected to open with a paragraph tag.
+    assert html_text.startswith("<p>")
+
+
+def test_convert_to_file_raises_if_pandoc_not_available():
+    filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "winter-sports.epub")
+    # NOTE - patching convert_file keeps this test independent of whether pandoc is installed.
+    with patch.object(pypandoc, "convert_file", side_effect=FileNotFoundError):
+        with pytest.raises(FileNotFoundError):
+            convert_file_to_text(filename, source_format="epub", target_format="html")
diff --git a/test_unstructured/file_utils/test_filetype.py b/test_unstructured/file_utils/test_filetype.py
index 2210ea9cad..10432097aa 100644
--- a/test_unstructured/file_utils/test_filetype.py
+++ b/test_unstructured/file_utils/test_filetype.py
@@ -30,6 +30,7 @@
         ("fake-html.html", FileType.HTML),
         ("unsupported/fake-excel.xlsx", FileType.XLSX),
         ("fake-power-point.pptx", FileType.PPTX),
+        ("winter-sports.epub", FileType.EPUB),
     ],
 )
 def test_detect_filetype_from_filename(file, expected):
@@ -50,6 +51,7 @@ def test_detect_filetype_from_filename(file, expected):
         ("fake-html.html", FileType.HTML),
         ("unsupported/fake-excel.xlsx", FileType.XLSX),
         ("fake-power-point.pptx", FileType.PPTX),
+        ("winter-sports.epub", FileType.EPUB),
     ],
 )
 def test_detect_filetype_from_filename_with_extension(monkeypatch, file, expected):
@@ -73,6 +75,7 @@ def test_detect_filetype_from_filename_with_extension(monkeypatch, file, expecte
         ("fake-html.html", FileType.HTML),
         ("unsupported/fake-excel.xlsx", FileType.XLSX),
         ("fake-power-point.pptx", FileType.PPTX),
+        ("winter-sports.epub", FileType.EPUB),
     ],
 )
 def test_detect_filetype_from_file(file, expected):
diff --git a/test_unstructured/partition/test_auto.py b/test_unstructured/partition/test_auto.py
index b86b80d2d3..f07d0d16e1 100644
--- a/test_unstructured/partition/test_auto.py
+++ b/test_unstructured/partition/test_auto.py
@@ -277,3 +277,18 @@ def test_auto_with_page_breaks():
     filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "layout-parser-paper-fast.pdf")
     elements = partition(filename=filename, include_page_breaks=True)
     assert PageBreak() in elements
+
+
+def test_auto_partition_epub_from_filename():
+    filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "winter-sports.epub")
+    elements = partition(filename=filename)
+    assert len(elements) > 0
+    assert elements[0].text.startswith("The Project Gutenberg eBook of Winter Sports")
+
+
+def test_auto_partition_epub_from_file():
+    filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "winter-sports.epub")
+    with open(filename, "rb") as f:
+        elements = partition(file=f)
+    assert len(elements) > 0
+    assert elements[0].text.startswith("The Project Gutenberg eBook of Winter Sports")
diff --git a/test_unstructured/partition/test_epub.py b/test_unstructured/partition/test_epub.py
new file mode 100644
index 0000000000..cf3e7977a0
--- /dev/null
+++ b/test_unstructured/partition/test_epub.py
@@ -0,0 +1,21 @@
+import os
+import pathlib
+
+from unstructured.partition.epub import partition_epub
+
+DIRECTORY = pathlib.Path(__file__).parent.resolve()
+
+
+def test_partition_epub_from_filename():
+    filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "winter-sports.epub")
+    elements = partition_epub(filename=filename)
+    assert len(elements) > 0
+    assert elements[0].text.startswith("The Project Gutenberg eBook of Winter Sports")
+
+
+def test_partition_epub_from_file():
+    filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "winter-sports.epub")
+    with open(filename, "rb") as f:
+        elements = partition_epub(file=f)
+    assert len(elements) > 0
+    assert elements[0].text.startswith("The Project Gutenberg eBook of Winter Sports")
diff --git a/unstructured/__version__.py b/unstructured/__version__.py
index 23fb2a9174..51579011b4 100644
--- a/unstructured/__version__.py
+++ b/unstructured/__version__.py
@@ -1 +1 @@
-__version__ = "0.5.4-dev7"  # pragma: no cover
+__version__ = "0.5.4"  # pragma: no cover
diff --git a/unstructured/file_utils/file_conversion.py b/unstructured/file_utils/file_conversion.py
new file mode 100644
index 0000000000..5ad671fe1b
--- 
/dev/null
+++ b/unstructured/file_utils/file_conversion.py
@@ -0,0 +1,83 @@
+import os
+import tempfile
+from typing import IO, Optional
+
+import pypandoc
+
+from unstructured.partition.common import exactly_one
+
+
+def convert_file_to_text(filename: str, source_format: str, target_format: str) -> str:
+    """Uses pandoc to convert the source document to a raw text string.
+
+    Parameters
+    ----------
+    filename
+        Path of the document to convert.
+    source_format
+        The format of the source document, used as the pandoc input format (e.g. "epub").
+    target_format
+        The format of the returned text, used as the pandoc output format (e.g. "html").
+
+    Raises
+    ------
+    FileNotFoundError
+        If the conversion raises FileNotFoundError. The message points to the pandoc install
+        instructions and the original exception is chained as the cause.
+    """
+    try:
+        # NOTE - the caller's formats are forwarded instead of hard-coded "html"/"epub"
+        # values, so the function honors its signature for any format pair.
+        text = pypandoc.convert_file(filename, target_format, format=source_format)
+    except FileNotFoundError as err:
+        msg = (
+            "Error converting the file to text. Ensure you have the pandoc "
+            "package installed on your system. Install instructions are available at "
+            "https://pandoc.org/installing.html. The original exception text was:\n"
+            f"{err}"
+        )
+        raise FileNotFoundError(msg) from err
+
+    return text
+
+
+def convert_epub_to_html(
+    filename: Optional[str] = None,
+    file: Optional[IO] = None,
+) -> str:
+    """Converts an EPUB document to HTML raw text. Enables an EPUB document to be
+    processed using the partition_html function.
+
+    Parameters
+    ----------
+    filename
+        A string defining the target filename path.
+    file
+        A file-like object using "rb" mode --> open(filename, "rb").
+    """
+    exactly_one(filename=filename, file=file)
+
+    if file is not None:
+        # NOTE - the conversion works on a path, so the file-like object is copied to a
+        # temporary file first. delete=False means nothing removes that file automatically,
+        # so it is unlinked once the conversion has finished or failed.
+        tmp = tempfile.NamedTemporaryFile(delete=False)
+        try:
+            tmp.write(file.read())
+            tmp.close()
+            html_text = convert_file_to_text(
+                filename=tmp.name,
+                source_format="epub",
+                target_format="html",
+            )
+        finally:
+            tmp.close()
+            os.unlink(tmp.name)
+    elif filename is not None:
+        html_text = convert_file_to_text(
+            filename=filename,
+            source_format="epub",
+            target_format="html",
+        )
+
+    return html_text
diff --git a/unstructured/file_utils/filetype.py b/unstructured/file_utils/filetype.py
index b4dd17a1c3..ecfebf8548 100644
--- a/unstructured/file_utils/filetype.py
+++ b/unstructured/file_utils/filetype.py
@@ -47,6 +47,11 @@
     "text/x-markdown",
 ]
 
+EPUB_MIME_TYPES = [
+    "application/epub",
+    "application/epub+zip",
+]
+
 # NOTE(robinson) - .docx.xlsx files are actually zip file with a .docx/.xslx extension.
 # If the MIME type is application/octet-stream, we check if it's a .docx/.xlsx file by
 # looking for expected filenames within the zip file.
@@ -94,6 +99,7 @@ class FileType(Enum): HTML = 50 XML = 51 MD = 52 + EPUB = 53 # Compressed Types ZIP = 60 @@ -123,6 +129,7 @@ def __lt__(self, other): ".ppt": FileType.PPT, ".rtf": FileType.RTF, ".json": FileType.JSON, + ".epub": FileType.EPUB, } @@ -180,6 +187,9 @@ def detect_filetype( # NOTE - I am not sure whether libmagic ever returns these mimetypes. return FileType.MD + elif mime_type in EPUB_MIME_TYPES: + return FileType.EPUB + elif mime_type in TXT_MIME_TYPES: if extension and extension == ".eml": return FileType.EML diff --git a/unstructured/partition/auto.py b/unstructured/partition/auto.py index 497cf931bb..35f0997d70 100644 --- a/unstructured/partition/auto.py +++ b/unstructured/partition/auto.py @@ -4,6 +4,7 @@ from unstructured.partition.doc import partition_doc from unstructured.partition.docx import partition_docx from unstructured.partition.email import partition_email +from unstructured.partition.epub import partition_epub from unstructured.partition.html import partition_html from unstructured.partition.image import partition_image from unstructured.partition.json import partition_json @@ -59,6 +60,8 @@ def partition( include_page_breaks=include_page_breaks, encoding=encoding, ) + elif filetype == FileType.EPUB: + return partition_epub(filename=filename, file=file, include_page_breaks=include_page_breaks) elif filetype == FileType.MD: return partition_md(filename=filename, file=file, include_page_breaks=include_page_breaks) elif filetype == FileType.PDF: diff --git a/unstructured/partition/epub.py b/unstructured/partition/epub.py new file mode 100644 index 0000000000..84f1f51edb --- /dev/null +++ b/unstructured/partition/epub.py @@ -0,0 +1,32 @@ +from typing import IO, List, Optional + +from unstructured.documents.elements import Element +from unstructured.file_utils.file_conversion import convert_epub_to_html +from unstructured.partition.html import partition_html + + +def partition_epub( + filename: Optional[str] = None, + file: Optional[IO] = 
None, + include_page_breaks: bool = False, +) -> List[Element]: + """Partitions an EPUB document. The document is first converted to HTML and then + partitioned using partition_html. + + Parameters + ---------- + filename + A string defining the target filename path. + file + A file-like object using "rb" mode --> open(filename, "rb"). + include_page_breaks + If True, the output will include page breaks if the filetype supports it + """ + html_text = convert_epub_to_html(filename=filename, file=file) + # NOTE(robinson) - pypandoc returns a text string with unicode encoding + # ref: https://github.com/JessicaTegner/pypandoc#usage + return partition_html( + text=html_text, + include_page_breaks=include_page_breaks, + encoding="unicode", + )