") + + +def test_convert_to_file_raises_if_pandoc_not_available(): + filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "winter-sports.epub") + with patch.object(pypandoc, "convert_file", side_effect=FileNotFoundError): + with pytest.raises(FileNotFoundError): + convert_file_to_text(filename, source_format="epub", target_format="html") diff --git a/test_unstructured/file_utils/test_filetype.py b/test_unstructured/file_utils/test_filetype.py index 2210ea9cad..10432097aa 100644 --- a/test_unstructured/file_utils/test_filetype.py +++ b/test_unstructured/file_utils/test_filetype.py @@ -30,6 +30,7 @@ ("fake-html.html", FileType.HTML), ("unsupported/fake-excel.xlsx", FileType.XLSX), ("fake-power-point.pptx", FileType.PPTX), + ("winter-sports.epub", FileType.EPUB), ], ) def test_detect_filetype_from_filename(file, expected): @@ -50,6 +51,7 @@ def test_detect_filetype_from_filename(file, expected): ("fake-html.html", FileType.HTML), ("unsupported/fake-excel.xlsx", FileType.XLSX), ("fake-power-point.pptx", FileType.PPTX), + ("winter-sports.epub", FileType.EPUB), ], ) def test_detect_filetype_from_filename_with_extension(monkeypatch, file, expected): @@ -73,6 +75,7 @@ def test_detect_filetype_from_filename_with_extension(monkeypatch, file, expecte ("fake-html.html", FileType.HTML), ("unsupported/fake-excel.xlsx", FileType.XLSX), ("fake-power-point.pptx", FileType.PPTX), + ("winter-sports.epub", FileType.EPUB), ], ) def test_detect_filetype_from_file(file, expected): diff --git a/test_unstructured/partition/test_auto.py b/test_unstructured/partition/test_auto.py index b86b80d2d3..f07d0d16e1 100644 --- a/test_unstructured/partition/test_auto.py +++ b/test_unstructured/partition/test_auto.py @@ -277,3 +277,18 @@ def test_auto_with_page_breaks(): filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "layout-parser-paper-fast.pdf") elements = partition(filename=filename, include_page_breaks=True) assert PageBreak() in elements + + +def 
test_auto_partition_epub_from_filename(): + filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "winter-sports.epub") + elements = partition(filename=filename) + assert len(elements) > 0 + assert elements[0].text.startswith("The Project Gutenberg eBook of Winter Sports") + + +def test_auto_partition_epub_from_file(): + filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "winter-sports.epub") + with open(filename, "rb") as f: + elements = partition(file=f) + assert len(elements) > 0 + assert elements[0].text.startswith("The Project Gutenberg eBook of Winter Sports") diff --git a/test_unstructured/partition/test_epub.py b/test_unstructured/partition/test_epub.py new file mode 100644 index 0000000000..cf3e7977a0 --- /dev/null +++ b/test_unstructured/partition/test_epub.py @@ -0,0 +1,21 @@ +import os +import pathlib + +from unstructured.partition.epub import partition_epub + +DIRECTORY = pathlib.Path(__file__).parent.resolve() + + +def test_partition_epub_from_filename(): + filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "winter-sports.epub") + elements = partition_epub(filename=filename) + assert len(elements) > 0 + assert elements[0].text.startswith("The Project Gutenberg eBook of Winter Sports") + + +def test_partition_epub_from_file(): + filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "winter-sports.epub") + with open(filename, "rb") as f: + elements = partition_epub(file=f) + assert len(elements) > 0 + assert elements[0].text.startswith("The Project Gutenberg eBook of Winter Sports") diff --git a/unstructured/__version__.py b/unstructured/__version__.py index 23fb2a9174..51579011b4 100644 --- a/unstructured/__version__.py +++ b/unstructured/__version__.py @@ -1 +1 @@ -__version__ = "0.5.4-dev7" # pragma: no cover +__version__ = "0.5.4" # pragma: no cover diff --git a/unstructured/file_utils/file_conversion.py b/unstructured/file_utils/file_conversion.py new file mode 100644 index 0000000000..5ad671fe1b --- 
/dev/null +++ b/unstructured/file_utils/file_conversion.py @@ -0,0 +1,49 @@ +import tempfile +from typing import IO, Optional + +import pypandoc + +from unstructured.partition.common import exactly_one + + +def convert_file_to_text(filename: str, source_format: str, target_format: str) -> str: + """Uses pandoc to convert a file from source_format to target_format text.""" + try: + text = pypandoc.convert_file(filename, target_format, format=source_format) + except FileNotFoundError as err: + msg = ( + "Error converting the file to text. Ensure you have the pandoc " + "package installed on your system. Install instructions are available at " + "https://pandoc.org/installing.html. The original exception text was:\n" + f"{err}" + ) + raise FileNotFoundError(msg) from err + + return text + + +def convert_epub_to_html( + filename: Optional[str] = None, + file: Optional[IO] = None, +) -> str: + """Converts an EPUB document to HTML raw text. Enables an EPUB document to be + processed using the partition_html function.""" + exactly_one(filename=filename, file=file) + + if file is not None: + tmp = tempfile.NamedTemporaryFile(delete=False) + tmp.write(file.read()) + tmp.close() + html_text = convert_file_to_text( + filename=tmp.name, + source_format="epub", + target_format="html", + ) + elif filename is not None: + html_text = convert_file_to_text( + filename=filename, + source_format="epub", + target_format="html", + ) + + return html_text diff --git a/unstructured/file_utils/filetype.py b/unstructured/file_utils/filetype.py index b4dd17a1c3..ecfebf8548 100644 --- a/unstructured/file_utils/filetype.py +++ b/unstructured/file_utils/filetype.py @@ -47,6 +47,11 @@ "text/x-markdown", ] +EPUB_MIME_TYPES = [ + "application/epub", + "application/epub+zip", +] + # NOTE(robinson) - .docx.xlsx files are actually zip file with a .docx/.xslx extension. # If the MIME type is application/octet-stream, we check if it's a .docx/.xlsx file by # looking for expected filenames within the zip file. 
@@ -94,6 +99,7 @@ class FileType(Enum): HTML = 50 XML = 51 MD = 52 + EPUB = 53 # Compressed Types ZIP = 60 @@ -123,6 +129,7 @@ def __lt__(self, other): ".ppt": FileType.PPT, ".rtf": FileType.RTF, ".json": FileType.JSON, + ".epub": FileType.EPUB, } @@ -180,6 +187,9 @@ def detect_filetype( # NOTE - I am not sure whether libmagic ever returns these mimetypes. return FileType.MD + elif mime_type in EPUB_MIME_TYPES: + return FileType.EPUB + elif mime_type in TXT_MIME_TYPES: if extension and extension == ".eml": return FileType.EML diff --git a/unstructured/partition/auto.py b/unstructured/partition/auto.py index 497cf931bb..35f0997d70 100644 --- a/unstructured/partition/auto.py +++ b/unstructured/partition/auto.py @@ -4,6 +4,7 @@ from unstructured.partition.doc import partition_doc from unstructured.partition.docx import partition_docx from unstructured.partition.email import partition_email +from unstructured.partition.epub import partition_epub from unstructured.partition.html import partition_html from unstructured.partition.image import partition_image from unstructured.partition.json import partition_json @@ -59,6 +60,8 @@ def partition( include_page_breaks=include_page_breaks, encoding=encoding, ) + elif filetype == FileType.EPUB: + return partition_epub(filename=filename, file=file, include_page_breaks=include_page_breaks) elif filetype == FileType.MD: return partition_md(filename=filename, file=file, include_page_breaks=include_page_breaks) elif filetype == FileType.PDF: diff --git a/unstructured/partition/epub.py b/unstructured/partition/epub.py new file mode 100644 index 0000000000..84f1f51edb --- /dev/null +++ b/unstructured/partition/epub.py @@ -0,0 +1,32 @@ +from typing import IO, List, Optional + +from unstructured.documents.elements import Element +from unstructured.file_utils.file_conversion import convert_epub_to_html +from unstructured.partition.html import partition_html + + +def partition_epub( + filename: Optional[str] = None, + file: Optional[IO] = 
None, + include_page_breaks: bool = False, +) -> List[Element]: + """Partitions an EPUB document. The document is first converted to HTML and then + partitioned using partition_html. + + Parameters + ---------- + filename + A string defining the target filename path. + file + A file-like object using "rb" mode --> open(filename, "rb"). + include_page_breaks + If True, the output will include page breaks if the filetype supports it + """ + html_text = convert_epub_to_html(filename=filename, file=file) + # NOTE(robinson) - pypandoc returns a text string with unicode encoding + # ref: https://github.com/JessicaTegner/pypandoc#usage + return partition_html( + text=html_text, + include_page_breaks=include_page_breaks, + encoding="unicode", + )