forked from Unstructured-IO/unstructured
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
feat: add
partition_epub
function (Unstructured-IO#364)
* add pypandoc dependency * added epub partitioner and file conversion * test for partition_epub * tests for file conversion * add epub to filetype detection * added epub to auto partition * update bricks docs * updated installing docs * changelot and version * add pandoc to dependencies * add pandoc to debian dependencies * linting, linting, linting * typo fix * typo fix * file conversion type hints * more type hints --------- Co-authored-by: qued <64741807+qued@users.noreply.github.com>
- Loading branch information
1 parent
aa49462
commit e43cb0e
Showing
18 changed files
with
206 additions
and
7 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Binary file not shown.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -56,6 +56,7 @@ | |
"openpyxl", | ||
"pandas", | ||
"pillow", | ||
"pypandoc", | ||
"python-docx", | ||
"python-pptx", | ||
"python-magic", | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,23 @@ | ||
import os | ||
import pathlib | ||
from unittest.mock import patch | ||
|
||
import pypandoc | ||
import pytest | ||
|
||
from unstructured.file_utils.file_conversion import convert_file_to_text | ||
|
||
DIRECTORY = pathlib.Path(__file__).parent.resolve() | ||
|
||
|
||
def test_convert_file_to_text():
    """Converting the sample EPUB to HTML yields text that opens with a <p> tag."""
    epub_path = os.path.join(
        DIRECTORY,
        "..",
        "..",
        "example-docs",
        "winter-sports.epub",
    )
    converted = convert_file_to_text(
        epub_path,
        source_format="epub",
        target_format="html",
    )
    assert converted.startswith("<p>")
|
||
|
||
def test_convert_to_file_raises_if_pandoc_not_available():
    """A FileNotFoundError from pypandoc.convert_file surfaces as FileNotFoundError."""
    epub_path = os.path.join(
        DIRECTORY,
        "..",
        "..",
        "example-docs",
        "winter-sports.epub",
    )
    # Patch pypandoc.convert_file so it raises FileNotFoundError when called.
    pandoc_missing = patch.object(
        pypandoc,
        "convert_file",
        side_effect=FileNotFoundError,
    )
    with pandoc_missing, pytest.raises(FileNotFoundError):
        convert_file_to_text(epub_path, source_format="epub", target_format="html")
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,21 @@ | ||
import os | ||
import pathlib | ||
|
||
from unstructured.partition.epub import partition_epub | ||
|
||
DIRECTORY = pathlib.Path(__file__).parent.resolve() | ||
|
||
|
||
def test_partition_epub_from_filename():
    """Partitioning the sample EPUB by path returns elements starting with the title."""
    epub_path = os.path.join(
        DIRECTORY,
        "..",
        "..",
        "example-docs",
        "winter-sports.epub",
    )
    parsed = partition_epub(filename=epub_path)
    assert len(parsed) > 0
    first_text = parsed[0].text
    assert first_text.startswith("The Project Gutenberg eBook of Winter Sports")
|
||
|
||
def test_partition_epub_from_file():
    """Partitioning the sample EPUB from a binary file object returns the same leading text."""
    epub_path = os.path.join(
        DIRECTORY,
        "..",
        "..",
        "example-docs",
        "winter-sports.epub",
    )
    with open(epub_path, "rb") as epub_file:
        parsed = partition_epub(file=epub_file)
    assert len(parsed) > 0
    first_text = parsed[0].text
    assert first_text.startswith("The Project Gutenberg eBook of Winter Sports")
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1 +1 @@ | ||
__version__ = "0.5.4-dev7" # pragma: no cover | ||
__version__ = "0.5.4" # pragma: no cover |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,49 @@ | ||
import tempfile | ||
from typing import IO, Optional | ||
|
||
import pypandoc | ||
|
||
from unstructured.partition.common import exactly_one | ||
|
||
|
||
def convert_file_to_text(filename: str, source_format: str, target_format: str) -> str:
    """Uses pandoc to convert the source document to a raw text string.

    Parameters
    ----------
    filename
        Path of the document to convert.
    source_format
        The format of the input document, passed to pypandoc as ``format``
        (e.g. "epub").
    target_format
        The format to convert to, passed to pypandoc as the output format
        (e.g. "html").

    Returns
    -------
    The value returned by ``pypandoc.convert_file`` for the conversion.

    Raises
    ------
    FileNotFoundError
        Re-raised with install guidance when ``pypandoc.convert_file`` raises
        FileNotFoundError; the original exception is chained as the cause.
    """
    try:
        # Honor the caller's formats instead of always converting epub -> html.
        text = pypandoc.convert_file(filename, target_format, format=source_format)
    except FileNotFoundError as err:
        msg = (
            "Error converting the file to text. Ensure you have the pandoc "
            "package installed on your system. Install instructions are available at "
            "https://pandoc.org/installing.html. The original exception text was:\n"
            f"{err}"
        )
        # Chain the original error so its traceback is preserved for debugging.
        raise FileNotFoundError(msg) from err

    return text
|
||
|
||
def convert_epub_to_html(
    filename: Optional[str] = None,
    file: Optional[IO] = None,
) -> str:
    """Converts an EPUB document to HTML raw text. Enables an EPUB document to be
    processed using the partition_html function.

    Parameters
    ----------
    filename
        Path to the EPUB document on disk.
    file
        A file-like object whose ``read()`` returns the EPUB contents.

    Returns
    -------
    The HTML text produced by ``convert_file_to_text``.

    Raises
    ------
    ValueError
        If neither ``filename`` nor ``file`` is provided and the
        ``exactly_one`` check did not already reject the call.
    """
    # Function-scope import: only needed to remove the temporary file below.
    import os

    # NOTE(review): exactly_one presumably raises unless exactly one argument
    # is set -- its implementation is not visible here; confirm.
    exactly_one(filename=filename, file=file)

    if file is not None:
        # The conversion works from a path, so spool the stream to a named
        # temporary file. delete=False keeps the file on disk after close()
        # so it can be reopened by name; it is removed in the finally block.
        tmp = tempfile.NamedTemporaryFile(delete=False)
        try:
            tmp.write(file.read())
            tmp.close()
            return convert_file_to_text(
                filename=tmp.name,
                source_format="epub",
                target_format="html",
            )
        finally:
            # close() is a no-op when already closed; always clean up the
            # temp file, even if the write or the conversion failed.
            tmp.close()
            os.unlink(tmp.name)

    if filename is not None:
        return convert_file_to_text(
            filename=filename,
            source_format="epub",
            target_format="html",
        )

    # Explicit error instead of falling through to an unbound local variable.
    raise ValueError("Exactly one of filename and file must be specified.")
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,32 @@ | ||
from typing import IO, List, Optional | ||
|
||
from unstructured.documents.elements import Element | ||
from unstructured.file_utils.file_conversion import convert_epub_to_html | ||
from unstructured.partition.html import partition_html | ||
|
||
|
||
def partition_epub(
    filename: Optional[str] = None,
    file: Optional[IO] = None,
    include_page_breaks: bool = False,
) -> List[Element]:
    """Partitions an EPUB document. The document is first converted to HTML and then
    partitioned using partition_html.

    Parameters
    ----------
    filename
        A string defining the target filename path.
    file
        A file-like object using "rb" mode --> open(filename, "rb").
    include_page_breaks
        If True, the output will include page breaks if the filetype supports it
    """
    # NOTE(robinson) - pypandoc returns a text string with unicode encoding
    # ref: https://github.com/JessicaTegner/pypandoc#usage
    html_kwargs = {
        "text": convert_epub_to_html(filename=filename, file=file),
        "include_page_breaks": include_page_breaks,
        "encoding": "unicode",
    }
    return partition_html(**html_kwargs)