Skip to content

Commit

Permalink
feat: Add Image element and find_embedded_image function (Unstructu…
Browse files Browse the repository at this point in the history
…red-IO#130)

* add find_embedded_image
  • Loading branch information
mallorih authored Jan 10, 2023
1 parent 7b3b594 commit e0feba8
Show file tree
Hide file tree
Showing 8 changed files with 7,754 additions and 19 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
* Added new functions to extract header information `parse_received_data` and `partition_header`
* Added new function to parse plain text files `partition_text`
* Added new cleaners functions `extract_ip_address`, `extract_ip_address_name`, `extract_mapi_id`, `extract_datetimetz`
* Added new `Image` element and function to find embedded images `find_embedded_image`

## 0.3.5

Expand Down
3,828 changes: 3,828 additions & 0 deletions example-docs/email-with-image.eml

Large diffs are not rendered by default.

3,833 changes: 3,833 additions & 0 deletions example-docs/fake-email-image-embedded.eml

Large diffs are not rendered by default.

28 changes: 23 additions & 5 deletions requirements/dev.txt
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,10 @@
#
# pip-compile requirements/dev.in
#
appnope==0.1.3
# via
# ipykernel
# ipython
argon2-cffi==21.3.0
# via notebook
argon2-cffi-bindings==21.2.0
Expand Down Expand Up @@ -36,6 +40,10 @@ executing==1.0.0
# via stack-data
fastjsonschema==2.16.2
# via nbformat
importlib-metadata==6.0.0
# via nbconvert
importlib-resources==5.10.2
# via jsonschema
ipykernel==6.15.3
# via
# ipywidgets
Expand All @@ -45,7 +53,7 @@ ipykernel==6.15.3
# qtconsole
ipython==8.6.0
# via
# -r dev.in
# -r requirements/dev.in
# ipykernel
# ipywidgets
# jupyter-console
Expand All @@ -64,7 +72,7 @@ jinja2==3.1.2
jsonschema==4.16.0
# via nbformat
jupyter==1.0.0
# via -r dev.in
# via -r requirements/dev.in
jupyter-client==7.3.5
# via
# ipykernel
Expand All @@ -76,7 +84,7 @@ jupyter-console==6.4.4
# via jupyter
jupyter-core==5.1.3
# via
# -r dev.in
# -r requirements/dev.in
# jupyter-client
# nbconvert
# nbformat
Expand Down Expand Up @@ -134,7 +142,9 @@ pexpect==4.8.0
pickleshare==0.7.5
# via ipython
pip-tools==6.12.1
# via -r dev.in
# via -r requirements/dev.in
pkgutil-resolve-name==1.3.10
# via jsonschema
platformdirs==2.5.4
# via jupyter-core
prometheus-client==0.14.1
Expand Down Expand Up @@ -190,6 +200,10 @@ terminado==0.15.0
# via notebook
tinycss2==1.1.1
# via nbconvert
tomli==2.0.1
# via
# build
# pep517
tornado==6.2
# via
# ipykernel
Expand Down Expand Up @@ -217,10 +231,14 @@ webencodings==0.5.1
# tinycss2
wheel==0.38.4
# via
# -r dev.in
# -r requirements/dev.in
# pip-tools
widgetsnbextension==4.0.3
# via ipywidgets
zipp==3.11.0
# via
# importlib-metadata
# importlib-resources

# The following packages are considered to be unsafe in a requirements file:
# pip
Expand Down
25 changes: 15 additions & 10 deletions requirements/test.txt
Original file line number Diff line number Diff line change
Expand Up @@ -7,39 +7,39 @@
attrs==22.1.0
# via pytest
black==22.12.0
# via -r test.in
# via -r requirements/test.in
certifi==2022.12.7
# via
# -r test.in
# -r requirements/test.in
# requests
charset-normalizer==2.1.1
# via requests
click==8.1.3
# via
# -r test.in
# -r requirements/test.in
# black
coverage[toml]==6.4.4
# via
# -r test.in
# -r requirements/test.in
# pytest-cov
flake8==5.0.4
# via -r test.in
# via -r requirements/test.in
idna==3.4
# via
# requests
# yarl
iniconfig==1.1.1
# via pytest
label-studio-sdk==0.0.15
# via -r test.in
# via -r requirements/test.in
lxml==4.9.1
# via label-studio-sdk
mccabe==0.7.0
# via flake8
multidict==6.0.2
# via yarl
mypy==0.991
# via -r test.in
# via -r requirements/test.in
mypy-extensions==0.4.3
# via
# black
Expand All @@ -65,23 +65,28 @@ pyparsing==3.0.9
pytest==7.1.3
# via pytest-cov
pytest-cov==4.0.0
# via -r test.in
# via -r requirements/test.in
pyyaml==6.0
# via vcrpy
requests==2.28.1
# via label-studio-sdk
six==1.16.0
# via vcrpy
tomli==2.0.1
# via pytest
# via
# black
# coverage
# mypy
# pytest
typing-extensions==4.3.0
# via
# black
# mypy
# pydantic
urllib3==1.26.12
# via requests
vcrpy==4.2.1
# via -r test.in
# via -r requirements/test.in
wrapt==1.14.1
# via vcrpy
yarl==1.8.1
Expand Down
18 changes: 17 additions & 1 deletion test_unstructured/partition/test_email.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
import pathlib
import pytest

from unstructured.documents.elements import NarrativeText, Title, ListItem
from unstructured.documents.elements import NarrativeText, Title, ListItem, Image
from unstructured.documents.email_elements import (
MetaData,
Recipient,
Expand All @@ -27,6 +27,15 @@
ListItem(text="Violets are blue"),
]

# Elements expected when example-docs/fake-email-image-embedded.eml is partitioned
# with content_source="text/plain"; compared for equality in
# test_partition_email_from_filename_with_embedded_image.
IMAGE_EXPECTED_OUTPUT = [
NarrativeText(text="This is a test email to use for unit tests."),
Title(text="Important points:"),
NarrativeText(text="hello this is our logo."),
Image(text="unstructured_logo.png"),
ListItem(text="Roses are red"),
ListItem(text="Violets are blue"),
]

HEADER_EXPECTED_OUTPUT = [
MetaData(name="MIME-Version", text="1.0"),
MetaData(name="Date", text="Fri, 16 Dec 2022 17:04:16 -0500"),
Expand Down Expand Up @@ -97,6 +106,13 @@ def test_partition_email_from_text():
assert elements == EXPECTED_OUTPUT


def test_partition_email_from_filename_with_embedded_image():
    """Partition the embedded-image fixture as plain text and check the elements.

    The result must be non-empty and equal to ``IMAGE_EXPECTED_OUTPUT``.
    """
    eml_path = os.path.join(
        DIRECTORY,
        "..",
        "..",
        "example-docs",
        "fake-email-image-embedded.eml",
    )
    partitioned = partition_email(filename=eml_path, content_source="text/plain")

    assert len(partitioned) > 0
    assert partitioned == IMAGE_EXPECTED_OUTPUT


def test_partition_email_header():
filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "fake-email.eml")
with open(filename, "r") as f:
Expand Down
8 changes: 8 additions & 0 deletions unstructured/documents/elements.py
Original file line number Diff line number Diff line change
Expand Up @@ -72,3 +72,11 @@ class Title(Text):
category = "Title"

pass


class Image(Text):
    """Text element that records metadata about an image.

    Behaves like ``Text``; only the ``category`` label differs.
    """

    category = "Image"
32 changes: 29 additions & 3 deletions unstructured/partition/email.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
import sys
import re
from email.message import Message
from typing import Dict, IO, List, Optional, Tuple
from typing import Dict, IO, List, Optional, Tuple, Union

if sys.version_info < (3, 8):
from typing_extensions import Final
Expand All @@ -24,7 +24,7 @@
ReceivedInfo,
MetaData,
)
from unstructured.documents.elements import Element, Text
from unstructured.documents.elements import Element, Text, Image, NarrativeText, Title
from unstructured.partition.html import partition_html
from unstructured.partition.text import split_by_paragraph, partition_text

Expand Down Expand Up @@ -113,6 +113,25 @@ def extract_attachment_info(
return list_attachments


# Matches one inline image marker such as "[image: logo.png]".
# - Raw string: "\[" and "\]" are regex escapes, not (invalid) string escapes.
# - Non-greedy ".+?": stop at the first closing bracket so that trailing
#   bracketed text or a second marker on the same line is not swallowed.
_EMBEDDED_IMAGE_PATTERN = re.compile(r"\[image: .+?\]")


def has_embedded_image(element):
    """Search an element's text for an embedded image marker.

    Parameters
    ----------
    element
        Any object exposing a ``text`` string attribute.

    Returns
    -------
    The ``re.Match`` for the first ``[image: ...]`` marker found in
    ``element.text``, or ``None`` when the text contains no marker. The result
    is truthy exactly when a marker is present, and its ``start()``/``end()``
    give the marker's position in the text.
    """
    return _EMBEDDED_IMAGE_PATTERN.search(element.text)


def find_embedded_image(
    element: Union[NarrativeText, Title], indices: re.Match
) -> Tuple[Element, Element]:
    """Split an embedded ``[image: ...]`` marker out of a text element.

    Parameters
    ----------
    element
        The element whose ``text`` contains the marker. It is modified in
        place: the marker text is removed from ``element.text``.
    indices
        The match object locating the marker inside ``element.text`` (as
        returned by ``has_embedded_image``).

    Returns
    -------
    A tuple ``(image, element)`` where ``image`` is a new ``Image`` element
    whose text is the marker's content (the image name) and ``element`` is the
    same object that was passed in, with the marker removed from its text.
    """
    marker_prefix = "[image: "

    start, end = indices.start(), indices.end()
    image_raw_info = element.text[start:end]

    # Slice off the literal prefix and the closing "]" instead of splitting on
    # ":", so names that themselves contain a colon (URLs, "cid:..." references)
    # are kept intact.
    image_info = clean_extra_whitespace(image_raw_info[len(marker_prefix) : -1])

    # Remove the marker exactly as it appeared in the text. Rebuilding it from
    # the cleaned name could fail to match when whitespace was normalised.
    element.text = element.text.replace(image_raw_info, "")

    return Image(text=image_info), element


def partition_email(
filename: Optional[str] = None,
file: Optional[IO] = None,
Expand Down Expand Up @@ -171,7 +190,7 @@ def partition_email(
raise ValueError(f"{content_source} content not found in email")

# NOTE(robinson) - In the .eml files, the HTML content gets stored in a format that
# looks like the following, resulting in extraneous "=" chracters in the output if
# looks like the following, resulting in extraneous "=" characters in the output if
# you don't clean it up
# <ul> =
# <li>Item 1</li>=
Expand All @@ -188,6 +207,13 @@ def partition_email(
elif content_source == "text/plain":
elements = partition_text(text=content)

for idx, element in enumerate(elements):
indices = has_embedded_image(element)
if (isinstance(element, NarrativeText) or isinstance(element, Title)) and indices:
image_info, clean_element = find_embedded_image(element, indices)
elements[idx] = clean_element
elements.insert(idx + 1, image_info)

header: List[Element] = list()
if include_headers:
header = partition_email_header(msg)
Expand Down

0 comments on commit e0feba8

Please sign in to comment.