feat: Add Image element and find_embedded_image function (Unstructu…

…red-IO#130) * add find_embedded_image
siddartha-RE · Jan 10, 2023 · e0feba8 · e0feba8
1 parent 7b3b594
commit e0feba8
Show file tree

Hide file tree

Showing 8 changed files with 7,754 additions and 19 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -13,6 +13,7 @@
 * Added new functions to extract header information `parse_received_data` and `partition_header`
 * Added new function to parse plain text files `partition_text`
 * Added new cleaners functions `extract_ip_address`, `extract_ip_address_name`, `extract_mapi_id`, `extract_datetimetz`
+* Added new `Image` element and function to find embedded images `find_embedded_image`
 
 ## 0.3.5
 

diff --git a/example-docs/email-with-image.eml b/example-docs/email-with-image.eml
diff --git a/example-docs/fake-email-image-embedded.eml b/example-docs/fake-email-image-embedded.eml
diff --git a/requirements/dev.txt b/requirements/dev.txt
@@ -4,6 +4,10 @@
 #
 #    pip-compile requirements/dev.in
 #
+appnope==0.1.3
+    # via
+    #   ipykernel
+    #   ipython
 argon2-cffi==21.3.0
     # via notebook
 argon2-cffi-bindings==21.2.0
@@ -36,6 +40,10 @@ executing==1.0.0
     # via stack-data
 fastjsonschema==2.16.2
     # via nbformat
+importlib-metadata==6.0.0
+    # via nbconvert
+importlib-resources==5.10.2
+    # via jsonschema
 ipykernel==6.15.3
     # via
     #   ipywidgets
@@ -45,7 +53,7 @@ ipykernel==6.15.3
     #   qtconsole
 ipython==8.6.0
     # via
-    #   -r dev.in
+    #   -r requirements/dev.in
     #   ipykernel
     #   ipywidgets
     #   jupyter-console
@@ -64,7 +72,7 @@ jinja2==3.1.2
 jsonschema==4.16.0
     # via nbformat
 jupyter==1.0.0
-    # via -r dev.in
+    # via -r requirements/dev.in
 jupyter-client==7.3.5
     # via
     #   ipykernel
@@ -76,7 +84,7 @@ jupyter-console==6.4.4
     # via jupyter
 jupyter-core==5.1.3
     # via
-    #   -r dev.in
+    #   -r requirements/dev.in
     #   jupyter-client
     #   nbconvert
     #   nbformat
@@ -134,7 +142,9 @@ pexpect==4.8.0
 pickleshare==0.7.5
     # via ipython
 pip-tools==6.12.1
-    # via -r dev.in
+    # via -r requirements/dev.in
+pkgutil-resolve-name==1.3.10
+    # via jsonschema
 platformdirs==2.5.4
     # via jupyter-core
 prometheus-client==0.14.1
@@ -190,6 +200,10 @@ terminado==0.15.0
     # via notebook
 tinycss2==1.1.1
     # via nbconvert
+tomli==2.0.1
+    # via
+    #   build
+    #   pep517
 tornado==6.2
     # via
     #   ipykernel
@@ -217,10 +231,14 @@ webencodings==0.5.1
     #   tinycss2
 wheel==0.38.4
     # via
-    #   -r dev.in
+    #   -r requirements/dev.in
     #   pip-tools
 widgetsnbextension==4.0.3
     # via ipywidgets
+zipp==3.11.0
+    # via
+    #   importlib-metadata
+    #   importlib-resources
 
 # The following packages are considered to be unsafe in a requirements file:
 # pip

diff --git a/requirements/test.txt b/requirements/test.txt
@@ -7,39 +7,39 @@
 attrs==22.1.0
     # via pytest
 black==22.12.0
-    # via -r test.in
+    # via -r requirements/test.in
 certifi==2022.12.7
     # via
-    #   -r test.in
+    #   -r requirements/test.in
     #   requests
 charset-normalizer==2.1.1
     # via requests
 click==8.1.3
     # via
-    #   -r test.in
+    #   -r requirements/test.in
     #   black
 coverage[toml]==6.4.4
     # via
-    #   -r test.in
+    #   -r requirements/test.in
     #   pytest-cov
 flake8==5.0.4
-    # via -r test.in
+    # via -r requirements/test.in
 idna==3.4
     # via
     #   requests
     #   yarl
 iniconfig==1.1.1
     # via pytest
 label-studio-sdk==0.0.15
-    # via -r test.in
+    # via -r requirements/test.in
 lxml==4.9.1
     # via label-studio-sdk
 mccabe==0.7.0
     # via flake8
 multidict==6.0.2
     # via yarl
 mypy==0.991
-    # via -r test.in
+    # via -r requirements/test.in
 mypy-extensions==0.4.3
     # via
     #   black
@@ -65,23 +65,28 @@ pyparsing==3.0.9
 pytest==7.1.3
     # via pytest-cov
 pytest-cov==4.0.0
-    # via -r test.in
+    # via -r requirements/test.in
 pyyaml==6.0
     # via vcrpy
 requests==2.28.1
     # via label-studio-sdk
 six==1.16.0
     # via vcrpy
 tomli==2.0.1
-    # via pytest
+    # via
+    #   black
+    #   coverage
+    #   mypy
+    #   pytest
 typing-extensions==4.3.0
     # via
+    #   black
     #   mypy
     #   pydantic
 urllib3==1.26.12
     # via requests
 vcrpy==4.2.1
-    # via -r test.in
+    # via -r requirements/test.in
 wrapt==1.14.1
     # via vcrpy
 yarl==1.8.1

diff --git a/test_unstructured/partition/test_email.py b/test_unstructured/partition/test_email.py
@@ -3,7 +3,7 @@
 import pathlib
 import pytest
 
-from unstructured.documents.elements import NarrativeText, Title, ListItem
+from unstructured.documents.elements import NarrativeText, Title, ListItem, Image
 from unstructured.documents.email_elements import (
     MetaData,
     Recipient,
@@ -27,6 +27,15 @@
     ListItem(text="Violets are blue"),
 ]
 
+# Elements expected when fake-email-image-embedded.eml is partitioned as text/plain;
+# compared against in test_partition_email_from_filename_with_embedded_image.
+IMAGE_EXPECTED_OUTPUT = [
+    NarrativeText(text="This is a test email to use for unit tests."),
+    Title(text="Important points:"),
+    NarrativeText(text="hello this is our logo."),
+    Image(text="unstructured_logo.png"),
+    ListItem(text="Roses are red"),
+    ListItem(text="Violets are blue"),
+]
+
 HEADER_EXPECTED_OUTPUT = [
     MetaData(name="MIME-Version", text="1.0"),
     MetaData(name="Date", text="Fri, 16 Dec 2022 17:04:16 -0500"),
@@ -97,6 +106,13 @@ def test_partition_email_from_text():
     assert elements == EXPECTED_OUTPUT
 
 
+def test_partition_email_from_filename_with_embedded_image():
+    """The embedded-image fixture partitions into the expected elements."""
+    example_docs = os.path.join(DIRECTORY, "..", "..", "example-docs")
+    eml_path = os.path.join(example_docs, "fake-email-image-embedded.eml")
+    partitioned = partition_email(filename=eml_path, content_source="text/plain")
+    assert len(partitioned) > 0
+    assert partitioned == IMAGE_EXPECTED_OUTPUT
+
+
 def test_partition_email_header():
     filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "fake-email.eml")
     with open(filename, "r") as f:

diff --git a/unstructured/documents/elements.py b/unstructured/documents/elements.py
@@ -72,3 +72,11 @@ class Title(Text):
     category = "Title"
 
     pass
+
+
+class Image(Text):
+    """Text element that records metadata about an image."""
+
+    category = "Image"
diff --git a/unstructured/partition/email.py b/unstructured/partition/email.py
@@ -2,7 +2,7 @@
 import sys
 import re
 from email.message import Message
-from typing import Dict, IO, List, Optional, Tuple
+from typing import Dict, IO, List, Optional, Tuple, Union
 
 if sys.version_info < (3, 8):
     from typing_extensions import Final
@@ -24,7 +24,7 @@
     ReceivedInfo,
     MetaData,
 )
-from unstructured.documents.elements import Element, Text
+from unstructured.documents.elements import Element, Text, Image, NarrativeText, Title
 from unstructured.partition.html import partition_html
 from unstructured.partition.text import split_by_paragraph, partition_text
 
@@ -113,6 +113,25 @@ def extract_attachment_info(
     return list_attachments
 
 
+# Matches one inline image marker such as "[image: logo.png]". The name part stops at
+# the first closing bracket (and never crosses a newline) so that two markers on the
+# same line are not merged into a single match.
+EMBEDDED_IMAGE_PATTERN = re.compile(r"\[image: [^\]\n]+\]")
+
+
+def has_embedded_image(element) -> Optional[re.Match]:
+    """Search an element's text for an embedded image marker.
+
+    ``element`` is any object exposing a ``text`` string attribute.
+
+    Returns the match object for the first ``[image: <name>]`` marker found in
+    ``element.text``, or ``None`` when the text contains no marker. The result is
+    therefore usable both as a truth value and as the source of the marker offsets.
+    """
+    return EMBEDDED_IMAGE_PATTERN.search(element.text)
+
+
+def find_embedded_image(
+    element: Union[NarrativeText, Title], indices: re.Match
+) -> Tuple[Element, Element]:
+    """Split an embedded ``[image: <name>]`` marker out of a text element.
+
+    ``indices`` is the match object locating the marker inside ``element.text``
+    (as returned by ``has_embedded_image``).
+
+    Returns a tuple ``(image, element)``: a new ``Image`` whose text is the image name
+    with extra whitespace cleaned, and the same ``element`` object, mutated in place so
+    that its text no longer contains the marker.
+    """
+    prefix = "[image:"
+    text = element.text
+    start = indices.start()
+
+    # Close the marker at the first "]" inside the matched span, so a match that
+    # happens to run across several brackets still yields only the first image name.
+    close = text.find("]", start + len(prefix), indices.end())
+    end = indices.end() if close == -1 else close + 1
+
+    # Take the name by position rather than splitting on ":", which would truncate
+    # names that themselves contain a colon (e.g. "cid:logo.png").
+    name_start = start + len(prefix)
+    name_end = end - 1
+    image_name = clean_extra_whitespace(text[name_start:name_end])
+
+    # Remove the marker by offset; rebuilding it from the cleaned name for
+    # str.replace would miss markers written with irregular whitespace.
+    element.text = text[:start] + text[end:]
+
+    return Image(text=image_name), element
+
+
 def partition_email(
     filename: Optional[str] = None,
     file: Optional[IO] = None,
@@ -171,7 +190,7 @@ def partition_email(
         raise ValueError(f"{content_source} content not found in email")
 
     # NOTE(robinson) - In the .eml files, the HTML content gets stored in a format that
-    # looks like the following, resulting in extraneous "=" chracters in the output if
+    # looks like the following, resulting in extraneous "=" characters in the output if
     # you don't clean it up
     # <ul> =
     #    <li>Item 1</li>=
@@ -188,6 +207,13 @@ def partition_email(
     elif content_source == "text/plain":
         elements = partition_text(text=content)
 
+    for idx, element in enumerate(elements):
+        indices = has_embedded_image(element)
+        if (isinstance(element, NarrativeText) or isinstance(element, Title)) and indices:
+            image_info, clean_element = find_embedded_image(element, indices)
+            elements[idx] = clean_element
+            elements.insert(idx + 1, image_info)
+
     header: List[Element] = list()
     if include_headers:
         header = partition_email_header(msg)