Skip to content

Commit

Permalink
feat: Add extract_attachment_info (Unstructured-IO#112)
Browse files Browse the repository at this point in the history
* Adds function to extract attachments and their metadata from eml files
  • Loading branch information
mallorih authored Jan 3, 2023
1 parent 4567357 commit 509ad49
Show file tree
Hide file tree
Showing 6 changed files with 118 additions and 4 deletions.
4 changes: 3 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,10 +1,12 @@
## 0.3.5-dev3
## 0.3.5-dev4

* Add new pattern to recognize plain text dash bullets
* Add test for bullet patterns
* Fix for `partition_html` that allows for processing `div` tags that have both text and child
elements
* Add ability to extract document metadata from `.docx`, `.xlsx`, and `.jpg` files.
* Add new function `extract_attachment_info` that extracts and decodes the attachments
of an email.

## 0.3.4

Expand Down
17 changes: 17 additions & 0 deletions docs/source/bricks.rst
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,23 @@ Examples:
text = f.read()
elements = partition_email(text=text)
``extract_attachment_info``
---------------------------

The ``extract_attachment_info`` function takes an ``email.message.Message`` object
as input and returns a list of dictionaries containing the attachment information,
such as ``filename``, ``size``, ``payload``, etc. Each attachment is also saved to
``output_dir`` if it is specified.

.. code:: python
import email
from unstructured.partition.email import extract_attachment_info
with open("example-docs/fake-email-attachment.eml", "r") as f:
msg = email.message_from_file(f)
attachment_info = extract_attachment_info(msg, output_dir="example-docs")
``is_bulleted_text``
----------------------
Expand Down
50 changes: 50 additions & 0 deletions example-docs/fake-email-attachment.eml
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
MIME-Version: 1.0
Date: Fri, 23 Dec 2022 12:08:48 -0600
Message-ID: <CAPgNNXSzLVJ-d1OCX_TjFgJU7ugtQrjFybPtAMmmYZzphxNFYg@mail.gmail.com>
Subject: Fake email with attachment
From: Mallori Harrell <mallori@unstructured.io>
To: Mallori Harrell <mallori@unstructured.io>
Content-Type: multipart/mixed; boundary="0000000000005d654405f082adb7"

--0000000000005d654405f082adb7
Content-Type: multipart/alternative; boundary="0000000000005d654205f082adb5"
--0000000000005d654205f082adb5
Content-Type: text/plain; charset="UTF-8"
Hello!
Here's the attachments!
It includes:
- Lots of whitespace
- Little to no content
- and is a quick read
Best,
Mallori
--0000000000005d654205f082adb5
Content-Type: text/html; charset="UTF-8"
Content-Transfer-Encoding: quoted-printable

<div dir=3D"ltr">Hello!=C2=A0<div><br></div><div>Here&#39;s the attachments=
!</div><div><br></div><div>It includes:</div><div><ul><li style=3D"margin-l=
eft:15px">Lots of whitespace</li><li style=3D"margin-left:15px">Little=C2=
=A0to no content</li><li style=3D"margin-left:15px">and is a quick read</li=
></ul><div>Best,</div></div><div><br></div><div>Mallori</div><div dir=3D"lt=
r" class=3D"gmail_signature" data-smartmail=3D"gmail_signature"><div dir=3D=
"ltr"><div><div><br></div></div></div></div></div>

--0000000000005d654205f082adb5--
--0000000000005d654405f082adb7
Content-Type: text/plain; charset="US-ASCII"; name="fake-attachment.txt"
Content-Disposition: attachment; filename="fake-attachment.txt"
Content-Transfer-Encoding: base64
X-Attachment-Id: f_lc0tto5j0
Content-ID: <f_lc0tto5j0>
SGV5IHRoaXMgaXMgYSBmYWtlIGF0dGFjaG1lbnQh
--0000000000005d654405f082adb7--
16 changes: 15 additions & 1 deletion test_unstructured/partition/test_email.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,10 @@
import email
import os
import pathlib
import pytest

from unstructured.documents.elements import NarrativeText, Title, ListItem
from unstructured.partition.email import partition_email
from unstructured.partition.email import partition_email, extract_attachment_info


DIRECTORY = pathlib.Path(__file__).parent.resolve()
Expand All @@ -16,6 +17,10 @@
ListItem(text="Violets are blue"),
]

# Expected result of ``extract_attachment_info`` for
# example-docs/fake-email-attachment.eml: a single entry holding the
# attachment's filename and its decoded payload as bytes.
ATTACH_EXPECTED_OUTPUT = [
{"filename": "fake-attachment.txt", "payload": b"Hey this is a fake attachment!"}
]


def test_partition_email_from_filename():
filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "fake-email.eml")
Expand All @@ -41,6 +46,15 @@ def test_partition_email_from_text():
assert elements == EXPECTED_OUTPUT


def test_extract_attachment_info():
    """The sample email yields its attachment's filename and decoded payload."""
    eml_path = os.path.join(
        DIRECTORY, "..", "..", "example-docs", "fake-email-attachment.eml"
    )
    with open(eml_path, "r") as eml_file:
        parsed_message = email.message_from_file(eml_file)

    extracted = extract_attachment_info(parsed_message)

    # At least one attachment must be found, and it must match the fixture.
    assert len(extracted) > 0
    assert extracted == ATTACH_EXPECTED_OUTPUT


def test_partition_email_raises_with_none_specified():
with pytest.raises(ValueError):
partition_email()
Expand Down
2 changes: 1 addition & 1 deletion unstructured/__version__.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = "0.3.5-dev3" # pragma: no cover
__version__ = "0.3.5-dev4" # pragma: no cover
33 changes: 32 additions & 1 deletion unstructured/partition/email.py
Original file line number Diff line number Diff line change
@@ -1,20 +1,51 @@
import email
import sys
from email.message import Message
from typing import Dict, IO, List, Optional

if sys.version_info < (3, 8):
from typing_extensions import Final
else:
from typing import Final

from unstructured.cleaners.core import replace_mime_encodings
from unstructured.cleaners.core import replace_mime_encodings, clean_extra_whitespace
from unstructured.documents.elements import Element, Text
from unstructured.partition.html import partition_html


# MIME content types accepted as content sources; only "text/html" is listed.
# NOTE(review): presumably consulted by partition_email when choosing which
# part of the message to parse -- confirm against its body.
VALID_CONTENT_SOURCES: Final[List[str]] = ["text/html"]


def extract_attachment_info(
    message: Message, output_dir: Optional[str] = None
) -> List[Dict[str, str]]:
    """Collect Content-Disposition metadata and payload for each message part.

    Walks every part of ``message``. For each part that carries a
    ``Content-Disposition`` header, a separate dictionary is built from the
    header's parameters (e.g. ``filename``, ``size``) plus a ``payload``
    entry holding the transfer-decoded content of that part.

    The header is parsed with the standard library's parameter parser, so
    quoted values containing ``;`` or ``=`` and RFC 2231 encoded values are
    handled. Parameter names are lowercased by that parser. Runs of
    whitespace in a value (including header folding) collapse to one space.

    Args:
        message: The parsed email to inspect.
        output_dir: Optional path of an existing directory. When given, every
            collected part that has both a filename and a payload is written
            there. Only the final path component of the filename is used, so
            a crafted filename such as ``../../x`` cannot escape
            ``output_dir``.

    Returns:
        One dictionary per part with a ``Content-Disposition`` header, in
        walk order. Despite the annotation, ``payload`` is ``bytes`` (or
        ``None`` for multipart containers), not ``str``.
    """
    # Function-scope imports: the module does not import these names at the
    # top level, and this keeps the function self-contained.
    import os
    from email.utils import collapse_rfc2231_value

    list_attachments = []
    for part in message.walk():
        params = part.get_params(header="content-disposition")
        if params is None:
            # No Content-Disposition header on this part.
            continue

        # A fresh dict per part; sharing one dict would make every list entry
        # alias the data of the last attachment.
        attachment_info = {}
        # params[0] is the disposition type itself ("attachment", "inline",
        # ...), not a key/value parameter, so it is skipped.
        for key, value in params[1:]:
            if isinstance(value, tuple):
                # RFC 2231 encoded value: (charset, language, text).
                value = collapse_rfc2231_value(value)
            attachment_info[key] = " ".join(value.split())
        attachment_info["payload"] = part.get_payload(decode=True)
        list_attachments.append(attachment_info)

    if output_dir:
        for attachment in list_attachments:
            filename = attachment.get("filename")
            payload = attachment["payload"]
            if not filename or payload is None:
                # Nothing that can be written for this part.
                continue
            # The filename comes from the (untrusted) email: drop any
            # directory components, whichever separator was used.
            safe_name = os.path.basename(filename.replace("\\", "/"))
            if safe_name in ("", ".", ".."):
                continue
            with open(os.path.join(output_dir, safe_name), "wb") as f:
                # mypy wants to just use `w` when opening the file but this
                # causes an error since the payloads are bytes not str
                f.write(payload)  # type: ignore
    return list_attachments


def partition_email(
filename: Optional[str] = None,
file: Optional[IO] = None,
Expand Down

0 comments on commit 509ad49

Please sign in to comment.