Skip to content

Commit

Permalink
chore: Fix parse received data (Unstructured-IO#143)
Browse files Browse the repository at this point in the history
* fix parse_received data
  • Loading branch information
mallorih authored Jan 17, 2023
1 parent 749f9c6 commit 08ccee0
Show file tree
Hide file tree
Showing 6 changed files with 75 additions and 16 deletions.
6 changes: 3 additions & 3 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,13 +1,13 @@
## 0.4.3-dev0
## 0.4.3-dev1

* Fix in `exceeds_cap_ratio` so the function doesn't break with empty text
* Fix bug in `_parse_received_data`.

## 0.4.2

* Added `partition_image` to process documents in an image format.
* Fixed utf-8 encoding error in `partition_email` with attachments for `text/html`


## 0.4.1

* Added support for text files in the `partition` function
Expand Down Expand Up @@ -40,7 +40,7 @@
elements
* Add ability to extract document metadata from `.docx`, `.xlsx`, and `.jpg` files.
* Helper functions for identifying and extracting phone numbers
* Add new function `extract_attachment_info` that extracts and decode the attachment
* Add new function `extract_attachment_info` that extracts and decodes the attachment
of an email.
* Staging brick to convert a list of `Element`s to a `pandas` dataframe.
* Add plain text functionality to `partition_email`
Expand Down
28 changes: 28 additions & 0 deletions example-docs/fake-email-header.eml
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
Received: from ABCDEFG-000.ABC.guide (00.0.0.00) by ABCDEFG-000.ABC.guide
([ba23::58b5:2236:45g2:88h2]) with Unstructured TTTT Server (version=ABC0_0,
cipher=ABC_ABCDE_ABC_NOPE_ABC_000_ABC_ABC000) id 00.0.000.0 via Techbox
Transport; Wed, 20 Feb 2023 10:03:18 +1200
MIME-Version: 1.0
Date: Fri, 16 Dec 2022 17:04:16 -0500
Message-ID: <CADc-_xaLB2FeVQ7mNsoX+NJb_7hAJhBKa_zet-rtgPGenj0uVw@mail.gmail.com>
Subject: Test Email
From: Matthew Robinson <mrobinson@unstructured.io>
To: Matthew Robinson <mrobinson@unstructured.io>
Content-Type: multipart/alternative; boundary="00000000000095c9b205eff92630"

--00000000000095c9b205eff92630
Content-Type: text/plain; charset="UTF-8"
This is a test email to use for unit tests.
Important points:
- Roses are red
- Violets are blue
--00000000000095c9b205eff92630
Content-Type: text/html; charset="UTF-8"

<div dir="ltr"><div>This is a test email to use for unit tests.</div><div><br></div><div>Important points:</div><div><ul><li>Roses are red</li><li>Violets are blue</li></ul></div></div>

--00000000000095c9b205eff92630--
31 changes: 29 additions & 2 deletions test_unstructured/partition/test_email.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,17 @@
import datetime
import email
import os
import pathlib
import pytest


from unstructured.documents.elements import NarrativeText, Title, ListItem, Image
from unstructured.documents.email_elements import (
MetaData,
Recipient,
Sender,
Subject,
ReceivedInfo,
)
from unstructured.partition.email import (
extract_attachment_info,
Expand Down Expand Up @@ -36,6 +39,30 @@
ListItem(text="Violets are blue"),
]

# Expected result of `partition_email_header` for example-docs/fake-email-header.eml.
# The first three elements come from the `Received` header: the two host/address
# pairs and the timestamp (its +1200 offset is 43200 seconds). The remaining
# elements mirror the other headers in the order they appear in the fixture.
RECEIVED_HEADER_OUTPUT = [
ReceivedInfo(name="ABCDEFG-000.ABC.guide", text="00.0.0.00"),
ReceivedInfo(name="ABCDEFG-000.ABC.guide", text="ba23::58b5:2236:45g2:88h2"),
ReceivedInfo(
name="received_datetimetz",
text="2023-02-20 10:03:18+12:00",
datestamp=datetime.datetime(
2023, 2, 20, 10, 3, 18, tzinfo=datetime.timezone(datetime.timedelta(seconds=43200))
),
),
MetaData(name="MIME-Version", text="1.0"),
MetaData(name="Date", text="Fri, 16 Dec 2022 17:04:16 -0500"),
MetaData(
name="Message-ID",
text="<CADc-_xaLB2FeVQ7mNsoX+NJb_7hAJhBKa_zet-rtgPGenj0uVw@mail.gmail.com>",
),
Subject(text="Test Email"),
Sender(name="Matthew Robinson", text="mrobinson@unstructured.io"),
Recipient(name="Matthew Robinson", text="mrobinson@unstructured.io"),
MetaData(
name="Content-Type", text='multipart/alternative; boundary="00000000000095c9b205eff92630"'
),
]

HEADER_EXPECTED_OUTPUT = [
MetaData(name="MIME-Version", text="1.0"),
MetaData(name="Date", text="Fri, 16 Dec 2022 17:04:16 -0500"),
Expand Down Expand Up @@ -114,12 +141,12 @@ def test_partition_email_from_filename_with_embedded_image():


def test_partition_email_header():
filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "fake-email.eml")
filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "fake-email-header.eml")
with open(filename, "r") as f:
msg = email.message_from_file(f)
elements = partition_email_header(msg)
assert len(elements) > 0
assert elements == HEADER_EXPECTED_OUTPUT
assert elements == RECEIVED_HEADER_OUTPUT


def test_extract_email_text_matches_html():
Expand Down
2 changes: 1 addition & 1 deletion unstructured/__version__.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = "0.4.3-dev0" # pragma: no cover
__version__ = "0.4.3-dev1" # pragma: no cover
19 changes: 13 additions & 6 deletions unstructured/documents/email_elements.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,12 @@
from unstructured.documents.elements import Element, Text, NoID


class NoDatestamp(ABC):
    """Sentinel type indicating that an element does not have a datetime stamp.

    An instance is used as the default for a `datestamp` argument so that
    "no datestamp supplied" can be told apart from a real `datetime` value.
    """


class EmailElement(Element):
"""An email element is a section of the email."""

Expand All @@ -20,28 +26,29 @@ def __init__(
self,
name: str,
text: str,
datestamp: Union[datetime, NoDatestamp] = NoDatestamp(),
element_id: Union[str, NoID] = NoID(),
):
self.name: str = name
self.text: str = text
self.datestamp: datetime
self.has_datestamp: bool = False

if isinstance(element_id, NoID):
# NOTE(robinson) - Cut the SHA256 hex in half to get the first 128 bits
element_id = hashlib.sha256(text.encode()).hexdigest()[:32]

super().__init__(element_id=element_id)

def set_datestamp(self, datestamp: datetime):
self.datestamp = datestamp
self.has_datestamp = True
if isinstance(datestamp, datetime):
self.datestamp: datetime = datestamp

def has_datestamp(self) -> bool:
    """Return True if a datestamp has been set on this element.

    The `datestamp` attribute is only assigned when a real `datetime` is
    supplied, so its presence on the instance is the signal. The previous
    check, `"self.datestamp" in globals()`, looked for a module-level name
    spelled "self.datestamp" and was therefore always False.
    """
    return hasattr(self, "datestamp")

def __str__(self):
return f"{self.name}: {self.text}"

def __eq__(self, other):
if self.has_datestamp:
if self.has_datestamp():
return (
self.name == other.name
and self.text == other.text
Expand Down
5 changes: 1 addition & 4 deletions unstructured/partition/email.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,11 +47,8 @@ def _parse_received_data(data: str) -> List[Element]:
elements.append(ReceivedInfo(name="mapi_id", text=mapi_id[0]))
if datetimetz:
elements.append(
ReceivedInfo(name="received_datetimetz", text=str(datetimetz)).set_datestamp(
datestamp=datetimetz
)
ReceivedInfo(name="received_datetimetz", text=str(datetimetz), datestamp=datetimetz)
)

return elements


Expand Down

0 comments on commit 08ccee0

Please sign in to comment.