From 08ccee0acba1f116a66b45c786dd289b09d81294 Mon Sep 17 00:00:00 2001 From: Mallori Harrell <6825104+mallorih@users.noreply.github.com> Date: Tue, 17 Jan 2023 16:36:44 -0600 Subject: [PATCH] chore: Fix parse received data (#143) * fix parse_received data --- CHANGELOG.md | 6 ++--- example-docs/fake-email-header.eml | 28 ++++++++++++++++++++ test_unstructured/partition/test_email.py | 31 +++++++++++++++++++++-- unstructured/__version__.py | 2 +- unstructured/documents/email_elements.py | 19 +++++++++----- unstructured/partition/email.py | 5 +--- 6 files changed, 75 insertions(+), 16 deletions(-) create mode 100644 example-docs/fake-email-header.eml diff --git a/CHANGELOG.md b/CHANGELOG.md index 7fc3e64eb9..f48d971423 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,13 +1,13 @@ -## 0.4.3-dev0 +## 0.4.3-dev1 * Fix in `exceeds_cap_ratio` so the function doesn't break with empty text +* Fix bug in `_parse_received_data`. ## 0.4.2 * Added `partition_image` to process documents in an image format. * Fixed utf-8 encoding error in `partition_email` with attachments for `text/html` - ## 0.4.1 * Added support for text files in the `partition` function @@ -40,7 +40,7 @@ elements * Add ability to extract document metadata from `.docx`, `.xlsx`, and `.jpg` files. * Helper functions for identifying and extracting phone numbers -* Add new function `extract_attachment_info` that extracts and decode the attachment +* Add new function `extract_attachment_info` that extracts and decodes the attachment of an email. * Staging brick to convert a list of `Element`s to a `pandas` dataframe. 
* Add plain text functionality to `partition_email` diff --git a/example-docs/fake-email-header.eml b/example-docs/fake-email-header.eml new file mode 100644 index 0000000000..b8d188ff5f --- /dev/null +++ b/example-docs/fake-email-header.eml @@ -0,0 +1,28 @@ +Received: from ABCDEFG-000.ABC.guide (00.0.0.00) by ABCDEFG-000.ABC.guide + ([ba23::58b5:2236:45g2:88h2]) with Unstructured TTTT Server (version=ABC0_0, + cipher=ABC_ABCDE_ABC_NOPE_ABC_000_ABC_ABC000) id 00.0.000.0 via Techbox + Transport; Wed, 20 Feb 2023 10:03:18 +1200 +MIME-Version: 1.0 +Date: Fri, 16 Dec 2022 17:04:16 -0500 +Message-ID: +Subject: Test Email +From: Matthew Robinson <mrobinson@unstructured.io> +To: Matthew Robinson <mrobinson@unstructured.io> +Content-Type: multipart/alternative; boundary="00000000000095c9b205eff92630" + +--00000000000095c9b205eff92630 +Content-Type: text/plain; charset="UTF-8" + +This is a test email to use for unit tests. + +Important points: + + - Roses are red + - Violets are blue + +--00000000000095c9b205eff92630 +Content-Type: text/html; charset="UTF-8" + +
This is a test email to use for unit tests.

Important points:
  • Roses are red
  • Violets are blue
+ +--00000000000095c9b205eff92630-- \ No newline at end of file diff --git a/test_unstructured/partition/test_email.py b/test_unstructured/partition/test_email.py index 282e58995c..d0bdf21549 100644 --- a/test_unstructured/partition/test_email.py +++ b/test_unstructured/partition/test_email.py @@ -1,14 +1,17 @@ +import datetime import email import os import pathlib import pytest + from unstructured.documents.elements import NarrativeText, Title, ListItem, Image from unstructured.documents.email_elements import ( MetaData, Recipient, Sender, Subject, + ReceivedInfo, ) from unstructured.partition.email import ( extract_attachment_info, @@ -36,6 +39,30 @@ ListItem(text="Violets are blue"), ] +RECEIVED_HEADER_OUTPUT = [ + ReceivedInfo(name="ABCDEFG-000.ABC.guide", text="00.0.0.00"), + ReceivedInfo(name="ABCDEFG-000.ABC.guide", text="ba23::58b5:2236:45g2:88h2"), + ReceivedInfo( + name="received_datetimetz", + text="2023-02-20 10:03:18+12:00", + datestamp=datetime.datetime( + 2023, 2, 20, 10, 3, 18, tzinfo=datetime.timezone(datetime.timedelta(seconds=43200)) + ), + ), + MetaData(name="MIME-Version", text="1.0"), + MetaData(name="Date", text="Fri, 16 Dec 2022 17:04:16 -0500"), + MetaData( + name="Message-ID", + text="", + ), + Subject(text="Test Email"), + Sender(name="Matthew Robinson", text="mrobinson@unstructured.io"), + Recipient(name="Matthew Robinson", text="mrobinson@unstructured.io"), + MetaData( + name="Content-Type", text='multipart/alternative; boundary="00000000000095c9b205eff92630"' + ), +] + HEADER_EXPECTED_OUTPUT = [ MetaData(name="MIME-Version", text="1.0"), MetaData(name="Date", text="Fri, 16 Dec 2022 17:04:16 -0500"), @@ -114,12 +141,12 @@ def test_partition_email_from_filename_with_embedded_image(): def test_partition_email_header(): - filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "fake-email.eml") + filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "fake-email-header.eml") with open(filename, "r") as f: msg = 
email.message_from_file(f) elements = partition_email_header(msg) assert len(elements) > 0 - assert elements == HEADER_EXPECTED_OUTPUT + assert elements == RECEIVED_HEADER_OUTPUT def test_extract_email_text_matches_html(): diff --git a/unstructured/__version__.py b/unstructured/__version__.py index 33c15e450b..4b2a248573 100644 --- a/unstructured/__version__.py +++ b/unstructured/__version__.py @@ -1 +1 @@ -__version__ = "0.4.3-dev0" # pragma: no cover +__version__ = "0.4.3-dev1" # pragma: no cover diff --git a/unstructured/documents/email_elements.py b/unstructured/documents/email_elements.py index d44863d485..422e093154 100644 --- a/unstructured/documents/email_elements.py +++ b/unstructured/documents/email_elements.py @@ -5,6 +5,12 @@ from unstructured.documents.elements import Element, Text, NoID +class NoDatestamp(ABC): + """Class to indicate that an element does not have a datetime stamp.""" + + pass + + class EmailElement(Element): """An email element is a section of the email.""" @@ -20,12 +26,11 @@ def __init__( self, name: str, text: str, + datestamp: Union[datetime, NoDatestamp] = NoDatestamp(), element_id: Union[str, NoID] = NoID(), ): self.name: str = name self.text: str = text - self.datestamp: datetime - self.has_datestamp: bool = False if isinstance(element_id, NoID): # NOTE(robinson) - Cut the SHA256 hex in half to get the first 128 bits @@ -33,15 +38,17 @@ def __init__( super().__init__(element_id=element_id) - def set_datestamp(self, datestamp: datetime): - self.datestamp = datestamp - self.has_datestamp = True + if isinstance(datestamp, datetime): + self.datestamp: datetime = datestamp + + def has_datestamp(self): + return hasattr(self, "datestamp") def __str__(self): return f"{self.name}: {self.text}" def __eq__(self, other): - if self.has_datestamp: + if self.has_datestamp(): return ( self.name == other.name and self.text == other.text diff --git a/unstructured/partition/email.py b/unstructured/partition/email.py index 362793bc4a..c6dca905da 
100644 --- a/unstructured/partition/email.py +++ b/unstructured/partition/email.py @@ -47,11 +47,8 @@ def _parse_received_data(data: str) -> List[Element]: elements.append(ReceivedInfo(name="mapi_id", text=mapi_id[0])) if datetimetz: elements.append( - ReceivedInfo(name="received_datetimetz", text=str(datetimetz)).set_datestamp( - datestamp=datetimetz - ) + ReceivedInfo(name="received_datetimetz", text=str(datetimetz), datestamp=datetimetz) ) - return elements