Skip to content

feat: Added EmailElement for email documents #103

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 41 commits into from
Dec 21, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
41 commits
Select commit Hold shift + click to select a range
eb73930
new data structure, updated CHANGELOG and __version__.py
Dec 16, 2022
19beabd
removed code
Dec 16, 2022
7465689
adding tests
Dec 16, 2022
daae379
added test
Dec 16, 2022
9fa901a
fixed import statement
Dec 16, 2022
a7106d5
linter
Dec 16, 2022
2b5a6ce
remove unused import statement
Dec 16, 2022
b0ea522
fixed syntax
Dec 16, 2022
909b975
added Email class
Dec 16, 2022
6730fcf
merge conflicts
Dec 19, 2022
cc27de8
changelog and new partition function
Dec 19, 2022
47cb1ca
partition text
Dec 19, 2022
3799c6b
remove partition_text
Dec 19, 2022
e9ed6b4
updated comments and added attachment name variable
Dec 19, 2022
fe9bea7
updated changelog
Dec 19, 2022
f15048b
linter
Dec 19, 2022
56d10ef
formatting issues
Dec 19, 2022
3389a1b
version
Dec 19, 2022
1b7815f
Merge branch 'main' into email-element
mallorih Dec 19, 2022
ce05243
changed attachment
Dec 21, 2022
095de59
Merge branch 'email-element' of https://github.com/Unstructured-IO/un…
Dec 21, 2022
310fdaf
new data structure, updated CHANGELOG and __version__.py
Dec 16, 2022
5181591
removed code
Dec 16, 2022
1b65c3f
adding tests
Dec 16, 2022
32eaada
added test
Dec 16, 2022
d358ccd
fixed import statement
Dec 16, 2022
572be70
linter
Dec 16, 2022
65f110f
remove unused import statement
Dec 16, 2022
e582c2c
fixed syntax
Dec 16, 2022
bdd41a2
added Email class
Dec 16, 2022
f53c812
changelog and new partition function
Dec 19, 2022
9365781
partition text
Dec 19, 2022
cd3c238
remove partition_text
Dec 19, 2022
75cb27c
updated comments and added attachment name variable
Dec 19, 2022
5466a08
updated changelog
Dec 19, 2022
bf80f08
linter
Dec 19, 2022
f128fea
formatting issues
Dec 19, 2022
ea7ad94
changed attachment
Dec 21, 2022
42362ed
merge conflicts
Dec 21, 2022
764187f
Merge branch 'main' of https://github.com/Unstructured-IO/unstructure…
Dec 21, 2022
bf9875c
changelog and version
Dec 21, 2022
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
## 0.3.5-dev1
## 0.3.5-dev2

* Add new pattern to recognize plain text dash bullets
* Add test for bullet patterns
Expand All @@ -15,6 +15,7 @@
* Adds the `partition_email` partitioning brick
* Adds the `replace_mime_encodings` cleaning bricks
* Small fix to HTML parsing related to processing list items with sub-tags
* Add `EmailElement` data structure to store email documents

## 0.3.2

Expand Down
44 changes: 44 additions & 0 deletions test_unstructured/documents/test_email_elements.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
from functools import partial
import pytest

from unstructured.cleaners.core import clean_prefix
from unstructured.cleaners.translate import translate_text
from unstructured.documents.email_elements import EmailElement, NoID, Name


def test_text_id():
    """A Name created without an explicit element id gets the expected hash id."""
    expected_id = "c69509590d81db2f37f9d75480c8efed"
    element = Name(name="Example", text="hello there!")
    assert element.id == expected_id


def test_element_defaults_to_blank_id():
    """An EmailElement built with no arguments carries a NoID instance as its id."""
    blank_element = EmailElement()
    assert isinstance(blank_element.id, NoID)


def test_text_element_apply_cleaners():
    """A single cleaner passed to apply() is run on both the name and the text."""
    strip_bracket_prefix = partial(clean_prefix, pattern=r"\[\d{1,2}\]")
    name_element = Name(name="[2] Example docs", text="[1] A Textbook on Crocodile Habitats")

    name_element.apply(strip_bracket_prefix)

    rendered = str(name_element)
    assert rendered == "Example docs: A Textbook on Crocodile Habitats"


def test_name_element_apply_multiple_cleaners():
    """Several cleaners passed to apply() run in order on both the name and the text."""
    strip_bracket_prefix = partial(clean_prefix, pattern=r"\[\d{1,2}\]")
    to_russian = partial(translate_text, target_lang="ru")
    name_element = Name(
        name="[1] A Textbook on Crocodile Habitats",
        text="[1] A Textbook on Crocodile Habitats",
    )

    # Prefix stripping happens first, then translation of the stripped string.
    name_element.apply(strip_bracket_prefix, to_russian)

    rendered = str(name_element)
    assert (
        rendered
        == "Учебник по крокодильным средам обитания: Учебник по крокодильным средам обитания"
    )


def test_apply_raises_if_func_does_not_produce_string():
    """apply() raises ValueError when a cleaner returns something other than a string."""

    def returns_int(_text):
        return 1

    name_element = Name(name="Example docs", text="[1] A Textbook on Crocodile Habitats")
    with pytest.raises(ValueError):
        name_element.apply(returns_int)
2 changes: 1 addition & 1 deletion unstructured/__version__.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
# Removed line of the diff: the development version before this change.
__version__ = "0.3.5-dev1" # pragma: no cover
# Added line of the diff: the bumped development version (matches the new CHANGELOG heading).
__version__ = "0.3.5-dev2" # pragma: no cover
150 changes: 150 additions & 0 deletions unstructured/documents/email_elements.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,150 @@
from abc import ABC
import hashlib
from typing import Callable, List, Union
from unstructured.documents.elements import Element, Text, NoID


class EmailElement(Element):
    """Represents one section of an email.

    Adds no behavior of its own on top of ``Element``; it exists as a distinct type.
    """


class Name(EmailElement):
    """An email element that pairs a label (``name``) with free text (``text``).

    ``str(element)`` renders as ``"<name>: <text>"``. When no ``element_id`` is
    supplied, the id is the first 32 hex characters of the SHA-256 digest of
    ``text``; ``name`` does not contribute to the id.
    """

    category = "Uncategorized"

    def __init__(self, name: str, text: str, element_id: Union[str, NoID] = NoID()):
        """Store ``name`` and ``text`` and derive an id from ``text`` if none is given."""
        self.name: str = name
        self.text: str = text

        if isinstance(element_id, NoID):
            # NOTE(robinson) - Cut the SHA256 hex in half to get the first 128 bits
            element_id = hashlib.sha256(text.encode()).hexdigest()[:32]

        super().__init__(element_id=element_id)

    def __str__(self):
        return f"{self.name}: {self.text}"

    # NOTE: defining __eq__ without __hash__ leaves instances unhashable, which is
    # consistent with apply() mutating the fields that equality is based on.
    def __eq__(self, other):
        """Two elements are equal when both their name and their text match."""
        if not isinstance(other, Name):
            # Let Python fall back to its default comparison instead of raising
            # AttributeError on objects that have no ``name``/``text``.
            return NotImplemented
        return self.name == other.name and self.text == other.text

    def apply(self, *cleaners: Callable):
        """Applies a cleaning brick to the text element. The function that's passed in
        should take a string as input and produce a string as output.

        Each cleaner is applied, in order, to both ``text`` and ``name``. Nothing is
        stored on the element unless every cleaner returned a string.

        Raises:
            ValueError: if any cleaner returns a non-string for the text or the name.
        """
        cleaned_text = self.text
        cleaned_name = self.name

        for cleaner in cleaners:
            # Validate after every step so a bad cleaner is reported as a ValueError
            # rather than its non-string result being fed into the next cleaner.
            cleaned_text = cleaner(cleaned_text)
            if not isinstance(cleaned_text, str):
                raise ValueError("Cleaner produced a non-string output.")

            cleaned_name = cleaner(cleaned_name)
            if not isinstance(cleaned_name, str):
                raise ValueError("Cleaner produced a non-string output.")

        self.text = cleaned_text
        self.name = cleaned_name


class BodyText(List[Text]):
    """The body of an email, held as a list of ``Text`` elements.

    Meant for multiple, well-formulated sentences; elements such as titles, headers,
    footers, and captions are excluded.
    """

    category = "BodyText"


class Recipient(Text):
    """Text element that captures the recipient information of an email."""

    category = "Recipient"


class Sender(Text):
    """Text element that captures the sender information of an email."""

    category = "Sender"


class Subject(Text):
    """Text element that captures the subject information of an email."""

    category = "Subject"


class ReceivedInfo(List[Text]):
    """A list of ``Text`` elements that captures header information of an email."""

    category = "ReceivedInfo"


class MetaData(Name):
    """A named element (``name`` plus ``text``) that captures header meta data of an email."""

    category = "MetaData"


class Attachment(Name):
    """A named element (``name`` plus ``text``) that captures the attachment name in an email."""

    # NOTE(review): a reviewer asked whether Attachment should also have a bytes or
    # file-like attribute holding the actual attachment; if there is no code to deal
    # with attachments yet, that can be spun off into a separate PR.
    category = "Attachment"


class Email(ABC):
    """An email class with its attributes.

    Holds the recipient, sender, subject and body of an email, plus the optional
    received-header information, header meta data and attachments.
    """

    def __init__(
        self,
        recipient: Recipient,
        sender: Sender,
        subject: Subject,
        body: BodyText,
        *,
        received_info: Union[ReceivedInfo, None] = None,
        meta_data: Union[MetaData, None] = None,
        attachment: Union[List[Attachment], None] = None,
    ):
        """Build an email from its parts.

        Args:
            recipient: recipient information of the email.
            sender: sender information of the email.
            subject: subject of the email.
            body: body of the email.
            received_info: received-header information; defaults to an empty
                ``ReceivedInfo``.
            meta_data: header meta data; defaults to ``None``.
            attachment: attachments of the email; defaults to an empty list.
        """
        self.recipient = recipient
        self.sender = sender
        self.subject = subject
        self.body = body
        # These three used to be bare annotations, which never create the attribute,
        # so __str__ raised AttributeError unless callers assigned them by hand.
        # Fresh containers are built per instance to avoid shared mutable defaults.
        self.received_info: ReceivedInfo = (
            received_info if received_info is not None else ReceivedInfo()
        )
        self.meta_data: Union[MetaData, None] = meta_data
        self.attachment: List[Attachment] = attachment if attachment is not None else []

    def __str__(self):
        """Render every part of the email, listing attachments by name."""
        return f"""
Recipient: {self.recipient}
Sender: {self.sender}
Subject: {self.subject}

Received Header Information:

{self.received_info}

Meta Data From Header:

{self.meta_data}

Body:

{self.body}

Attachment:

{[file.name for file in self.attachment]}
"""