-
Notifications
You must be signed in to change notification settings - Fork 972
feat: Added EmailElement
for email documents
#103
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Merged
Merged
Changes from all commits
Commits
Show all changes
41 commits
Select commit
Hold shift + click to select a range
eb73930
new data structure, updated CHANGELOG and __version__.py
19beabd
removed code
7465689
adding tests
daae379
added test
9fa901a
fixed import statement
a7106d5
linter
2b5a6ce
remove unused import statement
b0ea522
fixed syntax
909b975
added Email class
6730fcf
merge conflicts
cc27de8
changelog and new partition function
47cb1ca
partition text
3799c6b
remove partition_text
e9ed6b4
updated comments and added attachment name variable
fe9bea7
updated changelog
f15048b
linter
56d10ef
formatting issues
3389a1b
version
1b7815f
Merge branch 'main' into email-element
mallorih ce05243
changed attachment
095de59
Merge branch 'email-element' of https://github.com/Unstructured-IO/un…
310fdaf
new data structure, updated CHANGELOG and __version__.py
5181591
removed code
1b65c3f
adding tests
32eaada
added test
d358ccd
fixed import statement
572be70
linter
65f110f
remove unused import statement
e582c2c
fixed syntax
bdd41a2
added Email class
f53c812
changelog and new partition function
9365781
partition text
cd3c238
remove partition_text
75cb27c
updated comments and added attachment name variable
5466a08
updated changelog
bf80f08
linter
f128fea
formatting issues
ea7ad94
changed attachment
42362ed
merge conflicts
764187f
Merge branch 'main' of https://github.com/Unstructured-IO/unstructure…
bf9875c
changelog and version
File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,44 @@ | ||
from functools import partial | ||
import pytest | ||
|
||
from unstructured.cleaners.core import clean_prefix | ||
from unstructured.cleaners.translate import translate_text | ||
from unstructured.documents.email_elements import EmailElement, NoID, Name | ||
|
||
|
||
def test_text_id():
    """A Name built without an explicit id gets one derived from its text."""
    expected_id = "c69509590d81db2f37f9d75480c8efed"
    element = Name(name="Example", text="hello there!")
    assert element.id == expected_id
|
||
|
||
def test_element_defaults_to_blank_id():
    """An EmailElement created with no arguments carries a NoID placeholder id."""
    assert isinstance(EmailElement().id, NoID)
|
||
|
||
def test_text_element_apply_cleaners():
    """A single cleaner passed to apply() is run on both the name and the text."""
    strip_bracketed_number = partial(clean_prefix, pattern=r"\[\d{1,2}\]")
    element = Name(name="[2] Example docs", text="[1] A Textbook on Crocodile Habitats")

    element.apply(strip_bracketed_number)

    rendered = str(element)
    assert rendered == "Example docs: A Textbook on Crocodile Habitats"
|
||
|
||
def test_name_element_apply_multiple_cleaners():
    """Several cleaners passed to apply() are run in order on both name and text."""
    source = "[1] A Textbook on Crocodile Habitats"
    element = Name(name=source, text=source)

    strip_bracketed_number = partial(clean_prefix, pattern=r"\[\d{1,2}\]")
    to_russian = partial(translate_text, target_lang="ru")
    element.apply(strip_bracketed_number, to_russian)

    expected = "Учебник по крокодильным средам обитания: Учебник по крокодильным средам обитания"
    assert str(element) == expected
|
||
|
||
def test_apply_raises_if_func_does_not_produce_string():
    """apply() rejects a cleaner whose return value is not a string."""

    def returns_int(_value):
        return 1

    element = Name(name="Example docs", text="[1] A Textbook on Crocodile Habitats")
    with pytest.raises(ValueError):
        element.apply(returns_int)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1 +1 @@ | ||
__version__ = "0.3.5-dev1" # pragma: no cover | ||
__version__ = "0.3.5-dev2" # pragma: no cover |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,150 @@ | ||
from abc import ABC | ||
import hashlib | ||
from typing import Callable, List, Union | ||
from unstructured.documents.elements import Element, Text, NoID | ||
|
||
|
||
class EmailElement(Element):
    """Base class for the elements that make up a section of an email."""
|
||
|
||
class Name(EmailElement):
    """An email element made of a ``name`` and a ``text``, rendered as ``"name: text"``.

    When no ``element_id`` is supplied, the id is derived from ``text`` only, so two
    instances with the same text but different names share the same id.
    """

    category = "Uncategorized"

    def __init__(self, name: str, text: str, element_id: Union[str, NoID] = NoID()):
        """Store the name/text pair and set the element id.

        Args:
            name: Label rendered before the colon by ``__str__``.
            text: Value rendered after the colon by ``__str__``.
            element_id: Explicit id; when left as a ``NoID`` instance the id is the
                first 32 hex characters of the SHA-256 digest of ``text``.
        """
        self.name: str = name
        self.text: str = text

        if isinstance(element_id, NoID):
            # NOTE(robinson) - Cut the SHA256 hex in half to get the first 128 bits
            element_id = hashlib.sha256(text.encode()).hexdigest()[:32]

        super().__init__(element_id=element_id)

    def __str__(self):
        return f"{self.name}: {self.text}"

    def __eq__(self, other):
        """Compare by ``name`` and ``text``.

        Objects that lack either attribute are reported as "not comparable"
        (``NotImplemented``), so ``==`` evaluates to ``False`` instead of raising
        ``AttributeError``.
        """
        try:
            return self.name == other.name and self.text == other.text
        except AttributeError:
            return NotImplemented

    def apply(self, *cleaners: Callable):
        """Applies cleaning bricks to both the name and the text of the element.

        Each function that's passed in should take a string as input and produce a
        string as output. Cleaners run in the order given.

        Raises:
            ValueError: If any cleaner returns something other than a string. The
                element is left unmodified in that case.
        """
        cleaned_text = self.text
        cleaned_name = self.name

        for cleaner in cleaners:
            cleaned_text = cleaner(cleaned_text)
            cleaned_name = cleaner(cleaned_name)

            # Validate after every cleaner so a bad return value is reported here
            # rather than being fed into the next cleaner, where it would fail with
            # an unrelated error.
            if not (isinstance(cleaned_text, str) and isinstance(cleaned_name, str)):
                raise ValueError("Cleaner produced a non-string output.")

        # Only commit the results once every cleaner has succeeded.
        self.text = cleaned_text
        self.name = cleaned_name
|
||
|
||
class BodyText(List[Text]):
    """The body of an email, held as a list of Text elements.

    The body consists of multiple, well-formulated sentences and excludes elements
    such as titles, headers, footers, and captions.
    """

    category = "BodyText"
|
||
|
||
class Recipient(Text):
    """A text element for capturing the recipient information of an email."""

    category = "Recipient"
|
||
|
||
class Sender(Text):
    """A text element for capturing the sender information of an email."""

    category = "Sender"
|
||
|
||
class Subject(Text):
    """A text element for capturing the subject information of an email."""

    category = "Subject"
|
||
|
||
class ReceivedInfo(List[Text]):
    """A list of Text elements for capturing header information of an email."""

    category = "ReceivedInfo"
|
||
|
||
class MetaData(Name):
    """A name/text element for capturing header meta data of an email."""

    category = "MetaData"
|
||
|
||
class Attachment(Name):
    """A name/text element for capturing the attachment name in an email."""

    # NOTE(review): an inline review comment attached to this class began
    # "Should attachment also have a ..." — the rest is truncated in this view.

    category = "Attachment"
|
||
|
||
class Email(ABC):
    """An email and its attributes.

    Attributes:
        recipient: Recipient of the email.
        sender: Sender of the email.
        subject: Subject of the email.
        body: Body of the email.
        received_info: Received header information, or None when not provided.
        meta_data: Meta data taken from the header, or None when not provided.
        attachment: Attachments of the email; empty list when not provided.
    """

    def __init__(
        self,
        recipient: "Recipient",
        sender: "Sender",
        subject: "Subject",
        body: "BodyText",
        received_info: Union["ReceivedInfo", None] = None,
        meta_data: Union["MetaData", None] = None,
        attachment: Union[List["Attachment"], None] = None,
    ):
        """Build an email from its parts.

        The last three parameters are optional so existing four-argument calls keep
        working; they can also be assigned on the instance afterwards.
        """
        self.recipient = recipient
        self.sender = sender
        self.subject = subject
        self.body = body
        # A bare annotation such as ``self.x: T`` does not create the attribute, so
        # these are always assigned; otherwise __str__ fails with AttributeError.
        self.received_info = received_info
        self.meta_data = meta_data
        # Use a fresh list per instance rather than a shared mutable default.
        self.attachment = [] if attachment is None else attachment

    def __str__(self):
        return f"""
        Recipient: {self.recipient}
        Sender: {self.sender}
        Subject: {self.subject}

        Received Header Information:

        {self.received_info}

        Meta Data From Header:

        {self.meta_data}

        Body:

        {self.body}

        Attachment:

        {[file.name for file in self.attachment]}
        """
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
Uh oh!
There was an error while loading. Please reload this page.