Skip to content

Commit

Permalink
Merge pull request #18 from wahlflo/17-eml_analyzer-text-throws-cyril…
Browse files Browse the repository at this point in the history
…lic-characters-instead-of-german-umlauts

Fixing Issues with Encodings
  • Loading branch information
wahlflo authored Aug 4, 2023
2 parents e163290 + 247cb1f commit 20c387e
Show file tree
Hide file tree
Showing 8 changed files with 107 additions and 24 deletions.
10 changes: 7 additions & 3 deletions .github/workflows/unit-tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -8,24 +8,28 @@ on:

jobs:
build:

runs-on: ubuntu-latest
name: Test on ${{ matrix.os }}
strategy:
fail-fast: false
matrix:
os: [ ubuntu-latest, windows-latest ]
python-version: [ "3.7", "3.8", "3.9", "3.10" , "3.11" ]

runs-on: ${{ matrix.os }}

steps:
- uses: actions/checkout@v3
- name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@v3
with:
python-version: ${{ matrix.python-version }}

- name: Install dependencies
run: |
python -m pip install --upgrade pip
python -m pip install pytest
if [ -f requirements.txt ]; then pip install -r requirements.txt; fi
python -m pip install -r requirements.txt
- name: Test with pytest
run: |
pytest
2 changes: 1 addition & 1 deletion eml_analyzer/cli_script.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@

def main():
argument_parser = argparse.ArgumentParser(prog='emlAnalyzer', description='A CLI script to analyze an email in the EML format for viewing headers, extracting attachments, etc.')
argument_parser.add_argument('-i', '--input', help="Path to the EML file. Accepts standard input if omitted", type=argparse.FileType('r'), nargs='?', default=sys.stdin)
argument_parser.add_argument('-i', '--input', help="Path to the EML file. Accepts standard input if omitted", type=argparse.FileType('r', encoding='utf-8'), nargs='?', default=sys.stdin)
argument_parser.add_argument('--header', action='store_true', default=False, help="Shows the headers")
argument_parser.add_argument('-x', '--tracking', action='store_true', default=False, help="Shows content which is reloaded from external resources in the HTML part")
argument_parser.add_argument('-a', '--attachments', action='store_true', default=False, help="Lists attachments")
Expand Down
51 changes: 39 additions & 12 deletions eml_analyzer/library/parser/parsed_email.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
from typing import NamedTuple, List, Tuple, Set

from eml_analyzer.library.parser.attachment import Attachment
from eml_analyzer.library.parser.printable_filename import decode_ASCII_encoded_string
from eml_analyzer.library.parser.structure_item import StructureItem


Expand Down Expand Up @@ -48,7 +49,7 @@ def _add_error_messages(self, error_message: str) -> None:

def get_header(self) -> List[Tuple[str, any]]:
""" returns list of key-value pairs of header entries """
return self._parsed_email.items()
return [(key, decode_ASCII_encoded_string(value)) for key, value in self._parsed_email.items()]

def get_structure(self) -> StructureItem:
    """ returns a StructureItem built from the parsed email message """
    root_message = self._parsed_email
    return StructureItem(message=root_message)
Expand Down Expand Up @@ -86,14 +87,19 @@ def _get_first_email_payload_with_matching_type(message: email.message.Message,

@staticmethod
def _get_decoded_payload_from_message(message: email.message.Message) -> None or str:
transfer_encoding = ParsedEmail._header_lookup_first_element(message=message, key='content-transfer-encoding')
if transfer_encoding in {'7bit', '8bit', 'binary'}:
return message.get_payload(decode=False)

payload_in_bytes = message.get_payload(decode=True)

list_of_possible_encodings = ParsedEmail._create_list_of_possible_encodings(message=message)

for encoding_format in list_of_possible_encodings:
try:
return payload_in_bytes.decode(encoding_format)
except ValueError:
except ValueError as error:
print('Error: ' + str(error))
continue
raise PayloadDecodingException('Payload could not be decoded')

Expand All @@ -102,23 +108,44 @@ def _create_list_of_possible_encodings(message: email.message.Message) -> list:
""" creates a list of the most possible encodings of a payload """
list_of_possible_encodings = list()

header_values = ParsedEmail._header_lookup(message=message, key='content-type')

# first, add the encodings declared in the Content-Type header
for k, v in message.items():
k = str(k).lower()
v = str(v).lower()
if k == 'content-type':
entries = v.split(';')
for entry in entries:
entry = entry.strip()
if entry.startswith('charset='):
encoding = entry.replace('charset=', '').replace('"', '')
list_of_possible_encodings.append(encoding)
for v in header_values:
entries = v.split(';')
for entry in entries:
entry = entry.strip()
if entry.startswith('charset='):
encoding = entry.replace('charset=', '').replace('"', '')
list_of_possible_encodings.append(encoding)

for x in ['utf-8', 'windows-1251', 'iso-8859-1', 'us-ascii', 'iso-8859-15']:
if x not in list_of_possible_encodings:
list_of_possible_encodings.append(x)
return list_of_possible_encodings

@staticmethod
def _payload_needs_decoding(message: email.message.Message) -> bool:
    """ tells whether the payload of the message has to be transfer-decoded

    Returns False only when the Content-Transfer-Encoding header is present and is
    one of '7bit', '8bit' or 'binary'; a missing header yields True.
    """
    declared_encoding = ParsedEmail._header_lookup_first_element(message=message, key='content-transfer-encoding')
    return declared_encoding is None or declared_encoding not in {'7bit', '8bit', 'binary'}

@staticmethod
def _header_lookup_first_element(message: email.message.Message, key: str) -> str or None:
    """ returns the lower-cased value of the first header whose lower-cased name equals key

    The key is compared as given, so it has to be passed in lower case.
    Returns None if no header matches.
    """
    matching_values = (
        str(raw_value).lower()
        for name, raw_value in message.items()
        if str(name).lower() == key
    )
    return next(matching_values, None)

@staticmethod
def _header_lookup(message: email.message.Message, key: str) -> List[str]:
    """ returns the lower-cased values of all headers whose lower-cased name equals key

    The key is compared as given, so it has to be passed in lower case.
    Values are returned in the order in which the headers appear in the message;
    an empty list is returned if no header matches.
    """
    return [
        str(value).lower()
        for header_key, value in message.items()
        if str(header_key).lower() == key
    ]

def get_attachments(self) -> List[Attachment]:
return_list = list()
counter = 0
Expand Down
25 changes: 23 additions & 2 deletions eml_analyzer/library/parser/printable_filename.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
import binascii
import re
import email.message
import quopri


def get_printable_filename_if_existent(message: email.message.Message) -> str or None:
Expand All @@ -12,7 +13,7 @@ def get_printable_filename_if_existent(message: email.message.Message) -> str or


def _make_string_printable(original_string: str) -> str:
original_string = _decode_ASCII_encoded_string(string=original_string)
original_string = decode_ASCII_encoded_string(string=original_string)

additional_allowed_chars = {'_', '.', '(', ')', '-', ' '}
clean_name = ''
Expand All @@ -24,7 +25,13 @@ def _make_string_printable(original_string: str) -> str:
return clean_name


def _decode_ASCII_encoded_string(string: str) -> str:
def decode_ASCII_encoded_string(string: str) -> str:
    """ decodes the encoded words of an ASCII string: first the base64 ("?B?") ones, then the quoted-printable ("?Q?") ones """
    base64_decoded = _decode_ASCII_encoded_string_baseX(string=string)
    return _decode_ASCII_encoded_string_quoted_printable_string(string=base64_decoded)


def _decode_ASCII_encoded_string_baseX(string: str) -> str:
""" decodes ASCII strings which are encoded like: name := "=?UTF-8?B?" + base64_encode(string) + "?=" """
pattern = re.compile(r'=\?(.+?)\?B\?(.+?)\?=', re.IGNORECASE)
for match in list(re.finditer(pattern=pattern, string=string)):
Expand All @@ -33,3 +40,17 @@ def _decode_ASCII_encoded_string(string: str) -> str:
except binascii.Error:
pass
return string


def _decode_ASCII_encoded_string_quoted_printable_string(string: str) -> str:
    """ decodes ASCII strings which are encoded like: name := "=?UTF-8?Q?" + quoted_printable_encode(string) + "?="

    Every encoded word found in the string is replaced by its decoded text.
    An encoded word which cannot be decoded (unknown charset, bytes which are invalid
    for the declared charset, non-ASCII characters inside the encoded text) is left
    in the string unchanged instead of raising.
    """
    pattern = re.compile(r'=\?(.+?)\?Q\?(.+?)\?=', re.IGNORECASE)
    for match in list(re.finditer(pattern=pattern, string=string)):
        encoding = match.group(1)
        encoded_string = match.group(2)
        try:
            # NOTE(review): RFC 2047 "Q" encoding uses "_" for a space; underscores are
            # kept as-is here (no header=True) - confirm whether that is intended.
            replacement = quopri.decodestring(encoded_string).decode(encoding)
        except (ValueError, LookupError):
            # ValueError covers binascii.Error and UnicodeDecodeError,
            # LookupError is raised for an unknown charset name.
            continue
        string = string.replace(match.group(0), replacement)
    return string
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@

setuptools.setup(
name="eml-analyzer",
version="2.0.3",
version="3.0.0",
author="Florian Wahl",
author_email="florian.wahl.developer@gmail.com",
description="A cli script to analyze an E-Mail in the eml format for viewing the header, extracting attachments, etc.",
Expand Down
13 changes: 13 additions & 0 deletions tests/library/parser/test_emails/utf8_with_umlauts.eml
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
Message-ID: <c18a84e6-cc7e-e22e-63ce-49c76547617f@ra-maier.com>
Date: Tue, 25 Jul 2023 16:07:11 +0200
MIME-Version: 1.0
User-Agent: Mozilla/5.0 (X11; Linux x86_64; rv:102.0) Gecko/20100101
Thunderbird/102.13.0
Content-Language: en-US
To: dmaier@mailbox.org
From: dmaier@mailbox.org
Subject: =?UTF-8?Q?Dies_ist_ein_d=c3=a4mlicher_Test?=
Content-Type: text/plain; charset=UTF-8; format=flowed
Content-Transfer-Encoding: 8bit

Dies ist ein dämlicher Test.
21 changes: 18 additions & 3 deletions tests/library/parser/test_parsed_email.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ def load_test_eml_file(test_file) -> str:
current_directory_of_the_script = os.path.dirname(__file__)
test_emails = os.path.join(current_directory_of_the_script, 'test_emails')
path_to_test_file = os.path.join(test_emails, test_file)
with open(path_to_test_file, mode='r') as input_file:
with open(path_to_test_file, mode='r', encoding='utf-8') as input_file:
return input_file.read()


Expand All @@ -32,7 +32,7 @@ def test_case_1_header_subject(self):
header = x.get_header()
for key, value in header:
if key == 'Subject':
self.assertIn(value, 'UnitTest Subject =?UTF-8?B?TcO8bmNoZW4s?=')
self.assertEqual(value, 'UnitTest Subject München,')
return
self.fail(msg="header subject not found")

Expand Down Expand Up @@ -228,4 +228,19 @@ def test_get_reloaded_content_from_html_case_3(self):

def url_decode(self):
import urllib.parse
self.assertEqual(r"data=05%7C01", urllib.parse.unquote(r"data=05|01"))
self.assertEqual(r"data=05%7C01", urllib.parse.unquote(r"data=05|01"))

def test_case_uf8_with_umlauts_txt(self):
    """ the text content of the UTF-8 test mail contains the umlaut unchanged """
    parsed_email = ParsedEmail(eml_content=load_test_eml_file('utf8_with_umlauts.eml'))
    single_line_text = parsed_email.get_text_content().replace('\n', ' ').strip()
    self.assertEqual(single_line_text, 'Dies ist ein dämlicher Test.')

def test_case_uf8_with_umlauts_header(self):
    """ the quoted-printable encoded subject of the UTF-8 test mail is returned decoded """
    parsed_email = ParsedEmail(eml_content=load_test_eml_file('utf8_with_umlauts.eml'))
    subjects = [value for key, value in parsed_email.get_header() if key == 'Subject']
    if not subjects:
        self.fail(msg="header subject not found")
    # only the first Subject header is checked
    self.assertEqual(subjects[0], 'Dies_ist_ein_dämlicher_Test')
7 changes: 5 additions & 2 deletions tests/library/parser/test_printable_filename.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import unittest

from eml_analyzer.library.parser.printable_filename import get_printable_filename_if_existent, _make_string_printable, _decode_ASCII_encoded_string
from eml_analyzer.library.parser.printable_filename import get_printable_filename_if_existent, _make_string_printable, decode_ASCII_encoded_string


class TestPrintableFilename(unittest.TestCase):
Expand All @@ -12,9 +12,10 @@ def test_decode_ASCII_encoded_string(self):
('=?UTF-8?B?4o6Y7Z+/?=', '⎘퟿'),
('=?utf-8?b?4o6Y7Z+/?=', '⎘퟿'),
('=?utf-16?b?SABlAGwAbABvAFcAbwByAGwAZAA=?=', 'HelloWorld'),
('=?UTF-8?Q?=c3=a4?=', 'ä'),
]
for value, expected in test_cases:
result = _decode_ASCII_encoded_string(string=value)
result = decode_ASCII_encoded_string(string=value)
self.assertEqual(result, expected)

def test_make_string_printable(self):
Expand All @@ -24,6 +25,7 @@ def test_make_string_printable(self):
('Hello World', 'Hello World'),
('=?UTF-8?B?7Z+/?=', ''), # character is not printable
('=?UTF-8?B?4o6Y?=', '_'), # character is printable
('=?UTF-8?Q?=c3=a4?=', 'ä'), # character is printable
]
for value, expected in test_cases:
result = _make_string_printable(original_string=value)
Expand All @@ -36,6 +38,7 @@ def test_get_printable_filename_if_existent(self):
('Hello World', 'Hello World'),
('=?UTF-8?B?7Z+/?=', ''), # character is not printable
('=?UTF-8?B?4o6Y?=', '_'), # character is printable
('=?UTF-8?Q?=c3=a4?=', 'ä'), # character is printable
(None, None),
]

Expand Down

0 comments on commit 20c387e

Please sign in to comment.