Skip to content

Commit 69b205d

Browse files
committed
Fix for email.generator.Generator with whitespace between encoded words.
email.generator.Generator currently does not handle whitespace between encoded words correctly when the encoded words span multiple lines. The current generator will create an encoded word for each line. If the end of the line happens to correspond with the end of a real word in the plaintext, the generator will place an unencoded space at the start of the subsequent lines to represent the whitespace between the plaintext words. A compliant decoder will strip all the whitespace from between two encoded words, which leads to missing spaces in the round-tripped output. The fix for this is to make sure that whitespace between two encoded words ends up inside of one or the other of the encoded words. This fix places the space inside of the second encoded word. A second problem happens with continuation lines. A continuation line that starts with whitespace and is followed by a non-encoded word is fine because the newline between such continuation lines is defined as condensing to a single space character. When the continuation line starts with whitespace followed by an encoded word, however, the RFCs specify that the word is run together with the encoded word on the previous line. This is because normal words are folded on syntactic breaks but encoded words are not. The solution to this is to add the whitespace to the start of the encoded word on the continuation line. Test cases are from #92081
1 parent dc3f975 commit 69b205d

File tree

4 files changed

+79
-8
lines changed

4 files changed

+79
-8
lines changed

Lib/email/_header_value_parser.py

Lines changed: 41 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -2764,10 +2764,14 @@ def _refold_parse_tree(parse_tree, *, policy):
27642764
# max_line_length 0/None means no limit, ie: infinitely long.
27652765
maxlen = policy.max_line_length or sys.maxsize
27662766
encoding = 'utf-8' if policy.utf8 else 'us-ascii'
2767-
lines = ['']
2768-
last_ew = None
2767+
lines = [''] # Folded lines to be output
2768+
leading_whitespace = '' # When we have whitespace between two encoded
2769+
# words, we may need to encode the whitespace
2770+
# at the beginning of the second word.
2771+
last_ew = None # Points to the last encoded character if there's an ew on
2772+
# the line
27692773
wrap_as_ew_blocked = 0
2770-
want_encoding = False
2774+
want_encoding = False # This is set to True if we need to encode this part
27712775
end_ew_not_allowed = Terminal('', 'wrap_as_ew_blocked')
27722776
parts = list(parse_tree)
27732777
while parts:
@@ -2791,10 +2795,12 @@ def _refold_parse_tree(parse_tree, *, policy):
27912795
# 'charset' property on the policy.
27922796
charset = 'utf-8'
27932797
want_encoding = True
2798+
27942799
if part.token_type == 'mime-parameters':
27952800
# Mime parameter folding (using RFC2231) is extra special.
27962801
_fold_mime_parameters(part, lines, maxlen, encoding)
27972802
continue
2803+
27982804
if want_encoding and not wrap_as_ew_blocked:
27992805
if not part.as_ew_allowed:
28002806
want_encoding = False
@@ -2821,20 +2827,37 @@ def _refold_parse_tree(parse_tree, *, policy):
28212827
# It's a terminal, wrap it as an encoded word, possibly
28222828
# combining it with previously encoded words if allowed.
28232829
last_ew = _fold_as_ew(tstr, lines, maxlen, last_ew,
2824-
part.ew_combine_allowed, charset)
2830+
part.ew_combine_allowed, charset, leading_whitespace)
2831+
# This whitespace has been added to the lines in _fold_as_ew()
2832+
# so clear it now.
2833+
leading_whitespace = ''
28252834
want_encoding = False
28262835
continue
2836+
28272837
if len(tstr) <= maxlen - len(lines[-1]):
28282838
lines[-1] += tstr
28292839
continue
2840+
28302841
# This part is too long to fit. The RFC wants us to break at
28312842
# "major syntactic breaks", so unless we don't consider this
28322843
# to be one, check if it will fit on the next line by itself.
2844+
leading_whitespace = ''
28332845
if (part.syntactic_break and
28342846
len(tstr) + 1 <= maxlen):
28352847
newline = _steal_trailing_WSP_if_exists(lines)
28362848
if newline or part.startswith_fws():
2849+
# We're going to fold the data onto a new line here. Due to
2850+
# the way encoded strings handle continuation lines, we need to
2851+
# be prepared to encode any whitespace if the next line turns
2852+
# out to start with an encoded word.
28372853
lines.append(newline + tstr)
2854+
2855+
leading_whitespace = []
2856+
for char in lines[-1]:
2857+
if char not in WSP:
2858+
break
2859+
leading_whitespace.append(char)
2860+
leading_whitespace = ''.join(leading_whitespace)
28382861
last_ew = None
28392862
continue
28402863
if not hasattr(part, 'encode'):
@@ -2858,9 +2881,10 @@ def _refold_parse_tree(parse_tree, *, policy):
28582881
else:
28592882
# We can't fold it onto the next line either...
28602883
lines[-1] += tstr
2884+
28612885
return policy.linesep.join(lines) + policy.linesep
28622886

2863-
def _fold_as_ew(to_encode, lines, maxlen, last_ew, ew_combine_allowed, charset):
2887+
def _fold_as_ew(to_encode, lines, maxlen, last_ew, ew_combine_allowed, charset, leading_whitespace):
28642888
"""Fold string to_encode into lines as encoded word, combining if allowed.
28652889
Return the new value for last_ew, or None if ew_combine_allowed is False.
28662890
@@ -2875,14 +2899,15 @@ def _fold_as_ew(to_encode, lines, maxlen, last_ew, ew_combine_allowed, charset):
28752899
to_encode = str(
28762900
get_unstructured(lines[-1][last_ew:] + to_encode))
28772901
lines[-1] = lines[-1][:last_ew]
2878-
if to_encode[0] in WSP:
2902+
elif to_encode[0] in WSP:
28792903
# We're joining this to non-encoded text, so don't encode
28802904
# the leading blank.
28812905
leading_wsp = to_encode[0]
28822906
to_encode = to_encode[1:]
28832907
if (len(lines[-1]) == maxlen):
28842908
lines.append(_steal_trailing_WSP_if_exists(lines))
28852909
lines[-1] += leading_wsp
2910+
28862911
trailing_wsp = ''
28872912
if to_encode[-1] in WSP:
28882913
# Likewise for the trailing space.
@@ -2902,11 +2927,20 @@ def _fold_as_ew(to_encode, lines, maxlen, last_ew, ew_combine_allowed, charset):
29022927

29032928
while to_encode:
29042929
remaining_space = maxlen - len(lines[-1])
2905-
text_space = remaining_space - chrome_len
2930+
text_space = remaining_space - chrome_len - len(leading_whitespace)
29062931
if text_space <= 0:
29072932
lines.append(' ')
29082933
continue
29092934

2935+
# If we are at the start of a continuation line, prepend whitespace
2936+
# (we only want to do this when the line starts with an encoded word
2937+
# but if we're folding in this helper function, then we know that we
2938+
# are going to be writing out an encoded word.)
2939+
if len(lines) > 1 and len(lines[-1]) == 1 and leading_whitespace:
2940+
encoded_word = _ew.encode(leading_whitespace, charset=encode_as)
2941+
lines[-1] += encoded_word
2942+
leading_whitespace = ''
2943+
29102944
to_encode_word = to_encode[:text_space]
29112945
encoded_word = _ew.encode(to_encode_word, charset=encode_as)
29122946
excess = len(encoded_word) - remaining_space

Lib/test/test_email/test_generator.py

Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -232,6 +232,41 @@ class TestBytesGenerator(TestGeneratorBase, TestEmailBase):
232232
ioclass = io.BytesIO
233233
typ = lambda self, x: x.encode('ascii')
234234

235+
def test_defaults_handle_spaces_between_encoded_words_when_folded(self):
236+
source = ("Уведомление о принятии в работу обращения для"
237+
" подключения услуги")
238+
expected = ('Subject: =?utf-8?b?0KPQstC10LTQvtC80LvQtdC90LjQtSDQviDQv9GA0LjQvdGP0YLQuNC4?=\n'
239+
' =?utf-8?b?INCyINGA0LDQsdC+0YLRgyDQvtCx0YDQsNGJ0LXQvdC40Y8g0LTQu9GPINC/0L4=?=\n'
240+
' =?utf-8?b?0LTQutC70Y7Rh9C10L3QuNGPINGD0YHQu9GD0LPQuA==?=\n\n').encode('ascii')
241+
msg = EmailMessage()
242+
msg['Subject'] = source
243+
s = io.BytesIO()
244+
g = BytesGenerator(s)
245+
g.flatten(msg)
246+
self.assertEqual(s.getvalue(), expected)
247+
248+
def test_defaults_handle_spaces_at_start_of_subject(self):
249+
source = " Уведомление"
250+
expected = b"Subject: =?utf-8?b?0KPQstC10LTQvtC80LvQtdC90LjQtQ==?=\n\n"
251+
msg = EmailMessage()
252+
msg['Subject'] = source
253+
s = io.BytesIO()
254+
g = BytesGenerator(s)
255+
g.flatten(msg)
256+
self.assertEqual(s.getvalue(), expected)
257+
258+
def test_defaults_handle_spaces_at_start_of_continuation_line(self):
259+
source = " ф ффффффффффффффффффф ф ф"
260+
expected = (b"Subject: "
261+
b"=?utf-8?b?0YQg0YTRhNGE0YTRhNGE0YTRhNGE0YTRhNGE0YTRhNGE0YTRhNGE0YQ=?=\n"
262+
b" =?utf-8?b?INGEINGE?=\n\n")
263+
msg = EmailMessage()
264+
msg['Subject'] = source
265+
s = io.BytesIO()
266+
g = BytesGenerator(s)
267+
g.flatten(msg)
268+
self.assertEqual(s.getvalue(), expected)
269+
235270
def test_cte_type_7bit_handles_unknown_8bit(self):
236271
source = ("Subject: Maintenant je vous présente mon "
237272
"collègue\n\n").encode('utf-8')

Lib/test/test_email/test_headerregistry.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
from test.test_email import TestEmailBase, parameterize
88
from email import headerregistry
99
from email.headerregistry import Address, Group
10+
from email.header import decode_header
1011
from test.support import ALWAYS_EQ
1112

1213

@@ -1628,7 +1629,7 @@ def test_address_display_names(self):
16281629
'Lôrem ipsum dôlôr sit amet, cônsectetuer adipiscing. '
16291630
'Suspendisse pôtenti. Aliquam nibh. Suspendisse pôtenti.',
16301631
'=?utf-8?q?L=C3=B4rem_ipsum_d=C3=B4l=C3=B4r_sit_amet=2C_c'
1631-
'=C3=B4nsectetuer?=\n =?utf-8?q?adipiscing=2E_Suspendisse'
1632+
'=C3=B4nsectetuer?=\n =?utf-8?q?_adipiscing=2E_Suspendisse'
16321633
'_p=C3=B4tenti=2E_Aliquam_nibh=2E?=\n Suspendisse =?utf-8'
16331634
'?q?p=C3=B4tenti=2E?=',
16341635
),
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
Fix missing spaces in email headers when the spaces are mixed with encoded 8-bit characters.

0 commit comments

Comments
 (0)