python · jonathaneunice · Jun 15, 2017 · Apr 1, 2023 · Apr 1, 2023
diff --git a/Lib/test/test_textwrap.py b/Lib/test/test_textwrap.py
@@ -269,6 +269,41 @@ def test_em_dash(self):
                   "he", " ", "was", " ", "gone"]
         self.check_split(text, expect)
 
+    def test_unicode_em_dash(self):
+        # Test text with Unicode em-dashes
+        text = "Em-dashes should be written \u2014 thus."
+        self.check_wrap(text, 25,
+                        ["Em-dashes should be",
+                         "written \u2014 thus."])
+
+        # Probe the boundaries of the em-dash. Parallels the ASCII tests,
+        # but each width is 1 less since len('\u2014') == len('--') - 1.
+        self.check_wrap(text, 28,
+                        ["Em-dashes should be written",
+                         "\u2014 thus."])
+        expect = ["Em-dashes should be written \u2014",
+                  "thus."]
+        self.check_wrap(text, 29, expect)
+        self.check_wrap(text, 34, expect)
+        self.check_wrap(text, 35,
+                        ["Em-dashes should be written \u2014 thus."])
+
+        # Tests for adjacent glyphs are not needed for the Unicode em-dash
+        # because, unlike adjacent hyphens, they are not meaningful or common.
+
+        # All of the above behaviour could be deduced by probing the
+        # _split() method. Note mixed real and simulated em-dashes.
+        text = "Here's an \u2014 em-dash and\u2014here's another---and another! And--more!"
+        expect = ["Here's", " ", "an", " ", "\u2014", " ", "em-", "dash", " ",
+                  "and", "\u2014", "here's", " ", "another", "---",
+                  "and", " ", "another!", " ", "And", "--", "more!"]
+
+        self.check_split(text, expect)
+
+        text = "and then\u2014bam!\u2014he was gone"
+        expect = ["and", " ", "then", "\u2014", "bam!", "\u2014",
+                  "he", " ", "was", " ", "gone"]
+        self.check_split(text, expect)
 
     def test_unix_options (self):
         # Test that Unix-style command-line options are wrapped correctly.

diff --git a/Lib/textwrap.py b/Lib/textwrap.py
@@ -78,17 +78,21 @@ class TextWrapper:
     wordsep_re = re.compile(r'''
         ( # any whitespace
           %(ws)s+
-        | # em-dash between words
+        | # ASCII em-dash between words
           (?<=%(wp)s) -{2,} (?=\w)
+        | # Unicode em-dash between words
+          (?<=%(wp)s) \u2014 (?=\w)
         | # word, possibly hyphenated
           %(nws)s+? (?:
             # hyphenated word
               -(?: (?<=%(lt)s{2}-) | (?<=%(lt)s-%(lt)s-))
               (?= %(lt)s -? %(lt)s)
             | # end of word
               (?=%(ws)s|\Z)
-            | # em-dash
+            | # ASCII em-dash
               (?<=%(wp)s) (?=-{2,}\w)
+            | # Unicode em-dash
+              (?<=%(wp)s) (?=\u2014\w)
             )
         )''' % {'wp': word_punct, 'lt': letter,
                 'ws': whitespace, 'nws': nowhitespace},

diff --git a/Misc/NEWS.d/next/Library/2023-04-01-08-55-31.gh-issue-74865.hqKRyU.rst b/Misc/NEWS.d/next/Library/2023-04-01-08-55-31.gh-issue-74865.hqKRyU.rst
@@ -0,0 +1,2 @@
+:mod:`textwrap` now treats real, Unicode em-dashes like the simulated ones
+(two or more consecutive hyphens). Patch by Jonathan Eunice.
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,2 @@
		:mod:`textwrap` now treats real, Unicode em-dashes like the simulated ones
		(two or more consecutive hyphens). Patch by Jonathan Eunice.