Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

gh-74865: textwrap support for true (Unicode) em-dashes #2224

Open
wants to merge 3 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
35 changes: 35 additions & 0 deletions Lib/test/test_textwrap.py
Original file line number Diff line number Diff line change
Expand Up @@ -269,6 +269,41 @@ def test_em_dash(self):
"he", " ", "was", " ", "gone"]
self.check_split(text, expect)

def test_unicode_em_dash(self):
    """Check wrapping and splitting of text containing U+2014 em-dashes."""
    sentence = "Em-dashes should be written \u2014 thus."

    # Widths 28 through 35 probe the boundaries of the em-dash. They
    # parallel the ASCII em-dash checks, but each width is one less
    # because len('\u2014') == len('--') - 1.
    dash_closes_line = ["Em-dashes should be written \u2014",
                        "thus."]
    wrap_cases = [
        (25, ["Em-dashes should be",
              "written \u2014 thus."]),
        (28, ["Em-dashes should be written",
              "\u2014 thus."]),
        (29, dash_closes_line),
        (34, dash_closes_line),
        (35, ["Em-dashes should be written \u2014 thus."]),
    ]
    for width, wrapped in wrap_cases:
        self.check_wrap(sentence, width, wrapped)

    # Adjacent-glyph tests are not needed for the Unicode em-dash:
    # unlike adjacent hyphens, repeated em-dashes are neither
    # meaningful nor common.

    # The wrapping behaviour above could also be deduced by probing the
    # _split() method directly. The first case deliberately mixes real
    # and simulated (hyphen-run) em-dashes.
    split_cases = [
        (
            "Here's an \u2014 em-dash and\u2014here's another---and another! And--more!",
            ["Here's", " ", "an", " ", "\u2014", " ", "em-", "dash", " ",
             "and", "\u2014", "here's", " ", "another", "---",
             "and", " ", "another!", " ", "And", "--", "more!"],
        ),
        (
            "and then\u2014bam!\u2014he was gone",
            ["and", " ", "then", "\u2014", "bam!", "\u2014",
             "he", " ", "was", " ", "gone"],
        ),
    ]
    for raw, chunks in split_cases:
        self.check_split(raw, chunks)

def test_unix_options (self):
# Test that Unix-style command-line options are wrapped correctly.
Expand Down
8 changes: 6 additions & 2 deletions Lib/textwrap.py
Original file line number Diff line number Diff line change
Expand Up @@ -78,17 +78,21 @@ class TextWrapper:
wordsep_re = re.compile(r'''
( # any whitespace
%(ws)s+
| # em-dash between words
| # ASCII em-dash between words
(?<=%(wp)s) -{2,} (?=\w)
| # Unicode em-dash between words
(?<=%(wp)s) \u2014 (?=\w)
| # word, possibly hyphenated
%(nws)s+? (?:
# hyphenated word
-(?: (?<=%(lt)s{2}-) | (?<=%(lt)s-%(lt)s-))
(?= %(lt)s -? %(lt)s)
| # end of word
(?=%(ws)s|\Z)
| # em-dash
| # ASCII em-dash
(?<=%(wp)s) (?=-{2,}\w)
| # Unicode em-dash
(?<=%(wp)s) (?=\u2014\w)
)
)''' % {'wp': word_punct, 'lt': letter,
'ws': whitespace, 'nws': nowhitespace},
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
:mod:`textwrap` now treats real Unicode em-dashes (U+2014) the same way as
simulated ones (two or more consecutive hyphens). Patch by Jonathan Eunice.