Skip to content

Commit 6fefcee

Browse files
committed
use utf-8 throughout htmldocck
This commit improves compatibility with Python 3, which already uses Unicode throughout. It also fixes a subtle incompatibility stemming from the use of `entitydefs`, which contains replacement text _encoded in latin-1_ for HTML entities. When using Python 3, this would cause `0xa0` to be incorrectly added to the element tree. This meant that there was a rustdoc test that would pass under Python 2 but fail under Python 3, due to an incorrect regex match against the non-breaking space character. This commit triggers that failure in both versions, and also fixes it.
1 parent 6861426 commit 6fefcee

File tree

2 files changed

+29
-19
lines changed

2 files changed

+29
-19
lines changed

src/etc/htmldocck.py

Lines changed: 28 additions & 18 deletions
Original file line number | Diff line number | Diff line change
@@ -1,3 +1,6 @@
1+
#!/usr/bin/env python
2+
# -*- coding: utf-8 -*-
3+
14
r"""
25
htmldocck.py is a custom checker script for Rustdoc HTML outputs.
36
@@ -98,7 +101,10 @@
98101
99102
"""
100103

101-
from __future__ import print_function
104+
from __future__ import absolute_import, print_function, unicode_literals
105+
106+
import codecs
107+
import io
102108
import sys
103109
import os.path
104110
import re
@@ -110,14 +116,10 @@
110116
from HTMLParser import HTMLParser
111117
from xml.etree import cElementTree as ET
112118

113-
# ⇤/⇥ are not in HTML 4 but are in HTML 5
114119
try:
115-
from html.entities import entitydefs
120+
from html.entities import name2codepoint
116121
except ImportError:
117-
from htmlentitydefs import entitydefs
118-
entitydefs['larrb'] = u'\u21e4'
119-
entitydefs['rarrb'] = u'\u21e5'
120-
entitydefs['nbsp'] = ' '
122+
from htmlentitydefs import name2codepoint
121123

122124
# "void elements" (no closing tag) from the HTML Standard section 12.1.2
123125
VOID_ELEMENTS = set(['area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input', 'keygen',
@@ -157,11 +159,11 @@ def handle_data(self, data):
157159
self.__builder.data(data)
158160

159161
def handle_entityref(self, name):
160-
self.__builder.data(entitydefs[name])
162+
self.__builder.data(unichr(name2codepoint[name]))
161163

162164
def handle_charref(self, name):
163165
code = int(name[1:], 16) if name.startswith(('x', 'X')) else int(name, 10)
164-
self.__builder.data(unichr(code).encode('utf-8'))
166+
self.__builder.data(unichr(code))
165167

166168
def close(self):
167169
HTMLParser.close(self)
@@ -210,11 +212,11 @@ def concat_multi_lines(f):
210212
(?<=(?<!\S)@)(?P<negated>!?)
211213
(?P<cmd>[A-Za-z]+(?:-[A-Za-z]+)*)
212214
(?P<args>.*)$
213-
''', re.X)
215+
''', re.X | re.UNICODE)
214216

215217

216218
def get_commands(template):
217-
with open(template, 'rU') as f:
219+
with io.open(template, encoding='utf-8') as f:
218220
for lineno, line in concat_multi_lines(f):
219221
m = LINE_PATTERN.search(line)
220222
if not m:
@@ -226,7 +228,10 @@ def get_commands(template):
226228
if args and not args[:1].isspace():
227229
print_err(lineno, line, 'Invalid template syntax')
228230
continue
229-
args = shlex.split(args)
231+
try:
232+
args = shlex.split(args)
233+
except UnicodeEncodeError:
234+
args = [arg.decode('utf-8') for arg in shlex.split(args.encode('utf-8'))]
230235
yield Command(negated=negated, cmd=cmd, args=args, lineno=lineno+1, context=line)
231236

232237

@@ -280,7 +285,7 @@ def get_file(self, path):
280285
if not(os.path.exists(abspath) and os.path.isfile(abspath)):
281286
raise FailedCheck('File does not exist {!r}'.format(path))
282287

283-
with open(abspath) as f:
288+
with io.open(abspath, encoding='utf-8') as f:
284289
data = f.read()
285290
self.files[path] = data
286291
return data
@@ -294,9 +299,9 @@ def get_tree(self, path):
294299
if not(os.path.exists(abspath) and os.path.isfile(abspath)):
295300
raise FailedCheck('File does not exist {!r}'.format(path))
296301

297-
with open(abspath) as f:
302+
with io.open(abspath, encoding='utf-8') as f:
298303
try:
299-
tree = ET.parse(f, CustomHTMLParser())
304+
tree = ET.fromstringlist(f.readlines(), CustomHTMLParser())
300305
except Exception as e:
301306
raise RuntimeError('Cannot parse an HTML file {!r}: {}'.format(path, e))
302307
self.trees[path] = tree
@@ -313,7 +318,7 @@ def check_string(data, pat, regexp):
313318
if not pat:
314319
return True # special case a presence testing
315320
elif regexp:
316-
return re.search(pat, data) is not None
321+
return re.search(pat, data, flags=re.UNICODE) is not None
317322
else:
318323
data = ' '.join(data.split())
319324
pat = ' '.join(pat.split())
@@ -350,7 +355,7 @@ def check_tree_text(tree, path, pat, regexp):
350355
break
351356
except Exception as e:
352357
print('Failed to get path "{}"'.format(path))
353-
raise e
358+
raise
354359
return ret
355360

356361

@@ -359,7 +364,12 @@ def get_tree_count(tree, path):
359364
return len(tree.findall(path))
360365

361366
def stderr(*args):
362-
print(*args, file=sys.stderr)
367+
if sys.version_info.major < 3:
368+
file = codecs.getwriter('utf-8')(sys.stderr)
369+
else:
370+
file = sys.stderr
371+
372+
print(*args, file=file)
363373

364374
def print_err(lineno, context, err, message=None):
365375
global ERR_COUNT

src/test/rustdoc/issue-32374.rs

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -10,7 +10,7 @@
1010
// 'Deprecated since 1.0.0: text'
1111
// @has - '<code>test</code>&nbsp;<a href="http://issue_url/32374">#32374</a>'
1212
// @matches issue_32374/struct.T.html '//*[@class="stab unstable"]' \
13-
// '🔬 This is a nightly-only experimental API. \(test #32374\)$'
13+
// '🔬 This is a nightly-only experimental API. \(test\s#32374\)$'
1414
/// Docs
1515
#[rustc_deprecated(since = "1.0.0", reason = "text")]
1616
#[unstable(feature = "test", issue = "32374")]

0 commit comments

Comments
 (0)