1
+ #!/usr/bin/env python
2
+ # -*- coding: utf-8 -*-
3
+
1
4
r"""
2
5
htmldocck.py is a custom checker script for Rustdoc HTML outputs.
3
6
98
101
99
102
"""
100
103
101
- from __future__ import print_function
104
+ from __future__ import absolute_import , print_function , unicode_literals
105
+
106
+ import codecs
107
+ import io
102
108
import sys
103
109
import os .path
104
110
import re
110
116
from HTMLParser import HTMLParser
111
117
from xml .etree import cElementTree as ET
112
118
113
- # ⇤/⇥ are not in HTML 4 but are in HTML 5
114
119
try :
115
- from html .entities import entitydefs
120
+ from html .entities import name2codepoint
116
121
except ImportError :
117
- from htmlentitydefs import entitydefs
118
- entitydefs ['larrb' ] = u'\u21e4 '
119
- entitydefs ['rarrb' ] = u'\u21e5 '
120
- entitydefs ['nbsp' ] = ' '
122
+ from htmlentitydefs import name2codepoint
121
123
122
124
# "void elements" (no closing tag) from the HTML Standard section 12.1.2
123
125
VOID_ELEMENTS = set (['area' , 'base' , 'br' , 'col' , 'embed' , 'hr' , 'img' , 'input' , 'keygen' ,
@@ -157,11 +159,11 @@ def handle_data(self, data):
157
159
self .__builder .data (data )
158
160
159
161
def handle_entityref(self, name):
    """Feed the character for a named entity reference to the tree builder.

    ``name`` is the entity name as passed by HTMLParser (without the
    surrounding ``&`` and ``;``).

    Raises KeyError if ``name`` is neither a known entity in
    ``name2codepoint`` nor one of the extra names handled below.
    """
    codepoint = name2codepoint.get(name)
    if codepoint is None:
        # name2codepoint only covers the HTML 4 entity set. The left/right
        # "arrow to bar" entities (U+21E4 / U+21E5) are HTML 5 additions
        # that this script previously registered by hand, so keep
        # resolving them instead of failing with KeyError.
        html5_extras = {'larrb': 0x21e4, 'rarrb': 0x21e5}
        codepoint = html5_extras[name]
    self.__builder.data(unichr(codepoint))
161
163
162
164
def handle_charref(self, name):
    """Feed the character for a numeric character reference to the tree builder.

    ``name`` is the reference body without ``&#`` and ``;``: either decimal
    digits, or ``x``/``X`` followed by hexadecimal digits.
    """
    if name.startswith(('x', 'X')):
        # Hexadecimal form: drop the leading 'x'/'X' marker before parsing.
        codepoint = int(name[1:], 16)
    else:
        codepoint = int(name, 10)
    self.__builder.data(unichr(codepoint))
165
167
166
168
def close (self ):
167
169
HTMLParser .close (self )
@@ -210,11 +212,11 @@ def concat_multi_lines(f):
210
212
(?<=(?<!\S)@)(?P<negated>!?)
211
213
(?P<cmd>[A-Za-z]+(?:-[A-Za-z]+)*)
212
214
(?P<args>.*)$
213
- ''' , re .X )
215
+ ''' , re .X | re . UNICODE )
214
216
215
217
216
218
def get_commands (template ):
217
- with open (template , 'rU ' ) as f :
219
+ with io . open (template , encoding = 'utf-8 ' ) as f :
218
220
for lineno , line in concat_multi_lines (f ):
219
221
m = LINE_PATTERN .search (line )
220
222
if not m :
@@ -226,7 +228,10 @@ def get_commands(template):
226
228
if args and not args [:1 ].isspace ():
227
229
print_err (lineno , line , 'Invalid template syntax' )
228
230
continue
229
- args = shlex .split (args )
231
+ try :
232
+ args = shlex .split (args )
233
+ except UnicodeEncodeError :
234
+ args = [arg .decode ('utf-8' ) for arg in shlex .split (args .encode ('utf-8' ))]
230
235
yield Command (negated = negated , cmd = cmd , args = args , lineno = lineno + 1 , context = line )
231
236
232
237
@@ -280,7 +285,7 @@ def get_file(self, path):
280
285
if not (os .path .exists (abspath ) and os .path .isfile (abspath )):
281
286
raise FailedCheck ('File does not exist {!r}' .format (path ))
282
287
283
- with open (abspath ) as f :
288
+ with io . open (abspath , encoding = 'utf-8' ) as f :
284
289
data = f .read ()
285
290
self .files [path ] = data
286
291
return data
@@ -294,9 +299,9 @@ def get_tree(self, path):
294
299
if not (os .path .exists (abspath ) and os .path .isfile (abspath )):
295
300
raise FailedCheck ('File does not exist {!r}' .format (path ))
296
301
297
- with open (abspath ) as f :
302
+ with io . open (abspath , encoding = 'utf-8' ) as f :
298
303
try :
299
- tree = ET .parse ( f , CustomHTMLParser ())
304
+ tree = ET .fromstringlist ( f . readlines () , CustomHTMLParser ())
300
305
except Exception as e :
301
306
raise RuntimeError ('Cannot parse an HTML file {!r}: {}' .format (path , e ))
302
307
self .trees [path ] = tree
@@ -313,7 +318,7 @@ def check_string(data, pat, regexp):
313
318
if not pat :
314
319
return True # special case a presence testing
315
320
elif regexp :
316
- return re .search (pat , data ) is not None
321
+ return re .search (pat , data , flags = re . UNICODE ) is not None
317
322
else :
318
323
data = ' ' .join (data .split ())
319
324
pat = ' ' .join (pat .split ())
@@ -350,7 +355,7 @@ def check_tree_text(tree, path, pat, regexp):
350
355
break
351
356
except Exception as e :
352
357
print ('Failed to get path "{}"' .format (path ))
353
- raise e
358
+ raise
354
359
return ret
355
360
356
361
@@ -359,7 +364,12 @@ def get_tree_count(tree, path):
359
364
return len (tree .findall (path ))
360
365
361
366
def stderr(*args):
    """Print ``args`` to standard error, as ``print`` would to standard output.

    On Python 2 the stream is wrapped in a UTF-8 ``codecs`` writer before
    printing; on Python 3 ``sys.stderr`` is used directly.
    """
    running_python2 = sys.version_info.major < 3
    stream = codecs.getwriter('utf-8')(sys.stderr) if running_python2 else sys.stderr
    print(*args, file=stream)
363
373
364
374
def print_err (lineno , context , err , message = None ):
365
375
global ERR_COUNT
0 commit comments