34
34
#![allow(missing_docs, non_upper_case_globals, non_snake_case)]
35
35
'''
36
36
37
- UNICODE_VERSION = (12 , 1 , 0 )
37
+ UNICODE_VERSION = (13 , 0 , 0 )
38
38
39
39
UNICODE_VERSION_NUMBER = "%s.%s.%s" % UNICODE_VERSION
40
40
@@ -54,7 +54,7 @@ def load_properties(f, interestingprops = None):
54
54
re1 = re .compile (r"^ *([0-9A-F]+) *; *(\w+)" )
55
55
re2 = re .compile (r"^ *([0-9A-F]+)\.\.([0-9A-F]+) *; *(\w+)" )
56
56
57
- for line in fileinput .input (os .path .basename (f )):
57
+ for line in fileinput .input (os .path .basename (f ), openhook = fileinput . hook_encoded ( "utf-8" ) ):
58
58
prop = None
59
59
d_lo = 0
60
60
d_hi = 0
@@ -81,6 +81,28 @@ def load_properties(f, interestingprops = None):
81
81
82
82
return props
83
83
84
def load_confusables(f):
    """Parse a confusables data file into (source, prototype) pairs.

    `fetch(f)` is called first (defined elsewhere in this script;
    presumably it makes the file available locally -- confirm against its
    definition). The file named `os.path.basename(f)` is then read as UTF-8,
    one line at a time. Lines that do not match the
    "<hex code points> ;<TAB><hex code points> ;<TAB><word>" layout are
    skipped.

    Returns a list of `(code_point, [code_point, ...])` tuples of ints, in
    file order.

    Raises `Exception` if the first column of a matching line holds more
    than one code point.
    """
    fetch(f)
    entry_re = re.compile(r"^((?:[0-9A-F]+ )+);\t((?:[0-9A-F]+ )+);\t\w*")
    pairs = []

    for raw in fileinput.input(os.path.basename(f), openhook=fileinput.hook_encoded("utf-8")):
        found = entry_re.match(raw)
        if found is None:
            # Comment, blank, or otherwise non-data line.
            continue

        sources = found.group(1).split()
        if len(sources) != 1:
            raise Exception('More than one code point in first column')

        source = int(sources[0].strip(), 16)
        prototype = [int(token, 16) for token in found.group(2).split()]
        pairs.append((source, prototype))

    return pairs
105
+
84
106
def format_table_content (f , content , indent ):
85
107
line = " " * indent
86
108
first = True
@@ -99,6 +121,18 @@ def format_table_content(f, content, indent):
99
121
def escape_char(c):
    r"""Render the integer code point `c` as a Rust character literal.

    The result has the form '\u{<hex>}' with lowercase hexadecimal digits,
    e.g. escape_char(0x41) gives '\u{41}' (single quotes included).
    """
    # The backslash must be immediately followed by `u`: a space between
    # them does not form a Rust unicode escape, so the emitted table would
    # not compile.
    return "'\\u{%x}'" % c
101
123
124
def escape_char_list(l):
    """Render the code points in `l` as a bracketed, comma-separated list.

    Each element is formatted with `escape_char`; elements are joined with
    ", " and wrapped in "[" and "]". An empty input gives "[]".
    """
    return "[" + ", ".join(escape_char(code_point) for code_point in l) + "]"
135
+
102
136
def emit_table (f , name , t_data , t_type = "&'static [(char, char)]" , is_pub = True ,
103
137
pfun = lambda x : "(%s,%s)" % (escape_char (x [0 ]), escape_char (x [1 ])), is_const = True ):
104
138
pub_string = "const"
@@ -173,10 +207,45 @@ def emit_identifier_module(f):
173
207
pfun = lambda x : "(%s,%s, IdentifierType::%s)" % (escape_char (x [0 ]), escape_char (x [1 ]), x [2 ]))
174
208
f .write ("}\n \n " )
175
209
210
def emit_confusable_detection_module(f):
    """Write the Rust `confusable_detection` module to the writable object `f`.

    The module text consists of the `char_confusable_prototype` lookup
    function followed by the private `CONFUSABLES` table. The table is built
    from `load_confusables("confusables.txt")` and sorted by its first
    element (the source code point); the emitted lookup goes through
    `super::util::bsearch_value_table`.

    NOTE(review): whitespace inside the emitted string literals is kept
    exactly as found in this file.
    """
    def render_entry(entry):
        # One table row: (source char, reference to its prototype char list).
        return "(%s, &%s)" % (escape_char(entry[0]), escape_char_list(entry[1]))

    f.write("pub mod confusable_detection {")
    f.write("""

#[inline]
pub fn char_confusable_prototype(c: char) -> Option<&'static [char]> {
// FIXME: do we want to special case ASCII here?
match c as usize {
_ => super::util::bsearch_value_table(c, CONFUSABLES)
}
}

""")

    f.write(" // Confusable table:\n ")
    table = sorted(load_confusables("confusables.txt"), key=lambda entry: entry[0])

    emit_table(f, "CONFUSABLES", table, "&'static [(char, &'static [char])]", is_pub=False,
               pfun=render_entry)
    f.write("}\n \n ")
231
+
232
+
176
233
def emit_util_mod (f ):
177
234
f .write ("""
178
235
pub mod util {
179
236
use core::result::Result::{Ok, Err};
237
+
238
/// Looks up `c` in `r`, a table of `(key, value)` pairs, and returns a copy
/// of the value paired with it, or `None` when no entry has that key.
///
/// The lookup is a binary search on the key, so `r` is expected to be
/// sorted by key in ascending order.
#[inline]
pub fn bsearch_value_table<T: Copy>(c: char, r: &'static [(char, T)]) -> Option<T> {
    r.binary_search_by_key(&c, |&(key, _)| key)
        .ok()
        .map(|found| r[found].1)
}
248
+
180
249
#[inline]
181
250
pub fn bsearch_range_table(c: char, r: &'static [(char,char)]) -> bool {
182
251
use core::cmp::Ordering::{Equal, Less, Greater};
@@ -224,3 +293,5 @@ def emit_util_mod(f):
224
293
emit_util_mod (rf )
225
294
### identifier module
226
295
emit_identifier_module (rf )
296
+ ### confusable_detection module
297
+ emit_confusable_detection_module (rf )
0 commit comments