Skip to content

Commit dd79485

Browse files
committed
Update to Unicode 13.0 and implement confusable detection.
1 parent 916eec5 commit dd79485

File tree

5 files changed

+3794
-1186
lines changed

5 files changed

+3794
-1186
lines changed

Cargo.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@ exclude = [ "target/*", "Cargo.lock" ]
1717

1818
[dependencies]
1919
unicode-script = { version = "0.4.0", default-features = false }
20+
unicode-normalization = "0.1.12"
2021
std = { version = "1.0", package = "rustc-std-workspace-std", optional = true }
2122
core = { version = "1.0", package = "rustc-std-workspace-core", optional = true }
2223
compiler_builtins = { version = "0.1", optional = true }

scripts/unicode.py

Lines changed: 73 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -34,7 +34,7 @@
3434
#![allow(missing_docs, non_upper_case_globals, non_snake_case)]
3535
'''
3636

37-
UNICODE_VERSION = (12, 1, 0)
37+
UNICODE_VERSION = (13, 0, 0)
3838

3939
UNICODE_VERSION_NUMBER = "%s.%s.%s" %UNICODE_VERSION
4040

@@ -54,7 +54,7 @@ def load_properties(f, interestingprops = None):
5454
re1 = re.compile(r"^ *([0-9A-F]+) *; *(\w+)")
5555
re2 = re.compile(r"^ *([0-9A-F]+)\.\.([0-9A-F]+) *; *(\w+)")
5656

57-
for line in fileinput.input(os.path.basename(f)):
57+
for line in fileinput.input(os.path.basename(f), openhook=fileinput.hook_encoded("utf-8")):
5858
prop = None
5959
d_lo = 0
6060
d_hi = 0
@@ -81,6 +81,28 @@ def load_properties(f, interestingprops = None):
8181

8282
return props
8383

84+
def load_confusables(f):
    """Parse a confusables data file into (source, prototype) pairs.

    Calls fetch(f) first (defined elsewhere in this script; presumably it
    makes the file available locally -- confirm), then reads the file named
    os.path.basename(f) as UTF-8.

    Only lines matching the mapping pattern are used: one or more hex code
    points each followed by a space, ';' and a tab, a second such list,
    ';' and a tab. Every other line is skipped.

    Returns a list of (int, [int, ...]) tuples in file order: the single
    code point from the first column and the code points from the second.

    Raises Exception if the first column does not hold exactly one code point.
    """
    fetch(f)
    confusables = []
    re1 = re.compile(r"^((?:[0-9A-F]+ )+);\t((?:[0-9A-F]+ )+);\t\w*")

    try:
        for line in fileinput.input(os.path.basename(f), openhook=fileinput.hook_encoded("utf-8")):
            m = re1.match(line)
            if not m:
                continue
            d_inputs = m.group(1).split()
            if len(d_inputs) != 1:
                raise Exception('More than one code point in first column')
            d_input = int(d_inputs[0], 16)
            d_outputs = [int(d_output, 16) for d_output in m.group(2).split()]
            confusables.append((d_input, d_outputs))
    finally:
        # Reset the module-level fileinput state even when we bail out early;
        # otherwise a later fileinput.input() call fails with
        # "input() already active".
        fileinput.close()

    return confusables
105+
84106
def format_table_content(f, content, indent):
85107
line = " "*indent
86108
first = True
@@ -99,6 +121,18 @@ def format_table_content(f, content, indent):
99121
def escape_char(c):
    # Render the integer code point `c` as a Rust char literal: a single-quoted
    # unicode escape whose braces hold the lowercase hex value of `c`.
    hex_digits = "%x" % c
    return "'\\u{" + hex_digits + "}'"
101123

124+
def escape_char_list(l):
    """Format the code points in l as a Rust array literal of char escapes.

    Each element is rendered with escape_char; elements are separated by
    a comma and a space and wrapped in square brackets. An empty iterable
    yields "[]".
    """
    return "[" + ", ".join(escape_char(c) for c in l) + "]"
135+
102136
def emit_table(f, name, t_data, t_type = "&'static [(char, char)]", is_pub=True,
103137
pfun=lambda x: "(%s,%s)" % (escape_char(x[0]), escape_char(x[1])), is_const=True):
104138
pub_string = "const"
@@ -173,10 +207,45 @@ def emit_identifier_module(f):
173207
pfun=lambda x: "(%s,%s, IdentifierType::%s)" % (escape_char(x[0]), escape_char(x[1]), x[2]))
174208
f.write("}\n\n")
175209

210+
def emit_confusable_detection_module(f):
    """Write the generated `confusable_detection` Rust module to f.

    Emits, in order: the module opener, the `char_confusable_prototype`
    lookup function, a comment line, the private CONFUSABLES table built
    from "confusables.txt", and the closing brace of the module.
    """
    module_opener = "pub mod confusable_detection {"
    # NOTE(review): whitespace inside the emitted Rust text is reproduced as
    # found; it is not significant to rustc.
    lookup_function = """

#[inline]
pub fn char_confusable_prototype(c: char) -> Option<&'static [char]> {
// FIXME: do we want to special case ASCII here?
match c as usize {
_ => super::util::bsearch_value_table(c, CONFUSABLES)
}
}

"""
    f.write(module_opener)
    f.write(lookup_function)

    f.write(" // Confusable table:\n")

    def source_code_point(entry):
        return entry[0]

    def render_entry(entry):
        return "(%s, &%s)" % (escape_char(entry[0]), escape_char_list(entry[1]))

    # The emitted lookup goes through bsearch_value_table, so the table is
    # written in ascending order of the source code point.
    confusable_table = sorted(load_confusables("confusables.txt"), key=source_code_point)

    emit_table(f, "CONFUSABLES", confusable_table,
               t_type="&'static [(char, &'static [char])]",
               is_pub=False, pfun=render_entry)
    f.write("}\n\n")
231+
232+
176233
def emit_util_mod(f):
177234
f.write("""
178235
pub mod util {
179236
use core::result::Result::{Ok, Err};
237+
238+
/// Looks up `c` in `r`, a table of `(key, value)` pairs, and returns a copy
/// of the value stored under that key, or `None` when no entry has that key.
///
/// The lookup is a binary search on the `char` key, so `r` must be sorted
/// by key in ascending order.
#[inline]
pub fn bsearch_value_table<T: Copy>(c: char, r: &'static [(char, T)]) -> Option<T> {
    r.binary_search_by_key(&c, |entry| entry.0)
        .ok()
        .map(|found| r[found].1)
}
248+
180249
#[inline]
181250
pub fn bsearch_range_table(c: char, r: &'static [(char,char)]) -> bool {
182251
use core::cmp::Ordering::{Equal, Less, Greater};
@@ -224,3 +293,5 @@ def emit_util_mod(f):
224293
emit_util_mod(rf)
225294
### identifier module
226295
emit_identifier_module(rf)
296+
### confusable_detection module
297+
emit_confusable_detection_module(rf)

src/lib.rs

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,8 @@
4949
#![cfg_attr(feature = "bench", feature(test))]
5050
#![no_std]
5151

52+
extern crate alloc;
53+
5254
#[cfg(test)]
5355
#[macro_use]
5456
extern crate std;
@@ -61,10 +63,12 @@ pub use tables::UNICODE_VERSION;
6163
pub mod general_security_profile;
6264
pub mod mixed_script;
6365
pub mod restriction_level;
66+
pub mod confusable_detection;
6467

6568
pub use general_security_profile::GeneralSecurityProfile;
6669
pub use mixed_script::MixedScript;
6770
pub use restriction_level::{RestrictionLevel, RestrictionLevelDetection};
71+
pub use confusable_detection::skeleton;
6872

6973
#[rustfmt::skip]
7074
pub(crate) mod tables;

0 commit comments

Comments
 (0)