Skip to content

Commit 7f00217

Browse files
committed
refactor: make unicode_data tests normal tests
Instead of generating a standalone executable to test `unicode_data`, generate normal tests in `coretests`. This ensures that the tests are always generated and are run as part of the normal test suite. Also change the generated tests to loop over lookup tables, rather than generating a separate `assert_eq!()` statement for every codepoint. The old approach produced a massive file (more than 20,000 lines) that took minutes to compile!
1 parent f6e1076 commit 7f00217

File tree

6 files changed

+3126
-68
lines changed

6 files changed

+3126
-68
lines changed

library/core/src/unicode/mod.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@ pub(crate) mod printable;
2020

2121
mod rt;
2222
#[allow(unreachable_pub)]
23-
mod unicode_data;
23+
pub mod unicode_data;
2424

2525
/// The version of [Unicode](https://www.unicode.org/) that the Unicode parts of
2626
/// `char` and `str` methods are based on.

library/coretests/tests/char.rs

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,8 @@ use std::char::MAX_LEN_UTF8;
22
use std::str::FromStr;
33
use std::{char, str};
44

5+
mod unicode_data;
6+
57
#[test]
68
fn test_convert() {
79
assert_eq!(u32::from('a'), 0x61);

library/coretests/tests/char/unicode_data.rs

Lines changed: 2998 additions & 0 deletions
Large diffs are not rendered by default.

library/coretests/tests/lib.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -111,6 +111,7 @@
111111
#![feature(try_find)]
112112
#![feature(try_trait_v2)]
113113
#![feature(uint_bit_width)]
114+
#![feature(unicode_internals)]
114115
#![feature(unsize)]
115116
#![feature(unwrap_infallible)]
116117
// tidy-alphabetical-end

src/bootstrap/src/core/build_steps/run.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -374,6 +374,7 @@ impl Step for UnicodeTableGenerator {
374374
fn run(self, builder: &Builder<'_>) {
375375
let mut cmd = builder.tool_cmd(Tool::UnicodeTableGenerator);
376376
cmd.arg(builder.src.join("library/core/src/unicode/unicode_data.rs"));
377+
cmd.arg(builder.src.join("library/coretests/tests/char/unicode_data.rs"));
377378
cmd.run(builder);
378379
}
379380
}

src/tools/unicode-table-generator/src/main.rs

Lines changed: 123 additions & 67 deletions
Original file line numberDiff line numberDiff line change
@@ -207,26 +207,23 @@ fn load_data() -> UnicodeData {
207207
}
208208

209209
fn main() {
210-
let write_location = std::env::args().nth(1).unwrap_or_else(|| {
211-
eprintln!("Must provide path to write unicode tables to");
210+
let args = std::env::args().collect::<Vec<_>>();
211+
212+
if args.len() != 3 {
213+
eprintln!("Must provide paths to write unicode tables and tests to");
212214
eprintln!(
213-
"e.g. {} library/core/src/unicode/unicode_data.rs",
214-
std::env::args().next().unwrap_or_default()
215+
"e.g. {} library/core/src/unicode/unicode_data.rs library/coretests/tests/char/unicode_data.rs",
216+
args[0]
215217
);
216218
std::process::exit(1);
217-
});
219+
}
218220

219-
// Optional test path, which is a Rust source file testing that the unicode
220-
// property lookups are correct.
221-
let test_path = std::env::args().nth(2);
221+
let data_path = &args[1];
222+
let test_path = &args[2];
222223

223224
let unicode_data = load_data();
224225
let ranges_by_property = &unicode_data.ranges;
225226

226-
if let Some(path) = test_path {
227-
std::fs::write(&path, generate_tests(&unicode_data)).unwrap();
228-
}
229-
230227
let mut table_file = String::new();
231228
writeln!(
232229
table_file,
@@ -279,8 +276,12 @@ fn main() {
279276
writeln!(table_file, "}}\n");
280277
}
281278

282-
std::fs::write(&write_location, table_file).unwrap();
283-
rustfmt(&write_location);
279+
let test_file = generate_tests(&unicode_data);
280+
281+
std::fs::write(&test_path, test_file).unwrap();
282+
std::fs::write(&data_path, table_file).unwrap();
283+
rustfmt(&data_path);
284+
rustfmt(&test_path);
284285
}
285286

286287
fn rustfmt(path: &str) {
@@ -303,79 +304,134 @@ fn version() -> String {
303304
}
304305

305306
fn generate_tests(data: &UnicodeData) -> String {
306-
let mut s = format!(
307-
"#![feature(core_intrinsics)]
308-
#![allow(internal_features, dead_code)]
309-
// ignore-tidy-filelength
310-
mod rt;
311-
mod unicode_data;
312-
fn main() {{"
307+
let mut s = String::new();
308+
writeln!(
309+
s,
310+
"//! This file is generated by `./x run src/tools/unicode-table-generator`; do not edit manually!\n"
313311
);
312+
writeln!(s, "use std::ops::RangeInclusive;\n");
313+
writeln!(s, "use core::unicode::unicode_data;\n");
314314
for (property, ranges) in &data.ranges {
315-
let prop = property.to_lowercase();
315+
let prop_lower = property.to_lowercase();
316+
let prop_upper = property.to_uppercase();
316317
let (is_true, is_false): (Vec<_>, Vec<_>) = (char::MIN..=char::MAX)
317318
.filter(|c| !c.is_ascii())
318319
.map(u32::from)
319320
.partition(|c| ranges.iter().any(|r| r.contains(c)));
320321

322+
let is_true = ranges_from_set(&is_true);
323+
let is_false = ranges_from_set(&is_false);
324+
325+
let is_true = is_true
326+
.iter()
327+
.map(|r| {
328+
let start = char::from_u32(r.start).unwrap();
329+
let end = char::from_u32(r.end - 1).unwrap();
330+
start..=end
331+
})
332+
.collect::<Vec<_>>();
333+
let is_false = is_false
334+
.iter()
335+
.map(|r| {
336+
let start = char::from_u32(r.start).unwrap();
337+
let end = char::from_u32(r.end - 1).unwrap();
338+
start..=end
339+
})
340+
.collect::<Vec<_>>();
341+
321342
writeln!(
322343
s,
323-
"println!(\"Testing {prop}\");
324-
{prop}_true();
325-
{prop}_false();
326-
fn {prop}_true() {{\n{}\n}}
327-
fn {prop}_false() {{\n{}\n}}",
328-
generate_asserts(&prop, &is_true, true),
329-
generate_asserts(&prop, &is_false, false)
344+
"
345+
#[test]
346+
#[cfg_attr(miri, ignore)]
347+
fn {prop_lower}_true() {{
348+
for range in {prop_upper}_TRUE {{
349+
for c in range.clone() {{
350+
assert!(unicode_data::{prop_lower}::lookup(c), \"{{c:?}}\");
351+
}}
352+
}}
353+
}}
354+
#[rustfmt::skip]
355+
static {prop_upper}_TRUE: &[RangeInclusive<char>; {is_true_len}] = &[{is_true}];
356+
357+
#[test]
358+
#[cfg_attr(miri, ignore)]
359+
fn {prop_lower}_false() {{
360+
for range in {prop_upper}_FALSE {{
361+
for c in range.clone() {{
362+
assert!(!unicode_data::{prop_lower}::lookup(c), \"{{c:?}}\");
363+
}}
364+
}}
365+
}}
366+
#[rustfmt::skip]
367+
static {prop_upper}_FALSE: &[RangeInclusive<char>; {is_false_len}] = &[{is_false}];
368+
",
369+
is_true_len = is_true.len(),
370+
is_false_len = is_false.len(),
371+
is_true = fmt_list(is_true),
372+
is_false = fmt_list(is_false),
330373
);
331374
}
332375

333-
for (name, conversion) in ["to_lower", "to_upper"].iter().zip([&data.to_lower, &data.to_upper])
376+
for (prop_lower, conversion) in
377+
["to_lower", "to_upper"].iter().zip([&data.to_lower, &data.to_upper])
334378
{
335-
writeln!(s, r#"println!("Testing {name}");"#);
336-
for (c, mapping) in conversion {
337-
let c = char::from_u32(*c).unwrap();
338-
let mapping = mapping.map(|c| char::from_u32(c).unwrap());
339-
writeln!(s, "assert_eq!(unicode_data::conversions::{name}({c:?}), {mapping:?});");
340-
}
379+
let prop_upper = prop_lower.to_uppercase();
380+
381+
let mapped = conversion
382+
.iter()
383+
.map(|(c, chars)| {
384+
(char::from_u32(*c).unwrap(), chars.map(|c| char::from_u32(c).unwrap()))
385+
})
386+
.collect::<Vec<_>>();
387+
341388
let unmapped: Vec<_> = (char::MIN..=char::MAX)
342389
.filter(|c| !c.is_ascii())
343390
.map(u32::from)
344391
.filter(|c| !conversion.contains_key(c))
345392
.collect();
346-
let unmapped_ranges = ranges_from_set(&unmapped);
347-
for range in unmapped_ranges {
348-
let start = char::from_u32(range.start).unwrap();
349-
let end = char::from_u32(range.end - 1).unwrap();
350-
writeln!(
351-
s,
352-
r#"for c in {start:?}..={end:?} {{
353-
assert_eq!(unicode_data::conversions::{name}(c), [c, '\0', '\0']);
354-
}}"#
355-
);
356-
}
357-
}
358-
359-
writeln!(s, "}}");
360-
s
361-
}
393+
let unmapped = ranges_from_set(&unmapped);
394+
let unmapped = unmapped
395+
.iter()
396+
.map(|r| {
397+
let start = char::from_u32(r.start).unwrap();
398+
let end = char::from_u32(r.end - 1).unwrap();
399+
start..=end
400+
})
401+
.collect::<Vec<_>>();
362402

363-
fn generate_asserts(prop: &str, points: &[u32], truthy: bool) -> String {
364-
let mut s = String::new();
365-
let truthy = if truthy { "" } else { "!" };
366-
for range in ranges_from_set(points) {
367-
let start = char::from_u32(range.start).unwrap();
368-
let end = char::from_u32(range.end - 1).unwrap();
369-
match range.len() {
370-
1 => writeln!(s, "assert!({truthy}unicode_data::{prop}::lookup({start:?}));"),
371-
_ => writeln!(
372-
s,
373-
"for c in {start:?}..={end:?} {{
374-
assert!({truthy}unicode_data::{prop}::lookup(c));
375-
}}"
376-
),
377-
}
403+
writeln!(
404+
s,
405+
r#"
406+
#[test]
407+
#[cfg_attr(miri, ignore)]
408+
fn {prop_lower}_mapped() {{
409+
for (c, chars) in {prop_upper}_MAPPED {{
410+
assert_eq!(unicode_data::conversions::{prop_lower}(*c), *chars, "{{c:?}}");
411+
}}
412+
}}
413+
#[rustfmt::skip]
414+
static {prop_upper}_MAPPED: &[(char, [char; 3]); {mapped_len}] = &[{mapped}];
415+
416+
#[test]
417+
#[cfg_attr(miri, ignore)]
418+
fn {prop_lower}_unmapped() {{
419+
for range in {prop_upper}_UNMAPPED {{
420+
for c in range.clone() {{
421+
assert_eq!(unicode_data::conversions::{prop_lower}(c), [c, '\0', '\0'], "{{c:?}}");
422+
}}
423+
}}
424+
}}
425+
#[rustfmt::skip]
426+
static {prop_upper}_UNMAPPED: &[RangeInclusive<char>; {unmapped_len}] = &[{unmapped}];
427+
"#,
428+
mapped_len = mapped.len(),
429+
unmapped_len = unmapped.len(),
430+
mapped = fmt_list(mapped),
431+
unmapped = fmt_list(unmapped),
432+
);
378433
}
434+
379435
s
380436
}
381437

0 commit comments

Comments
 (0)