refactor: make unicode_data tests normal tests

Kmeakin · Kmeakin · commit 7f0021722f19 · 2025-10-20T00:42:15.000+01:00
Instead of generating a standalone executable to test `unicode_data`,
generate normal tests in `coretests`. This ensures the tests are always
generated and are run as part of the normal test suite.

Also change the generated tests to loop over lookup tables, rather than
generating a separate `assert_eq!()` statement for every codepoint. The
old approach produced a massive file (more than 20,000 lines) that took
minutes to compile!
diff --git a/library/core/src/unicode/mod.rs b/library/core/src/unicode/mod.rs
@@ -20,7 +20,7 @@ pub(crate) mod printable;
 
 mod rt;
 #[allow(unreachable_pub)]
-mod unicode_data;
+pub mod unicode_data;
 
 /// The version of [Unicode](https://www.unicode.org/) that the Unicode parts of
 /// `char` and `str` methods are based on.
diff --git a/library/coretests/tests/char.rs b/library/coretests/tests/char.rs
@@ -2,6 +2,8 @@ use std::char::MAX_LEN_UTF8;
 use std::str::FromStr;
 use std::{char, str};
 
+mod unicode_data;
+
 #[test]
 fn test_convert() {
     assert_eq!(u32::from('a'), 0x61);
diff --git a/library/coretests/tests/char/unicode_data.rs b/library/coretests/tests/char/unicode_data.rs
diff --git a/library/coretests/tests/lib.rs b/library/coretests/tests/lib.rs
@@ -111,6 +111,7 @@
 #![feature(try_find)]
 #![feature(try_trait_v2)]
 #![feature(uint_bit_width)]
+#![feature(unicode_internals)]
 #![feature(unsize)]
 #![feature(unwrap_infallible)]
 // tidy-alphabetical-end
diff --git a/src/bootstrap/src/core/build_steps/run.rs b/src/bootstrap/src/core/build_steps/run.rs
@@ -374,6 +374,7 @@ impl Step for UnicodeTableGenerator {
     fn run(self, builder: &Builder<'_>) {
         let mut cmd = builder.tool_cmd(Tool::UnicodeTableGenerator);
         cmd.arg(builder.src.join("library/core/src/unicode/unicode_data.rs"));
+        cmd.arg(builder.src.join("library/coretests/tests/char/unicode_data.rs"));
         cmd.run(builder);
     }
 }
diff --git a/src/tools/unicode-table-generator/src/main.rs b/src/tools/unicode-table-generator/src/main.rs
@@ -207,26 +207,23 @@ fn load_data() -> UnicodeData {
 }
 
 fn main() {
-    let write_location = std::env::args().nth(1).unwrap_or_else(|| {
-        eprintln!("Must provide path to write unicode tables to");
+    let args = std::env::args().collect::<Vec<_>>();
+
+    if args.len() != 3 {
+        eprintln!("Must provide paths to write unicode tables and tests to");
         eprintln!(
-            "e.g. {} library/core/src/unicode/unicode_data.rs",
-            std::env::args().next().unwrap_or_default()
+            "e.g. {} library/core/src/unicode/unicode_data.rs library/coretests/tests/char/unicode_data.rs",
+            args[0]
         );
         std::process::exit(1);
-    });
+    }
 
-    // Optional test path, which is a Rust source file testing that the unicode
-    // property lookups are correct.
-    let test_path = std::env::args().nth(2);
+    let data_path = &args[1];
+    let test_path = &args[2];
 
     let unicode_data = load_data();
     let ranges_by_property = &unicode_data.ranges;
 
-    if let Some(path) = test_path {
-        std::fs::write(&path, generate_tests(&unicode_data)).unwrap();
-    }
-
     let mut table_file = String::new();
     writeln!(
         table_file,
@@ -279,8 +276,12 @@ fn main() {
         writeln!(table_file, "}}\n");
     }
 
-    std::fs::write(&write_location, table_file).unwrap();
-    rustfmt(&write_location);
+    let test_file = generate_tests(&unicode_data);
+
+    std::fs::write(&test_path, test_file).unwrap();
+    std::fs::write(&data_path, table_file).unwrap();
+    rustfmt(&data_path);
+    rustfmt(&test_path);
 }
 
 fn rustfmt(path: &str) {
@@ -303,79 +304,134 @@ fn version() -> String {
 }
 
 fn generate_tests(data: &UnicodeData) -> String {
-    let mut s = format!(
-        "#![feature(core_intrinsics)]
-        #![allow(internal_features, dead_code)]
-        // ignore-tidy-filelength
-        mod rt;
-        mod unicode_data;
-        fn main() {{"
+    let mut s = String::new();
+    writeln!(
+        s,
+        "//! This file is generated by `./x run src/tools/unicode-table-generator`; do not edit manually!\n"
     );
+    writeln!(s, "use std::ops::RangeInclusive;\n");
+    writeln!(s, "use core::unicode::unicode_data;\n");
     for (property, ranges) in &data.ranges {
-        let prop = property.to_lowercase();
+        let prop_lower = property.to_lowercase();
+        let prop_upper = property.to_uppercase();
         let (is_true, is_false): (Vec<_>, Vec<_>) = (char::MIN..=char::MAX)
             .filter(|c| !c.is_ascii())
             .map(u32::from)
             .partition(|c| ranges.iter().any(|r| r.contains(c)));
 
+        let is_true = ranges_from_set(&is_true);
+        let is_false = ranges_from_set(&is_false);
+
+        let is_true = is_true
+            .iter()
+            .map(|r| {
+                let start = char::from_u32(r.start).unwrap();
+                let end = char::from_u32(r.end - 1).unwrap();
+                start..=end
+            })
+            .collect::<Vec<_>>();
+        let is_false = is_false
+            .iter()
+            .map(|r| {
+                let start = char::from_u32(r.start).unwrap();
+                let end = char::from_u32(r.end - 1).unwrap();
+                start..=end
+            })
+            .collect::<Vec<_>>();
+
         writeln!(
             s,
-            "println!(\"Testing {prop}\");
-            {prop}_true();
-            {prop}_false();
-            fn {prop}_true() {{\n{}\n}}
-            fn {prop}_false() {{\n{}\n}}",
-            generate_asserts(&prop, &is_true, true),
-            generate_asserts(&prop, &is_false, false)
+            "
+#[test]
+#[cfg_attr(miri, ignore)]
+fn {prop_lower}_true() {{
+    for range in {prop_upper}_TRUE {{
+        for c in range.clone() {{
+            assert!(unicode_data::{prop_lower}::lookup(c), \"{{c:?}}\");
+        }}
+    }}
+}}
+#[rustfmt::skip]
+static {prop_upper}_TRUE: &[RangeInclusive<char>; {is_true_len}] = &[{is_true}];
+
+#[test]
+#[cfg_attr(miri, ignore)]
+fn {prop_lower}_false() {{
+    for range in {prop_upper}_FALSE {{
+        for c in range.clone() {{
+            assert!(!unicode_data::{prop_lower}::lookup(c), \"{{c:?}}\");
+        }}
+    }}
+}}
+#[rustfmt::skip]
+static {prop_upper}_FALSE: &[RangeInclusive<char>; {is_false_len}] = &[{is_false}];
+",
+            is_true_len = is_true.len(),
+            is_false_len = is_false.len(),
+            is_true = fmt_list(is_true),
+            is_false = fmt_list(is_false),
         );
     }
 
-    for (name, conversion) in ["to_lower", "to_upper"].iter().zip([&data.to_lower, &data.to_upper])
+    for (prop_lower, conversion) in
+        ["to_lower", "to_upper"].iter().zip([&data.to_lower, &data.to_upper])
     {
-        writeln!(s, r#"println!("Testing {name}");"#);
-        for (c, mapping) in conversion {
-            let c = char::from_u32(*c).unwrap();
-            let mapping = mapping.map(|c| char::from_u32(c).unwrap());
-            writeln!(s, "assert_eq!(unicode_data::conversions::{name}({c:?}), {mapping:?});");
-        }
+        let prop_upper = prop_lower.to_uppercase();
+
+        let mapped = conversion
+            .iter()
+            .map(|(c, chars)| {
+                (char::from_u32(*c).unwrap(), chars.map(|c| char::from_u32(c).unwrap()))
+            })
+            .collect::<Vec<_>>();
+
         let unmapped: Vec<_> = (char::MIN..=char::MAX)
             .filter(|c| !c.is_ascii())
             .map(u32::from)
             .filter(|c| !conversion.contains_key(c))
             .collect();
-        let unmapped_ranges = ranges_from_set(&unmapped);
-        for range in unmapped_ranges {
-            let start = char::from_u32(range.start).unwrap();
-            let end = char::from_u32(range.end - 1).unwrap();
-            writeln!(
-                s,
-                r#"for c in {start:?}..={end:?} {{
-                    assert_eq!(unicode_data::conversions::{name}(c), [c, '\0', '\0']);
-                }}"#
-            );
-        }
-    }
-
-    writeln!(s, "}}");
-    s
-}
+        let unmapped = ranges_from_set(&unmapped);
+        let unmapped = unmapped
+            .iter()
+            .map(|r| {
+                let start = char::from_u32(r.start).unwrap();
+                let end = char::from_u32(r.end - 1).unwrap();
+                start..=end
+            })
+            .collect::<Vec<_>>();
 
-fn generate_asserts(prop: &str, points: &[u32], truthy: bool) -> String {
-    let mut s = String::new();
-    let truthy = if truthy { "" } else { "!" };
-    for range in ranges_from_set(points) {
-        let start = char::from_u32(range.start).unwrap();
-        let end = char::from_u32(range.end - 1).unwrap();
-        match range.len() {
-            1 => writeln!(s, "assert!({truthy}unicode_data::{prop}::lookup({start:?}));"),
-            _ => writeln!(
-                s,
-                "for c in {start:?}..={end:?} {{
-                    assert!({truthy}unicode_data::{prop}::lookup(c));
-                }}"
-            ),
-        }
+        writeln!(
+            s,
+            r#"
+#[test]
+#[cfg_attr(miri, ignore)]
+fn {prop_lower}_mapped() {{
+    for (c, chars) in {prop_upper}_MAPPED {{
+        assert_eq!(unicode_data::conversions::{prop_lower}(*c), *chars, "{{c:?}}");
+    }}
+}}
+#[rustfmt::skip]
+static {prop_upper}_MAPPED: &[(char, [char; 3]); {mapped_len}] = &[{mapped}];
+
+#[test]
+#[cfg_attr(miri, ignore)]
+fn {prop_lower}_unmapped() {{
+    for range in {prop_upper}_UNMAPPED {{
+        for c in range.clone() {{
+            assert_eq!(unicode_data::conversions::{prop_lower}(c), [c, '\0', '\0'], "{{c:?}}");
+        }}
+    }}
+}}
+#[rustfmt::skip]
+static {prop_upper}_UNMAPPED: &[RangeInclusive<char>; {unmapped_len}] = &[{unmapped}];
+"#,
+            mapped_len = mapped.len(),
+            unmapped_len = unmapped.len(),
+            mapped = fmt_list(mapped),
+            unmapped = fmt_list(unmapped),
+        );
     }
+
     s
 }
 

Original file line number	Diff line number	Diff line change
`@@ -374,6 +374,7 @@ impl Step for UnicodeTableGenerator {`
`374`	`374`	`fn run(self, builder: &Builder<'_>) {`
`375`	`375`	`let mut cmd = builder.tool_cmd(Tool::UnicodeTableGenerator);`
`376`	`376`	`cmd.arg(builder.src.join("library/core/src/unicode/unicode_data.rs"));`
	`377`	`+ cmd.arg(builder.src.join("library/coretests/tests/char/unicode_data.rs"));`
`377`	`378`	`cmd.run(builder);`
`378`	`379`	`}`
`379`	`380`	`}`