From e72a20888752a9fd9da1cbe7955a9c1cd409e1b0 Mon Sep 17 00:00:00 2001 From: Asherah Connor Date: Wed, 10 Jul 2024 18:37:20 +0300 Subject: [PATCH] strings: Case::DontPreserve is now Case::Fold. We add `caseless` to do the folding. It matches upstream enough [^1], unlike e.g. ICU4X's `CaseMapper` (doesn't fold Eszett to "ss"), and also unlike ICU4X, it doesn't require us to bump our MSRV. 2/2 sgtm A separate `--gfm-quirks` CLI option is added since base tests fail if we just turn on all of GFM for them. The nice thing about `caseless` is that while its last release may be 6 years ago, it depends on unicode-normalization ^0.1, the latest of which is 5 months ago. It's also [very easy to read][caseless], so I'm all good with this. [^1] Not that straightforward: https://github.com/commonmark/commonmark-spec/issues/695 [caseless]: https://github.com/unicode-rs/rust-caseless/blob/v0.2.1/src/lib.rs --- Cargo.lock | 35 +++++++++++++++++++++++++++++++++++ Cargo.toml | 1 + script/cibuild | 2 +- src/main.rs | 12 ++++++++---- src/parser/inlines.rs | 2 +- src/parser/mod.rs | 6 +++--- src/strings.rs | 32 ++++++++++++++++++-------------- 7 files changed, 67 insertions(+), 23 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index bdcfc456..afc18d1f 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -68,6 +68,16 @@ version = "1.3.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a" +[[package]] +name = "caseless" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "808dab3318747be122cb31d36de18d4d1c81277a76f8332a02b81a3d73463d7f" +dependencies = [ + "regex", + "unicode-normalization", +] + [[package]] name = "cc" version = "1.0.78" @@ -123,6 +133,7 @@ name = "comrak" version = "0.24.1" dependencies = [ "arbitrary", + "caseless", "clap", "derive_builder", "emojis", @@ -809,6 +820,21 @@ dependencies = [ "time-core", ] +[[package]] +name = "tinyvec" 
+version = "1.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "445e881f4f6d382d5f27c034e25eb92edd7c784ceab92a0937db7f2e9471b938" +dependencies = [ + "tinyvec_macros", +] + +[[package]] +name = "tinyvec_macros" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1f3ccbac311fea05f86f61904b462b55fb3df8837a366dfc601a0161d0532f20" + [[package]] name = "toml" version = "0.5.10" @@ -864,6 +890,15 @@ version = "1.0.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "84a22b9f218b40614adcb3f4ff08b703773ad44fa9423e4e0d346d5db86e4ebc" +[[package]] +name = "unicode-normalization" +version = "0.1.23" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a56d1686db2308d901306f92a263857ef59ea39678a5458e7cb17f01415101f5" +dependencies = [ + "tinyvec", +] + [[package]] name = "unicode_categories" version = "0.1.1" diff --git a/Cargo.toml b/Cargo.toml index 13cab088..9366fce4 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -46,6 +46,7 @@ slug = "0.1.4" emojis = { version = "0.6.2", optional = true } arbitrary = { version = "1", optional = true, features = ["derive"] } derive_builder = "0.20.0" +caseless = "0.2.1" [dev-dependencies] ntest = "0.9" diff --git a/script/cibuild b/script/cibuild index c46c266d..794441cc 100755 --- a/script/cibuild +++ b/script/cibuild @@ -20,7 +20,7 @@ set +e python3 spec_tests.py --no-normalize --spec ../../commonmark-spec/spec.txt "$PROGRAM_ARG" \ || failed=1 -python3 spec_tests.py --no-normalize --spec spec.txt "$PROGRAM_ARG --gfm" \ +python3 spec_tests.py --no-normalize --spec spec.txt "$PROGRAM_ARG --gfm-quirks" \ || failed=1 python3 pathological_tests.py "$PROGRAM_ARG" \ || failed=1 diff --git a/src/main.rs b/src/main.rs index 8c7a7fa6..22df9deb 100644 --- a/src/main.rs +++ b/src/main.rs @@ -57,12 +57,16 @@ struct Cli { full_info_string: bool, /// Enable GitHub-flavored markdown extensions: strikethrough, tagfilter, - /// 
table, autolink, and tasklist. Also enables --github-pre-lang, and - /// enables GFM-style quirks in output HTML, such as not nesting - /// <strong> tags, which otherwise breaks CommonMark compatibility. + /// table, autolink, and tasklist. Also enables --github-pre-lang and + /// --gfm-quirks. #[arg(long)] gfm: bool, + /// Enables GFM-style quirks in output HTML, such as not nesting + /// <strong> tags, which otherwise breaks CommonMark compatibility. + #[arg(long)] + gfm_quirks: bool, + /// Enable relaxing which character is allowed in a tasklists. #[arg(long)] relaxed_tasklist_character: bool, @@ -286,7 +290,7 @@ fn main() -> Result<(), Box<dyn Error>> { .escaped_char_spans(cli.escaped_char_spans) .ignore_setext(cli.ignore_setext) .ignore_empty_links(cli.ignore_empty_links) - .gfm_quirks(cli.gfm) + .gfm_quirks(cli.gfm_quirks || cli.gfm) .build()?; let options = Options { diff --git a/src/parser/inlines.rs b/src/parser/inlines.rs index 900957a9..f288ec50 100644 --- a/src/parser/inlines.rs +++ b/src/parser/inlines.rs @@ -1530,7 +1530,7 @@ impl<'a, 'r, 'o, 'd, 'i, 'c, 'subj> Subject<'a, 'r, 'o, 'd, 'i, 'c, 'subj> { } // Need to normalize both to lookup in refmap and to call callback - let lab = strings::normalize_label(&lab, Case::DontPreserve); + let lab = strings::normalize_label(&lab, Case::Fold); let mut reff = if found_label { self.refmap.lookup(&lab) } else { diff --git a/src/parser/mod.rs b/src/parser/mod.rs index eb03d20b..9d0a8c51 100644 --- a/src/parser/mod.rs +++ b/src/parser/mod.rs @@ -2147,7 +2147,7 @@ impl<'a, 'o, 'c> Parser<'a, 'o, 'c> { match node.data.borrow().value { NodeValue::FootnoteDefinition(ref nfd) => { map.insert( - strings::normalize_label(&nfd.name, Case::DontPreserve), + strings::normalize_label(&nfd.name, Case::Fold), FootnoteDefinition { ix: None, node, @@ -2173,7 +2173,7 @@ impl<'a, 'o, 'c> Parser<'a, 'o, 'c> { let mut replace = None; match ast.value { NodeValue::FootnoteReference(ref mut nfr) => { - let normalized = strings::normalize_label(&nfr.name, 
Case::DontPreserve); + let normalized = strings::normalize_label(&nfr.name, Case::Fold); if let Some(ref mut footnote) = map.get_mut(&normalized) { let ix = match footnote.ix { Some(ix) => ix, @@ -2405,7 +2405,7 @@ impl<'a, 'o, 'c> Parser<'a, 'o, 'c> { } } - lab = strings::normalize_label(&lab, Case::DontPreserve); + lab = strings::normalize_label(&lab, Case::Fold); if !lab.is_empty() { subj.refmap.map.entry(lab).or_insert(Reference { url: String::from_utf8(strings::clean_url(url)).unwrap(), diff --git a/src/strings.rs b/src/strings.rs index 0403ee89..42206bc3 100644 --- a/src/strings.rs +++ b/src/strings.rs @@ -7,7 +7,7 @@ use std::str; #[derive(PartialEq, Eq)] pub enum Case { Preserve, - DontPreserve, + Fold, } pub fn unescape(v: &mut Vec<u8>) { @@ -262,13 +262,23 @@ pub fn normalize_label(i: &str, casing: Case) -> String { } } else { last_was_whitespace = false; - match casing { - Case::Preserve => v.push(c), - Case::DontPreserve => v.push_str(&c.to_lowercase().to_string()), - } + v.push(c); } } - v + + if casing == Case::Fold { + caseless::default_case_fold_str(&v) + } else { + v + } +} + +#[test] +fn normalize_label_fold_test() { + assert_eq!(normalize_label("Abc \t\ndef", Case::Preserve), "Abc def"); + assert_eq!(normalize_label("Abc \t\ndef", Case::Fold), "abc def"); + assert_eq!(normalize_label("Straẞe", Case::Preserve), "Straẞe"); + assert_eq!(normalize_label("Straẞe", Case::Fold), "strasse"); } pub fn split_off_front_matter<'s>(mut s: &'s str, delimiter: &str) -> Option<(&'s str, &'s str)> { @@ -356,14 +366,8 @@ pub mod tests { #[test] fn normalize_label_lowercase() { - assert_eq!( - normalize_label(" Foo\u{A0}BAR ", Case::DontPreserve), - "foo bar" - ); - assert_eq!( - normalize_label(" FooİBAR ", Case::DontPreserve), - "fooi\u{307}bar" - ); + assert_eq!(normalize_label(" Foo\u{A0}BAR ", Case::Fold), "foo bar"); + assert_eq!(normalize_label(" FooİBAR ", Case::Fold), "fooi\u{307}bar"); } #[test]