From e72a20888752a9fd9da1cbe7955a9c1cd409e1b0 Mon Sep 17 00:00:00 2001
From: Asherah Connor <ashe@kivikakk.ee>
Date: Wed, 10 Jul 2024 18:37:20 +0300
Subject: [PATCH] strings: Case::DontPreserve is now Case::Fold.

We add `caseless` to do the folding. It matches upstream closely enough
[^1], unlike e.g. ICU4X's `CaseMapper` (which doesn't fold Eszett to
"ss"), and, also unlike ICU4X, it doesn't require us to bump our MSRV.
2/2 sgtm

A separate `--gfm-quirks` CLI option is added since base tests fail if
we just turn on all of GFM for them.

The nice thing about `caseless` is that while its last release was
6 years ago, it depends on unicode-normalization ^0.1, the latest
release of which was 5 months ago. It's also
[very easy to read][caseless], so I'm all good with this.

[^1]: Not that straightforward: https://github.com/commonmark/commonmark-spec/issues/695

[caseless]: https://github.com/unicode-rs/rust-caseless/blob/v0.2.1/src/lib.rs
---
 Cargo.lock            | 35 +++++++++++++++++++++++++++++++++++
 Cargo.toml            |  1 +
 script/cibuild        |  2 +-
 src/main.rs           | 12 ++++++++----
 src/parser/inlines.rs |  2 +-
 src/parser/mod.rs     |  6 +++---
 src/strings.rs        | 32 ++++++++++++++++++--------------
 7 files changed, 67 insertions(+), 23 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock
index bdcfc456..afc18d1f 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -68,6 +68,16 @@ version = "1.3.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a"
 
+[[package]]
+name = "caseless"
+version = "0.2.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "808dab3318747be122cb31d36de18d4d1c81277a76f8332a02b81a3d73463d7f"
+dependencies = [
+ "regex",
+ "unicode-normalization",
+]
+
 [[package]]
 name = "cc"
 version = "1.0.78"
@@ -123,6 +133,7 @@ name = "comrak"
 version = "0.24.1"
 dependencies = [
  "arbitrary",
+ "caseless",
  "clap",
  "derive_builder",
  "emojis",
@@ -809,6 +820,21 @@ dependencies = [
  "time-core",
 ]
 
+[[package]]
+name = "tinyvec"
+version = "1.8.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "445e881f4f6d382d5f27c034e25eb92edd7c784ceab92a0937db7f2e9471b938"
+dependencies = [
+ "tinyvec_macros",
+]
+
+[[package]]
+name = "tinyvec_macros"
+version = "0.1.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "1f3ccbac311fea05f86f61904b462b55fb3df8837a366dfc601a0161d0532f20"
+
 [[package]]
 name = "toml"
 version = "0.5.10"
@@ -864,6 +890,15 @@ version = "1.0.6"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "84a22b9f218b40614adcb3f4ff08b703773ad44fa9423e4e0d346d5db86e4ebc"
 
+[[package]]
+name = "unicode-normalization"
+version = "0.1.23"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "a56d1686db2308d901306f92a263857ef59ea39678a5458e7cb17f01415101f5"
+dependencies = [
+ "tinyvec",
+]
+
 [[package]]
 name = "unicode_categories"
 version = "0.1.1"
diff --git a/Cargo.toml b/Cargo.toml
index 13cab088..9366fce4 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -46,6 +46,7 @@ slug = "0.1.4"
 emojis = { version = "0.6.2", optional = true }
 arbitrary = { version = "1", optional = true, features = ["derive"] }
 derive_builder = "0.20.0"
+caseless = "0.2.1"
 
 [dev-dependencies]
 ntest = "0.9"
diff --git a/script/cibuild b/script/cibuild
index c46c266d..794441cc 100755
--- a/script/cibuild
+++ b/script/cibuild
@@ -20,7 +20,7 @@ set +e
 python3 spec_tests.py --no-normalize --spec ../../commonmark-spec/spec.txt "$PROGRAM_ARG" \
 	|| failed=1
 
-python3 spec_tests.py --no-normalize --spec spec.txt "$PROGRAM_ARG --gfm" \
+python3 spec_tests.py --no-normalize --spec spec.txt "$PROGRAM_ARG --gfm-quirks" \
 	|| failed=1
 python3 pathological_tests.py "$PROGRAM_ARG" \
 	|| failed=1
diff --git a/src/main.rs b/src/main.rs
index 8c7a7fa6..22df9deb 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -57,12 +57,16 @@ struct Cli {
     full_info_string: bool,
 
     /// Enable GitHub-flavored markdown extensions: strikethrough, tagfilter,
-    /// table, autolink, and tasklist. Also enables --github-pre-lang, and
-    /// enables GFM-style quirks in output HTML, such as not nesting <strong>
-    /// tags, which otherwise breaks CommonMark compatibility.
+    /// table, autolink, and tasklist. Also enables --github-pre-lang and
+    /// --gfm-quirks.
     #[arg(long)]
     gfm: bool,
 
+    /// Enables GFM-style quirks in output HTML, such as not nesting <strong>
+    /// tags, which otherwise breaks CommonMark compatibility.
+    #[arg(long)]
+    gfm_quirks: bool,
+
     /// Enable relaxing which character is allowed in a tasklists.
     #[arg(long)]
     relaxed_tasklist_character: bool,
@@ -286,7 +290,7 @@ fn main() -> Result<(), Box<dyn Error>> {
         .escaped_char_spans(cli.escaped_char_spans)
         .ignore_setext(cli.ignore_setext)
         .ignore_empty_links(cli.ignore_empty_links)
-        .gfm_quirks(cli.gfm)
+        .gfm_quirks(cli.gfm_quirks || cli.gfm)
         .build()?;
 
     let options = Options {
diff --git a/src/parser/inlines.rs b/src/parser/inlines.rs
index 900957a9..f288ec50 100644
--- a/src/parser/inlines.rs
+++ b/src/parser/inlines.rs
@@ -1530,7 +1530,7 @@ impl<'a, 'r, 'o, 'd, 'i, 'c, 'subj> Subject<'a, 'r, 'o, 'd, 'i, 'c, 'subj> {
         }
 
         // Need to normalize both to lookup in refmap and to call callback
-        let lab = strings::normalize_label(&lab, Case::DontPreserve);
+        let lab = strings::normalize_label(&lab, Case::Fold);
         let mut reff = if found_label {
             self.refmap.lookup(&lab)
         } else {
diff --git a/src/parser/mod.rs b/src/parser/mod.rs
index eb03d20b..9d0a8c51 100644
--- a/src/parser/mod.rs
+++ b/src/parser/mod.rs
@@ -2147,7 +2147,7 @@ impl<'a, 'o, 'c> Parser<'a, 'o, 'c> {
         match node.data.borrow().value {
             NodeValue::FootnoteDefinition(ref nfd) => {
                 map.insert(
-                    strings::normalize_label(&nfd.name, Case::DontPreserve),
+                    strings::normalize_label(&nfd.name, Case::Fold),
                     FootnoteDefinition {
                         ix: None,
                         node,
@@ -2173,7 +2173,7 @@ impl<'a, 'o, 'c> Parser<'a, 'o, 'c> {
         let mut replace = None;
         match ast.value {
             NodeValue::FootnoteReference(ref mut nfr) => {
-                let normalized = strings::normalize_label(&nfr.name, Case::DontPreserve);
+                let normalized = strings::normalize_label(&nfr.name, Case::Fold);
                 if let Some(ref mut footnote) = map.get_mut(&normalized) {
                     let ix = match footnote.ix {
                         Some(ix) => ix,
@@ -2405,7 +2405,7 @@ impl<'a, 'o, 'c> Parser<'a, 'o, 'c> {
             }
         }
 
-        lab = strings::normalize_label(&lab, Case::DontPreserve);
+        lab = strings::normalize_label(&lab, Case::Fold);
         if !lab.is_empty() {
             subj.refmap.map.entry(lab).or_insert(Reference {
                 url: String::from_utf8(strings::clean_url(url)).unwrap(),
diff --git a/src/strings.rs b/src/strings.rs
index 0403ee89..42206bc3 100644
--- a/src/strings.rs
+++ b/src/strings.rs
@@ -7,7 +7,7 @@ use std::str;
 #[derive(PartialEq, Eq)]
 pub enum Case {
     Preserve,
-    DontPreserve,
+    Fold,
 }
 
 pub fn unescape(v: &mut Vec<u8>) {
@@ -262,13 +262,23 @@ pub fn normalize_label(i: &str, casing: Case) -> String {
             }
         } else {
             last_was_whitespace = false;
-            match casing {
-                Case::Preserve => v.push(c),
-                Case::DontPreserve => v.push_str(&c.to_lowercase().to_string()),
-            }
+            v.push(c);
         }
     }
-    v
+
+    if casing == Case::Fold {
+        caseless::default_case_fold_str(&v)
+    } else {
+        v
+    }
+}
+
+#[test]
+fn normalize_label_fold_test() {
+    assert_eq!(normalize_label("Abc   \t\ndef", Case::Preserve), "Abc def");
+    assert_eq!(normalize_label("Abc   \t\ndef", Case::Fold), "abc def");
+    assert_eq!(normalize_label("Straẞe", Case::Preserve), "Straẞe");
+    assert_eq!(normalize_label("Straẞe", Case::Fold), "strasse");
 }
 
 pub fn split_off_front_matter<'s>(mut s: &'s str, delimiter: &str) -> Option<(&'s str, &'s str)> {
@@ -356,14 +366,8 @@ pub mod tests {
 
     #[test]
     fn normalize_label_lowercase() {
-        assert_eq!(
-            normalize_label("  Foo\u{A0}BAR  ", Case::DontPreserve),
-            "foo bar"
-        );
-        assert_eq!(
-            normalize_label("  FooİBAR  ", Case::DontPreserve),
-            "fooi\u{307}bar"
-        );
+        assert_eq!(normalize_label("  Foo\u{A0}BAR  ", Case::Fold), "foo bar");
+        assert_eq!(normalize_label("  FooİBAR  ", Case::Fold), "fooi\u{307}bar");
     }
 
     #[test]