Skip to content

Commit

Permalink
strings: Case::DontPreserve is now Case::Fold.
Browse files Browse the repository at this point in the history
We add `caseless` to do the folding. It matches upstream closely enough [^1],
unlike e.g. ICU4X's `CaseMapper` (which doesn't fold Eszett to "ss"), and, also
unlike ICU4X, it doesn't require us to bump our MSRV. 2/2 sgtm

A separate `--gfm-quirks` CLI option is added since base tests fail if
we just turn on all of GFM for them.

The nice thing about `caseless` is that while its last release may have been
6 years ago, it depends on unicode-normalization ^0.1, whose latest release
was 5 months ago. It's also [very easy to read][caseless], so I'm
all good with this.

[^1]: Not that straightforward: commonmark/commonmark-spec#695

[caseless]: https://github.com/unicode-rs/rust-caseless/blob/v0.2.1/src/lib.rs
  • Loading branch information
kivikakk committed Jul 10, 2024
1 parent d921f62 commit e72a208
Show file tree
Hide file tree
Showing 7 changed files with 67 additions and 23 deletions.
35 changes: 35 additions & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,7 @@ slug = "0.1.4"
emojis = { version = "0.6.2", optional = true }
arbitrary = { version = "1", optional = true, features = ["derive"] }
derive_builder = "0.20.0"
caseless = "0.2.1"

[dev-dependencies]
ntest = "0.9"
Expand Down
2 changes: 1 addition & 1 deletion script/cibuild
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ set +e
python3 spec_tests.py --no-normalize --spec ../../commonmark-spec/spec.txt "$PROGRAM_ARG" \
|| failed=1

python3 spec_tests.py --no-normalize --spec spec.txt "$PROGRAM_ARG --gfm" \
python3 spec_tests.py --no-normalize --spec spec.txt "$PROGRAM_ARG --gfm-quirks" \
|| failed=1
python3 pathological_tests.py "$PROGRAM_ARG" \
|| failed=1
Expand Down
12 changes: 8 additions & 4 deletions src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -57,12 +57,16 @@ struct Cli {
full_info_string: bool,

/// Enable GitHub-flavored markdown extensions: strikethrough, tagfilter,
/// table, autolink, and tasklist. Also enables --github-pre-lang, and
/// enables GFM-style quirks in output HTML, such as not nesting <strong>
/// tags, which otherwise breaks CommonMark compatibility.
/// table, autolink, and tasklist. Also enables --github-pre-lang and
/// --gfm-quirks.
#[arg(long)]
gfm: bool,

/// Enables GFM-style quirks in output HTML, such as not nesting <strong>
/// tags, which otherwise breaks CommonMark compatibility.
#[arg(long)]
gfm_quirks: bool,

/// Enable relaxing which character is allowed in a tasklists.
#[arg(long)]
relaxed_tasklist_character: bool,
Expand Down Expand Up @@ -286,7 +290,7 @@ fn main() -> Result<(), Box<dyn Error>> {
.escaped_char_spans(cli.escaped_char_spans)
.ignore_setext(cli.ignore_setext)
.ignore_empty_links(cli.ignore_empty_links)
.gfm_quirks(cli.gfm)
.gfm_quirks(cli.gfm_quirks || cli.gfm)
.build()?;

let options = Options {
Expand Down
2 changes: 1 addition & 1 deletion src/parser/inlines.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1530,7 +1530,7 @@ impl<'a, 'r, 'o, 'd, 'i, 'c, 'subj> Subject<'a, 'r, 'o, 'd, 'i, 'c, 'subj> {
}

// Need to normalize both to lookup in refmap and to call callback
let lab = strings::normalize_label(&lab, Case::DontPreserve);
let lab = strings::normalize_label(&lab, Case::Fold);
let mut reff = if found_label {
self.refmap.lookup(&lab)
} else {
Expand Down
6 changes: 3 additions & 3 deletions src/parser/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2147,7 +2147,7 @@ impl<'a, 'o, 'c> Parser<'a, 'o, 'c> {
match node.data.borrow().value {
NodeValue::FootnoteDefinition(ref nfd) => {
map.insert(
strings::normalize_label(&nfd.name, Case::DontPreserve),
strings::normalize_label(&nfd.name, Case::Fold),
FootnoteDefinition {
ix: None,
node,
Expand All @@ -2173,7 +2173,7 @@ impl<'a, 'o, 'c> Parser<'a, 'o, 'c> {
let mut replace = None;
match ast.value {
NodeValue::FootnoteReference(ref mut nfr) => {
let normalized = strings::normalize_label(&nfr.name, Case::DontPreserve);
let normalized = strings::normalize_label(&nfr.name, Case::Fold);
if let Some(ref mut footnote) = map.get_mut(&normalized) {
let ix = match footnote.ix {
Some(ix) => ix,
Expand Down Expand Up @@ -2405,7 +2405,7 @@ impl<'a, 'o, 'c> Parser<'a, 'o, 'c> {
}
}

lab = strings::normalize_label(&lab, Case::DontPreserve);
lab = strings::normalize_label(&lab, Case::Fold);
if !lab.is_empty() {
subj.refmap.map.entry(lab).or_insert(Reference {
url: String::from_utf8(strings::clean_url(url)).unwrap(),
Expand Down
32 changes: 18 additions & 14 deletions src/strings.rs
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ use std::str;
#[derive(PartialEq, Eq)]
pub enum Case {
Preserve,
DontPreserve,
Fold,
}

pub fn unescape(v: &mut Vec<u8>) {
Expand Down Expand Up @@ -262,13 +262,23 @@ pub fn normalize_label(i: &str, casing: Case) -> String {
}
} else {
last_was_whitespace = false;
match casing {
Case::Preserve => v.push(c),
Case::DontPreserve => v.push_str(&c.to_lowercase().to_string()),
}
v.push(c);
}
}
v

if casing == Case::Fold {
caseless::default_case_fold_str(&v)
} else {
v
}
}

#[test]
fn normalize_label_fold_test() {
assert_eq!(normalize_label("Abc \t\ndef", Case::Preserve), "Abc def");
assert_eq!(normalize_label("Abc \t\ndef", Case::Fold), "abc def");
assert_eq!(normalize_label("Straẞe", Case::Preserve), "Straẞe");
assert_eq!(normalize_label("Straẞe", Case::Fold), "strasse");
}

pub fn split_off_front_matter<'s>(mut s: &'s str, delimiter: &str) -> Option<(&'s str, &'s str)> {
Expand Down Expand Up @@ -356,14 +366,8 @@ pub mod tests {

#[test]
fn normalize_label_lowercase() {
assert_eq!(
normalize_label(" Foo\u{A0}BAR ", Case::DontPreserve),
"foo bar"
);
assert_eq!(
normalize_label(" FooİBAR ", Case::DontPreserve),
"fooi\u{307}bar"
);
assert_eq!(normalize_label(" Foo\u{A0}BAR ", Case::Fold), "foo bar");
assert_eq!(normalize_label(" FooİBAR ", Case::Fold), "fooi\u{307}bar");
}

#[test]
Expand Down

0 comments on commit e72a208

Please sign in to comment.