Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Do not attempt downcasing first when case-folding a Char #13542

Merged
merged 2 commits into from
Jun 6, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions spec/std/char_spec.cr
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,8 @@ describe "Char" do
actual.should eq(['s', 's'])
end
it { 'Ń'.downcase(Unicode::CaseOptions::Fold).should eq('ń') }
it { 'ꭰ'.downcase(Unicode::CaseOptions::Fold).should eq('Ꭰ') }
it { 'Ꭰ'.downcase(Unicode::CaseOptions::Fold).should eq('Ꭰ') }
end

it "#succ" do
Expand Down
35 changes: 12 additions & 23 deletions spec/std/string_spec.cr
Original file line number Diff line number Diff line change
Expand Up @@ -635,33 +635,22 @@ describe "String" do
end

describe "#downcase" do
it { "HELLO!".downcase.should eq("hello!") }
it { "HELLO MAN!".downcase.should eq("hello man!") }
it { "ÁÉÍÓÚĀ".downcase.should eq("áéíóúā") }
it { "AEIİOU".downcase(Unicode::CaseOptions::Turkic).should eq("aeıiou") }
it { "ÁEÍOÚ".downcase(Unicode::CaseOptions::ASCII).should eq("ÁeÍoÚ") }
it { "İ".downcase.should eq("i̇") }
it { "Baffle".downcase(Unicode::CaseOptions::Fold).should eq("baffle") }
it { "ff".downcase(Unicode::CaseOptions::Fold).should eq("ff") }
it { "tschüß".downcase(Unicode::CaseOptions::Fold).should eq("tschüss") }
it { "ΣίσυφοςfiÆ".downcase(Unicode::CaseOptions::Fold).should eq("σίσυφοσfiæ") }
it { assert_prints "HELLO!".downcase, "hello!" }
it { assert_prints "HELLO MAN!".downcase, "hello man!" }
it { assert_prints "ÁÉÍÓÚĀ".downcase, "áéíóúā" }
it { assert_prints "AEIİOU".downcase(Unicode::CaseOptions::Turkic), "aeıiou" }
it { assert_prints "ÁEÍOÚ".downcase(Unicode::CaseOptions::ASCII), "ÁeÍoÚ" }
it { assert_prints "İ".downcase, "i̇" }
it { assert_prints "Baffle".downcase(Unicode::CaseOptions::Fold), "baffle" }
it { assert_prints "ff".downcase(Unicode::CaseOptions::Fold), "ff" }
it { assert_prints "tschüß".downcase(Unicode::CaseOptions::Fold), "tschüss" }
it { assert_prints "ΣίσυφοςfiÆ".downcase(Unicode::CaseOptions::Fold), "σίσυφοσfiæ" }
it { assert_prints "ꭰ".downcase(Unicode::CaseOptions::Fold), "Ꭰ" }
it { assert_prints "Ꭰ".downcase(Unicode::CaseOptions::Fold), "Ꭰ" }

it "does not touch invalid code units in an otherwise ascii string" do
"\xB5!\xE0\xC1\xB5?".downcase.should eq("\xB5!\xE0\xC1\xB5?")
end

describe "with IO" do
it { String.build { |io| "HELLO!".downcase io }.should eq "hello!" }
it { String.build { |io| "HELLO MAN!".downcase io }.should eq "hello man!" }
it { String.build { |io| "ÁÉÍÓÚĀ".downcase io }.should eq "áéíóúā" }
it { String.build { |io| "AEIİOU".downcase io, Unicode::CaseOptions::Turkic }.should eq "aeıiou" }
it { String.build { |io| "ÁEÍOÚ".downcase io, Unicode::CaseOptions::ASCII }.should eq "ÁeÍoÚ" }
it { String.build { |io| "İ".downcase io }.should eq "i̇" }
it { String.build { |io| "Baffle".downcase io, Unicode::CaseOptions::Fold }.should eq "baffle" }
it { String.build { |io| "ff".downcase io, Unicode::CaseOptions::Fold }.should eq "ff" }
it { String.build { |io| "tschüß".downcase io, Unicode::CaseOptions::Fold }.should eq "tschüss" }
it { String.build { |io| "ΣίσυφοςfiÆ".downcase io, Unicode::CaseOptions::Fold }.should eq "σίσυφοσfiæ" }
end
end

describe "#upcase" do
Expand Down
33 changes: 31 additions & 2 deletions src/char.cr
Original file line number Diff line number Diff line change
Expand Up @@ -400,17 +400,46 @@ struct Char
# 'x'.downcase # => 'x'
# '.'.downcase # => '.'
# ```
#
# If `options.fold?` is true, then returns the case-folded equivalent instead.
# Note that this will return `self` if a multiple-character case folding
# exists, even if a separate single-character transformation is also defined
# in Unicode.
#
# ```
# 'Z'.downcase(Unicode::CaseOptions::Fold) # => 'z'
# 'x'.downcase(Unicode::CaseOptions::Fold) # => 'x'
# 'ς'.downcase(Unicode::CaseOptions::Fold) # => 'σ'
# 'ꭰ'.downcase(Unicode::CaseOptions::Fold) # => 'Ꭰ'
# 'ẞ'.downcase(Unicode::CaseOptions::Fold) # => 'ẞ' # not U+00DF 'ß'
# 'ᾈ'.downcase(Unicode::CaseOptions::Fold) # => 'ᾈ' # not U+1F80 'ᾀ'
# ```
def downcase(options : Unicode::CaseOptions = :none) : Char
Unicode.downcase(self, options)
if options.fold?
Unicode.foldcase(self, options)
else
Unicode.downcase(self, options)
end
end

# Yields each char for the downcase equivalent of this char.
#
# This method takes into account the possibility that a downcase
# version of a char might result in multiple chars, like for
# 'İ', which results in 'i' and a dot mark.
#
# ```
# 'Z'.downcase { |v| puts v } # prints 'z'
# 'ς'.downcase(Unicode::CaseOptions::Fold) { |v| puts v } # prints 'σ'
# 'ẞ'.downcase(Unicode::CaseOptions::Fold) { |v| puts v } # prints 's', 's'
# 'ᾈ'.downcase(Unicode::CaseOptions::Fold) { |v| puts v } # prints 'ἀ', 'ι'
# ```
def downcase(options : Unicode::CaseOptions = :none, &)
Unicode.downcase(self, options) { |char| yield char }
if options.fold?
Unicode.foldcase(self, options) { |char| yield char }
else
Unicode.downcase(self, options) { |char| yield char }
end
end

# Returns the upcase equivalent of this char.
Expand Down
61 changes: 42 additions & 19 deletions src/unicode/unicode.cr
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,19 @@ module Unicode
Turkic

# Unicode case folding, which is more far-reaching than Unicode case mapping.
#
# Note that only full mappings are defined, and calling `Char#downcase` with
# this option will return its receiver unchanged if a multiple-character
# case folding exists, even if a separate single-character transformation is
# also defined in Unicode.
#
# ```
# "ẞ".downcase(Unicode::CaseOptions::Fold) # => "ss"
# 'ẞ'.downcase(Unicode::CaseOptions::Fold) # => 'ẞ' # not U+00DF 'ß'
#
# "ᾈ".downcase(Unicode::CaseOptions::Fold) # => "ἀι"
# 'ᾈ'.downcase(Unicode::CaseOptions::Fold) # => 'ᾈ' # not U+1F80 'ᾀ'
# ```
Fold
end

Expand Down Expand Up @@ -224,9 +237,6 @@ module Unicode
result = check_downcase_turkic(char, options)
return result if result

results = check_downcase_fold(char, options)
return results[0].unsafe_chr if results && results.size == 1

check_downcase_ranges(char)
end

Expand All @@ -244,12 +254,6 @@ module Unicode
return
end

result = check_downcase_fold(char, options)
if result
result.each { |c| yield c.unsafe_chr if c != 0 }
return
end

result = special_cases_downcase[char.ord]?
if result
result.each { |c| yield c.unsafe_chr if c != 0 }
Expand Down Expand Up @@ -283,16 +287,6 @@ module Unicode
end
end

# Looks up the case folding of *char* when `options.fold?` is set.
#
# Returns a one-element tuple holding the folded codepoint when *char* is
# found in `casefold_ranges`, otherwise the `fold_cases` entry for its
# codepoint (or `nil` when there is none). Returns `nil` when folding was not
# requested.
private def self.check_downcase_fold(char, options)
if options.fold?
# Range hit: the table value is added to the codepoint to get the folded one.
result = search_ranges(casefold_ranges, char.ord)
return {char.ord + result} if result

# No range hit: fall back to the per-codepoint table.
return fold_cases[char.ord]?
end
nil
end

private def self.check_downcase_ranges(char)
result = search_ranges(downcase_ranges, char.ord)
return char + result if result
Expand All @@ -303,6 +297,35 @@ module Unicode
char
end

# :nodoc:
def self.foldcase(char : Char, options : CaseOptions) : Char
  # Only a folding that consists of exactly one codepoint can be expressed as
  # a single Char; in every other case (no folding found, or a result with
  # more than one element) the receiver is handed back unchanged.
  folded = check_foldcase(char, options)
  if folded && folded.size == 1
    folded[0].unsafe_chr
  else
    char
  end
end

# :nodoc:
def self.foldcase(char : Char, options : CaseOptions, &)
  # Yields every codepoint of the case folding of *char*, or *char* itself
  # when no folding is found.
  if folded = check_foldcase(char, options)
    folded.each do |codepoint|
      # Zero entries are skipped (presumably padding in fixed-size table rows).
      next if codepoint == 0
      yield codepoint.unsafe_chr
    end
    return
  end

  yield char
end

# Looks up the case folding of *char*, but only when `options.fold?` is set.
#
# Returns a one-element tuple with the folded codepoint when *char* is found
# in `casefold_ranges`, otherwise whatever `fold_cases` holds for its
# codepoint (`nil` if nothing). Returns `nil` when folding was not requested.
private def self.check_foldcase(char, options)
  return nil unless options.fold?

  codepoint = char.ord

  # Range hit: the table value is added to the codepoint to get the folded one.
  if delta = search_ranges(casefold_ranges, codepoint)
    return {codepoint + delta}
  end

  fold_cases[codepoint]?
end

# :nodoc:
def self.lowercase?(char : Char) : Bool
in_category?(char.ord, category_Ll)
Expand Down