Skip to content

Commit 38a9725

Browse files
authored
Support case-changes to Annotated{String,Char}s (#54013)
Previously, any case changes to Annotated{String,Char} types triggered "fall back to non-annotated type" non-specialised methods. It would be nice to keep the annotations though, and that can be done so long as we keep track of any potential changes to the number of bytes taken by each character on case changes. This is unusual, but can happen with some letters (e.g. the upper case of 'ſ' is 'S'). To handle this, a helper function annotated_chartransform is introduced. This allows for efficient uppercase/lowercase methods (about 50% overhead in managing the annotation ranges, compared to just transforming a String). The {upper,lower}casefirst and titlecase transformations are much more inefficient with this style of implementation, but not prohibitively so. If somebody has a bright idea, or they emerge as an area deserving of more attention, the performance characteristics can be improved. As a bonus, a specialised textwidth method is implemented to avoid the generic fallback, providing a ~12x performance improvement. To check that annotated_chartransform is accurate, as are the specialised case-transformations, a few million random collections of strings were pre- and post-annotated and checked to be the same in a fuzzing check performed with Supposition.jl. 
const short_str = Data.Text(Data.Characters(), max_len=20)
const short_strs = Data.Vectors(short_str, max_size=10)
const case_transform_fn = Data.SampledFrom((uppercase, lowercase))

function annot_caseinvariant(f::Function, strs::Vector{String})
    annot_strs = map(((i, s),) -> AnnotatedString(s, [(1:ncodeunits(s), :i => i)]), enumerate(strs))
    f_annot_strs = map(((i, s),) -> AnnotatedString(s, [(1:ncodeunits(s), :i => i)]), enumerate(map(f, strs)))
    pre_join = Base.annotated_chartransform(f, join(annot_strs))
    post_join = join(f_annot_strs)
    pre_join == post_join
end

@check max_examples=1_000_000 annot_caseinvariant(case_transform_fn, short_strs)

This helped me determine that in annotated_chartransform the "- 1" was needed in the offset position calculation, and that in the "findlast" calls less than *or equal* was the correct comparison.
1 parent c741bd3 commit 38a9725

File tree

3 files changed

+123
-1
lines changed

3 files changed

+123
-1
lines changed

base/strings/annotated.jl

Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -399,6 +399,51 @@ Get all annotations of `chr`, in the form of a vector of annotation pairs.
399399
"""
400400
annotations(c::AnnotatedChar) = c.annotations
401401

402+
## Character transformation helper function, c.f. `unicode.jl`.

"""
    annotated_chartransform(f::Function, str::AnnotatedString, state=nothing)

Transform every character in `str` with `f`, adjusting annotation regions as
appropriate. `f` must take one of two forms, either:
- `f(c::Char) -> Char`, or
- `f(c::Char, state) -> (Char, state)`.

The one-argument form is used when `state` is `nothing` (the default). Otherwise
the two-argument form is used for every character, with the state returned by
each call passed to the next.

This works by comparing the number of code units of each character before and
after transforming with `f`, recording and aggregating any differences, then
applying them to the annotation regions.

Returns an `AnnotatedString{String}` (regardless of the original underlying
string type of `str`).
"""
function annotated_chartransform(f::Function, str::AnnotatedString, state=nothing)
    outstr = IOBuffer()
    annots = Tuple{UnitRange{Int}, Pair{Symbol, Any}}[]
    # Fix the calling convention up front, so that a stateful `f` which happens
    # to return `nothing` as its next state is not suddenly called as `f(c)`.
    stateful = !isnothing(state)
    bytepos = firstindex(str) - 1
    # Each entry is `position => shift`: from byte `position` of the input
    # onwards, indices in the output differ from the input by `shift` bytes
    # (cumulative). The initial entry guarantees every lookup below succeeds.
    offsets = [bytepos => 0]
    for c in str.string
        oldnb = ncodeunits(c)
        # `bytepos` now points at the last byte of `c` in the input.
        bytepos += oldnb
        if stateful
            c, state = f(c, state)
        else
            c = f(c)
        end
        nb = write(outstr, c)
        if nb != oldnb
            push!(offsets, bytepos => last(last(offsets)) + nb - oldnb)
        end
    end
    for (region, value) in str.annotations
        start, stop = first(region), last(region)
        # Use the most recent shift recorded at or before each endpoint.
        start_offset = last(offsets[findlast(<=(start) ∘ first, offsets)::Int])
        stop_offset  = last(offsets[findlast(<=(stop) ∘ first, offsets)::Int])
        push!(annots, ((start + start_offset):(stop + stop_offset), value))
    end
    return AnnotatedString(String(take!(outstr)), annots)
end
446+
402447
## AnnotatedIOBuffer
403448

404449
struct AnnotatedIOBuffer <: AbstractPipe

base/strings/unicode.jl

Lines changed: 51 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,8 @@
44
module Unicode
55

66
import Base: show, ==, hash, string, Symbol, isless, length, eltype,
7-
convert, isvalid, ismalformed, isoverlong, iterate
7+
convert, isvalid, ismalformed, isoverlong, iterate,
8+
AnnotatedString, AnnotatedChar, annotated_chartransform
89

910
# whether codepoints are valid Unicode scalar values, i.e. 0-0xd7ff, 0xe000-0x10ffff
1011

@@ -271,6 +272,8 @@ julia> textwidth("March")
271272
"""
272273
textwidth(s::AbstractString) = mapreduce(textwidth, +, s; init=0)
273274

275+
# The width of an annotated string is that of its underlying string; measuring
# it directly avoids the generic per-character `AbstractString` fallback.
function textwidth(s::AnnotatedString)
    return textwidth(s.string)
end
276+
274277
"""
275278
lowercase(c::AbstractChar)
276279
@@ -290,6 +293,8 @@ julia> lowercase('Ö')
290293
lowercase(c::T) where {T<:AbstractChar} = isascii(c) ? ('A' <= c <= 'Z' ? c + 0x20 : c) :
291294
T(ccall(:utf8proc_tolower, UInt32, (UInt32,), c))
292295

296+
# Lowercase the wrapped character, carrying its annotations over unchanged.
# The field is read directly: `annotations` is not among the names this module
# explicitly imports from `Base`, and for an `AnnotatedChar` the accessor is
# just `c.annotations`.
lowercase(c::AnnotatedChar) = AnnotatedChar(lowercase(c.char), c.annotations)
297+
293298
"""
294299
uppercase(c::AbstractChar)
295300
@@ -309,6 +314,8 @@ julia> uppercase('ê')
309314
uppercase(c::T) where {T<:AbstractChar} = isascii(c) ? ('a' <= c <= 'z' ? c - 0x20 : c) :
310315
T(ccall(:utf8proc_toupper, UInt32, (UInt32,), c))
311316

317+
# Uppercase the wrapped character, carrying its annotations over unchanged.
# The field is read directly: `annotations` is not among the names this module
# explicitly imports from `Base`, and for an `AnnotatedChar` the accessor is
# just `c.annotations`.
uppercase(c::AnnotatedChar) = AnnotatedChar(uppercase(c.char), c.annotations)
318+
312319
"""
313320
titlecase(c::AbstractChar)
314321
@@ -332,6 +339,8 @@ julia> uppercase('dž')
332339
titlecase(c::T) where {T<:AbstractChar} = isascii(c) ? ('a' <= c <= 'z' ? c - 0x20 : c) :
333340
T(ccall(:utf8proc_totitle, UInt32, (UInt32,), c))
334341

342+
# Titlecase the wrapped character, carrying its annotations over unchanged.
# The field is read directly: `annotations` is not among the names this module
# explicitly imports from `Base`, and for an `AnnotatedChar` the accessor is
# just `c.annotations`.
titlecase(c::AnnotatedChar) = AnnotatedChar(titlecase(c.char), c.annotations)
343+
335344
############################################################################
336345

337346
# returns UTF8PROC_CATEGORY code in 0:30 giving Unicode category
@@ -606,6 +615,7 @@ julia> uppercase("Julia")
606615
```
607616
"""
608617
uppercase(s::AbstractString) = map(uppercase, s)
618+
# Annotation-preserving variant: uppercase character by character and let
# `annotated_chartransform` adjust the annotation regions for any change in
# the number of code units.
function uppercase(s::AnnotatedString)
    return annotated_chartransform(uppercase, s)
end
609619

610620
"""
611621
lowercase(s::AbstractString)
@@ -621,6 +631,7 @@ julia> lowercase("STRINGS AND THINGS")
621631
```
622632
"""
623633
lowercase(s::AbstractString) = map(lowercase, s)
634+
# Annotation-preserving variant: lowercase character by character and let
# `annotated_chartransform` adjust the annotation regions for any change in
# the number of code units.
function lowercase(s::AnnotatedString)
    return annotated_chartransform(lowercase, s)
end
624635

625636
"""
626637
titlecase(s::AbstractString; [wordsep::Function], strict::Bool=true) -> String
@@ -669,6 +680,23 @@ function titlecase(s::AbstractString; wordsep::Function = !isletter, strict::Boo
669680
return String(take!(b))
670681
end
671682

683+
# TODO: improve performance characteristics, room for a ~10x improvement.
684+
function titlecase(s::AnnotatedString; wordsep::Function = !isletter, strict::Bool=true)
    # State threaded through `annotated_chartransform`:
    #   startword - whether the next non-separator character begins a word
    #   state     - grapheme-break state updated in place by `isgraphemebreak!`
    #   c0        - the previously seen character
    #   wordsep, strict - the keyword arguments, carried along unchanged
    # `c0` is seeded with the character type of the underlying string, because
    # the value stored back into it on each step is the character handed to the
    # callback; this keeps the state's type the same from one step to the next.
    initial_state = (; startword = true, state = Ref{Int32}(0),
                     c0 = eltype(s.string)(zero(UInt32)), wordsep, strict)
    return annotated_chartransform(s, initial_state) do c, state
        if isgraphemebreak!(state.state, state.c0, c) && state.wordsep(c)
            # A separator at a grapheme boundary: emit it as-is and start a new word.
            cnew = c
            startword = true
        else
            cnew = state.startword ? titlecase(c) : state.strict ? lowercase(c) : c
            startword = false
        end
        cnew, merge(state, (; startword, c0 = c))
    end
end
699+
672700
"""
673701
uppercasefirst(s::AbstractString) -> String
674702
@@ -693,6 +721,17 @@ function uppercasefirst(s::AbstractString)
693721
string(c′, SubString(s, nextind(s, 1)))
694722
end
695723

724+
# TODO: improve performance characteristics, room for a ~5x improvement.
725+
function uppercasefirst(s::AnnotatedString)
    # The Bool state is `true` only while the first character is pending; that
    # character is titlecased and every later one passes through untouched.
    return annotated_chartransform(s, true) do c, isfirst
        isfirst ? (titlecase(c), false) : (c, false)
    end
end
734+
696735
"""
697736
lowercasefirst(s::AbstractString)
698737
@@ -715,6 +754,17 @@ function lowercasefirst(s::AbstractString)
715754
string(c′, SubString(s, nextind(s, 1)))
716755
end
717756

757+
# TODO: improve performance characteristics, room for a ~5x improvement.
758+
function lowercasefirst(s::AnnotatedString)
    # The Bool state is `true` only while the first character is pending; that
    # character is lowercased and every later one passes through untouched.
    return annotated_chartransform(s, true) do c, isfirst
        isfirst ? (lowercase(c), false) : (c, false)
    end
end
767+
718768
############################################################################
719769
# iterators for grapheme segmentation
720770

test/strings/annotated.jl

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -108,6 +108,33 @@ end
108108
@test reverse(str2) == Base.AnnotatedString("esac", [(2:3, :label => "oomph")])
109109
end
110110

111+
@testset "Unicode" begin
    # Annotate each word over its full byte span with its position in the
    # list, then join the annotated words with '-'.
    annotate_and_join(ws) = join(
        [Base.AnnotatedString(w, [(1:ncodeunits(w), :i => i)])
         for (i, w) in enumerate(ws)],
        '-')
    # Two orderings of the same words, several of which change their number of
    # code units under a case change.
    wordlists = (["ᲃase", "cɦɒnɡeȿ", "can", "CHⱯNGE", "Сodeunıts"],
                 ["Сodeunıts", "ᲃase", "cɦɒnɡeȿ", "can", "CHⱯNGE"])
    for words in wordlists
        ann_str = annotate_and_join(words)
        # Whole-string transforms: transforming then annotating must agree
        # with annotating then transforming.
        for transform in (lowercase, uppercase, titlecase)
            expected = annotate_and_join(map(transform, words))
            actual = transform(ann_str)
            @test String(expected) == String(actual)
            @test Base.annotations(expected) == Base.annotations(actual)
        end
        # First-character transforms only affect the first word.
        for transform in (uppercasefirst, lowercasefirst)
            expected = annotate_and_join(vcat(transform(first(words)), words[2:end]))
            actual = transform(ann_str)
            @test String(expected) == String(actual)
            @test Base.annotations(expected) == Base.annotations(actual)
        end
    end
end
137+
111138
@testset "AnnotatedIOBuffer" begin
112139
aio = Base.AnnotatedIOBuffer()
113140
# Append-only writing

0 commit comments

Comments
 (0)