Skip to content

bugfixes for rsplit (for unicode and empty strings) #26238

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 4 commits into from
Mar 1, 2018
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
26 changes: 10 additions & 16 deletions base/strings/util.jl
Original file line number Diff line number Diff line change
Expand Up @@ -275,9 +275,9 @@ split(str::T, splitter::Char;
_split(str, equalto(splitter), limit, keep, T <: SubString ? T[] : SubString{T}[])

function _split(str::AbstractString, splitter, limit::Integer, keep_empty::Bool, strs::Array)
i = start(str)
@assert (i = firstindex(str)) == 1
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do we really need to assert this? Why do it here and not elsewhere? I think implementations which get this wrong will realize it quite soon even without assertions given that may things will fail.

Copy link
Member Author

@stevengj stevengj Mar 1, 2018

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm asserting it because the subsequent code implicitly relies on it. Basically, I think of this as a comment with some added value. Assertions are cheap so we shouldn't hesitate to add them, no?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

(i = firstindex(str)) though goes against the recently added documention that use of @assert might not work in the future (#25610)

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Okay, will fix.

Copy link
Member

@StefanKarpinski StefanKarpinski Mar 1, 2018

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Not really since the argument here is to delete it which is exactly what the compiler would do. On the contrary, this is exactly the way that @assert should be used.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@StefanKarpinski, the point is that you can't completely delete this code because it contains an assignment. Hence this is not correct usage of @assert.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ah, yes. That's true. I missed that bit.

n = lastindex(str)
r = coalesce(findfirst(splitter,str), i - 1)
r = coalesce(findfirst(splitter,str), 0)
if r != 0:-1
j, k = first(r), nextind(str,last(r))
while 0 < j <= n && length(strs) != limit-1
Expand Down Expand Up @@ -342,22 +342,16 @@ rsplit(str::T, splitter::Char;
_rsplit(str, equalto(splitter), limit, keep, T <: SubString ? T[] : SubString{T}[])

function _rsplit(str::AbstractString, splitter, limit::Integer, keep_empty::Bool, strs::Array)
i = start(str)
n = lastindex(str)
r = coalesce(findlast(splitter, str), i - 1)
j = first(r)-1
k = last(r)
while((0 <= j < n) && (length(strs) != limit-1))
if i <= k
(keep_empty || (k < n)) && pushfirst!(strs, SubString(str,k+1,n))
n = j
end
(k <= j) && (j = prevind(str,j))
r = coalesce(findprev(splitter,str,j), 0)
j = first(r)-1
k = last(r)
r = coalesce(findlast(splitter, str), 0)
j, k = first(r), last(r)
while j > 0 && k > 0 && length(strs) != limit-1
(keep_empty || k < n) && pushfirst!(strs, SubString(str,nextind(str,k),n))
n = prevind(str, j)
r = coalesce(findprev(splitter,str,n), 0)
j, k = first(r), last(r)
end
(keep_empty || (n > 0)) && pushfirst!(strs, SubString(str,1,n))
(keep_empty || n > 0) && pushfirst!(strs, SubString(str,1,n))
return strs
end
#rsplit(str::AbstractString) = rsplit(str, _default_delims, 0, false)
Expand Down
147 changes: 77 additions & 70 deletions test/strings/util.jl
Original file line number Diff line number Diff line change
Expand Up @@ -75,81 +75,88 @@ end
end

@testset "rsplit/split" begin
@test isequal(split("foo,bar,baz", 'x'), ["foo,bar,baz"])
@test isequal(split("foo,bar,baz", ','), ["foo","bar","baz"])
@test isequal(split("foo,bar,baz", ","), ["foo","bar","baz"])
@test isequal(split("foo,bar,baz", r","), ["foo","bar","baz"])
@test isequal(split("foo,bar,baz", ','; limit=0), ["foo","bar","baz"])
@test isequal(split("foo,bar,baz", ','; limit=1), ["foo,bar,baz"])
@test isequal(split("foo,bar,baz", ','; limit=2), ["foo","bar,baz"])
@test isequal(split("foo,bar,baz", ','; limit=3), ["foo","bar","baz"])
@test isequal(split("foo,bar", "o,b"), ["fo","ar"])

@test isequal(split("", ','), [""])
@test isequal(split(",", ','), ["",""])
@test isequal(split(",,", ','), ["","",""])
@test isequal(split("", ',' ; keep=false), [])
@test isequal(split(",", ',' ; keep=false), [])
@test isequal(split(",,", ','; keep=false), [])

@test isequal(split("a b c"), ["a","b","c"])
@test isequal(split("a b \t c\n"), ["a","b","c"])

@test isequal(rsplit("foo,bar,baz", 'x'), ["foo,bar,baz"])
@test isequal(rsplit("foo,bar,baz", ','), ["foo","bar","baz"])
@test isequal(rsplit("foo,bar,baz", ","), ["foo","bar","baz"])
@test isequal(rsplit("foo,bar,baz", ','; limit=0), ["foo","bar","baz"])
@test isequal(rsplit("foo,bar,baz", ','; limit=1), ["foo,bar,baz"])
@test isequal(rsplit("foo,bar,baz", ','; limit=2), ["foo,bar","baz"])
@test isequal(rsplit("foo,bar,baz", ','; limit=3), ["foo","bar","baz"])
@test isequal(rsplit("foo,bar", "o,b"), ["fo","ar"])

@test isequal(rsplit("", ','), [""])
@test isequal(rsplit(",", ','), ["",""])
@test isequal(rsplit(",,", ','), ["","",""])
@test isequal(rsplit(",,", ','; limit=2), [",",""])
@test isequal(rsplit("", ',' ; keep=false), [])
@test isequal(rsplit(",", ',' ; keep=false), [])
@test isequal(rsplit(",,", ','; keep=false), [])

#@test isequal(rsplit("a b c"), ["a","b","c"])
#@test isequal(rsplit("a b \t c\n"), ["a","b","c"])
@test split("foo,bar,baz", 'x') == ["foo,bar,baz"]
@test split("foo,bar,baz", ',') == ["foo","bar","baz"]
@test split("foo,bar,baz", ",") == ["foo","bar","baz"]
@test split("foo,bar,baz", r",") == ["foo","bar","baz"]
@test split("foo,bar,baz", ','; limit=0) == ["foo","bar","baz"]
@test split("foo,bar,baz", ','; limit=1) == ["foo,bar,baz"]
@test split("foo,bar,baz", ','; limit=2) == ["foo","bar,baz"]
@test split("foo,bar,baz", ','; limit=3) == ["foo","bar","baz"]
@test split("foo,bar", "o,b") == ["fo","ar"]

@test split("", ',') == [""]
@test split(",", ',') == ["",""]
@test split(",,", ',') == ["","",""]
@test split("", ',' ; keep=false) == []
@test split(",", ',' ; keep=false) == []
@test split(",,", ','; keep=false) == []

@test split("a b c") == ["a","b","c"]
@test split("a b \t c\n") == ["a","b","c"]

@test rsplit("foo,bar,baz", 'x') == ["foo,bar,baz"]
@test rsplit("foo,bar,baz", ',') == ["foo","bar","baz"]
@test rsplit("foo,bar,baz", ",") == ["foo","bar","baz"]
@test rsplit("foo,bar,baz", ','; limit=0) == ["foo","bar","baz"]
@test rsplit("foo,bar,baz", ','; limit=1) == ["foo,bar,baz"]
@test rsplit("foo,bar,baz", ','; limit=2) == ["foo,bar","baz"]
@test rsplit("foo,bar,baz", ','; limit=3) == ["foo","bar","baz"]
@test rsplit("foo,bar", "o,b") == ["fo","ar"]

@test rsplit("", ',') == [""]
@test rsplit(",", ',') == ["",""]
@test rsplit(",,", ',') == ["","",""]
@test rsplit(",,", ','; limit=2) == [",",""]
@test rsplit("", ',' ; keep=false) == []
@test rsplit(",", ',' ; keep=false) == []
@test rsplit(",,", ','; keep=false) == []

#@test rsplit("a b c") == ["a","b","c"]
#@test rsplit("a b \t c\n") == ["a","b","c"]

let str = "a.:.ba..:..cba.:.:.dcba.:."
@test isequal(split(str, ".:."), ["a","ba.",".cba",":.dcba",""])
@test isequal(split(str, ".:."; keep=false), ["a","ba.",".cba",":.dcba"])
@test isequal(split(str, ".:."), ["a","ba.",".cba",":.dcba",""])
@test isequal(split(str, r"\.(:\.)+"), ["a","ba.",".cba","dcba",""])
@test isequal(split(str, r"\.(:\.)+"; keep=false), ["a","ba.",".cba","dcba"])
@test isequal(split(str, r"\.+:\.+"), ["a","ba","cba",":.dcba",""])
@test isequal(split(str, r"\.+:\.+"; keep=false), ["a","ba","cba",":.dcba"])

@test isequal(rsplit(str, ".:."), ["a","ba.",".cba.:","dcba",""])
@test isequal(rsplit(str, ".:."; keep=false), ["a","ba.",".cba.:","dcba"])
@test isequal(rsplit(str, ".:."; limit=2), ["a.:.ba..:..cba.:.:.dcba", ""])
@test isequal(rsplit(str, ".:."; limit=3), ["a.:.ba..:..cba.:", "dcba", ""])
@test isequal(rsplit(str, ".:."; limit=4), ["a.:.ba.", ".cba.:", "dcba", ""])
@test isequal(rsplit(str, ".:."; limit=5), ["a", "ba.", ".cba.:", "dcba", ""])
@test isequal(rsplit(str, ".:."; limit=6), ["a", "ba.", ".cba.:", "dcba", ""])
@test split(str, ".:.") == ["a","ba.",".cba",":.dcba",""]
@test split(str, ".:."; keep=false) == ["a","ba.",".cba",":.dcba"]
@test split(str, ".:.") == ["a","ba.",".cba",":.dcba",""]
@test split(str, r"\.(:\.)+") == ["a","ba.",".cba","dcba",""]
@test split(str, r"\.(:\.)+"; keep=false) == ["a","ba.",".cba","dcba"]
@test split(str, r"\.+:\.+") == ["a","ba","cba",":.dcba",""]
@test split(str, r"\.+:\.+"; keep=false) == ["a","ba","cba",":.dcba"]

@test rsplit(str, ".:.") == ["a","ba.",".cba.:","dcba",""]
@test rsplit(str, ".:."; keep=false) == ["a","ba.",".cba.:","dcba"]
@test rsplit(str, ".:."; limit=2) == ["a.:.ba..:..cba.:.:.dcba", ""]
@test rsplit(str, ".:."; limit=3) == ["a.:.ba..:..cba.:", "dcba", ""]
@test rsplit(str, ".:."; limit=4) == ["a.:.ba.", ".cba.:", "dcba", ""]
@test rsplit(str, ".:."; limit=5) == ["a", "ba.", ".cba.:", "dcba", ""]
@test rsplit(str, ".:."; limit=6) == ["a", "ba.", ".cba.:", "dcba", ""]
end

# zero-width splits
@test isequal(rsplit("", ""), [""])

@test isequal(split("", ""), [""])
@test isequal(split("", r""), [""])
@test isequal(split("abc", ""), ["a","b","c"])
@test isequal(split("abc", r""), ["a","b","c"])
@test isequal(split("abcd", r"b?"), ["a","c","d"])
@test isequal(split("abcd", r"b*"), ["a","c","d"])
@test isequal(split("abcd", r"b+"), ["a","cd"])
@test isequal(split("abcd", r"b?c?"), ["a","d"])
@test isequal(split("abcd", r"[bc]?"), ["a","","d"])
@test isequal(split("abcd", r"a*"), ["","b","c","d"])
@test isequal(split("abcd", r"a+"), ["","bcd"])
@test isequal(split("abcd", r"d*"), ["a","b","c",""])
@test isequal(split("abcd", r"d+"), ["abc",""])
@test isequal(split("abcd", r"[ad]?"), ["","b","c",""])
@test split("", "") == rsplit("", "") == [""]
@test split("abc", "") == rsplit("abc", "") == ["a","b","c"]
@test rsplit("abc", "", limit=2) == ["ab","c"]
@test split("abc", "", limit=2) == ["a","bc"]

@test split("", r"") == [""]
@test split("abc", r"") == ["a","b","c"]
@test split("abcd", r"b?") == ["a","c","d"]
@test split("abcd", r"b*") == ["a","c","d"]
@test split("abcd", r"b+") == ["a","cd"]
@test split("abcd", r"b?c?") == ["a","d"]
@test split("abcd", r"[bc]?") == ["a","","d"]
@test split("abcd", r"a*") == ["","b","c","d"]
@test split("abcd", r"a+") == ["","bcd"]
@test split("abcd", r"d*") == ["a","b","c",""]
@test split("abcd", r"d+") == ["abc",""]
@test split("abcd", r"[ad]?") == ["","b","c",""]

# multi-byte unicode characters (issue #26225)
@test split("α β γ", " ") == rsplit("α β γ", " ") ==
split("α β γ", isspace) == rsplit("α β γ", isspace) == ["α","β","γ"]
@test split("ö.", ".") == rsplit("ö.", ".") == ["ö",""]
@test split("α β γ", "β") == rsplit("α β γ", "β") == ["α "," γ"]
end

@testset "replace" begin
Expand Down