From 98f8f785ec68d4a35e0d8b722a224447532aefef Mon Sep 17 00:00:00 2001 From: Jeff Bezanson Date: Sat, 20 Jul 2013 00:07:09 -0400 Subject: [PATCH] change string indexing to avoid invalid indexes in more cases, and eliminate the thisind() function MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit For example, search("∀ x", "∀", 1) used to give 1:3, which doesn't make much sense since the "3" is an artifact of the encoding and isn't a valid index. Now it gives 1:1, or in general a:b where a is the index of the first found character and b is the index of the last found character. --- base/datafmt.jl | 3 +- base/exports.jl | 1 - base/regex.jl | 7 ++-- base/string.jl | 79 +++++++++++++++++++++++---------------------- base/utf8.jl | 15 +++++++-- doc/stdlib/base.rst | 4 --- test/strings.jl | 29 ++++++++--------- 7 files changed, 72 insertions(+), 66 deletions(-) diff --git a/base/datafmt.jl b/base/datafmt.jl index 0742e7975ef3c..c804a1cbe0976 100644 --- a/base/datafmt.jl +++ b/base/datafmt.jl @@ -105,7 +105,8 @@ function dlm_fill{T}(cells::Array{T,2}, offsets::Array{Int,2}, sbuff::String, au for col in 1:maxcol start_pos = dlm_col_begin(maxcol, offsets, row, col) end_pos = offsets[row,col] - sval = SubString(sbuff, start_pos, end_pos) + sval = SubString(sbuff, start_pos, + prevind(sbuff, nextind(sbuff,end_pos))) if T <: Char (length(sval) != 1) && error("file entry \"$(sval)\" is not a Char") diff --git a/base/exports.jl b/base/exports.jl index 0a3dd31773524..0dde4627e4537 100644 --- a/base/exports.jl +++ b/base/exports.jl @@ -778,7 +778,6 @@ export string, strip, strwidth, - thisind, ucfirst, uppercase, utf8, diff --git a/base/regex.jl b/base/regex.jl index 70e98ef85d920..f4a4a1acb6171 100644 --- a/base/regex.jl +++ b/base/regex.jl @@ -107,13 +107,12 @@ end matchall(re::Regex, str::ByteString) = matchall(re, str, false) function search(str::ByteString, re::Regex, idx::Integer) - len = length(str.data) - if idx >= len+2 - return idx == len+2 ? (0:-1) : error(BoundsError) + if idx > nextind(str,endof(str)) + throw(BoundsError()) end opts = re.options & PCRE.EXECUTE_MASK m, n = PCRE.exec(re.regex, C_NULL, str, idx-1, opts, true) - isempty(m) ? (0:-1) : ((m[1]+1):m[2]) + isempty(m) ? (0:-1) : ((m[1]+1):prevind(str,m[2]+1)) end search(s::String, r::Regex, idx::Integer) = error("regex search is only available for bytestrings; use bytestring(s) to convert") diff --git a/base/string.jl b/base/string.jl index 46dc6e9062515..e19a73db2eeaa 100644 --- a/base/string.jl +++ b/base/string.jl @@ -101,29 +101,35 @@ end prevind(s::DirectIndexString, i::Integer) = i-1 prevind(s , i::Integer) = i-1 -thisind(s::DirectIndexString, i::Integer) = i -thisind(s , i::Integer) = i nextind(s::DirectIndexString, i::Integer) = i+1 nextind(s , i::Integer) = i+1 -prevind(s::String, i::Integer) = thisind(s,thisind(s,i)-1) - -function thisind(s::String, i::Integer) - for j = i:-1:1 +function prevind(s::String, i::Integer) + e = endof(s) + if i > e + return e + end + j = i-1 + while j >= 1 if isvalid(s,j) return j end + j -= 1 end return 0 # out of range end function nextind(s::String, i::Integer) - for j = i+1:endof(s) + e = endof(s) + if i < 1 || e == 0 + return 1 + end + for j = i+1:e if isvalid(s,j) return j end end - endof(s)+1 # out of range + next(s,e)[2] # out of range end ind2chr(s::DirectIndexString, i::Integer) = i @@ -163,8 +169,7 @@ typealias Chars Union(Char,AbstractVector{Char},Set{Char}) function search(s::String, c::Chars, i::Integer) if isempty(c) - return 1 <= i <= endof(s)+1 ? i : - i == endof(s)+2 ? 0 : + return 1 <= i <= nextind(s,endof(s)) ? i : error(BoundsError) end @@ -185,8 +190,7 @@ contains(s::String, c::Char) = (search(s,c)!=0) function _search(s, t, i) if isempty(t) - return 1 <= i <= endof(s)+1 ? (i:i-1) : - i == endof(s)+2 ? (0:-1) : + return 1 <= i <= nextind(s,endof(s)) ? (i:i-1) : error(BoundsError) end t1, j2 = next(t,start(t)) @@ -209,7 +213,7 @@ function _search(s, t, i) end end if matched - return i:k-1 + return i:prevind(s,k) end i = ii end @@ -224,8 +228,7 @@ rsearch(s::String, c::Chars) = rsearch(s,c,endof(s)) function _rsearch(s, t, i) if isempty(t) - return 1 <= i <= endof(s)+1 ? (i:i-1) : - i == endof(s)+2 ? (0:-1) : + return 1 <= i <= nextind(s,endof(s)) ? (i:i-1) : error(BoundsError) end t = reverse(t) @@ -252,12 +255,7 @@ function _rsearch(s, t, i) end if matched fst = nextind(s,l-k+1) - # NOTE: there's a subtle difference between - # nexind(s,i) and next(s,i)[2] if s::UTF8String - # since at the end nextind returns endof(s)+1 - # while next returns length(s.data)+1 - lst = next(s,i)[2]-1 - return fst:lst + return fst:i end i = l-ii+1 end @@ -384,11 +382,14 @@ immutable SubString{T<:String} <: String endof::Int function SubString(s::T, i::Int, j::Int) - if i > endof(s) + if i > endof(s) || j e + break end - k1 = k r = search(str,pattern,k) j, k = first(r), last(r) - if n == limit break end + n == limit && break n += 1 end write(out, SubString(str,i)) diff --git a/base/utf8.jl b/base/utf8.jl index be84b1a6b9d6c..e45f2f2fd3291 100644 --- a/base/utf8.jl +++ b/base/utf8.jl @@ -28,7 +28,15 @@ is_utf8_start(byte::Uint8) = ((byte&0xc0)!=0x80) ## required core functionality ## -endof(s::UTF8String) = thisind(s,length(s.data)) +function endof(s::UTF8String) + d = s.data + i = length(d) + i == 0 && return i + while !is_utf8_start(d[i]) + i -= 1 + end + i +end length(s::UTF8String) = int(ccall(:u8_strlen, Csize_t, (Ptr{Uint8},), s.data)) function getindex(s::UTF8String, i::Int) @@ -105,9 +113,10 @@ end function rsearch(s::UTF8String, c::Char, i::Integer) if c < 0x80 return rsearch(s.data, uint8(c), i) end + b = first_utf8_byte(c) while true - i = rsearch(s.data, first_utf8_byte(c), i) - if i==0 || s[i]==c return thisind(s,i) end + i = rsearch(s.data, b, i) + if i==0 || s[i]==c return i end i = prevind(s,i) end end diff --git a/doc/stdlib/base.rst b/doc/stdlib/base.rst index 163a277bfe133..139f35c3af4ca 100644 --- a/doc/stdlib/base.rst +++ b/doc/stdlib/base.rst @@ -813,10 +813,6 @@ Strings Get the previous valid string index before ``i``. Returns ``0`` at the beginning of the string. -.. function:: thisind(str, i) - - Adjust ``i`` downwards until it reaches a valid index for the given string. - .. function:: randstring(len) Create a random ASCII string of length ``len``, consisting of upper- and lower-case letters and the digits 0-9 diff --git a/test/strings.jl b/test/strings.jl index 65fcbf490ebf2..971462a748210 100644 --- a/test/strings.jl +++ b/test/strings.jl @@ -317,15 +317,15 @@ end @test search(u8str, "z") == 0:-1 @test search(u8str, "∄") == 0:-1 -@test search(u8str, "∀") == 1:3 +@test search(u8str, "∀") == 1:1 @test search(u8str, "∀", 4) == 0:-1 -@test search(u8str, "∃") == 13:15 +@test search(u8str, "∃") == 13:13 @test search(u8str, "∃", 16) == 0:-1 @test search(u8str, "x") == 26:26 @test search(u8str, "x", 27) == 43:43 @test search(u8str, "x", 44) == 0:-1 -@test search(u8str, "ε") == 5:6 -@test search(u8str, "ε", 7) == 54:55 +@test search(u8str, "ε") == 5:5 +@test search(u8str, "ε", 7) == 54:54 @test search(u8str, "ε", 56) == 0:-1 # string rsearch with a single-char string @@ -343,19 +343,19 @@ end @test rsearch(u8str, "z") == 0:-1 @test rsearch(u8str, "∄") == 0:-1 -@test rsearch(u8str, "∀") == 1:3 +@test rsearch(u8str, "∀") == 1:1 @test rsearch(u8str, "∀", 0) == 0:-1 #TODO: setting the limit in the middle of a wide char # makes search fail but rsearch succeed. # Should rsearch fail as well? #@test rsearch(u8str, "∀", 2) == 0:-1 # gives 1:3 -@test rsearch(u8str, "∃") == 13:15 +@test rsearch(u8str, "∃") == 13:13 @test rsearch(u8str, "∃", 12) == 0:-1 @test rsearch(u8str, "x") == 43:43 @test rsearch(u8str, "x", 42) == 26:26 @test rsearch(u8str, "x", 25) == 0:-1 -@test rsearch(u8str, "ε") == 54:55 -@test rsearch(u8str, "ε", 53) == 5:6 +@test rsearch(u8str, "ε") == 54:54 +@test rsearch(u8str, "ε", 53) == 5:5 @test rsearch(u8str, "ε", 4) == 0:-1 # string search with a single-char regex @@ -370,25 +370,24 @@ end @test search(astr, r"\n", 15) == 0:-1 @test search(u8str, r"z") == 0:-1 @test search(u8str, r"∄") == 0:-1 -@test search(u8str, r"∀") == 1:3 +@test search(u8str, r"∀") == 1:1 @test search(u8str, r"∀", 4) == 0:-1 @test search(u8str, r"∀") == search(u8str, r"\u2200") @test search(u8str, r"∀", 4) == search(u8str, r"\u2200", 4) -@test search(u8str, r"∃") == 13:15 +@test search(u8str, r"∃") == 13:13 @test search(u8str, r"∃", 16) == 0:-1 @test search(u8str, r"x") == 26:26 @test search(u8str, r"x", 27) == 43:43 @test search(u8str, r"x", 44) == 0:-1 -@test search(u8str, r"ε") == 5:6 -@test search(u8str, r"ε", 7) == 54:55 +@test search(u8str, r"ε") == 5:5 +@test search(u8str, r"ε", 7) == 54:54 @test search(u8str, r"ε", 56) == 0:-1 for i = 1:endof(astr) @test search(astr, r"."s, i) == i:i end for i = 1:endof(u8str) - # TODO: should regex search fast-forward invalid indices? if isvalid(u8str,i) - @test search(u8str, r"."s, i) == i:(next(u8str,i)[2]-1) + @test search(u8str, r"."s, i) == i:i end end @@ -776,7 +775,7 @@ bin_val = hex2bytes("07bf") # sizeof @test sizeof("abc") == 3 @test sizeof("\u2222") == 3 -@test sizeof(SubString("abc\u2222def",4,6)) == 3 +@test sizeof(SubString("abc\u2222def",4,4)) == 3 @test sizeof(RopeString("abc","def")) == 6 # issue #3597