Skip to content

Commit

Permalink
Merge pull request #3770 from JuliaLang/jb/strindexchanges
Browse files Browse the repository at this point in the history
change string indexing to avoid invalid indexes
  • Loading branch information
JeffBezanson committed Aug 9, 2013
2 parents db45d12 + c184149 commit 2200bd3
Show file tree
Hide file tree
Showing 7 changed files with 72 additions and 66 deletions.
3 changes: 2 additions & 1 deletion base/datafmt.jl
Original file line number Diff line number Diff line change
Expand Up @@ -111,7 +111,8 @@ function dlm_fill{T}(cells::Array{T,2}, offsets::Array{Int,2}, sbuff::String, au
for col in 1:maxcol
start_pos = dlm_col_begin(maxcol, offsets, row, col)
end_pos = offsets[row,col]
sval = SubString(sbuff, start_pos, end_pos)
sval = SubString(sbuff, start_pos,
prevind(sbuff, nextind(sbuff,end_pos)))

if T <: Char
(length(sval) != 1) && error("file entry \"$(sval)\" is not a Char")
Expand Down
1 change: 0 additions & 1 deletion base/exports.jl
Original file line number Diff line number Diff line change
Expand Up @@ -814,7 +814,6 @@ export
strip,
strwidth,
summary,
thisind,
ucfirst,
unescape_chars,
unescape_string,
Expand Down
7 changes: 3 additions & 4 deletions base/regex.jl
Original file line number Diff line number Diff line change
Expand Up @@ -107,13 +107,12 @@ end
matchall(re::Regex, str::ByteString) = matchall(re, str, false)

function search(str::ByteString, re::Regex, idx::Integer)
len = length(str.data)
if idx >= len+2
return idx == len+2 ? (0:-1) : error(BoundsError)
if idx > nextind(str,endof(str))
throw(BoundsError())
end
opts = re.options & PCRE.EXECUTE_MASK
m, n = PCRE.exec(re.regex, C_NULL, str, idx-1, opts, true)
isempty(m) ? (0:-1) : ((m[1]+1):m[2])
isempty(m) ? (0:-1) : ((m[1]+1):prevind(str,m[2]+1))
end
search(s::String, r::Regex, idx::Integer) =
error("regex search is only available for bytestrings; use bytestring(s) to convert")
Expand Down
79 changes: 41 additions & 38 deletions base/string.jl
Original file line number Diff line number Diff line change
Expand Up @@ -101,29 +101,35 @@ end

prevind(s::DirectIndexString, i::Integer) = i-1
prevind(s , i::Integer) = i-1
thisind(s::DirectIndexString, i::Integer) = i
thisind(s , i::Integer) = i
nextind(s::DirectIndexString, i::Integer) = i+1
nextind(s , i::Integer) = i+1

prevind(s::String, i::Integer) = thisind(s,thisind(s,i)-1)

function thisind(s::String, i::Integer)
for j = i:-1:1
function prevind(s::String, i::Integer)
e = endof(s)
if i > e
return e
end
j = i-1
while j >= 1
if isvalid(s,j)
return j
end
j -= 1
end
return 0 # out of range
end

function nextind(s::String, i::Integer)
for j = i+1:endof(s)
e = endof(s)
if i < 1 || e == 0
return 1
end
for j = i+1:e
if isvalid(s,j)
return j
end
end
endof(s)+1 # out of range
next(s,e)[2] # out of range
end

ind2chr(s::DirectIndexString, i::Integer) = i
Expand Down Expand Up @@ -163,8 +169,7 @@ typealias Chars Union(Char,AbstractVector{Char},Set{Char})

function search(s::String, c::Chars, i::Integer)
if isempty(c)
return 1 <= i <= endof(s)+1 ? i :
i == endof(s)+2 ? 0 :
return 1 <= i <= nextind(s,endof(s)) ? i :
error(BoundsError)
end

Expand All @@ -185,8 +190,7 @@ contains(s::String, c::Char) = (search(s,c)!=0)

function _search(s, t, i)
if isempty(t)
return 1 <= i <= endof(s)+1 ? (i:i-1) :
i == endof(s)+2 ? (0:-1) :
return 1 <= i <= nextind(s,endof(s)) ? (i:i-1) :
error(BoundsError)
end
t1, j2 = next(t,start(t))
Expand All @@ -209,7 +213,7 @@ function _search(s, t, i)
end
end
if matched
return i:k-1
return i:prevind(s,k)
end
i = ii
end
Expand All @@ -224,8 +228,7 @@ rsearch(s::String, c::Chars) = rsearch(s,c,endof(s))

function _rsearch(s, t, i)
if isempty(t)
return 1 <= i <= endof(s)+1 ? (i:i-1) :
i == endof(s)+2 ? (0:-1) :
return 1 <= i <= nextind(s,endof(s)) ? (i:i-1) :
error(BoundsError)
end
t = reverse(t)
Expand All @@ -252,12 +255,7 @@ function _rsearch(s, t, i)
end
if matched
fst = nextind(s,l-k+1)
# NOTE: there's a subtle difference between
# nexind(s,i) and next(s,i)[2] if s::UTF8String
# since at the end nextind returns endof(s)+1
# while next returns length(s.data)+1
lst = next(s,i)[2]-1
return fst:lst
return fst:i
end
i = l-ii+1
end
Expand Down Expand Up @@ -387,11 +385,14 @@ immutable SubString{T<:String} <: String
endof::Int

function SubString(s::T, i::Int, j::Int)
if i > endof(s)
if i > endof(s) || j<i
return new(s, i, 0)
else
o = thisind(s,i)-1
new(s, o, max(0, thisind(s,j)-o))
if !isvalid(s,i) || !isvalid(s,j)
error("invalid SubString indexes")
end
o = i-1
new(s, o, max(0, j-o))
end
end
end
Expand Down Expand Up @@ -422,8 +423,8 @@ endof(s::SubString) = s.endof
# can this be delegated efficiently somehow?
# that may require additional string interfaces

thisind(s::SubString, i::Integer) = thisind(s.string, i+s.offset)-s.offset
nextind(s::SubString, i::Integer) = nextind(s.string, i+s.offset)-s.offset
prevind(s::SubString, i::Integer) = prevind(s.string, i+s.offset)-s.offset

convert{T<:String}(::Type{SubString{T}}, s::T) = SubString(s, 1, endof(s))

Expand Down Expand Up @@ -492,7 +493,7 @@ sizeof(s::RevString) = sizeof(s.string)

function next(s::RevString, i::Int)
n = endof(s); j = n-i+1
(s.string[j], n-thisind(s.string,j-1)+1)
(s.string[j], n-prevind(s.string,j)+1)
end

reverse(s::String) = RevString(s)
Expand Down Expand Up @@ -1027,17 +1028,17 @@ function split(str::String, splitter, limit::Integer, keep_empty::Bool)
i = start(str)
n = endof(str)
r = search(str,splitter,i)
j, k = first(r), last(r)+1
j, k = first(r), nextind(str,last(r))
while 0 < j <= n && length(strs) != limit-1
if i < k
if keep_empty || i < j
push!(strs, str[i:j-1])
push!(strs, str[i:prevind(str,j)])
end
i = k
end
if k <= j; k = nextind(str,j) end
r = search(str,splitter,k)
j, k = first(r), last(r)+1
j, k = first(r), nextind(str,last(r))
end
if keep_empty || !done(str,i)
push!(strs, str[i:])
Expand Down Expand Up @@ -1080,26 +1081,28 @@ rsplit(s::String, spl) = rsplit(s, spl, 0, true)

function replace(str::ByteString, pattern, repl::Function, limit::Integer)
n = 1
rstr = ""
e = endof(str)
i = a = start(str)
r = search(str,pattern,i)
j, k = first(r), last(r)
k1 = k
out = IOBuffer()
while j != 0
if i == a || i <= k
write(out, SubString(str,i,j-1))
write(out, SubString(str,i,prevind(str,j)))
write(out, string(repl(SubString(str,j,k))))
i = nextind(str, k)
end
if k <= j
k = nextind(str,j)
k == k1 && break
if k<j
i = j
k = nextind(str, j)
else
i = k = nextind(str, k)
end
if j > e
break
end
k1 = k
r = search(str,pattern,k)
j, k = first(r), last(r)
if n == limit break end
n == limit && break
n += 1
end
write(out, SubString(str,i))
Expand Down
15 changes: 12 additions & 3 deletions base/utf8.jl
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,15 @@ is_utf8_start(byte::Uint8) = ((byte&0xc0)!=0x80)

## required core functionality ##

endof(s::UTF8String) = thisind(s,length(s.data))
function endof(s::UTF8String)
d = s.data
i = length(d)
i == 0 && return i
while !is_utf8_start(d[i])
i -= 1
end
i
end
length(s::UTF8String) = int(ccall(:u8_strlen, Csize_t, (Ptr{Uint8},), s.data))

function getindex(s::UTF8String, i::Int)
Expand Down Expand Up @@ -105,9 +113,10 @@ end

function rsearch(s::UTF8String, c::Char, i::Integer)
if c < 0x80 return rsearch(s.data, uint8(c), i) end
b = first_utf8_byte(c)
while true
i = rsearch(s.data, first_utf8_byte(c), i)
if i==0 || s[i]==c return thisind(s,i) end
i = rsearch(s.data, b, i)
if i==0 || s[i]==c return i end
i = prevind(s,i)
end
end
Expand Down
4 changes: 0 additions & 4 deletions doc/stdlib/base.rst
Original file line number Diff line number Diff line change
Expand Up @@ -861,10 +861,6 @@ Strings
Get the previous valid string index before ``i``. Returns ``0`` at
the beginning of the string.

.. function:: thisind(str, i)

Adjust ``i`` downwards until it reaches a valid index for the given string.

.. function:: randstring(len)

Create a random ASCII string of length ``len``, consisting of upper- and lower-case letters and the digits 0-9
Expand Down
29 changes: 14 additions & 15 deletions test/strings.jl
Original file line number Diff line number Diff line change
Expand Up @@ -317,15 +317,15 @@ end

@test search(u8str, "z") == 0:-1
@test search(u8str, "") == 0:-1
@test search(u8str, "") == 1:3
@test search(u8str, "") == 1:1
@test search(u8str, "", 4) == 0:-1
@test search(u8str, "") == 13:15
@test search(u8str, "") == 13:13
@test search(u8str, "", 16) == 0:-1
@test search(u8str, "x") == 26:26
@test search(u8str, "x", 27) == 43:43
@test search(u8str, "x", 44) == 0:-1
@test search(u8str, "ε") == 5:6
@test search(u8str, "ε", 7) == 54:55
@test search(u8str, "ε") == 5:5
@test search(u8str, "ε", 7) == 54:54
@test search(u8str, "ε", 56) == 0:-1

# string rsearch with a single-char string
Expand All @@ -343,19 +343,19 @@ end

@test rsearch(u8str, "z") == 0:-1
@test rsearch(u8str, "") == 0:-1
@test rsearch(u8str, "") == 1:3
@test rsearch(u8str, "") == 1:1
@test rsearch(u8str, "", 0) == 0:-1
#TODO: setting the limit in the middle of a wide char
# makes search fail but rsearch succeed.
# Should rsearch fail as well?
#@test rsearch(u8str, "∀", 2) == 0:-1 # gives 1:3
@test rsearch(u8str, "") == 13:15
@test rsearch(u8str, "") == 13:13
@test rsearch(u8str, "", 12) == 0:-1
@test rsearch(u8str, "x") == 43:43
@test rsearch(u8str, "x", 42) == 26:26
@test rsearch(u8str, "x", 25) == 0:-1
@test rsearch(u8str, "ε") == 54:55
@test rsearch(u8str, "ε", 53) == 5:6
@test rsearch(u8str, "ε") == 54:54
@test rsearch(u8str, "ε", 53) == 5:5
@test rsearch(u8str, "ε", 4) == 0:-1

# string search with a single-char regex
Expand All @@ -370,25 +370,24 @@ end
@test search(astr, r"\n", 15) == 0:-1
@test search(u8str, r"z") == 0:-1
@test search(u8str, r"") == 0:-1
@test search(u8str, r"") == 1:3
@test search(u8str, r"") == 1:1
@test search(u8str, r"", 4) == 0:-1
@test search(u8str, r"") == search(u8str, r"\u2200")
@test search(u8str, r"", 4) == search(u8str, r"\u2200", 4)
@test search(u8str, r"") == 13:15
@test search(u8str, r"") == 13:13
@test search(u8str, r"", 16) == 0:-1
@test search(u8str, r"x") == 26:26
@test search(u8str, r"x", 27) == 43:43
@test search(u8str, r"x", 44) == 0:-1
@test search(u8str, r"ε") == 5:6
@test search(u8str, r"ε", 7) == 54:55
@test search(u8str, r"ε") == 5:5
@test search(u8str, r"ε", 7) == 54:54
@test search(u8str, r"ε", 56) == 0:-1
for i = 1:endof(astr)
@test search(astr, r"."s, i) == i:i
end
for i = 1:endof(u8str)
# TODO: should regex search fast-forward invalid indices?
if isvalid(u8str,i)
@test search(u8str, r"."s, i) == i:(next(u8str,i)[2]-1)
@test search(u8str, r"."s, i) == i:i
end
end

Expand Down Expand Up @@ -778,7 +777,7 @@ bin_val = hex2bytes("07bf")
# sizeof
@test sizeof("abc") == 3
@test sizeof("\u2222") == 3
@test sizeof(SubString("abc\u2222def",4,6)) == 3
@test sizeof(SubString("abc\u2222def",4,4)) == 3
@test sizeof(RopeString("abc","def")) == 6

# issue #3597
Expand Down

0 comments on commit 2200bd3

Please sign in to comment.