Skip to content

Commit

Permalink
Ensure column names are valid identifiers. Closes #520
Browse files Browse the repository at this point in the history
  • Loading branch information
garborg committed Mar 9, 2014
1 parent ebb12ec commit 5c89da7
Show file tree
Hide file tree
Showing 12 changed files with 280 additions and 39 deletions.
4 changes: 2 additions & 2 deletions src/RDA.jl
Original file line number Diff line number Diff line change
Expand Up @@ -306,5 +306,5 @@ function data(rc::RComplex)
BitArray(imag(rc.data) .== R_NA_FLOAT64))
end

DataFrame(rl::RList) = DataFrame(map(x->data(x), rl.data),
Symbol[symbol(x) for x in rl.attr["names"].data])
DataFrame(rl::RList) = DataFrame(map(x->data(x), rl.data),
Symbol[identifier(x) for x in rl.attr["names"].data])
3 changes: 3 additions & 0 deletions src/dataframe/dataframe.jl
Original file line number Diff line number Diff line change
Expand Up @@ -391,6 +391,9 @@ function insert_single_column!(df::DataFrame,
df.columns[j] = dv
else
if typeof(col_ind) <: Symbol
if !is_valid_identifier(col_ind)
error("$col_ind is not a valid identifier.")
end
push!(df.colindex, col_ind)
push!(df.columns, dv)
else
Expand Down
26 changes: 11 additions & 15 deletions src/dataframe/io.jl
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,6 @@ immutable ParseOptions{S <: ByteString}
falsestrings::Vector{S}
makefactors::Bool
names::Vector{Symbol}
cleannames::Bool
eltypes::Vector{DataType}
allowcomments::Bool
commentmark::Char
Expand Down Expand Up @@ -642,11 +641,8 @@ function parsenames!(names::Vector{Symbol},
for j in 1:fields
left = bounds[j] + 2
right = bounds[j + 1]
if bytes[right] == '\r' || bytes[right] == '\n'
names[j] = symbol(bytestring(bytes[left:(right - 1)]))
else
names[j] = symbol(bytestring(bytes[left:right]))
end

names[j] = identifier(bytestring(bytes[left:right]))
end

return
Expand Down Expand Up @@ -738,11 +734,6 @@ function readtable!(p::ParsedCSV,
# Parse contents of a buffer into a DataFrame
df = builddf(rows, cols, bytes, fields, p, o)

# Clean up column names if requested
if o.cleannames
cleannames!(df)
end

# Return the final DataFrame
return df
end
Expand All @@ -760,7 +751,7 @@ function readtable(io::IO,
nrows::Integer = -1,
names::Vector = Symbol[],
colnames::Vector = Symbol[],
cleannames::Bool = false,
cleannames::Any = nothing,
eltypes::Vector{DataType} = DataType[],
coltypes::Vector{DataType} = DataType[],
allowcomments::Bool = false,
Expand All @@ -785,6 +776,9 @@ function readtable(io::IO,
end
eltypes = coltypes
end
if !isa(cleannames, Nothing)
warn("Argument 'cleannames' is deprecated (it now happens automatically).")
end

if !isempty(eltypes)
for j in 1:length(eltypes)
Expand All @@ -809,7 +803,7 @@ function readtable(io::IO,
# Set parsing options
o = ParseOptions(header, separator, quotemark, decimal,
nastrings, truestrings, falsestrings,
makefactors, names, cleannames, eltypes,
makefactors, names, eltypes,
allowcomments, commentmark, ignorepadding,
skipstart, skiprows, skipblanks, encoding,
allowescapes)
Expand All @@ -836,7 +830,7 @@ function readtable(pathname::String;
nrows::Integer = -1,
names::Vector = Symbol[],
colnames::Vector = Symbol[],
cleannames::Bool = false,
cleannames::Any = nothing,
coltypes::Vector{DataType} = DataType[],
eltypes::Vector{DataType} = DataType[],
allowcomments::Bool = false,
Expand All @@ -861,6 +855,9 @@ function readtable(pathname::String;
end
eltypes = coltypes
end
if !isa(cleannames, Nothing)
warn("Argument 'cleannames' is deprecated (it now happens automatically).")
end

# Open an IO stream based on pathname
# (1) Path is an HTTP or FTP URL
Expand Down Expand Up @@ -891,7 +888,6 @@ function readtable(pathname::String;
makefactors = makefactors,
nrows = nrows,
names = names,
cleannames = cleannames,
eltypes = eltypes,
allowcomments = allowcomments,
commentmark = commentmark,
Expand Down
20 changes: 16 additions & 4 deletions src/other/index.jl
Original file line number Diff line number Diff line change
Expand Up @@ -11,8 +11,13 @@ type Index <: AbstractIndex # an OrderedDict would be nice here...
lookup::Dict{Symbol, Indices} # name => names array position
names::Vector{Symbol}
end
function Index{T <: Symbol}(x::Vector{T})
x = make_unique(convert(Vector{Symbol}, x))
function Index(x::Vector{Symbol})
for n in x
if !is_valid_identifier(n)
error("Names must be valid identifiers.")
end
end
x = make_unique(x)
Index(Dict{Symbol, Indices}(tuple(x...), tuple([1:length(x)]...)), x)
end
Index() = Index(Dict{Symbol, Indices}(), Symbol[])
Expand All @@ -22,10 +27,14 @@ Base.copy(x::Index) = Index(copy(x.lookup), copy(x.names))
Base.deepcopy(x::Index) = Index(deepcopy(x.lookup), deepcopy(x.names))
Base.isequal(x::Index, y::Index) = isequal(x.lookup, y.lookup) && isequal(x.names, y.names)

# I think this should be Vector{T <: Symbol}
function names!(x::Index, nm::Vector{Symbol})
if length(nm) != length(x)
error("lengths don't match.")
error("Lengths don't match.")
end
for n in nm
if !is_valid_identifier(n)
error("Names must be valid identifiers.")
end
end
for i in 1:length(nm)
delete!(x.lookup, x.names[i])
Expand All @@ -41,6 +50,9 @@ function rename!(x::Index, nms)
if haskey(x, to)
error("Tried renaming $from to $to, when $to already exists in the Index.")
end
if !is_valid_identifier(to)
error("Names must be valid identifiers.")
end
x.lookup[to] = col = pop!(x.lookup, from)
if !isa(col, Array)
x.names[col] = to
Expand Down
110 changes: 95 additions & 15 deletions src/other/utils.jl
Original file line number Diff line number Diff line change
@@ -1,25 +1,81 @@
# slow, but maintains order and seems to work:
function _setdiff(a::Vector, b::Vector)
idx = Int[]
for i in 1:length(a)
if !(a[i] in b)
push!(idx, i)
const RESERVED_WORDS = ASCIIString["begin", "while", "if", "for", "try",
"return", "break", "continue", "function", "macro", "quote", "let",
"local", "global", "const", "abstract", "typealias", "type", "bitstype",
"immutable", "ccall", "do", "module", "baremodule", "using", "import",
"export", "importall", "end", "else", "elseif", "catch", "finally"]

function is_valid_identifier(sym::Symbol)
s = string(sym)
s == normalize_string(s) && isidentifier(s)
end

function identifier(s::String)
s = normalize_string(s)
symbol(isidentifier(s) ? s : makeidentifier(s))
end

function isidentifier(s::String)
n = length(s)
if n == 0 || in(s, RESERVED_WORDS)
return false
end
i = 1
for c in s
if i == 1
if !(isalpha(c) || c == '_')
return false
end
elseif !(isalpha(c) || isdigit(c) || c == '_' || c == '!')
return false
end
i += 1
end
a[idx]
return true
end

function _uniqueofsorted(x::Vector)
idx = fill(true, length(x))
lastx = x[1]
for i = 2:length(x)
if lastx == x[i]
idx[i] = false
function makeidentifier(s::String)
ind = 0
n = endof(s)
res = Array(Char, 0)

while isempty(res)
if ind >= n
return "x"
end

ind = nextind(s, ind)
c = s[ind]

if isalpha(c)
push!(res, c)
elseif isdigit(c) || c == '!'
push!(res, 'x')
push!(res, c)
end
end

while ind < n
ind = nextind(s, ind)
c = s[ind]

if isalpha(c) || isdigit(c) || c == '!'
push!(res, c)
else
lastx = x[i]
while ind < n
ind = nextind(s, ind)
c = s[ind]

if isalpha(c) || isdigit(c) || c == '!'
push!(res, '_')
push!(res, c)
break
end
end
end
end
x[idx]

cs = UTF32String(res)
return in(cs, RESERVED_WORDS) ? "_"*cs : cs
end

function make_unique(names::Vector{Symbol})
Expand Down Expand Up @@ -129,3 +185,27 @@ function countna(da::PooledDataArray)
end
return res
end

# slow, but maintains order and seems to work:
function _setdiff(a::Vector, b::Vector)
idx = Int[]
for i in 1:length(a)
if !(a[i] in b)
push!(idx, i)
end
end
a[idx]
end

function _uniqueofsorted(x::Vector)
idx = fill(true, length(x))
lastx = x[1]
for i = 2:length(x)
if lastx == x[i]
idx[i] = false
else
lastx = x[i]
end
end
x[idx]
end
10 changes: 8 additions & 2 deletions test/RDA.jl
Original file line number Diff line number Diff line change
Expand Up @@ -8,16 +8,19 @@ module TestRDA
# save(df, file='minimal.rda')

# df['int'] = c(1L, 2L)
# df['logi'] = c(T, F)
# df['logi'] = c(TRUE, FALSE)
# df['chr'] = c('ab', 'c')
# df['factor'] = factor(df[['chr']])
# df['factor'] = factor(df$chr)
# #utf=c('Ж', '∰')) R handles it, read_rda doesn't.
# save(df, file='types.rda')

# df[2, ] = NA
# df['chr'] = NULL # NA characters breaking read_rda
# save(df, file='NAs.rda')

# names(df) = c('end', '!', '1', '%_B*\tC*')
# save(df, file='names.rda')

testdir = dirname(@__FILE__)

df = DataFrame(num = [1.1, 2.2])
Expand All @@ -32,4 +35,7 @@ module TestRDA
df[2, :] = NA
df = df[:, [:num, :int, :logi, :factor]] # (NA) chr breaks read_rda
@test isequal(DataFrame(read_rda("$testdir/data/RDA/NAs.rda")["df"]), df)

rda_names = names(DataFrame(read_rda("$testdir/data/RDA/names.rda")["df"]))
@test rda_names == [:_end, :x!, :x1, :B_C]
end
Binary file added test/data/RDA/names.rda
Binary file not shown.
5 changes: 5 additions & 0 deletions test/index.jl
Original file line number Diff line number Diff line change
Expand Up @@ -46,4 +46,9 @@ end
push!(i, :C)
@test delete!(i, 1) == Index([:C])

Index(Symbol["a"])
@test_throws Index(Symbol["\u212b"])
@test_throws Index(Symbol["end"])
@test_throws Index(Symbol["1a"])

end
9 changes: 9 additions & 0 deletions test/indexing.jl
Original file line number Diff line number Diff line change
Expand Up @@ -161,4 +161,13 @@ module TestIndexing
df[row_index, column_index] = DataArray([1 + 0im, 2 + 1im])
end
end

#
# Insert single column
#

df[symbol("c")] = 1:2
@test_throws df[symbol("\u212b")] = 1:2
@test_throws df[symbol("end")] = 1:2
@test_throws df[symbol("1a")] = 1:2
end
12 changes: 12 additions & 0 deletions test/io.jl
Original file line number Diff line number Diff line change
Expand Up @@ -260,4 +260,16 @@ module TestIO
@test typeof(df[:b]) == DataArray{UTF8String,1}
@test df[:b][1] == "T"
@test df[:b][2] == "FALSE"

# Readtable ignorepadding argument
io = IOBuffer("A , \tB , C\n1 , \t2, 3\n")
@test readtable(io, ignorepadding=true) == DataFrame(A=1, B=2, C=3)

# Readtable name normalization
abnormal = "\u212b"
ns = [, :B_C, :_end]
@test !in(symbol(abnormal), ns)

io = IOBuffer(abnormal*",%_B*\tC*,end\n1,2,3\n")
@test names(readtable(io)) == ns
end
3 changes: 2 additions & 1 deletion test/runtests.jl
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,8 @@ anyerrors = false
using Base.Test
using DataFrames

my_tests = ["data.jl",
my_tests = ["utils.jl",
"data.jl",
"index.jl",
"dataframe.jl",
"io.jl",
Expand Down
Loading

0 comments on commit 5c89da7

Please sign in to comment.