Ensure column names are valid identifiers. Closes #520

JuliaData · Mar 9, 2014 · 5c89da7 · 5c89da7
1 parent ebb12ec
commit 5c89da7
Show file tree

Hide file tree

Showing 12 changed files with 280 additions and 39 deletions.
diff --git a/src/RDA.jl b/src/RDA.jl
@@ -306,5 +306,5 @@ function data(rc::RComplex)
               BitArray(imag(rc.data) .== R_NA_FLOAT64))
 end
 
-DataFrame(rl::RList) = DataFrame(map(x->data(x), rl.data), 
-                                 Symbol[symbol(x) for x in rl.attr["names"].data]) 
+DataFrame(rl::RList) = DataFrame(map(x->data(x), rl.data),
+                                 Symbol[identifier(x) for x in rl.attr["names"].data])
diff --git a/src/dataframe/dataframe.jl b/src/dataframe/dataframe.jl
@@ -391,6 +391,9 @@ function insert_single_column!(df::DataFrame,
         df.columns[j] = dv
     else
         if typeof(col_ind) <: Symbol
+            if !is_valid_identifier(col_ind)
+                error("$col_ind is not a valid identifier.")
+            end
             push!(df.colindex, col_ind)
             push!(df.columns, dv)
         else

diff --git a/src/dataframe/io.jl b/src/dataframe/io.jl
@@ -15,7 +15,6 @@ immutable ParseOptions{S <: ByteString}
     falsestrings::Vector{S}
     makefactors::Bool
     names::Vector{Symbol}
-    cleannames::Bool
     eltypes::Vector{DataType}
     allowcomments::Bool
     commentmark::Char
@@ -642,11 +641,8 @@ function parsenames!(names::Vector{Symbol},
     for j in 1:fields
         left = bounds[j] + 2
         right = bounds[j + 1]
-        if bytes[right] == '\r' || bytes[right] == '\n'
-            names[j] = symbol(bytestring(bytes[left:(right - 1)]))
-        else
-            names[j] = symbol(bytestring(bytes[left:right]))
-        end
+
+        names[j] = identifier(bytestring(bytes[left:right]))
     end
 
     return
@@ -738,11 +734,6 @@ function readtable!(p::ParsedCSV,
     # Parse contents of a buffer into a DataFrame
     df = builddf(rows, cols, bytes, fields, p, o)
 
-    # Clean up column names if requested
-    if o.cleannames
-        cleannames!(df)
-    end
-
     # Return the final DataFrame
     return df
 end
@@ -760,7 +751,7 @@ function readtable(io::IO,
                    nrows::Integer = -1,
                    names::Vector = Symbol[],
                    colnames::Vector = Symbol[],
-                   cleannames::Bool = false,
+                   cleannames::Any = nothing,
                    eltypes::Vector{DataType} = DataType[],
                    coltypes::Vector{DataType} = DataType[],
                    allowcomments::Bool = false,
@@ -785,6 +776,9 @@ function readtable(io::IO,
         end
         eltypes = coltypes
     end
+    if !isa(cleannames, Nothing)
+        warn("Argument 'cleannames' is deprecated (it now happens automatically).")
+    end
 
     if !isempty(eltypes)
         for j in 1:length(eltypes)
@@ -809,7 +803,7 @@ function readtable(io::IO,
     # Set parsing options
     o = ParseOptions(header, separator, quotemark, decimal,
                      nastrings, truestrings, falsestrings,
-                     makefactors, names, cleannames, eltypes,
+                     makefactors, names, eltypes,
                      allowcomments, commentmark, ignorepadding,
                      skipstart, skiprows, skipblanks, encoding,
                      allowescapes)
@@ -836,7 +830,7 @@ function readtable(pathname::String;
                    nrows::Integer = -1,
                    names::Vector = Symbol[],
                    colnames::Vector = Symbol[],
-                   cleannames::Bool = false,
+                   cleannames::Any = nothing,
                    coltypes::Vector{DataType} = DataType[],
                    eltypes::Vector{DataType} = DataType[],
                    allowcomments::Bool = false,
@@ -861,6 +855,9 @@ function readtable(pathname::String;
         end
         eltypes = coltypes
     end
+    if !isa(cleannames, Nothing)
+        warn("Argument 'cleannames' is deprecated (it now happens automatically).")
+    end
 
     # Open an IO stream based on pathname
     # (1) Path is an HTTP or FTP URL
@@ -891,7 +888,6 @@ function readtable(pathname::String;
                      makefactors = makefactors,
                      nrows = nrows,
                      names = names,
-                     cleannames = cleannames,
                      eltypes = eltypes,
                      allowcomments = allowcomments,
                      commentmark = commentmark,

diff --git a/src/other/index.jl b/src/other/index.jl
@@ -11,8 +11,13 @@ type Index <: AbstractIndex   # an OrderedDict would be nice here...
     lookup::Dict{Symbol, Indices}      # name => names array position
     names::Vector{Symbol}
 end
-function Index{T <: Symbol}(x::Vector{T})
-    x = make_unique(convert(Vector{Symbol}, x))
+function Index(x::Vector{Symbol})
+    for n in x
+        if !is_valid_identifier(n)
+            error("Names must be valid identifiers.")
+        end
+    end
+    x = make_unique(x)
     Index(Dict{Symbol, Indices}(tuple(x...), tuple([1:length(x)]...)), x)
 end
 Index() = Index(Dict{Symbol, Indices}(), Symbol[])
@@ -22,10 +27,14 @@ Base.copy(x::Index) = Index(copy(x.lookup), copy(x.names))
 Base.deepcopy(x::Index) = Index(deepcopy(x.lookup), deepcopy(x.names))
 Base.isequal(x::Index, y::Index) = isequal(x.lookup, y.lookup) && isequal(x.names, y.names)
 
-# I think this should be Vector{T <: Symbol}
 function names!(x::Index, nm::Vector{Symbol})
     if length(nm) != length(x)
-        error("lengths don't match.")
+        error("Lengths don't match.")
+    end
+    for n in nm
+        if !is_valid_identifier(n)
+            error("Names must be valid identifiers.")
+        end
     end
     for i in 1:length(nm)
         delete!(x.lookup, x.names[i])
@@ -41,6 +50,9 @@ function rename!(x::Index, nms)
             if haskey(x, to)
                 error("Tried renaming $from to $to, when $to already exists in the Index.")
             end
+            if !is_valid_identifier(to)
+                error("Names must be valid identifiers.")
+            end
             x.lookup[to] = col = pop!(x.lookup, from)
             if !isa(col, Array)
                 x.names[col] = to

diff --git a/src/other/utils.jl b/src/other/utils.jl
@@ -1,25 +1,81 @@
-# slow, but maintains order and seems to work:
-function _setdiff(a::Vector, b::Vector)
-    idx = Int[]
-    for i in 1:length(a)
-        if !(a[i] in b)
-            push!(idx, i)
+const RESERVED_WORDS = ASCIIString["begin", "while", "if", "for", "try",
+    "return", "break", "continue", "function", "macro", "quote", "let",
+    "local", "global", "const", "abstract", "typealias", "type", "bitstype",
+    "immutable", "ccall", "do", "module", "baremodule", "using", "import",
+    "export", "importall", "end", "else", "elseif", "catch", "finally"]
+
+ function is_valid_identifier(sym::Symbol)
+    s = string(sym)
+    s == normalize_string(s) && isidentifier(s)
+ end
+
+function identifier(s::String)
+    s = normalize_string(s)
+    symbol(isidentifier(s) ? s : makeidentifier(s))
+end
+
+function isidentifier(s::String)
+    n = length(s)
+    if n == 0 || in(s, RESERVED_WORDS)
+        return false
+    end
+    i = 1
+    for c in s
+        if i == 1
+            if !(isalpha(c) || c == '_')
+                return false
+            end
+        elseif !(isalpha(c) || isdigit(c) || c == '_' || c == '!')
+            return false
         end
+        i += 1
     end
-    a[idx]
+    return true
 end
 
-function _uniqueofsorted(x::Vector)
-    idx = fill(true, length(x))
-    lastx = x[1]
-    for i = 2:length(x)
-        if lastx == x[i]
-            idx[i] = false
+function makeidentifier(s::String)
+    ind = 0
+    n = endof(s)
+    res = Array(Char, 0)
+
+    while isempty(res)
+        if ind >= n
+            return "x"
+        end
+
+        ind = nextind(s, ind)
+        c = s[ind]
+
+        if isalpha(c)
+            push!(res, c)
+        elseif isdigit(c) || c == '!'
+            push!(res, 'x')
+            push!(res, c)
+        end
+    end
+
+    while ind < n
+        ind = nextind(s, ind)
+        c = s[ind]
+
+        if isalpha(c) || isdigit(c) || c == '!'
+            push!(res, c)
         else
-            lastx = x[i]
+            while ind < n
+                ind = nextind(s, ind)
+                c = s[ind]
+
+                if isalpha(c) || isdigit(c) || c == '!'
+                    push!(res, '_')
+                    push!(res, c)
+                    break
+                end
+            end
         end
     end
-    x[idx]
+
+    cs = UTF32String(res)
+    return in(cs, RESERVED_WORDS) ? "_"*cs : cs
 end
 
 function make_unique(names::Vector{Symbol})
@@ -129,3 +185,27 @@ function countna(da::PooledDataArray)
     end
     return res
 end
+
+# slow, but maintains order and seems to work:
+function _setdiff(a::Vector, b::Vector)
+    idx = Int[]
+    for i in 1:length(a)
+        if !(a[i] in b)
+            push!(idx, i)
+        end
+    end
+    a[idx]
+end
+
+function _uniqueofsorted(x::Vector)
+    idx = fill(true, length(x))
+    lastx = x[1]
+    for i = 2:length(x)
+        if lastx == x[i]
+            idx[i] = false
+        else
+            lastx = x[i]
+        end
+    end
+    x[idx]
+end
diff --git a/test/RDA.jl b/test/RDA.jl
@@ -8,16 +8,19 @@ module TestRDA
         # save(df, file='minimal.rda')
 
         # df['int'] = c(1L, 2L)
-        # df['logi'] = c(T, F)
+        # df['logi'] = c(TRUE, FALSE)
         # df['chr'] = c('ab', 'c')
-        # df['factor'] = factor(df[['chr']])
+        # df['factor'] = factor(df$chr)
         # #utf=c('Ж', '∰')) R handles it, read_rda doesn't.
         # save(df, file='types.rda')
 
         # df[2, ] = NA
         # df['chr'] = NULL  # NA characters breaking read_rda
         # save(df, file='NAs.rda')
 
+        # names(df) = c('end', '!', '1', '%_B*\tC*')
+        # save(df, file='names.rda')
+
     testdir = dirname(@__FILE__)
 
     df = DataFrame(num = [1.1, 2.2])
@@ -32,4 +35,7 @@ module TestRDA
     df[2, :] = NA
     df = df[:, [:num, :int, :logi, :factor]]  # (NA) chr breaks read_rda
     @test isequal(DataFrame(read_rda("$testdir/data/RDA/NAs.rda")["df"]), df)
+
+    rda_names = names(DataFrame(read_rda("$testdir/data/RDA/names.rda")["df"]))
+    @test rda_names == [:_end, :x!, :x1, :B_C]
 end
diff --git a/test/data/RDA/names.rda b/test/data/RDA/names.rda
diff --git a/test/index.jl b/test/index.jl
@@ -46,4 +46,9 @@ end
 push!(i, :C)
 @test delete!(i, 1) == Index([:C])
 
+Index(Symbol["a"])
+@test_throws Index(Symbol["\u212b"])
+@test_throws Index(Symbol["end"])
+@test_throws Index(Symbol["1a"])
+
 end
diff --git a/test/indexing.jl b/test/indexing.jl
@@ -161,4 +161,13 @@ module TestIndexing
             df[row_index, column_index] = DataArray([1 + 0im, 2 + 1im])
         end
     end
+
+    #
+    # Insert single column
+    #
+
+    df[symbol("c")] = 1:2
+    @test_throws df[symbol("\u212b")] = 1:2
+    @test_throws df[symbol("end")] = 1:2
+    @test_throws df[symbol("1a")] = 1:2
 end
diff --git a/test/io.jl b/test/io.jl
@@ -260,4 +260,16 @@ module TestIO
     @test typeof(df[:b]) == DataArray{UTF8String,1}
     @test df[:b][1] == "T"
     @test df[:b][2] == "FALSE"
+
+    # Readtable ignorepadding argument
+    io = IOBuffer("A , \tB  , C\n1 , \t2, 3\n")
+    @test readtable(io, ignorepadding=true) == DataFrame(A=1, B=2, C=3)
+
+    # Readtable name normalization
+    abnormal = "\u212b"
+    ns = [:Å, :B_C, :_end]
+    @test !in(symbol(abnormal), ns)
+
+    io = IOBuffer(abnormal*",%_B*\tC*,end\n1,2,3\n")
+    @test names(readtable(io)) == ns
 end
diff --git a/test/runtests.jl b/test/runtests.jl
@@ -8,7 +8,8 @@ anyerrors = false
 using Base.Test
 using DataFrames
 
-my_tests = ["data.jl",
+my_tests = ["utils.jl",
+            "data.jl",
             "index.jl",
             "dataframe.jl",
             "io.jl",