diff --git a/docs/src/lib/types.md b/docs/src/lib/types.md index 259814ca52..da9ff04675 100644 --- a/docs/src/lib/types.md +++ b/docs/src/lib/types.md @@ -49,6 +49,9 @@ The `RepeatedVector` and `StackedVector` types are subtypes of `AbstractVector` with the exception that they are read only. Note that they are not exported and should not be constructed directly, but they are columns of a `DataFrame` returned by `stack` with `view=true`. +The `ByRow` type is a special type used for selection operations to signal that the wrapped function should be applied +to each element (row) of the selection. + ## [The design of handling of columns of a `DataFrame`](@id man-columnhandling) When a `DataFrame` is constructed columns are copied by default. You can disable @@ -103,6 +106,7 @@ without caution because: ```@docs AbstractDataFrame +ByRow DataFrame DataFrameRow GroupedDataFrame diff --git a/docs/src/man/getting_started.md b/docs/src/man/getting_started.md index c37d1ef694..5a2c086081 100644 --- a/docs/src/man/getting_started.md +++ b/docs/src/man/getting_started.md @@ -522,34 +522,64 @@ julia> df[in.(df.A, Ref([1, 5, 601])), :] │ 3 │ 601 │ 7 │ 301 │ ``` -Equivalently, the `in` function can be called with a single argument to create a function object that tests whether each value belongs to the subset (partial application of `in`): `df[in([1, 5, 601]).(df.A), :]`. +Equivalently, the `in` function can be called with a single argument to create +a function object that tests whether each value belongs to the subset +(partial application of `in`): `df[in([1, 5, 601]).(df.A), :]`. #### Column selection using `select` and `select!` -You can also use the [`select`](@ref) and [`select!`](@ref) functions to select columns in a data frame. +You can also use the [`select`](@ref) and [`select!`](@ref) functions to select, +rename and transform columns in a data frame. 
The `select` function creates a new data frame: ```jldoctest dataframe -julia> df = DataFrame(x1=1, x2=2, y=3) -1×3 DataFrame +julia> df = DataFrame(x1=[1, 2], x2=[3, 4], y=[5, 6]) +2×3 DataFrame │ Row │ x1 │ x2 │ y │ │ │ Int64 │ Int64 │ Int64 │ ├─────┼───────┼───────┼───────┤ -│ 1 │ 1 │ 2 │ 3 │ +│ 1 │ 1 │ 3 │ 5 │ +│ 2 │ 2 │ 4 │ 6 │ julia> select(df, Not(:x1)) # drop column :x1 in a new data frame -1×2 DataFrame +2×2 DataFrame │ Row │ x2 │ y │ │ │ Int64 │ Int64 │ ├─────┼───────┼───────┤ -│ 1 │ 2 │ 3 │ +│ 1 │ 3 │ 5 │ +│ 2 │ 4 │ 6 │ julia> select(df, r"x") # select columns containing 'x' character -1×2 DataFrame +2×2 DataFrame │ Row │ x1 │ x2 │ │ │ Int64 │ Int64 │ ├─────┼───────┼───────┤ -│ 1 │ 1 │ 2 │ +│ 1 │ 1 │ 3 │ +│ 2 │ 2 │ 4 │ + +julia> select(df, :x1 => :a1, :x2 => :a2) # rename columns +2×2 DataFrame +│ Row │ a1 │ a2 │ +│ │ Int64 │ Int64 │ +├─────┼───────┼───────┤ +│ 1 │ 1 │ 3 │ +│ 2 │ 2 │ 4 │ + +julia> select(df, :x1, :x2 => (x -> x .- minimum(x)) => :x2) # transform columns +2×2 DataFrame +│ Row │ x1 │ x2 │ +│ │ Int64 │ Int64 │ +├─────┼───────┼───────┤ +│ 1 │ 1 │ 0 │ +│ 2 │ 2 │ 1 │ + +julia> select(df, :x2, :x2 => ByRow(sqrt)) # transform columns by row +2×2 DataFrame +│ Row │ x2 │ x2_sqrt │ +│ │ Int64 │ Float64 │ +├─────┼───────┼─────────┤ +│ 1 │ 3 │ 1.73205 │ +│ 2 │ 4 │ 2.0 │ ``` It is important to note that `select` always returns a data frame, diff --git a/src/DataFrames.jl b/src/DataFrames.jl index 678a396800..c4ac96df1f 100644 --- a/src/DataFrames.jl +++ b/src/DataFrames.jl @@ -17,6 +17,7 @@ import DataAPI, export AbstractDataFrame, All, Between, + ByRow, DataFrame, DataFrame!, DataFrameRow, @@ -83,6 +84,7 @@ include("dataframerow/utils.jl") include("other/broadcasting.jl") +include("abstractdataframe/selection.jl") include("abstractdataframe/iteration.jl") include("abstractdataframe/join.jl") include("abstractdataframe/reshape.jl") diff --git a/src/abstractdataframe/selection.jl b/src/abstractdataframe/selection.jl new file mode 100644 index 
0000000000..5d91e6031e --- /dev/null +++ b/src/abstractdataframe/selection.jl @@ -0,0 +1,560 @@ +# TODO: +# * add transform and transform! functions +# * add NT (or better name) to column selector passing NamedTuple +# (also in other places: filter, combine) +# * add select/select!/transform/transform! for GroupedDataFrame + +# normalize_selection function makes sure that whatever input format of idx is it +# will end up in one of seven canonical forms +# 1) Int +# 2) AbstractVector{Int} +# 3) Pair{Int, Pair{ColRename, Symbol}} +# 4) Pair{Int, <:Pair{<:Base.Callable, Symbol}} +# 5) Pair{AbstractVector{Int}, <:Pair{<:Base.Callable, Symbol}} +# 6) Pair{Int, Pair{ByRow, Symbol}} +# 7) Pair{AbstractVector{Int}, Pair{ByRow, Symbol}} + +""" + ColRename + +A singleton type indicating that a column renaming operation was requested in `select`. +""" +struct ColRename end + +""" + ByRow + +A type used for selection operations to signal that the wrapped function should +be applied to each element (row) of the selection. +""" +struct ByRow{T} + fun::T +end + +(f::ByRow)(cols::AbstractVector...) = f.fun.(cols...) 
+ +# add a method to funname defined in other/utils.jl +funname(row::ByRow) = funname(row.fun) + +normalize_selection(idx::AbstractIndex, sel) = + try + idx[sel] + catch e + if e isa MethodError && e.f === getindex && e.args === (idx, sel) + throw(ArgumentError("Unrecognized column selector: $sel")) + else + rethrow(e) + end + end + +function normalize_selection(idx::AbstractIndex, sel::ColumnIndex) + c = idx[sel] + return c => ColRename() => _names(idx)[c] +end + +function normalize_selection(idx::AbstractIndex, sel::Pair{<:ColumnIndex, Symbol}) + c = idx[first(sel)] + return c => ColRename() => last(sel) +end + +function normalize_selection(idx::AbstractIndex, + sel::Pair{<:Any,<:Pair{<:Union{Base.Callable, ByRow}, Symbol}}) + rawc = first(sel) + if rawc isa AbstractVector{Int} + c = rawc + elseif rawc isa AbstractVector{Symbol} + c = [idx[n] for n in rawc] + else + c = try + idx[rawc] + catch e + if e isa MethodError && e.f === getindex && e.args === (idx, rawc) + throw(ArgumentError("Unrecognized column selector: $rawc")) + else + rethrow(e) + end + end + end + if length(c) == 0 && first(last(sel)) isa ByRow + throw(ArgumentError("at least one column must be passed to a " * + "`ByRow` transformation function")) + end + return c => last(sel) +end + +function normalize_selection(idx::AbstractIndex, + sel::Pair{<:ColumnIndex,<:Union{Base.Callable, ByRow}}) + c = idx[first(sel)] + fun = last(sel) + newcol = Symbol(_names(idx)[c], "_", funname(fun)) + return c => fun => newcol +end + +function normalize_selection(idx::AbstractIndex, + sel::Pair{<:Any, <:Union{Base.Callable,ByRow}}) + rawc = first(sel) + if rawc isa AbstractVector{Int} + c = rawc + elseif rawc isa AbstractVector{Symbol} + c = [idx[n] for n in rawc] + else + c = try + idx[rawc] + catch e + if e isa MethodError && e.f === getindex && e.args === (idx, rawc) + throw(ArgumentError("Unrecognized column selector: $rawc")) + else + rethrow(e) + end + end + end + if length(c) == 0 && last(sel) isa ByRow + 
throw(ArgumentError("at least one column must be passed to a " * + "`ByRow` transformation function")) + end + fun = last(sel) + if length(c) > 3 + newcol = Symbol(join(@views(_names(idx)[c[1:2]]), '_'), "_etc_", funname(fun)) + elseif isempty(c) + newcol = Symbol(funname(fun)) + else + newcol = Symbol(join(view(_names(idx), c), '_'), '_', funname(fun)) + end + return c => fun => newcol +end + +function select_transform!(nc::Pair{Int, Pair{ColRename, Symbol}}, + df::AbstractDataFrame, newdf::DataFrame, + transformed_cols::Dict{Symbol, Any}, copycols::Bool) + col_idx, (_, newname) = nc + # it is allowed to request column transformation only once + @assert !hasproperty(newdf, newname) + newdf[!, newname] = copycols ? df[:, col_idx] : df[!, col_idx] + # mark that column transformation was applied + # nothing is not possible otherwise as a value in this dict + transformed_cols[newname] = nothing +end + +function select_transform!(nc::Pair{<:Union{Int, AbstractVector{Int}}, + <:Pair{<:Union{Base.Callable, ByRow}, Symbol}}, + df::AbstractDataFrame, newdf::DataFrame, + transformed_cols::Dict{Symbol, Any}, copycols::Bool) + col_idx, (fun, newname) = nc + @assert !hasproperty(newdf, newname) + cdf = eachcol(df) + if col_idx isa Int + res = fun(df[!, col_idx]) + else + res = fun((cdf[i] for i in col_idx)...) + end + if res isa Union{AbstractDataFrame, NamedTuple, DataFrameRow, AbstractMatrix} + throw(ArgumentError("return value from function $fun " * + "of type $(typeof(res)) is currently not allowed.")) + end + if res isa AbstractVector + respar = parent(res) + if copycols && !(fun isa ByRow) && + (res isa SubArray || any(i -> respar === parent(cdf[i]), col_idx)) + newdf[!, newname] = copy(res) + else + newdf[!, newname] = res + end + else + newdf[!, newname] = [res] + end + transformed_cols[newname] = nothing +end + +""" + select!(df::DataFrame, inds...) + +Mutate `df` in place to retain only columns specified by `inds...` and return it. 
+ +Arguments passed as `inds...` can be any index that is allowed for column indexing. +In particular, regular expressions, `All`, `Between`, and `Not` selectors are supported. + +Columns can be renamed using the `old_column => new_column_name` syntax, +and transformed using the `old_column => fun => new_column_name` syntax. +`new_column_name` must be a `Symbol`, and `fun` a function or a type. If `old_column` +is a `Symbol` or an integer then `fun` is applied to the corresponding column vector. +Otherwise `old_column` can be any column indexing syntax, in which case `fun` +will be passed the column vectors specified by `old_column` as separate arguments. +If `fun` returns a value of type other than `AbstractVector` then it will be wrapped +into a 1-element vector, unless its type is one of `AbstractDataFrame`, `NamedTuple`, +`DataFrameRow`, `AbstractMatrix`, in which case an error is thrown as currently these +return types are not allowed. + +To apply `fun` to each row instead of whole columns, it can be wrapped in a `ByRow` +struct. In this case if `old_column` is a `Symbol` or an integer then `fun` is applied +to each element (row) of `old_column` using broadcasting. Otherwise `old_column` can be +any column indexing syntax, in which case `fun` will be passed one argument for each of +the columns specified by `old_column`. If `ByRow` is used it is not allowed for +`old_column` to select an empty set of columns. + +Column transformation can also be specified using the short `old_column => fun` form. +In this case, `new_column_name` is automatically generated as `\$(old_column)_\$(fun)`. +Up to three column names are used for multiple input columns and they are joined +using `_`; if more than three columns are passed then the name consists of the +first two names followed by the `etc` suffix, e.g. `[:a,:b,:c,:d] => fun` produces +the new column name `:a_b_etc_fun`. 
+ +If a collection of column names is passed to `select!` then requesting duplicate column +names in target data frame are accepted (e.g. `select!(df, [:a], :, r"a")` is allowed) +and only the first occurrence is used. In particular a syntax to move column `:col` +to the first position in the data frame is `select!(df, :col, :)`. +On the contrary, output column names of renaming, transformation and single column +selection operations must be unique, so e.g. `select!(df, :a, :a => :a)` or +`select!(df, :a, :a => ByRow(sin) => :a)` are not allowed. + +Note that including the same column several times in the data frame via renaming +or transformations that return the same object without copying will create column aliases. +An example of such a situation is `select!(df, :a, :a => :b, :a => identity => :c)`. + +# Examples +```jldoctest +julia> df = DataFrame(a=1:3, b=4:6) +3×2 DataFrame +│ Row │ a │ b │ +│ │ Int64 │ Int64 │ +├─────┼───────┼───────┤ +│ 1 │ 1 │ 4 │ +│ 2 │ 2 │ 5 │ +│ 3 │ 3 │ 6 │ + +julia> select!(df, 2) +3×1 DataFrame +│ Row │ b │ +│ │ Int64 │ +├─────┼───────┤ +│ 1 │ 4 │ +│ 2 │ 5 │ +│ 3 │ 6 │ + +julia> df = DataFrame(a=1:3, b=4:6) +3×2 DataFrame +│ Row │ a │ b │ +│ │ Int64 │ Int64 │ +├─────┼───────┼───────┤ +│ 1 │ 1 │ 4 │ +│ 2 │ 2 │ 5 │ +│ 3 │ 3 │ 6 │ + +julia> select!(df, :a => ByRow(sin) => :c, :b) +3×2 DataFrame +│ Row │ c │ b │ +│ │ Float64 │ Int64 │ +├─────┼──────────┼───────┤ +│ 1 │ 0.841471 │ 4 │ +│ 2 │ 0.909297 │ 5 │ +│ 3 │ 0.14112 │ 6 │ + +julia> select!(df, :, [:c, :b] => (c,b) -> c .+ b .- sum(b)/length(b)) +3×3 DataFrame +│ Row │ c │ b │ c_b_function │ +│ │ Float64 │ Int64 │ Float64 │ +├─────┼──────────┼───────┼──────────────┤ +│ 1 │ 0.841471 │ 4 │ -0.158529 │ +│ 2 │ 0.909297 │ 5 │ 0.909297 │ +│ 3 │ 0.14112 │ 6 │ 1.14112 │ +``` + +""" +function select!(df::DataFrame, inds::AbstractVector{Int}) + if isempty(inds) + empty!(_columns(df)) + empty!(index(df)) + return df + end + indmin, indmax = extrema(inds) + if indmin < 1 + 
throw(ArgumentError("indices must be positive")) + end + if indmax > ncol(df) + throw(ArgumentError("indices must not be greater than number of columns")) + end + if !allunique(inds) + throw(ArgumentError("indices must not contain duplicates")) + end + copy!(_columns(df), _columns(df)[inds]) + x = index(df) + copy!(_names(x), _names(df)[inds]) + empty!(x.lookup) + for (i, n) in enumerate(x.names) + x.lookup[n] = i + end + return df +end + +select!(df::DataFrame, c::Int) = select!(df, [c]) +select!(df::DataFrame, c::Union{AbstractVector{<:Integer}, AbstractVector{Symbol}, + Colon, All, Not, Between, Regex}) = + select!(df, index(df)[c]) + +function select!(df::DataFrame, cs...) + newdf = select(df, cs..., copycols=false) + copy!(_columns(df), _columns(newdf)) + x = index(df) + copy!(_names(x), _names(newdf)) + empty!(x.lookup) + for (i, n) in enumerate(x.names) + x.lookup[n] = i + end + return df +end + +""" + select(df::AbstractDataFrame, inds...; copycols::Bool=true) + +Create a new data frame that contains columns from `df` specified by `inds` and return it. + +Arguments passed as `inds...` can be any index that is allowed for column indexing. +In particular, regular expressions, `All`, `Between`, and `Not` selectors are supported. + +Also if `df` is a `DataFrame` or `copycols=true` then column renaming and transformations +are supported. + +Columns can be renamed using the `old_column => new_column_name` syntax, +and transformed using the `old_column => fun => new_column_name` syntax. +`new_column_name` must be a `Symbol`, and `fun` a function or a type. If `old_column` +is a `Symbol` or an integer then `fun` is applied to the corresponding column vector. +Otherwise `old_column` can be any column indexing syntax, in which case `fun` +will be passed the column vectors specified by `old_column` as separate arguments. 
+If `fun` returns a value of type other than `AbstractVector` then it will be wrapped +into a 1-element vector, unless its type is one of `AbstractDataFrame`, `NamedTuple`, +`DataFrameRow`, `AbstractMatrix`, in which case an error is thrown as currently these +return types are not allowed. + +To apply `fun` to each row instead of whole columns, it can be wrapped in a `ByRow` +struct. In this case if `old_column` is a `Symbol` or an integer then `fun` is applied +to each element (row) of `old_column` using broadcasting. Otherwise `old_column` can be +any column indexing syntax, in which case `fun` will be passed one argument for each of +the columns specified by `old_column`. If `ByRow` is used it is not allowed for +`old_column` to select an empty set of columns. + +Column transformation can also be specified using the short `old_column => fun` form. +In this case, `new_column_name` is automatically generated as `\$(old_column)_\$(fun)`. +Up to three column names are used for multiple input columns and they are joined +using `_`; if more than three columns are passed then the name consists of the +first two names followed by the `etc` suffix, e.g. `[:a,:b,:c,:d] => fun` produces +the new column name `:a_b_etc_fun`. + +If a collection of column names is passed to `select` then requesting duplicate column +names in the target data frame is accepted (e.g. `select(df, [:a], :, r"a")` is allowed) +and only the first occurrence is used. In particular a syntax to move column `:col` +to the first position in the data frame is `select(df, :col, :)`. +On the contrary, output column names of renaming, transformation and single column +selection operations must be unique, so e.g. `select(df, :a, :a => :a)` or +`select(df, :a, :a => ByRow(sin) => :a)` are not allowed. + +If `df` is a `DataFrame` a new `DataFrame` is returned. +If `copycols=false`, then the returned `DataFrame` shares column vectors with `df` where possible. 
+If `copycols=true` (the default), then the returned `DataFrame` will not share columns with `df`. +The only exception to this rule is the `old_column => fun => new_column` transformation +when `fun` returns a vector that is not allocated by `fun` but is neither a `SubArray` nor one +of the input vectors. +In such a case a new `DataFrame` might contain aliases. Such a situation can only happen +with transformations which return vectors other than their inputs, e.g. with +`select(df, :a => (x -> c) => :c1, :b => (x -> c) => :c2)` when `c` is a vector object +or with `select(df, :a => (x -> df.c) => :c2)`. + +If `df` is a `SubDataFrame` and `copycols=true` then a `DataFrame` is returned and +the same copying rules apply as for a `DataFrame` input: +this means in particular that selected columns will be copied. +If `copycols=false`, a `SubDataFrame` is returned without copying columns. + +Note that including the same column several times in the data frame via renaming or +transformations that return the same object when `copycols=false` will create column +aliases. An example of such a situation is +`select(df, :a, :a => :b, :a => identity => :c, copycols=false)`. 
+ +# Examples +```jldoctest +julia> df = DataFrame(a=1:3, b=4:6) +3×2 DataFrame +│ Row │ a │ b │ +│ │ Int64 │ Int64 │ +├─────┼───────┼───────┤ +│ 1 │ 1 │ 4 │ +│ 2 │ 2 │ 5 │ +│ 3 │ 3 │ 6 │ + +julia> select(df, :b) +3×1 DataFrame +│ Row │ b │ +│ │ Int64 │ +├─────┼───────┤ +│ 1 │ 4 │ +│ 2 │ 5 │ +│ 3 │ 6 │ + +julia> select(df, Not(:b)) # drop column :b from df +3×1 DataFrame +│ Row │ a │ +│ │ Int64 │ +├─────┼───────┤ +│ 1 │ 1 │ +│ 2 │ 2 │ +│ 3 │ 3 │ + +julia> select(df, :a => :c, :b) +3×2 DataFrame +│ Row │ c │ b │ +│ │ Int64 │ Int64 │ +├─────┼───────┼───────┤ +│ 1 │ 1 │ 4 │ +│ 2 │ 2 │ 5 │ +│ 3 │ 3 │ 6 │ + +julia> select(df, :a => ByRow(sin) => :c, :b) +3×2 DataFrame +│ Row │ c │ b │ +│ │ Float64 │ Int64 │ +├─────┼──────────┼───────┤ +│ 1 │ 0.841471 │ 4 │ +│ 2 │ 0.909297 │ 5 │ +│ 3 │ 0.14112 │ 6 │ + +julia> select(df, :, [:a, :b] => (a,b) -> a .+ b .- sum(b)/length(b)) +3×3 DataFrame +│ Row │ a │ b │ a_b_function │ +│ │ Int64 │ Int64 │ Float64 │ +├─────┼───────┼───────┼──────────────┤ +│ 1 │ 1 │ 4 │ 0.0 │ +│ 2 │ 2 │ 5 │ 2.0 │ +│ 3 │ 3 │ 6 │ 4.0 │ +``` + +""" +select(df::DataFrame, inds::AbstractVector{Int}; copycols::Bool=true) = + DataFrame(_columns(df)[inds], Index(_names(df)[inds]), + copycols=copycols) +select(df::DataFrame, c::Union{AbstractVector{<:Integer}, AbstractVector{Symbol}, + Colon, All, Not, Between, Regex}; copycols::Bool=true) = + select(df, index(df)[c], copycols=copycols) +select(df::DataFrame, c::ColumnIndex; copycols::Bool=true) = + select(df, [c], copycols=copycols) + +select(df::DataFrame, cs...; copycols::Bool=true) = + _select(df, [normalize_selection(index(df), c) for c in cs], copycols) + +function _select(df::AbstractDataFrame, normalized_cs, copycols::Bool) + @assert !(df isa SubDataFrame && copycols==false) + newdf = DataFrame() + # the role of transformed_cols is the following + # * make sure that we do not use the same target column name twice in transformations; + # note though that it can appear in no-transformation selection like + # 
`select(df, :, :a => ByRow(sin) => :a)`, where :a is produced both by `:` + # and by `:a => ByRow(sin) => :a` + # * make sure that if some column is produced by transformation like + # `:a => ByRow(sin) => :a` and it appears earlier or later in non-transforming + # selection like `:` or `[:a]` then the transformation is computed and inserted + # into the target data frame once and only once the first time the target column + # is requested to be produced. + # + # For example in: + # + # julia> df = DataFrame(a=1:2, b=3:4) + # 2×2 DataFrame + # │ Row │ a │ b │ + # │ │ Int64 │ Int64 │ + # ├─────┼───────┼───────┤ + # │ 1 │ 1 │ 3 │ + # │ 2 │ 2 │ 4 │ + # + # julia> select(df, :, :a => ByRow(sin) => :a) + # 2×2 DataFrame + # │ Row │ a │ b │ + # │ │ Float64 │ Int64 │ + # ├─────┼──────────┼───────┤ + # │ 1 │ 0.841471 │ 3 │ + # │ 2 │ 0.909297 │ 4 │ + # + # we compute column :a immediately when we process `:` although it is specified + # later by `:a => ByRow(sin) => :a` because we know from `transformed_cols` variable that + # it will be computed later via a transformation + transformed_cols = Dict{Symbol, Any}() + for nc in normalized_cs + if nc isa Pair + newname = last(last(nc)) + @assert newname isa Symbol + if haskey(transformed_cols, newname) + throw(ArgumentError("duplicate target column name $newname passed")) + end + transformed_cols[newname] = nc + end + end + for nc in normalized_cs + if nc isa AbstractVector{Int} + allunique(nc) || throw(ArgumentError("duplicate column names selected")) + for i in nc + newname = _names(df)[i] + # as nc is a multiple column selection without transformations + # we allow duplicate column names with selections applied earlier + # and ignore them for convenience, to allow for e.g. 
select(df, :x1, :) + if !hasproperty(newdf, newname) + if haskey(transformed_cols, newname) + # if newdf does not have a column newname + # but a column transformation was requested for this column + # then apply the transformation immediately + # in such a case nct may not be nothing, as if it were + # nothing then newname should be present in newdf already + nct = transformed_cols[newname] + @assert nct !== nothing + select_transform!(nct, df, newdf, transformed_cols, copycols) + else + newdf[!, newname] = copycols ? df[:, i] : df[!, i] + end + end + end + else + # nc is normalized so it has a form src_cols => fun => Symbol + newname = last(last(nc)) + if hasproperty(newdf, newname) + # it is possible that the transformation has already been applied + # via multiple column selection, like in select(df, :, :x1 => :y1) + # but then transformed_cols[newname] must be nothing + @assert transformed_cols[newname] === nothing + else + select_transform!(nc, df, newdf, transformed_cols, copycols) + end + end + end + return newdf +end + +select(dfv::SubDataFrame, ind::ColumnIndex; copycols::Bool=true) = + select(dfv, [ind], copycols=copycols) +select(dfv::SubDataFrame, inds::Union{AbstractVector{<:Integer}, AbstractVector{Symbol}, + Colon, All, Not, Between, Regex}; copycols::Bool=true) = + copycols ? 
dfv[:, inds] : view(dfv, :, inds) + +function select(dfv::SubDataFrame, inds...; copycols::Bool=true) + if copycols + return _select(dfv, [normalize_selection(index(dfv), c) for c in inds], true) + else + # we do not support transformations here + # newinds should not be large so making it Vector{Any} should be OK + newinds = [] + seen_single_column = Set{Int}() + for ind in inds + if ind isa ColumnIndex + ind_idx = index(dfv)[ind] + if ind_idx in seen_single_column + throw(ArgumentError("selecting the same column multiple times using" * + " Symbol or integer is not allowed ($ind was " * + "passed more than once)")) + end + push!(seen_single_column, ind_idx) + push!(newinds, ind_idx) # single-column selectors must also end up in the returned view + else + newind = normalize_selection(index(dfv), ind) + if newind isa Pair + throw(ArgumentError("transforming and renaming columns of a " * + "SubDataFrame is not allowed when `copycols=false`")) + end + push!(newinds, newind) + end + end + return view(dfv, :, All(newinds...)) + end +end diff --git a/src/dataframe/dataframe.jl b/src/dataframe/dataframe.jl index d8972c8205..9a8d503037 100644 --- a/src/dataframe/dataframe.jl +++ b/src/dataframe/dataframe.jl @@ -797,141 +797,6 @@ function deleterows!(df::DataFrame, inds::AbstractVector{Bool}) df end -""" - select!(df::DataFrame, inds...) - -Mutate `df` in place to retain only columns specified by `inds...` and return it. - -Arguments passed as `inds...` can be any index that is allowed for column indexing -provided that the columns requested in each of them are unique and present in `df`. -In particular, regular expressions, `All`, `Between`, and `Not` selectors are supported. - -If more than one argument is passed then they are joined as `All(inds...)`. -Note that `All` selects the union of columns passed to it, so columns selected -in different `inds...` do not have to be unique. For example a call -`select!(df, :col, All())` is valid and moves column `:col` in the -data frame to be the first, provided it is present in `df`. 
- -# Examples -```jldoctest -julia> df = DataFrame(a=1:3, b=4:6) -3×2 DataFrame -│ Row │ a │ b │ -│ │ Int64 │ Int64 │ -├─────┼───────┼───────┤ -│ 1 │ 1 │ 4 │ -│ 2 │ 2 │ 5 │ -│ 3 │ 3 │ 6 │ - -julia> select!(df, 2) -3×1 DataFrame -│ Row │ b │ -│ │ Int64 │ -├─────┼───────┤ -│ 1 │ 4 │ -│ 2 │ 5 │ -│ 3 │ 6 │ -``` - -""" -function select!(df::DataFrame, inds::AbstractVector{Int}) - if isempty(inds) - empty!(_columns(df)) - empty!(index(df)) - return df - end - indmin, indmax = extrema(inds) - if indmin < 1 - throw(ArgumentError("indices must be positive")) - end - if indmax > ncol(df) - throw(ArgumentError("indices must not be greater than number of columns")) - end - if !allunique(inds) - throw(ArgumentError("indices must not contain duplicates")) - end - copy!(_columns(df), _columns(df)[inds]) - x = index(df) - copy!(_names(x), _names(df)[inds]) - empty!(x.lookup) - for (i, n) in enumerate(x.names) - x.lookup[n] = i - end - df -end - -select!(df::DataFrame, c::Int) = select!(df, [c]) -select!(df::DataFrame, c::Any) = select!(df, index(df)[c]) -select!(df::DataFrame, c, cs...) = select!(df, All(c, cs...)) - -""" - select(df::AbstractDataFrame, inds...; copycols::Bool=true) - -Create a new data frame that contains columns from `df` -specified by `inds` and return it. - -Arguments passed as `inds...` can be any index that is allowed for column indexing -provided that the columns requested in each of them are unique and present in `df`. -In particular, regular expressions, `All`, `Between`, and `Not` selectors are supported. - -If more than one argument is passed then they are joined as `All(inds...)`. -Note that `All` selects the union of columns passed to it, so columns selected -in different `inds...` do not have to be unique. For example a call -`select(df, :col, All())` is valid and creates a new data frame with column `:col` -moved to be the first, provided it is present in `df`. 
- - -If `df` is a `DataFrame` return a new `DataFrame` that contains columns from `df` -specified by `inds`. -If `copycols=true` (the default), then returned `DataFrame` holds -copies of column vectors in `df`. -If `copycols=false`, then returned `DataFrame` shares column vectors with `df`. - -If `df` is a `SubDataFrame` then a `SubDataFrame` is returned if `copycols=false` -and a `DataFrame` with freshly allocated columns otherwise. - -# Examples -```jldoctest -julia> df = DataFrame(a=1:3, b=4:6) -3×2 DataFrame -│ Row │ a │ b │ -│ │ Int64 │ Int64 │ -├─────┼───────┼───────┤ -│ 1 │ 1 │ 4 │ -│ 2 │ 2 │ 5 │ -│ 3 │ 3 │ 6 │ - -julia> select(df, :b) -3×1 DataFrame -│ Row │ b │ -│ │ Int64 │ -├─────┼───────┤ -│ 1 │ 4 │ -│ 2 │ 5 │ -│ 3 │ 6 │ - -julia> select(df, Not(:b)) # drop column :b from df -3×1 DataFrame -│ Row │ a │ -│ │ Int64 │ -├─────┼───────┤ -│ 1 │ 1 │ -│ 2 │ 2 │ -│ 3 │ 3 │ -``` - -""" -select(df::DataFrame, inds::AbstractVector{Int}; copycols::Bool=true) = - DataFrame(_columns(df)[inds], Index(_names(df)[inds]), - copycols=copycols) - -select(df::DataFrame, c::Int; copycols::Bool=true) = - select(df, [c], copycols=copycols) -select(df::DataFrame, c::Any; copycols::Bool=true) = - select(df, index(df)[c], copycols=copycols) -select(df::DataFrame, c, cs...; copycols::Bool=true) = - select(df, All(c, cs...), copycols=copycols) - ############################################################################## ## ## Hcat specialization diff --git a/src/groupeddataframe/splitapplycombine.jl b/src/groupeddataframe/splitapplycombine.jl index c62d348887..b113d61c1a 100644 --- a/src/groupeddataframe/splitapplycombine.jl +++ b/src/groupeddataframe/splitapplycombine.jl @@ -1264,11 +1264,6 @@ function aggregate(d::AbstractDataFrame, cols, fs::Any; aggregate(groupby(d, cols, sort=sort, skipmissing=skipmissing), fs) end -function funname(f) - n = nameof(f) - String(n)[1] == '#' ? 
:function : n -end - _makeheaders(fs::AbstractVector, cn::AbstractVector{Symbol}) = [Symbol(colname, '_', funname(f)) for f in fs for colname in cn] diff --git a/src/other/index.jl b/src/other/index.jl index d2dd51f612..146a583595 100644 --- a/src/other/index.jl +++ b/src/other/index.jl @@ -342,6 +342,9 @@ Base.@propagate_inbounds parentcols(ind::SubIndex, idx::AbstractVector{Symbol}) Base.@propagate_inbounds parentcols(ind::SubIndex, idx::Regex) = [parentcols(ind, i) for i in _names(ind) if occursin(idx, String(i))] +Base.@propagate_inbounds parentcols(ind::SubIndex, idx::Union{All, Between}) = + parentcols(ind, ind[idx]) + Base.@propagate_inbounds parentcols(ind::SubIndex, ::Colon) = ind.cols Base.@propagate_inbounds parentcols(ind::SubIndex, idx::Not) = parentcols(ind, ind[idx]) diff --git a/src/other/utils.jl b/src/other/utils.jl index 0f58becc0e..4c31776267 100644 --- a/src/other/utils.jl +++ b/src/other/utils.jl @@ -57,3 +57,8 @@ function gennames(n::Integer) end return res end + +function funname(f) + n = nameof(f) + String(n)[1] == '#' ? :function : n +end diff --git a/src/subdataframe/subdataframe.jl b/src/subdataframe/subdataframe.jl index 6620ee4cd5..6cc38f5d95 100644 --- a/src/subdataframe/subdataframe.jl +++ b/src/subdataframe/subdataframe.jl @@ -173,10 +173,3 @@ function DataFrame(sdf::SubDataFrame; copycols::Bool=true) end Base.convert(::Type{DataFrame}, sdf::SubDataFrame) = DataFrame(sdf) - -select(dfv::SubDataFrame, inds; copycols::Bool=true) = - copycols ? 
dfv[:, inds] : view(dfv, :, inds) -select(dfv::SubDataFrame, inds::ColumnIndex; copycols::Bool=true) = - select(dfv, [inds], copycols=copycols) -select(dfv::SubDataFrame, ind, inds...; copycols::Bool=true) = - select(dfv, All(ind, inds...), copycols=copycols) diff --git a/test/dataframe.jl b/test/dataframe.jl index d68aca0830..d280825f57 100644 --- a/test/dataframe.jl +++ b/test/dataframe.jl @@ -508,505 +508,6 @@ end @test_logs (:warn, r"In the future `push!` will not allow passing collections of type") push!(df, "ab") end -@testset "select! Not" begin - df = DataFrame(a=1, b=2, c=3, d=4, e=5) - @test_throws BoundsError select!(df, Not(0)) - @test_throws BoundsError select!(df, Not(6)) - @test_throws ArgumentError select!(df, Not([1, 1])) - @test_throws ArgumentError select!(df, Not(:f)) - @test_throws BoundsError select!(df, Not([true, false])) - - d = copy(df) - select!(d, Not([:a, :e, :c])) - @test names(d) == [:b, :d] - select!(d, Not(:b)) - @test d == DataFrame(d=4) - DataFrames._check_consistency(d) - - d = copy(df) - select!(d, Not(r"[aec]")) - @test names(d) == [:b, :d] - select!(d, Not(r"b")) - @test d == DataFrame(d=4) - - d = copy(df) - select!(d, Not([2, 5, 3])) - @test names(d) == [:a, :d] - select!(d, Not(2)) - @test d == DataFrame(a=1) - - d = copy(df) - select!(d, Not(2:3)) - @test d == DataFrame(a=1, d=4, e=5) - - d = copy(df) - select!(d, Not([false, true, true, false, false])) - @test d == DataFrame(a=1, d=4, e=5) -end - -@testset "select Not" begin - df = DataFrame(a=1, b=2, c=3, d=4, e=5) - @test_throws BoundsError select(df, Not(0)) - @test_throws BoundsError select(df, Not(6)) - @test_throws ArgumentError select(df, Not([1, 1])) - @test_throws ArgumentError select(df, Not(:f)) - @test_throws BoundsError select(df, Not([true, false])) - - df2 = copy(df) - d = select(df, Not([:a, :e, :c])) - @test names(d) == [:b, :d] - @test d == df[:, [:b, :d]] - @test d.b !== df.b - @test d.d !== df.d - @test df == df2 - - df2 = copy(df) - d = select(df, 
Not(r"[aec]")) - @test names(d) == [:b, :d] - @test d == df[:, [:b, :d]] - @test d == df[:, r"[bd]"] - @test d.b !== df.b - @test d.d !== df.d - @test df == df2 - - d = select(df, Not([2, 5, 3])) - @test names(d) == [:a, :d] - @test d.a !== df.a - @test d.d !== df.d - @test d == df[:, [:a, :d]] - @test df == df2 - - d = select(df, Not(2:3)) - @test d == DataFrame(a=1, d=4, e=5) - @test d.a !== df.a - @test d.d !== df.d - @test d.e !== df.e - @test df == df2 - - d = select(df, Not([false, true, true, false, false])) - @test d == DataFrame(a=1, d=4, e=5) - @test d.a !== df.a - @test d.d !== df.d - @test d.e !== df.e - @test df == df2 - - d = select(df, Not(1)) - @test d == DataFrame(b=2,c=3,d=4,e=5) - @test d.b !== df.b - @test d.b == df.b - @test df == df2 - - d = select(df, Not([:a, :e, :c]), copycols=false) - @test names(d) == [:b, :d] - @test d == df[:, [:b, :d]] - @test d.b === df.b - @test d.d === df.d - @test df == df2 - - d = select(df, Not(r"[aec]"), copycols=false) - @test names(d) == [:b, :d] - @test d == df[:, [:b, :d]] - @test d == df[:, r"[bd]"] - @test d.b === df.b - @test d.d === df.d - @test df == df2 - - d = select(df, Not([2, 5, 3]), copycols=false) - @test names(d) == [:a, :d] - @test d.a === df.a - @test d.d === df.d - @test d == df[:, [:a, :d]] - @test df == df2 - - d = select(df, Not(2:3), copycols=false) - @test d == DataFrame(a=1, d=4, e=5) - @test d.a === df.a - @test d.d === df.d - @test d.e === df.e - @test df == df2 - - d = select(df, Not([false, true, true, false, false]), copycols=false) - @test d == DataFrame(a=1, d=4, e=5) - @test d.a === df.a - @test d.d === df.d - @test d.e === df.e - @test df == df2 - - d = select(df, Not(1), copycols=false) - @test d == DataFrame(b=2,c=3,d=4,e=5) - @test d.b === df.b - @test df == df2 -end - -@testset "select Not view" begin - df = view(DataFrame(a=1, b=2, c=3, d=4, e=5), :, :) - @test_throws BoundsError select(df, Not(0)) - @test_throws BoundsError select(df, Not(6)) - @test_throws ArgumentError 
select(df, Not([1, 1])) - @test_throws ArgumentError select(df, Not(:f)) - @test_throws BoundsError select(df, Not([true, false])) - - df2 = copy(df) - d = select(df, Not([:a, :e, :c])) - @test d isa DataFrame - @test names(d) == [:b, :d] - @test d == df[:, [:b, :d]] - @test d.b !== df.b - @test d.d !== df.d - @test df == df2 - - df2 = copy(df) - d = select(df, Not(r"[aec]")) - @test d isa DataFrame - @test names(d) == [:b, :d] - @test d == df[:, [:b, :d]] - @test d == df[:, r"[bd]"] - @test d.b !== df.b - @test d.d !== df.d - @test df == df2 - - d = select(df, Not([2, 5, 3])) - @test d isa DataFrame - @test names(d) == [:a, :d] - @test d.a !== df.a - @test d.d !== df.d - @test d == df[:, [:a, :d]] - @test df == df2 - - d = select(df, Not(2:3)) - @test d isa DataFrame - @test d == DataFrame(a=1, d=4, e=5) - @test d.a !== df.a - @test d.d !== df.d - @test d.e !== df.e - @test df == df2 - - d = select(df, Not([false, true, true, false, false])) - @test d isa DataFrame - @test d == DataFrame(a=1, d=4, e=5) - @test d.a !== df.a - @test d.d !== df.d - @test d.e !== df.e - @test df == df2 - - d = select(df, Not(1)) - @test d isa DataFrame - @test d == DataFrame(b=2,c=3,d=4,e=5) - @test d.b !== df.b - @test d.b == df.b - @test df == df2 - - d = select(df, Not([:a, :e, :c]), copycols=false) - @test d isa SubDataFrame - @test names(d) == [:b, :d] - @test d == df[:, [:b, :d]] - @test d.b === df.b - @test d.d === df.d - @test df == df2 - - d = select(df, Not(r"[aec]"), copycols=false) - @test d isa SubDataFrame - @test names(d) == [:b, :d] - @test d == df[:, [:b, :d]] - @test d == df[:, r"[bd]"] - @test d.b === df.b - @test d.d === df.d - @test df == df2 - - d = select(df, Not([2, 5, 3]), copycols=false) - @test d isa SubDataFrame - @test names(d) == [:a, :d] - @test d.a === df.a - @test d.d === df.d - @test d == df[:, [:a, :d]] - @test df == df2 - - d = select(df, Not(2:3), copycols=false) - @test d isa SubDataFrame - @test d == DataFrame(a=1, d=4, e=5) - @test d.a === df.a 
- @test d.d === df.d - @test d.e === df.e - @test df == df2 - - d = select(df, Not([false, true, true, false, false]), copycols=false) - @test d isa SubDataFrame - @test d == DataFrame(a=1, d=4, e=5) - @test d.a === df.a - @test d.d === df.d - @test d.e === df.e - @test df == df2 - - d = select(df, Not(1), copycols=false) - @test d isa SubDataFrame - @test d == DataFrame(b=2,c=3,d=4,e=5) - @test d.b === df.b - @test df == df2 -end - -@testset "select!" begin - df = DataFrame(a=1, b=2, c=3, d=4, e=5) - @test_throws ArgumentError select!(df, 0) - @test_throws ArgumentError select!(df, 6) - @test_throws ArgumentError select!(df, [1, 1]) - @test_throws ArgumentError select!(df, :f) - @test_throws BoundsError select!(df, [true, false]) - - @test_throws MethodError select!(view(df, :, :), 1:2) - - d = copy(df, copycols=false) - @test select!(d, 1:0) == DataFrame() - @test select!(d, Not(r"")) == DataFrame() - - d = copy(df, copycols=false) - select!(d, [:a, :e, :c]) - @test names(d) == [:a, :e, :c] - @test d.a === df.a - @test d.e === df.e - @test d.c === df.c - - d = copy(df, copycols=false) - select!(d, r"[aec]") - @test names(d) == [:a, :c, :e] - @test d.a === df.a - @test d.e === df.e - @test d.c === df.c - - d = copy(df, copycols=false) - select!(d, [true, false, true, false, true]) - @test names(d) == [:a, :c, :e] - @test d.a === df.a - @test d.c === df.c - @test d.e === df.e - - d = copy(df, copycols=false) - select!(d, [:d, :e, :a, :c, :b]) - @test names(d) == [:d, :e, :a, :c, :b] - for i in [:d, :e, :a, :c, :b] - @test d[!, i] === df[!, i] - end - - d = copy(df, copycols=false) - select!(d, [2, 5, 3]) - @test names(d) == [:b, :e, :c] - @test d.b === df.b - @test d.e === df.e - @test d.c === df.c - - d = copy(df, copycols=false) - select!(d, 2:3) - @test names(d) == [:b, :c] - @test d.b === df.b - @test d.c === df.c - - d = copy(df, copycols=false) - select!(d, 2) - @test names(d) == [:b] - @test d.b === df.b -end - -@testset "select" begin - df = DataFrame(a=1, 
b=2, c=3, d=4, e=5) - @test_throws BoundsError select(df, 0) - @test_throws BoundsError select(df, 6) - @test_throws ArgumentError select(df, [1, 1]) - @test_throws ArgumentError select(df, :f) - @test_throws BoundsError select!(df, [true, false]) - - @test select(df, 1:0) == DataFrame() - @test select(df, Not(r"")) == DataFrame() - @test select(df, 1:0, copycols=false) == DataFrame() - @test select(df, Not(r""), copycols=false) == DataFrame() - - d = select(df, [:a, :e, :c]) - @test names(d) == [:a, :e, :c] - @test d.a !== df.a - @test d.e !== df.e - @test d.c !== df.c - @test d.a == df.a - @test d.e == df.e - @test d.c == df.c - - d = select(df, r"[aec]") - @test names(d) == [:a, :c, :e] - @test d.a !== df.a - @test d.e !== df.e - @test d.c !== df.c - @test d.a == df.a - @test d.e == df.e - @test d.c == df.c - - d = select(df, [true, false, true, false, true]) - @test names(d) == [:a, :c, :e] - @test d.a !== df.a - @test d.c !== df.c - @test d.e !== df.e - @test d.a == df.a - @test d.c == df.c - @test d.e == df.e - - d = select(df, [2, 5, 3]) - @test names(d) == [:b, :e, :c] - @test d.b !== df.b - @test d.e !== df.e - @test d.c !== df.c - @test d.b == df.b - @test d.e == df.e - @test d.c == df.c - - d = select(df, 2:3) - @test names(d) == [:b, :c] - @test d.b !== df.b - @test d.c !== df.c - @test d.b == df.b - @test d.c == df.c - - d = select(df, 2) - @test names(d) == [:b] - @test d.b !== df.b - @test d.b == df.b - - d = select(df, [:a, :e, :c], copycols=false) - @test names(d) == [:a, :e, :c] - @test d.a === df.a - @test d.e === df.e - @test d.c === df.c - - d = select(df, r"[aec]", copycols=false) - @test names(d) == [:a, :c, :e] - @test d.a === df.a - @test d.e === df.e - @test d.c === df.c - - d = select(df, [true, false, true, false, true], copycols=false) - @test names(d) == [:a, :c, :e] - @test d.a === df.a - @test d.c === df.c - @test d.e === df.e - - d = select(df, [2, 5, 3], copycols=false) - @test names(d) == [:b, :e, :c] - @test d.b === df.b - @test 
d.e === df.e - @test d.c === df.c - - d = select(df, 2:3, copycols=false) - @test names(d) == [:b, :c] - @test d.b === df.b - @test d.c === df.c - - d = select(df, 2, copycols=false) - @test names(d) == [:b] - @test d.b === df.b -end - -@testset "select view" begin - df = view(DataFrame(a=1, b=2, c=3, d=4, e=5), :, :) - @test_throws BoundsError select(df, 0) - @test_throws BoundsError select(df, 6) - @test_throws ArgumentError select(df, [1, 1]) - @test_throws ArgumentError select(df, :f) - @test_throws MethodError select!(df, [true, false]) - - @test select(df, 1:0) == DataFrame() - @test select(df, Not(r"")) == DataFrame() - @test select(df, 1:0, copycols=false) == DataFrame() - @test select(df, Not(r""), copycols=false) == DataFrame() - - d = select(df, [:a, :e, :c]) - @test d isa DataFrame - @test names(d) == [:a, :e, :c] - @test d.a !== df.a - @test d.e !== df.e - @test d.c !== df.c - @test d.a == df.a - @test d.e == df.e - @test d.c == df.c - - d = select(df, r"[aec]") - @test d isa DataFrame - @test names(d) == [:a, :c, :e] - @test d.a !== df.a - @test d.e !== df.e - @test d.c !== df.c - @test d.a == df.a - @test d.e == df.e - @test d.c == df.c - - d = select(df, [true, false, true, false, true]) - @test d isa DataFrame - @test names(d) == [:a, :c, :e] - @test d.a !== df.a - @test d.c !== df.c - @test d.e !== df.e - @test d.a == df.a - @test d.c == df.c - @test d.e == df.e - - d = select(df, [2, 5, 3]) - @test d isa DataFrame - @test names(d) == [:b, :e, :c] - @test d.b !== df.b - @test d.e !== df.e - @test d.c !== df.c - @test d.b == df.b - @test d.e == df.e - @test d.c == df.c - - d = select(df, 2:3) - @test d isa DataFrame - @test names(d) == [:b, :c] - @test d.b !== df.b - @test d.c !== df.c - @test d.b == df.b - @test d.c == df.c - - d = select(df, 2) - @test d isa DataFrame - @test names(d) == [:b] - @test d.b !== df.b - @test d.b == df.b - - d = select(df, [:a, :e, :c], copycols=false) - @test d isa SubDataFrame - @test names(d) == [:a, :e, :c] - 
@test d.a === df.a - @test d.e === df.e - @test d.c === df.c - - d = select(df, r"[aec]", copycols=false) - @test d isa SubDataFrame - @test names(d) == [:a, :c, :e] - @test d.a === df.a - @test d.e === df.e - @test d.c === df.c - - d = select(df, [true, false, true, false, true], copycols=false) - @test d isa SubDataFrame - @test names(d) == [:a, :c, :e] - @test d.a === df.a - @test d.c === df.c - @test d.e === df.e - - d = select(df, [2, 5, 3], copycols=false) - @test d isa SubDataFrame - @test names(d) == [:b, :e, :c] - @test d.b === df.b - @test d.e === df.e - @test d.c === df.c - - d = select(df, 2:3, copycols=false) - @test d isa SubDataFrame - @test names(d) == [:b, :c] - @test d.b === df.b - @test d.c === df.c - - d = select(df, 2, copycols=false) - @test d isa SubDataFrame - @test names(d) == [:b] - @test d.b === df.b -end - @testset "deleterows!" begin df = DataFrame(a=[1, 2], b=[3.0, 4.0]) @test deleterows!(df, 1) === df @@ -1888,69 +1389,6 @@ end @test df[1, :B] === 0 end -@testset "select! 
on all columns" begin - a, b, c = 1:5, 2:6, 3:7 - original = DataFrame(a=a, b=b, c=c) - - df = deepcopy(original) - expected = deepcopy(original) - @test select!(df, [:a, :b, :c]) === df - @test df == expected - @test select!(df, 1:3) === df - @test df == expected - - df = deepcopy(original) - expected = DataFrame(b=b, c=c, a=a) - select!(df, [:b, :c, :a]) === df - @test df == expected - df = deepcopy(original) - select!(df, [2, 3, 1]) === df - @test df == expected - - df = deepcopy(original) - expected = DataFrame(c=c, a=a, b=b) - select!(df, [:c, :a, :b]) === df - @test df == expected - df = deepcopy(original) - select!(df, [3, 1, 2]) === df - @test df == expected - - df = deepcopy(original) - expected = DataFrame(a=a, c=c, b=b) - select!(df, [:a, :c, :b]) === df - @test df == expected - df = deepcopy(original) - select!(df, [1, 3, 2]) === df - @test df == expected - - df = deepcopy(original) - expected = DataFrame(b=b, a=a, c=c) - select!(df, [:b, :a, :c]) === df - @test df == expected - df = deepcopy(original) - select!(df, [2, 1, 3]) === df - @test df == expected - - df = deepcopy(original) - expected = DataFrame(c=c, b=b, a=a) - select!(df, [:c, :b, :a]) === df - @test df == expected - df = deepcopy(original) - select!(df, [3, 2, 1]) === df - @test df == expected - - df = DataFrame(a=a, b=b, c=c) - @test_throws ArgumentError select!(df, 1:4) - @test_throws ArgumentError select!(df, [:a, :b, :c, :d]) - @test_throws ArgumentError select!(df, [1, 2, 3, 1]) - @test_throws ArgumentError select!(df, [:a, :b, :c, :a]) - - # but this works - @test select!(copy(df), [:a, :c]) == df[:, [:a, :c]] - @test select!(copy(df), [:a, :b]) == df[:, [:a, :b]] - @test select!(copy(df), [1, 3]) == df[:, [1, 3]] -end - @testset "getproperty, setproperty! and propertynames" begin x = collect(1:10) y = collect(1.0:10.0) @@ -2122,25 +1560,6 @@ end df2v[Not(1), Between(1,2)] = Matrix(df[!, 1:2]) end -@testset "select and select! 
with multiple columns passed" begin - df = DataFrame(rand(10, 4)) - @test select(df, :x2, :x4, All()) == select(df, :x2, :x4, :x1, :x3) - @test select(df, :x4, Between(:x2, :x4), All()) == select(df, :x4, :x2, :x3, :x1) - - dfv = view(df, :, :) - @test select(dfv, :x2, :x4, All()) == select(dfv, :x2, :x4, :x1, :x3) - @test select(dfv, :x4, Between(:x2, :x4), All()) == select(dfv, :x4, :x2, :x3, :x1) - - dfc = copy(df) - @test select!(dfc, :x2, :x4, All()) == dfc - @test dfc == select(df, :x2, :x4, :x1, :x3) - dfc = copy(df) - @test select!(dfc, :x4, Between(:x2, :x4), All()) == dfc - @test dfc == select(df, :x4, :x2, :x3, :x1) - - @test select(df, Not([:x2, :x3]), All()) == select(df, :x1, :x4, :x2, :x3) -end - @testset "vcat and push! with :orderequal" begin for v in ((a=10, b=20, c=30), DataFrame(a=10, b=20, c=30)[1, :], diff --git a/test/index.jl b/test/index.jl index 44b54a085b..eb6b63802a 100644 --- a/test/index.jl +++ b/test/index.jl @@ -236,6 +236,8 @@ end @test DataFrames.parentcols(SubIndex(i, r"x1.")) == [2, 3] @test isempty(DataFrames.parentcols(SubIndex(i, r"xx"))) @test DataFrames.parentcols(SubIndex(i, r"")) == 1:5 + @test DataFrames.parentcols(SubIndex(i, All())) == 1:5 + @test DataFrames.parentcols(SubIndex(i, Between(:x1, :x12))) == 1:2 i2 = SubIndex(i, r"") @test i2[r"x1."] == [2, 3] @@ -250,6 +252,8 @@ end @test DataFrames.parentcols(SubIndex(i2, r"x1.")) == [2, 3] @test isempty(DataFrames.parentcols(SubIndex(i2, r"xx"))) @test DataFrames.parentcols(SubIndex(i2, r"")) == 1:5 + @test DataFrames.parentcols(SubIndex(i2, All())) == 1:5 + @test DataFrames.parentcols(SubIndex(i2, Between(:x1, :x12))) == 1:2 i3 = SubIndex(i, r"x1.") @test i3[r"x1.$"] == [1] @@ -264,6 +268,9 @@ end @test DataFrames.parentcols(SubIndex(i3, r"x1.$")) == [1] @test isempty(DataFrames.parentcols(SubIndex(i3, r"xx"))) @test DataFrames.parentcols(SubIndex(i3, r"")) == 1:2 + @test DataFrames.parentcols(SubIndex(i3, All())) == 1:2 + @test_throws BoundsError 
DataFrames.parentcols(SubIndex(i3, Between(:x1, :x12))) == 1:2 + @test DataFrames.parentcols(SubIndex(i3, Between(:x12, :x12))) == 1:1 end @testset "Not indexing" begin diff --git a/test/runtests.jl b/test/runtests.jl index afac7fa5f6..26446fe6d8 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -13,6 +13,7 @@ my_tests = ["utils.jl", "data.jl", "index.jl", "dataframe.jl", + "select.jl", "reshape.jl", "dataframerow.jl", "io.jl", diff --git a/test/select.jl b/test/select.jl new file mode 100644 index 0000000000..7bffdf5974 --- /dev/null +++ b/test/select.jl @@ -0,0 +1,838 @@ +module TestSelect + +using DataFrames, Test, Random + +Random.seed!(1234) + +@testset "select! Not" begin + df = DataFrame(a=1, b=2, c=3, d=4, e=5) + @test_throws BoundsError select!(df, Not(0)) + @test_throws BoundsError select!(df, Not(6)) + @test_throws ArgumentError select!(df, Not([1, 1])) + @test_throws ArgumentError select!(df, Not(:f)) + @test_throws BoundsError select!(df, Not([true, false])) + + d = copy(df) + select!(d, Not([:a, :e, :c])) + @test d == DataFrame(b=2, d=4) + DataFrames._check_consistency(d) + select!(d, Not(:b)) + @test d == DataFrame(d=4) + DataFrames._check_consistency(d) + + d = copy(df) + select!(d, Not(r"[aec]")) + @test d == DataFrame(b=2, d=4) + DataFrames._check_consistency(d) + select!(d, Not(r"b")) + @test d == DataFrame(d=4) + DataFrames._check_consistency(d) + + d = copy(df) + select!(d, Not([2, 5, 3])) + @test d == DataFrame(a=1, d=4) + DataFrames._check_consistency(d) + select!(d, Not(2)) + @test d == DataFrame(a=1) + DataFrames._check_consistency(d) + + d = copy(df) + select!(d, Not(2:3)) + @test d == DataFrame(a=1, d=4, e=5) + DataFrames._check_consistency(d) + + d = copy(df) + select!(d, Not([false, true, true, false, false])) + @test d == DataFrame(a=1, d=4, e=5) + DataFrames._check_consistency(d) +end + +@testset "select Not" begin + df = DataFrame(a=1, b=2, c=3, d=4, e=5) + @test_throws BoundsError select(df, Not(0)) + @test_throws BoundsError 
select(df, Not(6)) + @test_throws ArgumentError select(df, Not([1, 1])) + @test_throws ArgumentError select(df, Not(:f)) + @test_throws BoundsError select(df, Not([true, false])) + + df2 = copy(df) + d = select(df, Not([:a, :e, :c])) + @test d == df[:, [:b, :d]] + @test d.b !== df.b + @test d.d !== df.d + @test df == df2 + + df2 = copy(df) + d = select(df, Not(r"[aec]")) + @test d == df[:, [:b, :d]] + @test d == df[:, r"[bd]"] + @test d.b !== df.b + @test d.d !== df.d + @test df == df2 + + d = select(df, Not([2, 5, 3])) + @test d.a !== df.a + @test d.d !== df.d + @test d == df[:, [:a, :d]] + @test df == df2 + + d = select(df, Not(2:3)) + @test d == DataFrame(a=1, d=4, e=5) + @test d.a !== df.a + @test d.d !== df.d + @test d.e !== df.e + @test df == df2 + + d = select(df, Not([false, true, true, false, false])) + @test d == DataFrame(a=1, d=4, e=5) + @test d.a !== df.a + @test d.d !== df.d + @test d.e !== df.e + @test df == df2 + + d = select(df, Not(1)) + @test d == DataFrame(b=2,c=3,d=4,e=5) + @test d.b !== df.b + @test d.b == df.b + @test df == df2 + + d = select(df, Not([:a, :e, :c]), copycols=false) + @test d == df[:, [:b, :d]] + @test d.b === df.b + @test d.d === df.d + @test df == df2 + + d = select(df, Not(r"[aec]"), copycols=false) + @test d == df[:, [:b, :d]] + @test d == df[:, r"[bd]"] + @test d.b === df.b + @test d.d === df.d + @test df == df2 + + d = select(df, Not([2, 5, 3]), copycols=false) + @test d.a === df.a + @test d.d === df.d + @test d == df[:, [:a, :d]] + @test df == df2 + + d = select(df, Not(2:3), copycols=false) + @test d == DataFrame(a=1, d=4, e=5) + @test d.a === df.a + @test d.d === df.d + @test d.e === df.e + @test df == df2 + + d = select(df, Not([false, true, true, false, false]), copycols=false) + @test d == DataFrame(a=1, d=4, e=5) + @test d.a === df.a + @test d.d === df.d + @test d.e === df.e + @test df == df2 + + d = select(df, Not(1), copycols=false) + @test d == DataFrame(b=2,c=3,d=4,e=5) + @test d.b === df.b + @test df == df2 
+end + +@testset "select Not on SubDataFrame" begin + df = view(DataFrame(a=1, b=2, c=3, d=4, e=5), :, :) + @test_throws BoundsError select(df, Not(0)) + @test_throws BoundsError select(df, Not(6)) + @test_throws ArgumentError select(df, Not([1, 1])) + @test_throws ArgumentError select(df, Not(:f)) + @test_throws BoundsError select(df, Not([true, false])) + + df2 = copy(df) + d = select(df, Not([:a, :e, :c])) + @test d isa DataFrame + @test d == df[:, [:b, :d]] + @test d.b !== df.b + @test d.d !== df.d + @test df == df2 + + df2 = copy(df) + d = select(df, Not(r"[aec]")) + @test d isa DataFrame + @test d == df[:, [:b, :d]] + @test d == df[:, r"[bd]"] + @test d.b !== df.b + @test d.d !== df.d + @test df == df2 + + d = select(df, Not([2, 5, 3])) + @test d isa DataFrame + @test d.a !== df.a + @test d.d !== df.d + @test d == df[:, [:a, :d]] + @test df == df2 + + d = select(df, Not(2:3)) + @test d isa DataFrame + @test d == DataFrame(a=1, d=4, e=5) + @test d.a !== df.a + @test d.d !== df.d + @test d.e !== df.e + @test df == df2 + + d = select(df, Not([false, true, true, false, false])) + @test d isa DataFrame + @test d == DataFrame(a=1, d=4, e=5) + @test d.a !== df.a + @test d.d !== df.d + @test d.e !== df.e + @test df == df2 + + d = select(df, Not(1)) + @test d isa DataFrame + @test d == DataFrame(b=2,c=3,d=4,e=5) + @test d.b !== df.b + @test d.b == df.b + @test df == df2 + + d = select(df, Not([:a, :e, :c]), copycols=false) + @test d isa SubDataFrame + @test d == df[:, [:b, :d]] + @test d.b === df.b + @test d.d === df.d + @test df == df2 + + d = select(df, Not(r"[aec]"), copycols=false) + @test d isa SubDataFrame + @test d == df[:, [:b, :d]] + @test d == df[:, r"[bd]"] + @test d.b === df.b + @test d.d === df.d + @test df == df2 + + d = select(df, Not([2, 5, 3]), copycols=false) + @test d isa SubDataFrame + @test d.a === df.a + @test d.d === df.d + @test d == df[:, [:a, :d]] + @test df == df2 + + d = select(df, Not(2:3), copycols=false) + @test d isa SubDataFrame + 
@test d == DataFrame(a=1, d=4, e=5) + @test d.a === df.a + @test d.d === df.d + @test d.e === df.e + @test df == df2 + + d = select(df, Not([false, true, true, false, false]), copycols=false) + @test d isa SubDataFrame + @test d == DataFrame(a=1, d=4, e=5) + @test d.a === df.a + @test d.d === df.d + @test d.e === df.e + @test df == df2 + + d = select(df, Not(1), copycols=false) + @test d isa SubDataFrame + @test d == DataFrame(b=2,c=3,d=4,e=5) + @test d.b === df.b + @test df == df2 +end + +@testset "select!" begin + df = DataFrame(a=1, b=2, c=3, d=4, e=5) + @test_throws ArgumentError select!(df, 0) + @test_throws ArgumentError select!(df, 6) + @test_throws ArgumentError select!(df, [1, 1]) + @test_throws ArgumentError select!(df, :f) + @test_throws BoundsError select!(df, [true, false]) + + @test_throws MethodError select!(view(df, :, :), 1:2) + + d = copy(df, copycols=false) + @test select!(d, 1:0) == DataFrame() + @test select!(d, Not(r"")) == DataFrame() + + d = copy(df, copycols=false) + select!(d, [:a, :e, :c]) + @test names(d) == [:a, :e, :c] + @test d.a === df.a + @test d.e === df.e + @test d.c === df.c + + d = copy(df, copycols=false) + select!(d, r"[aec]") + @test names(d) == [:a, :c, :e] + @test d.a === df.a + @test d.e === df.e + @test d.c === df.c + + d = copy(df, copycols=false) + select!(d, [true, false, true, false, true]) + @test names(d) == [:a, :c, :e] + @test d.a === df.a + @test d.c === df.c + @test d.e === df.e + + d = copy(df, copycols=false) + select!(d, [:d, :e, :a, :c, :b]) + @test names(d) == [:d, :e, :a, :c, :b] + for i in [:d, :e, :a, :c, :b] + @test d[!, i] === df[!, i] + end + + d = copy(df, copycols=false) + select!(d, [2, 5, 3]) + @test names(d) == [:b, :e, :c] + @test d.b === df.b + @test d.e === df.e + @test d.c === df.c + + d = copy(df, copycols=false) + select!(d, 2:3) + @test names(d) == [:b, :c] + @test d.b === df.b + @test d.c === df.c + + d = copy(df, copycols=false) + select!(d, 2) + @test names(d) == [:b] + @test d.b === 
df.b +end + +@testset "select" begin + df = DataFrame(a=1, b=2, c=3, d=4, e=5) + @test_throws BoundsError select(df, 0) + @test_throws BoundsError select(df, 6) + @test_throws ArgumentError select(df, [1, 1]) + @test_throws ArgumentError select(df, :f) + @test_throws BoundsError select!(df, [true, false]) + + @test select(df, 1:0) == DataFrame() + @test select(df, Not(r"")) == DataFrame() + @test select(df, 1:0, copycols=false) == DataFrame() + @test select(df, Not(r""), copycols=false) == DataFrame() + + d = select(df, [:a, :e, :c]) + @test names(d) == [:a, :e, :c] + @test d.a !== df.a + @test d.e !== df.e + @test d.c !== df.c + @test d.a == df.a + @test d.e == df.e + @test d.c == df.c + + d = select(df, r"[aec]") + @test names(d) == [:a, :c, :e] + @test d.a !== df.a + @test d.e !== df.e + @test d.c !== df.c + @test d.a == df.a + @test d.e == df.e + @test d.c == df.c + + d = select(df, [true, false, true, false, true]) + @test names(d) == [:a, :c, :e] + @test d.a !== df.a + @test d.c !== df.c + @test d.e !== df.e + @test d.a == df.a + @test d.c == df.c + @test d.e == df.e + + d = select(df, [2, 5, 3]) + @test names(d) == [:b, :e, :c] + @test d.b !== df.b + @test d.e !== df.e + @test d.c !== df.c + @test d.b == df.b + @test d.e == df.e + @test d.c == df.c + + d = select(df, 2:3) + @test names(d) == [:b, :c] + @test d.b !== df.b + @test d.c !== df.c + @test d.b == df.b + @test d.c == df.c + + d = select(df, 2) + @test names(d) == [:b] + @test d.b !== df.b + @test d.b == df.b + + d = select(df, [:a, :e, :c], copycols=false) + @test names(d) == [:a, :e, :c] + @test d.a === df.a + @test d.e === df.e + @test d.c === df.c + + d = select(df, r"[aec]", copycols=false) + @test names(d) == [:a, :c, :e] + @test d.a === df.a + @test d.e === df.e + @test d.c === df.c + + d = select(df, [true, false, true, false, true], copycols=false) + @test names(d) == [:a, :c, :e] + @test d.a === df.a + @test d.c === df.c + @test d.e === df.e + + d = select(df, [2, 5, 3], copycols=false) + 
@test names(d) == [:b, :e, :c] + @test d.b === df.b + @test d.e === df.e + @test d.c === df.c + + d = select(df, 2:3, copycols=false) + @test names(d) == [:b, :c] + @test d.b === df.b + @test d.c === df.c + + d = select(df, 2, copycols=false) + @test names(d) == [:b] + @test d.b === df.b +end + +@testset "select on SubDataFrame" begin + df = view(DataFrame(a=1, b=2, c=3, d=4, e=5), :, :) + @test_throws BoundsError select(df, 0) + @test_throws BoundsError select(df, 6) + @test_throws ArgumentError select(df, [1, 1]) + @test_throws ArgumentError select(df, :f) + @test_throws MethodError select!(df, [true, false]) + + @test select(df, 1:0) == DataFrame() + @test select(df, Not(r"")) == DataFrame() + @test select(df, 1:0, copycols=false) == DataFrame() + @test select(df, Not(r""), copycols=false) == DataFrame() + + d = select(df, [:a, :e, :c]) + @test d isa DataFrame + @test names(d) == [:a, :e, :c] + @test d.a !== df.a + @test d.e !== df.e + @test d.c !== df.c + @test d.a == df.a + @test d.e == df.e + @test d.c == df.c + + d = select(df, r"[aec]") + @test d isa DataFrame + @test names(d) == [:a, :c, :e] + @test d.a !== df.a + @test d.e !== df.e + @test d.c !== df.c + @test d.a == df.a + @test d.e == df.e + @test d.c == df.c + + d = select(df, [true, false, true, false, true]) + @test d isa DataFrame + @test names(d) == [:a, :c, :e] + @test d.a !== df.a + @test d.c !== df.c + @test d.e !== df.e + @test d.a == df.a + @test d.c == df.c + @test d.e == df.e + + d = select(df, [2, 5, 3]) + @test d isa DataFrame + @test names(d) == [:b, :e, :c] + @test d.b !== df.b + @test d.e !== df.e + @test d.c !== df.c + @test d.b == df.b + @test d.e == df.e + @test d.c == df.c + + d = select(df, 2:3) + @test d isa DataFrame + @test names(d) == [:b, :c] + @test d.b !== df.b + @test d.c !== df.c + @test d.b == df.b + @test d.c == df.c + + d = select(df, 2) + @test d isa DataFrame + @test names(d) == [:b] + @test d.b !== df.b + @test d.b == df.b + + d = select(df, [:a, :e, :c], 
# Reordering every column with `select!` must mutate and return the same object.
@testset "select! on all columns" begin
    a, b, c = 1:5, 2:6, 3:7
    original = DataFrame(a=a, b=b, c=c)

    df = deepcopy(original)
    expected = deepcopy(original)
    @test select!(df, [:a, :b, :c]) === df
    @test df == expected
    @test select!(df, 1:3) === df
    @test df == expected

    # Each permutation is exercised by name and by position; the identity of the
    # returned object is asserted (it used to be computed and discarded).
    df = deepcopy(original)
    expected = DataFrame(b=b, c=c, a=a)
    @test select!(df, [:b, :c, :a]) === df
    @test df == expected
    df = deepcopy(original)
    @test select!(df, [2, 3, 1]) === df
    @test df == expected

    df = deepcopy(original)
    expected = DataFrame(c=c, a=a, b=b)
    @test select!(df, [:c, :a, :b]) === df
    @test df == expected
    df = deepcopy(original)
    @test select!(df, [3, 1, 2]) === df
    @test df == expected

    df = deepcopy(original)
    expected = DataFrame(a=a, c=c, b=b)
    @test select!(df, [:a, :c, :b]) === df
    @test df == expected
    df = deepcopy(original)
    @test select!(df, [1, 3, 2]) === df
    @test df == expected

    df = deepcopy(original)
    expected = DataFrame(b=b, a=a, c=c)
    @test select!(df, [:b, :a, :c]) === df
    @test df == expected
    df = deepcopy(original)
    @test select!(df, [2, 1, 3]) === df
    @test df == expected

    df = deepcopy(original)
    expected = DataFrame(c=c, b=b, a=a)
    @test select!(df, [:c, :b, :a]) === df
    @test df == expected
    df = deepcopy(original)
    @test select!(df, [3, 2, 1]) === df
    @test df == expected

    df = DataFrame(a=a, b=b, c=c)
    @test_throws ArgumentError select!(df, 1:4)
    @test_throws ArgumentError select!(df, [:a, :b, :c, :d])
    @test_throws ArgumentError select!(df, [1, 2, 3, 1])
    @test_throws ArgumentError select!(df, [:a, :b, :c, :a])

    # but this works
    @test select!(copy(df), [:a, :c]) == df[:, [:a, :c]]
    @test select!(copy(df), [:a, :b]) == df[:, [:a, :b]]
    @test select!(copy(df), [1, 3]) == df[:, [1, 3]]
end

# Several selectors may be passed at once; columns matched by a later selector
# that were already picked by an earlier one are not repeated.
@testset "select and select! with multiple selectors passed" begin
    df = DataFrame(rand(10, 4))
    @test select(df, :x2, :x4, All()) == select(df, :x2, :x4, :x1, :x3)
    @test select(df, :x4, Between(:x2, :x4), All()) == select(df, :x4, :x2, :x3, :x1)

    dfv = view(df, :, :)
    @test select(dfv, :x2, :x4, All()) == select(df, :x2, :x4, :x1, :x3)
    @test select(dfv, :x4, Between(:x2, :x4), All()) == select(df, :x4, :x2, :x3, :x1)
    @test select(dfv, :x2, :x4, All()) == select(dfv, :x2, :x4, :x1, :x3)
    @test select(dfv, :x4, Between(:x2, :x4), All()) == select(dfv, :x4, :x2, :x3, :x1)

    dfc = copy(df)
    @test select!(dfc, :x2, :x4, All()) == dfc
    @test dfc == select(df, :x2, :x4, :x1, :x3)
    dfc = copy(df)
    @test select!(dfc, :x4, Between(:x2, :x4), All()) == dfc
    @test dfc == select(df, :x4, :x2, :x3, :x1)

    @test select(df, Not([:x2, :x3]), All()) == select(df, :x1, :x4, :x2, :x3)
end

# `source => target` pairs rename columns; copying follows `copycols`.
@testset "select and select! renaming" begin
    df = DataFrame(rand(10, 4))
    @test select(df, :x1 => :x2, :x2 => :x1) == rename(df[:, 1:2], [:x2, :x1])
    @test select(df, :x2 => :x1, :x1 => :x2) == DataFrame(x1=df.x2, x2=df.x1)
    @test_throws ArgumentError select(df, [:x1, :x2] => :x3)
    @test_throws ArgumentError select!(df, [:x1, :x2] => :x3)
    @test_throws BoundsError select(df, 0 => :x3)
    @test_throws BoundsError select!(df, 0 => :x3)

    df2 = select(df, :x1 => :x2, :x2 => :x1)
    @test df2.x1 == df.x2
    @test df2.x1 !== df.x2
    df2 = select(df, :x1 => :x2, :x2 => :x1, copycols=false)
    @test df2.x1 === df.x2

    df2 = select(df, :x1, :x1 => :x2)
    @test df2.x1 == df2.x2
    @test df2.x1 !== df2.x2

    df2 = select(df, :x1, :x1 => :x2, copycols=false)
    @test df2.x1 === df2.x2

    x1 = df.x1
    x2 = df.x2
    select!(df, :x1 => :x2, :x2 => :x1)
    @test x1 === df.x2
    @test x2 === df.x1
    @test names(df) == [:x2, :x1]

    df = DataFrame(rand(10, 4))
    select!(df, :x1, :x1 => :x2)
    # Check the frame that was just mutated. The previous assertion looked at the
    # stale `df2` from the `copycols=false` case above and so passed trivially.
    @test names(df) == [:x1, :x2]
    @test df.x1 === df.x2

    df = DataFrame(rand(10, 4))
    df2 = select(df, :, :x1 => :x3)
    @test df2 == DataFrame(eachcol(df)[[1,2,1,4]])
    @test df2.x1 !== df2.x3
    df2 = select(df, :, :x1 => :x3, copycols=false)
    @test df2 == DataFrame(eachcol(df)[[1,2,1,4]])
    @test df2.x1 === df2.x3
    @test select(df, :x1 => :x3, :) == DataFrame(eachcol(df)[[1,1,2,4]],
                                                [:x3, :x1, :x2, :x4])
    select!(df, :, :x1 => :x3)
    @test df2 == df
    # Iterate over every column index. `ncol(df2)` alone is a scalar, so the
    # original predicate was evaluated for the last column only.
    @test all(i -> df2[!, i] === df[!, i], 1:ncol(df2))
end
many columns naming" begin + df = DataFrame(rand(10, 4)) + for fun in (+, ByRow(+)), copycols in [true, false] + @test select(df, 1 => fun, copycols=copycols) == + DataFrame(Symbol("x1_+") => df.x1) + @test select(df, 1:2 => fun, copycols=copycols) == + DataFrame(Symbol("x1_x2_+") => df.x1 + df.x2) + @test select(df, 1:3 => fun, copycols=copycols) == + DataFrame(Symbol("x1_x2_x3_+") => df.x1 + df.x2 + df.x3) + @test select(df, 1:4 => fun, copycols=copycols) == + DataFrame(Symbol("x1_x2_etc_+") => sum.(eachrow(df))) + end + for fun in (+, ByRow(+)) + dfc = copy(df) + select!(dfc, 1 => fun) + @test dfc == DataFrame(Symbol("x1_+") => df.x1) + dfc = copy(df) + select!(dfc, 1:2 => fun) + @test dfc == DataFrame(Symbol("x1_x2_+") => df.x1 + df.x2) + dfc = copy(df) + select!(dfc, 1:3 => fun) + @test dfc == DataFrame(Symbol("x1_x2_x3_+") => df.x1 + df.x2 + df.x3) + dfc = copy(df) + select!(dfc, 1:4 => fun) + @test dfc == DataFrame(Symbol("x1_x2_etc_+") => sum.(eachrow(df))) + end +end + +@testset "select and select! 
many different transforms" begin + df = DataFrame(rand(10, 4)) + + df2 = select(df, :x2, :, :x1 => ByRow(x -> x^2) => :r1, :x1 => (x -> x .^ 2) => :r2, + [:x1, :x2] => (+) => :x1, 1:2 => ByRow(/) => :x3, :x1 => :x4) + @test names(df2) == [:x2, :x1, :x3, :x4, :r1, :r2] + @test df.x2 == df2.x2 + @test df.x2 !== df2.x2 + @test df.x1 == df2.x4 + @test df.x4 !== df2.x1 + @test df2.r1 == df.x1 .^ 2 + @test df2.r1 == df2.r2 + @test df2.x1 == df.x1 + df.x2 + @test df2.x3 == df.x1 ./ df.x2 + + @test select(df, [:x1, :x1] => +) == DataFrame(Symbol("x1_x1_+") => 2*df.x1) + @test select(df, [1, 1] => +) == DataFrame(Symbol("x1_x1_+") => 2*df.x1) + + df2 = select(df, :x2, :, :x1 => ByRow(x -> x^2) => :r1, :x1 => (x -> x .^ 2) => :r2, + [:x1, :x2] => (+) => :x1, 1:2 => ByRow(/) => :x3, :x1 => :x4, copycols=false) + @test names(df2) == [:x2, :x1, :x3, :x4, :r1, :r2] + @test df.x2 === df2.x2 + @test df.x1 === df2.x4 + @test df2.r1 == df.x1 .^ 2 + @test df2.r1 == df2.r2 + @test df2.x1 == df.x1 + df.x2 + @test df2.x3 == df.x1 ./ df.x2 + + x1, x2, x3, x4 = df.x1, df.x2, df.x3, df.x4 + select!(df, :x2, :, :x1 => ByRow(x -> x^2) => :r1, :x1 => (x -> x .^ 2) => :r2, + [:x1, :x2] => (+) => :x1, 1:2 => ByRow(/) => :x3, :x1 => :x4) + @test names(df2) == [:x2, :x1, :x3, :x4, :r1, :r2] + @test x2 === df.x2 + @test x1 === df.x4 + @test df.r1 == x1 .^ 2 + @test df.r1 == df.r2 + @test df.x1 == x1 + x2 + @test df.x3 == x1 ./ x2 +end + +@testset "select and select! 
reserved return values" begin + df = DataFrame(x=1) + df2 = copy(df) + for retval in [df2, (a=1, b=2), df2[1, :], ones(2,2)] + @test_throws ArgumentError select(df, :x => x -> retval) + @test_throws ArgumentError select(df, :x => x -> retval, copycols=false) + @test_throws ArgumentError select!(df, :x => x -> retval) + @test select(df, :x => ByRow(x -> retval)) == DataFrame(x_function = [retval]) + cdf = copy(df) + select!(cdf, :x => ByRow(x -> retval)) + @test cdf == DataFrame(x_function = [retval]) + end + + for retval in [(1, 2), ones(2,2,2)] + @test select(df, :x => x -> retval) == DataFrame(x_function = [retval]) + @test select(df, :x => ByRow(x -> retval)) == DataFrame(x_function = [retval]) + cdf = copy(df) + select!(cdf, :x => x -> retval) + @test cdf == DataFrame(x_function = [retval]) + cdf = copy(df) + select!(cdf, :x => ByRow(x -> retval)) + @test cdf == DataFrame(x_function = [retval]) + end +end + +@testset "select and select! empty selection" begin + df = DataFrame(rand(10, 4)) + x = [1,2,3] + + @test select(df, r"z") == DataFrame() + @test select(df, r"z" => () -> x) == DataFrame(:function => x) + @test select(df, r"z" => () -> x)[!, 1] === x # no copy even for copycols=true + @test_throws MethodError select(df, r"z" => x -> 1) + @test_throws ArgumentError select(df, r"z" => ByRow(rand)) + + @test select(df, r"z", copycols=false) == DataFrame() + @test select(df, r"z" => () -> x, copycols=false) == DataFrame(:function => x) + @test select(df, r"z" => () -> x, copycols=false)[!, 1] === x + @test_throws MethodError select(df, r"z" => x -> 1, copycols=false) + @test_throws ArgumentError select(df, r"z" => ByRow(rand), copycols=false) + + @test_throws MethodError select!(df, r"z" => x -> 1) + @test_throws ArgumentError select!(df, r"z" => ByRow(rand)) + @test_throws ErrorException select!(df, r"z" => () -> x, copycols=false) + select!(df, r"z" => () -> x) + @test df == DataFrame(:function => x) +end + +@testset "wrong selection patterns" begin + df = 
DataFrame(rand(10, 4)) + + @test_throws ArgumentError select(df, "z") + @test_throws ArgumentError select(df, "z" => :x1) + @test_throws ArgumentError select(df, "z" => identity) + @test_throws ArgumentError select(df, "z" => identity => :x1) +end + +@testset "select and select! duplicates" begin + df = DataFrame(rand(10, 4)) + df_ref = copy(df) + + @test_throws ArgumentError select(df, :x1, :x1) + @test_throws ArgumentError select(df, :x1, :x5) + @test select(df, :x2, r"x", :x1, :) == df[:, [:x2, :x1, :x3, :x4]] + + @test_throws ArgumentError select(df, :x1, :x2 => :x1) + @test_throws ArgumentError select(df, :x3 => :x1, :x2 => :x1) + @test_throws ArgumentError select(df, :x1, :x2 => identity => :x1) + @test_throws ArgumentError select(df, :x1 => :x1, :x2 => identity => :x1) + @test_throws ArgumentError select(df, :x3 => identity => :x1, :x2 => identity => :x1) + @test select(df, [:x1], :x2 => :x1) == DataFrame(x1 = df.x2) + + @test_throws ArgumentError select!(df, :x1, :x1) + @test_throws ArgumentError select!(df, :x1, :x5) + @test df == df_ref + + select!(df, :x2, r"x", :x1, :) + @test df == df_ref[:, [:x2, :x1, :x3, :x4]] + + df = DataFrame(rand(10, 2)) + @test select(df, [:x1, :x1] => -) == DataFrame(Symbol("x1_x1_-") => zeros(10)) + select!(df, [:x1, :x1] => -) + @test df == DataFrame(Symbol("x1_x1_-") => zeros(10)) +end + +@testset "SubDataFrame selection" begin + df = DataFrame(rand(12, 5)) + sdf = view(df, 1:10, 1:4) + df_ref = copy(sdf) + + @test select(sdf, :x2, :, :x1 => ByRow(x -> x^2) => :r1, :x1 => (x -> x .^ 2) => :r2, + [:x1, :x2] => (+) => :x1, 1:2 => ByRow(/) => :x3, :x1 => :x4) == + select(df_ref, :x2, :, :x1 => ByRow(x -> x^2) => :r1, :x1 => (x -> x .^ 2) => :r2, + [:x1, :x2] => (+) => :x1, 1:2 => ByRow(/) => :x3, :x1 => :x4) + + for fun in (+, ByRow(+)) + @test select(sdf, 1 => fun) == + DataFrame(Symbol("x1_+") => sdf.x1) + @test select(sdf, 1:2 => fun) == + DataFrame(Symbol("x1_x2_+") => sdf.x1 + sdf.x2) + @test select(sdf, 1:3 => fun) == + 
DataFrame(Symbol("x1_x2_x3_+") => sdf.x1 + sdf.x2 + sdf.x3) + @test select(sdf, 1:4 => fun) == + DataFrame(Symbol("x1_x2_etc_+") => sum.(eachrow(sdf))) + end + + @test_throws ArgumentError select(sdf, :x1, :x1) + @test_throws ArgumentError select(sdf, :x1, :x1, copycols=false) + @test select(sdf, :x1, [:x1]) == sdf[:, [:x1]] + @test select(sdf, :x1, [:x1]) isa DataFrame + @test select(sdf, :x1, [:x1], copycols=false) == sdf[:, [:x1]] + @test select(sdf, :x1, [:x1], copycols=false) isa SubDataFrame + @test_throws ArgumentError select(sdf, :x1 => :r1, copycols=false) + @test_throws ArgumentError select(sdf, :x1 => identity => :r1, copycols=false) +end + +@testset "copycols special cases" begin + df = DataFrame(a=1:3, b=4:6) + c = [7, 8] + df2 = select(df, :a => (x -> c) => :c1, :b => (x -> c) => :c2) + @test df2.c1 === df2.c2 + df2 = select(df, :a => identity => :c1, :a => :c2) + @test df2.c1 !== df2.c2 + df2 = select(df, :a => identity => :c1) + @test df2.c1 !== df.a + df2 = select(df, :a => (x -> df.b) => :c1) + @test df2.c1 === df.b + df2 = select(view(df, 1:2, :), :a => parent => :c1) + @test df2.c1 !== df.a + df2 = select(view(df, 1:2, :), :a => (x -> view(x, 1:1)) => :c1) + @test df2.c1 isa Vector + df2 = select(df, :a, :a => :b, :a => identity => :c, copycols=false) + @test df2.b === df2.c === df.a + a = df.a + select!(df, :a, :a => :b, :a => identity => :c) + @test df.b === df.c === a +end + +end # module