From 38027c7ff90288c802fe64751e7ec5412e1ea973 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= Date: Sat, 5 Sep 2020 10:55:01 +0200 Subject: [PATCH] add renamecols to select/transform/combine (#2397) --- NEWS.md | 4 + src/abstractdataframe/selection.jl | 181 +++++++++++++--------- src/deprecated.jl | 38 ++--- src/groupeddataframe/splitapplycombine.jl | 148 +++++++++--------- test/grouping.jl | 29 +++- test/select.jl | 21 +++ 6 files changed, 257 insertions(+), 164 deletions(-) diff --git a/NEWS.md b/NEWS.md index c0f0a3ce74..2c650aae2a 100644 --- a/NEWS.md +++ b/NEWS.md @@ -55,6 +55,10 @@ ([#2373](https://github.com/JuliaData/DataFrames.jl/pull/2373)) * add `columnindex` for `DataFrameRow` ([#2380](https://github.com/JuliaData/DataFrames.jl/pull/2380)) +* `select`, `select!`, `transform`, `transform!` and `combine` now allow `renamecols` + keyword argument that makes it possible to avoid adding transformation function name + as a suffix in automatically generated column names + ([#2397](https://github.com/JuliaData/DataFrames.jl/pull/2397)) ## Deprecated diff --git a/src/abstractdataframe/selection.jl b/src/abstractdataframe/selection.jl index ad3043be84..0469b87fb3 100644 --- a/src/abstractdataframe/selection.jl +++ b/src/abstractdataframe/selection.jl @@ -34,7 +34,7 @@ _by_row_helper(x::Union{NamedTuple, DataFrameRow}) = # add a method to funname defined in other/utils.jl funname(row::ByRow) = funname(row.fun) -normalize_selection(idx::AbstractIndex, sel) = +normalize_selection(idx::AbstractIndex, sel, renamecols::Bool) = try idx[sel] catch e @@ -45,28 +45,33 @@ normalize_selection(idx::AbstractIndex, sel) = end end -normalize_selection(idx::AbstractIndex, sel::Pair{typeof(nrow), Symbol}) = +normalize_selection(idx::AbstractIndex, sel::Pair{typeof(nrow), Symbol}, + renamecols::Bool) = length(idx) == 0 ? 
(Int[] => (() -> 0) => last(sel)) : (1 => length => last(sel)) -normalize_selection(idx::AbstractIndex, sel::Pair{typeof(nrow), <:AbstractString}) = - normalize_selection(idx, first(sel) => Symbol(last(sel))) -normalize_selection(idx::AbstractIndex, sel::typeof(nrow)) = - normalize_selection(idx, nrow => :nrow) +normalize_selection(idx::AbstractIndex, sel::Pair{typeof(nrow), <:AbstractString}, + renamecols::Bool) = + normalize_selection(idx, first(sel) => Symbol(last(sel)), renamecols) +normalize_selection(idx::AbstractIndex, sel::typeof(nrow), renamecols::Bool) = + normalize_selection(idx, nrow => :nrow, renamecols) -function normalize_selection(idx::AbstractIndex, sel::ColumnIndex) +function normalize_selection(idx::AbstractIndex, sel::ColumnIndex, renamecols::Bool) c = idx[sel] return c => identity => _names(idx)[c] end -function normalize_selection(idx::AbstractIndex, sel::Pair{<:ColumnIndex, Symbol}) +function normalize_selection(idx::AbstractIndex, sel::Pair{<:ColumnIndex, Symbol}, + renamecols::Bool) c = idx[first(sel)] return c => identity => last(sel) end -normalize_selection(idx::AbstractIndex, sel::Pair{<:ColumnIndex, <:AbstractString}) = - normalize_selection(idx, first(sel) => Symbol(last(sel))) +normalize_selection(idx::AbstractIndex, sel::Pair{<:ColumnIndex, <:AbstractString}, + renamecols::Bool) = + normalize_selection(idx, first(sel) => Symbol(last(sel)), renamecols::Bool) function normalize_selection(idx::AbstractIndex, - sel::Pair{<:Any,<:Pair{<:Base.Callable, Symbol}}) + sel::Pair{<:Any,<:Pair{<:Base.Callable, Symbol}}, + renamecols::Bool) if first(sel) isa AsTable rawc = first(sel).cols wanttable = true @@ -97,19 +102,25 @@ function normalize_selection(idx::AbstractIndex, end normalize_selection(idx::AbstractIndex, - sel::Pair{<:Any,<:Pair{<:Base.Callable,<:AbstractString}}) = - normalize_selection(idx, first(sel) => first(last(sel)) => Symbol(last(last(sel)))) + sel::Pair{<:Any,<:Pair{<:Base.Callable,<:AbstractString}}, + renamecols::Bool) = + 
normalize_selection(idx, first(sel) => first(last(sel)) => Symbol(last(last(sel))), + renamecols::Bool) function normalize_selection(idx::AbstractIndex, - sel::Pair{<:ColumnIndex,<:Base.Callable}) + sel::Pair{<:ColumnIndex,<:Base.Callable}, renamecols::Bool) c = idx[first(sel)] fun = last(sel) - newcol = Symbol(_names(idx)[c], "_", funname(fun)) + if renamecols + newcol = Symbol(_names(idx)[c], "_", funname(fun)) + else + newcol = _names(idx)[c] + end return c => fun => newcol end function normalize_selection(idx::AbstractIndex, - sel::Pair{<:Any, <:Base.Callable}) + sel::Pair{<:Any, <:Base.Callable}, renamecols::Bool) if first(sel) isa AsTable rawc = first(sel).cols wanttable = true @@ -138,11 +149,23 @@ function normalize_selection(idx::AbstractIndex, end fun = last(sel) if length(c) > 3 - newcol = Symbol(join(@views(_names(idx)[c[1:2]]), '_'), "_etc_", funname(fun)) + prefix = join(@views(_names(idx)[c[1:2]]), '_') + if renamecols + newcol = Symbol(prefix, "_etc_", funname(fun)) + else + newcol = Symbol(prefix, "_etc") + end elseif isempty(c) + renamecols || throw(ArgumentError("when renamecols=false target column name " * + "must be passed if there are no input columns")) newcol = Symbol(funname(fun)) else - newcol = Symbol(join(view(_names(idx), c), '_'), '_', funname(fun)) + prefix = join(view(_names(idx), c), '_') + if renamecols + newcol = Symbol(prefix, '_', funname(fun)) + else + newcol = Symbol(prefix) + end end return (wanttable ? AsTable(c) : c) => fun => newcol end @@ -251,10 +274,14 @@ SELECT_ARG_RULES = Column transformation can also be specified using the short `old_column => fun` form. In this case, `new_column_name` is automatically generated as - `\$(old_column)_\$(fun)`. Up to three column names are used for multiple - input columns and they are joined using `_`; if more than three columns are - passed then the name consists of the first two names and `etc` suffix then, - e.g. `[:a,:b,:c,:d] => fun` produces the new column name `:a_b_etc_fun`. 
+ `\$(old_column)_\$(fun)` if `renamecols=true` and `\$(old_column)` if + `renamecols=false`. Up to three column names are used for multiple input + columns and they are joined using `_`; if more than three columns are passed + then the name consists of the first two names and an `etc` suffix, e.g. + `[:a,:b,:c,:d] => fun` produces the new column name `:a_b_etc_fun` if + `renamecols=true` and `:a_b_etc` if `renamecols=false`. + It is not allowed to pass `renamecols=false` if `old_column` is empty + as it would generate an empty column name. Column renaming and transformation operations can be passed wrapped in vectors (this is useful when combined with broadcasting). @@ -275,7 +302,7 @@ SELECT_ARG_RULES = """ """ - select!(df::DataFrame, args...) + select!(df::DataFrame, args...; renamecols::Bool=true) Mutate `df` in place to retain only columns specified by `args...` and return it. The result is guaranteed to have the same number of rows as `df`, except when no @@ -345,22 +372,22 @@ julia> df = DataFrame(a=1:3, b=4:6); julia> using Statistics -julia> select!(df, AsTable(:) => ByRow(mean)) +julia> select!(df, AsTable(:) => ByRow(mean), renamecols=false) 3×1 DataFrame -│ Row │ a_b_mean │ -│ │ Float64 │ -├─────┼──────────┤ -│ 1 │ 2.5 │ -│ 2 │ 3.5 │ -│ 3 │ 4.5 │ +│ Row │ a_b │ +│ │ Float64 │ +├─────┼─────────┤ +│ 1 │ 2.5 │ +│ 2 │ 3.5 │ +│ 3 │ 4.5 │ ``` """ -select!(df::DataFrame, args...) = - _replace_columns!(df, select(df, args..., copycols=false)) +select!(df::DataFrame, args...; renamecols::Bool=true) = + _replace_columns!(df, select(df, args..., copycols=false, renamecols=renamecols)) """ - transform!(df::DataFrame, args...) + transform!(df::DataFrame, args...; renamecols::Bool=true) Mutate `df` in place to add columns specified by `args...` and return it. The result is guaranteed to have the same number of rows as `df`. @@ -368,10 +395,11 @@ Equivalent to `select!(df, :, args...)`. See [`select!`](@ref) for detailed rules regarding accepted values for `args`.
""" -transform!(df::DataFrame, args...) = select!(df, :, args...) +transform!(df::DataFrame, args...; renamecols::Bool=true) = + select!(df, :, args..., renamecols=renamecols) """ - select(df::AbstractDataFrame, args...; copycols::Bool=true) + select(df::AbstractDataFrame, args...; copycols::Bool=true, renamecols::Bool=true) Create a new data frame that contains columns from `df` specified by `args` and return it. The result is guaranteed to have the same number of rows as `df`, @@ -479,22 +507,22 @@ julia> select(df, names(df) .=> sum .=> [:A, :B]) │ 2 │ 6 │ 15 │ │ 3 │ 6 │ 15 │ -julia> select(df, AsTable(:) => ByRow(mean)) +julia> select(df, AsTable(:) => ByRow(mean), renamecols=false) 3×1 DataFrame -│ Row │ a_b_mean │ -│ │ Float64 │ -├─────┼──────────┤ -│ 1 │ 2.5 │ -│ 2 │ 3.5 │ -│ 3 │ 4.5 │ +│ Row │ a_b │ +│ │ Float64 │ +├─────┼─────────┤ +│ 1 │ 2.5 │ +│ 2 │ 3.5 │ +│ 3 │ 4.5 │ ``` """ -select(df::AbstractDataFrame, args...; copycols::Bool=true) = - manipulate(df, args..., copycols=copycols, keeprows=true) +select(df::AbstractDataFrame, args...; copycols::Bool=true, renamecols::Bool=true) = + manipulate(df, args..., copycols=copycols, keeprows=true, renamecols=renamecols) """ - transform(df::AbstractDataFrame, args...; copycols::Bool=true) + transform(df::AbstractDataFrame, args...; copycols::Bool=true, renamecols::Bool=true) Create a new data frame that contains columns from `df` and adds columns specified by `args` and return it. @@ -503,12 +531,12 @@ Equivalent to `select(df, :, args..., copycols=copycols)`. See [`select`](@ref) for detailed rules regarding accepted values for `args`. """ -transform(df::AbstractDataFrame, args...; copycols::Bool=true) = - select(df, :, args..., copycols=copycols) +transform(df::AbstractDataFrame, args...; copycols::Bool=true, renamecols::Bool=true) = + select(df, :, args..., copycols=copycols, renamecols=renamecols) """ - combine(df::AbstractDataFrame, args...) 
- combine(arg, df::AbstractDataFrame) + combine(df::AbstractDataFrame, args...; renamecols::Bool=true) + combine(arg, df::AbstractDataFrame; renamecols::Bool=true) Create a new data frame that contains columns from `df` specified by `args` and return it. The result can have any number of rows that is determined by the @@ -530,42 +558,46 @@ julia> df = DataFrame(a=1:3, b=4:6) │ 2 │ 2 │ 5 │ │ 3 │ 3 │ 6 │ -julia> combine(df, :a => sum, nrow) +julia> combine(df, :a => sum, nrow, renamecols=false) 1×2 DataFrame -│ Row │ a_sum │ nrow │ +│ Row │ a │ nrow │ │ │ Int64 │ Int64 │ ├─────┼───────┼───────┤ │ 1 │ 6 │ 3 │ ``` """ -combine(df::AbstractDataFrame, args...) = - manipulate(df, args..., copycols=true, keeprows=false) +combine(df::AbstractDataFrame, args...; renamecols::Bool=true) = + manipulate(df, args..., copycols=true, keeprows=false, renamecols=renamecols) -function combine(arg, df::AbstractDataFrame) +function combine(arg, df::AbstractDataFrame; renamecols::Bool=true) if nrow(df) == 0 throw(ArgumentError("calling combine on a data frame with zero rows" * " with transformation as a first argument is " * "currently not supported")) end - return combine(arg, groupby(df, Symbol[])) + return combine(arg, groupby(df, Symbol[]), renamecols=renamecols) end -manipulate(df::DataFrame, args::AbstractVector{Int}; copycols::Bool, keeprows::Bool) = - DataFrame(_columns(df)[args], Index(_names(df)[args]), - copycols=copycols) +manipulate(df::DataFrame, args::AbstractVector{Int}; copycols::Bool, keeprows::Bool, + renamecols::Bool) = + DataFrame(_columns(df)[args], Index(_names(df)[args]), copycols=copycols) -function manipulate(df::DataFrame, c::MultiColumnIndex; copycols::Bool, keeprows::Bool) +function manipulate(df::DataFrame, c::MultiColumnIndex; copycols::Bool, keeprows::Bool, + renamecols::Bool) if c isa AbstractVector{<:Pair} - return manipulate(df, c..., copycols=copycols, keeprows=keeprows) + return manipulate(df, c..., copycols=copycols, keeprows=keeprows, + 
renamecols=renamecols) else - return manipulate(df, index(df)[c], copycols=copycols, keeprows=keeprows) + return manipulate(df, index(df)[c], copycols=copycols, keeprows=keeprows, + renamecols=renamecols) end end -manipulate(df::DataFrame, c::ColumnIndex; copycols::Bool, keeprows::Bool) = - manipulate(df, [c], copycols=copycols, keeprows=keeprows) +manipulate(df::DataFrame, c::ColumnIndex; copycols::Bool, keeprows::Bool, + renamecols::Bool) = + manipulate(df, [c], copycols=copycols, keeprows=keeprows, renamecols=renamecols) -function manipulate(df::DataFrame, cs...; copycols::Bool, keeprows::Bool) +function manipulate(df::DataFrame, cs...; copycols::Bool, keeprows::Bool, renamecols::Bool) cs_vec = [] for v in cs if v isa AbstractVector{<:Pair} @@ -574,7 +606,7 @@ function manipulate(df::DataFrame, cs...; copycols::Bool, keeprows::Bool) push!(cs_vec, v) end end - return _manipulate(df, [normalize_selection(index(df), c) for c in cs_vec], + return _manipulate(df, [normalize_selection(index(df), c, renamecols) for c in cs_vec], copycols, keeprows) end @@ -679,19 +711,22 @@ function _manipulate(df::AbstractDataFrame, normalized_cs, copycols::Bool, keepr return newdf end -manipulate(dfv::SubDataFrame, ind::ColumnIndex; copycols::Bool, keeprows::Bool) = - manipulate(dfv, [ind], copycols=copycols, keeprows=keeprows) +manipulate(dfv::SubDataFrame, ind::ColumnIndex; copycols::Bool, keeprows::Bool, + renamecols::Bool) = + manipulate(dfv, [ind], copycols=copycols, keeprows=keeprows, renamecols=renamecols) function manipulate(dfv::SubDataFrame, args::MultiColumnIndex; - copycols::Bool, keeprows::Bool) + copycols::Bool, keeprows::Bool, renamecols::Bool) if args isa AbstractVector{<:Pair} - return manipulate(dfv, args..., copycols=copycols, keeprows=keeprows) + return manipulate(dfv, args..., copycols=copycols, keeprows=keeprows, + renamecols=renamecols) else return copycols ? 
dfv[:, args] : view(dfv, :, args) end end -function manipulate(dfv::SubDataFrame, args...; copycols::Bool, keeprows::Bool) +function manipulate(dfv::SubDataFrame, args...; copycols::Bool, keeprows::Bool, + renamecols::Bool) if copycols cs_vec = [] for v in args @@ -701,8 +736,8 @@ function manipulate(dfv::SubDataFrame, args...; copycols::Bool, keeprows::Bool) push!(cs_vec, v) end end - return _manipulate(dfv, [normalize_selection(index(dfv), c) for c in cs_vec], - true, keeprows) + return _manipulate(dfv, [normalize_selection(index(dfv), c, renamecols) for c in cs_vec], + true, keeprows) else # we do not support transformations here # newinds contains only indexing; making it Vector{Any} avoids some compilation @@ -719,7 +754,7 @@ function manipulate(dfv::SubDataFrame, args...; copycols::Bool, keeprows::Bool) push!(seen_single_column, ind_idx) end else - newind = normalize_selection(index(dfv), ind) + newind = normalize_selection(index(dfv), ind, renamecols) if newind isa Pair throw(ArgumentError("transforming and renaming columns of a " * "SubDataFrame is not allowed when `copycols=false`")) diff --git a/src/deprecated.jl b/src/deprecated.jl index e4686b830c..9851dfc472 100644 --- a/src/deprecated.jl +++ b/src/deprecated.jl @@ -21,19 +21,19 @@ function CategoricalArrays.categorical(df::AbstractDataFrame, end if cols isa AbstractVector{<:Union{AbstractString, Symbol}} Base.depwarn("`categorical(df, cols)` is deprecated. " * - "Use `transform(df, cols .=> $categoricalstr .=> cols)` instead.", + "Use `transform(df, cols .=> $categoricalstr, renamecols=false)` instead.", :categorical) - return transform(df, cols .=> (x -> categorical(x, compress=compress)) .=> cols) + return transform(df, cols .=> (x -> categorical(x, compress=compress)), renamecols=false) elseif cols isa Union{AbstractString, Symbol} Base.depwarn("`categorical(df, cols)` is deprecated. 
" * - "Use `transform(df, cols => $categoricalstr => cols)` instead.", + "Use `transform(df, cols => $categoricalstr, renamecols=false)` instead.", :categorical) - return transform(df, cols => (x -> categorical(x, compress=compress)) => cols) + return transform(df, cols => (x -> categorical(x, compress=compress)), renamecols=false) else Base.depwarn("`categorical(df, cols)` is deprecated. " * - "Use `transform(df, names(df, cols) .=> $categoricalstr .=> names(df, cols))` instead.", + "Use `transform(df, names(df, cols) .=> $categoricalstr, renamecols=false)` instead.", :categorical) - return transform(df, names(df, cols) .=> (x -> categorical(x, compress=compress)) .=> names(df, cols)) + return transform(df, names(df, cols) .=> (x -> categorical(x, compress=compress)), renamecols=false) end end @@ -49,15 +49,15 @@ function CategoricalArrays.categorical(df::AbstractDataFrame, if cols === nothing cols = Union{AbstractString, Missing} Base.depwarn("`categorical(df)` is deprecated. " * - "Use `cols = names(df)[map(c -> eltype(c) <: $cols, eachcol(df))]; transform(df, cols .=> $categoricalstr .=> cols)` instead.", + "Use `cols = names(df)[map(c -> eltype(c) <: $cols, eachcol(df))]; transform(df, cols .=> $categoricalstr, renamecols=false)` instead.", :categorical) else Base.depwarn("`categorical(df, T)` is deprecated. 
" * - "Use `cols = names(df)[map(c -> eltype(c) <: T, eachcol(df))]; transform(df, cols .=> $categoricalstr .=> cols)` instead.", + "Use `cols = names(df)[map(c -> eltype(c) <: T, eachcol(df))]; transform(df, cols .=> $categoricalstr, renamecols=false)` instead.", :categorical) end colsstr = names(df)[map(c -> eltype(c) <: cols, eachcol(df))] - return transform(df, colsstr .=> (x -> categorical(x, compress=compress)) .=> colsstr) + return transform(df, colsstr .=> (x -> categorical(x, compress=compress)), renamecols=false) end function categorical!(df::DataFrame, cols::Union{ColumnIndex, MultiColumnIndex}; @@ -70,19 +70,19 @@ function categorical!(df::DataFrame, cols::Union{ColumnIndex, MultiColumnIndex}; end if cols isa AbstractVector{<:Union{AbstractString, Symbol}} Base.depwarn("`categorical!(df, cols)` is deprecated. " * - "Use `transform!(df, cols .=> $categoricalstr .=> cols)` instead.", + "Use `transform!(df, cols .=> $categoricalstr, renamecols=false)` instead.", :categorical!) - return transform!(df, cols .=> (x -> categorical(x, compress=compress)) .=> cols) + return transform!(df, cols .=> (x -> categorical(x, compress=compress)), renamecols=false) elseif cols isa Union{AbstractString, Symbol} Base.depwarn("`categorical!(df, cols)` is deprecated. " * - "Use `transform!(df, cols => $categoricalstr => cols)` instead.", + "Use `transform!(df, cols => $categoricalstr, renamecols=false)` instead.", :categorical!) - return transform!(df, cols => (x -> categorical(x, compress=compress)) => cols) + return transform!(df, cols => (x -> categorical(x, compress=compress)), renamecols=false) else Base.depwarn("`categorical!(df, cols)` is deprecated. " * - "Use `transform!(df, names(df, cols) .=> $categoricalstr .=> names(df, cols))` instead.", + "Use `transform!(df, names(df, cols) .=> $categoricalstr, renamecols=false)` instead.", :categorical!) 
- return transform!(df, names(df, cols) .=> (x -> categorical(x, compress=compress)) .=> names(df, cols)) + return transform!(df, names(df, cols) .=> (x -> categorical(x, compress=compress)), renamecols=false) end end @@ -97,13 +97,13 @@ function categorical!(df::DataFrame, cols::Union{Type, Nothing}=nothing; if cols === nothing cols = Union{AbstractString, Missing} Base.depwarn("`categorical!(df)` is deprecated. " * - "Use `cols = names(df)[map(c -> eltype(c) <: $cols, eachcol(df))]; transform!(df, cols .=> $categoricalstr .=> cols)` instead.", + "Use `cols = names(df)[map(c -> eltype(c) <: $cols, eachcol(df))]; transform!(df, cols .=> $categoricalstr, renamecols=false)` instead.", :categorical!) else Base.depwarn("`categorical!(df, T)` is deprecated. " * - "Use `cols = names(df)[map(c -> eltype(c) <: T, eachcol(df))]; transform!(df, cols .=> $categoricalstr .=> cols)` instead.", + "Use `cols = names(df)[map(c -> eltype(c) <: T, eachcol(df))]; transform!(df, cols .=> $categoricalstr, renamecols=false)` instead.", :categorical!) end colsstr = names(df)[map(c -> eltype(c) <: cols, eachcol(df))] - return transform!(df, colsstr .=> (x -> categorical(x, compress=compress)) .=> colsstr) -end \ No newline at end of file + return transform!(df, colsstr .=> (x -> categorical(x, compress=compress)), renamecols=false) +end diff --git a/src/groupeddataframe/splitapplycombine.jl b/src/groupeddataframe/splitapplycombine.jl index f48ec1ec6d..d7b1c23d86 100644 --- a/src/groupeddataframe/splitapplycombine.jl +++ b/src/groupeddataframe/splitapplycombine.jl @@ -227,13 +227,15 @@ const F_ARGUMENT_RULES = * Column transformation operations using the `Pair` notation that is described below and vectors of such pairs. - Transformations allowed using `Pair`s follow the rules specified - for [`select`](@ref) and have the form `source_cols => fun`, - `source_cols => fun => target_col`, or `source_col => target_col`. 
- Function `fun` is passed `SubArray` views as positional arguments for each column - specified to be selected, or a `NamedTuple` containing these `SubArray`s if - `source_cols` is an `AsTable` selector. It can return a vector or a single value - (defined precisely below). + Transformations allowed using `Pair`s follow the rules specified for + [`select`](@ref) and have the form `source_cols => fun`, `source_cols => fun + => target_col`, or `source_col => target_col`. Function `fun` is passed + `SubArray` views as positional arguments for each column specified to be + selected, or a `NamedTuple` containing these `SubArray`s if `source_cols` is + an `AsTable` selector. It can return a vector or a single value (defined + precisely below). If automatic generation of target column + name is required it respects the `renamecols` keyword argument following the + rules described in [`select`](@ref). As a special case `nrow` or `nrow => target_col` can be passed without specifying input columns to efficiently calculate number of rows in each group. @@ -272,10 +274,12 @@ const KWARG_PROCESSING_RULES = """ """ - combine(gd::GroupedDataFrame, args...; keepkeys::Bool=true, ungroup::Bool=true) + combine(gd::GroupedDataFrame, args...; keepkeys::Bool=true, ungroup::Bool=true, + renamecols::Bool=true) combine(fun::Union{Function, Type}, gd::GroupedDataFrame; - keepkeys::Bool=true, ungroup::Bool=true) - combine(pair::Pair, gd::GroupedDataFrame; keepkeys::Bool=true, ungroup::Bool=true) + keepkeys::Bool=true, ungroup::Bool=true, renamecols::Bool=true) + combine(pair::Pair, gd::GroupedDataFrame; keepkeys::Bool=true, ungroup::Bool=true, + renamecols::Bool=true) Apply operations to each group in a [`GroupedDataFrame`](@ref) and return the combined result as a `DataFrame` if `ungroup=true` or `GroupedDataFrame` if `ungroup=false`. 
@@ -433,33 +437,34 @@ julia> combine(gd, AsTable(:) => Ref) │ 3 │ 3 │ (a = [3, 3], b = [2, 2], c = [3, 7]) │ │ 4 │ 4 │ (a = [4, 4], b = [1, 1], c = [4, 8]) │ -julia> combine(gd, :, AsTable(Not(:a)) => sum) +julia> combine(gd, :, AsTable(Not(:a)) => sum, renamecols=false) 8×4 DataFrame -│ Row │ a │ b │ c │ b_c_sum │ -│ │ Int64 │ Int64 │ Int64 │ Int64 │ -├─────┼───────┼───────┼───────┼─────────┤ -│ 1 │ 1 │ 2 │ 1 │ 3 │ -│ 2 │ 1 │ 2 │ 5 │ 7 │ -│ 3 │ 2 │ 1 │ 2 │ 3 │ -│ 4 │ 2 │ 1 │ 6 │ 7 │ -│ 5 │ 3 │ 2 │ 3 │ 5 │ -│ 6 │ 3 │ 2 │ 7 │ 9 │ -│ 7 │ 4 │ 1 │ 4 │ 5 │ -│ 8 │ 4 │ 1 │ 8 │ 9 │ +│ Row │ a │ b │ c │ b_c │ +│ │ Int64 │ Int64 │ Int64 │ Int64 │ +├─────┼───────┼───────┼───────┼───────┤ +│ 1 │ 1 │ 2 │ 1 │ 3 │ +│ 2 │ 1 │ 2 │ 5 │ 7 │ +│ 3 │ 2 │ 1 │ 2 │ 3 │ +│ 4 │ 2 │ 1 │ 6 │ 7 │ +│ 5 │ 3 │ 2 │ 3 │ 5 │ +│ 6 │ 3 │ 2 │ 7 │ 9 │ +│ 7 │ 4 │ 1 │ 4 │ 5 │ +│ 8 │ 4 │ 1 │ 8 │ 9 │ ``` """ function combine(f::Base.Callable, gd::GroupedDataFrame; - keepkeys::Bool=true, ungroup::Bool=true) + keepkeys::Bool=true, ungroup::Bool=true, renamecols::Bool=true) return combine_helper(f, gd, keepkeys=keepkeys, ungroup=ungroup, - copycols=true, keeprows=false) + copycols=true, keeprows=false, renamecols=renamecols) end combine(f::typeof(nrow), gd::GroupedDataFrame; - keepkeys::Bool=true, ungroup::Bool=true) = - combine(gd, [nrow => :nrow], keepkeys=keepkeys, ungroup=ungroup) + keepkeys::Bool=true, ungroup::Bool=true, renamecols::Bool=true) = + combine(gd, [nrow => :nrow], keepkeys=keepkeys, ungroup=ungroup, + renamecols=renamecols) function combine(p::Pair, gd::GroupedDataFrame; - keepkeys::Bool=true, ungroup::Bool=true) + keepkeys::Bool=true, ungroup::Bool=true, renamecols::Bool=true) # move handling of aggregate to specialized combine p_from, p_to = p @@ -467,7 +472,7 @@ function combine(p::Pair, gd::GroupedDataFrame; # by moving to combine(::GroupedDataFrame, ::AbstractVector) method # note that even if length(gd) == 0 we can do this step if isagg(p_from => (p_to isa Pair ? 
first(p_to) : p_to), gd) || p_from === nrow - return combine(gd, [p], keepkeys=keepkeys, ungroup=ungroup) + return combine(gd, [p], keepkeys=keepkeys, ungroup=ungroup, renamecols=renamecols) end if p_from isa Tuple @@ -479,19 +484,20 @@ function combine(p::Pair, gd::GroupedDataFrame; cs = p_from end return combine_helper(cs => p_to, gd, keepkeys=keepkeys, ungroup=ungroup, - copycols=true, keeprows=false) + copycols=true, keeprows=false, renamecols=renamecols) end combine(gd::GroupedDataFrame, cs::Union{Pair, typeof(nrow), ColumnIndex, MultiColumnIndex}...; - keepkeys::Bool=true, ungroup::Bool=true) = + keepkeys::Bool=true, ungroup::Bool=true, renamecols::Bool=true) = _combine_prepare(gd, cs..., keepkeys=keepkeys, ungroup=ungroup, - copycols=true, keeprows=false) + copycols=true, keeprows=false, renamecols=renamecols) function _combine_prepare(gd::GroupedDataFrame, @nospecialize(cs::Union{Pair, typeof(nrow), - ColumnIndex, MultiColumnIndex}...); - keepkeys::Bool, ungroup::Bool, copycols::Bool, keeprows::Bool) + ColumnIndex, MultiColumnIndex}...); + keepkeys::Bool, ungroup::Bool, copycols::Bool, + keeprows::Bool, renamecols::Bool) cs_vec = [] for p in cs if p === nrow @@ -513,7 +519,7 @@ function _combine_prepare(gd::GroupedDataFrame, end end end - cs_norm_pre = [normalize_selection(index(parent(gd)), c) for c in cs_vec] + cs_norm_pre = [normalize_selection(index(parent(gd)), c, renamecols) for c in cs_vec] seen_cols = Set{Symbol}() process_vectors = false for v in cs_norm_pre @@ -564,7 +570,7 @@ function _combine_prepare(gd::GroupedDataFrame, f = Pair[first(x) => first(last(x)) for x in cs_norm] nms = Symbol[last(last(x)) for x in cs_norm] return combine_helper(f, gd, nms, keepkeys=keepkeys, ungroup=ungroup, - copycols=copycols, keeprows=keeprows) + copycols=copycols, keeprows=keeprows, renamecols=renamecols) end function gen_groups(idx::Vector{Int}) @@ -584,11 +590,11 @@ end function combine_helper(f, gd::GroupedDataFrame, 
nms::Union{AbstractVector{Symbol},Nothing}=nothing; keepkeys::Bool, ungroup::Bool, - copycols::Bool, keeprows::Bool) + copycols::Bool, keeprows::Bool, renamecols::Bool) if !ungroup && !keepkeys throw(ArgumentError("keepkeys=false when ungroup=false is not allowed")) end - idx, valscat = _combine(f, gd, nms, copycols, keeprows) + idx, valscat = _combine(f, gd, nms, copycols, keeprows, renamecols) !keepkeys && ungroup && return valscat keys = groupcols(gd) for key in keys @@ -1137,7 +1143,7 @@ end function _combine(f::AbstractVector{<:Pair}, gd::GroupedDataFrame, nms::AbstractVector{Symbol}, - copycols::Bool, keeprows::Bool) + copycols::Bool, keeprows::Bool, renamecols::Bool) # here f should be normalized and in a form of source_cols => fun @assert all(x -> first(x) isa Union{Int, AbstractVector{Int}, AsTable}, f) @assert all(x -> last(x) isa Base.Callable, f) @@ -1277,7 +1283,7 @@ function _combine(f::AbstractVector{<:Pair}, end function _combine(fun::Base.Callable, gd::GroupedDataFrame, ::Nothing, - copycols::Bool, keeprows::Bool) + copycols::Bool, keeprows::Bool, renamecols::Bool) @assert copycols && !keeprows # use `similar` as `gd` might have been subsetted firstres = length(gd) > 0 ? 
fun(gd[1]) : fun(similar(parent(gd), 0)) @@ -1287,11 +1293,11 @@ function _combine(fun::Base.Callable, gd::GroupedDataFrame, ::Nothing, end function _combine(p::Pair, gd::GroupedDataFrame, ::Nothing, - copycols::Bool, keeprows::Bool) + copycols::Bool, keeprows::Bool, renamecols::Bool) # here p should not be normalized as we allow tabular return value from fun # map and combine should not dispatch here if p is isagg @assert copycols && !keeprows - source_cols, (fun, out_col) = normalize_selection(index(parent(gd)), p) + source_cols, (fun, out_col) = normalize_selection(index(parent(gd)), p, renamecols) parentdf = parent(gd) if source_cols isa Int incols = (parent(gd)[!, source_cols],) @@ -1553,8 +1559,8 @@ function _combine_tables_with_first!(first::Union{AbstractDataFrame, end """ - select(gd::GroupedDataFrame, args...; - copycols::Bool=true, keepkeys::Bool=true, ungroup::Bool=true) + select(gd::GroupedDataFrame, args...; copycols::Bool=true, keepkeys::Bool=true, + ungroup::Bool=true, renamecols::Bool=true) Apply `args` to `gd` following the rules described in [`combine`](@ref). 
@@ -1686,42 +1692,42 @@ julia> select(gd, :b, :c => sum) # passing columns and broadcasting │ 7 │ 1 │ 2 │ 19 │ │ 8 │ 2 │ 1 │ 17 │ -julia> select(gd, :, AsTable(Not(:a)) => sum) +julia> select(gd, :, AsTable(Not(:a)) => sum, renamecols=false) 8×4 DataFrame -│ Row │ a │ b │ c │ b_c_sum │ -│ │ Int64 │ Int64 │ Int64 │ Int64 │ -├─────┼───────┼───────┼───────┼─────────┤ -│ 1 │ 1 │ 2 │ 1 │ 3 │ -│ 2 │ 1 │ 1 │ 2 │ 3 │ -│ 3 │ 1 │ 2 │ 3 │ 5 │ -│ 4 │ 2 │ 1 │ 4 │ 5 │ -│ 5 │ 2 │ 2 │ 5 │ 7 │ -│ 6 │ 1 │ 1 │ 6 │ 7 │ -│ 7 │ 1 │ 2 │ 7 │ 9 │ -│ 8 │ 2 │ 1 │ 8 │ 9 │ +│ Row │ a │ b │ c │ b_c │ +│ │ Int64 │ Int64 │ Int64 │ Int64 │ +├─────┼───────┼───────┼───────┼───────┤ +│ 1 │ 1 │ 2 │ 1 │ 3 │ +│ 2 │ 1 │ 1 │ 2 │ 3 │ +│ 3 │ 1 │ 2 │ 3 │ 5 │ +│ 4 │ 2 │ 1 │ 4 │ 5 │ +│ 5 │ 2 │ 2 │ 5 │ 7 │ +│ 6 │ 1 │ 1 │ 6 │ 7 │ +│ 7 │ 1 │ 2 │ 7 │ 9 │ +│ 8 │ 2 │ 1 │ 8 │ 9 │ ``` """ -select(gd::GroupedDataFrame, args...; - copycols::Bool=true, keepkeys::Bool=true, ungroup::Bool=true) = +select(gd::GroupedDataFrame, args...; copycols::Bool=true, keepkeys::Bool=true, + ungroup::Bool=true, renamecols::Bool=true) = _combine_prepare(gd, args..., copycols=copycols, keepkeys=keepkeys, - ungroup=ungroup, keeprows=true) + ungroup=ungroup, keeprows=true, renamecols=renamecols) """ transform(gd::GroupedDataFrame, args...; copycols::Bool=true, keepkeys::Bool=true, ungroup::Bool=true) An equivalent of -`select(gd, :, args..., copycols=copycols, keepkeys=keepkeys, ungroup=ungroup)` +`select(gd, :, args..., copycols=copycols, keepkeys=keepkeys, ungroup=ungroup, renamecols=renamecols)` but keeps the columns of `parent(gd)` in their original order. 
# See also [`groupby`](@ref), [`combine`](@ref), [`select`](@ref), [`select!`](@ref), [`transform!`](@ref) """ -function transform(gd::GroupedDataFrame, args...; - copycols::Bool=true, keepkeys::Bool=true, ungroup::Bool=true) +function transform(gd::GroupedDataFrame, args...; copycols::Bool=true, + keepkeys::Bool=true, ungroup::Bool=true, renamecols::Bool=true) res = select(gd, :, args..., copycols=copycols, keepkeys=keepkeys, - ungroup=ungroup) + ungroup=ungroup, renamecols=renamecols) # res can be a GroupedDataFrame based on DataFrame or a DataFrame, # so parent always gives a data frame select!(parent(res), propertynames(parent(gd)), :) @@ -1729,10 +1735,10 @@ function transform(gd::GroupedDataFrame, args...; end """ - select!(gd::GroupedDataFrame{DataFrame}, args...; ungroup::Bool=true) + select!(gd::GroupedDataFrame{DataFrame}, args...; ungroup::Bool=true, renamecols::Bool=true) An equivalent of -`select(gd, args..., copycols=false, keepkeys=true, ungroup=ungroup)` +`select(gd, args..., copycols=false, keepkeys=true, ungroup=ungroup, renamecols=renamecols)` but updates `parent(gd)` in place. `gd` is updated to reflect the new rows of its updated parent. @@ -1743,18 +1749,19 @@ using the same parent data frame they might get corrupt. [`groupby`](@ref), [`combine`](@ref), [`select`](@ref), [`transform`](@ref), [`transform!`](@ref) """ -function select!(gd::GroupedDataFrame{DataFrame}, args...; ungroup::Bool=true) - newdf = select(gd, args..., copycols=false) +function select!(gd::GroupedDataFrame{DataFrame}, args...; + ungroup::Bool=true, renamecols::Bool=true) + newdf = select(gd, args..., copycols=false, renamecols=renamecols) df = parent(gd) _replace_columns!(df, newdf) return ungroup ? 
df : gd end """ - transform!(gd::GroupedDataFrame{DataFrame}, args...; ungroup::Bool=true) + transform!(gd::GroupedDataFrame{DataFrame}, args...; ungroup::Bool=true, renamecols::Bool=true) An equivalent of -`transform(gd, args..., copycols=false, keepkeys=true, ungroup=ungroup)` +`transform(gd, args..., copycols=false, keepkeys=true, ungroup=ungroup, renamecols=renamecols)` but updates `parent(gd)` in place and keeps the columns of `parent(gd)` in their original order. @@ -1762,8 +1769,9 @@ and keeps the columns of `parent(gd)` in their original order. [`groupby`](@ref), [`combine`](@ref), [`select`](@ref), [`select!`](@ref), [`transform`](@ref) """ -function transform!(gd::GroupedDataFrame{DataFrame}, args...; ungroup::Bool=true) - newdf = select(gd, :, args..., copycols=false) +function transform!(gd::GroupedDataFrame{DataFrame}, args...; + ungroup::Bool=true, renamecols::Bool=true) + newdf = select(gd, :, args..., copycols=false, renamecols=renamecols) df = parent(gd) select!(newdf, propertynames(df), :) _replace_columns!(df, newdf) diff --git a/test/grouping.jl b/test/grouping.jl index 336c84eb82..c8839c9fa9 100644 --- a/test/grouping.jl +++ b/test/grouping.jl @@ -1379,7 +1379,7 @@ end @test gd[Dict([Test.GenericString(String(k)) => v for (k, v) in pairs(key)]...)] ≅ gd[i] # Out of order Dict @test gd[Dict([k => v for (k, v) in Iterators.reverse(pairs(key))]...)] ≅ gd[i] - # AbstractDict + # AbstractDict @test gd[Test.GenericDict(Dict(key))] ≅ gd[i] end @@ -1395,7 +1395,7 @@ end @test get(gd, Dict(:a => :A, :b => 1), nothing) ≅ gd[1] @test get(gd, Dict(:b => 1, :a => :A), nothing) ≅ gd[1] @test get(gd, Dict(:a => :A, :b => 3), nothing) == nothing - + # Wrong values @test_throws KeyError gd[(a=:A, b=3)] @test_throws KeyError gd[(:A, 3)] @@ -2839,4 +2839,29 @@ end end end +@testset "renamecols=false tests" begin + df = DataFrame(a=1:3, b=4:6, c=7:9, d=10:12) + gdf = groupby_checked(df, :a) + + @test select(gdf, :a => +, [:a, :b] => +, All() => +, 
renamecols=false) == + DataFrame(a=1:3, a_b=5:2:9, a_b_etc=22:4:30) + @test_throws ArgumentError select(gdf, [] => () -> 10, renamecols=false) + @test transform(gdf, :a => +, [:a, :b] => +, All() => +, renamecols=false) == + DataFrame(a=1:3, b=4:6, c=7:9, d=10:12, a_b=5:2:9, a_b_etc=22:4:30) + @test combine(gdf, :a => +, [:a, :b] => +, All() => +, renamecols=false) == + DataFrame(a=1:3, a_b=5:2:9, a_b_etc=22:4:30) + @test combine([:a, :b] => +, gdf, renamecols=false) == DataFrame(a=1:3, a_b=5:2:9) + @test combine(identity, gdf, renamecols=false) == df + + df = DataFrame(a=1:3, b=4:6, c=7:9, d=10:12) + gdf = groupby_checked(df, :a) + @test select!(gdf, :a => +, [:a, :b] => +, All() => +, renamecols=false) == df + @test df == DataFrame(a=1:3, a_b=5:2:9, a_b_etc=22:4:30) + + df = DataFrame(a=1:3, b=4:6, c=7:9, d=10:12) + gdf = groupby_checked(df, :a) + @test transform!(gdf, :a => +, [:a, :b] => +, All() => +, renamecols=false) == df + @test df == DataFrame(a=1:3, b=4:6, c=7:9, d=10:12, a_b=5:2:9, a_b_etc=22:4:30) +end + end # module diff --git a/test/select.jl b/test/select.jl index 99dc122322..f707a9e950 100644 --- a/test/select.jl +++ b/test/select.jl @@ -1285,4 +1285,25 @@ end @test_throws MethodError transform!(dfv, 1) end +@testset "renamecols=false tests" begin + df = DataFrame(a=1:3, b=4:6, c=7:9, d=10:12) + @test select(df, :a => +, [:a, :b] => +, All() => +, renamecols=false) == + DataFrame(a=1:3, a_b=5:2:9, a_b_etc=22:4:30) + @test_throws ArgumentError select(df, [] => () -> 10, renamecols=false) + @test transform(df, :a => +, [:a, :b] => +, All() => +, renamecols=false) == + DataFrame(a=1:3, b=4:6, c=7:9, d=10:12, a_b=5:2:9, a_b_etc=22:4:30) + @test combine(df, :a => +, [:a, :b] => +, All() => +, renamecols=false) == + DataFrame(a=1:3, a_b=5:2:9, a_b_etc=22:4:30) + @test combine([:a, :b] => +, df, renamecols=false) == DataFrame(a_b=5:2:9) + @test combine(identity, df, renamecols=false) == df + + df = DataFrame(a=1:3, b=4:6, c=7:9, d=10:12) + @test 
select!(df, :a => +, [:a, :b] => +, All() => +, renamecols=false) == df + @test df == DataFrame(a=1:3, a_b=5:2:9, a_b_etc=22:4:30) + + df = DataFrame(a=1:3, b=4:6, c=7:9, d=10:12) + @test transform!(df, :a => +, [:a, :b] => +, All() => +, renamecols=false) == df + @test df == DataFrame(a=1:3, b=4:6, c=7:9, d=10:12, a_b=5:2:9, a_b_etc=22:4:30) +end + end # module