Skip to content

Commit

Permalink
add renamecols to select/transform/combine (#2397)
Browse files Browse the repository at this point in the history
  • Loading branch information
bkamins authored Sep 5, 2020
1 parent 3d0b2a1 commit 38027c7
Show file tree
Hide file tree
Showing 6 changed files with 257 additions and 164 deletions.
4 changes: 4 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,10 @@
([#2373](https://github.com/JuliaData/DataFrames.jl/pull/2373))
* add `columnindex` for `DataFrameRow`
([#2380](https://github.com/JuliaData/DataFrames.jl/pull/2380))
* `select`, `select!`, `transform`, `transform!` and `combine` now accept a `renamecols`
keyword argument that makes it possible to avoid adding the transformation function name
as a suffix in automatically generated column names
([#2397](https://github.com/JuliaData/DataFrames.jl/pull/2397))

## Deprecated

Expand Down
181 changes: 108 additions & 73 deletions src/abstractdataframe/selection.jl
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ _by_row_helper(x::Union{NamedTuple, DataFrameRow}) =
# add a method to funname defined in other/utils.jl
funname(row::ByRow) = funname(row.fun)

normalize_selection(idx::AbstractIndex, sel) =
normalize_selection(idx::AbstractIndex, sel, renamecols::Bool) =
try
idx[sel]
catch e
Expand All @@ -45,28 +45,33 @@ normalize_selection(idx::AbstractIndex, sel) =
end
end

normalize_selection(idx::AbstractIndex, sel::Pair{typeof(nrow), Symbol}) =
# Normalize a `nrow => :target` request into the `source => function => target` form.
# With an empty index there is no column to measure, so a zero-argument function
# returning 0 is used over no source columns; otherwise `length` is applied to column 1.
# `renamecols` is unused here because the target name is given explicitly.
function normalize_selection(idx::AbstractIndex, sel::Pair{typeof(nrow), Symbol},
                             renamecols::Bool)
    target = last(sel)
    if length(idx) == 0
        return Int[] => (() -> 0) => target
    end
    return 1 => length => target
end
normalize_selection(idx::AbstractIndex, sel::Pair{typeof(nrow), <:AbstractString}) =
normalize_selection(idx, first(sel) => Symbol(last(sel)))
normalize_selection(idx::AbstractIndex, sel::typeof(nrow)) =
normalize_selection(idx, nrow => :nrow)
# Normalize `nrow => "target"`: convert the string target name to a `Symbol` and
# delegate to the method that handles a `Symbol` target.
function normalize_selection(idx::AbstractIndex,
                             sel::Pair{typeof(nrow), <:AbstractString},
                             renamecols::Bool)
    src, target = sel
    return normalize_selection(idx, src => Symbol(target), renamecols)
end
# Normalize a bare `nrow` request: the result column gets the default name `:nrow`.
function normalize_selection(idx::AbstractIndex, sel::typeof(nrow), renamecols::Bool)
    # `sel` is the `nrow` function itself, as `typeof(nrow)` has a single instance
    return normalize_selection(idx, sel => :nrow, renamecols)
end

function normalize_selection(idx::AbstractIndex, sel::ColumnIndex)
# Normalize a plain column reference into `position => identity => name`,
# i.e. the column is passed through unchanged under its current name.
# `renamecols` does not affect this method.
function normalize_selection(idx::AbstractIndex, sel::ColumnIndex, renamecols::Bool)
    pos = idx[sel]
    colname = _names(idx)[pos]
    return pos => (identity => colname)
end

function normalize_selection(idx::AbstractIndex, sel::Pair{<:ColumnIndex, Symbol})
# Normalize a rename request `column => :newname` into
# `position => identity => :newname`. `renamecols` does not affect this method
# because the target name is given explicitly.
function normalize_selection(idx::AbstractIndex, sel::Pair{<:ColumnIndex, Symbol},
                             renamecols::Bool)
    src, target = sel
    pos = idx[src]
    return pos => (identity => target)
end

normalize_selection(idx::AbstractIndex, sel::Pair{<:ColumnIndex, <:AbstractString}) =
normalize_selection(idx, first(sel) => Symbol(last(sel)))
# Normalize a rename request `column => "newname"`: convert the string target name
# to a `Symbol` and delegate to the method that handles a `Symbol` target.
# `renamecols` is forwarded as-is; the signature already guarantees it is a `Bool`,
# so no type assertion is needed at the call site.
function normalize_selection(idx::AbstractIndex,
                             sel::Pair{<:ColumnIndex, <:AbstractString},
                             renamecols::Bool)
    return normalize_selection(idx, first(sel) => Symbol(last(sel)), renamecols)
end

function normalize_selection(idx::AbstractIndex,
sel::Pair{<:Any,<:Pair{<:Base.Callable, Symbol}})
sel::Pair{<:Any,<:Pair{<:Base.Callable, Symbol}},
renamecols::Bool)
if first(sel) isa AsTable
rawc = first(sel).cols
wanttable = true
Expand Down Expand Up @@ -97,19 +102,25 @@ function normalize_selection(idx::AbstractIndex,
end

normalize_selection(idx::AbstractIndex,
sel::Pair{<:Any,<:Pair{<:Base.Callable,<:AbstractString}}) =
normalize_selection(idx, first(sel) => first(last(sel)) => Symbol(last(last(sel))))
sel::Pair{<:Any,<:Pair{<:Base.Callable,<:AbstractString}},
renamecols::Bool) =
normalize_selection(idx, first(sel) => first(last(sel)) => Symbol(last(last(sel))),
renamecols::Bool)

function normalize_selection(idx::AbstractIndex,
sel::Pair{<:ColumnIndex,<:Base.Callable})
sel::Pair{<:ColumnIndex,<:Base.Callable}, renamecols::Bool)
c = idx[first(sel)]
fun = last(sel)
newcol = Symbol(_names(idx)[c], "_", funname(fun))
if renamecols
newcol = Symbol(_names(idx)[c], "_", funname(fun))
else
newcol = _names(idx)[c]
end
return c => fun => newcol
end

function normalize_selection(idx::AbstractIndex,
sel::Pair{<:Any, <:Base.Callable})
sel::Pair{<:Any, <:Base.Callable}, renamecols::Bool)
if first(sel) isa AsTable
rawc = first(sel).cols
wanttable = true
Expand Down Expand Up @@ -138,11 +149,23 @@ function normalize_selection(idx::AbstractIndex,
end
fun = last(sel)
if length(c) > 3
newcol = Symbol(join(@views(_names(idx)[c[1:2]]), '_'), "_etc_", funname(fun))
prefix = join(@views(_names(idx)[c[1:2]]), '_')
if renamecols
newcol = Symbol(prefix, "_etc_", funname(fun))
else
newcol = Symbol(prefix, "_etc")
end
elseif isempty(c)
renamecols || throw(ArgumentError("when renamecols=false target column name " *
"must be passed if there are no input columns"))
newcol = Symbol(funname(fun))
else
newcol = Symbol(join(view(_names(idx), c), '_'), '_', funname(fun))
prefix = join(view(_names(idx), c), '_')
if renamecols
newcol = Symbol(prefix, '_', funname(fun))
else
newcol = Symbol(prefix)
end
end
return (wanttable ? AsTable(c) : c) => fun => newcol
end
Expand Down Expand Up @@ -251,10 +274,14 @@ SELECT_ARG_RULES =
Column transformation can also be specified using the short `old_column =>
fun` form. In this case, `new_column_name` is automatically generated as
`\$(old_column)_\$(fun)`. Up to three column names are used for multiple
input columns and they are joined using `_`; if more than three columns are
passed then the name consists of the first two names and `etc` suffix then,
e.g. `[:a,:b,:c,:d] => fun` produces the new column name `:a_b_etc_fun`.
`\$(old_column)_\$(fun)` if `renamecols=true` and `\$(old_column)` if
`renamecols=false`. Up to three column names are used for multiple input
columns and they are joined using `_`; if more than three columns are passed
then the name consists of the first two names followed by an `etc` suffix, e.g.
`[:a,:b,:c,:d] => fun` produces the new column name `:a_b_etc_fun` if
`renamecols=true` and `:a_b_etc` if `renamecols=false`.
It is not allowed to pass `renamecols=false` if `old_column` is empty
as it would generate an empty column name.
Column renaming and transformation operations can be passed wrapped in
vectors (this is useful when combined with broadcasting).
Expand All @@ -275,7 +302,7 @@ SELECT_ARG_RULES =
"""

"""
select!(df::DataFrame, args...)
select!(df::DataFrame, args...; renamecols::Bool=true)
Mutate `df` in place to retain only columns specified by `args...` and return it.
The result is guaranteed to have the same number of rows as `df`, except when no
Expand Down Expand Up @@ -345,33 +372,34 @@ julia> df = DataFrame(a=1:3, b=4:6);
julia> using Statistics
julia> select!(df, AsTable(:) => ByRow(mean))
julia> select!(df, AsTable(:) => ByRow(mean), renamecols=false)
3×1 DataFrame
│ Row │ a_b_mean
│ │ Float64
├─────┼─────────
│ 1 │ 2.5
│ 2 │ 3.5
│ 3 │ 4.5
│ Row │ a_b
│ │ Float64 │
├─────┼─────────┤
│ 1 │ 2.5 │
│ 2 │ 3.5 │
│ 3 │ 4.5 │
```
"""
select!(df::DataFrame, args...) =
_replace_columns!(df, select(df, args..., copycols=false))
function select!(df::DataFrame, args...; renamecols::Bool=true)
    # Build the result without copying columns, then swap it into `df` in place.
    selected = select(df, args..., copycols=false, renamecols=renamecols)
    return _replace_columns!(df, selected)
end

"""
transform!(df::DataFrame, args...)
transform!(df::DataFrame, args...; renamecols::Bool=true)
Mutate `df` in place to add columns specified by `args...` and return it.
The result is guaranteed to have the same number of rows as `df`.
Equivalent to `select!(df, :, args..., renamecols=renamecols)`.
See [`select!`](@ref) for detailed rules regarding accepted values for `args`.
"""
transform!(df::DataFrame, args...) = select!(df, :, args...)
function transform!(df::DataFrame, args...; renamecols::Bool=true)
    # The leading `:` selects every existing column before `args` are applied.
    return select!(df, :, args..., renamecols=renamecols)
end

"""
select(df::AbstractDataFrame, args...; copycols::Bool=true)
select(df::AbstractDataFrame, args...; copycols::Bool=true, renamecols::Bool=true)
Create a new data frame that contains columns from `df` specified by `args` and
return it. The result is guaranteed to have the same number of rows as `df`,
Expand Down Expand Up @@ -479,22 +507,22 @@ julia> select(df, names(df) .=> sum .=> [:A, :B])
│ 2 │ 6 │ 15 │
│ 3 │ 6 │ 15 │
julia> select(df, AsTable(:) => ByRow(mean))
julia> select(df, AsTable(:) => ByRow(mean), renamecols=false)
3×1 DataFrame
│ Row │ a_b_mean
│ │ Float64
├─────┼─────────
│ 1 │ 2.5
│ 2 │ 3.5
│ 3 │ 4.5
│ Row │ a_b
│ │ Float64 │
├─────┼─────────┤
│ 1 │ 2.5 │
│ 2 │ 3.5 │
│ 3 │ 4.5 │
```
"""
select(df::AbstractDataFrame, args...; copycols::Bool=true) =
manipulate(df, args..., copycols=copycols, keeprows=true)
function select(df::AbstractDataFrame, args...; copycols::Bool=true,
                renamecols::Bool=true)
    # All work is delegated to `manipulate`, always with `keeprows=true`.
    return manipulate(df, args..., copycols=copycols, keeprows=true,
                      renamecols=renamecols)
end

"""
transform(df::AbstractDataFrame, args...; copycols::Bool=true)
transform(df::AbstractDataFrame, args...; copycols::Bool=true, renamecols::Bool=true)
Create a new data frame that contains columns from `df` and adds columns
specified by `args` and return it.
Expand All @@ -503,12 +531,12 @@ Equivalent to `select(df, :, args..., copycols=copycols)`.
See [`select`](@ref) for detailed rules regarding accepted values for `args`.
"""
transform(df::AbstractDataFrame, args...; copycols::Bool=true) =
select(df, :, args..., copycols=copycols)
function transform(df::AbstractDataFrame, args...; copycols::Bool=true,
                   renamecols::Bool=true)
    # The leading `:` selects every existing column before `args` are applied.
    return select(df, :, args..., copycols=copycols, renamecols=renamecols)
end

"""
combine(df::AbstractDataFrame, args...)
combine(arg, df::AbstractDataFrame)
combine(df::AbstractDataFrame, args...; renamecols::Bool=true)
combine(arg, df::AbstractDataFrame; renamecols::Bool=true)
Create a new data frame that contains columns from `df` specified by `args` and
return it. The result can have any number of rows that is determined by the
Expand All @@ -530,42 +558,46 @@ julia> df = DataFrame(a=1:3, b=4:6)
│ 2 │ 2 │ 5 │
│ 3 │ 3 │ 6 │
julia> combine(df, :a => sum, nrow)
julia> combine(df, :a => sum, nrow, renamecols=false)
1×2 DataFrame
│ Row │ a_sum │ nrow │
│ Row │ a │ nrow │
│ │ Int64 │ Int64 │
├─────┼───────┼───────┤
│ 1 │ 6 │ 3 │
```
"""
combine(df::AbstractDataFrame, args...) =
manipulate(df, args..., copycols=true, keeprows=false)
function combine(df::AbstractDataFrame, args...; renamecols::Bool=true)
    # All work is delegated to `manipulate`, always with `copycols=true` and
    # `keeprows=false`.
    return manipulate(df, args..., copycols=true, keeprows=false,
                      renamecols=renamecols)
end

function combine(arg, df::AbstractDataFrame)
function combine(arg, df::AbstractDataFrame; renamecols::Bool=true)
if nrow(df) == 0
throw(ArgumentError("calling combine on a data frame with zero rows" *
" with transformation as a first argument is " *
"currently not supported"))
end
return combine(arg, groupby(df, Symbol[]))
return combine(arg, groupby(df, Symbol[]), renamecols=renamecols)
end

manipulate(df::DataFrame, args::AbstractVector{Int}; copycols::Bool, keeprows::Bool) =
DataFrame(_columns(df)[args], Index(_names(df)[args]),
copycols=copycols)
# Pure column selection by integer positions: build a new `DataFrame` from the
# chosen columns and their names. `keeprows` and `renamecols` are accepted for
# signature compatibility with the other `manipulate` methods but are not used here.
function manipulate(df::DataFrame, args::AbstractVector{Int}; copycols::Bool,
                    keeprows::Bool, renamecols::Bool)
    cols = _columns(df)[args]
    colnames = Index(_names(df)[args])
    return DataFrame(cols, colnames, copycols=copycols)
end

function manipulate(df::DataFrame, c::MultiColumnIndex; copycols::Bool, keeprows::Bool)
function manipulate(df::DataFrame, c::MultiColumnIndex; copycols::Bool, keeprows::Bool,
renamecols::Bool)
if c isa AbstractVector{<:Pair}
return manipulate(df, c..., copycols=copycols, keeprows=keeprows)
return manipulate(df, c..., copycols=copycols, keeprows=keeprows,
renamecols=renamecols)
else
return manipulate(df, index(df)[c], copycols=copycols, keeprows=keeprows)
return manipulate(df, index(df)[c], copycols=copycols, keeprows=keeprows,
renamecols=renamecols)
end
end

manipulate(df::DataFrame, c::ColumnIndex; copycols::Bool, keeprows::Bool) =
manipulate(df, [c], copycols=copycols, keeprows=keeprows)
# Single-column selection: wrap the index in a one-element vector and defer to
# the multi-column `manipulate` method, forwarding all keyword arguments.
function manipulate(df::DataFrame, c::ColumnIndex; copycols::Bool, keeprows::Bool,
                    renamecols::Bool)
    return manipulate(df, [c], copycols=copycols, keeprows=keeprows,
                      renamecols=renamecols)
end

function manipulate(df::DataFrame, cs...; copycols::Bool, keeprows::Bool)
function manipulate(df::DataFrame, cs...; copycols::Bool, keeprows::Bool, renamecols::Bool)
cs_vec = []
for v in cs
if v isa AbstractVector{<:Pair}
Expand All @@ -574,7 +606,7 @@ function manipulate(df::DataFrame, cs...; copycols::Bool, keeprows::Bool)
push!(cs_vec, v)
end
end
return _manipulate(df, [normalize_selection(index(df), c) for c in cs_vec],
return _manipulate(df, [normalize_selection(index(df), c, renamecols) for c in cs_vec],
copycols, keeprows)
end

Expand Down Expand Up @@ -679,19 +711,22 @@ function _manipulate(df::AbstractDataFrame, normalized_cs, copycols::Bool, keepr
return newdf
end

manipulate(dfv::SubDataFrame, ind::ColumnIndex; copycols::Bool, keeprows::Bool) =
manipulate(dfv, [ind], copycols=copycols, keeprows=keeprows)
# Single-column selection on a `SubDataFrame`: wrap the index in a one-element
# vector and defer to the multi-column method, forwarding all keyword arguments.
function manipulate(dfv::SubDataFrame, ind::ColumnIndex; copycols::Bool,
                    keeprows::Bool, renamecols::Bool)
    return manipulate(dfv, [ind], copycols=copycols, keeprows=keeprows,
                      renamecols=renamecols)
end

function manipulate(dfv::SubDataFrame, args::MultiColumnIndex;
copycols::Bool, keeprows::Bool)
copycols::Bool, keeprows::Bool, renamecols::Bool)
if args isa AbstractVector{<:Pair}
return manipulate(dfv, args..., copycols=copycols, keeprows=keeprows)
return manipulate(dfv, args..., copycols=copycols, keeprows=keeprows,
renamecols=renamecols)
else
return copycols ? dfv[:, args] : view(dfv, :, args)
end
end

function manipulate(dfv::SubDataFrame, args...; copycols::Bool, keeprows::Bool)
function manipulate(dfv::SubDataFrame, args...; copycols::Bool, keeprows::Bool,
renamecols::Bool)
if copycols
cs_vec = []
for v in args
Expand All @@ -701,8 +736,8 @@ function manipulate(dfv::SubDataFrame, args...; copycols::Bool, keeprows::Bool)
push!(cs_vec, v)
end
end
return _manipulate(dfv, [normalize_selection(index(dfv), c) for c in cs_vec],
true, keeprows)
return _manipulate(dfv, [normalize_selection(index(dfv), c, renamecols) for c in cs_vec],
true, keeprows)
else
# we do not support transformations here
# newinds contains only indexing; making it Vector{Any} avoids some compilation
Expand All @@ -719,7 +754,7 @@ function manipulate(dfv::SubDataFrame, args...; copycols::Bool, keeprows::Bool)
push!(seen_single_column, ind_idx)
end
else
newind = normalize_selection(index(dfv), ind)
newind = normalize_selection(index(dfv), ind, renamecols)
if newind isa Pair
throw(ArgumentError("transforming and renaming columns of a " *
"SubDataFrame is not allowed when `copycols=false`"))
Expand Down
Loading

0 comments on commit 38027c7

Please sign in to comment.