Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

add suffix to select/transform/combine #2397

Merged
merged 7 commits into from
Sep 5, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,10 @@
([#2373](https://github.com/JuliaData/DataFrames.jl/pull/2373))
* add `columnindex` for `DataFrameRow`
([#2380](https://github.com/JuliaData/DataFrames.jl/pull/2380))
* `select`, `select!`, `transform`, `transform!` and `combine` now accept a `renamecols`
keyword argument that makes it possible to avoid adding the transformation function name
as a suffix in automatically generated column names
([#2397](https://github.com/JuliaData/DataFrames.jl/pull/2397))

## Deprecated

Expand Down
181 changes: 108 additions & 73 deletions src/abstractdataframe/selection.jl
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ _by_row_helper(x::Union{NamedTuple, DataFrameRow}) =
# add a method to funname defined in other/utils.jl
funname(row::ByRow) = funname(row.fun)

normalize_selection(idx::AbstractIndex, sel) =
normalize_selection(idx::AbstractIndex, sel, renamecols::Bool) =
try
idx[sel]
catch e
Expand All @@ -45,28 +45,33 @@ normalize_selection(idx::AbstractIndex, sel) =
end
end

normalize_selection(idx::AbstractIndex, sel::Pair{typeof(nrow), Symbol}) =
normalize_selection(idx::AbstractIndex, sel::Pair{typeof(nrow), Symbol},
renamecols::Bool) =
length(idx) == 0 ? (Int[] => (() -> 0) => last(sel)) : (1 => length => last(sel))
normalize_selection(idx::AbstractIndex, sel::Pair{typeof(nrow), <:AbstractString}) =
normalize_selection(idx, first(sel) => Symbol(last(sel)))
normalize_selection(idx::AbstractIndex, sel::typeof(nrow)) =
normalize_selection(idx, nrow => :nrow)
normalize_selection(idx::AbstractIndex, sel::Pair{typeof(nrow), <:AbstractString},
renamecols::Bool) =
normalize_selection(idx, first(sel) => Symbol(last(sel)), renamecols)
normalize_selection(idx::AbstractIndex, sel::typeof(nrow), renamecols::Bool) =
normalize_selection(idx, nrow => :nrow, renamecols)

function normalize_selection(idx::AbstractIndex, sel::ColumnIndex)
function normalize_selection(idx::AbstractIndex, sel::ColumnIndex, renamecols::Bool)
c = idx[sel]
return c => identity => _names(idx)[c]
end

function normalize_selection(idx::AbstractIndex, sel::Pair{<:ColumnIndex, Symbol})
function normalize_selection(idx::AbstractIndex, sel::Pair{<:ColumnIndex, Symbol},
renamecols::Bool)
c = idx[first(sel)]
return c => identity => last(sel)
end

normalize_selection(idx::AbstractIndex, sel::Pair{<:ColumnIndex, <:AbstractString}) =
normalize_selection(idx, first(sel) => Symbol(last(sel)))
# Rename with a string target (`column => "name"`): convert the target to a Symbol
# and delegate to the `Pair{<:ColumnIndex, Symbol}` method. `renamecols` is forwarded
# unchanged; it is already typed `Bool` by this method's signature, so no type
# assertion is needed at the call site (consistent with the `nrow` forwarders).
normalize_selection(idx::AbstractIndex, sel::Pair{<:ColumnIndex, <:AbstractString},
                    renamecols::Bool) =
    normalize_selection(idx, first(sel) => Symbol(last(sel)), renamecols)

function normalize_selection(idx::AbstractIndex,
sel::Pair{<:Any,<:Pair{<:Base.Callable, Symbol}})
sel::Pair{<:Any,<:Pair{<:Base.Callable, Symbol}},
renamecols::Bool)
if first(sel) isa AsTable
rawc = first(sel).cols
wanttable = true
Expand Down Expand Up @@ -97,19 +102,25 @@ function normalize_selection(idx::AbstractIndex,
end

normalize_selection(idx::AbstractIndex,
sel::Pair{<:Any,<:Pair{<:Base.Callable,<:AbstractString}}) =
normalize_selection(idx, first(sel) => first(last(sel)) => Symbol(last(last(sel))))
sel::Pair{<:Any,<:Pair{<:Base.Callable,<:AbstractString}},
renamecols::Bool) =
normalize_selection(idx, first(sel) => first(last(sel)) => Symbol(last(last(sel))),
renamecols::Bool)

function normalize_selection(idx::AbstractIndex,
sel::Pair{<:ColumnIndex,<:Base.Callable})
sel::Pair{<:ColumnIndex,<:Base.Callable}, renamecols::Bool)
c = idx[first(sel)]
fun = last(sel)
newcol = Symbol(_names(idx)[c], "_", funname(fun))
if renamecols
newcol = Symbol(_names(idx)[c], "_", funname(fun))
else
newcol = _names(idx)[c]
end
return c => fun => newcol
end

function normalize_selection(idx::AbstractIndex,
sel::Pair{<:Any, <:Base.Callable})
sel::Pair{<:Any, <:Base.Callable}, renamecols::Bool)
if first(sel) isa AsTable
rawc = first(sel).cols
wanttable = true
Expand Down Expand Up @@ -138,11 +149,23 @@ function normalize_selection(idx::AbstractIndex,
end
fun = last(sel)
if length(c) > 3
newcol = Symbol(join(@views(_names(idx)[c[1:2]]), '_'), "_etc_", funname(fun))
prefix = join(@views(_names(idx)[c[1:2]]), '_')
if renamecols
newcol = Symbol(prefix, "_etc_", funname(fun))
else
newcol = Symbol(prefix, "_etc")
end
elseif isempty(c)
renamecols || throw(ArgumentError("when renamecols=false target column name " *
"must be passed if there are no input columns"))
newcol = Symbol(funname(fun))
else
newcol = Symbol(join(view(_names(idx), c), '_'), '_', funname(fun))
prefix = join(view(_names(idx), c), '_')
if renamecols
newcol = Symbol(prefix, '_', funname(fun))
else
newcol = Symbol(prefix)
end
end
return (wanttable ? AsTable(c) : c) => fun => newcol
end
Expand Down Expand Up @@ -251,10 +274,14 @@ SELECT_ARG_RULES =
Column transformation can also be specified using the short `old_column =>
fun` form. In this case, `new_column_name` is automatically generated as
`\$(old_column)_\$(fun)`. Up to three column names are used for multiple
input columns and they are joined using `_`; if more than three columns are
passed then the name consists of the first two names and `etc` suffix then,
e.g. `[:a,:b,:c,:d] => fun` produces the new column name `:a_b_etc_fun`.
`\$(old_column)_\$(fun)` if `renamecols=true` and `\$(old_column)` if
`renamecols=false`. Up to three column names are used for multiple input
columns and they are joined using `_`; if more than three columns are passed
then the name consists of the first two names followed by the `etc` suffix, e.g.
`[:a,:b,:c,:d] => fun` produces the new column name `:a_b_etc_fun` if
`renamecols=true` and `:a_b_etc` if `renamecols=false`.
It is not allowed to pass `renamecols=false` if `old_column` is empty
as it would generate an empty column name.
Column renaming and transformation operations can be passed wrapped in
vectors (this is useful when combined with broadcasting).
Expand All @@ -275,7 +302,7 @@ SELECT_ARG_RULES =
"""

"""
select!(df::DataFrame, args...)
select!(df::DataFrame, args...; renamecols::Bool=true)
Mutate `df` in place to retain only columns specified by `args...` and return it.
The result is guaranteed to have the same number of rows as `df`, except when no
Expand Down Expand Up @@ -345,33 +372,34 @@ julia> df = DataFrame(a=1:3, b=4:6);
julia> using Statistics
julia> select!(df, AsTable(:) => ByRow(mean))
julia> select!(df, AsTable(:) => ByRow(mean), renamecols=false)
3×1 DataFrame
│ Row │ a_b_mean
│ │ Float64
├─────┼─────────
│ 1 │ 2.5
│ 2 │ 3.5
│ 3 │ 4.5
│ Row │ a_b
│ │ Float64 │
├─────┼─────────┤
│ 1 │ 2.5 │
│ 2 │ 3.5 │
│ 3 │ 4.5 │
```
"""
select!(df::DataFrame, args...) =
_replace_columns!(df, select(df, args..., copycols=false))
select!(df::DataFrame, args...; renamecols::Bool=true) =
_replace_columns!(df, select(df, args..., copycols=false, renamecols=renamecols))

"""
transform!(df::DataFrame, args...)
transform!(df::DataFrame, args...; renamecols::Bool=true)
Mutate `df` in place to add columns specified by `args...` and return it.
The result is guaranteed to have the same number of rows as `df`.
Equivalent to `select!(df, :, args..., renamecols=renamecols)`.
See [`select!`](@ref) for detailed rules regarding accepted values for `args`.
"""
transform!(df::DataFrame, args...) = select!(df, :, args...)
transform!(df::DataFrame, args...; renamecols::Bool=true) =
select!(df, :, args..., renamecols=renamecols)

"""
select(df::AbstractDataFrame, args...; copycols::Bool=true)
select(df::AbstractDataFrame, args...; copycols::Bool=true, renamecols::Bool=true)
Create a new data frame that contains columns from `df` specified by `args` and
return it. The result is guaranteed to have the same number of rows as `df`,
Expand Down Expand Up @@ -479,22 +507,22 @@ julia> select(df, names(df) .=> sum .=> [:A, :B])
│ 2 │ 6 │ 15 │
│ 3 │ 6 │ 15 │
julia> select(df, AsTable(:) => ByRow(mean))
julia> select(df, AsTable(:) => ByRow(mean), renamecols=false)
3×1 DataFrame
│ Row │ a_b_mean
│ │ Float64
├─────┼─────────
│ 1 │ 2.5
│ 2 │ 3.5
│ 3 │ 4.5
│ Row │ a_b
│ │ Float64 │
├─────┼─────────┤
│ 1 │ 2.5 │
│ 2 │ 3.5 │
│ 3 │ 4.5 │
```
"""
select(df::AbstractDataFrame, args...; copycols::Bool=true) =
manipulate(df, args..., copycols=copycols, keeprows=true)
select(df::AbstractDataFrame, args...; copycols::Bool=true, renamecols::Bool=true) =
manipulate(df, args..., copycols=copycols, keeprows=true, renamecols=renamecols)

"""
transform(df::AbstractDataFrame, args...; copycols::Bool=true)
transform(df::AbstractDataFrame, args...; copycols::Bool=true, renamecols::Bool=true)
Create a new data frame that contains columns from `df` and adds columns
specified by `args` and return it.
Expand All @@ -503,12 +531,12 @@ Equivalent to `select(df, :, args..., copycols=copycols)`.
See [`select`](@ref) for detailed rules regarding accepted values for `args`.
"""
transform(df::AbstractDataFrame, args...; copycols::Bool=true) =
select(df, :, args..., copycols=copycols)
transform(df::AbstractDataFrame, args...; copycols::Bool=true, renamecols::Bool=true) =
select(df, :, args..., copycols=copycols, renamecols=renamecols)

"""
combine(df::AbstractDataFrame, args...)
combine(arg, df::AbstractDataFrame)
combine(df::AbstractDataFrame, args...; renamecols::Bool=true)
combine(arg, df::AbstractDataFrame; renamecols::Bool=true)
Create a new data frame that contains columns from `df` specified by `args` and
return it. The result can have any number of rows that is determined by the
Expand All @@ -530,42 +558,46 @@ julia> df = DataFrame(a=1:3, b=4:6)
│ 2 │ 2 │ 5 │
│ 3 │ 3 │ 6 │
julia> combine(df, :a => sum, nrow)
julia> combine(df, :a => sum, nrow, renamecols=false)
1×2 DataFrame
│ Row │ a_sum │ nrow │
│ Row │ a │ nrow │
│ │ Int64 │ Int64 │
├─────┼───────┼───────┤
│ 1 │ 6 │ 3 │
```
"""
combine(df::AbstractDataFrame, args...) =
manipulate(df, args..., copycols=true, keeprows=false)
combine(df::AbstractDataFrame, args...; renamecols::Bool=true) =
manipulate(df, args..., copycols=true, keeprows=false, renamecols=renamecols)

function combine(arg, df::AbstractDataFrame)
function combine(arg, df::AbstractDataFrame; renamecols::Bool=true)
if nrow(df) == 0
throw(ArgumentError("calling combine on a data frame with zero rows" *
" with transformation as a first argument is " *
"currently not supported"))
end
return combine(arg, groupby(df, Symbol[]))
return combine(arg, groupby(df, Symbol[]), renamecols=renamecols)
end

manipulate(df::DataFrame, args::AbstractVector{Int}; copycols::Bool, keeprows::Bool) =
DataFrame(_columns(df)[args], Index(_names(df)[args]),
copycols=copycols)
manipulate(df::DataFrame, args::AbstractVector{Int}; copycols::Bool, keeprows::Bool,
renamecols::Bool) =
DataFrame(_columns(df)[args], Index(_names(df)[args]), copycols=copycols)

function manipulate(df::DataFrame, c::MultiColumnIndex; copycols::Bool, keeprows::Bool)
function manipulate(df::DataFrame, c::MultiColumnIndex; copycols::Bool, keeprows::Bool,
renamecols::Bool)
if c isa AbstractVector{<:Pair}
return manipulate(df, c..., copycols=copycols, keeprows=keeprows)
return manipulate(df, c..., copycols=copycols, keeprows=keeprows,
renamecols=renamecols)
else
return manipulate(df, index(df)[c], copycols=copycols, keeprows=keeprows)
return manipulate(df, index(df)[c], copycols=copycols, keeprows=keeprows,
renamecols=renamecols)
end
end

manipulate(df::DataFrame, c::ColumnIndex; copycols::Bool, keeprows::Bool) =
manipulate(df, [c], copycols=copycols, keeprows=keeprows)
manipulate(df::DataFrame, c::ColumnIndex; copycols::Bool, keeprows::Bool,
renamecols::Bool) =
manipulate(df, [c], copycols=copycols, keeprows=keeprows, renamecols=renamecols)

function manipulate(df::DataFrame, cs...; copycols::Bool, keeprows::Bool)
function manipulate(df::DataFrame, cs...; copycols::Bool, keeprows::Bool, renamecols::Bool)
cs_vec = []
for v in cs
if v isa AbstractVector{<:Pair}
Expand All @@ -574,7 +606,7 @@ function manipulate(df::DataFrame, cs...; copycols::Bool, keeprows::Bool)
push!(cs_vec, v)
end
end
return _manipulate(df, [normalize_selection(index(df), c) for c in cs_vec],
return _manipulate(df, [normalize_selection(index(df), c, renamecols) for c in cs_vec],
copycols, keeprows)
end

Expand Down Expand Up @@ -679,19 +711,22 @@ function _manipulate(df::AbstractDataFrame, normalized_cs, copycols::Bool, keepr
return newdf
end

manipulate(dfv::SubDataFrame, ind::ColumnIndex; copycols::Bool, keeprows::Bool) =
manipulate(dfv, [ind], copycols=copycols, keeprows=keeprows)
manipulate(dfv::SubDataFrame, ind::ColumnIndex; copycols::Bool, keeprows::Bool,
renamecols::Bool) =
manipulate(dfv, [ind], copycols=copycols, keeprows=keeprows, renamecols=renamecols)

function manipulate(dfv::SubDataFrame, args::MultiColumnIndex;
copycols::Bool, keeprows::Bool)
copycols::Bool, keeprows::Bool, renamecols::Bool)
if args isa AbstractVector{<:Pair}
return manipulate(dfv, args..., copycols=copycols, keeprows=keeprows)
return manipulate(dfv, args..., copycols=copycols, keeprows=keeprows,
renamecols=renamecols)
else
return copycols ? dfv[:, args] : view(dfv, :, args)
end
end

function manipulate(dfv::SubDataFrame, args...; copycols::Bool, keeprows::Bool)
function manipulate(dfv::SubDataFrame, args...; copycols::Bool, keeprows::Bool,
renamecols::Bool)
if copycols
cs_vec = []
for v in args
Expand All @@ -701,8 +736,8 @@ function manipulate(dfv::SubDataFrame, args...; copycols::Bool, keeprows::Bool)
push!(cs_vec, v)
end
end
return _manipulate(dfv, [normalize_selection(index(dfv), c) for c in cs_vec],
true, keeprows)
return _manipulate(dfv, [normalize_selection(index(dfv), c, renamecols) for c in cs_vec],
true, keeprows)
else
# we do not support transformations here
# newinds contains only indexing; making it Vector{Any} avoids some compilation
Expand All @@ -719,7 +754,7 @@ function manipulate(dfv::SubDataFrame, args...; copycols::Bool, keeprows::Bool)
push!(seen_single_column, ind_idx)
end
else
newind = normalize_selection(index(dfv), ind)
newind = normalize_selection(index(dfv), ind, renamecols)
if newind isa Pair
throw(ArgumentError("transforming and renaming columns of a " *
"SubDataFrame is not allowed when `copycols=false`"))
Expand Down
Loading