Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add transformation and renaming to select and select! #2080

Merged
merged 51 commits into from
Mar 19, 2020
Merged
Show file tree
Hide file tree
Changes from 9 commits
Commits
Show all changes
51 commits
Select commit Hold shift + click to select a range
77f4623
add support for transforms in select and define transform and transform!
bkamins Jan 6, 2020
147a427
fix SubDataFrame select signature
bkamins Jan 6, 2020
11fd0a2
fix problem in autogeneration of column names
bkamins Jan 6, 2020
6fa4f84
add documentation of automatic generation of column names
bkamins Jan 7, 2020
d501fb4
improvements after code review
bkamins Jan 8, 2020
7053e5b
updates after a code review
bkamins Jan 9, 2020
ec834e2
correct variable name
bkamins Jan 10, 2020
6c76aca
minor fix
bkamins Jan 10, 2020
e59d129
fix select for SubDataFrame
bkamins Jan 10, 2020
dee8ac7
improved multiple column transformation
bkamins Jan 10, 2020
4a8a40b
improve select for SubDataFrame
bkamins Jan 10, 2020
f04a549
Apply suggestions from code review
bkamins Jan 10, 2020
bbc06f4
fixes after code review
bkamins Jan 10, 2020
498d9df
fixes from code review
bkamins Jan 12, 2020
fa5a1f1
disallow duplicates in single column selection
bkamins Jan 15, 2020
cd8f41b
fix select for SubDataFrame to avoid duplicate ColumnIndex selelctions
bkamins Jan 15, 2020
3c7149b
Apply suggestions from code review
bkamins Jan 15, 2020
807adfc
fixes after the code review
bkamins Jan 16, 2020
aa7746b
change default behavior to whole-column and add Row
bkamins Feb 1, 2020
7524706
fix typo
bkamins Feb 4, 2020
3d77f6b
add funname to Row
bkamins Feb 5, 2020
e560a14
merge normalize_selection methods
bkamins Feb 5, 2020
9caab2d
make ByRow a functor
bkamins Feb 11, 2020
db8f103
Update src/abstractdataframe/selection.jl
bkamins Feb 14, 2020
df6795a
disallow transofmation of 0 columns
bkamins Feb 14, 2020
ba1feb9
disallow 0 columns only in ByRow
bkamins Feb 15, 2020
0c30db7
Merge branch 'master' into flexible_select
bkamins Feb 15, 2020
6d03a1c
sync with Tables 1.0
bkamins Feb 15, 2020
34aa4cd
fix documentation
bkamins Feb 15, 2020
a03afd7
fix missing parenthesis
bkamins Feb 16, 2020
d4fced0
fix method signature
bkamins Feb 17, 2020
c712088
export ByRow
bkamins Feb 17, 2020
9b5c027
auto-splat (no docs update)
bkamins Feb 22, 2020
8e73abc
fix @views
bkamins Feb 22, 2020
930875e
move to broadcasting in ByRow
bkamins Feb 26, 2020
ab4103a
Apply suggestions from code review
bkamins Feb 28, 2020
4289c48
update implementation
bkamins Feb 28, 2020
6341ccc
reorganize tests
bkamins Feb 28, 2020
09e632e
first round of tests
bkamins Feb 28, 2020
df59216
disallow AbstractDataFrame, NamedTuple, DataFrameRow, and AbstractMat…
bkamins Feb 29, 2020
d932b05
fix test
bkamins Feb 29, 2020
688b077
clean up transformation implementation
bkamins Mar 1, 2020
c34ee72
further sanitizing select rules and more code explanations
bkamins Mar 1, 2020
b818d57
fix comments
bkamins Mar 1, 2020
08d4043
tests of disallowed values
bkamins Mar 2, 2020
49dff0e
finalize tests
bkamins Mar 2, 2020
d685576
fix Julia 1.0 tests
bkamins Mar 2, 2020
35f8996
stop doing pessimistic copy when copycols=true
bkamins Mar 12, 2020
78b492d
Apply suggestions from code review
bkamins Mar 18, 2020
52e690d
fixes after code review
bkamins Mar 18, 2020
20642c5
improve docstring
bkamins Mar 19, 2020
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions src/DataFrames.jl
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,7 @@ include("dataframerow/utils.jl")

include("other/broadcasting.jl")

include("abstractdataframe/selection.jl")
include("abstractdataframe/iteration.jl")
include("abstractdataframe/join.jl")
include("abstractdataframe/reshape.jl")
Expand Down
361 changes: 361 additions & 0 deletions src/abstractdataframe/selection.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,361 @@
# TODO:
# * add transform and transform! functions
# * add `Col` wrapper for whole column operations
# * update documentation
# * add tests

# normalize_selection function makes sure that whatever the input format of a selector is,
# it will end up in one of five canonical forms
# 1) Int
# 2) AbstractVector{Int}
# 3) Pair{Int, Pair{ColRename, Symbol}}
# 4) Pair{Int, <:Pair{<:Base.Callable, Symbol}}
# 5) Pair{AbstractVector{Int}, <:Pair{<:Base.Callable, Symbol}}
# in this way we can easily later decide on the codepath using type signatures

"""
    ColRename

A singleton type indicating that column renaming operation was requested in `select`.
"""
struct ColRename end

# Fallback method: a plain column selector is resolved by indexing `idx` with it.
function normalize_selection(idx::AbstractIndex, sel)
    return idx[sel]
end
bkamins marked this conversation as resolved.
Show resolved Hide resolved
bkamins marked this conversation as resolved.
Show resolved Hide resolved
# `cols => fun => newname`: only the source selector has to be resolved against `idx`;
# the `fun => newname` part is passed through unchanged.
function normalize_selection(idx::AbstractIndex,
                             sel::Pair{<:Any,<:Pair{<:Base.Callable,Symbol}})
    src, target = sel
    return idx[src] => target
end

# `col => newname`: a rename request, tagged with the `ColRename` singleton.
function normalize_selection(idx::AbstractIndex, sel::Pair{<:ColumnIndex, <:Symbol})
    src, newname = sel
    return idx[src] => ColRename() => newname
end

# `col => fun` for a single source column: the target column name is generated
# as `<source column name>_<funname(fun)>`.
function normalize_selection(idx::AbstractIndex, sel::Pair{<:ColumnIndex, <:Base.Callable})
    src, transform = sel
    col = idx[src]
    generated = Symbol(_names(idx)[col], "_", funname(transform))
    return col => transform => generated
end

# `cols => fun` for a multi-column selector: the target column name is generated by
# joining the source column names and `funname(fun)` with `_`. When more than two
# columns are selected only the first two names are used, followed by `⋯`.
function normalize_selection(idx::AbstractIndex, sel::Pair{<:Any, <:Base.Callable})
    c = idx[first(sel)]
    fun = last(sel)
    colnames = _names(idx)
    if length(c) > 2
        newcol = Symbol(join(colnames[c[1:2]], "_"), "_⋯_", funname(fun))
    else
        newcol = Symbol(join(colnames[c], "_"), "_", funname(fun))
    end
    return c => fun => newcol
end

# Carries a type `T` as a type parameter so that it can be passed as an argument.
struct TypeHolder{T} end

# Apply `fun` to each of the rows `1:n` of `cols` and collect the results.
#
# `cols` is an indexable collection of columns; for row `i` the values `cols[j][i]`
# are gathered into a tuple, converted to `T` by calling `T(tuple)`, and passed to `fun`.
# The caller in this file passes a `NamedTuple` type as `T`.
function select_transform_helper(::TypeHolder{T}, cols, fun, n) where T
    # the `x::T` annotation makes sure `fun` only ever receives a value of type `T`
    fun_transform(fun, x::T) = fun(x)
    return map(i -> fun_transform(fun, T(ntuple(j -> cols[j][i], length(cols)))), 1:n)
end

# Materialize one normalized rename/transformation request `nc` into `newdf`.
#
# `nc` has the form `col_idx => fun => newname`, where `fun` is either `ColRename()`
# (plain rename of a single column) or a callable:
# * single column + `ColRename`: the column is copied (`copycols=true`) or reused;
# * single column + callable: `fun` is broadcast over the column;
# * vector of columns + callable: `fun` is called once per row with a `NamedTuple`
#   of the selected columns (or with no arguments when no columns are selected).
#
# `transformed_cols` maps each target name to its request; after the request has been
# applied the entry is set to `nothing`. A request whose entry is already `nothing`
# is skipped, so `fun` is never invoked a second time for the same target column.
# Always returns `nothing`.
function select_transform!(nc::Union{Pair{Int, Pair{ColRename, Symbol}},
                                     Pair{<:Union{Int, AbstractVector{Int}},
                                          <:Pair{<:Base.Callable, Symbol}}},
                           df::DataFrame, newdf::DataFrame,
                           transformed_cols::Dict{Any, Any}, copycols::Bool)
    col_idx, (fun, newname) = nc
    if transformed_cols[newname] === nothing
        # already produced earlier (e.g. when a plain selection reached a column
        # with this name); recomputing would call `fun` twice for no benefit
        return nothing
    end
    # the request is still pending, so the target column must not exist yet
    @assert !hasproperty(newdf, newname)
    if nc isa Pair{Int, Pair{ColRename, Symbol}}
        newdf[!, newname] = copycols ? df[:, col_idx] : df[!, col_idx]
    elseif nc isa Pair{Int, <:Pair{<:Base.Callable, Symbol}}
        newdf[!, newname] = fun.(df[!, col_idx])
    elseif nc isa Pair{<:AbstractVector{Int}, <:Pair{<:Base.Callable, Symbol}}
        if length(col_idx) == 0
            # no source columns: call `fun` without arguments once per row
            newdf[!, newname] = [fun() for _ in axes(df, 1)]
        else
            cols = ntuple(i -> _columns(df)[col_idx[i]], length(col_idx))
            colnames = ntuple(i -> _names(df)[col_idx[i]], length(col_idx))
            # the row type is a NamedTuple keyed by the source column names
            rowtype = NamedTuple{colnames, Tuple{eltype.(cols)...}}
            newdf[!, newname] = select_transform_helper(TypeHolder{rowtype}(),
                                                        cols, fun, nrow(df))
        end
    else
        throw(ErrorException("code should never reach this branch"))
    end
    # mark the request as applied
    transformed_cols[newname] = nothing
    return nothing
end

"""
select!(df::DataFrame, inds...)

Mutate `df` in place to retain only columns specified by `inds...` and return it.

Arguments passed as `inds...` can be any index that is allowed for column indexing
provided that the columns requested in each of them are unique and present in `df`.
bkamins marked this conversation as resolved.
Show resolved Hide resolved
In particular, regular expressions, `All`, `Between`, and `Not` selectors are supported.

Column renaming and transformations are supported.
bkamins marked this conversation as resolved.
Show resolved Hide resolved
The syntax for column renaming is `old_column=>new_column_name`.
The syntax for column transformations is `old_column=>fun=>new_column_name`.
`new_column_name` must be a `Symbol`.
If `old_column` is a `Symbol` or an integer then `fun` must be a callable
that is applied row by row to the values of `old_column`.
bkamins marked this conversation as resolved.
Show resolved Hide resolved
Otherwise `old_column` can be any column indexing syntax, but in this case `fun`
bkamins marked this conversation as resolved.
Show resolved Hide resolved
will be passed, row by row, a `NamedTuple` holding the values of the columns selected by `old_column`.
bkamins marked this conversation as resolved.
Show resolved Hide resolved

Column transformation syntax also supports a short `old_column=>fun` form, in which
case `new_column_name` is automatically generated by joining `old_column` with `fun` name
bkamins marked this conversation as resolved.
Show resolved Hide resolved
with `_`.

If more than one argument is passed then duplicates are accepted except for
bkamins marked this conversation as resolved.
Show resolved Hide resolved
column renaming and transformation operations, where it is not allowed to rename/transform
bkamins marked this conversation as resolved.
Show resolved Hide resolved
into the same column name.
For example if `:col` is present in `df` a call `select!(df, :col, :)` is valid
and moves the column `:col` to be the first one in-place.
bkamins marked this conversation as resolved.
Show resolved Hide resolved

Note that including the same column several times in the data frame will create aliases.
bkamins marked this conversation as resolved.
Show resolved Hide resolved

# Examples
```jldoctest
julia> df = DataFrame(a=1:3, b=4:6)
3×2 DataFrame
│ Row │ a │ b │
│ │ Int64 │ Int64 │
├─────┼───────┼───────┤
│ 1 │ 1 │ 4 │
│ 2 │ 2 │ 5 │
│ 3 │ 3 │ 6 │

julia> select!(df, 2)
3×1 DataFrame
│ Row │ b │
│ │ Int64 │
├─────┼───────┤
│ 1 │ 4 │
│ 2 │ 5 │
│ 3 │ 6 │

julia> df = DataFrame(a=1:3, b=4:6)
3×2 DataFrame
│ Row │ a │ b │
│ │ Int64 │ Int64 │
├─────┼───────┼───────┤
│ 1 │ 1 │ 4 │
│ 2 │ 2 │ 5 │
│ 3 │ 3 │ 6 │

julia> select!(df, :a=><(1.5)=>:c, :b)
3×2 DataFrame
│ Row │ c │ b │
│ │ Bool │ Int64 │
├─────┼──────┼───────┤
│ 1 │ 1 │ 4 │
│ 2 │ 0 │ 5 │
│ 3 │ 0 │ 6 │
```

"""
function select!(df::DataFrame, inds::AbstractVector{Int})
    cols = _columns(df)
    idx = index(df)
    # selecting nothing drops every column
    if isempty(inds)
        empty!(cols)
        empty!(idx)
        return df
    end
    # validate before mutating anything
    lo, hi = extrema(inds)
    lo < 1 && throw(ArgumentError("indices must be positive"))
    hi > ncol(df) &&
        throw(ArgumentError("indices must not be greater than number of columns"))
    allunique(inds) || throw(ArgumentError("indices must not contain duplicates"))
    # reorder/subset columns and names in place
    copy!(cols, cols[inds])
    copy!(_names(idx), _names(df)[inds])
    # rebuild the name => position lookup to match the new column order
    empty!(idx.lookup)
    for (pos, name) in enumerate(idx.names)
        idx.lookup[name] = pos
    end
    return df
end

# A single integer index is handled as a one-element vector of indices.
function select!(df::DataFrame, c::Int)
    return select!(df, [c])
end

# Any other multi-column selector is first resolved against the index of `df`.
function select!(df::DataFrame,
                 c::Union{AbstractVector{<:Integer}, AbstractVector{Symbol},
                          Colon, All, Not, Between, Regex})
    resolved = index(df)[c]
    return select!(df, resolved)
end

# General in-place selection: compute the result with `select` (without copying
# columns), then make `df` take over the columns and names of that result.
function select!(df::DataFrame, cs...)
    newdf = select(df, cs..., copycols=false)
    copy!(_columns(df), _columns(newdf))
    x = index(df)
    copy!(_names(x), _names(newdf))
    # rebuild the name => position lookup to match the new column names
    empty!(x.lookup)
    for (i, n) in enumerate(x.names)
        x.lookup[n] = i
    end
    return df
end

"""
select(df::AbstractDataFrame, inds...; copycols::Bool=true)

Create a new data frame that contains columns from `df`
specified by `inds` and return it.

Arguments passed as `inds...` can be any index that is allowed for column indexing
provided that the columns requested in each of them are unique and present in `df`.
In particular, regular expressions, `All`, `Between`, and `Not` selectors are supported.

Also if `df` is a `DataFrame` or `copycols=true` then column renaming and transformations
are supported. The syntax for column renaming is `old_column=>new_column_name`.
The syntax for column transformations is `old_column=>fun=>new_column_name`.
If `old_column` is a `Symbol` or an integer then `fun` must be a callable
that is applied row by row to the values of `old_column`.
Otherwise `old_column` can be any column indexing syntax, but in this case `fun`
will be passed, row by row, a `NamedTuple` holding the values of the columns selected by `old_column`.

Column transformation syntax also supports a short `old_column=>fun` form, in which
case `new_column_name` is automatically generated by joining `old_column` with `fun` name
with `_`.

If more than one argument is passed then duplicates are accepted except for
column renaming and transformation operations, where it is not allowed to rename/transform
into the same column name.
For example if `:col` is present in `df` a call `select(df, :col, :)` is valid
and creates a new data frame with column `:col` moved to be the first.

If `df` is a `DataFrame` a new `DataFrame` is returned.
If `copycols=true` (the default), then returned `DataFrame` is guaranteed not to share columns with `df`.
If `copycols=false`, then returned `DataFrame` shares column vectors with `df` where possible.

If `df` is a `SubDataFrame` then a `SubDataFrame` is returned if `copycols=false`
and a `DataFrame` with freshly allocated columns otherwise.
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

"freshly allocated" only according to the rules described above for DataFrame right?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

yes. but in particular this time we are sure we will not reuse the columns from df as SubDataFrame holds views, and we always materialize views when copycols=true.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

But what happens if you do e.g. :x => (x -> v) with v a global vector?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

it does not get copied - exactly like in DataFrame case:

julia> using DataFrames

julia> df = view(DataFrame(rand(2,3)), :, :)
2×3 SubDataFrame
│ Row │ x1       │ x2       │ x3       │
│     │ Float64  │ Float64  │ Float64  │
├─────┼──────────┼──────────┼──────────┤
│ 1   │ 0.86855  │ 0.994078 │ 0.512417 │
│ 2   │ 0.473683 │ 0.911317 │ 0.284993 │

julia> x = [1, 2]
2-element Array{Int64,1}:
 1
 2

julia> df2 = select(df, :x1 => (y -> x) => :y)
2×1 DataFrame
│ Row │ y     │
│     │ Int64 │
├─────┼───────┤
│ 1   │ 1     │
│ 2   │ 2     │

julia> df2.y === x
true

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

so I understand it is best to remove "freshly allocated" right?


Note that if `df` is a `DataFrame` and `copycols=false` then including the same column several times
in the resulting data frame will create aliases.

# Examples
```jldoctest
julia> df = DataFrame(a=1:3, b=4:6)
3×2 DataFrame
│ Row │ a │ b │
│ │ Int64 │ Int64 │
├─────┼───────┼───────┤
│ 1 │ 1 │ 4 │
│ 2 │ 2 │ 5 │
│ 3 │ 3 │ 6 │

julia> select(df, :b)
3×1 DataFrame
│ Row │ b │
│ │ Int64 │
├─────┼───────┤
│ 1 │ 4 │
│ 2 │ 5 │
│ 3 │ 6 │

julia> select(df, Not(:b)) # drop column :b from df
3×1 DataFrame
│ Row │ a │
│ │ Int64 │
├─────┼───────┤
│ 1 │ 1 │
│ 2 │ 2 │
│ 3 │ 3 │

julia> select(df, :a=>:c, :b)
3×2 DataFrame
│ Row │ c │ b │
│ │ Int64 │ Int64 │
├─────┼───────┼───────┤
│ 1 │ 1 │ 4 │
│ 2 │ 2 │ 5 │
│ 3 │ 3 │ 6 │

julia> select(df, :b, :a=><(1.5)=>:c)
nalimilan marked this conversation as resolved.
Show resolved Hide resolved
3×2 DataFrame
│ Row │ b │ c │
│ │ Int64 │ Bool │
├─────┼───────┼──────┤
│ 1 │ 4 │ 1 │
│ 2 │ 5 │ 0 │
│ 3 │ 6 │ 0 │
```

"""
function select(df::DataFrame, inds::AbstractVector{Int}; copycols::Bool=true)
    # pick the requested columns and their names; copying is delegated to the constructor
    picked_cols = _columns(df)[inds]
    picked_names = Index(_names(df)[inds])
    return DataFrame(picked_cols, picked_names, copycols=copycols)
end

function select(df::DataFrame,
                c::Union{AbstractVector{<:Integer}, AbstractVector{Symbol},
                         Colon, All, Not, Between, Regex};
                copycols::Bool=true)
    # resolve the selector to integer positions first
    return select(df, index(df)[c], copycols=copycols)
end

function select(df::DataFrame, c::ColumnIndex; copycols::Bool=true)
    # a single column is selected as a one-element vector
    return select(df, [c], copycols=copycols)
end

function select(df::DataFrame, cs...; copycols::Bool=true)
    # general case: normalize every argument, then build the result column by column
    idx = index(df)
    normalized = [normalize_selection(idx, c) for c in cs]
    return _select(df, normalized, copycols)
end

# Build a new `DataFrame` from `df` given normalized selections `ncs`.
#
# Each element of `ncs` is either a plain selection (`Int` or `AbstractVector{Int}`)
# or a `Pair` of the form `cols => fun_or_rename => newname`.
# Throws `ArgumentError` when two `Pair` requests target the same column name or when
# a single plain selection lists the same column more than once.
function _select(df::DataFrame, ncs, copycols::Bool)
    newdf = DataFrame()
    # it should be OK to be type unstable here + in this way we avoid having to compile custom Dict
    transformed_cols = Dict()
    # first pass: register every rename/transformation under its target column name
    for nc in ncs
        if nc isa Pair
            newname = last(last(nc))
            @assert newname isa Symbol
            if haskey(transformed_cols, newname)
                throw(ArgumentError("duplicate target transformed or renamed column names passed"))
            end
            transformed_cols[newname] = nc
        end
    end
    # second pass: materialize columns in the order in which they were requested
    for nc in ncs
        if nc isa Union{Int, AbstractVector{Int}}
            allunique(nc) || throw(ArgumentError("duplicate column names selected"))
            for i in nc
                newname = _names(df)[i]
                # a column that is already present in newdf is silently skipped,
                # which allows calls like select(df, :col, :)
                if !hasproperty(newdf, newname)
                    if haskey(transformed_cols, newname)
                        # a rename/transformation targets this name, so it is
                        # applied here instead of taking the column from df
                        nct = transformed_cols[newname]
                        @assert nct !== nothing
                        select_transform!(nct, df, newdf, transformed_cols, copycols)
                    else
                        newdf[!, newname] = copycols ? df[:, i] : df[!, i]
                    end
                end
            end
        else
            select_transform!(nc, df, newdf, transformed_cols, copycols)
        end
    end
    return newdf
end

# A single column of a view is selected as a one-element vector.
function select(dfv::SubDataFrame, ind::ColumnIndex; copycols::Bool=true)
    return select(dfv, [ind], copycols=copycols)
end

# Multi-column selectors: materialize with getindex when copying, otherwise take a view.
function select(dfv::SubDataFrame,
                inds::Union{AbstractVector{<:Integer}, AbstractVector{Symbol},
                            Colon, All, Not, Between, Regex};
                copycols::Bool=true)
    if copycols
        return dfv[:, inds]
    else
        return view(dfv, :, inds)
    end
end

# General selection on a `SubDataFrame`.
#
# * `copycols=true`: the columns used by any selector are materialized into a new
#   `DataFrame` and the selections (including renames/transformations) are applied to it.
# * `copycols=false`: a view is returned; renames and transformations are rejected
#   with an `ArgumentError`.
function select(dfv::SubDataFrame, inds...; copycols::Bool=true)
    if copycols
        newinds = [normalize_selection(index(dfv), c) for c in inds]
        usedcols = Int[]
        for ni in newinds
            # ni is either column index/indices, or a Pair whose first element is
            append!(usedcols, ni isa Pair ? first(ni) : ni)
        end
        unique!(usedcols)
        # Only the used columns are materialized, so their positions change:
        # column usedcols[i] of dfv becomes column i of the new DataFrame.
        # The normalized selections must be translated accordingly.
        newpos = Dict{Int, Int}(c => i for (i, c) in enumerate(usedcols))
        remap(c) = c isa Int ? newpos[c] : Int[newpos[i] for i in c]
        remapped = Any[ni isa Pair ? (remap(first(ni)) => last(ni)) : remap(ni)
                       for ni in newinds]
        # columns are freshly materialized above, so no further copy is needed
        return _select(dfv[:, usedcols], remapped, false)
    else
        # we do not support transformations here
        # newinds should not be large so making it Vector{Any} should be OK
        newinds = []
        for ind in inds
            newind = normalize_selection(index(dfv), ind)
            if newind isa Pair
                throw(ArgumentError("transforming and renaming columns of a " *
                                    "`SubDataFrame` is not allowed when `copycols=false`"))
            end
            push!(newinds, newind)
        end
        return view(dfv, :, All(newinds...))
    end
end
Loading