Skip to content

Commit

Permalink
Index to grouped data frame using Dicts (#2281)
Browse files Browse the repository at this point in the history
  • Loading branch information
pdeffebach authored Aug 29, 2020
1 parent a391912 commit d156320
Show file tree
Hide file tree
Showing 4 changed files with 80 additions and 13 deletions.
4 changes: 3 additions & 1 deletion NEWS.md
Original file line number Diff line number Diff line change
Expand Up @@ -38,11 +38,13 @@
* add `rownumber` to `DataFrameRow` ([#2356](https://github.com/JuliaData/DataFrames.jl/pull/2356))
* allow passing a column name to specify the position where new columns should be
inserted in `insertcols!` ([#2365](https://github.com/JuliaData/DataFrames.jl/pull/2365))
* allow `GroupedDataFrame`s to be indexed using a dictionary, which can use `Symbol` or string keys and
is not dependent on the order of keys. ([#2281](https://github.com/JuliaData/DataFrames.jl/pull/2281))
* add `isapprox` method to check for approximate equality between two dataframes
([#2373](https://github.com/JuliaData/DataFrames.jl/pull/2373))
* add `columnindex` for `DataFrameRow`
([#2380](https://github.com/JuliaData/DataFrames.jl/pull/2380))

## Deprecated

* `DataFrame!` is now deprecated ([#2338](https://github.com/JuliaData/DataFrames.jl/pull/2338))
Expand Down
47 changes: 39 additions & 8 deletions src/groupeddataframe/groupeddataframe.jl
Original file line number Diff line number Diff line change
Expand Up @@ -245,15 +245,17 @@ Base.getindex(gd::GroupedDataFrame, idxs::Colon) =
Key for one of the groups of a [`GroupedDataFrame`](@ref). Contains the values
of the corresponding grouping columns and behaves similarly to a `NamedTuple`,
but using it to index its `GroupedDataFrame` is much more efficient than using the
equivalent `Tuple` or `NamedTuple`.
but using it to index its `GroupedDataFrame` is more efficient than using the
equivalent `Tuple` and `NamedTuple`, and much more efficient than using
the equivalent `AbstractDict`.
Instances of this type are returned by `keys(::GroupedDataFrame)` and are not
meant to be constructed directly.
Indexing fields of `GroupKey` is allowed using an integer, a `Symbol`, or a string.
It is also possible to access the data in a `GroupKey` using the `getproperty`
function. A `GroupKey` can be converted to a `Tuple`, `NamedTuple`, or `Vector`.
function. A `GroupKey` can be converted to a `Tuple`, a `NamedTuple`, a `Vector`, or
a `Dict`. When converted to a `Dict`, the keys of the `Dict` are `Symbol`s.
See [`keys(::GroupedDataFrame)`](@ref) for more information.
"""
Expand Down Expand Up @@ -328,6 +330,8 @@ Base.convert(::Type{Array{T}}, key::GroupKey) where {T} = Vector{T}(key)
Base.Array(key::GroupKey) = Vector(key)
Base.Array{T}(key::GroupKey) where {T} = Vector{T}(key)

# Convert a `GroupKey` into a `Dict` by passing each of its `pairs` to the
# `Dict` constructor as a separate argument.
function Base.Dict(key::GroupKey)
    entries = pairs(key)
    return Dict(entries...)
end

# Broadcasting over a `GroupKey` is deliberately unsupported: this method
# unconditionally throws an `ArgumentError`.
function Base.broadcastable(::GroupKey)
    msg = "broadcasting over `GroupKey`s is reserved"
    throw(ArgumentError(msg))
end

Expand Down Expand Up @@ -367,7 +371,7 @@ function Base.getindex(gd::GroupedDataFrame, idx...)
end

# The allowed key types for dictionary-like indexing
const GroupKeyTypes = Union{GroupKey, Tuple, NamedTuple}
# Key types accepted for dictionary-like indexing: a `GroupKey`, a `Tuple`, a
# `NamedTuple`, or a dictionary whose key type is `Symbol` or a string type.
const GroupKeyTypes = Union{
    GroupKey,
    Tuple,
    NamedTuple,
    AbstractDict{Symbol},
    AbstractDict{<:AbstractString},
}
# All allowed scalar index types: either an `Integer` or any of the
# dictionary-like key types collected in `GroupKeyTypes`
const GroupIndexTypes = Union{Integer, GroupKeyTypes}

Expand All @@ -387,6 +391,25 @@ function Base.to_index(gd::GroupedDataFrame, key::NamedTuple{N}) where {N}
return Base.to_index(gd, Tuple(key))
end

# Build the tuple of grouping-column values for `gd` from a dictionary keyed by
# column-name strings. Values are looked up following the order of `gd.cols`,
# so the iteration order of `key` itself plays no role.
#
# Throws `KeyError(key)` when the number of entries in `key` differs from the
# number of grouping columns. If a grouping column is absent from `key`, the
# lookup `key[...]` raises whatever error `key` raises for a missing entry.
function _dict_to_tuple(key::AbstractDict{<:AbstractString}, gd::GroupedDataFrame)
    ncols = length(gd.cols)
    length(key) == ncols || throw(KeyError(key))

    return ntuple(ncols) do i
        # `gd.cols[i]` is converted to a `String` before the lookup
        colname = String(gd.cols[i])
        key[colname]
    end
end

# Build the tuple of grouping-column values for `gd` from a dictionary keyed by
# `Symbol`s. Values are looked up following the order of `gd.cols`, so the
# iteration order of `key` itself plays no role.
#
# Throws `KeyError(key)` when the number of entries in `key` differs from the
# number of grouping columns. If a grouping column is absent from `key`, the
# lookup `key[...]` raises whatever error `key` raises for a missing entry.
function _dict_to_tuple(key::AbstractDict{Symbol}, gd::GroupedDataFrame)
    ncols = length(gd.cols)
    length(key) == ncols || throw(KeyError(key))

    return ntuple(ncols) do i
        colname = gd.cols[i]
        key[colname]
    end
end

# A dictionary key is first converted to a tuple of grouping-column values by
# `_dict_to_tuple`, and that tuple is then resolved through `Base.to_index`.
function Base.to_index(gd::GroupedDataFrame,
                       key::Union{AbstractDict{Symbol},AbstractDict{<:AbstractString}})
    values_tuple = _dict_to_tuple(key, gd)
    return Base.to_index(gd, values_tuple)
end

# Array of (possibly non-standard) indices
function Base.to_index(gd::GroupedDataFrame, idxs::AbstractVector{T}) where {T}
# A concrete eltype which is <: GroupKeyTypes, don't need to check
Expand All @@ -409,6 +432,10 @@ function Base.to_index(gd::GroupedDataFrame, idxs::AbstractVector{T}) where {T}
Tuple
elseif E1 <: NamedTuple
NamedTuple
elseif E1 <: AbstractDict{Symbol}
AbstractDict{Symbol}
elseif E1 <: AbstractDict{<:AbstractString}
AbstractDict{<:AbstractString}
else
throw(ArgumentError("Invalid index: $idx1 of type $E1"))
end
Expand Down Expand Up @@ -463,7 +490,7 @@ end
Get the set of keys for each group of the `GroupedDataFrame` `gd` as a
[`GroupKeys`](@ref) object. Each key is a [`GroupKey`](@ref), which behaves like
a `NamedTuple` holding the values of the grouping columns for a given group.
Unlike the equivalent `Tuple` and `NamedTuple`, these keys can be used to index
Unlike the equivalent `Tuple`, `NamedTuple`, and `AbstractDict`, these keys can be used to index
into `gd` efficiently. The ordering of the keys is identical to the ordering of
the groups of `gd` under iteration and integer indexing.
Expand Down Expand Up @@ -572,6 +599,9 @@ function Base.haskey(gd::GroupedDataFrame, key::NamedTuple{N}) where {N}
return haskey(gd, Tuple(key))
end

# Check whether `gd` contains a group matching the dictionary `key`.
#
# The dictionary is converted to a tuple of grouping-column values with
# `_dict_to_tuple` and the check is delegated to `haskey` on that tuple.
# Any error raised by `_dict_to_tuple` (e.g. the `KeyError` it throws when the
# number of entries does not match the number of grouping columns) propagates
# to the caller instead of being turned into `false`.
#
# The accepted dictionary types are exactly the two for which `_dict_to_tuple`
# has methods (and the same ones accepted by `to_index`), so a dictionary with
# a mixed key type such as `Union{Symbol, String}` is not routed to a helper
# that has no method for it.
function Base.haskey(gd::GroupedDataFrame,
                     key::Union{AbstractDict{Symbol}, AbstractDict{<:AbstractString}})
    return haskey(gd, _dict_to_tuple(key, gd))
end

# An integer key is present when it lies between 1 and `length(gd)`, inclusive.
function Base.haskey(gd::GroupedDataFrame, key::Union{Signed,Unsigned})
    return 1 <= key && key <= length(gd)
end

Expand All @@ -580,8 +610,9 @@ Base.haskey(gd::GroupedDataFrame, key::Union{Signed,Unsigned}) =
Get a group based on the values of the grouping columns.
`key` may be a `NamedTuple` or `Tuple` of grouping column values (in the same
order as the `cols` argument to `groupby`).
`key` may be a `GroupKey`, `NamedTuple` or `Tuple` of grouping column values (in the same
order as the `cols` argument to `groupby`). It may also be an `AbstractDict` mapping grouping
column names to values, in which case the order of the entries does not matter.
# Examples
Expand Down Expand Up @@ -625,7 +656,7 @@ julia> get(gd, (:baz,), nothing)
julia> get(gd, (:qux,), nothing)
```
"""
function Base.get(gd::GroupedDataFrame, key::Union{Tuple, NamedTuple}, default)
function Base.get(gd::GroupedDataFrame, key::GroupKeyTypes, default)
try
return gd[key]
catch KeyError
Expand Down
14 changes: 12 additions & 2 deletions src/groupeddataframe/splitapplycombine.jl
Original file line number Diff line number Diff line change
Expand Up @@ -36,8 +36,10 @@ and combines the result into a data frame).
[`GroupKey`](@ref) objects returned by [`keys(::GroupedDataFrame)`](@ref),
which can also be used to get the values of the grouping columns for each group.
`Tuple`s and `NamedTuple`s containing the values of the grouping columns (in the
same order as the `cols` argument) are also accepted as indices, but this will
be slower than using the equivalent `GroupKey`.
same order as the `cols` argument) are also accepted as indices. Finally,
an `AbstractDict` can be used to index into a grouped data frame where
the keys are column names of the data frame. The order of the keys does
not matter in this case.
# See also
Expand Down Expand Up @@ -89,6 +91,14 @@ julia> gd[(a=3,)]
│ 1 │ 3 │ 2 │ 3 │
│ 2 │ 3 │ 2 │ 7 │
julia> gd[Dict("a" => 3)]
2×3 SubDataFrame
│ Row │ a │ b │ c │
│ │ Int64 │ Int64 │ Int64 │
├─────┼───────┼───────┼───────┤
│ 1 │ 3 │ 2 │ 3 │
│ 2 │ 3 │ 2 │ 7 │
julia> gd[(3,)]
2×3 SubDataFrame
│ Row │ a │ b │ c │
Expand Down
28 changes: 26 additions & 2 deletions test/grouping.jl
Original file line number Diff line number Diff line change
Expand Up @@ -1371,29 +1371,53 @@ end
@test gd[NamedTuple(key)] gd[i]
# Plain tuple
@test gd[Tuple(key)] gd[i]
# Dict with `Symbol` keys
@test gd[Dict(key)] gd[i]
# Dict with string keys
@test gd[Dict([String(k) => v for (k, v) in pairs(key)]...)] gd[i]
# Dict with AbstractString keys
@test gd[Dict([Test.GenericString(String(k)) => v for (k, v) in pairs(key)]...)] gd[i]
# Out of order Dict
@test gd[Dict([k => v for (k, v) in Iterators.reverse(pairs(key))]...)] gd[i]
# AbstractDict
@test gd[Test.GenericDict(Dict(key))] gd[i]
end

# Equivalent value of different type
@test gd[(a=:A, b=1.0)] gd[1]

@test get(gd, (a=:A, b=1), nothing) gd[1]
@test get(gd, (a=:A, b=3), nothing) == nothing

@test get(gd, (:A, 1), nothing) gd[1]
@test get(gd, (:A, 3), nothing) == nothing
@test get(gd, first(keys(gd)), gd) gd[1]
@test get(gd, Dict("a" => :A, "b" => 1), nothing) gd[1]
@test get(gd, Dict(:a => :A, :b => 1), nothing) gd[1]
@test get(gd, Dict(:b => 1, :a => :A), nothing) gd[1]
@test get(gd, Dict(:a => :A, :b => 3), nothing) == nothing

# Wrong values
@test_throws KeyError gd[(a=:A, b=3)]
@test_throws KeyError gd[(:A, 3)]
@test_throws KeyError gd[(a=:A, b="1")]
@test_throws KeyError gd[Dict(:a => :A, :b => "1")]
# Wrong length
@test_throws KeyError gd[(a=:A,)]
@test_throws KeyError gd[(:A,)]
@test_throws KeyError gd[(a=:A, b=1, c=1)]
@test_throws KeyError gd[(:A, 1, 1)]
@test_throws KeyError gd[Dict(:a => :A, :b => 1, :c => 2)]
# Out of order
@test_throws KeyError gd[(b=1, a=:A)]
@test_throws KeyError gd[(1, :A)]
# Empty
@test_throws KeyError gd[()]
@test_throws KeyError gd[NamedTuple()]
@test_throws KeyError gd[Dict{String, Any}()]

# Bad Dict types
@test_throws ArgumentError gd[Dict()]
@test_throws ArgumentError gd[Dict(1 => :A, 2 => 1)]
end

@testset "GroupKey and GroupKeys" begin
Expand Down Expand Up @@ -1524,7 +1548,7 @@ end
gkeys = keys(gd)[ints]

# Test with GroupKeys, Tuples, NamedTuples, and Dicts
for converter in [identity, Tuple, NamedTuple]
for converter in [identity, Tuple, NamedTuple, Dict]
a = converter.(gkeys)
@test gd[a] gd2

Expand Down

0 comments on commit d156320

Please sign in to comment.