Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Index to grouped data frame using Dicts #2281

Merged
merged 17 commits into from
Aug 29, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion NEWS.md
Original file line number Diff line number Diff line change
Expand Up @@ -38,11 +38,13 @@
* add `rownumber` to `DataFrameRow` ([#2356](https://github.com/JuliaData/DataFrames.jl/pull/2356))
* allow passing column name to specify the position where a new column should be
inserted in `insertcols!` ([#2365](https://github.com/JuliaData/DataFrames.jl/pull/2365))
* allow `GroupedDataFrame`s to be indexed using a dictionary, which can use `Symbol` or string keys and
does not depend on the order of keys ([#2281](https://github.com/JuliaData/DataFrames.jl/pull/2281))
* add `isapprox` method to check for approximate equality between two dataframes
([#2373](https://github.com/JuliaData/DataFrames.jl/pull/2373))
* add `columnindex` for `DataFrameRow`
([#2380](https://github.com/JuliaData/DataFrames.jl/pull/2380))

## Deprecated

* `DataFrame!` is now deprecated ([#2338](https://github.com/JuliaData/DataFrames.jl/pull/2338))
Expand Down
47 changes: 39 additions & 8 deletions src/groupeddataframe/groupeddataframe.jl
Original file line number Diff line number Diff line change
Expand Up @@ -245,15 +245,17 @@ Base.getindex(gd::GroupedDataFrame, idxs::Colon) =

Key for one of the groups of a [`GroupedDataFrame`](@ref). Contains the values
of the corresponding grouping columns and behaves similarly to a `NamedTuple`,
but using it to index its `GroupedDataFrame` is much more efficient than using the
equivalent `Tuple` or `NamedTuple`.
but using it to index its `GroupedDataFrame` is more efficient than using the
equivalent `Tuple` or `NamedTuple`, and much more efficient than using
the equivalent `AbstractDict`.

Instances of this type are returned by `keys(::GroupedDataFrame)` and are not
meant to be constructed directly.

Indexing fields of `GroupKey` is allowed using an integer, a `Symbol`, or a string.
It is also possible to access the data in a `GroupKey` using the `getproperty`
function. A `GroupKey` can be converted to a `Tuple`, `NamedTuple`, or `Vector`.
function. A `GroupKey` can be converted to a `Tuple`, a `NamedTuple`, a `Vector`, or
a `Dict`. When converted to a `Dict`, the keys of the `Dict` are `Symbol`s.

See [`keys(::GroupedDataFrame)`](@ref) for more information.
"""
Expand Down Expand Up @@ -328,6 +330,8 @@ Base.convert(::Type{Array{T}}, key::GroupKey) where {T} = Vector{T}(key)
# `Array` construction from a `GroupKey` delegates to the matching `Vector`
# constructor, with or without an explicit element type.
function Base.Array(key::GroupKey)
    return Vector(key)
end

function Base.Array{T}(key::GroupKey) where {T}
    return Vector{T}(key)
end

# Build a `Dict` from a `GroupKey` by passing each element of `pairs(key)`
# to the `Dict` constructor as a separate argument.
function Base.Dict(key::GroupKey)
    return Dict(pairs(key)...)
end

# Broadcasting over a `GroupKey` is not supported: any attempt throws an
# `ArgumentError`.
function Base.broadcastable(::GroupKey)
    throw(ArgumentError("broadcasting over `GroupKey`s is reserved"))
end

Expand Down Expand Up @@ -367,7 +371,7 @@ function Base.getindex(gd::GroupedDataFrame, idx...)
end

# The allowed key types for dictionary-like indexing: `GroupKey`s, tuples,
# named tuples and (in the extended definition) dictionaries keyed by `Symbol`s
# or by strings
const GroupKeyTypes = Union{GroupKey, Tuple, NamedTuple}
const GroupKeyTypes = Union{GroupKey, Tuple, NamedTuple, AbstractDict{Symbol}, AbstractDict{<:AbstractString}}
# All allowed scalar index types: integers plus every key type listed above
const GroupIndexTypes = Union{Integer, GroupKeyTypes}

Expand All @@ -387,6 +391,25 @@ function Base.to_index(gd::GroupedDataFrame, key::NamedTuple{N}) where {N}
return Base.to_index(gd, Tuple(key))
end

# Convert a dictionary with string keys into a tuple of values ordered like
# `gd.cols`. Throws `KeyError(key)` when the number of entries differs from the
# number of entries in `gd.cols`; each value is looked up under the `String`
# form of the corresponding element of `gd.cols`.
function _dict_to_tuple(key::AbstractDict{<:AbstractString}, gd::GroupedDataFrame)
    length(key) == length(gd.cols) || throw(KeyError(key))

    return ntuple(length(gd.cols)) do i
        key[String(gd.cols[i])]
    end
end

# Convert a dictionary with `Symbol` keys into a tuple of values ordered like
# `gd.cols`, so that the order of the dictionary entries is irrelevant.
#
# Throws `KeyError(key)` when the number of entries differs from the number of
# entries in `gd.cols`. Each value is looked up directly under the
# corresponding element of `gd.cols`.
function _dict_to_tuple(key::AbstractDict{Symbol}, gd::GroupedDataFrame)
    if length(key) != length(gd.cols)
        throw(KeyError(key))
    end

    return ntuple(i -> key[gd.cols[i]], length(gd.cols))
end

# Dictionary keys are first reordered into a tuple matching `gd.cols`, then
# resolved through the tuple-based `to_index` method.
function Base.to_index(gd::GroupedDataFrame,
                       key::Union{AbstractDict{Symbol},AbstractDict{<:AbstractString}})
    tuplekey = _dict_to_tuple(key, gd)
    return Base.to_index(gd, tuplekey)
end

# Array of (possibly non-standard) indices
function Base.to_index(gd::GroupedDataFrame, idxs::AbstractVector{T}) where {T}
# A concrete eltype which is <: GroupKeyTypes, don't need to check
Expand All @@ -409,6 +432,10 @@ function Base.to_index(gd::GroupedDataFrame, idxs::AbstractVector{T}) where {T}
Tuple
elseif E1 <: NamedTuple
NamedTuple
elseif E1 <: AbstractDict{Symbol}
AbstractDict{Symbol}
elseif E1 <: AbstractDict{<:AbstractString}
AbstractDict{<:AbstractString}
else
throw(ArgumentError("Invalid index: $idx1 of type $E1"))
end
Expand Down Expand Up @@ -463,7 +490,7 @@ end
Get the set of keys for each group of the `GroupedDataFrame` `gd` as a
[`GroupKeys`](@ref) object. Each key is a [`GroupKey`](@ref), which behaves like
a `NamedTuple` holding the values of the grouping columns for a given group.
Unlike the equivalent `Tuple` and `NamedTuple`, these keys can be used to index
Unlike the equivalent `Tuple`, `NamedTuple`, and `AbstractDict`, these keys can be used to index
into `gd` efficiently. The ordering of the keys is identical to the ordering of
the groups of `gd` under iteration and integer indexing.

Expand Down Expand Up @@ -572,6 +599,9 @@ function Base.haskey(gd::GroupedDataFrame, key::NamedTuple{N}) where {N}
return haskey(gd, Tuple(key))
end

# Check for a group using a dictionary key: the dictionary is converted to a
# tuple ordered like `gd.cols` and the tuple-based `haskey` method is used.
function Base.haskey(gd::GroupedDataFrame,
                     key::AbstractDict{<:Union{Symbol, <:AbstractString}})
    tuplekey = _dict_to_tuple(key, gd)
    return haskey(gd, tuplekey)
end

# An integer key is present exactly when it lies between 1 and `length(gd)`,
# inclusive.
function Base.haskey(gd::GroupedDataFrame, key::Union{Signed,Unsigned})
    return 1 <= key <= length(gd)
end

Expand All @@ -580,8 +610,9 @@ Base.haskey(gd::GroupedDataFrame, key::Union{Signed,Unsigned}) =

Get a group based on the values of the grouping columns.

`key` may be a `NamedTuple` or `Tuple` of grouping column values (in the same
order as the `cols` argument to `groupby`).
`key` may be a `GroupKey`, `NamedTuple` or `Tuple` of grouping column values (in the same
order as the `cols` argument to `groupby`). It may also be an `AbstractDict`, in which case the
bkamins marked this conversation as resolved.
Show resolved Hide resolved
order of the keys does not matter.

# Examples

Expand Down Expand Up @@ -625,7 +656,7 @@ julia> get(gd, (:baz,), nothing)
julia> get(gd, (:qux,), nothing)
```
"""
function Base.get(gd::GroupedDataFrame, key::Union{Tuple, NamedTuple}, default)
function Base.get(gd::GroupedDataFrame, key::GroupKeyTypes, default)
try
return gd[key]
catch KeyError
Expand Down
14 changes: 12 additions & 2 deletions src/groupeddataframe/splitapplycombine.jl
Original file line number Diff line number Diff line change
Expand Up @@ -36,8 +36,10 @@ and combines the result into a data frame).
[`GroupKey`](@ref) objects returned by [`keys(::GroupedDataFrame)`](@ref),
which can also be used to get the values of the grouping columns for each group.
`Tuple`s and `NamedTuple`s containing the values of the grouping columns (in the
same order as the `cols` argument) are also accepted as indices, but this will
be slower than using the equivalent `GroupKey`.
same order as the `cols` argument) are also accepted as indices. Finally,
an `AbstractDict` whose keys are the names of the grouping columns can be
used to index into a grouped data frame. The order of the keys does
not matter in this case.

# See also

Expand Down Expand Up @@ -89,6 +91,14 @@ julia> gd[(a=3,)]
│ 1 │ 3 │ 2 │ 3 │
│ 2 │ 3 │ 2 │ 7 │

julia> gd[Dict("a" => 3)]
2×3 SubDataFrame
│ Row │ a │ b │ c │
│ │ Int64 │ Int64 │ Int64 │
├─────┼───────┼───────┼───────┤
│ 1 │ 3 │ 2 │ 3 │
│ 2 │ 3 │ 2 │ 7 │

julia> gd[(3,)]
2×3 SubDataFrame
│ Row │ a │ b │ c │
Expand Down
28 changes: 26 additions & 2 deletions test/grouping.jl
Original file line number Diff line number Diff line change
Expand Up @@ -1371,29 +1371,53 @@ end
@test gd[NamedTuple(key)] ≅ gd[i]
# Plain tuple
@test gd[Tuple(key)] ≅ gd[i]
# Dict with `Symbol` keys
@test gd[Dict(key)] ≅ gd[i]
# Dict with string keys
@test gd[Dict([String(k) => v for (k, v) in pairs(key)]...)] ≅ gd[i]
# Dict with AbstractString keys
@test gd[Dict([Test.GenericString(String(k)) => v for (k, v) in pairs(key)]...)] ≅ gd[i]
# Out of order Dict
@test gd[Dict([k => v for (k, v) in Iterators.reverse(pairs(key))]...)] ≅ gd[i]
# AbstractDict
@test gd[Test.GenericDict(Dict(key))] ≅ gd[i]
end

# Equivalent value of different type
@test gd[(a=:A, b=1.0)] ≅ gd[1]

@test get(gd, (a=:A, b=1), nothing) ≅ gd[1]
@test get(gd, (a=:A, b=3), nothing) == nothing

@test get(gd, (:A, 1), nothing) ≅ gd[1]
@test get(gd, (:A, 3), nothing) == nothing
@test get(gd, first(keys(gd)), gd) ≅ gd[1]
@test get(gd, Dict("a" => :A, "b" => 1), nothing) ≅ gd[1]
@test get(gd, Dict(:a => :A, :b => 1), nothing) ≅ gd[1]
@test get(gd, Dict(:b => 1, :a => :A), nothing) ≅ gd[1]
@test get(gd, Dict(:a => :A, :b => 3), nothing) == nothing

# Wrong values
@test_throws KeyError gd[(a=:A, b=3)]
@test_throws KeyError gd[(:A, 3)]
@test_throws KeyError gd[(a=:A, b="1")]
@test_throws KeyError gd[Dict(:a => :A, :b => "1")]
# Wrong length
@test_throws KeyError gd[(a=:A,)]
@test_throws KeyError gd[(:A,)]
@test_throws KeyError gd[(a=:A, b=1, c=1)]
@test_throws KeyError gd[(:A, 1, 1)]
@test_throws KeyError gd[Dict(:a => :A, :b => 1, :c => 2)]
# Out of order
@test_throws KeyError gd[(b=1, a=:A)]
@test_throws KeyError gd[(1, :A)]
# Empty
@test_throws KeyError gd[()]
@test_throws KeyError gd[NamedTuple()]
@test_throws KeyError gd[Dict{String, Any}()]

# Bad Dict types
@test_throws ArgumentError gd[Dict()]
@test_throws ArgumentError gd[Dict(1 => :A, 2 => 1)]
end

@testset "GroupKey and GroupKeys" begin
Expand Down Expand Up @@ -1524,7 +1548,7 @@ end
gkeys = keys(gd)[ints]

# Test with GroupKeys, Tuples, NamedTuples, and Dicts
for converter in [identity, Tuple, NamedTuple]
for converter in [identity, Tuple, NamedTuple, Dict]
a = converter.(gkeys)
@test gd[a] ≅ gd2

Expand Down