diff --git a/NEWS.md b/NEWS.md index 135e857113..f2d0a60fba 100644 --- a/NEWS.md +++ b/NEWS.md @@ -38,11 +38,13 @@ * add `rownumber` to `DataFrameRow` ([#2356](https://github.com/JuliaData/DataFrames.jl/pull/2356)) * allow passing column name to specify the position where a new columns should be inserted in `insertcols!` ([#2365](https://github.com/JuliaData/DataFrames.jl/pull/2365)) +* allow `GroupedDataFrame`s to be indexed using a dictionary, which can use `Symbol` or string keys and + is not dependent on the order of keys ([#2281](https://github.com/JuliaData/DataFrames.jl/pull/2281)) * add `isapprox` method to check for approximate equality between two dataframes ([#2373](https://github.com/JuliaData/DataFrames.jl/pull/2373)) * add `columnindex` for `DataFrameRow` ([#2380](https://github.com/JuliaData/DataFrames.jl/pull/2380)) - + ## Deprecated * `DataFrame!` is now deprecated ([#2338](https://github.com/JuliaData/DataFrames.jl/pull/2338)) diff --git a/src/groupeddataframe/groupeddataframe.jl b/src/groupeddataframe/groupeddataframe.jl index 3297369041..b1c32e84f8 100644 --- a/src/groupeddataframe/groupeddataframe.jl +++ b/src/groupeddataframe/groupeddataframe.jl @@ -245,15 +245,17 @@ Base.getindex(gd::GroupedDataFrame, idxs::Colon) = Key for one of the groups of a [`GroupedDataFrame`](@ref). Contains the values of the corresponding grouping columns and behaves similarly to a `NamedTuple`, -but using it to index its `GroupedDataFrame` is much more effecient than using the -equivalent `Tuple` or `NamedTuple`. +but using it to index its `GroupedDataFrame` is more efficient than using the +equivalent `Tuple` and `NamedTuple`, and much more efficient than using +the equivalent `AbstractDict`. Instances of this type are returned by `keys(::GroupedDataFrame)` and are not meant to be constructed directly. Indexing fields of `GroupKey` is allowed using an integer, a `Symbol`, or a string.
It is also possible to access the data in a `GroupKey` using the `getproperty` -function. A `GroupKey` can be converted to a `Tuple`, `NamedTuple`, or `Vector`. +function. A `GroupKey` can be converted to a `Tuple`, a `NamedTuple`, a `Vector`, or +a `Dict`. When converted to a `Dict`, the keys of the `Dict` are `Symbol`s. See [`keys(::GroupedDataFrame)`](@ref) for more information. """ @@ -328,6 +330,8 @@ Base.convert(::Type{Array{T}}, key::GroupKey) where {T} = Vector{T}(key) Base.Array(key::GroupKey) = Vector(key) Base.Array{T}(key::GroupKey) where {T} = Vector{T}(key) +Base.Dict(key::GroupKey) = Dict(pairs(key)...) + Base.broadcastable(::GroupKey) = throw(ArgumentError("broadcasting over `GroupKey`s is reserved")) @@ -367,7 +371,7 @@ function Base.getindex(gd::GroupedDataFrame, idx...) end # The allowed key types for dictionary-like indexing -const GroupKeyTypes = Union{GroupKey, Tuple, NamedTuple} +const GroupKeyTypes = Union{GroupKey, Tuple, NamedTuple, AbstractDict{Symbol}, AbstractDict{<:AbstractString}} # All allowed scalar index types const GroupIndexTypes = Union{Integer, GroupKeyTypes} @@ -387,6 +391,25 @@ function Base.to_index(gd::GroupedDataFrame, key::NamedTuple{N}) where {N} return Base.to_index(gd, Tuple(key)) end +function _dict_to_tuple(key::AbstractDict{<:AbstractString}, gd::GroupedDataFrame) + if length(key) != length(gd.cols) + throw(KeyError(key)) + end + + return ntuple(i -> key[String(gd.cols[i])], length(gd.cols)) +end + +function _dict_to_tuple(key::AbstractDict{Symbol}, gd::GroupedDataFrame) + if length(key) != length(gd.cols) + throw(KeyError(key)) + end + + return ntuple(i -> key[gd.cols[i]], length(gd.cols)) +end + +Base.to_index(gd::GroupedDataFrame, key::Union{AbstractDict{Symbol},AbstractDict{<:AbstractString}}) = + Base.to_index(gd, _dict_to_tuple(key, gd)) + # Array of (possibly non-standard) indices function Base.to_index(gd::GroupedDataFrame, idxs::AbstractVector{T}) where {T} # A concrete eltype which is <: GroupKeyTypes, don't
need to check @@ -409,6 +432,10 @@ function Base.to_index(gd::GroupedDataFrame, idxs::AbstractVector{T}) where {T} Tuple elseif E1 <: NamedTuple NamedTuple + elseif E1 <: AbstractDict{Symbol} + AbstractDict{Symbol} + elseif E1 <: AbstractDict{<:AbstractString} + AbstractDict{<:AbstractString} else throw(ArgumentError("Invalid index: $idx1 of type $E1")) end @@ -463,7 +490,7 @@ end Get the set of keys for each group of the `GroupedDataFrame` `gd` as a [`GroupKeys`](@ref) object. Each key is a [`GroupKey`](@ref), which behaves like a `NamedTuple` holding the values of the grouping columns for a given group. -Unlike the equivalent `Tuple` and `NamedTuple`, these keys can be used to index +Unlike the equivalent `Tuple`, `NamedTuple`, and `AbstractDict`, these keys can be used to index into `gd` efficiently. The ordering of the keys is identical to the ordering of the groups of `gd` under iteration and integer indexing. @@ -572,6 +599,9 @@ function Base.haskey(gd::GroupedDataFrame, key::NamedTuple{N}) where {N} return haskey(gd, Tuple(key)) end +Base.haskey(gd::GroupedDataFrame, key::AbstractDict{<:Union{Symbol, <:AbstractString}}) = + haskey(gd, _dict_to_tuple(key, gd)) + Base.haskey(gd::GroupedDataFrame, key::Union{Signed,Unsigned}) = 1 <= key <= length(gd) @@ -580,8 +610,9 @@ Base.haskey(gd::GroupedDataFrame, key::Union{Signed,Unsigned}) = Get a group based on the values of the grouping columns. -`key` may be a `NamedTuple` or `Tuple` of grouping column values (in the same -order as the `cols` argument to `groupby`). +`key` may be a `GroupKey`, `NamedTuple` or `Tuple` of grouping column values (in the same +order as the `cols` argument to `groupby`). It may also be an `AbstractDict`, in which case the +order of the keys does not matter.
# Examples @@ -625,7 +656,7 @@ julia> get(gd, (:baz,), nothing) julia> get(gd, (:qux,), nothing) ``` """ -function Base.get(gd::GroupedDataFrame, key::Union{Tuple, NamedTuple}, default) +function Base.get(gd::GroupedDataFrame, key::GroupKeyTypes, default) try return gd[key] catch KeyError diff --git a/src/groupeddataframe/splitapplycombine.jl b/src/groupeddataframe/splitapplycombine.jl index 9ee7c22247..12edf9746e 100644 --- a/src/groupeddataframe/splitapplycombine.jl +++ b/src/groupeddataframe/splitapplycombine.jl @@ -36,8 +36,10 @@ and combines the result into a data frame). [`GroupKey`](@ref) objects returned by [`keys(::GroupedDataFrame)`](@ref), which can also be used to get the values of the grouping columns for each group. `Tuples` and `NamedTuple`s containing the values of the grouping columns (in the -same order as the `cols` argument) are also accepted as indices, but this will -be slower than using the equivalent `GroupKey`. +same order as the `cols` argument) are also accepted as indices. Finally, +an `AbstractDict` can be used to index into a grouped data frame where +the keys are column names of the data frame. The order of the keys does +not matter in this case. 
# See also @@ -89,6 +91,14 @@ julia> gd[(a=3,)] │ 1 │ 3 │ 2 │ 3 │ │ 2 │ 3 │ 2 │ 7 │ +julia> gd[Dict("a" => 3)] +2×3 SubDataFrame +│ Row │ a │ b │ c │ +│ │ Int64 │ Int64 │ Int64 │ +├─────┼───────┼───────┼───────┤ +│ 1 │ 3 │ 2 │ 3 │ +│ 2 │ 3 │ 2 │ 7 │ + julia> gd[(3,)] 2×3 SubDataFrame │ Row │ a │ b │ c │ diff --git a/test/grouping.jl b/test/grouping.jl index bb8254dbc3..49092f2dc7 100644 --- a/test/grouping.jl +++ b/test/grouping.jl @@ -1371,6 +1371,16 @@ end @test gd[NamedTuple(key)] ≅ gd[i] # Plain tuple @test gd[Tuple(key)] ≅ gd[i] + # Dict with `Symbol` keys + @test gd[Dict(key)] ≅ gd[i] + # Dict with string keys + @test gd[Dict([String(k) => v for (k, v) in pairs(key)]...)] ≅ gd[i] + # Dict with AbstractString keys + @test gd[Dict([Test.GenericString(String(k)) => v for (k, v) in pairs(key)]...)] ≅ gd[i] + # Out of order Dict + @test gd[Dict([k => v for (k, v) in Iterators.reverse(pairs(key))]...)] ≅ gd[i] + # AbstractDict + @test gd[Test.GenericDict(Dict(key))] ≅ gd[i] end # Equivalent value of different type @@ -1378,22 +1388,36 @@ end @test get(gd, (a=:A, b=1), nothing) ≅ gd[1] @test get(gd, (a=:A, b=3), nothing) == nothing - + @test get(gd, (:A, 1), nothing) ≅ gd[1] + @test get(gd, (:A, 3), nothing) == nothing + @test get(gd, first(keys(gd)), gd) ≅ gd[1] + @test get(gd, Dict("a" => :A, "b" => 1), nothing) ≅ gd[1] + @test get(gd, Dict(:a => :A, :b => 1), nothing) ≅ gd[1] + @test get(gd, Dict(:b => 1, :a => :A), nothing) ≅ gd[1] + @test get(gd, Dict(:a => :A, :b => 3), nothing) == nothing + # Wrong values @test_throws KeyError gd[(a=:A, b=3)] @test_throws KeyError gd[(:A, 3)] @test_throws KeyError gd[(a=:A, b="1")] + @test_throws KeyError gd[Dict(:a => :A, :b => "1")] # Wrong length @test_throws KeyError gd[(a=:A,)] @test_throws KeyError gd[(:A,)] @test_throws KeyError gd[(a=:A, b=1, c=1)] @test_throws KeyError gd[(:A, 1, 1)] + @test_throws KeyError gd[Dict(:a => :A, :b => 1, :c => 2)] # Out of order @test_throws KeyError gd[(b=1, a=:A)] @test_throws KeyError 
gd[(1, :A)] # Empty @test_throws KeyError gd[()] @test_throws KeyError gd[NamedTuple()] + @test_throws KeyError gd[Dict{String, Any}()] + + # Bad Dict types + @test_throws ArgumentError gd[Dict()] + @test_throws ArgumentError gd[Dict(1 => :A, 2 => 1)] end @testset "GroupKey and GroupKeys" begin @@ -1524,7 +1548,7 @@ end gkeys = keys(gd)[ints] # Test with GroupKeys, Tuples, and NamedTuples - for converter in [identity, Tuple, NamedTuple] + for converter in [identity, Tuple, NamedTuple, Dict] a = converter.(gkeys) @test gd[a] ≅ gd2