Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[BREAKING] Handle zero groups #2324

Merged
merged 11 commits into from
Aug 4, 2020
10 changes: 4 additions & 6 deletions src/abstractdataframe/io.jl
Original file line number Diff line number Diff line change
Expand Up @@ -194,7 +194,6 @@ end

function Base.show(io::IO, mime::MIME"text/html", gd::GroupedDataFrame)
N = length(gd)
parent_names = _names(gd)
keys = html_escape(join(string.(groupcols(gd)), ", "))
keystr = length(gd.cols) > 1 ? "keys" : "key"
groupstr = N > 1 ? "groups" : "group"
Expand All @@ -203,7 +202,7 @@ function Base.show(io::IO, mime::MIME"text/html", gd::GroupedDataFrame)
nrows = size(gd[1], 1)
rows = nrows > 1 ? "rows" : "row"

identified_groups = [html_escape(string(parent_names[col], " = ",
identified_groups = [html_escape(string(col, " = ",
repr(first(gd[1][!, col]))))
for col in gd.cols]

Expand All @@ -216,7 +215,7 @@ function Base.show(io::IO, mime::MIME"text/html", gd::GroupedDataFrame)
nrows = size(gd[N], 1)
rows = nrows > 1 ? "rows" : "row"

identified_groups = [html_escape(string(parent_names[col], " = ",
identified_groups = [html_escape(string(col, " = ",
repr(first(gd[N][!, col]))))
for col in gd.cols]

Expand Down Expand Up @@ -342,7 +341,6 @@ Base.show(io::IO, mime::MIME"text/latex", dfcs::DataFrameColumns; eltypes::Bool=

function Base.show(io::IO, mime::MIME"text/latex", gd::GroupedDataFrame)
N = length(gd)
parent_names = _names(gd)
keys = join(latex_escape.(string.(groupcols(gd))), ", ")
keystr = length(gd.cols) > 1 ? "keys" : "key"
groupstr = N > 1 ? "groups" : "group"
Expand All @@ -351,7 +349,7 @@ function Base.show(io::IO, mime::MIME"text/latex", gd::GroupedDataFrame)
nrows = size(gd[1], 1)
rows = nrows > 1 ? "rows" : "row"

identified_groups = [latex_escape(string(parent_names[col], " = ",
identified_groups = [latex_escape(string(col, " = ",
repr(first(gd[1][!, col]))))
for col in gd.cols]

Expand All @@ -364,7 +362,7 @@ function Base.show(io::IO, mime::MIME"text/latex", gd::GroupedDataFrame)
nrows = size(gd[N], 1)
rows = nrows > 1 ? "rows" : "row"

identified_groups = [latex_escape(string(parent_names[col], " = ",
identified_groups = [latex_escape(string(col, " = ",
repr(first(gd[N][!, col]))))
for col in gd.cols]

Expand Down
15 changes: 13 additions & 2 deletions src/abstractdataframe/selection.jl
Original file line number Diff line number Diff line change
Expand Up @@ -508,12 +508,16 @@ transform(df::AbstractDataFrame, args...; copycols::Bool=true) =

"""
combine(df::AbstractDataFrame, args...)
combine(arg, df::AbstractDataFrame)

Create a new data frame that contains columns from `df` specified by `args` and
return it. The result can have any number of rows that is determined by the
values returned by passed transformations.

See [`select`](@ref) for detailed rules regarding accepted values for `args`.
See [`select`](@ref) for detailed rules regarding accepted values for `args` in
`combine(df, args...)` form. For `combine(arg, df)` the same rules as for
`combine` on `GroupedDataFrame` apply except that a `df` with zero rows is
currently not allowed.

# Examples
```jldoctest
Expand All @@ -537,7 +541,14 @@ julia> combine(df, :a => sum, nrow)
combine(df::AbstractDataFrame, args...) =
manipulate(df, args..., copycols=true, keeprows=false)

# Apply `arg` to the whole of `df` treated as a single group: `df` is grouped by
# no columns and the call is forwarded to `combine` on the resulting grouping.
# A `df` with zero rows is rejected with an `ArgumentError`, as this form is
# currently not supported for empty data frames.
function combine(arg, df::AbstractDataFrame)
    if nrow(df) == 0
        throw(ArgumentError("calling combine on a data frame with zero rows" *
                            " with transformation as a first argument is " *
                            "currently not supported"))
    end
    return combine(arg, groupby(df, []))
end

manipulate(df::DataFrame, args::AbstractVector{Int}; copycols::Bool, keeprows::Bool) =
DataFrame(_columns(df)[args], Index(_names(df)[args]),
Expand Down
31 changes: 19 additions & 12 deletions src/groupeddataframe/groupeddataframe.jl
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ Not meant to be constructed directly, see `groupby`.
"""
mutable struct GroupedDataFrame{T<:AbstractDataFrame}
parent::T
cols::Vector{Int} # columns used for grouping
cols::Vector{Symbol} # column names used for grouping
groups::Vector{Int} # group indices for each row in 0:ngroups, 0 skipped
idx::Union{Vector{Int},Nothing} # indexing vector sorting rows into groups
starts::Union{Vector{Int},Nothing} # starts of groups after permutation by idx
Expand Down Expand Up @@ -144,19 +144,26 @@ groupindices(gd::GroupedDataFrame) = replace(gd.groups, 0=>missing)

Return a vector of `Symbol` column names in `parent(gd)` used for grouping.
"""
function groupcols(gd::GroupedDataFrame)
    # The grouping column names are stored in `gd.cols`; all of them must be
    # present among the parent's column names, otherwise an error is thrown.
    if !issubset(gd.cols, _names(parent(gd)))
        throw(ErrorException("grouping column names not found in " *
                             "data frame column names"))
    end
    # Return a copy rather than the vector stored in `gd` itself.
    return copy(gd.cols)
end

"""
valuecols(gd::GroupedDataFrame)

Return a vector of `Symbol` column names in `parent(gd)` not used for grouping.
"""
function valuecols(gd::GroupedDataFrame)
    # All grouping column names must be present among the parent's column names.
    if !issubset(gd.cols, _names(parent(gd)))
        throw(ErrorException("grouping column " *
                             "names not found in data frame column names"))
    end
    # Names from `_names(gd)` that are not grouping columns, in the order in which
    # `_names(gd)` lists them.
    return setdiff(_names(gd), gd.cols)
end


# Get the position of grouping variable `name` within `gd.cols`.
# Return `nothing` when `name` is not a grouping column, unless `strict` is `true`,
# in which case an `ArgumentError` is thrown instead.
function _groupvar_idx(gd::GroupedDataFrame, name::Symbol, strict::Bool)
    # Search the stored names directly; no copy of the grouping columns is needed.
    i = findfirst(==(name), gd.cols)
    i === nothing && strict && throw(ArgumentError("$name is not a grouping column"))
    return i
end
Expand Down Expand Up @@ -214,15 +221,15 @@ function Base.getindex(gd::GroupedDataFrame, idxs::AbstractVector{<:Integer})
new_groups[idx[j]] = i
end
end
GroupedDataFrame(gd.parent, gd.cols, new_groups, gd.idx,
GroupedDataFrame(gd.parent, copy(gd.cols), new_groups, gd.idx,
new_starts, new_ends, length(new_starts), nothing,
Threads.ReentrantLock())
end

# Index with colon (creates copy)
function Base.getindex(gd::GroupedDataFrame, idxs::Colon)
Threads.lock(gd.lazy_lock)
new_gd = GroupedDataFrame(gd.parent, gd.cols, gd.groups, getfield(gd, :idx),
new_gd = GroupedDataFrame(gd.parent, copy(gd.cols), gd.groups, getfield(gd, :idx),
bkamins marked this conversation as resolved.
Show resolved Hide resolved
getfield(gd, :starts), getfield(gd, :ends), gd.ngroups,
getfield(gd, :keymap), Threads.ReentrantLock())
Threads.unlock(gd.lazy_lock)
Expand Down Expand Up @@ -259,11 +266,11 @@ end

# Accessors for `GroupKey`. Fields are read with `getfield` and the grouping
# column names are taken from the `cols` field of the key's parent.
Base.parent(key::GroupKey) = getfield(key, :parent)
# Number of grouping columns
Base.length(key::GroupKey) = length(parent(key).cols)
# Grouping column names as strings
Base.names(key::GroupKey) = string.(parent(key).cols)
# Private fields are never exposed since they can conflict with column names
# (a copy is returned, not the vector stored in the parent)
Base.propertynames(key::GroupKey, private::Bool=false) = copy(parent(key).cols)
Base.keys(key::GroupKey) = propertynames(key)
# Membership tests by column name (`Symbol` or string) or by integer position
Base.haskey(key::GroupKey, idx::Symbol) = idx in parent(key).cols
Base.haskey(key::GroupKey, idx::AbstractString) = haskey(key, Symbol(idx))
Base.haskey(key::GroupKey, idx::Union{Signed,Unsigned}) = 1 <= idx <= length(key)
# Values of the key as a `Tuple`
Base.values(key::GroupKey) = Tuple(_groupvalues(parent(key), getfield(key, :idx)))
Expand Down Expand Up @@ -293,7 +300,7 @@ end
Base.getproperty(key::GroupKey, p::AbstractString) = getproperty(key, Symbol(p))

function Base.NamedTuple(key::GroupKey)
    # Field names are the grouping column names stored in the parent's `cols`;
    # the values come from `_groupvalues` for this key's `idx`.
    N = NamedTuple{Tuple(parent(key).cols)}
    return N(_groupvalues(parent(key), getfield(key, :idx)))
end
Base.Tuple(key::GroupKey) = values(key)
Expand Down Expand Up @@ -349,7 +356,7 @@ end
Base.to_index(gd::GroupedDataFrame, key::Tuple) = gd.keymap[key]

function Base.to_index(gd::GroupedDataFrame, key::NamedTuple{N}) where {N}
if length(key) != length(gd.cols) || any(n != _names(gd)[c] for (n, c) in zip(N, gd.cols))
if length(key) != length(gd.cols) || any(n != c for (n, c) in zip(N, gd.cols))
throw(KeyError(key))
end
return Base.to_index(gd, Tuple(key))
Expand Down Expand Up @@ -533,7 +540,7 @@ function Base.haskey(gd::GroupedDataFrame, key::Tuple)
end

function Base.haskey(gd::GroupedDataFrame, key::NamedTuple{N}) where {N}
if any(n != _names(gd)[c] for (n, c) in zip(N, gd.cols))
if length(key) != length(gd.cols) || any(((n, c),) -> n != c, zip(N, gd.cols))
return throw(ArgumentError("The column names of key do not match " *
"the names of grouping columns"))
end
Expand Down
7 changes: 3 additions & 4 deletions src/groupeddataframe/show.jl
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,6 @@ function Base.show(io::IO, gd::GroupedDataFrame;
rowlabel::Symbol = :Row,
summary::Bool = true)
N = length(gd)
parent_names = _names(gd.parent)

summary && Base.summary(io, gd)

Expand All @@ -23,7 +22,7 @@ function Base.show(io::IO, gd::GroupedDataFrame;
nrows = size(gd[i], 1)
rows = nrows > 1 ? "rows" : "row"

identified_groups = [string(parent_names[col], " = ", repr(gd[i][1, col]))
identified_groups = [string(col, " = ", repr(gd[i][1, col]))
for col in gd.cols]

print(io, "\nGroup $i ($nrows $rows): ")
Expand All @@ -37,7 +36,7 @@ function Base.show(io::IO, gd::GroupedDataFrame;
nrows = size(gd[1], 1)
rows = nrows > 1 ? "rows" : "row"

identified_groups = [string(parent_names[col], " = ", repr(gd[1][1, col]))
identified_groups = [string(col, " = ", repr(gd[1][1, col]))
for col in gd.cols]

print(io, "\nFirst Group ($nrows $rows): ")
Expand All @@ -50,7 +49,7 @@ function Base.show(io::IO, gd::GroupedDataFrame;
nrows = size(gd[N], 1)
rows = nrows > 1 ? "rows" : "row"

identified_groups = [string(parent_names[col], " = ", repr(gd[N][1, col]))
identified_groups = [string(col, " = ", repr(gd[N][1, col]))
for col in gd.cols]
print(io, "\n⋮")
print(io, "\nLast Group ($nrows $rows): ")
Expand Down
Loading