JuliaData · bkamins · Aug 4, 2020 · Jul 6, 2020 · Jul 21, 2020 · Jul 22, 2020
diff --git a/src/abstractdataframe/io.jl b/src/abstractdataframe/io.jl
@@ -194,7 +194,6 @@ end
 
 function Base.show(io::IO, mime::MIME"text/html", gd::GroupedDataFrame)
     N = length(gd)
-    parent_names = _names(gd)
     keys = html_escape(join(string.(groupcols(gd)), ", "))
     keystr = length(gd.cols) > 1 ? "keys" : "key"
     groupstr = N > 1 ? "groups" : "group"
@@ -203,7 +202,7 @@ function Base.show(io::IO, mime::MIME"text/html", gd::GroupedDataFrame)
         nrows = size(gd[1], 1)
         rows = nrows > 1 ? "rows" : "row"
 
-        identified_groups = [html_escape(string(parent_names[col], " = ",
+        identified_groups = [html_escape(string(col, " = ",
                                                 repr(first(gd[1][!, col]))))
                              for col in gd.cols]
 
@@ -216,7 +215,7 @@ function Base.show(io::IO, mime::MIME"text/html", gd::GroupedDataFrame)
         nrows = size(gd[N], 1)
         rows = nrows > 1 ? "rows" : "row"
 
-        identified_groups = [html_escape(string(parent_names[col], " = ",
+        identified_groups = [html_escape(string(col, " = ",
                                                 repr(first(gd[N][!, col]))))
                              for col in gd.cols]
 
@@ -342,7 +341,6 @@ Base.show(io::IO, mime::MIME"text/latex", dfcs::DataFrameColumns; eltypes::Bool=
 
 function Base.show(io::IO, mime::MIME"text/latex", gd::GroupedDataFrame)
     N = length(gd)
-    parent_names = _names(gd)
     keys = join(latex_escape.(string.(groupcols(gd))), ", ")
     keystr = length(gd.cols) > 1 ? "keys" : "key"
     groupstr = N > 1 ? "groups" : "group"
@@ -351,7 +349,7 @@ function Base.show(io::IO, mime::MIME"text/latex", gd::GroupedDataFrame)
         nrows = size(gd[1], 1)
         rows = nrows > 1 ? "rows" : "row"
 
-        identified_groups = [latex_escape(string(parent_names[col], " = ",
+        identified_groups = [latex_escape(string(col, " = ",
                                                  repr(first(gd[1][!, col]))))
                              for col in gd.cols]
 
@@ -364,7 +362,7 @@ function Base.show(io::IO, mime::MIME"text/latex", gd::GroupedDataFrame)
         nrows = size(gd[N], 1)
         rows = nrows > 1 ? "rows" : "row"
 
-        identified_groups = [latex_escape(string(parent_names[col], " = ",
+        identified_groups = [latex_escape(string(col, " = ",
                                                  repr(first(gd[N][!, col]))))
                              for col in gd.cols]
 

diff --git a/src/abstractdataframe/selection.jl b/src/abstractdataframe/selection.jl
@@ -508,12 +508,16 @@ transform(df::AbstractDataFrame, args...; copycols::Bool=true) =
 
 """
     combine(df::AbstractDataFrame, args...)
+    combine(arg, df::AbstractDataFrame)
 
 Create a new data frame that contains columns from `df` specified by `args` and
 return it. The result can have any number of rows that is determined by the
 values returned by passed transformations.
 
-See [`select`](@ref) for detailed rules regarding accepted values for `args`.
+See [`select`](@ref) for detailed rules regarding accepted values for `args` in
+`combine(df, args...)` form. For `combine(arg, df)` the same rules as for
+`combine` on `GroupedDataFrame` apply except that a `df` with zero rows is
+currently not allowed.
 
 # Examples
 ```jldoctest
@@ -537,7 +541,14 @@ julia> combine(df, :a => sum, nrow)
 combine(df::AbstractDataFrame, args...) =
     manipulate(df, args..., copycols=true, keeprows=false)
 
-combine(arg, df::AbstractDataFrame) = combine(arg, groupby(df, []))
+function combine(arg, df::AbstractDataFrame)
+    # Guard clause: a data frame with zero rows is rejected before grouping.
+    nrow(df) == 0 &&
+        throw(ArgumentError("calling combine on a data frame with zero rows" *
+                            " with transformation as a first argument is " *
+                            "currently not supported"))
+    return combine(arg, groupby(df, []))
+end
 
 manipulate(df::DataFrame, args::AbstractVector{Int}; copycols::Bool, keeprows::Bool) =
     DataFrame(_columns(df)[args], Index(_names(df)[args]),

diff --git a/src/groupeddataframe/groupeddataframe.jl b/src/groupeddataframe/groupeddataframe.jl
@@ -22,7 +22,7 @@ Not meant to be constructed directly, see `groupby`.
 """
 mutable struct GroupedDataFrame{T<:AbstractDataFrame}
     parent::T
-    cols::Vector{Int}                    # columns used for grouping
+    cols::Vector{Symbol}                 # column names used for grouping
     groups::Vector{Int}                  # group indices for each row in 0:ngroups, 0 skipped
     idx::Union{Vector{Int},Nothing}      # indexing vector sorting rows into groups
     starts::Union{Vector{Int},Nothing}   # starts of groups after permutation by idx
@@ -144,19 +144,26 @@ groupindices(gd::GroupedDataFrame) = replace(gd.groups, 0=>missing)
 
 Return a vector of `Symbol` column names in `parent(gd)` used for grouping.
 """
-groupcols(gd::GroupedDataFrame) = _names(gd)[gd.cols]
+function groupcols(gd::GroupedDataFrame)
+    msg = "grouping column names not found in data frame column names"
+    return issubset(gd.cols, _names(parent(gd))) ? copy(gd.cols) : error(msg)
+end
 
 """
     valuecols(gd::GroupedDataFrame)
 
 Return a vector of `Symbol` column names in `parent(gd)` not used for grouping.
 """
-valuecols(gd::GroupedDataFrame) = _names(gd)[Not(gd.cols)]
+function valuecols(gd::GroupedDataFrame)
+    # Succeed only while every grouping name is still a column of the parent.
+    issubset(gd.cols, _names(parent(gd))) && return setdiff(_names(gd), gd.cols)
+    throw(ErrorException("grouping column " *
+        "names not found in data frame column names"))
+end
 
 
 # Get grouping variable index by its name
 function _groupvar_idx(gd::GroupedDataFrame, name::Symbol, strict::Bool)
-    i = findfirst(==(name), groupcols(gd))
+    i = findfirst(==(name), gd.cols)  # index within gd.cols, or `nothing` if absent
     i === nothing && strict && throw(ArgumentError("$name is not a grouping column"))
     return i
 end
@@ -214,15 +221,15 @@ function Base.getindex(gd::GroupedDataFrame, idxs::AbstractVector{<:Integer})
             new_groups[idx[j]] = i
         end
     end
-    GroupedDataFrame(gd.parent, gd.cols, new_groups, gd.idx,
+    GroupedDataFrame(gd.parent, copy(gd.cols), new_groups, gd.idx,
                      new_starts, new_ends, length(new_starts), nothing,
                      Threads.ReentrantLock())
 end
 
 # Index with colon (creates copy)
 function Base.getindex(gd::GroupedDataFrame, idxs::Colon)
     Threads.lock(gd.lazy_lock)
-    new_gd = GroupedDataFrame(gd.parent, gd.cols, gd.groups, getfield(gd, :idx),
+    new_gd = GroupedDataFrame(gd.parent, copy(gd.cols), gd.groups, getfield(gd, :idx),
                               getfield(gd, :starts), getfield(gd, :ends), gd.ngroups,
                               getfield(gd, :keymap), Threads.ReentrantLock())
     Threads.unlock(gd.lazy_lock)
@@ -259,11 +266,11 @@ end
 
 Base.parent(key::GroupKey) = getfield(key, :parent)
 Base.length(key::GroupKey) = length(parent(key).cols)
-Base.names(key::GroupKey) = string.(groupcols(parent(key)))
+Base.names(key::GroupKey) = string.(parent(key).cols)  # grouping names as Strings
 # Private fields are never exposed since they can conflict with column names
-Base.propertynames(key::GroupKey, private::Bool=false) = groupcols(parent(key))
+Base.propertynames(key::GroupKey, private::Bool=false) = copy(parent(key).cols)
 Base.keys(key::GroupKey) = propertynames(key)
-Base.haskey(key::GroupKey, idx::Symbol) = idx in groupcols(parent(key))
+Base.haskey(key::GroupKey, idx::Symbol) = idx in parent(key).cols  # exact Symbol match
 Base.haskey(key::GroupKey, idx::AbstractString) = haskey(key, Symbol(idx))
 Base.haskey(key::GroupKey, idx::Union{Signed,Unsigned}) = 1 <= idx <= length(key)
 Base.values(key::GroupKey) = Tuple(_groupvalues(parent(key), getfield(key, :idx)))
@@ -293,7 +300,7 @@ end
 Base.getproperty(key::GroupKey, p::AbstractString) = getproperty(key, Symbol(p))
 
 function Base.NamedTuple(key::GroupKey)
-    N = NamedTuple{Tuple(groupcols(parent(key)))}
+    N = NamedTuple{Tuple(parent(key).cols)}  # type keyed by the grouping column names
     N(_groupvalues(parent(key), getfield(key, :idx)))
 end
 Base.Tuple(key::GroupKey) = values(key)
@@ -349,7 +356,7 @@ end
 Base.to_index(gd::GroupedDataFrame, key::Tuple) = gd.keymap[key]
 
 function Base.to_index(gd::GroupedDataFrame, key::NamedTuple{N}) where {N}
-    if length(key) != length(gd.cols) || any(n != _names(gd)[c] for (n, c) in zip(N, gd.cols))
+    if length(key) != length(gd.cols) || any(n != c for (n, c) in zip(N, gd.cols))
         throw(KeyError(key))
     end
     return Base.to_index(gd, Tuple(key))
@@ -533,7 +540,7 @@ function Base.haskey(gd::GroupedDataFrame, key::Tuple)
 end
 
 function Base.haskey(gd::GroupedDataFrame, key::NamedTuple{N}) where {N}
-    if any(n != _names(gd)[c] for (n, c) in zip(N, gd.cols))
+    if length(key) != length(gd.cols) || any(((n, c),) -> n != c, zip(N, gd.cols))
         return throw(ArgumentError("The column names of key do not match " *
                                    "the names of grouping columns"))
     end

diff --git a/src/groupeddataframe/show.jl b/src/groupeddataframe/show.jl
@@ -14,7 +14,6 @@ function Base.show(io::IO, gd::GroupedDataFrame;
                    rowlabel::Symbol = :Row,
                    summary::Bool = true)
     N = length(gd)
-    parent_names = _names(gd.parent)
 
     summary && Base.summary(io, gd)
 
@@ -23,7 +22,7 @@ function Base.show(io::IO, gd::GroupedDataFrame;
             nrows = size(gd[i], 1)
             rows = nrows > 1 ? "rows" : "row"
 
-            identified_groups = [string(parent_names[col], " = ", repr(gd[i][1, col]))
+            identified_groups = [string(col, " = ", repr(gd[i][1, col]))
                                  for col in gd.cols]
 
             print(io, "\nGroup $i ($nrows $rows): ")
@@ -37,7 +36,7 @@ function Base.show(io::IO, gd::GroupedDataFrame;
             nrows = size(gd[1], 1)
             rows = nrows > 1 ? "rows" : "row"
 
-            identified_groups = [string(parent_names[col], " = ", repr(gd[1][1, col]))
+            identified_groups = [string(col, " = ", repr(gd[1][1, col]))
                                  for col in gd.cols]
 
             print(io, "\nFirst Group ($nrows $rows): ")
@@ -50,7 +49,7 @@ function Base.show(io::IO, gd::GroupedDataFrame;
             nrows = size(gd[N], 1)
             rows = nrows > 1 ? "rows" : "row"
 
-            identified_groups = [string(parent_names[col], " = ", repr(gd[N][1, col]))
+            identified_groups = [string(col, " = ", repr(gd[N][1, col]))
                                  for col in gd.cols]
             print(io, "\n⋮")
             print(io, "\nLast Group ($nrows $rows): ")