[BREAKING] fix isagg to correctly use a fast path #2357

Merged · 8 commits · Aug 13, 2020
Changes from 1 commit
78 changes: 64 additions & 14 deletions src/groupeddataframe/splitapplycombine.jl
@@ -1,3 +1,7 @@
# this constant defines which types of values returned by an aggregation function
# in combine are considered to produce multiple columns in the resulting data frame
const MULTI_COLS_TYPE = Union{AbstractDataFrame, NamedTuple, DataFrameRow, AbstractMatrix}

"""
groupby(d::AbstractDataFrame, cols; sort=false, skipmissing=false)

@@ -452,7 +456,7 @@ function combine(p::Pair, gd::GroupedDataFrame;
# verify if it is not better to use a fast path, which we achieve
# by moving to combine(::GroupedDataFrame, ::AbstractVector) method
# note that even if length(gd) == 0 we can do this step
if isagg(p_from => (p_to isa Pair ? first(p_to) : p_to)) || p_from === nrow
if isagg(p_from => (p_to isa Pair ? first(p_to) : p_to), gd) || p_from === nrow
return combine(gd, [p], keepkeys=keepkeys, ungroup=ungroup)
end

@@ -761,16 +765,37 @@ end
Reduce(f, condf=nothing, adjust=nothing) = Reduce(f, condf, adjust, false)

check_aggregate(f::Any) = f
Member:
Could you combine validate_aggregate with check_aggregate? Basically, we have a fallback which returns f, and for some combinations of function and type we return an optimized object.
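
As a rough illustration of the unification suggested here - a sketch only, not necessarily the shape this PR ends up with - the two functions could be merged so that check_aggregate takes the column as a second argument, falls back to returning the function itself, and returns an optimized object only for supported (function, eltype) combinations:

# hypothetical sketch of merging validate_aggregate into check_aggregate:
# the fallback returns the function unchanged, while supported combinations
# of function and element type return an optimized AbstractAggregate
check_aggregate(f::Any, ::AbstractVector) = f
check_aggregate(::typeof(sum), ::AbstractVector{<:Union{Missing, Number}}) =
    Reduce(Base.add_sum)
check_aggregate(::typeof(maximum), ::AbstractVector{<:Union{Missing, Real}}) =
    Reduce(max)

# isagg then only needs to inspect what was returned, e.g.
# isagg((col, fun)::Pair, gdf::GroupedDataFrame) =
#     col isa ColumnIndex && check_aggregate(fun, parent(gdf)[!, col]) isa AbstractAggregate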

Member Author:
I agree. And I did it now, but it was much easier to handle them separately when developing changes :)

validate_aggregate(f::Any, col::AbstractVector) = false

check_aggregate(::typeof(sum)) = Reduce(Base.add_sum)
validate_aggregate(::typeof(sum), ::AbstractVector{<:Union{Missing, Number}}) = true

check_aggregate(::typeof(sum∘skipmissing)) = Reduce(Base.add_sum, !ismissing)
validate_aggregate(::typeof(sum∘skipmissing), ::AbstractVector{<:Union{Missing, Number}}) = true

check_aggregate(::typeof(prod)) = Reduce(Base.mul_prod)
validate_aggregate(::typeof(prod), ::AbstractVector{<:Union{Missing, Number}}) = true

check_aggregate(::typeof(prod∘skipmissing)) = Reduce(Base.mul_prod, !ismissing)
validate_aggregate(::typeof(prod∘skipmissing), ::AbstractVector{<:Union{Missing, Number}}) = true

check_aggregate(::typeof(maximum)) = Reduce(max)
validate_aggregate(::typeof(maximum), ::AbstractVector{<:Union{Missing, Real}}) = true
Member:
I think these methods work for any type. We just have a faster path when isconcretetype(S) && hasmethod($initf, Tuple{S}), but we don't require typemin and typemax in general. In particular, we have code for CategoricalArray.

Member Author:
I think these methods work for any type.

I think we need this restriction:

julia> df = DataFrame(g=1, x=[[1,2,3], [1,2,3]])
2×2 DataFrame
│ Row │ g     │ x         │
│     │ Int64 │ Array…    │
├─────┼───────┼───────────┤
│ 1   │ 1     │ [1, 2, 3] │
│ 2   │ 1     │ [1, 2, 3] │

julia> gdf = groupby(df, :g)
GroupedDataFrame with 1 group based on key: g
First Group (2 rows): g = 1
│ Row │ g     │ x         │
│     │ Int64 │ Array…    │
├─────┼───────┼───────────┤
│ 1   │ 1     │ [1, 2, 3] │
│ 2   │ 1     │ [1, 2, 3] │

julia> combine(gdf, :x => maximum)
1×2 DataFrame
│ Row │ g     │ x_maximum │
│     │ Int64 │ Array…    │
├─────┼───────┼───────────┤
│ 1   │ 1     │ [1, 2, 3] │

julia> combine(gdf, :x => x -> maximum(x))
3×2 DataFrame
│ Row │ g     │ x_function │
│     │ Int64 │ Int64      │
├─────┼───────┼────────────┤
│ 1   │ 1     │ 1          │
│ 2   │ 1     │ 2          │
│ 3   │ 1     │ 3          │

but I will make the signature looser.

Member:
Damn, again that multi-column issue... So yeah, the fast path needs to be disabled when the column can contain MULTI_COLS_TYPE or AbstractVector.


check_aggregate(::typeof(maximum∘skipmissing)) = Reduce(max, !ismissing, nothing, true)
validate_aggregate(::typeof(maximum∘skipmissing), ::AbstractVector{<:Union{Missing, Real}}) = true

check_aggregate(::typeof(minimum)) = Reduce(min)
validate_aggregate(::typeof(minimum), ::AbstractVector{<:Union{Missing, Real}}) = true

check_aggregate(::typeof(minimum∘skipmissing)) = Reduce(min, !ismissing, nothing, true)
validate_aggregate(::typeof(minimum∘skipmissing), ::AbstractVector{<:Union{Missing, Real}}) = true

check_aggregate(::typeof(mean)) = Reduce(Base.add_sum, nothing, /)
check_aggregate(::typeof(sum∘skipmissing)) = Reduce(Base.add_sum, !ismissing)
check_aggregate(::typeof(prod∘skipmissing)) = Reduce(Base.mul_prod, !ismissing)
validate_aggregate(::typeof(mean), ::AbstractVector{<:Union{Missing, Number}}) = true

check_aggregate(::typeof(mean∘skipmissing)) = Reduce(Base.add_sum, !ismissing, /)
check_aggregate(::typeof(maximum∘skipmissing)) = Reduce(max, !ismissing, nothing, true)
check_aggregate(::typeof(minimum∘skipmissing)) = Reduce(min, !ismissing, nothing, true)
validate_aggregate(::typeof(mean∘skipmissing), ::AbstractVector{<:Union{Missing, Number}}) = true

# Other aggregate functions which are not strictly reductions
struct Aggregate{F, C} <: AbstractAggregate
@@ -780,14 +805,36 @@ end
Aggregate(f) = Aggregate(f, nothing)

check_aggregate(::typeof(var)) = Aggregate(var)
validate_aggregate(::typeof(var), ::AbstractVector{<:Union{Missing, Number}}) = true

check_aggregate(::typeof(var∘skipmissing)) = Aggregate(var, !ismissing)
validate_aggregate(::typeof(var∘skipmissing), ::AbstractVector{<:Union{Missing, Number}}) = true

check_aggregate(::typeof(std)) = Aggregate(std)
validate_aggregate(::typeof(std), ::AbstractVector{<:Union{Missing, Number}}) = true

check_aggregate(::typeof(std∘skipmissing)) = Aggregate(std, !ismissing)
validate_aggregate(::typeof(std∘skipmissing), ::AbstractVector{<:Union{Missing, Number}}) = true

check_aggregate(::typeof(first)) = Aggregate(first)
validate_aggregate(::typeof(first), v::AbstractVector) = eltype(v) === Any ? false : true
validate_aggregate(::typeof(first), ::AbstractVector{<:Union{Missing, MULTI_COLS_TYPE, AbstractVector}}) = false
Member:
Indeed this case is a bit ugly, as code that generates a single column when a data frame stores only scalars will suddenly create multiple columns when it stores e.g. named tuples. Of course this is a more general problem than first and optimized reductions. I wonder whether we should require explicitly mentioning that you want the result to be destructured into multiple columns, e.g. via :x => MultiCol(x -> ...). Without this, we basically consider that only scalars should be stored in data frames, or many things may break.
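
To make the proposal concrete, here is a rough sketch of such an opt-in wrapper (MultiCol is hypothetical and does not exist in DataFrames.jl; the name and semantics are purely illustrative):

# hypothetical opt-in wrapper: only values explicitly wrapped in MultiCol
# would be destructured into multiple columns; bare NamedTuples etc. would
# keep being rejected
struct MultiCol{F}
    f::F
end
(w::MultiCol)(args...) = w.f(args...)

# hypothetical usage: explicitly request two output columns per group
# combine(gdf, :x => MultiCol(x -> (lo=minimum(x), hi=maximum(x))))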

Member Author:
We can indeed discuss this, and it relates to what @pdeffebach wants: allowing destructuring into multiple columns.

Just to be clear the MULTI_COLS_TYPE part is not really problematic in my opinion - it just makes sure we throw an error (as we disallow it now), so in the future we can process it correctly. This is what we try to do consistently everywhere, so that post 1.0 changes here will be non-breaking, as they will currently throw an error (this is your pet trick to handle SemVer AFAICT 😄).

A more problematic case is AbstractVector, as it was simply inconsistent between the slow and fast paths (the fast path was not expanding it, while the slow path does pseudo-broadcasting).

So there are actually two questions:

  • if we want some MultiCol wrapper in the future or just allow multiple cols to be returned and silently accept it (I thought @pdeffebach wanted to silently accept this)
  • do we require opting in for pseudo-broadcasting? We never required it in the past, it was the default, and I think we should keep it.

Although it is a legacy from the very distant past: if I were designing it now, I would not expand anything, in rows or in columns, but just store one result per group in combine and tell users to use flatten to expand it. But this is a completely different design in comparison to what we have now, so this is just a side comment.
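
For illustration, the "store one result per group, then flatten" workflow mentioned above is already expressible with the current API - a sketch assuming DataFrames 0.21-style combine, Ref protection, and flatten:

using DataFrames

df = DataFrame(g=[1, 1, 2, 2], x=[1, 2, 3, 4])
gdf = groupby(df, :g)

# Ref protects the vector-valued result, so :y holds one vector per group
res = combine(gdf, :x => (x -> Ref(x .+ 1)) => :y)

# rows are then expanded explicitly instead of via pseudo-broadcasting
flat = flatten(res, :y)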

Member:
Actually I had forgotten that we don't allow returning MULTI_COLS_TYPE currently. So at least that's safe.

AbstractVector remains problematic though. I wonder whether there are strong use cases for pseudo-broadcasting. It would be safer to make this opt-in for 1.0, otherwise we don't allow working with data frames whose cells contain vectors.

(BTW, instead for MultiCol, maybe we could reuse AsTable, but to wrap the returned value this time.)

Member Author:
I have thought about it when cleaning up the rules of pseudo-broadcasting recently. Here is what we have on 0.21:

julia> df = DataFrame(g=[1,2,3], x=[1:3, 4:6, 7:9])
3×2 DataFrame
│ Row │ g     │ x        │
│     │ Int64 │ UnitRan… │
├─────┼───────┼──────────┤
│ 1   │ 1     │ 1:3      │
│ 2   │ 2     │ 4:6      │
│ 3   │ 3     │ 7:9      │

julia> gdf = groupby(df, :g)
GroupedDataFrame with 3 groups based on key: g
First Group (1 row): g = 1
│ Row │ g     │ x        │
│     │ Int64 │ UnitRan… │
├─────┼───────┼──────────┤
│ 1   │ 1     │ 1:3      │
⋮
Last Group (1 row): g = 3
│ Row │ g     │ x        │
│     │ Int64 │ UnitRan… │
├─────┼───────┼──────────┤
│ 1   │ 3     │ 7:9      │

julia> combine(gdf, :x => first) # this is a wrong result and will be fixed by this PR to match what we have below
3×2 DataFrame
│ Row │ g     │ x_first  │
│     │ Int64 │ UnitRan… │
├─────┼───────┼──────────┤
│ 1   │ 1     │ 1:3      │
│ 2   │ 2     │ 4:6      │
│ 3   │ 3     │ 7:9      │

julia> combine(gdf, :x => x -> first(x)) # this is the intended output
9×2 DataFrame
│ Row │ g     │ x_function │
│     │ Int64 │ Int64      │
├─────┼───────┼────────────┤
│ 1   │ 1     │ 1          │
│ 2   │ 1     │ 2          │
│ 3   │ 1     │ 3          │
│ 4   │ 2     │ 4          │
│ 5   │ 2     │ 5          │
│ 6   │ 2     │ 6          │
│ 7   │ 3     │ 7          │
│ 8   │ 3     │ 8          │
│ 9   │ 3     │ 9          │

julia> combine(gdf, :x => Ref∘first) # this is a currently intended method to protect from unwrapping - just like in standard broadcasting
3×2 DataFrame
│ Row │ g     │ x_function │
│     │ Int64 │ UnitRange… │
├─────┼───────┼────────────┤
│ 1   │ 1     │ 1:3        │
│ 2   │ 2     │ 4:6        │
│ 3   │ 3     │ 7:9        │

So in short - we allow working with data frames that contain vectors as follows:

  1. normally vectors of vectors will be returned and it is not a problem
  2. if user unwraps the vector (like above - with first) then Ref can be used as in Base to protect the result

Member:
Let's continue this discussion in a separate issue?

Member Author:
OK - can you please open the issue explaining what you would want to change? (given the three key considerations: 1) currently we unwrap vectors, 2) Ref protects, 3) in the future we want to add support for multiple columns passing)


check_aggregate(::typeof(first∘skipmissing)) = Aggregate(first, !ismissing)
validate_aggregate(::typeof(first∘skipmissing), v::AbstractVector) = eltype(v) === Any ? false : true
validate_aggregate(::typeof(first∘skipmissing), ::AbstractVector{<:Union{Missing, MULTI_COLS_TYPE, AbstractVector}}) = false

check_aggregate(::typeof(last)) = Aggregate(last)
validate_aggregate(::typeof(last), v::AbstractVector) = eltype(v) === Any ? false : true
validate_aggregate(::typeof(last), ::AbstractVector{<:Union{Missing, MULTI_COLS_TYPE, AbstractVector}}) = false

check_aggregate(::typeof(last∘skipmissing)) = Aggregate(last, !ismissing)
validate_aggregate(::typeof(last∘skipmissing), v::AbstractVector) = eltype(v) === Any ? false : true
validate_aggregate(::typeof(last∘skipmissing), ::AbstractVector{<:Union{Missing, MULTI_COLS_TYPE, AbstractVector}}) = false

check_aggregate(::typeof(length)) = Aggregate(length)
validate_aggregate(::typeof(length), ::AbstractVector) = true

# SkipMissing does not support length

# Find first value matching condition for each group
@@ -864,7 +911,11 @@ function groupreduce_init(op, condf, adjust,
if isconcretetype(Tnm) && applicable(initf, Tnm)
tmpv = initf(Tnm)
initv = op(tmpv, tmpv)
x = adjust isa Nothing ? initv : adjust(initv, 1)
if adjust isa Nothing
x = Tnm <: AbstractIrrational ? float(initv) : initv
Member:
It's too bad that irrationals don't define zero and one so that zero(x) + zero(x) has the same type as x + x... I don't remember offhand why they return Bool but that may have to do with the fact that irrationals promote to whatever type they are combined with.
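
For concreteness, the inconsistency referred to here (behavior as of Julia 1.x at the time of this discussion; worth re-checking on newer versions):

julia> zero(pi), one(pi)             # both Bool
(false, true)

julia> typeof(zero(pi) + zero(pi))   # Bool + Bool promotes to Int
Int64

julia> typeof(pi + pi)               # while pi itself promotes to Float64
Float64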

Member Author:
I didn't design it 😄, but actually it is inconsistent:

julia> zero(pi) + pi
3.141592653589793

and you get Float64. I have commented in JuliaLang/julia#36978 to keep track of it.

else
x = adjust(initv, 1)
end
if condf === !ismissing
V = typeof(x)
else
@@ -900,7 +951,8 @@ for (op, initf) in ((:max, :typemin), (:min, :typemax))
# It is safe to use a non-missing init value
# since missing will poison the result if present
# we assume here that groups are non-empty (current design assures this)
if isconcretetype(S) && hasmethod($initf, Tuple{S})
# + workaround for https://github.com/JuliaLang/julia/issues/36978
if isconcretetype(S) && hasmethod($initf, Tuple{S}) && !(S <: Irrational)
fill!(outcol, $initf(S))
else
fillfirst!(condf, outcol, incol, gd)
@@ -1038,10 +1090,8 @@ function (agg::Aggregate{typeof(length)})(incol::AbstractVector, gd::GroupedData
end
end

isagg(p::Pair) =
check_aggregate(last(p)) isa AbstractAggregate && first(p) isa ColumnIndex

const MULTI_COLS_TYPE = Union{AbstractDataFrame, NamedTuple, DataFrameRow, AbstractMatrix}
isagg((col, fun)::Pair, gdf::GroupedDataFrame) =
col isa ColumnIndex && validate_aggregate(fun, parent(gdf)[!, col])

function _agg2idx_map_helper(idx, idx_agg)
agg2idx_map = fill(-1, length(idx))
@@ -1101,11 +1151,11 @@ function _combine(f::AbstractVector{<:Pair},
end

idx_agg = nothing
if length(gd) > 0 && any(isagg, f)
if length(gd) > 0 && any(x -> isagg(x, gd), f)
# Compute indices of representative rows only once for all AbstractAggregates
idx_agg = Vector{Int}(undef, length(gd))
fillfirst!(nothing, idx_agg, 1:length(gd.groups), gd)
elseif length(gd) == 0 || !all(isagg, f)
elseif length(gd) == 0 || !all(x -> isagg(x, gd), f)
# Trigger computation of indices
# This can speed up some aggregates that would not trigger this on their own
@assert gd.idx !== nothing
@@ -1114,7 +1164,7 @@
parentdf = parent(gd)
for (i, p) in enumerate(f)
source_cols, fun = p
if length(gd) > 0 && isagg(p)
if length(gd) > 0 && isagg(p, gd)
incol = parentdf[!, source_cols]
agg = check_aggregate(last(p))
outcol = agg(incol, gd)
118 changes: 118 additions & 0 deletions test/grouping.jl
@@ -2691,4 +2691,122 @@ end
@test map(nrow, gdf) == [1, 1, 1]
end

@testset "isagg fix" begin
for fun in (sum, prod, mean, var, std, sum∘skipmissing, prod∘skipmissing,
mean∘skipmissing, var∘skipmissing, std∘skipmissing),
col in ([1, 2, 3], [big(1.5), big(2.5), big(3.5)], [1 + 0.5im, 2 + 0.5im, 3 + 0.5im],
[true, false, true], [pi, pi, pi], [1//2, 1//3, 1//4],
Real[1, 1.5, 1//2], Number[1, 1.5, 1//2], Any[1, 1.5, 1//2],
[1, 2, missing], [big(1.5), big(2.5), missing], [1 + 0.5im, 2 + 0.5im, missing],
[true, false, missing], [pi, pi, missing], [1//2, 1//3, missing],
Union{Missing,Real}[1, 1.5, missing],
Union{Missing,Number}[1, 1.5, missing], Any[1, 1.5, missing])
gdf = groupby_checked(DataFrame(g=[1, 1, 1], x=col), :g)
@test combine(gdf, :x => fun => :y) ≅ combine(gdf, :x => (x -> fun(x)) => :y)
end

for fun in (maximum, minimum, maximum∘skipmissing, minimum∘skipmissing),
col in ([1, 2, 3], [big(1.5), big(2.5), big(3.5)],
[true, false, true], [pi, pi, pi], [1//2, 1//3, 1//4],
Real[1, 1.5, 1//2], Number[1, 1.5, 1//2], Any[1, 1.5, 1//2],
[1, 2, missing], [big(1.5), big(2.5), missing],
[true, false, missing], [pi, pi, missing], [1//2, 1//3, missing],
Union{Missing,Real}[1, 1.5, missing],
Union{Missing,Number}[1, 1.5, missing], Any[1, 1.5, missing])
gdf = groupby_checked(DataFrame(g=[1, 1, 1], x=col), :g)
@test combine(gdf, :x => fun => :y) ≅ combine(gdf, :x => (x -> fun(x)) => :y)
end

for fun in (first, last, length, first∘skipmissing, last∘skipmissing),
col in ([1, 2, 3], [big(1.5), big(2.5), big(3.5)], [1 + 0.5im, 2 + 0.5im, 3 + 0.5im],
[true, false, true], [pi, pi, pi], [1//2, 1//3, 1//4],
Real[1, 1.5, 1//2], Number[1, 1.5, 1//2], Any[1, 1.5, 1//2],
[1, 2, missing], [big(1.5), big(2.5), missing], [1 + 0.5im, 2 + 0.5im, missing],
[true, false, missing], [pi, pi, missing], [1//2, 1//3, missing],
Union{Missing,Real}[1, 1.5, missing],
Union{Missing,Number}[1, 1.5, missing], Any[1, 1.5, missing])
gdf = groupby_checked(DataFrame(g=[1, 1, 1], x=col), :g)
if fun isa typeof(last∘skipmissing)
# this is another hard corner case
Member:
Why this special case? last(skipmissing(x)) doesn't work outside DataFrames (maybe it should though), and it doesn't seem to work with any type in DataFrames either.

Member Author:
This is a special case because it works in the fast path but fails in the slow path. I have changed the comment.

Member:
Hmm, it works only with GroupedDataFrame input, not with a DataFrame input. I don't get why.

Also, I thought the goal of this PR was to make the slow and fast path behave the same, so shouldn't this always throw an error whatever the eltype?

Member Author:
I did not see the benefit of last∘skipmissing throwing an error in cases where we can efficiently produce a correct result. The fact that last∘skipmissing errors in Base is only due to the O(1) restriction in the last docstring; e.g. first∘skipmissing works in Base because its docstring does not require O(1). I think we should change Base so that last∘skipmissing works.
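
For reference, the Base asymmetry described above (output sketched for Julia of that era; the exact type printing may differ):

julia> first(skipmissing([missing, 1, 2]))   # iteration-based, works
1

julia> last(skipmissing([1, 2, missing]))    # needs lastindex, which SkipMissing lacks
ERROR: MethodError: no method matching lastindex(::Base.SkipMissing{Array{Union{Missing, Int64},1}})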

it works only with GroupedDataFrame input, not with a DataFrame input. I don't get why.

It should work in combine both for GroupedDataFrame and DataFrame and with select on GroupedDataFrame (and pseudo-broadcasting is applied in this case). It is expected to fail for select on DataFrame. Do you observe a different behaviour?

Member:
Yeah, last(skipmissing(x)) and lastindex(skipmissing(x)) should probably be defined when x is an AbstractArray, since they don't require going over the whole collection.
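
A minimal sketch of what such a definition could look like (not actual Base or DataFrames.jl code; it assumes the SkipMissing wrapper stores the parent collection in its x field, as in current Base):

# hypothetical: scan backwards and return the last non-missing element,
# touching only the trailing run of missings rather than the whole array
function Base.last(s::Base.SkipMissing{<:AbstractVector})
    for i in reverse(eachindex(s.x))
        v = s.x[i]
        v === missing || return v
    end
    throw(ArgumentError("argument contains only missing values"))
end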

It should work in combine both for GroupedDataFrame and DataFrame and with select on GroupedDataFrame (and pseudo-broadcasting is applied in this case). It is expected to fail for select on DataFrame. Do you observe a different behaviour?

I get this

julia> df = DataFrame(x=[1])
1×1 DataFrame
│ Row │ x     │
│     │ Int64 │
├─────┼───────┤
│ 1   │ 1     │

julia> combine(df, :x => last ∘ skipmissing)
ERROR: MethodError: no method matching lastindex(::Base.SkipMissing{Array{Int64,1}})

Member Author:
Yeah - I had forgotten about this path. So you have:

julia> df = DataFrame(x=[1])
1×1 DataFrame
│ Row │ x     │
│     │ Int64 │
├─────┼───────┤
│ 1   │ 1     │

julia> combine(df, :x => last ∘ skipmissing)
ERROR: MethodError: no method matching lastindex(::Base.SkipMissing{Array{Int64,1}})

julia> combine(:x => last ∘ skipmissing, df)
1×1 DataFrame
│ Row │ x_last_skipmissing │
│     │ Int64              │
├─────┼────────────────────┤
│ 1   │ 1                  │

which is unfortunate.

I would not touch it now though, as it is not easy to fix.

In the long term we should rewrite the whole engine for select/transform/combine to be unified. Unfortunately, it was designed in stages, and when initially implementing select and transform we did not envision the full ecosystem we have now, so we essentially have two processing engines - one for combine on GroupedDataFrame and the other for select on DataFrame.

This should be implemented so that select/transform/combine on a DataFrame are always essentially calls on a GroupedDataFrame with a group formed by no columns (a single group). Additional things to remember when implementing it:

  • add better support for 0 groups
  • add possibility for a group to have zero rows
  • add requested extensions to the syntax of source_columns => function => target_col_name

Member:
OK. No hurry, maybe just copy your comment to an issue to keep your analysis available.

Member Author:
All this is tracked in separate issues.

if eltype(col) === Any
@test_throws MethodError combine(gdf, :x => fun => :y)
else
@test combine(gdf, :x => fun => :y) ==
combine(groupby_checked(dropmissing(parent(gdf)), :g), :x => fun => :y)
end
@test_throws MethodError combine(gdf, :x => (x -> fun(x)) => :y)
else
@test combine(gdf, :x => fun => :y) ≅ combine(gdf, :x => (x -> fun(x)) => :y)
end
end

for fun in (sum, mean, var, std),
col in ([1:3, 4:6, 7:9], [1:3, 4:6, missing])
gdf = groupby_checked(DataFrame(g=[1, 1, 1], x=col), :g)
if eltype(col) >: Missing
@test_throws MethodError combine(gdf, :x => fun => :y)
@test_throws MethodError combine(gdf, :x => (x -> fun(x)) => :y)
else
@test combine(gdf, :x => fun => :y) ≅ combine(gdf, :x => (x -> fun(x)) => :y)
end
end

for fun in (sum∘skipmissing, mean∘skipmissing),
col in ([1:3, 4:6, 7:9], [1:3, 4:6, missing])
gdf = groupby_checked(DataFrame(g=[1, 1, 1], x=col), :g)
@test combine(gdf, :x => fun => :y) ≅ combine(gdf, :x => (x -> fun(x)) => :y)
end

# workaround https://github.com/JuliaLang/julia/issues/36979
for fun in (var∘skipmissing, std∘skipmissing),
col in ([1:3, 4:6, 7:9], [1:3, 4:6, missing])
gdf = groupby_checked(DataFrame(g=[1, 1, 1], x=col), :g)
@test_throws MethodError combine(gdf, :x => fun => :y)
@test_throws MethodError combine(gdf, :x => (x -> fun(x)) => :y)
end

for fun in (maximum, minimum, maximum∘skipmissing, minimum∘skipmissing,
first, last, length, first∘skipmissing, last∘skipmissing),
col in ([1:3, 4:6, 7:9], [1:3, 4:6, missing])
gdf = groupby_checked(DataFrame(g=[1, 1, 1], x=col), :g)
if fun isa typeof(last∘skipmissing)
@test_throws MethodError combine(gdf, :x => fun => :y)
@test_throws MethodError combine(gdf, :x => (x -> fun(x)) => :y)
else
@test combine(gdf, :x => fun => :y) ≅ combine(gdf, :x => (x -> fun(x)) => :y)
end
end

for fun in (prod, prod∘skipmissing),
col in ([1:3, 4:6, 7:9], [1:3, 4:6, missing])
gdf = groupby_checked(DataFrame(g=[1, 1, 1], x=col), :g)
@test_throws MethodError combine(gdf, :x => fun => :y)
@test_throws MethodError combine(gdf, :x => (x -> fun(x)) => :y)
end

for fun in (sum, prod, mean, var, std, sum∘skipmissing, prod∘skipmissing,
mean∘skipmissing, var∘skipmissing, std∘skipmissing,
maximum, minimum, maximum∘skipmissing, minimum∘skipmissing,
first, last, length, first∘skipmissing, last∘skipmissing),
col in ([ones(2,2), zeros(2,2), ones(2,2)], [ones(2,2), zeros(2,2), missing],
[DataFrame(ones(2,2)), DataFrame(zeros(2,2)), DataFrame(ones(2,2))],
[DataFrame(ones(2,2)), DataFrame(zeros(2,2)), ones(2,2)],
[DataFrame(ones(2,2)), DataFrame(zeros(2,2)), missing],
[(a=1, b=2), (a=3, b=4), (a=5, b=6)], [(a=1, b=2), (a=3, b=4), missing])
gdf = groupby_checked(DataFrame(g=[1, 1, 1], x=col), :g)
if fun isa typeof(length)
@test combine(gdf, :x => fun => :y) ≅ DataFrame(g=1, y=3)
@test combine(gdf, :x => (x -> fun(x)) => :y) ≅ DataFrame(g=1, y=3)
elseif (fun isa typeof(last) && ismissing(last(col))) ||
(fun isa Union{typeof(maximum), typeof(minimum)} && col ≅ [(a=1, b=2), (a=3, b=4), missing])
# deciding what to do in this case is a hard problem and probably we have to leave it as is
@test combine(gdf, :x => fun => :y) ≅ DataFrame(g=1, y=missing)
@test combine(gdf, :x => (x -> fun(x)) => :y) ≅ DataFrame(g=1, y=missing)
else
@test_throws Union{ArgumentError, MethodError} combine(gdf, :x => fun => :y)
@test_throws Union{ArgumentError, MethodError} combine(gdf, :x => (x -> fun(x)) => :y)
end
end
end

end # module