diff --git a/src/groupeddataframe/splitapplycombine.jl b/src/groupeddataframe/splitapplycombine.jl index 72aaa348b6..858a859bd3 100644 --- a/src/groupeddataframe/splitapplycombine.jl +++ b/src/groupeddataframe/splitapplycombine.jl @@ -477,8 +477,9 @@ struct Reduce{O, C, A} <: AbstractAggregate op::O condf::C adjust::A + checkempty::Bool end -Reduce(f, condf=nothing) = Reduce(f, condf, nothing) +Reduce(f, condf=nothing, adjust=nothing) = Reduce(f, condf, adjust, false) check_aggregate(f::Any) = f check_aggregate(::typeof(sum)) = Reduce(Base.add_sum) @@ -488,9 +489,9 @@ check_aggregate(::typeof(minimum)) = Reduce(min) check_aggregate(::typeof(mean)) = Reduce(Base.add_sum, nothing, /) check_aggregate(::typeof(sum∘skipmissing)) = Reduce(Base.add_sum, !ismissing) check_aggregate(::typeof(prod∘skipmissing)) = Reduce(Base.mul_prod, !ismissing) -check_aggregate(::typeof(maximum∘skipmissing)) = Reduce(max, !ismissing) -check_aggregate(::typeof(minimum∘skipmissing)) = Reduce(min, !ismissing) check_aggregate(::typeof(mean∘skipmissing)) = Reduce(Base.add_sum, !ismissing, /) +check_aggregate(::typeof(maximum∘skipmissing)) = Reduce(max, !ismissing, nothing, true) +check_aggregate(::typeof(minimum∘skipmissing)) = Reduce(min, !ismissing, nothing, true) # Other aggregate functions which are not strictly reductions struct Aggregate{F, C} <: AbstractAggregate @@ -609,10 +610,10 @@ function copyto_widen!(res::AbstractVector{T}, x::AbstractVector) where T return res end -function groupreduce!(res, f, op, condf, adjust, +function groupreduce!(res, f, op, condf, adjust, checkempty::Bool, incol::AbstractVector{T}, gd::GroupedDataFrame) where T n = length(gd) - if adjust !== nothing + if adjust !== nothing || checkempty counts = zeros(Int, n) end groups = gd.groups @@ -621,15 +622,20 @@ function groupreduce!(res, f, op, condf, adjust, x = incol[i] if gix > 0 && (condf === nothing || condf(x)) res[gix] = op(res[gix], f(x, gix)) - adjust !== nothing && (counts[gix] += 1) + if adjust !== nothing || checkempty + counts[gix] += 1 + end end end outcol = adjust === nothing ? res : map(adjust, res, counts) + if checkempty && any(iszero, counts) + throw(ArgumentError("some groups contain only missing values")) + end # Undo pool sharing done by groupreduce_init - if outcol isa CategoricalVector + if outcol isa CategoricalVector && outcol.pool === incol.pool U = Union{CategoricalArrays.leveltype(outcol), eltype(outcol) >: Missing ? Missing : Union{}} - outcol = CategoricalArray{U, 1}(outcol.refs, incol.pool) + outcol = CategoricalArray{U, 1}(outcol.refs, copy(outcol.pool)) end if isconcretetype(eltype(outcol)) return outcol @@ -639,20 +645,21 @@ function groupreduce!(res, f, op, condf, adjust, end # Function barrier works around type instability of _groupreduce_init due to applicable -groupreduce(f, op, condf, adjust, incol::AbstractVector, gd::GroupedDataFrame) = +groupreduce(f, op, condf, adjust, checkempty::Bool, + incol::AbstractVector, gd::GroupedDataFrame) = groupreduce!(groupreduce_init(op, condf, incol, gd), - f, op, condf, adjust, incol, gd) + f, op, condf, adjust, checkempty, incol, gd) # Avoids the overhead due to Missing when computing reduction -groupreduce(f, op, condf::typeof(!ismissing), adjust, +groupreduce(f, op, condf::typeof(!ismissing), adjust, checkempty::Bool, incol::AbstractVector, gd::GroupedDataFrame) = groupreduce!(disallowmissing(groupreduce_init(op, condf, incol, gd)), - f, op, condf, adjust, incol, gd) + f, op, condf, adjust, checkempty, incol, gd) (r::Reduce)(incol::AbstractVector, gd::GroupedDataFrame) = - groupreduce((x, i) -> x, r.op, r.condf, r.adjust, incol, gd) + groupreduce((x, i) -> x, r.op, r.condf, r.adjust, r.checkempty, incol, gd) function (agg::Aggregate{typeof(var)})(incol::AbstractVector, gd::GroupedDataFrame) - means = groupreduce((x, i) -> x, Base.add_sum, agg.condf, /, incol, gd) + means = groupreduce((x, i) -> x, Base.add_sum, agg.condf, /, false, incol, gd) # !ismissing check is purely an optimization to avoid a copy later if eltype(means) >: Missing && agg.condf !== !ismissing T = Union{Missing, real(eltype(means))} @@ -660,8 +667,9 @@ function (agg::Aggregate{typeof(var)})(incol::AbstractVector, gd::GroupedDataFra T = real(eltype(means)) end res = zeros(T, length(gd)) - groupreduce!(res, (x, i) -> @inbounds(abs2(x - means[i])), +, - agg.condf, (x, l) -> x / (l-1), incol, gd) + groupreduce!(res, (x, i) -> @inbounds(abs2(x - means[i])), +, agg.condf, + (x, l) -> l <= 1 ? oftype(x / (l-1), NaN) : x / (l-1), + false, incol, gd) end function (agg::Aggregate{typeof(std)})(incol::AbstractVector, gd::GroupedDataFrame) diff --git a/test/grouping.jl b/test/grouping.jl index 4e7a597883..d28f4550a6 100644 --- a/test/grouping.jl +++ b/test/grouping.jl @@ -841,6 +841,19 @@ Base.isless(::TestType, ::TestType) = false expected = combine(gd, y = :x3 => x -> f(collect(skipmissing(x)))) @test res ≅ expected @test typeof(res.y) == typeof(expected.y) + + # Test reduction over group with only missing values + gd = groupby_checked(df, :a, skipmissing=skip, sort=sort) + indices && @test gd.idx !== nothing # Trigger computation of indices + gd[1][:, :x3] .= missing + if f in (maximum, minimum, first, last) + @test_throws ArgumentError combine(gd, y = :x3 => f∘skipmissing) + else + res = combine(gd, y = :x3 => f∘skipmissing) + expected = combine(gd, y = :x3 => x -> f(collect(skipmissing(x)))) + @test res ≅ expected + @test typeof(res.y) == typeof(expected.y) + end end # Test complex numbers for f in (sum, prod, mean, var, std, first, last, length)