Skip to content

Commit

Permalink
Fix grouped maximum, minimum, var and std with only missing values (#…
Browse files Browse the repository at this point in the history
…2063)

When a group only contains missing values, grouped `maximum` and `minimum` have to
throw an error like the corresponding functions. Otherwise they incorrectly return
`typemin`/`typemax`. Also fix `var` and `std` to return `NaN` in that case instead
of `-0.0`.

This problem cannot happen without `skipmissing`, as empty groups are not allowed.
  • Loading branch information
nalimilan authored Dec 26, 2019
1 parent 0ee43f5 commit 8902a08
Show file tree
Hide file tree
Showing 2 changed files with 37 additions and 16 deletions.
40 changes: 24 additions & 16 deletions src/groupeddataframe/splitapplycombine.jl
Original file line number Diff line number Diff line change
Expand Up @@ -477,8 +477,9 @@ struct Reduce{O, C, A} <: AbstractAggregate
op::O
condf::C
adjust::A
checkempty::Bool
end
Reduce(f, condf=nothing) = Reduce(f, condf, nothing)
Reduce(f, condf=nothing, adjust=nothing) = Reduce(f, condf, adjust, false)

check_aggregate(f::Any) = f
check_aggregate(::typeof(sum)) = Reduce(Base.add_sum)
Expand All @@ -488,9 +489,9 @@ check_aggregate(::typeof(minimum)) = Reduce(min)
check_aggregate(::typeof(mean)) = Reduce(Base.add_sum, nothing, /)
check_aggregate(::typeof(sumskipmissing)) = Reduce(Base.add_sum, !ismissing)
check_aggregate(::typeof(prodskipmissing)) = Reduce(Base.mul_prod, !ismissing)
check_aggregate(::typeof(maximumskipmissing)) = Reduce(max, !ismissing)
check_aggregate(::typeof(minimumskipmissing)) = Reduce(min, !ismissing)
check_aggregate(::typeof(meanskipmissing)) = Reduce(Base.add_sum, !ismissing, /)
check_aggregate(::typeof(maximumskipmissing)) = Reduce(max, !ismissing, nothing, true)
check_aggregate(::typeof(minimumskipmissing)) = Reduce(min, !ismissing, nothing, true)

# Other aggregate functions which are not strictly reductions
struct Aggregate{F, C} <: AbstractAggregate
Expand Down Expand Up @@ -609,10 +610,10 @@ function copyto_widen!(res::AbstractVector{T}, x::AbstractVector) where T
return res
end

function groupreduce!(res, f, op, condf, adjust,
function groupreduce!(res, f, op, condf, adjust, checkempty::Bool,
incol::AbstractVector{T}, gd::GroupedDataFrame) where T
n = length(gd)
if adjust !== nothing
if adjust !== nothing || checkempty
counts = zeros(Int, n)
end
groups = gd.groups
Expand All @@ -621,15 +622,20 @@ function groupreduce!(res, f, op, condf, adjust,
x = incol[i]
if gix > 0 && (condf === nothing || condf(x))
res[gix] = op(res[gix], f(x, gix))
adjust !== nothing && (counts[gix] += 1)
if adjust !== nothing || checkempty
counts[gix] += 1
end
end
end
outcol = adjust === nothing ? res : map(adjust, res, counts)
if checkempty && any(iszero, counts)
throw(ArgumentError("some groups contain only missing values"))
end
# Undo pool sharing done by groupreduce_init
if outcol isa CategoricalVector
if outcol isa CategoricalVector && outcol.pool === incol.pool
U = Union{CategoricalArrays.leveltype(outcol),
eltype(outcol) >: Missing ? Missing : Union{}}
outcol = CategoricalArray{U, 1}(outcol.refs, incol.pool)
outcol = CategoricalArray{U, 1}(outcol.refs, copy(outcol.pool))
end
if isconcretetype(eltype(outcol))
return outcol
Expand All @@ -639,29 +645,31 @@ function groupreduce!(res, f, op, condf, adjust,
end

# Function barrier works around type instability of _groupreduce_init due to applicable
groupreduce(f, op, condf, adjust, incol::AbstractVector, gd::GroupedDataFrame) =
groupreduce(f, op, condf, adjust, checkempty::Bool,
incol::AbstractVector, gd::GroupedDataFrame) =
groupreduce!(groupreduce_init(op, condf, incol, gd),
f, op, condf, adjust, incol, gd)
f, op, condf, adjust, checkempty, incol, gd)
# Avoids the overhead due to Missing when computing reduction
groupreduce(f, op, condf::typeof(!ismissing), adjust,
groupreduce(f, op, condf::typeof(!ismissing), adjust, checkempty::Bool,
incol::AbstractVector, gd::GroupedDataFrame) =
groupreduce!(disallowmissing(groupreduce_init(op, condf, incol, gd)),
f, op, condf, adjust, incol, gd)
f, op, condf, adjust, checkempty, incol, gd)

(r::Reduce)(incol::AbstractVector, gd::GroupedDataFrame) =
groupreduce((x, i) -> x, r.op, r.condf, r.adjust, incol, gd)
groupreduce((x, i) -> x, r.op, r.condf, r.adjust, r.checkempty, incol, gd)

function (agg::Aggregate{typeof(var)})(incol::AbstractVector, gd::GroupedDataFrame)
means = groupreduce((x, i) -> x, Base.add_sum, agg.condf, /, incol, gd)
means = groupreduce((x, i) -> x, Base.add_sum, agg.condf, /, false, incol, gd)
# !ismissing check is purely an optimization to avoid a copy later
if eltype(means) >: Missing && agg.condf !== !ismissing
T = Union{Missing, real(eltype(means))}
else
T = real(eltype(means))
end
res = zeros(T, length(gd))
groupreduce!(res, (x, i) -> @inbounds(abs2(x - means[i])), +,
agg.condf, (x, l) -> x / (l-1), incol, gd)
groupreduce!(res, (x, i) -> @inbounds(abs2(x - means[i])), +, agg.condf,
(x, l) -> l <= 1 ? oftype(x / (l-1), NaN) : x / (l-1),
false, incol, gd)
end

function (agg::Aggregate{typeof(std)})(incol::AbstractVector, gd::GroupedDataFrame)
Expand Down
13 changes: 13 additions & 0 deletions test/grouping.jl
Original file line number Diff line number Diff line change
Expand Up @@ -841,6 +841,19 @@ Base.isless(::TestType, ::TestType) = false
expected = combine(gd, y = :x3 => x -> f(collect(skipmissing(x))))
@test res expected
@test typeof(res.y) == typeof(expected.y)

# Test reduction over group with only missing values
gd = groupby_checked(df, :a, skipmissing=skip, sort=sort)
indices && @test gd.idx !== nothing # Trigger computation of indices
gd[1][:, :x3] .= missing
if f in (maximum, minimum, first, last)
@test_throws ArgumentError combine(gd, y = :x3 => fskipmissing)
else
res = combine(gd, y = :x3 => fskipmissing)
expected = combine(gd, y = :x3 => x -> f(collect(skipmissing(x))))
@test res expected
@test typeof(res.y) == typeof(expected.y)
end
end
# Test complex numbers
for f in (sum, prod, mean, var, std, first, last, length)
Expand Down

0 comments on commit 8902a08

Please sign in to comment.