JuliaStats · mbauman · Aug 27, 2024 · May 14, 2025
diff --git a/src/Statistics.jl b/src/Statistics.jl
@@ -211,6 +211,7 @@ realXcY(x::Complex, y::Complex) = real(x)*real(y) + imag(x)*imag(y)
 var(iterable; corrected::Bool=true, mean=nothing) = _var(iterable, corrected, mean)
 
 function _var(iterable, corrected::Bool, mean)
+    ismissing(mean) && return missing
     y = iterate(iterable)
     if y === nothing
         T = eltype(iterable)
@@ -252,61 +253,36 @@ function _var(iterable, corrected::Bool, mean)
     end
 end
 
-centralizedabs2fun(m) = x -> abs2.(x - m)
-centralize_sumabs2(A::AbstractArray, m) =
-    mapreduce(centralizedabs2fun(m), +, A)
-centralize_sumabs2(A::AbstractArray, m, ifirst::Int, ilast::Int) =
-    Base.mapreduce_impl(centralizedabs2fun(m), +, A, ifirst, ilast)
-
-function centralize_sumabs2!(R::AbstractArray{S}, A::AbstractArray, means::AbstractArray) where S
-    # following the implementation of _mapreducedim! at base/reducedim.jl
-    lsiz = Base.check_reducedims(R,A)
-    for i in 1:max(ndims(R), ndims(means))
-        if axes(means, i) != axes(R, i)
-            throw(DimensionMismatch("dimension $i of `mean` should have indices $(axes(R, i)), but got $(axes(means, i))"))
-        end
-    end
-    isempty(R) || fill!(R, zero(S))
-    isempty(A) && return R
-
-    if Base.has_fast_linear_indexing(A) && lsiz > 16 && !has_offset_axes(R, means)
-        nslices = div(length(A), lsiz)
-        ibase = first(LinearIndices(A))-1
-        for i = 1:nslices
-            @inbounds R[i] = centralize_sumabs2(A, means[i], ibase+1, ibase+lsiz)
-            ibase += lsiz
-        end
-        return R
-    end
-    indsAt, indsRt = Base.safe_tail(axes(A)), Base.safe_tail(axes(R)) # handle d=1 manually
-    keep, Idefault = Broadcast.shapeindexer(indsRt)
-    if Base.reducedim1(R, A)
-        i1 = first(Base.axes1(R))
-        @inbounds for IA in CartesianIndices(indsAt)
-            IR = Broadcast.newindex(IA, keep, Idefault)
-            r = R[i1,IR]
-            m = means[i1,IR]
-            @simd for i in axes(A, 1)
-                r += abs2(A[i,IA] - m)
-            end
-            R[i1,IR] = r
-        end
-    else
-        @inbounds for IA in CartesianIndices(indsAt)
-            IR = Broadcast.newindex(IA, keep, Idefault)
-            @simd for i in axes(A, 1)
-                R[i,IR] += abs2(A[i,IA] - means[i,IR])
-            end
-        end
-    end
-    return R
+struct CentralizedAbs2Fun{T,S} <: Function
+    mean::S
 end
+CentralizedAbs2Fun{T}(means) where {T} = CentralizedAbs2Fun{T,typeof(means)}(means)
+CentralizedAbs2Fun(means) = CentralizedAbs2Fun{typeof(means)}(means)
+CentralizedAbs2Fun(means, extrude) = CentralizedAbs2Fun{eltype(means)}(Broadcast.extrude(means))
+# Division is generally costly, but Julia is typically able to constant propagate a /1
+# and simply ensure we get the type right at no cost, allowing the division in-place later
+(f::CentralizedAbs2Fun)(x) = abs2.(x - f.mean)/1
+(f::CentralizedAbs2Fun{<:Any,<:Broadcast.Extruded})((i, x),) = abs2.(x - Broadcast._broadcast_getindex(f.mean, i))/1
+_doubled(x) = x+x
+Base.mapreduce_empty(::CentralizedAbs2Fun{T,<:Broadcast.Extruded}, ::typeof(Base.add_sum), ::Type{Tuple{_Any,S}}) where {T<:Number, S<:Number, _Any} = _doubled(abs2(zero(T)-zero(S)))/1
+Base.mapreduce_empty(::CentralizedAbs2Fun{T,<:Broadcast.Extruded}, ::typeof(Base.add_sum), ::Type{Tuple{_Any, Union{Missing, S}}}) where {T<:Number, S<:Number, _Any} = _doubled(abs2(zero(T)-zero(S)))/1
+Base.mapreduce_empty(::CentralizedAbs2Fun{T}, ::typeof(Base.add_sum), ::Type{S}) where {T<:Number, S<:Number} = _doubled(abs2(zero(T)-zero(S)))/1
+Base.mapreduce_empty(::CentralizedAbs2Fun{T}, ::typeof(Base.add_sum), ::Type{Union{Missing, S}}) where {T<:Number, S<:Number} = _doubled(abs2(zero(T)-zero(S)))/1
+
+centralize_sumabs2(A::AbstractArray, m) =
+    sum(CentralizedAbs2Fun(m), A)
+centralize_sumabs2(A::AbstractArray, m::AbstractArray, region) =
+    sum(CentralizedAbs2Fun(m, true), Base.PairsArray(A), dims=region)
+centralize_sumabs2!(R::AbstractArray, A::AbstractArray, means::AbstractArray) =
+    sum!(CentralizedAbs2Fun(means, true), R, Base.PairsArray(A))
+
 
 function varm!(R::AbstractArray{S}, A::AbstractArray, m::AbstractArray; corrected::Bool=true) where S
-    if isempty(A)
+    _checkm(R, m, ntuple(identity, Val(max(ndims(R), ndims(m)))))
+    if isempty(A) || length(A) == 1 && corrected
         fill!(R, convert(S, NaN))
     else
-        rn = div(length(A), length(R)) - Int(corrected)
+        rn = prod(ntuple(d->size(R, d) == 1 ? size(A, d) : 1, Val(max(ndims(A), ndims(R))))) - Int(corrected)
         centralize_sumabs2!(R, A, m)
         R .= R .* (1 // rn)
     end
@@ -339,15 +315,33 @@ over dimensions. In that case, `mean` must be an array with the same shape as
 """
 varm(A::AbstractArray, m::AbstractArray; corrected::Bool=true, dims=:) = _varm(A, m, corrected, dims)
 
-_varm(A::AbstractArray{T}, m, corrected::Bool, region) where {T} =
-    varm!(Base.reducedim_init(t -> abs2(t)/2, +, A, region), A, m; corrected=corrected)
+_throw_mean_mismatch(A, m, region) = throw(DimensionMismatch("axes of means ($(axes(m))) does not match reduction over $(region) of $(axes(A))"))
+function _checkm(A::AbstractArray, m::AbstractArray, region)
+    for d in 1:max(ndims(A), ndims(m))
+        if d in region
+            size(m, d) == 1 || _throw_mean_mismatch(A, m, region)
+        else
+            axes(m, d) == axes(A, d) || _throw_mean_mismatch(A, m, region)
+        end
+    end
+end
+function _varm(A::AbstractArray, m, corrected::Bool, region)
+    _checkm(A, m, region)
+    rn = prod(ntuple(d->d in region ? size(A, d) : 1, Val(ndims(A)))) - Int(corrected)
+    R = centralize_sumabs2(A, m, region)
+    if rn <= 0
+        R .= R ./ 0
+    else
+        R .= R .* 1//rn # why use Rational?
+    end
+    return R
+end
 
 varm(A::AbstractArray, m; corrected::Bool=true) = _varm(A, m, corrected, :)
 
 function _varm(A::AbstractArray{T}, m, corrected::Bool, ::Colon) where T
-    n = length(A)
-    n == 0 && return oftype((abs2(zero(T)) + abs2(zero(T)))/2, NaN)
-    return centralize_sumabs2(A, m) / (n - Int(corrected))
+    rn = max(length(A) - Int(corrected), 0)
+    centralize_sumabs2(A, m)/rn
 end
 
 

diff --git a/test/runtests.jl b/test/runtests.jl
@@ -522,9 +522,15 @@ Y = [6.0  2.0;
     @testset "cov with missing" begin
         @test cov([missing]) === cov([1, missing]) === missing
         @test cov([1, missing], [2, 3]) === cov([1, 3], [2, missing]) === missing
-        @test_throws Exception cov([1 missing; 2 3])
-        @test_throws Exception cov([1 missing; 2 3], [1, 2])
-        @test_throws Exception cov([1, 2], [1 missing; 2 3])
+        if isdefined(Base, :_reducedim_init)
+            @test_broken isequal(coalesce.(cov([1 missing; 2 3]), NaN), cov([1 NaN; 2 3]))
+            @test_broken isequal(coalesce.(cov([1 missing; 2 3], [1, 2]), NaN), cov([1 NaN; 2 3], [1, 2]))
+            @test_broken isequal(coalesce.(cov([1, 2], [1 missing; 2 3]), NaN), cov([1, 2], [1 NaN; 2 3]))
+        else
+            @test isequal(coalesce.(cov([1 missing; 2 3]), NaN), cov([1 NaN; 2 3]))
+            @test isequal(coalesce.(cov([1 missing; 2 3], [1, 2]), NaN), cov([1 NaN; 2 3], [1, 2]))
+            @test isequal(coalesce.(cov([1, 2], [1 missing; 2 3]), NaN), cov([1, 2], [1 NaN; 2 3]))
+        end
         @test isequal(cov([1 2; 2 3], [1, missing]), [missing missing]')
         @test isequal(cov([1, missing], [1 2; 2 3]), [missing missing])
     end
@@ -633,9 +639,15 @@ end
         @test cor([missing]) === missing
         @test cor([1, missing]) == 1
         @test cor([1, missing], [2, 3]) === cor([1, 3], [2, missing]) === missing
-        @test_throws Exception cor([1 missing; 2 3])
-        @test_throws Exception cor([1 missing; 2 3], [1, 2])
-        @test_throws Exception cor([1, 2], [1 missing; 2 3])
+        if isdefined(Base, :_reducedim_init)
+            @test_broken isequal(coalesce.(cor([1 missing; 2 3]), NaN), cor([1 NaN; 2 3]))
+            @test_broken isequal(coalesce.(cor([1 missing; 2 3], [1, 2]), NaN), cor([1 NaN; 2 3], [1, 2]))
+            @test_broken isequal(coalesce.(cor([1, 2], [1 missing; 2 3]), NaN), cor([1, 2], [1 NaN; 2 3]))
+        else
+            @test isequal(coalesce.(cor([1 missing; 2 3]), NaN), cor([1 NaN; 2 3]))
+            @test isequal(coalesce.(cor([1 missing; 2 3], [1, 2]), NaN), cor([1 NaN; 2 3], [1, 2]))
+            @test isequal(coalesce.(cor([1, 2], [1 missing; 2 3]), NaN), cor([1, 2], [1 NaN; 2 3]))
+        end
         @test isequal(cor([1 2; 2 3], [1, missing]), [missing missing]')
         @test isequal(cor([1, missing], [1 2; 2 3]), [missing missing])
     end
@@ -1072,3 +1084,19 @@ end
     @test isequal(cov(Int[], my), fill(-0.0, 1, 3))
     @test isequal(cor(Int[], my), fill(NaN, 1, 3))
 end
+
+@testset "mean, var, std type stability with Missings; Issue #160" begin
+    @test (@inferred Missing mean(view([1, 2, missing], 1:2))) == (@inferred mean([1,2]))
+    @test (@inferred Missing var(view([1, 2, missing], 1:2))) == (@inferred var([1,2]))
+    @test (@inferred Missing std(view([1, 2, missing], 1:2))) == (@inferred std([1,2]))
+end
+
+@testset "inexact errors; Issues #7 and #126" begin
+    a = [missing missing; 0 1]
+    @test_broken isequal(mean(a;dims=2), [missing; 0.5;;])
+
+    x = [(i==3 && j==3) ? missing : i*j for i in 1:3, j in 1:4]
+    @test ismissing(@inferred Float64 mean(x))
+    @test isequal(mean(x; dims=1), [2. 4. missing 8.])
+    @test isequal(mean(x; dims=2), [2.5; 5.0; missing;;])
+end