Skip to content

map/broadcast on CategoricalArrays allocates since Julia 1.11 #58169

Open
@nalimilan

Description

@nalimilan

The following calls allocate a lot on Julia 1.11 and master, while they had only a handful of allocations on 1.10 (JuliaData/CategoricalArrays.jl#412). Performance has also decreased by a factor of 2 to 3.

using CategoricalArrays
A = rand(1:10, 1000);
Acat = categorical(A);

julia> @allocations getindex.(Ref(Acat), eachindex(Acat))
1020

julia> @allocations [v for v in Acat]
1018

julia> @allocations map(identity, Acat)
1018

julia> function f(A)
           res = similar(A)
           for i in eachindex(A)
               res[i] = A[i]
           end
           res
       end
f (generic function with 1 method)

julia> @allocations f(Acat)
31930

julia> @allocations f(Acat)
1017

However, a loop like this doesn't allocate:

function count(x::CategoricalArray, val)
    n = 0
    @inbounds for i in eachindex(x)
        v = x[i]
        n += (v == val)
    end
    n
end

julia> @allocations count(Acat, 1)
0

Below is a simplified definition of CategoricalArray that reproduces the problem. It's not identical as CategoricalArray defines similar so that a CategoricalArray is returned rather than an Array, but apparently that doesn't make a difference for allocations.

mutable struct CategoricalPool{T, R <: Integer, V}
    levels::Vector{T}          # category levels ordered by their reference codes
    invindex::Dict{T, R}       # map from category levels to their reference codes
    ordered::Bool              # whether levels can be compared using <
    hash::Union{UInt, Nothing} # hash of levels
    subsetof::Ptr{Nothing}     # last seen strict superset pool
    equalto::Ptr{Nothing}      # last seen equal pool

    function CategoricalPool{T, R, V}(levels::Vector{T},
                                      ordered::Bool) where {T, R, V}
        if length(levels) > typemax(R)
            throw(LevelsException{T, R}(levels[Int(typemax(R))+1:end]))
        end
        invindex = Dict{T, R}(v => i for (i, v) in enumerate(levels))
        if length(invindex) != length(levels)
            throw(ArgumentError("Duplicate entries are not allowed in levels"))
        end
        CategoricalPool{T, R, V}(levels, invindex, ordered)
    end
    function CategoricalPool{T, R, V}(invindex::Dict{T, R},
                                      ordered::Bool) where {T, R, V}
        levels = Vector{T}(undef, length(invindex))
        # If invindex contains non consecutive values, a BoundsError will be thrown
        try
            for (k, v) in invindex
                levels[v] = k
            end
        catch BoundsError
            throw(ArgumentError("Reference codes must be in 1:length(invindex)"))
        end
        if length(invindex) > typemax(R)
            throw(LevelsException{T, R}(levels[typemax(R)+1:end]))
        end
        CategoricalPool{T, R, V}(levels, invindex, ordered)
    end
    function CategoricalPool{T, R, V}(levels::Vector{T},
                                      invindex::Dict{T, R},
                                      ordered::Bool,
                                      hash::Union{UInt, Nothing}=nothing) where {T, R, V}
        if !(V <: CategoricalValue)
            throw(ArgumentError("Type $V is not a categorical value type"))
        end
        if V !== CategoricalValue{T, R}
            throw(ArgumentError("V must be CategoricalValue{T, R}"))
        end
        pool = new(levels, invindex, ordered, hash, C_NULL, C_NULL)
        return pool
    end
end

struct CategoricalValue{T, R <: Integer}
    pool::CategoricalPool{T, R, CategoricalValue{T, R}}
    ref::R
end

mutable struct CategoricalArray{T, N, R <: Integer, V, C, U} <: AbstractArray{C, N}
    refs::Array{R, N}
    pool::CategoricalPool{V, R, C}

    function CategoricalArray{T, N}(refs::Array{R, N},
                                    pool::CategoricalPool{V, R, C}) where
                                                 {T, N, R <: Integer, V, C}
        T === V || T == Union{V, Missing} || throw(ArgumentError("T ($T) must be equal to $V or Union{$V, Missing}"))
        U = T >: Missing ? Missing : Union{}
        new{T, N, R, V, C, U}(refs, pool)
    end
end


Base.size(A::CategoricalArray) = size(A.refs)
Base.IndexStyle(::Type{<:CategoricalArray}) = IndexLinear()

Base.getindex(pool::CategoricalPool{T, R}, i::Integer) where {T, R<:Integer} =
    CategoricalValue{T, R}(pool, i)

@inline function Base.getindex(A::CategoricalArray{T}, I...) where {T}
    @boundscheck checkbounds(A.refs, I...)
    # Let Array indexing code handle everything
    @inbounds r = A.refs[I...]

    if isa(r, Array)
        return CategoricalArray{T, ndims(r)}(r, copy(A.pool))
    else
        r > 0 || throw(UndefRefError())
        @inbounds res = A.pool[r]
        return res
    end
end

A = rand(UInt32.(1:10), 1000);
pool = CategoricalPool{Int, UInt32, CategoricalValue{Int, UInt32}}(collect(1:10), false);
Acat = CategoricalArray{Int, 1}(A, pool);

Metadata

Metadata

Assignees

No one assigned

    Labels

    performanceMust go fasterregressionRegression in behavior compared to a previous versionregression 1.11Regression in the 1.11 release

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions