Open
Description
The following calls allocate a lot on Julia 1.11 and master, while they had only a handful of allocations on 1.10 (JuliaData/CategoricalArrays.jl#412). Performance has also decreased by a factor of 2 to 3.
using CategoricalArrays
A = rand(1:10, 1000);
Acat = categorical(A);
julia> @allocations getindex.(Ref(Acat), eachindex(Acat))
1020
julia> @allocations [v for v in Acat]
1018
julia> @allocations map(identity, Acat)
1018
julia> function f(A)
res = similar(A)
for i in eachindex(A)
res[i] = A[i]
end
res
end
f (generic function with 1 method)
julia> @allocations f(Acat)
31930
julia> @allocations f(Acat)
1017
However, a loop like this doesn't allocate:
function count(x::CategoricalArray, val)
n = 0
@inbounds for i in eachindex(x)
v = x[i]
n += (v == val)
end
n
end
julia> @allocations count(Acat, 1)
0
Below is a simplified definition of CategoricalArray
that reproduces the problem. It's not identical as CategoricalArray
defines similar
so that a CategoricalArray
is returned rather than an Array
, but apparently that doesn't make a difference for allocations.
mutable struct CategoricalPool{T, R <: Integer, V}
levels::Vector{T} # category levels ordered by their reference codes
invindex::Dict{T, R} # map from category levels to their reference codes
ordered::Bool # whether levels can be compared using <
hash::Union{UInt, Nothing} # hash of levels
subsetof::Ptr{Nothing} # last seen strict superset pool
equalto::Ptr{Nothing} # last seen equal pool
function CategoricalPool{T, R, V}(levels::Vector{T},
ordered::Bool) where {T, R, V}
if length(levels) > typemax(R)
throw(LevelsException{T, R}(levels[Int(typemax(R))+1:end]))
end
invindex = Dict{T, R}(v => i for (i, v) in enumerate(levels))
if length(invindex) != length(levels)
throw(ArgumentError("Duplicate entries are not allowed in levels"))
end
CategoricalPool{T, R, V}(levels, invindex, ordered)
end
function CategoricalPool{T, R, V}(invindex::Dict{T, R},
ordered::Bool) where {T, R, V}
levels = Vector{T}(undef, length(invindex))
# If invindex contains non consecutive values, a BoundsError will be thrown
try
for (k, v) in invindex
levels[v] = k
end
catch BoundsError
throw(ArgumentError("Reference codes must be in 1:length(invindex)"))
end
if length(invindex) > typemax(R)
throw(LevelsException{T, R}(levels[typemax(R)+1:end]))
end
CategoricalPool{T, R, V}(levels, invindex, ordered)
end
function CategoricalPool{T, R, V}(levels::Vector{T},
invindex::Dict{T, R},
ordered::Bool,
hash::Union{UInt, Nothing}=nothing) where {T, R, V}
if !(V <: CategoricalValue)
throw(ArgumentError("Type $V is not a categorical value type"))
end
if V !== CategoricalValue{T, R}
throw(ArgumentError("V must be CategoricalValue{T, R}"))
end
pool = new(levels, invindex, ordered, hash, C_NULL, C_NULL)
return pool
end
end
struct CategoricalValue{T, R <: Integer}
pool::CategoricalPool{T, R, CategoricalValue{T, R}}
ref::R
end
mutable struct CategoricalArray{T, N, R <: Integer, V, C, U} <: AbstractArray{C, N}
refs::Array{R, N}
pool::CategoricalPool{V, R, C}
function CategoricalArray{T, N}(refs::Array{R, N},
pool::CategoricalPool{V, R, C}) where
{T, N, R <: Integer, V, C}
T === V || T == Union{V, Missing} || throw(ArgumentError("T ($T) must be equal to $V or Union{$V, Missing}"))
U = T >: Missing ? Missing : Union{}
new{T, N, R, V, C, U}(refs, pool)
end
end
Base.size(A::CategoricalArray) = size(A.refs)
Base.IndexStyle(::Type{<:CategoricalArray}) = IndexLinear()
Base.getindex(pool::CategoricalPool{T, R}, i::Integer) where {T, R<:Integer} =
CategoricalValue{T, R}(pool, i)
@inline function Base.getindex(A::CategoricalArray{T}, I...) where {T}
@boundscheck checkbounds(A.refs, I...)
# Let Array indexing code handle everything
@inbounds r = A.refs[I...]
if isa(r, Array)
return CategoricalArray{T, ndims(r)}(r, copy(A.pool))
else
r > 0 || throw(UndefRefError())
@inbounds res = A.pool[r]
return res
end
end
A = rand(UInt32.(1:10), 1000);
pool = CategoricalPool{Int, UInt32, CategoricalValue{Int, UInt32}}(collect(1:10), false);
Acat = CategoricalArray{Int, 1}(A, pool);