Skip to content

Commit

Permalink
change Base.unique(::AbstrCatArr) to return CatArr
Browse files Browse the repository at this point in the history
so it conforms to the semantics of the Base.unique()

This is a breaking change that requires a new minor release.
  • Loading branch information
alyst committed Aug 17, 2021
1 parent 6803c3c commit 18691e3
Show file tree
Hide file tree
Showing 4 changed files with 34 additions and 30 deletions.
38 changes: 17 additions & 21 deletions src/array.jl
Original file line number Diff line number Diff line change
Expand Up @@ -804,31 +804,27 @@ function levels!(A::CategoricalArray{T, N, R}, newlevels::Vector;
A
end

function _unique(::Type{S},
refs::AbstractArray{T},
pool::CategoricalPool) where {S, T<:Integer}
nlevels = length(levels(pool)) + 1
order = fill(0, nlevels) # 0 indicates not seen
# If we don't track missings, short-circuit even if none has been seen
count = S >: Missing ? 0 : 1
@inbounds for i in refs
if order[i + 1] == 0
count += 1
order[i + 1] = count
count == nlevels && break
# return unique refs (each value is unique) in the order of appearance in `refs`
# equivalent to fallback Base.unique() implementation,
# but short-circuits once references to all levels are encountered
function _uniquerefs(A::CatArrOrSub{T}) where T
arefs = refs(A)
res = similar(arefs, 0)
nlevels = length(levels(A))
maxunique = nlevels + (T >: Missing ? 1 : 0)
seen = fill(false, nlevels + 1) # always +1 for 0 (missing ref)
@inbounds for ref in arefs
if !seen[ref + 1]
push!(res, ref)
seen[ref + 1] = true
(length(res) == maxunique) && break
end
end
S[i == 1 ? missing : levels(pool)[i - 1] for i in sortperm(order) if order[i] != 0]
return res
end

"""
unique(A::CategoricalArray)
Return levels which appear in `A` in their order of appearance.
This function is significantly slower than [`levels`](@ref DataAPI.levels)
since it needs to check whether levels are used or not.
"""
unique(A::CategoricalArray{T}) where {T} = _unique(T, A.refs, A.pool)
# Unique values of `A`, returned as a `CategoricalVector{T}` rather than a plain
# vector: it is built from the refs returned by `_uniquerefs(A)` together with
# a copy of `A`'s pool.
unique(A::CatArrOrSub{T}) where T =
CategoricalVector{T}(_uniquerefs(A), copy(pool(A)))

"""
droplevels!(A::CategoricalArray)
Expand Down
7 changes: 0 additions & 7 deletions src/subarray.jl
Original file line number Diff line number Diff line change
Expand Up @@ -6,13 +6,6 @@ isordered(sa::SubArray{T,N,P}) where {T,N,P<:CategoricalArray} = isordered(paren
# `levels!` on a view of a CategoricalArray forwards the call, with the same
# `newlevels`, to the parent array of the view.
levels!(sa::SubArray{T,N,P}, newlevels::Vector) where {T,N,P<:CategoricalArray} =
levels!(parent(sa), newlevels)

function unique(sa::SubArray{T,N,P}) where {T,N,P<:CategoricalArray}
A = parent(sa)
refs = view(A.refs, sa.indices...)
S = eltype(P) >: Missing ? Union{eltype(levels(A.pool)), Missing} : eltype(levels(A.pool))
_unique(S, refs, A.pool)
end

# Refs of a view into a CategoricalArray: a `view` of the parent's `refs` field
# taken at the view's parent indices.
refs(A::SubArray{<:Any, <:Any, <:CategoricalArray}) =
view(parent(A).refs, parentindices(A)...)

Expand Down
11 changes: 9 additions & 2 deletions test/11_array.jl
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ using CategoricalArrays: DefaultRefType, leveltype
@test isordered(x) === ordered
@test levels(x) == sort(unique(a))
@test unique(x) == unique(a)
@test typeof(unique(x)) === typeof(x)
@test size(x) === (3,)
@test length(x) === 3

Expand Down Expand Up @@ -237,6 +238,7 @@ using CategoricalArrays: DefaultRefType, leveltype
@test x == collect(a)
@test isordered(x) === ordered
@test levels(x) == unique(x) == unique(a)
@test typeof(unique(x)) === typeof(x)
@test size(x) === (4,)
@test length(x) === 4
@test leveltype(x) === Float64
Expand Down Expand Up @@ -402,6 +404,7 @@ using CategoricalArrays: DefaultRefType, leveltype
@test x[4] === CategoricalValue(x.pool, 4)
@test levels(x) == unique(a)
@test unique(x) == unique(collect(x))
@test typeof(unique(x)) === typeof(x)

x[1:2] .= -1
@test x[1] === CategoricalValue(x.pool, 5)
Expand Down Expand Up @@ -438,6 +441,7 @@ using CategoricalArrays: DefaultRefType, leveltype
@test x == a
@test isordered(x) === ordered
@test levels(x) == unique(x) == unique(a)
@test unique(x) isa CategoricalVector{String, R}
@test size(x) === (2, 3)
@test length(x) === 6

Expand Down Expand Up @@ -694,6 +698,7 @@ end
@test levels!(x, ["Young", "Middle", "Old"]) === x
@test levels(x) == ["Young", "Middle", "Old"]
@test unique(x) == ["Old", "Young", "Middle"]
@test typeof(unique(x)) === typeof(x)
@test levels!(x, ["Young", "Middle", "Old", "Unused"]) === x
@test levels(x) == ["Young", "Middle", "Old", "Unused"]
@test unique(x) == ["Old", "Young", "Middle"]
Expand All @@ -703,18 +708,20 @@ end

x = CategoricalArray(String[])
@test isa(levels(x), Vector{String}) && isempty(levels(x))
@test isa(unique(x), Vector{String}) && isempty(unique(x))
@test isa(unique(x), typeof(x)) && isempty(unique(x))
@test levels!(x, ["Young", "Middle", "Old"]) === x
@test levels(x) == ["Young", "Middle", "Old"]
@test isa(unique(x), Vector{String}) && isempty(unique(x))
@test isa(unique(x), typeof(x)) && isempty(unique(x))

# To test short-circuiting
x = CategoricalArray(repeat(1:10, inner=10))
@test levels(x) == collect(1:10)
@test unique(x) == collect(1:10)
@test unique(x) isa typeof(x)
@test levels!(x, [19:-1:1; 20]) === x
@test levels(x) == [19:-1:1; 20]
@test unique(x) == collect(1:10)
@test unique(x) isa typeof(x)
end

end
8 changes: 8 additions & 0 deletions test/12_missingarray.jl
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ const ≅ = isequal
@test isordered(x) === ordered
@test levels(x) == sort(unique(a))
@test unique(x) == unique(a)
@test typeof(unique(x)) === typeof(x)
@test size(x) === (3,)
@test length(x) === 3

Expand Down Expand Up @@ -258,6 +259,7 @@ const ≅ = isequal
@test x ≅ a
@test levels(x) == filter(x->!ismissing(x), unique(a))
@test unique(x) ≅ unique(a)
@test typeof(unique(x)) === typeof(x)
@test size(x) === (3,)
@test length(x) === 3

Expand Down Expand Up @@ -402,6 +404,7 @@ const ≅ = isequal
@test x == collect(a)
@test isordered(x) === ordered
@test levels(x) == unique(x) == unique(a)
@test typeof(unique(x)) === typeof(x)
@test size(x) === (4,)
@test length(x) === 4
@test leveltype(x) === Float64
Expand Down Expand Up @@ -578,6 +581,7 @@ const ≅ = isequal
@test x[4] === CategoricalValue(x.pool, 4)
@test levels(x) == unique(a)
@test unique(x) == unique(collect(x))
@test typeof(unique(x)) === typeof(x)

x[1:2] .= -1
@test x[1] === CategoricalValue(x.pool, 5)
Expand All @@ -587,6 +591,7 @@ const ≅ = isequal
@test isordered(x) === false
@test levels(x) == vcat(unique(a), -1)
@test unique(x) == unique(collect(x))
@test typeof(unique(x)) === typeof(x)


ordered!(x, ordered)
Expand Down Expand Up @@ -618,6 +623,7 @@ const ≅ = isequal
@test x == a
@test isordered(x) === ordered
@test levels(x) == unique(x) == unique(a)
@test unique(x) isa CategoricalVector{Union{String, Missing}, R}
@test size(x) === (2, 3)
@test length(x) === 6

Expand Down Expand Up @@ -778,6 +784,7 @@ const ≅ = isequal
@test isordered(x) === ordered
@test levels(x) == filter(x->!ismissing(x), unique(a))
@test unique(x) ≅ unique(a)
@test unique(x) isa CategoricalVector{Union{String, Missing}, R}
@test size(x) === (2, 3)
@test length(x) === 6

Expand Down Expand Up @@ -1099,6 +1106,7 @@ end
x = CategoricalArray(["Old", "Young", "Middle", missing, "Young"])
@test levels(x) == ["Middle", "Old", "Young"]
@test unique(x) ≅ ["Old", "Young", "Middle", missing]
@test typeof(unique(x)) === typeof(x)
@test levels!(x, ["Young", "Middle", "Old"]) === x
@test levels(x) == ["Young", "Middle", "Old"]
@test unique(x) ≅ ["Old", "Young", "Middle", missing]
Expand Down

0 comments on commit 18691e3

Please sign in to comment.