JuliaLang · LilithHafner · Jun 4, 2022
diff --git a/base/sort.jl b/base/sort.jl
diff --git a/base/sysimg.jl b/base/sysimg.jl
@@ -52,6 +52,7 @@ let
         :LibGit2,
         :Profile,
         :SparseArrays,
+        :StandardSortingAlgorithms,
         :UUIDs,
 
         # 3-depth packages

diff --git a/stdlib/StandardSortingAlgorithms/Project.toml b/stdlib/StandardSortingAlgorithms/Project.toml
@@ -0,0 +1,15 @@
+name = "StandardSortingAlgorithms"
+uuid = "7744cb9a-8a56-1d63-a5da-e2fdf8a12fa2"
+
+[deps]
+Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
+
+[extras]
+Future = "9fa8497b-333b-5362-9e8d-4d0656e87820"
+LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
+SparseArrays = "2f01184e-e22b-5df5-ae63-d93ebab69eaf"
+Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
+Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
+
+[targets]
+test = ["Test", "SparseArrays", "LinearAlgebra", "Future", "Statistics"]
diff --git a/stdlib/StandardSortingAlgorithms/docs/src/index.md b/stdlib/StandardSortingAlgorithms/docs/src/index.md
diff --git a/stdlib/StandardSortingAlgorithms/src/AdaptiveSort.jl b/stdlib/StandardSortingAlgorithms/src/AdaptiveSort.jl
@@ -0,0 +1,127 @@
+# This file is a part of Julia. License is MIT: https://julialang.org/license
+
+# For AbstractVector{Bool}, counting sort is always best.
+# This is an implementation of counting sort specialized for Bools.
+# Counting sort for Bool vectors: count how many elements of `v[lo:hi]` equal the
+# value that `o` orders first, then overwrite the range with that many copies of
+# it followed by its negation. `a` and `t` are accepted for dispatch only.
+function sort!(v::AbstractVector{B}, lo::Integer, hi::Integer, a::AdaptiveSort, o::Ordering,
+        t::Union{AbstractVector{B}, Nothing}=nothing) where {B <: Bool}
+    # Work out which Bool the ordering places first. If neither value is less
+    # than the other, the ordering cannot distinguish them and `v` is left as is.
+    if lt(o, false, true)
+        head = false
+    elseif lt(o, true, false)
+        head = true
+    else
+        return v
+    end
+    nhead = 0
+    @inbounds for idx in lo:hi
+        v[idx] == head && (nhead += 1)
+    end
+    tail_start = lo + nhead
+    @inbounds v[lo:tail_start-1] .= head
+    @inbounds v[tail_start:hi] .= !head
+    return v
+end
+# Adaptive sort of `v[lo:hi]` under ordering `o`.
+#
+# A sequence of cheap checks selects the strategy, in this order:
+#   1. element type/ordering is not `UIntMappable`  -> `a.fallback`
+#   2. `hi <= lo`                                   -> nothing to do
+#   3. `hi - lo < 40`                               -> `SMALL_ALGORITHM`
+#   4. range is already sorted                      -> returned untouched
+#   5. `hi - lo >= 500` and reverse sorted          -> `reverse!` in place
+#   6. `sizeof(U) > 8`                              -> `sort_int_range!` or `a.fallback`
+#   7. small value range                            -> `sort_int_range!`
+#   8. `hi - lo < 3bits`                            -> `QuickSort` or `SMALL_ALGORITHM`
+#   9. otherwise                                    -> `radix_sort!`
+#
+# `t` is an optional scratch vector; in this method it is only consulted through
+# `workspace(v, t, hi)` on the radix sort path.
+function sort!(v::AbstractVector{T}, lo::Integer, hi::Integer, a::AdaptiveSort, o::Ordering,
+            t::Union{AbstractVector{T}, Nothing}=nothing) where T
+    # if the sorting task is not UIntMappable, then we can't radix sort or sort_int_range!
+    # so we skip straight to the fallback algorithm which is comparison based.
+    U = UIntMappable(T, o)
+    U === nothing && return sort!(v, lo, hi, a.fallback, o)
+
+    # to avoid introducing excessive detection costs for the trivial sorting problem
+    # and to avoid overflow, we check for small inputs before any other runtime checks
+    hi <= lo && return v
+    lenm1 = maybe_unsigned(hi-lo) # adding 1 would risk overflow
+    # only count sort on a short range can compete with insertion sort when lenm1 < 40
+    # and the optimization is not worth the detection cost, so we use insertion sort.
+    lenm1 < 40 && return sort!(v, lo, hi, SMALL_ALGORITHM, o)
+
+    # For most arrays, a presorted check is cheap (overhead < 5%) and for most large
+    # arrays it is essentially free (<1%). Insertion sort runs in a fast O(n) on presorted
+    # input and this guarantees presorted input will always be efficiently handled
+    issorted(view(v, lo:hi), o) && return v
+
+    # For large arrays, a reverse-sorted check is essentially free (overhead < 1%)
+    if lenm1 >= 500 && issorted(view(v, lo:hi), ReverseOrdering(o))
+        reverse!(view(v, lo:hi))
+        return v
+    end
+
+    # UInt128 does not support fast bit shifting so we never
+    # dispatch to radix sort but we may still perform count sort
+    if sizeof(U) > 8
+        if T <: Integer && o isa DirectOrdering
+            # extrema are taken under Forward here regardless of `o`, so that
+            # v_max-v_min below is the non-negative width of the value range
+            v_min, v_max = _extrema(v, lo, hi, Forward)
+            v_range = maybe_unsigned(v_max-v_min)
+            v_range == 0 && return v # all same
+
+            # we know lenm1 ≥ 40, so this will never underflow.
+            # if lenm1 > 3.7e18 (59 exabytes), then this may incorrectly dispatch to fallback
+            if v_range < 5lenm1-100 # count sort will outperform comparison sort if v's range is small
+                return sort_int_range!(v, Int(v_range+1), v_min, o === Forward ? identity : reverse, lo, hi)
+            end
+        end
+        return sort!(v, lo, hi, a.fallback, o)
+    end
+
+    # from here on the extrema are taken under `o` itself, so for a Reverse
+    # ordering v_min holds the numerically largest element
+    v_min, v_max = _extrema(v, lo, hi, o)
+    lt(o, v_min, v_max) || return v # all same
+    if T <: Integer && o isa DirectOrdering
+        R = o === Reverse
+        v_range = maybe_unsigned(R ? v_min-v_max : v_max-v_min)
+        if v_range < div(lenm1, 2) # count sort will be superior if v's range is very small
+            return sort_int_range!(v, Int(v_range+1), R ? v_max : v_min, R ? reverse : identity, lo, hi)
+        end
+    end
+
+    u_min, u_max = uint_map(v_min, o), uint_map(v_max, o)
+    u_range = maybe_unsigned(u_max-u_min)
+    if u_range < div(lenm1, 2) # count sort will be superior if u's range is very small
+        u = uint_map!(v, lo, hi, o)
+        sort_int_range!(u, Int(u_range+1), u_min, identity, lo, hi)
+        return uint_unmap!(v, u, lo, hi, o)
+    end
+
+    # if u's range is small, then once we subtract out v_min, we'll get a vector like
+    # UInt16[0x001a, 0x0015, 0x0006, 0x001b, 0x0008, 0x000c, 0x0001, 0x000e, 0x001c, 0x0009]
+    # where we only need to radix over the last few bits (5, in the example).
+    bits = unsigned(8sizeof(u_range) - leading_zeros(u_range))
+
+    # radix sort runs in O(bits * lenm1), insertion sort runs in O(lenm1^2). Radix sort
+    # has a constant factor that is three times higher, so radix runtime is 3bits * lenm1
+    # and insertion runtime is lenm1^2. Empirically, insertion is faster than radix iff
+    # lenm1 < 3bits.
+    # Insertion < Radix
+    #   lenm1^2 < 3 * bits * lenm1
+    #     lenm1 < 3bits
+    if lenm1 < 3bits
+        # at lenm1 = 64*3-1, QuickSort is about 20% faster than InsertionSort.
+        alg = a.fallback === QuickSort && lenm1 > 120 ? QuickSort : SMALL_ALGORITHM
+        return sort!(v, lo, hi, alg, o)
+    end
+
+    # At this point, we are committed to radix sort.
+    u = uint_map!(v, lo, hi, o)
+
+    # we subtract u_min to avoid radixing over unnecessary bits. For example,
+    # Int32[3, -1, 2] uint_maps to UInt32[0x80000003, 0x7fffffff, 0x80000002]
+    # which uses all 32 bits, but once we subtract u_min = 0x7fffffff, we are left with
+    # UInt32[0x00000004, 0x00000000, 0x00000003] which uses only 3 bits, and
+    # Float32[2.012, 400.0, 12.345] uint_maps to UInt32[0x3fff3b63, 0x3c37ffff, 0x414570a4]
+    # which is reduced to UInt32[0x03c73b64, 0x00000000, 0x050d70a5] using only 26 bits.
+    # the overhead for this subtraction is small enough that it is worthwhile in many cases.
+
+    # this is faster than u[lo:hi] .-= u_min as of v1.9.0-DEV.100
+    @inbounds for i in lo:hi
+        u[i] -= u_min
+    end
+
+    # the scratch vector is reinterpreted to `U` before being handed to radix_sort!
+    # NOTE(review): presumably so it can hold the same unsigned keys as `u` — confirm
+    u2 = radix_sort!(u, lo, hi, bits, reinterpret(U, workspace(v, t, hi)))
+    # NOTE(review): the trailing u_min argument presumably re-adds the offset that was
+    # subtracted above while mapping back to `T` — confirm against uint_unmap!
+    uint_unmap!(v, u2, lo, hi, o, u_min)
+end
+
+# Reinterpret `BitSigned` integers as their unsigned counterpart. Every other
+# `Integer` is passed through untouched, which is necessary to avoid calling
+# `unsigned` on BigInt.
+function maybe_unsigned(x::Integer)
+    return x
+end
+function maybe_unsigned(x::BitSigned)
+    return unsigned(x)
+end
+# Single pass over `v[lo:hi]` returning `(smallest, largest)` as judged by
+# `lt(o, ·, ·)`. `v[lo]` is read with bounds checking; the remaining reads are
+# `@inbounds`.
+function _extrema(v::AbstractVector, lo::Integer, hi::Integer, o::Ordering)
+    smallest = largest = v[lo]
+    @inbounds for idx in (lo+1):hi
+        x = v[idx]
+        if lt(o, x, smallest)
+            smallest = x
+        end
+        if lt(o, largest, x)
+            largest = x
+        end
+    end
+    return smallest, largest
+end
diff --git a/stdlib/StandardSortingAlgorithms/src/Float.jl b/stdlib/StandardSortingAlgorithms/src/Float.jl
@@ -0,0 +1,157 @@
+# This file is a part of Julia. License is MIT: https://julialang.org/license
+
+#  Floating point optimizations
+module Float
+using ..Sort
+using ...Order
+using ..Base: @inbounds, AbstractVector, Vector, last, firstindex, lastindex, Missing, Type, reinterpret
+
+import Core.Intrinsics: slt_int
+import ..StandardSortingAlgorithms: sort!, UIntMappable, uint_map, uint_unmap
+import ...Order: lt, DirectOrdering
+
+# Element types that receive the floating point fast path.
+const Floats = Union{Float32,Float64}
+# Vector types accepted by the `sort!` entry points at the bottom of this module.
+const FPSortable = Union{ # Mixed Float32 and Float64 are not allowed.
+    AbstractVector{Union{Float32, Missing}},
+    AbstractVector{Union{Float64, Missing}},
+    AbstractVector{Float32},
+    AbstractVector{Float64},
+    AbstractVector{Missing}}
+
+# Internal orderings used by `fpsort!` for the two sides of its sign partition.
+# Both compare with `slt_int`; `Left` swaps the operands, `Right` does not.
+struct Left <: Ordering end
+struct Right <: Ordering end
+
+# Replace a direct ordering (or the direct ordering wrapped in a `Perm`) by the
+# corresponding internal ordering.
+left(::DirectOrdering) = Left()
+right(::DirectOrdering) = Right()
+
+left(o::Perm) = Perm(left(o.order), o.data)
+right(o::Perm) = Perm(right(o.order), o.data)
+
+lt(::Left, x::T, y::T) where {T<:Floats} = slt_int(y, x)
+lt(::Right, x::T, y::T) where {T<:Floats} = slt_int(x, y)
+
+# Unsigned-integer mappings for the internal orderings: `Right` uses the raw
+# bits, `Left` uses their bitwise complement. `uint_unmap` inverts `uint_map`.
+uint_map(x::Float32, ::Left) = ~reinterpret(UInt32, x)
+uint_unmap(::Type{Float32}, u::UInt32, ::Left) = reinterpret(Float32, ~u)
+uint_map(x::Float32, ::Right) = reinterpret(UInt32, x)
+uint_unmap(::Type{Float32}, u::UInt32, ::Right) = reinterpret(Float32, u)
+UIntMappable(::Type{Float32}, ::Union{Left, Right}) = UInt32
+
+uint_map(x::Float64, ::Left) = ~reinterpret(UInt64, x)
+uint_unmap(::Type{Float64}, u::UInt64, ::Left) = reinterpret(Float64, ~u)
+uint_map(x::Float64, ::Right) = reinterpret(UInt64, x)
+uint_unmap(::Type{Float64}, u::UInt64, ::Right) = reinterpret(Float64, u)
+UIntMappable(::Type{Float64}, ::Union{Left, Right}) = UInt64
+
+# Ordering-aware predicates. They take the ordering first so that the `Perm`
+# methods can look the element up through `o.data[i]`.
+isnan(o::DirectOrdering, x::Floats) = (x!=x)
+isnan(o::DirectOrdering, x::Missing) = false
+isnan(o::Perm, i::Integer) = isnan(o.order,o.data[i])
+
+ismissing(o::DirectOrdering, x::Floats) = false
+ismissing(o::DirectOrdering, x::Missing) = true
+ismissing(o::Perm, i::Integer) = ismissing(o.order,o.data[i])
+
+# Whether the data being ordered (the vector itself, or the `Perm` data) has an
+# element type that admits `missing`.
+allowsmissing(::AbstractVector{T}, ::DirectOrdering) where {T} = T >: Missing
+allowsmissing(::AbstractVector{<:Integer},
+              ::Perm{<:DirectOrdering,<:AbstractVector{T}}) where {T} =
+    T >: Missing
+
+# Swap every element of `v[lo:hi]` for which `testf(o, element)` holds to the
+# left end of the range. Returns `(i, hi)`, the index range of the remaining
+# elements.
+function specials2left!(testf::Function, v::AbstractVector, o::Ordering,
+                        lo::Integer=firstindex(v), hi::Integer=lastindex(v))
+    # skip the specials that are already at the left end
+    i = lo
+    @inbounds while i <= hi && testf(o,v[i])
+        i += 1
+    end
+    # `i` now marks the first non-special slot; pull later specials into it
+    j = i + 1
+    @inbounds while j <= hi
+        if testf(o,v[j])
+            v[i], v[j] = v[j], v[i]
+            i += 1
+        end
+        j += 1
+    end
+    return i, hi
+end
+# Mirror image of the method above: specials are swapped to the right end and
+# `(lo, i)`, the index range of the remaining elements, is returned.
+function specials2right!(testf::Function, v::AbstractVector, o::Ordering,
+                         lo::Integer=firstindex(v), hi::Integer=lastindex(v))
+    i = hi
+    @inbounds while lo <= i && testf(o,v[i])
+        i -= 1
+    end
+    j = i - 1
+    @inbounds while lo <= j
+        if testf(o,v[j])
+            v[i], v[j] = v[j], v[i]
+            i -= 1
+        end
+        j -= 1
+    end
+    return lo, i
+end
+
+# Move NaN (and, when the element type allows it, `missing`) to the left end of
+# `v`. When both kinds can occur, the block of specials is afterwards sorted
+# with `a` under `o`. Returns the index range of the ordinary values.
+function specials2left!(v::AbstractVector, a::Algorithm, o::Ordering)
+    lo, hi = firstindex(v), lastindex(v)
+    if allowsmissing(v, o)
+        # the predicate is invoked as testf(ordering, element)
+        i, _ = specials2left!((ord, x) -> ismissing(ord, x) || isnan(ord, x), v, o, lo, hi)
+        sort!(v, lo, i-1, a, o)
+        return i, hi
+    else
+        return specials2left!(isnan, v, o, lo, hi)
+    end
+end
+# Same as `specials2left!(v, a, o)` but collecting the specials at the right end.
+function specials2right!(v::AbstractVector, a::Algorithm, o::Ordering)
+    lo, hi = firstindex(v), lastindex(v)
+    if allowsmissing(v, o)
+        # the predicate is invoked as testf(ordering, element)
+        _, i = specials2right!((ord, x) -> ismissing(ord, x) || isnan(ord, x), v, o, lo, hi)
+        sort!(v, i+1, hi, a, o)
+        return lo, i
+    else
+        return specials2right!(isnan, v, o, lo, hi)
+    end
+end
+
+# Specials go to the right end under a forward ordering and to the left end
+# under a reverse ordering.
+specials2end!(v::AbstractVector, a::Algorithm, o::ForwardOrdering) =
+    specials2right!(v, a, o)
+specials2end!(v::AbstractVector, a::Algorithm, o::ReverseOrdering) =
+    specials2left!(v, a, o)
+specials2end!(v::AbstractVector{<:Integer}, a::Algorithm, o::Perm{<:ForwardOrdering}) =
+    specials2right!(v, a, o)
+specials2end!(v::AbstractVector{<:Integer}, a::Algorithm, o::Perm{<:ReverseOrdering}) =
+    specials2left!(v, a, o)
+
+# Whether `x` belongs to the left part of the sign partition made by `fpsort!`:
+# elements ordered before `zero(x)` (forward) or before `-zero(x)` (reverse).
+issignleft(o::ForwardOrdering, x::Floats) = lt(o, x, zero(x))
+issignleft(o::ReverseOrdering, x::Floats) = lt(o, x, -zero(x))
+issignleft(o::Perm, i::Integer) = issignleft(o.order, o.data[i])
+
+# Floating point sort of the whole of `v`:
+#   1. move NaN/missing to the end selected by `o` (`specials2end!`),
+#   2. partition the remaining range with `issignleft`,
+#   3. sort the left part under `left(o)` and the right part under `right(o)`.
+# `t` is an optional scratch vector forwarded to the `sort!` calls. Returns `v`.
+function fpsort!(v::AbstractVector, a::Algorithm, o::Ordering,
+        t::Union{AbstractVector, Nothing}=nothing)
+    # fpsort!'s optimizations speed up comparisons, of which there are O(nlogn).
+    # The overhead is O(n). For n < 10, it's not worth it.
+    length(v) < 10 && return sort!(v, firstindex(v), lastindex(v), SMALL_ALGORITHM, o, t)
+
+    i, j = lo, hi = specials2end!(v,a,o)
+    @inbounds while true
+        while i <= j &&  issignleft(o,v[i]); i += 1; end
+        while i <= j && !issignleft(o,v[j]); j -= 1; end
+        i <= j || break
+        v[i], v[j] = v[j], v[i]
+        i += 1; j -= 1
+    end
+    sort!(v, lo, j,  a, left(o), t)
+    sort!(v, i,  hi, a, right(o), t)
+    return v
+end
+
+
+# PartialQuickSort skips the floating point preprocessing and sorts the whole
+# vector directly. The optional `t` is accepted (and ignored) so that this
+# method is the one selected when the `sort!` entry points below call
+# `fpsort!(v, a, o, t)`; with a three-argument-only signature those calls
+# would dispatch to the generic method above instead.
+fpsort!(v::AbstractVector, a::Sort.PartialQuickSort, o::Ordering,
+        t::Union{AbstractVector, Nothing}=nothing) =
+    sort!(v, firstindex(v), lastindex(v), a, o)
+
+# Entry points: route float (and float-or-missing) vectors, and permutation
+# vectors ordered by such data, through `fpsort!`.
+function sort!(v::FPSortable, a::Algorithm, o::DirectOrdering,
+        t::Union{FPSortable, Nothing}=nothing)
+    fpsort!(v, a, o, t)
+end
+function sort!(v::AbstractVector{<:Union{Signed, Unsigned}}, a::Algorithm,
+        o::Perm{<:DirectOrdering,<:FPSortable}, t::Union{AbstractVector, Nothing}=nothing)
+    fpsort!(v, a, o, t)
+end
+
+end # module Float
diff --git a/stdlib/StandardSortingAlgorithms/src/MergeSort.jl b/stdlib/StandardSortingAlgorithms/src/MergeSort.jl
@@ -0,0 +1,38 @@
+# Merge sort of `v[lo:hi]` under ordering `o`. Ranges with
+# `hi-lo <= SMALL_THRESHOLD` are handed to `SMALL_ALGORITHM`. `t0` is an optional
+# scratch vector; `workspace(v, t0, n)` supplies the buffer that holds a copy of
+# the left half during the merge. Returns `v`.
+function sort!(v::AbstractVector{T}, lo::Integer, hi::Integer, a::MergeSortAlg, o::Ordering,
+    t0::Union{AbstractVector{T}, Nothing}=nothing) where T
+@inbounds if lo < hi
+    if hi-lo <= SMALL_THRESHOLD
+        return sort!(v, lo, hi, SMALL_ALGORITHM, o)
+    end
+
+    mid = midpoint(lo, hi)
+    scratch = workspace(v, t0, mid-lo+1)
+
+    # sort both halves, sharing one scratch vector
+    sort!(v, lo,    mid, a, o, scratch)
+    sort!(v, mid+1, hi,  a, o, scratch)
+
+    # stash the sorted left half v[lo:mid] in scratch[1:mid-lo+1]
+    dst = 1
+    for src in lo:mid
+        scratch[dst] = v[src]
+        dst += 1
+    end
+
+    # merge scratch (left half) with v[ri:hi] (right half) back into v;
+    # on ties the element from the left half is written first
+    li, out, ri = 1, lo, mid + 1
+    while out < ri <= hi
+        if lt(o, v[ri], scratch[li])
+            v[out] = v[ri]
+            ri += 1
+        else
+            v[out] = scratch[li]
+            li += 1
+        end
+        out += 1
+    end
+    # right half exhausted: copy what is left of the left half
+    while out < ri
+        v[out] = scratch[li]
+        out += 1
+        li += 1
+    end
+end
+
+return v
+end