diff --git a/base/sort.jl b/base/sort.jl
index 72e28ce892039..8aa1d00998e22 100644
--- a/base/sort.jl
+++ b/base/sort.jl
@@ -671,12 +671,12 @@ end
 
 # This is a stable least significant bit first radix sort.
 #
-# That is, it first sorts the entire vector by the last CHUNK_SIZE bits, then by the second
-# to last CHUNK_SIZE bits, and so on. Stability means that it will not reorder two elements
+# That is, it first sorts the entire vector by the last chunk_size bits, then by the second
+# to last chunk_size bits, and so on. Stability means that it will not reorder two elements
 # that compare equal. This is essential so that the order introduced by earlier,
 # less significant passes is preserved by later passes.
 #
-# Each pass divides the input into 2^CHUNK_SIZE == MASK+1 buckets. To do this, it
+# Each pass divides the input into 2^chunk_size == mask+1 buckets. To do this, it
 #  * counts the number of entries that fall into each bucket
 #  * uses those counts to compute the indices to move elements of those buckets into
 #  * moves elements into the computed indices in the swap array
@@ -685,32 +685,32 @@ end
 # In the case of an odd number of passes, the returned vector will === the input vector t,
 # not v. This is one of the many reasons radix_sort! is not exported.
 function radix_sort!(v::AbstractVector{U}, lo::Integer, hi::Integer, bits::Unsigned,
-                     ::Val{CHUNK_SIZE}, t::AbstractVector{U}) where {U <: Unsigned, CHUNK_SIZE}
-    # bits is unsigned and CHUNK_SIZE is a compile time constant for performance reasons.
-    MASK = UInt(1) << CHUNK_SIZE - 0x1
-    counts = Vector{UInt}(undef, MASK+2)
+                     t::AbstractVector{U}, chunk_size=radix_chunk_size_heuristic(lo, hi, bits)) where U <: Unsigned
+    # bits is unsigned for performance reasons.
+    mask = UInt(1) << chunk_size - 0x1
+    counts = Vector{UInt}(undef, mask+2)
 
-    @inbounds for shift in 0:CHUNK_SIZE:bits-1
+    @inbounds for shift in 0:chunk_size:bits-1
 
-        # counts[2:MASK+2] will store the number of elements that fall into each bucket.
-        # if CHUNK_SIZE = 8, counts[2] is bucket 0x00 and counts[257] is bucket 0xff.
+        # counts[2:mask+2] will store the number of elements that fall into each bucket.
+        # if chunk_size = 8, counts[2] is bucket 0x00 and counts[257] is bucket 0xff.
         counts .= 0
         for k in lo:hi
             x = v[k]                  # lookup the element
-            i = (x >> shift)&MASK + 2 # compute its bucket's index for this pass
+            i = (x >> shift)&mask + 2 # compute its bucket's index for this pass
             counts[i] += 1            # increment that bucket's count
         end
 
         counts[1] = lo                # set target index for the first bucket
         cumsum!(counts, counts)       # set target indices for subsequent buckets
-        # counts[1:MASK+1] now stores indices where the first member of each bucket
+        # counts[1:mask+1] now stores indices where the first member of each bucket
         # belongs, not the number of elements in each bucket. We will put the first element
         # of bucket 0x00 in t[counts[1]], the next element of bucket 0x00 in t[counts[1]+1],
         # and the last element of bucket 0x00 in t[counts[2]-1].
 
         for k in lo:hi
             x = v[k]                  # lookup the element
-            i = (x >> shift)&MASK + 1 # compute its bucket's index for this pass
+            i = (x >> shift)&mask + 1 # compute its bucket's index for this pass
             j = counts[i]             # lookup the target index
             t[j] = x                  # put the element where it belongs
             counts[i] = j + 1         # increment the target index for the next
@@ -722,6 +722,18 @@ function radix_sort!(v::AbstractVector{U}, lo::Integer, hi::Integer, bits::Unsig
 
     v
 end
+function radix_chunk_size_heuristic(lo::Integer, hi::Integer, bits::Unsigned)
+    # chunk_size is the number of bits to radix over at once.
+    # We need to allocate an array of size 2^chunk size, and on the other hand the higher
+    # the chunk size the fewer passes we need. Theoretically, chunk size should be based on
+    # the Lambert W function applied to length. Empirically, we use this heuristic:
+    guess = min(10, log(maybe_unsigned(hi-lo))*3/4+3)
+    # TODO the maximum chunk size should be based on archetecture cache size.
+
+    # We need iterations * chunk size ≥ bits, and these cld's
+    # make an effort to get iterations * chunk size ≈ bits
+    UInt8(cld(bits, cld(bits, guess)))
+end
 
 # For AbstractVector{Bool}, counting sort is always best.
 # This is an implementation of counting sort specialized for Bools.
@@ -832,36 +844,7 @@ function sort!(v::AbstractVector, lo::Integer, hi::Integer, a::AdaptiveSort, o::
         u[i] -= u_min
     end
 
-    # chunk_size is the number of bits to radix over at once.
-    # We need to allocate an array of size 2^chunk size, and on the other hand the higher
-    # the chunk size the fewer passes we need. Theoretically, chunk size should be based on
-    # the Lambert W function applied to length. Empirically, we use this heuristic:
-    guess = log(lenm1)*3/4+3
-    # We need iterations * chunk size ≥ bits, and these cld's
-    # make an effort to get iterations * chunk size ≈ bits
-    chunk_size = UInt8(cld(bits, cld(bits, guess)))
-    @assert chunk_size >= 3
-
-    t = similar(u)
-    # This if else chain is to avoid dynamic dispatch for small cases.
-    # Chunk sizes less than 3 should never occur, and chunk sizes greater than 8
-    # only occur for arrays of length greater than 950, and tend to occur only for arrays
-    # of length greater than about 4000 where a single dynamic dispatch is less costly
-    u2 = if chunk_size == 3
-        radix_sort!(u, lo, hi, bits, Val(0x3), t)
-    elseif chunk_size == 4
-        radix_sort!(u, lo, hi, bits, Val(0x4), t)
-    elseif chunk_size == 5
-        radix_sort!(u, lo, hi, bits, Val(0x5), t)
-    elseif chunk_size == 6 # 9% to 15% savings over dynamic dispatch
-        radix_sort!(u, lo, hi, bits, Val(0x6), t)
-    elseif chunk_size == 7 # 2% to 7% savings over dynamic dispatch
-        radix_sort!(u, lo, hi, bits, Val(0x7), t)
-    elseif chunk_size == 8 # -1% to 10% savings and common for lengths between 300 and 3000
-        radix_sort!(u, lo, hi, bits, Val(0x8), t)
-    else
-        radix_sort!(u, lo, hi, bits, Val(chunk_size), t) # dynamic dispatch
-    end
+    u2 = radix_sort!(u, lo, hi, bits, similar(u))
     Serial.deserialize!(v, u2, lo, hi, o, u_min)
 end