Description
While playing around with numpy arrays from Julia (Discourse thread), I noticed that code like copy_np
outperforms copy_jl
(full benchmark here):
using PythonCall
np = pyimport("numpy")
# Duplicate `arr` with `copy` and return the sum of the duplicate's elements.
function copy_jl(arr)
    duplicate = copy(arr)
    return sum(duplicate)
end
# Copy `arr` into a buffer obtained from `np.empty` (wrapped as a `PyArray`) and
# return the sum of that buffer's elements.
function copy_np(arr)
    dest = PyArray(np.empty(length(arr)))
    dest .= arr
    return sum(dest)
end
Further exploration revealed that disabling hugepages in numpy
(by setting ENV["NUMPY_MADVISE_HUGEPAGE"] = "0") removes this gap.
Apparently numpy
uses hugepages by default for large arrays (docs and source). If I understand correctly, /sys/kernel/mm/transparent_hugepage/enabled
is normally not set to always
by default but rather to madvise,
which requires the underlying code to explicitly request this functionality - I have checked this on Ubuntu 24.10 and 22.04.
Can we have this in Julia as well? Given that people routinely work with large arrays (and that numpy has already adopted this), it would likely be useful, at least as an option. I did some experiments, and they indicate that manually constructing arrays from memory that was madvise'd for hugepages shows a good improvement:
timings
julia +release -t 1 copyadd_hugepage.jl
... precomps
0.043465 seconds (3 allocations: 76.294 MiB) # pure julia
0.043485 seconds (3 allocations: 76.294 MiB) # malloc
0.021253 seconds (3 allocations: 76.294 MiB) # malloc + madvise(MADV_HUGEPAGE)
source
import Mmap: MADV_HUGEPAGE
"""
    malloc_hugepage_vector_allocator(nels; hugepage=true)

Allocate an uninitialized `Vector{Float64}` of length `nels` whose storage comes from
the C allocator (`pvalloc`) instead of Julia's own allocator.

# Keywords
- `hugepage`: when `true`, the buffer is passed to `madvise` with `MADV_HUGEPAGE`
  before it is wrapped.

# Returns
A `Vector{Float64}` that owns the buffer; Julia releases it when the array is
garbage collected.

# Throws
- `OutOfMemoryError` if the allocation fails.
- `SystemError` if `madvise` reports an error (the buffer is freed first).
"""
function malloc_hugepage_vector_allocator(nels; hugepage=true)
    nbytes = nels * sizeof(Float64)
    # `pvalloc` is deprecated; it is used here to obtain the allocation that is
    # subsequently handed to `madvise`.
    mem = @ccall pvalloc(nbytes::Csize_t)::Ptr{Cvoid}
    # Never wrap a failed allocation: writing through a NULL-backed array would crash.
    mem == C_NULL && throw(OutOfMemoryError())
    if hugepage
        status = @ccall madvise(
            mem::Ptr{Cvoid}, nbytes::Csize_t, MADV_HUGEPAGE::Cint
        )::Cint
        if !iszero(status)
            # Read errno before `free` gets a chance to overwrite it, and release the
            # buffer because ownership has not been transferred to Julia yet.
            errcode = Libc.errno()
            Libc.free(mem)
            throw(SystemError("madvise(MADV_HUGEPAGE)", errcode))
        end
    end
    # `own=true` transfers ownership: Julia calls `free` on the pointer once the array
    # is no longer referenced.
    return unsafe_wrap(Vector{Float64}, Ptr{Float64}(mem), nels; own=true)
end
# Copy `arr` into a freshly allocated, ordinary Julia `Vector{Float64}` and return the
# sum of that copy.
function copyadd_jl(arr)
    arr1 = Vector{Float64}(undef, length(arr))
    arr1 .= arr
    # Sum the destination, not the source, so the newly written buffer is read back
    # (this matches `sum(copy(arr))`-style benchmarks).
    return sum(arr1)
end

# Same as `copyadd_jl`, but the destination buffer comes from
# `malloc_hugepage_vector_allocator`; `hugepage` is forwarded to it unchanged.
function copyadd_hugepage(arr; hugepage=true)
    arr1 = malloc_hugepage_vector_allocator(length(arr); hugepage)
    arr1 .= arr
    # Sum the destination, not the source, so both variants measure the same work.
    return sum(arr1)
end
# Benchmark driver: times the three copy-and-sum variants on the same input, twice.
# The garbage collector is turned off for the rest of the script — presumably so that
# collections do not disturb the measurements.
GC.enable(false)
# Input vector: 10 million Float64 ones.
arr_test = ones(10_000_000)
# Round 1: the first call of each variant also includes its compilation time.
@time copyadd_jl(arr_test)
@time copyadd_hugepage(arr_test; hugepage=false)
@time copyadd_hugepage(arr_test; hugepage=true)
# Round 2: same calls again, now with everything already compiled.
@time copyadd_jl(arr_test)
@time copyadd_hugepage(arr_test; hugepage=false)
@time copyadd_hugepage(arr_test; hugepage=true)
system
julia> versioninfo()
Julia Version 1.11.1
Commit 8f5b7ca12ad (2024-10-16 10:53 UTC)
Build Info:
Official https://julialang.org/ release
Platform Info:
OS: Linux (x86_64-linux-gnu)
CPU: 4 × INTEL(R) XEON(R) GOLD 6548N
WORD_SIZE: 64
LLVM: libLLVM-16.0.6 (ORCJIT, icelake-server)
Threads: 1 default, 0 interactive, 1 GC (on 4 virtual cores)