Open
Description
This problem was originally reported in the helpdesk Slack against DataFrames, but I believe I have replicated it just using vectors of vectors.
The OP was using Julia 1.5.0 with DataFrames 1.2.2 on Linux.
I used Julia 1.6.2 on Ubuntu under WSL2.
Using just flat vectors, everything behaves as expected and the memory held by the Julia process (see the `top` lines after the code) does not increase:
"""
    inner_df(Nrow, Ncol)

Return a freshly allocated vector of `Nrow * Ncol` random numbers drawn with
`rand`. This is the flat-vector stand-in for one small table of the original
report.
"""
function inner_df(Nrow, Ncol)
    total = Nrow * Ncol
    return rand(total)
end
"""
    outer_df(N)

Build one long `Vector{Float64}` by appending `N` chunks produced by
`inner_df(76, 21)` to an initially empty vector, and return it.
"""
function outer_df(N)
    Nrow, Ncol = 76, 21
    stacked = Float64[]
    for _ in 1:N
        # A fresh chunk is allocated on every pass and copied onto the end.
        chunk = inner_df(Nrow, Ncol)
        append!(stacked, chunk)
    end
    return stacked
end
"""
    iterated(Niter)

Call `outer_df(1000*10*10)` once per iteration for `Niter` iterations. Each
iteration prints its index and then the `@time` report for that call. The
built result is only bound to a local and never used; the function returns
`nothing`.
"""
function iterated(Niter)
    for iter in 1:Niter
        println(iter)
        @time stacked = outer_df(1000 * 10 * 10)
    end
    return nothing
end
# Driver for the flat-vector version: run a batch of iterations, force a
# collection with GC.gc(), then record the `top` line of the Julia process.
# NOTE(review): the recorded lines look like top's default column layout
# (PID USER PR NI VIRT RES SHR S %CPU %MEM TIME+ COMMAND) — confirm. Under
# that reading the sixth field (RES) stays between 215296 and 227104 for
# every batch below.
iterated(1)
GC.gc()
# 12142 user 20 0 1892304 215296 64744 S 0.0 1.3 0:19.41 julia
iterated(10)
GC.gc()
# 12142 user 20 0 1892416 226740 65412 S 0.0 1.4 1:51.96 julia
iterated(10)
GC.gc()
# 12142 user 20 0 1892416 227000 65416 S 0.0 1.4 3:25.04 julia
iterated(20)
GC.gc()
# 12142 user 20 0 1892416 227104 65416 S 0.0 1.4 6:32.83 julia
Using vectors of vectors, there seems to be a memory leak: the process's memory grows after the first batch of iterations and is not released by `GC.gc()`:
"""
    inner_df(Nrow, Ncol)

Return `Ncol` independent random vectors, each of length `Nrow`, collected in
an outer vector. This is the vector-of-vectors stand-in for one small table
with `Ncol` columns.
"""
function inner_df(Nrow, Ncol)
    return map(_ -> rand(Nrow), 1:Ncol)
end
"""
    outer_df(N)

Start from 21 empty `Vector{Float64}` columns and, `N` times over, append the
matching column of a fresh `inner_df(76, 21)` result to each of them. Return
the vector of grown columns.
"""
function outer_df(N)
    Nrow, Ncol = 76, 21
    columns = [Float64[] for _ in 1:Ncol]
    for _ in 1:N
        chunk = inner_df(Nrow, Ncol)
        # Column-wise stacking: every column grows by its own append! call.
        for j in eachindex(columns)
            append!(columns[j], chunk[j])
        end
    end
    return columns
end
"""
    iterated(Niter)

Call `outer_df(1000*10*10)` once per iteration for `Niter` iterations. Each
iteration prints its index and then the `@time` report for that call. The
built result is only bound to a local and never used; the function returns
`nothing`.
"""
function iterated(Niter)
    for iter in 1:Niter
        println(iter)
        @time stacked = outer_df(1000 * 10 * 10)
    end
    return nothing
end
# Driver for the vector-of-vectors version: run a batch of iterations, force
# a collection with GC.gc(), then record the `top` line of the Julia process.
# NOTE(review): the recorded lines look like top's default column layout
# (PID USER PR NI VIRT RES SHR S %CPU %MEM TIME+ COMMAND) — confirm. Under
# that reading the sixth field (RES) jumps from 216848 after the first call
# to about 952000 after the next batch and stays there despite GC.gc().
iterated(1)
GC.gc()
# 9204 user 20 0 2275056 216848 64844 S 0.0 1.3 0:15.61 julia
iterated(10)
GC.gc()
# 9204 user 20 0 3339728 952280 64844 S 0.0 5.7 1:06.03 julia
iterated(10)
GC.gc()
# 9204 user 20 0 3339728 952876 64844 S 0.0 5.7 1:55.75 julia
iterated(20)
GC.gc()
# 9204 user 20 0 3343824 953112 64964 S 0.0 5.7 3:31.63 julia
Please feel free to close if this is a vagary of Linux memory management.