This repository has been archived by the owner on Mar 12, 2021. It is now read-only.
Commit
Merge pull request #83 from JuliaGPU/tb/poolmgmt
Managed pool allocator
Showing 2 changed files with 252 additions and 23 deletions.
@@ -1,41 +1,268 @@
using CUDAdrv

# dynamic memory pool allocator
#
# this allocator sits between CuArray constructors
# and the actual memory allocation in CUDAdrv.Mem
#
# the core design is pretty simple:
# - bin allocations into multiple pools according to their size (see `poolidx`;
#   there is a worked example after its definition below)
# - when memory is requested, check the pool for an unused buffer, or allocate dynamically
# - conversely, when memory is released, put the buffer in the appropriate pool for future reuse
#
# to avoid memory hogging and/or thrashing the Julia GC:
# - keep track of used and available memory, in order to determine the usage of each pool
# - keep track of each pool's usage, as well as a window of previous usages
# - regularly release memory from underused pools (see `reclaim(false)`)
#
# possible improvements:
# - pressure: have the `reclaim` background task reclaim more aggressively,
#   and call it from the failure cascade in `alloc`
# - context management: either switch contexts when performing memory operations,
#   or just use unified memory for all allocations
# - per-device pools
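
# An orientation sketch of the state defined below (an illustrative summary, not an
# exhaustive spec): everything is a set of parallel vectors indexed by pool id,
# so for pool `pid`, which hands out buffers of `poolsize(pid)` bytes:
#   pools_used[pid]   -- buffers currently handed out
#   pools_avail[pid]  -- cached buffers available for reuse
#   pool_usage[pid]   -- peak usage ratio observed since the last recorded scan
#   pool_history[pid] -- the last USAGE_WINDOW recorded usage ratios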

const pool_lock = ReentrantLock()


## infrastructure

# per-pool buffer bookkeeping: buffers currently handed out, and cached buffers
# available for reuse
const pools_used = Vector{Set{Mem.Buffer}}()
const pools_avail = Vector{Vector{Mem.Buffer}}()

# power-of-two binning: pool `idx` hands out buffers of `poolsize(idx)` bytes,
# and `poolidx(n)` is the smallest pool whose buffers can hold `n` bytes
poolidx(n) = ceil(Int, log2(n))+1
poolsize(idx) = 2^(idx-1)
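
# A minimal worked example of the binning above (illustrative only; it assumes
# nothing beyond `poolidx`/`poolsize` as just defined):
@assert poolidx(1000) == 11                # a 1000-byte request maps to pool 11 ...
@assert poolsize(poolidx(1000)) == 1024    # ... which hands out 1024-byte buffers
@assert poolsize(30) == 512 * 2^20         # pool 30 holds 512 MiB buffers (cf. `__init_memory__`)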

# make sure the per-pool data structures exist up to pool index `idx`
function create_pools(idx)
    if length(pool_usage) >= idx
        # fast-path without taking a lock
        return
    end

    lock(pool_lock) do
        while length(pool_usage) < idx
            push!(pool_usage, 1)
            push!(pool_history, initial_usage)
            push!(pools_used, Set{Mem.Buffer}())
            push!(pools_avail, Vector{Mem.Buffer}())
        end
    end
end

## management

const USAGE_WINDOW = 5
const initial_usage = Tuple(1 for _ in 1:USAGE_WINDOW)

const pool_usage = Vector{Float64}()
const pool_history = Vector{NTuple{USAGE_WINDOW,Float64}}()

# min and max time between successive background task iterations.
# when the pool usages don't change, scan less regularly.
#
# together with USAGE_WINDOW, this determines how long it takes for objects to get reclaimed
const MIN_DELAY = 1.0
const MAX_DELAY = 5.0
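
# The constants above drive an exponential back-off in the background task set up
# in `__init_memory__` below: while pools look inactive, the sleep between scans
# doubles from MIN_DELAY up to MAX_DELAY, and resets to MIN_DELAY on activity.
# A minimal sketch of that arithmetic; `example_backoff` is a hypothetical helper
# for illustration only and is not used by the allocator:
function example_backoff(steps)
    delay = MIN_DELAY
    delays = Float64[]
    for _ in 1:steps
        push!(delays, delay)
        delay = min(delay*2, MAX_DELAY)    # the same update as in `__init_memory__`
    end
    delays
end
# example_backoff(5) == [1.0, 2.0, 4.0, 5.0, 5.0]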

# debug stats
mutable struct PoolStats
    # calls to `alloc`/`dealloc`
    req_alloc::Int
    req_free::Int

    # actual CUDA allocations and frees
    actual_alloc::Int
    actual_free::Int

    # number of bytes actually allocated and freed
    amount_alloc::Int
    amount_free::Int

    # which step of the `alloc` failure cascade satisfied each request
    alloc_1::Int
    alloc_2::Int
    alloc_3::Int
    alloc_4::Int
end
const pool_stats = PoolStats(0, 0, 0, 0, 0, 0, 0, 0, 0, 0)

function __init_memory__()
    create_pools(30) # up to 512 MiB

    managed = parse(Bool, get(ENV, "CUARRAYS_MANAGED_POOL", "true"))
    if managed
        delay = MIN_DELAY
        @schedule begin
            while true
                if scan()
                    delay = MIN_DELAY
                else
                    delay = min(delay*2, MAX_DELAY)
                end

                reclaim()

                sleep(delay)
            end
        end
    end

    verbose = haskey(ENV, "CUARRAYS_MANAGED_POOL")
    if verbose
        atexit(()->begin
            Core.println("""
                Pool statistics (managed: $(managed ? "yes" : "no")):
                - requested alloc/free: $(pool_stats.req_alloc) $(pool_stats.req_free)
                - actual alloc/free: $(pool_stats.actual_alloc) $(pool_stats.actual_free)
                - amount alloc/free: $(pool_stats.amount_alloc) $(pool_stats.amount_free)
                - alloc types: $(pool_stats.alloc_1) $(pool_stats.alloc_2) $(pool_stats.alloc_3) $(pool_stats.alloc_4)""")
        end)
    end
end
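
# How the environment variable above would typically be used (illustrative,
# assuming this allocator ships as part of the CuArrays package):
#
#   ENV["CUARRAYS_MANAGED_POOL"] = "false"   # disable the background scan/reclaim task
#   using CuArrays
#
# note that explicitly setting the variable (to any value) also enables the
# exit-time statistics report via the `verbose` check above.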

# scan every pool and manage the usage history
#
# returns a boolean indicating whether any pool is active (this can be a false negative)
function scan()
    gc(false) # quick, incremental collection

    lock(pool_lock) do
        active = false

        @inbounds for pid in 1:length(pool_history)
            nused = length(pools_used[pid])
            navail = length(pools_avail[pid])
            history = pool_history[pid]

            if nused+navail > 0
                usage = pool_usage[pid]
                current_usage = nused / (nused + navail)

                if any(usage->usage != current_usage, history)
                    # shift the history window with the recorded usage
                    history = pool_history[pid]
                    pool_history[pid] = (Base.tail(pool_history[pid])..., usage)

                    # reset the usage with the current one
                    pool_usage[pid] = current_usage
                end

                if usage != current_usage
                    active = true
                end
            else
                pool_usage[pid] = 1
                pool_history[pid] = initial_usage
            end
        end

        active
    end
end
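
# A minimal sketch of how a pool's history window shifts (illustrative values only):
# with USAGE_WINDOW == 5, recording a new usage drops the oldest entry.
let history = (1.0, 1.0, 0.8, 0.5, 0.5), usage = 0.25
    @assert (Base.tail(history)..., usage) == (1.0, 0.8, 0.5, 0.5, 0.25)
end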

# reclaim free objects
function reclaim(full::Bool=false)
    lock(pool_lock) do
        if full
            # reclaim all currently unused buffers
            for (pid, pl) in enumerate(pools_avail)
                for buf in pl
                    pool_stats.actual_free += 1
                    Mem.free(buf)
                    pool_stats.amount_free += poolsize(pid)
                end
                empty!(pl)
            end
        else
            # only reclaim really unused buffers
            @inbounds for pid in 1:length(pool_usage)
                nused = length(pools_used[pid])
                navail = length(pools_avail[pid])
                recent_usage = (pool_history[pid]..., pool_usage[pid])

                if navail > 0
                    # reclaim as much as the usage allows
                    reclaimable = floor(Int, (1-maximum(recent_usage))*(nused+navail))
                    @assert reclaimable <= navail

                    while reclaimable > 0
                        buf = pop!(pools_avail[pid])
                        pool_stats.actual_free += 1
                        Mem.free(buf)
                        pool_stats.amount_free += poolsize(pid)
                        reclaimable -= 1
                    end
                end
            end
        end
    end
end
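
# A worked example of the reclaim arithmetic above (illustrative values only):
# with 2 used and 6 available buffers and a recent peak usage of 0.5, at most half
# of the 8 buffers need to stay cached, so up to floor(0.5 * 8) == 4 of the
# available buffers may be released.
let nused = 2, navail = 6, recent_usage = (0.5, 0.25, 0.25, 0.25, 0.25, 0.25)
    reclaimable = floor(Int, (1 - maximum(recent_usage)) * (nused + navail))
    @assert reclaimable == 4
end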


## interface

# request a buffer able to hold at least `bytes` bytes, preferably by reusing a
# cached buffer from the appropriate pool
function alloc(bytes)
    pool_stats.req_alloc += 1

    pid = poolidx(bytes)
    create_pools(pid)

    @inbounds used = pools_used[pid]
    @inbounds avail = pools_avail[pid]

    lock(pool_lock) do
        # 1. find an unused buffer in our pool
        buf = if !isempty(avail)
            pool_stats.alloc_1 += 1
            pop!(avail)
        else
            try
                # 2. didn't have one, so allocate a new buffer
                buf = Mem.alloc(poolsize(pid))
                pool_stats.alloc_2 += 1
                pool_stats.actual_alloc += 1
                pool_stats.amount_alloc += poolsize(pid)
                buf
            catch e
                # only handle out-of-memory errors (CUDA_ERROR_OUT_OF_MEMORY)
                e == CUDAdrv.CuError(2) || rethrow()
                # 3. that failed; have Julia collect garbage and try 1. again
                gc(true) # full collection
                if !isempty(avail)
                    pool_stats.alloc_3 += 1
                    buf = pop!(avail)
                else
                    # 4. still nothing, so reclaim all unused buffers and try 2. again
                    reclaim(true)
                    buf = Mem.alloc(poolsize(pid))
                    pool_stats.alloc_4 += 1
                    pool_stats.actual_alloc += 1
                    pool_stats.amount_alloc += poolsize(pid)
                    buf
                end
            end
        end

        push!(used, buf)

        current_usage = length(used) / (length(avail) + length(used))
        pool_usage[pid] = max(pool_usage[pid], current_usage)

        buf
    end
end
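
# Reading the counters (illustrative note): every request that `alloc` satisfies
# bumps exactly one of `alloc_1`..`alloc_4`, so their sum should match
# `pool_stats.req_alloc` up to any allocations that ultimately failed, and a large
# share of `alloc_3`/`alloc_4` suggests the pool is repeatedly hitting
# out-of-memory conditions.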

# return a buffer obtained from `alloc` to its pool for future reuse; `bytes` should
# be the originally requested size so the buffer lands back in the same pool
function dealloc(buf, bytes)
    pool_stats.req_free += 1

    pid = poolidx(bytes)

    @inbounds used = pools_used[pid]
    @inbounds avail = pools_avail[pid]

    lock(pool_lock) do
        delete!(used, buf)

        push!(avail, buf)

        current_usage = length(used) / (length(used) + length(avail))
        pool_usage[pid] = max(pool_usage[pid], current_usage)
    end

    return
end
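
# A usage sketch of the interface above (illustrative; `Mem.alloc` needs an active
# CUDA context, so this is shown as a comment rather than a runnable test):
#
#   buf = alloc(1000)      # pool 11: reuses a cached 1024-byte buffer or allocates one
#   # ... use `buf` as backing storage for a CuArray ...
#   dealloc(buf, 1000)     # return the buffer to pool 11 for future reuse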