This repository has been archived by the owner on Mar 12, 2021. It is now read-only.

Commit 75f4645
Merge pull request #83 from JuliaGPU/tb/poolmgmt
Managed pool allocator
maleadt authored Jun 1, 2018
2 parents 98e32e0 + 2aed893 commit 75f4645
Showing 2 changed files with 252 additions and 23 deletions.
2 changes: 2 additions & 0 deletions src/CuArrays.jl
@@ -42,6 +42,8 @@ function __init__()
warn("Please run Pkg.build(\"CuArrays\") and restart Julia.")
return
end

__init_memory__()
end

end # module
273 changes: 250 additions & 23 deletions src/memory.jl
@@ -1,41 +1,268 @@
using CUDAdrv
# dynamic memory pool allocator
#
# this allocator sits between CuArray constructors
# and the actual memory allocation in CUDAdrv.Mem
#
# the core design is pretty simple:
# - bin allocations into multiple pools according to their size (see `poolidx`)
# - when memory is requested, check the pool for an unused buffer, or allocate one dynamically
# - conversely, when memory is released, put it in the appropriate pool for future use
#
# to avoid hogging memory and/or thrashing the Julia GC:
# - keep track of used and available memory, in order to determine the usage of each pool
# - keep track of each pool's usage, as well as a window of previous usages
# - regularly release memory from underused pools (see `reclaim(false)`)
#
# possible improvements:
# - pressure: have the `reclaim` background task reclaim more aggressively,
# and call it from the failure cascade in `alloc`
# - context management: either switch contexts when performing memory operations,
# or just use unified memory for all allocations.
# - per-device pools

const pool_lock = ReentrantLock()


## infrastructure

const pools_used = Vector{Set{Mem.Buffer}}()
const pools_avail = Vector{Vector{Mem.Buffer}}()

poolidx(n) = ceil(Int, log2(n))+1
poolsize(idx) = 2^(idx-1)
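# for illustration, how a couple of request sizes bin under these definitions
# (hypothetical REPL values, computed from the formulas above):
#
#   julia> poolidx(1000), poolsize(poolidx(1000))
#   (11, 1024)    # a 1000-byte request is served from the 1 KiB pool
#
#   julia> poolidx(4096), poolsize(poolidx(4096))
#   (13, 4096)    # exact powers of two fit their pool with no slack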

function create_pools(idx)
if length(pool_usage) >= idx
# fast-path without taking a lock
return
end

lock(pool_lock) do
while length(pool_usage) < idx
push!(pool_usage, 1)
push!(pool_history, initial_usage)
push!(pools_used, Set{Mem.Buffer}())
push!(pools_avail, Vector{Mem.Buffer}())
end
end
end
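# (the unlocked length check above is a benign race: the pool vectors only ever
# grow, so a stale, too-small read simply falls through to the locked slow path)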


## management

const USAGE_WINDOW = 5
const initial_usage = Tuple(1 for _ in 1:USAGE_WINDOW)

const pool_usage = Vector{Float64}()
const pool_history = Vector{NTuple{USAGE_WINDOW,Float64}}()

# min and max time between successive background task iterations.
# when the pool usages don't change, scan less regularly.
#
# together with USAGE_WINDOW, this determines how long it takes for objects to get reclaimed
const MIN_DELAY = 1.0
const MAX_DELAY = 5.0
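# back-of-the-envelope (not a guarantee from the code): while nothing changes, the
# background task sleeps 1s, 2s, 4s, 5s, 5s, ..., and a pool's history needs
# USAGE_WINDOW differing scans to decay, so idle buffers become reclaimable after
# roughly USAGE_WINDOW * MAX_DELAY = 25 seconds at worst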

# debug stats
mutable struct PoolStats
req_alloc::Int
req_free::Int

actual_alloc::Int
actual_free::Int

amount_alloc::Int
amount_free::Int

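# how each request was satisfied; these map to the four numbered steps in
# `alloc` below: pool hit, fresh allocation, pool hit after gc, allocation
# after a full reclaim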
alloc_1::Int
alloc_2::Int
alloc_3::Int
alloc_4::Int
end
const pool_stats = PoolStats(0, 0, 0, 0, 0, 0, 0, 0, 0, 0)

function __init_memory__()
create_pools(30) # up to 512 MiB

managed = parse(Bool, get(ENV, "CUARRAYS_MANAGED_POOL", "true"))
if managed
delay = MIN_DELAY
@schedule begin
while true
if scan()
delay = MIN_DELAY
else
delay = min(delay*2, MAX_DELAY)
end

reclaim()

sleep(delay)
end
end
end

verbose = haskey(ENV, "CUARRAYS_MANAGED_POOL")
if verbose
atexit(()->begin
Core.println("""
Pool statistics (managed: $(managed ? "yes" : "no")):
- requested alloc/free: $(pool_stats.req_alloc) $(pool_stats.req_free)
- actual alloc/free: $(pool_stats.actual_alloc) $(pool_stats.actual_free)
- amount alloc/free: $(pool_stats.amount_alloc) $(pool_stats.amount_free)
- alloc types: $(pool_stats.alloc_1) $(pool_stats.alloc_2) $(pool_stats.alloc_3) $(pool_stats.alloc_4)""")
end)
end

end
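# usage note: CUARRAYS_MANAGED_POOL=false disables the background management
# task, and defining the variable at all (either value) enables the exit-time
# statistics dump above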

# scan every pool and manage the usage history
#
# returns a boolean indicating whether any pool is active (this can be a false negative)
function scan()
gc(false) # quick, incremental collection

lock(pool_lock) do
active = false

@inbounds for pid in 1:length(pool_history)
nused = length(pools_used[pid])
navail = length(pools_avail[pid])
history = pool_history[pid]

if nused+navail > 0
usage = pool_usage[pid]
current_usage = nused / (nused + navail)

if any(u -> u != current_usage, history)
# shift the history window with the recorded usage
pool_history[pid] = (Base.tail(history)..., usage)

# reset the usage with the current one
pool_usage[pid] = current_usage
end

if usage != current_usage
active = true
end
else
pool_usage[pid] = 1
pool_history[pid] = initial_usage
end
end

active
end
end
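# e.g. (hypothetical numbers) a history of (1.0, 1.0, 1.0, 1.0, 1.0) with a
# recorded peak usage of 0.5 shifts to (1.0, 1.0, 1.0, 1.0, 0.5); after
# USAGE_WINDOW such scans the old peak has aged out of the window entirely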

# reclaim free objects
function reclaim(full::Bool=false)
lock(pool_lock) do
if full
# reclaim all currently unused buffers
for (pid, pl) in enumerate(pools_avail)
for buf in pl
pool_stats.actual_free += 1
Mem.free(buf)
pool_stats.amount_free += poolsize(pid)
end
empty!(pl)
end
else
# only reclaim really unused buffers
@inbounds for pid in 1:length(pool_usage)
nused = length(pools_used[pid])
navail = length(pools_avail[pid])
recent_usage = (pool_history[pid]..., pool_usage[pid])

if navail > 0
# reclaim as much as the usage allows
reclaimable = floor(Int, (1-maximum(recent_usage))*(nused+navail))
@assert reclaimable <= navail

while reclaimable > 0
buf = pop!(pools_avail[pid])
pool_stats.actual_free += 1
Mem.free(buf)
pool_stats.amount_free += poolsize(pid)
reclaimable -= 1
end
end
end
end
end
end
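# worked example (hypothetical numbers): a pool holding 10 buffers, 2 used and
# 8 available, with a recent peak usage of 0.4, gives
# reclaimable = floor((1 - 0.4) * 10) = 6, so 6 of the 8 idle buffers are freed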


## interface

function alloc(bytes)
pool_stats.req_alloc += 1

pid = poolidx(bytes)
create_pools(pid)

@inbounds used = pools_used[pid]
@inbounds avail = pools_avail[pid]

lock(pool_lock) do
# 1. find an unused buffer in our pool
buf = if !isempty(avail)
pool_stats.alloc_1 += 1
pop!(avail)
else
try
# 2. didn't have one, so allocate a new buffer
buf = Mem.alloc(poolsize(pid))
pool_stats.alloc_2 += 1
pool_stats.actual_alloc += 1
pool_stats.amount_alloc += poolsize(pid)
buf
catch e
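# only handle out-of-memory errors (CUDA error 2 is CUDA_ERROR_OUT_OF_MEMORY)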
e == CUDAdrv.CuError(2) || rethrow()
# 3. that failed; make Julia collect objects and try 1. again
gc(true) # full collection
if !isempty(avail)
pool_stats.alloc_3 += 1
buf = pop!(avail)
else
# 4. didn't have one, so reclaim all other unused buffers and do 2. again
reclaim(true)
buf = Mem.alloc(poolsize(pid))
pool_stats.alloc_4 += 1
pool_stats.actual_alloc += 1
pool_stats.amount_alloc += poolsize(pid)
buf
end
end
end

push!(used, buf)

current_usage = length(used) / (length(avail) + length(used))
pool_usage[pid] = max(pool_usage[pid], current_usage)

buf
end
end

function dealloc(buf, bytes)
pool_stats.req_free += 1

pid = poolidx(bytes)

@inbounds used = pools_used[pid]
@inbounds avail = pools_avail[pid]

lock(pool_lock) do
delete!(used, buf)

push!(avail, buf)

current_usage = length(used) / (length(used) + length(avail))
pool_usage[pid] = max(pool_usage[pid], current_usage)
end

return
end
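# hypothetical round-trip through the interface above (a sketch; note that the
# byte count passed to `dealloc` must match the original request so the buffer
# bins back into the same pool):
#
#   bytes = sizeof(Float32) * 1024   # 4096 bytes -> pool 13, the 4 KiB bin
#   buf = alloc(bytes)               # pool hit, or a fresh Mem.alloc
#   # ... use `buf` through CUDAdrv ...
#   dealloc(buf, bytes)              # back onto the 4 KiB avail list for reuse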
