Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Custom spilling handler  #12287

Open
wants to merge 13 commits into
base: branch-23.04
Choose a base branch
from
70 changes: 64 additions & 6 deletions python/cudf/cudf/core/buffer/spill_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,10 +8,10 @@
import threading
import traceback
import warnings
import weakref
from collections import defaultdict
from dataclasses import dataclass
from typing import Dict, List, Optional, Tuple
from typing import Callable, Dict, List, Optional, Tuple
from weakref import WeakKeyDictionary, WeakValueDictionary

import rmm.mr

Expand Down Expand Up @@ -218,7 +218,10 @@ class SpillManager:
SpillStatistics for the different levels.
"""

_buffers: weakref.WeakValueDictionary[int, SpillableBuffer]
_buffers: WeakValueDictionary[int, SpillableBuffer]
_spill_handlers: WeakKeyDictionary[
SpillableBuffer, Tuple[Callable[..., Optional[int]], Tuple, Dict]
]
statistics: SpillStatistics

def __init__(
Expand All @@ -229,10 +232,11 @@ def __init__(
statistic_level: int = 0,
) -> None:
self._lock = threading.Lock()
self._buffers = weakref.WeakValueDictionary()
self._buffers = WeakValueDictionary()
self._id_counter = 0
self._spill_on_demand = spill_on_demand
self._device_memory_limit = device_memory_limit
self._spill_handlers = WeakKeyDictionary()
self.statistics = SpillStatistics(statistic_level)

if self._spill_on_demand:
Expand Down Expand Up @@ -347,13 +351,22 @@ def spill_device_memory(self, nbytes: int) -> int:
"""
spilled = 0
for buf in self.buffers(order_by_access_time=True):
if spilled >= nbytes:
break
if buf.lock.acquire(blocking=False):
try:
if not buf.is_spilled and buf.spillable:
# Check if `buf` has a registered spill handler
handler = self._spill_handlers.get(buf, None)
if handler is not None:
madsbk marked this conversation as resolved.
Show resolved Hide resolved
func, args, kwargs = handler
s = func(*args, **kwargs)
if s is not None:
madsbk marked this conversation as resolved.
Show resolved Hide resolved
spilled += s
self._spill_handlers.pop(buf, None)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It seems a bit odd that the spill handler would be removed when the buffer is spilled. Is that always what we want?

Also, the usage makes it clear that these spill handlers are only appropriate for device to host spilling. Should we key the spill_handlers dict on both the buffer and the source/target of the spill?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

OK I see why it's removed, I guess it's being added every time the cached property is accessed.

continue
buf.spill(target="cpu")
spilled += buf.size
if spilled >= nbytes:
break
finally:
buf.lock.release()
return spilled
Expand Down Expand Up @@ -385,6 +398,51 @@ def spill_to_device_limit(self, device_limit: int = None) -> int:
)
return self.spill_device_memory(nbytes=unspilled - limit)

def register_spill_handler(
    self,
    buffer: SpillableBuffer,
    func: Callable[..., Optional[int]],
    *args,
    **kwargs,
) -> None:
    """Register a spill handler for a buffer

    This enables customization of how to handle the spilling of a specific
    buffer. When the spill manager chooses to spill the buffer, it calls
    the provided callback function instead of spilling the buffer itself.

    The callback function is called like `func(*args, **kwargs)` and must
    return the number of bytes freed or None. If None, the spill manager
    will spill `buffer`.

    Warning
    -------
    The spill manager keeps a reference to `func`, `args`, and `kwargs`
    thus everything they reference are also kept alive.

    Parameters
    ----------
    buffer : SpillableBuffer
        The buffer `func` handles.
    func : Callable[..., Optional[int]]
        The spill handler.
    *args
        Positional arguments passed to `func`.
    **kwargs
        Keyword arguments passed to `func`.

    Raises
    ------
    RuntimeError
        If a spill handler is already registered for `buffer`.
    """

    # NOTE(review): this check-then-insert pair is not guarded by
    # `self._lock`; concurrent registrations for the same buffer could
    # race — confirm callers serialize registration.
    if buffer in self._spill_handlers:
        raise RuntimeError(
            f"Spill handler already registered for {buffer}"
        )
    # Handlers are kept in a WeakKeyDictionary keyed on the buffer, so an
    # entry disappears automatically when its buffer is garbage collected.
    self._spill_handlers[buffer] = (func, args, kwargs)

def __repr__(self) -> str:
spilled = sum(buf.size for buf in self.buffers() if buf.is_spilled)
unspilled = sum(
Expand Down
88 changes: 87 additions & 1 deletion python/cudf/cudf/core/buffer/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,14 +2,20 @@

from __future__ import annotations

import functools
import sys
import threading
import weakref
from contextlib import ContextDecorator
from typing import Any, Dict, Optional, Tuple, Union
from typing import Any, Dict, Optional, Tuple, TypeVar, Union

import cudf
from cudf.core.buffer.buffer import Buffer, cuda_array_interface_wrapper
from cudf.core.buffer.spill_manager import get_global_manager
from cudf.core.buffer.spillable_buffer import SpillableBuffer, SpillLock

T = TypeVar("T")


def as_buffer(
data: Union[int, Any],
Expand Down Expand Up @@ -134,3 +140,83 @@ def get_spill_lock() -> Union[SpillLock, None]:
_id = threading.get_ident()
spill_lock, _ = _thread_spill_locks.get(_id, (None, 0))
return spill_lock


def _clear_property_cache(
    instance_ref: weakref.ReferenceType[T], nbytes: int, attrname: str
) -> Optional[int]:
    """Spill handler that drops a `cached_property` entry on an instance.

    The signature is compatible with `SpillManager.register_spill_handler`.
    A weak reference to the instance is taken so this handler does not keep
    the instance alive.

    Parameters
    ----------
    instance_ref
        Weak reference to the instance owning the cache.
    nbytes : int
        Size of the cached data in bytes.
    attrname : str
        Name of the cached attribute.

    Return
    ------
    Optional[int]
        `nbytes` if the cache entry was dropped, 0 if the instance is
        already dead, or None to tell the spill manager to spill the
        buffer as usual.
    """

    owner = instance_ref()
    if owner is None:
        # The instance (and with it the cache) is gone — nothing to free.
        return 0

    entry = owner.__dict__.get(attrname, None)
    if entry is None:
        # The cache has already been cleared.
        return None

    # If `entry` is referenced anywhere outside the cache, dropping the
    # cache frees no memory. Exactly three references originate here:
    # `owner.__dict__`, the local `entry`, and the `sys.getrefcount`
    # argument — anything beyond that is external.
    if sys.getrefcount(entry) > 3:
        return None

    # A single dict.pop is atomic in CPython, which keeps this clear
    # operation thread-safe without an explicit lock.
    owner.__dict__.pop(attrname, None)
    return nbytes


class cached_property(functools.cached_property):
    """A `cached_property` whose cache is deleted instead of spilled.

    When spilling is disabled (the default case), this decorator behaves
    exactly like `functools.cached_property`.

    When spilling is enabled, the first access registers a spill handler
    for the cached data that deletes the data rather than spilling it.
    For now, only cached Columns are handled this way.
    See `SpillManager.register_spill_handler`.
    """

    def __get__(self, instance: T, owner=None):
        already_cached = self.attrname in instance.__dict__
        value = super().__get__(instance, owner)
        # Only a freshly-cached Column gets a spill handler; cache hits
        # and non-Column values pass straight through.
        if already_cached or not isinstance(
            value, cudf.core.column.ColumnBase
        ):
            return value

        spill_mgr = get_global_manager()
        if spill_mgr is None:
            # Spilling is disabled; act like functools.cached_property.
            return value

        data = value.base_data
        if data is None or data.nbytes == 0:
            return value
        assert isinstance(data, SpillableBuffer)

        # Register a handler that clears the cache instead of spilling the
        # buffer; a weakref avoids keeping `instance` alive.
        spill_mgr.register_spill_handler(
            data,
            _clear_property_cache,
            weakref.ref(instance),
            nbytes=data.nbytes,
            attrname=self.attrname,
        )
        return value
4 changes: 2 additions & 2 deletions python/cudf/cudf/core/index.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@
import math
import pickle
import warnings
from functools import cached_property
from numbers import Number
from typing import (
Any,
Expand Down Expand Up @@ -38,6 +37,7 @@
is_string_dtype,
)
from cudf.core._base_index import BaseIndex, _index_astype_docstring
from cudf.core.buffer.utils import cached_property
from cudf.core.column import (
CategoricalColumn,
ColumnBase,
Expand Down Expand Up @@ -245,7 +245,7 @@ def step(self):
def _num_rows(self):
return len(self)

@cached_property # type: ignore
@cached_property
@_cudf_nvtx_annotate
def _values(self):
if len(self) > 0:
Expand Down
56 changes: 56 additions & 0 deletions python/cudf/cudf/tests/test_spilling.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
import cudf
import cudf.core.buffer.spill_manager
import cudf.options
from cudf._lib.column import Column
from cudf.core.abc import Serializable
from cudf.core.buffer import (
Buffer,
Expand All @@ -36,6 +37,8 @@
SpillableBufferSlice,
SpillLock,
)
from cudf.core.buffer.utils import cached_property
from cudf.core.column import column
from cudf.testing._utils import assert_eq

if get_global_manager() is not None:
Expand Down Expand Up @@ -609,3 +612,56 @@ def test_statistics_expose(manager: SpillManager):
assert stat.count == 10
assert stat.total_nbytes == buffers[0].nbytes * 10
assert stat.spilled_nbytes == buffers[0].nbytes * 10


def test_cached_property(manager: SpillManager):
    """Exercise the cache-clearing spill handler of `cached_property`."""

    class HasCachedColumn:
        @cached_property
        def cached_column(self) -> Column:
            return column.arange(3)

    # Accessing the property registers a spill handler for the buffer
    # backing the cached column.
    instance = HasCachedColumn()
    cached_col = instance.cached_column
    assert len(manager.buffers()) == 1
    assert manager.buffers()[0] is cached_col.base_data
    assert len(manager._spill_handlers) == 1

    # While `cached_col` is referenced here, the handler declines and the
    # buffer is spilled the ordinary way.
    assert manager.spill_device_memory(nbytes=1) == gen_df_data_nbytes
    assert len(manager.buffers()) == 1
    assert len(manager._spill_handlers) == 1

    # Unspill and drop our reference; the cached buffer and its spill
    # handler both remain registered.
    cached_col.base_data.spill(target="gpu")
    del cached_col
    assert len(manager.buffers()) == 1
    assert len(manager._spill_handlers) == 1

    # With no outside reference left, spilling the cached buffer clears
    # the cache (and the handler) instead of spilling.
    assert manager.spill_device_memory(nbytes=1) == gen_df_data_nbytes
    assert len(manager.buffers()) == 0
    assert len(manager._spill_handlers) == 0


def test_spilling_of_range_index(manager: SpillManager):
    """A materialized RangeIndex is deleted, not spilled, by the manager."""
    frame = single_column_df(target="gpu")
    assert isinstance(frame.index, cudf.RangeIndex)
    assert spilled_and_unspilled(manager) == (0, gen_df_data_nbytes)

    # Materialize the index column.
    frame.index._values
    assert spilled_and_unspilled(manager) == (0, 2 * gen_df_data_nbytes)

    # The data column has the oldest access time, so it spills first.
    manager.spill_device_memory(nbytes=1)
    assert spilled_and_unspilled(manager) == (
        gen_df_data_nbytes,
        gen_df_data_nbytes,
    )

    # The materialized index is deleted instead of being spilled.
    manager.spill_device_memory(nbytes=1)
    assert spilled_and_unspilled(manager) == (gen_df_data_nbytes, 0)