
Commit b6e026b

Author: Lincoln Stein

revert to using deepcopy() method for making VRAM copies
1 parent 6f699c1 commit b6e026b
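For readers skimming the diff: the commit drops the state-dict-template path and goes back to copying the whole model. As a minimal, hypothetical sketch of the restored approach (not code from this commit; `nn.Linear` stands in for a cached model):

```python
import copy

import torch
import torch.nn as nn

# Hypothetical stand-in for a model held in the RAM cache.
cpu_model = nn.Linear(8, 8)

device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# The deepcopy() approach this commit reverts to: duplicate the whole module,
# move only the duplicate to the execution device, and leave the original in RAM.
gpu_model = copy.deepcopy(cpu_model)
if hasattr(gpu_model, "to"):
    gpu_model = gpu_model.to(device)

with torch.inference_mode():
    _ = gpu_model(torch.randn(1, 8, device=device))

# "Unloading" is just dropping the copy; the RAM-resident original never moved.
del gpu_model
```

The trade-off: locking pays for a full copy of the weights, but no state-dict bookkeeping is needed to restore the RAM copy afterwards.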

File tree

6 files changed (+47, -179 lines)


invokeai/app/invocations/compel.py

Lines changed: 0 additions & 3 deletions

@@ -92,8 +92,6 @@ def _lora_loader() -> Iterator[Tuple[LoRAModelRaw, float]]:
             ),
         ):

-            print(f'DEBUG: compel: tid={threading.current_thread().ident}, gpu={TorchDevice.choose_torch_device()}, text_encoder={text_encoder.device}')
-
             assert isinstance(text_encoder, CLIPTextModel)
             assert isinstance(tokenizer, CLIPTokenizer)
             compel = Compel(
@@ -117,7 +115,6 @@ def _lora_loader() -> Iterator[Tuple[LoRAModelRaw, float]]:
         conditioning_data = ConditioningFieldData(conditionings=[BasicConditioningInfo(embeds=c)])

         conditioning_name = context.conditioning.save(conditioning_data)
-        print(f'DEBUG: conditioning_name={conditioning_name}')

         return ConditioningOutput(
             conditioning=ConditioningField(

invokeai/app/invocations/latent.py

Lines changed: 0 additions & 1 deletion

@@ -1080,7 +1080,6 @@ def invoke(self, context: InvocationContext) -> ImageOutput:
         with torch.inference_mode():
             # copied from diffusers pipeline
             latents = latents / vae.config.scaling_factor
-            print(f'DEBUG: tid={threading.current_thread().ident}, gpu={TorchDevice.choose_torch_device()}, latent_device={latents.device}')
             image = vae.decode(latents, return_dict=False)[0]
             image = (image / 2 + 0.5).clamp(0, 1)  # denormalize
             # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16

invokeai/app/services/model_manager/model_manager_default.py

Lines changed: 0 additions & 2 deletions

@@ -76,8 +76,6 @@ def build_model_manager(

         ram_cache = ModelCache(
             max_cache_size=app_config.ram,
-            max_vram_cache_size=app_config.vram,
-            lazy_offloading=app_config.lazy_offload,
             logger=logger,
         )
         convert_cache = ModelConvertCache(cache_path=app_config.convert_cache_path, max_size=app_config.convert_cache)

invokeai/backend/model_manager/load/model_cache/model_cache_base.py

Lines changed: 1 addition & 34 deletions

@@ -43,26 +43,9 @@ def model(self) -> AnyModel:

 @dataclass
 class CacheRecord(Generic[T]):
-    """
-    Elements of the cache:
-
-    key: Unique key for each model, same as used in the models database.
-    model: Model in memory.
-    state_dict: A read-only copy of the model's state dict in RAM. It will be
-        used as a template for creating a copy in the VRAM.
-    size: Size of the model
-    loaded: True if the model's state dict is currently in VRAM
-
-    Before a model is executed, the state_dict template is copied into VRAM,
-    and then injected into the model. When the model is finished, the VRAM
-    copy of the state dict is deleted, and the RAM version is reinjected
-    into the model.
-    """
+    """Elements of the cache."""

     key: str
-    model: T
-    device: torch.device
-    state_dict: Optional[Dict[str, torch.Tensor]]
     size: int
     model: T
     loaded: bool = False
@@ -130,28 +113,12 @@ def get_execution_device(self) -> torch.device:
         """
         pass

-    @property
-    @abstractmethod
-    def lazy_offloading(self) -> bool:
-        """Return true if the cache is configured to lazily offload models in VRAM."""
-        pass
-
     @property
     @abstractmethod
     def max_cache_size(self) -> float:
         """Return true if the cache is configured to lazily offload models in VRAM."""
         pass

-    @abstractmethod
-    def offload_unlocked_models(self, size_required: int) -> None:
-        """Offload from VRAM any models not actively in use."""
-        pass
-
-    @abstractmethod
-    def move_model_to_device(self, cache_entry: CacheRecord[AnyModel], target_device: torch.device) -> None:
-        """Move model into the indicated device."""
-        pass
-
     @property
     @abstractmethod
     def stats(self) -> Optional[CacheStats]:
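For reference, a minimal sketch of roughly what the slimmed-down record amounts to after this hunk. The lock-counting helpers are assumptions inferred from the `_locks`/`locked` references elsewhere in the diff, not a verbatim copy of the class:

```python
from dataclasses import dataclass
from typing import Generic, TypeVar

T = TypeVar("T")


@dataclass
class CacheRecordSketch(Generic[T]):
    """Elements of the cache (approximate post-commit shape)."""

    key: str              # unique key, same as used in the models database
    size: int             # estimated size of the model in bytes
    model: T              # the model object, resident in RAM
    loaded: bool = False  # True once a copy has been placed on an execution device
    _locks: int = 0       # lock counter (assumed from the surrounding class)

    def lock(self) -> None:
        self._locks += 1

    def unlock(self) -> None:
        self._locks -= 1
        assert self._locks >= 0

    @property
    def locked(self) -> bool:
        return self._locks > 0
```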

invokeai/backend/model_manager/load/model_cache/model_cache_default.py

Lines changed: 37 additions & 126 deletions

@@ -19,10 +19,8 @@
 """

 import gc
-import math
 import sys
 import threading
-import time
 from contextlib import contextmanager, suppress
 from logging import Logger
 from threading import BoundedSemaphore
@@ -31,7 +29,7 @@
 import torch

 from invokeai.backend.model_manager import AnyModel, SubModelType
-from invokeai.backend.model_manager.load.memory_snapshot import MemorySnapshot, get_pretty_snapshot_diff
+from invokeai.backend.model_manager.load.memory_snapshot import MemorySnapshot
 from invokeai.backend.util.devices import TorchDevice
 from invokeai.backend.util.logging import InvokeAILogger

@@ -42,11 +40,6 @@
 # Default is roughly enough to hold three fp16 diffusers models in RAM simultaneously
 DEFAULT_MAX_CACHE_SIZE = 6.0

-# amount of GPU memory to hold in reserve for use by generations (GB)
-# Empirically this value seems to improve performance without starving other
-# processes.
-DEFAULT_MAX_VRAM_CACHE_SIZE = 0.25
-
 # actual size of a gig
 GIG = 1073741824

@@ -60,12 +53,10 @@ class ModelCache(ModelCacheBase[AnyModel]):
     def __init__(
         self,
         max_cache_size: float = DEFAULT_MAX_CACHE_SIZE,
-        max_vram_cache_size: float = DEFAULT_MAX_VRAM_CACHE_SIZE,
         storage_device: torch.device = torch.device("cpu"),
         execution_devices: Optional[Set[torch.device]] = None,
         precision: torch.dtype = torch.float16,
         sequential_offload: bool = False,
-        lazy_offloading: bool = True,
         sha_chunksize: int = 16777216,
         log_memory_usage: bool = False,
         logger: Optional[Logger] = None,
@@ -76,18 +67,14 @@ def __init__(
         :param max_cache_size: Maximum size of the RAM cache [6.0 GB]
         :param storage_device: Torch device to save inactive model in [torch.device('cpu')]
         :param precision: Precision for loaded models [torch.float16]
-        :param lazy_offloading: Keep model in VRAM until another model needs to be loaded
         :param sequential_offload: Conserve VRAM by loading and unloading each stage of the pipeline sequentially
         :param log_memory_usage: If True, a memory snapshot will be captured before and after every model cache
             operation, and the result will be logged (at debug level). There is a time cost to capturing the memory
             snapshots, so it is recommended to disable this feature unless you are actively inspecting the model cache's
             behaviour.
         """
-        # allow lazy offloading only when vram cache enabled
-        self._lazy_offloading = lazy_offloading and max_vram_cache_size > 0
         self._precision: torch.dtype = precision
         self._max_cache_size: float = max_cache_size
-        self._max_vram_cache_size: float = max_vram_cache_size
         self._storage_device: torch.device = storage_device
         self._ram_lock = threading.Lock()
         self._logger = logger or InvokeAILogger.get_logger(self.__class__.__name__)
@@ -111,11 +98,6 @@ def logger(self) -> Logger:
         """Return the logger used by the cache."""
         return self._logger

-    @property
-    def lazy_offloading(self) -> bool:
-        """Return true if the cache is configured to lazily offload models in VRAM."""
-        return self._lazy_offloading
-
     @property
     def storage_device(self) -> torch.device:
         """Return the storage device (e.g. "CPU" for RAM)."""
@@ -233,10 +215,9 @@ def put(
         if key in self._cached_models:
             return
         self.make_room(size)
-        state_dict = model.state_dict() if isinstance(model, torch.nn.Module) else None
-        cache_record = CacheRecord(key=key, model=model, device=self.storage_device, state_dict=state_dict, size=size)
-        self._cached_models[key] = cache_record
-        self._cache_stack.append(key)
+        cache_record = CacheRecord(key, model=model, size=size)
+        self._cached_models[key] = cache_record
+        self._cache_stack.append(key)

     def get(
         self,
@@ -296,107 +277,6 @@ def _make_cache_key(self, model_key: str, submodel_type: Optional[SubModelType]
         else:
             return model_key

-    def offload_unlocked_models(self, size_required: int) -> None:
-        """Move any unused models from VRAM."""
-        reserved = self._max_vram_cache_size * GIG
-        vram_in_use = torch.cuda.memory_allocated() + size_required
-        self.logger.debug(f"{(vram_in_use/GIG):.2f}GB VRAM needed for models; max allowed={(reserved/GIG):.2f}GB")
-        for _, cache_entry in sorted(self._cached_models.items(), key=lambda x: x[1].size):
-            if vram_in_use <= reserved:
-                break
-            if not cache_entry.loaded:
-                continue
-            if not cache_entry.locked:
-                self.move_model_to_device(cache_entry, self.storage_device)
-                cache_entry.loaded = False
-                vram_in_use = torch.cuda.memory_allocated() + size_required
-                self.logger.debug(
-                    f"Removing {cache_entry.key} from VRAM to free {(cache_entry.size/GIG):.2f}GB; vram free = {(torch.cuda.memory_allocated()/GIG):.2f}GB"
-                )
-
-        TorchDevice.empty_cache()
-
-    def move_model_to_device(self, cache_entry: CacheRecord[AnyModel], target_device: torch.device) -> None:
-        """Move model into the indicated device.
-
-        :param cache_entry: The CacheRecord for the model
-        :param target_device: The torch.device to move the model into
-
-        May raise a torch.cuda.OutOfMemoryError
-        """
-        # These attributes are not in the base ModelMixin class but in various derived classes.
-        # Some models don't have these attributes, in which case they run in RAM/CPU.
-        self.logger.debug(f"Called to move {cache_entry.key} to {target_device}")
-        if not (hasattr(cache_entry.model, "device") and hasattr(cache_entry.model, "to")):
-            return
-
-        source_device = cache_entry.device
-
-        # Note: We compare device types only so that 'cuda' == 'cuda:0'.
-        # This would need to be revised to support multi-GPU.
-        if torch.device(source_device).type == torch.device(target_device).type:
-            return
-
-        # This roundabout method for moving the model around is done to avoid
-        # the cost of moving the model from RAM to VRAM and then back from VRAM to RAM.
-        # When moving to VRAM, we copy (not move) each element of the state dict from
-        # RAM to a new state dict in VRAM, and then inject it into the model.
-        # This operation is slightly faster than running `to()` on the whole model.
-        #
-        # When the model needs to be removed from VRAM we simply delete the copy
-        # of the state dict in VRAM, and reinject the state dict that is cached
-        # in RAM into the model. So this operation is very fast.
-        start_model_to_time = time.time()
-        snapshot_before = self._capture_memory_snapshot()
-
-        try:
-            if cache_entry.state_dict is not None:
-                assert hasattr(cache_entry.model, "load_state_dict")
-                if target_device == self.storage_device:
-                    cache_entry.model.load_state_dict(cache_entry.state_dict, assign=True)
-                else:
-                    new_dict: Dict[str, torch.Tensor] = {}
-                    for k, v in cache_entry.state_dict.items():
-                        new_dict[k] = v.to(torch.device(target_device), copy=True)
-                    cache_entry.model.load_state_dict(new_dict, assign=True)
-            cache_entry.model.to(target_device)
-            cache_entry.device = target_device
-        except Exception as e:  # blow away cache entry
-            self._delete_cache_entry(cache_entry)
-            raise e
-
-        snapshot_after = self._capture_memory_snapshot()
-        end_model_to_time = time.time()
-        self.logger.debug(
-            f"Moved model '{cache_entry.key}' from {source_device} to"
-            f" {target_device} in {(end_model_to_time-start_model_to_time):.2f}s."
-            f"Estimated model size: {(cache_entry.size/GIG):.3f} GB."
-            f"{get_pretty_snapshot_diff(snapshot_before, snapshot_after)}"
-        )
-
-        if (
-            snapshot_before is not None
-            and snapshot_after is not None
-            and snapshot_before.vram is not None
-            and snapshot_after.vram is not None
-        ):
-            vram_change = abs(snapshot_before.vram - snapshot_after.vram)
-
-            # If the estimated model size does not match the change in VRAM, log a warning.
-            if not math.isclose(
-                vram_change,
-                cache_entry.size,
-                rel_tol=0.1,
-                abs_tol=10 * MB,
-            ):
-                self.logger.debug(
-                    f"Moving model '{cache_entry.key}' from {source_device} to"
-                    f" {target_device} caused an unexpected change in VRAM usage. The model's"
-                    " estimated size may be incorrect. Estimated model size:"
-                    f" {(cache_entry.size/GIG):.3f} GB.\n"
-                    f"{get_pretty_snapshot_diff(snapshot_before, snapshot_after)}"
-                )
-
     def print_cuda_stats(self) -> None:
         """Log CUDA diagnostics."""
         vram = "%4.2fG" % (torch.cuda.memory_allocated() / GIG)
@@ -440,12 +320,43 @@ def make_room(self, size: int) -> None:
         while current_size + bytes_needed > maximum_size and pos < len(self._cache_stack):
             model_key = self._cache_stack[pos]
             cache_entry = self._cached_models[model_key]
+
+            refs = sys.getrefcount(cache_entry.model)
+
+            # HACK: This is a workaround for a memory-management issue that we haven't tracked down yet. We are directly
+            # going against the advice in the Python docs by using `gc.get_referrers(...)` in this way:
+            # https://docs.python.org/3/library/gc.html#gc.get_referrers
+
+            # Manually clear local variable references left by just-finished function calls;
+            # for some reason Python won't collect them even with an immediate gc.collect().
+            if refs > 2:
+                while True:
+                    cleared = False
+                    for referrer in gc.get_referrers(cache_entry.model):
+                        if type(referrer).__name__ == "frame":
+                            # RuntimeError: cannot clear an executing frame
+                            with suppress(RuntimeError):
+                                referrer.clear()
+                                cleared = True
+                                # break
+
+                    # Repeat if the referrers changed (due to the frame clear), else exit the loop.
+                    if cleared:
+                        gc.collect()
+                    else:
+                        break
+
             device = cache_entry.model.device if hasattr(cache_entry.model, "device") else None
             self.logger.debug(
-                f"Model: {model_key}, locks: {cache_entry._locks}, device: {device}, loaded: {cache_entry.loaded}"
+                f"Model: {model_key}, locks: {cache_entry._locks}, device: {device}, loaded: {cache_entry.loaded},"
+                f" refs: {refs}"
             )

-            if not cache_entry.locked:
+            # Expected refs:
+            # 1 from cache_entry
+            # 1 from getrefcount function
+            # 1 from onnx runtime object
+            if not cache_entry.locked and refs <= (3 if "onnx" in model_key else 2):
                 self.logger.debug(
                     f"Removing {model_key} from RAM cache to free at least {(size/GIG):.2f} GB (-{(cache_entry.size/GIG):.2f} GB)"
                 )
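The new refs gate in make_room() leans on CPython reference counting; the "Expected refs" comment is easier to read next to a tiny standalone demo (hypothetical objects, not InvokeAI code):

```python
import sys


class DummyModel:
    """Hypothetical stand-in for a cached model object."""


cache = {"model-key": DummyModel()}

# Holders here: the cache dict entry, plus the temporary reference created for
# getrefcount's own argument -- so this typically prints 2, the threshold the
# eviction gate treats as "nothing else is using the model".
print(sys.getrefcount(cache["model-key"]))

lingering_local = cache["model-key"]

# Any extra live reference (for example a frame from a just-finished call that
# still holds the model) bumps the count -- typically 3 now -- which is what the
# gc.get_referrers() frame-clearing workaround above tries to release.
print(sys.getrefcount(cache["model-key"]))
```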

invokeai/backend/model_manager/load/model_cache/model_locker.py

Lines changed: 9 additions & 13 deletions

@@ -2,6 +2,7 @@
 Base class and implementation of a class that moves models in and out of VRAM.
 """

+import copy
 from typing import Optional

 import torch
@@ -54,14 +55,13 @@ def lock(self) -> AnyModel:
         # NOTE that the model has to have the to() method in order for this code to move it into GPU!
         self._cache_entry.lock()
         try:
-            if self._cache.lazy_offloading:
-                self._cache.offload_unlocked_models(self._cache_entry.size)
-
-            execution_device = self._cache.get_execution_device()
-            self._cache.move_model_to_device(self._cache_entry, execution_device)
+            # We wait for a gpu to be free - may raise a ValueError
+            self._execution_device = self._cache.get_execution_device()
+            self._cache.logger.debug(f"Locking {self._cache_entry.key} in {self._execution_device}")
+            model_in_gpu = copy.deepcopy(self._cache_entry.model)
+            if hasattr(model_in_gpu, "to"):
+                model_in_gpu.to(self._execution_device)
             self._cache_entry.loaded = True
-
-            self._cache.logger.debug(f"Locking {self._cache_entry.key} in {execution_device}")
             self._cache.print_cuda_stats()
         except torch.cuda.OutOfMemoryError:
             self._cache.logger.warning("Insufficient GPU memory to load model. Aborting")
@@ -70,15 +70,11 @@ def lock(self) -> AnyModel:
         except Exception:
             self._cache_entry.unlock()
             raise
-
-        return self.model
+        return model_in_gpu

     def unlock(self) -> None:
         """Call upon exit from context."""
         if not hasattr(self.model, "to"):
             return
-
         self._cache_entry.unlock()
-        if not self._cache.lazy_offloading:
-            self._cache.offload_unlocked_models(0)
-        self._cache.print_cuda_stats()
+        self._cache.print_cuda_stats()
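A usage note on the changed return value: lock() now hands back the deep copy on the execution device rather than the cached model itself, so callers must work with the returned object. A hedged caller-side sketch, assuming only the lock()/unlock() pair shown above (constructor arguments and the inference call are illustrative, not the project's API):

```python
# Hypothetical caller-side pattern; names other than lock()/unlock() are assumptions.
locker = ModelLocker(cache=model_cache, cache_entry=cache_entry)
try:
    model_in_gpu = locker.lock()   # deep copy already moved to the execution device
    run_inference(model_in_gpu)    # placeholder for whatever the caller does with it
finally:
    locker.unlock()                # the GPU copy is freed once nothing references it
```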
