
Commit 44e5edc

ProExpertProg authored and minpeter committed
[BugFix][V1] Fix memory profiling bug (vllm-project#18974)
Signed-off-by: luka <luka@neuralmagic.com>
Signed-off-by: minpeter <kali2005611@gmail.com>
1 parent 44fbbb6 commit 44e5edc

3 files changed: +53 -16 lines changed


tests/models/test_initialization.py

Lines changed: 2 additions & 0 deletions
@@ -86,6 +86,8 @@ def _initialize_kv_caches_v1(self, vllm_config):
             } if model_info.speculative_model else None,
             trust_remote_code=model_info.trust_remote_code,
             max_model_len=model_info.max_model_len,
+            # these tests seem to produce leftover memory
+            gpu_memory_utilization=0.80,
             load_format="dummy",
             hf_overrides=hf_overrides,
         )

tests/v1/sample/test_logprobs.py

Lines changed: 8 additions & 5 deletions
@@ -42,7 +42,7 @@ def vllm_model(vllm_runner, request) -> Generator[VllmRunner, None, None]:
             #TODO: enable this once we support it for
             # prompt logprobs.
             enable_prefix_caching=request.param,
-            gpu_memory_utilization=0.5,
+            gpu_memory_utilization=0.4,  # up to 2 alive concurrently
     ) as vllm_model:
         yield vllm_model
 
@@ -343,10 +343,13 @@ def test_max_logprobs(monkeypatch: pytest.MonkeyPatch):
     with monkeypatch.context() as m:
         m.setenv("VLLM_USE_V1", "1")
 
-        runner = VllmRunner("facebook/opt-125m",
-                            max_logprobs=1,
-                            enable_prefix_caching=False,
-                            max_model_len=256)
+        runner = VllmRunner(
+            "facebook/opt-125m",
+            max_logprobs=1,
+            enable_prefix_caching=False,
+            # 2 other llms alive during whole session
+            gpu_memory_utilization=0.15,
+            max_model_len=256)
         vllm_sampling_params = SamplingParams(logprobs=1)
         # should pass
         runner.generate(["Hello world"], sampling_params=vllm_sampling_params)
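
The lowered utilization values above are a shared-GPU budget: each engine reserves roughly total_gpu_memory * gpu_memory_utilization, and several engines can be alive in the same test session, so the fractions have to sum below 1.0. A minimal sketch of that arithmetic, not part of the commit; the fits_on_gpu helper and the 0.9 figure assumed as vLLM's default utilization are illustrative:

# Sketch: engines sharing one GPU must keep their reserved fractions under 1.0,
# otherwise the startup check added in gpu_worker.py raises at init time.
def fits_on_gpu(utilizations: list[float]) -> bool:
    """True if the combined reservations still leave free memory on the device."""
    return sum(utilizations) < 1.0

# Two fixture engines at 0.4 plus the 0.15 runner in test_max_logprobs.
assert fits_on_gpu([0.4, 0.4, 0.15])
# The old values (0.5 each, plus a runner left at the assumed 0.9 default) do not fit.
assert not fits_on_gpu([0.5, 0.5, 0.9])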

vllm/v1/worker/gpu_worker.py

Lines changed: 43 additions & 11 deletions
@@ -130,7 +130,20 @@ def init_device(self):
             _check_if_gpu_supports_dtype(self.model_config.dtype)
             gc.collect()
             torch.cuda.empty_cache()
-            self.init_gpu_memory = torch.cuda.mem_get_info()[0]
+            self.init_gpu_memory, total_gpu_memory = torch.cuda.mem_get_info()
+            requested_memory = (total_gpu_memory *
+                                self.cache_config.gpu_memory_utilization)
+            if self.init_gpu_memory < requested_memory:
+                GiB = lambda b: round(b / GiB_bytes, 2)
+                raise ValueError(
+                    f"Free memory on device ({GiB(self.init_gpu_memory)}/"
+                    f"{GiB(total_gpu_memory)} GiB) on startup is less than "
+                    f"desired GPU memory utilization "
+                    f"({self.cache_config.gpu_memory_utilization}, "
+                    f"{GiB(requested_memory)} GiB). Decrease GPU memory "
+                    f"utilization or reduce GPU memory used by other processes."
+                )
+
         else:
             raise RuntimeError(
                 f"Not support device type: {self.device_config.device}")
@@ -190,28 +203,47 @@ def determine_available_memory(self) -> int:
         # GPU did not change their memory usage during the profiling.
         assert self.init_gpu_memory > free_gpu_memory, (
             "Error in memory profiling. "
-            f"Initial free memory {self.init_gpu_memory}, current free memory"
-            f" {free_gpu_memory}. This happens when the GPU memory was "
-            "not properly cleaned up before initializing the vLLM instance.")
+            f"Initial free memory {self.init_gpu_memory/GiB_bytes} GiB, "
+            f"current free memory {free_gpu_memory/GiB_bytes} GiB. "
+            f"This happens when the GPU memory was not properly cleaned up "
+            f"before initializing the vLLM instance.")
 
         # Get the peak memory allocation recorded by torch
-        peak_memory = torch.cuda.memory_stats()["allocated_bytes.all.peak"]
+        peak_torch_memory = torch.cuda.memory_stats(
+        )["allocated_bytes.all.peak"]
 
         # Check for any memory left around that may have been allocated on the
         # gpu outside of `torch`. NCCL operations, for example, can use a few
-        # GB during a forward pass
+        # GB during a forward pass.
         torch.cuda.empty_cache()
         torch_allocated_bytes = torch.cuda.memory_stats(
         )["allocated_bytes.all.current"]
-        total_allocated_bytes = torch.cuda.mem_get_info(
-        )[1] - torch.cuda.mem_get_info()[0]
-        non_torch_allocations = total_allocated_bytes - torch_allocated_bytes
-        if non_torch_allocations > 0:
-            peak_memory += non_torch_allocations
+
+        # Reset after emptying torch cache
+        free_gpu_memory = torch.cuda.mem_get_info()[0]
+
+        # Total forward allocation (current) is equal to the diff in free memory
+        fwd_alloc_bytes = self.init_gpu_memory - free_gpu_memory
+        # We assume current non-torch allocation is equal to peak
+        non_torch_alloc_bytes = max(0, fwd_alloc_bytes - torch_allocated_bytes)
+        # Total forward allocation (peak) is peak torch + non-torch
+        peak_memory = peak_torch_memory + non_torch_alloc_bytes
+
         available_kv_cache_memory = (
             total_gpu_memory * self.cache_config.gpu_memory_utilization -
             peak_memory)
 
+        GiB = lambda b: b / GiB_bytes
+        logger.debug(
+            "Initial free memory: %.2f GiB, free memory: %.2f GiB, "
+            "total GPU memory: %.2f GiB", GiB(self.init_gpu_memory),
+            GiB(free_gpu_memory), GiB(total_gpu_memory))
+        logger.debug(
+            "Peak torch memory: %.2f GiB, non-torch forward-pass memory: "
+            "%.2f GiB, available KVCache memory: %.2f GiB",
+            GiB(peak_torch_memory), GiB(non_torch_alloc_bytes),
+            GiB(available_kv_cache_memory))
+
         return int(available_kv_cache_memory)
 
     def get_kv_cache_spec(self) -> dict[str, KVCacheSpec]:
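
The net effect of this hunk is a new peak-memory formula: the free-memory delta measured after emptying the torch cache gives the total memory still held by the forward pass, the part torch cannot account for is attributed to non-torch allocations (assumed to equal their peak), and that amount is added to torch's recorded peak before subtracting from the utilization budget. A pure-function sketch of that accounting; the helper name and example numbers are illustrative, the real code reads these values from the torch.cuda APIs shown above:

GiB = 1 << 30

def available_kv_cache_memory(init_free: int, free_after_profile: int,
                              torch_current: int, torch_peak: int,
                              total: int, gpu_memory_utilization: float) -> int:
    # Memory the forward pass is still holding, via the free-memory delta
    # taken after torch.cuda.empty_cache().
    fwd_alloc = init_free - free_after_profile
    # Whatever torch does not account for is treated as non-torch memory
    # (e.g. NCCL buffers); its current value is assumed to equal its peak.
    non_torch = max(0, fwd_alloc - torch_current)
    peak = torch_peak + non_torch
    return int(total * gpu_memory_utilization - peak)

# Example: 80 GiB GPU, 2 GiB of non-torch memory, 10 GiB torch peak.
print(available_kv_cache_memory(
    init_free=78 * GiB, free_after_profile=72 * GiB,
    torch_current=4 * GiB, torch_peak=10 * GiB,
    total=80 * GiB, gpu_memory_utilization=0.9) / GiB)  # -> 60.0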
