@@ -130,7 +130,20 @@ def init_device(self):
             _check_if_gpu_supports_dtype(self.model_config.dtype)
             gc.collect()
             torch.cuda.empty_cache()
-            self.init_gpu_memory = torch.cuda.mem_get_info()[0]
+            self.init_gpu_memory, total_gpu_memory = torch.cuda.mem_get_info()
+            requested_memory = (total_gpu_memory *
+                                self.cache_config.gpu_memory_utilization)
+            if self.init_gpu_memory < requested_memory:
+                GiB = lambda b: round(b / GiB_bytes, 2)
+                raise ValueError(
+                    f"Free memory on device ({GiB(self.init_gpu_memory)}/"
+                    f"{GiB(total_gpu_memory)} GiB) on startup is less than "
+                    f"desired GPU memory utilization "
+                    f"({self.cache_config.gpu_memory_utilization}, "
+                    f"{GiB(requested_memory)} GiB). Decrease GPU memory "
+                    f"utilization or reduce GPU memory used by other processes."
+                )
+
         else:
             raise RuntimeError(
                 f"Not support device type: {self.device_config.device}")
@@ -190,28 +203,47 @@ def determine_available_memory(self) -> int:
         # GPU did not change their memory usage during the profiling.
         assert self.init_gpu_memory > free_gpu_memory, (
             "Error in memory profiling. "
-            f"Initial free memory {self.init_gpu_memory}, current free memory"
-            f" {free_gpu_memory}. This happens when the GPU memory was "
-            "not properly cleaned up before initializing the vLLM instance.")
+            f"Initial free memory {self.init_gpu_memory / GiB_bytes} GiB, "
+            f"current free memory {free_gpu_memory / GiB_bytes} GiB. "
+            f"This happens when the GPU memory was not properly cleaned up "
+            f"before initializing the vLLM instance.")
 
         # Get the peak memory allocation recorded by torch
-        peak_memory = torch.cuda.memory_stats()["allocated_bytes.all.peak"]
+        peak_torch_memory = torch.cuda.memory_stats(
+        )["allocated_bytes.all.peak"]
 
         # Check for any memory left around that may have been allocated on the
         # gpu outside of `torch`. NCCL operations, for example, can use a few
-        # GB during a forward pass
+        # GB during a forward pass.
         torch.cuda.empty_cache()
         torch_allocated_bytes = torch.cuda.memory_stats(
         )["allocated_bytes.all.current"]
-        total_allocated_bytes = torch.cuda.mem_get_info(
-        )[1] - torch.cuda.mem_get_info()[0]
-        non_torch_allocations = total_allocated_bytes - torch_allocated_bytes
-        if non_torch_allocations > 0:
-            peak_memory += non_torch_allocations
+
+        # Reset after emptying torch cache
+        free_gpu_memory = torch.cuda.mem_get_info()[0]
+
+        # Total forward allocation (current) is equal to the diff in free memory
+        fwd_alloc_bytes = self.init_gpu_memory - free_gpu_memory
+        # We assume current non-torch allocation is equal to peak
+        non_torch_alloc_bytes = max(0, fwd_alloc_bytes - torch_allocated_bytes)
+        # Total forward allocation (peak) is peak torch + non-torch
+        peak_memory = peak_torch_memory + non_torch_alloc_bytes
+
         available_kv_cache_memory = (
             total_gpu_memory * self.cache_config.gpu_memory_utilization -
             peak_memory)
 
+        GiB = lambda b: b / GiB_bytes
+        logger.debug(
+            "Initial free memory: %.2f GiB, free memory: %.2f GiB, "
+            "total GPU memory: %.2f GiB", GiB(self.init_gpu_memory),
+            GiB(free_gpu_memory), GiB(total_gpu_memory))
+        logger.debug(
+            "Peak torch memory: %.2f GiB, non-torch forward-pass memory: "
+            "%.2f GiB, available KVCache memory: %.2f GiB",
+            GiB(peak_torch_memory), GiB(non_torch_alloc_bytes),
+            GiB(available_kv_cache_memory))
+
         return int(available_kv_cache_memory)
 
     def get_kv_cache_spec(self) -> dict[str, KVCacheSpec]:
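
The revised accounting treats the drop in free device memory since startup as the total forward-pass footprint, attributes whatever torch does not report to non-torch consumers (for example NCCL buffers), and adds that on top of torch's recorded allocation peak. A minimal sketch of the same arithmetic, assuming hypothetical init_gpu_memory and gpu_memory_utilization values that the worker would normally capture at startup and read from its config:

import torch

GiB_bytes = 1 << 30
gpu_memory_utilization = 0.9        # hypothetical config value
init_gpu_memory = 80 * GiB_bytes    # hypothetical free bytes captured at startup

# Peak bytes torch itself has allocated during profiling
peak_torch_memory = torch.cuda.memory_stats()["allocated_bytes.all.peak"]

# Drop cached blocks so mem_get_info() reflects live allocations only
torch.cuda.empty_cache()
torch_allocated_bytes = torch.cuda.memory_stats()[
    "allocated_bytes.all.current"]
free_gpu_memory, total_gpu_memory = torch.cuda.mem_get_info()

# Everything the forward pass consumed is the drop in free memory;
# the share torch cannot see is attributed to non-torch allocations.
fwd_alloc_bytes = init_gpu_memory - free_gpu_memory
non_torch_alloc_bytes = max(0, fwd_alloc_bytes - torch_allocated_bytes)

peak_memory = peak_torch_memory + non_torch_alloc_bytes
available_kv_cache_memory = (
    total_gpu_memory * gpu_memory_utilization - peak_memory)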