We read every piece of feedback, and take your input very seriously.
To see all available qualifiers, see our documentation.
1 parent 29740c5 commit ff21cf6 (Copy full SHA for ff21cf6)
vllm/v1/worker/tpu_worker.py
@@ -161,7 +161,13 @@ def determine_available_memory(self) -> int:
161
# intermediate activations.
162
m = xm.get_memory_info(self.device)
163
total_memory_size = m["bytes_limit"]
164
- profiled = m["peak_bytes_used"] # Weights + intermediate activations.
+ current_mem = m["bytes_used"]
165
+ # Ideally we would use profiled = m["peak_bytes_used"] to
166
+ # get weights + activations. But there is memory used during
167
+ # compilation / weight loading that impacts the peak and
168
+ # there is no way to reset peak memory in XLA, so we
169
+ # use the heuristic of current memory usage (weights) plus a 2% margin.
170
+ profiled = current_mem * 1.02
171
172
# Calculate the TPU KV cache size based on profiling.
173
usable_memory_size = int(total_memory_size *
0 commit comments