[Core] Log more GPU memory reservation info #4576

Open · wants to merge 4 commits into main
Changes from 3 commits
18 changes: 15 additions & 3 deletions vllm/utils.py
@@ -447,13 +447,25 @@ def is_pin_memory_available() -> bool:

 class CudaMemoryProfiler:

-    def __init__(self, device=None):
+    def __init__(self, device=None, capture_max_memory: bool = False):
+        """A context manager to measure memory usage on a given device.
+
+        If capture_max_memory is True, it measures the maximum memory usage
+        during the profiling. However, it can only measure GPU memory used by
+        torch tensor. If it is False, it measures the memory delta which also
+        includes non-torch tensor GPU memory usage.
+        """
         self.device = device
+        self.capture_max_memory = capture_max_memory

     def current_memory_usage(self) -> float:
         # Return the memory usage in bytes.
-        torch.cuda.reset_peak_memory_stats(self.device)
-        mem = torch.cuda.max_memory_allocated(self.device)
+        if self.capture_max_memory:
+            torch.cuda.reset_peak_memory_stats(self.device)
Collaborator Author:

This only measures the GPU memory used by "tensors", so it is inaccurate for CUDA graphs (which use memory outside tensors). I kept the original way because I thought it could be useful, but I am open to just removing the code.

Collaborator:

IIUC, the change made in this PR is incorrect because torch.cuda.mem_get_info does not consider the "free" GPU memory managed by the PyTorch caching allocator.

Once GPU memory is allocated (via cuda-malloc) by the PyTorch caching allocator, it is never cuda-freed unless the user forces it (e.g., with empty_cache). While this memory can be regarded as free by the PyTorch allocator because it is not used for any tensor, it is not regarded as free from the external point of view (e.g., by nvidia-smi or torch.cuda.mem_get_info). Therefore, the profiler cannot capture the GPU memory usage inside the PyTorch allocator and may thus under-estimate the memory usage.
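To illustrate the caching-allocator behavior described above, here is a minimal standalone sketch (not part of this PR): memory returned to the PyTorch allocator still looks "used" to torch.cuda.mem_get_info until torch.cuda.empty_cache() is called.

```python
import torch

device = torch.device("cuda:0")
free_before, _ = torch.cuda.mem_get_info(device)  # also initializes the CUDA context

x = torch.empty(256 * 1024 * 1024, dtype=torch.uint8, device=device)  # 256 MiB tensor
del x  # no live tensor remains, but the allocator caches the block

print(torch.cuda.memory_allocated(device))   # ~0 bytes backing live tensors
free_after, _ = torch.cuda.mem_get_info(device)
print(free_before - free_after)              # ~256 MiB still held by the allocator

torch.cuda.empty_cache()                     # hand cached blocks back to the driver
```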

Collaborator Author (@rkooo567), May 7, 2024:

Ah, interesting... so it is like cached memory in a filesystem.

One issue was that the original method couldn't take into account memory used by non-torch allocations (so it inaccurately reported CUDA graph memory usage). Let me see if I can find a way to take the cached buffer into account.

Collaborator Author (@rkooo567), May 8, 2024:

@WoosukKwon

Currently thinking about two different options:

  • Option 1: Keep the current memory context manager as it is. For the CUDA graph, measure the memory using a snapshot (via mem_get_info). For the KV cache and the model, it should be fine to keep measuring memory with the status quo (i.e., memory_allocated, because GPU memory is used just for tensors in these cases).
  • Option 2: Modify the memory context manager to use

        free_mem_from_torch = torch.cuda.memory_reserved(self.device) - torch.cuda.memory_allocated(self.device)  # reserved by the allocator but not backing tensors
        free, total = torch.cuda.mem_get_info(self.device)
        mem_usage = total - (free + free_mem_from_torch)

    This should count releasable memory held by the torch caching allocator as free memory (based on https://pytorch.org/docs/stable/notes/cuda.html#memory-management); see the sketch after this comment.

Any thoughts?
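For concreteness, a hedged sketch of what the Option 2 measurement could look like (the standalone function and its name are illustrative, not code from this PR):

```python
import torch


def current_memory_usage(device: torch.device) -> int:
    """Option 2 sketch: GPU memory in use, treating blocks cached by the
    PyTorch allocator (reserved but not backing any tensor) as free."""
    free_mem_from_torch = (torch.cuda.memory_reserved(device)
                           - torch.cuda.memory_allocated(device))
    free, total = torch.cuda.mem_get_info(device)
    return total - (free + free_mem_from_torch)
```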

Collaborator:

> For cuda graph, we measure the memory using snapshot (using mem_get_info).

What exactly do you mean by this?

Collaborator Author:

So keep the other parts as they are, and for the CUDA graph capture part, measure memory with:

    free, total = torch.cuda.mem_get_info(self.device)
    used_before = total - free
    cuda_capture()  # placeholder for the CUDA graph capture step
    free, total = torch.cuda.mem_get_info(self.device)
    used_after = total - free
    memory_used = used_after - used_before

+            mem = torch.cuda.max_memory_allocated(self.device)
+        else:
+            free, total = torch.cuda.mem_get_info(self.device)
+            mem = total - free
         return mem

     def __enter__(self):
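For context, a minimal usage sketch of the profiler as modified above. It assumes, as the worker.py diff below does, that the context manager exposes `consumed_memory`; the workload function is a stand-in.

```python
import torch
from vllm.utils import CudaMemoryProfiler


def some_gpu_work():
    # Stand-in workload so there is something to measure.
    return torch.zeros(1 << 24, device="cuda")  # ~64 MiB of float32


# Default mode: memory delta from torch.cuda.mem_get_info, which also catches
# non-torch allocations such as CUDA graph pools.
with CudaMemoryProfiler() as m:
    x = some_gpu_work()
print(f"reserved {m.consumed_memory / 2**30:.4f} GB")

# capture_max_memory=True: peak memory of torch tensors only.
with CudaMemoryProfiler(capture_max_memory=True) as m:
    y = some_gpu_work()
print(f"peak tensor usage {m.consumed_memory / 2**30:.4f} GB")
```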
6 changes: 4 additions & 2 deletions vllm/worker/model_runner.py
@@ -31,10 +31,12 @@
 _PAD_SLOT_ID = -1
 LORA_WARMUP_RANK = 8
 _BATCH_SIZE_ALIGNMENT = 8
+MAX_BATCH_SIZE_TO_CAPTURE = 256
 # Capture graphs for token size 1, 2, 4, 8, 16, 24, 32, 40, ..., 256.
 # NOTE: _get_graph_batch_size needs to be updated if this list is changed.
 _BATCH_SIZES_TO_CAPTURE = [1, 2, 4] + [
-    _BATCH_SIZE_ALIGNMENT * i for i in range(1, 33)
+    _BATCH_SIZE_ALIGNMENT * i
+    for i in range(1, MAX_BATCH_SIZE_TO_CAPTURE // _BATCH_SIZE_ALIGNMENT + 1)
 ]
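As a quick sanity check (illustrative script, not part of the diff), the new comprehension reproduces the previously hard-coded range(1, 33) list when MAX_BATCH_SIZE_TO_CAPTURE is 256:

```python
_BATCH_SIZE_ALIGNMENT = 8
MAX_BATCH_SIZE_TO_CAPTURE = 256

old = [1, 2, 4] + [_BATCH_SIZE_ALIGNMENT * i for i in range(1, 33)]
new = [1, 2, 4] + [
    _BATCH_SIZE_ALIGNMENT * i
    for i in range(1, MAX_BATCH_SIZE_TO_CAPTURE // _BATCH_SIZE_ALIGNMENT + 1)
]
assert old == new                          # [1, 2, 4, 8, 16, 24, ..., 256]
assert new[-1] == MAX_BATCH_SIZE_TO_CAPTURE
```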


@@ -156,7 +158,7 @@ def __init__(
         # (max batch size to capture, max context len to capture / block size).
         self.graph_block_tables: torch.Tensor  # Set after initial profiling.

-    def load_model(self) -> None:
+    def load_model(self):
         with CudaMemoryProfiler() as m:
             self.model = get_model(
                 model_config=self.model_config,
31 changes: 25 additions & 6 deletions vllm/worker/worker.py
@@ -15,13 +15,17 @@
 from vllm.distributed.device_communicators import pynccl_utils
 from vllm.distributed.device_communicators.custom_all_reduce import (
     init_custom_ar)
+from vllm.logger import init_logger
 from vllm.lora.request import LoRARequest
 from vllm.model_executor import set_random_seed
 from vllm.sequence import SamplerOutput, SequenceGroupMetadata
+from vllm.utils import CudaMemoryProfiler
 from vllm.worker.cache_engine import CacheEngine
 from vllm.worker.model_runner import ModelRunner
 from vllm.worker.worker_base import WorkerBase

+logger = init_logger(__name__)
+

 class Worker(WorkerBase):
     """A worker class that executes (a partition of) the model on a GPU.
@@ -180,14 +184,26 @@ def initialize_cache(self, num_gpu_blocks: int,

     def _init_cache_engine(self):
         assert self.cache_config.num_gpu_blocks is not None
-        self.cache_engine = CacheEngine(self.cache_config, self.model_config,
-                                        self.parallel_config)
+        with CudaMemoryProfiler() as m:
+            self.cache_engine = CacheEngine(self.cache_config,
+                                            self.model_config,
+                                            self.parallel_config)
+        mem_usage = m.consumed_memory
+        unit, scale = "GB", float(2**30)
+        logger.info("GPU KV cache reserves %.4f %s GPU memory.",
+                    mem_usage / scale, unit)
         self.gpu_cache = self.cache_engine.gpu_cache
         self.model_runner.set_block_size(self.cache_engine.block_size)

     def _warm_up_model(self) -> None:
         if not self.model_config.enforce_eager:
-            self.model_runner.capture_model(self.gpu_cache)
+            with CudaMemoryProfiler() as m:
+                self.model_runner.capture_model(self.gpu_cache)
+            mem_usage = m.consumed_memory
+            unit, scale = "GB", float(2**30)
+            logger.info("Capturing cuda graph reserves %.4f %s GPU memory.",
Collaborator:

Should we use debug instead?

Collaborator Author (@rkooo567), May 7, 2024:

I feel like it would be useful to show the exact memory allocation by default, since I've seen some users not understand this part (like here: https://www.reddit.com/r/LocalLLaMA/comments/1bz3bn1/whats_up_with_vllm/).

+                        mem_usage / scale, unit)

         # Reset the seed to ensure that the random state is not affected by
         # the model initialization and profiling.
         set_random_seed(self.model_config.seed)

@@ -332,9 +348,12 @@ def _check_if_gpu_supports_dtype(torch_dtype: torch.dtype):
 def raise_if_cache_size_invalid(num_gpu_blocks, block_size,
                                 max_model_len) -> None:
     if num_gpu_blocks <= 0:
-        raise ValueError("No available memory for the cache blocks. "
-                         "Try increasing `gpu_memory_utilization` when "
-                         "initializing the engine.")
+        raise ValueError(
+            "No available memory for the cache blocks. vLLM needs {} more GPU "
+            "blocks to allocate. Try increasing `gpu_memory_utilization` when "
+            "initializing the engine. Or increase `tensor_parallel_size`, which "
+            "shards model weights across GPUs. It gives more memory to "
+            "allocate kv cache blocks per GPU.".format(-num_gpu_blocks))
Comment on lines +351 to +356

Collaborator:

IIUC, the negative num_gpu_blocks does not give any useful information; it just means the memory profiling was inaccurate for some reason.

     max_seq_len = block_size * num_gpu_blocks
     if max_model_len > max_seq_len:
         raise ValueError(