Commit 13dfc4f

Clean up CUDA-related code and enable installation and execution on CPU (vllm-project#13)
* init
* refine
* remove debug logging
* modify setup.py
* fix
1 parent dd60db0 commit 13dfc4f

File tree

8 files changed, +302 -288 lines changed


setup.py

Lines changed: 194 additions & 194 deletions
Large diffs are not rendered by default.

vllm/config.py

Lines changed: 3 additions & 0 deletions
@@ -48,6 +48,7 @@ class ModelConfig:
             output). If None, will be derived from the model.
         quantization: Quantization method that was used to quantize the model
             weights. If None, we assume the model weights are not quantized.
+        device: The device to be used for the model. If None, we will default to use CPU as the device.
     """

     def __init__(
@@ -64,6 +65,7 @@ def __init__(
         tokenizer_revision: Optional[str] = None,
         max_model_len: Optional[int] = None,
         quantization: Optional[str] = None,
+        device: Optional[str] = 'cpu',
     ) -> None:
         self.model = model
         self.tokenizer = tokenizer
@@ -75,6 +77,7 @@ def __init__(
         self.revision = revision
         self.tokenizer_revision = tokenizer_revision
         self.quantization = quantization
+        self.device = device

         self.hf_config = get_config(model, trust_remote_code, revision)
         self.dtype = _get_and_verify_dtype(self.hf_config, dtype)

vllm/engine/arg_utils.py

Lines changed: 8 additions & 1 deletion
@@ -31,6 +31,7 @@ class EngineArgs:
     revision: Optional[str] = None
     tokenizer_revision: Optional[str] = None
     quantization: Optional[str] = None
+    device: Optional[str] = 'cpu'

     def __post_init__(self):
         if self.tokenizer is None:
@@ -166,6 +167,12 @@ def add_cli_args(
                             choices=['awq', None],
                             default=None,
                             help='Method used to quantize the weights')
+        parser.add_argument('--device',
+                            type=str,
+                            choices=['gpu', 'cpu', None],
+                            default=None,
+                            help='Device to execute LLM model')
+
         return parser

     @classmethod
@@ -184,7 +191,7 @@ def create_engine_configs(
                                    self.download_dir, self.load_format,
                                    self.dtype, self.seed, self.revision,
                                    self.tokenizer_revision, self.max_model_len,
-                                   self.quantization)
+                                   self.quantization, self.device)
         cache_config = CacheConfig(
             self.block_size, self.gpu_memory_utilization, self.swap_space,
             getattr(model_config.hf_config, 'sliding_window', None))
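
For context, a minimal sketch of how the new flag could be exercised, assuming the EngineArgs fields shown above and the upstream vLLM entrypoints (the model id is only a placeholder):

    from vllm.engine.arg_utils import EngineArgs

    # Placeholder model id; this fork's registry only wires up Llama-family models.
    args = EngineArgs(model="meta-llama/Llama-2-7b-hf", device="cpu")
    configs = args.create_engine_configs()  # the ModelConfig in the result now carries device='cpu'

    # Rough CLI equivalent (entrypoint name taken from upstream vLLM, assumed unchanged here):
    #   python -m vllm.entrypoints.api_server --model meta-llama/Llama-2-7b-hf --device cpu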

vllm/model_executor/model_loader.py

Lines changed: 20 additions & 19 deletions
@@ -7,34 +7,34 @@
 from transformers import PretrainedConfig

 from vllm.config import ModelConfig
-from vllm.model_executor.models import *  # pylint: disable=wildcard-import
+from vllm.model_executor.models import BigDLLlamaForCausalLM  # pylint: disable=wildcard-import
 from vllm.model_executor.weight_utils import (get_quant_config,
                                               initialize_dummy_weights)

 # TODO(woosuk): Lazy-load the model classes.
 _MODEL_REGISTRY = {
-    "AquilaModel": AquilaForCausalLM,
-    "BaiChuanForCausalLM": BaiChuanForCausalLM,  # baichuan-7b
-    "BaichuanForCausalLM": BaichuanForCausalLM,  # baichuan-13b
-    "BloomForCausalLM": BloomForCausalLM,
-    "FalconForCausalLM": FalconForCausalLM,
-    "GPT2LMHeadModel": GPT2LMHeadModel,
-    "GPTBigCodeForCausalLM": GPTBigCodeForCausalLM,
-    "GPTJForCausalLM": GPTJForCausalLM,
-    "GPTNeoXForCausalLM": GPTNeoXForCausalLM,
-    "InternLMForCausalLM": InternLMForCausalLM,
+    # "AquilaModel": AquilaForCausalLM,
+    # "BaiChuanForCausalLM": BaiChuanForCausalLM,  # baichuan-7b
+    # "BaichuanForCausalLM": BaichuanForCausalLM,  # baichuan-13b
+    # "BloomForCausalLM": BloomForCausalLM,
+    # "FalconForCausalLM": FalconForCausalLM,
+    # "GPT2LMHeadModel": GPT2LMHeadModel,
+    # "GPTBigCodeForCausalLM": GPTBigCodeForCausalLM,
+    # "GPTJForCausalLM": GPTJForCausalLM,
+    # "GPTNeoXForCausalLM": GPTNeoXForCausalLM,
+    # "InternLMForCausalLM": InternLMForCausalLM,
     "LlamaForCausalLM": BigDLLlamaForCausalLM,
-    "LLaMAForCausalLM": LlamaForCausalLM,  # For decapoda-research/llama-*
-    "MistralForCausalLM": MistralForCausalLM,
-    "MPTForCausalLM": MPTForCausalLM,
-    "OPTForCausalLM": OPTForCausalLM,
-    "QWenLMHeadModel": QWenLMHeadModel,
-    "RWForCausalLM": FalconForCausalLM,
+    # "LLaMAForCausalLM": LlamaForCausalLM,  # For decapoda-research/llama-*
+    # "MistralForCausalLM": MistralForCausalLM,
+    # "MPTForCausalLM": MPTForCausalLM,
+    # "OPTForCausalLM": OPTForCausalLM,
+    # "QWenLMHeadModel": QWenLMHeadModel,
+    # "RWForCausalLM": FalconForCausalLM,
 }

 # FIXME(woosuk): Remove this once all models support quantization.
 _MODEL_CLASSES_SUPPORT_QUANTIZATION = [
-    LlamaForCausalLM,
+    # LlamaForCausalLM,
 ]


@@ -100,5 +100,6 @@ def get_model(model_config: ModelConfig) -> nn.Module:
     # Load the weights from the cached or downloaded files.
     model.load_weights(model_config.model, model_config.download_dir,
                        model_config.load_format, model_config.revision)
-    model = model.cuda()
+    if model_config.device != 'cpu':
+        model = model.cuda()
     return model.eval()
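
The device gate around model.cuda() above is a standard PyTorch pattern; a standalone sketch of the same idea (illustrative only, not code from this commit):

    import torch
    from torch import nn

    def place_model(model: nn.Module, device: str = "cpu") -> nn.Module:
        # Move to GPU only when a non-CPU device is requested and CUDA is available;
        # otherwise leave the module on CPU, matching get_model() above.
        if device != "cpu" and torch.cuda.is_available():
            model = model.cuda()
        return model.eval()

    # Example: place_model(nn.Linear(4, 4), device="cpu") keeps the layer on CPU.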
vllm/model_executor/models/__init__.py

Lines changed: 30 additions & 30 deletions
@@ -1,35 +1,35 @@
-from vllm.model_executor.models.aquila import AquilaForCausalLM
-from vllm.model_executor.models.baichuan import (BaiChuanForCausalLM,
-                                                 BaichuanForCausalLM)
-from vllm.model_executor.models.bloom import BloomForCausalLM
-from vllm.model_executor.models.falcon import FalconForCausalLM
-from vllm.model_executor.models.gpt2 import GPT2LMHeadModel
-from vllm.model_executor.models.gpt_bigcode import GPTBigCodeForCausalLM
-from vllm.model_executor.models.gpt_j import GPTJForCausalLM
-from vllm.model_executor.models.gpt_neox import GPTNeoXForCausalLM
-from vllm.model_executor.models.internlm import InternLMForCausalLM
-from vllm.model_executor.models.llama import LlamaForCausalLM
+# from vllm.model_executor.models.aquila import AquilaForCausalLM
+# from vllm.model_executor.models.baichuan import (BaiChuanForCausalLM,
+#                                                  BaichuanForCausalLM)
+# from vllm.model_executor.models.bloom import BloomForCausalLM
+# from vllm.model_executor.models.falcon import FalconForCausalLM
+# from vllm.model_executor.models.gpt2 import GPT2LMHeadModel
+# from vllm.model_executor.models.gpt_bigcode import GPTBigCodeForCausalLM
+# from vllm.model_executor.models.gpt_j import GPTJForCausalLM
+# from vllm.model_executor.models.gpt_neox import GPTNeoXForCausalLM
+# from vllm.model_executor.models.internlm import InternLMForCausalLM
+# from vllm.model_executor.models.llama import LlamaForCausalLM
 from vllm.model_executor.models.bigdl_llama import BigDLLlamaForCausalLM
-from vllm.model_executor.models.mpt import MPTForCausalLM
-from vllm.model_executor.models.opt import OPTForCausalLM
-from vllm.model_executor.models.qwen import QWenLMHeadModel
-from vllm.model_executor.models.mistral import MistralForCausalLM
+# from vllm.model_executor.models.mpt import MPTForCausalLM
+# from vllm.model_executor.models.opt import OPTForCausalLM
+# from vllm.model_executor.models.qwen import QWenLMHeadModel
+# from vllm.model_executor.models.mistral import MistralForCausalLM

 __all__ = [
-    "AquilaForCausalLM",
-    "BaiChuanForCausalLM",
-    "BaichuanForCausalLM",
-    "BloomForCausalLM",
-    "FalconForCausalLM",
-    "GPT2LMHeadModel",
-    "GPTBigCodeForCausalLM",
-    "GPTJForCausalLM",
-    "GPTNeoXForCausalLM",
-    "InternLMForCausalLM",
-    "LlamaForCausalLM",
+    # "AquilaForCausalLM",
+    # "BaiChuanForCausalLM",
+    # "BaichuanForCausalLM",
+    # "BloomForCausalLM",
+    # "FalconForCausalLM",
+    # "GPT2LMHeadModel",
+    # "GPTBigCodeForCausalLM",
+    # "GPTJForCausalLM",
+    # "GPTNeoXForCausalLM",
+    # "InternLMForCausalLM",
+    # "LlamaForCausalLM",
     "BigDLLlamaForCausalLM",
-    "MPTForCausalLM",
-    "OPTForCausalLM",
-    "QWenLMHeadModel",
-    "MistralForCausalLM",
+    # "MPTForCausalLM",
+    # "OPTForCausalLM",
+    # "QWenLMHeadModel",
+    # "MistralForCausalLM",
 ]
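
After this change the package's only public export is the BigDL-backed Llama class; a quick import check (illustrative):

    from vllm.model_executor.models import BigDLLlamaForCausalLM  # the sole remaining export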

vllm/model_executor/models/bigdl_llama.py

Lines changed: 16 additions & 14 deletions
@@ -1,7 +1,7 @@
 import torch
 from torch import nn

-from transformers import AutoTokenizer, AutoModelForCausalLM, PreTrainedTokenizerBase, LlamaConfig
+from transformers import AutoTokenizer, PreTrainedTokenizerBase, LlamaConfig
 from typing import Optional, Tuple, List, Type, Dict

 from vllm.transformers_utils.tokenizer import (detokenize_incrementally,
@@ -50,11 +50,15 @@ def __init__(
         super().__init__()
         # pdb.set_trace()
         self.config = config
+        if True:
+            from bigdl.llm.transformers import AutoModelForCausalLM
+        else:
+            from transformers import AutoModelForCausalLM
         self.model = AutoModelForCausalLM.from_pretrained(config._name_or_path)
         self.tokenizer = AutoTokenizer.from_pretrained(config._name_or_path)
         self.device = torch.device(
             "cuda" if torch.cuda.is_available() else "cpu")
-        self.dtype = self.model.config.torch_dtype
+        self.dtype = self.model.dtype
         # self.tmp_kv_cache = [[0]]

     def decode(self, generated_ids: List[int]) -> str:
@@ -77,7 +81,7 @@ def forward(
         seq_len = len(seq_group_meta_data_lists)

         bigdl_input_ids = []
-        bigdl_position_ids = []
+        # bigdl_position_ids = []
         cur_seq_ids = []
         bigdl_sampling_params = {}
         max_context_len = 0
@@ -94,16 +98,15 @@
             context_len = seq_data.get_len()
             if seq_group_meta_data.is_prompt:
                 bigdl_input_ids.append(cur_seq_input_ids)
-                bigdl_position_ids.append(list(range(context_len)))
+                # bigdl_position_ids.append(list(range(context_len)))
                 max_context_len = max(max_context_len, context_len)
             else:
                 bigdl_input_ids.append([cur_seq_input_ids[-1]])
-                bigdl_position_ids.append([context_len - 1])
+                # bigdl_position_ids.append([context_len - 1])

             bigdl_sampling_params[seq_id] = seq_group_meta_data.sampling_params

             context_len = seq_data.get_len()
-            bigdl_position_ids.append(range(context_len))

         if all_decoding:
             # pdb.set_trace()
@@ -125,15 +128,14 @@
                 _pad_to_max(input_ids, max_context_len)
                 for input_ids in bigdl_input_ids
             ]
-            bigdl_position_ids = [
-                _pad_to_max(position_ids, max_context_len)
-                for position_ids in bigdl_position_ids
-            ]
+            # bigdl_position_ids = [
+            #     _pad_to_max(position_ids, max_context_len)
+            #     for position_ids in bigdl_position_ids
+            # ]

         bigdl_input_ids = torch.tensor(bigdl_input_ids, device=self.device)
-        bigdl_position_ids = torch.tensor(bigdl_position_ids,
-                                          device=self.device)
-
+        # bigdl_position_ids = torch.tensor(bigdl_position_ids,
+        #                                   device=self.device, dtype=self.dtype)
         if all_decoding:
             kwargs = {
                 "input_ids": bigdl_input_ids,
@@ -165,7 +167,7 @@
             last_token_logits = logits_processor(
                 None, outputs.logits[index:index + 1, -1, :])[0]
             probs = torch.softmax(last_token_logits, dim=-1)
-            indices = torch.multinomial(probs, num_samples=2)
+            indices = torch.multinomial(probs, num_samples=cur_sampling_params.best_of)
             tokens = [int(token) for token in indices.tolist()]

             logprobs = math.log(probs[tokens[0]])
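
As a standalone illustration of the sampling step changed in the last hunk (the logits and best_of value below are made up):

    import math
    import torch

    last_token_logits = torch.randn(8)               # toy logits over an 8-token vocabulary
    probs = torch.softmax(last_token_logits, dim=-1)

    best_of = 2                                      # stands in for cur_sampling_params.best_of
    indices = torch.multinomial(probs, num_samples=best_of)
    tokens = [int(token) for token in indices.tolist()]

    logprob = math.log(probs[tokens[0]])             # log-probability of the first sampled token
    print(tokens, logprob)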

vllm/utils.py

Lines changed: 8 additions & 8 deletions
@@ -5,7 +5,7 @@
 import psutil
 import torch

-from vllm import cuda_utils
+# from vllm import cuda_utils


 class Device(enum.Enum):
@@ -27,13 +27,13 @@ def reset(self) -> None:
         self.counter = 0


-def get_max_shared_memory_bytes(gpu: int = 0) -> int:
-    """Returns the maximum shared memory per thread block in bytes."""
-    # https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__TYPES.html
-    cudaDevAttrMaxSharedMemoryPerBlockOptin = 97  # pylint: disable=invalid-name
-    max_shared_mem = cuda_utils.get_device_attribute(
-        cudaDevAttrMaxSharedMemoryPerBlockOptin, gpu)
-    return int(max_shared_mem)
+# def get_max_shared_memory_bytes(gpu: int = 0) -> int:
+#     """Returns the maximum shared memory per thread block in bytes."""
+#     # https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__TYPES.html
+#     cudaDevAttrMaxSharedMemoryPerBlockOptin = 97  # pylint: disable=invalid-name
+#     max_shared_mem = cuda_utils.get_device_attribute(
+#         cudaDevAttrMaxSharedMemoryPerBlockOptin, gpu)
+#     return int(max_shared_mem)


 def get_gpu_memory(gpu: int = 0) -> int:
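
With the CUDA-only helper commented out, any remaining device query has to degrade gracefully on CPU-only builds; a minimal sketch of that guard (illustrative, not part of this commit):

    import psutil
    import torch

    def total_device_memory_bytes(gpu: int = 0) -> int:
        # Total GPU memory when CUDA is present, otherwise total system RAM.
        if torch.cuda.is_available():
            return torch.cuda.get_device_properties(gpu).total_memory
        return psutil.virtual_memory().total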

vllm/worker/worker.py

Lines changed: 23 additions & 22 deletions
@@ -12,8 +12,8 @@
     initialize_model_parallel)
 from vllm.sampling_params import SamplingParams
 from vllm.sequence import SamplerOutput, SequenceData, SequenceGroupMetadata
-from vllm.worker.cache_engine import CacheEngine
-from vllm.utils import get_gpu_memory, get_max_shared_memory_bytes
+# from vllm.worker.cache_engine import CacheEngine
+# from vllm.utils import get_gpu_memory, get_max_shared_memory_bytes

 import pdb

@@ -63,22 +63,23 @@ def clean_finished_seqs(self, finished_seqs: List[int]):
             del self.kv_cache[seq_id]

     def init_model(self):
-        # This env var set by Ray causes exceptions with graph building.
-        os.environ.pop("NCCL_ASYNC_ERROR_HANDLING", None)
-        # Env vars will be set by Ray.
-        self.rank = self.rank if self.rank is not None else int(
-            os.getenv("RANK", "-1"))
-        local_rank = int(os.getenv("LOCAL_RANK", "0"))
-        self.device = torch.device(f"cuda:{local_rank}")
-        if self.rank < 0:
-            raise ValueError("Invalid or unspecified rank.")
-        torch.cuda.set_device(self.device)
-
-        _check_if_gpu_supports_dtype(self.model_config.dtype)
-
-        # Initialize the distributed environment.
-        _init_distributed_environment(self.parallel_config, self.rank,
-                                      self.distributed_init_method)
+        if self.model_config.device != 'cpu':
+            # This env var set by Ray causes exceptions with graph building.
+            os.environ.pop("NCCL_ASYNC_ERROR_HANDLING", None)
+            # Env vars will be set by Ray.
+            self.rank = self.rank if self.rank is not None else int(
+                os.getenv("RANK", "-1"))
+            local_rank = int(os.getenv("LOCAL_RANK", "0"))
+            self.device = torch.device(f"cuda:{local_rank}")
+            if self.rank < 0:
+                raise ValueError("Invalid or unspecified rank.")
+            torch.cuda.set_device(self.device)
+
+            _check_if_gpu_supports_dtype(self.model_config.dtype)
+
+            # Initialize the distributed environment.
+            _init_distributed_environment(self.parallel_config, self.rank,
+                                          self.distributed_init_method)

         # Initialize the model.
         set_random_seed(self.model_config.seed)
@@ -136,8 +137,8 @@ def profile_num_available_blocks(
         torch.cuda.synchronize()
         peak_memory = torch.cuda.max_memory_allocated()
         total_gpu_memory = get_gpu_memory()
-        cache_block_size = CacheEngine.get_cache_block_size(
-            block_size, self.model_config, self.parallel_config)
+        # cache_block_size = CacheEngine.get_cache_block_size(
+        #     block_size, self.model_config, self.parallel_config)
         num_gpu_blocks = int(
             (total_gpu_memory * gpu_memory_utilization - peak_memory) //
             cache_block_size)
@@ -163,8 +164,8 @@ def init_cache_engine(self, cache_config: CacheConfig) -> None:
                                               self.sliding_window)
         _check_if_can_support_max_seq_len(max_seq_len, self.block_size)

-        self.cache_engine = CacheEngine(self.cache_config, self.model_config,
-                                        self.parallel_config)
+        # self.cache_engine = CacheEngine(self.cache_config, self.model_config,
+        #                                 self.parallel_config)
         self.cache_events = self.cache_engine.events
         self.gpu_cache = self.cache_engine.gpu_cache
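
The init_model() gating above boils down to choosing a device up front; a condensed sketch of that logic (illustrative helper, not part of the commit):

    import os
    import torch

    def pick_device(preferred: str = "cpu") -> torch.device:
        # Mirror of the branch in init_model(): bind to the local CUDA device only
        # when a non-CPU device is requested and CUDA is actually available.
        if preferred != "cpu" and torch.cuda.is_available():
            local_rank = int(os.getenv("LOCAL_RANK", "0"))
            return torch.device(f"cuda:{local_rank}")
        return torch.device("cpu")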
