Commit 13dfc4f

Clean up CUDA-related code and enable installation and execution on CPU (vllm-project#13)
* init
* refine
* remove debug logging
* modify setup.py
* fix
1 parent dd60db0 commit 13dfc4f

File tree

8 files changed, +302 -288 lines changed


setup.py

Lines changed: 194 additions & 194 deletions
Large diffs are not rendered by default.

vllm/config.py

Lines changed: 3 additions & 0 deletions
@@ -48,6 +48,7 @@ class ModelConfig:
             output). If None, will be derived from the model.
         quantization: Quantization method that was used to quantize the model
             weights. If None, we assume the model weights are not quantized.
+        device: The device to be used for the model. If None, we will default to use CPU as the device.
     """

     def __init__(
@@ -64,6 +65,7 @@ def __init__(
         tokenizer_revision: Optional[str] = None,
         max_model_len: Optional[int] = None,
         quantization: Optional[str] = None,
+        device: Optional[str] = 'cpu',
     ) -> None:
         self.model = model
         self.tokenizer = tokenizer
@@ -75,6 +77,7 @@ def __init__(
         self.revision = revision
         self.tokenizer_revision = tokenizer_revision
         self.quantization = quantization
+        self.device = device

         self.hf_config = get_config(model, trust_remote_code, revision)
         self.dtype = _get_and_verify_dtype(self.hf_config, dtype)

vllm/engine/arg_utils.py

Lines changed: 8 additions & 1 deletion
@@ -31,6 +31,7 @@ class EngineArgs:
     revision: Optional[str] = None
     tokenizer_revision: Optional[str] = None
     quantization: Optional[str] = None
+    device: Optional[str] = 'cpu'

     def __post_init__(self):
         if self.tokenizer is None:
@@ -166,6 +167,12 @@ def add_cli_args(
                             choices=['awq', None],
                             default=None,
                             help='Method used to quantize the weights')
+        parser.add_argument('--device',
+                            type=str,
+                            choices=['gpu', 'cpu', None],
+                            default=None,
+                            help='Device to execute LLM model')
+
         return parser

     @classmethod
@@ -184,7 +191,7 @@ def create_engine_configs(
                                    self.download_dir, self.load_format,
                                    self.dtype, self.seed, self.revision,
                                    self.tokenizer_revision, self.max_model_len,
-                                   self.quantization)
+                                   self.quantization, self.device)
         cache_config = CacheConfig(
             self.block_size, self.gpu_memory_utilization, self.swap_space,
             getattr(model_config.hf_config, 'sliding_window', None))
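
For context, a minimal sketch of how the new flag could be exercised, assuming the EngineArgs fields shown above and the upstream vLLM entrypoints (the model id is only a placeholder):

    from vllm.engine.arg_utils import EngineArgs

    # Placeholder model id; this fork's registry only wires up Llama-family models.
    args = EngineArgs(model="meta-llama/Llama-2-7b-hf", device="cpu")
    configs = args.create_engine_configs()  # the ModelConfig in the result now carries device='cpu'

    # Rough CLI equivalent (entrypoint name taken from upstream vLLM, assumed unchanged here):
    #   python -m vllm.entrypoints.api_server --model meta-llama/Llama-2-7b-hf --device cpu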

vllm/model_executor/model_loader.py

Lines changed: 20 additions & 19 deletions
@@ -7,34 +7,34 @@
 from transformers import PretrainedConfig

 from vllm.config import ModelConfig
-from vllm.model_executor.models import *  # pylint: disable=wildcard-import
+from vllm.model_executor.models import BigDLLlamaForCausalLM  # pylint: disable=wildcard-import
 from vllm.model_executor.weight_utils import (get_quant_config,
                                               initialize_dummy_weights)

 # TODO(woosuk): Lazy-load the model classes.
 _MODEL_REGISTRY = {
-    "AquilaModel": AquilaForCausalLM,
-    "BaiChuanForCausalLM": BaiChuanForCausalLM,  # baichuan-7b
-    "BaichuanForCausalLM": BaichuanForCausalLM,  # baichuan-13b
-    "BloomForCausalLM": BloomForCausalLM,
-    "FalconForCausalLM": FalconForCausalLM,
-    "GPT2LMHeadModel": GPT2LMHeadModel,
-    "GPTBigCodeForCausalLM": GPTBigCodeForCausalLM,
-    "GPTJForCausalLM": GPTJForCausalLM,
-    "GPTNeoXForCausalLM": GPTNeoXForCausalLM,
-    "InternLMForCausalLM": InternLMForCausalLM,
+    # "AquilaModel": AquilaForCausalLM,
+    # "BaiChuanForCausalLM": BaiChuanForCausalLM,  # baichuan-7b
+    # "BaichuanForCausalLM": BaichuanForCausalLM,  # baichuan-13b
+    # "BloomForCausalLM": BloomForCausalLM,
+    # "FalconForCausalLM": FalconForCausalLM,
+    # "GPT2LMHeadModel": GPT2LMHeadModel,
+    # "GPTBigCodeForCausalLM": GPTBigCodeForCausalLM,
+    # "GPTJForCausalLM": GPTJForCausalLM,
+    # "GPTNeoXForCausalLM": GPTNeoXForCausalLM,
+    # "InternLMForCausalLM": InternLMForCausalLM,
     "LlamaForCausalLM": BigDLLlamaForCausalLM,
-    "LLaMAForCausalLM": LlamaForCausalLM,  # For decapoda-research/llama-*
-    "MistralForCausalLM": MistralForCausalLM,
-    "MPTForCausalLM": MPTForCausalLM,
-    "OPTForCausalLM": OPTForCausalLM,
-    "QWenLMHeadModel": QWenLMHeadModel,
-    "RWForCausalLM": FalconForCausalLM,
+    # "LLaMAForCausalLM": LlamaForCausalLM,  # For decapoda-research/llama-*
+    # "MistralForCausalLM": MistralForCausalLM,
+    # "MPTForCausalLM": MPTForCausalLM,
+    # "OPTForCausalLM": OPTForCausalLM,
+    # "QWenLMHeadModel": QWenLMHeadModel,
+    # "RWForCausalLM": FalconForCausalLM,
 }

 # FIXME(woosuk): Remove this once all models support quantization.
 _MODEL_CLASSES_SUPPORT_QUANTIZATION = [
-    LlamaForCausalLM,
+    # LlamaForCausalLM,
 ]


@@ -100,5 +100,6 @@ def get_model(model_config: ModelConfig) -> nn.Module:
     # Load the weights from the cached or downloaded files.
     model.load_weights(model_config.model, model_config.download_dir,
                        model_config.load_format, model_config.revision)
-    model = model.cuda()
+    if model_config.device != 'cpu':
+        model = model.cuda()
     return model.eval()
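
The device gate around model.cuda() above is a standard PyTorch pattern; a standalone sketch of the same idea (illustrative only, not code from this commit):

    import torch
    from torch import nn

    def place_model(model: nn.Module, device: str = "cpu") -> nn.Module:
        # Move to GPU only when a non-CPU device is requested and CUDA is available;
        # otherwise leave the module on CPU, matching get_model() above.
        if device != "cpu" and torch.cuda.is_available():
            model = model.cuda()
        return model.eval()

    # Example: place_model(nn.Linear(4, 4), device="cpu") keeps the layer on CPU.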
vllm/model_executor/models/__init__.py

Lines changed: 30 additions & 30 deletions
@@ -1,35 +1,35 @@
-from vllm.model_executor.models.aquila import AquilaForCausalLM
-from vllm.model_executor.models.baichuan import (BaiChuanForCausalLM,
-                                                 BaichuanForCausalLM)
-from vllm.model_executor.models.bloom import BloomForCausalLM
-from vllm.model_executor.models.falcon import FalconForCausalLM
-from vllm.model_executor.models.gpt2 import GPT2LMHeadModel
-from vllm.model_executor.models.gpt_bigcode import GPTBigCodeForCausalLM
-from vllm.model_executor.models.gpt_j import GPTJForCausalLM
-from vllm.model_executor.models.gpt_neox import GPTNeoXForCausalLM
-from vllm.model_executor.models.internlm import InternLMForCausalLM
-from vllm.model_executor.models.llama import LlamaForCausalLM
+# from vllm.model_executor.models.aquila import AquilaForCausalLM
+# from vllm.model_executor.models.baichuan import (BaiChuanForCausalLM,
+#                                                  BaichuanForCausalLM)
+# from vllm.model_executor.models.bloom import BloomForCausalLM
+# from vllm.model_executor.models.falcon import FalconForCausalLM
+# from vllm.model_executor.models.gpt2 import GPT2LMHeadModel
+# from vllm.model_executor.models.gpt_bigcode import GPTBigCodeForCausalLM
+# from vllm.model_executor.models.gpt_j import GPTJForCausalLM
+# from vllm.model_executor.models.gpt_neox import GPTNeoXForCausalLM
+# from vllm.model_executor.models.internlm import InternLMForCausalLM
+# from vllm.model_executor.models.llama import LlamaForCausalLM
 from vllm.model_executor.models.bigdl_llama import BigDLLlamaForCausalLM
-from vllm.model_executor.models.mpt import MPTForCausalLM
-from vllm.model_executor.models.opt import OPTForCausalLM
-from vllm.model_executor.models.qwen import QWenLMHeadModel
-from vllm.model_executor.models.mistral import MistralForCausalLM
+# from vllm.model_executor.models.mpt import MPTForCausalLM
+# from vllm.model_executor.models.opt import OPTForCausalLM
+# from vllm.model_executor.models.qwen import QWenLMHeadModel
+# from vllm.model_executor.models.mistral import MistralForCausalLM

 __all__ = [
-    "AquilaForCausalLM",
-    "BaiChuanForCausalLM",
-    "BaichuanForCausalLM",
-    "BloomForCausalLM",
-    "FalconForCausalLM",
-    "GPT2LMHeadModel",
-    "GPTBigCodeForCausalLM",
-    "GPTJForCausalLM",
-    "GPTNeoXForCausalLM",
-    "InternLMForCausalLM",
-    "LlamaForCausalLM",
+    # "AquilaForCausalLM",
+    # "BaiChuanForCausalLM",
+    # "BaichuanForCausalLM",
+    # "BloomForCausalLM",
+    # "FalconForCausalLM",
+    # "GPT2LMHeadModel",
+    # "GPTBigCodeForCausalLM",
+    # "GPTJForCausalLM",
+    # "GPTNeoXForCausalLM",
+    # "InternLMForCausalLM",
+    # "LlamaForCausalLM",
     "BigDLLlamaForCausalLM",
-    "MPTForCausalLM",
-    "OPTForCausalLM",
-    "QWenLMHeadModel",
-    "MistralForCausalLM",
+    # "MPTForCausalLM",
+    # "OPTForCausalLM",
+    # "QWenLMHeadModel",
+    # "MistralForCausalLM",
 ]
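
After this change the package's only public export is the BigDL-backed Llama class; a quick import check (illustrative):

    from vllm.model_executor.models import BigDLLlamaForCausalLM  # the sole remaining export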

vllm/model_executor/models/bigdl_llama.py

Lines changed: 16 additions & 14 deletions
@@ -1,7 +1,7 @@
 import torch
 from torch import nn

-from transformers import AutoTokenizer, AutoModelForCausalLM, PreTrainedTokenizerBase, LlamaConfig
+from transformers import AutoTokenizer, PreTrainedTokenizerBase, LlamaConfig
 from typing import Optional, Tuple, List, Type, Dict

 from vllm.transformers_utils.tokenizer import (detokenize_incrementally,
@@ -50,11 +50,15 @@ def __init__(
         super().__init__()
         # pdb.set_trace()
         self.config = config
+        if True:
+            from bigdl.llm.transformers import AutoModelForCausalLM
+        else:
+            from transformers import AutoModelForCausalLM
         self.model = AutoModelForCausalLM.from_pretrained(config._name_or_path)
         self.tokenizer = AutoTokenizer.from_pretrained(config._name_or_path)
         self.device = torch.device(
             "cuda" if torch.cuda.is_available() else "cpu")
-        self.dtype = self.model.config.torch_dtype
+        self.dtype = self.model.dtype
         # self.tmp_kv_cache = [[0]]

     def decode(self, generated_ids: List[int]) -> str:
@@ -77,7 +81,7 @@ def forward(
         seq_len = len(seq_group_meta_data_lists)

         bigdl_input_ids = []
-        bigdl_position_ids = []
+        # bigdl_position_ids = []
         cur_seq_ids = []
         bigdl_sampling_params = {}
         max_context_len = 0
@@ -94,16 +98,15 @@
             context_len = seq_data.get_len()
             if seq_group_meta_data.is_prompt:
                 bigdl_input_ids.append(cur_seq_input_ids)
-                bigdl_position_ids.append(list(range(context_len)))
+                # bigdl_position_ids.append(list(range(context_len)))
                 max_context_len = max(max_context_len, context_len)
             else:
                 bigdl_input_ids.append([cur_seq_input_ids[-1]])
-                bigdl_position_ids.append([context_len - 1])
+                # bigdl_position_ids.append([context_len - 1])

             bigdl_sampling_params[seq_id] = seq_group_meta_data.sampling_params

             context_len = seq_data.get_len()
-            bigdl_position_ids.append(range(context_len))

         if all_decoding:
             # pdb.set_trace()
@@ -125,15 +128,14 @@
                 _pad_to_max(input_ids, max_context_len)
                 for input_ids in bigdl_input_ids
             ]
-            bigdl_position_ids = [
-                _pad_to_max(position_ids, max_context_len)
-                for position_ids in bigdl_position_ids
-            ]
+            # bigdl_position_ids = [
+            #     _pad_to_max(position_ids, max_context_len)
+            #     for position_ids in bigdl_position_ids
+            # ]

         bigdl_input_ids = torch.tensor(bigdl_input_ids, device=self.device)
-        bigdl_position_ids = torch.tensor(bigdl_position_ids,
-                                          device=self.device)
-
+        # bigdl_position_ids = torch.tensor(bigdl_position_ids,
+        #                                   device=self.device, dtype=self.dtype)
         if all_decoding:
             kwargs = {
                 "input_ids": bigdl_input_ids,
@@ -165,7 +167,7 @@
             last_token_logits = logits_processor(
                 None, outputs.logits[index:index + 1, -1, :])[0]
             probs = torch.softmax(last_token_logits, dim=-1)
-            indices = torch.multinomial(probs, num_samples=2)
+            indices = torch.multinomial(probs, num_samples=cur_sampling_params.best_of)
             tokens = [int(token) for token in indices.tolist()]

             logprobs = math.log(probs[tokens[0]])
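
As a standalone illustration of the sampling step changed in the last hunk (the logits and best_of value below are made up):

    import math
    import torch

    last_token_logits = torch.randn(8)               # toy logits over an 8-token vocabulary
    probs = torch.softmax(last_token_logits, dim=-1)

    best_of = 2                                      # stands in for cur_sampling_params.best_of
    indices = torch.multinomial(probs, num_samples=best_of)
    tokens = [int(token) for token in indices.tolist()]

    logprob = math.log(probs[tokens[0]])             # log-probability of the first sampled token
    print(tokens, logprob)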

vllm/utils.py

Lines changed: 8 additions & 8 deletions
@@ -5,7 +5,7 @@
 import psutil
 import torch

-from vllm import cuda_utils
+# from vllm import cuda_utils


 class Device(enum.Enum):
@@ -27,13 +27,13 @@ def reset(self) -> None:
         self.counter = 0


-def get_max_shared_memory_bytes(gpu: int = 0) -> int:
-    """Returns the maximum shared memory per thread block in bytes."""
-    # https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__TYPES.html
-    cudaDevAttrMaxSharedMemoryPerBlockOptin = 97  # pylint: disable=invalid-name
-    max_shared_mem = cuda_utils.get_device_attribute(
-        cudaDevAttrMaxSharedMemoryPerBlockOptin, gpu)
-    return int(max_shared_mem)
+# def get_max_shared_memory_bytes(gpu: int = 0) -> int:
+#     """Returns the maximum shared memory per thread block in bytes."""
+#     # https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__TYPES.html
+#     cudaDevAttrMaxSharedMemoryPerBlockOptin = 97  # pylint: disable=invalid-name
+#     max_shared_mem = cuda_utils.get_device_attribute(
+#         cudaDevAttrMaxSharedMemoryPerBlockOptin, gpu)
+#     return int(max_shared_mem)


 def get_gpu_memory(gpu: int = 0) -> int:
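
With the CUDA-only helper commented out, any remaining device query has to degrade gracefully on CPU-only builds; a minimal sketch of that guard (illustrative, not part of this commit):

    import psutil
    import torch

    def total_device_memory_bytes(gpu: int = 0) -> int:
        # Total GPU memory when CUDA is present, otherwise total system RAM.
        if torch.cuda.is_available():
            return torch.cuda.get_device_properties(gpu).total_memory
        return psutil.virtual_memory().total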

vllm/worker/worker.py

Lines changed: 23 additions & 22 deletions
@@ -12,8 +12,8 @@
     initialize_model_parallel)
 from vllm.sampling_params import SamplingParams
 from vllm.sequence import SamplerOutput, SequenceData, SequenceGroupMetadata
-from vllm.worker.cache_engine import CacheEngine
-from vllm.utils import get_gpu_memory, get_max_shared_memory_bytes
+# from vllm.worker.cache_engine import CacheEngine
+# from vllm.utils import get_gpu_memory, get_max_shared_memory_bytes

 import pdb

@@ -63,22 +63,23 @@ def clean_finished_seqs(self, finished_seqs: List[int]):
             del self.kv_cache[seq_id]

     def init_model(self):
-        # This env var set by Ray causes exceptions with graph building.
-        os.environ.pop("NCCL_ASYNC_ERROR_HANDLING", None)
-        # Env vars will be set by Ray.
-        self.rank = self.rank if self.rank is not None else int(
-            os.getenv("RANK", "-1"))
-        local_rank = int(os.getenv("LOCAL_RANK", "0"))
-        self.device = torch.device(f"cuda:{local_rank}")
-        if self.rank < 0:
-            raise ValueError("Invalid or unspecified rank.")
-        torch.cuda.set_device(self.device)
-
-        _check_if_gpu_supports_dtype(self.model_config.dtype)
-
-        # Initialize the distributed environment.
-        _init_distributed_environment(self.parallel_config, self.rank,
-                                      self.distributed_init_method)
+        if self.model_config.device != 'cpu':
+            # This env var set by Ray causes exceptions with graph building.
+            os.environ.pop("NCCL_ASYNC_ERROR_HANDLING", None)
+            # Env vars will be set by Ray.
+            self.rank = self.rank if self.rank is not None else int(
+                os.getenv("RANK", "-1"))
+            local_rank = int(os.getenv("LOCAL_RANK", "0"))
+            self.device = torch.device(f"cuda:{local_rank}")
+            if self.rank < 0:
+                raise ValueError("Invalid or unspecified rank.")
+            torch.cuda.set_device(self.device)
+
+            _check_if_gpu_supports_dtype(self.model_config.dtype)
+
+            # Initialize the distributed environment.
+            _init_distributed_environment(self.parallel_config, self.rank,
+                                          self.distributed_init_method)

         # Initialize the model.
         set_random_seed(self.model_config.seed)
@@ -136,8 +137,8 @@ def profile_num_available_blocks(
         torch.cuda.synchronize()
         peak_memory = torch.cuda.max_memory_allocated()
         total_gpu_memory = get_gpu_memory()
-        cache_block_size = CacheEngine.get_cache_block_size(
-            block_size, self.model_config, self.parallel_config)
+        # cache_block_size = CacheEngine.get_cache_block_size(
+        #     block_size, self.model_config, self.parallel_config)
         num_gpu_blocks = int(
             (total_gpu_memory * gpu_memory_utilization - peak_memory) //
             cache_block_size)
@@ -163,8 +164,8 @@ def init_cache_engine(self, cache_config: CacheConfig) -> None:
                                               self.sliding_window)
         _check_if_can_support_max_seq_len(max_seq_len, self.block_size)

-        self.cache_engine = CacheEngine(self.cache_config, self.model_config,
-                                        self.parallel_config)
+        # self.cache_engine = CacheEngine(self.cache_config, self.model_config,
+        #                                 self.parallel_config)
         self.cache_events = self.cache_engine.events
         self.gpu_cache = self.cache_engine.gpu_cache
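
The init_model() gating above boils down to choosing a device up front; a condensed sketch of that logic (illustrative helper, not part of the commit):

    import os
    import torch

    def pick_device(preferred: str = "cpu") -> torch.device:
        # Mirror of the branch in init_model(): bind to the local CUDA device only
        # when a non-CPU device is requested and CUDA is actually available.
        if preferred != "cpu" and torch.cuda.is_available():
            local_rank = int(os.getenv("LOCAL_RANK", "0"))
            return torch.device(f"cuda:{local_rank}")
        return torch.device("cpu")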
