Commit a039da9

Merge remote-tracking branch 'AzureGIT/main' into llava_devel
2 parents: a8b0dbc + 358c328

32 files changed: +597, -333 lines

README.md

Lines changed: 2 additions & 1 deletion
@@ -27,7 +27,7 @@ Easy, fast, and cheap LLM serving for everyone
 - [2023/06] We officially released vLLM! FastChat-vLLM integration has powered [LMSYS Vicuna and Chatbot Arena](https://chat.lmsys.org) since mid-April. Check out our [blog post](https://vllm.ai).
 
 ---
-
+## About
 vLLM is a fast and easy-to-use library for LLM inference and serving.
 
 vLLM is fast with:
@@ -54,6 +54,7 @@ vLLM seamlessly supports many Hugging Face models, including the following archi
 - Baichuan & Baichuan2 (`baichuan-inc/Baichuan2-13B-Chat`, `baichuan-inc/Baichuan-7B`, etc.)
 - BLOOM (`bigscience/bloom`, `bigscience/bloomz`, etc.)
 - ChatGLM (`THUDM/chatglm2-6b`, `THUDM/chatglm3-6b`, etc.)
+- DeciLM (`Deci/DeciLM-7B`, `Deci/DeciLM-7B-instruct`, etc.)
 - Falcon (`tiiuae/falcon-7b`, `tiiuae/falcon-40b`, `tiiuae/falcon-rw-7b`, etc.)
 - GPT-2 (`gpt2`, `gpt2-xl`, etc.)
 - GPT BigCode (`bigcode/starcoder`, `bigcode/gpt_bigcode-santacoder`, etc.)

csrc/pos_encoding_kernels.cu

Lines changed: 6 additions & 6 deletions
@@ -43,8 +43,8 @@ __global__ void rotary_embedding_kernel(
   scalar_t* __restrict__ key,                 // [batch_size, seq_len, num_kv_heads, head_size] or [num_tokens, num_kv_heads, head_size]
   const scalar_t* __restrict__ cos_sin_cache, // [max_position, 2, rot_dim // 2]
   const int rot_dim,
-  const int query_stride,
-  const int key_stride,
+  const int64_t query_stride,
+  const int64_t key_stride,
   const int num_heads,
   const int num_kv_heads,
   const int head_size) {
@@ -60,7 +60,7 @@ __global__ void rotary_embedding_kernel(
   const int nq = num_heads * embed_dim;
   for (int i = threadIdx.x; i < nq; i += blockDim.x) {
     const int head_idx = i / embed_dim;
-    const int token_head = token_idx * query_stride + head_idx * head_size;
+    const int64_t token_head = token_idx * query_stride + head_idx * head_size;
     const int rot_offset = i % embed_dim;
     apply_rotary_embedding<scalar_t, IS_NEOX>(query + token_head, cos_ptr,
                                               sin_ptr, rot_offset, embed_dim);
@@ -69,7 +69,7 @@ __global__ void rotary_embedding_kernel(
   const int nk = num_kv_heads * embed_dim;
   for (int i = threadIdx.x; i < nk; i += blockDim.x) {
     const int head_idx = i / embed_dim;
-    const int token_head = token_idx * key_stride + head_idx * head_size;
+    const int64_t token_head = token_idx * key_stride + head_idx * head_size;
     const int rot_offset = i % embed_dim;
     apply_rotary_embedding<scalar_t, IS_NEOX>(key + token_head, cos_ptr,
                                               sin_ptr, rot_offset, embed_dim);
@@ -89,8 +89,8 @@ void rotary_embedding(
   int rot_dim = cos_sin_cache.size(1);
   int num_heads = query.size(-1) / head_size;
   int num_kv_heads = key.size(-1) / head_size;
-  int query_stride = query.stride(-2);
-  int key_stride = key.stride(-2);
+  int64_t query_stride = query.stride(-2);
+  int64_t key_stride = key.stride(-2);
 
   dim3 grid(num_tokens);
   dim3 block(std::min(num_heads * rot_dim / 2, 512));
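The int to int64_t change above is an overflow fix: token_idx * query_stride feeds the kernel's pointer arithmetic, and with 32-bit operands the product can exceed INT_MAX for large batches of long sequences, wrapping to a negative (out-of-bounds) offset. A minimal sketch of the failure mode in Python, with purely illustrative sizes:

def as_int32(x: int) -> int:
    """Interpret x modulo 2**32 as a signed 32-bit integer (C/CUDA int overflow behavior)."""
    x &= 0xFFFFFFFF
    return x - (1 << 32) if x >= (1 << 31) else x

query_stride = 64 * 128      # hypothetical: 64 heads * 128 head_size = 8192 elements per token
token_idx = 300_000          # plausible with many long sequences in a batch

print(token_idx * query_stride)             # 2457600000, the correct 64-bit offset
print(as_int32(token_idx * query_stride))   # -1837367296, what 32-bit arithmetic would produce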

csrc/quantization/gptq/q_gemm.cu

Lines changed: 10 additions & 0 deletions
@@ -28,6 +28,7 @@ namespace gptq {
 #define DIVIDE(x, size) (((x) + (size) - 1) / (size))
 
 #if defined(USE_ROCM)
+#include <hipblas/hipblas.h>
 __host__ __forceinline__ hipblasStatus_t __compat_hipblasHgemm(hipblasHandle_t handle,
                                                                hipblasOperation_t transA,
                                                                hipblasOperation_t transB,
@@ -520,12 +521,21 @@ __global__ void gemm_half_q_half_alt_kernel(
       zeros_tmp[tmp_k] = zero;
     }
     for (int m = 0; m < b_end; m++) {
+#ifndef USE_ROCM
       res2 = {};
+#else
+      res2.x = __half_as_ushort(__float2half(0));
+      res2.y = __half_as_ushort(__float2half(0));
+#endif
       res2 = __hfma2(__hfma2(deq2[(tmp >> 0) & 0xff][off], scales_tmp[0], zeros_tmp[0]), blockvec[m][k + 0], res2);
       res2 = __hfma2(__hfma2(deq2[(tmp >> 8) & 0xff][off], scales_tmp[1], zeros_tmp[1]), blockvec[m][k + 1], res2);
       res2 = __hfma2(__hfma2(deq2[(tmp >> 16) & 0xff][off], scales_tmp[2], zeros_tmp[2]), blockvec[m][k + 2], res2);
       res2 = __hfma2(__hfma2(deq2[(tmp >> 24) & 0xff][off], scales_tmp[3], zeros_tmp[3]), blockvec[m][k + 3], res2);
+#ifndef USE_ROCM
       res[m] = __hadd(res[m], __hadd(res2.x, res2.y));
+#else
+      res[m] = __hadd(res[m], __hadd(__ushort_as_half(res2.x), __ushort_as_half(res2.y)));
+#endif
     }
     i += width;
     k += 4;

docs/source/getting_started/amd-installation.rst

Lines changed: 1 addition & 0 deletions
@@ -116,6 +116,7 @@ Alternatively, if you plan to install vLLM-ROCm on a local machine or start from
 
 - `ROCm <https://rocm.docs.amd.com/en/latest/deploy/linux/index.html>`_
 - `Pytorch <https://pytorch.org/>`_
+- `hipBLAS <https://rocm.docs.amd.com/projects/hipBLAS/en/latest/install.html>`_
 
 1. Install `flash attention for ROCm <https://github.com/ROCmSoftwarePlatform/flash-attention/tree/flash_attention_for_rocm>`_

docs/source/getting_started/installation.rst

Lines changed: 4 additions & 0 deletions
@@ -42,6 +42,10 @@ You can install vLLM using pip:
     $ pip uninstall torch -y
     $ pip install torch --upgrade --index-url https://download.pytorch.org/whl/cu118
 
+    $ # Re-install xFormers with CUDA 11.8.
+    $ pip uninstall xformers -y
+    $ pip install --upgrade xformers --index-url https://download.pytorch.org/whl/cu118
+
 
 .. _build_from_source:

docs/source/models/engine_args.rst

Lines changed: 4 additions & 2 deletions
@@ -89,9 +89,11 @@ Below, you can find an explanation of every engine argument for vLLM:
 
     CPU swap space size (GiB) per GPU.
 
-.. option:: --gpu-memory-utilization <percentage>
+.. option:: --gpu-memory-utilization <fraction>
 
-    The percentage of GPU memory to be used for the model executor.
+    The fraction of GPU memory to be used for the model executor, which can range from 0 to 1.
+    For example, a value of 0.5 would imply 50% GPU memory utilization.
+    If unspecified, will use the default value of 0.9.
 
 .. option:: --max-num-batched-tokens <tokens>
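Since the option is now documented as a fraction in [0, 1] with a 0.9 default, the matching Python API argument takes the same value; a short usage sketch (the model name is only an example):

from vllm import LLM, SamplingParams

# gpu_memory_utilization mirrors --gpu-memory-utilization: 0.5 reserves roughly
# half of the GPU memory for the model executor (the default is 0.9).
llm = LLM(model="facebook/opt-125m", gpu_memory_utilization=0.5)
outputs = llm.generate(["Hello, my name is"], SamplingParams(max_tokens=16))
print(outputs[0].outputs[0].text)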

docs/source/models/supported_models.rst

Lines changed: 4 additions & 1 deletion
@@ -23,6 +23,9 @@ Alongside each architecture, we include some popular models that use it.
   * - :code:`ChatGLMModel`
     - ChatGLM
     - :code:`THUDM/chatglm2-6b`, :code:`THUDM/chatglm3-6b`, etc.
+  * - :code:`DeciLMForCausalLM`
+    - DeciLM
+    - :code:`Deci/DeciLM-7B`, :code:`Deci/DeciLM-7B-instruct`, etc.
   * - :code:`BloomForCausalLM`
     - BLOOM, BLOOMZ, BLOOMChat
     - :code:`bigscience/bloom`, :code:`bigscience/bloomz`, etc.
@@ -90,7 +93,7 @@ Alternatively, you can raise an issue on our `GitHub <https://github.com/vllm-pr
 If vLLM successfully generates text, it indicates that your model is supported.
 
 .. tip::
-    To use models from `ModelScope <www.modelscope.cn>`_ instead of HuggingFace Hub, set an environment variable:
+    To use models from `ModelScope <https://www.modelscope.cn>`_ instead of HuggingFace Hub, set an environment variable:
 
     .. code-block:: shell

docs/source/serving/serving_with_langchain.rst

Lines changed: 1 addition & 1 deletion
@@ -28,4 +28,4 @@ To run inference on a single or multiple GPUs, use ``VLLM`` class from ``langcha
 
     print(llm("What is the capital of France ?"))
 
-Please refer to this `Tutorial <https://github.com/langchain-ai/langchain/blob/master/docs/extras/integrations/llms/vllm.ipynb>`_ for more details.
+Please refer to this `Tutorial <https://github.com/langchain-ai/langchain/blob/master/docs/docs/integrations/llms/vllm.ipynb>`_ for more details.

setup.py

Lines changed: 1 addition & 1 deletion
@@ -219,13 +219,13 @@ def get_torch_arch_list() -> Set[str]:
     "csrc/activation_kernels.cu",
     "csrc/layernorm_kernels.cu",
     "csrc/quantization/squeezellm/quant_cuda_kernel.cu",
+    "csrc/quantization/gptq/q_gemm.cu",
     "csrc/cuda_utils_kernels.cu",
     "csrc/pybind.cpp",
 ]
 
 if _is_cuda():
     vllm_extension_sources.append("csrc/quantization/awq/gemm_kernels.cu")
-    vllm_extension_sources.append("csrc/quantization/gptq/q_gemm.cu")
 
 vllm_extension = CUDAExtension(
     name="vllm._C",

tests/async_engine/test_api_server.py

Lines changed: 8 additions & 6 deletions
@@ -44,13 +44,14 @@ def test_api_server(api_server):
     """
     with Pool(32) as pool:
         # Wait until the server is ready
-        prompts = ["Hello world"] * 1
+        prompts = ["warm up"] * 1
         result = None
         while not result:
             try:
-                for _ in pool.map(_query_server, prompts):
+                for r in pool.map(_query_server, prompts):
+                    result = r
                     break
-            except Exception:
+            except requests.exceptions.ConnectionError:
                 time.sleep(1)
 
         # Actual tests start here
@@ -63,13 +64,14 @@ def test_api_server(api_server):
         assert num_aborted_requests == 0
 
         # Try with 100 prompts
-        prompts = ["Hello world"] * 100
+        prompts = ["test prompt"] * 100
         for result in pool.map(_query_server, prompts):
             assert result
 
         # Cancel requests
+        prompts = ["canceled requests"] * 100
         pool.map_async(_query_server, prompts)
-        time.sleep(0.01)
+        time.sleep(0.001)
         pool.terminate()
         pool.join()
 
@@ -81,6 +83,6 @@ def test_api_server(api_server):
     # check that server still runs after cancellations
     with Pool(32) as pool:
         # Try with 100 prompts
-        prompts = ["Hello world"] * 100
+        prompts = ["test prompt after canceled"] * 100
         for result in pool.map(_query_server, prompts):
             assert result
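The readiness loop above now retries only on requests.exceptions.ConnectionError (the server socket is not open yet) and records the first successful response, instead of swallowing every exception. A standalone sketch of that polling pattern, with a placeholder URL:

import time
import requests

def wait_until_ready(url: str = "http://localhost:8000/health") -> requests.Response:
    """Poll until the server accepts connections; unrelated errors propagate."""
    while True:
        try:
            return requests.get(url, timeout=5)
        except requests.exceptions.ConnectionError:
            # Server not listening yet -- back off briefly and retry.
            time.sleep(1)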

tests/conftest.py

Lines changed: 5 additions & 4 deletions
@@ -8,8 +8,9 @@
 from vllm import LLM, SamplingParams
 from vllm.transformers_utils.tokenizer import get_tokenizer
 
-_TEST_PROMPTS = ["prompts/example.txt"]
-_LONG_PROMPTS = ["prompts/summary.txt"]
+_TEST_DIR = os.path.dirname(__file__)
+_TEST_PROMPTS = [os.path.join(_TEST_DIR, "prompts", "example.txt")]
+_LONG_PROMPTS = [os.path.join(_TEST_DIR, "prompts", "summary.txt")]
 
 
 def _read_prompts(filename: str) -> str:
@@ -24,15 +25,15 @@ def _read_prompts(filename: str) -> str:
 def example_prompts() -> List[str]:
     prompts = []
     for filename in _TEST_PROMPTS:
-        prompts += _read_prompts(os.path.join("tests", filename))
+        prompts += _read_prompts(filename)
     return prompts
 
 
 @pytest.fixture
 def example_long_prompts() -> List[str]:
     prompts = []
     for filename in _LONG_PROMPTS:
-        prompts += _read_prompts(os.path.join("tests", filename))
+        prompts += _read_prompts(filename)
     return prompts
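The conftest change anchors the prompt files to the test module's own directory rather than assuming pytest runs from the repository root, so the fixtures work from any working directory. The general pattern, sketched with a hypothetical helper:

import os

_TEST_DIR = os.path.dirname(__file__)   # directory containing this test module

def load_prompt_file(name: str) -> str:
    """Resolve data files relative to this module so the CWD no longer matters."""
    path = os.path.join(_TEST_DIR, "prompts", name)   # e.g. .../tests/prompts/example.txt
    with open(path, encoding="utf-8") as f:
        return f.read()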

tests/distributed/test_comm_ops.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@
88
import torch
99

1010
from vllm.config import ParallelConfig
11-
from vllm.engine.ray_utils import get_open_port
11+
from vllm.utils import get_open_port
1212
from vllm.model_executor.parallel_utils.communication_op import (
1313
tensor_model_parallel_all_reduce,
1414
tensor_model_parallel_all_gather,

tests/models/test_models.py

Lines changed: 1 addition & 0 deletions
@@ -8,6 +8,7 @@
     "facebook/opt-125m",
     "meta-llama/Llama-2-7b-hf",
     "mistralai/Mistral-7B-v0.1",
+    "Deci/DeciLM-7b",
     "tiiuae/falcon-7b",
     "gpt2",
     "bigcode/tiny_starcoder_py",

vllm/__init__.py

Lines changed: 1 addition & 1 deletion
@@ -9,7 +9,7 @@
 from vllm.outputs import CompletionOutput, RequestOutput
 from vllm.sampling_params import SamplingParams
 
-__version__ = "0.2.5"
+__version__ = "0.2.6"
 
 __all__ = [
     "LLM",

vllm/config.py

Lines changed: 15 additions & 18 deletions
@@ -112,24 +112,20 @@ def _verify_load_format(self) -> None:
         supported_load_format = [
             "auto", "pt", "safetensors", "npcache", "dummy"
         ]
-        rocm_not_supported_load_format = ["safetensors"]
+        rocm_not_supported_load_format = []
         if load_format not in supported_load_format:
             raise ValueError(
                 f"Unknown load format: {self.load_format}. Must be one of "
                 "'auto', 'pt', 'safetensors', 'npcache', or 'dummy'.")
-        if is_hip():
-            if load_format in ["safetensors"]:
-                rocm_supported_load_format = [
-                    f for f in supported_load_format
-                    if (f not in rocm_not_supported_load_format)
-                ]
-                raise ValueError(
-                    f"load format \'{load_format}\' is not supported in ROCm. "
-                    f"Supported load format are "
-                    f"{rocm_supported_load_format}")
-            # Force ROCm to load from pt weights if nothing specific is set
-            if load_format == "auto":
-                load_format = "pt"
+        if is_hip() and load_format in rocm_not_supported_load_format:
+            rocm_supported_load_format = [
+                f for f in supported_load_format
+                if (f not in rocm_not_supported_load_format)
+            ]
+            raise ValueError(
+                f"load format \'{load_format}\' is not supported in ROCm. "
+                f"Supported load format are "
+                f"{rocm_supported_load_format}")
 
         # TODO: Remove this check once HF updates the pt weights of Mixtral.
         architectures = getattr(self.hf_config, "architectures", [])
@@ -149,7 +145,7 @@ def _verify_tokenizer_mode(self) -> None:
 
     def _verify_quantization(self) -> None:
         supported_quantization = ["awq", "gptq", "squeezellm"]
-        rocm_not_supported_quantization = ["awq", "gptq"]
+        rocm_not_supported_quantization = ["awq"]
         if self.quantization is not None:
             self.quantization = self.quantization.lower()
 
@@ -185,10 +181,11 @@ def _verify_cuda_graph(self) -> None:
             self.max_context_len_to_capture = self.max_model_len
         self.max_context_len_to_capture = min(self.max_context_len_to_capture,
                                               self.max_model_len)
-        if self.quantization == "gptq" and not self.enforce_eager:
+        if (self.quantization in ["gptq", "squeezellm"]
+                and not self.enforce_eager):
             # Related issue: https://github.com/vllm-project/vllm/issues/2147
-            logger.warning("GPTQ does not support CUDA graph yet. Disabling "
-                           "CUDA graph.")
+            logger.warning(f"{self.quantization} does not support CUDA graph "
+                           "yet. Disabling CUDA graph.")
             self.enforce_eager = True
 
     def verify_with_parallel_config(
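The widened check means both GPTQ and SqueezeLLM models now fall back to eager execution (with a warning) unless the caller already set enforce_eager. A minimal standalone sketch of the same gating logic, not vLLM's actual class:

from typing import Optional

def resolve_enforce_eager(quantization: Optional[str], enforce_eager: bool) -> bool:
    # Quantization schemes without CUDA-graph support force eager mode
    # instead of failing at graph-capture time.
    if quantization in ("gptq", "squeezellm") and not enforce_eager:
        print(f"{quantization} does not support CUDA graph yet. Disabling CUDA graph.")
        return True
    return enforce_eager

assert resolve_enforce_eager("squeezellm", False) is True
assert resolve_enforce_eager("awq", False) is False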

vllm/core/block_manager.py

Lines changed: 3 additions & 3 deletions
@@ -103,7 +103,7 @@ def __init__(
     def can_allocate(self, seq_group: SequenceGroup) -> AllocStatus:
         # FIXME(woosuk): Here we assume that all sequences in the group share
         # the same prompt. This may not be true for preempted sequences.
-        seq = seq_group.get_seqs()[0]
+        seq = seq_group.get_seqs(status=SequenceStatus.WAITING)[0]
         num_required_blocks = len(seq.logical_token_blocks)
         if self.block_sliding_window is not None:
             num_required_blocks = min(num_required_blocks,
@@ -122,7 +122,7 @@ def can_allocate(self, seq_group: SequenceGroup) -> AllocStatus:
     def allocate(self, seq_group: SequenceGroup) -> None:
         # NOTE: Here we assume that all sequences in the group have the same
         # prompt.
-        seq = seq_group.get_seqs()[0]
+        seq = seq_group.get_seqs(status=SequenceStatus.WAITING)[0]
 
         # Allocate new physical token blocks that will store the prompt tokens.
         block_table: BlockTable = []
@@ -137,7 +137,7 @@ def allocate(self, seq_group: SequenceGroup) -> None:
             block_table.append(block)
 
         # Assign the block table for each sequence.
-        for seq in seq_group.get_seqs():
+        for seq in seq_group.get_seqs(status=SequenceStatus.WAITING):
             self.block_tables[seq.seq_id] = block_table.copy()
 
     def can_append_slot(self, seq_group: SequenceGroup) -> bool:

vllm/core/scheduler.py

Lines changed: 7 additions & 5 deletions
@@ -139,15 +139,17 @@ def _schedule(self) -> SchedulerOutputs:
             while self.waiting:
                 seq_group = self.waiting[0]
 
-                assert seq_group.num_seqs() == 1, (
+                waiting_seqs = seq_group.get_seqs(
+                    status=SequenceStatus.WAITING)
+                assert len(waiting_seqs) == 1, (
                     "Waiting sequence group should have only one prompt "
                     "sequence.")
-                num_prompt_tokens = seq_group.get_seqs()[0].get_len()
+                num_prompt_tokens = waiting_seqs[0].get_len()
                 if num_prompt_tokens > self.prompt_limit:
                     logger.warning(
                         f"Input prompt ({num_prompt_tokens} tokens) is too long"
                         f" and exceeds limit of {self.prompt_limit}")
-                    for seq in seq_group.get_seqs():
+                    for seq in waiting_seqs:
                         seq.status = SequenceStatus.FINISHED_IGNORED
                     ignored_seq_groups.append(seq_group)
                     self.waiting.pop(0)
@@ -161,7 +163,7 @@ def _schedule(self) -> SchedulerOutputs:
                     logger.warning(
                         f"Input prompt ({num_prompt_tokens} tokens) is too long"
                         f" and exceeds the capacity of block_manager")
-                    for seq in seq_group.get_seqs():
+                    for seq in waiting_seqs:
                         seq.status = SequenceStatus.FINISHED_IGNORED
                     ignored_seq_groups.append(seq_group)
                     self.waiting.pop(0)
@@ -317,7 +319,7 @@ def free_finished_seq_groups(self) -> None:
 
     def _allocate(self, seq_group: SequenceGroup) -> None:
         self.block_manager.allocate(seq_group)
-        for seq in seq_group.get_seqs():
+        for seq in seq_group.get_seqs(status=SequenceStatus.WAITING):
             seq.status = SequenceStatus.RUNNING
 
     def _append_slot(
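Both the block-manager and scheduler hunks swap bare seq_group.get_seqs() for a status-filtered call, so only sequences still in the WAITING state are counted and allocated when a prompt is scheduled. A toy sketch of that filtering pattern (illustrative classes, not vLLM's own):

from dataclasses import dataclass
from enum import Enum, auto
from typing import List, Optional

class SequenceStatus(Enum):
    WAITING = auto()
    RUNNING = auto()
    FINISHED_IGNORED = auto()

@dataclass
class Seq:
    seq_id: int
    status: SequenceStatus

class SeqGroup:
    def __init__(self, seqs: List[Seq]):
        self.seqs = seqs

    def get_seqs(self, status: Optional[SequenceStatus] = None) -> List[Seq]:
        # No filter returns everything; a status returns only matching sequences.
        if status is None:
            return self.seqs
        return [s for s in self.seqs if s.status == status]

group = SeqGroup([Seq(0, SequenceStatus.WAITING), Seq(1, SequenceStatus.RUNNING)])
assert len(group.get_seqs(status=SequenceStatus.WAITING)) == 1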
