Description

Your current environment:
- vLLM: Docker image v0.5.0.post1
- GPU: NVIDIA RTX 4090
- CUDA driver: Driver Version 535.86.10
- Model: Qwen1.5-14B-Chat-AWQ, served with enable-prefix-caching

🐛 Describe the bug
ERROR 07-31 15:13:06 async_llm_engine.py:61] Engine background task failed
ERROR 07-31 15:13:06 async_llm_engine.py:61] Traceback (most recent call last):
ERROR 07-31 15:13:06 async_llm_engine.py:61] File "/usr/local/lib/python3.10/dist-packages/vllm/engine/async_llm_engine.py", line 51, in _log_task_completion
ERROR 07-31 15:13:06 async_llm_engine.py:61] return_value = task.result()
ERROR 07-31 15:13:06 async_llm_engine.py:61] File "/usr/local/lib/python3.10/dist-packages/vllm/engine/async_llm_engine.py", line 548, in run_engine_loop
ERROR 07-31 15:13:06 async_llm_engine.py:61] has_requests_in_progress = await asyncio.wait_for(
ERROR 07-31 15:13:06 async_llm_engine.py:61] File "/usr/lib/python3.10/asyncio/tasks.py", line 445, in wait_for
ERROR 07-31 15:13:06 async_llm_engine.py:61] return fut.result()
ERROR 07-31 15:13:06 async_llm_engine.py:61] File "/usr/local/lib/python3.10/dist-packages/vllm/engine/async_llm_engine.py", line 522, in engine_step
ERROR 07-31 15:13:06 async_llm_engine.py:61] request_outputs = await self.engine.step_async()
ERROR 07-31 15:13:06 async_llm_engine.py:61] File "/usr/local/lib/python3.10/dist-packages/vllm/engine/async_llm_engine.py", line 244, in step_async
ERROR 07-31 15:13:06 async_llm_engine.py:61] output = await self.model_executor.execute_model_async(
ERROR 07-31 15:13:06 async_llm_engine.py:61] File "/usr/local/lib/python3.10/dist-packages/vllm/executor/gpu_executor.py", line 117, in execute_model_async
ERROR 07-31 15:13:06 async_llm_engine.py:61] output = await make_async(self.driver_worker.execute_model
ERROR 07-31 15:13:06 async_llm_engine.py:61] File "/usr/lib/python3.10/concurrent/futures/thread.py", line 58, in run
ERROR 07-31 15:13:06 async_llm_engine.py:61] result = self.fn(*self.args, **self.kwargs)
ERROR 07-31 15:13:06 async_llm_engine.py:61] File "/usr/local/lib/python3.10/dist-packages/torch/utils/_contextlib.py", line 115, in decorate_context
ERROR 07-31 15:13:06 async_llm_engine.py:61] return func(*args, **kwargs)
ERROR 07-31 15:13:06 async_llm_engine.py:61] File "/usr/local/lib/python3.10/dist-packages/vllm/worker/worker.py", line 280, in execute_model
ERROR 07-31 15:13:06 async_llm_engine.py:61] output = self.model_runner.execute_model(seq_group_metadata_list,
ERROR 07-31 15:13:06 async_llm_engine.py:61] File "/usr/local/lib/python3.10/dist-packages/torch/utils/_contextlib.py", line 115, in decorate_context
ERROR 07-31 15:13:06 async_llm_engine.py:61] return func(*args, **kwargs)
ERROR 07-31 15:13:06 async_llm_engine.py:61] File "/usr/local/lib/python3.10/dist-packages/vllm/worker/model_runner.py", line 749, in execute_model
ERROR 07-31 15:13:06 async_llm_engine.py:61] hidden_states = model_executable(
ERROR 07-31 15:13:06 async_llm_engine.py:61] File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
ERROR 07-31 15:13:06 async_llm_engine.py:61] return self._call_impl(*args, **kwargs)
ERROR 07-31 15:13:06 async_llm_engine.py:61] File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1541, in _call_impl
ERROR 07-31 15:13:06 async_llm_engine.py:61] return forward_call(*args, **kwargs)
ERROR 07-31 15:13:06 async_llm_engine.py:61] File "/usr/local/lib/python3.10/dist-packages/vllm/model_executor/models/qwen2.py", line 330, in forward
ERROR 07-31 15:13:06 async_llm_engine.py:61] hidden_states = self.model(input_ids, positions, kv_caches,
ERROR 07-31 15:13:06 async_llm_engine.py:61] File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
ERROR 07-31 15:13:06 async_llm_engine.py:61] return self._call_impl(*args, **kwargs)
ERROR 07-31 15:13:06 async_llm_engine.py:61] File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1541, in _call_impl
ERROR 07-31 15:13:06 async_llm_engine.py:61] return forward_call(*args, **kwargs)
ERROR 07-31 15:13:06 async_llm_engine.py:61] File "/usr/local/lib/python3.10/dist-packages/vllm/model_executor/models/qwen2.py", line 254, in forward
ERROR 07-31 15:13:06 async_llm_engine.py:61] hidden_states, residual = layer(
ERROR 07-31 15:13:06 async_llm_engine.py:61] File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
ERROR 07-31 15:13:06 async_llm_engine.py:61] return self._call_impl(*args, **kwargs)
ERROR 07-31 15:13:06 async_llm_engine.py:61] File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1541, in _call_impl
ERROR 07-31 15:13:06 async_llm_engine.py:61] return forward_call(*args, **kwargs)
ERROR 07-31 15:13:06 async_llm_engine.py:61] File "/usr/local/lib/python3.10/dist-packages/vllm/model_executor/models/qwen2.py", line 206, in forward
ERROR 07-31 15:13:06 async_llm_engine.py:61] hidden_states = self.self_attn(
ERROR 07-31 15:13:06 async_llm_engine.py:61] File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
ERROR 07-31 15:13:06 async_llm_engine.py:61] return self._call_impl(*args, **kwargs)
ERROR 07-31 15:13:06 async_llm_engine.py:61] File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1541, in _call_impl
ERROR 07-31 15:13:06 async_llm_engine.py:61] return forward_call(*args, **kwargs)
ERROR 07-31 15:13:06 async_llm_engine.py:61] File "/usr/local/lib/python3.10/dist-packages/vllm/model_executor/models/qwen2.py", line 153, in forward
ERROR 07-31 15:13:06 async_llm_engine.py:61] attn_output = self.attn(q, k, v, kv_cache, attn_metadata)
ERROR 07-31 15:13:06 async_llm_engine.py:61] File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
ERROR 07-31 15:13:06 async_llm_engine.py:61] return self._call_impl(*args, **kwargs)
ERROR 07-31 15:13:06 async_llm_engine.py:61] File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1541, in _call_impl
ERROR 07-31 15:13:06 async_llm_engine.py:61] return forward_call(*args, **kwargs)
ERROR 07-31 15:13:06 async_llm_engine.py:61] File "/usr/local/lib/python3.10/dist-packages/vllm/attention/layer.py", line 89, in forward
ERROR 07-31 15:13:06 async_llm_engine.py:61] return self.impl.forward(query, key, value, kv_cache, attn_metadata,
ERROR 07-31 15:13:06 async_llm_engine.py:61] File "/usr/local/lib/python3.10/dist-packages/vllm/attention/backends/flash_attn.py", line 339, in forward
ERROR 07-31 15:13:06 async_llm_engine.py:61] output[:num_prefill_tokens] = flash_attn_varlen_func(
ERROR 07-31 15:13:06 async_llm_engine.py:61] File "/usr/local/lib/python3.10/dist-packages/vllm_flash_attn/flash_attn_interface.py", line 1099, in flash_attn_varlen_func
ERROR 07-31 15:13:06 async_llm_engine.py:61] return FlashAttnVarlenFunc.apply(
ERROR 07-31 15:13:06 async_llm_engine.py:61] File "/usr/local/lib/python3.10/dist-packages/torch/autograd/function.py", line 598, in apply
ERROR 07-31 15:13:06 async_llm_engine.py:61] return super().apply(*args, **kwargs) # type: ignore[misc]
ERROR 07-31 15:13:06 async_llm_engine.py:61] File "/usr/local/lib/python3.10/dist-packages/vllm_flash_attn/flash_attn_interface.py", line 596, in forward
ERROR 07-31 15:13:06 async_llm_engine.py:61] out, q, k, v, out_padded, softmax_lse, S_dmask, rng_state = _flash_attn_varlen_forward(
ERROR 07-31 15:13:06 async_llm_engine.py:61] File "/usr/local/lib/python3.10/dist-packages/vllm_flash_attn/flash_attn_interface.py", line 88, in _flash_attn_varlen_forward
ERROR 07-31 15:13:06 async_llm_engine.py:61] out, q, k, v, out_padded, softmax_lse, S_dmask, rng_state = flash_attn_cuda.varlen_fwd(
ERROR 07-31 15:13:06 async_llm_engine.py:61] RuntimeError: CUDA error: an illegal memory access was encountered
ERROR 07-31 15:13:06 async_llm_engine.py:61] Compile with TORCH_USE_CUDA_DSA to enable device-side assertions.
ERROR 07-31 15:13:06 async_llm_engine.py:61]
Exception in callback functools.partial(<function _log_task_completion at 0x7f47bdda4ca0>, error_callback=<bound method AsyncLLMEngine._error_callback of <vllm.engine.async_llm_engine.AsyncLLMEngine object at 0x7f47bb414580>>)
handle: <Handle functools.partial(<function _log_task_completion at 0x7f47bdda4ca0>, error_callback=<bound method AsyncLLMEngine._error_callback of <vllm.engine.async_llm_engine.AsyncLLMEngine object at 0x7f47bb414580>>)>
Traceback (most recent call last):
File "/usr/local/lib/python3.10/dist-packages/vllm/engine/async_llm_engine.py", line 51, in _log_task_completion
return_value = task.result()
File "/usr/local/lib/python3.10/dist-packages/vllm/engine/async_llm_engine.py", line 548, in run_engine_loop
has_requests_in_progress = await asyncio.wait_for(
File "/usr/lib/python3.10/asyncio/tasks.py", line 445, in wait_for
return fut.result()
File "/usr/local/lib/python3.10/dist-packages/vllm/engine/async_llm_engine.py", line 522, in engine_step
request_outputs = await self.engine.step_async()
File "/usr/local/lib/python3.10/dist-packages/vllm/engine/async_llm_engine.py", line 244, in step_async
output = await self.model_executor.execute_model_async(
File "/usr/local/lib/python3.10/dist-packages/vllm/executor/gpu_executor.py", line 117, in execute_model_async
output = await make_async(self.driver_worker.execute_model
File "/usr/lib/python3.10/concurrent/futures/thread.py", line 58, in run
result = self.fn(*self.args, **self.kwargs)
File "/usr/local/lib/python3.10/dist-packages/torch/utils/_contextlib.py", line 115, in decorate_context
return func(*args, **kwargs)
File "/usr/local/lib/python3.10/dist-packages/vllm/worker/worker.py", line 280, in execute_model
output = self.model_runner.execute_model(seq_group_metadata_list,
File "/usr/local/lib/python3.10/dist-packages/torch/utils/_contextlib.py", line 115, in decorate_context
return func(*args, **kwargs)
File "/usr/local/lib/python3.10/dist-packages/vllm/worker/model_runner.py", line 749, in execute_model
hidden_states = model_executable(
File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
INFO 07-31 15:13:06 async_llm_engine.py:176] Aborted request cmpl-9ee39e0e594c4e7c817ce54f27d62a41.
return self._call_impl(*args, **kwargs)
File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1541, in _call_impl
return forward_call(*args, **kwargs)
File "/usr/local/lib/python3.10/dist-packages/vllm/model_executor/models/qwen2.py", line 330, in forward
hidden_states = self.model(input_ids, positions, kv_caches,
File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1541, in _call_impl
return forward_call(*args, **kwargs)
File "/usr/local/lib/python3.10/dist-packages/vllm/model_executor/models/qwen2.py", line 254, in forward
hidden_states, residual = layer(
File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1541, in _call_impl
return forward_call(*args, **kwargs)
File "/usr/local/lib/python3.10/dist-packages/vllm/model_executor/models/qwen2.py", line 206, in forward
hidden_states = self.self_attn(
File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1541, in _call_impl
return forward_call(*args, **kwargs)
INFO 07-31 15:13:06 async_llm_engine.py:176] Aborted request cmpl-8f691facb4ad41d08a2b1816d63b9a37.
File "/usr/local/lib/python3.10/dist-packages/vllm/model_executor/models/qwen2.py", line 153, in forward
attn_output = self.attn(q, k, v, kv_cache, attn_metadata)
File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1541, in _call_impl
return forward_call(*args, **kwargs)
File "/usr/local/lib/python3.10/dist-packages/vllm/attention/layer.py", line 89, in forward
return self.impl.forward(query, key, value, kv_cache, attn_metadata,
File "/usr/local/lib/python3.10/dist-packages/vllm/attention/backends/flash_attn.py", line 339, in forward
output[:num_prefill_tokens] = flash_attn_varlen_func(
File "/usr/local/lib/python3.10/dist-packages/vllm_flash_attn/flash_attn_interface.py", line 1099, in flash_attn_varlen_func
return FlashAttnVarlenFunc.apply(
File "/usr/local/lib/python3.10/dist-packages/torch/autograd/function.py", line 598, in apply
return super().apply(*args, **kwargs) # type: ignore[misc]
File "/usr/local/lib/python3.10/dist-packages/vllm_flash_attn/flash_attn_interface.py", line 596, in forward
out, q, k, v, out_padded, softmax_lse, S_dmask, rng_state = _flash_attn_varlen_forward(
File "/usr/local/lib/python3.10/dist-packages/vllm_flash_attn/flash_attn_interface.py", line 88, in _flash_attn_varlen_forward
out, q, k, v, out_padded, softmax_lse, S_dmask, rng_state = flash_attn_cuda.varlen_fwd(
RuntimeError: CUDA error: an illegal memory access was encountered
INFO 07-31 15:13:06 async_llm_engine.py:176] Aborted request cmpl-b479d70e16ba4daa8bf07a1d3c0bb295.
Compile with TORCH_USE_CUDA_DSA to enable device-side assertions.