Your current environment
Running in Kubernetes on H100 in vllm/vllm-openai:v0.4.0
🐛 Describe the bug
There seem to have been some dependency issues since v0.2.7: the server now crashes on startup with AssertionError: libcuda.so cannot found!, raised from Triton while it compiles the fused MoE kernel (full traceback below). We would love to use flash attention once this is fixed. Thank you!
INFO 04-02 23:55:08 api_server.py:148] vLLM API server version 0.4.0
INFO 04-02 23:55:08 api_server.py:149] args: Namespace(host=None, port=8000, uvicorn_log_level='info', allow_credentials=True, allowed_origins=['*'], allowed_methods=['*'], allowed_headers=['*'], api_key=None, served_model_name=None, lora_modules=None, chat_template=None, response_role='assistant', ssl_keyfile=None, ssl_certfile=None, ssl_ca_certs=None, ssl_cert_reqs=0, root_path=None, middleware=[], model='mistralai/Mixtral-8x7B-v0.1', tokenizer=None, revision=None, code_revision=None, tokenizer_revision=None, tokenizer_mode='auto', trust_remote_code=False, download_dir=None, load_format='auto', dtype='auto', kv_cache_dtype='auto', max_model_len=None, worker_use_ray=False, pipeline_parallel_size=1, tensor_parallel_size=2, max_parallel_loading_workers=None, ray_workers_use_nsight=False, block_size=16, enable_prefix_caching=False, use_v2_block_manager=False, seed=0, swap_space=4, gpu_memory_utilization=0.9, forced_num_gpu_blocks=None, max_num_batched_tokens=None, max_num_seqs=256, max_logprobs=5, disable_log_stats=False, quantization=None, enforce_eager=False, max_context_len_to_capture=8192, disable_custom_all_reduce=False, tokenizer_pool_size=0, tokenizer_pool_type='ray', tokenizer_pool_extra_config=None, enable_lora=False, max_loras=1, max_lora_rank=16, lora_extra_vocab_size=256, lora_dtype='auto', max_cpu_loras=None, device='auto', image_input_type=None, image_token_id=None, image_input_shape=None, image_feature_size=None, scheduler_delay_factor=0.0, enable_chunked_prefill=False, engine_use_ray=False, disable_log_requests=False, max_log_len=None)
2024-04-02 23:55:10,082 INFO worker.py:1752 -- Started a local Ray instance.
INFO 04-02 23:55:10 llm_engine.py:75] Initializing an LLM engine (v0.4.0) with config: model='mistralai/Mixtral-8x7B-v0.1', tokenizer='mistralai/Mixtral-8x7B-v0.1', tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=32768, download_dir=None, load_format=auto, tensor_parallel_size=2, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto, device_config=cuda, seed=0)
INFO 04-02 23:55:16 selector.py:16] Using FlashAttention backend.
(RayWorkerVllm pid=2375) INFO 04-02 23:55:16 selector.py:16] Using FlashAttention backend.
INFO 04-02 23:55:18 pynccl_utils.py:45] vLLM is using nccl==2.18.1
(RayWorkerVllm pid=2375) INFO 04-02 23:55:18 pynccl_utils.py:45] vLLM is using nccl==2.18.1
INFO 04-02 23:55:28 weight_utils.py:177] Using model weights format ['*.safetensors']
(RayWorkerVllm pid=2375) INFO 04-02 23:55:29 weight_utils.py:177] Using model weights format ['*.safetensors']
INFO 04-02 23:55:39 model_runner.py:104] Loading model weights took 43.5064 GB
(RayWorkerVllm pid=2375) INFO 04-02 23:55:45 model_runner.py:104] Loading model weights took 43.5064 GB
INFO 04-02 23:55:50 fused_moe.py:272] Using configuration from /workspace/vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json for MoE layer.
Traceback (most recent call last):
File "/usr/lib/python3.10/runpy.py", line 196, in _run_module_as_main
return _run_code(code, main_globals, None,
File "/usr/lib/python3.10/runpy.py", line 86, in _run_code
exec(code, run_globals)
File "/workspace/vllm/entrypoints/openai/api_server.py", line 156, in <module>
engine = AsyncLLMEngine.from_engine_args(
File "/workspace/vllm/engine/async_llm_engine.py", line 348, in from_engine_args
engine = cls(
File "/workspace/vllm/engine/async_llm_engine.py", line 311, in __init__
self.engine = self._init_engine(*args, **kwargs)
File "/workspace/vllm/engine/async_llm_engine.py", line 422, in _init_engine
return engine_class(*args, **kwargs)
File "/workspace/vllm/engine/llm_engine.py", line 111, in __init__
self.model_executor = executor_class(model_config, cache_config,
File "/workspace/vllm/executor/ray_gpu_executor.py", line 65, in __init__
self._init_cache()
File "/workspace/vllm/executor/ray_gpu_executor.py", line 220, in _init_cache
num_blocks = self._run_workers(
File "/workspace/vllm/executor/ray_gpu_executor.py", line 324, in _run_workers
driver_worker_output = getattr(self.driver_worker,
File "/usr/local/lib/python3.10/dist-packages/torch/utils/_contextlib.py", line 115, in decorate_context
return func(*args, **kwargs)
File "/workspace/vllm/worker/worker.py", line 131, in profile_num_available_blocks
self.model_runner.profile_run()
File "/usr/local/lib/python3.10/dist-packages/torch/utils/_contextlib.py", line 115, in decorate_context
return func(*args, **kwargs)
File "/workspace/vllm/worker/model_runner.py", line 742, in profile_run
self.execute_model(seqs, kv_caches)
File "/usr/local/lib/python3.10/dist-packages/torch/utils/_contextlib.py", line 115, in decorate_context
return func(*args, **kwargs)
File "/workspace/vllm/worker/model_runner.py", line 663, in execute_model
hidden_states = model_executable(**execute_model_kwargs)
File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1518, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1527, in _call_impl
return forward_call(*args, **kwargs)
File "/workspace/vllm/model_executor/models/mixtral.py", line 379, in forward
hidden_states = self.model(input_ids, positions, kv_caches,
File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1518, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1527, in _call_impl
return forward_call(*args, **kwargs)
File "/workspace/vllm/model_executor/models/mixtral.py", line 315, in forward
hidden_states, residual = layer(positions, hidden_states,
File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1518, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1527, in _call_impl
return forward_call(*args, **kwargs)
File "/workspace/vllm/model_executor/models/mixtral.py", line 274, in forward
hidden_states = self.block_sparse_moe(hidden_states)
File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1518, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1527, in _call_impl
return forward_call(*args, **kwargs)
File "/workspace/vllm/model_executor/models/mixtral.py", line 128, in forward
final_hidden_states = fused_moe(hidden_states,
File "/workspace/vllm/model_executor/layers/fused_moe/fused_moe.py", line 397, in fused_moe
invoke_fused_moe_kernel(hidden_states, w1, intermediate_cache1,
File "/workspace/vllm/model_executor/layers/fused_moe/fused_moe.py", line 222, in invoke_fused_moe_kernel
fused_moe_kernel[grid](
File "<string>", line 63, in fused_moe_kernel
File "/usr/local/lib/python3.10/dist-packages/triton/compiler/compiler.py", line 425, in compile
so_path = make_stub(name, signature, constants)
File "/usr/local/lib/python3.10/dist-packages/triton/compiler/make_launcher.py", line 39, in make_stub
so = _build(name, src_path, tmpdir)
File "/usr/local/lib/python3.10/dist-packages/triton/common/build.py", line 61, in _build
cuda_lib_dirs = libcuda_dirs()
File "/usr/local/lib/python3.10/dist-packages/triton/common/build.py", line 30, in libcuda_dirs
assert any(os.path.exists(os.path.join(path, 'libcuda.so')) for path in dirs), msg
AssertionError: libcuda.so cannot found!
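
For anyone hitting the same assertion: the check that fails is the one visible on the last line of the traceback (triton/common/build.py asking whether any candidate directory contains an unversioned libcuda.so). Below is a minimal diagnostic sketch, not vLLM or Triton code, that mirrors that check inside the pod; the /sbin/ldconfig call and the line parsing are assumptions based on how Triton 2.x resolves the library and may differ between Triton versions.

# Diagnostic sketch (assumption: Triton resolves libcuda via "ldconfig -p" and then
# asserts that one of those directories also contains an unversioned "libcuda.so",
# which is the assert shown at the bottom of the traceback above).
import os
import subprocess

out = subprocess.run(["/sbin/ldconfig", "-p"], capture_output=True, text=True).stdout
# ldconfig -p lines look like:
#   libcuda.so.1 (libc6,x86-64) => /usr/lib/x86_64-linux-gnu/libcuda.so.1
locs = [line.split()[-1] for line in out.splitlines() if "libcuda.so" in line]
dirs = sorted({os.path.dirname(loc) for loc in locs})

print("directories the linker knows about for libcuda:", dirs or "none")
for d in dirs:
    has_unversioned = os.path.exists(os.path.join(d, "libcuda.so"))
    print(f"{d}: unversioned libcuda.so present -> {has_unversioned}")

If the sketch shows only libcuda.so.1 and no unversioned libcuda.so, a commonly reported workaround in containerized deployments is to create the symlink next to it (for example, ln -s libcuda.so.1 libcuda.so in that directory, path shown here only as an example) or to re-run ldconfig after the driver libraries are mounted. Treat this as a workaround sketch rather than a confirmed fix for this issue.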