diff --git a/CMakeLists.txt b/CMakeLists.txt
index 1a6a311e97633..fc4ac10b7669a 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -49,7 +49,7 @@ set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx940;gfx941;gfx942;gfx1030;gfx11
 # requirements.txt files and should be kept consistent. The ROCm torch
 # versions are derived from Dockerfile.rocm
 #
-set(TORCH_SUPPORTED_VERSION_CUDA "2.5.0")
+set(TORCH_SUPPORTED_VERSION_CUDA "2.4.0")
 set(TORCH_SUPPORTED_VERSION_ROCM "2.5.0")
 
 #
@@ -507,7 +507,7 @@ else()
   FetchContent_Declare(
         vllm-flash-attn
         GIT_REPOSITORY https://github.com/vllm-project/flash-attention.git
-        GIT_TAG 5259c586c403a4e4d8bf69973c159b40cc346fb9
+        GIT_TAG 013f0c4fc47e6574060879d9734c1df8c5c273bd
         GIT_PROGRESS TRUE
         # Don't share the vllm-flash-attn build between build types
         BINARY_DIR ${CMAKE_BINARY_DIR}/vllm-flash-attn
diff --git a/cmake/utils.cmake b/cmake/utils.cmake
index 40430dae10c5b..24bb7299338ac 100644
--- a/cmake/utils.cmake
+++ b/cmake/utils.cmake
@@ -424,7 +424,11 @@ function (define_gpu_extension_target GPU_MOD_NAME)
   # Don't use `TORCH_LIBRARIES` for CUDA since it pulls in a bunch of
   # dependencies that are not necessary and may not be installed.
   if (GPU_LANGUAGE STREQUAL "CUDA")
-    target_link_libraries(${GPU_MOD_NAME} PRIVATE CUDA::cudart CUDA::cuda_driver)
+    if ("${CUDA_CUDA_LIB}" STREQUAL "")
+      set(CUDA_CUDA_LIB "${CUDA_CUDA_LIBRARY}")
+    endif()
+    target_link_libraries(${GPU_MOD_NAME} PRIVATE ${CUDA_CUDA_LIB}
+      ${CUDA_LIBRARIES})
   else()
     target_link_libraries(${GPU_MOD_NAME} PRIVATE ${TORCH_LIBRARIES})
   endif()
diff --git a/pyproject.toml b/pyproject.toml
index e78f5652f486b..e0c56ab79cad0 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -6,7 +6,7 @@ requires = [
     "packaging",
     "setuptools>=61",
     "setuptools-scm>=8.0",
-    "torch == 2.5.0",
+    "torch == 2.4.0",
     "wheel",
     "jinja2",
 ]
diff --git a/requirements-build.txt b/requirements-build.txt
index ea2b688bb3108..6144a56da8c47 100644
--- a/requirements-build.txt
+++ b/requirements-build.txt
@@ -4,6 +4,6 @@ ninja
 packaging
 setuptools>=61
 setuptools-scm>=8
-torch==2.5.0
+torch==2.4.0
 wheel
 jinja2
diff --git a/requirements-cuda.txt b/requirements-cuda.txt
index 92fa303d687a2..3b3c2f876919e 100644
--- a/requirements-cuda.txt
+++ b/requirements-cuda.txt
@@ -4,7 +4,7 @@
 # Dependencies for NVIDIA GPUs
 ray >= 2.9
 nvidia-ml-py # for pynvml package
-torch == 2.5.0
+torch == 2.4.0
 # These must be updated alongside torch
-torchvision == 0.20  # Required for phi3v processor. See https://github.com/pytorch/vision?tab=readme-ov-file#installation for corresponding version
-xformers == 0.0.28.post2; platform_system == 'Linux' and platform_machine == 'x86_64'  # Requires PyTorch 2.5.0
+torchvision == 0.19  # Required for phi3v processor. See https://github.com/pytorch/vision?tab=readme-ov-file#installation for corresponding version
+xformers == 0.0.27.post2; platform_system == 'Linux' and platform_machine == 'x86_64'  # Requires PyTorch 2.4.0
diff --git a/requirements-openvino.txt b/requirements-openvino.txt
index 7ad0d1e7f704b..ac54cf0c3288f 100644
--- a/requirements-openvino.txt
+++ b/requirements-openvino.txt
@@ -1,7 +1,7 @@
 # Common dependencies
 -r requirements-common.txt
 
-torch == 2.5.0  # should be aligned with "common" vLLM torch version
+torch == 2.4.0  # should be aligned with "common" vLLM torch version
 openvino >= 2024.4.0  # since 2024.4.0 both CPU and GPU support Paged Attention
 
 optimum @ git+https://github.com/huggingface/optimum.git@main  # latest optimum is used to support latest transformers version
diff --git a/tests/models/decoder_only/language/test_big_models.py b/tests/models/decoder_only/language/test_big_models.py
index fcfc159e4f5a0..75625b35209ce 100644
--- a/tests/models/decoder_only/language/test_big_models.py
+++ b/tests/models/decoder_only/language/test_big_models.py
@@ -8,7 +8,7 @@
 
 from vllm.platforms import current_platform
 
-from ...utils import check_logprobs_close, check_outputs_equal
+from ...utils import check_outputs_equal
 
 MODELS = [
     "meta-llama/Llama-2-7b-hf",
@@ -43,40 +43,18 @@ def test_models(
     dtype: str,
     max_tokens: int,
 ) -> None:
+    with hf_runner(model, dtype=dtype) as hf_model:
+        hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
 
-    if model == "openbmb/MiniCPM3-4B":
-        # the output becomes slightly different when upgrading to
-        # pytorch 2.5 . Changing to logprobs checks instead of exact
-        # output checks.
-        NUM_LOG_PROBS = 8
-        with hf_runner(model, dtype=dtype) as hf_model:
-            hf_outputs = hf_model.generate_greedy_logprobs_limit(
-                example_prompts, max_tokens, NUM_LOG_PROBS)
-
-        with vllm_runner(model, dtype=dtype, enforce_eager=True) as vllm_model:
-            vllm_outputs = vllm_model.generate_greedy_logprobs(
-                example_prompts, max_tokens, NUM_LOG_PROBS)
-
-        check_logprobs_close(
-            outputs_0_lst=hf_outputs,
-            outputs_1_lst=vllm_outputs,
-            name_0="hf",
-            name_1="vllm",
-        )
-    else:
-        with hf_runner(model, dtype=dtype) as hf_model:
-            hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
-
-        with vllm_runner(model, dtype=dtype, enforce_eager=True) as vllm_model:
-            vllm_outputs = vllm_model.generate_greedy(example_prompts,
-                                                      max_tokens)
-
-        check_outputs_equal(
-            outputs_0_lst=hf_outputs,
-            outputs_1_lst=vllm_outputs,
-            name_0="hf",
-            name_1="vllm",
-        )
+    with vllm_runner(model, dtype=dtype, enforce_eager=True) as vllm_model:
+        vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
+
+    check_outputs_equal(
+        outputs_0_lst=hf_outputs,
+        outputs_1_lst=vllm_outputs,
+        name_0="hf",
+        name_1="vllm",
+    )
 
 
 @pytest.mark.parametrize("model", MODELS)
diff --git a/vllm/platforms/cuda.py b/vllm/platforms/cuda.py
index 9c5212ace1346..30bbf5107475d 100644
--- a/vllm/platforms/cuda.py
+++ b/vllm/platforms/cuda.py
@@ -7,7 +7,6 @@
 from typing import Callable, List, Tuple, TypeVar
 
 import pynvml
-import torch
 from typing_extensions import ParamSpec
 
 from vllm.logger import init_logger
@@ -27,10 +26,6 @@
         " and cause errors. See https://pypi.org/project/pynvml "
         "for more information.")
 
-# pytorch 2.5 uses cudnn sdpa by default, which will cause crash on some models
-# see https://github.com/huggingface/diffusers/issues/9704 for details
-torch.backends.cuda.enable_cudnn_sdp(False)
-
 # NVML utils
 # Note that NVML is not affected by `CUDA_VISIBLE_DEVICES`,
 # all the related functions work on real physical device ids.