From 994e287bbc760862d5f1c3258a57b5b74a166c6e Mon Sep 17 00:00:00 2001
From: mzusman
Date: Tue, 29 Oct 2024 15:18:37 +0200
Subject: [PATCH] Revert "[Misc] Upgrade to pytorch 2.5 (#9588)"

This reverts commit 3cb07a36a20f9af11346650559470d685e9dc711.
---
 CMakeLists.txt                                |  4 +-
 cmake/utils.cmake                             |  6 ++-
 pyproject.toml                                |  2 +-
 requirements-build.txt                        |  2 +-
 requirements-cuda.txt                         |  6 +--
 requirements-openvino.txt                     |  2 +-
 .../decoder_only/language/test_big_models.py  | 46 +++++--------
 vllm/platforms/cuda.py                        |  5 --
 8 files changed, 25 insertions(+), 48 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 1a6a311e97633..fc4ac10b7669a 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -49,7 +49,7 @@ set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx940;gfx941;gfx942;gfx1030;gfx11
 # requirements.txt files and should be kept consistent.  The ROCm torch
 # versions are derived from Dockerfile.rocm
 #
-set(TORCH_SUPPORTED_VERSION_CUDA "2.5.0")
+set(TORCH_SUPPORTED_VERSION_CUDA "2.4.0")
 set(TORCH_SUPPORTED_VERSION_ROCM "2.5.0")
 
 #
@@ -507,7 +507,7 @@ else()
   FetchContent_Declare(
         vllm-flash-attn
         GIT_REPOSITORY https://github.com/vllm-project/flash-attention.git
-        GIT_TAG 5259c586c403a4e4d8bf69973c159b40cc346fb9
+        GIT_TAG 013f0c4fc47e6574060879d9734c1df8c5c273bd
         GIT_PROGRESS TRUE
         # Don't share the vllm-flash-attn build between build types
         BINARY_DIR ${CMAKE_BINARY_DIR}/vllm-flash-attn
diff --git a/cmake/utils.cmake b/cmake/utils.cmake
index 40430dae10c5b..24bb7299338ac 100644
--- a/cmake/utils.cmake
+++ b/cmake/utils.cmake
@@ -424,7 +424,11 @@ function (define_gpu_extension_target GPU_MOD_NAME)
   # Don't use `TORCH_LIBRARIES` for CUDA since it pulls in a bunch of
   # dependencies that are not necessary and may not be installed.
   if (GPU_LANGUAGE STREQUAL "CUDA")
-    target_link_libraries(${GPU_MOD_NAME} PRIVATE CUDA::cudart CUDA::cuda_driver)
+    if ("${CUDA_CUDA_LIB}" STREQUAL "")
+      set(CUDA_CUDA_LIB "${CUDA_CUDA_LIBRARY}")
+    endif()
+    target_link_libraries(${GPU_MOD_NAME} PRIVATE ${CUDA_CUDA_LIB}
+      ${CUDA_LIBRARIES})
   else()
     target_link_libraries(${GPU_MOD_NAME} PRIVATE ${TORCH_LIBRARIES})
   endif()
diff --git a/pyproject.toml b/pyproject.toml
index e78f5652f486b..e0c56ab79cad0 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -6,7 +6,7 @@ requires = [
     "packaging",
     "setuptools>=61",
     "setuptools-scm>=8.0",
-    "torch == 2.5.0",
+    "torch == 2.4.0",
     "wheel",
     "jinja2",
 ]
diff --git a/requirements-build.txt b/requirements-build.txt
index ea2b688bb3108..6144a56da8c47 100644
--- a/requirements-build.txt
+++ b/requirements-build.txt
@@ -4,6 +4,6 @@ ninja
 packaging
 setuptools>=61
 setuptools-scm>=8
-torch==2.5.0
+torch==2.4.0
 wheel
 jinja2
diff --git a/requirements-cuda.txt b/requirements-cuda.txt
index 92fa303d687a2..3b3c2f876919e 100644
--- a/requirements-cuda.txt
+++ b/requirements-cuda.txt
@@ -4,7 +4,7 @@
 # Dependencies for NVIDIA GPUs
 ray >= 2.9
 nvidia-ml-py # for pynvml package
-torch == 2.5.0
+torch == 2.4.0
 # These must be updated alongside torch
-torchvision == 0.20 # Required for phi3v processor. See https://github.com/pytorch/vision?tab=readme-ov-file#installation for corresponding version
-xformers == 0.0.28.post2; platform_system == 'Linux' and platform_machine == 'x86_64' # Requires PyTorch 2.5.0
+torchvision == 0.19 # Required for phi3v processor. See https://github.com/pytorch/vision?tab=readme-ov-file#installation for corresponding version
+xformers == 0.0.27.post2; platform_system == 'Linux' and platform_machine == 'x86_64' # Requires PyTorch 2.4.0
diff --git a/requirements-openvino.txt b/requirements-openvino.txt
index 7ad0d1e7f704b..ac54cf0c3288f 100644
--- a/requirements-openvino.txt
+++ b/requirements-openvino.txt
@@ -1,7 +1,7 @@
 # Common dependencies
 -r requirements-common.txt
 
-torch == 2.5.0 # should be aligned with "common" vLLM torch version
+torch == 2.4.0 # should be aligned with "common" vLLM torch version
 openvino >= 2024.4.0 # since 2024.4.0 both CPU and GPU support Paged Attention
 
 optimum @ git+https://github.com/huggingface/optimum.git@main # latest optimum is used to support latest transformers version
diff --git a/tests/models/decoder_only/language/test_big_models.py b/tests/models/decoder_only/language/test_big_models.py
index fcfc159e4f5a0..75625b35209ce 100644
--- a/tests/models/decoder_only/language/test_big_models.py
+++ b/tests/models/decoder_only/language/test_big_models.py
@@ -8,7 +8,7 @@
 
 from vllm.platforms import current_platform
 
-from ...utils import check_logprobs_close, check_outputs_equal
+from ...utils import check_outputs_equal
 
 MODELS = [
     "meta-llama/Llama-2-7b-hf",
@@ -43,40 +43,18 @@ def test_models(
     dtype: str,
     max_tokens: int,
 ) -> None:
+    with hf_runner(model, dtype=dtype) as hf_model:
+        hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
 
-    if model == "openbmb/MiniCPM3-4B":
-        # the output becomes slightly different when upgrading to
-        # pytorch 2.5 . Changing to logprobs checks instead of exact
-        # output checks.
-        NUM_LOG_PROBS = 8
-        with hf_runner(model, dtype=dtype) as hf_model:
-            hf_outputs = hf_model.generate_greedy_logprobs_limit(
-                example_prompts, max_tokens, NUM_LOG_PROBS)
-
-        with vllm_runner(model, dtype=dtype, enforce_eager=True) as vllm_model:
-            vllm_outputs = vllm_model.generate_greedy_logprobs(
-                example_prompts, max_tokens, NUM_LOG_PROBS)
-
-        check_logprobs_close(
-            outputs_0_lst=hf_outputs,
-            outputs_1_lst=vllm_outputs,
-            name_0="hf",
-            name_1="vllm",
-        )
-    else:
-        with hf_runner(model, dtype=dtype) as hf_model:
-            hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
-
-        with vllm_runner(model, dtype=dtype, enforce_eager=True) as vllm_model:
-            vllm_outputs = vllm_model.generate_greedy(example_prompts,
-                                                      max_tokens)
-
-        check_outputs_equal(
-            outputs_0_lst=hf_outputs,
-            outputs_1_lst=vllm_outputs,
-            name_0="hf",
-            name_1="vllm",
-        )
+    with vllm_runner(model, dtype=dtype, enforce_eager=True) as vllm_model:
+        vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
+
+    check_outputs_equal(
+        outputs_0_lst=hf_outputs,
+        outputs_1_lst=vllm_outputs,
+        name_0="hf",
+        name_1="vllm",
+    )
 
 
 @pytest.mark.parametrize("model", MODELS)
diff --git a/vllm/platforms/cuda.py b/vllm/platforms/cuda.py
index 9c5212ace1346..30bbf5107475d 100644
--- a/vllm/platforms/cuda.py
+++ b/vllm/platforms/cuda.py
@@ -7,7 +7,6 @@
 from typing import Callable, List, Tuple, TypeVar
 
 import pynvml
-import torch
 from typing_extensions import ParamSpec
 
 from vllm.logger import init_logger
@@ -27,10 +26,6 @@
                " and cause errors. See https://pypi.org/project/pynvml "
                "for more information.")
 
-# pytorch 2.5 uses cudnn sdpa by default, which will cause crash on some models
-# see https://github.com/huggingface/diffusers/issues/9704 for details
-torch.backends.cuda.enable_cudnn_sdp(False)
-
 # NVML utils
 # Note that NVML is not affected by `CUDA_VISIBLE_DEVICES`,
 # all the related functions work on real physical device ids.