revert githash commit

neuralmagic · Jun 21, 2024 · a512d63 · a512d63 · github-actions · Jun 21, 2024
1 parent 39e484e
commit a512d63
Show file tree

Hide file tree

Showing 6 changed files with 3 additions and 46 deletions.
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -8,7 +8,6 @@ message(STATUS "Build type: ${CMAKE_BUILD_TYPE}")
 message(STATUS "Target device: ${VLLM_TARGET_DEVICE}")
 
 include(${CMAKE_CURRENT_LIST_DIR}/cmake/utils.cmake)
-include(${CMAKE_CURRENT_LIST_DIR}/cmake/dep.cmake)
 
 #
 # Supported python versions.  These versions will be searched in order, the
@@ -207,8 +206,7 @@ define_gpu_extension_target(
   ARCHITECTURES ${VLLM_GPU_ARCHES}
   INCLUDE_DIRECTORIES ${CUTLASS_INCLUDE_DIR};${CUTLASS_TOOLS_UTIL_INCLUDE_DIR}
   USE_SABI 3
-  WITH_SOABI
-  LIBRARIES cmake_git_version_tracking)
+  WITH_SOABI)
 
 #
 # _moe_C extension
@@ -226,8 +224,7 @@ define_gpu_extension_target(
   COMPILE_FLAGS ${VLLM_GPU_FLAGS}
   ARCHITECTURES ${VLLM_GPU_ARCHES}
   USE_SABI 3
-  WITH_SOABI
-  LIBRARIES cmake_git_version_tracking)
+  WITH_SOABI)
 
 #
 # _punica_C extension
@@ -279,8 +276,7 @@ if (VLLM_PUNICA_GPU_ARCHES)
     COMPILE_FLAGS ${VLLM_PUNICA_GPU_FLAGS}
     ARCHITECTURES ${VLLM_PUNICA_GPU_ARCHES}
     USE_SABI 3
-    WITH_SOABI
-    LIBRARIES cmake_git_version_tracking)
+    WITH_SOABI)
 else()
   message(WARNING "Unable to create _punica_C target because none of the "
     "requested architectures (${VLLM_GPU_ARCHES}) are supported, i.e. >= 8.0")

diff --git a/cmake/dep.cmake b/cmake/dep.cmake
diff --git a/collect_env.py b/collect_env.py
@@ -15,16 +15,13 @@
 try:
     import torch
     TORCH_AVAILABLE = True
-    installed_path = os.path.dirname(torch.__file__)
-    sys.path.insert(0, os.path.dirname(installed_path))
 except (ImportError, NameError, AttributeError, OSError):
     TORCH_AVAILABLE = False
 
 # System Environment Information
 SystemEnv = namedtuple(
     'SystemEnv',
     [
-        'vllm_git_hash',
         'torch_version',
         'is_debug_build',
         'cuda_compiled_version',
@@ -143,14 +140,6 @@ def get_conda_packages(run_lambda, patterns=None):
                                                          for name in patterns))
 
 
-def get_vllm_git_hash():
-    try:
-        import vllm
-        return vllm.githash()
-    except ImportError:
-        return None
-
-
 def get_gcc_version(run_lambda):
     return run_and_parse_first_match(run_lambda, 'gcc --version', r'gcc (.*)')
 
@@ -549,7 +538,6 @@ def get_version_or_na(cfg, prefix):
     gpu_topo = get_gpu_topo(run_lambda)
 
     return SystemEnv(
-        vllm_git_hash=get_vllm_git_hash(),
         torch_version=version_str,
         is_debug_build=debug_mode_str,
         python_version='{} ({}-bit runtime)'.format(
@@ -626,7 +614,6 @@ def get_version_or_na(cfg, prefix):
 ROCM Version: {rocm_version}
 Neuron SDK Version: {neuron_sdk_version}
 vLLM Version: {vllm_version}
-vLLM Git Hash: {vllm_git_hash}
 vLLM Build Flags:
 {vllm_build_flags}
 GPU Topology:

diff --git a/csrc/cpu/torch_bindings.cpp b/csrc/cpu/torch_bindings.cpp
@@ -2,18 +2,11 @@
 #include "ops.h"
 #include "registration.h"
 
-#include <git.h>
 #include <torch/library.h>
 
-std::string githash() { return std::string{git::CommitSHA1()}; }
-
 TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
   // vLLM custom ops
 
-  // Show vllm git hash
-  ops.def("githash", &githash);
-  ops.impl("githash", torch::kCPU, &githash);
-
   // Attention ops
   // Compute the attention between an input query and the cached keys/values
   // using PagedAttention.

diff --git a/csrc/torch_bindings.cpp b/csrc/torch_bindings.cpp
@@ -3,11 +3,8 @@
 #include "ops.h"
 #include "registration.h"
 
-#include <git.h>
 #include <torch/library.h>
 
-std::string githash() { return std::string{git::CommitSHA1()}; }
-
 // Note on op signatures:
 // The X_meta signatures are for the meta functions corresponding to op X.
 // They must be kept in sync with the signature for X. Generally, only
@@ -21,10 +18,6 @@ std::string githash() { return std::string{git::CommitSHA1()}; }
 TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
   // vLLM custom ops
 
-  // Show vllm git hash
-  ops.def("githash", &githash);
-  ops.impl("githash", torch::kCUDA, &githash);
-
   // Attention ops
   // Compute the attention between an input query and the cached
   // keys/values using PagedAttention.

diff --git a/vllm/__init__.py b/vllm/__init__.py
@@ -15,14 +15,8 @@
 from .version import __version__
 
 
-def githash():
-    import torch
-    return torch.ops._C.githash()
-
-
 __all__ = [
     "__version__",
-    "githash",
     "LLM",
     "ModelRegistry",
     "PromptStrictInputs",
Benchmark suite	Current: `a512d63`	Previous: `abc0ceb`	Ratio
`{"name": "request_throughput", "description": "VLLM Engine throughput - synthetic\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 256,\n \"output-len\": 128,\n \"num-prompts\": 1000\n}", "gpu_description": "NVIDIA L4 x 1", "vllm_version": "0.5.0", "python_version": "3.8.17 (default, Jun 7 2023, 12:29:56) \n[GCC 11.3.0]", "torch_version": "2.3.0+cu121"}`	`2.4976551730438463` prompts/s
`{"name": "token_throughput", "description": "VLLM Engine throughput - synthetic\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 256,\n \"output-len\": 128,\n \"num-prompts\": 1000\n}", "gpu_description": "NVIDIA L4 x 1", "vllm_version": "0.5.0", "python_version": "3.8.17 (default, Jun 7 2023, 12:29:56) \n[GCC 11.3.0]", "torch_version": "2.3.0+cu121"}`	`959.099586448837` tokens/s
Benchmark suite	Current: `a512d63`	Previous: `abc0ceb`	Ratio
`{"name": "request_throughput", "description": "VLLM Engine throughput - synthetic\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 256,\n \"output-len\": 128,\n \"num-prompts\": 1000\n}", "gpu_description": "NVIDIA L4 x 1", "vllm_version": "0.5.0", "python_version": "3.10.12 (main, Jun 7 2023, 13:43:11) [GCC 11.3.0]", "torch_version": "2.3.0+cu121"}`	`2.4839670949913555` prompts/s
`{"name": "token_throughput", "description": "VLLM Engine throughput - synthetic\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 256,\n \"output-len\": 128,\n \"num-prompts\": 1000\n}", "gpu_description": "NVIDIA L4 x 1", "vllm_version": "0.5.0", "python_version": "3.10.12 (main, Jun 7 2023, 13:43:11) [GCC 11.3.0]", "torch_version": "2.3.0+cu121"}`	`953.8433644766804` tokens/s
Benchmark suite	Current: `a512d63`	Previous: `abc0ceb`	Ratio
`{"name": "request_throughput", "description": "VLLM Engine throughput - synthetic\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 256,\n \"output-len\": 128,\n \"num-prompts\": 1000\n}", "gpu_description": "NVIDIA L4 x 1", "vllm_version": "0.5.0", "python_version": "3.11.4 (main, Jun 7 2023, 11:01:02) [GCC 11.3.0]", "torch_version": "2.3.0+cu121"}`	`2.5347108274896324` prompts/s
`{"name": "token_throughput", "description": "VLLM Engine throughput - synthetic\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 256,\n \"output-len\": 128,\n \"num-prompts\": 1000\n}", "gpu_description": "NVIDIA L4 x 1", "vllm_version": "0.5.0", "python_version": "3.11.4 (main, Jun 7 2023, 11:01:02) [GCC 11.3.0]", "torch_version": "2.3.0+cu121"}`	`973.3289577560188` tokens/s
Benchmark suite	Current: `a512d63`	Previous: `abc0ceb`	Ratio
`{"name": "request_throughput", "description": "VLLM Engine throughput - synthetic\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 256,\n \"output-len\": 128,\n \"num-prompts\": 1000\n}", "gpu_description": "NVIDIA L4 x 1", "vllm_version": "0.5.0", "python_version": "3.9.17 (main, Jun 7 2023, 12:34:12) \n[GCC 11.3.0]", "torch_version": "2.3.0+cu121"}`	`2.496178631285675` prompts/s	`2.4860718116442153` prompts/s	`1.00`
`{"name": "token_throughput", "description": "VLLM Engine throughput - synthetic\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 256,\n \"output-len\": 128,\n \"num-prompts\": 1000\n}", "gpu_description": "NVIDIA L4 x 1", "vllm_version": "0.5.0", "python_version": "3.9.17 (main, Jun 7 2023, 12:34:12) \n[GCC 11.3.0]", "torch_version": "2.3.0+cu121"}`	`958.5325944136991` tokens/s	`954.6515756713787` tokens/s	`1.00`