
Commit: Add githash to nm-vllm (#299)
Add git hash information to nm-vllm:

```
>>> import vllm
>>> vllm.githash()
'106796861914146372aba9386aeff9361edfb34d'
```

---------

Co-authored-by: dhuangnm <dhuang@MacBook-Pro-2.local>
dhuangnm and dhuangnm authored Jun 19, 2024
1 parent 8ad8c8a commit d8da97b
Showing 6 changed files with 46 additions and 3 deletions.
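
For context before the per-file diffs, a hedged sketch of how the new vllm.githash() helper might be used to record build provenance. The run_metadata dict and its keys are illustrative only, not part of this commit, and the snippet assumes an nm-vllm build that already includes this change:

```
import vllm

# Capture the exact nm-vllm source revision alongside other run metadata so
# results can be traced back to the build that produced them.
run_metadata = {
    "vllm_version": vllm.__version__,   # "0.5.0" at the time of this commit
    "vllm_git_hash": vllm.githash(),    # full 40-character commit SHA
}
print(run_metadata)
```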
10 changes: 7 additions & 3 deletions CMakeLists.txt
@@ -8,6 +8,7 @@ message(STATUS "Build type: ${CMAKE_BUILD_TYPE}")
message(STATUS "Target device: ${VLLM_TARGET_DEVICE}")

include(${CMAKE_CURRENT_LIST_DIR}/cmake/utils.cmake)
include(${CMAKE_CURRENT_LIST_DIR}/cmake/dep.cmake)

#
# Supported python versions. These versions will be searched in order, the
@@ -206,7 +207,8 @@ define_gpu_extension_target(
ARCHITECTURES ${VLLM_GPU_ARCHES}
INCLUDE_DIRECTORIES ${CUTLASS_INCLUDE_DIR};${CUTLASS_TOOLS_UTIL_INCLUDE_DIR}
USE_SABI 3
-WITH_SOABI)
+WITH_SOABI
+LIBRARIES cmake_git_version_tracking)

#
# _moe_C extension
@@ -224,7 +226,8 @@ define_gpu_extension_target(
COMPILE_FLAGS ${VLLM_GPU_FLAGS}
ARCHITECTURES ${VLLM_GPU_ARCHES}
USE_SABI 3
-WITH_SOABI)
+WITH_SOABI
+LIBRARIES cmake_git_version_tracking)

#
# _punica_C extension
@@ -276,7 +279,8 @@ if (VLLM_PUNICA_GPU_ARCHES)
COMPILE_FLAGS ${VLLM_PUNICA_GPU_FLAGS}
ARCHITECTURES ${VLLM_PUNICA_GPU_ARCHES}
USE_SABI 3
-WITH_SOABI)
+WITH_SOABI
+LIBRARIES cmake_git_version_tracking)
else()
message(WARNING "Unable to create _punica_C target because none of the "
"requested architectures (${VLLM_GPU_ARCHES}) are supported, i.e. >= 8.0")
6 changes: 6 additions & 0 deletions cmake/dep.cmake
@@ -0,0 +1,6 @@
include(FetchContent)
FetchContent_Declare(cmake_git_version_tracking
GIT_REPOSITORY https://github.com/andrew-hardin/cmake-git-version-tracking.git
GIT_TAG 6c0cb87edd029ddfb403a8e24577c144a03605a6
)
FetchContent_MakeAvailable(cmake_git_version_tracking)
13 changes: 13 additions & 0 deletions collect_env.py
@@ -15,13 +15,16 @@
try:
    import torch
    TORCH_AVAILABLE = True
    installed_path = os.path.dirname(torch.__file__)
    sys.path.insert(0, os.path.dirname(installed_path))
except (ImportError, NameError, AttributeError, OSError):
    TORCH_AVAILABLE = False

# System Environment Information
SystemEnv = namedtuple(
    'SystemEnv',
    [
        'vllm_git_hash',
        'torch_version',
        'is_debug_build',
        'cuda_compiled_version',
@@ -140,6 +143,14 @@ def get_conda_packages(run_lambda, patterns=None):
for name in patterns))


def get_vllm_git_hash():
    try:
        import vllm
        return vllm.githash()
    except ImportError:
        return None


def get_gcc_version(run_lambda):
    return run_and_parse_first_match(run_lambda, 'gcc --version', r'gcc (.*)')

@@ -538,6 +549,7 @@ def get_version_or_na(cfg, prefix):
    gpu_topo = get_gpu_topo(run_lambda)

    return SystemEnv(
        vllm_git_hash=get_vllm_git_hash(),
        torch_version=version_str,
        is_debug_build=debug_mode_str,
        python_version='{} ({}-bit runtime)'.format(
@@ -614,6 +626,7 @@ def get_version_or_na(cfg, prefix):
ROCM Version: {rocm_version}
Neuron SDK Version: {neuron_sdk_version}
vLLM Version: {vllm_version}
vLLM Git Hash: {vllm_git_hash}
vLLM Build Flags:
{vllm_build_flags}
GPU Topology:
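
With this change the environment report gains a "vLLM Git Hash" line. A minimal sketch of calling the new helper directly, grounded in the function added above; it returns None when vllm cannot be imported:

```
from collect_env import get_vllm_git_hash

# Prints the nm-vllm commit SHA, or None if the vllm package is not importable.
print(get_vllm_git_hash())
```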
7 changes: 7 additions & 0 deletions csrc/cpu/torch_bindings.cpp
@@ -2,11 +2,18 @@
#include "ops.h"
#include "registration.h"

#include <git.h>
#include <torch/library.h>

std::string githash() { return std::string{git::CommitSHA1()}; }

TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
  // vLLM custom ops

  // Show vllm git hash
  ops.def("githash", &githash);
  ops.impl("githash", torch::kCPU, &githash);

  // Attention ops
  // Compute the attention between an input query and the cached keys/values
  // using PagedAttention.
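
The ops.def/ops.impl pair registers githash under the extension's namespace, so it becomes reachable through torch.ops. A hedged sketch of calling the op directly, assuming a build that contains this commit and that the compiled _C extension has been loaded (importing vllm does this):

```
import torch
import vllm  # loads the compiled _C extension, which registers its custom ops

# githash takes no tensor arguments, so it can be called like any other custom op.
print(torch.ops._C.githash())
```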
7 changes: 7 additions & 0 deletions csrc/torch_bindings.cpp
@@ -3,8 +3,11 @@
#include "ops.h"
#include "registration.h"

#include <git.h>
#include <torch/library.h>

std::string githash() { return std::string{git::CommitSHA1()}; }

// Note on op signatures:
// The X_meta signatures are for the meta functions corresponding to op X.
// They must be kept in sync with the signature for X. Generally, only
@@ -18,6 +21,10 @@
TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
  // vLLM custom ops

  // Show vllm git hash
  ops.def("githash", &githash);
  ops.impl("githash", torch::kCUDA, &githash);

  // Attention ops
  // Compute the attention between an input query and the cached
  // keys/values using PagedAttention.
6 changes: 6 additions & 0 deletions vllm/__init__.py
@@ -16,6 +16,7 @@
__version__ = "0.5.0"

__all__ = [
    "githash",
    "LLM",
    "ModelRegistry",
    "PromptStrictInputs",
@@ -33,3 +34,8 @@
"initialize_ray_cluster",
"PoolingParams",
]


def githash():
    import torch
    return torch.ops._C.githash()
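
Because both vllm.githash and the underlying op exist only in builds that include this change, callers targeting mixed environments may want to fail soft. A hedged sketch; the safe_githash helper and the "unknown" fallback are illustrative, not part of this commit:

```
import vllm

def safe_githash():
    # Older nm-vllm builds expose neither vllm.githash nor the _C githash op;
    # fall back instead of raising.
    try:
        return vllm.githash()
    except (AttributeError, RuntimeError):
        return "unknown"

print(safe_githash())
```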

4 comments on commit d8da97b

@github-actions

bigger_is_better

Benchmark suite Current: d8da97b Previous: 8ad8c8a Ratio
{"name": "request_throughput", "description": "VLLM Engine throughput - synthetic\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 256,\n \"output-len\": 128,\n \"num-prompts\": 1000\n}", "gpu_description": "NVIDIA L4 x 1", "vllm_version": "0.5.0", "python_version": "3.8.17 (default, Jun 7 2023, 12:29:56) \n[GCC 11.3.0]", "torch_version": "2.3.0+cu121"} 2.4225222844514933 prompts/s
{"name": "token_throughput", "description": "VLLM Engine throughput - synthetic\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 256,\n \"output-len\": 128,\n \"num-prompts\": 1000\n}", "gpu_description": "NVIDIA L4 x 1", "vllm_version": "0.5.0", "python_version": "3.8.17 (default, Jun 7 2023, 12:29:56) \n[GCC 11.3.0]", "torch_version": "2.3.0+cu121"} 930.2485572293734 tokens/s

This comment was automatically generated by a workflow using github-action-benchmark.

@github-actions

bigger_is_better

Benchmark suite Current: d8da97b Previous: 8ad8c8a Ratio
{"name": "request_throughput", "description": "VLLM Engine throughput - synthetic\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 256,\n \"output-len\": 128,\n \"num-prompts\": 1000\n}", "gpu_description": "NVIDIA L4 x 1", "vllm_version": "0.5.0", "python_version": "3.10.12 (main, Jun 7 2023, 13:43:11) [GCC 11.3.0]", "torch_version": "2.3.0+cu121"} 2.432386252736635 prompts/s
{"name": "token_throughput", "description": "VLLM Engine throughput - synthetic\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 256,\n \"output-len\": 128,\n \"num-prompts\": 1000\n}", "gpu_description": "NVIDIA L4 x 1", "vllm_version": "0.5.0", "python_version": "3.10.12 (main, Jun 7 2023, 13:43:11) [GCC 11.3.0]", "torch_version": "2.3.0+cu121"} 934.0363210508677 tokens/s

This comment was automatically generated by a workflow using github-action-benchmark.

@github-actions

bigger_is_better

Benchmark suite Current: d8da97b Previous: 8ad8c8a Ratio
{"name": "request_throughput", "description": "VLLM Engine throughput - synthetic\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 256,\n \"output-len\": 128,\n \"num-prompts\": 1000\n}", "gpu_description": "NVIDIA L4 x 1", "vllm_version": "0.5.0", "python_version": "3.11.4 (main, Jun 7 2023, 11:01:02) [GCC 11.3.0]", "torch_version": "2.3.0+cu121"} 2.445825995665408 prompts/s
{"name": "token_throughput", "description": "VLLM Engine throughput - synthetic\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 256,\n \"output-len\": 128,\n \"num-prompts\": 1000\n}", "gpu_description": "NVIDIA L4 x 1", "vllm_version": "0.5.0", "python_version": "3.11.4 (main, Jun 7 2023, 11:01:02) [GCC 11.3.0]", "torch_version": "2.3.0+cu121"} 939.1971823355167 tokens/s

This comment was automatically generated by a workflow using github-action-benchmark.

@github-actions

bigger_is_better

Benchmark suite Current: d8da97b Previous: 8ad8c8a Ratio
{"name": "request_throughput", "description": "VLLM Engine throughput - synthetic\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 256,\n \"output-len\": 128,\n \"num-prompts\": 1000\n}", "gpu_description": "NVIDIA L4 x 1", "vllm_version": "0.5.0", "python_version": "3.9.17 (main, Jun 7 2023, 12:34:12) \n[GCC 11.3.0]", "torch_version": "2.3.0+cu121"} 2.4505808553524178 prompts/s 2.4329362842575506 prompts/s 0.99
{"name": "token_throughput", "description": "VLLM Engine throughput - synthetic\nmodel - NousResearch/Llama-2-7b-chat-hf\nmax_model_len - 4096\nbenchmark_throughput {\n \"use-all-available-gpus_\": \"\",\n \"input-len\": 256,\n \"output-len\": 128,\n \"num-prompts\": 1000\n}", "gpu_description": "NVIDIA L4 x 1", "vllm_version": "0.5.0", "python_version": "3.9.17 (main, Jun 7 2023, 12:34:12) \n[GCC 11.3.0]", "torch_version": "2.3.0+cu121"} 941.0230484553284 tokens/s 934.2475331548994 tokens/s 0.99

This comment was automatically generated by a workflow using github-action-benchmark.
