This repository has been archived by the owner on Oct 11, 2024. It is now read-only.

Upstream sync 2024 06 16 (#307)
Upstream sync 2024 06 16 (#310) - v0.5.0.post1 of vllm

SUMMARY:

* Merge commits from vllm-project@8f89d72 to vllm-project@0f0d8bc
* Limit numpy to < 2.0
* Updated `run-tests` to print the name of the test that is about to run
  (for debugging what hangs in automation)
* Disable usage stats in automation
* Temporarily disable ENTRYPOINTS (to be re-enabled in Andy's single whl PR)
* Updated `run-tests` to treat pytest exit code 5 as a pass, since exit code 5
  means that no tests were collected (see the sketch after this summary)

Note that vllm-project@8f89d72 is NOT included in this merge.
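
For reference, a minimal sketch of the exit-code handling described above, mirroring the logic added to `.github/scripts/run-tests` in this commit (see its diff below); variable names follow that script:

```bash
# Sketch of the per-test exit-code handling (names mirror .github/scripts/run-tests).
LOCAL_SUCCESS=0
pytest --junitxml="${RESULT_XML}" "${TEST}" || LOCAL_SUCCESS=$?

if [[ ${LOCAL_SUCCESS} == 0 ]]; then
    echo "=== PASSED TEST: ${TEST} ==="
elif [[ ${LOCAL_SUCCESS} == 5 ]]; then
    # pytest exits with 5 when it collects no tests; since whole modules are
    # skipped by our skipping strategy, this case is counted as a pass.
    echo "=== SKIPPED TEST: ${TEST} ==="
else
    echo "=== FAILED TEST: ${TEST} ==="
    SUCCESS=$((SUCCESS + LOCAL_SUCCESS))
fi
```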

COMPARE vs UPSTREAM:


https://github.com/neuralmagic/nm-vllm/compare/upstream-sync-2024-06-16..vllm-project:vllm:v0.5.0.post1

---------

Signed-off-by: kevin <kevin@anyscale.com>
Signed-off-by: Travis Johnson <tsjohnso@us.ibm.com>
Signed-off-by: Wang, Yi A <yi.a.wang@intel.com>
Co-authored-by: Michael Goin <michael@neuralmagic.com>
Co-authored-by: Simon Mo <simon.mo@hey.com>
Co-authored-by: SangBin Cho <rkooo567@gmail.com>
Co-authored-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
Co-authored-by: Li, Jiang <jiang1.li@intel.com>
Co-authored-by: Kevin H. Luu <kevin@anyscale.com>
Co-authored-by: Cody Yu <hao.yu.cody@gmail.com>
Co-authored-by: Arthur Kim <kimdwkimdw@gmail.com>
Co-authored-by: Travis Johnson <tsjohnso@us.ibm.com>
Co-authored-by: Sanger Steel <sangersteel@gmail.com>
Co-authored-by: Roger Wang <ywang@roblox.com>
Co-authored-by: youkaichao <youkaichao@gmail.com>
Co-authored-by: Cyrus Leung <tlleungac@connect.ust.hk>
Co-authored-by: Isotr0py <2037008807@qq.com>
Co-authored-by: Wang, Yi <yi.a.wang@intel.com>
Co-authored-by: Dipika Sikka <dipikasikka1@gmail.com>
Co-authored-by: wenyujin333 <wuyou.wuyou@alibaba-inc.com>
Co-authored-by: Jianan Gu <jianan.gu@intel.com>
Co-authored-by: Tyler Michael Smith <tyler@neuralmagic.com>
Co-authored-by: zifeitong <zifei.tong@parasail.io>
Co-authored-by: Philipp Moritz <pcmoritz@gmail.com>
Co-authored-by: Antoni Baum <antoni.baum@protonmail.com>
Co-authored-by: Jie Fu (傅杰) <jiefu@tencent.com>
Co-authored-by: Allen.Dou <allen.dou@hotmail.com>
1 parent e487a48 commit 719d550
Showing 116 changed files with 4,433 additions and 1,469 deletions.
1 change: 1 addition & 0 deletions .buildkite/test-pipeline.yaml
@@ -48,6 +48,7 @@ steps:
- TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_chunked_prefill_distributed.py
- pytest -v -s spec_decode/e2e/test_integration_dist.py
- CUDA_VISIBLE_DEVICES=0,1 pytest -v -s test_sharded_state_loader.py
- CUDA_VISIBLE_DEVICES=0,1 pytest -v -s distributed/test_utils.py

- label: Distributed Tests (Multiple Groups)
#mirror_hardwares: [amd]
32 changes: 30 additions & 2 deletions .buildkite/test-template-aws.j2
@@ -7,7 +7,7 @@ steps:
queue: cpu_queue
commands:
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
- "docker build --build-arg max_jobs=16 --tag {{ docker_image }} --target test --progress plain ."
- "docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --tag {{ docker_image }} --target test --progress plain ."
- "docker push {{ docker_image }}"
env:
DOCKER_BUILDKIT: "1"
@@ -19,6 +19,34 @@ steps:
limit: 5
- wait

- group: "AMD Tests"
depends_on: ~
steps:
{% for step in steps %}
{% if step.mirror_hardwares and "amd" in step.mirror_hardwares %}
- label: "AMD: {{ step.label }}"
agents:
queue: amd
command: bash .buildkite/run-amd-test.sh "cd {{ (step.working_dir or default_working_dir) | safe }} ; {{ step.command or (step.commands | join(" ; ")) | safe }}"
env:
DOCKER_BUILDKIT: "1"
soft_fail: true
{% endif %}
{% endfor %}

- label: "Neuron Test"
depends_on: ~
agents:
queue: neuron
command: bash .buildkite/run-neuron-test.sh
soft_fail: false

- label: "Intel Test"
depends_on: ~
agents:
queue: intel
command: bash .buildkite/run-cpu-test.sh

{% for step in steps %}
- label: "{{ step.label }}"
agents:
@@ -31,7 +59,7 @@
{% else %}
queue: gpu_1_queue
{% endif %}
soft_fail: true
soft_fail: {{ step.soft_fail or false }}
{% if step.parallelism %}
parallelism: {{ step.parallelism }}
{% endif %}
3 changes: 3 additions & 0 deletions .github/actions/nm-set-env/action.yml
@@ -39,6 +39,9 @@ runs:
# testmo
echo "XDG_CONFIG_HOME=/usr/local/apps" >> $GITHUB_ENV
echo "PROJECT_ID=12" >> $GITHUB_ENV
# disable usage stats (writes to protected /usr/local/apps)
echo "VLLM_NO_USAGE_STATS=1" >> $GITHUB_ENV
echo "DO_NOT_TRACK=1" >> $GITHUB_ENV
env:
HF_TOKEN_SECRET: ${{ inputs.hf_token }}
shell: bash
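
For running the same suite outside of CI, a minimal sketch of the equivalent shell setup, using only the two variables exported above (everything else about the local environment is left to the reader):

```bash
# Disable vLLM usage-stats reporting, which would otherwise try to write
# under the XDG config dir (pointed at the protected /usr/local/apps in CI).
export VLLM_NO_USAGE_STATS=1
export DO_NOT_TRACK=1
```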
17 changes: 16 additions & 1 deletion .github/scripts/run-tests
@@ -109,6 +109,10 @@
LOCAL_SUCCESS=0
RESULT_XML=$(echo ${TEST} | sed -e "s/${TEST_DIR}/${RESULTS_DIR}/" | sed -e "s/.py/.xml/")

# report which test is being run
# (in CI, if a test hangs, this logs *which* test is running *before* it hangs)
echo "=== RUNNING TEST: ${TEST} ==="

# this is a bit messy and brittle, but certain tests
# need to be run with specific options
if [[ "${TEST}" == *"kernels"* || "${TEST}" == *"samplers"* ]]; then
@@ -125,7 +129,18 @@
pytest ${CC_PYTEST_FLAGS} --junitxml=${RESULT_XML} ${TEST} || LOCAL_SUCCESS=$?
fi

SUCCESS=$((SUCCESS + LOCAL_SUCCESS))
# if a file gets exit code 0, we are good
if [[ $LOCAL_SUCCESS == 0 ]]; then
echo "=== PASSED TEST: ${TEST} ==="
# if a file does not run any tests, pytest reports exit code of 5
# since we skip full modules in our skipping strategy, this is common
elif [[ $LOCAL_SUCCESS == 5 ]]; then
echo "=== SKIPPED TEST: ${TEST} ==="
# otherwise, report failure
else
echo "=== FAILED TEST: ${TEST} ==="
SUCCESS=$((SUCCESS + LOCAL_SUCCESS))
fi

done

2 changes: 1 addition & 1 deletion .github/workflows/ruff.yml
@@ -25,7 +25,7 @@ jobs:
- name: Install dependencies
run: |
python -m pip install --upgrade pip
pip install ruff==0.1.5 codespell==2.2.6 tomli==2.0.1 isort==5.13.2
pip install ruff==0.1.5 codespell==2.3.0 tomli==2.0.1 isort==5.13.2
- name: Analysing the code with ruff
run: |
ruff .
8 changes: 4 additions & 4 deletions CMakeLists.txt
@@ -180,17 +180,17 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
"csrc/quantization/gptq_marlin/gptq_marlin.cu"
"csrc/quantization/gptq_marlin/gptq_marlin_repack.cu"
"csrc/custom_all_reduce.cu"
"csrc/quantization/cutlass_w8a8/scaled_mm_dq_entry.cu"
"csrc/quantization/cutlass_w8a8/scaled_mm_dq_c2x.cu"
"csrc/quantization/cutlass_w8a8/scaled_mm_dq_c3x.cu")
"csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu"
"csrc/quantization/cutlass_w8a8/scaled_mm_c2x.cu"
"csrc/quantization/cutlass_w8a8/scaled_mm_c3x.cu")

#
# The CUTLASS kernels for Hopper require sm90a to be enabled.
# This is done via the below gencode option, BUT that creates kernels for both sm90 and sm90a.
# That adds an extra 17MB to compiled binary, so instead we selectively enable it.
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.0)
set_source_files_properties(
"csrc/quantization/cutlass_w8a8/scaled_mm_dq_c3x.cu"
"csrc/quantization/cutlass_w8a8/scaled_mm_c3x.cu"
PROPERTIES
COMPILE_FLAGS
"-gencode arch=compute_90a,code=sm_90a")
6 changes: 4 additions & 2 deletions Dockerfile
@@ -9,8 +9,8 @@
# prepare basic build environment
FROM nvidia/cuda:12.4.1-devel-ubuntu22.04 AS dev

RUN apt-get update -y && \
apt-get install -y python3-pip git
RUN apt-get update -y \
&& apt-get install -y python3-pip git curl sudo

# Workaround for https://github.com/openai/triton/issues/2507 and
# https://github.com/pytorch/pytorch/issues/107960 -- hopefully
@@ -27,6 +27,8 @@ RUN --mount=type=cache,target=/root/.cache/pip \
pip install -r requirements-cuda.txt

# install development dependencies
COPY requirements-lint.txt requirements-lint.txt
COPY requirements-test.txt requirements-test.txt
COPY requirements-dev.txt requirements-dev.txt
RUN --mount=type=cache,target=/root/.cache/pip \
pip install -r requirements-dev.txt
8 changes: 6 additions & 2 deletions Dockerfile.cpu
@@ -3,9 +3,13 @@
FROM ubuntu:22.04 AS cpu-test-1

RUN apt-get update -y \
&& apt-get install -y git wget vim numactl gcc-12 g++-12 python3 python3-pip \
&& apt-get install -y git wget vim numactl gcc-12 g++-12 python3 python3-pip libtcmalloc-minimal4 \
&& update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 10 --slave /usr/bin/g++ g++ /usr/bin/g++-12

RUN echo 'export LD_PRELOAD=/usr/lib/x86_64-linux-gnu/libtcmalloc_minimal.so.4:$LD_PRELOAD' >> ~/.bashrc

RUN pip install https://intel-extension-for-pytorch.s3.amazonaws.com/ipex_dev/cpu/intel_extension_for_pytorch-2.3.100%2Bgit0eb3473-cp310-cp310-linux_x86_64.whl

RUN pip install --upgrade pip \
&& pip install wheel packaging ninja "setuptools>=49.4.0" numpy

@@ -21,6 +25,6 @@ RUN VLLM_TARGET_DEVICE=cpu python3 setup.py install

WORKDIR /workspace/

RUN ln -s /workspace/vllm/tests && ln -s /workspace/vllm/examples && ln -s /workspace/vllm/benchmarks
RUN ln -s /workspace/vllm/tests && ln -s /workspace/vllm/examples && ln -s /workspace/vllm/benchmarks

CMD ["/bin/bash"]
19 changes: 19 additions & 0 deletions Dockerfile.tpu
@@ -0,0 +1,19 @@
ARG NIGHTLY_DATE="20240601"
ARG BASE_IMAGE="us-central1-docker.pkg.dev/tpu-pytorch-releases/docker/xla:nightly_3.10_tpuvm_$NIGHTLY_DATE"

FROM $BASE_IMAGE

WORKDIR /workspace
COPY . /workspace/vllm

ENV VLLM_TARGET_DEVICE="tpu"
# Install aiohttp separately to avoid build errors.
RUN pip install aiohttp
# Install the TPU and Pallas dependencies.
RUN pip install torch_xla[tpu] -f https://storage.googleapis.com/libtpu-releases/index.html
RUN pip install torch_xla[pallas] -f https://storage.googleapis.com/jax-releases/jax_nightly_releases.html -f https://storage.googleapis.com/jax-releases/jaxlib_nightly_releases.html

# Build vLLM.
RUN cd /workspace/vllm && python setup.py develop

CMD ["/bin/bash"]
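
A possible way to exercise the new `Dockerfile.tpu`; the image tag and the run flags below are illustrative assumptions rather than part of this commit:

```bash
# Build the TPU image from the repository root (tag name is arbitrary).
docker build -f Dockerfile.tpu -t vllm-tpu .

# Run it on a TPU VM; privileged/host-network flags are commonly needed for
# TPU access but are assumptions here, not part of this diff.
docker run -it --rm --privileged --net=host vllm-tpu
```

Inside the container, the benchmark scripts changed below can then target the new backend via `--device tpu`.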
8 changes: 6 additions & 2 deletions benchmarks/backend_request_func.py
@@ -68,9 +68,13 @@ async def async_request_tgi(
chunk_bytes = chunk_bytes.strip()
if not chunk_bytes:
continue
chunk_bytes = chunk_bytes.decode("utf-8")

chunk = remove_prefix(chunk_bytes.decode("utf-8"),
"data:")
#NOTE: Sometimes TGI returns a ping response without
# any data, we should skip it.
if chunk_bytes.startswith(":"):
continue
chunk = remove_prefix(chunk_bytes, "data:")

data = json.loads(chunk)
timestamp = time.perf_counter()
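
For context, a hedged sketch of the TGI stream being parsed here: the endpoint and request body follow TGI's documented `generate_stream` API, while the host and port are placeholders. In the SSE output, lines beginning with `:` are keep-alive comments with no payload, which is why the client above now skips them before stripping the `data:` prefix.

```bash
# Placeholder host/port; assumes a TGI server is already running there.
curl -N http://localhost:8080/generate_stream \
  -H 'Content-Type: application/json' \
  -d '{"inputs": "Hello", "parameters": {"max_new_tokens": 16}}'
```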
2 changes: 1 addition & 1 deletion benchmarks/benchmark_latency.py
@@ -189,7 +189,7 @@ def run_to_completion(profile_dir: Optional[str] = None):
"--device",
type=str,
default="cuda",
choices=["cuda", "cpu"],
choices=["cuda", "cpu", "tpu"],
help='device type for vLLM execution, supporting CUDA and CPU.')
parser.add_argument('--block-size',
type=int,
2 changes: 1 addition & 1 deletion benchmarks/benchmark_throughput.py
@@ -346,7 +346,7 @@ def main(args: argparse.Namespace):
"--device",
type=str,
default="cuda",
choices=["cuda", "cpu"],
choices=["cuda", "cpu", "tpu"],
help='device type for vLLM execution, supporting CUDA and CPU.')
parser.add_argument(
"--enable-prefix-caching",
6 changes: 1 addition & 5 deletions benchmarks/cutlass_benchmarks/w8a8_benchmarks.py
@@ -76,11 +76,7 @@ def pytorch_fp8_impl_fast_accum(a: torch.tensor, b: torch.tensor,
def cutlass_impl(a: torch.tensor, b: torch.tensor, scale_a: torch.tensor,
scale_b: torch.tensor,
out_dtype: torch.dtype) -> torch.tensor:
return ops.cutlass_scaled_mm_dq(a,
b,
scale_a,
scale_b,
out_dtype=out_dtype)
return ops.cutlass_scaled_mm(a, b, scale_a, scale_b, out_dtype=out_dtype)


# bench
6 changes: 5 additions & 1 deletion cmake/cpu_extension.cmake
@@ -33,6 +33,7 @@ function (find_isa CPUINFO TARGET OUT)
endif()
endfunction()

find_isa(${CPUINFO} "avx2" AVX2_FOUND)
find_isa(${CPUINFO} "avx512f" AVX512_FOUND)

if (AVX512_FOUND)
@@ -53,8 +54,11 @@
else()
message(WARNING "Disable AVX512-BF16 ISA support, no avx512_bf16 found in local CPU flags." " If cross-compilation is required, please set env VLLM_CPU_AVX512BF16=1.")
endif()
elseif (AVX2_FOUND)
list(APPEND CXX_COMPILE_FLAGS "-mavx2")
message(WARNING "vLLM CPU backend using AVX2 ISA")
else()
message(FATAL_ERROR "vLLM CPU backend requires AVX512 ISA support.")
message(FATAL_ERROR "vLLM CPU backend requires AVX512 or AVX2 ISA support.")
endif()

message(STATUS "CPU extension compile flags: ${CXX_COMPILE_FLAGS}")
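
As a quick local sanity check of which path this build will take, the same ISA flags that `find_isa` greps for are visible in `/proc/cpuinfo` on Linux (sketch; GNU grep assumed):

```bash
# Print which of the two ISAs the vLLM CPU backend can use on this machine.
grep -m1 '^flags' /proc/cpuinfo | grep -oEw 'avx512f|avx2' | sort -u
```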
