Support W8A8 inference in vllm #1508

Closed · wants to merge 110 commits

Changes from all commits (110 commits)

e08acaa
add llama quant
Aug 14, 2023
387c804
change weight path
Aug 14, 2023
68cd1e0
fix weight load
Aug 15, 2023
ca088d6
merge gate and up matrix
Aug 16, 2023
6bde51e
use FTLlamaRMSNorm
Aug 17, 2023
931e51c
support bitsandbytes int8
Aug 28, 2023
c0c2a4d
llama support bnb 4bit
Aug 30, 2023
f677586
add int8gemm
Sep 20, 2023
5d79d03
support int8 inference
Sep 20, 2023
24ed816
Reduce unnecessary alpha/beta d2h transfers
sleepcoo Sep 21, 2023
0e84a61
fix weight load
Sep 21, 2023
7baa3ac
fix weight load
Sep 22, 2023
7478e3c
fix ln layer init
Sep 22, 2023
5099364
rms norm fusion
Sep 26, 2023
b051d53
fix w8a8 linear
Sep 26, 2023
33796f4
use same scale across tensors
Sep 26, 2023
a73de71
add ftgemm
Sep 27, 2023
73caa70
fix cublas linear
Sep 27, 2023
3b7b967
clean cuBLAS GEMM code
Sep 27, 2023
6550983
code clean
Sep 27, 2023
27d806b
fuse dequant silu and quant
Sep 28, 2023
1abaf67
fuse dequant and add residual
Sep 28, 2023
7468658
fuse dequant, add residual, rms_norm and quant
Sep 28, 2023
d5d4fcd
fuse dequant and pos_encoding
Sep 28, 2023
9056fd0
setup for fused kernels
Sep 28, 2023
627dac8
fix bugs
Sep 28, 2023
388a215
add tests for fusion kernels
Oct 9, 2023
b08a5e3
fix non-contiguous tensor case
Oct 17, 2023
60484fb
add quant, dequant kernel
Oct 17, 2023
c2b5750
optimize layernorm kernel
Oct 17, 2023
8c60613
add python class DequantAddResidualI8RMSNormQuant, DequantPagedAttent…
Oct 17, 2023
99ad6e5
add tests
Oct 17, 2023
c0a5f3b
add w8a8linear without quant and dequant
Oct 17, 2023
415b0af
adjust code for fusion
Oct 17, 2023
1734727
rm obsolete file
Oct 18, 2023
b6587a3
fix llama
Oct 19, 2023
63df225
remove cutlass dependency
Oct 19, 2023
b44ebb0
add sq quantized linear
Oct 24, 2023
25e63fd
rm unit test for w8a8 linear
Oct 24, 2023
e0d3aa0
adjust i8 llama weight load
Oct 24, 2023
b6cdc76
add fusion.py
Oct 26, 2023
3b89a4d
code clean
Oct 30, 2023
ce271bc
support kv cache quantization
Sep 19, 2023
f8b0b05
fix python code
Sep 19, 2023
b1560db
merge and reformat
Sep 20, 2023
5c672ec
support generating kv quant parameters and evaluating kv quant models
Sep 27, 2023
f8d6b99
modify test functions
Sep 28, 2023
f8427e3
fix test code
Sep 28, 2023
df286fe
fix test attention
Sep 28, 2023
b2d9b8c
modify attention kernel test using pytest
Oct 12, 2023
c5a1a73
fix quant parameter passing
Oct 16, 2023
fbed95c
code clean
Oct 30, 2023
f396ed3
code clean
Oct 30, 2023
c76d864
simplify code
Oct 30, 2023
076f79d
simplify code
Oct 30, 2023
bc2038f
Merge branch 'main' into w8a8
AniZpZ Nov 2, 2023
ad8f950
Merge branch 'main' into kv_quant
AniZpZ Nov 2, 2023
ac24163
code format
Nov 3, 2023
1d256f6
code format
Nov 3, 2023
9db5a63
code format
Nov 3, 2023
2543722
code format
Nov 3, 2023
4226683
code format
Nov 3, 2023
df15d44
fix merge
Nov 15, 2023
872d156
fix reshape_and_cache_quantized
Nov 20, 2023
6d3ddd8
fix load weight bug
Nov 9, 2023
9ad3e5b
update w8a8 kernels
Nov 20, 2023
2de11d9
fix w8a8
Nov 21, 2023
3c25ce3
update kernel tests
Nov 21, 2023
9448d1b
fix merge problems
Nov 22, 2023
443d3e3
fix bugs
Nov 21, 2023
ca2a1f9
fix pos_emb bugs
Nov 22, 2023
d9163e2
fix merge problems
Nov 22, 2023
d08ba77
fix merge problems
Nov 22, 2023
8f9d9ef
fix w8a8
Nov 22, 2023
81b0dc8
code format
Nov 22, 2023
d607725
fix rotary_embedding bug
Nov 22, 2023
8c29013
tmp fix
Nov 22, 2023
8b5278d
tmp fix2
Nov 22, 2023
1edc0da
fix linear
Nov 23, 2023
17c426a
fix int8 linear bugs
Nov 23, 2023
d8a9d4a
update kv-quant kernels
Nov 23, 2023
0b06f96
add kv-quant kernel tests
Nov 23, 2023
734dcc6
support kv-quant
Nov 23, 2023
f2d3dac
Merge branch 'kv_quant' into vllmq
Nov 23, 2023
90a0be2
fix merge problems
Nov 24, 2023
11ea458
fix work bugs
Nov 24, 2023
d5ab474
fix kernel bugs
Dec 1, 2023
e12104f
support per-token quant tensor parallel
Dec 1, 2023
0a64392
fix tp load weight
Dec 1, 2023
bd48d23
fix work bugs
Nov 24, 2023
38bcfce
support per-token quant tensor parallel
Dec 1, 2023
7fba861
fix tp load weight
Dec 1, 2023
aa9dcfd
fix kv-quant args
Dec 5, 2023
10d73f4
fix I8CUGEMM duplicate instance issue
Dec 5, 2023
25a6837
fix I8CUGEMM duplicate instance issue
Dec 5, 2023
6c1ee44
Merge tag 'v0.2.3' into vllmq_0.2.3
Dec 7, 2023
28e3441
optimize kernels and dequant scale
Dec 12, 2023
858673f
fix dtype bug
Dec 13, 2023
5bbe52d
optimize kernels and dequant scale
Dec 12, 2023
7e6f203
fix kv-quant bugs
Dec 13, 2023
d6eeee8
fix attention params
Dec 18, 2023
6d09d83
merge utils.cuh to quant_utils.cuh
Jan 8, 2024
63fd0c8
fix bugs
Jan 8, 2024
7e8f65b
Merge tag 'v0.2.7' into vllmq_0.2.7
Jan 9, 2024
baac1a9
refactor smoothquant code
Jan 16, 2024
dcf8e8f
Merge remote-tracking branch 'origin/vllmq_merge' into w8a8_v0.2.7
Jan 16, 2024
db074fa
format code
Jan 16, 2024
b07173d
add .buildkite
Jan 16, 2024
b6606ab
fix quant config
Jan 31, 2024
b997633
fix tp bug
Jan 31, 2024
24 changes: 24 additions & 0 deletions .buildkite/run-benchmarks.sh
@@ -0,0 +1,24 @@
# This script is run by buildkite to run the benchmarks and upload the results to buildkite

set -ex

# cd into parent directory of this file
cd "$(dirname "${BASH_SOURCE[0]}")/.."

# run benchmarks and upload the result to buildkite
python3 benchmarks/benchmark_latency.py 2>&1 | tee benchmark_latency.txt

python3 benchmarks/benchmark_throughput.py --input-len 256 --output-len 256 2>&1 | tee benchmark_throughput.txt

# write the results into a markdown file
echo "### Latency Benchmarks" >> benchmark_results.md
sed -n '1p' benchmark_latency.txt >> benchmark_results.md
echo "" >> benchmark_results.md
sed -n '$p' benchmark_latency.txt >> benchmark_results.md
echo "### Throughput Benchmarks" >> benchmark_results.md
sed -n '1p' benchmark_throughput.txt >> benchmark_results.md
echo "" >> benchmark_results.md
sed -n '$p' benchmark_throughput.txt >> benchmark_results.md

# upload the results to buildkite
/workspace/buildkite-agent annotate --style "info" --context "benchmark-results" < benchmark_results.md
41 changes: 41 additions & 0 deletions .buildkite/test-pipeline.yaml
@@ -0,0 +1,41 @@
# In this file, you can add more tests to run either by adding a new step or
# adding a new command to an existing step. See different options here for examples.
# This script will be fed into the Jinja template in `test-template.j2` to generate
# the final pipeline yaml file.

steps:
- label: Regression Test
command: pytest -v -s test_regression.py
working_dir: "/vllm-workspace/tests" # optional

- label: AsyncEngine Test
command: pytest -v -s async_engine

- label: Distributed Test
command: pytest -v -s test_comm_ops.py
working_dir: "/vllm-workspace/tests/distributed"
num_gpus: 2 # only support 1 or 2 for now.

- label: Engine Test
command: pytest -v -s engine

- label: Kernels Test
command: pytest -v -s kernels
soft_fail: true

- label: Models Test
commands:
- pytest -v -s models --forked
soft_fail: true

- label: Samplers Test
command: pytest -v -s samplers --forked

- label: Worker Test
command: pytest -v -s worker

- label: Benchmarks
working_dir: "/vllm-workspace/.buildkite"
commands:
- pip install aiohttp
- bash run-benchmarks.sh
50 changes: 50 additions & 0 deletions .buildkite/test-template.j2
@@ -0,0 +1,50 @@
{% set docker_image = "us-central1-docker.pkg.dev/vllm-405802/vllm-ci-test-repo/vllm-test:$BUILDKITE_COMMIT" %}
{% set default_num_gpu = 1 %}
{% set default_working_dir = "/vllm-workspace/tests" %}

steps:
- label: ":docker: build image"
commands:
- "docker build --tag {{ docker_image }} --target test --progress plain ."
- "docker push {{ docker_image }}"
env:
DOCKER_BUILDKIT: "1"
- wait

{% for step in steps %}
- label: "{{ step.label }}"
agents:
queue: kubernetes
soft_fail: {{ step.soft_fail or false }}
retry:
automatic:
- exit_status: -1 # Agent was lost
limit: 5
plugins:
- kubernetes:
podSpec:
volumes:
- name: dshm
emptyDir:
medium: Memory
containers:
- image: "{{ docker_image }}"
command: ["bash"]
args:
- "-c"
- "'cd {{ (step.working_dir or default_working_dir) | safe }} && {{ step.command or (step.commands | join(' && ')) | safe }}'"
resources:
requests:
nvidia.com/gpu: "{{ step.num_gpus or default_num_gpu }}"
limits:
nvidia.com/gpu: "{{ step.num_gpus or default_num_gpu }}"
env:
- name: HF_TOKEN
valueFrom:
secretKeyRef:
name: hf-token-secret
key: token
volumeMounts:
- mountPath: /dev/shm
name: dshm
{% endfor %}
2 changes: 1 addition & 1 deletion csrc/activation_kernels.cu
@@ -115,4 +115,4 @@ void gelu_fast(
torch::Tensor& input) // [..., d]
{
LAUNCH_ACTIVATION_KERNEL(vllm::gelu_fast_kernel);
}
}
1 change: 1 addition & 0 deletions csrc/attention/attention_dtypes.h
@@ -4,3 +4,4 @@
#include "dtype_float16.cuh"
#include "dtype_float32.cuh"
#include "dtype_bfloat16.cuh"
#include "dtype_int8.cuh"
2 changes: 1 addition & 1 deletion csrc/attention/attention_kernels.cu
@@ -879,4 +879,4 @@ void paged_attention_v2(
#undef WARP_SIZE
#undef MAX
#undef MIN
#undef DIVIDE_ROUND_UP
#undef DIVIDE_ROUND_UP
8 changes: 8 additions & 0 deletions csrc/attention/dtype_float32.cuh
@@ -86,6 +86,14 @@ inline __device__ float4 add(float4 a, float4 b) {
return c;
}

// Required for compilation: the float4 overload above does not cover the Float4_ struct.
inline __device__ Float4_ add(Float4_ a, Float4_ b) {
Float4_ c;
c.x = add(a.x, b.x);
c.y = add(a.y, b.y);
return c;
}

// Vector multiplication.
template<>
inline __device__ float mul<float, float>(float a, float b) {
49 changes: 49 additions & 0 deletions csrc/attention/dtype_int8.cuh
@@ -0,0 +1,49 @@
#pragma once

#include <stdint.h>
#include "attention_generic.cuh"
#include "dtype_float32.cuh"

namespace vllm {
// define int8 vector types for quantization of kv cache

template<>
struct Vec<int8_t, 1> {
using Type = int8_t;
};

template<>
struct Vec<int8_t, 2> {
using Type = int16_t;
};

template<>
struct Vec<int8_t, 4> {
using Type = int32_t;
};

template<>
struct Vec<int8_t, 8> {
using Type = int64_t;
};

template<>
struct FloatVec<int8_t> {
using Type = float;
};

template<>
struct FloatVec<int16_t> {
using Type = float2;
};

template<>
struct FloatVec<int32_t> {
using Type = Float4_;
};

template<>
struct FloatVec<int64_t> {
using Type = Float8_;
};
}
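
The new dtype_int8.cuh maps packed int8 vectors onto ordinary integer registers (for example, Vec<int8_t, 4>::Type is int32_t) and pairs each packed type with a float accumulator type via FloatVec, mirroring the existing float16/bfloat16 headers. Below is a minimal sketch of how such a packed value would be dequantized into a Float4_; the helper name dequant_int8x4 is hypothetical and not part of this PR, and Float4_ is assumed to be the two-float2 struct from dtype_float32.cuh.

```cuda
// Illustrative sketch only: dequantize four int8 lanes packed in an int32_t
// (Vec<int8_t, 4>::Type) into the matching FloatVec type, Float4_.
#include <stdint.h>
#include "dtype_float32.cuh"  // Float4_ (two float2 halves)

namespace vllm {

inline __device__ Float4_ dequant_int8x4(int32_t packed, float scale) {
  // Reinterpret the 32-bit register as four signed 8-bit lanes.
  const int8_t* lanes = reinterpret_cast<const int8_t*>(&packed);
  Float4_ out;
  out.x = make_float2(lanes[0] * scale, lanes[1] * scale);
  out.y = make_float2(lanes[2] * scale, lanes[3] * scale);
  return out;
}

}  // namespace vllm
```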
2 changes: 1 addition & 1 deletion csrc/cache_kernels.cu
@@ -390,4 +390,4 @@ void gather_cached_kv(
block_size,
x);
});
}
}
11 changes: 10 additions & 1 deletion csrc/dispatch_utils.h
@@ -9,8 +9,17 @@
#define VLLM_DISPATCH_CASE_FLOATING_TYPES(...) \
AT_DISPATCH_CASE(at::ScalarType::Float, __VA_ARGS__) \
AT_DISPATCH_CASE(at::ScalarType::Half, __VA_ARGS__) \
AT_DISPATCH_CASE(at::ScalarType::BFloat16, __VA_ARGS__)
AT_DISPATCH_CASE(at::ScalarType::BFloat16, __VA_ARGS__) \
// AT_DISPATCH_CASE(at::ScalarType::Char, __VA_ARGS__)

#define VLLM_DISPATCH_CASE_QUANT_TYPES(...) \
VLLM_DISPATCH_CASE_FLOATING_TYPES(__VA_ARGS__) \
AT_DISPATCH_CASE(at::ScalarType::Char, __VA_ARGS__)

#define VLLM_DISPATCH_FLOATING_TYPES(TYPE, NAME, ...) \
AT_DISPATCH_SWITCH( \
TYPE, NAME, VLLM_DISPATCH_CASE_FLOATING_TYPES(__VA_ARGS__))

#define VLLM_DISPATCH_QUANT_TYPES(TYPE, NAME, ...) \
AT_DISPATCH_SWITCH( \
TYPE, NAME, VLLM_DISPATCH_CASE_QUANT_TYPES(__VA_ARGS__))
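
The added VLLM_DISPATCH_QUANT_TYPES macro extends the floating-point dispatch cases with at::ScalarType::Char, so a single templated kernel can also be instantiated for int8 tensors. The following is a hedged usage sketch, not code from the PR: example_copy_kernel and example_copy are hypothetical names, and the launch configuration is arbitrary.

```cuda
// Usage sketch for VLLM_DISPATCH_QUANT_TYPES: one templated kernel covers
// float, half, bfloat16 and int8 (at::ScalarType::Char) inputs.
#include <torch/extension.h>
#include "dispatch_utils.h"  // VLLM_DISPATCH_QUANT_TYPES (defined above)

template <typename scalar_t>
__global__ void example_copy_kernel(scalar_t* __restrict__ out,
                                    const scalar_t* __restrict__ in,
                                    int num_elems) {
  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  if (idx < num_elems) {
    out[idx] = in[idx];
  }
}

void example_copy(torch::Tensor& out, torch::Tensor& input) {
  const int num_elems = static_cast<int>(input.numel());
  const dim3 grid((num_elems + 255) / 256);
  const dim3 block(256);
  VLLM_DISPATCH_QUANT_TYPES(
      input.scalar_type(), "example_copy_kernel", [&] {
        example_copy_kernel<scalar_t><<<grid, block>>>(
            out.data_ptr<scalar_t>(), input.data_ptr<scalar_t>(), num_elems);
      });
}
```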
2 changes: 1 addition & 1 deletion csrc/layernorm_kernels.cu
@@ -117,4 +117,4 @@ void fused_add_rms_norm(
num_tokens,
hidden_size);
});
}
}
92 changes: 92 additions & 0 deletions csrc/ops.h
@@ -89,3 +89,95 @@ torch::Tensor gptq_gemm(
void gptq_shuffle(
torch::Tensor q_weight,
torch::Tensor q_perm);

// These are kernels used by smoothquant
void rms_norm_quant(
torch::Tensor& out,
torch::Tensor& input,
torch::Tensor& weight,
float epsilon);

void dequant_add_residual_rms_norm_quant(
torch::Tensor& out,
torch::Tensor& input,
torch::Tensor& residual,
torch::Tensor& gamma,
float scale,
float epsilon);

void dequant_add_residual_rms_norm_quant(
torch::Tensor& out,
torch::Tensor& input,
torch::Tensor& residual,
torch::Tensor& gamma,
torch::Tensor& scale,
float epsilon,
float weight_dequant_scale);

void add_residual_rms_norm_quant(
torch::Tensor& out,
torch::Tensor& input,
torch::Tensor& residual,
torch::Tensor& weight,
float epsilon);

void dequant_rotary_embedding(
torch::Tensor& positions,
torch::Tensor& query,
torch::Tensor& key,
int head_size,
torch::Tensor& cos_sin_cache,
bool is_neox,
torch::Tensor& query_out,
torch::Tensor& key_out,
float query_scale,
float key_scale);

void dequant_silu_and_mul_quant(
torch::Tensor& out,
torch::Tensor& input,
float gate_scale,
float up_scale,
float out_scale);

void dequant_silu_and_mul_quant(
torch::Tensor& out,
torch::Tensor& input,
float gate_scale,
float up_scale,
torch::Tensor& out_scale,
torch::Tensor& tmp);

void dequant_add_residual(
torch::Tensor& out,
torch::Tensor& input,
torch::Tensor& residual,
float scale);

void dequant_add_residual(
torch::Tensor& out,
torch::Tensor& input,
torch::Tensor& residual,
torch::Tensor& scale,
float weight_dequant_scale);

void dequant(
torch::Tensor& out,
torch::Tensor& input,
float scale);

void dequant(
torch::Tensor& out,
torch::Tensor& input,
torch::Tensor& scale,
float weight_dequant_scale);

void quant(
torch::Tensor& out,
torch::Tensor& input,
float scale);

void quant(
torch::Tensor& out,
torch::Tensor& input,
torch::Tensor& scale);
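
The ops.h additions declare the fused SmoothQuant kernels; the largest fusion, dequant_add_residual_rms_norm_quant, folds dequantization, the residual add, RMSNorm and re-quantization into a single pass, so the intermediate floating-point activations never round-trip through global memory between two quantized GEMMs. The sketch below spells out the per-token math such a fusion is expected to compute, written as plain reference code rather than the PR's CUDA kernel; the interpretation of input as the int32 accumulator of the preceding int8 GEMM, the separate dequant/quant scales, and the divide-by-scale quantization convention are assumptions for illustration.

```cuda
// Reference semantics sketch (host code, not the fused CUDA kernel):
// dequant + add residual + RMSNorm + quant for one token of hidden_size elements.
#include <algorithm>
#include <cmath>
#include <cstdint>
#include <vector>

std::vector<int8_t> dequant_add_residual_rms_norm_quant_ref(
    const std::vector<int32_t>& input,   // assumed int32 output of an int8 GEMM
    const std::vector<float>& residual,  // floating-point residual stream
    const std::vector<float>& gamma,     // RMSNorm weight
    float dequant_scale, float quant_scale, float epsilon) {
  const size_t n = input.size();
  std::vector<float> x(n);
  float sum_sq = 0.0f;
  for (size_t i = 0; i < n; ++i) {
    x[i] = input[i] * dequant_scale + residual[i];  // dequantize, add residual
    sum_sq += x[i] * x[i];
  }
  const float inv_rms = 1.0f / std::sqrt(sum_sq / static_cast<float>(n) + epsilon);
  std::vector<int8_t> out(n);
  for (size_t i = 0; i < n; ++i) {
    const float y = x[i] * inv_rms * gamma[i];    // RMSNorm
    const float q = std::round(y / quant_scale);  // re-quantize to int8
    out[i] = static_cast<int8_t>(std::min(127.0f, std::max(-128.0f, q)));
  }
  return out;
}
```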
2 changes: 1 addition & 1 deletion csrc/pos_encoding_kernels.cu
@@ -127,4 +127,4 @@ void rotary_embedding(
head_size);
}
});
}
}