
[CI] change spell checker from codespell to typos #18711

Open: wants to merge 1 commit into main
2 changes: 1 addition & 1 deletion .gitignore
@@ -200,5 +200,5 @@ benchmarks/**/*.json
actionlint
shellcheck*/

# Ingore moe/marlin_moe gen code
# Ignore moe/marlin_moe gen code
csrc/moe/marlin_moe_wna16/kernel_*
8 changes: 3 additions & 5 deletions .pre-commit-config.yaml
@@ -18,12 +18,10 @@ repos:
args: [--output-format, github, --fix]
- id: ruff-format
files: ^(.buildkite|benchmarks|examples)/.*
- repo: https://github.com/codespell-project/codespell
rev: v2.4.1
- repo: https://github.com/crate-ci/typos
rev: v1.32.0
hooks:
- id: codespell
additional_dependencies: ['tomli']
args: ['--toml', 'pyproject.toml']
- id: typos
- repo: https://github.com/PyCQA/isort
rev: 6.0.1
hooks:
6 changes: 3 additions & 3 deletions csrc/cpu/attention.cpp
@@ -137,8 +137,8 @@ FORCE_INLINE std::pair<T, T> reduceSoftmaxAlibi(T* data, const int size,
}

template <typename T>
FORCE_INLINE void reducePartitonSoftmax(const T* max_data, T* sum_data,
const int size) {
FORCE_INLINE void reducePartitionSoftmax(const T* max_data, T* sum_data,
const int size) {
T max = max_data[0];
for (int i = 1; i < size; ++i) {
max = max >= max_data[i] ? max : max_data[i];
@@ -634,7 +634,7 @@ struct paged_attention_v2_impl {

if (partition_num == 1) continue;

reducePartitonSoftmax(
reducePartitionSoftmax(
max_logits + seq_idx * num_heads * max_num_partitions +
head_idx * max_num_partitions,
exp_sums + seq_idx * num_heads * max_num_partitions +
10 changes: 5 additions & 5 deletions csrc/cpu/cpu_types_x86.hpp
@@ -83,7 +83,7 @@ struct FP16Vec16 : public Vec<FP16Vec16> {
explicit FP16Vec16(const void* ptr)
: reg((__m256i)_mm256_loadu_si256((__m256i*)ptr)) {}

// non-temproal load
// non-temporal load
explicit FP16Vec16(bool, void* ptr)
: reg(_mm256_stream_load_si256((__m256i*)ptr)) {}

@@ -120,7 +120,7 @@ struct BF16Vec16 : public Vec<BF16Vec16> {
explicit BF16Vec16(const void* ptr)
: reg((__m256i)_mm256_loadu_si256((__m256i*)ptr)) {}

// non-temproal load
// non-temporal load
explicit BF16Vec16(bool, void* ptr)
: reg(_mm256_stream_load_si256((__m256i*)ptr)) {}

@@ -327,7 +327,7 @@ struct FP32Vec16 : public Vec<FP32Vec16> {
// normal load
explicit FP32Vec16(const float* ptr) : reg(_mm512_loadu_ps(ptr)) {}

// non-temproal load
// non-temporal load
explicit FP32Vec16(bool, void* ptr)
: reg((__m512)_mm512_stream_load_si512(ptr)) {}

@@ -576,7 +576,7 @@ struct INT8Vec64 : public Vec<INT8Vec64> {
// normal load
explicit INT8Vec64(void* ptr) : reg(_mm512_loadu_epi8(ptr)) {}

// non-temproal load
// non-temporal load
explicit INT8Vec64(bool, void* ptr) : reg(_mm512_stream_load_si512(ptr)) {}

void save(void* ptr) const { _mm512_storeu_epi8(ptr, reg); }
@@ -587,7 +587,7 @@ struct INT8Vec64 : public Vec<INT8Vec64> {
_mm512_mask_storeu_epi8(ptr, mask, reg);
}

// non-temproal save
// non-temporal save
void nt_save(int8_t* ptr) { _mm512_stream_si512((__m512i*)ptr, reg); }
};
#endif
16 changes: 8 additions & 8 deletions csrc/moe/moe_permute_unpermute_op.cu
@@ -12,7 +12,7 @@ void moe_permute(
const torch::Tensor& input, // [n_token, hidden]
const torch::Tensor& topk_weights, //[n_token, topk]
torch::Tensor& topk_ids, // [n_token, topk]
const torch::Tensor& token_expert_indicies, // [n_token, topk]
const torch::Tensor& token_expert_indices, // [n_token, topk]
const std::optional<torch::Tensor>& expert_map, // [n_expert]
int64_t n_expert, int64_t n_local_expert, int64_t topk,
const std::optional<int64_t>& align_block_size,
@@ -27,15 +27,15 @@
"expert_first_token_offset must be int64");
TORCH_CHECK(topk_ids.scalar_type() == at::ScalarType::Int,
"topk_ids must be int32");
TORCH_CHECK(token_expert_indicies.scalar_type() == at::ScalarType::Int,
"token_expert_indicies must be int32");
TORCH_CHECK(token_expert_indices.scalar_type() == at::ScalarType::Int,
"token_expert_indices must be int32");
TORCH_CHECK(src_row_id2dst_row_id_map.scalar_type() == at::ScalarType::Int,
"src_row_id2dst_row_id_map must be int32");
TORCH_CHECK(expert_first_token_offset.size(0) == n_local_expert + 1,
"expert_first_token_offset shape != n_local_expert+1")
TORCH_CHECK(
src_row_id2dst_row_id_map.sizes() == token_expert_indicies.sizes(),
"token_expert_indicies shape must be same as src_row_id2dst_row_id_map");
src_row_id2dst_row_id_map.sizes() == token_expert_indices.sizes(),
"token_expert_indices shape must be same as src_row_id2dst_row_id_map");
auto n_token = input.sizes()[0];
auto n_hidden = input.sizes()[1];
auto align_block_size_value =
@@ -71,7 +71,7 @@ void moe_permute(
expert_map_ptr, n_expert, stream);
}
// expert sort topk expert id and scan expert id get expert_first_token_offset
sortAndScanExpert(get_ptr<int>(topk_ids), get_ptr<int>(token_expert_indicies),
sortAndScanExpert(get_ptr<int>(topk_ids), get_ptr<int>(token_expert_indices),
get_ptr<int>(permuted_experts_id),
get_ptr<int>(dst_row_id2src_row_id_map),
get_ptr<int64_t>(expert_first_token_offset), n_token,
@@ -134,7 +134,7 @@ void moe_unpermute(

void moe_permute(const torch::Tensor& input, const torch::Tensor& topk_weights,
torch::Tensor& topk_ids,
const torch::Tensor& token_expert_indicies,
const torch::Tensor& token_expert_indices,
const std::optional<torch::Tensor>& expert_map,
int64_t n_expert, int64_t n_local_expert, int64_t topk,
const std::optional<int64_t>& align_block_size,
@@ -147,7 +147,7 @@ void moe_permute(const torch::Tensor& input, const torch::Tensor& topk_weights,

void moe_unpermute(const torch::Tensor& input,
const torch::Tensor& topk_weights, torch::Tensor& topk_ids,
const torch::Tensor& token_expert_indicies,
const torch::Tensor& token_expert_indices,
const std::optional<torch::Tensor>& expert_map,
int64_t n_expert, int64_t n_local_expert, int64_t topk,
const std::optional<int64_t>& align_block_size,
6 changes: 3 additions & 3 deletions csrc/moe/topk_softmax_kernels.cu
@@ -425,15 +425,15 @@ void topkGatingSoftmaxLauncherHelper(const float* input, const bool* finished, f

#define LAUNCH_SOFTMAX(NUM_EXPERTS, WARPS_PER_TB) \
topkGatingSoftmaxLauncherHelper<NUM_EXPERTS, WARPS_PER_TB>( \
gating_output, nullptr, topk_weights, topk_indicies, \
gating_output, nullptr, topk_weights, topk_indices, \
token_expert_indices, num_tokens, topk, 0, num_experts, \
stream);

template <typename IndType>
void topkGatingSoftmaxKernelLauncher(
const float* gating_output,
float* topk_weights,
IndType* topk_indicies,
IndType* topk_indices,
int* token_expert_indices,
float* softmax_workspace,
const int num_tokens,
@@ -476,7 +476,7 @@ void topkGatingSoftmaxKernelLauncher(
moeSoftmax<TPB><<<num_tokens, TPB, 0, stream>>>(
gating_output, nullptr, softmax_workspace, num_experts);
moeTopK<TPB><<<num_tokens, TPB, 0, stream>>>(
softmax_workspace, nullptr, topk_weights, topk_indicies, token_expert_indices,
softmax_workspace, nullptr, topk_weights, topk_indices, token_expert_indices,
num_experts, topk, 0, num_experts);
}
}
2 changes: 1 addition & 1 deletion csrc/moe/torch_bindings.cpp
@@ -66,7 +66,7 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, m) {

m.def(
"moe_permute(Tensor input, Tensor topk_weight, Tensor! topk_ids,"
"Tensor token_expert_indicies, Tensor? expert_map, int n_expert,"
"Tensor token_expert_indices, Tensor? expert_map, int n_expert,"
"int n_local_expert,"
"int topk, int? align_block_size,Tensor! permuted_input, Tensor! "
"expert_first_token_offset, Tensor! src_row_id2dst_row_id_map, Tensor! "
6 changes: 3 additions & 3 deletions csrc/quantization/machete/machete_mainloop.cuh
@@ -1003,7 +1003,7 @@ struct MacheteCollectiveMma {
static constexpr int A_CPY_VEC =
decltype(max_common_vector(tCsA, tCrA_load)){};

static constexpr int COVERSION_WIDTH =
static constexpr int CONVERSION_WIDTH =
std::min(A_CPY_VEC, int(size<0>(tCrA_mma)));

auto load_A_to_registers = [&](int read_stage) {
@@ -1026,8 +1026,8 @@ struct MacheteCollectiveMma {
// PIPELINED MAIN LOOP
//

auto convert_A = [&, a_vec = Int<COVERSION_WIDTH>{}](int k_block,
int read_stage) {
auto convert_A = [&, a_vec = Int<CONVERSION_WIDTH>{}](int k_block,
int read_stage) {
load_extra_info_to_registers(partitioned_extra_info,
copy_partitions_extra_info, k_block,
read_stage);
14 changes: 7 additions & 7 deletions csrc/rocm/skinny_gemms.cu
@@ -320,7 +320,7 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
// Goal is to bring the activation matrix A to the LDS
// and use it across the lifetime of the work group
// TODO: When activation matrix is larger than 64 KB
// then this is not goint to work!
// then this is not going to work!
//----------------------------------------------------
__shared__ scalar_t s[max_lds_len];

@@ -581,7 +581,7 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
// Goal is to bring the activation matrix A to the LDS
// and use it across the lifetime of the work group
// TODO: When activation matrix is larger than 64 KB
// then this is not goint to work!
// then this is not going to work!
//----------------------------------------------------
__shared__ scalar_t s[max_lds_len];

@@ -601,7 +601,7 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
// int _WvPrGrp = mindiv(N, CuCount * YTILE, WvPrGrp);
uint32_t m = (blockIdx.x * _WvPrGrp + threadIdx.y) * YTILE;

// Check whether there will be fragmenation!
// Check whether there will be fragmentation!
// This will happen only for the last wave!
if (m < M && (m + YTILE) >= M) {
uint32_t startColumn = M - YTILE;
@@ -827,7 +827,7 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)

m += CuCount * _WvPrGrp * YTILE;

// Check whether there will be fragmenation!
// Check whether there will be fragmentation!
// This will happen only for the last wave!
if (m < M && (m + YTILE) >= M) {
uint32_t startColumn = M - YTILE;
@@ -882,7 +882,7 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
// Goal is to bring the activation matrix A to the LDS
// and use it across the lifetime of the work group
// TODO: When activation matrix is larger than 64 KB
// then this is not goint to work!
// then this is not going to work!
//----------------------------------------------------
__shared__ scalar_t s[max_lds_len];

@@ -904,7 +904,7 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
//----------------------------------------------------
uint32_t m = (blockIdx.x * _WvPrGrp + threadIdx.y) * YTILE;

// Check whether there will be fragmenation!
// Check whether there will be fragmentation!
// This will happen only for the last wave!
if (m < M && (m + YTILE) >= M) {
uint32_t startColumn = M - YTILE;
@@ -1176,7 +1176,7 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
m += CuCount * _WvPrGrp * YTILE;
kBase = 0;

// Check whether there will be fragmenation!
// Check whether there will be fragmentation!
// This will happen only for the last wave!
if (m < M && (m + YTILE) >= M) {
uint32_t startColumn = M - YTILE;
2 changes: 1 addition & 1 deletion csrc/sparse/cutlass/sparse_scaled_mm_c3x.cu
@@ -277,7 +277,7 @@ CompressorResult cutlass_sparse_compress_sm90(torch::Tensor const& a) {
uint32_t const m = 1; // Set M to 1 for compression
uint32_t const n = a.size(1);

// Note: For correctess, the compressed format must be invariant in:
// Note: For correctness, the compressed format must be invariant in:
// - M, the flattened number of tokens
// - Whether output dtype is fp16 or bf16
// - CUTLASS epilogues
4 changes: 0 additions & 4 deletions pyproject.toml
@@ -137,10 +137,6 @@ exclude = [
'vllm/attention/ops/.*\.py$'
]

[tool.codespell]
ignore-words-list = "dout, te, indicies, subtile, ElementE"
skip = "tests/models/fixtures/*,tests/prompts/*,benchmarks/sonnet.txt,tests/lora/data/*,build/*,vllm/third_party/*"

[tool.isort]
skip_glob = [
".buildkite/*",
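
The removed [tool.codespell] table above carried an ignore-words-list and skip globs. typos keeps equivalent settings in its own configuration; below is a minimal sketch of what that could look like, assuming it lives in pyproject.toml under [tool.typos] (typos also reads a standalone _typos.toml). The key names follow the typos documentation; the values are copied from the removed codespell table purely for illustration and are not part of this PR.

# A minimal, hypothetical [tool.typos] sketch; not part of this diff.
[tool.typos.files]
# Same paths the removed codespell "skip" setting excluded.
extend-exclude = [
    "tests/models/fixtures/*",
    "tests/prompts/*",
    "benchmarks/sonnet.txt",
    "tests/lora/data/*",
    "build/*",
    "vllm/third_party/*",
]

[tool.typos.default.extend-words]
# Accept intentional spellings from the old ignore-words-list;
# mapping a word to itself tells typos to leave it alone.
te = "te"
dout = "dout"
subtile = "subtile"

[tool.typos.default.extend-identifiers]
# Identifier-level exception carried over from the old list.
ElementE = "ElementE"

Note that "indicies" from the old ignore-words-list has no counterpart in this sketch: the PR renames the affected identifiers (for example token_expert_indicies to token_expert_indices) instead of excluding the misspelling.
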
4 changes: 2 additions & 2 deletions tests/compile/test_async_tp.py
@@ -222,7 +222,7 @@ def test_async_tp_pass_correctness(
"VLLM_USE_V1": "1",
}

aysnc_tp_args = [
async_tp_args = [
*common_args,
"--tensor-parallel-size",
str(tp_size),
@@ -241,7 +241,7 @@ def test_async_tp_pass_correctness(
]

compare_two_settings(model_id,
aysnc_tp_args,
async_tp_args,
tp_args,
async_tp_env,
tp_env,
4 changes: 2 additions & 2 deletions tests/core/block/e2e/test_correctness.py
@@ -436,8 +436,8 @@ def test_auto_prefix_caching_with_preemption(baseline_llm_generator,
"enable_prefix_caching": True,
}])
@pytest.mark.parametrize("seed", [1])
def test_auto_prefix_caching_after_evition_start(baseline_llm_generator,
test_llm_generator):
def test_auto_prefix_caching_after_eviction_start(baseline_llm_generator,
test_llm_generator):
"""Verify block manager v2 with auto prefix caching could works normal
even when eviction started.
With APC enabled, all blocks are held by native block at the beginning.
6 changes: 3 additions & 3 deletions tests/core/block/e2e/test_correctness_sliding_window.py
@@ -32,8 +32,8 @@
@pytest.mark.parametrize("batch_size", [5])
@pytest.mark.parametrize("seed", [1])
@pytest.mark.parametrize("backend", ["FLASH_ATTN", "FLASHINFER", "XFORMERS"])
def test_sliding_window_retrival(baseline_llm_generator, test_llm_generator,
batch_size, seed, backend, monkeypatch):
def test_sliding_window_retrieval(baseline_llm_generator, test_llm_generator,
batch_size, seed, backend, monkeypatch):
"""
The test does a bunch of assignments "x1 = 10\nx2 = 33\n..." and then
asks for value of one of them (which is outside the sliding window).
@@ -99,7 +99,7 @@ def test_sliding_window_retrival(baseline_llm_generator, test_llm_generator,
def test_sliding_window_chunked_prefill(test_llm_generator, batch_size, seed,
backend, monkeypatch):
"""
This is similar to test_sliding_window_retrival, however, it doesn't
This is similar to test_sliding_window_retrieval, however, it doesn't
compare against the v1 block manager since v1 doesn't support
chunked prefill with sliding window.

4 changes: 2 additions & 2 deletions tests/core/test_scheduler.py
@@ -593,8 +593,8 @@ def cannot_append_second_group(seq_group, num_lookahead_slots):
# should be preempted. 1 will also be preempted.
budget = create_token_budget()
output = scheduler._schedule_running(budget, curr_loras)
remainig_running = scheduler.running
assert len(remainig_running) == 0
remaining_running = scheduler.running
assert len(remaining_running) == 0
assert len(output.decode_seq_groups) == 1
assert len(output.prefill_seq_groups) == 0
assert output.decode_seq_groups[0].seq_group.request_id == "0"
4 changes: 2 additions & 2 deletions tests/entrypoints/openai/test_chat_template.py
@@ -15,7 +15,7 @@
assert chatml_jinja_path.exists()

# Define models, templates, and their corresponding expected outputs
MODEL_TEMPLATE_GENERATON_OUTPUT = [
MODEL_TEMPLATE_GENERATION_OUTPUT = [
("facebook/opt-125m", chatml_jinja_path, True, False, """<|im_start|>user
Hello<|im_end|>
<|im_start|>assistant
@@ -90,7 +90,7 @@ def test_no_load_chat_template_literallike():

@pytest.mark.parametrize(
"model,template,add_generation_prompt,continue_final_message,expected_output",
MODEL_TEMPLATE_GENERATON_OUTPUT)
MODEL_TEMPLATE_GENERATION_OUTPUT)
def test_get_gen_prompt(model, template, add_generation_prompt,
continue_final_message, expected_output):
model_info = HF_EXAMPLE_MODELS.find_hf_info(model)