Mirror of https://git.datalinker.icu/vllm-project/vllm.git (synced 2025-12-11 03:54:59 +08:00)
[CI] change spell checker from codespell to typos (#18711)

Signed-off-by: Andy Xie <andy.xning@gmail.com>

parent 42f52cc95b
commit 2f1c19b245
.gitignore (vendored): 2 changes

@@ -200,5 +200,5 @@ benchmarks/**/*.json
 actionlint
 shellcheck*/
 
-# Ingore moe/marlin_moe gen code
+# Ignore moe/marlin_moe gen code
 csrc/moe/marlin_moe_wna16/kernel_*
@@ -20,12 +20,10 @@ repos:
 args: [--output-format, github, --fix]
 - id: ruff-format
 files: ^(.buildkite|benchmarks|examples)/.*
-- repo: https://github.com/codespell-project/codespell
-rev: v2.4.1
+- repo: https://github.com/crate-ci/typos
+rev: v1.32.0
 hooks:
-- id: codespell
-additional_dependencies: ['tomli']
-args: ['--toml', 'pyproject.toml']
+- id: typos
 - repo: https://github.com/PyCQA/isort
 rev: 6.0.1
 hooks:
@@ -137,7 +137,7 @@ FORCE_INLINE std::pair<T, T> reduceSoftmaxAlibi(T* data, const int size,
 }
 
 template <typename T>
-FORCE_INLINE void reducePartitonSoftmax(const T* max_data, T* sum_data,
+FORCE_INLINE void reducePartitionSoftmax(const T* max_data, T* sum_data,
 const int size) {
 T max = max_data[0];
 for (int i = 1; i < size; ++i) {
@@ -634,7 +634,7 @@ struct paged_attention_v2_impl {
 
 if (partition_num == 1) continue;
 
-reducePartitonSoftmax(
+reducePartitionSoftmax(
 max_logits + seq_idx * num_heads * max_num_partitions +
 head_idx * max_num_partitions,
 exp_sums + seq_idx * num_heads * max_num_partitions +
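The renamed reducePartitionSoftmax helper merges the per-partition softmax statistics produced by the split-KV (v2) attention path. As a rough orientation only, the usual log-sum-exp style merge it corresponds to can be sketched in NumPy as below; this illustrates the general technique and is not a line-for-line port of the C++ kernel.

import numpy as np

def reduce_partition_softmax(max_data: np.ndarray, sum_data: np.ndarray):
    """Merge per-partition softmax statistics (illustrative sketch only).

    max_data[i] is partition i's running max of the logits and sum_data[i]
    is its sum of exp(logit - max_data[i]). The merged result is the global
    max plus the partial sums rescaled onto that common max.
    """
    global_max = max_data.max()
    rescaled = sum_data * np.exp(max_data - global_max)
    return global_max, rescaled.sum()

# Example: two partitions with different local maxima.
g_max, g_sum = reduce_partition_softmax(np.array([1.0, 3.0]), np.array([2.5, 1.2]))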
@@ -83,7 +83,7 @@ struct FP16Vec16 : public Vec<FP16Vec16> {
 explicit FP16Vec16(const void* ptr)
 : reg((__m256i)_mm256_loadu_si256((__m256i*)ptr)) {}
 
-// non-temproal load
+// non-temporal load
 explicit FP16Vec16(bool, void* ptr)
 : reg(_mm256_stream_load_si256((__m256i*)ptr)) {}
 
@@ -120,7 +120,7 @@ struct BF16Vec16 : public Vec<BF16Vec16> {
 explicit BF16Vec16(const void* ptr)
 : reg((__m256i)_mm256_loadu_si256((__m256i*)ptr)) {}
 
-// non-temproal load
+// non-temporal load
 explicit BF16Vec16(bool, void* ptr)
 : reg(_mm256_stream_load_si256((__m256i*)ptr)) {}
 
@@ -327,7 +327,7 @@ struct FP32Vec16 : public Vec<FP32Vec16> {
 // normal load
 explicit FP32Vec16(const float* ptr) : reg(_mm512_loadu_ps(ptr)) {}
 
-// non-temproal load
+// non-temporal load
 explicit FP32Vec16(bool, void* ptr)
 : reg((__m512)_mm512_stream_load_si512(ptr)) {}
 
@@ -576,7 +576,7 @@ struct INT8Vec64 : public Vec<INT8Vec64> {
 // normal load
 explicit INT8Vec64(void* ptr) : reg(_mm512_loadu_epi8(ptr)) {}
 
-// non-temproal load
+// non-temporal load
 explicit INT8Vec64(bool, void* ptr) : reg(_mm512_stream_load_si512(ptr)) {}
 
 void save(void* ptr) const { _mm512_storeu_epi8(ptr, reg); }
@@ -587,7 +587,7 @@ struct INT8Vec64 : public Vec<INT8Vec64> {
 _mm512_mask_storeu_epi8(ptr, mask, reg);
 }
 
-// non-temproal save
+// non-temporal save
 void nt_save(int8_t* ptr) { _mm512_stream_si512((__m512i*)ptr, reg); }
 };
 #endif
@@ -12,7 +12,7 @@ void moe_permute(
 const torch::Tensor& input, // [n_token, hidden]
 const torch::Tensor& topk_weights, //[n_token, topk]
 torch::Tensor& topk_ids, // [n_token, topk]
-const torch::Tensor& token_expert_indicies, // [n_token, topk]
+const torch::Tensor& token_expert_indices, // [n_token, topk]
 const std::optional<torch::Tensor>& expert_map, // [n_expert]
 int64_t n_expert, int64_t n_local_expert, int64_t topk,
 const std::optional<int64_t>& align_block_size,
@@ -27,15 +27,15 @@ void moe_permute(
 "expert_first_token_offset must be int64");
 TORCH_CHECK(topk_ids.scalar_type() == at::ScalarType::Int,
 "topk_ids must be int32");
-TORCH_CHECK(token_expert_indicies.scalar_type() == at::ScalarType::Int,
-"token_expert_indicies must be int32");
+TORCH_CHECK(token_expert_indices.scalar_type() == at::ScalarType::Int,
+"token_expert_indices must be int32");
 TORCH_CHECK(src_row_id2dst_row_id_map.scalar_type() == at::ScalarType::Int,
 "src_row_id2dst_row_id_map must be int32");
 TORCH_CHECK(expert_first_token_offset.size(0) == n_local_expert + 1,
 "expert_first_token_offset shape != n_local_expert+1")
 TORCH_CHECK(
-src_row_id2dst_row_id_map.sizes() == token_expert_indicies.sizes(),
-"token_expert_indicies shape must be same as src_row_id2dst_row_id_map");
+src_row_id2dst_row_id_map.sizes() == token_expert_indices.sizes(),
+"token_expert_indices shape must be same as src_row_id2dst_row_id_map");
 auto n_token = input.sizes()[0];
 auto n_hidden = input.sizes()[1];
 auto align_block_size_value =
@@ -71,7 +71,7 @@ void moe_permute(
 expert_map_ptr, n_expert, stream);
 }
 // expert sort topk expert id and scan expert id get expert_first_token_offset
-sortAndScanExpert(get_ptr<int>(topk_ids), get_ptr<int>(token_expert_indicies),
+sortAndScanExpert(get_ptr<int>(topk_ids), get_ptr<int>(token_expert_indices),
 get_ptr<int>(permuted_experts_id),
 get_ptr<int>(dst_row_id2src_row_id_map),
 get_ptr<int64_t>(expert_first_token_offset), n_token,
@@ -190,7 +190,7 @@ void shuffle_rows(const torch::Tensor& input_tensor,
 
 void moe_permute(const torch::Tensor& input, const torch::Tensor& topk_weights,
 torch::Tensor& topk_ids,
-const torch::Tensor& token_expert_indicies,
+const torch::Tensor& token_expert_indices,
 const std::optional<torch::Tensor>& expert_map,
 int64_t n_expert, int64_t n_local_expert, int64_t topk,
 const std::optional<int64_t>& align_block_size,
@@ -203,7 +203,7 @@ void moe_permute(const torch::Tensor& input, const torch::Tensor& topk_weights,
 
 void moe_unpermute(const torch::Tensor& input,
 const torch::Tensor& topk_weights, torch::Tensor& topk_ids,
-const torch::Tensor& token_expert_indicies,
+const torch::Tensor& token_expert_indices,
 const std::optional<torch::Tensor>& expert_map,
 int64_t n_expert, int64_t n_local_expert, int64_t topk,
 const std::optional<int64_t>& align_block_size,
@@ -425,7 +425,7 @@ void topkGatingSoftmaxLauncherHelper(const float* input, const bool* finished, f
 
 #define LAUNCH_SOFTMAX(NUM_EXPERTS, WARPS_PER_TB) \
 topkGatingSoftmaxLauncherHelper<NUM_EXPERTS, WARPS_PER_TB>( \
-gating_output, nullptr, topk_weights, topk_indicies, \
+gating_output, nullptr, topk_weights, topk_indices, \
 token_expert_indices, num_tokens, topk, 0, num_experts, \
 stream);
 
@@ -433,7 +433,7 @@ template <typename IndType>
 void topkGatingSoftmaxKernelLauncher(
 const float* gating_output,
 float* topk_weights,
-IndType* topk_indicies,
+IndType* topk_indices,
 int* token_expert_indices,
 float* softmax_workspace,
 const int num_tokens,
@@ -476,7 +476,7 @@ void topkGatingSoftmaxKernelLauncher(
 moeSoftmax<TPB><<<num_tokens, TPB, 0, stream>>>(
 gating_output, nullptr, softmax_workspace, num_experts);
 moeTopK<TPB><<<num_tokens, TPB, 0, stream>>>(
-softmax_workspace, nullptr, topk_weights, topk_indicies, token_expert_indices,
+softmax_workspace, nullptr, topk_weights, topk_indices, token_expert_indices,
 num_experts, topk, 0, num_experts);
 }
 }
@@ -66,7 +66,7 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, m) {
 
 m.def(
 "moe_permute(Tensor input, Tensor topk_weight, Tensor! topk_ids,"
-"Tensor token_expert_indicies, Tensor? expert_map, int n_expert,"
+"Tensor token_expert_indices, Tensor? expert_map, int n_expert,"
 "int n_local_expert,"
 "int topk, int? align_block_size,Tensor! permuted_input, Tensor! "
 "expert_first_token_offset, Tensor! src_row_id2dst_row_id_map, Tensor! "
@@ -1003,7 +1003,7 @@ struct MacheteCollectiveMma {
 static constexpr int A_CPY_VEC =
 decltype(max_common_vector(tCsA, tCrA_load)){};
 
-static constexpr int COVERSION_WIDTH =
+static constexpr int CONVERSION_WIDTH =
 std::min(A_CPY_VEC, int(size<0>(tCrA_mma)));
 
 auto load_A_to_registers = [&](int read_stage) {
@@ -1026,7 +1026,7 @@ struct MacheteCollectiveMma {
 // PIPELINED MAIN LOOP
 //
 
-auto convert_A = [&, a_vec = Int<COVERSION_WIDTH>{}](int k_block,
+auto convert_A = [&, a_vec = Int<CONVERSION_WIDTH>{}](int k_block,
 int read_stage) {
 load_extra_info_to_registers(partitioned_extra_info,
 copy_partitions_extra_info, k_block,
@@ -320,7 +320,7 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
 // Goal is to bring the activation matrix A to the LDS
 // and use it across the lifetime of the work group
 // TODO: When activation matrix is larger than 64 KB
-// then this is not goint to work!
+// then this is not going to work!
 //----------------------------------------------------
 __shared__ scalar_t s[max_lds_len];
 
@@ -581,7 +581,7 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
 // Goal is to bring the activation matrix A to the LDS
 // and use it across the lifetime of the work group
 // TODO: When activation matrix is larger than 64 KB
-// then this is not goint to work!
+// then this is not going to work!
 //----------------------------------------------------
 __shared__ scalar_t s[max_lds_len];
 
@@ -601,7 +601,7 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
 // int _WvPrGrp = mindiv(N, CuCount * YTILE, WvPrGrp);
 uint32_t m = (blockIdx.x * _WvPrGrp + threadIdx.y) * YTILE;
 
-// Check whether there will be fragmenation!
+// Check whether there will be fragmentation!
 // This will happen only for the last wave!
 if (m < M && (m + YTILE) >= M) {
 uint32_t startColumn = M - YTILE;
@@ -827,7 +827,7 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
 
 m += CuCount * _WvPrGrp * YTILE;
 
-// Check whether there will be fragmenation!
+// Check whether there will be fragmentation!
 // This will happen only for the last wave!
 if (m < M && (m + YTILE) >= M) {
 uint32_t startColumn = M - YTILE;
@@ -882,7 +882,7 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
 // Goal is to bring the activation matrix A to the LDS
 // and use it across the lifetime of the work group
 // TODO: When activation matrix is larger than 64 KB
-// then this is not goint to work!
+// then this is not going to work!
 //----------------------------------------------------
 __shared__ scalar_t s[max_lds_len];
 
@@ -904,7 +904,7 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
 //----------------------------------------------------
 uint32_t m = (blockIdx.x * _WvPrGrp + threadIdx.y) * YTILE;
 
-// Check whether there will be fragmenation!
+// Check whether there will be fragmentation!
 // This will happen only for the last wave!
 if (m < M && (m + YTILE) >= M) {
 uint32_t startColumn = M - YTILE;
@@ -1176,7 +1176,7 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
 m += CuCount * _WvPrGrp * YTILE;
 kBase = 0;
 
-// Check whether there will be fragmenation!
+// Check whether there will be fragmentation!
 // This will happen only for the last wave!
 if (m < M && (m + YTILE) >= M) {
 uint32_t startColumn = M - YTILE;
@@ -277,7 +277,7 @@ CompressorResult cutlass_sparse_compress_sm90(torch::Tensor const& a) {
 uint32_t const m = 1; // Set M to 1 for compression
 uint32_t const n = a.size(1);
 
-// Note: For correctess, the compressed format must be invariant in:
+// Note: For correctness, the compressed format must be invariant in:
 // - M, the flattened number of tokens
 // - Whether output dtype is fp16 or bf16
 // - CUTLASS epilogues
@@ -137,10 +137,6 @@ exclude = [
 'vllm/attention/ops/.*\.py$'
 ]
 
-[tool.codespell]
-ignore-words-list = "dout, te, indicies, subtile, ElementE"
-skip = "tests/models/fixtures/*,tests/prompts/*,benchmarks/sonnet.txt,tests/lora/data/*,build/*,vllm/third_party/*"
-
 [tool.isort]
 skip_glob = [
 ".buildkite/*",
@@ -223,7 +223,7 @@ def test_async_tp_pass_correctness(
 "VLLM_USE_V1": "1",
 }
 
-aysnc_tp_args = [
+async_tp_args = [
 *common_args,
 "--tensor-parallel-size",
 str(tp_size),
@@ -242,7 +242,7 @@ def test_async_tp_pass_correctness(
 ]
 
 compare_two_settings(model_id,
-aysnc_tp_args,
+async_tp_args,
 tp_args,
 async_tp_env,
 tp_env,
@@ -437,7 +437,7 @@ def test_auto_prefix_caching_with_preemption(baseline_llm_generator,
 "enable_prefix_caching": True,
 }])
 @pytest.mark.parametrize("seed", [1])
-def test_auto_prefix_caching_after_evition_start(baseline_llm_generator,
+def test_auto_prefix_caching_after_eviction_start(baseline_llm_generator,
 test_llm_generator):
 """Verify block manager v2 with auto prefix caching could works normal
 even when eviction started.
@@ -33,7 +33,7 @@ BLOCK_SIZE = 16
 @pytest.mark.parametrize("batch_size", [5])
 @pytest.mark.parametrize("seed", [1])
 @pytest.mark.parametrize("backend", ["FLASH_ATTN", "FLASHINFER", "XFORMERS"])
-def test_sliding_window_retrival(baseline_llm_generator, test_llm_generator,
+def test_sliding_window_retrieval(baseline_llm_generator, test_llm_generator,
 batch_size, seed, backend, monkeypatch):
 """
 The test does a bunch of assignments "x1 = 10\nx2 = 33\n..." and then
@@ -100,7 +100,7 @@ def test_sliding_window_retrival(baseline_llm_generator, test_llm_generator,
 def test_sliding_window_chunked_prefill(test_llm_generator, batch_size, seed,
 backend, monkeypatch):
 """
-This is similar to test_sliding_window_retrival, however, it doesn't
+This is similar to test_sliding_window_retrieval, however, it doesn't
 compare against the v1 block manager since v1 doesn't support
 chunked prefill with sliding window.
 
@@ -594,8 +594,8 @@ def test_decode_schedule_preempted():
 # should be preempted. 1 will also be preempted.
 budget = create_token_budget()
 output = scheduler._schedule_running(budget, curr_loras)
-remainig_running = scheduler.running
-assert len(remainig_running) == 0
+remaining_running = scheduler.running
+assert len(remaining_running) == 0
 assert len(output.decode_seq_groups) == 1
 assert len(output.prefill_seq_groups) == 0
 assert output.decode_seq_groups[0].seq_group.request_id == "0"
@@ -16,7 +16,7 @@ chatml_jinja_path = VLLM_PATH / "examples/template_chatml.jinja"
 assert chatml_jinja_path.exists()
 
 # Define models, templates, and their corresponding expected outputs
-MODEL_TEMPLATE_GENERATON_OUTPUT = [
+MODEL_TEMPLATE_GENERATION_OUTPUT = [
 ("facebook/opt-125m", chatml_jinja_path, True, False, """<|im_start|>user
 Hello<|im_end|>
 <|im_start|>assistant
@@ -91,7 +91,7 @@ def test_no_load_chat_template_literallike():
 
 @pytest.mark.parametrize(
 "model,template,add_generation_prompt,continue_final_message,expected_output",
-MODEL_TEMPLATE_GENERATON_OUTPUT)
+MODEL_TEMPLATE_GENERATION_OUTPUT)
 def test_get_gen_prompt(model, template, add_generation_prompt,
 continue_final_message, expected_output):
 model_info = HF_EXAMPLE_MODELS.find_hf_info(model)
@@ -72,8 +72,8 @@ def test_copy_blocks(
 # destination blocks.
 assert 2 * num_mappings <= num_blocks
 src_blocks = random.sample(range(num_blocks), num_mappings)
-remainig_blocks = list(set(range(num_blocks)) - set(src_blocks))
-dst_blocks = random.sample(remainig_blocks, 2 * num_mappings)
+remaining_blocks = list(set(range(num_blocks)) - set(src_blocks))
+dst_blocks = random.sample(remaining_blocks, 2 * num_mappings)
 block_mapping: list[tuple[int, int]] = []
 for i in range(num_mappings):
 src = src_blocks[i]
@@ -189,12 +189,12 @@ def test_reshape_and_cache(
 
 # Run the reference implementation.
 reshaped_key = key.reshape(num_tokens, *key_cache[0, :, :, 0, :].shape)
-block_indicies = torch.div(slot_mapping, block_size, rounding_mode="floor")
-block_indicies_lst = block_indicies.cpu().tolist()
+block_indices = torch.div(slot_mapping, block_size, rounding_mode="floor")
+block_indices_lst = block_indices.cpu().tolist()
 block_offsets = slot_mapping % block_size
 block_offsets_lst = block_offsets.cpu().tolist()
 for i in range(num_tokens):
-block_idx = block_indicies_lst[i]
+block_idx = block_indices_lst[i]
 block_offset = block_offsets_lst[i]
 cloned_key_cache[block_idx, :, :, block_offset, :] = reshaped_key[i]
 cloned_value_cache[block_idx, :, :, block_offset] = value[i]
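The renamed block_indices variables implement the standard paged-KV-cache addressing used by the reference check above: a flat slot index is split into a block index and an offset inside that block. A minimal standalone sketch of that index math, with made-up example values, is:

import torch

block_size = 16
slot_mapping = torch.tensor([0, 17, 35])  # one flat cache slot per token (example values)

block_indices = torch.div(slot_mapping, block_size, rounding_mode="floor")
block_offsets = slot_mapping % block_size

# slot 17 -> block 1, offset 1; slot 35 -> block 2, offset 3
print(list(zip(block_indices.tolist(), block_offsets.tolist())))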
@@ -322,12 +322,12 @@ def test_reshape_and_cache_flash(
 kv_dtype=kv_cache_dtype)
 
 # Run the reference implementation.
-block_indicies = torch.div(slot_mapping, block_size, rounding_mode="floor")
-block_indicies_lst = block_indicies.cpu().tolist()
+block_indices = torch.div(slot_mapping, block_size, rounding_mode="floor")
+block_indices_lst = block_indices.cpu().tolist()
 block_offsets = slot_mapping % block_size
 block_offsets_lst = block_offsets.cpu().tolist()
 for i in range(num_tokens):
-block_idx = block_indicies_lst[i]
+block_idx = block_indices_lst[i]
 block_offset = block_offsets_lst[i]
 if kv_cache_layout == "NHD":
 cloned_key_cache[block_idx, block_offset, :, :] = key[i]
@@ -46,7 +46,7 @@ CUDA_DEVICE = "cuda:0"
 MAX_DEC_SEQ_LENS = [128]
 MAX_ENC_SEQ_LENS = [128]
 
-# Narrow teest-cases for unsupported-scenario
+# Narrow test-cases for unsupported-scenario
 # tests
 HEAD_SIZES_FOR_UNSUPP = [HEAD_SIZES[0]]
 
@@ -39,10 +39,10 @@ def rotary_embedding_opcheck(rot,
 @pytest.mark.parametrize("head_size", [32, 108])
 @pytest.mark.parametrize("seq_len", [11, 1024])
 @pytest.mark.parametrize("use_key", [True, False])
-@pytest.mark.parametrize("head_stride_is_contingous", [True, False])
+@pytest.mark.parametrize("head_stride_is_contiguous", [True, False])
 def test_rotary_embedding_opcheck(dist_init, device, max_position,
 is_neox_style, rotary_dim, head_size,
-seq_len, use_key, head_stride_is_contingous):
+seq_len, use_key, head_stride_is_contiguous):
 batch_size = 1
 base = 10000
 num_heads = 7
@@ -52,7 +52,7 @@ def test_rotary_embedding_opcheck(dist_init, device, max_position,
 positions = torch.randint(0,
 max_position, (batch_size, seq_len),
 device=device)
-head_stride = head_size + (64 if head_stride_is_contingous else 0)
+head_stride = head_size + (64 if head_stride_is_contiguous else 0)
 
 query = torch.randn(batch_size,
 seq_len,
@@ -72,7 +72,7 @@ def test_rotary_embedding_opcheck(dist_init, device, max_position,
 
 # if we have a contiguous head stride, test the alternate
 # [..., num_heads * head_dim] shape/layout
-if head_stride_is_contingous:
+if head_stride_is_contiguous:
 rotary_embedding_opcheck(
 rot, positions, query.flatten(start_dim=-2),
 key.flatten(start_dim=-2) if use_key else None)
@@ -107,7 +107,7 @@ def generate_random_inputs(batch_size,
 return A, dt, X, B, C
 
 
-def generate_continous_batched_examples(example_lens_by_batch,
+def generate_continuous_batched_examples(example_lens_by_batch,
 num_examples,
 full_length,
 last_taken,
@@ -269,10 +269,9 @@ def test_mamba_chunk_scan_cont_batch(d_head, n_heads, seq_len_chunk_size_cases,
 exhausted: dict = {} # map: eg -> boolean indicating example is exhausted
 
 states = None
-for Y_min, cu_seqlens, seq_idx, (A, dt, X, B,
-C) in generate_continous_batched_examples(
-cases, num_examples, seqlen,
-last_taken, exhausted, n_heads,
+for Y_min, cu_seqlens, seq_idx, (
+A, dt, X, B, C) in generate_continuous_batched_examples(
+cases, num_examples, seqlen, last_taken, exhausted, n_heads,
 d_head, itype):
 
 chunk_indices, chunk_offsets = \
@@ -118,7 +118,7 @@ def run_test(
 # default to enforce_eager=True if enforce_eager
 # is left unspecified. However, the
 # VllmRunner test fixture (which wraps around the LLM class) defaults to
-# enforce_eager=False (a behavior which a number of already-exisitng
+# enforce_eager=False (a behavior which a number of already-existing
 # decoder-only unit tests expect), so when testing an encoder/decoder
 # model we must explicitly specify enforce_eager=True in the VllmRunner
 # constructor.
@@ -248,7 +248,7 @@ def test_temperature_zero_target_distribution(seed: int, device: str):
 size=(batch_size, 1),
 dtype=torch.int64)
 # The target probaility distribution is a temperature zero distribution
-# with zero entroy. Since our draft token ids don't match the probability
+# with zero entropy. Since our draft token ids don't match the probability
 # 1.0 tokens in the target distribution we will reject all of them and
 # fallback to the greedy sampling for selecting 1 token for each sequence.
 # Verify the same.
@@ -18,7 +18,7 @@ However, we still need to verify below scenario could be passed:
 * Test greedy equality under various number of speculative tokens.
 
 With those tests, we can say at least, EAGLE would not break the
-correctess for the target model outputs.
+correctness for the target model outputs.
 """
 
 import pytest
@@ -18,7 +18,7 @@ However, we still need to verify below scenario could be passed:
 * Test greedy equality under various number of speculative tokens.
 
 With those tests, we can say at least, Medusa would not break the
-correctess for the target model outputs.
+correctness for the target model outputs.
 """
 
 import pytest
@@ -18,7 +18,7 @@ However, we still need to verify below scenario could be passed:
 * Test greedy equality under various number of speculative tokens.
 
 With those tests, we can say at least, mtp would not break the
-correctess for the target model outputs.
+correctness for the target model outputs.
 """
 
 import pytest
@@ -22,8 +22,8 @@ However, we still need to verify below scenario could be passed:
 * Test greedy equality under preemption
 * Test greedy equality under various ngram sizes / speculative sizes
 
-With those tests, we can say at least, ngram spec would not break the correctess
-for the target model outputs.
+With those tests, we can say at least, ngram spec would not break the
+correctness for the target model outputs.
 """
 
 import pytest
@@ -30,7 +30,7 @@ model_config = {
 ])
 @pytest.mark.parametrize("batch_size", [5])
 @pytest.mark.parametrize("seed", [1])
-def test_sliding_window_retrival(monkeypatch, model, batch_size, seed):
+def test_sliding_window_retrieval(monkeypatch, model, batch_size, seed):
 """
 The test does a bunch of assignments "x1 = 10\nx2 = 33\n..." and then
 asks for value of one of them (which is outside the sliding window).
@@ -7,7 +7,7 @@ from vllm.distributed.kv_transfer.kv_connector.v1.nixl_connector import (
 from .utils import create_request, create_scheduler, create_vllm_config
 
 
-def test_basic_inferface():
+def test_basic_interface():
 """Unit test for basic NixlConnector interface functionality."""
 
 vllm_config = create_vllm_config()
@@ -25,7 +25,7 @@ def test_basic_inferface():
 
 scheduler.add_request(request)
 
-# Remote Prefill, triggers NixlConnectorMetdata.
+# Remote Prefill, triggers NixlConnectorMetadata.
 scheduler_output = scheduler.schedule()
 kv_connector_metadata = scheduler_output.kv_connector_metadata
 assert kv_connector_metadata is not None
@@ -32,7 +32,7 @@ def test_prompt_logprobs_e2e():
 ), f"Expected: {EXPECTED_VALUE} | Measured: {measured_value}"
 
 
-def test_promt_logprobs_e2e_server():
+def test_prompt_logprobs_e2e_server():
 with RemoteOpenAIServer(MODEL, SERVER_ARGS) as remote_server:
 url = f"{remote_server.url_for('v1')}/completions"
 
@@ -209,32 +209,32 @@ def test_multi_step_model_runner_input():
 received_model_input = (StatefulModelInput.from_broadcasted_tensor_dict(
 tensor_dict, attn_backend=attn_backend))
 
-receieved_frozen_input = received_model_input.frozen_model_input
+received_frozen_input = received_model_input.frozen_model_input
 
 # Check that received copy has correct values.
 assert isinstance(received_model_input, StatefulModelInput)
-assert receieved_frozen_input.input_tokens is not None
-assert (receieved_frozen_input.input_tokens ==
+assert received_frozen_input.input_tokens is not None
+assert (received_frozen_input.input_tokens ==
 frozen_model_input.input_tokens).all()
-assert receieved_frozen_input.input_positions is not None
-assert (receieved_frozen_input.input_positions ==
+assert received_frozen_input.input_positions is not None
+assert (received_frozen_input.input_positions ==
 frozen_model_input.input_positions).all()
-assert receieved_frozen_input.multi_modal_kwargs is None
+assert received_frozen_input.multi_modal_kwargs is None
 assert (frozen_model_input.multi_modal_kwargs ==
 frozen_model_input.multi_modal_kwargs)
-assert receieved_frozen_input.lora_requests is None
-assert (receieved_frozen_input.lora_requests ==
+assert received_frozen_input.lora_requests is None
+assert (received_frozen_input.lora_requests ==
 frozen_model_input.lora_requests)
-assert receieved_frozen_input.lora_mapping is None
+assert received_frozen_input.lora_mapping is None
 assert (
-receieved_frozen_input.lora_mapping == frozen_model_input.lora_mapping)
+received_frozen_input.lora_mapping == frozen_model_input.lora_mapping)
 for field in dataclasses.fields(AttentionMetadata):
-assert getattr(receieved_frozen_input.attn_metadata, field.name,
+assert getattr(received_frozen_input.attn_metadata, field.name,
 None) == getattr(attn_metadata, field.name, None)
 # For sampling metadata, only selected_token_indices is copied.
-assert (receieved_frozen_input.sampling_metadata.selected_token_indices ==
+assert (received_frozen_input.sampling_metadata.selected_token_indices ==
 sampling_metadata.selected_token_indices)
-assert receieved_frozen_input.sampling_metadata.seq_groups is None
+assert received_frozen_input.sampling_metadata.seq_groups is None
 
 # check non frozen fields
 assert received_model_input.is_last_step == model_input.is_last_step
@@ -116,7 +116,7 @@ def ReadTargets(log, show_all):
 # If ninja.exe is rudely halted then the .ninja_log file may be
 # corrupt. Silently continue.
 continue
-start, end, _, name, cmdhash = parts # Ignore restat.
+start, end, _, name, cmdhash = parts # Ignore restart.
 # Convert from integral milliseconds to float seconds.
 start = int(start) / 1000.0
 end = int(end) / 1000.0
typos.toml: new file, 179 lines

[files]
# these files may be written in non english words
extend-exclude = ["tests/models/fixtures/*", "tests/prompts/*",
"benchmarks/sonnet.txt", "tests/lora/data/*", "build/*",
"vllm/third_party/*"]
ignore-hidden = true
ignore-files = true
ignore-dot = true
ignore-vcs = true
ignore-global = true
ignore-parent = true

[default]
binary = false
check-filename = false
check-file = true
unicode = true
ignore-hex = true
identifier-leading-digits = false
locale = "en"
extend-ignore-identifiers-re = ["NVML_*", ".*Unc.*", ".*_thw",
".*UE8M0.*", ".*[UE4M3|ue4m3].*", ".*eles.*", ".*fo.*", ".*ba.*",
".*ot.*", ".*[Tt]h[rR].*"]
extend-ignore-words-re = []
extend-ignore-re = []

[default.extend-identifiers]
bbc5b7ede = "bbc5b7ede"
womens_doubles = "womens_doubles"
v_2nd = "v_2nd"
splitted_input = "splitted_input"
NOOPs = "NOOPs"
typ = "typ"
nin_shortcut = "nin_shortcut"
UperNetDecoder = "UperNetDecoder"
subtile = "subtile"
cudaDevAttrMaxSharedMemoryPerBlockOptin = "cudaDevAttrMaxSharedMemoryPerBlockOptin"
SFOuput = "SFOuput"
# huggingface transformers repo uses these words
depthwise_seperable_out_channel = "depthwise_seperable_out_channel"
DepthWiseSeperableConv1d = "DepthWiseSeperableConv1d"
depthwise_seperable_CNN = "depthwise_seperable_CNN"

[default.extend-words]
iy = "iy"
tendencias = "tendencias"
# intel cpu features
tme = "tme"
dout = "dout"
Pn = "Pn"
arange = "arange"

[type.py]
extend-glob = []
extend-ignore-identifiers-re = []
extend-ignore-words-re = []
extend-ignore-re = []

[type.py.extend-identifiers]
arange = "arange"
NDArray = "NDArray"
EOFError = "EOFError"

[type.py.extend-words]

[type.cpp]
extend-glob = []
extend-ignore-identifiers-re = []
extend-ignore-words-re = []
extend-ignore-re = []

[type.cpp.extend-identifiers]
countr_one = "countr_one"

[type.cpp.extend-words]

[type.rust]
extend-glob = []
extend-ignore-identifiers-re = []
extend-ignore-words-re = []
extend-ignore-re = []

[type.rust.extend-identifiers]
flate2 = "flate2"

[type.rust.extend-words]
ser = "ser"

[type.lock]
extend-glob = []
check-file = false
extend-ignore-identifiers-re = []
extend-ignore-words-re = []
extend-ignore-re = []

[type.lock.extend-identifiers]

[type.lock.extend-words]

[type.jl]
extend-glob = []
extend-ignore-identifiers-re = []
extend-ignore-words-re = []
extend-ignore-re = []

[type.jl.extend-identifiers]

[type.jl.extend-words]
modul = "modul"
egals = "egals"
usig = "usig"
egal = "egal"

[type.go]
extend-glob = []
extend-ignore-identifiers-re = []
extend-ignore-words-re = []
extend-ignore-re = []

[type.go.extend-identifiers]
flate = "flate"

[type.go.extend-words]

[type.css]
extend-glob = []
extend-ignore-identifiers-re = []
extend-ignore-words-re = []
extend-ignore-re = []

[type.css.extend-identifiers]
nd = "nd"

[type.css.extend-words]

[type.man]
extend-glob = []
extend-ignore-identifiers-re = []
extend-ignore-words-re = []
extend-ignore-re = []

[type.man.extend-identifiers]
Nd = "Nd"

[type.man.extend-words]

[type.cert]
extend-glob = []
check-file = false
extend-ignore-identifiers-re = []
extend-ignore-words-re = []
extend-ignore-re = []

[type.cert.extend-identifiers]

[type.cert.extend-words]

[type.sh]
extend-glob = []
extend-ignore-identifiers-re = []
extend-ignore-words-re = []
extend-ignore-re = []

[type.sh.extend-identifiers]
stap = "stap"
ot = "ot"

[type.sh.extend-words]

[type.vimscript]
extend-glob = []
extend-ignore-identifiers-re = []
extend-ignore-words-re = []
extend-ignore-re = []

[type.vimscript.extend-identifiers]
windo = "windo"

[type.vimscript.extend-words]
@@ -1550,10 +1550,10 @@ def moe_wna16_gemm(input: torch.Tensor, output: torch.Tensor,
 
 
 def topk_softmax(topk_weights: torch.Tensor, topk_ids: torch.Tensor,
-token_expert_indicies: torch.Tensor,
+token_expert_indices: torch.Tensor,
 gating_output: torch.Tensor) -> None:
-torch.ops._moe_C.topk_softmax(topk_weights, topk_ids,
-token_expert_indicies, gating_output)
+torch.ops._moe_C.topk_softmax(topk_weights, topk_ids, token_expert_indices,
+gating_output)
 
 
 def moe_wna16_marlin_gemm(input: torch.Tensor, output: Optional[torch.Tensor],
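For orientation, the routing step wrapped by topk_softmax above can be approximated in plain PyTorch. The sketch below shows only the general top-k gating-softmax technique (softmax over the expert axis, then top-k); it is not the fused _moe_C kernel, and the shapes and dtypes are illustrative assumptions.

import torch

def topk_softmax_reference(gating_output: torch.Tensor, topk: int):
    """Reference top-k gating softmax: softmax over experts, then top-k.

    gating_output: assumed [num_tokens, num_experts] float logits.
    Returns (topk_weights, topk_ids), each of shape [num_tokens, topk].
    """
    probs = torch.softmax(gating_output, dim=-1)
    topk_weights, topk_ids = torch.topk(probs, k=topk, dim=-1)
    return topk_weights, topk_ids.to(torch.int32)

# Example: route 4 tokens over 8 experts with top-2 selection.
weights, ids = topk_softmax_reference(torch.randn(4, 8), topk=2)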
@@ -373,7 +373,7 @@ class CommonAttentionState(AttentionState):
 f"Expected attn_backend name to be either 'XFORMERS'," \
 f"'ROCM_FLASH', or 'FLASH_ATTN', but " \
 f"got '{self.runner.attn_backend.get_name()}'"
-self._add_additonal_input_buffers_for_enc_dec_model(
+self._add_additional_input_buffers_for_enc_dec_model(
 attn_metadata=attn_metadata, input_buffers=input_buffers)
 return input_buffers
 
@@ -427,7 +427,7 @@ class CommonAttentionState(AttentionState):
 attn_metadata.max_encoder_seq_len = self.runner.max_seq_len_to_capture
 attn_metadata.num_encoder_tokens = 0
 
-def _add_additonal_input_buffers_for_enc_dec_model(
+def _add_additional_input_buffers_for_enc_dec_model(
 self, attn_metadata, input_buffers: Dict[str, Any]):
 """
 Saves additional input buffers specific to the encoder-decoder model
@@ -40,7 +40,7 @@ class Internlm2ToolParser(ToolParser):
 request.skip_special_tokens = False
 return request
 
-def get_argments(self, obj):
+def get_arguments(self, obj):
 if "parameters" in obj:
 return obj.get("parameters")
 elif "arguments" in obj:
@@ -119,9 +119,9 @@ class Internlm2ToolParser(ToolParser):
 # now we know we're on the same tool call and we're streaming
 # arguments
 else:
-prev_arguments = self.get_argments(
+prev_arguments = self.get_arguments(
 self.prev_tool_call_arr[self.current_tool_id])
-cur_arguments = self.get_argments(tool_call_arr)
+cur_arguments = self.get_arguments(tool_call_arr)
 
 # not arguments generated
 if not cur_arguments and not prev_arguments:
@@ -170,7 +170,7 @@ class Internlm2ToolParser(ToolParser):
 # check to see if the name is defined and has been sent. if so,
 # stream the name - otherwise keep waiting
 # finish by setting old and returning None as base case
-tool_call_arr["arguments"] = self.get_argments(tool_call_arr)
+tool_call_arr["arguments"] = self.get_arguments(tool_call_arr)
 self.prev_tool_call_arr = [tool_call_arr]
 return delta
 except Exception:
@@ -1202,7 +1202,7 @@ class LinearScalingRotaryEmbeddingWithLoRA(BaseLayerWithLoRA):
 multiple LoRA adapters with a specialized kernel.
 
 Replace LinearScalingRotaryEmbedding with MultiLinearScalingRotaryEmbedding
-which can handle multi lora adapters in a specialied kernel.
+which can handle multi lora adapters in a specialized kernel.
 """
 
 def __init__(self, base_layer: RotaryEmbedding) -> None:
@@ -68,11 +68,11 @@ def convert_mapping(
 LoRA indices.
 sampler_indices: Tensor of shape [batch_size] mapping requests to
 LoRA indices for sampler. For generation, this will be the
-same as base_indicies. For prefill, this will map requests
+same as base_indices. For prefill, this will map requests
 to LoRA indices.
 sampler_indices_padded: Tensor of shape [batch_size] mapping
 requests to LoRA indices for sampler with padding.
-Same as sampler_indicies, but -1 is replaced with
+Same as sampler_indices, but -1 is replaced with
 max_loras.
 embeddings_indices: Tensor of shape [2, batch_size] mapping
 requests to embedding indices. First row is for embeddings
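To make the corrected docstring concrete, the padding relationship it describes (sampler_indices_padded equals sampler_indices with every -1 replaced by max_loras) can be written out as a small sketch; the tensor values below are illustrative and not taken from vLLM internals.

import torch

max_loras = 4
# -1 marks requests without an active LoRA adapter (example values).
sampler_indices = torch.tensor([0, 2, -1, 1, -1])

sampler_indices_padded = torch.where(
    sampler_indices == -1,
    torch.full_like(sampler_indices, max_loras),
    sampler_indices,
)
# -> tensor([0, 2, 4, 1, 4])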
@@ -319,7 +319,7 @@ class MambaMixer2(CustomOp):
 n_groups == 1, # if there was only one group
 )
 intermediate_settings = (intermediate_size, 0, False)
-head_setings = (self.num_heads, 0, False)
+head_settings = (self.num_heads, 0, False)
 
 # - the weight already has a "weight_loader" attribute
 # which set_weight_attrs will raise if we do not
@@ -372,7 +372,7 @@ class MambaMixer2(CustomOp):
 intermediate_settings,
 group_shard_settings,
 group_shard_settings,
-head_setings, # for dt
+head_settings, # for dt
 ],
 self.tp_size,
 tp_rank,
@@ -516,7 +516,7 @@ def _chunk_state_varlen_kernel(
 offs_n[None, :] * stride_chunk_states_dstate)
 else:
 
-# - this seems repetitve, buts its to help the compiler
+# - this seems repetitive, buts its to help the compiler
 if start_idx < pid_c * chunk_size:
 past_states_ptrs = chunk_states_ptr + (
 offs_m[:, None] * stride_chunk_states_hdim +
@@ -219,7 +219,7 @@ def per_token_group_quant_int8(
 quantized tensor along with the scaling factor used for quantization.
 
 Args:
-x: The input tenosr with ndim >= 2.
+x: The input tensor with ndim >= 2.
 group_size: The group size used for quantization.
 eps: The minimum to avoid dividing zero.
 dtype: The dype of output tensor. Note that only `torch.int8`
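As a companion to the corrected docstring, per-token-group symmetric int8 quantization can be sketched in plain PyTorch. The function below follows the Args listed above (input with ndim >= 2, group_size along the last dimension, eps to avoid dividing by zero) but is only an illustrative reference, not the kernel vLLM actually ships.

import torch

def per_token_group_quant_int8_ref(x: torch.Tensor, group_size: int, eps: float = 1e-10):
    """Illustrative per-token-group int8 quantization (reference sketch)."""
    assert x.ndim >= 2 and x.shape[-1] % group_size == 0
    grouped = x.reshape(*x.shape[:-1], -1, group_size).float()
    # One scale per group: map the group's max absolute value onto the int8 range.
    scales = grouped.abs().amax(dim=-1, keepdim=True).clamp(min=eps) / 127.0
    q = torch.clamp(torch.round(grouped / scales), -128, 127).to(torch.int8)
    return q.reshape_as(x), scales.squeeze(-1)

# Example: quantize a [4, 128] activation with groups of 64 along the last dim.
x_q, x_s = per_token_group_quant_int8_ref(torch.randn(4, 128), group_size=64)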
@@ -401,7 +401,7 @@ class BitsAndBytesModelLoader(BaseModelLoader):
 self.target_modules.append(
 name.replace(rep_name, sub_name))
 # Add original module name even if the module has stacked map,
-# in case model has a mixture of disk-merged and disk-splitted
+# in case model has a mixture of disk-merged and disk-split
 # weights with same last name.
 self.target_modules.append(name)
 
@ -131,7 +131,7 @@ class BaiChuanAttention(nn.Module):
|
|||||||
self.num_heads = (self.total_num_heads //
|
self.num_heads = (self.total_num_heads //
|
||||||
tensor_model_parallel_world_size)
|
tensor_model_parallel_world_size)
|
||||||
self.head_dim = hidden_size // self.total_num_heads
|
self.head_dim = hidden_size // self.total_num_heads
|
||||||
self.postion_embedding = position_embedding
|
self.position_embedding = position_embedding
|
||||||
self.rope_theta = rope_theta
|
self.rope_theta = rope_theta
|
||||||
self.max_position_embeddings = max_position_embeddings
|
self.max_position_embeddings = max_position_embeddings
|
||||||
|
|
||||||
@@ -151,7 +151,7 @@ class BaiChuanAttention(nn.Module):
quant_config=quant_config,
)
# Create the alibi slopes and slice them.
- if self.postion_embedding == "ALIBI":
+ if self.position_embedding == "ALIBI":
tp_rank = get_tensor_model_parallel_rank()
head_start = tp_rank * self.num_heads
head_end = (tp_rank + 1) * self.num_heads
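For context, the ALIBI branch above slices the per-head slopes down to the heads owned by the current tensor-parallel rank. A rough sketch of that slicing, using a hypothetical helper and the simplified power-of-two-head-count slope formula (both are assumptions, not the module itself):

import torch

def slice_alibi_slopes(total_num_heads: int, num_heads: int, tp_rank: int) -> torch.Tensor:
    # Build slopes for all heads, then keep this rank's contiguous slice.
    slopes = torch.tensor(
        [2 ** (-8.0 * (i + 1) / total_num_heads) for i in range(total_num_heads)])
    head_start = tp_rank * num_heads
    head_end = (tp_rank + 1) * num_heads
    return slopes[head_start:head_end]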
@@ -187,7 +187,7 @@ class BaiChuanAttention(nn.Module):
) -> torch.Tensor:
qkv, _ = self.W_pack(hidden_states)
q, k, v = qkv.chunk(chunks=3, dim=-1)
- if self.postion_embedding != "ALIBI":
+ if self.position_embedding != "ALIBI":
q, k = self.rotary_emb(positions, q, k)
attn_output = self.attn(q, k, v)
output, _ = self.o_proj(attn_output)
@@ -344,7 +344,7 @@ class DeepseekVLV2ForCausalLM(nn.Module, SupportsMultiModal, SupportsPP):
self.image_newline = nn.Parameter(
torch.randn(self.projector_config.n_embed) * embed_std)
# This is a typo in original implementation
- self.view_seperator = nn.Parameter(
+ self.view_separator = nn.Parameter(
torch.randn(self.projector_config.n_embed) * embed_std)
else:
raise ValueError(
@@ -549,13 +549,13 @@ class DeepseekVLV2ForCausalLM(nn.Module, SupportsMultiModal, SupportsPP):
if self.global_view_pos == "head":
global_local_features = torch.cat([
global_features,
- self.view_seperator[None, :],
+ self.view_separator[None, :],
local_features,
])
else:
global_local_features = torch.cat([
local_features,
- self.view_seperator[None, :],
+ self.view_separator[None, :],
global_features,
])

@@ -197,7 +197,7 @@ class EAGLE(nn.Module):
return logits

def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]):
- # This implementation is incompitable with https://huggingface.co/yuhuili/EAGLE-LLaMA3-Instruct-8B
+ # This implementation is incompatible with https://huggingface.co/yuhuili/EAGLE-LLaMA3-Instruct-8B
# due to missing lm_head weights and its config being that of a
# Llama model. Here's a compatible version with the same weights:
# https://huggingface.co/abhigoyal/EAGLE-LLaMA3-Instruct-8B-vllm
@@ -634,13 +634,13 @@ class Gemma3ForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP,
kwargs["has_images"] = True
# NOTE(woosuk): Here, we distinguish the sequences by the position id 0.
# This is a HACK. Fix this.
- start_idices = (positions == 0).cpu().nonzero()
+ start_indices = (positions == 0).cpu().nonzero()
- num_seqs = len(start_idices)
+ num_seqs = len(start_indices)
seq_lens = []
for i in range(num_seqs):
- start_idx = start_idices[i].item()
+ start_idx = start_indices[i].item()
if i < num_seqs - 1:
- end_idx = start_idices[i + 1].item()
+ end_idx = start_indices[i + 1].item()
else:
end_idx = len(input_ids)
seq_lens.append(end_idx - start_idx)
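The hack above recovers per-sequence lengths from a packed token buffer by treating every position id of 0 as the start of a new sequence. A compact equivalent, assuming a 1-D `positions` tensor:

import torch

positions = torch.tensor([0, 1, 2, 0, 1, 0, 1, 2, 3])  # three packed sequences
start_indices = (positions == 0).nonzero(as_tuple=False).flatten()
# Sequence lengths are the gaps between consecutive starts, with the tail
# closed off by the total number of tokens.
boundaries = torch.cat([start_indices, torch.tensor([positions.numel()])])
seq_lens = (boundaries[1:] - boundaries[:-1]).tolist()
print(seq_lens)  # [3, 2, 4]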
@@ -52,7 +52,7 @@ class Llama4MoE(nn.Module):
renormalize: bool,
) -> tuple[torch.Tensor, torch.Tensor]:
router_scores, router_indices = fast_topk(gating_output, topk, dim=-1)
- # psuedo-standard is that the router scores are floats
+ # pseudo-standard is that the router scores are floats
router_scores = torch.sigmoid(router_scores.float())
return (router_scores, router_indices.to(torch.int32))

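The routing above selects the top-k gating logits per token and then applies a sigmoid to the selected scores instead of a softmax over all experts. A small sketch with torch.topk standing in for fast_topk (an assumption):

import torch

gating_output = torch.randn(4, 16)  # [tokens, num_experts]
router_scores, router_indices = torch.topk(gating_output, k=1, dim=-1)
router_scores = torch.sigmoid(router_scores.float())  # scores kept as floats
router_indices = router_indices.to(torch.int32)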
@@ -114,9 +114,9 @@ class MixtralMoE(nn.Module):
f"Tensor parallel size {self.tp_size} is greater than "
f"the number of experts {self.num_total_experts}.")
# Split experts equally between ranks
- self.expert_indicies = np.array_split(range(
+ self.expert_indices = np.array_split(range(self.num_total_experts),
- self.num_total_experts), self.tp_size)[self.rank].tolist()
+ self.tp_size)[self.rank].tolist()
- if not self.expert_indicies:
+ if not self.expert_indices:
raise ValueError(
f"Rank {self.rank} has no experts assigned to it.")

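The renamed list above holds the expert ids owned by each tensor-parallel rank; np.array_split distributes the experts as evenly as possible, with earlier ranks taking any remainder. For example:

import numpy as np

num_total_experts, tp_size = 8, 3
per_rank = [np.array_split(range(num_total_experts), tp_size)[r].tolist()
            for r in range(tp_size)]
print(per_rank)  # [[0, 1, 2], [3, 4, 5], [6, 7]]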
@@ -125,7 +125,7 @@ class MixtralMoE(nn.Module):
config.hidden_size,
config.intermediate_size,
quant_config=quant_config)
- if idx in self.expert_indicies else None
+ if idx in self.expert_indices else None
for idx in range(self.num_total_experts)
])
self.gate = ReplicatedLinear(config.hidden_size,
@@ -146,7 +146,7 @@ class MixtralMoE(nn.Module):
routing_weights /= routing_weights.sum(dim=-1, keepdim=True)

final_hidden_states = None
- for expert_idx in self.expert_indicies:
+ for expert_idx in self.expert_indices:
expert_layer = self.experts[expert_idx]
expert_mask = (selected_experts == expert_idx)
expert_weights = (routing_weights * expert_mask).sum(dim=-1,
@@ -283,7 +283,7 @@ class OvisProcessingInfo(BaseProcessingInfo):
def get_image_size_with_most_features(self) -> ImageSize:
height, width = self.get_hf_processor().get_image_size()
hs = self.get_hf_config().visual_tokenizer_config.hidden_stride
- # NOTE(Isotr0py): 9 is `max_partion` hardcoded in original code
+ # NOTE(Isotr0py): 9 is `max_partition` hardcoded in original code
# https://huggingface.co/AIDC-AI/Ovis2-1B/blob/main/modeling_ovis.py#L96
return ImageSize(width=width * hs * 9, height=height * hs * 9)

@@ -145,7 +145,7 @@ class Phi3SmallSelfAttention(nn.Module):
self.num_q_per_kv = self.num_heads // self.num_key_value_heads
if self.tp_size > 1:
assert self.num_key_value_heads % self.tp_size == 0
- self.num_kv_heads_per_partion = max(
+ self.num_kv_heads_per_partition = max(
1, self.num_key_value_heads // self.tp_size)
self.num_heads_per_partition = self.num_heads // self.tp_size

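The renamed field above is the number of KV heads served by each tensor-parallel partition; the arithmetic is an even split with a floor of one. With example values (assumed, for illustration only):

num_heads, num_key_value_heads, tp_size = 32, 8, 4
assert num_key_value_heads % tp_size == 0
num_kv_heads_per_partition = max(1, num_key_value_heads // tp_size)  # 2
num_heads_per_partition = num_heads // tp_size                       # 8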
@@ -212,7 +212,7 @@ class Phi3SmallSelfAttention(nn.Module):
bs_params = {
'max_seqlen': self.max_position_embeddings,
'num_heads': self.num_heads_per_partition,
- "num_kv_heads": self.num_kv_heads_per_partion,
+ "num_kv_heads": self.num_kv_heads_per_partition,
"block_size": self.sparse_block_size,
"local_blocks": self.local_blocks,
"vert_stride": self.vert_stride,
@@ -222,7 +222,7 @@ class Phi3SmallSelfAttention(nn.Module):
self.attn = Attention(self.num_heads_per_partition,
self.head_dim,
self.scale,
- num_kv_heads=self.num_kv_heads_per_partion,
+ num_kv_heads=self.num_kv_heads_per_partition,
cache_config=cache_config,
quant_config=quant_config,
blocksparse_params=bs_params,
@@ -243,8 +243,8 @@ class Phi3SmallSelfAttention(nn.Module):
# NOTE: this is required by RotaryEmbed, which indeed does not have to
# TODO: allow 3D QK for rotary forward
q = q.reshape(-1, self.head_dim * self.num_heads_per_partition)
- k = k.reshape(-1, self.head_dim * self.num_kv_heads_per_partion)
+ k = k.reshape(-1, self.head_dim * self.num_kv_heads_per_partition)
- v = v.reshape(-1, self.head_dim * self.num_kv_heads_per_partion)
+ v = v.reshape(-1, self.head_dim * self.num_kv_heads_per_partition)

q, k = self.rotary_emb(positions, q, k)
attn_output = self.attn(q, k, v)
@@ -126,7 +126,7 @@ class ConformerEncoderLayer(nn.Module):
(Multi-Head Attention),
1 = typical Multi-Head Attention,
1 < attn_group_sizes < attention_heads = Grouped-Query Attention
- attn_group_sizes = attenion_heads = Multi-Query Attention
+ attn_group_sizes = attention_heads = Multi-Query Attention
"""

def __init__(
@@ -318,7 +318,7 @@ class TransformerEncoderBase(abc.ABC, nn.Module):
1 = typical Multi-Head Attention,
1 < attention_group_size < attention_heads = Grouped-Query
Attention
- attention_group_size = attenion_heads = Multi-Query Attention
+ attention_group_size = attention_heads = Multi-Query Attention
"""

def __init__(
@@ -744,7 +744,7 @@ class ConformerEncoder(TransformerEncoderBase):
1 = typical Multi-Head Attention,
1 < attention_group_size < attention_heads = Grouped-Query
Attention
- attention_group_size = attenion_heads = Multi-Query Attention
+ attention_group_size = attention_heads = Multi-Query Attention
"""

extra_multi_layer_output_idxs: list[int]
@@ -147,15 +147,15 @@ class mp(torch.autograd.Function):

grad_at_output = grad_at_output * multiplier

- grad_at_scores_expaned = masked_gates * grad_at_output.mul(-1)
+ grad_at_scores_expanded = masked_gates * grad_at_output.mul(-1)
- grad_at_scores_expaned.scatter_add_(
+ grad_at_scores_expanded.scatter_add_(
dim=-1,
index=selected_experts,
src=grad_at_output,
)

return (
- grad_at_scores_expaned,
+ grad_at_scores_expanded,
None,
None,
None,
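The backward pass above accumulates the incoming gradient back into the full expert dimension with scatter_add_. In isolation, the indexing pattern looks like this (shapes and example values are assumptions):

import torch

num_tokens, num_experts, top_k = 4, 8, 2
grad_at_scores_expanded = torch.zeros(num_tokens, num_experts)
selected_experts = torch.randint(0, num_experts, (num_tokens, top_k))
grad_at_output = torch.randn(num_tokens, top_k)
# Add each selected expert's gradient into its column of the full score tensor.
grad_at_scores_expanded.scatter_add_(dim=-1, index=selected_experts, src=grad_at_output)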
@@ -324,7 +324,7 @@ def merge_and_sort_multimodal_metadata(
Returns:
list[str]: List of item modalities in order of their positions in the
input sequence.
- list[PlaceholderRange]: Sorted list of all PlaceholdeRanges from
+ list[PlaceholderRange]: Sorted list of all PlaceholderRanges from
mm_positions.
Optional[list[str]]: Sorted list of all hashes from mm_hashes if given,
None otherwise.
@@ -68,7 +68,7 @@ class OvisProcessor(ProcessorMixin):
"""

attributes = ["image_processor", "tokenizer"]
- valid_kwargs = ["chat_template", "image_pad_token", "image_segement_len"]
+ valid_kwargs = ["chat_template", "image_pad_token", "image_segment_len"]

image_processor_class = "AutoImageProcessor"
tokenizer_class = "AutoTokenizer"
@@ -886,7 +886,7 @@ class HPUModelRunnerBase(ModelRunnerBase[TModelInputForHPU]):
num_decode_tokens=0,
slot_mapping=slot_mapping,
multi_modal_placeholder_index_maps=
- None,  # FIXME(kzawora): mutli-modality will not work here
+ None,  # FIXME(kzawora): multi-modality will not work here
enable_kv_scales_calculation=False,
)
multi_modal_kwargs = MultiModalKwargs.batch(multi_modal_kwargs_list)
@@ -277,7 +277,7 @@ class StatefulModelInput(BroadcastableModelInput):
assert fmi.input_tokens.shape[0] >= self.num_seqs
fmi_new_input_tokens: torch.Tensor = fmi.input_tokens[:self.num_seqs]

- # Update frozen_model_input::input_positons.
+ # Update frozen_model_input::input_positions.
assert fmi.input_positions is not None
assert fmi.input_positions.shape[0] >= self.num_seqs
fmi_new_input_positions: torch.Tensor = fmi.input_positions[:self.
@@ -798,9 +798,9 @@ class ModelWrapper(nn.Module):
"""
batch_size, seq_len = token_ids.shape
# Calculate the positions to sample from.
- start_indicies = torch.arange(
+ start_indices = torch.arange(
batch_size, dtype=torch.int32, device=input_lens.device) * seq_len
- logits_indices = start_indicies + input_lens - 1
+ logits_indices = start_indices + input_lens - 1
attn_metadata = get_forward_context().attn_metadata

# FIXME(woosuk): This is a temporary hack to avoid using the existing
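The renamed start_indices above mark where each padded sequence begins in the flattened token buffer, and the sampled position is the last real token of each sequence. A worked example with assumed sizes:

import torch

batch_size, seq_len = 3, 8
input_lens = torch.tensor([5, 8, 2], dtype=torch.int32)
start_indices = torch.arange(batch_size, dtype=torch.int32) * seq_len
logits_indices = start_indices + input_lens - 1
print(logits_indices.tolist())  # [4, 15, 17]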
@@ -822,14 +822,14 @@ class ModelWrapper(nn.Module):
num_kv_heads, num_blocks, block_size, _ = kv_caches[0][0].shape
slot_mapping = attn_metadata.slot_mapping
slot_mapping = slot_mapping.flatten()
- head_indicies = torch.arange(0,
+ head_indices = torch.arange(0,
num_kv_heads,
device=slot_mapping.device,
dtype=slot_mapping.dtype)
- head_indicies *= block_size * num_blocks
+ head_indices *= block_size * num_blocks
slot_mapping = slot_mapping.repeat_interleave(num_kv_heads).view(
-1, num_kv_heads)
- slot_mapping = slot_mapping + head_indicies.view(1, -1)
+ slot_mapping = slot_mapping + head_indices.view(1, -1)
slot_mapping = slot_mapping.flatten()
attn_metadata.slot_mapping = slot_mapping

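The renamed head_indices above offset the flat slot mapping so that each KV head writes into its own region of the cache. A shape-level sketch with small assumed sizes:

import torch

num_kv_heads, num_blocks, block_size = 2, 4, 16
slot_mapping = torch.tensor([0, 1, 2, 3])  # one slot per token
head_indices = torch.arange(num_kv_heads) * block_size * num_blocks  # [0, 64]
# Repeat each token's slot once per KV head, then shift by the head offsets.
slot_mapping = slot_mapping.repeat_interleave(num_kv_heads).view(-1, num_kv_heads)
slot_mapping = (slot_mapping + head_indices.view(1, -1)).flatten()
print(slot_mapping.tolist())  # [0, 64, 1, 65, 2, 66, 3, 67]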