From 2bb4435cb7e2e2317b0f20803347690fb38fe6b4 Mon Sep 17 00:00:00 2001
From: Didier Durand <2927957+didier-durand@users.noreply.github.com>
Date: Sat, 15 Nov 2025 20:27:50 +0100
Subject: [PATCH] [Doc]: fix typos in various files (#28567)

Signed-off-by: Didier Durand
---
 docs/design/moe_kernel_features.md     | 2 +-
 docs/features/quantization/quark.md    | 2 +-
 vllm/compilation/compiler_interface.py | 2 +-
 vllm/compilation/decorators.py         | 4 ++--
 vllm/v1/worker/gpu_model_runner.py     | 2 +-
 5 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/docs/design/moe_kernel_features.md b/docs/design/moe_kernel_features.md
index ee224e6922fbd..7663b82266f0b 100644
--- a/docs/design/moe_kernel_features.md
+++ b/docs/design/moe_kernel_features.md
@@ -68,7 +68,7 @@ Modular kernels are supported by the following `FusedMoEMethodBase` classes.
 
 ## Fused MoE Experts Kernels
 
-The are a number of MoE experts kernel implementations for different quantization types and architectures. Most follow the general API of the base Triton [`fused_experts`][vllm.model_executor.layers.fused_moe.fused_moe.fused_experts] function. Many have modular kernel adatpers so they can be used with compatible all2all backends. This table lists each experts kernel and its particular properties.
+The are a number of MoE experts kernel implementations for different quantization types and architectures. Most follow the general API of the base Triton [`fused_experts`][vllm.model_executor.layers.fused_moe.fused_moe.fused_experts] function. Many have modular kernel adapters so they can be used with compatible all2all backends. This table lists each experts kernel and its particular properties.
 
 Each kernel must be provided with one of the supported input activation formats. Some flavors of kernels support both standard and batched formats through different entry points, e.g. `TritonExperts` and `BatchedTritonExperts`. Batched format kernels are currently only needed for matching with certain all2all backends, e.g. `pplx`, `DeepEPLLPrepareAndFinalize`.
 
diff --git a/docs/features/quantization/quark.md b/docs/features/quantization/quark.md
index be0702f4c9e16..bd7bc186e13aa 100644
--- a/docs/features/quantization/quark.md
+++ b/docs/features/quantization/quark.md
@@ -298,7 +298,7 @@ There are two steps to generate and deploy a mixed precision model quantized wit
 
 Firstly, the layerwise mixed-precision configuration for a given LLM model is searched and then quantized using AMD Quark. We will provide a detailed tutorial with Quark APIs later.
 
-As examples, we provide some ready-to-use quantized mixed precision model to show the usage in vLLM and the accuracy benifits. They are:
+As examples, we provide some ready-to-use quantized mixed precision model to show the usage in vLLM and the accuracy benefits. They are:
 
 - amd/Llama-2-70b-chat-hf-WMXFP4FP8-AMXFP4FP8-AMP-KVFP8
 - amd/Mixtral-8x7B-Instruct-v0.1-WMXFP4FP8-AMXFP4FP8-AMP-KVFP8
diff --git a/vllm/compilation/compiler_interface.py b/vllm/compilation/compiler_interface.py
index b0cdb08884a3b..11cf0f85c1787 100644
--- a/vllm/compilation/compiler_interface.py
+++ b/vllm/compilation/compiler_interface.py
@@ -299,7 +299,7 @@ class InductorAdaptor(CompilerInterface):
         self.base_cache_dir = cache_dir[: -len(prefix)] if prefix else cache_dir
         if disable_cache:
             return
-        # redirect the cache directory to a sub-directory
+        # redirect the cache directory to a subdirectory
         # set flags so that Inductor and Triton store their cache
         # in the cache_dir, then users only need to copy the cache_dir
         # to another machine to reuse the cache.
diff --git a/vllm/compilation/decorators.py b/vllm/compilation/decorators.py
index e325bca73abb0..11a18c0e6bb78 100644
--- a/vllm/compilation/decorators.py
+++ b/vllm/compilation/decorators.py
@@ -159,7 +159,7 @@ def support_torch_compile(
     `mark_unbacked_dims` is a dictionary that maps argument names with a dynamic
     dim to be decorated with `mark_unbacked`. This is useful if we would like to
-    enforce that dynamo do not specialize on 0/1 values in the case of dummy input
+    enforce that dynamo does not specialize on 0/1 values in the case of dummy input
     such as for vision model compilation
@@ -483,7 +483,7 @@ def maybe_use_cudagraph_partition_wrapper(vllm_config: VllmConfig):
     Context manager to set/unset customized cudagraph partition wrappers.
 
     If we're using Inductor-based graph partitioning, we currently have the
-    whole `fx.Graph` before Inductor lowering and and the piecewise
+    whole `fx.Graph` before Inductor lowering and the piecewise
     splitting happens after all graph passes and fusions. Here, we add
     a custom hook for Inductor to wrap each partition with our static
     graph wrapper class to maintain more control over static graph
diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py
index d0d6164180e66..6590ca54af682 100644
--- a/vllm/v1/worker/gpu_model_runner.py
+++ b/vllm/v1/worker/gpu_model_runner.py
@@ -2871,7 +2871,7 @@ class GPUModelRunner(
             "gpu_model_runner: set_async_sampled_token_ids"
         ):
             # Save ref of sampled_token_ids CPU tensor if the batch contains
-            # any requests with sampling params that that require output ids.
+            # any requests with sampling params that require output ids.
             self.input_batch.set_async_sampled_token_ids(
                 async_output.sampled_token_ids_cpu,
                 async_output.async_copy_ready_event,