From 1bd007f23476d98caeb0a62c00384d7f2cf052a6 Mon Sep 17 00:00:00 2001
From: co63oc
Date: Wed, 3 Sep 2025 11:44:50 +0800
Subject: [PATCH] fix some typos (#24071)

Signed-off-by: co63oc
---
 benchmarks/benchmark_block_pool.py                 |  2 +-
 benchmarks/benchmark_ngram_proposer.py             |  2 +-
 csrc/quantization/cutlass_w4a8/w4a8_mm_entry.cu    |  2 +-
 docs/configuration/optimization.md                 |  4 ++--
 docs/design/io_processor_plugins.md                |  2 +-
 .../prithvi_geospatial_mae_io_processor.py         |  2 +-
 examples/online_serving/prithvi_geospatial_mae.py  |  2 +-
 tests/compile/piecewise/test_multiple_graphs.py    |  2 +-
 tests/kernels/moe/test_mxfp4_moe.py                |  2 +-
 tests/models/multimodal/processing/test_mllama4.py |  2 +-
 tests/quantization/test_modelopt.py                |  2 +-
 tests/samplers/test_beam_search.py                 |  2 +-
 tests/v1/attention/test_chunked_local_attention.py |  2 +-
 .../unit/test_shared_storage_connector.py          | 14 +++++++-------
 tests/v1/logits_processors/test_custom_offline.py  |  2 +-
 vllm/benchmarks/serve.py                           |  2 +-
 vllm/config/compilation.py                         |  2 +-
 vllm/config/parallel.py                            |  2 +-
 .../kv_transfer/kv_connector/v1/nixl_connector.py  |  2 +-
 vllm/entrypoints/openai/serving_responses.py       |  2 +-
 vllm/model_executor/layers/activation.py           |  2 +-
 .../compressed_tensors/transform/module.py         |  2 +-
 vllm/model_executor/layers/quantization/mxfp4.py   |  2 +-
 vllm/model_executor/models/gemma3n_mm.py           |  2 +-
 vllm/model_executor/models/interns1.py             |  2 +-
 vllm/third_party/pynvml.py                         |  2 +-
 vllm/v1/attention/backends/flash_attn.py           |  2 +-
 vllm/v1/attention/backends/flashinfer.py           |  2 +-
 vllm/v1/core/kv_cache_utils.py                     |  2 +-
 vllm/v1/worker/gpu_input_batch.py                  |  2 +-
 vllm/v1/worker/gpu_model_runner.py                 |  2 +-
 vllm/v1/worker/kv_connector_model_runner_mixin.py  |  2 +-
 32 files changed, 39 insertions(+), 39 deletions(-)

diff --git a/benchmarks/benchmark_block_pool.py b/benchmarks/benchmark_block_pool.py
index fd363c2ad0514..eae8d9927ea39 100644
--- a/benchmarks/benchmark_block_pool.py
+++ b/benchmarks/benchmark_block_pool.py
@@ -57,7 +57,7 @@ def invoke_main() -> None:
         "--num-iteration",
         type=int,
         default=1000,
-        help="Number of iterations to run to stablize final data readings",
+        help="Number of iterations to run to stabilize final data readings",
     )
     parser.add_argument(
         "--allocate-blocks",
diff --git a/benchmarks/benchmark_ngram_proposer.py b/benchmarks/benchmark_ngram_proposer.py
index c60040d05ab7a..11833fa1b3c8b 100644
--- a/benchmarks/benchmark_ngram_proposer.py
+++ b/benchmarks/benchmark_ngram_proposer.py
@@ -77,7 +77,7 @@ def invoke_main() -> None:
         "--num-iteration",
         type=int,
         default=100,
-        help="Number of iterations to run to stablize final data readings",
+        help="Number of iterations to run to stabilize final data readings",
     )
     parser.add_argument(
         "--num-req", type=int, default=128, help="Number of requests in the batch"
diff --git a/csrc/quantization/cutlass_w4a8/w4a8_mm_entry.cu b/csrc/quantization/cutlass_w4a8/w4a8_mm_entry.cu
index fdac47c425d61..d7efb717a9a76 100644
--- a/csrc/quantization/cutlass_w4a8/w4a8_mm_entry.cu
+++ b/csrc/quantization/cutlass_w4a8/w4a8_mm_entry.cu
@@ -181,7 +181,7 @@ struct W4A8GemmKernel {
     auto A_ptr = static_cast(A.const_data_ptr());
     auto B_ptr = static_cast(B.const_data_ptr());
     auto D_ptr = static_cast(D.data_ptr());
-    // can we avoid harcode the 8 here
+    // can we avoid hardcode the 8 here
     auto S_ptr =
         static_cast const*>(
             group_scales.const_data_ptr());
diff --git a/docs/configuration/optimization.md b/docs/configuration/optimization.md
index 0ab2ae58ad861..c853fcf92941e 100644
--- a/docs/configuration/optimization.md
+++ b/docs/configuration/optimization.md
@@ -210,7 +210,7 @@ vllm serve Qwen/Qwen2.5-VL-3B-Instruct --api-server-count 4 -dp 2

 !!! note
     API server scale-out disables [multi-modal IPC caching](#ipc-caching)
-    because it requires a one-to-one correspondance between API and engine core processes.
+    because it requires a one-to-one correspondence between API and engine core processes.

     This does not impact [multi-modal processor caching](#processor-caching).

@@ -227,7 +227,7 @@ to avoid repeatedly processing the same multi-modal inputs in `BaseMultiModalPro
 ### IPC Caching

 Multi-modal IPC caching is automatically enabled when
-there is a one-to-one correspondance between API (`P0`) and engine core (`P1`) processes,
+there is a one-to-one correspondence between API (`P0`) and engine core (`P1`) processes,
 to avoid repeatedly transferring the same multi-modal inputs between them.

 ### Configuration
diff --git a/docs/design/io_processor_plugins.md b/docs/design/io_processor_plugins.md
index ee474b5a7b997..e70ee4a076e54 100644
--- a/docs/design/io_processor_plugins.md
+++ b/docs/design/io_processor_plugins.md
@@ -2,7 +2,7 @@

 IO Processor plugins are a feature that allows pre and post processing of the model input and output for pooling models. The idea is that users are allowed to pass a custom input to vLLM that is converted into one or more model prompts and fed to the model `encode` method. One potential use-case of such plugins is that of using vLLM for generating multi-modal data. Say users feed an image to vLLM and get an image in output.

-When performing an inference with IO Processor plugins, the prompt type is defined by the plugin and the same is valid for the final request output. vLLM does not perform any validation of input/output data, and it is up to the plugin to ensure the correct data is being fed to the model and returned to the user. As of now these plugins support only pooling models and can be triggerd via the `encode` method in `LLM` and `AsyncLLM`, or in online serving mode via the `/pooling` endpoint.
+When performing an inference with IO Processor plugins, the prompt type is defined by the plugin and the same is valid for the final request output. vLLM does not perform any validation of input/output data, and it is up to the plugin to ensure the correct data is being fed to the model and returned to the user. As of now these plugins support only pooling models and can be triggered via the `encode` method in `LLM` and `AsyncLLM`, or in online serving mode via the `/pooling` endpoint.

 ## Writing an IO Processor Plugin

diff --git a/examples/offline_inference/prithvi_geospatial_mae_io_processor.py b/examples/offline_inference/prithvi_geospatial_mae_io_processor.py
index 8023cd6677762..adc27859a1cdd 100644
--- a/examples/offline_inference/prithvi_geospatial_mae_io_processor.py
+++ b/examples/offline_inference/prithvi_geospatial_mae_io_processor.py
@@ -12,7 +12,7 @@ from vllm.pooling_params import PoolingParams
 # multimodal data. In this specific case this example will take a geotiff
 # image as input, process it using the multimodal data processor, and
 # perform inference.
-# Reuirement - install plugin at:
+# Requirement - install plugin at:
 # https://github.com/christian-pinto/prithvi_io_processor_plugin

diff --git a/examples/online_serving/prithvi_geospatial_mae.py b/examples/online_serving/prithvi_geospatial_mae.py
index 31301e0042cf4..359162c470f08 100644
--- a/examples/online_serving/prithvi_geospatial_mae.py
+++ b/examples/online_serving/prithvi_geospatial_mae.py
@@ -10,7 +10,7 @@ import requests
 # multimodal data. In this specific case this example will take a geotiff
 # image as input, process it using the multimodal data processor, and
 # perform inference.
-# Reuirements :
+# Requirements :
 # - install plugin at:
 #   https://github.com/christian-pinto/prithvi_io_processor_plugin
 # - start vllm in serving mode with the below args
diff --git a/tests/compile/piecewise/test_multiple_graphs.py b/tests/compile/piecewise/test_multiple_graphs.py
index f5e2d9ddb7528..aee2acbd490ee 100644
--- a/tests/compile/piecewise/test_multiple_graphs.py
+++ b/tests/compile/piecewise/test_multiple_graphs.py
@@ -134,7 +134,7 @@ class SimpleModelWithTwoGraphs(ParentModel):
         # Test will fail without set_model_tag here with error:
         # "ValueError: too many values to unpack (expected 3)"
         # This is because CompiledAttention and CompiledAttentionTwo
-        # have different implmentations but the same torch.compile
+        # have different implementations but the same torch.compile
         # cache dir will be used as default prefix is 'model_tag'
         with set_model_tag("attn_one"):
             self.attn_one = CompiledAttention(
diff --git a/tests/kernels/moe/test_mxfp4_moe.py b/tests/kernels/moe/test_mxfp4_moe.py
index 7bd1ffce58e96..c29bed3dd6b32 100644
--- a/tests/kernels/moe/test_mxfp4_moe.py
+++ b/tests/kernels/moe/test_mxfp4_moe.py
@@ -224,7 +224,7 @@ def tg_mxfp4_moe(
     assert (w2_bias.dim() == 2 and w2_bias.shape[0] == num_experts
             and w2_bias.shape[1] == hidden_size)

-    # Swap w1 and w3 as the defenition of
+    # Swap w1 and w3 as the definition of
     # swiglu is different in the trtllm-gen
     w13_weight_scale_ = w13_weight_scale.clone()
     w13_weight_ = w13_weight.clone()
diff --git a/tests/models/multimodal/processing/test_mllama4.py b/tests/models/multimodal/processing/test_mllama4.py
index 3be77b5da63f2..e7b28ff8ec7f0 100644
--- a/tests/models/multimodal/processing/test_mllama4.py
+++ b/tests/models/multimodal/processing/test_mllama4.py
@@ -52,7 +52,7 @@ def test_profiling(model_id: str, max_model_len: int):
     chunks_per_image = prod(mm_data["patches_per_image"])
     total_num_patches = chunks_per_image * tokens_per_patch
     num_tiles = mm_data["aspect_ratios"][0][0] * mm_data["aspect_ratios"][0][
-        1]  # x-y seperator tokens
+        1]  # x-y separator tokens
     total_tokens = total_num_patches.item() + num_tiles.item(
     ) + 3  # image start, image, image end

diff --git a/tests/quantization/test_modelopt.py b/tests/quantization/test_modelopt.py
index fcbfa681d75c9..c60a03f44baec 100644
--- a/tests/quantization/test_modelopt.py
+++ b/tests/quantization/test_modelopt.py
@@ -27,7 +27,7 @@ def use_v0_only(monkeypatch):
                     reason="ModelOpt FP8 is not supported on this GPU type.")
 def test_modelopt_fp8_checkpoint_setup(vllm_runner):
     """Test ModelOpt FP8 checkpoint loading and structure validation."""
-    # TODO: provide a small publically available test checkpoint
+    # TODO: provide a small publicly available test checkpoint
     model_path = ("/home/scratch.omniml_data_1/zhiyu/ckpts/test_ckpts/"
                   "TinyLlama-1.1B-Chat-v1.0-fp8-0710")

diff --git a/tests/samplers/test_beam_search.py b/tests/samplers/test_beam_search.py
index cc9a88a255f9f..0320a5ef31a65 100644
--- a/tests/samplers/test_beam_search.py
+++ b/tests/samplers/test_beam_search.py
@@ -82,7 +82,7 @@ def test_beam_search_with_concurrency_limit(
     beam_width: int,
 ) -> None:
     # example_prompts[1]&[3]&[7] fails due to unknown reason even without
-    # concurency limit. skip them for now.
+    # concurrency limit. skip them for now.
     example_prompts = (example_prompts[:8])
     concurrency_limit = 2
     assert len(example_prompts) > concurrency_limit
diff --git a/tests/v1/attention/test_chunked_local_attention.py b/tests/v1/attention/test_chunked_local_attention.py
index 8c5a63653db9f..be77256a0d2f0 100644
--- a/tests/v1/attention/test_chunked_local_attention.py
+++ b/tests/v1/attention/test_chunked_local_attention.py
@@ -160,7 +160,7 @@ def test_local_attention_virtual_batches(test_data: LocalAttentionTestData):
         # Use torch.arange instead of torch.randint so we can assert on
         # block table tensor values. The block table will have shape
        # (num_batches, cdiv(max_seq_len, block_size)) and the values will be
-        # aranged from 0 to cdiv(max_seq_len, block_size)-1
+        # arranged from 0 to cdiv(max_seq_len, block_size)-1
         arange_block_indices=True,
     )

diff --git a/tests/v1/kv_connector/unit/test_shared_storage_connector.py b/tests/v1/kv_connector/unit/test_shared_storage_connector.py
index db203b81f15fc..6be261e45cb00 100644
--- a/tests/v1/kv_connector/unit/test_shared_storage_connector.py
+++ b/tests/v1/kv_connector/unit/test_shared_storage_connector.py
@@ -33,7 +33,7 @@ def _check_path_len(path):


 def _list_path(path):
-    """Return the list of foldername (hashes generatd) under the path"""
+    """Return the list of foldername (hashes generated) under the path"""
     return list(path.iterdir())


@@ -41,7 +41,7 @@ def run_test(tmp_path, processor, llm: LLM, question: str,
              image_urls: list[Image], expected_len: int, info: str):
     """
     One individual test to process the prompt and output base on 1 set of input
-    Then check if the length in the strorage path matches the expected length
+    Then check if the length in the storage path matches the expected length
     `info` introduces details or purpose of the individual test
     """
     print(f"***info: {info}***")
@@ -115,7 +115,7 @@ def test_shared_storage_connector_hashes(tmp_path):
     """
     Tests that SharedStorageConnector saves KV to the storage locations
     with proper hashes; that are unique for inputs with identical text but
-    differnt images (same size), or same multiple images but different orders.
+    different images (same size), or same multiple images but different orders.
     """
     # Using tmp_path as the storage path to store KV
     print(f"KV storage path at: {str(tmp_path)}")
@@ -171,12 +171,12 @@ def test_shared_storage_connector_hashes(tmp_path):
                   img=[image_1],
                   expected_len=2,
                   info=("image_1 single input the 2nd time. "
-                        "It should not form aother new hash.")),
+                        "It should not form another new hash.")),
         InputCase(text=TEXT_PROMPTS[0],
                   img=[image_2],
                   expected_len=2,
                   info=("image_2 single input the 2nd time. "
-                        "It should not form aother new hash.")),
+                        "It should not form another new hash.")),
         InputCase(text=TEXT_PROMPTS[0],
                   img=[image_1, image_2],
                   expected_len=3,
@@ -189,12 +189,12 @@ def test_shared_storage_connector_hashes(tmp_path):
                   img=[image_1, image_2],
                   expected_len=4,
                   info=("[image_1, image_2] input the 2nd time. "
-                        "It should not form aother new hash.")),
+                        "It should not form another new hash.")),
         InputCase(text=TEXT_PROMPTS[0],
                   img=[image_2, image_1],
                   expected_len=4,
                   info=("[image_2, image_1] input the 2nd time. "
-                        "It should not form aother new hash.")),
+                        "It should not form another new hash.")),
         InputCase(text=TEXT_PROMPTS[0],
                   img=[],
                   expected_len=5,
diff --git a/tests/v1/logits_processors/test_custom_offline.py b/tests/v1/logits_processors/test_custom_offline.py
index 97d96b129ae90..891f55a14633b 100644
--- a/tests/v1/logits_processors/test_custom_offline.py
+++ b/tests/v1/logits_processors/test_custom_offline.py
@@ -81,7 +81,7 @@ def _run_test(kwargs: dict, logitproc_loaded: bool) -> None:
             target_token = params.extra_args[DUMMY_LOGITPROC_ARG]
             if not all(x == target_token for x in lp_toks):
                 raise AssertionError(
-                    f"Request {bdx} generated {lp_toks}, shoud all be "
+                    f"Request {bdx} generated {lp_toks}, should all be "
                     f"{target_token}")
         else:
             # This request does not exercise custom logitproc (or custom
diff --git a/vllm/benchmarks/serve.py b/vllm/benchmarks/serve.py
index abb838316cd31..a98eb2a78f103 100644
--- a/vllm/benchmarks/serve.py
+++ b/vllm/benchmarks/serve.py
@@ -189,7 +189,7 @@ async def get_request(
         # NOTE: If we simply accumulate the random delta values
         # from the gamma distribution, their sum would have 1-2% gap
         # from target_total_delay_s. The purpose of the following logic is to
-        # close the gap for stablizing the throughput data
+        # close the gap for stabilizing the throughput data
         # from different random seeds.
         target_total_delay_s = total_requests / request_rate
         normalize_factor = target_total_delay_s / delay_ts[-1]
diff --git a/vllm/config/compilation.py b/vllm/config/compilation.py
index 5c3b220016360..28ad3d2f535d3 100644
--- a/vllm/config/compilation.py
+++ b/vllm/config/compilation.py
@@ -234,7 +234,7 @@ class CompilationConfig:
    - FULL_AND_PIECEWISE.

    PIECEWISE mode build piecewise cudagraph only, keeping the cudagraph
-    incompatiable ops (i.e. some attention ops) outside the cudagraph
+    incompatible ops (i.e. some attention ops) outside the cudagraph
    for general flexibility.
    This is the default mode.

diff --git a/vllm/config/parallel.py b/vllm/config/parallel.py
index 9ea883d4a03cd..9d4594bab3c17 100644
--- a/vllm/config/parallel.py
+++ b/vllm/config/parallel.py
@@ -87,7 +87,7 @@ class ParallelConfig:
     data_parallel_external_lb: bool = False
     """Whether to use "external" DP LB mode. Applies only to online serving
     and when data_parallel_size > 0. This is useful for a "one-pod-per-rank"
-    wide-EP setup in Kuberentes. Set implicitly when --data-parallel-rank
+    wide-EP setup in Kubernetes. Set implicitly when --data-parallel-rank
     is provided explicitly to vllm serve."""
     data_parallel_hybrid_lb: bool = False
     """Whether to use "hybrid" DP LB mode. Applies only to online serving
diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py
index 6608d2a4a9e09..efe023d5595e5 100644
--- a/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py
+++ b/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py
@@ -787,7 +787,7 @@ class NixlConnectorWorker:
         self.src_xfer_side_handle = self.nixl_wrapper.prep_xfer_dlist(
             "NIXL_INIT_AGENT", descs)

-        # TODO(mgoin): Hybrid memory allocator is currently diabled for
+        # TODO(mgoin): Hybrid memory allocator is currently disabled for
         # models with local attention (Llama 4). Can remove this once enabled.
         if self.vllm_config.model_config.hf_config.model_type == "llama4":
             from transformers import Llama4TextConfig
diff --git a/vllm/entrypoints/openai/serving_responses.py b/vllm/entrypoints/openai/serving_responses.py
index 4c15de3030998..7f11b37e51728 100644
--- a/vllm/entrypoints/openai/serving_responses.py
+++ b/vllm/entrypoints/openai/serving_responses.py
@@ -717,7 +717,7 @@ class OpenAIServingResponses(OpenAIServing):
                 prev_msgs.append(msg)
             messages.extend(prev_msgs)
         # Append the new input.
-        # Reponses API supports simple text inputs without chat format.
+        # Responses API supports simple text inputs without chat format.
         if isinstance(request.input, str):
             messages.append(get_user_message(request.input))
         else:
diff --git a/vllm/model_executor/layers/activation.py b/vllm/model_executor/layers/activation.py
index eb7e494e32861..fac37ef75b638 100644
--- a/vllm/model_executor/layers/activation.py
+++ b/vllm/model_executor/layers/activation.py
@@ -362,7 +362,7 @@ class ReLUSquaredActivation(CustomOp):
         return torch.square(F.relu(x))

     def forward_cuda(self, x: torch.Tensor) -> torch.Tensor:
-        #TODO : implement cuda kenrels
+        #TODO : implement cuda kernels
         return self.forward_native(x)


diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/transform/module.py b/vllm/model_executor/layers/quantization/compressed_tensors/transform/module.py
index b3be254717734..48ab2582a3b26 100644
--- a/vllm/model_executor/layers/quantization/compressed_tensors/transform/module.py
+++ b/vllm/model_executor/layers/quantization/compressed_tensors/transform/module.py
@@ -83,7 +83,7 @@ class HadamardTransform(torch.nn.Module):
             # do not fold into weight in order to utilize FWHT
             self.scales[part_id] = 1 / math.sqrt(data.size(0))

-    # FUTURE: avoid runtime tranpose by processing weights
+    # FUTURE: avoid runtime transpose by processing weights
     # prior to apply
     def forward(self, value: Tensor, part_id: int = 0) -> Tensor:

diff --git a/vllm/model_executor/layers/quantization/mxfp4.py b/vllm/model_executor/layers/quantization/mxfp4.py
index a2301779c77e4..85d05ff51daa1 100644
--- a/vllm/model_executor/layers/quantization/mxfp4.py
+++ b/vllm/model_executor/layers/quantization/mxfp4.py
@@ -310,7 +310,7 @@ class Mxfp4MoEMethod(FusedMoEMethodBase):
             w13_bias = layer.w13_bias.data.to(torch.float32)
             w2_bias = layer.w2_bias.data.to(torch.float32)

-            # Swap w1 and w3 as the defenition of
+            # Swap w1 and w3 as the definition of
             # swiglu is different in the trtllm-gen
             def swap_every_two_rows(x, axis=-1):
                 shape = x.shape
diff --git a/vllm/model_executor/models/gemma3n_mm.py b/vllm/model_executor/models/gemma3n_mm.py
index d831e9084db57..3074451e40a4d 100644
--- a/vllm/model_executor/models/gemma3n_mm.py
+++ b/vllm/model_executor/models/gemma3n_mm.py
@@ -179,7 +179,7 @@ class Gemma3nMultiModalProcessor(BaseMultiModalProcessor[Gemma3nProcessingInfo]
     ) -> BatchFeature:

         # HF Transformers audio processor no longer accepts `audios` key.
-        # We pop `audios` and replace it with `audio` key to surpress
+        # We pop `audios` and replace it with `audio` key to suppress
         # the warning.
         if 'audios' in mm_data:
             mm_data['audio'] = mm_data.pop('audios')
diff --git a/vllm/model_executor/models/interns1.py b/vllm/model_executor/models/interns1.py
index 26e358f9394c6..d998b8a0ab4f7 100644
--- a/vllm/model_executor/models/interns1.py
+++ b/vllm/model_executor/models/interns1.py
@@ -492,7 +492,7 @@ class InternS1ForConditionalGeneration(nn.Module, SupportsMultiModal,

     @classmethod
     def get_placeholder_str(cls, modality: str, i: int) -> Optional[str]:
-        # transformers InternVLProcessor uses as the seperator
+        # transformers InternVLProcessor uses as the separator
         # refer to https://github.com/huggingface/transformers/blob/f90de364c2484c7c325bbe05befdcf487bd75b63/src/transformers/models/internvl/processing_internvl.py#L116
         if modality.startswith("image"):
             return ''
diff --git a/vllm/third_party/pynvml.py b/vllm/third_party/pynvml.py
index c06aa567444d8..6aabbc217dd03 100644
--- a/vllm/third_party/pynvml.py
+++ b/vllm/third_party/pynvml.py
@@ -3533,7 +3533,7 @@ def nvmlDeviceGetMPSComputeRunningProcesses_v3(handle):
         return []
     elif (ret == NVML_ERROR_INSUFFICIENT_SIZE):
         # typical case
-        # oversize the array incase more processes are created
+        # oversize the array in case more processes are created
         c_count.value = c_count.value * 2 + 5
         proc_array = c_nvmlProcessInfo_v3_t * c_count.value
         c_procs = proc_array()
diff --git a/vllm/v1/attention/backends/flash_attn.py b/vllm/v1/attention/backends/flash_attn.py
index dd2b956d4fa3d..3cc67acd04c6b 100755
--- a/vllm/v1/attention/backends/flash_attn.py
+++ b/vllm/v1/attention/backends/flash_attn.py
@@ -167,7 +167,7 @@ class FlashAttentionMetadataBuilder(
    # work for mixed prefill-decode and uniform-decode. But for non-spec decodes
    # the graphs would not work for mixed prefill-decode; sorta the inverse
    # of UNIFORM_SINGLE_TOKEN_DECODE.
-    # Theres probably a better way to describe this using `AttentionCGSupport`
+    # There's probably a better way to describe this using `AttentionCGSupport`
    # but for now just set it to `UNIFORM_BATCH` to get use to drop down
    # to FULL_AND_PIECEWISE.
    # TODO(luka, lucas): audit FA2 as part of:
diff --git a/vllm/v1/attention/backends/flashinfer.py b/vllm/v1/attention/backends/flashinfer.py
index 5fc3a1517b690..2f275b8b23b17 100755
--- a/vllm/v1/attention/backends/flashinfer.py
+++ b/vllm/v1/attention/backends/flashinfer.py
@@ -291,7 +291,7 @@ class FlashInferMetadataBuilder(AttentionMetadataBuilder[FlashInferMetadata]):
                 paged_kv_indices_buffer=paged_kv_indices,
                 paged_kv_last_page_len_buffer=paged_kv_last_page_len,
                 # Tensor cores are enabled by default because the perf would be
-                # atleast as good as cuda cores for all attention ops in latest
+                # at least as good as cuda cores for all attention ops in latest
                 # gpus.
                 use_tensor_cores=True,
             )
diff --git a/vllm/v1/core/kv_cache_utils.py b/vllm/v1/core/kv_cache_utils.py
index 590baa6208d07..248ad9cda7c28 100644
--- a/vllm/v1/core/kv_cache_utils.py
+++ b/vllm/v1/core/kv_cache_utils.py
@@ -217,7 +217,7 @@ class FreeKVCacheBlockQueue:
         # Create a fake head and a tail block for the doubly linked list to
         # reduce branching in the code
         #
-        # The implementation garenteed that the fake head and tail
+        # The implementation guaranteed that the fake head and tail
         # are NEVER got popped, so we could safely assume each real blocks
         # in the queue has prev and next blocks.
         self.fake_free_list_head = KVCacheBlock(block_id=-1)
diff --git a/vllm/v1/worker/gpu_input_batch.py b/vllm/v1/worker/gpu_input_batch.py
index ef5a7e39a5b16..ad70d9efaaaac 100644
--- a/vllm/v1/worker/gpu_input_batch.py
+++ b/vllm/v1/worker/gpu_input_batch.py
@@ -584,7 +584,7 @@ class InputBatch:

             if self.is_pooling_model:
                 last_req_index -= 1
-                # Samping state not used by pooling models.
+                # Sampling state not used by pooling models.
                 continue

             # Autoregressive models require detailed tracking of condense
diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py
index c81bc58f1ef46..4556a51b809d8 100644
--- a/vllm/v1/worker/gpu_model_runner.py
+++ b/vllm/v1/worker/gpu_model_runner.py
@@ -2776,7 +2776,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
             self.attn_groups.append(
                 create_attn_groups(attn_backends, kv_cache_spec))

-        # Calculate reorder batch threshold (if neeeded)
+        # Calculate reorder batch threshold (if needed)
         self.calculate_reorder_batch_threshold()

     def initialize_cudagraph_capture(self) -> None:
diff --git a/vllm/v1/worker/kv_connector_model_runner_mixin.py b/vllm/v1/worker/kv_connector_model_runner_mixin.py
index a03ebe35d8e0a..e2ffa2f12fda5 100644
--- a/vllm/v1/worker/kv_connector_model_runner_mixin.py
+++ b/vllm/v1/worker/kv_connector_model_runner_mixin.py
@@ -82,7 +82,7 @@ class KVConnectorModelRunnerMixin:
             scheduler_output) if has_kv_transfer_group() else nullcontext()

     # This context manager must be used within an active forward context.
-    # It encapsulates the entire KV conector lifecycle within execute_model
+    # It encapsulates the entire KV connector lifecycle within execute_model
     @staticmethod
     @contextmanager
     def _get_kv_connector_output(