diff --git a/benchmarks/benchmark_dataset.py b/benchmarks/benchmark_dataset.py
index 2ea4f9ccaff2..64ffa62c04d8 100644
--- a/benchmarks/benchmark_dataset.py
+++ b/benchmarks/benchmark_dataset.py
@@ -403,7 +403,7 @@ class RandomDataset(BenchmarkDataset):
             # [6880, 6881] -> ['Ġcalls', 'here'] ->
             # [1650, 939, 486] -> ['Ġcall', 'sh', 'ere']
             # To avoid uncontrolled change of the prompt length,
-            # the encoded sequence is truncated before being decode again.
+            # the encoded sequence is truncated before being decoded again.
             total_input_len = prefix_len + int(input_lens[i])
             re_encoded_sequence = tokenizer.encode(prompt, add_special_tokens=False)[
                 :total_input_len
diff --git a/benchmarks/kernels/benchmark_lora.py b/benchmarks/kernels/benchmark_lora.py
index 3d38d4b3534e..89309c79f099 100644
--- a/benchmarks/kernels/benchmark_lora.py
+++ b/benchmarks/kernels/benchmark_lora.py
@@ -637,7 +637,7 @@ def bench_optype(
     # Clear LoRA optimization hash-maps.
     _LORA_A_PTR_DICT.clear()
     _LORA_B_PTR_DICT.clear()
-    # Run bench function so that _LORA_A_PTR_DICT and _LORA_B_PTR_DICT are setup
+    # Run bench function so that _LORA_A_PTR_DICT and _LORA_B_PTR_DICT are set up
     for kwargs in kwargs_list:
         op_type.bench_fn()(**kwargs)
     torch.cuda.synchronize()
diff --git a/benchmarks/multi_turn/benchmark_serving_multi_turn.py b/benchmarks/multi_turn/benchmark_serving_multi_turn.py
index d23b7b6e4571..66d85eaf5131 100644
--- a/benchmarks/multi_turn/benchmark_serving_multi_turn.py
+++ b/benchmarks/multi_turn/benchmark_serving_multi_turn.py
@@ -962,7 +962,7 @@ async def main_mp(

     # At this point all the clients finished,
     # collect results (TTFT, TPOT, etc.) from all the clients.
-    # This needs to happens before calling join on the clients
+    # This needs to happen before calling join on the clients
     # (result_queue should be emptied).
     while not result_queue.empty():
         client_metrics.append(result_queue.get())
diff --git a/examples/offline_inference/audio_language.py b/examples/offline_inference/audio_language.py
index a5b8397e7e7f..65a87d2dd9e8 100644
--- a/examples/offline_inference/audio_language.py
+++ b/examples/offline_inference/audio_language.py
@@ -117,7 +117,7 @@ def run_gemma3n(question: str, audio_count: int) -> ModelRequestData:

 # Granite Speech
 def run_granite_speech(question: str, audio_count: int) -> ModelRequestData:
-    # NOTE - the setting in this example are somehat different than what is
+    # NOTE - the setting in this example are somewhat different from what is
     # optimal for granite speech, and it is generally recommended to use beam
     # search. Check the model README for suggested settings.
     # https://huggingface.co/ibm-granite/granite-speech-3.3-8b
diff --git a/tests/models/multimodal/generation/vlm_utils/builders.py b/tests/models/multimodal/generation/vlm_utils/builders.py
index 03c08240d6a8..133d5d6ee2ef 100644
--- a/tests/models/multimodal/generation/vlm_utils/builders.py
+++ b/tests/models/multimodal/generation/vlm_utils/builders.py
@@ -250,7 +250,7 @@ def build_video_inputs_from_test_info(

 def apply_image_size_scaling(image, size: Union[float, tuple[int, int]],
                              size_type: SizeType):
-    """Applies a size scaler to one image; this can be a an image size factor,
+    """Applies a size scaler to one image; this can be an image size factor,
     which scales the image while maintaining the aspect ratio"""
     # Special case for embeddings; if it's a tensor, it's only valid if we
     # are considering size factors at constant scale, i.e., we just clone
diff --git a/tests/models/multimodal/generation/vlm_utils/case_filtering.py b/tests/models/multimodal/generation/vlm_utils/case_filtering.py
index 336e2dd2b120..1edb51213534 100644
--- a/tests/models/multimodal/generation/vlm_utils/case_filtering.py
+++ b/tests/models/multimodal/generation/vlm_utils/case_filtering.py
@@ -42,7 +42,7 @@ def get_filtered_test_settings(
         else:
             assert test_info.prompt_formatter is not None

-        # Everything looks okay; keep if this is has correct proc handling
+        # Everything looks okay; keep if this is correct proc handling
         if (test_info.distributed_executor_backend is not None) == new_proc_per_test:
             matching_tests[test_name] = test_info

diff --git a/vllm/attention/backends/mla/common.py b/vllm/attention/backends/mla/common.py
index c5ed4c6e4032..3b9037521168 100644
--- a/vllm/attention/backends/mla/common.py
+++ b/vllm/attention/backends/mla/common.py
@@ -822,7 +822,7 @@ class MLACommonMetadataBuilder(AttentionMetadataBuilder[T], Generic[T]):
                 and context_lens_tensor is not None \
                 and context_lens_tensor[:self.num_prefills].max() > 0:

-            # NOTE: it is recommend you read the `Chunked Prefill` section in
+            # NOTE: it is recommended you read the `Chunked Prefill` section in
             # the comment at the top of the file before trying to understand
             # the following code

diff --git a/vllm/engine/async_llm_engine.py b/vllm/engine/async_llm_engine.py
index 9f9ad1854c3b..6010a4647a0a 100644
--- a/vllm/engine/async_llm_engine.py
+++ b/vllm/engine/async_llm_engine.py
@@ -717,7 +717,7 @@ class AsyncLLMEngine(EngineClient):
             # Stop the execute model loop in parallel workers until there
             # are more requests to process. This avoids waiting
             # indefinitely in torch.distributed ops which may otherwise
-            # timeout, and unblocks the RPC thread in the workers so that
+            # time out, and unblocks the RPC thread in the workers so that
             # they can process any other queued control plane messages,
             # such as add/remove lora adapters.
             await engine.engine.stop_remote_worker_execution_loop_async()
diff --git a/vllm/engine/multiprocessing/client.py b/vllm/engine/multiprocessing/client.py
index 2d3248859c94..0beb9c8cc0b9 100644
--- a/vllm/engine/multiprocessing/client.py
+++ b/vllm/engine/multiprocessing/client.py
@@ -270,7 +270,7 @@ class MQLLMEngineClient(EngineClient):
                 queue.put_nowait(request_output)

     async def setup(self):
-        """Setup the client before it starts sending server requests."""
+        """Set up the client before it starts sending server requests."""

         # Start output_loop
         if self.output_loop is None:
diff --git a/vllm/model_executor/layers/quantization/awq_triton.py b/vllm/model_executor/layers/quantization/awq_triton.py
index ebc526d6db2f..2e8894436a98 100644
--- a/vllm/model_executor/layers/quantization/awq_triton.py
+++ b/vllm/model_executor/layers/quantization/awq_triton.py
@@ -19,7 +19,7 @@ def awq_dequantize_kernel(
         num_rows,  # input num rows in qweight
         BLOCK_SIZE_X: tl.constexpr,
         BLOCK_SIZE_Y: tl.constexpr):
-    # Setup the pids.
+    # Set up the pids.
     pid_x = tl.program_id(axis=0)
     pid_y = tl.program_id(axis=1)

diff --git a/vllm/model_executor/layers/quantization/base_config.py b/vllm/model_executor/layers/quantization/base_config.py
index 4a43351260e9..6fd94afbe556 100644
--- a/vllm/model_executor/layers/quantization/base_config.py
+++ b/vllm/model_executor/layers/quantization/base_config.py
@@ -128,7 +128,7 @@ class QuantizationConfig(ABC):
     @staticmethod
     def get_from_keys_or(config: dict[str, Any], keys: list[str],
                          default: Any) -> Any:
-        """Get a optional value from the model's quantization config."""
+        """Get an optional value from the model's quantization config."""
         try:
             return QuantizationConfig.get_from_keys(config, keys)
         except ValueError:
diff --git a/vllm/v1/attention/backends/mla/common.py b/vllm/v1/attention/backends/mla/common.py
index b4c9aae254ea..9696b6c0913c 100755
--- a/vllm/v1/attention/backends/mla/common.py
+++ b/vllm/v1/attention/backends/mla/common.py
@@ -401,7 +401,7 @@ M = TypeVar("M", bound=MLACommonMetadata)


 def use_flashinfer_prefill() -> bool:
-    # For blackwell default to flashinfer prefill if its available since
+    # For blackwell default to flashinfer prefill if it's available since
     # it is faster than FA2.
     return (flashinfer_available and not envs.VLLM_USE_CUDNN_PREFILL
             and current_platform.is_device_capability(100))
@@ -1018,7 +1018,7 @@ class MLACommonImpl(MLAAttentionImpl[M], Generic[M]):
             return layer.weight

         # we currently do not have quantized bmm's which are needed for
-        # `W_UV` and `W_UK_T`, we we just store fp16/bf16 copies and perform
+        # `W_UV` and `W_UK_T`, we just store fp16/bf16 copies and perform
         # the bmm's in 16-bit, the extra memory overhead of this is fairly low
         kv_b_proj_weight = get_and_maybe_dequant_weights(self.kv_b_proj).T
         assert kv_b_proj_weight.shape == (