diff --git a/tests/core/test_scheduler.py b/tests/core/test_scheduler.py
index 591e1780c11c6..e1a840bb15039 100644
--- a/tests/core/test_scheduler.py
+++ b/tests/core/test_scheduler.py
@@ -641,7 +641,7 @@ def test_schedule_decode_blocks_to_copy_update():
     # Nothing is preempted.
     assert output.blocks_to_swap_out == []
     # Since append_slot returns the source -> dist mapping, it should
-    # applied.
+    # be applied.
     assert output.blocks_to_copy == [(2, 3)]
diff --git a/tests/entrypoints/openai/correctness/test_transcription_api_correctness.py b/tests/entrypoints/openai/correctness/test_transcription_api_correctness.py
index 0d0ce0be8c5f8..9122b7003bf9a 100644
--- a/tests/entrypoints/openai/correctness/test_transcription_api_correctness.py
+++ b/tests/entrypoints/openai/correctness/test_transcription_api_correctness.py
@@ -32,7 +32,7 @@ def to_bytes(y, sr):

 async def transcribe_audio(client, tokenizer, y, sr):
     # Send loaded audio directly instead of loading from disk,
-    # dont account for that time though
+    # don't account for that time though
     with to_bytes(y, sr) as f:
         start_time = time.perf_counter()
         transcription = await client.audio.transcriptions.create(
diff --git a/tests/entrypoints/openai/test_return_token_ids.py b/tests/entrypoints/openai/test_return_token_ids.py
index 6addcb41c4098..ff8f193fec552 100644
--- a/tests/entrypoints/openai/test_return_token_ids.py
+++ b/tests/entrypoints/openai/test_return_token_ids.py
@@ -224,7 +224,7 @@ async def test_comparison_with_prompt_logprobs_and_logprobs(server):
             logprobs_token_ids.append(token_id)

     # When echo=True, the logprobs include both prompt and response tokens
-    # The token_ids field should match the the suffix of response portion
+    # The token_ids field should match the suffix of response portion
     # The prompt_token_ids should match the prompt portion
     assert len(completion.choices[0].token_ids) < len(logprobs_token_ids)
     response_token_ids_length = len(completion.choices[0].token_ids)
diff --git a/tests/entrypoints/openai/test_serving_chat.py b/tests/entrypoints/openai/test_serving_chat.py
index 10879f0be83c8..fe482112d386b 100644
--- a/tests/entrypoints/openai/test_serving_chat.py
+++ b/tests/entrypoints/openai/test_serving_chat.py
@@ -313,7 +313,7 @@ async def test_serving_chat_did_set_correct_cache_salt(model_type):
         }],
     )

-    # By default cache_salt in the engine prompt is not set
+    # By default, cache_salt in the engine prompt is not set
     with suppress(Exception):
         await serving_chat.create_chat_completion(req)
     assert "cache_salt" not in mock_engine.generate.call_args.args[0]
diff --git a/tests/kernels/utils.py b/tests/kernels/utils.py
index fa4125840a010..c46db8e307936 100644
--- a/tests/kernels/utils.py
+++ b/tests/kernels/utils.py
@@ -1236,7 +1236,7 @@ def baseline_scaled_mm(a: torch.Tensor,
                        bias: Optional[torch.Tensor] = None) -> torch.Tensor:
     # We treat N-dimensional group scaling as extended numpy-style broadcasting
-    # in numpy simply stretches dimensions with an extent of 1 to match the
+    # in numpy simply stretches dimensions with an extent of 1 to match
     # the target shape by repeating the data along that dimension (broadcasting)
     # , we extend these semantics to say if the extent of a dimension in the
     # source shape is not 1 and does not match the target shape we repeat each
diff --git a/tests/multimodal/test_utils.py b/tests/multimodal/test_utils.py
index 05e68a961a548..0f82e1f3e343e 100644
--- a/tests/multimodal/test_utils.py
+++ b/tests/multimodal/test_utils.py
@@ -458,7 +458,7 @@ def run_dp_sharded_vision_model_vs_direct(local_rank: int, world_size: int,
     with torch.inference_mode():
         sharded_output = run_dp_sharded_vision_model(image_input, vision_model)

-    # Check that the world size is setup correctly
+    # Check that the world size is set up correctly
     assert get_tensor_model_parallel_world_size() == world_size

     # Check that the outputs have the same shape
@@ -642,7 +642,7 @@ def run_dp_sharded_mrope_vision_model_vs_direct(local_rank: int,
                                                 rope_type="rope_3d")
     sharded_output = torch.cat(sharded_output, dim=0)

-    # Check that the world size is setup correctly
+    # Check that the world size is set up correctly
     assert get_tensor_model_parallel_world_size() == world_size

     # Compare outputs (only on rank 0)
diff --git a/tests/v1/e2e/test_spec_decode.py b/tests/v1/e2e/test_spec_decode.py
index bd0fa6b80781a..cd1d34fc6c3ec 100644
--- a/tests/v1/e2e/test_spec_decode.py
+++ b/tests/v1/e2e/test_spec_decode.py
@@ -83,7 +83,7 @@ def test_ngram_correctness(
     model_name: str,
 ):
     '''
-    Compare the outputs of a original LLM and a speculative LLM
+    Compare the outputs of an original LLM and a speculative LLM
     should be the same when using ngram speculative decoding.
     '''
     with monkeypatch.context() as m:
diff --git a/tests/v1/kv_connector/unit/test_remote_decode_lifecycle.py b/tests/v1/kv_connector/unit/test_remote_decode_lifecycle.py
index d8c56ac42f718..380e72a156336 100644
--- a/tests/v1/kv_connector/unit/test_remote_decode_lifecycle.py
+++ b/tests/v1/kv_connector/unit/test_remote_decode_lifecycle.py
@@ -42,7 +42,7 @@ def test_basic_lifecycle():
     engine_core_outputs = scheduler.update_from_output(scheduler_output,
                                                        model_runner_output)

-    # Ensure the request is finished after 1 tokens.
+    # Ensure the request is finished after 1 token.
     assert request.is_finished()
     assert request.status == RequestStatus.FINISHED_LENGTH_CAPPED
     output = engine_core_outputs[0].outputs[0]
@@ -141,7 +141,7 @@ def test_short_prompt_lifecycle():


 def test_prefix_cache_lifecycle():
-    """Test that remote decode params still works with a prefix cache hit."""
+    """Test that remote decode params still work with a prefix cache hit."""
     vllm_config = create_vllm_config()
     scheduler = create_scheduler(vllm_config)
diff --git a/tests/v1/spec_decode/test_tree_attention.py b/tests/v1/spec_decode/test_tree_attention.py
index 6317817408661..eacb2ad584baf 100644
--- a/tests/v1/spec_decode/test_tree_attention.py
+++ b/tests/v1/spec_decode/test_tree_attention.py
@@ -187,7 +187,7 @@ def test_tree_attn_correctness() -> None:
         dtype=torch.bfloat16,
     )

-    # Setup the block table and KV cache for paged KV.
+    # Set up the block table and KV cache for paged KV.
     assert max_sequence_length % block_size == 0
     max_blocks_per_batch = max_sequence_length // block_size
     kv_cache = torch.randn(
@@ -222,7 +222,7 @@ def test_tree_attn_correctness() -> None:
                     num_alloc_blocks_per_batch] = block_ids.view(
                         -1, num_alloc_blocks_per_batch)

-        # Setup the slot mapping for the input KVs.
+        # Set up the slot mapping for the input KVs.
         tree_positions = sequence_position + torch.arange(
             0,
             tree_size_q,
diff --git a/vllm/lora/utils.py b/vllm/lora/utils.py
index ab0a9fbd255de..1fc214c12b5d1 100644
--- a/vllm/lora/utils.py
+++ b/vllm/lora/utils.py
@@ -239,7 +239,7 @@ def get_adapter_absolute_path(lora_path: str) -> str:
     except (HfHubHTTPError, RepositoryNotFoundError, EntryNotFoundError,
             HFValidationError):
         # Handle errors that may occur during the download
-        # Return original path instead instead of throwing error here
+        # Return original path instead of throwing error here
         logger.exception("Error downloading the HuggingFace model")
         return lora_path
diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/utils.py b/vllm/model_executor/layers/quantization/compressed_tensors/utils.py
index 099d8613fc1a7..b2dd2501095f8 100644
--- a/vllm/model_executor/layers/quantization/compressed_tensors/utils.py
+++ b/vllm/model_executor/layers/quantization/compressed_tensors/utils.py
@@ -94,7 +94,7 @@ def find_matched_target(
     config that a layer corresponds to.

     Recall that a compressed-tensors configs has a concept of
-    config_groups, where each layer can be quantized with with a different
+    config_groups, where each layer can be quantized with a different
     scheme.

     targets in each config_group will be a list of either layer names
diff --git a/vllm/multimodal/utils.py b/vllm/multimodal/utils.py
index ac967dcc4003e..794e24c2c748c 100644
--- a/vllm/multimodal/utils.py
+++ b/vllm/multimodal/utils.py
@@ -213,7 +213,7 @@ class MediaConnector:
         image_mode: str = "RGB",
     ) -> Image.Image:
         """
-        Load a PIL image from a HTTP or base64 data URL.
+        Load a PIL image from an HTTP or base64 data URL.

         By default, the image is converted into RGB format.
         """
@@ -237,7 +237,7 @@ class MediaConnector:
         image_mode: str = "RGB",
     ) -> Image.Image:
         """
-        Asynchronously load a PIL image from a HTTP or base64 data URL.
+        Asynchronously load a PIL image from an HTTP or base64 data URL.

         By default, the image is converted into RGB format.
         """
@@ -261,7 +261,7 @@ class MediaConnector:
         image_mode: str = "RGB",
     ) -> tuple[npt.NDArray, dict[str, Any]]:
         """
-        Load video from a HTTP or base64 data URL.
+        Load video from an HTTP or base64 data URL.
         """
         image_io = ImageMediaIO(image_mode=image_mode,
                                 **self.media_io_kwargs.get("image", {}))
@@ -281,7 +281,7 @@ class MediaConnector:
         image_mode: str = "RGB",
     ) -> tuple[npt.NDArray, dict[str, Any]]:
         """
-        Asynchronously load video from a HTTP or base64 data URL.
+        Asynchronously load video from an HTTP or base64 data URL.

         By default, the image is converted into RGB format.
         """
@@ -370,7 +370,7 @@ def group_mm_inputs_by_modality(

     def modality_group_func(
             mm_input: MultiModalKwargsItems) -> Union[str, int]:
-        # If the input has multiple modalities, return a id as the unique key
+        # If the input has multiple modalities, return an id as the unique key
         # for the mm_input input.
         if len(mm_input) > 1:
             return id(mm_input)
diff --git a/vllm/v1/attention/backends/utils.py b/vllm/v1/attention/backends/utils.py
index 011a90ece01bd..b286a4ba9fe54 100644
--- a/vllm/v1/attention/backends/utils.py
+++ b/vllm/v1/attention/backends/utils.py
@@ -709,7 +709,7 @@ def reorder_batch_to_split_decodes_and_prefills(
     for i, req_id in enumerate(input_batch.req_ids):
         num_tokens = scheduler_output.num_scheduled_tokens[req_id]
-        # for now treat 1 scheduled token as "decode" even if its not,
+        # for now treat 1 scheduled token as "decode" even if it's not,
         # we should update this to something like < 8 in the future but
         # currently the TritonMLA._forward_decode only supports
         # num_tokens = 1
diff --git a/vllm/v1/structured_output/utils.py b/vllm/v1/structured_output/utils.py
index 95319831d5121..953185a8fc31d 100644
--- a/vllm/v1/structured_output/utils.py
+++ b/vllm/v1/structured_output/utils.py
@@ -65,9 +65,9 @@ def get_outlines_cache_path() -> str:
     elif xdg_cache_home:
         return os.path.join(xdg_cache_home, ".cache", "outlines")
     # If homedir is "/", we may be inside a container, and thus writing to
-    # root would be problematic, so we fallback to using a tempfile.
+    # root would be problematic, so we fall back to using a tempfile.
     # Also validate the path exists, since os.path.expanduser does
-    # not garuntee existence.
+    # not guarantee existence.
     elif os.path.isdir(home_dir) and home_dir != "/":
         # Default Unix fallback: ~/.cache/outlines
         return os.path.join(home_dir, ".cache", "outlines")
diff --git a/vllm/v1/worker/tpu_worker.py b/vllm/v1/worker/tpu_worker.py
index 9adf8a14213f3..3f4e3ecbd4e26 100644
--- a/vllm/v1/worker/tpu_worker.py
+++ b/vllm/v1/worker/tpu_worker.py
@@ -250,7 +250,7 @@ class TPUWorker:
         scheduler_output: "SchedulerOutput",
     ) -> Optional[ModelRunnerOutput]:
         output = self.model_runner.execute_model(scheduler_output)
-        # every worker's output is needed when kv_transfer_group is setup
+        # every worker's output is needed when kv_transfer_group is set up
         return output if self.is_driver_worker or has_kv_transfer_group(
         ) else None
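
A note on the `baseline_scaled_mm` comment touched in the `tests/kernels/utils.py` hunk above: the "extended broadcasting" it describes (a dimension whose extent is neither 1 nor the target extent is stretched by repeating each element to cover its group) can be sketched roughly as below. This is only an illustrative sketch of those semantics, not the actual vLLM implementation; the helper name `expand_group_scale` and the example shapes are hypothetical.

```python
import torch


def expand_group_scale(scale: torch.Tensor,
                       target_shape: tuple[int, ...]) -> torch.Tensor:
    """Stretch `scale` to `target_shape` (illustrative only).

    A dimension of extent 1 is broadcast as in numpy/torch; a dimension whose
    extent divides (but does not equal) the target extent has each element
    repeated target // source times, i.e. one scale per group of elements.
    """
    for dim, (src, tgt) in enumerate(zip(scale.shape, target_shape)):
        if src == tgt:
            continue
        if src == 1:
            # Plain broadcasting: stretch the size-1 dimension.
            scale = scale.expand(*scale.shape[:dim], tgt,
                                 *scale.shape[dim + 1:])
        else:
            assert tgt % src == 0, "group count must divide the target extent"
            # Extended semantics: repeat each scale across its group.
            scale = scale.repeat_interleave(tgt // src, dim=dim)
    return scale


# One scale per (64, 64) block of a (128, 128) weight:
scales = torch.rand(2, 2)
assert expand_group_scale(scales, (128, 128)).shape == (128, 128)
```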