diff --git a/examples/offline_inference/chat_with_tools.py b/examples/offline_inference/chat_with_tools.py
index 6e56e24f2092..3a95b1fdfbab 100644
--- a/examples/offline_inference/chat_with_tools.py
+++ b/examples/offline_inference/chat_with_tools.py
@@ -143,5 +143,5 @@
 outputs = llm.chat(messages, sampling_params, tools=tools)
 print(outputs[0].outputs[0].text.strip())
 # yields
-# 'The weather in Dallas, TX is 85 degrees fahrenheit. '
+# 'The weather in Dallas, TX is 85 degrees Fahrenheit. '
 # 'It is partly cloudly, with highs in the 90's.'
diff --git a/vllm/attention/backends/mla/common.py b/vllm/attention/backends/mla/common.py
index 3b9037521168..789393eb39a7 100644
--- a/vllm/attention/backends/mla/common.py
+++ b/vllm/attention/backends/mla/common.py
@@ -1052,7 +1052,7 @@ class MLACommonImpl(MLAAttentionImpl[T], Generic[T]):
             return layer.weight
 
         # we currently do not have quantized bmm's which are needed for
-        # `W_UV` and `W_UK_T`, we we just store fp16/bf16 copies and perform
+        # `W_UV` and `W_UK_T`, we just store fp16/bf16 copies and perform
         # the bmm's in 16-bit, the extra memory overhead of this is fairly low
         kv_b_proj_weight = get_and_maybe_dequant_weights(self.kv_b_proj).T
         assert kv_b_proj_weight.shape == (
diff --git a/vllm/config/__init__.py b/vllm/config/__init__.py
index 063af69f41da..f6f1838aedfc 100644
--- a/vllm/config/__init__.py
+++ b/vllm/config/__init__.py
@@ -1169,7 +1169,7 @@ class ModelConfig:
         ]
         # Any custom overrides will be in quantization_methods so we place
         # them at the start of the list so custom overrides have preference
-        # over the built in ones.
+        # over the built-in ones.
         quantization_methods = quantization_methods + overrides
 
         # Detect which checkpoint is it
diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py
index c2f73fa28155..20d1e31a7106 100644
--- a/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py
+++ b/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py
@@ -770,7 +770,7 @@ class NixlConnectorWorker:
             # with joint KV for each block. This minimizes the overhead in
             # registerMem allowing faster descs queries. In order to be able to
             # split on kv_heads dim as required by heterogeneous TP, one must
-            # be able to index K/V separately. Hence the we double the number
+            # be able to index K/V separately. Hence we double the number
             # of 'virtual' regions here and halve `block_len` below.
             self.num_regions *= 2
 
diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py
index fdd25a2f9ce2..bee97f4cd04d 100644
--- a/vllm/engine/arg_utils.py
+++ b/vllm/engine/arg_utils.py
@@ -1159,7 +1159,7 @@ class EngineArgs:
         # Note(hc): In the current implementation of decode context
         # parallel(DCP), tp_size needs to be divisible by dcp_size,
         # because the world size does not change by dcp, it simply
-        # reuse the GPUs of TP group, and split one TP group into
+        # reuses the GPUs of the TP group, and splits one TP group into
         # tp_size//dcp_size DCP groups.
         assert self.tensor_parallel_size % self.decode_context_parallel_size \
             == 0, (
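The DCP note fixed just above says tp_size must be divisible by dcp_size because DCP does not add workers; it only repartitions the existing TP ranks. A minimal standalone sketch of that arithmetic, using assumed example sizes rather than vLLM's real EngineArgs:

# Hypothetical sketch: DCP reuses the GPUs of the TP group, so the TP ranks
# must split evenly into DCP groups of size dcp_size.
tensor_parallel_size = 8          # assumed example value
decode_context_parallel_size = 2  # assumed example value

assert tensor_parallel_size % decode_context_parallel_size == 0, (
    "tp_size must be divisible by dcp_size")

# One TP group is split into tp_size // dcp_size DCP groups; the world size
# is unchanged.
num_dcp_groups = tensor_parallel_size // decode_context_parallel_size
print(num_dcp_groups)  # 4 groups of 2 ranks each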
diff --git a/vllm/engine/multiprocessing/client.py b/vllm/engine/multiprocessing/client.py
index 0beb9c8cc0b9..7d1f29a9824d 100644
--- a/vllm/engine/multiprocessing/client.py
+++ b/vllm/engine/multiprocessing/client.py
@@ -235,7 +235,7 @@ class MQLLMEngineClient(EngineClient):
                 # therefore we have to inform that the current
                 # processed requests failed as well. Send back a dead
                 # engine error give this feedback and also give a
-                # 'hint' to the server to shutdown next.
+                # 'hint' to the server to shut down next.
                 exception = self.dead_error
 
                 if request_id is None:
diff --git a/vllm/entrypoints/openai/cli_args.py b/vllm/entrypoints/openai/cli_args.py
index d0b5d013eb9e..7e1df795fb05 100644
--- a/vllm/entrypoints/openai/cli_args.py
+++ b/vllm/entrypoints/openai/cli_args.py
@@ -204,7 +204,7 @@ schema. Example: `[{"type": "text", "text": "Hello world!"}]`"""
     frontend_kwargs["lora_modules"]["type"] = optional_type(str)
     frontend_kwargs["lora_modules"]["action"] = LoRAParserAction
 
-    # Special case: Middleware needs append action
+    # Special case: Middleware needs to append action
     frontend_kwargs["middleware"]["action"] = "append"
     frontend_kwargs["middleware"]["type"] = str
     if "nargs" in frontend_kwargs["middleware"]:
diff --git a/vllm/entrypoints/openai/tool_parsers/llama4_pythonic_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/llama4_pythonic_tool_parser.py
index 6bf44a4345a9..9a9a19ce2188 100644
--- a/vllm/entrypoints/openai/tool_parsers/llama4_pythonic_tool_parser.py
+++ b/vllm/entrypoints/openai/tool_parsers/llama4_pythonic_tool_parser.py
@@ -176,7 +176,7 @@ class Llama4PythonicToolParser(ToolParser):
                     index] += delta.function.arguments
 
             # HACK: serving_chat.py inspects the internal state of tool parsers
-            # when determining it's final streaming delta, automatically
+            # when determining its final streaming delta, automatically
             # adding autocompleted JSON.
             # These two lines avoid that nonsense while ensuring finish_reason
             # is set to tool_calls when at least one tool is called.
diff --git a/vllm/entrypoints/openai/tool_parsers/mistral_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/mistral_tool_parser.py
index c0691f122904..e6b300fd84e9 100644
--- a/vllm/entrypoints/openai/tool_parsers/mistral_tool_parser.py
+++ b/vllm/entrypoints/openai/tool_parsers/mistral_tool_parser.py
@@ -143,7 +143,7 @@ class MistralToolParser(ToolParser):
         except json.JSONDecodeError:
             # use a regex to find the part corresponding to the tool call.
             # NOTE: This use case should not happen if the model is trained
-            # correctly. It's a easy possible fix so it's included, but
+            # correctly. It's an easy possible fix so it's included, but
             # can be brittle for very complex / highly nested tool calls
             raw_tool_call = self.tool_call_regex.findall(tool_content)[0]
             function_call_arr = json.loads(raw_tool_call)
diff --git a/vllm/model_executor/layers/fused_moe/modular_kernel.py b/vllm/model_executor/layers/fused_moe/modular_kernel.py
index 7a8c6f8571de..281563c3bfca 100644
--- a/vllm/model_executor/layers/fused_moe/modular_kernel.py
+++ b/vllm/model_executor/layers/fused_moe/modular_kernel.py
@@ -302,7 +302,7 @@ class FusedMoEPrepareAndFinalize(ABC):
     def max_num_tokens_per_rank(self) -> Optional[int]:
         """
         Some PrepareFinalize All2All implementations are batched. Meaning,
-        they can processes only as set of tokens at a time. This
+        they can process only a set of tokens at a time. This
         function returns the batch size i.e the maximum number of tokens
         the implementation can process at a time. Return None if there
         are no such restrictions.
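The docstring fixed just above describes batched prepare/finalize implementations that can only handle a bounded number of tokens per call. A small illustrative sketch of how a caller might respect such a cap; the helper name and chunking policy are assumptions for illustration, not vLLM's actual modular-kernel code:

from typing import Optional

# Hypothetical helper (not vLLM's API): split a token count into chunks no
# larger than the cap reported by max_num_tokens_per_rank(); None means the
# implementation has no batch-size restriction.
def split_by_max_tokens(num_tokens: int,
                        max_tokens_per_rank: Optional[int]) -> list[int]:
    if max_tokens_per_rank is None:
        return [num_tokens]
    return [min(max_tokens_per_rank, num_tokens - start)
            for start in range(0, num_tokens, max_tokens_per_rank)]

print(split_by_max_tokens(10, 4))     # [4, 4, 2]
print(split_by_max_tokens(10, None))  # [10]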
diff --git a/vllm/model_executor/layers/quantization/utils/marlin_utils.py b/vllm/model_executor/layers/quantization/utils/marlin_utils.py
index 02057b476c6e..317ad079b392 100644
--- a/vllm/model_executor/layers/quantization/utils/marlin_utils.py
+++ b/vllm/model_executor/layers/quantization/utils/marlin_utils.py
@@ -201,7 +201,7 @@ def marlin_make_workspace(output_size_per_partition: int,
 def marlin_make_workspace_new(device: torch.device,
                               max_blocks_per_sm: int = 1) -> torch.Tensor:
     # In the new marlin kernel, we use the num of threadblocks as workspace
-    # size. The num of threadblocks is is sms_count * max_blocks_per_sm.
+    # size. The num of threadblocks is sms_count * max_blocks_per_sm.
     sms = torch.cuda.get_device_properties(device).multi_processor_count
     return torch.zeros(sms * max_blocks_per_sm,
                        dtype=torch.int,
diff --git a/vllm/v1/worker/block_table.py b/vllm/v1/worker/block_table.py
index c5902595a496..0e509b7453b9 100644
--- a/vllm/v1/worker/block_table.py
+++ b/vllm/v1/worker/block_table.py
@@ -98,7 +98,7 @@ class BlockTable:
         # here because M (max_model_len) is not necessarily divisible by
         # block_size.
         if self.dcp_world_size > 1:
-            # Note(hc): The DCP implement store kvcache with a interleave
+            # Note(hc): The DCP implementation stores kvcache with an interleave
             # style, the kvcache for the token whose token_idx is i is
             # always stored on the GPU whose dcp_rank equals i % cp_world_size:
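The block_table.py comment fixed above describes interleaved KV-cache placement under DCP: the cache for token_idx i is always stored on the GPU whose dcp_rank equals i % cp_world_size. A tiny sketch of that mapping with an assumed world size, purely to illustrate the layout:

# Hypothetical sketch of the interleaved placement described above:
# token_idx i is owned by the rank where dcp_rank == i % dcp_world_size.
dcp_world_size = 4  # assumed example value
token_indices = range(10)

placement = {rank: [i for i in token_indices if i % dcp_world_size == rank]
             for rank in range(dcp_world_size)}
print(placement)
# {0: [0, 4, 8], 1: [1, 5, 9], 2: [2, 6], 3: [3, 7]}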