diff --git a/benchmarks/benchmark_serving.py b/benchmarks/benchmark_serving.py
index c50125b708b8..431adb8e997e 100644
--- a/benchmarks/benchmark_serving.py
+++ b/benchmarks/benchmark_serving.py
@@ -921,7 +921,7 @@ if __name__ == "__main__":
         "--percentile-metrics",
         type=str,
         default="ttft,tpot,itl",
-        help="Comma-seperated list of selected metrics to report percentils. "
+        help="Comma-separated list of selected metrics to report percentils. "
         "This argument specifies the metrics to report percentiles. "
         "Allowed metric names are \"ttft\", \"tpot\", \"itl\", \"e2el\". "
         "Default value is \"ttft,tpot,itl\".")
@@ -929,7 +929,7 @@ if __name__ == "__main__":
         "--metric-percentiles",
         type=str,
         default="99",
-        help="Comma-seperated list of percentiles for selected metrics. "
+        help="Comma-separated list of percentiles for selected metrics. "
         "To report 25-th, 50-th, and 75-th percentiles, use \"25,50,75\". "
         "Default value is \"99\". "
         "Use \"--percentile-metrics\" to select metrics.",
diff --git a/benchmarks/benchmark_serving_structured_output.py b/benchmarks/benchmark_serving_structured_output.py
index 71cb420a52c4..6d3ba6c025ae 100644
--- a/benchmarks/benchmark_serving_structured_output.py
+++ b/benchmarks/benchmark_serving_structured_output.py
@@ -963,7 +963,7 @@ if __name__ == "__main__":
         "--percentile-metrics",
         type=str,
         default="ttft,tpot,itl",
-        help="Comma-seperated list of selected metrics to report percentils. "
+        help="Comma-separated list of selected metrics to report percentils. "
         "This argument specifies the metrics to report percentiles. "
         "Allowed metric names are \"ttft\", \"tpot\", \"itl\", \"e2el\". "
         "Default value is \"ttft,tpot,itl\".")
@@ -971,7 +971,7 @@ if __name__ == "__main__":
         "--metric-percentiles",
         type=str,
         default="99",
-        help="Comma-seperated list of percentiles for selected metrics. "
+        help="Comma-separated list of percentiles for selected metrics. "
         "To report 25-th, 50-th, and 75-th percentiles, use \"25,50,75\". "
         "Default value is \"99\". "
         "Use \"--percentile-metrics\" to select metrics.",
diff --git a/csrc/mamba/causal_conv1d/causal_conv1d.cu b/csrc/mamba/causal_conv1d/causal_conv1d.cu
index f0e5533bcae6..98daf1a1b8e6 100644
--- a/csrc/mamba/causal_conv1d/causal_conv1d.cu
+++ b/csrc/mamba/causal_conv1d/causal_conv1d.cu
@@ -422,7 +422,7 @@ void causal_conv1d_fwd_kernel(ConvParamsBase params) {
     int final_state_position = ((seqlen - (kWidth - 1)) - (n_chunks - 1) * kChunkSize);
     // in case the final state is separated between the last "smem_exchange" and
     // and the one before it (chunk = n_chunks - 1 and chunk = n_chunks - 2),
-    // (which occurs when `final_state_position` is a non-positivie index)
+    // (which occurs when `final_state_position` is a non-positive index)
     // we load the correct data from smem_exchange from both chunks, the last chunk iteration and the one before it
     if (conv_states != nullptr && final_state_position < 0 && seqlen > kWidth){
         input_t vals_load[kNElts] = {0};
diff --git a/vllm/attention/backends/flash_attn.py b/vllm/attention/backends/flash_attn.py
index c0a572b4aaea..f9c5ad4df54e 100755
--- a/vllm/attention/backends/flash_attn.py
+++ b/vllm/attention/backends/flash_attn.py
@@ -326,7 +326,7 @@ class FlashAttentionMetadata(AttentionMetadata):
         assert self.use_cuda_graph
 
         if turn_prefills_into_decodes:
-            # When Mutli-Step is enabled with Chunked-Prefill, prefills and
+            # When Multi-Step is enabled with Chunked-Prefill, prefills and
             # decodes are scheduled together. In the first step, all the
             # prefills turn into decodes. This update reflects that
             # conversion.
diff --git a/vllm/attention/backends/hpu_attn.py b/vllm/attention/backends/hpu_attn.py
index cede9915efcf..15625612e08e 100644
--- a/vllm/attention/backends/hpu_attn.py
+++ b/vllm/attention/backends/hpu_attn.py
@@ -152,11 +152,11 @@ class HPUAttentionImpl(AttentionImpl, torch.nn.Module):
             logger.warning("Could not import HPU FusedSDPA kernel. "
                            "vLLM will use native implementation.")
 
-        suppored_head_sizes = HPUPagedAttention.get_supported_head_sizes()
-        if head_size not in suppored_head_sizes:
+        supported_head_sizes = HPUPagedAttention.get_supported_head_sizes()
+        if head_size not in supported_head_sizes:
             raise ValueError(
                 f"Head size {head_size} is not supported by PagedAttention. "
-                f"Supported head sizes are: {suppored_head_sizes}.")
+                f"Supported head sizes are: {supported_head_sizes}.")
 
         if attn_type != AttentionType.DECODER:
             raise NotImplementedError("Encoder self-attention and "
diff --git a/vllm/attention/backends/mla/common.py b/vllm/attention/backends/mla/common.py
index 8d70afe282d6..5a47c0f63081 100644
--- a/vllm/attention/backends/mla/common.py
+++ b/vllm/attention/backends/mla/common.py
@@ -83,8 +83,8 @@ spda_o = scaled_dot_product_attention(
 return spda_o @ W_O
 
 NOTE: in the actual code,
-    `kv_b_proj` is [W_UK; W_UV] concatnated per head
-    `q_b_proj` is [W_UQ; W_QR] concatnated per head
+    `kv_b_proj` is [W_UK; W_UV] concatenated per head
+    `q_b_proj` is [W_UQ; W_QR] concatenated per head
     `out_proj` is W_O
 
@@ -667,7 +667,7 @@ class MLACommonMetadata(AttentionMetadata):
         assert num_seqs > num_queries
 
         if turn_prefills_into_decodes:
-            # When Mutli-Step is enabled with Chunked-Prefill, prefills and
+            # When Multi-Step is enabled with Chunked-Prefill, prefills and
             # decodes are scheduled together. In the first step, all the
             # prefills turn into decodes. This update reflects that
             # conversion.
diff --git a/vllm/attention/backends/xformers.py b/vllm/attention/backends/xformers.py
index cd152e57d749..a9d4a70b55a8 100644
--- a/vllm/attention/backends/xformers.py
+++ b/vllm/attention/backends/xformers.py
@@ -414,11 +414,11 @@ class XFormersImpl(AttentionImpl[XFormersMetadata]):
         assert self.num_heads % self.num_kv_heads == 0
         self.num_queries_per_kv = self.num_heads // self.num_kv_heads
 
-        suppored_head_sizes = PagedAttention.get_supported_head_sizes()
-        if head_size not in suppored_head_sizes:
+        supported_head_sizes = PagedAttention.get_supported_head_sizes()
+        if head_size not in supported_head_sizes:
             raise ValueError(
                 f"Head size {head_size} is not supported by PagedAttention. "
-                f"Supported head sizes are: {suppored_head_sizes}.")
+                f"Supported head sizes are: {supported_head_sizes}.")
 
         self.attn_type = attn_type
 
diff --git a/vllm/attention/ops/nki_flash_attn.py b/vllm/attention/ops/nki_flash_attn.py
index 6bce5879c81d..8c9145bb99e8 100644
--- a/vllm/attention/ops/nki_flash_attn.py
+++ b/vllm/attention/ops/nki_flash_attn.py
@@ -446,7 +446,7 @@ def flash_paged_attention(
     IO tensor dtypes:
       - This kernel assumes all IO tensors have the same dtype except for
         block_tables (int32) and mask (int32)
-      - If mixed_percision is True, then all Tensor Engine operation will be
+      - If mixed_precision is True, then all Tensor Engine operation will be
         performed in bfloat16 and accumulation will be performed in float32.
         Otherwise the intermediates will be in the same type as the inputs.
 
diff --git a/vllm/benchmarks/serve.py b/vllm/benchmarks/serve.py
index 813556f90f53..dc0ec3219486 100644
--- a/vllm/benchmarks/serve.py
+++ b/vllm/benchmarks/serve.py
@@ -724,14 +724,14 @@ def add_cli_args(parser: argparse.ArgumentParser):
         "--percentile-metrics",
         type=str,
         default="ttft,tpot,itl",
-        help="Comma-seperated list of selected metrics to report percentils. "
+        help="Comma-separated list of selected metrics to report percentils. "
         "This argument specifies the metrics to report percentiles. "
         "Allowed metric names are \"ttft\", \"tpot\", \"itl\", \"e2el\". ")
     parser.add_argument(
         "--metric-percentiles",
         type=str,
         default="99",
-        help="Comma-seperated list of percentiles for selected metrics. "
+        help="Comma-separated list of percentiles for selected metrics. "
         "To report 25-th, 50-th, and 75-th percentiles, use \"25,50,75\". "
         "Use \"--percentile-metrics\" to select metrics.",
     )
diff --git a/vllm/engine/output_processor/multi_step.py b/vllm/engine/output_processor/multi_step.py
index 4c5d78a43df6..5f126c7571dc 100644
--- a/vllm/engine/output_processor/multi_step.py
+++ b/vllm/engine/output_processor/multi_step.py
@@ -93,7 +93,7 @@ class MultiStepOutputProcessor(SequenceGroupOutputProcessor):
             externally (before the next schedule() call)
         """
         # Sequences can be in RUNNING or FINISHED_ABORTED state
-        # once scheduled, as a sequence is moved to FINSIHED_ABORTED
+        # once scheduled, as a sequence is moved to FINISHED_ABORTED
         # if a client disconnects from the api server.
         seqs = sequence_group.get_seqs(status=SequenceStatus.RUNNING)
         if seqs is None:
diff --git a/vllm/entrypoints/openai/tool_parsers/utils.py b/vllm/entrypoints/openai/tool_parsers/utils.py
index 7997629d461a..acbff3258e46 100644
--- a/vllm/entrypoints/openai/tool_parsers/utils.py
+++ b/vllm/entrypoints/openai/tool_parsers/utils.py
@@ -98,7 +98,7 @@ def find_all_indices(string: str, substring: str) -> list[int]:
 
 
 # partial_json_parser doesn't support extra data and
-# JSONDecorder.raw_decode doesn't support partial JSON
+# JSONDecoder.raw_decode doesn't support partial JSON
 def partial_json_loads(input_str: str, flags: Allow) -> tuple[Any, int]:
     try:
         return (partial_json_parser.loads(input_str, flags), len(input_str))
diff --git a/vllm/model_executor/layers/quantization/kernels/scaled_mm/__init__.py b/vllm/model_executor/layers/quantization/kernels/scaled_mm/__init__.py
index bedda4c2ab21..014108e69506 100644
--- a/vllm/model_executor/layers/quantization/kernels/scaled_mm/__init__.py
+++ b/vllm/model_executor/layers/quantization/kernels/scaled_mm/__init__.py
@@ -29,7 +29,7 @@ def choose_scaled_mm_linear_kernel(
         compute_capability: Optional[int] = None
 ) -> Type[ScaledMMLinearKernel]:
     """
-    Choose an ScalledMMLinearKernel that can implement the given config for the
+    Choose an ScaledMMLinearKernel that can implement the given config for the
     given compute capability. Attempts to choose the best kernel in terms of
     performance.
 
diff --git a/vllm/platforms/cpu.py b/vllm/platforms/cpu.py
index cfd7bc2a4057..3c8aecc09945 100644
--- a/vllm/platforms/cpu.py
+++ b/vllm/platforms/cpu.py
@@ -69,12 +69,12 @@ class CpuPlatform(Platform):
 
         cache_config = vllm_config.cache_config
 
-        ipex_avaliable = find_spec("intel_extension_for_pytorch") is not None
+        ipex_available = find_spec("intel_extension_for_pytorch") is not None
 
         if cache_config and cache_config.block_size is None:
-            cache_config.block_size = 128 if ipex_avaliable else 16
+            cache_config.block_size = 128 if ipex_available else 16
 
-        if not ipex_avaliable and cache_config.block_size != 16:
+        if not ipex_available and cache_config.block_size != 16:
             raise RuntimeError(
                 f"--block-size={cache_config.block_size} requires"
                 " intel_extension_for_pytorch")
diff --git a/vllm/platforms/interface.py b/vllm/platforms/interface.py
index 2bb543bd73f7..f788d90bfb4a 100644
--- a/vllm/platforms/interface.py
+++ b/vllm/platforms/interface.py
@@ -231,7 +231,7 @@ class Platform:
                                 parser: Optional[FlexibleArgumentParser] = None
                                 ) -> None:
         """
-        Do some pre-registeration or update action for the current platform.
+        Do some pre-registration or update action for the current platform.
 
         This function is called before global VllmConfig is initialized or cli
         arguments are parsed. It's used for out-of-tree platforms to register or
diff --git a/vllm/reasoning/granite_reasoning_parser.py b/vllm/reasoning/granite_reasoning_parser.py
index 249ace1f167f..0dae02d33fec 100644
--- a/vllm/reasoning/granite_reasoning_parser.py
+++ b/vllm/reasoning/granite_reasoning_parser.py
@@ -60,7 +60,7 @@ class GraniteReasoningParser(ReasoningParser):
 
         Args:
             model_output (str): Output of the model to be parsed.
-            request (ChatCompletionReqest): Request being processed.
+            request (ChatCompletionRequest): Request being processed.
 
         Returns:
             tuple[Optional[str], Optional[str]]: Tuple pair containing the
diff --git a/vllm/sampling_params.py b/vllm/sampling_params.py
index 584320e76cbc..75cf09e0a228 100644
--- a/vllm/sampling_params.py
+++ b/vllm/sampling_params.py
@@ -101,7 +101,7 @@ class RequestOutputKind(Enum):
     CUMULATIVE = 0
     # Return only deltas in each RequestOutput
     DELTA = 1
-    # Do not return intermediate RequestOuputs
+    # Do not return intermediate RequestOutput
     FINAL_ONLY = 2
 
 
diff --git a/vllm/third_party/pynvml.py b/vllm/third_party/pynvml.py
index 0a4be23a0936..7ed9ced0e262 100644
--- a/vllm/third_party/pynvml.py
+++ b/vllm/third_party/pynvml.py
@@ -1119,7 +1119,7 @@ class _PrintableStructure(Structure):
     e.g. class that has _field_ 'hex_value', c_uint could be formatted with
       _fmt_ = {"hex_value" : "%08X"}
     to produce nicer output.
-    Default fomratting string for all fields can be set with key "" like:
+    Default formatting string for all fields can be set with key "" like:
       _fmt_ = {"" : "%d MHz"} # e.g all values are numbers in MHz.
     If not set it's assumed to be just "%s"
 
diff --git a/vllm/v1/attention/backends/mla/common.py b/vllm/v1/attention/backends/mla/common.py
index 1437db7e9d48..e6c4ebc729bb 100644
--- a/vllm/v1/attention/backends/mla/common.py
+++ b/vllm/v1/attention/backends/mla/common.py
@@ -83,8 +83,8 @@ spda_o = scaled_dot_product_attention(
 return spda_o @ W_O
 
 NOTE: in the actual code,
-    `kv_b_proj` is [W_UK; W_UV] concatnated per head
-    `q_b_proj` is [W_UQ; W_QR] concatnated per head
+    `kv_b_proj` is [W_UK; W_UV] concatenated per head
+    `q_b_proj` is [W_UQ; W_QR] concatenated per head
     `out_proj` is W_O
 
diff --git a/vllm/v1/executor/multiproc_executor.py b/vllm/v1/executor/multiproc_executor.py
index d79bce194b71..e854c2a44ff9 100644
--- a/vllm/v1/executor/multiproc_executor.py
+++ b/vllm/v1/executor/multiproc_executor.py
@@ -326,7 +326,7 @@ class WorkerProc:
             logger.debug("Worker interrupted.")
 
         except Exception:
-            # worker_busy_loop sends exceptions exceptons to Executor
+            # worker_busy_loop sends exceptions to Executor
             # for shutdown, but if there is an error in startup or an
             # error with IPC itself, we need to alert the parent.
             psutil.Process().parent().send_signal(signal.SIGUSR1)
diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py
index a83409a72a88..debb7072cff8 100644
--- a/vllm/v1/worker/gpu_model_runner.py
+++ b/vllm/v1/worker/gpu_model_runner.py
@@ -998,7 +998,7 @@ class GPUModelRunner(LoRAModelRunnerMixin):
     ) -> Union[ModelRunnerOutput, torch.Tensor]:
         self._update_states(scheduler_output)
         if not scheduler_output.total_num_scheduled_tokens:
-            # Return empty ModelRunnerOuptut if there's no work to do.
+            # Return empty ModelRunnerOutput if there's no work to do.
             return EMPTY_MODEL_RUNNER_OUTPUT
 
         if self.is_multimodal_model:
diff --git a/vllm/v1/worker/tpu_model_runner.py b/vllm/v1/worker/tpu_model_runner.py
index 7360c8760f21..c99c6cb72244 100644
--- a/vllm/v1/worker/tpu_model_runner.py
+++ b/vllm/v1/worker/tpu_model_runner.py
@@ -652,7 +652,7 @@ class TPUModelRunner:
         # Update cached state
         self._update_states(scheduler_output)
         if not scheduler_output.total_num_scheduled_tokens:
-            # Return empty ModelRunnerOuptut if there's no work to do.
+            # Return empty ModelRunnerOutput if there's no work to do.
             return EMPTY_MODEL_RUNNER_OUTPUT
 
         if self.is_multimodal_model: