diff --git a/csrc/attention/attention_kernels.cuh b/csrc/attention/attention_kernels.cuh index eb216dc8baf10..79a546554fa1e 100644 --- a/csrc/attention/attention_kernels.cuh +++ b/csrc/attention/attention_kernels.cuh @@ -172,7 +172,7 @@ __device__ void paged_attention_kernel( // Load the query to registers. // Each thread in a thread group has a different part of the query. - // For example, if the the thread group size is 4, then the first thread in + // For example, if the thread group size is 4, then the first thread in // the group has 0, 4, 8, ... th vectors of the query, and the second thread // has 1, 5, 9, ... th vectors of the query, and so on. NOTE(woosuk): Because // q is split from a qkv tensor, it may not be contiguous. @@ -259,7 +259,7 @@ __device__ void paged_attention_kernel( // Load a key to registers. // Each thread in a thread group has a different part of the key. - // For example, if the the thread group size is 4, then the first thread in + // For example, if the thread group size is 4, then the first thread in // the group has 0, 4, 8, ... th vectors of the key, and the second thread // has 1, 5, 9, ... th vectors of the key, and so on. 
for (int i = 0; i < NUM_TOKENS_PER_THREAD_GROUP; i++) { diff --git a/examples/offline_inference/chat_with_tools.py b/examples/offline_inference/chat_with_tools.py index 15519bfed9cb4..b532bf42adfba 100644 --- a/examples/offline_inference/chat_with_tools.py +++ b/examples/offline_inference/chat_with_tools.py @@ -68,7 +68,7 @@ def get_current_weather(city: str, state: str, unit: 'str'): "partly cloudly, with highs in the 90's.") -tool_funtions = {"get_current_weather": get_current_weather} +tool_functions = {"get_current_weather": get_current_weather} tools = [{ "type": "function", @@ -122,7 +122,7 @@ messages.append({ # above defined function tool_calls = json.loads(output) tool_answers = [ - tool_funtions[call['name']](**call['arguments']) for call in tool_calls + tool_functions[call['name']](**call['arguments']) for call in tool_calls ] # append the answer as a tool message and let the LLM give you an answer diff --git a/tests/lora/test_lora_huggingface.py b/tests/lora/test_lora_huggingface.py index 0875128c4ff1b..90498c47fb104 100644 --- a/tests/lora/test_lora_huggingface.py +++ b/tests/lora/test_lora_huggingface.py @@ -30,7 +30,7 @@ def test_load_checkpoints_from_huggingface(lora_fixture_name, request): lora_path = get_adapter_absolute_path(lora_name) - # lora loading should work for either absolute path and hugggingface id. + # lora loading should work for either an absolute path or a huggingface id. 
peft_helper = PEFTHelper.from_local_dir(lora_path, 4096) lora_model = LoRAModel.from_local_checkpoint( lora_path, diff --git a/tests/model_executor/weight_utils.py b/tests/model_executor/weight_utils.py index 11dfe4d4995d5..bdaba22c3c7a8 100644 --- a/tests/model_executor/weight_utils.py +++ b/tests/model_executor/weight_utils.py @@ -20,11 +20,11 @@ def test_hf_transfer_auto_activation(): try: # enable hf hub transfer if available import hf_transfer # type: ignore # noqa - HF_TRANFER_ACTIVE = True + HF_TRANSFER_ACTIVE = True except ImportError: - HF_TRANFER_ACTIVE = False + HF_TRANSFER_ACTIVE = False assert (huggingface_hub.constants.HF_HUB_ENABLE_HF_TRANSFER == - HF_TRANFER_ACTIVE) + HF_TRANSFER_ACTIVE) def test_download_weights_from_hf(): diff --git a/vllm/config.py b/vllm/config.py index 81cac4d041166..19de4d0549b64 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -297,7 +297,7 @@ class ModelConfig: - 1K -> 1024\n - 25.6k -> 25,600""" spec_target_max_model_len: Optional[int] = None - """Specify the the maximum length for spec decoding draft models.""" + """Specify the maximum length for spec decoding draft models.""" quantization: Optional[QuantizationMethods] = None """Method used to quantize the weights. If `None`, we first check the `quantization_config` attribute in the model config file. If that is diff --git a/vllm/lora/ops/triton_ops/lora_expand_op.py b/vllm/lora/ops/triton_ops/lora_expand_op.py index 13ddaaf961f7b..9feb9e4624591 100644 --- a/vllm/lora/ops/triton_ops/lora_expand_op.py +++ b/vllm/lora/ops/triton_ops/lora_expand_op.py @@ -153,7 +153,7 @@ def _lora_expand( lora_token_start_loc (torch.Tensor): A cumulative sum of num_tokens_per_lora. lora_token_start_loc[0] is always 0 so that lora_token_start_loc[i], along with num_tokens_per_lora[i] - identifies the the region in token_indices_sorted_by_lora_ids that + identifies the region in token_indices_sorted_by_lora_ids that LoRA lora_ids[i] should process. 
lora_ids (torch.Tensor): LoRA ids to process. no_lora_flag_cpu (torch.Tensor): A CPU tensor of size 1, that indicates diff --git a/vllm/model_executor/layers/mamba/mamba_mixer2.py b/vllm/model_executor/layers/mamba/mamba_mixer2.py index 1ea65e96d7502..bc6e6fcdd0a2e 100644 --- a/vllm/model_executor/layers/mamba/mamba_mixer2.py +++ b/vllm/model_executor/layers/mamba/mamba_mixer2.py @@ -142,7 +142,7 @@ def mamba_v2_sharded_weight_loader( ) -> LoaderFunction: """Create a weight loader for mamba v2. This ensures that the projections are correctly sharded so that they can be split into x, B, C. It also - ensures the the all the groups corresponding to a head shard is placed + ensures that all the groups corresponding to a head shard are placed together with it. """ diff --git a/vllm/model_executor/models/granite_speech.py b/vllm/model_executor/models/granite_speech.py index 512ec55177d84..fd8fb48c50e3a 100644 --- a/vllm/model_executor/models/granite_speech.py +++ b/vllm/model_executor/models/granite_speech.py @@ -21,7 +21,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -"""Inference-only IBM Granite speeech model.""" +"""Inference-only IBM Granite speech model.""" import math from collections.abc import Iterable, Mapping from typing import Optional, TypedDict, Union @@ -626,7 +626,7 @@ class GraniteSpeechForConditionalGeneration( audio_embed_sizes: torch.Tensor, ) -> torch.Tensor: """Calculate the input features mask, which will generally be used - to mask the the padded features for all entries in the batch except + to mask the padded features for all entries in the batch except for those with the most audio features. 
Args: diff --git a/vllm/model_executor/models/phi4mm_audio.py b/vllm/model_executor/models/phi4mm_audio.py index 609746b48588c..98cef75069ae2 100644 --- a/vllm/model_executor/models/phi4mm_audio.py +++ b/vllm/model_executor/models/phi4mm_audio.py @@ -91,9 +91,9 @@ class ConformerEncoderLayer(nn.Module): if set to True, use GLULinear module, otherwise, used GLUPointWiseConv module. default to False. - attention_innner_dim: int, optional + attention_inner_dim: int, optional if equal to -1, attention dim for linears k/q/v is - equal to d_model. otherwise attention_innner_dim is used. + equal to d_model. otherwise attention_inner_dim is used. default -1. attention_glu_type: str, optional activation function for glu used in the multihead attention, @@ -148,7 +148,7 @@ class ConformerEncoderLayer(nn.Module): conv_glu_type="sigmoid", bias_in_glu=True, linear_glu_in_convm=False, - attention_innner_dim=-1, + attention_inner_dim=-1, attention_glu_type="swish", activation_checkpointing="", export=False, @@ -169,7 +169,7 @@ class ConformerEncoderLayer(nn.Module): n_head, d_model, dropout_rate, - attention_innner_dim, + attention_inner_dim, attention_glu_type, bias_in_glu, use_pt_scaled_dot_product_attention= diff --git a/vllm/v1/request.py b/vllm/v1/request.py index d2843b65ab59c..d1cdd2c52750c 100644 --- a/vllm/v1/request.py +++ b/vllm/v1/request.py @@ -72,7 +72,7 @@ class Request: assert len(self.mm_inputs) == len(self.mm_hashes) # Read-only views - # Prevent directly appending to the these lists since + # Prevent directly appending to these lists since # they should also be updated simultaneously. self.output_token_ids = ConstantList(self._output_token_ids) self.all_token_ids = ConstantList(self._all_token_ids)