diff --git a/vllm/attention/backends/flashinfer.py b/vllm/attention/backends/flashinfer.py index 715ed6748b846..0556c191ddea6 100644 --- a/vllm/attention/backends/flashinfer.py +++ b/vllm/attention/backends/flashinfer.py
@@ -438,7 +438,7 @@ class FlashInferMetadata(AttentionMetadata): not in supported_head_sizes: raise ValueError( f"Only {supported_head_sizes} are supported for head_dim,", - f"received {self.head_dim}.") + f" received {self.head_dim}.") def begin_forward(self): if self.num_prefill_tokens > 0:
diff --git a/vllm/attention/backends/mla/common.py b/vllm/attention/backends/mla/common.py index c3dbbdb868237..f47ea3684e03c 100644 --- a/vllm/attention/backends/mla/common.py +++ b/vllm/attention/backends/mla/common.py
@@ -533,7 +533,7 @@ class MLACommonMetadata(AttentionMetadata): not in supported_head_sizes: raise ValueError( f"Only {supported_head_sizes} are supported for head_dim,", - f"received {self.head_dim}.") + f" received {self.head_dim}.") @property def prefill_metadata(self) -> Optional["MLACommonMetadata"]:
diff --git a/vllm/attention/backends/rocm_flash_attn.py b/vllm/attention/backends/rocm_flash_attn.py index 1b1f6ca9beed6..3f40686ee2fda 100644 --- a/vllm/attention/backends/rocm_flash_attn.py +++ b/vllm/attention/backends/rocm_flash_attn.py
@@ -497,7 +497,7 @@ class ROCmFlashAttentionImpl(AttentionImpl): if logits_soft_cap is not None: raise ValueError( "ROCm Triton FlashAttention does not support attention" - "logits soft capping." + " logits soft capping." " please try using the ROCm CK " "FA backend instead by setting the env var " "`VLLM_USE_TRITON_FLASH_ATTN=0`")
@@ -528,7 +528,7 @@ class ROCmFlashAttentionImpl(AttentionImpl): if self.use_naive_attn: if logits_soft_cap is not None: raise ValueError( - "ROCm Naive FlashAttention does not support" + "ROCm Naive FlashAttention does not support " "attention logits soft capping.") self.attn_func = _sdpa_attention
diff --git a/vllm/config.py b/vllm/config.py index fea673b685604..8e1ce87438af5 100644 --- a/vllm/config.py +++ b/vllm/config.py
@@ -924,8 +924,8 @@ class ModelConfig: layers_block_type_value = getattr(self.hf_config, "layers_block_type", None) if layers_block_type_value is None: - raise ValueError("The model is an hybrid without a" - "layers_block_type in the hf_config," + raise ValueError("The model is an hybrid without a " + "layers_block_type in the hf_config, " "cannot determine the num of " f"{block_type.value} layers")
@@ -2516,7 +2516,7 @@ def _get_and_verify_dtype( if current_platform.is_hpu() and config_dtype == torch.float16: logger.info( - "For HPU, we cast models to bfloat16 instead of" + "For HPU, we cast models to bfloat16 instead of " "using float16 by default. Please specify `dtype` if you " "want to use float16.") torch_dtype = torch.bfloat16
@@ -2732,7 +2732,7 @@ class DecodingConfig: backend=self.guided_decoding_backend).backend_name if backend not in valid_guided_backends: raise ValueError(f"Invalid guided_decoding_backend '{backend}," - f"must be one of {valid_guided_backends}") + f" must be one of {valid_guided_backends}") @dataclass
@@ -3008,7 +3008,7 @@ class CompilationConfig(BaseModel): def model_post_init(self, __context: Any) -> None: if not self.enable_reshape and self.enable_fusion: logger.warning_once( - "Fusion enabled but reshape elimination disabled." + "Fusion enabled but reshape elimination disabled. " "RMSNorm + quant (fp8) fusion might not work") pass_config: PassConfig = Field(default_factory=PassConfig)
@@ -3563,7 +3563,7 @@ def set_current_vllm_config(vllm_config: VllmConfig, check_compile=False): logger.warning( "`torch.compile` is turned on, but the model %s" " does not support it. Please open an issue on GitHub" - "if you want it to be supported.", + " if you want it to be supported.", vllm_config.model_config.model) _current_vllm_config = old_vllm_config
diff --git a/vllm/distributed/device_communicators/pynccl_wrapper.py b/vllm/distributed/device_communicators/pynccl_wrapper.py index 03c3b0be76393..4f04899e92e6d 100644 --- a/vllm/distributed/device_communicators/pynccl_wrapper.py +++ b/vllm/distributed/device_communicators/pynccl_wrapper.py
@@ -227,10 +227,10 @@ class NCCLLibrary: self.lib = NCCLLibrary.path_to_library_cache[so_file] except Exception as e: logger.error( - "Failed to load NCCL library from %s ." + "Failed to load NCCL library from %s. " "It is expected if you are not running on NVIDIA/AMD GPUs." "Otherwise, the nccl library might not exist, be corrupted " - "or it does not support the current platform %s." + "or it does not support the current platform %s. " "If you already have the library, please set the " "environment variable VLLM_NCCL_SO_PATH" " to point to the correct nccl library path.", so_file,
diff --git a/vllm/distributed/kv_transfer/kv_pipe/mooncake_pipe.py b/vllm/distributed/kv_transfer/kv_pipe/mooncake_pipe.py index 58ab7f0b64243..57a2b0393ba44 100644 --- a/vllm/distributed/kv_transfer/kv_pipe/mooncake_pipe.py +++ b/vllm/distributed/kv_transfer/kv_pipe/mooncake_pipe.py
@@ -137,7 +137,7 @@ class MooncakeTransferEngine: if metadata_backend not in supported_backend: raise ValueError( "Mooncake Configuration error. `metadata_backend`" - f"should be one of {supported_backend}.") + f" should be one of {supported_backend}.") self.engine.initializeExt(local_hostname, metadata_server, protocol, device_name, metadata_backend)
diff --git a/vllm/entrypoints/chat_utils.py b/vllm/entrypoints/chat_utils.py index f04902ae1c767..c50c631dafccc 100644 --- a/vllm/entrypoints/chat_utils.py +++ b/vllm/entrypoints/chat_utils.py
@@ -823,7 +823,7 @@ def _parse_chat_message_content_part( # content is empty, log a warning and skip if part_type in VALID_MESSAGE_CONTENT_MM_PART_TYPES and not content: logger.warning( - "Skipping multimodal part (type: '%s')" + "Skipping multimodal part (type: '%s') " "with empty / unparsable content.", part_type) return None
diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py index cefb9184b2028..3f3262f6e72c0 100644 --- a/vllm/entrypoints/llm.py +++ b/vllm/entrypoints/llm.py
@@ -1342,7 +1342,7 @@ class LLM: return params if params.guided_decoding is not None: - raise ValueError("Cannot set both guided_options_request and" + raise ValueError("Cannot set both guided_options_request and " "params.guided_decoding.") params.guided_decoding = GuidedDecodingParams(
diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py index 73061995572b5..9995951b3f3d6 100644 --- a/vllm/entrypoints/openai/api_server.py +++ b/vllm/entrypoints/openai/api_server.py
@@ -575,7 +575,7 @@ async def do_rerank(request: RerankRequest, raw_request: Request): async def do_rerank_v1(request: RerankRequest, raw_request: Request): logger.warning_once( "To indicate that the rerank API is not part of the standard OpenAI" - " API, we have located it at `/rerank`. Please update your client" + " API, we have located it at `/rerank`. Please update your client " "accordingly. (Note: Conforms to JinaAI rerank API)") return await do_rerank(request, raw_request)
diff --git a/vllm/executor/ray_distributed_executor.py b/vllm/executor/ray_distributed_executor.py index b866413e3a62d..cf834fdca4265 100644 --- a/vllm/executor/ray_distributed_executor.py +++ b/vllm/executor/ray_distributed_executor.py
@@ -513,7 +513,7 @@ class RayDistributedExecutor(DistributedExecutorBase): if cupy_spec is None and envs.VLLM_USE_RAY_COMPILED_DAG_NCCL_CHANNEL: raise ValueError( "cupy is not installed but required since " - "VLLM_USE_RAY_COMPILED_DAG_NCCL_CHANNEL is set." + "VLLM_USE_RAY_COMPILED_DAG_NCCL_CHANNEL is set. " "Run `pip install ray[adag]` and check cupy installation.") def _compiled_ray_dag(self, enable_asyncio: bool):
diff --git a/vllm/executor/ray_utils.py b/vllm/executor/ray_utils.py index 1734c670bf10e..7104004fcfaec 100644 --- a/vllm/executor/ray_utils.py +++ b/vllm/executor/ray_utils.py
@@ -317,7 +317,7 @@ def initialize_ray_cluster( if parallel_config.world_size > device_bundles: raise ValueError( f"The number of required {device_str}s exceeds the total " - f"number of available {device_str}s in the placement group." + f"number of available {device_str}s in the placement group. " f"Required number of devices: {parallel_config.world_size}. " f"Total number of devices: {device_bundles}.") else:
diff --git a/vllm/lora/models.py b/vllm/lora/models.py index eb53513a28307..774c3876e774b 100644 --- a/vllm/lora/models.py +++ b/vllm/lora/models.py
@@ -437,7 +437,7 @@ class LoRAModelManager(AdapterModelManager): def pin_adapter(self, lora_id: int) -> bool: """Pin a LoRAModel in the manager cache.""" raise NotImplementedError( - "Pinning is not supported in LoRAModelManager." + "Pinning is not supported in LoRAModelManager. " "Use LRUCacheLoRAModelManager for pinning") # type: ignore def _set_adapter_mapping(self, mapping: LoRAMapping) -> None:
diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py index 389359a663ccc..a8de36491c5c7 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py
@@ -71,7 +71,7 @@ class CompressedTensorsW8A8Fp8MoEMethod(CompressedTensorsMoEMethod): if not (self.weight_quant.strategy == QuantizationStrategy.TENSOR and self.input_quant.strategy == QuantizationStrategy.TENSOR): raise ValueError( - "For FP8 Fused MoE layers, only per-tensor scales" + "For FP8 Fused MoE layers, only per-tensor scales " "for weights and activations are supported. Found " f"{self.weight_quant}, {self.input_quant}")
diff --git a/vllm/model_executor/layers/quantization/gptq.py b/vllm/model_executor/layers/quantization/gptq.py index 09291c2bf1f0b..1c8d6cb1ea79a 100644 --- a/vllm/model_executor/layers/quantization/gptq.py +++ b/vllm/model_executor/layers/quantization/gptq.py
@@ -74,7 +74,7 @@ class GPTQConfig(QuantizationConfig): def __repr__(self) -> str: return (f"GPTQConfig(weight_bits={self.weight_bits}, " f"group_size={self.group_size}, " - f"desc_act={self.desc_act})," + f"desc_act={self.desc_act}), " f"lm_head_quantized={self.lm_head_quantized}), " f"dynamic={self.dynamic}")
diff --git a/vllm/model_executor/layers/quantization/modelopt.py b/vllm/model_executor/layers/quantization/modelopt.py index 050130de1c0f3..36711a7a5098b 100644 --- a/vllm/model_executor/layers/quantization/modelopt.py +++ b/vllm/model_executor/layers/quantization/modelopt.py
@@ -56,7 +56,7 @@ class ModelOptFp8Config(QuantizationConfig): quant_method = quant_config["quant_algo"] is_checkpoint_fp8_serialized = ("FP8" in quant_method) if not is_checkpoint_fp8_serialized: - raise ValueError("ModelOpt currently only supports static FP8" + raise ValueError("ModelOpt currently only supports static FP8 " "quantization in vLLM. Please check the " "`hf_quant_config.json` file for your model's " "quant configuration.")
diff --git a/vllm/model_executor/layers/quantization/neuron_quant.py b/vllm/model_executor/layers/quantization/neuron_quant.py index 82954612fb2ad..f6f66803f8169 100644 --- a/vllm/model_executor/layers/quantization/neuron_quant.py +++ b/vllm/model_executor/layers/quantization/neuron_quant.py
@@ -25,8 +25,8 @@ class NeuronQuantConfig(QuantizationConfig): if self.quant_dtype not in SUPPORTED_QUANT_DTYPE_LIST: raise ValueError( f"Neuron quantization datatype {self.quant_dtype} is not valid," - f"the quantization datatype should match one of the below types" - f"{SUPPORTED_QUANT_DTYPE_LIST}") + f" the quantization datatype should match one of the below " + f"types {SUPPORTED_QUANT_DTYPE_LIST}") self.dequant_dtype = dequant_dtype self.quantize_method = quantize_method
diff --git a/vllm/model_executor/layers/quantization/quark/quark_moe.py b/vllm/model_executor/layers/quantization/quark/quark_moe.py index 36b08589fd16b..18393517a0bf9 100644 --- a/vllm/model_executor/layers/quantization/quark/quark_moe.py +++ b/vllm/model_executor/layers/quantization/quark/quark_moe.py
@@ -55,7 +55,7 @@ class QuarkW8A8Fp8MoEMethod(QuarkMoEMethod): if not (weight_qscheme == "per_tensor" and input_qscheme == "per_tensor"): raise ValueError( - "For FP8 Fused MoE layers, only per-tensor scales" + "For FP8 Fused MoE layers, only per-tensor scales " "for weights and activations are supported. Found " f"{weight_qscheme}, {input_qscheme}") # noqa E501
diff --git a/vllm/model_executor/layers/quantization/utils/marlin_utils.py b/vllm/model_executor/layers/quantization/utils/marlin_utils.py index 05e37251aa161..80416c1bc6ebc 100644 --- a/vllm/model_executor/layers/quantization/utils/marlin_utils.py +++ b/vllm/model_executor/layers/quantization/utils/marlin_utils.py
@@ -118,7 +118,7 @@ def verify_marlin_supports_shape(output_size_per_partition: int, and input_size_per_partition % group_size != 0): raise ValueError( f"Weight input_size_per_partition = {input_size_per_partition}" - f" is not divisible by group_size = {group_size}." + f" is not divisible by group_size = {group_size}. " "Consider reducing tensor_parallel_size or running " "with --quantization gptq.")
diff --git a/vllm/model_executor/model_loader/loader.py b/vllm/model_executor/model_loader/loader.py index e23c637585562..4e8ef49235ed5 100644 --- a/vllm/model_executor/model_loader/loader.py +++ b/vllm/model_executor/model_loader/loader.py
@@ -1088,7 +1088,7 @@ class BitsAndBytesModelLoader(BaseModelLoader): self.model_type = type(model).__name__ logger.info("Loading weights with BitsAndBytes quantization. " - " May take a while ...") + "May take a while ...") quant_config = getattr(model_config.hf_config, "quantization_config", None)
diff --git a/vllm/model_executor/models/deepseek_vl2.py b/vllm/model_executor/models/deepseek_vl2.py index 4e2dda33bcab3..c58b65d493487 100644 --- a/vllm/model_executor/models/deepseek_vl2.py +++ b/vllm/model_executor/models/deepseek_vl2.py
@@ -562,7 +562,7 @@ class DeepseekVLV2ForCausalLM(nn.Module, SupportsMultiModal, SupportsPP): # 3D tensor return list(torch.unbind(image_data, dim=0)) raise ValueError( - "We expect batched 2D tensors;" + "We expect batched 2D tensors; " "this can be either a list of 2D tensors or a single 3D tensor." )
diff --git a/vllm/model_executor/models/fuyu.py b/vllm/model_executor/models/fuyu.py index 4f5519f325e04..7e4cc6bac5e61 100644 --- a/vllm/model_executor/models/fuyu.py +++ b/vllm/model_executor/models/fuyu.py
@@ -290,7 +290,7 @@ class FuyuForCausalLM(nn.Module, SupportsMultiModal, SupportsPP): expected_expr = str(expected_dims) raise ValueError( "The expected shape of pixel values per image per batch " - f" per patch is {expected_expr}. " + f"per patch is {expected_expr}. " f"You supplied {tuple(d.shape)}.") for d in data:
diff --git a/vllm/model_executor/models/gritlm.py b/vllm/model_executor/models/gritlm.py index a20328289f924..16223953ff839 100644 --- a/vllm/model_executor/models/gritlm.py +++ b/vllm/model_executor/models/gritlm.py
@@ -90,8 +90,8 @@ class GritLMPooler(nn.Module): # Return no instruction in case of missing BOS token. if prompt_token_ids[0] != self.token_ids["<s>"]: - logger.warning("BOS token not found in prompt," - "thus using empty string for instruction." + logger.warning("BOS token not found in prompt, " + "thus using empty string for instruction. " "GritLM requires BOS token in prompt.") return instruction_len
@@ -111,8 +111,8 @@ class GritLMPooler(nn.Module): if found_embed_pattern_idx != -1: instruction_len = found_embed_pattern_idx + len(embed_pattern_ids) else: - logger.warning("Query instruction not found in prompt," - "thus using BOS token as instruction instead." + logger.warning("Query instruction not found in prompt, " + "thus using BOS token as instruction instead. " "GritLM requires query instruction in prompt.") instruction_len = 1
diff --git a/vllm/model_executor/models/minicpmv.py b/vllm/model_executor/models/minicpmv.py index 46f794e88ad5f..2699958331f3d 100644 --- a/vllm/model_executor/models/minicpmv.py +++ b/vllm/model_executor/models/minicpmv.py
@@ -673,7 +673,7 @@ class MiniCPMVMultiModalProcessor(BaseMultiModalProcessor[_I]): for modality, count in counts.items(): if modality not in inputs or not inputs[modality]: raise ValueError(f"None input data of {modality}." - "But prompt requires.") + " But prompt requires.") counter_key = self.get_modality_num_counter(modality) if len(inputs[modality][counter_key]) != count: raise ValueError(f"The prompt requires {count} "
diff --git a/vllm/model_executor/models/phi3v.py b/vllm/model_executor/models/phi3v.py index 61d63e104de46..0f45f131065a8 100644 --- a/vllm/model_executor/models/phi3v.py +++ b/vllm/model_executor/models/phi3v.py
@@ -639,7 +639,7 @@ class Phi3VForCausalLM(nn.Module, SupportsMultiModal, SupportsPP, # 3D tensor return list(torch.unbind(image_data, dim=0)) raise ValueError( - "We expect batched 2D tensors;" + "We expect batched 2D tensors; " "this can be either a list of 2D tensors or a single 3D tensor." )
diff --git a/vllm/model_executor/models/prithvi_geospatial_mae.py b/vllm/model_executor/models/prithvi_geospatial_mae.py index 0d0c367e677e3..3d95e949e71da 100644 --- a/vllm/model_executor/models/prithvi_geospatial_mae.py +++ b/vllm/model_executor/models/prithvi_geospatial_mae.py
@@ -153,8 +153,8 @@ class PrithviGeoSpatialMAE(nn.Module, IsAttentionFree, SupportsMultiModal): vllm_config.model_config.hf_config.to_dict()["pretrained_cfg"]) if self.model is None: raise ValueError( - "Unsupported task." - "Only SemanticSegmentationTask is supported for now" + "Unsupported task. " + "Only SemanticSegmentationTask is supported for now " "by PrithviGeospatialMAE.") def _parse_and_validate_multimodal_data(
diff --git a/vllm/multimodal/profiling.py b/vllm/multimodal/profiling.py index 802e40a0c9523..093f8b7a8179b 100644 --- a/vllm/multimodal/profiling.py +++ b/vllm/multimodal/profiling.py
@@ -160,7 +160,7 @@ class MultiModalProfiler(Generic[_I]): if mm_counts.keys() != mm_max_tokens_per_item.keys(): raise AssertionError( - "The keys returned by `get_supported_mm_limits`" + "The keys returned by `get_supported_mm_limits` " f"({set(mm_counts.keys())}) should be the same as those " "returned by `get_mm_max_tokens_per_item` " f"({set(mm_max_tokens_per_item.keys())})")
diff --git a/vllm/platforms/cuda.py b/vllm/platforms/cuda.py index 5b07312561474..bf425b89132ee 100644 --- a/vllm/platforms/cuda.py +++ b/vllm/platforms/cuda.py
@@ -190,7 +190,7 @@ class CudaPlatformBase(Platform): "Cannot use FlashAttention-2 backend for FP8 KV cache.") logger.warning( "Please use FlashInfer backend with FP8 KV Cache for " - "better performance by setting environment variable " + "better performance by setting environment variable " "VLLM_ATTENTION_BACKEND=FLASHINFER") target_backend = _Backend.XFORMERS elif block_size % 16 != 0:
diff --git a/vllm/platforms/openvino.py b/vllm/platforms/openvino.py index 41221de0afe50..f385064875ca7 100644 --- a/vllm/platforms/openvino.py +++ b/vllm/platforms/openvino.py
@@ -97,7 +97,7 @@ class OpenVinoPlatform(Platform): if envs.VLLM_OPENVINO_CPU_KV_CACHE_PRECISION == "u8": if not OpenVinoPlatform.is_openvino_cpu(): - logger.info("VLLM_OPENVINO_CPU_KV_CACHE_PRECISION is" + logger.info("VLLM_OPENVINO_CPU_KV_CACHE_PRECISION is " "ignored for GPU, f16 data type will be used.") cache_config.cache_dtype = ov.Type.f16 else:
diff --git a/vllm/platforms/xpu.py b/vllm/platforms/xpu.py index 04af319566af5..d99d4ef3dac06 100644 --- a/vllm/platforms/xpu.py +++ b/vllm/platforms/xpu.py
@@ -73,7 +73,7 @@ class XPUPlatform(Platform): logger.warning( "bfloat16 is only supported on Intel Data Center GPU, " "Intel Arc GPU is not supported yet. Your device is %s," - "which is not supported. will fallback to float16", + " which is not supported. will fallback to float16", cls.get_device_name()) model_config.dtype = torch.float16 if not model_config.enforce_eager:
diff --git a/vllm/prompt_adapter/models.py b/vllm/prompt_adapter/models.py index 3ba7d0896f95a..795591606f259 100644 --- a/vllm/prompt_adapter/models.py +++ b/vllm/prompt_adapter/models.py
@@ -226,7 +226,7 @@ class PromptAdapterModelManager(AdapterModelManager): def pin_adapter(self, prompt_adapter_id: int) -> bool: """Pin a PromptAdapterModel in the manager cache.""" raise NotImplementedError( - "Pinning is not supported in PromptAdapterModelManager." + "Pinning is not supported in PromptAdapterModelManager. " "Use LRUCachePromptAdapterModelManager for pinning" ) # type: ignore
diff --git a/vllm/spec_decode/draft_model_runner.py b/vllm/spec_decode/draft_model_runner.py index 40ecc3481e6b2..c54e6abe18d73 100644 --- a/vllm/spec_decode/draft_model_runner.py +++ b/vllm/spec_decode/draft_model_runner.py
@@ -16,7 +16,7 @@ try: ROCmFlashAttentionMetadata as FlashAttentionMetadata) except (ModuleNotFoundError, ImportError) as err: raise RuntimeError( - "Draft model speculative decoding currently only supports" + "Draft model speculative decoding currently only supports " "CUDA and ROCm flash attention backend.") from err from vllm.logger import init_logger
diff --git a/vllm/transformers_utils/configs/jais.py b/vllm/transformers_utils/configs/jais.py index 0cab2c42e5791..be0f3b7e5e529 100644 --- a/vllm/transformers_utils/configs/jais.py +++ b/vllm/transformers_utils/configs/jais.py
@@ -212,26 +212,26 @@ class JAISConfig(PretrainedConfig): if (not isinstance(self.alibi_scaling, dict) or len(self.alibi_scaling) != 2): raise ValueError( - "`alibi_scaling` must be a dictionary with two fields," + "`alibi_scaling` must be a dictionary with two fields, " "`type` and `factor` or `type` and `train_seq_len`, " f"got {self.alibi_scaling}") alibi_scaling_type = self.alibi_scaling.get("type", None) alibi_scaling_factor = self.alibi_scaling.get("factor", None) alibi_dynamic_scaling = self.alibi_scaling.get("train_seq_len", None) if alibi_scaling_type is None or alibi_scaling_type != "linear": - raise ValueError(f"`alibi_scaling`'s type field must be 'linear'," + raise ValueError(f"`alibi_scaling`'s type field must be 'linear', " f"got {alibi_scaling_type}") if (alibi_scaling_factor is not None and not isinstance(alibi_scaling_factor, float) or (alibi_scaling_factor is not None and alibi_scaling_factor <= 1.0)): raise ValueError( - f"`alibi_scaling`'s factor field must be a float > 1.0," + f"`alibi_scaling`'s factor field must be a float > 1.0, " f"got {alibi_scaling_factor}") if (alibi_dynamic_scaling is not None and not isinstance(alibi_dynamic_scaling, int) or (alibi_dynamic_scaling is not None and alibi_dynamic_scaling <= 1)): raise ValueError( - f"`alibi_scaling`'s `train_seq_len` field must be an" + f"`alibi_scaling`'s `train_seq_len` field must be an " f"integer > 1, got {alibi_dynamic_scaling}")
diff --git a/vllm/utils.py b/vllm/utils.py index 675edc3620b57..29e60a9c9be2d 100644 --- a/vllm/utils.py +++ b/vllm/utils.py
@@ -447,7 +447,7 @@ def get_ip() -> str: logger.warning( "The environment variable HOST_IP is deprecated and ignored, as" " it is often used by Docker and other software to" - "interact with the container's network stack. Please " + " interact with the container's network stack. Please " "use VLLM_HOST_IP instead to set the IP address for vLLM processes" " to communicate with each other.") if host_ip:
@@ -2091,8 +2091,8 @@ def set_ulimit(target_soft_limit=65535): (target_soft_limit, current_hard)) except ValueError as e: logger.warning( - "Found ulimit of %s and failed to automatically increase" - "with error %s. This can cause fd limit errors like" + "Found ulimit of %s and failed to automatically increase " + "with error %s. This can cause fd limit errors like " "`OSError: [Errno 24] Too many open files`. Consider " "increasing with ulimit -n", current_soft, e)
diff --git a/vllm/v1/worker/gpu_worker.py b/vllm/v1/worker/gpu_worker.py index d9a415aee528b..a14a7082df4bb 100644 --- a/vllm/v1/worker/gpu_worker.py +++ b/vllm/v1/worker/gpu_worker.py
@@ -277,5 +277,5 @@ def _check_if_gpu_supports_dtype(torch_dtype: torch.dtype): raise ValueError( "Bfloat16 is only supported on GPUs with compute capability " f"of at least 8.0. Your {gpu_name} GPU {compute_str}. " - "You can use float16 instead by explicitly setting the" + "You can use float16 instead by explicitly setting the " "`dtype` flag in CLI, for example: --dtype=half.")
diff --git a/vllm/worker/openvino_worker.py b/vllm/worker/openvino_worker.py index 0690222d91afa..1ad66e6f3be7c 100644 --- a/vllm/worker/openvino_worker.py +++ b/vllm/worker/openvino_worker.py
@@ -545,7 +545,7 @@ class OpenVINOWorker(LoraNotSupportedWorkerBase): "value. This may cause low performance due to " "occupying the majority of available system " "memory. Please consider decreasing " - "gpu_memory_utilization or explicitly setting" + "gpu_memory_utilization or explicitly setting " "`VLLM_OPENVINO_KVCACHE_SPACE` (GB) environment " "variable.", memory_utilization)
diff --git a/vllm/worker/worker.py b/vllm/worker/worker.py index ff38e3bfc207b..5d548bdb59f71 100644 --- a/vllm/worker/worker.py +++ b/vllm/worker/worker.py
@@ -525,7 +525,7 @@ def _check_if_gpu_supports_dtype(torch_dtype: torch.dtype): raise ValueError( "Bfloat16 is only supported on GPUs with compute capability " f"of at least 8.0. Your {gpu_name} GPU {compute_str}. " - "You can use float16 instead by explicitly setting the" + "You can use float16 instead by explicitly setting the " "`dtype` flag in CLI, for example: --dtype=half.")
@@ -533,7 +533,7 @@ def raise_if_cache_size_invalid(num_gpu_blocks, block_size, is_attention_free, max_model_len) -> None: if is_attention_free and num_gpu_blocks != 0: raise ValueError("No memory should be allocated for the cache blocks " - f"for an attention-free model, but {num_gpu_blocks}" + f"for an attention-free model, but {num_gpu_blocks} " "blocks are allocated.") if not is_attention_free and num_gpu_blocks <= 0: raise ValueError("No available memory for the cache blocks. "