Mirror of https://git.datalinker.icu/vllm-project/vllm.git (synced 2025-12-15 07:45:01 +08:00)
[Misc] set single whitespace between log sentences (#13771)

Signed-off-by: cjackal <44624812+cjackal@users.noreply.github.com>

parent 7196a3b1db
commit 51010a1807
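Every change below addresses the same pattern: long log and error messages are built from adjacent Python string literals, which the interpreter concatenates with no separator, so a fragment that does not end (or begin) with a space runs straight into the next sentence in the emitted message. A minimal sketch of the pitfall and the fix; the message text here is illustrative, not taken from the diff:

# Adjacent string literals are concatenated verbatim, with no separator.
broken = ("model does not support bfloat16."
          "falling back to float16")
# -> "model does not support bfloat16.falling back to float16"

# The fix applied throughout this commit: keep exactly one space at each
# sentence boundary, carried by one of the two fragments.
fixed = ("model does not support bfloat16. "
         "falling back to float16")
# -> "model does not support bfloat16. falling back to float16"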
@@ -438,7 +438,7 @@ class FlashInferMetadata(AttentionMetadata):
                 not in supported_head_sizes:
             raise ValueError(
                 f"Only {supported_head_sizes} are supported for head_dim,",
-                f"received {self.head_dim}.")
+                f" received {self.head_dim}.")
 
     def begin_forward(self):
         if self.num_prefill_tokens > 0:
@@ -533,7 +533,7 @@ class MLACommonMetadata(AttentionMetadata):
                 not in supported_head_sizes:
             raise ValueError(
                 f"Only {supported_head_sizes} are supported for head_dim,",
-                f"received {self.head_dim}.")
+                f" received {self.head_dim}.")
 
     @property
     def prefill_metadata(self) -> Optional["MLACommonMetadata"]:
@@ -497,7 +497,7 @@ class ROCmFlashAttentionImpl(AttentionImpl):
             if logits_soft_cap is not None:
                 raise ValueError(
                     "ROCm Triton FlashAttention does not support attention"
-                    "logits soft capping."
+                    " logits soft capping."
                     " please try using the ROCm CK "
                     "FA backend instead by setting the env var "
                     "`VLLM_USE_TRITON_FLASH_ATTN=0`")
@@ -528,7 +528,7 @@ class ROCmFlashAttentionImpl(AttentionImpl):
             if self.use_naive_attn:
                 if logits_soft_cap is not None:
                     raise ValueError(
-                        "ROCm Naive FlashAttention does not support"
+                        "ROCm Naive FlashAttention does not support "
                         "attention logits soft capping.")
 
                 self.attn_func = _sdpa_attention
@@ -924,8 +924,8 @@ class ModelConfig:
             layers_block_type_value = getattr(self.hf_config,
                                               "layers_block_type", None)
             if layers_block_type_value is None:
-                raise ValueError("The model is an hybrid without a"
-                                 "layers_block_type in the hf_config,"
+                raise ValueError("The model is an hybrid without a "
+                                 "layers_block_type in the hf_config, "
                                  "cannot determine the num of "
                                  f"{block_type.value} layers")
 
@@ -2516,7 +2516,7 @@ def _get_and_verify_dtype(
 
         if current_platform.is_hpu() and config_dtype == torch.float16:
             logger.info(
-                "For HPU, we cast models to bfloat16 instead of"
+                "For HPU, we cast models to bfloat16 instead of "
                 "using float16 by default. Please specify `dtype` if you "
                 "want to use float16.")
             torch_dtype = torch.bfloat16
@@ -2732,7 +2732,7 @@ class DecodingConfig:
             backend=self.guided_decoding_backend).backend_name
         if backend not in valid_guided_backends:
             raise ValueError(f"Invalid guided_decoding_backend '{backend},"
-                             f"must be one of {valid_guided_backends}")
+                             f" must be one of {valid_guided_backends}")
 
 
 @dataclass
@@ -3008,7 +3008,7 @@ class CompilationConfig(BaseModel):
         def model_post_init(self, __context: Any) -> None:
             if not self.enable_reshape and self.enable_fusion:
                 logger.warning_once(
-                    "Fusion enabled but reshape elimination disabled."
+                    "Fusion enabled but reshape elimination disabled. "
                     "RMSNorm + quant (fp8) fusion might not work")
 
     pass_config: PassConfig = Field(default_factory=PassConfig)
@@ -3563,7 +3563,7 @@ def set_current_vllm_config(vllm_config: VllmConfig, check_compile=False):
             logger.warning(
                 "`torch.compile` is turned on, but the model %s"
                 " does not support it. Please open an issue on GitHub"
-                "if you want it to be supported.",
+                " if you want it to be supported.",
                 vllm_config.model_config.model)
         _current_vllm_config = old_vllm_config
 
@@ -227,10 +227,10 @@ class NCCLLibrary:
             self.lib = NCCLLibrary.path_to_library_cache[so_file]
         except Exception as e:
             logger.error(
-                "Failed to load NCCL library from %s ."
+                "Failed to load NCCL library from %s. "
                 "It is expected if you are not running on NVIDIA/AMD GPUs."
                 "Otherwise, the nccl library might not exist, be corrupted "
-                "or it does not support the current platform %s."
+                "or it does not support the current platform %s. "
                 "If you already have the library, please set the "
                 "environment variable VLLM_NCCL_SO_PATH"
                 " to point to the correct nccl library path.", so_file,
@@ -137,7 +137,7 @@ class MooncakeTransferEngine:
         if metadata_backend not in supported_backend:
             raise ValueError(
                 "Mooncake Configuration error. `metadata_backend`"
-                f"should be one of {supported_backend}.")
+                f" should be one of {supported_backend}.")
 
         self.engine.initializeExt(local_hostname, metadata_server,
                                   protocol, device_name, metadata_backend)
@@ -823,7 +823,7 @@ def _parse_chat_message_content_part(
     # content is empty, log a warning and skip
     if part_type in VALID_MESSAGE_CONTENT_MM_PART_TYPES and not content:
         logger.warning(
-            "Skipping multimodal part (type: '%s')"
+            "Skipping multimodal part (type: '%s') "
             "with empty / unparsable content.", part_type)
         return None
 
@@ -1342,7 +1342,7 @@ class LLM:
             return params
 
         if params.guided_decoding is not None:
-            raise ValueError("Cannot set both guided_options_request and"
+            raise ValueError("Cannot set both guided_options_request and "
                              "params.guided_decoding.")
 
         params.guided_decoding = GuidedDecodingParams(
@@ -575,7 +575,7 @@ async def do_rerank(request: RerankRequest, raw_request: Request):
 async def do_rerank_v1(request: RerankRequest, raw_request: Request):
     logger.warning_once(
         "To indicate that the rerank API is not part of the standard OpenAI"
-        " API, we have located it at `/rerank`. Please update your client"
+        " API, we have located it at `/rerank`. Please update your client "
         "accordingly. (Note: Conforms to JinaAI rerank API)")
 
     return await do_rerank(request, raw_request)
@@ -513,7 +513,7 @@ class RayDistributedExecutor(DistributedExecutorBase):
         if cupy_spec is None and envs.VLLM_USE_RAY_COMPILED_DAG_NCCL_CHANNEL:
             raise ValueError(
                 "cupy is not installed but required since "
-                "VLLM_USE_RAY_COMPILED_DAG_NCCL_CHANNEL is set."
+                "VLLM_USE_RAY_COMPILED_DAG_NCCL_CHANNEL is set. "
                 "Run `pip install ray[adag]` and check cupy installation.")
 
     def _compiled_ray_dag(self, enable_asyncio: bool):
@@ -317,7 +317,7 @@ def initialize_ray_cluster(
         if parallel_config.world_size > device_bundles:
             raise ValueError(
                 f"The number of required {device_str}s exceeds the total "
-                f"number of available {device_str}s in the placement group."
+                f"number of available {device_str}s in the placement group. "
                 f"Required number of devices: {parallel_config.world_size}. "
                 f"Total number of devices: {device_bundles}.")
     else:
@@ -437,7 +437,7 @@ class LoRAModelManager(AdapterModelManager):
     def pin_adapter(self, lora_id: int) -> bool:
         """Pin a LoRAModel in the manager cache."""
         raise NotImplementedError(
-            "Pinning is not supported in LoRAModelManager."
+            "Pinning is not supported in LoRAModelManager. "
             "Use LRUCacheLoRAModelManager for pinning")  # type: ignore
 
     def _set_adapter_mapping(self, mapping: LoRAMapping) -> None:
@@ -71,7 +71,7 @@ class CompressedTensorsW8A8Fp8MoEMethod(CompressedTensorsMoEMethod):
         if not (self.weight_quant.strategy == QuantizationStrategy.TENSOR
                 and self.input_quant.strategy == QuantizationStrategy.TENSOR):
             raise ValueError(
-                "For FP8 Fused MoE layers, only per-tensor scales"
+                "For FP8 Fused MoE layers, only per-tensor scales "
                 "for weights and activations are supported. Found "
                 f"{self.weight_quant}, {self.input_quant}")
 
@@ -74,7 +74,7 @@ class GPTQConfig(QuantizationConfig):
     def __repr__(self) -> str:
         return (f"GPTQConfig(weight_bits={self.weight_bits}, "
                 f"group_size={self.group_size}, "
-                f"desc_act={self.desc_act}),"
+                f"desc_act={self.desc_act}), "
                 f"lm_head_quantized={self.lm_head_quantized}), "
                 f"dynamic={self.dynamic}")
 
@@ -56,7 +56,7 @@ class ModelOptFp8Config(QuantizationConfig):
         quant_method = quant_config["quant_algo"]
         is_checkpoint_fp8_serialized = ("FP8" in quant_method)
         if not is_checkpoint_fp8_serialized:
-            raise ValueError("ModelOpt currently only supports static FP8"
+            raise ValueError("ModelOpt currently only supports static FP8 "
                              "quantization in vLLM. Please check the "
                              "`hf_quant_config.json` file for your model's "
                              "quant configuration.")
@@ -25,8 +25,8 @@ class NeuronQuantConfig(QuantizationConfig):
         if self.quant_dtype not in SUPPORTED_QUANT_DTYPE_LIST:
             raise ValueError(
                 f"Neuron quantization datatype {self.quant_dtype} is not valid,"
-                f"the quantization datatype should match one of the below types"
-                f"{SUPPORTED_QUANT_DTYPE_LIST}")
+                f" the quantization datatype should match one of the below "
+                f"types {SUPPORTED_QUANT_DTYPE_LIST}")
         self.dequant_dtype = dequant_dtype
         self.quantize_method = quantize_method
 
@@ -55,7 +55,7 @@ class QuarkW8A8Fp8MoEMethod(QuarkMoEMethod):
         if not (weight_qscheme == "per_tensor"
                 and input_qscheme == "per_tensor"):
             raise ValueError(
-                "For FP8 Fused MoE layers, only per-tensor scales"
+                "For FP8 Fused MoE layers, only per-tensor scales "
                 "for weights and activations are supported. Found "
                 f"{weight_qscheme}, {input_qscheme}")  # noqa E501
 
@@ -118,7 +118,7 @@ def verify_marlin_supports_shape(output_size_per_partition: int,
             and input_size_per_partition % group_size != 0):
         raise ValueError(
             f"Weight input_size_per_partition = {input_size_per_partition}"
-            f" is not divisible by group_size = {group_size}."
+            f" is not divisible by group_size = {group_size}. "
             "Consider reducing tensor_parallel_size or running "
             "with --quantization gptq.")
 
@@ -1088,7 +1088,7 @@ class BitsAndBytesModelLoader(BaseModelLoader):
         self.model_type = type(model).__name__
 
         logger.info("Loading weights with BitsAndBytes quantization. "
-                    " May take a while ...")
+                    "May take a while ...")
 
         quant_config = getattr(model_config.hf_config, "quantization_config",
                                None)
@@ -562,7 +562,7 @@ class DeepseekVLV2ForCausalLM(nn.Module, SupportsMultiModal, SupportsPP):
             # 3D tensor
             return list(torch.unbind(image_data, dim=0))
         raise ValueError(
-            "We expect batched 2D tensors;"
+            "We expect batched 2D tensors; "
             "this can be either a list of 2D tensors or a single 3D tensor."
         )
 
@@ -290,7 +290,7 @@ class FuyuForCausalLM(nn.Module, SupportsMultiModal, SupportsPP):
                 expected_expr = str(expected_dims)
                 raise ValueError(
                     "The expected shape of pixel values per image per batch "
-                    f" per patch is {expected_expr}. "
+                    f"per patch is {expected_expr}. "
                     f"You supplied {tuple(d.shape)}.")
 
         for d in data:
@@ -90,8 +90,8 @@ class GritLMPooler(nn.Module):
 
         # Return no instruction in case of missing BOS token.
         if prompt_token_ids[0] != self.token_ids["<s>"]:
-            logger.warning("BOS token not found in prompt,"
-                           "thus using empty string for instruction."
+            logger.warning("BOS token not found in prompt, "
+                           "thus using empty string for instruction. "
                            "GritLM requires BOS token in prompt.")
             return instruction_len
 
@@ -111,8 +111,8 @@ class GritLMPooler(nn.Module):
         if found_embed_pattern_idx != -1:
             instruction_len = found_embed_pattern_idx + len(embed_pattern_ids)
         else:
-            logger.warning("Query instruction not found in prompt,"
-                           "thus using BOS token as instruction instead."
+            logger.warning("Query instruction not found in prompt, "
+                           "thus using BOS token as instruction instead. "
                            "GritLM requires query instruction in prompt.")
             instruction_len = 1
 
@@ -673,7 +673,7 @@ class MiniCPMVMultiModalProcessor(BaseMultiModalProcessor[_I]):
         for modality, count in counts.items():
             if modality not in inputs or not inputs[modality]:
                 raise ValueError(f"None input data of {modality}."
-                                  "But prompt requires.")
+                                  " But prompt requires.")
             counter_key = self.get_modality_num_counter(modality)
             if len(inputs[modality][counter_key]) != count:
                 raise ValueError(f"The prompt requires {count} "
@@ -639,7 +639,7 @@ class Phi3VForCausalLM(nn.Module, SupportsMultiModal, SupportsPP,
             # 3D tensor
             return list(torch.unbind(image_data, dim=0))
         raise ValueError(
-            "We expect batched 2D tensors;"
+            "We expect batched 2D tensors; "
            "this can be either a list of 2D tensors or a single 3D tensor."
         )
 
@@ -153,8 +153,8 @@ class PrithviGeoSpatialMAE(nn.Module, IsAttentionFree, SupportsMultiModal):
             vllm_config.model_config.hf_config.to_dict()["pretrained_cfg"])
         if self.model is None:
             raise ValueError(
-                "Unsupported task."
-                "Only SemanticSegmentationTask is supported for now"
+                "Unsupported task. "
+                "Only SemanticSegmentationTask is supported for now "
                 "by PrithviGeospatialMAE.")
 
     def _parse_and_validate_multimodal_data(
@@ -160,7 +160,7 @@ class MultiModalProfiler(Generic[_I]):
 
         if mm_counts.keys() != mm_max_tokens_per_item.keys():
             raise AssertionError(
-                "The keys returned by `get_supported_mm_limits`"
+                "The keys returned by `get_supported_mm_limits` "
                 f"({set(mm_counts.keys())}) should be the same as those "
                 "returned by `get_mm_max_tokens_per_item` "
                 f"({set(mm_max_tokens_per_item.keys())})")
@@ -190,7 +190,7 @@ class CudaPlatformBase(Platform):
                     "Cannot use FlashAttention-2 backend for FP8 KV cache.")
                 logger.warning(
                     "Please use FlashInfer backend with FP8 KV Cache for "
                     "better performance by setting environment variable "
                     "VLLM_ATTENTION_BACKEND=FLASHINFER")
                 target_backend = _Backend.XFORMERS
             elif block_size % 16 != 0:
@@ -97,7 +97,7 @@ class OpenVinoPlatform(Platform):
 
         if envs.VLLM_OPENVINO_CPU_KV_CACHE_PRECISION == "u8":
             if not OpenVinoPlatform.is_openvino_cpu():
-                logger.info("VLLM_OPENVINO_CPU_KV_CACHE_PRECISION is"
+                logger.info("VLLM_OPENVINO_CPU_KV_CACHE_PRECISION is "
                             "ignored for GPU, f16 data type will be used.")
                 cache_config.cache_dtype = ov.Type.f16
             else:
@@ -73,7 +73,7 @@ class XPUPlatform(Platform):
             logger.warning(
                 "bfloat16 is only supported on Intel Data Center GPU, "
                 "Intel Arc GPU is not supported yet. Your device is %s,"
-                "which is not supported. will fallback to float16",
+                " which is not supported. will fallback to float16",
                 cls.get_device_name())
             model_config.dtype = torch.float16
         if not model_config.enforce_eager:
@@ -226,7 +226,7 @@ class PromptAdapterModelManager(AdapterModelManager):
     def pin_adapter(self, prompt_adapter_id: int) -> bool:
         """Pin a PromptAdapterModel in the manager cache."""
         raise NotImplementedError(
-            "Pinning is not supported in PromptAdapterModelManager."
+            "Pinning is not supported in PromptAdapterModelManager. "
             "Use LRUCachePromptAdapterModelManager for pinning"
         )  # type: ignore
 
@@ -16,7 +16,7 @@ try:
         ROCmFlashAttentionMetadata as FlashAttentionMetadata)
 except (ModuleNotFoundError, ImportError) as err:
     raise RuntimeError(
-        "Draft model speculative decoding currently only supports"
+        "Draft model speculative decoding currently only supports "
         "CUDA and ROCm flash attention backend.") from err
 
 from vllm.logger import init_logger
@@ -212,26 +212,26 @@ class JAISConfig(PretrainedConfig):
         if (not isinstance(self.alibi_scaling, dict)
                 or len(self.alibi_scaling) != 2):
             raise ValueError(
-                "`alibi_scaling` must be a dictionary with two fields,"
+                "`alibi_scaling` must be a dictionary with two fields, "
                 "`type` and `factor` or `type` and `train_seq_len`, "
                 f"got {self.alibi_scaling}")
         alibi_scaling_type = self.alibi_scaling.get("type", None)
         alibi_scaling_factor = self.alibi_scaling.get("factor", None)
         alibi_dynamic_scaling = self.alibi_scaling.get("train_seq_len", None)
         if alibi_scaling_type is None or alibi_scaling_type != "linear":
-            raise ValueError(f"`alibi_scaling`'s type field must be 'linear',"
+            raise ValueError(f"`alibi_scaling`'s type field must be 'linear', "
                              f"got {alibi_scaling_type}")
         if (alibi_scaling_factor is not None
                 and not isinstance(alibi_scaling_factor, float)
                 or (alibi_scaling_factor is not None
                     and alibi_scaling_factor <= 1.0)):
             raise ValueError(
-                f"`alibi_scaling`'s factor field must be a float > 1.0,"
+                f"`alibi_scaling`'s factor field must be a float > 1.0, "
                 f"got {alibi_scaling_factor}")
         if (alibi_dynamic_scaling is not None
                 and not isinstance(alibi_dynamic_scaling, int)
                 or (alibi_dynamic_scaling is not None
                     and alibi_dynamic_scaling <= 1)):
             raise ValueError(
-                f"`alibi_scaling`'s `train_seq_len` field must be an"
+                f"`alibi_scaling`'s `train_seq_len` field must be an "
                 f"integer > 1, got {alibi_dynamic_scaling}")
@@ -447,7 +447,7 @@ def get_ip() -> str:
         logger.warning(
             "The environment variable HOST_IP is deprecated and ignored, as"
             " it is often used by Docker and other software to"
-            "interact with the container's network stack. Please "
+            " interact with the container's network stack. Please "
             "use VLLM_HOST_IP instead to set the IP address for vLLM processes"
             " to communicate with each other.")
     if host_ip:
@@ -2091,8 +2091,8 @@ def set_ulimit(target_soft_limit=65535):
                                (target_soft_limit, current_hard))
         except ValueError as e:
             logger.warning(
-                "Found ulimit of %s and failed to automatically increase"
-                "with error %s. This can cause fd limit errors like"
+                "Found ulimit of %s and failed to automatically increase "
+                "with error %s. This can cause fd limit errors like "
                 "`OSError: [Errno 24] Too many open files`. Consider "
                 "increasing with ulimit -n", current_soft, e)
 
@@ -277,5 +277,5 @@ def _check_if_gpu_supports_dtype(torch_dtype: torch.dtype):
         raise ValueError(
             "Bfloat16 is only supported on GPUs with compute capability "
             f"of at least 8.0. Your {gpu_name} GPU {compute_str}. "
-            "You can use float16 instead by explicitly setting the"
+            "You can use float16 instead by explicitly setting the "
             "`dtype` flag in CLI, for example: --dtype=half.")
@@ -545,7 +545,7 @@ class OpenVINOWorker(LoraNotSupportedWorkerBase):
                 "value. This may cause low performance due to "
                 "occupying the majority of available system "
                 "memory. Please consider decreasing "
-                "gpu_memory_utilization or explicitly setting"
+                "gpu_memory_utilization or explicitly setting "
                 "`VLLM_OPENVINO_KVCACHE_SPACE` (GB) environment "
                 "variable.", memory_utilization)
 
@@ -525,7 +525,7 @@ def _check_if_gpu_supports_dtype(torch_dtype: torch.dtype):
         raise ValueError(
             "Bfloat16 is only supported on GPUs with compute capability "
             f"of at least 8.0. Your {gpu_name} GPU {compute_str}. "
-            "You can use float16 instead by explicitly setting the"
+            "You can use float16 instead by explicitly setting the "
             "`dtype` flag in CLI, for example: --dtype=half.")
 
 
@@ -533,7 +533,7 @@ def raise_if_cache_size_invalid(num_gpu_blocks, block_size, is_attention_free,
                                 max_model_len) -> None:
     if is_attention_free and num_gpu_blocks != 0:
         raise ValueError("No memory should be allocated for the cache blocks "
-                         f"for an attention-free model, but {num_gpu_blocks}"
+                         f"for an attention-free model, but {num_gpu_blocks} "
                          "blocks are allocated.")
     if not is_attention_free and num_gpu_blocks <= 0:
         raise ValueError("No available memory for the cache blocks. "