From b1a63d1b3be996babec6411e3abe559796f76ca9 Mon Sep 17 00:00:00 2001 From: nvjullin Date: Sat, 20 Sep 2025 04:36:34 +0800 Subject: [PATCH 01/24] [BugFix] Make FlashInferMetadataBuilder non-blocking (#25040) Signed-off-by: Julien Lin Co-authored-by: Michael Goin --- vllm/v1/attention/backends/flashinfer.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/vllm/v1/attention/backends/flashinfer.py b/vllm/v1/attention/backends/flashinfer.py index dda6dd4fbea7a..cb092aa74e7f1 100755 --- a/vllm/v1/attention/backends/flashinfer.py +++ b/vllm/v1/attention/backends/flashinfer.py @@ -585,9 +585,10 @@ class FlashInferMetadataBuilder(AttentionMetadataBuilder[FlashInferMetadata]): kv_data_type=self.kv_cache_dtype, ) else: - attn_metadata.qo_indptr_gpu = qo_indptr_cpu.to(self.device) + attn_metadata.qo_indptr_gpu = qo_indptr_cpu.to( + self.device, non_blocking=True) attn_metadata.paged_kv_indptr_gpu = paged_kv_indptr_cpu.to( - self.device) + self.device, non_blocking=True) if num_decodes > 0: pure_decode = num_prefills == 0 From ddc9048394ae6294d0db7fd67270efea59c3a065 Mon Sep 17 00:00:00 2001 From: David-Wen <18927700430@163.com> Date: Sat, 20 Sep 2025 04:44:24 +0800 Subject: [PATCH 02/24] Fix: Correct FusedMoE layer reference in auto_round quantization (#24818) Signed-off-by: David-Wen <18927700430@163.com> Signed-off-by: Michael Goin Co-authored-by: Wentao Ye <44945378+yewentao256@users.noreply.github.com> Co-authored-by: Michael Goin Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com> --- vllm/model_executor/layers/quantization/auto_round.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vllm/model_executor/layers/quantization/auto_round.py b/vllm/model_executor/layers/quantization/auto_round.py index 1ca92273430dd..bf5141fa48943 100644 --- a/vllm/model_executor/layers/quantization/auto_round.py +++ b/vllm/model_executor/layers/quantization/auto_round.py @@ -241,7 +241,7 @@ class AutoRoundConfig(QuantizationConfig): if isinstance(layer, FusedMoE): if use_marlin: - return AWQMoEMethod(quant_args_marlin, layer.moe) + return AWQMoEMethod(quant_args_marlin, layer.moe_config) from vllm.model_executor.layers.quantization.moe_wna16 import ( MoeWNA16Config) @@ -327,7 +327,7 @@ class AutoRoundConfig(QuantizationConfig): if isinstance(layer, FusedMoE): if use_marlin: - return GPTQMarlinMoEMethod(quant_args_marlin, layer.moe) + return GPTQMarlinMoEMethod(quant_args_marlin, layer.moe_config) else: from vllm.model_executor.layers.quantization.moe_wna16 import ( MoeWNA16Config) From e69e0b8b5fc5ef2958f7b1fc159119e9c4c0e2d2 Mon Sep 17 00:00:00 2001 From: Alec S <10566873+alecsolder@users.noreply.github.com> Date: Fri, 19 Sep 2025 17:40:16 -0400 Subject: [PATCH 03/24] [Frontend] Responses API messages out, just harmony for now (#24985) Signed-off-by: Alec Solder Co-authored-by: Alec Solder Co-authored-by: Ye (Charlotte) Qi --- .../openai/test_response_api_with_harmony.py | 15 +++++++++++++++ vllm/entrypoints/openai/protocol.py | 17 ++++++++++++++++- vllm/entrypoints/openai/serving_responses.py | 13 +++++++++++++ 3 files changed, 44 insertions(+), 1 deletion(-) diff --git a/tests/entrypoints/openai/test_response_api_with_harmony.py b/tests/entrypoints/openai/test_response_api_with_harmony.py index 40a22c04b08a5..f3c3148577b85 100644 --- a/tests/entrypoints/openai/test_response_api_with_harmony.py +++ b/tests/entrypoints/openai/test_response_api_with_harmony.py @@ -744,3 +744,18 @@ async def 
test_function_calling_full_history(client: OpenAI, model_name: str): assert response_2 is not None assert response_2.status == "completed" assert response_2.output_text is not None + + +@pytest.mark.asyncio +@pytest.mark.parametrize("model_name", [MODEL_NAME]) +async def test_output_messages_enabled(client: OpenAI, model_name: str, + server): + response = await client.responses.create( + model=model_name, + input="What is the capital of South Korea?", + extra_body={"enable_response_messages": True}) + + assert response is not None + assert response.status == "completed" + assert len(response.input_messages) > 0 + assert len(response.output_messages) > 0 diff --git a/vllm/entrypoints/openai/protocol.py b/vllm/entrypoints/openai/protocol.py index 05d5d6d964dd3..c30681318f693 100644 --- a/vllm/entrypoints/openai/protocol.py +++ b/vllm/entrypoints/openai/protocol.py @@ -328,6 +328,13 @@ class ResponsesRequest(OpenAIBaseModel): "access by 3rd parties, and long enough to be " "unpredictable (e.g., 43 characters base64-encoded, corresponding " "to 256 bit). Not supported by vLLM engine V0.")) + + enable_response_messages: bool = Field( + default=False, + description=( + "Dictates whether or not to return messages as part of the " + "response object. Currently only supported for non-streaming " + "non-background and gpt-oss only. ")) # --8<-- [end:responses-extra-params] _DEFAULT_SAMPLING_PARAMS = { @@ -1831,6 +1838,11 @@ class ResponsesResponse(OpenAIBaseModel): model: str object: Literal["response"] = "response" output: list[ResponseOutputItem] + # These are populated when enable_response_messages is set to True + # TODO: Currently an issue where content of harmony messages + # is not available when these are serialized. Metadata is available + input_messages: Optional[list[ChatCompletionMessageParam]] = None + output_messages: Optional[list[ChatCompletionMessageParam]] = None parallel_tool_calls: bool temperature: float tool_choice: ToolChoice @@ -1860,6 +1872,8 @@ class ResponsesResponse(OpenAIBaseModel): output: list[ResponseOutputItem], status: ResponseStatus, usage: Optional[ResponseUsage] = None, + input_messages: Optional[list[ChatCompletionMessageParam]] = None, + output_messages: Optional[list[ChatCompletionMessageParam]] = None, ) -> "ResponsesResponse": incomplete_details: Optional[IncompleteDetails] = None @@ -1868,7 +1882,6 @@ class ResponsesResponse(OpenAIBaseModel): # TODO: implement the other reason for incomplete_details, # which is content_filter # incomplete_details = IncompleteDetails(reason='content_filter') - return cls( id=request.request_id, created_at=created_time, @@ -1877,6 +1890,8 @@ class ResponsesResponse(OpenAIBaseModel): metadata=request.metadata, model=model_name, output=output, + input_messages=input_messages, + output_messages=output_messages, parallel_tool_calls=request.parallel_tool_calls, temperature=sampling_params.temperature, tool_choice=request.tool_choice, diff --git a/vllm/entrypoints/openai/serving_responses.py b/vllm/entrypoints/openai/serving_responses.py index 4894623aeac28..6e243671af242 100644 --- a/vllm/entrypoints/openai/serving_responses.py +++ b/vllm/entrypoints/openai/serving_responses.py @@ -475,9 +475,14 @@ class OpenAIServingResponses(OpenAIServing): # "completed" is implemented as the "catch-all" for now. 
status: ResponseStatus = "completed" + input_messages = None + output_messages = None if self.use_harmony: assert isinstance(context, HarmonyContext) output = self._make_response_output_items_with_harmony(context) + if request.enable_response_messages: + input_messages = context.messages[:context.num_init_messages] + output_messages = context.messages[context.num_init_messages:] num_tool_output_tokens = context.num_tool_output_tokens if len(output) > 0: if context.finish_reason == "length": @@ -496,6 +501,12 @@ class OpenAIServingResponses(OpenAIServing): output = self._make_response_output_items(request, final_output, tokenizer) + # TODO: context for non-gptoss models doesn't use messages + # so we can't get them out yet + if request.enable_response_messages: + raise NotImplementedError( + "enable_response_messages is currently" + " only supported for gpt-oss") # Calculate usage. assert final_res.prompt_token_ids is not None num_tool_output_tokens = 0 @@ -519,6 +530,8 @@ class OpenAIServingResponses(OpenAIServing): response = ResponsesResponse.from_request( request, sampling_params, + input_messages=input_messages, + output_messages=output_messages, model_name=model_name, created_time=created_time, output=output, From 711e912946d23f4ccc1f554b1524c960553c5e28 Mon Sep 17 00:00:00 2001 From: Wentao Ye <44945378+yewentao256@users.noreply.github.com> Date: Fri, 19 Sep 2025 18:23:19 -0400 Subject: [PATCH 04/24] [Compile] Fix Compile Warning for Ignoring `MIN_BLOCK_PER_SM` (#25193) Signed-off-by: yewentao256 --- csrc/launch_bounds_utils.h | 38 +++++++++++++++++++ .../activation_nvfp4_quant_fusion_kernels.cu | 6 ++- csrc/quantization/fp4/nvfp4_experts_quant.cu | 10 +++-- csrc/quantization/fp4/nvfp4_quant_kernels.cu | 8 ++-- 4 files changed, 53 insertions(+), 9 deletions(-) create mode 100644 csrc/launch_bounds_utils.h diff --git a/csrc/launch_bounds_utils.h b/csrc/launch_bounds_utils.h new file mode 100644 index 0000000000000..d5a89690111bc --- /dev/null +++ b/csrc/launch_bounds_utils.h @@ -0,0 +1,38 @@ +#pragma once + +#include +#include + +// maximum blocks per SM cap +#ifndef VLLM_LAUNCH_BLOCKS_CAP + #define VLLM_LAUNCH_BLOCKS_CAP 4 +#endif + +// compile-time estimate of max threads per SM for launch bounds. +#ifndef VLLM_MAX_THREADS_PER_SM + #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 300 + #define VLLM_MAX_THREADS_PER_SM 1536 + #else + #define VLLM_MAX_THREADS_PER_SM 2048 + #endif +#endif + +// compute the number of blocks per SM to request in __launch_bounds__ +#define VLLM_BLOCKS_DIV(VAL) (VLLM_MAX_THREADS_PER_SM / (VAL)) +#define VLLM_CLAMP_BLOCKS_PER_SM(VAL) \ + (((VAL) <= 0) \ + ? 1 \ + : (((VAL) < VLLM_LAUNCH_BLOCKS_CAP) ? (VAL) : VLLM_LAUNCH_BLOCKS_CAP)) +#define VLLM_BLOCKS_PER_SM(BLOCK_THREADS) \ + VLLM_CLAMP_BLOCKS_PER_SM(VLLM_BLOCKS_DIV(BLOCK_THREADS)) + +// runtime-time helper to compute blocks/SM +static inline int vllm_runtime_blocks_per_sm(int block_threads) { + int device = -1; + cudaGetDevice(&device); + int max_threads_per_sm = VLLM_MAX_THREADS_PER_SM; + cudaDeviceGetAttribute(&max_threads_per_sm, + cudaDevAttrMaxThreadsPerMultiProcessor, device); + int blocks = (block_threads > 0) ? 
(max_threads_per_sm / block_threads) : 1; + return VLLM_CLAMP_BLOCKS_PER_SM(blocks); +} diff --git a/csrc/quantization/fp4/activation_nvfp4_quant_fusion_kernels.cu b/csrc/quantization/fp4/activation_nvfp4_quant_fusion_kernels.cu index 74fde23782ce5..7539f836ecf37 100644 --- a/csrc/quantization/fp4/activation_nvfp4_quant_fusion_kernels.cu +++ b/csrc/quantization/fp4/activation_nvfp4_quant_fusion_kernels.cu @@ -26,6 +26,7 @@ #include "dispatch_utils.h" #include "cuda_utils.h" +#include "launch_bounds_utils.h" #include "nvfp4_utils.cuh" namespace vllm { @@ -63,7 +64,7 @@ __inline__ __device__ PackedVec compute_silu_mul(PackedVec& vec, // Use UE4M3 by default. template -__global__ void __launch_bounds__(1024, 4) +__global__ void __launch_bounds__(1024, VLLM_BLOCKS_PER_SM(1024)) silu_mul_cvt_fp16_to_fp4(int32_t numRows, int32_t numCols, Type const* in, float const* SFScale, uint32_t* out, uint32_t* SFout) { @@ -131,7 +132,8 @@ void silu_and_mul_nvfp4_quant_sm1xxa(torch::Tensor& output, // [..., d] const at::cuda::OptionalCUDAGuard device_guard(device_of(input)); auto stream = at::cuda::getCurrentCUDAStream(input.get_device()); dim3 block(std::min(int(n / ELTS_PER_THREAD), 1024)); - int const numBlocksPerSM = 2048 / block.x; + int const numBlocksPerSM = + vllm_runtime_blocks_per_sm(static_cast(block.x)); dim3 grid(std::min(int(m), multiProcessorCount * numBlocksPerSM)); VLLM_DISPATCH_HALF_TYPES( diff --git a/csrc/quantization/fp4/nvfp4_experts_quant.cu b/csrc/quantization/fp4/nvfp4_experts_quant.cu index ce3ba2c19b9eb..6d385e0dd94e7 100644 --- a/csrc/quantization/fp4/nvfp4_experts_quant.cu +++ b/csrc/quantization/fp4/nvfp4_experts_quant.cu @@ -26,12 +26,13 @@ #include "dispatch_utils.h" #include "nvfp4_utils.cuh" +#include "launch_bounds_utils.h" namespace vllm { // Use UE4M3 by default. template -__global__ void __launch_bounds__(512, 4) +__global__ void __launch_bounds__(512, VLLM_BLOCKS_PER_SM(512)) cvt_fp16_to_fp4(int32_t numRows, int32_t numCols, Type const* in, float const* SFScale, uint32_t* out, uint32_t* SFout, uint32_t* input_offset_by_experts, @@ -129,7 +130,7 @@ __global__ void __launch_bounds__(512, 4) // Kernel for LARGE_M_TOPK = true (large m_topk optimized version) template -__global__ void __launch_bounds__(1024, 4) +__global__ void __launch_bounds__(1024, VLLM_BLOCKS_PER_SM(1024)) cvt_fp16_to_fp4(int32_t numRows, int32_t numCols, Type const* in, float const* SFScale, uint32_t* out, uint32_t* SFout, uint32_t* input_offset_by_experts, @@ -233,8 +234,9 @@ void quant_impl(void* output, void* output_scale, void* input, int const workSizePerRow = k / ELTS_PER_THREAD; int const totalWorkSize = m_topk * workSizePerRow; dim3 block(std::min(workSizePerRow, 512)); - // Get number of blocks per SM (assume we can fully utilize the SM). - int const numBlocksPerSM = 2048 / block.x; + // Get number of blocks per SM + int const numBlocksPerSM = + vllm_runtime_blocks_per_sm(static_cast(block.x)); dim3 grid(std::min(static_cast((totalWorkSize + block.x - 1) / block.x), multiProcessorCount * numBlocksPerSM)); while (grid.x <= multiProcessorCount && block.x > 64) { diff --git a/csrc/quantization/fp4/nvfp4_quant_kernels.cu b/csrc/quantization/fp4/nvfp4_quant_kernels.cu index 0c1b9ef0664d7..5575ee8e4197e 100644 --- a/csrc/quantization/fp4/nvfp4_quant_kernels.cu +++ b/csrc/quantization/fp4/nvfp4_quant_kernels.cu @@ -26,13 +26,14 @@ #include "dispatch_utils.h" #include "cuda_utils.h" +#include "launch_bounds_utils.h" #include "nvfp4_utils.cuh" namespace vllm { // Use UE4M3 by default. 
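// Worked example (illustrative note, not part of the patch): with the
// defaults defined in launch_bounds_utils.h above, VLLM_MAX_THREADS_PER_SM
// is 2048 and VLLM_LAUNCH_BLOCKS_CAP is 4, so
//   VLLM_BLOCKS_PER_SM(1024) -> clamp(2048 / 1024) = 2
//   VLLM_BLOCKS_PER_SM(512)  -> clamp(2048 / 512)  = 4  (hits the cap)
// The old hard-coded __launch_bounds__(1024, 4) asked for 4 * 1024 = 4096
// resident threads, more than the 2048-thread budget, which is the kind of
// over-request that makes the compiler ignore MIN_BLOCK_PER_SM and emit the
// warning this patch fixes; the clamped values used below stay within budget.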
template -__global__ void __launch_bounds__(512, 4) +__global__ void __launch_bounds__(512, VLLM_BLOCKS_PER_SM(512)) cvt_fp16_to_fp4(int32_t numRows, int32_t numCols, Type const* in, float const* SFScale, uint32_t* out, uint32_t* SFout) { using PackedVec = PackedVec; @@ -75,8 +76,9 @@ void invokeFP4Quantization(int m, int n, T const* input, float const* SFScale, // Grid, Block size. // Each thread converts 8 values. dim3 block(std::min(int(n / ELTS_PER_THREAD), 512)); - // Get number of blocks per SM (assume we can fully utilize the SM). - int const numBlocksPerSM = 2048 / block.x; + // Get number of blocks per SM + int const numBlocksPerSM = + vllm_runtime_blocks_per_sm(static_cast(block.x)); dim3 grid(std::min(int(m), multiProcessorCount * numBlocksPerSM)); // Launch the cvt kernel. From 431535b522c62f1422848e22dfd83bec2d04111a Mon Sep 17 00:00:00 2001 From: Zhiyu Date: Fri, 19 Sep 2025 15:40:33 -0700 Subject: [PATCH 05/24] Enable modelopt gemma3 nvfp4/fp8, make workflow more robust (#22771) Signed-off-by: Zhiyu Cheng Signed-off-by: Michael Goin Co-authored-by: Michael Goin --- .../moe/test_modular_kernel_combinations.py | 3 +- vllm/compilation/backends.py | 7 ++- vllm/config/model.py | 3 ++ .../fused_moe/gpt_oss_triton_kernels_moe.py | 4 +- .../layers/quantization/modelopt.py | 53 +++++++++++++------ vllm/model_executor/models/gemma3.py | 16 ++++++ vllm/model_executor/models/siglip.py | 18 ++++++- 7 files changed, 82 insertions(+), 22 deletions(-) diff --git a/tests/kernels/moe/test_modular_kernel_combinations.py b/tests/kernels/moe/test_modular_kernel_combinations.py index 19c4301bd23d5..1c7e62d7aa4c8 100644 --- a/tests/kernels/moe/test_modular_kernel_combinations.py +++ b/tests/kernels/moe/test_modular_kernel_combinations.py @@ -11,7 +11,8 @@ import pytest import torch import vllm.model_executor.layers.fused_moe.modular_kernel as mk -from vllm.config import VllmConfig, current_platform, set_current_vllm_config +from vllm.config import VllmConfig, set_current_vllm_config +from vllm.platforms import current_platform from vllm.utils import has_deep_ep, has_deep_gemm, has_pplx from vllm.utils.flashinfer import has_flashinfer_cutlass_fused_moe diff --git a/vllm/compilation/backends.py b/vllm/compilation/backends.py index 3cc0fc3106f5a..d6bdb31a3c630 100644 --- a/vllm/compilation/backends.py +++ b/vllm/compilation/backends.py @@ -31,8 +31,11 @@ logger = init_logger(__name__) def make_compiler(compilation_config: CompilationConfig) -> CompilerInterface: if compilation_config.use_inductor: - if envs.VLLM_USE_STANDALONE_COMPILE and is_torch_equal_or_newer( - "2.8.0.dev"): + # Use standalone compile only if requested, version is new enough, + # and the symbol actually exists in this PyTorch build. 
+ if (envs.VLLM_USE_STANDALONE_COMPILE + and is_torch_equal_or_newer("2.8.0.dev") + and hasattr(torch._inductor, "standalone_compile")): logger.debug("Using InductorStandaloneAdaptor") return InductorStandaloneAdaptor() else: diff --git a/vllm/config/model.py b/vllm/config/model.py index 21457d3660a23..4e847922b61e6 100644 --- a/vllm/config/model.py +++ b/vllm/config/model.py @@ -964,6 +964,9 @@ class ModelConfig: "modelopt", "modelopt_fp4", "petit_nvfp4", + # Ensure heavy backends are probed last to avoid unnecessary + # imports during override detection (e.g., MXFP4 imports Triton) + "mxfp4", ] quantization_methods = [ q for q in supported_quantization if q not in overrides diff --git a/vllm/model_executor/layers/fused_moe/gpt_oss_triton_kernels_moe.py b/vllm/model_executor/layers/fused_moe/gpt_oss_triton_kernels_moe.py index 08a9b34a42457..f12d3807517ff 100644 --- a/vllm/model_executor/layers/fused_moe/gpt_oss_triton_kernels_moe.py +++ b/vllm/model_executor/layers/fused_moe/gpt_oss_triton_kernels_moe.py @@ -20,10 +20,10 @@ if has_triton_kernels(): from triton_kernels.matmul_ogs import (FnSpecs, FusedActivation, matmul_ogs) from triton_kernels.routing import routing - except ModuleNotFoundError: + except (ModuleNotFoundError, AttributeError) as e: logger.error( "Failed to import Triton kernels. Please make sure your triton " - "version is compatible.") + "version is compatible. Error: %s", e) def triton_kernel_moe_forward( diff --git a/vllm/model_executor/layers/quantization/modelopt.py b/vllm/model_executor/layers/quantization/modelopt.py index 7eac40825ac33..1083f398a3a20 100644 --- a/vllm/model_executor/layers/quantization/modelopt.py +++ b/vllm/model_executor/layers/quantization/modelopt.py @@ -160,6 +160,7 @@ class ModelOptFp8Config(QuantizationConfig): def is_layer_excluded(self, prefix: str) -> bool: """ Check if a layer should be excluded from quantization. + Handles both exact matching (for fused layers) and substring matching. This method handles both regular models and multimodal models that use the language_model prefix. 
For multimodal models, it checks if the @@ -168,11 +169,18 @@ class ModelOptFp8Config(QuantizationConfig): if self.exclude_modules is None: return False - # Check if any excluded module matches the prefix + # First check exact matching with fused layer support + if is_layer_skipped(prefix, self.exclude_modules, + self.packed_modules_mapping): + return True + + # Then check substring matching for patterns not caught by exact match for module in self.exclude_modules: - if (module in prefix - or (prefix.startswith("language_model.") - and module in prefix.removeprefix("language_model."))): + # Skip exact matches already handled above + if (module != prefix and + (module in prefix or + (prefix.startswith("language_model.") + and module in prefix.removeprefix("language_model.")))): return True return False @@ -180,9 +188,10 @@ class ModelOptFp8Config(QuantizationConfig): prefix: str) -> Optional["QuantizeMethodBase"]: from vllm.attention.layer import Attention # Avoid circular import if isinstance(layer, LinearBase): - if (is_layer_skipped(prefix, self.exclude_modules, - self.packed_modules_mapping) - or self.is_layer_excluded(prefix)): + if self.is_layer_excluded(prefix): + return UnquantizedLinearMethod() + # Check if this is a vision model layer that should not be quantized + if ("vision_tower" in prefix or "vision_model" in prefix): return UnquantizedLinearMethod() return ModelOptFp8LinearMethod(self) elif isinstance(layer, Attention): @@ -778,22 +787,34 @@ class ModelOptNvFp4Config(QuantizationConfig): return cls(is_checkpoint_nvfp4_serialized, kv_cache_quant_algo, exclude_modules, group_size) - def is_layer_excluded(self, prefix: str, - exclude_modules: list[str]) -> bool: + def is_layer_excluded(self, prefix: str) -> bool: + """ + Check if a layer should be excluded from quantization. + Handles both exact matching (for fused layers) and pattern matching. + """ + # First check exact matching with fused layer support + if is_layer_skipped(prefix, self.exclude_modules, + self.packed_modules_mapping): + return True + + # Check regex pattern matching for patterns not caught by exact match import regex as re - for pattern in exclude_modules: - regex_str = pattern.replace('.', r'\.').replace('*', r'.*') - if re.fullmatch(regex_str, prefix): - return True + for pattern in self.exclude_modules: + # Skip patterns that would be caught by exact matching + if '*' in pattern or '.' 
in pattern: + regex_str = pattern.replace('.', r'\.').replace('*', r'.*') + if re.fullmatch(regex_str, prefix): + return True return False def get_quant_method(self, layer: torch.nn.Module, prefix: str) -> Optional["QuantizeMethodBase"]: from vllm.attention.layer import Attention # Avoid circular import if isinstance(layer, LinearBase): - if (is_layer_skipped(prefix, self.exclude_modules, - self.packed_modules_mapping) - or self.is_layer_excluded(prefix, self.exclude_modules)): + if self.is_layer_excluded(prefix): + return UnquantizedLinearMethod() + # Check if this is a vision model layer that should not be quantized + if ("vision_tower" in prefix or "vision_model" in prefix): return UnquantizedLinearMethod() return ModelOptNvFp4LinearMethod(self) elif isinstance(layer, Attention): diff --git a/vllm/model_executor/models/gemma3.py b/vllm/model_executor/models/gemma3.py index 1263e3049a14a..7246308d59028 100644 --- a/vllm/model_executor/models/gemma3.py +++ b/vllm/model_executor/models/gemma3.py @@ -446,6 +446,22 @@ class Gemma3Model(nn.Module): weight_loader(param, loaded_weight) loaded_params.add(scale_name) continue + + # Check if this is a scale parameter that needs remapping first + if name.endswith( + (".k_scale", ".v_scale", ".q_scale", ".prob_scale")): + # Try to remap the scale name first + remapped_name = maybe_remap_kv_scale_name(name, params_dict) + if remapped_name is not None and remapped_name in params_dict: + # Successfully remapped, use the remapped name + param = params_dict[remapped_name] + weight_loader = getattr(param, "weight_loader", + default_weight_loader) + weight_loader(param, loaded_weight) + loaded_params.add(remapped_name) + continue + # If remapping failed, continue with normal processing + for (param_name, shard_name, shard_id) in stacked_params_mapping: if shard_name not in name: continue diff --git a/vllm/model_executor/models/siglip.py b/vllm/model_executor/models/siglip.py index 3630f59f53e0a..eb49d6d2c3350 100644 --- a/vllm/model_executor/models/siglip.py +++ b/vllm/model_executor/models/siglip.py @@ -20,7 +20,8 @@ from vllm.model_executor.layers.linear import (ColumnParallelLinear, from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.vocab_parallel_embedding import ( VocabParallelEmbedding) -from vllm.model_executor.model_loader.weight_utils import default_weight_loader +from vllm.model_executor.model_loader.weight_utils import ( + default_weight_loader, maybe_remap_kv_scale_name) from .vision import VisionEncoderInfo, resolve_visual_encoder_outputs @@ -506,6 +507,21 @@ class SiglipVisionModel(nn.Module): if layer_idx >= layer_count: continue + # Check if this is a scale parameter that needs remapping first + if name.endswith( + (".k_scale", ".v_scale", ".q_scale", ".prob_scale")): + # Try to remap the scale name first + remapped_name = maybe_remap_kv_scale_name(name, params_dict) + if remapped_name is not None and remapped_name in params_dict: + # Successfully remapped, use the remapped name + param = params_dict[remapped_name] + weight_loader = getattr(param, "weight_loader", + default_weight_loader) + weight_loader(param, loaded_weight) + loaded_params.add(remapped_name) + continue + # If remapping failed, continue with normal processing + for (param_name, weight_name, shard_id) in stacked_params_mapping: if weight_name not in name: continue From ee7a66dd9a5ead46f062502af33766f45076f05d Mon Sep 17 00:00:00 2001 From: Lucia Fang <116399278+luccafong@users.noreply.github.com> Date: Fri, 19 Sep 2025 
15:59:41 -0700 Subject: [PATCH 06/24] allow disable flashinfer prefill (#25276) Signed-off-by: Lu Fang --- vllm/envs.py | 3 +++ vllm/v1/attention/backends/mla/common.py | 3 ++- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/vllm/envs.py b/vllm/envs.py index 19e2f8635275d..294a0b920fb78 100755 --- a/vllm/envs.py +++ b/vllm/envs.py @@ -32,6 +32,7 @@ if TYPE_CHECKING: VLLM_CONFIG_ROOT: str = os.path.expanduser("~/.config/vllm") VLLM_USAGE_STATS_SERVER: str = "https://stats.vllm.ai" VLLM_NO_USAGE_STATS: bool = False + VLLM_DISABLE_FLASHINFER_PREFILL: bool = False VLLM_DO_NOT_TRACK: bool = False VLLM_USAGE_SOURCE: str = "" VLLM_CONFIGURE_LOGGING: int = 1 @@ -479,6 +480,8 @@ environment_variables: dict[str, Callable[[], Any]] = { lambda: os.environ.get("VLLM_USAGE_STATS_SERVER", "https://stats.vllm.ai"), "VLLM_NO_USAGE_STATS": lambda: os.environ.get("VLLM_NO_USAGE_STATS", "0") == "1", + "VLLM_DISABLE_FLASHINFER_PREFILL": + lambda: os.environ.get("VLLM_DISABLE_FLASHINFER_PREFILL", "0") == "1", "VLLM_DO_NOT_TRACK": lambda: (os.environ.get("VLLM_DO_NOT_TRACK", None) or os.environ.get( "DO_NOT_TRACK", None) or "0") == "1", diff --git a/vllm/v1/attention/backends/mla/common.py b/vllm/v1/attention/backends/mla/common.py index a990cb2f1a972..5b307810de930 100755 --- a/vllm/v1/attention/backends/mla/common.py +++ b/vllm/v1/attention/backends/mla/common.py @@ -412,7 +412,8 @@ M = TypeVar("M", bound=MLACommonMetadata) def use_flashinfer_prefill() -> bool: # For blackwell default to flashinfer prefill if it's available since # it is faster than FA2. - return (flashinfer_available and not envs.VLLM_USE_CUDNN_PREFILL + return (not envs.VLLM_DISABLE_FLASHINFER_PREFILL and flashinfer_available + and not envs.VLLM_USE_CUDNN_PREFILL and current_platform.is_device_capability(100)) From 14c1432789c9c1b66308481b2c37439d3ee6661a Mon Sep 17 00:00:00 2001 From: Nick Hill Date: Fri, 19 Sep 2025 16:34:07 -0700 Subject: [PATCH 07/24] [BugFix] Fix async scheduling CPU tensor race take 2 (#25279) Signed-off-by: Nick Hill --- vllm/v1/worker/gpu_model_runner.py | 52 ++++++++++++++++++------------ 1 file changed, 31 insertions(+), 21 deletions(-) diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 9d0f26266f0c5..3539f75612050 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -1903,7 +1903,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): **self._init_model_kwargs(num_scheduled_tokens), **self._extract_mm_kwargs(scheduler_output), } - elif (self.enable_prompt_embeds and get_pp_group().is_first_rank): + elif self.enable_prompt_embeds and get_pp_group().is_first_rank: # Get the input embeddings for the tokens that are not input embeds, # then put them into the appropriate positions. # TODO(qthequartermasterman): Since even when prompt embeds are @@ -2125,6 +2125,21 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): invalid_req_indices, ) + @contextmanager + def synchronize_input_prep(self): + if self.prepare_inputs_event is None: + yield + return + + # Ensure prior step has finished with reused CPU tensors. + # This is required in the async scheduling case because + # the CPU->GPU transfer happens async. 
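# Illustrative timeline (comment only, not part of the patch): step N
# prepares inputs into these reused CPU tensors, launches the async
# (non_blocking) H2D copies, then record()s the event in the `finally`
# below; step N+1 must reach this synchronize() before _update_states() /
# _prepare_inputs() overwrite the same CPU tensors, otherwise a
# still-in-flight copy from the prior step could read clobbered data.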
+ self.prepare_inputs_event.synchronize() + try: + yield + finally: + self.prepare_inputs_event.record() + @torch.inference_mode() def execute_model( self, @@ -2132,33 +2147,28 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): intermediate_tensors: Optional[IntermediateTensors] = None, ) -> Union[ModelRunnerOutput, AsyncModelRunnerOutput, IntermediateTensors]: with record_function_or_nullcontext("Preprocess"): - self._update_states(scheduler_output) - if not scheduler_output.total_num_scheduled_tokens: - if not has_kv_transfer_group(): - # Return empty ModelRunnerOutput if there's no work to do. - return EMPTY_MODEL_RUNNER_OUTPUT - return self.kv_connector_no_forward(scheduler_output, - self.vllm_config) - if self.cache_config.kv_sharing_fast_prefill: - assert not self.input_batch.num_prompt_logprobs, ( - "--kv-sharing-fast-prefill produces incorrect logprobs for " - "prompt tokens, tokens, please disable it when the requests" - " need prompt logprobs") + with self.synchronize_input_prep(): + # Update persistent batch states. + self._update_states(scheduler_output) + + if not scheduler_output.total_num_scheduled_tokens: + if not has_kv_transfer_group(): + # Return empty ModelRunnerOutput if no work to do. + return EMPTY_MODEL_RUNNER_OUTPUT + return self.kv_connector_no_forward( + scheduler_output, self.vllm_config) + if self.cache_config.kv_sharing_fast_prefill: + assert not self.input_batch.num_prompt_logprobs, ( + "--kv-sharing-fast-prefill produces incorrect " + "logprobs for prompt tokens, tokens, please disable " + "it when the requests need prompt logprobs") - if self.prepare_inputs_event is not None: - # Ensure prior step has finished with reused CPU tensors. - self.prepare_inputs_event.synchronize() - try: # Prepare the decoder inputs. 
(attn_metadata, logits_indices, spec_decode_metadata, num_scheduled_tokens_np, spec_decode_common_attn_metadata, max_query_len, ubatch_slices, num_tokens_after_padding ) = self._prepare_inputs(scheduler_output) - finally: - if self.prepare_inputs_event is not None: - self.prepare_inputs_event.record() - ( num_scheduled_tokens, num_input_tokens, From 3da17c2cc2c2e1d750020e033535f942f156f64c Mon Sep 17 00:00:00 2001 From: Lucas Kabela Date: Fri, 19 Sep 2025 17:27:21 -0700 Subject: [PATCH 08/24] [Bugfix] Remove VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE #2969 (#25090) Signed-off-by: Lucas Kabela --- tests/compile/test_basic_correctness.py | 16 ++-------------- tests/compile/test_full_graph.py | 4 +--- vllm/compilation/wrapper.py | 10 ++++------ vllm/envs.py | 5 ----- vllm/v1/worker/gpu_model_runner.py | 4 +--- vllm/worker/model_runner.py | 8 +++----- 6 files changed, 11 insertions(+), 36 deletions(-) diff --git a/tests/compile/test_basic_correctness.py b/tests/compile/test_basic_correctness.py index fd2b1866e62e1..a1e5127ebeeb2 100644 --- a/tests/compile/test_basic_correctness.py +++ b/tests/compile/test_basic_correctness.py @@ -20,7 +20,6 @@ class TestSetting: tp_size: int attn_backend: str method: str - fullgraph: bool # we cannot afford testing the full Cartesian product @@ -36,7 +35,6 @@ class TestSetting: tp_size=2, attn_backend="FLASH_ATTN", method="generate", - fullgraph=True, ), # llama model with quantization TestSetting( @@ -46,7 +44,6 @@ class TestSetting: tp_size=1, attn_backend="FLASH_ATTN", method="generate", - fullgraph=True, ), # MoE model TestSetting( @@ -56,7 +53,6 @@ class TestSetting: tp_size=2, attn_backend="FLASH_ATTN", method="generate", - fullgraph=True, ), # embedding model TestSetting( @@ -73,7 +69,6 @@ class TestSetting: tp_size=1, attn_backend="FLASH_ATTN", method="encode", - fullgraph=True, ), TestSetting( model="BAAI/bge-base-en-v1.5", @@ -82,7 +77,6 @@ class TestSetting: tp_size=1, attn_backend="FLASH_ATTN", method="encode", - fullgraph=True, ), # vision language model TestSetting( @@ -92,7 +86,6 @@ class TestSetting: tp_size=1, attn_backend="FLASH_ATTN", method="generate_with_image", - fullgraph=False, ), ], ) @@ -109,9 +102,8 @@ def test_compile_correctness( tp_size = test_setting.tp_size attn_backend = test_setting.attn_backend method = test_setting.method - fullgraph = test_setting.fullgraph - if cuda_device_count_stateless() != pp_size * tp_size: - pytest.skip(f"Need exactly {pp_size}*{tp_size} CUDA gpus but got " + if cuda_device_count_stateless() < pp_size * tp_size: + pytest.skip(f"Need at least {pp_size}*{tp_size} CUDA gpus but got " f"{cuda_device_count_stateless()}") with monkeypatch.context() as m: @@ -149,9 +141,5 @@ def test_compile_correctness( ]: all_args.append(final_args + [f"-O{level}"]) all_envs.append({}) - if level != CompilationLevel.DYNAMO_ONCE and not fullgraph: - # "DYNAMO_ONCE" will always use fullgraph - all_envs[-1][ - "VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE"] = "0" # type: ignore compare_all_settings(model, all_args * 3, all_envs, method=method) diff --git a/tests/compile/test_full_graph.py b/tests/compile/test_full_graph.py index 84178344a5f36..3439a1b29038d 100644 --- a/tests/compile/test_full_graph.py +++ b/tests/compile/test_full_graph.py @@ -79,9 +79,7 @@ def test_full_graph( ): model, model_kwargs = model_info - with monkeypatch.context() as m: - # make sure these models can be captured in full graph mode - m.setenv("VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE", "1") + with monkeypatch.context(): print(f"MODEL={model}") 
run_model(optimization_level, model, model_kwargs) diff --git a/vllm/compilation/wrapper.py b/vllm/compilation/wrapper.py index 96d4eae2ee9aa..930e4d27b410f 100644 --- a/vllm/compilation/wrapper.py +++ b/vllm/compilation/wrapper.py @@ -10,7 +10,6 @@ from typing import Callable, Optional import torch -import vllm.envs as envs from vllm.config import (CompilationLevel, CUDAGraphMode, get_current_vllm_config) from vllm.logger import init_logger @@ -47,11 +46,10 @@ class TorchCompileWrapperWithCustomDispatcher: options = get_current_vllm_config( ).compilation_config.inductor_compile_config - compiled_callable = torch.compile( - self.forward, - fullgraph=envs.VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE, - backend=backend, - options=options) + compiled_callable = torch.compile(self.forward, + fullgraph=True, + backend=backend, + options=options) self.compiled_callable = compiled_callable self.original_code_object = self.__class__.forward.__code__ diff --git a/vllm/envs.py b/vllm/envs.py index 294a0b920fb78..3991a789d80f6 100755 --- a/vllm/envs.py +++ b/vllm/envs.py @@ -434,11 +434,6 @@ environment_variables: dict[str, Callable[[], Any]] = { "VLLM_FLASH_ATTN_VERSION": lambda: maybe_convert_int(os.environ.get("VLLM_FLASH_ATTN_VERSION", None)), - # Internal flag to enable Dynamo fullgraph capture - "VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE": - lambda: bool( - os.environ.get("VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE", "1") != "0"), - # Feature flag to enable/disable Inductor standalone compile. # In torch <= 2.7 we ignore this flag; in torch >= 2.8 this is # enabled by default. diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 3539f75612050..dffadd1d769b7 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -2602,9 +2602,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): backend = self.vllm_config.compilation_config.init_backend( self.vllm_config) compilation_counter.dynamo_as_is_count += 1 - self.model.compile( - fullgraph=envs.VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE, - backend=backend) + self.model.compile(fullgraph=True, backend=backend) return # for other compilation levels, cudagraph behavior is controlled by # CudagraphWraper and CudagraphDispatcher of vllm. 
diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py index c91c871766cff..f662f5a85eff6 100644 --- a/vllm/worker/model_runner.py +++ b/vllm/worker/model_runner.py @@ -18,7 +18,6 @@ import torch.distributed import torch.nn as nn from tqdm.auto import tqdm -import vllm.envs as envs from vllm.attention import AttentionMetadata, get_attn_backend from vllm.attention.backends.abstract import AttentionState from vllm.attention.backends.utils import CommonAttentionState @@ -1099,10 +1098,9 @@ class GPUModelRunnerBase(ModelRunnerBase[TModelInputForGPU]): backend = self.vllm_config.compilation_config.init_backend( self.vllm_config) compilation_counter.dynamo_as_is_count += 1 - self.model = torch.compile( - self.model, - fullgraph=envs.VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE, - backend=backend) + self.model = torch.compile(self.model, + fullgraph=True, + backend=backend) def get_model(self) -> nn.Module: return self.model From a36c675817867235d368faf7e8d81e0ed3333d9c Mon Sep 17 00:00:00 2001 From: Maximilien de Bayser Date: Fri, 19 Sep 2025 21:33:25 -0300 Subject: [PATCH 09/24] Don't skip special tokens with hermes-style tool calling (#25281) Signed-off-by: Max de Bayser --- .../openai/tool_parsers/hermes_tool_parser.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/vllm/entrypoints/openai/tool_parsers/hermes_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/hermes_tool_parser.py index a6ce33af6bd00..e74c420da1d3c 100644 --- a/vllm/entrypoints/openai/tool_parsers/hermes_tool_parser.py +++ b/vllm/entrypoints/openai/tool_parsers/hermes_tool_parser.py @@ -98,6 +98,15 @@ class Hermes2ProToolParser(ToolParser): else: return delta_text + def adjust_request( + self, request: ChatCompletionRequest) -> ChatCompletionRequest: + if request.tools and request.tool_choice != 'none': + # do not skip special tokens because the tool_call tokens are + # marked "special" in some models. Since they are skipped + # prior to the call to the tool parser, it breaks tool calling. 
+ request.skip_special_tokens = False + return request + def extract_tool_calls( self, model_output: str, From c7e713616a53a097809609d5a7b536e8bfad4ab8 Mon Sep 17 00:00:00 2001 From: Andrew Sansom Date: Fri, 19 Sep 2025 19:33:40 -0500 Subject: [PATCH 10/24] test: Remove vestigial skip for prompt embeds tests after landing v1 Prompt Embeds support (#25291) Signed-off-by: Andrew Sansom --- tests/entrypoints/openai/test_completion_with_prompt_embeds.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/tests/entrypoints/openai/test_completion_with_prompt_embeds.py b/tests/entrypoints/openai/test_completion_with_prompt_embeds.py index 0e3fc82f0c033..176c1825530e4 100644 --- a/tests/entrypoints/openai/test_completion_with_prompt_embeds.py +++ b/tests/entrypoints/openai/test_completion_with_prompt_embeds.py @@ -14,9 +14,6 @@ from transformers import AutoConfig from ...utils import RemoteOpenAIServer -pytest.skip("Skipping prompt_embeds test until V1 supports it.", - allow_module_level=True) - # any model with a chat template should work here MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta" From b8a287a0a8a035073d67b5101687e3a753dd02ac Mon Sep 17 00:00:00 2001 From: Andrew Sansom Date: Fri, 19 Sep 2025 19:46:23 -0500 Subject: [PATCH 11/24] [docs] Prompt Embedding feature support (#25288) Signed-off-by: Andrew Sansom --- docs/features/README.md | 34 ++++++++++++++++++---------------- docs/features/prompt_embeds.md | 3 --- 2 files changed, 18 insertions(+), 19 deletions(-) diff --git a/docs/features/README.md b/docs/features/README.md index d8e26ec02aecc..10cc448cc2ee3 100644 --- a/docs/features/README.md +++ b/docs/features/README.md @@ -36,22 +36,23 @@ th:not(:first-child) { } -| Feature | [CP][chunked-prefill] | [APC](automatic_prefix_caching.md) | [LoRA](lora.md) | [SD](spec_decode.md) | CUDA graph | [pooling](../models/pooling_models.md) | enc-dec | logP | prmpt logP | async output | multi-step | mm | best-of | beam-search | -|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---| -| [CP][chunked-prefill] | ✅ | | | | | | | | | | | | | | -| [APC](automatic_prefix_caching.md) | ✅ | ✅ | | | | | | | | | | | | | -| [LoRA](lora.md) | ✅ | ✅ | ✅ | | | | | | | | | | | | -| [SD](spec_decode.md) | ✅ | ✅ | ❌ | ✅ | | | | | | | | | | | -| CUDA graph | ✅ | ✅ | ✅ | ✅ | ✅ | | | | | | | | | | -| [pooling](../models/pooling_models.md) | 🟠\* | 🟠\* | ✅ | ❌ | ✅ | ✅ | | | | | | | | | -| enc-dec | ❌ | [❌](gh-issue:7366) | ❌ | [❌](gh-issue:7366) | ✅ | ✅ | ✅ | | | | | | | | -| logP | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | | | | | | | -| prmpt logP | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ✅ | | | | | | -| async output | ✅ | ✅ | ✅ | ❌ | ✅ | ❌ | ❌ | ✅ | ✅ | ✅ | | | | | -| multi-step | ❌ | ✅ | ❌ | ❌ | ✅ | ❌ | ❌ | ✅ | ✅ | ✅ | ✅ | | | | -| [mm](multimodal_inputs.md) | ✅ | ✅ | [🟠](gh-pr:4194)^ | ❔ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❔ | ✅ | | | -| best-of | ✅ | ✅ | ✅ | [❌](gh-issue:6137) | ✅ | ❌ | ✅ | ✅ | ✅ | ❔ | [❌](gh-issue:7968) | ✅ | ✅ | | -| beam-search | ✅ | ✅ | ✅ | [❌](gh-issue:6137) | ✅ | ❌ | ✅ | ✅ | ✅ | ❔ | [❌](gh-issue:7968) | ❔ | ✅ | ✅ | +| Feature | [CP][chunked-prefill] | [APC](automatic_prefix_caching.md) | [LoRA](lora.md) | [SD](spec_decode.md) | CUDA graph | [pooling](../models/pooling_models.md) | enc-dec | logP | prmpt logP | async output | multi-step | mm | best-of | beam-search | [prompt-embeds](prompt_embeds.md) | +|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---| +| [CP][chunked-prefill] | ✅ | | | | | | | | | | | | | | | +| [APC](automatic_prefix_caching.md) | ✅ | ✅ | | | | | | | | | | | | | | +| 
[LoRA](lora.md) | ✅ | ✅ | ✅ | | | | | | | | | | | | | +| [SD](spec_decode.md) | ✅ | ✅ | ❌ | ✅ | | | | | | | | | | | | +| CUDA graph | ✅ | ✅ | ✅ | ✅ | ✅ | | | | | | | | | | | +| [pooling](../models/pooling_models.md) | 🟠\* | 🟠\* | ✅ | ❌ | ✅ | ✅ | | | | | | | | | | +| enc-dec | ❌ | [❌](gh-issue:7366) | ❌ | [❌](gh-issue:7366) | ✅ | ✅ | ✅ | | | | | | | | | +| logP | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | | | | | | | | +| prmpt logP | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ✅ | | | | | | | +| async output | ✅ | ✅ | ✅ | ❌ | ✅ | ❌ | ❌ | ✅ | ✅ | ✅ | | | | | | +| multi-step | ❌ | ✅ | ❌ | ❌ | ✅ | ❌ | ❌ | ✅ | ✅ | ✅ | ✅ | | | | | +| [mm](multimodal_inputs.md) | ✅ | ✅ | [🟠](gh-pr:4194)^ | ❔ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❔ | ✅ | | | | +| best-of | ✅ | ✅ | ✅ | [❌](gh-issue:6137) | ✅ | ❌ | ✅ | ✅ | ✅ | ❔ | [❌](gh-issue:7968) | ✅ | ✅ | | | +| beam-search | ✅ | ✅ | ✅ | [❌](gh-issue:6137) | ✅ | ❌ | ✅ | ✅ | ✅ | ❔ | [❌](gh-issue:7968) | ❔ | ✅ | ✅ | | +| [prompt-embeds](prompt_embeds.md) | ✅ | [❌](gh-issue:25096) | ? | ❌ | ✅ | ❌ | ❌ | ✅ | ❌ | ? | ? | ❌ | ? | ? | ✅ | \* Chunked prefill and prefix caching are only applicable to last-token pooling. ^ LoRA is only applicable to the language backbone of multimodal models. @@ -76,3 +77,4 @@ th:not(:first-child) { | multi-step | ✅ | ✅ | ✅ | ✅ | ✅ | [❌](gh-issue:8477) | ✅ | ❌ | | best-of | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | | beam-search | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | +| [prompt-embeds](prompt_embeds.md) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ? | [❌](gh-issue:25097) | diff --git a/docs/features/prompt_embeds.md b/docs/features/prompt_embeds.md index 83993bd0140fa..f9d3c1fb6c23d 100644 --- a/docs/features/prompt_embeds.md +++ b/docs/features/prompt_embeds.md @@ -6,9 +6,6 @@ This page teaches you how to pass prompt embedding inputs to vLLM. The traditional flow of text data for a Large Language Model goes from text to token ids (via a tokenizer) then from token ids to prompt embeddings. For a traditional decoder-only model (such as meta-llama/Llama-3.1-8B-Instruct), this step of converting token ids to prompt embeddings happens via a look-up from a learned embedding matrix, but the model is not limited to processing only the embeddings corresponding to its token vocabulary. -!!! note - Prompt embeddings are currently only supported in the v0 engine. 
- ## Offline Inference To input multi-modal data, follow this schema in [vllm.inputs.EmbedsPrompt][]: From 8945b001db3202f882108e50d16b6f9c5e6f01ed Mon Sep 17 00:00:00 2001 From: Boyuan Feng Date: Fri, 19 Sep 2025 18:02:15 -0700 Subject: [PATCH 12/24] [torch.compile] CUDAGraph Inductor partition integration (#24281) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Boyuan Feng Signed-off-by: Boyuan Feng Signed-off-by: boyuanfeng Co-authored-by: Luka Govedič --- tests/compile/piecewise/test_simple.py | 71 ++++++++++++++++++---- tests/compile/silly_attention.py | 1 + tests/compile/test_full_graph.py | 59 +++++++++++++++++- tests/compile/test_fusion_attn.py | 16 ++++- vllm/attention/layer.py | 2 + vllm/compilation/backends.py | 10 ++- vllm/compilation/decorators.py | 57 ++++++++++++++++- vllm/config/compilation.py | 84 ++++++++++++++++++++++---- vllm/v1/cudagraph_dispatcher.py | 12 ++-- 9 files changed, 280 insertions(+), 32 deletions(-) diff --git a/tests/compile/piecewise/test_simple.py b/tests/compile/piecewise/test_simple.py index 84f4945c82725..41055f431569c 100644 --- a/tests/compile/piecewise/test_simple.py +++ b/tests/compile/piecewise/test_simple.py @@ -15,6 +15,7 @@ from vllm.config import (CompilationConfig, CompilationLevel, CUDAGraphMode, VllmConfig, set_current_vllm_config) from vllm.envs import VLLM_USE_V1 from vllm.forward_context import BatchDescriptor, set_forward_context +from vllm.utils import is_torch_equal_or_newer # This import automatically registers `torch.ops.silly.attention` from ..silly_attention import get_global_counter, reset_global_counter @@ -50,16 +51,21 @@ class SillyModel(nn.Module): return x -@pytest.mark.parametrize("use_inductor", [True, False]) -@torch.inference_mode() -def test_simple_piecewise_compile(use_inductor): - assert VLLM_USE_V1 - +def _run_simple_model( + splitting_ops, + use_inductor_graph_partition, + use_inductor, + expected_num_piecewise_graphs_seen, + expected_num_piecewise_capturable_graphs_seen, + expected_num_backend_compilations, + expected_num_cudagraph_captured, +): vllm_config = VllmConfig(compilation_config=CompilationConfig( level=CompilationLevel.PIECEWISE, use_cudagraph=True, use_inductor=use_inductor, - splitting_ops=["silly.attention"], + splitting_ops=splitting_ops, + use_inductor_graph_partition=use_inductor_graph_partition, cudagraph_copy_inputs=True, cudagraph_capture_sizes=[1, 2], )) @@ -70,11 +76,11 @@ def test_simple_piecewise_compile(use_inductor): with compilation_counter.expect( num_graphs_seen=1, # one graph for the model - num_piecewise_graphs_seen=5, # 2 * num_layers + 1 - num_piecewise_capturable_graphs_seen=3, # 1 + num_layers - num_backend_compilations=3, # num_piecewise_capturable_graphs_seen - num_cudagraph_captured= - 6, # num_cudagraph_sizes * num_piecewise_capturable_graphs_seen + num_piecewise_graphs_seen=expected_num_piecewise_graphs_seen, + num_piecewise_capturable_graphs_seen= + expected_num_piecewise_capturable_graphs_seen, + num_backend_compilations=expected_num_backend_compilations, + num_cudagraph_captured=expected_num_cudagraph_captured, ), set_forward_context(None, vllm_config=vllm_config): # background context # warm up with background context @@ -104,3 +110,46 @@ def test_simple_piecewise_compile(use_inductor): output = model(input) assert get_global_counter() == 2 assert torch.allclose(output.cpu(), torch.tensor([19.0, 19.0])) + + +@pytest.mark.parametrize("use_inductor", [True, False]) +@torch.inference_mode() +def 
test_simple_piecewise_compile(use_inductor): + assert VLLM_USE_V1 + _run_simple_model( + splitting_ops=["silly.attention"], + use_inductor_graph_partition=False, + use_inductor=use_inductor, + expected_num_piecewise_graphs_seen=5, # 2 * num_layers + 1 + expected_num_piecewise_capturable_graphs_seen=3, # 1 + num_layers + expected_num_backend_compilations= + 3, # num_piecewise_capturable_graphs_seen + expected_num_cudagraph_captured= + 6, # num_cudagraph_sizes * num_piecewise_capturable_graphs_seen + ) + + +@torch.inference_mode() +@pytest.mark.parametrize("splitting_ops", [["silly.attention"], []]) +def test_simple_inductor_graph_partition(splitting_ops): + assert VLLM_USE_V1 + if not is_torch_equal_or_newer("2.9.0.dev"): + pytest.skip("inductor graph partition is only available " + "in PyTorch 2.9+") + + _run_simple_model( + # inductor graph partition automatically resets splitting_ops + # to be an empty list + splitting_ops=splitting_ops, + use_inductor_graph_partition=True, + use_inductor=True, + expected_num_piecewise_graphs_seen= + 1, # since not splitting at fx graph level + expected_num_piecewise_capturable_graphs_seen= + 1, # since not splitting at fx graph level + expected_num_backend_compilations= + 1, # since not splitting at fx graph level + expected_num_cudagraph_captured= + 6, # inductor graph partition still captures 6 + # graph, same as fx graph partition. + ) diff --git a/tests/compile/silly_attention.py b/tests/compile/silly_attention.py index 13eb0bf4b1fa1..baedafbae99f1 100644 --- a/tests/compile/silly_attention.py +++ b/tests/compile/silly_attention.py @@ -60,4 +60,5 @@ direct_register_custom_op( mutates_args=["out"], fake_impl=silly_attention_fake, target_lib=silly_lib, + tags=(torch._C.Tag.cudagraph_unsafe, ), ) diff --git a/tests/compile/test_full_graph.py b/tests/compile/test_full_graph.py index 3439a1b29038d..870aa553ca628 100644 --- a/tests/compile/test_full_graph.py +++ b/tests/compile/test_full_graph.py @@ -3,6 +3,7 @@ from __future__ import annotations +import logging import tempfile from typing import Any, Optional, Union @@ -10,9 +11,13 @@ import pytest import torch from tests.quantization.utils import is_quant_method_supported +from tests.v1.attention.utils import _Backend from vllm import LLM, SamplingParams -from vllm.config import CompilationConfig, CompilationLevel, PassConfig +from vllm.attention.selector import global_force_attn_backend_context_manager +from vllm.config import (CompilationConfig, CompilationLevel, CUDAGraphMode, + PassConfig) from vllm.platforms import current_platform +from vllm.utils import is_torch_equal_or_newer from ..utils import create_new_process_for_each_test @@ -105,6 +110,18 @@ def test_full_graph( (CompilationConfig(level=CompilationLevel.PIECEWISE, debug_dump_path=tempfile.gettempdir()), ("facebook/opt-125m", {})), + ] + [ + # graph inductor partition + ( + CompilationConfig( + level=CompilationLevel.PIECEWISE, + # inductor graph partition uses + # torch._C.Tag.cudagraph_unsafe to specify splitting ops + use_inductor_graph_partition=True, + cudagraph_mode=CUDAGraphMode.PIECEWISE, + compile_sizes=[1, 2]), + model) for model in models_list(all=False) + if is_torch_equal_or_newer("2.9.0.dev") ]) # only test some of the models @create_new_process_for_each_test() @@ -112,11 +129,51 @@ def test_custom_compile_config( compilation_config: CompilationConfig, model_info: tuple[str, dict[str, Any]], ): + if (compilation_config.use_inductor_graph_partition + and not is_torch_equal_or_newer("2.9.0.dev")): + pytest.skip("inductor graph 
partition is only available " + "in PyTorch 2.9+") + model, model_kwargs = model_info print(f"MODEL={model}") run_model(compilation_config, model, model_kwargs) +def test_inductor_graph_partition_attn_fusion(caplog_vllm): + if not is_torch_equal_or_newer("2.9.0.dev"): + pytest.skip("inductor graph partition is only available " + "in PyTorch 2.9+") + + model = "nvidia/Llama-4-Scout-17B-16E-Instruct-FP8" + compilation_config = CompilationConfig( + level=CompilationLevel.PIECEWISE, + use_inductor_graph_partition=True, + cudagraph_mode=CUDAGraphMode.PIECEWISE, + custom_ops=["+quant_fp8"], + pass_config=PassConfig(enable_attn_fusion=True, enable_noop=True), + ) + model_kwargs = { + "kv_cache_dtype": "fp8", + "max_model_len": 1024, + } + with caplog_vllm.at_level( + logging.DEBUG), global_force_attn_backend_context_manager( + _Backend.FLASHINFER): + run_model(compilation_config, model, model_kwargs) + + try: + assert ("Fused quantization onto 48 attention nodes" + in caplog_vllm.text), caplog_vllm.text + except AssertionError: + # Note: this message is only triggered when the compilation goes + # through the custom pass. Due to multiple layers of cache on + # PyTorch side, the compilation of a graph may be cached such + # that custom pass directly goes through cache. In this case, + # we go through this branch and assert that the pass is not + # triggered. + assert "Fused quantization" not in caplog_vllm.text + + def run_model(compile_config: Union[int, CompilationConfig], model: str, model_kwargs: dict[str, Any]): prompts = [ diff --git a/tests/compile/test_fusion_attn.py b/tests/compile/test_fusion_attn.py index 6baf4bf83f499..022f183b31932 100644 --- a/tests/compile/test_fusion_attn.py +++ b/tests/compile/test_fusion_attn.py @@ -27,6 +27,7 @@ from vllm.model_executor.layers.quantization.utils.quant_utils import ( from vllm.model_executor.layers.quantization.utils.w8a8_utils import ( Fp8LinearOp) from vllm.platforms import current_platform +from vllm.utils import is_torch_equal_or_newer from vllm.v1.kv_cache_interface import AttentionSpec FP8_DTYPE = current_platform.fp8_dtype() @@ -339,6 +340,10 @@ else: @pytest.mark.parametrize( "split_attention", [False, True] if current_platform.is_rocm() else [False]) +# TODO(boyuan): test inductor graph partition on rocm +@pytest.mark.parametrize( + "use_inductor_graph_partition", + [False] if current_platform.is_rocm() else [False, True]) @pytest.mark.skipif(not current_platform.is_cuda_alike(), reason="Only test ROCm or CUDA") @pytest.mark.skipif(not current_platform.supports_fp8(), reason="Need FP8") @@ -352,9 +357,15 @@ def test_attention_quant_pattern(num_qo_heads: int, num_kv_heads: int, dtype: torch.dtype, model_name: str, model_class: type[AttentionQuantPatternModel], backend: _Backend, split_attention: bool, - monkeypatch, dist_init): + use_inductor_graph_partition: bool, + monkeypatch, dist_init, caplog_vllm): """Test AttentionStaticQuantPattern fusion pass""" + if use_inductor_graph_partition and not is_torch_equal_or_newer( + "2.9.0.dev"): + pytest.skip("inductor graph partition is only available " + "in PyTorch 2.9+") + monkeypatch.setenv("VLLM_USE_V1", "1") if split_attention: monkeypatch.setenv("VLLM_V1_USE_PREFILL_DECODE_ATTENTION", "1") @@ -372,6 +383,7 @@ def test_attention_quant_pattern(num_qo_heads: int, num_kv_heads: int, compilation_config=CompilationConfig( level=CompilationLevel.PIECEWISE, custom_ops=["+quant_fp8"], + use_inductor_graph_partition=use_inductor_graph_partition, ), cache_config=CacheConfig(cache_dtype="fp8")) @@ 
-444,6 +456,7 @@ def test_attention_quant_pattern(num_qo_heads: int, num_kv_heads: int, backend=test_backend, fullgraph=True) assert model_compiled.attn._o_scale_float is None + result_fused_1 = model_compiled(q, k, v) if backend == _Backend.FLASHINFER: @@ -453,6 +466,7 @@ def test_attention_quant_pattern(num_qo_heads: int, num_kv_heads: int, # _o_scale_float assert model_compiled.attn._o_scale_float is not None result_fused_2 = model_compiled(q, k, v) + assert model_compiled.attn._o_scale_float is not None torch.testing.assert_close(result_unfused, diff --git a/vllm/attention/layer.py b/vllm/attention/layer.py index 8d5ebd93e063d..3d1269c0ecea8 100644 --- a/vllm/attention/layer.py +++ b/vllm/attention/layer.py @@ -577,6 +577,7 @@ direct_register_custom_op( mutates_args=[], fake_impl=unified_attention_fake, dispatch_key=current_platform.dispatch_key, + tags=(torch._C.Tag.cudagraph_unsafe, ), ) @@ -627,4 +628,5 @@ direct_register_custom_op( mutates_args=["output", "output_block_scale"], fake_impl=unified_attention_with_output_fake, dispatch_key=current_platform.dispatch_key, + tags=(torch._C.Tag.cudagraph_unsafe, ), ) diff --git a/vllm/compilation/backends.py b/vllm/compilation/backends.py index d6bdb31a3c630..17fc727b8fc70 100644 --- a/vllm/compilation/backends.py +++ b/vllm/compilation/backends.py @@ -329,6 +329,7 @@ class PiecewiseCompileInterpreter(torch.fx.Interpreter): i for i, x in enumerate(args) if isinstance(x, torch.SymInt) ] global compilation_start_time + compiled_graph_for_dynamic_shape = self.vllm_backend.\ compiler_manager.compile( submod, @@ -339,7 +340,6 @@ class PiecewiseCompileInterpreter(torch.fx.Interpreter): num_graphs=len(self.compile_submod_names), runtime_shape=None) # Lazy import here to avoid circular import - from .cuda_graph import CUDAGraphOptions from .cuda_piecewise_backend import PiecewiseBackend piecewise_backend = PiecewiseBackend( @@ -347,7 +347,13 @@ class PiecewiseCompileInterpreter(torch.fx.Interpreter): len(self.compile_submod_names), sym_shape_indices, compiled_graph_for_dynamic_shape, self.vllm_backend) - if self.compilation_config.cudagraph_mode != CUDAGraphMode.NONE: + if (self.compilation_config.cudagraph_mode != CUDAGraphMode.NONE + and + not self.compilation_config.use_inductor_graph_partition): + # We're using Dynamo-based piecewise splitting, so we wrap + # the whole subgraph with a static graph wrapper. + from .cuda_graph import CUDAGraphOptions + # resolve the static graph wrapper class (e.g. CUDAGraphWrapper # class) as platform dependent. 
static_graph_wrapper_class = resolve_obj_by_qualname( diff --git a/vllm/compilation/decorators.py b/vllm/compilation/decorators.py index 41d9fcb824b01..b7a6e23c1aa79 100644 --- a/vllm/compilation/decorators.py +++ b/vllm/compilation/decorators.py @@ -1,6 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import contextlib import inspect from typing import Callable, Optional, TypeVar, Union, overload from unittest.mock import patch @@ -14,7 +15,7 @@ from vllm.compilation.wrapper import TorchCompileWrapperWithCustomDispatcher from vllm.config import CompilationLevel, VllmConfig from vllm.logger import init_logger from vllm.sequence import IntermediateTensors -from vllm.utils import supports_dynamo +from vllm.utils import resolve_obj_by_qualname, supports_dynamo from .monitor import start_monitoring_torch_compile @@ -301,8 +302,11 @@ def _support_torch_compile( with patch.object(InliningInstructionTranslator, 'inline_call', patched_inline_call), torch._dynamo.config.patch( - **dynamo_config_patches): + **dynamo_config_patches + ), maybe_use_cudagraph_partition_wrapper( + self.vllm_config): output = self.compiled_callable(*args, **kwargs) + return output # usually, capturing the model once is enough, and then we can @@ -314,3 +318,52 @@ def _support_torch_compile( cls.__call__ = __call__ return cls + + +@contextlib.contextmanager +def maybe_use_cudagraph_partition_wrapper(vllm_config: VllmConfig): + """ + Context manager to set/unset customized cudagraph partition wrappers. + + If we're using Inductor-based graph partitioning, we currently have the + whole `fx.Graph` before Inductor lowering and and the piecewise + splitting happens after all graph passes and fusions. Here, we add + a custom hook for Inductor to wrap each partition with our static + graph wrapper class to maintain more control over static graph + capture and replay. + """ + from vllm.config import CUDAGraphMode + + compilation_config = vllm_config.compilation_config + if (compilation_config.cudagraph_mode != CUDAGraphMode.NONE + and compilation_config.use_inductor_graph_partition): + from torch._inductor.utils import CUDAGraphWrapperMetadata + + from vllm.compilation.cuda_graph import CUDAGraphOptions + from vllm.platforms import current_platform + + static_graph_wrapper_class = resolve_obj_by_qualname( + current_platform.get_static_graph_wrapper_cls()) + + def customized_cudagraph_wrapper(f, + metadata: CUDAGraphWrapperMetadata): + partition_id = metadata.partition_index + num_partitions = metadata.num_partitions + return static_graph_wrapper_class( + runnable=f, + vllm_config=vllm_config, + runtime_mode=CUDAGraphMode.PIECEWISE, + cudagraph_options=CUDAGraphOptions( + debug_log_enable=partition_id == 0, + gc_disable=partition_id != 0, + weak_ref_output=partition_id == num_partitions - 1, + )) + + torch._inductor.utils.set_customized_partition_wrappers( + customized_cudagraph_wrapper) + + yield + + if (compilation_config.cudagraph_mode != CUDAGraphMode.NONE + and compilation_config.use_inductor_graph_partition): + torch._inductor.utils.set_customized_partition_wrappers(None) diff --git a/vllm/config/compilation.py b/vllm/config/compilation.py index 3618f472e742d..22b38daf46c39 100644 --- a/vllm/config/compilation.py +++ b/vllm/config/compilation.py @@ -299,6 +299,26 @@ class CompilationConfig: minor release, i.e. v0.11.0 or v1.0.0. Please use cudagraph_mode instead. 
""" + use_inductor_graph_partition: bool = False + """Use inductor graph partition to split the graph at cudagraph_unsafe ops. + This partition happens at inductor codegen time after all passes and fusions + are finished. It generates a single `call` function which wraps + cudagraph-safe ops into partition functions and leave cudagraph-unsafe ops + outside the partition functions. For a graph with N cudagraph-unsafe ops + (e.g., Attention), there would be N+1 partitions. To mark an op as + cudagraph unsafe, we can add `tags=(torch._C.Tag.cudagraph_unsafe)` when + register the custom op. + + This config supports both full cudagraph and piecewise cudagraph without + compiling twice. For piecewise cudagraph, it applies vLLM CUDAGraph wrapper + to each partition. For N+1 partitions, there would be N+1 + CUDAGraph wrapper instances. + + For full CUDAGraph, we always apply a single CUDAGraph wrapper outside the + inductor `call` function in the model runner. The top-level full cudagraph + capture ignores all partitioning. + """ + pass_config: PassConfig = field(default_factory=PassConfig) """Custom inductor passes, see PassConfig for more details""" @@ -461,6 +481,12 @@ class CompilationConfig: "since full_cuda_graph is deprecated.") self.cudagraph_mode = CUDAGraphMode.FULL + if (self.use_inductor_graph_partition + and not is_torch_equal_or_newer("2.9.0.dev")): + raise ValueError("use_inductor_graph_partition is only " + "supported with torch>=2.9.0.dev. Set " + "use_inductor_graph_partition=False instead.") + def init_backend(self, vllm_config: "VllmConfig") -> Union[str, Callable]: if self.level == CompilationLevel.NO_COMPILATION: raise ValueError("No compilation level is set.") @@ -540,19 +566,36 @@ class CompilationConfig: "set_splitting_ops_for_v1 should only be called when " "level is CompilationLevel.PIECEWISE") + use_inductor_graph_partition_msg = ( + "When use_inductor_graph_partition=True, splitting_ops " + "are ignored and set to an empty list. Instead, " + "\"tags=(torch._C.Tag.cudagraph_unsafe, ),\" is " + "used to annotate custom ops for graph partition.") + if self.splitting_ops is None: - # NOTE: When using full cudagraph, instead of setting an empty - # list and capture the full cudagraph inside the flattened fx - # graph, we keep the piecewise fx graph structure but capture the - # full cudagraph outside the fx graph. This reduces some cpu - # overhead when the runtime batch_size is not cudagraph captured. - # see https://github.com/vllm-project/vllm/pull/20059 for details. - # make a copy to avoid mutating the class-level list via reference. - self.splitting_ops = list(self._attention_ops) + if self.use_inductor_graph_partition: + # When using inductor graph partition, we set splitting_ops + # to be empty and rely on torch._C.Tag.cudagraph_unsafe to + # annotate custom ops as splitting ops. + logger.warning_once(use_inductor_graph_partition_msg) + self.splitting_ops = [] + else: + # NOTE: When using full cudagraph, instead of setting an empty + # list and capture the full cudagraph inside the flattened fx + # graph, we keep the piecewise fx graph structure but capture + # the full cudagraph outside the fx graph. This reduces some + # cpu overhead when the runtime batch_size is not cudagraph + # captured. see https://github.com/vllm-project/vllm/pull/20059 + # for details. make a copy to avoid mutating the class-level + # list via reference. 
+ self.splitting_ops = list(self._attention_ops) elif len(self.splitting_ops) == 0: - logger.warning_once("Using piecewise compilation with empty " - "splitting_ops.") - if self.cudagraph_mode == CUDAGraphMode.PIECEWISE: + logger.warning_once( + "Using piecewise compilation with empty " + "splitting_ops and use_inductor_graph_partition" + f"={self.use_inductor_graph_partition}.") + if (self.cudagraph_mode == CUDAGraphMode.PIECEWISE + and not self.use_inductor_graph_partition): logger.warning_once( "When compilation level is piecewise with empty " "splitting_ops, PIECEWISE cudagraph_mode will be " @@ -562,7 +605,26 @@ class CompilationConfig: "any problems.") self.cudagraph_mode = CUDAGraphMode.FULL self.splitting_ops = [] + elif self.use_inductor_graph_partition: + logger.warning_once(use_inductor_graph_partition_msg) + self.splitting_ops = [] def splitting_ops_contain_attention(self) -> bool: return self.splitting_ops is not None and all( op in self.splitting_ops for op in self._attention_ops) + + def is_attention_compiled_piecewise(self) -> bool: + use_fx_graph_piecewise_compilation = ( + self.level == CompilationLevel.PIECEWISE + and self.splitting_ops_contain_attention()) + + inductor_used = (self.level == CompilationLevel.PIECEWISE + and self.use_inductor) or ( + self.level >= CompilationLevel.DYNAMO_AS_IS + and self.backend == "inductor") + use_inductor_piecewise_compilation = ( + inductor_used and self.use_inductor_graph_partition + and not self.splitting_ops_contain_attention()) + + return use_fx_graph_piecewise_compilation or \ + use_inductor_piecewise_compilation diff --git a/vllm/v1/cudagraph_dispatcher.py b/vllm/v1/cudagraph_dispatcher.py index d2db7dcb3f091..ea4fba8eeea6d 100644 --- a/vllm/v1/cudagraph_dispatcher.py +++ b/vllm/v1/cudagraph_dispatcher.py @@ -2,7 +2,7 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project from typing import Optional -from vllm.config import CompilationLevel, CUDAGraphMode, VllmConfig +from vllm.config import CUDAGraphMode, VllmConfig from vllm.forward_context import BatchDescriptor from vllm.logger import init_logger @@ -39,11 +39,15 @@ class CudagraphDispatcher: CUDAGraphMode.FULL: set(), } - assert not self.cudagraph_mode.requires_piecewise_compilation() or \ - (self.compilation_config.level == CompilationLevel.PIECEWISE and - self.compilation_config.splitting_ops_contain_attention()), \ + not_use_piecewise_compilation = ( + not self.cudagraph_mode.requires_piecewise_compilation()) + + assert not_use_piecewise_compilation or \ + self.compilation_config.is_attention_compiled_piecewise(), \ "Compilation level should be CompilationLevel.PIECEWISE when "\ "cudagraph_mode piecewise cudagraphs is used, "\ + "and attention should be in splitting_ops or "\ + "inductor splitting should be used. 
" \ f"cudagraph_mode={self.cudagraph_mode}, "\ f"compilation_level={self.compilation_config.level}, "\ f"splitting_ops={self.compilation_config.splitting_ops}" From a25ade5d473fc00107bd3950141d8211331d3377 Mon Sep 17 00:00:00 2001 From: Nick Hill Date: Fri, 19 Sep 2025 18:06:34 -0700 Subject: [PATCH 13/24] [BugFix] Ensure appropriate guards in destructors (#25284) Signed-off-by: Nick Hill Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com> --- vllm/compilation/collective_fusion.py | 2 +- .../kv_transfer/kv_connector/v1/nixl_connector.py | 9 +++++---- vllm/executor/executor_base.py | 3 --- vllm/v1/worker/gpu_worker.py | 3 ++- 4 files changed, 8 insertions(+), 9 deletions(-) diff --git a/vllm/compilation/collective_fusion.py b/vllm/compilation/collective_fusion.py index 71274420c3426..0658b59a2e215 100644 --- a/vllm/compilation/collective_fusion.py +++ b/vllm/compilation/collective_fusion.py @@ -1183,7 +1183,7 @@ class AllReduceFusionPass(VllmInductorPass): self.end_and_log() def __del__(self): - if self.disabled: + if getattr(self, "disabled", True): return if flashinfer_comm is not None: flashinfer_comm.trtllm_destroy_ipc_workspace_for_all_reduce( diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py index ff62f60e5a42c..d3a08af088c11 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py @@ -569,9 +569,10 @@ class NixlConnectorWorker: def __del__(self): """Cleanup background threads on destruction.""" - self._handshake_initiation_executor.shutdown(wait=False) - if self._nixl_handshake_listener_t: - self._nixl_handshake_listener_t.join(timeout=0) + if executor := getattr(self, "_handshake_initiation_executor", None): + executor.shutdown(wait=False) + if listener_t := getattr(self, "_nixl_handshake_listener_t", None): + listener_t.join(timeout=0) @staticmethod def _nixl_handshake_listener(metadata: NixlAgentMetadata, @@ -1379,4 +1380,4 @@ class NixlKVConnectorStats(KVConnectorStats): # TODO: reduce stats to a single value, calculate latency/throughput return { "num_successful_transfers": self.data["num_successful_transfers"] - } \ No newline at end of file + } diff --git a/vllm/executor/executor_base.py b/vllm/executor/executor_base.py index d18bef1256af5..42aa8d14a21eb 100644 --- a/vllm/executor/executor_base.py +++ b/vllm/executor/executor_base.py @@ -235,9 +235,6 @@ class ExecutorBase(ABC): """Shutdown the executor.""" self.collective_rpc("shutdown") - def __del__(self): - self.shutdown() - async def execute_model_async( self, execute_model_req: ExecuteModelRequest) -> List[SamplerOutput]: diff --git a/vllm/v1/worker/gpu_worker.py b/vllm/v1/worker/gpu_worker.py index 6855526583f04..8b1e1bb8f45ca 100644 --- a/vllm/v1/worker/gpu_worker.py +++ b/vllm/v1/worker/gpu_worker.py @@ -683,7 +683,8 @@ class Worker(WorkerBase): tensorizer_config=tensorizer_config, ) def shutdown(self) -> None: - self.model_runner.ensure_kv_transfer_shutdown() + if runner := getattr(self, "model_runner", None): + runner.ensure_kv_transfer_shutdown() def init_worker_distributed_environment( From 535d80056b72443e68a96c1e4a1049cd9a85587d Mon Sep 17 00:00:00 2001 From: Nick Hill Date: Fri, 19 Sep 2025 19:02:38 -0700 Subject: [PATCH 14/24] [Misc] Support more collective_rpc return types (#25294) Signed-off-by: Nick Hill --- tests/v1/engine/test_engine_core_client.py | 203 ++++++++++++++++++++- 
vllm/v1/serial_utils.py | 60 ++++-- 2 files changed, 246 insertions(+), 17 deletions(-) diff --git a/tests/v1/engine/test_engine_core_client.py b/tests/v1/engine/test_engine_core_client.py index 625a3470e8025..992c4e01386e5 100644 --- a/tests/v1/engine/test_engine_core_client.py +++ b/tests/v1/engine/test_engine_core_client.py @@ -8,7 +8,7 @@ import time import uuid from dataclasses import dataclass from threading import Thread -from typing import Optional, Union +from typing import Any, Optional, Union from unittest.mock import MagicMock import pytest @@ -331,6 +331,46 @@ def echo_dc( return [val for _ in range(3)] if return_list else val +# Dummy utility function to test dict serialization with custom types. +def echo_dc_dict( + self, + msg: str, + return_dict: bool = False, +) -> Union[MyDataclass, dict[str, MyDataclass]]: + print(f"echo dc dict util function called: {msg}") + val = None if msg is None else MyDataclass(msg) + # Return dict of dataclasses to verify support for returning dicts + # with custom value types. + if return_dict: + return {"key1": val, "key2": val, "key3": val} + else: + return val + + +# Dummy utility function to test nested structures with custom types. +def echo_dc_nested( + self, + msg: str, + structure_type: str = "list_of_dicts", +) -> Any: + print(f"echo dc nested util function called: {msg}, " + f"structure: {structure_type}") + val = None if msg is None else MyDataclass(msg) + + if structure_type == "list_of_dicts": # noqa + # Return list of dicts: [{"a": val, "b": val}, {"c": val, "d": val}] + return [{"a": val, "b": val}, {"c": val, "d": val}] + elif structure_type == "dict_of_lists": + # Return dict of lists: {"list1": [val, val], "list2": [val, val]} + return {"list1": [val, val], "list2": [val, val]} + elif structure_type == "deep_nested": + # Return deeply nested: {"outer": [{"inner": [val, val]}, + # {"inner": [val]}]} + return {"outer": [{"inner": [val, val]}, {"inner": [val]}]} + else: + return val + + @pytest.mark.asyncio(loop_scope="function") async def test_engine_core_client_util_method_custom_return( monkeypatch: pytest.MonkeyPatch): @@ -384,6 +424,167 @@ async def test_engine_core_client_util_method_custom_return( client.shutdown() +@pytest.mark.asyncio(loop_scope="function") +async def test_engine_core_client_util_method_custom_dict_return( + monkeypatch: pytest.MonkeyPatch): + + with monkeypatch.context() as m: + m.setenv("VLLM_USE_V1", "1") + + # Must set insecure serialization to allow returning custom types. + m.setenv("VLLM_ALLOW_INSECURE_SERIALIZATION", "1") + + # Monkey-patch core engine utility function to test. + m.setattr(EngineCore, "echo_dc_dict", echo_dc_dict, raising=False) + + engine_args = EngineArgs(model=MODEL_NAME, enforce_eager=True) + vllm_config = engine_args.create_engine_config( + usage_context=UsageContext.UNKNOWN_CONTEXT) + executor_class = Executor.get_class(vllm_config) + + with set_default_torch_num_threads(1): + client = EngineCoreClient.make_client( + multiprocess_mode=True, + asyncio_mode=True, + vllm_config=vllm_config, + executor_class=executor_class, + log_stats=True, + ) + + try: + # Test utility method returning custom / non-native data type. 
+ core_client: AsyncMPClient = client + + # Test single object return + result = await core_client.call_utility_async( + "echo_dc_dict", "testarg3", False) + assert isinstance(result, + MyDataclass) and result.message == "testarg3" + + # Test dict return with custom value types + result = await core_client.call_utility_async( + "echo_dc_dict", "testarg3", True) + assert isinstance(result, dict) and len(result) == 3 + for key, val in result.items(): + assert key in ["key1", "key2", "key3"] + assert isinstance(val, + MyDataclass) and val.message == "testarg3" + + # Test returning dict with None values + result = await core_client.call_utility_async( + "echo_dc_dict", None, True) + assert isinstance(result, dict) and len(result) == 3 + for key, val in result.items(): + assert key in ["key1", "key2", "key3"] + assert val is None + + finally: + client.shutdown() + + +@pytest.mark.asyncio(loop_scope="function") +async def test_engine_core_client_util_method_nested_structures( + monkeypatch: pytest.MonkeyPatch): + + with monkeypatch.context() as m: + m.setenv("VLLM_USE_V1", "1") + + # Must set insecure serialization to allow returning custom types. + m.setenv("VLLM_ALLOW_INSECURE_SERIALIZATION", "1") + + # Monkey-patch core engine utility function to test. + m.setattr(EngineCore, "echo_dc_nested", echo_dc_nested, raising=False) + + engine_args = EngineArgs(model=MODEL_NAME, enforce_eager=True) + vllm_config = engine_args.create_engine_config( + usage_context=UsageContext.UNKNOWN_CONTEXT) + executor_class = Executor.get_class(vllm_config) + + with set_default_torch_num_threads(1): + client = EngineCoreClient.make_client( + multiprocess_mode=True, + asyncio_mode=True, + vllm_config=vllm_config, + executor_class=executor_class, + log_stats=True, + ) + + try: + core_client: AsyncMPClient = client + + # Test list of dicts: [{"a": val, "b": val}, {"c": val, "d": val}] + result = await core_client.call_utility_async( + "echo_dc_nested", "nested1", "list_of_dicts") + assert isinstance(result, list) and len(result) == 2 + for i, item in enumerate(result): + assert isinstance(item, dict) + if i == 0: + assert "a" in item and "b" in item + assert isinstance( + item["a"], + MyDataclass) and item["a"].message == "nested1" + assert isinstance( + item["b"], + MyDataclass) and item["b"].message == "nested1" + else: + assert "c" in item and "d" in item + assert isinstance( + item["c"], + MyDataclass) and item["c"].message == "nested1" + assert isinstance( + item["d"], + MyDataclass) and item["d"].message == "nested1" + + # Test dict of lists: {"list1": [val, val], "list2": [val, val]} + result = await core_client.call_utility_async( + "echo_dc_nested", "nested2", "dict_of_lists") + assert isinstance(result, dict) and len(result) == 2 + assert "list1" in result and "list2" in result + for key, lst in result.items(): + assert isinstance(lst, list) and len(lst) == 2 + for item in lst: + assert isinstance( + item, MyDataclass) and item.message == "nested2" + + # Test deeply nested: {"outer": [{"inner": [val, val]}, + # {"inner": [val]}]} + result = await core_client.call_utility_async( + "echo_dc_nested", "nested3", "deep_nested") + assert isinstance(result, dict) and "outer" in result + outer_list = result["outer"] + assert isinstance(outer_list, list) and len(outer_list) == 2 + + # First dict in outer list should have "inner" with 2 items + inner_dict1 = outer_list[0] + assert isinstance(inner_dict1, dict) and "inner" in inner_dict1 + inner_list1 = inner_dict1["inner"] + assert isinstance(inner_list1, list) and 
len(inner_list1) == 2 + for item in inner_list1: + assert isinstance(item, + MyDataclass) and item.message == "nested3" + + # Second dict in outer list should have "inner" with 1 item + inner_dict2 = outer_list[1] + assert isinstance(inner_dict2, dict) and "inner" in inner_dict2 + inner_list2 = inner_dict2["inner"] + assert isinstance(inner_list2, list) and len(inner_list2) == 1 + assert isinstance( + inner_list2[0], + MyDataclass) and inner_list2[0].message == "nested3" + + # Test with None values in nested structures + result = await core_client.call_utility_async( + "echo_dc_nested", None, "list_of_dicts") + assert isinstance(result, list) and len(result) == 2 + for item in result: + assert isinstance(item, dict) + for val in item.values(): + assert val is None + + finally: + client.shutdown() + + @pytest.mark.parametrize( "multiprocessing_mode,publisher_config", [(True, "tcp"), (False, "inproc")], diff --git a/vllm/v1/serial_utils.py b/vllm/v1/serial_utils.py index 50c1470c67edc..c812a2ec6427a 100644 --- a/vllm/v1/serial_utils.py +++ b/vllm/v1/serial_utils.py @@ -7,7 +7,7 @@ import pickle from collections.abc import Sequence from inspect import isclass from types import FunctionType -from typing import Any, Optional, Union +from typing import Any, Callable, Optional, Union import cloudpickle import msgspec @@ -59,6 +59,42 @@ def _typestr(val: Any) -> Optional[tuple[str, str]]: return t.__module__, t.__qualname__ +def _encode_type_info_recursive(obj: Any) -> Any: + """Recursively encode type information for nested structures of + lists/dicts.""" + if obj is None: + return None + if type(obj) is list: + return [_encode_type_info_recursive(item) for item in obj] + if type(obj) is dict: + return {k: _encode_type_info_recursive(v) for k, v in obj.items()} + return _typestr(obj) + + +def _decode_type_info_recursive( + type_info: Any, data: Any, convert_fn: Callable[[Sequence[str], Any], + Any]) -> Any: + """Recursively decode type information for nested structures of + lists/dicts.""" + if type_info is None: + return data + if isinstance(type_info, dict): + assert isinstance(data, dict) + return { + k: _decode_type_info_recursive(type_info[k], data[k], convert_fn) + for k in type_info + } + if isinstance(type_info, list) and ( + # Exclude serialized tensors/numpy arrays. + len(type_info) != 2 or not isinstance(type_info[0], str)): + assert isinstance(data, list) + return [ + _decode_type_info_recursive(ti, d, convert_fn) + for ti, d in zip(type_info, data) + ] + return convert_fn(type_info, data) + + class MsgpackEncoder: """Encoder with custom torch tensor and numpy array serialization. @@ -129,12 +165,10 @@ class MsgpackEncoder: result = obj.result if not envs.VLLM_ALLOW_INSECURE_SERIALIZATION: return None, result - # Since utility results are not strongly typed, we also encode - # the type (or a list of types in the case it's a list) to - # help with correct msgspec deserialization. - return _typestr(result) if type(result) is not list else [ - _typestr(v) for v in result - ], result + # Since utility results are not strongly typed, we recursively + # encode type information for nested structures of lists/dicts + # to help with correct msgspec deserialization. 
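# Editorial aside (not part of the patch): a standalone sketch of what the
# recursive type encoding produces for a nested result. `Point` is a made-up
# stand-in for a custom result type such as MyDataclass in the tests above.
from dataclasses import dataclass

from vllm.v1.serial_utils import _encode_type_info_recursive


@dataclass
class Point:
    x: int


nested_result = {"outer": [{"inner": [Point(1), None]}]}
type_info = _encode_type_info_recursive(nested_result)
# type_info mirrors the container structure, replacing each non-container
# leaf with its (module, qualname) pair and keeping None as None, roughly:
#   {"outer": [{"inner": [("__main__", "Point"), None]}]}
# The decoder then walks type_info and the payload together with
# _decode_type_info_recursive, applying convert_fn only at those leaves,
# which is how nested lists/dicts of custom types survive msgspec round-trips.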
+ return _encode_type_info_recursive(result), result if not envs.VLLM_ALLOW_INSECURE_SERIALIZATION: raise TypeError(f"Object of type {type(obj)} is not serializable" @@ -288,15 +322,9 @@ class MsgpackDecoder: if not envs.VLLM_ALLOW_INSECURE_SERIALIZATION: raise TypeError("VLLM_ALLOW_INSECURE_SERIALIZATION must " "be set to use custom utility result types") - assert isinstance(result_type, list) - if len(result_type) == 2 and isinstance(result_type[0], str): - result = self._convert_result(result_type, result) - else: - assert isinstance(result, list) - result = [ - self._convert_result(rt, r) - for rt, r in zip(result_type, result) - ] + # Use recursive decoding to handle nested structures + result = _decode_type_info_recursive(result_type, result, + self._convert_result) return UtilityResult(result) def _convert_result(self, result_type: Sequence[str], result: Any) -> Any: From c308501cb6a922af8c4183bd65be0094dd73de9a Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Sat, 20 Sep 2025 04:11:03 +0100 Subject: [PATCH 15/24] Improve weight loading for encoder models in Transformers backend (#25289) Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- vllm/model_executor/models/transformers.py | 28 ++++++++++++++++++++-- 1 file changed, 26 insertions(+), 2 deletions(-) diff --git a/vllm/model_executor/models/transformers.py b/vllm/model_executor/models/transformers.py index f40a20dee63d7..3bd4d10316ec6 100644 --- a/vllm/model_executor/models/transformers.py +++ b/vllm/model_executor/models/transformers.py @@ -702,21 +702,45 @@ class TransformersBase(nn.Module, SupportsQuant, SupportsLoRA, SupportsPP): class TransformersModel(TransformersBase): hf_to_vllm_mapper = WeightsMapper( orig_to_new_prefix={ + # Handle BERT-like models + "bert": "model", # Add `model.` prefix for base model checkpoints "": "model.", - # Remove `model.` from places it should not be + # Remove `model.` prefix if it was already there "model.model.": "model.", + # Pooling adapters will be adjacent to `model` + "model.pooler": "pooler", "model.score": "score", + # Classifier adapter's classifier layer is renamed to score + "model.classifier": "score", + }, + orig_to_new_suffix={ + # Replace legacy suffixes used for norms + ".gamma": ".weight", + ".beta": ".bias", }) def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__(vllm_config=vllm_config, prefix=prefix) - # Some encoder models have the position_ids buffer in the checkpoint + # After creating a pooling model, `pooler` will be duplicated. + # The one inside `model` comes from the Transformers modelling code. + # The one after `model` is an adapter from vLLM. + # We want to use the adapter so we nullify the original pooler. + if getattr(self.model, "pooler", None) is not None: + self.skip_prefixes.append("pooler.") + self.model.pooler = torch.nn.Identity() + + # Some encoder models have the position_ids buffer in the checkpoint. # vLLM will always pass position_ids as an argument, so we skip loading # the buffer if it exists self.skip_substrs.append("position_ids") + # Some encoder models have the bias of the final classifier layer + # in the checkpoint. 
vLLM does not use this bias, so we skip loading + # it if it exists + self.skip_substrs.append("score.bias") + def create_attention_instances( self, attn_type: AttentionType = AttentionType.DECODER): # TODO(hmellor): Better way to detect encoder models From 36429096171ff8785645c40c662d859dddedd829 Mon Sep 17 00:00:00 2001 From: JartX Date: Sat, 20 Sep 2025 05:18:13 +0200 Subject: [PATCH 16/24] [BUGFIX] GPTQ quantization compatibility for Qwen3 Next MOE models (AutoGPTQ and AutoRound-GPTQ) (#25268) Signed-off-by: JartX --- vllm/model_executor/models/qwen3_next.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/vllm/model_executor/models/qwen3_next.py b/vllm/model_executor/models/qwen3_next.py index 98749c160ba4d..ce917f92bd2e5 100644 --- a/vllm/model_executor/models/qwen3_next.py +++ b/vllm/model_executor/models/qwen3_next.py @@ -148,9 +148,11 @@ class Qwen3NextSparseMoeBlock(nn.Module): def _maybe_ignore_quant_config(self, quant_config: QuantizationConfig): # GPTQ configs do not have a list of ignored modules, however AutoGPTQ - # seems to avoid gate quantization. - # See: https://huggingface.co/Qwen/Qwen3-30B-A3B-GPTQ-Int4 - if isinstance(quant_config, (GPTQConfig, GPTQMarlinConfig)): + # seems to avoid gate quantization while AutoRound does. + if isinstance( + quant_config, + (GPTQConfig, + GPTQMarlinConfig)) and not quant_config.autoround_version: return None return quant_config From b7f186bbb3101e97bb9ad42b7ffb3cdb4bb590fd Mon Sep 17 00:00:00 2001 From: Nick Hill Date: Fri, 19 Sep 2025 21:28:31 -0700 Subject: [PATCH 17/24] [BugFix] Exclude self when checking for port collision (#25286) Signed-off-by: Nick Hill --- vllm/utils/__init__.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/vllm/utils/__init__.py b/vllm/utils/__init__.py index d4013a69e99fe..fd1c0af31269c 100644 --- a/vllm/utils/__init__.py +++ b/vllm/utils/__init__.py @@ -987,8 +987,10 @@ def find_process_using_port(port: int) -> Optional[psutil.Process]: if sys.platform.startswith("darwin"): return None + our_pid = os.getpid() for conn in psutil.net_connections(): - if conn.laddr.port == port: + if conn.laddr.port == port and (conn.pid is not None + and conn.pid != our_pid): try: return psutil.Process(conn.pid) except psutil.NoSuchProcess: From 6c5f82e5aa87cd73ce03ce10fc44138f75ee1aea Mon Sep 17 00:00:00 2001 From: "Chendi.Xue" Date: Fri, 19 Sep 2025 23:41:23 -0500 Subject: [PATCH 18/24] [BUG FIX][NON-CUDA]quick fix to avoid call cudagraph_unsafe in attention (#25298) Signed-off-by: Chendi Xue --- vllm/attention/layer.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/vllm/attention/layer.py b/vllm/attention/layer.py index 3d1269c0ecea8..544a720524429 100644 --- a/vllm/attention/layer.py +++ b/vllm/attention/layer.py @@ -29,6 +29,10 @@ from vllm.utils import GiB_bytes, direct_register_custom_op logger = init_logger(__name__) USE_XFORMERS_OPS = None +try: + tag_cudagraph_unsafe = (torch._C.Tag.cudagraph_unsafe, ) +except AttributeError: + tag_cudagraph_unsafe = () # type: ignore[assignment] def check_xformers_availability(): @@ -577,7 +581,7 @@ direct_register_custom_op( mutates_args=[], fake_impl=unified_attention_fake, dispatch_key=current_platform.dispatch_key, - tags=(torch._C.Tag.cudagraph_unsafe, ), + tags=tag_cudagraph_unsafe, ) @@ -628,5 +632,5 @@ direct_register_custom_op( mutates_args=["output", "output_block_scale"], fake_impl=unified_attention_with_output_fake, dispatch_key=current_platform.dispatch_key, - 
tags=(torch._C.Tag.cudagraph_unsafe, ), + tags=tag_cudagraph_unsafe, ) From f91480b2d44c263fb600b5cba5b0e6c7a195f742 Mon Sep 17 00:00:00 2001 From: Chauncey Date: Sat, 20 Sep 2025 13:29:54 +0800 Subject: [PATCH 19/24] [Bugfix] fix tool call arguments is empty (#25223) Signed-off-by: chaunceyjiang Co-authored-by: xin.li --- .../test_completion_with_function_calling.py | 60 +++++++++++++++++++ vllm/entrypoints/chat_utils.py | 8 ++- 2 files changed, 65 insertions(+), 3 deletions(-) diff --git a/tests/entrypoints/openai/test_completion_with_function_calling.py b/tests/entrypoints/openai/test_completion_with_function_calling.py index 3649cefa9bf42..4355603fcd70b 100644 --- a/tests/entrypoints/openai/test_completion_with_function_calling.py +++ b/tests/entrypoints/openai/test_completion_with_function_calling.py @@ -1,6 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import datetime from typing import Union import openai # use the official client for correctness check @@ -284,3 +285,62 @@ async def test_tool_id_kimi_k2(k2_client: openai.AsyncOpenAI, model_name: str, output.extend(chunk.choices[0].delta.tool_calls) for o in output: assert o.id is None or o.id == 'functions.get_current_weather:0' + + +@pytest.mark.asyncio +@pytest.mark.parametrize("model_name", [MODEL_NAME]) +@pytest.mark.parametrize("arguments", ["{}", '']) +async def test_no_args_tool_call(client: openai.AsyncOpenAI, model_name: str, + arguments: str): + # Step 1: Define a tool that requires no parameters + tools = [{ + "type": "function", + "function": { + "name": "get_current_time", + "description": + "Get the current date and time. No parameters needed.", + "parameters": { + "type": "object", + "properties": {}, # No parameters + "required": [] # No required fields + } + } + }] + messages = [{"role": "user", "content": "What time is it now?"}] + # Step 2: Send user message and let model decide whether to call the tool + response = await client.chat.completions.create( + model=model_name, + messages=messages, + tools=tools, + tool_choice="auto" # Let model choose automatically + ) + + # Step 3: Check if model wants to call a tool + message = response.choices[0].message + if message.tool_calls: + # Get the first tool call + tool_call = message.tool_calls[0] + tool_name = tool_call.function.name + # Step 4: Execute the tool locally (no parameters) + if tool_name == "get_current_time": + # Test both empty string and "{}" for no-arg tool calls + tool_call.function.arguments = arguments + messages.append(message) + current_time = datetime.datetime.now() + result = current_time.isoformat() + messages.append({ + "role": "tool", + "tool_call_id": tool_call.id, + "content": result, + }) + # Step 5: Send tool result back to model to continue conversation + final_response = await client.chat.completions.create( + model=model_name, + messages=messages, + ) + # Output final natural language response + assert final_response.choices[0].message.content is not None + + else: + # No tool called — just print model's direct reply + assert message.content is not None diff --git a/vllm/entrypoints/chat_utils.py b/vllm/entrypoints/chat_utils.py index 00ef39f134653..c2c0ad74ef431 100644 --- a/vllm/entrypoints/chat_utils.py +++ b/vllm/entrypoints/chat_utils.py @@ -1450,9 +1450,11 @@ def _postprocess_messages(messages: list[ConversationMessage]) -> None: and isinstance(message["tool_calls"], list) ): for item in message["tool_calls"]: - item["function"]["arguments"] = json.loads( - 
item["function"]["arguments"] - ) + # if arguments is None or empty string, set to {} + if content := item["function"].get("arguments"): + item["function"]["arguments"] = json.loads(content) + else: + item["function"]["arguments"] = {} def parse_chat_messages( From c60e6137f0bf2034853919b3a9d705d7e06b93cf Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Sat, 20 Sep 2025 13:30:22 +0800 Subject: [PATCH 20/24] [Optimization] Avoid repeated model architecture conversion for pooling models (#25261) Signed-off-by: DarkLight1337 --- vllm/config/model.py | 22 +++++++++++++++++++++- vllm/model_executor/model_loader/utils.py | 17 ++++++++++++++++- 2 files changed, 37 insertions(+), 2 deletions(-) diff --git a/vllm/config/model.py b/vllm/config/model.py index 4e847922b61e6..921322bb475c5 100644 --- a/vllm/config/model.py +++ b/vllm/config/model.py @@ -322,8 +322,28 @@ class ModelConfig: factors.append(self.override_generation_config) factors.append(self.rope_scaling) factors.append(self.rope_theta) + # hf_config can control how the model looks! - factors.append(self.hf_config.to_json_string()) + try: + hf_config_json = self.hf_config.to_json_string(use_diff=False) + except TypeError: + from transformers import PretrainedConfig + + from vllm.utils.jsontree import json_map_leaves + + # Handle nested HF configs with unserializable values gracefully + hf_config_json = json.dumps( + json_map_leaves( + lambda v: v.to_dict() + if isinstance(v, PretrainedConfig) else str(v), + self.hf_config.to_dict(), + ), + indent=2, + sort_keys=True, + ) + "\n" + + factors.append(hf_config_json) + str_factors = str(factors) assert_hashable(str_factors) return hashlib.sha256(str(factors).encode()).hexdigest() diff --git a/vllm/model_executor/model_loader/utils.py b/vllm/model_executor/model_loader/utils.py index bd1773c753a93..e007d431880eb 100644 --- a/vllm/model_executor/model_loader/utils.py +++ b/vllm/model_executor/model_loader/utils.py @@ -165,7 +165,11 @@ def device_loading_context(module: torch.nn.Module, # New parameters or parameters already on target device are untouched -def get_model_architecture( +_MODEL_ARCH_BY_HASH = dict[str, tuple[type[nn.Module], str]]() +"""Caches the outputs of `_get_model_architecture`.""" + + +def _get_model_architecture( model_config: ModelConfig) -> tuple[type[nn.Module], str]: architectures = getattr(model_config.hf_config, "architectures", []) @@ -209,6 +213,17 @@ def get_model_architecture( return model_cls, arch +def get_model_architecture( + model_config: ModelConfig) -> tuple[type[nn.Module], str]: + key = model_config.compute_hash() + if key in _MODEL_ARCH_BY_HASH: + return _MODEL_ARCH_BY_HASH[key] + + model_arch = _get_model_architecture(model_config) + _MODEL_ARCH_BY_HASH[key] = model_arch + return model_arch + + def get_model_cls(model_config: ModelConfig) -> type[nn.Module]: return get_model_architecture(model_config)[0] From 9607d5eb449711b349d4c2bee0a9c94afcc7ed14 Mon Sep 17 00:00:00 2001 From: Chen Zhang Date: Fri, 19 Sep 2025 23:43:59 -0700 Subject: [PATCH 21/24] [Hybrid Allocator] Support full attention with different hidden size (#25101) Signed-off-by: Chen Zhang --- tests/v1/core/test_kv_cache_utils.py | 118 +++++++++++++++++++--- vllm/v1/core/kv_cache_utils.py | 144 ++++++++++++++++++++------- vllm/v1/engine/core.py | 16 ++- vllm/v1/kv_cache_interface.py | 70 +++++++++++++ vllm/v1/worker/gpu_model_runner.py | 65 ++++++------ vllm/v1/worker/utils.py | 3 +- 6 files changed, 324 insertions(+), 92 deletions(-) diff --git a/tests/v1/core/test_kv_cache_utils.py 
b/tests/v1/core/test_kv_cache_utils.py index 4bf6bbbfeae28..4cb7ed6ce3824 100644 --- a/tests/v1/core/test_kv_cache_utils.py +++ b/tests/v1/core/test_kv_cache_utils.py @@ -18,12 +18,14 @@ from vllm.v1.core.kv_cache_manager import KVCacheManager from vllm.v1.core.kv_cache_utils import ( BlockHash, FreeKVCacheBlockQueue, KVCacheBlock, PrefixCachingMetrics, estimate_max_model_len, generate_block_hash_extra_keys, - get_kv_cache_configs, get_max_concurrency_for_kv_cache_config, - get_request_block_hasher, hash_block_tokens, init_none_hash, - is_kv_cache_type_uniform, make_block_hash_with_group_id) + generate_scheduler_kv_cache_config, get_kv_cache_configs, + get_max_concurrency_for_kv_cache_config, get_request_block_hasher, + hash_block_tokens, init_none_hash, is_kv_cache_spec_uniform, + make_block_hash_with_group_id) from vllm.v1.kv_cache_interface import (FullAttentionSpec, KVCacheConfig, KVCacheGroupSpec, KVCacheSpec, - KVCacheTensor, SlidingWindowSpec) + KVCacheTensor, SlidingWindowSpec, + UniformTypeKVCacheSpecs) from vllm.v1.metrics.stats import PrefixCacheStats from vllm.v1.request import Request @@ -927,36 +929,36 @@ def test_merge_kv_cache_spec(): assert merged_layer_spec.sliding_window == 1 -def test_is_kv_cache_type_uniform(): +def test_is_kv_cache_spec_uniform(): kv_cache_spec = { "layer_1": new_kv_cache_spec(num_kv_heads=32), "layer_2": new_kv_cache_spec(num_kv_heads=32), } - assert is_kv_cache_type_uniform(kv_cache_spec) + assert is_kv_cache_spec_uniform(kv_cache_spec) kv_cache_spec = { "layer_1": new_kv_cache_spec(num_kv_heads=32), "layer_2": new_kv_cache_spec(num_kv_heads=32, sliding_window=1), } - assert is_kv_cache_type_uniform(kv_cache_spec) + assert is_kv_cache_spec_uniform(kv_cache_spec) kv_cache_spec = { "layer_1": new_kv_cache_spec(num_kv_heads=32), "layer_2": new_sliding_window_spec(num_kv_heads=32, sliding_window=1), } - assert not is_kv_cache_type_uniform(kv_cache_spec) + assert not is_kv_cache_spec_uniform(kv_cache_spec) kv_cache_spec = { "layer_1": new_sliding_window_spec(num_kv_heads=32, sliding_window=1), "layer_2": new_sliding_window_spec(num_kv_heads=32, sliding_window=1), } - assert is_kv_cache_type_uniform(kv_cache_spec) + assert is_kv_cache_spec_uniform(kv_cache_spec) kv_cache_spec = { "layer_1": new_sliding_window_spec(num_kv_heads=32, sliding_window=1), "layer_2": new_sliding_window_spec(num_kv_heads=32, sliding_window=2), } - assert not is_kv_cache_type_uniform(kv_cache_spec) + assert not is_kv_cache_spec_uniform(kv_cache_spec) @pytest.mark.parametrize( @@ -1286,14 +1288,28 @@ def test_get_kv_cache_config_one_worker(): ], ) - # different hidden size, unimplemented + # different hidden size kv_cache_specs_hybrid = { 'layer_1': new_kv_cache_spec(head_size=128), - 'layer_2': new_kv_cache_spec(), + 'layer_2': new_kv_cache_spec(head_size=64), } - with pytest.raises(NotImplementedError): - get_kv_cache_configs(vllm_config, [kv_cache_specs_hybrid], - [mem_per_block_per_layer * 2 * 32])[0] + kv_cache_config_hybrid = get_kv_cache_configs( + vllm_config, [kv_cache_specs_hybrid], + [mem_per_block_per_layer * 3 * 32])[0] + assert kv_cache_config_hybrid == KVCacheConfig( + num_blocks=32, + kv_cache_tensors=[ + KVCacheTensor(size=mem_per_block_per_layer * 32 * 2, + shared_by=["layer_1"]), + KVCacheTensor(size=mem_per_block_per_layer * 32, + shared_by=["layer_2"]), + ], + kv_cache_groups=[ + KVCacheGroupSpec(["layer_1", "layer_2"], + UniformTypeKVCacheSpecs( + block_size=16, + kv_cache_specs=kv_cache_specs_hybrid)) + ]) # Test num_gpu_blocks_override 
vllm_config.cache_config.num_gpu_blocks_override = 16 @@ -1324,3 +1340,75 @@ def test_get_kv_cache_configs_attention_free(): kv_cache_groups=[], ) ] + + +def test_generate_uniform_type_kv_cache_specs(): + # All layers are full attention, can be merged + kv_cache_specs = { + 'layer_1': new_kv_cache_spec(), + 'layer_2': new_kv_cache_spec(head_size=128), + } + uniform_spec = UniformTypeKVCacheSpecs.from_specs(kv_cache_specs) + assert uniform_spec == UniformTypeKVCacheSpecs( + block_size=16, kv_cache_specs=kv_cache_specs) + + # Full attention + sliding window, cannot be merged + kv_cache_specs = { + 'layer_1': new_kv_cache_spec(), + 'layer_2': new_sliding_window_spec(sliding_window=1), + } + uniform_spec = UniformTypeKVCacheSpecs.from_specs(kv_cache_specs) + assert uniform_spec is None + + # different order of full attention + sliding window, cannot be merged + kv_cache_specs = { + 'layer_1': new_sliding_window_spec(sliding_window=1), + 'layer_2': new_kv_cache_spec(), + } + uniform_spec = UniformTypeKVCacheSpecs.from_specs(kv_cache_specs) + assert uniform_spec is None + + # Same-size sliding window, can be merged + kv_cache_specs = { + 'layer_1': new_sliding_window_spec(sliding_window=1), + 'layer_2': new_sliding_window_spec(sliding_window=1, head_size=128), + } + uniform_spec = UniformTypeKVCacheSpecs.from_specs(kv_cache_specs) + assert uniform_spec == UniformTypeKVCacheSpecs( + block_size=16, kv_cache_specs=kv_cache_specs) + + # different block sizes, cannot be merged + kv_cache_specs = { + 'layer_1': new_kv_cache_spec(block_size=16), + 'layer_2': new_kv_cache_spec(block_size=32), + } + uniform_spec = UniformTypeKVCacheSpecs.from_specs(kv_cache_specs) + assert uniform_spec is None + + +def test_generate_scheduler_kv_cache_config(): + kv_cache_specs = { + 'layer_1': new_kv_cache_spec(), + 'layer_2': new_kv_cache_spec(head_size=128), + } + kv_cache_configs = [ + KVCacheConfig( + num_blocks=10, + kv_cache_tensors=[], + kv_cache_groups=[ + KVCacheGroupSpec(['layer_1', 'layer_2'], + UniformTypeKVCacheSpecs( + block_size=16, + kv_cache_specs=kv_cache_specs)), + ], + ) + ] + scheduler_kv_cache_config = generate_scheduler_kv_cache_config( + kv_cache_configs) + assert scheduler_kv_cache_config == KVCacheConfig( + num_blocks=10, + kv_cache_tensors=[], + kv_cache_groups=[ + KVCacheGroupSpec(['layer_1', 'layer_2'], new_kv_cache_spec()) + ], + ) diff --git a/vllm/v1/core/kv_cache_utils.py b/vllm/v1/core/kv_cache_utils.py index bc2ec5e42ea20..3ccd00121f8ed 100644 --- a/vllm/v1/core/kv_cache_utils.py +++ b/vllm/v1/core/kv_cache_utils.py @@ -2,6 +2,7 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project """KV-Cache Utilities.""" +import copy import os from collections import defaultdict, deque from collections.abc import Iterable, Sequence @@ -15,7 +16,8 @@ from vllm.utils import GiB_bytes, cdiv, sha256_cbor from vllm.v1.kv_cache_interface import (ChunkedLocalAttentionSpec, FullAttentionSpec, KVCacheConfig, KVCacheGroupSpec, KVCacheSpec, - KVCacheTensor, SlidingWindowSpec) + KVCacheTensor, SlidingWindowSpec, + UniformTypeKVCacheSpecs) from vllm.v1.metrics.stats import PrefixCacheStats from vllm.v1.request import Request @@ -750,7 +752,7 @@ def create_kv_cache_group_specs( return kv_cache_groups -def is_kv_cache_type_uniform(kv_cache_spec: dict[str, KVCacheSpec]) -> bool: +def is_kv_cache_spec_uniform(kv_cache_spec: dict[str, KVCacheSpec]) -> bool: """ Whether all layers in the given KVCacheSpec have the same KV cache spec. 
Note that we regard FullAttentionSpec with and without sliding window as @@ -793,6 +795,21 @@ def get_max_concurrency_for_kv_cache_config( return max_concurrency +def may_override_num_blocks(vllm_config: VllmConfig, num_blocks: int) -> int: + """ + Override the number of kv cache blocks if `num_gpu_blocks_override` is set. + """ + if vllm_config.cache_config.num_gpu_blocks_override is not None: + num_gpu_blocks_override = \ + vllm_config.cache_config.num_gpu_blocks_override + logger.info( + "Overriding num_gpu_blocks=%d with " + "num_gpu_blocks_override=%d", num_blocks, num_gpu_blocks_override) + num_blocks = num_gpu_blocks_override + + return num_blocks + + def get_num_blocks(vllm_config: VllmConfig, num_layers: int, available_memory: int, page_size: int) -> int: """ @@ -806,13 +823,7 @@ def get_num_blocks(vllm_config: VllmConfig, num_layers: int, """ num_blocks = int(available_memory // page_size // num_layers) num_blocks = max(num_blocks, 0) - if vllm_config.cache_config.num_gpu_blocks_override is not None: - num_gpu_blocks_override = \ - vllm_config.cache_config.num_gpu_blocks_override - logger.info( - "Overriding num_gpu_blocks=%d with " - "num_gpu_blocks_override=%d", num_blocks, num_gpu_blocks_override) - num_blocks = num_gpu_blocks_override + num_blocks = may_override_num_blocks(vllm_config, num_blocks) return num_blocks @@ -825,11 +836,11 @@ def get_uniform_page_size(kv_cache_spec: dict[str, KVCacheSpec]) -> int: return page_sizes.pop() -def _get_kv_cache_groups_uniform_type( +def _get_kv_cache_groups_uniform_spec( kv_cache_specs: dict[str, KVCacheSpec]) -> list[KVCacheGroupSpec]: """ - Generates the KV cache configuration for a model with one type of KV cache. - Divide the available memory equally among all layers. + Generates the KV cache configuration for a model with the same KV cache + spec for all layers. Args: kv_cache_specs: The kv cache spec of each attention layer in the model @@ -842,6 +853,22 @@ def _get_kv_cache_groups_uniform_type( [list(kv_cache_specs.keys())]) +def _get_kv_cache_groups_uniform_type( + spec: UniformTypeKVCacheSpecs) -> list[KVCacheGroupSpec]: + """ + Generates the KV cache configuration for a model with one type of KV cache + but different hidden sizes. All layers are merged into one group. + + Args: + spec: The UniformTypeKVCacheSpecs of the model + + Returns: + The generated KVCacheGroupSpecs + """ + + return [KVCacheGroupSpec(list(spec.kv_cache_specs.keys()), spec)] + + def is_kv_cache_page_size_uniform( kv_cache_spec: dict[str, KVCacheSpec]) -> bool: """ @@ -1000,28 +1027,45 @@ def get_kv_cache_config_from_groups(vllm_config: VllmConfig, ) # Determine how model runners should initialize the KV cache tensors. - # We will have group_size memory pools, each is shared by one layer from - # each group. As layers of different groups have different block table, - # they will use different parts of the shared Tensor. - # The memory layout for 3 groups (full.0, full.1), (sw.0, sw.2), - # (sw.1, padding) will be: (group_size = 2) - # full.0, sw.0, sw.1: share a Tensor with size=available_memory//2 - # full.1, sw.2: share another Tensor with size=available_memory//2 - group_size = max(len(group.layer_names) for group in kv_cache_groups) + if len(kv_cache_groups) == 1 and \ + isinstance(kv_cache_groups[0].kv_cache_spec, UniformTypeKVCacheSpecs): + # Special case: all layers have the same type of KV cache but with + # different hidden size. Allocate different amount of memory for each + # layer based on its hidden size. 
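# Editorial aside (not part of the patch): a worked example of this sizing,
# mirroring test_get_kv_cache_config_one_worker above. The concrete byte
# value below is a placeholder; the test computes it from the layer spec.
mem_per_block_per_layer = 65536  # placeholder per-block bytes, head_size=64
available_memory = mem_per_block_per_layer * 3 * 32  # what the test grants
# layer_1 (head_size=128) has twice the per-block footprint of layer_2
# (head_size=64), so the merged UniformTypeKVCacheSpecs page size is
# 3 * mem_per_block_per_layer, and:
num_blocks = available_memory // (3 * mem_per_block_per_layer)   # == 32
layer_1_tensor_bytes = 2 * mem_per_block_per_layer * num_blocks  # == mem * 32 * 2
layer_2_tensor_bytes = 1 * mem_per_block_per_layer * num_blocks  # == mem * 32
# These are exactly the two per-layer KVCacheTensor sizes asserted in the
# test, rather than the shared-tensor layout used in the general case below.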
+ num_blocks = available_memory // kv_cache_groups[ + 0].kv_cache_spec.page_size_bytes + num_blocks = may_override_num_blocks(vllm_config, num_blocks) + per_layer_specs = kv_cache_groups[0].kv_cache_spec.kv_cache_specs + kv_cache_tensors = [ + KVCacheTensor(size=per_layer_specs[layer_name].page_size_bytes * + num_blocks, + shared_by=[layer_name]) + for layer_name in kv_cache_groups[0].layer_names + ] + else: + # General case: + # We will have group_size memory pools, each is shared by one layer from + # each group. As layers of different groups have different block table, + # they will use different parts of the shared Tensor. + # The memory layout for 3 groups (full.0, full.1), (sw.0, sw.2), + # (sw.1, padding) will be: (group_size = 2) + # full.0, sw.0, sw.1: share a Tensor with size=available_memory//2 + # full.1, sw.2: share another Tensor with size=available_memory//2 + group_size = max(len(group.layer_names) for group in kv_cache_groups) - page_size = get_uniform_page_size(kv_cache_specs) - assert group_size > 0, "group_size must be greater than 0" - num_blocks = get_num_blocks(vllm_config, group_size, available_memory, - page_size) - per_memory_pool_size = page_size * num_blocks - kv_cache_tensors = [] - for i in range(group_size): - shared_by = [] - for j in range(len(kv_cache_groups)): - if i < len(kv_cache_groups[j].layer_names): - shared_by.append(kv_cache_groups[j].layer_names[i]) - kv_cache_tensors.append( - KVCacheTensor(size=per_memory_pool_size, shared_by=shared_by)) + page_size = get_uniform_page_size(kv_cache_specs) + assert group_size > 0, "group_size must be greater than 0" + num_blocks = get_num_blocks(vllm_config, group_size, available_memory, + page_size) + kv_cache_tensors = [] + for i in range(group_size): + shared_by = [] + for j in range(len(kv_cache_groups)): + if i < len(kv_cache_groups[j].layer_names): + shared_by.append(kv_cache_groups[j].layer_names[i]) + kv_cache_tensors.append( + KVCacheTensor(size=page_size * num_blocks, + shared_by=shared_by)) kv_cache_config = KVCacheConfig( num_blocks=num_blocks, @@ -1059,7 +1103,7 @@ def unify_hybrid_kv_cache_specs(kv_cache_spec: dict[str, KVCacheSpec]): kv_cache_spec: The kv cache spec of each attention layer in the model """ - if is_kv_cache_type_uniform(kv_cache_spec): + if is_kv_cache_spec_uniform(kv_cache_spec): return logger.warning( @@ -1097,7 +1141,7 @@ def unify_hybrid_kv_cache_specs(kv_cache_spec: dict[str, KVCacheSpec]): attention_chunk_size=spec.attention_chunk_size, ) - if not is_kv_cache_type_uniform(kv_cache_spec): + if not is_kv_cache_spec_uniform(kv_cache_spec): raise ValueError("Hybrid KV cache manager is disabled but failed to " "convert the KV cache specs to one unified type.") @@ -1122,11 +1166,16 @@ def get_kv_cache_groups( # This returns an empty list to allow for the KVCacheManager to handle # attention free models. return [] - elif is_kv_cache_type_uniform(kv_cache_spec): + elif is_kv_cache_spec_uniform(kv_cache_spec): # KV cache of all layers are the same, which is true for # most models. Allocate the same amount of memory for # each layer. - return _get_kv_cache_groups_uniform_type(kv_cache_spec) + return _get_kv_cache_groups_uniform_spec(kv_cache_spec) + elif uniform_spec := UniformTypeKVCacheSpecs.from_specs(kv_cache_spec): + # All layers need the same number of token slots (e.g., all layers are + # full attention, or all layers are sliding window attention with the + # same window size). Put all layers into one group. 
+ return _get_kv_cache_groups_uniform_type(uniform_spec) elif is_kv_cache_page_size_uniform(kv_cache_spec): # Model contains multiple attention types, but KV cache of all layers # have the same physical memory per block per layer. Split the layers @@ -1137,6 +1186,27 @@ def get_kv_cache_groups( raise NotImplementedError +def generate_scheduler_kv_cache_config( + kv_cache_configs: list[KVCacheConfig]) -> KVCacheConfig: + """ + Generate the KV cache configuration for the scheduler. + """ + assert all([ + cfg.num_blocks == kv_cache_configs[0].num_blocks + for cfg in kv_cache_configs + ]) + # All workers have the same kv_cache_config except layer names, so use + # an arbitrary one to initialize the scheduler. + cfg = copy.deepcopy(kv_cache_configs[0]) + for group in cfg.kv_cache_groups: + if isinstance(group.kv_cache_spec, UniformTypeKVCacheSpecs): + # All layers in the UniformTypeKVCacheSpecs have the same type, + # so use an arbitrary one to initialize the scheduler. + group.kv_cache_spec = next( + iter(group.kv_cache_spec.kv_cache_specs.values())) + return cfg + + def get_kv_cache_configs(vllm_config: VllmConfig, kv_cache_specs: list[dict[str, KVCacheSpec]], available_memory: list[int]) -> list[KVCacheConfig]: diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py index a022e9c0d7058..a43042a5510a8 100644 --- a/vllm/v1/engine/core.py +++ b/vllm/v1/engine/core.py @@ -29,7 +29,9 @@ from vllm.transformers_utils.config import ( maybe_register_config_serialize_by_value) from vllm.utils import (decorate_logs, get_hash_fn_by_name, make_zmq_socket, resolve_obj_by_qualname, set_process_title) -from vllm.v1.core.kv_cache_utils import (BlockHash, get_kv_cache_configs, +from vllm.v1.core.kv_cache_utils import (BlockHash, + generate_scheduler_kv_cache_config, + get_kv_cache_configs, get_request_block_hasher, init_none_hash) from vllm.v1.core.sched.interface import SchedulerInterface @@ -196,16 +198,10 @@ class EngineCore: kv_cache_configs = get_kv_cache_configs(vllm_config, kv_cache_specs, available_gpu_memory) - - # All workers have the same kv_cache_config except layer names, so use - # an arbitrary one to initialize the scheduler. - assert all([ - cfg.num_blocks == kv_cache_configs[0].num_blocks - for cfg in kv_cache_configs - ]) - num_gpu_blocks = kv_cache_configs[0].num_blocks + scheduler_kv_cache_config = generate_scheduler_kv_cache_config( + kv_cache_configs) + num_gpu_blocks = scheduler_kv_cache_config.num_blocks num_cpu_blocks = 0 - scheduler_kv_cache_config = kv_cache_configs[0] # Initialize kv cache and warmup the execution self.model_executor.initialize_from_config(kv_cache_configs) diff --git a/vllm/v1/kv_cache_interface.py b/vllm/v1/kv_cache_interface.py index 0cf92a680a689..f72cc8f93a6c2 100644 --- a/vllm/v1/kv_cache_interface.py +++ b/vllm/v1/kv_cache_interface.py @@ -234,6 +234,76 @@ class CrossAttentionSpec(AttentionSpec): return cdiv(max_encoder_len, self.block_size) * self.page_size_bytes +@dataclass(frozen=True) +class UniformTypeKVCacheSpecs(KVCacheSpec): + """ + A KV cache spec for multiple layers with the same type of attention. Here, + same types means always need the same number of token slots. For example, + sliding window attentions with different window sizes are not the same type + and should not be merged into one UniformTypeKVCacheSpecs. 
+ """ + kv_cache_specs: dict[str, KVCacheSpec] + + @property + def page_size_bytes(self) -> int: + return sum(spec.page_size_bytes + for spec in self.kv_cache_specs.values()) + + def max_memory_usage_bytes(self, vllm_config: VllmConfig) -> int: + max_num_pages = max( + cdiv(spec.max_memory_usage_bytes(vllm_config), + spec.page_size_bytes) + for spec in self.kv_cache_specs.values()) + return max_num_pages * self.page_size_bytes + + @classmethod + def is_uniform_type(cls, kv_cache_specs: dict[str, KVCacheSpec]) -> bool: + """ + Whether all layers have the same type of KV cache spec. + """ + block_sizes = set(spec.block_size for spec in kv_cache_specs.values()) + if len(block_sizes) > 1: + # Different block sizes, not uniform. + return False + one_spec = next(iter(kv_cache_specs.values())) + if isinstance(one_spec, (FullAttentionSpec, CrossAttentionSpec)): + return all( + isinstance(spec, type(one_spec)) + for spec in kv_cache_specs.values()) + elif isinstance(one_spec, SlidingWindowSpec): + return all( + isinstance(spec, SlidingWindowSpec) + and spec.sliding_window == one_spec.sliding_window + for spec in kv_cache_specs.values()) + elif isinstance(one_spec, ChunkedLocalAttentionSpec): + return all( + isinstance(spec, ChunkedLocalAttentionSpec) + and spec.attention_chunk_size == one_spec.attention_chunk_size + for spec in kv_cache_specs.values()) + elif isinstance(one_spec, MambaSpec): + return all( + isinstance(spec, MambaSpec) and spec.num_speculative_blocks == + one_spec.num_speculative_blocks + for spec in kv_cache_specs.values()) + else: + # NOTE(Chen): Please add new branches for new KV cache spec types. + raise NotImplementedError( + f"Unsupported KV cache spec type: {type(one_spec)}") + + @classmethod + def from_specs(cls, kv_cache_specs: dict[str, + KVCacheSpec]) -> Optional[Self]: + """ + Return a SameTypeKVCacheSpecs object if all layers have the same type + of KV cache spec. Return None if not. 
+ """ + if cls.is_uniform_type(kv_cache_specs): + block_size = next(iter(kv_cache_specs.values())).block_size + return cls(block_size=block_size, kv_cache_specs=kv_cache_specs) + else: + return None + + @dataclass class KVCacheTensor: """ diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index dffadd1d769b7..233df8f1b0e9b 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -8,7 +8,7 @@ from collections import defaultdict from collections.abc import Iterator from contextlib import contextmanager from copy import deepcopy -from typing import TYPE_CHECKING, Any, Optional, Union, cast +from typing import TYPE_CHECKING, Any, NamedTuple, Optional, Union, cast import numpy as np import torch @@ -74,7 +74,8 @@ from vllm.v1.kv_cache_interface import (AttentionSpec, EncoderOnlyAttentionSpec, FullAttentionSpec, KVCacheConfig, KVCacheGroupSpec, KVCacheSpec, - MambaSpec, SlidingWindowSpec) + MambaSpec, SlidingWindowSpec, + UniformTypeKVCacheSpecs) # yapf: enable from vllm.v1.outputs import (EMPTY_MODEL_RUNNER_OUTPUT, AsyncModelRunnerOutput, DraftTokenIds, LogprobsLists, LogprobsTensors, @@ -1187,7 +1188,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): common_prefix_len = self._compute_cascade_attn_prefix_len( num_scheduled_tokens, num_common_prefix_blocks, - kv_cache_group_spec.kv_cache_spec, + attn_group.kv_cache_spec, builder, ) @@ -3453,12 +3454,16 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): assert len(self.attn_groups) == 0, \ "Attention backends are already initialized" - def get_attn_backends_for_layers( - layer_names: list[str] - ) -> dict[type[AttentionBackend], list[str]]: - layers = get_layers_from_vllm_config(self.vllm_config, - AttentionLayerBase, - layer_names) + class AttentionGroupKey(NamedTuple): + attn_backend: type[AttentionBackend] + kv_cache_spec: KVCacheSpec + + def get_attn_backends_for_group( + kv_cache_group_spec: KVCacheGroupSpec, + ) -> dict[AttentionGroupKey, list[str]]: + layers = get_layers_from_vllm_config( + self.vllm_config, AttentionLayerBase, + kv_cache_group_spec.layer_names) attn_backends = {} attn_backend_layers = defaultdict(list) # Dedupe based on full class name; this is a bit safer than @@ -3466,7 +3471,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): # attention backend subclasses (e.g. ChunkedLocalAttention) unless # they are cached correctly, there will be different objects per # layer. 
- for layer_name in layer_names: + for layer_name in kv_cache_group_spec.layer_names: attn_backend = layers[layer_name].get_attn_backend() if layer_name in self.kv_sharing_fast_prefill_eligible_layers: @@ -3475,8 +3480,14 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): attn_backend, ) - key = attn_backend.full_cls_name() - attn_backends[key] = attn_backend + full_cls_name = attn_backend.full_cls_name() + layer_kv_cache_spec = kv_cache_group_spec.kv_cache_spec + if isinstance(layer_kv_cache_spec, UniformTypeKVCacheSpecs): + layer_kv_cache_spec = layer_kv_cache_spec.kv_cache_specs[ + layer_name] + key = (full_cls_name, layer_kv_cache_spec) + attn_backends[key] = AttentionGroupKey(attn_backend, + layer_kv_cache_spec) attn_backend_layers[key].append(layer_name) return { attn_backends[k]: v @@ -3484,11 +3495,11 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): } def create_attn_groups( - attn_backends_map: dict[AttentionBackend, list[str]], - kv_cache_spec: KVCacheSpec, + attn_backends_map: dict[AttentionGroupKey, list[str]], ) -> list[AttentionGroup]: attn_groups: list[AttentionGroup] = [] - for attn_backend, layer_names in attn_backends_map.items(): + for (attn_backend, + kv_cache_spec), layer_names in attn_backends_map.items(): attn_metadata_builders = [] attn_metadata_builders.append(attn_backend.get_builder_cls()( kv_cache_spec, @@ -3506,16 +3517,13 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): )) attn_group = AttentionGroup(attn_backend, attn_metadata_builders, - layer_names) + layer_names, kv_cache_spec) attn_groups.append(attn_group) return attn_groups for kv_cache_group_spec in kv_cache_config.kv_cache_groups: - kv_cache_spec = kv_cache_group_spec.kv_cache_spec - attn_backends = get_attn_backends_for_layers( - kv_cache_group_spec.layer_names) - self.attn_groups.append( - create_attn_groups(attn_backends, kv_cache_spec)) + attn_backends = get_attn_backends_for_group(kv_cache_group_spec) + self.attn_groups.append(create_attn_groups(attn_backends)) # Calculate reorder batch threshold (if needed) self.calculate_reorder_batch_threshold() @@ -3680,14 +3688,11 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): def _attn_group_iterator(self) -> Iterator[AttentionGroup]: return itertools.chain.from_iterable(self.attn_groups) - def _kv_cache_spec_attn_group_iterator( - self) -> Iterator[tuple[KVCacheSpec, AttentionGroup]]: + def _kv_cache_spec_attn_group_iterator(self) -> Iterator[AttentionGroup]: if not self.kv_cache_config.kv_cache_groups: return - for kv_cache_spec_id, attn_groups in enumerate(self.attn_groups): - for attn_group in attn_groups: - yield self.kv_cache_config.kv_cache_groups[ - kv_cache_spec_id].kv_cache_spec, attn_group + for attn_groups in self.attn_groups: + yield from attn_groups def _reshape_kv_cache_tensors( self, @@ -3707,7 +3712,8 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): """ kv_caches: dict[str, torch.Tensor] = {} has_attn, has_mamba = False, False - for kv_cache_spec, group in self._kv_cache_spec_attn_group_iterator(): + for group in self._kv_cache_spec_attn_group_iterator(): + kv_cache_spec = group.kv_cache_spec attn_backend = group.backend for layer_name in group.layer_names: if layer_name in self.runner_only_attn_layers: @@ -3787,7 +3793,8 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): kv_caches: The KV cache buffer of each layer. 
""" - for kv_cache_spec, group in self._kv_cache_spec_attn_group_iterator(): + for group in self._kv_cache_spec_attn_group_iterator(): + kv_cache_spec = group.kv_cache_spec for layer_name in group.layer_names: kv_cache = kv_caches[layer_name] if (isinstance(kv_cache_spec, AttentionSpec) diff --git a/vllm/v1/worker/utils.py b/vllm/v1/worker/utils.py index b76ac633892f3..021d18b2500f0 100644 --- a/vllm/v1/worker/utils.py +++ b/vllm/v1/worker/utils.py @@ -15,7 +15,7 @@ from vllm.multimodal.registry import MultiModalRegistry from vllm.platforms import current_platform from vllm.v1.attention.backends.utils import AttentionMetadataBuilder from vllm.v1.core.encoder_cache_manager import compute_mm_encoder_budget -from vllm.v1.kv_cache_interface import KVCacheGroupSpec +from vllm.v1.kv_cache_interface import KVCacheGroupSpec, KVCacheSpec if TYPE_CHECKING: from vllm.attention.layer import Attention @@ -132,6 +132,7 @@ class AttentionGroup: backend: type[AttentionBackend] metadata_builders: list[AttentionMetadataBuilder] layer_names: list[str] + kv_cache_spec: KVCacheSpec def get_metadata_builder(self, ubatch_id: Optional[int] = None From be874c020196080305baf988ed8c1c82047323be Mon Sep 17 00:00:00 2001 From: Roger Wang Date: Sat, 20 Sep 2025 00:04:05 -0700 Subject: [PATCH 22/24] [Bugfix] Fix Qwen3-VL-MoE weight loading for EP (#25300) Signed-off-by: Roger Wang --- vllm/model_executor/models/qwen3_vl_moe.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/vllm/model_executor/models/qwen3_vl_moe.py b/vllm/model_executor/models/qwen3_vl_moe.py index 625f94cf7ad77..7912cf3ea52b0 100644 --- a/vllm/model_executor/models/qwen3_vl_moe.py +++ b/vllm/model_executor/models/qwen3_vl_moe.py @@ -122,9 +122,10 @@ class Qwen3MoeLLMModel(Qwen3MoeModel): def load_fused_expert_weights(self, name: str, params_dict: dict, loaded_weight: torch.Tensor, shard_id: str, - num_experts: int): + num_experts: int) -> bool: param = params_dict[name] weight_loader = typing.cast(Callable[..., bool], param.weight_loader) + loaded_local_expert = False for expert_id in range(num_experts): curr_expert_weight = loaded_weight[expert_id] success = weight_loader(param, @@ -133,9 +134,10 @@ class Qwen3MoeLLMModel(Qwen3MoeModel): shard_id, expert_id, return_success=True) - if not success: - return False - return True + if success: + loaded_local_expert = True + + return loaded_local_expert def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: @@ -345,4 +347,4 @@ class Qwen3VLMoeForConditionalGeneration(Qwen3VLForConditionalGeneration): for _ in range(self.deepstack_num_level) ] if self.use_deepstack else None self.visual_dim = config.vision_config.out_hidden_size - self.multiscale_dim = self.visual_dim * self.deepstack_num_level \ No newline at end of file + self.multiscale_dim = self.visual_dim * self.deepstack_num_level From 3d9a1d2de5091455bb2fbf6b21fc9188fd4612a4 Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Sat, 20 Sep 2025 15:14:35 +0800 Subject: [PATCH 23/24] [V1] Support `LLM.apply_model` (#18465) Signed-off-by: DarkLight1337 --- tests/conftest.py | 12 +--- tests/kernels/moe/test_mxfp4_moe.py | 37 ++++++---- .../multimodal/generation/test_qwen2_vl.py | 46 ++++++------ tests/models/quantization/test_awq.py | 2 +- tests/quantization/test_compressed_tensors.py | 18 +++-- tests/quantization/test_fp8.py | 8 +-- tests/quantization/test_gptq_dynamic.py | 71 ++++++++++--------- tests/quantization/test_lm_head.py | 4 +- tests/quantization/test_modelopt.py | 10 +-- 
tests/quantization/test_ptpc_fp8.py | 47 +++++++----- tests/quantization/test_quark.py | 26 +++---- .../test_register_quantization_config.py | 17 +++-- vllm/engine/llm_engine.py | 7 +- vllm/entrypoints/llm.py | 9 ++- vllm/executor/executor_base.py | 33 +++++---- vllm/v1/engine/llm_engine.py | 7 +- vllm/worker/worker_base.py | 9 ++- 17 files changed, 194 insertions(+), 169 deletions(-) diff --git a/tests/conftest.py b/tests/conftest.py index 3cd93f4ad3289..e8e95357ff5b9 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -987,17 +987,7 @@ class VllmRunner: return [req_output.outputs.score for req_output in req_outputs] def apply_model(self, func: Callable[[nn.Module], _R]) -> list[_R]: - if hasattr(self.llm.llm_engine, "model_executor"): - # This works either in V0 or in V1 with - # VLLM_ENABLE_V1_MULTIPROCESSING=0 - executor = self.llm.llm_engine.model_executor - return executor.apply_model(func) - - # This works in V1 with VLLM_ALLOW_INSECURE_SERIALIZATION=1 - def _apply_model(self): - return func(self.get_model()) - - return self.llm.llm_engine.collective_rpc(_apply_model) + return self.llm.apply_model(func) def get_llm(self) -> LLM: return self.llm diff --git a/tests/kernels/moe/test_mxfp4_moe.py b/tests/kernels/moe/test_mxfp4_moe.py index a3b8f07638d9a..61d3311cc1624 100644 --- a/tests/kernels/moe/test_mxfp4_moe.py +++ b/tests/kernels/moe/test_mxfp4_moe.py @@ -1,21 +1,24 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -import importlib import importlib.metadata from dataclasses import dataclass +from importlib.util import find_spec from typing import Optional import pytest import torch from packaging import version +from vllm.model_executor.layers.quantization.quark.quark import ( # noqa: E501 + QuarkLinearMethod, QuarkW4A4MXFP4) +from vllm.model_executor.layers.quantization.quark.quark_moe import ( # noqa: E501 + QuarkW4A4MXFp4MoEMethod) from vllm.platforms import current_platform from vllm.utils.flashinfer import has_flashinfer -QUARK_MXFP4_AVAILABLE = importlib.util.find_spec( - "quark") is not None and version.parse( - importlib.metadata.version("amd-quark")) >= version.parse('0.8.99') +QUARK_MXFP4_AVAILABLE = find_spec("quark") is not None and version.parse( + importlib.metadata.version("amd-quark")) >= version.parse('0.8.99') TRTLLM_GEN_MXFP4_AVAILABLE = current_platform.is_cuda( ) and current_platform.is_device_capability(100) @@ -39,6 +42,12 @@ class ModelCase: tp: int +@pytest.fixture(scope="function", autouse=True) +def enable_pickle(monkeypatch): + """`LLM.apply_model` requires pickling a function.""" + monkeypatch.setenv("VLLM_ALLOW_INSECURE_SERIALIZATION", "1") + + @pytest.mark.parametrize('model_case', [ ModelCase("fxmarty/qwen_1.5-moe-a2.7b-mxfp4", tp=1), ModelCase("fxmarty/deepseek_r1_3_layers_mxfp4", tp=8), @@ -55,21 +64,19 @@ def test_mxfp4_loading_and_execution_moe(vllm_runner, model_case: ModelCase): tensor_parallel_size=model_case.tp, load_format="dummy") as llm: - # TODO: llm.apply_model(check_model) currently relies on V0 internals. - # Re-enable this later. 
- # def check_model(model): - # layer = model.model.layers[0] + def check_model(model): + layer = model.model.layers[0] - # qkv_proj = layer.self_attn.qkv_proj + qkv_proj = layer.self_attn.qkv_proj - # assert isinstance(qkv_proj.quant_method, QuarkLinearMethod) - # assert isinstance(qkv_proj.scheme, QuarkW4A4MXFP4) + assert isinstance(qkv_proj.quant_method, QuarkLinearMethod) + assert isinstance(qkv_proj.scheme, QuarkW4A4MXFP4) - # assert isinstance(layer.mlp.experts.quant_method, - # QuarkW4A4MXFp4MoEMethod) + assert isinstance(layer.mlp.experts.quant_method, + QuarkW4A4MXFp4MoEMethod) - # if model_case.model_id == "fxmarty/qwen_1.5-moe-a2.7b-mxfp4": - # llm.apply_model(check_model) + if model_case.model_id == "fxmarty/qwen_1.5-moe-a2.7b-mxfp4": + llm.apply_model(check_model) output = llm.generate_greedy("Today I am in the French Alps and", max_tokens=20) diff --git a/tests/models/multimodal/generation/test_qwen2_vl.py b/tests/models/multimodal/generation/test_qwen2_vl.py index a81f5e7ec8872..e56f4e4075be4 100644 --- a/tests/models/multimodal/generation/test_qwen2_vl.py +++ b/tests/models/multimodal/generation/test_qwen2_vl.py @@ -10,6 +10,7 @@ from PIL import Image from vllm.multimodal.image import rescale_image_size from vllm.multimodal.video import rescale_video_size, sample_frames_from_video +from vllm.utils import set_default_torch_num_threads from ....conftest import (IMAGE_ASSETS, VIDEO_ASSETS, PromptImageInput, PromptVideoInput, VllmRunner) @@ -17,11 +18,9 @@ from ...utils import check_logprobs_close @pytest.fixture(scope="function", autouse=True) -def use_v0_only(monkeypatch): - """ - V1 Test: batch_make_xxxxx_embeddings calls a V0 internal - """ - monkeypatch.setenv('VLLM_USE_V1', '0') +def enable_pickle(monkeypatch): + """`LLM.apply_model` requires pickling a function.""" + monkeypatch.setenv("VLLM_ALLOW_INSECURE_SERIALIZATION", "1") models = ["Qwen/Qwen2-VL-2B-Instruct"] @@ -126,9 +125,8 @@ def batch_make_image_embeddings( image_grid_thw_on_device = image_grid_thw.to(visual.device, dtype=torch.int64) return visual(pixel_values_on_device, - grid_thw=image_grid_thw_on_device) + grid_thw=image_grid_thw_on_device).cpu() - # V1 Test: this calls a V0 internal. image_embeds = torch.concat(llm.apply_model(get_image_embeds)) # split into original batches @@ -210,7 +208,7 @@ def batch_make_video_embeddings( video_grid_thw_on_device = video_grid_thw.to(visual.device, dtype=torch.int64) return visual(pixel_values_on_device, - grid_thw=video_grid_thw_on_device) + grid_thw=video_grid_thw_on_device).cpu() # V1 Test: this calls a V0 internal. 
video_embeds = torch.concat(llm.apply_model(get_image_embeds)) @@ -266,19 +264,22 @@ def run_embedding_input_test( processor = AutoProcessor.from_pretrained(model) # max_model_len should be greater than image_feature_size - with vllm_runner(model, - runner="generate", - max_model_len=4000, - max_num_seqs=3, - dtype=dtype, - limit_mm_per_prompt={ - "image": mm_limit, - "video": mm_limit - }, - tensor_parallel_size=tensor_parallel_size, - distributed_executor_backend=distributed_executor_backend - ) as vllm_model: + with set_default_torch_num_threads(1): + vllm_model = vllm_runner( + model, + runner="generate", + max_model_len=4000, + max_num_seqs=3, + dtype=dtype, + limit_mm_per_prompt={ + "image": mm_limit, + "video": mm_limit + }, + tensor_parallel_size=tensor_parallel_size, + distributed_executor_backend=distributed_executor_backend, + ) + with vllm_model: outputs_per_case_for_original_input = [ vllm_model.generate_greedy_logprobs(prompts, max_tokens, @@ -329,9 +330,8 @@ def run_embedding_input_test( @pytest.mark.parametrize("max_tokens", [128]) @pytest.mark.parametrize("num_logprobs", [10]) def test_qwen2_vl_image_embeddings_input(vllm_runner, image_assets, model, - size_factors, dtype: str, - max_tokens: int, - num_logprobs: int) -> None: + size_factors, dtype, max_tokens, + num_logprobs, monkeypatch) -> None: images = [asset.pil_image for asset in image_assets] inputs_per_case: list[tuple[ diff --git a/tests/models/quantization/test_awq.py b/tests/models/quantization/test_awq.py index bd696198931ff..7005e435ecf46 100644 --- a/tests/models/quantization/test_awq.py +++ b/tests/models/quantization/test_awq.py @@ -112,7 +112,7 @@ def test_awq_models(vllm_runner, image_assets, source_model, quant_model, monkeypatch) -> None: # Test V1: this test hangs during setup on single-scale input. - # TODO: fixure out why and re-enable this on V1. + # TODO: figure out why and re-enable this on V1. monkeypatch.setenv("VLLM_USE_V1", "0") run_awq_test( vllm_runner, diff --git a/tests/quantization/test_compressed_tensors.py b/tests/quantization/test_compressed_tensors.py index 484f53246f349..b7949a488ad05 100644 --- a/tests/quantization/test_compressed_tensors.py +++ b/tests/quantization/test_compressed_tensors.py @@ -43,12 +43,9 @@ ROCM_TRITON_SCALED_MM_SUPPORTED_INT8_MODEL = [ @pytest.fixture(scope="function", autouse=True) -def use_v0_only(monkeypatch): - """ - This module relies on V0 internals, so set VLLM_USE_V1=0. 
- """ - if not current_platform.is_cpu(): - monkeypatch.setenv('VLLM_USE_V1', '0') +def enable_pickle(monkeypatch): + """`LLM.apply_model` requires pickling a function.""" + monkeypatch.setenv("VLLM_ALLOW_INSECURE_SERIALIZATION", "1") @pytest.mark.parametrize( @@ -176,10 +173,11 @@ def test_compressed_tensors_w8a8_logprobs( dtype = "bfloat16" - # skip language translation prompt for the static per tensor asym model - if (model_path == - "nm-testing/Meta-Llama-3-8B-Instruct-W8A8-Static-Per-Tensor-Asym" - ): # noqa: E501 + # skip language translation prompt for the static per tensor models + if model_path in ( + "nm-testing/Meta-Llama-3-8B-Instruct-W8A8-Static-Per-Tensor-Sym", + "nm-testing/Meta-Llama-3-8B-Instruct-W8A8-Static-Per-Tensor-Asym", + ): example_prompts = example_prompts[0:-1] with hf_runner(model_path, dtype=dtype) as hf_model: diff --git a/tests/quantization/test_fp8.py b/tests/quantization/test_fp8.py index d781f462b4ad7..db53061cf2d1a 100644 --- a/tests/quantization/test_fp8.py +++ b/tests/quantization/test_fp8.py @@ -60,8 +60,8 @@ def test_kv_cache_model_load_and_run(vllm_runner, model_id: str, if use_rocm_aiter: monkeypatch.setenv("VLLM_ROCM_USE_AITER", "1") - # vllm_runner.apply_model() relies on V0 internals. - monkeypatch.setenv("VLLM_USE_V1", "0") + # `LLM.apply_model` requires pickling a function. + monkeypatch.setenv("VLLM_ALLOW_INSECURE_SERIALIZATION", "1") with vllm_runner(model_id, kv_cache_dtype="fp8") as llm: def check_model(model): @@ -104,8 +104,8 @@ def test_load_fp16_model(vllm_runner, kv_cache_dtype: str, force_marlin: bool, if use_rocm_aiter: monkeypatch.setenv("VLLM_ROCM_USE_AITER", "1") - # vllm_runner.apply_model() relies on V0 internals. - monkeypatch.setenv("VLLM_USE_V1", "0") + # `LLM.apply_model` requires pickling a function. + monkeypatch.setenv("VLLM_ALLOW_INSECURE_SERIALIZATION", "1") if force_marlin: monkeypatch.setenv("VLLM_TEST_FORCE_FP8_MARLIN", "1") diff --git a/tests/quantization/test_gptq_dynamic.py b/tests/quantization/test_gptq_dynamic.py index aea50e99c1dd5..00a5946ed0154 100644 --- a/tests/quantization/test_gptq_dynamic.py +++ b/tests/quantization/test_gptq_dynamic.py @@ -31,41 +31,46 @@ MODEL_QUANT = [ @pytest.mark.parametrize("model_id, use_marlin_kernel", MODEL_QUANT) def test_gptq_with_dynamic(vllm_runner, model_id: str, use_marlin_kernel: bool, monkeypatch): - # vllm_runner.apply_model() relies on V0 internals. - monkeypatch.setenv("VLLM_USE_V1", "0") - - vllm_model = vllm_runner(model_id, dtype=torch.float16, max_model_len=2048) + # `LLM.apply_model` requires pickling a function. + monkeypatch.setenv("VLLM_ALLOW_INSECURE_SERIALIZATION", "1") linear_method_cls = GPTQMarlinLinearMethod if use_marlin_kernel else ( GPTQLinearMethod) - for name, submodule in (vllm_model.llm.llm_engine.model_executor. 
- driver_worker.model_runner.model.named_modules()): - if name == "lm_head": - assert isinstance(submodule.quant_method, linear_method_cls) - elif name == 'model.layers.0.self_attn.qkv_proj': - # The first layer is quantized using bits=4, group_size=128 - # desc_act=True - assert isinstance(submodule.quant_method, linear_method_cls) - config = submodule.quant_method.quant_config - assert config.weight_bits == 4 - assert config.group_size == 128 - assert config.desc_act - elif name == 'model.layers.1.self_attn.qkv_proj': - # The second layer is quantized using bits=8, group_size=32 - # desc_act=False - assert isinstance(submodule.quant_method, linear_method_cls) - config = submodule.quant_method.quant_config - assert get_dynamic_override(config, layer_name=name, - key="bits") == 8 - assert get_dynamic_override(config, - layer_name=name, - key="group_size") == 32 - assert not get_dynamic_override( - config, layer_name=name, key="desc_act") - elif (name == 'model.layers.2.self_attn.qkv_proj' - or name == 'model.layers.2.mlp.gate_up_proj'): - # All other layers (layer index >= 2) are not quantized - assert isinstance(submodule.quant_method, UnquantizedLinearMethod) + with vllm_runner(model_id, dtype=torch.float16, max_model_len=2048) as llm: - del vllm_model + def check_model(model): + for name, submodule in model.named_modules(): + if name == "lm_head": + assert isinstance(submodule.quant_method, + linear_method_cls) + elif name == 'model.layers.0.self_attn.qkv_proj': + # The first layer is quantized using bits=4, group_size=128 + # desc_act=True + assert isinstance(submodule.quant_method, + linear_method_cls) + config = submodule.quant_method.quant_config + assert config.weight_bits == 4 + assert config.group_size == 128 + assert config.desc_act + elif name == 'model.layers.1.self_attn.qkv_proj': + # The second layer is quantized using bits=8, group_size=32 + # desc_act=False + assert isinstance(submodule.quant_method, + linear_method_cls) + config = submodule.quant_method.quant_config + assert get_dynamic_override(config, + layer_name=name, + key="bits") == 8 + assert get_dynamic_override(config, + layer_name=name, + key="group_size") == 32 + assert not get_dynamic_override( + config, layer_name=name, key="desc_act") + elif (name == 'model.layers.2.self_attn.qkv_proj' + or name == 'model.layers.2.mlp.gate_up_proj'): + # All other layers (layer index >= 2) are not quantized + assert isinstance(submodule.quant_method, + UnquantizedLinearMethod) + + llm.apply_model(check_model) diff --git a/tests/quantization/test_lm_head.py b/tests/quantization/test_lm_head.py index b24964a9d0a9f..e69d4ad349c38 100644 --- a/tests/quantization/test_lm_head.py +++ b/tests/quantization/test_lm_head.py @@ -29,8 +29,8 @@ def test_lm_head( lm_head_quantized: bool, monkeypatch, ) -> None: - # vllm_runner.apply_model() relies on V0 internals. - monkeypatch.setenv("VLLM_USE_V1", "0") + # `LLM.apply_model` requires pickling a function. 
+ monkeypatch.setenv("VLLM_ALLOW_INSECURE_SERIALIZATION", "1") with vllm_runner(model_id, dtype=torch.float16, max_model_len=2048) as vllm_model: diff --git a/tests/quantization/test_modelopt.py b/tests/quantization/test_modelopt.py index c60a03f44baec..e7174be73626a 100644 --- a/tests/quantization/test_modelopt.py +++ b/tests/quantization/test_modelopt.py @@ -11,16 +11,12 @@ import pytest import torch from tests.quantization.utils import is_quant_method_supported -from vllm.platforms import current_platform @pytest.fixture(scope="function", autouse=True) -def use_v0_only(monkeypatch): - """ - This module relies on V0 internals, so set VLLM_USE_V1=0. - """ - if not current_platform.is_cpu(): - monkeypatch.setenv('VLLM_USE_V1', '0') +def enable_pickle(monkeypatch): + """`LLM.apply_model` requires pickling a function.""" + monkeypatch.setenv("VLLM_ALLOW_INSECURE_SERIALIZATION", "1") @pytest.mark.skipif(not is_quant_method_supported("modelopt"), diff --git a/tests/quantization/test_ptpc_fp8.py b/tests/quantization/test_ptpc_fp8.py index 5f78bc30504c0..088b68510cffa 100644 --- a/tests/quantization/test_ptpc_fp8.py +++ b/tests/quantization/test_ptpc_fp8.py @@ -13,6 +13,16 @@ from vllm.model_executor.layers.quantization.ptpc_fp8 import ( PTPCFp8LinearMethod) from vllm.platforms import current_platform +UNSUPPORTED_STR = ( + "Currently torch._scaled_mm (hipBLASLt) rowwise gemm only " + "support output dtype of bfloat16. torch.float16 is specified.") + + +@pytest.fixture(scope="function", autouse=True) +def enable_pickle(monkeypatch): + """`LLM.apply_model` requires pickling a function.""" + monkeypatch.setenv("VLLM_ALLOW_INSECURE_SERIALIZATION", "1") + @pytest.mark.skipif(not is_quant_method_supported("ptpc_fp8"), reason="PTPC FP8 is not supported on this GPU type.") @@ -21,14 +31,22 @@ from vllm.platforms import current_platform @pytest.mark.parametrize("dtype", ["auto", "bfloat16", "float16"]) @pytest.mark.parametrize("kv_cache_dtype", ["auto", "fp8", "fp8_e4m3"]) def test_ptpc_fp8_rocm(vllm_runner, dtype: str, kv_cache_dtype: str) -> None: - try: - with vllm_runner("facebook/opt-125m", - dtype=dtype, - quantization="ptpc_fp8", - kv_cache_dtype=kv_cache_dtype) as llm: + llm = vllm_runner("facebook/opt-125m", + dtype=dtype, + quantization="ptpc_fp8", + kv_cache_dtype=kv_cache_dtype) + except AssertionError as e: + if str(e) == UNSUPPORTED_STR: + # If the error message matches, the test passes + return + else: + # If the error message does not match, re-raise the exception + raise - model = llm.model.llm_engine.model_executor.driver_worker.model_runner.model # noqa: E501 + with llm: + + def check_model(model): fc1 = model.model.decoder.layers[0].fc1 assert isinstance(fc1.quant_method, PTPCFp8LinearMethod) if kv_cache_dtype == "ptpc_fp8": @@ -40,17 +58,8 @@ def test_ptpc_fp8_rocm(vllm_runner, dtype: str, kv_cache_dtype: str) -> None: if current_platform.has_device_capability(94): # For GPUs with hardware support, we keep weights in fp8 assert fc1.weight.dtype == torch.float8_e4m3fnuz - else: - pytest.skip() - output = llm.generate_greedy("Hello my name is", max_tokens=20) - assert output - except AssertionError as e: - if str( - e - ) == "Currently torch._scaled_mm (hipBLASLt) rowwise gemm only support output dtype of bfloat16. 
torch.float16 is specified.": # noqa: E501 - # If the error message matches, the test passes - pass - else: - # If the error message does not match, re-raise the exception - raise + llm.apply_model(check_model) + + output = llm.generate_greedy("Hello my name is", max_tokens=20) + assert output diff --git a/tests/quantization/test_quark.py b/tests/quantization/test_quark.py index c09931971e6fb..930f4acb328fd 100644 --- a/tests/quantization/test_quark.py +++ b/tests/quantization/test_quark.py @@ -7,10 +7,10 @@ Run `pytest tests/quantization/test_quark.py`. See also `tests/kernels/moe/test_mxfp4_moe.py`. """ -import importlib import importlib.metadata import os from dataclasses import dataclass +from importlib.util import find_spec import huggingface_hub import lm_eval @@ -24,9 +24,8 @@ from vllm.platforms import current_platform from .reference_mxfp4 import dq_mxfp4_torch, qdq_mxfp4_torch -QUARK_MXFP4_AVAILABLE = importlib.util.find_spec( - "quark") is not None and version.parse( - importlib.metadata.version("amd-quark")) >= version.parse('0.8.99') +QUARK_MXFP4_AVAILABLE = find_spec("quark") is not None and version.parse( + importlib.metadata.version("amd-quark")) >= version.parse('0.8.99') if QUARK_MXFP4_AVAILABLE: from quark.torch.export.nn.modules.realquantizer import ( @@ -43,11 +42,9 @@ except huggingface_hub.errors.RepositoryNotFoundError: @pytest.fixture(scope="function", autouse=True) -def use_v0_only(monkeypatch): - """ - This module relies on V0 internals, so set VLLM_USE_V1=0. - """ - monkeypatch.setenv('VLLM_USE_V1', '0') +def enable_pickle(monkeypatch): + """`LLM.apply_model` requires pickling a function.""" + monkeypatch.setenv("VLLM_ALLOW_INSECURE_SERIALIZATION", "1") @pytest.mark.parametrize('kv_cache_dtype', ['auto', 'fp8']) @@ -132,13 +129,12 @@ def test_quark_fp8_parity(vllm_runner): } with (vllm_runner(quark_model_id, **llm_kwargs) as quark_handle, vllm_runner(fp8_model_id, **llm_kwargs) as fp8_handle): - quark_model = (quark_handle.llm.llm_engine.model_executor. - driver_worker.model_runner.model) - quark_state_dict = quark_model.state_dict() - fp8_model = (fp8_handle.llm.llm_engine.model_executor.driver_worker. - model_runner.model) - fp8_state_dict = fp8_model.state_dict() + def get_state_dict(model): + return {k: v.cpu() for k, v in model.state_dict().items()} + + quark_state_dict, = quark_handle.apply_model(get_state_dict) + fp8_state_dict, = fp8_handle.apply_model(get_state_dict) assert fp8_state_dict.keys() == quark_state_dict.keys() diff --git a/tests/quantization/test_register_quantization_config.py b/tests/quantization/test_register_quantization_config.py index 84705e92c85bb..03fe59d7e3bff 100644 --- a/tests/quantization/test_register_quantization_config.py +++ b/tests/quantization/test_register_quantization_config.py @@ -105,18 +105,21 @@ def test_register_quantization_config(): ]) def test_custom_quant(vllm_runner, model, monkeypatch): """Test infer with the custom quantization method.""" - # vllm_runner.apply_model() relies on V0 internals. - monkeypatch.setenv("VLLM_USE_V1", "0") + # `LLM.apply_model` requires pickling a function. 
+ monkeypatch.setenv("VLLM_ALLOW_INSECURE_SERIALIZATION", "1") + with vllm_runner(model_name=model, quantization="custom_quant", enforce_eager=True) as llm: - model = llm.llm.llm_engine.model_executor.driver_worker.model_runner.model # noqa: E501 - layer = model.model.layers[0] - qkv_proj = layer.self_attn.qkv_proj + def check_model(model): + layer = model.model.layers[0] + qkv_proj = layer.self_attn.qkv_proj - # Check the quantization method is FakeQuantLinearMethod - assert isinstance(qkv_proj.quant_method, FakeQuantLinearMethod) + # Check the quantization method is FakeQuantLinearMethod + assert isinstance(qkv_proj.quant_method, FakeQuantLinearMethod) + + llm.apply_model(check_model) output = llm.generate_greedy("Hello my name is", max_tokens=20) assert output diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index 708f3bbeeff15..014bc56bc8ece 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -13,6 +13,7 @@ from typing import Sequence as GenericSequence from typing import Set, Type, Union, cast import torch +import torch.nn as nn from typing_extensions import TypeVar import vllm.envs as envs @@ -55,6 +56,7 @@ from vllm.usage.usage_lib import (UsageContext, is_usage_stats_enabled, from vllm.utils import Counter, Device, resolve_obj_by_qualname, weak_bind from vllm.version import __version__ as VLLM_VERSION from vllm.worker.model_runner_base import InputProcessingError +from vllm.worker.worker_base import WorkerBase logger = init_logger(__name__) _LOCAL_LOGGING_INTERVAL_SEC = 5 @@ -1817,13 +1819,16 @@ class LLMEngine: return sampling_params def collective_rpc(self, - method: Union[str, Callable[..., _R]], + method: Union[str, Callable[[WorkerBase], _R]], timeout: Optional[float] = None, args: tuple = (), kwargs: Optional[dict[str, Any]] = None) -> list[_R]: return self.model_executor.collective_rpc(method, timeout, args, kwargs) + def apply_model(self, func: Callable[[nn.Module], _R]) -> list[_R]: + return self.collective_rpc("apply_model", args=(func, )) + if envs.is_set("VLLM_USE_V1") and envs.VLLM_USE_V1: from vllm.v1.engine.llm_engine import LLMEngine as V1LLMEngine diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py index e21bfce0ab085..f2282c40f7073 100644 --- a/vllm/entrypoints/llm.py +++ b/vllm/entrypoints/llm.py @@ -522,9 +522,14 @@ class LLM: """ Run a function directly on the model inside each worker, returning the result for each of them. + + !!! warning + To reduce the overhead of data transfer, avoid returning large + arrays or tensors from this method. If you must return them, + make sure you move them to CPU first to avoid taking up additional + VRAM! 
""" - executor = self.llm_engine.model_executor - return executor.apply_model(func) + return self.llm_engine.apply_model(func) def _get_beam_search_lora_requests( self, diff --git a/vllm/executor/executor_base.py b/vllm/executor/executor_base.py index 42aa8d14a21eb..b75b94ad0acc2 100644 --- a/vllm/executor/executor_base.py +++ b/vllm/executor/executor_base.py @@ -5,11 +5,10 @@ import asyncio import time from abc import ABC, abstractmethod from functools import cached_property -from typing import (Any, Awaitable, Callable, Dict, List, Optional, Set, Tuple, - Union) +from typing import Any, Awaitable, Callable, List, Optional, Set, Union import torch.nn as nn -from typing_extensions import TypeVar +from typing_extensions import TypeVar, deprecated import vllm.platforms from vllm.config import VllmConfig @@ -63,10 +62,10 @@ class ExecutorBase(ABC): @abstractmethod def collective_rpc(self, - method: Union[str, Callable[..., _R]], + method: Union[str, Callable[[WorkerBase], _R]], timeout: Optional[float] = None, - args: Tuple = (), - kwargs: Optional[Dict[str, Any]] = None) -> List[_R]: + args: tuple = (), + kwargs: Optional[dict[str, Any]] = None) -> list[_R]: """ Execute an RPC call on all workers. @@ -91,7 +90,7 @@ class ExecutorBase(ABC): """ raise NotImplementedError - def determine_num_available_blocks(self) -> Tuple[int, int]: + def determine_num_available_blocks(self) -> tuple[int, int]: """Determine the number of available blocks for the GPU KV cache and swappable CPU KV cache. @@ -99,9 +98,10 @@ class ExecutorBase(ABC): ExecutorBase may require modification of the result, e.g. to ensure the selected cache sizes are compatible with all workers. - Returns a Tuple[num_gpu_blocks, num_cpu_blocks], where num_gpu_blocks - are blocks that are "active" on the device and can be appended to. - num_cpu_blocks refers to "swapped" blocks in CPU memory and cannot be + Returns a tuple `(num_gpu_blocks, num_cpu_blocks)`, where + `num_gpu_blocks` are blocks that are "active" on the device and can be + appended to. + `num_cpu_blocks` refers to "swapped" blocks in CPU memory and cannot be appended to. """ results = self.collective_rpc("determine_num_available_blocks") @@ -127,16 +127,15 @@ class ExecutorBase(ABC): self.collective_rpc("initialize_cache", args=(num_gpu_blocks, num_cpu_blocks)) + @deprecated("`llm_engine.model_executor.apply_model` will no longer work " + "in V1 Engine. Please replace with `llm_engine.apply_model` " + "and set `VLLM_ALLOW_INSECURE_SERIALIZATION=1`.") def apply_model(self, func: Callable[[nn.Module], _R]) -> list[_R]: """ Run a function directly on the model inside each worker, returning the result for each of them. 
""" - - def rpc_func(worker: WorkerBase) -> _R: - return func(worker.get_model()) - - return self.collective_rpc(rpc_func) + return self.collective_rpc("apply_model", args=(func, )) @cached_property # Avoid unnecessary RPC calls def supported_tasks(self) -> tuple[SupportedTask, ...]: @@ -308,8 +307,8 @@ class DistributedExecutorBase(ExecutorBase): def collective_rpc(self, method: Union[str, Callable], timeout: Optional[float] = None, - args: Tuple = (), - kwargs: Optional[Dict] = None) -> List[Any]: + args: tuple = (), + kwargs: Optional[dict[str, Any]] = None) -> list[Any]: return self._run_workers(method, *args, **(kwargs or {})) @abstractmethod diff --git a/vllm/v1/engine/llm_engine.py b/vllm/v1/engine/llm_engine.py index c93bfc35f0aeb..907656d1b24cb 100644 --- a/vllm/v1/engine/llm_engine.py +++ b/vllm/v1/engine/llm_engine.py @@ -5,6 +5,7 @@ from collections.abc import Mapping from copy import copy from typing import Any, Callable, Optional, Union +import torch.nn as nn from typing_extensions import TypeVar import vllm.envs as envs @@ -33,6 +34,7 @@ from vllm.v1.metrics.loggers import (PrometheusStatLogger, StatLoggerBase, StatLoggerFactory) from vllm.v1.metrics.reader import Metric, get_metrics_snapshot from vllm.v1.metrics.stats import IterationStats +from vllm.v1.worker.worker_base import WorkerBase logger = init_logger(__name__) @@ -319,12 +321,15 @@ class LLMEngine: return self.engine_core.pin_lora(lora_id) def collective_rpc(self, - method: Union[str, Callable[..., _R]], + method: Union[str, Callable[[WorkerBase], _R]], timeout: Optional[float] = None, args: tuple = (), kwargs: Optional[dict[str, Any]] = None) -> list[_R]: return self.engine_core.collective_rpc(method, timeout, args, kwargs) + def apply_model(self, func: Callable[[nn.Module], _R]) -> list[_R]: + return self.collective_rpc("apply_model", args=(func, )) + def __del__(self): if dp_group := getattr(self, "dp_group", None): stateless_destroy_torch_distributed_process_group(dp_group) diff --git a/vllm/worker/worker_base.py b/vllm/worker/worker_base.py index aa76d21f0fcaa..d0a56f6ff4637 100644 --- a/vllm/worker/worker_base.py +++ b/vllm/worker/worker_base.py @@ -5,7 +5,8 @@ import dataclasses import os import time from abc import abstractmethod -from typing import Any, Dict, List, Optional, Set, Tuple, Type, Union +from typing import (Any, Callable, Dict, List, Optional, Set, Tuple, Type, + TypeVar, Union) import cloudpickle import torch @@ -28,6 +29,8 @@ from vllm.worker.model_runner_base import (BroadcastableModelInput, logger = init_logger(__name__) +_R = TypeVar("_R") + @warn_for_unimplemented_methods class WorkerBase: @@ -70,6 +73,10 @@ class WorkerBase: def get_model(self) -> nn.Module: raise NotImplementedError + def apply_model(self, fn: Callable[[nn.Module], _R]) -> _R: + """Apply a function on the model inside this worker.""" + return fn(self.get_model()) + def load_model(self) -> None: """Load model onto target device.""" raise NotImplementedError From e08a3a3fdbdb5408f904a237b31ff2447a336b2f Mon Sep 17 00:00:00 2001 From: Michael Goin Date: Sat, 20 Sep 2025 04:16:56 -0400 Subject: [PATCH 24/24] [CI Failure] Disable FlashInfer RoPE to unblock CI (#25299) Signed-off-by: mgoin --- .../model_executor/layers/rotary_embedding/base.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/vllm/model_executor/layers/rotary_embedding/base.py b/vllm/model_executor/layers/rotary_embedding/base.py index 3dc249ae9adb9..1c3576bee5392 100644 --- a/vllm/model_executor/layers/rotary_embedding/base.py 
+++ b/vllm/model_executor/layers/rotary_embedding/base.py @@ -6,8 +6,6 @@ from typing import Optional import torch from vllm.model_executor.custom_op import CustomOp -from vllm.platforms import current_platform -from vllm.utils.flashinfer import has_flashinfer from .common import apply_rotary_emb_torch @@ -32,13 +30,15 @@ class RotaryEmbedding(CustomOp): self.base = base self.is_neox_style = is_neox_style self.dtype = dtype + # TODO(mgoin): disabled for now due to failures # Flashinfer only supports head_size=64, 128, 256, 512. # https://github.com/flashinfer-ai/flashinfer/blob/ebfd655efe830048dba5d582aaa61d61d1cf9a87/include/flashinfer/utils.cuh#L174-L202 - self.use_flashinfer = (self.enabled() - and dtype in (torch.float16, torch.bfloat16) - and current_platform.is_cuda() - and has_flashinfer() - and self.head_size in [64, 128, 256, 512]) + # self.use_flashinfer = (self.enabled() + # and dtype in (torch.float16, torch.bfloat16) + # and current_platform.is_cuda() + # and has_flashinfer() + # and self.head_size in [64, 128, 256, 512]) + self.use_flashinfer = False cache = self._compute_cos_sin_cache() if not self.use_flashinfer:
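
A minimal usage sketch for the `LLM.apply_model` entry point wired up in the
patches above (illustrative only, not part of any diff in this series; the
model name and the parameter-counting callback are arbitrary examples, and
`VLLM_ALLOW_INSECURE_SERIALIZATION=1` is assumed, as in the updated test
fixtures, so the callback can be pickled under V1 multiprocessing):

    # Illustrative sketch, not part of the patch series above.
    import os

    # Assumed prerequisite (mirrors the test fixtures changed in this series):
    # allow the callback to be pickled when V1 multiprocessing is enabled.
    os.environ["VLLM_ALLOW_INSECURE_SERIALIZATION"] = "1"

    import torch.nn as nn
    from vllm import LLM

    llm = LLM(model="facebook/opt-125m")  # model choice is illustrative

    def count_parameters(model: nn.Module) -> int:
        # Return a small Python object; avoid returning large tensors, or
        # move them to CPU first, per the warning added to LLM.apply_model.
        return sum(p.numel() for p in model.parameters())

    # One result per worker (a single element when tensor_parallel_size == 1).
    print(llm.apply_model(count_parameters))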