From 4d7231e7743e80078bbc68ccc37b5ba5a1f28bf5 Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Fri, 21 Nov 2025 17:40:17 +0800 Subject: [PATCH 01/83] Revert #28875 (#29159) --- docker/Dockerfile | 17 +++++++++++++++++ docs/deployment/docker.md | 7 ++++--- .../installation/gpu.cuda.inc.md | 5 ++++- 3 files changed, 25 insertions(+), 4 deletions(-) diff --git a/docker/Dockerfile b/docker/Dockerfile index 709b79e84fbbc..964700e2a43ac 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -56,6 +56,7 @@ ARG UV_EXTRA_INDEX_URL=${PIP_EXTRA_INDEX_URL} # PyTorch provides its own indexes for standard and nightly builds ARG PYTORCH_CUDA_INDEX_BASE_URL=https://download.pytorch.org/whl +ARG PYTORCH_CUDA_NIGHTLY_INDEX_BASE_URL=https://download.pytorch.org/whl/nightly # PIP supports multiple authentication schemes, including keyring # By parameterizing the PIP_KEYRING_PROVIDER variable and setting it to @@ -97,6 +98,7 @@ RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \ ARG PIP_INDEX_URL UV_INDEX_URL ARG PIP_EXTRA_INDEX_URL UV_EXTRA_INDEX_URL ARG PYTORCH_CUDA_INDEX_BASE_URL +ARG PYTORCH_CUDA_NIGHTLY_INDEX_BASE_URL ARG PIP_KEYRING_PROVIDER UV_KEYRING_PROVIDER # Activate virtual environment and add uv to PATH @@ -315,6 +317,7 @@ RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \ ARG PIP_INDEX_URL UV_INDEX_URL ARG PIP_EXTRA_INDEX_URL UV_EXTRA_INDEX_URL ARG PYTORCH_CUDA_INDEX_BASE_URL +ARG PYTORCH_CUDA_NIGHTLY_INDEX_BASE_URL ARG PIP_KEYRING_PROVIDER UV_KEYRING_PROVIDER # Install uv for faster pip installs @@ -334,6 +337,20 @@ ENV UV_LINK_MODE=copy # or future versions of triton. RUN ldconfig /usr/local/cuda-$(echo $CUDA_VERSION | cut -d. 
-f1,2)/compat/ +# arm64 (GH200) build follows the practice of "use existing pytorch" build, +# we need to install torch and torchvision from the nightly builds first, +# pytorch will not appear as a vLLM dependency in all of the following steps +# after this step +RUN --mount=type=cache,target=/root/.cache/uv \ + if [ "$TARGETPLATFORM" = "linux/arm64" ]; then \ + uv pip install --system \ + --index-url ${PYTORCH_CUDA_NIGHTLY_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.') \ + "torch==2.8.0.dev20250318+cu128" "torchvision==0.22.0.dev20250319" ; \ + uv pip install --system \ + --index-url ${PYTORCH_CUDA_NIGHTLY_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.') \ + --pre pytorch_triton==3.3.0+gitab727c40 ; \ + fi + # Install vllm wheel first, so that torch etc will be installed. RUN --mount=type=bind,from=build,src=/workspace/dist,target=/vllm-workspace/dist \ --mount=type=cache,target=/root/.cache/uv \ diff --git a/docs/deployment/docker.md b/docs/deployment/docker.md index 0e636c87f38a4..1c639f3533d47 100644 --- a/docs/deployment/docker.md +++ b/docs/deployment/docker.md @@ -82,7 +82,8 @@ DOCKER_BUILDKIT=1 docker build . \ ## Building for Arm64/aarch64 -A docker container can be built for aarch64 systems such as the Nvidia Grace-Hopper. At time of this writing, this should be considered **experimental**. Using the flag `--platform "linux/arm64"` will attempt to build for arm64. +A docker container can be built for aarch64 systems such as the Nvidia Grace-Hopper. At time of this writing, this requires the use +of PyTorch Nightly and should be considered **experimental**. Using the flag `--platform "linux/arm64"` will attempt to build for arm64. !!! note Multiple modules must be compiled, so this process can take a while. 
Recommend using `--build-arg max_jobs=` & `--build-arg nvcc_threads=` @@ -93,6 +94,7 @@ A docker container can be built for aarch64 systems such as the Nvidia Grace-Hop ```bash # Example of building on Nvidia GH200 server. (Memory usage: ~15GB, Build time: ~1475s / ~25 min, Image size: 6.93GB) + python3 use_existing_torch.py DOCKER_BUILDKIT=1 docker build . \ --file docker/Dockerfile \ --target vllm-openai \ @@ -100,8 +102,7 @@ A docker container can be built for aarch64 systems such as the Nvidia Grace-Hop -t vllm/vllm-gh200-openai:latest \ --build-arg max_jobs=66 \ --build-arg nvcc_threads=2 \ - --build-arg torch_cuda_arch_list="9.0 10.0+PTX" \ - --build-arg RUN_WHEEL_CHECK=false + --build-arg torch_cuda_arch_list="9.0 10.0+PTX" ``` !!! note diff --git a/docs/getting_started/installation/gpu.cuda.inc.md b/docs/getting_started/installation/gpu.cuda.inc.md index 601d3659af886..b2d0d64a2d355 100644 --- a/docs/getting_started/installation/gpu.cuda.inc.md +++ b/docs/getting_started/installation/gpu.cuda.inc.md @@ -158,7 +158,10 @@ uv pip install -e . ##### Use an existing PyTorch installation -There are scenarios where the PyTorch dependency cannot be easily installed with `uv`, for example, when building vLLM with non-default PyTorch builds (like nightly or a custom build). +There are scenarios where the PyTorch dependency cannot be easily installed with `uv`, e.g.: + +- Building vLLM with PyTorch nightly or a custom PyTorch build. +- Building vLLM with aarch64 and CUDA (GH200), where the PyTorch wheels are not available on PyPI. Currently, only the PyTorch nightly has wheels for aarch64 with CUDA. You can run `uv pip install --index-url https://download.pytorch.org/whl/nightly/cu128 torch torchvision torchaudio` to [install PyTorch nightly](https://pytorch.org/get-started/locally/) and then build vLLM on top of it. 
To build vLLM using an existing PyTorch installation: From b34129bf8e5412e4094b89aba5246605c280a5fd Mon Sep 17 00:00:00 2001 From: WeiQing Chen <40507679+david6666666@users.noreply.github.com> Date: Fri, 21 Nov 2025 17:41:20 +0800 Subject: [PATCH 02/83] [Misc] remove useless v1 env (#29164) Signed-off-by: David Chen <530634352@qq.com> --- tests/v1/e2e/test_lora_with_spec_decode.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/tests/v1/e2e/test_lora_with_spec_decode.py b/tests/v1/e2e/test_lora_with_spec_decode.py index 14532f2795443..8c9ab58c3c0ab 100644 --- a/tests/v1/e2e/test_lora_with_spec_decode.py +++ b/tests/v1/e2e/test_lora_with_spec_decode.py @@ -61,8 +61,6 @@ def test_batch_inference_correctness( model_setup: (method, model_name, spec_model_name, lora_path, tp_size) """ with monkeypatch.context() as m: - m.setenv("VLLM_USE_V1", "1") - # Disable randomness m.setenv("CUBLAS_WORKSPACE_CONFIG", ":4096:8") torch.manual_seed(SEED) From aab0102a267eba814cdc09170b530a3aed96be60 Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Fri, 21 Nov 2025 19:56:59 +0800 Subject: [PATCH 03/83] [V0 deprecation] Remove more V0 references (#29088) Signed-off-by: DarkLight1337 --- docs/contributing/model/basic.md | 2 -- docs/design/prefix_caching.md | 3 --- docs/usage/reproducibility.md | 9 ++----- docs/usage/v1_guide.md | 2 +- examples/offline_inference/reproducibility.py | 8 ++---- examples/offline_inference/rlhf_utils.py | 8 +++--- .../offline_inference/save_sharded_state.py | 19 +++----------- examples/offline_inference/spec_decode.py | 6 +---- .../model_loader/test_sharded_state_loader.py | 13 ++-------- tests/tool_use/utils.py | 25 ++++++++++--------- vllm/entrypoints/llm.py | 1 - vllm/entrypoints/openai/protocol.py | 6 ++--- .../layers/mamba/mamba_mixer2.py | 1 - vllm/model_executor/models/interfaces.py | 2 -- vllm/model_executor/models/plamo2.py | 1 - 15 files changed, 31 insertions(+), 75 deletions(-) diff --git a/docs/contributing/model/basic.md 
b/docs/contributing/model/basic.md index d7f5d2f311a37..e828de0adf3c2 100644 --- a/docs/contributing/model/basic.md +++ b/docs/contributing/model/basic.md @@ -133,8 +133,6 @@ We consider 3 different scenarios: For case (1), we recommend looking at the implementation of [`MambaForCausalLM`](../../../vllm/model_executor/models/mamba.py) (for Mamba-1) or [`Mamba2ForCausalLM`](../../../vllm/model_executor/models/mamba2.py) (for Mamba-2) as a reference. The model should inherit protocol `IsAttentionFree` and also implement class methods `get_mamba_state_dtype_from_config` and `get_mamba_state_shape_from_config` to calculate the state shapes and data types from the config. For the mamba layers themselves, please use the [`MambaMixer`](../../../vllm/model_executor/layers/mamba/mamba_mixer.py) (for Mamba-1) or [`MambaMixer2`](../../../vllm/model_executor/layers/mamba/mamba_mixer2.py) (for Mamba-2) classes. -Please *do not* use the `MambaCacheManager` (deprecated in V1) or replicate any of the V0-specific code paths in the existing model implementations. -V0-only classes and code will be removed in the very near future. The model should also be added to the `MODELS_CONFIG_MAP` dictionary in [vllm/model_executor/models/config.py](../../../vllm/model_executor/models/config.py) to ensure that the runtime defaults are optimized. For case (2), we recommend using as a reference the implementation of [`JambaForCausalLM`](../../../vllm/model_executor/models/jamba.py) (for an example of a model that uses Mamba-1 and attention together) or [`BambaForCausalLM`](../../../vllm/model_executor/models/bamba.py) (for an example of a model that uses Mamba-2 and attention together). 
diff --git a/docs/design/prefix_caching.md b/docs/design/prefix_caching.md index 48536a877bd3f..cf792fdabe1a6 100644 --- a/docs/design/prefix_caching.md +++ b/docs/design/prefix_caching.md @@ -94,9 +94,6 @@ To improve privacy in shared environments, vLLM supports isolating prefix cache With this setup, cache sharing is limited to users or requests that explicitly agree on a common salt, enabling cache reuse within a trust group while isolating others. -!!! note - Cache isolation is not supported in engine V0. - ## Data Structure The prefix caching in vLLM v1 is implemented in the KV cache manager. The basic building block is the “Block” data class (simplified): diff --git a/docs/usage/reproducibility.md b/docs/usage/reproducibility.md index d8a1943209c1e..afc25b63902e2 100644 --- a/docs/usage/reproducibility.md +++ b/docs/usage/reproducibility.md @@ -1,10 +1,7 @@ # Reproducibility -vLLM does not guarantee the reproducibility of the results by default, for the sake of performance. You need to do the following to achieve -reproducible results: - -- For V1: Turn off multiprocessing to make the scheduling deterministic by setting `VLLM_ENABLE_V1_MULTIPROCESSING=0`. -- For V0: Set the global seed (see below). +vLLM does not guarantee the reproducibility of the results by default, for the sake of performance. To achieve +reproducible results, you need to turn off multiprocessing to make the scheduling deterministic by setting `VLLM_ENABLE_V1_MULTIPROCESSING=0`. Example: [examples/offline_inference/reproducibility.py](../../examples/offline_inference/reproducibility.py) @@ -30,8 +27,6 @@ However, in some cases, setting the seed will also [change the random state in u ### Default Behavior -In V0, the `seed` parameter defaults to `None`. When the `seed` parameter is `None`, the random states for `random`, `np.random`, and `torch.manual_seed` are not set. This means that each run of vLLM will produce different results if `temperature > 0`, as expected. 
- In V1, the `seed` parameter defaults to `0` which sets the random state for each worker, so the results will remain consistent for each vLLM run even if `temperature > 0`. !!! note diff --git a/docs/usage/v1_guide.md b/docs/usage/v1_guide.md index e46bee3f4ef20..22f4e6761ea9a 100644 --- a/docs/usage/v1_guide.md +++ b/docs/usage/v1_guide.md @@ -2,7 +2,7 @@ !!! announcement - We have started the process of deprecating V0. Please read [RFC #18571](https://github.com/vllm-project/vllm/issues/18571) for more details. + We have fully deprecated V0. Please read [RFC #18571](https://github.com/vllm-project/vllm/issues/18571) for more details. V1 is now enabled by default for all supported use cases, and we will gradually enable it for every use case we plan to support. Please share any feedback on [GitHub](https://github.com/vllm-project/vllm) or in the [vLLM Slack](https://inviter.co/vllm-slack). diff --git a/examples/offline_inference/reproducibility.py b/examples/offline_inference/reproducibility.py index d909438b41042..e135bc1b2abb7 100644 --- a/examples/offline_inference/reproducibility.py +++ b/examples/offline_inference/reproducibility.py @@ -11,13 +11,9 @@ import random from vllm import LLM, SamplingParams -# V1 only: Turn off multiprocessing to make the scheduling deterministic. +# Turn off multiprocessing to make the scheduling deterministic. os.environ["VLLM_ENABLE_V1_MULTIPROCESSING"] = "0" -# V0 only: Set the global seed. The default seed is None, which is -# not reproducible. 
-SEED = 42 - prompts = [ "Hello, my name is", "The president of the United States is", @@ -28,7 +24,7 @@ sampling_params = SamplingParams(temperature=0.8, top_p=0.95) def main(): - llm = LLM(model="facebook/opt-125m", seed=SEED) + llm = LLM(model="facebook/opt-125m") outputs = llm.generate(prompts, sampling_params) print("-" * 50) for output in outputs: diff --git a/examples/offline_inference/rlhf_utils.py b/examples/offline_inference/rlhf_utils.py index 13def88439ef2..5c0787b8778d6 100644 --- a/examples/offline_inference/rlhf_utils.py +++ b/examples/offline_inference/rlhf_utils.py @@ -30,8 +30,8 @@ class WorkerExtension: """ The class for vLLM's worker to inherit from. By defining an extension class, the code can work no matter what is - the underlying worker class. This way, the code can be compatible - with both vLLM V0 and V1. + the underlying worker class. + NOTE: we define this class in a separate module, and the main module should pass the full qualified name as `worker_extension_cls` argument. """ @@ -96,8 +96,8 @@ class ColocateWorkerExtension: """ The class for vLLM's worker to inherit from, in the colocate setting. By defining an extension class, the code can work no matter what is - the underlying worker class. This way, the code can be compatible - with both vLLM V0 and V1. + the underlying worker class. + NOTE: we define this class in a separate module, and the main module should pass the full qualified name as `worker_extension_cls` argument. 
""" diff --git a/examples/offline_inference/save_sharded_state.py b/examples/offline_inference/save_sharded_state.py index e25f46b126e6f..88ee48b98bff6 100644 --- a/examples/offline_inference/save_sharded_state.py +++ b/examples/offline_inference/save_sharded_state.py @@ -67,22 +67,9 @@ def main(args): Path(args.output).mkdir(exist_ok=True) # Dump worker states to output directory - # Check which engine version is being used - is_v1_engine = hasattr(llm.llm_engine, "engine_core") - - if is_v1_engine: - # For V1 engine, we need to use engine_core.save_sharded_state - print("Using V1 engine save path") - llm.llm_engine.engine_core.save_sharded_state( - path=args.output, pattern=args.file_pattern, max_size=args.max_file_size - ) - else: - # For V0 engine - print("Using V0 engine save path") - model_executor = llm.llm_engine.model_executor - model_executor.save_sharded_state( - path=args.output, pattern=args.file_pattern, max_size=args.max_file_size - ) + llm.llm_engine.engine_core.save_sharded_state( + path=args.output, pattern=args.file_pattern, max_size=args.max_file_size + ) # Copy metadata files to output directory for file in os.listdir(model_path): diff --git a/examples/offline_inference/spec_decode.py b/examples/offline_inference/spec_decode.py index 3cdc3b245b72a..67a0732459709 100644 --- a/examples/offline_inference/spec_decode.py +++ b/examples/offline_inference/spec_decode.py @@ -158,11 +158,7 @@ def main(args): print(f"generated text: {output.outputs[0].text}") print("-" * 50) - try: - metrics = llm.get_metrics() - except AssertionError: - print("Metrics are not supported in the V0 engine.") - return + metrics = llm.get_metrics() total_num_output_tokens = sum( len(output.outputs[0].token_ids) for output in outputs diff --git a/tests/model_executor/model_loader/test_sharded_state_loader.py b/tests/model_executor/model_loader/test_sharded_state_loader.py index 5bb841bf2fa0e..cf06b000efb51 100644 --- 
a/tests/model_executor/model_loader/test_sharded_state_loader.py +++ b/tests/model_executor/model_loader/test_sharded_state_loader.py @@ -60,18 +60,9 @@ def llama_3p2_1b_files(): def _run_writer(input_dir, output_dir, weights_patterns, **kwargs): llm_sharded_writer = LLM(model=input_dir, **kwargs) - # Check which engine version is being used - is_v1_engine = hasattr(llm_sharded_writer.llm_engine, "engine_core") + # Dump worker states to output directory - if is_v1_engine: - # For V1 engine, we need to use engine_core.save_sharded_state - print("Using V1 engine save path") - llm_sharded_writer.llm_engine.engine_core.save_sharded_state(path=output_dir) - else: - # For V0 engine - print("Using V0 engine save path") - model_executor = llm_sharded_writer.llm_engine.model_executor - model_executor.save_sharded_state(path=output_dir) + llm_sharded_writer.llm_engine.engine_core.save_sharded_state(path=output_dir) # Copy metadata files to output directory for file in os.listdir(input_dir): diff --git a/tests/tool_use/utils.py b/tests/tool_use/utils.py index 38def6f874d7d..d188b21863812 100644 --- a/tests/tool_use/utils.py +++ b/tests/tool_use/utils.py @@ -140,21 +140,22 @@ CONFIGS: dict[str, ServerConfig] = { "without calling a tool. DO NOT CALL A TOOL THAT IS IRRELEVANT " "to the user's question - just respond to it normally.", }, - # V1 Test: Passing locally but failing in CI. This runs the - # V0 Engine because of CPU offloading. Need to debug why. + # FIXME: This test currently fails, need to debug why. 
# "granite20b": { - # "model": - # "mbayser/granite-20b-functioncalling-FP8-KV", + # "model": "mbayser/granite-20b-functioncalling-FP8-KV", # "arguments": [ - # "--tool-call-parser", "granite-20b-fc", "--chat-template", - # str(VLLM_PATH / - # "examples/tool_chat_template_granite_20b_fc.jinja"), - # "--max_num_seqs", "1", "--enforce-eager", "--cpu-offload-gb", "20" + # "--tool-call-parser", + # "granite-20b-fc", + # "--chat-template", + # str(VLLM_PATH / "examples/tool_chat_template_granite_20b_fc.jinja"), + # "--max_num_seqs", + # "1", + # "--enforce-eager", + # "--cpu-offload-gb", + # "20", # ], - # "supports_parallel": - # False, - # "supports_rocm": - # False, + # "supports_parallel": False, + # "supports_rocm": False, # }, "granite-3.0-8b": { "model": "ibm-granite/granite-3.0-8b-instruct", diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py index 7421eb8b8abc9..848916dbd8763 100644 --- a/vllm/entrypoints/llm.py +++ b/vllm/entrypoints/llm.py @@ -339,7 +339,6 @@ class LLM: log_non_default_args(engine_args) - # Create the Engine (autoselects V0 vs V1) self.llm_engine = LLMEngine.from_engine_args( engine_args=engine_args, usage_context=UsageContext.LLM_CLASS ) diff --git a/vllm/entrypoints/openai/protocol.py b/vllm/entrypoints/openai/protocol.py index 41172d8ec2f72..b352c3ad01db0 100644 --- a/vllm/entrypoints/openai/protocol.py +++ b/vllm/entrypoints/openai/protocol.py @@ -377,7 +377,7 @@ class ResponsesRequest(OpenAIBaseModel): "environments. The salt should be random, protected from " "access by 3rd parties, and long enough to be " "unpredictable (e.g., 43 characters base64-encoded, corresponding " - "to 256 bit). Not supported by vLLM engine V0." + "to 256 bit)." ), ) @@ -763,7 +763,7 @@ class ChatCompletionRequest(OpenAIBaseModel): "environments. The salt should be random, protected from " "access by 3rd parties, and long enough to be " "unpredictable (e.g., 43 characters base64-encoded, corresponding " - "to 256 bit). 
Not supported by vLLM engine V0." + "to 256 bit)." ), ) kv_transfer_params: dict[str, Any] | None = Field( @@ -1249,7 +1249,7 @@ class CompletionRequest(OpenAIBaseModel): "environments. The salt should be random, protected from " "access by 3rd parties, and long enough to be " "unpredictable (e.g., 43 characters base64-encoded, corresponding " - "to 256 bit). Not supported by vLLM engine V0." + "to 256 bit)." ), ) diff --git a/vllm/model_executor/layers/mamba/mamba_mixer2.py b/vllm/model_executor/layers/mamba/mamba_mixer2.py index 900701c46348b..0ea5805305eda 100644 --- a/vllm/model_executor/layers/mamba/mamba_mixer2.py +++ b/vllm/model_executor/layers/mamba/mamba_mixer2.py @@ -590,7 +590,6 @@ class MambaMixer2(MambaBase, CustomOp): hidden_states, _B, _C = self.split_hidden_states_B_C_fn(hidden_states_B_C) return hidden_states - # NOTE: V0 put prefill before decode, v1 puts decode before prefill num_prefills = attn_metadata.num_prefills # request count num_decodes = attn_metadata.num_decode_tokens # token count (=request) num_prefill_tokens = attn_metadata.num_prefill_tokens # token count diff --git a/vllm/model_executor/models/interfaces.py b/vllm/model_executor/models/interfaces.py index dc4caf2f02f9d..9966498e1b4c9 100644 --- a/vllm/model_executor/models/interfaces.py +++ b/vllm/model_executor/models/interfaces.py @@ -586,13 +586,11 @@ class IsHybrid(Protocol): def get_mamba_state_shape_from_config( cls, vllm_config: VllmConfig, - use_v1: bool = True, ) -> tuple[tuple[int, int], tuple[int, int, int]]: """Calculate shapes for Mamba's convolutional and state caches. 
Args: vllm_config: vLLM config - use_v1: Get shapes for V1 (or V0) Returns: Tuple containing: diff --git a/vllm/model_executor/models/plamo2.py b/vllm/model_executor/models/plamo2.py index 22f9c87fc905b..472de5590dcf8 100644 --- a/vllm/model_executor/models/plamo2.py +++ b/vllm/model_executor/models/plamo2.py @@ -290,7 +290,6 @@ class Plamo2MambaMixer(MambaBase, CustomOp): has_decode = num_decodes > 0 num_actual_tokens = num_prefill_tokens + num_decodes - # NOTE: V0 put prefill before decode, v1 puts decode before prefill # Separate prefill and decode by splitting varlen input # Split along token dimension hidden_states_d, hidden_states_p = torch.split( From cca2d2cdbe56529205c10e58363c7bd2d31e15df Mon Sep 17 00:00:00 2001 From: Russell Bryant Date: Fri, 21 Nov 2025 07:01:54 -0500 Subject: [PATCH 04/83] [Core] Align whisper closer to other multimodal models (#27292) Signed-off-by: Russell Bryant --- vllm/model_executor/models/whisper.py | 13 ++++--- vllm/v1/worker/gpu_model_runner.py | 49 +++++++-------------------- 2 files changed, 21 insertions(+), 41 deletions(-) diff --git a/vllm/model_executor/models/whisper.py b/vllm/model_executor/models/whisper.py index 91a10b95a08c0..50587c627160d 100644 --- a/vllm/model_executor/models/whisper.py +++ b/vllm/model_executor/models/whisper.py @@ -599,15 +599,16 @@ class WhisperModel(nn.Module): def forward( self, - input_features: torch.Tensor | list[torch.Tensor] | None, input_ids: torch.Tensor | None, positions: torch.Tensor, + encoder_outputs: list[torch.Tensor], ) -> torch.Tensor: - encoder_outputs = self.get_encoder_outputs(input_features) + assert len(encoder_outputs) in (0, 1) + enc_states = encoder_outputs[0] if len(encoder_outputs) == 1 else None decoder_outputs = self.decoder( input_ids=input_ids, positions=positions, - encoder_hidden_states=encoder_outputs, + encoder_hidden_states=enc_states, ) return decoder_outputs @@ -894,13 +895,15 @@ class WhisperForConditionalGeneration( self, input_ids: torch.Tensor, 
positions: torch.Tensor, + encoder_outputs: list[torch.Tensor] | None = None, **kwargs, ) -> torch.Tensor: - audio_input = self._parse_and_validate_audio_input(**kwargs) + if encoder_outputs is None: + encoder_outputs = [] decoder_outputs = self.model( - input_features=audio_input["input_features"], input_ids=input_ids, positions=positions, + encoder_outputs=encoder_outputs, ) return decoder_outputs diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 4c65a5e9b0292..e786cd8bc7c97 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -1923,14 +1923,16 @@ class GPUModelRunner( return mm_kwargs, mm_hashes_pos - def _execute_mm_encoder(self, scheduler_output: "SchedulerOutput"): + def _execute_mm_encoder( + self, scheduler_output: "SchedulerOutput" + ) -> list[torch.Tensor]: # Batch the multi-modal inputs using the helper method. mm_kwargs, mm_hashes_pos = self._batch_mm_kwargs_from_scheduler( scheduler_output ) if not mm_kwargs: - return + return [] # Batch mm inputs as much as we can: if a request in the batch has # multiple modalities or a different modality than the previous one, @@ -2007,6 +2009,8 @@ class GPUModelRunner( logger.debug("Finish execute for mm hash %s", mm_hash) self.maybe_save_ec_to_connector(self.encoder_cache, mm_hash) + return encoder_outputs + def _gather_mm_embeddings( self, scheduler_output: "SchedulerOutput", @@ -2095,38 +2099,6 @@ class GPUModelRunner( return mm_embeds, is_mm_embed - def _extract_encoder_inputs( - self, - scheduler_output: "SchedulerOutput", - ) -> dict[str, torch.Tensor]: - """Extract encoder inputs for encoder-decoder models. - - This method extracts multimodal input features from scheduled encoder - inputs and formats them for the encoder-decoder model forward pass. - """ - # Batch the multi-modal inputs using the helper method. 
- mm_kwargs, _ = self._batch_mm_kwargs_from_scheduler(scheduler_output) - - if not mm_kwargs: - return {} - - # Group MM kwargs by modality and extract features - model = cast(SupportsMultiModal, self.model) - encoder_features = {} - for _, _, mm_kwargs_group in group_mm_kwargs_by_modality( - mm_kwargs, - device=self.device, - pin_memory=self.pin_memory, - merge_by_field_config=model.merge_by_field_config, - multimodal_cpu_fields=model.multimodal_cpu_fields, - ): - # Add the grouped features to encoder_features dict - # This allows the model to receive them as kwargs (e.g., - # input_features=...) - encoder_features.update(mm_kwargs_group) - - return encoder_features - def get_model(self) -> nn.Module: # get raw model out of the cudagraph wrapper. if isinstance(self.model, (CUDAGraphWrapper, UBatchWrapper)): @@ -2416,8 +2388,13 @@ class GPUModelRunner( self.model_config.is_encoder_decoder and scheduler_output.scheduled_encoder_inputs ): - encoder_inputs = self._extract_encoder_inputs(scheduler_output) - model_kwargs.update(encoder_inputs) + # Run the encoder, just like we do with other multimodal inputs. + # For an encoder-decoder model, our processing here is a bit + # simpler, because the outputs are just passed to the decoder. + # We are not doing any prompt replacement. We also will only + # ever have a single encoder input. 
+ encoder_outputs = self._execute_mm_encoder(scheduler_output) + model_kwargs.update({"encoder_outputs": encoder_outputs}) return ( input_ids, From 2b1b3dfa4b02456b11b2bdbcd0857ddb96214a71 Mon Sep 17 00:00:00 2001 From: Bhagyashri Date: Fri, 21 Nov 2025 17:54:09 +0530 Subject: [PATCH 05/83] Update Dockerfile to use gcc-toolset-14 and fix test case failures on power (ppc64le) (#28957) Signed-off-by: Bhagyashri --- .../hardware_ci/run-cpu-test-ppc64le.sh | 10 +++--- docker/Dockerfile.ppc64le | 32 +++++++++++-------- requirements/common.txt | 4 +-- 3 files changed, 27 insertions(+), 19 deletions(-) diff --git a/.buildkite/scripts/hardware_ci/run-cpu-test-ppc64le.sh b/.buildkite/scripts/hardware_ci/run-cpu-test-ppc64le.sh index 39ea180173081..3728f73fa2a36 100755 --- a/.buildkite/scripts/hardware_ci/run-cpu-test-ppc64le.sh +++ b/.buildkite/scripts/hardware_ci/run-cpu-test-ppc64le.sh @@ -25,20 +25,22 @@ function cpu_tests() { # offline inference podman exec -it "$container_id" bash -c " + export TORCH_COMPILE_DISABLE=1 set -xve python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m" >> $HOME/test_basic.log # Run basic model test podman exec -it "$container_id" bash -c " + export TORCH_COMPILE_DISABLE=1 set -evx pip install pytest pytest-asyncio einops peft Pillow soundfile transformers_stream_generator matplotlib - pip install sentence-transformers datamodel_code_generator + pip install sentence-transformers datamodel_code_generator tblib # Note: disable Bart until supports V1 # pytest -v -s tests/models/language/generation/test_bart.py -m cpu_model - pytest -v -s tests/models/language/generation/test_common.py::test_models[False-5-32-openai-community/gpt2] - pytest -v -s tests/models/language/generation/test_common.py::test_models[False-5-32-facebook/opt-125m] - pytest -v -s tests/models/language/generation/test_common.py::test_models[False-5-32-google/gemma-1.1-2b-it] + pytest -v -s 
tests/models/language/generation/test_common.py::test_models[False-False-5-32-openai-community/gpt2] + pytest -v -s tests/models/language/generation/test_common.py::test_models[False-False-5-32-facebook/opt-125m] + pytest -v -s tests/models/language/generation/test_common.py::test_models[False-False-5-32-google/gemma-1.1-2b-it] pytest -v -s tests/models/language/pooling/test_classification.py::test_models[float-jason9693/Qwen2.5-1.5B-apeach] # TODO: Below test case tests/models/language/pooling/test_embedding.py::test_models[True-ssmits/Qwen2-7B-Instruct-embed-base] fails on ppc64le. Disabling it for time being. # pytest -v -s tests/models/language/pooling/test_embedding.py -m cpu_model" >> $HOME/test_rest.log diff --git a/docker/Dockerfile.ppc64le b/docker/Dockerfile.ppc64le index ad9eae94b83dd..b16bea3607d2f 100644 --- a/docker/Dockerfile.ppc64le +++ b/docker/Dockerfile.ppc64le @@ -8,8 +8,8 @@ FROM registry.access.redhat.com/ubi9/ubi-minimal:${BASE_UBI_IMAGE_TAG} AS openbl ARG MAX_JOBS ARG OPENBLAS_VERSION=0.3.30 -RUN microdnf install -y dnf && dnf install -y gcc-toolset-13 make wget unzip \ - && source /opt/rh/gcc-toolset-13/enable \ +RUN microdnf install -y dnf && dnf install -y gcc-toolset-14 make wget unzip \ + && source /opt/rh/gcc-toolset-14/enable \ && wget https://github.com/OpenMathLib/OpenBLAS/releases/download/v$OPENBLAS_VERSION/OpenBLAS-$OPENBLAS_VERSION.zip \ && unzip OpenBLAS-$OPENBLAS_VERSION.zip \ && cd OpenBLAS-$OPENBLAS_VERSION \ @@ -57,7 +57,7 @@ COPY --from=openblas-builder /tmp/control /dev/null RUN --mount=type=bind,from=openblas-builder,source=/OpenBLAS-$OPENBLAS_VERSION/,target=/openblas/,rw \ dnf install -y openssl-devel \ && dnf install -y \ - git tar gcc-toolset-13 automake libtool \ + git tar gcc-toolset-14 automake libtool \ pkgconfig xsimd zeromq-devel kmod findutils protobuf* \ libtiff-devel libjpeg-devel zlib-devel freetype-devel libwebp-devel \ harfbuzz-devel libraqm-devel libimagequant-devel libxcb-devel \ @@ -84,7 +84,7 @@ ARG 
_GLIBCXX_USE_CXX11_ABI=1 ARG OPENBLAS_VERSION=0.3.30 RUN --mount=type=cache,target=/root/.cache/uv \ - source /opt/rh/gcc-toolset-13/enable && \ + source /opt/rh/gcc-toolset-14/enable && \ git clone --recursive https://github.com/pytorch/pytorch.git -b v${TORCH_VERSION} && \ cd pytorch && \ uv pip install -r requirements.txt && \ @@ -97,7 +97,7 @@ ARG TORCHVISION_VERSION=0.22.0 ARG TORCHVISION_USE_NVJPEG=0 ARG TORCHVISION_USE_FFMPEG=0 RUN --mount=type=cache,target=/root/.cache/uv \ - source /opt/rh/gcc-toolset-13/enable && \ + source /opt/rh/gcc-toolset-14/enable && \ git clone --recursive https://github.com/pytorch/vision.git -b v${TORCHVISION_VERSION} && \ cd vision && \ MAX_JOBS=${MAX_JOBS:-$(nproc)} \ @@ -113,7 +113,7 @@ ARG USE_ROCM=0 ARG USE_CUDA=0 ARG TORCHAUDIO_TEST_ALLOW_SKIP_IF_NO_FFMPEG=1 RUN --mount=type=cache,target=/root/.cache/uv \ - source /opt/rh/gcc-toolset-13/enable && \ + source /opt/rh/gcc-toolset-14/enable && \ git clone --recursive https://github.com/pytorch/audio.git -b v${TORCHAUDIO_VERSION} && \ cd audio && \ MAX_JOBS=${MAX_JOBS:-$(nproc)} \ @@ -130,7 +130,7 @@ ARG MAX_JOBS ARG PYARROW_PARALLEL ARG PYARROW_VERSION=21.0.0 RUN --mount=type=cache,target=/root/.cache/uv \ - source /opt/rh/gcc-toolset-13/enable && \ + source /opt/rh/gcc-toolset-14/enable && \ git clone --recursive https://github.com/apache/arrow.git -b apache-arrow-${PYARROW_VERSION} && \ cd arrow/cpp && \ mkdir build && cd build && \ @@ -162,7 +162,7 @@ ARG OPENCV_VERSION=86 ARG OPENCV_PATCH=97f3f39 ARG ENABLE_HEADLESS=1 RUN --mount=type=cache,target=/root/.cache/uv \ - source /opt/rh/gcc-toolset-13/enable && \ + source /opt/rh/gcc-toolset-14/enable && \ git clone --recursive https://github.com/opencv/opencv-python.git -b ${OPENCV_VERSION} && \ cd opencv-python && \ sed -i -E -e 's/"setuptools.+",/"setuptools",/g' pyproject.toml && \ @@ -196,7 +196,7 @@ ARG MAX_JOBS ARG NUMBA_VERSION=0.61.2 # Clone all required dependencies -RUN dnf install ninja-build llvm15 llvm15-devel -y 
&& source /opt/rh/gcc-toolset-13/enable && export PATH=$PATH:/usr/lib64/llvm15/bin && \ +RUN dnf install ninja-build llvm15 llvm15-devel -y && source /opt/rh/gcc-toolset-14/enable && export PATH=$PATH:/usr/lib64/llvm15/bin && \ git clone --recursive https://github.com/numba/numba.git -b ${NUMBA_VERSION} && \ cd ./numba && \ if ! grep '#include "dynamic_annotations.h"' numba/_dispatcher.cpp; then \ @@ -211,6 +211,9 @@ RUN dnf install ninja-build llvm15 llvm15-devel -y && source /opt/rh/gcc-toolset FROM base-builder AS vllmcache-builder +ENV LLVM_CONFIG=/usr/lib64/llvm15/bin/llvm-config +ENV PATH=/usr/lib64/llvm15/bin:$PATH + COPY --from=torch-builder /tmp/control /dev/null COPY --from=arrow-builder /tmp/control /dev/null COPY --from=cv-builder /tmp/control /dev/null @@ -225,10 +228,13 @@ ARG GRPC_PYTHON_BUILD_SYSTEM_OPENSSL=1 RUN --mount=type=cache,target=/root/.cache/uv \ dnf install llvm15 llvm15-devel -y && \ rpm -ivh --nodeps https://mirror.stream.centos.org/9-stream/CRB/ppc64le/os/Packages/protobuf-lite-devel-3.14.0-16.el9.ppc64le.rpm && \ - source /opt/rh/gcc-toolset-13/enable && \ + source /opt/rh/gcc-toolset-14/enable && \ git clone https://github.com/huggingface/xet-core.git && cd xet-core/hf_xet/ && \ uv pip install maturin && \ uv build --wheel --out-dir /hf_wheels/ + +ENV CXXFLAGS="-fno-lto -Wno-error=free-nonheap-object" \ + CFLAGS="-fno-lto" RUN --mount=type=cache,target=/root/.cache/uv \ --mount=type=bind,from=torch-builder,source=/torchwheels/,target=/torchwheels/,ro \ --mount=type=bind,from=arrow-builder,source=/arrowwheels/,target=/arrowwheels/,ro \ @@ -236,7 +242,7 @@ RUN --mount=type=cache,target=/root/.cache/uv \ --mount=type=bind,from=numa-builder,source=/numactl/,target=/numactl/,rw \ --mount=type=bind,from=numba-builder,source=/numbawheels/,target=/numbawheels/,ro \ --mount=type=bind,src=.,dst=/src/,rw \ - source /opt/rh/gcc-toolset-13/enable && \ + source /opt/rh/gcc-toolset-14/enable && \ export PATH=$PATH:/usr/lib64/llvm15/bin && \ uv pip 
install /opencvwheels/*.whl /arrowwheels/*.whl /torchwheels/*.whl /numbawheels/*.whl && \ sed -i -e 's/.*torch.*//g' /src/pyproject.toml /src/requirements/*.txt && \ @@ -260,7 +266,7 @@ FROM base-builder AS lapack-builder ARG MAX_JOBS ARG LAPACK_VERSION=3.12.1 RUN git clone --recursive https://github.com/Reference-LAPACK/lapack.git -b v${LAPACK_VERSION} \ - && cd lapack && source /opt/rh/gcc-toolset-13/enable \ + && cd lapack && source /opt/rh/gcc-toolset-14/enable \ && cmake -B build -S . \ && cmake --build build -j ${MAX_JOBS:-$(nproc)} @@ -299,7 +305,7 @@ RUN --mount=type=cache,target=/root/.cache/uv \ --mount=type=bind,from=openblas-builder,source=/OpenBLAS-$OPENBLAS_VERSION/,target=/openblas/,rw \ rpm -ivh https://dl.fedoraproject.org/pub/epel/epel-release-latest-9.noarch.rpm && \ microdnf install --nodocs -y \ - libomp tar findutils openssl llvm15 llvm15-devel \ + libomp libicu tar findutils openssl llvm15 llvm15-devel \ pkgconfig xsimd g++ gcc-fortran libsndfile \ libtiff libjpeg openjpeg2 zlib zeromq \ freetype lcms2 libwebp tcl tk utf8proc \ diff --git a/requirements/common.txt b/requirements/common.txt index f2d1c0762ef6a..3f8cd588422d0 100644 --- a/requirements/common.txt +++ b/requirements/common.txt @@ -19,12 +19,12 @@ pillow # Required for image processing prometheus-fastapi-instrumentator >= 7.0.0 tiktoken >= 0.6.0 # Required for DBRX tokenizer lm-format-enforcer == 0.11.3 -llguidance >= 1.3.0, < 1.4.0; platform_machine == "x86_64" or platform_machine == "arm64" or platform_machine == "aarch64" or platform_machine == "s390x" +llguidance >= 1.3.0, < 1.4.0; platform_machine == "x86_64" or platform_machine == "arm64" or platform_machine == "aarch64" or platform_machine == "s390x" or platform_machine == "ppc64le" outlines_core == 0.2.11 # required for outlines backend disk cache diskcache == 5.6.3 lark == 1.2.2 -xgrammar == 0.1.27; platform_machine == "x86_64" or platform_machine == "aarch64" or platform_machine == "arm64" or platform_machine == "s390x" 
+xgrammar == 0.1.27; platform_machine == "x86_64" or platform_machine == "aarch64" or platform_machine == "arm64" or platform_machine == "s390x" or platform_machine == "ppc64le" typing_extensions >= 4.10 filelock >= 3.16.1 # need to contain https://github.com/tox-dev/filelock/pull/317 partial-json-parser # used for parsing partial JSON outputs From 9452863088b458912634f13273784bf6e16c8a4c Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Fri, 21 Nov 2025 20:27:43 +0800 Subject: [PATCH 06/83] Revert "Revert #28875 (#29159)" (#29179) Signed-off-by: DarkLight1337 --- docker/Dockerfile | 17 ----------------- docs/deployment/docker.md | 7 +++---- .../installation/gpu.cuda.inc.md | 5 +---- 3 files changed, 4 insertions(+), 25 deletions(-) diff --git a/docker/Dockerfile b/docker/Dockerfile index 964700e2a43ac..709b79e84fbbc 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -56,7 +56,6 @@ ARG UV_EXTRA_INDEX_URL=${PIP_EXTRA_INDEX_URL} # PyTorch provides its own indexes for standard and nightly builds ARG PYTORCH_CUDA_INDEX_BASE_URL=https://download.pytorch.org/whl -ARG PYTORCH_CUDA_NIGHTLY_INDEX_BASE_URL=https://download.pytorch.org/whl/nightly # PIP supports multiple authentication schemes, including keyring # By parameterizing the PIP_KEYRING_PROVIDER variable and setting it to @@ -98,7 +97,6 @@ RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \ ARG PIP_INDEX_URL UV_INDEX_URL ARG PIP_EXTRA_INDEX_URL UV_EXTRA_INDEX_URL ARG PYTORCH_CUDA_INDEX_BASE_URL -ARG PYTORCH_CUDA_NIGHTLY_INDEX_BASE_URL ARG PIP_KEYRING_PROVIDER UV_KEYRING_PROVIDER # Activate virtual environment and add uv to PATH @@ -317,7 +315,6 @@ RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \ ARG PIP_INDEX_URL UV_INDEX_URL ARG PIP_EXTRA_INDEX_URL UV_EXTRA_INDEX_URL ARG PYTORCH_CUDA_INDEX_BASE_URL -ARG PYTORCH_CUDA_NIGHTLY_INDEX_BASE_URL ARG PIP_KEYRING_PROVIDER UV_KEYRING_PROVIDER # Install uv for faster pip installs @@ -337,20 +334,6 @@ ENV UV_LINK_MODE=copy # 
or future versions of triton. RUN ldconfig /usr/local/cuda-$(echo $CUDA_VERSION | cut -d. -f1,2)/compat/ -# arm64 (GH200) build follows the practice of "use existing pytorch" build, -# we need to install torch and torchvision from the nightly builds first, -# pytorch will not appear as a vLLM dependency in all of the following steps -# after this step -RUN --mount=type=cache,target=/root/.cache/uv \ - if [ "$TARGETPLATFORM" = "linux/arm64" ]; then \ - uv pip install --system \ - --index-url ${PYTORCH_CUDA_NIGHTLY_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.') \ - "torch==2.8.0.dev20250318+cu128" "torchvision==0.22.0.dev20250319" ; \ - uv pip install --system \ - --index-url ${PYTORCH_CUDA_NIGHTLY_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.') \ - --pre pytorch_triton==3.3.0+gitab727c40 ; \ - fi - # Install vllm wheel first, so that torch etc will be installed. RUN --mount=type=bind,from=build,src=/workspace/dist,target=/vllm-workspace/dist \ --mount=type=cache,target=/root/.cache/uv \ diff --git a/docs/deployment/docker.md b/docs/deployment/docker.md index 1c639f3533d47..0e636c87f38a4 100644 --- a/docs/deployment/docker.md +++ b/docs/deployment/docker.md @@ -82,8 +82,7 @@ DOCKER_BUILDKIT=1 docker build . \ ## Building for Arm64/aarch64 -A docker container can be built for aarch64 systems such as the Nvidia Grace-Hopper. At time of this writing, this requires the use -of PyTorch Nightly and should be considered **experimental**. Using the flag `--platform "linux/arm64"` will attempt to build for arm64. +A docker container can be built for aarch64 systems such as the Nvidia Grace-Hopper. At time of this writing, this should be considered **experimental**. Using the flag `--platform "linux/arm64"` will attempt to build for arm64. !!! note Multiple modules must be compiled, so this process can take a while. 
Recommend using `--build-arg max_jobs=` & `--build-arg nvcc_threads=` @@ -94,7 +93,6 @@ of PyTorch Nightly and should be considered **experimental**. Using the flag `-- ```bash # Example of building on Nvidia GH200 server. (Memory usage: ~15GB, Build time: ~1475s / ~25 min, Image size: 6.93GB) - python3 use_existing_torch.py DOCKER_BUILDKIT=1 docker build . \ --file docker/Dockerfile \ --target vllm-openai \ @@ -102,7 +100,8 @@ of PyTorch Nightly and should be considered **experimental**. Using the flag `-- -t vllm/vllm-gh200-openai:latest \ --build-arg max_jobs=66 \ --build-arg nvcc_threads=2 \ - --build-arg torch_cuda_arch_list="9.0 10.0+PTX" + --build-arg torch_cuda_arch_list="9.0 10.0+PTX" \ + --build-arg RUN_WHEEL_CHECK=false ``` !!! note diff --git a/docs/getting_started/installation/gpu.cuda.inc.md b/docs/getting_started/installation/gpu.cuda.inc.md index b2d0d64a2d355..601d3659af886 100644 --- a/docs/getting_started/installation/gpu.cuda.inc.md +++ b/docs/getting_started/installation/gpu.cuda.inc.md @@ -158,10 +158,7 @@ uv pip install -e . ##### Use an existing PyTorch installation -There are scenarios where the PyTorch dependency cannot be easily installed with `uv`, e.g.: - -- Building vLLM with PyTorch nightly or a custom PyTorch build. -- Building vLLM with aarch64 and CUDA (GH200), where the PyTorch wheels are not available on PyPI. Currently, only the PyTorch nightly has wheels for aarch64 with CUDA. You can run `uv pip install --index-url https://download.pytorch.org/whl/nightly/cu128 torch torchvision torchaudio` to [install PyTorch nightly](https://pytorch.org/get-started/locally/) and then build vLLM on top of it. +There are scenarios where the PyTorch dependency cannot be easily installed with `uv`, for example, when building vLLM with non-default PyTorch builds (like nightly or a custom build). 
To build vLLM using an existing PyTorch installation: From fc9f821d2062d412474ced64b9087c881651eb30 Mon Sep 17 00:00:00 2001 From: who who who Date: Fri, 21 Nov 2025 20:55:43 +0800 Subject: [PATCH 07/83] fix cross attention (#28346) Signed-off-by: fsx950223 --- vllm/v1/attention/backends/triton_attn.py | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/vllm/v1/attention/backends/triton_attn.py b/vllm/v1/attention/backends/triton_attn.py index 889c79db18ef5..09c36043c8c86 100644 --- a/vllm/v1/attention/backends/triton_attn.py +++ b/vllm/v1/attention/backends/triton_attn.py @@ -244,14 +244,11 @@ class TritonAttentionImpl(AttentionImpl): self.num_queries_per_kv = self.num_heads // self.num_kv_heads - if attn_type != AttentionType.DECODER: + if attn_type not in [AttentionType.DECODER, AttentionType.ENCODER_DECODER]: raise NotImplementedError( - "Encoder self-attention and " - "encoder/decoder cross-attention " - "are not implemented for " - "TritonAttentionImpl" + "Encoder self-attention is not implemented for TritonAttentionImpl" ) - + self.attn_type = attn_type self.fp8_dtype = current_platform.fp8_dtype() self.sinks = sinks @@ -312,7 +309,11 @@ class TritonAttentionImpl(AttentionImpl): num_actual_tokens = attn_metadata.num_actual_tokens key_cache, value_cache = kv_cache.unbind(1) - if self.kv_sharing_target_layer_name is None: + if ( + self.kv_sharing_target_layer_name is None + and key is not None + and value is not None + ): # Reshape the input keys and values and store them in the cache. # Skip this if sharing KV cache with an earlier attention layer. 
if self.kv_cache_dtype.startswith("fp8"): @@ -346,7 +347,7 @@ class TritonAttentionImpl(AttentionImpl): max_seqlen_k = attn_metadata.max_seq_len block_table = attn_metadata.block_table - descale_shape = (cu_seqlens_q.shape[0] - 1, key.shape[1]) + descale_shape = (cu_seqlens_q.shape[0] - 1, key_cache.shape[2]) unified_attention( q=query[:num_actual_tokens], From 2092ce8c39a4f01a93dcb32d3c92d05586507e7c Mon Sep 17 00:00:00 2001 From: sfbemerk Date: Fri, 21 Nov 2025 13:57:19 +0100 Subject: [PATCH 08/83] Tool Call Parser logs should not contain user input / model output except on DEBUG (#29160) Signed-off-by: Benjamin Merkel Co-authored-by: Benjamin Merkel Co-authored-by: Chauncey --- .../openai/tool_parsers/glm4_moe_tool_parser.py | 2 +- .../openai/tool_parsers/qwen3coder_tool_parser.py | 14 +++++++------- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/vllm/entrypoints/openai/tool_parsers/glm4_moe_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/glm4_moe_tool_parser.py index 120e63b929b16..389e9754b34da 100644 --- a/vllm/entrypoints/openai/tool_parsers/glm4_moe_tool_parser.py +++ b/vllm/entrypoints/openai/tool_parsers/glm4_moe_tool_parser.py @@ -78,7 +78,7 @@ class Glm4MoeModelToolParser(ToolParser): .get("type", None) ) return arg_type == "string" - logger.warning("No tool named '%s'.", tool_name) + logger.debug("No tool named '%s'.", tool_name) return False def _deserialize(value: str) -> Any: diff --git a/vllm/entrypoints/openai/tool_parsers/qwen3coder_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/qwen3coder_tool_parser.py index 26261c0065ead..9d4c079eba188 100644 --- a/vllm/entrypoints/openai/tool_parsers/qwen3coder_tool_parser.py +++ b/vllm/entrypoints/openai/tool_parsers/qwen3coder_tool_parser.py @@ -128,7 +128,7 @@ class Qwen3CoderToolParser(ToolParser): return params else: return {} - logger.warning("Tool '%s' is not defined in the tools list.", func_name) + logger.debug("Tool '%s' is not defined in the tools list.", func_name) 
return {} def _convert_param_value( @@ -141,7 +141,7 @@ class Qwen3CoderToolParser(ToolParser): if param_name not in param_config: if param_config != {}: - logger.warning( + logger.debug( "Parsed parameter '%s' is not defined in the tool " "parameters for tool '%s', directly returning the " "string value.", @@ -169,7 +169,7 @@ class Qwen3CoderToolParser(ToolParser): try: return int(param_value) except (ValueError, TypeError): - logger.warning( + logger.debug( "Parsed value '%s' of parameter '%s' is not an " "integer in tool '%s', degenerating to string.", param_value, @@ -186,7 +186,7 @@ class Qwen3CoderToolParser(ToolParser): else int(float_param_value) ) except (ValueError, TypeError): - logger.warning( + logger.debug( "Parsed value '%s' of parameter '%s' is not a float " "in tool '%s', degenerating to string.", param_value, @@ -197,7 +197,7 @@ class Qwen3CoderToolParser(ToolParser): elif param_type in ["boolean", "bool", "binary"]: param_value = param_value.lower() if param_value not in ["true", "false"]: - logger.warning( + logger.debug( "Parsed value '%s' of parameter '%s' is not a boolean " "(`true` or `false`) in tool '%s', degenerating to " "false.", @@ -216,7 +216,7 @@ class Qwen3CoderToolParser(ToolParser): param_value = json.loads(param_value) return param_value except (json.JSONDecodeError, TypeError, ValueError): - logger.warning( + logger.debug( "Parsed value '%s' of parameter '%s' cannot be " "parsed with json.loads in tool '%s', will try " "other methods to parse it.", @@ -227,7 +227,7 @@ class Qwen3CoderToolParser(ToolParser): try: param_value = ast.literal_eval(param_value) # safer except (ValueError, SyntaxError, TypeError): - logger.warning( + logger.debug( "Parsed value '%s' of parameter '%s' cannot be " "converted via Python `ast.literal_eval()` in tool " "'%s', degenerating to string.", From 434f3d3eb869606af221f0307e16548c1f99da20 Mon Sep 17 00:00:00 2001 From: Julien Denize <40604584+juliendenize@users.noreply.github.com> Date: Fri, 21 Nov 
2025 15:01:20 +0100 Subject: [PATCH 09/83] Fix mistral config (#29172) Signed-off-by: Julien Denize Signed-off-by: Julien Denize <40604584+juliendenize@users.noreply.github.com> Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com> Co-authored-by: Patrick von Platen --- vllm/transformers_utils/configs/mistral.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/vllm/transformers_utils/configs/mistral.py b/vllm/transformers_utils/configs/mistral.py index 8f72f0b28b0de..fe202b2ed1568 100644 --- a/vllm/transformers_utils/configs/mistral.py +++ b/vllm/transformers_utils/configs/mistral.py @@ -90,6 +90,10 @@ def _remap_mistral_yarn_args(config: dict) -> dict: "rope_type": "yarn", "mscale_all_dim": 1, } + + if rope_theta := config.pop("rope_theta", None): + config["rope_parameters"]["rope_theta"] = rope_theta + for old_name, new_name in yarn_config_map.items(): if old_name in yarn_config: config["rope_parameters"][new_name] = yarn_config.pop(old_name) From f1805db1a671ffb1c99b2eae98e1b1b729fbcc65 Mon Sep 17 00:00:00 2001 From: skaraban3807 Date: Fri, 21 Nov 2025 19:43:52 +0530 Subject: [PATCH 10/83] [Perf] These changes enhance the NUMA functionality of vllm for systems with more than one NUMA nodes per socket (#25559) Signed-off-by: Siddappa Karabannavar --- csrc/cpu/utils.cpp | 65 +++++++++++++++++++++++++++++++--------------- 1 file changed, 44 insertions(+), 21 deletions(-) diff --git a/csrc/cpu/utils.cpp b/csrc/cpu/utils.cpp index c5a48352e3089..5199ba2af024f 100644 --- a/csrc/cpu/utils.cpp +++ b/csrc/cpu/utils.cpp @@ -45,31 +45,54 @@ std::string init_cpu_threads_env(const std::string& cpu_ids) { // Memory node binding if (numa_available() != -1) { int mem_node_id = numa_node_of_cpu(omp_cpu_ids.front()); - // Verify all CPUs are on the same NUMA node - for (size_t i = 1; i < omp_cpu_ids.size(); ++i) { - int node_id = numa_node_of_cpu(omp_cpu_ids[i]); - TORCH_CHECK(node_id == mem_node_id, "CPU ", omp_cpu_ids[i], - " 
is on NUMA node ", node_id, ", but CPU ", - omp_cpu_ids.front(), " is on NUMA node ", mem_node_id, - ". All CPUs should be on the same NUMA node for optimal " - "performance. Memory will be bound to NUMA node ", - mem_node_id, "."); + std::set node_ids; + for (const auto& cpu_id : omp_cpu_ids) { + int node_id = numa_node_of_cpu(cpu_id); + if (node_id != -1) { + node_ids.insert(node_id); + } + TORCH_WARN(node_id == mem_node_id, "CPU ", cpu_id, " is on NUMA node ", + node_id, ", but CPU ", omp_cpu_ids.front(), + " is on NUMA node ", mem_node_id, + ". All CPUs should be on the same NUMA node for optimal " + "performance. Memory will be bound to NUMA node ", + mem_node_id, "."); } - bitmask* mask = numa_parse_nodestring(std::to_string(mem_node_id).c_str()); - bitmask* src_mask = numa_get_membind(); + // Concatenate all node_ids into a single comma-separated string + if (!node_ids.empty()) { + std::string node_ids_str; + for (const int node_id : node_ids) { + if (!node_ids_str.empty()) { + node_ids_str += ","; + } + node_ids_str += std::to_string(node_id); + } - int pid = getpid(); + bitmask* mask = numa_parse_nodestring(node_ids_str.c_str()); + bitmask* src_mask = numa_get_membind(); - // move all existing pages to the specified numa node. - *(src_mask->maskp) = *(src_mask->maskp) ^ *(mask->maskp); - int page_num = numa_migrate_pages(pid, src_mask, mask); - if (page_num == -1) { - TORCH_WARN("numa_migrate_pages failed. errno: " + std::to_string(errno)); + int pid = getpid(); + + if (mask && src_mask) { + // move all existing pages to the specified numa node. + *(src_mask->maskp) = *(src_mask->maskp) ^ *(mask->maskp); + int page_num = numa_migrate_pages(pid, src_mask, mask); + if (page_num == -1) { + TORCH_WARN("numa_migrate_pages failed. errno: " + + std::to_string(errno)); + } + + // restrict memory allocation node. 
+ numa_set_membind(mask); + numa_set_strict(1); + + numa_free_nodemask(mask); + numa_free_nodemask(src_mask); + } else { + TORCH_WARN("numa_parse_nodestring or numa_get_membind failed. errno: " + + std::to_string(errno)); + } } - - // restrict memory allocation node. - numa_set_membind(mask); - numa_set_strict(1); } // OMP threads binding From 4050bae4171edeadb24be5b6b1f8a3287612f872 Mon Sep 17 00:00:00 2001 From: wangxiyuan Date: Fri, 21 Nov 2025 22:57:26 +0800 Subject: [PATCH 11/83] [Doc] Update plugin doc (#28532) Signed-off-by: wangxiyuan --- docs/design/plugin_system.md | 98 +++++++++++++++++++++++++++++++++++- vllm/plugins/__init__.py | 3 ++ vllm/v1/metrics/loggers.py | 4 +- 3 files changed, 101 insertions(+), 4 deletions(-) diff --git a/docs/design/plugin_system.md b/docs/design/plugin_system.md index e8db8047ca4e6..9c84889f7f03d 100644 --- a/docs/design/plugin_system.md +++ b/docs/design/plugin_system.md @@ -4,7 +4,7 @@ The community frequently requests the ability to extend vLLM with custom feature ## How Plugins Work in vLLM -Plugins are user-registered code that vLLM executes. Given vLLM's architecture (see [Arch Overview](arch_overview.md)), multiple processes may be involved, especially when using distributed inference with various parallelism techniques. To enable plugins successfully, every process created by vLLM needs to load the plugin. This is done by the [load_general_plugins](https://github.com/vllm-project/vllm/blob/c76ac49d266e27aa3fea84ef2df1f813d24c91c7/vllm/plugins/__init__.py#L16) function in the `vllm.plugins` module. This function is called for every process created by vLLM before it starts any work. +Plugins are user-registered code that vLLM executes. Given vLLM's architecture (see [Arch Overview](arch_overview.md)), multiple processes may be involved, especially when using distributed inference with various parallelism techniques. To enable plugins successfully, every process created by vLLM needs to load the plugin. 
This is done by the [load_plugins_by_group][vllm.plugins.load_plugins_by_group] function in the `vllm.plugins` module. ## How vLLM Discovers Plugins @@ -57,6 +57,100 @@ Every plugin has three parts: - **Being re-entrant**: The function specified in the entry point should be re-entrant, meaning it can be called multiple times without causing issues. This is necessary because the function might be called multiple times in some processes. +### Platform plugins guidelines + +1. Create a platform plugin project, for example, `vllm_add_dummy_platform`. The project structure should look like this: + + ```shell + vllm_add_dummy_platform/ + ├── vllm_add_dummy_platform/ + │ ├── __init__.py + │ ├── my_dummy_platform.py + │ ├── my_dummy_worker.py + │ ├── my_dummy_attention.py + │ ├── my_dummy_device_communicator.py + │ ├── my_dummy_custom_ops.py + ├── setup.py + ``` + +2. In the `setup.py` file, add the following entry point: + + ```python + setup( + name="vllm_add_dummy_platform", + ... + entry_points={ + "vllm.platform_plugins": [ + "my_dummy_platform = vllm_add_dummy_platform:register" + ] + }, + ... + ) + ``` + + Please make sure `vllm_add_dummy_platform:register` is a callable function and returns the platform class's fully qualified name. for example: + + ```python + def register(): + return "vllm_add_dummy_platform.my_dummy_platform.MyDummyPlatform" + ``` + +3. Implement the platform class `MyDummyPlatform` in `my_dummy_platform.py`. The platform class should inherit from `vllm.platforms.interface.Platform`. Please follow the interface to implement the functions one by one. There are some important functions and properties that should be implemented at least: + + - `_enum`: This property is the device enumeration from [PlatformEnum][vllm.platforms.interface.PlatformEnum]. Usually, it should be `PlatformEnum.OOT`, which means the platform is out-of-tree. + - `device_type`: This property should return the type of the device which pytorch uses. 
For example, `"cpu"`, `"cuda"`, etc. + - `device_name`: This property is set the same as `device_type` usually. It's mainly used for logging purposes. + - `check_and_update_config`: This function is called very early in the vLLM's initialization process. It's used for plugins to update the vllm configuration. For example, the block size, graph mode config, etc, can be updated in this function. The most important thing is that the **worker_cls** should be set in this function to let vLLM know which worker class to use for the worker process. + - `get_attn_backend_cls`: This function should return the attention backend class's fully qualified name. + - `get_device_communicator_cls`: This function should return the device communicator class's fully qualified name. + +4. Implement the worker class `MyDummyWorker` in `my_dummy_worker.py`. The worker class should inherit from [WorkerBase][vllm.v1.worker.worker_base.WorkerBase]. Please follow the interface to implement the functions one by one. Basically, all interfaces in the base class should be implemented, since they are called here and there in vLLM. To make sure a model can be executed, the basic functions should be implemented are: + + - `init_device`: This function is called to set up the device for the worker. + - `initialize_cache`: This function is called to set cache config for the worker. + - `load_model`: This function is called to load the model weights to device. + - `get_kv_cache_spaces`: This function is called to generate the kv cache spaces for the model. + - `determine_available_memory`: This function is called to profiles the peak memory usage of the model to determine how much memory can be used for KV cache without OOMs. + - `initialize_from_config`: This function is called to allocate device KV cache with the specified kv_cache_config + - `execute_model`: This function is called every step to inference the model. 
+ + Additional functions that can be implemented are: + + - If the plugin wants to support sleep mode feature, please implement the `sleep` and `wakeup` functions. + - If the plugin wants to support graph mode feature, please implement the `compile_or_warm_up_model` function. + - If the plugin wants to support speculative decoding feature, please implement the `take_draft_token_ids` function. + - If the plugin wants to support lora feature, please implement the `add_lora`,`remove_lora`,`list_loras` and `pin_lora` functions. + - If the plugin wants to support data parallelism feature, please implement the `execute_dummy_batch` functions. + + Please look at the worker base class [WorkerBase][vllm.v1.worker.worker_base.WorkerBase] for more functions that can be implemented. + +5. Implement the attention backend class `MyDummyAttention` in `my_dummy_attention.py`. The attention backend class should inherit from [AttentionBackend][vllm.attention.backends.abstract.AttentionBackend]. It's used to calculate attentions with your device. Take `vllm.v1.attention.backends` as examples, it contains many attention backend implementations. + +6. Implement custom ops for high performance. Most ops can be ran by pytorch native implementation, while the performance may not be good. In this case, you can implement specific custom ops for your plugins. Currently, there are kinds of custom ops vLLM supports: + + - pytorch ops + there are 3 kinds of pytorch ops: + + - `communicator ops`: Device communicator op. Such as all-reduce, all-gather, etc. + Please implement the device communicator class `MyDummyDeviceCommunicator` in `my_dummy_device_communicator.py`. The device communicator class should inherit from [DeviceCommunicatorBase][vllm.distributed.device_communicators.base_device_communicator.DeviceCommunicatorBase]. + - `common ops`: Common ops. Such as matmul, softmax, etc. + Please implement the common ops by register oot way. 
See more detail in [CustomOp][vllm.model_executor.custom_op.CustomOp] class. + - `csrc ops`: C++ ops. This kind of ops are implemented in C++ and are registered as torch custom ops. + Following csrc module and `vllm._custom_ops` to implement your ops. + + - triton ops + Custom way doesn't work for triton ops now. + +7. (optional) Implement other plugable modules, such as lora, graph backend, quantization, mamba attention backend, etc. + ## Compatibility Guarantee -vLLM guarantees the interface of documented plugins, such as `ModelRegistry.register_model`, will always be available for plugins to register models. However, it is the responsibility of plugin developers to ensure their plugins are compatible with the version of vLLM they are targeting. For example, `"vllm_add_dummy_model.my_llava:MyLlava"` should be compatible with the version of vLLM that the plugin targets. The interface for the model may change during vLLM's development. +vLLM guarantees the interface of documented plugins, such as `ModelRegistry.register_model`, will always be available for plugins to register models. However, it is the responsibility of plugin developers to ensure their plugins are compatible with the version of vLLM they are targeting. For example, `"vllm_add_dummy_model.my_llava:MyLlava"` should be compatible with the version of vLLM that the plugin targets. + +The interface for the model/module may change during vLLM's development. If you see any deprecation log info, please upgrade your plugin to the latest version. + +## Deprecation announcement + +!!! warning "Deprecations" + - `use_v1` parameter in `Platform.get_attn_backend_cls` is deprecated. It will be removed in v0.13.0 or v1.0.0. + - `_Backend` in `vllm.attention` is deprecated. It will be removed in v0.13.0 or v1.0.0. Please use `vllm.attention.backends.registry.register_backend` to add new attention backend to `AttentionBackendEnum` instead. 
diff --git a/vllm/plugins/__init__.py b/vllm/plugins/__init__.py index 0d8988f27959f..4c59d5364a763 100644 --- a/vllm/plugins/__init__.py +++ b/vllm/plugins/__init__.py @@ -17,6 +17,9 @@ IO_PROCESSOR_PLUGINS_GROUP = "vllm.io_processor_plugins" # Platform plugins group will be loaded in all processes when # `vllm.platforms.current_platform` is called and the value not initialized, PLATFORM_PLUGINS_GROUP = "vllm.platform_plugins" +# Stat logger plugins group will be loaded in process0 only when serve vLLM with +# async mode. +STAT_LOGGER_PLUGINS_GROUP = "vllm.stat_logger_plugins" # make sure one process only loads plugins once plugins_loaded = False diff --git a/vllm/v1/metrics/loggers.py b/vllm/v1/metrics/loggers.py index cb36e7973650e..e2d82241ce210 100644 --- a/vllm/v1/metrics/loggers.py +++ b/vllm/v1/metrics/loggers.py @@ -16,7 +16,7 @@ from vllm.distributed.kv_transfer.kv_connector.v1.metrics import ( KVConnectorPrometheus, ) from vllm.logger import init_logger -from vllm.plugins import load_plugins_by_group +from vllm.plugins import STAT_LOGGER_PLUGINS_GROUP, load_plugins_by_group from vllm.v1.engine import FinishReason from vllm.v1.metrics.prometheus import unregister_vllm_metrics from vllm.v1.metrics.stats import ( @@ -67,7 +67,7 @@ class StatLoggerBase(ABC): def load_stat_logger_plugin_factories() -> list[StatLoggerFactory]: factories: list[StatLoggerFactory] = [] - for name, plugin_class in load_plugins_by_group("vllm.stat_logger_plugins").items(): + for name, plugin_class in load_plugins_by_group(STAT_LOGGER_PLUGINS_GROUP).items(): if not isinstance(plugin_class, type) or not issubclass( plugin_class, StatLoggerBase ): From d7219bcda3e6508cb14881bec303e2d0ab68c898 Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Fri, 21 Nov 2025 23:27:44 +0800 Subject: [PATCH 12/83] [Misc] Move dynamic seed initialization to `EngineArgs` (#29165) Signed-off-by: DarkLight1337 --- vllm/config/model.py | 34 +++++++--------------------------- vllm/config/speculative.py | 7 
+------ vllm/engine/arg_utils.py | 16 +++++++++++++++- vllm/v1/worker/tpu_worker.py | 3 --- 4 files changed, 23 insertions(+), 37 deletions(-) diff --git a/vllm/config/model.py b/vllm/config/model.py index 97cba6ea7295e..8f59673f4e1c3 100644 --- a/vllm/config/model.py +++ b/vllm/config/model.py @@ -146,9 +146,12 @@ class ModelConfig: - "bfloat16" for a balance between precision and range.\n - "float" is shorthand for FP32 precision.\n - "float32" for FP32 precision.""" - seed: int | None = None - """Random seed for reproducibility. Initialized to None in V0, but - initialized to 0 in V1.""" + seed: int = 0 + """Random seed for reproducibility. + + We must set the global seed because otherwise, + different tensor parallel workers would sample different tokens, + leading to inconsistent results.""" hf_config: PretrainedConfig = field(init=False) """The Hugging Face config of the model.""" hf_text_config: PretrainedConfig = field(init=False) @@ -415,7 +418,7 @@ class ModelConfig: def __post_init__( self, # Multimodal config init vars - limit_mm_per_prompt: dict[str, int] | None, + limit_mm_per_prompt: dict[str, int | dict[str, int]] | None, enable_mm_embeds: bool | None, media_io_kwargs: dict[str, dict[str, Any]] | None, mm_processor_kwargs: dict[str, Any] | None, @@ -428,23 +431,6 @@ class ModelConfig: skip_mm_profiling: bool | None, video_pruning_rate: float | None, ) -> None: - # Set the default seed to 0 in V1. - # NOTE(woosuk): In V1, we use separate processes for workers (unless - # VLLM_ENABLE_V1_MULTIPROCESSING=0), so setting a seed here - # doesn't affect the user process. However, without a consistent seed, - # different tensor parallel workers would sample different tokens, - # leading to inconsistent results. - if self.seed is None: - self.seed = 0 - if not envs.VLLM_ENABLE_V1_MULTIPROCESSING: - logger.warning( - "The global random seed is set to %d. 
Since " - "VLLM_ENABLE_V1_MULTIPROCESSING is set to False, this may " - "affect the random state of the Python process that " - "launched vLLM.", - self.seed, - ) - # Keep set served_model_name before maybe_model_redirect(self.model) self.served_model_name = get_served_model_name( self.model, self.served_model_name @@ -1151,12 +1137,6 @@ class ModelConfig: self, parallel_config: ParallelConfig, ) -> None: - if parallel_config.distributed_executor_backend == "external_launcher": - assert self.seed is not None, ( - "Seed must be set when using external launcher backend to " - "make sure sampling results are the same across workers." - ) - total_num_attention_heads = getattr( self.hf_text_config, "num_attention_heads", 0 ) diff --git a/vllm/config/speculative.py b/vllm/config/speculative.py index a0c65b6049e1e..d7c019c73d598 100644 --- a/vllm/config/speculative.py +++ b/vllm/config/speculative.py @@ -9,6 +9,7 @@ from pydantic import Field, SkipValidation, model_validator from pydantic.dataclasses import dataclass from typing_extensions import Self +from vllm.config.model import ModelConfig from vllm.config.parallel import ParallelConfig from vllm.config.utils import config from vllm.logger import init_logger @@ -18,10 +19,8 @@ if TYPE_CHECKING: from transformers import PretrainedConfig import vllm.model_executor.layers.quantization as me_quant - from vllm.config import ModelConfig else: PretrainedConfig = Any - ModelConfig = Any me_quant = LazyLoader( "model_executor", globals(), "vllm.model_executor.layers.quantization" @@ -316,10 +315,6 @@ class SpeculativeConfig: self.prompt_lookup_min = 0 if self.model is not None: - # TODO: Move this import to the top once `ModelConfig` - # lives in `vllm.config.model`. 
- from vllm.config import ModelConfig - self.draft_model_config = ModelConfig( model=self.model, runner="draft", diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index bcb90119f9b04..6eaf328eb1655 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -367,7 +367,7 @@ class EngineArgs: config_format: str = ModelConfig.config_format dtype: ModelDType = ModelConfig.dtype kv_cache_dtype: CacheDType = CacheConfig.cache_dtype - seed: int | None = ModelConfig.seed + seed: int | None = None max_model_len: int | None = ModelConfig.max_model_len cuda_graph_sizes: list[int] | None = CompilationConfig.cudagraph_capture_sizes cudagraph_capture_sizes: list[int] | None = ( @@ -1188,6 +1188,20 @@ class EngineArgs: if check_gguf_file(self.model): self.quantization = self.load_format = "gguf" + # NOTE(woosuk): In V1, we use separate processes for workers (unless + # VLLM_ENABLE_V1_MULTIPROCESSING=0), so setting a seed here + # doesn't affect the user process. + if self.seed is None: + self.seed = 0 + if not envs.VLLM_ENABLE_V1_MULTIPROCESSING: + logger.warning( + "The global random seed is set to %d. Since " + "VLLM_ENABLE_V1_MULTIPROCESSING is set to False, this may " + "affect the random state of the Python process that " + "launched vLLM.", + self.seed, + ) + if self.disable_mm_preprocessor_cache: logger.warning( "`--disable-mm-preprocessor-cache` is deprecated " diff --git a/vllm/v1/worker/tpu_worker.py b/vllm/v1/worker/tpu_worker.py index 569b2aaa766e4..e1a109eca0a88 100644 --- a/vllm/v1/worker/tpu_worker.py +++ b/vllm/v1/worker/tpu_worker.py @@ -106,9 +106,6 @@ class TPUWorker: "Profiling enabled. 
Traces will be saved to: %s", self.profile_dir ) - if self.model_config.seed is None: - self.model_config.seed = 0 - def initialize_cache(self, num_gpu_blocks: int, num_cpu_blocks: int) -> None: self.cache_config.num_gpu_blocks = num_gpu_blocks self.cache_config.num_cpu_blocks = num_cpu_blocks From 711241c13cf9c1e543a1948bb25a40623f3da78c Mon Sep 17 00:00:00 2001 From: rasmith Date: Fri, 21 Nov 2025 09:58:38 -0600 Subject: [PATCH 13/83] [CI/Build] Fix illegal memory access and unsupported test in kernels/attention/test_cache.py (#29118) Signed-off-by: Randall Smith Co-authored-by: Randall Smith --- tests/kernels/attention/test_cache.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/tests/kernels/attention/test_cache.py b/tests/kernels/attention/test_cache.py index f33a27d1fd85a..028e164cb801b 100644 --- a/tests/kernels/attention/test_cache.py +++ b/tests/kernels/attention/test_cache.py @@ -68,6 +68,7 @@ def test_copy_blocks( pytest.skip() current_platform.seed_everything(seed) torch.set_default_device(device) + torch.cuda.set_device(device) # Generate random block mappings where each source block is mapped to two # destination blocks. assert 2 * num_mappings <= num_blocks @@ -152,6 +153,7 @@ def test_reshape_and_cache( pytest.skip() current_platform.seed_everything(seed) torch.set_default_device(device) + torch.cuda.set_device(device) # Create a random slot mapping. 
num_slots = block_size * num_blocks slot_mapping_lst = random.sample(range(num_slots), num_tokens) @@ -272,6 +274,7 @@ def test_reshape_and_cache_flash( ) -> None: current_platform.seed_everything(seed) torch.set_default_device(device) + torch.cuda.set_device(device) assert implementation in ["cuda", "triton"] if implementation == "triton" and kv_cache_layout == "HND": pytest.skip("Triton implementation only supports NHD layout.") @@ -593,6 +596,7 @@ def test_concat_and_cache_mla( ) -> None: current_platform.seed_everything(seed) torch.set_default_device(device) + torch.cuda.set_device(device) total_slots = num_blocks * block_size slot_mapping_lst = random.sample(range(total_slots), num_tokens) @@ -662,11 +666,14 @@ def test_concat_and_cache_ds_mla( seed: int, device: str, ) -> None: + if current_platform.is_rocm(): + pytest.skip("concat_and_cache_mla doesn't support fp8_ds_mla on ROCm") if dtype.itemsize != 2: pytest.skip("ds_mla only supports 16-bit input") kv_cache_dtype = "fp8_ds_mla" current_platform.seed_everything(seed) torch.set_default_device(device) + torch.cuda.set_device(device) total_slots = num_blocks * block_size slot_mapping_lst = random.sample(range(total_slots), num_tokens) @@ -779,6 +786,7 @@ def test_copy_blocks_mla( ) -> None: current_platform.seed_everything(seed) torch.set_default_device(device) + torch.cuda.set_device(device) entry_size = kv_lora_rank + qk_rope_head_dim @@ -843,6 +851,7 @@ def test_swap_blocks_mla( ) -> None: current_platform.seed_everything(seed) torch.set_default_device(device) + torch.cuda.set_device(device) entry_size = kv_lora_rank + qk_rope_head_dim From 1f400c58b8a6d2852b137cd841206a6ea8aaf43a Mon Sep 17 00:00:00 2001 From: Wentao Ye <44945378+yewentao256@users.noreply.github.com> Date: Fri, 21 Nov 2025 11:20:33 -0500 Subject: [PATCH 14/83] [CI] Add batch invariant test to ci (#27842) Signed-off-by: yewentao256 --- .buildkite/test-pipeline.yaml | 12 ++++++++++++ tests/v1/determinism/test_batch_invariance.py | 2 ++ 
tests/v1/determinism/utils.py | 3 ++- 3 files changed, 16 insertions(+), 1 deletion(-) diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index 6169b279dc8a4..a5719d438eece 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -346,6 +346,18 @@ steps: commands: - pytest -v -s v1/attention +- label: Batch Invariance Tests (H100) # 10min + timeout_in_minutes: 25 + gpu: h100 + source_file_dependencies: + - vllm/ + - tests/v1/determinism/ + commands: + - export VLLM_WORKER_MULTIPROC_METHOD=spawn + - pip install pytest-timeout pytest-forked + - pytest -v -s v1/determinism/test_batch_invariance.py + - pytest -v -s v1/determinism/test_rms_norm_batch_invariant.py + - label: V1 Test attention (B200) # 10min timeout_in_minutes: 30 gpu: b200 diff --git a/tests/v1/determinism/test_batch_invariance.py b/tests/v1/determinism/test_batch_invariance.py index 74ae5e182da78..b9e2daafb8705 100644 --- a/tests/v1/determinism/test_batch_invariance.py +++ b/tests/v1/determinism/test_batch_invariance.py @@ -190,6 +190,7 @@ def test_logprobs_bitwise_batch_invariance_bs1_vs_bsN( max_num_seqs=32, max_model_len=8192, dtype="bfloat16", # not everything is supported + gpu_memory_utilization=0.9, ) # Use more realistic prompts for better token generation @@ -444,6 +445,7 @@ def test_logprobs_without_batch_invariance_should_fail( monkeypatch.setenv("VLLM_ATTENTION_BACKEND", backend) # CRITICAL: Disable batch invariance for this test + monkeypatch.setenv("VLLM_BATCH_INVARIANT", "0") monkeypatch.setattr(batch_invariant, "VLLM_BATCH_INVARIANT", False) seed = int(os.getenv("VLLM_TEST_SEED", "12345")) random.seed(seed) diff --git a/tests/v1/determinism/utils.py b/tests/v1/determinism/utils.py index 7ee442551e2c3..ecbb6a1126933 100644 --- a/tests/v1/determinism/utils.py +++ b/tests/v1/determinism/utils.py @@ -6,6 +6,7 @@ import random import pytest import torch +from vllm.attention.utils.fa_utils import flash_attn_supports_mla from vllm.platforms 
import current_platform skip_unsupported = pytest.mark.skipif( @@ -18,7 +19,7 @@ BACKENDS: list[str] = [ "FLASHINFER", ] -if current_platform.is_cuda() and current_platform.is_device_capability(90): +if flash_attn_supports_mla(): BACKENDS.append("FLASH_ATTN_MLA") DEFAULT_MODEL = "Qwen/Qwen3-1.7B" From 30b44a1598ea62fd3dcfd0d72a799ca4685e829e Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Fri, 21 Nov 2025 08:20:55 -0800 Subject: [PATCH 15/83] GPU Model Runner V2 (#25266) Signed-off-by: Woosuk Kwon --- .github/CODEOWNERS | 3 + vllm/envs.py | 5 + vllm/v1/attention/backends/flashinfer.py | 3 + vllm/v1/core/sched/output.py | 24 + vllm/v1/core/sched/scheduler.py | 28 +- vllm/v1/worker/gpu/README.md | 4 + vllm/v1/worker/gpu/__init__.py | 0 vllm/v1/worker/gpu/async_utils.py | 89 +++ vllm/v1/worker/gpu/attn_utils.py | 187 ++++++ vllm/v1/worker/gpu/block_table.py | 315 +++++++++ vllm/v1/worker/gpu/cudagraph_utils.py | 198 ++++++ vllm/v1/worker/gpu/dp_utils.py | 22 + vllm/v1/worker/gpu/input_batch.py | 265 ++++++++ vllm/v1/worker/gpu/model_runner.py | 814 +++++++++++++++++++++++ vllm/v1/worker/gpu/sampler.py | 327 +++++++++ vllm/v1/worker/gpu/states.py | 265 ++++++++ vllm/v1/worker/gpu/structured_outputs.py | 76 +++ vllm/v1/worker/gpu_worker.py | 26 +- 18 files changed, 2639 insertions(+), 12 deletions(-) create mode 100644 vllm/v1/worker/gpu/README.md create mode 100644 vllm/v1/worker/gpu/__init__.py create mode 100644 vllm/v1/worker/gpu/async_utils.py create mode 100644 vllm/v1/worker/gpu/attn_utils.py create mode 100644 vllm/v1/worker/gpu/block_table.py create mode 100644 vllm/v1/worker/gpu/cudagraph_utils.py create mode 100644 vllm/v1/worker/gpu/dp_utils.py create mode 100644 vllm/v1/worker/gpu/input_batch.py create mode 100644 vllm/v1/worker/gpu/model_runner.py create mode 100644 vllm/v1/worker/gpu/sampler.py create mode 100644 vllm/v1/worker/gpu/states.py create mode 100644 vllm/v1/worker/gpu/structured_outputs.py diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS 
index 6e178bb690c56..0e834c057c401 100644 --- a/.github/CODEOWNERS +++ b/.github/CODEOWNERS @@ -35,6 +35,9 @@ CMakeLists.txt @tlrmchlsmth @LucasWilkinson /vllm/v1/kv_cache_interface.py @heheda12345 /vllm/v1/offloading @ApostaC +# Model runner V2 +/vllm/v1/worker/gpu @WoosukKwon + # Test ownership /.buildkite/lm-eval-harness @mgoin /tests/distributed/test_multi_node_assignment.py @youkaichao diff --git a/vllm/envs.py b/vllm/envs.py index 888a09cf6d3ec..d2d6917403420 100755 --- a/vllm/envs.py +++ b/vllm/envs.py @@ -231,6 +231,7 @@ if TYPE_CHECKING: VLLM_DISABLE_SHARED_EXPERTS_STREAM: bool = False VLLM_SHARED_EXPERTS_STREAM_TOKEN_THRESHOLD: int = 256 VLLM_COMPILE_CACHE_SAVE_FORMAT: Literal["binary", "unpacked"] = "binary" + VLLM_USE_V2_MODEL_RUNNER: bool = False def get_default_cache_root(): @@ -1522,6 +1523,10 @@ environment_variables: dict[str, Callable[[], Any]] = { "VLLM_COMPILE_CACHE_SAVE_FORMAT": env_with_choices( "VLLM_COMPILE_CACHE_SAVE_FORMAT", "binary", ["binary", "unpacked"] ), + # Flag to enable v2 model runner. 
+ "VLLM_USE_V2_MODEL_RUNNER": lambda: bool( + int(os.getenv("VLLM_USE_V2_MODEL_RUNNER", "0")) + ), } # --8<-- [end:env-vars-definition] diff --git a/vllm/v1/attention/backends/flashinfer.py b/vllm/v1/attention/backends/flashinfer.py index 3ad7e8c52fc1f..e3f499216d7f1 100755 --- a/vllm/v1/attention/backends/flashinfer.py +++ b/vllm/v1/attention/backends/flashinfer.py @@ -593,6 +593,9 @@ class FlashInferMetadataBuilder(AttentionMetadataBuilder[FlashInferMetadata]): ) return self._workspace_buffer + def set_workspace_buffer(self, workspace_buffer: torch.Tensor): + self._workspace_buffer = workspace_buffer + def _get_prefill_wrapper( self, ) -> BatchPrefillWithPagedKVCacheWrapper | BatchDCPPrefillWrapper: diff --git a/vllm/v1/core/sched/output.py b/vllm/v1/core/sched/output.py index 20fdb3446404b..7902513dce49a 100644 --- a/vllm/v1/core/sched/output.py +++ b/vllm/v1/core/sched/output.py @@ -44,11 +44,15 @@ class NewRequestData: lora_request: LoRARequest | None prompt_embeds: "torch.Tensor | None" = None + # Only used for v2 model runner. + prefill_token_ids: list[int] | None = None + @classmethod def from_request( cls, request: Request, block_ids: tuple[list[int], ...], + prefill_token_ids: list[int] | None = None, ) -> "NewRequestData": return cls( req_id=request.request_id, @@ -60,6 +64,7 @@ class NewRequestData: num_computed_tokens=request.num_computed_tokens, lora_request=request.lora_request, prompt_embeds=request.prompt_embeds, + prefill_token_ids=prefill_token_ids, ) def __repr__(self) -> str: @@ -68,6 +73,7 @@ class NewRequestData: f"NewRequestData(" f"req_id={self.req_id}," f"prompt_token_ids={self.prompt_token_ids}," + f"prefill_token_ids={self.prefill_token_ids}," f"mm_features={self.mm_features}," f"sampling_params={self.sampling_params}," f"block_ids={self.block_ids}," @@ -183,6 +189,10 @@ class SchedulerOutput: # freed from the encoder cache. free_encoder_mm_hashes: list[str] + # Request IDs that are preempted in this step. 
+ # Only used for v2 model runner. + preempted_req_ids: set[str] | None = None + # Whether the scheduled requests have all the output tokens they # need to perform grammar bitmask computation. pending_structured_output_tokens: bool = False @@ -193,6 +203,20 @@ class SchedulerOutput: # EC Cache Connector metadata ec_connector_metadata: ECConnectorMetadata | None = None + @classmethod + def make_empty(cls) -> "SchedulerOutput": + return cls( + scheduled_new_reqs=[], + scheduled_cached_reqs=CachedRequestData.make_empty(), + num_scheduled_tokens={}, + total_num_scheduled_tokens=0, + scheduled_spec_decode_tokens={}, + scheduled_encoder_inputs={}, + num_common_prefix_blocks=[], + finished_req_ids=set(), + free_encoder_mm_hashes=[], + ) + @dataclass class GrammarOutput: diff --git a/vllm/v1/core/sched/scheduler.py b/vllm/v1/core/sched/scheduler.py index 1ac8520a8ed25..9195b112d8690 100644 --- a/vllm/v1/core/sched/scheduler.py +++ b/vllm/v1/core/sched/scheduler.py @@ -6,6 +6,7 @@ from collections import defaultdict from collections.abc import Iterable from typing import Any +from vllm import envs from vllm.config import VllmConfig from vllm.distributed.ec_transfer.ec_connector.base import ( ECConnectorMetadata, @@ -187,6 +188,7 @@ class Scheduler(SchedulerInterface): pcp_world_size=self.pcp_world_size, ) self.use_pp = self.parallel_config.pipeline_parallel_size > 1 + self.use_v2_model_runner = envs.VLLM_USE_V2_MODEL_RUNNER def schedule(self) -> SchedulerOutput: # NOTE(woosuk) on the scheduling algorithm: @@ -658,12 +660,25 @@ class Scheduler(SchedulerInterface): ) # Construct the scheduler output. 
- new_reqs_data = [ - NewRequestData.from_request( - req, req_to_new_blocks[req.request_id].get_block_ids() - ) - for req in scheduled_new_reqs - ] + if self.use_v2_model_runner: + scheduled_new_reqs = scheduled_new_reqs + scheduled_resumed_reqs + scheduled_resumed_reqs = [] + new_reqs_data = [ + NewRequestData.from_request( + req, + req_to_new_blocks[req.request_id].get_block_ids(), + req._all_token_ids, + ) + for req in scheduled_new_reqs + ] + else: + new_reqs_data = [ + NewRequestData.from_request( + req, req_to_new_blocks[req.request_id].get_block_ids() + ) + for req in scheduled_new_reqs + ] + with record_function_or_nullcontext("schedule: make_cached_request_data"): cached_reqs_data = self._make_cached_request_data( scheduled_running_reqs, @@ -685,6 +700,7 @@ class Scheduler(SchedulerInterface): scheduled_spec_decode_tokens=scheduled_spec_decode_tokens, scheduled_encoder_inputs=scheduled_encoder_inputs, num_common_prefix_blocks=num_common_prefix_blocks, + preempted_req_ids={req.request_id for req in preempted_reqs}, # finished_req_ids is an existing state in the scheduler, # instead of being newly scheduled in this step. # It contains the request IDs that are finished in between diff --git a/vllm/v1/worker/gpu/README.md b/vllm/v1/worker/gpu/README.md new file mode 100644 index 0000000000000..093f524b3250f --- /dev/null +++ b/vllm/v1/worker/gpu/README.md @@ -0,0 +1,4 @@ +# [Experimental] Model Runner V2 + +This directory contains the new model runner which is under active development. +Ping [Woosuk Kwon](https://github.com/WoosukKwon) for any changes. 
diff --git a/vllm/v1/worker/gpu/__init__.py b/vllm/v1/worker/gpu/__init__.py new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/vllm/v1/worker/gpu/async_utils.py b/vllm/v1/worker/gpu/async_utils.py new file mode 100644 index 0000000000000..638ec6fb0b082 --- /dev/null +++ b/vllm/v1/worker/gpu/async_utils.py @@ -0,0 +1,89 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +from contextlib import contextmanager + +import numpy as np +import torch + +from vllm.v1.outputs import ( + AsyncModelRunnerOutput, + ModelRunnerOutput, + SamplerOutput, +) + + +class AsyncOutput(AsyncModelRunnerOutput): + def __init__( + self, + model_runner_output: ModelRunnerOutput, + sampler_output: SamplerOutput, + num_sampled_tokens: np.ndarray, + copy_stream: torch.cuda.Stream, + copy_event: torch.cuda.Event, + ): + self.model_runner_output = model_runner_output + self.sampler_output = sampler_output + self.num_sampled_tokens = num_sampled_tokens + self.copy_stream = copy_stream + self.copy_event = copy_event + + default_stream = torch.cuda.current_stream() + with torch.cuda.stream(self.copy_stream): + self.copy_stream.wait_stream(default_stream) + + # NOTE(woosuk): We must ensure that CPU tensors are not freed + # before the device-to-host copy is fully completed. For instance, + # operations like + # self.sampled_token_np = ...to("cpu", non_blocking=True).numpy() + # are unsafe because the underlying CPU tensor can be prematurely freed and + # reused by other tensors before the asynchronous copy finishes, potentially + # causing race conditions. To prevent this, we delay freeing by holding + # references until the copy event signals completion. + # Likewise, we also need to keep the reference to the GPU tensors. + # This is done by keeping the reference to sampler_output and + # model_runner_output. 
+ self.sampled_token_ids = sampler_output.sampled_token_ids.to( + "cpu", non_blocking=True + ) + if sampler_output.logprobs_tensors is not None: + self.logprobs_tensors = ( + sampler_output.logprobs_tensors.to_cpu_nonblocking() + ) + else: + self.logprobs_tensors = None + self.prompt_logprobs_dict = {} + if self.model_runner_output.prompt_logprobs_dict: + for k, v in self.model_runner_output.prompt_logprobs_dict.items(): + self.prompt_logprobs_dict[k] = v.to_cpu_nonblocking() + self.copy_event.record(self.copy_stream) + + def get_output(self) -> ModelRunnerOutput: + self.copy_event.synchronize() + + # NOTE(woosuk): The following code is to ensure compatibility with + # the existing model runner. + # Going forward, we should keep the data structures as NumPy arrays + # rather than Python lists. + sampled_token_ids_np = self.sampled_token_ids.numpy() + num_reqs = sampled_token_ids_np.shape[0] + sampled_token_ids: list[np.ndarray] = [ + sampled_token_ids_np[i, : self.num_sampled_tokens[i]] + for i in range(num_reqs) + ] + self.model_runner_output.sampled_token_ids = sampled_token_ids + + if self.logprobs_tensors is not None: + self.model_runner_output.logprobs = self.logprobs_tensors.tolists() + self.model_runner_output.prompt_logprobs_dict = self.prompt_logprobs_dict + return self.model_runner_output + + +@contextmanager +def async_barrier(event: torch.cuda.Event | None): + if event is not None: + event.synchronize() + try: + yield + finally: + if event is not None: + event.record() diff --git a/vllm/v1/worker/gpu/attn_utils.py b/vllm/v1/worker/gpu/attn_utils.py new file mode 100644 index 0000000000000..8850c18092299 --- /dev/null +++ b/vllm/v1/worker/gpu/attn_utils.py @@ -0,0 +1,187 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +from collections.abc import Sequence +from typing import Any + +import torch + +from vllm.attention.backends.abstract import AttentionBackend +from vllm.config import 
VllmConfig, get_layers_from_vllm_config +from vllm.model_executor.layers.attention_layer_base import AttentionLayerBase +from vllm.v1.attention.backends.utils import ( + AttentionMetadataBuilder, + CommonAttentionMetadata, +) +from vllm.v1.kv_cache_interface import ( + KVCacheConfig, + KVCacheSpec, +) +from vllm.v1.utils import CpuGpuBuffer +from vllm.v1.worker.utils import bind_kv_cache + + +def get_kv_cache_spec(vllm_config: VllmConfig) -> dict[str, KVCacheSpec]: + kv_cache_spec: dict[str, KVCacheSpec] = {} + attn_layers = get_layers_from_vllm_config(vllm_config, AttentionLayerBase) + for layer_name, attn_module in attn_layers.items(): + # Skip modules that don't need KV cache (eg encoder-only attention) + if spec := attn_module.get_kv_cache_spec(vllm_config): + kv_cache_spec[layer_name] = spec + return kv_cache_spec + + +def init_attn_backend( + kv_cache_config: KVCacheConfig, + vllm_config: VllmConfig, + device: torch.device, +): + attn_backends: dict[str, AttentionBackend] = {} + attn_metadata_builders: list[AttentionMetadataBuilder] = [] + flashinfer_workspace: torch.Tensor | None = None + for kv_cache_group_spec in kv_cache_config.kv_cache_groups: + layer_names = kv_cache_group_spec.layer_names + any_layer_name = next(iter(layer_names)) + + attn_layers = get_layers_from_vllm_config( + vllm_config, AttentionLayerBase, layer_names + ) + attn_backend = attn_layers[any_layer_name].get_attn_backend() + for layer_name in layer_names: + attn_backends[layer_name] = attn_backend + + attn_metadata_builder = attn_backend.get_builder_cls()( + kv_cache_group_spec.kv_cache_spec, + layer_names, + vllm_config, + device, + ) + attn_metadata_builders.append(attn_metadata_builder) # type: ignore + + if "FLASHINFER" in attn_backend.get_name(): + if flashinfer_workspace is None: + flashinfer_workspace = attn_metadata_builder._get_workspace_buffer() + else: + attn_metadata_builder.set_workspace_buffer(flashinfer_workspace) + return attn_backends, attn_metadata_builders + + +def 
_allocate_kv_cache( + kv_cache_config: KVCacheConfig, + device: torch.device, +): + kv_cache_raw_tensors: dict[str, torch.Tensor] = {} + for kv_cache_tensor in kv_cache_config.kv_cache_tensors: + tensor = torch.zeros(kv_cache_tensor.size, dtype=torch.int8, device=device) + for layer_name in kv_cache_tensor.shared_by: + kv_cache_raw_tensors[layer_name] = tensor + + layer_names = set() + for group in kv_cache_config.kv_cache_groups: + for layer_name in group.layer_names: + layer_names.add(layer_name) + assert layer_names == set(kv_cache_raw_tensors.keys()), ( + "Some layers are not correctly initialized" + ) + return kv_cache_raw_tensors + + +def _reshape_kv_cache( + kv_cache_config: KVCacheConfig, + kv_cache_raw_tensors: dict[str, torch.Tensor], + attn_backends: dict[str, AttentionBackend], +) -> dict[str, torch.Tensor]: + kv_caches: dict[str, torch.Tensor] = {} + for kv_cache_group_spec in kv_cache_config.kv_cache_groups: + kv_cache_spec = kv_cache_group_spec.kv_cache_spec + for layer_name in kv_cache_group_spec.layer_names: + raw_tensor = kv_cache_raw_tensors[layer_name] + assert raw_tensor.numel() % kv_cache_spec.page_size_bytes == 0 + num_blocks = raw_tensor.numel() // kv_cache_spec.page_size_bytes + + attn_backend = attn_backends[layer_name] + kv_cache_shape = attn_backend.get_kv_cache_shape( + num_blocks, + kv_cache_spec.block_size, + kv_cache_spec.num_kv_heads, + kv_cache_spec.head_size, + ) + + # FIXME(woosuk): Add kv_cache_stride_order to all attention backends. 
+ try: + kv_cache_stride_order = attn_backend.get_kv_cache_stride_order() + assert len(kv_cache_stride_order) == len(kv_cache_shape) + except (AttributeError, NotImplementedError): + kv_cache_stride_order = tuple(range(len(kv_cache_shape))) + + kv_cache_shape = tuple(kv_cache_shape[i] for i in kv_cache_stride_order) + inv_order = [ + kv_cache_stride_order.index(i) + for i in range(len(kv_cache_stride_order)) + ] + + dtype = kv_cache_spec.dtype + raw_tensor = raw_tensor.view(dtype) + raw_tensor = raw_tensor.view(kv_cache_shape) + kv_caches[layer_name] = raw_tensor.permute(*inv_order) + return kv_caches + + +def init_kv_cache( + runner_kv_caches: list[torch.Tensor], + forward_context: dict[str, Any], + kv_cache_config: KVCacheConfig, + attn_backends: dict[str, AttentionBackend], + device: torch.device, +) -> None: + kv_cache_raw_tensors = _allocate_kv_cache(kv_cache_config, device) + kv_caches = _reshape_kv_cache(kv_cache_config, kv_cache_raw_tensors, attn_backends) + bind_kv_cache(kv_caches, forward_context, runner_kv_caches) + + +def build_attn_metadata( + attn_metadata_builders: list[AttentionMetadataBuilder], + num_reqs: int, + num_tokens: int, + query_start_loc: CpuGpuBuffer, + seq_lens: CpuGpuBuffer, + num_computed_tokens_cpu: torch.Tensor, + block_tables: Sequence[torch.Tensor], + slot_mappings: torch.Tensor, + kv_cache_config: KVCacheConfig, +) -> dict[str, Any]: + query_start_loc_gpu = query_start_loc.gpu[: num_reqs + 1] + query_start_loc_cpu = query_start_loc.cpu[: num_reqs + 1] + max_query_len = int(query_start_loc.np[: num_reqs + 1].max()) + seq_lens_gpu = seq_lens.gpu[:num_reqs] + seq_lens_cpu = seq_lens.cpu[:num_reqs] + max_seq_len = int(seq_lens.np[:num_reqs].max()) + + attn_metadata: dict[str, Any] = {} + kv_cache_groups = kv_cache_config.kv_cache_groups + for i, kv_cache_spec in enumerate(kv_cache_groups): + block_table = block_tables[i] + slot_mapping = slot_mappings[i] + + common_attn_metadata = CommonAttentionMetadata( + 
query_start_loc=query_start_loc_gpu, + query_start_loc_cpu=query_start_loc_cpu, + seq_lens=seq_lens_gpu, + seq_lens_cpu=seq_lens_cpu, + max_seq_len=max_seq_len, + num_computed_tokens_cpu=num_computed_tokens_cpu, + num_reqs=num_reqs, + num_actual_tokens=num_tokens, + max_query_len=max_query_len, + block_table_tensor=block_table, + slot_mapping=slot_mapping, + causal=True, + ) + + attn_metadata_builder = attn_metadata_builders[i] + metadata = attn_metadata_builder.build( + common_prefix_len=0, + common_attn_metadata=common_attn_metadata, + ) + for layer_name in kv_cache_spec.layer_names: + attn_metadata[layer_name] = metadata + return attn_metadata diff --git a/vllm/v1/worker/gpu/block_table.py b/vllm/v1/worker/gpu/block_table.py new file mode 100644 index 0000000000000..ff24e88ede2c0 --- /dev/null +++ b/vllm/v1/worker/gpu/block_table.py @@ -0,0 +1,315 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +from collections.abc import Iterable + +import torch +import triton +import triton.language as tl + +from vllm.attention.backends.utils import PAD_SLOT_ID +from vllm.utils.math_utils import cdiv +from vllm.v1.utils import CpuGpuBuffer + + +class BlockTables: + def __init__( + self, + block_sizes: list[int], + max_num_reqs: int, + max_num_batched_tokens: int, + max_model_len: int, + device: torch.device, + pin_memory: bool, + ): + self.block_sizes = block_sizes + self.max_num_reqs = max_num_reqs + self.max_num_batched_tokens = max_num_batched_tokens + self.max_model_len = max_model_len + self.device = device + self.pin_memory = pin_memory + + self.num_kv_cache_groups = len(self.block_sizes) + # num_kv_cache_groups x [max_num_reqs, max_num_blocks] + self.block_tables: list[torch.Tensor] = [] + for i in range(self.num_kv_cache_groups): + block_size = self.block_sizes[i] + max_num_blocks = cdiv(self.max_model_len, block_size) + block_table = torch.zeros( + self.max_num_reqs, + max_num_blocks, + 
dtype=torch.int32, + device=self.device, + ) + self.block_tables.append(block_table) + self.block_table_ptrs = self._make_ptr_tensor(self.block_tables) + + # Block tables used for model's forward pass. + # num_kv_cache_groups x [max_num_reqs, max_num_blocks] + self.input_block_tables: list[torch.Tensor] = [ + torch.zeros_like(block_table) for block_table in self.block_tables + ] + self.input_block_table_ptrs = self._make_ptr_tensor(self.input_block_tables) + + self.block_table_strides = torch.tensor( + [b.stride(0) for b in self.block_tables], + dtype=torch.int64, + device=self.device, + ) + self.block_sizes_tensor = torch.tensor( + self.block_sizes, dtype=torch.int32, device=self.device + ) + self.num_blocks = torch.zeros( + self.num_kv_cache_groups, + self.max_num_reqs, + dtype=torch.int32, + device=self.device, + ) + self.slot_mappings = torch.zeros( + self.num_kv_cache_groups, + self.max_num_batched_tokens, + dtype=torch.int64, + device=self.device, + ) + + # Misc buffers. + self.req_indices = self._make_buffer(self.max_num_reqs, dtype=torch.int32) + self.overwrite = self._make_buffer(self.max_num_reqs, dtype=torch.bool) + self.cu_num_new_blocks = self._make_buffer( + self.num_kv_cache_groups, self.max_num_reqs + 1, dtype=torch.int32 + ) + + def _make_buffer(self, *args, dtype: torch.dtype) -> CpuGpuBuffer: + return CpuGpuBuffer( + *args, dtype=dtype, pin_memory=self.pin_memory, device=self.device + ) + + def _make_ptr_tensor(self, x: Iterable[torch.Tensor]) -> torch.Tensor: + # NOTE(woosuk): Use uint64 instead of int64 to cover all possible addresses. 
+ ptrs_tensor_cpu = torch.tensor( + [t.data_ptr() for t in x], + dtype=torch.uint64, + device="cpu", + pin_memory=self.pin_memory, + ) + return ptrs_tensor_cpu.to(self.device, non_blocking=True) + + def append_block_ids( + self, + # [num_reqs] + req_indices: list[int], + # [num_kv_cache_groups, num_reqs + 1] + cu_num_new_blocks: tuple[list[int], ...], + # [num_kv_cache_groups, num_new_blocks] + new_block_ids: tuple[list[int], ...], + # [num_reqs] + overwrite: list[bool], + ) -> None: + num_reqs = len(req_indices) + self.req_indices.np[:num_reqs] = req_indices + self.overwrite.np[:num_reqs] = overwrite + for i in range(self.num_kv_cache_groups): + self.cu_num_new_blocks.np[i, : num_reqs + 1] = cu_num_new_blocks[i] + + # NOTE(woosuk): Here, we cannot use a fixed-size buffer because there's + # no clear upper bound to the number of new blocks in a single step. + # NOTE(woosuk): The buffer has to be cached, because otherwise we cannot + # guarantee that the buffer is not freed before the copy is completed. 
+ self.new_block_ids_cpu = torch.empty( + self.num_kv_cache_groups, + max(len(x) for x in new_block_ids), + dtype=torch.int32, + device="cpu", + pin_memory=self.pin_memory, + ) + new_block_ids_np = self.new_block_ids_cpu.numpy() + for i in range(self.num_kv_cache_groups): + new_block_ids_np[i, : len(new_block_ids[i])] = new_block_ids[i] + new_block_ids_gpu = self.new_block_ids_cpu.to(self.device, non_blocking=True) + + _append_block_ids_kernel[(self.num_kv_cache_groups, num_reqs)]( + self.req_indices.copy_to_gpu(num_reqs), + self.cu_num_new_blocks.copy_to_gpu(), + self.cu_num_new_blocks.gpu.stride(0), + new_block_ids_gpu, + new_block_ids_gpu.stride(0), + self.overwrite.copy_to_gpu(num_reqs), + self.block_table_strides, + self.block_table_ptrs, + self.num_blocks, + self.num_blocks.stride(0), + BLOCK_SIZE=1024, # type: ignore + ) + + def gather_block_tables( + self, + idx_mapping: torch.Tensor, + ) -> tuple[torch.Tensor, ...]: + num_reqs = idx_mapping.shape[0] + _gather_block_tables_kernel[(self.num_kv_cache_groups, num_reqs)]( + idx_mapping, + self.block_table_ptrs, + self.input_block_table_ptrs, + self.block_table_strides, + self.num_blocks, + self.num_blocks.stride(0), + BLOCK_SIZE=1024, # type: ignore + ) + return tuple(block_table[:num_reqs] for block_table in self.input_block_tables) + + def get_dummy_block_tables(self, num_reqs: int) -> tuple[torch.Tensor, ...]: + return tuple(block_table[:num_reqs] for block_table in self.input_block_tables) + + def compute_slot_mappings( + self, + query_start_loc: torch.Tensor, + positions: torch.Tensor, + ) -> torch.Tensor: + num_reqs = query_start_loc.shape[0] - 1 + num_tokens = positions.shape[0] + num_groups = self.num_kv_cache_groups + _compute_slot_mappings_kernel[(num_groups, num_reqs + 1)]( + num_tokens, + self.max_num_batched_tokens, + query_start_loc, + positions, + self.input_block_table_ptrs, + self.block_table_strides, + self.block_sizes_tensor, + self.slot_mappings, + self.slot_mappings.stride(0), + 
PAD_ID=PAD_SLOT_ID, + BLOCK_SIZE=1024, # type: ignore + ) + return self.slot_mappings[:, :num_tokens] + + def get_dummy_slot_mappings(self, num_tokens: int) -> torch.Tensor: + self.slot_mappings.fill_(PAD_SLOT_ID) + return self.slot_mappings[:, :num_tokens] + + +@triton.jit +def _append_block_ids_kernel( + # Inputs + req_indices, # [num_reqs] + cu_num_new_blocks_ptr, # [num_kv_cache_groups, num_reqs + 1] + cu_num_new_blocks_stride, + new_block_ids_ptr, # [num_kv_cache_groups, num_new_blocks] + new_block_ids_stride, + overwrite, # [num_reqs] + block_table_strides, # [num_kv_cache_groups] + # Outputs + block_table_ptrs, # [num_kv_cache_groups] + num_blocks_ptr, # [num_kv_cache_groups, max_num_reqs] + num_blocks_stride, + # Constants + BLOCK_SIZE: tl.constexpr, +): + group_id = tl.program_id(0) + batch_idx = tl.program_id(1) + req_idx = tl.load(req_indices + batch_idx) + do_overwrite = tl.load(overwrite + batch_idx) + + group_new_blocks_ptr = cu_num_new_blocks_ptr + group_id * cu_num_new_blocks_stride + start_idx = tl.load(group_new_blocks_ptr + batch_idx) + end_idx = tl.load(group_new_blocks_ptr + batch_idx + 1) + num_new_blocks = end_idx - start_idx + + group_num_blocks_ptr = num_blocks_ptr + group_id * num_blocks_stride + dst_start_idx = tl.load(group_num_blocks_ptr + req_idx) if not do_overwrite else 0 + dst_end_idx = dst_start_idx + num_new_blocks + tl.store(group_num_blocks_ptr + req_idx, dst_end_idx) + + # Destination + block_table_ptr = _load_ptr(block_table_ptrs + group_id, tl.int32) + block_table_stride = tl.load(block_table_strides + group_id) + row_ptr = block_table_ptr + req_idx * block_table_stride + + group_new_block_ids_ptr = new_block_ids_ptr + group_id * new_block_ids_stride + for i in range(0, num_new_blocks, BLOCK_SIZE): + offset = i + tl.arange(0, BLOCK_SIZE) + block_ids = tl.load( + group_new_block_ids_ptr + start_idx + offset, mask=offset < num_new_blocks + ) + tl.store( + row_ptr + dst_start_idx + offset, block_ids, mask=offset < num_new_blocks 
+ ) + + +@triton.jit +def _gather_block_tables_kernel( + batch_idx_to_req_idx, # [batch_size] + src_block_table_ptrs, # [num_kv_cache_groups] + dst_block_table_ptrs, # [num_kv_cache_groups] + block_table_strides, # [num_kv_cache_groups] + num_blocks_ptr, # [num_kv_cache_groups, max_num_reqs] + num_blocks_stride, + BLOCK_SIZE: tl.constexpr, +): + # kv cache group id + group_id = tl.program_id(0) + batch_idx = tl.program_id(1) + req_idx = tl.load(batch_idx_to_req_idx + batch_idx) + + group_num_blocks_ptr = num_blocks_ptr + group_id * num_blocks_stride + num_blocks = tl.load(group_num_blocks_ptr + req_idx) + + stride = tl.load(block_table_strides + group_id) + src_block_table_ptr = _load_ptr(src_block_table_ptrs + group_id, tl.int32) + src_row_ptr = src_block_table_ptr + req_idx * stride + dst_block_table_ptr = _load_ptr(dst_block_table_ptrs + group_id, tl.int32) + dst_row_ptr = dst_block_table_ptr + batch_idx * stride + + for i in tl.range(0, num_blocks, BLOCK_SIZE): + offset = i + tl.arange(0, BLOCK_SIZE) + block_ids = tl.load(src_row_ptr + offset, mask=offset < num_blocks) + tl.store(dst_row_ptr + offset, block_ids, mask=offset < num_blocks) + + +@triton.jit +def _compute_slot_mappings_kernel( + num_tokens, + max_num_tokens, + cu_num_tokens, # [num_reqs + 1] + pos, # [num_tokens] + block_table_ptrs, # [num_kv_cache_groups] + block_table_strides, # [num_kv_cache_groups] + page_sizes, # [num_kv_cache_groups] + slot_mappings_ptr, # [num_kv_cache_groups, max_num_tokens] + slot_mappings_stride, + PAD_ID: tl.constexpr, + BLOCK_SIZE: tl.constexpr, +): + # kv cache group id + group_id = tl.program_id(0) + req_idx = tl.program_id(1) + slot_mapping_ptr = slot_mappings_ptr + group_id * slot_mappings_stride + + if req_idx == tl.num_programs(1) - 1: + # Pad remaining slots to -1. This is needed for CUDA graphs. 
+ for i in range(num_tokens, max_num_tokens, BLOCK_SIZE): + offset = i + tl.arange(0, BLOCK_SIZE) + tl.store(slot_mapping_ptr + offset, PAD_ID, mask=offset < max_num_tokens) + return + + block_table_ptr = _load_ptr(block_table_ptrs + group_id, tl.int32) + block_table_stride = tl.load(block_table_strides + group_id) + page_size = tl.load(page_sizes + group_id) + + start_idx = tl.load(cu_num_tokens + req_idx) + end_idx = tl.load(cu_num_tokens + req_idx + 1) + for i in range(start_idx, end_idx, BLOCK_SIZE): + offset = i + tl.arange(0, BLOCK_SIZE) + positions = tl.load(pos + offset, mask=offset < end_idx, other=0) + block_indices = positions // page_size + block_numbers = tl.load( + block_table_ptr + req_idx * block_table_stride + block_indices + ) + slot_ids = block_numbers * page_size + positions % page_size + tl.store(slot_mapping_ptr + offset, slot_ids, mask=offset < end_idx) + + +@triton.jit +def _load_ptr(ptr_to_ptr, elem_dtype): + ptr = tl.load(ptr_to_ptr) + ptr = tl.cast(ptr, tl.pointer_type(elem_dtype)) + return tl.multiple_of(ptr, 16) diff --git a/vllm/v1/worker/gpu/cudagraph_utils.py b/vllm/v1/worker/gpu/cudagraph_utils.py new file mode 100644 index 0000000000000..7fd1f76669f48 --- /dev/null +++ b/vllm/v1/worker/gpu/cudagraph_utils.py @@ -0,0 +1,198 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import gc +from contextlib import contextmanager + +import numpy as np +import torch +import torch.nn as nn +from tqdm import tqdm + +from vllm.config import VllmConfig +from vllm.config.compilation import CUDAGraphMode +from vllm.distributed.parallel_state import graph_capture, is_global_first_rank +from vllm.forward_context import set_forward_context +from vllm.v1.attention.backends.utils import AttentionMetadataBuilder +from vllm.v1.core.sched.output import SchedulerOutput +from vllm.v1.kv_cache_interface import KVCacheConfig +from vllm.v1.worker.gpu.attn_utils import build_attn_metadata +from 
vllm.v1.worker.gpu.block_table import BlockTables +from vllm.v1.worker.gpu.input_batch import InputBuffers + + +class CudaGraphManager: + def __init__( + self, + vllm_config: VllmConfig, + device: torch.device, + ): + self.vllm_config = vllm_config + self.device = device + + self.max_model_len = vllm_config.model_config.max_model_len + self.dp_size = vllm_config.parallel_config.data_parallel_size + self.compilation_config = vllm_config.compilation_config + assert self.compilation_config is not None + + self.cudagraph_mode = self.compilation_config.cudagraph_mode + self.cudagraph_sizes = sorted(self.compilation_config.cudagraph_capture_sizes) + self.padded_sizes = self._init_padded_sizes() + + self.graphs: dict[int, torch.cuda.CUDAGraph] = {} + self.pool = torch.cuda.graph_pool_handle() + self.hidden_states: torch.Tensor | None = None + + def _init_padded_sizes(self) -> dict[int, int]: + if not self.cudagraph_mode.has_full_cudagraphs(): + # Full cuda graphs are not used. + return {} + + padded_sizes: dict[int, int] = {} + assert len(self.cudagraph_sizes) > 0 + for i in range(1, self.cudagraph_sizes[-1] + 1): + for x in self.cudagraph_sizes: + if i <= x: + padded_sizes[i] = x + break + return padded_sizes + + def needs_capture(self) -> bool: + return len(self.padded_sizes) > 0 + + def get_cudagraph_size( + self, + scheduler_output: SchedulerOutput, + num_tokens_after_padding: int, + ) -> int | None: + if not self.cudagraph_mode.has_full_cudagraphs(): + return None + if self.cudagraph_mode != CUDAGraphMode.FULL: + # TODO(woosuk): Support uniform decode with multiple tokens (spec decoding). + all_decode = all( + x == 1 for x in scheduler_output.num_scheduled_tokens.values() + ) + if not all_decode: + # Prefill is included. 
+ return None + return self.padded_sizes.get(num_tokens_after_padding) + + def capture_graph( + self, + batch_size: int, + model: nn.Module, + input_buffers: InputBuffers, + block_tables: BlockTables, + attn_metadata_builders: list[AttentionMetadataBuilder], + kv_cache_config: KVCacheConfig, + ) -> None: + assert batch_size not in self.graphs + + # Prepare dummy inputs. + input_ids = input_buffers.input_ids.gpu[:batch_size] + positions = input_buffers.positions.gpu[:batch_size] + + input_buffers.query_start_loc.np[: batch_size + 1] = np.arange(batch_size + 1) + input_buffers.query_start_loc.np[batch_size:] = batch_size + input_buffers.query_start_loc.copy_to_gpu() + input_buffers.seq_lens.np[:batch_size] = self.max_model_len + input_buffers.seq_lens.np[batch_size:] = 0 + input_buffers.seq_lens.copy_to_gpu() + + input_block_tables = [x[:batch_size] for x in block_tables.input_block_tables] + slot_mappings = block_tables.slot_mappings[:, :batch_size] + + attn_metadata = build_attn_metadata( + attn_metadata_builders=attn_metadata_builders, + num_reqs=batch_size, + num_tokens=batch_size, + query_start_loc=input_buffers.query_start_loc, + seq_lens=input_buffers.seq_lens, + num_computed_tokens_cpu=None, # FIXME + block_tables=input_block_tables, + slot_mappings=slot_mappings, + kv_cache_config=kv_cache_config, + ) + if self.dp_size > 1: + num_tokens_across_dp = torch.full( + (self.dp_size,), + batch_size, + dtype=torch.int32, + device="cpu", + ) + else: + num_tokens_across_dp = None + + # Warm up. + with set_forward_context( + attn_metadata, + self.vllm_config, + num_tokens=batch_size, + num_tokens_across_dp=num_tokens_across_dp, + ): + hidden_states = model( + input_ids=input_ids, + positions=positions, + ) + if self.hidden_states is None: + self.hidden_states = torch.empty_like(hidden_states) + torch.cuda.synchronize() + + # Capture the graph. 
+ graph = torch.cuda.CUDAGraph() + with ( + set_forward_context( + attn_metadata, + self.vllm_config, + num_tokens=batch_size, + num_tokens_across_dp=num_tokens_across_dp, + ), + torch.cuda.graph(graph, self.pool), + ): + hidden_states = model( + input_ids=input_ids, + positions=positions, + ) + self.hidden_states[:batch_size] = hidden_states + self.graphs[batch_size] = graph + + @torch.inference_mode() + def capture( + self, + model: nn.Module, + input_buffers: InputBuffers, + block_tables: BlockTables, + attn_metadata_builders: list[AttentionMetadataBuilder], + kv_cache_config: KVCacheConfig, + ) -> None: + assert self.needs_capture() + # Capture larger graphs first. + sizes_to_capture = sorted(self.cudagraph_sizes, reverse=True) + if is_global_first_rank(): + sizes_to_capture = tqdm(sizes_to_capture, desc="Capturing CUDA graphs") + + with freeze_gc(), graph_capture(device=self.device): + for batch_size in sizes_to_capture: + self.capture_graph( + batch_size, + model, + input_buffers, + block_tables, + attn_metadata_builders, + kv_cache_config, + ) + + def run(self, batch_size: int) -> torch.Tensor: + assert batch_size in self.graphs + self.graphs[batch_size].replay() + assert self.hidden_states is not None + return self.hidden_states[:batch_size] + + +@contextmanager +def freeze_gc(): + gc.collect() + gc.freeze() + try: + yield + finally: + gc.unfreeze() diff --git a/vllm/v1/worker/gpu/dp_utils.py b/vllm/v1/worker/gpu/dp_utils.py new file mode 100644 index 0000000000000..9bfc7f25bef3a --- /dev/null +++ b/vllm/v1/worker/gpu/dp_utils.py @@ -0,0 +1,22 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import torch +import torch.distributed as dist + +from vllm.distributed.parallel_state import get_dp_group + + +def get_batch_metadata_across_dp( + num_tokens: int, + cudagraph_size: int, + dp_size: int, + dp_rank: int, +) -> tuple[torch.Tensor, torch.Tensor]: + assert dp_size > 1 + # Use CPU group to avoid 
CPU-GPU synchronization. + group = get_dp_group().cpu_group + tensor = torch.zeros(2, dp_size, dtype=torch.int32, device="cpu") + tensor[0][dp_rank] = num_tokens + tensor[1][dp_rank] = cudagraph_size + dist.all_reduce(tensor, group=group) + return tensor[0], tensor[1] diff --git a/vllm/v1/worker/gpu/input_batch.py b/vllm/v1/worker/gpu/input_batch.py new file mode 100644 index 0000000000000..89f375649146f --- /dev/null +++ b/vllm/v1/worker/gpu/input_batch.py @@ -0,0 +1,265 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +from dataclasses import dataclass +from typing import Any + +import numba +import numba.types as types +import numpy as np +import torch +import triton +import triton.language as tl + +from vllm.utils import random_uuid +from vllm.utils.math_utils import cdiv +from vllm.v1.utils import CpuGpuBuffer + + +class InputBuffers: + def __init__( + self, + max_num_reqs: int, + max_num_tokens: int, + hidden_size: int, + vocab_size: int, + dtype: torch.dtype, + device: torch.device, + pin_memory: bool, + ): + self.max_num_reqs = max_num_reqs + self.max_num_tokens = max_num_tokens + self.device = device + self.pin_memory = pin_memory + + self.idx_mapping = self._make_buffer(max_num_reqs, dtype=torch.int32) + self.input_ids = self._make_buffer(max_num_tokens, dtype=torch.int32) + self.positions = self._make_buffer(max_num_tokens, dtype=torch.int64) + self.query_start_loc = self._make_buffer(max_num_reqs + 1, dtype=torch.int32) + self.seq_lens = self._make_buffer(max_num_reqs, dtype=torch.int32) + + # Structured outputs. 
+ self.bitmask_indices = self._make_buffer(max_num_reqs, dtype=torch.int32) + self.grammar_bitmask = self._make_buffer( + max_num_reqs, cdiv(vocab_size, 32), dtype=torch.int32 + ) + + def _make_buffer(self, *args, dtype: torch.dtype) -> CpuGpuBuffer: + return CpuGpuBuffer( + *args, dtype=dtype, pin_memory=self.pin_memory, device=self.device + ) + + +@dataclass +class InputBatch: + # batch_idx -> req_id + req_ids: list[str] + num_reqs: int + + # batch_idx -> req_state_idx + idx_mapping: torch.Tensor + idx_mapping_np: np.ndarray + + # [num_reqs] + # batch_idx -> num_scheduled_tokens + num_scheduled_tokens: np.ndarray + # sum(num_scheduled_tokens) + num_tokens: int + num_tokens_after_padding: int + + # [num_reqs + 1] + query_start_loc: torch.Tensor + query_start_loc_np: np.ndarray + # [num_reqs] + seq_lens: torch.Tensor + seq_lens_np: np.ndarray + + # [num_tokens_after_padding] + input_ids: torch.Tensor + # [num_tokens_after_padding] + positions: torch.Tensor + + # layer_name -> Metadata + attn_metadata: dict[str, Any] + + # [num_reqs] + logits_indices: torch.Tensor + + @classmethod + def make_dummy( + cls, + num_reqs: int, + num_tokens: int, + input_buffers: InputBuffers, + device: torch.device, + ) -> "InputBatch": + assert 0 < num_reqs <= num_tokens + req_ids = [f"req_{i}_{random_uuid()}" for i in range(num_reqs)] + idx_mapping_np = np.arange(num_reqs, dtype=np.int32) + idx_mapping = torch.arange(num_reqs, dtype=torch.int32, device=device) + num_scheduled_tokens = np.full(num_reqs, num_tokens // num_reqs, dtype=np.int32) + num_scheduled_tokens[-1] += num_tokens % num_reqs + assert int(num_scheduled_tokens.sum()) == num_tokens + + input_buffers.query_start_loc.np[0] = 0 + input_buffers.query_start_loc.np[1 : num_reqs + 1] = np.cumsum( + num_scheduled_tokens + ) + input_buffers.query_start_loc.np[num_reqs + 1 :] = num_tokens + query_start_loc_np = input_buffers.query_start_loc.np[: num_reqs + 1] + query_start_loc = input_buffers.query_start_loc.copy_to_gpu()[: 
num_reqs + 1]
+        # seq_len equals query_len
+        input_buffers.seq_lens.np[:num_reqs] = num_scheduled_tokens
+        input_buffers.seq_lens.np[num_reqs:] = 0
+        seq_lens_np = input_buffers.seq_lens.np[:num_reqs]
+        seq_lens = input_buffers.seq_lens.copy_to_gpu()[:num_reqs]
+
+        input_ids = input_buffers.input_ids.copy_to_gpu(num_tokens)
+        positions = input_buffers.positions.copy_to_gpu(num_tokens)
+        # attn_metadata = defaultdict(lambda: None)
+        logits_indices = query_start_loc[1:] - 1
+        return cls(
+            req_ids=req_ids,
+            num_reqs=num_reqs,
+            idx_mapping=idx_mapping,
+            idx_mapping_np=idx_mapping_np,
+            num_scheduled_tokens=num_scheduled_tokens,
+            num_tokens=num_tokens,
+            num_tokens_after_padding=num_tokens,
+            query_start_loc=query_start_loc,
+            query_start_loc_np=query_start_loc_np,
+            seq_lens=seq_lens,
+            seq_lens_np=seq_lens_np,
+            input_ids=input_ids,
+            positions=positions,
+            attn_metadata=None,  # type: ignore
+            logits_indices=logits_indices,
+        )
+
+
+# NOTE: With the type annotations, this function is pre-compiled
+# before the first call.
+@numba.jit(
+    [
+        types.none(
+            types.int32[:],  # idx_mapping
+            types.int32[:, :],  # token_ids
+            types.int32[:],  # num_computed_tokens
+            types.int32[:],  # num_scheduled_tokens
+            types.int32[:],  # input_ids
+            types.int64[:],  # positions
+            types.int32[:],  # query_start_loc
+            types.int32[:],  # seq_lens
+        )
+    ],
+    nopython=True,
+    cache=True,
+)
+def _prepare_inputs(
+    idx_mapping: np.ndarray,  # batch_idx -> req_idx
+    token_ids: np.ndarray,  # [N, max_model_len]
+    num_computed_tokens: np.ndarray,  # [N]
+    num_scheduled_tokens: np.ndarray,  # [B]
+    input_ids: np.ndarray,  # [num_input_tokens]
+    positions: np.ndarray,  # [num_input_tokens]
+    query_start_loc: np.ndarray,  # [B + 1]
+    seq_lens: np.ndarray,  # [B]
+) -> None:
+    num_reqs = num_scheduled_tokens.shape[0]
+    query_start_loc[0] = 0
+
+    cu_num_tokens = 0
+    for i in range(num_reqs):
+        req_idx = idx_mapping[i]
+        query_len = num_scheduled_tokens[i]
+        start = num_computed_tokens[req_idx]
+        end = start + query_len
+        seq_lens[i] = end
+
+        start_idx = cu_num_tokens
+        end_idx = start_idx + query_len
+        input_ids[start_idx:end_idx] = token_ids[req_idx, start:end]
+        positions[start_idx:end_idx] = np.arange(start, end, dtype=np.int64)
+
+        cu_num_tokens = end_idx
+        query_start_loc[i + 1] = cu_num_tokens
+
+    # Pad the inputs for CUDA graphs.
+    # Note: pad query_start_loc to be non-decreasing, as kernels
+    # like FlashAttention require that
+    query_start_loc[num_reqs + 1 :].fill(cu_num_tokens)
+    # Fill unused with 0 for full cuda graph mode.
+ seq_lens[num_reqs:].fill(0) + + +def prepare_inputs( + idx_mapping: np.ndarray, + prefill_token_ids: np.ndarray, + num_computed_tokens: np.ndarray, + num_scheduled_tokens: np.ndarray, + input_ids: CpuGpuBuffer, + positions: CpuGpuBuffer, + query_start_loc: CpuGpuBuffer, + seq_lens: CpuGpuBuffer, + num_tokens: int, +) -> None: + _prepare_inputs( + idx_mapping, + prefill_token_ids, + num_computed_tokens, + num_scheduled_tokens, + input_ids.np, + positions.np, + query_start_loc.np, + seq_lens.np, + ) + input_ids.copy_to_gpu(num_tokens) + positions.copy_to_gpu(num_tokens) + # NOTE(woosuk): We should copy the whole query_start_loc and seq_lens + # tensors from CPU to GPU, because they may include paddings needed + # for full CUDA graph mode. + query_start_loc.copy_to_gpu() + seq_lens.copy_to_gpu() + return + + +@triton.jit +def _combine_last_token_ids_kernel( + input_ids_ptr, + idx_mapping_ptr, + last_token_ids_ptr, + query_start_loc_ptr, + seq_lens_ptr, + prefill_len_ptr, +): + batch_idx = tl.program_id(0) + req_state_idx = tl.load(idx_mapping_ptr + batch_idx) + + seq_len = tl.load(seq_lens_ptr + batch_idx) + prefill_len = tl.load(prefill_len_ptr + req_state_idx) + if seq_len <= prefill_len: + # Handling prefill tokens. 
+ return + + last_token_id = tl.load(last_token_ids_ptr + req_state_idx) + end = tl.load(query_start_loc_ptr + batch_idx + 1) + tl.store(input_ids_ptr + end - 1, last_token_id) + + +def combine_last_token_ids( + input_ids: torch.Tensor, + idx_mapping: torch.Tensor, + last_token_ids: torch.Tensor, + query_start_loc: torch.Tensor, + seq_lens: torch.Tensor, + prefill_len: torch.Tensor, +) -> torch.Tensor: + num_reqs = seq_lens.shape[0] + _combine_last_token_ids_kernel[(num_reqs,)]( + input_ids, + idx_mapping, + last_token_ids, + query_start_loc, + seq_lens, + prefill_len, + ) + return input_ids diff --git a/vllm/v1/worker/gpu/model_runner.py b/vllm/v1/worker/gpu/model_runner.py new file mode 100644 index 0000000000000..08aad9ddd06b3 --- /dev/null +++ b/vllm/v1/worker/gpu/model_runner.py @@ -0,0 +1,814 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import gc +import time +from copy import deepcopy +from typing import Any + +import numpy as np +import torch +import torch.nn as nn + +from vllm.config import VllmConfig +from vllm.config.compilation import CUDAGraphMode +from vllm.forward_context import set_forward_context +from vllm.logger import init_logger +from vllm.model_executor.model_loader import get_model_loader +from vllm.utils.mem_constants import GiB_bytes +from vllm.utils.mem_utils import DeviceMemoryProfiler +from vllm.utils.platform_utils import is_pin_memory_available +from vllm.utils.torch_utils import STR_DTYPE_TO_TORCH_DTYPE +from vllm.v1.core.sched.output import GrammarOutput, SchedulerOutput +from vllm.v1.kv_cache_interface import KVCacheConfig +from vllm.v1.outputs import ( + EMPTY_MODEL_RUNNER_OUTPUT, + LogprobsTensors, + ModelRunnerOutput, +) +from vllm.v1.sample.sampler import SamplerOutput +from vllm.v1.worker.gpu.async_utils import AsyncOutput, async_barrier +from vllm.v1.worker.gpu.attn_utils import ( + build_attn_metadata, + get_kv_cache_spec, + init_attn_backend, + 
init_kv_cache, +) +from vllm.v1.worker.gpu.block_table import BlockTables +from vllm.v1.worker.gpu.cudagraph_utils import CudaGraphManager +from vllm.v1.worker.gpu.dp_utils import get_batch_metadata_across_dp +from vllm.v1.worker.gpu.input_batch import ( + InputBatch, + InputBuffers, + combine_last_token_ids, + prepare_inputs, +) +from vllm.v1.worker.gpu.sampler import Sampler, compute_prompt_logprobs +from vllm.v1.worker.gpu.states import RequestState, SamplingMetadata +from vllm.v1.worker.gpu.structured_outputs import apply_grammar_bitmask +from vllm.v1.worker.kv_connector_model_runner_mixin import KVConnectorModelRunnerMixin +from vllm.v1.worker.lora_model_runner_mixin import LoRAModelRunnerMixin + +logger = init_logger(__name__) + + +class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): + def __init__( + self, + vllm_config: VllmConfig, + device: torch.device, + ): + self.vllm_config = vllm_config + self.model_config = vllm_config.model_config + self.cache_config = vllm_config.cache_config + self.compilation_config = vllm_config.compilation_config + self.lora_config = vllm_config.lora_config + self.load_config = vllm_config.load_config + self.parallel_config = vllm_config.parallel_config + self.scheduler_config = vllm_config.scheduler_config + self.speculative_config = vllm_config.speculative_config + self.observability_config = vllm_config.observability_config + + self.device = device + self.pin_memory = is_pin_memory_available() + self.dtype = self.model_config.dtype + self.kv_cache_dtype = self.dtype + if self.cache_config.cache_dtype != "auto": + # Quantized KV cache. 
+ self.kv_cache_dtype = STR_DTYPE_TO_TORCH_DTYPE[ + self.cache_config.cache_dtype + ] + self.is_pooling_model = False + + self.vocab_size = self.model_config.get_vocab_size() + self.max_model_len = self.model_config.max_model_len + self.max_num_tokens = self.scheduler_config.max_num_batched_tokens + self.max_num_reqs = self.scheduler_config.max_num_seqs + self.hidden_size = self.model_config.get_hidden_size() + + self.dp_size = self.parallel_config.data_parallel_size + self.dp_rank = self.parallel_config.data_parallel_rank + + self.use_async_scheduling = self.scheduler_config.async_scheduling + self.output_copy_stream = torch.cuda.Stream(self.device) + self.output_copy_event = torch.cuda.Event() + if self.use_async_scheduling: + self.input_prep_event = torch.cuda.Event() + self.structured_outputs_event = torch.cuda.Event() + else: + self.input_prep_event = None + self.structured_outputs_event = None + + self.req_states = RequestState( + max_num_reqs=self.max_num_reqs, + max_model_len=self.max_model_len, + max_num_batched_tokens=self.max_num_tokens, + vocab_size=self.vocab_size, + device=self.device, + pin_memory=self.pin_memory, + ) + self.input_buffers = InputBuffers( + max_num_reqs=self.max_num_reqs, + max_num_tokens=self.max_num_tokens, + hidden_size=self.hidden_size, + vocab_size=self.vocab_size, + dtype=self.dtype, + device=self.device, + pin_memory=self.pin_memory, + ) + self.sampler = Sampler(logprobs_mode=self.model_config.logprobs_mode) + + # CUDA graphs. 
+ self.cudagraph_manager = CudaGraphManager( + vllm_config=self.vllm_config, + device=self.device, + ) + + def get_supported_tasks(self) -> tuple[str]: + return ("generate",) + + def load_model(self, *args, **kwargs) -> None: + time_before_load = time.perf_counter() + with DeviceMemoryProfiler() as m: + model_loader = get_model_loader(self.vllm_config.load_config) + logger.info("Loading model from scratch...") + + self.model = model_loader.load_model( + vllm_config=self.vllm_config, + model_config=self.vllm_config.model_config, + ) + if self.lora_config: + self.model = self.load_lora_model( + self.model, + self.vllm_config, + self.device, + ) + time_after_load = time.perf_counter() + + self.model_memory_usage = m.consumed_memory + logger.info( + "Model loading took %.4f GiB and %.6f seconds", + m.consumed_memory / GiB_bytes, + time_after_load - time_before_load, + ) + + def get_model(self) -> nn.Module: + return self.model + + def get_kv_cache_spec(self): + return get_kv_cache_spec(self.vllm_config) + + def initialize_kv_cache(self, kv_cache_config: KVCacheConfig) -> None: + kv_cache_config = deepcopy(kv_cache_config) + self.kv_cache_config = kv_cache_config + block_sizes = [ + kv_cache_group.kv_cache_spec.block_size + for kv_cache_group in kv_cache_config.kv_cache_groups + ] + + self.block_tables = BlockTables( + block_sizes=block_sizes, + max_num_reqs=self.max_num_reqs, + max_num_batched_tokens=self.max_num_tokens, + max_model_len=self.max_model_len, + device=self.device, + pin_memory=self.pin_memory, + ) + + self.attn_backends, self.attn_metadata_builders = init_attn_backend( + self.kv_cache_config, + self.vllm_config, + self.device, + ) + + self.kv_caches: list[torch.Tensor] = [] + init_kv_cache( + self.kv_caches, + self.compilation_config.static_forward_context, + self.kv_cache_config, + self.attn_backends, + self.device, + ) + # Attention groups are not supported. 
+ self.attn_groups = [] # type: ignore + + def prepare_dummy_attn_metadata(self, input_batch: InputBatch) -> None: + block_tables = self.block_tables.get_dummy_block_tables(input_batch.num_reqs) + slot_mappings = self.block_tables.get_dummy_slot_mappings( + input_batch.num_tokens + ) + num_computed_tokens_cpu = torch.zeros( + input_batch.num_reqs, dtype=torch.int32, device="cpu" + ) + attn_metadata = build_attn_metadata( + attn_metadata_builders=self.attn_metadata_builders, + num_reqs=input_batch.num_reqs, + num_tokens=input_batch.num_tokens, + query_start_loc=self.input_buffers.query_start_loc, + seq_lens=self.input_buffers.seq_lens, + num_computed_tokens_cpu=num_computed_tokens_cpu, + block_tables=block_tables, + slot_mappings=slot_mappings, + kv_cache_config=self.kv_cache_config, + ) + input_batch.attn_metadata = attn_metadata + + @torch.inference_mode() + def _dummy_run( + self, + num_tokens: int, + *args, + skip_attn: bool = True, + **kwargs, + ) -> tuple[torch.Tensor, torch.Tensor]: + num_reqs = min(num_tokens, self.max_num_reqs) + input_batch = InputBatch.make_dummy( + num_reqs=num_reqs, + num_tokens=num_tokens, + input_buffers=self.input_buffers, + device=self.device, + ) + if not skip_attn: + self.prepare_dummy_attn_metadata(input_batch) + + if self.dp_size == 1: + num_tokens_across_dp: torch.Tensor | None = None + else: + num_tokens_across_dp = torch.full( + (self.dp_size,), num_tokens, dtype=torch.int32, device="cpu" + ) + num_sampled_tokens = np.ones(input_batch.num_reqs, dtype=np.int32) + with ( + self.maybe_dummy_run_with_lora( + self.lora_config, + input_batch.num_scheduled_tokens, + num_sampled_tokens, + ), + set_forward_context( + input_batch.attn_metadata, + self.vllm_config, + num_tokens=num_tokens, + num_tokens_across_dp=num_tokens_across_dp, + ), + ): + hidden_states = self.model( + input_ids=input_batch.input_ids, + positions=input_batch.positions, + ) + sample_hidden_states = hidden_states[input_batch.logits_indices] + return hidden_states, 
sample_hidden_states + + @torch.inference_mode() + def _dummy_sampler_run( + self, + hidden_states: torch.Tensor, + ) -> None: + num_reqs = hidden_states.shape[0] + sampling_metadata = SamplingMetadata.make_dummy( + num_reqs=num_reqs, + device=self.device, + ) + logits = self.model.compute_logits(hidden_states) + self.sampler(logits, sampling_metadata) + + @torch.inference_mode() + def profile_run(self) -> None: + hidden_states, sample_hidden_states = self._dummy_run( + self.max_num_tokens, + skip_attn=True, + ) + self._dummy_sampler_run(sample_hidden_states) + torch.cuda.synchronize() + del hidden_states, sample_hidden_states + gc.collect() + + def reset_mm_cache(self) -> None: + pass + + def _get_num_input_tokens(self, num_scheduled_tokens: int) -> int: + # SP is not supported yet. + return num_scheduled_tokens + + @torch.inference_mode() + def capture_model(self) -> int: + if not self.cudagraph_manager.needs_capture(): + logger.warning( + "Skipping CUDA graph capture. To turn on CUDA graph capture, " + "ensure `cudagraph_mode` was not manually set to `NONE`" + ) + return 0 + + start_time = time.perf_counter() + start_free_gpu_memory = torch.cuda.mem_get_info()[0] + + with self.maybe_setup_dummy_loras(self.lora_config): + self.cudagraph_manager.capture( + model=self.model, + input_buffers=self.input_buffers, + block_tables=self.block_tables, + attn_metadata_builders=self.attn_metadata_builders, + kv_cache_config=self.kv_cache_config, + ) + + end_time = time.perf_counter() + end_free_gpu_memory = torch.cuda.mem_get_info()[0] + elapsed_time = end_time - start_time + cuda_graph_size = start_free_gpu_memory - end_free_gpu_memory + # This usually takes 5~20 seconds. + logger.info( + "Graph capturing finished in %.0f secs, took %.2f GiB", + elapsed_time, + cuda_graph_size / (1 << 30), + ) + return cuda_graph_size + + def warmup_for_prefill(self) -> None: + # For FlashInfer, we would like to execute a dummy prefill run + # to trigger JIT compilation. 
+ if all("FLASHINFER" in b.get_name() for b in self.attn_backends.values()): + self._dummy_run(self.max_num_tokens, skip_attn=False) + torch.cuda.synchronize() + + def update_states(self, scheduler_output: SchedulerOutput) -> None: + for req_id in scheduler_output.preempted_req_ids: + self.req_states.remove_request(req_id) + for req_id in scheduler_output.finished_req_ids: + self.req_states.remove_request(req_id) + + # TODO(woosuk): Change SchedulerOutput. + req_indices: list[int] = [] + cu_num_new_blocks = tuple( + [0] for _ in range(self.block_tables.num_kv_cache_groups) + ) + new_block_ids: tuple[list[int], ...] = tuple( + [] for _ in range(self.block_tables.num_kv_cache_groups) + ) + overwrite: list[bool] = [] + + # Add new requests. + for new_req_data in scheduler_output.scheduled_new_reqs: + req_id = new_req_data.req_id + self.req_states.add_request( + req_id=req_id, + prompt_len=len(new_req_data.prompt_token_ids), + prefill_token_ids=new_req_data.prefill_token_ids, + num_computed_tokens=new_req_data.num_computed_tokens, + sampling_params=new_req_data.sampling_params, + lora_request=new_req_data.lora_request, + ) + + req_index = self.req_states.req_id_to_index[req_id] + req_indices.append(req_index) + for i, block_ids in enumerate(new_req_data.block_ids): + x = cu_num_new_blocks[i][-1] + cu_num_new_blocks[i].append(x + len(block_ids)) + new_block_ids[i].extend(block_ids) + overwrite.append(True) + + # Add new blocks for the existing requests. 
+ cached_reqs = scheduler_output.scheduled_cached_reqs + for i, req_id in enumerate(cached_reqs.req_ids): + req_index = self.req_states.req_id_to_index[req_id] + + req_new_block_ids = cached_reqs.new_block_ids[i] + if req_new_block_ids is not None: + req_indices.append(req_index) + for group_id, block_ids in enumerate(req_new_block_ids): + x = cu_num_new_blocks[group_id][-1] + cu_num_new_blocks[group_id].append(x + len(block_ids)) + new_block_ids[group_id].extend(block_ids) + overwrite.append(False) + + if req_indices: + self.block_tables.append_block_ids( + req_indices=req_indices, + cu_num_new_blocks=cu_num_new_blocks, + new_block_ids=new_block_ids, + overwrite=overwrite, + ) + + def prepare_inputs( + self, + scheduler_output: SchedulerOutput, + num_tokens_after_padding: int, + ) -> InputBatch: + num_tokens = scheduler_output.total_num_scheduled_tokens + assert num_tokens > 0 + num_reqs = len(scheduler_output.num_scheduled_tokens) + + # Decode first, then prefill. + # batch_idx -> req_id + req_ids = sorted( + scheduler_output.num_scheduled_tokens, + key=scheduler_output.num_scheduled_tokens.get, + ) + num_scheduled_tokens = np.array( + [scheduler_output.num_scheduled_tokens[i] for i in req_ids], dtype=np.int32 + ) + + idx_mapping_list = [ + self.req_states.req_id_to_index[req_id] for req_id in req_ids + ] + idx_mapping = self.input_buffers.idx_mapping + idx_mapping.np[:num_reqs] = idx_mapping_list + idx_mapping_np = idx_mapping.np[:num_reqs] + idx_mapping = idx_mapping.copy_to_gpu(num_reqs) + + # Block tables: num_kv_cache_groups x [num_reqs, max_num_blocks] + block_tables = self.block_tables.gather_block_tables(idx_mapping) + + prepare_inputs( + idx_mapping_np, + self.req_states.prefill_token_ids, + self.req_states.num_computed_tokens, + num_scheduled_tokens, + self.input_buffers.input_ids, + self.input_buffers.positions, + self.input_buffers.query_start_loc, + self.input_buffers.seq_lens, + num_tokens, + ) + + query_start_loc = 
self.input_buffers.query_start_loc + query_start_loc_gpu = query_start_loc.gpu[: num_reqs + 1] + query_start_loc_np = query_start_loc.np[: num_reqs + 1] + seq_lens_gpu = self.input_buffers.seq_lens.gpu[:num_reqs] + seq_lens_np = self.input_buffers.seq_lens.np[:num_reqs] + + # Some input token ids are directly read from the last sampled tokens. + combine_last_token_ids( + self.input_buffers.input_ids.gpu, + idx_mapping, + self.req_states.last_sampled_tokens, + query_start_loc_gpu, + seq_lens_gpu, + self.req_states.prefill_len.copy_to_gpu(), + ) + + # Compute slot mappings: [num_kv_cache_groups, num_tokens] + slot_mappings = self.block_tables.compute_slot_mappings( + query_start_loc_gpu, self.input_buffers.positions.gpu[:num_tokens] + ) + + num_computed_tokens_cpu = torch.from_numpy( + self.req_states.num_computed_tokens[idx_mapping_np] + ) + + # Logits indices to sample next token from. + logits_indices = query_start_loc_gpu[1:] - 1 + + # Layer name -> attention metadata. + attn_metadata = build_attn_metadata( + attn_metadata_builders=self.attn_metadata_builders, + num_reqs=num_reqs, + num_tokens=num_tokens, + query_start_loc=self.input_buffers.query_start_loc, + seq_lens=self.input_buffers.seq_lens, + num_computed_tokens_cpu=num_computed_tokens_cpu, + block_tables=block_tables, + slot_mappings=slot_mappings, + kv_cache_config=self.kv_cache_config, + ) + + input_ids = self.input_buffers.input_ids.gpu[:num_tokens_after_padding] + positions = self.input_buffers.positions.gpu[:num_tokens_after_padding] + return InputBatch( + req_ids=req_ids, + num_reqs=num_reqs, + idx_mapping=idx_mapping, + idx_mapping_np=idx_mapping_np, + num_scheduled_tokens=num_scheduled_tokens, + num_tokens=num_tokens, + num_tokens_after_padding=num_tokens_after_padding, + query_start_loc=query_start_loc_gpu, + query_start_loc_np=query_start_loc_np, + seq_lens=seq_lens_gpu, + seq_lens_np=seq_lens_np, + input_ids=input_ids, + positions=positions, + attn_metadata=attn_metadata, + 
logits_indices=logits_indices, + ) + + def sample( + self, + hidden_states: torch.Tensor, + input_batch: InputBatch, + sampling_metadata: SamplingMetadata, + grammar_output: GrammarOutput | None, + ) -> SamplerOutput: + sample_hidden_states = hidden_states[input_batch.logits_indices] + logits = self.model.compute_logits(sample_hidden_states) + if grammar_output is not None: + # Apply grammar bitmask to the logits in-place. + with async_barrier(self.structured_outputs_event): + apply_grammar_bitmask( + logits, + input_batch.req_ids, + grammar_output.structured_output_request_ids, + grammar_output.grammar_bitmask, + self.input_buffers, + ) + sampler_output = self.sampler(logits, sampling_metadata) + return sampler_output + + def compute_prompt_logprobs( + self, + hidden_states: torch.Tensor, + input_batch: InputBatch, + ) -> dict[str, LogprobsTensors]: + idx_mapping_np = input_batch.idx_mapping_np + needs_prompt_logprobs = self.req_states.needs_prompt_logprobs[idx_mapping_np] + if not np.any(needs_prompt_logprobs): + # No request asks for prompt logprobs. + return {} + + num_computed_tokens = self.req_states.num_computed_tokens[idx_mapping_np] + prompt_lens = self.req_states.prompt_len[idx_mapping_np] + # NOTE(woosuk): -1 because the last prompt token's hidden state is not + # needed for prompt logprobs. + includes_prompt = num_computed_tokens < prompt_lens - 1 + # NOTE(woosuk): If the request was resumed after preemption, its prompt + # logprobs must have been computed before preemption. Skip. + resumed_after_prompt = ( + prompt_lens < self.req_states.prefill_len.np[idx_mapping_np] + ) + needs_prompt_logprobs &= includes_prompt & ~resumed_after_prompt + if not np.any(needs_prompt_logprobs): + return {} + + # Just to be safe, clone the input ids. + n = input_batch.num_tokens + # Shift the input ids by one. 
+ token_ids = torch.empty_like(input_batch.input_ids[:n]) + token_ids[: n - 1] = input_batch.input_ids[1:n] + # To avoid out-of-bound access, set the last token id to 0. + token_ids[n - 1] = 0 + + # Handle chunked prompts. + seq_lens = self.input_buffers.seq_lens.np[: input_batch.num_reqs] + is_prompt_chunked = seq_lens < prompt_lens + prefill_token_ids = self.req_states.prefill_token_ids + query_start_loc = self.input_buffers.query_start_loc.np + for i, req_id in enumerate(input_batch.req_ids): + if not needs_prompt_logprobs[i]: + continue + if not is_prompt_chunked[i]: + continue + # The prompt is chunked. Get the next prompt token. + req_idx = input_batch.idx_mapping_np[i] + next_prompt_token = int(prefill_token_ids[req_idx, seq_lens[i]]) + idx = int(query_start_loc[i + 1] - 1) + # Set the next prompt token. + # NOTE(woosuk): This triggers a GPU operation. + token_ids[idx] = next_prompt_token + + # NOTE(woosuk): We mask out logprobs for negative tokens. + prompt_logprobs, prompt_ranks = compute_prompt_logprobs( + token_ids, + hidden_states[:n], + self.model.compute_logits, + ) + + prompt_token_ids = token_ids.unsqueeze(-1) + prompt_logprobs_dict: dict[str, LogprobsTensors] = {} + for i, req_id in enumerate(input_batch.req_ids): + if not needs_prompt_logprobs[i]: + continue + + start_idx = query_start_loc[i] + end_idx = query_start_loc[i + 1] + assert start_idx < end_idx, ( + f"start_idx ({start_idx}) >= end_idx ({end_idx})" + ) + logprobs = LogprobsTensors( + logprob_token_ids=prompt_token_ids[start_idx:end_idx], + logprobs=prompt_logprobs[start_idx:end_idx], + selected_token_ranks=prompt_ranks[start_idx:end_idx], + ) + + req_extra_data = self.req_states.extra_data[req_id] + prompt_logprobs_list = req_extra_data.in_progress_prompt_logprobs + if is_prompt_chunked[i]: + # Prompt is chunked. Do not return the logprobs yet. + prompt_logprobs_list.append(logprobs) + continue + + if prompt_logprobs_list: + # Merge the in-progress logprobs. 
+ prompt_logprobs_list.append(logprobs) + logprobs = LogprobsTensors( + logprob_token_ids=torch.cat( + [x.logprob_token_ids for x in prompt_logprobs_list] + ), + logprobs=torch.cat([x.logprobs for x in prompt_logprobs_list]), + selected_token_ranks=torch.cat( + [x.selected_token_ranks for x in prompt_logprobs_list] + ), + ) + prompt_logprobs_list.clear() + + prompt_logprobs_dict[req_id] = logprobs + return prompt_logprobs_dict + + def postprocess( + self, + sampler_output: SamplerOutput, + prompt_logprobs_dict: dict[str, LogprobsTensors], + input_batch: InputBatch, + ) -> AsyncOutput | ModelRunnerOutput: + # Store the last sampled token ids. + self.req_states.last_sampled_tokens[input_batch.idx_mapping] = ( + sampler_output.sampled_token_ids + ) + # Get the number of sampled tokens. + # 0 if chunked-prefilling, 1 if not. + idx_mapping_np = input_batch.idx_mapping_np + is_chunked_prefilling = ( + input_batch.seq_lens_np < self.req_states.num_tokens[idx_mapping_np] + ) + num_sampled_tokens = (~is_chunked_prefilling).astype(np.int32) + # Increment the number of tokens. + self.req_states.num_tokens[idx_mapping_np] += num_sampled_tokens + # Increment the number of computed tokens. 
+ self.req_states.num_computed_tokens[idx_mapping_np] += ( + input_batch.num_scheduled_tokens + ) + + model_runner_output = ModelRunnerOutput( + req_ids=input_batch.req_ids, + req_id_to_index={req_id: i for i, req_id in enumerate(input_batch.req_ids)}, + sampled_token_ids=None, + logprobs=None, + prompt_logprobs_dict=prompt_logprobs_dict, + pooler_output=[], + kv_connector_output=None, + num_nans_in_logits=None, + ) + async_output = AsyncOutput( + model_runner_output=model_runner_output, + sampler_output=sampler_output, + num_sampled_tokens=num_sampled_tokens, + copy_stream=self.output_copy_stream, + copy_event=self.output_copy_event, + ) + if self.use_async_scheduling: + return async_output + return async_output.get_output() + + def get_cudagraph_and_dp_padding( + self, + scheduler_output: SchedulerOutput, + ) -> tuple[CUDAGraphMode, int, torch.Tensor | None]: + total_num_scheduled_tokens = scheduler_output.total_num_scheduled_tokens + if self.dp_size == 1: + # No DP. Only consider CUDA graphs. + if total_num_scheduled_tokens == 0: + # Special case: no tokens to run. + return CUDAGraphMode.NONE, 0, None + + cudagraph_size = self.cudagraph_manager.get_cudagraph_size( + scheduler_output, total_num_scheduled_tokens + ) + if cudagraph_size is not None: + # Use full CUDA graph. + return CUDAGraphMode.FULL, cudagraph_size, None + # Fall back to eager mode. + # TODO(woosuk): Support piecewise CUDA graphs. + return CUDAGraphMode.NONE, total_num_scheduled_tokens, None + + # Consider DP padding and CUDA graph. + if total_num_scheduled_tokens == 0: + # Special handling is needed for 0. 
+ cudagraph_size_before_dp: int | None = 0 + else: + cudagraph_size_before_dp = self.cudagraph_manager.get_cudagraph_size( + scheduler_output, total_num_scheduled_tokens + ) + if cudagraph_size_before_dp is None: + cudagraph_size_before_dp = -1 + + assert cudagraph_size_before_dp is not None + num_tokens_across_dp, cudagraph_size_across_dp = get_batch_metadata_across_dp( + total_num_scheduled_tokens, + cudagraph_size_before_dp, + self.dp_size, + self.dp_rank, + ) + if all(cudagraph_size_across_dp >= 0): + # If all ranks can use CUDA graph, pad to the maximum number of tokens + # across DP and use CUDA graph. + num_tokens_after_padding = int(cudagraph_size_across_dp.max().item()) + cudagraph_mode = CUDAGraphMode.FULL + else: + # If any of the ranks cannot use CUDA graph, use eager mode for all ranks. + # No padding is needed except for ranks that have no tokens to run. + num_tokens_across_dp = torch.clamp(num_tokens_across_dp, min=1) + num_tokens_after_padding = num_tokens_across_dp[self.dp_rank] + cudagraph_mode = CUDAGraphMode.NONE + return cudagraph_mode, num_tokens_after_padding, num_tokens_across_dp + + @torch.inference_mode() + def execute_model( + self, + scheduler_output: SchedulerOutput, + intermediate_tensors: Any | None = None, + dummy_run: bool = False, + ) -> ModelRunnerOutput | None: + assert intermediate_tensors is None + if scheduler_output.total_num_scheduled_tokens == 0 and not dummy_run: + # No need to run the model. + with async_barrier(self.input_prep_event): + self.update_states(scheduler_output) + return EMPTY_MODEL_RUNNER_OUTPUT + + # NOTE: Call this before the async barrier so CPU all-reduce and + # GPU execution can overlap. + cudagraph_mode, num_tokens_after_padding, num_tokens_across_dp = ( + self.get_cudagraph_and_dp_padding(scheduler_output) + ) + with async_barrier(self.input_prep_event): + self.update_states(scheduler_output) + if num_tokens_after_padding == 0: + # All DP ranks have zero tokens to run. 
+ return EMPTY_MODEL_RUNNER_OUTPUT + + if not dummy_run: + # Common case. + # Prepare all the inputs and copy to the input buffers. + input_batch = self.prepare_inputs( + scheduler_output, + num_tokens_after_padding, + ) + + # NOTE(woosuk): Sampling metadata should be built under the async + # barrier to avoid race conditions. + pos = input_batch.positions[input_batch.logits_indices] + sampling_metadata = self.req_states.make_sampling_metadata( + input_batch.idx_mapping_np, pos + ) + + if self.lora_config: + # Activate LoRA adapters. + lora_inputs = self.req_states.make_lora_inputs( + input_batch.req_ids, + input_batch.idx_mapping_np, + input_batch.num_scheduled_tokens, + ) + self._set_active_loras(*lora_inputs) + else: + # No actual tokens to run. A dummy run for DP. + num_reqs = min(num_tokens_after_padding, self.max_num_reqs) + input_batch = InputBatch.make_dummy( + num_reqs=num_reqs, + num_tokens=num_tokens_after_padding, + input_buffers=self.input_buffers, + device=self.device, + ) + self.prepare_dummy_attn_metadata(input_batch) + sampling_metadata = None + + # Run model. + if cudagraph_mode == CUDAGraphMode.FULL: + # Run CUDA graph. + # NOTE(woosuk): Here, we don't need to pass the input tensors, + # because they are already copied to the CUDA graph input buffers. + hidden_states = self.cudagraph_manager.run( + input_batch.num_tokens_after_padding + ) + else: + # Run PyTorch model in eager mode. 
+ with set_forward_context( + input_batch.attn_metadata, + self.vllm_config, + num_tokens=input_batch.num_tokens_after_padding, + cudagraph_runtime_mode=cudagraph_mode, + num_tokens_across_dp=num_tokens_across_dp, + ): + hidden_states = self.model( + input_ids=input_batch.input_ids, + positions=input_batch.positions, + ) + + self.execute_model_state = hidden_states, input_batch, sampling_metadata + return None + + @torch.inference_mode() + def sample_tokens( + self, + grammar_output: GrammarOutput | None, + ) -> AsyncOutput | ModelRunnerOutput: + assert self.execute_model_state is not None + hidden_states, input_batch, sampling_metadata = self.execute_model_state + self.execute_model_state = None # type: ignore + assert sampling_metadata is not None + + sampler_output = self.sample( + hidden_states, input_batch, sampling_metadata, grammar_output + ) + prompt_logprobs_dict = self.compute_prompt_logprobs(hidden_states, input_batch) + output = self.postprocess( + sampler_output, + prompt_logprobs_dict, + input_batch, + ) + return output diff --git a/vllm/v1/worker/gpu/sampler.py b/vllm/v1/worker/gpu/sampler.py new file mode 100644 index 0000000000000..e916aadb6b5a0 --- /dev/null +++ b/vllm/v1/worker/gpu/sampler.py @@ -0,0 +1,327 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +from collections.abc import Callable + +import torch +import triton +import triton.language as tl + +from vllm.config.model import LogprobsMode +from vllm.v1.outputs import LogprobsTensors, SamplerOutput +from vllm.v1.sample.metadata import SamplingMetadata +from vllm.v1.sample.ops.topk_topp_sampler import apply_top_k_top_p + + +class Sampler: + def __init__( + self, + logprobs_mode: LogprobsMode = "raw_logprobs", + ): + if logprobs_mode not in ["processed_logprobs", "raw_logprobs"]: + raise NotImplementedError(f"Unsupported logprobs_mode: {logprobs_mode}") + self.logprobs_mode = logprobs_mode + + def __call__( + self, + logits: 
torch.Tensor, + sampling_metadata: SamplingMetadata, + ) -> SamplerOutput: + if sampling_metadata.max_num_logprobs is not None: + if self.logprobs_mode == "processed_logprobs": + sampled, logits = self.sample( + logits, sampling_metadata, return_logits=True + ) + else: + assert self.logprobs_mode == "raw_logprobs" + sampled, _ = self.sample(logits, sampling_metadata, return_logits=False) + + logprobs_tensors = compute_topk_logprobs( + logits, + sampling_metadata.max_num_logprobs, + sampled, + ) + else: + sampled, _ = self.sample(logits, sampling_metadata, return_logits=False) + logprobs_tensors = None + + # These are GPU tensors. + sampler_output = SamplerOutput( + # The sampled tokens are expanded to 2D tensor with shape + # [num_requests, 1], where each row represents one generated + # token per request. + sampled_token_ids=sampled.view(-1, 1), + logprobs_tensors=logprobs_tensors, + ) + return sampler_output + + def sample( + self, + logits: torch.Tensor, + sampling_metadata: SamplingMetadata, + return_logits: bool = False, + ) -> tuple[torch.Tensor, torch.Tensor | None]: + is_greedy = sampling_metadata.temperature == 0 + temp = torch.where(is_greedy, 1.0, sampling_metadata.temperature) + logits = logits / temp.view(-1, 1) + logits = apply_top_k_top_p( + logits, sampling_metadata.top_k, sampling_metadata.top_p + ) + + sampled = gumbel_sample( + logits, + is_greedy, + sampling_metadata.seeds, + sampling_metadata.pos, + ) + return sampled, logits if return_logits else None + + +@triton.jit +def _gumbel_sample_kernel( + sampled_ptr, + logits_ptr, + logits_stride, + seeds_ptr, + pos_ptr, + is_greedy_ptr, + vocab_size, + BLOCK_SIZE: tl.constexpr, +): + req_idx = tl.program_id(0) + is_greedy = tl.load(is_greedy_ptr + req_idx) + + if is_greedy: + # Greedy sampling. Don't apply gumbel noise. 
+ max_val = float("-inf") + max_idx = 0 + for i in range(0, vocab_size, BLOCK_SIZE): + block = i + tl.arange(0, BLOCK_SIZE) + mask = block < vocab_size + logits = tl.load( + logits_ptr + req_idx * logits_stride + block, + mask=mask, + other=float("-inf"), + ) + + idx = tl.argmax(logits, axis=0) + value = tl.max(logits, axis=0) + is_greater = value > max_val + max_val = tl.where(is_greater, value, max_val) + max_idx = tl.where(is_greater, i + idx, max_idx) + tl.store(sampled_ptr + req_idx, max_idx) + return + + # Random sampling. + # Calculate gumbel seed. + seed = tl.load(seeds_ptr + req_idx) + pos = tl.load(pos_ptr + req_idx) + gumbel_seed = tl.randint(seed, pos) + + max_val = float("-inf") + max_idx = 0 + for i in range(0, vocab_size, BLOCK_SIZE): + block = i + tl.arange(0, BLOCK_SIZE) + mask = block < vocab_size + + # Generate gumbel noise. + r = tl.rand(gumbel_seed, block).to(tl.float64) + gumbel_noise = -tl.log(-tl.log(r + 1e-20) + 1e-20) + gumbel_noise = gumbel_noise.to(tl.float32) + + # Apply gumbel noise. + logits = tl.load(logits_ptr + req_idx * logits_stride + block, mask=mask) + logits = tl.where(mask, logits + gumbel_noise, float("-inf")) + + # Argmax to get the sampled token. + idx = tl.argmax(logits, axis=0) + value = tl.max(logits, axis=0) + is_greater = value > max_val + max_val = tl.where(is_greater, value, max_val) + max_idx = tl.where(is_greater, i + idx, max_idx) + tl.store(sampled_ptr + req_idx, max_idx) + + +def gumbel_sample( + logits: torch.Tensor, # [num_reqs, vocab_size] + is_greedy: torch.Tensor, # [num_reqs] + seed: torch.Tensor, # [num_reqs] + pos: torch.Tensor, # [num_reqs] +) -> torch.Tensor: + num_reqs, vocab_size = logits.shape + # NOTE(woosuk): Use int64 for later indexing. 
+ sampled = torch.empty( + num_reqs, + dtype=torch.int64, + device=logits.device, + ) + _gumbel_sample_kernel[(num_reqs,)]( + sampled, + logits, + logits.stride(0), + seed, + pos, + is_greedy, + vocab_size, + num_warps=8, + BLOCK_SIZE=16384, # type: ignore + ) + return sampled + + +@triton.jit +def _topk_log_softmax_kernel( + output_ptr, + logits_ptr, + logits_stride, + topk_ids_ptr, + topk, + vocab_size, + BLOCK_SIZE: tl.constexpr, + PADDED_TOPK: tl.constexpr, +): + req_idx = tl.program_id(0) + row_ptr = logits_ptr + req_idx * logits_stride + + max_val = float("-inf") + for i in range(0, vocab_size, BLOCK_SIZE): + block = i + tl.arange(0, BLOCK_SIZE) + logits = tl.load(row_ptr + block, mask=block < vocab_size, other=float("-inf")) + max_val = tl.max(tl.maximum(logits, max_val)) + max_val = max_val.to(tl.float32) # type: ignore + + se = 0.0 + for i in range(0, vocab_size, BLOCK_SIZE): + block = i + tl.arange(0, BLOCK_SIZE) + logits = tl.load(row_ptr + block, mask=block < vocab_size, other=0.0) + # NOTE(woosuk): Make sure that logits and all following operations use FP32. 
+ logits = logits.to(tl.float32) + e = tl.exp(logits - max_val) + e = tl.where(block < vocab_size, e, 0.0) + se += tl.sum(e) + lse = tl.log(se) + + k_offset = tl.arange(0, PADDED_TOPK) + k_mask = k_offset < topk + topk_ids = tl.load(topk_ids_ptr + req_idx * topk + k_offset, mask=k_mask, other=0) + + logits = tl.load(row_ptr + topk_ids, mask=k_mask) + logits = logits.to(tl.float32) + o = logits - max_val - lse + tl.store(output_ptr + req_idx * topk + k_offset, o, mask=k_mask) + + +@triton.jit +def _ranks_kernel( + output_ptr, + logits_ptr, + logits_stride, + token_ids_ptr, + vocab_size, + BLOCK_SIZE: tl.constexpr, +): + req_idx = tl.program_id(0) + row_ptr = logits_ptr + req_idx * logits_stride + + token_id = tl.load(token_ids_ptr + req_idx) + x = tl.load(row_ptr + token_id) + + n = 0 + for i in range(0, vocab_size, BLOCK_SIZE): + block = i + tl.arange(0, BLOCK_SIZE) + logits = tl.load(row_ptr + block, mask=block < vocab_size, other=float("-inf")) + n += tl.sum((logits > x).to(tl.int32)) + tl.store(output_ptr + req_idx, n) + + +def compute_token_logprobs( + logits: torch.Tensor, + token_ids: torch.Tensor, +) -> torch.Tensor: + batch_size = logits.shape[0] + vocab_size = logits.shape[1] + token_ids = token_ids.to(torch.int64) + num_logprobs = token_ids.shape[1] + logprobs = torch.empty( + batch_size, + num_logprobs, + dtype=torch.float32, + device=logits.device, + ) + _topk_log_softmax_kernel[(batch_size,)]( + logprobs, + logits, + logits.stride(0), + token_ids, + num_logprobs, + vocab_size, + BLOCK_SIZE=1024, # type: ignore + PADDED_TOPK=triton.next_power_of_2(num_logprobs), + ) + return logprobs + + +def compute_topk_logprobs( + logits: torch.Tensor, + num_logprobs: int, + sampled_token_ids: torch.Tensor, +) -> LogprobsTensors: + assert num_logprobs >= 0 + batch_size, vocab_size = logits.shape + if num_logprobs == 0: + logprob_token_ids = sampled_token_ids.unsqueeze(-1) + else: + topk_indices = torch.topk(logits, num_logprobs, dim=-1).indices + logprob_token_ids = 
torch.cat( + (sampled_token_ids.unsqueeze(-1), topk_indices), dim=1 + ) + + # NOTE(woosuk): Here, to save GPU memory, we do not materialize the full + # logprobs tensor. Instead, we only compute and return the logprobs of + # the topk + 1 tokens. + logprobs = compute_token_logprobs(logits, logprob_token_ids) + token_ranks = torch.empty( + batch_size, + dtype=torch.int64, + device=logits.device, + ) + _ranks_kernel[(batch_size,)]( + token_ranks, + logits, + logits.stride(0), + sampled_token_ids, + vocab_size, + BLOCK_SIZE=8192, # type: ignore + ) + return LogprobsTensors( + logprob_token_ids=logprob_token_ids, + logprobs=logprobs, + selected_token_ranks=token_ranks, + ) + + +def compute_prompt_logprobs( + prompt_token_ids: torch.Tensor, + prompt_hidden_states: torch.Tensor, + logits_fn: Callable[[torch.Tensor], torch.Tensor], +) -> tuple[torch.Tensor, torch.Tensor]: + # Since materializing the full prompt logits can take too much memory, + # we compute it in chunks. + CHUNK_SIZE = 1024 + logprobs = [] + ranks = [] + prompt_token_ids = prompt_token_ids.to(torch.int64) + for start_idx in range(0, prompt_token_ids.shape[0], CHUNK_SIZE): + end_idx = start_idx + CHUNK_SIZE + # NOTE(woosuk): logits_fn can be slow because it involves all-gather. 
+ prompt_logits = logits_fn(prompt_hidden_states[start_idx:end_idx]) + prompt_logprobs = compute_topk_logprobs( + prompt_logits, + 0, # num_logprobs + prompt_token_ids[start_idx:end_idx], + ) + logprobs.append(prompt_logprobs.logprobs) + ranks.append(prompt_logprobs.selected_token_ranks) + + logprobs = torch.cat(logprobs, dim=0) if len(logprobs) > 1 else logprobs[0] + ranks = torch.cat(ranks, dim=0) if len(ranks) > 1 else ranks[0] + return logprobs, ranks diff --git a/vllm/v1/worker/gpu/states.py b/vllm/v1/worker/gpu/states.py new file mode 100644 index 0000000000000..5d05c3f57790a --- /dev/null +++ b/vllm/v1/worker/gpu/states.py @@ -0,0 +1,265 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +from dataclasses import dataclass, field + +import numpy as np +import torch + +from vllm.lora.request import LoRARequest +from vllm.sampling_params import SamplingParams +from vllm.v1.outputs import LogprobsTensors +from vllm.v1.utils import CpuGpuBuffer + +_NP_INT64_MIN = np.iinfo(np.int64).min +_NP_INT64_MAX = np.iinfo(np.int64).max +NO_LORA_ID = 0 + + +@dataclass +class SamplingMetadata: + temperature: torch.Tensor + + top_p: torch.Tensor | None + top_k: torch.Tensor | None + + seeds: torch.Tensor + pos: torch.Tensor + + # None means no logprobs, 0 means sampled token logprobs only + max_num_logprobs: int | None + + @classmethod + def make_dummy( + cls, + num_reqs: int, + device: torch.device, + ) -> "SamplingMetadata": + assert num_reqs > 0 + temperature = torch.zeros(num_reqs, dtype=torch.float32, device=device) + temperature[0] = 0.5 + # TODO(woosuk): Use top-p and top-k for dummy sampler. + # Currently, they are disabled because of memory usage. 
+ # top_p = torch.full((num_reqs,), 0.95, dtype=torch.float32, device=device) + # top_k = torch.full((num_reqs,), 20, dtype=torch.int32, device=device) + top_p = None + top_k = None + seeds = torch.zeros(num_reqs, dtype=torch.int64, device=device) + pos = torch.zeros(num_reqs, dtype=torch.int64, device=device) + max_num_logprobs = 20 + + return cls( + temperature=temperature, + top_p=top_p, + top_k=top_k, + seeds=seeds, + pos=pos, + max_num_logprobs=max_num_logprobs, + ) + + +class RequestState: + def __init__( + self, + max_num_reqs: int, + max_model_len: int, + max_num_batched_tokens: int, + vocab_size: int, + device: torch.device, + pin_memory: bool, + ): + self.max_num_reqs = max_num_reqs + self.max_model_len = max_model_len + self.max_num_batched_tokens = max_num_batched_tokens + self.vocab_size = vocab_size + self.device = device + self.pin_memory = pin_memory + + self.req_id_to_index: dict[str, int] = {} + self.index_to_req_id: dict[int, str] = {} + self.free_indices = list(range(max_num_reqs)) + self.extra_data: dict[str, ExtraData] = {} + + self.prompt_len = np.zeros(self.max_num_reqs, dtype=np.int32) + self.prefill_token_ids = np.zeros( + (self.max_num_reqs, self.max_model_len), + dtype=np.int32, + ) + self.prefill_len = self._make_buffer(self.max_num_reqs, dtype=torch.int32) + self.num_tokens = np.zeros(self.max_num_reqs, dtype=np.int32) + self.num_computed_tokens = np.zeros(self.max_num_reqs, dtype=np.int32) + + # Last sampled tokens. + self.last_sampled_tokens = torch.zeros( + self.max_num_reqs, + 1, + dtype=torch.int64, + device=device, + ) + + # LoRA. + self.lora_ids = np.zeros(self.max_num_reqs, dtype=np.int32) + self.lora_ids.fill(NO_LORA_ID) + + # Sampling parameters. 
+ self.temperature = self._make_param(self.max_num_reqs, torch.float32) + self.top_p = self._make_param(self.max_num_reqs, torch.float32) + self.top_k = self._make_param(self.max_num_reqs, torch.int32) + self.seeds = self._make_param(self.max_num_reqs, torch.int64) + + self.num_logprobs = np.empty(self.max_num_reqs, dtype=np.int32) + # -1 means no logprobs are requested. + self.num_logprobs.fill(-1) + self.needs_prompt_logprobs = np.zeros(self.max_num_reqs, dtype=bool) + + def _make_param(self, size: int, dtype: torch.dtype) -> "Param": + return Param(size, dtype=dtype, device=self.device, pin_memory=self.pin_memory) + + def _make_buffer(self, size: int, dtype: torch.dtype) -> CpuGpuBuffer: + return CpuGpuBuffer( + size, dtype=dtype, device=self.device, pin_memory=self.pin_memory + ) + + @property + def num_reqs(self) -> int: + return len(self.req_id_to_index) + + def add_request( + self, + req_id: str, + prompt_len: int, + prefill_token_ids: list[int], + num_computed_tokens: int, + sampling_params: SamplingParams, + lora_request: LoRARequest | None, + ) -> None: + assert len(self.free_indices) > 0, "No free indices" + req_idx = self.free_indices.pop() + self.req_id_to_index[req_id] = req_idx + self.index_to_req_id[req_idx] = req_id + self.extra_data[req_id] = ExtraData(lora_request) + + self.prompt_len[req_idx] = prompt_len + prefill_len = len(prefill_token_ids) + assert prefill_len >= prompt_len, ( + f"prefill_len {prefill_len} < prompt_len {prompt_len}" + ) + self.prefill_len.np[req_idx] = prefill_len + self.prefill_token_ids[req_idx, :prefill_len] = prefill_token_ids + self.num_tokens[req_idx] = prefill_len + self.num_computed_tokens[req_idx] = num_computed_tokens + + if lora_request is not None: + self.lora_ids[req_idx] = lora_request.lora_int_id + else: + self.lora_ids[req_idx] = NO_LORA_ID + + self.temperature.np[req_idx] = sampling_params.temperature + self.top_p.np[req_idx] = sampling_params.top_p + if 0 < sampling_params.top_k < self.vocab_size: + top_k = 
sampling_params.top_k + else: + top_k = self.vocab_size + self.top_k.np[req_idx] = top_k + + if sampling_params.seed is not None: + seed = sampling_params.seed + else: + seed = np.random.randint(_NP_INT64_MIN, _NP_INT64_MAX) + self.seeds.np[req_idx] = seed + + if sampling_params.logprobs is not None: + num_logprobs = sampling_params.logprobs + else: + num_logprobs = -1 + self.num_logprobs[req_idx] = num_logprobs + + # For now, only support prompt logprobs for the prompt tokens. + needs_prompt_logprobs = sampling_params.prompt_logprobs is not None + self.needs_prompt_logprobs[req_idx] = needs_prompt_logprobs + + def remove_request(self, req_id: str) -> None: + self.extra_data.pop(req_id, None) + req_idx = self.req_id_to_index.pop(req_id, None) + if req_idx is None: + # Request not found. + return + self.index_to_req_id.pop(req_idx, None) + self.free_indices.append(req_idx) + + def make_sampling_metadata( + self, + idx_mapping: np.ndarray, + pos: torch.Tensor, + ) -> SamplingMetadata: + temperature = self.temperature.np[idx_mapping] + temperature = self.temperature.copy_np_to_gpu(temperature) + + top_p = self.top_p.np[idx_mapping] + no_top_p = np.all(top_p == 1.0) + top_p = self.top_p.copy_np_to_gpu(top_p) if not no_top_p else None + + top_k = self.top_k.np[idx_mapping] + no_top_k = np.all(top_k == self.vocab_size) + top_k = self.top_k.copy_np_to_gpu(top_k) if not no_top_k else None + + seeds = self.seeds.np[idx_mapping] + seeds = self.seeds.copy_np_to_gpu(seeds) + + num_logprobs = self.num_logprobs[idx_mapping] + max_num_logprobs: int | None = int(np.max(num_logprobs)) + if max_num_logprobs == -1: + max_num_logprobs = None + + return SamplingMetadata( + temperature=temperature, + top_p=top_p, + top_k=top_k, + seeds=seeds, + pos=pos, + max_num_logprobs=max_num_logprobs, + ) + + def make_lora_inputs( + self, + req_ids: list[str], + idx_mapping: np.ndarray, + num_scheduled_tokens: np.ndarray, + ) -> tuple[tuple[int, ...], tuple[int, ...], set[LoRARequest]]: + lora_ids 
= self.lora_ids[idx_mapping] + prompt_lora_mapping = tuple(lora_ids) + token_lora_mapping = tuple(lora_ids.repeat(num_scheduled_tokens)) + + active_lora_requests: set[LoRARequest] = set() + for req_id in req_ids: + lora_request = self.extra_data[req_id].lora_request + if lora_request is not None: + active_lora_requests.add(lora_request) + return prompt_lora_mapping, token_lora_mapping, active_lora_requests + + +class Param: + def __init__( + self, + size: int, + dtype: torch.dtype, + device: torch.device, + pin_memory: bool, + ): + self.buffer = CpuGpuBuffer( + size, + dtype=dtype, + device=device, + pin_memory=pin_memory, + ) + self.np = np.zeros_like(self.buffer.np) + + def copy_np_to_gpu(self, x: np.ndarray) -> torch.Tensor: + n = x.shape[0] + self.buffer.np[:n] = x + return self.buffer.copy_to_gpu(n) + + +@dataclass +class ExtraData: + lora_request: LoRARequest | None + in_progress_prompt_logprobs: list[LogprobsTensors] = field(default_factory=list) diff --git a/vllm/v1/worker/gpu/structured_outputs.py b/vllm/v1/worker/gpu/structured_outputs.py new file mode 100644 index 0000000000000..83051b0ed33ff --- /dev/null +++ b/vllm/v1/worker/gpu/structured_outputs.py @@ -0,0 +1,76 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import numpy as np +import torch + +from vllm.triton_utils import tl, triton +from vllm.v1.worker.gpu.input_batch import InputBuffers + + +def apply_grammar_bitmask( + logits: torch.Tensor, + req_ids: list[str], + grammar_req_ids: list[str], + grammar_bitmask: np.ndarray, + input_buffers: InputBuffers, +) -> None: + input_buffers.grammar_bitmask.np[: grammar_bitmask.shape[0]] = grammar_bitmask + input_buffers.grammar_bitmask.copy_to_gpu(grammar_bitmask.shape[0]) + + batch_size = logits.shape[0] + grammar_req_id_to_idx = {req_id: i for i, req_id in enumerate(grammar_req_ids)} + # logits -> bitmask mapping + mapping = [grammar_req_id_to_idx.get(req_id, -1) for req_id in req_ids] + 
input_buffers.bitmask_indices.np[:batch_size] = mapping + input_buffers.bitmask_indices.copy_to_gpu(batch_size) + + vocab_size = logits.shape[-1] + BLOCK_SIZE = 8192 + grid = (batch_size, triton.cdiv(vocab_size, BLOCK_SIZE)) + _apply_grammar_bitmask_kernel[grid]( + logits, + logits.stride(0), + input_buffers.grammar_bitmask.gpu, + input_buffers.grammar_bitmask.gpu.stride(0), + input_buffers.bitmask_indices.gpu, + vocab_size, + BLOCK_SIZE=BLOCK_SIZE, + ) + + +# Adapted from +# https://github.com/mlc-ai/xgrammar/blob/main/python/xgrammar/kernels/apply_token_bitmask_inplace_triton.py +@triton.jit +def _apply_grammar_bitmask_kernel( + logits_ptr, + logits_stride, + bitmask_ptr, + bitmask_stride, + bitmask_indices_ptr, + vocab_size, + BLOCK_SIZE: tl.constexpr, +): + logits_idx = tl.program_id(0) + bitmask_idx = tl.load(bitmask_indices_ptr + logits_idx) + if bitmask_idx == -1: + # No bitmask to apply. + return + + # Load the bitmask. + block_id = tl.program_id(1) + bitmask_offset = (block_id * BLOCK_SIZE) // 32 + tl.arange(0, BLOCK_SIZE // 32) + packed_bitmask = tl.load( + bitmask_ptr + bitmask_idx * bitmask_stride + bitmask_offset, + mask=bitmask_offset < bitmask_stride, + ) + # Unpack the bitmask. + bitmask = ((packed_bitmask[:, None] >> (tl.arange(0, 32)[None, :])) & 1) == 0 + bitmask = bitmask.reshape(BLOCK_SIZE) + + # Apply the bitmask to the logits. 
+ block_offset = block_id * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE) + tl.store( + logits_ptr + logits_idx * logits_stride + block_offset, + -float("inf"), + mask=bitmask & (block_offset < vocab_size), + ) diff --git a/vllm/v1/worker/gpu_worker.py b/vllm/v1/worker/gpu_worker.py index f1fd5be966c37..6a4bfde5f972b 100644 --- a/vllm/v1/worker/gpu_worker.py +++ b/vllm/v1/worker/gpu_worker.py @@ -41,7 +41,7 @@ from vllm.sequence import IntermediateTensors from vllm.tasks import SupportedTask from vllm.utils.mem_constants import GiB_bytes from vllm.utils.mem_utils import MemorySnapshot, memory_profiling -from vllm.v1.core.sched.output import GrammarOutput +from vllm.v1.core.sched.output import GrammarOutput, SchedulerOutput from vllm.v1.engine import ReconfigureDistributedRequest, ReconfigureRankType from vllm.v1.kv_cache_interface import KVCacheConfig, KVCacheSpec from vllm.v1.outputs import ( @@ -58,7 +58,6 @@ logger = init_logger(__name__) if TYPE_CHECKING: from vllm.model_executor.model_loader.tensorizer import TensorizerConfig - from vllm.v1.core.sched.output import SchedulerOutput class Worker(WorkerBase): @@ -101,6 +100,8 @@ class Worker(WorkerBase): else: self.profiler = None + self.use_v2_model_runner = envs.VLLM_USE_V2_MODEL_RUNNER + def sleep(self, level: int = 1) -> None: from vllm.device_allocator.cumem import CuMemAllocator @@ -237,9 +238,17 @@ class Worker(WorkerBase): raise RuntimeError(f"Not support device type: {self.device_config.device}") # Construct the model runner - self.model_runner: GPUModelRunner = GPUModelRunner( - self.vllm_config, self.device - ) + if self.use_v2_model_runner: + from vllm.v1.worker.gpu.model_runner import ( + GPUModelRunner as GPUModelRunnerV2, + ) + + # HACK(woosuk): This is a temporary fix to avoid type errors. 
+ self.model_runner: GPUModelRunner = GPUModelRunnerV2( # type: ignore + self.vllm_config, self.device + ) + else: + self.model_runner = GPUModelRunner(self.vllm_config, self.device) if self.rank == 0: # If usage stat is enabled, collect relevant info. @@ -573,7 +582,12 @@ class Worker(WorkerBase): self.profiler.stop() def execute_dummy_batch(self) -> None: - self.model_runner._dummy_run(1, uniform_decode=True) + if self.use_v2_model_runner: + self.model_runner.execute_model( + SchedulerOutput.make_empty(), dummy_run=True + ) + else: + self.model_runner._dummy_run(1, uniform_decode=True) def add_lora(self, lora_request: LoRARequest) -> bool: return self.model_runner.add_lora(lora_request) From b7f1f490a61c99d0b371e39aefbe5546cba231a9 Mon Sep 17 00:00:00 2001 From: Aleksandr Malyshev <164964928+maleksan85@users.noreply.github.com> Date: Fri, 21 Nov 2025 08:34:46 -0800 Subject: [PATCH 16/83] Upstream triton fp4 weight preshuffle (#28888) Signed-off-by: Aleksandr Malyshev Co-authored-by: Aleksandr Malyshev --- vllm/_aiter_ops.py | 25 +++++++ .../quark/schemes/quark_ocp_mx.py | 65 +++++++++++++++---- 2 files changed, 76 insertions(+), 14 deletions(-) diff --git a/vllm/_aiter_ops.py b/vllm/_aiter_ops.py index e53e4ae6e5296..db79b3f5e8bcb 100644 --- a/vllm/_aiter_ops.py +++ b/vllm/_aiter_ops.py @@ -948,6 +948,31 @@ class rocm_aiter_ops: (8192, 32768), ] + @staticmethod + def is_triton_gemm_afp4wfp4_presh_ws_tuned(n: int, k: int) -> bool: + return (n, k) in [ + (8192, 4096), + (1280, 8192), + (16384, 53248), + (106496, 16384), + (57344, 8192), + (8192, 2048), + (2560, 8192), + (10240, 8192), + (16384, 16384), + (8192, 28672), + (28672, 8192), + (18432, 16384), + (8192, 1024), + (7168, 8192), + (5120, 8192), + (8192, 8192), + (8192, 7168), + (14336, 8192), + (8192, 14336), + (8192, 3584), + ] + @staticmethod def shuffle_weight( self, tensor: torch.Tensor, layout: tuple[int, int] = (16, 16) diff --git a/vllm/model_executor/layers/quantization/quark/schemes/quark_ocp_mx.py 
b/vllm/model_executor/layers/quantization/quark/schemes/quark_ocp_mx.py index 007e78e68d5cd..33e9f9806b27e 100644 --- a/vllm/model_executor/layers/quantization/quark/schemes/quark_ocp_mx.py +++ b/vllm/model_executor/layers/quantization/quark/schemes/quark_ocp_mx.py @@ -10,6 +10,7 @@ import torch import torch.nn.functional as F from vllm import envs +from vllm._aiter_ops import rocm_aiter_ops from vllm.logger import init_logger from vllm.model_executor.layers.quantization.utils.mxfp4_utils import ( dequant_mxfp4, @@ -49,7 +50,10 @@ def is_rocm_aiter_fp4_asm_gemm_enabled() -> bool: try: from aiter.ops.shuffle import shuffle_weight - from aiter.ops.triton.gemm_afp4wfp4 import gemm_afp4wfp4 + from aiter.ops.triton.gemm_afp4wfp4 import ( + gemm_afp4wfp4, + gemm_afp4wfp4_preshuffled_weight_scales, + ) from aiter.ops.triton.quant import dynamic_mxfp4_quant from vllm.utils.torch_utils import direct_register_custom_op @@ -66,23 +70,56 @@ try: x_scales: torch.Tensor | None = None, ) -> torch.Tensor: M = x.shape[0] + N = weight.shape[0] + K = weight.shape[1] if rocm_use_aiter_fp4_asm_gemm: - if x_scales is None: - # use hip quant kernel for performance - x_q, x_s = per_1x32_f4_quant_hip(x, shuffle=True) + if M <= 64 and rocm_aiter_ops.is_triton_gemm_afp4wfp4_presh_ws_tuned(N, K): + if x_scales is None: + # use hip quant kernel for performance + if M >= 32: + x_q, x_s = per_1x32_f4_quant_hip(x, shuffle=True) + else: + x_q, x_s = per_1x32_f4_quant_hip(x, shuffle=False) + else: + x_q = x + x_s = x_scales + + if M >= 32: + x_s = x_s.view(torch.uint8).view(x_s.shape[0] // 32, -1) + else: + x_s = x_s[:M, ...].view(torch.uint8) + + y = torch.empty(M, N, device=x_q.device, dtype=out_dtype) + gemm_afp4wfp4_preshuffled_weight_scales( + x_q.view(torch.uint8), + weight.view(torch.uint8).view(weight.shape[0] // 16, -1), + x_s, + weight_scale.view(torch.uint8).view( + weight_scale.shape[0] // 32, -1 + ), + out_dtype, + y, + ) else: - x_q = x - x_s = x_scales + if x_scales is None: + # use 
hip quant kernel for performance + x_q, x_s = per_1x32_f4_quant_hip(x, shuffle=True) + else: + x_q = x + x_s = x_scales - # 32 alignment is enough for dim0 padding of output for - # gemm_a4w4 kernel - y = torch.empty( - (M + 31) // 32 * 32, weight.shape[0], device=x_q.device, dtype=out_dtype - ) + # 32 alignment is enough for dim0 padding of output for + # gemm_a4w4 kernel + y = torch.empty( + (M + 31) // 32 * 32, + weight.shape[0], + device=x_q.device, + dtype=out_dtype, + ) - gemm_a4w4( - x_q, weight, x_s, weight_scale.view(x_s.dtype), y, bpreshuffle=True - ) + gemm_a4w4( + x_q, weight, x_s, weight_scale.view(x_s.dtype), y, bpreshuffle=True + ) return y[:M] else: if x_scales is None: From a42ab317acff8c4b7d4808bb34548a530ee04f0f Mon Sep 17 00:00:00 2001 From: Wentao Ye <44945378+yewentao256@users.noreply.github.com> Date: Fri, 21 Nov 2025 11:46:20 -0500 Subject: [PATCH 17/83] [Log] Optimize startup log (#28948) Signed-off-by: yewentao256 Signed-off-by: Wentao Ye <44945378+yewentao256@users.noreply.github.com> Co-authored-by: Nick Hill --- .../layers/fused_moe/fused_moe.py | 6 +++-- .../model_executor/layers/quantization/fp8.py | 10 +++++--- vllm/profiler/gpu_profiler.py | 25 ++++++++++--------- vllm/v1/core/kv_cache_utils.py | 3 ++- 4 files changed, 26 insertions(+), 18 deletions(-) diff --git a/vllm/model_executor/layers/fused_moe/fused_moe.py b/vllm/model_executor/layers/fused_moe/fused_moe.py index f44328418f1bc..df208eae2e71c 100644 --- a/vllm/model_executor/layers/fused_moe/fused_moe.py +++ b/vllm/model_executor/layers/fused_moe/fused_moe.py @@ -872,8 +872,10 @@ def get_moe_configs( for config_file_path in config_file_paths: if os.path.exists(config_file_path): with open(config_file_path) as f: - logger.info( - "Using configuration from %s for MoE layer.", config_file_path + logger.info_once( + "Using configuration from %s for MoE layer.", + config_file_path, + scope="global", ) # If a configuration has been found, return it tuned_config = json.load(f) diff 
--git a/vllm/model_executor/layers/quantization/fp8.py b/vllm/model_executor/layers/quantization/fp8.py index 92fbdd7093483..91bd45bf879cb 100644 --- a/vllm/model_executor/layers/quantization/fp8.py +++ b/vllm/model_executor/layers/quantization/fp8.py @@ -162,9 +162,11 @@ def get_fp8_moe_backend(block_quant: bool) -> Fp8MoeBackend: # deepGEMM on supported platforms with block-quantized weights if envs.VLLM_USE_DEEP_GEMM and envs.VLLM_MOE_USE_DEEP_GEMM and block_quant: if not has_deep_gemm(): - logger.warning_once("DeepGEMM backend requested but not available.") + logger.warning_once( + "DeepGEMM backend requested but not available.", scope="local" + ) elif is_deep_gemm_supported(): - logger.info_once("Using DeepGEMM backend for FP8 MoE") + logger.info_once("Using DeepGEMM backend for FP8 MoE", scope="local") return Fp8MoeBackend.DEEPGEMM # CUTLASS BlockScaled GroupedGemm on SM100 with block-quantized weights @@ -173,7 +175,9 @@ def get_fp8_moe_backend(block_quant: bool) -> Fp8MoeBackend: and current_platform.is_device_capability(100) and block_quant ): - logger.info_once("Using Cutlass BlockScaled GroupedGemm backend for FP8 MoE") + logger.info_once( + "Using Cutlass BlockScaled GroupedGemm backend for FP8 MoE", scope="local" + ) return Fp8MoeBackend.CUTLASS_BLOCK_SCALED_GROUPED_GEMM # default to Triton diff --git a/vllm/profiler/gpu_profiler.py b/vllm/profiler/gpu_profiler.py index 2155b67a3db4b..3e2cbe7296e9d 100644 --- a/vllm/profiler/gpu_profiler.py +++ b/vllm/profiler/gpu_profiler.py @@ -139,18 +139,19 @@ class TorchProfilerWrapper(WorkerProfiler): self.local_rank = local_rank torch_profiler_trace_dir = envs.VLLM_TORCH_PROFILER_DIR - logger.info( - "Torch profiling enabled. 
Traces will be saved to: %s", - torch_profiler_trace_dir, - ) - logger.debug( - "Profiler config: record_shapes=%s," - "profile_memory=%s,with_stack=%s,with_flops=%s", - envs.VLLM_TORCH_PROFILER_RECORD_SHAPES, - envs.VLLM_TORCH_PROFILER_WITH_PROFILE_MEMORY, - envs.VLLM_TORCH_PROFILER_WITH_STACK, - envs.VLLM_TORCH_PROFILER_WITH_FLOPS, - ) + if local_rank in (None, 0): + logger.info( + "Torch profiling enabled. Traces will be saved to: %s", + torch_profiler_trace_dir, + ) + logger.debug( + "Profiler config: record_shapes=%s," + "profile_memory=%s,with_stack=%s,with_flops=%s", + envs.VLLM_TORCH_PROFILER_RECORD_SHAPES, + envs.VLLM_TORCH_PROFILER_WITH_PROFILE_MEMORY, + envs.VLLM_TORCH_PROFILER_WITH_STACK, + envs.VLLM_TORCH_PROFILER_WITH_FLOPS, + ) self.profiler = torch.profiler.profile( activities=[ torch.profiler.ProfilerActivity.CPU, diff --git a/vllm/v1/core/kv_cache_utils.py b/vllm/v1/core/kv_cache_utils.py index 01ecd881115df..b18ba8e8b2c7b 100644 --- a/vllm/v1/core/kv_cache_utils.py +++ b/vllm/v1/core/kv_cache_utils.py @@ -1236,10 +1236,11 @@ def _report_kv_cache_config( max_concurrency = get_max_concurrency_for_kv_cache_config( vllm_config, kv_cache_config ) - logger.info( + logger.info_once( "Maximum concurrency for %s tokens per request: %.2fx", max_model_len_str, max_concurrency, + scope="local", ) From e99e467384001e284e0722a33362866b10fed65b Mon Sep 17 00:00:00 2001 From: rasmith Date: Fri, 21 Nov 2025 10:53:09 -0600 Subject: [PATCH 18/83] [CI/Build][Kernel][AMD] Move extra dim to after load in _fwd_kv_parallel in lighting_attn.py (#29132) Signed-off-by: Randall Smith Co-authored-by: Randall Smith --- vllm/model_executor/layers/lightning_attn.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/vllm/model_executor/layers/lightning_attn.py b/vllm/model_executor/layers/lightning_attn.py index 99853680eac6c..ffccdc12241cb 100644 --- a/vllm/model_executor/layers/lightning_attn.py +++ b/vllm/model_executor/layers/lightning_attn.py @@ -198,7 
+198,7 @@ def _fwd_kv_parallel( ) # Load the decay factors for the current head and block - k_decay_ptr = K_decay + off_h * BLOCK + tl.arange(0, CBLOCK)[None, :] + k_decay_ptr = K_decay + off_h * BLOCK + tl.arange(0, CBLOCK) kv_index = tl.arange(0, CBLOCK) @@ -228,6 +228,12 @@ def _fwd_kv_parallel( # Load decay factor and compute weighted key-value outer product k_decay = tl.load(k_decay_ptr) + + # NOTE: Need to add the extra dim here due to AMD MLIR lowering error. + # Please don't move it back until issue is resolved. + # Issue: https://github.com/ROCm/triton/issues/907 + k_decay = k_decay[None, :] + kv += tl.dot(k_trans * k_decay, v) # Move to the next sub-block From b4c8fbaae2592501f442817f86e32cfeb795d81f Mon Sep 17 00:00:00 2001 From: Mingyuan Ma <111467530+Victor49152@users.noreply.github.com> Date: Fri, 21 Nov 2025 08:54:11 -0800 Subject: [PATCH 19/83] Add TRTLLM MoE NVFP4 kernel to CompressedTensorsW4A4MoeMethod (#28892) Signed-off-by: mingyuanm Signed-off-by: mgoin Co-authored-by: mgoin --- .../compressed_tensors_moe.py | 142 +++++++++-- .../layers/quantization/modelopt.py | 205 ++-------------- .../quantization/utils/flashinfer_fp4_moe.py | 221 ++++++++++++++++++ 3 files changed, 358 insertions(+), 210 deletions(-) diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py index fa254030a271a..ad547dd409822 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py @@ -8,6 +8,7 @@ from enum import Enum import torch from compressed_tensors import CompressionFormat from compressed_tensors.quantization import ActivationOrdering, QuantizationStrategy +from torch.nn.parameter import Parameter import vllm.envs as envs import vllm.model_executor.layers.fused_moe.modular_kernel as mk @@ -50,9 +51,15 @@ from 
vllm.model_executor.layers.quantization.compressed_tensors.utils import ( from vllm.model_executor.layers.quantization.utils import replace_parameter from vllm.model_executor.layers.quantization.utils.flashinfer_fp4_moe import ( build_flashinfer_fp4_cutlass_moe_prepare_finalize, + flashinfer_trtllm_fp4_moe, + prepare_static_weights_for_trtllm_fp4_moe, reorder_w1w3_to_w3w1, select_nvfp4_gemm_impl, ) +from vllm.model_executor.layers.quantization.utils.flashinfer_utils import ( + FlashinferMoeBackend, + get_flashinfer_moe_backend, +) from vllm.model_executor.layers.quantization.utils.fp8_utils import ( expert_weight_is_col_major, requant_weight_ue8m0_inplace, @@ -193,6 +200,13 @@ class CompressedTensorsW4A4MoeMethod(CompressedTensorsMoEMethod): self.allow_flashinfer = _nvfp4.allow_flashinfer self.use_marlin = _nvfp4.use_marlin self.group_size = 16 + self.flashinfer_moe_backend = None + if self.allow_flashinfer: + self.flashinfer_moe_backend = get_flashinfer_moe_backend() + logger.info_once( + f"Using FlashInfer {self.flashinfer_moe_backend.value} kernels" + " for CompressedTensorsW4A4MoeMethod." 
+ ) def create_weights( self, @@ -344,21 +358,20 @@ class CompressedTensorsW4A4MoeMethod(CompressedTensorsMoEMethod): if self.use_marlin: prepare_moe_fp4_layer_for_marlin(layer) return - - # swizzle weight scales - layer.w13_weight_scale = torch.nn.Parameter( - swizzle_blockscale(layer.w13_weight_scale), requires_grad=False - ) - - layer.w2_weight_scale = torch.nn.Parameter( - swizzle_blockscale(layer.w2_weight_scale), requires_grad=False - ) - # w13 - w13_input_global_scale = layer.w13_input_global_scale.max(dim=1).values.to( - torch.float32 - ) - + if ( + self.allow_flashinfer + and self.flashinfer_moe_backend == FlashinferMoeBackend.TENSORRT_LLM + ): + w13_input_global_scale = ( + layer.w13_input_global_scale.min() + .to(torch.float32) + .expand(layer.num_experts) + ) + else: + w13_input_global_scale = layer.w13_input_global_scale.min(dim=1).values.to( + torch.float32 + ) layer.g1_alphas = torch.nn.Parameter( ((1 / w13_input_global_scale) * layer.w13_weight_scale_2), requires_grad=False, @@ -369,22 +382,92 @@ class CompressedTensorsW4A4MoeMethod(CompressedTensorsMoEMethod): ) # w2 + if ( + self.allow_flashinfer + and self.flashinfer_moe_backend == FlashinferMoeBackend.TENSORRT_LLM + ): + w2_input_global_scale = ( + layer.w2_input_global_scale.min() + .to(torch.float32) + .expand(layer.num_experts) + ) + else: + w2_input_global_scale = layer.w2_input_global_scale + layer.g2_alphas = torch.nn.Parameter( - ((1 / layer.w2_input_global_scale) * layer.w2_weight_scale_2).to( - torch.float32 - ), + ((1 / w2_input_global_scale) * layer.w2_weight_scale_2).to(torch.float32), requires_grad=False, ) layer.w2_input_scale_quant = torch.nn.Parameter( - (layer.w2_input_global_scale), requires_grad=False + (w2_input_global_scale), requires_grad=False ) + # TensorRT-LLM specific processing + if ( + self.allow_flashinfer + and self.flashinfer_moe_backend == FlashinferMoeBackend.TENSORRT_LLM + ): + # Prepare static weights for TRT-LLM kernel + # alternate: 
prepare_static_weight_layouts_for_trtllm_moe + ( + gemm1_weights_fp4_shuffled, + gemm1_scales_fp4_shuffled, + gemm2_weights_fp4_shuffled, + gemm2_scales_fp4_shuffled, + ) = prepare_static_weights_for_trtllm_fp4_moe( + layer.w13_weight, + layer.w2_weight, + layer.w13_weight_scale, + layer.w2_weight_scale, + layer.w2_weight.size(-2), # hidden_size + layer.w13_weight.size(-2) // 2, # intermediate_size + layer.w13_weight.size(0), # num_experts + ) + logger.debug_once("Finished shuffling weights for TRT-LLM MOE") + + layer.gemm1_weights_fp4_shuffled = Parameter( + gemm1_weights_fp4_shuffled, requires_grad=False + ) + layer.gemm2_weights_fp4_shuffled = Parameter( + gemm2_weights_fp4_shuffled, requires_grad=False + ) + layer.gemm1_scales_fp4_shuffled = Parameter( + gemm1_scales_fp4_shuffled, requires_grad=False + ) + layer.gemm2_scales_fp4_shuffled = Parameter( + gemm2_scales_fp4_shuffled, requires_grad=False + ) + + # Additional parameter needed for TRT-LLM + layer.g1_scale_c = Parameter( + (layer.w2_input_scale_quant * layer.g1_alphas).to(torch.float32), + requires_grad=False, + ) + + # Clean up weights that won't be used by TRT-LLM + del layer.w2_weight + del layer.w2_weight_scale + del layer.w13_weight + del layer.w13_weight_scale + else: + # swizzle weight scales + layer.w13_weight_scale = torch.nn.Parameter( + swizzle_blockscale(layer.w13_weight_scale), requires_grad=False + ) + + layer.w2_weight_scale = torch.nn.Parameter( + swizzle_blockscale(layer.w2_weight_scale), requires_grad=False + ) + def maybe_make_prepare_finalize( self, routing_tables: tuple[torch.Tensor, torch.Tensor, torch.Tensor] | None = None, ) -> mk.FusedMoEPrepareAndFinalize | None: - if self.use_marlin: + if self.use_marlin or ( + self.allow_flashinfer + and self.flashinfer_moe_backend == FlashinferMoeBackend.TENSORRT_LLM + ): return None elif not self.allow_flashinfer: return super().maybe_make_prepare_finalize(routing_tables) @@ -411,7 +494,10 @@ class 
CompressedTensorsW4A4MoeMethod(CompressedTensorsMoEMethod): def get_fused_moe_quant_config( self, layer: torch.nn.Module ) -> FusedMoEQuantConfig | None: - if self.use_marlin: + if ( + self.use_marlin + or self.flashinfer_moe_backend == FlashinferMoeBackend.TENSORRT_LLM + ): return None return nvfp4_moe_quant_config( @@ -452,6 +538,22 @@ class CompressedTensorsW4A4MoeMethod(CompressedTensorsMoEMethod): ) assert activation == "silu", "Only SiLU activation is supported." + if ( + self.allow_flashinfer + and self.flashinfer_moe_backend == FlashinferMoeBackend.TENSORRT_LLM + ): + return flashinfer_trtllm_fp4_moe( + layer=layer, + x=x, + router_logits=router_logits, + top_k=top_k, + global_num_experts=global_num_experts, + num_expert_group=num_expert_group, + topk_group=topk_group, + custom_routing_function=custom_routing_function, + e_score_correction_bias=e_score_correction_bias, + ) + topk_weights, topk_ids, _ = FusedMoE.select_experts( hidden_states=x, router_logits=router_logits, diff --git a/vllm/model_executor/layers/quantization/modelopt.py b/vllm/model_executor/layers/quantization/modelopt.py index 6b5ed7762eb31..01a23168bdde3 100644 --- a/vllm/model_executor/layers/quantization/modelopt.py +++ b/vllm/model_executor/layers/quantization/modelopt.py @@ -15,7 +15,6 @@ from vllm._custom_ops import cutlass_scaled_fp4_mm, scaled_fp4_quant from vllm.logger import init_logger from vllm.model_executor.layers.fused_moe.config import ( FusedMoEQuantConfig, - RoutingMethodType, fp8_w8a8_moe_quant_config, nvfp4_moe_quant_config, ) @@ -38,6 +37,8 @@ from vllm.model_executor.layers.quantization.base_config import ( from vllm.model_executor.layers.quantization.kv_cache import BaseKVCacheMethod from vllm.model_executor.layers.quantization.utils.flashinfer_fp4_moe import ( build_flashinfer_fp4_cutlass_moe_prepare_finalize, + flashinfer_trtllm_fp4_moe, + prepare_static_weights_for_trtllm_fp4_moe, reorder_w1w3_to_w3w1, select_nvfp4_gemm_impl, ) @@ -1136,7 +1137,6 @@ class 
ModelOptNvFp4FusedMoE(FusedMoEMethodBase): self.allow_flashinfer = _nvfp4.allow_flashinfer self.use_marlin = _nvfp4.use_marlin self.flashinfer_moe_backend = None - self._cache_permute_indices: dict[torch.Size, torch.Tensor] = {} if self.allow_flashinfer: self.flashinfer_moe_backend = get_flashinfer_moe_backend() logger.info_once( @@ -1303,138 +1303,14 @@ class ModelOptNvFp4FusedMoE(FusedMoEMethodBase): ) layer.register_parameter("w2_input_scale", w2_input_scale) - def prepare_static_weights_for_trtllm_fp4_moe( - self, - # args_dequant, - # args, - gemm1_weights, - gemm2_weights, - gemm1_scales_linear_fp4_bytes, - gemm2_scales_linear_fp4_bytes, - hidden_size, - intermediate_size, - num_experts, - ): - from flashinfer import nvfp4_block_scale_interleave - from flashinfer.fused_moe.core import ( - _maybe_get_cached_w3_w1_permute_indices, - get_w2_permute_indices_with_cache, - ) - - """Prepare quantized weights for kernel (done offline with weights).""" - epilogue_tile_m = 128 # FIXME: this depends on the kernel internals - - # Convert quantized weights to proper formats - gemm1_weights_fp4 = gemm1_weights.view(torch.float8_e4m3fn).reshape( - num_experts, 2 * intermediate_size, hidden_size // 2 - ) # packed fp4 - gemm1_scales_linear_fp4 = gemm1_scales_linear_fp4_bytes.view( - torch.float8_e4m3fn - ).reshape( - num_experts, 2 * intermediate_size, hidden_size // 16 - ) # fp8 scaling factors - - gemm2_weights_fp4 = gemm2_weights.view(torch.float8_e4m3fn).reshape( - num_experts, hidden_size, intermediate_size // 2 - ) # packed fp4 - gemm2_scales_linear_fp4 = gemm2_scales_linear_fp4_bytes.view( - torch.float8_e4m3fn - ).reshape( - num_experts, hidden_size, intermediate_size // 16 - ) # fp8 scaling factors - - gemm1_weights_fp4_shuffled = [] - gemm1_scales_fp4_shuffled = [] - gemm2_weights_fp4_shuffled = [] - gemm2_scales_fp4_shuffled = [] - for i in range(num_experts): - # Calculate the permute indices for the following: - # 1. 
Reorder rows of W1 and scales for fused gated activation - # 2. Shuffle weights and scaling factors for transposed mma output - # for both w3_w1 and w2 weights and scale factors - permute_indices = _maybe_get_cached_w3_w1_permute_indices( - self._cache_permute_indices, - gemm1_weights_fp4[i].view(torch.uint8), - epilogue_tile_m, - ) - gemm1_weights_fp4_shuffled.append( - gemm1_weights_fp4[i] - .view(torch.uint8)[permute_indices.to(gemm1_weights_fp4.device)] - .contiguous() - ) - - permute_sf_indices = _maybe_get_cached_w3_w1_permute_indices( - self._cache_permute_indices, - gemm1_scales_linear_fp4[i].view(torch.uint8), - epilogue_tile_m, - num_elts_per_sf=16, - ) - gemm1_scales_fp4_shuffled.append( - nvfp4_block_scale_interleave( - gemm1_scales_linear_fp4[i] - .view(torch.uint8)[ - permute_sf_indices.to(gemm1_scales_linear_fp4.device) - ] - .contiguous() - ) - ) - - permute_indices = get_w2_permute_indices_with_cache( - self._cache_permute_indices, - gemm2_weights_fp4[i].view(torch.uint8), - epilogue_tile_m, - ) - gemm2_weights_fp4_shuffled.append( - gemm2_weights_fp4[i] - .view(torch.uint8)[permute_indices.to(gemm2_weights_fp4.device)] - .contiguous() - ) - - permute_sf_indices = get_w2_permute_indices_with_cache( - self._cache_permute_indices, - gemm2_scales_linear_fp4[i].view(torch.uint8), - epilogue_tile_m, - num_elts_per_sf=16, - ) - gemm2_scales_fp4_shuffled.append( - nvfp4_block_scale_interleave( - gemm2_scales_linear_fp4[i] - .view(torch.uint8)[ - permute_sf_indices.to(gemm2_scales_linear_fp4.device) - ] - .contiguous() - ) - ) - - # Stack weights for all experts - gemm1_weights_fp4_shuffled = torch.stack(gemm1_weights_fp4_shuffled) - gemm1_scales_fp4_shuffled = ( - torch.stack(gemm1_scales_fp4_shuffled) - .view(torch.float8_e4m3fn) - .reshape(num_experts, 2 * intermediate_size, hidden_size // 16) - ) - - gemm2_weights_fp4_shuffled = torch.stack(gemm2_weights_fp4_shuffled) - gemm2_scales_fp4_shuffled = ( - torch.stack(gemm2_scales_fp4_shuffled) - 
.view(torch.float8_e4m3fn) - .reshape(num_experts, hidden_size, intermediate_size // 16) - ) - return ( - gemm1_weights_fp4_shuffled, - gemm1_scales_fp4_shuffled, - gemm2_weights_fp4_shuffled, - gemm2_scales_fp4_shuffled, - ) - def process_weights_after_loading(self, layer: torch.nn.Module) -> None: # GEMM 1 processing gemm1_weight = layer.w13_weight.data gemm1_weight_scale = layer.w13_weight_scale.data - if ( - self.allow_flashinfer - and self.flashinfer_moe_backend == FlashinferMoeBackend.CUTLASS + if self.allow_flashinfer and ( + self.flashinfer_moe_backend == FlashinferMoeBackend.CUTLASS + or self.flashinfer_moe_backend == FlashinferMoeBackend.TENSORRT_LLM ): gemm1_weight, gemm1_weight_scale = reorder_w1w3_to_w3w1( gemm1_weight, gemm1_weight_scale, dim=-2 @@ -1508,7 +1384,7 @@ class ModelOptNvFp4FusedMoE(FusedMoEMethodBase): gemm1_scales_fp4_shuffled, gemm2_weights_fp4_shuffled, gemm2_scales_fp4_shuffled, - ) = self.prepare_static_weights_for_trtllm_fp4_moe( + ) = prepare_static_weights_for_trtllm_fp4_moe( layer.w13_weight, layer.w2_weight, layer.w13_weight_scale, @@ -1614,68 +1490,17 @@ class ModelOptNvFp4FusedMoE(FusedMoEMethodBase): self.allow_flashinfer and self.flashinfer_moe_backend == FlashinferMoeBackend.TENSORRT_LLM ): - import flashinfer - - from vllm.model_executor.models.llama4 import Llama4MoE - - a1_gscale = layer.w13_input_scale_quant - (hidden_states_fp4, hidden_states_scale_linear_fp4) = ( - flashinfer.fp4_quantize( - x, - a1_gscale, - is_sf_swizzled_layout=False, - ) - ) - use_llama4_routing = ( - custom_routing_function is Llama4MoE.custom_routing_function - ) - routing_method_type = layer.routing_method_type - if use_llama4_routing: - routing_method_type = RoutingMethodType.Llama4 - router_logits = ( - router_logits.to(torch.float32) - if routing_method_type == RoutingMethodType.DeepSeekV3 - else router_logits - ) - routing_bias = e_score_correction_bias - if routing_bias is not None: - routing_bias = routing_bias.to(torch.bfloat16) - out = 
flashinfer.fused_moe.trtllm_fp4_block_scale_moe( - routing_logits=router_logits, - routing_bias=routing_bias, - hidden_states=hidden_states_fp4, - hidden_states_scale=hidden_states_scale_linear_fp4.view( - torch.float8_e4m3fn - ).flatten(), - gemm1_weights=layer.gemm1_weights_fp4_shuffled.data, - gemm1_weights_scale=layer.gemm1_scales_fp4_shuffled.data.view( - torch.float8_e4m3fn - ), - gemm1_bias=None, - gemm1_alpha=None, - gemm1_beta=None, - gemm1_clamp_limit=None, - gemm2_weights=layer.gemm2_weights_fp4_shuffled.data, - gemm2_weights_scale=layer.gemm2_scales_fp4_shuffled.data.view( - torch.float8_e4m3fn - ), - gemm2_bias=None, - output1_scale_scalar=layer.g1_scale_c.data, - output1_scale_gate_scalar=layer.g1_alphas.data, - output2_scale_scalar=layer.g2_alphas.data, - num_experts=global_num_experts, + return flashinfer_trtllm_fp4_moe( + layer=layer, + x=x, + router_logits=router_logits, top_k=top_k, - n_group=num_expert_group, + global_num_experts=global_num_experts, + num_expert_group=num_expert_group, topk_group=topk_group, - intermediate_size=layer.intermediate_size_per_partition, - local_expert_offset=layer.ep_rank * layer.local_num_experts, - local_num_experts=layer.local_num_experts, - routed_scaling_factor=1.0, - tile_tokens_dim=None, - routing_method_type=routing_method_type, - do_finalize=True, - )[0] - return out + custom_routing_function=custom_routing_function, + e_score_correction_bias=e_score_correction_bias, + ) topk_weights, topk_ids, _ = FusedMoE.select_experts( hidden_states=x, diff --git a/vllm/model_executor/layers/quantization/utils/flashinfer_fp4_moe.py b/vllm/model_executor/layers/quantization/utils/flashinfer_fp4_moe.py index 36e8599dd9484..eda40657b1e39 100644 --- a/vllm/model_executor/layers/quantization/utils/flashinfer_fp4_moe.py +++ b/vllm/model_executor/layers/quantization/utils/flashinfer_fp4_moe.py @@ -9,6 +9,7 @@ import vllm.model_executor.layers.fused_moe.modular_kernel as mk from vllm.model_executor.layers.fused_moe.config 
import ( FusedMoEConfig, FusedMoEQuantConfig, + RoutingMethodType, ) from vllm.model_executor.layers.fused_moe.flashinfer_cutedsl_moe import ( FlashInferCuteDSLExperts, @@ -110,3 +111,223 @@ def select_nvfp4_gemm_impl( "CutlassExpertsFp4 doesn't support DP. Use flashinfer CUTLASS " "Fused MoE backend instead (set VLLM_USE_FLASHINFER_MOE_FP4=1)" ) + + +def prepare_static_weights_for_trtllm_fp4_moe( + # args_dequant, + # args, + gemm1_weights, + gemm2_weights, + gemm1_scales_linear_fp4_bytes, + gemm2_scales_linear_fp4_bytes, + hidden_size, + intermediate_size, + num_experts, +): + from flashinfer import nvfp4_block_scale_interleave + from flashinfer.fused_moe.core import ( + _maybe_get_cached_w3_w1_permute_indices, + get_w2_permute_indices_with_cache, + ) + + _cache_permute_indices: dict[torch.Size, torch.Tensor] = {} + """Prepare quantized weights for kernel (done offline with weights).""" + epilogue_tile_m = 128 # FIXME: this depends on the kernel internals + + # Convert quantized weights to proper formats + gemm1_weights_fp4 = gemm1_weights.view(torch.float8_e4m3fn).reshape( + num_experts, 2 * intermediate_size, hidden_size // 2 + ) # packed fp4 + gemm1_scales_linear_fp4 = gemm1_scales_linear_fp4_bytes.view( + torch.float8_e4m3fn + ).reshape( + num_experts, 2 * intermediate_size, hidden_size // 16 + ) # fp8 scaling factors + + gemm2_weights_fp4 = gemm2_weights.view(torch.float8_e4m3fn).reshape( + num_experts, hidden_size, intermediate_size // 2 + ) # packed fp4 + gemm2_scales_linear_fp4 = gemm2_scales_linear_fp4_bytes.view( + torch.float8_e4m3fn + ).reshape(num_experts, hidden_size, intermediate_size // 16) # fp8 scaling factors + + gemm1_weights_fp4_shuffled = [] + gemm1_scales_fp4_shuffled = [] + gemm2_weights_fp4_shuffled = [] + gemm2_scales_fp4_shuffled = [] + for i in range(num_experts): + # Calculate the permute indices for the following: + # 1. Reorder rows of W1 and scales for fused gated activation + # 2. 
Shuffle weights and scaling factors for transposed mma output + # for both w3_w1 and w2 weights and scale factors + permute_indices = _maybe_get_cached_w3_w1_permute_indices( + _cache_permute_indices, + gemm1_weights_fp4[i].view(torch.uint8), + epilogue_tile_m, + ) + gemm1_weights_fp4_shuffled.append( + gemm1_weights_fp4[i] + .view(torch.uint8)[permute_indices.to(gemm1_weights_fp4.device)] + .contiguous() + ) + + permute_sf_indices = _maybe_get_cached_w3_w1_permute_indices( + _cache_permute_indices, + gemm1_scales_linear_fp4[i].view(torch.uint8), + epilogue_tile_m, + num_elts_per_sf=16, + ) + gemm1_scales_fp4_shuffled.append( + nvfp4_block_scale_interleave( + gemm1_scales_linear_fp4[i] + .view(torch.uint8)[ + permute_sf_indices.to(gemm1_scales_linear_fp4.device) + ] + .contiguous() + ) + ) + + permute_indices = get_w2_permute_indices_with_cache( + _cache_permute_indices, + gemm2_weights_fp4[i].view(torch.uint8), + epilogue_tile_m, + ) + gemm2_weights_fp4_shuffled.append( + gemm2_weights_fp4[i] + .view(torch.uint8)[permute_indices.to(gemm2_weights_fp4.device)] + .contiguous() + ) + + permute_sf_indices = get_w2_permute_indices_with_cache( + _cache_permute_indices, + gemm2_scales_linear_fp4[i].view(torch.uint8), + epilogue_tile_m, + num_elts_per_sf=16, + ) + gemm2_scales_fp4_shuffled.append( + nvfp4_block_scale_interleave( + gemm2_scales_linear_fp4[i] + .view(torch.uint8)[ + permute_sf_indices.to(gemm2_scales_linear_fp4.device) + ] + .contiguous() + ) + ) + + # Stack weights for all experts + gemm1_weights_fp4_shuffled = torch.stack(gemm1_weights_fp4_shuffled) + gemm1_scales_fp4_shuffled = ( + torch.stack(gemm1_scales_fp4_shuffled) + .view(torch.float8_e4m3fn) + .reshape(num_experts, 2 * intermediate_size, hidden_size // 16) + ) + + gemm2_weights_fp4_shuffled = torch.stack(gemm2_weights_fp4_shuffled) + gemm2_scales_fp4_shuffled = ( + torch.stack(gemm2_scales_fp4_shuffled) + .view(torch.float8_e4m3fn) + .reshape(num_experts, hidden_size, intermediate_size // 16) + ) + 
return ( + gemm1_weights_fp4_shuffled, + gemm1_scales_fp4_shuffled, + gemm2_weights_fp4_shuffled, + gemm2_scales_fp4_shuffled, + ) + + +def flashinfer_trtllm_fp4_moe( + layer: torch.nn.Module, + x: torch.Tensor, + router_logits: torch.Tensor, + top_k: int, + global_num_experts: int, + num_expert_group: int | None, + topk_group: int | None, + custom_routing_function: object | None, + e_score_correction_bias: torch.Tensor | None, +) -> torch.Tensor: + """ + Apply FlashInfer TensorRT-LLM FP4 MoE kernel. + + Args: + layer: The MoE layer with weights and scales + x: Input tensor + router_logits: Router logits for expert selection + top_k: Number of experts to select per token + global_num_experts: Total number of experts across all ranks + num_expert_group: Number of expert groups (for grouped routing) + topk_group: Top-k within each group + custom_routing_function: Custom routing function (e.g., Llama4) + e_score_correction_bias: Optional routing bias correction + + Returns: + Output tensor from the MoE layer + """ + import flashinfer + + from vllm.model_executor.models.llama4 import Llama4MoE + + # Quantize input to FP4 + a1_gscale = layer.w13_input_scale_quant + (hidden_states_fp4, hidden_states_scale_linear_fp4) = flashinfer.fp4_quantize( + x, + a1_gscale, + is_sf_swizzled_layout=False, + ) + + # Determine routing method type + use_llama4_routing = custom_routing_function is Llama4MoE.custom_routing_function + routing_method_type = layer.routing_method_type + if use_llama4_routing: + routing_method_type = flashinfer.RoutingMethodType.Llama4 + + # Prepare routing bias + routing_bias = e_score_correction_bias + if routing_bias is not None: + routing_bias = routing_bias.to(torch.bfloat16) + + router_logits = ( + router_logits.to(torch.float32) + if routing_method_type == RoutingMethodType.DeepSeekV3 + else router_logits + ) + + # Call TRT-LLM FP4 block-scale MoE kernel + out = flashinfer.fused_moe.trtllm_fp4_block_scale_moe( + routing_logits=router_logits, + 
routing_bias=routing_bias, + hidden_states=hidden_states_fp4, + hidden_states_scale=hidden_states_scale_linear_fp4.view( + torch.float8_e4m3fn + ).flatten(), + gemm1_weights=layer.gemm1_weights_fp4_shuffled.data, + gemm1_weights_scale=layer.gemm1_scales_fp4_shuffled.data.view( + torch.float8_e4m3fn + ), + gemm1_bias=None, + gemm1_alpha=None, + gemm1_beta=None, + gemm1_clamp_limit=None, + gemm2_weights=layer.gemm2_weights_fp4_shuffled.data, + gemm2_weights_scale=layer.gemm2_scales_fp4_shuffled.data.view( + torch.float8_e4m3fn + ), + gemm2_bias=None, + output1_scale_scalar=layer.g1_scale_c.data, + output1_scale_gate_scalar=layer.g1_alphas.data, + output2_scale_scalar=layer.g2_alphas.data, + num_experts=global_num_experts, + top_k=top_k, + n_group=num_expert_group if num_expert_group is not None else 0, + topk_group=topk_group if topk_group is not None else 0, + intermediate_size=layer.intermediate_size_per_partition, + local_expert_offset=layer.ep_rank * layer.local_num_experts, + local_num_experts=layer.local_num_experts, + routed_scaling_factor=None, + tile_tokens_dim=None, + routing_method_type=routing_method_type, + do_finalize=True, + )[0] + + return out From 460d02a417b440ce8b3b8d09c6f5214a2a346426 Mon Sep 17 00:00:00 2001 From: "Chendi.Xue" Date: Fri, 21 Nov 2025 10:55:27 -0600 Subject: [PATCH 20/83] [NIXL] Fix after virtual block_size for host_buffer with heter kv_layout (#29122) Signed-off-by: Chendi Xue --- .../kv_transfer/kv_connector/v1/nixl_connector.py | 14 +++++++++++++- vllm/platforms/xpu.py | 8 -------- 2 files changed, 13 insertions(+), 9 deletions(-) diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py index 1626f819af8b5..7c0911240493c 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py @@ -1042,10 +1042,12 @@ class NixlConnectorWorker: NOT directly supported by NIXL 
(e.g., tpu) """ xfer_buffers: dict[str, torch.Tensor] = {} + inv_order = [0, 1, 3, 2, 4] try: for layer_name, kv_cache in kv_caches.items(): kv_shape = kv_cache.shape kv_dtype = kv_cache.dtype + permute_shape = False if ( self.kv_cache_layout == "NHD" and self.vllm_config.kv_transfer_config is not None @@ -1059,10 +1061,20 @@ class NixlConnectorWorker: # Since NHD will not support Decode/Prefill TP_ratio > 1, # we can leverage host_buffer for permute self.host_buffer_kv_cache_layout = "HND" - kv_shape = tuple(kv_shape[i] for i in [0, 1, 3, 2, 4]) + kv_shape = ( + tuple(kv_shape[i] for i in inv_order) + if not self.use_mla + else kv_shape + ) + permute_shape = not self.use_mla + xfer_buffers[layer_name] = torch.empty( kv_shape, dtype=kv_dtype, device="cpu" ) + if permute_shape: + xfer_buffers[layer_name] = xfer_buffers[layer_name].permute( + inv_order + ) except MemoryError as e: logger.error("NIXLConnectorWorker gets %s.", e) raise diff --git a/vllm/platforms/xpu.py b/vllm/platforms/xpu.py index 65516827a16da..18a3186b142f1 100644 --- a/vllm/platforms/xpu.py +++ b/vllm/platforms/xpu.py @@ -251,10 +251,6 @@ class XPUPlatform(Platform): ) -> None: """Copy blocks from src_cache to dst_cache on XPU.""" _src_cache = src_cache[:, src_block_indices] - if _src_cache.shape[2:] != dst_cache.shape[2:]: - # To support TP_ratio, HOST KV might be initiated with HND - # while XPU device KV is with NHD - _src_cache = _src_cache.permute(0, 1, 3, 2, 4) dst_cache[:, dst_block_indices] = _src_cache.to(dst_cache.device) @classmethod @@ -267,8 +263,4 @@ class XPUPlatform(Platform): ) -> None: """Copy blocks from XPU to host (CPU).""" _src_cache = src_cache[:, src_block_indices] - if _src_cache.shape[2:] != dst_cache.shape[2:]: - # XPU device KV is with NHD while HOST KV - # might be initiated with HND for TP_ratio support - _src_cache = _src_cache.permute(0, 1, 3, 2, 4) dst_cache[:, dst_block_indices] = _src_cache.cpu() From 75648b16ddce1bff02c39c6f06be62a58385ff52 Mon Sep 17 00:00:00 
2001 From: Charlie Fu Date: Fri, 21 Nov 2025 11:12:16 -0600 Subject: [PATCH 21/83] [ROCm][CI] Fix config/test_config_generation.py (#29142) Signed-off-by: charlifu --- docker/Dockerfile.rocm | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docker/Dockerfile.rocm b/docker/Dockerfile.rocm index 731a97d93da1f..42466d1801cf6 100644 --- a/docker/Dockerfile.rocm +++ b/docker/Dockerfile.rocm @@ -7,6 +7,8 @@ FROM ${BASE_IMAGE} AS base ARG ARG_PYTORCH_ROCM_ARCH ENV PYTORCH_ROCM_ARCH=${ARG_PYTORCH_ROCM_ARCH:-${PYTORCH_ROCM_ARCH}} +ENV RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES=1 +ENV RAY_EXPERIMENTAL_NOSET_HIP_VISIBLE_DEVICES=1 # Install some basic utilities RUN apt-get update -q -y && apt-get install -q -y \ @@ -121,8 +123,6 @@ COPY --from=export_vllm /benchmarks ${COMMON_WORKDIR}/vllm/benchmarks COPY --from=export_vllm /examples ${COMMON_WORKDIR}/vllm/examples COPY --from=export_vllm /docker ${COMMON_WORKDIR}/vllm/docker -ENV RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES=1 -ENV RAY_EXPERIMENTAL_NOSET_HIP_VISIBLE_DEVICES=1 ENV TOKENIZERS_PARALLELISM=false # ENV that can improve safe tensor loading, and end-to-end time From ceca06050124a10b33e78ee33d1a25a97edd1f74 Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Sat, 22 Nov 2025 02:19:25 +0800 Subject: [PATCH 22/83] [Deprecation] Deprecate `seed=None` (#29185) Signed-off-by: DarkLight1337 --- vllm/engine/arg_utils.py | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index 6eaf328eb1655..888f57b1ac1df 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -367,7 +367,7 @@ class EngineArgs: config_format: str = ModelConfig.config_format dtype: ModelDType = ModelConfig.dtype kv_cache_dtype: CacheDType = CacheConfig.cache_dtype - seed: int | None = None + seed: int | None = 0 max_model_len: int | None = ModelConfig.max_model_len cuda_graph_sizes: list[int] | None = 
CompilationConfig.cudagraph_capture_sizes cudagraph_capture_sizes: list[int] | None = ( @@ -1192,6 +1192,12 @@ class EngineArgs: # VLLM_ENABLE_V1_MULTIPROCESSING=0), so setting a seed here # doesn't affect the user process. if self.seed is None: + logger.warning_once( + "`seed=None` is equivalent to `seed=0` in V1 Engine. " + "You will no longer be allowed to pass `None` in v0.13.", + scope="local", + ) + self.seed = 0 if not envs.VLLM_ENABLE_V1_MULTIPROCESSING: logger.warning( @@ -1203,28 +1209,31 @@ class EngineArgs: ) if self.disable_mm_preprocessor_cache: - logger.warning( + logger.warning_once( "`--disable-mm-preprocessor-cache` is deprecated " "and will be removed in v0.13. " "Please use `--mm-processor-cache-gb 0` instead.", + scope="local", ) self.mm_processor_cache_gb = 0 elif envs.VLLM_MM_INPUT_CACHE_GIB != 4: - logger.warning( + logger.warning_once( "VLLM_MM_INPUT_CACHE_GIB` is deprecated " "and will be removed in v0.13. " "Please use `--mm-processor-cache-gb %d` instead.", envs.VLLM_MM_INPUT_CACHE_GIB, + scope="local", ) self.mm_processor_cache_gb = envs.VLLM_MM_INPUT_CACHE_GIB if self.enable_multimodal_encoder_data_parallel: - logger.warning( + logger.warning_once( "--enable-multimodal-encoder-data-parallel` is deprecated " "and will be removed in v0.13. " - "Please use `--mm-encoder-tp-mode data` instead." 
+ "Please use `--mm-encoder-tp-mode data` instead.", + scope="local", ) self.mm_encoder_tp_mode = "data" From 1bed891f72a6cbd32c0c75dfaa29ad21d7a68b75 Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Fri, 21 Nov 2025 10:21:40 -0800 Subject: [PATCH 23/83] [Chore] Fix pre-commit error after #25266 (#29190) --- vllm/v1/worker/gpu/async_utils.py | 20 +++++++++++--------- vllm/v1/worker/gpu/attn_utils.py | 14 ++++++++------ vllm/v1/worker/gpu/cudagraph_utils.py | 12 ++++++++++-- vllm/v1/worker/gpu/model_runner.py | 16 ++++++++++------ vllm/v1/worker/gpu/sampler.py | 2 +- 5 files changed, 40 insertions(+), 24 deletions(-) diff --git a/vllm/v1/worker/gpu/async_utils.py b/vllm/v1/worker/gpu/async_utils.py index 638ec6fb0b082..e523090aa2172 100644 --- a/vllm/v1/worker/gpu/async_utils.py +++ b/vllm/v1/worker/gpu/async_utils.py @@ -7,6 +7,7 @@ import torch from vllm.v1.outputs import ( AsyncModelRunnerOutput, + LogprobsTensors, ModelRunnerOutput, SamplerOutput, ) @@ -46,15 +47,18 @@ class AsyncOutput(AsyncModelRunnerOutput): "cpu", non_blocking=True ) if sampler_output.logprobs_tensors is not None: - self.logprobs_tensors = ( + self.logprobs_tensors: LogprobsTensors | None = ( sampler_output.logprobs_tensors.to_cpu_nonblocking() ) else: self.logprobs_tensors = None - self.prompt_logprobs_dict = {} + self.prompt_logprobs_dict: dict[str, LogprobsTensors | None] = {} if self.model_runner_output.prompt_logprobs_dict: for k, v in self.model_runner_output.prompt_logprobs_dict.items(): - self.prompt_logprobs_dict[k] = v.to_cpu_nonblocking() + if v is not None: + self.prompt_logprobs_dict[k] = v.to_cpu_nonblocking() + else: + self.prompt_logprobs_dict[k] = None self.copy_event.record(self.copy_stream) def get_output(self) -> ModelRunnerOutput: @@ -64,12 +68,10 @@ class AsyncOutput(AsyncModelRunnerOutput): # the existing model runner. # Going forward, we should keep the data structures as NumPy arrays # rather than Python lists. 
- sampled_token_ids_np = self.sampled_token_ids.numpy() - num_reqs = sampled_token_ids_np.shape[0] - sampled_token_ids: list[np.ndarray] = [ - sampled_token_ids_np[i, : self.num_sampled_tokens[i]] - for i in range(num_reqs) - ] + sampled_token_ids: list[list[int]] = self.sampled_token_ids.tolist() + num_reqs = len(sampled_token_ids) + for i in range(num_reqs): + del sampled_token_ids[i][self.num_sampled_tokens[i] :] self.model_runner_output.sampled_token_ids = sampled_token_ids if self.logprobs_tensors is not None: diff --git a/vllm/v1/worker/gpu/attn_utils.py b/vllm/v1/worker/gpu/attn_utils.py index 8850c18092299..222db565dff17 100644 --- a/vllm/v1/worker/gpu/attn_utils.py +++ b/vllm/v1/worker/gpu/attn_utils.py @@ -1,7 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project from collections.abc import Sequence -from typing import Any +from typing import Any, cast import torch @@ -13,6 +13,7 @@ from vllm.v1.attention.backends.utils import ( CommonAttentionMetadata, ) from vllm.v1.kv_cache_interface import ( + AttentionSpec, KVCacheConfig, KVCacheSpec, ) @@ -22,7 +23,8 @@ from vllm.v1.worker.utils import bind_kv_cache def get_kv_cache_spec(vllm_config: VllmConfig) -> dict[str, KVCacheSpec]: kv_cache_spec: dict[str, KVCacheSpec] = {} - attn_layers = get_layers_from_vllm_config(vllm_config, AttentionLayerBase) + layer_type = cast(type[Any], AttentionLayerBase) + attn_layers = get_layers_from_vllm_config(vllm_config, layer_type) for layer_name, attn_module in attn_layers.items(): # Skip modules that don't need KV cache (eg encoder-only attention) if spec := attn_module.get_kv_cache_spec(vllm_config): @@ -35,16 +37,15 @@ def init_attn_backend( vllm_config: VllmConfig, device: torch.device, ): - attn_backends: dict[str, AttentionBackend] = {} + attn_backends: dict[str, type[AttentionBackend]] = {} attn_metadata_builders: list[AttentionMetadataBuilder] = [] flashinfer_workspace: torch.Tensor | None = None for 
kv_cache_group_spec in kv_cache_config.kv_cache_groups: layer_names = kv_cache_group_spec.layer_names any_layer_name = next(iter(layer_names)) - attn_layers = get_layers_from_vllm_config( - vllm_config, AttentionLayerBase, layer_names - ) + layer_type = cast(type[Any], AttentionLayerBase) + attn_layers = get_layers_from_vllm_config(vllm_config, layer_type, layer_names) attn_backend = attn_layers[any_layer_name].get_attn_backend() for layer_name in layer_names: attn_backends[layer_name] = attn_backend @@ -93,6 +94,7 @@ def _reshape_kv_cache( kv_caches: dict[str, torch.Tensor] = {} for kv_cache_group_spec in kv_cache_config.kv_cache_groups: kv_cache_spec = kv_cache_group_spec.kv_cache_spec + assert isinstance(kv_cache_spec, AttentionSpec) for layer_name in kv_cache_group_spec.layer_names: raw_tensor = kv_cache_raw_tensors[layer_name] assert raw_tensor.numel() % kv_cache_spec.page_size_bytes == 0 diff --git a/vllm/v1/worker/gpu/cudagraph_utils.py b/vllm/v1/worker/gpu/cudagraph_utils.py index 7fd1f76669f48..31a706475243c 100644 --- a/vllm/v1/worker/gpu/cudagraph_utils.py +++ b/vllm/v1/worker/gpu/cudagraph_utils.py @@ -34,8 +34,16 @@ class CudaGraphManager: self.compilation_config = vllm_config.compilation_config assert self.compilation_config is not None - self.cudagraph_mode = self.compilation_config.cudagraph_mode - self.cudagraph_sizes = sorted(self.compilation_config.cudagraph_capture_sizes) + if self.compilation_config.cudagraph_mode is None: + self.cudagraph_mode = CUDAGraphMode.NONE + else: + self.cudagraph_mode = self.compilation_config.cudagraph_mode + if self.compilation_config.cudagraph_capture_sizes is not None: + self.cudagraph_sizes = sorted( + self.compilation_config.cudagraph_capture_sizes + ) + else: + self.cudagraph_sizes = [] self.padded_sizes = self._init_padded_sizes() self.graphs: dict[int, torch.cuda.CUDAGraph] = {} diff --git a/vllm/v1/worker/gpu/model_runner.py b/vllm/v1/worker/gpu/model_runner.py index 08aad9ddd06b3..9ca37ff282d82 100644 --- 
a/vllm/v1/worker/gpu/model_runner.py +++ b/vllm/v1/worker/gpu/model_runner.py @@ -329,8 +329,9 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): torch.cuda.synchronize() def update_states(self, scheduler_output: SchedulerOutput) -> None: - for req_id in scheduler_output.preempted_req_ids: - self.req_states.remove_request(req_id) + if scheduler_output.preempted_req_ids is not None: + for req_id in scheduler_output.preempted_req_ids: + self.req_states.remove_request(req_id) for req_id in scheduler_output.finished_req_ids: self.req_states.remove_request(req_id) @@ -346,6 +347,9 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): # Add new requests. for new_req_data in scheduler_output.scheduled_new_reqs: + assert new_req_data.prompt_token_ids is not None + assert new_req_data.prefill_token_ids is not None + assert new_req_data.sampling_params is not None req_id = new_req_data.req_id self.req_states.add_request( req_id=req_id, @@ -398,8 +402,8 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): # Decode first, then prefill. 
# batch_idx -> req_id req_ids = sorted( - scheduler_output.num_scheduled_tokens, - key=scheduler_output.num_scheduled_tokens.get, + scheduler_output.num_scheduled_tokens.keys(), + key=lambda k: scheduler_output.num_scheduled_tokens[k], ) num_scheduled_tokens = np.array( [scheduler_output.num_scheduled_tokens[i] for i in req_ids], dtype=np.int32 @@ -637,9 +641,9 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): model_runner_output = ModelRunnerOutput( req_ids=input_batch.req_ids, req_id_to_index={req_id: i for i, req_id in enumerate(input_batch.req_ids)}, - sampled_token_ids=None, + sampled_token_ids=None, # type: ignore logprobs=None, - prompt_logprobs_dict=prompt_logprobs_dict, + prompt_logprobs_dict=prompt_logprobs_dict, # type: ignore pooler_output=[], kv_connector_output=None, num_nans_in_logits=None, diff --git a/vllm/v1/worker/gpu/sampler.py b/vllm/v1/worker/gpu/sampler.py index e916aadb6b5a0..55f98ca6bb6a3 100644 --- a/vllm/v1/worker/gpu/sampler.py +++ b/vllm/v1/worker/gpu/sampler.py @@ -8,8 +8,8 @@ import triton.language as tl from vllm.config.model import LogprobsMode from vllm.v1.outputs import LogprobsTensors, SamplerOutput -from vllm.v1.sample.metadata import SamplingMetadata from vllm.v1.sample.ops.topk_topp_sampler import apply_top_k_top_p +from vllm.v1.worker.gpu.states import SamplingMetadata class Sampler: From 1840c5cb1818ae036cb4d8276d37ce81142acbee Mon Sep 17 00:00:00 2001 From: Lucas Wilkinson Date: Fri, 21 Nov 2025 14:41:52 -0500 Subject: [PATCH 24/83] [BugFix] Make sure to allocate worst case MoE workspace during profile run in the DP + EP case (#27426) Signed-off-by: Lucas Wilkinson --- vllm/envs.py | 4 +- .../layers/fused_moe/modular_kernel.py | 41 +++++++++++++++++++ 2 files changed, 43 insertions(+), 2 deletions(-) diff --git a/vllm/envs.py b/vllm/envs.py index d2d6917403420..9b1ed1fc680b4 100755 --- a/vllm/envs.py +++ b/vllm/envs.py @@ -55,7 +55,7 @@ if TYPE_CHECKING: VLLM_CPU_SGL_KERNEL: bool = False 
VLLM_XLA_CACHE_PATH: str = os.path.join(VLLM_CACHE_ROOT, "xla_cache") VLLM_XLA_CHECK_RECOMPILATION: bool = False - VLLM_FUSED_MOE_CHUNK_SIZE: int = 64 * 1024 + VLLM_FUSED_MOE_CHUNK_SIZE: int = 16 * 1024 VLLM_ENABLE_FUSED_MOE_ACTIVATION_CHUNKING: bool = True VLLM_USE_RAY_COMPILED_DAG_CHANNEL_TYPE: Literal["auto", "nccl", "shm"] = "auto" VLLM_USE_RAY_COMPILED_DAG_OVERLAP_COMM: bool = False @@ -785,7 +785,7 @@ environment_variables: dict[str, Callable[[], Any]] = { # Enable SPMD mode for TPU backend. "VLLM_XLA_USE_SPMD": lambda: bool(int(os.getenv("VLLM_XLA_USE_SPMD", "0"))), "VLLM_FUSED_MOE_CHUNK_SIZE": lambda: int( - os.getenv("VLLM_FUSED_MOE_CHUNK_SIZE", "32768") + os.getenv("VLLM_FUSED_MOE_CHUNK_SIZE", str(16 * 1024)) ), # Control whether to use fused MoE activation chunking. Current chunking # logic is incompatible with torch.compile and causes IMA. See issue diff --git a/vllm/model_executor/layers/fused_moe/modular_kernel.py b/vllm/model_executor/layers/fused_moe/modular_kernel.py index 4af7af9257dfa..b2af58cdca887 100644 --- a/vllm/model_executor/layers/fused_moe/modular_kernel.py +++ b/vllm/model_executor/layers/fused_moe/modular_kernel.py @@ -10,6 +10,9 @@ from typing import final import torch import vllm.envs as envs +from vllm.config import get_current_vllm_config +from vllm.forward_context import get_forward_context, is_forward_context_available +from vllm.logger import init_logger from vllm.model_executor.layers.fused_moe.config import FusedMoEQuantConfig from vllm.model_executor.layers.fused_moe.utils import ( _resize_cache, @@ -26,6 +29,8 @@ from vllm.v1.worker.ubatching import ( dbo_yield, ) +logger = init_logger(__name__) + # # This file defines a set of base classes used to make MoE kernels more modular. 
# The goal is to be able to utilize different communication mechanisms with @@ -798,6 +803,42 @@ class FusedMoEModularKernel(torch.nn.Module): buffers = self.shared_buffers[ubatch_idx] workspace_dtype = self.fused_experts.workspace_dtype(out_dtype) + # Force worst-case allocation in profiling run for + # "mk.FusedMoEModularKernel.Standard" formats where this is only bounded + # by `VLLM_FUSED_MOE_CHUNK_SIZE` and may not be seen during profiling with + # DP+EP due to the random token routing. + is_profile_run = ( + is_forward_context_available() + and get_forward_context().attn_metadata is None + ) + if is_profile_run and self.fused_experts.supports_chunking(): + parallel_config = get_current_vllm_config().parallel_config + is_dp_ep = ( + parallel_config.data_parallel_size > 1 + and parallel_config.enable_expert_parallel + ) + if is_dp_ep: + max_workspace_13, max_workspace_2, max_fused_out_shape = ( + self.fused_experts.workspace_shapes( + envs.VLLM_FUSED_MOE_CHUNK_SIZE, + N, + K, + top_k, + global_num_experts, + local_num_experts, + expert_tokens_meta, + ) + ) + buffers.workspace13.get( + max_workspace_13, device=device, dtype=workspace_dtype + ) + buffers.workspace2.get( + max_workspace_2, device=device, dtype=workspace_dtype + ) + buffers.fused_out.get( + max_fused_out_shape, device=device, dtype=workspace_dtype + ) + # Get intermediate workspace shapes based off the chunked M size. 
workspace13_shape, workspace2_shape, _ = self.fused_experts.workspace_shapes( M_chunk, From 53a1ba6ec584ea93531a3195b3b9f8049786055b Mon Sep 17 00:00:00 2001 From: Ning Xie Date: Sat, 22 Nov 2025 05:06:09 +0800 Subject: [PATCH 25/83] [log] add weights loading time log to sharded_state loader (#28628) Signed-off-by: Andy Xie --- vllm/model_executor/model_loader/sharded_state_loader.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/vllm/model_executor/model_loader/sharded_state_loader.py b/vllm/model_executor/model_loader/sharded_state_loader.py index d94dbd9f06e0b..1538f0c2af655 100644 --- a/vllm/model_executor/model_loader/sharded_state_loader.py +++ b/vllm/model_executor/model_loader/sharded_state_loader.py @@ -4,6 +4,7 @@ import collections import glob import os +import time from collections.abc import Generator from typing import Any @@ -132,6 +133,7 @@ class ShardedStateLoader(BaseModelLoader): f"pre-sharded checkpoints are currently supported!" ) state_dict = self._filter_subtensors(model.state_dict()) + counter_before_loading_weights = time.perf_counter() for key, tensor in self.iterate_over_files(filepaths): # If loading with LoRA enabled, additional padding may # be added to certain parameters. 
We only load into a @@ -150,6 +152,12 @@ class ShardedStateLoader(BaseModelLoader): ) param_data.copy_(tensor) state_dict.pop(key) + counter_after_loading_weights = time.perf_counter() + logger.info_once( + "Loading weights took %.2f seconds", + counter_after_loading_weights - counter_before_loading_weights, + scope="local", + ) if state_dict: raise ValueError(f"Missing keys {tuple(state_dict)} in loaded state!") From c68c7b403dce632dbbbb6d2482ea86fe7bf53d51 Mon Sep 17 00:00:00 2001 From: Lucas Wilkinson Date: Fri, 21 Nov 2025 16:58:32 -0500 Subject: [PATCH 26/83] [BugFix] Fix missing symbol triggering FA2 fallback on Hopper (#29107) Signed-off-by: Lucas Wilkinson --- cmake/external_projects/vllm_flash_attn.cmake | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cmake/external_projects/vllm_flash_attn.cmake b/cmake/external_projects/vllm_flash_attn.cmake index 6cc5cda14c525..ff687e0af7b44 100644 --- a/cmake/external_projects/vllm_flash_attn.cmake +++ b/cmake/external_projects/vllm_flash_attn.cmake @@ -38,7 +38,7 @@ else() FetchContent_Declare( vllm-flash-attn GIT_REPOSITORY https://github.com/vllm-project/flash-attention.git - GIT_TAG 71bb26f6295449be880344b93b51791cc009237d + GIT_TAG 86f8f157cf82aa2342743752b97788922dd7de43 GIT_PROGRESS TRUE # Don't share the vllm-flash-attn build between build types BINARY_DIR ${CMAKE_BINARY_DIR}/vllm-flash-attn From 57430fc95c8a94a7c68b3d525e3b8823b0f2433f Mon Sep 17 00:00:00 2001 From: Julien Denize <40604584+juliendenize@users.noreply.github.com> Date: Fri, 21 Nov 2025 22:58:59 +0100 Subject: [PATCH 27/83] Default model load/config/tokenizer to `mistral` format if relevant files exist (#28659) Signed-off-by: Julien Denize Signed-off-by: Julien Denize <40604584+juliendenize@users.noreply.github.com> Signed-off-by: mgoin Signed-off-by: Michael Goin Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com> Co-authored-by: mgoin --- docs/features/tool_calling.md | 23 
+++++-- .../language/generation/test_mistral.py | 2 +- tests/models/multimodal/test_mapping.py | 14 ++++- .../models/quantization/test_bitsandbytes.py | 3 + tests/tool_use/utils.py | 6 ++ tests/transformers_utils/test_config.py | 62 +++++++++++++++++++ tests/transformers_utils/test_utils.py | 6 +- .../llm/test_struct_output_generate.py | 14 ++++- vllm/config/model.py | 9 +-- vllm/model_executor/model_loader/__init__.py | 2 + .../model_loader/default_loader.py | 20 +++++- vllm/transformers_utils/config.py | 46 +++++++++++++- vllm/transformers_utils/configs/mistral.py | 2 +- vllm/transformers_utils/tokenizer.py | 30 +++++---- vllm/v1/engine/processor.py | 25 +++++++- 15 files changed, 230 insertions(+), 34 deletions(-) create mode 100644 tests/transformers_utils/test_config.py diff --git a/docs/features/tool_calling.md b/docs/features/tool_calling.md index 7e6c69e717dba..dd79ba19b7247 100644 --- a/docs/features/tool_calling.md +++ b/docs/features/tool_calling.md @@ -142,7 +142,7 @@ Flags: `--tool-call-parser hermes` Supported models: * `mistralai/Mistral-7B-Instruct-v0.3` (confirmed) -* Additional mistral function-calling models are compatible as well. +* Additional Mistral function-calling models are compatible as well. Known issues: @@ -158,12 +158,25 @@ Known issues: Recommended flags: -1. To use [mistral-common](https://github.com/mistralai/mistral-common) the official Mistral tokenization backend: +1. To use the official Mistral AI's format: - `--tokenizer_mode mistral --config_format mistral --load_format mistral --tool-call-parser mistral` + `--tool-call-parser mistral` -2. To use the default Transformers tokenization backend: - `--tool-call-parser mistral --chat-template examples/tool_chat_template_mistral_parallel.jinja` +2. To use the Transformers format when available: + + `--tokenizer_mode hf --config_format hf --load_format hf --tool-call-parser mistral --chat-template examples/tool_chat_template_mistral_parallel.jinja` + +!!! 
note + Models officially released by Mistral AI have two possible formats: + + 1. The official format that is used by default with `auto` or `mistral` arguments: + + `--tokenizer_mode mistral --config_format mistral --load_format mistral` + This format uses [mistral-common](https://github.com/mistralai/mistral-common), the Mistral AI's tokenizer backend. + + 2. The Transformers format, when available, that is used with `hf` arguments: + + `--tokenizer_mode hf --config_format hf --load_format hf --chat-template examples/tool_chat_template_mistral_parallel.jinja` ### Llama Models (`llama3_json`) diff --git a/tests/models/language/generation/test_mistral.py b/tests/models/language/generation/test_mistral.py index 0ae83ec16020a..80e337d570a36 100644 --- a/tests/models/language/generation/test_mistral.py +++ b/tests/models/language/generation/test_mistral.py @@ -208,7 +208,7 @@ def test_mistral_format( with vllm_runner( model, dtype=dtype, - tokenizer_mode="auto", + tokenizer_mode="hf", load_format="safetensors", config_format="hf", ) as hf_format_model: diff --git a/tests/models/multimodal/test_mapping.py b/tests/models/multimodal/test_mapping.py index 2f38dc450ef96..0d2eaca95504e 100644 --- a/tests/models/multimodal/test_mapping.py +++ b/tests/models/multimodal/test_mapping.py @@ -50,12 +50,24 @@ def test_hf_model_weights_mapper(model_arch: str): model_info.check_available_online(on_fail="skip") model_info.check_transformers_version(on_fail="skip") + is_mistral_model = model_arch in [ + "Mistral3ForConditionalGeneration", + "PixtralForConditionalGeneration", + "VoxtralForConditionalGeneration", + ] + + if not is_mistral_model or model_info.tokenizer_mode == "mistral": + tokenizer_mode = model_info.tokenizer_mode + else: + tokenizer_mode = "hf" + model_id = model_info.default model_config = ModelConfig( model_id, tokenizer=model_info.tokenizer or model_id, - tokenizer_mode=model_info.tokenizer_mode, + tokenizer_mode=tokenizer_mode, + config_format="hf", 
revision=model_info.revision, trust_remote_code=model_info.trust_remote_code, hf_overrides=model_info.hf_overrides, diff --git a/tests/models/quantization/test_bitsandbytes.py b/tests/models/quantization/test_bitsandbytes.py index dc4b4546e451b..5b8aaa299fdc1 100644 --- a/tests/models/quantization/test_bitsandbytes.py +++ b/tests/models/quantization/test_bitsandbytes.py @@ -259,6 +259,9 @@ def validate_generated_texts( tensor_parallel_size=vllm_tp_size, enforce_eager=False, default_torch_num_threads=1, + tokenizer_mode="hf", + load_format="hf", + config_format="hf", ) as llm: vllm_outputs = llm.generate_greedy(prompts, max_tokens) vllm_logs = log_generated_texts(prompts, vllm_outputs, "VllmRunner") diff --git a/tests/tool_use/utils.py b/tests/tool_use/utils.py index d188b21863812..7584b903156b7 100644 --- a/tests/tool_use/utils.py +++ b/tests/tool_use/utils.py @@ -128,6 +128,12 @@ CONFIGS: dict[str, ServerConfig] = { "arguments": [ "--enforce-eager", "--no-enable-prefix-caching", + "--tokenizer_mode", + "hf", + "--load_format", + "hf", + "--config_format", + "hf", "--tool-call-parser", "mistral", "--chat-template", diff --git a/tests/transformers_utils/test_config.py b/tests/transformers_utils/test_config.py new file mode 100644 index 0000000000000..de28ab5f99e8c --- /dev/null +++ b/tests/transformers_utils/test_config.py @@ -0,0 +1,62 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + + +import tempfile +from pathlib import Path +from unittest.mock import MagicMock, call, patch + +import pytest + +from vllm.transformers_utils.config import list_filtered_repo_files + + +@pytest.mark.parametrize( + "allow_patterns,expected_relative_files", + [ + ( + ["*.json", "correct*.txt"], + ["json_file.json", "subfolder/correct.txt", "correct_2.txt"], + ), + ], +) +def test_list_filtered_repo_files( + allow_patterns: list[str], expected_relative_files: list[str] +): + with tempfile.TemporaryDirectory() as tmp_dir: + 
# Prep folder and files + path_tmp_dir = Path(tmp_dir) + subfolder = path_tmp_dir / "subfolder" + subfolder.mkdir() + (path_tmp_dir / "json_file.json").touch() + (path_tmp_dir / "correct_2.txt").touch() + (path_tmp_dir / "uncorrect.txt").touch() + (path_tmp_dir / "uncorrect.jpeg").touch() + (subfolder / "correct.txt").touch() + (subfolder / "uncorrect_sub.txt").touch() + + def _glob_path() -> list[str]: + return [ + str(file.relative_to(path_tmp_dir)) + for file in path_tmp_dir.glob("**/*") + if file.is_file() + ] + + # Patch list_repo_files called by fn + with patch( + "vllm.transformers_utils.config.list_repo_files", + MagicMock(return_value=_glob_path()), + ) as mock_list_repo_files: + out_files = sorted( + list_filtered_repo_files( + tmp_dir, allow_patterns, "revision", "model", "token" + ) + ) + assert out_files == sorted(expected_relative_files) + assert mock_list_repo_files.call_count == 1 + assert mock_list_repo_files.call_args_list[0] == call( + repo_id=tmp_dir, + revision="revision", + repo_type="model", + token="token", + ) diff --git a/tests/transformers_utils/test_utils.py b/tests/transformers_utils/test_utils.py index beaef04d766bf..bfe1cec76c138 100644 --- a/tests/transformers_utils/test_utils.py +++ b/tests/transformers_utils/test_utils.py @@ -2,7 +2,11 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -from vllm.transformers_utils.utils import is_cloud_storage, is_gcs, is_s3 +from vllm.transformers_utils.utils import ( + is_cloud_storage, + is_gcs, + is_s3, +) def test_is_gcs(): diff --git a/tests/v1/entrypoints/llm/test_struct_output_generate.py b/tests/v1/entrypoints/llm/test_struct_output_generate.py index a00600b87eca1..d1b037b7956cf 100644 --- a/tests/v1/entrypoints/llm/test_struct_output_generate.py +++ b/tests/v1/entrypoints/llm/test_struct_output_generate.py @@ -46,11 +46,15 @@ EAGLE_SPEC_CONFIG = { PARAMS_MODELS_BACKENDS_TOKENIZER_MODE = [ ("mistralai/Ministral-8B-Instruct-2410", "xgrammar", "auto", None), - 
("mistralai/Ministral-8B-Instruct-2410", "guidance", "auto", None), + # FIXME: Since "auto" will use Mistral tokenizer and these backends do not support + # it, we skip these tests for now. + # ("mistralai/Ministral-8B-Instruct-2410", "guidance", "auto", None), + # ("mistralai/Ministral-8B-Instruct-2410", "lm-format-enforcer", "auto", None), + ("mistralai/Ministral-8B-Instruct-2410", "guidance", "hf", None), pytest.param( "mistralai/Ministral-8B-Instruct-2410", "lm-format-enforcer", - "auto", + "hf", None, marks=pytest.mark.skip( reason=( @@ -80,7 +84,7 @@ PARAMS_MODELS_BACKENDS_TOKENIZER_MODE = [ # ("mistralai/Ministral-8B-Instruct-2410", "outlines", "mistral", None), # ("Qwen/Qwen2.5-1.5B-Instruct", "guidance", "auto"), ("mistralai/Ministral-8B-Instruct-2410", "outlines", "auto", NGRAM_SPEC_CONFIG), - ("mistralai/Ministral-8B-Instruct-2410", "guidance", "auto", NGRAM_SPEC_CONFIG), + ("mistralai/Ministral-8B-Instruct-2410", "guidance", "hf", NGRAM_SPEC_CONFIG), ("Qwen/Qwen2.5-1.5B-Instruct", "xgrammar", "auto", NGRAM_SPEC_CONFIG), ("meta-llama/Meta-Llama-3.1-8B-Instruct", "xgrammar", "auto", EAGLE_SPEC_CONFIG), ] @@ -151,6 +155,8 @@ def test_structured_output( ), seed=120, tokenizer_mode=tokenizer_mode, + load_format="auto" if not model_name.startswith("mistralai/") else "hf", + config_format="auto" if not model_name.startswith("mistralai/") else "hf", speculative_config=speculative_config, ) @@ -720,6 +726,8 @@ def test_structured_output_auto_mode( max_model_len=1024, structured_outputs_config=dict(backend="auto"), tokenizer_mode=tokenizer_mode, + load_format="auto", + config_format="auto", ) sampling_params = SamplingParams( diff --git a/vllm/config/model.py b/vllm/config/model.py index 8f59673f4e1c3..49688e17cf932 100644 --- a/vllm/config/model.py +++ b/vllm/config/model.py @@ -81,7 +81,7 @@ TaskOption = Literal[ "transcription", "draft", ] -TokenizerMode = Literal["auto", "slow", "mistral", "custom"] +TokenizerMode = Literal["auto", "hf", "slow", "mistral", 
"custom"] ModelDType = Literal["auto", "half", "float16", "bfloat16", "float", "float32"] LogprobsMode = Literal[ "raw_logits", "raw_logprobs", "processed_logits", "processed_logprobs" @@ -130,7 +130,8 @@ class ModelConfig: name or path will be used.""" tokenizer_mode: TokenizerMode = "auto" """Tokenizer mode:\n - - "auto" will use the fast tokenizer if available.\n + - "auto" will use "hf" tokenizer if Mistral's tokenizer is not available.\n + - "hf" will use the fast tokenizer if available.\n - "slow" will always use the slow tokenizer.\n - "mistral" will always use the tokenizer from `mistral_common`.\n - "custom" will use --tokenizer to select the preregistered tokenizer.""" @@ -241,8 +242,8 @@ class ModelConfig: first one.""" config_format: str | ConfigFormat = "auto" """The format of the model config to load:\n - - "auto" will try to load the config in hf format if available else it - will try to load in mistral format.\n + - "auto" will try to load the config in hf format if available after trying + to load in mistral format.\n - "hf" will load the config in hf format.\n - "mistral" will load the config in mistral format.""" hf_token: bool | str | None = None diff --git a/vllm/model_executor/model_loader/__init__.py b/vllm/model_executor/model_loader/__init__.py index 301f2d00bf404..052d2cfc1099e 100644 --- a/vllm/model_executor/model_loader/__init__.py +++ b/vllm/model_executor/model_loader/__init__.py @@ -30,6 +30,7 @@ logger = init_logger(__name__) # if a new load format is added here LoadFormats = Literal[ "auto", + "hf", "bitsandbytes", "dummy", "fastsafetensors", @@ -45,6 +46,7 @@ LoadFormats = Literal[ ] _LOAD_FORMAT_TO_MODEL_LOADER: dict[str, type[BaseModelLoader]] = { "auto": DefaultModelLoader, + "hf": DefaultModelLoader, "bitsandbytes": BitsAndBytesModelLoader, "dummy": DummyModelLoader, "fastsafetensors": DefaultModelLoader, diff --git a/vllm/model_executor/model_loader/default_loader.py b/vllm/model_executor/model_loader/default_loader.py index 
67aa584c6bda2..7401a7a0e2dbb 100644 --- a/vllm/model_executor/model_loader/default_loader.py +++ b/vllm/model_executor/model_loader/default_loader.py @@ -31,6 +31,7 @@ from vllm.model_executor.model_loader.weight_utils import ( safetensors_weights_iterator, ) from vllm.platforms import current_platform +from vllm.transformers_utils.config import list_filtered_repo_files logger = init_logger(__name__) @@ -96,8 +97,25 @@ class DefaultModelLoader(BaseModelLoader): load_format = self.load_config.load_format use_safetensors = False index_file = SAFE_WEIGHTS_INDEX_NAME - # Some quantized models use .pt files for storing the weights. + + # First check for 'auto' format that mistral files format are present. + # This is to load mistral models with official format by default. if load_format == "auto": + load_format = ( + "mistral" + if len( + list_filtered_repo_files( + model_name_or_path=model_name_or_path, + allow_patterns=["consolidated*.safetensors"], + revision=revision, + ) + ) + > 0 + else "hf" + ) + + # Some quantized models use .pt files for storing the weights. 
+ if load_format == "hf": allow_patterns = ["*.safetensors", "*.bin"] elif load_format == "safetensors" or load_format == "fastsafetensors": use_safetensors = True diff --git a/vllm/transformers_utils/config.py b/vllm/transformers_utils/config.py index df24738477e76..9eac7bb50afa6 100644 --- a/vllm/transformers_utils/config.py +++ b/vllm/transformers_utils/config.py @@ -1,6 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import fnmatch import json import os import time @@ -355,6 +356,41 @@ def list_repo_files( return with_retry(lookup_files, "Error retrieving file list") +def list_filtered_repo_files( + model_name_or_path: str, + allow_patterns: list[str], + revision: str | None = None, + repo_type: str | None = None, + token: str | bool | None = None, +) -> list[str]: + try: + all_files = list_repo_files( + repo_id=model_name_or_path, + revision=revision, + token=token, + repo_type=repo_type, + ) + except Exception: + logger.error( + "Error retrieving file list. Please ensure your `model_name_or_path`" + "`repo_type`, `token` and `revision` arguments are correctly set. " + "Returning an empty list." + ) + return [] + + file_list = [] + # Filter patterns on filenames + for pattern in allow_patterns: + file_list.extend( + [ + file + for file in all_files + if fnmatch.fnmatch(os.path.basename(file), pattern) + ] + ) + return file_list + + def file_exists( repo_id: str, file_name: str, @@ -619,10 +655,14 @@ def get_config( if config_format == "auto": try: - if is_gguf or file_or_path_exists(model, HF_CONFIG_NAME, revision=revision): - config_format = "hf" - elif file_or_path_exists(model, MISTRAL_CONFIG_NAME, revision=revision): + # First check for Mistral to avoid defaulting to + # Transformers implementation. 
+ if file_or_path_exists(model, MISTRAL_CONFIG_NAME, revision=revision): config_format = "mistral" + elif is_gguf or file_or_path_exists( + model, HF_CONFIG_NAME, revision=revision + ): + config_format = "hf" else: raise ValueError( "Could not detect config format for no config file found. " diff --git a/vllm/transformers_utils/configs/mistral.py b/vllm/transformers_utils/configs/mistral.py index fe202b2ed1568..8da4ab35c56c3 100644 --- a/vllm/transformers_utils/configs/mistral.py +++ b/vllm/transformers_utils/configs/mistral.py @@ -118,7 +118,7 @@ def _remap_general_mistral_args(config: dict) -> dict: "model_type": ("model_type", "transformer"), "hidden_act": ("activation", "silu"), "tie_word_embeddings": ("tied_embeddings", False), - "max_seq_len": ("max_seq_len", 128_000), + "max_seq_len": ("max_seq_len", config.get("max_position_embeddings", 128_000)), "max_position_embeddings": ("max_position_embeddings", 128_000), } diff --git a/vllm/transformers_utils/tokenizer.py b/vllm/transformers_utils/tokenizer.py index a393568909d27..233076741503d 100644 --- a/vllm/transformers_utils/tokenizer.py +++ b/vllm/transformers_utils/tokenizer.py @@ -3,8 +3,8 @@ import contextlib import copy +import importlib.util import os -import warnings from functools import lru_cache from pathlib import Path from typing import TYPE_CHECKING, Any, TypeAlias @@ -15,7 +15,10 @@ from typing_extensions import assert_never from vllm import envs from vllm.logger import init_logger -from vllm.transformers_utils.config import get_sentence_transformer_tokenizer_config +from vllm.transformers_utils.config import ( + get_sentence_transformer_tokenizer_config, + list_filtered_repo_files, +) from vllm.transformers_utils.tokenizers import MistralTokenizer from vllm.transformers_utils.utils import check_gguf_file @@ -182,25 +185,29 @@ def get_tokenizer( kwargs["gguf_file"] = Path(tokenizer_name).name tokenizer_name = Path(tokenizer_name).parent - # if tokenizer is from official mistral org - 
is_from_mistral_org = str(tokenizer_name).split("/")[0] == "mistralai" - if is_from_mistral_org and tokenizer_mode != "mistral": - warnings.warn( - "It is strongly recommended to run mistral models with " - '`--tokenizer-mode "mistral"` to ensure correct ' - "encoding and decoding.", - FutureWarning, - stacklevel=2, + # if `tokenizer_mode` == "auto", check if tokenizer can be loaded via Mistral format + # first to use official Mistral tokenizer if possible. + mistral_common_installed = importlib.util.find_spec("mistral_common") is not None + if tokenizer_mode == "auto" and mistral_common_installed: + allow_patterns = ["tekken.json", "tokenizer.model.v*"] + files_list = list_filtered_repo_files( + model_name_or_path=str(tokenizer_name), + allow_patterns=allow_patterns, + revision=revision, ) + if len(files_list) > 0: + tokenizer_mode = "mistral" tokenizer: AnyTokenizer if tokenizer_mode == "mistral": + logger.debug_once(f"Loading MistralTokenizer from {tokenizer_name}") tokenizer = MistralTokenizer.from_pretrained( str(tokenizer_name), revision=revision ) elif tokenizer_mode == "custom": from vllm.transformers_utils.tokenizer_base import TokenizerRegistry + logger.debug_once(f"Loading CustomTokenizer from {tokenizer_name}") tokenizer = TokenizerRegistry.get_tokenizer( str(tokenizer_name), *args, @@ -210,6 +217,7 @@ def get_tokenizer( ) else: try: + logger.debug_once(f"Loading AutoTokenizer from {tokenizer_name}") tokenizer = AutoTokenizer.from_pretrained( tokenizer_name, *args, diff --git a/vllm/v1/engine/processor.py b/vllm/v1/engine/processor.py index 905ad406b307e..af4f0e410e253 100644 --- a/vllm/v1/engine/processor.py +++ b/vllm/v1/engine/processor.py @@ -20,6 +20,7 @@ from vllm.multimodal.utils import argsort_mm_positions from vllm.pooling_params import PoolingParams from vllm.sampling_params import SamplingParams from vllm.transformers_utils.tokenizer import AnyTokenizer +from vllm.transformers_utils.tokenizers.mistral import MistralTokenizer from vllm.utils 
import length_from_prompt_token_ids_or_embeds from vllm.v1.engine import EngineCoreRequest from vllm.v1.metrics.stats import MultiModalCacheStats @@ -300,12 +301,24 @@ class Processor: # allows <|special_token|> and similar, see # https://github.com/guidance-ai/llguidance/blob/main/docs/syntax.md#special-tokens # Without tokenizer these are disallowed in grammars. + if isinstance(self.tokenizer, MistralTokenizer): + raise ValueError( + "Mistral tokenizer is not supported for the 'guidance' " + "structured output backend. Please use ['xgrammar', 'outlines'] " + "backends or tokenizer_mode='hf' instead." + ) validate_guidance_grammar(params, tokenizer=None) elif backend == "outlines": # outlines backend validate_structured_output_request_outlines(params) elif backend == "lm-format-enforcer": # lm format enforcer backend + if isinstance(self.tokenizer, MistralTokenizer): + raise ValueError( + "Mistral tokenizer is not supported for the 'lm-format-enforcer' " + "structured output backend. Please use ['xgrammar', 'outlines'] " + "backends or tokenizer_mode='hf' instead." + ) validate_structured_output_request_lm_format_enforcer(params) else: # NOTE: backend must be "auto" here, because we have @@ -320,9 +333,15 @@ class Processor: except ValueError: # The request either failed validation # or includes some jsonschema feature(s) that - # are not supported in xgrammar. Fall back to guidance. - validate_guidance_grammar(params, tokenizer=None) - params.structured_outputs._backend = "guidance" + # are not supported in xgrammar. + if isinstance(self.tokenizer, MistralTokenizer): + # Fall back to outlines if the tokenizer is Mistral + validate_structured_output_request_outlines(params) + params.structured_outputs._backend = "outlines" + else: + # Fall back to guidance by default. 
+ validate_guidance_grammar(params, tokenizer=None) + params.structured_outputs._backend = "guidance" # Remember that this backend was set automatically params.structured_outputs._backend_was_auto = True From 3137991f55c9372d4743154a56933a37e47feca7 Mon Sep 17 00:00:00 2001 From: Varun Sundar Rabindranath Date: Fri, 21 Nov 2025 17:28:17 -0500 Subject: [PATCH 28/83] [BugFix] EPLB + B200 + DeepGEMM : Handle column-major scales tensor (#29162) Signed-off-by: Varun Sundar Rabindranath Co-authored-by: Varun Sundar Rabindranath --- tests/distributed/eplb_utils.py | 49 +++ tests/distributed/test_eplb_execute.py | 40 +-- .../distributed/test_eplb_fused_moe_layer.py | 285 ++++++++++++++++++ vllm/model_executor/layers/fused_moe/layer.py | 41 +++ 4 files changed, 376 insertions(+), 39 deletions(-) create mode 100644 tests/distributed/eplb_utils.py create mode 100644 tests/distributed/test_eplb_fused_moe_layer.py diff --git a/tests/distributed/eplb_utils.py b/tests/distributed/eplb_utils.py new file mode 100644 index 0000000000000..27a63e0215148 --- /dev/null +++ b/tests/distributed/eplb_utils.py @@ -0,0 +1,49 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +import os +import random + +import torch +import torch.multiprocessing as mp + +from vllm.distributed.parallel_state import ( + init_distributed_environment, +) +from vllm.utils.system_utils import update_environment_variables + +mp.set_start_method("spawn", force=True) + + +def distributed_run(fn, world_size, *args): + number_of_processes = world_size + processes: list[mp.Process] = [] + for i in range(number_of_processes): + env: dict[str, str] = {} + env["RANK"] = str(i) + env["LOCAL_RANK"] = str(i) + env["WORLD_SIZE"] = str(number_of_processes) + env["LOCAL_WORLD_SIZE"] = str(number_of_processes) + env["MASTER_ADDR"] = "localhost" + env["MASTER_PORT"] = "12345" + p = mp.Process(target=fn, args=(env, world_size, *args)) + processes.append(p) + p.start() + 
+ for p in processes: + p.join() + + for p in processes: + assert p.exitcode == 0 + + +def set_env_vars_and_device(env: dict[str, str]) -> None: + update_environment_variables(env) + local_rank = os.environ["LOCAL_RANK"] + device = torch.device(f"cuda:{local_rank}") + torch.cuda.set_device(device) + init_distributed_environment() + + # Ensure each worker process has the same random seed + random.seed(42) + torch.manual_seed(42) diff --git a/tests/distributed/test_eplb_execute.py b/tests/distributed/test_eplb_execute.py index 0a97749ac318c..9498e75b279b7 100644 --- a/tests/distributed/test_eplb_execute.py +++ b/tests/distributed/test_eplb_execute.py @@ -1,57 +1,19 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -import os import random import pytest import torch import torch.distributed -import torch.multiprocessing as mp from vllm.distributed.eplb.rebalance_execute import rearrange_expert_weights_inplace from vllm.distributed.parallel_state import ( ensure_model_parallel_initialized, get_tp_group, - init_distributed_environment, ) -from vllm.utils.system_utils import update_environment_variables -mp.set_start_method("spawn", force=True) - - -def distributed_run(fn, world_size, *args): - number_of_processes = world_size - processes: list[mp.Process] = [] - for i in range(number_of_processes): - env: dict[str, str] = {} - env["RANK"] = str(i) - env["LOCAL_RANK"] = str(i) - env["WORLD_SIZE"] = str(number_of_processes) - env["LOCAL_WORLD_SIZE"] = str(number_of_processes) - env["MASTER_ADDR"] = "localhost" - env["MASTER_PORT"] = "12345" - p = mp.Process(target=fn, args=(env, world_size, *args)) - processes.append(p) - p.start() - - for p in processes: - p.join() - - for p in processes: - assert p.exitcode == 0 - - -def set_env_vars_and_device(env: dict[str, str]) -> None: - update_environment_variables(env) - local_rank = os.environ["LOCAL_RANK"] - device = torch.device(f"cuda:{local_rank}") - 
torch.cuda.set_device(device) - init_distributed_environment() - - # Ensure each worker process has the same random seed - random.seed(42) - torch.manual_seed(42) +from .eplb_utils import distributed_run, set_env_vars_and_device def create_expert_indices_with_redundancy( diff --git a/tests/distributed/test_eplb_fused_moe_layer.py b/tests/distributed/test_eplb_fused_moe_layer.py new file mode 100644 index 0000000000000..55f26519887a1 --- /dev/null +++ b/tests/distributed/test_eplb_fused_moe_layer.py @@ -0,0 +1,285 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +# Test that the interaction between EPLB and FusedMoE Layer is okay + +from dataclasses import dataclass + +import pytest +import torch + +from vllm.config import VllmConfig, set_current_vllm_config +from vllm.distributed.eplb.rebalance_execute import rearrange_expert_weights_inplace +from vllm.distributed.parallel_state import ( + ensure_model_parallel_initialized, + get_tp_group, +) +from vllm.model_executor.layers.fused_moe.layer import FusedMoE + +from .eplb_utils import distributed_run, set_env_vars_and_device + + +@dataclass +class TestConfig: + num_layers: int + num_experts: int + num_local_experts: int + num_topk: int + hidden_size: int + intermediate_size: int + weight_dtype: torch.dtype + weight_scale_dtype: torch.dtype | None + column_major_scales: bool + + +def make_expert_weights( + layer_idx: int, + global_expert_idx: int, + global_num_experts: int, + tensor_shape: tuple[int, ...], + tensor_dtype: torch.dtype, + tensor_device: torch.device, + is_column_major: bool, +) -> torch.Tensor: + assert len(tensor_shape) == 2 + + if is_column_major: + tensor_shape = (tensor_shape[1], tensor_shape[0]) + + x = torch.empty(tensor_shape, dtype=tensor_dtype, device=tensor_device) + value_offset = (layer_idx * global_num_experts + global_expert_idx) * x.numel() + x.view(-1).copy_( + torch.arange( + value_offset, + value_offset + x.numel(), + 
dtype=tensor_dtype, + device=tensor_device, + ) + ) + + if is_column_major: + x = torch.transpose(x, 1, 0) + assert not x.is_contiguous() + return x + + +def make_fused_moe_layer( + rank: int, + layer_idx: int, + test_config: TestConfig, +) -> FusedMoE: + fml = FusedMoE( + num_experts=test_config.num_experts, + top_k=test_config.num_topk, + hidden_size=test_config.hidden_size, + intermediate_size=test_config.intermediate_size, + prefix=f"dummy_layer_{layer_idx}", + activation="silu", + is_act_and_mul=True, + params_dtype=test_config.weight_dtype, + ) + + device = torch.device(f"cuda:{rank}") + + from functools import partial + + _make_expert_weights = partial( + make_expert_weights, + layer_idx=layer_idx, + global_num_experts=test_config.num_experts, + tensor_device=device, + ) + + assert isinstance(fml.w13_weight.data, torch.Tensor) + assert isinstance(fml.w2_weight.data, torch.Tensor) + fml.w13_weight.data = fml.w13_weight.data.to(device=device) + fml.w2_weight.data = fml.w2_weight.data.to(device=device) + w13_weight = fml.w13_weight.data + w2_weight = fml.w2_weight.data + assert w13_weight.size(0) == test_config.num_local_experts + for i in range(test_config.num_local_experts): + g_i = rank * test_config.num_local_experts + i + w13_weight_e = w13_weight[i] + w2_weight_e = w2_weight[i] + w13_weight_e.copy_( + _make_expert_weights( + global_expert_idx=g_i, + tensor_shape=w13_weight_e.shape, + tensor_dtype=w13_weight_e.dtype, + is_column_major=False, + ) + ) + w2_weight_e.copy_( + _make_expert_weights( + global_expert_idx=g_i, + tensor_shape=w2_weight_e.shape, + tensor_dtype=w2_weight_e.dtype, + is_column_major=False, + ) + ) + + block_size = 16 + + def block_quant_scales_shape( + shape: tuple[int, ...], is_column_major: bool + ) -> tuple[int, ...]: + assert len(shape) == 3 + if not is_column_major: + return (shape[0], shape[1] // block_size, shape[2] // block_size) + else: + return (shape[0], shape[2] // block_size, shape[1] // block_size) + + is_column_major = 
test_config.column_major_scales + w13_weight_scale_inv = torch.empty( + block_quant_scales_shape(w13_weight.shape, is_column_major), + dtype=test_config.weight_dtype, + device=device, + ) + w2_weight_scale_inv = torch.empty( + block_quant_scales_shape(w2_weight.shape, is_column_major), + dtype=test_config.weight_dtype, + device=device, + ) + + for i in range(test_config.num_local_experts): + g_i = rank * test_config.num_local_experts + i + w13_s_e = w13_weight_scale_inv[i] + w2_s_e = w2_weight_scale_inv[i] + w13_s_e.copy_( + _make_expert_weights( + global_expert_idx=g_i, + tensor_shape=w13_s_e.shape, + tensor_dtype=w13_s_e.dtype, + # Fill data in row-major and then + # transpose if test_config requires col-major. + is_column_major=False, + ) + ) + w2_s_e.copy_( + _make_expert_weights( + global_expert_idx=g_i, + tensor_shape=w2_s_e.shape, + tensor_dtype=w2_s_e.dtype, + is_column_major=False, + ) + ) + if is_column_major: + w13_weight_scale_inv = torch.transpose(w13_weight_scale_inv, 1, 2) + w2_weight_scale_inv = torch.transpose(w2_weight_scale_inv, 1, 2) + assert not w13_weight_scale_inv.is_contiguous() + assert not w2_weight_scale_inv.is_contiguous() + + # Add scales to the parameter list + fml.w13_weight_scale_inv = torch.nn.Parameter( + w13_weight_scale_inv, requires_grad=False + ) + fml.w2_weight_scale_inv = torch.nn.Parameter( + w2_weight_scale_inv, requires_grad=False + ) + + return fml + + +def _test_eplb_fml(env, world_size: int, test_config: TestConfig): + # Initialize model parallel (using tensor parallel as an entrypoint + # to expert parallel) + set_env_vars_and_device(env) + + vllm_config = VllmConfig() + vllm_config.parallel_config.tensor_parallel_size = world_size + vllm_config.parallel_config.enable_expert_parallel = True + + with set_current_vllm_config(vllm_config): + ensure_model_parallel_initialized( + tensor_model_parallel_size=world_size, pipeline_model_parallel_size=1 + ) + + ep_group = get_tp_group().cpu_group + ep_rank = 
torch.distributed.get_rank() + + fml_layers = [ + make_fused_moe_layer(ep_rank, layer_idx, test_config) + for layer_idx in range(test_config.num_layers) + ] + rank_expert_weights = [fml.get_expert_weights() for fml in fml_layers] + + indices = torch.zeros( + test_config.num_layers, test_config.num_experts, dtype=torch.long + ) + for lidx in range(test_config.num_layers): + indices[lidx] = torch.Tensor(range(test_config.num_experts)) + + shuffled_indices = torch.zeros_like(indices) + for lidx in range(test_config.num_layers): + shuffled_indices[lidx] = torch.randperm(test_config.num_experts) + + rearrange_expert_weights_inplace( + indices, + shuffled_indices, + rank_expert_weights, + ep_group, + is_profile=False, + ) + + num_local_experts = test_config.num_local_experts + num_global_experts = test_config.num_experts + for lidx, fml in enumerate(fml_layers): + for name, w in fml.named_parameters(): + for e in range(num_local_experts): + g_e = shuffled_indices[lidx][ep_rank * num_local_experts + e] + ref = make_expert_weights( + layer_idx=lidx, + global_expert_idx=int(g_e.item()), + global_num_experts=num_global_experts, + tensor_shape=w[e].shape, + tensor_dtype=w[e].dtype, + tensor_device=w[e].device, + is_column_major=not w[e].is_contiguous(), + ) + assert w[e].shape == ref.shape and w[e].stride() == ref.stride(), ( + f"w[{e}] {w[e].size()} {w[e].stride()} vs " + f"ref {ref.size()} {ref.stride()}" + ) + torch.testing.assert_close(w[e], ref) + + +@pytest.mark.parametrize("world_size", [2]) +@pytest.mark.parametrize("num_layers", [4]) +@pytest.mark.parametrize("num_experts", [16]) +@pytest.mark.parametrize("hidden_size", [256]) +@pytest.mark.parametrize("intermediate_size", [256]) +@pytest.mark.parametrize("column_major_scales", [True, False]) +def test_eplb_fml( + world_size: int, + num_layers: int, + num_experts: int, + hidden_size: int, + intermediate_size: int, + column_major_scales: bool, +): + if torch.cuda.device_count() < world_size: + pytest.skip(f"Need at 
least {world_size} GPUs to run the test") + + num_local_experts = num_experts // world_size + num_topk = 4 + # The dtypes are fine as we are essentially just checking data-copies + weight_dtype = torch.bfloat16 + weight_scale_dtype = torch.bfloat16 + + test_config = TestConfig( + num_layers=num_layers, + num_experts=num_experts, + num_local_experts=num_local_experts, + num_topk=num_topk, + hidden_size=hidden_size, + intermediate_size=intermediate_size, + weight_dtype=weight_dtype, + weight_scale_dtype=weight_scale_dtype, + column_major_scales=column_major_scales, + ) + + distributed_run( + _test_eplb_fml, + world_size, + test_config, + ) diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py index b2f554efd8a6f..6619b64b2bbc0 100644 --- a/vllm/model_executor/layers/fused_moe/layer.py +++ b/vllm/model_executor/layers/fused_moe/layer.py @@ -1391,7 +1391,48 @@ class FusedMoE(CustomOp): yield param_name def get_expert_weights(self) -> Iterable[torch.Tensor]: + def _maybe_make_contiguous( + name: str, p: torch.nn.Parameter + ) -> torch.nn.Parameter: + """ + In some cases, the last 2 dimensions (the non-expert dimensions) + of the weight scale tensor are transposed. This function + transforms the tensor (view update) so the tensor is contiguous(). + Example: A non-contiguous scale tensor, + `x` of shape (E, 32, 16) and stride (512, 1, 32) is transformed to + `x_` of shape (E, 16, 32) and stride (512, 32, 1). + Note that we specifically use torch.transpose() so `x_` refers + to the same underlying memory. The tensors `x` and `x_`, pointing + to the same underlying memory make this transformation safe in the + context of EPLB. i.e. It is the same memory and just the view + is different. + Note: This function handles the "weight_scale" tensors specifically. + This could however be generalized to handle similar tensors. + """ + if p.ndim != 3: + return p + if p.is_contiguous(): + # Already contiguous. do nothing. 
+ return p + # p is non-contiguous. We only handle the case where the last 2 + # dimensions of the scales tensor is transposed. We can handle + # other cases when they become relevant. + is_transposed_12 = p.stride(1) == 1 and p.stride(2) != 1 + if "weight_scale" not in name or not is_transposed_12: + # do nothing. + return p + + # Do not update the layer paramater as the layer's MoE operations would + # expect the parameter's tensor to the same shape / stride. Instead, + # make a new torch.nn.Parameter that is used just in the context of + # EPLB. + return torch.nn.Parameter( + torch.transpose(p.data, 1, 2), requires_grad=False + ) + weights = list(self.named_parameters()) + weights = [(name, _maybe_make_contiguous(name, p)) for name, p in weights] + assert all( weight.is_contiguous() for name, weight in weights From c6fa3895e90f6daef4d223188f6b4156311f40c9 Mon Sep 17 00:00:00 2001 From: Mark McLoughlin Date: Fri, 21 Nov 2025 22:45:00 +0000 Subject: [PATCH 29/83] [KV Connector] Fix async connector prefix cache metrics (#28585) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Mark McLoughlin Co-authored-by: Nicolò Lucchesi --- tests/v1/core/test_scheduler.py | 17 +++++++++++++---- vllm/v1/core/sched/scheduler.py | 16 ++++++++-------- vllm/v1/request.py | 3 +++ 3 files changed, 24 insertions(+), 12 deletions(-) diff --git a/tests/v1/core/test_scheduler.py b/tests/v1/core/test_scheduler.py index 04e738293cd77..d9a69a77c9797 100644 --- a/tests/v1/core/test_scheduler.py +++ b/tests/v1/core/test_scheduler.py @@ -1057,7 +1057,8 @@ def test_kv_connector_basic(is_async: bool): ) -def test_external_prefix_cache_metrics(): +@pytest.mark.parametrize("is_async", [False, True]) +def test_external_prefix_cache_metrics(is_async: bool): """ Verify connector prefix cache metrics are updated correctly when the scheduler processes requests with KV connector hits. 
@@ -1067,7 +1068,9 @@ def test_external_prefix_cache_metrics(): NUM_MATCHED_NEW_TOKENS = 4 scheduler = create_scheduler( enable_prefix_caching=False, - use_kv_connector=mock_kv(matched_tokens=NUM_MATCHED_NEW_TOKENS, is_async=False), + use_kv_connector=mock_kv( + matched_tokens=NUM_MATCHED_NEW_TOKENS, is_async=is_async + ), ) # --- Prepare simple requests --- @@ -1079,9 +1082,15 @@ def test_external_prefix_cache_metrics(): num_tokens=NUM_TOKENS, max_tokens=MAX_TOKENS, ) + req_ids = [] + req_to_index = {} + for i, request in enumerate(requests): + scheduler.add_request(request) + req_ids.append(request.request_id) + req_to_index[request.request_id] = i - for req in requests: - scheduler.add_request(req) + if is_async: + _step_until_kv_transfer_finished(scheduler, req_ids) # --- Trigger scheduling and simulate model output --- output = scheduler.schedule() diff --git a/vllm/v1/core/sched/scheduler.py b/vllm/v1/core/sched/scheduler.py index 9195b112d8690..4cb5348cbacc3 100644 --- a/vllm/v1/core/sched/scheduler.py +++ b/vllm/v1/core/sched/scheduler.py @@ -470,6 +470,7 @@ class Scheduler(SchedulerInterface): skipped_waiting_requests.prepend_request(request) continue + request.num_external_computed_tokens = ext_tokens num_external_computed_tokens = ext_tokens # Total computed tokens (local + external). @@ -576,9 +577,6 @@ class Scheduler(SchedulerInterface): new_computed_blocks + new_blocks, num_external_computed_tokens, ) - self._update_connector_prefix_cache_stats( - request, num_external_computed_tokens - ) # Request was already popped from self.waiting # unless it was re-added above due to new_blocks being None. 
@@ -590,6 +588,8 @@ class Scheduler(SchedulerInterface): request.status = RequestStatus.WAITING_FOR_REMOTE_KVS continue + self._update_connector_prefix_cache_stats(request) + req_index += 1 self.running.append(request) if self.log_stats: @@ -1380,15 +1380,13 @@ class Scheduler(SchedulerInterface): # KV Connector Related Methods ######################################################################## - def _update_connector_prefix_cache_stats( - self, request: Request, num_external_tokens: int - ) -> None: + def _update_connector_prefix_cache_stats(self, request: Request) -> None: if self.connector_prefix_cache_stats is None: return self.connector_prefix_cache_stats.record( num_tokens=request.num_tokens, - num_hits=num_external_tokens, + num_hits=request.num_external_computed_tokens, preempted=request.num_preemptions > 0, ) @@ -1571,9 +1569,11 @@ class Scheduler(SchedulerInterface): marked_invalid_block = True # Truncate the computed tokens at the first failed block request.num_computed_tokens = idx * self.block_size - total_affected_tokens += ( + num_affected_tokens = ( req_num_computed_tokens - request.num_computed_tokens ) + total_affected_tokens += num_affected_tokens + request.num_external_computed_tokens -= num_affected_tokens if is_affected: if not marked_invalid_block: diff --git a/vllm/v1/request.py b/vllm/v1/request.py index 3d92906fbf4b1..366cdadf5a583 100644 --- a/vllm/v1/request.py +++ b/vllm/v1/request.py @@ -121,6 +121,9 @@ class Request: # The number of requests being preempted by the scheduler self.num_preemptions = 0 + # The number of tokens that have been computed remotely. 
+ self.num_external_computed_tokens = 0 + self.block_hashes: list[BlockHash] = [] self.get_hash_new_full_blocks: Callable[[], list[BlockHash]] | None = None if block_hasher is not None: From e9af6ba62ac99683139ff8d6bac87677fecf0b0c Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Fri, 21 Nov 2025 15:52:28 -0800 Subject: [PATCH 30/83] [Model Runner V2] Optimize Gumbel Sampling Kernel (#29210) Signed-off-by: Woosuk Kwon --- vllm/v1/worker/gpu/sampler.py | 93 ++++++++++++++++------------------- 1 file changed, 43 insertions(+), 50 deletions(-) diff --git a/vllm/v1/worker/gpu/sampler.py b/vllm/v1/worker/gpu/sampler.py index 55f98ca6bb6a3..499e9d3b1538d 100644 --- a/vllm/v1/worker/gpu/sampler.py +++ b/vllm/v1/worker/gpu/sampler.py @@ -3,10 +3,9 @@ from collections.abc import Callable import torch -import triton -import triton.language as tl from vllm.config.model import LogprobsMode +from vllm.triton_utils import tl, triton from vllm.v1.outputs import LogprobsTensors, SamplerOutput from vllm.v1.sample.ops.topk_topp_sampler import apply_top_k_top_p from vllm.v1.worker.gpu.states import SamplingMetadata @@ -78,7 +77,10 @@ class Sampler: @triton.jit def _gumbel_sample_kernel( - sampled_ptr, + local_argmax_ptr, + local_argmax_stride, + local_max_ptr, + local_max_stride, logits_ptr, logits_stride, seeds_ptr, @@ -88,40 +90,21 @@ def _gumbel_sample_kernel( BLOCK_SIZE: tl.constexpr, ): req_idx = tl.program_id(0) + block_idx = tl.program_id(1) + block = block_idx * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE) + mask = block < vocab_size + logits = tl.load( + logits_ptr + req_idx * logits_stride + block, + mask=mask, + other=float("-inf"), + ) + is_greedy = tl.load(is_greedy_ptr + req_idx) - - if is_greedy: - # Greedy sampling. Don't apply gumbel noise. 
- max_val = float("-inf") - max_idx = 0 - for i in range(0, vocab_size, BLOCK_SIZE): - block = i + tl.arange(0, BLOCK_SIZE) - mask = block < vocab_size - logits = tl.load( - logits_ptr + req_idx * logits_stride + block, - mask=mask, - other=float("-inf"), - ) - - idx = tl.argmax(logits, axis=0) - value = tl.max(logits, axis=0) - is_greater = value > max_val - max_val = tl.where(is_greater, value, max_val) - max_idx = tl.where(is_greater, i + idx, max_idx) - tl.store(sampled_ptr + req_idx, max_idx) - return - - # Random sampling. - # Calculate gumbel seed. - seed = tl.load(seeds_ptr + req_idx) - pos = tl.load(pos_ptr + req_idx) - gumbel_seed = tl.randint(seed, pos) - - max_val = float("-inf") - max_idx = 0 - for i in range(0, vocab_size, BLOCK_SIZE): - block = i + tl.arange(0, BLOCK_SIZE) - mask = block < vocab_size + if not is_greedy: + # Calculate the seed for gumbel noise. + seed = tl.load(seeds_ptr + req_idx) + pos = tl.load(pos_ptr + req_idx) + gumbel_seed = tl.randint(seed, pos) # Generate gumbel noise. r = tl.rand(gumbel_seed, block).to(tl.float64) @@ -129,16 +112,13 @@ def _gumbel_sample_kernel( gumbel_noise = gumbel_noise.to(tl.float32) # Apply gumbel noise. - logits = tl.load(logits_ptr + req_idx * logits_stride + block, mask=mask) logits = tl.where(mask, logits + gumbel_noise, float("-inf")) - # Argmax to get the sampled token. 
- idx = tl.argmax(logits, axis=0) - value = tl.max(logits, axis=0) - is_greater = value > max_val - max_val = tl.where(is_greater, value, max_val) - max_idx = tl.where(is_greater, i + idx, max_idx) - tl.store(sampled_ptr + req_idx, max_idx) + idx = tl.argmax(logits, axis=0) + token_id = block_idx * BLOCK_SIZE + idx + value = tl.max(logits, axis=0) + tl.store(local_argmax_ptr + req_idx * local_argmax_stride + block_idx, token_id) + tl.store(local_max_ptr + req_idx * local_max_stride + block_idx, value) def gumbel_sample( @@ -148,23 +128,36 @@ def gumbel_sample( pos: torch.Tensor, # [num_reqs] ) -> torch.Tensor: num_reqs, vocab_size = logits.shape - # NOTE(woosuk): Use int64 for later indexing. - sampled = torch.empty( + BLOCK_SIZE = 1024 + num_blocks = triton.cdiv(vocab_size, BLOCK_SIZE) + local_argmax = torch.empty( num_reqs, + num_blocks, dtype=torch.int64, device=logits.device, ) - _gumbel_sample_kernel[(num_reqs,)]( - sampled, + local_max = torch.empty( + num_reqs, + num_blocks, + dtype=torch.float32, + device=logits.device, + ) + _gumbel_sample_kernel[(num_reqs, num_blocks)]( + local_argmax, + local_argmax.stride(0), + local_max, + local_max.stride(0), logits, logits.stride(0), seed, pos, is_greedy, vocab_size, - num_warps=8, - BLOCK_SIZE=16384, # type: ignore + BLOCK_SIZE=BLOCK_SIZE, ) + # NOTE(woosuk): Use int64 for later indexing. 
+ max_block_idx = local_max.argmax(dim=-1, keepdim=True) + sampled = local_argmax.gather(dim=-1, index=max_block_idx).view(-1) return sampled From 30d64662387aaa74abcee294f27b83043f2d1ae6 Mon Sep 17 00:00:00 2001 From: Lucas Wilkinson Date: Fri, 21 Nov 2025 19:47:05 -0500 Subject: [PATCH 31/83] [BugFix] Fix Eagle `IndexError: list index out of range` for even `num_speculative_tokens` (#29102) Signed-off-by: Lucas Wilkinson --- tests/conftest.py | 8 ++++++++ vllm/config/compilation.py | 16 ++++++++++------ vllm/v1/spec_decode/eagle.py | 33 +++++++++++++++++++-------------- 3 files changed, 37 insertions(+), 20 deletions(-) diff --git a/tests/conftest.py b/tests/conftest.py index b17081352edcf..5afdb225b8923 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -748,6 +748,14 @@ class VllmRunner: # being captured which can trigger edge cases that we don't handle yet. kwargs["compilation_config"] = {"cudagraph_capture_sizes": [4]} + # Make sure we have atleast one cudagraph large enough for a single decode. 
+ if (speculative_config := kwargs.get("speculative_config")) and ( + num_speculative_tokens := speculative_config["num_speculative_tokens"] + ): + kwargs["compilation_config"]["cudagraph_capture_sizes"].append( + num_speculative_tokens + 1 + ) + with init_ctx: self.llm = LLM( model=model_name, diff --git a/vllm/config/compilation.py b/vllm/config/compilation.py index abdae49106120..9b5309598d0e2 100644 --- a/vllm/config/compilation.py +++ b/vllm/config/compilation.py @@ -950,14 +950,18 @@ class CompilationConfig: ) ) + if len(rounded_sizes) == 0 and multiple_of <= self.max_cudagraph_capture_size: + # if one valid but would be round_down use that + rounded_sizes = [multiple_of] + if len(rounded_sizes) == 0: - logger.warning( - "No valid cudagraph sizes after rounding to multiple of " - " num_speculative_tokens + 1 (%d); please adjust num_speculative_tokens" - " or max_cudagraph_capture_size (or cudagraph_capture_sizes)", - multiple_of, + raise ValueError( + f"No valid cudagraph sizes after rounding to multiple of {multiple_of} " + f"(num_speculative_tokens + 1 or tp if sequence parallelism is enabled)" + f" please adjust num_speculative_tokens ({uniform_decode_query_len - 1}" + f") or max_cudagraph_capture_size ({self.max_cudagraph_capture_size})" + f" or cudagraph_capture_sizes ({self.cudagraph_capture_sizes})" ) - return self.max_cudagraph_capture_size = rounded_sizes[-1] self.cudagraph_capture_sizes = rounded_sizes diff --git a/vllm/v1/spec_decode/eagle.py b/vllm/v1/spec_decode/eagle.py index 0df9cd3214e53..3de418f1d13c8 100644 --- a/vllm/v1/spec_decode/eagle.py +++ b/vllm/v1/spec_decode/eagle.py @@ -86,9 +86,9 @@ class EagleProposer: self.use_cuda_graph = False - compilation_config = self.vllm_config.compilation_config - if compilation_config.mode == CompilationMode.VLLM_COMPILE: - cudagraph_mode = compilation_config.cudagraph_mode + self.compilation_config = self.vllm_config.compilation_config + if self.compilation_config.mode == CompilationMode.VLLM_COMPILE: 
+ cudagraph_mode = self.compilation_config.cudagraph_mode if cudagraph_mode != CUDAGraphMode.NONE and not cudagraph_mode.has_mode( CUDAGraphMode.PIECEWISE ): @@ -103,13 +103,6 @@ class EagleProposer: and not self.speculative_config.enforce_eager ) - self.cudagraph_batch_sizes = ( - (sorted(self.vllm_config.compilation_config.cudagraph_capture_sizes)) - if self.use_cuda_graph - else [] - ) - - self.use_cuda_graph = self.use_cuda_graph and bool(self.cudagraph_batch_sizes) # persistent buffers for cuda graph self.input_ids = torch.zeros( self.max_num_tokens, dtype=torch.int32, device=device @@ -276,7 +269,10 @@ class EagleProposer: per_layer_attn_metadata[layer_name] = draft_indexer_metadata cudagraph_runtime_mode = CUDAGraphMode.NONE - if self.use_cuda_graph and num_tokens <= self.cudagraph_batch_sizes[-1]: + if ( + self.use_cuda_graph + and num_tokens <= self.compilation_config.max_cudagraph_capture_size + ): num_input_tokens = self.vllm_config.pad_for_cudagraph(num_tokens) cudagraph_runtime_mode = CUDAGraphMode.PIECEWISE else: @@ -366,7 +362,10 @@ class EagleProposer: # Generate the remaining draft tokens. 
draft_token_ids_list = [draft_token_ids] - if self.use_cuda_graph and batch_size <= self.cudagraph_batch_sizes[-1]: + if ( + self.use_cuda_graph + and batch_size <= self.compilation_config.max_cudagraph_capture_size + ): input_batch_size = self.vllm_config.pad_for_cudagraph(batch_size) cudagraph_runtime_mode = CUDAGraphMode.PIECEWISE else: @@ -777,7 +776,10 @@ class EagleProposer: self.positions[:num_tokens] = tree_positions.view(-1) self.hidden_states[:num_tokens] = tree_hidden_states.view(num_tokens, -1) - if self.use_cuda_graph and num_tokens <= self.cudagraph_batch_sizes[-1]: + if ( + self.use_cuda_graph + and num_tokens <= self.compilation_config.max_cudagraph_capture_size + ): num_input_tokens = self.vllm_config.pad_for_cudagraph(num_tokens) cudagraph_runtime_mode = CUDAGraphMode.PIECEWISE else: @@ -1114,7 +1116,10 @@ class EagleProposer: ) -> None: # Determine if CUDA graphs should be used for this run. cudagraphs_enabled = use_cudagraphs and self.use_cuda_graph - if cudagraphs_enabled and num_tokens <= self.cudagraph_batch_sizes[-1]: + if ( + cudagraphs_enabled + and num_tokens <= self.compilation_config.max_cudagraph_capture_size + ): num_tokens = self.vllm_config.pad_for_cudagraph(num_tokens) with set_forward_context( From d5dbdbfcb2cfc2e4d82a1e2605576f1e4e440ca7 Mon Sep 17 00:00:00 2001 From: Angela Yi Date: Fri, 21 Nov 2025 17:10:27 -0800 Subject: [PATCH 32/83] [docs] Fix cudagraph mode config (#29170) Signed-off-by: angelayi --- docs/design/debug_vllm_compile.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/design/debug_vllm_compile.md b/docs/design/debug_vllm_compile.md index 3b454e851b54e..8912eb58f8ac7 100644 --- a/docs/design/debug_vllm_compile.md +++ b/docs/design/debug_vllm_compile.md @@ -9,7 +9,7 @@ TL;DR: |----------|----------|-------------| | --enforce-eager | enforce_eager=True | Turn off torch.compile and CUDAGraphs | | -O.mode=0 | mode=CompilationMode.NONE | Turn off torch.compile only | -| -O.cudagraph_mode=NONE | 
compilation_config=CompilationConfig(mode=CompilationMode.NONE) | Turn off CUDAGraphs only | +| -O.cudagraph_mode=NONE | compilation_config=CompilationConfig(cudagraph_mode=CUDAGraphMode.NONE) | Turn off CUDAGraphs only | | -O.backend=eager | compilation_config=CompilationConfig(backend='eager') | Turn off TorchInductor | ## vLLM-torch.compile overview From 9a3101b2ba6821488f4b7a9b93124e479edc4d3e Mon Sep 17 00:00:00 2001 From: Charlie Fu Date: Fri, 21 Nov 2025 19:11:02 -0600 Subject: [PATCH 33/83] [Rocm][CI] Fix DeekSeek V2-Lite Accuracy CI (#29135) Signed-off-by: charlifu --- .../deepseek_v2_lite_ep_eplb.sh | 12 +++++++++++- .../qwen30b_a3b_fp8_block_ep.sh | 11 ++++++++++- 2 files changed, 21 insertions(+), 2 deletions(-) diff --git a/.buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_eplb.sh b/.buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_eplb.sh index 5302f524a0ae4..8106f50f18f66 100644 --- a/.buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_eplb.sh +++ b/.buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_eplb.sh @@ -17,7 +17,17 @@ wait_for_server() { } MODEL="deepseek-ai/DeepSeek-V2-lite" -BACKENDS=("deepep_high_throughput" "deepep_low_latency") + +# Set BACKENDS based on platform +if command -v rocm-smi &> /dev/null || [[ -d /opt/rocm ]] || [[ -n "${ROCM_PATH:-}" ]]; then + # ROCm platform + BACKENDS=("allgather_reducescatter") + # Disable MOE padding for ROCm since it is causing eplb to fail + export VLLM_ROCM_MOE_PADDING=0 +else + # Non-ROCm platform (CUDA/other) + BACKENDS=("deepep_high_throughput" "deepep_low_latency") +fi cleanup() { if [[ -n "${SERVER_PID:-}" ]] && kill -0 "${SERVER_PID}" 2>/dev/null; then diff --git a/.buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep.sh b/.buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep.sh index a5135299297e2..0d06f53a183d0 100644 --- a/.buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep.sh 
+++ b/.buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep.sh @@ -17,7 +17,16 @@ wait_for_server() { } MODEL="QWen/Qwen3-30B-A3B-FP8" -BACKENDS=("deepep_high_throughput" "deepep_low_latency") +# Set BACKENDS based on platform +if command -v rocm-smi &> /dev/null || [[ -d /opt/rocm ]] || [[ -n "${ROCM_PATH:-}" ]]; then + # ROCm platform + BACKENDS=("allgather_reducescatter") + # Disable MOE padding for ROCm since it is causing eplb to fail + export VLLM_ROCM_MOE_PADDING=0 +else + # Non-ROCm platform (CUDA/other) + BACKENDS=("deepep_high_throughput" "deepep_low_latency") +fi cleanup() { if [[ -n "${SERVER_PID:-}" ]] && kill -0 "${SERVER_PID}" 2>/dev/null; then From 1d34eb11e057f6b42af36bdb13852d2701f04245 Mon Sep 17 00:00:00 2001 From: Wentao Ye <44945378+yewentao256@users.noreply.github.com> Date: Fri, 21 Nov 2025 20:14:49 -0500 Subject: [PATCH 34/83] [CI] Bug: Fix triton import issue (#29202) Signed-off-by: yewentao256 --- vllm/v1/worker/gpu/block_table.py | 3 +-- vllm/v1/worker/gpu/input_batch.py | 3 +-- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/vllm/v1/worker/gpu/block_table.py b/vllm/v1/worker/gpu/block_table.py index ff24e88ede2c0..b31e9b179d26c 100644 --- a/vllm/v1/worker/gpu/block_table.py +++ b/vllm/v1/worker/gpu/block_table.py @@ -3,10 +3,9 @@ from collections.abc import Iterable import torch -import triton -import triton.language as tl from vllm.attention.backends.utils import PAD_SLOT_ID +from vllm.triton_utils import tl, triton from vllm.utils.math_utils import cdiv from vllm.v1.utils import CpuGpuBuffer diff --git a/vllm/v1/worker/gpu/input_batch.py b/vllm/v1/worker/gpu/input_batch.py index 89f375649146f..8313b32d29797 100644 --- a/vllm/v1/worker/gpu/input_batch.py +++ b/vllm/v1/worker/gpu/input_batch.py @@ -7,9 +7,8 @@ import numba import numba.types as types import numpy as np import torch -import triton -import triton.language as tl +from vllm.triton_utils import tl, triton from vllm.utils import random_uuid from 
vllm.utils.math_utils import cdiv from vllm.v1.utils import CpuGpuBuffer From d045e22dfeee61ece1a20ac4aec8cf483a42d406 Mon Sep 17 00:00:00 2001 From: Lukas Geiger Date: Sat, 22 Nov 2025 01:30:55 +0000 Subject: [PATCH 35/83] [Model][Qwen3VL] Tune Triton w8a8 block fp8 kernel for L40s (#29217) Signed-off-by: Lukas Geiger --- ...,dtype=fp8_w8a8,block_shape=[128,128].json | 146 ++++++++++++++++++ ...,dtype=fp8_w8a8,block_shape=[128,128].json | 146 ++++++++++++++++++ ...,dtype=fp8_w8a8,block_shape=[128,128].json | 146 ++++++++++++++++++ ...,dtype=fp8_w8a8,block_shape=[128,128].json | 146 ++++++++++++++++++ 4 files changed, 584 insertions(+) create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=10240,K=5120,device_name=NVIDIA_L40S,dtype=fp8_w8a8,block_shape=[128,128].json create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=5120,K=25600,device_name=NVIDIA_L40S,dtype=fp8_w8a8,block_shape=[128,128].json create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=5120,K=8192,device_name=NVIDIA_L40S,dtype=fp8_w8a8,block_shape=[128,128].json create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=51200,K=5120,device_name=NVIDIA_L40S,dtype=fp8_w8a8,block_shape=[128,128].json diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=10240,K=5120,device_name=NVIDIA_L40S,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=10240,K=5120,device_name=NVIDIA_L40S,dtype=fp8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000000000..6b2c1dc1312bf --- /dev/null +++ b/vllm/model_executor/layers/quantization/utils/configs/N=10240,K=5120,device_name=NVIDIA_L40S,dtype=fp8_w8a8,block_shape=[128,128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 5 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + 
"GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 4 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 3 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 3 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "24": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "32": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "48": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 3 + }, + "64": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "96": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 3 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 4 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 
1, + "num_warps": 8, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 4 + } +} diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=5120,K=25600,device_name=NVIDIA_L40S,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=5120,K=25600,device_name=NVIDIA_L40S,dtype=fp8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000000000..b0eaf02a541ad --- /dev/null +++ b/vllm/model_executor/layers/quantization/utils/configs/N=5120,K=25600,device_name=NVIDIA_L40S,dtype=fp8_w8a8,block_shape=[128,128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 5 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 5 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "48": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 4 + }, + "64": { + "BLOCK_SIZE_M": 32, + 
"BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "96": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 4 + }, + "128": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 4 + }, + "1536": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 4 + }, + "4096": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 3 + } +} diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=5120,K=8192,device_name=NVIDIA_L40S,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=5120,K=8192,device_name=NVIDIA_L40S,dtype=fp8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000000000..4cd357d5086ca --- /dev/null +++ b/vllm/model_executor/layers/quantization/utils/configs/N=5120,K=8192,device_name=NVIDIA_L40S,dtype=fp8_w8a8,block_shape=[128,128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + 
"num_warps": 8, + "num_stages": 4 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 5 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 5 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "24": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "32": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "64": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "96": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "128": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 2 + }, + "512": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 
3 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 4 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + } +} diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=51200,K=5120,device_name=NVIDIA_L40S,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=51200,K=5120,device_name=NVIDIA_L40S,dtype=fp8_w8a8,block_shape=[128,128].json new file mode 100644 index 0000000000000..ca2179ddf3d2f --- /dev/null +++ b/vllm/model_executor/layers/quantization/utils/configs/N=51200,K=5120,device_name=NVIDIA_L40S,dtype=fp8_w8a8,block_shape=[128,128].json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 4 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 3 + }, + "4": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 5 + }, + "8": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 5 + }, + "16": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 5 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 5 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + 
"BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 5 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "96": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 5 + }, + "256": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 4 + }, + "512": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 3 + }, + "1536": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 4 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 3 + } +} From ed8e6843cc7167113bb9a436818f2e242c841b9f Mon Sep 17 00:00:00 2001 From: Ryan Rock Date: Fri, 21 Nov 2025 19:31:22 -0600 Subject: [PATCH 36/83] [CI/Build] Add terratorch for AMD (#29205) Signed-off-by: Ryan Rock --- requirements/rocm-test.txt | 3 +++ 1 file changed, 3 insertions(+) diff --git a/requirements/rocm-test.txt b/requirements/rocm-test.txt index 432e11977872d..eabb5065bfceb 100644 --- a/requirements/rocm-test.txt +++ b/requirements/rocm-test.txt @@ -39,3 +39,6 @@ 
mteb[bm25s]>=1.38.11, <2 # Required for eval tests lm-eval[api] @ git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d + +# Plugins test +terratorch @ git+https://github.com/IBM/terratorch.git@07184fcf91a1324f831ff521dd238d97fe350e3e From 5c8f2adf50e0cf2c5acf908ac796089cc45abdcf Mon Sep 17 00:00:00 2001 From: Jie Luo <65482183+Livinfly@users.noreply.github.com> Date: Sat, 22 Nov 2025 09:34:28 +0800 Subject: [PATCH 37/83] [Bugfix] Fix block size in block_table with PCP (#29094) Signed-off-by: Livinfly --- vllm/v1/worker/block_table.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/vllm/v1/worker/block_table.py b/vllm/v1/worker/block_table.py index 76e17f3797a1a..37ec0fb97e06b 100644 --- a/vllm/v1/worker/block_table.py +++ b/vllm/v1/worker/block_table.py @@ -84,7 +84,7 @@ class BlockTable: self.pcp_world_size = get_pcp_group().world_size self.pcp_rank = get_pcp_group().rank_in_group except AssertionError: - # DCP might not be initialized in testing + # PCP might not be initialized in testing self.pcp_world_size = 1 self.pcp_rank = 0 try: @@ -268,6 +268,11 @@ class MultiGroupBlockTable: # (max_model_len//dcp_world_size) tokens in kvcache, # so the block_size which used for calc max_num_blocks_per_req # must be multiplied by dcp_world_size. 
+ try: + pcp_world_size = get_pcp_group().world_size + except AssertionError: + # PCP might not be initialized in testing + pcp_world_size = 1 try: dcp_world_size = get_dcp_group().world_size except AssertionError: @@ -280,12 +285,14 @@ class MultiGroupBlockTable: f"must match block_sizes length ({len(block_sizes)})" ) + total_cp_world_size = dcp_world_size * pcp_world_size + self.block_tables = [ BlockTable( block_size, max_num_reqs, max( - cdiv(max_model_len, block_size * dcp_world_size), + cdiv(max_model_len, block_size * total_cp_world_size), 1 + num_speculative_tokens, ), max_num_batched_tokens, From 1ef9c9e29480f95340e124cc7d81a2876a60516d Mon Sep 17 00:00:00 2001 From: qli88 Date: Fri, 21 Nov 2025 19:36:19 -0600 Subject: [PATCH 38/83] [CI/Build] Disable test_gptoss_tp.py in 'LoRA TP Test' group for ROCm platform (#29204) Signed-off-by: qli88 --- .buildkite/test-amd.yaml | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/.buildkite/test-amd.yaml b/.buildkite/test-amd.yaml index 4e2ff5c5a6bd5..4ee81fdabf665 100644 --- a/.buildkite/test-amd.yaml +++ b/.buildkite/test-amd.yaml @@ -1319,7 +1319,10 @@ steps: - pytest -v -s -x lora/test_llama_tp.py - pytest -v -s -x lora/test_llm_with_multi_loras.py - pytest -v -s -x lora/test_olmoe_tp.py - - pytest -v -s -x lora/test_gptoss_tp.py + + # Disabled for now because MXFP4 backend on non-cuda platform + # doesn't support LoRA yet + #- pytest -v -s -x lora/test_gptoss_tp.py - label: Weight Loading Multiple GPU Test # 33min From 052950e5b3c48b1189df62f833ed9cff4aabb0bd Mon Sep 17 00:00:00 2001 From: FlintyLemming Date: Sat, 22 Nov 2025 09:37:51 +0800 Subject: [PATCH 39/83] Add fused MoE config for H200 E160 N192 fp8 (#29182) Signed-off-by: FlintyLemming --- ...evice_name=NVIDIA_H200,dtype=fp8_w8a8.json | 147 ++++++++++++++++++ 1 file changed, 147 insertions(+) create mode 100644 vllm/model_executor/layers/fused_moe/configs/E=160,N=192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json diff --git 
a/vllm/model_executor/layers/fused_moe/configs/E=160,N=192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json b/vllm/model_executor/layers/fused_moe/configs/E=160,N=192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json new file mode 100644 index 0000000000000..54fe5374cb95d --- /dev/null +++ b/vllm/model_executor/layers/fused_moe/configs/E=160,N=192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json @@ -0,0 +1,147 @@ +{ + "triton_version": "3.5.0", + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 3 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "96": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "128": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "256": { + 
"BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 2 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 2 + }, + "1536": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 3 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 2 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + } +} From 6f403501a085f4917e49e1714bdf44d2aabd06f9 Mon Sep 17 00:00:00 2001 From: rasmith Date: Fri, 21 Nov 2025 20:13:18 -0600 Subject: [PATCH 40/83] [CI/Build][AMD] Enable Entrypoints Integration Test (Pooling) to run without error on ROCm (#29212) Signed-off-by: Randall Smith Co-authored-by: Randall Smith --- tests/entrypoints/pooling/correctness/test_mteb_embed.py | 6 ++++++ tests/entrypoints/pooling/correctness/test_mteb_score.py | 6 ++++++ tests/entrypoints/pooling/llm/test_embedding.py | 6 ++++++ tests/entrypoints/pooling/llm/test_encode.py | 6 ++++++ tests/entrypoints/pooling/llm/test_score.py | 6 ++++++ tests/entrypoints/pooling/openai/test_embedding.py | 6 ++++++ .../entrypoints/pooling/openai/test_embedding_dimensions.py | 6 ++++++ .../entrypoints/pooling/openai/test_embedding_long_text.py | 6 ++++++ tests/entrypoints/pooling/openai/test_rerank.py | 6 ++++++ tests/entrypoints/pooling/openai/test_score.py | 6 ++++++ tests/entrypoints/pooling/openai/test_truncation.py | 6 ++++++ 11 files 
changed, 66 insertions(+) diff --git a/tests/entrypoints/pooling/correctness/test_mteb_embed.py b/tests/entrypoints/pooling/correctness/test_mteb_embed.py index 7f16638e51e2c..64673534fd32a 100644 --- a/tests/entrypoints/pooling/correctness/test_mteb_embed.py +++ b/tests/entrypoints/pooling/correctness/test_mteb_embed.py @@ -11,6 +11,12 @@ from tests.models.language.pooling_mteb_test.mteb_utils import ( run_mteb_embed_task, ) from tests.utils import RemoteOpenAIServer +from vllm.platforms import current_platform + +if current_platform.is_rocm(): + pytest.skip( + "Encoder self-attention is not implemented on ROCm.", allow_module_level=True + ) os.environ["VLLM_LOGGING_LEVEL"] = "WARNING" diff --git a/tests/entrypoints/pooling/correctness/test_mteb_score.py b/tests/entrypoints/pooling/correctness/test_mteb_score.py index 1afe68b189db8..81ad0097187b0 100644 --- a/tests/entrypoints/pooling/correctness/test_mteb_score.py +++ b/tests/entrypoints/pooling/correctness/test_mteb_score.py @@ -13,6 +13,12 @@ from tests.models.language.pooling_mteb_test.mteb_utils import ( run_mteb_rerank, ) from tests.utils import RemoteOpenAIServer +from vllm.platforms import current_platform + +if current_platform.is_rocm(): + pytest.skip( + "Encoder self-attention is not implemented on ROCm.", allow_module_level=True + ) os.environ["VLLM_LOGGING_LEVEL"] = "WARNING" diff --git a/tests/entrypoints/pooling/llm/test_embedding.py b/tests/entrypoints/pooling/llm/test_embedding.py index 5455b5f91fc09..f5eab4c29ae18 100644 --- a/tests/entrypoints/pooling/llm/test_embedding.py +++ b/tests/entrypoints/pooling/llm/test_embedding.py @@ -9,6 +9,12 @@ import torch.nn.functional as F from vllm import LLM, PoolingParams from vllm.distributed import cleanup_dist_env_and_memory +from vllm.platforms import current_platform + +if current_platform.is_rocm(): + pytest.skip( + "Encoder self-attention is not implemented on ROCm.", allow_module_level=True + ) MODEL_NAME = "intfloat/multilingual-e5-small" diff --git 
a/tests/entrypoints/pooling/llm/test_encode.py b/tests/entrypoints/pooling/llm/test_encode.py index ca85d2758fce4..f86ecef2e4744 100644 --- a/tests/entrypoints/pooling/llm/test_encode.py +++ b/tests/entrypoints/pooling/llm/test_encode.py @@ -7,6 +7,12 @@ import pytest from vllm import LLM, PoolingParams from vllm.distributed import cleanup_dist_env_and_memory +from vllm.platforms import current_platform + +if current_platform.is_rocm(): + pytest.skip( + "Encoder self-attention is not implemented on ROCm.", allow_module_level=True + ) MODEL_NAME = "intfloat/multilingual-e5-small" diff --git a/tests/entrypoints/pooling/llm/test_score.py b/tests/entrypoints/pooling/llm/test_score.py index b69c6a47c1913..ce36d61cb8476 100644 --- a/tests/entrypoints/pooling/llm/test_score.py +++ b/tests/entrypoints/pooling/llm/test_score.py @@ -9,6 +9,12 @@ import torch from tests.models.utils import softmax from vllm import LLM, PoolingParams from vllm.distributed import cleanup_dist_env_and_memory +from vllm.platforms import current_platform + +if current_platform.is_rocm(): + pytest.skip( + "Encoder self-attention is not implemented on ROCm.", allow_module_level=True + ) MODEL_NAME = "tomaarsen/Qwen3-Reranker-0.6B-seq-cls" diff --git a/tests/entrypoints/pooling/openai/test_embedding.py b/tests/entrypoints/pooling/openai/test_embedding.py index e971b23e8f1a0..0c88d800e2f99 100644 --- a/tests/entrypoints/pooling/openai/test_embedding.py +++ b/tests/entrypoints/pooling/openai/test_embedding.py @@ -19,6 +19,7 @@ from vllm.entrypoints.openai.protocol import ( EmbeddingResponse, PoolingResponse, ) +from vllm.platforms import current_platform from vllm.transformers_utils.tokenizer import get_tokenizer from vllm.utils.serial_utils import ( EMBED_DTYPE_TO_TORCH_DTYPE, @@ -28,6 +29,11 @@ from vllm.utils.serial_utils import ( decode_pooling_output, ) +if current_platform.is_rocm(): + pytest.skip( + "Encoder self-attention is not implemented on ROCm.", allow_module_level=True + ) + MODEL_NAME = 
"intfloat/multilingual-e5-small" DUMMY_CHAT_TEMPLATE = """{% for message in messages %}{{message['role'] + ': ' + message['content'] + '\\n'}}{% endfor %}""" # noqa: E501 DTYPE = "bfloat16" diff --git a/tests/entrypoints/pooling/openai/test_embedding_dimensions.py b/tests/entrypoints/pooling/openai/test_embedding_dimensions.py index ba9fb64262772..8018dac2d3ffe 100644 --- a/tests/entrypoints/pooling/openai/test_embedding_dimensions.py +++ b/tests/entrypoints/pooling/openai/test_embedding_dimensions.py @@ -12,6 +12,12 @@ from tests.models.language.pooling.embed_utils import run_embedding_correctness_ from tests.models.utils import EmbedModelInfo from tests.utils import RemoteOpenAIServer from vllm.entrypoints.openai.protocol import EmbeddingResponse +from vllm.platforms import current_platform + +if current_platform.is_rocm(): + pytest.skip( + "Encoder self-attention is not implemented on ROCm.", allow_module_level=True + ) MODELS = [ EmbedModelInfo("intfloat/multilingual-e5-small", is_matryoshka=False), diff --git a/tests/entrypoints/pooling/openai/test_embedding_long_text.py b/tests/entrypoints/pooling/openai/test_embedding_long_text.py index f977c81a9084e..a9ade09dad0b5 100644 --- a/tests/entrypoints/pooling/openai/test_embedding_long_text.py +++ b/tests/entrypoints/pooling/openai/test_embedding_long_text.py @@ -16,6 +16,12 @@ import pytest_asyncio from tests.utils import RemoteOpenAIServer from vllm.entrypoints.openai.protocol import EmbeddingResponse +from vllm.platforms import current_platform + +if current_platform.is_rocm(): + pytest.skip( + "Encoder self-attention is not implemented on ROCm.", allow_module_level=True + ) def _generate_random_text(word_count: int) -> str: diff --git a/tests/entrypoints/pooling/openai/test_rerank.py b/tests/entrypoints/pooling/openai/test_rerank.py index 1d85190c12a19..5a772e22a7414 100644 --- a/tests/entrypoints/pooling/openai/test_rerank.py +++ b/tests/entrypoints/pooling/openai/test_rerank.py @@ -8,6 +8,12 @@ import 
torch.nn.functional as F from tests.utils import RemoteOpenAIServer from vllm.entrypoints.openai.protocol import PoolingResponse, RerankResponse +from vllm.platforms import current_platform + +if current_platform.is_rocm(): + pytest.skip( + "Encoder self-attention is not implemented on ROCm.", allow_module_level=True + ) MODEL_NAME = "BAAI/bge-reranker-base" DTYPE = "bfloat16" diff --git a/tests/entrypoints/pooling/openai/test_score.py b/tests/entrypoints/pooling/openai/test_score.py index b8f796d47efaa..ceff9d0181825 100644 --- a/tests/entrypoints/pooling/openai/test_score.py +++ b/tests/entrypoints/pooling/openai/test_score.py @@ -10,6 +10,12 @@ from torch import tensor from tests.utils import RemoteOpenAIServer from vllm.entrypoints.openai.protocol import ScoreResponse +from vllm.platforms import current_platform + +if current_platform.is_rocm(): + pytest.skip( + "Encoder self-attention is not implemented on ROCm.", allow_module_level=True + ) MODELS = [ {"name": "BAAI/bge-reranker-v2-m3", "is_cross_encoder": True}, diff --git a/tests/entrypoints/pooling/openai/test_truncation.py b/tests/entrypoints/pooling/openai/test_truncation.py index 6889628dc9145..0d2d385840402 100644 --- a/tests/entrypoints/pooling/openai/test_truncation.py +++ b/tests/entrypoints/pooling/openai/test_truncation.py @@ -7,6 +7,12 @@ import pytest import pytest_asyncio from tests.utils import RemoteOpenAIServer +from vllm.platforms import current_platform + +if current_platform.is_rocm(): + pytest.skip( + "Encoder self-attention is not implemented on ROCm.", allow_module_level=True + ) MODEL_NAME = "sentence-transformers/all-MiniLM-L12-v2" max_model_len = 128 From 77e1c035d039ec546bb01b4915eed6b5735156c2 Mon Sep 17 00:00:00 2001 From: Yihua Cheng Date: Fri, 21 Nov 2025 19:18:00 -0800 Subject: [PATCH 41/83] [chore][LMCache connector] Remove useless logs from lmcache connector (#29069) Signed-off-by: ApostaC --- .../v1/lmcache_integration/multi_process_adapter.py | 1 - 
.../kv_transfer/kv_connector/v1/lmcache_mp_connector.py | 3 --- 2 files changed, 4 deletions(-) diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/lmcache_integration/multi_process_adapter.py b/vllm/distributed/kv_transfer/kv_connector/v1/lmcache_integration/multi_process_adapter.py index ab2eeed9f6b8a..6acfb73997f25 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/lmcache_integration/multi_process_adapter.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/lmcache_integration/multi_process_adapter.py @@ -310,7 +310,6 @@ class LMCacheMPWorkerAdapter: request_id, result, ) - logger.info("Retrieve request for request_id=%s finished", request_id) # Remove the finished requests from the tracking dicts for request_id in finished_stores: diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/lmcache_mp_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/lmcache_mp_connector.py index 22ddabbf1e352..d1d3e475cc889 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/lmcache_mp_connector.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/lmcache_mp_connector.py @@ -469,9 +469,6 @@ class LMCacheMPConnector(KVConnectorBase_V1): ops.append(meta.op) if len(request_ids) > 0: - logger.info( - "HERE! 
SUBMITTING THE BATCHED RETRIEVE REQUESTS %s", request_ids - ) self.worker_adapter.batched_submit_retrieve_requests( request_ids, ops, event ) From fd65015a14be5f2ce663cd959dff6970285c54b4 Mon Sep 17 00:00:00 2001 From: rasmith Date: Fri, 21 Nov 2025 21:34:33 -0600 Subject: [PATCH 42/83] [CI/Build] Only use supported types and features on ROCm in MoE kernel tests (#29149) Signed-off-by: Randall Smith Co-authored-by: Randall Smith --- tests/kernels/moe/test_batched_moe.py | 9 +++++++-- tests/kernels/moe/test_block_fp8.py | 5 +++++ tests/kernels/moe/test_gpt_oss_triton_kernels.py | 5 +++++ tests/kernels/moe/test_modular_kernel_combinations.py | 6 ++++++ tests/kernels/moe/test_moe_permute_unpermute.py | 6 ++++++ tests/kernels/moe/test_silu_mul_fp8_quant_deep_gemm.py | 6 ++++++ tests/kernels/moe/test_triton_moe_ptpc_fp8.py | 6 ++++++ 7 files changed, 41 insertions(+), 2 deletions(-) diff --git a/tests/kernels/moe/test_batched_moe.py b/tests/kernels/moe/test_batched_moe.py index 2285709fa7d60..dab1207d78031 100644 --- a/tests/kernels/moe/test_batched_moe.py +++ b/tests/kernels/moe/test_batched_moe.py @@ -39,6 +39,11 @@ MNK_FACTORS = [ NUM_EXPERTS = [8, 64] TOP_KS = [1, 2, 6] +DTYPES = [torch.bfloat16] + +if not current_platform.is_fp8_fnuz(): + DTYPES.append(torch.float8_e4m3fn) + vllm_config = VllmConfig() @@ -96,7 +101,7 @@ class BatchedMMTensors: @pytest.mark.parametrize("max_tokens_per_expert", [32, 224, 512]) @pytest.mark.parametrize("K", [128, 1024]) @pytest.mark.parametrize("N", [128, 1024]) -@pytest.mark.parametrize("dtype", [torch.float8_e4m3fn, torch.bfloat16]) +@pytest.mark.parametrize("dtype", DTYPES) @pytest.mark.parametrize("block_shape", [None, [128, 128]]) @pytest.mark.parametrize("per_act_token_quant", [False, True]) def test_batched_mm( @@ -229,7 +234,7 @@ def test_batched_mm( @pytest.mark.parametrize(("m", "n", "k"), MNK_FACTORS) @pytest.mark.parametrize("e", NUM_EXPERTS) @pytest.mark.parametrize("topk", TOP_KS) -@pytest.mark.parametrize("dtype", 
[torch.float8_e4m3fn, torch.bfloat16]) +@pytest.mark.parametrize("dtype", DTYPES) @pytest.mark.parametrize("per_act_token_quant", [False, True]) @pytest.mark.parametrize("block_shape", [None, [128, 128]]) @pytest.mark.parametrize("input_scales", [False]) diff --git a/tests/kernels/moe/test_block_fp8.py b/tests/kernels/moe/test_block_fp8.py index 88db4b3e537c2..b0ff1e64e3219 100644 --- a/tests/kernels/moe/test_block_fp8.py +++ b/tests/kernels/moe/test_block_fp8.py @@ -31,6 +31,11 @@ dg_available = has_deep_gemm() if current_platform.get_device_capability() < (9, 0): pytest.skip("FP8 Triton requires CUDA 9.0 or higher", allow_module_level=True) +if current_platform.is_fp8_fnuz(): + pytest.skip( + "Tests in this file require float8_e4m3fn and platform does not support", + allow_module_level=True, + ) vllm_config = VllmConfig() diff --git a/tests/kernels/moe/test_gpt_oss_triton_kernels.py b/tests/kernels/moe/test_gpt_oss_triton_kernels.py index af33fd4e3fc3b..98e80ec029777 100644 --- a/tests/kernels/moe/test_gpt_oss_triton_kernels.py +++ b/tests/kernels/moe/test_gpt_oss_triton_kernels.py @@ -270,6 +270,11 @@ class Case: @pytest.mark.parametrize("num_token", [2]) @pytest.mark.parametrize("tp", [1, 2, 4, 8]) def test_equiv(num_token, a_dtype, w_dtype, tp): + from triton_kernels.tensor_details import layout + + if not hasattr(layout, "make_default_matmul_mxfp4_w_layout"): + pytest.skip("make_default_matmul_mxfp4_w_layout not available") + M = num_token E = ModelConfig.num_experts K = ModelConfig.hidden_size diff --git a/tests/kernels/moe/test_modular_kernel_combinations.py b/tests/kernels/moe/test_modular_kernel_combinations.py index e3b8621b452fa..2a30ef2355529 100644 --- a/tests/kernels/moe/test_modular_kernel_combinations.py +++ b/tests/kernels/moe/test_modular_kernel_combinations.py @@ -46,6 +46,12 @@ meets_multi_gpu_requirements = pytest.mark.skipif( reason="Requires deep_ep or deep_gemm or pplx or flashinfer packages", ) +if current_platform.is_fp8_fnuz(): + 
pytest.skip( + "Tests in this file require float8_e4m3fn and platform does not support", + allow_module_level=True, + ) + def format_result(verbose, msg, ex=None): if ex is not None: diff --git a/tests/kernels/moe/test_moe_permute_unpermute.py b/tests/kernels/moe/test_moe_permute_unpermute.py index ba1f657b3ecda..12dd322dccc52 100644 --- a/tests/kernels/moe/test_moe_permute_unpermute.py +++ b/tests/kernels/moe/test_moe_permute_unpermute.py @@ -23,6 +23,12 @@ TOP_KS = [2, 6, 8] EP_SIZE = [1, 4, 16] current_platform.seed_everything(0) +if current_platform.is_rocm(): + pytest.skip( + "moe_permute_unpermute_supported is not defined for ROCm", + allow_module_level=True, + ) + def torch_permute( hidden_states: torch.Tensor, diff --git a/tests/kernels/moe/test_silu_mul_fp8_quant_deep_gemm.py b/tests/kernels/moe/test_silu_mul_fp8_quant_deep_gemm.py index d6b78dd2c2323..b220205759e2d 100644 --- a/tests/kernels/moe/test_silu_mul_fp8_quant_deep_gemm.py +++ b/tests/kernels/moe/test_silu_mul_fp8_quant_deep_gemm.py @@ -14,6 +14,12 @@ from vllm.platforms import current_platform from vllm.utils.deep_gemm import DeepGemmQuantScaleFMT, has_deep_gemm from vllm.utils.math_utils import cdiv, round_up +if current_platform.is_fp8_fnuz(): + pytest.skip( + "Tests in this file require float8_e4m3fn and platform does not support", + allow_module_level=True, + ) + fp8_dtype = torch.float8_e4m3fn CASES = [ diff --git a/tests/kernels/moe/test_triton_moe_ptpc_fp8.py b/tests/kernels/moe/test_triton_moe_ptpc_fp8.py index 7a467e160b784..0ab025dceca40 100644 --- a/tests/kernels/moe/test_triton_moe_ptpc_fp8.py +++ b/tests/kernels/moe/test_triton_moe_ptpc_fp8.py @@ -19,6 +19,12 @@ if current_platform.get_device_capability() < (9, 0): vllm_config = VllmConfig() +if current_platform.is_fp8_fnuz(): + pytest.skip( + "Tests in this file require float8_e4m3fn and platform does not support", + allow_module_level=True, + ) + def native_w8a8_per_token_matmul(A, B, As, Bs, output_dtype=torch.float16): """Matrix 
multiplication function that supports per-token input From 933f67ecd81231ebfa5e2434d3ae3819b6c28068 Mon Sep 17 00:00:00 2001 From: Yanan Cao Date: Fri, 21 Nov 2025 19:59:07 -0800 Subject: [PATCH 43/83] [Bugfix]Fix a conditional to not check zero value (#28754) Signed-off-by: Yanan Cao --- vllm/compilation/caching.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/vllm/compilation/caching.py b/vllm/compilation/caching.py index 63b7ad7279e37..6297d9f995aa4 100644 --- a/vllm/compilation/caching.py +++ b/vllm/compilation/caching.py @@ -116,7 +116,8 @@ class VllmSerializableFunction(SerializableCallable): the AOT compiled path. """ compile_inputs = [ - inp or example_inputs[i] for i, inp in enumerate(fn.example_inputs) + inp if inp is not None else example_inputs[i] + for i, inp in enumerate(fn.example_inputs) ] with tracing(TracingContext(fake_mode)): fn.optimized_call = vllm_backend( From 1489902b531bb649f8110c94572b2d8b753a72cc Mon Sep 17 00:00:00 2001 From: Jee Jee Li Date: Sat, 22 Nov 2025 12:01:30 +0800 Subject: [PATCH 44/83] [LoRA] Cleanup FusedMoEWithLoRA (#29187) Signed-off-by: Jee Jee Li --- vllm/lora/layers/fused_moe.py | 193 ++++++++++++------------ vllm/lora/punica_wrapper/punica_base.py | 4 +- vllm/lora/punica_wrapper/punica_gpu.py | 4 +- 3 files changed, 98 insertions(+), 103 deletions(-) diff --git a/vllm/lora/layers/fused_moe.py b/vllm/lora/layers/fused_moe.py index adf30855cafc3..5aeaca8de5e53 100644 --- a/vllm/lora/layers/fused_moe.py +++ b/vllm/lora/layers/fused_moe.py @@ -42,6 +42,7 @@ class FusedMoEWithLoRA(BaseLayerWithLoRA): self.tp_size = get_tensor_model_parallel_world_size() self.tp_rank = get_tensor_model_parallel_rank() self.device = base_layer.w2_weight.device + self.w13_slices = 2 self._inject_lora_into_fused_moe() def _normalize_keys(self, config: dict[str, int | None]) -> dict[str, int | None]: @@ -60,8 +61,8 @@ class FusedMoEWithLoRA(BaseLayerWithLoRA): def _get_lora_moe_configs( self, op_prefix: str, - 
lora_a_stacked: torch.Tensor, - lora_b_stacked: torch.Tensor, + num_loras: int, + rank: int, num_slices: int, M: int, layer: FusedMoE, @@ -69,23 +70,25 @@ class FusedMoEWithLoRA(BaseLayerWithLoRA): config_dtype: str, ): if envs.VLLM_TUNED_CONFIG_FOLDER: + hidden_size = layer.hidden_size + intermediate_size = layer.intermediate_size_per_partition shrink_config = get_lora_op_configs( op_type=f"fused_moe_lora_{op_prefix}_shrink", - max_loras=lora_a_stacked.shape[0], + max_loras=num_loras, batch=M, - hidden_size=lora_a_stacked.shape[-1], - rank=lora_a_stacked.shape[-2], + hidden_size=hidden_size, + rank=rank, num_slices=num_slices, - moe_intermediate_size=lora_b_stacked.shape[-2], + moe_intermediate_size=intermediate_size, ) expand_config = get_lora_op_configs( op_type=f"fused_moe_lora_{op_prefix}_expand", - max_loras=lora_a_stacked.shape[0], + max_loras=num_loras, batch=M, - hidden_size=lora_a_stacked.shape[-1], - rank=lora_a_stacked.shape[-2], + hidden_size=hidden_size, # lora_a_stacked.shape[-1], + rank=rank, num_slices=num_slices, - moe_intermediate_size=lora_b_stacked.shape[-2], + moe_intermediate_size=intermediate_size, # lora_b_stacked.shape[-2], ) else: # fall back to the default config get_config_func = functools.partial( @@ -152,12 +155,12 @@ class FusedMoEWithLoRA(BaseLayerWithLoRA): CHUNK_SIZE = envs.VLLM_FUSED_MOE_CHUNK_SIZE num_tokens = hidden_states.size(0) M = min(num_tokens, CHUNK_SIZE) - + max_lora_rank = self.w13_lora_a_stacked[0].shape[-2] shrink_config, expand_config = self._get_lora_moe_configs( op_prefix="w13", - lora_a_stacked=self.w1_lora_a_stacked, - lora_b_stacked=self.w1_lora_b_stacked, - num_slices=2, + num_loras=self.max_loras, + rank=max_lora_rank, + num_slices=self.w13_slices, M=M, layer=layer, top_k=top_k, @@ -165,7 +168,6 @@ class FusedMoEWithLoRA(BaseLayerWithLoRA): ) # get the block size of m from customized config or default config - max_loras = self.w1_lora_a_stacked.shape[0] ( sorted_token_ids_lora, expert_ids_lora, @@ -175,7 
+177,7 @@ class FusedMoEWithLoRA(BaseLayerWithLoRA): num_tokens, shrink_config["BLOCK_SIZE_M"], self.base_layer.local_num_experts, - max_loras, + self.max_loras, self.adapter_enabled, expert_map, ) @@ -186,17 +188,15 @@ class FusedMoEWithLoRA(BaseLayerWithLoRA): num_tokens_post_padded_lora ) - w13_lora_a_stacked = [self.w1_lora_a_stacked, self.w3_lora_a_stacked] - w13_lora_b_stacked = [self.w1_lora_b_stacked, self.w3_lora_b_stacked] - max_lora_rank = self.w1_lora_a_stacked.shape[-2] - expert_ids_lora = expert_ids_lora.view(max_loras, -1) - sorted_token_ids_lora = sorted_token_ids_lora.view(max_loras, -1) + expert_ids_lora = expert_ids_lora.view(self.max_loras, -1) + sorted_token_ids_lora = sorted_token_ids_lora.view(self.max_loras, -1) + # self.punica_wrapper.add_lora_fused_moe( input.view(-1, top_k, input.shape[-1]), hidden_states, - w13_lora_a_stacked, - w13_lora_b_stacked, + self.w13_lora_a_stacked, + self.w13_lora_b_stacked, topk_weights, sorted_token_ids_lora, expert_ids_lora, @@ -230,11 +230,11 @@ class FusedMoEWithLoRA(BaseLayerWithLoRA): CHUNK_SIZE = envs.VLLM_FUSED_MOE_CHUNK_SIZE num_tokens = hidden_states.size(0) M = min(num_tokens, CHUNK_SIZE) - + max_lora_rank = self.w2_lora_a_stacked.shape[-2] shrink_config, expand_config = self._get_lora_moe_configs( op_prefix="w2", - lora_a_stacked=self.w2_lora_a_stacked, - lora_b_stacked=self.w2_lora_b_stacked, + num_loras=self.max_loras, + rank=max_lora_rank, num_slices=1, M=M, layer=layer, @@ -247,20 +247,19 @@ class FusedMoEWithLoRA(BaseLayerWithLoRA): num_tokens_post_padded_lora = moe_state_dict[ "num_tokens_post_padded_lora" ] - max_loras = self.w1_lora_a_stacked.shape[0] - expert_ids_lora = expert_ids_lora.view(max_loras, -1) - sorted_token_ids_lora = sorted_token_ids_lora.view(max_loras, -1) + + expert_ids_lora = expert_ids_lora.view(self.max_loras, -1) + sorted_token_ids_lora = sorted_token_ids_lora.view(self.max_loras, -1) intermediate_cache2 = moe_state_dict["intermediate_cache2"] intermediate_cache3 = 
args[0] - max_lora_rank = self.w2_lora_a_stacked.shape[-2] shard_size_w2 = divide(self.base_layer.hidden_size, self.tp_size) self.punica_wrapper.add_lora_fused_moe( intermediate_cache3, intermediate_cache2, - [self.w2_lora_a_stacked], - [self.w2_lora_b_stacked], + (self.w2_lora_a_stacked,), + (self.w2_lora_b_stacked,), topk_weights, sorted_token_ids_lora, expert_ids_lora, @@ -289,7 +288,6 @@ class FusedMoEWithLoRA(BaseLayerWithLoRA): fused_experts.moe_sum = moe_sum_decorator( self.base_layer, fused_experts.moe_sum ) - self.base_layer.quant_method = FusedMoEModularMethod( self.base_layer.quant_method, m_fused_moe_fn ) @@ -301,33 +299,42 @@ class FusedMoEWithLoRA(BaseLayerWithLoRA): model_config: PretrainedConfig | None = None, ) -> None: """Initializes lora matrices.""" + assert self.w13_slices == 2 + self.max_loras = lora_config.max_loras self.fully_sharded = lora_config.fully_sharded_loras self.adapter_enabled = torch.tensor( [0] * (max_loras + 1), dtype=torch.int, device=self.device ) - self.w1_lora_a_stacked = torch.zeros( - ( - max_loras, - self.base_layer.local_num_experts, - lora_config.max_lora_rank - if not self.fully_sharded - else divide(lora_config.max_lora_rank, self.tp_size), - self.base_layer.hidden_size, - ), - dtype=lora_config.lora_dtype, - device=self.device, + self.w13_lora_a_stacked = tuple( + torch.zeros( + ( + max_loras, + self.base_layer.local_num_experts, + lora_config.max_lora_rank + if not self.fully_sharded + else divide(lora_config.max_lora_rank, self.tp_size), + self.base_layer.hidden_size, + ), + dtype=lora_config.lora_dtype, + device=self.device, + ) + for _ in range(self.w13_slices) ) - self.w1_lora_b_stacked = torch.zeros( - ( - max_loras, - self.base_layer.local_num_experts, - self.base_layer.intermediate_size_per_partition, - lora_config.max_lora_rank, - ), - dtype=lora_config.lora_dtype, - device=self.device, + + self.w13_lora_b_stacked = tuple( + torch.zeros( + ( + max_loras, + self.base_layer.local_num_experts, + 
self.base_layer.intermediate_size_per_partition, + lora_config.max_lora_rank, + ), + dtype=lora_config.lora_dtype, + device=self.device, + ) + for _ in range(self.w13_slices) ) self.w2_lora_a_stacked = torch.zeros( @@ -353,29 +360,6 @@ class FusedMoEWithLoRA(BaseLayerWithLoRA): device=self.device, ) - self.w3_lora_a_stacked = torch.zeros( - ( - max_loras, - self.base_layer.local_num_experts, - lora_config.max_lora_rank - if not self.fully_sharded - else divide(lora_config.max_lora_rank, self.tp_size), - self.base_layer.hidden_size, - ), - dtype=lora_config.lora_dtype, - device=self.device, - ) - self.w3_lora_b_stacked = torch.zeros( - ( - max_loras, - self.base_layer.local_num_experts, - self.base_layer.intermediate_size_per_partition, - lora_config.max_lora_rank, - ), - dtype=lora_config.lora_dtype, - device=self.device, - ) - # They will be used by 'LoRALayerWeights.create_dummy_lora_weights' # to create a dummy LoRA weights. self.lora_a_stacked = [] @@ -383,20 +367,28 @@ class FusedMoEWithLoRA(BaseLayerWithLoRA): for lora_id in range(max_loras): for experts_id in range(self.base_layer.local_num_experts): # gate_proj,down_proj,up_proj - self.lora_a_stacked.append(self.w1_lora_a_stacked[lora_id][experts_id]) + self.lora_a_stacked.append( + self.w13_lora_a_stacked[0][lora_id][experts_id] + ) self.lora_a_stacked.append(self.w2_lora_a_stacked[lora_id][experts_id]) - self.lora_a_stacked.append(self.w3_lora_a_stacked[lora_id][experts_id]) + self.lora_a_stacked.append( + self.w13_lora_a_stacked[1][lora_id][experts_id] + ) - self.lora_b_stacked.append(self.w1_lora_b_stacked[lora_id][experts_id]) + self.lora_b_stacked.append( + self.w13_lora_b_stacked[0][lora_id][experts_id] + ) self.lora_b_stacked.append(self.w2_lora_b_stacked[lora_id][experts_id]) - self.lora_b_stacked.append(self.w3_lora_b_stacked[lora_id][experts_id]) + self.lora_b_stacked.append( + self.w13_lora_b_stacked[1][lora_id][experts_id] + ) def reset_lora(self, index: int): """Resets the lora weights at 
index back to 0.""" - self.w1_lora_a_stacked[index] = 0 - self.w1_lora_b_stacked[index] = 0 - self.w3_lora_a_stacked[index] = 0 - self.w3_lora_b_stacked[index] = 0 + for pos in range(self.w13_slices): + self.w13_lora_a_stacked[pos][index] = 0 + self.w13_lora_b_stacked[pos][index] = 0 + self.w2_lora_a_stacked[index] = 0 self.w2_lora_b_stacked[index] = 0 self.adapter_enabled[index] = 0 @@ -434,7 +426,7 @@ class FusedMoEWithLoRA(BaseLayerWithLoRA): if self.fully_sharded: # Based on S-LoRA, we slice W1 and W3 A along the rank dim, # and W2 B along the hidden_size dim. - w13_shard_size = self.w1_lora_a_stacked[index, eid].shape[0] + w13_shard_size = self.w13_lora_a_stacked[0][index, eid].shape[0] w13_start_idx = self.tp_rank * w13_shard_size w13_end_idx = (self.tp_rank + 1) * w13_shard_size w1_lora_a = w1_lora_a[w13_start_idx:w13_end_idx, :] @@ -444,29 +436,32 @@ class FusedMoEWithLoRA(BaseLayerWithLoRA): w2_start_idx = self.tp_rank * w2_shard_size w2_end_idx = (self.tp_rank + 1) * w2_shard_size w2_lora_b = w2_lora_b[w2_start_idx:w2_end_idx, :] - - self.w1_lora_a_stacked[ + # w1 lora_a + self.w13_lora_a_stacked[0][ index, eid, : w1_lora_a.shape[0], : w1_lora_a.shape[1] ].copy_(w1_lora_a, non_blocking=True) - - self.w3_lora_a_stacked[ + # w3 lora_a + self.w13_lora_a_stacked[1][ index, eid, : w3_lora_a.shape[0], : w3_lora_a.shape[1] ].copy_(w3_lora_a, non_blocking=True) + # w1 lora_b + self.w13_lora_b_stacked[0][ + index, eid, : w1_lora_b.shape[0], : w1_lora_b.shape[1] + ].copy_(w1_lora_b, non_blocking=True) + # w3 lora_b + self.w13_lora_b_stacked[1][ + index, eid, : w3_lora_b.shape[0], : w3_lora_b.shape[1] + ].copy_(w3_lora_b, non_blocking=True) + + self.w2_lora_a_stacked[ + index, eid, : w2_lora_a.shape[0], : w2_lora_a.shape[1] + ].copy_(w2_lora_a, non_blocking=True) + self.w2_lora_b_stacked[ index, eid, : w2_lora_b.shape[0], : w2_lora_b.shape[1] ].copy_(w2_lora_b, non_blocking=True) - self.w1_lora_b_stacked[ - index, eid, : w1_lora_b.shape[0], : w1_lora_b.shape[1] - 
].copy_(w1_lora_b, non_blocking=True) - self.w3_lora_b_stacked[ - index, eid, : w3_lora_b.shape[0], : w3_lora_b.shape[1] - ].copy_(w3_lora_b, non_blocking=True) - self.w2_lora_a_stacked[ - index, eid, : w2_lora_a.shape[0], : w2_lora_a.shape[1] - ].copy_(w2_lora_a, non_blocking=True) - @classmethod def can_replace_layer( cls, diff --git a/vllm/lora/punica_wrapper/punica_base.py b/vllm/lora/punica_wrapper/punica_base.py index 7c0fc8167711d..ce38751e4b6a7 100644 --- a/vllm/lora/punica_wrapper/punica_base.py +++ b/vllm/lora/punica_wrapper/punica_base.py @@ -470,8 +470,8 @@ class PunicaWrapperBase(PunicaWrapperABC): self, y: torch.Tensor, x: torch.Tensor, - lora_a_stacked: list[torch.Tensor], - lora_b_stacked: list[torch.Tensor], + lora_a_stacked: tuple[torch.Tensor, ...], + lora_b_stacked: tuple[torch.Tensor, ...], topk_weights: torch.Tensor, sorted_token_ids: torch.Tensor, expert_ids: torch.Tensor, diff --git a/vllm/lora/punica_wrapper/punica_gpu.py b/vllm/lora/punica_wrapper/punica_gpu.py index 52138ef0cc3b0..ef4b4ab7c3497 100644 --- a/vllm/lora/punica_wrapper/punica_gpu.py +++ b/vllm/lora/punica_wrapper/punica_gpu.py @@ -360,8 +360,8 @@ class PunicaWrapperGPU(PunicaWrapperBase): self, y: torch.Tensor, x: torch.Tensor, - lora_a_stacked: list[torch.Tensor], - lora_b_stacked: list[torch.Tensor], + lora_a_stacked: tuple[torch.Tensor, ...], + lora_b_stacked: tuple[torch.Tensor, ...], topk_weights: torch.Tensor, sorted_token_ids: torch.Tensor, expert_ids: torch.Tensor, From e9056056fbacecbac4318bd0323745fdd7fe55b6 Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Fri, 21 Nov 2025 20:21:35 -0800 Subject: [PATCH 45/83] [Model Runner V2] Limit cudagraph size to max decode batch size (#29221) Signed-off-by: Woosuk Kwon --- vllm/v1/worker/gpu/cudagraph_utils.py | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/vllm/v1/worker/gpu/cudagraph_utils.py b/vllm/v1/worker/gpu/cudagraph_utils.py index 31a706475243c..763bd61834625 100644 --- 
a/vllm/v1/worker/gpu/cudagraph_utils.py +++ b/vllm/v1/worker/gpu/cudagraph_utils.py @@ -27,9 +27,11 @@ class CudaGraphManager: device: torch.device, ): self.vllm_config = vllm_config + self.scheduler_config = vllm_config.scheduler_config self.device = device self.max_model_len = vllm_config.model_config.max_model_len + self.max_num_reqs = self.scheduler_config.max_num_seqs self.dp_size = vllm_config.parallel_config.data_parallel_size self.compilation_config = vllm_config.compilation_config assert self.compilation_config is not None @@ -39,9 +41,11 @@ class CudaGraphManager: else: self.cudagraph_mode = self.compilation_config.cudagraph_mode if self.compilation_config.cudagraph_capture_sizes is not None: - self.cudagraph_sizes = sorted( - self.compilation_config.cudagraph_capture_sizes - ) + cudagraph_sizes = sorted(self.compilation_config.cudagraph_capture_sizes) + # Limit the cudagraph sizes to the max decode batch size. + self.cudagraph_sizes = [ + x for x in cudagraph_sizes if x <= self.max_num_reqs + ] else: self.cudagraph_sizes = [] self.padded_sizes = self._init_padded_sizes() @@ -54,9 +58,10 @@ class CudaGraphManager: if not self.cudagraph_mode.has_full_cudagraphs(): # Full cuda graphs are not used. 
return {} + if not self.cudagraph_sizes: + return {} padded_sizes: dict[int, int] = {} - assert len(self.cudagraph_sizes) > 0 for i in range(1, self.cudagraph_sizes[-1] + 1): for x in self.cudagraph_sizes: if i <= x: From 742e9ff6b39ad0433bac0d7417a41bbdc74854a3 Mon Sep 17 00:00:00 2001 From: Andrew Xia Date: Fri, 21 Nov 2025 23:42:11 -0800 Subject: [PATCH 46/83] [responsesAPI] parse reasoning item input (#28248) Signed-off-by: Andrew Xia Co-authored-by: Andrew Xia Co-authored-by: Cyrus Leung --- .../online_serving/openai_responses_client.py | 44 ++++++++++++ .../openai/test_response_api_simple.py | 71 +++++++++++++++++++ .../openai/test_response_api_with_harmony.py | 27 ++++++- tests/entrypoints/test_responses_utils.py | 58 +++++++++++++++ vllm/entrypoints/responses_utils.py | 13 ++++ 5 files changed, 212 insertions(+), 1 deletion(-) create mode 100644 examples/online_serving/openai_responses_client.py create mode 100644 tests/entrypoints/openai/test_response_api_simple.py diff --git a/examples/online_serving/openai_responses_client.py b/examples/online_serving/openai_responses_client.py new file mode 100644 index 0000000000000..b4eb24671507a --- /dev/null +++ b/examples/online_serving/openai_responses_client.py @@ -0,0 +1,44 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +""" +Set up this example by starting a vLLM OpenAI-compatible server. 
+Reasoning models can be used through the Responses API as seen here +https://platform.openai.com/docs/api-reference/responses +For example: +vllm serve Qwen/Qwen3-8B --reasoning-parser qwen3 + +""" + +from openai import OpenAI + +input_messages = [{"role": "user", "content": "What model are you?"}] + + +def main(): + base_url = "http://localhost:8000/v1" + client = OpenAI(base_url=base_url, api_key="empty") + model = "Qwen/Qwen3-8B" # get_first_model(client) + response = client.responses.create( + model=model, + input=input_messages, + ) + + for message in response.output: + if message.type == "reasoning": + # append reasoning message + input_messages.append(message) + + response_2 = client.responses.create( + model=model, + input=input_messages, + ) + print(response_2.output_text) + # I am Qwen, a large language model developed by Alibaba Cloud. + # I am designed to assist with a wide range of tasks, including + # answering questions, creating content, coding, and engaging in + # conversations. I can help with various topics and provide + # information or support in multiple languages. How can I assist you today? 
+ + +if __name__ == "__main__": + main() diff --git a/tests/entrypoints/openai/test_response_api_simple.py b/tests/entrypoints/openai/test_response_api_simple.py new file mode 100644 index 0000000000000..425b8199a0fd0 --- /dev/null +++ b/tests/entrypoints/openai/test_response_api_simple.py @@ -0,0 +1,71 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + + +import pytest +import pytest_asyncio +from openai import OpenAI + +from ...utils import RemoteOpenAIServer + +MODEL_NAME = "Qwen/Qwen3-8B" + + +@pytest.fixture(scope="module") +def server(): + args = ["--reasoning-parser", "qwen3", "--max_model_len", "5000"] + env_dict = dict( + VLLM_ENABLE_RESPONSES_API_STORE="1", + # uncomment for tool calling + # PYTHON_EXECUTION_BACKEND="dangerously_use_uv", + ) + + with RemoteOpenAIServer(MODEL_NAME, args, env_dict=env_dict) as remote_server: + yield remote_server + + +@pytest_asyncio.fixture +async def client(server): + async with server.get_async_client() as async_client: + yield async_client + + +@pytest.mark.asyncio +@pytest.mark.parametrize("model_name", [MODEL_NAME]) +async def test_basic(client: OpenAI, model_name: str): + response = await client.responses.create( + model=model_name, + input="What is 13 * 24?", + ) + assert response is not None + print("response: ", response) + assert response.status == "completed" + + +@pytest.mark.asyncio +@pytest.mark.parametrize("model_name", [MODEL_NAME]) +async def test_reasoning_item(client: OpenAI, model_name: str): + response = await client.responses.create( + model=model_name, + input=[ + {"type": "message", "content": "Hello.", "role": "user"}, + { + "type": "reasoning", + "id": "lol", + "content": [ + { + "type": "reasoning_text", + "text": "We need to respond: greeting.", + } + ], + "summary": [], + }, + ], + temperature=0.0, + ) + assert response is not None + assert response.status == "completed" + # make sure we get a reasoning and text output + assert 
response.output[0].type == "reasoning" + assert response.output[1].type == "message" + assert type(response.output[1].content[0].text) is str diff --git a/tests/entrypoints/openai/test_response_api_with_harmony.py b/tests/entrypoints/openai/test_response_api_with_harmony.py index dea8d2d28f61a..6251e1776c30a 100644 --- a/tests/entrypoints/openai/test_response_api_with_harmony.py +++ b/tests/entrypoints/openai/test_response_api_with_harmony.py @@ -35,7 +35,7 @@ GET_WEATHER_SCHEMA = { @pytest.fixture(scope="module") def server(): - args = ["--enforce-eager", "--tool-server", "demo"] + args = ["--enforce-eager", "--tool-server", "demo", "--max_model_len", "5000"] env_dict = dict( VLLM_ENABLE_RESPONSES_API_STORE="1", PYTHON_EXECUTION_BACKEND="dangerously_use_uv", @@ -550,6 +550,31 @@ def call_function(name, args): raise ValueError(f"Unknown function: {name}") +@pytest.mark.asyncio +@pytest.mark.parametrize("model_name", [MODEL_NAME]) +async def test_reasoning_item(client: OpenAI, model_name: str): + response = await client.responses.create( + model=model_name, + input=[ + {"type": "message", "content": "Hello.", "role": "user"}, + { + "type": "reasoning", + "id": "lol", + "content": [ + { + "type": "reasoning_text", + "text": "We need to respond: greeting.", + } + ], + "summary": [], + }, + ], + temperature=0.0, + ) + assert response is not None + assert response.status == "completed" + + @pytest.mark.asyncio @pytest.mark.parametrize("model_name", [MODEL_NAME]) async def test_function_calling(client: OpenAI, model_name: str): diff --git a/tests/entrypoints/test_responses_utils.py b/tests/entrypoints/test_responses_utils.py index 48bf06088bc05..91c818374e3fd 100644 --- a/tests/entrypoints/test_responses_utils.py +++ b/tests/entrypoints/test_responses_utils.py @@ -1,7 +1,15 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import pytest +from openai.types.responses.response_reasoning_item import ( + Content, + 
ResponseReasoningItem, + Summary, +) + from vllm.entrypoints.responses_utils import ( + construct_chat_message_with_tool_call, convert_tool_responses_to_completions_format, ) @@ -28,3 +36,53 @@ class TestResponsesUtils: result = convert_tool_responses_to_completions_format(input_tool) assert result == {"type": "function", "function": input_tool} + + def test_construct_chat_message_with_tool_call(self): + item = ResponseReasoningItem( + id="lol", + summary=[], + type="reasoning", + content=[ + Content( + text="Leroy Jenkins", + type="reasoning_text", + ) + ], + encrypted_content=None, + status=None, + ) + formatted_item = construct_chat_message_with_tool_call(item) + assert formatted_item["role"] == "assistant" + assert formatted_item["reasoning"] == "Leroy Jenkins" + + item = ResponseReasoningItem( + id="lol", + summary=[ + Summary( + text='Hmm, the user has just started with a simple "Hello,"', + type="summary_text", + ) + ], + type="reasoning", + content=None, + encrypted_content=None, + status=None, + ) + + formatted_item = construct_chat_message_with_tool_call(item) + assert formatted_item["role"] == "assistant" + assert ( + formatted_item["reasoning"] + == 'Hmm, the user has just started with a simple "Hello,"' + ) + + item = ResponseReasoningItem( + id="lol", + summary=[], + type="reasoning", + content=None, + encrypted_content="TOP_SECRET_MESSAGE", + status=None, + ) + with pytest.raises(ValueError): + construct_chat_message_with_tool_call(item) diff --git a/vllm/entrypoints/responses_utils.py b/vllm/entrypoints/responses_utils.py index d966f58804b67..912e8a690573d 100644 --- a/vllm/entrypoints/responses_utils.py +++ b/vllm/entrypoints/responses_utils.py @@ -10,6 +10,7 @@ from openai.types.chat.chat_completion_message_tool_call_param import ( Function as FunctionCallTool, ) from openai.types.responses import ResponseFunctionToolCall +from openai.types.responses.response_reasoning_item import ResponseReasoningItem from openai.types.responses.tool import Tool 
from vllm import envs @@ -37,6 +38,18 @@ def construct_chat_message_with_tool_call( ) ], ) + elif isinstance(item, ResponseReasoningItem): + reasoning_content = "" + if item.encrypted_content: + raise ValueError("Encrypted content is not supported.") + if len(item.summary) == 1: + reasoning_content = item.summary[0].text + elif item.content and len(item.content) == 1: + reasoning_content = item.content[0].text + return { + "role": "assistant", + "reasoning": reasoning_content, + } elif item.get("type") == "function_call_output": # Append the function call output as a tool message. return ChatCompletionToolMessageParam( From ea38474ac564efdc09762ad066139b75cf68f924 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mads=20Kildeg=C3=A5rd?= Date: Sat, 22 Nov 2025 10:58:22 +0100 Subject: [PATCH 47/83] [Frontend][Responses API] Multi-turn (with type: "output_text") support for non-harmony requests (#29175) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Mads Kildegård --- vllm/entrypoints/chat_utils.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/vllm/entrypoints/chat_utils.py b/vllm/entrypoints/chat_utils.py index aaf8a3ae9d2dd..bf80856c1bbfc 100644 --- a/vllm/entrypoints/chat_utils.py +++ b/vllm/entrypoints/chat_utils.py @@ -1283,6 +1283,7 @@ MM_PARSER_MAP: dict[ "text": lambda part: _TextParser(part).get("text", None), "thinking": lambda part: _ThinkParser(part).get("thinking", None), "input_text": lambda part: _TextParser(part).get("text", None), + "output_text": lambda part: _TextParser(part).get("text", None), "input_image": lambda part: _ResponsesInputImageParser(part).get("image_url", None), "image_url": lambda part: _ImageParser(part).get("image_url", {}).get("url", None), "image_embeds": lambda part: _ImageEmbedsParser(part).get("image_embeds", None), @@ -1463,7 +1464,7 @@ def _parse_chat_message_content_part( ) return None - if part_type in ("text", "input_text", "refusal", "thinking"): + 
if part_type in ("text", "input_text", "output_text", "refusal", "thinking"): str_content = cast(str, content) if wrap_dicts: return {"type": "text", "text": str_content} From 988ee66b0d54ec08a24135f7a947affe69e9dd52 Mon Sep 17 00:00:00 2001 From: jinghanhu Date: Sat, 22 Nov 2025 18:07:50 +0800 Subject: [PATCH 48/83] Handle triton kernel import exception (#29062) --- vllm/model_executor/layers/fused_moe/config.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/vllm/model_executor/layers/fused_moe/config.py b/vllm/model_executor/layers/fused_moe/config.py index 21eb4d590a7d1..1826fafa8c4f5 100644 --- a/vllm/model_executor/layers/fused_moe/config.py +++ b/vllm/model_executor/layers/fused_moe/config.py @@ -28,10 +28,11 @@ logger = init_logger(__name__) if has_triton_kernels(): try: from triton_kernels.matmul_ogs import PrecisionConfig - except ImportError: + except (ImportError, AttributeError) as e: logger.error( "Failed to import Triton kernels. Please make sure your triton " - "version is compatible." + "version is compatible. Error: %s", + e, ) From e6309acdba3a26e803d1ea7f66804f4ad30c2b9a Mon Sep 17 00:00:00 2001 From: "Jane (Yuan) Xu" <31798555+janeyx99@users.noreply.github.com> Date: Sat, 22 Nov 2025 05:35:32 -0500 Subject: [PATCH 49/83] Simplify `from_blob` usage in `get_cuda_view_from_cpu_tensor` (#29027) Signed-off-by: Jane Xu --- csrc/cuda_view.cu | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/csrc/cuda_view.cu b/csrc/cuda_view.cu index 938bd4ab7fc62..9853fc942bab7 100644 --- a/csrc/cuda_view.cu +++ b/csrc/cuda_view.cu @@ -22,15 +22,10 @@ torch::Tensor get_cuda_view_from_cpu_tensor(torch::Tensor& cpu_tensor) { auto strides = cpu_tensor.strides(); auto options = cpu_tensor.options().device(torch::kCUDA); - // from_blob signature: from_blob(void *data, IntArrayRef sizes, ..., Deleter, - // const TensorOptions &) Provide a no-op deleter. The CPU tensor holds the - // memory, so we don't free it here. 
- auto deleter = [](void*) { - // no-op, since the memory is owned by the original CPU tensor - }; - + // use default no-op deleter, since the memory is owned by the original CPU + // tensor torch::Tensor cuda_tensor = - torch::from_blob(device_ptr, sizes, strides, deleter, options); + torch::from_blob(device_ptr, sizes, strides, options); TORCH_CHECK(cuda_tensor.device().is_cuda(), "Resulting tensor is not on CUDA device"); From a4fdf2405c737843d1e95e406959f3e2e6bcf899 Mon Sep 17 00:00:00 2001 From: rasmith Date: Sat, 22 Nov 2025 04:59:39 -0600 Subject: [PATCH 50/83] [CI/Build] Skip tests that require libcudart in test_lmcache_integration.py (#29228) Signed-off-by: Randall Smith Co-authored-by: Randall Smith --- .../kv_connector/unit/test_lmcache_integration.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/tests/v1/kv_connector/unit/test_lmcache_integration.py b/tests/v1/kv_connector/unit/test_lmcache_integration.py index 11507d7cd4e7b..33418edc325af 100644 --- a/tests/v1/kv_connector/unit/test_lmcache_integration.py +++ b/tests/v1/kv_connector/unit/test_lmcache_integration.py @@ -9,6 +9,12 @@ # Assumption vs. 
Correctness Tests: # these unit tests do *not* test correctness of LMCache-side or vLLM-side logic # it is to ensure that assumptions LMCache makes about vLLM's interface are stable + +import pytest + +from vllm.platforms import current_platform + + def assumes(obj, attr, is_callable=False, is_instance_of=None): import inspect from dataclasses import is_dataclass @@ -48,6 +54,9 @@ def assumes(obj, attr, is_callable=False, is_instance_of=None): assert isinstance(attr_value, is_instance_of), assumption_msg +@pytest.mark.skipif( + current_platform.is_rocm(), reason="Requires libcudart.so, not available on ROCm" +) def test_multimodal_interface(): # protect against interface changes from vllm.multimodal.inputs import PlaceholderRange @@ -72,6 +81,9 @@ def test_multimodal_interface(): assert token_ids.tolist() == [0, 0, 0, 0, 4, 4369, 4369, 4369, 4369, 9] +@pytest.mark.skipif( + current_platform.is_rocm(), reason="Requires libcudart.so, not available on ROCm" +) def test_config_interface(): # protect against interface changes from vllm.config import VllmConfig @@ -146,6 +158,9 @@ def test_config_interface(): ) +@pytest.mark.skipif( + current_platform.is_rocm(), reason="Requires libcudart.so, not available on ROCm" +) def test_request_interface(): # protect against interface changes from types import NoneType From 8e22da1d7fcd43efd8fec18c0c0bf6a8e7cf61a6 Mon Sep 17 00:00:00 2001 From: rasmith Date: Sat, 22 Nov 2025 05:00:54 -0600 Subject: [PATCH 51/83] [CI/Build Don't add FLASHINFER backend in test_cpu_offloading.py (#29229) Signed-off-by: Randall Smith Co-authored-by: Randall Smith --- tests/v1/kv_offload/test_cpu_offloading.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/tests/v1/kv_offload/test_cpu_offloading.py b/tests/v1/kv_offload/test_cpu_offloading.py index 3ee41c40859dc..406d4c0b4c1fd 100644 --- a/tests/v1/kv_offload/test_cpu_offloading.py +++ b/tests/v1/kv_offload/test_cpu_offloading.py @@ -12,10 +12,14 @@ from tqdm import tqdm from 
vllm import LLM, SamplingParams, TokensPrompt from vllm.config import KVEventsConfig, KVTransferConfig from vllm.distributed.kv_events import BlockStored, KVEventBatch +from vllm.platforms import current_platform from vllm.utils.system_utils import set_env_var CPU_BLOCK_SIZES = [48] -ATTN_BACKENDS = ["FLASH_ATTN", "FLASHINFER"] +ATTN_BACKENDS = ["FLASH_ATTN"] + +if current_platform.is_cuda(): + ATTN_BACKENDS.append("FLASHINFER") class MockSubscriber: From 5a4802588ed8f7918468986fce130c19ee721674 Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Sat, 22 Nov 2025 19:34:15 +0800 Subject: [PATCH 52/83] [Misc] Further clean up chunked prefill and prefix caching init (#29186) Signed-off-by: DarkLight1337 --- tests/engine/test_arg_utils.py | 2 +- tests/v1/core/test_scheduler.py | 19 +++++++------------ tests/v1/core/utils.py | 11 +++-------- vllm/config/cache.py | 4 ++-- vllm/engine/arg_utils.py | 24 +++++++++++++++++++----- vllm/v1/core/sched/scheduler.py | 2 +- 6 files changed, 33 insertions(+), 29 deletions(-) diff --git a/tests/engine/test_arg_utils.py b/tests/engine/test_arg_utils.py index 472b1487ef440..10827e3b4b9cd 100644 --- a/tests/engine/test_arg_utils.py +++ b/tests/engine/test_arg_utils.py @@ -279,7 +279,7 @@ def test_prefix_cache_default(): args = parser.parse_args([]) engine_args = EngineArgs.from_cli_args(args=args) - assert not engine_args.enable_prefix_caching, "prefix caching defaults to off." + assert engine_args.enable_prefix_caching, "prefix caching should default to on." # with flag to turn it on. 
args = parser.parse_args(["--enable-prefix-caching"]) diff --git a/tests/v1/core/test_scheduler.py b/tests/v1/core/test_scheduler.py index d9a69a77c9797..09acde6e08faa 100644 --- a/tests/v1/core/test_scheduler.py +++ b/tests/v1/core/test_scheduler.py @@ -76,11 +76,11 @@ def test_get_num_unfinished_requests(): @pytest.mark.parametrize( "enable_prefix_caching, prompt_logprobs", [ - (None, None), + (False, None), (True, 5), ], ) -def test_schedule(enable_prefix_caching: bool | None, prompt_logprobs: int | None): +def test_schedule(enable_prefix_caching: bool, prompt_logprobs: int | None): """Test scheduling. Two cases: default APC/no prompt logprobs; APC=True + prompt logprobs """ @@ -582,12 +582,12 @@ def test_check_stop_min_tokens(): @pytest.mark.parametrize( "enable_prefix_caching, prompt_logprobs", [ - (None, None), + (False, None), (True, 5), ], ) def test_schedule_concurrent_batches( - enable_prefix_caching: bool | None, prompt_logprobs: int | None + enable_prefix_caching: bool, prompt_logprobs: int | None ): scheduler = create_scheduler( max_num_batched_tokens=1024, @@ -1425,7 +1425,7 @@ def create_scheduler_with_priority( model: str = "facebook/opt-125m", max_num_seqs: int = 16, max_num_batched_tokens: int = 8192, - enable_prefix_caching: bool | None = None, + enable_prefix_caching: bool = False, long_prefill_token_threshold: int = 0, disable_chunked_mm_input: bool = False, use_kv_connector: bool = False, @@ -1444,7 +1444,7 @@ def create_scheduler_with_priority( max_num_batch_tokens: max num tokens to batch enable_prefix_caching: optionally force APC config (True/False) or use default - (None) + (False) Returns: {class}`Scheduler` instance with priority scheduling @@ -1467,17 +1467,12 @@ def create_scheduler_with_priority( seed=42, ) # Cache config, optionally force APC - kwargs_cache = ( - {} - if enable_prefix_caching is None - else {"enable_prefix_caching": enable_prefix_caching} - ) cache_config = CacheConfig( block_size=block_size, 
gpu_memory_utilization=0.9, swap_space=0, cache_dtype="auto", - **kwargs_cache, + enable_prefix_caching=enable_prefix_caching, ) kv_transfer_config = ( KVTransferConfig( diff --git a/tests/v1/core/utils.py b/tests/v1/core/utils.py index 65511c17473b2..6830f68736453 100644 --- a/tests/v1/core/utils.py +++ b/tests/v1/core/utils.py @@ -42,7 +42,7 @@ def create_scheduler( model: str = "facebook/opt-125m", max_num_seqs: int = 16, max_num_batched_tokens: int = 8192, - enable_prefix_caching: bool | None = None, + enable_prefix_caching: bool = False, long_prefill_token_threshold: int = 0, disable_chunked_mm_input: bool = False, use_kv_connector: None | bool | MockKVConfig = None, @@ -63,7 +63,7 @@ def create_scheduler( max_num_batch_tokens: max num tokens to batch enable_prefix_caching: optionally force APC config (True/False) or use default - (None) + (False) Returns: {class}`Scheduler` instance @@ -87,17 +87,12 @@ def create_scheduler( skip_tokenizer_init=skip_tokenizer_init, ) # Cache config, optionally force APC - kwargs_cache = ( - {} - if enable_prefix_caching is None - else {"enable_prefix_caching": enable_prefix_caching} - ) cache_config = CacheConfig( block_size=block_size, gpu_memory_utilization=0.9, swap_space=0, cache_dtype="auto", - **kwargs_cache, + enable_prefix_caching=enable_prefix_caching, ) kv_transfer_config = None if isinstance(use_kv_connector, MockKVConfig): diff --git a/vllm/config/cache.py b/vllm/config/cache.py index 2652c7c06ad0f..ef6928d8ebd5c 100644 --- a/vllm/config/cache.py +++ b/vllm/config/cache.py @@ -73,8 +73,8 @@ class CacheConfig: sliding_window: int | None = None """Sliding window size for the KV cache. This is primarily set in `ModelConfig` and that value should be manually duplicated here.""" - enable_prefix_caching: bool | None = None - """Whether to enable prefix caching. 
Enabled by default for V1.""" + enable_prefix_caching: bool = True + """Whether to enable prefix caching.""" prefix_caching_hash_algo: PrefixCachingHashAlgo = "sha256" """Set the hash algorithm for prefix caching:\n - "sha256" uses Pickle for object serialization before hashing.\n diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index 888f57b1ac1df..611bf1b375849 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -425,7 +425,7 @@ class EngineArgs: ParallelConfig.max_parallel_loading_workers ) block_size: BlockSize | None = CacheConfig.block_size - enable_prefix_caching: bool | None = CacheConfig.enable_prefix_caching + enable_prefix_caching: bool | None = None prefix_caching_hash_algo: PrefixCachingHashAlgo = ( CacheConfig.prefix_caching_hash_algo ) @@ -1975,10 +1975,11 @@ class EngineArgs: if self.prefill_context_parallel_size > 1: default_chunked_prefill = False default_prefix_caching = False - logger.warning( + logger.warning_once( "--prefill-context-parallel-size > 1 is not compatible with " "chunked prefill and prefix caching now. Chunked prefill " - "and prefix caching have been disabled by default." + "and prefix caching have been disabled by default.", + scope="local", ) if self.enable_chunked_prefill is None: @@ -1988,15 +1989,27 @@ class EngineArgs: "%s chunked prefill by default", "Enabling" if default_chunked_prefill else "Disabling", ) + elif ( + model_config.runner_type == "generate" + and not self.enable_chunked_prefill + and default_chunked_prefill + ): + logger.warning_once( + "This model does not officially support disabling chunked prefill. " + "Disabling this manually may cause the engine to crash " + "or produce incorrect outputs.", + scope="local", + ) elif ( model_config.runner_type == "pooling" and self.enable_chunked_prefill and not default_chunked_prefill ): - logger.warning( + logger.warning_once( "This model does not officially support chunked prefill. 
" "Enabling this manually may cause the engine to crash " "or produce incorrect outputs.", + scope="local", ) if self.enable_prefix_caching is None: @@ -2011,10 +2024,11 @@ class EngineArgs: and self.enable_prefix_caching and not default_prefix_caching ): - logger.warning( + logger.warning_once( "This model does not officially support prefix caching. " "Enabling this manually may cause the engine to crash " "or produce incorrect outputs.", + scope="local", ) world_size = self.pipeline_parallel_size * self.tensor_parallel_size diff --git a/vllm/v1/core/sched/scheduler.py b/vllm/v1/core/sched/scheduler.py index 4cb5348cbacc3..a7ec0de372631 100644 --- a/vllm/v1/core/sched/scheduler.py +++ b/vllm/v1/core/sched/scheduler.py @@ -180,7 +180,7 @@ class Scheduler(SchedulerInterface): self.kv_cache_manager = KVCacheManager( kv_cache_config=kv_cache_config, max_model_len=self.max_model_len, - enable_caching=bool(self.cache_config.enable_prefix_caching), + enable_caching=self.cache_config.enable_prefix_caching, use_eagle=self.use_eagle, log_stats=self.log_stats, enable_kv_cache_events=self.enable_kv_cache_events, From 6965a392a4cd38ee65ac6a9c2730e0a7c62a658d Mon Sep 17 00:00:00 2001 From: Nandan Vallamdasu Date: Sat, 22 Nov 2025 18:28:22 +0530 Subject: [PATCH 53/83] Fix: Resolve circular import in model_loader/utils.py (#29189) Signed-off-by: nandan2003 Signed-off-by: Nandan Vallamdasu Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com> Co-authored-by: Cyrus Leung --- vllm/model_executor/model_loader/utils.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/vllm/model_executor/model_loader/utils.py b/vllm/model_executor/model_loader/utils.py index e74434e9d12cb..1db6337f4c9f9 100644 --- a/vllm/model_executor/model_loader/utils.py +++ b/vllm/model_executor/model_loader/utils.py @@ -19,12 +19,7 @@ from vllm.model_executor.layers.quantization.base_config import ( QuantizationConfig, 
QuantizeMethodBase, ) -from vllm.model_executor.models.adapters import ( - as_embedding_model, - as_reward_model, - as_seq_cls_model, - try_create_mm_pooling_model_cls, -) + from vllm.model_executor.models.interfaces import SupportsQuant, supports_multimodal from vllm.utils.platform_utils import is_pin_memory_available @@ -172,6 +167,12 @@ _MODEL_ARCH_BY_HASH = dict[int, tuple[type[nn.Module], str]]() def _get_model_architecture(model_config: ModelConfig) -> tuple[type[nn.Module], str]: + from vllm.model_executor.models.adapters import ( + as_embedding_model, + as_reward_model, + as_seq_cls_model, + try_create_mm_pooling_model_cls, + ) architectures = getattr(model_config.hf_config, "architectures", []) model_cls, arch = model_config.registry.resolve_model_cls( From 2d4978a57e0addf55cde6113e9615ed064b72fb7 Mon Sep 17 00:00:00 2001 From: yihong Date: Sat, 22 Nov 2025 21:00:04 +0800 Subject: [PATCH 54/83] fix: clean up function never use in setup.py (#29061) Signed-off-by: yihong0618 --- setup.py | 34 ---------------------------------- 1 file changed, 34 deletions(-) diff --git a/setup.py b/setup.py index 5591bcb132447..8871b04d8fc46 100644 --- a/setup.py +++ b/setup.py @@ -74,18 +74,6 @@ def is_ninja_available() -> bool: return which("ninja") is not None -def is_url_available(url: str) -> bool: - from urllib.request import urlopen - - status = None - try: - with urlopen(url) as f: - status = f.status - except Exception: - return False - return status == 200 - - class CMakeExtension(Extension): def __init__(self, name: str, cmake_lists_dir: str = ".", **kwa) -> None: super().__init__(name, sources=[], py_limited_api=True, **kwa) @@ -533,28 +521,6 @@ def get_nvcc_cuda_version() -> Version: return nvcc_cuda_version -def get_gaudi_sw_version(): - """ - Returns the driver version. 
- """ - # Enable console printing for `hl-smi` check - output = subprocess.run( - "hl-smi", - shell=True, - text=True, - capture_output=True, - env={"ENABLE_CONSOLE": "true"}, - ) - if output.returncode == 0 and output.stdout: - return ( - output.stdout.split("\n")[2] - .replace(" ", "") - .split(":")[1][:-1] - .split("-")[0] - ) - return "0.0.0" # when hl-smi is not available - - def get_vllm_version() -> str: # Allow overriding the version. This is useful to build platform-specific # wheels (e.g. CPU, TPU) without modifying the source. From 5f7209a793ec553889f8ba9972a0034393a6b196 Mon Sep 17 00:00:00 2001 From: Bram Wasti Date: Sat, 22 Nov 2025 08:00:50 -0500 Subject: [PATCH 55/83] [tiny] Remove unsupported TRITON_MLA backend from batch invariance (#28832) Signed-off-by: Bram Wasti Signed-off-by: Bram Wasti Co-authored-by: Wentao Ye <44945378+yewentao256@users.noreply.github.com> --- vllm/model_executor/layers/batch_invariant.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/model_executor/layers/batch_invariant.py b/vllm/model_executor/layers/batch_invariant.py index bec7af0286345..8b33727f05fbc 100644 --- a/vllm/model_executor/layers/batch_invariant.py +++ b/vllm/model_executor/layers/batch_invariant.py @@ -805,11 +805,11 @@ def override_envs_for_invariance(): "FLASH_ATTN", # best supported backend "FLASHINFER", "FLASH_ATTN_MLA", - "TRITON_MLA", # Not yet supported MLA backends # "FLASHMLA", # "FLEX_ATTENTION", # IMA issue even if we disable batch invariance # "FLASHINFER_MLA", https://github.com/vllm-project/vllm/pull/28967 + # "TRITON_MLA", ] if curr_attn_backend not in supported_backends: warning = ( From 066209a045216c87bd582be97830eae728a29369 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nicol=C3=B2=20Lucchesi?= Date: Sat, 22 Nov 2025 15:38:44 +0100 Subject: [PATCH 56/83] [Attention] Refactor FA `block_size` limitations to hybrid models only (#29084) Signed-off-by: NickLucche --- tests/v1/attention/test_mla_backends.py | 2 +- 
tests/v1/worker/test_gpu_model_runner.py | 4 ++- vllm/attention/backends/abstract.py | 10 ++++--- vllm/v1/attention/backends/flash_attn.py | 27 ++++++++++++++----- vllm/v1/attention/backends/flashinfer.py | 12 ++++----- vllm/v1/attention/backends/mla/cutlass_mla.py | 5 +++- .../attention/backends/mla/flashattn_mla.py | 5 +++- .../attention/backends/mla/flashinfer_mla.py | 5 +++- vllm/v1/attention/backends/mla/flashmla.py | 5 +++- .../attention/backends/mla/flashmla_sparse.py | 5 +++- vllm/v1/attention/backends/mla/indexer.py | 6 ++--- .../attention/backends/mla/rocm_aiter_mla.py | 4 ++- vllm/v1/attention/backends/rocm_aiter_fa.py | 5 +++- vllm/v1/attention/backends/tree_attn.py | 5 +++- vllm/v1/attention/backends/triton_attn.py | 5 +++- vllm/v1/attention/backends/xformers.py | 5 +++- vllm/v1/worker/gpu_model_runner.py | 4 +-- 17 files changed, 82 insertions(+), 32 deletions(-) diff --git a/tests/v1/attention/test_mla_backends.py b/tests/v1/attention/test_mla_backends.py index 1bd05e6183dc2..783e02ce89bdb 100644 --- a/tests/v1/attention/test_mla_backends.py +++ b/tests/v1/attention/test_mla_backends.py @@ -61,7 +61,7 @@ for backend in BACKENDS_TO_TEST: BACKEND_BLOCK_SIZES = {} for backend in BACKENDS_TO_TEST: - supported_sizes = backend.get_class().supported_kernel_block_sizes + supported_sizes = backend.get_class().get_supported_kernel_block_sizes() if supported_sizes: default_size = supported_sizes[0] block_size = ( diff --git a/tests/v1/worker/test_gpu_model_runner.py b/tests/v1/worker/test_gpu_model_runner.py index 01c1364f7ee62..d0f1b703fcb92 100644 --- a/tests/v1/worker/test_gpu_model_runner.py +++ b/tests/v1/worker/test_gpu_model_runner.py @@ -185,7 +185,9 @@ def _make_mock_backend_for_kernel_block_size( supported_sizes: list[int | MultipleOf], ): class _MockBackend: - supported_kernel_block_sizes = supported_sizes + @staticmethod + def get_supported_kernel_block_sizes(): + return supported_sizes return _MockBackend() diff --git 
a/vllm/attention/backends/abstract.py b/vllm/attention/backends/abstract.py index 67ded88475243..bd7e81b15bfc3 100644 --- a/vllm/attention/backends/abstract.py +++ b/vllm/attention/backends/abstract.py @@ -46,9 +46,12 @@ class AttentionBackend(ABC): # makes sure the output tensor is allocated inside the cudagraph. accept_output_buffer: bool = False supported_dtypes: ClassVar[list[torch.dtype]] = [torch.float16, torch.bfloat16] - supported_kernel_block_sizes: ClassVar[list[int | MultipleOf]] = [MultipleOf(1)] supported_kv_cache_dtypes: ClassVar[list["CacheDType"]] = ["auto"] + @staticmethod + def get_supported_kernel_block_sizes() -> list[int | MultipleOf]: + return [MultipleOf(1)] + @staticmethod @abstractmethod def get_name() -> str: @@ -142,10 +145,11 @@ class AttentionBackend(ABC): if block_size not in valid_sizes: return False - if not cls.supported_kernel_block_sizes: + supported_kernel_block_sizes = cls.get_supported_kernel_block_sizes() + if not supported_kernel_block_sizes: return True - for supported_size in cls.supported_kernel_block_sizes: + for supported_size in supported_kernel_block_sizes: if isinstance(supported_size, MultipleOf): supported_size = supported_size.base # With hybrid_blocks feature, the framework-level block size diff --git a/vllm/v1/attention/backends/flash_attn.py b/vllm/v1/attention/backends/flash_attn.py index 9fa6b1dfd19dd..a9a4af5ac1183 100755 --- a/vllm/v1/attention/backends/flash_attn.py +++ b/vllm/v1/attention/backends/flash_attn.py @@ -32,7 +32,7 @@ if is_flash_attn_varlen_func_available(): get_scheduler_metadata, reshape_and_cache_flash, ) -from vllm.config import VllmConfig, get_layers_from_vllm_config +from vllm.config import VllmConfig, get_current_vllm_config, get_layers_from_vllm_config from vllm.config.cache import CacheDType from vllm.distributed.parallel_state import get_dcp_group from vllm.logger import init_logger @@ -56,11 +56,26 @@ logger = init_logger(__name__) class FlashAttentionBackend(AttentionBackend): 
accept_output_buffer: bool = True supported_dtypes: ClassVar[list[torch.dtype]] = [torch.float16, torch.bfloat16] - # NOTE(tdoublep): while in principle, FA supports - # MultipleOf(16), these are the block sizes that do not - # suffer from the NaN propagation problem described here: - # https://github.com/Dao-AILab/flash-attention/issues/1974 - supported_kernel_block_sizes: ClassVar[list[int | MultipleOf]] = [16, 32, 64] + + @staticmethod + def get_supported_kernel_block_sizes() -> list[int | MultipleOf]: + vllm_config = get_current_vllm_config() + model_config = vllm_config.model_config + cache_config = vllm_config.cache_config + if ( + model_config + and model_config.is_hybrid + and ( + cache_config.mamba_ssm_cache_dtype == "float32" + or cache_config.mamba_cache_dtype == "float32" + ) + ): + # NOTE(tdoublep): while in principle, FA supports + # MultipleOf(16), these are the block sizes that do not + # suffer from the NaN propagation problem described here: + # https://github.com/Dao-AILab/flash-attention/issues/1974 + return [16, 32, 64] + return [MultipleOf(16)] @staticmethod def get_name() -> str: diff --git a/vllm/v1/attention/backends/flashinfer.py b/vllm/v1/attention/backends/flashinfer.py index e3f499216d7f1..8159f4096107f 100755 --- a/vllm/v1/attention/backends/flashinfer.py +++ b/vllm/v1/attention/backends/flashinfer.py @@ -16,7 +16,6 @@ from flashinfer import ( from flashinfer.decode import _get_range_buf, trtllm_batch_decode_with_kv_cache from flashinfer.prefill import trtllm_batch_context_with_kv_cache from flashinfer.utils import FP4Tensor -from typing_extensions import override from vllm import envs from vllm.attention.backends.abstract import ( @@ -275,10 +274,6 @@ class BatchDCPPrefillWrapper: class FlashInferBackend(AttentionBackend): accept_output_buffer: bool = True supported_dtypes: ClassVar[list[torch.dtype]] = [torch.float16, torch.bfloat16] - # Note: Not sure for all platforms, - # but on Blackwell, only support a page size of - # 16, 32, 
64 - supported_kernel_block_sizes: ClassVar[list[int | MultipleOf]] = [16, 32, 64] supported_kv_cache_dtypes: ClassVar[list[CacheDType]] = [ "auto", "fp8", @@ -286,6 +281,12 @@ class FlashInferBackend(AttentionBackend): "fp8_e5m2", ] + @staticmethod + def get_supported_kernel_block_sizes() -> list[int | MultipleOf]: + # Note: Not sure for all platforms, but on Blackwell, + # only support a page size of 16, 32, 64. + return [16, 32, 64] + @staticmethod def get_name() -> str: return "FLASHINFER" @@ -566,7 +567,6 @@ class FlashInferMetadataBuilder(AttentionMetadataBuilder[FlashInferMetadata]): ) @classmethod - @override def get_cudagraph_support( cls: type["FlashInferMetadataBuilder"], vllm_config: VllmConfig, diff --git a/vllm/v1/attention/backends/mla/cutlass_mla.py b/vllm/v1/attention/backends/mla/cutlass_mla.py index 60cb5022a55eb..5e3fbc0abf083 100644 --- a/vllm/v1/attention/backends/mla/cutlass_mla.py +++ b/vllm/v1/attention/backends/mla/cutlass_mla.py @@ -36,13 +36,16 @@ class CutlassMLAMetadataBuilder(MLACommonMetadataBuilder[MLACommonMetadata]): class CutlassMLABackend(MLACommonBackend): supported_dtypes: ClassVar[list[torch.dtype]] = [torch.float16, torch.bfloat16] - supported_kernel_block_sizes: ClassVar[list[int | MultipleOf]] = [128] supported_kv_cache_dtypes: ClassVar[list[CacheDType]] = [ "auto", "fp8", "fp8_e4m3", ] + @staticmethod + def get_supported_kernel_block_sizes() -> list[int | MultipleOf]: + return [128] + @staticmethod def get_name() -> str: return "CUTLASS_MLA" diff --git a/vllm/v1/attention/backends/mla/flashattn_mla.py b/vllm/v1/attention/backends/mla/flashattn_mla.py index 12639edc8b9a1..d369814c10b6f 100644 --- a/vllm/v1/attention/backends/mla/flashattn_mla.py +++ b/vllm/v1/attention/backends/mla/flashattn_mla.py @@ -41,9 +41,12 @@ logger = init_logger(__name__) class FlashAttnMLABackend(MLACommonBackend): supported_dtypes: ClassVar[list[torch.dtype]] = [torch.float16, torch.bfloat16] - supported_kernel_block_sizes: ClassVar[list[int | 
MultipleOf]] = [MultipleOf(16)] supported_kv_cache_dtypes: ClassVar[list[CacheDType]] = ["auto"] + @staticmethod + def get_supported_kernel_block_sizes() -> list[int | MultipleOf]: + return [MultipleOf(16)] + @staticmethod def get_name() -> str: return "FLASH_ATTN_MLA" diff --git a/vllm/v1/attention/backends/mla/flashinfer_mla.py b/vllm/v1/attention/backends/mla/flashinfer_mla.py index 52bb19e039e45..f02a4bb1ef35a 100644 --- a/vllm/v1/attention/backends/mla/flashinfer_mla.py +++ b/vllm/v1/attention/backends/mla/flashinfer_mla.py @@ -35,13 +35,16 @@ class FlashInferMLAMetadataBuilder(MLACommonMetadataBuilder[MLACommonMetadata]): class FlashInferMLABackend(MLACommonBackend): supported_dtypes: ClassVar[list[torch.dtype]] = [torch.float16, torch.bfloat16] - supported_kernel_block_sizes: ClassVar[list[int | MultipleOf]] = [32, 64] supported_kv_cache_dtypes: ClassVar[list[CacheDType]] = [ "auto", "fp8", "fp8_e4m3", ] + @staticmethod + def get_supported_kernel_block_sizes() -> list[int | MultipleOf]: + return [32, 64] + @staticmethod def get_name() -> str: return "FLASHINFER_MLA" diff --git a/vllm/v1/attention/backends/mla/flashmla.py b/vllm/v1/attention/backends/mla/flashmla.py index 3aab1f9bb7fb6..74a4cd8430250 100644 --- a/vllm/v1/attention/backends/mla/flashmla.py +++ b/vllm/v1/attention/backends/mla/flashmla.py @@ -39,13 +39,16 @@ logger = init_logger(__name__) class FlashMLABackend(MLACommonBackend): supported_dtypes: ClassVar[list[torch.dtype]] = [torch.float16, torch.bfloat16] - supported_kernel_block_sizes: ClassVar[list[int | MultipleOf]] = [64] supported_kv_cache_dtypes: ClassVar[list[CacheDType]] = [ "auto", "fp8", "fp8_e4m3", ] + @staticmethod + def get_supported_kernel_block_sizes() -> list[int | MultipleOf]: + return [64] + @staticmethod def get_name() -> str: return "FLASHMLA" diff --git a/vllm/v1/attention/backends/mla/flashmla_sparse.py b/vllm/v1/attention/backends/mla/flashmla_sparse.py index 3f2cc8c38327e..1eee1d225293b 100644 --- 
a/vllm/v1/attention/backends/mla/flashmla_sparse.py +++ b/vllm/v1/attention/backends/mla/flashmla_sparse.py @@ -55,9 +55,12 @@ structured as: class FlashMLASparseBackend(AttentionBackend): accept_output_buffer: bool = True supported_dtypes: ClassVar[list[torch.dtype]] = [torch.bfloat16] - supported_kernel_block_sizes: ClassVar[list[int | MultipleOf]] = [64] supported_kv_cache_dtypes: ClassVar[list[CacheDType]] = ["auto", "fp8_ds_mla"] + @staticmethod + def get_supported_kernel_block_sizes() -> list[int | MultipleOf]: + return [64] + @staticmethod def get_name() -> str: return "FLASHMLA_SPARSE" diff --git a/vllm/v1/attention/backends/mla/indexer.py b/vllm/v1/attention/backends/mla/indexer.py index d38361e0fcbf8..77f1ba00d5b04 100644 --- a/vllm/v1/attention/backends/mla/indexer.py +++ b/vllm/v1/attention/backends/mla/indexer.py @@ -24,9 +24,9 @@ logger = init_logger(__name__) class DeepseekV32IndexerBackend(AttentionBackend): - supported_kernel_block_sizes: ClassVar[list[int | MultipleOf]] = [ - 1 if current_platform.is_rocm() else 64 - ] + @staticmethod + def get_supported_kernel_block_sizes() -> list[int | MultipleOf]: + return [1 if current_platform.is_rocm() else 64] @classmethod def get_supported_head_sizes(cls) -> list[int]: diff --git a/vllm/v1/attention/backends/mla/rocm_aiter_mla.py b/vllm/v1/attention/backends/mla/rocm_aiter_mla.py index 6ccc1a341d56c..56f9c7a281e7f 100644 --- a/vllm/v1/attention/backends/mla/rocm_aiter_mla.py +++ b/vllm/v1/attention/backends/mla/rocm_aiter_mla.py @@ -21,7 +21,9 @@ from vllm.v1.kv_cache_interface import AttentionSpec class AiterMLABackend(MLACommonBackend): - supported_kernel_block_sizes: ClassVar[list[int | MultipleOf]] = [1] + @staticmethod + def get_supported_kernel_block_sizes() -> list[int | MultipleOf]: + return [1] @staticmethod def get_name() -> str: diff --git a/vllm/v1/attention/backends/rocm_aiter_fa.py b/vllm/v1/attention/backends/rocm_aiter_fa.py index ea611848b0e81..c8742e9835203 100644 --- 
a/vllm/v1/attention/backends/rocm_aiter_fa.py +++ b/vllm/v1/attention/backends/rocm_aiter_fa.py @@ -447,7 +447,10 @@ class AiterFlashAttentionMetadataBuilder( class AiterFlashAttentionBackend(AttentionBackend): accept_output_buffer: bool = True supported_dtypes: ClassVar[list[torch.dtype]] = [torch.float16, torch.bfloat16] - supported_kernel_block_sizes: ClassVar[list[int | MultipleOf]] = [MultipleOf(16)] + + @staticmethod + def get_supported_kernel_block_sizes() -> list[int | MultipleOf]: + return [MultipleOf(16)] @classmethod def get_supported_head_sizes(cls) -> list[int]: diff --git a/vllm/v1/attention/backends/tree_attn.py b/vllm/v1/attention/backends/tree_attn.py index 1bf38ed225a4c..523f759e05a21 100644 --- a/vllm/v1/attention/backends/tree_attn.py +++ b/vllm/v1/attention/backends/tree_attn.py @@ -31,7 +31,10 @@ logger = init_logger(__name__) class TreeAttentionBackend(AttentionBackend): accept_output_buffer: bool = True supported_dtypes: ClassVar[list[torch.dtype]] = [torch.float16, torch.bfloat16] - supported_kernel_block_sizes: ClassVar[list[int | MultipleOf]] = [MultipleOf(16)] + + @staticmethod + def get_supported_kernel_block_sizes() -> list[int | MultipleOf]: + return [MultipleOf(16)] @classmethod def get_supported_head_sizes(cls) -> list[int]: diff --git a/vllm/v1/attention/backends/triton_attn.py b/vllm/v1/attention/backends/triton_attn.py index 09c36043c8c86..d051a89f03bb4 100644 --- a/vllm/v1/attention/backends/triton_attn.py +++ b/vllm/v1/attention/backends/triton_attn.py @@ -154,7 +154,6 @@ class TritonAttentionBackend(AttentionBackend): torch.bfloat16, torch.float32, ] - supported_kernel_block_sizes: ClassVar[list[int | MultipleOf]] = [MultipleOf(16)] supported_kv_cache_dtypes: ClassVar[list[CacheDType]] = [ "auto", "fp8", @@ -162,6 +161,10 @@ class TritonAttentionBackend(AttentionBackend): "fp8_e5m2", ] + @staticmethod + def get_supported_kernel_block_sizes() -> list[int | MultipleOf]: + return [MultipleOf(16)] + @staticmethod def get_name() -> 
str: return "TRITON_ATTN" diff --git a/vllm/v1/attention/backends/xformers.py b/vllm/v1/attention/backends/xformers.py index d15d79417cc61..5039c44b9c3e6 100644 --- a/vllm/v1/attention/backends/xformers.py +++ b/vllm/v1/attention/backends/xformers.py @@ -42,7 +42,10 @@ logger = init_logger(__name__) class XFormersAttentionBackend(AttentionBackend): accept_output_buffer: bool = True supported_dtypes: ClassVar[list[torch.dtype]] = [torch.float16, torch.bfloat16] - supported_kernel_block_sizes: ClassVar[list[int | MultipleOf]] = [MultipleOf(16)] + + @staticmethod + def get_supported_kernel_block_sizes() -> list[int | MultipleOf]: + return [MultipleOf(16)] @classmethod def get_supported_head_sizes(cls) -> list[int]: diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index e786cd8bc7c97..298bb1ef5f6fd 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -4618,7 +4618,7 @@ class GPUModelRunner( """ for backend in backends: is_supported = False - for supported_size in backend.supported_kernel_block_sizes: + for supported_size in backend.get_supported_kernel_block_sizes(): if isinstance(supported_size, int): if block_size == supported_size: is_supported = True @@ -4649,7 +4649,7 @@ class GPUModelRunner( all_int_supported_sizes = set( supported_size for backend in backends - for supported_size in backend.supported_kernel_block_sizes + for supported_size in backend.get_supported_kernel_block_sizes() if isinstance(supported_size, int) ) From d44a63c6d6e1a545aff270b3b85cf231ef779dab Mon Sep 17 00:00:00 2001 From: Nick Hill Date: Sat, 22 Nov 2025 06:41:25 -0800 Subject: [PATCH 57/83] [BugFix] Fix returned logprobs with spec decode + prefill chunking (#29216) Signed-off-by: Nick Hill --- tests/v1/sample/test_logprobs.py | 13 +++++++++---- vllm/v1/sample/sampler.py | 5 ++++- vllm/v1/worker/gpu_model_runner.py | 19 +++++++++---------- 3 files changed, 22 insertions(+), 15 deletions(-) diff --git 
a/tests/v1/sample/test_logprobs.py b/tests/v1/sample/test_logprobs.py index 42584938bc06f..c0b0e1ea226ed 100644 --- a/tests/v1/sample/test_logprobs.py +++ b/tests/v1/sample/test_logprobs.py @@ -521,8 +521,8 @@ def test_logprobs_mode(logprobs_mode: LogprobsMode): pytest.param( ( "eagle", - "meta-llama/Llama-3.1-8B-Instruct", - "yuhuili/EAGLE-LLaMA3.1-Instruct-8B", + "meta-llama/Llama-3.2-1B-Instruct", + "nm-testing/Llama3_2_1B_speculator.eagle3", ), marks=large_gpu_mark(min_gb=32), ), @@ -541,7 +541,7 @@ def test_spec_decode_logprobs( """ from vllm import LLM - prompt = "Hello world" + prompt = "Hello world " * 50 sampling_params = SamplingParams( temperature=0, logprobs=3, max_tokens=10, ignore_eos=False ) @@ -582,6 +582,9 @@ def test_spec_decode_logprobs( seed=42, logprobs_mode=logprobs_mode, gpu_memory_utilization=0.4, + # Force prefill chunking + enable_chunked_prefill=True, + max_num_batched_tokens=32, ) spec_results = spec_llm.generate([prompt], sampling_params) # Collect logprobs outputs from spec decode LLM. @@ -597,6 +600,8 @@ def test_spec_decode_logprobs( # Per-token logprobs are expected to be the same. 
assert len(ref_logprobs) == len(spec_logprobs) for ref_logprob, spec_logprob in zip(ref_logprobs, spec_logprobs): - assert math.isclose(ref_logprob.logprob, spec_logprob.logprob, abs_tol=1e-3) + assert math.isclose( + ref_logprob.logprob, spec_logprob.logprob, rel_tol=5e-2, abs_tol=1e-1 + ) assert ref_logprob.rank == spec_logprob.rank assert ref_logprob.decoded_token == spec_logprob.decoded_token diff --git a/vllm/v1/sample/sampler.py b/vllm/v1/sample/sampler.py index 39c63fe31ad2c..c75b4f0543c0d 100644 --- a/vllm/v1/sample/sampler.py +++ b/vllm/v1/sample/sampler.py @@ -81,7 +81,10 @@ class Sampler(nn.Module): if logprobs_mode == "raw_logprobs": raw_logprobs = self.compute_logprobs(logits) elif logprobs_mode == "raw_logits": - raw_logprobs = logits.clone() + if logits.dtype == torch.float32: + raw_logprobs = logits.clone() + else: + raw_logprobs = logits.to(torch.float32) # Use float32 for the logits. logits = logits.to(torch.float32) diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 298bb1ef5f6fd..979f977587038 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -2466,7 +2466,9 @@ class GPUModelRunner( num_sampled_tokens = sampler_output.sampled_token_ids.shape[0] sampled_token_ids = sampler_output.sampled_token_ids + logprobs_tensors = sampler_output.logprobs_tensors invalid_req_indices = [] + cu_num_new_tokens: list[int] | None = None if not self.use_async_scheduling: # Get the valid generated tokens. max_gen_len = sampled_token_ids.shape[-1] @@ -2479,6 +2481,12 @@ class GPUModelRunner( sampled_token_ids, self.input_batch.vocab_size, ) + if logprobs_tensors: + # Needed for extracting logprobs when spec decoding. + # This must be done prior to discarding sampled tokens. + cu_num_new_tokens = [0] + for toks in valid_sampled_token_ids: + cu_num_new_tokens.append(cu_num_new_tokens[-1] + len(toks)) # Mask out the sampled tokens that should not be sampled. 
for i in discard_sampled_tokens_req_indices: valid_sampled_token_ids[int(i)].clear() @@ -2506,10 +2514,6 @@ class GPUModelRunner( # the sampled tokens back, because there's no direct communication # between the first-stage worker and the last-stage worker. req_ids = self.input_batch.req_ids - logprobs_tensors = sampler_output.logprobs_tensors - cu_num_accepted_tokens = ( - [0] if spec_decode_metadata and logprobs_tensors else None - ) for req_idx in range(num_sampled_tokens): if self.use_async_scheduling: sampled_ids = [-1] if req_idx not in invalid_req_indices_set else None @@ -2518,11 +2522,6 @@ class GPUModelRunner( num_sampled_ids: int = len(sampled_ids) if sampled_ids else 0 - if cu_num_accepted_tokens is not None: - cu_num_accepted_tokens.append( - cu_num_accepted_tokens[-1] + num_sampled_ids - ) - if not sampled_ids: continue @@ -2544,7 +2543,7 @@ class GPUModelRunner( req_state.output_token_ids.extend(sampled_ids) logprobs_lists = ( - logprobs_tensors.tolists(cu_num_accepted_tokens) + logprobs_tensors.tolists(cu_num_new_tokens) if not self.use_async_scheduling and logprobs_tensors is not None else None ) From ae66818379fc2403e43c47154a98170aa7cab192 Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Sat, 22 Nov 2025 22:48:01 +0800 Subject: [PATCH 58/83] [Misc] Fix pre-commit (#29238) Signed-off-by: DarkLight1337 --- vllm/model_executor/model_loader/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/model_executor/model_loader/utils.py b/vllm/model_executor/model_loader/utils.py index 1db6337f4c9f9..2021b68b8a60b 100644 --- a/vllm/model_executor/model_loader/utils.py +++ b/vllm/model_executor/model_loader/utils.py @@ -19,7 +19,6 @@ from vllm.model_executor.layers.quantization.base_config import ( QuantizationConfig, QuantizeMethodBase, ) - from vllm.model_executor.models.interfaces import SupportsQuant, supports_multimodal from vllm.utils.platform_utils import is_pin_memory_available @@ -173,6 +172,7 @@ def 
_get_model_architecture(model_config: ModelConfig) -> tuple[type[nn.Module], as_seq_cls_model, try_create_mm_pooling_model_cls, ) + architectures = getattr(model_config.hf_config, "architectures", []) model_cls, arch = model_config.registry.resolve_model_cls( From d84d8f4429a5246a9d9f179b47fac7e13801710d Mon Sep 17 00:00:00 2001 From: ZiTian Zhao Date: Sat, 22 Nov 2025 22:48:59 +0800 Subject: [PATCH 59/83] Fix EVS crash when using `video_embeds` inputs in Qwen2.5-VL (#29232) Signed-off-by: zitian.zhao Co-authored-by: Cyrus Leung --- vllm/model_executor/models/qwen2_5_vl.py | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/vllm/model_executor/models/qwen2_5_vl.py b/vllm/model_executor/models/qwen2_5_vl.py index 8e3c0e84dfe51..1500a437613cc 100644 --- a/vllm/model_executor/models/qwen2_5_vl.py +++ b/vllm/model_executor/models/qwen2_5_vl.py @@ -230,6 +230,9 @@ class Qwen2_5_VLVideoEmbeddingInputs(TensorSchema): - hidden_size must match the hidden size of language model backbone. - video_grid_thw shape: (num_videos, 3) in (grid_t, grid_h, grid_w) format + - second_per_grid_ts: The video time interval (in seconds) for each + grid along the temporal dimension in the 3D position IDs. Returned + when `videos` is not `None`. 
""" type: Literal["video_embeds"] @@ -244,6 +247,11 @@ class Qwen2_5_VLVideoEmbeddingInputs(TensorSchema): TensorShape("nv", 3), ] + second_per_grid_ts: Annotated[ + torch.Tensor | None, + TensorShape("nv"), + ] = None + Qwen2_5_VLVideoInputs: TypeAlias = ( Qwen2_5_VLVideoPixelInputs | Qwen2_5_VLVideoEmbeddingInputs @@ -1311,6 +1319,7 @@ class Qwen2_5_VLForConditionalGeneration( type="video_embeds", video_embeds=video_embeds, video_grid_thw=video_grid_thw, + second_per_grid_ts=second_per_grid_ts, ) def _process_image_input( @@ -1422,7 +1431,13 @@ class Qwen2_5_VLForConditionalGeneration( # Cast to long to match the original code # https://github.com/huggingface/transformers/blob/41980ce93e775f6c88500c51c8db7946fc6a2add/src/transformers/models/qwen2_5_vl/modular_qwen2_5_vl.py#L491 # noqa - second_per_grid_ts = video_input["second_per_grid_ts"].long() + second_per_grid_ts = video_input.get("second_per_grid_ts") + if second_per_grid_ts is None: + raise ValueError( + "second_per_grid_ts is required when video_pruning_rate > 0 " + "is enabled for video inputs, including the video_embeds path." 
+ ) + second_per_grid_ts = second_per_grid_ts.long() tokens_per_second = self.config.vision_config.tokens_per_second video_embeds_out = [] From f55c76c2b3270bb45072c05d6d53460c373b2172 Mon Sep 17 00:00:00 2001 From: Federico <65908512+coval3nte@users.noreply.github.com> Date: Sat, 22 Nov 2025 17:42:48 +0100 Subject: [PATCH 60/83] chore: add RTX_PRO_6000 GLM4.6-FP8 kernel tuning (#29240) --- ...ackwell_Server_Edition,dtype=fp8_w8a8.json | 147 ++++++++++++++++++ 1 file changed, 147 insertions(+) create mode 100644 vllm/model_executor/layers/fused_moe/configs/E=20,N=1536,device_name=NVIDIA_RTX_PRO_6000_Blackwell_Server_Edition,dtype=fp8_w8a8.json diff --git a/vllm/model_executor/layers/fused_moe/configs/E=20,N=1536,device_name=NVIDIA_RTX_PRO_6000_Blackwell_Server_Edition,dtype=fp8_w8a8.json b/vllm/model_executor/layers/fused_moe/configs/E=20,N=1536,device_name=NVIDIA_RTX_PRO_6000_Blackwell_Server_Edition,dtype=fp8_w8a8.json new file mode 100644 index 0000000000000..8b78f87e7f73b --- /dev/null +++ b/vllm/model_executor/layers/fused_moe/configs/E=20,N=1536,device_name=NVIDIA_RTX_PRO_6000_Blackwell_Server_Edition,dtype=fp8_w8a8.json @@ -0,0 +1,147 @@ +{ + "triton_version": "3.5.0", + "1": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 2 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + }, + "4": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 2 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 5 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 32, + 
"num_warps": 4, + "num_stages": 3 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "48": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 3 + }, + "64": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 3 + }, + "96": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 3 + }, + "128": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 3 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 3 + }, + "1024": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "1536": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 3 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 3 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 3 + } +} From 730bd35378bf2a5b56b6d3a45be28b3092d26519 Mon Sep 17 00:00:00 2001 From: Fadi Arafeh <115173828+fadara01@users.noreply.github.com> Date: Sat, 22 Nov 2025 17:04:36 +0000 Subject: [PATCH 61/83] [perf][cpu] Accelerate paged attention GEMMs (QK, PV) on Arm 
CPUs with NEON (#29193) Signed-off-by: Fadi Arafeh --- csrc/cpu/cpu_attn.cpp | 17 ++ csrc/cpu/cpu_attn_impl.hpp | 8 +- csrc/cpu/cpu_attn_neon.hpp | 386 +++++++++++++++++++++++++ vllm/engine/arg_utils.py | 3 +- vllm/v1/attention/backends/cpu_attn.py | 7 +- 5 files changed, 416 insertions(+), 5 deletions(-) create mode 100644 csrc/cpu/cpu_attn_neon.hpp diff --git a/csrc/cpu/cpu_attn.cpp b/csrc/cpu/cpu_attn.cpp index 50f17c758c148..92f8bee5a47a0 100644 --- a/csrc/cpu/cpu_attn.cpp +++ b/csrc/cpu/cpu_attn.cpp @@ -13,6 +13,18 @@ #define AMX_DISPATCH(...) case cpu_attention::ISA::AMX: #endif +#ifdef __aarch64__ + #include "cpu_attn_neon.hpp" + #define NEON_DISPATCH(...) \ + case cpu_attention::ISA::NEON: { \ + using attn_impl = cpu_attention::AttentionImpl; \ + return __VA_ARGS__(); \ + } +#else + #define NEON_DISPATCH(...) case cpu_attention::ISA::NEON: +#endif // #ifdef __aarch64__ + #define CPU_ATTN_DISPATCH_CASE(HEAD_DIM, ...) \ case HEAD_DIM: { \ constexpr size_t head_dim = HEAD_DIM; \ @@ -41,6 +53,7 @@ [&] { \ switch (ISA_TYPE) { \ AMX_DISPATCH(__VA_ARGS__) \ + NEON_DISPATCH(__VA_ARGS__) \ case cpu_attention::ISA::VEC: { \ using attn_impl = \ cpu_attention::AttentionImpl class AttentionImpl {}; @@ -143,6 +143,12 @@ struct AttentionMetadata { case ISA::VEC: ss << "VEC, "; break; + case ISA::VEC16: + ss << "VEC16, "; + break; + case ISA::NEON: + ss << "NEON, "; + break; } ss << "workitem_group_num: " << workitem_group_num << ", reduction_item_num: " << reduction_item_num diff --git a/csrc/cpu/cpu_attn_neon.hpp b/csrc/cpu/cpu_attn_neon.hpp new file mode 100644 index 0000000000000..827f0cfbc718e --- /dev/null +++ b/csrc/cpu/cpu_attn_neon.hpp @@ -0,0 +1,386 @@ +#ifndef CPU_ATTN_NEON_HPP +#define CPU_ATTN_NEON_HPP + +#include "cpu_attn_impl.hpp" +#include +#include +namespace cpu_attention { + +namespace { + +#define BLOCK_SIZE_ALIGNMENT 32 +#define HEAD_SIZE_ALIGNMENT 32 +#define MAX_Q_HEAD_NUM_PER_ITER 16 + +// These do not use vectorized class for loading / converting 
+// because csrc/cpu/cpu_types_arm.hpp does not have fallback options +// for vec_op::BF16Vec* / vec_op::BF16Vec* on Arm HW that +// doesn't support BF16. +// We don't use vec_op::FP32Vec* or vec_op::FP16Vec* for consistency. +template +FORCE_INLINE void load_row8_B_as_f32(const kv_cache_t* p, float32x4_t& b0, + float32x4_t& b1); + +template <> +FORCE_INLINE void load_row8_B_as_f32(const float* p, float32x4_t& b0, + float32x4_t& b1) { + b0 = vld1q_f32(p + 0); + b1 = vld1q_f32(p + 4); +} + +template <> +FORCE_INLINE void load_row8_B_as_f32(const c10::Half* p, + float32x4_t& b0, + float32x4_t& b1) { + const float16_t* h = reinterpret_cast(p); + float16x8_t v = vld1q_f16(h); + b0 = vcvt_f32_f16(vget_low_f16(v)); + b1 = vcvt_f32_f16(vget_high_f16(v)); +} + +template <> +FORCE_INLINE void load_row8_B_as_f32(const c10::BFloat16* p, + float32x4_t& b0, + float32x4_t& b1) { + const uint16_t* u = reinterpret_cast(p); +#ifdef ARM_BF16_SUPPORT + uint16x8_t u0 = vld1q_u16(u); + bfloat16x8_t bf0 = vreinterpretq_bf16_u16(u0); + b0 = vcvtq_low_f32_bf16(bf0); + b1 = vcvtq_high_f32_bf16(bf0); +#else + uint16x8_t x0 = vld1q_u16(u); + uint32x4_t lo = vshlq_n_u32(vmovl_u16(vget_low_u16(x0)), 16); + uint32x4_t hi = vshlq_n_u32(vmovl_u16(vget_high_u16(x0)), 16); + b0 = vreinterpretq_f32_u32(lo); + b1 = vreinterpretq_f32_u32(hi); +#endif +} + +// Mx8, with 1 <= M <= 8 , K streamed, unroll-by-4 with NEON FMLAs +// #Loads = (K // 4) * (M + 4 * sizeof(kv_cache_t) / 2) +// #FMLAs = (K // 4) * (4 * 2 * M) +// We have (4 * 2 * M) FMLAs for (M + 4 * sizeof(kv_cache_t) / 2) loads +template +FORCE_INLINE void gemm_micro_neon_fmla_Mx8_Ku4( + const float* __restrict A, // [M x K], + const kv_cache_t* __restrict B, // [K x 8], + float* __restrict C, // [M x 8], + int64_t lda, int64_t ldb, int64_t ldc, int32_t K, bool accumulate) { + // kernel supports max M of 8, as it'd spill for larger M + static_assert(1 <= M && M <= 8, "M must be in [1,8]"); + +// helpers for per-M codegen +#define ROWS_APPLY(OP) 
OP(0) OP(1) OP(2) OP(3) OP(4) OP(5) OP(6) OP(7) +#define IF_M(i) if constexpr (M > (i)) + + // A row base pointers +#define DECL_A(i) const float* a##i = A + (i) * lda; + ROWS_APPLY(DECL_A) +#undef DECL_A + + // declare 2 accumulators per row of M +#define DECL_ACC(i) float32x4_t acc##i##_0, acc##i##_1; + ROWS_APPLY(DECL_ACC) +#undef DECL_ACC + + // initialize accumulators +#define INIT_ACC(i) \ + IF_M(i) { \ + if (accumulate) { \ + acc##i##_0 = vld1q_f32(C + (i) * ldc + 0); \ + acc##i##_1 = vld1q_f32(C + (i) * ldc + 4); \ + } else { \ + acc##i##_0 = vdupq_n_f32(0.f); \ + acc##i##_1 = vdupq_n_f32(0.f); \ + } \ + } + ROWS_APPLY(INIT_ACC) +#undef INIT_ACC + + int32_t k = 0; + + // K unrolled by 4 + for (; k + 3 < K; k += 4) { + // load A[k..k+3] for each active row (M) +#define LOAD_A4(i) \ + float32x4_t a##i##v; \ + IF_M(i) a##i##v = vld1q_f32(a##i + k); + ROWS_APPLY(LOAD_A4) +#undef LOAD_A4 + + // helper: FMA lane L from aiv +#define FMAS_LANE(i, aiv, L) \ + IF_M(i) { \ + acc##i##_0 = vfmaq_laneq_f32(acc##i##_0, b0, aiv, L); \ + acc##i##_1 = vfmaq_laneq_f32(acc##i##_1, b1, aiv, L); \ + } + + // k + 0 + { + float32x4_t b0, b1; + load_row8_B_as_f32(B + (int64_t)(k + 0) * ldb, b0, b1); +#define STEP_K0(i) FMAS_LANE(i, a##i##v, 0) + ROWS_APPLY(STEP_K0) +#undef STEP_K0 + } + // k + 1 + { + float32x4_t b0, b1; + load_row8_B_as_f32(B + (int64_t)(k + 1) * ldb, b0, b1); +#define STEP_K1(i) FMAS_LANE(i, a##i##v, 1) + ROWS_APPLY(STEP_K1) +#undef STEP_K1 + } + // k + 2 + { + float32x4_t b0, b1; + load_row8_B_as_f32(B + (int64_t)(k + 2) * ldb, b0, b1); +#define STEP_K2(i) FMAS_LANE(i, a##i##v, 2) + ROWS_APPLY(STEP_K2) +#undef STEP_K2 + } + // k + 3 + { + float32x4_t b0, b1; + load_row8_B_as_f32(B + (int64_t)(k + 3) * ldb, b0, b1); +#define STEP_K3(i) FMAS_LANE(i, a##i##v, 3) + ROWS_APPLY(STEP_K3) +#undef STEP_K3 + } +#undef FMAS_LANE + } + + // K tail + for (; k < K; ++k) { + float32x4_t b0, b1; + load_row8_B_as_f32(B + (int64_t)k * ldb, b0, b1); +#define TAIL_ROW(i) \ + 
IF_M(i) { \ + float32x4_t ai = vdupq_n_f32(*(a##i + k)); \ + acc##i##_0 = vfmaq_f32(acc##i##_0, b0, ai); \ + acc##i##_1 = vfmaq_f32(acc##i##_1, b1, ai); \ + } + ROWS_APPLY(TAIL_ROW) +#undef TAIL_ROW + } + + // store accumulators to C +#define STORE_ROW(i) \ + IF_M(i) { \ + vst1q_f32(C + (i) * ldc + 0, acc##i##_0); \ + vst1q_f32(C + (i) * ldc + 4, acc##i##_1); \ + } + ROWS_APPLY(STORE_ROW) +#undef STORE_ROW + +#undef ROWS_APPLY +#undef IF_M +} + +template +FORCE_INLINE void gemm_macro_neon_fmla_Mx8_Ku4(const float* __restrict A, + const kv_cache_t* __restrict B, + float* __restrict C, int32_t M, + int32_t K, int64_t lda, + int64_t ldb, int64_t ldc, + bool accumulate) { + // micro kernel is Mx8 + static_assert(N % 8 == 0, "N must be a multiple of 8"); + for (int32_t m = 0; m < M;) { + int32_t mb = (M - m >= 8) ? 8 : (M - m >= 4) ? 4 : (M - m >= 2) ? 2 : 1; + const float* Ab = A + m * lda; + float* Cb = C + m * ldc; + + for (int32_t n = 0; n < N; n += 8) { + const kv_cache_t* Bn = B + n; + float* Cn = Cb + n; + switch (mb) { + case 8: + gemm_micro_neon_fmla_Mx8_Ku4<8, kv_cache_t>(Ab, Bn, Cn, lda, ldb, ldc, + K, accumulate); + break; + case 4: + gemm_micro_neon_fmla_Mx8_Ku4<4, kv_cache_t>(Ab, Bn, Cn, lda, ldb, ldc, + K, accumulate); + break; + case 2: + gemm_micro_neon_fmla_Mx8_Ku4<2, kv_cache_t>(Ab, Bn, Cn, lda, ldb, ldc, + K, accumulate); + break; + default: + gemm_micro_neon_fmla_Mx8_Ku4<1, kv_cache_t>(Ab, Bn, Cn, lda, ldb, ldc, + K, accumulate); + break; + } + } + // no tail loop for N as it's guaranteed to be a multiple of 8 + m += mb; + } +} + +template +class TileGemmNeonFMLA { + public: + template + FORCE_INLINE static void gemm(const int32_t m_size, + float* __restrict__ a_tile, + kv_cache_t* __restrict__ b_tile, + float* __restrict__ c_tile, const int64_t lda, + const int64_t ldb, const int64_t ldc, + const int32_t block_size, + const int32_t dynamic_k_size, + const bool accum_c) { + if constexpr (phase == AttentionGemmPhase::QK) { + 
gemm_macro_neon_fmla_Mx8_Ku4( + a_tile, b_tile, c_tile, m_size, k_size, lda, ldb, ldc, accum_c); + } else { + gemm_macro_neon_fmla_Mx8_Ku4( + a_tile, b_tile, c_tile, m_size, dynamic_k_size, lda, ldb, ldc, + accum_c); + } + } +}; + +} // namespace + +// this is similar to "ISA::VEC" at the moment +template +class AttentionImpl { + public: + using query_t = scalar_t; + using q_buffer_t = float; + using kv_cache_t = scalar_t; + using logits_buffer_t = float; + using partial_output_buffer_t = float; + using prob_buffer_t = float; + + constexpr static int64_t BlockSizeAlignment = + BLOCK_SIZE_ALIGNMENT; // KV token num unit of QK and PV phases + constexpr static int64_t HeadDimAlignment = + HEAD_SIZE_ALIGNMENT; // headdim num unit of PV phase + constexpr static int64_t MaxQHeadNumPerIteration = MAX_Q_HEAD_NUM_PER_ITER; + constexpr static int64_t HeadDim = head_dim; + constexpr static ISA ISAType = ISA::NEON; + constexpr static bool scale_on_logits = false; // apply scale on q_buffer + + static_assert(HeadDim % HeadDimAlignment == 0); + // the gemm micro kernel is Mx8 + static_assert(HeadDimAlignment % 8 == 0); + static_assert(BlockSizeAlignment % 8 == 0); + + public: + template