From aab0102a267eba814cdc09170b530a3aed96be60 Mon Sep 17 00:00:00 2001
From: Cyrus Leung
Date: Fri, 21 Nov 2025 19:56:59 +0800
Subject: [PATCH] [V0 deprecation] Remove more V0 references (#29088)

Signed-off-by: DarkLight1337
---
 docs/contributing/model/basic.md              |  2 --
 docs/design/prefix_caching.md                 |  3 ---
 docs/usage/reproducibility.md                 |  9 ++-----
 docs/usage/v1_guide.md                        |  2 +-
 examples/offline_inference/reproducibility.py |  8 ++----
 examples/offline_inference/rlhf_utils.py      |  8 +++---
 .../offline_inference/save_sharded_state.py   | 19 +++-----------
 examples/offline_inference/spec_decode.py     |  6 +----
 .../model_loader/test_sharded_state_loader.py | 13 ++--------
 tests/tool_use/utils.py                       | 25 ++++++++++---------
 vllm/entrypoints/llm.py                       |  1 -
 vllm/entrypoints/openai/protocol.py           |  6 ++---
 .../layers/mamba/mamba_mixer2.py              |  1 -
 vllm/model_executor/models/interfaces.py      |  2 --
 vllm/model_executor/models/plamo2.py          |  1 -
 15 files changed, 31 insertions(+), 75 deletions(-)

diff --git a/docs/contributing/model/basic.md b/docs/contributing/model/basic.md
index d7f5d2f311a37..e828de0adf3c2 100644
--- a/docs/contributing/model/basic.md
+++ b/docs/contributing/model/basic.md
@@ -133,8 +133,6 @@ We consider 3 different scenarios:
 For case (1), we recommend looking at the implementation of [`MambaForCausalLM`](../../../vllm/model_executor/models/mamba.py) (for Mamba-1) or [`Mamba2ForCausalLM`](../../../vllm/model_executor/models/mamba2.py) (for Mamba-2) as a reference.
 The model should inherit protocol `IsAttentionFree` and also implement class methods `get_mamba_state_dtype_from_config` and `get_mamba_state_shape_from_config` to calculate the state shapes and data types from the config.
 For the mamba layers themselves, please use the [`MambaMixer`](../../../vllm/model_executor/layers/mamba/mamba_mixer.py) (for Mamba-1) or [`MambaMixer2`](../../../vllm/model_executor/layers/mamba/mamba_mixer2.py) (for Mamba-2) classes.
-Please *do not* use the `MambaCacheManager` (deprecated in V1) or replicate any of the V0-specific code paths in the existing model implementations.
-V0-only classes and code will be removed in the very near future.
 The model should also be added to the `MODELS_CONFIG_MAP` dictionary in [vllm/model_executor/models/config.py](../../../vllm/model_executor/models/config.py) to ensure that the runtime defaults are optimized.
 
 For case (2), we recommend using as a reference the implementation of [`JambaForCausalLM`](../../../vllm/model_executor/models/jamba.py) (for an example of a model that uses Mamba-1 and attention together) or [`BambaForCausalLM`](../../../vllm/model_executor/models/bamba.py) (for an example of a model that uses Mamba-2 and attention together).
diff --git a/docs/design/prefix_caching.md b/docs/design/prefix_caching.md
index 48536a877bd3f..cf792fdabe1a6 100644
--- a/docs/design/prefix_caching.md
+++ b/docs/design/prefix_caching.md
@@ -94,9 +94,6 @@ To improve privacy in shared environments, vLLM supports isolating prefix cache
 
 With this setup, cache sharing is limited to users or requests that explicitly agree on a common salt, enabling cache reuse within a trust group while isolating others.
 
-!!! note
-    Cache isolation is not supported in engine V0.
-
 ## Data Structure
 
 The prefix caching in vLLM v1 is implemented in the KV cache manager.
 The basic building block is the “Block” data class (simplified):
diff --git a/docs/usage/reproducibility.md b/docs/usage/reproducibility.md
index d8a1943209c1e..afc25b63902e2 100644
--- a/docs/usage/reproducibility.md
+++ b/docs/usage/reproducibility.md
@@ -1,10 +1,7 @@
 # Reproducibility
 
-vLLM does not guarantee the reproducibility of the results by default, for the sake of performance. You need to do the following to achieve
-reproducible results:
-
-- For V1: Turn off multiprocessing to make the scheduling deterministic by setting `VLLM_ENABLE_V1_MULTIPROCESSING=0`.
-- For V0: Set the global seed (see below).
+vLLM does not guarantee the reproducibility of the results by default, for the sake of performance. To achieve
+reproducible results, you need to turn off multiprocessing to make the scheduling deterministic by setting `VLLM_ENABLE_V1_MULTIPROCESSING=0`.
 
 Example: [examples/offline_inference/reproducibility.py](../../examples/offline_inference/reproducibility.py)
 
@@ -30,8 +27,6 @@ However, in some cases, setting the seed will also [change the random state in u
 
 ### Default Behavior
 
-In V0, the `seed` parameter defaults to `None`. When the `seed` parameter is `None`, the random states for `random`, `np.random`, and `torch.manual_seed` are not set. This means that each run of vLLM will produce different results if `temperature > 0`, as expected.
-
 In V1, the `seed` parameter defaults to `0` which sets the random state for each worker, so the results will remain consistent for each vLLM run even if `temperature > 0`.
 
 !!! note
diff --git a/docs/usage/v1_guide.md b/docs/usage/v1_guide.md
index e46bee3f4ef20..22f4e6761ea9a 100644
--- a/docs/usage/v1_guide.md
+++ b/docs/usage/v1_guide.md
@@ -2,7 +2,7 @@
 
 !!! announcement
 
-    We have started the process of deprecating V0. Please read [RFC #18571](https://github.com/vllm-project/vllm/issues/18571) for more details.
+    We have fully deprecated V0. Please read [RFC #18571](https://github.com/vllm-project/vllm/issues/18571) for more details.
 
 V1 is now enabled by default for all supported use cases, and we will gradually enable it for every use case we plan to support. Please share any feedback on [GitHub](https://github.com/vllm-project/vllm) or in the [vLLM Slack](https://inviter.co/vllm-slack).
 
diff --git a/examples/offline_inference/reproducibility.py b/examples/offline_inference/reproducibility.py
index d909438b41042..e135bc1b2abb7 100644
--- a/examples/offline_inference/reproducibility.py
+++ b/examples/offline_inference/reproducibility.py
@@ -11,13 +11,9 @@ import random
 
 from vllm import LLM, SamplingParams
 
-# V1 only: Turn off multiprocessing to make the scheduling deterministic.
+# Turn off multiprocessing to make the scheduling deterministic.
 os.environ["VLLM_ENABLE_V1_MULTIPROCESSING"] = "0"
 
-# V0 only: Set the global seed. The default seed is None, which is
-# not reproducible.
-SEED = 42
-
 prompts = [
     "Hello, my name is",
     "The president of the United States is",
@@ -28,7 +24,7 @@ sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
 
 
 def main():
-    llm = LLM(model="facebook/opt-125m", seed=SEED)
+    llm = LLM(model="facebook/opt-125m")
     outputs = llm.generate(prompts, sampling_params)
     print("-" * 50)
     for output in outputs:
diff --git a/examples/offline_inference/rlhf_utils.py b/examples/offline_inference/rlhf_utils.py
index 13def88439ef2..5c0787b8778d6 100644
--- a/examples/offline_inference/rlhf_utils.py
+++ b/examples/offline_inference/rlhf_utils.py
@@ -30,8 +30,8 @@ class WorkerExtension:
     """
    The class for vLLM's worker to inherit from.
     By defining an extension class, the code can work no matter what is
-    the underlying worker class. This way, the code can be compatible
-    with both vLLM V0 and V1.
+    the underlying worker class.
+
     NOTE: we define this class in a separate module, and the main module
     should pass the full qualified name as `worker_extension_cls` argument.
     """
@@ -96,8 +96,8 @@ class ColocateWorkerExtension:
     """
     The class for vLLM's worker to inherit from, in the colocate setting.
     By defining an extension class, the code can work no matter what is
-    the underlying worker class. This way, the code can be compatible
-    with both vLLM V0 and V1.
+    the underlying worker class.
+
     NOTE: we define this class in a separate module, and the main module
     should pass the full qualified name as `worker_extension_cls` argument.
     """
diff --git a/examples/offline_inference/save_sharded_state.py b/examples/offline_inference/save_sharded_state.py
index e25f46b126e6f..88ee48b98bff6 100644
--- a/examples/offline_inference/save_sharded_state.py
+++ b/examples/offline_inference/save_sharded_state.py
@@ -67,22 +67,9 @@ def main(args):
     Path(args.output).mkdir(exist_ok=True)
 
     # Dump worker states to output directory
-    # Check which engine version is being used
-    is_v1_engine = hasattr(llm.llm_engine, "engine_core")
-
-    if is_v1_engine:
-        # For V1 engine, we need to use engine_core.save_sharded_state
-        print("Using V1 engine save path")
-        llm.llm_engine.engine_core.save_sharded_state(
-            path=args.output, pattern=args.file_pattern, max_size=args.max_file_size
-        )
-    else:
-        # For V0 engine
-        print("Using V0 engine save path")
-        model_executor = llm.llm_engine.model_executor
-        model_executor.save_sharded_state(
-            path=args.output, pattern=args.file_pattern, max_size=args.max_file_size
-        )
+    llm.llm_engine.engine_core.save_sharded_state(
+        path=args.output, pattern=args.file_pattern, max_size=args.max_file_size
+    )
 
     # Copy metadata files to output directory
     for file in os.listdir(model_path):
diff --git a/examples/offline_inference/spec_decode.py b/examples/offline_inference/spec_decode.py
index 3cdc3b245b72a..67a0732459709 100644
--- a/examples/offline_inference/spec_decode.py
+++ b/examples/offline_inference/spec_decode.py
@@ -158,11 +158,7 @@ def main(args):
         print(f"generated text: {output.outputs[0].text}")
         print("-" * 50)
 
-    try:
-        metrics = llm.get_metrics()
-    except AssertionError:
-        print("Metrics are not supported in the V0 engine.")
-        return
+    metrics = llm.get_metrics()
 
     total_num_output_tokens = sum(
         len(output.outputs[0].token_ids) for output in outputs
diff --git a/tests/model_executor/model_loader/test_sharded_state_loader.py b/tests/model_executor/model_loader/test_sharded_state_loader.py
index 5bb841bf2fa0e..cf06b000efb51 100644
--- a/tests/model_executor/model_loader/test_sharded_state_loader.py
+++ b/tests/model_executor/model_loader/test_sharded_state_loader.py
@@ -60,18 +60,9 @@ def llama_3p2_1b_files():
 
 def _run_writer(input_dir, output_dir, weights_patterns, **kwargs):
     llm_sharded_writer = LLM(model=input_dir, **kwargs)
-    # Check which engine version is being used
-    is_v1_engine = hasattr(llm_sharded_writer.llm_engine, "engine_core")
+
     # Dump worker states to output directory
-    if is_v1_engine:
-        # For V1 engine, we need to use engine_core.save_sharded_state
-        print("Using V1 engine save path")
-        llm_sharded_writer.llm_engine.engine_core.save_sharded_state(path=output_dir)
-    else:
-        # For V0 engine
-        print("Using V0 engine save path")
-        model_executor = llm_sharded_writer.llm_engine.model_executor
-        model_executor.save_sharded_state(path=output_dir)
+    llm_sharded_writer.llm_engine.engine_core.save_sharded_state(path=output_dir)
 
     # Copy metadata files to output directory
     for file in os.listdir(input_dir):
diff --git a/tests/tool_use/utils.py b/tests/tool_use/utils.py
index 38def6f874d7d..d188b21863812 100644
--- a/tests/tool_use/utils.py
+++ b/tests/tool_use/utils.py
@@ -140,21 +140,22 @@ CONFIGS: dict[str, ServerConfig] = {
         "without calling a tool. DO NOT CALL A TOOL THAT IS IRRELEVANT "
         "to the user's question - just respond to it normally.",
     },
-    # V1 Test: Passing locally but failing in CI. This runs the
-    # V0 Engine because of CPU offloading. Need to debug why.
+    # FIXME: This test currently fails, need to debug why.
     # "granite20b": {
-    #     "model":
-    #     "mbayser/granite-20b-functioncalling-FP8-KV",
+    #     "model": "mbayser/granite-20b-functioncalling-FP8-KV",
     #     "arguments": [
-    #         "--tool-call-parser", "granite-20b-fc", "--chat-template",
-    #         str(VLLM_PATH /
-    #             "examples/tool_chat_template_granite_20b_fc.jinja"),
-    #         "--max_num_seqs", "1", "--enforce-eager", "--cpu-offload-gb", "20"
+    #         "--tool-call-parser",
+    #         "granite-20b-fc",
+    #         "--chat-template",
+    #         str(VLLM_PATH / "examples/tool_chat_template_granite_20b_fc.jinja"),
+    #         "--max_num_seqs",
+    #         "1",
+    #         "--enforce-eager",
+    #         "--cpu-offload-gb",
+    #         "20",
     #     ],
-    #     "supports_parallel":
-    #     False,
-    #     "supports_rocm":
-    #     False,
+    #     "supports_parallel": False,
+    #     "supports_rocm": False,
     # },
     "granite-3.0-8b": {
         "model": "ibm-granite/granite-3.0-8b-instruct",
diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py
index 7421eb8b8abc9..848916dbd8763 100644
--- a/vllm/entrypoints/llm.py
+++ b/vllm/entrypoints/llm.py
@@ -339,7 +339,6 @@ class LLM:
 
         log_non_default_args(engine_args)
 
-        # Create the Engine (autoselects V0 vs V1)
         self.llm_engine = LLMEngine.from_engine_args(
             engine_args=engine_args, usage_context=UsageContext.LLM_CLASS
         )
diff --git a/vllm/entrypoints/openai/protocol.py b/vllm/entrypoints/openai/protocol.py
index 41172d8ec2f72..b352c3ad01db0 100644
--- a/vllm/entrypoints/openai/protocol.py
+++ b/vllm/entrypoints/openai/protocol.py
@@ -377,7 +377,7 @@ class ResponsesRequest(OpenAIBaseModel):
             "environments. The salt should be random, protected from "
             "access by 3rd parties, and long enough to be "
             "unpredictable (e.g., 43 characters base64-encoded, corresponding "
-            "to 256 bit). Not supported by vLLM engine V0."
+            "to 256 bit)."
         ),
     )
 
@@ -763,7 +763,7 @@ class ChatCompletionRequest(OpenAIBaseModel):
             "environments. The salt should be random, protected from "
             "access by 3rd parties, and long enough to be "
             "unpredictable (e.g., 43 characters base64-encoded, corresponding "
-            "to 256 bit). Not supported by vLLM engine V0."
+            "to 256 bit)."
         ),
     )
     kv_transfer_params: dict[str, Any] | None = Field(
@@ -1249,7 +1249,7 @@ class CompletionRequest(OpenAIBaseModel):
             "environments. The salt should be random, protected from "
             "access by 3rd parties, and long enough to be "
             "unpredictable (e.g., 43 characters base64-encoded, corresponding "
-            "to 256 bit). Not supported by vLLM engine V0."
+            "to 256 bit)."
         ),
     )
 
diff --git a/vllm/model_executor/layers/mamba/mamba_mixer2.py b/vllm/model_executor/layers/mamba/mamba_mixer2.py
index 900701c46348b..0ea5805305eda 100644
--- a/vllm/model_executor/layers/mamba/mamba_mixer2.py
+++ b/vllm/model_executor/layers/mamba/mamba_mixer2.py
@@ -590,7 +590,6 @@ class MambaMixer2(MambaBase, CustomOp):
             hidden_states, _B, _C = self.split_hidden_states_B_C_fn(hidden_states_B_C)
             return hidden_states
 
-        # NOTE: V0 put prefill before decode, v1 puts decode before prefill
         num_prefills = attn_metadata.num_prefills  # request count
         num_decodes = attn_metadata.num_decode_tokens  # token count (=request)
         num_prefill_tokens = attn_metadata.num_prefill_tokens  # token count
diff --git a/vllm/model_executor/models/interfaces.py b/vllm/model_executor/models/interfaces.py
index dc4caf2f02f9d..9966498e1b4c9 100644
--- a/vllm/model_executor/models/interfaces.py
+++ b/vllm/model_executor/models/interfaces.py
@@ -586,13 +586,11 @@ class IsHybrid(Protocol):
     def get_mamba_state_shape_from_config(
         cls,
         vllm_config: VllmConfig,
-        use_v1: bool = True,
     ) -> tuple[tuple[int, int], tuple[int, int, int]]:
         """Calculate shapes for Mamba's convolutional and state caches.
 
         Args:
             vllm_config: vLLM config
-            use_v1: Get shapes for V1 (or V0)
 
         Returns:
             Tuple containing:
diff --git a/vllm/model_executor/models/plamo2.py b/vllm/model_executor/models/plamo2.py
index 22f9c87fc905b..472de5590dcf8 100644
--- a/vllm/model_executor/models/plamo2.py
+++ b/vllm/model_executor/models/plamo2.py
@@ -290,7 +290,6 @@ class Plamo2MambaMixer(MambaBase, CustomOp):
         has_decode = num_decodes > 0
         num_actual_tokens = num_prefill_tokens + num_decodes
 
-        # NOTE: V0 put prefill before decode, v1 puts decode before prefill
         # Separate prefill and decode by splitting varlen input
         # Split along token dimension
         hidden_states_d, hidden_states_p = torch.split(
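
For quick reference, here is a minimal sketch of the reproducibility setup that remains after this patch, following the updated docs/usage/reproducibility.md and examples/offline_inference/reproducibility.py (the prompt list is shortened; everything else mirrors the example):

import os

# Turn off multiprocessing to make the scheduling deterministic.
os.environ["VLLM_ENABLE_V1_MULTIPROCESSING"] = "0"

from vllm import LLM, SamplingParams

prompts = ["Hello, my name is", "The president of the United States is"]
sampling_params = SamplingParams(temperature=0.8, top_p=0.95)

# In V1 the `seed` parameter defaults to 0, so no explicit seed is needed
# for consistent results across runs, even with temperature > 0.
llm = LLM(model="facebook/opt-125m")
outputs = llm.generate(prompts, sampling_params)
for output in outputs:
    print(output.outputs[0].text)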
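
Likewise, a minimal sketch of the single remaining sharded-state save path used by examples/offline_inference/save_sharded_state.py and tests/model_executor/model_loader/test_sharded_state_loader.py once the V0 branch is gone (the model name, output directory, and size limit below are illustrative values, not taken from the patch):

from pathlib import Path

from vllm import LLM

llm = LLM(model="facebook/opt-125m")

output_dir = "/tmp/sharded_state"
Path(output_dir).mkdir(exist_ok=True)

# With V0 removed there is no engine-version check: the V1 engine core
# exposes save_sharded_state directly.
llm.llm_engine.engine_core.save_sharded_state(
    path=output_dir,
    pattern=None,          # default file pattern
    max_size=5 * 1024**3,  # cap each shard at 5 GiB (illustrative)
)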