[V0 deprecation] Remove more V0 references (#29088)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
commit aab0102a26 (parent b34129bf8e)
@@ -133,8 +133,6 @@ We consider 3 different scenarios:
For case (1), we recommend looking at the implementation of [`MambaForCausalLM`](../../../vllm/model_executor/models/mamba.py) (for Mamba-1) or [`Mamba2ForCausalLM`](../../../vllm/model_executor/models/mamba2.py) (for Mamba-2) as a reference.
The model should inherit protocol `IsAttentionFree` and also implement class methods `get_mamba_state_dtype_from_config` and `get_mamba_state_shape_from_config` to calculate the state shapes and data types from the config.
For the mamba layers themselves, please use the [`MambaMixer`](../../../vllm/model_executor/layers/mamba/mamba_mixer.py) (for Mamba-1) or [`MambaMixer2`](../../../vllm/model_executor/layers/mamba/mamba_mixer2.py) (for Mamba-2) classes.
Please *do not* use the `MambaCacheManager` (deprecated in V1) or replicate any of the V0-specific code paths in the existing model implementations.
V0-only classes and code will be removed in the very near future.
The model should also be added to the `MODELS_CONFIG_MAP` dictionary in [vllm/model_executor/models/config.py](../../../vllm/model_executor/models/config.py) to ensure that the runtime defaults are optimized.

For case (2), we recommend using as a reference the implementation of [`JambaForCausalLM`](../../../vllm/model_executor/models/jamba.py) (for an example of a model that uses Mamba-1 and attention together) or [`BambaForCausalLM`](../../../vllm/model_executor/models/bamba.py) (for an example of a model that uses Mamba-2 and attention together).
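To make the two class methods above concrete, here is a rough, hypothetical sketch (not taken from the vLLM source; the exact signatures and the `hf_config` attribute names are assumptions, and real implementations also account for tensor parallelism):

```python
import torch


class MyMambaForCausalLM:  # in vLLM this would also inherit IsAttentionFree
    @classmethod
    def get_mamba_state_dtype_from_config(cls, vllm_config) -> tuple[torch.dtype, torch.dtype]:
        # Dtypes for the conv state and the SSM state; many models reuse the model dtype.
        dtype = vllm_config.model_config.dtype  # assumed attribute path
        return (dtype, dtype)

    @classmethod
    def get_mamba_state_shape_from_config(
        cls, vllm_config
    ) -> tuple[tuple[int, int], tuple[int, int, int]]:
        hf_config = vllm_config.model_config.hf_config  # assumed attribute path
        # Conv state: (channels, kernel_size - 1); SSM state: (heads, head_dim, state_size).
        conv_state = (hf_config.intermediate_size, hf_config.conv_kernel - 1)
        ssm_state = (hf_config.num_heads, hf_config.head_dim, hf_config.state_size)
        return conv_state, ssm_state
```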
@@ -94,9 +94,6 @@ To improve privacy in shared environments, vLLM supports isolating prefix cache

With this setup, cache sharing is limited to users or requests that explicitly agree on a common salt, enabling cache reuse within a trust group while isolating others.

!!! note
    Cache isolation is not supported in engine V0.

## Data Structure

The prefix caching in vLLM v1 is implemented in the KV cache manager. The basic building block is the “Block” data class (simplified):
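The code block that follows this sentence in the source document is cut off by the diff boundary; as a purely illustrative sketch (field names are assumptions, not the exact vLLM definition), a simplified block record might look like:

```python
from dataclasses import dataclass
from typing import Optional


@dataclass
class Block:
    # Physical index of the KV-cache block on the device.
    block_id: int
    # Number of requests currently referencing this block; 0 means it is evictable.
    ref_cnt: int = 0
    # Hash of the tokens stored in the block (plus any extra keys such as a cache salt);
    # None until the block is full.
    block_hash: Optional[int] = None
```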
@@ -1,10 +1,7 @@
# Reproducibility

vLLM does not guarantee the reproducibility of the results by default, for the sake of performance. You need to do the following to achieve
reproducible results:

- For V1: Turn off multiprocessing to make the scheduling deterministic by setting `VLLM_ENABLE_V1_MULTIPROCESSING=0`.
- For V0: Set the global seed (see below).
vLLM does not guarantee the reproducibility of the results by default, for the sake of performance. To achieve
reproducible results, you need to turn off multiprocessing to make the scheduling deterministic by setting `VLLM_ENABLE_V1_MULTIPROCESSING=0`.

Example: [examples/offline_inference/reproducibility.py](../../examples/offline_inference/reproducibility.py)
@@ -30,8 +27,6 @@ However, in some cases, setting the seed will also [change the random state in u

### Default Behavior

In V0, the `seed` parameter defaults to `None`. When the `seed` parameter is `None`, the random states for `random`, `np.random`, and `torch.manual_seed` are not set. This means that each run of vLLM will produce different results if `temperature > 0`, as expected.

In V1, the `seed` parameter defaults to `0` which sets the random state for each worker, so the results will remain consistent for each vLLM run even if `temperature > 0`.

!!! note
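To illustrate the V1 default described above (a sketch, not part of the diff): with the default `seed=0`, repeated runs sample identically even at `temperature > 0`, and passing an explicit seed simply selects a different, still reproducible, random state.

```python
from vllm import LLM, SamplingParams

sampling_params = SamplingParams(temperature=0.8, top_p=0.95)

# Default seed (0): repeated runs of this script produce the same samples,
# provided scheduling is also deterministic (VLLM_ENABLE_V1_MULTIPROCESSING=0).
llm = LLM(model="facebook/opt-125m")

# Same model, different but still reproducible random state:
# llm = LLM(model="facebook/opt-125m", seed=123)
```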
@@ -2,7 +2,7 @@

!!! announcement

    We have started the process of deprecating V0. Please read [RFC #18571](https://github.com/vllm-project/vllm/issues/18571) for more details.
    We have fully deprecated V0. Please read [RFC #18571](https://github.com/vllm-project/vllm/issues/18571) for more details.

V1 is now enabled by default for all supported use cases, and we will gradually enable it for every use case we plan to support. Please share any feedback on [GitHub](https://github.com/vllm-project/vllm) or in the [vLLM Slack](https://inviter.co/vllm-slack).
@@ -11,13 +11,9 @@ import random

from vllm import LLM, SamplingParams

# V1 only: Turn off multiprocessing to make the scheduling deterministic.
# Turn off multiprocessing to make the scheduling deterministic.
os.environ["VLLM_ENABLE_V1_MULTIPROCESSING"] = "0"

# V0 only: Set the global seed. The default seed is None, which is
# not reproducible.
SEED = 42

prompts = [
    "Hello, my name is",
    "The president of the United States is",
@@ -28,7 +24,7 @@ sampling_params = SamplingParams(temperature=0.8, top_p=0.95)


def main():
    llm = LLM(model="facebook/opt-125m", seed=SEED)
    llm = LLM(model="facebook/opt-125m")
    outputs = llm.generate(prompts, sampling_params)
    print("-" * 50)
    for output in outputs:
@@ -30,8 +30,8 @@ class WorkerExtension:
    """
    The class for vLLM's worker to inherit from.
    By defining an extension class, the code can work no matter what is
    the underlying worker class. This way, the code can be compatible
    with both vLLM V0 and V1.
    the underlying worker class.

    NOTE: we define this class in a separate module, and the main module
    should pass the full qualified name as `worker_extension_cls` argument.
    """
@@ -96,8 +96,8 @@ class ColocateWorkerExtension:
    """
    The class for vLLM's worker to inherit from, in the colocate setting.
    By defining an extension class, the code can work no matter what is
    the underlying worker class. This way, the code can be compatible
    with both vLLM V0 and V1.
    the underlying worker class.

    NOTE: we define this class in a separate module, and the main module
    should pass the full qualified name as `worker_extension_cls` argument.
    """
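For context (not part of this diff): the extension class is attached by passing its fully qualified name to the engine. A minimal sketch, assuming the class above lives in a module named `my_worker_ext`:

```python
from vllm import LLM

llm = LLM(
    model="facebook/opt-125m",
    # Fully qualified name of the extension class; "my_worker_ext" is a placeholder.
    worker_extension_cls="my_worker_ext.WorkerExtension",
)

# Methods defined on the extension become invokable on every worker, e.g.:
# llm.collective_rpc("some_extension_method")
```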
@@ -67,22 +67,9 @@ def main(args):
    Path(args.output).mkdir(exist_ok=True)
    # Dump worker states to output directory

    # Check which engine version is being used
    is_v1_engine = hasattr(llm.llm_engine, "engine_core")

    if is_v1_engine:
        # For V1 engine, we need to use engine_core.save_sharded_state
        print("Using V1 engine save path")
        llm.llm_engine.engine_core.save_sharded_state(
            path=args.output, pattern=args.file_pattern, max_size=args.max_file_size
        )
    else:
        # For V0 engine
        print("Using V0 engine save path")
        model_executor = llm.llm_engine.model_executor
        model_executor.save_sharded_state(
            path=args.output, pattern=args.file_pattern, max_size=args.max_file_size
        )
    llm.llm_engine.engine_core.save_sharded_state(
        path=args.output, pattern=args.file_pattern, max_size=args.max_file_size
    )

    # Copy metadata files to output directory
    for file in os.listdir(model_path):
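As a usage note (not part of the diff), a checkpoint dumped this way is typically reloaded by pointing vLLM at the output directory with the sharded-state load format. A sketch, assuming the files were written to `/path/to/sharded_model`:

```python
from vllm import LLM

# Reload the per-rank sharded checkpoint written by save_sharded_state.
# The tensor parallel size must match the one used when saving.
llm = LLM(
    model="/path/to/sharded_model",
    load_format="sharded_state",
    tensor_parallel_size=1,
)
```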
@@ -158,11 +158,7 @@ def main(args):
        print(f"generated text: {output.outputs[0].text}")
        print("-" * 50)

    try:
        metrics = llm.get_metrics()
    except AssertionError:
        print("Metrics are not supported in the V0 engine.")
        return
    metrics = llm.get_metrics()

    total_num_output_tokens = sum(
        len(output.outputs[0].token_ids) for output in outputs
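For context (not part of the diff), the metrics returned by `llm.get_metrics()` can simply be iterated and printed; a minimal sketch (the exact metric types and fields may vary between vLLM versions):

```python
from vllm import LLM

llm = LLM(model="facebook/opt-125m")
llm.generate(["Hello"])

# Print every engine metric collected so far.
for metric in llm.get_metrics():
    print(metric)
```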
@@ -60,18 +60,9 @@ def llama_3p2_1b_files():

def _run_writer(input_dir, output_dir, weights_patterns, **kwargs):
    llm_sharded_writer = LLM(model=input_dir, **kwargs)
    # Check which engine version is being used
    is_v1_engine = hasattr(llm_sharded_writer.llm_engine, "engine_core")

    # Dump worker states to output directory
    if is_v1_engine:
        # For V1 engine, we need to use engine_core.save_sharded_state
        print("Using V1 engine save path")
        llm_sharded_writer.llm_engine.engine_core.save_sharded_state(path=output_dir)
    else:
        # For V0 engine
        print("Using V0 engine save path")
        model_executor = llm_sharded_writer.llm_engine.model_executor
        model_executor.save_sharded_state(path=output_dir)
    llm_sharded_writer.llm_engine.engine_core.save_sharded_state(path=output_dir)

    # Copy metadata files to output directory
    for file in os.listdir(input_dir):
@@ -140,21 +140,22 @@ CONFIGS: dict[str, ServerConfig] = {
        "without calling a tool. DO NOT CALL A TOOL THAT IS IRRELEVANT "
        "to the user's question - just respond to it normally.",
    },
    # V1 Test: Passing locally but failing in CI. This runs the
    # V0 Engine because of CPU offloading. Need to debug why.
    # FIXME: This test currently fails, need to debug why.
    # "granite20b": {
    #     "model":
    #         "mbayser/granite-20b-functioncalling-FP8-KV",
    #     "model": "mbayser/granite-20b-functioncalling-FP8-KV",
    #     "arguments": [
    #         "--tool-call-parser", "granite-20b-fc", "--chat-template",
    #         str(VLLM_PATH /
    #             "examples/tool_chat_template_granite_20b_fc.jinja"),
    #         "--max_num_seqs", "1", "--enforce-eager", "--cpu-offload-gb", "20"
    #         "--tool-call-parser",
    #         "granite-20b-fc",
    #         "--chat-template",
    #         str(VLLM_PATH / "examples/tool_chat_template_granite_20b_fc.jinja"),
    #         "--max_num_seqs",
    #         "1",
    #         "--enforce-eager",
    #         "--cpu-offload-gb",
    #         "20",
    #     ],
    #     "supports_parallel":
    #         False,
    #     "supports_rocm":
    #         False,
    #     "supports_parallel": False,
    #     "supports_rocm": False,
    # },
    "granite-3.0-8b": {
        "model": "ibm-granite/granite-3.0-8b-instruct",
@@ -339,7 +339,6 @@ class LLM:

        log_non_default_args(engine_args)

        # Create the Engine (autoselects V0 vs V1)
        self.llm_engine = LLMEngine.from_engine_args(
            engine_args=engine_args, usage_context=UsageContext.LLM_CLASS
        )
@@ -377,7 +377,7 @@ class ResponsesRequest(OpenAIBaseModel):
            "environments. The salt should be random, protected from "
            "access by 3rd parties, and long enough to be "
            "unpredictable (e.g., 43 characters base64-encoded, corresponding "
            "to 256 bit). Not supported by vLLM engine V0."
            "to 256 bit)."
        ),
    )
@@ -763,7 +763,7 @@ class ChatCompletionRequest(OpenAIBaseModel):
            "environments. The salt should be random, protected from "
            "access by 3rd parties, and long enough to be "
            "unpredictable (e.g., 43 characters base64-encoded, corresponding "
            "to 256 bit). Not supported by vLLM engine V0."
            "to 256 bit)."
        ),
    )
    kv_transfer_params: dict[str, Any] | None = Field(
@@ -1249,7 +1249,7 @@ class CompletionRequest(OpenAIBaseModel):
            "environments. The salt should be random, protected from "
            "access by 3rd parties, and long enough to be "
            "unpredictable (e.g., 43 characters base64-encoded, corresponding "
            "to 256 bit). Not supported by vLLM engine V0."
            "to 256 bit)."
        ),
    )
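For context (not part of the diff): the `cache_salt` field these hunks document is supplied per request. A minimal sketch against an OpenAI-compatible vLLM server assumed to be running at `http://localhost:8000`:

```python
import requests

# Requests that send the same cache_salt may share prefix-cache entries;
# requests with different salts are isolated from one another.
resp = requests.post(
    "http://localhost:8000/v1/chat/completions",
    json={
        "model": "facebook/opt-125m",
        "messages": [{"role": "user", "content": "Hello!"}],
        "cache_salt": "replace-with-a-long-random-secret-salt",
    },
    timeout=60,
)
print(resp.json()["choices"][0]["message"]["content"])
```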
@@ -590,7 +590,6 @@ class MambaMixer2(MambaBase, CustomOp):
            hidden_states, _B, _C = self.split_hidden_states_B_C_fn(hidden_states_B_C)
            return hidden_states

        # NOTE: V0 put prefill before decode, v1 puts decode before prefill
        num_prefills = attn_metadata.num_prefills  # request count
        num_decodes = attn_metadata.num_decode_tokens  # token count (=request)
        num_prefill_tokens = attn_metadata.num_prefill_tokens  # token count
@@ -586,13 +586,11 @@ class IsHybrid(Protocol):
    def get_mamba_state_shape_from_config(
        cls,
        vllm_config: VllmConfig,
        use_v1: bool = True,
    ) -> tuple[tuple[int, int], tuple[int, int, int]]:
        """Calculate shapes for Mamba's convolutional and state caches.

        Args:
            vllm_config: vLLM config
            use_v1: Get shapes for V1 (or V0)

        Returns:
            Tuple containing:
@@ -290,7 +290,6 @@ class Plamo2MambaMixer(MambaBase, CustomOp):
        has_decode = num_decodes > 0
        num_actual_tokens = num_prefill_tokens + num_decodes

        # NOTE: V0 put prefill before decode, v1 puts decode before prefill
        # Separate prefill and decode by splitting varlen input
        # Split along token dimension
        hidden_states_d, hidden_states_p = torch.split(