diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index bbbcfb745d571..7f1848b4bfbc1 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -159,7 +159,6 @@ steps: - tests/distributed/test_utils - tests/distributed/test_pynccl - tests/distributed/test_events - - tests/spec_decode/e2e/test_integration_dist_tp4 - tests/compile/test_basic_correctness - examples/offline_inference/rlhf.py - examples/offline_inference/rlhf_colocate.py @@ -182,7 +181,6 @@ steps: - pytest -v -s compile/test_basic_correctness.py - pytest -v -s distributed/test_pynccl.py - pytest -v -s distributed/test_events.py - - pytest -v -s spec_decode/e2e/test_integration_dist_tp4.py # TODO: create a dedicated test section for multi-GPU example tests # when we have multiple distributed example tests - pushd ../examples/offline_inference @@ -330,17 +328,6 @@ steps: - pytest -v -s samplers - VLLM_USE_FLASHINFER_SAMPLER=1 pytest -v -s samplers -- label: Speculative decoding tests # 40min - mirror_hardwares: [amdexperimental] - source_file_dependencies: - - vllm/spec_decode - - tests/spec_decode - - vllm/model_executor/models/eagle.py - commands: - - pytest -v -s spec_decode/e2e/test_multistep_correctness.py - - VLLM_ATTENTION_BACKEND=FLASH_ATTN pytest -v -s spec_decode --ignore=spec_decode/e2e/test_multistep_correctness.py --ignore=spec_decode/e2e/test_mtp_correctness.py - - pytest -v -s spec_decode/e2e/test_eagle_correctness.py - - label: LoRA Test %N # 15min each mirror_hardwares: [amdexperimental, amdproduction] source_file_dependencies: @@ -726,7 +713,6 @@ steps: - pytest -v -s distributed/test_sequence_parallel.py # this test fails consistently. # TODO: investigate and fix - # - pytest -v -s spec_decode/e2e/test_integration_dist_tp2.py - VLLM_USE_V1=0 CUDA_VISIBLE_DEVICES=0,1 pytest -v -s test_sharded_state_loader.py - VLLM_USE_V1=0 CUDA_VISIBLE_DEVICES=0,1 pytest -v -s kv_transfer/test_disagg.py - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s v1/shutdown diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS index 97f9e7dc1578a..8c68bc8f02b64 100644 --- a/.github/CODEOWNERS +++ b/.github/CODEOWNERS @@ -43,7 +43,6 @@ CMakeLists.txt @tlrmchlsmth @LucasWilkinson /tests/multimodal @DarkLight1337 @ywang96 /tests/prefix_caching @comaniac @KuntaiDu /tests/quantization @mgoin @robertgshaw2-redhat -/tests/spec_decode @njhill @LiuXiaoxuanPKU /tests/test_inputs.py @DarkLight1337 @ywang96 /tests/v1/entrypoints/llm/test_struct_output_generate.py @mgoin @russellb @aarnphm /tests/v1/structured_output @mgoin @russellb @aarnphm diff --git a/.github/mergify.yml b/.github/mergify.yml index fccce82d50d16..5c878ac02069f 100644 --- a/.github/mergify.yml +++ b/.github/mergify.yml @@ -164,10 +164,7 @@ pull_request_rules: description: Automatically apply speculative-decoding label conditions: - or: - - files~=^vllm/spec_decode/ - files~=^vllm/v1/spec_decode/ - - files=vllm/model_executor/layers/spec_decode_base_sampler.py - - files~=^tests/spec_decode/ - files~=^tests/v1/spec_decode/ - files~=^examples/.*(spec_decode|mlpspeculator|eagle|speculation).*\.py - files~=^vllm/model_executor/models/.*eagle.*\.py diff --git a/pyproject.toml b/pyproject.toml index 85a112ff51cfe..0c8d2f82d1d60 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -73,7 +73,6 @@ line-length = 80 "vllm/engine/**/*.py" = ["UP006", "UP035"] "vllm/executor/**/*.py" = ["UP006", "UP035"] "vllm/prompt_adapter/**/*.py" = ["UP006", "UP035"] -"vllm/spec_decode/**/*.py" = ["UP006", "UP035"] "vllm/worker/**/*.py" = ["UP006", "UP035"] # Python 3.8 
typing - skip utils for ROCm "vllm/utils/__init__.py" = ["UP006", "UP035"] diff --git a/tests/core/test_serialization.py b/tests/core/test_serialization.py index 8281298d6634c..ee9ac2129f2db 100644 --- a/tests/core/test_serialization.py +++ b/tests/core/test_serialization.py @@ -6,7 +6,7 @@ import msgspec from vllm.executor.msgspec_utils import decode_hook, encode_hook from vllm.sequence import ExecuteModelRequest -from ..spec_decode.utils import create_batch +from .utils import create_batch def test_msgspec_serialization(): diff --git a/tests/core/utils.py b/tests/core/utils.py index b746c17864641..033fffd2c4e24 100644 --- a/tests/core/utils.py +++ b/tests/core/utils.py @@ -4,15 +4,16 @@ import time from collections import defaultdict from collections.abc import Sequence as GenericSequence -from typing import Any, Optional +from itertools import count +from typing import Any, Optional, Union import torch -from vllm import SamplingParams from vllm.core.scheduler import Scheduler, SchedulerOutputs from vllm.inputs import EncoderDecoderInputs, embeds_inputs, token_inputs from vllm.lora.request import LoRARequest -from vllm.sequence import (Logprob, Sequence, SequenceGroup, +from vllm.sampling_params import SamplingParams +from vllm.sequence import (Logprob, Sequence, SequenceData, SequenceGroup, SequenceGroupMetadata) @@ -262,3 +263,130 @@ class SchedulerProxy: self, ) -> tuple[list[SequenceGroupMetadata], SchedulerOutputs, Any]: _, _, ret = self.call_history["schedule"][-1] return ret + + +def create_seq_group_metadata_from_prompts( + prompts: list[list[int]], + num_gpu_blocks: int, + block_size: int, + final_prompt_lens: list[int], + continuations: Optional[list[list[int]]] = None, + seq_ids: Optional[list[int]] = None, +) -> list[SequenceGroupMetadata]: + + if continuations is None: + continuations = [[] for _ in prompts] + + if seq_ids is None: + seq_ids = list(i for i, _ in enumerate(prompts)) + + free_gpu_blocks = list(range(num_gpu_blocks)) + + block_allocations = { + i: [ + free_gpu_blocks.pop() + for _ in range(round_up_to_next_block(final_len, block_size)) + ] + for i, final_len in enumerate(final_prompt_lens) + } + + seq_grou_metadata_list = [] + for i, (prompt_token_ids, + cont_token_ids) in enumerate(zip(prompts, continuations)): + data = SequenceData.from_seqs(prompt_token_ids, cont_token_ids) + data.update_num_computed_tokens( + len(prompt_token_ids) + len(cont_token_ids) - 1) + seq_data = {i: data} + seq_grou_metadata_list.append( + SequenceGroupMetadata( + request_id=str(i), + is_prompt=len(cont_token_ids) == 0, + seq_data=seq_data, + sampling_params=SamplingParams(temperature=0.0), + block_tables={i: block_allocations[i][:]}, + )) + return seq_grou_metadata_list + + +def create_chunked_seq_group_metadata_from_prompt( + prompt: list[int], + num_gpu_blocks: int, + chunk_size: int, + block_size: int, + seq_id: Optional[int] = None) -> list[SequenceGroupMetadata]: + + if seq_id is None: + seq_id = 0 + + free_gpu_blocks = list(range(num_gpu_blocks)) + + block_allocations = [ + free_gpu_blocks.pop() + for _ in range(round_up_to_next_block(len(prompt), block_size)) + ] + + seq_group_metadata_list = [] + for i, idx in enumerate(range(0, len(prompt), chunk_size)): + chunk_ids = prompt[idx:idx + chunk_size] + data = SequenceData.from_seqs(prompt) + data.update_num_computed_tokens(idx) + seq_data = {i: data} + seq_group_metadata_list.append( + SequenceGroupMetadata( + request_id=str(seq_id), + is_prompt=True, + do_sample=idx + chunk_size >= len(prompt), # terminal chunk + 
seq_data=seq_data, + sampling_params=SamplingParams(temperature=0.0), + block_tables={i: block_allocations}, + token_chunk_size=len(chunk_ids))) + return seq_group_metadata_list + + +def create_batch(batch_size, + k, + prompt_len: Union[int, list[int]] = 10, + prev_output_token_len: int = 10, + seq_ids: Optional[list[int]] = None, + num_gpu_blocks: Optional[int] = None, + block_size: Optional[int] = None, + prefill_chunk_size: Optional[int] = None): + if block_size is None: + block_size = 8 + + if num_gpu_blocks is None: + num_gpu_blocks = 2048 // block_size + + iterator = count() + + if isinstance(prompt_len, int): + prompt_lens = [prompt_len for _ in range(batch_size)] + else: + prompt_lens = prompt_len + + prompts = [[next(iterator) for _ in range(p_len)] for p_len in prompt_lens] + + if prefill_chunk_size: + # Create a batch of chunked prompts. + if not seq_ids: + seq_ids = list(range(len(prompts))) + seq_group_metadata_list = [] + for p, sid in zip(prompts, seq_ids): + seq_group_metadata_list += \ + create_chunked_seq_group_metadata_from_prompt( + p, num_gpu_blocks, prefill_chunk_size, block_size, sid) + seq_group_metadata_list = seq_group_metadata_list[:batch_size] + prev_output_tokens = [] + else: + prev_output_tokens = [[ + next(iterator) for _ in range(prev_output_token_len) + ] for _ in range(batch_size)] + final_prompt_lens = [ + len(prompt) + len(prev_output_token) + k + 1 + for prompt, prev_output_token in zip(prompts, prev_output_tokens) + ] + + seq_group_metadata_list = create_seq_group_metadata_from_prompts( + prompts, num_gpu_blocks, block_size, final_prompt_lens, + prev_output_tokens, seq_ids) + return seq_group_metadata_list, prompts, prev_output_tokens diff --git a/tests/metrics/test_metrics.py b/tests/metrics/test_metrics.py index 7bb5d8980d614..54dbb747de09a 100644 --- a/tests/metrics/test_metrics.py +++ b/tests/metrics/test_metrics.py @@ -1,15 +1,12 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -import time - import pytest import ray from prometheus_client import REGISTRY import vllm.envs as envs from vllm import EngineArgs, LLMEngine -from vllm.distributed import cleanup_dist_env_and_memory from vllm.engine.arg_utils import AsyncEngineArgs from vllm.engine.async_llm_engine import AsyncLLMEngine from vllm.engine.metrics import RayPrometheusStatLogger @@ -232,149 +229,6 @@ def test_engine_log_metrics_regression( assert_metrics(model, engine, disable_log_stats, len(example_prompts)) -@pytest.mark.parametrize("model", MODELS) -@pytest.mark.parametrize("dtype", ["half"]) -@pytest.mark.parametrize("max_tokens", [10]) -def test_metric_spec_decode( - vllm_runner, - example_prompts, - model: str, - dtype: str, - max_tokens: int, -) -> None: - k = 5 - - with vllm_runner( - model, - dtype=dtype, - disable_log_stats=False, - gpu_memory_utilization=0.4, - speculative_config={ - "model": model, - "num_speculative_tokens": k, - }, - ) as vllm_model: - - # Force log interval to be 0 to catch all metrics. - stat_logger = vllm_model.model.llm_engine.stat_loggers['prometheus'] - stat_logger.local_interval = 0 - - # Note that the purpose of this test is to verify spec decode - # metrics instead of functional correctness, so the expected values - # are intended to be loose. 
- metric_name_to_expected_fn = { - "gauge_spec_decode_draft_acceptance_rate": lambda v: 0 <= v <= 1, - "gauge_spec_decode_efficiency": lambda v: 0 <= v <= 1, - "counter_spec_decode_num_accepted_tokens": lambda v: 0 <= v <= k, - "counter_spec_decode_num_draft_tokens": lambda v: v == k, - "counter_spec_decode_num_emitted_tokens": - lambda v: 0 <= v <= k + 1, - } - - # Use one request to better inspect the metrics. - prompts = example_prompts[:1] - - _ = vllm_model.generate_greedy(prompts, max_tokens) - for metric_name, is_expected in metric_name_to_expected_fn.items(): - metric_val = getattr( - stat_logger.metrics, - metric_name).labels(**stat_logger.labels)._value.get() - assert is_expected(metric_val), ( - f"the value of metric {metric_name} ({metric_val}) " - "does not meet expectation") - - -@pytest.mark.parametrize("model", MODELS) -@pytest.mark.parametrize("dtype", ["half"]) -@pytest.mark.parametrize("max_tokens", [10]) -@pytest.mark.parametrize("log_interval", [1, 3, 5, 7]) -def test_metric_spec_decode_interval( - vllm_runner, - example_prompts, - model: str, - dtype: str, - max_tokens: int, - log_interval: int, -) -> None: - k = 5 - - engine_args = EngineArgs( - model=model, - dtype=dtype, - disable_log_stats=False, - gpu_memory_utilization=0.4, - speculative_config={ - "model": model, - "num_speculative_tokens": k, - }, - enforce_eager=True, - ) - - engine = LLMEngine.from_engine_args(engine_args) - - try: - - engine.add_request( - "request-id-0", - example_prompts[0], - SamplingParams(max_tokens=max_tokens), - ) - - # set log internal - stat_logger = engine.stat_loggers['prometheus'] - stat_logger.local_interval = log_interval - - # prefill - engine.step() - - # wait for 5 seconds to ensure that spec decode metrics - # get triggered in first decode step - time.sleep(5) - - # first decode step should trigger async collection of metrics - engine.step() - - # wait one second to allow H2D transfer to finish - time.sleep(1) - - # second decode step should now be able to collect the spec - # decode stats and the request should also be finished - engine.step() - - # must have finisehd now - assert not engine.has_unfinished_requests() - - # wait to ensure logging occurs - time.sleep(log_interval) - - # force logging - engine.step() - - # Note that the purpose of this test is to verify spec decode - # metrics instead of functional correctness, so the expected values - # are intended to be loose. 
- metric_name_to_expected_fn = { - "gauge_spec_decode_draft_acceptance_rate": lambda v: 0 <= v <= 1, - "gauge_spec_decode_efficiency": lambda v: 0 <= v <= 1, - "counter_spec_decode_num_accepted_tokens": lambda v: 0 <= v <= k, - "counter_spec_decode_num_draft_tokens": lambda v: v == k, - "counter_spec_decode_num_emitted_tokens": - lambda v: 0 <= v <= k + 1, - } - - for metric_name, is_expected in metric_name_to_expected_fn.items(): - metric_val = getattr( - stat_logger.metrics, - metric_name).labels(**stat_logger.labels)._value.get() - assert is_expected(metric_val), ( - f"the value of metric {metric_name} ({metric_val}) " - "does not meet expectation") - - finally: - del engine - cleanup_dist_env_and_memory() - - def assert_metrics(model: str, engine: LLMEngine, disable_log_stats: bool, num_requests: int) -> None: if disable_log_stats: diff --git a/tests/models/registry.py b/tests/models/registry.py index 56ae501021f4e..3ffa7f81a1ad9 100644 --- a/tests/models/registry.py +++ b/tests/models/registry.py @@ -457,12 +457,12 @@ _MULTIMODAL_EXAMPLE_MODELS = { _SPECULATIVE_DECODING_EXAMPLE_MODELS = { - "EAGLEModel": _HfExamplesInfo("JackFram/llama-68m", - speculative_model="abhigoyal/vllm-eagle-llama-68m-random"), # noqa: E501 "MedusaModel": _HfExamplesInfo("JackFram/llama-68m", speculative_model="abhigoyal/vllm-medusa-llama-68m-random"), # noqa: E501 - "MLPSpeculatorPreTrainedModel": _HfExamplesInfo("JackFram/llama-160m", - speculative_model="ibm-ai-platform/llama-160m-accelerator"), # noqa: E501 + # Temporarily disabled. + # TODO(woosuk): Re-enable this once the MLP Speculator is supported in V1. + # "MLPSpeculatorPreTrainedModel": _HfExamplesInfo("JackFram/llama-160m", + # speculative_model="ibm-ai-platform/llama-160m-accelerator"), # noqa: E501 "DeepSeekMTPModel": _HfExamplesInfo("luccafong/deepseek_mtp_main_random", speculative_model="luccafong/deepseek_mtp_draft_random", # noqa: E501 trust_remote_code=True), diff --git a/tests/models/test_registry.py b/tests/models/test_registry.py index 01b2260abe8cf..1ce90070c5c85 100644 --- a/tests/models/test_registry.py +++ b/tests/models/test_registry.py @@ -72,11 +72,15 @@ def test_registry_model_property(model_arch, is_mm, init_cuda, is_ce): @create_new_process_for_each_test() -@pytest.mark.parametrize("model_arch,is_pp,init_cuda", [ - ("MLPSpeculatorPreTrainedModel", False, False), - ("DeepseekV2ForCausalLM", True, False), - ("Qwen2VLForConditionalGeneration", True, True), -]) +@pytest.mark.parametrize( + "model_arch,is_pp,init_cuda", + [ + # TODO(woosuk): Re-enable this once the MLP Speculator is supported + # in V1. 
+ # ("MLPSpeculatorPreTrainedModel", False, False), + ("DeepseekV2ForCausalLM", True, False), + ("Qwen2VLForConditionalGeneration", True, True), + ]) def test_registry_is_pp(model_arch, is_pp, init_cuda): assert ModelRegistry.is_pp_supported_model(model_arch) is is_pp diff --git a/tests/samplers/test_rejection_sampler.py b/tests/samplers/test_rejection_sampler.py deleted file mode 100644 index 3b93c64113dac..0000000000000 --- a/tests/samplers/test_rejection_sampler.py +++ /dev/null @@ -1,577 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project -"""Tests for rejection sampling.""" - -import pytest -import torch -import torch.nn.functional as F - -from vllm.model_executor.layers.rejection_sampler import RejectionSampler -from vllm.model_executor.utils import set_random_seed - - -@pytest.fixture(scope="function", autouse=True) -def use_v0_only(monkeypatch): - """ - This file tests V0 internals, so set VLLM_USE_V1=0. - """ - monkeypatch.setenv('VLLM_USE_V1', '0') - - -CUDA_DEVICES = [ - f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2) -] - - -def mock_causal_accepted_tensor( - k: int, last_accepted_indices: torch.Tensor) -> torch.Tensor: - """Generate an "accepted" tensor which should yield causally-accepted tokens - up to last accepted indices. - - Tokens after last_accepted_indices+1 may also be accepted, although they - will not be causally accepted. - """ - batch_size = last_accepted_indices.shape[0] - - accepted = (torch.arange(k).expand(batch_size, k) - <= last_accepted_indices.unsqueeze(-1).broadcast_to( - batch_size, k)) - - # Sprinkle accepted values after the contiguous initial accepted values. - # This replicates the behavior of rejection sampling, which may "accept" - # a token that cannot be accepted because of causality. - sprinkle_candidates = (torch.arange(k).expand( - batch_size, - k) > last_accepted_indices.unsqueeze(-1).broadcast_to(batch_size, k) + - 1) - sprinkle = torch.rand(batch_size, k) > 0.5 - accepted[sprinkle_candidates] = sprinkle[sprinkle_candidates] - return accepted - - -@pytest.mark.parametrize("seed", list(range(10))) -@pytest.mark.parametrize( - "which_tokens_accepted", - ["all_tokens_accepted", "no_tokens_accepted", "some_tokens_accepted"]) -@pytest.mark.parametrize("device", CUDA_DEVICES) -@pytest.mark.parametrize("use_flashinfer", [True, False]) -@torch.inference_mode() -def test_correct_output_format(which_tokens_accepted: str, seed: int, - device: str, use_flashinfer: bool): - """Verify the output has correct format given predetermined accepted matrix. 
- """ - set_random_seed(seed) - torch.set_default_device(device) - - batch_size = 10 - k = 5 - vocab_size = 3000 - - if which_tokens_accepted == "all_tokens_accepted": - accepted = mock_causal_accepted_tensor( - k, -1 + k * torch.ones((batch_size, ), dtype=torch.long)) - elif which_tokens_accepted == "no_tokens_accepted": - accepted = mock_causal_accepted_tensor( - k, -torch.ones((batch_size, ), dtype=torch.long)) - elif which_tokens_accepted == "some_tokens_accepted": - last_accepted_indices = torch.randint(low=-1, - high=k, - size=(batch_size, )) - accepted = mock_causal_accepted_tensor(k, last_accepted_indices) - else: - raise AssertionError() - - recovered_token_ids = torch.randint(low=0, - high=vocab_size, - size=(batch_size, k), - dtype=torch.int64) - draft_token_ids = torch.randint(low=0, - high=vocab_size, - size=(batch_size, k), - dtype=torch.int64) - bonus_token_ids = torch.randint(low=0, - high=vocab_size, - size=(batch_size, 1), - dtype=torch.int64) - - rejection_sampler = RejectionSampler(use_flashinfer=use_flashinfer) - rejection_sampler.init_gpu_tensors(device=device) - output_token_ids = rejection_sampler._create_output( # pylint: disable=protected-access - accepted, - recovered_token_ids, - draft_token_ids, - bonus_token_ids, - ) - - expected_bonus_token_ids = bonus_token_ids.clone() - - if which_tokens_accepted == "all_tokens_accepted": - # Expect all tokens to be equal to draft tokens. - assert torch.equal(output_token_ids[:, :-1], draft_token_ids) - - # Expect all bonus tokens to be included. - assert torch.equal(output_token_ids[:, -1:], expected_bonus_token_ids) - elif which_tokens_accepted == "no_tokens_accepted": - # Expect first token to be equal to recovered tokens. - assert torch.equal(output_token_ids[:, 0], recovered_token_ids[:, 0]) - - # Expect everything else to be -1. - assert torch.equal(output_token_ids[:, 1:], - torch.ones_like(output_token_ids[:, 1:]) * -1) - elif which_tokens_accepted == "some_tokens_accepted": - recovered_plus_bonus = torch.cat( - (recovered_token_ids, expected_bonus_token_ids), dim=-1) - # Assert first rejected token is a recovered token or bonus token. - assert torch.equal( - recovered_plus_bonus[torch.arange(0, batch_size), - last_accepted_indices + 1], - output_token_ids[torch.arange(0, batch_size), - last_accepted_indices + 1]) - - # Assert every subsequent token is -1. 
- subsequent_mask = torch.arange(0, k + 1).expand( - batch_size, k + 1) >= (last_accepted_indices + 2).unsqueeze(-1) - assert torch.all(output_token_ids[subsequent_mask] == -1) - - -@pytest.mark.parametrize("k", list(range(1, 6))) -@pytest.mark.parametrize("vocab_size", [30_000, 50_000]) -@pytest.mark.parametrize("batch_size", list(range(1, 32))) -@pytest.mark.parametrize("device", CUDA_DEVICES) -@pytest.mark.parametrize("use_flashinfer", [True, False]) -@torch.inference_mode() -def test_no_crash_with_varying_dims(k: int, vocab_size: int, batch_size: int, - device: str, use_flashinfer: bool): - torch.set_default_device(device) - rejection_sampler = RejectionSampler(use_flashinfer=use_flashinfer) - rejection_sampler.init_gpu_tensors(device=device) - - draft_probs = torch.rand(batch_size, k, vocab_size, dtype=torch.float32) - target_probs = torch.rand(batch_size, - k + 1, - vocab_size, - dtype=torch.float32) - bonus_token_ids = torch.randint(low=0, - high=vocab_size, - size=(batch_size, 1), - dtype=torch.int64) - draft_token_ids = torch.randint(low=0, - high=vocab_size, - size=(batch_size, k), - dtype=torch.int64) - - rejection_sampler(target_probs, bonus_token_ids, draft_probs, - draft_token_ids) - - -@pytest.mark.parametrize("frac_seeded", [0.0, 0.25, 0.5, 1.0]) -@pytest.mark.parametrize("k", [1, 3, 6]) -@pytest.mark.parametrize("vocab_size", [30_000, 50_000]) -@pytest.mark.parametrize("batch_size", [1, 8, 32, 128]) -@pytest.mark.parametrize("n_rep", [100]) -@pytest.mark.parametrize("device", CUDA_DEVICES) -# @pytest.mark.parametrize("use_flashinfer", [True, False]) -# Not testing FlashInfer now, since 0.2.3 API removed the ability -# to pass in uniform samples. -@pytest.mark.parametrize("use_flashinfer", [False]) -@torch.inference_mode() -def test_deterministic_when_seeded(k: int, vocab_size: int, batch_size: int, - frac_seeded: float, n_rep: int, device: str, - use_flashinfer: bool): - torch.set_default_device(device) - rejection_sampler = RejectionSampler(use_flashinfer=use_flashinfer) - rejection_sampler.init_gpu_tensors(device=device) - - draft_probs = torch.rand(batch_size, k, vocab_size, dtype=torch.float32) - target_probs = torch.rand(batch_size, - k + 1, - vocab_size, - dtype=torch.float32) - bonus_token_ids = torch.randint(low=0, - high=vocab_size, - size=(batch_size, 1), - dtype=torch.int64) - draft_token_ids = torch.randint(low=0, - high=vocab_size, - size=(batch_size, k), - dtype=torch.int64) - - seeded_mask = torch.rand(batch_size, dtype=torch.float32) <= frac_seeded - - results = [] - for _ in range(n_rep): - seeded_seqs = { - i: torch.Generator(device=device).manual_seed(i) - for i in range(batch_size) if seeded_mask[i] - } - results.append( - rejection_sampler(target_probs, bonus_token_ids, draft_probs, - draft_token_ids, seeded_seqs)) - - for i in range(batch_size): - if seeded_mask[i]: - for j in range(1, n_rep): - assert torch.equal(results[j][i], results[0][i]) - - -@pytest.mark.parametrize("k", [1, 3, 6]) -@pytest.mark.parametrize("vocab_size", [30_000, 50_000]) -@pytest.mark.parametrize("batch_size", [3, 8, 32, 128]) -@pytest.mark.parametrize("device", CUDA_DEVICES) -# @pytest.mark.parametrize("use_flashinfer", [True, False]) -# Not testing FlashInfer now, since 0.2.3 API removed the ability -# to pass in uniform samples. 
-@pytest.mark.parametrize("use_flashinfer", [False]) -@torch.inference_mode() -def test_mixed_seeded_batch(k: int, vocab_size: int, batch_size: int, - device: str, use_flashinfer: bool): - torch.set_default_device(device) - set_random_seed(0) - draft_probs = torch.rand(batch_size, k, vocab_size, dtype=torch.float32) - target_probs = torch.rand(batch_size, - k + 1, - vocab_size, - dtype=torch.float32) - bonus_token_ids = torch.randint(low=0, - high=vocab_size, - size=(batch_size, 1), - dtype=torch.int64) - draft_token_ids = torch.randint(low=0, - high=vocab_size, - size=(batch_size, k), - dtype=torch.int64) - - single_batches = [] - for i in range(batch_size): - single_batches.append((draft_probs[i].clone().unsqueeze(0), - draft_token_ids[i].clone().unsqueeze(0), - target_probs[i].clone().unsqueeze(0), - bonus_token_ids[i].clone().unsqueeze(0), - draft_token_ids[i].clone().unsqueeze(0))) - - set_random_seed(0) - rejection_sampler = RejectionSampler(use_flashinfer=use_flashinfer) - rejection_sampler.init_gpu_tensors(device=device) - - results = [] - seeded_seqs = { - i: torch.Generator(device=device).manual_seed(i) - for i in range(1, batch_size) # 0 is seed None - } - batch_result = rejection_sampler(target_probs.clone(), - bonus_token_ids.clone(), - draft_probs.clone(), - draft_token_ids.clone(), seeded_seqs) - - set_random_seed(0) - - rejection_sampler = RejectionSampler(use_flashinfer=use_flashinfer) - rejection_sampler.init_gpu_tensors(device=device) - for i in range(batch_size): - request_seeded_seqs = { - 0: torch.Generator(device=device).manual_seed(i) - } if seeded_seqs.get(i) is not None else None - (draft_probs, draft_token_ids, target_probs, bonus_token_ids, - draft_token_ids) = single_batches[i] - results.append( - rejection_sampler(target_probs, bonus_token_ids, draft_probs, - draft_token_ids, request_seeded_seqs)) - for i in range(batch_size): - assert torch.equal(batch_result[i], results[i].squeeze(0)) - - -@pytest.mark.parametrize("k", [1, 3, 6]) -@pytest.mark.parametrize("vocab_size", [30_000, 50_000]) -@pytest.mark.parametrize("batch_size", [1, 8, 32, 128]) -@pytest.mark.parametrize("device", CUDA_DEVICES) -@torch.inference_mode() -def test_compare_nonflashinfer_backend(k: int, vocab_size: int, - batch_size: int, device: str): - """ - Test the flashinfer and nonflashinfer backend generate - the same output metrics. - """ - - pytest.skip("Not testing FlashInfer now, since 0.2.3 API removed " - "the ability to pass in uniform samples.") - - torch.set_default_device(device) - torch.manual_seed(0) - draft_probs = torch.rand(batch_size, k, vocab_size, dtype=torch.float32) - target_probs = torch.rand(batch_size, - k + 1, - vocab_size, - dtype=torch.float32) - bonus_token_ids = torch.randint(low=0, - high=vocab_size, - size=(batch_size, 1), - dtype=torch.int64) - draft_token_ids = torch.randint(low=0, - high=vocab_size, - size=(batch_size, k), - dtype=torch.int64) - - num_accepted_tokens = [] - num_emitted_tokens = [] - num_draft_tokens = [] - - def get_seeded_seqs(): - return { - i: torch.Generator(device=device).manual_seed(i) - for i in range(batch_size) - } - - for use_flashinfer in [True, False]: - rejection_sampler = RejectionSampler(use_flashinfer=use_flashinfer) - rejection_sampler.init_gpu_tensors(device=device) - # We use seeded sequences to ensure the same tokens are accepted - # for both flashinfer and nonflashinfer backends. 
- seeded_seqs = get_seeded_seqs() - rejection_sampler(target_probs, bonus_token_ids, draft_probs, - draft_token_ids, seeded_seqs) - num_accepted_tokens.append(rejection_sampler.num_accepted_tokens) - num_emitted_tokens.append(rejection_sampler.num_emitted_tokens) - num_draft_tokens.append(rejection_sampler.num_draft_tokens) - - assert num_accepted_tokens[0] == num_accepted_tokens[1] - assert num_emitted_tokens[0] == num_emitted_tokens[1] - assert num_draft_tokens[0] == num_draft_tokens[1] - - -@pytest.mark.parametrize("above_or_below_vocab_range", ["above", "below"]) -@pytest.mark.parametrize("which_token_ids", - ["bonus_token_ids", "draft_token_ids"]) -@pytest.mark.parametrize("device", CUDA_DEVICES) -@pytest.mark.parametrize("use_flashinfer", [True, False]) -@torch.inference_mode() -def test_raises_when_vocab_oob(above_or_below_vocab_range: str, - which_token_ids: str, device: str, - use_flashinfer: bool): - k = 3 - batch_size = 5 - vocab_size = 30_000 - torch.set_default_device(device) - - rejection_sampler = RejectionSampler(use_flashinfer=use_flashinfer, - strict_mode=True) - rejection_sampler.init_gpu_tensors(device=device) - - draft_probs = torch.rand(batch_size, k, vocab_size, dtype=torch.float32) - target_probs = torch.rand(batch_size, - k + 1, - vocab_size, - dtype=torch.float32) - bonus_token_ids = torch.randint(low=0, - high=vocab_size, - size=(batch_size, 1), - dtype=torch.int64) - draft_token_ids = torch.randint(low=0, - high=vocab_size, - size=(batch_size, k), - dtype=torch.int64) - - oob_token_ids = None - if which_token_ids == "bonus_token_ids": - oob_token_ids = bonus_token_ids - elif which_token_ids == "draft_token_ids": - oob_token_ids = draft_token_ids - else: - raise AssertionError() - - if above_or_below_vocab_range == "above": - rogue_token_id = vocab_size + 1 - elif above_or_below_vocab_range == "below": - rogue_token_id = -1 - else: - raise AssertionError() - - oob_token_ids[0][0] = rogue_token_id - - with pytest.raises(AssertionError): - rejection_sampler(target_probs, bonus_token_ids, draft_probs, - draft_token_ids) - - -@pytest.mark.parametrize("draft_and_target_probs_equal", [True, False]) -@pytest.mark.parametrize("seed", list(range(5))) -@pytest.mark.parametrize("use_flashinfer", [True, False]) -@torch.inference_mode() -def test_rejection_sampling_approximates_target_distribution( - seed: int, draft_and_target_probs_equal: bool, use_flashinfer: bool): - """Verify rejection sampling approximates target distribution, - despite sampling from a potentially distinct draft distribution. - - This is done by first creating a random target probability - distribution and a random draft probability distribution. We then - sample token ids from the rejection sampler using these draft - and target distributions. The samples are used to estimate - the output probability distribution, which we expect to approximate - the target distribution. - - A basic distance metric is used to determine similarity between - distributions. - - We expect that as we increase the number of samples, - the distance between the observed distribution and the target - distribution decreases. To measure this, we compare the distance - of the observed distribution against both the target distribution - and a uniform random distribution. We expect the distance between - the observed distribution and the target distribution to improve - much more than the distance improvement between the observed - distribution and the random distribution. 
- - When draft_and_target_probs_equal=True, the draft and target - probabilities are exactly equal. Rejection sampling should - still work without any NaNs or exceptions. - """ - torch.set_default_device("cpu") - set_random_seed(seed) - helper = _CorrectnessTestHelper( - vocab_size=10, - rejection_sampler=RejectionSampler(use_flashinfer=use_flashinfer), - ) - - draft_probs, target_probs, reference_probs = helper.generate_probs_for_test( - draft_and_target_probs_equal) - - sample_sizes = [10, 100, 1_000, 10_000, 100_000] - distance_wrt_reference: list[float] = [] - distance_wrt_target: list[float] = [] - - for num_samples in sample_sizes: - (reference_vs_rejsample_dist, - target_vs_rejsample_dist) = helper.run_and_compare_distributions( - draft_probs, - target_probs, - reference_probs, - num_samples, - ) - - distance_wrt_reference.append(reference_vs_rejsample_dist) - distance_wrt_target.append(target_vs_rejsample_dist) - - relative_change_in_distance_wrt_target = get_ratio_first_to_last( - distance_wrt_target) - relative_change_in_distance_wrt_reference = get_ratio_first_to_last( - distance_wrt_reference) - - print(f"{num_samples=} {target_vs_rejsample_dist=:.05f} " - f"{reference_vs_rejsample_dist=:.05f}") - print(f"{num_samples=} {relative_change_in_distance_wrt_target=:.02f} " - f"{relative_change_in_distance_wrt_reference=:.02f}") - - relative_change_in_distance_wrt_target = get_ratio_first_to_last( - distance_wrt_target) - relative_change_in_distance_wrt_reference = get_ratio_first_to_last( - distance_wrt_reference) - - expected_improvement_multiplier = 20 - assert (relative_change_in_distance_wrt_target - > relative_change_in_distance_wrt_reference * - expected_improvement_multiplier) - - -def get_ratio_first_to_last(elements: list[float]) -> float: - return elements[0] / elements[-1] - - -class _CorrectnessTestHelper: - """Class that packages together logic required for the unit-level - rejection sampling correctness test. - """ - - def __init__(self, vocab_size: int, rejection_sampler: RejectionSampler): - self.rejection_sampler = rejection_sampler - self.vocab_size = vocab_size - self.vocab_range = (0, vocab_size) - - self.rejection_sampler.init_gpu_tensors(device=0) - - # Keep test simple, use k=1 - self.k = 1 - - # Bonus tokens not used, but rejection sampler requires - # correct shape. - self.num_bonus_tokens = 1 - - def generate_probs_for_test( - self, draft_and_target_probs_equal: bool - ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - draft_probs, target_probs = (F.softmax( - torch.rand(self.vocab_size, dtype=torch.float32), - dim=-1, - ) for _ in range(2)) - - num_reference_probs = 100 - reference_probs = F.softmax( - torch.rand(num_reference_probs, - self.vocab_size, - dtype=torch.float32), - dim=-1, - ) - - if draft_and_target_probs_equal: - target_probs = draft_probs.clone() - - return draft_probs, target_probs, reference_probs - - def run_and_compare_distributions(self, draft_probs: torch.Tensor, - target_probs: torch.Tensor, - reference_probs: torch.Tensor, - num_samples: int) -> tuple[float, float]: - # Sample using rejection sampling. - rej_sample_probs = self._estimate_rejection_sampling_pdf( - draft_probs, target_probs, num_samples) - - # Average distance from reference probs. 
- reference_vs_rejsample_dist = torch.dist( - reference_probs, - rej_sample_probs).item() / reference_probs.shape[0] - target_vs_rejsample_dist = torch.dist(target_probs, - rej_sample_probs).item() - - return reference_vs_rejsample_dist, target_vs_rejsample_dist - - def _estimate_rejection_sampling_pdf( - self, - draft_probs: torch.Tensor, - target_probs: torch.Tensor, - num_samples: int, - ) -> torch.Tensor: - # Repeat draft probs num_samples times. - draft_probs = draft_probs.reshape(1, self.k, self.vocab_size).repeat( - num_samples, 1, 1) - - # Repeat target probs num_samples * (k + 1) times. - # Rejection sampler requires bonus token probs, but they aren't used. - target_probs = target_probs.reshape(1, 1, self.vocab_size).repeat( - num_samples, self.k + 1, 1) - - # Randomly sample draft token ids from draft probs. - draft_token_ids = torch.multinomial(draft_probs[:, 0, :], - num_samples=1, - replacement=True).reshape( - num_samples, self.k) - - # Bonus tokens not used but required. - bonus_token_ids = torch.zeros((1, self.num_bonus_tokens), - dtype=torch.int64, - device="cuda").repeat(num_samples, 1) - - # Get output tokens via rejection sampling. - output_token_ids = self.rejection_sampler(target_probs.to("cuda"), - bonus_token_ids.to("cuda"), - draft_probs.to("cuda"), - draft_token_ids.to("cuda")) - - # Remove bonus tokens - output_token_ids = output_token_ids[:, :-1].flatten() - - # Estimate probability density function - hist = torch.histogram(output_token_ids.to(dtype=torch.float, - device="cpu"), - bins=self.vocab_size, - range=self.vocab_range, - density=True) - - return hist.hist diff --git a/tests/samplers/test_typical_acceptance_sampler.py b/tests/samplers/test_typical_acceptance_sampler.py deleted file mode 100644 index 119841470bfb5..0000000000000 --- a/tests/samplers/test_typical_acceptance_sampler.py +++ /dev/null @@ -1,480 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project -"""Tests for rejection sampling.""" - -import pytest -import torch - -from vllm.model_executor.layers.typical_acceptance_sampler import ( - TypicalAcceptanceSampler) -from vllm.model_executor.utils import set_random_seed - -CUDA_DEVICES = [f"cuda:{i}" for i in range(1)] - - -@pytest.fixture(scope="function", autouse=True) -def use_v0_only(monkeypatch): - """ - This file tests V0 internals, so set VLLM_USE_V1=0. - """ - monkeypatch.setenv('VLLM_USE_V1', '0') - - -def get_zero_temperature_prob_dist(batch_size, k, vocab_size): - """ - Generates a fake temperature zero probability distribution. - Returns: - 1. A fake temperature zero probability distribution of shape - [batch_size, k, vocab_size] - 2. Tensor of shape [batch_size, k] containing the token ids - of the probability 1.0 tokens at each position. - """ - # Simulate temperature 0 probability distribution for target probabilities - # and create target probabilities such that only 1 token id has - # probability 1.0 - target_probs = torch.rand(batch_size, k, vocab_size, dtype=torch.float32) - probs = torch.rand(batch_size, k, vocab_size) - _, zero_temperature_token_ids = torch.max(probs, dim=-1) - # set the probability of the tokens with ids in zero_temperature_token_ids - # to 1 and the rest to 0. 
- target_probs = torch.zeros_like(probs).scatter_( - -1, zero_temperature_token_ids.unsqueeze(-1), 1.0) - return target_probs, zero_temperature_token_ids - - -def get_draft_token_ids(batch_size: int, k: int, vocab_size: int, - token_ids_to_exclude: torch.Tensor): - """ - Returns a tensor of shape [batch_size, k] of fake draft token ids - drawn randomly from a vocab of size vocab_size. We however ensure - that token_ids from token_ids_to_exclude are excluded at the - corresponding positions. - """ - draft_token_ids = torch.empty(batch_size, k, dtype=torch.long) - for i in range(batch_size): - for j in range(k): - # Generate a random token ID excluding token_ids_to_exclude[i, j] - while True: - token_id = torch.randint(0, vocab_size, (1, )).item() - if token_id != token_ids_to_exclude[i, j]: - draft_token_ids[i, j] = token_id - break - return draft_token_ids - - -def get_acceptance_sampler( - posterior_threshold: float = 0.03, - posterior_alpha: float = 0.9, - strict_mode: bool = False, -) -> TypicalAcceptanceSampler: - """ - Initializes and returns a TypicalAcceptanceSampler. - """ - return TypicalAcceptanceSampler(posterior_threshold, posterior_alpha, - strict_mode) - - -@pytest.mark.parametrize("k", list(range(1, 6))) -@pytest.mark.parametrize("vocab_size", [30_000, 50_000]) -@pytest.mark.parametrize("batch_size", list(range(1, 32))) -@pytest.mark.parametrize("device", CUDA_DEVICES) -@torch.inference_mode() -def test_no_crash_with_varying_dims(k: int, vocab_size: int, batch_size: int, - device: str): - """ - Tests that the TypicalAcceptancSampler forward succeeds for - different combinations of k, vocab_size, batch_size and num devices. - """ - torch.set_default_device(device) - typical_acceptance_sampler = get_acceptance_sampler() - typical_acceptance_sampler.init_gpu_tensors(device=device) - target_with_bonus_probs = torch.rand(batch_size, - k + 1, - vocab_size, - dtype=torch.float32) - bonus_token_ids = torch.randint(low=0, - high=vocab_size, - size=(batch_size, 1), - dtype=torch.int64) - draft_token_ids = torch.randint(low=0, - high=vocab_size, - size=(batch_size, k), - dtype=torch.int64) - # Verify that sampling succeeds for all cases. - typical_acceptance_sampler(target_with_bonus_probs, - bonus_token_ids, - draft_probs=None, - draft_token_ids=draft_token_ids) - - -@pytest.mark.parametrize("above_or_below_vocab_range", ["above", "below"]) -@pytest.mark.parametrize("which_token_ids", - ["bonus_token_ids", "draft_token_ids"]) -@pytest.mark.parametrize("device", CUDA_DEVICES) -@torch.inference_mode() -def test_raises_when_vocab_oob(above_or_below_vocab_range: str, - which_token_ids: str, device: str): - """ - Tests that we throw an exception of the token ids fall outside - the bound of the provided vocabulary. - """ - k = 3 - batch_size = 5 - vocab_size = 30_000 - torch.set_default_device(device) - typical_acceptance_sampler = get_acceptance_sampler(strict_mode=True) - typical_acceptance_sampler.init_gpu_tensors(device=device) - target_with_bonus_probs = torch.rand(batch_size, - k + 1, - vocab_size, - dtype=torch.float32) - bonus_token_ids = torch.randint(low=0, - high=vocab_size, - size=(batch_size, 1), - dtype=torch.int64) - draft_token_ids = torch.randint(low=0, - high=vocab_size, - size=(batch_size, k), - dtype=torch.int64) - # Verify that appropriate exceptions are thrown for out - # of bound vocabs. 
- oob_token_ids = None - if which_token_ids == "bonus_token_ids": - oob_token_ids = bonus_token_ids - elif which_token_ids == "draft_token_ids": - oob_token_ids = draft_token_ids - else: - raise AssertionError() - - if above_or_below_vocab_range == "above": - rogue_token_id = vocab_size + 1 - elif above_or_below_vocab_range == "below": - rogue_token_id = -1 - else: - raise AssertionError() - - oob_token_ids[0][0] = rogue_token_id - - with pytest.raises(AssertionError): - typical_acceptance_sampler(target_with_bonus_probs, - bonus_token_ids, - draft_probs=None, - draft_token_ids=draft_token_ids) - - -@pytest.mark.parametrize("seed", list(range(10))) -@pytest.mark.parametrize("device", CUDA_DEVICES) -@torch.inference_mode() -def test_uniform_target_distribution_accepts_all_tokens( - seed: int, device: str): - """ - Test the TypicalAcceptanceSampler with a uniform target probability - distribution. - - This test verifies that when provided with a uniform target probability - distribution, the TypicalAcceptanceSampler accepts all draft tokens. The - entropy of the uniform target distribution being high should lead to all - draft tokens being accepted. - """ - set_random_seed(seed) - k = 3 - batch_size = 5 - vocab_size = 30_000 - torch.set_default_device(device) - typical_acceptance_sampler = get_acceptance_sampler(strict_mode=True) - typical_acceptance_sampler.init_gpu_tensors(device=device) - target_with_bonus_probs = torch.rand(batch_size, - k + 1, - vocab_size, - dtype=torch.float32) - draft_token_ids = torch.randint(low=0, - high=vocab_size, - size=(batch_size, k), - dtype=torch.int64) - bonus_token_ids = torch.randint(low=0, - high=vocab_size, - size=(batch_size, 1), - dtype=torch.int64) - output_token_ids = typical_acceptance_sampler( - target_with_bonus_probs, - bonus_token_ids, - draft_probs=None, - draft_token_ids=draft_token_ids) - # We are using a uniform target probability distribution. - # For a uniform distribution the entropy is very high and it - # should lead to all draft tokens being accepted. Verify that. - assert output_token_ids.shape[0] == batch_size - assert output_token_ids.shape[1] == (k + 1) - assert torch.all(output_token_ids[:, -1] == bonus_token_ids.squeeze()) - - assert torch.all(output_token_ids[:, :k] == draft_token_ids) - - -@pytest.mark.parametrize("seed", list(range(10))) -@pytest.mark.parametrize("device", CUDA_DEVICES) -@torch.inference_mode() -def test_temperature_zero_target_distribution(seed: int, device: str): - """ - Test the TypicalAcceptanceSampler with a zero-temperature target - probability distribution. - - This test verifies that when using a zero-temperature target probability - distribution, where only one token has a probability of 1.0, the - TypicalAcceptanceSampler correctly rejects all draft tokens that do not - match this probability. Additionally, it ensures that when all draft - tokens are rejected, the sampler falls back to greedy sampling to select a - single token from the target distribution. 
- """ - set_random_seed(seed) - k = 3 - batch_size = 5 - vocab_size = 30_000 - torch.set_default_device(device) - - typical_acceptance_sampler = get_acceptance_sampler(strict_mode=True) - typical_acceptance_sampler.init_gpu_tensors(device=device) - # Simulate temperature 0 probability distribution for target probabilities - # and create target probabilities such that only 1 token id has - # probability 1.0 - target_with_bonus_probs, zero_temperature_token_ids = \ - get_zero_temperature_prob_dist(batch_size, k + 1, vocab_size) - zero_temperature_token_ids = zero_temperature_token_ids[:, :-1] - # Populate draft_token_ids such that they exclude the token_ids - # with probability = 1.0 - draft_token_ids = get_draft_token_ids(batch_size, k, vocab_size, - zero_temperature_token_ids) - bonus_token_ids = torch.randint(low=0, - high=vocab_size, - size=(batch_size, 1), - dtype=torch.int64) - # The target probaility distribution is a temperature zero distribution - # with zero entropy. Since our draft token ids don't match the probability - # 1.0 tokens in the target distribution we will reject all of them and - # fallback to the greedy sampling for selecting 1 token for each sequence. - # Verify the same. - output_token_ids = typical_acceptance_sampler( - target_with_bonus_probs, - bonus_token_ids, - draft_probs=None, - draft_token_ids=draft_token_ids) - assert output_token_ids.shape[0] == batch_size - assert output_token_ids.shape[1] == (k + 1) - assert torch.all(output_token_ids[:, -1] == -1) - assert torch.all(output_token_ids[:, 0] == zero_temperature_token_ids[:, - 0]) - - -@pytest.mark.parametrize("seed", list(range(10))) -@pytest.mark.parametrize("device", CUDA_DEVICES) -@torch.inference_mode() -def test_mixed_target_distribution(seed: int, device: str): - """ - Test the TypicalAcceptanceSampler with a mixed target probability - distribution. - - This test ensures that the TypicalAcceptanceSampler handles a mixed - target probability distribution correctly. Specifically, it uses a - zero-temperature distribution for some sequences and a uniform - distribution for others. The test verifies that: - - - For sequences with a zero-temperature distribution, only the token - with a probability of 1.0 is accepted, and all other tokens are rejected. - - For sequences with a uniform distribution, all draft tokens are - accepted. - """ - set_random_seed(seed) - k = 3 - batch_size = 4 - vocab_size = 30_000 - torch.set_default_device(device) - typical_acceptance_sampler = get_acceptance_sampler(strict_mode=True) - typical_acceptance_sampler.init_gpu_tensors(device=device) - # For sequences 0 and 2 set the distribution to a temperature - # zero distribution. For sequences 1 and 3 set it to a uniform - # distribution. 
- target_with_bonus_probs, zero_temperature_token_ids = \ - get_zero_temperature_prob_dist(batch_size, k + 1, vocab_size) - zero_temperature_token_ids = zero_temperature_token_ids[:, :-1] - target_probs = target_with_bonus_probs[:, :-1] - draft_token_ids = get_draft_token_ids(batch_size, k, vocab_size, - zero_temperature_token_ids) - uniform_probs = torch.rand(2, k, vocab_size, dtype=torch.float32) - target_probs[[1, 3]] = uniform_probs - bonus_token_ids = torch.randint(low=0, - high=vocab_size, - size=(batch_size, 1), - dtype=torch.int64) - output_token_ids = typical_acceptance_sampler( - target_with_bonus_probs, - bonus_token_ids, - draft_probs=None, - draft_token_ids=draft_token_ids) - # verify the shape of output_token_ids - assert output_token_ids.shape[0] == batch_size - assert output_token_ids.shape[1] == (k + 1) - # For sequences 0 and 2 verify that only 1 token is accepted - # which is the token with probability 1.0 in the target distribution - # at position 0. - assert torch.all(output_token_ids[[0, 2], 1:] == -1) - assert (torch.all(output_token_ids[[0, 2], - 0] == zero_temperature_token_ids[[0, 2], - 0])) - # For sequences 1 and 3 verify that all tokens are accepted since the - # target probability distribution is uniform. In addition verify that - # we also accept the bonus tokens. - assert torch.all( - output_token_ids[[1, 3], :-1] == draft_token_ids[[1, 3], :]) - assert torch.all(output_token_ids[[1, 3], -1] != -1) - - -@pytest.mark.parametrize("seed", list(range(10))) -@pytest.mark.parametrize("device", CUDA_DEVICES) -@torch.inference_mode() -def test_accept_tokens_partially(seed: int, device: str): - """ - Test the TypicalAcceptanceSampler's behavior when only a subset of draft - tokens should be accepted. - - This test verifies that the TypicalAcceptanceSampler correctly accepts or - rejects draft tokens based on a zero-temperature target probability - distribution. Specifically, it ensures that: - - - When all draft tokens match tokens with a probability of 1.0 in the - target distribution, all draft tokens are accepted. - - When only some draft tokens match tokens with a probability of 1.0 in - the target distribution, only those matching tokens are accepted, and the - rest are rejected. - """ - set_random_seed(seed) - k = 5 - batch_size = 1 - vocab_size = 30_000 - torch.set_default_device(device) - typical_acceptance_sampler = get_acceptance_sampler(strict_mode=True) - typical_acceptance_sampler.init_gpu_tensors(device=device) - # Create a temperature zero target probability distribution and ensure - # all draft token ids correspond to the tokens with 1.0 probability. - # Verify that all of them are accepted. - target_with_bonus_probs, zero_temperature_token_ids = \ - get_zero_temperature_prob_dist(batch_size, k + 1, vocab_size) - zero_temperature_token_ids = zero_temperature_token_ids[:, :-1] - draft_token_ids = zero_temperature_token_ids - bonus_token_ids = torch.randint(low=0, - high=vocab_size, - size=(batch_size, 1), - dtype=torch.int64) - output_token_ids = typical_acceptance_sampler( - target_with_bonus_probs, - bonus_token_ids, - draft_probs=None, - draft_token_ids=draft_token_ids) - assert output_token_ids.shape[0] == batch_size - assert output_token_ids.shape[1] == (k + 1) - assert torch.all(output_token_ids[:, 0:-1] == draft_token_ids) - assert torch.all(output_token_ids[:, -1] == bonus_token_ids) - # Next only keep the first 2 draft tokens same as the zero temperature - # tokens. For the remaining 3 choose some other tokens. 
In the - # response we will expect the first 2 tokens to be the same as the - # draft tokens and the recovered token and rest as -1 - draft_token_ids_to_replace = get_draft_token_ids( - batch_size, k, vocab_size, zero_temperature_token_ids) - draft_token_ids = torch.cat( - (draft_token_ids[:, :2], draft_token_ids_to_replace[:, -3:]), dim=1) - output_token_ids = typical_acceptance_sampler( - target_with_bonus_probs, - bonus_token_ids, - draft_probs=None, - draft_token_ids=draft_token_ids) - assert output_token_ids.shape[0] == batch_size - assert output_token_ids.shape[1] == (k + 1) - assert torch.all(output_token_ids[:, :2] == draft_token_ids[:, :2]) - assert torch.all( - output_token_ids[:, 2] == target_with_bonus_probs.argmax(-1)[:, 2]) - assert torch.all(output_token_ids[:, -3:] == -1) - - -@pytest.mark.parametrize("seed", list(range(1))) -@pytest.mark.parametrize("device", CUDA_DEVICES) -@torch.inference_mode() -def test_accept_tokens_set_non_default_posteriors(seed: int, device: str): - """ - Test the TypicalAcceptanceSampler with custom posterior thresholds and - alpha values. This test verifies that by modifying the posterior - thresholds and alpha values we can change the acceptance behavior of the - sampler. - """ - set_random_seed(seed) - k = 5 - batch_size = 1 - vocab_size = 30_000 - torch.set_default_device(device) - typical_acceptance_sampler = get_acceptance_sampler(strict_mode=True) - typical_acceptance_sampler.init_gpu_tensors(device=device) - # Simulate temperature 0 probability distribution for target - # probabilities and create target probabilities such that only 1 token - # id has probability 1.0 and others have a very low probability of - # 0.00001. Populate draft_token_ids such that they exclude the token_ids - # with probability = 1.0. Without any changes to the posterior thresholds - # none of the draft tokens are accepted. - target_probs, zero_temperature_token_ids = get_zero_temperature_prob_dist( - batch_size, k + 1, vocab_size) - zero_temperature_token_ids = zero_temperature_token_ids[:, :-1] - target_probs[target_probs == 0] = 0.00001 - draft_token_ids = get_draft_token_ids(batch_size, k, vocab_size, - zero_temperature_token_ids) - bonus_token_ids = torch.randint(low=0, - high=vocab_size, - size=(batch_size, 1), - dtype=torch.int64) - output_token_ids = typical_acceptance_sampler( - target_probs, - bonus_token_ids, - draft_probs=None, - draft_token_ids=draft_token_ids) - assert output_token_ids.shape[0] == batch_size - assert output_token_ids.shape[1] == (k + 1) - assert torch.all(output_token_ids[:, 1:-1] == -1) - - # Change the posterior threshold values to 0.0 so that we will - # now accept even draft tokens with very low probability in the - # target distribution. Simulate and verify the same. 
- typical_acceptance_sampler = TypicalAcceptanceSampler( - strict_mode=True, posterior_threshold=0.0, posterior_alpha=0.0) - typical_acceptance_sampler.init_gpu_tensors(device=device) - output_token_ids = typical_acceptance_sampler( - target_probs, - bonus_token_ids, - draft_probs=None, - draft_token_ids=draft_token_ids) - assert output_token_ids.shape[0] == batch_size - assert output_token_ids.shape[1] == (k + 1) - assert torch.all(output_token_ids[:, 0:-1] == draft_token_ids) - assert torch.all(output_token_ids[:, -1] == bonus_token_ids) - - -@pytest.mark.parametrize("seed", list(range(10))) -@pytest.mark.parametrize("device", CUDA_DEVICES) -@torch.inference_mode() -def test_get_recovered_token_ids(seed: int, device: str): - """ - Test the TypicalAcceptanceSampler's method for generating - replacement token IDs. - - This test verifies that the `_get_recovered_token_ids` method of the - TypicalAcceptanceSampler correctly identifies the token IDs to be used - as recovered token IDs based on the target probability distribution. - Specifically, it ensures that the method correctly identifies the - tokens with the highest probability for each sequence in the batch. - """ - set_random_seed(seed) - k = 10 - batch_size = 5 - vocab_size = 30_000 - torch.set_default_device(device) - typical_acceptance_sampler = get_acceptance_sampler(strict_mode=True) - typical_acceptance_sampler.init_gpu_tensors(device=device) - target_probs = torch.rand(batch_size, k, vocab_size, dtype=torch.float32) - expected_replacement_tokens = torch.argmax(target_probs, dim=-1) - actual_replacement_tokens = ( - typical_acceptance_sampler._get_recovered_token_ids(target_probs)) - assert torch.all(expected_replacement_tokens == actual_replacement_tokens) diff --git a/tests/spec_decode/__init__.py b/tests/spec_decode/__init__.py deleted file mode 100644 index e69de29bb2d1d..0000000000000 diff --git a/tests/spec_decode/conftest.py b/tests/spec_decode/conftest.py deleted file mode 100644 index 375b248ebedaa..0000000000000 --- a/tests/spec_decode/conftest.py +++ /dev/null @@ -1,12 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project -import pytest - - -@pytest.fixture(scope="function", autouse=True) -def use_v0_only(monkeypatch): - """ - Since this module is V0 only, set VLLM_USE_V1=0 for - all tests in the module. 
- """ - monkeypatch.setenv('VLLM_USE_V1', '0') diff --git a/tests/spec_decode/e2e/__init__.py b/tests/spec_decode/e2e/__init__.py deleted file mode 100644 index e69de29bb2d1d..0000000000000 diff --git a/tests/spec_decode/e2e/conftest.py b/tests/spec_decode/e2e/conftest.py deleted file mode 100644 index f3fe9db3f79ea..0000000000000 --- a/tests/spec_decode/e2e/conftest.py +++ /dev/null @@ -1,307 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -from collections.abc import Sequence -from itertools import cycle -from typing import Optional, Union - -import pytest -import torch - -from vllm import LLM, SamplingParams -from vllm.distributed import cleanup_dist_env_and_memory -from vllm.model_executor.utils import set_random_seed -from vllm.sequence import PromptLogprobs, SampleLogprobs - -from ...models.utils import (TokensTextLogprobs, - TokensTextLogprobsPromptLogprobs, - check_logprobs_close, check_outputs_equal) -from ...utils import RemoteOpenAIServer - -PROMPTS = [ - "Hello, my name is", - "The president of the United States is", - "The capital of France is", - "The future of AI is", - "San Francisco is know for its", - "Facebook was created in 2004 by", - "Curious George is a", - "Python 3.11 brings improvements to its", -] - - -@pytest.fixture -def test_llm_generator(common_llm_kwargs, per_test_common_llm_kwargs, - test_llm_kwargs, seed): - - def generate(): - kwargs = { - **common_llm_kwargs, - **per_test_common_llm_kwargs, - **test_llm_kwargs, - } - - llm = LLM(**kwargs) - - if seed is not None: - set_random_seed(seed) - - yield llm - - del llm - cleanup_dist_env_and_memory() - - return generate - - -def maybe_assert_ngram_worker(llm): - # Verify the proposer worker is ngram if ngram is specified. - if (llm.llm_engine.speculative_config is not None - and llm.llm_engine.speculative_config.method == "ngram"): - from vllm.spec_decode.ngram_worker import NGramWorker - assert isinstance( - llm.llm_engine.model_executor.driver_worker.proposer_worker, - NGramWorker) - - -def get_output_from_llm_generator( - llm_generator, prompts, - sampling_params) -> tuple[list[str], list[list[int]], float]: - tokens: list[str] = [] - token_ids: list[list[int]] = [] - acceptance_rate: float = -1.0 - for llm in llm_generator(): - maybe_assert_ngram_worker(llm) - - outputs = llm.generate(prompts, sampling_params, use_tqdm=True) - - token_ids = [output.outputs[0].token_ids for output in outputs] - tokens = [output.outputs[0].text for output in outputs] - - # Fetch acceptance rate if logging is enabled. - if stat_loggers := getattr(llm.llm_engine, "stat_loggers", None): - stat_logger = stat_loggers["prometheus"] - acceptance_rate = (stat_logger.metrics. - gauge_spec_decode_draft_acceptance_rate.labels( - **stat_logger.labels)._value.get()) - del llm - - return tokens, token_ids, acceptance_rate - - -def check_logprobs_correctness( - spec_outputs: Sequence[Union[TokensTextLogprobs, - TokensTextLogprobsPromptLogprobs]], - baseline_outputs: Sequence[Union[TokensTextLogprobs, - TokensTextLogprobsPromptLogprobs]], - disable_logprobs: bool = False, -): - """Compare sampled and prompt logprobs between baseline and spec decoding - """ - if not disable_logprobs: - return check_logprobs_close( - outputs_0_lst=baseline_outputs, - outputs_1_lst=spec_outputs, - name_0="org", - name_1="sd", - ) - - # Check correctness when disable_logprobs == True - for spec_output, baseline_output in zip(spec_outputs, baseline_outputs): - # Check generated token logprobs. 
- spec_logprobs = spec_output[2] - baseline_logprobs = baseline_output[2] - _check_logprobs_when_output_disabled(spec_logprobs, - baseline_logprobs, - is_prompt_logprobs=False) - - # Check prompt logprobs too, if they exist - if len(baseline_output) == 4: - assert len(spec_output) == 4 - spec_prompt_logprobs = spec_output[3] - baseline_prompt_logprobs = baseline_output[3] - _check_logprobs_when_output_disabled(spec_prompt_logprobs, - baseline_prompt_logprobs, - is_prompt_logprobs=True) - - -def _check_logprobs_when_output_disabled( - spec_logprobs: Union[Optional[PromptLogprobs], SampleLogprobs], - baseline_logprobs: Union[Optional[PromptLogprobs], SampleLogprobs], - is_prompt_logprobs: bool = False, -): - # Prompt logprobs are optional - if is_prompt_logprobs and baseline_logprobs is None: - assert spec_logprobs is None - return - - assert spec_logprobs is not None - assert baseline_logprobs is not None - assert len(spec_logprobs) == len(baseline_logprobs) - - # For each generated position of the sequence. - for pos, (spec_pos_logprobs, baseline_pos_logprobs) in enumerate( - zip(spec_logprobs, baseline_logprobs)): - - # First prompt logprob is expected to be None - if is_prompt_logprobs and baseline_pos_logprobs is None: - assert spec_pos_logprobs is None - assert pos == 0 - continue - - assert spec_pos_logprobs is not None - assert baseline_pos_logprobs is not None - - # When disabled, the 1 logprob is returned with dummy values for the - # score and rank, but the token id should match the baseline model - assert len(spec_pos_logprobs) == 1 - (spec_pos_logprob_token_id, - spec_pos_logprob) = next(iter(spec_pos_logprobs.items())) - assert spec_pos_logprob.rank == -1 - assert spec_pos_logprob.logprob == 0.0 - if isinstance(spec_pos_logprob_token_id, torch.Tensor): - spec_pos_logprob_token_id = spec_pos_logprob_token_id.item() - assert spec_pos_logprob_token_id in baseline_pos_logprobs - - -def run_equality_correctness_test( - vllm_runner, - common_llm_kwargs, - per_test_common_llm_kwargs, - baseline_llm_kwargs, - test_llm_kwargs, - batch_size: int, - max_output_len: int, - seed: Optional[int] = 0, - temperature: float = 0.0, - disable_seed: bool = False, - ignore_eos: bool = True, - ensure_all_accepted: bool = False, - expected_acceptance_rate: Optional[float] = None, - logprobs: Optional[int] = None, - prompt_logprobs: Optional[int] = None, - disable_logprobs: bool = False): - - org_args = { - **common_llm_kwargs, - **per_test_common_llm_kwargs, - **baseline_llm_kwargs, - } - - sd_args = { - **common_llm_kwargs, - **per_test_common_llm_kwargs, - **test_llm_kwargs, - } - - prompts = [prompt for prompt, _ in zip(cycle(PROMPTS), range(batch_size))] - - if disable_seed: - seed = None - - sampling_params = SamplingParams(temperature=temperature, - max_tokens=max_output_len, - seed=seed, - ignore_eos=ignore_eos, - logprobs=logprobs, - prompt_logprobs=prompt_logprobs) - - with vllm_runner(**org_args) as vllm_model: - org_outputs = vllm_model.generate_w_logprobs(prompts, sampling_params) - - with vllm_runner(**sd_args) as vllm_model: - if ensure_all_accepted or expected_acceptance_rate is not None: - # Force log interval to be 0 to catch all metrics. - stat_logger = vllm_model.model.llm_engine.stat_loggers[ - 'prometheus'] - stat_logger.local_interval = -100 - - sd_outputs = vllm_model.generate_w_logprobs(prompts, sampling_params) - - if ensure_all_accepted or expected_acceptance_rate is not None: - acceptance_rate = (stat_logger.metrics. 
- gauge_spec_decode_draft_acceptance_rate.labels( - **stat_logger.labels)._value.get()) - - if ensure_all_accepted: - assert True - # FIXME: ci fails to log acceptance rate. - # It works locally. - # assert acceptance_rate == 1.0 - - if expected_acceptance_rate is not None: - assert acceptance_rate >= expected_acceptance_rate - 1e-2 - - # Only pass token entries, not the logprobs - check_outputs_equal(outputs_0_lst=[out[0:2] for out in org_outputs], - outputs_1_lst=[out[0:2] for out in sd_outputs], - name_0="org", - name_1="sd") - - # Check logprobs if requested - if logprobs is not None or prompt_logprobs is not None: - check_logprobs_correctness(sd_outputs, org_outputs, disable_logprobs) - - -def run_equality_correctness_test_tp(model, - common_llm_kwargs, - per_test_common_llm_kwargs, - baseline_llm_kwargs, - test_llm_kwargs, - batch_size: int, - max_output_len: int, - seed: int = 0, - temperature: float = 0.0, - logprobs: Optional[int] = None): - """Helper method that compares the outputs of both the baseline LLM and - the test LLM. It asserts greedy equality, e.g. that the outputs are exactly - the same when temperature is zero. - """ - arg1 = common_llm_kwargs + per_test_common_llm_kwargs + baseline_llm_kwargs - arg2 = common_llm_kwargs + per_test_common_llm_kwargs + test_llm_kwargs - env1 = env2 = None - - max_wait_seconds = 240 - results = [] - - prompts = [prompt for prompt, _ in zip(cycle(PROMPTS), range(batch_size))] - for args, env in ((arg1, env1), (arg2, env2)): - with RemoteOpenAIServer(model, - args, - env_dict=env, - max_wait_seconds=max_wait_seconds) as server: - client = server.get_client() - - completion = client.completions.create(model=model, - prompt=prompts, - max_tokens=max_output_len, - seed=seed, - temperature=temperature, - logprobs=logprobs) - - results.append({ - "test": - "seeded_sampling", - "text": [choice.text for choice in completion.choices], - "logprobs": [choice.logprobs for choice in completion.choices], - "finish_reason": - [choice.finish_reason for choice in completion.choices], - "usage": - completion.usage, - }) - - n = len(results) // 2 - arg1_results = results[:n] - arg2_results = results[n:] - # Separate logprobs to avoid asserting exact equality. - arg1_logprobs = [r.pop("logprobs") for r in arg1_results] - arg2_logprobs = [r.pop("logprobs") for r in arg2_results] - - for arg1_result, arg2_result in zip(arg1_results, arg2_results): - assert arg1_result == arg2_result, ( - f"Results for {model=} are not the same with {arg1=} and {arg2=}. " - f"{arg1_result=} != {arg2_result=}") - if logprobs: - for logs1, logs2 in zip(arg1_logprobs, arg2_logprobs): - for l1, l2 in zip(logs1, logs2): - assert l1.tokens == l2.tokens diff --git a/tests/spec_decode/e2e/test_compatibility.py b/tests/spec_decode/e2e/test_compatibility.py deleted file mode 100644 index 6c453879a6a6a..0000000000000 --- a/tests/spec_decode/e2e/test_compatibility.py +++ /dev/null @@ -1,66 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -import pytest - -from vllm import SamplingParams - -from .conftest import get_output_from_llm_generator - - -@pytest.mark.parametrize("common_llm_kwargs", - [{ - "model": "meta-llama/Llama-3.2-1B-Instruct", - }]) -@pytest.mark.parametrize( - "per_test_common_llm_kwargs", - [ - { - # Speculative max model len > overridden max model len should raise. 
- "speculative_config": { - "model": "JackFram/llama-68m", - "num_speculative_tokens": 5, - "max_model_len": 129, - }, - "max_model_len": 128, - }, - { - # Speculative max model len > draft max model len should raise. - # https://huggingface.co/JackFram/llama-68m/blob/3b606af5198a0b26762d589a3ee3d26ee6fa6c85/config.json#L12 - "speculative_config": { - "model": "JackFram/llama-68m", - "num_speculative_tokens": 5, - "max_model_len": 2048 + 1, - }, - }, - { - # Speculative max model len > target max model len should raise. - # https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct/blob/9213176726f574b556790deb65791e0c5aa438b6/config.json#L18 - "speculative_config": { - "model": "JackFram/llama-68m", - "num_speculative_tokens": 5, - "max_model_len": 131072 + 1, - }, - }, - ]) -@pytest.mark.parametrize("test_llm_kwargs", [{}]) -@pytest.mark.parametrize("seed", [1]) -def test_spec_decode_xfail_spec_max_model_len(test_llm_generator): - """Verify that speculative decoding validates speculative_max_model_len. - """ - output_len = 128 - temperature = 0.0 - - prompts = [ - "Hello, my name is", - ] - - sampling_params = SamplingParams( - max_tokens=output_len, - ignore_eos=True, - temperature=temperature, - ) - - with pytest.raises(ValueError, match="cannot be larger than"): - get_output_from_llm_generator(test_llm_generator, prompts, - sampling_params) diff --git a/tests/spec_decode/e2e/test_eagle_correctness.py b/tests/spec_decode/e2e/test_eagle_correctness.py deleted file mode 100644 index 7c369feec4152..0000000000000 --- a/tests/spec_decode/e2e/test_eagle_correctness.py +++ /dev/null @@ -1,480 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project -"""This docstring details important information on the testing methodology. - -Most of the tests rely on "greedy equality", where we expect the output of -speculative decoding on a sequence to exactly match the output of normal non- -speculative decoding. - -Since speculative decoding with rejection sampling guarantees that the output -distribution matches the target model's output distribution (up to hardware -numerics, see https://arxiv.org/pdf/2302.01318.pdf), we can expect greedy -equality. - -However, we still need to verify below scenario could be passed: - * Batch size 1 greedy equality - * Batch size >1 greedy equality - * Test greedy equality under preemption - * Test greedy equality under various number of speculative tokens. - -With those tests, we can say at least, EAGLE would not break the -correctness for the target model outputs. -""" - -import pytest - -from .conftest import run_equality_correctness_test - -# main model -MAIN_MODEL = "JackFram/llama-68m" - -# speculative model -SPEC_MODEL = "abhigoyal/vllm-eagle-llama-68m-random" - -# max. number of speculative tokens: this corresponds to -# num_heads in the config.json of the speculator model. -MAX_SPEC_TOKENS = 4 - -# precision -PRECISION = "float32" - - -@pytest.mark.parametrize( - "common_llm_kwargs", - [{ - # Skip cuda graph recording for fast test. - "enforce_eager": True, - - # Print spec metrics. 
- "disable_log_stats": False, - - # Precision - "dtype": PRECISION, - - # Main model - "model_name": MAIN_MODEL, - }]) -@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}]) -@pytest.mark.parametrize("baseline_llm_kwargs", [{}]) -@pytest.mark.parametrize("test_llm_kwargs", [ - { - "speculative_config": { - "model": SPEC_MODEL, - "num_speculative_tokens": MAX_SPEC_TOKENS, - }, - }, -]) -@pytest.mark.parametrize("output_len", [ - 128, -]) -@pytest.mark.parametrize("batch_size", [1, 32]) -@pytest.mark.parametrize("seed", [1]) -def test_eagle_e2e_greedy_correctness(vllm_runner, common_llm_kwargs, - per_test_common_llm_kwargs, - baseline_llm_kwargs, test_llm_kwargs, - batch_size: int, output_len: int, - seed: int): - - run_equality_correctness_test(vllm_runner, common_llm_kwargs, - per_test_common_llm_kwargs, - baseline_llm_kwargs, test_llm_kwargs, - batch_size, output_len, seed) - - -@pytest.mark.parametrize( - "common_llm_kwargs", - [{ - # Skip cuda graph recording for fast test. - "enforce_eager": True, - - # Print spec metrics. - "disable_log_stats": False, - - # Precision - "dtype": PRECISION, - - # Main model - "model_name": MAIN_MODEL, - }]) -@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}]) -@pytest.mark.parametrize("baseline_llm_kwargs", [{}]) -@pytest.mark.parametrize("test_llm_kwargs", [{ - "speculative_config": { - "model": SPEC_MODEL, - "num_speculative_tokens": MAX_SPEC_TOKENS, - "disable_logprobs": False, - }, -}, { - "speculative_config": { - "model": SPEC_MODEL, - "num_speculative_tokens": MAX_SPEC_TOKENS, - "disable_logprobs": True, - }, -}]) -@pytest.mark.parametrize("output_len", [ - 128, -]) -@pytest.mark.parametrize("batch_size", [8]) -@pytest.mark.parametrize("seed", [1]) -@pytest.mark.parametrize("logprobs", [1, 6]) -def test_eagle_e2e_greedy_logprobs(vllm_runner, common_llm_kwargs, - per_test_common_llm_kwargs, - baseline_llm_kwargs, test_llm_kwargs, - batch_size: int, output_len: int, seed: int, - logprobs: int): - - run_equality_correctness_test( - vllm_runner, - common_llm_kwargs, - per_test_common_llm_kwargs, - baseline_llm_kwargs, - test_llm_kwargs, - batch_size, - output_len, - seed, - logprobs=logprobs, - prompt_logprobs=logprobs, - disable_logprobs=test_llm_kwargs["speculative_config"] - ["disable_logprobs"]) - - -@pytest.mark.parametrize( - "common_llm_kwargs", - [{ - "enforce_eager": False, - - # Print spec metrics. - "disable_log_stats": False, - - # Precision - "dtype": PRECISION, - - # Main model - "model_name": MAIN_MODEL, - }]) -@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}]) -@pytest.mark.parametrize("baseline_llm_kwargs", [{}]) -@pytest.mark.parametrize("test_llm_kwargs", [ - { - "speculative_config": { - "model": SPEC_MODEL, - "num_speculative_tokens": MAX_SPEC_TOKENS, - }, - }, -]) -@pytest.mark.parametrize("output_len", [ - 128, -]) -@pytest.mark.parametrize("batch_size", [1, 32]) -@pytest.mark.parametrize("seed", [1]) -def test_eagle_e2e_greedy_correctness_cuda_graph( - vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs, - baseline_llm_kwargs, test_llm_kwargs, batch_size: int, output_len: int, - seed: int): - """Verify greedy equality with cuda graph enabled and different - batch sizes.""" - run_equality_correctness_test(vllm_runner, common_llm_kwargs, - per_test_common_llm_kwargs, - baseline_llm_kwargs, test_llm_kwargs, - batch_size, output_len, seed) - - -@pytest.mark.parametrize( - "common_llm_kwargs", - [{ - "block_size": 8, - # 2 for small prompt, 256//8 for generated. 
- "num_gpu_blocks_override": 2 + 256 // 8, - "max_model_len": (2 + 256 // 8) * 8, - - # Skip cuda graph recording for fast test. - "enforce_eager": True, - - # Precision - "dtype": PRECISION, - - # Main model - "model_name": MAIN_MODEL, - }]) -@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}]) -@pytest.mark.parametrize("baseline_llm_kwargs", [{}]) -@pytest.mark.parametrize("test_llm_kwargs", [ - { - "speculative_config": { - "model": SPEC_MODEL, - "num_speculative_tokens": MAX_SPEC_TOKENS, - }, - }, -]) -@pytest.mark.parametrize( - "output_len", - [ - # Use small output len for fast test. - 128, - ]) -@pytest.mark.parametrize("batch_size", [4]) -@pytest.mark.parametrize("seed", [1]) -def test_eagle_e2e_greedy_correctness_with_preemption( - vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs, - baseline_llm_kwargs, test_llm_kwargs, batch_size: int, output_len: int, - seed: int): - """Verify greedy equality, even when some sequences are preempted mid- - generation. - """ - run_equality_correctness_test(vllm_runner, common_llm_kwargs, - per_test_common_llm_kwargs, - baseline_llm_kwargs, test_llm_kwargs, - batch_size, output_len, seed) - - -@pytest.mark.parametrize( - "common_llm_kwargs", - [{ - # Skip cuda graph recording for fast test. - "enforce_eager": True, - - # Precision - "dtype": PRECISION, - - # Main model - "model_name": MAIN_MODEL, - }]) -@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}]) -@pytest.mark.parametrize("baseline_llm_kwargs", [{}]) -@pytest.mark.parametrize( - "test_llm_kwargs", - [ - { - "speculative_config": { - "model": SPEC_MODEL, - "num_speculative_tokens": k, - }, - } - # Try a range of num. speculative tokens - for k in range(1, 1 + MAX_SPEC_TOKENS) - ]) -@pytest.mark.parametrize("batch_size", [2]) -@pytest.mark.parametrize( - "output_len", - [ - # Use smaller output len for fast test. - 32, - ]) -@pytest.mark.parametrize("seed", [1]) -def test_eagle_different_k(vllm_runner, common_llm_kwargs, - per_test_common_llm_kwargs, baseline_llm_kwargs, - test_llm_kwargs, batch_size: int, output_len: int, - seed: int): - """Verify that eagle speculative decoding produces exact equality - to without spec decode with different values of num_speculative_tokens. - """ - run_equality_correctness_test(vllm_runner, common_llm_kwargs, - per_test_common_llm_kwargs, - baseline_llm_kwargs, test_llm_kwargs, - batch_size, output_len, seed) - - -@pytest.mark.parametrize( - "common_llm_kwargs", - [{ - # Skip cuda graph recording for fast test. - "enforce_eager": True, - - # Precision - "dtype": PRECISION, - - # Main model - "model_name": MAIN_MODEL, - }]) -@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}]) -@pytest.mark.parametrize("baseline_llm_kwargs", [{}]) -@pytest.mark.parametrize("test_llm_kwargs", [{ - "speculative_config": { - "model": SPEC_MODEL, - "num_speculative_tokens": MAX_SPEC_TOKENS, - "disable_by_batch_size": 4, - }, -}]) -@pytest.mark.parametrize("batch_size", [1, 5]) -@pytest.mark.parametrize( - "output_len", - [ - # Use smaller output len for fast test. - 32, - ]) -@pytest.mark.parametrize("seed", [1]) -def test_eagle_disable_queue(vllm_runner, common_llm_kwargs, - per_test_common_llm_kwargs, baseline_llm_kwargs, - test_llm_kwargs, batch_size: int, output_len: int, - seed: int): - """Verify that eagle speculative decoding produces exact equality - to without spec decode when speculation is disabled for large - batch sizes. 
- """ - run_equality_correctness_test(vllm_runner, common_llm_kwargs, - per_test_common_llm_kwargs, - baseline_llm_kwargs, test_llm_kwargs, - batch_size, output_len, seed) - - -@pytest.mark.parametrize( - "common_llm_kwargs", - [{ - # Skip cuda graph recording for fast test. - "enforce_eager": True, - - # Print spec metrics. - "disable_log_stats": False, - - # Precision - "dtype": "float16", - - # Main model - "model_name": "meta-llama/Llama-2-7b-chat-hf", - }]) -@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}]) -@pytest.mark.parametrize("baseline_llm_kwargs", [{}]) -@pytest.mark.parametrize("test_llm_kwargs", [ - { - "speculative_config": { - "model": "yuhuili/EAGLE-llama2-chat-7B", - "num_speculative_tokens": MAX_SPEC_TOKENS, - }, - }, -]) -@pytest.mark.parametrize( - "output_len", - [ - # Use smaller output len for fast test. - 32, - ]) -@pytest.mark.parametrize("batch_size", [1, 5]) -@pytest.mark.parametrize("seed", [1]) -def test_llama2_eagle_e2e_greedy_correctness(vllm_runner, common_llm_kwargs, - per_test_common_llm_kwargs, - baseline_llm_kwargs, - test_llm_kwargs, batch_size: int, - output_len: int, seed: int): - - run_equality_correctness_test(vllm_runner, - common_llm_kwargs, - per_test_common_llm_kwargs, - baseline_llm_kwargs, - test_llm_kwargs, - batch_size, - output_len, - seed, - temperature=0.0) - - -@pytest.mark.parametrize( - "common_llm_kwargs", - [{ - # 2 for small prompt, 256//16 for generated. - "num_gpu_blocks_override": 2 + 256 // 16, - "max_model_len": (2 + 256 // 16) * 16, - - # Skip cuda graph recording for fast test. - "enforce_eager": True, - - # Print spec metrics. - "disable_log_stats": False, - - # Precision - "dtype": "float16", - - # Main model - "model_name": "meta-llama/Meta-Llama-3-8B-Instruct", - }]) -@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}]) -@pytest.mark.parametrize("baseline_llm_kwargs", [{}]) -@pytest.mark.parametrize("test_llm_kwargs", [ - { - "speculative_config": { - "model": "yuhuili/EAGLE-LLaMA3-Instruct-8B", - "num_speculative_tokens": MAX_SPEC_TOKENS, - }, - }, -]) -@pytest.mark.parametrize( - "output_len", - [ - # Use smaller output len for fast test. - 32, - ]) -@pytest.mark.parametrize("batch_size", [1, 5]) -@pytest.mark.parametrize("seed", [1]) -def test_llama3_eagle_e2e_greedy_correctness(vllm_runner, common_llm_kwargs, - per_test_common_llm_kwargs, - baseline_llm_kwargs, - test_llm_kwargs, batch_size: int, - output_len: int, seed: int): - - run_equality_correctness_test(vllm_runner, - common_llm_kwargs, - per_test_common_llm_kwargs, - baseline_llm_kwargs, - test_llm_kwargs, - batch_size, - output_len, - seed, - temperature=0.0) - - -@pytest.mark.parametrize( - "common_llm_kwargs", - [{ - # 2 for small prompt, 256//16 for generated. - "num_gpu_blocks_override": 2 + 256 // 16, - "max_model_len": (2 + 256 // 16) * 16, - - # Skip cuda graph recording for fast test. - "enforce_eager": True, - - # Print spec metrics. - "disable_log_stats": False, - - # Precision - "dtype": "float16", - - # Main model - "model_name": "Qwen/Qwen2-7B-Instruct", - }]) -@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}]) -@pytest.mark.parametrize("baseline_llm_kwargs", [{}]) -@pytest.mark.parametrize("test_llm_kwargs", [ - { - "speculative_config": { - "model": "yuhuili/EAGLE-Qwen2-7B-Instruct", - "num_speculative_tokens": MAX_SPEC_TOKENS, - }, - }, -]) -@pytest.mark.parametrize( - "output_len", - [ - # Use smaller output len for fast test. 
- 32, - ]) -@pytest.mark.parametrize("batch_size", [1, 5]) -@pytest.mark.parametrize("seed", [1]) -def test_qwen2_eagle_e2e_greedy_correctness(vllm_runner, common_llm_kwargs, - per_test_common_llm_kwargs, - baseline_llm_kwargs, - test_llm_kwargs, batch_size: int, - output_len: int, seed: int): - - run_equality_correctness_test(vllm_runner, - common_llm_kwargs, - per_test_common_llm_kwargs, - baseline_llm_kwargs, - test_llm_kwargs, - batch_size, - output_len, - seed, - temperature=0.0) - - -if __name__ == "__main__": - import pytest - pytest.main([__file__]) diff --git a/tests/spec_decode/e2e/test_integration.py b/tests/spec_decode/e2e/test_integration.py deleted file mode 100644 index f15a9224c0030..0000000000000 --- a/tests/spec_decode/e2e/test_integration.py +++ /dev/null @@ -1,161 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project -"""Tests which cover integration of the speculative decoding framework with -other features, e.g. cuda graphs. -""" - -import pytest - -from .conftest import run_equality_correctness_test - -MAIN_MODEL = "JackFram/llama-68m" - - -@pytest.mark.parametrize( - "common_llm_kwargs", - [{ - "model_name": "JackFram/llama-68m", - - # Verify equality when cuda graphs allowed. - "enforce_eager": False, - - # The original model is float32, keep it for numerical stability. - "dtype": "float32", - }]) -@pytest.mark.parametrize( - "per_test_common_llm_kwargs", - [ - { - # Identical models. - "speculative_config": { - "model": "JackFram/llama-68m", - "num_speculative_tokens": 5, - }, - }, - ]) -@pytest.mark.parametrize("baseline_llm_kwargs", [{}]) -@pytest.mark.parametrize("test_llm_kwargs", [{}]) -@pytest.mark.parametrize("batch_size", [8]) -@pytest.mark.parametrize("output_len", [32]) -@pytest.mark.parametrize("seed", [1]) -def test_spec_decode_cuda_graph(vllm_runner, common_llm_kwargs, - per_test_common_llm_kwargs, - baseline_llm_kwargs, test_llm_kwargs, - batch_size: int, output_len: int, seed: int): - """Verify spec decode equality when cuda graphs are enabled. - """ - run_equality_correctness_test(vllm_runner, - common_llm_kwargs, - per_test_common_llm_kwargs, - baseline_llm_kwargs, - test_llm_kwargs, - batch_size, - max_output_len=output_len, - seed=seed, - temperature=0.0) - - -@pytest.mark.parametrize( - "common_llm_kwargs", - [{ - "model_name": "JackFram/llama-160m", - - # Skip cuda graph recording for fast test. - "enforce_eager": True, - - # The original model is float32, keep it for numerical stability. 
- "dtype": "float32", - }]) -@pytest.mark.parametrize("per_test_common_llm_kwargs", []) -@pytest.mark.parametrize( - "test_llm_kwargs", - [ - # Explicitly specify draft model quantization - { - "speculative_config": { - "model": "LnL-AI/TinyLlama-1.1B-Chat-v1.0-GPTQ-4bit", - "num_speculative_tokens": 5, - "quantization": "gptq", - }, - }, - # Explicitly specify GPTQ-based draft model to use marlin quantization - { - "speculative_config": { - "model": "LnL-AI/TinyLlama-1.1B-Chat-v1.0-GPTQ-4bit", - "num_speculative_tokens": 5, - "quantization": "marlin", - }, - }, - # Not explicitly specify draft model quantization - { - "speculative_config": { - "model": "LnL-AI/TinyLlama-1.1B-Chat-v1.0-GPTQ-4bit", - "num_speculative_tokens": 5, - "quantization": None, - }, - }, - ]) -@pytest.mark.parametrize("baseline_llm_kwargs", [{}]) -@pytest.mark.parametrize("batch_size", [2]) -@pytest.mark.parametrize("seed", [1]) -def test_speculative_model_quantization_config(vllm_runner, common_llm_kwargs, - per_test_common_llm_kwargs, - baseline_llm_kwargs, - test_llm_kwargs, - batch_size: int, seed: int): - """Verify spec decode works well with draft model quantization configs. - """ - run_equality_correctness_test(vllm_runner, - common_llm_kwargs, - per_test_common_llm_kwargs, - baseline_llm_kwargs, - test_llm_kwargs, - batch_size, - max_output_len=32, - seed=seed, - temperature=0.0) - - -@pytest.mark.parametrize( - "common_llm_kwargs", - [{ - "model_name": MAIN_MODEL, - - # Skip cuda graph recording for fast test. - "enforce_eager": True, - - # The original model is float32, keep it for numerical stability. - "dtype": "float32", - }]) -@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}]) -@pytest.mark.parametrize("baseline_llm_kwargs", [{}]) -@pytest.mark.parametrize("test_llm_kwargs", [{ - "speculative_config": { - "model": "JackFram/llama-68m", - "num_speculative_tokens": 3, - "disable_mqa_scorer": True, - }, -}]) -@pytest.mark.parametrize("batch_size", [1, 5]) -@pytest.mark.parametrize( - "output_len", - [ - # Use smaller output len for fast test. - 32, - ]) -@pytest.mark.parametrize("seed", [1]) -def test_mqa_scorer(vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs, - baseline_llm_kwargs, test_llm_kwargs, batch_size: int, - output_len: int, seed: int): - """Verify that speculative decoding generates the same output - with batch expansion scorer and mqa scorer. - """ - run_equality_correctness_test(vllm_runner, - common_llm_kwargs, - per_test_common_llm_kwargs, - baseline_llm_kwargs, - test_llm_kwargs, - batch_size, - max_output_len=output_len, - seed=seed, - temperature=0.0) diff --git a/tests/spec_decode/e2e/test_integration_dist_tp2.py b/tests/spec_decode/e2e/test_integration_dist_tp2.py deleted file mode 100644 index a18be80c50dd9..0000000000000 --- a/tests/spec_decode/e2e/test_integration_dist_tp2.py +++ /dev/null @@ -1,247 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project -"""Tests which cover integration of the speculative decoding framework with -tensor parallelism. -""" - -import json -from typing import Optional - -import pytest -import torch - -from vllm.platforms import current_platform - -from .conftest import run_equality_correctness_test_tp - - -@pytest.mark.skipif(torch.cuda.device_count() < 2, - reason="Need at least 2 GPUs to run the test.") -@pytest.mark.parametrize( - "common_llm_kwargs", - [[ - # Skip cuda graph recording for fast test. 
- "--enforce-eager", - "--tensor-parallel-size", - "2" - ]]) -@pytest.mark.parametrize("per_test_common_llm_kwargs", [[]]) -@pytest.mark.parametrize("baseline_llm_kwargs", [[]]) -@pytest.mark.parametrize("test_llm_kwargs", [ - [ - "--speculative_config", - json.dumps({ - "model": "JackFram/llama-68m", - "num_speculative_tokens": 3, - }), - ], - [ - "--speculative_config", - json.dumps({ - "model": "ngram", - "num_speculative_tokens": 5, - "prompt_lookup_max": 3, - }), - ], -]) -@pytest.mark.parametrize("batch_size", [2]) -@pytest.mark.parametrize( - "output_len", - [ - # Use smaller output len for fast test. - 32, - ]) -@pytest.mark.parametrize("seed", [1]) -def test_target_model_tp_gt_1(common_llm_kwargs, per_test_common_llm_kwargs, - baseline_llm_kwargs, test_llm_kwargs, - batch_size: int, output_len: int, seed: int): - """Verify greedy equality when tensor parallelism is used. - """ - if current_platform.is_rocm(): - pytest.skip("hip is not well-supported yet") - run_equality_correctness_test_tp("JackFram/llama-68m", - common_llm_kwargs, - per_test_common_llm_kwargs, - baseline_llm_kwargs, - test_llm_kwargs, - batch_size, - output_len, - seed, - temperature=0.0) - - -@pytest.mark.skipif(torch.cuda.device_count() < 2, - reason="Need at least 2 GPUs to run the test.") -@pytest.mark.parametrize( - "common_llm_kwargs", - [[ - # Skip cuda graph recording for fast test. - "--enforce-eager", - "--tensor_parallel_size", - "2", - - # precision - "--dtype", - "bfloat16", - ]]) -@pytest.mark.parametrize("per_test_common_llm_kwargs", [[]]) -@pytest.mark.parametrize("baseline_llm_kwargs", [[]]) -@pytest.mark.parametrize( - "model, test_llm_kwargs", - [("JackFram/llama-68m", [ - "--speculative_config", - json.dumps({ - "model": "JackFram/llama-68m", - "num_speculative_tokens": 5, - "draft_tensor_parallel_size": 1, - }), - ]), - ("ibm-granite/granite-3b-code-instruct", [ - "--speculative_config", - json.dumps({ - "model": "ibm-granite/granite-3b-code-instruct", - "num_speculative_tokens": 5, - "draft_tensor_parallel_size": 1, - }), - ])]) -@pytest.mark.parametrize("batch_size", [2]) -@pytest.mark.parametrize("seed", [1]) -def test_draft_model_tp_lt_target_model_tp2(model, common_llm_kwargs, - per_test_common_llm_kwargs, - baseline_llm_kwargs, - test_llm_kwargs, batch_size: int, - seed: int): - """Verify spec decode works well with smaller tp for draft models. - """ - run_equality_correctness_test_tp(model, - common_llm_kwargs, - per_test_common_llm_kwargs, - baseline_llm_kwargs, - test_llm_kwargs, - batch_size, - max_output_len=32, - seed=seed, - temperature=0.0) - - -@pytest.mark.skipif(torch.cuda.device_count() < 2, - reason="Need at least 2 GPUs to run the test.") -@pytest.mark.parametrize( - "common_llm_kwargs", - [[ - # Skip cuda graph recording for fast test. 
- "--enforce-eager", - "--tensor_parallel_size", - "2", - - # precision - "--dtype", - "bfloat16", - ]]) -@pytest.mark.parametrize( - "per_test_common_llm_kwargs", - [["--enable-chunked-prefill", "False"], - [ - "--enable-chunked-prefill", "True", "--max-num-batched-tokens", "4", - "--max-num-seqs", "4" - ]]) -@pytest.mark.parametrize("baseline_llm_kwargs", [[]]) -@pytest.mark.parametrize("model, test_llm_kwargs", - [("JackFram/llama-68m", [ - "--speculative_config", - json.dumps({ - "model": "JackFram/llama-68m", - "num_speculative_tokens": 3, - }), - ]), - ("JackFram/llama-68m", [ - "--speculative_config", - json.dumps({ - "model": "JackFram/llama-68m", - "num_speculative_tokens": 3, - "draft_tensor_parallel_size": 1, - }), - ])]) -@pytest.mark.parametrize("logprobs", [None]) -@pytest.mark.parametrize("batch_size", [2]) -@pytest.mark.parametrize("seed", [1]) -def test_spec_decode_chunked_prefill_tp2(model, common_llm_kwargs, - per_test_common_llm_kwargs, - baseline_llm_kwargs, test_llm_kwargs, - logprobs: Optional[int], - batch_size: int, seed: int): - """Verify spec decode works well with same and different TP size for - the draft model with chunked prefill. - """ - run_equality_correctness_test_tp(model, - common_llm_kwargs, - per_test_common_llm_kwargs, - baseline_llm_kwargs, - test_llm_kwargs, - batch_size, - max_output_len=32, - seed=seed, - temperature=0.0, - logprobs=logprobs) - - -@pytest.mark.skipif(torch.cuda.device_count() < 2, - reason="Need at least 2 GPUs to run the test.") -@pytest.mark.parametrize( - "common_llm_kwargs", - [[ - # Skip cuda graph recording for fast test. - "--enforce-eager", - "--tensor_parallel_size", - "2", - - # precision - "--dtype", - "bfloat16", - ]]) -@pytest.mark.parametrize( - "per_test_common_llm_kwargs", - [["--enable-chunked-prefill", "False"], - [ - "--enable-chunked-prefill", "True", "--max-num-batched-tokens", "4", - "--max-num-seqs", "4" - ]]) -@pytest.mark.parametrize("baseline_llm_kwargs", [[]]) -@pytest.mark.parametrize("model, test_llm_kwargs", - [("JackFram/llama-68m", [ - "--speculative_config", - json.dumps({ - "model": "JackFram/llama-68m", - "num_speculative_tokens": 3, - "disable_logprobs": False, - }), - ]), - ("JackFram/llama-68m", [ - "--speculative_config", - json.dumps({ - "model": "JackFram/llama-68m", - "num_speculative_tokens": 3, - "draft_tensor_parallel_size": 1, - "disable_logprobs": False, - }), - ])]) -@pytest.mark.parametrize("logprobs", [2]) -@pytest.mark.parametrize("batch_size", [2]) -@pytest.mark.parametrize("seed", [1]) -def test_spec_decode_chunked_prefill_tp2_with_logprobs( - model, common_llm_kwargs, per_test_common_llm_kwargs, - baseline_llm_kwargs, test_llm_kwargs, logprobs: Optional[int], - batch_size: int, seed: int): - """Verify spec decode works well with same and different TP size for - the draft model with chunked prefill. 
- """ - run_equality_correctness_test_tp(model, - common_llm_kwargs, - per_test_common_llm_kwargs, - baseline_llm_kwargs, - test_llm_kwargs, - batch_size, - max_output_len=32, - seed=seed, - temperature=0.0, - logprobs=logprobs) diff --git a/tests/spec_decode/e2e/test_integration_dist_tp4.py b/tests/spec_decode/e2e/test_integration_dist_tp4.py deleted file mode 100644 index 039eec8fd2cc9..0000000000000 --- a/tests/spec_decode/e2e/test_integration_dist_tp4.py +++ /dev/null @@ -1,123 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project -"""Tests which cover integration of the speculative decoding framework with -tensor parallelism. -""" - -import json - -import openai -import pytest -import torch - -from .conftest import run_equality_correctness_test_tp - -MAIN_MODEL = "JackFram/llama-68m" -SPEC_MODEL = "JackFram/llama-68m" - - -@pytest.mark.skipif(torch.cuda.device_count() < 4, - reason="Need at least 4 GPUs to run the test.") -@pytest.mark.parametrize( - "common_llm_kwargs", - [[ - # Skip cuda graph recording for fast test. - "--enforce_eager", - "--tensor-parallel-size", - "4", - ]]) -@pytest.mark.parametrize("per_test_common_llm_kwargs", [ - [], -]) -@pytest.mark.parametrize("baseline_llm_kwargs", [[]]) -@pytest.mark.parametrize( - "test_llm_kwargs", - [ - #TODO(wooyeon): add spec_draft_dp=2 case - [ - "--speculative_config", - json.dumps({ - "model": f"{SPEC_MODEL}", - "num_speculative_tokens": 5, - "draft_tensor_parallel_size": 1, - }), - ], - ]) -@pytest.mark.parametrize("batch_size", [2]) -@pytest.mark.parametrize("seed", [1]) -def test_draft_model_tp_lt_target_model_tp4(common_llm_kwargs, - per_test_common_llm_kwargs, - baseline_llm_kwargs, - test_llm_kwargs, batch_size: int, - seed: int): - """Verify spec decode works well with smaller tp for draft models. - """ - run_equality_correctness_test_tp(MAIN_MODEL, - common_llm_kwargs, - per_test_common_llm_kwargs, - baseline_llm_kwargs, - test_llm_kwargs, - batch_size, - max_output_len=32, - seed=seed, - temperature=0.0) - - -@pytest.mark.skipif(torch.cuda.device_count() < 4, - reason="Need at least 4 GPUs to run the test.") -@pytest.mark.parametrize( - "common_llm_kwargs", - [[ - - # Skip cuda graph recording for fast test. - "--enforce-eager", - "--tensor-parallel-size", - "4", - ]]) -@pytest.mark.parametrize("per_test_common_llm_kwargs", [[]]) -@pytest.mark.parametrize("baseline_llm_kwargs", [[]]) -@pytest.mark.parametrize( - "test_llm_kwargs", - [ - [ - # Artificially limit the draft model max model len; this forces vLLM - # to skip speculation once the sequences grow beyond 32-k tokens. - "--speculative_config", - json.dumps({ - "model": f"{SPEC_MODEL}", - "num_speculative_tokens": 5, - "max_model_len": 32, - }), - ], - ]) -@pytest.mark.parametrize("batch_size", [8]) -@pytest.mark.parametrize( - "output_len", - [ - # This must be a good bit larger than speculative_max_model_len so that - # we can test the case where all seqs are skipped, but still small to - # ensure fast test. - 64, - ]) -@pytest.mark.parametrize("seed", [1]) -def test_skip_speculation(common_llm_kwargs, per_test_common_llm_kwargs, - baseline_llm_kwargs, test_llm_kwargs, - batch_size: int, output_len: int, seed: int): - """Verify job failure with RuntimeError when all sequences skip speculation. - We do this by setting the max model len of the draft model to an - artificially low value, such that when the sequences grow beyond it, they - are skipped in speculative decoding. 
- - TODO: fix it to pass without raising Error. (#5814) - """ - with pytest.raises( - (openai.APIConnectionError, openai.InternalServerError)): - run_equality_correctness_test_tp(MAIN_MODEL, - common_llm_kwargs, - per_test_common_llm_kwargs, - baseline_llm_kwargs, - test_llm_kwargs, - batch_size, - output_len, - seed, - temperature=0.0) diff --git a/tests/spec_decode/e2e/test_logprobs.py b/tests/spec_decode/e2e/test_logprobs.py deleted file mode 100644 index 4de7ee05605ad..0000000000000 --- a/tests/spec_decode/e2e/test_logprobs.py +++ /dev/null @@ -1,315 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -from itertools import cycle - -import pytest - -from vllm import SamplingParams - -from ..utils import maybe_enable_chunked_prefill -from .conftest import run_equality_correctness_test - - -@pytest.mark.parametrize( - "common_llm_kwargs", - [{ - "model_name": "JackFram/llama-160m", - - # Skip cuda graph recording for fast test. - "enforce_eager": True, - - # The original model is float32, keep it for numerical stability. - "dtype": "float32", - }]) -@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}]) -@pytest.mark.parametrize("baseline_llm_kwargs", [{}]) -@pytest.mark.parametrize("test_llm_kwargs", [{ - "speculative_config": { - "model": "JackFram/llama-68m", - "num_speculative_tokens": 3, - "disable_logprobs": False, - }, -}, { - "speculative_config": { - "model": "JackFram/llama-68m", - "num_speculative_tokens": 3, - "disable_logprobs": True, - }, -}]) -@pytest.mark.parametrize("batch_size", [8]) -@pytest.mark.parametrize( - "output_len", - [ - # Use smaller output len for fast test. - 7, - ]) -@pytest.mark.parametrize("seed", [1]) -@pytest.mark.parametrize("logprobs", [1, 6]) -@pytest.mark.parametrize("prefill_chunk_size", [-1, 4, 12]) -def test_logprobs_equality(vllm_runner, common_llm_kwargs, - per_test_common_llm_kwargs, baseline_llm_kwargs, - test_llm_kwargs, batch_size: int, output_len: int, - seed: int, logprobs: int, prefill_chunk_size: int): - """Verify output logprobs are equal with and without speculative decoding, - as well as with and without chunked prefill. - """ - maybe_enable_chunked_prefill(prefill_chunk_size, common_llm_kwargs) - run_equality_correctness_test( - vllm_runner, - common_llm_kwargs, - per_test_common_llm_kwargs, - baseline_llm_kwargs, - test_llm_kwargs, - batch_size, - output_len, - seed, - temperature=0.0, - logprobs=logprobs, - prompt_logprobs=logprobs, - disable_logprobs=test_llm_kwargs["speculative_config"] - ["disable_logprobs"]) - - -@pytest.mark.parametrize( - "common_llm_kwargs", - [{ - "model_name": "JackFram/llama-68m", - - # Skip cuda graph recording for fast test. - "enforce_eager": True, - - # The original model is float32, keep it for numerical stability. - "dtype": "float32", - }]) -@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}]) -@pytest.mark.parametrize("baseline_llm_kwargs", [{}]) -@pytest.mark.parametrize("test_llm_kwargs", [{ - "speculative_config": { - "model": "JackFram/llama-160m", - "num_speculative_tokens": 3, - "disable_logprobs": False, - }, -}, { - "speculative_config": { - "model": "JackFram/llama-160m", - "num_speculative_tokens": 6, - "disable_logprobs": False, - }, -}]) -@pytest.mark.parametrize("batch_size", [8]) -@pytest.mark.parametrize( - "output_len", - [ - # Use smaller output len for fast test. 
- 32, - ]) -@pytest.mark.parametrize("seed", [1]) -@pytest.mark.parametrize("logprobs", [1, 6]) -def test_logprobs_different_k(vllm_runner, common_llm_kwargs, - per_test_common_llm_kwargs, baseline_llm_kwargs, - test_llm_kwargs, batch_size: int, - output_len: int, seed: int, logprobs: int): - """Veriy logprob greedy equality with different speculation lens. - """ - run_equality_correctness_test( - vllm_runner, - common_llm_kwargs, - per_test_common_llm_kwargs, - baseline_llm_kwargs, - test_llm_kwargs, - batch_size, - output_len, - seed, - temperature=0.0, - logprobs=logprobs, - disable_logprobs=test_llm_kwargs["speculative_config"] - ["disable_logprobs"]) - - -@pytest.mark.parametrize( - "common_llm_kwargs", - [{ - "model_name": "JackFram/llama-68m", - - # Skip cuda graph recording for fast test. - "enforce_eager": True, - - # The original model is float32, keep it for numerical stability. - "dtype": "float32", - }]) -@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}]) -@pytest.mark.parametrize("baseline_llm_kwargs", [{}]) -@pytest.mark.parametrize( - "test_llm_kwargs", - [{ - "speculative_config": { - "model": "JackFram/llama-160m", - "num_speculative_tokens": 3, - "disable_logprobs": False, - # Artificially limit the draft model max model len; this forces - # vLLM to skip speculation once the sequences grow beyond 32-k - # tokens. - "max_model_len": 32, - }, - }]) -@pytest.mark.parametrize("batch_size", [8]) -@pytest.mark.parametrize( - "output_len", - [ - # Use smaller output len for fast test. - 32, - ]) -@pytest.mark.parametrize("seed", [1]) -@pytest.mark.parametrize("logprobs", [1]) -def test_logprobs_when_skip_speculation(vllm_runner, common_llm_kwargs, - per_test_common_llm_kwargs, - baseline_llm_kwargs, test_llm_kwargs, - batch_size: int, output_len: int, - seed: int, logprobs: int): - """Verify logprobs greedy equality when some sequences skip speculation. - """ - run_equality_correctness_test( - vllm_runner, - common_llm_kwargs, - per_test_common_llm_kwargs, - baseline_llm_kwargs, - test_llm_kwargs, - batch_size, - output_len, - seed, - temperature=0.0, - logprobs=logprobs, - disable_logprobs=test_llm_kwargs["speculative_config"] - ["disable_logprobs"]) - - -@pytest.mark.parametrize( - "common_llm_kwargs", - [{ - "model_name": "JackFram/llama-68m", - - # Skip cuda graph recording for fast test. - "enforce_eager": True, - - # The original model is float32, keep it for numerical stability. - "dtype": "float32", - }]) -@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}]) -@pytest.mark.parametrize("baseline_llm_kwargs", [{}]) -@pytest.mark.parametrize("test_llm_kwargs", [{ - "speculative_config": { - "model": "JackFram/llama-160m", - "num_speculative_tokens": 3, - "disable_logprobs": False, - }, -}]) -@pytest.mark.parametrize("batch_size", [1]) -@pytest.mark.parametrize( - "output_len", - [ - # Use smaller output len for fast test. - 32, - ]) -@pytest.mark.parametrize("seed", [1]) -@pytest.mark.parametrize("logprobs", [6]) -def test_logprobs_temp_1(vllm_runner, common_llm_kwargs, - per_test_common_llm_kwargs, baseline_llm_kwargs, - test_llm_kwargs, batch_size: int, output_len: int, - seed: int, logprobs: int): - """Verify at least one logprob result has num_logprobs+1, which tests the - case where the sampled token is not in top-k logprobs. - - Ideally, this test should validate equality with non-spec by getting - logprobs. This is left as future improvement. 
- """ - temperature = 1.0 - - prompts = [ - "Hello, my name is", - "The president of the United States is", - "The capital of France is", - "The future of AI is", - "San Francisco is know for its", - "Facebook was created in 2004 by", - "Curious George is a", - "Python 3.11 brings improvements to its", - ] - - prompts = [prompt for prompt, _ in zip(cycle(prompts), range(batch_size))] - - sampling_params = SamplingParams( - max_tokens=output_len, - ignore_eos=True, - temperature=temperature, - logprobs=logprobs, - ) - - sd_args = { - **common_llm_kwargs, - **per_test_common_llm_kwargs, - **test_llm_kwargs, - } - - with vllm_runner(**sd_args) as vllm_model: - sd_outputs = vllm_model.generate_w_logprobs(prompts, sampling_params) - - num_returned_logprobs = [ - len(seq_logprobs) for seq_logprobs in sd_outputs[-1] - ] - - # Assert one of the returned logprobs has > num_logprobs (indicating the - # sampled token is not in top-k). - assert any( - [num_returned > logprobs for num_returned in num_returned_logprobs]) - - -@pytest.mark.parametrize( - "common_llm_kwargs", - [{ - "model_name": "JackFram/llama-160m", - - # Skip cuda graph recording for fast test. - "enforce_eager": True, - - # The original model is float32, keep it for numerical stability. - "dtype": "float32", - }]) -@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}]) -@pytest.mark.parametrize("baseline_llm_kwargs", [{}]) -@pytest.mark.parametrize("test_llm_kwargs", [{ - "speculative_config": { - "model": "JackFram/llama-68m", - "num_speculative_tokens": 3, - "disable_logprobs": True, - }, -}]) -@pytest.mark.parametrize("seed", [1]) -@pytest.mark.parametrize("batch_size", [4]) -@pytest.mark.parametrize( - "output_len", - [ - # Use smaller output len for fast test. - 32, - ]) -@pytest.mark.parametrize("logprobs", [0]) -def test_logprobs_disabled(vllm_runner, common_llm_kwargs, - per_test_common_llm_kwargs, baseline_llm_kwargs, - test_llm_kwargs, batch_size: int, output_len: int, - seed: int, logprobs: int): - """Check the behavior when logprobs are disabled. - Token choices should match with the base model. - """ - run_equality_correctness_test( - vllm_runner, - common_llm_kwargs, - per_test_common_llm_kwargs, - baseline_llm_kwargs, - test_llm_kwargs, - batch_size, - output_len, - seed, - temperature=0.0, - logprobs=logprobs, - disable_logprobs=test_llm_kwargs["speculative_config"] - ["disable_logprobs"]) diff --git a/tests/spec_decode/e2e/test_medusa_correctness.py b/tests/spec_decode/e2e/test_medusa_correctness.py deleted file mode 100644 index bc9501bd57370..0000000000000 --- a/tests/spec_decode/e2e/test_medusa_correctness.py +++ /dev/null @@ -1,417 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project -"""This docstring details important information on the testing methodology. - -Most of the tests rely on "greedy equality", where we expect the output of -speculative decoding on a sequence to exactly match the output of normal non- -speculative decoding. - -Since speculative decoding with rejection sampling guarantees that the output -distribution matches the target model's output distribution (up to hardware -numerics, see https://arxiv.org/pdf/2302.01318.pdf), we can expect greedy -equality. - -However, we still need to verify below scenario could be passed: - * Batch size 1 greedy equality - * Batch size >1 greedy equality - * Test greedy equality under preemption - * Test greedy equality under various number of speculative tokens. 
- -With those tests, we can say at least, Medusa would not break the -correctness for the target model outputs. -""" - -import pytest - -from ..utils import maybe_enable_chunked_prefill -from .conftest import run_equality_correctness_test - -# main model -# lmsys/vicuna-7b-v1.3 was to be used but it's causing -# OOM in CI pipeline, so using a smaller model. -MAIN_MODEL = "JackFram/llama-68m" - -# speculative model -SPEC_MODEL = "abhigoyal/vllm-medusa-llama-68m-random" - -# max number of speculative tokens: this corresponds to -# num_heads in the config.json of the speculator model. -MAX_SPEC_TOKENS = 5 - -# precision -PRECISION = "float32" - - -@pytest.mark.parametrize( - "common_llm_kwargs", - [{ - # Skip cuda graph recording for fast test. - "enforce_eager": True, - - # Print spec metrics. - "disable_log_stats": False, - - # Precision - "dtype": PRECISION, - - # Main model - "model_name": MAIN_MODEL, - }]) -@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}]) -@pytest.mark.parametrize("baseline_llm_kwargs", [{}]) -@pytest.mark.parametrize("test_llm_kwargs", [ - { - "speculative_config": { - "model": SPEC_MODEL, - "num_speculative_tokens": MAX_SPEC_TOKENS, - }, - }, -]) -@pytest.mark.parametrize("output_len", [ - 128, -]) -@pytest.mark.parametrize("batch_size", [1, 32]) -@pytest.mark.parametrize("seed", [1]) -@pytest.mark.parametrize("prefill_chunk_size", [-1, 32]) -def test_medusa_e2e_greedy_correctness(vllm_runner, common_llm_kwargs, - per_test_common_llm_kwargs, - baseline_llm_kwargs, test_llm_kwargs, - batch_size: int, output_len: int, - seed: int, prefill_chunk_size: int): - """Verify greedy equality with different batch size.""" - maybe_enable_chunked_prefill(prefill_chunk_size, test_llm_kwargs) - run_equality_correctness_test(vllm_runner, - common_llm_kwargs, - per_test_common_llm_kwargs, - baseline_llm_kwargs, - test_llm_kwargs, - batch_size, - max_output_len=output_len, - seed=seed, - temperature=0.0) - - -@pytest.mark.parametrize( - "common_llm_kwargs", - [{ - # Skip cuda graph recording for fast test. - "enforce_eager": True, - - # Print spec metrics. 
- "disable_log_stats": False, - - # Precision - "dtype": PRECISION, - - # Main model - "model_name": MAIN_MODEL, - }]) -@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}]) -@pytest.mark.parametrize("baseline_llm_kwargs", [{}]) -@pytest.mark.parametrize("test_llm_kwargs", [ - { - "speculative_config": { - "model": SPEC_MODEL, - "num_speculative_tokens": MAX_SPEC_TOKENS, - "disable_logprobs": False, - }, - }, - { - "speculative_config": { - "model": SPEC_MODEL, - "num_speculative_tokens": MAX_SPEC_TOKENS, - "disable_logprobs": True, - }, - }, -]) -@pytest.mark.parametrize("output_len", [ - 8, -]) -@pytest.mark.parametrize("batch_size", [8]) -@pytest.mark.parametrize("seed", [1]) -@pytest.mark.parametrize("logprobs", [1, 6]) -@pytest.mark.parametrize("prefill_chunk_size", [-1, 32]) -def test_medusa_e2e_greedy_logprobs(vllm_runner, common_llm_kwargs, - per_test_common_llm_kwargs, - baseline_llm_kwargs, test_llm_kwargs, - batch_size: int, output_len: int, - seed: int, logprobs: int, - prefill_chunk_size: int): - """Verify greedy equality with different batch size.""" - maybe_enable_chunked_prefill(prefill_chunk_size, test_llm_kwargs) - run_equality_correctness_test( - vllm_runner, - common_llm_kwargs, - per_test_common_llm_kwargs, - baseline_llm_kwargs, - test_llm_kwargs, - batch_size, - max_output_len=output_len, - seed=seed, - temperature=0.0, - logprobs=logprobs, - prompt_logprobs=logprobs, - disable_logprobs=test_llm_kwargs["speculative_config"] - ["disable_logprobs"]) - - -@pytest.mark.parametrize( - "common_llm_kwargs", - [{ - "enforce_eager": False, - - # Print spec metrics. - "disable_log_stats": False, - - # Precision - "dtype": PRECISION, - - # Main model - "model_name": MAIN_MODEL, - }]) -@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}]) -@pytest.mark.parametrize("baseline_llm_kwargs", [{}]) -@pytest.mark.parametrize("test_llm_kwargs", [ - { - "speculative_config": { - "model": SPEC_MODEL, - "num_speculative_tokens": MAX_SPEC_TOKENS, - }, - }, -]) -@pytest.mark.parametrize("output_len", [ - 128, -]) -@pytest.mark.parametrize("batch_size", [1, 32]) -@pytest.mark.parametrize("seed", [1]) -@pytest.mark.parametrize("prefill_chunk_size", [-1, 32]) -def test_medusa_e2e_greedy_correctness_cuda_graph( - vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs, - baseline_llm_kwargs, test_llm_kwargs, batch_size: int, output_len: int, - seed: int, prefill_chunk_size: int): - """Verify greedy equality with cuda graph enabled and different - batch sizes.""" - maybe_enable_chunked_prefill(prefill_chunk_size, test_llm_kwargs) - run_equality_correctness_test(vllm_runner, - common_llm_kwargs, - per_test_common_llm_kwargs, - baseline_llm_kwargs, - test_llm_kwargs, - batch_size, - max_output_len=output_len, - seed=seed, - temperature=0.0) - - -@pytest.mark.parametrize( - "common_llm_kwargs", - [{ - "block_size": 16, - # 2 for small prompt, 256//8 for generated. - "num_gpu_blocks_override": 2 + 256 // 8, - "max_model_len": (2 + 256 // 8) * 8, - - # Skip cuda graph recording for fast test. - "enforce_eager": True, - - # Precision - "dtype": PRECISION, - - # Main model - "model_name": MAIN_MODEL, - }]) -@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}]) -@pytest.mark.parametrize("baseline_llm_kwargs", [{}]) -@pytest.mark.parametrize("test_llm_kwargs", [ - { - "speculative_config": { - "model": SPEC_MODEL, - "num_speculative_tokens": MAX_SPEC_TOKENS, - }, - }, -]) -@pytest.mark.parametrize( - "output_len", - [ - # Use small output len for fast test. 
- 128, - ]) -@pytest.mark.parametrize("batch_size", [4]) -@pytest.mark.parametrize("seed", [1]) -@pytest.mark.parametrize("prefill_chunk_size", [-1, 32]) -def test_medusa_e2e_greedy_correctness_with_preemption( - vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs, - baseline_llm_kwargs, test_llm_kwargs, batch_size: int, output_len: int, - seed: int, prefill_chunk_size: int): - """Verify greedy equality, even when some sequences are preempted mid- - generation. - """ - maybe_enable_chunked_prefill(prefill_chunk_size, test_llm_kwargs) - run_equality_correctness_test(vllm_runner, - common_llm_kwargs, - per_test_common_llm_kwargs, - baseline_llm_kwargs, - test_llm_kwargs, - batch_size, - max_output_len=output_len, - seed=seed, - temperature=0.0) - - -@pytest.mark.parametrize( - "common_llm_kwargs", - [{ - # Skip cuda graph recording for fast test. - "enforce_eager": True, - - # Precision - "dtype": PRECISION, - - # Main model - "model_name": MAIN_MODEL, - }]) -@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}]) -@pytest.mark.parametrize("baseline_llm_kwargs", [{}]) -@pytest.mark.parametrize( - "test_llm_kwargs", - [ - { - "speculative_config": { - "model": SPEC_MODEL, - "num_speculative_tokens": k, - }, - } - # Try a range of num. speculative tokens - for k in range(1, 1 + MAX_SPEC_TOKENS) - ]) -@pytest.mark.parametrize("batch_size", [2]) -@pytest.mark.parametrize( - "output_len", - [ - # Use smaller output len for fast test. - 32, - ]) -@pytest.mark.parametrize("seed", [1]) -@pytest.mark.parametrize("prefill_chunk_size", [-1, 32]) -def test_medusa_different_k(vllm_runner, common_llm_kwargs, - per_test_common_llm_kwargs, baseline_llm_kwargs, - test_llm_kwargs, batch_size: int, output_len: int, - seed: int, prefill_chunk_size: int): - """Verify that medusa speculative decoding produces exact equality - to without spec decode with different values of num_speculative_tokens. - """ - maybe_enable_chunked_prefill(prefill_chunk_size, test_llm_kwargs) - run_equality_correctness_test(vllm_runner, - common_llm_kwargs, - per_test_common_llm_kwargs, - baseline_llm_kwargs, - test_llm_kwargs, - batch_size, - max_output_len=output_len, - seed=seed, - temperature=0.0) - - -@pytest.mark.parametrize( - "common_llm_kwargs", - [{ - # Skip cuda graph recording for fast test. - "enforce_eager": True, - - # Precision - "dtype": PRECISION, - - # Main model - "model_name": MAIN_MODEL, - }]) -@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}]) -@pytest.mark.parametrize("baseline_llm_kwargs", [{}]) -@pytest.mark.parametrize("test_llm_kwargs", [{ - "speculative_config": { - "model": SPEC_MODEL, - "num_speculative_tokens": MAX_SPEC_TOKENS, - "disable_by_batch_size": 4, - }, -}]) -@pytest.mark.parametrize("batch_size", [1, 5]) -@pytest.mark.parametrize( - "output_len", - [ - # Use smaller output len for fast test. - 32, - ]) -@pytest.mark.parametrize("seed", [1]) -@pytest.mark.parametrize("prefill_chunk_size", [-1, 32]) -def test_medusa_disable_queue(vllm_runner, common_llm_kwargs, - per_test_common_llm_kwargs, baseline_llm_kwargs, - test_llm_kwargs, batch_size: int, - output_len: int, seed: int, - prefill_chunk_size: int): - """Verify that medusa speculative decoding produces exact equality - to without spec decode when speculation is disabled for large - batch sizes. 
- """ - maybe_enable_chunked_prefill(prefill_chunk_size, test_llm_kwargs) - run_equality_correctness_test(vllm_runner, - common_llm_kwargs, - per_test_common_llm_kwargs, - baseline_llm_kwargs, - test_llm_kwargs, - batch_size, - max_output_len=output_len, - seed=seed, - temperature=0.0) - - -@pytest.mark.parametrize( - "common_llm_kwargs", - [{ - # Skip cuda graph recording for fast test. - "enforce_eager": True, - - # Precision - "dtype": PRECISION, - - # Main model - "model_name": MAIN_MODEL, - }]) -@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}]) -@pytest.mark.parametrize("baseline_llm_kwargs", [{}]) -@pytest.mark.parametrize("test_llm_kwargs", [{ - "speculative_config": { - "model": SPEC_MODEL, - "num_speculative_tokens": MAX_SPEC_TOKENS, - "disable_by_batch_size": 4, - "disable_mqa_scorer": True, - }, -}]) -@pytest.mark.parametrize("batch_size", [1, 5]) -@pytest.mark.parametrize( - "output_len", - [ - # Use smaller output len for fast test. - 32, - ]) -@pytest.mark.parametrize("seed", [1]) -@pytest.mark.parametrize("prefill_chunk_size", [-1, 32]) -def test_mqa_scorer(vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs, - baseline_llm_kwargs, test_llm_kwargs, batch_size: int, - output_len: int, seed: int, prefill_chunk_size: int): - """Verify that speculative decoding generates the same output - with batch expansion scorer and mqa scorer. - """ - maybe_enable_chunked_prefill(prefill_chunk_size, test_llm_kwargs) - run_equality_correctness_test(vllm_runner, - common_llm_kwargs, - per_test_common_llm_kwargs, - baseline_llm_kwargs, - test_llm_kwargs, - batch_size, - max_output_len=output_len, - seed=seed, - temperature=0.0) - - -if __name__ == "__main__": - import pytest - pytest.main([__file__]) diff --git a/tests/spec_decode/e2e/test_mlp_correctness.py b/tests/spec_decode/e2e/test_mlp_correctness.py deleted file mode 100644 index 0e41d93eaa190..0000000000000 --- a/tests/spec_decode/e2e/test_mlp_correctness.py +++ /dev/null @@ -1,533 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project -"""This docstring details important information on the testing methodology. - -Most of the tests rely on "greedy equality", where we expect the output of -speculative decoding on a sequence to exactly match the output of normal non- -speculative decoding. - -Since speculative decoding with rejection sampling guarantees that the output -distribution matches the target model's output distribution (up to hardware -numerics, see https://arxiv.org/pdf/2302.01318.pdf), we can expect greedy -equality. - -However, we still need to verify below scenario could be passed: - * Batch size 1 greedy equality - * Batch size >1 greedy equality - * Test greedy equality under preemption - * Test greedy equality under various number of speculative tokens. - -With those tests, we can say at least, MLPSpeculator would not break the -correctness for the target model outputs. -""" - -from unittest.mock import patch - -import pytest - -from vllm.model_executor.layers.vocab_parallel_embedding import pad_vocab_size - -from ..utils import maybe_enable_chunked_prefill -from .conftest import run_equality_correctness_test - -# main model -MAIN_MODEL = "JackFram/llama-160m" - -# speculative model -SPEC_MODEL = "ibm-ai-platform/llama-160m-accelerator" - -# max. number of speculative tokens: this corresponds to -# n_predict in the config.json of the speculator model. 
-MAX_SPEC_TOKENS = 3 - -# precision -PRECISION = "float32" - - -@pytest.mark.parametrize( - "common_llm_kwargs", - [{ - # Skip cuda graph recording for fast test. - "enforce_eager": True, - - # Print spec metrics. - "disable_log_stats": False, - - # Precision - "dtype": PRECISION, - - # Main model - "model_name": MAIN_MODEL, - }]) -@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}]) -@pytest.mark.parametrize("baseline_llm_kwargs", [{}]) -@pytest.mark.parametrize("test_llm_kwargs", [ - { - "speculative_config": { - "model": SPEC_MODEL, - }, - }, -]) -@pytest.mark.parametrize("output_len", [ - 128, -]) -@pytest.mark.parametrize("batch_size", [4, 32]) -@pytest.mark.parametrize("seed", [1]) -@pytest.mark.parametrize("prefill_chunk_size", [-1, 32]) -def test_mlp_e2e_greedy_correctness(vllm_runner, common_llm_kwargs, - per_test_common_llm_kwargs, - baseline_llm_kwargs, test_llm_kwargs, - batch_size: int, output_len: int, - seed: int, prefill_chunk_size: int): - """Verify greedy equality with different batch size.""" - maybe_enable_chunked_prefill(prefill_chunk_size, test_llm_kwargs) - run_equality_correctness_test(vllm_runner, - common_llm_kwargs, - per_test_common_llm_kwargs, - baseline_llm_kwargs, - test_llm_kwargs, - batch_size, - max_output_len=output_len, - seed=seed, - temperature=0.0) - - -@pytest.mark.parametrize( - "common_llm_kwargs", - [{ - # Skip cuda graph recording for fast test. - "enforce_eager": True, - - # Print spec metrics. - "disable_log_stats": False, - - # Precision - "dtype": PRECISION, - - # Main model - "model_name": MAIN_MODEL, - }]) -@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}]) -@pytest.mark.parametrize("baseline_llm_kwargs", [{}]) -@pytest.mark.parametrize("test_llm_kwargs", [ - { - "speculative_config": { - "model": SPEC_MODEL, - "disable_logprobs": False, - }, - }, - { - "speculative_config": { - "model": SPEC_MODEL, - "disable_logprobs": True, - }, - }, -]) -@pytest.mark.parametrize("output_len", [8]) -@pytest.mark.parametrize("batch_size", [8]) -@pytest.mark.parametrize("seed", [1]) -@pytest.mark.parametrize("logprobs", [1, 6]) -@pytest.mark.parametrize("prefill_chunk_size", [-1, 4]) -def test_mlp_e2e_greedy_logprobs(vllm_runner, common_llm_kwargs, - per_test_common_llm_kwargs, - baseline_llm_kwargs, test_llm_kwargs, - batch_size: int, output_len: int, seed: int, - logprobs: int, prefill_chunk_size: int): - """Verify greedy equality with different batch size.""" - maybe_enable_chunked_prefill(prefill_chunk_size, test_llm_kwargs) - # NOTE Test is sensitive enough st if we don't enable chunked prefill - # scheduling on baseline too, we get slightly different logprobs, ending - # up sampling different tokens at the tail (ie top tokens don't change). - # TL;DR: sd+cp == org+cp but sd+cp != org..is this expected? - maybe_enable_chunked_prefill(prefill_chunk_size, baseline_llm_kwargs) - run_equality_correctness_test( - vllm_runner, - common_llm_kwargs, - per_test_common_llm_kwargs, - baseline_llm_kwargs, - test_llm_kwargs, - batch_size, - max_output_len=output_len, - seed=seed, - temperature=0.0, - logprobs=logprobs, - prompt_logprobs=logprobs, - disable_logprobs=test_llm_kwargs["speculative_config"] - ["disable_logprobs"]) - - -@pytest.mark.parametrize( - "common_llm_kwargs", - [{ - # Skip cuda graph recording for fast test. - "enforce_eager": True, - - # Print spec metrics. 
- "disable_log_stats": False, - - # Precision - "dtype": PRECISION, - - # Main model - "model_name": MAIN_MODEL, - }]) -@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}]) -@pytest.mark.parametrize("baseline_llm_kwargs", [{}]) -@pytest.mark.parametrize("test_llm_kwargs", [ - { - "speculative_config": { - "model": SPEC_MODEL, - }, - }, -]) -@pytest.mark.parametrize("output_len", [2048]) -@pytest.mark.parametrize("batch_size", [1, 32]) -@pytest.mark.parametrize("seed", [1]) -@pytest.mark.parametrize("prefill_chunk_size", [-1, 4]) -def test_mlp_e2e_acceptance_rate(vllm_runner, common_llm_kwargs, - per_test_common_llm_kwargs, - baseline_llm_kwargs, test_llm_kwargs, - batch_size: int, output_len: int, - prefill_chunk_size: int, seed: int): - """Verify acceptance rate with different batch size and large output - length.""" - maybe_enable_chunked_prefill(prefill_chunk_size, test_llm_kwargs) - run_equality_correctness_test(vllm_runner, - common_llm_kwargs, - per_test_common_llm_kwargs, - baseline_llm_kwargs, - test_llm_kwargs, - batch_size, - max_output_len=output_len, - temperature=0.0, - seed=seed, - expected_acceptance_rate=0.48) - - -@pytest.mark.parametrize( - "common_llm_kwargs", - [{ - # Skip cuda graph recording for fast test. - "enforce_eager": True, - - # Print spec metrics. - "disable_log_stats": False, - - # Precision - "dtype": PRECISION, - - # Main model - "model_name": MAIN_MODEL, - - # Speculative config - "speculative_config": { - "model": SPEC_MODEL, - }, - }]) -@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}]) -@pytest.mark.parametrize("baseline_llm_kwargs", [{"seed": 1}]) -@pytest.mark.parametrize("test_llm_kwargs", [{"seed": 5}]) -@pytest.mark.parametrize("output_len", [64]) -@pytest.mark.parametrize("batch_size", [1, 32]) -@pytest.mark.parametrize("temperature", [1.0]) -@pytest.mark.parametrize("prefill_chunk_size", [-1, 4]) -@pytest.mark.parametrize("seed", [1]) -def test_mlp_e2e_seeded_correctness(vllm_runner, common_llm_kwargs, - per_test_common_llm_kwargs, - baseline_llm_kwargs, test_llm_kwargs, - batch_size: int, output_len: int, - temperature: float, - prefill_chunk_size: int, seed: int): - """Verify seeded runs produce the same output.""" - maybe_enable_chunked_prefill(prefill_chunk_size, test_llm_kwargs) - maybe_enable_chunked_prefill(prefill_chunk_size, baseline_llm_kwargs) - run_equality_correctness_test(vllm_runner, - common_llm_kwargs, - per_test_common_llm_kwargs, - baseline_llm_kwargs, - test_llm_kwargs, - batch_size, - max_output_len=output_len, - temperature=temperature, - seed=seed) - - # Ensure this same test does fail if we _don't_ include per-request seeds - with pytest.raises(AssertionError): - run_equality_correctness_test(vllm_runner, - common_llm_kwargs, - per_test_common_llm_kwargs, - baseline_llm_kwargs, - test_llm_kwargs, - batch_size, - max_output_len=output_len, - temperature=temperature, - seed=seed, - disable_seed=True) - - -@pytest.mark.parametrize( - "common_llm_kwargs", - [{ - "block_size": 16, - # 2 for small prompt, 256//8 for generated. - "num_gpu_blocks_override": 2 + 256 // 8, - "max_model_len": (2 + 256 // 8) * 8, - - # Skip cuda graph recording for fast test. 
- "enforce_eager": True, - - # Precision - "dtype": PRECISION, - - # Main model - "model_name": MAIN_MODEL, - }]) -@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}]) -@pytest.mark.parametrize("baseline_llm_kwargs", [{}]) -@pytest.mark.parametrize("test_llm_kwargs", [ - { - "speculative_config": { - "model": SPEC_MODEL, - }, - }, -]) -@pytest.mark.parametrize( - "output_len", - [ - # Use small output len for fast test. - 128, - ]) -@pytest.mark.parametrize("batch_size", [4]) -@pytest.mark.parametrize("prefill_chunk_size", [-1, 4]) -@pytest.mark.parametrize("seed", [1]) -def test_mlp_e2e_greedy_correctness_with_preemption( - vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs, - baseline_llm_kwargs, test_llm_kwargs, batch_size: int, output_len: int, - prefill_chunk_size: int, seed: int): - """Verify greedy equality, even when some sequences are preempted mid- - generation. - """ - maybe_enable_chunked_prefill(prefill_chunk_size, test_llm_kwargs) - run_equality_correctness_test(vllm_runner, - common_llm_kwargs, - per_test_common_llm_kwargs, - baseline_llm_kwargs, - test_llm_kwargs, - batch_size, - max_output_len=output_len, - seed=seed, - temperature=0.0) - - -@pytest.mark.parametrize( - "common_llm_kwargs", - [{ - "block_size": 16, - # 2 for small prompt, 256//8 for generated. - "num_gpu_blocks_override": 2 + 256 // 8, - "max_model_len": (2 + 256 // 8) * 8, - - # Skip cuda graph recording for fast test. - "enforce_eager": True, - - # Precision - "dtype": PRECISION, - - # Main model - "model_name": MAIN_MODEL, - }]) -@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}]) -@pytest.mark.parametrize("baseline_llm_kwargs", [{}]) -@pytest.mark.parametrize("test_llm_kwargs", [ - { - "speculative_config": { - "model": SPEC_MODEL, - }, - }, -]) -@pytest.mark.parametrize( - "output_len", - [ - # Use small output len for fast test. - 128, - ]) -@pytest.mark.parametrize("batch_size", [4]) -@pytest.mark.parametrize("seed", [1]) -@pytest.mark.parametrize("prefill_chunk_size", [-1, 4]) -def test_mlp_e2e_greedy_correctness_with_padding( - vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs, - baseline_llm_kwargs, test_llm_kwargs, batch_size: int, output_len: int, - prefill_chunk_size: int, seed: int): - """Verify greedy equality when the vocab dimension is padded - """ - maybe_enable_chunked_prefill(prefill_chunk_size, test_llm_kwargs) - - # Default pad_to is 64, test model has vocab_size of 32000 - def patched_pad_vocab_size(vocab_size, pad_to=None): - return pad_vocab_size(vocab_size, pad_to=32064) - - with patch( - "vllm.model_executor.layers.vocab_parallel_embedding.pad_vocab_size", - patched_pad_vocab_size): - run_equality_correctness_test(vllm_runner, - common_llm_kwargs, - per_test_common_llm_kwargs, - baseline_llm_kwargs, - test_llm_kwargs, - batch_size, - max_output_len=output_len, - seed=seed, - temperature=0.0) - - -@pytest.mark.parametrize( - "common_llm_kwargs", - [{ - # Skip cuda graph recording for fast test. - "enforce_eager": True, - - # Precision - "dtype": PRECISION, - - # Main model - "model_name": MAIN_MODEL, - }]) -@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}]) -@pytest.mark.parametrize("baseline_llm_kwargs", [{}]) -@pytest.mark.parametrize( - "test_llm_kwargs", - [ - { - "speculative_config": { - "model": SPEC_MODEL, - "num_speculative_tokens": k, - }, - } - # Try a range of num. 
speculative tokens - for k in range(1, 1 + MAX_SPEC_TOKENS) - ]) -@pytest.mark.parametrize("batch_size", [2]) -@pytest.mark.parametrize( - "output_len", - [ - # Use smaller output len for fast test. - 32, - ]) -@pytest.mark.parametrize("prefill_chunk_size", [-1, 4]) -@pytest.mark.parametrize("seed", [1]) -def test_mlp_different_k(vllm_runner, common_llm_kwargs, - per_test_common_llm_kwargs, baseline_llm_kwargs, - test_llm_kwargs, batch_size: int, - prefill_chunk_size: int, seed: int, output_len: int): - """Verify that mlp speculative decoding produces exact equality - to without spec decode with different values of num_speculative_tokens. - """ - maybe_enable_chunked_prefill(prefill_chunk_size, test_llm_kwargs) - run_equality_correctness_test(vllm_runner, - common_llm_kwargs, - per_test_common_llm_kwargs, - baseline_llm_kwargs, - test_llm_kwargs, - batch_size, - max_output_len=output_len, - seed=seed, - temperature=0.0) - - -@pytest.mark.parametrize( - "common_llm_kwargs", - [{ - # Skip cuda graph recording for fast test. - "enforce_eager": True, - - # Precision - "dtype": PRECISION, - - # Main model - "model_name": MAIN_MODEL, - }]) -@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}]) -@pytest.mark.parametrize("baseline_llm_kwargs", [{}]) -@pytest.mark.parametrize("test_llm_kwargs", [{ - "speculative_config": { - "model": SPEC_MODEL, - "disable_by_batch_size": 4, - }, -}]) -@pytest.mark.parametrize("batch_size", [1, 5]) -@pytest.mark.parametrize( - "output_len", - [ - # Use smaller output len for fast test. - 32, - ]) -# Speculative decoding is disabled when sequences reach decoding and the batch -# consists of single-token requests. Hence we set `max_num_seqs` -# >= `speculative_disable_by_batch_size` to test feature interaction. -@pytest.mark.parametrize("prefill_chunk_size", [-1, 4]) -@pytest.mark.parametrize("seed", [1]) -def test_mlp_disable_queue(vllm_runner, common_llm_kwargs, - per_test_common_llm_kwargs, baseline_llm_kwargs, - test_llm_kwargs, batch_size: int, - prefill_chunk_size: int, seed: int, - output_len: int): - """Verify that mlp speculative decoding produces exact equality - to without spec decode when speculation is disabled for large - batch sizes. - """ - maybe_enable_chunked_prefill(prefill_chunk_size, test_llm_kwargs) - run_equality_correctness_test(vllm_runner, - common_llm_kwargs, - per_test_common_llm_kwargs, - baseline_llm_kwargs, - test_llm_kwargs, - batch_size, - max_output_len=output_len, - seed=seed, - temperature=0.0) - - -@pytest.mark.parametrize( - "common_llm_kwargs", - [{ - "model_name": MAIN_MODEL, - - # Skip cuda graph recording for fast test. - "enforce_eager": True, - - # Precision - "dtype": PRECISION, - }]) -@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}]) -@pytest.mark.parametrize("baseline_llm_kwargs", [{}]) -@pytest.mark.parametrize("test_llm_kwargs", [{ - "speculative_config": { - "model": SPEC_MODEL, - "disable_mqa_scorer": True, - }, -}]) -@pytest.mark.parametrize("batch_size", [1, 5]) -@pytest.mark.parametrize( - "output_len", - [ - # Use smaller output len for fast test. - 32, - ]) -@pytest.mark.parametrize("prefill_chunk_size", [-1, 4]) -@pytest.mark.parametrize("seed", [1]) -def test_mqa_scorer(vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs, - baseline_llm_kwargs, test_llm_kwargs, batch_size: int, - output_len: int, prefill_chunk_size: int, seed: int): - """Verify that speculative decoding generates the same output - with batch expansion scorer and mqa scorer. 
- """ - maybe_enable_chunked_prefill(prefill_chunk_size, test_llm_kwargs) - run_equality_correctness_test(vllm_runner, - common_llm_kwargs, - per_test_common_llm_kwargs, - baseline_llm_kwargs, - test_llm_kwargs, - batch_size, - max_output_len=output_len, - seed=seed, - temperature=0.0) diff --git a/tests/spec_decode/e2e/test_mtp_correctness.py b/tests/spec_decode/e2e/test_mtp_correctness.py deleted file mode 100644 index d9c7be8ffe71f..0000000000000 --- a/tests/spec_decode/e2e/test_mtp_correctness.py +++ /dev/null @@ -1,333 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project -"""This docstring details important information on the testing methodology. - -Most of the tests rely on "greedy equality", where we expect the output of -speculative decoding on a sequence to exactly match the output of normal non- -speculative decoding. - -Since speculative decoding with rejection sampling guarantees that the output -distribution matches the target model's output distribution (up to hardware -numerics, see https://arxiv.org/pdf/2302.01318.pdf), we can expect greedy -equality. - -However, we still need to verify below scenario could be passed: - * Batch size 1 greedy equality - * Batch size >1 greedy equality - * Test greedy equality under preemption - * Test greedy equality under various number of speculative tokens. - -With those tests, we can say at least, mtp would not break the -correctness for the target model outputs. -""" - -import pytest - -from .conftest import run_equality_correctness_test - -# main model -MAIN_MODEL = "luccafong/deepseek_mtp_main_random" - -# max. number of speculative tokens: this corresponds to -# num_nextn_predict_layers in the config.json of the speculator model. -MAX_SPEC_TOKENS = 1 - -# precision -PRECISION = "bfloat16" - - -@pytest.mark.parametrize( - "common_llm_kwargs", - [{ - # Skip cuda graph recording for fast test. - "enforce_eager": True, - - # Print spec metrics. - "disable_log_stats": False, - - # Precision - "dtype": PRECISION, - - # Main model - "model_name": MAIN_MODEL, - - # GPU memory utilization - "gpu_memory_utilization": 0.85 - }]) -@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}]) -@pytest.mark.parametrize("baseline_llm_kwargs", [{}]) -@pytest.mark.parametrize("test_llm_kwargs", [ - { - "speculative_config": { - "num_speculative_tokens": MAX_SPEC_TOKENS, - }, - }, -]) -@pytest.mark.parametrize("output_len", [ - 128, -]) -@pytest.mark.parametrize("batch_size", [1, 32]) -@pytest.mark.parametrize("seed", [1]) -def test_mtp_e2e_greedy_correctness(vllm_runner, common_llm_kwargs, - per_test_common_llm_kwargs, - baseline_llm_kwargs, test_llm_kwargs, - batch_size: int, output_len: int, - seed: int): - - run_equality_correctness_test(vllm_runner, common_llm_kwargs, - per_test_common_llm_kwargs, - baseline_llm_kwargs, test_llm_kwargs, - batch_size, output_len, seed) - - -@pytest.mark.parametrize( - "common_llm_kwargs", - [{ - # Skip cuda graph recording for fast test. - "enforce_eager": True, - - # Print spec metrics. 
- "disable_log_stats": False, - - # Precision - "dtype": PRECISION, - - # Main model - "model_name": MAIN_MODEL, - - # GPU memory utilization - "gpu_memory_utilization": 0.85 - }]) -@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}]) -@pytest.mark.parametrize("baseline_llm_kwargs", [{}]) -@pytest.mark.parametrize("test_llm_kwargs", [ - { - "speculative_config": { - "num_speculative_tokens": MAX_SPEC_TOKENS, - "disable_logprobs": False, - }, - }, - { - "speculative_config": { - "num_speculative_tokens": MAX_SPEC_TOKENS, - "disable_logprobs": True, - }, - }, -]) -@pytest.mark.parametrize("output_len", [ - 128, -]) -@pytest.mark.parametrize("batch_size", [8]) -@pytest.mark.parametrize("seed", [1]) -@pytest.mark.parametrize("logprobs", [1, 6]) -def test_mtp_e2e_greedy_logprobs(vllm_runner, common_llm_kwargs, - per_test_common_llm_kwargs, - baseline_llm_kwargs, test_llm_kwargs, - batch_size: int, output_len: int, seed: int, - logprobs: int): - - run_equality_correctness_test( - vllm_runner, - common_llm_kwargs, - per_test_common_llm_kwargs, - baseline_llm_kwargs, - test_llm_kwargs, - batch_size, - output_len, - seed, - logprobs=logprobs, - prompt_logprobs=logprobs, - disable_logprobs=test_llm_kwargs["speculative_config"] - ["disable_logprobs"]) - - -@pytest.mark.parametrize( - "common_llm_kwargs", - [{ - "enforce_eager": False, - - # Print spec metrics. - "disable_log_stats": False, - - # Precision - "dtype": PRECISION, - - # Main model - "model_name": MAIN_MODEL, - "gpu_memory_utilization": 0.85 - }]) -@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}]) -@pytest.mark.parametrize("baseline_llm_kwargs", [{}]) -@pytest.mark.parametrize("test_llm_kwargs", [ - { - "speculative_config": { - "num_speculative_tokens": MAX_SPEC_TOKENS, - }, - }, -]) -@pytest.mark.parametrize("output_len", [ - 128, -]) -@pytest.mark.parametrize("batch_size", [1, 32]) -@pytest.mark.parametrize("seed", [1]) -def test_mtp_e2e_greedy_correctness_cuda_graph(vllm_runner, common_llm_kwargs, - per_test_common_llm_kwargs, - baseline_llm_kwargs, - test_llm_kwargs, - batch_size: int, - output_len: int, seed: int): - """Verify greedy equality with cuda graph enabled and different - batch sizes.""" - run_equality_correctness_test(vllm_runner, common_llm_kwargs, - per_test_common_llm_kwargs, - baseline_llm_kwargs, test_llm_kwargs, - batch_size, output_len, seed) - - -@pytest.mark.parametrize( - "common_llm_kwargs", - [{ - "block_size": 8, - # 2 for small prompt, 256//8 for generated. - "num_gpu_blocks_override": 2 + 256 // 8, - "max_model_len": (2 + 256 // 8) * 8, - - # Skip cuda graph recording for fast test. - "enforce_eager": True, - - # Precision - "dtype": PRECISION, - - # Main model - "model_name": MAIN_MODEL, - - # GPU memory utilization - "gpu_memory_utilization": 0.9 - }]) -@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}]) -@pytest.mark.parametrize("baseline_llm_kwargs", [{}]) -@pytest.mark.parametrize("test_llm_kwargs", [ - { - "speculative_config": { - "num_speculative_tokens": MAX_SPEC_TOKENS, - }, - }, -]) -@pytest.mark.parametrize( - "output_len", - [ - # Use small output len for fast test. - 128, - ]) -@pytest.mark.parametrize("batch_size", [4]) -@pytest.mark.parametrize("seed", [1]) -def test_mtp_e2e_greedy_correctness_with_preemption( - vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs, - baseline_llm_kwargs, test_llm_kwargs, batch_size: int, output_len: int, - seed: int): - """Verify greedy equality, even when some sequences are preempted mid- - generation. 
- """ - run_equality_correctness_test(vllm_runner, common_llm_kwargs, - per_test_common_llm_kwargs, - baseline_llm_kwargs, test_llm_kwargs, - batch_size, output_len, seed) - - -@pytest.mark.parametrize( - "common_llm_kwargs", - [{ - # Skip cuda graph recording for fast test. - "enforce_eager": True, - - # Precision - "dtype": PRECISION, - - # Main model - "model_name": MAIN_MODEL, - - # GPU memory utilization - "gpu_memory_utilization": 0.9 - }]) -@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}]) -@pytest.mark.parametrize("baseline_llm_kwargs", [{}]) -@pytest.mark.parametrize( - "test_llm_kwargs", - [ - { - "speculative_config": { - "num_speculative_tokens": k, - }, - } - # Try a range of num. speculative tokens - for k in range(1, 1 + MAX_SPEC_TOKENS) - ]) -@pytest.mark.parametrize("batch_size", [2]) -@pytest.mark.parametrize( - "output_len", - [ - # Use smaller output len for fast test. - 32, - ]) -@pytest.mark.parametrize("seed", [1]) -def test_mtp_different_k(vllm_runner, common_llm_kwargs, - per_test_common_llm_kwargs, baseline_llm_kwargs, - test_llm_kwargs, batch_size: int, output_len: int, - seed: int): - """Verify that mtp speculative decoding produces exact equality - to without spec decode with different values of num_speculative_tokens. - """ - run_equality_correctness_test(vllm_runner, common_llm_kwargs, - per_test_common_llm_kwargs, - baseline_llm_kwargs, test_llm_kwargs, - batch_size, output_len, seed) - - -@pytest.mark.parametrize( - "common_llm_kwargs", - [{ - # Skip cuda graph recording for fast test. - "enforce_eager": True, - - # Precision - "dtype": PRECISION, - - # Main model - "model_name": MAIN_MODEL, - - # GPU memory utilization - "gpu_memory_utilization": 0.9 - }]) -@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}]) -@pytest.mark.parametrize("baseline_llm_kwargs", [{}]) -@pytest.mark.parametrize("test_llm_kwargs", [{ - "speculative_config": { - "num_speculative_tokens": MAX_SPEC_TOKENS, - "disable_by_batch_size": 4 - }, -}]) -@pytest.mark.parametrize("batch_size", [1, 5]) -@pytest.mark.parametrize( - "output_len", - [ - # Use smaller output len for fast test. - 32, - ]) -@pytest.mark.parametrize("seed", [1]) -def test_mtp_disable_queue(vllm_runner, common_llm_kwargs, - per_test_common_llm_kwargs, baseline_llm_kwargs, - test_llm_kwargs, batch_size: int, output_len: int, - seed: int): - """Verify that mtp speculative decoding produces exact equality - to without spec decode when speculation is disabled for large - batch sizes. - """ - run_equality_correctness_test(vllm_runner, common_llm_kwargs, - per_test_common_llm_kwargs, - baseline_llm_kwargs, test_llm_kwargs, - batch_size, output_len, seed) - - -if __name__ == "__main__": - import pytest - pytest.main([__file__]) diff --git a/tests/spec_decode/e2e/test_multistep_correctness.py b/tests/spec_decode/e2e/test_multistep_correctness.py deleted file mode 100644 index ccc8e745ab371..0000000000000 --- a/tests/spec_decode/e2e/test_multistep_correctness.py +++ /dev/null @@ -1,842 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project -"""The tests in this file verify end-to-end speculative decoding correctness. - -This docstring details important information on the testing methodology. - -Most of the tests rely on "greedy equality", where we expect the output of -speculative decoding on a sequence to exactly match the output of normal non- -speculative decoding. 
- -Since speculative decoding with rejection sampling guarantees that the output -distribution matches the target model's output distribution (up to hardware -numerics, see https://arxiv.org/pdf/2302.01318.pdf), we can expect greedy -equality. This gives us good coverage of temp=0. - -At temp=0, the TypicalAcceptanceSampler ensures that only the tokens with the -highest probability in the target distribution are accepted. Therefore, we can -expect greedy equality for the TypicalAcceptanceSampler at temp=0. - -For temp>0, we rely on unit tests on the rejection sampler to verify that the -output distribution is the same with spec decode vs. no spec decode (this would -be prohibitively expensive to run with a real model). Similarly, for the -TypicalAcceptance sampler also, we rely on unit tests to validate temp>0 -test cases. - -NOTE: Speculative decoding's distribution equality requires that the measured -distributions of the target model and proposal model be deterministic given the -same input. vLLM largely guarantees this. - -@cadedaniel has seen cases where the output probabilities of a draft/target -model change slightly with certain batch sizes or prompts, even with Torch -determinism flags set. It is unclear if this is a bug in vLLM, due to non- -determinism in on-device batched operations, a bug in vLLM's spec decode -implementation, or the "hardware numerics" limitations. Either way, rejection -sampling ensures the output distribution matches the target model, but it breaks -greedy-equality tests for those batch sizes/prompts. -""" - -from itertools import cycle - -import pytest -from transformers import AutoTokenizer - -from vllm import SamplingParams - -from ...utils import create_new_process_for_each_test -from .conftest import (get_output_from_llm_generator, - run_equality_correctness_test) - - -@pytest.mark.parametrize( - "common_llm_kwargs", - [{ - # Use a small model for a fast test. - # Note this is repeated in the test body; to initialize a tokenizer. - "model": "JackFram/llama-68m", - - # Skip cuda graph recording for fast test. - "enforce_eager": True, - - # The original model is float32, keep it for numerical stability. - "dtype": "float32", - }]) -@pytest.mark.parametrize( - "per_test_common_llm_kwargs", - [ - { - "speculative_config": { - "model": "JackFram/llama-68m", - "num_speculative_tokens": 5, - }, - "enable_chunked_prefill": False, - }, - { - # Chunked prefill enabled with small value - # to make sure we get mixed batches. - "speculative_config": { - "model": "JackFram/llama-68m", - "num_speculative_tokens": 5, - }, - "enable_chunked_prefill": True, - "max_num_batched_tokens": 4, - "max_num_seqs": 4 - }, - { - # Verify the detokenizer assertions in the test work when spec - # decode is disabled. - }, - ]) -@pytest.mark.parametrize("test_llm_kwargs", [{}]) -@pytest.mark.parametrize("batch_size", [1, 32]) -@pytest.mark.parametrize("seed", [1]) -@create_new_process_for_each_test() -def test_spec_decode_e2e_with_detokenization(test_llm_generator, - batch_size: int): - """Run generation with speculative decoding on a batch. Verify the engine - generates the correct number of tokens (via ignore_eos=True), and that the - detokenization matches HF transformers. 
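The rejection-sampling guarantee cited in this docstring (Leviathan et al., 2023) is what makes greedy equality a reasonable test target. A simplified sketch of the per-token acceptance rule, not vLLM's RejectionSampler, looks like this:

import numpy as np

def accept_or_resample(draft_token: int, p_target: np.ndarray,
                       p_draft: np.ndarray, rng: np.random.Generator) -> int:
    # Accept the drafted token with probability min(1, p_target / p_draft).
    accept_prob = min(1.0, float(p_target[draft_token]) /
                      max(float(p_draft[draft_token]), 1e-20))
    if rng.random() < accept_prob:
        return draft_token
    # Otherwise resample from the normalized residual max(p_target - p_draft, 0).
    # This keeps the output distribution equal to the target model's; at
    # temperature 0 the target argmax always survives, hence greedy equality.
    residual = np.maximum(p_target - p_draft, 0.0)
    residual /= residual.sum()
    return int(rng.choice(len(residual), p=residual))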
- """ - output_len = 32 - temperature = 0.0 - - prompts = [ - "Hello, my name is", - "The president of the United States is", - "The capital of France is", - "The future of AI is", - ] - - prompts = [prompt for prompt, _ in zip(cycle(prompts), range(batch_size))] - - sampling_params = SamplingParams( - max_tokens=output_len, - ignore_eos=True, - temperature=temperature, - ) - - batch_tokens, batch_token_ids, _ = get_output_from_llm_generator( - test_llm_generator, prompts, sampling_params) - - # Expect a generation for each prompt in the batch. - assert len(batch_token_ids) == len(prompts) - - # Expect each generation to have expected number of tokens (note ignore_eos - # is True). - assert [len(token_ids) - for token_ids in batch_token_ids] == ([output_len] * batch_size) - - # Expect detokenized string to match. - tok = AutoTokenizer.from_pretrained("JackFram/llama-68m") - for actual_tokens, actual_token_ids in zip(batch_tokens, batch_token_ids): - expected_tokens = tok.decode(actual_token_ids) - print(f"{actual_token_ids=}") - assert actual_tokens.strip() == expected_tokens.strip() - - -@pytest.mark.parametrize( - "common_llm_kwargs", - [{ - # Skip cuda graph recording for fast test. - "enforce_eager": True, - - # Print spec metrics. - "disable_log_stats": False, - - # The original model is float32, keep it for numerical stability. - "dtype": "float32", - }]) -@pytest.mark.parametrize( - "per_test_common_llm_kwargs", - [ - # Try two different tiny base models. - # Note that one is equal to the draft model, another isn't. - { - "model_name": "JackFram/llama-68m", - }, - { - "model_name": "JackFram/llama-160m", - }, - ]) -@pytest.mark.parametrize("baseline_llm_kwargs", [{}]) -@pytest.mark.parametrize("test_llm_kwargs", [{ - "speculative_config": { - "model": "JackFram/llama-68m", - "num_speculative_tokens": 5, - "disable_logprobs": False, - }, - "enable_chunked_prefill": False, -}, { - "speculative_config": { - "model": "JackFram/llama-68m", - "num_speculative_tokens": 3, - "disable_logprobs": False, - }, - "enable_chunked_prefill": True, - "max_num_batched_tokens": 4, - "max_num_seqs": 4, -}]) -@pytest.mark.parametrize( - "output_len", - [ - # Use long output len for the small model test. - 10, - ]) -@pytest.mark.parametrize("batch_size", [1]) -@pytest.mark.parametrize("seed", [1]) -@create_new_process_for_each_test() -def test_spec_decode_e2e_greedy_correctness_tiny_model_bs1( - vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs, - baseline_llm_kwargs, test_llm_kwargs, batch_size: int, output_len: int, - seed: int): - """Verify greedy equality on a tiny model with batch size of one. - - Since this test is cheaper than other e2e correctness tests, we generate - with a higher output_len. - - When the draft model is the same as the target model, we further check - whether all speculative tokens are accepted. - """ - ensure_all_accepted = per_test_common_llm_kwargs.get( - "model_name") == test_llm_kwargs.get("speculative_config")["model"] - run_equality_correctness_test(vllm_runner, - common_llm_kwargs, - per_test_common_llm_kwargs, - baseline_llm_kwargs, - test_llm_kwargs, - batch_size, - max_output_len=output_len, - seed=seed, - prompt_logprobs=2, - logprobs=2, - disable_logprobs=False, - temperature=0.0, - ensure_all_accepted=ensure_all_accepted) - - -@pytest.mark.parametrize( - "common_llm_kwargs", - [{ - # Skip cuda graph recording for fast test. - "enforce_eager": True, - - # Print spec metrics. 
- "disable_log_stats": False, - - # The original model is float32, keep it for numerical stability. - "dtype": "float32", - }]) -@pytest.mark.parametrize( - "per_test_common_llm_kwargs", - [ - # Try two different tiny base models. - # Note that one is equal to the draft model, another isn't. - { - "model_name": "JackFram/llama-68m", - }, - { - "model_name": "JackFram/llama-160m", - }, - ]) -@pytest.mark.parametrize("baseline_llm_kwargs", [{}]) -@pytest.mark.parametrize("test_llm_kwargs", [ - { - "speculative_config": { - "model": "JackFram/llama-68m", - "num_speculative_tokens": 5, - }, - "enable_chunked_prefill": False, - }, - { - "speculative_config": { - "model": "JackFram/llama-68m", - "num_speculative_tokens": 5, - }, - "enable_chunked_prefill": True, - "max_num_batched_tokens": 4, - "max_num_seqs": 4 - }, -]) -@pytest.mark.parametrize( - "output_len", - [ - # Use small output len for fast test. - 256, - ]) -@pytest.mark.parametrize("batch_size", [64]) -@pytest.mark.parametrize("seed", [1]) -@create_new_process_for_each_test() -def test_spec_decode_e2e_greedy_correctness_tiny_model_large_bs( - vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs, - baseline_llm_kwargs, test_llm_kwargs, batch_size: int, output_len: int, - seed: int): - """Verify greedy equality on a tiny model and large batch size. - """ - run_equality_correctness_test(vllm_runner, - common_llm_kwargs, - per_test_common_llm_kwargs, - baseline_llm_kwargs, - test_llm_kwargs, - batch_size, - max_output_len=output_len, - seed=seed, - temperature=0.0) - - -@pytest.mark.parametrize( - "common_llm_kwargs", - [{ - # Skip cuda graph recording for fast test. - "enforce_eager": True, - - # The original model is float32, keep it for numerical stability. - "dtype": "float32", - }]) -@pytest.mark.parametrize( - "per_test_common_llm_kwargs", - [ - # Try two different tiny base models. - # Note that one is equal to the draft model, another isn't. - { - "model_name": "JackFram/llama-68m", - }, - { - "model_name": "JackFram/llama-160m", - }, - ]) -@pytest.mark.parametrize("baseline_llm_kwargs", [{}]) -@pytest.mark.parametrize("test_llm_kwargs", [ - { - "speculative_config": { - "model": "JackFram/llama-68m", - "num_speculative_tokens": 5, - }, - "enable_chunked_prefill": False, - }, - { - "speculative_config": { - "model": "JackFram/llama-68m", - "num_speculative_tokens": 5, - }, - "enable_chunked_prefill": True, - "max_num_batched_tokens": 4, - "max_num_seqs": 4 - }, -]) -@pytest.mark.parametrize("max_output_len", [ - 256, -]) -@pytest.mark.parametrize("batch_size", [32]) -@pytest.mark.parametrize("seed", [1]) -@create_new_process_for_each_test() -def test_spec_decode_e2e_greedy_correctness_tiny_model_large_bs_diff_output_len( - vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs, - baseline_llm_kwargs, test_llm_kwargs, batch_size: int, - max_output_len: int, seed: int): - """Verify greedy equality on a tiny model, with a large batch size, and when - sampling respects the EOS token. - """ - run_equality_correctness_test(vllm_runner, - common_llm_kwargs, - per_test_common_llm_kwargs, - baseline_llm_kwargs, - test_llm_kwargs, - batch_size, - max_output_len, - seed=seed, - temperature=0.0, - ignore_eos=False) - - -@pytest.mark.parametrize( - "common_llm_kwargs", - [{ - # A "real" model (not tiny). - "model_name": "meta-llama/Llama-2-7b-chat-hf", - - # Skip cuda graph recording for fast test. - "enforce_eager": True, - - # Print spec metrics. 
- "disable_log_stats": False, - }]) -@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}]) -@pytest.mark.parametrize("baseline_llm_kwargs", [{}]) -@pytest.mark.parametrize("test_llm_kwargs", [ - { - "speculative_config": { - "model": "JackFram/llama-68m", - "num_speculative_tokens": 5, - }, - "enable_chunked_prefill": False, - }, - { - "speculative_config": { - "model": "JackFram/llama-68m", - "num_speculative_tokens": 5, - }, - "enable_chunked_prefill": True, - "max_num_batched_tokens": 4, - "max_num_seqs": 4 - }, -]) -@pytest.mark.parametrize("batch_size", [1]) -@pytest.mark.parametrize( - "output_len", - [ - # Use decently long output len for a high quality test. - 256, - ]) -@pytest.mark.parametrize("seed", [1]) -@create_new_process_for_each_test() -def test_spec_decode_e2e_greedy_correctness_real_model_bs1( - vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs, - baseline_llm_kwargs, test_llm_kwargs, batch_size: int, output_len: int, - seed: int): - """Verify greedy equality on a "real" model and batch size of 1. This is - separate from large BS tests to make identifying the source of bugs easier. - """ - run_equality_correctness_test(vllm_runner, - common_llm_kwargs, - per_test_common_llm_kwargs, - baseline_llm_kwargs, - test_llm_kwargs, - batch_size, - max_output_len=output_len, - seed=seed, - temperature=0.0) - - -@pytest.mark.parametrize( - "common_llm_kwargs", - [{ - # A "real" model (not tiny). - "model_name": "meta-llama/Llama-2-7b-chat-hf", - - # Skip cuda graph recording for fast test. - "enforce_eager": True, - - # Print spec metrics. - "disable_log_stats": False, - }]) -@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}]) -@pytest.mark.parametrize("baseline_llm_kwargs", [{}]) -@pytest.mark.parametrize("test_llm_kwargs", [ - { - "speculative_config": { - "model": "JackFram/llama-68m", - "num_speculative_tokens": 5, - }, - "enable_chunked_prefill": False, - }, - { - "speculative_config": { - "model": "JackFram/llama-68m", - "num_speculative_tokens": 5, - }, - "enable_chunked_prefill": True, - "max_num_batched_tokens": 4, - "max_num_seqs": 4 - }, -]) -@pytest.mark.parametrize("batch_size", [32]) -@pytest.mark.parametrize( - "output_len", - [ - # Use smaller output len for fast test. - 64, - ]) -@pytest.mark.parametrize("seed", [1]) -@create_new_process_for_each_test() -def test_spec_decode_e2e_greedy_correctness_real_model_large_bs( - vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs, - baseline_llm_kwargs, test_llm_kwargs, batch_size: int, output_len: int, - seed: int): - """Verify greedy equality with a "real" model on a nontrivial batch size. - This is the closest test to a real production workload. - """ - run_equality_correctness_test(vllm_runner, - common_llm_kwargs, - per_test_common_llm_kwargs, - baseline_llm_kwargs, - test_llm_kwargs, - batch_size, - max_output_len=output_len, - seed=seed, - temperature=0.0) - - -@pytest.mark.parametrize( - "common_llm_kwargs", - [{ - "block_size": 16, - # 2 for small prompt, 256//8 for generated. - "num_gpu_blocks_override": 2 + 256 // 8, - "max_model_len": (2 + 256 // 8) * 8, - - # Skip cuda graph recording for fast test. - "enforce_eager": True, - # The original model is float32, keep it for numerical stability. 
- "dtype": "float32", - }]) -@pytest.mark.parametrize("per_test_common_llm_kwargs", [ - { - "model_name": "JackFram/llama-160m", - }, -]) -@pytest.mark.parametrize("baseline_llm_kwargs", [{}]) -@pytest.mark.parametrize("test_llm_kwargs", [ - { - "speculative_config": { - "model": "JackFram/llama-68m", - "num_speculative_tokens": 5, - }, - "enable_chunked_prefill": False, - }, - { - "speculative_config": { - "model": "JackFram/llama-68m", - "num_speculative_tokens": 5, - }, - "enable_chunked_prefill": True, - "max_num_batched_tokens": 4, - "max_num_seqs": 4 - }, -]) -@pytest.mark.parametrize( - "output_len", - [ - # Use small output len for fast test. - 256, - ]) -@pytest.mark.parametrize("batch_size", [4]) -@pytest.mark.parametrize("seed", [1]) -@create_new_process_for_each_test() -def test_spec_decode_e2e_greedy_correctness_with_preemption( - vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs, - baseline_llm_kwargs, test_llm_kwargs, batch_size: int, output_len: int, - seed: int): - """Verify greedy equality, even when some sequences are preempted mid- - generation. - """ - run_equality_correctness_test(vllm_runner, - common_llm_kwargs, - per_test_common_llm_kwargs, - baseline_llm_kwargs, - test_llm_kwargs, - batch_size, - max_output_len=output_len, - seed=seed, - temperature=0.0) - - -@pytest.mark.parametrize( - "common_llm_kwargs", - [{ - "model_name": "JackFram/llama-160m", - - # Skip cuda graph recording for fast test. - "enforce_eager": True, - # The original model is float32, keep it for numerical stability. - "dtype": "float32", - }]) -@pytest.mark.parametrize( - "per_test_common_llm_kwargs", - [ - # https://github.com/triton-lang/triton/issues/2266 tl.dot - # doesn't support embedding < 16 - { - "block_size": 16, - }, - { - "block_size": 32, - }, - ]) -@pytest.mark.parametrize("baseline_llm_kwargs", [{}]) -@pytest.mark.parametrize("test_llm_kwargs", [ - { - "speculative_config": { - "model": "JackFram/llama-68m", - "num_speculative_tokens": 5, - }, - "enable_chunked_prefill": False, - }, - { - "speculative_config": { - "model": "JackFram/llama-68m", - "num_speculative_tokens": 5, - }, - "enable_chunked_prefill": True, - "max_num_batched_tokens": 4, - "max_num_seqs": 4 - }, -]) -@pytest.mark.parametrize("batch_size", [2]) -@pytest.mark.parametrize( - "output_len", - [ - # Use smaller output len for fast test. - 32, - ]) -@pytest.mark.parametrize("seed", [1]) -@create_new_process_for_each_test() -def test_spec_decode_different_block_size(vllm_runner, common_llm_kwargs, - per_test_common_llm_kwargs, - baseline_llm_kwargs, test_llm_kwargs, - batch_size: int, output_len: int, - seed: int): - """Verify greedy equality over different block sizes. - """ - run_equality_correctness_test(vllm_runner, - common_llm_kwargs, - per_test_common_llm_kwargs, - baseline_llm_kwargs, - test_llm_kwargs, - batch_size, - max_output_len=output_len, - seed=seed, - temperature=0.0) - - -@pytest.mark.parametrize( - "common_llm_kwargs", - [{ - "model_name": "JackFram/llama-160m", - - # Skip cuda graph recording for fast test. - "enforce_eager": True, - # The original model is float32, keep it for numerical stability. - "dtype": "float32", - }]) -@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}]) -@pytest.mark.parametrize("baseline_llm_kwargs", [{}]) -@pytest.mark.parametrize( - "test_llm_kwargs", - [ - { - - # Artificially limit the draft model max model len; this forces vLLM - # to skip speculation once the sequences grow beyond 32-k tokens. 
- "speculative_config": { - "model": "JackFram/llama-68m", - "num_speculative_tokens": 5, - "max_model_len": 32, - }, - "enable_chunked_prefill": False, - }, - { - "speculative_config": { - "model": "JackFram/llama-68m", - "num_speculative_tokens": 5, - "max_model_len": 32, - }, - "enable_chunked_prefill": True, - "max_num_batched_tokens": 4, - "max_num_seqs": 4, - }, - ]) -@pytest.mark.parametrize("batch_size", [8]) -@pytest.mark.parametrize( - "output_len", - [ - # This must be a good bit larger than speculative_max_model_len so that - # we can test the case where all seqs are skipped, but still small to - # ensure fast test. - 64, - ]) -@pytest.mark.parametrize("seed", [1]) -@create_new_process_for_each_test() -def test_skip_speculation(vllm_runner, common_llm_kwargs, - per_test_common_llm_kwargs, baseline_llm_kwargs, - test_llm_kwargs, batch_size: int, output_len: int, - seed: int): - """Verify greedy equality when some (or all) sequences skip speculation. - We do this by setting the max model len of the draft model to an - artificially low value, such that when the sequences grow beyond it, they - are skipped in speculative decoding. - """ - run_equality_correctness_test(vllm_runner, - common_llm_kwargs, - per_test_common_llm_kwargs, - baseline_llm_kwargs, - test_llm_kwargs, - batch_size, - max_output_len=output_len, - seed=seed, - temperature=0.0) - - -@pytest.mark.parametrize( - "common_llm_kwargs", - [{ - "model_name": "JackFram/llama-160m", - - # Skip cuda graph recording for fast test. - "enforce_eager": True, - # The original model is float32, keep it for numerical stability. - "dtype": "float32", - }]) -@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}]) -@pytest.mark.parametrize("baseline_llm_kwargs", [{}]) -@pytest.mark.parametrize("test_llm_kwargs", [ - { - "speculative_config": { - "model": "JackFram/llama-68m", - "num_speculative_tokens": 5, - "disable_by_batch_size": 2, - }, - "enable_chunked_prefill": False, - }, - { - "speculative_config": { - "model": "JackFram/llama-68m", - "num_speculative_tokens": 5, - "disable_by_batch_size": 2, - }, - "enable_chunked_prefill": True, - "max_num_batched_tokens": 4, - "max_num_seqs": 4, - }, -]) -@pytest.mark.parametrize("batch_size", [8]) -@pytest.mark.parametrize("output_len", [10]) -@pytest.mark.parametrize("seed", [1]) -@create_new_process_for_each_test() -def test_disable_speculation(vllm_runner, common_llm_kwargs, - per_test_common_llm_kwargs, baseline_llm_kwargs, - test_llm_kwargs, batch_size: int, output_len: int, - seed: int): - """Verify greedy equality when all sequences disable speculation. - """ - run_equality_correctness_test(vllm_runner, - common_llm_kwargs, - per_test_common_llm_kwargs, - baseline_llm_kwargs, - test_llm_kwargs, - batch_size, - max_output_len=output_len, - seed=seed, - temperature=0.0) - - -@pytest.mark.parametrize( - "common_llm_kwargs", - [{ - "model_name": "JackFram/llama-68m", - - # Skip cuda graph recording for fast test. - "enforce_eager": True, - # The original model is float32, keep it for numerical stability. - "dtype": "float32", - }]) -@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}]) -@pytest.mark.parametrize("baseline_llm_kwargs", [{}]) -@pytest.mark.parametrize( - "test_llm_kwargs", - [ - { - "speculative_config": { - "model": "JackFram/llama-68m", - "num_speculative_tokens": k, - }, - "enable_chunked_prefill": False, - } - # Try a range of common k, as well as large speculation. 
- for k in [1, 2, 3, 4, 5, 6, 7, 8, 9, 63] - ] + [{ - "speculative_config": { - "model": "JackFram/llama-68m", - "num_speculative_tokens": k, - }, - "enable_chunked_prefill": True, - "max_num_batched_tokens": 4, - "max_num_seqs": 4, - } for k in [1, 2, 3, 4, 5, 6, 7, 8, 9, 63]]) -@pytest.mark.parametrize("batch_size", [2]) -@pytest.mark.parametrize( - "output_len", - [ - # Use smaller output len for fast test. - 32, - ]) -@pytest.mark.parametrize("seed", [1]) -@create_new_process_for_each_test() -def test_many_k(vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs, - baseline_llm_kwargs, test_llm_kwargs, batch_size: int, - output_len: int, seed: int): - """Verify that speculative decoding produces exact equality to without spec - decode with many different values of k. - """ - run_equality_correctness_test(vllm_runner, - common_llm_kwargs, - per_test_common_llm_kwargs, - baseline_llm_kwargs, - test_llm_kwargs, - batch_size, - max_output_len=output_len, - seed=seed, - temperature=0.0) - - -@pytest.mark.parametrize( - "common_llm_kwargs", - [{ - "model_name": "JackFram/llama-160m", - - # Skip cuda graph recording for fast test. - "enforce_eager": True, - # The original model is float32, keep it for numerical stability. - "dtype": "float32", - }]) -@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}]) -@pytest.mark.parametrize("baseline_llm_kwargs", [{}]) -@pytest.mark.parametrize( - "test_llm_kwargs", - [ - { - "speculative_config": { - "model": "JackFram/llama-68m", - "num_speculative_tokens": k, - "acceptance_method": "typical_acceptance_sampler", - }, - "enable_chunked_prefill": False - } - # Try a range of common k. - for k in [1, 2, 3] - ] + [{ - "speculative_config": { - "model": "JackFram/llama-68m", - "num_speculative_tokens": k, - "acceptance_method": "typical_acceptance_sampler", - }, - "enable_chunked_prefill": True, - "max_num_batched_tokens": 4, - "max_num_seqs": 4 - } for k in [1, 2, 3]]) -@pytest.mark.parametrize("batch_size", [1, 32]) -@pytest.mark.parametrize( - "output_len", - [ - # Use smaller output len for fast test. - 32, - ]) -@pytest.mark.parametrize("seed", [1]) -@create_new_process_for_each_test() -def test_typical_acceptance_sampling(vllm_runner, common_llm_kwargs, - per_test_common_llm_kwargs, - baseline_llm_kwargs, test_llm_kwargs, - batch_size: int, output_len: int, - seed: int): - """Verify that speculative decoding produces exact equality to without spec - decode with TypicalAcceptanceSampler as the draft token acceptance - sampling method. - """ - run_equality_correctness_test(vllm_runner, - common_llm_kwargs, - per_test_common_llm_kwargs, - baseline_llm_kwargs, - test_llm_kwargs, - batch_size, - max_output_len=output_len, - seed=seed, - temperature=0.0) diff --git a/tests/spec_decode/e2e/test_ngram_correctness.py b/tests/spec_decode/e2e/test_ngram_correctness.py deleted file mode 100644 index 58d1a6ca7adda..0000000000000 --- a/tests/spec_decode/e2e/test_ngram_correctness.py +++ /dev/null @@ -1,392 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project -"""This docstring details important information on the testing methodology. - -Most of the tests rely on "greedy equality", where we expect the output of -speculative decoding on a sequence to exactly match the output of normal non- -speculative decoding. 
- -Since speculative decoding with rejection sampling guarantees that the output -distribution matches the target model's output distribution (up to hardware -numerics, see https://arxiv.org/pdf/2302.01318.pdf), we can expect greedy -equality. - -For ngram lookup, its idea comes from https://github.com/apoorvumang/prompt-lookup-decoding, -and is merged into transform code base: https://github.com/huggingface/transformers/pull/27775. -Since there is no model is needed for generate the proposal, we could make -the testcase much simpler than drafter multi-step one. - -However, we still need to verify below scenario could be passed: - * Batch size 1 greedy equality - * Batch size >1 greedy equality - * Test greedy equality under preemption - * Test greedy equality under various ngram sizes / speculative sizes - -With those tests, we can say at least, ngram spec would not break the -correctness for the target model outputs. -""" - -import pytest - -from ..utils import maybe_enable_chunked_prefill -from .conftest import run_equality_correctness_test - - -@pytest.mark.parametrize( - "common_llm_kwargs", - [{ - # Skip cuda graph recording for fast test. - "enforce_eager": True, - - # Print spec metrics. - "disable_log_stats": False, - - # The original model is float32, keep it for numerical stability. - "dtype": "float32", - }]) -@pytest.mark.parametrize("per_test_common_llm_kwargs", [ - { - "model_name": "JackFram/llama-68m", - }, -]) -@pytest.mark.parametrize("baseline_llm_kwargs", [{}]) -@pytest.mark.parametrize("test_llm_kwargs", [ - { - "speculative_config": { - "method": "ngram", - "num_speculative_tokens": 5, - "prompt_lookup_max": 3, - "disable_mqa_scorer": False, - }, - }, - { - "speculative_config": { - "method": "ngram", - "num_speculative_tokens": 5, - "prompt_lookup_max": 3, - "disable_mqa_scorer": True, - }, - }, -]) -@pytest.mark.parametrize("output_len", [ - 256, -]) -@pytest.mark.parametrize("batch_size", [1, 32]) -@pytest.mark.parametrize("prefill_chunk_size", [-1, 4]) -@pytest.mark.parametrize("seed", [1]) -def test_ngram_e2e_greedy_correctness(vllm_runner, common_llm_kwargs, - per_test_common_llm_kwargs, - baseline_llm_kwargs, test_llm_kwargs, - batch_size: int, output_len: int, - prefill_chunk_size: int, seed: int): - """Verify greedy equality on a tiny model with different batch size.""" - maybe_enable_chunked_prefill(prefill_chunk_size, common_llm_kwargs) - run_equality_correctness_test(vllm_runner, - common_llm_kwargs, - per_test_common_llm_kwargs, - baseline_llm_kwargs, - test_llm_kwargs, - batch_size, - max_output_len=output_len, - seed=seed, - temperature=0.0) - - -@pytest.mark.parametrize( - "common_llm_kwargs", - [{ - # Skip cuda graph recording for fast test. - "enforce_eager": True, - - # Print spec metrics. - "disable_log_stats": False, - - # The original model is float32, keep it for numerical stability. 
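Prompt-lookup ("ngram") speculation, referenced in this docstring, needs no draft model at all: the proposal is read off the existing context. The helper below is a hypothetical illustration of that lookup, not the vLLM proposer; prompt_lookup_max and num_speculative_tokens mirror the config keys used in these tests.

def ngram_propose(token_ids: list[int], prompt_lookup_max: int,
                  num_speculative_tokens: int) -> list[int]:
    # Try the longest n-gram first, falling back to shorter ones.
    for n in range(prompt_lookup_max, 0, -1):
        if len(token_ids) <= n:
            continue
        pattern = token_ids[-n:]
        # Search right-to-left for an earlier occurrence of the trailing n-gram,
        # excluding the trailing occurrence itself.
        for start in range(len(token_ids) - n - 1, -1, -1):
            if token_ids[start:start + n] == pattern:
                follow = token_ids[start + n:start + n + num_speculative_tokens]
                if follow:
                    return follow
    return []  # no match; skip speculation for this step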
- "dtype": "float32", - }]) -@pytest.mark.parametrize("per_test_common_llm_kwargs", [ - { - "model_name": "JackFram/llama-68m", - }, -]) -@pytest.mark.parametrize("baseline_llm_kwargs", [{}]) -@pytest.mark.parametrize("test_llm_kwargs", [ - { - "speculative_config": { - "method": "ngram", - "num_speculative_tokens": 5, - "prompt_lookup_max": 3, - "disable_logprobs": False, - }, - }, - { - "speculative_config": { - "method": "ngram", - "num_speculative_tokens": 5, - "prompt_lookup_max": 3, - "disable_logprobs": True, - }, - }, -]) -@pytest.mark.parametrize("output_len", [ - 8, -]) -@pytest.mark.parametrize("batch_size", [8]) -@pytest.mark.parametrize("seed", [1]) -@pytest.mark.parametrize("logprobs", [1, 6]) -def test_ngram_e2e_greedy_logprobs(vllm_runner, common_llm_kwargs, - per_test_common_llm_kwargs, - baseline_llm_kwargs, test_llm_kwargs, - batch_size: int, output_len: int, seed: int, - logprobs: int): - """Verify greedy equality on a tiny model with different batch size.""" - run_equality_correctness_test( - vllm_runner, - common_llm_kwargs, - per_test_common_llm_kwargs, - baseline_llm_kwargs, - test_llm_kwargs, - batch_size, - max_output_len=output_len, - seed=seed, - temperature=0.0, - logprobs=logprobs, - prompt_logprobs=logprobs, - disable_logprobs=test_llm_kwargs["speculative_config"] - ["disable_logprobs"]) - - -@pytest.mark.parametrize( - "common_llm_kwargs", - [{ - "block_size": 16, - # 2 for small prompt, 256//8 for generated. - "num_gpu_blocks_override": 2 + 256 // 8, - "max_model_len": (2 + 256 // 8) * 8, - - # Skip cuda graph recording for fast test. - "enforce_eager": True, - - # The original model is float32, keep it for numerical stability. - "dtype": "float32", - }]) -@pytest.mark.parametrize("per_test_common_llm_kwargs", [ - { - "model_name": "JackFram/llama-160m", - }, -]) -@pytest.mark.parametrize("baseline_llm_kwargs", [{}]) -@pytest.mark.parametrize("test_llm_kwargs", [ - { - "speculative_config": { - "method": "ngram", - "num_speculative_tokens": 5, - "prompt_lookup_max": 3, - }, - "enable_chunked_prefill": False, - }, - { - "speculative_config": { - "method": "ngram", - "num_speculative_tokens": 5, - "prompt_lookup_max": 3, - "disable_mqa_scorer": True, - }, - "enable_chunked_prefill": True, - "max_num_batched_tokens": 4, - "max_num_seqs": 4 - }, -]) -@pytest.mark.parametrize( - "output_len", - [ - # Use small output len for fast test. - 256, - ]) -@pytest.mark.parametrize("batch_size", [4]) -@pytest.mark.parametrize("seed", [1]) -def test_ngram_e2e_greedy_correctness_with_preemption( - vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs, - baseline_llm_kwargs, test_llm_kwargs, batch_size: int, output_len: int, - seed: int): - """Verify greedy equality, even when some sequences are preempted mid- - generation. - """ - run_equality_correctness_test(vllm_runner, - common_llm_kwargs, - per_test_common_llm_kwargs, - baseline_llm_kwargs, - test_llm_kwargs, - batch_size, - max_output_len=output_len, - temperature=0, - seed=seed) - - -@pytest.mark.parametrize( - "common_llm_kwargs", - [{ - "model_name": "JackFram/llama-68m", - - # Skip cuda graph recording for fast test. - "enforce_eager": True, - - # The original model is float32, keep it for numerical stability. 
- "dtype": "float32", - }]) -@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}]) -@pytest.mark.parametrize("baseline_llm_kwargs", [{}]) -@pytest.mark.parametrize( - "test_llm_kwargs", - [ - { - "speculative_config": { - "method": "ngram", - "num_speculative_tokens": k, - "prompt_lookup_max": 3, - }, - } - # Try a range of common k, as well as large speculation. - for k in [1, 3, 5] - ] + [ - { - "speculative_config": { - "method": "ngram", - "num_speculative_tokens": k, - "prompt_lookup_max": 1, - }, - } - # Try a range of common k, as well as large speculation. - for k in [1, 3, 5] - ]) -@pytest.mark.parametrize("batch_size", [2]) -@pytest.mark.parametrize( - "output_len", - [ - # Use smaller output len for fast test. - 32, - ]) -@pytest.mark.parametrize("seed", [1]) -def test_ngram_different_k(vllm_runner, common_llm_kwargs, - per_test_common_llm_kwargs, baseline_llm_kwargs, - test_llm_kwargs, batch_size: int, output_len: int, - seed: int): - """Verify that ngram speculative decoding produces exact equality - to without spec decode with many different values of k and - different ngram prompt_lookup_max. - """ - run_equality_correctness_test(vllm_runner, - common_llm_kwargs, - per_test_common_llm_kwargs, - baseline_llm_kwargs, - test_llm_kwargs, - batch_size, - max_output_len=output_len, - seed=seed, - temperature=0.0) - - -@pytest.mark.parametrize( - "common_llm_kwargs", - [{ - "model_name": "JackFram/llama-68m", - - # Skip cuda graph recording for fast test. - "enforce_eager": True, - - # The original model is float32, keep it for numerical stability. - "dtype": "float32", - }]) -@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}]) -@pytest.mark.parametrize("baseline_llm_kwargs", [{}]) -@pytest.mark.parametrize("test_llm_kwargs", [{ - "speculative_config": { - "method": "ngram", - "num_speculative_tokens": 5, - "prompt_lookup_max": 3, - "disable_by_batch_size": 4 - }, -}, { - "speculative_config": { - "method": "ngram", - "num_speculative_tokens": 5, - "prompt_lookup_max": 3, - "disable_by_batch_size": 4, - "disable_mqa_scorer": True, - }, - "enable_chunked_prefill": True, - "max_num_batched_tokens": 4, - "max_num_seqs": 4 -}]) -@pytest.mark.parametrize("batch_size", [1, 5]) -@pytest.mark.parametrize( - "output_len", - [ - # Use smaller output len for fast test. - 32, - ]) -@pytest.mark.parametrize("seed", [1]) -def test_ngram_disable_queue(vllm_runner, common_llm_kwargs, - per_test_common_llm_kwargs, baseline_llm_kwargs, - test_llm_kwargs, batch_size: int, output_len: int, - seed: int): - """Verify that ngram speculative decoding produces exact equality - to without spec decode with many different values of k and - different ngram prompt_lookup_max. - """ - run_equality_correctness_test(vllm_runner, - common_llm_kwargs, - per_test_common_llm_kwargs, - baseline_llm_kwargs, - test_llm_kwargs, - batch_size, - max_output_len=output_len, - seed=seed, - temperature=0.0) - - -@pytest.mark.parametrize( - "common_llm_kwargs", - [{ - "model_name": "JackFram/llama-68m", - - # Skip cuda graph recording for fast test. - "enforce_eager": True, - - # The original model is float32, keep it for numerical stability. 
- "dtype": "float32", - }]) -@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}]) -@pytest.mark.parametrize("baseline_llm_kwargs", [{}]) -@pytest.mark.parametrize("test_llm_kwargs", [{ - "speculative_config": { - "method": "ngram", - "num_speculative_tokens": 5, - "prompt_lookup_max": 3, - "disable_mqa_scorer": True, - }, -}]) -@pytest.mark.parametrize("batch_size", [1, 5]) -@pytest.mark.parametrize( - "output_len", - [ - # Use smaller output len for fast test. - 32, - ]) -@pytest.mark.parametrize("seed", [1]) -def test_ngram_scorer(vllm_runner, common_llm_kwargs, - per_test_common_llm_kwargs, baseline_llm_kwargs, - test_llm_kwargs, batch_size: int, output_len: int, - seed: int): - """Verify that ngram speculative decoding generates the same output - with batch expansion scorer and mqa scorer. - """ - run_equality_correctness_test(vllm_runner, - common_llm_kwargs, - per_test_common_llm_kwargs, - baseline_llm_kwargs, - test_llm_kwargs, - batch_size, - max_output_len=output_len, - seed=seed, - temperature=0.0) diff --git a/tests/spec_decode/e2e/test_seed.py b/tests/spec_decode/e2e/test_seed.py deleted file mode 100644 index 4cf373809dba2..0000000000000 --- a/tests/spec_decode/e2e/test_seed.py +++ /dev/null @@ -1,70 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -import pytest - -from .conftest import run_equality_correctness_test - -# main model -MAIN_MODEL = "JackFram/llama-68m" - -# speculative model -SPEC_MODEL = "JackFram/llama-160m" - - -@pytest.mark.parametrize( - "common_llm_kwargs", - [{ - "model_name": "JackFram/llama-68m", - - # Skip cuda graph recording for fast test. - "enforce_eager": True, - - # speculative config - "speculative_config": { - "model": "JackFram/llama-160m", - "num_speculative_tokens": 3, - }, - }]) -@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}]) -@pytest.mark.parametrize("baseline_llm_kwargs", [{"seed": 1}]) -@pytest.mark.parametrize("test_llm_kwargs", [{"seed": 5}]) -@pytest.mark.parametrize("batch_size", [1, 8, 32]) -@pytest.mark.parametrize("temperature", [0.1, 1.0]) -@pytest.mark.parametrize( - "output_len", - [ - # Use smaller output len for fast test. 
- 20, - ]) -def test_seeded_consistency(vllm_runner, common_llm_kwargs, - per_test_common_llm_kwargs, baseline_llm_kwargs, - test_llm_kwargs, batch_size: int, - temperature: float, output_len: int): - """Verify outputs are consistent across multiple runs with same seed - """ - run_equality_correctness_test( - vllm_runner, - common_llm_kwargs, - per_test_common_llm_kwargs, - baseline_llm_kwargs, - test_llm_kwargs, - batch_size, - max_output_len=output_len, - temperature=temperature, - disable_seed=False, - ) - - # Ensure this same test does fail if we _don't_ include per-request seeds - with pytest.raises(AssertionError): - run_equality_correctness_test( - vllm_runner, - common_llm_kwargs, - per_test_common_llm_kwargs, - baseline_llm_kwargs, - test_llm_kwargs, - batch_size, - max_output_len=output_len, - temperature=temperature, - disable_seed=True, - ) diff --git a/tests/spec_decode/test_batch_expansion.py b/tests/spec_decode/test_batch_expansion.py deleted file mode 100644 index d20c549b09052..0000000000000 --- a/tests/spec_decode/test_batch_expansion.py +++ /dev/null @@ -1,110 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -import pytest -import torch - -from vllm.spec_decode.batch_expansion import BatchExpansionTop1Scorer - -from .utils import create_seq_group_metadata_from_prompts, mock_worker - - -@pytest.mark.parametrize('num_target_seq_ids', [100]) -@pytest.mark.skip_global_cleanup -def test_create_target_seq_id_iterator(num_target_seq_ids: int): - """Verify all new sequence ids are greater than all input - seq ids. - """ - scorer = BatchExpansionTop1Scorer(mock_worker(), 'cuda:0', 32_000) - - all_seq_ids = [ - [1, 3, 5, 7], - list(range(100)) + [0], - [100], - ] - - for seq_ids in all_seq_ids: - max_seq_id = max(seq_ids) - iterator = scorer._create_target_seq_id_iterator(seq_ids) # pylint: disable=protected-access - for _ in range(num_target_seq_ids): - assert next(iterator) > max_seq_id - - -@pytest.mark.parametrize('k', [1, 2, 6]) -@pytest.mark.skip_global_cleanup -def test_get_token_ids_to_score(k: int): - """Verify correct tokens are selected for scoring. - """ - proposal_token_ids = torch.tensor( - list(range(k)), - dtype=torch.int64, - device='cuda', - ) - - expected_output: list[list[int]] = [ - [], - ] - for i in range(proposal_token_ids.shape[0]): - expected_output.append(proposal_token_ids[:i + 1].tolist()) - - scorer = BatchExpansionTop1Scorer(mock_worker(), 'cuda:0', 32_000) - actual_output = scorer._get_token_ids_to_score(proposal_token_ids.tolist()) # pylint: disable=protected-access - - actual_output = [ - x.tolist() if isinstance(x, torch.Tensor) else x for x in actual_output - ] - - assert actual_output == expected_output - - -@pytest.mark.parametrize('k', [1, 2, 6]) -@pytest.mark.skip_global_cleanup -def test_create_single_target_seq_group_metadata(k: int): - """Verify correct creation of a batch-expanded seq group metadata. 
- """ - - prompt_tokens = [1, 2, 3] - prev_output_tokens = [4, 5, 6] - - token_ids = list(range(k)) - - num_tokens_processed = len(prompt_tokens) + len(prev_output_tokens) - 1 - - final_seq_len = len(prompt_tokens) + len(prev_output_tokens) + len( - token_ids) - - block_size = 32 - input_seq_group_metadata = create_seq_group_metadata_from_prompts( - [prompt_tokens], 2048 // block_size, block_size, [final_seq_len], - [prev_output_tokens], [num_tokens_processed])[0] - - input_seq_id = list(input_seq_group_metadata.seq_data.keys())[0] - target_seq_id = 100 - - scorer = BatchExpansionTop1Scorer(mock_worker(), 'cuda:0', 32_000) - output = scorer._create_single_target_seq_group_metadata( # pylint: disable=protected-access - input_seq_group_metadata, - input_seq_id, - target_seq_id, - token_ids, - input_seq_group_metadata.sampling_params, - ) - - assert output.request_id == input_seq_group_metadata.request_id - assert output.sampling_params.repetition_penalty == \ - input_seq_group_metadata.sampling_params.repetition_penalty - assert output.sampling_params.temperature == \ - input_seq_group_metadata.sampling_params.temperature - assert output.sampling_params.top_p == \ - input_seq_group_metadata.sampling_params.top_p - assert output.sampling_params.top_k == \ - input_seq_group_metadata.sampling_params.top_k - assert len(output.seq_data) == 1 - assert output.seq_data[target_seq_id].get_prompt_token_ids() == tuple( - prompt_tokens) - assert output.seq_data[target_seq_id].get_output_token_ids() == tuple( - prev_output_tokens + token_ids) - - assert len(output.block_tables) == 1 - assert output.block_tables[ - target_seq_id] == input_seq_group_metadata.block_tables[input_seq_id] diff --git a/tests/spec_decode/test_dynamic_spec_decode.py b/tests/spec_decode/test_dynamic_spec_decode.py deleted file mode 100644 index 407786ad3c647..0000000000000 --- a/tests/spec_decode/test_dynamic_spec_decode.py +++ /dev/null @@ -1,90 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -from unittest.mock import MagicMock, patch - -import pytest -import torch - -from vllm.sequence import ExecuteModelRequest -from vllm.spec_decode.metrics import AsyncMetricsCollector -from vllm.spec_decode.multi_step_worker import MultiStepWorker -from vllm.spec_decode.spec_decode_worker import SpecDecodeWorker -from vllm.spec_decode.top1_proposer import Top1Proposer - -from .test_utils import mock_spec_decode_sampler -from .utils import create_batch, mock_worker - - -@pytest.mark.parametrize('queue_size', [4]) -@pytest.mark.parametrize('batch_size', [1]) -@pytest.mark.parametrize('k', [1]) -@pytest.mark.parametrize("acceptance_sampler_method", - ["rejection_sampler", "typical_acceptance_sampler"]) -@torch.inference_mode() -def test_disable_spec_tokens(queue_size: int, batch_size: int, k: int, - acceptance_sampler_method: str): - """Verify that speculative tokens are disabled when the batch size - exceeds the threshold. 
- """ - disable_by_batch_size = 3 - draft_worker = mock_worker(cls=MultiStepWorker) - target_worker = mock_worker() - metrics_collector = MagicMock(spec=AsyncMetricsCollector) - worker = SpecDecodeWorker(proposer_worker=draft_worker, - scorer_worker=target_worker, - spec_decode_sampler=mock_spec_decode_sampler( - acceptance_sampler_method), - disable_logprobs=False, - metrics_collector=metrics_collector, - disable_by_batch_size=disable_by_batch_size) - - exception_secret = 'artificial stop' - draft_worker.get_spec_proposals.side_effect = ValueError(exception_secret) - - seq_group_metadata_list, _, _ = create_batch(batch_size, k) - execute_model_req = ExecuteModelRequest( - seq_group_metadata_list=seq_group_metadata_list, - num_lookahead_slots=k, - running_queue_size=queue_size) - - if queue_size > disable_by_batch_size: - with patch.object(worker, - '_run_no_spec', - side_effect=ValueError(exception_secret)), \ - pytest.raises(ValueError, match=exception_secret): - worker.execute_model(execute_model_req=execute_model_req) - - # When the batch size is larger than the threshold, - # we expect no speculative tokens (0). - expected_num_spec_tokens = None if queue_size < disable_by_batch_size else 0 - assert seq_group_metadata_list[ - 0].num_speculative_tokens == expected_num_spec_tokens - - draft_worker.sampler_output.side_effect = ValueError(exception_secret) - - proposer = Top1Proposer( - worker=draft_worker, - device='cpu', # not used - vocab_size=100, # not used - # Must be long enough to avoid being skipped due to length. - max_proposal_len=1024, - ) - - if queue_size < disable_by_batch_size: - # Should raise exception when executing the mocked draft model. - with pytest.raises(ValueError, match=exception_secret): - proposer.get_spec_proposals( - execute_model_req=ExecuteModelRequest( - seq_group_metadata_list=seq_group_metadata_list, - num_lookahead_slots=k), - seq_ids_with_bonus_token_in_last_step=set()) - else: - # Should not execute the draft model because spec decode is disabled - # for all requests. Accordingly, the proposal length should be 0. - proposals = proposer.get_spec_proposals( - execute_model_req=ExecuteModelRequest( - seq_group_metadata_list=seq_group_metadata_list, - num_lookahead_slots=k), - seq_ids_with_bonus_token_in_last_step=set()) - assert proposals.proposal_lens.tolist() == [0] * batch_size diff --git a/tests/spec_decode/test_memory_usage.py b/tests/spec_decode/test_memory_usage.py deleted file mode 100644 index 5d9dd3f72a78a..0000000000000 --- a/tests/spec_decode/test_memory_usage.py +++ /dev/null @@ -1,91 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project -"""This docstring details important information on the testing methodology. - -This test verifies that memory usage remains constant (or never grows) when -we enable / disable speculation via --speculative-disable-by-batch-size. - -There are a lot of things we try to keep track of between batches of requests -and if certain tensors are not freed from memory, can result in CUDA ooms. - -This is particularly relevant for production situations where speculation might -be enabled during off hours, but disabled once traffic peaks during the workday. -Since traffic will stay high for a long period of time, verifying we do not -increase our memory usage over time is essential to prevent possible CUDA ooms. 
-""" - -import torch - -import vllm -from tests.core.utils import create_dummy_prompt -from vllm.sequence import SequenceGroup - -ITERATIONS = 100 -MAIN_MODEL = "JackFram/llama-68m" - -# speculative model -SPEC_MODEL = "abhigoyal/vllm-medusa-llama-68m-random" - -BATCH_SIZE = 5 -SPEC_DISABLE_BATCH_SIZE = 2 - - -def add_seq_group_to_engine(engine: vllm.LLMEngine, seq_group: SequenceGroup): - scheduler = engine.scheduler[0] - scheduler.add_seq_group(seq_group) - - -""" -Since we are using a batch size greater than the disabled batch size, -we can ensure we go through the _no_spec codepath for most of our engine steps. -""" - - -def test_memory_usage_no_spec(): - previous_memory_allocated = None - llm = vllm.LLM(model=MAIN_MODEL, - speculative_config={ - "model": SPEC_MODEL, - "num_speculative_tokens": 3, - "disable_by_batch_size": SPEC_DISABLE_BATCH_SIZE, - }) - - batch_sequences = set() - engine = llm.llm_engine - - for i in range(ITERATIONS): - seq, seq_group = create_dummy_prompt(request_id=str(i), - prompt_length=10, - min_tokens=10, - max_tokens=10) - - add_seq_group_to_engine(engine, seq_group) - - batch_sequences.add(seq) - engine.step() - for seq in list(batch_sequences): - if seq.is_finished(): - batch_sequences.remove(seq) - - # If we aren't at our batch size yet, continue - if len(batch_sequences) <= BATCH_SIZE: - continue - - # Otherwise, loop until at least one request is done - while not any(seq.is_finished() for seq in batch_sequences): - engine.step() - - # Remove it from the set - for seq in list(batch_sequences): - if seq.is_finished(): - batch_sequences.remove(seq) - - # At this point, we are always at the case where we have finished - # processing some number of requests from the batch after running - # several _no_spec executions. The memory should not have - # increased between the previous time this was recorded and the - # current time. - if previous_memory_allocated is None: - previous_memory_allocated = torch.cuda.memory_allocated() - else: - assert previous_memory_allocated == torch.cuda.memory_allocated() diff --git a/tests/spec_decode/test_metrics.py b/tests/spec_decode/test_metrics.py deleted file mode 100644 index e8de410f8a941..0000000000000 --- a/tests/spec_decode/test_metrics.py +++ /dev/null @@ -1,205 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -import math -from unittest.mock import MagicMock - -import pytest -import torch - -from vllm.spec_decode.metrics import AsyncMetricsCollector - - -def test_initial_call_returns_none(): - """Expect first call to get metrics to return None. - """ - spec_decode_sampler = MagicMock() - spec_decode_sampler.num_accepted_tokens = torch.tensor(0, - dtype=torch.long, - device='cuda') - spec_decode_sampler.num_emitted_tokens = torch.tensor(0, - dtype=torch.long, - device='cuda') - spec_decode_sampler.num_draft_tokens = 0 - - collector = AsyncMetricsCollector(spec_decode_sampler) - collector.init_gpu_tensors(rank=0) - maybe_metrics = collector.maybe_collect_rejsample_metrics(k=5) - assert maybe_metrics is None - - -def test_second_call_returns_metrics(): - """Expect second call to not return None. 
- """ - spec_decode_sampler = MagicMock() - spec_decode_sampler.num_accepted_tokens = torch.tensor(0, - dtype=torch.long, - device='cuda') - spec_decode_sampler.num_emitted_tokens = torch.tensor(0, - dtype=torch.long, - device='cuda') - spec_decode_sampler.num_draft_tokens = 0 - - collect_interval_s = 5.0 - timer = MagicMock() - timer.side_effect = [ - 0.0, collect_interval_s + 0.1, collect_interval_s + 0.2 - ] - - collector = AsyncMetricsCollector(spec_decode_sampler=spec_decode_sampler, - timer=timer, - collect_interval_s=collect_interval_s) - collector.init_gpu_tensors(rank=0) - _ = collector.maybe_collect_rejsample_metrics(k=5) - metrics = collector.maybe_collect_rejsample_metrics(k=5) - assert metrics is not None - - -@pytest.mark.parametrize("rank", [1, 2, 3, 4]) -def test_nonzero_rank_noop(rank): - """Verify nonzero ranks don't collect metrics. - """ - spec_decode_sampler = MagicMock() - spec_decode_sampler.num_accepted_tokens = torch.tensor(0, - dtype=torch.long, - device='cuda') - spec_decode_sampler.num_emitted_tokens = torch.tensor(0, - dtype=torch.long, - device='cuda') - spec_decode_sampler.num_draft_tokens = 0 - - collector = AsyncMetricsCollector(spec_decode_sampler) - collector.init_gpu_tensors(rank=rank) - _ = collector.maybe_collect_rejsample_metrics(k=5) - metrics = collector.maybe_collect_rejsample_metrics(k=5) - assert metrics is None - - -def test_noop_until_time(): - """Verify metrics aren't collected until enough time passes. - """ - spec_decode_sampler = MagicMock() - spec_decode_sampler.num_accepted_tokens = torch.tensor(0, - dtype=torch.long, - device='cuda') - spec_decode_sampler.num_emitted_tokens = torch.tensor(0, - dtype=torch.long, - device='cuda') - spec_decode_sampler.num_draft_tokens = 0 - - collect_interval_s = 5.0 - timer = MagicMock() - timer.side_effect = [ - 0.0, collect_interval_s - 0.1, collect_interval_s - 0.1, - collect_interval_s + 0.1, collect_interval_s + 0.1 - ] - - collector = AsyncMetricsCollector(spec_decode_sampler=spec_decode_sampler, - timer=timer, - collect_interval_s=collect_interval_s) - collector.init_gpu_tensors(rank=0) - - _ = collector.maybe_collect_rejsample_metrics(k=5) - metrics = collector.maybe_collect_rejsample_metrics(k=5) - assert metrics is None - - _ = collector.maybe_collect_rejsample_metrics(k=5) - metrics = collector.maybe_collect_rejsample_metrics(k=5) - assert metrics is not None - - -def test_timer_is_reset(): - """Verify that the internal timer inside AsyncMetricsCollector - is reset after collection. 
- """ - spec_decode_sampler = MagicMock() - spec_decode_sampler.num_accepted_tokens = torch.tensor(0, - dtype=torch.long, - device='cuda') - spec_decode_sampler.num_emitted_tokens = torch.tensor(0, - dtype=torch.long, - device='cuda') - spec_decode_sampler.num_draft_tokens = 0 - - collect_interval_s = 5.0 - timer = MagicMock() - timer.side_effect = [ - 0.0, - collect_interval_s + 0.1, - collect_interval_s + 0.1, - collect_interval_s + 0.2, - collect_interval_s + 0.2, - 2 * collect_interval_s + 0.1, - 2 * collect_interval_s + 0.1, - ] - - collector = AsyncMetricsCollector(spec_decode_sampler=spec_decode_sampler, - timer=timer, - collect_interval_s=collect_interval_s) - collector.init_gpu_tensors(rank=0) - - _ = collector.maybe_collect_rejsample_metrics(k=5) - metrics = collector.maybe_collect_rejsample_metrics(k=5) - assert metrics is not None - - _ = collector.maybe_collect_rejsample_metrics(k=5) - metrics = collector.maybe_collect_rejsample_metrics(k=5) - assert metrics is None - - _ = collector.maybe_collect_rejsample_metrics(k=5) - metrics = collector.maybe_collect_rejsample_metrics(k=5) - assert metrics is not None - - -@pytest.mark.parametrize("has_data", [True, False]) -def test_initial_metrics_has_correct_values(has_data: bool): - """Test correctness of metrics data. - """ - if has_data: - num_accepted_tokens = 103 - num_emitted_tokens = 104 - num_draft_tokens = 105 - else: - num_accepted_tokens = 0 - num_emitted_tokens = 0 - num_draft_tokens = 0 - k = 5 - - max_num_emitted_tokens = AsyncMetricsCollector.get_max_num_emitted_tokens( - num_draft_tokens, k) - - spec_decode_sampler = MagicMock() - spec_decode_sampler.num_accepted_tokens = torch.tensor(num_accepted_tokens, - dtype=torch.long, - device='cuda') - spec_decode_sampler.num_emitted_tokens = torch.tensor(num_emitted_tokens, - dtype=torch.long, - device='cuda') - spec_decode_sampler.num_draft_tokens = num_draft_tokens - - collect_interval_s = 5.0 - timer = MagicMock() - timer.side_effect = [ - 0.0, collect_interval_s + 0.1, collect_interval_s + 0.2 - ] - - collector = AsyncMetricsCollector(spec_decode_sampler=spec_decode_sampler, - timer=timer, - collect_interval_s=collect_interval_s) - collector.init_gpu_tensors(rank=0) - _ = collector.maybe_collect_rejsample_metrics(k) - metrics = collector.maybe_collect_rejsample_metrics(k) - - assert metrics.num_spec_tokens == k - assert metrics.accepted_tokens == num_accepted_tokens - assert metrics.draft_tokens == num_draft_tokens - assert metrics.emitted_tokens == num_emitted_tokens - - if has_data: - assert (metrics.draft_acceptance_rate == num_accepted_tokens / - num_draft_tokens) - assert (metrics.system_efficiency == num_emitted_tokens / - max_num_emitted_tokens) - else: - assert math.isnan(metrics.draft_acceptance_rate) - assert math.isnan(metrics.system_efficiency) diff --git a/tests/spec_decode/test_multi_step_worker.py b/tests/spec_decode/test_multi_step_worker.py deleted file mode 100644 index f2d93203b8e10..0000000000000 --- a/tests/spec_decode/test_multi_step_worker.py +++ /dev/null @@ -1,838 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -import random -from unittest.mock import MagicMock - -import pytest -import torch - -from vllm.attention.selector import (_Backend, - global_force_attn_backend_context_manager) -from vllm.model_executor.layers.sampler import SamplerOutput -from vllm.model_executor.utils import set_random_seed -from vllm.sequence import (ExecuteModelRequest, HiddenStates, Logprob, - 
get_all_seq_ids) -from vllm.spec_decode.draft_model_runner import TP1DraftModelRunner -from vllm.spec_decode.multi_step_worker import MultiStepWorker -from vllm.spec_decode.top1_proposer import Top1Proposer -from vllm.worker.worker import Worker - -from .utils import (assert_logprobs_dict_allclose, create_batch, - create_seq_group_metadata_from_prompts, create_worker, - patch_execute_model_with_seeds, zero_kv_cache) - - -@pytest.mark.parametrize('num_steps', list(range(1, 17))) -def test_assert_enough_kv_space(num_steps: int): - """Test that the multi step worker checks for sufficient space in the KV - cache. It should throw if it cannot run all the steps. - """ - block_size = 16 - num_gpu_blocks = 2048 // block_size - - prompts = [ - list(range(block_size * 3)), - list(range(block_size * 2)), - ] - - prev_output_tokens = [ - list(range(block_size * 1)), - list(range(block_size * 2)), - ] - - final_prompt_lens = [ - len(prompt + output) + num_steps - for prompt, output in zip(prompts, prev_output_tokens) - ] - - inputs = create_seq_group_metadata_from_prompts( - prompts, - num_gpu_blocks, - block_size, - final_prompt_lens, - continuations=prev_output_tokens) - - assert_enough_kv_space = MultiStepWorker._assert_enough_kv_space # pylint: disable=protected-access - worker = MagicMock() - worker.model_runner.block_size = block_size - - for seq_group_metadata in inputs: - original_block_tables = seq_group_metadata.block_tables - - # No exception. - assert_enough_kv_space(worker, inputs, num_steps) - - seq_group_metadata.block_tables = { - seq_id: [] - for seq_id, physical_blocks in original_block_tables.items() - } - - # Expect exception. - with pytest.raises(ValueError, - match='times but found insufficient KV space for'): - assert_enough_kv_space(worker, inputs, num_steps) - - seq_group_metadata.block_tables = original_block_tables - - -@torch.inference_mode() -def test_same_output_for_single_step(): - """Verify the multi step worker produces the same output as the normal - worker for num_steps=1. 
- """ - seed = 100 - model_name = 'JackFram/llama-68m' - - block_size = 32 - num_gpu_blocks = 2048 // block_size - multi_step_worker = create_worker( - MultiStepWorker, - model_name, - block_size, - num_gpu_blocks, - seed, - model_runner_cls=TP1DraftModelRunner, - ) - worker = create_worker( - Worker, - model_name, - block_size, - num_gpu_blocks, - seed, - ) - # multi_step_worker.model_runner = worker.model_runner - # multi_step_worker.cache_engine = worker.cache_engine - - num_steps = 1 - - prompts = [ - [1, 2, 3, 4, 5], - [6, 7, 8, 9, 10], - ] - - final_prompt_lens = [len(prompt) + num_steps for prompt in prompts] - - multi_step_seq_group = create_seq_group_metadata_from_prompts( - prompts, - num_gpu_blocks, - block_size, - final_prompt_lens=final_prompt_lens) - - zero_kv_cache(multi_step_worker.cache_engine) - set_random_seed(seed) - actual_output, _ = multi_step_worker.sampler_output( - execute_model_req=ExecuteModelRequest( - seq_group_metadata_list=multi_step_seq_group), - sample_len=num_steps, - seq_ids_with_bonus_token_in_last_step=set()) - assert len(actual_output) == num_steps - actual_output = actual_output[0] - - single_step_seq_group = create_seq_group_metadata_from_prompts( - prompts, - num_gpu_blocks, - block_size, - final_prompt_lens=final_prompt_lens) - - zero_kv_cache(worker.cache_engine) - set_random_seed(seed) - expected_output = worker.execute_model( - execute_model_req=ExecuteModelRequest( - seq_group_metadata_list=single_step_seq_group))[0] - - actual_token_ids = [ - output.samples[0].output_token for output in actual_output - ] - actual_logprobs = [output.samples[0].logprobs for output in actual_output] - - expected_token_ids = [ - output.samples[0].output_token for output in expected_output - ] - expected_logprobs = [ - output.samples[0].logprobs for output in expected_output - ] - - assert actual_token_ids == expected_token_ids - - print(f'{actual_logprobs=}') - print(f'{expected_logprobs=}') - assert_logprobs_dict_allclose(actual_logprobs, expected_logprobs) - - -@torch.inference_mode() -def test_same_output_for_multi_step(): - """Verify the multi-step worker produces the same output as the normal - worker when num_steps > 1. This test runs the multi-step worker once, and - then runs the worker num_steps times, and compares the output. - """ - seed = 100 - model_name = 'JackFram/llama-68m' - - block_size = 16 - num_gpu_blocks = 2048 // block_size - multi_step_worker = create_worker( - MultiStepWorker, - model_name, - block_size, - num_gpu_blocks, - seed, - ) - - worker = create_worker( - Worker, - model_name, - block_size, - num_gpu_blocks, - seed, - ) - - # Make sure we go over the block boundary. - num_steps = block_size + 1 - - random.seed(seed) - prompts = [[ - random.randint(0, 1000) for _ in range(random.randint(10, 20)) - ] for _ in range(10)] - - final_prompt_lens = [len(prompt) + num_steps for prompt in prompts] - - rand_seeds = list(random.randint(0, 100) for _ in range(num_steps)) - multi_step_worker.execute_model = patch_execute_model_with_seeds( - multi_step_worker, rand_seeds) - worker.execute_model = patch_execute_model_with_seeds(worker, rand_seeds) - - continuations = [[1] for _ in prompts] - seq_group_metadata_list = create_seq_group_metadata_from_prompts( - prompts, - num_gpu_blocks, - block_size, - continuations=continuations, - final_prompt_lens=final_prompt_lens) - - # Run multi-step. 
- zero_kv_cache(multi_step_worker.cache_engine) - set_random_seed(seed) - multi_step_output, _ = multi_step_worker.sampler_output( - execute_model_req=ExecuteModelRequest( - seq_group_metadata_list=seq_group_metadata_list), - sample_len=num_steps, - seq_ids_with_bonus_token_in_last_step=set()) - - # Run single-step repeatedly. - zero_kv_cache(worker.cache_engine) - single_step_output: list[SamplerOutput] = [] - continuations = [[1] for _ in prompts] - set_random_seed(seed) - - for _ in multi_step_output: - - seq_group_metadata_list = create_seq_group_metadata_from_prompts( - prompts, - num_gpu_blocks, - block_size, - continuations=continuations, - final_prompt_lens=final_prompt_lens) - - single_step_output.extend( - worker.execute_model(execute_model_req=ExecuteModelRequest( - seq_group_metadata_list=seq_group_metadata_list))) - - # Append output tokens to new sequence data. - for i, seq_group_output in enumerate(single_step_output[-1]): - continuations[i].append(seq_group_output.samples[0].output_token) - - # Get token ids and logprobs for comparison. - multi_step_output_logprobs: list[list[dict[int, - Logprob]]] = [[] - for _ in prompts] - single_step_output_logprobs: list[list[dict[int, - Logprob]]] = [[] - for _ in prompts] - - multi_step_output_token_ids: list[list[int]] = [[] for _ in prompts] - single_step_output_token_ids: list[list[int]] = [[] for _ in prompts] - for i, _ in enumerate(prompts): - for multi_step, single_step in zip(multi_step_output, - single_step_output): - multi_step_output_token_ids[i].append( - multi_step[i].samples[0].output_token) - single_step_output_token_ids[i].append( - single_step[i].samples[0].output_token) - - multi_step_output_logprobs[i].append( - multi_step[i].samples[0].logprobs) - single_step_output_logprobs[i].append( - single_step[i].samples[0].logprobs) - - # Print per-sequence token ids - for i, (multi_step_tokens, single_step_tokens) in enumerate( - zip(multi_step_output_token_ids, single_step_output_token_ids)): - print(f'{i=} {multi_step_tokens=}') - print(f'{i=} {single_step_tokens=}') - print(f'{i=} equal {multi_step_tokens == single_step_tokens}') - - # Assert token ids are equal. - for multi_step_tokens, single_step_tokens in zip( - multi_step_output_token_ids, single_step_output_token_ids): - assert multi_step_tokens == single_step_tokens - - # Assert logprobs are equal. - for multi_step_logprobs, single_step_logprobs in zip( - multi_step_output_logprobs, single_step_output_logprobs): - assert_logprobs_dict_allclose(multi_step_logprobs, - single_step_logprobs) - - -@torch.inference_mode() -def test_multi_step_with_batch_expansion_correct_output(): - """ - In this test we verify that the MultiStepWorker is able to handle bonus - tokens correctly. The test verifies that if a sequence has a - bonus token then the MultiStepWorker is able to expand the batch by adding - new sequences corresponding to the sequences with bonus tokens. The - expanded batch is then used for predicting the next tokens. 
- """ - seed = 100 - model_name = 'JackFram/llama-68m' - - block_size = 16 - num_gpu_blocks = 2048 // block_size - batch_size = 128 - multi_step_worker = create_worker( - MultiStepWorker, - model_name, - block_size, - num_gpu_blocks, - seed, - model_runner_cls=TP1DraftModelRunner, - ) - multi_step_worker.set_include_gpu_probs_tensor() - worker = create_worker( - Worker, - model_name, - block_size, - num_gpu_blocks, - seed, - ) - random.seed(seed) - prompts = [[0] for _ in range(batch_size)] - num_steps = 2 - final_prompt_lens = [(num_steps + 1) for prompt in prompts] - rand_seeds = list(random.randint(0, 100) for _ in range(num_steps)) - multi_step_worker.execute_model = patch_execute_model_with_seeds( - multi_step_worker, rand_seeds) - worker.execute_model = patch_execute_model_with_seeds(worker, rand_seeds) - # Create the test continuations - continuations = [[random.randint(0, 1000)] for _ in prompts] - seq_group_metadata_list = create_seq_group_metadata_from_prompts( - prompts, - num_gpu_blocks, - block_size, - continuations=continuations, - final_prompt_lens=final_prompt_lens) - - # Run single-step twice to generate 2 tokens. This - # will simulate the bonus token case with the second token - # being the bonus token. - zero_kv_cache(worker.cache_engine) - single_step_output: list[SamplerOutput] = [] - set_random_seed(seed) - for _ in range(num_steps): - seq_group_metadata_list = create_seq_group_metadata_from_prompts( - prompts, - num_gpu_blocks, - block_size, - continuations=continuations, - final_prompt_lens=final_prompt_lens) - single_step_output.extend( - worker.execute_model(execute_model_req=ExecuteModelRequest( - seq_group_metadata_list=seq_group_metadata_list))) - # Append output tokens to new sequence data. - for i, seq_group_output in enumerate(single_step_output[-1]): - continuations[i].append(seq_group_output.samples[0].output_token) - - # Create continuations for the MultiStepWorker. The continuations have - # 2 tokens in order to simulate the bonus token case. - multi_step_continuations = [] - for continuation in continuations: - multi_step_continuations.append(continuation[:2]) - seq_group_metadata_list = create_seq_group_metadata_from_prompts( - prompts, - num_gpu_blocks, - block_size, - continuations=multi_step_continuations, - final_prompt_lens=final_prompt_lens) - - # Run multi-step and verify that the third token prediction is accurate - # for all sequences. - zero_kv_cache(multi_step_worker.cache_engine) - all_seq_ids = {i for i in range(batch_size)} - multi_step_output, _ = multi_step_worker.sampler_output( - execute_model_req=ExecuteModelRequest( - seq_group_metadata_list=seq_group_metadata_list), - sample_len=1, - seq_ids_with_bonus_token_in_last_step=all_seq_ids) - for index, output in enumerate(multi_step_output[-1].outputs): - assert (continuations[index][-1] == output.samples[0].output_token) - - -@torch.inference_mode() -def test_multi_step_with_batch_expansion_incorrect_output(): - """ - Tests the MultiStepWorker's ability to handle batch expansion with bonus - tokens in a negative case scenario. This test provides the MultiStepWorker - with a batch containing sequences with bonus tokens but specifies the - sequence IDs with bonus tokens incorrectly. The test verifies that the - MultiStepWorker generates correct tokens for the sequences where the - sequence ID is specified correctly and incorrect tokens for those where - the sequence ID is specified incorrectly. 
- """ - seed = 100 - model_name = 'JackFram/llama-68m' - - block_size = 16 - num_gpu_blocks = 2048 // block_size - batch_size = 128 - multi_step_worker = create_worker( - MultiStepWorker, - model_name, - block_size, - num_gpu_blocks, - seed, - model_runner_cls=TP1DraftModelRunner, - ) - multi_step_worker.set_include_gpu_probs_tensor() - worker = create_worker( - Worker, - model_name, - block_size, - num_gpu_blocks, - seed, - ) - random.seed(seed) - prompts = [[0] for _ in range(batch_size)] - num_steps = 2 - final_prompt_lens = [(num_steps + 1) for prompt in prompts] - rand_seeds = list(random.randint(0, 100) for _ in range(num_steps)) - multi_step_worker.execute_model = patch_execute_model_with_seeds( - multi_step_worker, rand_seeds) - worker.execute_model = patch_execute_model_with_seeds(worker, rand_seeds) - # Create the test continuations - continuations = [[random.randint(0, 1000)] for _ in prompts] - seq_group_metadata_list = create_seq_group_metadata_from_prompts( - prompts, - num_gpu_blocks, - block_size, - continuations=continuations, - final_prompt_lens=final_prompt_lens) - # Run single-step twice to generate 2 tokens. This - # will simulate the bonus token case with the second token - # being the bonus token. - zero_kv_cache(worker.cache_engine) - single_step_output: list[SamplerOutput] = [] - set_random_seed(seed) - for _ in range(num_steps): - seq_group_metadata_list = create_seq_group_metadata_from_prompts( - prompts, - num_gpu_blocks, - block_size, - continuations=continuations, - final_prompt_lens=final_prompt_lens) - single_step_output.extend( - worker.execute_model(execute_model_req=ExecuteModelRequest( - seq_group_metadata_list=seq_group_metadata_list))) - # Append output tokens to new sequence data. - for i, seq_group_output in enumerate(single_step_output[-1]): - continuations[i].append(seq_group_output.samples[0].output_token) - - # Create continuations for the MultiStepWorker. The continuations have - # 2 tokens in order to simulate the bonus token case. - multi_step_continuations = [] - for continuation in continuations: - multi_step_continuations.append(continuation[:2]) - seq_group_metadata_list = create_seq_group_metadata_from_prompts( - prompts, - num_gpu_blocks, - block_size, - continuations=multi_step_continuations, - final_prompt_lens=final_prompt_lens) - - # Run multi-step. In this run INCORRECTLY specify that only the odd number - # sequences have bonus tokens. Verify that with this setting the third token - # prediction is accurate only for the odd numbered sequences. Also verify - # that the prediction might be wrong for some of the even numbered - # sequences. - zero_kv_cache(multi_step_worker.cache_engine) - set_random_seed(seed) - odd_seq_ids = {i for i in range(batch_size) if i % 2 != 0} - multi_step_output, _ = multi_step_worker.sampler_output( - execute_model_req=ExecuteModelRequest( - seq_group_metadata_list=seq_group_metadata_list), - sample_len=1, - seq_ids_with_bonus_token_in_last_step=odd_seq_ids) - num_mismatch = 0 - for index, output in enumerate(multi_step_output[-1].outputs): - if (index % 2) != 0: - assert (continuations[index][-1] == output.samples[0].output_token) - elif (continuations[index][-1] != output.samples[0].output_token): - num_mismatch += 1 - # The prediction is accurate for some of the sequences even without proper - # handling of the bonus tokens. Hence verify that the number of sequences - # for which there is a mismatch is > 0. 
- assert (num_mismatch > 0) - - -@torch.inference_mode() -@pytest.mark.parametrize('num_steps', [1, 2, 3, 4]) -# The choice of backends forces the multi_step_worker to choose between -# the vanilla model_runner and TP1DraftModelRunner and that we can test -# both code paths. -@pytest.mark.parametrize('attn_backend', - [_Backend.XFORMERS, _Backend.FLASH_ATTN]) -def test_multi_step_correct_kvcache(num_steps, attn_backend): - """Verify that the KV cache of the draft model - is correctly updated for sequences with bonus token. - """ - seed = 100 - model_name = "JackFram/llama-68m" - - block_size = 16 - num_gpu_blocks = 2048 // block_size - batch_size = 1 - - with global_force_attn_backend_context_manager(attn_backend): - dtype = 'float16' if attn_backend == _Backend.FLASH_ATTN else 'float32' - multi_step_worker = create_worker(MultiStepWorker, - model_name, - block_size, - num_gpu_blocks, - seed, - model_runner_cls=TP1DraftModelRunner, - dtype=dtype) - multi_step_worker.set_include_gpu_probs_tensor() - worker = create_worker(Worker, - model_name, - block_size, - num_gpu_blocks, - seed, - dtype=dtype) - - prompts = [[0] for _ in range(batch_size)] - # Already generate two tokens for the sequence - # so that we can simulate the bonus token case - multi_step_continuations = [[ - random.randint(0, 1000), - random.randint(0, 1000) - ] for _ in prompts] - final_prompt_lens = [len(prompt) + 2 + num_steps for prompt in prompts] - - seq_ids_with_bonus_token_in_last_step = set(range(batch_size)) - seq_group_metadata_list = create_seq_group_metadata_from_prompts( - prompts, - num_gpu_blocks, - block_size, - continuations=multi_step_continuations, - final_prompt_lens=final_prompt_lens) - - # Run multi-step. - zero_kv_cache(multi_step_worker.cache_engine) - multi_step_worker.sampler_output(execute_model_req=ExecuteModelRequest( - seq_group_metadata_list=seq_group_metadata_list), - sample_len=num_steps, - seq_ids_with_bonus_token_in_last_step= - seq_ids_with_bonus_token_in_last_step) - - # Run single-step repeatedly. - zero_kv_cache(worker.cache_engine) - # Generate the kv cache for the bonus token first - single_step_continuations = [c[:1] for c in multi_step_continuations] - seq_group_metadata_list = create_seq_group_metadata_from_prompts( - prompts, - num_gpu_blocks, - block_size, - continuations=single_step_continuations, - final_prompt_lens=final_prompt_lens) - single_step_output = worker.execute_model( - execute_model_req=ExecuteModelRequest( - seq_group_metadata_list=seq_group_metadata_list)) - for _ in range(num_steps): - seq_group_metadata_list = create_seq_group_metadata_from_prompts( - prompts, - num_gpu_blocks, - block_size, - continuations=multi_step_continuations, - final_prompt_lens=final_prompt_lens) - - single_step_output = worker.execute_model( - execute_model_req=ExecuteModelRequest( - seq_group_metadata_list=seq_group_metadata_list)) - - for i, seq_group_output in enumerate(single_step_output[-1]): - multi_step_continuations[i].append( - seq_group_output.samples[0].output_token) - - # Verify that the KV cache of the single-step and - # multi-step workers are the same. 
- single_step_gpu_cache = worker.cache_engine[0].gpu_cache - multi_step_gpu_cache = multi_step_worker.cache_engine[0].gpu_cache - num_layers = len(single_step_gpu_cache) - allclose = lambda a, b: torch.allclose( - a.cuda(), b.cuda(), rtol=1e-2, atol=1e-2) - for i in range(num_layers): - assert allclose(single_step_gpu_cache[i][0], - multi_step_gpu_cache[i][0]) - assert allclose(single_step_gpu_cache[i][1], - multi_step_gpu_cache[i][1]) - - -@torch.inference_mode() -def test_draft_proposals_full_speculation_len(): - """Verify Top1Proposer correctly handles case where all sequences - can speculate. - """ - k = 10 - batch_size = 32 - vocab_size = 32_000 - device = 'cuda:0' - - draft_worker = MagicMock() - proposer = Top1Proposer( - worker=draft_worker, - device=device, - vocab_size=vocab_size, - max_proposal_len=2048, - ) - draft_worker.sampler_output.return_value = [ - SamplerOutput( - outputs=[], - sampled_token_probs=torch.rand(batch_size, - vocab_size, - device=device, - dtype=torch.float32), - logprobs=torch.rand(batch_size, - vocab_size, - device=device, - dtype=torch.float32), - sampled_token_ids=torch.randint(low=0, - high=vocab_size, - size=(batch_size, ), - device=device, - dtype=torch.long), - ) for _ in range(k) - ], True - - seq_group_metadata_list, _, _ = create_batch(batch_size, k) - - proposals = proposer.get_spec_proposals( - execute_model_req=ExecuteModelRequest( - seq_group_metadata_list=seq_group_metadata_list, - num_lookahead_slots=k), - seq_ids_with_bonus_token_in_last_step=set()) - - assert torch.is_tensor(proposals.proposal_token_ids) - assert torch.is_tensor(proposals.proposal_probs) - - assert proposals.proposal_token_ids.shape == torch.Size([batch_size, k]) - assert proposals.proposal_probs.shape[:-1] == torch.Size([batch_size, k]) - - assert proposals.proposal_lens.shape == torch.Size([batch_size]) - assert proposals.proposal_lens.tolist() == [k for _ in range(batch_size)] - - -@torch.inference_mode() -def test_draft_proposals_no_speculations(): - """Verify Top1Proposer correctly handles case where no sequences - can speculate. - """ - k = 10 - batch_size = 32 - vocab_size = 32_000 - device = 'cuda:0' - prompt_len = 10 - - draft_worker = MagicMock() - proposer = Top1Proposer( - worker=draft_worker, - device=device, - vocab_size=vocab_size, - max_proposal_len=prompt_len + k - 1, - ) - - seq_group_metadata_list, _, _ = create_batch(batch_size, - k, - prompt_len=prompt_len) - - proposals = proposer.get_spec_proposals( - execute_model_req=ExecuteModelRequest( - seq_group_metadata_list=seq_group_metadata_list, - num_lookahead_slots=k), - seq_ids_with_bonus_token_in_last_step=set()) - - assert torch.is_tensor(proposals.proposal_token_ids) - assert torch.is_tensor(proposals.proposal_probs) - - assert proposals.proposal_token_ids.shape == torch.Size([batch_size, k]) - assert proposals.proposal_probs.shape[:-1] == torch.Size([batch_size, k]) - - assert proposals.proposal_lens.shape == torch.Size([batch_size]) - assert proposals.proposal_lens.tolist() == [0 for _ in range(batch_size)] - - -@torch.inference_mode() -def test_draft_proposals_mixed_k(): - """Verify Top1Proposer correctly handles case some sequences can - speculate and some can't. 
- """ - k = 10 - batch_size = 32 - vocab_size = 32_000 - device = 'cuda:0' - - small_prompt_len = 5 - long_prompt_len = 10 - prev_output_token_len = 20 - - expected_num_proposal_seqs = 6 - expected_num_no_proposal_seqs = batch_size - expected_num_proposal_seqs - - prompt_len = [ - small_prompt_len for _ in range(expected_num_proposal_seqs - 1) - ] + [long_prompt_len - for _ in range(expected_num_no_proposal_seqs)] + [small_prompt_len] - - draft_worker = MagicMock() - proposer = Top1Proposer( - worker=draft_worker, - device=device, - vocab_size=vocab_size, - max_proposal_len=long_prompt_len + prev_output_token_len + k - 1, - ) - - draft_worker.sampler_output.return_value = [ - SamplerOutput( - outputs=[], - sampled_token_probs=torch.rand(expected_num_proposal_seqs, - vocab_size, - device=device, - dtype=torch.float32), - logprobs=torch.rand(expected_num_proposal_seqs, - vocab_size, - device=device, - dtype=torch.float32), - sampled_token_ids=torch.randint( - low=0, - high=vocab_size, - size=(expected_num_proposal_seqs, ), - device=device, - dtype=torch.long), - ) for _ in range(k) - ], True - - seq_group_metadata_list, _, _ = create_batch( - batch_size, - k, - prompt_len=prompt_len, - prev_output_token_len=prev_output_token_len, - ) - - proposals = proposer.get_spec_proposals( - execute_model_req=ExecuteModelRequest( - seq_group_metadata_list=seq_group_metadata_list, - num_lookahead_slots=k), - seq_ids_with_bonus_token_in_last_step=set()) - - assert torch.is_tensor(proposals.proposal_token_ids) - assert torch.is_tensor(proposals.proposal_probs) - - assert proposals.proposal_token_ids.shape == torch.Size([batch_size, k]) - assert proposals.proposal_probs.shape[:-1] == torch.Size([batch_size, k]) - - assert proposals.proposal_lens.shape == torch.Size([batch_size]) - assert proposals.proposal_lens.tolist() == [ - k for _ in range(expected_num_proposal_seqs - 1) - ] + [0 for _ in range(expected_num_no_proposal_seqs)] + [k] - - -@torch.inference_mode() -def test_use_draft_model_runner_advance_step(): - """Verify that draft model runner triggers advance step - when applicable. - """ - seed = 100 - model_name = 'JackFram/llama-68m' - - k = 5 - batch_size = 32 - block_size = 32 - num_gpu_blocks = 2048 // block_size - worker = create_worker( - MultiStepWorker, - model_name, - block_size, - num_gpu_blocks, - seed, - model_runner_cls=TP1DraftModelRunner, - ) - - # Mock "_gpu_advance_step" to raise an exception when called. - exception_secret = "artificial stop" - worker.model_runner._gpu_advance_step = MagicMock() - worker.model_runner._gpu_advance_step.side_effect = ValueError( - exception_secret) - - seq_group_metadata_list, _, _ = create_batch(batch_size, - k, - block_size=block_size, - num_gpu_blocks=num_gpu_blocks) - - # Fallback (should not call) when num_steps=1. - execute_model_req = ExecuteModelRequest( - seq_group_metadata_list=seq_group_metadata_list, - num_lookahead_slots=k, - num_steps=1) - worker.execute_model(execute_model_req=execute_model_req) - - # Expect exception if _gpu_advance_step is called. 
- execute_model_req = ExecuteModelRequest( - seq_group_metadata_list=seq_group_metadata_list, - num_lookahead_slots=k, - num_steps=k) - - with pytest.raises(ValueError, match=exception_secret): - worker.execute_model(execute_model_req=execute_model_req) - call_args_list = worker.model_runner._gpu_advance_step.call_args_list - assert len(call_args_list) == 1 - - -@torch.inference_mode() -def test_expand_execute_model_request_sync_with_expand_hidden_states(): - """ - In this test we verify that the logic for expanding the - seq_group_metadata_list remains in sync with the expansion logic of - the HiddenStates in _expand_execute_model_request. - """ - k = 5 - batch_size = 16 - seq_with_bonus_token_in_last_step = [1, 3, 8, 10, 13, 15] - - seq_group_metadata_list, _, _ = create_batch(batch_size, k) - - execute_model_request = ExecuteModelRequest( - seq_group_metadata_list, - previous_hidden_states=HiddenStates( - torch.arange(batch_size), seq_group_metadata_list, - torch.arange(batch_size, 2 * batch_size))) - - expanded_execute_model_request, orig_seq_group_ids = MultiStepWorker.\ - _expand_execute_model_request(execute_model_request, - seq_with_bonus_token_in_last_step) - - all_seq_ids = torch.tensor( - get_all_seq_ids( - expanded_execute_model_request.seq_group_metadata_list)) - ref_expanded_hidden_states = all_seq_ids + batch_size - ref_expanded_hidden_states[orig_seq_group_ids] -= batch_size - - assert (ref_expanded_hidden_states == expanded_execute_model_request. - previous_hidden_states.hidden_states).all().item() diff --git a/tests/spec_decode/test_ngram_worker.py b/tests/spec_decode/test_ngram_worker.py deleted file mode 100644 index 8a7c114856811..0000000000000 --- a/tests/spec_decode/test_ngram_worker.py +++ /dev/null @@ -1,221 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -import torch - -from vllm.sequence import ExecuteModelRequest -from vllm.spec_decode.ngram_worker import NGramWorker -from vllm.spec_decode.top1_proposer import Top1Proposer - -from .utils import create_seq_group_metadata_from_prompts, create_worker - - -def test_ngram_algo_correctness_for_single_no_match(): - """Verify our ngram algo find the right candidate in the prompt - - For the scenario cannot find any candidate in one single batch - """ - block_size = 32 - num_gpu_blocks = 2048 // block_size - seed = 100 - model_name = 'JackFram/llama-68m' - vocab_size = 32_000 - device = 'cuda:0' - - ngram_worker = create_worker( - NGramWorker, - model_name, - block_size, - num_gpu_blocks, - seed, - ) - - proposer = Top1Proposer( - worker=ngram_worker, - device=device, - vocab_size=vocab_size, - max_proposal_len=20, - ) - - # set ngram window [1, 3], which is window=1/2/3 - ngram_worker.set_ngram_window_size(1, 3) - - prompts = [ - # shall find no candidate - [1, 2, 3, 4, 5, 6, 7], - ] - - proposal_len = 5 - final_prompt_lens = [len(prompt) + proposal_len for prompt in prompts] - seq_group_metadata_list = create_seq_group_metadata_from_prompts( - prompts, - num_gpu_blocks, - block_size, - final_prompt_lens=final_prompt_lens) - - proposals = proposer.get_spec_proposals( - execute_model_req=ExecuteModelRequest( - seq_group_metadata_list=seq_group_metadata_list, - num_lookahead_slots=proposal_len), - seq_ids_with_bonus_token_in_last_step=None) - - assert torch.is_tensor(proposals.proposal_token_ids) - assert torch.is_tensor(proposals.proposal_probs) - - assert proposals.proposal_token_ids.shape == torch.Size([1, proposal_len]) - assert 
proposals.proposal_probs.shape[:-1] == torch.Size([1, proposal_len]) - assert proposals.proposal_lens.shape == torch.Size([1]) - assert proposals.proposal_lens.tolist() == [0] - - -def test_ngram_algo_correctness_for_batches_not_match_all(): - """Verify our ngram algo find the right candidate in the prompt - - For the scenario find some candidate not full in batchs - """ - block_size = 32 - num_gpu_blocks = 2048 // block_size - seed = 100 - model_name = 'JackFram/llama-68m' - vocab_size = 32_000 - device = 'cuda:0' - - ngram_worker = create_worker( - NGramWorker, - model_name, - block_size, - num_gpu_blocks, - seed, - ) - - proposer = Top1Proposer( - worker=ngram_worker, - device=device, - vocab_size=vocab_size, - max_proposal_len=20, - ) - - # set ngram window [1, 3], which is window=1/2/3 - ngram_worker.set_ngram_window_size(1, 3) - - prompts = [ - # shall find no candidate - [1, 2, 3, 4, 5, 6, 7], - # shall find candidate 12,13,14,15,16 - [11, 12, 13, 14, 15, 16, 11], - # shall find candidate 23,24,25,26,21 - [21, 21, 22, 23, 24, 25, 26, 21, 22], - # shall find candidate 34,35,36,37,38 - [31, 32, 31, 32, 33, 34, 35, 36, 37, 38, 31, 32, 33], - # shall find no candidate as exceed max_proposal_len - [ - 31, 32, 31, 32, 31, 32, 31, 32, 31, 32, 31, 32, 33, 34, 35, 36, 37, - 38, 31, 32, 33 - ], - ] - - proposal_len = 5 - final_prompt_lens = [len(prompt) + proposal_len for prompt in prompts] - seq_group_metadata_list = create_seq_group_metadata_from_prompts( - prompts, - num_gpu_blocks, - block_size, - final_prompt_lens=final_prompt_lens) - for sg in seq_group_metadata_list: - sg.is_prompt = False - proposals = proposer.get_spec_proposals( - execute_model_req=ExecuteModelRequest( - seq_group_metadata_list=seq_group_metadata_list, - num_lookahead_slots=proposal_len), - seq_ids_with_bonus_token_in_last_step=None) - - assert torch.is_tensor(proposals.proposal_token_ids) - assert torch.is_tensor(proposals.proposal_probs) - - assert proposals.proposal_token_ids.shape == torch.Size([5, proposal_len]) - assert proposals.proposal_probs.shape[:-1] == torch.Size([5, proposal_len]) - assert proposals.proposal_lens.shape == torch.Size([5]) - - # the first sequence has no match so proposal_len should be overwritten to 0 - assert proposals.proposal_lens.tolist( - ) == [0] + [proposal_len for _ in range(3)] + [0] - - for i in range(proposal_len): - assert proposals.proposal_token_ids[0][i] == -1 - assert proposals.proposal_token_ids[1][i] == prompts[1][i + 1] - assert proposals.proposal_token_ids[2][i] == prompts[2][i + 3] - assert proposals.proposal_token_ids[3][i] == prompts[3][i + 5] - assert proposals.proposal_token_ids[4][i] == -1 - - -def test_ngram_algo_correctness_for_batches_match_all(): - """Verify our ngram algo find the right candidate in the prompt - - For the scenario find candidate in all batches - """ - - block_size = 32 - num_gpu_blocks = 2048 // block_size - seed = 100 - model_name = 'JackFram/llama-68m' - vocab_size = 32_000 - device = 'cuda:0' - - ngram_worker = create_worker( - NGramWorker, - model_name, - block_size, - num_gpu_blocks, - seed, - ) - - proposer = Top1Proposer( - worker=ngram_worker, - device=device, - vocab_size=vocab_size, - max_proposal_len=20, - ) - - # set ngram window [0, 3], which is window=1/2/3 - ngram_worker.set_ngram_window_size(1, 3) - - prompts = [ - # shall find candidate 12,13,14,15,16 - [11, 12, 13, 14, 15, 16, 11], - # shall find candidate 23,24,25,26,21 - [21, 21, 22, 23, 24, 25, 26, 21, 22], - # shall find candidate 34,35,36,37,38 - [31, 32, 31, 32, 33, 
34, 35, 36, 37, 38, 31, 32, 33], - ] - - proposal_len = 5 - final_prompt_lens = [len(prompt) + proposal_len for prompt in prompts] - seq_group_metadata_list = create_seq_group_metadata_from_prompts( - prompts, - num_gpu_blocks, - block_size, - final_prompt_lens=final_prompt_lens) - - # Normally drafter is run on decode requests only; here we check the output - # of the ngram worker as it is the sole proposer that has no forward. - for sg in seq_group_metadata_list: - sg.is_prompt = False - proposals = proposer.get_spec_proposals( - execute_model_req=ExecuteModelRequest( - seq_group_metadata_list=seq_group_metadata_list, - num_lookahead_slots=proposal_len), - seq_ids_with_bonus_token_in_last_step=None) - - assert torch.is_tensor(proposals.proposal_token_ids) - assert torch.is_tensor(proposals.proposal_probs) - - assert proposals.proposal_token_ids.shape == torch.Size([3, proposal_len]) - assert proposals.proposal_probs.shape[:-1] == torch.Size([3, proposal_len]) - assert proposals.proposal_lens.shape == torch.Size([3]) - - assert proposals.proposal_lens.tolist() == [proposal_len for _ in range(3)] - - for i in range(proposal_len): - assert proposals.proposal_token_ids[0][i] == prompts[0][i + 1] - assert proposals.proposal_token_ids[1][i] == prompts[1][i + 3] - assert proposals.proposal_token_ids[2][i] == prompts[2][i + 5] diff --git a/tests/spec_decode/test_scorer.py b/tests/spec_decode/test_scorer.py deleted file mode 100644 index 55fcf00557476..0000000000000 --- a/tests/spec_decode/test_scorer.py +++ /dev/null @@ -1,116 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -import random - -import pytest -import torch - -from vllm.sequence import ExecuteModelRequest -from vllm.spec_decode.batch_expansion import BatchExpansionTop1Scorer -from vllm.spec_decode.interfaces import SpeculativeProposals, SpeculativeScores -from vllm.spec_decode.mqa_scorer import MQAScorer -from vllm.worker.worker import Worker - -from .utils import create_batch, create_worker - - -def create_proposal(propose_lens: list[int], vocab_size: int, - device: str) -> SpeculativeProposals: - batch_size = len(propose_lens) - max_propose_len = max(propose_lens) - proposal_probs = torch.rand((batch_size, max_propose_len, vocab_size), - device=device) - - proposal_token_ids = torch.full((batch_size, max_propose_len), - fill_value=-1, - device=device) - for i in range(batch_size): - proposal_token_ids[i][:propose_lens[i]] = torch.argmax( - proposal_probs[i][:propose_lens[i]], dim=-1) - - propose_lens = torch.tensor(propose_lens, device=device) - return SpeculativeProposals(proposal_token_ids, proposal_probs, - propose_lens) - - -def assert_score_equal(score1: SpeculativeScores, - score2: SpeculativeScores) -> None: - assert torch.allclose(score1.probs, score2.probs) - assert torch.allclose(score1.logprobs, score2.logprobs) - assert torch.equal( - score1.token_ids, - score2.token_ids), f"{score1.token_ids}, {score2.token_ids}" - - -@pytest.mark.parametrize('model_name', ['facebook/opt-125m']) -@pytest.mark.parametrize('batch_size', [1, 2, 4, 8, 16]) -@pytest.mark.parametrize('max_propose_len', [1, 3, 5]) -@pytest.mark.parametrize('mixed_propose_len', [True]) -@pytest.mark.parametrize('device', ['cuda']) -@pytest.mark.parametrize('prefill_chunking', [False, True]) -def test_scorer(model_name: str, batch_size: int, max_propose_len: int, - mixed_propose_len: bool, device: str, - prefill_chunking: bool) -> None: - """ - Compare the batch expansion scorer and mqa 
scorer return the same score. - We test for both queries with the same propose length and different - propose length, as well as mixed prefill-decode batches. - """ - seed = 0 - block_size = 32 - num_gpu_blocks = 2048 // block_size - scorer_worker = create_worker(Worker, model_name, block_size, - num_gpu_blocks, seed) - scorer_worker.model_runner.disable_logprobs = True # accessed by mqa_scorer - scorer_worker.model_runner.sampler.include_gpu_probs_tensor = True - scorer_worker.model_runner.sampler.should_modify_greedy_probs_inplace = True - - vocab_size = scorer_worker.vocab_size - - if not mixed_propose_len: - propose_lens = [max_propose_len] * batch_size - else: - # There must be at least 1 decode request, otherwise - # we have nothing to score (`_run_no_spec`). - non_zero_cnt = random.randint(1, batch_size) - propose_lens = [max_propose_len - ] * non_zero_cnt + [0] * (batch_size - non_zero_cnt) - random.shuffle(propose_lens) - - seq_group_metadatalist, _, _ = create_batch(batch_size, - max_propose_len, - block_size=block_size, - num_gpu_blocks=num_gpu_blocks) - - if mixed_propose_len and prefill_chunking and (n_prefills := - batch_size - non_zero_cnt): - prefill, _, _ = create_batch(n_prefills, - None, - prefill_chunk_size=4, - block_size=block_size, - num_gpu_blocks=num_gpu_blocks, - seq_ids=list( - range(batch_size, - batch_size + n_prefills))) - # re-order to guarantee prefill|decode order - target_group_metadatalist = [ - seq_group_metadatalist[i] for i, p in enumerate(propose_lens) - if p > 0 - ] - seq_group_metadatalist = prefill + target_group_metadatalist - propose_lens = [0] * n_prefills + [p for p in propose_lens if p > 0] - - proposals = create_proposal(propose_lens, vocab_size, device) - requests = ExecuteModelRequest(seq_group_metadatalist, - num_lookahead_slots=max_propose_len) - - batch_expansion_scorer = BatchExpansionTop1Scorer(scorer_worker, device, - vocab_size) - batch_expansion_score = batch_expansion_scorer.score_proposals( - requests, proposals) - - mqa_scorer = MQAScorer(scorer_worker, device, vocab_size) - mqa_score = mqa_scorer.score_proposals(requests, proposals) - - assert_score_equal(batch_expansion_score, mqa_score) diff --git a/tests/spec_decode/test_spec_decode_worker.py b/tests/spec_decode/test_spec_decode_worker.py deleted file mode 100644 index 8aceaadff8d38..0000000000000 --- a/tests/spec_decode/test_spec_decode_worker.py +++ /dev/null @@ -1,945 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -import random -from collections import defaultdict -from types import SimpleNamespace -from unittest.mock import MagicMock - -import pytest -import torch - -from vllm.model_executor.layers.sampler import SamplerOutput -from vllm.model_executor.utils import set_random_seed -from vllm.sequence import ExecuteModelRequest, SequenceOutput -from vllm.spec_decode.batch_expansion import BatchExpansionTop1Scorer -from vllm.spec_decode.draft_model_runner import TP1DraftModelRunner -from vllm.spec_decode.interfaces import SpeculativeProposals -from vllm.spec_decode.metrics import (AsyncMetricsCollector, - SpecDecodeWorkerMetrics) -from vllm.spec_decode.multi_step_worker import MultiStepWorker -from vllm.spec_decode.spec_decode_worker import (SpecDecodeWorker, - split_num_cache_blocks_evenly) -from vllm.worker.worker import Worker - -from .test_utils import mock_spec_decode_sampler -from .utils import (create_batch, create_sampler_output_list, create_worker, - mock_worker) - - 
-@pytest.mark.parametrize('k', [1, 2, 6]) -@pytest.mark.parametrize('batch_size', [1, 2, 32]) -@pytest.mark.parametrize("acceptance_sampler_method", - ["rejection_sampler", "typical_acceptance_sampler"]) -@torch.inference_mode() -def test_correctly_calls_draft_model(k: int, batch_size: int, - acceptance_sampler_method: str): - """Verify SpecDecodeWorker calls the draft worker with correct - inputs. Everything else is mocked out. - """ - draft_worker = mock_worker(cls=MultiStepWorker) - target_worker = mock_worker() - metrics_collector = MagicMock(spec=AsyncMetricsCollector) - worker = SpecDecodeWorker( - draft_worker, - target_worker, - mock_spec_decode_sampler(acceptance_sampler_method), - disable_logprobs=False, - metrics_collector=metrics_collector) - exception_secret = 'artificial stop' - draft_worker.get_spec_proposals.side_effect = ValueError(exception_secret) - - seq_group_metadata_list, _, _ = create_batch(batch_size, k) - execute_model_req = ExecuteModelRequest( - seq_group_metadata_list=seq_group_metadata_list, num_lookahead_slots=k) - - with pytest.raises(ValueError, match=exception_secret): - worker.execute_model(execute_model_req=execute_model_req) - - call_args_list = draft_worker.get_spec_proposals.call_args_list - assert len(call_args_list) == 1 - - for args, _ in call_args_list: - actual_execute_model_data = args[0] - assert actual_execute_model_data == execute_model_req - - -@pytest.mark.parametrize('k', [1, 2, 6]) -@pytest.mark.parametrize('batch_size', [1, 2, 32]) -@pytest.mark.parametrize("acceptance_sampler_method", - ["rejection_sampler", "typical_acceptance_sampler"]) -@torch.inference_mode() -def test_batch_expansion_correctly_calls_target_model( - k: int, batch_size: int, acceptance_sampler_method: str): - """Verify SpecDecodeWorker calls the target model with correct - inputs with batch expansion. Everything else is mocked out. 
- """ - draft_worker = mock_worker(cls=MultiStepWorker, use_spec=False) - target_worker = mock_worker(use_spec=False) - metrics_collector = MagicMock(spec=AsyncMetricsCollector) - - draft_worker.device = 'cuda' - target_worker.device = 'cuda' - - set_random_seed(1) - - worker = SpecDecodeWorker( - draft_worker, - target_worker, - mock_spec_decode_sampler(acceptance_sampler_method), - disable_logprobs=False, - metrics_collector=metrics_collector, - disable_mqa_scorer=True) - worker.init_device() - - vocab_size = 32_000 - - proposal_token_ids = torch.randint(low=0, - high=vocab_size, - size=(batch_size, k), - dtype=torch.int64, - device='cuda') - proposal_probs = torch.rand(batch_size, - k, - vocab_size, - dtype=torch.float32, - device='cuda') - proposal_lens = torch.ones(batch_size, dtype=torch.int64, - device='cuda') * k - - seq_group_metadata_list, prompts, prev_output_tokens = create_batch( - batch_size, k) - - draft_worker.get_spec_proposals.return_value = SpeculativeProposals( - proposal_token_ids=proposal_token_ids, - proposal_probs=proposal_probs, - proposal_lens=proposal_lens) - - exception_secret = 'artificial stop' - target_worker.execute_model.side_effect = ValueError(exception_secret) - - with pytest.raises(ValueError, match=exception_secret): - worker.execute_model(execute_model_req=ExecuteModelRequest( - seq_group_metadata_list=seq_group_metadata_list, - num_lookahead_slots=k)) - - seen_contexts: list[list[int]] = [] - - call_args_list = target_worker.execute_model.call_args_list - assert len(call_args_list) == 1 - for _, kwargs in call_args_list: - seq_group_metadata_list = kwargs[ - "execute_model_req"].seq_group_metadata_list - - assert len(seq_group_metadata_list) == (k + 1) * batch_size - for seq_group_metadata in seq_group_metadata_list: - for seq_data in seq_group_metadata.seq_data.values(): - seen_contexts.append(seq_data.get_token_ids()) - - expected_seen_contexts: list[list[int]] = [] - - for prompt, prev_generated, draft_tokens in zip( - prompts, prev_output_tokens, proposal_token_ids.tolist()): - - for i in range(len(draft_tokens) + 1): - expected_seen_contexts.append(prompt + prev_generated + - draft_tokens[:i]) - - seen_contexts.sort() - expected_seen_contexts.sort() - assert expected_seen_contexts == seen_contexts - - -@pytest.mark.parametrize('k', [1, 2, 6]) -@pytest.mark.parametrize('batch_size', [1, 2, 32]) -@pytest.mark.parametrize("acceptance_sampler_method", - ["rejection_sampler", "typical_acceptance_sampler"]) -@torch.inference_mode() -def test_correctly_calls_spec_decode_sampler(k: int, batch_size: int, - acceptance_sampler_method: str): - """Verify SpecDecodeWorker calls the rejection sampler with - correct inputs. Everything else is mocked out. 
- """ - vocab_size = 32_000 - - draft_worker = mock_worker(cls=MultiStepWorker, - vocab_size=vocab_size, - use_spec=False) - target_worker = mock_worker(vocab_size=vocab_size, use_spec=False) - spec_decode_sampler = mock_spec_decode_sampler(acceptance_sampler_method) - metrics_collector = MagicMock(spec=AsyncMetricsCollector) - draft_worker.device = 'cuda' - target_worker.device = 'cuda' - - set_random_seed(1) - - worker = SpecDecodeWorker(draft_worker, - target_worker, - spec_decode_sampler, - disable_logprobs=False, - metrics_collector=metrics_collector) - worker.init_device() - - proposal_token_ids = torch.randint(low=0, - high=vocab_size, - size=(batch_size, k), - dtype=torch.int64, - device='cuda') - proposal_probs = torch.rand(batch_size, - k, - vocab_size, - dtype=torch.float32, - device='cuda') - - proposal_lens = torch.ones(batch_size, dtype=torch.int64, - device='cuda') * k - - seq_group_metadata_list, _, _ = create_batch(batch_size, k) - - draft_worker.get_spec_proposals.return_value = SpeculativeProposals( - proposal_token_ids=proposal_token_ids, - proposal_probs=proposal_probs, - proposal_lens=proposal_lens) - - target_token_ids = torch.randint(low=0, - high=vocab_size, - size=(1, batch_size * (k + 1)), - dtype=torch.int64, - device='cuda') - target_token_probs = torch.rand(1, - batch_size * (k + 1), - vocab_size, - dtype=torch.float32, - device='cuda') - target_token_logprobs = torch.rand(1, - batch_size * (k + 1), - vocab_size, - dtype=torch.float32, - device='cuda') - target_output = create_sampler_output_list(target_token_ids, - target_token_probs, - target_token_logprobs) - - target_worker.execute_model.return_value = [target_output[0]] - - exception_secret = 'artificial stop' - - spec_decode_sampler.side_effect = ValueError(exception_secret) - - with pytest.raises(ValueError, match=exception_secret): - worker.execute_model(execute_model_req=ExecuteModelRequest( - seq_group_metadata_list=seq_group_metadata_list, - num_lookahead_slots=k)) - - assert len(spec_decode_sampler.call_args_list) == 1 - _, kwargs = spec_decode_sampler.call_args_list[0] - actual = SimpleNamespace(**kwargs) - - assert torch.equal(actual.bonus_token_ids, - target_token_ids.reshape(batch_size, k + 1)[:, -1:]) - assert torch.equal(actual.target_with_bonus_probs, - target_token_probs.reshape(batch_size, k + 1, -1)) - assert torch.equal(actual.draft_token_ids, proposal_token_ids) - assert torch.equal(actual.draft_probs, proposal_probs) - - -@pytest.mark.parametrize('k', [1, 2, 6]) -@pytest.mark.parametrize('batch_size', [1, 2, 32]) -@pytest.mark.parametrize("acceptance_sampler_method", - ["rejection_sampler", "typical_acceptance_sampler"]) -@torch.inference_mode() -def test_correctly_formats_output(k: int, batch_size: int, - acceptance_sampler_method: str): - """Verify SpecDecodeWorker formats sampler output correctly. - Everything else is mocked out. 
- """ - vocab_size = 32_000 - - draft_worker = mock_worker(cls=MultiStepWorker, - vocab_size=vocab_size, - use_spec=False) - target_worker = mock_worker(vocab_size=vocab_size, use_spec=False) - metrics_collector = MagicMock(spec=AsyncMetricsCollector) - draft_worker.device = 'cuda' - target_worker.device = 'cuda' - - set_random_seed(1) - spec_decode_sampler = mock_spec_decode_sampler(acceptance_sampler_method) - worker = SpecDecodeWorker(draft_worker, - target_worker, - spec_decode_sampler, - disable_logprobs=False, - metrics_collector=metrics_collector) - worker.init_device() - - proposal_token_ids = torch.randint(low=0, - high=vocab_size, - size=(batch_size, k), - dtype=torch.int64, - device='cuda') - proposal_probs = torch.rand(batch_size, - k, - vocab_size, - dtype=torch.float32, - device='cuda') - - proposal_lens = torch.ones(batch_size, dtype=torch.int64, - device='cuda') * k - - seq_group_metadata_list, _, _ = create_batch(batch_size, k) - - draft_worker.get_spec_proposals.return_value = SpeculativeProposals( - proposal_token_ids=proposal_token_ids, - proposal_probs=proposal_probs, - proposal_lens=proposal_lens) - - target_token_ids = torch.randint(low=0, - high=vocab_size, - size=(1, batch_size * (k + 1)), - dtype=torch.int64, - device='cuda') - target_token_probs = torch.rand(1, - batch_size * (k + 1), - vocab_size, - dtype=torch.float32, - device='cuda') - target_token_logprobs = torch.rand(1, - batch_size * (k + 1), - vocab_size, - dtype=torch.float32, - device='cuda') - target_output = create_sampler_output_list(target_token_ids, - target_token_probs, - target_token_logprobs) - - target_worker.execute_model.return_value = [target_output[0]] - - spec_decode_sampler_output = torch.randint(low=0, - high=vocab_size, - size=(batch_size, k + 1), - dtype=torch.int64, - device='cuda') - for i in range(batch_size): - minimum_accepted_tokens = 1 - spec_decode_sampler_output[i][ - -random.randint(minimum_accepted_tokens, k + 1):] = -1 - - spec_decode_sampler.return_value = spec_decode_sampler_output - output = worker.execute_model(execute_model_req=ExecuteModelRequest( - seq_group_metadata_list=seq_group_metadata_list, - num_lookahead_slots=k)) - - expected_output = create_sampler_output_list( - token_ids=spec_decode_sampler_output.transpose(0, 1), - probs=[None for _ in range(k + 1)], - logprobs=[None for _ in range(k + 1)]) - - seq_ids = [ - next(iter(seq_group_metadata.seq_data.keys())) - for seq_group_metadata in seq_group_metadata_list - ] - actual_output_by_seq: dict[int, list[SequenceOutput]] = { - seq_id: [] - for seq_id in seq_ids - } - expected_output_by_seq: dict[int, list[SequenceOutput]] = { - seq_id: [] - for seq_id in seq_ids - } - - for step in output: - for seq_group in step: - for sample in seq_group.samples: - seq_id = sample.parent_seq_id - actual_output_by_seq[seq_id].append(sample) - - for step in expected_output: - for seq_group in step: - for sample in seq_group.samples: - seq_id = sample.parent_seq_id - expected_output_by_seq[seq_id].append(sample) - - all_seen_seq_ids = set( - list(actual_output_by_seq.keys()) + - list(expected_output_by_seq.keys())) - for seq_id in all_seen_seq_ids: - actual_by_step = actual_output_by_seq[seq_id] - expected_by_step = expected_output_by_seq[seq_id] - - for i in range(k + 1): - if i >= len(actual_by_step): - assert expected_by_step[i].output_token == -1 - continue - assert actual_by_step[i].output_token == expected_by_step[ - i].output_token - - -@pytest.mark.parametrize('k', [1, 2]) -@pytest.mark.parametrize('batch_size', [1]) 
-@pytest.mark.parametrize('returns_metrics', [True, False]) -@pytest.mark.parametrize("acceptance_sampler_method", - ["rejection_sampler", "typical_acceptance_sampler"]) -@torch.inference_mode() -def test_collects_metrics(k: int, batch_size: int, returns_metrics: bool, - acceptance_sampler_method: str): - """Verify SpecDecodeWorker collects metrics. - """ - vocab_size = 32_000 - - draft_worker = mock_worker(cls=MultiStepWorker, - vocab_size=vocab_size, - use_spec=False) - target_worker = mock_worker(vocab_size=vocab_size, use_spec=False) - spec_decode_sampler = mock_spec_decode_sampler(acceptance_sampler_method) - metrics_collector = MagicMock(spec=AsyncMetricsCollector) - draft_worker.device = 'cuda' - target_worker.device = 'cuda' - - set_random_seed(1) - - worker = SpecDecodeWorker(draft_worker, - target_worker, - spec_decode_sampler, - disable_logprobs=False, - metrics_collector=metrics_collector) - worker.init_device() - - proposal_token_ids = torch.randint(low=0, - high=vocab_size, - size=(batch_size, k), - dtype=torch.int64, - device='cuda') - proposal_probs = torch.rand(batch_size, - k, - vocab_size, - dtype=torch.float32, - device='cuda') - - proposal_lens = torch.ones(batch_size, dtype=torch.int64, - device='cuda') * k - - seq_group_metadata_list, _, _ = create_batch(batch_size, k) - - draft_worker.get_spec_proposals.return_value = SpeculativeProposals( - proposal_token_ids=proposal_token_ids, - proposal_probs=proposal_probs, - proposal_lens=proposal_lens) - - target_token_ids = torch.randint(low=0, - high=vocab_size, - size=(1, batch_size * (k + 1)), - dtype=torch.int64, - device='cuda') - target_token_probs = torch.rand(1, - batch_size * (k + 1), - vocab_size, - dtype=torch.float32, - device='cuda') - target_token_logprobs = torch.rand(1, - batch_size * (k + 1), - vocab_size, - dtype=torch.float32, - device='cuda') - target_output = create_sampler_output_list(target_token_ids, - target_token_probs, - target_token_logprobs) - - target_worker.execute_model.return_value = [target_output[0]] - - spec_decode_sampler_output = torch.randint(low=0, - high=vocab_size, - size=(batch_size, k + 1), - dtype=torch.int64, - device='cuda') - for i in range(batch_size): - minimum_accepted_tokens = 1 - spec_decode_sampler_output[i][ - -random.randint(minimum_accepted_tokens, k + 1):] = -1 - spec_decode_sampler.return_value = spec_decode_sampler_output - - mock_rejsample_metrics = MagicMock( - spec=SpecDecodeWorkerMetrics) if returns_metrics else None - metrics_collector.maybe_collect_rejsample_metrics.return_value = ( - mock_rejsample_metrics) - - output = worker.execute_model(execute_model_req=ExecuteModelRequest( - seq_group_metadata_list=seq_group_metadata_list, - num_lookahead_slots=k)) - assert output[0].spec_decode_worker_metrics == mock_rejsample_metrics - - call_args_list = ( - metrics_collector.maybe_collect_rejsample_metrics.call_args_list) - assert len(call_args_list) == 1 - args, kwargs = call_args_list[0] - assert args[0] == k or kwargs.get('k', -1) == k - - -@pytest.mark.parametrize('k', [0]) -@pytest.mark.parametrize('batch_size', [1, 2, 32]) -@pytest.mark.parametrize("acceptance_sampler_method", - ["rejection_sampler", "typical_acceptance_sampler"]) -@torch.inference_mode() -def test_k_equals_zero(k: int, batch_size: int, - acceptance_sampler_method: str): - """Verify that the SpecDecodeWorker calls the draft and target workers - when k is zero. This happens during prefill. 
- """ - draft_worker = mock_worker(cls=MultiStepWorker) - target_worker = mock_worker() - metrics_collector = MagicMock(spec=AsyncMetricsCollector) - - sampler_output = MagicMock(spec=SamplerOutput) - sampler_output.hidden_states = None - target_worker.execute_model.return_value = [sampler_output] - - draft_worker.device = 'cuda' - target_worker.device = 'cuda' - - set_random_seed(1) - - worker = SpecDecodeWorker( - proposer_worker=draft_worker, - scorer_worker=target_worker, - spec_decode_sampler=mock_spec_decode_sampler( - acceptance_sampler_method), - disable_logprobs=False, - metrics_collector=metrics_collector, - ) - - seq_group_metadata_list, _, _ = create_batch(batch_size, - k, - prev_output_token_len=0) - execute_model_req = ExecuteModelRequest( - seq_group_metadata_list=seq_group_metadata_list, num_lookahead_slots=k) - - out = worker.execute_model(execute_model_req=execute_model_req) - - assert len(out) == 1, f"expected only one token output when {k=}" - assert out[0].sampled_token_probs is None, ( - "expect gpu tensor references to be None") - assert out[ - 0].sampled_token_ids is None, "expect gpu tensor references to be None" - - draft_worker.execute_model.assert_called_once_with(execute_model_req) - target_worker.execute_model.assert_called_once_with(execute_model_req) - - -@pytest.mark.parametrize('k', [0, 5]) -@pytest.mark.parametrize('batch_size', [0]) -@pytest.mark.parametrize("acceptance_sampler_method", - ["rejection_sampler", "typical_acceptance_sampler"]) -@torch.inference_mode() -def test_empty_input_batch(k: int, batch_size: int, - acceptance_sampler_method: str): - """Verify that the SpecDecodeWorker calls the draft and target workers - when the input batch is empty. This can happen if the engine communicates - to the workers information without scheduling a batch. - """ - draft_worker = mock_worker(cls=MultiStepWorker) - target_worker = mock_worker() - metrics_collector = MagicMock(spec=AsyncMetricsCollector) - - sampler_output = MagicMock(spec=SamplerOutput) - sampler_output.hidden_states = None - target_worker.execute_model.return_value = [sampler_output] - - draft_worker.device = 'cuda' - target_worker.device = 'cuda' - - set_random_seed(1) - - worker = SpecDecodeWorker( - proposer_worker=draft_worker, - scorer_worker=target_worker, - spec_decode_sampler=mock_spec_decode_sampler( - acceptance_sampler_method), - disable_logprobs=False, - metrics_collector=metrics_collector, - ) - - seq_group_metadata_list, _, _ = create_batch(batch_size, - k, - prev_output_token_len=0) - execute_model_req = ExecuteModelRequest( - seq_group_metadata_list=seq_group_metadata_list, num_lookahead_slots=k) - - out = worker.execute_model(execute_model_req=execute_model_req) - - assert len(out) == 1, f"expected only one token output when {k=}" - assert out[0].sampled_token_probs is None, ( - "expect gpu tensor references to be None") - assert out[ - 0].sampled_token_ids is None, "expect gpu tensor references to be None" - - draft_worker.execute_model.assert_called_once_with(execute_model_req) - target_worker.execute_model.assert_called_once_with(execute_model_req) - - -@pytest.mark.parametrize("acceptance_sampler_method", - ["rejection_sampler", "typical_acceptance_sampler"]) -@pytest.mark.skip_global_cleanup -def test_init_device(acceptance_sampler_method: str): - """Verify SpecDecodeWorker invokes proposer/scorer worker init_device, as - well as other GPU initialization. 
- """ - draft_worker = mock_worker(cls=MultiStepWorker, use_spec=False) - target_worker = mock_worker(use_spec=False) - spec_decode_sampler = mock_spec_decode_sampler(acceptance_sampler_method) - metrics_collector = MagicMock(spec=AsyncMetricsCollector) - - worker = SpecDecodeWorker( - proposer_worker=draft_worker, - scorer_worker=target_worker, - spec_decode_sampler=spec_decode_sampler, - disable_logprobs=False, - metrics_collector=metrics_collector, - ) - worker.init_device() - - draft_worker.init_device.assert_called_once() - - target_worker.init_device.assert_called_once() - - metrics_collector.init_tensors.assert_called_once() - spec_decode_sampler.init_tensors.assert_called_once() - - -@pytest.mark.parametrize("acceptance_sampler_method", - ["rejection_sampler", "typical_acceptance_sampler"]) -@torch.inference_mode() -def test_initialize_cache(acceptance_sampler_method): - """Verify SpecDecodeWorker invokes initialize_cache on proposer/scorer - workers. - """ - draft_worker = mock_worker(cls=MultiStepWorker) - target_worker = mock_worker() - metrics_collector = MagicMock(spec=AsyncMetricsCollector) - - worker = SpecDecodeWorker(proposer_worker=draft_worker, - scorer_worker=target_worker, - spec_decode_sampler=mock_spec_decode_sampler( - acceptance_sampler_method), - metrics_collector=metrics_collector) - - kwargs = {"num_gpu_blocks": 1024, "num_cpu_blocks": 1023} - worker.initialize_cache(**kwargs) - - draft_worker.initialize_cache.assert_called_once_with(**kwargs) - target_worker.initialize_cache.assert_called_once_with(**kwargs) - - -@pytest.mark.parametrize('available_gpu_blocks', [1, 1024]) -@pytest.mark.parametrize('available_cpu_blocks', [500]) -@pytest.mark.parametrize('target_cache_block_size_bytes', [2 * 2 * 4096]) -@pytest.mark.parametrize('draft_kv_size_bytes', [0, 2 * 2 * 768, 2 * 2 * 4096]) -@pytest.mark.parametrize("acceptance_sampler_method", - ["rejection_sampler", "typical_acceptance_sampler"]) -@pytest.mark.skip_global_cleanup -def test_determine_num_available_blocks(available_gpu_blocks: int, - available_cpu_blocks: int, - target_cache_block_size_bytes: int, - draft_kv_size_bytes: int, - acceptance_sampler_method: str): - """Verify SpecDecodeWorker correctly profiles num available GPU blocks. - Specifically, it should run profiling in the scorer worker, and then evenly - split the blocks between proposer and scorer worker. 
- """ - draft_worker = mock_worker(cls=MultiStepWorker) - target_worker = mock_worker() - metrics_collector = MagicMock(spec=AsyncMetricsCollector) - - target_worker.determine_num_available_blocks.return_value = ( - available_gpu_blocks, available_cpu_blocks) - target_worker.get_cache_block_size_bytes.return_value = ( - target_cache_block_size_bytes) - draft_worker.get_cache_block_size_bytes.return_value = draft_kv_size_bytes - - worker = SpecDecodeWorker( - draft_worker, target_worker, - mock_spec_decode_sampler(acceptance_sampler_method), metrics_collector) - - num_gpu_blocks, num_cpu_blocks = worker.determine_num_available_blocks() - - target_worker.determine_num_available_blocks.assert_called_once() - assert num_cpu_blocks == available_cpu_blocks - - assert num_gpu_blocks == split_num_cache_blocks_evenly( - target_cache_block_size_bytes, draft_kv_size_bytes, - available_gpu_blocks) - - -@pytest.mark.parametrize('available_gpu_blocks', - list(range(20)) + [1024, 1024**2]) -@pytest.mark.parametrize('target_cache_block_size_bytes', - [2 * 2 * 4096, 2 * 2 * 8192]) -@pytest.mark.parametrize('draft_kv_size_bytes', [0, 2 * 2 * 768, 2 * 2 * 4096]) -@pytest.mark.skip_global_cleanup -def test_split_num_cache_blocks_evenly(available_gpu_blocks: int, - target_cache_block_size_bytes: int, - draft_kv_size_bytes: int): - """Verify split_num_cache_blocks_evenly does not exceed original memory - allocation in bytes. - """ - num_blocks = split_num_cache_blocks_evenly(target_cache_block_size_bytes, - draft_kv_size_bytes, - available_gpu_blocks) - assert (num_blocks * target_cache_block_size_bytes) + ( - num_blocks * draft_kv_size_bytes) <= (available_gpu_blocks * - target_cache_block_size_bytes) - - -@torch.inference_mode() -def test_populate_seq_ids_with_bonus_tokens(): - """ - Verify that a call to _create_output_sampler_list correctly updates - seq_with_bonus_token_in_last_step. - - seq_with_bonus_token_in_last_step is an internal data structure in - SpecDecodeWorker that tracks the sequence IDs which are assigned bonus - tokens by the target model in their last forward pass. This state is - maintained only for models relying on the KV cache, such as those using - the MultiStepWorker. - """ - batch_size = 10 - k = 5 - vocab_size = 10000 - num_sequences_with_bonus_tokens = 5 - target_worker = mock_worker(vocab_size=vocab_size, use_spec=False) - metrics_collector = MagicMock(spec=AsyncMetricsCollector) - target_worker.execute_model.return_value = [MagicMock(spec=SamplerOutput)] - target_worker.device = 'cuda' - - set_random_seed(1) - draft_worker = mock_worker(cls=MultiStepWorker) - draft_worker.device = 'cuda' - # The sequence_ids attached to each sequence in the batch. 
- # The sequence at index i has seq_id assigned_seq_ids[i] - assigned_seq_ids = list(range(batch_size)) - seq_group_metadata_list, _, _ = create_batch(batch_size, - k, - seq_ids=assigned_seq_ids, - prev_output_token_len=10) - target_token_logprobs = torch.rand(batch_size, (k + 1), - vocab_size, - dtype=torch.float32, - device='cuda') - accepted_token_ids = torch.randint(low=0, - high=vocab_size, - size=(batch_size, (k + 1)), - dtype=torch.int64, - device='cuda') - expected_request_id_seq_ids_mapping: dict[str, set[int]] = defaultdict(set) - for seq_group_metadata in seq_group_metadata_list: - for seq_id in seq_group_metadata.seq_data: - expected_request_id_seq_ids_mapping[ - seq_group_metadata.request_id].add(seq_id) - # Generate a random sample of sequence indexes with bonus tokens - seq_indexes_with_bonus_tokens = random.sample( - range(batch_size), num_sequences_with_bonus_tokens) - # Create a mask that is True for indices in seq_indexes_with_bonus_tokens - mask = torch.ones(batch_size, dtype=torch.bool, device='cuda') - mask[seq_indexes_with_bonus_tokens] = False - # Set the last token ID to -1 for all indices not in - # seq_indexes_with_bonus_tokens to indicate the lack of bonus token in - # those indices. - accepted_token_ids[mask, -1:] = -1 - worker = SpecDecodeWorker(draft_worker, - target_worker, - mock_spec_decode_sampler("rejection_sampler"), - disable_logprobs=False, - metrics_collector=metrics_collector) - # Initialize _seq_with_bonus_token_in_last_step with a set of sequence IDs. - # This set includes all sequence IDs in the batch as well as an additional - # `num_extra_sequence_ids` sequence IDs. Note that the sequence IDs are in - # the range [0, batch_size + num_extra_sequence_ids). - num_extra_sequence_ids = 10 - worker._seq_with_bonus_token_in_last_step = set( - range(batch_size + num_extra_sequence_ids)) - worker._create_output_sampler_list( - seq_group_metadata_list=seq_group_metadata_list, - accepted_token_ids=accepted_token_ids, - target_logprobs=target_token_logprobs, - prompt_logprobs=None, - k=k, - stage_times=(0, 0, 0)) - # Verify that _seq_with_bonus_token_in_last_step contains the following: - # 1. Sequence IDs that were already present in - # _seq_with_bonus_token_in_last_step but were not part of the current - # batch are retained. - # 2. Of the sequence IDs present in the current batch, only those with a - # bonus token are retained in _seq_with_bonus_token_in_last_step. - # Sequence IDs that are present in the current batch but do not have - # bonus tokens are removed from _seq_with_bonus_token_in_last_step. - expected_seq_ids_with_bonus_tokens = \ - set([assigned_seq_ids[i] for i in seq_indexes_with_bonus_tokens]) - additional_sequence_ids = \ - set(range(batch_size, batch_size + num_extra_sequence_ids)) - assert worker._seq_with_bonus_token_in_last_step == \ - expected_seq_ids_with_bonus_tokens.union(additional_sequence_ids) - assert worker._request_id_seq_id_mapping == \ - expected_request_id_seq_ids_mapping - - -@torch.inference_mode() -def test_handle_finished_requests(): - """ - Test to verify that finished request IDs are appropriately processed to - update the internal state of the SpecDecodeWorker. - - This test initializes the SpecDecodeWorker with mock data, marks certain - requests as finished, and ensures that the corresponding sequence IDs are - correctly removed from the internal mappings. 
- """ - batch_size = 32 - k = 3 - draft_worker = mock_worker(cls=MultiStepWorker) - target_worker = mock_worker() - metrics_collector = MagicMock(spec=AsyncMetricsCollector) - worker = SpecDecodeWorker(draft_worker, target_worker, - mock_spec_decode_sampler("rejection_sampler"), - metrics_collector) - # Initialize the request_id_seq_id_mapping mapping dict with a few fake - # request ids and corresponding sequence ids. - worker._request_id_seq_id_mapping = \ - {'request-1': {1,2,3}, 'request-2': {4,5,6,7}, - 'request-3': {8,9}, 'request-4': {10,11}} - # Initialize seq_with_bonus_token_in_last_step with a few fake - # sequence ids. - worker._seq_with_bonus_token_in_last_step = {1, 4, 5, 8, 9, 10} - exception_secret = 'artificial stop' - draft_worker.get_spec_proposals.side_effect = ValueError(exception_secret) - - seq_group_metadata_list, _, _ = create_batch(batch_size, k) - # Mark requests with ids request-1 and request-3 as finished. - execute_model_req = ExecuteModelRequest( - seq_group_metadata_list=seq_group_metadata_list, - num_lookahead_slots=k, - finished_requests_ids=['request-1', 'request-3']) - - with pytest.raises(ValueError, match=exception_secret): - worker.execute_model(execute_model_req=execute_model_req) - # Verify that request-1 and request-3 are removed from - # request_id_seq_id_mapping - assert worker._request_id_seq_id_mapping == \ - {'request-2': {4,5,6,7}, 'request-4': {10,11}} - # Verify that all sequence ids corresponding to 'request-1' - # and 'request-3' are removed from seq_with_bonus_token_in_last_step. - assert worker._seq_with_bonus_token_in_last_step == \ - {4,5,10} - - -@pytest.mark.parametrize('k', [3]) -@pytest.mark.parametrize('batch_size', [2, 32]) -@pytest.mark.parametrize("batch_composition", - ["prefill_only", "decode_only", "mixed"]) -@torch.inference_mode() -def test_chunked_prefill_flow(k: int, batch_size: int, batch_composition: str): - """ - Verify SpecDecodeWorker calls match the expected flow. - """ - vocab_size = 32_000 - draft_worker = mock_worker(cls=MultiStepWorker) - target_worker = mock_worker() - metrics_collector = MagicMock(spec=AsyncMetricsCollector) - worker = SpecDecodeWorker(draft_worker, - target_worker, - mock_spec_decode_sampler("rejection_sampler"), - disable_logprobs=False, - metrics_collector=metrics_collector) - exception_secret = 'artificial stop' - worker.scorer = mock_worker(BatchExpansionTop1Scorer) - worker.scorer.score_proposals.side_effect = ValueError(exception_secret) - - # Create batch with combination of terminal/non-terminal prefill chunks - # and decodes (different seq_ids). - decodes, _, _ = create_batch(batch_size, k) - # Pre-chunking here, get 'batch_size' chunks. - prefill, _, _ = create_batch(batch_size, - k, - prefill_chunk_size=4, - seq_ids=list(range(batch_size, - batch_size * 2))) - - if batch_composition == "prefill_only": - n_prefills = batch_size - elif batch_composition == "decode_only": - n_prefills = 0 - else: - n_prefills = random.randint(1, batch_size - 1) - n_decodes = batch_size - n_prefills - - prefill = random.sample(prefill, n_prefills) - decodes = random.sample(decodes, n_decodes) - target_group_metadata_list = prefill + decodes - execute_model_req = ExecuteModelRequest( - seq_group_metadata_list=target_group_metadata_list, - # For prefill only batches we expect num_lookahead_slots = 0. 
- num_lookahead_slots=k if n_decodes > 0 else 0) - - target_token_ids = torch.randint(low=0, - high=vocab_size, - size=(1, batch_size * (k + 1)), - dtype=torch.int64, - device='cuda') - target_token_probs = torch.rand(1, - batch_size * (k + 1), - vocab_size, - dtype=torch.float32, - device='cuda') - target_token_logprobs = torch.rand(1, - batch_size * (k + 1), - vocab_size, - dtype=torch.float32, - device='cuda') - target_output = create_sampler_output_list(target_token_ids, - target_token_probs, - target_token_logprobs) - - target_worker.execute_model.return_value = [target_output[0]] - - if not len(decodes): - worker.execute_model(execute_model_req=execute_model_req) - # no spec run (prefill only) - draft_worker.execute_model.assert_called_once_with(execute_model_req) - target_worker.execute_model.assert_called_once_with(execute_model_req) - else: - # Decode-only run OR mixed batch, scorer call fails (it's mocked) - with pytest.raises(ValueError, match=exception_secret): - worker.execute_model(execute_model_req=execute_model_req) - # but first draft still counted - assert draft_worker.get_spec_proposals.call_count == 1 - - -def test_correctly_load_weight_for_eagle(): - """ - Verify SpecDecodeWorker loads lm_head weight for eagle correctly. - """ - seed = 100 - block_size = 32 - num_gpu_blocks = 8096 // block_size - target_worker = create_worker( - Worker, - "JackFram/llama-68m", - block_size, - num_gpu_blocks, - seed, - ) - draft_worker = create_worker( - MultiStepWorker, - "abhigoyal/vllm-eagle-llama-68m-random", - block_size, - num_gpu_blocks, - seed, - model_runner_cls=TP1DraftModelRunner, - ) - - spec_decode_sampler = mock_spec_decode_sampler("rejection_sampler") - worker = SpecDecodeWorker(draft_worker, - target_worker, - spec_decode_sampler, - disable_logprobs=False) - worker.proposer_worker.maybe_load_lm_head_weight( - target_worker.model_runner.model.lm_head.weight.data) - assert torch.allclose( - worker.proposer_worker.worker.model_runner.model.lm_head.weight.data, - worker.scorer_worker.model_runner.model.lm_head.weight.data) diff --git a/tests/spec_decode/test_utils.py b/tests/spec_decode/test_utils.py deleted file mode 100644 index 9cfc618b9d950..0000000000000 --- a/tests/spec_decode/test_utils.py +++ /dev/null @@ -1,150 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -from unittest.mock import MagicMock - -import pytest -import torch - -from vllm.model_executor.layers.rejection_sampler import RejectionSampler -from vllm.model_executor.layers.sampler import _get_ranks -from vllm.model_executor.layers.typical_acceptance_sampler import ( - TypicalAcceptanceSampler) -from vllm.sequence import SequenceGroupMetadata, get_all_seq_ids -from vllm.spec_decode.util import (get_sampled_token_logprobs, - split_batch_by_proposal_len) - - -def test_get_all_seq_ids(): - """Verify get_all_seq_ids extracts all seq ids. 
- """ - expected_seq_ids = list(range(10)) + list(range(100, 110)) - - seq_group_metadata_list = [ - SequenceGroupMetadata( - request_id=str(seq_id), - is_prompt=True, - seq_data={ - seq_id: MagicMock(), - }, - sampling_params=MagicMock(), - block_tables={ - seq_id: MagicMock(), - }, - lora_request=None, - ) for seq_id in expected_seq_ids - ] - - actual_seq_ids = get_all_seq_ids(seq_group_metadata_list) - assert actual_seq_ids == expected_seq_ids - - -@pytest.fixture -def fake_sequence_group_metadata(): - seq_ids = list(range(3)) - return [ - SequenceGroupMetadata( - request_id=str(i), - is_prompt=True, - seq_data={ - i: MagicMock(), - }, - sampling_params=MagicMock(), - block_tables={ - i: MagicMock(), - }, - lora_request=None, - ) for i in seq_ids - ] - - -def test_filter_zero_length_proposals(fake_sequence_group_metadata): - proposal_lens = [0, 1, 0] - _, (filtered_groups, - indices) = split_batch_by_proposal_len(fake_sequence_group_metadata, - proposal_lens) - - expected_groups = [ - fake_sequence_group_metadata[0], fake_sequence_group_metadata[2] - ] - expected_indices = [0, 2] - - assert filtered_groups == expected_groups - assert indices == expected_indices - - -def test_filter_non_zero_length_proposals(fake_sequence_group_metadata): - proposal_lens = [0, 1, 2] - (filtered_groups, - indices), _ = split_batch_by_proposal_len(fake_sequence_group_metadata, - proposal_lens) - - expected_groups = [ - fake_sequence_group_metadata[1], fake_sequence_group_metadata[2] - ] - expected_indices = [1, 2] - - assert filtered_groups == expected_groups - assert indices == expected_indices - - -def test_empty_inputs(): - _, (filtered_groups, indices) = split_batch_by_proposal_len([], []) - - assert filtered_groups == [] - assert indices == [] - - -def test_all_zero_with_non_zero_filter(fake_sequence_group_metadata): - proposal_lens = [0, 0, 0] - (filtered_groups, - indices), _ = split_batch_by_proposal_len(fake_sequence_group_metadata, - proposal_lens) - - assert filtered_groups == [] - assert indices == [] - - -def test_all_non_zero_with_zero_filter(fake_sequence_group_metadata): - proposal_lens = [1, 1, 1] - _, (filtered_groups, - indices) = split_batch_by_proposal_len(fake_sequence_group_metadata, - proposal_lens) - - assert filtered_groups == [] - assert indices == [] - - -def mock_spec_decode_sampler(acceptance_sampler_method): - """ - Returns either a RejectionSampler or TypicalAcceptanceSampler - object depending on whether acceptance_sampler_method is - 'rejection_sampler' or 'typical_acceptance_sampler' respectively. - """ - if acceptance_sampler_method == "rejection_sampler": - sampler = MagicMock(spec=RejectionSampler) - sampler.token_id_dtype = torch.int64 - return sampler - elif acceptance_sampler_method == "typical_acceptance_sampler": - sampler = MagicMock(spec=TypicalAcceptanceSampler) - sampler.token_id_dtype = torch.int64 - return sampler - else: - raise ValueError(f"Invalid sampler name {acceptance_sampler_method}") - - -def test_get_sampled_token_logprobs(): - """Verify get_sampled_token_logprobs returns consistent rankings - with regular get_ranks when probabilities match exactly. 
- """ - logprob_tensor = torch.tensor( - [[[-.1, -.1]] * 2]) # shape (num_steps, batch_size, vocab_size) - sampled_token_tensor = torch.tensor([[1, - 0]]) # shape (num_steps, batch_size) - ranks_spec_dec, _ = get_sampled_token_logprobs(logprob_tensor, - sampled_token_tensor) - - ranks_regular = _get_ranks(logprob_tensor.reshape((2, -1)), - sampled_token_tensor.reshape(-1)) - - assert torch.equal(ranks_spec_dec.reshape(-1), ranks_regular) diff --git a/tests/spec_decode/utils.py b/tests/spec_decode/utils.py deleted file mode 100644 index 1733f66feec07..0000000000000 --- a/tests/spec_decode/utils.py +++ /dev/null @@ -1,290 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -from collections.abc import Sequence as GenericSequence -from itertools import count -from typing import Callable, Optional, TypeVar, Union -from unittest.mock import MagicMock - -import torch - -from vllm.engine.arg_utils import EngineArgs -from vllm.model_executor.layers.sampler import SamplerOutput -from vllm.model_executor.utils import set_random_seed -from vllm.sampling_params import SamplingParams -from vllm.sequence import (CompletionSequenceGroupOutput, Logprob, - SequenceData, SequenceGroupMetadata, SequenceOutput) -from vllm.utils import get_distributed_init_method, get_ip, get_open_port -from vllm.worker.cache_engine import CacheEngine -from vllm.worker.model_runner import ModelRunner -from vllm.worker.worker import Worker - -T = TypeVar("T", bound=Worker) - - -def round_up_to_next_block(seq_len: int, block_size: int) -> int: - return (seq_len + block_size - 1) // block_size - - -def mock_worker(cls=None, - vocab_size: int = 30_000, - max_model_len: int = 2048, - rank: int = 0, - use_spec: bool = True) -> MagicMock: - if cls is None: - cls = Worker - - spec = cls if use_spec else None - - worker = MagicMock(spec=spec) - worker.vocab_size = vocab_size - worker.max_model_len = max_model_len - worker.rank = rank - worker.device = 'cuda:0' - return worker - - -def patch_execute_model_with_seeds(worker: Worker, rand_seeds: list[int]): - seed_iter = iter(rand_seeds) - original_execute_model = worker.execute_model - - def new_execute_model(*args, **kwargs): - result = original_execute_model(*args, **kwargs) - set_random_seed(next(seed_iter)) - return result - - return new_execute_model - - -def zero_kv_cache(cache_engine: list[CacheEngine]): - assert cache_engine[0].gpu_cache - for key_blocks, value_blocks in cache_engine[0].gpu_cache: - key_blocks.zero_() - value_blocks.zero_() - - -def create_worker(cls: Callable[..., T], - model_name: str, - block_size: int, - num_gpu_blocks: int, - seed: int, - is_driver_worker: bool = True, - enforce_eager: bool = True, - model_runner_cls: Optional[ModelRunner] = None, - dtype: Optional[str] = "auto") -> T: - engine_args = EngineArgs( - model=model_name, - seed=seed, - block_size=block_size, - enforce_eager=enforce_eager, - dtype=dtype, - ) - engine_config = engine_args.create_engine_config() - - distributed_init_method = get_distributed_init_method( - get_ip(), get_open_port()) - - worker = cls( - vllm_config=engine_config, - local_rank=0, - rank=0, - distributed_init_method=distributed_init_method, - is_driver_worker=is_driver_worker, - model_runner_cls=model_runner_cls, - ) - - worker.init_device() - worker.load_model() - - engine_config.cache_config.num_gpu_blocks = num_gpu_blocks - engine_config.cache_config.num_cpu_blocks = 0 - worker.initialize_cache( - 
num_gpu_blocks=engine_config.cache_config.num_gpu_blocks, - num_cpu_blocks=engine_config.cache_config.num_cpu_blocks) - - return worker - - -def create_seq_group_metadata_from_prompts( - prompts: list[list[int]], - num_gpu_blocks: int, - block_size: int, - final_prompt_lens: list[int], - continuations: Optional[list[list[int]]] = None, - seq_ids: Optional[list[int]] = None, -) -> list[SequenceGroupMetadata]: - - if continuations is None: - continuations = [[] for _ in prompts] - - if seq_ids is None: - seq_ids = list(i for i, _ in enumerate(prompts)) - - free_gpu_blocks = list(range(num_gpu_blocks)) - - block_allocations = { - i: [ - free_gpu_blocks.pop() - for _ in range(round_up_to_next_block(final_len, block_size)) - ] - for i, final_len in enumerate(final_prompt_lens) - } - - seq_grou_metadata_list = [] - for i, (prompt_token_ids, - cont_token_ids) in enumerate(zip(prompts, continuations)): - data = SequenceData.from_seqs(prompt_token_ids, cont_token_ids) - data.update_num_computed_tokens( - len(prompt_token_ids) + len(cont_token_ids) - 1) - seq_data = {i: data} - seq_grou_metadata_list.append( - SequenceGroupMetadata( - request_id=str(i), - is_prompt=len(cont_token_ids) == 0, - seq_data=seq_data, - sampling_params=SamplingParams(temperature=0.0), - block_tables={i: block_allocations[i][:]}, - )) - return seq_grou_metadata_list - - -def create_chunked_seq_group_metadata_from_prompt( - prompt: list[int], - num_gpu_blocks: int, - chunk_size: int, - block_size: int, - seq_id: Optional[int] = None) -> list[SequenceGroupMetadata]: - - if seq_id is None: - seq_id = 0 - - free_gpu_blocks = list(range(num_gpu_blocks)) - - block_allocations = [ - free_gpu_blocks.pop() - for _ in range(round_up_to_next_block(len(prompt), block_size)) - ] - - seq_group_metadata_list = [] - for i, idx in enumerate(range(0, len(prompt), chunk_size)): - chunk_ids = prompt[idx:idx + chunk_size] - data = SequenceData.from_seqs(prompt) - data.update_num_computed_tokens(idx) - seq_data = {i: data} - seq_group_metadata_list.append( - SequenceGroupMetadata( - request_id=str(seq_id), - is_prompt=True, - do_sample=idx + chunk_size >= len(prompt), # terminal chunk - seq_data=seq_data, - sampling_params=SamplingParams(temperature=0.0), - block_tables={i: block_allocations}, - token_chunk_size=len(chunk_ids))) - return seq_group_metadata_list - - -def assert_logprobs_dict_allclose( - actual_logprobs: list[dict[int, Logprob]], - expected_logprobs: list[dict[int, Logprob]]) -> None: - for single_step_actual_logprobs, single_step_expected_logprobs in zip( - actual_logprobs, expected_logprobs): - assert set(single_step_actual_logprobs.keys()) == set( - single_step_expected_logprobs.keys()) - for token_id in single_step_actual_logprobs: - actual = torch.tensor( - single_step_actual_logprobs[token_id].logprob) - expected = torch.tensor( - single_step_expected_logprobs[token_id].logprob) - torch.testing.assert_close(actual, expected) - - -def create_sampler_output_list( - token_ids: torch.Tensor, - probs: GenericSequence[Optional[torch.Tensor]], - logprobs: GenericSequence[Optional[torch.Tensor]], - seq_ids: Optional[list[int]] = None) -> list[SamplerOutput]: - num_steps, batch_size = token_ids.shape - token_ids_by_step = token_ids.tolist() - - if seq_ids is None: - seq_ids = list(range(batch_size)) - - return [ - SamplerOutput(outputs=[ - CompletionSequenceGroupOutput( - samples=[ - SequenceOutput( - output_token=token_id, - parent_seq_id=seq_ids[seq_index], - logprobs={token_id: Logprob(0)}, - ) - ], - prompt_logprobs=None, - ) for 
seq_index, token_id in enumerate(token_ids_by_step[step]) - ], - sampled_token_probs=probs[step], - logprobs=logprobs[step], - sampled_token_ids=token_ids[step]) - for step in range(num_steps) - ] - - -def create_batch(batch_size, - k, - prompt_len: Union[int, list[int]] = 10, - prev_output_token_len: int = 10, - seq_ids: Optional[list[int]] = None, - num_gpu_blocks: Optional[int] = None, - block_size: Optional[int] = None, - prefill_chunk_size: Optional[int] = None): - if block_size is None: - block_size = 8 - - if num_gpu_blocks is None: - num_gpu_blocks = 2048 // block_size - - iterator = count() - - if isinstance(prompt_len, int): - prompt_lens = [prompt_len for _ in range(batch_size)] - else: - prompt_lens = prompt_len - - prompts = [[next(iterator) for _ in range(p_len)] for p_len in prompt_lens] - - if prefill_chunk_size: - # Create a batch of chunked prompts. - if not seq_ids: - seq_ids = list(range(len(prompts))) - seq_group_metadata_list = [] - for p, sid in zip(prompts, seq_ids): - seq_group_metadata_list += \ - create_chunked_seq_group_metadata_from_prompt( - p, num_gpu_blocks, prefill_chunk_size, block_size, sid) - seq_group_metadata_list = seq_group_metadata_list[:batch_size] - prev_output_tokens = [] - else: - prev_output_tokens = [[ - next(iterator) for _ in range(prev_output_token_len) - ] for _ in range(batch_size)] - final_prompt_lens = [ - len(prompt) + len(prev_output_token) + k + 1 - for prompt, prev_output_token in zip(prompts, prev_output_tokens) - ] - - seq_group_metadata_list = create_seq_group_metadata_from_prompts( - prompts, num_gpu_blocks, block_size, final_prompt_lens, - prev_output_tokens, seq_ids) - return seq_group_metadata_list, prompts, prev_output_tokens - - -def maybe_enable_chunked_prefill(prefill_chunk_size, llm_kwargs): - if prefill_chunk_size > 0: - llm_kwargs.update( - **{ - "enable_chunked_prefill": True, - "max_num_batched_tokens": prefill_chunk_size, - "max_num_seqs": prefill_chunk_size - }) - else: - llm_kwargs["enable_chunked_prefill"] = False diff --git a/tests/test_sequence.py b/tests/test_sequence.py index a782a3bf7716b..c734c8514a6da 100644 --- a/tests/test_sequence.py +++ b/tests/test_sequence.py @@ -29,7 +29,6 @@ def test_sampler_output_initialization(sampler_output, sample_outputs): assert len(sampler_output) == len(sample_outputs) assert sampler_output.sampled_token_probs is None assert sampler_output.sampled_token_ids is None - assert sampler_output.spec_decode_worker_metrics is None def test_sampler_output_getitem(sampler_output, sample_outputs): diff --git a/tests/v1/test_oracle.py b/tests/v1/test_oracle.py index 7a7ba346a7197..39515d710e81e 100644 --- a/tests/v1/test_oracle.py +++ b/tests/v1/test_oracle.py @@ -40,12 +40,6 @@ def test_unsupported_configs(monkeypatch): with monkeypatch.context() as m: m.setenv("VLLM_USE_V1", "1") - with pytest.raises(NotImplementedError): - AsyncEngineArgs( - model=MODEL, - kv_cache_dtype="fp8", - ).create_engine_config() - with pytest.raises(NotImplementedError): AsyncEngineArgs( model=MODEL, diff --git a/tools/mypy.sh b/tools/mypy.sh index 77d342da1ec82..af4c61233abd0 100755 --- a/tools/mypy.sh +++ b/tools/mypy.sh @@ -32,6 +32,5 @@ run_mypy vllm/lora run_mypy vllm/model_executor run_mypy vllm/plugins run_mypy vllm/prompt_adapter -run_mypy vllm/spec_decode run_mypy vllm/worker run_mypy vllm/v1 diff --git a/vllm/config.py b/vllm/config.py index 7ae9b1b7fd02c..8383a663c75ea 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -2536,8 +2536,6 @@ class DeviceConfig: SpeculativeMethod = Literal["ngram", 
"eagle", "eagle3", "medusa", "mlp_speculator", "draft_model", "deepseek_mtp"] -SpeculativeAcceptanceMethod = Literal["rejection_sampler", - "typical_acceptance_sampler"] @config @@ -2560,13 +2558,6 @@ class SpeculativeConfig: If using `ngram` method, the related configuration `prompt_lookup_max` and `prompt_lookup_min` should be considered.""" - acceptance_method: SpeculativeAcceptanceMethod = "rejection_sampler" - """The method to use for accepting draft tokens:\n - - "rejection_sampler" maps to `RejectionSampler`.\n - - "typical_acceptance_sampler" maps to `TypicalAcceptanceSampler`. - - If using `typical_acceptance_sampler`, the related configuration - `posterior_threshold` and `posterior_alpha` should be considered.""" draft_tensor_parallel_size: Optional[int] = None """The degree of the tensor parallelism for the draft model. Can only be 1 or the same as the target model's tensor parallel size.""" @@ -2593,9 +2584,6 @@ class SpeculativeConfig: will use the default version.""" # Advanced control - disable_mqa_scorer: bool = False - """Disable the MQA scorer and fall back to batch expansion for scoring - proposals.""" disable_by_batch_size: Optional[int] = None """Disable speculative decoding for new incoming requests when the number of enqueued requests is larger than this value, if provided.""" @@ -2608,16 +2596,6 @@ class SpeculativeConfig: """Minimum size of ngram token window when using Ngram proposer, if provided. Defaults to 1.""" - # Typical acceptance sampler configuration - posterior_threshold: Optional[float] = None - """A threshold value that sets a lower bound on the posterior probability - of a token in the target model for it to be accepted. This threshold is - used only when we use the `TypicalAcceptanceSampler` for token acceptance. - """ - posterior_alpha: Optional[float] = None - """Scaling factor for entropy-based threshold, applied when using - `TypicalAcceptanceSampler`.""" - speculative_token_tree: Optional[str] = None """Specifies the tree structure for speculative token generation. """ @@ -2795,8 +2773,8 @@ class SpeculativeConfig: elif (self.draft_model_config.hf_config.model_type == "mlp_speculator"): self.method = "mlp_speculator" - elif (self.draft_model_config.hf_config.model_type == - "deepseek_mtp"): + elif (self.draft_model_config.hf_config.model_type + in ("deepseek_mtp", "mimo_mtp")): self.method = "deepseek_mtp" if self.num_speculative_tokens > 1: logger.warning( @@ -2806,6 +2784,11 @@ class SpeculativeConfig: ) else: self.method = "draft_model" + raise NotImplementedError( + "Speculative decoding with draft model is not " + "supported yet. Please consider using other " + "speculative decoding methods such as ngram, medusa, " + "eagle, or deepseek_mtp.") # Replace hf_config for EAGLE draft_model if self.method in ("eagle", "eagle3"): @@ -2864,12 +2847,6 @@ class SpeculativeConfig: self.target_parallel_config, self.draft_tensor_parallel_size)) - if self.acceptance_method == "typical_acceptance_sampler": - if self.posterior_threshold is None: - self.posterior_threshold = 0.09 - if self.posterior_alpha is None: - self.posterior_alpha = 0.3 - @staticmethod def _maybe_override_draft_max_model_len( speculative_max_model_len: Optional[int], @@ -2975,30 +2952,6 @@ class SpeculativeConfig: if self.draft_model_config: self.draft_model_config.verify_with_parallel_config( self.draft_parallel_config) - # Validate and set draft token acceptance related settings. - - if self.acceptance_method is None: - raise ValueError("acceptance_method is not set. 
" - "Expected values are rejection_sampler or " - "typical_acceptance_sampler.") - - if (self.acceptance_method != 'rejection_sampler' - and self.acceptance_method != 'typical_acceptance_sampler'): - raise ValueError( - "Expected acceptance_method to be either " - "rejection_sampler or typical_acceptance_sampler. Instead it " - f"is {self.acceptance_method}") - - if self.acceptance_method == "typical_acceptance_sampler" and ( - (self.posterior_threshold is not None - and self.posterior_threshold < 0) or - (self.posterior_alpha is not None and self.posterior_alpha < 0)): - raise ValueError( - "Expected the posterior_threshold and posterior_alpha of " - "typical_acceptance_sampler to be > 0. " - "Instead found posterior_threshold = " - f"{self.posterior_threshold} and posterior_alpha = " - f"{self.posterior_alpha}") if (self.disable_by_batch_size is not None and self.disable_by_batch_size < 2): diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index b20defde73ed0..a7fcf6c354e82 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -1417,28 +1417,12 @@ class EngineArgs: return False # V1 supports N-gram, Medusa, and Eagle speculative decoding. - is_ngram_enabled = False - is_eagle_enabled = False - is_medusa_enabled = False - if self.speculative_config is not None: - # This is supported but experimental (handled below). - speculative_method = self.speculative_config.get("method") - if speculative_method: - if speculative_method in ("ngram", "[ngram]"): - is_ngram_enabled = True - elif speculative_method == "medusa": - is_medusa_enabled = True - elif speculative_method in ("eagle", "eagle3", "deepseek_mtp"): - is_eagle_enabled = True - else: - speculative_model = self.speculative_config.get("model") - if speculative_model in ("ngram", "[ngram]"): - is_ngram_enabled = True - if not (is_ngram_enabled or is_eagle_enabled or is_medusa_enabled): - # Other speculative decoding methods are not supported yet. - _raise_or_fallback(feature_name="Speculative Decoding", - recommend_to_remove=False) - return False + if (self.speculative_config is not None + and self.speculative_config.get("method") == "draft_model"): + raise NotImplementedError( + "Speculative decoding with draft model is not supported yet. " + "Please consider using other speculative decoding methods " + "such as ngram, medusa, eagle, or deepseek_mtp.") # No XFormers so far. V1_BACKENDS = [ diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index 25fa1c3058bef..e2f8de1990b5f 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -1780,13 +1780,6 @@ class LLMEngine: num_generation_tokens_from_prefill_groups) num_tokens_iter = (num_generation_tokens_iter + num_prompt_tokens_iter) - # Spec decode, if enabled, emits specialized metrics from the worker in - # sampler output. 
- if model_output and isinstance(model_output[0], SamplerOutput) and ( - model_output[0].spec_decode_worker_metrics is not None): - spec_decode_metrics = model_output[0].spec_decode_worker_metrics - else: - spec_decode_metrics = None return Stats( now=now, @@ -1808,7 +1801,6 @@ class LLMEngine: num_tokens_iter=num_tokens_iter, time_to_first_tokens_iter=time_to_first_tokens_iter, time_per_output_tokens_iter=time_per_output_tokens_iter, - spec_decode_metrics=spec_decode_metrics, num_preemption_iter=num_preemption_iter, # Request stats diff --git a/vllm/engine/metrics.py b/vllm/engine/metrics.py index 8d51f0472351b..ba8dbd1fad791 100644 --- a/vllm/engine/metrics.py +++ b/vllm/engine/metrics.py @@ -2,7 +2,6 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import time -from typing import TYPE_CHECKING from typing import Counter as CollectionsCounter from typing import Dict, List, Optional, Type, Union, cast @@ -19,9 +18,6 @@ if ray is not None: else: ray_metrics = None -if TYPE_CHECKING: - from vllm.spec_decode.metrics import SpecDecodeWorkerMetrics - logger = init_logger(__name__) prometheus_client.disable_created_metrics() @@ -199,30 +195,6 @@ class Metrics: documentation="Count of successfully processed requests.", labelnames=labelnames + [Metrics.labelname_finish_reason]) - # Speculative decoding stats - self.gauge_spec_decode_draft_acceptance_rate = self._gauge_cls( - name="vllm:spec_decode_draft_acceptance_rate", - documentation="Speulative token acceptance rate.", - labelnames=labelnames, - multiprocess_mode="sum") - self.gauge_spec_decode_efficiency = self._gauge_cls( - name="vllm:spec_decode_efficiency", - documentation="Speculative decoding system efficiency.", - labelnames=labelnames, - multiprocess_mode="sum") - self.counter_spec_decode_num_accepted_tokens = (self._counter_cls( - name="vllm:spec_decode_num_accepted_tokens_total", - documentation="Number of accepted tokens.", - labelnames=labelnames)) - self.counter_spec_decode_num_draft_tokens = self._counter_cls( - name="vllm:spec_decode_num_draft_tokens_total", - documentation="Number of draft tokens.", - labelnames=labelnames) - self.counter_spec_decode_num_emitted_tokens = (self._counter_cls( - name="vllm:spec_decode_num_emitted_tokens_total", - documentation="Number of emitted tokens.", - labelnames=labelnames)) - # --8<-- [end:metrics-definitions] @@ -391,9 +363,6 @@ class LoggingStatLogger(StatLoggerBase): self.num_prompt_tokens.append(stats.num_prompt_tokens_iter) self.num_generation_tokens.append(stats.num_generation_tokens_iter) - # Update spec decode metrics - self.maybe_update_spec_decode_metrics(stats) - # Log locally every local_interval seconds. 
if local_interval_elapsed(stats.now, self.last_local_log, self.local_interval): @@ -435,10 +404,6 @@ class LoggingStatLogger(StatLoggerBase): stats.gpu_prefix_cache_hit_rate * 100, stats.cpu_prefix_cache_hit_rate * 100, ) - if self.spec_decode_metrics is not None: - log_fn( - self._format_spec_decode_metrics_str( - self.spec_decode_metrics)) self._reset(stats, prompt_throughput, generation_throughput) @@ -447,21 +412,9 @@ class LoggingStatLogger(StatLoggerBase): self.num_prompt_tokens = [] self.num_generation_tokens = [] self.last_local_log = stats.now - self.spec_decode_metrics = None self.last_prompt_throughput = prompt_throughput self.last_generation_throughput = generation_throughput - def _format_spec_decode_metrics_str( - self, metrics: "SpecDecodeWorkerMetrics") -> str: - - return ("Speculative metrics: " - f"Draft acceptance rate: {metrics.draft_acceptance_rate:.3f}, " - f"System efficiency: {metrics.system_efficiency:.3f}, " - f"Number of speculative tokens: {metrics.num_spec_tokens}, " - f"Number of accepted tokens: {metrics.accepted_tokens}, " - f"Number of draft tokens: {metrics.draft_tokens}, " - f"Number of emitted tokens: {metrics.emitted_tokens}.") - def info(self, type: str, obj: SupportsMetricsInfo) -> None: raise NotImplementedError @@ -579,33 +532,14 @@ class PrometheusStatLogger(StatLoggerBase): self.num_prompt_tokens.append(stats.num_prompt_tokens_iter) self.num_generation_tokens.append(stats.num_generation_tokens_iter) - # Update spec decode metrics - self.maybe_update_spec_decode_metrics(stats) - # Log locally every local_interval seconds. if local_interval_elapsed(stats.now, self.last_local_log, self.local_interval): - if self.spec_decode_metrics is not None: - self._log_gauge( - self.metrics.gauge_spec_decode_draft_acceptance_rate, - self.spec_decode_metrics.draft_acceptance_rate) - self._log_gauge(self.metrics.gauge_spec_decode_efficiency, - self.spec_decode_metrics.system_efficiency) - self._log_counter( - self.metrics.counter_spec_decode_num_accepted_tokens, - self.spec_decode_metrics.accepted_tokens) - self._log_counter( - self.metrics.counter_spec_decode_num_draft_tokens, - self.spec_decode_metrics.draft_tokens) - self._log_counter( - self.metrics.counter_spec_decode_num_emitted_tokens, - self.spec_decode_metrics.emitted_tokens) # Reset tracked stats for next interval. self.num_prompt_tokens = [] self.num_generation_tokens = [] self.last_local_log = stats.now - self.spec_decode_metrics = None def info(self, type: str, obj: SupportsMetricsInfo) -> None: # Info type metrics are syntactic sugar for a gauge permanently set to 1 diff --git a/vllm/engine/metrics_types.py b/vllm/engine/metrics_types.py index 9375dc4c495ba..3281a9121a9df 100644 --- a/vllm/engine/metrics_types.py +++ b/vllm/engine/metrics_types.py @@ -16,10 +16,9 @@ do this in Python code and lazily import prometheus_client. 
import time from abc import ABC, abstractmethod from dataclasses import dataclass -from typing import List, Optional +from typing import List from vllm.config import SupportsMetricsInfo, VllmConfig -from vllm.spec_decode.metrics import SpecDecodeWorkerMetrics @dataclass @@ -65,8 +64,6 @@ class Stats: running_lora_adapters: List[str] max_lora: str - spec_decode_metrics: Optional["SpecDecodeWorkerMetrics"] = None - class StatLoggerBase(ABC): """Base class for StatLogger.""" @@ -77,7 +74,6 @@ class StatLoggerBase(ABC): self.num_generation_tokens: List[int] = [] self.last_local_log = time.time() self.local_interval = local_interval - self.spec_decode_metrics: Optional[SpecDecodeWorkerMetrics] = None @abstractmethod def log(self, stats: Stats) -> None: @@ -86,9 +82,3 @@ class StatLoggerBase(ABC): @abstractmethod def info(self, type: str, obj: SupportsMetricsInfo) -> None: raise NotImplementedError - - def maybe_update_spec_decode_metrics(self, stats: Stats): - """Save spec decode metrics (since they are unlikely - to be emitted at same time as log interval).""" - if stats.spec_decode_metrics is not None: - self.spec_decode_metrics = stats.spec_decode_metrics diff --git a/vllm/engine/output_processor/multi_step.py b/vllm/engine/output_processor/multi_step.py index e0fa6a00ecfa4..8b66ef0dc7658 100644 --- a/vllm/engine/output_processor/multi_step.py +++ b/vllm/engine/output_processor/multi_step.py @@ -104,11 +104,6 @@ class MultiStepOutputProcessor(SequenceGroupOutputProcessor): seqs = sequence_group.get_seqs( status=SequenceStatus.FINISHED_ABORTED) - for output in outputs: - if output.samples[0].output_token != VLLM_INVALID_TOKEN_ID: - sequence_group.metrics.spec_token_acceptance_counts[ - output.step_index] += 1 - assert seqs, "Expected RUNNING or FINISHED_ABORTED sequences" assert len(seqs) == 1, ( "Beam search not supported in multi-step decoding.") diff --git a/vllm/model_executor/layers/rejection_sampler.py b/vllm/model_executor/layers/rejection_sampler.py deleted file mode 100644 index db68f18726d38..0000000000000 --- a/vllm/model_executor/layers/rejection_sampler.py +++ /dev/null @@ -1,406 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -from functools import cached_property -from importlib.util import find_spec -from typing import Optional - -import torch -import torch.jit - -import vllm.envs as envs -from vllm.logger import init_logger -from vllm.model_executor.layers.spec_decode_base_sampler import ( - SpecDecodeStochasticBaseSampler) -from vllm.platforms import current_platform - -logger = init_logger(__name__) - -if find_spec("flashinfer"): - """ - Consider utilizing the FlashInfer rejection sampling kernel initially, - as it employs a dedicated kernel rather than relying on - Torch tensor operations. This design choice helps to fuse operations, - reduce memory I/O, and consequently enhances performance. - """ - from flashinfer.sampling import chain_speculative_sampling -else: - chain_speculative_sampling = None - - -class RejectionSampler(SpecDecodeStochasticBaseSampler): - """Apply modified rejection sampling as described in "Accelerating Large - Language Model Decoding with Speculative Sampling" - https://arxiv.org/pdf/2302.01318.pdf. - """ - - def __init__(self, - strict_mode: bool = False, - use_flashinfer: Optional[bool] = None): - """Create a rejection sampler. - - Args: - strict_mode: Whether or not to perform shape/device/dtype checks - during sampling. 
This catches correctness issues but adds - nontrivial latency. - use_flashinfer: We will use this parameter to determine whether - to use the FlashInfer rejection sampling kernel or not. If it's - None, we will use the default value from the environment variable. - This parameter is only used for testing purposes. - """ - super().__init__(strict_mode=strict_mode) - if use_flashinfer is None: - self.use_flashinfer = envs.VLLM_USE_FLASHINFER_SAMPLER and ( - chain_speculative_sampling is not None) - else: - self.use_flashinfer = use_flashinfer - - if self.use_flashinfer: - logger.info("Use flashinfer for rejection sampling.") - else: - logger.info("Use pytorch for rejection sampling.") - - def forward( - self, - target_with_bonus_probs: torch.Tensor, - bonus_token_ids: torch.Tensor, - draft_probs: torch.Tensor, - draft_token_ids: torch.Tensor, - seeded_seqs: Optional[dict[int, torch.Generator]] = None, - ) -> torch.Tensor: - """Sample token ids using rejection sampling. This accepts or rejects - tokens proposed by the draft model using the probability of each token - according to the draft and target models. - - In the worst case where all draft tokens are rejected, it is guaranteed - one correct token will be emitted. - - In the case where all draft tokens are accepted, a bonus token will be - accepted as its cheap to have the target model score this speculative - sequence. - - Args: - target_with_bonus_probs: The probability distribution - over token ids given context according to the target model. - shape = [batch_size, num_speculative_tokens + 1, vocab_size] - - bonus_token_ids: The "bonus" token ids that are accepted iff all - speculative tokens in a sequence are accepted. - shape = [batch_size, num_bonus_tokens] - - draft_probs: The probability distribution over token ids given - context according to the draft model. - shape = [batch_size, num_speculative_tokens, vocab_size] - - draft_token_ids: The token ids that were sampled from the draft - probabilities. - shape = [batch_size, num_speculative_tokens] - - seeded_seqs: Dict of batch row index to torch generator, for - sequences using seeded generation. - - Returns: - output_token_ids: The token ids sampled via rejection sampling, - or -1 if unable to sample a token because the previous token - was rejected. - shape = [batch_size, num_speculative_tokens + num_bonus_tokens] - """ - # Only perform shape/dtype/device checking in strict mode, as it adds - # overhead. - if self._strict_mode: - self._raise_if_incorrect_input(target_with_bonus_probs, - draft_token_ids, bonus_token_ids, - draft_probs) - - batch_size, k, _ = draft_probs.shape - - # batch_size = 0 when all requests in the batch are - # non_spec requests. In this case, output_token_ids is - # just an empty tensor. - if batch_size == 0: - return torch.empty(0, k + 1, device=draft_probs.device, dtype=int) - - # If use Flashinfer chain_speculative_sampling kernel - # for rejection sampling - if self.use_flashinfer and chain_speculative_sampling is not None: - batch_size, k, _ = draft_probs.shape - - (output_token_ids, accepted_token_num, - emitted_token_num) = chain_speculative_sampling( - draft_probs, - draft_token_ids, - target_with_bonus_probs, - ) - - # num_emitted_tokens returned by flashinfer - # does not include the bonus token - # Flashinfer stops at the first token that violates - # the condition p >= q and does not include recovery/bonus token. - # Therefore, we need to add batch_size here. 
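# A minimal, self-contained sketch (plain Python, not the vLLM API) of the
# bookkeeping described above: per sequence, the number of emitted tokens
# equals the number of accepted draft tokens plus one, where the extra token
# is either the recovered token after the first rejection or the bonus token
# when every draft is accepted. The flashinfer kernel reports emitted tokens
# without the bonus token, which is why `batch_size` is added once per step.

def emitted_tokens_per_sequence(accepted: list[bool]) -> int:
    """Count tokens emitted for one sequence given per-draft accept flags."""
    num_accepted = 0
    for ok in accepted:
        if not ok:
            break
        num_accepted += 1
    # +1 for the recovered token (on rejection) or the bonus token (all accepted).
    return num_accepted + 1

assert emitted_tokens_per_sequence([True, True, True]) == 4     # all accepted + bonus
assert emitted_tokens_per_sequence([True, False, True]) == 2    # stop at first rejection
assert emitted_tokens_per_sequence([False, False, False]) == 1  # worst case: one token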
- self.num_accepted_tokens += accepted_token_num.sum() - self.num_emitted_tokens += emitted_token_num.sum() + batch_size - self.num_draft_tokens += batch_size * k - else: - accepted, recovered_token_ids = ( - self._batch_modified_rejection_sampling( - target_with_bonus_probs[:, :-1], - draft_probs, - draft_token_ids, - seeded_seqs, - )) - - output_token_ids = self._create_output( - accepted, - recovered_token_ids, - draft_token_ids, - bonus_token_ids, - ) - - return output_token_ids - - def _batch_modified_rejection_sampling( - self, - target_probs: torch.Tensor, # [batch_size, k, vocab_size] - draft_probs: torch.Tensor, # [batch_size, k, vocab_size] - draft_token_ids: torch.Tensor, # [batch_size, k] - seeded_seqs: Optional[dict[int, torch.Generator]], - ) -> tuple[torch.Tensor, torch.Tensor]: - """Perform modified rejection sampling on each sequence. - - Returns: - A tuple of two tensors: - 0: A bool tensor of which tokens in each sequence is accepted. - shape = [batch_size, k] - 1: Token ids sampled from a recovered distribution, to be used - when a token is rejected. - shape = [batch_size, k] - """ - - batch_size, k, vocab_size = draft_probs.shape - - # shape [batch_size, k] - accepted = self._get_accepted(target_probs, draft_probs, - draft_token_ids, seeded_seqs) - - recovered_probs = self._get_recovered_probs( - target_probs, draft_probs).reshape(batch_size * k, vocab_size) - - # NOTE: the recovered_probs are overwritten by this method. - recovered_token_ids = _multinomial( - recovered_probs, - num_samples=1, - k=k, - seeded_seqs=seeded_seqs or {}, - ).reshape(batch_size, k) - - return accepted, recovered_token_ids - - def _create_uniform_samples(self, - seeded_seqs: Optional[dict[int, - torch.Generator]], - batch_size: int, k: int, - device: torch.device) -> torch.Tensor: - """ - Generates a batch of uniform random samples, with optional seeding - for specific sequences. - - This method creates a tensor of shape `(batch_size, k + 1)` filled - with uniform random values in the range [0, 1). If `seeded_seqs` - is provided, the sequences corresponding to specific indices - will be generated using the provided `torch.Generator` for - reproducibility. The other sequences will be generated without - a seed. - - Args: - seeded_seqs : Optional[dict[int, torch.Generator]] - A dictionary mapping indices in the batch to - `torch.Generator` objects. If `None`, all samples are - generated without a seed. - batch_size : int - The number of sequences to generate. - k : int - The number of random samples per sequence. - device : torch.device - The device on which to allocate the tensor. - - Returns: - uniform_rand : torch.Tensor - A tensor of shape `(batch_size, k + 1)` containing uniform - random values in the range [0, 1). 
- """ - if not seeded_seqs: - return torch.rand(batch_size, k + 1, device=device) - - uniform_rand = torch.empty(batch_size, k + 1, device=device) - - non_seeded_indices = [] - for idx in range(batch_size): - generator = seeded_seqs.get(idx) - if generator is None: - non_seeded_indices.append(idx) - else: - uniform_rand[idx, :] = torch.rand(1, - k + 1, - dtype=self.probs_dtype, - device=device, - generator=generator) - if non_seeded_indices: - uniform_rand[non_seeded_indices, :] = torch.rand( - len(non_seeded_indices), - k + 1, - dtype=self.probs_dtype, - device=device) - return uniform_rand - - def _get_accepted( - self, - target_probs: torch.Tensor, # [batch_size, k, vocab_size] - draft_probs: torch.Tensor, # [batch_size, k, vocab_size] - draft_token_ids: torch.Tensor, # [batch_size, k] - seeded_seqs: Optional[dict[int, torch.Generator]], - ) -> torch.Tensor: - r"""Create bool matrix over the proposed draft tokens. If - True, then a token can be accepted, else it should be - rejected. - - Given $q(\hat{x}_{n+1}|x_1, \dots, x_n)$, the probability of - $\hat{x}_{n+1}$ given context $x_1, \dots, x_n$ according - to the target model, and $p(\hat{x}_{n+1}|x_1, \dots, x_n)$, the - same conditional probability according to the draft model, the token - is accepted with probability: - - $$ - \min\left(1, \frac{q(\hat{x}_{n+1}|x_1, \dots, x_n)} - {p(\hat{x}_{n+1}|x_1, \dots, x_n)}\right) - $$ - - This implementation does not apply causality. When using the output, - if a token is rejected, subsequent tokens should not be used. - - Returns a bool tensor of shape [batch_size, k] specifying which tokens - are accepted. - """ - batch_size, k, _ = draft_probs.shape - batch_indices = torch.arange(batch_size, - device=target_probs.device)[:, None] - probs_indices = torch.arange(k, device=target_probs.device) - - # shape [batch_size, k] - selected_draft_probs = draft_probs[batch_indices, probs_indices, - draft_token_ids] - - # shape [batch_size, k] - selected_target_probs = target_probs[batch_indices, probs_indices, - draft_token_ids] - - uniform_rand = self._create_uniform_samples(seeded_seqs, batch_size, - k - 1, target_probs.device) - - capped_ratio = torch.minimum( - selected_target_probs / selected_draft_probs, - torch.full((1, ), 1, device=target_probs.device)) - accepted = uniform_rand < capped_ratio - - return accepted - - def _get_recovered_probs( - self, - target_probs: torch.Tensor, # [k, vocab_size] - draft_probs: torch.Tensor, # [k, vocab_size] - ) -> torch.Tensor: - r"""Create a probability distribution for each proposed token which can - be sampled if the proposed token is rejected. - - When this routine is applied sequentially, the true distribution of the - target model is recovered (within hardware numerics). - - The probability distribution used in this rejection case is constructed - as follows. Given $q(x|x_1, \dots, x_n)$, the probability of - $x$ given context $x_1, \dots, x_n$ according to the target - model and $p(x|x_1, \dots, x_n)$, the same conditional probability - according to the draft model: - - $$ - x_{n+1} \sim (q(x|x_1, \dots, x_n) - p(x|x_1, \dots, x_n))_+ - $$ - - where $(f(x))_+$ is defined as: - - $$ - (f(x))_+ = \frac{\max(0, f(x))}{\sum_x \max(0, f(x))} - $$ - - See https://github.com/vllm-project/vllm/pull/2336 for a visualization - of the draft, target, and recovered probability distributions. - - Returns a tensor of shape [batch_size, k, vocab_size]. 
- - Note: - This batches operations on GPU and thus constructs the recovered - distribution for all tokens, even if they are accepted. This causes - division-by-zero errors, so we use self._smallest_positive_value to - avoid that. This introduces some drift to the distribution. - """ - _, k, _ = draft_probs.shape - - # shape [batch_size, k, vocab_size] - difference = target_probs - draft_probs - - # TODO(cade): Can we use logprobs instead of probs, and avoid the - # division-by-zero errors without introducing distribution drift? - - # shape [batch_size, k, vocab_size] - f = torch.clamp(difference, min=self._smallest_positive_value) - - # shape [batch_size, k, vocab_size] - recovered_probs = f / torch.sum(f, dim=-1).reshape(-1, k, 1) - - return recovered_probs - - @cached_property - def _smallest_positive_value(self) -> float: - """Return the smallest positive value representable by the probs dtype. - This value is used when constructing a distribution from which to sample - recovered tokens in the first rejection case. - - See _get_recovered_probs for more details - - Note that this isn't actually the smallest positive value representable - by float32, but the smallest positive normal value. - See https://en.wikipedia.org/wiki/Subnormal_number for more information. - """ - return torch.finfo(self.probs_dtype).tiny - - -# torch.multinomial forces a GPU<->CPU sync. -# Therefore, we use an optimized implementation instead that skips the sync. -# Note that we always sample with replacement. -# probs will be modified in place, but this is fine, as we pass -# in a copy already. -@torch.compile(dynamic=True, backend=current_platform.simple_compile_backend) -def _multinomial( - probs: torch.Tensor, - num_samples: int, - k: int, - seeded_seqs: dict[int, torch.Generator], -) -> torch.Tensor: - - if num_samples > 1: - # This is equivalent to torch.repeat_interleaved (which also - # forces a GPU<->CPU sync). - probs = probs[:, None, :].expand(probs.shape[0], num_samples, - probs.shape[1]).contiguous().view( - -1, probs.shape[1]) - q = torch.empty_like(probs) - if not seeded_seqs: - q.exponential_(1.0) - else: - start = 0 - for idx in range(len(q) // k): - end = start + k - generator = seeded_seqs.get(idx) - # Note: generator might be None for non seeded - q[start:end].exponential_(1.0, generator=generator) - start = end - - return probs.div_(q).argmax(dim=1).view(-1, num_samples) diff --git a/vllm/model_executor/layers/sampler.py b/vllm/model_executor/layers/sampler.py index 08840fc40cf6a..e77eb637c8942 100644 --- a/vllm/model_executor/layers/sampler.py +++ b/vllm/model_executor/layers/sampler.py @@ -21,7 +21,6 @@ from vllm.sampling_params import SamplingType from vllm.sequence import (VLLM_INVALID_TOKEN_ID, CompletionSequenceGroupOutput, Logprob, PromptLogprobs, SampleLogprobs, SequenceOutput) -from vllm.spec_decode.metrics import SpecDecodeWorkerMetrics if envs.VLLM_USE_FLASHINFER_SAMPLER and find_spec("flashinfer"): # yapf: disable @@ -119,9 +118,6 @@ class SamplerOutput( # specified in lieu of prompt token ids or text. sampled_token_embeds: Optional[torch.Tensor] = None - # Spec decode metrics populated by workers. - spec_decode_worker_metrics: Optional[SpecDecodeWorkerMetrics] = None - # Optional last hidden states from the model. 
hidden_states: Optional[torch.Tensor] = None @@ -159,11 +155,9 @@ class SamplerOutput( else self.sampled_token_probs.shape) sampled_token_ids_repr = ("None" if self.sampled_token_ids is None else self.sampled_token_ids.shape) - return ( - f"SamplerOutput(outputs={self.outputs}, " - f"sampled_token_probs={sampled_token_probs_repr}, " - f"sampled_token_ids={sampled_token_ids_repr}, " - f"spec_decode_worker_metrics={self.spec_decode_worker_metrics})") + return (f"SamplerOutput(outputs={self.outputs}, " + f"sampled_token_probs={sampled_token_probs_repr}, " + f"sampled_token_ids={sampled_token_ids_repr})") class Sampler(nn.Module): diff --git a/vllm/model_executor/layers/spec_decode_base_sampler.py b/vllm/model_executor/layers/spec_decode_base_sampler.py deleted file mode 100644 index 0a36fe9be45b1..0000000000000 --- a/vllm/model_executor/layers/spec_decode_base_sampler.py +++ /dev/null @@ -1,259 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -from abc import abstractmethod -from typing import Optional, Union - -import torch -import torch.jit -import torch.nn as nn - -from vllm.platforms import current_platform - - -class SpecDecodeBaseSampler(nn.Module): - """Base class for samplers used for Speculative Decoding verification - step. - """ - - def __init__(self, strict_mode: bool = False): - """Base class constructor. - Args: - strict_mode: Whether or not to perform shape/device/dtype checks - during sampling. This catches correctness issues but adds - nontrivial latency. - """ - super().__init__() - self._strict_mode = strict_mode - - # NOTE: A "bonus token" is accepted iff all proposal tokens are - # accepted. There is always only one possible bonus token. We store this - # value in a variable for readability. - self._num_bonus_tokens = 1 - - self.num_accepted_tokens: Optional[torch.Tensor] = None - self.num_emitted_tokens: Optional[torch.Tensor] = None - self.num_draft_tokens: int = 0 - - def init_gpu_tensors(self, device: Union[int, str]) -> None: - assert self.num_accepted_tokens is None - if isinstance(device, int): - device = f"{current_platform.device_type}:{device}" - elif not isinstance(device, str): - raise ValueError(f"Device must be int or str, get {type(device)}") - self.num_accepted_tokens = torch.tensor(0, - dtype=torch.long, - device=device) - self.num_emitted_tokens = torch.tensor(0, - dtype=torch.long, - device=device) - - def init_tensors(self, - device: Union[int, str], - device_type: Union[torch.device, str] = 'cuda') -> None: - assert self.num_accepted_tokens is None - if isinstance(device_type, torch.device): - device_type = device_type.type - if isinstance(device, int): - device = f"{device_type}:{device}" - self.num_accepted_tokens = torch.tensor(0, - dtype=torch.long, - device=device) - self.num_emitted_tokens = torch.tensor(0, - dtype=torch.long, - device=device) - - @property - def probs_dtype(self): - return torch.float32 - - @property - def token_id_dtype(self): - return torch.int64 - - def _create_output( - self, - accepted: torch.Tensor, # [batch_size, k] - substitute_token_ids: torch.Tensor, # [batch_size, k] - draft_token_ids: torch.Tensor, # [batch_size, k] - bonus_token_ids: torch.Tensor, # [batch_size] - ) -> torch.Tensor: - """Format output. Returns a matrix of token ids. When - a token is rejected via sampling, all subsequent token ids are - set to -1 for the sequence. 
- - Args: - accepted: A boolean tensor indicating if the corresponding - draft token in draft_token_ids should be accepted or not. - substitute_token_ids: A tensor of token_ids that can be used - as substitutes for the draft token ids if the proposed token - is rejected. - draft_token_ids: A tensor of token ids speculated by the - draft model. - bonus_token_ids: Token ids to use as the bonus token if - all the draft tokens are accepted. - Returns: - A tensor containing the accepted token ids. The shape of the - tensor is [batch_size, k + num_bonus_tokens] - """ - batch_size, k = substitute_token_ids.shape - bonus_token_ids = bonus_token_ids.squeeze(-1) - # Determine the index of the first False value for each row. - limits = (accepted == 0).max(1).indices - limits[~(accepted == 0).any(1)] = k - - # Create masks using the indices. - indices = torch.arange(k, device=accepted.device).unsqueeze(0) - accepted_mask = indices < limits.unsqueeze(1) - after_false_mask = indices == limits.unsqueeze(1) - - # Create an extended output tensor - output_with_bonus_tokens = -torch.ones( - (batch_size, k + self._num_bonus_tokens), - dtype=self.token_id_dtype, - device=accepted.device) - output = output_with_bonus_tokens[:, :k] - - # Fill in the first k columns of the output tensor using masks and data - # tensors. - output[:, :k] = torch.where(accepted_mask, draft_token_ids, - -torch.ones_like(draft_token_ids)) - - # Fill the last column. - # We check output directly as accepted may have True values inconsistent - # with causal acceptance. - output_with_bonus_tokens[:, -1] = torch.where(output[:, -1] != -1, - bonus_token_ids, -1) - - # Fill the recovered token ids. - output.mul_(~after_false_mask).add_( - substitute_token_ids.mul(after_false_mask)) - - self.num_accepted_tokens += accepted.sum() - self.num_emitted_tokens += (output_with_bonus_tokens != -1).sum() - self.num_draft_tokens += batch_size * k - - return output_with_bonus_tokens - - def _raise_if_incorrect_input( - self, - target_with_bonus_probs: torch.Tensor, - draft_token_ids: torch.Tensor, - bonus_token_ids: torch.Tensor, - draft_probs: Optional[torch.Tensor] = None, - ) -> None: - self._raise_if_incorrect_shape(target_with_bonus_probs, - draft_token_ids, bonus_token_ids, - draft_probs) - self._raise_if_incorrect_dtype(target_with_bonus_probs, - draft_token_ids, bonus_token_ids, - draft_probs) - self._raise_if_inconsistent_device(target_with_bonus_probs, - draft_token_ids, bonus_token_ids, - draft_probs) - self._raise_if_out_of_bounds_vocab(target_with_bonus_probs.shape[-1], - draft_token_ids, bonus_token_ids) - - def _raise_if_incorrect_shape( - self, - target_with_bonus_probs: torch.Tensor, - draft_token_ids: torch.Tensor, - bonus_token_ids: torch.Tensor, - draft_probs: Optional[torch.Tensor] = None, - ) -> None: - (target_batch_size, num_target_probs, - target_vocab_size) = target_with_bonus_probs.shape - - # Does not count the extra token - num_target_probs -= 1 - - # validate the shape of draft token ids. 
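# Illustrative sketch (single toy sequence, plain Python; not the tensorized
# implementation above) of the output layout _create_output produces: accepted
# draft tokens are kept up to the first rejection, the rejected position gets
# the substitute (recovered) token, later positions stay -1, and the bonus
# token is only emitted when every draft token was accepted.
def build_output_row(accepted, draft_ids, substitute_ids, bonus_id):
    k = len(draft_ids)
    row = [-1] * (k + 1)
    for i in range(k):
        if accepted[i]:
            row[i] = draft_ids[i]
        else:
            row[i] = substitute_ids[i]  # recovered token at the first rejection
            return row                  # everything after it stays -1
    row[k] = bonus_id                   # all drafts accepted -> bonus token
    return row

assert build_output_row([True, True, True], [5, 6, 7], [9, 9, 9], 42) == [5, 6, 7, 42]
assert build_output_row([True, False, True], [5, 6, 7], [9, 8, 9], 42) == [5, 8, -1, -1]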
- draft_token_ids_batch_size, num_draft_token_ids = draft_token_ids.shape - assert draft_token_ids_batch_size == target_batch_size - assert num_draft_token_ids == num_target_probs - - # validate the shape of bonus token ids - bonus_batch_size, num_bonus_tokens = bonus_token_ids.shape - assert bonus_batch_size == target_batch_size - assert num_bonus_tokens == self._num_bonus_tokens - - # validate the shape of draft probs if it is set - if draft_probs is not None: - (draft_batch_size, num_draft_probs, - draft_vocab_size) = draft_probs.shape - assert draft_batch_size == target_batch_size - assert num_draft_probs == num_target_probs - assert (draft_vocab_size == target_vocab_size - ), f"{draft_vocab_size=} {target_vocab_size=}" - - def _raise_if_incorrect_dtype( - self, - target_with_bonus_probs: torch.Tensor, - draft_token_ids: torch.Tensor, - bonus_token_ids: torch.Tensor, - draft_probs: Optional[torch.Tensor] = None, - ) -> None: - assert target_with_bonus_probs.dtype == self.probs_dtype - assert draft_token_ids.dtype == self.token_id_dtype - assert bonus_token_ids.dtype == self.token_id_dtype - if draft_probs is not None: - assert draft_probs.dtype == self.probs_dtype - - def _raise_if_inconsistent_device( - self, - target_with_bonus_probs: torch.Tensor, - draft_token_ids: torch.Tensor, - bonus_token_ids: torch.Tensor, - draft_probs: Optional[torch.Tensor] = None, - ) -> None: - devices = [ - t.device for t in [ - target_with_bonus_probs, bonus_token_ids, draft_probs, - draft_token_ids - ] if t is not None - ] - assert all([devices[0] == device for device in devices]) - - def _raise_if_out_of_bounds_vocab( - self, - vocab_size: int, - draft_token_ids: torch.Tensor, - bonus_token_ids: torch.Tensor, - ) -> None: - assert torch.all(bonus_token_ids < vocab_size) - assert torch.all(bonus_token_ids >= 0) - assert torch.all(draft_token_ids < vocab_size) - assert torch.all(draft_token_ids >= 0) - - -class SpecDecodeDeterministicBaseSampler(SpecDecodeBaseSampler): - """Base class for samplers used for Speculative Decoding verification - step which are deterministic. 
- """ - - @abstractmethod - def forward( - self, - target_with_bonus_probs: torch.Tensor, - bonus_token_ids: torch.Tensor, - draft_probs: torch.Tensor, - draft_token_ids: torch.Tensor, - ) -> torch.Tensor: - raise NotImplementedError - - -class SpecDecodeStochasticBaseSampler(SpecDecodeBaseSampler): - """Base class for samplers used for Speculative Decoding verification - step which are stochastic - """ - - @abstractmethod - def forward( - self, - target_with_bonus_probs: torch.Tensor, - bonus_token_ids: torch.Tensor, - draft_probs: torch.Tensor, - draft_token_ids: torch.Tensor, - seeded_seqs: Optional[dict[int, torch.Generator]] = None, - ) -> torch.Tensor: - raise NotImplementedError diff --git a/vllm/model_executor/layers/typical_acceptance_sampler.py b/vllm/model_executor/layers/typical_acceptance_sampler.py deleted file mode 100644 index 5dabaa5379e7b..0000000000000 --- a/vllm/model_executor/layers/typical_acceptance_sampler.py +++ /dev/null @@ -1,166 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -import torch -import torch.jit - -from vllm.model_executor.layers.spec_decode_base_sampler import ( - SpecDecodeDeterministicBaseSampler) - - -class TypicalAcceptanceSampler(SpecDecodeDeterministicBaseSampler): - """Apply typical acceptance sampling as described in section 3.3.1 in - "MEDUSA: Simple LLM Inference Acceleration Framework with - Multiple Decoding Heads" - https://arxiv.org/pdf/2401.10774 - """ - - def __init__( - self, - posterior_threshold: float, - posterior_alpha: float, - strict_mode: bool = False, - ): - """Create a Typical Acceptance Sampler. - - Args: - strict_mode: Whether or not to perform shape/device/dtype checks - during sampling. This catches correctness issues but adds - nontrivial latency. - posterior_threshold : A threshold value that sets a lower bound - on the posterior probability of a token in target model for it - to be accepted. - posterior_alpha : A scaling factor for the entropy-based - threshold in typical acceptance sampling. - """ - self._posterior_threshold = posterior_threshold - self._posterior_alpha = posterior_alpha - super().__init__(strict_mode=strict_mode) - - def forward( - self, - target_with_bonus_probs: torch.Tensor, - bonus_token_ids: torch.Tensor, - draft_probs: torch.Tensor, - draft_token_ids: torch.Tensor, - ) -> torch.Tensor: - """Sample token ids using typical acceptance sampling. This accepts - or rejects tokens proposed by the draft model using the probability - of each token according to the draft and target models. - - In the worst case where all draft tokens are rejected, it is guaranteed - one token will be emitted. - - In the case where all draft tokens are accepted, the bonus token will be - accepted. - - Args: - target_probs: The probability distribution over token ids given - context according to the target model. - shape = [batch_size, num_speculative_tokens, vocab_size] - - bonus_token_ids: The "bonus" token ids that are accepted iff all - speculative tokens in a sequence are accepted. - shape = [batch_size, num_bonus_tokens] - - draft_probs: This parameter is unused by the acceptance sampler. - - draft_token_ids: The token ids that were sampled from the draft - probabilities. - shape = [batch_size, num_speculative_tokens] - - Returns: - output_token_ids: The token ids sampled via rejection sampling, - or -1 if unable to sample a token because the previous token - was rejected. 
- shape = [batch_size, num_speculative_tokens + num_bonus_tokens] - """ - # Only perform shape/dtype/device checking in strict mode, as it adds - # overhead. - if self._strict_mode: - self._raise_if_incorrect_input(target_with_bonus_probs, - draft_token_ids, bonus_token_ids) - target_probs = target_with_bonus_probs[:, :-1] - accepted = self._evaluate_accepted_tokens(target_probs, - draft_token_ids) - recovered_token_ids = self._get_recovered_token_ids(target_probs) - output_token_ids = self._create_output(accepted, recovered_token_ids, - draft_token_ids, - bonus_token_ids) - return output_token_ids - - def _evaluate_accepted_tokens(self, target_probs, draft_token_ids): - r""" - Evaluates and returns a mask of accepted tokens based on the - posterior probabilities. - - Args: - target_probs (torch.Tensor): A tensor of shape - (batch_size, k, vocab_size) representing the probabilities of - each token in the vocabulary for each position in the proposed - sequence. This is the distribution generated by the target - model. - draft_token_ids (torch.Tensor): A tensor of shape (batch_size, k) - representing the proposed token ids. - - A draft token_id x_{n+k} is accepted if it satisfies the - following condition - - $$ - p_{\text{original}}(x_{n+k} | x_1, x_2, \dots, x_{n+k-1}) > - \min \left( \epsilon, \delta * \exp \left( - -H(p_{\text{original}}( - \cdot | x_1, x_2, \ldots, x_{n+k-1})) \right) \right) - $$ - - where $p_{\text{original}}$ corresponds to target_probs - and $\epsilon$ and $\delta$ correspond to hyperparameters - specified using self._posterior_threshold and self._posterior_alpha - - This method computes the posterior probabilities for the given - draft token ids based on the provided target probabilities. It - calculates the entropy of the posterior distribution and determines - a dynamic threshold for each token position using the provided - posterior_threshold and posterior_alpha values. The method then - returns a boolean mask indicating which tokens can be accepted. - - Returns: - torch.Tensor: A boolean tensor of shape (batch_size, k) where each - element indicates whether the corresponding draft token has - been accepted or rejected. True indicates acceptance and false - indicates rejection. - """ - device = target_probs.device - candidates_prob = torch.gather( - target_probs, dim=-1, - index=draft_token_ids.unsqueeze(-1)).squeeze(-1) - # A small constant added to prevent computing the logarithm of zero, - # which can lead to undefined values. - epsilon = 1e-5 - posterior_entropy = -torch.sum( - target_probs * torch.log(target_probs + epsilon), dim=-1) - threshold = torch.minimum( - torch.ones_like(posterior_entropy, device=device) * - self._posterior_threshold, - torch.exp(-posterior_entropy) * self._posterior_alpha, - ) - accepted_mask = candidates_prob > threshold - return accepted_mask - - def _get_recovered_token_ids(self, target_probs): - """ - The recovered token ids will fill the first unmatched token - by the target token. - - Args: - target_probs (torch.Tensor): A tensor of shape - (batch_size, k, vocab_size) containing the target probability - distribution. - - Returns: - torch.Tensor: A tensor of shape (batch_size, k) with the recovered - token ids which are selected from target probs. 
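# Toy sketch (assumed shapes, not the vLLM module) of the typical-acceptance
# rule above: a draft token is accepted when its target-model probability
# exceeds min(posterior_threshold, posterior_alpha * exp(-H(target dist))),
# and the recovered token is simply the target-model argmax.
import torch

torch.manual_seed(0)
batch_size, k, vocab_size = 2, 3, 8
target_probs = torch.softmax(torch.randn(batch_size, k, vocab_size), dim=-1)
draft_token_ids = torch.randint(vocab_size, (batch_size, k))
posterior_threshold, posterior_alpha = 0.09, 0.3

candidate_prob = torch.gather(
    target_probs, dim=-1, index=draft_token_ids.unsqueeze(-1)).squeeze(-1)
entropy = -(target_probs * torch.log(target_probs + 1e-5)).sum(dim=-1)
threshold = torch.minimum(
    torch.full_like(entropy, posterior_threshold),
    posterior_alpha * torch.exp(-entropy),
)
accepted_mask = candidate_prob > threshold          # [batch_size, k] bool
recovered_token_ids = target_probs.argmax(dim=-1)   # [batch_size, k]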
- """ - max_indices = torch.argmax(target_probs, dim=-1) - - return max_indices diff --git a/vllm/model_executor/models/eagle.py b/vllm/model_executor/models/eagle.py deleted file mode 100644 index c551ecd68ef86..0000000000000 --- a/vllm/model_executor/models/eagle.py +++ /dev/null @@ -1,261 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -from collections.abc import Iterable -from typing import Optional - -import torch -import torch.nn as nn - -from vllm.config import VllmConfig -from vllm.logger import init_logger -from vllm.model_executor.layers.layernorm import RMSNorm -from vllm.model_executor.layers.logits_processor import LogitsProcessor -from vllm.model_executor.layers.vocab_parallel_embedding import ( - DEFAULT_VOCAB_PADDING_SIZE, ParallelLMHead) -from vllm.model_executor.model_loader.weight_utils import default_weight_loader -from vllm.model_executor.models import ModelRegistry -from vllm.model_executor.sampling_metadata import SamplingMetadata -from vllm.sequence import IntermediateTensors - -from .utils import maybe_prefix - -logger = init_logger(__name__) - - -class DummyInputLayerNorm(nn.Module): - - def __init__(self, weight=None, bias=None): - super().__init__() - self.weight = nn.Parameter(weight) if weight is not None else None - self.bias = nn.Parameter(bias) if bias is not None else None - - def forward(self, x): - return x - - -class DummyOutputNorm(nn.Module): - - def forward(self, x, residual): - if residual is None: - return x - else: - return x + residual, None - - -class EAGLE(nn.Module): - """This class implements the EAGLE draft model from the paper: https://arxiv.org/pdf/2401.15077 - Reference implementation: https://github.com/SafeAILab/EAGLE - - Differences from reference implementation: - 1. In reference, LlamaDecoderLayer implementation doesn't have - input_layernorm for 1st decoder layer (https://github.com/SafeAILab/EAGLE/blob/7d065d084443fbfd386f88839efd7193c12be869/eagle/model/cnets.py#L427). - Following this approach, our implementation also disables - the input_layernorm for the first decoder layer. - 2. We allow any decoder layer to be used in EAGLE whereas in reference - decoder layer is fixed to be LlamaDecoderLayer. - 3. We have an optional token_map which reduces draft vocab to most - frequently used tokens to give some additional speed-up by reducing - sampling overhead. This is disabled unless the checkpoint file has - explicit token_map tensor and config has an optional attribute - truncated_vocab_size < vocab_size. To use this technique, one has to find - the top-k most frequent tokens in target dataset and add that as a tensor - in the draft checkpoint (using key token_map). Also, the draft config - needs to have truncated_vocab_size (=k) as an attribute. - 4. We allow an enhanced EAGLE architecture similar to the DeepSeek MTP - module with regards to the use of additional RMS norms. The original - EAGLE architecture 1) skips the pre-attention norm in its first - transformer block, and 2) skips the final output norm, both of which we - found to be suboptimal. We also add the support for separate norms - applying to both the token embedding and hidden states before projection - as in DeepSeek MTP, which we found to improve performance as well. 
- """ - - def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): - super().__init__() - config = vllm_config.model_config.hf_config - self.dtype = vllm_config.model_config.dtype - self.config = config - - architectures = getattr(self.config.model, "architectures", []) - model_cls, _ = ModelRegistry.resolve_model_cls(architectures) - - self.model = model_cls(vllm_config=vllm_config, - prefix=maybe_prefix(prefix, "model")) - - self.fc = nn.Linear(config.model.hidden_size * 2, - config.model.hidden_size, - bias=getattr(self.config, "eagle_fc_bias", False)) - - # Modify layer normalization and residual connections as suggested - # in the EAGLE framework: https://github.com/SafeAILab/EAGLE - # While weights and biases are generally not needed, - # they are retained here to support certain unit tests - # (e.g., spec_decode/e2e/test_eagle_correctness.py). - if not hasattr(self.config.model, - "skip_prenorm") or self.config.model.skip_prenorm: - self.model.model.layers[0].input_layernorm = DummyInputLayerNorm( - weight=self.model.model.layers[0].input_layernorm.weight) - - if not hasattr( - self.config.model, - "skip_output_norm") or self.config.model.skip_output_norm: - self.model.model.norm = DummyOutputNorm() - - self.add_para_norm = False - if hasattr(self.config.model, - "add_para_norm") and self.config.model.add_para_norm: - self.enorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps) - self.hnorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps) - self.add_para_norm = True - - self.orig_vocab_size = config.vocab_size - self.truncated_vocab_size = config.truncated_vocab_size - self.unpadded_vocab_size = self.truncated_vocab_size - - self.lm_head = ParallelLMHead( - self.unpadded_vocab_size, - config.hidden_size, - org_num_embeddings=self.truncated_vocab_size, - padding_size=DEFAULT_VOCAB_PADDING_SIZE, - ) - - logit_scale = getattr(config, "logit_scale", 1.0) - self.logits_processor = LogitsProcessor(self.unpadded_vocab_size, - self.truncated_vocab_size, - logit_scale) - - # Token map is a idx to token mapping to reduce the vocab size for - # the draft model. Using smaller vocab size for draft, containing - # only most frequent tokens reduces the speculation overhead. This - # doesn't affect the acceptance rate much and thus gives more speed - # -up. By default, this is disabled and is only used if the EAGLE - # checkpoint file has token_map tensor. 
- self.token_map = None - - def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: - return self.model.model.get_input_embeddings(input_ids) - - def forward( - self, - input_ids: torch.Tensor, - positions: torch.Tensor, - previous_hidden_states: torch.Tensor, - intermediate_tensors: Optional[IntermediateTensors] = None, - inputs_embeds: Optional[torch.Tensor] = None, - ) -> torch.Tensor: - - if inputs_embeds is None: - inputs_embeds = self.get_input_embeddings(input_ids) - - # Handle both empty previous_hidden_states - # and mismatched batch size - batch_size = inputs_embeds.size(0) - if previous_hidden_states.size(0) == 0 or \ - previous_hidden_states.size(0) != batch_size: - hidden_dim = self.config.model.hidden_size - device = inputs_embeds.device - # Create zero tensor with matching batch size - previous_hidden_states = \ - torch.zeros(batch_size, hidden_dim, device=device) - - if self.add_para_norm: - inputs_embeds = torch.cat([ - self.enorm(inputs_embeds), - self.hnorm(previous_hidden_states) - ], - dim=-1) - else: - inputs_embeds = torch.cat([inputs_embeds, previous_hidden_states], - dim=-1) - - inputs_embeds = self.fc(inputs_embeds) - - inputs_embeds[positions == 0] = 0 # masking inputs at position=0 - - hidden_states = self.model.model( - input_ids=None, - inputs_embeds=inputs_embeds, - positions=positions, - intermediate_tensors=intermediate_tensors, - ) - return hidden_states - - def compute_logits(self, hidden_states: torch.Tensor, - sampling_metadata: SamplingMetadata) -> torch.Tensor: - logits = self.logits_processor(self.lm_head, hidden_states, - sampling_metadata) - - if self.token_map is not None: - _logits = logits - logits = -torch.inf * torch.ones( - size=(*_logits.shape[:-1], self.orig_vocab_size), - device=_logits.device, - dtype=_logits.dtype) - - logits[..., self.token_map] = _logits - - return logits - - def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]): - # This implementation is incompatible with https://huggingface.co/yuhuili/EAGLE-LLaMA3-Instruct-8B - # due to missing lm_head weights and its config being that of a - # Llama model. 
Here's a compatible version with the same weights: - # https://huggingface.co/abhigoyal/EAGLE-LLaMA3-Instruct-8B-vllm - # Also, here's an example script for converting trained EAGLE - # checkpoint to vLLM compatible version: https://gist.github.com/abhigoyal1997/1e7a4109ccb7704fbc67f625e86b2d6d - model_weights = {} - for name, loaded_weight in weights: - if name == "token_map": - if self.config.truncated_vocab_size < self.config.vocab_size: - self.token_map = nn.Parameter(loaded_weight, - requires_grad=False) - elif name.startswith("fc.weight"): - weight_loader = getattr(self.fc.weight, "weight_loader", - default_weight_loader) - weight_loader(self.fc.weight, loaded_weight) - elif name.startswith("fc.bias"): - if self.fc.bias is not None: - weight_loader = getattr(self.fc.bias, "weight_loader", - default_weight_loader) - weight_loader(self.fc.bias, loaded_weight) - else: - logger.warning_once("Found bias in the loaded weights but " - "the model config doesn't have bias.") - elif name.startswith("enorm.weight"): - weight_loader = getattr(self.enorm.weight, "weight_loader", - default_weight_loader) - weight_loader(self.enorm.weight, loaded_weight) - elif name.startswith("hnorm.weight"): - weight_loader = getattr(self.hnorm.weight, "weight_loader", - default_weight_loader) - weight_loader(self.hnorm.weight, loaded_weight) - elif name.startswith("model.lm_head.") or name.startswith( - "model.model."): - model_weights[name.split("model.", 1)[-1]] = loaded_weight - elif name.startswith("lm_head.") or name.startswith("model."): - model_weights[name] = loaded_weight - else: - model_weights[f"model.{name}"] = loaded_weight - - if "lm_head.weight" in model_weights: - lm_head_weight = model_weights.pop("lm_head.weight") - - if self.token_map is not None and\ - lm_head_weight.shape[0] > self.token_map.shape[0]: - - lm_head_weight = lm_head_weight[self.token_map] - - else: - # NOTE(Shangming): initialize the placeholder for lm_head weight. - lm_head_weight = torch.zeros( - self.lm_head.org_vocab_size, - self.lm_head.embedding_dim, - dtype=self.dtype, - ) - - weight_loader = getattr(self.lm_head.weight, "weight_loader", - default_weight_loader) - weight_loader(self.lm_head.weight, lm_head_weight) - - self.model.load_weights(model_weights.items()) diff --git a/vllm/model_executor/models/registry.py b/vllm/model_executor/models/registry.py index fd831727ab2f4..d5233c28b19b6 100644 --- a/vllm/model_executor/models/registry.py +++ b/vllm/model_executor/models/registry.py @@ -239,14 +239,15 @@ _MULTIMODAL_MODELS = { _SPECULATIVE_DECODING_MODELS = { "MiMoMTPModel": ("mimo_mtp", "MiMoMTP"), - "EAGLEModel": ("eagle", "EAGLE"), "EagleLlamaForCausalLM": ("llama_eagle", "EagleLlamaForCausalLM"), "EagleLlama4ForCausalLM": ("llama4_eagle", "EagleLlama4ForCausalLM"), "EagleMiniCPMForCausalLM": ("minicpm_eagle", "EagleMiniCPMForCausalLM"), "Eagle3LlamaForCausalLM": ("llama_eagle3", "Eagle3LlamaForCausalLM"), "DeepSeekMTPModel": ("deepseek_mtp", "DeepSeekMTP"), "MedusaModel": ("medusa", "Medusa"), - "MLPSpeculatorPreTrainedModel": ("mlp_speculator", "MLPSpeculator"), + # Temporarily disabled. + # # TODO(woosuk): Re-enable this once the MLP Speculator is supported in V1. 
+ # "MLPSpeculatorPreTrainedModel": ("mlp_speculator", "MLPSpeculator"), } _TRANSFORMERS_MODELS = { diff --git a/vllm/platforms/cuda.py b/vllm/platforms/cuda.py index 240724a675a44..962e2b3aab601 100644 --- a/vllm/platforms/cuda.py +++ b/vllm/platforms/cuda.py @@ -132,14 +132,10 @@ class CudaPlatformBase(Platform): parallel_config.worker_cls = \ "vllm.worker.multi_step_worker.MultiStepWorker" elif vllm_config.speculative_config: - if envs.VLLM_USE_V1: - parallel_config.worker_cls = \ - "vllm.v1.worker.gpu_worker.Worker" - else: - parallel_config.worker_cls = \ - "vllm.spec_decode.spec_decode_worker.create_spec_worker" - parallel_config.sd_worker_cls = \ - "vllm.worker.worker.Worker" + if not envs.VLLM_USE_V1: + raise NotImplementedError( + "Speculative decoding is not supported on vLLM V0.") + parallel_config.worker_cls = "vllm.v1.worker.gpu_worker.Worker" else: if envs.VLLM_USE_V1: parallel_config.worker_cls = \ diff --git a/vllm/platforms/rocm.py b/vllm/platforms/rocm.py index e9e18d3fe8e4b..0bf9262776b18 100644 --- a/vllm/platforms/rocm.py +++ b/vllm/platforms/rocm.py @@ -326,15 +326,10 @@ class RocmPlatform(Platform): parallel_config.worker_cls = \ "vllm.worker.multi_step_worker.MultiStepWorker" elif vllm_config.speculative_config: - if envs.VLLM_USE_V1: + if not envs.VLLM_USE_V1: raise NotImplementedError( - "Speculative decoding is not yet supported on vLLM V1." - ) - else: - parallel_config.worker_cls = \ - "vllm.spec_decode.spec_decode_worker.create_spec_worker" - parallel_config.sd_worker_cls = \ - "vllm.worker.worker.Worker" + "Speculative decoding is not supported on vLLM V0.") + parallel_config.worker_cls = "vllm.v1.worker.gpu_worker.Worker" else: if envs.VLLM_USE_V1: parallel_config.worker_cls = \ diff --git a/vllm/sequence.py b/vllm/sequence.py index ffe890eb2dab4..87ba74c68536f 100644 --- a/vllm/sequence.py +++ b/vllm/sequence.py @@ -112,13 +112,6 @@ class RequestMetrics: model_execute_time: The time spent in the model execute function. This will include model forward, block/sync across workers, cpu-gpu sync time and sampling time. - spec_token_acceptance_counts: number of accepted speculative tokens at - each position; the first token is from - the target model and is always accepted; - e.g., when it's [10, 8, 4, 2] for a req, - it means there were 10 forward passes in - total, and there were 8, 4, 2 accepted - tokens at 1st, 2nd, 3rd speculation step. """ arrival_time: float last_token_time: float @@ -129,7 +122,6 @@ class RequestMetrics: scheduler_time: Optional[float] = None model_forward_time: Optional[float] = None model_execute_time: Optional[float] = None - spec_token_acceptance_counts: Optional[list[int]] = None class SequenceDataDelta( @@ -748,9 +740,7 @@ class SequenceGroup: last_token_time=arrival_time, first_scheduled_time=None, first_token_time=None, - time_in_queue=None, - spec_token_acceptance_counts=[0] * - draft_size) + time_in_queue=None) self.last_token_latency = 0.0 self.lora_request = lora_request self.prompt_logprobs: Optional[PromptLogprobs] = None @@ -1390,8 +1380,6 @@ class ExecuteModelRequest( previous_hidden_states: Optional[HiddenStates] = None # The number of forward steps to run. num_steps: int = 1 - # The step index for spec model input. - spec_step_idx: Optional[int] = None # Finished request ids since last step. finished_requests_ids: list[str] = msgspec.field(default_factory=list) # The last sampled token ids for multi step decoding. 
diff --git a/vllm/spec_decode/__init__.py b/vllm/spec_decode/__init__.py deleted file mode 100644 index e69de29bb2d1d..0000000000000 diff --git a/vllm/spec_decode/batch_expansion.py b/vllm/spec_decode/batch_expansion.py deleted file mode 100644 index f9b882469a4df..0000000000000 --- a/vllm/spec_decode/batch_expansion.py +++ /dev/null @@ -1,506 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -from array import array -from itertools import chain, count -from typing import Iterator, List, Optional, Tuple - -import torch - -from vllm import SamplingParams -from vllm.model_executor.layers.sampler import SamplerOutput -from vllm.sequence import (VLLM_INVALID_TOKEN_ID, VLLM_TOKEN_ID_ARRAY_TYPE, - ExecuteModelRequest, SequenceData, - SequenceGroupMetadata, get_all_seq_ids) -from vllm.spec_decode.interfaces import (SpeculativeProposals, - SpeculativeScorer, SpeculativeScores) -from vllm.spec_decode.util import nvtx_range, split_batch_by_proposal_len - -SeqId = int -TargetSeqId = int -TokenId = int - -DEFAULT_SIMPLE_SAMPLING_PARAMS = SamplingParams() - - -class BatchExpansionTop1Scorer(SpeculativeScorer): - """Implements a speculative scorer that uses batch expansion to get - probabilities of speculative tokens according to the scoring model. - - Batch expansion converts a list of sequences and multiple query positions - to a new batch of sequences, each with a single query position. This allows - for MQA-like scoring in speculative decoding without requiring an MQA - kernel. - - It is strictly less efficient than MQA scoring. - - It only supports scoring the top1 proposal tokens of the proposer, instead - of topk/tree. - """ - - @nvtx_range("BatchExpansionTop1Scorer.score_proposals") - def score_proposals( - self, - execute_model_req: ExecuteModelRequest, - proposals: SpeculativeProposals, - ) -> SpeculativeScores: - """Score the proposed tokens via the scorer model. - - This converts each input sequence to a set of k+1 target sequences. The - target sequences have the unique continuations to be scored and a - unique sequence ID that is different from all input sequence ids. - - If a speculative sequence length would exceed the max model length, then - no speculation is produced for that sequence. - - Args: - execute_model_req: The execution request. - proposals: The speculative proposals to score. - Returns: - SpeculativeScores: The scores of each speculative token, along with - which sequences were ignored during scoring. - """ - - # TODO(cade) perform this on GPU to remove blocking call. - proposal_lens_list = proposals.proposal_lens.tolist() - proposal_token_ids_list = proposals.proposal_token_ids.tolist() - - # Filter the list to ignore invalid proposals. 
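# Toy sketch (plain lists and an explicit id counter; not SequenceGroupMetadata)
# of the batch expansion described in the docstrings above: a sequence with k
# proposal tokens becomes k + 1 single-query target sequences, each carrying
# one more proposal token than the previous one and a fresh sequence id that
# cannot clash with ids in the original batch.
from itertools import count

def expand_sequence(tokens, proposal, target_seq_ids):
    """Return (target_seq_id, token_ids) pairs for one input sequence."""
    return [(next(target_seq_ids), tokens + proposal[:i])
            for i in range(len(proposal) + 1)]

target_seq_ids = count(start=100)  # hypothetical: starts past all existing seq ids
expanded = expand_sequence([1, 2, 3], [10, 11], target_seq_ids)
assert [toks for _, toks in expanded] == [[1, 2, 3], [1, 2, 3, 10], [1, 2, 3, 10, 11]]
assert [sid for sid, _ in expanded] == [100, 101, 102]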
- proposal_token_ids_list_without_skips = [ - proposals for proposals in proposal_token_ids_list - if VLLM_INVALID_TOKEN_ID not in proposals - ] - - (spec_indices, non_spec_indices, target_seq_group_metadata_list, - num_scoring_tokens) = self._expand_batch( - seq_group_metadata_list=execute_model_req.seq_group_metadata_list, - proposal_token_ids_list=proposal_token_ids_list_without_skips, - proposal_lens_list=proposal_lens_list, - ) - - target_sampler_output = self._scorer_worker.execute_model( - execute_model_req=execute_model_req.clone( - seq_group_metadata_list=target_seq_group_metadata_list)) - assert len(target_sampler_output) == 1, "expected single-step output" - target_sampler_output = target_sampler_output[0] - - if not non_spec_indices: - # All sequence groups in batch have spec decoding enabled - return self._contract_batch_all_spec( - target_sampler_output=target_sampler_output, - proposals=proposals, - ) - else: - # Batch has a mix of spec decode enabled and disabled seq groups - return self._contract_batch( - execute_model_req.seq_group_metadata_list, - target_sampler_output=target_sampler_output, - proposals=proposals, - num_scoring_tokens=num_scoring_tokens, - non_spec_indices=non_spec_indices, - spec_indices=spec_indices, - k=execute_model_req.num_lookahead_slots, - ) - - def _expand_batch( - self, - seq_group_metadata_list: List[SequenceGroupMetadata], - proposal_token_ids_list: List[List[TokenId]], - proposal_lens_list: List[int], - ) -> Tuple[List[int], List[int], List[SequenceGroupMetadata], int]: - """Given the input sequences and potentially multiple corresponding - proposal tokens, create a new batch where each sequence has a single - query token. - """ - - # vLLM currently only supports proposal lens equal to zero or the batch - # proposal len. This adds some complexity (splitting the batch into spec - # and non spec sequences) and should be removed in the future. It can be - # done by supporting per-sequence proposal lens. - (spec_seqs, spec_indices), (non_spec_seqs, non_spec_indices) = \ - split_batch_by_proposal_len( - seq_group_metadata_list, proposal_lens_list) - - spec_expanded_seqs = self._create_scoring_model_input( - seq_group_metadata_list=spec_seqs, - proposal_token_ids=proposal_token_ids_list, - # NOTE: We determine the seq ids in the expanded batch using the - # full seq_group_metadata_list, instead of only spec_seqs. - target_seq_ids_iter=self._create_target_seq_id_iterator( - seq_ids=get_all_seq_ids(seq_group_metadata_list)), - ) - - num_scoring_tokens = len(spec_expanded_seqs) - # Batch speculative and non-speculative (e.g. chunked prefill) requests - # but make sure order is prefill|decode due to backend requirement. - target_seq_group_metadata_list = non_spec_seqs + spec_expanded_seqs - - return (spec_indices, non_spec_indices, target_seq_group_metadata_list, - num_scoring_tokens) - - def _contract_non_speculative( - self, scores: SpeculativeScores, - seq_group_metadata_list: List[SequenceGroupMetadata], - non_spec_indices: List[int], non_spec_outputs: SpeculativeScores, - has_prompt_log: bool) -> SpeculativeScores: - """ - Augment input `scores` with non-speculative requests outputs. - This includes decode requests with speculation turned off, as well - as prefill requests when `enable_chunked_prefill` is set. - For the latter, prefills are further separated into terminal and - non-terminal chunks (from which no token is sampled). 
- """ - if not non_spec_indices: - return scores - - if has_prompt_log: - # When prompt_logprobs is enabled, prefills yield output token - # (and respective prob) in the last entry (prompt|out): - # [.|.|.|prefill0_out|.|prefill1_out|decode0_out|..]. - # With chunked prefill, non-terminal chunks have -1 on each - # position: they're still picked, but they're discarded later. - seq_meta = seq_group_metadata_list - nospec_sizes = torch.tensor([ - seq_meta[i].token_chunk_size if seq_meta[i].is_prompt else 1 - for i in non_spec_indices - ]) - nospec_sampled_token_idxs = torch.cumsum(nospec_sizes, 0).add_(-1) - else: - # In this case only sampled tokens are returned, select all. - nospec_sampled_token_idxs = list( - range(len(non_spec_outputs.token_ids))) - - scores.token_ids[non_spec_indices, :1] = \ - non_spec_outputs.token_ids[nospec_sampled_token_idxs].unsqueeze(1) - scores.probs[non_spec_indices, :1, :] = \ - non_spec_outputs.probs[nospec_sampled_token_idxs].unsqueeze(1) - scores.logprobs[non_spec_indices, :1, :] = \ - non_spec_outputs.logprobs[nospec_sampled_token_idxs].unsqueeze(1) - if scores.hidden_states is not None: - assert non_spec_outputs.hidden_states is not None - scores.hidden_states[non_spec_indices, :1, :] = \ - non_spec_outputs.hidden_states[nospec_sampled_token_idxs].unsqueeze(1) - return scores - - def _contract_batch( - self, - contracted_seq_group_metadata_list: List[SequenceGroupMetadata], - target_sampler_output: SamplerOutput, - proposals: SpeculativeProposals, num_scoring_tokens: int, - non_spec_indices: List[int], spec_indices: List[int], - k: int) -> SpeculativeScores: - """Contract the expanded batch back into its original size. - This maps the scores of speculative tokens back to their original - sequences. - - contracted_bs is the original batch size, and the batch size that the - target_sampler_output will be contracted to. - """ - contracted_bs = len(contracted_seq_group_metadata_list) - (target_token_ids, target_probs, target_logprobs, target_hidden_states, - non_spec_target_token_ids, non_spec_target_probs, - non_spec_target_logprobs, - non_spec_target_hidden_states) = self._split_scoring_output( - target_sampler_output, num_scoring_tokens) - - # Map distinct sequences used to score each token - # of shape [batch_size * k + 1] back to [batch_size, k + 1]. 
- expanded_batch_size, k = proposals.proposal_token_ids.shape - - # The number of tokens in the expanded batch used for speculation is - # equal to the total expanded batch size minus the number of samples for - # non-speculative sequences, prefill chunks with no out tokens included - non_spec_expanded_bs = len(non_spec_indices) - spec_expanded_bs = expanded_batch_size - non_spec_expanded_bs - - target_token_ids = target_token_ids.reshape(spec_expanded_bs, k + 1) - target_probs = target_probs.reshape(*target_token_ids.shape, - self._vocab_size) - target_logprobs = target_logprobs.reshape(target_probs.shape) - - if target_hidden_states is not None: - target_hidden_states = target_hidden_states.reshape( - *target_token_ids.shape, target_hidden_states.shape[-1]) - - all_tokens = target_token_ids.new_full(size=(contracted_bs, k + 1), - fill_value=-1) - all_probs = target_probs.new_zeros(*all_tokens.shape, self._vocab_size) - all_logprobs = target_logprobs.new_full(size=all_probs.shape, - fill_value=-float("inf")) - - if target_sampler_output.hidden_states is not None: - all_hidden_states = target_hidden_states.new_zeros( - size=(contracted_bs, k + 1, target_hidden_states.shape[-1])) - else: - all_hidden_states = None - - has_prompt_log = any((sg.sampling_params.prompt_logprobs - and sg.sampling_params.prompt_logprobs > 0) - for sg in contracted_seq_group_metadata_list) - # When prompt logprobs is enabled, lens of returned tensors go from - # n_sampled (requests with do_sample=True) to n_prompt+n_prefills. - # We adjust stride accordingly to get the generated tokens and - # their probs, but pass on prompt_logprobs as is. - prompt_logprobs = None - if (not self._scorer_worker.model_runner.disable_logprobs\ - and has_prompt_log): - prompt_logprobs = [ - o.prompt_logprobs for o in target_sampler_output.outputs - ] - elif not has_prompt_log: - # When prompt logprobs are not to be returned, - # we can ignore non-terminal chunks (no out token). - non_spec_indices = [ - idx for idx in non_spec_indices - if contracted_seq_group_metadata_list[idx].do_sample - ] - - # "Contract" speculative. - if spec_indices: - all_tokens[spec_indices] = target_token_ids - all_probs[spec_indices] = target_probs - all_logprobs[spec_indices] = target_logprobs - if all_hidden_states is not None: - all_hidden_states[spec_indices] = target_hidden_states - - spec_scores = SpeculativeScores(probs=all_probs, - token_ids=all_tokens, - logprobs=all_logprobs, - hidden_states=all_hidden_states, - prompt_logprobs=prompt_logprobs) - - non_spec_outputs = SpeculativeScores( - probs=non_spec_target_probs, - token_ids=non_spec_target_token_ids, - logprobs=non_spec_target_logprobs, - hidden_states=non_spec_target_hidden_states) - # Contract remaining nonspec entries based on non_spec_indices, if any. - return self._contract_non_speculative( - spec_scores, contracted_seq_group_metadata_list, non_spec_indices, - non_spec_outputs, has_prompt_log) - - def _contract_batch_all_spec( - self, - target_sampler_output: SamplerOutput, - proposals: SpeculativeProposals, - ) -> SpeculativeScores: - """Contract the expanded batch back into its original size. - This maps the scores of speculative tokens back to their original - sequences. - - It assumes all sequences in the batch were previously expanded. - """ - - # Map distinct sequences used to score each token - # of shape [batch_size * k + 1] back to [batch_size, k + 1]. 
- contracted_bs, k = proposals.proposal_token_ids.shape - - # Reshape tensors to original batch size - target_token_ids = target_sampler_output.sampled_token_ids.reshape( - contracted_bs, k + 1) - target_probs = target_sampler_output.sampled_token_probs.reshape( - *target_token_ids.shape, self._vocab_size) - target_logprobs = target_sampler_output.logprobs.reshape( - target_probs.shape) - target_hidden_states = target_sampler_output.hidden_states - if target_hidden_states is not None: - target_hidden_states = target_hidden_states.reshape( - *target_token_ids.shape, target_hidden_states.shape[-1]) - - return SpeculativeScores(probs=target_probs, - token_ids=target_token_ids, - logprobs=target_logprobs, - hidden_states=target_hidden_states, - prompt_logprobs=None) - - def _create_scoring_model_input( - self, - seq_group_metadata_list: List[SequenceGroupMetadata], - proposal_token_ids: List[List[TokenId]], # shape: [batch_size, k] - target_seq_ids_iter: Iterator[TargetSeqId], - ) -> List[SequenceGroupMetadata]: - """Given the original input sequences and proposed tokens from the draft - model, create a list of target sequences that can be used for scoring. - - target_seq_ids_iter provides sequence ids for the expanded batch, - fulfilling the requirement that no seq id in the expanded batch is equal - to the seq id in the original batch. - """ - - if not seq_group_metadata_list: - return [] - - target_seq_group_metadata = list( - chain.from_iterable( - self._create_target_seq_group_metadata( - seq_group_metadata, - proposal_token_ids, - i, - target_seq_ids_iter, - ) for i, seq_group_metadata in enumerate( - seq_group_metadata_list))) - - return target_seq_group_metadata - - def _create_target_seq_group_metadata( - self, - input_seq_group_metadata: SequenceGroupMetadata, - proposal_token_ids: List[List[TokenId]], # shape: [batch_size, k] - batch_index: int, - target_seq_ids_iter: Iterator[TargetSeqId], - ) -> List[SequenceGroupMetadata]: - """Given an input sequence group metadata and a list of draft tokens, - create a list of target SequenceGroupMetadata, one for each - token id that needs to be scored. - - Naive speculative decoding requires K target model scores, one for each - draft model token. However one can add a bonus token such that if each - token is accepted, then a final token may be sampled from the model. - This function creates K+1 target SequenceGroupMetadata to take - advantage of the bonus token. - """ - assert len(input_seq_group_metadata.seq_data) == 1, ( - "Beam search " - "not supported in speculative decoding") - input_seq_id = next(iter(input_seq_group_metadata.seq_data.keys())) - - token_ids_to_score = self._get_token_ids_to_score( - proposal_token_ids[batch_index]) - - sampling_params = input_seq_group_metadata.sampling_params - target_seq_group_metadata_list: List[SequenceGroupMetadata] = [] - for i, token_ids in enumerate(token_ids_to_score): - target_seq_group_metadata_list.append( - self._create_single_target_seq_group_metadata( - input_seq_group_metadata, - input_seq_id, - next(target_seq_ids_iter), - token_ids, - sampling_params=sampling_params, - )) - - return target_seq_group_metadata_list - - @staticmethod - def _create_single_target_seq_group_metadata( - seq_group_metadata: SequenceGroupMetadata, - seq_id: SeqId, - target_seq_id: TargetSeqId, - token_ids: List[TokenId], - sampling_params: SamplingParams, - ) -> SequenceGroupMetadata: - """Create a single target SequenceGroupMetadata. - - Args: - seq_group_metadata: The metadata for the input sequence. 
- seq_id: The input sequence ID. - target_seq_id: The corresponding target sequence ID. - token_ids: The list of token ids that are to be appended to the - input sequence. - """ - seq_data = seq_group_metadata.seq_data[seq_id] - prompt_token_ids = seq_data.prompt_token_ids_array - new_output_token_ids = [*seq_data.get_output_token_ids(), *token_ids] - mrope_position_delta = seq_data.mrope_position_delta - - new_seq_data_dict = { - target_seq_id: - SequenceData( - prompt_token_ids, - _output_token_ids=array(VLLM_TOKEN_ID_ARRAY_TYPE, - new_output_token_ids), - ), - } - # This is a hack. Technically, spec decoding should compute - # num_lookahead slots at one shot, but instead, it expands the batch - # and evaluate one by one right now. context_len is seq_len - 1 because - # the kv cache is filled by a previous batch in the batch expansion. - for data in new_seq_data_dict.values(): - data.update_num_computed_tokens(data.get_len() - 1) - data.mrope_position_delta = mrope_position_delta - - return SequenceGroupMetadata( - request_id=seq_group_metadata.request_id, - is_prompt=seq_group_metadata.is_prompt, - seq_data=new_seq_data_dict, - sampling_params=sampling_params, - block_tables={ - target_seq_id: seq_group_metadata.block_tables[seq_id], - }, - lora_request=None, - token_chunk_size=1, - ) - - @staticmethod - def _split_scoring_output( - sampler_output: SamplerOutput, num_scoring_tokens: int - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, - Optional[torch.Tensor], torch.Tensor, torch.Tensor, - torch.Tensor, Optional[torch.Tensor]]: - """Split the target model output into speculative and non-speculative - output. - """ - - # vLLM currently only supports proposal lens equal to zero or the batch - # proposal len. This adds some complexity (splitting the batch into spec - # and non spec sequences) and should be removed in the future. It can be - # done by supporting per-sequence proposal lens. - # - # First samples are non-speculative, latter samples are from speculative - # scoring (prefill|decode order). - split_sizes = (sampler_output.sampled_token_ids.numel() - - num_scoring_tokens, num_scoring_tokens) - (non_spec_probs, - spec_probs) = sampler_output.sampled_token_probs.split(split_sizes) - (non_spec_sampled_tokens, spec_sampled_tokens - ) = sampler_output.sampled_token_ids.flatten().split(split_sizes) - (non_spec_logprobs, - spec_logprobs) = sampler_output.logprobs.split(split_sizes) - - if sampler_output.hidden_states is not None: - (non_spec_hidden_states, spec_hidden_states - ) = sampler_output.hidden_states.split(split_sizes) - else: - non_spec_hidden_states, spec_hidden_states = None, None - - return (spec_sampled_tokens, spec_probs, spec_logprobs, - spec_hidden_states, non_spec_sampled_tokens, non_spec_probs, - non_spec_logprobs, non_spec_hidden_states) - - @staticmethod - def _create_target_seq_id_iterator( - seq_ids: List[SeqId]) -> Iterator[TargetSeqId]: - """Create an iterator for creating target sequence ids. - Target sequence ids are distinct from sequence ids because we create a - distinct target sequence id for each proposal token to be scored. - - This implementation increments a counter starting at 1 + max of all - provided input sequence ids. - """ - return count(start=max(seq_ids) + 1) - - @staticmethod - def _get_token_ids_to_score( - full_spec_token_ids: List[TokenId] # shape: [k] - ) -> List[List[TokenId]]: - """Given an int tensor of proposal token ids, return a list of - token ids that should be scored. - - Returns k+1 output lists. 
The additional one is used for generating the - bonus token. - - Example: - Input: [0, 1, 2, 3] (k=4) - Output: (k+1 lists) - [] - [0] - [0, 1] - [0, 1, 2] - [0, 1, 2, 3] - """ - empty_token_ids: List[TokenId] = [] - - token_ids_to_score = [empty_token_ids] - token_ids_to_score.extend(full_spec_token_ids[:i + 1] - for i in range(len(full_spec_token_ids))) - return token_ids_to_score diff --git a/vllm/spec_decode/draft_model_runner.py b/vllm/spec_decode/draft_model_runner.py deleted file mode 100644 index 96646ec947186..0000000000000 --- a/vllm/spec_decode/draft_model_runner.py +++ /dev/null @@ -1,349 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -from typing import List, Optional - -import torch - -from vllm.forward_context import set_forward_context -from vllm.model_executor.layers.sampler import SamplerOutput - -try: - try: - from vllm.attention.backends.flash_attn import FlashAttentionMetadata - except (ModuleNotFoundError, ImportError): - # vllm_flash_attn is not installed, try the ROCm FA metadata - from vllm.attention.backends.rocm_flash_attn import ( - ROCmFlashAttentionMetadata as FlashAttentionMetadata) -except (ModuleNotFoundError, ImportError) as err: - raise RuntimeError( - "Draft model speculative decoding currently only supports " - "CUDA and ROCm flash attention backend.") from err - -from vllm.logger import init_logger -from vllm.multimodal import MultiModalKwargs -from vllm.sequence import ExecuteModelRequest, IntermediateTensors -from vllm.worker.model_runner_base import (ModelRunnerBase, - ModelRunnerInputBase, - ModelRunnerWrapperBase) - -logger = init_logger(__name__) - -# A flag to enable debug prints for the updated input tensors -# before each step. -debug_advance_input = False -# A flag to allow GPU advance step for draft model runner. -# Set to False for debugging. -allow_gpu_advance_step = True - - -class TP1DraftModelRunner(ModelRunnerWrapperBase): - """Specialized model runner for speculative decoding draft model. - Since the draft model always execute k forward passes consecutively to - generate k speculative tokens in a single speculative decoding step, - we could get rid of most CPU-GPU synchronization and data transfer - overheads by keeping model input and output tensors on GPU all the time. - - TODOs: - 1. Currently supports only flash-attn, add support for other attn_backends. - 2. Support TP > 1 (this requires some designs because we do not expect - any broadcasting inside execute_model). 
- """ - - def __init__(self, model_runner: ModelRunnerBase): - super().__init__(model_runner) - - self.indices_of_seq_with_bonus_tokens = None - - def _update_sampling_metadata(self, sampling_metadata, num_seqs, - num_queries): - - assert sampling_metadata.num_prompts == 0 - assert len(sampling_metadata.seq_groups) == num_queries - assert sampling_metadata.selected_token_indices.shape == ( - num_queries, ) - # assert sampling_metadata.categorized_sample_indices == TODO: Add if needed # noqa: E501 - - # Verify that all sequences are decodes - for i in range(num_queries): - seq_group = sampling_metadata.seq_groups[i] - - assert seq_group.is_prompt is False # No prompt - assert seq_group.prompt_logprob_indices == [] # No prompt - assert seq_group.sample_indices == [i] # Simple - - def _gpu_advance_step(self, model_input: ModelRunnerInputBase, - last_output: SamplerOutput) -> ModelRunnerInputBase: - # Currently, we expect "decode mode" only - assert not model_input.is_prompt - - # Get num_seqs - num_seqs = len(model_input.seq_lens) - num_queries = len(model_input.query_lens) - - # Get output tokens GPU tensor - sampled_token_ids = last_output.sampled_token_ids - assert sampled_token_ids is not None - - # Update attn_metadata - attn_metadata = model_input.attn_metadata - assert isinstance(attn_metadata, FlashAttentionMetadata) - - attn_metadata.advance_step(model_input, sampled_token_ids, - self.block_size, num_seqs, num_queries) - - # Update sampling_metadata - sampling_metadata = model_input.sampling_metadata - self._update_sampling_metadata(sampling_metadata, num_seqs, - num_queries) - - # Create new input - new_model_input = self._model_input_cls( - input_tokens=model_input.input_tokens, - input_positions=model_input.input_positions, - attn_metadata=attn_metadata, - seq_lens=attn_metadata.seq_lens, - query_lens=model_input.query_lens, - lora_mapping=model_input.lora_mapping, - lora_requests=model_input.lora_requests, - multi_modal_kwargs=model_input.multi_modal_kwargs, - sampling_metadata=model_input.sampling_metadata, - is_prompt=False, - ) - - # Ensure we skip CPU samples - assert new_model_input.sampling_metadata.skip_sampler_cpu_output is True - # We can reuse sampling tensors since every decode iteration is the same - new_model_input.sampling_metadata.reuse_sampling_tensors = True - - if debug_advance_input: - logger.debug("NEW INPUT: ") - logger.debug(" input_tokens = %s", new_model_input.input_tokens) - logger.debug(" input_positions = %s", - new_model_input.input_positions) - logger.debug(" seq_lens = %d", new_model_input.seq_lens) - logger.debug(" query_lens = %d", new_model_input.query_lens) - logger.debug(" attn_metadata:") - logger.debug(" seq_lens_tensor: %s", - attn_metadata.seq_lens_tensor) - logger.debug(" slot_mapping: %s", attn_metadata.slot_mapping) - logger.debug(" block_tables: %s", attn_metadata.block_tables) - - return new_model_input - - def supports_gpu_multi_step(self, execute_model_req: ExecuteModelRequest): - """Determines if draft_model_runner GPU multi-step can be used. - Currently required conditions are: - 1. Only decodes - 2. Only flash-attn - 3. No LORA - 4. 
No prompt_adapter_config - """ - if not allow_gpu_advance_step: - return False - - # We allow multi-step GPU only in decode mode - for seq_group in execute_model_req.seq_group_metadata_list: - if seq_group.is_prompt: - return False - - # TODO: Add support for other attn backends - if self.attn_backend.get_name() not in ("FLASH_ATTN", ): - return False - - # TODO: Add support for LORA - if self.lora_config: - return False - - # TODO: Add soft-tuning prompt adapter support - return not self.prompt_adapter_config - - def set_indices_of_seq_with_bonus_tokens(self, - indices_of_seq_with_bonus_tokens): - self.indices_of_seq_with_bonus_tokens = indices_of_seq_with_bonus_tokens - - @torch.inference_mode() - def execute_model( - self, - model_input: ModelRunnerInputBase, - kv_caches: List[torch.Tensor], - previous_hidden_states: Optional[torch.Tensor] = None, - intermediate_tensors: Optional[IntermediateTensors] = None, - num_steps: int = 1, - **kwargs, - ) -> Optional[List[SamplerOutput]]: - """Executes num_steps forward passes with advacement of input tensors - on the GPU. Look at supports_gpu_multi_step(..) for pre-conditions. - - Optimizations used: - 1. Input tensors are updated on the GPU directly - 2. Skips GPU=>CPU serialization of sampler outputs (we don't need - them since we do batch expansion later that uses GPU outputs) - 3. Reuses sampling tensors (since we run only decodes and they have - a repeating sampling logic) - """ - - # When num_steps == 1, we execute the fallback here for the GPU - # advance_step, which runs prepare_inputs on CPU and for each spec - # iteration invokes this function only once - # (Look at multi-step-worker code) - is_fallback = num_steps == 1 - if not is_fallback: - # Since we do not broadcast data inside execute_model anymore, - # we need to figure out the best way to support TP > 1 in this - # case, because we will at least need to broadcast the sampled - # tokens to all workers. - if not self.is_driver_worker: - raise ValueError("TP1DraftModelRunner only supports TP=1.") - - # Sanity - if self.lora_config is not None: - raise ValueError("TP1DraftModelRunner has no support for LORA") - if self.prompt_adapter_config is not None: - raise ValueError("TP1DraftModelRunner has no support for " - "prompt_adapter_config") - if model_input.inputs_embeds is not None: - raise ValueError("TP1DraftModelRunner has no support for " - "inputs_embeds") - if model_input.multi_modal_kwargs: - raise ValueError( - "TP1DraftModelRunner has no support for multi_modal_kwargs" - ) - else: - if self.lora_config: - assert model_input.lora_requests is not None - assert model_input.lora_mapping is not None - self.set_active_loras(model_input.lora_requests, - model_input.lora_mapping) - - if self.prompt_adapter_config: - assert model_input.prompt_adapter_requests is not None - assert model_input.prompt_adapter_mapping is not None - self.set_active_prompt_adapters( - model_input.prompt_adapter_requests, - model_input.prompt_adapter_mapping) - - self.attn_state.begin_forward(model_input) - - # Detect exec mode - assert model_input.attn_metadata is not None - use_cuda_graph = False - if model_input.attn_metadata.num_prefills > 0: - # In this case, execute_model(..) was called directly - if num_steps > 1: - raise ValueError( - "execute_model(..) of draft_model_runner can be called " - "directly only with a single-step prefill") - else: - # We can skip CPU samples for spec token generation. 
- # (We do allow CPU samples for num_steps == 1 to support the - # fallback case, where supports_gpu_multi_step(..) does not pass) - model_input.sampling_metadata.skip_sampler_cpu_output = ( - not is_fallback) - - # Attn attr defines if we use cuda graphs - use_cuda_graph = model_input.attn_metadata.use_cuda_graph - - # Get model - if use_cuda_graph: - if model_input.inputs_embeds is None: - graph_batch_size = model_input.input_tokens.shape[0] - model_executable = ( - self.graph_runners[model_input.virtual_engine][( - graph_batch_size, False)]) - else: - graph_batch_size = model_input.inputs_embeds.shape[0] - model_executable = ( - self.graph_runners[model_input.virtual_engine][( - graph_batch_size, True)]) - - if previous_hidden_states is not None: - hidden_states = torch.cat([ - previous_hidden_states, - torch.empty([ - graph_batch_size - previous_hidden_states.shape[0], - *previous_hidden_states.shape[1:] - ], - dtype=previous_hidden_states.dtype, - device=previous_hidden_states.device) - ]) - else: - hidden_states = None - else: - model_executable = self.model - hidden_states = previous_hidden_states - - outputs: List[SamplerOutput] = [] - for step in range(num_steps): - multi_modal_kwargs = model_input.multi_modal_kwargs or {} - - model_execute_kwargs = {"previous_hidden_states": hidden_states} \ - if previous_hidden_states is not None else {} - - compute_logits_kwargs = {} - # Run model - if hasattr(self.model.config, "num_nextn_predict_layers"): - # for DeepSeek MTP only to use the corresponding layer for - # each step - spec_step_idx = kwargs.get("spec_step_idx", step) - model_execute_kwargs["spec_step_idx"] = spec_step_idx - compute_logits_kwargs["spec_step_idx"] = spec_step_idx - with set_forward_context(model_input.attn_metadata, - self.vllm_config): - hidden_states = model_executable( - input_ids=model_input.input_tokens, - inputs_embeds=None, - positions=model_input.input_positions, - intermediate_tensors=intermediate_tensors, - **MultiModalKwargs.as_kwargs( - multi_modal_kwargs, - device=self.device, - ), - **model_execute_kwargs, - ) - - # Compute the logits. - logits = self.model.compute_logits(hidden_states, - model_input.sampling_metadata, - **compute_logits_kwargs) - if not self.is_driver_worker: - return [] - # Sample the next token. - output = self.model_runner.sampler( - logits=logits, - sampling_metadata=model_input.sampling_metadata, - ) - outputs.append(output) - - if self.return_hidden_states and is_fallback: - if use_cuda_graph: - indices = model_input.sampling_metadata\ - .selected_token_indices - output.hidden_states = hidden_states[:len(indices)] - else: - output.hidden_states = hidden_states - - if model_input.attn_metadata.num_prefills == 0 \ - and self.indices_of_seq_with_bonus_tokens is not None: - assert output.sampled_token_ids is not None - # output.sampled_token_ids should be of shape (num_seqs, 1) - nums_seqs, num_tokens_per_seq = output.sampled_token_ids.shape - assert num_tokens_per_seq == 1 - count = 0 - for i in range(nums_seqs): - bonus_seq_idx = self.indices_of_seq_with_bonus_tokens[ - count] - if i != bonus_seq_idx: - # The following might cause a cpu->gpu sync - # However, the performance impact is negligible as we - # benchmarked on H100. 
- output.sampled_token_ids[ - i, :] = model_input.input_tokens[bonus_seq_idx] - else: - count += 1 - - # Prepare inputs for the next step - if step != num_steps - 1: - model_input = self._gpu_advance_step(model_input, outputs[-1]) - - return outputs diff --git a/vllm/spec_decode/interfaces.py b/vllm/spec_decode/interfaces.py deleted file mode 100644 index 70ec1590e7ad0..0000000000000 --- a/vllm/spec_decode/interfaces.py +++ /dev/null @@ -1,99 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -from abc import ABC, abstractmethod -from dataclasses import dataclass -from typing import List, Optional, Set, Union - -import torch - -from vllm.sequence import ExecuteModelRequest, PromptLogprobs -from vllm.worker.worker_base import WorkerBase - - -@dataclass -class SpeculativeProposals: - """Datastructure used to represent proposal tokens from some proposer. It - also tracks how many speculative tokens each sequence has. - """ - - # Speculative proposal tokens. - proposal_token_ids: torch.Tensor - - # Probabilities of the proposal tokens according to the proposer. - proposal_probs: torch.Tensor - - # The valid length of each proposal; can be zero. - proposal_lens: torch.Tensor - - # A flag to mark that there's no available proposals - no_proposals: bool = False - - def __repr__(self): - return (f"SpeculativeProposals(" - f"proposal_token_ids={self.proposal_token_ids}, " - f"proposal_probs={self.proposal_probs.shape}, " - f"proposal_lens={self.proposal_lens})") - - -@dataclass -class SpeculativeScores: - """Datastructure used to represent the scores of speculative tokens - according to the scoring model. - """ - - # Probabilities of the speculative tokens according to the scoring model. - probs: torch.Tensor - - # Log-probabilities of the speculative tokens according to the scoring - # model. These values can be used to generate Logprob objects that are - # returned to the user. - logprobs: torch.Tensor - - # Token ids sampled from the scoring model. Used for speculative bonus - # tokens and also non-speculative normal decoding. - token_ids: torch.Tensor - - # Optional last hidden states from the scoring model. - hidden_states: Optional[torch.Tensor] = None - - # Scoring model may also return logprobs for prompt tokens - # for each request, when chunked prefill is enabled. - prompt_logprobs: Optional[List[PromptLogprobs]] = None - - def __repr__(self): - return (f"SpeculativeScores(" - f"probs={self.probs.shape}, " - f"token_ids={self.token_ids.shape})") - - -class SpeculativeProposer(ABC): - - @abstractmethod - def get_spec_proposals( - self, - execute_model_req: ExecuteModelRequest, - # If set, this contains all sequence IDs that were assigned - # bonus tokens in their last forward pass. 
- seq_ids_with_bonus_token_in_last_step: Set[int], - ) -> SpeculativeProposals: - raise NotImplementedError - - -class SpeculativeScorer(ABC): - - def __init__(self, scorer_worker: WorkerBase, - device: Union[torch.device, str], vocab_size: int): - self._scorer_worker = scorer_worker - if isinstance(device, torch.device): - device = device.type - self._device = device - self._vocab_size = vocab_size - - @abstractmethod - def score_proposals( - self, - execute_model_req: ExecuteModelRequest, - proposals: SpeculativeProposals, - ) -> SpeculativeScores: - raise NotImplementedError diff --git a/vllm/spec_decode/medusa_worker.py b/vllm/spec_decode/medusa_worker.py deleted file mode 100644 index 82b5a79fa7cb9..0000000000000 --- a/vllm/spec_decode/medusa_worker.py +++ /dev/null @@ -1,138 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -import weakref -from typing import List, Optional, Set, Tuple - -import torch - -from vllm.model_executor import SamplingMetadata -from vllm.model_executor.layers.sampler import SamplerOutput -from vllm.sequence import ExecuteModelRequest, SequenceGroupMetadata -from vllm.spec_decode.interfaces import SpeculativeProposals -from vllm.spec_decode.proposer_worker_base import NonLLMProposerWorkerBase -from vllm.spec_decode.top1_proposer import Top1Proposer -from vllm.worker.worker_base import DelegateWorkerBase - - -class MedusaWorker(NonLLMProposerWorkerBase, DelegateWorkerBase): - """Worker for Medusa. - """ - - def __init__(self, *args, **kwargs): - DelegateWorkerBase.__init__(self, *args, **kwargs) - # Lazy initialization list. - self._proposer: Top1Proposer - - def init_device(self): - self.worker.init_device() - - self._proposer = Top1Proposer( - weakref.proxy(self), # type: ignore[arg-type] - self.device, - self.vocab_size, - max_proposal_len=self.max_model_len, - ) - - def set_include_gpu_probs_tensor(self): - pass - - def set_should_modify_greedy_probs_inplace(self): - pass - - @torch.inference_mode() - def sampler_output( - self, - execute_model_req: ExecuteModelRequest, - sample_len: int, - # Unused parameter. - seq_ids_with_bonus_token_in_last_step: Set[int], - ) -> Tuple[List[SamplerOutput], bool]: - """Run the model forward pass to generate sample_len future tokens. - Returns the list of sampler output, one per layer, along with indicator - of whether torch tensor in sampler output need to be transposed in - latter sampler_output_to_torch logic. - - For medusa worker, this indicator shall be False. - """ - self._raise_if_unsupported(execute_model_req) - - seq_group_metadata_list = execute_model_req.seq_group_metadata_list - - seq_lens, query_lens = self._prepare_input_tensors( - seq_group_metadata_list) - - generators = self.model_runner.get_generators( - execute_model_req.finished_requests_ids) - sampling_metadata = SamplingMetadata.prepare( - seq_group_metadata_list, seq_lens, query_lens, self.device, - self.model_runner.pin_memory, generators) - - model_outputs = self.model_runner.model.generate_proposals( - previous_hidden_states=execute_model_req.previous_hidden_states. 
- hidden_states, - sampling_metadata=sampling_metadata) - - return model_outputs, False - - def _prepare_input_tensors( - self, - seq_group_metadata_list: Optional[List[SequenceGroupMetadata]], - ) -> Tuple[List[int], List[int]]: - if not seq_group_metadata_list: - return [], [] - - seq_lens: List[int] = [] - query_lens: List[int] = [] - - for seq_group_metadata in seq_group_metadata_list: - is_prompt = seq_group_metadata.is_prompt - - for seq_data in seq_group_metadata.seq_data.values(): - seq_data_len = seq_data.get_len() - if is_prompt: - context_len = seq_data.get_num_computed_tokens() - seq_len = min( - seq_data_len, - context_len + seq_group_metadata.token_chunk_size) - seq_lens.append(seq_len) - query_lens.append(seq_len - context_len) - else: - seq_lens.append(seq_data_len) - query_lens.append(1) - - return seq_lens, query_lens - - def get_spec_proposals( - self, - execute_model_req: ExecuteModelRequest, - seq_ids_with_bonus_token_in_last_step: Set[int], - ) -> SpeculativeProposals: - """Produce speculations given an input batch of sequences. The number of - speculative tokens per sequence is determined by max_proposal_len. - """ - - return self._proposer.get_spec_proposals( - execute_model_req, seq_ids_with_bonus_token_in_last_step) - - def _raise_if_unsupported( - self, - execute_model_req: ExecuteModelRequest, - ) -> None: - """MedusaWorker does not yet implement support for cache swap - operations or beam search. - """ - if any([ - execute_model_req.blocks_to_swap_in, - execute_model_req.blocks_to_swap_out, - execute_model_req.blocks_to_copy - ]): - raise NotImplementedError( - "MedusaWorker does not support cache operations") - - if any( - len(seq_group_metadata.seq_data.keys()) != 1 - for seq_group_metadata in - execute_model_req.seq_group_metadata_list): - raise NotImplementedError( - "MedusaWorker does not support beam search.") diff --git a/vllm/spec_decode/metrics.py b/vllm/spec_decode/metrics.py deleted file mode 100644 index a4784cad962d0..0000000000000 --- a/vllm/spec_decode/metrics.py +++ /dev/null @@ -1,213 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -import time -from typing import Callable, Optional, Union - -import msgspec -import torch - -from vllm.model_executor.layers.spec_decode_base_sampler import ( - SpecDecodeBaseSampler) -from vllm.platforms import current_platform -from vllm.utils import is_pin_memory_available - - -class SpecDecodeWorkerMetrics( - msgspec.Struct, - omit_defaults=True, # type: ignore[call-arg] - array_like=True): # type: ignore[call-arg] - """Dataclass holding metrics emitted from the spec decode worker. - """ - - # The empirical acceptance rate of the proposal method on a per-token basis. - # This is useful for evaluating how well the proposal method aligns with the - # scoring method. - draft_acceptance_rate: float - - # The empirical efficiency, measured as the number of tokens emitted by the - # system divided by the number of tokens that could be emitted by the system - # if the proposal method were perfect. - system_efficiency: float - - # The number of speculative tokens produced by the proposal method. - draft_tokens: int - - # The number of tokens emitted by the entire system. - emitted_tokens: int - - # The number of tokens accepted by the scoring model and verification - # routine, e.g. Llama2-70B and lossless rejection sampling. 
- # - # NOTE: Any token accepted by the verification routine is considered - # accepted (regardless of if the speculative prefix is also accepted). The - # user will usually see less accepted tokens. This metric is helpful when - # evaluating alignment of the proposal method with the scoring model. - accepted_tokens: int - - # The number of speculative tokens per sequence. - num_spec_tokens: int - - -Timer = Callable[[], float] - - -class AsyncMetricsCollector: - """Class which copies rejection/typical-acceptance sampler metrics - from the device to CPU on a non-default Torch stream. - """ - - def __init__(self, - spec_decode_sampler: SpecDecodeBaseSampler, - timer: Optional[Timer] = None, - collect_interval_s: float = 5.0): - self.spec_decode_sampler = spec_decode_sampler - self._timer = time.time if timer is None else timer - - self._rank: Optional[int] = None - - # We don't have a device set yet. - self._copy_stream: Optional[torch.cuda.Stream] = None - - self._in_flight_copy: Optional[torch.cuda.Event] = None - - pin_memory = is_pin_memory_available() - self._aggregate_num_accepted_tokens = torch.tensor( - 0, dtype=torch.long, device="cpu", pin_memory=pin_memory) - self._aggregate_num_emitted_tokens = torch.tensor( - 0, dtype=torch.long, device="cpu", pin_memory=pin_memory) - self._aggregate_num_draft_tokens = 0 - - self._rejsample_metrics_collect_interval_s = collect_interval_s - self._last_metrics_collect_time = self._timer() - - def init_gpu_tensors(self, rank: int) -> None: - self._rank = rank - self._copy_stream = torch.cuda.Stream() - - def init_tensors(self, - rank: int, - device_type: Union[torch.device, str] = 'cuda') -> None: - self._rank = rank - if isinstance(device_type, torch.device): - device_type = device_type.type - stream = current_platform.Stream - if stream is not None: - self._copy_stream = stream() - - def maybe_collect_rejsample_metrics( - self, k: int) -> Optional[SpecDecodeWorkerMetrics]: - # Skip for any platform that doesn't have device Event - if current_platform.Event is None: - return None - - # If a copy was initiated in the previous call, collect and return. - if self._in_flight_copy is not None: - ready_event = self._in_flight_copy - self._in_flight_copy = None - return self._collect_rejsample_metrics(k, ready_event) - - # Otherwise, check if we should start a new copy. - if self._should_collect_rejsample_metrics(self._timer()): - assert self._in_flight_copy is None - self._in_flight_copy = self._copy_rejsample_metrics_async() - - return None - - def _should_collect_rejsample_metrics(self, now: float) -> bool: - """Return whether or not this iteration should print sampling - metrics. - """ - if self._rank != 0: - return False - - return now - self._last_metrics_collect_time >= self._rejsample_metrics_collect_interval_s # noqa: E501 - - def _copy_rejsample_metrics_async(self) -> torch.cuda.Event: - """Copy rejection/typical-acceptance sampling metrics - (number of accepted tokens, etc) to CPU asynchronously. - - Returns a device event recording when the copy is complete. - """ - assert self._copy_stream is not None - self._copy_stream.wait_stream(current_platform.current_stream()) - - with current_platform.stream(self._copy_stream): - self._aggregate_num_accepted_tokens.copy_( - self.spec_decode_sampler.num_accepted_tokens, - non_blocking=True) - self._aggregate_num_emitted_tokens.copy_( - self.spec_decode_sampler.num_emitted_tokens, non_blocking=True) - # Number of draft tokens is calculated on CPU, so no copy is - # required. 
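# A worked example of the metrics defined above, using made-up numbers; it
# mirrors the formulas in _collect_rejsample_metrics and
# get_max_num_emitted_tokens further down.
k = 3                       # speculative tokens proposed per sequence
draft_tokens = 12           # 4 sequences * k proposals
accepted_tokens = 9
emitted_tokens = 11         # accepted tokens plus bonus tokens

max_emitted = (draft_tokens // k) * (k + 1)              # 4 * 4 = 16
draft_acceptance_rate = accepted_tokens / draft_tokens   # 0.75
system_efficiency = emitted_tokens / max_emitted          # 0.6875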
- self._aggregate_num_draft_tokens = ( - self.spec_decode_sampler.num_draft_tokens) - - aggregate_metrics_ready = current_platform.Event() - aggregate_metrics_ready.record(self._copy_stream) - - return aggregate_metrics_ready - - def _collect_rejsample_metrics( - self, k: int, - ready_event: torch.cuda.Event) -> SpecDecodeWorkerMetrics: - """Create metrics object from statistics copied asynchronously. - - Args: - k: int. The number of speculative tokens; used to determine system - efficiency. - ready_event: torch.cuda.Event. The CUDA event recording when the - async GPU->CPU copy is complete. - """ - - ready_event.synchronize() - - # update time of last collection - self._last_metrics_collect_time = self._timer() - - accepted_tokens = self._aggregate_num_accepted_tokens.item() - emitted_tokens = self._aggregate_num_emitted_tokens.item() - draft_tokens = self._aggregate_num_draft_tokens - - max_num_emitted_tokens = self.get_max_num_emitted_tokens( - draft_tokens, k) - - if draft_tokens > 0: - draft_acceptance_rate = accepted_tokens / draft_tokens - else: - draft_acceptance_rate = float("nan") - - if max_num_emitted_tokens > 0: - system_efficiency = emitted_tokens / max_num_emitted_tokens - else: - system_efficiency = float("nan") - - return SpecDecodeWorkerMetrics( - num_spec_tokens=k, - draft_acceptance_rate=draft_acceptance_rate, - system_efficiency=system_efficiency, - accepted_tokens=accepted_tokens, - draft_tokens=draft_tokens, - emitted_tokens=emitted_tokens, - ) - - @staticmethod - def get_max_num_emitted_tokens(draft_tokens: int, k: int) -> int: - """Calculate the number of emitted tokens, assuming all tokens are - accepted. - - This is equal to the number of sequences that have been speculated on, - times (speculation len + 1). The +1 comes from the bonus token. - """ - # Determine the number of sequences that have been speculated on. Since - # the batch size can be variable, we divide by k. - assert draft_tokens % k == 0 - total_num_spec_seqs = draft_tokens // k - - # A single sequence may emit k accepted tokens and one bonus token in - # the best case. - num_emitted_per_seq_if_all_accepted = k + 1 - - # The max num of emitted tokens is the number of speculated sequences - # times the max emitted per seq. - return total_num_spec_seqs * num_emitted_per_seq_if_all_accepted diff --git a/vllm/spec_decode/mlp_speculator_worker.py b/vllm/spec_decode/mlp_speculator_worker.py deleted file mode 100644 index 8e8c05d26361b..0000000000000 --- a/vllm/spec_decode/mlp_speculator_worker.py +++ /dev/null @@ -1,94 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -from typing import List, Optional, Set, Tuple - -import torch - -from vllm.model_executor import SamplingMetadata -from vllm.model_executor.layers.sampler import SamplerOutput -from vllm.sequence import ExecuteModelRequest, SequenceGroupMetadata -from vllm.spec_decode.multi_step_worker import MultiStepWorker -from vllm.spec_decode.proposer_worker_base import NonLLMProposerWorkerBase - - -class MLPSpeculatorWorker(NonLLMProposerWorkerBase, MultiStepWorker): - """Worker for MLPSpeculator models. - - Not currently compatible with LoRA or chunked prefill. - """ - - @torch.inference_mode() - def sampler_output( - self, - execute_model_req: ExecuteModelRequest, - sample_len: int, - # Unused parameter. MLPSpeculatorWorker does not use the KV Cache and - # therefore does not need this parameter. 
- seq_ids_with_bonus_token_in_last_step: Set[int], - ) -> Tuple[List[SamplerOutput], bool]: - """Run the model forward pass to generate sample_len future tokens. - Returns the list of sampler output, one per layer, along with indicator - of whether torch tensor in sampler output need to be transposed in - latter sampler_output_to_torch logic. - - For mlp spec worker, this indicator shall be True. - """ - self._raise_if_unsupported(execute_model_req) - - seq_group_metadata_list = execute_model_req.seq_group_metadata_list - - (input_tokens, seq_lens, - query_lens) = self._prepare_input_tensors(seq_group_metadata_list) - - generators = self.model_runner.get_generators( - execute_model_req.finished_requests_ids) - sampling_metadata = SamplingMetadata.prepare( - seq_group_metadata_list, seq_lens, query_lens, self.device, - self.model_runner.pin_memory, generators) - - model_outputs = self.model_runner.model.generate_proposals( - input_ids=input_tokens, - previous_hidden_states=execute_model_req.previous_hidden_states. - hidden_states, - num_predict_tokens=sample_len, - sampling_metadata=sampling_metadata) - - assert len(model_outputs) == sample_len - - return model_outputs, True - - def _prepare_input_tensors( - self, - seq_group_metadata_list: Optional[List[SequenceGroupMetadata]], - ) -> Tuple[torch.Tensor, List[int], List[int]]: - if not seq_group_metadata_list: - return torch.empty(0, device=self.device), [], [] - - input_tokens: List[int] = [] - seq_lens: List[int] = [] - query_lens: List[int] = [] - - for seq_group_metadata in seq_group_metadata_list: - is_prompt = seq_group_metadata.is_prompt - - for seq_data in seq_group_metadata.seq_data.values(): - seq_data_len = seq_data.get_len() - if is_prompt: - context_len = seq_data.get_num_computed_tokens() - seq_len = min( - seq_data_len, - context_len + seq_group_metadata.token_chunk_size) - tokens = seq_data.get_token_ids()[context_len:seq_len] - seq_lens.append(seq_len) - input_tokens.extend(tokens) - query_lens.append(seq_len - context_len) - else: - seq_lens.append(seq_data_len) - input_tokens.append(seq_data.get_last_token_id()) - query_lens.append(1) - - input_tokens_tensor = torch.tensor(input_tokens, - dtype=torch.long, - device=self.device) - return input_tokens_tensor, seq_lens, query_lens diff --git a/vllm/spec_decode/mqa_scorer.py b/vllm/spec_decode/mqa_scorer.py deleted file mode 100644 index 18e7b055a6782..0000000000000 --- a/vllm/spec_decode/mqa_scorer.py +++ /dev/null @@ -1,160 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -from vllm.sequence import (ExecuteModelRequest, SequenceData, - SequenceGroupMetadata, get_all_seq_ids) -from vllm.spec_decode.interfaces import (SpeculativeProposals, - SpeculativeScorer, SpeculativeScores) - -SeqId = int -TargetSeqId = int - - -class MQAScorer(SpeculativeScorer): - - def score_proposals( - self, - execute_model_req: ExecuteModelRequest, - proposals: SpeculativeProposals, - ) -> SpeculativeScores: - target_seq_group_metadata_list = [] - target_seq_id_start = max( - get_all_seq_ids(execute_model_req.seq_group_metadata_list)) + 1 - all_proposal_tokens = proposals.proposal_token_ids.tolist() - all_proposal_lengths = proposals.proposal_lens.tolist() - for i, seq_group_metadata in enumerate( - execute_model_req.seq_group_metadata_list): - if all_proposal_lengths[i] == 0: - # Keep prompt seqs untouched (keep computed_tokens for chunks). 
- target_seq_group_metadata_list.append(seq_group_metadata) - continue - - seq_data_dict = seq_group_metadata.seq_data - assert len(seq_data_dict) == 1 - seq_id = next(iter(seq_data_dict.keys())) - - seq_data: SequenceData = seq_data_dict[seq_id] - prompt_token_ids = seq_data.get_prompt_token_ids() - output_token_ids = seq_data.get_output_token_ids() - proposal_token_ids = all_proposal_tokens[ - i][:all_proposal_lengths[i]] - new_output_token_ids = [*output_token_ids, *proposal_token_ids] - - target_seq_id = target_seq_id_start + i - new_seq_data = SequenceData.from_seqs( - prompt_token_ids=prompt_token_ids, - output_token_ids=new_output_token_ids, - ) - new_seq_data.update_num_computed_tokens( - len(prompt_token_ids) + len(output_token_ids) - 1) - - # Ensure that the new decode sequence has at least one token. - assert len(output_token_ids) >= 1 - new_seq_data_dict = {target_seq_id: new_seq_data} - - new_seq_group_metadata = SequenceGroupMetadata( - request_id=seq_group_metadata.request_id, - is_prompt=seq_group_metadata.is_prompt, - seq_data=new_seq_data_dict, - sampling_params=seq_group_metadata.sampling_params, - block_tables={ - target_seq_id: seq_group_metadata.block_tables[seq_id], - }, - lora_request=None, - ) - target_seq_group_metadata_list.append(new_seq_group_metadata) - - target_sampler_output = self._scorer_worker.execute_model( - execute_model_req=execute_model_req.clone( - seq_group_metadata_list=target_seq_group_metadata_list)) - - target_sampler_output = target_sampler_output[0] - - k = execute_model_req.num_lookahead_slots - bs = len(execute_model_req.seq_group_metadata_list) - target_token_ids = target_sampler_output.sampled_token_ids - target_probs = target_sampler_output.sampled_token_probs - target_logprobs = target_sampler_output.logprobs - prompt_logprobs = None - - # If all requests have the same number of query tokens, we can avoid - # the for loop to build output for better performance. - if min(all_proposal_lengths) == k: - # Regular decodes only. - assert all(not sg.is_prompt - for sg in target_seq_group_metadata_list - if sg.is_prompt) - bs, _ = proposals.proposal_token_ids.shape - all_tokens = target_token_ids.reshape(bs, k + 1) - all_probs = target_probs.reshape(bs, k + 1, self._vocab_size) - all_logprobs = target_logprobs.reshape(bs, k + 1, self._vocab_size) - else: - # We either have decodes with different lens or prefill+decodes. - all_tokens = target_token_ids.new_full(size=(bs, k + 1), - fill_value=-1) - all_probs = target_probs.new_zeros(*all_tokens.shape, - self._vocab_size) - all_logprobs = target_logprobs.new_full(size=all_probs.shape, - fill_value=-float("inf")) - target_token_ids = target_token_ids.flatten() - - # When prompt logprobs is enabled, lens of returned tensors go from - # n_sampled (requests with do_sample=True) to n_prompt+n_prefills. - # We adjust stride accordingly to get the generated tokens and - # their probs, but pass on prompt_logprobs as is, since it may be - # that n_prompts >> K. - has_prompt_log = any((sg.sampling_params.prompt_logprobs - and sg.sampling_params.prompt_logprobs > 0) - for sg in target_seq_group_metadata_list) - # TODO (NickLucche) we should surface `disable_logprobs` as to not - # break abstraction to get its value. - if (not self._scorer_worker.model_runner.disable_logprobs\ - and has_prompt_log): - prompt_logprobs = [ - o.prompt_logprobs for o in target_sampler_output.outputs - ] - - # Split loop into prefill|decode for readability. 
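# A toy version of the packing that the two loops below perform, with
# made-up token values: per-request outputs of different lengths are copied
# into a right-padded [bs, k + 1] tensor (proposal_len + 1 tokens each).
import torch

k = 3
proposal_lens_toy = [3, 1]                     # speculation length per request
out_lens = [n + 1 for n in proposal_lens_toy]  # +1 bonus token each
flat = torch.tensor([11, 12, 13, 14, 21, 22])  # concatenated sampled tokens

padded = flat.new_full((len(out_lens), k + 1), -1)
start = 0
for row, n in enumerate(out_lens):
    padded[row, :n] = flat[start:start + n]
    start += n
# padded == [[11, 12, 13, 14], [21, 22, -1, -1]]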
- start_loc, i = 0, 0 - while i < len(target_seq_group_metadata_list - ) and target_seq_group_metadata_list[i].is_prompt: - seq_meta = target_seq_group_metadata_list[i] - end_loc = start_loc - if has_prompt_log: - end_loc += seq_meta.token_chunk_size - elif seq_meta.do_sample: - end_loc += 1 - - # Skip chunks with no output tokens. - if seq_meta.do_sample: - # Get sampled token (last position in chunk) and its prob. - all_tokens[i, 0] = target_token_ids[end_loc - 1] - all_probs[i, 0] = target_probs[end_loc - 1] - all_logprobs[i, 0] = target_logprobs[end_loc - 1] - - i += 1 - start_loc = end_loc - # Decodes. - while i < len(target_seq_group_metadata_list): - proposed_len, seq_meta = all_proposal_lengths[ - i], target_seq_group_metadata_list[i] - output_len = proposed_len + 1 - end_loc = start_loc + output_len - all_tokens[ - i, :output_len] = target_token_ids[start_loc:end_loc] - all_probs[i, :output_len] = target_probs[start_loc:end_loc] - all_logprobs[ - i, :output_len] = target_logprobs[start_loc:end_loc] - start_loc = end_loc - i += 1 - - hidden_states = None - if target_sampler_output.hidden_states is not None: - hidden_states = target_sampler_output.hidden_states.reshape( - bs, (k + 1), -1) - - return SpeculativeScores(probs=all_probs, - token_ids=all_tokens, - logprobs=all_logprobs, - hidden_states=hidden_states, - prompt_logprobs=prompt_logprobs) diff --git a/vllm/spec_decode/multi_step_worker.py b/vllm/spec_decode/multi_step_worker.py deleted file mode 100644 index 4a9bbe44d89a0..0000000000000 --- a/vllm/spec_decode/multi_step_worker.py +++ /dev/null @@ -1,423 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -import copy -import weakref -from typing import Dict, List, Set, Tuple - -import torch - -from vllm.model_executor.layers.sampler import SamplerOutput -from vllm.model_executor.model_loader.weight_utils import default_weight_loader -from vllm.platforms import current_platform -from vllm.sequence import (ExecuteModelRequest, HiddenStates, SequenceData, - SequenceGroupMetadata) - -if current_platform.is_cuda_alike(): - from vllm.spec_decode.draft_model_runner import TP1DraftModelRunner - -from vllm.spec_decode.interfaces import (SpeculativeProposals, - SpeculativeProposer) -from vllm.spec_decode.proposer_worker_base import ProposerWorkerBase -from vllm.spec_decode.top1_proposer import Top1Proposer -from vllm.worker.worker_base import DelegateWorkerBase - - -class MultiStepWorker(ProposerWorkerBase, DelegateWorkerBase): - """The MultiStepWorker is equivalent to a Worker except that it allows - multiple forward passes in a single call, assuming the scheduler has - allocated enough space to store the additional KV. This reduces overhead - by invoking the scheduler less. - - The MultiStepWorker does not support cache swap operations, or beam search. - Cache swap operations do not require large modifications. On the other hand, - beam search requires memory allocations during sequence forks and thus - requires more thought for MultiStepWorker support. - """ - - def __init__(self, *args, **kwargs): - DelegateWorkerBase.__init__(self, *args, **kwargs) - # Lazy initialization list. 
- self._proposer: SpeculativeProposer - - def init_device(self) -> None: - self.worker.init_device() - self._proposer = Top1Proposer( - weakref.proxy(self), # type: ignore[arg-type] - self.device, - self.vocab_size, - max_proposal_len=self.max_model_len, - ) - - def set_include_gpu_probs_tensor(self) -> None: - # Need include_gpu_probs_tensor for MultiStepWorker - self.model_runner.sampler.include_gpu_probs_tensor = True - if hasattr(self.model_runner.model, "sampler"): - (self.model_runner.model.sampler.include_gpu_probs_tensor) = True - - def set_should_modify_greedy_probs_inplace(self) -> None: - self.model_runner.sampler.should_modify_greedy_probs_inplace = True - if hasattr(self.model_runner.model, "sampler"): - (self.model_runner.model.sampler.should_modify_greedy_probs_inplace - ) = True - - @torch.inference_mode() - def sampler_output( - self, - execute_model_req: ExecuteModelRequest, - sample_len: int, - seq_ids_with_bonus_token_in_last_step: Set[int], - ) -> Tuple[List[SamplerOutput], bool]: - """Run the model forward pass sample_len times. Returns the list of - sampler output, one per model forward pass, along with indicator of - whether torch tensor in sampler output need to be transposed in latter - sampler_output_to_torch logic. - - For multi step worker, this indicator shall be True. - """ - self._raise_if_unsupported(execute_model_req) - # Expand the batch for sequences with a bonus token. - # Perform a forward pass on the expanded batch and filter the - # response to retain only the original sequences' responses. - expanded_request, indices_of_seq_with_bonus_tokens =\ - self._expand_execute_model_request( - execute_model_req, seq_ids_with_bonus_token_in_last_step) - - # Run model sample_len times. - model_outputs: List[SamplerOutput] = [] - if current_platform.is_cuda_alike() and isinstance( - self.model_runner, TP1DraftModelRunner - ) and self.model_runner.supports_gpu_multi_step(expanded_request): - # Here we run the draft_model_runner with multi-step prepare - # on the GPU directly - expanded_request.num_steps = sample_len - self.model_runner.set_indices_of_seq_with_bonus_tokens( - indices_of_seq_with_bonus_tokens) - model_outputs = self.execute_model( - execute_model_req=expanded_request) - else: - # Here we run multi-step directly, with every step prepared - # on the CPU. - # TODO: Remove this branch once DraftModelRunner supports TP>1 - # and other restrictions that are part of DraftModelRunner's - # supports_gpu_multi_step(..) 
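# Toy illustration of the CPU multi-step drafting loop below: each pass
# samples one token per sequence and appends it before the next pass. The
# model and sampler are stubbed with a made-up callable, not the real runner.
def fake_forward(seqs):                 # stand-in for one draft forward pass
    return [s[-1] + 1 for s in seqs]    # "predict" last token + 1

seqs = [[3], [10]]
sample_len = 3
draft_steps = []
for _ in range(sample_len):
    sampled = fake_forward(seqs)
    for s, t in zip(seqs, sampled):
        s.append(t)
    draft_steps.append(sampled)
# draft_steps == [[4, 11], [5, 12], [6, 13]]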
- if expanded_request.previous_hidden_states is not None: - self.worker.model_runner.return_hidden_states = True - for _ in range(sample_len): - model_output: List[SamplerOutput] = self.worker.execute_model( - execute_model_req=expanded_request) - assert (len(model_output) == 1 - ), "composing multistep workers not supported" - model_output = model_output[0] - self._maybe_update_previous_hidden_states( - model_output, expanded_request) - - self._append_new_tokens( - model_output, expanded_request.seq_group_metadata_list, - indices_of_seq_with_bonus_tokens) - model_outputs.append(model_output) - - # move indices to device to avoid stream sync - indices_of_seq_with_bonus_tokens = torch.tensor( - indices_of_seq_with_bonus_tokens, device=self.device) - filtered_model_outputs = self._filter_model_output( - model_outputs, indices_of_seq_with_bonus_tokens) - return filtered_model_outputs, True - - @staticmethod - def _maybe_update_previous_hidden_states( - model_output: SamplerOutput, - expanded_request: ExecuteModelRequest) -> None: - """ - Updates the previous hidden states in an expanded request - in-place with the hidden states from the model output. - """ - if expanded_request.previous_hidden_states is not None: - expanded_request.previous_hidden_states = HiddenStates( - model_output.hidden_states, - expanded_request.seq_group_metadata_list) - - @staticmethod - def _expand_execute_model_request( - execute_model_req: ExecuteModelRequest, - seq_with_bonus_token_in_last_step: set, - ) -> Tuple[ExecuteModelRequest, List[int]]: - """ - Expands the execute model request based on sequences with bonus - tokens. - - For each sequence with a bonus token, this method creates a new - sequence without the bonus token and adds it to the execute model - request. The original sequence groups are also retained. The indices - of the original sequence groups are returned for further processing. - - Args: - execute_model_req (ExecuteModelRequest): The original execute - model request. - seq_with_bonus_token_in_last_step (set): Set of sequence IDs that - contain bonus tokens. - - Returns: - Tuple[ExecuteModelRequest, List[int]]: The updated execute model - request with expanded sequences and a list of indices corresponding - to the original sequence groups. - """ - updated_seq_group_metadata_list: List[SequenceGroupMetadata] = [] - updated_execute_model_req = execute_model_req.clone( - updated_seq_group_metadata_list) - indices_of_original_sequence_groups = [] - for seq_group in execute_model_req.seq_group_metadata_list: - seq_group_has_bonus_tokens = False - for seq_id, _ in seq_group.seq_data.items(): - # Identify sequences with bonus tokens in the sequence group. - if seq_id in seq_with_bonus_token_in_last_step: - seq_group_has_bonus_tokens = True - break - if seq_group_has_bonus_tokens: - #Create new sequences without the last bonus token. These new - # sequence have the same sequence id as the original sequence. - # We create a new sequence group and add them there. - updated_seq_group_without_bonus_token = \ - MultiStepWorker._copy_seq_metadata_excluding_last_token( - seq_group, seq_with_bonus_token_in_last_step) - updated_seq_group_metadata_list.append( - updated_seq_group_without_bonus_token) - # Add the original sequence group. - updated_seq_group_metadata_list.append( - MultiStepWorker._shallow_copy_seq_group_metadata(seq_group)) - # Record the index of the original sequence group. 
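# A list-based toy of the expand-then-filter pattern implemented here and in
# _filter_model_output below; sequence ids and token values are made up.
# Sequences whose last step produced a bonus token get an extra entry without
# that token, and only the original entries are kept afterwards.
seqs = {0: [5, 6, 7], 1: [8, 9]}     # seq_id -> output token ids
seq_ids_with_bonus = {0}

expanded = []
original_indices = []
for seq_id, tokens in seqs.items():
    if seq_id in seq_ids_with_bonus:
        expanded.append((seq_id, tokens[:-1]))   # copy without the bonus token
    expanded.append((seq_id, tokens))            # the original sequence
    original_indices.append(len(expanded) - 1)

model_outputs = [f"out-{i}" for i in range(len(expanded))]  # stand-in outputs
kept = [model_outputs[i] for i in original_indices]          # filter step
# expanded has 3 entries, kept has 2 (one per original sequence).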
- indices_of_original_sequence_groups.append( - len(updated_seq_group_metadata_list) - 1) - - updated_execute_model_req.seq_group_metadata_list =\ - updated_seq_group_metadata_list - - if isinstance(updated_execute_model_req.previous_hidden_states, - HiddenStates): - updated_execute_model_req.previous_hidden_states\ - .expand_with_bonus_tokens(seq_with_bonus_token_in_last_step) - - return updated_execute_model_req, indices_of_original_sequence_groups - - @staticmethod - def _filter_model_output( - expanded_batch_outputs: List[SamplerOutput], - output_indices_to_retain: torch.Tensor) -> List[SamplerOutput]: - """ - Filters the model output to include only the specified sequence - outputs. This method contracts the expanded batch output from the - model to retain the outputs of only those sequences indicated by the - provided indices. - - Args: - expanded_batch_output (List[SamplerOutput]): The expanded output - batch from the model. - output_indices_to_retain (torch.Tensor): Indices of the model - outputs to retain. - - Returns: - List[SamplerOutput]: A list containing the filtered model - outputs for the specified indices. - """ - return [ - SamplerOutput( - outputs=[ - expanded_batch_output.outputs[i] - for i in output_indices_to_retain - ] if len(expanded_batch_output.outputs) > 0 else [], - sampled_token_probs=( - expanded_batch_output. - sampled_token_probs[output_indices_to_retain] - if expanded_batch_output.sampled_token_probs is not None - else None), - logprobs=( - expanded_batch_output.logprobs[output_indices_to_retain] - if expanded_batch_output.logprobs is not None else None), - sampled_token_ids=(expanded_batch_output. - sampled_token_ids[output_indices_to_retain] - if expanded_batch_output.sampled_token_ids - is not None else None)) - for expanded_batch_output in expanded_batch_outputs - ] - - def get_spec_proposals( - self, - execute_model_req: ExecuteModelRequest, - seq_ids_with_bonus_token_in_last_step: set, - ) -> SpeculativeProposals: - """Produce speculations given an input batch of sequences. The number of - speculative tokens per sequence is determined by max_proposal_len. - """ - return self._proposer.get_spec_proposals( - execute_model_req, seq_ids_with_bonus_token_in_last_step) - - @staticmethod - def _append_new_tokens( - model_output: List[SamplerOutput], - seq_group_metadata_list: List[SequenceGroupMetadata], - indices_of_seq_with_bonus_tokens: List[int]) -> None: - """Given model output from a single run, append the tokens to the - sequences. This is normally done outside of the worker, but it is - required if the worker is to perform multiple forward passes. - """ - count = 0 - for index, (seq_group_metadata, sequence_group_outputs) in enumerate( - zip(seq_group_metadata_list, model_output)): - seq_group_metadata.is_prompt = False - - for seq_output in sequence_group_outputs.samples: - # NOTE: Beam search is not supported, so we can assume that - # parent_seq_id == seq_id. 
- seq = seq_group_metadata.seq_data[seq_output.parent_seq_id] - - token_id = seq_output.output_token - token_logprob = seq_output.logprobs[token_id] - # Determine the actual token ID to be generated, - # considering bonus tokens - if index != indices_of_seq_with_bonus_tokens[count]: - bonus_seq_metadata = seq_group_metadata_list[ - indices_of_seq_with_bonus_tokens[count]] - _, bonus_token_seq_data = next( - iter(bonus_seq_metadata.seq_data.items())) - token_id = bonus_token_seq_data.output_token_ids[-1] - else: - count += 1 - - seq.append_token_id(token_id, token_logprob.logprob, - seq_output.output_embed) - seq.update_num_computed_tokens(1) - - @staticmethod - def _shallow_copy_seq_group_metadata( - seq_group_metadata: SequenceGroupMetadata, ) -> SequenceGroupMetadata: - """Copy input data structures to remove side-effects when input data - structures are shared with other modules. - - Helpful when the vLLM scheduler runs in the same process as the worker. - The alternative is deep-copying (or other form of deep copy); this has - performance downsides. - """ - # Shallow-copy the SequenceGroupMetadata. This allows us to - # append tokens and change is_prompt without external side-effects. - # We must shallow-copy seq_group_metadata as is_prompt could change. - new_seq_group_metadata = copy.copy(seq_group_metadata) - - # We must shallow-copy seq_data as we will append token ids - new_seq_data: Dict[int, SequenceData] = {} - for seq_id, old_seq_data in seq_group_metadata.seq_data.items(): - new_seq_data[seq_id] = copy.copy(old_seq_data) - new_seq_data[seq_id].output_token_ids =\ - old_seq_data.output_token_ids[:] - - new_seq_group_metadata.seq_data = new_seq_data - return new_seq_group_metadata - - @staticmethod - def _copy_seq_metadata_excluding_last_token( - seq_group_metadata: SequenceGroupMetadata, - seq_ids_to_copy: Set[int], - ) -> SequenceGroupMetadata: - """ - Creates a shallow copy of the given SequenceGroupMetadata, retaining - only the sequence IDs specified in seq_ids_to_copy. For each of these - sequence IDs, all output_token_ids except the last one are copied. - Sequence IDs not in seq_ids_to_copy are excluded from the copy. - - Parameters: - seq_group_metadata (SequenceGroupMetadata): The original sequence - group metadata. - seq_ids_to_copy (Set[int]): The set of sequence IDs to include in the - copy. - - Returns: - SequenceGroupMetadata: A shallow copy of the sequence group metadata - with the specified modifications. - """ - # Shallow-copy the SequenceGroupMetadata. - new_seq_group_metadata = copy.copy(seq_group_metadata) - # Shallow-copy seq_data and modify the output_token_ids. - new_seq_data: Dict[int, SequenceData] = {} - for seq_id, old_seq_data in seq_group_metadata.seq_data.items(): - if (seq_id in seq_ids_to_copy): - new_seq_data[seq_id] = copy.copy(old_seq_data) - # Copy all the output token ids except the last. - # Also reduce num_computed_tokens by 1 since we are not - # including the last output token. - # NOTE: num_computed_tokens is not directly used by the - # speculative decoding workers, as it is only relevant for - # chunked prefill, which is disabled for speculative decoding. - # However, to maintain consistency in num_computed_tokens, - # we update it here. 
- new_seq_data[seq_id].output_token_ids =\ - old_seq_data.output_token_ids[:-1] - new_seq_data[seq_id].update_num_computed_tokens(-1) - new_seq_group_metadata.seq_data = new_seq_data - return new_seq_group_metadata - - def _assert_enough_kv_space( - self, seq_group_metadata_list: List[SequenceGroupMetadata], - num_steps: int) -> None: - """Assert there are enough physical blocks per sequence to store the - current KV plus additional KV from num_steps tokens. - """ - assert self.model_runner.block_size is not None - for seq_group_metadata in seq_group_metadata_list: - # Only one seq_id is guaranteed because there is no beam search. - seq_id = list(seq_group_metadata.seq_data.keys())[0] - seq = seq_group_metadata.seq_data[seq_id] - - # After num_steps, the seq len will be the current seq len - # plus one token per step. - final_seq_len = seq.get_len() + num_steps - - # We will have final_seq_len - 1 KV because vLLM saves KV for a - # token in the iteration after the token was generated. - required_num_kv_slots = final_seq_len - 1 - - # The allocated number of kv slots is the number of allocated blocks - # times the number of slots of block. - number_physical_blocks = len( - seq_group_metadata.block_tables[seq_id]) - allocated_kv_slots = (number_physical_blocks * - self.model_runner.block_size) - - if required_num_kv_slots > allocated_kv_slots: - request_id = seq_group_metadata.request_id - raise ValueError( - "The worker attempted to run " - f"{num_steps} times but found insufficient KV space for " - f"{request_id=} {seq_id=}. ({allocated_kv_slots=} " - f"{required_num_kv_slots=}).") - - def _raise_if_unsupported( - self, - execute_model_req: ExecuteModelRequest, - ) -> None: - """MultiStepWorker does not yet implement support for cache swap - operations or beam search. - """ - if any([ - execute_model_req.blocks_to_swap_in, - execute_model_req.blocks_to_swap_out, - execute_model_req.blocks_to_copy - ]): - raise NotImplementedError( - "MultiStepWorker does not support cache operations") - - if any( - len(seq_group_metadata.seq_data.keys()) != 1 - for seq_group_metadata in - execute_model_req.seq_group_metadata_list): - raise NotImplementedError( - "MultiStepWorker does not support beam search.") - - def maybe_load_lm_head_weight( - self, - lm_head_weight: torch.Tensor, - ) -> None: - weight_loader = getattr( - self.worker.model_runner.model_runner.model.lm_head.weight, - "weight_loader", default_weight_loader) - weight_loader( - self.worker.model_runner.model_runner.model.lm_head.weight, - lm_head_weight) diff --git a/vllm/spec_decode/ngram_worker.py b/vllm/spec_decode/ngram_worker.py deleted file mode 100644 index 7a1a0e56dc00b..0000000000000 --- a/vllm/spec_decode/ngram_worker.py +++ /dev/null @@ -1,196 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -import weakref -from typing import List, Optional, Set, Tuple - -import torch -import torch.nn as nn - -from vllm.config import VllmConfig -from vllm.model_executor.layers.sampler import SamplerOutput -from vllm.sequence import ExecuteModelRequest -from vllm.spec_decode.interfaces import SpeculativeProposals -from vllm.spec_decode.proposer_worker_base import NonLLMProposerWorkerBase -from vllm.spec_decode.top1_proposer import Top1Proposer - - -class _DummyModel(nn.Module): - pass - - -class NGramWorker(NonLLMProposerWorkerBase): - """NGramWorker provides a light drafter without need for model. 
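The _assert_enough_kv_space check above reduces to a small piece of arithmetic: after num_steps further decode steps a sequence needs final_seq_len - 1 KV slots, because the KV for a token is written in the iteration after it is generated. A minimal standalone sketch (hypothetical function name):

def has_enough_kv_space(current_seq_len: int, num_steps: int,
                        num_allocated_blocks: int, block_size: int) -> bool:
    # After num_steps steps the sequence holds current_seq_len + num_steps tokens,
    # but KV for a token is only written in the iteration after it is generated,
    # so final_seq_len - 1 slots must fit in the allocated blocks.
    final_seq_len = current_seq_len + num_steps
    required_kv_slots = final_seq_len - 1
    allocated_kv_slots = num_allocated_blocks * block_size
    return required_kv_slots <= allocated_kv_slots

print(has_enough_kv_space(10, 4, 1, 16))  # True: needs 13 slots, 16 allocated
print(has_enough_kv_space(10, 8, 1, 16))  # False: needs 17 slots, 16 allocated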
- - Current NGramWorker only implements prompt lookup decoding, - and in future we may also do RAG type drafter and other scenarios - which don't rely on LLM model to give proposals. - """ - - def __init__( - self, - vllm_config: VllmConfig, - local_rank: int, - device_type: str = "cuda", - **kwargs, - ): - super().__init__(vllm_config) - - # Get local_rank/vocab_size from kwargs attribute - self.local_rank = local_rank - self.device_type = device_type - - # Lazy initialization list. - self._proposer: Top1Proposer - - def set_ngram_window_size(self, ngram_prompt_lookup_min: int, - ngram_prompt_lookup_max: int): - # Search valid candidate window between - # ngram_prompt_lookup_min/ngram_prompt_lookup_max - self.ngram_prompt_lookup_max = ngram_prompt_lookup_max - self.ngram_prompt_lookup_min = ngram_prompt_lookup_min - - def init_device(self): - self.device = torch.device(f"{self.device_type}:{self.local_rank}") - - # Current NGramWorker only supports Top1Proposer - self._proposer = Top1Proposer( - weakref.proxy(self), # type: ignore[arg-type] - device=self.device, - vocab_size=self.vocab_size, - ) - - def load_model(self) -> None: - pass # Dummy - - def get_model(self) -> nn.Module: - return _DummyModel() - - def sampler_output( - self, - execute_model_req: ExecuteModelRequest, - sample_len: int, - # Unused parameter. NGramWorker does not use the KV Cache and - # therefore does not need this parameter. - seq_ids_with_bonus_token_in_last_step: Set[int], - ) -> Tuple[Optional[List[Optional[SamplerOutput]]], bool]: - """NGram match algo to pick proposal candidate. Returns the list of - sampler output, one per SequenceGroupMetadata. - - For ngram worker, we already done needed transposed internal, so the - indicator pass to sampler_output_to_torch shall be False. - """ - self._raise_if_unsupported(execute_model_req) - - has_spec_out = False - token_id_list: List[Optional[torch.Tensor]] = [] - token_prob_list: List[Optional[torch.Tensor]] = [] - for idx, seq_group_metadata in enumerate( - execute_model_req.seq_group_metadata_list): - seq_data = next(iter(seq_group_metadata.seq_data.values())) - - seq_len = seq_data.get_len() - # When seq_len is less than 3072 (3K), we use CPU to perform - # the ngram match. Otherwise, we use the device specified in - # the model config (normally GPU). 3072 is a rough threshold - # based on profiling on H100, and it can be adjusted based - # on the actual performance on different hardware. - cur_device = "cpu" if seq_len < 3072 else self.device - input_ids = torch.as_tensor(seq_data.get_token_ids(), - dtype=torch.long, - device=cur_device) - input_length = seq_data.get_len() - - for ngram_size in range( - min(self.ngram_prompt_lookup_max, input_length - 1), - self.ngram_prompt_lookup_min - 1, - -1, - ): - ngram_tensor = input_ids[-ngram_size:] - if ngram_size == 1: - # Do not match itself and do not use unfold and all - matches = (input_ids[:-1] == ngram_tensor) - else: - windows = input_ids.unfold(dimension=0, - size=ngram_size, - step=1) - # Do not match itself - matches = (windows[:-1] == ngram_tensor).all(dim=-1) - - # first_match includes "values" (bool), indicating whether - # the match is found, and "indices", indicating the index - # of the first match. 
- first_match = matches.max(dim=-1) - if first_match.values.item(): - proposal_start_idx = first_match.indices.add_(ngram_size) - spec_indices = ( - proposal_start_idx).repeat(sample_len) + torch.arange( - sample_len, device=cur_device) - spec_indices.clamp_(max=input_ids.shape[-1] - 1) - res = input_ids.gather(dim=-1, - index=spec_indices).to(self.device) - token_id_list.append(res) - token_prob_list.append( - torch.nn.functional.one_hot( - res, - num_classes=self.vocab_size).to(torch.float32)) - has_spec_out = True - break - else: - token_id_list.append(None) - token_prob_list.append(None) - - if not has_spec_out: - return None, False - - outputs: List[Optional[SamplerOutput]] = [] - for idx in range(len(execute_model_req.seq_group_metadata_list)): - if token_id_list[idx] is None: - outputs.append(None) - else: - outputs.append( - SamplerOutput( - outputs=None, - sampled_token_probs=token_prob_list[idx], - logprobs=torch.zeros((sample_len, self.vocab_size), - dtype=torch.float32, - device=self.device), - sampled_token_ids=token_id_list[idx], - )) - - return outputs, False - - def get_spec_proposals( - self, - execute_model_req: ExecuteModelRequest, - # Unused parameter. NGramWorker does not use the KV Cache and - # therefore does not need this parameter. - seq_ids_with_bonus_token_in_last_step: Set[int], - ) -> SpeculativeProposals: - """Produce speculations given an input batch of sequences. The number of - speculative tokens per sequence is determined by max_proposal_len. - """ - return self._proposer.get_spec_proposals( - execute_model_req, seq_ids_with_bonus_token_in_last_step) - - def _raise_if_unsupported( - self, - execute_model_req: ExecuteModelRequest, - ) -> None: - """NGramWorker does not yet implement support for cache swap - operations or beam search. - """ - if any([ - execute_model_req.blocks_to_swap_in, - execute_model_req.blocks_to_swap_out, - execute_model_req.blocks_to_copy - ]): - raise NotImplementedError( - "NGramWorker does not support cache operations") - - if any( - len(seq_group_metadata.seq_data.keys()) != 1 - for seq_group_metadata in - execute_model_req.seq_group_metadata_list): - raise NotImplementedError( - "NGramWorker does not support beam search.") diff --git a/vllm/spec_decode/proposer_worker_base.py b/vllm/spec_decode/proposer_worker_base.py deleted file mode 100644 index fb44275aa9357..0000000000000 --- a/vllm/spec_decode/proposer_worker_base.py +++ /dev/null @@ -1,59 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -from abc import ABC, abstractmethod -from typing import List, Optional, Set, Tuple - -from vllm.model_executor.layers.sampler import SamplerOutput -from vllm.sequence import ExecuteModelRequest -from vllm.spec_decode.interfaces import SpeculativeProposer -from vllm.worker.worker_base import LoRANotSupportedWorkerBase - - -class ProposerWorkerBase(LoRANotSupportedWorkerBase, SpeculativeProposer): - """Interface for proposer workers""" - - @abstractmethod - def sampler_output( - self, - execute_model_req: ExecuteModelRequest, - sample_len: int, - # A set containing all sequence IDs that were assigned bonus tokens - # in their last forward pass. This set is used to backfill the KV cache - # with the key-value pairs of the penultimate token in the sequences. - # This parameter is only used by the MultiStepWorker, which relies on - # the KV cache for token generation. It is not used by workers that - # do not utilize the KV cache. 
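The matching loop above is prompt lookup decoding: try n-gram sizes from large to small, and when the current suffix has occurred earlier in the sequence, propose the tokens that followed that earlier occurrence. A simplified, self-contained sketch of the idea (illustrative only; it returns the raw token slice and omits the one-hot probability tensors and the clamping/padding the removed worker performs):

import torch

def propose_by_prompt_lookup(token_ids: torch.Tensor, min_n: int, max_n: int,
                             num_proposal_tokens: int):
    """Try n-gram sizes from large to small; on the first hit, propose the
    tokens that followed the matched n-gram earlier in the sequence."""
    seq_len = token_ids.shape[0]
    for n in range(min(max_n, seq_len - 1), min_n - 1, -1):
        suffix = token_ids[-n:]
        # All length-n windows; drop the last one so the suffix cannot match itself.
        windows = token_ids.unfold(dimension=0, size=n, step=1)[:-1]
        matches = (windows == suffix).all(dim=-1)
        if matches.any():
            start = int(torch.nonzero(matches)[0]) + n  # token after the first match
            return token_ids[start:start + num_proposal_tokens]
    return None

prompt = torch.tensor([5, 6, 7, 8, 9, 5, 6, 7])
print(propose_by_prompt_lookup(prompt, min_n=1, max_n=3, num_proposal_tokens=2))
# tensor([8, 9]) -- the tokens that followed "5 6 7" the first time it appeared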
- seq_ids_with_bonus_token_in_last_step: Set[int] - ) -> Tuple[Optional[List[SamplerOutput]], bool]: - raise NotImplementedError - - def set_include_gpu_probs_tensor(self) -> None: - """Implementation optional""" - pass - - def set_should_modify_greedy_probs_inplace(self) -> None: - """Implementation optional""" - pass - - -class NonLLMProposerWorkerBase(ProposerWorkerBase, ABC): - """Proposer worker which does not use a model with kvcache""" - - def execute_model( - self, - execute_model_req: Optional[ExecuteModelRequest] = None - ) -> List[SamplerOutput]: - """get_spec_proposals is used to get the proposals""" - return [] - - def determine_num_available_blocks(self) -> Tuple[int, int]: - """This is never called on the proposer, only the target model""" - raise NotImplementedError - - def initialize_cache(self, num_gpu_blocks: int, - num_cpu_blocks: int) -> None: - pass - - def get_cache_block_size_bytes(self) -> int: - return 0 diff --git a/vllm/spec_decode/smaller_tp_proposer_worker.py b/vllm/spec_decode/smaller_tp_proposer_worker.py deleted file mode 100644 index 91256cab6e799..0000000000000 --- a/vllm/spec_decode/smaller_tp_proposer_worker.py +++ /dev/null @@ -1,196 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -from typing import List, Optional, Set, Tuple - -import torch -import torch.nn as nn - -from vllm.distributed.parallel_state import (get_tp_group, - init_model_parallel_group, - patch_tensor_parallel_group) -from vllm.logger import init_logger -from vllm.model_executor.layers.sampler import SamplerOutput -from vllm.model_executor.model_loader.weight_utils import default_weight_loader -from vllm.sequence import ExecuteModelRequest -from vllm.spec_decode.interfaces import SpeculativeProposals -from vllm.spec_decode.multi_step_worker import MultiStepWorker -from vllm.spec_decode.proposer_worker_base import ProposerWorkerBase - -logger = init_logger(__name__) - - -class _DummyModel(nn.Module): - pass - - -class SmallerTpProposerWorker(ProposerWorkerBase): - """Class which allows a speculative draft model to run with smaller tensor - parallel degree than target model. - This reduces the communication overhead of small draft models. - - To implement this feature, this class differs behavior based on is_dummy - flag, where dummy means worker that does not participate draft generation. - Participating workers use a smaller tp group by patching vLLM's tensor - parallel group temporarily during forward passes of draft models. - """ - - @classmethod - def maybe_wrap_worker(cls, worker, draft_tensor_parallel_size: int, - target_tensor_parallel_size: int): - """Wrap the worker in a SmallerTpProposerWorker if necessary. - """ - if draft_tensor_parallel_size == target_tensor_parallel_size: - return worker - - # gpu ranks that will generate draft tokens together - draft_ranks = list(range(draft_tensor_parallel_size)) - - logger.info("Wrapping {%s} in {%s}", type(worker), cls) - return cls(worker, draft_ranks) - - def __init__(self, worker: MultiStepWorker, draft_ranks: List[int]): - """Create a SmallerTpProposerWorker. 
- - Args: - worker (~vllm.spec_decode.multi_step_worker.MultiStepWorker): an - actual worker wrapped with this class - draft_ranks (List[int]): if this value is given, only the GPU ranks - written in this value participate in draft generation - """ - self._worker = worker - self._draft_ranks = draft_ranks - - # init during init_device - self._is_dummy = False - self._tp_group = None - - def _patch_tensor_parallel_group(self): - """Temporarily patch the global tp group state with its own tp group - state. - """ - return patch_tensor_parallel_group(self._tp_group) - - def init_device(self) -> None: - self._is_dummy = get_tp_group().rank not in self._draft_ranks - - # dummy workers do nothing - if self._is_dummy: - return - - # creates tp process group containing only a subset of gpu ranks - local_rank = get_tp_group().local_rank - tp_backend = torch.distributed.get_backend(get_tp_group().device_group) - self._tp_group = init_model_parallel_group([self._draft_ranks], - local_rank, tp_backend) - - with self._patch_tensor_parallel_group(): - self._worker.init_device() - - def set_include_gpu_probs_tensor(self) -> None: - if self._is_dummy: - return - - # Need include_gpu_probs_tensor for multi_step_worker - self._worker.set_include_gpu_probs_tensor() - - def set_should_modify_greedy_probs_inplace(self) -> None: - if self._is_dummy: - return - - self._worker.set_should_modify_greedy_probs_inplace() - - def load_model(self) -> None: - if self._is_dummy: - return - - with self._patch_tensor_parallel_group(): - self._worker.load_model() - - def determine_num_available_blocks(self) -> Tuple[int, int]: - if self._is_dummy: - # this case is not used now - return -1, -1 - - with self._patch_tensor_parallel_group(): - return self._worker.determine_num_available_blocks() - - def initialize_cache(self, num_gpu_blocks: int, - num_cpu_blocks: int) -> None: - if self._is_dummy: - return - - with self._patch_tensor_parallel_group(): - self._worker.initialize_cache(num_gpu_blocks, num_cpu_blocks) - - def sampler_output( - self, - execute_model_req: ExecuteModelRequest, - sample_len: int, - seq_ids_with_bonus_token_in_last_step: Set[int], - ) -> Tuple[List[SamplerOutput], bool]: - # Do not check _is_dummy, as it's always called by get_spec_proposals - return self._worker.sampler_output( - execute_model_req, sample_len, - seq_ids_with_bonus_token_in_last_step) - - def get_spec_proposals( - self, - execute_model_req: ExecuteModelRequest, - seq_ids_with_bonus_token_in_last_step: Set[int], - ) -> SpeculativeProposals: - """Produce speculations given an input batch of sequences. The number of - speculative tokens per sequence is determined by max_proposal_len. 
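Most of the methods above guard on _is_dummy and then enter `with self._patch_tensor_parallel_group():` so the wrapped draft worker sees the smaller group as the current tensor-parallel group. The underlying pattern is a plain save/restore context manager; a library-free sketch with made-up names:

from contextlib import contextmanager

_ACTIVE_TP_GROUP = "full-tp-group"  # stand-in for a global parallel-state handle

@contextmanager
def patched_group(temporary_group):
    """Install temporary_group for the duration of the block, then restore."""
    global _ACTIVE_TP_GROUP
    saved = _ACTIVE_TP_GROUP
    _ACTIVE_TP_GROUP = temporary_group
    try:
        yield temporary_group
    finally:
        _ACTIVE_TP_GROUP = saved

with patched_group("draft-tp-group"):
    print(_ACTIVE_TP_GROUP)  # draft-tp-group
print(_ACTIVE_TP_GROUP)      # full-tp-group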
- """ - if self._is_dummy: - return SpeculativeProposals(None, None, None) - - with self._patch_tensor_parallel_group(): - return self._worker.get_spec_proposals( - execute_model_req, seq_ids_with_bonus_token_in_last_step) - - def get_model(self) -> nn.Module: - if self._is_dummy: - return _DummyModel() - - with self._patch_tensor_parallel_group(): - return self._worker.get_model() - - def execute_model( - self, - execute_model_req: Optional[ExecuteModelRequest] = None - ) -> List[SamplerOutput]: - if self._is_dummy: - return [] - - with self._patch_tensor_parallel_group(): - return self._worker.execute_model(execute_model_req) - - def get_cache_block_size_bytes(self) -> int: - if self._is_dummy: - # by returning zero, target worker can use the entire kv cache space - return 0 - - return self._worker.get_cache_block_size_bytes() - - @property - def vocab_size(self) -> int: - return self._worker.vocab_size - - def maybe_load_lm_head_weight( - self, - lm_head_weight: torch.Tensor, - ) -> None: - if self._is_dummy: - return - - with self._patch_tensor_parallel_group(): - weight_loader = getattr( - self._worker.worker.model_runner.model_runner.model.\ - lm_head.weight, - "weight_loader", - default_weight_loader) - weight_loader( - self._worker.worker.model_runner.model_runner.model.\ - lm_head.weight, - lm_head_weight) diff --git a/vllm/spec_decode/spec_decode_worker.py b/vllm/spec_decode/spec_decode_worker.py deleted file mode 100644 index 7dda1cbfe2302..0000000000000 --- a/vllm/spec_decode/spec_decode_worker.py +++ /dev/null @@ -1,1326 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -import copy -from collections import defaultdict -from functools import cached_property -from typing import Any, Dict, List, Optional, Set, Tuple, Type - -import torch -import torch.nn as nn - -from vllm.config import ParallelConfig, SpeculativeConfig, VllmConfig -from vllm.distributed.communication_op import (broadcast_tensor_dict, - get_tp_group, - tensor_model_parallel_gather) -from vllm.distributed.parallel_state import model_parallel_is_initialized -from vllm.logger import init_logger -from vllm.model_executor.layers.rejection_sampler import RejectionSampler -from vllm.model_executor.layers.sampler import SamplerOutput -from vllm.model_executor.layers.spec_decode_base_sampler import ( - SpecDecodeBaseSampler, SpecDecodeStochasticBaseSampler) -from vllm.model_executor.layers.typical_acceptance_sampler import ( - TypicalAcceptanceSampler) -from vllm.platforms import current_platform -from vllm.sequence import (VLLM_INVALID_TOKEN_ID, - CompletionSequenceGroupOutput, ExecuteModelRequest, - HiddenStates, SequenceGroupMetadata, - get_all_seq_ids_and_request_ids) -from vllm.spec_decode.batch_expansion import BatchExpansionTop1Scorer - -if current_platform.is_cuda_alike(): - from vllm.spec_decode.draft_model_runner import TP1DraftModelRunner - -from vllm.spec_decode.interfaces import (SpeculativeProposals, - SpeculativeScorer, SpeculativeScores) -from vllm.spec_decode.medusa_worker import MedusaWorker -from vllm.spec_decode.metrics import AsyncMetricsCollector -from vllm.spec_decode.mlp_speculator_worker import MLPSpeculatorWorker -from vllm.spec_decode.mqa_scorer import MQAScorer -from vllm.spec_decode.multi_step_worker import MultiStepWorker -from vllm.spec_decode.ngram_worker import NGramWorker -from vllm.spec_decode.proposer_worker_base import ProposerWorkerBase -from vllm.spec_decode.smaller_tp_proposer_worker import SmallerTpProposerWorker 
-from vllm.spec_decode.target_model_runner import TargetModelRunner -from vllm.spec_decode.util import (Timer, create_logprobs_output, - create_sequence_group_output, - get_all_num_logprobs, - get_sampled_token_logprobs, nvtx_range, - split_batch_by_proposal_len) -from vllm.utils import resolve_obj_by_qualname -from vllm.worker.worker_base import LoRANotSupportedWorkerBase, WorkerBase - -logger = init_logger(__name__) - - -def create_spec_worker(*args, **kwargs) -> "SpecDecodeWorker": - """Helper method that is the entrypoint for Executors which use - WorkerWrapper. It constructs a SpecDecodeWorker from the speculative config. - """ - vllm_config: VllmConfig = kwargs.get("vllm_config") - speculative_config: SpeculativeConfig = vllm_config.speculative_config - assert speculative_config is not None - - if vllm_config.parallel_config.pipeline_parallel_size > 1: - raise NotImplementedError("Speculative decoding is currently " - "incompatible with pipeline parallelism") - - draft_worker_kwargs = kwargs.copy() - - kwargs["model_runner_cls"] = TargetModelRunner - target_worker_config = copy.deepcopy(vllm_config) - target_worker_config.parallel_config.worker_cls =\ - target_worker_config.parallel_config.sd_worker_cls - cls = resolve_obj_by_qualname( - target_worker_config.parallel_config.worker_cls) - target_worker = cls(*args, **kwargs) - # Set the disable_logprobs variable in the TargetModelRunner instance - # as per its value specified in the SpeculativeConfig. - target_worker.model_runner.disable_logprobs =\ - speculative_config.disable_logprobs - - draft_worker_config = copy.deepcopy(vllm_config) - draft_worker_config.model_config = speculative_config.draft_model_config - draft_worker_config.quant_config = VllmConfig._get_quantization_config( - draft_worker_config.model_config, - vllm_config.load_config, - ) - speculative_config.draft_parallel_config.worker_cls =\ - draft_worker_config.parallel_config.sd_worker_cls - draft_worker_config.parallel_config = speculative_config.draft_parallel_config # noqa - # TODO allow draft-model specific load config. - - # Override draft-model specific worker args. - draft_worker_kwargs.update( - vllm_config=draft_worker_config, - ngram_prompt_lookup_max=speculative_config.prompt_lookup_max, - ngram_prompt_lookup_min=speculative_config.prompt_lookup_min, - ) - - spec_decode_worker = SpecDecodeWorker.create_worker( - scorer_worker=target_worker, - draft_worker_kwargs=draft_worker_kwargs, - disable_mqa_scorer=speculative_config.disable_mqa_scorer, - disable_by_batch_size=speculative_config.disable_by_batch_size, - draft_token_acceptance_method=speculative_config.acceptance_method, - typical_acceptance_sampler_posterior_threshold=speculative_config. - posterior_threshold, - typical_acceptance_sampler_posterior_alpha=speculative_config. - posterior_alpha, - disable_logprobs=speculative_config.disable_logprobs, - disable_log_stats=speculative_config.disable_log_stats, - num_speculative_tokens=speculative_config.num_speculative_tokens, - ) - - return spec_decode_worker - - -# Reminder: Please update docs/features/compatibility_matrix.md -# If the feature combo become valid -class SpecDecodeWorker(LoRANotSupportedWorkerBase): - """Worker which implements speculative decoding. - - Speculative decoding reduces decoding per-token latency by using a proposal - method, such as a small draft model, to speculate ahead of a larger LLM. 
The - probabilities of the speculative tokens are then determined by the larger - LLM, after which some verification routine determines which (if any) of the - speculative tokens are accepted by the larger LLM. - - See https://github.com/vllm-project/vllm/pull/2188 and - https://github.com/vllm-project/vllm/pull/3103 for more info. - - The current implementation has the following limitations: - * Only draft-model proposal is implemented (contributions for more forms are - welcome!). - * Only top-1 proposal and scoring are implemented. Tree-attention is left as - future work. - * All sequences in a batch must have the same proposal length, or zero. This - can be improved by having per-sequence speculation in the future. - * The scoring forward pass is done without an MQA kernel, which is - suboptimal especially as the batch size, proposal length, and sequence - lengths grow. Contributions to add a MQA scoring are welcome once - correctness tests pass. - More info here https://docs.google.com/document/d/1T-JaS2T1NRfdP51qzqpyakoCXxSXTtORppiwaj5asxA/edit. - """ - - @classmethod - def create_worker( - cls, - scorer_worker: WorkerBase, - draft_worker_kwargs: Dict[str, Any], - disable_mqa_scorer: bool, - disable_by_batch_size: Optional[int], - draft_token_acceptance_method: str, - typical_acceptance_sampler_posterior_threshold: float, - typical_acceptance_sampler_posterior_alpha: float, - disable_logprobs: bool, - disable_log_stats: bool, - num_speculative_tokens: int, - ) -> "SpecDecodeWorker": - - allow_zero_draft_token_step = True - enable_lm_head_weight_load = False - num_spec_prefill_steps = 1 - ngram_prompt_lookup_max = ( - draft_worker_kwargs.pop("ngram_prompt_lookup_max")) - ngram_prompt_lookup_min = ( - draft_worker_kwargs.pop("ngram_prompt_lookup_min")) - draft_model_config = draft_worker_kwargs["vllm_config"].model_config - draft_parallel_config: ParallelConfig = draft_worker_kwargs[ - 'vllm_config'].parallel_config - if ngram_prompt_lookup_max > 0: - draft_worker_kwargs[ - "device_type"] = scorer_worker.device_config.device.type - proposer_worker = NGramWorker(**draft_worker_kwargs) - proposer_worker.set_ngram_window_size(ngram_prompt_lookup_min, - ngram_prompt_lookup_max) - else: - draft_tp = draft_parallel_config.tensor_parallel_size - target_tp = scorer_worker.parallel_config.tensor_parallel_size - - if draft_model_config.hf_config.model_type == "mlp_speculator": - proposer_worker = MLPSpeculatorWorker(**draft_worker_kwargs) - elif draft_model_config.hf_config.model_type == "medusa": - proposer_worker = MedusaWorker(**draft_worker_kwargs) - else: - if draft_tp == 1: - if current_platform.is_cuda_alike(): - draft_worker_kwargs[ - "model_runner_cls"] = TP1DraftModelRunner - else: - if draft_model_config.hf_config.model_type == "eagle": - raise NotImplementedError( - f"{draft_model_config.hf_config.model_type} " - "does not support TP > 1 yet") - - allow_zero_draft_token_step = False - - # Load lm_head weight for eagle in init_device - if draft_model_config.hf_config.model_type == "eagle": - enable_lm_head_weight_load = True - - proposer_worker = MultiStepWorker(**draft_worker_kwargs) - if draft_model_config.hf_config.model_type == "deepseek_mtp": - num_spec_prefill_steps = \ - draft_model_config.hf_config.n_predict - - proposer_worker = SmallerTpProposerWorker.maybe_wrap_worker( - proposer_worker, draft_tp, target_tp) - - logger.info("Configuring SpecDecodeWorker with proposer=%s", - type(proposer_worker)) - - spec_decode_sampler: SpecDecodeBaseSampler = None - if 
draft_token_acceptance_method == "rejection_sampler": - spec_decode_sampler = RejectionSampler() - elif draft_token_acceptance_method == "typical_acceptance_sampler": - spec_decode_sampler = TypicalAcceptanceSampler( - posterior_threshold=\ - typical_acceptance_sampler_posterior_threshold, - posterior_alpha=typical_acceptance_sampler_posterior_alpha, - ) - logger.info( - "[Speculative Decoding] Configuring" - " SpecDecodeWorker with sampler=%s", type(spec_decode_sampler)) - - if not disable_mqa_scorer: - if scorer_worker.model_runner.attn_backend.get_name( - ) != "FLASH_ATTN": - disable_mqa_scorer = True - logger.info( - "[Speculative Decoding] Disabling MQA scorer as the " - "MQA is only available with flash attn backend.") - - if draft_model_config and \ - draft_model_config.max_model_len < \ - scorer_worker.model_config.max_model_len: - disable_mqa_scorer = True - logger.info( - "[Speculative Decoding] Disabling MQA scorer as the " - "draft model max_model_len is smaller than the target " - "model max_model_len.") - - if not scorer_worker.model_runner.model_config.enforce_eager: - disable_mqa_scorer = True - logger.info( - "[Speculative Decoding] Disabling MQA scorer as the " - "target model is not running in eager mode.") - - return SpecDecodeWorker( - proposer_worker, - scorer_worker, - disable_mqa_scorer=disable_mqa_scorer, - disable_logprobs=disable_logprobs, - disable_log_stats=disable_log_stats, - disable_by_batch_size=disable_by_batch_size, - spec_decode_sampler=spec_decode_sampler, - allow_zero_draft_token_step=allow_zero_draft_token_step, - enable_lm_head_weight_load=enable_lm_head_weight_load, - num_spec_prefill_steps=num_spec_prefill_steps) - - def __init__( - self, - proposer_worker: ProposerWorkerBase, - scorer_worker: WorkerBase, - spec_decode_sampler: SpecDecodeBaseSampler, - disable_mqa_scorer: bool = False, - disable_logprobs: bool = False, - disable_log_stats: bool = False, - metrics_collector: Optional[AsyncMetricsCollector] = None, - disable_by_batch_size: Optional[int] = None, - allow_zero_draft_token_step: Optional[bool] = True, - enable_lm_head_weight_load: Optional[bool] = False, - num_spec_prefill_steps: int = 1, - ): - """ - Create a SpecDecodeWorker. - - Args: - proposer_worker: A worker that can produce speculative tokens for - sequences. - scorer_worker: A worker that produces probabilities of speculative - tokens according to some base model. Typically a vanilla vLLM - Worker. - spec_decode_sampler: A Torch module used to perform acceptance - sampling of the draft tokens in the verification step of - speculative decoding. Currently we support two different - types of sampler namely RejectionSampler and - TypicalAcceptanceSampler. 'spec_decode_sampler' is either an - instance of RejectionSampler or TypicalAcceptanceSampler. - disable_mqa_scorer: If set to True, disable the MQA scorer and use - the BatchExpansionTop1Scorer instead. - disable_logprobs: If set to True, token log probabilities will - not be output in both the draft worker and the target worker. - If set to False, log probabilities will be output by both. - disable_log_stats: If set to True, disable periodic printing of - speculative stage times. - disable_by_batch_size: If the batch size is larger than this, - disable speculative decoding for new incoming requests. - metrics_collector: Helper class for collecting metrics; can be set - for testing purposes. 
- allow_zero_draft_token_step: whether to allow a step where the draft - model generates no draft token; should disallow when the tp of - draft model is larger than 1 (TODO: #5814) - enable_lm_head_weight_load: whether to load lm_head weight for - draft models like eagle. - num_spec_prefill_steps: number of speculative prefill steps to run - before the speculative decoding starts. This is only used when - the draft model is a deepseek_mtp model that requires prefill - kv cache separately for each MTP layer. - """ - self.proposer_worker = proposer_worker - self.scorer_worker = scorer_worker - scorer_runner = getattr(self.scorer_worker, "model_runner", None) - self.generators = scorer_runner.get_generators( - ) if scorer_runner else None - self.disable_by_batch_size = disable_by_batch_size or float("inf") - self.spec_decode_sampler = spec_decode_sampler - self._allow_zero_draft_token_step = allow_zero_draft_token_step - self._enable_lm_head_weight_load = enable_lm_head_weight_load - self._metrics = AsyncMetricsCollector( - self.spec_decode_sampler - ) if metrics_collector is None else metrics_collector - # Tracks the sequence IDs that received a bonus token ID in - # their last forward pass. Needed only if KV cache is being - # used for token generation such as in the case of MultiStepWorker. - self._seq_with_bonus_token_in_last_step: Set[int] = set() - # Tracks the currently active request ids and the sequence IDs - # corresponding to them - self._request_id_seq_id_mapping: Dict[str, Set[int]] = defaultdict(set) - # Tracks if the proposer worker uses the KV cache or not. - - self.probs_dtype = self.spec_decode_sampler.probs_dtype - self.token_id_dtype = self.spec_decode_sampler.token_id_dtype - # Lazy initialization. - self.scorer: SpeculativeScorer - self.disable_mqa_scorer = disable_mqa_scorer - - # Hidden states from target model to pass to proposer - # in the subsequent step. - self.previous_hidden_states: Optional[HiddenStates] = None - self._disable_logprobs = disable_logprobs - self._disable_log_stats = disable_log_stats - self._num_spec_prefill_steps = num_spec_prefill_steps - - def init_device(self) -> None: - """Initialize both scorer and proposer models. - """ - # The scorer worker model is initialized first in case the proposer - # model has a smaller TP degree than the target worker. - self.scorer_worker.init_device() - self.proposer_worker.init_device() - - # NOTE(cade): load_model is not part of the WorkerBase interface. 
- self.scorer_worker.load_model() - self.proposer_worker.load_model() - - if self._enable_lm_head_weight_load: - # NOTE(Shangming): gather lm_head weight when tp enabled - target_lm_head_weight: torch.Tensor = tensor_model_parallel_gather( - self.scorer_worker.model_runner.model_runner.model.lm_head.\ - weight.data, - dim=0, - ) - - self.proposer_worker.maybe_load_lm_head_weight( - target_lm_head_weight) - - self._metrics.init_tensors(self.rank, device_type=self.device) - if model_parallel_is_initialized(): - self.spec_decode_sampler.init_tensors(get_tp_group().local_rank, - device_type=self.device) - else: - self.spec_decode_sampler.init_tensors(self.rank, - device_type=self.device) - - scorer_cls: Type[SpeculativeScorer] - if self.disable_mqa_scorer: - scorer_cls = BatchExpansionTop1Scorer - logger.info("[Speculative Decoding] Use batch " - "expansion for scoring proposals.") - else: - scorer_cls = MQAScorer - logger.info( - "[Speculative Decoding] Use MQA scorer for scoring proposals.") - - self.scorer = scorer_cls(scorer_worker=self.scorer_worker, - device=self.device, - vocab_size=self._vocab_size) - - self._configure_model_sampler_for_spec_decode() - - def load_model(self, *args, **kwargs): - pass - - def _configure_model_sampler_for_spec_decode(self): - """Configure model sampler to emit GPU tensors. This allows spec decode - to keep data on device without transferring to CPU and serializing, - which significantly reduces overhead of sampling during verification. - - NOTE(cade): This breaks abstraction boundaries pretty badly. The better - design is to have the "move to CPU and serialize" sampling decision be - done outside of the model/sampler; this way the "last-mile" worker - object which interfaces with the scheduler can serialize and incur the - performance hit as necessary. This allows us to run the worker several - iterations in a row without incurring the "move to CPU and serialize" - performance penalty. - - Since this requires a large change to vLLM, we defer it to later and - temporarily accept this broken abstraction boundary. - - NOTE(cade): This will require a special check if the proposer worker - does not have a sampler (e.g. ngram speculation). - """ - (self.scorer_worker.model_runner.sampler.include_gpu_probs_tensor - ) = True - (self.scorer_worker.model_runner.sampler. - should_modify_greedy_probs_inplace) = True - self.proposer_worker.set_include_gpu_probs_tensor() - self.proposer_worker.set_should_modify_greedy_probs_inplace() - - def determine_num_available_blocks(self) -> Tuple[int, int]: - """Determine the number of cache blocks to use. - - This is done by profiling the scorer model (which is typically the - larger of the two). Then the total memory which would be used by the - scorer cache is divided evenly between the proposer and scorer model KV, - such that the number of blocks is equal in both KV caches. - """ - num_gpu_blocks, num_cpu_blocks = ( - self.scorer_worker.determine_num_available_blocks()) - - scorer_cache_block_size_bytes = ( - self.scorer_worker.get_cache_block_size_bytes()) - proposer_cache_block_size_bytes = ( - self.proposer_worker.get_cache_block_size_bytes()) - - new_num_gpu_blocks = split_num_cache_blocks_evenly( - scorer_cache_block_size_bytes, proposer_cache_block_size_bytes, - num_gpu_blocks) - return new_num_gpu_blocks, num_cpu_blocks - - def initialize_cache(self, num_gpu_blocks: int, - num_cpu_blocks: int) -> None: - """Initialize the cache engine of the scorer and proposer workers. 
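Per its docstring, determine_num_available_blocks takes the GPU memory that the scorer cache alone would have used and picks a block count that fits both KV caches with equal numbers of blocks. A sketch of that arithmetic (an illustration of the description, with hypothetical names, not a copy of the removed split_num_cache_blocks_evenly helper):

def split_blocks_evenly(scorer_block_bytes: int, proposer_block_bytes: int,
                        num_scorer_only_blocks: int) -> int:
    """Largest N such that N scorer blocks plus N proposer blocks fit in the
    memory that was profiled for the scorer cache alone."""
    budget_bytes = num_scorer_only_blocks * scorer_block_bytes
    return budget_bytes // (scorer_block_bytes + proposer_block_bytes)

# With equal block sizes the block count is simply halved:
print(split_blocks_evenly(1024, 1024, 9000))  # 4500
# A smaller draft model gives up proportionally fewer blocks:
print(split_blocks_evenly(1024, 256, 9000))   # 7200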
- """ - self.scorer_worker.initialize_cache(num_gpu_blocks=num_gpu_blocks, - num_cpu_blocks=num_cpu_blocks) - self.proposer_worker.initialize_cache(num_gpu_blocks=num_gpu_blocks, - num_cpu_blocks=num_cpu_blocks) - - def get_model(self) -> nn.Module: - return self.scorer_worker.get_model() - - @torch.inference_mode() - def execute_model( - self, - execute_model_req: Optional[ExecuteModelRequest] = None - ) -> List[SamplerOutput]: - """Perform speculative decoding on the input batch. - """ - if self.rank != self._driver_rank: - self._run_non_driver_rank() - return [] - - if execute_model_req is None: - # This signals that there's no more requests to process for now. - # All workers are running infinite loop with broadcast_tensor_dict, - # and it stops the loop when the driver broadcasts an empty input. - # Send an empty input to notify all other workers to stop their - # execution loop. - broadcast_tensor_dict({}, src=0) - return [] - - self._track_finished_requests(execute_model_req) - disable_all_speculation = self._should_disable_all_speculation( - execute_model_req) - num_lookahead_slots = execute_model_req.num_lookahead_slots - all_prompt = True - atleast_one_prompt = False - all_zero_spec_tokens = True - for sgm in execute_model_req.seq_group_metadata_list: - all_prompt = all_prompt and sgm.is_prompt - atleast_one_prompt = atleast_one_prompt or sgm.is_prompt - all_zero_spec_tokens = all_zero_spec_tokens and ( - sgm.num_speculative_tokens == 0) - - if all_prompt and execute_model_req.seq_group_metadata_list: - assert num_lookahead_slots == 0, ( - "Prompt only runs should have num_lookahead_slots equal to 0. " - "This should never happen, please file a bug at " - "https://github.com/vllm-project/vllm/issues") - # Speculative decoding is disabled in the following cases: - # 1. Prefill phase: Speculative decoding is not - # used during the prefill phase. - # 2. Auto-disable enabled: The running queue size exceeds - # the specified threshold. - # 3. No request: There are no requests in the batch, or - # none of the requests in the batch have spec decoding enabled. - # In any of these cases, the proposer and scorer workers - # are called normally. - # We expect `num_speculative_tokens` to be None for prefills. - no_spec = (num_lookahead_slots == 0 or disable_all_speculation - or all_zero_spec_tokens) - - # Broadcast how many lookahead slots are scheduled for this step, and - # whether all speculation is disabled, to all non-driver workers. - - # This is required as if the number of draft model runs changes - # dynamically, the non-driver workers won't know unless we perform a - # communication to inform them. - - # no_spec is used to signal non-driver worker about prefill vs decode - # stage. This is needed to ensure that order of execution of proposer - # and scorer is same in both driver and non-driver workers (i.e., - # scorer -> proposer for prefill and proposer -> scorer in decode). This - # order is needed to support models like EAGLE that take scorer states - # as inputs. - broadcast_dict = dict( - num_lookahead_slots=num_lookahead_slots, - no_spec=no_spec, - disable_all_speculation=disable_all_speculation, - # When both chunked prefill and speculative decoding are enabled - # it is possible that the same batch contains both prefill - # and decodes. If that happens in the scorer we run the batch - # as one single forward pass. However, in the proposer we - # run them as 2 different batches - one for prefill and - # the other for decodes. 
The variable indicates to the non-driver - # worker that there are prefills as part of the speculative batch - # and hence it needs to run an extra prefill forward pass. - run_spec_proposer_for_prefill=atleast_one_prompt, - ) - broadcast_tensor_dict(broadcast_dict, src=self._driver_rank) - - assert execute_model_req.seq_group_metadata_list is not None, ( - "speculative decoding requires non-None seq_group_metadata_list") - - self._maybe_disable_speculative_tokens( - disable_all_speculation, execute_model_req.seq_group_metadata_list) - - if no_spec: - return self._run_no_spec(execute_model_req, - skip_proposer=disable_all_speculation) - return self._run_speculative_decoding_step(execute_model_req, - num_lookahead_slots) - - @torch.inference_mode() - def start_worker_execution_loop(self) -> None: - """Execute model loop to perform speculative decoding - in parallel worker.""" - while self._run_non_driver_rank(): - pass - - def _should_disable_all_speculation( - self, execute_model_req: ExecuteModelRequest) -> bool: - # When the batch size is too large, disable speculative decoding - # to stop trading off throughput for latency. - return (execute_model_req.running_queue_size - >= self.disable_by_batch_size) - - def _maybe_disable_speculative_tokens( - self, disable_all_speculation: bool, - seq_group_metadata_list: List[SequenceGroupMetadata]) -> None: - if not disable_all_speculation: - return - - for seq_group_metadata in seq_group_metadata_list: - # Once num_speculative_tokens is set to 0, the spec decode - # of this request will be disabled forever. - # TODO(comaniac): We currently store spec decoding specific - # state in the global data structure, but we should maintain - # this state within spec decode worker. - seq_group_metadata.num_speculative_tokens = 0 - - def _serialize_sampler_output_no_logprobs( - self, execute_model_req: ExecuteModelRequest, - sampler_output: SamplerOutput) -> List[SamplerOutput]: - """ - Creates and returns a `SamplerOutput` with only the token IDs being - serialized to CPU and populated in `CompletionSequenceGroupOutput`. - All other parameters in `CompletionSequenceGroupOutput` related to log - probabilities are skipped. - - Args: - execute_model_req (ExecuteModelRequest): The model request that - was executed. - sampler_output (SamplerOutput): The output from the sampler with - only GPU tensors populated. - - Returns: - SamplerOutput: A new `SamplerOutput` instance containing a list of - `CompletionSequenceGroupOutput` objects with only token IDs - populated. - """ - seq_output_prompt_logprobs = [ - seq.is_prompt and seq.sampling_params.prompt_logprobs is not None - and seq.sampling_params.prompt_logprobs > 0 - for seq in execute_model_req.seq_group_metadata_list - ] - # ignore slots for prompt tokens that are filled with INVALID_TOKEN_ID - sampled_token_ids_list = (sampler_output.sampled_token_ids[torch.where( - # subtracting is faster than testing for equality - sampler_output.sampled_token_ids - VLLM_INVALID_TOKEN_ID)[0]] \ - if any(seq_output_prompt_logprobs) else \ - sampler_output.sampled_token_ids).tolist() - - seq_data_entries = [ - (seq_id, seq_data) for sg in \ - execute_model_req.seq_group_metadata_list \ - for seq_id, seq_data in sg.seq_data.items() - ] - completion_seq_group_output_list: List[ - CompletionSequenceGroupOutput] = [] - output_index = 0 - # Make sure the non-terminal prefill chunks are still aligned with - # their own empty output. 
- for idx, seq_group_meta in enumerate( - execute_model_req.seq_group_metadata_list): - needs_prompt_logprobs = seq_output_prompt_logprobs[idx] - seq_id, seq_data = seq_data_entries[idx] - if needs_prompt_logprobs: - prompt_token_ids = seq_data.get_prompt_token_ids() - - # Some of these sequences may belong to non-terminal chunks, - # which may still have to report logprobs for prompts. - start = 1 if seq_data._num_computed_tokens == 0 \ - else seq_data._num_computed_tokens - end = (seq_data._num_computed_tokens + \ - seq_group_meta.token_chunk_size) - prompt_token_ids = prompt_token_ids[start:end] - prompt_logprobs = [ - create_logprobs_output( - token_id=p_token_id, - token_id_logprob_rank=-1, - token_id_logprob=0.0, - topk_token_ids=[], - topk_logprobs=[], - ) for p_token_id in prompt_token_ids - ] - else: - prompt_logprobs = None - - # Since we can get chunks here, we dont always have a sampled token - # (only on last chunk) but we still have to provide an output. - if not seq_group_meta.do_sample: - completion_seq_group_output_list.append( - CompletionSequenceGroupOutput( - samples=[], prompt_logprobs=prompt_logprobs)) - continue - - # Sequence with output. - completion_seq_group_output_list.append( - create_sequence_group_output( - token_id=sampled_token_ids_list[output_index][0], - token_id_logprob_rank=-1, - token_id_logprob=0.0, - seq_id=seq_id, - topk_token_ids=[], - topk_logprobs=[], - prompt_logprobs=prompt_logprobs)) - output_index += 1 - - return [SamplerOutput(outputs=completion_seq_group_output_list)] - - @nvtx_range("spec_decode_worker._run_no_spec") - def _run_no_spec(self, execute_model_req: ExecuteModelRequest, - skip_proposer: bool) -> List[SamplerOutput]: - """Run a single generation step without any speculation. The input is - sent to the proposer and scorer model so that the KV cache is consistent - between the two. When skip_proposer is True, the proposer model is - not called, meaning that the kv-cache in proposer for requests is not - updated, so they cannot enable spec decode in the rest decoding. - """ - - sampler_output = self.scorer_worker.execute_model(execute_model_req) - assert len(sampler_output) == 1 - sampler_output = sampler_output[0] - - # Store hidden states from target model execution, BxD. - hidden_states = sampler_output.hidden_states - if hidden_states is not None: - # Only decodes and prefill terminal chunks need a hidden state. - seq_group_meta_with_hidden = [ - sg for sg in execute_model_req.seq_group_metadata_list - if sg.do_sample - ] - if any(seq.is_prompt for seq in seq_group_meta_with_hidden): - # Drop hidden_states with no prediction (eg non-terminal chunks) - hidden_states = hidden_states[ - torch.where(sampler_output.sampled_token_ids - - VLLM_INVALID_TOKEN_ID)[0]] - if self.previous_hidden_states is None and len( - seq_group_meta_with_hidden): - self.previous_hidden_states = HiddenStates( - hidden_states, seq_group_meta_with_hidden) - elif self.previous_hidden_states and len( - seq_group_meta_with_hidden): - self.previous_hidden_states.update(hidden_states, - seq_group_meta_with_hidden) - self.previous_hidden_states.prune(seq_group_meta_with_hidden) - - if not skip_proposer: - # We prepare the prefill hidden states here so that there no - # additional complexity in worker for spec_decode vs non_spec_decode - # flow and execute_model doesn't need additional modifications. 
- execute_model_req.previous_hidden_states = \ - prepare_prefill_hidden_states( - sampler_output.prefill_hidden_states) - for i in range(self._num_spec_prefill_steps): - execute_model_req.spec_step_idx = i - self.proposer_worker.execute_model(execute_model_req) - - sampler_output_to_return = (self._serialize_sampler_output_no_logprobs( - execute_model_req=execute_model_req, sampler_output=sampler_output) - if self._disable_logprobs else - [sampler_output]) - - # Clear device tensors from sampler output. This reduces communication - # overhead when the engine runs in a different process than the workers. - sampler_output.sampled_token_probs = None - sampler_output.sampled_token_ids = None - sampler_output.logprobs = None - return sampler_output_to_return - - def _run_non_driver_rank(self) -> bool: - """Run proposer and verifier model in non-driver workers. This is used - for both speculation cases (num_lookahead_slots>0) and non-speculation - cases (e.g. prefill). - - Returns True if there are remaining sequences to process. - """ - assert self.rank != self._driver_rank - - data = broadcast_tensor_dict(src=self._driver_rank) - if not data: - return False - num_lookahead_slots = data["num_lookahead_slots"] - - # In case of prefill, scorer_worker has to be run before proposer so - # that the hidden states can be propagated to proposer when needed. - if data["no_spec"]: - self.scorer_worker.execute_model() - - if not data["disable_all_speculation"]: - # Even if num_lookahead_slots is zero, we want to run the - # proposer model as it may have KV. - # - # We run the proposer once per lookahead slot. In the future we - # should delegate how many times it runs to the proposer. - for _ in range(max(num_lookahead_slots, 1)): - self.proposer_worker.execute_model() - - if not data["no_spec"]: - self.scorer_worker.execute_model() - if data["run_spec_proposer_for_prefill"]: - self.proposer_worker.execute_model() - - return True - - @nvtx_range("spec_decode_worker._run_speculative_decoding_step") - def _run_speculative_decoding_step( - self, execute_model_req: ExecuteModelRequest, - num_lookahead_slots: int) -> List[SamplerOutput]: - """Execute a single step of speculative decoding. - - This invokes the proposer worker to get k speculative tokens for each - sequence, then scores each speculative token using the scoring worker. - - When `enable_chunked_prefill` is set, scorer will batch decodes and - prefills, while proposer will sync its KV-cache by running an extra - forward on prefills. - - Returns a list of SamplerOutput, each containing a single token per - sequence. - """ - # With prefill chunking, expect requests to have prompts first - # so that backend gets prefill|decode. - assert num_lookahead_slots == execute_model_req.num_lookahead_slots - - # Pass last hidden states from target model to proposer - execute_model_req.previous_hidden_states = self.previous_hidden_states - self.previous_hidden_states = None - - with Timer() as proposal_timer: - # Generate proposals using draft worker. 
- proposals = self.proposer_worker.get_spec_proposals( - execute_model_req, self._seq_with_bonus_token_in_last_step) - - if not self._allow_zero_draft_token_step and proposals.no_proposals: - #TODO: Fix it #5814 - raise RuntimeError("Cannot handle cases where distributed draft " - "workers generate no tokens") - - execute_model_req.previous_hidden_states = None - - with Timer() as scoring_timer: - proposal_scores = self.scorer.score_proposals( - execute_model_req, - proposals, - ) - - _, (non_spec_seqs, non_spec_indices) = split_batch_by_proposal_len( - execute_model_req.seq_group_metadata_list, proposals.proposal_lens) - # With prefill chunking enabled, `non_spec_seqs` contains prefills too: - # discard decodes that have already been processed by proposer. - non_spec_indices = [ - idx for idx in non_spec_indices - if execute_model_req.seq_group_metadata_list[idx].is_prompt - ] - if len(non_spec_indices): - all_hidden_states = proposal_scores.hidden_states - if all_hidden_states is not None: - prefill_hidden_states = all_hidden_states[non_spec_indices] - execute_model_req.previous_hidden_states = \ - prepare_prefill_hidden_states(prefill_hidden_states) - # Sync proposer KV cache for prefills. - prefill_req = execute_model_req.clone(non_spec_seqs) - # TODO avoid sampling here? - self.proposer_worker.execute_model(prefill_req) - - with Timer() as verification_timer: - accepted_token_ids, target_logprobs = self._verify_tokens( - execute_model_req.seq_group_metadata_list, proposal_scores, - proposals, execute_model_req.num_lookahead_slots) - - stage_times = (proposal_timer.elapsed_time_ms / num_lookahead_slots, - scoring_timer.elapsed_time_ms, - verification_timer.elapsed_time_ms) - - return self._create_output_sampler_list( - execute_model_req.seq_group_metadata_list, - accepted_token_ids, - target_logprobs=target_logprobs, - prompt_logprobs=proposal_scores.prompt_logprobs - if not self._disable_logprobs else None, - k=execute_model_req.num_lookahead_slots, - stage_times=stage_times) - - @nvtx_range("spec_decode_worker._verify_tokens") - def _verify_tokens( - self, - seq_group_metadata_list: List[SequenceGroupMetadata], - proposal_scores: SpeculativeScores, - proposals: SpeculativeProposals, - max_proposal_len: int, - ) -> Tuple[torch.Tensor, torch.Tensor]: - """Determine which speculative tokens are accepted using the - probabilities of each token according to the proposer and scorer models. - - Returns a tuple of Tensors, one for the accepted token ids and one for - the logprobs according to the scoring model. - """ - proposal_lens_list = proposals.proposal_lens.tolist() - - # vLLM currently only supports proposal lens equal to zero or the batch - # proposal len. This adds some complexity (splitting the batch into spec - # and non spec sequences) and should be removed in the future. It can be - # done by supporting per-sequence proposal lens. - (_, spec_indices), (_, non_spec_indices) = split_batch_by_proposal_len( - seq_group_metadata_list, proposal_lens_list) - original_indices = spec_indices + non_spec_indices - - # Get probabilities of target model, including bonus tokens. - proposal_verifier_probs = proposal_scores.probs[spec_indices] - - # Get non-speculative sampled tokens from target model. - non_spec_token_ids = proposal_scores.token_ids[non_spec_indices] - - # Get bonus tokens from target model. - bonus_token_ids = proposal_scores.token_ids[spec_indices, -1:] - - # Get probabilities according to proposal method. 
- proposal_probs = proposals.proposal_probs[spec_indices] - - # Get proposed tokens. - proposal_token_ids = proposals.proposal_token_ids[spec_indices] - - # Sampler arguments - sampler_extra_kwargs: Dict[str, Any] = {} - if self.generators and isinstance(self.spec_decode_sampler, - SpecDecodeStochasticBaseSampler): - sampler_extra_kwargs["seeded_seqs"] = { - idx: self.generators[sgm.request_id] - for idx, sgm in enumerate(seq_group_metadata_list) - if sgm.sampling_params.seed is not None - } - - accepted_token_ids = self.spec_decode_sampler( - target_with_bonus_probs=proposal_verifier_probs, - bonus_token_ids=bonus_token_ids, - draft_probs=proposal_probs, - draft_token_ids=proposal_token_ids, - **sampler_extra_kwargs, - ) - # Append output tokens from non-speculative sequences to - # the accepted token ids tensor. - non_spec_token_ids = non_spec_token_ids.expand(-1, max_proposal_len + - 1).clone() - non_spec_token_ids[:, 1:] = -1 - accepted_token_ids = torch.cat( - [accepted_token_ids, non_spec_token_ids]) - logprobs = proposal_scores.logprobs - # Rearrange so that results are in the order of the original seq group - # metadata. - accepted_token_ids[original_indices] = accepted_token_ids.clone() - - # B x K+1 x D - hidden_states = proposal_scores.hidden_states - if hidden_states is not None: - # Only get terminal hidden states for next step - terminal_metadata = [ - sg for sg in seq_group_metadata_list if sg.do_sample - ] - - # Contract hidden states based on accepted tokens - hs_size = hidden_states.shape[-1] - accepted_index = accepted_token_ids + 1 # Convert -1 to 0 - accepted_index = accepted_index.count_nonzero(dim=1).add_(-1) # b - # Drop non-terminal prefill chunks hidden states. - hidden_states = hidden_states[accepted_index != - VLLM_INVALID_TOKEN_ID] - accepted_index = accepted_index[accepted_index != - VLLM_INVALID_TOKEN_ID] - assert len(accepted_index) == hidden_states.shape[0] == len( - terminal_metadata) - index = accepted_index[:, None, None].expand(-1, 1, - hs_size) # b x 1 x d - second_last_token_hidden_states = hidden_states[:, -2] # b x d - hidden_states = hidden_states.gather(1, index).squeeze(1) # b x d - # Store hidden states from target model for subsequent decode step - self.previous_hidden_states = HiddenStates( - hidden_states, terminal_metadata, - second_last_token_hidden_states) - return accepted_token_ids, logprobs - - def _create_output_sampler_list( - self, - seq_group_metadata_list: List[SequenceGroupMetadata], - accepted_token_ids: torch.Tensor, # shape: [batch_size, k+1] - target_logprobs: torch.Tensor, # shape: [batch_size, k+1, vocab_size] - prompt_logprobs: Optional[ - torch.Tensor], # shape: [nprompt_tokens, vocab_size] - k: int, - stage_times: Tuple[float, float, float], - ) -> List[SamplerOutput]: - """Given the accepted token ids, create a list of SamplerOutput. - - The output is padded with -1 tokens such that each sequence has - the same number of outputs. - """ - batch_size, num_steps = accepted_token_ids.shape - accepted_token_ids_by_step = accepted_token_ids.transpose(0, 1) - if self._disable_logprobs: - # We are skipping the logprobs. Hence don't serialize the - # logprobs related tensors from the GPU. Instead create - # empty/dummy lists. - (accepted_token_id_ranks_by_step, - accepted_token_id_logprobs_by_step, - topk_logprobs_by_step, topk_indices_by_step) =\ - self._create_dummy_logprob_lists( - batch_size, num_steps, - self.scorer_worker.model_config.max_logprobs) - else: - # Organize input tensors by step instead of by sequence. 
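_verify_tokens hands the actual accept/reject decision to the configured sampler. For orientation, the standard per-token rule from speculative decoding that rejection-sampling verification is built around accepts a draft token t with probability min(1, p_target(t) / p_draft(t)). A toy single-token sketch (hypothetical function, not the removed RejectionSampler):

from typing import Optional
import torch

def accept_draft_token(p_target: torch.Tensor, p_draft: torch.Tensor,
                       draft_token: int,
                       generator: Optional[torch.Generator] = None) -> bool:
    """Accept with probability min(1, p_target[t] / p_draft[t])."""
    ratio = float(p_target[draft_token]) / float(p_draft[draft_token])
    u = torch.rand(1, generator=generator).item()
    return u < min(1.0, ratio)

p_draft = torch.tensor([0.5, 0.5])   # draft model distribution over a 2-token vocab
p_target = torch.tensor([0.9, 0.1])  # target model distribution
print(accept_draft_token(p_target, p_draft, draft_token=0))  # always accepted here

On rejection, the full scheme resamples from the normalized residual max(0, p_target - p_draft), which is what makes the overall output distribution match the target model exactly.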
- target_logprobs_by_step = target_logprobs.transpose(0, 1) - # Serialize all tensors into Python lists. - (accepted_token_id_ranks_by_step, - accepted_token_id_logprobs_by_step, - topk_logprobs_by_step, topk_indices_by_step) =\ - self._create_logprob_lists_from_tensors( - target_logprobs_by_step, accepted_token_ids_by_step, - self.scorer_worker.model_config.max_logprobs) - - # Get the sequence ids and num_logprobs (sampling parameter) in the - # batch. - seq_ids, request_ids_seq_ids_mapping = get_all_seq_ids_and_request_ids( - seq_group_metadata_list) - - num_logprobs_per_seq = get_all_num_logprobs(seq_group_metadata_list) - - # Serialize tensor to CPU Python list. - accepted_token_ids_by_step = accepted_token_ids_by_step.tolist() - - # Construct the output on a per-step, per-sequence basis. - # Non-terminal prefill chunks will end up here as rows with just -1s - # i.e mixed-batch [[-1, 1576], [-1, 29884], [-1, -1], [-1, -1]] while - # terminal chunks will only have one generated token at time 0. - sampler_output_list: List[SamplerOutput] = [] - - # Prefills are not multi-step (return at most 1 token), in order to - # avoid padding or repetition to fit decodes, we separate them. - for i, sg in enumerate(seq_group_metadata_list): - if not sg.is_prompt: - # Requests are ordered as prefills|decodes=>no more prefills. - break - num_logprobs = num_logprobs_per_seq[i] - seq_kwargs = dict(token_id=-1, - token_id_logprob_rank=0, - token_id_logprob=-float('inf'), - topk_token_ids=[-1] * num_logprobs, - topk_logprobs=[-float('inf')] * num_logprobs, - seq_id=seq_ids[i]) - # Terminal chunk, has token. - if sg.do_sample: - seq_kwargs.update( - dict( - token_id=accepted_token_ids[i][0].item(), - token_id_logprob_rank=accepted_token_id_ranks_by_step[ - 0][i], - token_id_logprob=accepted_token_id_logprobs_by_step[0] - [i], - topk_token_ids=topk_indices_by_step[0][i] - [:num_logprobs], - # output only so step is 0 - topk_logprobs=topk_logprobs_by_step[0][i] - [:num_logprobs], - )) - needs_plogs = (sg.sampling_params.prompt_logprobs - and sg.sampling_params.prompt_logprobs > 0) - plogs = None - if prompt_logprobs is not None: - # Even non-terminal prompt chunks can have logprobs here. - plogs = prompt_logprobs[i] - elif needs_plogs: - # Prompt logprobs are requested but `_disable_logprobs` is set. - seq_data = next(iter(sg.seq_data.values())) - # Get only the tokens in this chunk! - prompt_token_ids = seq_data.get_prompt_token_ids() - prompt_token_ids = prompt_token_ids[ - seq_data. - _num_computed_tokens:seq_data._num_computed_tokens + - sg.token_chunk_size] - - is_first_chunk = seq_data._num_computed_tokens == 0 - # There's no prob generated for the first token in a sequence. - if is_first_chunk: - prompt_token_ids = prompt_token_ids[1:] - plogs = [ - create_logprobs_output( - token_id=p_token_id, - token_id_logprob_rank=-1, - token_id_logprob=0.0, - topk_token_ids=[], - topk_logprobs=[], - ) for p_token_id in prompt_token_ids - ] - seq_kwargs.update(dict(prompt_logprobs=plogs)) - - sampler_output_list.append( - SamplerOutput( - outputs=[create_sequence_group_output( - **seq_kwargs)])) # type: ignore - - # Decodes, create one SamplerOutput per-step (at most K+1). 
- for step_index in range(num_steps): - if all(token_id == -1 for sg, token_id in zip( - seq_group_metadata_list, - accepted_token_ids_by_step[step_index]) - if not sg.is_prompt): - break - - step_output_token_ids: List[CompletionSequenceGroupOutput] = [] - for sequence_index in range(batch_size): - seq_meta = seq_group_metadata_list[sequence_index] - # Prompts already processed above. - if seq_meta.is_prompt: - continue - - # Each sequence may have a different num_logprobs; retrieve it. - num_logprobs = num_logprobs_per_seq[sequence_index] - step_output_token_ids.append( - create_sequence_group_output( - token_id=accepted_token_ids_by_step[step_index] - [sequence_index], - token_id_logprob_rank=accepted_token_id_ranks_by_step[ - step_index][sequence_index], - token_id_logprob=accepted_token_id_logprobs_by_step[ - step_index][sequence_index], - seq_id=seq_ids[sequence_index], - topk_token_ids=topk_indices_by_step[step_index] - [sequence_index][:num_logprobs], - topk_logprobs=topk_logprobs_by_step[step_index] - [sequence_index][:num_logprobs], - step_index=step_index)) - sampler_output_list.append( - SamplerOutput(outputs=step_output_token_ids)) - - # Populate the data structures needed to keep track of sequences with - # bonus tokens. - self._track_sequences_with_bonus_tokens(seq_ids, - request_ids_seq_ids_mapping, - accepted_token_ids_by_step) - maybe_rejsample_metrics = ( - self._metrics.maybe_collect_rejsample_metrics(k)) - if maybe_rejsample_metrics is not None: - sampler_output_list[ - 0].spec_decode_worker_metrics = maybe_rejsample_metrics - - # Log time spent in each stage periodically. - # This is periodic because the rejection sampler emits metrics - # periodically. - self._maybe_log_stage_times(*stage_times) - # First `n_prefills` entries will contain prefills SamplerOutput when - # chunked prefill is enabled, the rest is decodes in multi-step format. - return sampler_output_list - - def _maybe_log_stage_times(self, average_time_per_proposal_tok_ms: float, - scoring_time_ms: float, - verification_time_ms: float) -> None: - """Log the speculative stage times. If stat logging is disabled, do - nothing. - """ - if self._disable_log_stats: - return - - logger.info( - "SpecDecodeWorker stage times: " - "average_time_per_proposal_tok_ms=%.02f " - "scoring_time_ms=%.02f verification_time_ms=%.02f", - average_time_per_proposal_tok_ms, scoring_time_ms, - verification_time_ms) - - def _create_dummy_logprob_lists( - self, - batch_size: int, - num_steps: int, - num_top_k: int, - ) -> Tuple[List[List[int]], List[List[float]], - List[List[List[Optional[float]]]], - List[List[List[Optional[int]]]]]: - """ - Creates and returns four dummy lists representing token probabilities - and their ranks. - - This method initializes and returns: - - The ranks of the accepted tokens, shaped (num_steps, batch_size) - - The log probabilities of the accepted tokens, - shaped (num_steps, batch_size) - - The log probabilities of the top k tokens, - shaped (num_steps, batch_size, num_top_k) - - The token IDs of the top k tokens, - shaped (num_steps, batch_size, num_top_k) - - Args: - batch_size (int): The size of the batch. - num_steps (int): The number of steps in the sequence. - num_top_k (int): The number of top-k token log probabilities to - return. - - Returns: - A tuple containing four dummy lists as described above. 
- """ - accepted_token_id_ranks_by_step = [[-1] * batch_size - for _ in range(num_steps)] - accepted_token_id_logprobs_by_step = [[0.0] * batch_size - for _ in range(num_steps)] - topk_logprobs_by_step: List[List[List[Optional[float]]]] = [[ - [None] * num_top_k for _ in range(batch_size) - ] for _ in range(num_steps)] - topk_indices_by_step: List[List[List[Optional[int]]]] = [[ - [None] * num_top_k for _ in range(batch_size) - ] for _ in range(num_steps)] - return (accepted_token_id_ranks_by_step, - accepted_token_id_logprobs_by_step, topk_logprobs_by_step, - topk_indices_by_step) - - def _create_logprob_lists_from_tensors( - self, - target_logprobs_by_step: torch.Tensor, - accepted_token_ids_by_step: torch.Tensor, - num_top_k: int, - ) -> Tuple[List[List[int]], List[List[float]], - List[List[List[Optional[float]]]], - List[List[List[Optional[int]]]]]: - """ - Creates and returns four lists representing token probabilities and - their ranks. - - This method initializes and returns four lists containing: - - The ranks of the accepted tokens, shaped (num_steps, batch_size) - - The log probabilities of the accepted tokens, - shaped (num_steps, batch_size) - - The log probabilities of the top k tokens, - shaped (num_steps, batch_size, num_top_k) - - The token IDs of the top k tokens, - shaped (num_steps, batch_size, num_top_k) - - Args: - target_logprobs_by_step (torch.Tensor): Tensor representing the - log probabilities of the target model, - shaped (num_steps, batch_size, vocab_size) - accepted_token_ids_by_step (torch.Tensor): Tensor representing - the accepted token_ids, shaped (num_steps, batch_size) - num_top_k (int): The number of top-k token log probabilities to - return. - - Returns: - A tuple containing the lists as described above. - """ - # Serialize all tensors to CPU Python lists. - # Get the logprobs/rank of the accepted tokens. - (accepted_token_id_ranks_by_step_tensor, - accepted_token_id_logprobs_by_step_tensor - ) = get_sampled_token_logprobs( - logprob_tensor=target_logprobs_by_step, - sampled_token_ids=accepted_token_ids_by_step, - ) - # Get the top-k logprobs (which may or may not include the - # logprob of the accepted token). - (topk_logprobs_by_step_tensor, - topk_indices_by_step_tensor) = target_logprobs_by_step.topk( - k=num_top_k, - dim=-1, - ) - accepted_token_id_ranks_by_step = ( - accepted_token_id_ranks_by_step_tensor.tolist()) - accepted_token_id_logprobs_by_step = ( - accepted_token_id_logprobs_by_step_tensor.tolist()) - topk_logprobs_by_step = topk_logprobs_by_step_tensor.tolist() - topk_indices_by_step = topk_indices_by_step_tensor.tolist() - return (accepted_token_id_ranks_by_step, - accepted_token_id_logprobs_by_step, topk_logprobs_by_step, - topk_indices_by_step) - - def _track_finished_requests(self, execute_model_req: ExecuteModelRequest): - """ - Removes the finished requests and their associated sequence ids from - internal book keeping data structures. - """ - for finished_request in execute_model_req.finished_requests_ids: - for seq_id in self._request_id_seq_id_mapping[finished_request]: - self._seq_with_bonus_token_in_last_step.discard(seq_id) - del self._request_id_seq_id_mapping[finished_request] - - def _track_sequences_with_bonus_tokens( - self, seq_ids: List[int], - request_ids_seq_ids_mapping: Dict[str, Set[int]], - accepted_token_ids_by_step: List[List[int]]): - """ - Updates the internal data structures which keep track of sequences - which have been assigned bonus tokens in their last forward pass. 
- """ - for seq_index, seq_id in enumerate(seq_ids): - last_token_id = accepted_token_ids_by_step[-1][seq_index] - if last_token_id == -1: - self._seq_with_bonus_token_in_last_step.discard(seq_id) - else: - self._seq_with_bonus_token_in_last_step.add(seq_id) - for request_id, sequences in request_ids_seq_ids_mapping.items(): - self._request_id_seq_id_mapping[request_id].update(sequences) - - @cached_property - def _vocab_size(self) -> int: - """Get the vocab size of the model and make sure it's consistent between - draft and target workers. - """ - vocab_sizes = [ - worker.vocab_size - for worker in [self.proposer_worker, self.scorer_worker] - ] - assert all(vocab_sizes[0] == vocab_size for vocab_size in vocab_sizes) - return vocab_sizes[0] - - @property - def rank(self): - return self.scorer_worker.rank - - @property - def device(self): - return self.scorer_worker.device - - @property - def _driver_rank(self) -> int: - return 0 - - def get_cache_block_size_bytes(self): - """Return the size of a cache block in bytes. - - This function is only used to compose workers within a SpecDecodeWorker. - We leave composing a SpecDecodeWorker within a SpecDecodeWorker - undefined for now, although it could be implemented in the future. - See https://arxiv.org/abs/2308.04623. - """ - raise NotImplementedError - - def start_profile(self): - if isinstance(self.scorer_worker, WorkerBase): - self.scorer_worker.start_profile() - - def stop_profile(self): - if isinstance(self.scorer_worker, WorkerBase): - self.scorer_worker.stop_profile() - - -def split_num_cache_blocks_evenly(scorer_cache_block_size_bytes: int, - proposer_cache_block_size_bytes: int, - total_num_gpu_blocks: int) -> int: - """Given total_num_gpu_blocks, the number of GPU blocks that could be - allocate to the target model, this function calculates how many blocks - should be given to the draft and target model. - - Note that usually the block size, in bytes, of each model is different, - as it's a function of number of KV/layer, number of heads, and hidden - dimension size. - - Since the target and draft models allocate the same number of blocks, we - simply calculate the number of blocks where if allocated by both models, - the total memory usage from KV cache is no larger than the number of - blocks allocatable by the target model alone. - """ - new_num_gpu_blocks = int( - total_num_gpu_blocks * scorer_cache_block_size_bytes / - (proposer_cache_block_size_bytes + scorer_cache_block_size_bytes)) - - return new_num_gpu_blocks - - -def prepare_prefill_hidden_states( - prefill_hidden_states: torch.Tensor) -> HiddenStates: - # For prefill step in proposer, we run the model for N-1 tokens - # because Nth token will be processed in the first decode step. For - # N-1 tokens, the input should be 0:N-1 hidden states which should - # be concatanated with 1:N token (since output of scorer has to be - # the input for proposer). Therefore, we shift the hidden states to - # align n-1th hidden state with nth token. 
- return HiddenStates(prefill_hidden_states.roll( - shifts=1, dims=0)) if prefill_hidden_states is not None else None diff --git a/vllm/spec_decode/target_model_runner.py b/vllm/spec_decode/target_model_runner.py deleted file mode 100644 index ca89eb60ac583..0000000000000 --- a/vllm/spec_decode/target_model_runner.py +++ /dev/null @@ -1,45 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -from typing import List, Optional - -from vllm.sequence import SequenceGroupMetadata -from vllm.worker.model_runner_base import (ModelRunnerBase, - ModelRunnerInputBase, - ModelRunnerWrapperBase) - - -class TargetModelRunner(ModelRunnerWrapperBase): - """Specialized model runner for speculative decoding target model. - In speculative decoding, the log probabilities selected finally may not - be the same ones as selected by the target model sampling. This means - that the time spent in the log probability calculation of the target model - is time wasted, since we calculate log probabilities after deciding which - tokens are accepted. For this reason disabling log probabilities in the - target model will make decode faster. The model runner sets the - SamplingMetadata parameters according to whether log probabilities are - requested or not. - """ - - def __init__(self, model_runner: ModelRunnerBase): - # An internal boolean member variable to indicate if token log - # probabilities are needed or not. - super().__init__(model_runner) - self.disable_logprobs = True - - def prepare_model_input( - self, - seq_group_metadata_list: List[SequenceGroupMetadata], - virtual_engine: int = 0, - finished_requests_ids: Optional[List[str]] = None, - ) -> ModelRunnerInputBase: - model_input: ModelRunnerInputBase =\ - self.model_runner.prepare_model_input( - seq_group_metadata_list, virtual_engine, finished_requests_ids) - # If token log probabilities is disabled then skip generating sampler - # CPU output. We directly serialize the GPU sampled_token_id tensors - # as needed. If log probabilities is enabled then synchronize all the - # sampling related tensors which includes the logprobs tensors. - model_input.sampling_metadata.skip_sampler_cpu_output = ( - self.disable_logprobs) - return model_input diff --git a/vllm/spec_decode/top1_proposer.py b/vllm/spec_decode/top1_proposer.py deleted file mode 100644 index afd91b42b9433..0000000000000 --- a/vllm/spec_decode/top1_proposer.py +++ /dev/null @@ -1,275 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -from typing import List, Optional, Set, Tuple - -import torch - -from vllm.model_executor.layers.sampler import SamplerOutput -from vllm.sequence import ExecuteModelRequest, SequenceGroupMetadata -from vllm.spec_decode.interfaces import (SpeculativeProposals, - SpeculativeProposer) -from vllm.spec_decode.proposer_worker_base import ProposerWorkerBase -from vllm.spec_decode.util import sampler_output_to_torch - - -class Top1Proposer(SpeculativeProposer): - """Helper class which separates out sequences which would exceed the max - model length when speculated upon. - - This allows combinations of models such as JackFram/llama-68m draft with - meta-llama/Llama2-13b-chat-hf, as llama-68m has max_position_embeddings of - 2048 while Llama2-13b has max_position_embeddings of 4096. - - We treat the sequences which exceed the proposal draft model length as - "non-spec sequences". 
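A minimal illustration of the shift performed by `prepare_prefill_hidden_states` above: rolling along the token dimension by one aligns hidden state n-1 with token n, so the scorer's outputs line up as proposer inputs.

```python
import torch

hidden = torch.arange(4).float().unsqueeze(-1)  # stand-in rows h0..h3
shifted = hidden.roll(shifts=1, dims=0)
# rows are now [h3, h0, h1, h2]: position i holds the hidden state of the
# previous token, which is what the proposer consumes for token i.
```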
Essentially they skip the draft model and go through - normal decoding in the target model. - - Currently, only proposal_lens of 0 and k are supported, where k is a global - batch proposal length. In the future vLLM should support per-sequence - proposal lengths. - """ - - def __init__( - self, - worker: ProposerWorkerBase, - device: str, - vocab_size: int, - max_proposal_len: Optional[int] = None, - ): - self._worker = worker - self._device = device - self.max_proposal_len = max_proposal_len - self._vocab_size = vocab_size - - def get_spec_proposals( - self, - execute_model_req: ExecuteModelRequest, - seq_ids_with_bonus_token_in_last_step: Set[int], - ) -> SpeculativeProposals: - """Get speculative proposals given the input batch. - - Sequences which would exceed the max model length are skipped during - speculation. - """ - proposal_len = execute_model_req.num_lookahead_slots - seq_group_metadata_list = execute_model_req.seq_group_metadata_list - - # Split speculative- and non-speculative- sequences. - ( - proposal_lens, - nonzero_proposal_len_seqs, - nonzero_proposal_len_indices, - ) = self._split_by_proposal_len(seq_group_metadata_list, proposal_len) - - if nonzero_proposal_len_seqs: - # Speculate tokens using the draft worker for the speculative - # sequences. - # If sampler_transposed is true, then maybe_sampler_output's - # token_ids is like [batch] format in proposal_len size list, - # while if it is false, the format would be [proposal_len] - # in batch size list - hidden_states = execute_model_req.previous_hidden_states - if hidden_states is not None: - hidden_states.prune(nonzero_proposal_len_seqs) - nonzero_execute_model_req = ExecuteModelRequest( - seq_group_metadata_list=nonzero_proposal_len_seqs, - num_lookahead_slots=proposal_len, - previous_hidden_states=hidden_states, - ) - maybe_sampler_output, transposed = self._worker.sampler_output( - execute_model_req=nonzero_execute_model_req, - sample_len=proposal_len, - seq_ids_with_bonus_token_in_last_step=\ - seq_ids_with_bonus_token_in_last_step, - ) - ( - proposal_lens, - maybe_sampler_output, - nonzero_proposal_len_indices, - ) = self._remove_no_proposal_seqs(proposal_lens, - maybe_sampler_output, - nonzero_proposal_len_indices, - transposed) - else: - # If no sequences can be speculated, set sampler output to None. - maybe_sampler_output = None - transposed = False - - # Combine speculative- and non-speculative sequences into the same - # representation. - proposal_tokens, proposal_probs, proposal_lens = self._merge_outputs( - batch_size=len(seq_group_metadata_list), - proposal_len=proposal_len, - maybe_sampler_output=maybe_sampler_output, - proposal_lens=proposal_lens, - nonzero_proposal_len_indices=nonzero_proposal_len_indices, - sampler_transposed=transposed, - ) - - proposals = SpeculativeProposals(proposal_token_ids=proposal_tokens, - proposal_probs=proposal_probs, - proposal_lens=proposal_lens, - no_proposals=maybe_sampler_output - is None) - return proposals - - def _split_by_proposal_len( - self, - seq_group_metadata_list: List[SequenceGroupMetadata], - proposal_len: int, - ) -> Tuple[List[int], List[SequenceGroupMetadata], List[int]]: - """Split sequences by two groups: - 1. Sequences with non-zero proposal length. - 2. Sequences with zero proposal length (due to disabled speculation - or exceed the maximum model length). 
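The length check in `_split_by_proposal_len` is what lets a short-context draft (the JackFram/llama-68m example above, with 2048 positions) pair with a longer-context target: sequences that would overflow the draft window simply get a proposal length of 0. A quick, standalone restatement of that check:

```python
max_proposal_len = 2048   # draft model's position limit (from the docstring)
proposal_len = 10

for seq_len in (1000, 2040):
    new_k = proposal_len if seq_len + proposal_len < max_proposal_len else 0
    print(seq_len, "->", new_k)
# 1000 -> 10   (speculated normally)
# 2040 -> 0    (skips the draft model; decoded by the target only)
```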
- """ - - proposal_lens: List[int] = [] - nonzero_proposal_len_seqs: List[SequenceGroupMetadata] = [] - nonzero_proposal_len_indices: List[int] = [] - for i, seq_group_metadata in enumerate(seq_group_metadata_list): - # The speculative decoding for this request has either been disabled - # (e.g. due to high traffic) or this is a prompt request. - if (seq_group_metadata.is_prompt - or seq_group_metadata.num_speculative_tokens == 0): - proposal_lens.append(0) - continue - - seq_data = next(iter(seq_group_metadata.seq_data.values())) - seq_len = seq_data.get_len() - - # Currently only proposal lens of 0 or the global batch proposal len - # are supported. - # If max_proposal_len is defined, then we shall not exceed this - # quota for nonzero_proposal - new_k = 0 - if (self.max_proposal_len is None - or seq_len + proposal_len < self.max_proposal_len): - new_k = proposal_len - nonzero_proposal_len_seqs.append(seq_group_metadata) - nonzero_proposal_len_indices.append(i) - proposal_lens.append(new_k) - seq_group_metadata.num_speculative_tokens = new_k - - return ( - proposal_lens, - nonzero_proposal_len_seqs, - nonzero_proposal_len_indices, - ) - - @staticmethod - def _remove_no_proposal_seqs(proposal_lens, maybe_sampler_output, - nonzero_proposal_len_indices, transposed): - """Remove sequences from nonzero_proposal_len_indices and reset - their proposal_len to 0 the draft worker does not provide a proposal - (maybe_sampler_output=None). This can avoid scoring overheads. - """ - - # If maybe_sampler_output is None, then the draft worker did not - # provide a proposal for any sequence and thus no action needed. - # Also we do not support transposed maybe_sampler_output for now - # because it seems not straightforward for draft workers outputting - # transposed sampler outputs to handle the case of no proposal. - if maybe_sampler_output is None or transposed: - return (proposal_lens, maybe_sampler_output, - nonzero_proposal_len_indices) - - new_proposal_lens: List[int] = [] - new_nonzero_proposal_len_indices: List[int] = [] - new_maybe_sampler_output: List[SamplerOutput] = [] - nonzero_proposal_len_idx_ptr = 0 - seq_idx = 0 - while seq_idx < len( - proposal_lens) and nonzero_proposal_len_idx_ptr < len( - nonzero_proposal_len_indices): - if seq_idx < nonzero_proposal_len_indices[ - nonzero_proposal_len_idx_ptr]: - # Sequence is not in the original nonzero_proposal_len_indices, - # meaning that it has a proposal length of 0 before sending to - # the draft worker. - assert proposal_lens[seq_idx] == 0 - new_proposal_lens.append(0) - else: - # Sequence is in the original nonzero_proposal_len_indices - if maybe_sampler_output[nonzero_proposal_len_idx_ptr] is None: - # but does not have a proposal from the draft worker. - new_proposal_lens.append(0) - else: - # and has a proposal from the draft worker. Add it to the - # new nonzero proposal list and keep the sampler output. - new_proposal_lens.append(proposal_lens[seq_idx]) - new_nonzero_proposal_len_indices.append(seq_idx) - new_maybe_sampler_output.append( - maybe_sampler_output[nonzero_proposal_len_idx_ptr]) - nonzero_proposal_len_idx_ptr += 1 - seq_idx += 1 - - # The remaining sequences should have proposal length of 0. - new_proposal_lens.extend(proposal_lens[seq_idx:]) - - # We assume sampler_output will not be a list of all Nones. - # In this case this function should not be called. 
- assert new_maybe_sampler_output - return (new_proposal_lens, new_maybe_sampler_output, - new_nonzero_proposal_len_indices) - - def _merge_outputs( - self, - batch_size: int, - proposal_len: int, - maybe_sampler_output: Optional[List[SamplerOutput]], - proposal_lens: List[int], - nonzero_proposal_len_indices: List[int], - sampler_transposed: bool, - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - """After speculations are produced, merge the speculation results with - the skipped sequences. - """ - if maybe_sampler_output is None: - # If no speculative tokens, the sampler output will be None. - # In this case we return empty proposals. - proposal_tokens = torch.tensor(-1, - dtype=torch.long, - device=self._device).expand( - batch_size, proposal_len) - proposal_probs = torch.tensor(0, - dtype=torch.float32, - device=self._device).expand( - batch_size, proposal_len, - self._vocab_size) - proposal_lens_tensor = torch.tensor(0, - dtype=torch.long, - device=self._device).expand( - len(proposal_lens)) - return proposal_tokens, proposal_probs, proposal_lens_tensor - - sampler_output = maybe_sampler_output - proposal_tokens, proposal_probs, *_ = sampler_output_to_torch( - sampler_output, sampler_transposed) - - # Now, reformat the output GPU tensors such that each sequence has - # a proposal. the proposal can be empty, e.g. [-1, -1, -1] - - entire_proposal_tokens = proposal_tokens.new_full( - size=(batch_size, *proposal_tokens.shape[1:]), - fill_value=-1, - ) - entire_proposal_tokens[nonzero_proposal_len_indices] = proposal_tokens - entire_proposal_probs = proposal_probs.new_zeros( - batch_size, - *proposal_probs.shape[1:], - ) - entire_proposal_probs[nonzero_proposal_len_indices] = proposal_probs - - proposal_tokens, proposal_probs = ( - entire_proposal_tokens, - entire_proposal_probs, - ) - - proposal_lens_tensor = torch.zeros(batch_size, - dtype=torch.long, - device=self._device) - proposal_lens_tensor[nonzero_proposal_len_indices] = proposal_len - - return proposal_tokens, proposal_probs, proposal_lens_tensor diff --git a/vllm/spec_decode/util.py b/vllm/spec_decode/util.py deleted file mode 100644 index 22d2a4833acf9..0000000000000 --- a/vllm/spec_decode/util.py +++ /dev/null @@ -1,277 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -import time -from contextlib import contextmanager -from typing import Dict, List, Optional, Sequence, Tuple - -import torch - -from vllm.model_executor.layers.sampler import SamplerOutput -from vllm.platforms import current_platform -from vllm.sequence import (CompletionSequenceGroupOutput, Logprob, - PromptLogprobs, SequenceGroupMetadata, - SequenceOutput) - -SeqId = int - - -def get_all_num_logprobs( - seq_group_metadata_list: List[SequenceGroupMetadata]) -> List[int]: - """Given a list of SequenceGroupMetadata, create a list of all num_logprobs. - - If the sampling params do not call for any logprobs, return 0 for that - sequence. - """ - - all_num_logprobs: List[int] = [] - for seq_group_metadata in seq_group_metadata_list: - num_logprobs = seq_group_metadata.sampling_params.logprobs - if num_logprobs is None: - num_logprobs = 0 - all_num_logprobs.append(num_logprobs) - - return all_num_logprobs - - -def get_sampled_token_logprobs( - # shape [num_steps, batch_size, vocab_size] - logprob_tensor: torch.Tensor, - sampled_token_ids: torch.Tensor, # shape [num_steps, batch_size] -) -> Tuple[torch.Tensor, torch.Tensor]: - """Get the logprobs for the sampled tokens. 
Returns the ranks and logprobs. - """ - num_steps, batch_size, vocab_size = logprob_tensor.shape - - selected_logprobs = logprob_tensor[ - torch.arange(num_steps).unsqueeze(1), - torch.arange(batch_size), - sampled_token_ids, - ] - expanded_selected_logprobs = selected_logprobs.unsqueeze(-1).expand( - -1, -1, vocab_size) - sampled_token_ids_ranks = (logprob_tensor - > expanded_selected_logprobs).sum(-1).add_(1) - - return sampled_token_ids_ranks, selected_logprobs - - -def create_logprobs_output( - token_id: int, - token_id_logprob_rank: int, - token_id_logprob: float, - topk_token_ids: List[Optional[int]], - topk_logprobs: List[Optional[float]], -) -> Dict[int, Logprob]: - """Create a Logprob Dict for a token given the sampling results. - - Args: - token_id (int): The sampled token for the sequence. - token_id_logprob_rank (int): The logprob rank of the sampled token. - token_id_logprob (float): The logprob value of the sampled token. - topk_token_ids (List[Optional[int]]): The list of top-k token ids. - topk_logprobs (List[Optional[float]]): The list of top-k logprobs. - """ - # vLLM logprobs always include the sampled token. In addition, the user may - # request topk-logprobs (where top-k varies per user up to max_logprobs). - logprobs: Dict[int, Logprob] = { - token_id: Logprob( - logprob=token_id_logprob, - rank=token_id_logprob_rank, - ), - } - logprobs.update({ - topk_token_id: Logprob( - logprob=topk_logprob if topk_logprob is not None else 0.0, - rank=topk_index + 1, - ) - for topk_index, (topk_token_id, topk_logprob) \ - in enumerate(zip(topk_token_ids, topk_logprobs)) \ - if topk_token_id is not None - }) - - return logprobs - - -def create_sequence_group_output( - token_id: int, - token_id_logprob_rank: int, - token_id_logprob: float, - seq_id: SeqId, - topk_token_ids: List[Optional[int]], - topk_logprobs: List[Optional[float]], - prompt_logprobs: Optional[PromptLogprobs] = None, - step_index: Optional[int] = 0) -> CompletionSequenceGroupOutput: - """Create a SequenceGroupOutput given the sampling results. - - Args: - token_id (int): The sampled token for the sequence. - token_id_logprob_rank (int): The logprob rank of the sampled token. - token_id_logprob (float): The logprob value of the sampled token. - seq_id (int): The sequence id. - topk_token_ids (List[Optional[int]]): The list of top-k token ids. - topk_logprobs (List[Optional[float]]): The list of top-k logprobs. - step_index: (Optional[int]): The index of the speculative token. - """ - - logprobs = create_logprobs_output( - token_id, - token_id_logprob_rank, - token_id_logprob, - topk_token_ids, - topk_logprobs, - ) - - return CompletionSequenceGroupOutput(samples=[ - SequenceOutput(parent_seq_id=seq_id, - output_token=token_id, - logprobs=logprobs) - ], - prompt_logprobs=prompt_logprobs, - step_index=step_index) - - -def split_batch_by_proposal_len( - seq_group_metadata_list: List[SequenceGroupMetadata], - proposal_lens: List[int], -) -> Tuple[Tuple[List[SequenceGroupMetadata], List[int]], Tuple[ - List[SequenceGroupMetadata], List[int]]]: - """Utility function that splits a batch based on whether the proposal len is - zero or not. We should remove this once vLLM supports per-sequence proposal - lens in a batch. 
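The rank computation in `get_sampled_token_logprobs` counts how many vocabulary entries score strictly higher than the sampled token and adds one. A standalone sketch with a tiny vocabulary:

```python
import torch

logprobs = torch.log_softmax(
    torch.tensor([[[2.0, 0.5, 1.0, 3.0]]]), dim=-1)  # [steps=1, batch=1, vocab=4]
sampled = torch.tensor([[2]])                         # token id 2 was accepted

selected = logprobs[torch.arange(1).unsqueeze(1), torch.arange(1), sampled]
rank = (logprobs > selected.unsqueeze(-1)).sum(-1) + 1
# Token 2 has the third-highest logprob here, so rank == tensor([[3]]).
```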
- """ - - nonzero_lists: Tuple[List[SequenceGroupMetadata], List[int]] = ([], []) - zero_lists: Tuple[List[SequenceGroupMetadata], List[int]] = ([], []) - for i, (seq_group, proposal_len) in enumerate( - zip(seq_group_metadata_list, proposal_lens)): - seq_groups, indices = nonzero_lists if proposal_len else zero_lists - seq_groups.append(seq_group) - indices.append(i) - return nonzero_lists, zero_lists - - -def sampler_output_to_torch( - sampler_output_list: Sequence[SamplerOutput], sampler_transposed: bool -) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, Optional[torch.Tensor]]: - """Utility function which converts a list of SamplerOutput to tensors. - - sampler_transposed here is used as the indicator for whether - we need do additional tensor transpose logic here. - - Returns: - sampled_token_ids: torch.Tensor - shape: [batch_size, len(sampler_output_list)] - - sampled_token_probs: torch.Tensor - shape: [batch_size, len(sampler_output_list), vocab_size] - """ - - # shape: [batch_size, num_sampler_output, vocab_size] - sampled_token_probs = torch.stack( - [ - sampler_output.sampled_token_probs - for sampler_output in sampler_output_list - ], - dim=0, - ) - - # shape: [batch_size, num_sampler_output, vocab_size] - sampled_token_logprobs = torch.stack( - [sampler_output.logprobs for sampler_output in sampler_output_list], - dim=0, - ) - - # shape: [batch_size, num_sampler_output] - sampled_token_ids = torch.stack( - [ - sampler_output.sampled_token_ids.flatten() - for sampler_output in sampler_output_list - ], - dim=0, - ) - - if sampler_transposed: - sampled_token_probs = sampled_token_probs.transpose(0, 1) - sampled_token_logprobs = sampled_token_logprobs.transpose(0, 1) - sampled_token_ids = sampled_token_ids.transpose(0, 1) - - if sampler_output_list[0].hidden_states is not None: - # shape: [batch_size, num_sampler_output, hidden_dim] - sampled_hidden_states = torch.stack( - [ - sampler_output.hidden_states - for sampler_output in sampler_output_list - ], - dim=0, - ) - - if sampler_transposed: - sampled_hidden_states = sampled_hidden_states.transpose(0, 1) - else: - sampled_hidden_states = None - - return (sampled_token_ids, sampled_token_probs, sampled_token_logprobs, - sampled_hidden_states) - - -def maybe_mock_device_tensors(sampler_output: SamplerOutput, batch_size: int, - vocab_size: int, device: str) -> None: - """Helper method which mocks out the GPU tensors in SamplerOutput with dummy - values. This will be removed in PR 7/9. - https://docs.google.com/document/d/1rE4pr3IdspRw97XbImY4fS9IWYuJJ3HGtL7AdIKGrw8/edit#heading=h.qijw1sdidrer - """ - values = [ - sampler_output.sampled_token_probs, sampler_output.sampled_token_ids - ] - assert all(v is None for v in values) or not any(v is None for v in values) - if not any(v is None for v in values): - # Do nothing if the tensors are already created (usually in unit tests). - return - - # Softmax to ensure valid probs. - sampler_output.sampled_token_probs = torch.nn.functional.softmax( - torch.rand(batch_size, vocab_size, dtype=torch.float32, device=device), - dim=-1) - - sampler_output.sampled_token_ids = torch.randint(low=10, - high=100, - size=(batch_size, ), - dtype=torch.long, - device=device) - - -@contextmanager -def nvtx_range(msg, *args, **kwargs): - """ - Context manager / decorator that pushes an NVTX range at the beginning - of its scope, and pops it at the end. If extra arguments are given, - they are passed as arguments to msg.format(). - - If running with cuda graphs, you must enable nsys cuda graph profiling. 
- - Arguments: - msg (string): message to associate with the range - """ - if current_platform.is_cuda_alike(): - torch.cuda.nvtx.range_push(msg.format(*args, **kwargs)) - try: - yield - finally: - torch.cuda.nvtx.range_pop() - else: - yield - - -class Timer: - """Basic timer context manager for measuring CPU time. - """ - - def __enter__(self): - self.start_time = time.time() - return self - - def __exit__(self, exc_type, exc_value, traceback): - self.end_time = time.time() - self.elapsed_time_s = self.end_time - self.start_time - self.elapsed_time_ms = self.elapsed_time_s * 1000 diff --git a/vllm/transformers_utils/configs/eagle.py b/vllm/transformers_utils/configs/eagle.py index fb2e8a1df7052..5445a333c493e 100644 --- a/vllm/transformers_utils/configs/eagle.py +++ b/vllm/transformers_utils/configs/eagle.py @@ -6,7 +6,6 @@ from typing import Optional, Union from transformers import AutoConfig, PretrainedConfig -import vllm.envs as envs from vllm.transformers_utils.configs.deepseek_vl2 import DeepseekV2Config @@ -44,28 +43,25 @@ class EAGLEConfig(PretrainedConfig): self.truncated_vocab_size = self.model.vocab_size if \ truncated_vocab_size is None else truncated_vocab_size - if not envs.VLLM_USE_V1: - kwargs["architectures"] = ["EAGLEModel"] + # Eagle model name should follow naming convention of + # LlamaForCausalLM -> EagleLlamaForCausalLM + if method == "eagle": + assert self.model is not None, \ + "model should not be None when method is eagle" + kwargs["architectures"] = [ + f"Eagle{arch}" if not arch.startswith("Eagle") \ + else arch for arch in self.model.architectures + ] + elif method == "eagle3": + assert self.model is not None, \ + "model should not be None when method is eagle3" + kwargs["architectures"] = [ + f"Eagle3{arch}" if not arch.startswith("Eagle3") \ + else arch for arch in self.model.architectures + ] else: - # Eagle model name should follow naming convention of - # LlamaForCausalLM -> EagleLlamaForCausalLM - if method == "eagle": - assert self.model is not None, \ - "model should not be None when method is eagle" - kwargs["architectures"] = [ - f"Eagle{arch}" if not arch.startswith("Eagle") \ - else arch for arch in self.model.architectures - ] - elif method == "eagle3": - assert self.model is not None, \ - "model should not be None when method is eagle3" - kwargs["architectures"] = [ - f"Eagle3{arch}" if not arch.startswith("Eagle3") \ - else arch for arch in self.model.architectures - ] - else: - raise ValueError(f"Invalid method {method}. \ - Supported methods are eagle and eagle3.") + raise ValueError(f"Invalid method {method}. \ + Supported methods are eagle and eagle3.") super().__init__(**kwargs) diff --git a/vllm/worker/worker_base.py b/vllm/worker/worker_base.py index c382b29ad1990..55705062d3968 100644 --- a/vllm/worker/worker_base.py +++ b/vllm/worker/worker_base.py @@ -397,8 +397,6 @@ class LocalOrDistributedWorkerBase(WorkerBase): model_input, worker_input, kwargs = inputs num_steps = worker_input.num_steps - if execute_model_req is not None and execute_model_req.spec_step_idx: - kwargs["spec_step_idx"] = execute_model_req.spec_step_idx self.execute_worker(worker_input)
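On the retained side of the diff, the now-unconditional branch in `EAGLEConfig` rewrites the wrapped model's architecture names. For example, with the `eagle` method a Llama target maps as follows (a standalone restatement of the list comprehension above, with an assumed input architecture list):

```python
method = "eagle"
architectures = ["LlamaForCausalLM"]   # example target architecture

renamed = [
    f"Eagle{arch}" if not arch.startswith("Eagle") else arch
    for arch in architectures
]
# renamed == ["EagleLlamaForCausalLM"], matching the
# LlamaForCausalLM -> EagleLlamaForCausalLM convention noted in the comment.
```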