diff --git a/tests/entrypoints/llm/test_generate.py b/tests/entrypoints/llm/test_generate.py
index af9cc0afd26be..e9993fd840619 100644
--- a/tests/entrypoints/llm/test_generate.py
+++ b/tests/entrypoints/llm/test_generate.py
@@ -85,11 +85,10 @@ def test_max_model_len():
         num_total_tokens = len(output.prompt_token_ids) + len(
             output.outputs[0].token_ids
         )
-        # Total tokens must not exceed max_model_len + 1 (the last token can be
-        # generated with the context length equal to the max model length)
+        # Total tokens must not exceed max_model_len.
         # It can be less if generation finishes due to other reasons (e.g., EOS)
         # before reaching the absolute model length limit.
-        assert num_total_tokens <= max_model_len + 1
+        assert num_total_tokens <= max_model_len


 def test_log_stats():
diff --git a/tests/v1/e2e/test_context_length.py b/tests/v1/e2e/test_context_length.py
deleted file mode 100644
index 45a938f6be63a..0000000000000
--- a/tests/v1/e2e/test_context_length.py
+++ /dev/null
@@ -1,90 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-"""
-end-to-end tests for context length corner cases of vLLM v1 model runner
-versus HuggingFace's transformers.
-
-This test verifies the following behavior: allow prefill and decodes on the
-model's maximum context length ``max_model_len`` and get one more token.
-
-Test strategy
-- Build a prompt consisting of exactly ``prompt_len`` tokens.
-- Run vLLM generation requesting ``max_tokens`` new tokens.
-- Run HF generation on the same prompt requesting the same number of tokens.
-- Assert both return the same number of generated tokens and the same ids.
-
-Test cases
-- Prefill a prompt of ``max_model_len`` (2048) and request a single token which
-will be sampled after the prefill (context length ``max_model_len``).
-- Prefill a prompt of ``max_model_len`` - 1 (2047) and request two tokens where
-the 1st will be sampled after the prefill and the 2nd after the first decode
-(context length ``max_model_len``).
-
-"""
-
-import pytest
-
-from tests.conftest import HfRunner, VllmRunner
-from tests.models.utils import check_outputs_equal
-from tests.utils import create_new_process_for_each_test
-
-
-@create_new_process_for_each_test()
-@pytest.mark.parametrize("model", ["JackFram/llama-160m"])
-@pytest.mark.parametrize(
-    "prompt_len, max_tokens",
-    [
-        (2048, 1),  # prompt_len = max_model_len
-        (2047, 2),  # prompt_len = max_model_len - 1
-    ],
-)
-def test_max_context_length(
-    model: str,
-    vllm_runner: type[VllmRunner],
-    hf_runner: type[HfRunner],
-    prompt_len: int,
-    max_tokens: int,
-) -> None:
-    """Compare vLLM and HuggingFace when the prompt already fills the
-    model's maximum context length and we request a single new token.
-
-    The test ensures vLLM does not raise the "Sampled token IDs exceed the
-    max model length" assertion and that both vLLM and HF produce the same
-    single token when given the same inputs.
-    """
-
-    # Construct a prompt of size prompt_len
-    prompt_ids = [[43] * prompt_len]
-
-    # --- vLLM generation ---
-    with vllm_runner(
-        model_name=model,
-        tokenizer_name=model,
-        max_model_len=2048,
-        max_num_seqs=1,
-        tensor_parallel_size=1,
-    ) as vllm_model:
-        # Generate max_tokens new tokens deterministically.
-        vllm_outputs = vllm_model.generate_greedy(prompt_ids, max_tokens)
-
-    # --- HuggingFace generation ---
-    with hf_runner(
-        model_name=model,
-    ) as hf_model:
-        hf_outputs = hf_model.generate_greedy(prompt_ids, max_tokens)
-
-    # vLLM and HF runners return prompt + generated tokens. Slice off the prompt.
-    vllm_output_ids = vllm_outputs[0][0][prompt_len:]
-    hf_output_ids = hf_outputs[0][0][prompt_len:]
-
-    # check that exactly max_tokens tokens were generated with vLLM and HF
-    assert len(vllm_output_ids) == len(hf_output_ids) == max_tokens
-
-    # check that vLLM outputs (token ids) match HF outputs
-    # Note: for simplicity don't pass detokenized string
-    check_outputs_equal(
-        outputs_0_lst=[(hf_output_ids, "")],
-        outputs_1_lst=[(vllm_output_ids, "")],
-        name_0="hf",
-        name_1="vllm",
-    )
diff --git a/vllm/v1/core/sched/scheduler.py b/vllm/v1/core/sched/scheduler.py
index d9a0ff1aa5c9c..09f1e91a9e858 100644
--- a/vllm/v1/core/sched/scheduler.py
+++ b/vllm/v1/core/sched/scheduler.py
@@ -223,7 +223,7 @@ class Scheduler(SchedulerInterface):
             # Make sure the input position does not exceed the max model len.
             # This is necessary when using spec decoding.
             num_new_tokens = min(
-                num_new_tokens, self.max_model_len - request.num_computed_tokens
+                num_new_tokens, self.max_model_len - 1 - request.num_computed_tokens
             )

             # Schedule encoder inputs.
diff --git a/vllm/v1/core/sched/utils.py b/vllm/v1/core/sched/utils.py
index 0979100ed325f..4f17468d2d581 100644
--- a/vllm/v1/core/sched/utils.py
+++ b/vllm/v1/core/sched/utils.py
@@ -44,7 +44,7 @@ def check_stop(
     request: Request, max_model_len: int, pooler_output: Optional[torch.Tensor] = None
 ) -> bool:
     if (
-        request.num_tokens > max_model_len
+        request.num_tokens >= max_model_len
         or request.num_output_tokens >= request.max_tokens
     ):
         request.status = RequestStatus.FINISHED_LENGTH_CAPPED
diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py
index 8d1940da566f9..b7dc2287b79fa 100644
--- a/vllm/v1/worker/gpu_model_runner.py
+++ b/vllm/v1/worker/gpu_model_runner.py
@@ -2317,30 +2317,14 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):

             start_idx = self.input_batch.num_tokens_no_spec[req_idx]
             end_idx = start_idx + len(sampled_ids)
-            assert end_idx <= self.max_model_len + 1, (
-                "Sampled token IDs exceed the max model length + 1. "
-                f"Total number of tokens: {end_idx} > max_model_len + 1: "
-                f"{self.max_model_len + 1}"
+            assert end_idx <= self.max_model_len, (
+                "Sampled token IDs exceed the max model length. "
+                f"Total number of tokens: {end_idx} > max_model_len: "
+                f"{self.max_model_len}"
             )

-            n_tokens_cache = len(sampled_ids)
-
-            # Sampled token IDs exceed the max model length by 1. This is
-            # legitimate as we can still sample 1 last token when the context
-            # length equals the max model length. Note that we do not need to
-            # cache this token ID as the sequence finishes after this step.
-            # Additionally, the buffers token_ids_cpu and is_token_ids are of
-            # size max model length only.
-            if end_idx == self.max_model_len + 1:
-                n_tokens_cache -= 1
-
-            self.input_batch.token_ids_cpu[
-                req_idx, start_idx : (start_idx + n_tokens_cache)
-            ] = sampled_ids[:n_tokens_cache]
-            self.input_batch.is_token_ids[
-                req_idx, start_idx : (start_idx + n_tokens_cache)
-            ] = True
-
+            self.input_batch.token_ids_cpu[req_idx, start_idx:end_idx] = sampled_ids
+            self.input_batch.is_token_ids[req_idx, start_idx:end_idx] = True
             self.input_batch.num_tokens_no_spec[req_idx] = end_idx
             self.input_batch.num_tokens[req_idx] = end_idx
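For reference, a minimal standalone sketch of the length-cap rule these hunks restore: a request is finished as length-capped once its total token count reaches max_model_len, so the total can never exceed the model limit. The helper is_length_capped and its parameters are illustrative only and are not part of vLLM's API.

def is_length_capped(
    num_prompt_tokens: int,
    num_output_tokens: int,
    max_model_len: int,
    max_tokens: int,
) -> bool:
    # Mirrors the restored check_stop condition: stop once the total token
    # count reaches max_model_len, or once max_tokens outputs were produced.
    num_total_tokens = num_prompt_tokens + num_output_tokens
    return num_total_tokens >= max_model_len or num_output_tokens >= max_tokens


if __name__ == "__main__":
    # With max_model_len = 8 and a 7-token prompt, at most one token can be
    # generated before the request is capped (7 + 1 >= 8).
    assert not is_length_capped(7, 0, max_model_len=8, max_tokens=16)
    assert is_length_capped(7, 1, max_model_len=8, max_tokens=16)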