diff --git a/vllm/inputs/registry.py b/vllm/inputs/registry.py
index b6ceb5fb82d70..24980833864b0 100644
--- a/vllm/inputs/registry.py
+++ b/vllm/inputs/registry.py
@@ -330,6 +330,11 @@ class InputRegistry:
         from vllm.multimodal import MultiModalKwargs
         from vllm.multimodal.profiling import MultiModalProfiler
 
+        if seq_len > model_config.max_model_len:
+            raise AssertionError(
+                f"Profiling attempted with sequence length ({seq_len}) "
+                f"greater than model length ({model_config.max_model_len})")
+
         if mm_registry.has_processor(model_config):
             tokenizer = cached_tokenizer_from_config(model_config)
             processor = mm_registry.create_processor(model_config,
diff --git a/vllm/worker/enc_dec_model_runner.py b/vllm/worker/enc_dec_model_runner.py
index 5f39f2fa4947c..f34597ac05db4 100644
--- a/vllm/worker/enc_dec_model_runner.py
+++ b/vllm/worker/enc_dec_model_runner.py
@@ -281,6 +281,7 @@ class EncoderDecoderModelRunner(GPUModelRunnerBase[EncoderDecoderModelInput]):
         for group_id in range(max_num_seqs):
             seq_len = (max_num_batched_tokens // max_num_seqs +
                        (group_id < max_num_batched_tokens % max_num_seqs))
+            seq_len = min(seq_len, self.model_config.max_model_len)
             batch_size += seq_len
 
             decoder_dummy_data = self.input_registry \
diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py
index 473bd901b5b23..3181483fe8390 100644
--- a/vllm/worker/model_runner.py
+++ b/vllm/worker/model_runner.py
@@ -1302,6 +1302,7 @@ class GPUModelRunnerBase(ModelRunnerBase[TModelInputForGPU]):
         for group_id in range(max_num_seqs):
             seq_len = (max_num_batched_tokens // max_num_seqs +
                        (group_id < max_num_batched_tokens % max_num_seqs))
+            seq_len = min(seq_len, self.model_config.max_model_len)
             batch_size += seq_len
 
             dummy_data = self.input_registry \
diff --git a/vllm/worker/openvino_model_runner.py b/vllm/worker/openvino_model_runner.py
index aa1d2cbb2df29..9b484a9f543fe 100644
--- a/vllm/worker/openvino_model_runner.py
+++ b/vllm/worker/openvino_model_runner.py
@@ -148,6 +148,7 @@ class OpenVINOModelRunner(ModelRunnerBase):
             seq_len = min(
                 seq_data.get_len(),
                 computed_len + seq_group_metadata.token_chunk_size,
+                self.model_config.max_model_len,
             )
             if is_prompt:
                 tokens = seq_data.get_token_ids()[computed_len:seq_len]
diff --git a/vllm/worker/xpu_model_runner.py b/vllm/worker/xpu_model_runner.py
index 39957e661c474..2103260d8900c 100644
--- a/vllm/worker/xpu_model_runner.py
+++ b/vllm/worker/xpu_model_runner.py
@@ -466,6 +466,7 @@ class XPUModelRunner(ModelRunnerBase[ModelInputForXPUWithSamplingMetadata]):
         for group_id in range(max_num_seqs):
             seq_len = (max_num_batched_tokens // max_num_seqs +
                        (group_id < max_num_batched_tokens % max_num_seqs))
+            seq_len = min(seq_len, self.model_config.max_model_len)
             batch_size += seq_len
 
             dummy_data = self.input_registry \
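
For reference, the per-sequence length computation that this change clamps behaves roughly like the standalone sketch below. This is an illustration only, not vLLM code; the numeric values are assumptions chosen so the clamp actually takes effect.

```python
# Standalone sketch of the profiling seq_len computation touched by this diff.
# The numbers below are illustrative assumptions, not vLLM defaults.
max_num_batched_tokens = 40960   # assumed scheduler token budget
max_num_seqs = 4                 # assumed number of dummy sequences
max_model_len = 8192             # assumed model context length

batch_size = 0
for group_id in range(max_num_seqs):
    # Split the token budget evenly; the first (budget % seqs) groups get one
    # extra token so the total adds up exactly.
    seq_len = (max_num_batched_tokens // max_num_seqs +
               (group_id < max_num_batched_tokens % max_num_seqs))
    # The new clamp: never profile a dummy sequence longer than the model's
    # maximum context length.
    seq_len = min(seq_len, max_model_len)
    batch_size += seq_len

print(batch_size)  # 32768: each of the 4 sequences is clamped from 10240 to 8192
```

Without the clamp, the dummy sequences in this scenario would be 10240 tokens long and the new assertion in `InputRegistry` would fire during profiling.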