From f6137adbcbbdea8b5023a66480de921b558bef83 Mon Sep 17 00:00:00 2001
From: Cyrus Leung
Date: Mon, 17 Mar 2025 00:13:46 +0800
Subject: [PATCH] Revert "[Bugfix] Limit profiling run sequence length by
 max_model_len (#14785)" (#14892)

Signed-off-by: DarkLight1337
---
 vllm/inputs/registry.py              | 5 -----
 vllm/worker/enc_dec_model_runner.py  | 1 -
 vllm/worker/model_runner.py          | 1 -
 vllm/worker/openvino_model_runner.py | 1 -
 vllm/worker/xpu_model_runner.py      | 1 -
 5 files changed, 9 deletions(-)

diff --git a/vllm/inputs/registry.py b/vllm/inputs/registry.py
index 24980833864b0..b6ceb5fb82d70 100644
--- a/vllm/inputs/registry.py
+++ b/vllm/inputs/registry.py
@@ -330,11 +330,6 @@ class InputRegistry:
         from vllm.multimodal import MultiModalKwargs
         from vllm.multimodal.profiling import MultiModalProfiler

-        if seq_len > model_config.max_model_len:
-            raise AssertionError(
-                f"Profiling attempted with sequence length ({seq_len}) "
-                f"greater than model length ({model_config.max_model_len})")
-
         if mm_registry.has_processor(model_config):
             tokenizer = cached_tokenizer_from_config(model_config)
             processor = mm_registry.create_processor(model_config,
diff --git a/vllm/worker/enc_dec_model_runner.py b/vllm/worker/enc_dec_model_runner.py
index f34597ac05db4..5f39f2fa4947c 100644
--- a/vllm/worker/enc_dec_model_runner.py
+++ b/vllm/worker/enc_dec_model_runner.py
@@ -281,7 +281,6 @@ class EncoderDecoderModelRunner(GPUModelRunnerBase[EncoderDecoderModelInput]):
         for group_id in range(max_num_seqs):
             seq_len = (max_num_batched_tokens // max_num_seqs +
                        (group_id < max_num_batched_tokens % max_num_seqs))
-            seq_len = min(seq_len, self.model_config.max_model_len)
             batch_size += seq_len

             decoder_dummy_data = self.input_registry \
diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py
index 3181483fe8390..473bd901b5b23 100644
--- a/vllm/worker/model_runner.py
+++ b/vllm/worker/model_runner.py
@@ -1302,7 +1302,6 @@ class GPUModelRunnerBase(ModelRunnerBase[TModelInputForGPU]):
         for group_id in range(max_num_seqs):
             seq_len = (max_num_batched_tokens // max_num_seqs +
                        (group_id < max_num_batched_tokens % max_num_seqs))
-            seq_len = min(seq_len, self.model_config.max_model_len)
             batch_size += seq_len

             dummy_data = self.input_registry \
diff --git a/vllm/worker/openvino_model_runner.py b/vllm/worker/openvino_model_runner.py
index 9b484a9f543fe..aa1d2cbb2df29 100644
--- a/vllm/worker/openvino_model_runner.py
+++ b/vllm/worker/openvino_model_runner.py
@@ -148,7 +148,6 @@ class OpenVINOModelRunner(ModelRunnerBase):
             seq_len = min(
                 seq_data.get_len(),
                 computed_len + seq_group_metadata.token_chunk_size,
-                self.model_config.max_model_len,
             )
             if is_prompt:
                 tokens = seq_data.get_token_ids()[computed_len:seq_len]
diff --git a/vllm/worker/xpu_model_runner.py b/vllm/worker/xpu_model_runner.py
index 2103260d8900c..39957e661c474 100644
--- a/vllm/worker/xpu_model_runner.py
+++ b/vllm/worker/xpu_model_runner.py
@@ -466,7 +466,6 @@ class XPUModelRunner(ModelRunnerBase[ModelInputForXPUWithSamplingMetadata]):
         for group_id in range(max_num_seqs):
             seq_len = (max_num_batched_tokens // max_num_seqs +
                        (group_id < max_num_batched_tokens % max_num_seqs))
-            seq_len = min(seq_len, self.model_config.max_model_len)
             batch_size += seq_len

             dummy_data = self.input_registry \
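
Note (illustration only, not part of the patch): with the clamp reverted, each of
the max_num_seqs groups in the profiling run is assigned

    seq_len = (max_num_batched_tokens // max_num_seqs +
               (group_id < max_num_batched_tokens % max_num_seqs))

with no min(..., model_config.max_model_len) applied, and the InputRegistry
assertion that rejected seq_len > max_model_len is removed as well. Below is a
minimal sketch of that distribution using the names from the diff; the helper
name distribute_profiling_lengths is illustrative only and does not exist in
vLLM:

    def distribute_profiling_lengths(max_num_batched_tokens: int,
                                     max_num_seqs: int) -> list[int]:
        # Spread the token budget as evenly as possible over the groups: the
        # first (max_num_batched_tokens % max_num_seqs) groups receive one
        # extra token, so the lengths sum to exactly max_num_batched_tokens.
        return [
            max_num_batched_tokens // max_num_seqs +
            (group_id < max_num_batched_tokens % max_num_seqs)
            for group_id in range(max_num_seqs)
        ]

    # Example: a budget of 32768 tokens over 16 sequences gives 2048 tokens per
    # group, which can exceed max_model_len for a short-context model; that is
    # the case the removed clamp and assertion used to guard against.
    lengths = distribute_profiling_lengths(32768, 16)
    assert lengths == [2048] * 16 and sum(lengths) == 32768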