diff --git a/tests/test_config.py b/tests/test_config.py
index cb7654c26afc..2e0222fa67a4 100644
--- a/tests/test_config.py
+++ b/tests/test_config.py
@@ -412,6 +412,8 @@ def test_load_config_pt_load_map_location(pt_load_map_location):
     ("BAAI/bge-reranker-base", None, 512, False),
     ("BAAI/bge-reranker-base", 256, 256, False),
     ("BAAI/bge-reranker-base", 513, 512, True),
+    ("deepseek-ai/DeepSeek-R1-Distill-Qwen-7B", None, 131072, False),
+    ("deepseek-ai/DeepSeek-R1-Distill-Qwen-7B", 131073, 131072, True),
 ])
 def test_get_and_verify_max_len(model_id, max_model_len, expected_max_len,
                                 should_raise):
diff --git a/vllm/config.py b/vllm/config.py
index 74e7ed2d4874..7863859a6ee6 100644
--- a/vllm/config.py
+++ b/vllm/config.py
@@ -1442,10 +1442,17 @@ class ModelConfig:
         return getattr(self.hf_config, "matryoshka_dimensions", None)
 
     def get_and_verify_max_len(self, max_model_len: int):
-        tokenizer_config = try_get_tokenizer_config(
-            self.tokenizer,
-            trust_remote_code=self.trust_remote_code,
-            revision=self.tokenizer_revision)
+        # For pooling models, the tokenizer's `model_max_length` is often a
+        # reliable source for the maximum sequence length. However, for
+        # generative models, this can be incorrect and unduly limit the
+        # context window (e.g., DeepSeek-R1). Therefore, we only consider
+        # tokenizer_config for pooling models.
+        tokenizer_config = None
+        if self.runner_type == "pooling":
+            tokenizer_config = try_get_tokenizer_config(
+                self.tokenizer,
+                trust_remote_code=self.trust_remote_code,
+                revision=self.tokenizer_revision)
         max_model_len = _get_and_verify_max_len(
             hf_config=self.hf_text_config,
             tokenizer_config=tokenizer_config,
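
For context, a minimal self-contained sketch of the behavior this change is after. This is not vLLM's actual `_get_and_verify_max_len`; the function name, the `16384` tokenizer limit for the generative case, and the `514` position-embedding count for the pooling case are illustrative stand-ins for the kind of values involved:

    from typing import Optional

    def resolve_max_len(runner_type: str,
                        hf_max_position_embeddings: int,
                        tokenizer_model_max_length: Optional[int]) -> int:
        """Sketch: derive the effective max length from the available limits."""
        limits = [hf_max_position_embeddings]
        # After this change, only pooling models trust the tokenizer's
        # model_max_length; generative models rely on the HF model config.
        if runner_type == "pooling" and tokenizer_model_max_length is not None:
            limits.append(tokenizer_model_max_length)
        return min(limits)

    # Generative runner: a stale tokenizer limit no longer clips the context.
    assert resolve_max_len("generate", 131072, 16384) == 131072
    # Pooling runner: the tokenizer limit is still honored.
    assert resolve_max_len("pooling", 514, 512) == 512

The new parametrized test cases exercise both paths: deepseek-ai/DeepSeek-R1-Distill-Qwen-7B keeps its full 131072-token window (and requesting 131073 still raises), while the existing BAAI/bge-reranker-base pooling cases remain capped at 512 as before.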