[Cleanup] Remove no-longer-used SpeculativeConfig.enable_chunked_prefill (#27826)

Signed-off-by: Nick Hill <nhill@redhat.com>
Nick Hill 2025-10-31 10:57:45 -07:00 committed by GitHub
parent fc16f1c477
commit 9e5bd3076e
3 changed files with 1 addition and 16 deletions
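The removed `enable_chunked_prefill` field existed only so `SpeculativeConfig` could raise an error when chunked prefill was combined with speculative decoding; nothing consults it anymore, so the flag (and the related `disable_log_stats` plumbing) can be dropped. For orientation, a minimal usage sketch of configuring both features together through the public `LLM` entrypoint; the model name and speculative settings below are placeholders, not part of this commit:

    from vllm import LLM, SamplingParams

    # Placeholder model and n-gram speculative settings; chunked prefill is
    # enabled alongside speculative decoding with no extra compatibility flag.
    llm = LLM(
        model="facebook/opt-125m",
        enable_chunked_prefill=True,
        speculative_config={
            "method": "ngram",
            "num_speculative_tokens": 5,
            "prompt_lookup_max": 4,
        },
    )
    outputs = llm.generate(["Chunked prefill plus speculation:"],
                           SamplingParams(max_tokens=16))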


@@ -78,10 +78,6 @@ class SpeculativeConfig:
 draft_tensor_parallel_size: int | None = Field(default=None, ge=1)
 """The degree of the tensor parallelism for the draft model. Can only be 1
 or the same as the target model's tensor parallel size."""
-disable_logprobs: bool = True
-"""If set to True, token log probabilities are not returned during
-speculative decoding. If set to False, token log probabilities are returned
-according to the log probability settings in SamplingParams."""

 # Draft model configuration
 quantization: me_quant.QuantizationMethods | None = None
@@ -126,12 +122,6 @@ class SpeculativeConfig:
 """The configuration of the target model."""
 target_parallel_config: SkipValidation[ParallelConfig] = None # type: ignore
 """The parallel configuration for the target model."""
-enable_chunked_prefill: SkipValidation[bool] = None # type: ignore
-"""Whether vLLM is configured to use chunked prefill or not. Used for
-raising an error since it's not yet compatible with speculative decode."""
-disable_log_stats: SkipValidation[bool] = None # type: ignore
-"""Whether to disable the periodic printing of stage times in speculative
-decoding."""

 # params generated in the post-init stage
 draft_model_config: SkipValidation[ModelConfig] = None # type: ignore
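For context on the fields that remain: user-supplied keys in `speculative_config` are expanded straight onto these dataclass fields via `SpeculativeConfig(**self.speculative_config)` (see the EngineArgs hunks below), while `target_model_config` and `target_parallel_config` are injected internally. A hedged sketch of a user-side dict for a draft model, respecting the `draft_tensor_parallel_size` constraint documented above; the draft model name is a placeholder:

    # Hypothetical speculative_config dict; draft_tensor_parallel_size must be
    # 1 or match the target model's tensor parallel size.
    speculative_config = {
        "model": "org/draft-model",
        "num_speculative_tokens": 4,
        "draft_tensor_parallel_size": 1,
    }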


@@ -1246,8 +1246,6 @@ class EngineArgs:
 self,
 target_model_config: ModelConfig,
 target_parallel_config: ParallelConfig,
-enable_chunked_prefill: bool,
-disable_log_stats: bool,
 ) -> SpeculativeConfig | None:
 """Initializes and returns a SpeculativeConfig object based on
 `speculative_config`.
@@ -1267,8 +1265,6 @@ class EngineArgs:
 {
 "target_model_config": target_model_config,
 "target_parallel_config": target_parallel_config,
-"enable_chunked_prefill": enable_chunked_prefill,
-"disable_log_stats": disable_log_stats,
 }
 )
 return SpeculativeConfig(**self.speculative_config)
@@ -1561,8 +1557,6 @@ class EngineArgs:
 speculative_config = self.create_speculative_config(
 target_model_config=model_config,
 target_parallel_config=parallel_config,
-enable_chunked_prefill=self.enable_chunked_prefill,
-disable_log_stats=self.disable_log_stats,
 )
 # make sure num_lookahead_slots is set appropriately depending on
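With the two parameters gone, code that builds a `SpeculativeConfig` directly (for example in tests) only needs to inject the target model and parallel configs. A hedged sketch, assuming `ModelConfig`, `ParallelConfig`, and `SpeculativeConfig` are importable from `vllm.config` and that default `ParallelConfig()` values are acceptable; the target model name and speculative settings are placeholders:

    from vllm.config import ModelConfig, ParallelConfig, SpeculativeConfig

    target_model_config = ModelConfig(model="facebook/opt-125m")  # placeholder target
    target_parallel_config = ParallelConfig()

    spec_config = SpeculativeConfig(
        method="ngram",                 # draft-free n-gram speculation
        num_speculative_tokens=3,
        prompt_lookup_max=4,
        target_model_config=target_model_config,
        target_parallel_config=target_parallel_config,
        # enable_chunked_prefill / disable_log_stats no longer exist as fields
    )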


@@ -241,6 +241,7 @@ async def build_async_engine_client_from_engine_args(
)
# Don't keep the dummy data in memory
assert async_llm is not None
await async_llm.reset_mm_cache()
yield async_llm
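The final hunk sits inside an async context manager that constructs the `AsyncLLM`, clears the multimodal cache so profiling dummy data is not kept around, yields the client, and shuts it down on exit. A simplified sketch of that pattern, assuming `AsyncLLM.from_vllm_config`, `create_engine_config`, and `shutdown` behave as used below (only `reset_mm_cache` appears in the diff); the real `build_async_engine_client_from_engine_args` takes more arguments and handles more cases:

    from contextlib import asynccontextmanager

    from vllm.engine.arg_utils import AsyncEngineArgs
    from vllm.v1.engine.async_llm import AsyncLLM

    @asynccontextmanager
    async def simple_async_llm_client(engine_args: AsyncEngineArgs):
        """Toy stand-in for build_async_engine_client_from_engine_args."""
        async_llm: AsyncLLM | None = None
        try:
            async_llm = AsyncLLM.from_vllm_config(engine_args.create_engine_config())
            # Don't keep the dummy (profiling) data in the multimodal cache.
            await async_llm.reset_mm_cache()
            yield async_llm
        finally:
            if async_llm is not None:
                async_llm.shutdown()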