diff --git a/vllm/config/speculative.py b/vllm/config/speculative.py index 903b9a26fab88..1f956526dcdc6 100644 --- a/vllm/config/speculative.py +++ b/vllm/config/speculative.py @@ -78,10 +78,6 @@ class SpeculativeConfig: draft_tensor_parallel_size: int | None = Field(default=None, ge=1) """The degree of the tensor parallelism for the draft model. Can only be 1 or the same as the target model's tensor parallel size.""" - disable_logprobs: bool = True - """If set to True, token log probabilities are not returned during - speculative decoding. If set to False, token log probabilities are returned - according to the log probability settings in SamplingParams.""" # Draft model configuration quantization: me_quant.QuantizationMethods | None = None @@ -126,12 +122,6 @@ class SpeculativeConfig: """The configuration of the target model.""" target_parallel_config: SkipValidation[ParallelConfig] = None # type: ignore """The parallel configuration for the target model.""" - enable_chunked_prefill: SkipValidation[bool] = None # type: ignore - """Whether vLLM is configured to use chunked prefill or not. Used for - raising an error since it's not yet compatible with speculative decode.""" - disable_log_stats: SkipValidation[bool] = None # type: ignore - """Whether to disable the periodic printing of stage times in speculative - decoding.""" # params generated in the post-init stage draft_model_config: SkipValidation[ModelConfig] = None # type: ignore diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index b31e4931f2295..4e2c389bf84d3 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -1246,8 +1246,6 @@ class EngineArgs: self, target_model_config: ModelConfig, target_parallel_config: ParallelConfig, - enable_chunked_prefill: bool, - disable_log_stats: bool, ) -> SpeculativeConfig | None: """Initializes and returns a SpeculativeConfig object based on `speculative_config`. @@ -1267,8 +1265,6 @@ class EngineArgs: { "target_model_config": target_model_config, "target_parallel_config": target_parallel_config, - "enable_chunked_prefill": enable_chunked_prefill, - "disable_log_stats": disable_log_stats, } ) return SpeculativeConfig(**self.speculative_config) @@ -1561,8 +1557,6 @@ class EngineArgs: speculative_config = self.create_speculative_config( target_model_config=model_config, target_parallel_config=parallel_config, - enable_chunked_prefill=self.enable_chunked_prefill, - disable_log_stats=self.disable_log_stats, ) # make sure num_lookahead_slots is set appropriately depending on diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py index f3aa5351e5302..8fa71855f8f66 100644 --- a/vllm/entrypoints/openai/api_server.py +++ b/vllm/entrypoints/openai/api_server.py @@ -241,6 +241,7 @@ async def build_async_engine_client_from_engine_args( ) # Don't keep the dummy data in memory + assert async_llm is not None await async_llm.reset_mm_cache() yield async_llm