diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 8a03b23facc3..2db4235c89de 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -3602,7 +3602,18 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): cudagraph_runtime_mode == CUDAGraphMode.PIECEWISE and not self.speculative_config.enforce_eager ) - self.drafter.dummy_run(num_tokens, use_cudagraphs=use_cudagraphs) + + # Note(gnovack) - We need to disable cudagraphs for the LoRA-active case + # (one of the two LoRA cases) when cudagraph_specialize_lora is enabled. + # This is a short-term mitigation for the issue described in + # https://github.com/vllm-project/vllm/issues/28334 + if self.compilation_config.cudagraph_specialize_lora and activate_lora: + use_cudagraphs = False + + self.drafter.dummy_run( + num_tokens, + use_cudagraphs=use_cudagraphs, + ) # This is necessary to avoid blocking DP. # For dummy runs, we typically skip EPLB since we don't have any real