diff --git a/vllm/compilation/decorators.py b/vllm/compilation/decorators.py
index cb629ea284fa1..1370862d580a5 100644
--- a/vllm/compilation/decorators.py
+++ b/vllm/compilation/decorators.py
@@ -189,7 +189,6 @@ def _support_torch_compile(
             CompilationLevel.NO_COMPILATION, CompilationLevel.DYNAMO_AS_IS
         ] or not supports_dynamo() or _should_ignore_torch_compile(
             self.__class__)
-        self.do_not_compile = True
         if self.do_not_compile:
             return
 
diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py
index 93b40c6241b73..3e2947b6d2551 100644
--- a/vllm/v1/worker/gpu_model_runner.py
+++ b/vllm/v1/worker/gpu_model_runner.py
@@ -1918,11 +1918,12 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
             scheduler_output, is_dummy_run)
         # if is_global_first_rank():
         #     logger.info(f"RUNNING FULL BATCH {num_scheduled_tokens}")
+        skip_cuda_graphs = self.parallel_config.enable_microbatching
         with set_forward_context(attn_metadata,
                                  vllm_config=self.vllm_config,
                                  num_tokens=num_scheduled_tokens or 1,
                                  num_tokens_across_dp=num_tokens_across_dp,
-                                 skip_cuda_graphs=True):
+                                 skip_cuda_graphs=skip_cuda_graphs):
             return self.model(
                 input_ids=input_ids,
                 positions=positions,