[V1][BugFix] Fix edge case in VLM scheduling (#12065)

Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
2025-12-22 17:35:31 +08:00 · 2025-01-14 20:21:28 -08:00 · 2025-01-14 20:21:28 -08:00 · b7ee940a82
commit b7ee940a82
parent 9ddac56311
1 changed files with 15 additions and 11 deletions
--- a/vllm/v1/core/scheduler.py
+++ b/vllm/v1/core/scheduler.py
@@ -373,18 +373,22 @@ class Scheduler:
            if self.encoder_cache_manager.has_cache(request, i):
                # The encoder input is already computed and cached.
                continue
-            if not self.encoder_cache_manager.can_allocate(request, i):
+            if (not self.encoder_cache_manager.can_allocate(request, i)
-                # The encoder cache is full. We can only schedule the decoder
+                    or num_encoder_tokens > encoder_budget):
-                # tokens just before the encoder input.
+                # The encoder cache is full or the encoder budget is exhausted.
-                num_new_tokens = start_pos - num_computed_tokens
+                # NOTE(woosuk): We assume that the encoder input tokens should
-                break
+                # be processed altogether, as the encoder usually uses
            if num_encoder_tokens > encoder_budget:
                # The encoder budget is exhausted. We can only schedule the
                # decoder tokens up until the encoder input.
                # NOTE(woosuk): We assume that the encoder tokens should be
                # processed altogether, as the encoder usually uses
                # bidirectional attention.
                if num_computed_tokens < start_pos:
                    # We only schedule the decoder tokens just before the
                    # encoder input.
                    num_new_tokens = start_pos - num_computed_tokens
                else:
                    # Because of prefix caching, num_computed_tokens is greater
                    # than start_pos even though its encoder input is not
                    # available. In this case, we can't schedule any token for
                    # the request in this step.
                    num_new_tokens = 0
                break
            encoder_budget -= num_encoder_tokens