fix num_tokens_across_dp sizing issue

Signed-off-by: Sage Moore <sage@neuralmagic.com>
Sage Moore 2025-08-11 15:24:03 +00:00
parent 44ead56ad5
commit e526b1c091

@@ -2692,8 +2692,9 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
         assert num_reqs == num_tokens
         assert num_tokens % 2 == 0
         num_tokens_per_ubatch = num_tokens // 2
-        num_tokens_across_dp = torch.tensor([num_tokens_per_ubatch] * 2,
-                                            device="cpu",
+        dp_size = self.vllm_config.parallel_config.data_parallel_size
+        num_tokens_across_dp = torch.tensor([num_tokens_per_ubatch] * dp_size,
+                                            device="cpu",
                                             dtype=torch.int32)
         ubatch_slices = [(slice(0,
                           num_reqs // 2), slice(0, num_tokens // 2)),
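
For context, num_tokens_across_dp holds one entry per data-parallel rank; the hardcoded * 2 was only correct when data_parallel_size happened to be 2, and for any other DP size the tensor had the wrong length. Below is a minimal standalone sketch of the corrected sizing (the build_num_tokens_across_dp helper name is hypothetical; in the real code this logic lives inline in GPUModelRunner):

    import torch

    def build_num_tokens_across_dp(num_tokens: int, dp_size: int) -> torch.Tensor:
        # Each of the two microbatches (ubatches) gets half of this
        # rank's tokens; this path assumes every DP rank runs the same
        # shape, so all per-rank entries are identical.
        assert num_tokens % 2 == 0
        num_tokens_per_ubatch = num_tokens // 2
        # Size by dp_size, not a literal 2: with dp_size == 4 the old
        # code produced a length-2 tensor, too short to hold one entry
        # per rank.
        return torch.tensor([num_tokens_per_ubatch] * dp_size,
                            device="cpu",
                            dtype=torch.int32)

    # Example: 16 tokens on a 4-way data-parallel setup.
    print(build_num_tokens_across_dp(16, 4))
    # tensor([8, 8, 8, 8], dtype=torch.int32)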