diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index d56f56e1fce32..07f61f501ea58 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -2692,8 +2692,9 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): assert num_reqs == num_tokens assert num_tokens % 2 == 0 num_tokens_per_ubatch = num_tokens // 2 - num_tokens_across_dp = torch.tensor([num_tokens_per_ubatch] * 2, - device="cpu", + dp_size = self.vllm_config.parallel_config.data_parallel_size + num_tokens_across_dp = torch.tensor([num_tokens_per_ubatch] * dp_size, + device="cpu", dtype=torch.int32) ubatch_slices = [(slice(0, num_reqs // 2), slice(0, num_tokens // 2)),