From 6ac485a953d8bdba2c6b779207af50bd0fcc588b Mon Sep 17 00:00:00 2001
From: Cody Yu
Date: Mon, 17 Feb 2025 13:37:45 -0800
Subject: [PATCH] [V1][PP] Fix intermediate tensor values (#13417)

Signed-off-by: Cody Yu
---
 vllm/sequence.py                   |  3 +++
 vllm/v1/worker/gpu_model_runner.py | 10 ++++++++--
 2 files changed, 11 insertions(+), 2 deletions(-)

diff --git a/vllm/sequence.py b/vllm/sequence.py
index 98578ee04d58..45d0e5bc7680 100644
--- a/vllm/sequence.py
+++ b/vllm/sequence.py
@@ -1137,6 +1137,9 @@ class IntermediateTensors:
     def __setitem__(self, key: str, value: torch.Tensor):
         self.tensors[key] = value
 
+    def items(self):
+        return self.tensors.items()
+
     def __len__(self):
         return len(self.tensors)
 
diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py
index e1d1e43427b8..1119d53b493c 100644
--- a/vllm/v1/worker/gpu_model_runner.py
+++ b/vllm/v1/worker/gpu_model_runner.py
@@ -151,7 +151,8 @@ class GPUModelRunner(LoRAModelRunnerMixin):
         self.positions = torch.zeros(self.max_num_tokens,
                                      dtype=torch.int64,
                                      device=self.device)
-        # self.intermediate_tensors # Set after load_model
+        # None in the first PP rank. The rest are set after load_model.
+        self.intermediate_tensors: Optional[IntermediateTensors] = None
 
         # Only relevant for models using M-RoPE (e.g, Qwen2-VL)
         if self.uses_mrope:
@@ -922,6 +923,11 @@ class GPUModelRunner(LoRAModelRunnerMixin):
         if get_pp_group().is_first_rank:
             intermediate_tensors = None
         else:
+            assert intermediate_tensors is not None
+            assert self.intermediate_tensors is not None
+            for k, v in intermediate_tensors.items():
+                self.intermediate_tensors[k][:num_input_tokens].copy_(
+                    v[:num_input_tokens], non_blocking=True)
             intermediate_tensors = IntermediateTensors({
                 k: v[:num_input_tokens]
                 for k, v in self.intermediate_tensors.items()
@@ -1120,7 +1126,7 @@ class GPUModelRunner(LoRAModelRunnerMixin):
         if get_pp_group().is_first_rank:
             intermediate_tensors = None
         else:
-            if not hasattr(self, "intermediate_tensors"):
+            if self.intermediate_tensors is None:
                 self.intermediate_tensors = (
                     self.model.make_empty_intermediate_tensors(
                         batch_size=self.max_num_tokens,
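
Note on the fix: on non-first PP ranks, the runner now copies each received
intermediate tensor into a pre-allocated persistent buffer (sized for
max_num_tokens) and hands the model a view of that buffer, instead of
forwarding references to the transient tensors from the previous stage;
presumably this keeps input addresses stable for components that capture
them, such as CUDA graphs. Below is a minimal, self-contained sketch of
that pattern; the names (persistent, stage_inputs) and sizes are
illustrative, not taken from the patch.

# Sketch of the persistent-buffer copy pattern used above (illustrative
# names and shapes, not vLLM's actual API).
from typing import Dict

import torch

MAX_NUM_TOKENS, HIDDEN_SIZE = 8, 4

# Allocated once up front (in the patch: after load_model on non-first
# PP ranks), so every step reuses the same storage.
persistent: Dict[str, torch.Tensor] = {
    "hidden_states": torch.zeros(MAX_NUM_TOKENS, HIDDEN_SIZE),
}

def stage_inputs(received: Dict[str, torch.Tensor],
                 num_input_tokens: int) -> Dict[str, torch.Tensor]:
    # Copy freshly received activations into the persistent buffers
    # rather than forwarding references to the transient tensors.
    for k, v in received.items():
        persistent[k][:num_input_tokens].copy_(v[:num_input_tokens],
                                               non_blocking=True)
    # Hand the model views of the persistent storage.
    return {k: v[:num_input_tokens] for k, v in persistent.items()}

# Usage: tensors "received" from the previous pipeline stage.
out = stage_inputs({"hidden_states": torch.ones(6, HIDDEN_SIZE)}, 6)
assert out["hidden_states"].data_ptr() == persistent["hidden_states"].data_ptr()

Since slicing returns a view, the model always reads the same underlying
storage each step, and non_blocking=True lets the device-to-device copy
overlap with other work on the stream.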