From 90e46ee5e3bdf443fd8dae5999c9563a5f6a9841 Mon Sep 17 00:00:00 2001 From: Sage Moore Date: Mon, 2 Jun 2025 18:00:56 +0000 Subject: [PATCH] misc cleanups to prepare for rebase Signed-off-by: Sage Moore --- vllm/model_executor/models/deepseek_v2.py | 6 ------ vllm/worker/model_runner.py | 6 +----- 2 files changed, 1 insertion(+), 11 deletions(-) diff --git a/vllm/model_executor/models/deepseek_v2.py b/vllm/model_executor/models/deepseek_v2.py index 6b153a6161eff..b78c193c1345a 100644 --- a/vllm/model_executor/models/deepseek_v2.py +++ b/vllm/model_executor/models/deepseek_v2.py @@ -49,7 +49,6 @@ from vllm.model_executor.model_loader.weight_utils import ( default_weight_loader, maybe_remap_kv_scale_name) from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.sequence import IntermediateTensors -from vllm.v1.worker.ubatching import get_current_ubatch_context from .interfaces import SupportsPP from .utils import (PPMissingLayer, is_pp_missing_parameter, @@ -564,8 +563,6 @@ class DeepseekV2DecoderLayer(nn.Module): hidden_states: torch.Tensor, residual: Optional[torch.Tensor], ) -> torch.Tensor: - # if (ubatch_ctx := get_current_ubatch_context()) is not None: - # print("in decoder, ubatch:", ubatch_ctx.id) # Self Attention if residual is None: residual = hidden_states @@ -659,9 +656,6 @@ class DeepseekV2Model(nn.Module): intermediate_tensors: Optional[IntermediateTensors], inputs_embeds: Optional[torch.Tensor] = None, ) -> Union[torch.Tensor, IntermediateTensors]: - # if (ubatch_ctx := get_current_ubatch_context()) is not None: - # print("in forward, ubatch:", ubatch_ctx.id) - if get_pp_group().is_first_rank: if inputs_embeds is not None: hidden_states = inputs_embeds diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py index fac7efaa6753b..99205b34fb8b5 100644 --- a/vllm/worker/model_runner.py +++ b/vllm/worker/model_runner.py @@ -1417,9 +1417,7 @@ class GPUModelRunnerBase(ModelRunnerBase[TModelInputForGPU]): if model_input.attn_metadata is not None: model_input.attn_metadata.enable_kv_scales_calculation = False - import nvtx - with nvtx.annotate("execute_model"): - self.execute_model(model_input, kv_caches, intermediate_tensors) + self.execute_model(model_input, kv_caches, intermediate_tensors) torch.cuda.synchronize() if self.lora_config: self._remove_dummy_loras() @@ -2125,8 +2123,6 @@ class CUDAGraphRunner(nn.Module): ) -> torch.Tensor: attn_metadata: AttentionMetadata = get_forward_context().attn_metadata - print("=== CUDAGraphRunner forward ===") - # Copy the input tensors to the input buffers. self.input_buffers["input_ids"].copy_(input_ids, non_blocking=True) if positions is not None: