misc cleanups to prepare for rebase

Signed-off-by: Sage Moore <sage@neuralmagic.com>
This commit is contained in:
Sage Moore 2025-06-02 18:00:56 +00:00
parent 8f592524cb
commit 90e46ee5e3
2 changed files with 1 addition and 11 deletions

View File

@ -49,7 +49,6 @@ from vllm.model_executor.model_loader.weight_utils import (
default_weight_loader, maybe_remap_kv_scale_name)
from vllm.model_executor.sampling_metadata import SamplingMetadata
from vllm.sequence import IntermediateTensors
from vllm.v1.worker.ubatching import get_current_ubatch_context
from .interfaces import SupportsPP
from .utils import (PPMissingLayer, is_pp_missing_parameter,
@ -564,8 +563,6 @@ class DeepseekV2DecoderLayer(nn.Module):
hidden_states: torch.Tensor,
residual: Optional[torch.Tensor],
) -> torch.Tensor:
# if (ubatch_ctx := get_current_ubatch_context()) is not None:
# print("in decoder, ubatch:", ubatch_ctx.id)
# Self Attention
if residual is None:
residual = hidden_states
@ -659,9 +656,6 @@ class DeepseekV2Model(nn.Module):
intermediate_tensors: Optional[IntermediateTensors],
inputs_embeds: Optional[torch.Tensor] = None,
) -> Union[torch.Tensor, IntermediateTensors]:
# if (ubatch_ctx := get_current_ubatch_context()) is not None:
# print("in forward, ubatch:", ubatch_ctx.id)
if get_pp_group().is_first_rank:
if inputs_embeds is not None:
hidden_states = inputs_embeds

View File

@ -1417,9 +1417,7 @@ class GPUModelRunnerBase(ModelRunnerBase[TModelInputForGPU]):
if model_input.attn_metadata is not None:
model_input.attn_metadata.enable_kv_scales_calculation = False
import nvtx
with nvtx.annotate("execute_model"):
self.execute_model(model_input, kv_caches, intermediate_tensors)
self.execute_model(model_input, kv_caches, intermediate_tensors)
torch.cuda.synchronize()
if self.lora_config:
self._remove_dummy_loras()
@ -2125,8 +2123,6 @@ class CUDAGraphRunner(nn.Module):
) -> torch.Tensor:
attn_metadata: AttentionMetadata = get_forward_context().attn_metadata
print("=== CUDAGraphRunner forward ===")
# Copy the input tensors to the input buffers.
self.input_buffers["input_ids"].copy_(input_ids, non_blocking=True)
if positions is not None: