From 00a8d7628c202f580d5230eaa7fe94338a0549f5 Mon Sep 17 00:00:00 2001 From: Lucas Wilkinson Date: Tue, 16 Dec 2025 09:46:22 -0500 Subject: [PATCH] [BugFix] Fix memory spike in workspace allocation (#30744) Signed-off-by: Lucas Wilkinson Co-authored-by: Cyrus Leung --- .buildkite/test-pipeline.yaml | 2 ++ vllm/v1/worker/workspace.py | 14 +++++++++++--- 2 files changed, 13 insertions(+), 3 deletions(-) diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index 2dcca5711b3d5..9d0b3fdd3a02c 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -1223,6 +1223,8 @@ steps: # FIXIT: find out which code initialize cuda before running the test # before the fix, we need to use spawn to test it - export VLLM_WORKER_MULTIPROC_METHOD=spawn + # Alot of these tests are on the edge of OOMing + - export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True # There is some Tensor Parallelism related processing logic in LoRA that # requires multi-GPU testing for validation. - pytest -v -s -x lora/test_chatglm3_tp.py diff --git a/vllm/v1/worker/workspace.py b/vllm/v1/worker/workspace.py index a16dde1f67800..bbbd7705d54e4 100644 --- a/vllm/v1/worker/workspace.py +++ b/vllm/v1/worker/workspace.py @@ -145,12 +145,20 @@ class WorkspaceManager: for ubatch_id in range(self._num_ubatches): current_workspace = self._current_workspaces[ubatch_id] - if current_workspace is None: + if ( + current_workspace is None + or self._workspace_size_bytes(current_workspace) < required_bytes + ): + # Delete old tensor before allocating new one to avoid + # memory spike from resize_(). resize_() allocates new + # memory before freeing old, which can cause OOM. + # Must clear the list reference first since local var + # is just a copy of the reference. + self._current_workspaces[ubatch_id] = None + del current_workspace self._current_workspaces[ubatch_id] = torch.empty( (required_bytes,), dtype=torch.uint8, device=self._device ) - elif self._workspace_size_bytes(current_workspace) < required_bytes: - current_workspace.resize_(required_bytes) if envs.VLLM_DEBUG_WORKSPACE: logger.info(