From 00a8d7628c202f580d5230eaa7fe94338a0549f5 Mon Sep 17 00:00:00 2001
From: Lucas Wilkinson <LucasWilkinson@users.noreply.github.com>
Date: Tue, 16 Dec 2025 09:46:22 -0500
Subject: [PATCH] [BugFix] Fix memory spike in workspace allocation (#30744)

Signed-off-by: Lucas Wilkinson <lwilkins@redhat.com>
Co-authored-by: Cyrus Leung <tlleungac@connect.ust.hk>
---
 .buildkite/test-pipeline.yaml |  2 ++
 vllm/v1/worker/workspace.py   | 14 +++++++++++---
 2 files changed, 13 insertions(+), 3 deletions(-)

diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml
index 2dcca5711b3d5..9d0b3fdd3a02c 100644
--- a/.buildkite/test-pipeline.yaml
+++ b/.buildkite/test-pipeline.yaml
@@ -1223,6 +1223,8 @@ steps:
     # FIXIT: find out which code initialize cuda before running the test
     # before the fix, we need to use spawn to test it
     - export VLLM_WORKER_MULTIPROC_METHOD=spawn
+    # Alot of these tests are on the edge of OOMing
+    - export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
     # There is some Tensor Parallelism related processing logic in LoRA that
     # requires multi-GPU testing for validation.
     - pytest -v -s -x lora/test_chatglm3_tp.py
diff --git a/vllm/v1/worker/workspace.py b/vllm/v1/worker/workspace.py
index a16dde1f67800..bbbd7705d54e4 100644
--- a/vllm/v1/worker/workspace.py
+++ b/vllm/v1/worker/workspace.py
@@ -145,12 +145,20 @@ class WorkspaceManager:
 
             for ubatch_id in range(self._num_ubatches):
                 current_workspace = self._current_workspaces[ubatch_id]
-                if current_workspace is None:
+                if (
+                    current_workspace is None
+                    or self._workspace_size_bytes(current_workspace) < required_bytes
+                ):
+                    # Delete old tensor before allocating new one to avoid
+                    # memory spike from resize_(). resize_() allocates new
+                    # memory before freeing old, which can cause OOM.
+                    # Must clear the list reference first since local var
+                    # is just a copy of the reference.
+                    self._current_workspaces[ubatch_id] = None
+                    del current_workspace
                     self._current_workspaces[ubatch_id] = torch.empty(
                         (required_bytes,), dtype=torch.uint8, device=self._device
                     )
-                elif self._workspace_size_bytes(current_workspace) < required_bytes:
-                    current_workspace.resize_(required_bytes)
 
             if envs.VLLM_DEBUG_WORKSPACE:
                 logger.info(