From 91ac7f764d04e7a9103e3c839244ce241a43b45e Mon Sep 17 00:00:00 2001
From: wuhang <wuhang6@huawei.com>
Date: Mon, 6 Oct 2025 12:20:06 +0800
Subject: [PATCH] [CI][gpt-oss] Enable python tool tests in CI (#24315)

Signed-off-by: wuhang <wuhang6@huawei.com>
---
 requirements/common.txt                       |  1 +
 .../openai/test_response_api_with_harmony.py  | 38 ++++++++-----------
 vllm/entrypoints/tool.py                      | 11 ++++--
 3 files changed, 24 insertions(+), 26 deletions(-)

diff --git a/requirements/common.txt b/requirements/common.txt
index a52745f698703..1530e5a09e757 100644
--- a/requirements/common.txt
+++ b/requirements/common.txt
@@ -49,3 +49,4 @@ pybase64 # fast base64 implementation
 cbor2 # Required for cross-language serialization of hashable objects
 setproctitle # Used to set process names for better debugging and monitoring
 openai-harmony >= 0.0.3  # Required for gpt-oss
+gpt-oss >= 0.0.7
diff --git a/tests/entrypoints/openai/test_response_api_with_harmony.py b/tests/entrypoints/openai/test_response_api_with_harmony.py
index 3a564bef0d126..fb0035de67c26 100644
--- a/tests/entrypoints/openai/test_response_api_with_harmony.py
+++ b/tests/entrypoints/openai/test_response_api_with_harmony.py
@@ -15,22 +15,15 @@ MODEL_NAME = "openai/gpt-oss-20b"
 
 
 @pytest.fixture(scope="module")
-def monkeypatch_module():
-    from _pytest.monkeypatch import MonkeyPatch
-
-    mpatch = MonkeyPatch()
-    yield mpatch
-    mpatch.undo()
-
-
-@pytest.fixture(scope="module")
-def server(monkeypatch_module: pytest.MonkeyPatch):
+def server():
     args = ["--enforce-eager", "--tool-server", "demo"]
+    env_dict = dict(
+        VLLM_ENABLE_RESPONSES_API_STORE="1",
+        PYTHON_EXECUTION_BACKEND="dangerously_use_uv",
+    )
 
-    with monkeypatch_module.context() as m:
-        m.setenv("VLLM_ENABLE_RESPONSES_API_STORE", "1")
-        with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
-            yield remote_server
+    with RemoteOpenAIServer(MODEL_NAME, args, env_dict=env_dict) as remote_server:
+        yield remote_server
 
 
 @pytest_asyncio.fixture
@@ -316,7 +309,7 @@ async def test_streaming(client: OpenAI, model_name: str, background: bool):
     # TODO: Add back when web search and code interpreter are available in CI
     prompts = [
         "tell me a story about a cat in 20 words",
-        # "What is 13 * 24? Use python to calculate the result.",
+        "What is 13 * 24? Use python to calculate the result.",
         # "When did Jensen found NVIDIA? Search it and answer the year only.",
     ]
 
@@ -329,12 +322,7 @@ async def test_streaming(client: OpenAI, model_name: str, background: bool):
                 # {
                 #     "type": "web_search_preview"
                 # },
-                # {
-                #     "type": "code_interpreter",
-                #     "container": {
-                #         "type": "auto"
-                #     }
-                # },
+                {"type": "code_interpreter", "container": {"type": "auto"}},
             ],
             stream=True,
             background=background,
@@ -412,6 +400,7 @@ async def test_streaming(client: OpenAI, model_name: str, background: bool):
                 async for event in stream:
                     counter += 1
                     assert event == events[counter]
+            assert counter == len(events) - 1
 
 
 @pytest.mark.asyncio
@@ -429,7 +418,6 @@ async def test_web_search(client: OpenAI, model_name: str):
 
 @pytest.mark.asyncio
 @pytest.mark.parametrize("model_name", [MODEL_NAME])
-@pytest.mark.skip(reason="Code interpreter tool is not available in CI yet.")
 async def test_code_interpreter(client: OpenAI, model_name: str):
     response = await client.responses.create(
         model=model_name,
@@ -443,10 +431,16 @@ async def test_code_interpreter(client: OpenAI, model_name: str):
             "and you must print to see the output."
         ),
         tools=[{"type": "code_interpreter", "container": {"type": "auto"}}],
+        temperature=0.0,  # Make the response output more deterministic
     )
     assert response is not None
     assert response.status == "completed"
     assert response.usage.output_tokens_details.tool_output_tokens > 0
+    for item in response.output:
+        if item.type == "message":
+            output_string = item.content[0].text
+            print("output_string: ", output_string, flush=True)
+            assert "5846" in output_string
 
 
 def get_weather(latitude, longitude):
diff --git a/vllm/entrypoints/tool.py b/vllm/entrypoints/tool.py
index 1bc6a85a9a25c..c74ce1ee16de1 100644
--- a/vllm/entrypoints/tool.py
+++ b/vllm/entrypoints/tool.py
@@ -14,10 +14,12 @@ if TYPE_CHECKING:
 
 logger = init_logger(__name__)
 
+MIN_GPT_OSS_VERSION = "0.0.7"
+
 
 def validate_gpt_oss_install():
     """
-    Check if the gpt-oss is installed and its version is at least 0.0.3.
+    Check that gpt-oss is installed and that its version is at least 0.0.7.
     If not, raise an ImportError.
     """
     from importlib.metadata import PackageNotFoundError, version
@@ -25,16 +27,17 @@ def validate_gpt_oss_install():
     from packaging.version import InvalidVersion, Version
 
     try:
-        pkg_version_str = version("gpt_oss")  # e.g., "0.0.5"
+        pkg_version_str = version("gpt_oss")
         pkg_version = Version(pkg_version_str)
     except PackageNotFoundError:
         raise ImportError("Package 'gpt_oss' is not installed.") from None
     except InvalidVersion as e:
         raise ImportError(f"Invalid version string for 'gpt_oss': {e}") from None
 
-    if pkg_version < Version("0.0.3"):
+    if pkg_version < Version(MIN_GPT_OSS_VERSION):
         raise ImportError(
-            f"gpt_oss >= 0.0.3 is required, but {pkg_version} is installed."
+            f"gpt_oss >= {MIN_GPT_OSS_VERSION} is required, "
+            f"but {pkg_version} is installed."
         ) from None