Currently, when a request is cancelled while executing its final step, "completion" is handled based on normal stop processing (e.g. length or stop token), so the abort has no effect. This is typically not a problem, but when a KV connector is involved, the connector is told that the request completed successfully rather than being aborted. This is problematic for disaggregated prefill, which frees KV cache blocks if the request was aborted but not if it completed successfully; since the cancelled request will never be sent to the decode side, its KV cache blocks remain pinned until the fall-back timeout expires. The problem is exacerbated when many requests are cancelled and/or there are large prefills whose forward pass takes a long time (since the window is bigger). This PR fixes the problem by processing pending aborts immediately prior to processing model output each step; we process only aborts, not new requests, since it's preferable for latency to process model outputs before new incoming requests. Fixes #26400. Signed-off-by: Nick Hill <nhill@redhat.com>
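For context, here is a minimal sketch of the per-step ordering this PR establishes in the engine core. Only `_process_aborts_queue()` and `update_from_output()` are names taken from this change and its test; the `step()` method and the `scheduler`/`model_executor` attributes are illustrative scaffolding, not the exact vLLM internals:

```python
# Illustrative sketch of the fixed per-step ordering; not the real engine loop.
def step(self):
    scheduler_output = self.scheduler.schedule()
    model_output = self.model_executor.execute_model(scheduler_output)

    # The fix: drain pending aborts (aborts only, not new requests, to keep
    # output-processing latency low) before the model output is processed.
    # A request cancelled during its final step is then finished as aborted
    # (FINISHED_ABORTED) rather than as a normal stop (e.g.
    # FINISHED_LENGTH_CAPPED), so a KV connector can free its blocks instead
    # of leaving them pinned until the fall-back timeout.
    self._process_aborts_queue()

    # Normal stop processing (length cap, stop tokens) happens in here;
    # without the eager abort above, a cancelled request would be marked as
    # having completed successfully.
    return self.scheduler.update_from_output(scheduler_output, model_output)
```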
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

"""
Test for the fix in PR #29987: Eagerly abort cancelled final-step requests.

This test verifies that when a request is aborted during its final execution
step (when it would naturally complete), it is properly marked as aborted
rather than being treated as normally completed.

The test uses a dummy KV connector to verify that the connector receives
the correct finish status (FINISHED_ABORTED, not FINISHED_LENGTH_CAPPED).
"""

import asyncio
import tempfile
import time
from pathlib import Path
from typing import Any
from unittest.mock import patch

import pytest

from vllm import SamplingParams
from vllm.config import KVTransferConfig, VllmConfig
from vllm.distributed.kv_transfer.kv_connector.factory import KVConnectorFactory
from vllm.distributed.kv_transfer.kv_connector.v1.base import (
    KVConnectorBase_V1,
    KVConnectorMetadata,
    KVConnectorRole,
)
from vllm.engine.arg_utils import AsyncEngineArgs
from vllm.platforms import current_platform
from vllm.sampling_params import RequestOutputKind
from vllm.utils.torch_utils import set_default_torch_num_threads
from vllm.v1.core.sched.output import SchedulerOutput
from vllm.v1.engine.async_llm import AsyncLLM
from vllm.v1.kv_cache_interface import KVCacheConfig
from vllm.v1.request import Request

if not current_platform.is_cuda():
    pytest.skip(reason="V1 currently only supported on CUDA.", allow_module_level=True)

TEXT_PROMPT = "Hello"


class DummyKVConnectorMetadata(KVConnectorMetadata):
    """Dummy metadata for the test connector."""

    def __init__(self):
        # Unused payload; present only so the scheduler has metadata to pass.
        self.requests: list = []


class DummyKVConnector(KVConnectorBase_V1):
    """
    Dummy KV connector that captures request finish statuses to a file.
    This is used to verify the fix - without the fix, a request aborted
    during its final step would be captured as FINISHED_LENGTH_CAPPED
    instead of FINISHED_ABORTED.

    The connector runs in a separate process, so we write statuses to a file
    that can be read by the test process.
    """

    def __init__(
        self,
        vllm_config: VllmConfig,
        role: KVConnectorRole,
        kv_cache_config: KVCacheConfig | None = None,
    ):
        super().__init__(vllm_config, role, kv_cache_config)
        # Get the status file path from extra config
        extra_config = vllm_config.kv_transfer_config.kv_connector_extra_config or {}
        self.status_file = extra_config.get("status_file")
        # Log that we were initialized
        if self.status_file:
            try:
                with open(self.status_file, "a") as f:
                    f.write(f"INIT:{role.name}\n")
            except Exception:
                pass

    def get_num_new_matched_tokens(
        self,
        request: Request,
        num_computed_tokens: int,
    ) -> tuple[int | None, bool]:
        # No externally matched tokens; nothing to load asynchronously.
        return (0, False)

    def update_state_after_alloc(
        self,
        request: Request,
        blocks: Any,
        num_external_tokens: int,
    ):
        pass

    def build_connector_meta(
        self, scheduler_output: SchedulerOutput
    ) -> KVConnectorMetadata:
        return DummyKVConnectorMetadata()

    def request_finished(
        self,
        request: Request,
        block_ids: list[int],
    ) -> tuple[bool, dict[str, Any] | None]:
        """Capture the request status when finished by writing to a file."""
        if self.status_file:
            try:
                with open(self.status_file, "a") as f:
                    # Write the status name (e.g., "FINISHED_ABORTED")
                    f.write(f"{request.status.name}\n")
            except Exception as e:
                # Log but don't fail - this is just test instrumentation
                print(f"[DummyKVConnector] Failed to write status: {e}")
        return False, None

    # The remaining connector hooks are no-ops; this test transfers no KV data.
    def start_load_kv(self, forward_context: Any, **kwargs: Any) -> None:
        pass

    def wait_for_layer_load(self, layer_name: str) -> None:
        pass

    def save_kv_layer(
        self,
        layer_name: str,
        kv_layer: Any,
        attn_metadata: Any,
        **kwargs: Any,
    ) -> None:
        pass

    def wait_for_save(self):
        pass


# Register the dummy connector
KVConnectorFactory.register_connector(
    "DummyKVConnector", __name__, DummyKVConnector.__name__
)


@pytest.mark.parametrize("async_scheduling", [False, True])
@pytest.mark.asyncio
async def test_abort_during_final_step(async_scheduling: bool):
    """
    Test that a request aborted during its final execution step is treated as
    aborted rather than completed.

    This test:
    1. Monkeypatches execute_model to wait for a file to be deleted
    2. Configures a dummy KV connector to capture finish statuses
    3. Starts a request with max_tokens=1 (will complete on first decode step)
    4. Aborts the request, then deletes the file to unblock execute_model
    5. Verifies the KV connector received FINISHED_ABORTED not FINISHED_LENGTH_CAPPED

    See https://github.com/vllm-project/vllm/pull/29987.

    Without the fix, the KV connector would see FINISHED_LENGTH_CAPPED because
    update_from_output() would mark the request as completed before processing
    the abort. This causes KV cache blocks to not be freed properly in
    disaggregated prefill scenarios.

    With the fix, _process_aborts_queue() runs before update_from_output(), so the
    abort takes precedence and the KV connector sees FINISHED_ABORTED.
    """

    # Create three temporary files:
    # 1. ready_file: deleted by execute_model to signal it has started
    # 2. block_file: execute_model waits for this to be deleted
    # 3. status_file: KV connector writes finish statuses here
    with tempfile.NamedTemporaryFile(delete=False) as f:
        ready_file = Path(f.name)
    with tempfile.NamedTemporaryFile(delete=False) as f2:
        block_file = Path(f2.name)
    with tempfile.NamedTemporaryFile(delete=False, mode="w") as f3:
        status_file = Path(f3.name)

    try:
        # Get the original execute_model method
        from vllm.v1.worker.gpu_worker import Worker

        original_execute_model = Worker.execute_model

        def execute_model_with_wait(self, scheduler_output):
            # Signal that execute_model has been called by deleting ready_file
            if ready_file.exists():
                ready_file.unlink()

            # Wait for the block file to be deleted (triggered from test after abort)
            # This runs in the worker process (after fork), so we poll the filesystem
            while block_file.exists():
                time.sleep(0.01)
            return original_execute_model(self, scheduler_output)

        # Patch execute_model to inject the wait
        # This happens before the worker process is forked, so the patch applies there
        with patch.object(Worker, "execute_model", execute_model_with_wait):
            request_id = "test-abort-final-step"

            # Configure engine with dummy KV connector
            # Pass the status file path so the connector can write to it
            kv_transfer_config = KVTransferConfig(
                kv_connector="DummyKVConnector",
                kv_role="kv_both",
                kv_connector_extra_config={"status_file": str(status_file)},
            )
            engine_args = AsyncEngineArgs(
                model="meta-llama/Llama-3.2-1B-Instruct",
                enforce_eager=True,
                async_scheduling=async_scheduling,
                kv_transfer_config=kv_transfer_config,
            )

            with set_default_torch_num_threads(1):
                engine = AsyncLLM.from_engine_args(engine_args)

            try:
                # Create a request that will complete after just 1 token
                sampling_params = SamplingParams(
                    max_tokens=1,
                    ignore_eos=True,
                    output_kind=RequestOutputKind.DELTA,
                )

                # Start generation in a task
                outputs = []

                async def generate():
                    async for output in engine.generate(
                        request_id=request_id,
                        prompt=TEXT_PROMPT,
                        sampling_params=sampling_params,
                    ):
                        outputs.append(output)

                gen_task = asyncio.create_task(generate())

                # Wait for execute_model to signal it has started (with timeout)
                timeout = 5.0  # 5 second timeout
                start_time = time.time()
                while ready_file.exists():
                    if time.time() - start_time > timeout:
                        raise TimeoutError(
                            "Timeout waiting for execute_model to start. "
                            "The monkeypatch may not be working correctly, "
                            "for example if spawn was used instead of fork."
                        )
                    await asyncio.sleep(0.01)

                # Abort the request while execute_model is blocked
                await engine.abort(request_id)

                # Now unblock execute_model by deleting the file
                # The abort should be processed before the model output
                block_file.unlink()

                # Wait for generation to complete
                await gen_task

                # Give the scheduler a moment to finish cleanup
                await asyncio.sleep(0.1)

                # Verify we got output
                assert len(outputs) > 0, "Should have received at least one output"

                # The final output should have finish_reason="abort"
                final_output = outputs[-1]
                assert final_output.finished, (
                    "Final output should be marked as finished"
                )
                assert final_output.outputs[0].finish_reason == "abort", (
                    f"Expected finish_reason='abort' but got "
                    f"'{final_output.outputs[0].finish_reason}'."
                )

                with open(status_file) as f4:
                    status_lines = f4.read().strip().split("\n")
                # Filter for actual finish statuses (not INIT or empty lines)
                captured_statuses = [
                    line
                    for line in status_lines
                    if line and line.startswith("FINISHED_")
                ]

                assert len(captured_statuses) >= 1, (
                    f"Expected at least 1 captured finish status, got "
                    f"{len(captured_statuses)}. File content: {status_lines}"
                )

                assert "FINISHED_ABORTED" in captured_statuses, (
                    f"KV connector should see FINISHED_ABORTED but got "
                    f"{captured_statuses}."
                )

                # Verify cleanup
                assert not engine.output_processor.has_unfinished_requests()

            finally:
                # Shutdown the engine
                engine.shutdown()

    finally:
        # Clean up temporary files if they still exist
        if ready_file.exists():
            ready_file.unlink()
        if block_file.exists():
            block_file.unlink()
        if status_file.exists():
            status_file.unlink()
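

# To run this test locally, an invocation like the following covers both
# scheduler modes via the async_scheduling parametrization (the file path is
# hypothetical; substitute this file's actual location in the tree):
#   pytest -q <path-to-this-file> -k test_abort_during_final_step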