# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
Integration tests for MultiprocExecutor at the executor level.

These tests exercise the executor directly, without going through the LLM
interface, focusing on executor initialization, RPC calls, and distributed
execution.
"""

import multiprocessing
import os
import time

from tests.utils import multi_gpu_test
from vllm.config import VllmConfig
from vllm.engine.arg_utils import EngineArgs
from vllm.utils import get_open_port
from vllm.v1.core.sched.output import SchedulerOutput
from vllm.v1.executor.multiproc_executor import MultiprocExecutor

MODEL = "facebook/opt-125m"


def create_vllm_config(
    tensor_parallel_size: int = 1,
    pipeline_parallel_size: int = 1,
    max_model_len: int = 256,
    gpu_memory_utilization: float = 0.3,
    distributed_executor_backend: str = "mp",
    nnodes: int = 1,
    node_rank: int = 0,
    master_port: int = 0,
) -> VllmConfig:
    """Create a VllmConfig for testing using EngineArgs."""
    engine_args = EngineArgs(
        model=MODEL,
        tensor_parallel_size=tensor_parallel_size,
        pipeline_parallel_size=pipeline_parallel_size,
        max_model_len=max_model_len,
        gpu_memory_utilization=gpu_memory_utilization,
        distributed_executor_backend=distributed_executor_backend,
        enforce_eager=True,
    )
    vllm_config = engine_args.create_engine_config()

    # Override distributed node settings if needed
    if nnodes > 1 or node_rank > 0:
        vllm_config.parallel_config.nnodes = nnodes
        vllm_config.parallel_config.node_rank = node_rank
        vllm_config.parallel_config.master_port = master_port
    if nnodes > 1:
        vllm_config.parallel_config.disable_custom_all_reduce = True
    return vllm_config
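

# Hedged usage sketch for the helper above: simulating a 2-node TP=4 setup
# means creating one config per simulated node, all sharing one master_port
# (this mirrors what test_multiproc_executor_multi_node below does):
#
#     port = get_open_port()
#     cfg_node0 = create_vllm_config(
#         tensor_parallel_size=4, nnodes=2, node_rank=0, master_port=port)
#     cfg_node1 = create_vllm_config(
#         tensor_parallel_size=4, nnodes=2, node_rank=1, master_port=port)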


def create_test_scheduler_output(num_requests: int = 1) -> SchedulerOutput:
    """Create a minimal SchedulerOutput for testing."""
    # This is a simplified version - in practice you'd need proper
    # SchedulerOutput construction based on the actual vLLM v1 API.
    return SchedulerOutput(
        scheduled_new_reqs=[],
        scheduled_resumed_reqs=[],
        scheduled_running_reqs=[],
        num_scheduled_tokens={},
        total_num_scheduled_tokens=0,
    )
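

def _example_execute_model_wiring(executor: MultiprocExecutor) -> None:
    """Hedged sketch (not collected by pytest: no ``test_`` prefix).

    Shows how the helper above would be wired into an executor call. A real
    SchedulerOutput needs fully populated request metadata before
    ``execute_model`` produces meaningful output, so treat this as a
    shape-only illustration rather than an executed code path.
    """
    scheduler_output = create_test_scheduler_output()
    executor.execute_model(scheduler_output)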


def test_multiproc_executor_initialization():
    """Test that MultiprocExecutor can be initialized with proper config."""
    vllm_config = create_vllm_config(
        tensor_parallel_size=1,
        pipeline_parallel_size=1,
    )

    # Create executor - this should initialize workers
    executor = MultiprocExecutor(vllm_config=vllm_config)

    # Verify executor properties
    assert executor.world_size == 1, "World size should be 1 for single GPU"
    assert executor.local_world_size == 1, "Local world size should be 1"
    assert hasattr(executor, "workers"), "Executor should have workers"
    assert len(executor.workers) == 1, "Should have 1 worker for single GPU"

    # Clean up
    executor.shutdown()


@multi_gpu_test(num_gpus=2)
def test_multiproc_executor_initialization_tensor_parallel():
    """Test MultiprocExecutor initialization with tensor parallelism."""
    vllm_config = create_vllm_config(
        tensor_parallel_size=2,
        pipeline_parallel_size=1,
    )

    # Create executor
    executor = MultiprocExecutor(vllm_config=vllm_config)

    # Verify executor properties
    assert executor.world_size == 2, "World size should be 2 for TP=2"
    assert executor.local_world_size == 2, "Local world size should be 2"
    assert len(executor.workers) == 2, "Should have 2 workers for TP=2"

    # Verify output rank calculation
    output_rank = executor._get_output_rank()
    assert output_rank == 0, "Output rank should be 0 for TP=2, PP=1"

    # Clean up
    executor.shutdown()


@multi_gpu_test(num_gpus=2)
def test_multiproc_executor_collective_rpc():
    """Test collective RPC calls to all workers."""
    vllm_config = create_vllm_config(
        tensor_parallel_size=2,
        pipeline_parallel_size=1,
    )

    # Create executor
    executor = MultiprocExecutor(vllm_config=vllm_config)
    try:
        # Test check_health RPC - should work without errors
        executor.check_health()

        # We only verify that the RPC mechanism works; actual model
        # execution is not exercised here.
        assert not executor.is_failed, "Executor should not be in failed state"
    finally:
        # Clean up
        executor.shutdown()
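

def _example_collective_rpc_callable(executor: MultiprocExecutor) -> None:
    """Hedged sketch (not collected by pytest: no ``test_`` prefix).

    collective_rpc dispatches a worker method by name; recent vLLM versions
    also accept a picklable callable that receives the worker as its first
    argument. The exact signature is an assumption of this sketch.
    """
    ranks = executor.collective_rpc(lambda worker: worker.rank)
    assert sorted(ranks) == list(range(executor.world_size))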


def test_multiproc_executor_failure_callback():
    """Test failure callback registration and invocation."""
    vllm_config = create_vllm_config(
        tensor_parallel_size=1,
        pipeline_parallel_size=1,
    )
    executor = MultiprocExecutor(vllm_config=vllm_config)
    try:
        # Test callback registration
        callback_invoked = []

        def test_callback():
            callback_invoked.append(True)

        # Register callback
        executor.register_failure_callback(test_callback)

        # Callback should not be invoked yet
        assert len(callback_invoked) == 0, "Callback should not be invoked immediately"

        # Simulate failure
        executor.is_failed = True

        # Register another callback - should be invoked immediately
        executor.register_failure_callback(test_callback)
        assert len(callback_invoked) == 1, (
            "Callback should be invoked when executor is failed"
        )
    finally:
        # Clean up
        executor.shutdown()
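

def _example_failure_callback_wiring(executor: MultiprocExecutor) -> None:
    """Hedged sketch (not collected by pytest: no ``test_`` prefix).

    In real usage a failure callback typically signals higher-level
    machinery to tear down when a worker process dies; an Event makes the
    pattern easy to see.
    """
    import threading  # local import: only this illustrative sketch needs it

    worker_died = threading.Event()
    executor.register_failure_callback(worker_died.set)
    # A supervisor thread could now block on worker_died.wait().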


@multi_gpu_test(num_gpus=2)
def test_multiproc_executor_worker_monitor():
    """Test that worker monitor is set up correctly."""
    vllm_config = create_vllm_config(
        tensor_parallel_size=2,
        pipeline_parallel_size=1,
    )
    executor = MultiprocExecutor(vllm_config=vllm_config)
    try:
        # Verify all worker processes are alive
        for worker in executor.workers:
            assert worker.proc.is_alive(), f"Worker rank {worker.rank} should be alive"

        # Verify executor is not in failed state
        assert not executor.is_failed, "Executor should not be in failed state"
    finally:
        # Clean up
        executor.shutdown()

    # After shutdown, workers should be terminated
    time.sleep(0.5)  # Give processes time to terminate
    for worker in executor.workers:
        assert not worker.proc.is_alive(), (
            f"Worker rank {worker.rank} should terminate after shutdown"
        )


@multi_gpu_test(num_gpus=2)
def test_multiproc_executor_get_response_message_queues():
    """Test message queue retrieval for different ranks."""
    vllm_config = create_vllm_config(
        tensor_parallel_size=2,
        pipeline_parallel_size=1,
    )
    executor = MultiprocExecutor(vllm_config=vllm_config)
    try:
        # Get all message queues
        all_queues = executor.get_response_mqs()
        assert len(all_queues) == 2, "Should have 2 message queues for 2 workers"

        # Get message queue for a specific rank
        rank0_queue = executor.get_response_mqs(unique_reply_rank=0)
        assert len(rank0_queue) == 1, "Should have 1 message queue for rank 0"
        rank1_queue = executor.get_response_mqs(unique_reply_rank=1)
        assert len(rank1_queue) == 1, "Should have 1 message queue for rank 1"
    finally:
        # Clean up
        executor.shutdown()


def test_multiproc_executor_shutdown_cleanup():
    """Test that shutdown properly cleans up resources."""
    vllm_config = create_vllm_config(
        tensor_parallel_size=1,
        pipeline_parallel_size=1,
    )
    executor = MultiprocExecutor(vllm_config=vllm_config)

    # Verify executor is set up
    assert hasattr(executor, "workers"), "Executor should have workers"
    assert len(executor.workers) > 0, "Should have at least one worker"

    # Shutdown
    executor.shutdown()

    # Verify cleanup
    time.sleep(0.5)  # Give processes time to terminate
    for worker in executor.workers:
        assert not worker.proc.is_alive(), "Worker processes should be terminated"

    # Verify shutdown event is set
    assert executor.shutdown_event.is_set(), "Shutdown event should be set"

    # Multiple shutdowns should be safe (idempotent)
    executor.shutdown()
    executor.shutdown()


@multi_gpu_test(num_gpus=4)
def test_multiproc_executor_pipeline_parallel():
    """Test MultiprocExecutor with pipeline parallelism."""
    vllm_config = create_vllm_config(
        tensor_parallel_size=2,
        pipeline_parallel_size=2,
    )
    executor = MultiprocExecutor(vllm_config=vllm_config)
    try:
        # Verify executor properties
        assert executor.world_size == 4, "World size should be 4 for TP=2, PP=2"
        assert len(executor.workers) == 4, "Should have 4 workers"

        # Verify output rank calculation.
        # For TP=2, PP=2 the output comes from the last PP stage (ranks 2-3),
        # specifically rank 2, the first rank of that stage.
        output_rank = executor._get_output_rank()
        assert output_rank == 2, (
            "Output rank should be 2 (first rank of last PP stage)"
        )

        # Verify max_concurrent_batches for pipeline parallel
        assert executor.max_concurrent_batches == 2, (
            "Max concurrent batches should equal PP size"
        )
    finally:
        # Clean up
        executor.shutdown()
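

def _expected_output_rank(tp: int, pp: int) -> int:
    """Hedged sketch: the output rank is the first rank of the last PP stage.

    Assuming ranks are laid out with TP innermost, as the assertions above
    imply, that is world_size - tp:

    >>> _expected_output_rank(2, 1)
    0
    >>> _expected_output_rank(2, 2)
    2
    """
    return tp * pp - tp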


def test_multiproc_executor_properties():
    """Test various executor properties and configurations."""
    vllm_config = create_vllm_config(
        tensor_parallel_size=1,
        pipeline_parallel_size=1,
    )
    executor = MultiprocExecutor(vllm_config=vllm_config)
    try:
        # Test supports_pp property
        assert MultiprocExecutor.supports_pp is True, (
            "MultiprocExecutor should support pipeline parallelism"
        )

        # Test world_size calculation
        assert executor.world_size == (
            executor.parallel_config.tensor_parallel_size
            * executor.parallel_config.pipeline_parallel_size
        ), "World size should equal TP * PP"

        # Test local_world_size calculation
        assert executor.local_world_size == (
            executor.parallel_config.world_size // executor.parallel_config.nnodes
        ), "Local world size should be world_size // nnodes"
    finally:
        # Clean up
        executor.shutdown()


@multi_gpu_test(num_gpus=4)
def test_multiproc_executor_multi_node():
    """
    Test MultiprocExecutor with a multi-node configuration.

    This simulates 2 nodes with TP=4:
    - Node 0 (rank 0): uses GPUs 0,1 (CUDA_VISIBLE_DEVICES=0,1) with TP=2
    - Node 1 (rank 1): uses GPUs 2,3 (CUDA_VISIBLE_DEVICES=2,3) with TP=2

    Total world_size = 4, nnodes = 2.
    """
    port = get_open_port()
    # symm_mem all-reduce does not work when simulating multiple nodes on a
    # single host, so disable it.
    os.environ["VLLM_ALLREDUCE_USE_SYMM_MEM"] = "0"

    def run_node(node_rank: int, result_queue: multiprocessing.Queue, port: int):
        """Run a single node's executor."""
        executor = None
        try:
            # Set CUDA_VISIBLE_DEVICES for this node
            if node_rank == 0:
                os.environ["CUDA_VISIBLE_DEVICES"] = "0,1"
            else:
                os.environ["CUDA_VISIBLE_DEVICES"] = "2,3"

            # Create config for this node
            vllm_config = create_vllm_config(
                tensor_parallel_size=4,  # Total TP across all nodes
                pipeline_parallel_size=1,
                nnodes=2,  # 2 nodes
                node_rank=node_rank,
                master_port=port,  # Same port on both nodes
            )

            # Create executor for this node
            executor = MultiprocExecutor(vllm_config=vllm_config)

            # Verify node-specific properties
            assert executor.world_size == 4, (
                f"World size should be 4 on node {node_rank}"
            )
            assert executor.local_world_size == 2, (
                f"Local world size should be 2 on node {node_rank}"
            )
            assert len(executor.workers) == 2, (
                f"Should have 2 local workers on node {node_rank}"
            )

            # Verify worker ranks are correct for this node
            expected_ranks = [node_rank * 2, node_rank * 2 + 1]
            actual_ranks = sorted([w.rank for w in executor.workers])
            assert actual_ranks == expected_ranks, (
                f"Node {node_rank} should have workers "
                f"with ranks {expected_ranks}, got {actual_ranks}"
            )

            # Verify all workers are alive
            for worker in executor.workers:
                assert worker.proc.is_alive(), (
                    f"Worker rank {worker.rank} should be alive on node {node_rank}"
                )

            # Put success result in queue BEFORE shutdown to avoid hanging
            result_queue.put({"node": node_rank, "success": True})
            time.sleep(2)
            executor.shutdown()
        except Exception as e:
            # Put failure result in queue
            result_queue.put({"node": node_rank, "success": False, "error": str(e)})
            raise
        finally:
            if executor is not None:
                executor.shutdown()

    # Create a queue to collect results from both processes
    result_queue: multiprocessing.Queue[dict[str, int | bool | str]] = (
        multiprocessing.Queue()
    )

    # Start both node processes
    processes = []
    for node_rank in range(2):
        p = multiprocessing.Process(
            target=run_node,
            args=(node_rank, result_queue, port),
            name=f"Node{node_rank}",
        )
        p.start()
        processes.append(p)

    # Wait for both processes to complete
    all_completed = True
    for p in processes:
        p.join(timeout=60)
        if p.is_alive():
            p.terminate()
            p.join(timeout=20)
            if p.is_alive():
                p.kill()
                p.join()
            all_completed = False

    # Check results from both nodes
    results: list[dict[str, int | bool | str]] = []
    while len(results) < 2:
        try:
            results.append(result_queue.get(timeout=1))
        except Exception:
            # Don't spin forever if a node crashed before reporting.
            break
    # Order results by node rank; queue arrival order is not guaranteed.
    results.sort(key=lambda r: r["node"])

    assert all_completed, "Not all processes completed successfully"
    assert len(results) == 2, f"Expected 2 results, got {len(results)}"
    assert results[0]["success"], f"Node 0 failed: {results[0]}"
    assert results[1]["success"], f"Node 1 failed: {results[1]}"
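

# Hedged note on the simulation above: on real hardware the two run_node
# calls would live on separate hosts, each launched with its own node_rank
# and CUDA_VISIBLE_DEVICES, while master_port (plus a master address that is
# reachable from both nodes) must match across nodes so the ranks can
# rendezvous.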