diff --git a/tests/async_engine/test_api_server.py b/tests/async_engine/test_api_server.py
index 248bfbc8ab5c..7f57d5cf9b18 100644
--- a/tests/async_engine/test_api_server.py
+++ b/tests/async_engine/test_api_server.py
@@ -25,21 +25,30 @@ def _query_server_long(prompt: str) -> dict:
 
 
 @pytest.fixture
-def api_server(tokenizer_pool_size: int):
+def api_server(tokenizer_pool_size: int, engine_use_ray: bool,
+               worker_use_ray: bool):
     script_path = Path(__file__).parent.joinpath(
         "api_server_async_engine.py").absolute()
-    uvicorn_process = subprocess.Popen([
+    commands = [
         sys.executable, "-u",
         str(script_path), "--model", "facebook/opt-125m", "--host",
         "127.0.0.1", "--tokenizer-pool-size",
         str(tokenizer_pool_size)
-    ])
+    ]
+    if engine_use_ray:
+        commands.append("--engine-use-ray")
+    if worker_use_ray:
+        commands.append("--worker-use-ray")
+    uvicorn_process = subprocess.Popen(commands)
     yield
     uvicorn_process.terminate()
 
 
 @pytest.mark.parametrize("tokenizer_pool_size", [0, 2])
-def test_api_server(api_server, tokenizer_pool_size: int):
+@pytest.mark.parametrize("worker_use_ray", [False, True])
+@pytest.mark.parametrize("engine_use_ray", [False, True])
+def test_api_server(api_server, tokenizer_pool_size: int, worker_use_ray: bool,
+                    engine_use_ray: bool):
     """
     Run the API server and test it.
 
diff --git a/vllm/engine/async_llm_engine.py b/vllm/engine/async_llm_engine.py
index f61049513512..1dbf58904541 100644
--- a/vllm/engine/async_llm_engine.py
+++ b/vllm/engine/async_llm_engine.py
@@ -333,8 +333,7 @@ class AsyncLLMEngine:
         if engine_config.device_config.device_type == "neuron":
             raise NotImplementedError("Neuron is not supported for "
                                       "async engine yet.")
-        elif (engine_config.parallel_config.worker_use_ray
-              or engine_args.engine_use_ray):
+        elif engine_config.parallel_config.worker_use_ray:
            initialize_ray_cluster(engine_config.parallel_config)
             from vllm.executor.ray_gpu_executor import RayGPUExecutorAsync
             executor_class = RayGPUExecutorAsync
@@ -410,8 +409,8 @@ class AsyncLLMEngine:
         else:
             # FIXME(woosuk): This is a bit hacky. Be careful when changing the
             # order of the arguments.
-            cache_config = args[1]
-            parallel_config = args[2]
+            cache_config = kwargs["cache_config"]
+            parallel_config = kwargs["parallel_config"]
             if parallel_config.tensor_parallel_size == 1:
                 num_gpus = cache_config.gpu_memory_utilization
             else:
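
Note (not part of the patch): the test change above leans on a pytest behavior
that is easy to miss. A fixture may declare the same argument names that the
test function parametrizes via @pytest.mark.parametrize, and pytest resolves
the parametrized value for each generated test case, so the api_server fixture
can build its command line from engine_use_ray and worker_use_ray. A minimal,
self-contained sketch of that pattern follows; the names server, flag, and
test_server are illustrative placeholders, not taken from the patch.

import pytest


@pytest.fixture
def server(flag: bool):
    # The fixture receives the value parametrized on the test below,
    # one per generated test case.
    yield f"flag={flag}"


@pytest.mark.parametrize("flag", [False, True])
def test_server(server: str, flag: bool):
    # Two test cases are generated; the fixture saw the matching value.
    assert server == f"flag={flag}"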