mirror of
https://git.datalinker.icu/vllm-project/vllm.git
synced 2026-03-22 16:03:40 +08:00
FFN server: run via `vllm serve` with data parallelism (DP)
Signed-off-by: jiangkuaixue123 <jiangxiaozhou111@163.com>
This commit is contained in:
parent
28cba040c7
commit
eb2355c600
@ -557,7 +557,6 @@ class ParallelConfig:
|
||||
if self.distributed_executor_backend is None and self.world_size > 1:
|
||||
# We use multiprocessing by default if world_size fits on the
|
||||
# current node and we aren't in a ray placement group.
|
||||
|
||||
from vllm.v1.executor import ray_utils
|
||||
|
||||
backend: DistributedExecutorBackend = "mp"
|
||||
|
||||
@ -191,7 +191,6 @@ def run_multi_api_server(args: argparse.Namespace):
|
||||
assert external_dp_lb or hybrid_dp_lb or dp_rank == 0
|
||||
|
||||
api_server_manager: APIServerProcessManager | None = None
|
||||
|
||||
with launch_core_engines(
|
||||
vllm_config, executor_class, log_stats, num_api_servers
|
||||
) as (local_engine_manager, coordinator, addresses):
|
||||
|
||||
@ -1402,7 +1402,6 @@ async def run_server_worker(
|
||||
listen_address, sock, args, client_config=None, **uvicorn_kwargs
|
||||
) -> None:
|
||||
"""Run a single API server worker."""
|
||||
|
||||
if args.tool_parser_plugin and len(args.tool_parser_plugin) > 3:
|
||||
ToolParserManager.import_tool_parser(args.tool_parser_plugin)
|
||||
|
||||
|
||||
@ -103,6 +103,11 @@ class EngineCore:
|
||||
if executor_fail_callback is not None:
|
||||
self.model_executor.register_failure_callback(executor_fail_callback)
|
||||
|
||||
self.afd_config = vllm_config.afd_config
|
||||
if self.afd_config and self.afd_config.afd_role == "ffn":
|
||||
logger.info("jcz EngineCore ffn role")
|
||||
return
|
||||
|
||||
self.available_gpu_memory_for_kv_cache = -1
|
||||
|
||||
# Setup KV Caches and update CacheConfig after profiling.
|
||||
@ -601,6 +606,7 @@ class EngineCoreProc(EngineCore):
|
||||
executor_fail_callback = lambda: self.input_queue.put_nowait(
|
||||
(EngineCoreRequestType.EXECUTOR_FAILED, b"")
|
||||
)
|
||||
self.afd_config = vllm_config.afd_config
|
||||
|
||||
self.engine_index = engine_index
|
||||
identity = self.engine_index.to_bytes(length=2, byteorder="little")
|
||||
@ -855,7 +861,6 @@ class EngineCoreProc(EngineCore):
|
||||
set_process_title("EngineCore")
|
||||
decorate_logs()
|
||||
engine_core = EngineCoreProc(*args, **kwargs)
|
||||
|
||||
engine_core.run_busy_loop()
|
||||
|
||||
except SystemExit:
|
||||
@ -878,6 +883,23 @@ class EngineCoreProc(EngineCore):
|
||||
def run_busy_loop(self):
|
||||
"""Core busy loop of the EngineCore."""
|
||||
|
||||
if self.afd_config and self.afd_config.afd_role == "ffn":
|
||||
logger.info("AFD FFN Server started, workers running...")
|
||||
try:
|
||||
# Tell workers to start FFN server loops (one-time call)
|
||||
self.model_executor.collective_rpc("start_ffn_server_loop")
|
||||
|
||||
# Main thread waits without busy polling
|
||||
shutdown_event = threading.Event()
|
||||
shutdown_event.wait() # Block until interrupted
|
||||
|
||||
except KeyboardInterrupt:
|
||||
logger.info("Server shutting down...")
|
||||
self.model_executor.collective_rpc("stop_ffn_server_loop")
|
||||
except Exception as e:
|
||||
logger.error("Server error: %s", e)
|
||||
raise
|
||||
|
||||
# Loop until process is sent a SIGINT or SIGTERM
|
||||
while True:
|
||||
# 1) Poll the input queue until there is work to do.
|
||||
@ -1156,6 +1178,7 @@ class DPEngineCoreProc(EngineCoreProc):
|
||||
|
||||
# Initialize the engine.
|
||||
dp_rank = vllm_config.parallel_config.data_parallel_rank
|
||||
self.afd_config = vllm_config.afd_config
|
||||
super().__init__(
|
||||
vllm_config,
|
||||
local_client,
|
||||
@ -1238,6 +1261,22 @@ class DPEngineCoreProc(EngineCoreProc):
|
||||
|
||||
def run_busy_loop(self):
|
||||
"""Core busy loop of the EngineCore for data parallel case."""
|
||||
if self.afd_config and self.afd_config.afd_role == "ffn":
|
||||
logger.info("AFD FFN Server started, workers running...")
|
||||
try:
|
||||
# Tell workers to start FFN server loops (one-time call)
|
||||
self.model_executor.collective_rpc("start_ffn_server_loop")
|
||||
|
||||
# Main thread waits without busy polling
|
||||
shutdown_event = threading.Event()
|
||||
shutdown_event.wait() # Block until interrupted
|
||||
|
||||
except KeyboardInterrupt:
|
||||
logger.info("Server shutting down...")
|
||||
self.model_executor.collective_rpc("stop_ffn_server_loop")
|
||||
except Exception as e:
|
||||
logger.error("Server error: %s", e)
|
||||
raise
|
||||
|
||||
# Loop until process is sent a SIGINT or SIGTERM
|
||||
while True:
|
||||
|
||||
@ -16,7 +16,7 @@ import msgspec
|
||||
import zmq
|
||||
|
||||
from vllm import envs
|
||||
from vllm.config import CacheConfig, ParallelConfig, VllmConfig
|
||||
from vllm.config import AFDConfig, CacheConfig, ParallelConfig, VllmConfig
|
||||
from vllm.logger import init_logger
|
||||
from vllm.platforms import current_platform
|
||||
from vllm.ray.ray_env import get_env_vars_to_copy
|
||||
@ -908,6 +908,7 @@ def launch_core_engines(
|
||||
vllm_config.cache_config,
|
||||
local_engine_manager,
|
||||
coordinator.proc if coordinator else None,
|
||||
vllm_config.afd_config,
|
||||
)
|
||||
|
||||
|
||||
@ -919,6 +920,7 @@ def wait_for_engine_startup(
|
||||
cache_config: CacheConfig,
|
||||
proc_manager: CoreEngineProcManager | None,
|
||||
coord_process: Process | None,
|
||||
afd_config: AFDConfig | None = None,
|
||||
):
|
||||
# Wait for engine core process(es) to send ready messages.
|
||||
local_count = parallel_config.data_parallel_size_local
|
||||
@ -1020,6 +1022,13 @@ def wait_for_engine_startup(
|
||||
conn_pending[0 if local else 1] -= 1
|
||||
start_pending[0 if local else 1] += 1
|
||||
engine.state = CoreEngineState.CONNECTED
|
||||
elif (
|
||||
status == "READY"
|
||||
and engine.state == CoreEngineState.CONNECTED
|
||||
and afd_config
|
||||
and afd_config.afd_role == "ffn"
|
||||
):
|
||||
engine.state = CoreEngineState.READY
|
||||
elif status == "READY" and engine.state == CoreEngineState.CONNECTED:
|
||||
# Setup KV cache config with initialization state from
|
||||
# engine core process. Sum values from all engines in DP case.
|
||||
|
||||
@ -213,6 +213,9 @@ class MultiprocExecutor(Executor):
|
||||
|
||||
self.output_rank = self._get_output_rank()
|
||||
|
||||
self.afd_config = self.vllm_config.afd_config
|
||||
self.afd_role = self.afd_config.afd_role if self.afd_config else None
|
||||
|
||||
def start_worker_monitor(self, inline=False) -> None:
|
||||
workers = self.workers
|
||||
self_ref = weakref.ref(self)
|
||||
@ -565,6 +568,9 @@ class WorkerProc:
|
||||
# environment variable overrides after this point)
|
||||
enable_envs_cache()
|
||||
|
||||
self.afd_config = vllm_config.afd_config
|
||||
self.afd_role = self.afd_config.afd_role if self.afd_config else None
|
||||
|
||||
@staticmethod
|
||||
def make_worker_process(
|
||||
vllm_config: VllmConfig,
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user