# vllm/examples/online_serving/ray_serve_deepseek.py

# SPDX-License-Identifier: Apache-2.0
"""
Example to deploy DeepSeek R1 or V3 with Ray Serve LLM.
See Ray Serve LLM documentation at:
https://docs.ray.io/en/latest/serve/llm/serving-llms.html

Run `python3 ray_serve_deepseek.py` to deploy the model.
"""

from ray import serve
from ray.serve.llm import LLMConfig, LLMRouter, LLMServer

# Configuration for the DeepSeek deployment; it is passed to the LLMServer
# deployment further down in this script.
llm_config = LLMConfig(
    model_loading_config={
        "model_id": "deepseek",
        # Replace with the path the model was downloaded to.
        "model_source": "/path/to/the/model",
    },
    deployment_config={
        # Both autoscaling bounds are set to 1.
        "autoscaling_config": {
            "min_replicas": 1,
            "max_replicas": 1,
        },
    },
    # Replace with the accelerator type of the node.
    accelerator_type="H100",
    # Environment variables set through the runtime environment.
    runtime_env={"env_vars": {"VLLM_USE_V1": "1"}},
    # Engine arguments (e.g. vLLM engine kwargs); adjust as needed.
    engine_kwargs={
        "tensor_parallel_size": 8,
        "pipeline_parallel_size": 2,
        "gpu_memory_utilization": 0.92,
        "dtype": "auto",
        "max_num_seqs": 40,
        "max_model_len": 16384,
        "enable_chunked_prefill": True,
        "enable_prefix_caching": True,
        "trust_remote_code": True,
    },
)

# Deploy the application.
# Step 1: turn LLMServer into a deployment using the serve options derived
# from the config (requested with name_prefix="vLLM:") and bind the config.
_serve_options = llm_config.get_serve_options(name_prefix="vLLM:")
deployment = LLMServer.as_deployment(_serve_options).bind(llm_config)

# Step 2: bind an LLMRouter deployment to the list of server deployments.
_router = LLMRouter.as_deployment()
llm_app = _router.bind([deployment])

# Step 3: run the assembled application with Ray Serve.
serve.run(llm_app)