diff --git a/examples/online_serving/ray_serve_deepseek.py b/examples/online_serving/ray_serve_deepseek.py
new file mode 100644
index 000000000000..7f1507dd546a
--- /dev/null
+++ b/examples/online_serving/ray_serve_deepseek.py
@@ -0,0 +1,44 @@
+# SPDX-License-Identifier: Apache-2.0
+"""
+Example to deploy DeepSeek R1 or V3 with Ray Serve LLM.
+See Ray Serve LLM documentation at:
+https://docs.ray.io/en/latest/serve/llm/serving-llms.html
+
+Run `python3 ray_serve_deepseek.py` to deploy the model.
+"""
+
+from ray import serve
+from ray.serve.llm import LLMConfig, LLMRouter, LLMServer
+
+llm_config = LLMConfig(
+    model_loading_config=dict(
+        model_id="deepseek",
+        # Change to the model download path
+        model_source="/path/to/the/model",
+    ),
+    deployment_config=dict(autoscaling_config=dict(
+        min_replicas=1,
+        max_replicas=1,
+    )),
+    # Change to the accelerator type of the node
+    accelerator_type="H100",
+    runtime_env=dict(env_vars=dict(VLLM_USE_V1="1")),
+    # Customize engine arguments as needed (e.g. vLLM engine kwargs)
+    engine_kwargs=dict(
+        tensor_parallel_size=8,
+        pipeline_parallel_size=2,
+        gpu_memory_utilization=0.92,
+        dtype="auto",
+        max_num_seqs=40,
+        max_model_len=16384,
+        enable_chunked_prefill=True,
+        enable_prefix_caching=True,
+        trust_remote_code=True,
+    ),
+)
+
+# Deploy the application
+deployment = LLMServer.as_deployment(
+    llm_config.get_serve_options(name_prefix="vLLM:")).bind(llm_config)
+llm_app = LLMRouter.as_deployment().bind([deployment])
+serve.run(llm_app)
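
Once `serve.run(llm_app)` is up, the router serves an OpenAI-compatible API, so the deployment can be exercised with a standard OpenAI client. The sketch below is illustrative only and assumes Ray Serve's default HTTP port (8000), a `/v1` route prefix, and the `model_id="deepseek"` configured above; adjust these to match your actual Serve HTTP options.

    # query_deepseek.py -- minimal client sketch, not part of this PR.
    # Assumes the Serve app above is running locally on the default Ray Serve
    # HTTP port 8000 and exposes an OpenAI-compatible /v1 route.
    from openai import OpenAI

    client = OpenAI(base_url="http://localhost:8000/v1", api_key="fake-key")

    response = client.chat.completions.create(
        model="deepseek",  # must match model_id in model_loading_config
        messages=[{"role": "user", "content": "Hello from Ray Serve LLM!"}],
    )
    print(response.choices[0].message.content)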