[Misc] Refine ray_serve_deepseek example (#17204)

Signed-off-by: Rui Qiao <ruisearch42@gmail.com>
Author: Rui Qiao, 2025-04-25 16:06:59 -07:00 (committed by GitHub)
Commit: c53e0730cb
Parent: a0e619e62a


@@ -8,37 +8,41 @@ Run `python3 ray_serve_deepseek.py` to deploy the model.
 """
 
 from ray import serve
-from ray.serve.llm import LLMConfig, LLMRouter, LLMServer
+from ray.serve.llm import LLMConfig, build_openai_app
 
 llm_config = LLMConfig(
-    model_loading_config=dict(
-        model_id="deepseek",
-        # Change to model download path
-        model_source="/path/to/the/model",
-    ),
-    deployment_config=dict(autoscaling_config=dict(
-        min_replicas=1,
-        max_replicas=1,
-    )),
+    model_loading_config={
+        "model_id": "deepseek",
+        # Since the DeepSeek model is huge, it is recommended to pre-download
+        # the model to local disk, say /path/to/the/model, and specify:
+        # model_source="/path/to/the/model"
+        "model_source": "deepseek-ai/DeepSeek-R1",
+    },
+    deployment_config={
+        "autoscaling_config": {
+            "min_replicas": 1,
+            "max_replicas": 1,
+        }
+    },
     # Change to the accelerator type of the node
     accelerator_type="H100",
-    runtime_env=dict(env_vars=dict(VLLM_USE_V1="1")),
+    runtime_env={"env_vars": {
+        "VLLM_USE_V1": "1"
+    }},
     # Customize engine arguments as needed (e.g. vLLM engine kwargs)
-    engine_kwargs=dict(
-        tensor_parallel_size=8,
-        pipeline_parallel_size=2,
-        gpu_memory_utilization=0.92,
-        dtype="auto",
-        max_num_seqs=40,
-        max_model_len=16384,
-        enable_chunked_prefill=True,
-        enable_prefix_caching=True,
-        trust_remote_code=True,
-    ),
+    engine_kwargs={
+        "tensor_parallel_size": 8,
+        "pipeline_parallel_size": 2,
+        "gpu_memory_utilization": 0.92,
+        "dtype": "auto",
+        "max_num_seqs": 40,
+        "max_model_len": 16384,
+        "enable_chunked_prefill": True,
+        "enable_prefix_caching": True,
+        "trust_remote_code": True,
+    },
 )
 
 # Deploy the application
-deployment = LLMServer.as_deployment(
-    llm_config.get_serve_options(name_prefix="vLLM:")).bind(llm_config)
-llm_app = LLMRouter.as_deployment().bind([deployment])
+llm_app = build_openai_app({"llm_configs": [llm_config]})
 serve.run(llm_app)
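
For reference, build_openai_app exposes an OpenAI-compatible HTTP API once serve.run(llm_app) is up. Below is a minimal client sketch, assuming Ray Serve's default HTTP address (http://localhost:8000) and the openai Python package; the api_key value is a placeholder since the local endpoint does not check it, and the model name must match the model_id configured above.

    # Query the deployed app via the OpenAI-compatible endpoint.
    # Assumes the default Serve address http://localhost:8000.
    from openai import OpenAI

    client = OpenAI(base_url="http://localhost:8000/v1", api_key="fake-key")
    response = client.chat.completions.create(
        model="deepseek",  # must match model_id in model_loading_config
        messages=[{"role": "user", "content": "Hello!"}],
    )
    print(response.choices[0].message.content)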