[Misc] Refine ray_serve_deepseek example (#17204)
Signed-off-by: Rui Qiao <ruisearch42@gmail.com>
parent a0e619e62a
commit c53e0730cb
@@ -8,37 +8,41 @@ Run `python3 ray_serve_deepseek.py` to deploy the model.
 """
 
 from ray import serve
-from ray.serve.llm import LLMConfig, LLMRouter, LLMServer
+from ray.serve.llm import LLMConfig, build_openai_app
 
 llm_config = LLMConfig(
-    model_loading_config=dict(
-        model_id="deepseek",
-        # Change to model download path
-        model_source="/path/to/the/model",
-    ),
-    deployment_config=dict(autoscaling_config=dict(
-        min_replicas=1,
-        max_replicas=1,
-    )),
+    model_loading_config={
+        "model_id": "deepseek",
+        # Since DeepSeek model is huge, it is recommended to pre-download
+        # the model to local disk, say /path/to/the/model and specify:
+        # model_source="/path/to/the/model"
+        "model_source": "deepseek-ai/DeepSeek-R1",
+    },
+    deployment_config={
+        "autoscaling_config": {
+            "min_replicas": 1,
+            "max_replicas": 1,
+        }
+    },
     # Change to the accelerator type of the node
     accelerator_type="H100",
-    runtime_env=dict(env_vars=dict(VLLM_USE_V1="1")),
+    runtime_env={"env_vars": {
+        "VLLM_USE_V1": "1"
+    }},
     # Customize engine arguments as needed (e.g. vLLM engine kwargs)
-    engine_kwargs=dict(
-        tensor_parallel_size=8,
-        pipeline_parallel_size=2,
-        gpu_memory_utilization=0.92,
-        dtype="auto",
-        max_num_seqs=40,
-        max_model_len=16384,
-        enable_chunked_prefill=True,
-        enable_prefix_caching=True,
-        trust_remote_code=True,
-    ),
+    engine_kwargs={
+        "tensor_parallel_size": 8,
+        "pipeline_parallel_size": 2,
+        "gpu_memory_utilization": 0.92,
+        "dtype": "auto",
+        "max_num_seqs": 40,
+        "max_model_len": 16384,
+        "enable_chunked_prefill": True,
+        "enable_prefix_caching": True,
+        "trust_remote_code": True,
+    },
 )
 
 # Deploy the application
-deployment = LLMServer.as_deployment(
-    llm_config.get_serve_options(name_prefix="vLLM:")).bind(llm_config)
-llm_app = LLMRouter.as_deployment().bind([deployment])
+llm_app = build_openai_app({"llm_configs": [llm_config]})
 serve.run(llm_app)
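A sizing note on the `engine_kwargs` above (my arithmetic, not stated in the commit): each replica spans tensor_parallel_size * pipeline_parallel_size = 8 * 2 = 16 GPU workers, so with min_replicas = max_replicas = 1 the deployment needs 16 H100s in total, for example two 8-GPU nodes in the Ray cluster.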
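Once `serve.run(llm_app)` is up, `build_openai_app` exposes an OpenAI-compatible API for the configured model. A minimal client sketch, assuming Ray Serve's default HTTP address of http://localhost:8000 and the `openai` Python package (neither appears in this commit); the model name "deepseek" is the `model_id` set in `model_loading_config`:

from openai import OpenAI

# Point the client at the Serve proxy; no auth is configured in the example,
# so the API key can be any placeholder string.
client = OpenAI(base_url="http://localhost:8000/v1", api_key="fake-key")

response = client.chat.completions.create(
    model="deepseek",  # matches "model_id" in model_loading_config
    messages=[{"role": "user", "content": "Hello!"}],
)
print(response.choices[0].message.content)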