mirror of
https://git.datalinker.icu/vllm-project/vllm.git
synced 2026-05-30 02:27:04 +08:00
[Docs] Improve frameworks/helm.md (#20113)
Signed-off-by: windsonsea <haifeng.yao@daocloud.io>
This commit is contained in:
parent
167aca45cb
commit
84c260caeb
@ -5,9 +5,9 @@ title: Helm
|
|||||||
|
|
||||||
A Helm chart to deploy vLLM for Kubernetes
|
A Helm chart to deploy vLLM for Kubernetes
|
||||||
|
|
||||||
Helm is a package manager for Kubernetes. It will help you to deploy vLLM on k8s and automate the deployment of vLLM Kubernetes applications. With Helm, you can deploy the same framework architecture with different configurations to multiple namespaces by overriding variable values.
|
Helm is a package manager for Kubernetes. It helps automate the deployment of vLLM applications on Kubernetes. With Helm, you can deploy the same framework architecture with different configurations to multiple namespaces by overriding variable values.
|
||||||
|
|
||||||
This guide will walk you through the process of deploying vLLM with Helm, including the necessary prerequisites, steps for helm installation and documentation on architecture and values file.
|
This guide will walk you through the process of deploying vLLM with Helm, including the necessary prerequisites, steps for Helm installation, and documentation on the architecture and values file.
|
||||||
|
|
||||||
## Prerequisites
|
## Prerequisites
|
||||||
|
|
||||||
@ -16,17 +16,23 @@ Before you begin, ensure that you have the following:
|
|||||||
- A running Kubernetes cluster
|
- A running Kubernetes cluster
|
||||||
- NVIDIA Kubernetes Device Plugin (`k8s-device-plugin`): This can be found at [https://github.com/NVIDIA/k8s-device-plugin](https://github.com/NVIDIA/k8s-device-plugin)
|
- NVIDIA Kubernetes Device Plugin (`k8s-device-plugin`): This can be found at [https://github.com/NVIDIA/k8s-device-plugin](https://github.com/NVIDIA/k8s-device-plugin)
|
||||||
- Available GPU resources in your cluster
|
- Available GPU resources in your cluster
|
||||||
- S3 with the model which will be deployed
|
- An S3 bucket containing the model to be deployed
|
||||||
|
|
||||||
## Installing the chart
|
## Installing the chart
|
||||||
|
|
||||||
To install the chart with the release name `test-vllm`:
|
To install the chart with the release name `test-vllm`:
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
helm upgrade --install --create-namespace --namespace=ns-vllm test-vllm . -f values.yaml --set secrets.s3endpoint=$ACCESS_POINT --set secrets.s3bucketname=$BUCKET --set secrets.s3accesskeyid=$ACCESS_KEY --set secrets.s3accesskey=$SECRET_KEY
|
helm upgrade --install --create-namespace \
|
||||||
|
--namespace=ns-vllm test-vllm . \
|
||||||
|
-f values.yaml \
|
||||||
|
--set secrets.s3endpoint=$ACCESS_POINT \
|
||||||
|
--set secrets.s3bucketname=$BUCKET \
|
||||||
|
--set secrets.s3accesskeyid=$ACCESS_KEY \
|
||||||
|
--set secrets.s3accesskey=$SECRET_KEY
|
||||||
```
|
```
|
||||||
|
|
||||||
## Uninstalling the Chart
|
## Uninstalling the chart
|
||||||
|
|
||||||
To uninstall the `test-vllm` deployment:
|
To uninstall the `test-vllm` deployment:
|
||||||
|
|
||||||
@ -39,57 +45,59 @@ chart **including persistent volumes** and deletes the release.
|
|||||||
|
|
||||||
## Architecture
|
## Architecture
|
||||||
|
|
||||||

|

|
||||||
|
|
||||||
## Values
|
## Values
|
||||||
|
|
||||||
| Key | Type | Default | Description |
|
The following table describes configurable parameters of the chart in `values.yaml`:
|
||||||
|--------------------------------------------|---------|----------------------------------------------------------------------------------------------------------------------------------------------------------|-------------------------------------------------------------------------------------------------------------------------------------------|
|
|
||||||
| autoscaling | object | {"enabled":false,"maxReplicas":100,"minReplicas":1,"targetCPUUtilizationPercentage":80} | Autoscaling configuration |
|
| Key | Type | Default | Description |
|
||||||
| autoscaling.enabled | bool | false | Enable autoscaling |
|
|-----|------|---------|-------------|
|
||||||
| autoscaling.maxReplicas | int | 100 | Maximum replicas |
|
| autoscaling | object | {"enabled":false,"maxReplicas":100,"minReplicas":1,"targetCPUUtilizationPercentage":80} | Autoscaling configuration |
|
||||||
| autoscaling.minReplicas | int | 1 | Minimum replicas |
|
| autoscaling.enabled | bool | false | Enable autoscaling |
|
||||||
| autoscaling.targetCPUUtilizationPercentage | int | 80 | Target CPU utilization for autoscaling |
|
| autoscaling.maxReplicas | int | 100 | Maximum replicas |
|
||||||
| configs | object | {} | Configmap |
|
| autoscaling.minReplicas | int | 1 | Minimum replicas |
|
||||||
| containerPort | int | 8000 | Container port |
|
| autoscaling.targetCPUUtilizationPercentage | int | 80 | Target CPU utilization for autoscaling |
|
||||||
| customObjects | list | [] | Custom Objects configuration |
|
| configs | object | {} | Configmap |
|
||||||
| deploymentStrategy | object | {} | Deployment strategy configuration |
|
| containerPort | int | 8000 | Container port |
|
||||||
| externalConfigs | list | [] | External configuration |
|
| customObjects | list | [] | Custom Objects configuration |
|
||||||
| extraContainers | list | [] | Additional containers configuration |
|
| deploymentStrategy | object | {} | Deployment strategy configuration |
|
||||||
| extraInit | object | {"pvcStorage":"1Gi","s3modelpath":"relative_s3_model_path/opt-125m", "awsEc2MetadataDisabled": true} | Additional configuration for the init container |
|
| externalConfigs | list | [] | External configuration |
|
||||||
| extraInit.pvcStorage | string | "50Gi" | Storage size of the s3 |
|
| extraContainers | list | [] | Additional containers configuration |
|
||||||
| extraInit.s3modelpath | string | "relative_s3_model_path/opt-125m" | Path of the model on the s3 which hosts model weights and config files |
|
| extraInit | object | {"pvcStorage":"1Gi","s3modelpath":"relative_s3_model_path/opt-125m", "awsEc2MetadataDisabled": true} | Additional configuration for the init container |
|
||||||
| extraInit.awsEc2MetadataDisabled | boolean | true | Disables the use of the Amazon EC2 instance metadata service |
|
| extraInit.pvcStorage | string | "1Gi" | Storage size of the s3 |
|
||||||
| extraPorts | list | [] | Additional ports configuration |
|
| extraInit.s3modelpath | string | "relative_s3_model_path/opt-125m" | Path of the model on the s3 which hosts model weights and config files |
|
||||||
| gpuModels | list | ["TYPE_GPU_USED"] | Type of gpu used |
|
| extraInit.awsEc2MetadataDisabled | boolean | true | Disables the use of the Amazon EC2 instance metadata service |
|
||||||
| image | object | {"command":["vllm","serve","/data/","--served-model-name","opt-125m","--host","0.0.0.0","--port","8000"],"repository":"vllm/vllm-openai","tag":"latest"} | Image configuration |
|
| extraPorts | list | [] | Additional ports configuration |
|
||||||
| image.command | list | ["vllm","serve","/data/","--served-model-name","opt-125m","--host","0.0.0.0","--port","8000"] | Container launch command |
|
| gpuModels                                  | list    | ["TYPE_GPU_USED"]                                                                                                                                        | Type of GPU used                                                                                                                          |
|
||||||
| image.repository | string | "vllm/vllm-openai" | Image repository |
|
| image | object | {"command":["vllm","serve","/data/","--served-model-name","opt-125m","--host","0.0.0.0","--port","8000"],"repository":"vllm/vllm-openai","tag":"latest"} | Image configuration |
|
||||||
| image.tag | string | "latest" | Image tag |
|
| image.command | list | ["vllm","serve","/data/","--served-model-name","opt-125m","--host","0.0.0.0","--port","8000"] | Container launch command |
|
||||||
| livenessProbe | object | {"failureThreshold":3,"httpGet":{"path":"/health","port":8000},"initialDelaySeconds":15,"periodSeconds":10} | Liveness probe configuration |
|
| image.repository | string | "vllm/vllm-openai" | Image repository |
|
||||||
| livenessProbe.failureThreshold | int | 3 | Number of times after which if a probe fails in a row, Kubernetes considers that the overall check has failed: the container is not alive |
|
| image.tag | string | "latest" | Image tag |
|
||||||
| livenessProbe.httpGet | object | {"path":"/health","port":8000} | Configuration of the Kubelet http request on the server |
|
| livenessProbe | object | {"failureThreshold":3,"httpGet":{"path":"/health","port":8000},"initialDelaySeconds":15,"periodSeconds":10} | Liveness probe configuration |
|
||||||
| livenessProbe.httpGet.path | string | "/health" | Path to access on the HTTP server |
|
| livenessProbe.failureThreshold | int | 3 | Number of times after which if a probe fails in a row, Kubernetes considers that the overall check has failed: the container is not alive |
|
||||||
| livenessProbe.httpGet.port | int | 8000 | Name or number of the port to access on the container, on which the server is listening |
|
| livenessProbe.httpGet                      | object  | {"path":"/health","port":8000}                                                                                                                           | Configuration of the kubelet HTTP request on the server                                                                                   |
|
||||||
| livenessProbe.initialDelaySeconds | int | 15 | Number of seconds after the container has started before liveness probe is initiated |
|
| livenessProbe.httpGet.path | string | "/health" | Path to access on the HTTP server |
|
||||||
| livenessProbe.periodSeconds | int | 10 | How often (in seconds) to perform the liveness probe |
|
| livenessProbe.httpGet.port | int | 8000 | Name or number of the port to access on the container, on which the server is listening |
|
||||||
| maxUnavailablePodDisruptionBudget | string | "" | Disruption Budget Configuration |
|
| livenessProbe.initialDelaySeconds | int | 15 | Number of seconds after the container has started before liveness probe is initiated |
|
||||||
| readinessProbe | object | {"failureThreshold":3,"httpGet":{"path":"/health","port":8000},"initialDelaySeconds":5,"periodSeconds":5} | Readiness probe configuration |
|
| livenessProbe.periodSeconds | int | 10 | How often (in seconds) to perform the liveness probe |
|
||||||
| readinessProbe.failureThreshold | int | 3 | Number of times after which if a probe fails in a row, Kubernetes considers that the overall check has failed: the container is not ready |
|
| maxUnavailablePodDisruptionBudget | string | "" | Disruption Budget Configuration |
|
||||||
| readinessProbe.httpGet | object | {"path":"/health","port":8000} | Configuration of the Kubelet http request on the server |
|
| readinessProbe | object | {"failureThreshold":3,"httpGet":{"path":"/health","port":8000},"initialDelaySeconds":5,"periodSeconds":5} | Readiness probe configuration |
|
||||||
| readinessProbe.httpGet.path | string | "/health" | Path to access on the HTTP server |
|
| readinessProbe.failureThreshold | int | 3 | Number of times after which if a probe fails in a row, Kubernetes considers that the overall check has failed: the container is not ready |
|
||||||
| readinessProbe.httpGet.port | int | 8000 | Name or number of the port to access on the container, on which the server is listening |
|
| readinessProbe.httpGet                     | object  | {"path":"/health","port":8000}                                                                                                                           | Configuration of the kubelet HTTP request on the server                                                                                   |
|
||||||
| readinessProbe.initialDelaySeconds | int | 5 | Number of seconds after the container has started before readiness probe is initiated |
|
| readinessProbe.httpGet.path | string | "/health" | Path to access on the HTTP server |
|
||||||
| readinessProbe.periodSeconds | int | 5 | How often (in seconds) to perform the readiness probe |
|
| readinessProbe.httpGet.port | int | 8000 | Name or number of the port to access on the container, on which the server is listening |
|
||||||
| replicaCount | int | 1 | Number of replicas |
|
| readinessProbe.initialDelaySeconds | int | 5 | Number of seconds after the container has started before readiness probe is initiated |
|
||||||
| resources | object | {"limits":{"cpu":4,"memory":"16Gi","nvidia.com/gpu":1},"requests":{"cpu":4,"memory":"16Gi","nvidia.com/gpu":1}} | Resource configuration |
|
| readinessProbe.periodSeconds | int | 5 | How often (in seconds) to perform the readiness probe |
|
||||||
| resources.limits."nvidia.com/gpu" | int | 1 | Number of gpus used |
|
| replicaCount | int | 1 | Number of replicas |
|
||||||
| resources.limits.cpu | int | 4 | Number of CPUs |
|
| resources | object | {"limits":{"cpu":4,"memory":"16Gi","nvidia.com/gpu":1},"requests":{"cpu":4,"memory":"16Gi","nvidia.com/gpu":1}} | Resource configuration |
|
||||||
| resources.limits.memory | string | "16Gi" | CPU memory configuration |
|
| resources.limits."nvidia.com/gpu" | int | 1 | Number of GPUs used |
|
||||||
| resources.requests."nvidia.com/gpu" | int | 1 | Number of gpus used |
|
| resources.limits.cpu | int | 4 | Number of CPUs |
|
||||||
| resources.requests.cpu | int | 4 | Number of CPUs |
|
| resources.limits.memory | string | "16Gi" | CPU memory configuration |
|
||||||
| resources.requests.memory | string | "16Gi" | CPU memory configuration |
|
| resources.requests."nvidia.com/gpu" | int | 1 | Number of GPUs used |
|
||||||
| secrets | object | {} | Secrets configuration |
|
| resources.requests.cpu | int | 4 | Number of CPUs |
|
||||||
| serviceName | string | Service name | |
|
| resources.requests.memory | string | "16Gi" | CPU memory configuration |
|
||||||
| servicePort | int | 80 | Service port |
|
| secrets | object | {} | Secrets configuration |
|
||||||
| labels.environment | string | test | Environment name |
|
| serviceName | string | "" | Service name |
|
||||||
|
| servicePort | int | 80 | Service port |
|
||||||
|
| labels.environment | string | test | Environment name |
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user