# Fang Han da855b42d2
# [Doc]: Make extraInit containers fully configurable in helm chart (#27497)
# Signed-off-by: Fang Han <fhan0520@gmail.com>
# 2025-11-06 20:27:16 +00:00
#
# 175 lines
# 5.2 KiB
# YAML

# -- Default values for chart vllm
# -- Declare variables to be passed into your templates.
# NOTE(review): the child keys below were re-nested under `image` because the
# indentation was missing; confirm the nesting against the chart templates.
# -- Image configuration
image:
  # -- Image repository
  repository: "vllm/vllm-openai"
  # -- Image tag
  tag: "latest"
  # -- Container launch command
  command:
    - "vllm"
    - "serve"
    - "/data/"
    - "--served-model-name"
    - "opt-125m"
    - "--enforce-eager"
    - "--dtype"
    - "bfloat16"
    - "--block-size"
    - "16"
    - "--host"
    - "0.0.0.0"
    - "--port"
    - "8000"
# -- Container port
containerPort: 8000
# -- Service name (explicitly null, i.e. unset, by default)
serviceName: null
# -- Service port
servicePort: 80
# -- Additional ports configuration (empty list by default)
extraPorts: []
# -- Replica count
replicaCount: 1
# -- Deployment strategy settings (empty mapping by default)
deploymentStrategy: {}
# -- Resource configuration
resources:
  requests:
    # -- Number of CPUs
    cpu: 4
    # -- CPU memory configuration
    memory: 16Gi
    # -- Number of gpus used
    nvidia.com/gpu: 1
  limits:
    # -- Number of CPUs
    cpu: 4
    # -- CPU memory configuration
    memory: 16Gi
    # -- Number of gpus used
    nvidia.com/gpu: 1
# -- GPU type(s) used (the default entry looks like a placeholder; verify)
gpuModels:
  - "TYPE_GPU_USED"
# -- Autoscaling configuration
autoscaling:
  # -- Enable autoscaling
  enabled: false
  # -- Minimum replicas
  minReplicas: 1
  # -- Maximum replicas
  maxReplicas: 100
  # -- Target CPU utilization for autoscaling
  targetCPUUtilizationPercentage: 80
  # targetMemoryUtilizationPercentage: 80
# -- ConfigMap configuration (empty mapping by default)
configs: {}
# -- Secrets configuration (empty mapping by default)
secrets: {}
# -- External configuration (empty list by default)
externalConfigs: []
# -- Custom objects configuration (empty list by default)
customObjects: []
# -- Disruption budget configuration (empty string by default)
maxUnavailablePodDisruptionBudget: ""
# NOTE(review): nesting below was reconstructed because the indentation was
# missing. In particular, `s3modelpath`, `pvcStorage` and
# `awsEc2MetadataDisabled` are assumed to live under `extraInit`; confirm
# against the chart templates.
# -- Additional configuration for the init container
extraInit:
  # -- Model download functionality (optional)
  modelDownload:
    # -- Enable model download job and wait container
    enabled: true
    # -- Image configuration for model download operations
    image:
      # -- Image repository
      repository: "amazon/aws-cli"
      # -- Image tag
      tag: "2.6.4"
      # -- Image pull policy
      pullPolicy: "IfNotPresent"
    # -- Wait container configuration (init container that waits for model to be ready)
    waitContainer:
      # -- Command to execute
      command: ["/bin/bash"]
      # -- Arguments for the wait container
      args:
        - "-eucx"
        - "while aws --endpoint-url $S3_ENDPOINT_URL s3 sync --dryrun s3://$S3_BUCKET_NAME/$S3_PATH /data | grep -q download; do sleep 10; done"
      # -- Environment variables (optional, overrides S3 defaults entirely if specified)
      # env:
      #   - name: HUGGING_FACE_HUB_TOKEN
      #     value: "your-token"
      #   - name: MODEL_ID
      #     value: "meta-llama/Llama-2-7b"
    # -- Download job configuration (job that actually downloads the model)
    downloadJob:
      # -- Command to execute
      command: ["/bin/bash"]
      # -- Arguments for the download job
      args:
        - "-eucx"
        - "aws --endpoint-url $S3_ENDPOINT_URL s3 sync s3://$S3_BUCKET_NAME/$S3_PATH /data"
      # -- Environment variables (optional, overrides S3 defaults entirely if specified)
      # env:
      #   - name: HUGGING_FACE_HUB_TOKEN
      #     value: "your-token"
      #   - name: MODEL_ID
      #     value: "meta-llama/Llama-2-7b"
  # -- Custom init containers (appended after wait-download-model if modelDownload is enabled)
  initContainers: []
  # Example for llm-d sidecar:
  # initContainers:
  #   - name: llm-d-routing-proxy
  #     image: ghcr.io/llm-d/llm-d-routing-sidecar:v0.2.0
  #     imagePullPolicy: IfNotPresent
  #     ports:
  #       - containerPort: 8080
  #         name: proxy
  #     securityContext:
  #       runAsUser: 1000
  # -- Path of the model on the s3 which hosts model weights and config files
  s3modelpath: "relative_s3_model_path/opt-125m"
  # -- Storage size for the PVC
  pvcStorage: "1Gi"
  # -- Disable AWS EC2 metadata service
  awsEc2MetadataDisabled: true
# -- Extra containers configuration (empty list by default)
extraContainers: []
# -- Readiness probe configuration
readinessProbe:
  # -- Number of seconds after the container has started before readiness probe is initiated
  initialDelaySeconds: 5
  # -- How often (in seconds) to perform the readiness probe
  periodSeconds: 5
  # -- Number of consecutive probe failures after which Kubernetes considers the overall check failed: the container is not ready
  failureThreshold: 3
  # -- Configuration of the Kubelet http request on the server
  httpGet:
    # -- Path to access on the HTTP server
    path: /health
    # -- Name or number of the port to access on the container, on which the server is listening
    port: 8000
# -- Liveness probe configuration
livenessProbe:
  # -- Number of seconds after the container has started before liveness probe is initiated
  initialDelaySeconds: 15
  # -- How often (in seconds) to perform the liveness probe
  periodSeconds: 10
  # -- Number of consecutive probe failures after which Kubernetes considers the overall check failed: the container is not alive
  failureThreshold: 3
  # -- Configuration of the Kubelet http request on the server
  httpGet:
    # -- Path to access on the HTTP server
    path: /health
    # -- Name or number of the port to access on the container, on which the server is listening
    port: 8000
# -- Labels configuration
labels:
  environment: "test"
  release: "test"