# -- Default values for chart vllm
# -- Declare variables to be passed into your templates.

# -- Image configuration
image:
  # -- Image repository
  repository: "vllm/vllm-openai"
  # -- Image tag
  tag: "latest"
  # -- Container launch command
  command: ["vllm", "serve", "/data/", "--served-model-name", "opt-125m", "--enforce-eager", "--dtype", "bfloat16", "--block-size", "16", "--host", "0.0.0.0", "--port", "8000"]
# -- Container port
containerPort: 8000
# -- Service name
serviceName:
# -- Service port
servicePort: 80
# -- Additional ports configuration
extraPorts: []
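# Hypothetical example (assumes extraPorts entries follow the usual Kubernetes
# container/service port shape; adjust to match the chart templates):
# extraPorts:
#   - name: metrics
#     containerPort: 9090
#     protocol: TCP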

# -- Number of replicas
replicaCount: 1

# -- Deployment strategy configuration
deploymentStrategy: {}
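# Example sketch, assuming this value is passed through to the Deployment's
# standard strategy field:
# deploymentStrategy:
#   type: RollingUpdate
#   rollingUpdate:
#     maxSurge: 1
#     maxUnavailable: 0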

# -- Resource configuration
resources:
  requests:
    # -- Number of CPUs
    cpu: 4
    # -- CPU memory configuration
    memory: 16Gi
    # -- Number of GPUs used
    nvidia.com/gpu: 1
  limits:
    # -- Number of CPUs
    cpu: 4
    # -- CPU memory configuration
    memory: 16Gi
    # -- Number of GPUs used
    nvidia.com/gpu: 1

# -- Type of GPU used
gpuModels:
  - "TYPE_GPU_USED"

# -- Autoscaling configuration
autoscaling:
  # -- Enable autoscaling
  enabled: false
  # -- Minimum replicas
  minReplicas: 1
  # -- Maximum replicas
  maxReplicas: 100
  # -- Target CPU utilization for autoscaling
  targetCPUUtilizationPercentage: 80
  # targetMemoryUtilizationPercentage: 80

# -- ConfigMap configuration
configs: {}
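# Hypothetical example (assumes plain key/value pairs rendered into a ConfigMap
# and exposed to the vLLM container):
# configs:
#   VLLM_LOGGING_LEVEL: "INFO"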

# -- Secrets configuration
secrets: {}

# -- External configuration
externalConfigs: []

# -- Custom Objects configuration
customObjects: []
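# Hypothetical example (assumes each entry is a complete Kubernetes manifest
# rendered as-is by the chart):
# customObjects:
#   - apiVersion: v1
#     kind: ConfigMap
#     metadata:
#       name: extra-config
#     data:
#       key: "value"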

# -- Disruption Budget Configuration
maxUnavailablePodDisruptionBudget: ""
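# For example, to allow at most one pod to be unavailable:
# maxUnavailablePodDisruptionBudget: "1"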

# -- Additional configuration for the init container
extraInit:
  # -- Model download functionality (optional)
  modelDownload:
    # -- Enable model download job and wait container
    enabled: true
    # -- Image configuration for model download operations
    image:
      # -- Image repository
      repository: "amazon/aws-cli"
      # -- Image tag
      tag: "2.6.4"
      # -- Image pull policy
      pullPolicy: "IfNotPresent"
    # -- Wait container configuration (init container that waits for the model to be ready)
    waitContainer:
      # -- Command to execute
      command: ["/bin/bash"]
      # -- Arguments for the wait container
      args:
        - "-eucx"
        - "while aws --endpoint-url $S3_ENDPOINT_URL s3 sync --dryrun s3://$S3_BUCKET_NAME/$S3_PATH /data | grep -q download; do sleep 10; done"
      # -- Environment variables (optional, overrides S3 defaults entirely if specified)
      # env:
      #   - name: HUGGING_FACE_HUB_TOKEN
      #     value: "your-token"
      #   - name: MODEL_ID
      #     value: "meta-llama/Llama-2-7b"
    # -- Download job configuration (the job that actually downloads the model)
    downloadJob:
      # -- Command to execute
      command: ["/bin/bash"]
      # -- Arguments for the download job
      args:
        - "-eucx"
        - "aws --endpoint-url $S3_ENDPOINT_URL s3 sync s3://$S3_BUCKET_NAME/$S3_PATH /data"
      # -- Environment variables (optional, overrides S3 defaults entirely if specified)
      # env:
      #   - name: HUGGING_FACE_HUB_TOKEN
      #     value: "your-token"
      #   - name: MODEL_ID
      #     value: "meta-llama/Llama-2-7b"

  # -- Custom init containers (appended after wait-download-model if modelDownload is enabled)
  initContainers: []
  # Example for llm-d sidecar:
  # initContainers:
  #   - name: llm-d-routing-proxy
  #     image: ghcr.io/llm-d/llm-d-routing-sidecar:v0.2.0
  #     imagePullPolicy: IfNotPresent
  #     ports:
  #       - containerPort: 8080
  #         name: proxy
  #     securityContext:
  #       runAsUser: 1000

  # -- Path of the model on the S3 bucket which hosts model weights and config files
  s3modelpath: "relative_s3_model_path/opt-125m"
  # -- Storage size for the PVC
  pvcStorage: "1Gi"
  # -- Disable AWS EC2 metadata service
  awsEc2MetadataDisabled: true

# -- Additional containers configuration
extraContainers: []
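# Hypothetical example (assumes entries are standard Kubernetes container specs
# added alongside the vLLM container; the sidecar below is illustrative only):
# extraContainers:
#   - name: debug-sidecar
#     image: "busybox:1.36"
#     command: ["sleep", "infinity"]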

# -- Readiness probe configuration
readinessProbe:
  # -- Number of seconds after the container has started before readiness probe is initiated
  initialDelaySeconds: 5
  # -- How often (in seconds) to perform the readiness probe
  periodSeconds: 5
  # -- Number of times after which if a probe fails in a row, Kubernetes considers that the overall check has failed: the container is not ready
  failureThreshold: 3
  # -- Configuration of the Kubelet http request on the server
  httpGet:
    # -- Path to access on the HTTP server
    path: /health
    # -- Name or number of the port to access on the container, on which the server is listening
    port: 8000

# -- Liveness probe configuration
livenessProbe:
  # -- Number of seconds after the container has started before liveness probe is initiated
  initialDelaySeconds: 15
  # -- Number of times after which if a probe fails in a row, Kubernetes considers that the overall check has failed: the container is not alive
  failureThreshold: 3
  # -- How often (in seconds) to perform the liveness probe
  periodSeconds: 10
  # -- Configuration of the Kubelet http request on the server
  httpGet:
    # -- Path to access on the HTTP server
    path: /health
    # -- Name or number of the port to access on the container, on which the server is listening
    port: 8000

# -- Labels applied to chart resources
labels:
  environment: "test"
  release: "test"