From 3eb08ed9b1114fc16eaa2386b2e7ad14eae87073 Mon Sep 17 00:00:00 2001 From: Yuan Tang Date: Mon, 24 Mar 2025 13:48:43 -0400 Subject: [PATCH] [DOC] Add Kubernetes deployment guide with CPUs (#14865) --- docs/source/conf.py | 1 + docs/source/deployment/k8s.md | 105 +++++++++++++++++++++++++++++++++- 2 files changed, 103 insertions(+), 3 deletions(-) diff --git a/docs/source/conf.py b/docs/source/conf.py index b72faef9af107..b02b84826c9f2 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -85,6 +85,7 @@ html_static_path = ["_static"] html_js_files = ["custom.js"] html_css_files = ["custom.css"] +myst_heading_anchors = 2 myst_url_schemes = { 'http': None, 'https': None, diff --git a/docs/source/deployment/k8s.md b/docs/source/deployment/k8s.md index b31344b199663..3885956791365 100644 --- a/docs/source/deployment/k8s.md +++ b/docs/source/deployment/k8s.md @@ -4,6 +4,9 @@ Deploying vLLM on Kubernetes is a scalable and efficient way to serve machine learning models. This guide walks you through deploying vLLM using native Kubernetes. +* [Deployment with CPUs](#deployment-with-cpus) +* [Deployment with GPUs](#deployment-with-gpus) + Alternatively, you can deploy vLLM to Kubernetes using any of the following: * [Helm](frameworks/helm.md) * [InftyAI/llmaz](integrations/llmaz.md) @@ -14,11 +17,107 @@ Alternatively, you can deploy vLLM to Kubernetes using any of the following: * [vllm-project/aibrix](https://github.com/vllm-project/aibrix) * [vllm-project/production-stack](integrations/production-stack.md) -## Pre-requisite +## Deployment with CPUs -Ensure that you have a running [Kubernetes cluster with GPUs](https://kubernetes.io/docs/tasks/manage-gpus/scheduling-gpus/). +:::{note} +The use of CPUs here is for demonstration and testing purposes only, and performance will not be on par with GPUs. +::: -## Deployment using native K8s +First, create a Kubernetes PVC and Secret for downloading and storing the Hugging Face model: + +```bash +cat <