diff --git a/docs/source/getting_started/quickstart.md b/docs/source/getting_started/quickstart.md
index f3a4773f0fc6c..f51856d6eaebf 100644
--- a/docs/source/getting_started/quickstart.md
+++ b/docs/source/getting_started/quickstart.md
@@ -184,3 +184,13 @@ chat_response = client.chat.completions.create(
 )
 print("Chat response:", chat_response)
 ```
+
+## On Attention Backends
+
+Currently, vLLM supports multiple backends for efficient attention computation across different platforms and accelerator architectures. It automatically selects the most performant backend compatible with your system and model specifications.
+
+If desired, you can also manually set the backend of your choice by configuring the environment variable `VLLM_ATTENTION_BACKEND` to one of the following options: `FLASH_ATTN`, `FLASHINFER`, or `XFORMERS`.
+
+```{attention}
+There are no pre-built vLLM wheels containing FlashInfer, so you must install it in your environment first. Refer to the [FlashInfer official docs](https://docs.flashinfer.ai/) or see the [Dockerfile](https://github.com/vllm-project/vllm/blob/main/Dockerfile) for instructions on how to install it.
+```
diff --git a/vllm/config.py b/vllm/config.py
index ace49a86eaefa..a584bc0d930f2 100644
--- a/vllm/config.py
+++ b/vllm/config.py
@@ -9,6 +9,7 @@ import sys
 import warnings
 from contextlib import contextmanager
 from dataclasses import dataclass, field, replace
+from importlib.util import find_spec
 from pathlib import Path
 from typing import (TYPE_CHECKING, Any, Callable, ClassVar, Counter, Dict,
                     Final, List, Literal, Mapping, Optional, Protocol, Set,
@@ -294,6 +295,14 @@ class ModelConfig:
 
         self.maybe_pull_model_tokenizer_for_s3(model, tokenizer)
 
+        if (backend := envs.VLLM_ATTENTION_BACKEND
+            ) and backend == "FLASHINFER" and find_spec("flashinfer") is None:
+            raise ValueError(
+                "VLLM_ATTENTION_BACKEND is set to FLASHINFER, but the "
+                "flashinfer module was not found. See "
+                "https://github.com/vllm-project/vllm/blob/main/Dockerfile "
+                "for instructions on how to install it.")
+
         # The tokenizer version is consistent with the model version by default.
         if tokenizer_revision is None:
             self.tokenizer_revision = revision
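
To illustrate the override path the quickstart addition describes, here is a minimal sketch of forcing the FlashInfer backend from Python. It assumes vLLM and flashinfer are already installed; `facebook/opt-125m` is just a placeholder model, and the prompt is arbitrary. The variable is set before the vLLM import so it is in place before any backend selection happens.

```python
import os

# Must be set before vLLM picks an attention backend.
os.environ["VLLM_ATTENTION_BACKEND"] = "FLASHINFER"

from vllm import LLM  # noqa: E402 (import deliberately after the env override)

# Placeholder model; any supported model works here.
llm = LLM(model="facebook/opt-125m")
outputs = llm.generate("Hello, my name is")
print(outputs[0].outputs[0].text)
```

The same effect can be had by exporting `VLLM_ATTENTION_BACKEND=FLASHINFER` in the shell before launching the server or script.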
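
The new check in `ModelConfig.__init__` relies on `importlib.util.find_spec`, which returns `None` when a module cannot be located, without actually importing it, so the check stays cheap at config time. A standalone sketch of the same guard, with `os.environ` standing in for the patch's `vllm.envs` lookup:

```python
import os
from importlib.util import find_spec

# Same condition as the patch; os.environ replaces envs.VLLM_ATTENTION_BACKEND.
backend = os.environ.get("VLLM_ATTENTION_BACKEND")
if backend == "FLASHINFER" and find_spec("flashinfer") is None:
    # find_spec("flashinfer") is None => the package is not installed.
    raise ValueError(
        "VLLM_ATTENTION_BACKEND is set to FLASHINFER, but the flashinfer "
        "module was not found.")
```

This fails fast with an actionable message at engine construction instead of a later `ModuleNotFoundError` deep inside backend initialization.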