mirror of
https://git.datalinker.icu/vllm-project/vllm.git
synced 2026-03-16 17:37:14 +08:00
[Doc] Compatibility matrix for mutual exclusive features (#8512)
Signed-off-by: Wallas Santos <wallashss@ibm.com>
This commit is contained in:
parent
1a1823871d
commit
8baf85e4e9
@ -86,6 +86,7 @@ Documentation
|
||||
serving/usage_stats
|
||||
serving/integrations
|
||||
serving/tensorizer
|
||||
serving/compatibility_matrix
|
||||
serving/faq
|
||||
|
||||
.. toctree::
|
||||
|
||||
@ -22,6 +22,8 @@ If you frequently encounter preemptions from the vLLM engine, consider the follo
|
||||
|
||||
You can also monitor the number of preemption requests through Prometheus metrics exposed by vLLM. Additionally, you can log the cumulative number of preemption requests by setting disable_log_stats=False.
|
||||
|
||||
.. _chunked-prefill:
|
||||
|
||||
Chunked Prefill
|
||||
---------------
|
||||
vLLM supports chunked prefill as an experimental feature. Chunked prefill allows large prefills to be split into smaller chunks and batched together with decode requests.
|
||||
|
||||
427
docs/source/serving/compatibility_matrix.rst
Normal file
427
docs/source/serving/compatibility_matrix.rst
Normal file
@ -0,0 +1,427 @@
|
||||
.. _compatibility_matrix:
|
||||
|
||||
Compatibility Matrix
|
||||
====================
|
||||
|
||||
The tables below show which features are mutually exclusive and which features are supported on selected hardware.
|
||||
|
||||
.. note::
|
||||
|
||||
Follow the link on a '✗' to see the tracking issue for that unsupported feature/hardware combination.
|
||||
|
||||
Feature x Feature
|
||||
-----------------
|
||||
|
||||
|
||||
.. raw:: html
|
||||
|
||||
<style>
|
||||
/* Use a smaller font in table cells to try to improve readability */
|
||||
td {
|
||||
font-size: 0.8rem;
|
||||
text-align: center;
|
||||
}
|
||||
|
||||
th {
|
||||
text-align: center;
|
||||
font-size: 0.8rem;
|
||||
}
|
||||
</style>
|
||||
|
||||
.. list-table::
|
||||
:header-rows: 1
|
||||
:widths: auto
|
||||
|
||||
* - Feature
|
||||
- :ref:`CP <chunked-prefill>`
|
||||
- :ref:`APC <apc>`
|
||||
- :ref:`LoRA <lora>`
|
||||
- :abbr:`prmpt adptr (Prompt Adapter)`
|
||||
- :ref:`SD <spec_decode>`
|
||||
- CUDA graph
|
||||
- :abbr:`enc-dec (Encoder-Decoder Models)`
|
||||
- :abbr:`logP (Logprobs)`
|
||||
- :abbr:`prmpt logP (Prompt Logprobs)`
|
||||
- :abbr:`async output (Async Output Processing)`
|
||||
- multi-step
|
||||
- :abbr:`MM (Multimodal)`
|
||||
- best-of
|
||||
- beam-search
|
||||
- :abbr:`guided dec (Guided Decoding)`
|
||||
* - :ref:`CP <chunked-prefill>`
|
||||
-
|
||||
-
|
||||
-
|
||||
-
|
||||
-
|
||||
-
|
||||
-
|
||||
-
|
||||
-
|
||||
-
|
||||
-
|
||||
-
|
||||
-
|
||||
-
|
||||
-
|
||||
* - :ref:`APC <apc>`
|
||||
- ✅
|
||||
-
|
||||
-
|
||||
-
|
||||
-
|
||||
-
|
||||
-
|
||||
-
|
||||
-
|
||||
-
|
||||
-
|
||||
-
|
||||
-
|
||||
-
|
||||
-
|
||||
* - :ref:`LoRA <lora>`
|
||||
- `✗ <https://github.com/vllm-project/vllm/pull/9057>`__
|
||||
- ✅
|
||||
-
|
||||
-
|
||||
-
|
||||
-
|
||||
-
|
||||
-
|
||||
-
|
||||
-
|
||||
-
|
||||
-
|
||||
-
|
||||
-
|
||||
-
|
||||
* - :abbr:`prmpt adptr (Prompt Adapter)`
|
||||
- ✅
|
||||
- ✅
|
||||
- ✅
|
||||
-
|
||||
-
|
||||
-
|
||||
-
|
||||
-
|
||||
-
|
||||
-
|
||||
-
|
||||
-
|
||||
-
|
||||
-
|
||||
-
|
||||
* - :ref:`SD <spec_decode>`
|
||||
- ✗
|
||||
- ✅
|
||||
- ✗
|
||||
- ✅
|
||||
-
|
||||
-
|
||||
-
|
||||
-
|
||||
-
|
||||
-
|
||||
-
|
||||
-
|
||||
-
|
||||
-
|
||||
-
|
||||
* - CUDA graph
|
||||
- ✅
|
||||
- ✅
|
||||
- ✅
|
||||
- ✅
|
||||
- ✅
|
||||
-
|
||||
-
|
||||
-
|
||||
-
|
||||
-
|
||||
-
|
||||
-
|
||||
-
|
||||
-
|
||||
-
|
||||
* - :abbr:`enc-dec (Encoder-Decoder Models)`
|
||||
- ✗
|
||||
- `✗ <https://github.com/vllm-project/vllm/issues/7366>`__
|
||||
- ✗
|
||||
- ✗
|
||||
- `✗ <https://github.com/vllm-project/vllm/issues/7366>`__
|
||||
- ✅
|
||||
-
|
||||
-
|
||||
-
|
||||
-
|
||||
-
|
||||
-
|
||||
-
|
||||
-
|
||||
-
|
||||
* - :abbr:`logP (Logprobs)`
|
||||
- ✅
|
||||
- ✅
|
||||
- ✅
|
||||
- ✅
|
||||
- ✅
|
||||
- ✅
|
||||
- ✅
|
||||
-
|
||||
-
|
||||
-
|
||||
-
|
||||
-
|
||||
-
|
||||
-
|
||||
-
|
||||
* - :abbr:`prmpt logP (Prompt Logprobs)`
|
||||
- ✅
|
||||
- ✅
|
||||
- ✅
|
||||
- ✅
|
||||
- `✗ <https://github.com/vllm-project/vllm/pull/8199>`__
|
||||
- ✅
|
||||
- ✅
|
||||
- ✅
|
||||
-
|
||||
-
|
||||
-
|
||||
-
|
||||
-
|
||||
-
|
||||
-
|
||||
* - :abbr:`async output (Async Output Processing)`
|
||||
- ✅
|
||||
- ✅
|
||||
- ✅
|
||||
- ✅
|
||||
- ✗
|
||||
- ✅
|
||||
- ✗
|
||||
- ✅
|
||||
- ✅
|
||||
-
|
||||
-
|
||||
-
|
||||
-
|
||||
-
|
||||
-
|
||||
* - multi-step
|
||||
- ✗
|
||||
- ✅
|
||||
- ✗
|
||||
- ✅
|
||||
- ✗
|
||||
- ✅
|
||||
- ✗
|
||||
- ✅
|
||||
- `✗ <https://github.com/vllm-project/vllm/issues/8198>`__
|
||||
- ✅
|
||||
-
|
||||
-
|
||||
-
|
||||
-
|
||||
-
|
||||
* - :abbr:`MM (Multimodal)`
|
||||
- `✗ <https://github.com/vllm-project/vllm/pull/8346>`__
|
||||
- `✗ <https://github.com/vllm-project/vllm/pull/8348>`__
|
||||
- `✗ <https://github.com/vllm-project/vllm/pull/7199>`__
|
||||
- ?
|
||||
- ?
|
||||
- ✅
|
||||
- ✗
|
||||
- ✅
|
||||
- ✅
|
||||
- ✅
|
||||
- ?
|
||||
-
|
||||
-
|
||||
-
|
||||
-
|
||||
* - best-of
|
||||
- ✅
|
||||
- ✅
|
||||
- ✅
|
||||
- ✅
|
||||
- `✗ <https://github.com/vllm-project/vllm/issues/6137>`__
|
||||
- ✅
|
||||
- ✅
|
||||
- ✅
|
||||
- ✅
|
||||
- ?
|
||||
- `✗ <https://github.com/vllm-project/vllm/issues/7968>`__
|
||||
- ✅
|
||||
-
|
||||
-
|
||||
-
|
||||
* - beam-search
|
||||
- ✅
|
||||
- ✅
|
||||
- ✅
|
||||
- ✅
|
||||
- `✗ <https://github.com/vllm-project/vllm/issues/6137>`__
|
||||
- ✅
|
||||
- ✅
|
||||
- ✅
|
||||
- ✅
|
||||
- ?
|
||||
- `✗ <https://github.com/vllm-project/vllm/issues/7968>`__
|
||||
- ?
|
||||
- ✅
|
||||
-
|
||||
-
|
||||
* - :abbr:`guided dec (Guided Decoding)`
|
||||
- ✅
|
||||
- ✅
|
||||
- ?
|
||||
- ?
|
||||
- ✅
|
||||
- ✅
|
||||
- ?
|
||||
- ✅
|
||||
- ✅
|
||||
- ✅
|
||||
- ✗
|
||||
- ?
|
||||
- ✅
|
||||
- ✅
|
||||
-
|
||||
|
||||
|
||||
Feature x Hardware
|
||||
------------------
|
||||
|
||||
.. list-table::
|
||||
:header-rows: 1
|
||||
:widths: auto
|
||||
|
||||
* - Feature
|
||||
- Volta
|
||||
- Turing
|
||||
- Ampere
|
||||
- Ada
|
||||
- Hopper
|
||||
- CPU
|
||||
- AMD
|
||||
* - :ref:`CP <chunked-prefill>`
|
||||
- `✗ <https://github.com/vllm-project/vllm/issues/2729>`__
|
||||
- ✅
|
||||
- ✅
|
||||
- ✅
|
||||
- ✅
|
||||
- ✗
|
||||
- ✅
|
||||
* - :ref:`APC <apc>`
|
||||
- `✗ <https://github.com/vllm-project/vllm/issues/3687>`__
|
||||
- ✅
|
||||
- ✅
|
||||
- ✅
|
||||
- ✅
|
||||
- ✗
|
||||
- ✅
|
||||
* - :ref:`LoRA <lora>`
|
||||
- ✅
|
||||
- ✅
|
||||
- ✅
|
||||
- ✅
|
||||
- ✅
|
||||
- `✗ <https://github.com/vllm-project/vllm/pull/4830>`__
|
||||
- ✅
|
||||
* - :abbr:`prmpt adptr (Prompt Adapter)`
|
||||
- ✅
|
||||
- ✅
|
||||
- ✅
|
||||
- ✅
|
||||
- ✅
|
||||
- `✗ <https://github.com/vllm-project/vllm/issues/8475>`__
|
||||
- ✅
|
||||
* - :ref:`SD <spec_decode>`
|
||||
- ✅
|
||||
- ✅
|
||||
- ✅
|
||||
- ✅
|
||||
- ✅
|
||||
- ✅
|
||||
- ✅
|
||||
* - CUDA graph
|
||||
- ✅
|
||||
- ✅
|
||||
- ✅
|
||||
- ✅
|
||||
- ✅
|
||||
- ✗
|
||||
- ✅
|
||||
* - :abbr:`enc-dec (Encoder-Decoder Models)`
|
||||
- ✅
|
||||
- ✅
|
||||
- ✅
|
||||
- ✅
|
||||
- ✅
|
||||
- `✗ <https://github.com/vllm-project/vllm/blob/a84e598e2125960d3b4f716b78863f24ac562947/vllm/worker/cpu_model_runner.py#L125>`__
|
||||
- ✗
|
||||
* - :abbr:`logP (Logprobs)`
|
||||
- ✅
|
||||
- ✅
|
||||
- ✅
|
||||
- ✅
|
||||
- ✅
|
||||
- ✅
|
||||
- ✅
|
||||
* - :abbr:`prmpt logP (Prompt Logprobs)`
|
||||
- ✅
|
||||
- ✅
|
||||
- ✅
|
||||
- ✅
|
||||
- ✅
|
||||
- ✅
|
||||
- ✅
|
||||
* - :abbr:`async output (Async Output Processing)`
|
||||
- ✅
|
||||
- ✅
|
||||
- ✅
|
||||
- ✅
|
||||
- ✅
|
||||
- ✗
|
||||
- ✗
|
||||
* - multi-step
|
||||
- ✅
|
||||
- ✅
|
||||
- ✅
|
||||
- ✅
|
||||
- ✅
|
||||
- `✗ <https://github.com/vllm-project/vllm/issues/8477>`__
|
||||
- ✅
|
||||
* - :abbr:`MM (Multimodal)`
|
||||
- ✅
|
||||
- ✅
|
||||
- ✅
|
||||
- ✅
|
||||
- ✅
|
||||
- ✅
|
||||
- ✅
|
||||
* - best-of
|
||||
- ✅
|
||||
- ✅
|
||||
- ✅
|
||||
- ✅
|
||||
- ✅
|
||||
- ✅
|
||||
- ✅
|
||||
* - beam-search
|
||||
- ✅
|
||||
- ✅
|
||||
- ✅
|
||||
- ✅
|
||||
- ✅
|
||||
- ✅
|
||||
- ✅
|
||||
* - :abbr:`guided dec (Guided Decoding)`
|
||||
- ✅
|
||||
- ✅
|
||||
- ✅
|
||||
- ✅
|
||||
- ✅
|
||||
- ✅
|
||||
- ✅
|
||||
@ -420,6 +420,8 @@ class ROCmFlashAttentionImpl(AttentionImpl):
|
||||
Returns:
|
||||
shape = [num_tokens, num_heads * head_size]
|
||||
"""
|
||||
# Reminder: Please update docs/source/serving/compatibility_matrix.rst
|
||||
# If the feature combo becomes valid
|
||||
if attn_type != AttentionType.DECODER:
|
||||
raise NotImplementedError("Encoder self-attention and "
|
||||
"encoder/decoder cross-attention "
|
||||
|
||||
@ -359,6 +359,8 @@ class ModelConfig:
|
||||
self.use_async_output_proc = False
|
||||
return
|
||||
|
||||
# Reminder: Please update docs/source/serving/compatibility_matrix.rst
|
||||
# If the feature combo becomes valid
|
||||
if device_config.device_type not in ("cuda", "tpu"):
|
||||
logger.warning(
|
||||
"Async output processing is only supported for CUDA or TPU. "
|
||||
@ -372,6 +374,8 @@ class ModelConfig:
|
||||
self.use_async_output_proc = False
|
||||
return
|
||||
|
||||
# Reminder: Please update docs/source/serving/compatibility_matrix.rst
|
||||
# If the feature combo becomes valid
|
||||
if device_config.device_type == "cuda" and self.enforce_eager:
|
||||
logger.warning(
|
||||
"To see benefits of async output processing, enable CUDA "
|
||||
@ -385,6 +389,8 @@ class ModelConfig:
|
||||
if self.embedding_mode:
|
||||
self.use_async_output_proc = False
|
||||
|
||||
# Reminder: Please update docs/source/serving/compatibility_matrix.rst
|
||||
# If the feature combo becomes valid
|
||||
if speculative_config:
|
||||
logger.warning("Async output processing is not supported with"
|
||||
" speculative decoding currently.")
|
||||
@ -1200,6 +1206,8 @@ class SpeculativeConfig:
|
||||
"speculative decoding is > 1, but got "
|
||||
f"{speculative_disable_by_batch_size=}")
|
||||
|
||||
# Reminder: Please update docs/source/serving/compatibility_matrix.rst
|
||||
# If the feature combo becomes valid
|
||||
if enable_chunked_prefill:
|
||||
raise ValueError(
|
||||
"Speculative decoding and chunked prefill are "
|
||||
@ -1561,6 +1569,8 @@ class LoRAConfig:
|
||||
model_config.quantization)
|
||||
|
||||
def verify_with_scheduler_config(self, scheduler_config: SchedulerConfig):
|
||||
# Reminder: Please update docs/source/serving/compatibility_matrix.rst
|
||||
# If the feature combo becomes valid
|
||||
if scheduler_config.chunked_prefill_enabled:
|
||||
raise ValueError("LoRA is not supported with chunked prefill yet.")
|
||||
|
||||
|
||||
@ -1000,6 +1000,8 @@ class EngineArgs:
|
||||
disable_logprobs=self.disable_logprobs_during_spec_decoding,
|
||||
)
|
||||
|
||||
# Reminder: Please update docs/source/serving/compatibility_matrix.rst
|
||||
# If the feature combo becomes valid
|
||||
if self.num_scheduler_steps > 1:
|
||||
if speculative_config is not None:
|
||||
raise ValueError("Speculative decoding is not supported with "
|
||||
|
||||
@ -62,6 +62,8 @@ class MultiStepOutputProcessor(SequenceGroupOutputProcessor):
|
||||
@staticmethod
|
||||
@functools.lru_cache()
|
||||
def _log_prompt_logprob_unsupported_warning_once():
|
||||
# Reminder: Please update docs/source/serving/compatibility_matrix.rst
|
||||
# If the feature combo becomes valid
|
||||
logger.warning(
|
||||
"Prompt logprob is not supported by multi step workers. "
|
||||
"(e.g., speculative decode uses multi step workers).")
|
||||
|
||||
@ -28,6 +28,8 @@ class CPUExecutor(ExecutorBase):
|
||||
|
||||
def _init_executor(self) -> None:
|
||||
assert self.device_config.device_type == "cpu"
|
||||
# Reminder: Please update docs/source/serving/compatibility_matrix.rst
|
||||
# If the feature combo becomes valid
|
||||
assert self.lora_config is None, "cpu backend doesn't support LoRA"
|
||||
|
||||
#
|
||||
@ -324,6 +326,8 @@ def _verify_and_get_model_config(config: ModelConfig) -> ModelConfig:
|
||||
if config.dtype == torch.float16:
|
||||
logger.warning("float16 is not supported on CPU, casting to bfloat16.")
|
||||
config.dtype = torch.bfloat16
|
||||
# Reminder: Please update docs/source/serving/compatibility_matrix.rst
|
||||
# If the feature combo becomes valid
|
||||
if not config.enforce_eager:
|
||||
logger.warning(
|
||||
"CUDA graph is not supported on CPU, fallback to the eager "
|
||||
@ -334,6 +338,8 @@ def _verify_and_get_model_config(config: ModelConfig) -> ModelConfig:
|
||||
|
||||
def _verify_and_get_scheduler_config(
|
||||
config: SchedulerConfig) -> SchedulerConfig:
|
||||
# Reminder: Please update docs/source/serving/compatibility_matrix.rst
|
||||
# If the feature combo becomes valid
|
||||
if config.chunked_prefill_enabled:
|
||||
logger.warning("Chunked prefill is not supported on CPU, disable it.")
|
||||
config.chunked_prefill_enabled = False
|
||||
@ -342,6 +348,8 @@ def _verify_and_get_scheduler_config(
|
||||
|
||||
|
||||
def _verify_and_get_cache_config(config: CacheConfig) -> CacheConfig:
|
||||
# Reminder: Please update docs/source/serving/compatibility_matrix.rst
|
||||
# If the feature combo becomes valid
|
||||
if config.enable_prefix_caching:
|
||||
logger.warning("Prefix caching is not supported on CPU, disable it.")
|
||||
config.enable_prefix_caching = False
|
||||
|
||||
@ -310,6 +310,8 @@ class InputPreprocessor:
|
||||
encoder_prompt, encoder_prompt_ids, encoder_mm_data, _ = encoder_comps
|
||||
decoder_prompt, decoder_prompt_ids, decoder_mm_data, _ = decoder_comps
|
||||
|
||||
# Reminder: Please update docs/source/serving/compatibility_matrix.rst
|
||||
# If the feature combo becomes valid
|
||||
if decoder_mm_data is not None:
|
||||
raise ValueError(
|
||||
"Multi-modality decoder inputs of encoder-decoder models are "
|
||||
|
||||
@ -87,6 +87,8 @@ def create_spec_worker(*args, **kwargs) -> "SpecDecodeWorker":
|
||||
return spec_decode_worker
|
||||
|
||||
|
||||
# Reminder: Please update docs/source/serving/compatibility_matrix.rst
|
||||
# If the feature combo becomes valid
|
||||
class SpecDecodeWorker(LoraNotSupportedWorkerBase):
|
||||
"""Worker which implements speculative decoding.
|
||||
|
||||
|
||||
@ -41,6 +41,9 @@ logger = init_logger(__name__)
|
||||
|
||||
# Exception strings for non-implemented encoder/decoder scenarios
|
||||
|
||||
# Reminder: Please update docs/source/serving/compatibility_matrix.rst
|
||||
# If the feature combo becomes valid
|
||||
|
||||
STR_NOT_IMPL_ENC_DEC_SWA = \
|
||||
"Sliding window attention for encoder/decoder models " + \
|
||||
"is not currently supported."
|
||||
|
||||
@ -816,6 +816,9 @@ def _pythonize_sampler_output(
|
||||
|
||||
for sgdx, (seq_group,
|
||||
sample_result) in enumerate(zip(seq_groups, samples_list)):
|
||||
# Reminder: Please update docs/source/serving/compatibility_matrix.rst
|
||||
# If the feature combo becomes valid
|
||||
# (Check for Guided Decoding)
|
||||
if seq_group.sampling_params.logits_processors:
|
||||
assert len(seq_group.sampling_params.logits_processors) == 0, (
|
||||
"Logits Processors are not supported in multi-step decoding")
|
||||
|
||||
@ -13,6 +13,9 @@ def assert_enc_dec_mr_supported_scenario(
|
||||
a supported scenario.
|
||||
'''
|
||||
|
||||
# Reminder: Please update docs/source/serving/compatibility_matrix.rst
|
||||
# If the feature combo becomes valid
|
||||
|
||||
if enc_dec_mr.cache_config.enable_prefix_caching:
|
||||
raise NotImplementedError(
|
||||
STR_NOT_IMPL_ENC_DEC_ERR_STRS['STR_NOT_IMPL_ENC_DEC_PREFIX_CACHE'])
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user