mirror of
https://git.datalinker.icu/vllm-project/vllm.git
synced 2026-05-02 15:10:05 +08:00
[Doc] Compatibility matrix for mutual exclusive features (#8512)
Signed-off-by: Wallas Santos <wallashss@ibm.com>
This commit is contained in:
parent
1a1823871d
commit
8baf85e4e9
@ -86,6 +86,7 @@ Documentation
|
|||||||
serving/usage_stats
|
serving/usage_stats
|
||||||
serving/integrations
|
serving/integrations
|
||||||
serving/tensorizer
|
serving/tensorizer
|
||||||
|
serving/compatibility_matrix
|
||||||
serving/faq
|
serving/faq
|
||||||
|
|
||||||
.. toctree::
|
.. toctree::
|
||||||
|
|||||||
@ -22,6 +22,8 @@ If you frequently encounter preemptions from the vLLM engine, consider the follo
|
|||||||
|
|
||||||
You can also monitor the number of preemption requests through Prometheus metrics exposed by vLLM. Additionally, you can log the cumulative number of preemption requests by setting disable_log_stats=False.
|
You can also monitor the number of preemption requests through Prometheus metrics exposed by vLLM. Additionally, you can log the cumulative number of preemption requests by setting disable_log_stats=False.
|
||||||
|
|
||||||
|
.. _chunked-prefill:
|
||||||
|
|
||||||
Chunked Prefill
|
Chunked Prefill
|
||||||
---------------
|
---------------
|
||||||
vLLM supports an experimental feature called chunked prefill. Chunked prefill allows large prefills to be split into smaller chunks and batched together with decode requests.
|
vLLM supports an experimental feature called chunked prefill. Chunked prefill allows large prefills to be split into smaller chunks and batched together with decode requests.
|
||||||
|
|||||||
427
docs/source/serving/compatibility_matrix.rst
Normal file
427
docs/source/serving/compatibility_matrix.rst
Normal file
@ -0,0 +1,427 @@
|
|||||||
|
.. _compatibility_matrix:
|
||||||
|
|
||||||
|
Compatibility Matrix
|
||||||
|
====================
|
||||||
|
|
||||||
|
The tables below show which features are mutually exclusive and which features are supported on which hardware.
|
||||||
|
|
||||||
|
.. note::
|
||||||
|
|
||||||
|
A '✗' with a link points to the tracking issue for that unsupported feature/hardware combination.
|
||||||
|
|
||||||
|
Feature x Feature
|
||||||
|
-----------------
|
||||||
|
|
||||||
|
|
||||||
|
.. raw:: html
|
||||||
|
|
||||||
|
<style>
|
||||||
|
/* Use a smaller font size to improve the readability of the wide tables */
|
||||||
|
td {
|
||||||
|
font-size: 0.8rem;
|
||||||
|
text-align: center;
|
||||||
|
}
|
||||||
|
|
||||||
|
th {
|
||||||
|
text-align: center;
|
||||||
|
font-size: 0.8rem;
|
||||||
|
}
|
||||||
|
</style>
|
||||||
|
|
||||||
|
.. list-table::
|
||||||
|
:header-rows: 1
|
||||||
|
:widths: auto
|
||||||
|
|
||||||
|
* - Feature
|
||||||
|
- :ref:`CP <chunked-prefill>`
|
||||||
|
- :ref:`APC <apc>`
|
||||||
|
- :ref:`LoRA <lora>`
|
||||||
|
- :abbr:`prmpt adptr (Prompt Adapter)`
|
||||||
|
- :ref:`SD <spec_decode>`
|
||||||
|
- CUDA graph
|
||||||
|
- :abbr:`enc-dec (Encoder-Decoder Models)`
|
||||||
|
- :abbr:`logP (Logprobs)`
|
||||||
|
- :abbr:`prmpt logP (Prompt Logprobs)`
|
||||||
|
- :abbr:`async output (Async Output Processing)`
|
||||||
|
- multi-step
|
||||||
|
- :abbr:`MM (Multimodal)`
|
||||||
|
- best-of
|
||||||
|
- beam-search
|
||||||
|
- :abbr:`guided dec (Guided Decoding)`
|
||||||
|
* - :ref:`CP <chunked-prefill>`
|
||||||
|
-
|
||||||
|
-
|
||||||
|
-
|
||||||
|
-
|
||||||
|
-
|
||||||
|
-
|
||||||
|
-
|
||||||
|
-
|
||||||
|
-
|
||||||
|
-
|
||||||
|
-
|
||||||
|
-
|
||||||
|
-
|
||||||
|
-
|
||||||
|
-
|
||||||
|
* - :ref:`APC <apc>`
|
||||||
|
- ✅
|
||||||
|
-
|
||||||
|
-
|
||||||
|
-
|
||||||
|
-
|
||||||
|
-
|
||||||
|
-
|
||||||
|
-
|
||||||
|
-
|
||||||
|
-
|
||||||
|
-
|
||||||
|
-
|
||||||
|
-
|
||||||
|
-
|
||||||
|
-
|
||||||
|
* - :ref:`LoRA <lora>`
|
||||||
|
- `✗ <https://github.com/vllm-project/vllm/pull/9057>`__
|
||||||
|
- ✅
|
||||||
|
-
|
||||||
|
-
|
||||||
|
-
|
||||||
|
-
|
||||||
|
-
|
||||||
|
-
|
||||||
|
-
|
||||||
|
-
|
||||||
|
-
|
||||||
|
-
|
||||||
|
-
|
||||||
|
-
|
||||||
|
-
|
||||||
|
* - :abbr:`prmpt adptr (Prompt Adapter)`
|
||||||
|
- ✅
|
||||||
|
- ✅
|
||||||
|
- ✅
|
||||||
|
-
|
||||||
|
-
|
||||||
|
-
|
||||||
|
-
|
||||||
|
-
|
||||||
|
-
|
||||||
|
-
|
||||||
|
-
|
||||||
|
-
|
||||||
|
-
|
||||||
|
-
|
||||||
|
-
|
||||||
|
* - :ref:`SD <spec_decode>`
|
||||||
|
- ✗
|
||||||
|
- ✅
|
||||||
|
- ✗
|
||||||
|
- ✅
|
||||||
|
-
|
||||||
|
-
|
||||||
|
-
|
||||||
|
-
|
||||||
|
-
|
||||||
|
-
|
||||||
|
-
|
||||||
|
-
|
||||||
|
-
|
||||||
|
-
|
||||||
|
-
|
||||||
|
* - CUDA graph
|
||||||
|
- ✅
|
||||||
|
- ✅
|
||||||
|
- ✅
|
||||||
|
- ✅
|
||||||
|
- ✅
|
||||||
|
-
|
||||||
|
-
|
||||||
|
-
|
||||||
|
-
|
||||||
|
-
|
||||||
|
-
|
||||||
|
-
|
||||||
|
-
|
||||||
|
-
|
||||||
|
-
|
||||||
|
* - :abbr:`enc-dec (Encoder-Decoder Models)`
|
||||||
|
- ✗
|
||||||
|
- `✗ <https://github.com/vllm-project/vllm/issues/7366>`__
|
||||||
|
- ✗
|
||||||
|
- ✗
|
||||||
|
- `✗ <https://github.com/vllm-project/vllm/issues/7366>`__
|
||||||
|
- ✅
|
||||||
|
-
|
||||||
|
-
|
||||||
|
-
|
||||||
|
-
|
||||||
|
-
|
||||||
|
-
|
||||||
|
-
|
||||||
|
-
|
||||||
|
-
|
||||||
|
* - :abbr:`logP (Logprobs)`
|
||||||
|
- ✅
|
||||||
|
- ✅
|
||||||
|
- ✅
|
||||||
|
- ✅
|
||||||
|
- ✅
|
||||||
|
- ✅
|
||||||
|
- ✅
|
||||||
|
-
|
||||||
|
-
|
||||||
|
-
|
||||||
|
-
|
||||||
|
-
|
||||||
|
-
|
||||||
|
-
|
||||||
|
-
|
||||||
|
* - :abbr:`prmpt logP (Prompt Logprobs)`
|
||||||
|
- ✅
|
||||||
|
- ✅
|
||||||
|
- ✅
|
||||||
|
- ✅
|
||||||
|
- `✗ <https://github.com/vllm-project/vllm/pull/8199>`__
|
||||||
|
- ✅
|
||||||
|
- ✅
|
||||||
|
- ✅
|
||||||
|
-
|
||||||
|
-
|
||||||
|
-
|
||||||
|
-
|
||||||
|
-
|
||||||
|
-
|
||||||
|
-
|
||||||
|
* - :abbr:`async output (Async Output Processing)`
|
||||||
|
- ✅
|
||||||
|
- ✅
|
||||||
|
- ✅
|
||||||
|
- ✅
|
||||||
|
- ✗
|
||||||
|
- ✅
|
||||||
|
- ✗
|
||||||
|
- ✅
|
||||||
|
- ✅
|
||||||
|
-
|
||||||
|
-
|
||||||
|
-
|
||||||
|
-
|
||||||
|
-
|
||||||
|
-
|
||||||
|
* - multi-step
|
||||||
|
- ✗
|
||||||
|
- ✅
|
||||||
|
- ✗
|
||||||
|
- ✅
|
||||||
|
- ✗
|
||||||
|
- ✅
|
||||||
|
- ✗
|
||||||
|
- ✅
|
||||||
|
- `✗ <https://github.com/vllm-project/vllm/issues/8198>`__
|
||||||
|
- ✅
|
||||||
|
-
|
||||||
|
-
|
||||||
|
-
|
||||||
|
-
|
||||||
|
-
|
||||||
|
* - :abbr:`MM (Multimodal)`
|
||||||
|
- `✗ <https://github.com/vllm-project/vllm/pull/8346>`__
|
||||||
|
- `✗ <https://github.com/vllm-project/vllm/pull/8348>`__
|
||||||
|
- `✗ <https://github.com/vllm-project/vllm/pull/7199>`__
|
||||||
|
- ?
|
||||||
|
- ?
|
||||||
|
- ✅
|
||||||
|
- ✗
|
||||||
|
- ✅
|
||||||
|
- ✅
|
||||||
|
- ✅
|
||||||
|
- ?
|
||||||
|
-
|
||||||
|
-
|
||||||
|
-
|
||||||
|
-
|
||||||
|
* - best-of
|
||||||
|
- ✅
|
||||||
|
- ✅
|
||||||
|
- ✅
|
||||||
|
- ✅
|
||||||
|
- `✗ <https://github.com/vllm-project/vllm/issues/6137>`__
|
||||||
|
- ✅
|
||||||
|
- ✅
|
||||||
|
- ✅
|
||||||
|
- ✅
|
||||||
|
- ?
|
||||||
|
- `✗ <https://github.com/vllm-project/vllm/issues/7968>`__
|
||||||
|
- ✅
|
||||||
|
-
|
||||||
|
-
|
||||||
|
-
|
||||||
|
* - beam-search
|
||||||
|
- ✅
|
||||||
|
- ✅
|
||||||
|
- ✅
|
||||||
|
- ✅
|
||||||
|
- `✗ <https://github.com/vllm-project/vllm/issues/6137>`__
|
||||||
|
- ✅
|
||||||
|
- ✅
|
||||||
|
- ✅
|
||||||
|
- ✅
|
||||||
|
- ?
|
||||||
|
- `✗ <https://github.com/vllm-project/vllm/issues/7968>`__
|
||||||
|
- ?
|
||||||
|
- ✅
|
||||||
|
-
|
||||||
|
-
|
||||||
|
* - :abbr:`guided dec (Guided Decoding)`
|
||||||
|
- ✅
|
||||||
|
- ✅
|
||||||
|
- ?
|
||||||
|
- ?
|
||||||
|
- ✅
|
||||||
|
- ✅
|
||||||
|
- ?
|
||||||
|
- ✅
|
||||||
|
- ✅
|
||||||
|
- ✅
|
||||||
|
- ✗
|
||||||
|
- ?
|
||||||
|
- ✅
|
||||||
|
- ✅
|
||||||
|
-
|
||||||
|
|
||||||
|
|
||||||
|
Feature x Hardware
|
||||||
|
------------------
|
||||||
|
|
||||||
|
.. list-table::
|
||||||
|
:header-rows: 1
|
||||||
|
:widths: auto
|
||||||
|
|
||||||
|
* - Feature
|
||||||
|
- Volta
|
||||||
|
- Turing
|
||||||
|
- Ampere
|
||||||
|
- Ada
|
||||||
|
- Hopper
|
||||||
|
- CPU
|
||||||
|
- AMD
|
||||||
|
* - :ref:`CP <chunked-prefill>`
|
||||||
|
- `✗ <https://github.com/vllm-project/vllm/issues/2729>`__
|
||||||
|
- ✅
|
||||||
|
- ✅
|
||||||
|
- ✅
|
||||||
|
- ✅
|
||||||
|
- ✗
|
||||||
|
- ✅
|
||||||
|
* - :ref:`APC <apc>`
|
||||||
|
- `✗ <https://github.com/vllm-project/vllm/issues/3687>`__
|
||||||
|
- ✅
|
||||||
|
- ✅
|
||||||
|
- ✅
|
||||||
|
- ✅
|
||||||
|
- ✗
|
||||||
|
- ✅
|
||||||
|
* - :ref:`LoRA <lora>`
|
||||||
|
- ✅
|
||||||
|
- ✅
|
||||||
|
- ✅
|
||||||
|
- ✅
|
||||||
|
- ✅
|
||||||
|
- `✗ <https://github.com/vllm-project/vllm/pull/4830>`__
|
||||||
|
- ✅
|
||||||
|
* - :abbr:`prmpt adptr (Prompt Adapter)`
|
||||||
|
- ✅
|
||||||
|
- ✅
|
||||||
|
- ✅
|
||||||
|
- ✅
|
||||||
|
- ✅
|
||||||
|
- `✗ <https://github.com/vllm-project/vllm/issues/8475>`__
|
||||||
|
- ✅
|
||||||
|
* - :ref:`SD <spec_decode>`
|
||||||
|
- ✅
|
||||||
|
- ✅
|
||||||
|
- ✅
|
||||||
|
- ✅
|
||||||
|
- ✅
|
||||||
|
- ✅
|
||||||
|
- ✅
|
||||||
|
* - CUDA graph
|
||||||
|
- ✅
|
||||||
|
- ✅
|
||||||
|
- ✅
|
||||||
|
- ✅
|
||||||
|
- ✅
|
||||||
|
- ✗
|
||||||
|
- ✅
|
||||||
|
* - :abbr:`enc-dec (Encoder-Decoder Models)`
|
||||||
|
- ✅
|
||||||
|
- ✅
|
||||||
|
- ✅
|
||||||
|
- ✅
|
||||||
|
- ✅
|
||||||
|
- `✗ <https://github.com/vllm-project/vllm/blob/a84e598e2125960d3b4f716b78863f24ac562947/vllm/worker/cpu_model_runner.py#L125>`__
|
||||||
|
- ✗
|
||||||
|
* - :abbr:`logP (Logprobs)`
|
||||||
|
- ✅
|
||||||
|
- ✅
|
||||||
|
- ✅
|
||||||
|
- ✅
|
||||||
|
- ✅
|
||||||
|
- ✅
|
||||||
|
- ✅
|
||||||
|
* - :abbr:`prmpt logP (Prompt Logprobs)`
|
||||||
|
- ✅
|
||||||
|
- ✅
|
||||||
|
- ✅
|
||||||
|
- ✅
|
||||||
|
- ✅
|
||||||
|
- ✅
|
||||||
|
- ✅
|
||||||
|
* - :abbr:`async output (Async Output Processing)`
|
||||||
|
- ✅
|
||||||
|
- ✅
|
||||||
|
- ✅
|
||||||
|
- ✅
|
||||||
|
- ✅
|
||||||
|
- ✗
|
||||||
|
- ✗
|
||||||
|
* - multi-step
|
||||||
|
- ✅
|
||||||
|
- ✅
|
||||||
|
- ✅
|
||||||
|
- ✅
|
||||||
|
- ✅
|
||||||
|
- `✗ <https://github.com/vllm-project/vllm/issues/8477>`__
|
||||||
|
- ✅
|
||||||
|
* - :abbr:`MM (Multimodal)`
|
||||||
|
- ✅
|
||||||
|
- ✅
|
||||||
|
- ✅
|
||||||
|
- ✅
|
||||||
|
- ✅
|
||||||
|
- ✅
|
||||||
|
- ✅
|
||||||
|
* - best-of
|
||||||
|
- ✅
|
||||||
|
- ✅
|
||||||
|
- ✅
|
||||||
|
- ✅
|
||||||
|
- ✅
|
||||||
|
- ✅
|
||||||
|
- ✅
|
||||||
|
* - beam-search
|
||||||
|
- ✅
|
||||||
|
- ✅
|
||||||
|
- ✅
|
||||||
|
- ✅
|
||||||
|
- ✅
|
||||||
|
- ✅
|
||||||
|
- ✅
|
||||||
|
* - :abbr:`guided dec (Guided Decoding)`
|
||||||
|
- ✅
|
||||||
|
- ✅
|
||||||
|
- ✅
|
||||||
|
- ✅
|
||||||
|
- ✅
|
||||||
|
- ✅
|
||||||
|
- ✅
|
||||||
@ -420,6 +420,8 @@ class ROCmFlashAttentionImpl(AttentionImpl):
|
|||||||
Returns:
|
Returns:
|
||||||
shape = [num_tokens, num_heads * head_size]
|
shape = [num_tokens, num_heads * head_size]
|
||||||
"""
|
"""
|
||||||
|
# Reminder: Please update docs/source/serving/compatibility_matrix.rst
|
||||||
|
# If the feature combo becomes valid
|
||||||
if attn_type != AttentionType.DECODER:
|
if attn_type != AttentionType.DECODER:
|
||||||
raise NotImplementedError("Encoder self-attention and "
|
raise NotImplementedError("Encoder self-attention and "
|
||||||
"encoder/decoder cross-attention "
|
"encoder/decoder cross-attention "
|
||||||
|
|||||||
@ -359,6 +359,8 @@ class ModelConfig:
|
|||||||
self.use_async_output_proc = False
|
self.use_async_output_proc = False
|
||||||
return
|
return
|
||||||
|
|
||||||
|
# Reminder: Please update docs/source/serving/compatibility_matrix.rst
|
||||||
|
# If the feature combo becomes valid
|
||||||
if device_config.device_type not in ("cuda", "tpu"):
|
if device_config.device_type not in ("cuda", "tpu"):
|
||||||
logger.warning(
|
logger.warning(
|
||||||
"Async output processing is only supported for CUDA or TPU. "
|
"Async output processing is only supported for CUDA or TPU. "
|
||||||
@ -372,6 +374,8 @@ class ModelConfig:
|
|||||||
self.use_async_output_proc = False
|
self.use_async_output_proc = False
|
||||||
return
|
return
|
||||||
|
|
||||||
|
# Reminder: Please update docs/source/serving/compatibility_matrix.rst
|
||||||
|
# If the feature combo becomes valid
|
||||||
if device_config.device_type == "cuda" and self.enforce_eager:
|
if device_config.device_type == "cuda" and self.enforce_eager:
|
||||||
logger.warning(
|
logger.warning(
|
||||||
"To see benefits of async output processing, enable CUDA "
|
"To see benefits of async output processing, enable CUDA "
|
||||||
@ -385,6 +389,8 @@ class ModelConfig:
|
|||||||
if self.embedding_mode:
|
if self.embedding_mode:
|
||||||
self.use_async_output_proc = False
|
self.use_async_output_proc = False
|
||||||
|
|
||||||
|
# Reminder: Please update docs/source/serving/compatibility_matrix.rst
|
||||||
|
# If the feature combo becomes valid
|
||||||
if speculative_config:
|
if speculative_config:
|
||||||
logger.warning("Async output processing is not supported with"
|
logger.warning("Async output processing is not supported with"
|
||||||
" speculative decoding currently.")
|
" speculative decoding currently.")
|
||||||
@ -1200,6 +1206,8 @@ class SpeculativeConfig:
|
|||||||
"speculative decoding is > 1, but got "
|
"speculative decoding is > 1, but got "
|
||||||
f"{speculative_disable_by_batch_size=}")
|
f"{speculative_disable_by_batch_size=}")
|
||||||
|
|
||||||
|
# Reminder: Please update docs/source/serving/compatibility_matrix.rst
|
||||||
|
# If the feature combo becomes valid
|
||||||
if enable_chunked_prefill:
|
if enable_chunked_prefill:
|
||||||
raise ValueError(
|
raise ValueError(
|
||||||
"Speculative decoding and chunked prefill are "
|
"Speculative decoding and chunked prefill are "
|
||||||
@ -1561,6 +1569,8 @@ class LoRAConfig:
|
|||||||
model_config.quantization)
|
model_config.quantization)
|
||||||
|
|
||||||
def verify_with_scheduler_config(self, scheduler_config: SchedulerConfig):
|
def verify_with_scheduler_config(self, scheduler_config: SchedulerConfig):
|
||||||
|
# Reminder: Please update docs/source/serving/compatibility_matrix.rst
|
||||||
|
# If the feature combo becomes valid
|
||||||
if scheduler_config.chunked_prefill_enabled:
|
if scheduler_config.chunked_prefill_enabled:
|
||||||
raise ValueError("LoRA is not supported with chunked prefill yet.")
|
raise ValueError("LoRA is not supported with chunked prefill yet.")
|
||||||
|
|
||||||
|
|||||||
@ -1000,6 +1000,8 @@ class EngineArgs:
|
|||||||
disable_logprobs=self.disable_logprobs_during_spec_decoding,
|
disable_logprobs=self.disable_logprobs_during_spec_decoding,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
# Reminder: Please update docs/source/serving/compatibility_matrix.rst
|
||||||
|
# If the feature combo becomes valid
|
||||||
if self.num_scheduler_steps > 1:
|
if self.num_scheduler_steps > 1:
|
||||||
if speculative_config is not None:
|
if speculative_config is not None:
|
||||||
raise ValueError("Speculative decoding is not supported with "
|
raise ValueError("Speculative decoding is not supported with "
|
||||||
|
|||||||
@ -62,6 +62,8 @@ class MultiStepOutputProcessor(SequenceGroupOutputProcessor):
|
|||||||
@staticmethod
|
@staticmethod
|
||||||
@functools.lru_cache()
|
@functools.lru_cache()
|
||||||
def _log_prompt_logprob_unsupported_warning_once():
|
def _log_prompt_logprob_unsupported_warning_once():
|
||||||
|
# Reminder: Please update docs/source/serving/compatibility_matrix.rst
|
||||||
|
# If the feature combo becomes valid
|
||||||
logger.warning(
|
logger.warning(
|
||||||
"Prompt logprob is not supported by multi step workers. "
|
"Prompt logprob is not supported by multi step workers. "
|
||||||
"(e.g., speculative decode uses multi step workers).")
|
"(e.g., speculative decode uses multi step workers).")
|
||||||
|
|||||||
@ -28,6 +28,8 @@ class CPUExecutor(ExecutorBase):
|
|||||||
|
|
||||||
def _init_executor(self) -> None:
|
def _init_executor(self) -> None:
|
||||||
assert self.device_config.device_type == "cpu"
|
assert self.device_config.device_type == "cpu"
|
||||||
|
# Reminder: Please update docs/source/serving/compatibility_matrix.rst
|
||||||
|
# If the feature combo becomes valid
|
||||||
assert self.lora_config is None, "cpu backend doesn't support LoRA"
|
assert self.lora_config is None, "cpu backend doesn't support LoRA"
|
||||||
|
|
||||||
#
|
#
|
||||||
@ -324,6 +326,8 @@ def _verify_and_get_model_config(config: ModelConfig) -> ModelConfig:
|
|||||||
if config.dtype == torch.float16:
|
if config.dtype == torch.float16:
|
||||||
logger.warning("float16 is not supported on CPU, casting to bfloat16.")
|
logger.warning("float16 is not supported on CPU, casting to bfloat16.")
|
||||||
config.dtype = torch.bfloat16
|
config.dtype = torch.bfloat16
|
||||||
|
# Reminder: Please update docs/source/serving/compatibility_matrix.rst
|
||||||
|
# If the feature combo becomes valid
|
||||||
if not config.enforce_eager:
|
if not config.enforce_eager:
|
||||||
logger.warning(
|
logger.warning(
|
||||||
"CUDA graph is not supported on CPU, fallback to the eager "
|
"CUDA graph is not supported on CPU, fallback to the eager "
|
||||||
@ -334,6 +338,8 @@ def _verify_and_get_model_config(config: ModelConfig) -> ModelConfig:
|
|||||||
|
|
||||||
def _verify_and_get_scheduler_config(
|
def _verify_and_get_scheduler_config(
|
||||||
config: SchedulerConfig) -> SchedulerConfig:
|
config: SchedulerConfig) -> SchedulerConfig:
|
||||||
|
# Reminder: Please update docs/source/serving/compatibility_matrix.rst
|
||||||
|
# If the feature combo becomes valid
|
||||||
if config.chunked_prefill_enabled:
|
if config.chunked_prefill_enabled:
|
||||||
logger.warning("Chunked prefill is not supported on CPU, disable it.")
|
logger.warning("Chunked prefill is not supported on CPU, disable it.")
|
||||||
config.chunked_prefill_enabled = False
|
config.chunked_prefill_enabled = False
|
||||||
@ -342,6 +348,8 @@ def _verify_and_get_scheduler_config(
|
|||||||
|
|
||||||
|
|
||||||
def _verify_and_get_cache_config(config: CacheConfig) -> CacheConfig:
|
def _verify_and_get_cache_config(config: CacheConfig) -> CacheConfig:
|
||||||
|
# Reminder: Please update docs/source/serving/compatibility_matrix.rst
|
||||||
|
# If the feature combo becomes valid
|
||||||
if config.enable_prefix_caching:
|
if config.enable_prefix_caching:
|
||||||
logger.warning("Prefix caching is not supported on CPU, disable it.")
|
logger.warning("Prefix caching is not supported on CPU, disable it.")
|
||||||
config.enable_prefix_caching = False
|
config.enable_prefix_caching = False
|
||||||
|
|||||||
@ -310,6 +310,8 @@ class InputPreprocessor:
|
|||||||
encoder_prompt, encoder_prompt_ids, encoder_mm_data, _ = encoder_comps
|
encoder_prompt, encoder_prompt_ids, encoder_mm_data, _ = encoder_comps
|
||||||
decoder_prompt, decoder_prompt_ids, decoder_mm_data, _ = decoder_comps
|
decoder_prompt, decoder_prompt_ids, decoder_mm_data, _ = decoder_comps
|
||||||
|
|
||||||
|
# Reminder: Please update docs/source/serving/compatibility_matrix.rst
|
||||||
|
# If the feature combo becomes valid
|
||||||
if decoder_mm_data is not None:
|
if decoder_mm_data is not None:
|
||||||
raise ValueError(
|
raise ValueError(
|
||||||
"Multi-modality decoder inputs of encoder-decoder models are "
|
"Multi-modality decoder inputs of encoder-decoder models are "
|
||||||
|
|||||||
@ -87,6 +87,8 @@ def create_spec_worker(*args, **kwargs) -> "SpecDecodeWorker":
|
|||||||
return spec_decode_worker
|
return spec_decode_worker
|
||||||
|
|
||||||
|
|
||||||
|
# Reminder: Please update docs/source/serving/compatibility_matrix.rst
|
||||||
|
# If the feature combo becomes valid
|
||||||
class SpecDecodeWorker(LoraNotSupportedWorkerBase):
|
class SpecDecodeWorker(LoraNotSupportedWorkerBase):
|
||||||
"""Worker which implements speculative decoding.
|
"""Worker which implements speculative decoding.
|
||||||
|
|
||||||
|
|||||||
@ -41,6 +41,9 @@ logger = init_logger(__name__)
|
|||||||
|
|
||||||
# Exception strings for non-implemented encoder/decoder scenarios
|
# Exception strings for non-implemented encoder/decoder scenarios
|
||||||
|
|
||||||
|
# Reminder: Please update docs/source/serving/compatibility_matrix.rst
|
||||||
|
# If the feature combo becomes valid
|
||||||
|
|
||||||
STR_NOT_IMPL_ENC_DEC_SWA = \
|
STR_NOT_IMPL_ENC_DEC_SWA = \
|
||||||
"Sliding window attention for encoder/decoder models " + \
|
"Sliding window attention for encoder/decoder models " + \
|
||||||
"is not currently supported."
|
"is not currently supported."
|
||||||
|
|||||||
@ -816,6 +816,9 @@ def _pythonize_sampler_output(
|
|||||||
|
|
||||||
for sgdx, (seq_group,
|
for sgdx, (seq_group,
|
||||||
sample_result) in enumerate(zip(seq_groups, samples_list)):
|
sample_result) in enumerate(zip(seq_groups, samples_list)):
|
||||||
|
# Reminder: Please update docs/source/serving/compatibility_matrix.rst
|
||||||
|
# If the feature combo becomes valid
|
||||||
|
# (Check for Guided Decoding)
|
||||||
if seq_group.sampling_params.logits_processors:
|
if seq_group.sampling_params.logits_processors:
|
||||||
assert len(seq_group.sampling_params.logits_processors) == 0, (
|
assert len(seq_group.sampling_params.logits_processors) == 0, (
|
||||||
"Logits Processors are not supported in multi-step decoding")
|
"Logits Processors are not supported in multi-step decoding")
|
||||||
|
|||||||
@ -13,6 +13,9 @@ def assert_enc_dec_mr_supported_scenario(
|
|||||||
a supported scenario.
|
a supported scenario.
|
||||||
'''
|
'''
|
||||||
|
|
||||||
|
# Reminder: Please update docs/source/serving/compatibility_matrix.rst
|
||||||
|
# If the feature combo becomes valid
|
||||||
|
|
||||||
if enc_dec_mr.cache_config.enable_prefix_caching:
|
if enc_dec_mr.cache_config.enable_prefix_caching:
|
||||||
raise NotImplementedError(
|
raise NotImplementedError(
|
||||||
STR_NOT_IMPL_ENC_DEC_ERR_STRS['STR_NOT_IMPL_ENC_DEC_PREFIX_CACHE'])
|
STR_NOT_IMPL_ENC_DEC_ERR_STRS['STR_NOT_IMPL_ENC_DEC_PREFIX_CACHE'])
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user