diff --git a/docs/usage/v1_guide.md b/docs/usage/v1_guide.md
index 64bd0d9bf507..20234e761133 100644
--- a/docs/usage/v1_guide.md
+++ b/docs/usage/v1_guide.md
@@ -107,14 +107,16 @@ to enable simultaneous generation and embedding using the same engine instance i
 #### Mamba Models
 
 Models using selective state-space mechanisms instead of standard transformer attention are supported.
-Models that use Mamba-2 and Mamba-1 layers (e.g., `Mamba2ForCausalLM`, `MambaForCausalLM`) are supported. Please note that these models currently require disabling prefix caching in V1.
+Models that use Mamba-2 and Mamba-1 layers (e.g., `Mamba2ForCausalLM`, `MambaForCausalLM`) are supported.
+Please note that prefix caching is not yet supported for these models.
 
 Models that combine Mamba-2 and Mamba-1 layers with standard attention layers are also supported (e.g., `BambaForCausalLM`,
-`Zamba2ForCausalLM`, `NemotronHForCausalLM`, `FalconH1ForCausalLM` and `GraniteMoeHybridForCausalLM`, `JambaForCausalLM`). Please note that
-these models currently require disabling prefix caching in V1.
+`Zamba2ForCausalLM`, `NemotronHForCausalLM`, `FalconH1ForCausalLM` and `GraniteMoeHybridForCausalLM`, `JambaForCausalLM`).
+Please note that prefix caching is not yet supported for these models.
 
 Hybrid models with mechanisms different to Mamba are also supported (e.g, `MiniMaxText01ForCausalLM`, `MiniMaxM1ForCausalLM`).
-Please note that these models currently require disabling prefix caching and enforcing eager mode in V1.
+Please note that prefix caching is not yet supported for these models.
+It is also necessary to enforce eager mode for these models in V1.
 
 #### Encoder-Decoder Models
 
diff --git a/vllm/model_executor/models/config.py b/vllm/model_executor/models/config.py
index f62209326b98..88b3154de2cb 100644
--- a/vllm/model_executor/models/config.py
+++ b/vllm/model_executor/models/config.py
@@ -292,12 +292,13 @@ class MambaModelConfig(VerifyAndUpdateConfig):
             return
 
         model_config = vllm_config.model_config
+        cache_config = vllm_config.cache_config
         compilation_config = vllm_config.compilation_config
 
-        model_cls, _ = ModelRegistry.resolve_model_cls(
-            model_config.architecture,
-            model_config=model_config,
-        )
+        # TODO(tdoublep): remove once prefix caching is enabled
+        cache_config.enable_prefix_caching = False
+        logger.info("Hybrid or mamba-based model detected: disabling prefix "
+                    "caching since it is not yet supported.")
 
         # TODO(tdoublep): remove as full cuda graph support is added
         FCG_NOT_SUPPORTED_MODELS = [
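
For context, a minimal offline-inference sketch of what this change means for users (not part of the PR; the checkpoint name, prompt, and sampling settings are illustrative only): prefix caching is now switched off automatically by the `MambaModelConfig` hook patched above, so callers no longer need to pass `enable_prefix_caching=False` themselves, while the MiniMax-style hybrids still need `enforce_eager=True`, as the updated docs state.

```python
# Hypothetical usage sketch; the model name and prompt are placeholders.
from vllm import LLM, SamplingParams

# A Mamba-based checkpoint: with this patch, prefix caching is disabled
# automatically inside the engine, so no enable_prefix_caching=False is needed.
llm = LLM(model="state-spaces/mamba-2.8b-hf")

# MiniMax-style hybrids (MiniMaxText01ForCausalLM, MiniMaxM1ForCausalLM)
# would additionally require enforce_eager=True, per the doc change above.
outputs = llm.generate(["Hello, my name is"],
                       SamplingParams(max_tokens=16))
print(outputs[0].outputs[0].text)
```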