diff --git a/vllm/config.py b/vllm/config.py
index 1aadf2c25b430..2669d1a13b37e 100644
--- a/vllm/config.py
+++ b/vllm/config.py
@@ -29,7 +29,7 @@ from vllm.logger import init_logger
 from vllm.model_executor.layers.quantization import (QUANTIZATION_METHODS,
                                                      get_quantization_config)
 from vllm.model_executor.models import ModelRegistry
-from vllm.platforms import CpuArchEnum
+from vllm.platforms import CpuArchEnum, current_platform
 from vllm.sampling_params import GuidedDecodingParams
 from vllm.tracing import is_otel_available, otel_import_error_traceback
 from vllm.transformers_utils.config import (
@@ -684,6 +684,13 @@ class ModelConfig:
             self.max_seq_len_to_capture = self.max_model_len
         self.max_seq_len_to_capture = min(self.max_seq_len_to_capture,
                                           self.max_model_len)
+        ROCM_UNSUPPORTED_MODELS = ['mllama']
+        if (self.hf_config.model_type in ROCM_UNSUPPORTED_MODELS
+                and not self.enforce_eager and current_platform.is_rocm()):
+            logger.warning(
+                "CUDA graph is not supported for %s on ROCm yet, fallback "
+                "to the eager mode.", self.hf_config.model_type)
+            self.enforce_eager = True
 
     def _verify_bnb_config(self) -> None:
         """
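
For context, a minimal standalone sketch (not part of the patch) of the fallback behaviour the second hunk adds: if the model type is on the ROCm-unsupported list and CUDA graphs would otherwise be used, the config is forced into eager mode. `StubConfig`, `apply_rocm_eager_fallback`, and the `is_rocm` flag below are illustrative stand-ins for `ModelConfig`, its post-init check, and `current_platform.is_rocm()`.

```python
from dataclasses import dataclass

# Models whose CUDA-graph capture is not yet supported on ROCm
# (mirrors ROCM_UNSUPPORTED_MODELS in the patch).
ROCM_UNSUPPORTED_MODELS = ['mllama']


@dataclass
class StubConfig:
    model_type: str      # stand-in for hf_config.model_type, e.g. "mllama"
    enforce_eager: bool  # True disables CUDA graph capture
    is_rocm: bool        # stand-in for current_platform.is_rocm()


def apply_rocm_eager_fallback(cfg: StubConfig) -> StubConfig:
    """Force eager mode for models that cannot use CUDA graphs on ROCm."""
    if (cfg.model_type in ROCM_UNSUPPORTED_MODELS
            and not cfg.enforce_eager and cfg.is_rocm):
        print(f"CUDA graph is not supported for {cfg.model_type} on ROCm yet, "
              "falling back to eager mode.")
        cfg.enforce_eager = True
    return cfg


if __name__ == "__main__":
    cfg = apply_rocm_eager_fallback(
        StubConfig(model_type="mllama", enforce_eager=False, is_rocm=True))
    assert cfg.enforce_eager  # mllama on ROCm ends up in eager mode
```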