diff --git a/tests/models/encoder_decoder/vision_language/test_mllama.py b/tests/models/encoder_decoder/vision_language/test_mllama.py
index b6ea31cc5717..ae7a7b028b15 100644
--- a/tests/models/encoder_decoder/vision_language/test_mllama.py
+++ b/tests/models/encoder_decoder/vision_language/test_mllama.py
@@ -215,7 +215,6 @@ def _run_test(
                      max_num_seqs=2,
                      tensor_parallel_size=tensor_parallel_size,
                      distributed_executor_backend=distributed_executor_backend,
-                     enforce_eager=True,
                      limit_mm_per_prompt={"image": _LIMIT_IMAGE_PER_PROMPT
                                           }) as vllm_model:
         vllm_outputs_per_image = [
@@ -425,7 +424,6 @@ def test_bnb_regression(
         dtype=dtype,
         max_model_len=4096,
         max_num_seqs=2,
-        enforce_eager=True,
         quantization="bitsandbytes",
         load_format="bitsandbytes",
     )
@@ -481,7 +479,6 @@ def test_explicit_implicit_prompt(
         max_model_len=4096,
         max_num_seqs=2,
         tensor_parallel_size=1,
-        enforce_eager=True,
     )
     sampling_params = SamplingParams(
         temperature=0,
@@ -513,7 +510,6 @@ def test_regression(vllm_runner, image_assets, model, dtype, max_tokens,
                     max_model_len=4096,
                     max_num_seqs=2,
                     tensor_parallel_size=1,
-                    enforce_eager=True,
                     limit_mm_per_prompt={"image":
                                          _LIMIT_IMAGE_PER_PROMPT}) as vllm_model:
 
diff --git a/vllm/config.py b/vllm/config.py
index ffff3b7c8a8e..b7313e68362c 100644
--- a/vllm/config.py
+++ b/vllm/config.py
@@ -670,14 +670,6 @@ class ModelConfig:
         self.max_seq_len_to_capture = min(self.max_seq_len_to_capture,
                                           self.max_model_len)
 
-        MODEL_NOT_SUPPORT_CUDA_GRAPH = ['mllama']
-        if (self.hf_config.model_type in MODEL_NOT_SUPPORT_CUDA_GRAPH
-                and not self.enforce_eager):
-            logger.warning(
-                "CUDA graph is not supported for %s yet, fallback to the eager "
-                "mode.", self.hf_config.model_type)
-            self.enforce_eager = True
-
     def _verify_bnb_config(self) -> None:
         """
         The current version of bitsandbytes (0.44.0) with 8-bit models does not
diff --git a/vllm/model_executor/models/mllama.py b/vllm/model_executor/models/mllama.py
index 8e98eb273cd8..9ed49597cf82 100644
--- a/vllm/model_executor/models/mllama.py
+++ b/vllm/model_executor/models/mllama.py
@@ -1368,7 +1368,7 @@ class MllamaForConditionalGeneration(nn.Module, SupportsMultiModal,
             full_text_row_masked_out_mask = (
                 attn_metadata.encoder_seq_lens_tensor
                 != 0).reshape(-1, 1).to(input_ids.device)
-            skip_cross_attention = max(attn_metadata.encoder_seq_lens) == 0
+            skip_cross_attention = attn_metadata.max_encoder_seq_len == 0
 
         # For image-present prefill.
         else:
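
Reviewer note: with the config.py fallback removed, mllama no longer forces eager mode, so the tests drop `enforce_eager=True` and the model code computes `skip_cross_attention` from the scalar `max_encoder_seq_len` already stored on the attention metadata instead of a Python-level `max()` over the per-sequence list. The sketch below is a minimal, hypothetical illustration of that equivalence; `FakeAttnMetadata` and both helper functions are stand-ins for the real vLLM metadata and model code, not actual APIs.

# Illustrative sketch only: FakeAttnMetadata is a hypothetical stand-in for
# vLLM's attention metadata; it assumes max_encoder_seq_len is kept in sync
# with encoder_seq_lens, as the mllama.py hunk relies on.
from dataclasses import dataclass
from typing import List


@dataclass
class FakeAttnMetadata:
    encoder_seq_lens: List[int]

    @property
    def max_encoder_seq_len(self) -> int:
        # In the real metadata this value is precomputed by the builder;
        # modeled here as a property for brevity.
        return max(self.encoder_seq_lens, default=0)


def skip_cross_attention_old(meta: FakeAttnMetadata) -> bool:
    # Before the change: Python-level max() over the per-sequence list.
    return max(meta.encoder_seq_lens) == 0


def skip_cross_attention_new(meta: FakeAttnMetadata) -> bool:
    # After the change: read the scalar already stored on the metadata.
    return meta.max_encoder_seq_len == 0


if __name__ == "__main__":
    text_only = FakeAttnMetadata(encoder_seq_lens=[0, 0])
    with_image = FakeAttnMetadata(encoder_seq_lens=[0, 1601])
    assert skip_cross_attention_old(text_only) == skip_cross_attention_new(text_only)
    assert skip_cross_attention_old(with_image) == skip_cross_attention_new(with_image)
    assert skip_cross_attention_new(text_only) and not skip_cross_attention_new(with_image)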