diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index 2dcca5711b3d5..9d0b3fdd3a02c 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -1223,6 +1223,8 @@ steps: # FIXIT: find out which code initialize cuda before running the test # before the fix, we need to use spawn to test it - export VLLM_WORKER_MULTIPROC_METHOD=spawn + # A lot of these tests are on the edge of OOMing + - export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True # There is some Tensor Parallelism related processing logic in LoRA that # requires multi-GPU testing for validation. - pytest -v -s -x lora/test_chatglm3_tp.py diff --git a/tests/lora/test_utils.py b/tests/lora/test_utils.py index eb026c2ec0209..bec12eeeb48d5 100644 --- a/tests/lora/test_utils.py +++ b/tests/lora/test_utils.py @@ -3,7 +3,7 @@ from collections import OrderedDict from typing import NamedTuple -from unittest.mock import patch +from unittest.mock import MagicMock, patch import pytest from huggingface_hub.utils import HfHubHTTPError @@ -194,5 +194,8 @@ def test_get_adapter_absolute_path_huggingface_error( # Hugging Face model identifier with download error path = "org/repo" mock_exist.return_value = False - mock_snapshot_download.side_effect = HfHubHTTPError("failed to query model info") + mock_snapshot_download.side_effect = HfHubHTTPError( + "failed to query model info", + response=MagicMock(), + ) assert get_adapter_absolute_path(path) == path diff --git a/tests/models/multimodal/generation/test_vit_backend_functionality.py b/tests/models/multimodal/generation/test_vit_backend_functionality.py index 78797ff7c1979..a4e4ce312ddd4 100644 --- a/tests/models/multimodal/generation/test_vit_backend_functionality.py +++ b/tests/models/multimodal/generation/test_vit_backend_functionality.py @@ -388,6 +388,7 @@ def run_video_test(config, mm_encoder_attn_backend, video_assets, vllm_runner): "mm_encoder_attn_backend", [None] + current_platform.get_supported_vit_attn_backends(), ) 
+@pytest.mark.skip(reason="Broken test due to memory segmentation fault") @create_new_process_for_each_test() def test_vit_backend_functionality( model_key: str, diff --git a/vllm/_aiter_ops.py b/vllm/_aiter_ops.py index 010817e79a936..c32bf04c71c1f 100644 --- a/vllm/_aiter_ops.py +++ b/vllm/_aiter_ops.py @@ -642,48 +642,130 @@ _OPS_REGISTERED = False class rocm_aiter_ops: + """ROCm AITER operations wrapper for AMD GPU acceleration in vLLM. + + This class centralizes the import and registration of AITER ops, + and provides a unified interface for checking if AITER is enabled. + Operations are only available on supported gfx9 + architectures when aiter is installed. + + The class uses environment variables to control which features are enabled, + allowing fine-grained control over which AITER optimizations are used. + + Environment Variables: + VLLM_ROCM_USE_AITER: Main toggle for all AITER operations. + VLLM_ROCM_USE_AITER_LINEAR: Controls GEMM and quantization ops. + VLLM_ROCM_USE_AITER_RMSNORM: Controls RMSNorm operations. + VLLM_ROCM_USE_AITER_MOE: Controls MoE (Mixture of Experts) ops. + VLLM_ROCM_USE_AITER_MLA: Controls MLA (Multi-head Latent Attention) ops. + VLLM_ROCM_USE_AITER_MHA: Controls MHA ops including flash_attn_varlen. + VLLM_ROCM_USE_AITER_UNIFIED_ATTENTION: Controls Triton unified attention. + VLLM_ROCM_USE_AITER_FP8BMM: Controls FP8 batched matrix multiply. + VLLM_ROCM_USE_AITER_FP4_ASM_GEMM: Controls FP4 assembly GEMM. + VLLM_ROCM_USE_AITER_TRITON_ROPE: Controls Triton rotary embeddings. + VLLM_ROCM_USE_AITER_FUSION_SHARED_EXPERTS: Controls shared expert fusion. + VLLM_ROCM_USE_AITER_TRITON_GEMM: Controls Triton unquantized GEMM. + + Note: + The environment variables are assigned when the module is imported, + so you can't change the environment variables after the module is imported. + This is done out of performance consideration. 
Accessing environment variables + is expensive as described in issue https://github.com/vllm-project/vllm/issues/17067 + so we don't want to do it repeatedly, especially in the hot path (the forward pass). + You can call the refresh_env_variables() function to reload the env variables + after monkey patching the env variables in the unit test. + + Check Functions: + All check functions (is_*_enabled) are decorated with @if_aiter_supported, + which verifies: (1) platform is ROCm, (2) device arch is gfx9, and + (3) aiter library is installed. The check function then also verifies + the corresponding environment variable is enabled. + i.e. ___ + is_enabled() == current_platform.is_rocm() and | checked by + current_platform.is_on_gfx9() and | @if_aiter_supported + IS_AITER_FOUND and _______________| + cls._AITER_ENABLED -----> Check by the logic in `is_enabled()` + + Example: + from vllm._aiter_ops import rocm_aiter_ops + + # Check if aiter is enabled before using operations + if rocm_aiter_ops.is_enabled(): + result = rocm_aiter_ops.rms_norm(x, weight, epsilon) + + Operations: + - RMS normalization: rms_norm, rms_norm2d_with_add + - GEMM operations: gemm_a8w8, gemm_a8w8_blockscale + - Fused MoE: fused_moe, asm_moe_tkw1 + - Routing: topk_softmax, biased_grouped_topk, grouped_topk + - MLA decode: mla_decode_fwd + - Quantization: per_tensor_quant, per_token_quant, group_fp8_quant + - Triton ops: triton_rotary_embed, triton_fp8_bmm, triton_gemm_a8w8_blockscale + """ + + # Check if the env variable is set _AITER_ENABLED = envs.VLLM_ROCM_USE_AITER _LINEAR_ENABLED = envs.VLLM_ROCM_USE_AITER_LINEAR _RMSNORM_ENABLED = envs.VLLM_ROCM_USE_AITER_RMSNORM _FMOE_ENABLED = envs.VLLM_ROCM_USE_AITER_MOE _MLA_ENABLED = envs.VLLM_ROCM_USE_AITER_MLA - _PG_ATTN_ENABLED = envs.VLLM_ROCM_USE_AITER_PAGED_ATTN _MHA_ENABLED = envs.VLLM_ROCM_USE_AITER_MHA _TRITON_UNIFIED_ATTN_ENABLED = envs.VLLM_ROCM_USE_AITER_UNIFIED_ATTENTION + # TODO: Consolidate under _LINEAR_ENABLED _FP8BMM_ENABLED = 
envs.VLLM_ROCM_USE_AITER_FP8BMM + # TODO: Consolidate under _LINEAR_ENABLED _FP4_GEMM_DYNAMIC_QUANT_ASM = envs.VLLM_ROCM_USE_AITER_FP4_ASM_GEMM + # TODO: Consolidate under VLLM_ROCM_USE_AITER_ROPE _TRITON_ROTARY_EMBED = envs.VLLM_ROCM_USE_AITER_TRITON_ROPE _MOE_SHARED_EXPERTS_ENABLED = envs.VLLM_ROCM_USE_AITER_FUSION_SHARED_EXPERTS + # TODO: Consolidate under _LINEAR_ENABLED _TRITON_UNQUANT_GEMM = envs.VLLM_ROCM_USE_AITER_TRITON_GEMM + @classmethod + def refresh_env_variables(cls): + """ + Since the environment variables are assigned when the module is imported, + This is a helper function to reload all the env variables from + the environment variables. + for example, after monkey patching the env variables in the unit test, + you can call this function to reload the env variables. + """ + cls._AITER_ENABLED = envs.VLLM_ROCM_USE_AITER + cls._LINEAR_ENABLED = envs.VLLM_ROCM_USE_AITER_LINEAR + cls._RMSNORM_ENABLED = envs.VLLM_ROCM_USE_AITER_RMSNORM + cls._FMOE_ENABLED = envs.VLLM_ROCM_USE_AITER_MOE + cls._MLA_ENABLED = envs.VLLM_ROCM_USE_AITER_MLA + cls._MHA_ENABLED = envs.VLLM_ROCM_USE_AITER_MHA + cls._TRITON_UNIFIED_ATTN_ENABLED = envs.VLLM_ROCM_USE_AITER_UNIFIED_ATTENTION + cls._FP8BMM_ENABLED = envs.VLLM_ROCM_USE_AITER_FP8BMM + cls._FP4_GEMM_DYNAMIC_QUANT_ASM = envs.VLLM_ROCM_USE_AITER_FP4_ASM_GEMM + cls._TRITON_ROTARY_EMBED = envs.VLLM_ROCM_USE_AITER_TRITON_ROPE + cls._MOE_SHARED_EXPERTS_ENABLED = envs.VLLM_ROCM_USE_AITER_FUSION_SHARED_EXPERTS + cls._TRITON_UNQUANT_GEMM = envs.VLLM_ROCM_USE_AITER_TRITON_GEMM + @classmethod @if_aiter_supported def is_enabled(cls) -> bool: - """Verifies device specs and availability of aiter main env variable.""" return cls._AITER_ENABLED @classmethod @if_aiter_supported def is_linear_enabled(cls) -> bool: - """ "Verifies device specs and availability of env variable.""" return cls._AITER_ENABLED and cls._LINEAR_ENABLED @classmethod @if_aiter_supported def is_linear_fp8_enaled(cls) -> bool: - """ "Verifies device specs and 
availability of env variable.""" return cls.is_linear_enabled() @classmethod @if_aiter_supported def is_rmsnorm_enabled(cls) -> bool: - """ "Verifies device specs and availability of env variable.""" return cls._AITER_ENABLED and cls._RMSNORM_ENABLED @classmethod @if_aiter_supported def is_fused_moe_enabled(cls) -> bool: - """ "Verifies device specs and availability of env variable.""" return cls._AITER_ENABLED and cls._FMOE_ENABLED @classmethod @@ -694,25 +776,16 @@ class rocm_aiter_ops: @classmethod @if_aiter_supported def is_mla_enabled(cls) -> bool: - """ "Verifies device specs and availability of env variable.""" return cls._AITER_ENABLED and cls._MLA_ENABLED @classmethod @if_aiter_supported def is_mha_enabled(cls) -> bool: - """ "Verifies device specs and availability of env variable.""" return cls._AITER_ENABLED and cls._MHA_ENABLED - @classmethod - @if_aiter_supported - def is_pa_attn_enabled(cls) -> bool: - """ "Verifies device specs and availability of env variable.""" - return cls._AITER_ENABLED and cls._PG_ATTN_ENABLED - @classmethod @if_aiter_supported def is_triton_unified_attn_enabled(cls) -> bool: - """ "Verifies device specs and availability of env variable.""" return cls._AITER_ENABLED and cls._TRITON_UNIFIED_ATTN_ENABLED @classmethod diff --git a/vllm/config/compilation.py b/vllm/config/compilation.py index 1fdb843e1a7c7..4a98494b3c7b3 100644 --- a/vllm/config/compilation.py +++ b/vllm/config/compilation.py @@ -937,7 +937,7 @@ class CompilationConfig: or self.cudagraph_mode == CUDAGraphMode.FULL_AND_PIECEWISE ): logger.warning_once( - "Using piecewise compilation with empty splitting_ops" + "Using piecewise cudagraph with empty splitting_ops" ) if self.cudagraph_mode == CUDAGraphMode.PIECEWISE: logger.warning_once( diff --git a/vllm/model_executor/layers/fused_moe/shared_fused_moe.py b/vllm/model_executor/layers/fused_moe/shared_fused_moe.py index 60aa1c088b4d8..a143347b19f2c 100644 --- a/vllm/model_executor/layers/fused_moe/shared_fused_moe.py 
+++ b/vllm/model_executor/layers/fused_moe/shared_fused_moe.py @@ -29,14 +29,14 @@ class SharedFusedMoE(FusedMoE): self._shared_experts = shared_experts # Disable shared expert overlap if: - # - we are using eplb, because of correctness issues - # - we are using flashinfer with DP, since there nothing to gain + # - we are using eplb with non-default backend, because of correctness issues + # - we are using flashinfer with DP, since there is nothing to gain # - we are using marlin kernels + backend = self.moe_parallel_config.all2all_backend self.use_overlapped = ( use_overlapped and not ( - # TODO(wentao): find the root cause and remove this condition - self.enable_eplb + (self.enable_eplb and backend != "allgather_reducescatter") or (self.moe_config.use_flashinfer_cutlass_kernels and self.dp_size > 1) ) and self._shared_experts is not None diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py index 18c2ab026b2ba..f650a6eabbb9c 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py @@ -469,16 +469,14 @@ class CompressedTensorsW4A4Nvfp4MoEMethod(CompressedTensorsMoEMethod): ) logger.debug_once("Finished shuffling weights for TRT-LLM MOE") - layer.gemm1_weights_fp4_shuffled = Parameter( + layer.w13_weight = Parameter( gemm1_weights_fp4_shuffled, requires_grad=False ) - layer.gemm2_weights_fp4_shuffled = Parameter( - gemm2_weights_fp4_shuffled, requires_grad=False - ) - layer.gemm1_scales_fp4_shuffled = Parameter( + layer.w2_weight = Parameter(gemm2_weights_fp4_shuffled, requires_grad=False) + layer.w13_weight_scale = Parameter( gemm1_scales_fp4_shuffled, requires_grad=False ) - layer.gemm2_scales_fp4_shuffled = Parameter( + layer.w2_weight_scale = Parameter( gemm2_scales_fp4_shuffled, requires_grad=False ) @@ 
-487,12 +485,6 @@ class CompressedTensorsW4A4Nvfp4MoEMethod(CompressedTensorsMoEMethod): (layer.w2_input_scale_quant * layer.g1_alphas).to(torch.float32), requires_grad=False, ) - - # Clean up weights that won't be used by TRT-LLM - del layer.w2_weight - del layer.w2_weight_scale - del layer.w13_weight - del layer.w13_weight_scale else: # swizzle weight scales layer.w13_weight_scale = torch.nn.Parameter( diff --git a/vllm/model_executor/layers/quantization/modelopt.py b/vllm/model_executor/layers/quantization/modelopt.py index 030d85080a34d..f71854e6b63c5 100644 --- a/vllm/model_executor/layers/quantization/modelopt.py +++ b/vllm/model_executor/layers/quantization/modelopt.py @@ -1458,16 +1458,14 @@ class ModelOptNvFp4FusedMoE(FusedMoEMethodBase): ) logger.debug_once("Finished shuffling weights for TRT-LLM MOE") - layer.gemm1_weights_fp4_shuffled = Parameter( + layer.w13_weight = Parameter( gemm1_weights_fp4_shuffled, requires_grad=False ) - layer.gemm2_weights_fp4_shuffled = Parameter( - gemm2_weights_fp4_shuffled, requires_grad=False - ) - layer.gemm1_scales_fp4_shuffled = Parameter( + layer.w2_weight = Parameter(gemm2_weights_fp4_shuffled, requires_grad=False) + layer.w13_weight_scale = Parameter( gemm1_scales_fp4_shuffled, requires_grad=False ) - layer.gemm2_scales_fp4_shuffled = Parameter( + layer.w2_weight_scale = Parameter( gemm2_scales_fp4_shuffled, requires_grad=False ) @@ -1476,12 +1474,6 @@ class ModelOptNvFp4FusedMoE(FusedMoEMethodBase): (layer.w2_input_scale_quant * layer.g1_alphas).to(torch.float32), requires_grad=False, ) - - # Clean up weights that won't be used by TRT-LLM - del layer.w2_weight - del layer.w2_weight_scale - del layer.w13_weight - del layer.w13_weight_scale elif self.use_marlin: # Marlin processing prepare_moe_fp4_layer_for_marlin(layer) diff --git a/vllm/model_executor/layers/quantization/utils/flashinfer_fp4_moe.py b/vllm/model_executor/layers/quantization/utils/flashinfer_fp4_moe.py index e424cd0e1ac99..76bce8a8d98d6 100644 --- 
a/vllm/model_executor/layers/quantization/utils/flashinfer_fp4_moe.py +++ b/vllm/model_executor/layers/quantization/utils/flashinfer_fp4_moe.py @@ -301,18 +301,14 @@ def flashinfer_trtllm_fp4_moe( hidden_states_scale=hidden_states_scale_linear_fp4.view( torch.float8_e4m3fn ).flatten(), - gemm1_weights=layer.gemm1_weights_fp4_shuffled.data, - gemm1_weights_scale=layer.gemm1_scales_fp4_shuffled.data.view( - torch.float8_e4m3fn - ), + gemm1_weights=layer.w13_weight.data, + gemm1_weights_scale=layer.w13_weight_scale.data.view(torch.float8_e4m3fn), gemm1_bias=None, gemm1_alpha=None, gemm1_beta=None, gemm1_clamp_limit=None, - gemm2_weights=layer.gemm2_weights_fp4_shuffled.data, - gemm2_weights_scale=layer.gemm2_scales_fp4_shuffled.data.view( - torch.float8_e4m3fn - ), + gemm2_weights=layer.w2_weight.data, + gemm2_weights_scale=layer.w2_weight_scale.data.view(torch.float8_e4m3fn), gemm2_bias=None, output1_scale_scalar=layer.g1_scale_c.data, output1_scale_gate_scalar=layer.g1_alphas.data, @@ -380,18 +376,14 @@ def flashinfer_trtllm_fp4_routed_moe( hidden_states_scale=hidden_states_scale_linear_fp4.view( torch.float8_e4m3fn ).flatten(), - gemm1_weights=layer.gemm1_weights_fp4_shuffled.data, - gemm1_weights_scale=layer.gemm1_scales_fp4_shuffled.data.view( - torch.float8_e4m3fn - ), + gemm1_weights=layer.w13_weight.data, + gemm1_weights_scale=layer.w13_weight_scale.data.view(torch.float8_e4m3fn), gemm1_bias=None, gemm1_alpha=None, gemm1_beta=None, gemm1_clamp_limit=None, - gemm2_weights=layer.gemm2_weights_fp4_shuffled.data, - gemm2_weights_scale=layer.gemm2_scales_fp4_shuffled.data.view( - torch.float8_e4m3fn - ), + gemm2_weights=layer.w2_weight.data, + gemm2_weights_scale=layer.w2_weight_scale.data.view(torch.float8_e4m3fn), gemm2_bias=None, output1_scale_scalar=layer.g1_scale_c.data, output1_scale_gate_scalar=layer.g1_alphas.data, diff --git a/vllm/model_executor/models/bert.py b/vllm/model_executor/models/bert.py index e774cd647ea8c..ee429bf458843 100644 --- 
a/vllm/model_executor/models/bert.py +++ b/vllm/model_executor/models/bert.py @@ -55,7 +55,9 @@ class BertEmbedding(nn.Module): "position_ids", torch.arange(config.max_position_embeddings).unsqueeze(0), ) - self.position_embedding_type = config.position_embedding_type + self.position_embedding_type = getattr( + config, "position_embedding_type", "absolute" + ) if self.position_embedding_type != "absolute": raise ValueError( "Only 'absolute' position_embedding_type" + " is supported" diff --git a/vllm/model_executor/models/roberta.py b/vllm/model_executor/models/roberta.py index 31cc645099141..45b6e93307ac3 100644 --- a/vllm/model_executor/models/roberta.py +++ b/vllm/model_executor/models/roberta.py @@ -57,12 +57,6 @@ class RobertaEmbedding(nn.Module): torch.arange(config.max_position_embeddings).unsqueeze(0), ) - self.position_embedding_type = config.position_embedding_type - if self.position_embedding_type != "absolute": - raise ValueError( - "Only 'absolute' position_embedding_type" + " is supported" - ) - def forward( self, input_ids: torch.Tensor, @@ -135,12 +129,12 @@ class RobertaEmbeddingModel(BertEmbeddingModel): def _build_model( self, vllm_config: VllmConfig, prefix: str = "" ) -> BertModel | BertWithRope: - if vllm_config.model_config.hf_config.position_embedding_type == "rotary": - return JinaRobertaModel(vllm_config=vllm_config, prefix=prefix) + hf_config = vllm_config.model_config.hf_config + kwargs = dict(vllm_config=vllm_config, prefix=prefix) + if getattr(hf_config, "position_embedding_type", "absolute") == "absolute": + return BertModel(**kwargs, embedding_class=RobertaEmbedding) else: - return BertModel( - vllm_config=vllm_config, prefix=prefix, embedding_class=RobertaEmbedding - ) + return JinaRobertaModel(**kwargs) def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]): weights_list = list(weights) diff --git a/vllm/model_executor/models/swin.py b/vllm/model_executor/models/swin.py index a74fd80c06d8c..fbf5594851ece 100644 --- 
a/vllm/model_executor/models/swin.py +++ b/vllm/model_executor/models/swin.py @@ -102,7 +102,6 @@ class SwinSelfAttention(nn.Module): self, hidden_states: torch.Tensor, attention_mask: torch.FloatTensor | None = None, - head_mask: torch.FloatTensor | None = None, output_attentions: bool | None = False, ) -> tuple[torch.Tensor, ...]: batch_size, dim, num_channels = hidden_states.shape @@ -201,12 +200,9 @@ class SwinAttention(nn.Module): self, hidden_states: torch.Tensor, attention_mask: torch.FloatTensor | None = None, - head_mask: torch.FloatTensor | None = None, output_attentions: bool | None = False, ) -> tuple[torch.Tensor]: - self_outputs = self.self( - hidden_states, attention_mask, head_mask, output_attentions - ) + self_outputs = self.self(hidden_states, attention_mask, output_attentions) attention_output = self.output(self_outputs[0], hidden_states) outputs = (attention_output,) + self_outputs[1:] return outputs @@ -339,18 +335,14 @@ class SwinStage(nn.Module): self, hidden_states: torch.Tensor, input_dimensions: tuple[int, int], - head_mask: torch.FloatTensor | None = None, output_attentions: bool | None = False, always_partition: bool | None = False, ) -> tuple[torch.Tensor]: height, width = input_dimensions for i, layer_module in enumerate(self.blocks): - layer_head_mask = head_mask[i] if head_mask is not None else None - layer_outputs = layer_module( hidden_states, input_dimensions, - layer_head_mask, output_attentions, always_partition, ) @@ -425,17 +417,13 @@ class SwinEncoder(nn.Module): self, hidden_states: torch.Tensor, input_dimensions: tuple[int, int], - head_mask: torch.FloatTensor | None = None, output_attentions: bool | None = False, always_partition: bool | None = False, ) -> tuple[torch.Tensor]: for i, layer_module in enumerate(self.layers): - layer_head_mask = head_mask[i] if head_mask is not None else None - layer_outputs = layer_module( hidden_states, input_dimensions, - layer_head_mask, output_attentions, always_partition, ) @@ -473,7 
+461,6 @@ class SwinModel(nn.Module): def forward( self, pixel_values: torch.FloatTensor | None = None, - head_mask: torch.FloatTensor | None = None, output_attentions: bool | None = None, ) -> tuple[torch.Tensor]: embedding_output, input_dimensions = self.embeddings(pixel_values) @@ -481,7 +468,6 @@ class SwinModel(nn.Module): encoder_outputs = self.encoder( embedding_output, input_dimensions, - head_mask=head_mask, output_attentions=output_attentions, ) diff --git a/vllm/model_executor/models/ultravox.py b/vllm/model_executor/models/ultravox.py index 32a2ba1ef38f7..7e1b7c90c9204 100644 --- a/vllm/model_executor/models/ultravox.py +++ b/vllm/model_executor/models/ultravox.py @@ -5,6 +5,7 @@ """PyTorch Ultravox model.""" import copy +import inspect from collections.abc import Iterable, Mapping, Sequence from types import SimpleNamespace from typing import Annotated, Any, Literal, TypeAlias @@ -380,11 +381,17 @@ class UltravoxTransformerProjector(nn.Module, ModuleUtilsMixin): ) hidden_states = hidden_states + positions + # Backward compatibility for Transformers v4 where layer_head_mask + # was a required argument for WhisperEncoderLayer.forward + kwargs = {} + if "layer_head_mask" in inspect.signature(self.layers[0].forward).parameters: + kwargs["layer_head_mask"] = None + for layer in self.layers: layer_outputs = layer( hidden_states, attention_mask=extended_attention_mask, - layer_head_mask=None, + **kwargs, ) hidden_states = layer_outputs[0] @@ -479,11 +486,17 @@ class ModifiedWhisperEncoder(WhisperEncoder): attention_mask = self.get_attention_mask_by_audio_len(audio_lens, hidden_states) + # Backward compatibility for Transformers v4 where layer_head_mask + # was a required argument for WhisperEncoderLayer.forward + kwargs = {} + if "layer_head_mask" in inspect.signature(self.layers[0].forward).parameters: + kwargs["layer_head_mask"] = None + for encoder_layer in self.layers: layer_outputs = encoder_layer( hidden_states, attention_mask, - layer_head_mask=None, + 
**kwargs, ) hidden_states = layer_outputs[0] diff --git a/vllm/platforms/rocm.py b/vllm/platforms/rocm.py index e469a928da229..c237f7cf887c1 100644 --- a/vllm/platforms/rocm.py +++ b/vllm/platforms/rocm.py @@ -124,8 +124,6 @@ def use_rocm_custom_paged_attention( alibi_slopes: torch.Tensor | None = None, sinks: torch.Tensor | None = None, ) -> bool: - from vllm._aiter_ops import rocm_aiter_ops - GPU_ARCH = torch.cuda.get_device_properties("cuda").gcnArchName ON_GFX9 = any(arch in GPU_ARCH for arch in ["gfx90a", "gfx942", "gfx950"]) ON_GFX11_GFX12 = any(arch in GPU_ARCH for arch in ["gfx11", "gfx12"]) @@ -141,7 +139,6 @@ def use_rocm_custom_paged_attention( and (gqa_ratio >= 1 and gqa_ratio <= 16) and max_seq_len <= 128 * 1024 and (envs.VLLM_ROCM_CUSTOM_PAGED_ATTN) - and not (rocm_aiter_ops.is_pa_attn_enabled()) and sinks is None ) diff --git a/vllm/v1/attention/backends/mla/rocm_aiter_mla.py b/vllm/v1/attention/backends/mla/rocm_aiter_mla.py index 00a0a77a1c2f7..589d6ef2f6348 100644 --- a/vllm/v1/attention/backends/mla/rocm_aiter_mla.py +++ b/vllm/v1/attention/backends/mla/rocm_aiter_mla.py @@ -15,6 +15,7 @@ from vllm.v1.attention.backends.mla.common import ( MLACommonImpl, MLACommonMetadata, MLACommonMetadataBuilder, + QueryLenSupport, ) from vllm.v1.attention.backends.utils import AttentionCGSupport from vllm.v1.kv_cache_interface import AttentionSpec @@ -51,6 +52,8 @@ class AiterMLADecodeMetadata(MLACommonDecodeMetadata): qo_indptr: torch.Tensor | None = None # The dtype of MLA out tensor attn_out_dtype: torch.dtype = torch.bfloat16 + # The max query output length: int + max_qo_len: int | None = None class AiterMLAMetadata(MLACommonMetadata[AiterMLADecodeMetadata]): @@ -60,9 +63,8 @@ class AiterMLAMetadata(MLACommonMetadata[AiterMLADecodeMetadata]): class AiterMLAMetadataBuilder(MLACommonMetadataBuilder[AiterMLAMetadata]): # TODO(luka, lucas): audit this as part of: # https://github.com/vllm-project/vllm/issues/22945 - _cudagraph_support: 
ClassVar[AttentionCGSupport] = ( - AttentionCGSupport.UNIFORM_SINGLE_TOKEN_DECODE - ) + _cudagraph_support: ClassVar[AttentionCGSupport] = AttentionCGSupport.UNIFORM_BATCH + query_len_support: ClassVar[QueryLenSupport] = QueryLenSupport.UNIFORM def __init__( self, @@ -97,8 +99,8 @@ class AiterMLAMetadataBuilder(MLACommonMetadataBuilder[AiterMLAMetadata]): max_num_reqs, dtype=torch.int32, device=device ) - self.qo_indptr = torch.arange( - 0, max_num_reqs + 1, dtype=torch.int32, device=device + self.qo_indptr = torch.zeros( + max_num_reqs + 1, dtype=torch.int32, device=device ) def _build_decode( @@ -128,6 +130,8 @@ class AiterMLAMetadataBuilder(MLACommonMetadataBuilder[AiterMLAMetadata]): seq_lens_device.cumsum(dim=0, dtype=torch.int32), ] ) + qo_len = query_start_loc_cpu[1:] - query_start_loc_cpu[:-1] + max_qo_len = qo_len.max().item() if self.compilation_config.cudagraph_mode.has_full_cudagraphs(): num_actual_pages = paged_kv_indices.size(0) @@ -150,6 +154,10 @@ class AiterMLAMetadataBuilder(MLACommonMetadataBuilder[AiterMLAMetadata]): self.paged_kv_last_page_len[num_reqs:].fill_(1) paged_kv_last_page_len = self.paged_kv_last_page_len[:num_reqs] + self.qo_indptr[: 1 + num_reqs].copy_( + query_start_loc_device, non_blocking=True + ) + self.qo_indptr[1 + num_reqs :] = query_start_loc_device[-1] qo_indptr = self.qo_indptr[: 1 + num_reqs] else: @@ -165,6 +173,7 @@ class AiterMLAMetadataBuilder(MLACommonMetadataBuilder[AiterMLAMetadata]): paged_kv_last_page_len=paged_kv_last_page_len, qo_indptr=qo_indptr, dcp_tot_seq_lens=dcp_tot_seq_lens_device, + max_qo_len=max_qo_len, attn_out_dtype=self.decode_attn_out_dtype, ) @@ -255,16 +264,13 @@ class AiterMLAImpl(MLACommonImpl[AiterMLAMetadata]): kv_buffer = kv_c_and_k_pe_cache.unsqueeze(2) - # max_seqlen_qo must be 1 except for MTP - # TODO: Find the best value for MTP - max_seqlen_qo = 1 rocm_aiter_ops.mla_decode_fwd( q, kv_buffer, o, self.scale, attn_metadata.decode.qo_indptr, - max_seqlen_qo, + 
attn_metadata.decode.max_qo_len, attn_metadata.decode.paged_kv_indptr, attn_metadata.decode.paged_kv_indices, attn_metadata.decode.paged_kv_last_page_len, diff --git a/vllm/v1/core/sched/scheduler.py b/vllm/v1/core/sched/scheduler.py index 278970ae7ee88..754e0b9d08316 100644 --- a/vllm/v1/core/sched/scheduler.py +++ b/vllm/v1/core/sched/scheduler.py @@ -187,6 +187,12 @@ class Scheduler(SchedulerInterface): if self.is_encoder_decoder else EncoderCacheManager(cache_size=encoder_cache_size) ) + # For encoder-decoder models, allocate the maximum number of tokens for Cross + # Attn blocks, as for Whisper its input is always padded to the maximum length. + # TODO (NickLucche): Generalize to models with variable-length encoder inputs. + self._num_encoder_max_input_tokens = ( + MULTIMODAL_REGISTRY.get_encdec_max_encoder_len(vllm_config.model_config) + ) speculative_config = vllm_config.speculative_config self.use_eagle = False @@ -568,17 +574,11 @@ class Scheduler(SchedulerInterface): 0 if request.num_computed_tokens == 0 else self.num_lookahead_tokens ) - # Determine if we need to allocate cross-attention blocks. - if self.is_encoder_decoder and request.has_encoder_inputs: - # TODO(russellb): For Whisper, we know that the input is - # always padded to the maximum length. If we support other - # encoder-decoder models, this will need to be updated if we - # want to only allocate what is needed. 
- num_encoder_tokens = ( - self.scheduler_config.max_num_encoder_input_tokens - ) - else: - num_encoder_tokens = 0 + num_encoder_tokens = ( + self._num_encoder_max_input_tokens + if self.is_encoder_decoder and request.has_encoder_inputs + else 0 + ) new_blocks = self.kv_cache_manager.allocate_slots( request, diff --git a/vllm/v1/structured_output/utils.py b/vllm/v1/structured_output/utils.py index ae42b33f80f88..cb5ad99cfbdf7 100644 --- a/vllm/v1/structured_output/utils.py +++ b/vllm/v1/structured_output/utils.py @@ -21,8 +21,8 @@ from vllm.v1.core.sched.output import GrammarOutput, SchedulerOutput if TYPE_CHECKING: import outlines_core as oc import transformers.file_utils as file_utils - import transformers.models.gpt2.tokenization_gpt2 as tokenization_gpt2 import xgrammar as xgr + from transformers.convert_slow_tokenizer import bytes_to_unicode from vllm.tokenizers import TokenizerLike from vllm.v1.worker.gpu_input_batch import InputBatch @@ -30,10 +30,8 @@ else: xgr = LazyLoader("xgr", globals(), "xgrammar") oc = LazyLoader("oc", globals(), "outlines_core") file_utils = LazyLoader("file_utils", globals(), "transformers.file_utils") - tokenization_gpt2 = LazyLoader( - "tokenization_gpt2", - globals(), - "transformers.models.gpt2.tokenization_gpt2", + bytes_to_unicode = LazyLoader( + "bytes_to_unicode", globals(), "transformers.convert_slow_tokenizer" ) TokenizerLike = object @@ -204,7 +202,7 @@ def _reduced_vocabulary( A Dict of token string -> equivalent token ids """ - unicode_to_bytes = {v: k for k, v in tokenization_gpt2.bytes_to_unicode().items()} + unicode_to_bytes = {v: k for k, v in bytes_to_unicode().items()} def convert_token_to_string(token: str) -> str: string = tokenizer.convert_tokens_to_string([token]) diff --git a/vllm/v1/worker/workspace.py b/vllm/v1/worker/workspace.py index a16dde1f67800..bbbd7705d54e4 100644 --- a/vllm/v1/worker/workspace.py +++ b/vllm/v1/worker/workspace.py @@ -145,12 +145,20 @@ class WorkspaceManager: for ubatch_id in 
range(self._num_ubatches): current_workspace = self._current_workspaces[ubatch_id] - if current_workspace is None: + if ( + current_workspace is None + or self._workspace_size_bytes(current_workspace) < required_bytes + ): + # Delete old tensor before allocating new one to avoid + # memory spike from resize_(). resize_() allocates new + # memory before freeing old, which can cause OOM. + # Must clear the list reference first since local var + # is just a copy of the reference. + self._current_workspaces[ubatch_id] = None + del current_workspace self._current_workspaces[ubatch_id] = torch.empty( (required_bytes,), dtype=torch.uint8, device=self._device ) - elif self._workspace_size_bytes(current_workspace) < required_bytes: - current_workspace.resize_(required_bytes) if envs.VLLM_DEBUG_WORKSPACE: logger.info(