diff --git a/csrc/moe/topk_softmax_kernels.cu b/csrc/moe/topk_softmax_kernels.cu
index 946c137db6366..99c52ef17d08b 100644
--- a/csrc/moe/topk_softmax_kernels.cu
+++ b/csrc/moe/topk_softmax_kernels.cu
@@ -423,12 +423,27 @@ void topkGatingSoftmaxLauncherHelper(const float* input, const bool* finished, f
         input, finished, output, num_rows, indices, source_row, k, start_expert, end_expert);
 }
 
+#ifndef USE_ROCM
 #define LAUNCH_SOFTMAX(NUM_EXPERTS, WARPS_PER_TB, MAX_BYTES) \
-  static_assert(WARP_SIZE == 32 || WARP_SIZE == 64, \
-                "Unsupported warp size. Only 32 and 64 are supported."); \
+  static_assert(WARP_SIZE == 32, \
+                "Unsupported warp size. Only 32 is supported for CUDA"); \
   topkGatingSoftmaxLauncherHelper( \
       gating_output, nullptr, topk_weights, topk_indices, \
       token_expert_indices, num_tokens, topk, 0, num_experts, stream);
+#else
+#define LAUNCH_SOFTMAX(NUM_EXPERTS, WARPS_PER_TB, MAX_BYTES) \
+  if (WARP_SIZE == 64) { \
+    topkGatingSoftmaxLauncherHelper( \
+        gating_output, nullptr, topk_weights, topk_indices, \
+        token_expert_indices, num_tokens, topk, 0, num_experts, stream); \
+  } else if (WARP_SIZE == 32) { \
+    topkGatingSoftmaxLauncherHelper( \
+        gating_output, nullptr, topk_weights, topk_indices, \
+        token_expert_indices, num_tokens, topk, 0, num_experts, stream); \
+  } else { \
+    assert(false && "Unsupported warp size. Only 32 and 64 are supported for ROCm"); \
+  }
+#endif
 
 template
 void topkGatingSoftmaxKernelLauncher(
@@ -443,7 +458,9 @@ void topkGatingSoftmaxKernelLauncher(
     cudaStream_t stream) {
   static constexpr int WARPS_PER_TB = 4;
   static constexpr int BYTES_PER_LDG_POWER_OF_2 = 16;
+#ifndef USE_ROCM
   static constexpr int BYTES_PER_LDG_MULTIPLE_64 = 8;
+#endif
   switch (num_experts) {
     case 1:
       LAUNCH_SOFTMAX(1, WARPS_PER_TB, BYTES_PER_LDG_POWER_OF_2);
diff --git a/tests/models/registry.py b/tests/models/registry.py
index d7d20d1f3abf7..eb48c0f6a7738 100644
--- a/tests/models/registry.py
+++ b/tests/models/registry.py
@@ -195,7 +195,8 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
     "GPT2LMHeadModel": _HfExamplesInfo("openai-community/gpt2",
                                        {"alias": "gpt2"}),
     "GPTBigCodeForCausalLM": _HfExamplesInfo("bigcode/starcoder",
-                                             {"tiny": "bigcode/tiny_starcoder_py"}),  # noqa: E501
+                                             extras={"tiny": "bigcode/tiny_starcoder_py"},  # noqa: E501
+                                             min_transformers_version="4.55.1"),
     "GPTJForCausalLM": _HfExamplesInfo("Milos/slovak-gpt-j-405M",
                                        {"6b": "EleutherAI/gpt-j-6b"}),
     "GPTNeoXForCausalLM": _HfExamplesInfo("EleutherAI/pythia-70m",
diff --git a/tests/v1/e2e/test_kv_sharing_fast_prefill.py b/tests/v1/e2e/test_kv_sharing_fast_prefill.py
index f5a7b9cc276b3..d72e50e5196b8 100644
--- a/tests/v1/e2e/test_kv_sharing_fast_prefill.py
+++ b/tests/v1/e2e/test_kv_sharing_fast_prefill.py
@@ -11,7 +11,8 @@ from vllm import LLM, SamplingParams
 from vllm.config import CompilationConfig, CompilationLevel
 from vllm.distributed import cleanup_dist_env_and_memory
 from vllm.forward_context import get_forward_context
-from vllm.model_executor.models.gemma3n import Gemma3nForConditionalGeneration
+from vllm.model_executor.models.gemma3n_mm import (
+    Gemma3nForConditionalGeneration)
 from vllm.model_executor.models.registry import ModelRegistry
 from vllm.model_executor.models.utils import extract_layer_index
 from vllm.sequence import IntermediateTensors
@@ -32,12 +33,13 @@ class TestGemma3nForConditionalGeneration(Gemma3nForConditionalGeneration):
         inputs_embeds: Optional[torch.Tensor] = None,
         **kwargs,
     ) -> Union[torch.Tensor, IntermediateTensors]:
-        hidden_states = self.model(input_ids, positions, intermediate_tensors,
-                                   inputs_embeds, **kwargs)
+        hidden_states = super().forward(input_ids, positions,
+                                        intermediate_tensors, inputs_embeds,
+                                        **kwargs)
         attn_metadata = get_forward_context().attn_metadata
         # attn_metadata is None during dummy runs
         if (attn_metadata is not None
-                and self.cache_config.kv_sharing_fast_prefill):
+                and self.language_model.cache_config.kv_sharing_fast_prefill):
             assert isinstance(attn_metadata, dict)  # true in V1
             # Gemma3n-E2B has 30 layers, with last 20 layers being
             # cross-decoder layers. Check attention metadata is correct
@@ -52,7 +54,7 @@ class TestGemma3nForConditionalGeneration(Gemma3nForConditionalGeneration):
 
         # Last layer will be a KV sharing layer
         layer_attn_metadata = attn_metadata[
-            self.model.language_model.layers[-1].self_attn.attn.layer_name]
+            self.language_model.model.layers[-1].self_attn.attn.layer_name]
         logits_indices_padded = (layer_attn_metadata.logits_indices_padded)
         assert logits_indices_padded is not None
         num_logits_indices = layer_attn_metadata.num_logits_indices
diff --git a/tests/v1/e2e/test_spec_decode.py b/tests/v1/e2e/test_spec_decode.py
index 599916c0d1cfb..dde95fbe590b3 100644
--- a/tests/v1/e2e/test_spec_decode.py
+++ b/tests/v1/e2e/test_spec_decode.py
@@ -146,7 +146,11 @@ def test_ngram_correctness(
             marks=pytest.mark.skip(reason="Skipping due to CI OOM issues")),
     ],
     ids=[
-        "qwen3_eagle3", "llama3_eagle", "llama3_eagle3", "llama4_eagle",
+        # TODO: Re-enable this once tests/models/test_initialization.py is fixed, see PR #22333 #22611  # noqa: E501
+        # "qwen3_eagle3",
+        "llama3_eagle",
+        "llama3_eagle3",
+        "llama4_eagle",
         "llama4_eagle_mm"
     ])
 @pytest.mark.parametrize("attn_backend",