Merge branch 'main' into seemethere/cuda_arm64

commit 5667ed8788
Author: mgoin
Date:   2025-08-13 17:07:44 -04:00
4 changed files with 33 additions and 9 deletions


@@ -423,12 +423,27 @@ void topkGatingSoftmaxLauncherHelper(const float* input, const bool* finished, f
       input, finished, output, num_rows, indices, source_row, k, start_expert, end_expert);
 }
 
+#ifndef USE_ROCM
 #define LAUNCH_SOFTMAX(NUM_EXPERTS, WARPS_PER_TB, MAX_BYTES)                   \
-  static_assert(WARP_SIZE == 32 || WARP_SIZE == 64,                            \
-                "Unsupported warp size. Only 32 and 64 are supported.");       \
+  static_assert(WARP_SIZE == 32,                                               \
+                "Unsupported warp size. Only 32 is supported for CUDA");       \
   topkGatingSoftmaxLauncherHelper<NUM_EXPERTS, WARPS_PER_TB, WARP_SIZE, MAX_BYTES>( \
       gating_output, nullptr, topk_weights, topk_indices,                      \
       token_expert_indices, num_tokens, topk, 0, num_experts, stream);
+#else
+#define LAUNCH_SOFTMAX(NUM_EXPERTS, WARPS_PER_TB, MAX_BYTES)                   \
+  if (WARP_SIZE == 64) {                                                       \
+    topkGatingSoftmaxLauncherHelper<NUM_EXPERTS, WARPS_PER_TB, 64, MAX_BYTES>( \
+        gating_output, nullptr, topk_weights, topk_indices,                    \
+        token_expert_indices, num_tokens, topk, 0, num_experts, stream);       \
+  } else if (WARP_SIZE == 32) {                                                \
+    topkGatingSoftmaxLauncherHelper<NUM_EXPERTS, WARPS_PER_TB, 32, MAX_BYTES>( \
+        gating_output, nullptr, topk_weights, topk_indices,                    \
+        token_expert_indices, num_tokens, topk, 0, num_experts, stream);       \
+  } else {                                                                     \
+    assert(false && "Unsupported warp size. Only 32 and 64 are supported for ROCm"); \
+  }
+#endif
 
 template <typename IndType>
 void topkGatingSoftmaxKernelLauncher(
@@ -443,7 +458,9 @@ void topkGatingSoftmaxKernelLauncher(
     cudaStream_t stream) {
   static constexpr int WARPS_PER_TB = 4;
   static constexpr int BYTES_PER_LDG_POWER_OF_2 = 16;
+#ifndef USE_ROCM
   static constexpr int BYTES_PER_LDG_MULTIPLE_64 = 8;
+#endif
   switch (num_experts) {
     case 1:
       LAUNCH_SOFTMAX(1, WARPS_PER_TB, BYTES_PER_LDG_POWER_OF_2);
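
Note on the LAUNCH_SOFTMAX split above: CUDA warps are always 32 lanes wide, so the CUDA branch can reject anything else at compile time, while ROCm GPUs ship with either 32-lane (RDNA) or 64-lane (CDNA) wavefronts, so the ROCm branch instantiates both template specializations and selects one with a branch on WARP_SIZE. Since WARP_SIZE is a compile-time constant under both toolchains, the dead branch is trivially folded away. Below is a minimal self-contained sketch of the same dispatch shape (launchHelper, LAUNCH, and the WARP_SIZE fallback are illustrative stand-ins, not vLLM's actual symbols):

// Sketch only: mimics the CUDA/ROCm warp-size dispatch pattern from the
// hunk above. Compile with -DUSE_ROCM (and optionally -DWARP_SIZE=64) to
// exercise the runtime-branch variant.
#include <cassert>
#include <cstdio>

#ifndef WARP_SIZE
#define WARP_SIZE 32  // 32 on CUDA; 32 (RDNA) or 64 (CDNA) on ROCm
#endif

// The warp size is a template parameter so per-warp tiling is fixed at
// compile time. (Named WARP_SIZE_PARAM to dodge the macro above.)
template <int NUM_EXPERTS, int WARPS_PER_TB, int WARP_SIZE_PARAM>
void launchHelper() {
  std::printf("experts=%d warps/TB=%d warp=%d\n",
              NUM_EXPERTS, WARPS_PER_TB, WARP_SIZE_PARAM);
}

#ifndef USE_ROCM
// CUDA: exactly one warp size exists, so enforce it at compile time.
#define LAUNCH(NUM_EXPERTS, WARPS_PER_TB)                         \
  static_assert(WARP_SIZE == 32, "CUDA warps are 32 lanes wide"); \
  launchHelper<NUM_EXPERTS, WARPS_PER_TB, WARP_SIZE>()
#else
// ROCm: both warp sizes exist in the wild, so instantiate both
// specializations; the branch is on a constant and costs nothing.
#define LAUNCH(NUM_EXPERTS, WARPS_PER_TB)          \
  if (WARP_SIZE == 64) {                           \
    launchHelper<NUM_EXPERTS, WARPS_PER_TB, 64>(); \
  } else if (WARP_SIZE == 32) {                    \
    launchHelper<NUM_EXPERTS, WARPS_PER_TB, 32>(); \
  } else {                                         \
    assert(false && "unsupported warp size");      \
  }
#endif

int main() {
  LAUNCH(8, 4);  // dispatches to the right specialization either way
  return 0;
}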


@@ -195,7 +195,8 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
     "GPT2LMHeadModel": _HfExamplesInfo("openai-community/gpt2",
                                        {"alias": "gpt2"}),
     "GPTBigCodeForCausalLM": _HfExamplesInfo("bigcode/starcoder",
-                                             {"tiny": "bigcode/tiny_starcoder_py"}),  # noqa: E501
+                                             extras={"tiny": "bigcode/tiny_starcoder_py"},  # noqa: E501
+                                             min_transformers_version="4.55.1"),
     "GPTJForCausalLM": _HfExamplesInfo("Milos/slovak-gpt-j-405M",
                                        {"6b": "EleutherAI/gpt-j-6b"}),
     "GPTNeoXForCausalLM": _HfExamplesInfo("EleutherAI/pythia-70m",


@@ -11,7 +11,8 @@ from vllm import LLM, SamplingParams
 from vllm.config import CompilationConfig, CompilationLevel
 from vllm.distributed import cleanup_dist_env_and_memory
 from vllm.forward_context import get_forward_context
-from vllm.model_executor.models.gemma3n import Gemma3nForConditionalGeneration
+from vllm.model_executor.models.gemma3n_mm import (
+    Gemma3nForConditionalGeneration)
 from vllm.model_executor.models.registry import ModelRegistry
 from vllm.model_executor.models.utils import extract_layer_index
 from vllm.sequence import IntermediateTensors
@@ -32,12 +33,13 @@ class TestGemma3nForConditionalGeneration(Gemma3nForConditionalGeneration):
         inputs_embeds: Optional[torch.Tensor] = None,
         **kwargs,
     ) -> Union[torch.Tensor, IntermediateTensors]:
-        hidden_states = self.model(input_ids, positions, intermediate_tensors,
-                                   inputs_embeds, **kwargs)
+        hidden_states = super().forward(input_ids, positions,
+                                        intermediate_tensors, inputs_embeds,
+                                        **kwargs)
         attn_metadata = get_forward_context().attn_metadata
         # attn_metadata is None during dummy runs
         if (attn_metadata is not None
-                and self.cache_config.kv_sharing_fast_prefill):
+                and self.language_model.cache_config.kv_sharing_fast_prefill):
             assert isinstance(attn_metadata, dict)  # true in V1
             # Gemma3n-E2B has 30 layers, with last 20 layers being
             # cross-decoder layers. Check attention metadata is correct
@@ -52,7 +54,7 @@ class TestGemma3nForConditionalGeneration(Gemma3nForConditionalGeneration):
             # Last layer will be a KV sharing layer
             layer_attn_metadata = attn_metadata[
-                self.model.language_model.layers[-1].self_attn.attn.layer_name]
+                self.language_model.model.layers[-1].self_attn.attn.layer_name]
             logits_indices_padded = (layer_attn_metadata.logits_indices_padded)
             assert logits_indices_padded is not None
             num_logits_indices = layer_attn_metadata.num_logits_indices


@@ -146,7 +146,11 @@ def test_ngram_correctness(
                    marks=pytest.mark.skip(reason="Skipping due to CI OOM issues")),
     ],
     ids=[
-        "qwen3_eagle3", "llama3_eagle", "llama3_eagle3", "llama4_eagle",
+        # TODO: Re-enable this once tests/models/test_initialization.py is fixed, see PR #22333 #22611  # noqa: E501
+        # "qwen3_eagle3",
+        "llama3_eagle",
+        "llama3_eagle3",
+        "llama4_eagle",
         "llama4_eagle_mm"
     ])
 @pytest.mark.parametrize("attn_backend",