Merge branch 'main' into seemethere/cuda_arm64

Commit: 5667ed8788
@@ -423,12 +423,27 @@ void topkGatingSoftmaxLauncherHelper(const float* input, const bool* finished, f
       input, finished, output, num_rows, indices, source_row, k, start_expert, end_expert);
 }
 
+#ifndef USE_ROCM
 #define LAUNCH_SOFTMAX(NUM_EXPERTS, WARPS_PER_TB, MAX_BYTES)                        \
-  static_assert(WARP_SIZE == 32 || WARP_SIZE == 64,                                 \
-                "Unsupported warp size. Only 32 and 64 are supported.");            \
+  static_assert(WARP_SIZE == 32,                                                    \
+                "Unsupported warp size. Only 32 is supported for CUDA");            \
   topkGatingSoftmaxLauncherHelper<NUM_EXPERTS, WARPS_PER_TB, WARP_SIZE, MAX_BYTES>( \
       gating_output, nullptr, topk_weights, topk_indices,                           \
       token_expert_indices, num_tokens, topk, 0, num_experts, stream);
+#else
+#define LAUNCH_SOFTMAX(NUM_EXPERTS, WARPS_PER_TB, MAX_BYTES)                         \
+  if (WARP_SIZE == 64) {                                                             \
+    topkGatingSoftmaxLauncherHelper<NUM_EXPERTS, WARPS_PER_TB, 64, MAX_BYTES>(       \
+        gating_output, nullptr, topk_weights, topk_indices,                          \
+        token_expert_indices, num_tokens, topk, 0, num_experts, stream);             \
+  } else if (WARP_SIZE == 32) {                                                      \
+    topkGatingSoftmaxLauncherHelper<NUM_EXPERTS, WARPS_PER_TB, 32, MAX_BYTES>(       \
+        gating_output, nullptr, topk_weights, topk_indices,                          \
+        token_expert_indices, num_tokens, topk, 0, num_experts, stream);             \
+  } else {                                                                           \
+    assert(false && "Unsupported warp size. Only 32 and 64 are supported for ROCm"); \
+  }
+#endif
 
 template <typename IndType>
 void topkGatingSoftmaxKernelLauncher(
@@ -443,7 +458,9 @@ void topkGatingSoftmaxKernelLauncher(
     cudaStream_t stream) {
   static constexpr int WARPS_PER_TB = 4;
   static constexpr int BYTES_PER_LDG_POWER_OF_2 = 16;
+#ifndef USE_ROCM
   static constexpr int BYTES_PER_LDG_MULTIPLE_64 = 8;
+#endif
   switch (num_experts) {
     case 1:
       LAUNCH_SOFTMAX(1, WARPS_PER_TB, BYTES_PER_LDG_POWER_OF_2);
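Context for the hunks above: on CUDA the warp size is always 32, so the macro can pin it with a compile-time static_assert, while the ROCm build must branch at run time because AMD GPUs ship with either wave32 (RDNA) or wave64 (CDNA). A minimal host-side sketch of the same runtime dispatch in Python; launch_softmax and the variant function are hypothetical illustrations, and the warp_size attribute on torch device properties is an assumption (recent PyTorch builds expose it; a fallback is used otherwise).

import torch

def _topk_softmax_variant(gating_output: torch.Tensor, topk: int, warp_size: int):
    # Stand-in for a kernel template instantiated with a fixed warp size;
    # the real code calls topkGatingSoftmaxLauncherHelper<..., WARP_SIZE, ...>.
    scores = torch.softmax(gating_output, dim=-1)
    return torch.topk(scores, topk, dim=-1)

def launch_softmax(gating_output: torch.Tensor, topk: int):
    # Query the device's warp size; 'warp_size' is assumed present on recent
    # PyTorch device properties, otherwise fall back to the platform default
    # (wave64 on ROCm, 32 on CUDA).
    props = torch.cuda.get_device_properties(gating_output.device)
    warp_size = getattr(props, "warp_size", 64 if torch.version.hip else 32)
    if warp_size in (32, 64):
        return _topk_softmax_variant(gating_output, topk, warp_size)
    raise ValueError(f"Unsupported warp size: {warp_size}")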
@@ -195,7 +195,8 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
     "GPT2LMHeadModel": _HfExamplesInfo("openai-community/gpt2",
                                        {"alias": "gpt2"}),
     "GPTBigCodeForCausalLM": _HfExamplesInfo("bigcode/starcoder",
-                                             {"tiny": "bigcode/tiny_starcoder_py"}),  # noqa: E501
+                                             extras={"tiny": "bigcode/tiny_starcoder_py"},  # noqa: E501
+                                             min_transformers_version="4.55.1"),
     "GPTJForCausalLM": _HfExamplesInfo("Milos/slovak-gpt-j-405M",
                                        {"6b": "EleutherAI/gpt-j-6b"}),
     "GPTNeoXForCausalLM": _HfExamplesInfo("EleutherAI/pythia-70m",
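The GPTBigCodeForCausalLM entry switches from a positional extras dict to explicit extras= and min_transformers_version= keywords, so the registry can skip the model when the installed transformers release is too old. A simplified, hypothetical stand-in for that version gate (ExampleInfo is illustrative, not vLLM's actual _HfExamplesInfo):

from dataclasses import dataclass, field
from typing import Optional

import transformers
from packaging.version import Version

@dataclass(frozen=True)
class ExampleInfo:
    """Illustrative stand-in for a registry entry like _HfExamplesInfo."""
    default: str                                # canonical checkpoint repo
    extras: dict = field(default_factory=dict)  # alternate checkpoints
    min_transformers_version: Optional[str] = None

    def is_supported(self) -> bool:
        # Gate the entry on the installed transformers version.
        if self.min_transformers_version is None:
            return True
        return (Version(transformers.__version__)
                >= Version(self.min_transformers_version))

info = ExampleInfo("bigcode/starcoder",
                   extras={"tiny": "bigcode/tiny_starcoder_py"},
                   min_transformers_version="4.55.1")
print(info.is_supported())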
@@ -11,7 +11,8 @@ from vllm import LLM, SamplingParams
 from vllm.config import CompilationConfig, CompilationLevel
 from vllm.distributed import cleanup_dist_env_and_memory
 from vllm.forward_context import get_forward_context
-from vllm.model_executor.models.gemma3n import Gemma3nForConditionalGeneration
+from vllm.model_executor.models.gemma3n_mm import (
+    Gemma3nForConditionalGeneration)
 from vllm.model_executor.models.registry import ModelRegistry
 from vllm.model_executor.models.utils import extract_layer_index
 from vllm.sequence import IntermediateTensors
@@ -32,12 +33,13 @@ class TestGemma3nForConditionalGeneration(Gemma3nForConditionalGeneration):
         inputs_embeds: Optional[torch.Tensor] = None,
         **kwargs,
     ) -> Union[torch.Tensor, IntermediateTensors]:
-        hidden_states = self.model(input_ids, positions, intermediate_tensors,
-                                   inputs_embeds, **kwargs)
+        hidden_states = super().forward(input_ids, positions,
+                                        intermediate_tensors, inputs_embeds,
+                                        **kwargs)
         attn_metadata = get_forward_context().attn_metadata
         # attn_metadata is None during dummy runs
         if (attn_metadata is not None
-                and self.cache_config.kv_sharing_fast_prefill):
+                and self.language_model.cache_config.kv_sharing_fast_prefill):
             assert isinstance(attn_metadata, dict)  # true in V1
             # Gemma3n-E2B has 30 layers, with last 20 layers being
             # cross-decoder layers. Check attention metadata is correct
@@ -52,7 +54,7 @@ class TestGemma3nForConditionalGeneration(Gemma3nForConditionalGeneration):
 
         # Last layer will be a KV sharing layer
         layer_attn_metadata = attn_metadata[
-            self.model.language_model.layers[-1].self_attn.attn.layer_name]
+            self.language_model.model.layers[-1].self_attn.attn.layer_name]
         logits_indices_padded = (layer_attn_metadata.logits_indices_padded)
         assert logits_indices_padded is not None
         num_logits_indices = layer_attn_metadata.num_logits_indices
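The test subclass now delegates through super().forward() and reaches the cache config and layer list via self.language_model, matching the restructured multimodal wrapper in gemma3n_mm. For vLLM to actually instantiate such a subclass during a test, the ModelRegistry import above is typically paired with register_model; a usage sketch (the InstrumentedGemma3n name is hypothetical, register_model is vLLM's documented API for this):

from vllm.model_executor.models.gemma3n_mm import (
    Gemma3nForConditionalGeneration)
from vllm.model_executor.models.registry import ModelRegistry

class InstrumentedGemma3n(Gemma3nForConditionalGeneration):
    """Hypothetical subclass that hooks forward() for assertions."""

    def forward(self, *args, **kwargs):
        hidden_states = super().forward(*args, **kwargs)
        # ... inspect get_forward_context().attn_metadata here ...
        return hidden_states

# Make vLLM build the subclass whenever this architecture is requested.
ModelRegistry.register_model("Gemma3nForConditionalGeneration",
                             InstrumentedGemma3n)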
@@ -146,7 +146,11 @@ def test_ngram_correctness(
             marks=pytest.mark.skip(reason="Skipping due to CI OOM issues")),
     ],
     ids=[
-        "qwen3_eagle3", "llama3_eagle", "llama3_eagle3", "llama4_eagle",
+        # TODO: Re-enable this once tests/models/test_initialization.py is fixed, see PR #22333 #22611  # noqa: E501
+        # "qwen3_eagle3",
+        "llama3_eagle",
+        "llama3_eagle3",
+        "llama4_eagle",
         "llama4_eagle_mm"
     ])
 @pytest.mark.parametrize("attn_backend",
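One caveat the edit above accounts for: pytest requires the ids list to match the parameter sets one for one, so commenting out "qwen3_eagle3" only works because the corresponding pytest.param entry is disabled as well. A minimal illustration of keeping the two in sync (parameter values are placeholders):

import pytest

@pytest.mark.parametrize(
    "model_setup",
    [
        # ("eagle3", "Qwen/Qwen3-8B"),  # disabled with its id below
        ("eagle", "meta-llama/Llama-3.1-8B-Instruct"),
        pytest.param(
            ("eagle", "meta-llama/Llama-4-Scout-17B-16E-Instruct"),
            marks=pytest.mark.skip(reason="Skipping due to CI OOM issues")),
    ],
    ids=[
        # "qwen3_eagle3",  # must stay in sync with the disabled param above
        "llama3_eagle",
        "llama4_eagle",
    ])
def test_ids_match_params(model_setup):
    method, model_name = model_setup
    assert method == "eagle" and model_name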