Merge branch 'main' into v1-sched-interface-2
This commit is contained in:
commit 5b38e984b3
@ -57,8 +57,8 @@ steps:
agents:
queue: tpu_queue_postmerge
commands:
- "rm /var/log/syslog"
- "rm /var/log/kern.log"
- "rm -f /var/log/syslog"
- "rm -f /var/log/kern.log"
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --tag vllm/vllm-tpu:nightly --tag vllm/vllm-tpu:$BUILDKITE_COMMIT --progress plain -f Dockerfile.tpu ."
- "docker push vllm/vllm-tpu:nightly"
- "docker push vllm/vllm-tpu:$BUILDKITE_COMMIT"

@ -19,13 +19,14 @@ remove_docker_container

# Run the image, setting --shm-size=4g for tensor parallel.
docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --cpuset-cpus="$CORE_RANGE" \
--cpuset-mems="$NUMA_NODE" --privileged=true --network host -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" cpu-test-"$BUILDKITE_BUILD_NUMBER"
--cpuset-mems="$NUMA_NODE" --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" cpu-test-"$BUILDKITE_BUILD_NUMBER"
docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --cpuset-cpus="$CORE_RANGE" \
--cpuset-mems="$NUMA_NODE" --privileged=true --network host -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2-"$NUMA_NODE" cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2
--cpuset-mems="$NUMA_NODE" --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2-"$NUMA_NODE" cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2

function cpu_tests() {
set -e
export NUMA_NODE=$2
export BUILDKITE_BUILD_NUMBER=$3

# offline inference
docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2-"$NUMA_NODE" bash -c "
@ -36,6 +37,7 @@ function cpu_tests() {
docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" bash -c "
set -e
pip install -r vllm/requirements/test.txt
pip install -r vllm/requirements/cpu.txt
pytest -v -s tests/models/decoder_only/language -m cpu_model
pytest -v -s tests/models/embedding/language -m cpu_model
pytest -v -s tests/models/encoder_decoder/language -m cpu_model
@ -85,4 +87,4 @@ function cpu_tests() {

# All of CPU tests are expected to be finished less than 40 mins.
export -f cpu_tests
timeout 40m bash -c "cpu_tests $CORE_RANGE $NUMA_NODE"
timeout 40m bash -c "cpu_tests $CORE_RANGE $NUMA_NODE $BUILDKITE_BUILD_NUMBER"

@ -559,9 +559,12 @@ target_compile_definitions(_C PRIVATE CUTLASS_ENABLE_DIRECT_CUDA_DRIVER_CALL=1)
set(VLLM_MOE_EXT_SRC
"csrc/moe/torch_bindings.cpp"
"csrc/moe/moe_align_sum_kernels.cu"
"csrc/moe/moe_wna16.cu"
"csrc/moe/topk_softmax_kernels.cu")

if(VLLM_GPU_LANG STREQUAL "CUDA")
list(APPEND VLLM_MOE_EXT_SRC "csrc/moe/moe_wna16.cu")
endif()

set_gencode_flags_for_srcs(
SRCS "${VLLM_MOE_EXT_SRC}"
CUDA_ARCHS "${CUDA_ARCHS}")
@ -574,6 +577,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
SRCS "${VLLM_MOE_WNA16_SRC}"
CUDA_ARCHS "${CUDA_ARCHS}")

list(APPEND VLLM_MOE_EXT_SRC "${VLLM_MOE_WNA16_SRC}")
cuda_archs_loose_intersection(MARLIN_MOE_ARCHS "8.0;8.6;8.7;8.9;9.0;10.0;10.1;12.0" "${CUDA_ARCHS}")
if (MARLIN_MOE_ARCHS)
set(MARLIN_MOE_SRC

@ -22,7 +22,7 @@ ENV LD_PRELOAD="/usr/lib/x86_64-linux-gnu/libtcmalloc_minimal.so.4:/usr/local/li

RUN echo 'ulimit -c 0' >> ~/.bashrc

RUN pip install intel_extension_for_pytorch==2.5.0
RUN pip install intel_extension_for_pytorch==2.6.0

WORKDIR /workspace

@ -149,7 +149,7 @@ if (AVX512_FOUND AND NOT AVX512_DISABLED)
FetchContent_Declare(
oneDNN
GIT_REPOSITORY https://github.com/oneapi-src/oneDNN.git
GIT_TAG v3.6
GIT_TAG v3.7.1
GIT_PROGRESS TRUE
GIT_SHALLOW TRUE
)

@ -18,7 +18,7 @@ void sgl_moe_align_block_size(torch::Tensor topk_ids, int64_t num_experts,
torch::Tensor sorted_token_ids,
torch::Tensor experts_ids,
torch::Tensor num_tokens_post_pad);

#ifndef USE_ROCM
torch::Tensor moe_wna16_gemm(torch::Tensor input, torch::Tensor output,
torch::Tensor b_qweight, torch::Tensor b_scales,
std::optional<torch::Tensor> b_qzeros,
@ -28,3 +28,4 @@ torch::Tensor moe_wna16_gemm(torch::Tensor input, torch::Tensor output,
torch::Tensor num_tokens_post_pad, int64_t top_k,
int64_t BLOCK_SIZE_M, int64_t BLOCK_SIZE_N,
int64_t BLOCK_SIZE_K, int64_t bit);
#endif

@ -31,6 +31,7 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, m) {
" Tensor! num_tokens_post_pad) -> ()");
m.impl("sgl_moe_align_block_size", torch::kCUDA, &sgl_moe_align_block_size);

#ifndef USE_ROCM
m.def(
"moe_wna16_gemm(Tensor input, Tensor! output, Tensor b_qweight, "
"Tensor b_scales, Tensor? b_qzeros, "
@ -41,7 +42,6 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, m) {

m.impl("moe_wna16_gemm", torch::kCUDA, &moe_wna16_gemm);

#ifndef USE_ROCM
m.def(
"marlin_gemm_moe(Tensor! a, Tensor! b_q_weights, Tensor! sorted_ids, "
"Tensor! topk_weights, Tensor! topk_ids, Tensor! b_scales, Tensor! "
@ -52,6 +52,7 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, m) {
"int moe_block_size, bool replicate_input, bool apply_weights)"
" -> Tensor");
// conditionally compiled so impl registration is in source file

#endif
}

@ -263,10 +263,15 @@ See [this page](#generative-models) for more information on how to use generativ
* ✅︎
* ✅︎
- * `Gemma2ForCausalLM`
* Gemma2
* Gemma 2
* `google/gemma-2-9b`, `google/gemma-2-27b`, etc.
* ✅︎
* ✅︎
- * `Gemma3ForCausalLM`
* Gemma 3
* `google/gemma-3-1b-it`, etc.
* ✅︎
* ✅︎
- * `GlmForCausalLM`
* GLM-4
* `THUDM/glm-4-9b-chat-hf`, etc.
@ -504,7 +509,7 @@ you should explicitly specify the task type to ensure that the model is used in
*
*
- * `Gemma2Model`
* Gemma2-based
* Gemma 2-based
* `BAAI/bge-multilingual-gemma2`, etc.
*
* ✅︎
@ -752,6 +757,13 @@ See [this page](#generative-models) for more information on how to use generativ
*
* ✅︎
* ✅︎
- * `Gemma3ForConditionalGeneration`
* Gemma 3
* T + I<sup>+</sup>
* `google/gemma-3-4b-it`, `google/gemma-3-27b-it`, etc.
* ✅︎
* ✅︎
* ✅︎\*
- * `GLM4VForCausalLM`<sup>^</sup>
* GLM-4V
* T + I
@ -937,6 +949,31 @@ For more details, please see: <gh-pr:4087#issuecomment-2250397630>
To use Qwen2.5-VL series models, you have to install Hugging Face Transformers library from source via `pip install git+https://github.com/huggingface/transformers`.
:::

:::{note}
To use Gemma3 series models, you have to install Hugging Face Transformers library from source via
`pip install git+https://github.com/huggingface/transformers`.
The earliest commit that supports this is [`50d3530aa04e7a7d003e6b255a98f79fd0447357`](https://github.com/huggingface/transformers/commit/50d3530aa04e7a7d003e6b255a98f79fd0447357).

Both V0 and V1 support `Gemma3ForConditionalGeneration` for text-only inputs.
However, there are differences in how they handle text + image inputs:

V0 correctly implements the model's attention pattern:
- Uses bidirectional attention between the image tokens corresponding to the same image
- Uses causal attention for other tokens
- Implemented via (naive) PyTorch SDPA with masking tensors
- Note: May use significant memory for long prompts with image

V1 currently uses a simplified attention pattern:
- Uses causal attention for all tokens, including image tokens
- Generates reasonable outputs but does not match the original model's attention for text + image inputs
- Will be updated in the future to support the correct behavior

This limitation exists because the model's mixed attention pattern (bidirectional for images, causal otherwise) is not yet supported by vLLM's attention backends.

Additionally, vLLM's current Gemma 3 implementation does not support the pan-and-scan image pre-processing algorithm, which helps handle images with skewed aspect ratios by intelligently cropping them into multiple views.
Without this feature, model performance may degrade when processing images that deviate significantly from square dimensions.
:::
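
A minimal, self-contained sketch of the mixed attention pattern described in the note above: bidirectional attention inside each image's token span, causal attention everywhere else. This is illustrative only, not vLLM's implementation; the helper name and the toy inputs are made up.

```python
import torch

def build_mixed_mask(is_image_tok: torch.Tensor,
                     image_ids: torch.Tensor) -> torch.Tensor:
    """Boolean attention mask: True = position may be attended to.

    is_image_tok: (seq_len,) bool, True where the token is an image token.
    image_ids:    (seq_len,) int, which image each image token belongs to
                  (ignored for text tokens).
    """
    seq_len = is_image_tok.shape[0]
    # Start from a causal (lower-triangular) mask.
    mask = torch.tril(torch.ones(seq_len, seq_len, dtype=torch.bool))
    # Allow bidirectional attention between tokens of the same image.
    same_image = image_ids.unsqueeze(0) == image_ids.unsqueeze(1)
    both_image = is_image_tok.unsqueeze(0) & is_image_tok.unsqueeze(1)
    mask |= same_image & both_image
    return mask

# Toy example: 2 text tokens, 3 tokens of image 0, 2 more text tokens.
is_image = torch.tensor([0, 0, 1, 1, 1, 0, 0], dtype=torch.bool)
img_ids = torch.tensor([-1, -1, 0, 0, 0, -1, -1])
mask = build_mixed_mask(is_image, img_ids)
# Rows/columns 2..4 (the image block) are fully True; the rest stays causal.
print(mask.int())
```

Per the note, V0 builds masks of this kind and applies them through (naive) PyTorch SDPA, which is why long image prompts can use significant memory.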

### Pooling Models

See [this page](pooling-models) for more information on how to use pooling models.

@ -118,6 +118,23 @@ def run_fuyu(questions: list[str], modality: str):
return llm, prompts, stop_token_ids


# Gemma 3
def run_gemma3(questions: list[str], modality: str):
assert modality == "image"
model_name = "google/gemma-3-4b-it"

llm = LLM(model=model_name,
max_model_len=2048,
max_num_seqs=2,
disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache)

prompts = [("<bos><start_of_turn>user\n"
f"<start_of_image>{question}<end_of_turn>\n"
"<start_of_turn>model\n") for question in questions]
stop_token_ids = None
return llm, prompts, stop_token_ids


# GLM-4v
def run_glm4v(questions: list[str], modality: str):
assert modality == "image"
@ -405,7 +422,7 @@ def run_mllama(questions: list[str], modality: str):
"type": "image"
}, {
"type": "text",
"text": f"{question}"
"text": question
}]
}] for question in questions]
prompts = tokenizer.apply_chat_template(messages,
@ -664,6 +681,7 @@ model_example_map = {
"deepseek_vl_v2": run_deepseek_vl2,
"florence2": run_florence2,
"fuyu": run_fuyu,
"gemma3": run_gemma3,
"glm4v": run_glm4v,
"h2ovl_chat": run_h2ovl,
"idefics3": run_idefics3,

@ -80,6 +80,42 @@ def load_deepseek_vl2(question: str, image_urls: list[str]):
)


def load_gemma3(question, image_urls: list[str]) -> ModelRequestData:
model_name = "google/gemma-3-4b-it"

llm = LLM(model=model_name,
max_model_len=8192,
max_num_seqs=2,
limit_mm_per_prompt={"image": len(image_urls)})

placeholders = [{"type": "image", "image": url} for url in image_urls]
messages = [{
"role":
"user",
"content": [
*placeholders,
{
"type": "text",
"text": question
},
],
}]

processor = AutoProcessor.from_pretrained(model_name)

prompt = processor.apply_chat_template(messages,
tokenize=False,
add_generation_prompt=True)

return ModelRequestData(
llm=llm,
prompt=prompt,
stop_token_ids=None,
image_data=[fetch_image(url) for url in image_urls],
chat_template=None,
)


def load_h2ovl(question: str, image_urls: list[str]) -> ModelRequestData:
model_name = "h2oai/h2ovl-mississippi-800m"

@ -496,6 +532,7 @@ def load_qwen2_5_vl(question, image_urls: list[str]) -> ModelRequestData:
model_example_map = {
"aria": load_aria,
"deepseek_vl_v2": load_deepseek_vl2,
"gemma3": load_gemma3,
"h2ovl_chat": load_h2ovl,
"idefics3": load_idefics3,
"internvl_chat": load_internvl,

@ -2,7 +2,7 @@
-r common.txt

# Dependencies for CPUs
torch==2.5.1+cpu; platform_machine != "ppc64le" and platform_machine != "aarch64" and platform_system != "Darwin" and platform_machine != "s390x"
torch==2.6.0+cpu; platform_machine == "x86_64"
torch==2.5.1; platform_machine == "ppc64le" or platform_machine == "aarch64" or platform_system == "Darwin"
torch==2.7.0.dev20250304; platform_machine == "s390x"

@ -12,7 +12,7 @@ from vllm.lora.request import LoRARequest
from vllm.platforms import current_platform


@pytest.fixture(autouse=True)
@pytest.fixture(autouse=not current_platform.is_cpu())
def v1(run_with_both_engines_lora):
# Simple autouse wrapper to run both engines for each test
# This can be promoted up to conftest.py to run for every

@ -7,6 +7,8 @@ import pytest
import torch
from transformers import AutoModelForSequenceClassification

from vllm.platforms import current_platform


@pytest.mark.parametrize(
"model",
@ -15,14 +17,21 @@ from transformers import AutoModelForSequenceClassification
marks=[pytest.mark.core_model, pytest.mark.cpu_model]),
],
)
@pytest.mark.parametrize("dtype", ["float"])
@pytest.mark.parametrize("dtype",
["half"] if current_platform.is_rocm() else ["float"])
def test_classification_models(
hf_runner,
vllm_runner,
example_prompts,
model: str,
dtype: str,
monkeypatch,
) -> None:
if current_platform.is_rocm():
# ROCm Triton FA does not currently support sliding window attention
# switch to use ROCm CK FA backend
monkeypatch.setenv("VLLM_USE_TRITON_FLASH_ATTN", "False")

with vllm_runner(model, dtype=dtype) as vllm_model:
vllm_outputs = vllm_model.classify(example_prompts)

@ -43,4 +52,8 @@ def test_classification_models(
hf_output = torch.tensor(hf_output)
vllm_output = torch.tensor(vllm_output)

assert torch.allclose(hf_output, vllm_output, 1e-3)
# the tolerance value of 1e-2 is selected based on the
# half datatype tests in
# tests/models/embedding/language/test_embedding.py
assert torch.allclose(hf_output, vllm_output,
1e-3 if dtype == "float" else 1e-2)

@ -6,6 +6,7 @@ Run `pytest tests/models/embedding/language/test_embedding.py`.
import pytest

from vllm.config import PoolerConfig
from vllm.platforms import current_platform

from ..utils import check_embeddings_close

@ -18,15 +19,15 @@ from ..utils import check_embeddings_close
marks=[pytest.mark.core_model, pytest.mark.cpu_model]),
pytest.param("sentence-transformers/all-MiniLM-L12-v2"),
pytest.param("intfloat/multilingual-e5-small"),
pytest.param("Alibaba-NLP/gte-Qwen2-7B-instruct"),
# [Decoder-only]
pytest.param("BAAI/bge-multilingual-gemma2",
marks=[pytest.mark.core_model]),
pytest.param("intfloat/e5-mistral-7b-instruct",
marks=[pytest.mark.core_model, pytest.mark.cpu_model]),
pytest.param("Alibaba-NLP/gte-Qwen2-1.5B-instruct"),
pytest.param("Alibaba-NLP/gte-Qwen2-7B-instruct"),
pytest.param("ssmits/Qwen2-7B-Instruct-embed-base"),
# [Encoder-decoder]
# [Cross-Encoder]
pytest.param("sentence-transformers/stsb-roberta-base-v2"),
],
)
@ -37,11 +38,19 @@ def test_models(
example_prompts,
model,
dtype: str,
monkeypatch,
) -> None:

if model == "BAAI/bge-multilingual-gemma2" and current_platform.is_rocm():
# ROCm Triton FA does not currently support sliding window attention
# switch to use ROCm CK FA backend
monkeypatch.setenv("VLLM_USE_TRITON_FLASH_ATTN", "False")

vllm_extra_kwargs = {}
if model == "ssmits/Qwen2-7B-Instruct-embed-base":
vllm_extra_kwargs["override_pooler_config"] = \
PoolerConfig(pooling_type="MEAN")

if model == "Alibaba-NLP/gte-Qwen2-1.5B-instruct":
vllm_extra_kwargs["hf_overrides"] = {"is_causal": True}

@ -15,8 +15,8 @@ import vllm.config
from ....utils import RemoteOpenAIServer

# GritLM embedding implementation is only supported by XFormers backend.
pytest.mark.skipif(not importlib.util.find_spec("xformers"),
reason="GritLM requires XFormers")
pytestmark = pytest.mark.skipif(not importlib.util.find_spec("xformers"),
reason="GritLM requires XFormers")

MODEL_NAME = "parasail-ai/GritLM-7B-vllm"
MAX_MODEL_LEN = 4000

@ -4,10 +4,27 @@ import pytest
import torch.nn.functional as F
from transformers import AutoModelForVision2Seq

from vllm.platforms import current_platform

from ....conftest import IMAGE_ASSETS, HfRunner, PromptImageInput, VllmRunner
from ....utils import large_gpu_test
from ..utils import check_embeddings_close

# Llava Next embedding implementation is only supported by CUDA.
# If run on ROCm, hf_model.model.resize_token_embeddings will
# cause the following error:
# RuntimeError: Calling torch.linalg.cholesky on a CUDA tensor
# requires compiling PyTorch with MAGMA. Please use PyTorch
# built with MAGMA support.
# If run on CPU, hf_model.model.resize_token_embeddings will
# cause the following error:
# RuntimeError: Calling torch.linalg.cholesky on a CPU tensor
# requires compiling PyTorch with LAPACK. Please use PyTorch
# built with LAPACK support.
pytestmark = pytest.mark.skipif(
not current_platform.is_cuda(),
reason="Llava Next model uses op that is only supported in CUDA")

llama3_template = '<|start_header_id|>user<|end_header_id|>\n\n{}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n \n' # noqa: E501

HF_TEXT_PROMPTS = [

@ -162,6 +162,7 @@ def _test_processing_correctness(
"deepseek-ai/deepseek-vl2-tiny",
"microsoft/Florence-2-base",
"adept/fuyu-8b",
"google/gemma-3-4b-it",
"THUDM/glm-4v-9b",
"h2oai/h2ovl-mississippi-800m",
"OpenGVLab/InternVL2-1B",

@ -124,6 +124,8 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
"FalconForCausalLM": _HfExamplesInfo("tiiuae/falcon-7b"),
"GemmaForCausalLM": _HfExamplesInfo("google/gemma-2b"),
"Gemma2ForCausalLM": _HfExamplesInfo("google/gemma-2-9b"),
"Gemma3ForCausalLM": _HfExamplesInfo("google/gemma-3-1b-it",
min_transformers_version="4.50"),
"GlmForCausalLM": _HfExamplesInfo("THUDM/glm-4-9b-chat-hf"),
"GPT2LMHeadModel": _HfExamplesInfo("gpt2"),
"GPTBigCodeForCausalLM": _HfExamplesInfo("bigcode/starcoder"),
@ -241,6 +243,8 @@ _MULTIMODAL_EXAMPLE_MODELS = {
"DeepseekVLV2ForCausalLM": _HfExamplesInfo("deepseek-ai/deepseek-vl2-tiny", # noqa: E501
hf_overrides={"architectures": ["DeepseekVLV2ForCausalLM"]}), # noqa: E501
"FuyuForCausalLM": _HfExamplesInfo("adept/fuyu-8b"),
"Gemma3ForConditionalGeneration": _HfExamplesInfo("google/gemma-3-4b-it",
min_transformers_version="4.50"),
"GLM4VForCausalLM": _HfExamplesInfo("THUDM/glm-4v-9b",
trust_remote_code=True,
hf_overrides={"architectures": ["GLM4VForCausalLM"]}), # noqa: E501

@ -46,6 +46,7 @@ async def generate(engine: AsyncLLM,
prompt: PromptType,
output_kind: RequestOutputKind,
max_tokens: int,
n: int = 1,
prompt_logprobs: Optional[int] = None) -> tuple[int, str]:
# Ensure generate doesn't complete too fast for cancellation test.
await asyncio.sleep(0.2)
@ -54,13 +55,15 @@ async def generate(engine: AsyncLLM,
sampling_params = SamplingParams(max_tokens=max_tokens,
ignore_eos=True,
output_kind=output_kind,
temperature=0,
temperature=0.5,
seed=33,
n=n,
prompt_logprobs=prompt_logprobs)
async for out in engine.generate(request_id=request_id,
prompt=prompt,
sampling_params=sampling_params):

num_tokens = len(out.outputs[0].token_ids)
num_tokens = sum(len(output.token_ids) for output in out.outputs)
if output_kind == RequestOutputKind.DELTA:
count += num_tokens
else:
@ -136,17 +139,22 @@ async def test_abort(monkeypatch, output_kind: RequestOutputKind,

NUM_REQUESTS = 100
NUM_EXPECTED_TOKENS = 100
NUM_EXPECTED_TOKENS_LONG = 50000
REQUEST_IDS_TO_ABORT = range(1, 100, 10)
PARALLEL_SAMPLE_REQ_IDS = range(1, 100, 15)

request_ids = [f"request-{i}" for i in range(NUM_REQUESTS)]

# Create concurrent requests.
tasks: list[asyncio.Task] = []
for request_id in request_ids:
for idx, request_id in enumerate(request_ids):
max_tokens = NUM_EXPECTED_TOKENS_LONG if (
idx in REQUEST_IDS_TO_ABORT) else NUM_EXPECTED_TOKENS
n = 3 if idx in PARALLEL_SAMPLE_REQ_IDS else 1
tasks.append(
asyncio.create_task(
generate(engine, request_id, prompt, output_kind,
NUM_EXPECTED_TOKENS)))
max_tokens, n)))

# API server cancels requests when they disconnect.
for idx in REQUEST_IDS_TO_ABORT:
@ -162,10 +170,13 @@ async def test_abort(monkeypatch, output_kind: RequestOutputKind,
else:
# Otherwise, make sure the request was not impacted.
num_generated_tokens, request_id = await task
assert num_generated_tokens == NUM_EXPECTED_TOKENS, (
n = 3 if idx in PARALLEL_SAMPLE_REQ_IDS else 1
expected_tokens = NUM_EXPECTED_TOKENS * n
assert num_generated_tokens == expected_tokens, (
f"{request_id} generated {num_generated_tokens} but "
f"expected {NUM_EXPECTED_TOKENS}")
f"expected {expected_tokens}")

# Make sure all aborted requests were really aborted.
assert not engine.output_processor.has_unfinished_requests()

# Confirm we can do another generation.
@ -176,3 +187,36 @@ async def test_abort(monkeypatch, output_kind: RequestOutputKind,
num_generated_tokens, request_id = await task
assert num_generated_tokens == NUM_EXPECTED_TOKENS
assert not engine.output_processor.has_unfinished_requests()


@pytest.mark.parametrize("n", [1, 3])
@pytest.mark.parametrize("engine_args_and_prompt",
[(TEXT_ENGINE_ARGS, TEXT_PROMPT),
(VISION_ENGINE_ARGS, VISION_PROMPT)])
@pytest.mark.asyncio
async def test_finished_flag(monkeypatch, n: int,
engine_args_and_prompt: tuple[AsyncEngineArgs,
PromptType]):

with monkeypatch.context() as m, ExitStack() as after:
m.setenv("VLLM_USE_V1", "1")
engine_args, prompt = engine_args_and_prompt

engine = AsyncLLM.from_engine_args(engine_args)
after.callback(engine.shutdown)

sampling_params = SamplingParams(max_tokens=100,
output_kind=RequestOutputKind.DELTA,
temperature=1.0,
seed=33,
n=n)
outputs = [
out
async for out in engine.generate(request_id="request-33",
prompt=prompt,
sampling_params=sampling_params)
]

# Assert only the last output has the finished flag set
assert all(not out.finished for out in outputs[:-1])
assert outputs[-1].finished

@ -263,15 +263,16 @@ async def test_parallel_no_streaming(client: openai.AsyncOpenAI,

prompt = "What is an LLM?"
n = 3
max_tokens = 5
max_tokens = 50  # we want some to finish earlier than others

# High temperature to maximize chance of unique completions.
completion = await client.completions.create(model=model_name,
prompt=prompt,
max_tokens=max_tokens,
n=n,
temperature=0.95,
temperature=1.0,
stream=False,
logprobs=0,
seed=42)

# Assert `n` completions
@ -279,6 +280,7 @@ async def test_parallel_no_streaming(client: openai.AsyncOpenAI,
assert num_completions == n, (
f"Num completions {num_completions} but expected {n}.")
completion_repeats: dict[str, int] = {}
output_token_lengths = set()
for idx, choice in enumerate(completion.choices):
# Assert correct completion index & some finish reason.
assert choice.index == idx, (
@ -287,6 +289,9 @@ async def test_parallel_no_streaming(client: openai.AsyncOpenAI,
"None finish_reason is invalid.")
text = choice.text
completion_repeats[text] = completion_repeats.get(text, 0) + 1
output_token_lengths.add(len(choice.logprobs.tokens))
# Assert subrequests finished at different times
assert len(output_token_lengths) > 1
# Assert `n` unique completions
num_unique = len(completion_repeats)
if num_unique != n:
@ -312,16 +317,16 @@ async def test_parallel_streaming(client: openai.AsyncOpenAI, model_name: str):

prompt = "What is an LLM?"
n = 3
max_tokens = 5
max_tokens = 50  # we want some to finish earlier than others

stream = await client.completions.create(model=model_name,
prompt=prompt,
max_tokens=max_tokens,
n=n,
temperature=0.95,
temperature=1.0,
stream=True,
seed=42)
chunks: list[list[str]] = [[] for i in range(n)]
chunks: list[list[str]] = [[] for _ in range(n)]
finish_reason_count = 0
async for chunk in stream:
index = chunk.choices[0].index
@ -333,14 +338,18 @@ async def test_parallel_streaming(client: openai.AsyncOpenAI, model_name: str):
assert finish_reason_count == n, (
f"Expected {n} completions with valid indices and finish_reason.")
completion_repeats: dict[str, int] = {}
chunk_lengths = set()
for chunk in chunks:
chunk_len = len(chunk)
# Assert correct number of completion tokens
assert chunk_len == max_tokens, (
chunk_lengths.add(chunk_len)
assert chunk_len <= max_tokens, (
f"max_tokens={max_tokens} but chunk len is {chunk_len}.")
text = "".join(chunk)
completion_repeats[text] = completion_repeats.get(text, 0) + 1
print(text)
# Assert subrequests finished at different times
assert len(chunk_lengths) > 1
# Assert `n` unique completions
num_unique = len(completion_repeats)
if num_unique != n:

@ -1146,6 +1146,10 @@ def moe_wna16_gemm(input: torch.Tensor, output: torch.Tensor,
num_tokens_post_pad: torch.Tensor, top_k: int,
BLOCK_SIZE_M: int, BLOCK_SIZE_N: int, BLOCK_SIZE_K: int,
bit: int) -> torch.Tensor:
if not current_platform.is_cuda():
raise NotImplementedError(
"The optimized moe_wna16_gemm kernel is only "
"available on CUDA platforms")
torch.ops._moe_C.moe_wna16_gemm(input, output, b_qweight, b_scales,
b_qzeros, topk_weights, sorted_token_ids,
experts_ids, num_tokens_post_pad, top_k,

@ -1327,21 +1327,7 @@ class MLACommonImpl(MLAAttentionImpl[T], Generic[T]):
[0, q.shape[-1] - v.shape[-1]],
value=0)

if is_hip and envs.VLLM_USE_TRITON_FLASH_ATTN:
attn_output, attn_softmax_lse = self.triton_fa_func(
q,
k,
v_padded,
None,
prefill_metadata.query_start_loc,
prefill_metadata.context_chunk_cu_seq_lens[i],
prefill_metadata.max_query_len,
prefill_metadata.context_chunk_max_seq_lens[i],
False,  # causal
self.scale,
None,  # attn_mask is None unless applying ALiBi mask
)
elif is_vllm_fa:
if is_vllm_fa:
attn_output, attn_softmax_lse = self.flash_attn_varlen_func(
q=q,
k=k,
@ -1416,7 +1402,7 @@ class MLACommonImpl(MLAAttentionImpl[T], Generic[T]):
v_padded = torch.nn.functional.pad(v, [0, q.shape[-1] - v.shape[-1]],
value=0)

if is_hip and envs.VLLM_USE_TRITON_FLASH_ATTN:
if is_hip and envs.VLLM_USE_TRITON_FLASH_ATTN and not has_context:
output = self.triton_fa_func(
q,
k,

@ -1,5 +1,6 @@
# SPDX-License-Identifier: Apache-2.0
"""Attention layer ROCm GPUs."""
import itertools
from dataclasses import dataclass
from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Type

@ -342,28 +343,27 @@ def _get_seq_len_block_table_args(
Decoder attn -> select entirely decoder self-attention-related fields
Encoder/decoder cross-attn -> select encoder sequence lengths
Encoder attn -> select encoder sequence lengths fields
Encoder-only attn -> select prefill sequence lengths with
bidirectional attention

Arguments:

* attn_metadata: Attention metadata structure associated with attention op
* attn_type: encoder attention, decoder self-attention,
encoder/decoder cross-attention
encoder/decoder cross-attention, encoder-only

Returns:

* Appropriate sequence-lengths tensors for query and key
* Appropriate max sequence-length scalar
* Causal masking flag
'''

partial_prefix_sum = 0
if attn_type == AttentionType.ENCODER:
assert attn_metadata.encoder_seq_lens is not None
assert attn_metadata.encoder_seq_lens_tensor is not None
query_seq_start_loc = torch.tensor(
[0] + [
partial_prefix_sum := partial_prefix_sum + i
for i in attn_metadata.encoder_seq_lens
],
list(itertools.accumulate([0] + attn_metadata.encoder_seq_lens)),
device=attn_metadata.encoder_seq_lens_tensor.device,
dtype=attn_metadata.encoder_seq_lens_tensor.dtype)
causal_mask = False
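
As an aside on the `itertools.accumulate` change above, here is a minimal sketch (with made-up sequence lengths) showing that the new call yields the same cumulative start offsets as the walrus-based prefix sum it replaces:

```python
import itertools

# Hypothetical per-sequence token counts in a batch.
seq_lens = [4, 2, 3]

# Old pattern (running sum via the walrus operator) vs. the new call.
partial = 0
old = [0] + [partial := partial + n for n in seq_lens]
new = list(itertools.accumulate([0] + seq_lens))

assert old == new == [0, 4, 6, 9]  # start offset of each sequence, plus total
```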
@ -372,16 +372,29 @@ def _get_seq_len_block_table_args(
return (query_seq_start_loc, attn_metadata.max_encoder_seq_len,
query_seq_start_loc, attn_metadata.max_encoder_seq_len,
attn_metadata.encoder_seq_lens, causal_mask)

elif attn_type == AttentionType.ENCODER_ONLY:
# For encoder-only models, we use the prefill sequence lengths
assert attn_metadata.seq_lens is not None
assert attn_metadata.seq_lens_tensor is not None
query_seq_start_loc = torch.tensor(
list(itertools.accumulate([0] + attn_metadata.seq_lens)),
device=attn_metadata.seq_lens_tensor.device,
dtype=attn_metadata.seq_lens_tensor.dtype)
max_seq_len = attn_metadata.max_prefill_seq_len
# Encoder-only models typically use bidirectional attention
causal_mask = False

return (query_seq_start_loc, max_seq_len, query_seq_start_loc,
max_seq_len, attn_metadata.seq_lens, causal_mask)

elif attn_type == AttentionType.DECODER:
# Decoder self-attention
# Choose max_seq_len based on whether we are in prompt_run
assert attn_metadata.seq_lens is not None
assert attn_metadata.seq_lens_tensor is not None
query_seq_start_loc = torch.tensor(
[0] + [
partial_prefix_sum := partial_prefix_sum + i
for i in attn_metadata.seq_lens
],
list(itertools.accumulate([0] + attn_metadata.seq_lens)),
device=attn_metadata.seq_lens_tensor.device,
dtype=attn_metadata.seq_lens_tensor.dtype)
max_seq_len = attn_metadata.max_prefill_seq_len
@ -393,21 +406,14 @@ def _get_seq_len_block_table_args(
assert attn_metadata.seq_lens is not None
assert attn_metadata.encoder_seq_lens_tensor is not None
query_start_loc = torch.tensor(
[0] + [
partial_prefix_sum := partial_prefix_sum + i
for i in attn_metadata.seq_lens
],
list(itertools.accumulate([0] + attn_metadata.seq_lens)),
device=attn_metadata.encoder_seq_lens_tensor.device,
dtype=attn_metadata.encoder_seq_lens_tensor.dtype)

partial_prefix_sum = 0
assert attn_metadata.encoder_seq_lens is not None
assert attn_metadata.seq_lens_tensor is not None
key_seq_start_loc = torch.tensor(
[0] + [
partial_prefix_sum := partial_prefix_sum + i
for i in attn_metadata.encoder_seq_lens
],
list(itertools.accumulate([0] + attn_metadata.encoder_seq_lens)),
device=attn_metadata.seq_lens_tensor.device,
dtype=attn_metadata.seq_lens_tensor.dtype)
causal_mask = False
@ -584,6 +590,8 @@ class ROCmFlashAttentionImpl(AttentionImpl):
will match encoder sequence lengths, pass encoder sequence
attributes to kernel (encoder_seq_lens/encoder_seq_lens_tensor/
max_encoder_seq_len)
* ENCODER_ONLY: bidirectional attention with no KV caching;
use prefill sequence attributes

Args:
query: shape = [num_tokens, num_heads * head_size]
@ -608,7 +616,11 @@ class ROCmFlashAttentionImpl(AttentionImpl):
else:
assert value is None

if self.attn_type != AttentionType.ENCODER and kv_cache.numel() > 0:
# Only update KV cache for decoder self-attention
# and encoder-decoder cross-attention
if self.attn_type not in [
AttentionType.ENCODER, AttentionType.ENCODER_ONLY
] and kv_cache.numel() > 0:
key_cache, value_cache = PagedAttention.split_kv_cache(
kv_cache, self.num_kv_heads, self.head_size)

@ -632,6 +644,9 @@ class ROCmFlashAttentionImpl(AttentionImpl):

if self.attn_type != AttentionType.ENCODER:
num_prefill_tokens = attn_metadata.num_prefill_tokens
elif self.attn_type == AttentionType.ENCODER_ONLY:
# For encoder-only models, all tokens are processed in one go
num_prefill_tokens = query.shape[0]
else:
assert attn_metadata.num_encoder_tokens is not None
num_prefill_tokens = attn_metadata.num_encoder_tokens
@ -642,8 +657,13 @@ class ROCmFlashAttentionImpl(AttentionImpl):
# QKV for prefill.
query = query[:num_prefill_tokens]

# For encoder-only and encoder models,
# we process all tokens at once
# For decoder and encoder-decoder,
# we may need to limit key/value to prefill tokens
if key is not None and value is not None \
and self.attn_type != AttentionType.ENCODER_DECODER:
and self.attn_type not in [AttentionType.ENCODER_DECODER,
AttentionType.ENCODER_ONLY]:
key = key[:num_prefill_tokens]
value = value[:num_prefill_tokens]

@ -678,7 +698,7 @@ class ROCmFlashAttentionImpl(AttentionImpl):
self.alibi_slopes,
query.dtype,
seq_lens,
make_attn_mask=False) # type: ignore
make_attn_mask=causal_mask) # type: ignore
out, _ = self.attn_func(
query,
key,
@ -703,7 +723,7 @@ class ROCmFlashAttentionImpl(AttentionImpl):
self.alibi_slopes,
query.dtype,
attn_metadata.seq_lens,
make_attn_mask=True) # type: ignore
make_attn_mask=causal_mask) # type: ignore
query = query.movedim(0, query.dim() - 2)
key = key.movedim(0, key.dim() - 2)
value = value.movedim(0, value.dim() - 2)
@ -729,7 +749,7 @@ class ROCmFlashAttentionImpl(AttentionImpl):
max_seqlen_q=prefill_meta.max_prefill_seq_len,
max_seqlen_k=key_max_seq_len,
softmax_scale=self.scale,
causal=True,
causal=causal_mask,
window_size=self.sliding_window,
alibi_slopes=self.alibi_slopes,
softcap=self.logits_soft_cap,
@ -742,25 +762,29 @@ class ROCmFlashAttentionImpl(AttentionImpl):
else:
output = out
else:
# prefix-enabled attention
output[:num_prefill_tokens] = PagedAttention.forward_prefix(
query,
key,
value,
self.kv_cache_dtype,
key_cache,
value_cache,
prefill_meta.block_tables,
prefill_meta.query_start_loc,
prefill_meta.seq_lens_tensor,
prefill_meta.max_query_len,
self.alibi_slopes,
self.sliding_window[0],
layer._k_scale,
layer._v_scale,
)

if decode_meta := attn_metadata.decode_metadata:
# prefix-enabled attention -
# not applicable for encoder-only models
if self.attn_type != AttentionType.ENCODER_ONLY:
output[:
num_prefill_tokens] = PagedAttention.forward_prefix(
query,
key,
value,
self.kv_cache_dtype,
key_cache,
value_cache,
prefill_meta.block_tables,
prefill_meta.query_start_loc,
prefill_meta.seq_lens_tensor,
prefill_meta.max_query_len,
self.alibi_slopes,
self.sliding_window[0],
layer._k_scale,
layer._v_scale,
)
# Skip decode phase for encoder-only models
if (decode_meta := attn_metadata.decode_metadata) and (
self.attn_type != AttentionType.ENCODER_ONLY):
# Decoding run.
# Whether to use rocm custom paged attention or not
num_seqs, num_heads, head_size = decode_query.shape
@ -885,4 +909,4 @@ def _use_rocm_custom_paged_attention(qtype: torch.dtype, head_size: int,
and (qtype == torch.half or qtype == torch.bfloat16)
and (head_size == 64 or head_size == 128)
and (block_size == 16 or block_size == 32)
and (gqa_ratio >= 1 and gqa_ratio <= 16) and max_seq_len <= 32768)
and (gqa_ratio >= 1 and gqa_ratio <= 16) and max_seq_len <= 32768)

@ -17,7 +17,7 @@ class _PagedAttention:

@staticmethod
def get_supported_head_sizes() -> List[int]:
return [32, 64, 80, 96, 112, 128, 256]
return [32, 64, 80, 96, 112, 128, 192, 256]

@staticmethod
def get_kv_cache_shape(

@ -350,10 +350,11 @@ class ModelConfig:
if self.enforce_eager is None:
self.enforce_eager = False

interleaved_attn_models = ["gemma2", "gemma3_text", "cohere2"]
sliding_window = getattr(self.hf_text_config, "sliding_window", None)
has_interleaved_attention = (sliding_window is not None) and (
isinstance(sliding_window, list) or
(self.hf_text_config.model_type in ["gemma2", "cohere2"]))
(self.hf_text_config.model_type in interleaved_attn_models))

if (not self.disable_sliding_window and has_interleaved_attention):
if (backend :=
@ -1145,7 +1146,7 @@ class CacheConfig:
if not self.enable_prefix_caching:
return

if self.sliding_window is not None:
if self.sliding_window is not None and not envs.VLLM_USE_V1:
raise NotImplementedError(
"Prefix caching is not supported with sliding window. "
"Run with --disable-sliding-window to use prefix caching.")
@ -2501,11 +2502,11 @@ def _get_and_verify_dtype(
dtype = dtype.lower()
if dtype == "auto":
if config_dtype == torch.float32:
if config.model_type == "gemma2":
if config.model_type in ("gemma2", "gemma3", "gemma3_text"):
logger.info(
"For Gemma 2, we downcast float32 to bfloat16 instead "
"of float16 by default. Please specify `dtype` if you "
"want to use float16.")
"For Gemma 2 and 3, we downcast float32 to bfloat16 "
"instead of float16 by default. Please specify `dtype` "
"if you want to use float16.")
torch_dtype = torch.bfloat16
else:
# Following the common practice, we use float16 for float32
@ -2637,7 +2638,9 @@ def _get_and_verify_max_len(
derived_max_model_len = default_max_len

rope_scaling = getattr(hf_config, "rope_scaling", None)
if rope_scaling is not None:
# NOTE(woosuk): Gemma3's max_model_len (128K) is already scaled by RoPE
# scaling, so we skip applying the scaling factor again.
if rope_scaling is not None and "gemma3" not in hf_config.model_type:
# No need to consider "type" key because of patch_rope_scaling when
# loading HF config
rope_type = rope_scaling["rope_type"]
@ -3447,9 +3450,9 @@ class VllmConfig:
self.compilation_config.level = CompilationLevel.NO_COMPILATION

if self.model_config and self.model_config.use_mla and \
not current_platform.is_cuda():
not (current_platform.is_cuda() or current_platform.is_rocm()):
logger.info(
"MLA is enabled on a non-cuda platform; forcing chunked "
"MLA is enabled on a non-GPU platform; forcing chunked "
"prefill and prefix caching to be disabled.")
self.scheduler_config.enable_chunked_prefill = False
self.scheduler_config.chunked_prefill_enabled = False

@ -433,6 +433,8 @@ class BaseMultiModalItemTracker(ABC, Generic[_T]):
return "<image>"
if model_type == "aria":
return "<|fim_prefix|><|img|><|fim_suffix|>"
if model_type == "gemma3":
return "<start_of_image>"

raise TypeError(f"Unknown {modality} model type: {model_type}")
elif modality == "audio":

@ -254,10 +254,11 @@ def _run_worker_process(
# online (in situ) tuning is enabled.
# Offline tuning API (record_untuned_is_enabled()) only
# available in PyTorch 2.6 or later.
import torch.cuda.tunable as tunable
if (tunable.is_enabled() and tunable.tuning_is_enabled()
and not tunable.record_untuned_is_enabled()):
tunable.write_file()
if torch.cuda.is_available():
import torch.cuda.tunable as tunable
if (tunable.is_enabled() and tunable.tuning_is_enabled()
and not tunable.record_untuned_is_enabled()):
tunable.write_file()

logger.info("Worker exiting")

@ -193,10 +193,11 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp):
global_num_experts: int = -1,
expert_map: Optional[torch.Tensor] = None,
custom_routing_function: Optional[Callable] = None,
scoring_func: str = "softmax",
e_score_correction_bias: Optional[torch.Tensor] = None,
activation: str = "silu",
**kwargs,
):
assert custom_routing_function is None
assert activation == "silu", f"{activation} is not supported."
return layer.ipex_fusion(
x,
@ -206,6 +207,9 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp):
renormalize,
topk_group,
num_expert_group,
custom_routing_function,
scoring_func,
e_score_correction_bias,
)

def forward_tpu(

vllm/model_executor/models/gemma3.py (new file, 533 lines)
@ -0,0 +1,533 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# Copyright 2025 The vLLM team.
|
||||
# Copyright 2025 Google Inc. HuggingFace Inc. team. All rights reserved.
|
||||
#
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
from typing import Iterable, Optional, Set, Tuple, Union
|
||||
|
||||
import torch
|
||||
import torch.nn.functional as F
|
||||
from torch import nn
|
||||
from transformers import Gemma3TextConfig
|
||||
|
||||
from vllm.attention import Attention
|
||||
from vllm.compilation.decorators import support_torch_compile
|
||||
from vllm.config import CacheConfig, VllmConfig
|
||||
from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size
|
||||
from vllm.logger import init_logger
|
||||
from vllm.model_executor.layers.activation import GeluAndMul
|
||||
from vllm.model_executor.layers.layernorm import GemmaRMSNorm
|
||||
from vllm.model_executor.layers.linear import (MergedColumnParallelLinear,
|
||||
QKVParallelLinear,
|
||||
RowParallelLinear)
|
||||
from vllm.model_executor.layers.logits_processor import LogitsProcessor
|
||||
from vllm.model_executor.layers.quantization import QuantizationConfig
|
||||
from vllm.model_executor.layers.rotary_embedding import get_rope
|
||||
from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler
|
||||
from vllm.model_executor.layers.vocab_parallel_embedding import (
|
||||
VocabParallelEmbedding)
|
||||
from vllm.model_executor.model_loader.weight_utils import (
|
||||
default_weight_loader, maybe_remap_kv_scale_name)
|
||||
from vllm.model_executor.sampling_metadata import SamplingMetadata
|
||||
from vllm.sequence import IntermediateTensors
|
||||
|
||||
from .interfaces import SupportsLoRA, SupportsPP
|
||||
from .utils import (AutoWeightsLoader, extract_layer_index,
|
||||
is_pp_missing_parameter,
|
||||
make_empty_intermediate_tensors_factory, make_layers,
|
||||
maybe_prefix)
|
||||
|
||||
logger = init_logger(__name__)
|
||||
|
||||
|
||||
class Gemma3MLP(nn.Module):
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
hidden_size: int,
|
||||
intermediate_size: int,
|
||||
hidden_activation: str,
|
||||
quant_config: Optional[QuantizationConfig] = None,
|
||||
) -> None:
|
||||
super().__init__()
|
||||
self.gate_up_proj = MergedColumnParallelLinear(
|
||||
hidden_size, [intermediate_size] * 2,
|
||||
bias=False,
|
||||
quant_config=quant_config)
|
||||
self.down_proj = RowParallelLinear(intermediate_size,
|
||||
hidden_size,
|
||||
bias=False,
|
||||
quant_config=quant_config)
|
||||
if hidden_activation != "gelu_pytorch_tanh":
|
||||
raise ValueError(
|
||||
"Gemma3 uses `gelu_pytorch_tanh` as the hidden activation "
|
||||
"function. Please set `hidden_act` and `hidden_activation` to "
|
||||
"`gelu_pytorch_tanh`.")
|
||||
self.act_fn = GeluAndMul(approximate="tanh")
|
||||
|
||||
def forward(self, x: torch.Tensor) -> torch.Tensor:
|
||||
gate_up, _ = self.gate_up_proj(x)
|
||||
x = self.act_fn(gate_up)
|
||||
x, _ = self.down_proj(x)
|
||||
return x
|
||||
|
||||
|
||||
class Gemma3Attention(nn.Module):
|
||||
|
||||
def __init__(self,
|
||||
config: Gemma3TextConfig,
|
||||
hidden_size: int,
|
||||
num_heads: int,
|
||||
num_kv_heads: int,
|
||||
head_dim: int,
|
||||
max_position_embeddings: int,
|
||||
cache_config: Optional[CacheConfig] = None,
|
||||
quant_config: Optional[QuantizationConfig] = None,
|
||||
attn_logits_soft_cap: Optional[float] = None,
|
||||
prefix: str = "") -> None:
|
||||
super().__init__()
|
||||
self.config = config
|
||||
self.hidden_size = hidden_size
|
||||
tp_size = get_tensor_model_parallel_world_size()
|
||||
self.total_num_heads = num_heads
|
||||
assert self.total_num_heads % tp_size == 0
|
||||
self.num_heads = self.total_num_heads // tp_size
|
||||
self.total_num_kv_heads = num_kv_heads
|
||||
if self.total_num_kv_heads >= tp_size:
|
||||
# Number of KV heads is greater than TP size, so we partition
|
||||
# the KV heads across multiple tensor parallel GPUs.
|
||||
assert self.total_num_kv_heads % tp_size == 0
|
||||
else:
|
||||
# Number of KV heads is less than TP size, so we replicate
|
||||
# the KV heads across multiple tensor parallel GPUs.
|
||||
assert tp_size % self.total_num_kv_heads == 0
|
||||
self.num_kv_heads = max(1, self.total_num_kv_heads // tp_size)
|
||||
self.head_dim = head_dim
|
||||
self.q_size = self.num_heads * self.head_dim
|
||||
self.kv_size = self.num_kv_heads * self.head_dim
|
||||
self.scaling = config.query_pre_attn_scalar**-0.5
|
||||
|
||||
self.qkv_proj = QKVParallelLinear(
|
||||
hidden_size,
|
||||
self.head_dim,
|
||||
self.total_num_heads,
|
||||
self.total_num_kv_heads,
|
||||
bias=config.attention_bias,
|
||||
quant_config=quant_config,
|
||||
)
|
||||
self.o_proj = RowParallelLinear(
|
||||
self.total_num_heads * self.head_dim,
|
||||
hidden_size,
|
||||
bias=config.attention_bias,
|
||||
quant_config=quant_config,
|
||||
)
|
||||
|
||||
self.q_norm = GemmaRMSNorm(self.head_dim, eps=config.rms_norm_eps)
|
||||
self.k_norm = GemmaRMSNorm(self.head_dim, eps=config.rms_norm_eps)
|
||||
|
||||
# TODO(woosuk): Add reference to the original HF implementation.
|
||||
layer_idx = extract_layer_index(prefix)
|
||||
self.is_sliding = bool((layer_idx + 1) % config.sliding_window_pattern)
|
||||
# Initialize the rotary embedding.
|
||||
if self.is_sliding:
|
||||
# Local attention. Override the values in config.json.
|
||||
self.rope_theta = config.rope_local_base_freq
|
||||
self.rope_scaling = {"rope_type": "default"}
|
||||
self.sliding_window = config.interleaved_sliding_window
|
||||
else:
|
||||
# Global attention. Use the values in config.json.
|
||||
self.rope_theta = config.rope_theta
|
||||
self.rope_scaling = config.rope_scaling
|
||||
self.sliding_window = None
|
||||
self.rotary_emb = get_rope(
|
||||
self.head_dim,
|
||||
rotary_dim=self.head_dim,
|
||||
max_position=max_position_embeddings,
|
||||
base=self.rope_theta,
|
||||
is_neox_style=True,
|
||||
rope_scaling=self.rope_scaling,
|
||||
)
|
||||
|
||||
# Initialize the attention.
|
||||
self.attn = Attention(self.num_heads,
|
||||
self.head_dim,
|
||||
self.scaling,
|
||||
num_kv_heads=self.num_kv_heads,
|
||||
cache_config=cache_config,
|
||||
quant_config=quant_config,
|
||||
logits_soft_cap=attn_logits_soft_cap,
|
||||
per_layer_sliding_window=self.sliding_window,
|
||||
prefix=f"{prefix}.attn")
|
||||
|
||||
def forward(
|
||||
self,
|
||||
positions: torch.Tensor,
|
||||
hidden_states: torch.Tensor,
|
||||
**kwargs,
|
||||
) -> torch.Tensor:
|
||||
qkv, _ = self.qkv_proj(hidden_states)
|
||||
q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1)
|
||||
|
||||
q = q.unflatten(-1, (self.num_heads, self.head_dim))
|
||||
q = self.q_norm(q)
|
||||
q = q.flatten(-2, -1)
|
||||
k = k.unflatten(-1, (self.num_kv_heads, self.head_dim))
|
||||
k = self.k_norm(k)
|
||||
k = k.flatten(-2, -1)
|
||||
|
||||
q, k = self.rotary_emb(positions, q, k)
|
||||
attn_output = self.attn(q, k, v)
|
||||
|
||||
if not kwargs.get("has_images", False):
|
||||
# Fast path for text-only inputs. The performance for the text-only
|
||||
# inputs are not affected by the naive attention below.
|
||||
output, _ = self.o_proj(attn_output)
|
||||
return output
|
||||
|
||||
# NOTE(woosuk): Gemma3 uses bidirectional attention between image tokens
|
||||
# that correspond to the same image while using causal attention
|
||||
# otherwise. Current attention backends cannot handle this pattern, so
|
||||
# we temporarily use a naive attention implementation with mask tensors.
|
||||
|
||||
# We intentionally keep the attention backend as-is and only override
|
||||
# `attn_output` with the naive implementation's output. This minimizes
|
||||
# changes to existing model runners and attention backends. The call to
|
||||
# `self.attn(q, k, v)` is only used to populate the KV cache - its
|
||||
# output is discarded and overwritten below. While this duplicates
|
||||
# computation, it maintains compatibility.
|
||||
# TODO(woosuk): Optimize by implementing custom attention kernels.
|
||||
attn_output = self.naive_attn_with_masks(q,
|
||||
k,
|
||||
v,
|
||||
out=attn_output,
|
||||
**kwargs)
|
||||
output, _ = self.o_proj(attn_output)
|
||||
return output
|
||||
|
||||
def naive_attn_with_masks(
|
||||
self,
|
||||
q: torch.Tensor,
|
||||
k: torch.Tensor,
|
||||
v: torch.Tensor,
|
||||
out: torch.Tensor,
|
||||
**kwargs,
|
||||
) -> torch.Tensor:
|
||||
# NOTE(woosuk): As described in the comment above, this code is not
|
||||
# meant to be performant. It is only meant to be correct.
|
||||
q = q.view(-1, self.num_heads, self.head_dim)
|
||||
# Expand the key and value to handle GQA.
|
||||
num_queries_per_kv = self.num_heads // self.num_kv_heads
|
||||
k = k.view(-1, self.num_kv_heads, self.head_dim)
|
||||
k = k.repeat_interleave(num_queries_per_kv, dim=-2)
|
||||
v = v.view(-1, self.num_kv_heads, self.head_dim)
|
||||
v = v.repeat_interleave(num_queries_per_kv, dim=-2)
|
||||
|
||||
if self.is_sliding:
|
||||
attn_masks = kwargs["local_attn_masks"]
|
||||
else:
|
||||
attn_masks = kwargs["global_attn_masks"]
|
||||
|
||||
seq_lens = kwargs["seq_lens"]
|
||||
start_idx = 0
|
||||
for seq_len, attn_mask in zip(seq_lens, attn_masks):
|
||||
end_idx = start_idx + seq_len
|
||||
query = q[start_idx:end_idx].unsqueeze(0)
|
||||
key = k[start_idx:end_idx].unsqueeze(0)
|
||||
value = v[start_idx:end_idx].unsqueeze(0)
|
||||
|
||||
# Transpose.
|
||||
query = query.transpose(1, 2)
|
||||
key = key.transpose(1, 2)
|
||||
value = value.transpose(1, 2)
|
||||
|
||||
output = F.scaled_dot_product_attention(
|
||||
query,
|
||||
key,
|
||||
value,
|
||||
attn_mask,
|
||||
self.scaling,
|
||||
)
|
||||
output = output.transpose(1, 2).flatten(-2, -1)
|
||||
out[start_idx:end_idx] = output
|
||||
start_idx = end_idx
|
||||
return out
|
||||
|
||||
|
||||


class Gemma3DecoderLayer(nn.Module):

    def __init__(
        self,
        config: Gemma3TextConfig,
        cache_config: Optional[CacheConfig] = None,
        quant_config: Optional[QuantizationConfig] = None,
        prefix: str = "",
    ) -> None:
        super().__init__()
        self.hidden_size = config.hidden_size
        self.self_attn = Gemma3Attention(
            config=config,
            hidden_size=self.hidden_size,
            num_heads=config.num_attention_heads,
            num_kv_heads=config.num_key_value_heads,
            head_dim=config.head_dim,
            max_position_embeddings=config.max_position_embeddings,
            cache_config=cache_config,
            quant_config=quant_config,
            attn_logits_soft_cap=None,
            prefix=f"{prefix}.self_attn",
        )
        self.hidden_size = config.hidden_size
        self.mlp = Gemma3MLP(
            hidden_size=self.hidden_size,
            intermediate_size=config.intermediate_size,
            hidden_activation=config.hidden_activation,
            quant_config=quant_config,
        )
        self.input_layernorm = GemmaRMSNorm(config.hidden_size,
                                            eps=config.rms_norm_eps)
        self.post_attention_layernorm = GemmaRMSNorm(config.hidden_size,
                                                     eps=config.rms_norm_eps)
        self.pre_feedforward_layernorm = GemmaRMSNorm(config.hidden_size,
                                                      eps=config.rms_norm_eps)
        self.post_feedforward_layernorm = GemmaRMSNorm(config.hidden_size,
                                                       eps=config.rms_norm_eps)

    def forward(
        self,
        positions: torch.Tensor,
        hidden_states: torch.Tensor,
        residual: Optional[torch.Tensor],
        **kwargs,
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        if residual is None:
            residual = hidden_states
            hidden_states = self.input_layernorm(hidden_states)
        else:
            hidden_states, residual = self.input_layernorm(
                hidden_states, residual)
        hidden_states = self.self_attn(
            positions=positions,
            hidden_states=hidden_states,
            **kwargs,
        )
        hidden_states = self.post_attention_layernorm(hidden_states)

        hidden_states, residual = self.pre_feedforward_layernorm(
            hidden_states, residual)
        hidden_states = self.mlp(hidden_states)
        hidden_states = self.post_feedforward_layernorm(hidden_states)
        return hidden_states, residual


@support_torch_compile
class Gemma3Model(nn.Module):

    def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
        super().__init__()
        config = vllm_config.model_config.hf_config
        cache_config = vllm_config.cache_config
        quant_config = vllm_config.quant_config
        self.config = config
        self.quant_config = quant_config

        self.embed_tokens = VocabParallelEmbedding(
            config.vocab_size,
            config.hidden_size,
        )
        self.start_layer, self.end_layer, self.layers = make_layers(
            config.num_hidden_layers,
            lambda prefix: Gemma3DecoderLayer(
                config, cache_config, quant_config, prefix=prefix),
            prefix=f"{prefix}.layers")
        self.norm = GemmaRMSNorm(config.hidden_size, eps=config.rms_norm_eps)

        # Normalize the embedding by sqrt(hidden_size)
        # The normalizer's data type should be downcasted to the model's
        # data type such as bfloat16, not float32.
        # See https://github.com/huggingface/transformers/pull/29402
        normalizer = self.config.hidden_size**0.5
        self.register_buffer("normalizer", torch.tensor(normalizer))
        self.make_empty_intermediate_tensors = (
            make_empty_intermediate_tensors_factory(
                ["hidden_states", "residual"], config.hidden_size))

    def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor:
        # NOTE(woosuk): Only apply the normalizer to the output of
        # vocab embedding. Don't apply it to the vision embedding.
        return self.embed_tokens(input_ids) * self.normalizer

    def forward(
        self,
        input_ids: Optional[torch.Tensor],
        positions: torch.Tensor,
        intermediate_tensors: Optional[IntermediateTensors],
        inputs_embeds: Optional[torch.Tensor] = None,
        **kwargs,
    ) -> Union[torch.Tensor, IntermediateTensors]:
        if get_pp_group().is_first_rank:
            if inputs_embeds is not None:
                hidden_states = inputs_embeds
            else:
                hidden_states = self.get_input_embeddings(input_ids)
            residual = None
        else:
            assert intermediate_tensors is not None
            hidden_states = intermediate_tensors["hidden_states"]
            residual = intermediate_tensors["residual"]
        for layer in self.layers[self.start_layer:self.end_layer]:
            hidden_states, residual = layer(
                positions,
                hidden_states,
                residual,
                **kwargs,
            )
        if not get_pp_group().is_last_rank:
            return IntermediateTensors({
                "hidden_states": hidden_states,
                "residual": residual
            })
        hidden_states, _ = self.norm(hidden_states, residual)
        return hidden_states

    def load_weights(self, weights: Iterable[Tuple[str,
                                                   torch.Tensor]]) -> Set[str]:
        stacked_params_mapping = [
            # (param_name, shard_name, shard_id)
            ("qkv_proj", "q_proj", "q"),
            ("qkv_proj", "k_proj", "k"),
            ("qkv_proj", "v_proj", "v"),
            ("gate_up_proj", "gate_proj", 0),
            ("gate_up_proj", "up_proj", 1),
        ]
        params_dict = dict(self.named_parameters())
        loaded_params: Set[str] = set()
        for name, loaded_weight in weights:
            if (self.quant_config is not None and
                (scale_name := self.quant_config.get_cache_scale(name))):
                # Loading kv cache scales for compressed-tensors quantization
                param = params_dict[scale_name]
                weight_loader = getattr(param, "weight_loader",
                                        default_weight_loader)
                loaded_weight = loaded_weight[0]
                weight_loader(param, loaded_weight)
                loaded_params.add(scale_name)
                continue
            for (param_name, shard_name, shard_id) in stacked_params_mapping:
                if shard_name not in name:
                    continue
                name = name.replace(shard_name, param_name)
                # Skip loading extra bias for GPTQ models.
                if name.endswith(".bias") and name not in params_dict:
                    continue
                if is_pp_missing_parameter(name, self):
                    continue
                param = params_dict[name]
                weight_loader = param.weight_loader
                weight_loader(param, loaded_weight, shard_id)
                break
            else:
                # Skip loading extra bias for GPTQ models.
                if name.endswith(".bias") and name not in params_dict:
                    continue
                # Remapping the name of FP8 kv-scale.
                name = maybe_remap_kv_scale_name(name, params_dict)
                if name is None:
                    continue
                if is_pp_missing_parameter(name, self):
                    continue
                param = params_dict[name]
                weight_loader = getattr(param, "weight_loader",
                                        default_weight_loader)
                weight_loader(param, loaded_weight)
            loaded_params.add(name)

        unloaded_params = params_dict.keys() - loaded_params
        if unloaded_params:
            logger.warning(
                "Some weights are not initialized from checkpoints: %s",
                unloaded_params)
        return loaded_params


class Gemma3ForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
    packed_modules_mapping = {
        "qkv_proj": [
            "q_proj",
            "k_proj",
            "v_proj",
        ],
        "gate_up_proj": [
            "gate_proj",
            "up_proj",
        ],
    }

    def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
        config = vllm_config.model_config.hf_config
        quant_config = vllm_config.quant_config
        lora_config = vllm_config.lora_config
        del lora_config  # Unused.
        super().__init__()
        self.config = config
        # currently all existing Gemma models have `tie_word_embeddings` enabled
        assert config.tie_word_embeddings
        self.quant_config = quant_config
        self.model = Gemma3Model(vllm_config=vllm_config,
                                 prefix=maybe_prefix(prefix, "model"))
        self.logits_processor = LogitsProcessor(
            config.vocab_size, soft_cap=config.final_logit_softcapping)
        self.sampler = get_sampler()
        self.make_empty_intermediate_tensors = (
            self.model.make_empty_intermediate_tensors)

    def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor:
        return self.model.get_input_embeddings(input_ids)

    def forward(
        self,
        input_ids: torch.Tensor,
        positions: torch.Tensor,
        intermediate_tensors: Optional[IntermediateTensors] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        **kwargs,
    ) -> Union[torch.Tensor, IntermediateTensors]:
        hidden_states = self.model(input_ids, positions, intermediate_tensors,
                                   inputs_embeds, **kwargs)
        return hidden_states

    def compute_logits(
        self,
        hidden_states: torch.Tensor,
        sampling_metadata: SamplingMetadata,
    ) -> Optional[torch.Tensor]:
        logits = self.logits_processor(self.model.embed_tokens, hidden_states,
                                       sampling_metadata)
        return logits

    def sample(
        self,
        logits: torch.Tensor,
        sampling_metadata: SamplingMetadata,
    ) -> Optional[SamplerOutput]:
        next_tokens = self.sampler(logits, sampling_metadata)
        return next_tokens

    def load_weights(self, weights: Iterable[Tuple[str,
                                                   torch.Tensor]]) -> Set[str]:
        loader = AutoWeightsLoader(
            self,
            skip_prefixes=(["lm_head."]
                           if self.config.tie_word_embeddings else None),
        )
        return loader.load_weights(weights)
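Note on the embedding normalizer in Gemma3Model above: the sqrt(hidden_size) factor is registered as a buffer so that it is downcast to the model dtype (e.g. bfloat16) along with the weights, as the comment and the linked transformers PR describe. A minimal sketch of why the downcast matters; the hidden size below is an assumed example value, not taken from this diff:

import torch

hidden_size = 2560  # illustrative value only
norm_fp32 = torch.tensor(hidden_size**0.5)                        # float32 scale
norm_bf16 = torch.tensor(hidden_size**0.5, dtype=torch.bfloat16)  # downcast scale

# The scale itself rounds differently once downcast, so multiplying embeddings
# by the float32 value would not reproduce the reference bfloat16 outputs.
print(norm_fp32.item())  # ~50.596
print(norm_bf16.item())  # 50.5 after bfloat16 rounding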
vllm/model_executor/models/gemma3_mm.py (new file, 425 lines)
@ -0,0 +1,425 @@
# SPDX-License-Identifier: Apache-2.0
from typing import (Any, Iterable, Literal, Mapping, Optional, Sequence, Set,
                    Tuple, TypedDict, Union)

import torch
from torch import nn
from transformers import BatchFeature, Gemma3Config, ProcessorMixin

from vllm.config import VllmConfig
from vllm.logger import init_logger
from vllm.model_executor.layers.layernorm import GemmaRMSNorm
from vllm.model_executor.layers.sampler import SamplerOutput
from vllm.model_executor.sampling_metadata import SamplingMetadata
from vllm.multimodal import MULTIMODAL_REGISTRY
from vllm.multimodal.inputs import (MultiModalFieldConfig, MultiModalKwargs,
                                    NestedTensors)
from vllm.multimodal.parse import ImageSize, MultiModalDataItems
from vllm.multimodal.processing import (BaseMultiModalProcessor,
                                        BaseProcessingInfo, PromptReplacement,
                                        PromptUpdate, PromptUpdateDetails)
from vllm.multimodal.profiling import BaseDummyInputsBuilder, ProcessorInputs
from vllm.sequence import IntermediateTensors

from .interfaces import SupportsMultiModal, SupportsPP
from .siglip import SiglipVisionModel
from .utils import (AutoWeightsLoader, flatten_bn, init_vllm_registered_model,
                    maybe_prefix, merge_multimodal_embeddings)

logger = init_logger(__name__)


class Gemma3ImagePixelInputs(TypedDict):
    type: Literal["pixel_values"]
    data: torch.Tensor
    """Shape: `(batch_size * num_images, num_channels, height, width)`"""


Gemma3ImageInputs = Gemma3ImagePixelInputs


class Gemma3ProcessingInfo(BaseProcessingInfo):

    def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
        return {"image": None}

    def get_mm_max_tokens_per_item(
        self,
        seq_len: int,
        mm_counts: Mapping[str, int],
    ) -> Mapping[str, int]:
        hf_config = self.ctx.get_hf_config()
        return {"image": hf_config.mm_tokens_per_image}

    def get_num_image_tokens(
        self,
        *,
        image_width: int,
        image_height: int,
        processor: Optional[ProcessorMixin],
    ) -> int:
        hf_config = self.ctx.get_hf_config()
        return hf_config.mm_tokens_per_image

    def get_image_size_with_most_features(self) -> ImageSize:
        # Result in the max possible feature size (h:w = 16:1)
        return ImageSize(height=8000, width=50)


class Gemma3DummyInputsBuilder(BaseDummyInputsBuilder[Gemma3ProcessingInfo]):

    def get_dummy_processor_inputs(
        self,
        seq_len: int,
        mm_counts: Mapping[str, int],
    ) -> ProcessorInputs:
        tokenizer = self.info.get_tokenizer()
        boi_token = tokenizer.boi_token

        num_images = mm_counts.get("image", 0)
        target_width, target_height = \
            self.info.get_image_size_with_most_features()

        mm_data = {
            "image":
            self._get_dummy_images(width=target_width,
                                   height=target_height,
                                   num_images=num_images)
        }
        return ProcessorInputs(
            prompt_text=" ".join([boi_token] * num_images),
            mm_data=mm_data,
        )


class Gemma3MultiModalProcessor(BaseMultiModalProcessor[Gemma3ProcessingInfo]):

    def _call_hf_processor(
        self,
        prompt: str,
        mm_data: Mapping[str, object],
        mm_kwargs: Mapping[str, object],
    ) -> BatchFeature:
        # TODO(woosuk): Support pan-and-scan.
        img_kwargs = mm_kwargs.get("images_kwargs", {})
        img_kwargs["do_pan_and_scan"] = False
        mm_kwargs["images_kwargs"] = img_kwargs
        return super()._call_hf_processor(
            prompt=prompt,
            mm_data=mm_data,
            mm_kwargs=mm_kwargs,
        )

    def _get_mm_fields_config(
        self,
        hf_inputs: BatchFeature,
        hf_processor_mm_kwargs: Mapping[str, object],
    ) -> Mapping[str, MultiModalFieldConfig]:
        return dict(pixel_values=MultiModalFieldConfig.batched("image"))

    def _get_prompt_updates(
        self,
        mm_items: MultiModalDataItems,
        hf_processor_mm_kwargs: Mapping[str, Any],
        out_mm_kwargs: MultiModalKwargs,
    ) -> Sequence[PromptUpdate]:
        tokenizer = self.info.get_tokenizer()
        hf_processor = self.info.get_hf_processor(**hf_processor_mm_kwargs)
        hf_config = self.info.get_hf_config()

        boi_token = tokenizer.boi_token
        image_token = tokenizer.image_token
        mm_tokens_per_image = hf_config.mm_tokens_per_image
        image_tokens_expanded = "".join([image_token] * mm_tokens_per_image)

        def get_replacement_gemma3(item_idx: int):
            return PromptUpdateDetails(
                full=hf_processor.full_image_sequence,
                features=image_tokens_expanded,
            )

        return [
            PromptReplacement(
                modality="image",
                target=boi_token,
                replacement=get_replacement_gemma3,
            )
        ]


class Gemma3MultiModalProjector(nn.Module):

    def __init__(self, config: Gemma3Config):
        super().__init__()

        self.mm_input_projection_weight = nn.Parameter(
            torch.zeros(config.vision_config.hidden_size,
                        config.text_config.hidden_size))

        self.mm_soft_emb_norm = GemmaRMSNorm(
            config.vision_config.hidden_size,
            eps=config.vision_config.layer_norm_eps)

        self.patches_per_image = int(config.vision_config.image_size //
                                     config.vision_config.patch_size)
        self.tokens_per_side = int(config.mm_tokens_per_image**0.5)
        self.kernel_size = self.patches_per_image // self.tokens_per_side
        self.avg_pool = nn.AvgPool2d(kernel_size=self.kernel_size,
                                     stride=self.kernel_size)

    def forward(self, vision_outputs: torch.Tensor):
        batch_size, _, seq_length = vision_outputs.shape

        reshaped_vision_outputs = vision_outputs.transpose(1, 2)
        reshaped_vision_outputs = reshaped_vision_outputs.reshape(
            batch_size, seq_length, self.patches_per_image,
            self.patches_per_image)
        reshaped_vision_outputs = reshaped_vision_outputs.contiguous()

        pooled_vision_outputs = self.avg_pool(reshaped_vision_outputs)
        pooled_vision_outputs = pooled_vision_outputs.flatten(2)
        pooled_vision_outputs = pooled_vision_outputs.transpose(1, 2)

        normed_vision_outputs = self.mm_soft_emb_norm(pooled_vision_outputs)

        projected_vision_outputs = torch.matmul(
            normed_vision_outputs, self.mm_input_projection_weight)
        return projected_vision_outputs.type_as(vision_outputs)


@MULTIMODAL_REGISTRY.register_processor(Gemma3MultiModalProcessor,
                                        info=Gemma3ProcessingInfo,
                                        dummy_inputs=Gemma3DummyInputsBuilder)
class Gemma3ForConditionalGeneration(nn.Module, SupportsMultiModal,
                                     SupportsPP):
    packed_modules_mapping = {
        "qkv_proj": [
            "q_proj",
            "k_proj",
            "v_proj",
        ],
        "gate_up_proj": [
            "gate_proj",
            "up_proj",
        ],
    }

    def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
        super().__init__()
        config = vllm_config.model_config.hf_config
        quant_config = vllm_config.quant_config
        multimodal_config = vllm_config.model_config.multimodal_config
        self.config = config
        self.quant_config = quant_config
        self.multimodal_config = multimodal_config
        self.sliding_window = config.text_config.interleaved_sliding_window

        self.vision_tower = SiglipVisionModel(config.vision_config,
                                              quant_config,
                                              prefix=maybe_prefix(
                                                  prefix, "vision_tower"))
        self.multi_modal_projector = Gemma3MultiModalProjector(config)

        self.language_model = init_vllm_registered_model(
            vllm_config=vllm_config,
            hf_config=config.text_config,
            prefix=maybe_prefix(prefix, "language_model"),
            architectures=["Gemma3ForCausalLM"],
        )
        logit_scale = getattr(config, "logit_scale", 1.0)
        self.language_model.logits_processor.scale *= logit_scale

        self.make_empty_intermediate_tensors = (
            self.language_model.make_empty_intermediate_tensors)

    @property
    def sampler(self):
        return self.language_model.sampler

    def _validate_pixel_values(self, data: torch.Tensor) -> torch.Tensor:
        h = w = self.config.vision_config.image_size
        expected_dims = (3, h, w)

        def _validate_shape(d: torch.Tensor):
            if d.shape != expected_dims:
                raise ValueError(
                    "The expected shape of pixel values per image per batch "
                    f"is {expected_dims}. You supplied {tuple(d.shape)}.")

        for d in data:
            _validate_shape(d)

        return data

    def _parse_and_validate_image_input(
            self, **kwargs: object) -> Optional[Gemma3ImageInputs]:
        pixel_values = kwargs.pop("pixel_values", None)
        image_embeds = kwargs.pop("image_embeds", None)
        assert image_embeds is None, "Gemma3 does not support image_embeds."
        if pixel_values is None:
            return None

        if not isinstance(pixel_values, (torch.Tensor, list)):
            raise ValueError("Incorrect type of pixel values. "
                             f"Got type: {type(pixel_values)}")

        pixel_values = flatten_bn(pixel_values, concat=True)
        return Gemma3ImagePixelInputs(
            type="pixel_values",
            data=self._validate_pixel_values(pixel_values),
        )

    def _image_pixels_to_features(
        self,
        vision_tower: SiglipVisionModel,
        pixel_values: torch.Tensor,
    ) -> torch.Tensor:
        target_dtype = vision_tower.get_input_embeddings().weight.dtype
        image_features = vision_tower(pixel_values.to(dtype=target_dtype))
        return image_features

    def _process_image_input(
        self,
        image_input: Gemma3ImageInputs,
    ) -> torch.Tensor:
        assert self.vision_tower is not None
        pixel_values = image_input["data"]
        vision_outputs = self._image_pixels_to_features(
            self.vision_tower,
            pixel_values,
        )
        return self.multi_modal_projector(vision_outputs)

    def get_multimodal_embeddings(self, **kwargs) -> Optional[NestedTensors]:
        image_input = self._parse_and_validate_image_input(**kwargs)
        if image_input is None:
            return None
        vision_embeddings = self._process_image_input(image_input)
        return vision_embeddings

    def get_input_embeddings(
        self,
        input_ids: torch.Tensor,
        multimodal_embeddings: Optional[NestedTensors] = None,
    ) -> torch.Tensor:
        if multimodal_embeddings is None:
            inputs_embeds = self.language_model.get_input_embeddings(input_ids)
        else:
            inputs_embeds = self.language_model.get_input_embeddings(input_ids)
            inputs_embeds = merge_multimodal_embeddings(
                input_ids, inputs_embeds, multimodal_embeddings,
                self.config.image_token_index)
        return inputs_embeds

    def forward(self,
                input_ids: torch.Tensor,
                positions: torch.Tensor,
                intermediate_tensors: Optional[IntermediateTensors] = None,
                inputs_embeds: Optional[torch.Tensor] = None,
                **kwargs: object) -> Union[SamplerOutput, IntermediateTensors]:
        if intermediate_tensors is not None:
            inputs_embeds = None

        # NOTE: In v1, inputs_embeds is always generated at model runner, this
        # condition is for v0 compatibility.
        elif inputs_embeds is None:
            vision_embeddings = self.get_multimodal_embeddings(**kwargs)

            inputs_embeds = self.get_input_embeddings(input_ids,
                                                      vision_embeddings)
            if vision_embeddings is not None:
                kwargs = self.prepare_attn_masks(
                    input_ids,
                    positions,
                    mask_dtype=vision_embeddings.dtype,
                    **kwargs)
            input_ids = None

        hidden_states = self.language_model.model(input_ids,
                                                  positions,
                                                  intermediate_tensors,
                                                  inputs_embeds=inputs_embeds,
                                                  **kwargs)

        return hidden_states

    def prepare_attn_masks(
        self,
        input_ids: torch.Tensor,
        positions: torch.Tensor,
        mask_dtype: torch.dtype,
        **kwargs,
    ):
        kwargs["has_images"] = True
        # NOTE(woosuk): Here, we distinguish the sequences by the position id 0.
        # This is a HACK. Fix this.
        start_idices = (positions == 0).cpu().nonzero()
        num_seqs = len(start_idices)
        seq_lens = []
        for i in range(num_seqs):
            start_idx = start_idices[i].item()
            if i < num_seqs - 1:
                end_idx = start_idices[i + 1].item()
            else:
                end_idx = len(input_ids)
            seq_lens.append(end_idx - start_idx)
        kwargs["seq_lens"] = seq_lens

        global_attn_masks = []
        local_attn_masks = []
        start_idx = 0
        for seq_len in seq_lens:
            end_idx = start_idx + seq_len
            input_token_ids = input_ids[start_idx:end_idx]
            start_idx = end_idx
            # Create a global causal mask.
            global_attn_mask = torch.empty(
                1,
                1,
                seq_len,
                seq_len,
                dtype=mask_dtype,
                device=input_ids.device,
            )
            global_attn_mask.fill_(float("-inf"))
            # Fill the lower triangle with 0.
            global_attn_mask = global_attn_mask.triu(diagonal=1)

            # Consider the bidirectional attention between image tokens.
            img_mask = torch.zeros_like(global_attn_mask)
            img_pos = (input_token_ids == self.config.image_token_index)
            img_mask[:, :, :, img_pos] += 1
            img_mask[:, :, img_pos, :] += 1
            global_attn_mask = torch.where(img_mask == 2, 0, global_attn_mask)
            global_attn_masks.append(global_attn_mask)

            # Create a local causal mask with sliding window (1024).
            local_attn_mask = torch.ones_like(global_attn_mask)
            local_attn_mask = torch.tril(local_attn_mask,
                                         diagonal=-self.sliding_window)
            local_attn_mask = torch.where(local_attn_mask == 0,
                                          global_attn_mask, float("-inf"))
            local_attn_masks.append(local_attn_mask)
        kwargs["global_attn_masks"] = global_attn_masks
        kwargs["local_attn_masks"] = local_attn_masks
        return kwargs

    def compute_logits(
        self,
        hidden_states: torch.Tensor,
        sampling_metadata: SamplingMetadata,
    ) -> Optional[torch.Tensor]:
        return self.language_model.compute_logits(hidden_states,
                                                  sampling_metadata)

    def sample(
        self,
        logits: torch.Tensor,
        sampling_metadata: SamplingMetadata,
    ) -> Optional[SamplerOutput]:
        return self.language_model.sample(logits, sampling_metadata)

    def load_weights(self, weights: Iterable[Tuple[str,
                                                   torch.Tensor]]) -> Set[str]:
        loader = AutoWeightsLoader(self)
        return loader.load_weights(weights)
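For intuition about the pooling arithmetic in Gemma3MultiModalProjector above, here is a small worked sketch. The concrete config values are assumptions for illustration, not numbers read from this diff:

# Assumed example values (not from this diff).
image_size = 896
patch_size = 14
mm_tokens_per_image = 256

patches_per_image = image_size // patch_size        # 64 patches per side
tokens_per_side = int(mm_tokens_per_image**0.5)     # 16
kernel_size = patches_per_image // tokens_per_side  # 4

# The 64x64 grid of patch embeddings is average-pooled with a 4x4 kernel
# (stride 4) into a 16x16 grid, i.e. 256 image tokens per image, matching
# mm_tokens_per_image before projection into the language model.
print(patches_per_image, tokens_per_side, kernel_size)  # 64 16 4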
@ -45,7 +45,7 @@ class NVLMProcessor(BaseInternVLProcessor):
            raise NotImplementedError("Embedding inputs are not supported")

        tile_pos_identifiers = [f"<tile_{i}>" for i in range(1, num_patches)]
        if self.use_thumbnail and num_patches != 1:
        if self.use_thumbnail:
            tile_pos_identifiers += ["<tile_global_thumbnail>"]

        context_size = feature_size // num_patches
@ -53,6 +53,7 @@ _TEXT_GENERATION_MODELS = {
    "Fairseq2LlamaForCausalLM": ("fairseq2_llama", "Fairseq2LlamaForCausalLM"),
    "GemmaForCausalLM": ("gemma", "GemmaForCausalLM"),
    "Gemma2ForCausalLM": ("gemma2", "Gemma2ForCausalLM"),
    "Gemma3ForCausalLM": ("gemma3", "Gemma3ForCausalLM"),
    "GlmForCausalLM": ("glm", "GlmForCausalLM"),
    "GPT2LMHeadModel": ("gpt2", "GPT2LMHeadModel"),
    "GPTBigCodeForCausalLM": ("gpt_bigcode", "GPTBigCodeForCausalLM"),
@ -161,6 +162,7 @@ _MULTIMODAL_MODELS = {
    "ChameleonForConditionalGeneration": ("chameleon", "ChameleonForConditionalGeneration"),  # noqa: E501
    "DeepseekVLV2ForCausalLM": ("deepseek_vl2", "DeepseekVLV2ForCausalLM"),
    "FuyuForCausalLM": ("fuyu", "FuyuForCausalLM"),
    "Gemma3ForConditionalGeneration": ("gemma3_mm", "Gemma3ForConditionalGeneration"),  # noqa: E501
    "GLM4VForCausalLM": ("glm4v", "GLM4VForCausalLM"),
    "H2OVLChatModel": ("h2ovl", "H2OVLChatModel"),
    "InternVLChatModel": ("internvl", "InternVLChatModel"),
@ -134,57 +134,29 @@ class RequestOutput:
        self.encoder_prompt_token_ids = encoder_prompt_token_ids
        self.num_cached_tokens = num_cached_tokens

    @classmethod
    def new(
        cls,
        request_id: str,
        prompt: Optional[str],
        prompt_token_ids: Optional[list[int]],
        text: str,
        token_ids: list[int],
        logprobs: Optional[SampleLogprobs],
        prompt_logprobs: Optional[PromptLogprobs],
        cumulative_logprob: Optional[float],
        finished: bool = False,
    ) -> "RequestOutput":
        """Initialize a new RequestOutput object."""

        # TODO: Support `n` > 1.
        completion_output = CompletionOutput(
            index=0,
            text=text,
            token_ids=token_ids,
            cumulative_logprob=cumulative_logprob,
            logprobs=logprobs)

        return RequestOutput(
            request_id=request_id,
            prompt=prompt,
            prompt_token_ids=prompt_token_ids,
            prompt_logprobs=prompt_logprobs,
            outputs=[completion_output],
            finished=finished,
        )

    def add(self, next_output: "RequestOutput") -> None:
        """Merge subsequent RequestOutput into this one"""

        self.prompt = next_output.prompt
        self.prompt_token_ids = next_output.prompt_token_ids
        self.prompt_logprobs = next_output.prompt_logprobs
        self.finished |= next_output.finished

        #TODO assuming n == 1 for now
        completion = self.outputs[0]
        next_completion = next_output.outputs[0]
        completion.text += next_completion.text
        if not isinstance(completion.token_ids, MutableSequence):
            completion.token_ids = list(completion.token_ids)
        completion.token_ids.extend(next_completion.token_ids)
        if next_completion.logprobs:
            assert completion.logprobs is not None
            completion.logprobs.extend(next_completion.logprobs)
        completion.cumulative_logprob = next_completion.cumulative_logprob
        for next_completion in next_output.outputs:
            for completion in self.outputs:
                if completion.index == next_completion.index:
                    # Merge outputs with same index
                    completion.text += next_completion.text
                    if not isinstance(completion.token_ids, MutableSequence):
                        completion.token_ids = list(completion.token_ids)
                    completion.token_ids.extend(next_completion.token_ids)
                    if next_completion.logprobs:
                        assert completion.logprobs is not None
                        completion.logprobs.extend(next_completion.logprobs)
                    completion.cumulative_logprob = (
                        next_completion.cumulative_logprob)
                    completion.finish_reason = next_completion.finish_reason
                    completion.stop_reason = next_completion.stop_reason
                    break
            else:
                self.outputs.append(next_completion)

    @classmethod
    def from_seq_group(
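The reworked RequestOutput.add above merges incoming completions by their index instead of assuming n == 1. A condensed standalone sketch of the same merging rule, using toy classes rather than the vLLM types:

from dataclasses import dataclass, field

@dataclass
class ToyCompletion:  # stand-in for CompletionOutput, illustration only
    index: int
    text: str
    token_ids: list = field(default_factory=list)

def merge(outputs, next_outputs):
    # Append text/tokens to the completion with the same index,
    # or start tracking a new index the first time it appears.
    for nxt in next_outputs:
        for cur in outputs:
            if cur.index == nxt.index:
                cur.text += nxt.text
                cur.token_ids.extend(nxt.token_ids)
                break
        else:
            outputs.append(nxt)
    return outputs

acc = [ToyCompletion(0, "Hello", [1])]
merge(acc, [ToyCompletion(0, " world", [2]), ToyCompletion(1, "Hi", [9])])
print([(c.index, c.text) for c in acc])  # [(0, 'Hello world'), (1, 'Hi')]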
@ -121,6 +121,9 @@ class CpuPlatform(Platform):
        # Disable torch async compiling which won't work with daemonic processes
        os.environ["TORCHINDUCTOR_COMPILE_THREADS"] = "1"

        # MLA attention is not supported
        os.environ["VLLM_MLA_DISABLE"] = "1"

        # Intel OpenMP setting
        ld_prealod_str = os.getenv("LD_PRELOAD", "")
        if "libiomp5.so" in ld_prealod_str:
@ -298,9 +298,8 @@ class AsyncLLM(EngineClient):
    async def abort(self, request_id: str) -> None:
        """Abort RequestId in OutputProcessor and EngineCore."""

        request_ids = [request_id]
        request_ids = self.output_processor.abort_requests((request_id, ))
        await self.engine_core.abort_requests_async(request_ids)
        self.output_processor.abort_requests(request_ids)

        if self.log_requests:
            logger.info("Aborted request %s.", request_id)
@ -137,8 +137,8 @@ class LLMEngine:
    def abort_request(self, request_ids: list[str]) -> None:
        """Remove request_ids from EngineCore and Detokenizer."""

        request_ids = self.output_processor.abort_requests(request_ids)
        self.engine_core.abort_requests(request_ids)
        self.output_processor.abort_requests(request_ids)

    def add_request(
        self,
@ -1,6 +1,7 @@
# SPDX-License-Identifier: Apache-2.0

import asyncio
from collections.abc import Iterable
from dataclasses import dataclass
from typing import Optional, Union

@ -102,8 +103,7 @@ class RequestState:
    ) -> Optional[RequestOutput]:

        finished = finish_reason is not None
        output_kind = self.output_kind
        final_only = output_kind == RequestOutputKind.FINAL_ONLY
        final_only = self.output_kind == RequestOutputKind.FINAL_ONLY

        # In follow up, we will switch to invariant where EngineCore
        # does not stream partial prefills.
@ -111,24 +111,24 @@ class RequestState:
            # Only the final output is required in FINAL_ONLY mode.
            return None

        def new_request_output(request_id: str) -> RequestOutput:
            return self._new_request_output(request_id, finished)

        completion_output = self._new_completion_output(
            new_token_ids, finish_reason, stop_reason)

        if self.parent_req is not None:
            return self.parent_req.make_request_output(final_only,
                                                       completion_output,
                                                       new_request_output)
        request_id = self.request_id
        if self.parent_req is None:
            outputs = [completion_output]
        else:
            request_id, outputs, finished = self.parent_req.get_outputs(
                request_id, completion_output)
            if not outputs:
                return None

        request_output = new_request_output(self.request_id)
        request_output.outputs.append(completion_output)
        return request_output
        return self._new_request_output(request_id, outputs, finished)

    def _new_request_output(
        self,
        request_id: str,
        outputs: list[CompletionOutput],
        finished: bool,
    ) -> RequestOutput:

@ -143,7 +143,7 @@ class RequestState:
            prompt=self.prompt,
            prompt_token_ids=self.prompt_token_ids,
            prompt_logprobs=prompt_logprobs,
            outputs=[],
            outputs=outputs,
            finished=finished,
        )
@ -188,6 +188,7 @@ class OutputProcessor:
        self.log_stats = log_stats
        self.tokenizer = tokenizer
        self.request_states: dict[str, RequestState] = {}
        self.parent_requests: dict[str, ParentRequest] = {}
        self.lora_states = LoRARequestStates()

    def get_num_unfinished_requests(self):
@ -198,14 +199,20 @@ class OutputProcessor:

    def abort_requests(
        self,
        request_ids: list[str],
    ) -> None:
        request_ids: Iterable[str],
    ) -> list[str]:
        request_ids_to_abort = []
        for request_id in request_ids:
            req_state = self.request_states.pop(request_id, None)
            if req_state is not None:
                self.lora_states.abort_request(req_state)
                if req_state.parent_req is not None:
                    req_state.parent_req.finish_child_request(request_id)
                request_ids_to_abort.append(request_id)
            else:
                parent = self.parent_requests.pop(request_id, None)
                if parent and parent.child_requests:
                    self.abort_requests(parent.child_requests)
                    request_ids_to_abort.extend(parent.child_requests)
        return request_ids_to_abort

    def add_request(
        self,
@ -227,6 +234,8 @@ class OutputProcessor:
            log_stats=self.log_stats)
        self.request_states[request_id] = req_state
        self.lora_states.add_request(req_state)
        if parent_req:
            self.parent_requests[parent_req.request_id] = parent_req

    def process_outputs(
        self,
@ -314,12 +323,14 @@ class OutputProcessor:
            # Free completed requests.
            if finish_reason is not None:
                self.request_states.pop(req_id)
                # Remove parent request if applicable.
                parent_req = req_state.parent_req
                if parent_req and not parent_req.child_requests:
                    self.parent_requests.pop(parent_req.request_id, None)
                if not engine_core_output.finished:
                    # If req not finished in EngineCore, but Detokenizer
                    # detected stop string, abort needed in EngineCore.
                    reqs_to_abort.append(req_id)
                if req_state.parent_req is not None:
                    req_state.parent_req.finish_child_request(req_id)

                # Track per-request stats
                self._update_stats_from_finished(req_state, finish_reason,
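The abort_requests change above makes the output processor the source of truth for fan-out: aborting a parent request id expands into its child request ids, which are returned so the caller can forward them to the engine core. A simplified sketch of that expansion (toy dictionaries, not the vLLM API):

request_states = {"0_req-1": object(), "1_req-1": object()}  # per-child state
parent_requests = {"req-1": ["0_req-1", "1_req-1"]}           # parent -> children

def abort_requests(request_ids):
    to_abort = []
    for rid in request_ids:
        if request_states.pop(rid, None) is not None:
            to_abort.append(rid)           # a live child (or plain) request
        else:
            children = parent_requests.pop(rid, [])
            to_abort.extend(abort_requests(children))  # expand the parent
    return to_abort

print(abort_requests(["req-1"]))  # ['0_req-1', '1_req-1'] -> sent to EngineCore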
@ -1,11 +1,11 @@
# SPDX-License-Identifier: Apache-2.0

from copy import copy
from typing import Callable, Optional, Union
from typing import Optional, Union

from vllm.outputs import CompletionOutput, RequestOutput
from vllm.outputs import CompletionOutput
from vllm.pooling_params import PoolingParams
from vllm.sampling_params import SamplingParams
from vllm.sampling_params import RequestOutputKind, SamplingParams
from vllm.v1.metrics.stats import IterationStats

@ -23,7 +23,7 @@ class ParentRequest:
    child_requests: set[str]

    # To aggregate child completions when not streaming
    output_aggregator: Optional[RequestOutput]
    output_aggregator: list[CompletionOutput]

    # To find the max number of generated tokens across all children
    max_num_generation_tokens: int
@ -37,7 +37,9 @@ class ParentRequest:
        self.sampling_params = sampling_params

        self.child_requests = set()
        self.output_aggregator = None
        self.output_aggregator = [None] * sampling_params.n if (
            sampling_params.output_kind
            == RequestOutputKind.FINAL_ONLY) else []
        self.max_num_generation_tokens = 0
        self.cached_child_sampling_params = None

@ -93,43 +95,30 @@ class ParentRequest:
        """
        child_req_id = f"{index}_{self.request_id}"
        self.child_requests.add(child_req_id)
        return (child_req_id, self._get_child_sampling_params(index))

    def finish_child_request(self, req_id: str):
        self.child_requests.remove(req_id)
        return child_req_id, self._get_child_sampling_params(index)

    @property
    def n(self) -> int:
        return self.sampling_params.n

    def make_request_output(
    def get_outputs(
        self,
        final_only: bool,
        child_request_id: str,
        completion_output: CompletionOutput,
        new_request_output: Callable[[str], RequestOutput],
    ) -> Optional[RequestOutput]:
        # Use an existing RequestOutput if we're aggregating
        request_output = self.output_aggregator
    ) -> tuple[str, list[CompletionOutput], bool]:
        if completion_output.finished():
            self.child_requests.remove(child_request_id)

        # Make new RequestOutput otherwise
        if request_output is None:
            request_output = new_request_output(self.request_id)
        if self.sampling_params.output_kind != RequestOutputKind.FINAL_ONLY:
            # If streaming, just return the current output.
            outputs = [completion_output]
        else:
            # If not streaming, aggregate the n final outputs.
            self.output_aggregator[completion_output.index] = completion_output
            outputs = [] if self.child_requests else self.output_aggregator

        # Add a new completion
        request_output.outputs.append(completion_output)

        # If not streaming, aggregate until all child requests complete
        if final_only and len(request_output.outputs) != self.n:
            self.output_aggregator = request_output
            return None

        # We're done aggregating
        self.output_aggregator = None

        # Parent completion output list must be sorted by index
        request_output.outputs = sorted(request_output.outputs,
                                        key=lambda x: x.index)
        return request_output
        finished = not self.child_requests
        return self.request_id, outputs, finished

    def observe_num_generation_tokens(self, num_generation_tokens: int):
        self.max_num_generation_tokens = max(num_generation_tokens,
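To see why ParentRequest.get_outputs above returns an empty output list until every child finishes when output_kind is FINAL_ONLY, here is a reduced standalone sketch of that aggregation (not the actual vLLM classes):

class ToyAggregator:
    def __init__(self, n):
        self.slots = [None] * n          # one slot per child index
        self.pending = set(range(n))

    def add_final(self, index, text):
        # Store the child's final output; emit everything only once the
        # last child has finished, already ordered by index.
        self.slots[index] = text
        self.pending.discard(index)
        return [] if self.pending else self.slots

agg = ToyAggregator(n=3)
print(agg.add_final(1, "b"))  # []
print(agg.add_final(0, "a"))  # []
print(agg.add_final(2, "c"))  # ['a', 'b', 'c']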