Merge branch 'main' into hacking_full_cudagraph

Tyler Michael Smith 2025-01-03 15:06:23 -05:00
commit 76732ff701
38 changed files with 1234 additions and 158 deletions

View File

@ -1,5 +1,6 @@
steps:
- label: "Wait for container to be ready"
key: wait-for-container-image
agents:
queue: A100
plugins:
@ -10,12 +11,11 @@ steps:
command:
- sh .buildkite/nightly-benchmarks/scripts/wait-for-image.sh
- wait
- label: "A100"
# skip: "use this flag to conditionally skip the benchmark step, useful for PR testing"
agents:
queue: A100
depends_on: wait-for-container-image
plugins:
- kubernetes:
podSpec:
@ -49,6 +49,7 @@ steps:
# skip: "use this flag to conditionally skip the benchmark step, useful for PR testing"
agents:
queue: H200
depends_on: wait-for-container-image
plugins:
- docker#v5.12.0:
image: public.ecr.aws/q9t5s3a7/vllm-ci-postmerge-repo:$BUILDKITE_COMMIT
@ -73,7 +74,7 @@ steps:
# skip: "use this flag to conditionally skip the benchmark step, useful for PR testing"
agents:
queue: H100
depends_on: ~
depends_on: wait-for-container-image
plugins:
- docker#v5.12.0:
image: public.ecr.aws/q9t5s3a7/vllm-ci-postmerge-repo:$BUILDKITE_COMMIT

View File

@ -363,12 +363,14 @@ steps:
- tests/models/decoder_only/audio_language
- tests/models/decoder_only/vision_language
- tests/models/embedding/vision_language
- tests/models/encoder_decoder/audio_language
- tests/models/encoder_decoder/vision_language
commands:
- pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
- pytest -v -s models/decoder_only/audio_language -m 'core_model or quant_model'
- pytest -v -s --ignore models/decoder_only/vision_language/test_phi3v.py models/decoder_only/vision_language -m 'core_model or quant_model'
- pytest -v -s models/embedding/vision_language -m core_model
- pytest -v -s models/encoder_decoder/audio_language -m core_model
- pytest -v -s models/encoder_decoder/language -m core_model
- pytest -v -s models/encoder_decoder/vision_language -m core_model

View File

@ -234,8 +234,8 @@ RUN mv vllm test_docs/
#################### TEST IMAGE ####################
#################### OPENAI API SERVER ####################
# openai api server alternative
FROM vllm-base AS vllm-openai
# base openai image with additional requirements, for any subsequent openai-style images
FROM vllm-base AS vllm-openai-base
# install additional dependencies for openai api server
RUN --mount=type=cache,target=/root/.cache/pip \
@ -247,5 +247,14 @@ RUN --mount=type=cache,target=/root/.cache/pip \
ENV VLLM_USAGE_SOURCE production-docker-image
# define sagemaker first, so it is not default from `docker build`
FROM vllm-openai-base AS vllm-sagemaker
COPY examples/sagemaker-entrypoint.sh .
RUN chmod +x sagemaker-entrypoint.sh
ENTRYPOINT ["./sagemaker-entrypoint.sh"]
FROM vllm-openai-base AS vllm-openai
ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"]
#################### OPENAI API SERVER ####################

View File

@ -834,6 +834,7 @@ __global__ void Marlin(
int4* sh_g_idx = sh_b + (stages * b_sh_stage);
int4* sh_zp = sh_g_idx + (stages * g_idx_stage);
int4* sh_s = sh_zp + (stages * zp_sh_stage);
int4* sh_red = sh_s + (stages * s_sh_stage);
// Register storage for double buffer of shared memory reads.
FragA frag_a[2][thread_m_blocks];
@ -932,11 +933,11 @@ __global__ void Marlin(
int4* sh_s_stage = sh_s + s_sh_stage * pipe;
if constexpr (group_blocks >= thread_k_blocks) {
if (s_sh_wr_pred) {
cp_async4(&sh_s_stage[s_sh_wr], &scales_ptr[s_gl_rd]);
}
// Only fetch scales if this tile starts a new group
if (pipe % (group_blocks / thread_k_blocks) == 0) {
if (s_sh_wr_pred) {
cp_async4(&sh_s_stage[s_sh_wr], &scales_ptr[s_gl_rd]);
}
if ((pipe + 1) % (group_blocks / thread_k_blocks) == 0) {
s_gl_rd += s_gl_rd_delta;
}
} else {
@ -1038,9 +1039,7 @@ __global__ void Marlin(
// No act-order case
if constexpr (group_blocks != -1) {
if constexpr (group_blocks >= thread_k_blocks) {
int4* sh_s_stage =
sh_s + s_sh_stage * ((group_blocks / thread_k_blocks) *
(pipe / (group_blocks / thread_k_blocks)));
int4* sh_s_stage = sh_s + s_sh_stage * pipe;
reinterpret_cast<int4*>(&frag_s[k % 2])[0] = sh_s_stage[s_sh_rd];
} else {
int warp_id = threadIdx.x / 32;
@ -1339,15 +1338,15 @@ __global__ void Marlin(
int red_sh_wr =
red_sh_delta * j + (red_sh_rd - red_sh_stride * i);
if (i < red_off) {
float* c_rd =
reinterpret_cast<float*>(&sh[red_sh_delta * j + red_sh_rd]);
float* c_wr = reinterpret_cast<float*>(&sh[red_sh_wr]);
float* c_rd = reinterpret_cast<float*>(
&sh_red[red_sh_delta * j + red_sh_rd]);
float* c_wr = reinterpret_cast<float*>(&sh_red[red_sh_wr]);
#pragma unroll
for (int k = 0; k < 4; k++)
reinterpret_cast<FragC*>(frag_c)[4 * 2 * m_block + j][k] +=
c_rd[k] + c_wr[k];
}
sh[red_sh_wr] =
sh_red[red_sh_wr] =
reinterpret_cast<int4*>(&frag_c)[4 * 2 * m_block + j];
}
}
@ -1357,7 +1356,7 @@ __global__ void Marlin(
#pragma unroll
for (int i = 0; i < 4 * 2; i++) {
float* c_rd =
reinterpret_cast<float*>(&sh[red_sh_delta * i + red_sh_rd]);
reinterpret_cast<float*>(&sh_red[red_sh_delta * i + red_sh_rd]);
#pragma unroll
for (int j = 0; j < 4; j++)
reinterpret_cast<FragC*>(frag_c)[4 * 2 * m_block + i][j] +=
@ -1397,7 +1396,7 @@ __global__ void Marlin(
#pragma unroll
for (int i = 0; i < thread_m_blocks * 4; i++) {
cp_async4_pred(
&sh[c_sh_wr + c_sh_wr_delta * i],
&sh_red[c_sh_wr + c_sh_wr_delta * i],
&C[c_gl_wr + c_gl_wr_delta_o * (i / 2) +
c_gl_wr_delta_i * (i % 2)],
i < (thread_m_blocks - 1) * 4 || 8 * (i / 2) + row < prob_m);
@ -1410,7 +1409,7 @@ __global__ void Marlin(
for (int i = 0; i < thread_m_blocks * 4; i++) {
if (i < (thread_m_blocks - 1) * 4 || 8 * (i / 2) + row < prob_m) {
if (!first) {
int4 c_red = sh[c_sh_wr + i * c_sh_wr_delta];
int4 c_red = sh_red[c_sh_wr + i * c_sh_wr_delta];
#pragma unroll
for (int j = 0; j < 2 * 4; j++) {
reinterpret_cast<float*>(
@ -1461,10 +1460,10 @@ __global__ void Marlin(
float* frag_c_ptr = reinterpret_cast<float*>(&frag_c);
#pragma unroll
for (int k = 0; k < th_size; k++) {
sh[threadIdx.x] =
sh_red[threadIdx.x] =
C_tmp[c_cur_offset + active_threads * k + threadIdx.x];
float* sh_c_ptr = reinterpret_cast<float*>(&sh[threadIdx.x]);
float* sh_c_ptr = reinterpret_cast<float*>(&sh_red[threadIdx.x]);
#pragma unroll
for (int f = 0; f < 4; f++) {
frag_c_ptr[k * 4 + f] += sh_c_ptr[f];
@ -1515,7 +1514,7 @@ __global__ void Marlin(
res = __hmul2(res, s[0]);
}
((scalar_t2*)sh)[idx] = res;
((scalar_t2*)sh_red)[idx] = res;
};
if (threadIdx.x / 32 < thread_n_blocks / 4) {
@ -1543,7 +1542,7 @@ __global__ void Marlin(
i < div_ceil(16 * thread_m_blocks, threads / (2 * thread_n_blocks));
i++) {
if (c_gl_wr < c_gl_wr_end) {
C[c_gl_wr] = sh[c_sh_rd];
C[c_gl_wr] = sh_red[c_sh_rd];
c_gl_wr += c_gl_wr_delta;
c_sh_rd += c_sh_rd_delta;
}
@ -1865,9 +1864,12 @@ bool is_valid_cache_size(thread_config_t const& th_config, int max_m_blocks,
float pipe_size = (a_size + b_size) * pipe_stages;
float reduce_size = max(th_config.num_threads * 32 * 4,
(tb_n / 64) * 32 * (tb_max_m / 16) * 4 * 2 * 4 * 2);
TORCH_CHECK(max_shared_mem / 2 > scales_cache_size); // Sanity
return pipe_size < 0.95f * (max_shared_mem - scales_cache_size);
return pipe_size + reduce_size < 0.95f * (max_shared_mem - scales_cache_size);
}
bool is_valid_config(thread_config_t const& th_config, int max_m_blocks,

View File

@ -32,8 +32,8 @@ You can enable the feature by specifying `--enable-chunked-prefill` in the comma
```python
llm = LLM(model="meta-llama/Llama-2-7b-hf", enable_chunked_prefill=True)
# Set max_num_batched_tokens to tune performance.
# NOTE: 512 is the default max_num_batched_tokens for chunked prefill.
# llm = LLM(model="meta-llama/Llama-2-7b-hf", enable_chunked_prefill=True, max_num_batched_tokens=512)
# NOTE: 2048 is the default max_num_batched_tokens for chunked prefill.
# llm = LLM(model="meta-llama/Llama-2-7b-hf", enable_chunked_prefill=True, max_num_batched_tokens=2048)
```
By default, the vLLM scheduler prioritizes prefills and doesn't batch prefill and decode requests in the same batch.
@ -49,13 +49,12 @@ This policy has two benefits:
- It improves ITL and generation decode because decode requests are prioritized.
- It helps achieve better GPU utilization by colocating compute-bound (prefill) and memory-bound (decode) requests in the same batch.
You can tune the performance by changing `max_num_batched_tokens`.
By default, it is set to 512, which has the best ITL on A100 in the initial benchmark (llama 70B and mixtral 8x22B).
You can tune the performance by changing `max_num_batched_tokens`. By default, it is set to 2048.
Smaller `max_num_batched_tokens` achieves better ITL because there are fewer prefills interrupting decodes.
Higher `max_num_batched_tokens` achieves better TTFT as you can put more prefill tokens into the batch.
- If `max_num_batched_tokens` is the same as `max_model_len`, that's almost equivalent to the default scheduling policy (except that it still prioritizes decodes).
- Note that the default value (512) of `max_num_batched_tokens` is optimized for ITL, and it may have lower throughput than the default scheduler.
- Note that the default value (2048) of `max_num_batched_tokens` is optimized for ITL, and it may have lower throughput than the default scheduler.
We recommend you set `max_num_batched_tokens > 2048` for throughput.
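As a rough sketch of a throughput-oriented setup (reusing the model name from the snippet above; the exact value of `max_num_batched_tokens` is only an illustrative assumption):

```python
from vllm import LLM

# Chunked prefill with a larger per-batch token budget: better TTFT and
# throughput, at some cost to ITL compared to the 2048 default.
llm = LLM(
    model="meta-llama/Llama-2-7b-hf",
    enable_chunked_prefill=True,
    max_num_batched_tokens=4096,  # > 2048, per the recommendation above
)
```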

View File

@ -0,0 +1,59 @@
import time
from vllm import LLM, SamplingParams
from vllm.assets.audio import AudioAsset
# Create a Whisper encoder/decoder model instance
llm = LLM(
model="openai/whisper-large-v3",
max_model_len=448,
max_num_seqs=400,
limit_mm_per_prompt={"audio": 1},
kv_cache_dtype="fp8",
)
prompts = [
{
"prompt": "<|startoftranscript|>",
"multi_modal_data": {
"audio": AudioAsset("mary_had_lamb").audio_and_sample_rate,
},
},
{ # Test explicit encoder/decoder prompt
"encoder_prompt": {
"prompt": "",
"multi_modal_data": {
"audio": AudioAsset("winning_call").audio_and_sample_rate,
},
},
"decoder_prompt": "<|startoftranscript|>",
}
] * 1024
# Create a sampling params object.
sampling_params = SamplingParams(
temperature=0,
top_p=1.0,
max_tokens=200,
)
start = time.time()
# Generate output tokens from the prompts. The output is a list of
# RequestOutput objects that contain the prompt, generated
# text, and other information.
outputs = llm.generate(prompts, sampling_params)
# Print the outputs.
for output in outputs:
prompt = output.prompt
encoder_prompt = output.encoder_prompt
generated_text = output.outputs[0].text
print(f"Encoder prompt: {encoder_prompt!r}, "
f"Decoder prompt: {prompt!r}, "
f"Generated text: {generated_text!r}")
duration = time.time() - start
print("Duration:", duration)
print("RPS:", len(prompts) / duration)

View File

@ -0,0 +1,24 @@
#!/bin/bash
# Define the prefix for environment variables to look for
PREFIX="SM_VLLM_"
ARG_PREFIX="--"
# Initialize an array for storing the arguments
# port 8080 required by sagemaker, https://docs.aws.amazon.com/sagemaker/latest/dg/your-algorithms-inference-code.html#your-algorithms-inference-code-container-response
ARGS=(--port 8080)
# Loop through all environment variables
while IFS='=' read -r key value; do
# Remove the prefix from the key, convert to lowercase, and replace underscores with dashes
arg_name=$(echo "${key#"${PREFIX}"}" | tr '[:upper:]' '[:lower:]' | tr '_' '-')
# Add the argument name and value to the ARGS array
ARGS+=("${ARG_PREFIX}${arg_name}")
if [ -n "$value" ]; then
ARGS+=("$value")
fi
done < <(env | grep "^${PREFIX}")
# Pass the collected arguments to the main entrypoint
exec python3 -m vllm.entrypoints.openai.api_server "${ARGS[@]}"
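A minimal Python sketch of the same environment-variable-to-flag mapping this script performs, handy for checking how a given `SM_VLLM_*` variable will be passed through (the variable name below is only an example):

```python
import os

PREFIX = "SM_VLLM_"

def build_args(environ=os.environ):
    # Port 8080 is required by SageMaker (see the comment in the script).
    args = ["--port", "8080"]
    for key, value in environ.items():
        if not key.startswith(PREFIX):
            continue
        # Strip the prefix, lowercase, and turn underscores into dashes.
        args.append("--" + key[len(PREFIX):].lower().replace("_", "-"))
        if value:
            args.append(value)
    return args

# e.g. SM_VLLM_MAX_MODEL_LEN=4096 becomes ["--max-model-len", "4096"]
print(build_args({"SM_VLLM_MAX_MODEL_LEN": "4096"}))
```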

View File

@ -0,0 +1,136 @@
"""Compare the outputs of HF and vLLM for Whisper models using greedy sampling.
Run `pytest tests/models/encoder_decoder/audio/test_whisper.py`.
"""
from typing import Optional
import pytest
from vllm import LLM, SamplingParams
from vllm.assets.audio import AudioAsset
from ....utils import fork_new_process_for_each_test, multi_gpu_test
PROMPTS = [
{
"prompt":
"<|startoftranscript|><|en|><|transcribe|><|notimestamps|>",
"multi_modal_data": {
"audio": AudioAsset("mary_had_lamb").audio_and_sample_rate,
},
},
{ # Test explicit encoder/decoder prompt
"encoder_prompt": {
"prompt": "",
"multi_modal_data": {
"audio": AudioAsset("winning_call").audio_and_sample_rate,
},
},
"decoder_prompt":
"<|startoftranscript|><|en|><|transcribe|><|notimestamps|>",
}
]
EXPECTED = {
"openai/whisper-tiny": [
" He has birth words I spoke in the original corner of that. And a"
" little piece of black coat poetry. Mary had a little sandwich,"
" sweet, with white and snow. And everyone had it very went the last"
" would sure to go.",
" >> And the old one, fit John the way to Edgar Martinez. >> One more"
" to line down the field line for our base camp. Here comes joy. Here"
" is June and the third base. They're going to wave him in. The throw"
" to the plate will be late. The Mariners are going to play for the"
" American League Championship. I don't believe it. It just continues"
" by all five."
],
"openai/whisper-small": [
" The first words I spoke in the original pornograph. A little piece"
" of practical poetry. Mary had a little lamb, its fleece was quite a"
" slow, and everywhere that Mary went the lamb was sure to go.",
" And the old one pitch on the way to Edgar Martinez one month. Here"
" comes joy. Here is Junior to third base. They're gonna wave him"
" in. The throw to the plate will be late. The Mariners are going to"
" play for the American League Championship. I don't believe it. It"
" just continues. My, oh my."
],
"openai/whisper-medium": [
" The first words I spoke in the original phonograph, a little piece"
" of practical poetry. Mary had a little lamb, its fleece was quite as"
" slow, and everywhere that Mary went the lamb was sure to go.",
" And the 0-1 pitch on the way to Edgar Martinez swung on the line"
" down the left field line for Obeyshev. Here comes Joy. Here is"
" Jorgen at third base. They're going to wave him in. The throw to the"
" plate will be late. The Mariners are going to play for the American"
" League Championship. I don't believe it. It just continues. My, oh"
" my."
],
"openai/whisper-large-v3": [
" The first words I spoke in the original phonograph, a little piece"
" of practical poetry. Mary had a little lamb, its feet were quite as"
" slow, and everywhere that Mary went, the lamb was sure to go.",
" And the 0-1 pitch on the way to Edgar Martinez. Swung on the line."
" Now the left field line for a base hit. Here comes Joy. Here is"
" Junior to third base. They're going to wave him in. The throw to the"
" plate will be late. The Mariners are going to play for the American"
" League Championship. I don't believe it. It just continues. My, oh,"
" my."
],
"openai/whisper-large-v3-turbo": [
" The first words I spoke in the original phonograph, a little piece"
" of practical poetry. Mary had a little lamb, its streets were quite"
" as slow, and everywhere that Mary went the lamb was sure to go.",
" And the 0-1 pitch on the way to Edgar Martinez. Swung on the line"
" down the left field line for a base hit. Here comes Joy. Here is"
" Junior to third base. They're going to wave him in. The throw to the"
" plate will be late. The Mariners are going to play for the American"
" League Championship. I don't believe it. It just continues. My, oh,"
" my."
]
}
def run_test(
model: str,
*,
tensor_parallel_size: int,
distributed_executor_backend: Optional[str] = None,
) -> None:
prompt_list = PROMPTS * 10
expected_list = EXPECTED[model] * 10
llm = LLM(
model=model,
tensor_parallel_size=tensor_parallel_size,
distributed_executor_backend=distributed_executor_backend,
)
sampling_params = SamplingParams(
temperature=0,
top_p=1.0,
max_tokens=200,
)
outputs = llm.generate(prompt_list, sampling_params)
for output, expected in zip(outputs, expected_list):
print(output.outputs[0].text)
assert output.outputs[0].text == expected
@fork_new_process_for_each_test
@pytest.mark.core_model
@pytest.mark.parametrize(
"model", ["openai/whisper-small", "openai/whisper-large-v3-turbo"])
def test_models(model) -> None:
run_test(model, tensor_parallel_size=1)
@multi_gpu_test(num_gpus=2)
@pytest.mark.core_model
@pytest.mark.parametrize("model", ["openai/whisper-large-v3-turbo"])
@pytest.mark.parametrize("distributed_executor_backend", ["ray", "mp"])
def test_models_distributed(model, distributed_executor_backend) -> None:
run_test(model,
tensor_parallel_size=2,
distributed_executor_backend=distributed_executor_backend)

View File

@ -204,6 +204,7 @@ _MULTIMODAL_EXAMPLE_MODELS = {
"UltravoxModel": _HfExamplesInfo("fixie-ai/ultravox-v0_3"),
# [Encoder-decoder]
"MllamaForConditionalGeneration": _HfExamplesInfo("meta-llama/Llama-3.2-11B-Vision-Instruct"), # noqa: E501
"WhisperForConditionalGeneration": _HfExamplesInfo("openai/whisper-large-v3"), # noqa: E501
}
_SPECULATIVE_DECODING_EXAMPLE_MODELS = {

View File

@ -142,9 +142,6 @@ def test_engine_core_client(monkeypatch, multiprocessing_mode: bool):
client.abort_requests([request.request_id])
# Shutdown the client.
client.shutdown()
@pytest.mark.asyncio
async def test_engine_core_client_asyncio(monkeypatch):
@ -200,6 +197,3 @@ async def test_engine_core_client_asyncio(monkeypatch):
else:
assert len(outputs[req_id]) == MAX_TOKENS, (
f"{len(outputs[req_id])=}, {MAX_TOKENS=}")
# Shutdown the client.
client.shutdown()

View File

@ -2312,6 +2312,8 @@ def _get_and_verify_max_len(
"seq_length",
# Command-R
"model_max_length",
# Whisper
"max_target_positions",
# Others
"max_sequence_length",
"max_seq_length",

View File

@ -1579,6 +1579,7 @@ class Scheduler:
seq.status = SequenceStatus.WAITING
self.free_seq(seq)
seq.reset_state_for_recompute()
self._free_seq_group_cross_attn_blocks(seq_group)
def _preempt_by_swap(
self,

View File

@ -232,11 +232,6 @@ class LLM:
self.request_counter = Counter()
def __del__(self):
if hasattr(self, 'llm_engine') and self.llm_engine and hasattr(
self.llm_engine, "shutdown"):
self.llm_engine.shutdown()
@staticmethod
def get_engine_class() -> Type[LLMEngine]:
if envs.VLLM_USE_V1:

View File

@ -16,7 +16,7 @@ from http import HTTPStatus
from typing import AsyncIterator, Optional, Set, Tuple
import uvloop
from fastapi import APIRouter, FastAPI, Request
from fastapi import APIRouter, FastAPI, HTTPException, Request
from fastapi.exceptions import RequestValidationError
from fastapi.middleware.cors import CORSMiddleware
from fastapi.responses import JSONResponse, Response, StreamingResponse
@ -44,11 +44,15 @@ from vllm.entrypoints.openai.protocol import (ChatCompletionRequest,
CompletionResponse,
DetokenizeRequest,
DetokenizeResponse,
EmbeddingChatRequest,
EmbeddingCompletionRequest,
EmbeddingRequest,
EmbeddingResponse,
EmbeddingResponseData,
ErrorResponse,
LoadLoraAdapterRequest,
PoolingChatRequest,
PoolingCompletionRequest,
PoolingRequest, PoolingResponse,
ScoreRequest, ScoreResponse,
TokenizeRequest,
@ -310,6 +314,12 @@ async def health(raw_request: Request) -> Response:
return Response(status_code=200)
@router.api_route("/ping", methods=["GET", "POST"])
async def ping(raw_request: Request) -> Response:
"""Ping check. Endpoint required for SageMaker"""
return await health(raw_request)
@router.post("/tokenize")
@with_cancellation
async def tokenize(request: TokenizeRequest, raw_request: Request):
@ -483,6 +493,54 @@ async def create_score_v1(request: ScoreRequest, raw_request: Request):
return await create_score(request, raw_request)
TASK_HANDLERS = {
"generate": {
"messages": (ChatCompletionRequest, create_chat_completion),
"default": (CompletionRequest, create_completion),
},
"embed": {
"messages": (EmbeddingChatRequest, create_embedding),
"default": (EmbeddingCompletionRequest, create_embedding),
},
"score": {
"default": (ScoreRequest, create_score),
},
"reward": {
"messages": (PoolingChatRequest, create_pooling),
"default": (PoolingCompletionRequest, create_pooling),
},
"classify": {
"messages": (PoolingChatRequest, create_pooling),
"default": (PoolingCompletionRequest, create_pooling),
},
}
@router.post("/invocations")
async def invocations(raw_request: Request):
"""
For SageMaker, routes requests to other handlers based on model `task`.
"""
body = await raw_request.json()
task = raw_request.app.state.task
if task not in TASK_HANDLERS:
raise HTTPException(
status_code=400,
detail=f"Unsupported task: '{task}' for '/invocations'. "
f"Expected one of {set(TASK_HANDLERS.keys())}")
handler_config = TASK_HANDLERS[task]
if "messages" in body:
request_model, handler = handler_config["messages"]
else:
request_model, handler = handler_config["default"]
# this is required since we lose the FastAPI automatic casting
request = request_model.model_validate(body)
return await handler(request, raw_request)
if envs.VLLM_TORCH_PROFILER_DIR:
logger.warning(
"Torch Profiler is enabled in the API server. This should ONLY be "
@ -687,6 +745,7 @@ def init_app_state(
chat_template=resolved_chat_template,
chat_template_content_format=args.chat_template_content_format,
)
state.task = model_config.task
def create_server_socket(addr: Tuple[str, int]) -> socket.socket:
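For illustration, a client call against the new `/invocations` route might look like the sketch below (localhost, port 8080, and the payload fields are assumptions for a server whose model task is `generate`; because the body contains `messages`, the handler validates it as a `ChatCompletionRequest` and routes it to `create_chat_completion`):

```python
import requests

# Hypothetical request to a locally running SageMaker-style server.
resp = requests.post(
    "http://localhost:8080/invocations",
    json={
        "messages": [{"role": "user", "content": "Hello!"}],
        "max_tokens": 32,
    },
)
print(resp.json())
```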

View File

@ -184,10 +184,16 @@ class InputPreprocessor:
corresponding token IDs.
"""
tokenizer = self.get_tokenizer_group()
add_special_tokens = None
if self.model_config.hf_config.model_type == "whisper":
# For Whisper, special tokens should be provided by the user based
# on the task and language of their request. Also needed to avoid
# appending an EOS token to the prompt which disrupts generation.
add_special_tokens = False
return tokenizer.encode(request_id=request_id,
prompt=prompt,
lora_request=lora_request)
lora_request=lora_request,
add_special_tokens=add_special_tokens)
async def _tokenize_prompt_async(
self,
@ -197,10 +203,17 @@ class InputPreprocessor:
) -> List[int]:
"""Async version of :meth:`_tokenize_prompt`."""
tokenizer = self.get_tokenizer_group()
return await tokenizer.encode_async(request_id=request_id,
prompt=prompt,
lora_request=lora_request)
add_special_tokens = None
if self.model_config.hf_config.model_type == "whisper":
# For Whisper, special tokens should be provided by the user based
# on the task and language of their request. Also needed to avoid
# appending an EOS token to the prompt which disrupts generation.
add_special_tokens = False
return await tokenizer.encode_async(
request_id=request_id,
prompt=prompt,
lora_request=lora_request,
add_special_tokens=add_special_tokens)
def _can_process_multimodal(self) -> bool:
model_config = self.model_config
@ -439,8 +452,15 @@ class InputPreprocessor:
assert_never(encoder_inputs) # type: ignore[arg-type]
if decoder_inputs is None:
dec_token_ids = self._prepare_decoder_input_ids_for_generation(
None)
if self.model_config.hf_config.model_type == "whisper":
# For Whisper models, the text prompt should go to the decoder.
# If no explicit encoder/decoder inputs, then copy the prompt
# from the encoder to the decoder. The encoder tokens are later
# overridden by the audio features.
dec_token_ids = encoder_inputs["prompt_token_ids"].copy()
else:
dec_token_ids = self._prepare_decoder_input_ids_for_generation(
None)
decoder_inputs = token_inputs(dec_token_ids)
elif (decoder_inputs["type"] == "token"
or decoder_inputs["type"] == "multimodal"):

View File

@ -118,7 +118,7 @@ class RejectionSampler(SpecDecodeStochasticBaseSampler):
# If using the Flashinfer chain_speculative_sampling kernel
# for rejection sampling
if self.use_flashinfer:
if self.use_flashinfer and chain_speculative_sampling is not None:
batch_size, k, _ = draft_probs.shape
uniform_samples = self._create_uniform_samples(
seeded_seqs, batch_size, k, draft_probs.device)

View File

@ -606,8 +606,9 @@ class ExaoneForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
# which is consistent with the practice of setting
# scaling_factor = tensor_amax / FPtype_max
scaling_factor *= 2
if hasattr(layer_self_attn, "kv_scale"):
layer_self_attn.attn._kv_scale = scaling_factor
if hasattr(layer_self_attn.attn, "_k_scale"):
layer_self_attn.attn._k_scale = scaling_factor
layer_self_attn.attn._v_scale = scaling_factor
else:
raise RuntimeError("Self attention has no KV cache scaling "
"factor attribute!")

View File

@ -545,8 +545,9 @@ class GraniteForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
# which is consistent with the practice of setting
# scaling_factor = tensor_amax / FPtype_max
scaling_factor *= 2
if hasattr(layer_self_attn, "kv_scale"):
layer_self_attn.attn._kv_scale = scaling_factor
if hasattr(layer_self_attn.attn, "_k_scale"):
layer_self_attn.attn._k_scale = scaling_factor
layer_self_attn.attn._v_scale = scaling_factor
else:
raise RuntimeError("Self attention has no KV cache scaling "
"factor attribute!")

View File

@ -452,8 +452,9 @@ class LlamaModel(nn.Module):
# which is consistent with the practice of setting
# scaling_factor = tensor_amax / FPtype_max
scaling_factor *= 2
if hasattr(layer_self_attn, "kv_scale"):
layer_self_attn.attn._kv_scale = scaling_factor
if hasattr(layer_self_attn.attn, "_k_scale"):
layer_self_attn.attn._k_scale = scaling_factor
layer_self_attn.attn._v_scale = scaling_factor
else:
raise RuntimeError("Self attention has no KV cache scaling "
"factor attribute!")

View File

@ -170,6 +170,7 @@ _MULTIMODAL_MODELS = {
"UltravoxModel": ("ultravox", "UltravoxModel"),
# [Encoder-decoder]
"MllamaForConditionalGeneration": ("mllama", "MllamaForConditionalGeneration"), # noqa: E501
"WhisperForConditionalGeneration": ("whisper", "WhisperForConditionalGeneration"), # noqa: E501
}
_SPECULATIVE_DECODING_MODELS = {

View File

@ -565,8 +565,9 @@ class SolarForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
# which is consistent with the practice of setting
# scaling_factor = tensor_amax / FPtype_max
scaling_factor *= 2
if hasattr(layer_self_attn, "kv_scale"):
layer_self_attn.attn._kv_scale = scaling_factor
if hasattr(layer_self_attn.attn, "_k_scale"):
layer_self_attn.attn._k_scale = scaling_factor
layer_self_attn.attn._v_scale = scaling_factor
else:
raise RuntimeError("Self attention has no KV cache scaling "
"factor attribute!")

View File

@ -0,0 +1,737 @@
import math
from typing import (Iterable, List, Mapping, Optional, Set, Tuple, TypedDict,
Union)
import numpy as np
import torch
from torch import nn
from transformers.models.whisper.modeling_whisper import sinusoids
from vllm.attention import Attention, AttentionMetadata, AttentionType
from vllm.config import CacheConfig, VllmConfig
from vllm.distributed import get_tensor_model_parallel_world_size
from vllm.inputs import INPUT_REGISTRY, DummyData, InputContext
from vllm.logger import init_logger
from vllm.model_executor.layers.activation import get_act_fn
from vllm.model_executor.layers.linear import (ColumnParallelLinear,
QKVParallelLinear,
RowParallelLinear)
from vllm.model_executor.layers.logits_processor import LogitsProcessor
from vllm.model_executor.layers.quantization.base_config import (
QuantizationConfig)
from vllm.model_executor.layers.sampler import Sampler, SamplerOutput
from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead
from vllm.model_executor.model_loader.weight_utils import default_weight_loader
from vllm.model_executor.sampling_metadata import SamplingMetadata
from vllm.multimodal import (MULTIMODAL_REGISTRY, MultiModalKwargs,
NestedTensors)
from vllm.multimodal.audio import resample_audio
from vllm.sequence import SequenceData
from vllm.transformers_utils.processor import cached_get_processor
from .interfaces import SupportsMultiModal
from .utils import AutoWeightsLoader, WeightsMapper, make_layers
logger = init_logger(__name__)
class WhisperAudioInputs(TypedDict):
input_features: NestedTensors
"""Shape: `(batch_size, 128, M)`"""
class WhisperPositionalEmbedding(nn.Embedding):
def __init__(self,
num_positions: int,
embedding_dim: int,
padding_idx: Optional[int] = None):
super().__init__(num_positions, embedding_dim)
def forward(self, position_ids):
return self.weight[position_ids]
class WhisperAttention(nn.Module):
def __init__(
self,
embed_dim: int,
num_heads: int,
bias: bool = True,
attn_type: AttentionType = AttentionType.DECODER,
cache_config: Optional[CacheConfig] = None,
quant_config: Optional[QuantizationConfig] = None,
prefix: str = "",
):
super().__init__()
self.embed_dim = embed_dim
tp_size = get_tensor_model_parallel_world_size()
self.total_num_heads = num_heads
assert self.total_num_heads % tp_size == 0
self.num_heads = self.total_num_heads // tp_size
if self.total_num_heads >= tp_size:
# Number of heads is greater than TP size, so we partition
# the KV heads across multiple tensor parallel GPUs.
assert self.total_num_heads % tp_size == 0
else:
# Number of heads is less than TP size, so we replicate
# the KV heads across multiple tensor parallel GPUs.
assert tp_size % self.total_num_heads == 0
self.num_kv_heads = max(1, self.total_num_heads // tp_size)
self.head_dim = self.embed_dim // self.total_num_heads
self.q_size = self.num_heads * self.head_dim
self.kv_size = self.num_kv_heads * self.head_dim
self.attn_type = attn_type
if (self.head_dim * num_heads) != self.embed_dim:
raise ValueError(
f"embed_dim must be divisible by num_heads (got `embed_dim`: "
f"{self.embed_dim} and `num_heads`: {num_heads}).")
self.scaling = self.head_dim**-0.5
self._init_qkv(embed_dim, bias, quant_config, prefix=prefix)
self.out_proj = RowParallelLinear(
input_size=embed_dim,
output_size=embed_dim,
bias=bias,
quant_config=quant_config,
prefix=f"{prefix}.out_proj",
)
self.attn = Attention(
self.num_heads,
self.head_dim,
self.scaling,
num_kv_heads=self.num_kv_heads,
cache_config=cache_config,
quant_config=quant_config,
prefix=f"{prefix}.attn",
)
def _init_qkv(
self,
embed_dim: int,
bias: bool = True,
quant_config: Optional[QuantizationConfig] = None,
prefix: str = "",
) -> None:
self.qkv_proj = QKVParallelLinear(
hidden_size=embed_dim,
head_size=self.head_dim,
total_num_heads=self.total_num_heads,
total_num_kv_heads=self.total_num_heads,
bias=bias,
quant_config=quant_config,
prefix=f"{prefix}.qkv_proj",
)
def forward(
self,
hidden_states: torch.Tensor,
kv_cache: torch.Tensor,
attn_metadata: AttentionMetadata,
):
qkv, _ = self.qkv_proj(hidden_states)
q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1)
attn_output = self.attn(q,
k,
v,
kv_cache,
attn_metadata,
attn_type=self.attn_type)
output, _ = self.out_proj(attn_output)
return output
class WhisperCrossAttention(WhisperAttention):
def __init__(
self,
embed_dim: int,
num_heads: int,
bias: bool = True,
cache_config: Optional[CacheConfig] = None,
quant_config: Optional[QuantizationConfig] = None,
prefix: str = "",
):
super().__init__(
embed_dim=embed_dim,
num_heads=num_heads,
bias=bias,
cache_config=cache_config,
quant_config=quant_config,
prefix=prefix,
)
def _init_qkv(
self,
embed_dim: int,
bias: bool = True,
quant_config: Optional[QuantizationConfig] = None,
prefix: str = "",
) -> None:
self.q_proj = ColumnParallelLinear(
input_size=embed_dim,
output_size=embed_dim,
bias=bias,
quant_config=quant_config,
prefix=f"{prefix}.q_proj",
)
self.kv_proj = QKVParallelLinear(
hidden_size=embed_dim,
head_size=self.head_dim,
total_num_heads=0,
total_num_kv_heads=self.total_num_heads,
bias=bias,
quant_config=quant_config,
prefix=f"{prefix}.kv_proj",
)
def forward(
self,
hidden_states: torch.Tensor,
encoder_hidden_states: Optional[torch.Tensor],
kv_cache: torch.Tensor,
attn_metadata: AttentionMetadata,
):
q, _ = self.q_proj(hidden_states)
# Encoder hidden states are only computed once during prefill phase.
# Afterwards, the keys and values should be available in the kv-cache.
if encoder_hidden_states is not None:
kv, _ = self.kv_proj(encoder_hidden_states)
k, v = kv.split([self.kv_size, self.kv_size], dim=-1)
else:
k = v = None
attn_output = self.attn(q,
k,
v,
kv_cache,
attn_metadata,
attn_type=AttentionType.ENCODER_DECODER)
output, _ = self.out_proj(attn_output)
return output
class WhisperMLP(nn.Module):
def __init__(
self,
embed_dim: int,
ffn_dim: int,
act_fn: str,
quant_config: Optional[QuantizationConfig] = None,
prefix: str = "",
):
super().__init__()
self.activation_fn = get_act_fn(act_fn)
self.fc1 = ColumnParallelLinear(
input_size=embed_dim,
output_size=ffn_dim,
quant_config=quant_config,
prefix=f"{prefix}.fc1",
)
self.fc2 = RowParallelLinear(
input_size=ffn_dim,
output_size=embed_dim,
quant_config=quant_config,
prefix=f"{prefix}.fc2",
)
def forward(self, hidden_states: torch.Tensor):
hidden_states, _ = self.fc1(hidden_states)
hidden_states = self.activation_fn(hidden_states)
hidden_states, _ = self.fc2(hidden_states)
return hidden_states
class WhisperEncoderLayer(nn.Module):
def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
super().__init__()
config = vllm_config.model_config.hf_config
cache_config = vllm_config.cache_config
quant_config = vllm_config.quant_config
self.embed_dim = config.d_model
self.self_attn = WhisperAttention(
embed_dim=self.embed_dim,
num_heads=config.encoder_attention_heads,
attn_type=AttentionType.ENCODER,
cache_config=cache_config,
quant_config=quant_config,
prefix=f"{prefix}.self_attn",
)
self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim)
self.mlp = WhisperMLP(
embed_dim=config.d_model,
ffn_dim=config.encoder_ffn_dim,
act_fn=config.activation_function,
quant_config=quant_config,
prefix=f"{prefix}.mlp",
)
self.final_layer_norm = nn.LayerNorm(self.embed_dim)
def forward(
self,
hidden_states: torch.Tensor,
kv_cache: torch.Tensor,
attn_metadata: AttentionMetadata,
):
residual = hidden_states
hidden_states = self.self_attn_layer_norm(hidden_states)
hidden_states = self.self_attn(
hidden_states=hidden_states,
kv_cache=kv_cache,
attn_metadata=attn_metadata,
)
hidden_states = residual + hidden_states
residual = hidden_states
hidden_states = self.final_layer_norm(hidden_states)
hidden_states = self.mlp(hidden_states)
hidden_states = residual + hidden_states
if hidden_states.isinf().any() or hidden_states.isnan().any():
clamp_value = torch.finfo(hidden_states.dtype).max - 1000
hidden_states = torch.clamp(hidden_states,
min=-clamp_value,
max=clamp_value)
return hidden_states
class WhisperDecoderLayer(nn.Module):
def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
super().__init__()
config = vllm_config.model_config.hf_config
cache_config = vllm_config.cache_config
quant_config = vllm_config.quant_config
self.self_attn = WhisperAttention(
embed_dim=config.d_model,
num_heads=config.decoder_attention_heads,
attn_type=AttentionType.DECODER,
cache_config=cache_config,
quant_config=quant_config,
prefix=f"{prefix}.self_attn",
)
self.self_attn_layer_norm = nn.LayerNorm(config.d_model)
self.encoder_attn = WhisperCrossAttention(
embed_dim=config.d_model,
num_heads=config.decoder_attention_heads,
cache_config=cache_config,
quant_config=quant_config,
prefix=f"{prefix}.encoder_attn",
)
self.encoder_attn_layer_norm = nn.LayerNorm(config.d_model)
self.mlp = WhisperMLP(
embed_dim=config.d_model,
ffn_dim=config.decoder_ffn_dim,
act_fn=config.activation_function,
quant_config=quant_config,
prefix=f"{prefix}.mlp",
)
self.final_layer_norm = nn.LayerNorm(config.d_model)
def forward(
self,
hidden_states: torch.Tensor,
encoder_hidden_states: Optional[torch.Tensor],
kv_cache: torch.Tensor,
attn_metadata: AttentionMetadata,
):
residual = hidden_states
hidden_states = self.self_attn_layer_norm(hidden_states)
hidden_states = self.self_attn(hidden_states=hidden_states,
kv_cache=kv_cache,
attn_metadata=attn_metadata)
hidden_states = residual + hidden_states
residual = hidden_states
hidden_states = self.encoder_attn_layer_norm(hidden_states)
hidden_states = self.encoder_attn(
hidden_states=hidden_states,
encoder_hidden_states=encoder_hidden_states,
kv_cache=kv_cache,
attn_metadata=attn_metadata,
)
hidden_states = residual + hidden_states
residual = hidden_states
hidden_states = self.final_layer_norm(hidden_states)
hidden_states = self.mlp(hidden_states)
hidden_states = residual + hidden_states
return hidden_states
class WhisperEncoder(nn.Module):
def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
super().__init__()
config = vllm_config.model_config.hf_config
embed_dim = config.d_model
self.num_mel_bins = config.num_mel_bins
self.padding_idx = config.pad_token_id
self.max_source_positions = config.max_source_positions
self.embed_scale = (math.sqrt(embed_dim)
if config.scale_embedding else 1.0)
self.conv1 = nn.Conv1d(self.num_mel_bins,
embed_dim,
kernel_size=3,
padding=1)
self.conv2 = nn.Conv1d(embed_dim,
embed_dim,
kernel_size=3,
stride=2,
padding=1)
self.embed_positions = nn.Embedding(self.max_source_positions,
embed_dim)
self.start_layer, self.end_layer, self.layers = make_layers(
config.encoder_layers,
lambda prefix: WhisperEncoderLayer(vllm_config=vllm_config,
prefix=f"{prefix}.layers"),
prefix=f"{prefix}.layers",
)
self.layer_norm = nn.LayerNorm(config.d_model)
with torch.no_grad():
self.embed_positions.weight.copy_(
sinusoids(*self.embed_positions.weight.shape))
def forward(
self,
input_features: Union[torch.Tensor, List[torch.Tensor]],
kv_caches: List[torch.Tensor],
attn_metadata: AttentionMetadata,
):
hidden_states = []
for features in input_features:
embeds = nn.functional.gelu(self.conv1(features))
embeds = nn.functional.gelu(self.conv2(embeds))
embeds = embeds.permute(1, 0)
embeds = embeds + self.embed_positions.weight[:embeds.size(0), :]
hidden_states.append(embeds)
hidden_states = torch.cat(hidden_states)
for idx, encoder_layer in enumerate(self.layers):
hidden_states = encoder_layer(
hidden_states,
kv_cache=kv_caches[idx],
attn_metadata=attn_metadata,
)
hidden_states = self.layer_norm(hidden_states)
return hidden_states
class WhisperDecoder(nn.Module):
def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
super().__init__()
config = vllm_config.model_config.hf_config
self.layerdrop = config.decoder_layerdrop
self.padding_idx = config.pad_token_id
self.max_target_positions = config.max_target_positions
self.max_source_positions = config.max_source_positions
self.embed_scale = (math.sqrt(config.d_model)
if config.scale_embedding else 1.0)
self.embed_tokens = nn.Embedding(config.vocab_size, config.d_model,
self.padding_idx)
self.embed_positions = WhisperPositionalEmbedding(
self.max_target_positions, config.d_model)
self.start_layer, self.end_layer, self.layers = make_layers(
config.decoder_layers,
lambda prefix: WhisperDecoderLayer(vllm_config=vllm_config,
prefix=f"{prefix}.layers"),
prefix=f"{prefix}.layers",
)
self.layer_norm = nn.LayerNorm(config.d_model)
def forward(
self,
input_ids,
positions: torch.Tensor,
encoder_hidden_states: Optional[torch.Tensor],
kv_caches: List[torch.Tensor],
attn_metadata: AttentionMetadata,
):
inputs_embeds = self.get_input_embeddings(input_ids)
positions = self.embed_positions(positions)
hidden_states = inputs_embeds + positions
for idx, decoder_layer in enumerate(self.layers):
hidden_states = decoder_layer(
hidden_states,
encoder_hidden_states=encoder_hidden_states,
kv_cache=kv_caches[idx],
attn_metadata=attn_metadata,
)
hidden_states = self.layer_norm(hidden_states)
return hidden_states
def get_input_embeddings(
self,
input_ids: torch.Tensor,
) -> torch.Tensor:
return self.embed_tokens(input_ids)
class WhisperModel(nn.Module):
def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
super().__init__()
self.encoder = WhisperEncoder(vllm_config=vllm_config,
prefix=f"{prefix}.encoder")
self.decoder = WhisperDecoder(vllm_config=vllm_config,
prefix=f"{prefix}.decoder")
def forward(
self,
input_features: Optional[Union[torch.Tensor, List[torch.Tensor]]],
input_ids: Optional[torch.Tensor],
positions: torch.Tensor,
kv_caches: List[torch.Tensor],
attn_metadata: AttentionMetadata,
) -> torch.Tensor:
encoder_outputs = self.get_encoder_outputs(
input_features,
kv_caches=kv_caches,
attn_metadata=attn_metadata,
)
decoder_outputs = self.decoder(
input_ids=input_ids,
positions=positions,
encoder_hidden_states=encoder_outputs,
kv_caches=kv_caches,
attn_metadata=attn_metadata,
)
return decoder_outputs
def get_encoder_outputs(
self,
input_features: Optional[Union[torch.Tensor, List[torch.Tensor]]],
kv_caches: List[torch.Tensor],
attn_metadata: AttentionMetadata,
) -> Optional[torch.Tensor]:
if input_features is None:
return None
return self.encoder(
input_features,
kv_caches=kv_caches,
attn_metadata=attn_metadata,
)
def load_weights(self, weights: Iterable[Tuple[str,
torch.Tensor]]) -> Set[str]:
stacked_params_mapping = [
# (param_name, shard_name, shard_id)
(".self_attn.qkv_proj", ".self_attn.q_proj", "q"),
(".self_attn.qkv_proj", ".self_attn.k_proj", "k"),
(".self_attn.qkv_proj", ".self_attn.v_proj", "v"),
(".encoder_attn.kv_proj", ".encoder_attn.k_proj", "k"),
(".encoder_attn.kv_proj", ".encoder_attn.v_proj", "v"),
]
params_dict = dict(self.named_parameters())
loaded_params: Set[str] = set()
for name, loaded_weight in weights:
for param_name, weight_name, shard_id in stacked_params_mapping:
if weight_name not in name:
continue
name = name.replace(weight_name, param_name)
# Skip loading extra bias for GPTQ models.
if name.endswith(".bias") and name not in params_dict:
continue
param = params_dict[name]
weight_loader = param.weight_loader
weight_loader(param, loaded_weight, shard_id)
break
else:
# Skip loading extra bias for GPTQ models.
if name.endswith(".bias") and name not in params_dict:
continue
param = params_dict[name]
weight_loader = getattr(param, "weight_loader",
default_weight_loader)
weight_loader(param, loaded_weight)
loaded_params.add(name)
return loaded_params
def get_max_whisper_audio_tokens(ctx: InputContext) -> int:
return ctx.model_config.hf_config.max_source_positions
def dummy_encoder_data_for_whisper(ctx: InputContext, seq_len: int,
mm_counts: Mapping[str, int]):
assert mm_counts["audio"] == 1
num_tokens = get_max_whisper_audio_tokens(ctx)
processor = cached_get_processor(ctx.model_config.model)
chunk_length = processor.feature_extractor.chunk_length
sampling_rate = processor.feature_extractor.sampling_rate
num_samples = chunk_length * sampling_rate
return DummyData(
SequenceData.from_prompt_token_counts((0, num_tokens)),
{"audio": [(np.zeros(num_samples), sampling_rate)]},
)
def input_processor_for_whisper(ctx: InputContext, inputs):
multi_modal_data = inputs["encoder"]["multi_modal_data"]
if isinstance(multi_modal_data["audio"], list):
assert len(multi_modal_data["audio"]) == 1
multi_modal_data["audio"] = multi_modal_data["audio"][0]
# Resample and process audio
audio, orig_sr = multi_modal_data["audio"]
processor = cached_get_processor(ctx.model_config.model)
target_sr = processor.feature_extractor.sampling_rate
audio = resample_audio(audio, orig_sr=orig_sr, target_sr=target_sr)
multi_modal_data["audio"] = (audio, target_sr)
# Pre-allocate placeholder tokens in encoder sequence
num_tokens = get_max_whisper_audio_tokens(ctx)
inputs["encoder"]["prompt_token_ids"] = [0] * num_tokens
return inputs
def input_mapper_for_whisper(
ctx: InputContext,
multi_modal_data: Union[np.ndarray, List[np.ndarray]],
) -> MultiModalKwargs:
if not isinstance(multi_modal_data, list):
multi_modal_data = [multi_modal_data]
assert len(multi_modal_data) == 1
if len(multi_modal_data) == 0:
return MultiModalKwargs()
processor = cached_get_processor(ctx.model_config.model)
sampling_rate = processor.feature_extractor.sampling_rate
audios = [audio for audio, _ in multi_modal_data]
kwargs = processor(audios,
sampling_rate=sampling_rate,
return_tensors="pt")
kwargs["input_features"] = kwargs["input_features"].squeeze(0).to(
ctx.model_config.dtype)
return MultiModalKwargs(kwargs)
@INPUT_REGISTRY.register_dummy_encoder_data(dummy_encoder_data_for_whisper)
@INPUT_REGISTRY.register_input_processor(input_processor_for_whisper)
@MULTIMODAL_REGISTRY.register_input_mapper("audio", input_mapper_for_whisper)
@MULTIMODAL_REGISTRY.register_max_multimodal_tokens(
"audio", get_max_whisper_audio_tokens)
class WhisperForConditionalGeneration(nn.Module, SupportsMultiModal):
def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
super().__init__()
config = vllm_config.model_config.hf_config
quant_config = vllm_config.quant_config
self.config = config
self.dtype = vllm_config.model_config.dtype
self.model = WhisperModel(vllm_config=vllm_config, prefix=prefix)
self.unpadded_vocab_size = config.vocab_size
self.proj_out = ParallelLMHead(config.vocab_size,
config.d_model,
quant_config=quant_config)
self.proj_out = self.proj_out.tie_weights(
self.model.decoder.embed_tokens)
logit_scale = getattr(config, "logit_scale", 1.0)
self.logits_processor = LogitsProcessor(self.unpadded_vocab_size,
config.vocab_size, logit_scale)
self.sampler = Sampler()
def forward(
self,
input_ids: torch.Tensor,
positions: torch.Tensor,
kv_caches: List[torch.Tensor],
attn_metadata: AttentionMetadata,
**kwargs,
) -> torch.Tensor:
audio_input = self._parse_and_validate_audio_input(**kwargs)
decoder_outputs = self.model(
input_features=audio_input["input_features"],
input_ids=input_ids,
positions=positions,
kv_caches=kv_caches,
attn_metadata=attn_metadata,
)
return decoder_outputs
def get_multimodal_embeddings(
self,
kv_caches: List[torch.Tensor],
attn_metadata: AttentionMetadata,
**kwargs,
) -> Optional[NestedTensors]:
# TODO: This method does not obey the interface for SupportsMultiModal.
# Refactor this once encoder/decoder support is implemented in V1.
audio_input = self._parse_and_validate_audio_input(**kwargs)
return self.model.get_encoder_outputs(
audio_input["input_features"],
kv_caches=kv_caches,
attn_metadata=attn_metadata,
)
def get_input_embeddings(
self,
input_ids: torch.Tensor,
multimodal_embeddings: Optional[NestedTensors] = None,
attn_metadata: Optional[AttentionMetadata] = None,
) -> torch.Tensor:
# TODO: This method just returns the decoder sequence embeddings since
# Whisper does not have encoder text tokens. Refactor this once
# encoder/decoder support is implemented in V1.
return self.model.decoder.get_input_embeddings(input_ids)
def _parse_and_validate_audio_input(
self, **kwargs: object) -> WhisperAudioInputs:
input_features = kwargs.pop("input_features", None)
if input_features is not None:
if not isinstance(input_features, (torch.Tensor, list)):
raise ValueError("Incorrect type of audio features. "
f"Got type: {type(input_features)}")
input_features = [feat.to(self.dtype) for feat in input_features]
return WhisperAudioInputs(input_features=input_features)
def compute_logits(self, hidden_states: torch.Tensor,
sampling_metadata: SamplingMetadata) -> torch.Tensor:
logits = self.logits_processor(self.proj_out, hidden_states,
sampling_metadata)
return logits
def sample(
self,
logits: torch.Tensor,
sampling_metadata: SamplingMetadata,
) -> Optional[SamplerOutput]:
next_tokens = self.sampler(logits, sampling_metadata)
return next_tokens
def load_weights(self, weights: Iterable[Tuple[str,
torch.Tensor]]) -> Set[str]:
loader = AutoWeightsLoader(self, skip_prefixes=["proj_out."])
loaded_weights = [(name, loaded_weight)
for name, loaded_weight in weights]
mapper = WeightsMapper({".fc1.": ".mlp.fc1.", ".fc2.": ".mlp.fc2."})
return loader.load_weights(loaded_weights, mapper=mapper)

View File

@ -16,7 +16,7 @@ from transformers import BatchFeature, ProcessorMixin
from vllm.inputs import DummyData, InputProcessingContext
from vllm.logger import init_logger
from vllm.transformers_utils.tokenizer import AnyTokenizer, MistralTokenizer
from vllm.transformers_utils.tokenizer import AnyTokenizer, encode_tokens
from vllm.utils import LRUCache, flatten_2d_lists, full_groupby
from .inputs import (MultiModalDataDict, MultiModalFieldConfig,
@ -57,24 +57,6 @@ class PromptReplacement:
)
def _encode(
tokenizer: AnyTokenizer,
text: str,
*,
add_special_tokens: bool = False,
) -> list[int]:
"""
Backend-agnostic equivalent of HF's
:code:`tokenizer.encode(text, add_special_tokens=...)`.
"""
if isinstance(tokenizer, MistralTokenizer):
return tokenizer.tokenizer.encode(text,
bos=add_special_tokens,
eos=add_special_tokens)
return tokenizer.encode(text, add_special_tokens=add_special_tokens)
@lru_cache(maxsize=2048)
def _cached_encode(
tokenizer: AnyTokenizer,
@ -82,7 +64,9 @@ def _cached_encode(
*,
add_special_tokens: bool = False,
) -> list[int]:
return _encode(tokenizer, text, add_special_tokens=add_special_tokens)
return encode_tokens(tokenizer,
text,
add_special_tokens=add_special_tokens)
def _decode(
@ -983,7 +967,9 @@ class BaseMultiModalProcessor(ABC):
mm_item_counts,
)
token_ids = _encode(tokenizer, text)
token_ids = encode_tokens(tokenizer,
text,
add_special_tokens=False)
matched_repls = [match.prompt_repl for match in text_matches]
placeholders = self._find_placeholders(matched_repls, token_ids,

View File

@ -710,15 +710,27 @@ class SequenceGroup:
@property
def multi_modal_data(self) -> MultiModalDataDict:
return self.first_seq.multi_modal_data
if self.first_seq.multi_modal_data:
return self.first_seq.multi_modal_data
elif self.encoder_seq is not None:
return self.encoder_seq.multi_modal_data
return {}
@property
def multi_modal_placeholders(self) -> MultiModalPlaceholderDict:
return self.first_seq.multi_modal_placeholders
if self.first_seq.multi_modal_data:
return self.first_seq.multi_modal_placeholders
elif self.encoder_seq is not None:
return self.encoder_seq.multi_modal_placeholders
return {}
@property
def mm_processor_kwargs(self) -> Dict[str, Any]:
return self.first_seq.mm_processor_kwargs
if self.first_seq.multi_modal_data:
return self.first_seq.mm_processor_kwargs
elif self.encoder_seq is not None:
return self.encoder_seq.mm_processor_kwargs
return {}
@property
def lora_int_id(self) -> int:

View File

@ -21,6 +21,25 @@ AnyTokenizer = Union[PreTrainedTokenizer, PreTrainedTokenizerFast,
MistralTokenizer]
def encode_tokens(
tokenizer: AnyTokenizer,
text: str,
*,
add_special_tokens: Optional[bool] = None,
) -> list[int]:
"""
Backend-agnostic equivalent of HF's
:code:`tokenizer.encode(text, add_special_tokens=...)`.
"""
if isinstance(tokenizer, MistralTokenizer):
return tokenizer.tokenizer.encode(text,
bos=add_special_tokens,
eos=add_special_tokens)
elif add_special_tokens is not None:
return tokenizer.encode(text, add_special_tokens=add_special_tokens)
return tokenizer.encode(text)
def get_cached_tokenizer(tokenizer: AnyTokenizer) -> AnyTokenizer:
"""Get tokenizer with cached properties.

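A small usage sketch for the new `encode_tokens` helper (whisper-tiny is just an example choice; any HF tokenizer behaves the same way):

```python
from transformers import AutoTokenizer

from vllm.transformers_utils.tokenizer import encode_tokens

tokenizer = AutoTokenizer.from_pretrained("openai/whisper-tiny")

# add_special_tokens=None (the default) defers to the tokenizer's own
# behaviour, so special tokens may be appended automatically.
default_ids = encode_tokens(tokenizer, "<|startoftranscript|>")

# Whisper prompts already carry their special tokens, so callers pass
# add_special_tokens=False to keep the prompt exactly as written.
raw_ids = encode_tokens(tokenizer, "<|startoftranscript|>",
                        add_special_tokens=False)

print(default_ids, raw_ids)
```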
View File

@ -32,7 +32,8 @@ class BaseTokenizerGroup(ABC):
def encode(self,
prompt: str,
request_id: Optional[str] = None,
lora_request: Optional[LoRARequest] = None) -> List[int]:
lora_request: Optional[LoRARequest] = None,
add_special_tokens: Optional[bool] = None) -> List[int]:
"""Encode a prompt using the tokenizer group."""
pass
@ -41,7 +42,8 @@ class BaseTokenizerGroup(ABC):
self,
prompt: str,
request_id: Optional[str] = None,
lora_request: Optional[LoRARequest] = None) -> List[int]:
lora_request: Optional[LoRARequest] = None,
add_special_tokens: Optional[bool] = None) -> List[int]:
"""Encode a prompt using the tokenizer group."""
pass

View File

@ -112,7 +112,8 @@ class RayTokenizerGroupPool(BaseTokenizerGroup):
def encode(self,
prompt: str,
request_id: Optional[str] = None,
lora_request: Optional[LoRARequest] = None) -> List[int]:
lora_request: Optional[LoRARequest] = None,
add_special_tokens: Optional[bool] = None) -> List[int]:
"""Encode a prompt using the tokenizer group.
We pick an idle actor and use it to encode the prompt.
@ -132,7 +133,8 @@ class RayTokenizerGroupPool(BaseTokenizerGroup):
ret = ray.get(
actor.encode.remote(request_id=request_id,
prompt=prompt,
lora_request=lora_request))
lora_request=lora_request,
add_special_tokens=add_special_tokens))
except ActorDiedError as e:
# If the actor is dead, we first try to reinitialize it.
logger.warning("%s died with ActorDiedError, reinitializing.",
@ -143,7 +145,8 @@ class RayTokenizerGroupPool(BaseTokenizerGroup):
ret = ray.get(
actor.encode.remote(request_id=request_id,
prompt=prompt,
lora_request=lora_request))
lora_request=lora_request,
add_special_tokens=add_special_tokens))
except ActorDiedError as e:
logger.error(
"%s died for second time in a row, marking "
@ -160,7 +163,8 @@ class RayTokenizerGroupPool(BaseTokenizerGroup):
self,
prompt: str,
request_id: Optional[str] = None,
lora_request: Optional[LoRARequest] = None) -> List[int]:
lora_request: Optional[LoRARequest] = None,
add_special_tokens: Optional[bool] = None) -> List[int]:
"""Encode a prompt using the tokenizer group.
We pick an idle actor and use it to encode the prompt.
@ -177,9 +181,11 @@ class RayTokenizerGroupPool(BaseTokenizerGroup):
actor_is_alive = True
original_actor = actor
try:
ret = await actor.encode.remote(request_id=request_id,
prompt=prompt,
lora_request=lora_request)
ret = await actor.encode.remote(
request_id=request_id,
prompt=prompt,
lora_request=lora_request,
add_special_tokens=add_special_tokens)
except ActorDiedError as e:
# If the actor is dead, we first try to reinitialize it.
logger.warning("%s died with ActorDiedError, reinitializing.",
@ -187,9 +193,11 @@ class RayTokenizerGroupPool(BaseTokenizerGroup):
exc_info=e)
actor = self._init_actor()
try:
ret = await actor.encode.remote(request_id=request_id,
prompt=prompt,
lora_request=lora_request)
ret = await actor.encode.remote(
request_id=request_id,
prompt=prompt,
lora_request=lora_request,
add_special_tokens=add_special_tokens)
except ActorDiedError as e:
logger.error(
"%s died for second time in a row, marking "

View File

@ -2,7 +2,7 @@ from typing import List, Optional
from vllm.config import TokenizerPoolConfig
from vllm.lora.request import LoRARequest
from vllm.transformers_utils.tokenizer import (AnyTokenizer,
from vllm.transformers_utils.tokenizer import (AnyTokenizer, encode_tokens,
get_lora_tokenizer,
get_lora_tokenizer_async,
get_tokenizer)
@ -55,9 +55,12 @@ class TokenizerGroup(BaseTokenizerGroup):
def encode(self,
prompt: str,
request_id: Optional[str] = None,
lora_request: Optional[LoRARequest] = None) -> List[int]:
lora_request: Optional[LoRARequest] = None,
add_special_tokens: Optional[bool] = None) -> List[int]:
tokenizer = self.get_lora_tokenizer(lora_request)
ret = tokenizer.encode(prompt)
ret = encode_tokens(tokenizer,
prompt,
add_special_tokens=add_special_tokens)
self._raise_if_input_too_long(ret, lora_request)
return ret
@ -65,9 +68,12 @@ class TokenizerGroup(BaseTokenizerGroup):
self,
prompt: str,
request_id: Optional[str] = None,
lora_request: Optional[LoRARequest] = None) -> List[int]:
lora_request: Optional[LoRARequest] = None,
add_special_tokens: Optional[bool] = None) -> List[int]:
tokenizer = await self.get_lora_tokenizer_async(lora_request)
ret = tokenizer.encode(prompt)
ret = encode_tokens(tokenizer,
prompt,
add_special_tokens=add_special_tokens)
self._raise_if_input_too_long(ret, lora_request)
return ret

View File

@ -103,9 +103,6 @@ class AsyncLLM(EngineClient):
self.output_handler: Optional[asyncio.Task] = None
def __del__(self):
self.shutdown()
@classmethod
def from_engine_args(
cls,

View File

@ -203,7 +203,6 @@ class EngineCoreProc(EngineCore):
finally:
if engine_core is not None:
engine_core.shutdown()
engine_core = None
def run_busy_loop(self):
"""Core busy loop of the EngineCore."""

View File

@ -1,4 +1,6 @@
from typing import List, Optional, Type
import weakref
from abc import ABC, abstractmethod
from typing import List, Type
import msgspec
import zmq
@ -18,7 +20,7 @@ from vllm.v1.utils import BackgroundProcHandle
logger = init_logger(__name__)
class EngineCoreClient:
class EngineCoreClient(ABC):
"""
EngineCoreClient: subclasses handle different methods for pushing
and pulling from the EngineCore for asyncio / multiprocessing.
@ -52,8 +54,9 @@ class EngineCoreClient:
return InprocClient(vllm_config, executor_class, log_stats)
@abstractmethod
def shutdown(self):
pass
...
def get_output(self) -> List[EngineCoreOutput]:
raise NotImplementedError
@ -107,9 +110,6 @@ class InprocClient(EngineCoreClient):
def shutdown(self):
self.engine_core.shutdown()
def __del__(self):
self.shutdown()
def profile(self, is_start: bool = True) -> None:
self.engine_core.profile(is_start)
@ -139,10 +139,14 @@ class MPClient(EngineCoreClient):
self.decoder = msgspec.msgpack.Decoder(EngineCoreOutputs)
# ZMQ setup.
if asyncio_mode:
self.ctx = zmq.asyncio.Context()
else:
self.ctx = zmq.Context() # type: ignore[attr-defined]
self.ctx = (
zmq.asyncio.Context() # type: ignore[attr-defined]
if asyncio_mode else zmq.Context()) # type: ignore[attr-defined]
# Note(rob): shutdown function cannot be a bound method,
# else the gc cannot collect the object.
self._finalizer = weakref.finalize(self, lambda x: x.destroy(linger=0),
self.ctx)
# Paths and sockets for IPC.
output_path = get_open_zmq_ipc_path()
@ -153,7 +157,6 @@ class MPClient(EngineCoreClient):
zmq.constants.PUSH)
# Start EngineCore in background process.
self.proc_handle: Optional[BackgroundProcHandle]
self.proc_handle = BackgroundProcHandle(
input_path=input_path,
output_path=output_path,
@ -166,12 +169,11 @@ class MPClient(EngineCoreClient):
})
def shutdown(self):
# Shut down the zmq context.
self.ctx.destroy(linger=0)
if hasattr(self, "proc_handle") and self.proc_handle:
"""Clean up background resources."""
if hasattr(self, "proc_handle"):
self.proc_handle.shutdown()
self.proc_handle = None
self._finalizer()
class SyncMPClient(MPClient):

View File

@ -205,10 +205,3 @@ class LLMEngine:
f"found type: {type(tokenizer_group)}")
return tokenizer_group
def __del__(self):
self.shutdown()
def shutdown(self):
if engine_core := getattr(self, "engine_core", None):
engine_core.shutdown()

View File

@ -1,3 +1,4 @@
import multiprocessing
import os
import weakref
from collections.abc import Sequence
@ -91,8 +92,6 @@ class BackgroundProcHandle:
target_fn: Callable,
process_kwargs: Dict[Any, Any],
):
self._finalizer = weakref.finalize(self, self.shutdown)
context = get_mp_context()
reader, writer = context.Pipe(duplex=False)
@ -102,11 +101,11 @@ class BackgroundProcHandle:
process_kwargs["ready_pipe"] = writer
process_kwargs["input_path"] = input_path
process_kwargs["output_path"] = output_path
self.input_path = input_path
self.output_path = output_path
# Run Detokenizer busy loop in background process.
# Run busy loop in background process.
self.proc = context.Process(target=target_fn, kwargs=process_kwargs)
self._finalizer = weakref.finalize(self, shutdown, self.proc,
input_path, output_path)
self.proc.start()
# Wait for startup.
@ -114,21 +113,24 @@ class BackgroundProcHandle:
raise RuntimeError(f"{process_name} initialization failed. "
"See root cause above.")
def __del__(self):
self.shutdown()
def shutdown(self):
# Shutdown the process if needed.
if hasattr(self, "proc") and self.proc.is_alive():
self.proc.terminate()
self.proc.join(5)
self._finalizer()
if self.proc.is_alive():
kill_process_tree(self.proc.pid)
# Remove zmq ipc socket files
ipc_sockets = [self.output_path, self.input_path]
for ipc_socket in ipc_sockets:
socket_file = ipc_socket.replace("ipc://", "")
if os and os.path.exists(socket_file):
os.remove(socket_file)
# Note(rob): shutdown function cannot be a bound method,
# else the gc cannot collect the object.
def shutdown(proc: multiprocessing.Process, input_path: str, output_path: str):
# Shutdown the process.
if proc.is_alive():
proc.terminate()
proc.join(5)
if proc.is_alive():
kill_process_tree(proc.pid)
# Remove zmq ipc socket files.
ipc_sockets = [output_path, input_path]
for ipc_socket in ipc_sockets:
socket_file = ipc_socket.replace("ipc://", "")
if os and os.path.exists(socket_file):
os.remove(socket_file)
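The "shutdown function cannot be a bound method" note reflects a general `weakref.finalize` pitfall rather than anything vLLM-specific; a minimal standalone sketch of why a module-level callback is used (names here are purely illustrative):

```python
import weakref

def cleanup(name: str):
    # Module-level callback: holds no reference back to the handle object.
    print(f"cleaning up {name}")

class Handle:
    def __init__(self, name: str):
        self.name = name
        # Passing a bound method (e.g. self.shutdown) here would keep `self`
        # alive through the finalizer, so the object could never be collected
        # and the callback would only run at interpreter exit.
        self._finalizer = weakref.finalize(self, cleanup, name)

h = Handle("proc-0")
del h  # the finalizer fires once the object is garbage-collected
```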

View File

@ -66,8 +66,9 @@ class InputBatch:
pin_memory=False,
)
self.token_ids_cpu = self.token_ids_cpu_tensor.numpy()
self.num_computed_tokens_cpu = np.empty(max_num_reqs, dtype=np.int32)
self.num_tokens = np.zeros(max_num_reqs, dtype=np.int32)
self.num_prompt_tokens = np.zeros(max_num_reqs, dtype=np.int32)
self.num_computed_tokens_cpu = np.empty(max_num_reqs, dtype=np.int32)
# Attention-related.
self.block_table = torch.zeros(
@ -189,6 +190,7 @@ class InputBatch:
end_idx = start_idx + len(request.output_token_ids)
self.token_ids_cpu[req_index,
start_idx:end_idx] = request.output_token_ids
self.num_tokens[req_index] = request.num_tokens
self.num_computed_tokens_cpu[req_index] = request.num_computed_tokens
num_blocks = len(request.block_ids)
@ -290,14 +292,15 @@ class InputBatch:
self.req_ids[last_req_index] = None
self.req_id_to_index[req_id] = empty_index
# TODO(woosuk): Optimize the copy of token_ids_cpu and
# block_table_cpu.
self.token_ids_cpu[empty_index] = self.token_ids_cpu[
last_req_index]
num_tokens = self.num_tokens[last_req_index]
self.token_ids_cpu[empty_index, :num_tokens] = self.token_ids_cpu[
last_req_index, :num_tokens]
self.num_tokens[empty_index] = num_tokens
self.num_prompt_tokens[empty_index] = \
self.num_prompt_tokens[last_req_index]
self.num_computed_tokens_cpu[
empty_index] = self.num_computed_tokens_cpu[last_req_index]
# TODO(woosuk): Optimize the copy of block_table_cpu.
self.block_table_cpu[empty_index] = self.block_table_cpu[
last_req_index]
self.temperature_cpu[empty_index] = self.temperature_cpu[

View File

@ -661,6 +661,7 @@ class GPUModelRunner:
# Append the sampled token to the output token ids.
token_id = sampled_token_ids[i]
self.input_batch.token_ids_cpu[i, seq_len] = token_id
self.input_batch.num_tokens[i] += 1
req_state.output_token_ids.append(token_id)
else:
# Ignore the sampled token from the partial request.

View File

@ -287,12 +287,11 @@ class EncoderDecoderModelRunner(GPUModelRunnerBase[EncoderDecoderModelInput]):
seq_len,
self.mm_registry,
is_encoder_data=False)
encoder_dummy_data \
= self.input_registry.dummy_data_for_profiling(
self.model_config,
seq_len,
self.mm_registry,
is_encoder_data=True)
encoder_dummy_data = self.input_registry \
.dummy_data_for_profiling(self.model_config,
seq_len,
self.mm_registry,
is_encoder_data=True)
# Having more tokens is over-conservative but otherwise fine
assert len(

View File

@ -1136,7 +1136,8 @@ class GPUModelRunnerBase(ModelRunnerBase[TModelInputForGPU]):
self.prompt_adapter_manager.create_prompt_adapter_manager(
self.model))
if self.kv_cache_dtype == "fp8" and current_platform.is_rocm():
if self.kv_cache_dtype == "fp8" and (current_platform.is_rocm()
or current_platform.is_cuda()):
# Currently only ROCm accepts kv-cache scaling factors
# via quantization_param_path and this will be deprecated
# in the future.