mirror of
https://git.datalinker.icu/vllm-project/vllm.git
synced 2026-05-07 04:59:07 +08:00
Merge branch 'mlm-full-lora-support' of https://github.com/jeejeelee/vllm into mlm-full-lora-support
This commit is contained in:
commit
bbd90e8bae
@ -251,17 +251,6 @@ if ((AVX512_FOUND AND NOT AVX512_DISABLED) OR (ASIMD_FOUND AND NOT APPLE_SILICON
|
|||||||
endif()
|
endif()
|
||||||
|
|
||||||
# Build ACL with CMake
|
# Build ACL with CMake
|
||||||
set(ARM_COMPUTE_BUILD_SHARED_LIB "OFF")
|
|
||||||
set(CMAKE_BUILD_TYPE "Release")
|
|
||||||
set(ARM_COMPUTE_ARCH "armv8.2-a")
|
|
||||||
set(ARM_COMPUTE_ENABLE_ASSERTS "OFF")
|
|
||||||
set(ARM_COMPUTE_ENABLE_CPPTHREADS "OFF")
|
|
||||||
set(ONEDNN_ENABLE_PRIMITIVE "MATMUL;REORDER")
|
|
||||||
set(ARM_COMPUTE_ENABLE_OPENMP "ON")
|
|
||||||
set(ARM_COMPUTE_ENABLE_WERROR "OFF")
|
|
||||||
set(ARM_COMPUTE_BUILD_EXAMPLES "OFF")
|
|
||||||
set(ARM_COMPUTE_BUILD_TESTING "OFF")
|
|
||||||
|
|
||||||
set(_cmake_config_cmd
|
set(_cmake_config_cmd
|
||||||
${CMAKE_COMMAND} -G Ninja -B build
|
${CMAKE_COMMAND} -G Ninja -B build
|
||||||
-DARM_COMPUTE_BUILD_SHARED_LIB=OFF
|
-DARM_COMPUTE_BUILD_SHARED_LIB=OFF
|
||||||
|
|||||||
@ -186,7 +186,7 @@ struct AttentionMetadata {
|
|||||||
// - Intermediate outputs: q_tile_size * head_dim * output_buffer_elem_size + 2
|
// - Intermediate outputs: q_tile_size * head_dim * output_buffer_elem_size + 2
|
||||||
// * q_tile_size * 4, partial output, max + sum (float)
|
// * q_tile_size * 4, partial output, max + sum (float)
|
||||||
// Reduction scratchpad contains:
|
// Reduction scratchpad contains:
|
||||||
// - flags: bool array to indicate wether the split is finished
|
// - flags: bool array to indicate whether the split is finished
|
||||||
// - outputs: split_num * q_tile_size * head_dim * output_buffer_elem_size
|
// - outputs: split_num * q_tile_size * head_dim * output_buffer_elem_size
|
||||||
// - max, sum: 2 * split_num * q_tile_size * 4
|
// - max, sum: 2 * split_num * q_tile_size * 4
|
||||||
class AttentionScratchPad {
|
class AttentionScratchPad {
|
||||||
|
|||||||
@ -617,7 +617,7 @@ struct MacheteCollectiveMma {
|
|||||||
|
|
||||||
// Same as upstream, should be kept the same when possible, not formatted for
|
// Same as upstream, should be kept the same when possible, not formatted for
|
||||||
// easier comparison
|
// easier comparison
|
||||||
// with `SwapAB ? N : M -> M` since we dont support SwapAB
|
// with `SwapAB ? N : M -> M` since we don't support SwapAB
|
||||||
// clang-format off
|
// clang-format off
|
||||||
template<class ProblemShape>
|
template<class ProblemShape>
|
||||||
static bool
|
static bool
|
||||||
|
|||||||
@ -22,7 +22,7 @@ python tools/install_nixl_from_source_ubuntu.py
|
|||||||
NixlConnector uses NIXL library for underlying communication, which supports multiple transport backends. UCX (Unified Communication X) is the primary default transport library used by NIXL. Configure transport environment variables:
|
NixlConnector uses NIXL library for underlying communication, which supports multiple transport backends. UCX (Unified Communication X) is the primary default transport library used by NIXL. Configure transport environment variables:
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
# Example UCX configuration, adjust according to your enviroment
|
# Example UCX configuration, adjust according to your environment
|
||||||
export UCX_TLS=all # or specify specific transports like "rc,ud,sm,^cuda_ipc" ..etc
|
export UCX_TLS=all # or specify specific transports like "rc,ud,sm,^cuda_ipc" ..etc
|
||||||
export UCX_NET_DEVICES=all # or specify network devices like "mlx5_0:1,mlx5_1:1"
|
export UCX_NET_DEVICES=all # or specify network devices like "mlx5_0:1,mlx5_1:1"
|
||||||
```
|
```
|
||||||
|
|||||||
@ -75,7 +75,7 @@ torchgeo==0.7.0
|
|||||||
mteb==2.1.2
|
mteb==2.1.2
|
||||||
|
|
||||||
# Data processing
|
# Data processing
|
||||||
xgrammar==0.1.27
|
xgrammar @ git+https://github.com/divakar-amd/xgrammar@3272f7c520564858056a60480d5afdf69ae79c84
|
||||||
# Test async scheduling
|
# Test async scheduling
|
||||||
|
|
||||||
# Utilities
|
# Utilities
|
||||||
|
|||||||
@ -7,7 +7,8 @@ import math
|
|||||||
import pytest
|
import pytest
|
||||||
import torch
|
import torch
|
||||||
|
|
||||||
from vllm.platforms import current_platform
|
from vllm.platforms import CpuArchEnum, current_platform
|
||||||
|
from vllm.v1.attention.backends.cpu_attn import _get_attn_isa
|
||||||
|
|
||||||
if not current_platform.is_cpu():
|
if not current_platform.is_cpu():
|
||||||
pytest.skip("skipping CPU-only tests", allow_module_level=True)
|
pytest.skip("skipping CPU-only tests", allow_module_level=True)
|
||||||
@ -36,6 +37,21 @@ SEQ_LENS = [ # (q_len, kv_len)
|
|||||||
]
|
]
|
||||||
|
|
||||||
|
|
||||||
|
def get_attn_isa(
|
||||||
|
block_size: int | None = None,
|
||||||
|
dtype: torch.dtype | None = None,
|
||||||
|
):
|
||||||
|
if block_size and dtype:
|
||||||
|
return _get_attn_isa(dtype, block_size)
|
||||||
|
else:
|
||||||
|
if current_platform.get_cpu_architecture() == CpuArchEnum.ARM:
|
||||||
|
return "neon"
|
||||||
|
elif torch._C._cpu._is_amx_tile_supported():
|
||||||
|
return "amx"
|
||||||
|
else:
|
||||||
|
return "vec"
|
||||||
|
|
||||||
|
|
||||||
# rand number generation takes too much time, cache rand tensors
|
# rand number generation takes too much time, cache rand tensors
|
||||||
@functools.lru_cache(maxsize=128, typed=False)
|
@functools.lru_cache(maxsize=128, typed=False)
|
||||||
def tensor_cache(
|
def tensor_cache(
|
||||||
@ -452,6 +468,49 @@ def test_varlen_with_paged_kv_normal_vec16(
|
|||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize("seq_lens", SEQ_LENS)
|
||||||
|
@pytest.mark.parametrize("num_heads", NUM_HEADS)
|
||||||
|
@pytest.mark.parametrize("head_size", HEAD_SIZES)
|
||||||
|
@pytest.mark.parametrize("block_size", [96, 128])
|
||||||
|
@pytest.mark.parametrize("sliding_window", SLIDING_WINDOWS)
|
||||||
|
@pytest.mark.parametrize("dtype", QTYPES)
|
||||||
|
@pytest.mark.parametrize("soft_cap", [None])
|
||||||
|
@pytest.mark.parametrize("num_blocks", NUM_BLOCKS)
|
||||||
|
@pytest.mark.parametrize("use_alibi", [False])
|
||||||
|
@pytest.mark.parametrize("use_sink", [False])
|
||||||
|
@pytest.mark.parametrize("isa", ["neon"])
|
||||||
|
@pytest.mark.skipif(
|
||||||
|
current_platform.get_cpu_architecture() != CpuArchEnum.ARM,
|
||||||
|
reason="Not an Arm CPU.",
|
||||||
|
)
|
||||||
|
def test_varlen_with_paged_kv_normal_neon(
|
||||||
|
seq_lens: list[tuple[int, int]],
|
||||||
|
num_heads: tuple[int, int],
|
||||||
|
head_size: int,
|
||||||
|
sliding_window: int | None,
|
||||||
|
dtype: torch.dtype,
|
||||||
|
block_size: int,
|
||||||
|
soft_cap: float | None,
|
||||||
|
num_blocks: int,
|
||||||
|
use_alibi: bool,
|
||||||
|
use_sink: bool,
|
||||||
|
isa: str,
|
||||||
|
) -> None:
|
||||||
|
varlen_with_paged_kv(
|
||||||
|
seq_lens=seq_lens,
|
||||||
|
num_heads=num_heads,
|
||||||
|
head_size=head_size,
|
||||||
|
sliding_window=sliding_window,
|
||||||
|
dtype=dtype,
|
||||||
|
block_size=block_size,
|
||||||
|
soft_cap=soft_cap,
|
||||||
|
num_blocks=num_blocks,
|
||||||
|
use_alibi=use_alibi,
|
||||||
|
use_sink=use_sink,
|
||||||
|
isa=isa,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize("seq_lens", SEQ_LENS)
|
@pytest.mark.parametrize("seq_lens", SEQ_LENS)
|
||||||
@pytest.mark.parametrize("num_heads", NUM_HEADS)
|
@pytest.mark.parametrize("num_heads", NUM_HEADS)
|
||||||
@pytest.mark.parametrize("head_size", [96])
|
@pytest.mark.parametrize("head_size", [96])
|
||||||
@ -462,9 +521,7 @@ def test_varlen_with_paged_kv_normal_vec16(
|
|||||||
@pytest.mark.parametrize("num_blocks", NUM_BLOCKS)
|
@pytest.mark.parametrize("num_blocks", NUM_BLOCKS)
|
||||||
@pytest.mark.parametrize("use_alibi", [False])
|
@pytest.mark.parametrize("use_alibi", [False])
|
||||||
@pytest.mark.parametrize("use_sink", [False])
|
@pytest.mark.parametrize("use_sink", [False])
|
||||||
@pytest.mark.parametrize(
|
@pytest.mark.parametrize("isa", [get_attn_isa()])
|
||||||
"isa", ["amx"] if torch._C._cpu._is_amx_tile_supported() else ["vec"]
|
|
||||||
)
|
|
||||||
def test_varlen_with_paged_kv_softcap(
|
def test_varlen_with_paged_kv_softcap(
|
||||||
seq_lens: list[tuple[int, int]],
|
seq_lens: list[tuple[int, int]],
|
||||||
num_heads: tuple[int, int],
|
num_heads: tuple[int, int],
|
||||||
@ -503,9 +560,7 @@ def test_varlen_with_paged_kv_softcap(
|
|||||||
@pytest.mark.parametrize("num_blocks", NUM_BLOCKS)
|
@pytest.mark.parametrize("num_blocks", NUM_BLOCKS)
|
||||||
@pytest.mark.parametrize("use_alibi", [True])
|
@pytest.mark.parametrize("use_alibi", [True])
|
||||||
@pytest.mark.parametrize("use_sink", [False])
|
@pytest.mark.parametrize("use_sink", [False])
|
||||||
@pytest.mark.parametrize(
|
@pytest.mark.parametrize("isa", [get_attn_isa()])
|
||||||
"isa", ["amx"] if torch._C._cpu._is_amx_tile_supported() else ["vec"]
|
|
||||||
)
|
|
||||||
def test_varlen_with_paged_kv_alibi(
|
def test_varlen_with_paged_kv_alibi(
|
||||||
seq_lens: list[tuple[int, int]],
|
seq_lens: list[tuple[int, int]],
|
||||||
num_heads: tuple[int, int],
|
num_heads: tuple[int, int],
|
||||||
@ -544,9 +599,7 @@ def test_varlen_with_paged_kv_alibi(
|
|||||||
@pytest.mark.parametrize("num_blocks", NUM_BLOCKS)
|
@pytest.mark.parametrize("num_blocks", NUM_BLOCKS)
|
||||||
@pytest.mark.parametrize("use_alibi", [False])
|
@pytest.mark.parametrize("use_alibi", [False])
|
||||||
@pytest.mark.parametrize("use_sink", [True])
|
@pytest.mark.parametrize("use_sink", [True])
|
||||||
@pytest.mark.parametrize(
|
@pytest.mark.parametrize("isa", [get_attn_isa()])
|
||||||
"isa", ["amx"] if torch._C._cpu._is_amx_tile_supported() else ["vec"]
|
|
||||||
)
|
|
||||||
def test_varlen_with_paged_kv_sink(
|
def test_varlen_with_paged_kv_sink(
|
||||||
seq_lens: list[tuple[int, int]],
|
seq_lens: list[tuple[int, int]],
|
||||||
num_heads: tuple[int, int],
|
num_heads: tuple[int, int],
|
||||||
|
|||||||
@ -1,6 +1,7 @@
|
|||||||
# SPDX-License-Identifier: Apache-2.0
|
# SPDX-License-Identifier: Apache-2.0
|
||||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||||
|
|
||||||
|
import asyncio
|
||||||
import base64
|
import base64
|
||||||
import mimetypes
|
import mimetypes
|
||||||
import os
|
import os
|
||||||
@ -186,6 +187,7 @@ async def test_fetch_image_error_conversion():
|
|||||||
connector.fetch_image(broken_img)
|
connector.fetch_image(broken_img)
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.flaky(reruns=3, reruns_delay=5)
|
||||||
@pytest.mark.asyncio
|
@pytest.mark.asyncio
|
||||||
@pytest.mark.parametrize("video_url", TEST_VIDEO_URLS)
|
@pytest.mark.parametrize("video_url", TEST_VIDEO_URLS)
|
||||||
@pytest.mark.parametrize("num_frames", [-1, 32, 1800])
|
@pytest.mark.parametrize("num_frames", [-1, 32, 1800])
|
||||||
@ -198,8 +200,12 @@ async def test_fetch_video_http(video_url: str, num_frames: int):
|
|||||||
}
|
}
|
||||||
)
|
)
|
||||||
|
|
||||||
video_sync, metadata_sync = connector.fetch_video(video_url)
|
try:
|
||||||
video_async, metadata_async = await connector.fetch_video_async(video_url)
|
video_sync, metadata_sync = connector.fetch_video(video_url)
|
||||||
|
video_async, metadata_async = await connector.fetch_video_async(video_url)
|
||||||
|
except (TimeoutError, asyncio.TimeoutError) as e:
|
||||||
|
pytest.skip(f"Timeout fetching video (CI network flakiness): {e}")
|
||||||
|
|
||||||
assert np.array_equal(video_sync, video_async)
|
assert np.array_equal(video_sync, video_async)
|
||||||
assert metadata_sync == metadata_async
|
assert metadata_sync == metadata_async
|
||||||
|
|
||||||
|
|||||||
@ -8,6 +8,7 @@ import torch._dynamo.config as dynamo_config
|
|||||||
|
|
||||||
from vllm import SamplingParams
|
from vllm import SamplingParams
|
||||||
from vllm.logprobs import Logprob
|
from vllm.logprobs import Logprob
|
||||||
|
from vllm.platforms import current_platform
|
||||||
from vllm.sampling_params import StructuredOutputsParams
|
from vllm.sampling_params import StructuredOutputsParams
|
||||||
from vllm.v1.metrics.reader import Metric
|
from vllm.v1.metrics.reader import Metric
|
||||||
|
|
||||||
@ -70,6 +71,18 @@ def test_without_spec_decoding(
|
|||||||
(True, "uni", True, None, True),
|
(True, "uni", True, None, True),
|
||||||
]
|
]
|
||||||
|
|
||||||
|
if current_platform.is_rocm():
|
||||||
|
# On ROCm, Only test with structured_outputs (deterministic)
|
||||||
|
# and skip chunk_prefill (more variable).
|
||||||
|
test_configs = [
|
||||||
|
cfg
|
||||||
|
for cfg in test_configs
|
||||||
|
if not cfg[4] # skip chunk_prefill=True
|
||||||
|
]
|
||||||
|
test_sampling_params = [
|
||||||
|
p for p in test_sampling_params if p.get("structured_outputs") is not None
|
||||||
|
]
|
||||||
|
|
||||||
run_tests(monkeypatch, MODEL, test_configs, test_sampling_params)
|
run_tests(monkeypatch, MODEL, test_configs, test_sampling_params)
|
||||||
|
|
||||||
|
|
||||||
@ -108,7 +121,14 @@ def test_with_spec_decoding(monkeypatch: pytest.MonkeyPatch):
|
|||||||
(True, "uni", True, spec_config_short, True),
|
(True, "uni", True, spec_config_short, True),
|
||||||
]
|
]
|
||||||
|
|
||||||
run_tests(monkeypatch, MTP_MODEL, test_configs, test_sampling_params)
|
# On ROCm, use TRITON_ATTN + float32 for better numerical consistency
|
||||||
|
run_tests(
|
||||||
|
monkeypatch,
|
||||||
|
MTP_MODEL,
|
||||||
|
test_configs,
|
||||||
|
test_sampling_params,
|
||||||
|
is_testing_with_spec_decoding=True,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
@dynamo_config.patch(cache_size_limit=16)
|
@dynamo_config.patch(cache_size_limit=16)
|
||||||
@ -117,13 +137,21 @@ def run_tests(
|
|||||||
model: str,
|
model: str,
|
||||||
test_configs: list[tuple],
|
test_configs: list[tuple],
|
||||||
test_sampling_params: list[dict[str, Any]],
|
test_sampling_params: list[dict[str, Any]],
|
||||||
|
is_testing_with_spec_decoding: bool = False,
|
||||||
):
|
):
|
||||||
"""Test consistency of combos of async scheduling, preemption,
|
"""Test consistency of combos of async scheduling, preemption,
|
||||||
uni/multiproc executor with spec decoding."""
|
uni/multiproc executor with spec decoding."""
|
||||||
|
|
||||||
with monkeypatch.context() as m:
|
with monkeypatch.context() as m:
|
||||||
# avoid precision errors
|
# avoid precision errors
|
||||||
m.setenv("VLLM_ATTENTION_BACKEND", "FLEX_ATTENTION")
|
if current_platform.is_rocm():
|
||||||
|
if is_testing_with_spec_decoding:
|
||||||
|
# Use TRITON_ATTN for spec decoding test for consistency
|
||||||
|
m.setenv("VLLM_ATTENTION_BACKEND", "TRITON_ATTN")
|
||||||
|
else:
|
||||||
|
m.setenv("VLLM_ATTENTION_BACKEND", "ROCM_AITER_FA")
|
||||||
|
else:
|
||||||
|
m.setenv("VLLM_ATTENTION_BACKEND", "FLEX_ATTENTION")
|
||||||
# lock matmul precision to full FP32
|
# lock matmul precision to full FP32
|
||||||
m.setenv("VLLM_FLOAT32_MATMUL_PRECISION", "highest")
|
m.setenv("VLLM_FLOAT32_MATMUL_PRECISION", "highest")
|
||||||
# m.setenv("VLLM_BATCH_INVARIANT", "1")
|
# m.setenv("VLLM_BATCH_INVARIANT", "1")
|
||||||
@ -145,6 +173,7 @@ def run_tests(
|
|||||||
async_scheduling,
|
async_scheduling,
|
||||||
spec_config,
|
spec_config,
|
||||||
test_prefill_chunking=test_prefill_chunking,
|
test_prefill_chunking=test_prefill_chunking,
|
||||||
|
is_testing_with_spec_decoding=is_testing_with_spec_decoding,
|
||||||
)
|
)
|
||||||
outputs.append(test_results)
|
outputs.append(test_results)
|
||||||
|
|
||||||
@ -174,17 +203,34 @@ def run_tests(
|
|||||||
name_0=f"baseline=[{baseline_config}], params={params}",
|
name_0=f"baseline=[{baseline_config}], params={params}",
|
||||||
name_1=f"config=[{test_config}], params={params}",
|
name_1=f"config=[{test_config}], params={params}",
|
||||||
)
|
)
|
||||||
assert _all_logprobs_match(base_logprobs, test_logprobs)
|
|
||||||
|
# On ROCm with TRITON_ATTN (spec decoding test), skip strict
|
||||||
|
# logprobs comparison when logprobs are requested
|
||||||
|
skip_logprobs_check = (
|
||||||
|
current_platform.is_rocm()
|
||||||
|
and params.get("logprobs")
|
||||||
|
and is_testing_with_spec_decoding
|
||||||
|
)
|
||||||
|
if not skip_logprobs_check:
|
||||||
|
assert _all_logprobs_match(base_logprobs, test_logprobs)
|
||||||
|
|
||||||
if (
|
if (
|
||||||
base_acceptance_rate is not None
|
base_acceptance_rate is not None
|
||||||
and test_acceptance_rate is not None
|
and test_acceptance_rate is not None
|
||||||
):
|
):
|
||||||
if "spec_mml=None" in test_config:
|
if "spec_mml=None" in test_config:
|
||||||
|
# Preemption causes more variance in acceptance rates
|
||||||
|
if (
|
||||||
|
current_platform.is_rocm()
|
||||||
|
and "preemption=True" in test_config
|
||||||
|
):
|
||||||
|
tolerance = 0.10
|
||||||
|
else:
|
||||||
|
tolerance = 0.05
|
||||||
assert (
|
assert (
|
||||||
test_acceptance_rate > base_acceptance_rate
|
test_acceptance_rate > base_acceptance_rate
|
||||||
or test_acceptance_rate
|
or test_acceptance_rate
|
||||||
== pytest.approx(base_acceptance_rate, rel=5e-2)
|
== pytest.approx(base_acceptance_rate, rel=tolerance)
|
||||||
)
|
)
|
||||||
else:
|
else:
|
||||||
# Currently the reported acceptance rate is expected to be
|
# Currently the reported acceptance rate is expected to be
|
||||||
@ -215,6 +261,7 @@ def run_test(
|
|||||||
async_scheduling: bool,
|
async_scheduling: bool,
|
||||||
spec_config: dict[str, Any] | None,
|
spec_config: dict[str, Any] | None,
|
||||||
test_prefill_chunking: bool,
|
test_prefill_chunking: bool,
|
||||||
|
is_testing_with_spec_decoding: bool = False,
|
||||||
):
|
):
|
||||||
spec_decoding = spec_config is not None
|
spec_decoding = spec_config is not None
|
||||||
cache_arg: dict[str, Any] = (
|
cache_arg: dict[str, Any] = (
|
||||||
@ -233,6 +280,15 @@ def run_test(
|
|||||||
print("-" * 80)
|
print("-" * 80)
|
||||||
print(f"---- TESTING {test_str}: {test_config}")
|
print(f"---- TESTING {test_str}: {test_config}")
|
||||||
print("-" * 80)
|
print("-" * 80)
|
||||||
|
|
||||||
|
# On ROCm: use float16 for first test (ROCM_AITER_FA), but float32 for
|
||||||
|
# spec decoding test (TRITON_ATTN) for better precision.
|
||||||
|
# On others: always use float32.
|
||||||
|
if current_platform.is_rocm() and not is_testing_with_spec_decoding:
|
||||||
|
dtype = "float16"
|
||||||
|
else:
|
||||||
|
dtype = "float32"
|
||||||
|
|
||||||
with VllmRunner(
|
with VllmRunner(
|
||||||
model,
|
model,
|
||||||
max_model_len=512,
|
max_model_len=512,
|
||||||
@ -242,7 +298,7 @@ def run_test(
|
|||||||
# enforce_eager=True,
|
# enforce_eager=True,
|
||||||
async_scheduling=async_scheduling,
|
async_scheduling=async_scheduling,
|
||||||
distributed_executor_backend=executor,
|
distributed_executor_backend=executor,
|
||||||
dtype="float32", # avoid precision errors
|
dtype=dtype,
|
||||||
speculative_config=spec_config,
|
speculative_config=spec_config,
|
||||||
disable_log_stats=False,
|
disable_log_stats=False,
|
||||||
**cache_arg,
|
**cache_arg,
|
||||||
@ -302,11 +358,21 @@ def _all_logprobs_match(req_a, req_b) -> bool:
|
|||||||
|
|
||||||
|
|
||||||
def _logprobs_match(lps_a: dict[int, Logprob], lps_b: dict[int, Logprob]) -> bool:
|
def _logprobs_match(lps_a: dict[int, Logprob], lps_b: dict[int, Logprob]) -> bool:
|
||||||
return len(lps_a) == len(lps_b) and all(
|
if current_platform.is_rocm():
|
||||||
a.decoded_token == b.decoded_token
|
# ROCm has higher numerical variance
|
||||||
and a.rank == b.rank
|
# due to use of float16.
|
||||||
and a.logprob == pytest.approx(b.logprob, rel=1e-3, abs=1e-6)
|
rel_tol, abs_tol = 5e-2, 1e-5
|
||||||
for a, b in ((lps_a[x], lps_b[x]) for x in lps_a)
|
else:
|
||||||
|
rel_tol, abs_tol = 1e-3, 1e-6
|
||||||
|
return (
|
||||||
|
len(lps_a) == len(lps_b)
|
||||||
|
and lps_a.keys() == lps_b.keys()
|
||||||
|
and all(
|
||||||
|
a.decoded_token == b.decoded_token
|
||||||
|
and a.rank == b.rank
|
||||||
|
and a.logprob == pytest.approx(b.logprob, rel=rel_tol, abs=abs_tol)
|
||||||
|
for a, b in ((lps_a[x], lps_b[x]) for x in lps_a)
|
||||||
|
)
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@ -881,7 +881,7 @@ class FusedMoE(CustomOp):
|
|||||||
# Record that the clone will be used by shared_experts_stream
|
# Record that the clone will be used by shared_experts_stream
|
||||||
# to avoid gc issue from deallocation of hidden_states_clone
|
# to avoid gc issue from deallocation of hidden_states_clone
|
||||||
# For more details: https://docs.pytorch.org/docs/stable/generated/torch.Tensor.record_stream.html # noqa: E501
|
# For more details: https://docs.pytorch.org/docs/stable/generated/torch.Tensor.record_stream.html # noqa: E501
|
||||||
# NOTE: We dont need shared_output.record_stream(current_stream())
|
# NOTE: We don't need shared_output.record_stream(current_stream())
|
||||||
# because we synch the streams before using shared_output.
|
# because we synch the streams before using shared_output.
|
||||||
hidden_states_clone.record_stream(self.shared_experts_stream)
|
hidden_states_clone.record_stream(self.shared_experts_stream)
|
||||||
|
|
||||||
|
|||||||
@ -28,7 +28,7 @@ class CompressedTensorsW4A16Fp4(CompressedTensorsScheme):
|
|||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def get_min_capability(cls) -> int:
|
def get_min_capability(cls) -> int:
|
||||||
# dont restrict as emulations
|
# don't restrict as emulations
|
||||||
return 80
|
return 80
|
||||||
|
|
||||||
def create_weights(
|
def create_weights(
|
||||||
|
|||||||
@ -403,6 +403,7 @@ class Qwen3MoeModel(nn.Module):
|
|||||||
self.padding_idx = config.pad_token_id
|
self.padding_idx = config.pad_token_id
|
||||||
self.vocab_size = config.vocab_size
|
self.vocab_size = config.vocab_size
|
||||||
self.config = config
|
self.config = config
|
||||||
|
self.quant_config = quant_config
|
||||||
self.embed_tokens = VocabParallelEmbedding(
|
self.embed_tokens = VocabParallelEmbedding(
|
||||||
config.vocab_size,
|
config.vocab_size,
|
||||||
config.hidden_size,
|
config.hidden_size,
|
||||||
@ -505,6 +506,19 @@ class Qwen3MoeModel(nn.Module):
|
|||||||
loaded_params: set[str] = set()
|
loaded_params: set[str] = set()
|
||||||
expert_params_mapping = self.get_expert_mapping()
|
expert_params_mapping = self.get_expert_mapping()
|
||||||
for name, loaded_weight in weights:
|
for name, loaded_weight in weights:
|
||||||
|
if self.quant_config is not None and (
|
||||||
|
scale_name := self.quant_config.get_cache_scale(name)
|
||||||
|
):
|
||||||
|
# Loading kv cache quantization scales
|
||||||
|
param = params_dict[scale_name]
|
||||||
|
weight_loader = getattr(param, "weight_loader", default_weight_loader)
|
||||||
|
assert loaded_weight.numel() == 1, (
|
||||||
|
f"KV scale numel {loaded_weight.numel()} != 1"
|
||||||
|
)
|
||||||
|
loaded_weight = loaded_weight.squeeze()
|
||||||
|
weight_loader(param, loaded_weight)
|
||||||
|
loaded_params.add(scale_name)
|
||||||
|
continue
|
||||||
for param_name, weight_name, shard_id in stacked_params_mapping:
|
for param_name, weight_name, shard_id in stacked_params_mapping:
|
||||||
# Skip non-stacked layers and experts (experts handled below).
|
# Skip non-stacked layers and experts (experts handled below).
|
||||||
if weight_name not in name:
|
if weight_name not in name:
|
||||||
|
|||||||
@ -17,6 +17,8 @@ class DeepseekV32Tokenizer(HfTokenizer):
|
|||||||
self.name_or_path = (
|
self.name_or_path = (
|
||||||
tokenizer.name_or_path if hasattr(tokenizer, "name_or_path") else ""
|
tokenizer.name_or_path if hasattr(tokenizer, "name_or_path") else ""
|
||||||
)
|
)
|
||||||
|
self._added_vocab = self.tokenizer.get_added_vocab()
|
||||||
|
self._added_vocab_size = len(self._added_vocab)
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def from_pretrained(
|
def from_pretrained(
|
||||||
@ -98,7 +100,7 @@ class DeepseekV32Tokenizer(HfTokenizer):
|
|||||||
|
|
||||||
def __len__(self) -> int:
|
def __len__(self) -> int:
|
||||||
# </think> is an added token in DeepseekV32 tokenizer
|
# </think> is an added token in DeepseekV32 tokenizer
|
||||||
return self.vocab_size + len(self.get_added_vocab())
|
return self.vocab_size + self._added_vocab_size
|
||||||
|
|
||||||
def __call__(
|
def __call__(
|
||||||
self,
|
self,
|
||||||
@ -120,7 +122,7 @@ class DeepseekV32Tokenizer(HfTokenizer):
|
|||||||
return self.tokenizer.get_vocab()
|
return self.tokenizer.get_vocab()
|
||||||
|
|
||||||
def get_added_vocab(self) -> dict[str, int]:
|
def get_added_vocab(self) -> dict[str, int]:
|
||||||
return self.tokenizer.get_added_vocab()
|
return self._added_vocab.copy()
|
||||||
|
|
||||||
def encode(
|
def encode(
|
||||||
self,
|
self,
|
||||||
|
|||||||
@ -10,7 +10,7 @@ import torch
|
|||||||
import vllm.envs
|
import vllm.envs
|
||||||
from vllm.logger import init_logger
|
from vllm.logger import init_logger
|
||||||
from vllm.sampling_params import SamplingParams
|
from vllm.sampling_params import SamplingParams
|
||||||
from vllm.tokenizers import MistralTokenizer
|
from vllm.tokenizers import DeepseekV32Tokenizer, MistralTokenizer
|
||||||
from vllm.utils.import_utils import LazyLoader
|
from vllm.utils.import_utils import LazyLoader
|
||||||
from vllm.v1.structured_output.backend_types import (
|
from vllm.v1.structured_output.backend_types import (
|
||||||
StructuredOutputBackend,
|
StructuredOutputBackend,
|
||||||
@ -56,6 +56,27 @@ class XgrammarBackend(StructuredOutputBackend):
|
|||||||
stop_token_ids=stop_token_ids,
|
stop_token_ids=stop_token_ids,
|
||||||
add_prefix_space=True,
|
add_prefix_space=True,
|
||||||
)
|
)
|
||||||
|
elif isinstance(self.tokenizer, DeepseekV32Tokenizer):
|
||||||
|
# copy from xgr.TokenizerInfo.from_huggingface()
|
||||||
|
# because we are using a custom tokenizer wrapper here.
|
||||||
|
vocab_dict = self.tokenizer.get_vocab()
|
||||||
|
tokenizer_vocab_size = max(len(vocab_dict), self.tokenizer.max_token_id + 1)
|
||||||
|
vocab_size = self.vocab_size or tokenizer_vocab_size
|
||||||
|
# maintain tokenizer's indexing
|
||||||
|
encoded_vocab = [""] * vocab_size
|
||||||
|
for token, idx in vocab_dict.items():
|
||||||
|
if idx < vocab_size:
|
||||||
|
encoded_vocab[idx] = token
|
||||||
|
stop_token_ids = [self.tokenizer.eos_token_id]
|
||||||
|
backend_str = self.tokenizer.tokenizer.backend_tokenizer.to_str()
|
||||||
|
metadata = xgr.TokenizerInfo._detect_metadata_from_hf(backend_str)
|
||||||
|
tokenizer_info = xgr.TokenizerInfo(
|
||||||
|
encoded_vocab=encoded_vocab,
|
||||||
|
vocab_type=metadata["vocab_type"],
|
||||||
|
vocab_size=vocab_size,
|
||||||
|
stop_token_ids=stop_token_ids,
|
||||||
|
add_prefix_space=metadata["add_prefix_space"],
|
||||||
|
)
|
||||||
else:
|
else:
|
||||||
tokenizer_info = xgr.TokenizerInfo.from_huggingface(
|
tokenizer_info = xgr.TokenizerInfo.from_huggingface(
|
||||||
self.tokenizer,
|
self.tokenizer,
|
||||||
|
|||||||
@ -4981,7 +4981,7 @@ class GPUModelRunner(
|
|||||||
# we need to adjust the cudagraph sizes to be a multiple of the uniform
|
# we need to adjust the cudagraph sizes to be a multiple of the uniform
|
||||||
# decode query length to avoid: https://github.com/vllm-project/vllm/issues/28207
|
# decode query length to avoid: https://github.com/vllm-project/vllm/issues/28207
|
||||||
# temp-fix: https://github.com/vllm-project/vllm/issues/28207#issuecomment-3504004536
|
# temp-fix: https://github.com/vllm-project/vllm/issues/28207#issuecomment-3504004536
|
||||||
# Will be removed in the near future when we have seperate cudagraph capture
|
# Will be removed in the near future when we have separate cudagraph capture
|
||||||
# sizes for decode and mixed prefill-decode.
|
# sizes for decode and mixed prefill-decode.
|
||||||
if (
|
if (
|
||||||
cudagraph_mode.decode_mode() == CUDAGraphMode.FULL
|
cudagraph_mode.decode_mode() == CUDAGraphMode.FULL
|
||||||
|
|||||||
@ -135,7 +135,7 @@ class AttentionGroup:
|
|||||||
kv_cache_spec: KVCacheSpec
|
kv_cache_spec: KVCacheSpec
|
||||||
kv_cache_group_id: int
|
kv_cache_group_id: int
|
||||||
# When ubatching is enabled we will have a metadata builder for each ubatch
|
# When ubatching is enabled we will have a metadata builder for each ubatch
|
||||||
# so that if they use internal persistant buffers for cudagraphs, and they
|
# so that if they use internal persistent buffers for cudagraphs, and they
|
||||||
# won't have to worry about conflicting with the other ubatches.
|
# won't have to worry about conflicting with the other ubatches.
|
||||||
metadata_builders: list[AttentionMetadataBuilder] = field(
|
metadata_builders: list[AttentionMetadataBuilder] = field(
|
||||||
default_factory=lambda: []
|
default_factory=lambda: []
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user