Merge branch 'main' into mlm-full-lora-support

This commit is contained in:
Jee Jee Li 2025-12-21 19:30:22 +08:00 committed by GitHub
commit 8aedddd546
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
34 changed files with 443 additions and 361 deletions

View File

@ -1254,13 +1254,13 @@ steps:
- # the following commands are for the first node, with ip 192.168.10.10 (ray environment already set up)
- VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed'
- NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed'
- python3 ../examples/offline_inference/data_parallel.py --dp-size=2 --tp-size=1 --node-size=2 --node-rank=0 --master-addr=192.168.10.10 --master-port=12345 --enforce-eager --trust-remote-code
- python3 ../examples/offline_inference/data_parallel.py -dp=2 -tp=1 --nnodes=2 --node-rank=0 --master-addr=192.168.10.10 --master-port=12345 --enforce-eager --trust-remote-code
- VLLM_MULTI_NODE=1 pytest -v -s distributed/test_multi_node_assignment.py
- VLLM_MULTI_NODE=1 pytest -v -s distributed/test_pipeline_parallel.py
- # the following commands are for the second node, with ip 192.168.10.11 (ray environment already set up)
- VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed'
- NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed'
- python3 ../examples/offline_inference/data_parallel.py --dp-size=2 --tp-size=1 --node-size=2 --node-rank=1 --master-addr=192.168.10.10 --master-port=12345 --enforce-eager --trust-remote-code
- python3 ../examples/offline_inference/data_parallel.py -dp=2 -tp=1 --nnodes=2 --node-rank=1 --master-addr=192.168.10.10 --master-port=12345 --enforce-eager --trust-remote-code
- label: Distributed Tests (2 GPUs) # 68min
timeout_in_minutes: 90
@ -1508,7 +1508,7 @@ steps:
- "VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/distributed/test_fusions_e2e.py -k 'not Llama-4'"
- VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/distributed/test_sequence_parallel.py
- pytest -v -s tests/distributed/test_context_parallel.py
- HIP_VISIBLE_DEVICES=0,1 VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model Qwen/Qwen1.5-MoE-A2.7B --tp-size=1 --dp-size=2 --max-model-len 2048 --all2all-backend deepep_high_throughput
- HIP_VISIBLE_DEVICES=0,1 VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model=Qwen/Qwen1.5-MoE-A2.7B -tp=1 -dp=2 --max-model-len=2048 --all2all-backend=deepep_high_throughput
- pytest -v -s tests/v1/distributed/test_dbo.py
##### B200 test #####

View File

@ -1109,13 +1109,13 @@ steps:
- # the following commands are for the first node, with ip 192.168.10.10 (ray environment already set up)
- VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed'
- NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed'
- python3 ../examples/offline_inference/data_parallel.py --dp-size=2 --tp-size=1 --node-size=2 --node-rank=0 --master-addr=192.168.10.10 --master-port=12345 --enforce-eager --trust-remote-code
- python3 ../examples/offline_inference/data_parallel.py -dp=2 -tp=1 --nnodes=2 --node-rank=0 --master-addr=192.168.10.10 --master-port=12345 --enforce-eager --trust-remote-code
- VLLM_MULTI_NODE=1 pytest -v -s distributed/test_multi_node_assignment.py
- VLLM_MULTI_NODE=1 pytest -v -s distributed/test_pipeline_parallel.py
- # the following commands are for the second node, with ip 192.168.10.11 (ray environment already set up)
- VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed'
- NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed'
- python3 ../examples/offline_inference/data_parallel.py --dp-size=2 --tp-size=1 --node-size=2 --node-rank=1 --master-addr=192.168.10.10 --master-port=12345 --enforce-eager --trust-remote-code
- python3 ../examples/offline_inference/data_parallel.py -dp=2 -tp=1 --nnodes=2 --node-rank=1 --master-addr=192.168.10.10 --master-port=12345 --enforce-eager --trust-remote-code
- label: Distributed Tests (2 GPUs) # 68min
timeout_in_minutes: 90
@ -1334,7 +1334,7 @@ steps:
- "VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/distributed/test_fusions_e2e.py -k 'not Llama-4'"
- VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/distributed/test_sequence_parallel.py
- pytest -v -s tests/distributed/test_context_parallel.py
- CUDA_VISIBLE_DEVICES=1,2 VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model Qwen/Qwen1.5-MoE-A2.7B --tp-size=1 --dp-size=2 --max-model-len 2048 --all2all-backend deepep_high_throughput
- CUDA_VISIBLE_DEVICES=1,2 VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model=Qwen/Qwen1.5-MoE-A2.7B -tp=1 -dp=2 --max-model-len=2048 --all2all-backend=deepep_high_throughput
- pytest -v -s tests/v1/distributed/test_dbo.py
##### B200 test #####

View File

@ -145,7 +145,7 @@ steps:
- VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/distributed/test_fusions_e2e.py -k 'not Llama-4'
- VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/distributed/test_sequence_parallel.py
- pytest -v -s tests/distributed/test_context_parallel.py
- CUDA_VISIBLE_DEVICES=1,2 VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model Qwen/Qwen1.5-MoE-A2.7B --tp-size=1 --dp-size=2 --max-model-len 2048 --all2all-backend deepep_high_throughput
- CUDA_VISIBLE_DEVICES=1,2 VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model=Qwen/Qwen1.5-MoE-A2.7B -tp=1 -dp=2 --max-model-len=2048 --all2all-backend=deepep_high_throughput
- pytest -v -s tests/v1/distributed/test_dbo.py
- label: Distributed Tests (2 GPUs)(B200)
@ -171,7 +171,7 @@ steps:
- tests/distributed/
- tests/examples/offline_inference/data_parallel.py
commands:
- ./.buildkite/scripts/run-multi-node-test.sh /vllm-workspace/tests 2 2 public.ecr.aws/q9t5s3a7/vllm-ci-postmerge-repo:0bec63fa317e1fbd62e19b0fc31c43c81bf89077 "VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed' && NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed' && python3 ../examples/offline_inference/data_parallel.py --dp-size=2 --tp-size=1 --node-size=2 --node-rank=0 --master-addr=192.168.10.10 --master-port=12345 --enforce-eager --trust-remote-code && VLLM_MULTI_NODE=1 pytest -v -s distributed/test_multi_node_assignment.py && VLLM_MULTI_NODE=1 pytest -v -s distributed/test_pipeline_parallel.py" "VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed' && NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed' && python3 ../examples/offline_inference/data_parallel.py --dp-size=2 --tp-size=1 --node-size=2 --node-rank=1 --master-addr=192.168.10.10 --master-port=12345 --enforce-eager --trust-remote-code"
- ./.buildkite/scripts/run-multi-node-test.sh /vllm-workspace/tests 2 2 public.ecr.aws/q9t5s3a7/vllm-ci-postmerge-repo:0bec63fa317e1fbd62e19b0fc31c43c81bf89077 "VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed' && NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed' && python3 ../examples/offline_inference/data_parallel.py -dp=2 -tp=1 --nnodes=2 --node-rank=0 --master-addr=192.168.10.10 --master-port=12345 --enforce-eager --trust-remote-code && VLLM_MULTI_NODE=1 pytest -v -s distributed/test_multi_node_assignment.py && VLLM_MULTI_NODE=1 pytest -v -s distributed/test_pipeline_parallel.py" "VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed' && NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed' && python3 ../examples/offline_inference/data_parallel.py -dp=2 -tp=1 --nnodes=2 --node-rank=1 --master-addr=192.168.10.10 --master-port=12345 --enforce-eager --trust-remote-code"
- label: Distributed NixlConnector PD accuracy (4 GPUs)
timeout_in_minutes: 30

1
.github/CODEOWNERS vendored
View File

@ -15,6 +15,7 @@
/vllm/lora @jeejeelee
/vllm/reasoning @aarnphm @chaunceyjiang
/vllm/entrypoints @aarnphm @chaunceyjiang
/vllm/tool_parsers @aarnphm @chaunceyjiang
/vllm/compilation @zou3519 @youkaichao @ProExpertProg
/vllm/distributed/kv_transfer @NickLucche @ApostaC
CMakeLists.txt @tlrmchlsmth @LucasWilkinson

View File

@ -5,25 +5,25 @@ Usage:
Single node:
python examples/offline_inference/data_parallel.py \
--model="ibm-research/PowerMoE-3b" \
--dp-size=2 \
--tp-size=2
-dp=2 \
-tp=2
Multi-node:
Node 0 (assume the node has ip of 10.99.48.128):
python examples/offline_inference/data_parallel.py \
--model="ibm-research/PowerMoE-3b" \
--dp-size=2 \
--tp-size=2 \
--node-size=2 \
-dp=2 \
-tp=2 \
--nnodes=2 \
--node-rank=0 \
--master-addr=10.99.48.128 \
--master-port=13345
Node 1:
python examples/offline_inference/data_parallel.py \
--model="ibm-research/PowerMoE-3b" \
--dp-size=2 \
--tp-size=2 \
--node-size=2 \
-dp=2 \
-tp=2 \
--nnodes=2 \
--node-rank=1 \
--master-addr=10.99.48.128 \
--master-port=13345
@ -32,103 +32,40 @@ Multi-node:
import os
from time import sleep
from vllm import LLM, SamplingParams
from vllm import LLM, EngineArgs, SamplingParams
from vllm.platforms import current_platform
from vllm.utils.argparse_utils import FlexibleArgumentParser
from vllm.utils.network_utils import get_open_port
def parse_args():
import argparse
def create_parser():
parser = FlexibleArgumentParser(description="Data Parallel Inference")
parser = argparse.ArgumentParser(description="Data Parallel Inference")
parser.add_argument(
"--model",
type=str,
default="ibm-research/PowerMoE-3b",
help="Model name or path",
)
parser.add_argument("--dp-size", type=int, default=2, help="Data parallel size")
parser.add_argument("--tp-size", type=int, default=2, help="Tensor parallel size")
parser.add_argument(
"--node-size", type=int, default=1, help="Total number of nodes"
)
parser.add_argument(
"--node-rank", type=int, default=0, help="Rank of the current node"
)
parser.add_argument(
"--master-addr", type=str, default="", help="Master node IP address"
)
parser.add_argument("--master-port", type=int, default=0, help="Master node port")
parser.add_argument(
"--enforce-eager", action="store_true", help="Enforce eager mode execution."
)
parser.add_argument(
"--trust-remote-code", action="store_true", help="Trust remote code."
)
parser.add_argument(
"--max-num-seqs",
type=int,
default=64,
help=("Maximum number of sequences to be processed in a single iteration."),
)
parser.add_argument(
"--max-model-len",
type=int,
help=("Maximum number of tokens to be processed in a single iteration."),
# Add all engine args
EngineArgs.add_cli_args(parser)
parser.set_defaults(
model="ibm-research/PowerMoE-3b",
enable_expert_parallel=True,
)
# Add timeout (not in EngineArgs)
parser.add_argument(
"--timeout",
type=int,
default=300,
help=("Number of seconds before unresponsive process is killed."),
help="Number of seconds before unresponsive process is killed.",
)
parser.add_argument(
"--gpu-memory-utilization",
type=float,
default=0.8,
help=("Fraction of GPU memory vLLM is allowed to allocate (0.0, 1.0]."),
)
parser.add_argument(
"--enable-dbo",
action="store_true",
help=("Enable microbatched execution"),
)
parser.add_argument(
"--compilation-config",
type=int,
help=("Compilation optimization (O) mode 0-3."),
)
parser.add_argument(
"--quantization",
type=str,
)
parser.add_argument(
"--disable-expert-parallel",
dest="enable_expert_parallel",
action="store_false",
help="Disable expert parallel (default: enabled).",
)
parser.set_defaults(enable_expert_parallel=True)
return parser.parse_args()
return parser
def main(
model,
dp_size,
local_dp_rank,
global_dp_rank,
dp_master_ip,
dp_master_port,
GPUs_per_dp_rank,
enforce_eager,
enable_expert_parallel,
trust_remote_code,
max_num_seqs,
max_model_len,
compilation_config,
gpu_memory_utilization,
enable_dbo,
quantization,
engine_args,
):
os.environ["VLLM_DP_RANK"] = str(global_dp_rank)
os.environ["VLLM_DP_RANK_LOCAL"] = str(local_dp_rank)
@ -173,19 +110,7 @@ def main(
)
# Create an LLM.
llm = LLM(
model=model,
tensor_parallel_size=GPUs_per_dp_rank,
enforce_eager=enforce_eager,
enable_expert_parallel=enable_expert_parallel,
trust_remote_code=trust_remote_code,
max_num_seqs=max_num_seqs,
max_model_len=max_model_len,
gpu_memory_utilization=gpu_memory_utilization,
enable_dbo=enable_dbo,
quantization=quantization,
compilation_config=compilation_config,
)
llm = LLM(**engine_args)
outputs = llm.generate(prompts, sampling_params)
# Print the outputs.
for i, output in enumerate(outputs):
@ -204,22 +129,29 @@ def main(
if __name__ == "__main__":
args = parse_args()
parser = create_parser()
args = vars(parser.parse_args())
dp_size = args.dp_size
tp_size = args.tp_size
node_size = args.node_size
node_rank = args.node_rank
# Extract DP-specific args
dp_size = args.pop("data_parallel_size")
nnodes = args.get("nnodes", 1)
node_rank = args.get("node_rank", 0)
master_addr = args.get("master_addr", "")
master_port = args.get("master_port", 0)
timeout = args.pop("timeout")
if node_size == 1:
# Remaining args are engine args
engine_args = args
if nnodes == 1:
dp_master_ip = "127.0.0.1"
dp_master_port = get_open_port()
else:
dp_master_ip = args.master_addr
dp_master_port = args.master_port
dp_master_ip = master_addr
dp_master_port = master_port
assert dp_size % node_size == 0, "dp_size should be divisible by node_size"
dp_per_node = dp_size // node_size
assert dp_size % nnodes == 0, "dp_size should be divisible by nnodes"
dp_per_node = dp_size // nnodes
from multiprocessing import Process
@ -235,29 +167,19 @@ if __name__ == "__main__":
proc = Process(
target=main,
args=(
args.model,
dp_size,
local_dp_rank,
global_dp_rank,
dp_master_ip,
dp_master_port,
tp_size,
args.enforce_eager,
args.enable_expert_parallel,
args.trust_remote_code,
args.max_num_seqs,
args.max_model_len,
args.compilation_config,
args.gpu_memory_utilization,
args.enable_dbo,
args.quantization,
engine_args,
),
)
proc.start()
procs.append(proc)
exit_code = 0
for proc in procs:
proc.join(timeout=args.timeout)
proc.join(timeout=timeout)
if proc.exitcode is None:
print(f"Killing process {proc.pid} that didn't stop within 5 minutes.")
proc.kill()

View File

@ -1,6 +1,7 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import importlib
import importlib.util
import json
import time
@ -986,3 +987,23 @@ async def test_function_call_with_previous_input_messages(
assert (
"aquarius" in output_text or "otter" in output_text or "tuesday" in output_text
)
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_chat_truncation_content_not_null(client: OpenAI, model_name: str):
response = await client.chat.completions.create(
model=model_name,
messages=[{"role": "user", "content": "What is the role of AI in medicine?"}],
temperature=0.0,
max_tokens=250,
)
choice = response.choices[0]
assert choice.finish_reason == "length", (
f"Expected finish_reason='length', got {choice.finish_reason}"
)
assert choice.message.content is not None, (
"Content should not be None when truncated"
)
assert len(choice.message.content) > 0, "Content should not be empty"

View File

@ -955,7 +955,6 @@ class TestServingChatWithHarmony:
input_messages,
[
{"role": "system"},
{"role": "developer"},
{"role": "user", "content": messages[0]["content"]},
],
)
@ -983,7 +982,6 @@ class TestServingChatWithHarmony:
input_messages_2,
[
{"role": "system"},
{"role": "developer"},
{"role": "user"},
# The analysis message should be dropped on subsequent inputs because
# of the subsequent assistant message to the final channel.
@ -1043,7 +1041,7 @@ class TestServingChatWithHarmony:
)
# Test the Harmony messages for the second turn's input
req_2 = ChatCompletionRequest(model=MODEL_NAME, messages=messages)
req_2 = ChatCompletionRequest(model=MODEL_NAME, messages=messages, tools=tools)
input_messages_2, _ = serving_chat._make_request_with_harmony(req_2)
verify_harmony_messages(
input_messages_2,
@ -1124,7 +1122,7 @@ class TestServingChatWithHarmony:
)
# Test the Harmony messages for the second turn's input
req_2 = ChatCompletionRequest(model=MODEL_NAME, messages=messages)
req_2 = ChatCompletionRequest(model=MODEL_NAME, messages=messages, tools=tools)
input_messages_2, _ = serving_chat._make_request_with_harmony(req_2)
verify_harmony_messages(
input_messages_2,
@ -1205,7 +1203,7 @@ class TestServingChatWithHarmony:
)
# Test the Harmony messages for the second turn's input
req_2 = ChatCompletionRequest(model=MODEL_NAME, messages=messages)
req_2 = ChatCompletionRequest(model=MODEL_NAME, messages=messages, tools=tools)
input_messages_2, _ = serving_chat._make_request_with_harmony(req_2)
verify_harmony_messages(
input_messages_2,
@ -1255,7 +1253,7 @@ class TestServingChatWithHarmony:
)
# Test the Harmony messages for the third turn's input
req_3 = ChatCompletionRequest(model=MODEL_NAME, messages=messages)
req_3 = ChatCompletionRequest(model=MODEL_NAME, messages=messages, tools=tools)
input_messages_3, _ = serving_chat._make_request_with_harmony(req_3)
verify_harmony_messages(
input_messages_3,
@ -1318,7 +1316,7 @@ class TestServingChatWithHarmony:
)
# Test the Harmony messages for the fourth turn's input
req_4 = ChatCompletionRequest(model=MODEL_NAME, messages=messages)
req_4 = ChatCompletionRequest(model=MODEL_NAME, messages=messages, tools=tools)
input_messages_4, _ = serving_chat._make_request_with_harmony(req_4)
verify_harmony_messages(
input_messages_4,
@ -1374,7 +1372,6 @@ class TestServingChatWithHarmony:
input_messages,
[
{"role": "system"},
{"role": "developer"},
{"role": "user", "content": messages[0]["content"]},
# The reasoning that would have resulted in an analysis message is
# dropped because of a later assistant message to the final channel.
@ -1406,7 +1403,6 @@ class TestServingChatWithHarmony:
input_messages,
[
{"role": "system"},
{"role": "developer"},
{"role": "user", "content": messages[0]["content"]},
{
"role": "assistant",
@ -1436,7 +1432,6 @@ class TestServingChatWithHarmony:
input_messages,
[
{"role": "system"},
{"role": "developer"},
{"role": "user", "content": messages[0]["content"]},
{
"role": "assistant",

View File

@ -127,7 +127,7 @@ def test_routing_strategy_integration(monkeypatch, device):
envs.environment_variables[env_name] = lambda s=strategy: s
# Test the select_experts method
topk_weights, topk_ids, _ = fused_moe.select_experts(
topk_weights, topk_ids = fused_moe.select_experts(
hidden_states=hidden_states,
router_logits=router_logits,
)

View File

@ -71,7 +71,11 @@ class EngineClient(ABC):
truncate_prompt_tokens: int | None = None,
tokenization_kwargs: dict[str, Any] | None = None,
) -> AsyncGenerator[PoolingRequestOutput, None]:
"""Generate outputs for a request from a pooling model."""
"""Generate outputs for a request from a pooling model.
NOTE: truncate_prompt_tokens is deprecated in v0.14.
TODO: Remove this argument in v0.15.
"""
...
@abstractmethod

View File

@ -1828,10 +1828,11 @@ class OpenAIServingChat(OpenAIServing):
messages.append(sys_msg)
# Add developer message.
dev_msg = get_developer_message(
tools=request.tools if should_include_tools else None
)
messages.append(dev_msg)
if request.tools:
dev_msg = get_developer_message(
tools=request.tools if should_include_tools else None
)
messages.append(dev_msg)
# Add user message.
messages.extend(parse_chat_inputs_to_harmony_messages(request.messages))

View File

@ -25,6 +25,9 @@ from vllm.model_executor.layers.fused_moe.unquantized_fused_moe_method import (
UnquantizedFusedMoEMethod,
)
from vllm.model_executor.layers.fused_moe.utils import activation_without_mul
from vllm.model_executor.layers.fused_moe.zero_expert_fused_moe import (
ZeroExpertFusedMoE,
)
from vllm.triton_utils import HAS_TRITON
_config: dict[str, Any] | None = None
@ -54,6 +57,7 @@ __all__ = [
"FusedMoEPrepareAndFinalize",
"RoutingMethodType",
"SharedFusedMoE",
"ZeroExpertFusedMoE",
"activation_without_mul",
"override_config",
"get_config",

View File

@ -92,7 +92,7 @@ class FusedMoEModularMethod(FusedMoEMethodBase, CustomOp):
x: torch.Tensor,
router_logits: torch.Tensor,
) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
topk_weights, topk_ids, zero_expert_result = layer.select_experts(
topk_weights, topk_ids = layer.select_experts(
hidden_states=x,
router_logits=router_logits,
)
@ -110,10 +110,4 @@ class FusedMoEModularMethod(FusedMoEMethodBase, CustomOp):
expert_map=None if self.disable_expert_map else layer.expert_map,
)
if layer.zero_expert_num != 0 and layer.zero_expert_type is not None:
assert not isinstance(result, tuple), (
"Shared + zero experts are mutually exclusive not yet supported"
)
return result, zero_expert_result
else:
return result
return result

View File

@ -32,7 +32,6 @@ from vllm.model_executor.layers.fused_moe.config import (
FusedMoEQuantConfig,
RoutingMethodType,
)
from vllm.model_executor.layers.fused_moe.fused_moe import zero_experts_compute_triton
from vllm.model_executor.layers.fused_moe.rocm_aiter_fused_moe import (
init_aiter_topK_meta_data,
)
@ -350,8 +349,6 @@ class FusedMoE(CustomOp):
num_redundant_experts: int = 0,
has_bias: bool = False,
is_sequence_parallel=False,
zero_expert_num: int | None = 0,
zero_expert_type: str | None = None,
expert_mapping: list[tuple[str, str, int, str]] | None = None,
n_shared_experts: int | None = None,
routing_method_type: int | None = None,
@ -409,8 +406,6 @@ class FusedMoE(CustomOp):
self.global_num_experts = num_experts + num_redundant_experts
self.logical_num_experts = num_experts
self.zero_expert_num = zero_expert_num
self.zero_expert_type = zero_expert_type
# Expert mapping used in self.load_weights
self.expert_mapping = expert_mapping
@ -1525,15 +1520,15 @@ class FusedMoE(CustomOp):
self,
hidden_states: torch.Tensor,
router_logits: torch.Tensor,
) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor | None]:
) -> tuple[torch.Tensor, torch.Tensor]:
"""
Route the input hidden states to the top-k experts based on the
router logits.
Returns:
(topk_weights, topk_ids, zero_expert_result)
(tuple[torch.Tensor, torch.Tensor, torch.Tensor]):
The weights, expert ids, and zero expert computation result.
(topk_weights, topk_ids)
(tuple[torch.Tensor, torch.Tensor]):
The weights and expert ids.
**Compatibility**: When EPLB is not enabled, the returned ids are
equivalent to global logical ids, so should be compatible with
@ -1655,23 +1650,7 @@ class FusedMoE(CustomOp):
assert topk_ids.dtype == indices_type or indices_type is None
# Compute zero expert result if needed
if (
self.zero_expert_num is not None
and self.zero_expert_num > 0
and self.zero_expert_type is not None
and self.global_num_experts is not None
):
zero_expert_result = zero_experts_compute_triton(
expert_indices=topk_ids,
expert_scales=topk_weights,
num_experts=self.global_num_experts,
zero_expert_type=self.zero_expert_type,
hidden_states=hidden_states,
)
else:
zero_expert_result = None
return topk_weights, topk_ids, zero_expert_result
return topk_weights, topk_ids
def must_reduce_shared_expert_outputs(self) -> bool:
"""
@ -1736,14 +1715,7 @@ class FusedMoE(CustomOp):
fused_output = torch.ops.vllm.moe_forward(
hidden_states, router_logits, self.layer_name
)
if self.zero_expert_num is not None and self.zero_expert_num > 0:
assert isinstance(fused_output, tuple)
fused_output, zero_expert_result = fused_output
return (reduce_output(fused_output) + zero_expert_result)[
..., :og_hidden_states
]
else:
return reduce_output(fused_output)[..., :og_hidden_states]
return reduce_output(fused_output)[..., :og_hidden_states]
else:
if current_platform.is_tpu() or current_platform.is_cpu():
# TODO: Once the OOM issue for the TPU backend is resolved, we
@ -1841,13 +1813,6 @@ class FusedMoE(CustomOp):
final_hidden_states,
)
if self.zero_expert_num is not None and self.zero_expert_num > 0:
assert isinstance(final_hidden_states, tuple)
assert self.shared_experts is None
final_hidden_states, zero_expert_result = final_hidden_states
if zero_expert_result is not None:
final_hidden_states += zero_expert_result
if not skip_result_store:
if self.shared_experts is None:
full_fused_final_hidden_states[chunk_start:chunk_end, :].copy_(
@ -2030,9 +1995,6 @@ class FusedMoE(CustomOp):
shared_output,
final_hidden_states,
)
elif self.zero_expert_num is not None and self.zero_expert_num > 0:
assert isinstance(final_hidden_states, tuple)
final_hidden_states, zero_expert_result = final_hidden_states
def combine_output(states: torch.Tensor) -> torch.Tensor:
if do_naive_dispatch_combine:
@ -2051,9 +2013,6 @@ class FusedMoE(CustomOp):
final_hidden_states[0],
combine_output(final_hidden_states[1]),
)
elif self.zero_expert_num is not None and self.zero_expert_num > 0:
assert isinstance(final_hidden_states, torch.Tensor)
return (combine_output(final_hidden_states), zero_expert_result)
else:
return combine_output(final_hidden_states)

View File

@ -295,7 +295,7 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp):
x: torch.Tensor,
router_logits: torch.Tensor,
) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
topk_weights, topk_ids, zero_expert_result = layer.select_experts(
topk_weights, topk_ids = layer.select_experts(
hidden_states=x,
router_logits=router_logits,
)
@ -336,13 +336,7 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp):
expert_map=layer.expert_map,
)
if layer.zero_expert_num != 0 and layer.zero_expert_type is not None:
assert not isinstance(result, tuple), (
"Shared + zero experts are mutually exclusive not yet supported"
)
return result, zero_expert_result
else:
return result
return result
def forward_cpu(
self,

View File

@ -0,0 +1,189 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from contextlib import contextmanager
import torch
from torch import nn
from vllm.model_executor.layers.fused_moe.fused_moe import zero_experts_compute_triton
from vllm.model_executor.layers.fused_moe.layer import FusedMoE
class ZeroExpertFusedMoE(FusedMoE):
"""
A FusedMoE operation that also computes the results of zero experts.
Zero experts perform identity operations (scaled pass-through) instead
of full MLP computations.
This class uses memoization to avoid redundant routing computation:
routing is computed once and reused for both zero expert computation
and the main FusedMoE forward pass.
"""
def __init__(
self,
zero_expert_num: int,
zero_expert_type: str,
router: nn.Module,
**kwargs,
):
# ZeroExpertFusedMoE manages its own custom_routing_function for memoization
assert (
"custom_routing_function" not in kwargs
or kwargs.get("custom_routing_function") is None
), (
"ZeroExpertFusedMoE does not support external custom_routing_function. "
"It manages its own for routing memoization."
)
# Automatically slice router's e_score_correction_bias to only include
# real experts (not zero_experts) for the base FusedMoE.
# The full bias will be used temporarily in forward() for routing.
if hasattr(router, "e_score_correction_bias") and "num_experts" in kwargs:
num_real_experts = kwargs["num_experts"]
router_bias = router.e_score_correction_bias
user_bias = kwargs.get("e_score_correction_bias")
# Use router's bias if:
# 1. User didn't provide bias, or
# 2. User provided full bias (same size as router)
if user_bias is None or user_bias.shape[0] == router_bias.shape[0]:
kwargs["e_score_correction_bias"] = router_bias[:num_real_experts]
# FusedMoE no longer accepts zero_expert_num/zero_expert_type.
# We handle zero experts ourselves in forward().
super().__init__(**kwargs)
# Store the actual zero_expert_num and zero_expert_type for our own use
self._actual_zero_expert_num = zero_expert_num
self._actual_zero_expert_type = zero_expert_type
self._router = router # Full router (includes zero experts)
# Expose zero_expert_num and zero_expert_type as attributes for
# compatibility with quantization methods that check these attributes
self.zero_expert_num = 0
self.zero_expert_type = None
# Memoization state for routing results
self._memoized_topk_weights: torch.Tensor | None = None
self._memoized_topk_ids: torch.Tensor | None = None
# Create custom_routing_function to reuse memoized routing results
def custom_routing_function(hidden_states, gating_output, topk, renormalize):
"""Return memoized `topk_weights` and `topk_ids`."""
if self._memoized_topk_weights is None or self._memoized_topk_ids is None:
raise RuntimeError(
"ZeroExpertFusedMoE: routing results not memoized. "
"Call select_experts first to compute routing."
)
return self._memoized_topk_weights, self._memoized_topk_ids
self.custom_routing_function = custom_routing_function
@contextmanager
def _temporarily_set_attrs(self, **attrs):
"""
Temporarily set attributes using object.__setattr__ and restore them.
This bypasses nn.Module.__setattr__ to avoid Dynamo tracing issues.
When PyTorch Dynamo traces the forward pass, it cannot handle
nn.Module.__setattr__ calls (which include parameter registration logic),
resulting in "Unsupported" errors. Using object.__setattr__ directly
sets the attribute without triggering nn.Module's custom __setattr__,
allowing Dynamo to trace the code successfully.
"""
originals = {key: getattr(self, key) for key in attrs}
try:
for key, value in attrs.items():
object.__setattr__(self, key, value)
yield
finally:
for key, value in originals.items():
object.__setattr__(self, key, value)
def _compute_zero_expert_result(
self,
hidden_states: torch.Tensor,
topk_weights: torch.Tensor,
topk_ids: torch.Tensor,
) -> torch.Tensor | None:
"""Compute zero expert results using pre-computed routing."""
if (
self._actual_zero_expert_num is None
or self._actual_zero_expert_num <= 0
or self._actual_zero_expert_type is None
):
return None
return zero_experts_compute_triton(
expert_indices=topk_ids.clone(),
expert_scales=topk_weights.clone(),
num_experts=self.logical_num_experts,
zero_expert_type=self._actual_zero_expert_type,
hidden_states=hidden_states,
)
def forward(
self,
hidden_states: torch.Tensor,
router_logits: torch.Tensor, # Full logits including zero experts
) -> torch.Tensor:
"""
Forward pass with zero expert support and routing memoization.
Args:
hidden_states: Input hidden states
router_logits: Full router logits (including zero experts)
Returns:
Combined output from real experts and zero experts
"""
# Prepare temporary attribute overrides for routing computation
temp_attrs = {
"custom_routing_function": None, # Disable for first routing
}
if self._router is not None:
temp_attrs["e_score_correction_bias"] = self._router.e_score_correction_bias
# Compute routing with temporary attributes
# Pass full router_logits (including zero experts) so that zero experts
# can be properly identified in topk_ids
with self._temporarily_set_attrs(**temp_attrs):
topk_weights, topk_ids = self.select_experts(
hidden_states=hidden_states,
router_logits=router_logits, # Full logits (includes zero experts)
)
# Compute zero expert result if needed
zero_expert_result = self._compute_zero_expert_result(
hidden_states=hidden_states,
topk_weights=topk_weights,
topk_ids=topk_ids,
)
# Memoize routing results for reuse in super().forward()
self._memoized_topk_weights = topk_weights
self._memoized_topk_ids = topk_ids
# Slice router_logits for real experts only
router_logits_sliced = router_logits[..., : self.logical_num_experts]
# Compute real expert results (will reuse memoized routing via
# custom_routing_function)
# zero_expert_num is already 0, so FusedMoE won't handle zero experts
fused_out = super().forward(
hidden_states=hidden_states,
router_logits=router_logits_sliced,
)
# Combine results
# Both zero_expert_result and fused_out are computed from the same
# hidden_states, so they should be on the same device.
if zero_expert_result is not None:
fused_out = fused_out + zero_expert_result
# Clear memoization after use
self._memoized_topk_weights = None
self._memoized_topk_ids = None
return fused_out

View File

@ -764,7 +764,7 @@ class AWQMarlinMoEMethod(FusedMoEMethodBase):
) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
assert layer.activation == "silu", "Only SiLU activation is supported."
topk_weights, topk_ids, _ = layer.select_experts(
topk_weights, topk_ids = layer.select_experts(
hidden_states=x,
router_logits=router_logits,
)

View File

@ -500,7 +500,7 @@ class BitsAndBytesMoEMethod(FusedMoEMethodBase):
) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
from vllm.model_executor.layers.fused_moe import fused_experts
topk_weights, topk_ids, _ = layer.select_experts(
topk_weights, topk_ids = layer.select_experts(
hidden_states=x,
router_logits=router_logits,
)

View File

@ -574,7 +574,7 @@ class CompressedTensorsW4A4Nvfp4MoEMethod(CompressedTensorsMoEMethod):
e_score_correction_bias=layer.e_score_correction_bias,
)
topk_weights, topk_ids, _ = layer.select_experts(
topk_weights, topk_ids = layer.select_experts(
hidden_states=x,
router_logits=router_logits,
)
@ -1166,7 +1166,7 @@ class CompressedTensorsW8A8Fp8MoEMethod(CompressedTensorsMoEMethod):
x: torch.Tensor,
router_logits: torch.Tensor,
) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
topk_weights, topk_ids, _ = layer.select_experts(
topk_weights, topk_ids = layer.select_experts(
hidden_states=x,
router_logits=router_logits,
)
@ -1403,7 +1403,7 @@ class CompressedTensorsW8A8Int8MoEMethod(CompressedTensorsMoEMethod):
) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
from vllm.model_executor.layers.fused_moe import fused_experts
topk_weights, topk_ids, _ = layer.select_experts(
topk_weights, topk_ids = layer.select_experts(
hidden_states=x,
router_logits=router_logits,
)
@ -1765,7 +1765,7 @@ class CompressedTensorsWNA16MarlinMoEMethod(CompressedTensorsMoEMethod):
f"{layer.activation} not supported for Marlin MoE."
)
topk_weights, topk_ids, _ = layer.select_experts(
topk_weights, topk_ids = layer.select_experts(
hidden_states=x,
router_logits=router_logits,
)
@ -1991,7 +1991,7 @@ class CompressedTensorsWNA16MoEMethod(CompressedTensorsMoEMethod):
) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
from vllm.model_executor.layers.fused_moe import fused_experts
topk_weights, topk_ids, _ = layer.select_experts(
topk_weights, topk_ids = layer.select_experts(
hidden_states=x,
router_logits=router_logits,
)
@ -2607,7 +2607,7 @@ class CompressedTensorsW4A8Fp8MoEMethod(CompressedTensorsMoEMethod):
"EPLB not supported for `CompressedTensorsW4A8Fp8MoEMethod` yet."
)
assert self.moe_quant_config is not None
topk_weights, topk_ids, _ = layer.select_experts(
topk_weights, topk_ids = layer.select_experts(
hidden_states=x,
router_logits=router_logits,
)

View File

@ -142,7 +142,7 @@ class ExpertsInt8MoEMethod(FusedMoEMethodBase):
) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
from vllm.model_executor.layers.fused_moe import fused_experts
topk_weights, topk_ids, _ = layer.select_experts(
topk_weights, topk_ids = layer.select_experts(
hidden_states=x,
router_logits=router_logits,
)

View File

@ -124,11 +124,13 @@ def get_fp8_moe_backend(
block_quant: bool,
moe_parallel_config: FusedMoEParallelConfig,
with_lora_support: bool,
) -> Fp8MoeBackend:
) -> Fp8MoeBackend | None:
"""
Select the primary FP8 MoE backend
Note: Shape-specific fallbacks may still occur at runtime.
"""
if current_platform.is_xpu():
return None
if with_lora_support:
return Fp8MoeBackend.TRITON
# Prefer FlashInfer backends on supported GPUs; allow SM90 and SM100.
@ -292,6 +294,13 @@ class Fp8Config(QuantizationConfig):
return UnquantizedLinearMethod()
return XPUFp8LinearMethod(fp8_config)
elif isinstance(layer, FusedMoE):
if is_layer_skipped(
prefix=prefix,
ignored_layers=self.ignored_layers,
fused_mapping=self.packed_modules_mapping,
):
return UnquantizedFusedMoEMethod(layer.moe_config)
return XPUFp8MoEMethod(fp8_config, layer)
elif isinstance(layer, Attention):
return Fp8KVCacheMethod(self)
@ -1107,7 +1116,8 @@ class Fp8MoEMethod(FusedMoEMethodBase):
routing_tables: tuple[torch.Tensor, torch.Tensor, torch.Tensor] | None = None,
) -> mk.FusedMoEPrepareAndFinalize | None:
if (
self.rocm_aiter_moe_enabled
current_platform.is_xpu()
or self.rocm_aiter_moe_enabled
or self.use_marlin
or self.flashinfer_moe_backend == FlashinferMoeBackend.TENSORRT_LLM
):
@ -1282,13 +1292,11 @@ class Fp8MoEMethod(FusedMoEMethodBase):
apply_router_weight_on_input=layer.apply_router_weight_on_input,
)
select_result = layer.select_experts(
topk_weights, topk_ids = layer.select_experts(
hidden_states=x,
router_logits=router_logits,
)
topk_weights, topk_ids, zero_expert_result = select_result
if self.rocm_aiter_moe_enabled:
from vllm.model_executor.layers.fused_moe.rocm_aiter_fused_moe import ( # noqa: E501
rocm_aiter_fused_experts,
@ -1343,13 +1351,7 @@ class Fp8MoEMethod(FusedMoEMethodBase):
apply_router_weight_on_input=layer.apply_router_weight_on_input,
)
if layer.zero_expert_num != 0 and layer.zero_expert_type is not None:
assert not isinstance(result, tuple), (
"Shared + zero experts are mutually exclusive not yet supported"
)
return result, zero_expert_result
else:
return result
return result
class Fp8OnlineMoEMethod(Fp8MoEMethod):

View File

@ -639,7 +639,7 @@ class GGUFMoEMethod(FusedMoEMethodBase):
"fused GGUF MoE method."
)
topk_weights, topk_ids, _ = layer.select_experts(
topk_weights, topk_ids = layer.select_experts(
hidden_states=x,
router_logits=router_logits,
)

View File

@ -900,7 +900,7 @@ class GPTQMarlinMoEMethod(FusedMoEMethodBase):
) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
assert layer.activation == "silu", "Only SiLU activation is supported."
topk_weights, topk_ids, _ = layer.select_experts(
topk_weights, topk_ids = layer.select_experts(
hidden_states=x,
router_logits=router_logits,
)

View File

@ -6,13 +6,8 @@ from typing import Any, Optional
import torch
from packaging import version
from torch.nn import Module
from torch.nn.parameter import Parameter
from vllm._ipex_ops import ipex_ops as ops
from vllm.model_executor.layers.fused_moe import (
FusedMoEMethodBase,
FusedMoeWeightScaleSupported,
)
from vllm.model_executor.layers.fused_moe.config import FusedMoEQuantConfig
from vllm.model_executor.layers.linear import (
LinearBase,
@ -24,14 +19,14 @@ from vllm.model_executor.layers.quantization import (
QuantizationMethods,
)
from vllm.model_executor.layers.quantization.awq import AWQLinearMethod
from vllm.model_executor.layers.quantization.fp8 import Fp8Config, Fp8LinearMethod
from vllm.model_executor.layers.quantization.fp8 import (
Fp8Config,
Fp8LinearMethod,
Fp8OnlineMoEMethod,
)
from vllm.model_executor.layers.quantization.gptq import GPTQLinearMethod
from vllm.model_executor.layers.quantization.utils.quant_utils import is_layer_skipped
from vllm.model_executor.layers.quantization.utils.w8a8_utils import (
maybe_create_device_identity,
)
from vllm.model_executor.parameter import ModelWeightParameter
from vllm.model_executor.utils import set_weight_attrs
from vllm.model_executor.utils import replace_parameter
from vllm.platforms import current_platform
MIN_IPEX_VERSION = "2.6.0"
@ -309,44 +304,15 @@ class XPUFp8LinearMethod(Fp8LinearMethod):
def __init__(self, quant_config: Fp8Config):
super().__init__(quant_config)
def create_weights(
self,
layer: torch.nn.Module,
input_size_per_partition: int,
output_partition_sizes: list[int],
input_size: int,
output_size: int,
params_dtype: torch.dtype,
**extra_weight_attrs,
):
maybe_create_device_identity()
output_size_per_partition = sum(output_partition_sizes)
weight_loader = extra_weight_attrs.get("weight_loader")
layer.logical_widths = output_partition_sizes
layer.input_size_per_partition = input_size_per_partition
layer.output_size_per_partition = output_size_per_partition
layer.orig_dtype = params_dtype
layer.weight_block_size = None
weight = ModelWeightParameter(
data=torch.empty(
output_size_per_partition,
input_size_per_partition,
dtype=params_dtype,
),
input_dim=1,
output_dim=0,
weight_loader=weight_loader,
)
layer.register_parameter("weight", weight)
def process_weights_after_loading(self, layer: Module) -> None:
if getattr(layer, "_already_called_process_weights_after_loading", False):
return
# If checkpoint not serialized fp8, quantize the weights.
if not self.quant_config.is_checkpoint_fp8_serialized:
qweight, weight_scale = ops.scaled_fp8_quant(layer.weight, scale=None)
# Update the layer with the new values.
layer.weight = Parameter(qweight, requires_grad=False)
layer.weight_scale = Parameter(weight_scale, requires_grad=False)
replace_parameter(layer, "weight", qweight.data)
replace_parameter(layer, "weight_scale", weight_scale.data)
layer.input_scale = None
def apply(
@ -363,69 +329,14 @@ class XPUFp8LinearMethod(Fp8LinearMethod):
return output
class XPUFp8MoEMethod(FusedMoEMethodBase):
class XPUFp8MoEMethod(Fp8OnlineMoEMethod):
def __init__(self, quant_config: Fp8Config, layer: torch.nn.Module):
super().__init__(layer.moe_config)
super().__init__(quant_config, layer)
self.quant_config = quant_config
def create_weights(
self,
layer: Module,
num_experts: int,
hidden_size: int,
intermediate_size_per_partition: int,
params_dtype: torch.dtype,
**extra_weight_attrs,
):
layer.intermediate_size_per_partition = intermediate_size_per_partition
layer.hidden_size = hidden_size
layer.num_experts = num_experts
layer.orig_dtype = params_dtype
layer.weight_block_size = None
# WEIGHTS
w13_weight = torch.nn.Parameter(
torch.empty(
num_experts,
2 * intermediate_size_per_partition,
hidden_size,
dtype=params_dtype,
),
requires_grad=False,
)
layer.register_parameter("w13_weight", w13_weight)
set_weight_attrs(w13_weight, extra_weight_attrs)
w2_weight = torch.nn.Parameter(
torch.empty(
num_experts,
hidden_size,
intermediate_size_per_partition,
dtype=params_dtype,
),
requires_grad=False,
)
layer.register_parameter("w2_weight", w2_weight)
set_weight_attrs(w2_weight, extra_weight_attrs)
# Allocate 2 scales for w1 and w3 respectively.
# They will be combined to a single scale after weight loading.
w13_weight_scale = torch.nn.Parameter(
torch.ones(num_experts, 2, dtype=torch.float32), requires_grad=False
)
w2_weight_scale = torch.nn.Parameter(
torch.ones(num_experts, dtype=torch.float32), requires_grad=False
)
layer.register_parameter("w13_weight_scale", w13_weight_scale)
layer.register_parameter("w2_weight_scale", w2_weight_scale)
extra_weight_attrs.update(
{"quant_method": FusedMoeWeightScaleSupported.TENSOR.value}
)
# INPUT_SCALES
layer.w13_input_scale = None
layer.w2_input_scale = None
def process_weights_after_loading(self, layer: Module) -> None:
if getattr(layer, "_already_called_process_weights_after_loading", False):
return
if not self.quant_config.is_checkpoint_fp8_serialized:
fp8_dtype = current_platform.fp8_dtype()
w13_weight = torch.empty_like(layer.w13_weight.data, dtype=fp8_dtype)
@ -448,8 +359,9 @@ class XPUFp8MoEMethod(FusedMoEMethodBase):
w2_weight[expert, :, :], layer.w2_weight_scale[expert] = (
ops.scaled_fp8_quant(layer.w2_weight.data[expert, :, :])
)
layer.w13_weight = torch.nn.Parameter(w13_weight, requires_grad=False)
layer.w2_weight = torch.nn.Parameter(w2_weight, requires_grad=False)
replace_parameter(layer, "w13_weight", w13_weight)
replace_parameter(layer, "w2_weight", w2_weight)
import intel_extension_for_pytorch as ipex
ep_rank_start = self.moe.ep_rank * self.moe.num_local_experts

View File

@ -796,7 +796,7 @@ class ModelOptFp8MoEMethod(FusedMoEMethodBase):
)
# Expert selection
topk_weights, topk_ids, _ = layer.select_experts(
topk_weights, topk_ids = layer.select_experts(
hidden_states=x,
router_logits=router_logits,
)
@ -1599,7 +1599,7 @@ class ModelOptNvFp4FusedMoE(FusedMoEMethodBase):
x_routing, _ = x
else:
x_routing = x
topk_weights, topk_ids, _ = layer.select_experts(
topk_weights, topk_ids = layer.select_experts(
hidden_states=x_routing,
router_logits=router_logits,
)

View File

@ -370,7 +370,7 @@ class MoeWNA16Method(FusedMoEMethodBase):
from vllm.model_executor.layers.fused_moe import fused_experts
assert layer.activation == "silu", "Only SiLU activation is supported."
topk_weights, topk_ids, _ = layer.select_experts(
topk_weights, topk_ids = layer.select_experts(
hidden_states=x,
router_logits=router_logits,
)

View File

@ -896,7 +896,7 @@ class Mxfp4MoEMethod(FusedMoEMethodBase):
raise NotImplementedError("EPLB is not supported for mxfp4")
if self.mxfp4_backend == Mxfp4Backend.MARLIN:
topk_weights, topk_ids, _ = layer.select_experts(
topk_weights, topk_ids = layer.select_experts(
hidden_states=x,
router_logits=router_logits,
)
@ -990,7 +990,7 @@ class Mxfp4MoEMethod(FusedMoEMethodBase):
):
from vllm.utils.flashinfer import flashinfer_cutlass_fused_moe
topk_weights, topk_ids, _ = layer.select_experts(
topk_weights, topk_ids = layer.select_experts(
hidden_states=x,
router_logits=router_logits,
)

View File

@ -338,7 +338,7 @@ class QuarkW8A8Fp8MoEMethod(QuarkMoEMethod):
x: torch.Tensor,
router_logits: torch.Tensor,
) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
topk_weights, topk_ids, _ = layer.select_experts(
topk_weights, topk_ids = layer.select_experts(
hidden_states=x,
router_logits=router_logits,
)
@ -530,7 +530,7 @@ class QuarkW4A8Fp8MoEMethod(QuarkMoEMethod):
x: torch.Tensor,
router_logits: torch.Tensor,
) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
topk_weights, topk_ids, _ = layer.select_experts(
topk_weights, topk_ids = layer.select_experts(
hidden_states=x,
router_logits=router_logits,
)
@ -738,7 +738,7 @@ class QuarkOCP_MX_MoEMethod(QuarkMoEMethod):
x: torch.Tensor,
router_logits: torch.Tensor,
) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
topk_weights, topk_ids, _ = layer.select_experts(
topk_weights, topk_ids = layer.select_experts(
hidden_states=x,
router_logits=router_logits,
)

View File

@ -359,7 +359,7 @@ class RTNMoEMethod(FusedMoEMethodBase):
x: torch.Tensor,
router_logits: torch.Tensor,
) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
topk_weights, topk_ids, _ = layer.select_experts(
topk_weights, topk_ids = layer.select_experts(
hidden_states=x,
router_logits=router_logits,
)

View File

@ -594,9 +594,15 @@ def apply_awq_marlin_linear(
a_scales = None
if input_dtype == torch.int8:
assert quant_type == scalar_types.uint4, (
"W8A8-INT8 is not supported by marlin kernel."
)
reshaped_x, a_scales = marlin_quant_input(reshaped_x, input_dtype)
a_scales = a_scales * input_global_scale
elif input_dtype == torch.float8_e4m3fn:
assert quant_type == scalar_types.uint4, (
"INT8 weight + FP8 activation is not supported."
)
reshaped_x, a_scales = marlin_quant_input(reshaped_x, input_dtype)
output = ops.gptq_marlin_gemm(
@ -649,9 +655,15 @@ def apply_rtn_marlin_linear(
a_scales = None
if input_dtype == torch.int8:
assert quant_type == scalar_types.uint4b8, (
"W8A8-INT8 is not supported by marlin kernel."
)
reshaped_x, a_scales = marlin_quant_input(reshaped_x, input_dtype)
a_scales = a_scales * input_global_scale
elif input_dtype == torch.float8_e4m3fn:
assert quant_type == scalar_types.uint4b8, (
"INT8 weight + FP8 activation is not supported."
)
reshaped_x, a_scales = marlin_quant_input(reshaped_x, input_dtype)
output = ops.gptq_marlin_gemm(

View File

@ -154,6 +154,12 @@ def prepare_fp4_layer_for_marlin(
)
is_nvfp4 = hasattr(layer, "weight_scale_2")
if input_dtype is not None and input_dtype.itemsize == 1:
if is_nvfp4:
raise RuntimeError("NVFP4 weight + INT8/FP8 activation is not supported.")
elif input_dtype != torch.float8_e4m3fn:
raise RuntimeError("MXFP4 weight + INT8 activation is not supported.")
group_size = 16 if is_nvfp4 else 32
part_size_n = layer.output_size_per_partition
@ -231,6 +237,12 @@ def prepare_moe_fp4_layer_for_marlin(
)
is_nvfp4 = hasattr(layer, "w13_weight_scale_2")
if input_dtype is not None and input_dtype.itemsize == 1:
if is_nvfp4:
raise RuntimeError("NVFP4 weight + INT8/FP8 activation is not supported.")
elif input_dtype != torch.float8_e4m3fn:
raise RuntimeError("MXFP4 weight + INT8 activation is not supported.")
group_size = 16 if is_nvfp4 else 32
e = layer.num_experts

View File

@ -99,6 +99,8 @@ def prepare_fp8_layer_for_marlin(
"be used leveraging the Marlin kernel. This may degrade "
"performance for compute-heavy workloads."
)
if input_dtype is not None and input_dtype.itemsize == 1:
raise RuntimeError("Marlin W8A8 is not supported.")
part_size_n = layer.output_size_per_partition
part_size_k = layer.input_size_per_partition
@ -142,10 +144,20 @@ def prepare_fp8_layer_for_marlin(
# marlin kernel only support channel-wise and group-wise quantization
# we need to convert the scales
if weight_block_size is None:
logical_widths = getattr(layer, "logical_widths", [])
if scales.nelement() == 1:
# tensor-wise quantization -> channel-wise quantization
# (1, 1) =>(repeat)=> (1, size_n)
scales = scales.view(1, 1).repeat_interleave(part_size_n, 1)
elif scales.nelement() == len(logical_widths):
# tensor-wise quantization with logical_widths ->
# channel-wise quantization
assert sum(logical_widths) == part_size_n, (
f"Sum of logical_widths ({sum(logical_widths)}) must be equal "
f"to part_size_n ({part_size_n})"
)
lw_tensor = scales.new_tensor(logical_widths, dtype=torch.int64)
scales = scales.view(1, -1).repeat_interleave(lw_tensor, dim=1)
elif scales.nelement() > 1 and scales.nelement() != part_size_n:
assert part_size_n % scales.nelement() == 0
s_size = scales.nelement()
@ -196,6 +208,8 @@ def prepare_moe_fp8_layer_for_marlin(
"be used leveraging the Marlin kernel. This may degrade "
"performance for compute-heavy workloads."
)
if input_dtype is not None and input_dtype.itemsize == 1:
raise RuntimeError("Marlin W8A8 is not supported.")
e = layer.num_experts
k = layer.hidden_size

View File

@ -46,7 +46,7 @@ from vllm.config import CacheConfig, VllmConfig
from vllm.distributed import get_pp_group
from vllm.logger import init_logger
from vllm.model_executor.layers.activation import SiluAndMul
from vllm.model_executor.layers.fused_moe import FusedMoE
from vllm.model_executor.layers.fused_moe import FusedMoE, ZeroExpertFusedMoE
from vllm.model_executor.layers.layernorm import RMSNorm
from vllm.model_executor.layers.linear import (
MergedColumnParallelLinear,
@ -179,7 +179,7 @@ class FlashConfig(PretrainedConfig):
self.intermediate_size = (
self.ffn_hidden_size
if hasattr(self, "ffn_hidden_size")
else self.intermediate_size
else intermediate_size
)
if hasattr(self, "moe_intermediate_size"):
self.moe_intermediate_size = self.moe_intermediate_size
@ -280,10 +280,6 @@ class LongcatMoe(nn.Module):
):
super().__init__()
self.hidden_size = hidden_size
self.zero_expert_num = config.zero_expert_num
self.zero_expert_type = config.zero_expert_type
self.routed_scaling_factor = config.routed_scaling_factor
self.enable_eplb = enable_eplb
# Gate always runs at half / full precision for now.
self.rounter_params_dtype = params_dtype
if config.router_dtype == "float32":
@ -291,25 +287,27 @@ class LongcatMoe(nn.Module):
self.router = LongcatRouter(
config=config,
zero_expert_num=self.zero_expert_num,
zero_expert_num=config.zero_expert_num,
rounter_params_dtype=self.rounter_params_dtype,
prefix=f"{prefix}.gate",
)
self.experts = FusedMoE(
assert config.zero_expert_num is not None
assert config.zero_expert_type is not None
self.experts = ZeroExpertFusedMoE(
zero_expert_num=config.zero_expert_num,
zero_expert_type=config.zero_expert_type,
router=self.router,
num_experts=num_experts,
top_k=top_k,
hidden_size=hidden_size,
intermediate_size=intermediate_size,
reduce_results=True,
params_dtype=params_dtype,
e_score_correction_bias=self.router.e_score_correction_bias,
renormalize=False,
quant_config=quant_config,
prefix=f"{prefix}.experts",
zero_expert_num=self.zero_expert_num,
zero_expert_type=self.zero_expert_type,
enable_eplb=self.enable_eplb,
enable_eplb=enable_eplb,
routed_scaling_factor=config.routed_scaling_factor,
)
@ -317,11 +315,34 @@ class LongcatMoe(nn.Module):
num_tokens, hidden_dim = hidden_states.shape
hidden_states = hidden_states.view(-1, hidden_dim)
router_logits = self.router(hidden_states.to(self.rounter_params_dtype))
final_hidden_states = self.experts(
hidden_states=hidden_states, router_logits=router_logits
# Align to FusedMoE padded hidden size to avoid dim mismatch
padded_hidden = self.experts.hidden_size
if hidden_dim < padded_hidden:
hidden_states_padded = torch.nn.functional.pad(
hidden_states,
(0, padded_hidden - hidden_dim),
mode="constant",
value=0.0,
)
else:
hidden_states_padded = hidden_states
router_logits_full = self.router(
hidden_states_padded.to(self.rounter_params_dtype)
)
# ZeroExpertFusedMoE handles routing memoization and zero expert computation
# internally. Pass full router_logits (including zero experts) so that
# zero experts can be properly identified in routing.
final_hidden_states = self.experts(
hidden_states=hidden_states_padded,
router_logits=router_logits_full, # Full logits (includes zero experts)
)
# Crop back to original hidden dimension if padded earlier
if padded_hidden != hidden_dim:
final_hidden_states = final_hidden_states[..., :hidden_dim]
return final_hidden_states.view(num_tokens, hidden_dim)
@ -419,6 +440,7 @@ class FlashDecoderLayer(nn.Module):
hidden_states = self.self_attn[0](
positions=positions,
hidden_states=hidden_states,
llama_4_scaling=None,
)
hidden_states, residual = self.post_attention_layernorm[0](
@ -438,6 +460,7 @@ class FlashDecoderLayer(nn.Module):
hidden_states = self.self_attn[1](
positions=positions,
hidden_states=hidden_states,
llama_4_scaling=None,
)
hidden_states, residual = self.post_attention_layernorm[1](
hidden_states, residual

View File

@ -79,6 +79,15 @@ class OpenAIToolParser(ToolParser):
elif msg.channel == "commentary" and not msg.recipient:
commentary_content = msg_text
# Extract partial content from the parser state if the generation was truncated
if parser.current_content:
if parser.current_channel == "final":
final_content = parser.current_content
elif (
parser.current_channel == "commentary" and not parser.current_recipient
):
commentary_content = parser.current_content
return ExtractedToolCallInformation(
tools_called=len(tool_calls) > 0,
tool_calls=tool_calls,

View File

@ -4,6 +4,7 @@ import asyncio
import os
import socket
import time
import warnings
from collections.abc import AsyncGenerator, Iterable, Mapping
from copy import copy
from typing import Any, cast
@ -627,6 +628,9 @@ class AsyncLLM(EngineClient):
The caller of generate() iterates the returned AsyncGenerator,
returning the RequestOutput back to the caller.
NOTE: truncate_prompt_tokens is deprecated in v0.14.
TODO: Remove truncate_prompt_tokens in v0.15.
"""
try:
@ -641,9 +645,19 @@ class AsyncLLM(EngineClient):
if tokenization_kwargs is None:
tokenization_kwargs = {}
if truncate_prompt_tokens is not None:
warnings.warn(
"The `truncate_prompt_tokens` parameter in `AsyncLLM.encode()` "
"is deprecated and will be removed in v0.15. "
"Please use `pooling_params.truncate_prompt_tokens` instead.",
DeprecationWarning,
stacklevel=2,
)
_validate_truncation_size(
self.model_config.max_model_len,
truncate_prompt_tokens,
pooling_params.truncate_prompt_tokens,
tokenization_kwargs,
)