mirror of
https://git.datalinker.icu/vllm-project/vllm.git
synced 2026-03-29 18:17:13 +08:00
Merge branch 'main' into mlm-full-lora-support
This commit is contained in:
commit
8aedddd546
@ -1254,13 +1254,13 @@ steps:
|
||||
- # the following commands are for the first node, with ip 192.168.10.10 (ray environment already set up)
|
||||
- VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed'
|
||||
- NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed'
|
||||
- python3 ../examples/offline_inference/data_parallel.py --dp-size=2 --tp-size=1 --node-size=2 --node-rank=0 --master-addr=192.168.10.10 --master-port=12345 --enforce-eager --trust-remote-code
|
||||
- python3 ../examples/offline_inference/data_parallel.py -dp=2 -tp=1 --nnodes=2 --node-rank=0 --master-addr=192.168.10.10 --master-port=12345 --enforce-eager --trust-remote-code
|
||||
- VLLM_MULTI_NODE=1 pytest -v -s distributed/test_multi_node_assignment.py
|
||||
- VLLM_MULTI_NODE=1 pytest -v -s distributed/test_pipeline_parallel.py
|
||||
- # the following commands are for the second node, with ip 192.168.10.11 (ray environment already set up)
|
||||
- VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed'
|
||||
- NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed'
|
||||
- python3 ../examples/offline_inference/data_parallel.py --dp-size=2 --tp-size=1 --node-size=2 --node-rank=1 --master-addr=192.168.10.10 --master-port=12345 --enforce-eager --trust-remote-code
|
||||
- python3 ../examples/offline_inference/data_parallel.py -dp=2 -tp=1 --nnodes=2 --node-rank=1 --master-addr=192.168.10.10 --master-port=12345 --enforce-eager --trust-remote-code
|
||||
|
||||
- label: Distributed Tests (2 GPUs) # 68min
|
||||
timeout_in_minutes: 90
|
||||
@ -1508,7 +1508,7 @@ steps:
|
||||
- "VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/distributed/test_fusions_e2e.py -k 'not Llama-4'"
|
||||
- VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/distributed/test_sequence_parallel.py
|
||||
- pytest -v -s tests/distributed/test_context_parallel.py
|
||||
- HIP_VISIBLE_DEVICES=0,1 VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model Qwen/Qwen1.5-MoE-A2.7B --tp-size=1 --dp-size=2 --max-model-len 2048 --all2all-backend deepep_high_throughput
|
||||
- HIP_VISIBLE_DEVICES=0,1 VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model=Qwen/Qwen1.5-MoE-A2.7B -tp=1 -dp=2 --max-model-len=2048 --all2all-backend=deepep_high_throughput
|
||||
- pytest -v -s tests/v1/distributed/test_dbo.py
|
||||
|
||||
##### B200 test #####
|
||||
|
||||
@ -1109,13 +1109,13 @@ steps:
|
||||
- # the following commands are for the first node, with ip 192.168.10.10 (ray environment already set up)
|
||||
- VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed'
|
||||
- NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed'
|
||||
- python3 ../examples/offline_inference/data_parallel.py --dp-size=2 --tp-size=1 --node-size=2 --node-rank=0 --master-addr=192.168.10.10 --master-port=12345 --enforce-eager --trust-remote-code
|
||||
- python3 ../examples/offline_inference/data_parallel.py -dp=2 -tp=1 --nnodes=2 --node-rank=0 --master-addr=192.168.10.10 --master-port=12345 --enforce-eager --trust-remote-code
|
||||
- VLLM_MULTI_NODE=1 pytest -v -s distributed/test_multi_node_assignment.py
|
||||
- VLLM_MULTI_NODE=1 pytest -v -s distributed/test_pipeline_parallel.py
|
||||
- # the following commands are for the second node, with ip 192.168.10.11 (ray environment already set up)
|
||||
- VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed'
|
||||
- NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed'
|
||||
- python3 ../examples/offline_inference/data_parallel.py --dp-size=2 --tp-size=1 --node-size=2 --node-rank=1 --master-addr=192.168.10.10 --master-port=12345 --enforce-eager --trust-remote-code
|
||||
- python3 ../examples/offline_inference/data_parallel.py -dp=2 -tp=1 --nnodes=2 --node-rank=1 --master-addr=192.168.10.10 --master-port=12345 --enforce-eager --trust-remote-code
|
||||
|
||||
- label: Distributed Tests (2 GPUs) # 68min
|
||||
timeout_in_minutes: 90
|
||||
@ -1334,7 +1334,7 @@ steps:
|
||||
- "VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/distributed/test_fusions_e2e.py -k 'not Llama-4'"
|
||||
- VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/distributed/test_sequence_parallel.py
|
||||
- pytest -v -s tests/distributed/test_context_parallel.py
|
||||
- CUDA_VISIBLE_DEVICES=1,2 VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model Qwen/Qwen1.5-MoE-A2.7B --tp-size=1 --dp-size=2 --max-model-len 2048 --all2all-backend deepep_high_throughput
|
||||
- CUDA_VISIBLE_DEVICES=1,2 VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model=Qwen/Qwen1.5-MoE-A2.7B -tp=1 -dp=2 --max-model-len=2048 --all2all-backend=deepep_high_throughput
|
||||
- pytest -v -s tests/v1/distributed/test_dbo.py
|
||||
|
||||
##### B200 test #####
|
||||
|
||||
@ -145,7 +145,7 @@ steps:
|
||||
- VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/distributed/test_fusions_e2e.py -k 'not Llama-4'
|
||||
- VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/distributed/test_sequence_parallel.py
|
||||
- pytest -v -s tests/distributed/test_context_parallel.py
|
||||
- CUDA_VISIBLE_DEVICES=1,2 VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model Qwen/Qwen1.5-MoE-A2.7B --tp-size=1 --dp-size=2 --max-model-len 2048 --all2all-backend deepep_high_throughput
|
||||
- CUDA_VISIBLE_DEVICES=1,2 VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model=Qwen/Qwen1.5-MoE-A2.7B -tp=1 -dp=2 --max-model-len=2048 --all2all-backend=deepep_high_throughput
|
||||
- pytest -v -s tests/v1/distributed/test_dbo.py
|
||||
|
||||
- label: Distributed Tests (2 GPUs)(B200)
|
||||
@ -171,7 +171,7 @@ steps:
|
||||
- tests/distributed/
|
||||
- tests/examples/offline_inference/data_parallel.py
|
||||
commands:
|
||||
- ./.buildkite/scripts/run-multi-node-test.sh /vllm-workspace/tests 2 2 public.ecr.aws/q9t5s3a7/vllm-ci-postmerge-repo:0bec63fa317e1fbd62e19b0fc31c43c81bf89077 "VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed' && NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed' && python3 ../examples/offline_inference/data_parallel.py --dp-size=2 --tp-size=1 --node-size=2 --node-rank=0 --master-addr=192.168.10.10 --master-port=12345 --enforce-eager --trust-remote-code && VLLM_MULTI_NODE=1 pytest -v -s distributed/test_multi_node_assignment.py && VLLM_MULTI_NODE=1 pytest -v -s distributed/test_pipeline_parallel.py" "VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed' && NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed' && python3 ../examples/offline_inference/data_parallel.py --dp-size=2 --tp-size=1 --node-size=2 --node-rank=1 --master-addr=192.168.10.10 --master-port=12345 --enforce-eager --trust-remote-code"
|
||||
- ./.buildkite/scripts/run-multi-node-test.sh /vllm-workspace/tests 2 2 public.ecr.aws/q9t5s3a7/vllm-ci-postmerge-repo:0bec63fa317e1fbd62e19b0fc31c43c81bf89077 "VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed' && NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed' && python3 ../examples/offline_inference/data_parallel.py -dp=2 -tp=1 --nnodes=2 --node-rank=0 --master-addr=192.168.10.10 --master-port=12345 --enforce-eager --trust-remote-code && VLLM_MULTI_NODE=1 pytest -v -s distributed/test_multi_node_assignment.py && VLLM_MULTI_NODE=1 pytest -v -s distributed/test_pipeline_parallel.py" "VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed' && NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed' && python3 ../examples/offline_inference/data_parallel.py -dp=2 -tp=1 --nnodes=2 --node-rank=1 --master-addr=192.168.10.10 --master-port=12345 --enforce-eager --trust-remote-code"
|
||||
|
||||
- label: Distributed NixlConnector PD accuracy (4 GPUs)
|
||||
timeout_in_minutes: 30
|
||||
|
||||
1
.github/CODEOWNERS
vendored
1
.github/CODEOWNERS
vendored
@ -15,6 +15,7 @@
|
||||
/vllm/lora @jeejeelee
|
||||
/vllm/reasoning @aarnphm @chaunceyjiang
|
||||
/vllm/entrypoints @aarnphm @chaunceyjiang
|
||||
/vllm/tool_parsers @aarnphm @chaunceyjiang
|
||||
/vllm/compilation @zou3519 @youkaichao @ProExpertProg
|
||||
/vllm/distributed/kv_transfer @NickLucche @ApostaC
|
||||
CMakeLists.txt @tlrmchlsmth @LucasWilkinson
|
||||
|
||||
@ -5,25 +5,25 @@ Usage:
|
||||
Single node:
|
||||
python examples/offline_inference/data_parallel.py \
|
||||
--model="ibm-research/PowerMoE-3b" \
|
||||
--dp-size=2 \
|
||||
--tp-size=2
|
||||
-dp=2 \
|
||||
-tp=2
|
||||
|
||||
Multi-node:
|
||||
Node 0 (assume the node has ip of 10.99.48.128):
|
||||
python examples/offline_inference/data_parallel.py \
|
||||
--model="ibm-research/PowerMoE-3b" \
|
||||
--dp-size=2 \
|
||||
--tp-size=2 \
|
||||
--node-size=2 \
|
||||
-dp=2 \
|
||||
-tp=2 \
|
||||
--nnodes=2 \
|
||||
--node-rank=0 \
|
||||
--master-addr=10.99.48.128 \
|
||||
--master-port=13345
|
||||
Node 1:
|
||||
python examples/offline_inference/data_parallel.py \
|
||||
--model="ibm-research/PowerMoE-3b" \
|
||||
--dp-size=2 \
|
||||
--tp-size=2 \
|
||||
--node-size=2 \
|
||||
-dp=2 \
|
||||
-tp=2 \
|
||||
--nnodes=2 \
|
||||
--node-rank=1 \
|
||||
--master-addr=10.99.48.128 \
|
||||
--master-port=13345
|
||||
@ -32,103 +32,40 @@ Multi-node:
|
||||
import os
|
||||
from time import sleep
|
||||
|
||||
from vllm import LLM, SamplingParams
|
||||
from vllm import LLM, EngineArgs, SamplingParams
|
||||
from vllm.platforms import current_platform
|
||||
from vllm.utils.argparse_utils import FlexibleArgumentParser
|
||||
from vllm.utils.network_utils import get_open_port
|
||||
|
||||
|
||||
def parse_args():
|
||||
import argparse
|
||||
def create_parser():
|
||||
parser = FlexibleArgumentParser(description="Data Parallel Inference")
|
||||
|
||||
parser = argparse.ArgumentParser(description="Data Parallel Inference")
|
||||
parser.add_argument(
|
||||
"--model",
|
||||
type=str,
|
||||
default="ibm-research/PowerMoE-3b",
|
||||
help="Model name or path",
|
||||
)
|
||||
parser.add_argument("--dp-size", type=int, default=2, help="Data parallel size")
|
||||
parser.add_argument("--tp-size", type=int, default=2, help="Tensor parallel size")
|
||||
parser.add_argument(
|
||||
"--node-size", type=int, default=1, help="Total number of nodes"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--node-rank", type=int, default=0, help="Rank of the current node"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--master-addr", type=str, default="", help="Master node IP address"
|
||||
)
|
||||
parser.add_argument("--master-port", type=int, default=0, help="Master node port")
|
||||
parser.add_argument(
|
||||
"--enforce-eager", action="store_true", help="Enforce eager mode execution."
|
||||
)
|
||||
parser.add_argument(
|
||||
"--trust-remote-code", action="store_true", help="Trust remote code."
|
||||
)
|
||||
parser.add_argument(
|
||||
"--max-num-seqs",
|
||||
type=int,
|
||||
default=64,
|
||||
help=("Maximum number of sequences to be processed in a single iteration."),
|
||||
)
|
||||
parser.add_argument(
|
||||
"--max-model-len",
|
||||
type=int,
|
||||
help=("Maximum number of tokens to be processed in a single iteration."),
|
||||
# Add all engine args
|
||||
EngineArgs.add_cli_args(parser)
|
||||
parser.set_defaults(
|
||||
model="ibm-research/PowerMoE-3b",
|
||||
enable_expert_parallel=True,
|
||||
)
|
||||
|
||||
# Add timeout (not in EngineArgs)
|
||||
parser.add_argument(
|
||||
"--timeout",
|
||||
type=int,
|
||||
default=300,
|
||||
help=("Number of seconds before unresponsive process is killed."),
|
||||
help="Number of seconds before unresponsive process is killed.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--gpu-memory-utilization",
|
||||
type=float,
|
||||
default=0.8,
|
||||
help=("Fraction of GPU memory vLLM is allowed to allocate (0.0, 1.0]."),
|
||||
)
|
||||
parser.add_argument(
|
||||
"--enable-dbo",
|
||||
action="store_true",
|
||||
help=("Enable microbatched execution"),
|
||||
)
|
||||
parser.add_argument(
|
||||
"--compilation-config",
|
||||
type=int,
|
||||
help=("Compilation optimization (O) mode 0-3."),
|
||||
)
|
||||
parser.add_argument(
|
||||
"--quantization",
|
||||
type=str,
|
||||
)
|
||||
parser.add_argument(
|
||||
"--disable-expert-parallel",
|
||||
dest="enable_expert_parallel",
|
||||
action="store_false",
|
||||
help="Disable expert parallel (default: enabled).",
|
||||
)
|
||||
parser.set_defaults(enable_expert_parallel=True)
|
||||
return parser.parse_args()
|
||||
|
||||
return parser
|
||||
|
||||
|
||||
def main(
|
||||
model,
|
||||
dp_size,
|
||||
local_dp_rank,
|
||||
global_dp_rank,
|
||||
dp_master_ip,
|
||||
dp_master_port,
|
||||
GPUs_per_dp_rank,
|
||||
enforce_eager,
|
||||
enable_expert_parallel,
|
||||
trust_remote_code,
|
||||
max_num_seqs,
|
||||
max_model_len,
|
||||
compilation_config,
|
||||
gpu_memory_utilization,
|
||||
enable_dbo,
|
||||
quantization,
|
||||
engine_args,
|
||||
):
|
||||
os.environ["VLLM_DP_RANK"] = str(global_dp_rank)
|
||||
os.environ["VLLM_DP_RANK_LOCAL"] = str(local_dp_rank)
|
||||
@ -173,19 +110,7 @@ def main(
|
||||
)
|
||||
|
||||
# Create an LLM.
|
||||
llm = LLM(
|
||||
model=model,
|
||||
tensor_parallel_size=GPUs_per_dp_rank,
|
||||
enforce_eager=enforce_eager,
|
||||
enable_expert_parallel=enable_expert_parallel,
|
||||
trust_remote_code=trust_remote_code,
|
||||
max_num_seqs=max_num_seqs,
|
||||
max_model_len=max_model_len,
|
||||
gpu_memory_utilization=gpu_memory_utilization,
|
||||
enable_dbo=enable_dbo,
|
||||
quantization=quantization,
|
||||
compilation_config=compilation_config,
|
||||
)
|
||||
llm = LLM(**engine_args)
|
||||
outputs = llm.generate(prompts, sampling_params)
|
||||
# Print the outputs.
|
||||
for i, output in enumerate(outputs):
|
||||
@ -204,22 +129,29 @@ def main(
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
args = parse_args()
|
||||
parser = create_parser()
|
||||
args = vars(parser.parse_args())
|
||||
|
||||
dp_size = args.dp_size
|
||||
tp_size = args.tp_size
|
||||
node_size = args.node_size
|
||||
node_rank = args.node_rank
|
||||
# Extract DP-specific args
|
||||
dp_size = args.pop("data_parallel_size")
|
||||
nnodes = args.get("nnodes", 1)
|
||||
node_rank = args.get("node_rank", 0)
|
||||
master_addr = args.get("master_addr", "")
|
||||
master_port = args.get("master_port", 0)
|
||||
timeout = args.pop("timeout")
|
||||
|
||||
if node_size == 1:
|
||||
# Remaining args are engine args
|
||||
engine_args = args
|
||||
|
||||
if nnodes == 1:
|
||||
dp_master_ip = "127.0.0.1"
|
||||
dp_master_port = get_open_port()
|
||||
else:
|
||||
dp_master_ip = args.master_addr
|
||||
dp_master_port = args.master_port
|
||||
dp_master_ip = master_addr
|
||||
dp_master_port = master_port
|
||||
|
||||
assert dp_size % node_size == 0, "dp_size should be divisible by node_size"
|
||||
dp_per_node = dp_size // node_size
|
||||
assert dp_size % nnodes == 0, "dp_size should be divisible by nnodes"
|
||||
dp_per_node = dp_size // nnodes
|
||||
|
||||
from multiprocessing import Process
|
||||
|
||||
@ -235,29 +167,19 @@ if __name__ == "__main__":
|
||||
proc = Process(
|
||||
target=main,
|
||||
args=(
|
||||
args.model,
|
||||
dp_size,
|
||||
local_dp_rank,
|
||||
global_dp_rank,
|
||||
dp_master_ip,
|
||||
dp_master_port,
|
||||
tp_size,
|
||||
args.enforce_eager,
|
||||
args.enable_expert_parallel,
|
||||
args.trust_remote_code,
|
||||
args.max_num_seqs,
|
||||
args.max_model_len,
|
||||
args.compilation_config,
|
||||
args.gpu_memory_utilization,
|
||||
args.enable_dbo,
|
||||
args.quantization,
|
||||
engine_args,
|
||||
),
|
||||
)
|
||||
proc.start()
|
||||
procs.append(proc)
|
||||
exit_code = 0
|
||||
for proc in procs:
|
||||
proc.join(timeout=args.timeout)
|
||||
proc.join(timeout=timeout)
|
||||
if proc.exitcode is None:
|
||||
print(f"Killing process {proc.pid} that didn't stop within 5 minutes.")
|
||||
proc.kill()
|
||||
|
||||
@ -1,6 +1,7 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
import importlib
|
||||
import importlib.util
|
||||
import json
|
||||
import time
|
||||
|
||||
@ -986,3 +987,23 @@ async def test_function_call_with_previous_input_messages(
|
||||
assert (
|
||||
"aquarius" in output_text or "otter" in output_text or "tuesday" in output_text
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
@pytest.mark.parametrize("model_name", [MODEL_NAME])
|
||||
async def test_chat_truncation_content_not_null(client: OpenAI, model_name: str):
|
||||
response = await client.chat.completions.create(
|
||||
model=model_name,
|
||||
messages=[{"role": "user", "content": "What is the role of AI in medicine?"}],
|
||||
temperature=0.0,
|
||||
max_tokens=250,
|
||||
)
|
||||
|
||||
choice = response.choices[0]
|
||||
assert choice.finish_reason == "length", (
|
||||
f"Expected finish_reason='length', got {choice.finish_reason}"
|
||||
)
|
||||
assert choice.message.content is not None, (
|
||||
"Content should not be None when truncated"
|
||||
)
|
||||
assert len(choice.message.content) > 0, "Content should not be empty"
|
||||
|
||||
@ -955,7 +955,6 @@ class TestServingChatWithHarmony:
|
||||
input_messages,
|
||||
[
|
||||
{"role": "system"},
|
||||
{"role": "developer"},
|
||||
{"role": "user", "content": messages[0]["content"]},
|
||||
],
|
||||
)
|
||||
@ -983,7 +982,6 @@ class TestServingChatWithHarmony:
|
||||
input_messages_2,
|
||||
[
|
||||
{"role": "system"},
|
||||
{"role": "developer"},
|
||||
{"role": "user"},
|
||||
# The analysis message should be dropped on subsequent inputs because
|
||||
# of the subsequent assistant message to the final channel.
|
||||
@ -1043,7 +1041,7 @@ class TestServingChatWithHarmony:
|
||||
)
|
||||
|
||||
# Test the Harmony messages for the second turn's input
|
||||
req_2 = ChatCompletionRequest(model=MODEL_NAME, messages=messages)
|
||||
req_2 = ChatCompletionRequest(model=MODEL_NAME, messages=messages, tools=tools)
|
||||
input_messages_2, _ = serving_chat._make_request_with_harmony(req_2)
|
||||
verify_harmony_messages(
|
||||
input_messages_2,
|
||||
@ -1124,7 +1122,7 @@ class TestServingChatWithHarmony:
|
||||
)
|
||||
|
||||
# Test the Harmony messages for the second turn's input
|
||||
req_2 = ChatCompletionRequest(model=MODEL_NAME, messages=messages)
|
||||
req_2 = ChatCompletionRequest(model=MODEL_NAME, messages=messages, tools=tools)
|
||||
input_messages_2, _ = serving_chat._make_request_with_harmony(req_2)
|
||||
verify_harmony_messages(
|
||||
input_messages_2,
|
||||
@ -1205,7 +1203,7 @@ class TestServingChatWithHarmony:
|
||||
)
|
||||
|
||||
# Test the Harmony messages for the second turn's input
|
||||
req_2 = ChatCompletionRequest(model=MODEL_NAME, messages=messages)
|
||||
req_2 = ChatCompletionRequest(model=MODEL_NAME, messages=messages, tools=tools)
|
||||
input_messages_2, _ = serving_chat._make_request_with_harmony(req_2)
|
||||
verify_harmony_messages(
|
||||
input_messages_2,
|
||||
@ -1255,7 +1253,7 @@ class TestServingChatWithHarmony:
|
||||
)
|
||||
|
||||
# Test the Harmony messages for the third turn's input
|
||||
req_3 = ChatCompletionRequest(model=MODEL_NAME, messages=messages)
|
||||
req_3 = ChatCompletionRequest(model=MODEL_NAME, messages=messages, tools=tools)
|
||||
input_messages_3, _ = serving_chat._make_request_with_harmony(req_3)
|
||||
verify_harmony_messages(
|
||||
input_messages_3,
|
||||
@ -1318,7 +1316,7 @@ class TestServingChatWithHarmony:
|
||||
)
|
||||
|
||||
# Test the Harmony messages for the fourth turn's input
|
||||
req_4 = ChatCompletionRequest(model=MODEL_NAME, messages=messages)
|
||||
req_4 = ChatCompletionRequest(model=MODEL_NAME, messages=messages, tools=tools)
|
||||
input_messages_4, _ = serving_chat._make_request_with_harmony(req_4)
|
||||
verify_harmony_messages(
|
||||
input_messages_4,
|
||||
@ -1374,7 +1372,6 @@ class TestServingChatWithHarmony:
|
||||
input_messages,
|
||||
[
|
||||
{"role": "system"},
|
||||
{"role": "developer"},
|
||||
{"role": "user", "content": messages[0]["content"]},
|
||||
# The reasoning that would have resulted in an analysis message is
|
||||
# dropped because of a later assistant message to the final channel.
|
||||
@ -1406,7 +1403,6 @@ class TestServingChatWithHarmony:
|
||||
input_messages,
|
||||
[
|
||||
{"role": "system"},
|
||||
{"role": "developer"},
|
||||
{"role": "user", "content": messages[0]["content"]},
|
||||
{
|
||||
"role": "assistant",
|
||||
@ -1436,7 +1432,6 @@ class TestServingChatWithHarmony:
|
||||
input_messages,
|
||||
[
|
||||
{"role": "system"},
|
||||
{"role": "developer"},
|
||||
{"role": "user", "content": messages[0]["content"]},
|
||||
{
|
||||
"role": "assistant",
|
||||
|
||||
@ -127,7 +127,7 @@ def test_routing_strategy_integration(monkeypatch, device):
|
||||
envs.environment_variables[env_name] = lambda s=strategy: s
|
||||
|
||||
# Test the select_experts method
|
||||
topk_weights, topk_ids, _ = fused_moe.select_experts(
|
||||
topk_weights, topk_ids = fused_moe.select_experts(
|
||||
hidden_states=hidden_states,
|
||||
router_logits=router_logits,
|
||||
)
|
||||
|
||||
@ -71,7 +71,11 @@ class EngineClient(ABC):
|
||||
truncate_prompt_tokens: int | None = None,
|
||||
tokenization_kwargs: dict[str, Any] | None = None,
|
||||
) -> AsyncGenerator[PoolingRequestOutput, None]:
|
||||
"""Generate outputs for a request from a pooling model."""
|
||||
"""Generate outputs for a request from a pooling model.
|
||||
|
||||
NOTE: truncate_prompt_tokens is deprecated in v0.14.
|
||||
TODO: Remove this argument in v0.15.
|
||||
"""
|
||||
...
|
||||
|
||||
@abstractmethod
|
||||
|
||||
@ -1828,10 +1828,11 @@ class OpenAIServingChat(OpenAIServing):
|
||||
messages.append(sys_msg)
|
||||
|
||||
# Add developer message.
|
||||
dev_msg = get_developer_message(
|
||||
tools=request.tools if should_include_tools else None
|
||||
)
|
||||
messages.append(dev_msg)
|
||||
if request.tools:
|
||||
dev_msg = get_developer_message(
|
||||
tools=request.tools if should_include_tools else None
|
||||
)
|
||||
messages.append(dev_msg)
|
||||
|
||||
# Add user message.
|
||||
messages.extend(parse_chat_inputs_to_harmony_messages(request.messages))
|
||||
|
||||
@ -25,6 +25,9 @@ from vllm.model_executor.layers.fused_moe.unquantized_fused_moe_method import (
|
||||
UnquantizedFusedMoEMethod,
|
||||
)
|
||||
from vllm.model_executor.layers.fused_moe.utils import activation_without_mul
|
||||
from vllm.model_executor.layers.fused_moe.zero_expert_fused_moe import (
|
||||
ZeroExpertFusedMoE,
|
||||
)
|
||||
from vllm.triton_utils import HAS_TRITON
|
||||
|
||||
_config: dict[str, Any] | None = None
|
||||
@ -54,6 +57,7 @@ __all__ = [
|
||||
"FusedMoEPrepareAndFinalize",
|
||||
"RoutingMethodType",
|
||||
"SharedFusedMoE",
|
||||
"ZeroExpertFusedMoE",
|
||||
"activation_without_mul",
|
||||
"override_config",
|
||||
"get_config",
|
||||
|
||||
@ -92,7 +92,7 @@ class FusedMoEModularMethod(FusedMoEMethodBase, CustomOp):
|
||||
x: torch.Tensor,
|
||||
router_logits: torch.Tensor,
|
||||
) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
|
||||
topk_weights, topk_ids, zero_expert_result = layer.select_experts(
|
||||
topk_weights, topk_ids = layer.select_experts(
|
||||
hidden_states=x,
|
||||
router_logits=router_logits,
|
||||
)
|
||||
@ -110,10 +110,4 @@ class FusedMoEModularMethod(FusedMoEMethodBase, CustomOp):
|
||||
expert_map=None if self.disable_expert_map else layer.expert_map,
|
||||
)
|
||||
|
||||
if layer.zero_expert_num != 0 and layer.zero_expert_type is not None:
|
||||
assert not isinstance(result, tuple), (
|
||||
"Shared + zero experts are mutually exclusive not yet supported"
|
||||
)
|
||||
return result, zero_expert_result
|
||||
else:
|
||||
return result
|
||||
return result
|
||||
|
||||
@ -32,7 +32,6 @@ from vllm.model_executor.layers.fused_moe.config import (
|
||||
FusedMoEQuantConfig,
|
||||
RoutingMethodType,
|
||||
)
|
||||
from vllm.model_executor.layers.fused_moe.fused_moe import zero_experts_compute_triton
|
||||
from vllm.model_executor.layers.fused_moe.rocm_aiter_fused_moe import (
|
||||
init_aiter_topK_meta_data,
|
||||
)
|
||||
@ -350,8 +349,6 @@ class FusedMoE(CustomOp):
|
||||
num_redundant_experts: int = 0,
|
||||
has_bias: bool = False,
|
||||
is_sequence_parallel=False,
|
||||
zero_expert_num: int | None = 0,
|
||||
zero_expert_type: str | None = None,
|
||||
expert_mapping: list[tuple[str, str, int, str]] | None = None,
|
||||
n_shared_experts: int | None = None,
|
||||
routing_method_type: int | None = None,
|
||||
@ -409,8 +406,6 @@ class FusedMoE(CustomOp):
|
||||
|
||||
self.global_num_experts = num_experts + num_redundant_experts
|
||||
self.logical_num_experts = num_experts
|
||||
self.zero_expert_num = zero_expert_num
|
||||
self.zero_expert_type = zero_expert_type
|
||||
|
||||
# Expert mapping used in self.load_weights
|
||||
self.expert_mapping = expert_mapping
|
||||
@ -1525,15 +1520,15 @@ class FusedMoE(CustomOp):
|
||||
self,
|
||||
hidden_states: torch.Tensor,
|
||||
router_logits: torch.Tensor,
|
||||
) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor | None]:
|
||||
) -> tuple[torch.Tensor, torch.Tensor]:
|
||||
"""
|
||||
Route the input hidden states to the top-k experts based on the
|
||||
router logits.
|
||||
|
||||
Returns:
|
||||
(topk_weights, topk_ids, zero_expert_result)
|
||||
(tuple[torch.Tensor, torch.Tensor, torch.Tensor]):
|
||||
The weights, expert ids, and zero expert computation result.
|
||||
(topk_weights, topk_ids)
|
||||
(tuple[torch.Tensor, torch.Tensor]):
|
||||
The weights and expert ids.
|
||||
|
||||
**Compatibility**: When EPLB is not enabled, the returned ids are
|
||||
equivalent to global logical ids, so should be compatible with
|
||||
@ -1655,23 +1650,7 @@ class FusedMoE(CustomOp):
|
||||
|
||||
assert topk_ids.dtype == indices_type or indices_type is None
|
||||
|
||||
# Compute zero expert result if needed
|
||||
if (
|
||||
self.zero_expert_num is not None
|
||||
and self.zero_expert_num > 0
|
||||
and self.zero_expert_type is not None
|
||||
and self.global_num_experts is not None
|
||||
):
|
||||
zero_expert_result = zero_experts_compute_triton(
|
||||
expert_indices=topk_ids,
|
||||
expert_scales=topk_weights,
|
||||
num_experts=self.global_num_experts,
|
||||
zero_expert_type=self.zero_expert_type,
|
||||
hidden_states=hidden_states,
|
||||
)
|
||||
else:
|
||||
zero_expert_result = None
|
||||
return topk_weights, topk_ids, zero_expert_result
|
||||
return topk_weights, topk_ids
|
||||
|
||||
def must_reduce_shared_expert_outputs(self) -> bool:
|
||||
"""
|
||||
@ -1736,14 +1715,7 @@ class FusedMoE(CustomOp):
|
||||
fused_output = torch.ops.vllm.moe_forward(
|
||||
hidden_states, router_logits, self.layer_name
|
||||
)
|
||||
if self.zero_expert_num is not None and self.zero_expert_num > 0:
|
||||
assert isinstance(fused_output, tuple)
|
||||
fused_output, zero_expert_result = fused_output
|
||||
return (reduce_output(fused_output) + zero_expert_result)[
|
||||
..., :og_hidden_states
|
||||
]
|
||||
else:
|
||||
return reduce_output(fused_output)[..., :og_hidden_states]
|
||||
return reduce_output(fused_output)[..., :og_hidden_states]
|
||||
else:
|
||||
if current_platform.is_tpu() or current_platform.is_cpu():
|
||||
# TODO: Once the OOM issue for the TPU backend is resolved, we
|
||||
@ -1841,13 +1813,6 @@ class FusedMoE(CustomOp):
|
||||
final_hidden_states,
|
||||
)
|
||||
|
||||
if self.zero_expert_num is not None and self.zero_expert_num > 0:
|
||||
assert isinstance(final_hidden_states, tuple)
|
||||
assert self.shared_experts is None
|
||||
final_hidden_states, zero_expert_result = final_hidden_states
|
||||
if zero_expert_result is not None:
|
||||
final_hidden_states += zero_expert_result
|
||||
|
||||
if not skip_result_store:
|
||||
if self.shared_experts is None:
|
||||
full_fused_final_hidden_states[chunk_start:chunk_end, :].copy_(
|
||||
@ -2030,9 +1995,6 @@ class FusedMoE(CustomOp):
|
||||
shared_output,
|
||||
final_hidden_states,
|
||||
)
|
||||
elif self.zero_expert_num is not None and self.zero_expert_num > 0:
|
||||
assert isinstance(final_hidden_states, tuple)
|
||||
final_hidden_states, zero_expert_result = final_hidden_states
|
||||
|
||||
def combine_output(states: torch.Tensor) -> torch.Tensor:
|
||||
if do_naive_dispatch_combine:
|
||||
@ -2051,9 +2013,6 @@ class FusedMoE(CustomOp):
|
||||
final_hidden_states[0],
|
||||
combine_output(final_hidden_states[1]),
|
||||
)
|
||||
elif self.zero_expert_num is not None and self.zero_expert_num > 0:
|
||||
assert isinstance(final_hidden_states, torch.Tensor)
|
||||
return (combine_output(final_hidden_states), zero_expert_result)
|
||||
else:
|
||||
return combine_output(final_hidden_states)
|
||||
|
||||
|
||||
@ -295,7 +295,7 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp):
|
||||
x: torch.Tensor,
|
||||
router_logits: torch.Tensor,
|
||||
) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
|
||||
topk_weights, topk_ids, zero_expert_result = layer.select_experts(
|
||||
topk_weights, topk_ids = layer.select_experts(
|
||||
hidden_states=x,
|
||||
router_logits=router_logits,
|
||||
)
|
||||
@ -336,13 +336,7 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp):
|
||||
expert_map=layer.expert_map,
|
||||
)
|
||||
|
||||
if layer.zero_expert_num != 0 and layer.zero_expert_type is not None:
|
||||
assert not isinstance(result, tuple), (
|
||||
"Shared + zero experts are mutually exclusive not yet supported"
|
||||
)
|
||||
return result, zero_expert_result
|
||||
else:
|
||||
return result
|
||||
return result
|
||||
|
||||
def forward_cpu(
|
||||
self,
|
||||
|
||||
189
vllm/model_executor/layers/fused_moe/zero_expert_fused_moe.py
Normal file
189
vllm/model_executor/layers/fused_moe/zero_expert_fused_moe.py
Normal file
@ -0,0 +1,189 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
from contextlib import contextmanager
|
||||
|
||||
import torch
|
||||
from torch import nn
|
||||
|
||||
from vllm.model_executor.layers.fused_moe.fused_moe import zero_experts_compute_triton
|
||||
from vllm.model_executor.layers.fused_moe.layer import FusedMoE
|
||||
|
||||
|
||||
class ZeroExpertFusedMoE(FusedMoE):
|
||||
"""
|
||||
A FusedMoE operation that also computes the results of zero experts.
|
||||
Zero experts perform identity operations (scaled pass-through) instead
|
||||
of full MLP computations.
|
||||
|
||||
This class uses memoization to avoid redundant routing computation:
|
||||
routing is computed once and reused for both zero expert computation
|
||||
and the main FusedMoE forward pass.
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
zero_expert_num: int,
|
||||
zero_expert_type: str,
|
||||
router: nn.Module,
|
||||
**kwargs,
|
||||
):
|
||||
# ZeroExpertFusedMoE manages its own custom_routing_function for memoization
|
||||
assert (
|
||||
"custom_routing_function" not in kwargs
|
||||
or kwargs.get("custom_routing_function") is None
|
||||
), (
|
||||
"ZeroExpertFusedMoE does not support external custom_routing_function. "
|
||||
"It manages its own for routing memoization."
|
||||
)
|
||||
|
||||
# Automatically slice router's e_score_correction_bias to only include
|
||||
# real experts (not zero_experts) for the base FusedMoE.
|
||||
# The full bias will be used temporarily in forward() for routing.
|
||||
if hasattr(router, "e_score_correction_bias") and "num_experts" in kwargs:
|
||||
num_real_experts = kwargs["num_experts"]
|
||||
router_bias = router.e_score_correction_bias
|
||||
user_bias = kwargs.get("e_score_correction_bias")
|
||||
|
||||
# Use router's bias if:
|
||||
# 1. User didn't provide bias, or
|
||||
# 2. User provided full bias (same size as router)
|
||||
if user_bias is None or user_bias.shape[0] == router_bias.shape[0]:
|
||||
kwargs["e_score_correction_bias"] = router_bias[:num_real_experts]
|
||||
|
||||
# FusedMoE no longer accepts zero_expert_num/zero_expert_type.
|
||||
# We handle zero experts ourselves in forward().
|
||||
super().__init__(**kwargs)
|
||||
# Store the actual zero_expert_num and zero_expert_type for our own use
|
||||
self._actual_zero_expert_num = zero_expert_num
|
||||
self._actual_zero_expert_type = zero_expert_type
|
||||
self._router = router # Full router (includes zero experts)
|
||||
|
||||
# Expose zero_expert_num and zero_expert_type as attributes for
|
||||
# compatibility with quantization methods that check these attributes
|
||||
self.zero_expert_num = 0
|
||||
self.zero_expert_type = None
|
||||
|
||||
# Memoization state for routing results
|
||||
self._memoized_topk_weights: torch.Tensor | None = None
|
||||
self._memoized_topk_ids: torch.Tensor | None = None
|
||||
|
||||
# Create custom_routing_function to reuse memoized routing results
|
||||
def custom_routing_function(hidden_states, gating_output, topk, renormalize):
|
||||
"""Return memoized `topk_weights` and `topk_ids`."""
|
||||
if self._memoized_topk_weights is None or self._memoized_topk_ids is None:
|
||||
raise RuntimeError(
|
||||
"ZeroExpertFusedMoE: routing results not memoized. "
|
||||
"Call select_experts first to compute routing."
|
||||
)
|
||||
return self._memoized_topk_weights, self._memoized_topk_ids
|
||||
|
||||
self.custom_routing_function = custom_routing_function
|
||||
|
||||
@contextmanager
|
||||
def _temporarily_set_attrs(self, **attrs):
|
||||
"""
|
||||
Temporarily set attributes using object.__setattr__ and restore them.
|
||||
|
||||
This bypasses nn.Module.__setattr__ to avoid Dynamo tracing issues.
|
||||
When PyTorch Dynamo traces the forward pass, it cannot handle
|
||||
nn.Module.__setattr__ calls (which include parameter registration logic),
|
||||
resulting in "Unsupported" errors. Using object.__setattr__ directly
|
||||
sets the attribute without triggering nn.Module's custom __setattr__,
|
||||
allowing Dynamo to trace the code successfully.
|
||||
"""
|
||||
originals = {key: getattr(self, key) for key in attrs}
|
||||
try:
|
||||
for key, value in attrs.items():
|
||||
object.__setattr__(self, key, value)
|
||||
yield
|
||||
finally:
|
||||
for key, value in originals.items():
|
||||
object.__setattr__(self, key, value)
|
||||
|
||||
def _compute_zero_expert_result(
|
||||
self,
|
||||
hidden_states: torch.Tensor,
|
||||
topk_weights: torch.Tensor,
|
||||
topk_ids: torch.Tensor,
|
||||
) -> torch.Tensor | None:
|
||||
"""Compute zero expert results using pre-computed routing."""
|
||||
if (
|
||||
self._actual_zero_expert_num is None
|
||||
or self._actual_zero_expert_num <= 0
|
||||
or self._actual_zero_expert_type is None
|
||||
):
|
||||
return None
|
||||
|
||||
return zero_experts_compute_triton(
|
||||
expert_indices=topk_ids.clone(),
|
||||
expert_scales=topk_weights.clone(),
|
||||
num_experts=self.logical_num_experts,
|
||||
zero_expert_type=self._actual_zero_expert_type,
|
||||
hidden_states=hidden_states,
|
||||
)
|
||||
|
||||
def forward(
|
||||
self,
|
||||
hidden_states: torch.Tensor,
|
||||
router_logits: torch.Tensor, # Full logits including zero experts
|
||||
) -> torch.Tensor:
|
||||
"""
|
||||
Forward pass with zero expert support and routing memoization.
|
||||
|
||||
Args:
|
||||
hidden_states: Input hidden states
|
||||
router_logits: Full router logits (including zero experts)
|
||||
|
||||
Returns:
|
||||
Combined output from real experts and zero experts
|
||||
"""
|
||||
# Prepare temporary attribute overrides for routing computation
|
||||
temp_attrs = {
|
||||
"custom_routing_function": None, # Disable for first routing
|
||||
}
|
||||
if self._router is not None:
|
||||
temp_attrs["e_score_correction_bias"] = self._router.e_score_correction_bias
|
||||
|
||||
# Compute routing with temporary attributes
|
||||
# Pass full router_logits (including zero experts) so that zero experts
|
||||
# can be properly identified in topk_ids
|
||||
with self._temporarily_set_attrs(**temp_attrs):
|
||||
topk_weights, topk_ids = self.select_experts(
|
||||
hidden_states=hidden_states,
|
||||
router_logits=router_logits, # Full logits (includes zero experts)
|
||||
)
|
||||
|
||||
# Compute zero expert result if needed
|
||||
zero_expert_result = self._compute_zero_expert_result(
|
||||
hidden_states=hidden_states,
|
||||
topk_weights=topk_weights,
|
||||
topk_ids=topk_ids,
|
||||
)
|
||||
|
||||
# Memoize routing results for reuse in super().forward()
|
||||
self._memoized_topk_weights = topk_weights
|
||||
self._memoized_topk_ids = topk_ids
|
||||
|
||||
# Slice router_logits for real experts only
|
||||
router_logits_sliced = router_logits[..., : self.logical_num_experts]
|
||||
|
||||
# Compute real expert results (will reuse memoized routing via
|
||||
# custom_routing_function)
|
||||
# zero_expert_num is already 0, so FusedMoE won't handle zero experts
|
||||
fused_out = super().forward(
|
||||
hidden_states=hidden_states,
|
||||
router_logits=router_logits_sliced,
|
||||
)
|
||||
|
||||
# Combine results
|
||||
# Both zero_expert_result and fused_out are computed from the same
|
||||
# hidden_states, so they should be on the same device.
|
||||
if zero_expert_result is not None:
|
||||
fused_out = fused_out + zero_expert_result
|
||||
|
||||
# Clear memoization after use
|
||||
self._memoized_topk_weights = None
|
||||
self._memoized_topk_ids = None
|
||||
|
||||
return fused_out
|
||||
@ -764,7 +764,7 @@ class AWQMarlinMoEMethod(FusedMoEMethodBase):
|
||||
) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
|
||||
assert layer.activation == "silu", "Only SiLU activation is supported."
|
||||
|
||||
topk_weights, topk_ids, _ = layer.select_experts(
|
||||
topk_weights, topk_ids = layer.select_experts(
|
||||
hidden_states=x,
|
||||
router_logits=router_logits,
|
||||
)
|
||||
|
||||
@ -500,7 +500,7 @@ class BitsAndBytesMoEMethod(FusedMoEMethodBase):
|
||||
) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
|
||||
from vllm.model_executor.layers.fused_moe import fused_experts
|
||||
|
||||
topk_weights, topk_ids, _ = layer.select_experts(
|
||||
topk_weights, topk_ids = layer.select_experts(
|
||||
hidden_states=x,
|
||||
router_logits=router_logits,
|
||||
)
|
||||
|
||||
@ -574,7 +574,7 @@ class CompressedTensorsW4A4Nvfp4MoEMethod(CompressedTensorsMoEMethod):
|
||||
e_score_correction_bias=layer.e_score_correction_bias,
|
||||
)
|
||||
|
||||
topk_weights, topk_ids, _ = layer.select_experts(
|
||||
topk_weights, topk_ids = layer.select_experts(
|
||||
hidden_states=x,
|
||||
router_logits=router_logits,
|
||||
)
|
||||
@ -1166,7 +1166,7 @@ class CompressedTensorsW8A8Fp8MoEMethod(CompressedTensorsMoEMethod):
|
||||
x: torch.Tensor,
|
||||
router_logits: torch.Tensor,
|
||||
) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
|
||||
topk_weights, topk_ids, _ = layer.select_experts(
|
||||
topk_weights, topk_ids = layer.select_experts(
|
||||
hidden_states=x,
|
||||
router_logits=router_logits,
|
||||
)
|
||||
@ -1403,7 +1403,7 @@ class CompressedTensorsW8A8Int8MoEMethod(CompressedTensorsMoEMethod):
|
||||
) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
|
||||
from vllm.model_executor.layers.fused_moe import fused_experts
|
||||
|
||||
topk_weights, topk_ids, _ = layer.select_experts(
|
||||
topk_weights, topk_ids = layer.select_experts(
|
||||
hidden_states=x,
|
||||
router_logits=router_logits,
|
||||
)
|
||||
@ -1765,7 +1765,7 @@ class CompressedTensorsWNA16MarlinMoEMethod(CompressedTensorsMoEMethod):
|
||||
f"{layer.activation} not supported for Marlin MoE."
|
||||
)
|
||||
|
||||
topk_weights, topk_ids, _ = layer.select_experts(
|
||||
topk_weights, topk_ids = layer.select_experts(
|
||||
hidden_states=x,
|
||||
router_logits=router_logits,
|
||||
)
|
||||
@ -1991,7 +1991,7 @@ class CompressedTensorsWNA16MoEMethod(CompressedTensorsMoEMethod):
|
||||
) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
|
||||
from vllm.model_executor.layers.fused_moe import fused_experts
|
||||
|
||||
topk_weights, topk_ids, _ = layer.select_experts(
|
||||
topk_weights, topk_ids = layer.select_experts(
|
||||
hidden_states=x,
|
||||
router_logits=router_logits,
|
||||
)
|
||||
@ -2607,7 +2607,7 @@ class CompressedTensorsW4A8Fp8MoEMethod(CompressedTensorsMoEMethod):
|
||||
"EPLB not supported for `CompressedTensorsW4A8Fp8MoEMethod` yet."
|
||||
)
|
||||
assert self.moe_quant_config is not None
|
||||
topk_weights, topk_ids, _ = layer.select_experts(
|
||||
topk_weights, topk_ids = layer.select_experts(
|
||||
hidden_states=x,
|
||||
router_logits=router_logits,
|
||||
)
|
||||
|
||||
@ -142,7 +142,7 @@ class ExpertsInt8MoEMethod(FusedMoEMethodBase):
|
||||
) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
|
||||
from vllm.model_executor.layers.fused_moe import fused_experts
|
||||
|
||||
topk_weights, topk_ids, _ = layer.select_experts(
|
||||
topk_weights, topk_ids = layer.select_experts(
|
||||
hidden_states=x,
|
||||
router_logits=router_logits,
|
||||
)
|
||||
|
||||
@ -124,11 +124,13 @@ def get_fp8_moe_backend(
|
||||
block_quant: bool,
|
||||
moe_parallel_config: FusedMoEParallelConfig,
|
||||
with_lora_support: bool,
|
||||
) -> Fp8MoeBackend:
|
||||
) -> Fp8MoeBackend | None:
|
||||
"""
|
||||
Select the primary FP8 MoE backend
|
||||
Note: Shape-specific fallbacks may still occur at runtime.
|
||||
"""
|
||||
if current_platform.is_xpu():
|
||||
return None
|
||||
if with_lora_support:
|
||||
return Fp8MoeBackend.TRITON
|
||||
# Prefer FlashInfer backends on supported GPUs; allow SM90 and SM100.
|
||||
@ -292,6 +294,13 @@ class Fp8Config(QuantizationConfig):
|
||||
return UnquantizedLinearMethod()
|
||||
return XPUFp8LinearMethod(fp8_config)
|
||||
elif isinstance(layer, FusedMoE):
|
||||
if is_layer_skipped(
|
||||
prefix=prefix,
|
||||
ignored_layers=self.ignored_layers,
|
||||
fused_mapping=self.packed_modules_mapping,
|
||||
):
|
||||
return UnquantizedFusedMoEMethod(layer.moe_config)
|
||||
|
||||
return XPUFp8MoEMethod(fp8_config, layer)
|
||||
elif isinstance(layer, Attention):
|
||||
return Fp8KVCacheMethod(self)
|
||||
@ -1107,7 +1116,8 @@ class Fp8MoEMethod(FusedMoEMethodBase):
|
||||
routing_tables: tuple[torch.Tensor, torch.Tensor, torch.Tensor] | None = None,
|
||||
) -> mk.FusedMoEPrepareAndFinalize | None:
|
||||
if (
|
||||
self.rocm_aiter_moe_enabled
|
||||
current_platform.is_xpu()
|
||||
or self.rocm_aiter_moe_enabled
|
||||
or self.use_marlin
|
||||
or self.flashinfer_moe_backend == FlashinferMoeBackend.TENSORRT_LLM
|
||||
):
|
||||
@ -1282,13 +1292,11 @@ class Fp8MoEMethod(FusedMoEMethodBase):
|
||||
apply_router_weight_on_input=layer.apply_router_weight_on_input,
|
||||
)
|
||||
|
||||
select_result = layer.select_experts(
|
||||
topk_weights, topk_ids = layer.select_experts(
|
||||
hidden_states=x,
|
||||
router_logits=router_logits,
|
||||
)
|
||||
|
||||
topk_weights, topk_ids, zero_expert_result = select_result
|
||||
|
||||
if self.rocm_aiter_moe_enabled:
|
||||
from vllm.model_executor.layers.fused_moe.rocm_aiter_fused_moe import ( # noqa: E501
|
||||
rocm_aiter_fused_experts,
|
||||
@ -1343,13 +1351,7 @@ class Fp8MoEMethod(FusedMoEMethodBase):
|
||||
apply_router_weight_on_input=layer.apply_router_weight_on_input,
|
||||
)
|
||||
|
||||
if layer.zero_expert_num != 0 and layer.zero_expert_type is not None:
|
||||
assert not isinstance(result, tuple), (
|
||||
"Shared + zero experts are mutually exclusive not yet supported"
|
||||
)
|
||||
return result, zero_expert_result
|
||||
else:
|
||||
return result
|
||||
return result
|
||||
|
||||
|
||||
class Fp8OnlineMoEMethod(Fp8MoEMethod):
|
||||
|
||||
@ -639,7 +639,7 @@ class GGUFMoEMethod(FusedMoEMethodBase):
|
||||
"fused GGUF MoE method."
|
||||
)
|
||||
|
||||
topk_weights, topk_ids, _ = layer.select_experts(
|
||||
topk_weights, topk_ids = layer.select_experts(
|
||||
hidden_states=x,
|
||||
router_logits=router_logits,
|
||||
)
|
||||
|
||||
@ -900,7 +900,7 @@ class GPTQMarlinMoEMethod(FusedMoEMethodBase):
|
||||
) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
|
||||
assert layer.activation == "silu", "Only SiLU activation is supported."
|
||||
|
||||
topk_weights, topk_ids, _ = layer.select_experts(
|
||||
topk_weights, topk_ids = layer.select_experts(
|
||||
hidden_states=x,
|
||||
router_logits=router_logits,
|
||||
)
|
||||
|
||||
@ -6,13 +6,8 @@ from typing import Any, Optional
|
||||
import torch
|
||||
from packaging import version
|
||||
from torch.nn import Module
|
||||
from torch.nn.parameter import Parameter
|
||||
|
||||
from vllm._ipex_ops import ipex_ops as ops
|
||||
from vllm.model_executor.layers.fused_moe import (
|
||||
FusedMoEMethodBase,
|
||||
FusedMoeWeightScaleSupported,
|
||||
)
|
||||
from vllm.model_executor.layers.fused_moe.config import FusedMoEQuantConfig
|
||||
from vllm.model_executor.layers.linear import (
|
||||
LinearBase,
|
||||
@ -24,14 +19,14 @@ from vllm.model_executor.layers.quantization import (
|
||||
QuantizationMethods,
|
||||
)
|
||||
from vllm.model_executor.layers.quantization.awq import AWQLinearMethod
|
||||
from vllm.model_executor.layers.quantization.fp8 import Fp8Config, Fp8LinearMethod
|
||||
from vllm.model_executor.layers.quantization.fp8 import (
|
||||
Fp8Config,
|
||||
Fp8LinearMethod,
|
||||
Fp8OnlineMoEMethod,
|
||||
)
|
||||
from vllm.model_executor.layers.quantization.gptq import GPTQLinearMethod
|
||||
from vllm.model_executor.layers.quantization.utils.quant_utils import is_layer_skipped
|
||||
from vllm.model_executor.layers.quantization.utils.w8a8_utils import (
|
||||
maybe_create_device_identity,
|
||||
)
|
||||
from vllm.model_executor.parameter import ModelWeightParameter
|
||||
from vllm.model_executor.utils import set_weight_attrs
|
||||
from vllm.model_executor.utils import replace_parameter
|
||||
from vllm.platforms import current_platform
|
||||
|
||||
MIN_IPEX_VERSION = "2.6.0"
|
||||
@ -309,44 +304,15 @@ class XPUFp8LinearMethod(Fp8LinearMethod):
|
||||
def __init__(self, quant_config: Fp8Config):
|
||||
super().__init__(quant_config)
|
||||
|
||||
def create_weights(
|
||||
self,
|
||||
layer: torch.nn.Module,
|
||||
input_size_per_partition: int,
|
||||
output_partition_sizes: list[int],
|
||||
input_size: int,
|
||||
output_size: int,
|
||||
params_dtype: torch.dtype,
|
||||
**extra_weight_attrs,
|
||||
):
|
||||
maybe_create_device_identity()
|
||||
|
||||
output_size_per_partition = sum(output_partition_sizes)
|
||||
weight_loader = extra_weight_attrs.get("weight_loader")
|
||||
layer.logical_widths = output_partition_sizes
|
||||
layer.input_size_per_partition = input_size_per_partition
|
||||
layer.output_size_per_partition = output_size_per_partition
|
||||
layer.orig_dtype = params_dtype
|
||||
layer.weight_block_size = None
|
||||
weight = ModelWeightParameter(
|
||||
data=torch.empty(
|
||||
output_size_per_partition,
|
||||
input_size_per_partition,
|
||||
dtype=params_dtype,
|
||||
),
|
||||
input_dim=1,
|
||||
output_dim=0,
|
||||
weight_loader=weight_loader,
|
||||
)
|
||||
layer.register_parameter("weight", weight)
|
||||
|
||||
def process_weights_after_loading(self, layer: Module) -> None:
|
||||
if getattr(layer, "_already_called_process_weights_after_loading", False):
|
||||
return
|
||||
# If checkpoint not serialized fp8, quantize the weights.
|
||||
if not self.quant_config.is_checkpoint_fp8_serialized:
|
||||
qweight, weight_scale = ops.scaled_fp8_quant(layer.weight, scale=None)
|
||||
# Update the layer with the new values.
|
||||
layer.weight = Parameter(qweight, requires_grad=False)
|
||||
layer.weight_scale = Parameter(weight_scale, requires_grad=False)
|
||||
replace_parameter(layer, "weight", qweight.data)
|
||||
replace_parameter(layer, "weight_scale", weight_scale.data)
|
||||
layer.input_scale = None
|
||||
|
||||
def apply(
|
||||
@ -363,69 +329,14 @@ class XPUFp8LinearMethod(Fp8LinearMethod):
|
||||
return output
|
||||
|
||||
|
||||
class XPUFp8MoEMethod(FusedMoEMethodBase):
|
||||
class XPUFp8MoEMethod(Fp8OnlineMoEMethod):
|
||||
def __init__(self, quant_config: Fp8Config, layer: torch.nn.Module):
|
||||
super().__init__(layer.moe_config)
|
||||
super().__init__(quant_config, layer)
|
||||
self.quant_config = quant_config
|
||||
|
||||
def create_weights(
|
||||
self,
|
||||
layer: Module,
|
||||
num_experts: int,
|
||||
hidden_size: int,
|
||||
intermediate_size_per_partition: int,
|
||||
params_dtype: torch.dtype,
|
||||
**extra_weight_attrs,
|
||||
):
|
||||
layer.intermediate_size_per_partition = intermediate_size_per_partition
|
||||
layer.hidden_size = hidden_size
|
||||
layer.num_experts = num_experts
|
||||
layer.orig_dtype = params_dtype
|
||||
layer.weight_block_size = None
|
||||
# WEIGHTS
|
||||
w13_weight = torch.nn.Parameter(
|
||||
torch.empty(
|
||||
num_experts,
|
||||
2 * intermediate_size_per_partition,
|
||||
hidden_size,
|
||||
dtype=params_dtype,
|
||||
),
|
||||
requires_grad=False,
|
||||
)
|
||||
layer.register_parameter("w13_weight", w13_weight)
|
||||
set_weight_attrs(w13_weight, extra_weight_attrs)
|
||||
|
||||
w2_weight = torch.nn.Parameter(
|
||||
torch.empty(
|
||||
num_experts,
|
||||
hidden_size,
|
||||
intermediate_size_per_partition,
|
||||
dtype=params_dtype,
|
||||
),
|
||||
requires_grad=False,
|
||||
)
|
||||
layer.register_parameter("w2_weight", w2_weight)
|
||||
set_weight_attrs(w2_weight, extra_weight_attrs)
|
||||
|
||||
# Allocate 2 scales for w1 and w3 respectively.
|
||||
# They will be combined to a single scale after weight loading.
|
||||
w13_weight_scale = torch.nn.Parameter(
|
||||
torch.ones(num_experts, 2, dtype=torch.float32), requires_grad=False
|
||||
)
|
||||
w2_weight_scale = torch.nn.Parameter(
|
||||
torch.ones(num_experts, dtype=torch.float32), requires_grad=False
|
||||
)
|
||||
layer.register_parameter("w13_weight_scale", w13_weight_scale)
|
||||
layer.register_parameter("w2_weight_scale", w2_weight_scale)
|
||||
|
||||
extra_weight_attrs.update(
|
||||
{"quant_method": FusedMoeWeightScaleSupported.TENSOR.value}
|
||||
)
|
||||
# INPUT_SCALES
|
||||
layer.w13_input_scale = None
|
||||
layer.w2_input_scale = None
|
||||
|
||||
def process_weights_after_loading(self, layer: Module) -> None:
|
||||
if getattr(layer, "_already_called_process_weights_after_loading", False):
|
||||
return
|
||||
if not self.quant_config.is_checkpoint_fp8_serialized:
|
||||
fp8_dtype = current_platform.fp8_dtype()
|
||||
w13_weight = torch.empty_like(layer.w13_weight.data, dtype=fp8_dtype)
|
||||
@ -448,8 +359,9 @@ class XPUFp8MoEMethod(FusedMoEMethodBase):
|
||||
w2_weight[expert, :, :], layer.w2_weight_scale[expert] = (
|
||||
ops.scaled_fp8_quant(layer.w2_weight.data[expert, :, :])
|
||||
)
|
||||
layer.w13_weight = torch.nn.Parameter(w13_weight, requires_grad=False)
|
||||
layer.w2_weight = torch.nn.Parameter(w2_weight, requires_grad=False)
|
||||
replace_parameter(layer, "w13_weight", w13_weight)
|
||||
replace_parameter(layer, "w2_weight", w2_weight)
|
||||
|
||||
import intel_extension_for_pytorch as ipex
|
||||
|
||||
ep_rank_start = self.moe.ep_rank * self.moe.num_local_experts
|
||||
|
||||
@ -796,7 +796,7 @@ class ModelOptFp8MoEMethod(FusedMoEMethodBase):
|
||||
)
|
||||
|
||||
# Expert selection
|
||||
topk_weights, topk_ids, _ = layer.select_experts(
|
||||
topk_weights, topk_ids = layer.select_experts(
|
||||
hidden_states=x,
|
||||
router_logits=router_logits,
|
||||
)
|
||||
@ -1599,7 +1599,7 @@ class ModelOptNvFp4FusedMoE(FusedMoEMethodBase):
|
||||
x_routing, _ = x
|
||||
else:
|
||||
x_routing = x
|
||||
topk_weights, topk_ids, _ = layer.select_experts(
|
||||
topk_weights, topk_ids = layer.select_experts(
|
||||
hidden_states=x_routing,
|
||||
router_logits=router_logits,
|
||||
)
|
||||
|
||||
@ -370,7 +370,7 @@ class MoeWNA16Method(FusedMoEMethodBase):
|
||||
from vllm.model_executor.layers.fused_moe import fused_experts
|
||||
|
||||
assert layer.activation == "silu", "Only SiLU activation is supported."
|
||||
topk_weights, topk_ids, _ = layer.select_experts(
|
||||
topk_weights, topk_ids = layer.select_experts(
|
||||
hidden_states=x,
|
||||
router_logits=router_logits,
|
||||
)
|
||||
|
||||
@ -896,7 +896,7 @@ class Mxfp4MoEMethod(FusedMoEMethodBase):
|
||||
raise NotImplementedError("EPLB is not supported for mxfp4")
|
||||
|
||||
if self.mxfp4_backend == Mxfp4Backend.MARLIN:
|
||||
topk_weights, topk_ids, _ = layer.select_experts(
|
||||
topk_weights, topk_ids = layer.select_experts(
|
||||
hidden_states=x,
|
||||
router_logits=router_logits,
|
||||
)
|
||||
@ -990,7 +990,7 @@ class Mxfp4MoEMethod(FusedMoEMethodBase):
|
||||
):
|
||||
from vllm.utils.flashinfer import flashinfer_cutlass_fused_moe
|
||||
|
||||
topk_weights, topk_ids, _ = layer.select_experts(
|
||||
topk_weights, topk_ids = layer.select_experts(
|
||||
hidden_states=x,
|
||||
router_logits=router_logits,
|
||||
)
|
||||
|
||||
@ -338,7 +338,7 @@ class QuarkW8A8Fp8MoEMethod(QuarkMoEMethod):
|
||||
x: torch.Tensor,
|
||||
router_logits: torch.Tensor,
|
||||
) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
|
||||
topk_weights, topk_ids, _ = layer.select_experts(
|
||||
topk_weights, topk_ids = layer.select_experts(
|
||||
hidden_states=x,
|
||||
router_logits=router_logits,
|
||||
)
|
||||
@ -530,7 +530,7 @@ class QuarkW4A8Fp8MoEMethod(QuarkMoEMethod):
|
||||
x: torch.Tensor,
|
||||
router_logits: torch.Tensor,
|
||||
) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
|
||||
topk_weights, topk_ids, _ = layer.select_experts(
|
||||
topk_weights, topk_ids = layer.select_experts(
|
||||
hidden_states=x,
|
||||
router_logits=router_logits,
|
||||
)
|
||||
@ -738,7 +738,7 @@ class QuarkOCP_MX_MoEMethod(QuarkMoEMethod):
|
||||
x: torch.Tensor,
|
||||
router_logits: torch.Tensor,
|
||||
) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
|
||||
topk_weights, topk_ids, _ = layer.select_experts(
|
||||
topk_weights, topk_ids = layer.select_experts(
|
||||
hidden_states=x,
|
||||
router_logits=router_logits,
|
||||
)
|
||||
|
||||
@ -359,7 +359,7 @@ class RTNMoEMethod(FusedMoEMethodBase):
|
||||
x: torch.Tensor,
|
||||
router_logits: torch.Tensor,
|
||||
) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
|
||||
topk_weights, topk_ids, _ = layer.select_experts(
|
||||
topk_weights, topk_ids = layer.select_experts(
|
||||
hidden_states=x,
|
||||
router_logits=router_logits,
|
||||
)
|
||||
|
||||
@ -594,9 +594,15 @@ def apply_awq_marlin_linear(
|
||||
|
||||
a_scales = None
|
||||
if input_dtype == torch.int8:
|
||||
assert quant_type == scalar_types.uint4, (
|
||||
"W8A8-INT8 is not supported by marlin kernel."
|
||||
)
|
||||
reshaped_x, a_scales = marlin_quant_input(reshaped_x, input_dtype)
|
||||
a_scales = a_scales * input_global_scale
|
||||
elif input_dtype == torch.float8_e4m3fn:
|
||||
assert quant_type == scalar_types.uint4, (
|
||||
"INT8 weight + FP8 activation is not supported."
|
||||
)
|
||||
reshaped_x, a_scales = marlin_quant_input(reshaped_x, input_dtype)
|
||||
|
||||
output = ops.gptq_marlin_gemm(
|
||||
@ -649,9 +655,15 @@ def apply_rtn_marlin_linear(
|
||||
|
||||
a_scales = None
|
||||
if input_dtype == torch.int8:
|
||||
assert quant_type == scalar_types.uint4b8, (
|
||||
"W8A8-INT8 is not supported by marlin kernel."
|
||||
)
|
||||
reshaped_x, a_scales = marlin_quant_input(reshaped_x, input_dtype)
|
||||
a_scales = a_scales * input_global_scale
|
||||
elif input_dtype == torch.float8_e4m3fn:
|
||||
assert quant_type == scalar_types.uint4b8, (
|
||||
"INT8 weight + FP8 activation is not supported."
|
||||
)
|
||||
reshaped_x, a_scales = marlin_quant_input(reshaped_x, input_dtype)
|
||||
|
||||
output = ops.gptq_marlin_gemm(
|
||||
|
||||
@ -154,6 +154,12 @@ def prepare_fp4_layer_for_marlin(
|
||||
)
|
||||
|
||||
is_nvfp4 = hasattr(layer, "weight_scale_2")
|
||||
if input_dtype is not None and input_dtype.itemsize == 1:
|
||||
if is_nvfp4:
|
||||
raise RuntimeError("NVFP4 weight + INT8/FP8 activation is not supported.")
|
||||
elif input_dtype != torch.float8_e4m3fn:
|
||||
raise RuntimeError("MXFP4 weight + INT8 activation is not supported.")
|
||||
|
||||
group_size = 16 if is_nvfp4 else 32
|
||||
|
||||
part_size_n = layer.output_size_per_partition
|
||||
@ -231,6 +237,12 @@ def prepare_moe_fp4_layer_for_marlin(
|
||||
)
|
||||
|
||||
is_nvfp4 = hasattr(layer, "w13_weight_scale_2")
|
||||
if input_dtype is not None and input_dtype.itemsize == 1:
|
||||
if is_nvfp4:
|
||||
raise RuntimeError("NVFP4 weight + INT8/FP8 activation is not supported.")
|
||||
elif input_dtype != torch.float8_e4m3fn:
|
||||
raise RuntimeError("MXFP4 weight + INT8 activation is not supported.")
|
||||
|
||||
group_size = 16 if is_nvfp4 else 32
|
||||
|
||||
e = layer.num_experts
|
||||
|
||||
@ -99,6 +99,8 @@ def prepare_fp8_layer_for_marlin(
|
||||
"be used leveraging the Marlin kernel. This may degrade "
|
||||
"performance for compute-heavy workloads."
|
||||
)
|
||||
if input_dtype is not None and input_dtype.itemsize == 1:
|
||||
raise RuntimeError("Marlin W8A8 is not supported.")
|
||||
|
||||
part_size_n = layer.output_size_per_partition
|
||||
part_size_k = layer.input_size_per_partition
|
||||
@ -142,10 +144,20 @@ def prepare_fp8_layer_for_marlin(
|
||||
# marlin kernel only support channel-wise and group-wise quantization
|
||||
# we need to convert the scales
|
||||
if weight_block_size is None:
|
||||
logical_widths = getattr(layer, "logical_widths", [])
|
||||
if scales.nelement() == 1:
|
||||
# tensor-wise quantization -> channel-wise quantization
|
||||
# (1, 1) =>(repeat)=> (1, size_n)
|
||||
scales = scales.view(1, 1).repeat_interleave(part_size_n, 1)
|
||||
elif scales.nelement() == len(logical_widths):
|
||||
# tensor-wise quantization with logical_widths ->
|
||||
# channel-wise quantization
|
||||
assert sum(logical_widths) == part_size_n, (
|
||||
f"Sum of logical_widths ({sum(logical_widths)}) must be equal "
|
||||
f"to part_size_n ({part_size_n})"
|
||||
)
|
||||
lw_tensor = scales.new_tensor(logical_widths, dtype=torch.int64)
|
||||
scales = scales.view(1, -1).repeat_interleave(lw_tensor, dim=1)
|
||||
elif scales.nelement() > 1 and scales.nelement() != part_size_n:
|
||||
assert part_size_n % scales.nelement() == 0
|
||||
s_size = scales.nelement()
|
||||
@ -196,6 +208,8 @@ def prepare_moe_fp8_layer_for_marlin(
|
||||
"be used leveraging the Marlin kernel. This may degrade "
|
||||
"performance for compute-heavy workloads."
|
||||
)
|
||||
if input_dtype is not None and input_dtype.itemsize == 1:
|
||||
raise RuntimeError("Marlin W8A8 is not supported.")
|
||||
|
||||
e = layer.num_experts
|
||||
k = layer.hidden_size
|
||||
|
||||
@ -46,7 +46,7 @@ from vllm.config import CacheConfig, VllmConfig
|
||||
from vllm.distributed import get_pp_group
|
||||
from vllm.logger import init_logger
|
||||
from vllm.model_executor.layers.activation import SiluAndMul
|
||||
from vllm.model_executor.layers.fused_moe import FusedMoE
|
||||
from vllm.model_executor.layers.fused_moe import FusedMoE, ZeroExpertFusedMoE
|
||||
from vllm.model_executor.layers.layernorm import RMSNorm
|
||||
from vllm.model_executor.layers.linear import (
|
||||
MergedColumnParallelLinear,
|
||||
@ -179,7 +179,7 @@ class FlashConfig(PretrainedConfig):
|
||||
self.intermediate_size = (
|
||||
self.ffn_hidden_size
|
||||
if hasattr(self, "ffn_hidden_size")
|
||||
else self.intermediate_size
|
||||
else intermediate_size
|
||||
)
|
||||
if hasattr(self, "moe_intermediate_size"):
|
||||
self.moe_intermediate_size = self.moe_intermediate_size
|
||||
@ -280,10 +280,6 @@ class LongcatMoe(nn.Module):
|
||||
):
|
||||
super().__init__()
|
||||
self.hidden_size = hidden_size
|
||||
self.zero_expert_num = config.zero_expert_num
|
||||
self.zero_expert_type = config.zero_expert_type
|
||||
self.routed_scaling_factor = config.routed_scaling_factor
|
||||
self.enable_eplb = enable_eplb
|
||||
# Gate always runs at half / full precision for now.
|
||||
self.rounter_params_dtype = params_dtype
|
||||
if config.router_dtype == "float32":
|
||||
@ -291,25 +287,27 @@ class LongcatMoe(nn.Module):
|
||||
|
||||
self.router = LongcatRouter(
|
||||
config=config,
|
||||
zero_expert_num=self.zero_expert_num,
|
||||
zero_expert_num=config.zero_expert_num,
|
||||
rounter_params_dtype=self.rounter_params_dtype,
|
||||
prefix=f"{prefix}.gate",
|
||||
)
|
||||
|
||||
self.experts = FusedMoE(
|
||||
assert config.zero_expert_num is not None
|
||||
assert config.zero_expert_type is not None
|
||||
self.experts = ZeroExpertFusedMoE(
|
||||
zero_expert_num=config.zero_expert_num,
|
||||
zero_expert_type=config.zero_expert_type,
|
||||
router=self.router,
|
||||
num_experts=num_experts,
|
||||
top_k=top_k,
|
||||
hidden_size=hidden_size,
|
||||
intermediate_size=intermediate_size,
|
||||
reduce_results=True,
|
||||
params_dtype=params_dtype,
|
||||
e_score_correction_bias=self.router.e_score_correction_bias,
|
||||
renormalize=False,
|
||||
quant_config=quant_config,
|
||||
prefix=f"{prefix}.experts",
|
||||
zero_expert_num=self.zero_expert_num,
|
||||
zero_expert_type=self.zero_expert_type,
|
||||
enable_eplb=self.enable_eplb,
|
||||
enable_eplb=enable_eplb,
|
||||
routed_scaling_factor=config.routed_scaling_factor,
|
||||
)
|
||||
|
||||
@ -317,11 +315,34 @@ class LongcatMoe(nn.Module):
|
||||
num_tokens, hidden_dim = hidden_states.shape
|
||||
hidden_states = hidden_states.view(-1, hidden_dim)
|
||||
|
||||
router_logits = self.router(hidden_states.to(self.rounter_params_dtype))
|
||||
final_hidden_states = self.experts(
|
||||
hidden_states=hidden_states, router_logits=router_logits
|
||||
# Align to FusedMoE padded hidden size to avoid dim mismatch
|
||||
padded_hidden = self.experts.hidden_size
|
||||
if hidden_dim < padded_hidden:
|
||||
hidden_states_padded = torch.nn.functional.pad(
|
||||
hidden_states,
|
||||
(0, padded_hidden - hidden_dim),
|
||||
mode="constant",
|
||||
value=0.0,
|
||||
)
|
||||
else:
|
||||
hidden_states_padded = hidden_states
|
||||
|
||||
router_logits_full = self.router(
|
||||
hidden_states_padded.to(self.rounter_params_dtype)
|
||||
)
|
||||
|
||||
# ZeroExpertFusedMoE handles routing memoization and zero expert computation
|
||||
# internally. Pass full router_logits (including zero experts) so that
|
||||
# zero experts can be properly identified in routing.
|
||||
final_hidden_states = self.experts(
|
||||
hidden_states=hidden_states_padded,
|
||||
router_logits=router_logits_full, # Full logits (includes zero experts)
|
||||
)
|
||||
|
||||
# Crop back to original hidden dimension if padded earlier
|
||||
if padded_hidden != hidden_dim:
|
||||
final_hidden_states = final_hidden_states[..., :hidden_dim]
|
||||
|
||||
return final_hidden_states.view(num_tokens, hidden_dim)
|
||||
|
||||
|
||||
@ -419,6 +440,7 @@ class FlashDecoderLayer(nn.Module):
|
||||
hidden_states = self.self_attn[0](
|
||||
positions=positions,
|
||||
hidden_states=hidden_states,
|
||||
llama_4_scaling=None,
|
||||
)
|
||||
|
||||
hidden_states, residual = self.post_attention_layernorm[0](
|
||||
@ -438,6 +460,7 @@ class FlashDecoderLayer(nn.Module):
|
||||
hidden_states = self.self_attn[1](
|
||||
positions=positions,
|
||||
hidden_states=hidden_states,
|
||||
llama_4_scaling=None,
|
||||
)
|
||||
hidden_states, residual = self.post_attention_layernorm[1](
|
||||
hidden_states, residual
|
||||
|
||||
@ -79,6 +79,15 @@ class OpenAIToolParser(ToolParser):
|
||||
elif msg.channel == "commentary" and not msg.recipient:
|
||||
commentary_content = msg_text
|
||||
|
||||
# Extract partial content from the parser state if the generation was truncated
|
||||
if parser.current_content:
|
||||
if parser.current_channel == "final":
|
||||
final_content = parser.current_content
|
||||
elif (
|
||||
parser.current_channel == "commentary" and not parser.current_recipient
|
||||
):
|
||||
commentary_content = parser.current_content
|
||||
|
||||
return ExtractedToolCallInformation(
|
||||
tools_called=len(tool_calls) > 0,
|
||||
tool_calls=tool_calls,
|
||||
|
||||
@ -4,6 +4,7 @@ import asyncio
|
||||
import os
|
||||
import socket
|
||||
import time
|
||||
import warnings
|
||||
from collections.abc import AsyncGenerator, Iterable, Mapping
|
||||
from copy import copy
|
||||
from typing import Any, cast
|
||||
@ -627,6 +628,9 @@ class AsyncLLM(EngineClient):
|
||||
|
||||
The caller of generate() iterates the returned AsyncGenerator,
|
||||
returning the RequestOutput back to the caller.
|
||||
|
||||
NOTE: truncate_prompt_tokens is deprecated in v0.14.
|
||||
TODO: Remove truncate_prompt_tokens in v0.15.
|
||||
"""
|
||||
|
||||
try:
|
||||
@ -641,9 +645,19 @@ class AsyncLLM(EngineClient):
|
||||
|
||||
if tokenization_kwargs is None:
|
||||
tokenization_kwargs = {}
|
||||
|
||||
if truncate_prompt_tokens is not None:
|
||||
warnings.warn(
|
||||
"The `truncate_prompt_tokens` parameter in `AsyncLLM.encode()` "
|
||||
"is deprecated and will be removed in v0.15. "
|
||||
"Please use `pooling_params.truncate_prompt_tokens` instead.",
|
||||
DeprecationWarning,
|
||||
stacklevel=2,
|
||||
)
|
||||
|
||||
_validate_truncation_size(
|
||||
self.model_config.max_model_len,
|
||||
truncate_prompt_tokens,
|
||||
pooling_params.truncate_prompt_tokens,
|
||||
tokenization_kwargs,
|
||||
)
|
||||
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user