# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
Tests for the analytic performance estimators in vllm/v1/metrics/perf.py.
"""

import types
from types import SimpleNamespace

from transformers.models.deepseek_v3.configuration_deepseek_v3 import DeepseekV3Config
from transformers.models.llama4.configuration_llama4 import (
    Llama4Config,
    Llama4TextConfig,
)
from transformers.models.qwen3.configuration_qwen3 import Qwen3Config
from transformers.models.qwen3_moe.configuration_qwen3_moe import Qwen3MoeConfig

from vllm.config.model import ModelConfig, get_hf_text_config
from vllm.v1.metrics.perf import (
    AttentionMetrics,
    BaseConfigParser,
    ExecutionContext,
    FfnMetrics,
    ModelMetrics,
    ParsedArgs,
    UnembedMetrics,
)


class MockModelConfig:
    """Mock ModelConfig that implements the getter methods used by parsers."""

    def __init__(self, hf_config, dtype):
        self.hf_config = hf_config
        self.hf_text_config = get_hf_text_config(hf_config)
        self.dtype = dtype
        self.is_attention_free = False

    def __getattr__(self, name):
        # 1. Check if ModelConfig actually has this attribute
        if not hasattr(ModelConfig, name):
            raise AttributeError(
                f"'{type(self).__name__}' object has no attribute '{name}' "
                f"and neither does 'ModelConfig'."
            )

        # 2. Fetch the attribute from the ModelConfig CLASS
        attr = getattr(ModelConfig, name)

        # 3. Case A: It is a @property
        if isinstance(attr, property):
            # Manually invoke the property's getter, passing 'self'
            # (this mock instance)
            return attr.__get__(self, self.__class__)

        # 4. Case B: It is a standard method (function)
        if isinstance(attr, types.FunctionType):
            # Bind the function to 'self' so it acts like a method of
            # this instance. This creates a bound method where 'self' is
            # automatically passed as the first arg.
            return types.MethodType(attr, self)

        # 5. Case C: It is a class attribute / static variable
        return attr


def create_mock_vllm_config(
    hf_config,
    model_dtype="bfloat16",
    cache_dtype="auto",
    quant_config=None,
    data_parallel_size=1,
    tensor_parallel_size=1,
    pipeline_parallel_size=1,
    enable_expert_parallel=False,
) -> SimpleNamespace:
    vllm_config = SimpleNamespace()
    vllm_config.model_config = MockModelConfig(hf_config, model_dtype)

    vllm_config.cache_config = SimpleNamespace()
    vllm_config.cache_config.cache_dtype = cache_dtype

    vllm_config.quant_config = quant_config

    vllm_config.parallel_config = SimpleNamespace()
    vllm_config.parallel_config.data_parallel_size = data_parallel_size
    vllm_config.parallel_config.tensor_parallel_size = tensor_parallel_size
    vllm_config.parallel_config.pipeline_parallel_size = pipeline_parallel_size
    vllm_config.parallel_config.enable_expert_parallel = enable_expert_parallel

    return vllm_config


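# A minimal sketch of how the helper above is used throughout this file (only
# names already imported here; the specific numbers are illustrative):
#
#     hf_config = Qwen3Config(hidden_size=2048, num_attention_heads=16)
#     vllm_config = create_mock_vllm_config(hf_config, tensor_parallel_size=2)
#     metrics = AttentionMetrics.from_vllm_config(vllm_config)
#     ctx = ExecutionContext.from_single_request(
#         num_tokens=1, context_len=512, is_prefill=False
#     )
#     flops = metrics.get_num_flops(ctx, per_gpu=True)

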
#### Parser Tests ####


def test_base_config_parser():
    """Test BaseConfigParser extracts base model attributes correctly."""
    hf_config = Qwen3Config(
        vocab_size=50000,
        hidden_size=2048,
        num_attention_heads=16,
        num_hidden_layers=24,
    )
    vllm_config = create_mock_vllm_config(hf_config, model_dtype="float16")

    parser = BaseConfigParser()
    args = ParsedArgs()
    result = parser.parse(args, vllm_config)

    assert result.vocab_size == 50000
    assert result.hidden_size == 2048
    assert result.num_attention_heads == 16
    assert result.num_hidden_layers == 24
    assert result.weight_byte_size == 2  # float16 is 2 bytes
    assert result.activation_byte_size == 2  # default activation size


def test_base_attention_config_parser_with_gqa():
    """Test BaseAttentionConfigParser with grouped query attention."""
    hf_config = Qwen3Config(
        hidden_size=4096,
        num_attention_heads=32,
        num_key_value_heads=8,  # GQA with 4:1 ratio
        head_dim=128,
    )
    vllm_config = create_mock_vllm_config(hf_config)

    parser_chain = AttentionMetrics.get_parser()
    result = parser_chain.parse(vllm_config)

    assert result.num_key_value_heads == 8
    assert result.head_dim == 128


def test_base_attention_config_parser_without_gqa():
    """
    Test BaseAttentionConfigParser defaults to MHA when num_key_value_heads not
    specified.
    """
    hf_config = Qwen3Config(
        hidden_size=4096,
        num_attention_heads=32,
        # No num_key_value_heads specified
    )
    vllm_config = create_mock_vllm_config(hf_config)

    parser_chain = AttentionMetrics.get_parser()
    result = parser_chain.parse(vllm_config)

    # Should default to MHA (num_key_value_heads = num_attention_heads)
    assert result.num_key_value_heads == 32


def test_base_ffn_config_parser_dense():
    """Test BaseFfnConfigParser for dense FFN."""
    hf_config = Qwen3Config(
        hidden_size=4096,
        intermediate_size=11008,
        num_hidden_layers=32,
    )
    vllm_config = create_mock_vllm_config(hf_config)

    parser_chain = FfnMetrics.get_parser()
    result = parser_chain.parse(vllm_config)

    assert result.intermediate_size == 11008
    assert result.num_experts == 0
    assert result.num_experts_per_tok == 0
    assert result.num_moe_layers == 0  # No MoE


def test_base_ffn_config_parser_moe():
    """Test BaseFfnConfigParser for MoE FFN."""
    hf_config = Qwen3MoeConfig(
        hidden_size=4096,
        intermediate_size=11008,
        num_hidden_layers=32,
        num_experts=64,
        num_experts_per_tok=8,
        moe_intermediate_size=14336,
        n_shared_experts=2,
    )
    vllm_config = create_mock_vllm_config(hf_config)

    parser_chain = FfnMetrics.get_parser()
    result = parser_chain.parse(vllm_config)

    assert result.num_experts == 64
    assert result.num_experts_per_tok == 8
    assert result.moe_intermediate_size == 14336
    assert result.num_shared_experts == 2
    assert result.num_moe_layers == 32  # All layers are MoE by default


def test_interleave_moe_layer_step_parser():
    """Test InterleaveMoeLayerStepParser correctly computes MoE layer count."""
    hf_config = Llama4Config(
        text_config=Llama4TextConfig(
            num_hidden_layers=32,
            num_local_experts=64,
            interleave_moe_layer_step=4,  # Every 4th layer is MoE
        ),
    )

    vllm_config = create_mock_vllm_config(hf_config)

    parser_chain = FfnMetrics.get_parser()
    result = parser_chain.parse(vllm_config)

    assert result.num_moe_layers == 8


def test_moe_layer_freq_parser():
    """Test MoeLayerFreqParser correctly computes MoE layer count."""
    hf_config = DeepseekV3Config(
        num_hidden_layers=30,
        n_routed_experts=64,
        moe_layer_freq=3,  # Every 3rd layer after first_k_dense_replace
        first_k_dense_replace=6,  # First 6 layers are dense
    )
    vllm_config = create_mock_vllm_config(hf_config)

    parser_chain = FfnMetrics.get_parser()
    result = parser_chain.parse(vllm_config)

    # Layers >= 6 and divisible by 3: 6, 9, 12, 15, 18, 21, 24, 27
    expected_moe_layers = len(
        [layer for layer in range(30) if layer >= 6 and layer % 3 == 0]
    )
    assert expected_moe_layers == 8
    assert result.num_moe_layers == expected_moe_layers


#### ComponentMetrics Tests ####


def test_attention_metrics_scaling():
    """Test that attention metrics scale proportionally with model dimensions."""
    base_hf_config = Qwen3Config(
        hidden_size=2048,
        num_attention_heads=16,
        num_key_value_heads=16,
        num_hidden_layers=12,
        head_dim=128,
    )

    base_vllm_config = create_mock_vllm_config(base_hf_config)
    base_metrics = AttentionMetrics.from_vllm_config(base_vllm_config)

    # Test scaling with number of layers
    double_layers_hf_config = Qwen3Config(
        hidden_size=2048,
        num_attention_heads=16,
        num_key_value_heads=16,
        num_hidden_layers=24,  # Double the layers
        head_dim=128,
    )
    double_layers_vllm_config = create_mock_vllm_config(double_layers_hf_config)
    double_layers_metrics = AttentionMetrics.from_vllm_config(double_layers_vllm_config)

    ctx = ExecutionContext.from_single_request(
        num_tokens=100, context_len=512, is_prefill=True
    )

    # FLOPS should double when layers double
    base_flops = base_metrics.get_num_flops(ctx)
    double_flops = double_layers_metrics.get_num_flops(ctx)
    assert double_flops == 2 * base_flops

    # Read/write bytes should also scale proportionally
    base_read = base_metrics.get_read_bytes(ctx)
    double_read = double_layers_metrics.get_read_bytes(ctx)
    assert double_read == 2 * base_read

    base_write = base_metrics.get_write_bytes(ctx)
    double_write = double_layers_metrics.get_write_bytes(ctx)
    assert double_write == 2 * base_write


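# Note on test_attention_metrics_scaling above: every transformer layer is
# assumed to contribute an identical attention cost, so the analytic estimates
# should be linear in num_hidden_layers, which is exactly what the 2x
# assertions check.

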
def test_attention_metrics_grouped_query():
    """Test attention metrics handle grouped query attention correctly."""
    mha_hf_config = Qwen3Config(
        hidden_size=4096,
        num_attention_heads=32,
        num_key_value_heads=32,  # MHA
        num_hidden_layers=1,
    )
    mha_config = create_mock_vllm_config(mha_hf_config)

    gqa_hf_config = Qwen3Config(
        hidden_size=4096,
        num_attention_heads=32,
        num_key_value_heads=8,  # GQA with 4:1 ratio
        num_hidden_layers=1,
    )
    gqa_config = create_mock_vllm_config(gqa_hf_config)

    mha_metrics = AttentionMetrics.from_vllm_config(mha_config)
    gqa_metrics = AttentionMetrics.from_vllm_config(gqa_config)

    ctx = ExecutionContext.from_single_request(
        num_tokens=1, context_len=1024, is_prefill=False
    )

    # GQA should have fewer KV cache reads since there are fewer KV heads
    mha_read = mha_metrics.get_read_bytes(ctx)
    gqa_read = gqa_metrics.get_read_bytes(ctx)
    assert gqa_read < mha_read


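# Rough intuition for test_attention_metrics_grouped_query (an assumption about
# how the estimator models attention, not its exact formula): decode-time
# KV-cache traffic is roughly proportional to
# context_len * num_key_value_heads * head_dim, so 8 KV heads instead of 32
# cuts cache reads by about 4x while the query/output projections are
# unchanged. Total reads therefore shrink, but not by a fixed ratio, which is
# why the test only asserts a strict inequality.

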
def test_ffn_metrics_scaling():
    """Test FFN metrics scale proportionally with model dimensions."""
    base_hf_config = Qwen3Config(
        hidden_size=2048,
        intermediate_size=8192,
        num_hidden_layers=12,
    )
    base_vllm_config = create_mock_vllm_config(base_hf_config)
    base_metrics = FfnMetrics.from_vllm_config(base_vllm_config)

    # Test scaling with intermediate size
    larger_ffn_hf_config = Qwen3Config(
        hidden_size=2048,
        intermediate_size=16384,  # Double intermediate size
        num_hidden_layers=12,
    )
    larger_ffn_vllm_config = create_mock_vllm_config(larger_ffn_hf_config)
    larger_ffn_metrics = FfnMetrics.from_vllm_config(larger_ffn_vllm_config)

    ctx = ExecutionContext.from_single_request(
        num_tokens=100, context_len=512, is_prefill=True
    )

    # FLOPS should double when intermediate size doubles
    base_flops = base_metrics.get_num_flops(ctx)
    larger_flops = larger_ffn_metrics.get_num_flops(ctx)
    assert larger_flops == base_flops * 2


def test_moe_metrics_vs_dense():
    """Test MoE metrics versus dense metrics."""
    dense_hf_config = Qwen3Config(
        hidden_size=2048,
        intermediate_size=8192,
        num_hidden_layers=12,
    )
    dense_config = create_mock_vllm_config(dense_hf_config)

    moe_hf_config = Qwen3MoeConfig(
        hidden_size=2048,
        intermediate_size=8192,
        num_hidden_layers=12,
        num_experts=64,
        num_experts_per_tok=2,  # 2 routed experts per token
        moe_intermediate_size=8192,
        n_shared_experts=0,
    )
    moe_config = create_mock_vllm_config(moe_hf_config)

    dense_metrics = FfnMetrics.from_vllm_config(dense_config)
    moe_metrics = FfnMetrics.from_vllm_config(moe_config)

    ctx = ExecutionContext.from_single_request(
        num_tokens=100, context_len=512, is_prefill=True
    )

    # MoE should have different compute/memory characteristics
    dense_flops = dense_metrics.get_num_flops(ctx)
    moe_flops = moe_metrics.get_num_flops(ctx)

    # 2 routed experts vs 1 dense FFN of the same size
    assert moe_flops == dense_flops * 2


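# Why the ratio is exactly 2x: moe_intermediate_size matches the dense
# intermediate_size and there are no shared experts, so each routed expert
# costs the same FLOPs as the dense FFN, and top-2 routing runs the equivalent
# of two dense FFNs per token.

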
def test_unembed_metrics_scaling():
    """Test unembedding metrics scale with vocab size."""
    small_vocab_hf_config = Qwen3Config(
        hidden_size=2048,
        vocab_size=32000,
    )
    small_vocab_config = create_mock_vllm_config(small_vocab_hf_config)

    large_vocab_hf_config = Qwen3Config(
        hidden_size=2048,
        vocab_size=64000,  # Double vocab size
    )
    large_vocab_config = create_mock_vllm_config(large_vocab_hf_config)

    small_vocab_metrics = UnembedMetrics.from_vllm_config(small_vocab_config)
    large_vocab_metrics = UnembedMetrics.from_vllm_config(large_vocab_config)

    ctx = ExecutionContext.from_single_request(
        num_tokens=100, context_len=512, is_prefill=True
    )

    # FLOPS should double when vocab size doubles
    small_flops = small_vocab_metrics.get_num_flops(ctx)
    large_flops = large_vocab_metrics.get_num_flops(ctx)
    assert large_flops == 2 * small_flops


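# The unembedding is a single [num_tokens, hidden_size] x [hidden_size,
# vocab_size] projection, so its FLOP estimate should be linear in vocab_size,
# which is what the 2x assertion above checks.

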
def test_prefill_vs_decode_differences():
    """Test that prefill and decode have different memory access patterns."""
    hf_config = Qwen3Config(
        hidden_size=2048,
        num_attention_heads=16,
        num_key_value_heads=16,
        num_hidden_layers=1,
    )
    config = create_mock_vllm_config(hf_config)

    metrics = AttentionMetrics.from_vllm_config(config)

    prefill_ctx = ExecutionContext.from_single_request(
        num_tokens=512, context_len=512, is_prefill=True
    )
    decode_ctx = ExecutionContext.from_single_request(
        num_tokens=1, context_len=512, is_prefill=False
    )

    prefill_read = metrics.get_read_bytes(prefill_ctx)
    decode_read = metrics.get_read_bytes(decode_ctx)

    assert prefill_read != decode_read


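# The assertion above is deliberately loose: prefill pushes 512 query tokens
# through the projections, while single-token decode mostly reads per-token
# KV-cache state for the 512-token context. The exact byte counts depend on
# how perf.py models each term, so the test only requires the two access
# patterns to differ.

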
def test_model_metrics_aggregation():
    """Test ModelMetrics correctly aggregates across components."""
    hf_config = Qwen3Config(
        hidden_size=2048,
        num_attention_heads=16,
        num_hidden_layers=12,
        vocab_size=32000,
        intermediate_size=8192,
    )
    config = create_mock_vllm_config(hf_config)

    model_metrics = ModelMetrics(config)
    ctx = ExecutionContext.from_single_request(
        num_tokens=100, context_len=512, is_prefill=True
    )

    # Should have metrics for attention, ffn, and unembed
    total_flops = model_metrics.get_num_flops(ctx)
    breakdown = model_metrics.get_num_flops_breakdown(ctx)

    # Breakdown should sum to total
    assert total_flops == sum(breakdown.values())


def test_moe_expert_activation_proportional_scaling():
    """Test that routed expert metrics scale proportionally with num_experts_per_tok."""
    base_moe_config = Qwen3MoeConfig(
        hidden_size=2048,
        intermediate_size=8192,
        num_hidden_layers=12,
        num_experts=64,
        num_experts_per_tok=1,  # 1 expert per token
        moe_intermediate_size=8192,
        n_shared_experts=2,
    )

    double_experts_config = Qwen3MoeConfig(
        hidden_size=2048,
        intermediate_size=8192,
        num_hidden_layers=12,
        num_experts=64,
        num_experts_per_tok=2,  # 2 experts per token (double)
        moe_intermediate_size=8192,
        n_shared_experts=2,  # Same shared experts
    )

    triple_experts_config = Qwen3MoeConfig(
        hidden_size=2048,
        intermediate_size=8192,
        num_hidden_layers=12,
        num_experts=64,
        num_experts_per_tok=3,  # 3 experts per token (triple)
        moe_intermediate_size=8192,
        n_shared_experts=2,  # Same shared experts
    )

    base_vllm_config = create_mock_vllm_config(base_moe_config)
    double_vllm_config = create_mock_vllm_config(double_experts_config)
    triple_vllm_config = create_mock_vllm_config(triple_experts_config)

    base_metrics = FfnMetrics.from_vllm_config(base_vllm_config)
    double_metrics = FfnMetrics.from_vllm_config(double_vllm_config)
    triple_metrics = FfnMetrics.from_vllm_config(triple_vllm_config)

    ctx = ExecutionContext.from_single_request(
        num_tokens=100, context_len=512, is_prefill=True
    )

    # Get total metrics - the key insight is that the differences should be
    # proportional to the number of extra routed experts.
    base_flops = base_metrics.get_num_flops(ctx)
    double_flops = double_metrics.get_num_flops(ctx)
    triple_flops = triple_metrics.get_num_flops(ctx)

    # The difference between double and base should equal the cost of one
    # additional expert
    one_expert_diff = double_flops - base_flops

    # The difference between triple and base should equal the cost of two
    # additional experts
    two_expert_diff = triple_flops - base_flops

    # Proportional scaling: 2 * (1 expert diff) should equal (2 expert diff)
    assert two_expert_diff == 2 * one_expert_diff

    # Same logic applies to memory operations
    base_read = base_metrics.get_read_bytes(ctx)
    double_read = double_metrics.get_read_bytes(ctx)
    triple_read = triple_metrics.get_read_bytes(ctx)

    one_expert_read_diff = double_read - base_read
    two_expert_read_diff = triple_read - base_read

    assert two_expert_read_diff == 2 * one_expert_read_diff

    # Same for write bytes
    base_write = base_metrics.get_write_bytes(ctx)
    double_write = double_metrics.get_write_bytes(ctx)
    triple_write = triple_metrics.get_write_bytes(ctx)

    one_expert_write_diff = double_write - base_write
    two_expert_write_diff = triple_write - base_write

    assert two_expert_write_diff == 2 * one_expert_write_diff


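# Why the differencing works here: the shared experts and every other term are
# identical across the three configs, so subtracting the 1-expert baseline
# cancels them and each delta isolates the cost of the extra routed experts.
# The delta for two extra experts must therefore be exactly twice the delta
# for one extra expert.

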
def test_quantization_config_parser_fp8():
    """Test quantization parsers with fp8."""

    class MockQuantConfig:
        def get_name(self):
            return "fp8"

    hf_config = Qwen3Config(
        hidden_size=2048, num_attention_heads=16, num_hidden_layers=1
    )
    vllm_config = create_mock_vllm_config(hf_config, quant_config=MockQuantConfig())

    attn_result = AttentionMetrics.get_parser().parse(vllm_config)
    assert attn_result.weight_byte_size == 1  # fp8

    ffn_result = FfnMetrics.get_parser().parse(vllm_config)
    assert ffn_result.weight_byte_size == 1  # fp8


def test_quantization_config_parser_mxfp4():
    """Test quantization parsers with mxfp4."""

    class MockQuantConfig:
        def get_name(self):
            return "mxfp4"

    hf_config = Qwen3Config(
        hidden_size=2048, intermediate_size=8192, num_hidden_layers=1
    )
    vllm_config = create_mock_vllm_config(hf_config, quant_config=MockQuantConfig())

    ffn_result = FfnMetrics.get_parser().parse(vllm_config)
    assert ffn_result.weight_byte_size == 0.5  # mxfp4


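# Weight byte sizes the dtype/quantization tests rely on: float16/bfloat16
# weights are 2 bytes per element, fp8 weights are 1 byte, and mxfp4 weights
# are 0.5 bytes (4-bit values packed two per byte).

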
#### Per-GPU Tests ####


def test_attention_per_gpu_with_tensor_parallelism():
    """Test attention metrics with tensor parallelism - per_gpu vs global."""
    hf_config = Qwen3Config(
        hidden_size=4096,
        num_attention_heads=32,
        num_key_value_heads=8,
        num_hidden_layers=24,
    )

    # Test with TP=4
    vllm_config = create_mock_vllm_config(hf_config, tensor_parallel_size=4)
    metrics = AttentionMetrics.from_vllm_config(vllm_config)

    ctx = ExecutionContext.from_single_request(
        num_tokens=128, context_len=1024, is_prefill=True
    )

    # Get global and per-gpu metrics
    global_flops = metrics.get_num_flops(ctx, per_gpu=False)
    per_gpu_flops = metrics.get_num_flops(ctx, per_gpu=True)

    # With TP=4, global flops should be 4x per-gpu flops (heads divided by 4)
    assert global_flops == 4 * per_gpu_flops

    # Same for read/write bytes
    global_read = metrics.get_read_bytes(ctx, per_gpu=False)
    per_gpu_read = metrics.get_read_bytes(ctx, per_gpu=True)
    # Reads should scale similarly (weight reads are divided by TP)
    assert global_read > per_gpu_read

    global_write = metrics.get_write_bytes(ctx, per_gpu=False)
    per_gpu_write = metrics.get_write_bytes(ctx, per_gpu=True)
    assert global_write > per_gpu_write


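# Note on the asymmetry in test_attention_per_gpu_with_tensor_parallelism:
# attention FLOPs come from per-head work and the heads are sharded evenly
# across TP ranks, so FLOPs divide exactly by TP. Read/write bytes mix
# TP-sharded terms (projection weights, KV cache) with terms that are
# presumably replicated on every rank (e.g. input activations), so the test
# only asserts strict inequalities rather than an exact 4x ratio.

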
def test_attention_per_gpu_with_pipeline_parallelism():
    """Test attention metrics with pipeline parallelism - per_gpu vs global."""
    hf_config = Qwen3Config(
        hidden_size=2048,
        num_attention_heads=16,
        num_hidden_layers=32,
    )

    # Test with PP=4
    vllm_config = create_mock_vllm_config(hf_config, pipeline_parallel_size=4)
    metrics = AttentionMetrics.from_vllm_config(vllm_config)

    ctx = ExecutionContext.from_single_request(
        num_tokens=100, context_len=512, is_prefill=False
    )

    # Get global and per-gpu metrics
    global_flops = metrics.get_num_flops(ctx, per_gpu=False)
    per_gpu_flops = metrics.get_num_flops(ctx, per_gpu=True)

    # With PP=4, global flops should be 4x per-gpu flops (layers divided by 4)
    assert global_flops == 4 * per_gpu_flops

    global_read = metrics.get_read_bytes(ctx, per_gpu=False)
    per_gpu_read = metrics.get_read_bytes(ctx, per_gpu=True)
    assert global_read == 4 * per_gpu_read


def test_ffn_per_gpu_with_tensor_parallelism():
    """Test FFN metrics with tensor parallelism - per_gpu vs global."""
    hf_config = Qwen3Config(
        hidden_size=4096,
        intermediate_size=14336,
        num_hidden_layers=32,
    )

    # Test with DP=2, TP=4 (ffn_tp_size will be 8)
    vllm_config = create_mock_vllm_config(
        hf_config,
        data_parallel_size=2,
        tensor_parallel_size=4,
    )
    metrics = FfnMetrics.from_vllm_config(vllm_config)

    # ffn_tp_size should be dp_size * tp_size = 8 (when EP not enabled)
    assert metrics.ffn_tp_size == 8

    ctx = ExecutionContext.from_single_request(
        num_tokens=128, context_len=2048, is_prefill=True
    )

    # Get global and per-gpu metrics
    global_flops = metrics.get_num_flops(ctx, per_gpu=False)
    per_gpu_flops = metrics.get_num_flops(ctx, per_gpu=True)

    # With ffn_tp_size=8, global should be 8x per-gpu
    assert global_flops == 8 * per_gpu_flops


def test_ffn_per_gpu_with_pipeline_parallelism():
    """Test FFN metrics with pipeline parallelism - per_gpu vs global."""
    hf_config = Qwen3Config(
        hidden_size=2048,
        intermediate_size=8192,
        num_hidden_layers=24,
    )

    # Test with PP=6
    vllm_config = create_mock_vllm_config(hf_config, pipeline_parallel_size=6)
    metrics = FfnMetrics.from_vllm_config(vllm_config)

    ctx = ExecutionContext.from_single_request(
        num_tokens=100, context_len=512, is_prefill=True
    )

    # Get global and per-gpu metrics
    global_flops = metrics.get_num_flops(ctx, per_gpu=False)
    per_gpu_flops = metrics.get_num_flops(ctx, per_gpu=True)

    # With PP=6, global should be 6x per-gpu (layers divided by 6)
    assert global_flops == 6 * per_gpu_flops


def test_moe_per_gpu_with_expert_parallelism():
    """
    Test MoE metrics with expert parallelism - verifies num_activated_experts bug fix.
    """
    hf_config = Qwen3MoeConfig(
        hidden_size=2048,
        intermediate_size=8192,
        num_hidden_layers=24,
        num_experts=64,
        num_experts_per_tok=8,
        moe_intermediate_size=14336,
        n_shared_experts=2,
    )

    # Test with DP=2, TP=4, EP enabled (ffn_ep_size will be 8)
    vllm_config = create_mock_vllm_config(
        hf_config,
        data_parallel_size=2,
        tensor_parallel_size=4,
        enable_expert_parallel=True,
    )
    metrics = FfnMetrics.from_vllm_config(vllm_config)

    # When EP enabled, ffn_ep_size = dp_size * tp_size = 8
    assert metrics.ffn_ep_size == 8
    assert metrics.ffn_tp_size == 1

    ctx = ExecutionContext.from_single_request(
        num_tokens=100, context_len=512, is_prefill=True
    )

    # Get per-gpu metrics
    per_gpu_read_breakdown = metrics.get_read_bytes_breakdown(ctx, per_gpu=True)
    global_read_breakdown = metrics.get_read_bytes_breakdown(ctx, per_gpu=False)

    # Verify that routed expert weight reads are reasonable.
    # With per_gpu=True, each GPU holds 64/8 = 8 experts.
    # T=100 tokens and 8/8 = 1 routed expert per token per GPU give ~100 expert
    # activations per GPU, so num_activated_experts should be min(100, 8) = 8.

    # Check that weight reads scale appropriately:
    # global covers all 64 experts while per-gpu covers 8 experts,
    # so weight reads should reflect this difference.
    if "routed_up_gate_weights" in per_gpu_read_breakdown:
        per_gpu_weight_reads = per_gpu_read_breakdown["routed_up_gate_weights"]
        global_weight_reads = global_read_breakdown["routed_up_gate_weights"]

        # The ratio should reflect the expert count difference.
        # This verifies the bug fix works correctly.
        assert per_gpu_weight_reads < global_weight_reads

        # Global should read more experts than per-gpu.
        # The exact ratio depends on the num_activated_experts calculation.
        ratio = global_weight_reads / per_gpu_weight_reads
        # Should be > 1 since global has more experts to read
        assert ratio > 1


def test_moe_per_gpu_expert_activation_accounting():
    """
    Test that MoE correctly accounts for expert activations with small batch sizes.
    """
    hf_config = Qwen3MoeConfig(
        hidden_size=2048,
        intermediate_size=8192,
        num_hidden_layers=12,
        num_experts=64,
        num_experts_per_tok=8,
        moe_intermediate_size=14336,
        n_shared_experts=0,  # No shared experts for this test
    )

    # Test with EP=8
    vllm_config = create_mock_vllm_config(
        hf_config,
        data_parallel_size=8,
        enable_expert_parallel=True,
    )
    metrics = FfnMetrics.from_vllm_config(vllm_config)

    # Small batch: T=10, E_per_gpu=8/8=1
    # Each GPU: T*E = 10*1 = 10 activations
    # Experts per GPU: 64/8 = 8
    # So num_activated_experts should be min(10, 8) = 8
    small_ctx = ExecutionContext.from_single_request(
        num_tokens=10, context_len=512, is_prefill=True
    )
    small_read = metrics.get_read_bytes_breakdown(small_ctx, per_gpu=True)

    # Large batch: T=1000, E_per_gpu=1
    # Each GPU: T*E = 1000*1 = 1000 activations
    # Experts per GPU: 8
    # So num_activated_experts should be min(1000, 8) = 8 (all experts activated)
    large_ctx = ExecutionContext.from_single_request(
        num_tokens=1000, context_len=512, is_prefill=True
    )
    large_read = metrics.get_read_bytes_breakdown(large_ctx, per_gpu=True)

    # Weight reads should be similar (both activate all 8 experts per GPU)
    # But activation reads should differ (proportional to T*E)
    if "routed_up_gate_weights" in small_read:
        small_weight = small_read["routed_up_gate_weights"]
        large_weight = large_read["routed_up_gate_weights"]

        # Weight reads should be the same (both read all 8 experts)
        assert small_weight == large_weight

        # But input activation reads should scale with T*E
        small_input = small_read["routed_up_gate_input"]
        large_input = large_read["routed_up_gate_input"]
        assert large_input == 100 * small_input  # 1000/10 = 100x


def test_unembed_per_gpu_with_tensor_parallelism():
    """Test unembed metrics with tensor parallelism - per_gpu vs global."""
    hf_config = Qwen3Config(
        hidden_size=4096,
        vocab_size=128000,
    )

    # Test with TP=8
    vllm_config = create_mock_vllm_config(hf_config, tensor_parallel_size=8)
    metrics = UnembedMetrics.from_vllm_config(vllm_config)

    ctx = ExecutionContext.from_single_request(
        num_tokens=100, context_len=512, is_prefill=True
    )

    # Get global and per-gpu metrics
    global_flops = metrics.get_num_flops(ctx, per_gpu=False)
    per_gpu_flops = metrics.get_num_flops(ctx, per_gpu=True)

    # With TP=8, vocab is divided by 8, so global should be 8x per-gpu
    assert global_flops == 8 * per_gpu_flops

    # For read bytes, weight reads scale with TP but input reads don't (replicated)
    global_read_breakdown = metrics.get_read_bytes_breakdown(ctx, per_gpu=False)
    per_gpu_read_breakdown = metrics.get_read_bytes_breakdown(ctx, per_gpu=True)

    # Input reads should be the same (replicated across TP ranks)
    assert global_read_breakdown["input"] == per_gpu_read_breakdown["input"]

    # Weight reads should scale 8x (divided by TP)
    assert global_read_breakdown["weight"] == 8 * per_gpu_read_breakdown["weight"]


def test_model_metrics_per_gpu_aggregation():
    """Test ModelMetrics correctly aggregates per_gpu metrics across components."""
    hf_config = Qwen3Config(
        hidden_size=2048,
        num_attention_heads=16,
        num_hidden_layers=12,
        vocab_size=32000,
        intermediate_size=8192,
    )

    # Test with mixed parallelism: TP=2, PP=2
    vllm_config = create_mock_vllm_config(
        hf_config,
        tensor_parallel_size=2,
        pipeline_parallel_size=2,
    )

    model_metrics = ModelMetrics(vllm_config)
    ctx = ExecutionContext.from_single_request(
        num_tokens=100, context_len=512, is_prefill=True
    )

    # Get breakdowns for both modes
    per_gpu_breakdown = model_metrics.get_num_flops_breakdown(ctx, per_gpu=True)
    global_breakdown = model_metrics.get_num_flops_breakdown(ctx, per_gpu=False)

    # Verify breakdown sums match totals
    per_gpu_total = model_metrics.get_num_flops(ctx, per_gpu=True)
    global_total = model_metrics.get_num_flops(ctx, per_gpu=False)

    assert per_gpu_total == sum(per_gpu_breakdown.values())
    assert global_total == sum(global_breakdown.values())

    # Global should be larger than per-gpu due to parallelism
    assert global_total > per_gpu_total

    # With TP=2 and PP=2, the ratio depends on which parallelism applies to
    # which component, but we can verify that global is reasonably larger
    ratio = global_total / per_gpu_total
    assert ratio > 1  # Should be between PP and TP*PP depending on component mix


def test_attention_per_gpu_heads_not_evenly_divisible():
    """Test attention with heads not evenly divisible by TP."""
    hf_config = Qwen3Config(
        hidden_size=2048,
        num_attention_heads=17,  # Not divisible by 4
        num_key_value_heads=5,  # Not divisible by 4
        num_hidden_layers=8,
    )

    vllm_config = create_mock_vllm_config(hf_config, tensor_parallel_size=4)
    metrics = AttentionMetrics.from_vllm_config(vllm_config)

    ctx = ExecutionContext.from_single_request(
        num_tokens=64, context_len=256, is_prefill=True
    )

    # Should not crash and should handle max(1, ...) correctly
    per_gpu_flops = metrics.get_num_flops(ctx, per_gpu=True)
    global_flops = metrics.get_num_flops(ctx, per_gpu=False)

    # Both should be positive
    assert per_gpu_flops > 0
    assert global_flops > 0
    assert global_flops > per_gpu_flops