From 3453b964a3ed84d99c9ae33bc0fae00790df36ef Mon Sep 17 00:00:00 2001
From: Roger Wang <136131678+ywang96@users.noreply.github.com>
Date: Sat, 15 Mar 2025 18:46:17 -0700
Subject: [PATCH 01/34] [Misc][Doc] Minor benchmark README update (#14874)
Signed-off-by: Roger Wang
---
benchmarks/README.md | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/benchmarks/README.md b/benchmarks/README.md
index c64c24fd3ad05..3225a4b0db3a0 100644
--- a/benchmarks/README.md
+++ b/benchmarks/README.md
@@ -82,10 +82,10 @@ Then run the benchmarking script
# wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
MODEL_NAME="NousResearch/Hermes-3-Llama-3.1-8B"
NUM_PROMPTS=10
-BACKEND="openai-chat"
+BACKEND="vllm"
DATASET_NAME="sharegpt"
DATASET_PATH="/ShareGPT_V3_unfiltered_cleaned_split.json"
-python3 vllm/benchmarks/benchmark_serving.py --backend ${BACKEND} --model ${MODEL_NAME} --endpoint /v1/chat/completions --dataset-name ${DATASET_NAME} --dataset-path ${DATASET_PATH} --num-prompts ${NUM_PROMPTS}
+python3 vllm/benchmarks/benchmark_serving.py --backend ${BACKEND} --model ${MODEL_NAME} --endpoint /v1/completions --dataset-name ${DATASET_NAME} --dataset-path ${DATASET_PATH} --num-prompts ${NUM_PROMPTS}
```
If successful, you will see the following output
From def232e122624504e49f1e5ff0ae01a7285de1a3 Mon Sep 17 00:00:00 2001
From: Isotr0py
Date: Sun, 16 Mar 2025 09:53:52 +0800
Subject: [PATCH 02/34] [VLM] Clean up Phi-4-MM ViT implementation (#14812)
Signed-off-by: Isotr0py <2037008807@qq.com>
Co-authored-by: Cyrus Leung
---
requirements/test.in | 1 +
requirements/test.txt | 2 +
.../vision_language/test_phi4mm.py | 229 ++
vllm/model_executor/models/aria.py | 4 +-
.../models/idefics2_vision_model.py | 57 +-
vllm/model_executor/models/phi4mm.py | 45 +-
.../models/vision_siglip_navit.py | 1966 -----------------
7 files changed, 316 insertions(+), 1988 deletions(-)
create mode 100644 tests/models/decoder_only/vision_language/test_phi4mm.py
delete mode 100644 vllm/model_executor/models/vision_siglip_navit.py
diff --git a/requirements/test.in b/requirements/test.in
index cc89d518c7eec..c171e8d41ddc2 100644
--- a/requirements/test.in
+++ b/requirements/test.in
@@ -8,6 +8,7 @@ pytest-shard
# testing utils
awscli
+backoff # required for phi4mm test
decord # required for video tests
einops # required for MPT, qwen-vl and Mamba
httpx
diff --git a/requirements/test.txt b/requirements/test.txt
index c2cdd2c8664d8..10fb1f14c3a18 100644
--- a/requirements/test.txt
+++ b/requirements/test.txt
@@ -33,6 +33,8 @@ audioread==3.0.1
# via librosa
awscli==1.35.23
# via -r requirements/test.in
+backoff==2.2.1
+ # via -r requirements/test.in
bitsandbytes==0.45.3
# via -r requirements/test.in
black==24.10.0
diff --git a/tests/models/decoder_only/vision_language/test_phi4mm.py b/tests/models/decoder_only/vision_language/test_phi4mm.py
new file mode 100644
index 0000000000000..fb69beaf77598
--- /dev/null
+++ b/tests/models/decoder_only/vision_language/test_phi4mm.py
@@ -0,0 +1,229 @@
+# SPDX-License-Identifier: Apache-2.0
+
+import os
+import re
+from typing import Optional
+
+import pytest
+from huggingface_hub import snapshot_download
+from transformers import AutoTokenizer
+
+from vllm.lora.request import LoRARequest
+from vllm.multimodal.image import rescale_image_size
+from vllm.platforms import current_platform
+from vllm.sequence import SampleLogprobs
+
+from ....conftest import IMAGE_ASSETS, HfRunner, PromptImageInput, VllmRunner
+from ....utils import large_gpu_test
+from ...utils import check_logprobs_close
+
+HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({
+ "stop_sign":
+ "<|user|>\n<|image_1|>\nWhat's the content of the image?<|end|>\n<|assistant|>\n", # noqa: E501
+ "cherry_blossom":
+ "<|user|>\n<|image_1|>\nPlease infer the season with reason in details.<|end|>\n<|assistant|>\n", # noqa: E501
+})
+HF_MULTIIMAGE_IMAGE_PROMPT = "<|user|>\n<|image_1|>\n<|image_2|>\nDescribe these images.<|end|>\n<|assistant|>\n" # noqa: E501
+
+model_path = snapshot_download("microsoft/Phi-4-multimodal-instruct")
+# Since the vision-lora and speech-lora co-exist with the base model,
+# we have to manually specify the path of the lora weights.
+vision_lora_path = os.path.join(model_path, "vision-lora")
+models = [model_path]
+
+
+def vllm_to_hf_output(vllm_output: tuple[list[int], str,
+ Optional[SampleLogprobs]],
+ model: str):
+ """Sanitize vllm output to be comparable with hf output."""
+ _, output_str, out_logprobs = vllm_output
+
+ output_str_without_image = re.sub(r"(<\|image_\d+\|>)+", "", output_str)
+ assert output_str_without_image[0] == " "
+ output_str_without_image = output_str_without_image[1:]
+
+ hf_output_str = output_str_without_image + "<|end|><|endoftext|>"
+
+ tokenizer = AutoTokenizer.from_pretrained(model)
+ hf_output_ids = tokenizer.encode(output_str_without_image)
+ assert hf_output_ids[0] == 1
+ hf_output_ids = hf_output_ids[1:]
+
+ return hf_output_ids, hf_output_str, out_logprobs
+
+
+target_dtype = "half"
+
+# ROCm Triton FA can run into shared memory issues with these models,
+# use other backends in the meantime
+# FIXME (mattwong, gshtrasb, hongxiayan)
+if current_platform.is_rocm():
+ os.environ["VLLM_USE_TRITON_FLASH_ATTN"] = "0"
+
+
+def run_test(
+ hf_runner: type[HfRunner],
+ vllm_runner: type[VllmRunner],
+ inputs: list[tuple[list[str], PromptImageInput]],
+ model: str,
+ *,
+ max_model_len: int,
+ dtype: str,
+ max_tokens: int,
+ num_logprobs: int,
+ mm_limit: int,
+ tensor_parallel_size: int,
+ distributed_executor_backend: Optional[str] = None,
+):
+ """Inference result should be the same between hf and vllm.
+
+ All the image fixtures for the test are from IMAGE_ASSETS.
+ For huggingface runner, we provide the PIL images as input.
+ For vllm runner, we provide MultiModalDataDict objects
+ and corresponding MultiModalConfig as input.
+ Note, the text input is also adjusted to abide by vllm contract.
+ The text output is sanitized to be able to compare with hf.
+ """
+ # NOTE: take care of the order. run vLLM first, and then run HF.
+ # vLLM needs a fresh new process without cuda initialization.
+ # if we run HF first, the cuda initialization will be done and it
+ # will hurt multiprocessing backend with fork method (the default method).
+ # max_model_len should be greater than image_feature_size
+ with vllm_runner(
+ model,
+ task="generate",
+ max_model_len=max_model_len,
+ max_num_seqs=2,
+ dtype=dtype,
+ limit_mm_per_prompt={"image": mm_limit},
+ tensor_parallel_size=tensor_parallel_size,
+ distributed_executor_backend=distributed_executor_backend,
+ enable_lora=True,
+ max_lora_rank=320,
+ lora_extra_vocab_size=0,
+ gpu_memory_utilization=0.8, # set to 0.8 to avoid OOM in CI
+ enforce_eager=True,
+ ) as vllm_model:
+ lora_request = LoRARequest("vision", 1, vision_lora_path)
+ vllm_model.model.llm_engine.add_lora(lora_request=lora_request)
+ vllm_outputs_per_case = [
+ vllm_model.generate_greedy_logprobs(prompts,
+ max_tokens,
+ num_logprobs=num_logprobs,
+ images=images)
+ for prompts, images in inputs
+ ]
+
+ # use eager mode for hf runner, since phi3_v didn't work with flash_attn
+ hf_model_kwargs = {"_attn_implementation": "eager"}
+ with hf_runner(model, dtype=dtype,
+ model_kwargs=hf_model_kwargs) as hf_model:
+ eos_token_id = hf_model.processor.tokenizer.eos_token_id
+ hf_outputs_per_case = [
+ hf_model.generate_greedy_logprobs_limit(prompts,
+ max_tokens,
+ num_logprobs=num_logprobs,
+ images=images,
+ eos_token_id=eos_token_id,
+ num_logits_to_keep=0)
+ for prompts, images in inputs
+ ]
+
+ for hf_outputs, vllm_outputs in zip(hf_outputs_per_case,
+ vllm_outputs_per_case):
+ check_logprobs_close(
+ outputs_0_lst=hf_outputs,
+ outputs_1_lst=vllm_outputs,
+ name_0="hf",
+ name_1="vllm",
+ )
+
+
+# Since we use _attn_implementation="eager" for hf_runner, there is more
+# significant numerical difference. The basic `logprobs=5` fails to pass.
+@pytest.mark.parametrize("model", models)
+@pytest.mark.parametrize(
+ "size_factors",
+ [
+ # No image
+ [],
+ # Single-scale
+ [1.0],
+ # Single-scale, batched
+ [1.0, 1.0, 1.0],
+ # Multi-scale
+ [0.7, 0.75, 1.0],
+ ],
+)
+@pytest.mark.parametrize("dtype", [target_dtype])
+@pytest.mark.parametrize("max_model_len", [4096])
+@pytest.mark.parametrize("max_tokens", [128])
+@pytest.mark.parametrize("num_logprobs", [10])
+def test_models(hf_runner, vllm_runner, image_assets, model, size_factors,
+ dtype: str, max_model_len: int, max_tokens: int,
+ num_logprobs: int) -> None:
+ images = [asset.pil_image for asset in image_assets]
+
+ inputs_per_image = [(
+ [prompt for _ in size_factors],
+ [rescale_image_size(image, factor) for factor in size_factors],
+ ) for image, prompt in zip(images, HF_IMAGE_PROMPTS)]
+
+ run_test(
+ hf_runner,
+ vllm_runner,
+ inputs_per_image,
+ model,
+ dtype=dtype,
+ max_model_len=max_model_len,
+ max_tokens=max_tokens,
+ num_logprobs=num_logprobs,
+ mm_limit=1,
+ tensor_parallel_size=1,
+ )
+
+
+@large_gpu_test(min_gb=48)
+@pytest.mark.parametrize("model", models)
+@pytest.mark.parametrize(
+ "size_factors",
+ [
+ # No image
+ # [],
+ # Single-scale
+ [1.0],
+ # Single-scale, batched
+ [1.0, 1.0, 1.0],
+ # Multi-scale
+ [0.25, 0.5, 1.0],
+ ],
+)
+@pytest.mark.parametrize("dtype", [target_dtype])
+@pytest.mark.parametrize("max_model_len", [10000])
+@pytest.mark.parametrize("max_tokens", [128])
+@pytest.mark.parametrize("num_logprobs", [10])
+@pytest.mark.xfail(
+ reason="Phi-4-MM multi-image inference is divergent with hf model.")
+def test_multi_images_models(hf_runner, vllm_runner, image_assets, model,
+ size_factors, dtype: str, max_model_len: int,
+ max_tokens: int, num_logprobs: int) -> None:
+ images = [asset.pil_image for asset in image_assets]
+
+ inputs_per_case = [
+ ([HF_MULTIIMAGE_IMAGE_PROMPT for _ in size_factors],
+ [[rescale_image_size(image, factor) for image in images]
+ for factor in size_factors])
+ ]
+
+ run_test(
+ hf_runner,
+ vllm_runner,
+ inputs_per_case,
+ model,
+ dtype=dtype,
+ max_model_len=max_model_len,
+ max_tokens=max_tokens,
+ num_logprobs=num_logprobs,
+ mm_limit=2,
+ tensor_parallel_size=1,
+ )
diff --git a/vllm/model_executor/models/aria.py b/vllm/model_executor/models/aria.py
index ecd0a04b1dff7..8cd3be90ca8da 100644
--- a/vllm/model_executor/models/aria.py
+++ b/vllm/model_executor/models/aria.py
@@ -60,7 +60,7 @@ class AriaVisionTransformer(Idefics3VisionTransformer, SupportsQuant):
quant_config: Optional[QuantizationConfig] = None,
prefix: str = "",
) -> None:
- super().__init__(config, quant_config, prefix)
+ super().__init__(config, quant_config=quant_config, prefix=prefix)
# Unlike Idefics3VisionTransformer which uses LayerNorm after the
# final layer, Aria omits this normalization, so we replace it with an
# Identity layer
@@ -512,7 +512,7 @@ class AriaForConditionalGeneration(nn.Module, SupportsMultiModal):
self.config = config
self.vision_tower = AriaVisionTransformer(
config.vision_config,
- quant_config,
+ quant_config=quant_config,
prefix=f"{prefix}.vision_tower",
)
self.multi_modal_projector = AriaProjector(config)
diff --git a/vllm/model_executor/models/idefics2_vision_model.py b/vllm/model_executor/models/idefics2_vision_model.py
index f9c2175b29881..cb0379c10f3a6 100644
--- a/vllm/model_executor/models/idefics2_vision_model.py
+++ b/vllm/model_executor/models/idefics2_vision_model.py
@@ -113,7 +113,7 @@ class Idefics2VisionAttention(nn.Module):
def __init__(
self,
- config: Idefics2Config,
+ config: Idefics2VisionConfig,
quant_config: Optional[QuantizationConfig] = None,
prefix: str = "",
) -> None:
@@ -164,7 +164,7 @@ class Idefics2VisionMLP(nn.Module):
def __init__(
self,
- config: Idefics2Config,
+ config: Idefics2VisionConfig,
quant_config: Optional[QuantizationConfig] = None,
prefix: str = "",
) -> None:
@@ -249,16 +249,24 @@ class Idefics2Encoder(nn.Module):
self,
config: Idefics2Config,
quant_config: Optional[QuantizationConfig] = None,
+ *,
+ num_hidden_layers_override: Optional[int] = None,
prefix: str = "",
) -> None:
super().__init__()
self.config = config
+
+ if num_hidden_layers_override is None:
+ num_hidden_layers = config.num_hidden_layers
+ else:
+ num_hidden_layers = num_hidden_layers_override
+
self.layers = nn.ModuleList([
Idefics2EncoderLayer(config,
quant_config=quant_config,
prefix=f"{prefix}.layers.{layer_idx}")
- for layer_idx in range(config.num_hidden_layers)
+ for layer_idx in range(num_hidden_layers)
])
def forward(
@@ -287,6 +295,9 @@ class Idefics2VisionTransformer(nn.Module):
self,
config: Idefics2VisionConfig,
quant_config: Optional[QuantizationConfig] = None,
+ *,
+ num_hidden_layers_override: Optional[int] = None,
+ require_post_norm: bool = True,
prefix: str = "",
) -> None:
super().__init__()
@@ -294,11 +305,24 @@ class Idefics2VisionTransformer(nn.Module):
embed_dim = config.hidden_size
self.config = config
self.embeddings = Idefics2VisionEmbeddings(config)
- self.encoder = Idefics2Encoder(config,
- quant_config=quant_config,
- prefix=f"{prefix}.encoder")
- self.post_layernorm = nn.LayerNorm(embed_dim,
- eps=config.layer_norm_eps)
+ self.encoder = Idefics2Encoder(
+ config,
+ quant_config=quant_config,
+ num_hidden_layers_override=num_hidden_layers_override,
+ prefix=f"{prefix}.encoder")
+
+ num_hidden_layers = config.num_hidden_layers
+ if len(self.encoder.layers) > config.num_hidden_layers:
+ raise ValueError(
+ f"The original encoder only has {num_hidden_layers} "
+ f"layers, but you requested {len(self.encoder.layers)} layers."
+ )
+
+ self.require_post_norm = require_post_norm
+ self.post_layernorm = nn.LayerNorm(
+ embed_dim,
+ eps=config.layer_norm_eps,
+ ) if require_post_norm else nn.Identity()
def get_input_embeddings(self):
return self.embeddings
@@ -328,7 +352,24 @@ class Idefics2VisionTransformer(nn.Module):
]
params_dict = dict(self.named_parameters())
loaded_params: Set[str] = set()
+ layer_count = len(self.encoder.layers)
+
for name, loaded_weight in weights:
+ # skip pooling header
+ if name.startswith("head."):
+ continue
+
+ # post_layernorm is optional
+ if (name.startswith("post_layernorm.")
+ and not self.require_post_norm):
+ continue
+
+ # omit layers when num_hidden_layers_override is set
+ if name.startswith("encoder.layers."):
+ layer_idx = int(name.split(".")[2])
+ if layer_idx >= layer_count:
+ continue
+
for param_name, weight_name, shard_id in stacked_params_mapping:
if weight_name not in name:
continue
diff --git a/vllm/model_executor/models/phi4mm.py b/vllm/model_executor/models/phi4mm.py
index 2a839f3a50317..7250aaba557eb 100644
--- a/vllm/model_executor/models/phi4mm.py
+++ b/vllm/model_executor/models/phi4mm.py
@@ -11,7 +11,7 @@ import torch
import torch.nn as nn
import torchvision.transforms as T
from PIL import Image
-from transformers import PretrainedConfig
+from transformers import PretrainedConfig, SiglipVisionConfig
from transformers.utils import logging
from vllm.config import VllmConfig
@@ -32,10 +32,10 @@ from vllm.multimodal.inputs import MultiModalInputs, NestedTensors
from vllm.sequence import IntermediateTensors, SequenceData
from vllm.transformers_utils.tokenizer import cached_tokenizer_from_config
+from .idefics2_vision_model import Idefics2VisionTransformer
from .interfaces import SupportsLoRA, SupportsMultiModal
from .phi4mm_audio import AudioEmbedding
from .utils import AutoWeightsLoader, WeightsMapper, maybe_prefix
-from .vision_siglip_navit import get_siglip_vision_model
# <|endoftext10|> (see vocab.json in hf model)
_IMAGE_PLACEHOLDER_TOKEN_ID = 200010
@@ -339,6 +339,33 @@ def preprocess(images, dynamic_hd_size, vit_resolution, vit_patch_size):
return data
+def get_navit_vision_model(layer_idx: int = -1, **kwargs):
+ vision_config = {
+ "hidden_size": 1152,
+ "image_size": 448,
+ "intermediate_size": 4304,
+ "model_type": "siglip_vision_model",
+ "num_attention_heads": 16,
+ "num_hidden_layers": 27,
+ "patch_size": 14,
+ }
+
+ model_config = SiglipVisionConfig(**vision_config, **kwargs)
+ if layer_idx < 0:
+ num_hidden_layers = model_config.num_hidden_layers \
+ + layer_idx + 1
+ else:
+ num_hidden_layers = layer_idx + 1
+
+ vision_model = Idefics2VisionTransformer(
+ config=model_config,
+ require_post_norm=False,
+ num_hidden_layers_override=num_hidden_layers,
+ )
+
+ return vision_model
+
+
class Phi4MMImageEncoder(nn.Module):
"""Image embedding."""
@@ -362,8 +389,7 @@ class Phi4MMImageEncoder(nn.Module):
self.layer_idx = -2
self.type_feature = 'patch'
- self.img_processor = get_siglip_vision_model(
- _flash_attn_2_enabled=True)
+ self.img_processor = get_navit_vision_model(layer_idx=self.layer_idx)
pe_weight = self.img_processor.embeddings.position_embedding.weight
L, D = pe_weight.size()
@@ -430,16 +456,11 @@ class Phi4MMImageEncoder(nn.Module):
def get_img_features(self,
img_embeds: torch.FloatTensor,
attention_mask=None) -> torch.FloatTensor:
- LAYER_IDX = self.layer_idx
- TYPE_FEATURE = self.type_feature
- img_processor_output = self.img_processor(
- img_embeds,
- output_hidden_states=True,
- patch_attention_mask=attention_mask)
- img_feature = img_processor_output.hidden_states[LAYER_IDX]
+ img_feature = self.img_processor(img_embeds,
+ patch_attention_mask=attention_mask)
- if TYPE_FEATURE == "patch":
+ if self.type_feature == "patch":
patch_feature = img_feature
use_token_compression = self.image_token_compression is not None
diff --git a/vllm/model_executor/models/vision_siglip_navit.py b/vllm/model_executor/models/vision_siglip_navit.py
deleted file mode 100644
index 3a9597a845ff9..0000000000000
--- a/vllm/model_executor/models/vision_siglip_navit.py
+++ /dev/null
@@ -1,1966 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-""" Siglip model configuration"""
-
-import math
-import os
-import warnings
-from dataclasses import dataclass
-from typing import Any, Optional, Tuple, Union
-
-import numpy as np
-import torch
-import torch.nn.functional as F
-import torch.utils.checkpoint
-from torch import nn
-from torch.nn.init import _calculate_fan_in_and_fan_out
-from transformers.activations import ACT2FN
-from transformers.configuration_utils import PretrainedConfig
-from transformers.modeling_attn_mask_utils import _prepare_4d_attention_mask
-from transformers.modeling_outputs import (BaseModelOutput,
- BaseModelOutputWithPooling)
-from transformers.modeling_utils import PreTrainedModel
-from transformers.utils import (ModelOutput, add_start_docstrings,
- add_start_docstrings_to_model_forward, logging,
- replace_return_docstrings)
-
-from vllm.platforms import _Backend
-
-from .vision import get_vit_attn_backend
-
-logger = logging.get_logger(__name__)
-
-SIGLIP_PRETRAINED_CONFIG_ARCHIVE_MAP = {
- "google/siglip-base-patch16-224":
- "https://huggingface.co/google/siglip-base-patch16-224/"\
- "resolve/main/config.json",
-}
-
-
-class SiglipTextConfig(PretrainedConfig):
- r"""
- This is the configuration class to store the configuration of a
- [`SiglipTextModel`]. It is used to instantiate a Siglip text encoder
- according to the specified arguments, defining the model architecture.
- Instantiating a configuration with the defaults will yield a similar
- configuration to that of the text encoder of the Siglip [google/
- siglip-base-patch16-224](https://huggingface.co/google/siglip-base
- -patch16-224) architecture.
- Configuration objects inherit from [`PretrainedConfig`] and can be used
- to control the model outputs. Read the documentation from
- [`PretrainedConfig`] for more information.
- Args:
- vocab_size (`int`, *optional*, defaults to 32000):
- Vocabulary size of the Siglip text model. Defines the number of
- different tokens that can be represented by the `inputs_ids`
- passed when calling [`SiglipModel`].
- hidden_size (`int`, *optional*, defaults to 768):
- Dimensionality of the encoder layers and the pooler layer.
- intermediate_size (`int`, *optional*, defaults to 3072):
- Dimensionality of the "intermediate" (i.e., feed-forward) layer
- in the Transformer encoder.
- num_hidden_layers (`int`, *optional*, defaults to 12):
- Number of hidden layers in the Transformer encoder.
- num_attention_heads (`int`, *optional*, defaults to 12):
- Number of attention heads for each attention layer in the
- Transformer encoder.
- max_position_embeddings (`int`, *optional*, defaults to 64):
- The maximum sequence length that this model might ever be used
- with. Typically set this to something large
- just in case (e.g., 512 or 1024 or 2048).
- hidden_act (`str` or `function`, *optional*, defaults to
- `"gelu_pytorch_tanh"`):
- The non-linear activation function (function or string) in the
- encoder and pooler. If string, `"gelu"`,
- `"relu"`, `"selu"` and `"gelu_new"` `"quick_gelu"` are supported.
- layer_norm_eps (`float`, *optional*, defaults to 1e-06):
- The epsilon used by the layer normalization layers.
- attention_dropout (`float`, *optional*, defaults to 0.0):
- The dropout ratio for the attention probabilities.
- pad_token_id (`int`, *optional*, defaults to 1):
- The id of the padding token in the vocabulary.
- bos_token_id (`int`, *optional*, defaults to 49406):
- The id of the beginning-of-sequence token in the vocabulary.
- eos_token_id (`int`, *optional*, defaults to 49407):
- The id of the end-of-sequence token in the vocabulary.
- Example:
- ```python
- >>> from transformers import SiglipTextConfig, SiglipTextModel
- >>> # Initializing a SiglipTextConfig with google/siglip-base-patch16-224
- style configuration
- >>> configuration = SiglipTextConfig()
- >>> # Initializing a SiglipTextModel (with random weights) from the
- google/siglip-base-patch16-224 style configuration
- >>> model = SiglipTextModel(configuration)
- >>> # Accessing the model configuration
- >>> configuration = model.config
- ```"""
-
- model_type = "siglip_text_model"
-
- def __init__(
- self,
- vocab_size=32000,
- hidden_size=768,
- intermediate_size=3072,
- num_hidden_layers=12,
- num_attention_heads=12,
- max_position_embeddings=64,
- hidden_act="gelu_pytorch_tanh",
- layer_norm_eps=1e-6,
- attention_dropout=0.0,
- # This differs from `CLIPTokenizer`'s default and from openai/siglip
- # See https://github.com/huggingface/transformers/pull/24773#
- # issuecomment-1632287538
- pad_token_id=1,
- bos_token_id=49406,
- eos_token_id=49407,
- _flash_attn_2_enabled=True,
- **kwargs,
- ):
- super().__init__(pad_token_id=pad_token_id,
- bos_token_id=bos_token_id,
- eos_token_id=eos_token_id,
- **kwargs)
-
- self.vocab_size = vocab_size
- self.hidden_size = hidden_size
- self.intermediate_size = intermediate_size
- self.num_hidden_layers = num_hidden_layers
- self.num_attention_heads = num_attention_heads
- self.max_position_embeddings = max_position_embeddings
- self.layer_norm_eps = layer_norm_eps
- self.hidden_act = hidden_act
- self.attention_dropout = attention_dropout
- self._flash_attn_2_enabled = _flash_attn_2_enabled
-
- @classmethod
- def from_pretrained(cls, pretrained_model_name_or_path: Union[str,
- os.PathLike],
- **kwargs) -> "PretrainedConfig":
- cls._set_token_in_kwargs(kwargs)
-
- config_dict, kwargs = cls.get_config_dict(
- pretrained_model_name_or_path, **kwargs)
-
- # get the text config dict if we are loading from SiglipConfig
- if config_dict.get("model_type") == "siglip":
- config_dict = config_dict["text_config"]
-
- if "model_type" in config_dict and hasattr(
- cls,
- "model_type") and config_dict["model_type"] != cls.model_type:
- logger.warning(
- "You are using a model of type %s to instantiate a model of "
- "type %s. This is not supported for all configurations of "
- "models and can yield errors.", config_dict['model_type'],
- cls.model_type)
-
- return cls.from_dict(config_dict, **kwargs)
-
-
-class SiglipVisionConfig(PretrainedConfig):
- r"""
- This is the configuration class to store the configuration of a
- [`SiglipVisionModel`]. It is used to instantiate a
- Siglip vision encoder according to the specified arguments, defining the
- model architecture. Instantiating a configuration with the defaults will
- yield a similar configuration to that of the vision encoder of the Siglip
- [google/siglip-base-patch16-224](https://huggingface.co/google/
- siglip-base-patch16-224) architecture.
- Configuration objects inherit from [`PretrainedConfig`] and can be used
- to control the model outputs. Read the
- documentation from [`PretrainedConfig`] for more information.
- Args:
- hidden_size (`int`, *optional*, defaults to 768):
- Dimensionality of the encoder layers and the pooler layer.
- intermediate_size (`int`, *optional*, defaults to 3072):
- Dimensionality of the "intermediate" (i.e., feed-forward) layer
- in the Transformer encoder.
- num_hidden_layers (`int`, *optional*, defaults to 12):
- Number of hidden layers in the Transformer encoder.
- num_attention_heads (`int`, *optional*, defaults to 12):
- Number of attention heads for each attention layer in the
- Transformer encoder.
- num_channels (`int`, *optional*, defaults to 3):
- Number of channels in the input images.
- image_size (`int`, *optional*, defaults to 224):
- The size (resolution) of each image.
- patch_size (`int`, *optional*, defaults to 16):
- The size (resolution) of each patch.
- hidden_act (`str` or `function`, *optional*, defaults to
- `"gelu_pytorch_tanh"`):
- The non-linear activation function (function or string) in the
- encoder and pooler. If string, `"gelu"`, `"relu"`, `"selu"` and
- `"gelu_new"` ``"quick_gelu"` are supported.
- layer_norm_eps (`float`, *optional*, defaults to 1e-06):
- The epsilon used by the layer normalization layers.
- attention_dropout (`float`, *optional*, defaults to 0.0):
- The dropout ratio for the attention probabilities.
- Example:
- ```python
- >>> from transformers import SiglipVisionConfig, SiglipVisionModel
- >>> # Initializing a SiglipVisionConfig with google/siglip-base-patch16-224
- style configuration
- >>> configuration = SiglipVisionConfig()
- >>> # Initializing a SiglipVisionModel (with random weights) from the
- google/siglip-base-patch16-224 style configuration
- >>> model = SiglipVisionModel(configuration)
- >>> # Accessing the model configuration
- >>> configuration = model.config
- ```"""
-
- model_type = "siglip_vision_model"
-
- def __init__(
- self,
- hidden_size=768,
- intermediate_size=3072,
- num_hidden_layers=12,
- num_attention_heads=12,
- num_channels=3,
- image_size=224,
- patch_size=16,
- hidden_act="gelu_pytorch_tanh",
- layer_norm_eps=1e-6,
- attention_dropout=0.0,
- _flash_attn_2_enabled=True,
- **kwargs,
- ):
- super().__init__(**kwargs)
-
- self.hidden_size = hidden_size
- self.intermediate_size = intermediate_size
- self.num_hidden_layers = num_hidden_layers
- self.num_attention_heads = num_attention_heads
- self.num_channels = num_channels
- self.patch_size = patch_size
- self.image_size = image_size
- self.attention_dropout = attention_dropout
- self.layer_norm_eps = layer_norm_eps
- self.hidden_act = hidden_act
- self._flash_attn_2_enabled = _flash_attn_2_enabled
-
- @classmethod
- def from_pretrained(cls, pretrained_model_name_or_path: Union[str,
- os.PathLike],
- **kwargs) -> "PretrainedConfig":
- cls._set_token_in_kwargs(kwargs)
-
- config_dict, kwargs = cls.get_config_dict(
- pretrained_model_name_or_path, **kwargs)
-
- # get the vision config dict if we are loading from SiglipConfig
- if config_dict.get("model_type") == "siglip":
- config_dict = config_dict["vision_config"]
-
- if "model_type" in config_dict and hasattr(
- cls,
- "model_type") and config_dict["model_type"] != cls.model_type:
- logger.warning(
- "You are using a model of type %s to "
- "instantiate a model of type %s. This is not"
- " supported for all configurations of models and can yield"
- " errors.", config_dict['model_type'], cls.model_type)
-
- return cls.from_dict(config_dict, **kwargs)
-
-
-class SiglipConfig(PretrainedConfig):
- r"""
- [`SiglipConfig`] is the configuration class to store the configuration of a
- [`SiglipModel`]. It is used to instantiate a Siglip model according to the
- specified arguments, defining the text model and vision model configs.
- Instantiating a configuration with the defaults will yield a similar
- configuration to that of the Siglip [google/siglip-base-patch16-224](
- https://huggingface.co/google/siglip-base-patch16-224) architecture.
- Configuration objects inherit from [`PretrainedConfig`] and can be used to
- control the model outputs. Read the documentation from
- [`PretrainedConfig`] for more information.
- Args:
- text_config (`dict`, *optional*):
- Dictionary of configuration options used to initialize
- [`SiglipTextConfig`].
- vision_config (`dict`, *optional*):
- Dictionary of configuration options used to initialize
- [`SiglipVisionConfig`].
- kwargs (*optional*):
- Dictionary of keyword arguments.
- Example:
- ```python
- >>> from transformers import SiglipConfig, SiglipModel
- >>> # Initializing a SiglipConfig with google/siglip-base-patch16-224
- style configuration
- >>> configuration = SiglipConfig()
- >>> # Initializing a SiglipModel (with random weights) from the
- google/siglip-base-patch16-224 style configuration
- >>> model = SiglipModel(configuration)
- >>> # Accessing the model configuration
- >>> configuration = model.config
- >>> # We can also initialize a SiglipConfig from a SiglipTextConfig
- and a SiglipVisionConfig
- >>> from transformers import SiglipTextConfig, SiglipVisionConfig
- >>> # Initializing a SiglipText and SiglipVision configuration
- >>> config_text = SiglipTextConfig()
- >>> config_vision = SiglipVisionConfig()
- >>> config = SiglipConfig.from_text_vision_configs(config_text,
- config_vision)
- ```"""
-
- model_type = "siglip"
-
- def __init__(self, text_config=None, vision_config=None, **kwargs):
- super().__init__(**kwargs)
-
- if text_config is None:
- text_config = {}
- logger.info(
- "`text_config` is `None`. Initializing the `SiglipTextConfig`"
- " with default values.")
-
- if vision_config is None:
- vision_config = {}
- logger.info("`vision_config` is `None`. initializing the "
- "`SiglipVisionConfig` with default values.")
-
- self.text_config = SiglipTextConfig(**text_config)
- self.vision_config = SiglipVisionConfig(**vision_config)
-
- self.initializer_factor = 1.0
-
- @classmethod
- def from_text_vision_configs(cls, text_config: SiglipTextConfig,
- vision_config: SiglipVisionConfig, **kwargs):
- r"""
- Instantiate a [`SiglipConfig`] (or a derived class) from siglip text
- model configuration and siglip vision
- model configuration.
- Returns:
- [`SiglipConfig`]: An instance of a configuration object
- """
-
- return cls(text_config=text_config.to_dict(),
- vision_config=vision_config.to_dict(),
- **kwargs)
-
-
-# coding=utf-8
-# Copyright 2024 Google AI and The HuggingFace Team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-""" PyTorch Siglip model."""
-
-_CHECKPOINT_FOR_DOC = "google/siglip-base-patch16-224"
-
-SIGLIP_PRETRAINED_MODEL_ARCHIVE_LIST = [
- "google/siglip-base-patch16-224",
- # See all SigLIP models at https://huggingface.co/models?filter=siglip
-]
-
-
-# Copied from transformers.models.llama.modeling_llama._get_unpad_data
-def _get_unpad_data(attention_mask):
- seqlens_in_batch = attention_mask.sum(dim=-1, dtype=torch.int32)
- indices = torch.nonzero(attention_mask.flatten(), as_tuple=False).flatten()
- max_seqlen_in_batch = seqlens_in_batch.max().item()
- cu_seqlens = F.pad(
- torch.cumsum(seqlens_in_batch, dim=0, dtype=torch.torch.int32), (1, 0))
- return (
- indices,
- cu_seqlens,
- max_seqlen_in_batch,
- )
-
-
-def _trunc_normal_(tensor, mean, std, a, b):
- # Cut & paste from PyTorch official master until it's in a few official
- # releases - RW
- # Method based on https://people.sc.fsu.edu/~jburkardt/presentations/
- # truncated_normal.pdf
- def norm_cdf(x):
- # Computes standard normal cumulative distribution function
- return (1.0 + math.erf(x / math.sqrt(2.0))) / 2.0
-
- if (mean < a - 2 * std) or (mean > b + 2 * std):
- warnings.warn(
- "mean is more than 2 std from [a, b] in nn.init.trunc_normal_. "
- "The distribution of values may be incorrect.",
- stacklevel=2,
- )
-
- # Values are generated by using a truncated uniform distribution and
- # then using the inverse CDF for the normal distribution.
- # Get upper and lower cdf values
- l = norm_cdf((a - mean) / std) # noqa
- u = norm_cdf((b - mean) / std) # noqa
-
- # Uniformly fill tensor with values from [l, u], then translate to
- # [2l-1, 2u-1].
- tensor.uniform_(2 * l - 1, 2 * u - 1)
-
- # Use inverse cdf transform for normal distribution to get truncated
- # standard normal
- if tensor.dtype in [torch.float16, torch.bfloat16]:
- # The `erfinv_` op is not (yet?) defined in float16+cpu, bfloat16+gpu
- og_dtype = tensor.dtype
- tensor = tensor.to(torch.float32)
- tensor.erfinv_()
- tensor = tensor.to(og_dtype)
- else:
- tensor.erfinv_()
-
- # Transform to proper mean, std
- tensor.mul_(std * math.sqrt(2.0))
- tensor.add_(mean)
-
- # Clamp to ensure it's in the proper range
- if tensor.dtype == torch.float16:
- # The `clamp_` op is not (yet?) defined in float16+cpu
- tensor = tensor.to(torch.float32)
- tensor.clamp_(min=a, max=b)
- tensor = tensor.to(torch.float16)
- else:
- tensor.clamp_(min=a, max=b)
-
-
-def trunc_normal_tf_(tensor: torch.Tensor,
- mean: float = 0.0,
- std: float = 1.0,
- a: float = -2.0,
- b: float = 2.0) -> torch.Tensor:
- """Fills the input Tensor with values drawn from a truncated
- normal distribution. The values are effectively drawn from the
- normal distribution :math:`\\mathcal{N}(\text{mean}, \text{std}^2)`
- with values outside :math:`[a, b]` redrawn until they are within
- the bounds. The method used for generating the random values works
- best when :math:`a \\leq \text{mean} \\leq b`.
- NOTE: this 'tf' variant behaves closer to Tensorflow / JAX impl where
- the bounds [a, b] are applied when sampling the normal distribution with
- mean=0, std=1.0 and the result is subsequently scaled and shifted by the
- mean and std args.
- Args:
- tensor: an n-dimensional `torch.Tensor`
- mean: the mean of the normal distribution
- std: the standard deviation of the normal distribution
- a: the minimum cutoff value
- b: the maximum cutoff value
- """
- with torch.no_grad():
- _trunc_normal_(tensor, 0, 1.0, a, b)
- tensor.mul_(std).add_(mean)
-
-
-def variance_scaling_(tensor, scale=1.0, mode="fan_in", distribution="normal"):
- fan_in, fan_out = _calculate_fan_in_and_fan_out(tensor)
- if mode == "fan_in":
- denom = fan_in
- elif mode == "fan_out":
- denom = fan_out
- elif mode == "fan_avg":
- denom = (fan_in + fan_out) / 2
-
- variance = scale / denom
-
- if distribution == "truncated_normal":
- # constant is stddev of standard normal truncated to (-2, 2)
- trunc_normal_tf_(tensor, std=math.sqrt(variance) / 0.87962566103423978)
- elif distribution == "normal":
- with torch.no_grad():
- tensor.normal_(std=math.sqrt(variance))
- elif distribution == "uniform":
- bound = math.sqrt(3 * variance)
- with torch.no_grad():
- tensor.uniform_(-bound, bound)
- else:
- raise ValueError(f"invalid distribution {distribution}")
-
-
-def lecun_normal_(tensor):
- variance_scaling_(tensor, mode="fan_in", distribution="truncated_normal")
-
-
-def default_flax_embed_init(tensor):
- variance_scaling_(tensor, mode="fan_in", distribution="normal")
-
-
-@dataclass
-# Copied from transformers.models.clip.modeling_clip.CLIPVisionModelOutput with
-# CLIP->Siglip
-class SiglipVisionModelOutput(ModelOutput):
- """
- Base class for vision model's outputs that also contains image embeddings
- of the pooling of the last hidden states.
- Args:
- image_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)`
- *optional* returned when model is initialized with
- `with_projection=True`):
- The image embeddings obtained by applying the projection layer to
- the pooler_output.
- last_hidden_state (`torch.FloatTensor` of shape `(batch_size,
- sequence_length, hidden_size)`):
- Sequence of hidden-states at the output of the last layer of the
- model.
- hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when
- `output_hidden_states=True` is passed or when
- `config.output_hidden_states=True`):
- Tuple of `torch.FloatTensor` (one for the output of the embeddings,
- if the model has an embedding layer, + one for the output of each
- layer) of shape `(batch_size, sequence_length, hidden_size)`.
- Hidden-states of the model at the output of each layer plus the
- optional initial embedding outputs.
- attentions (`tuple(torch.FloatTensor)`, *optional*, returned when
- `output_attentions=True` is passed or when
- `config.output_attentions=True`):
- Tuple of `torch.FloatTensor` (one for each layer) of shape
- `(batch_size, num_heads, sequence_length, sequence_length)`.
- Attentions weights after the attention softmax, used to compute the
- weighted average in the self-attention heads.
- """
-
- image_embeds: Optional[torch.FloatTensor] = None
- last_hidden_state: torch.FloatTensor = None
- hidden_states: Optional[Tuple[torch.FloatTensor]] = None
- attentions: Optional[Tuple[torch.FloatTensor]] = None
-
-
-@dataclass
-# Copied from transformers.models.clip.modeling_clip.CLIPTextModelOutput with
-# CLIP->Siglip
-class SiglipTextModelOutput(ModelOutput):
- """
- Base class for text model's outputs that also contains a pooling of the
- last hidden states.
- Args:
- text_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)`
- *optional* returned when model is initialized with
- `with_projection=True`):
- The text embeddings obtained by applying the projection layer to
- model.
- the pooler_output.
- last_hidden_state (`torch.FloatTensor` of shape `(batch_size,
- sequence_length, hidden_size)`):
- Sequence of hidden-states at the output of the last layer of the
- hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when
- `output_hidden_states=True` is passed or when
- `config.output_hidden_states=True`):
- Tuple of `torch.FloatTensor` (one for the output of the
- embeddings, if the model has an embedding layer, + one for the
- output of each layer) of shape `(batch_size, sequence_length,
- hidden_size)`.
- Hidden-states of the model at the output of each layer plus the
- optional initial embedding outputs.
- attentions (`tuple(torch.FloatTensor)`, *optional*, returned when
- `output_attentions=True` is passed or when
- `config.output_attentions=True`):
- Tuple of `torch.FloatTensor` (one for each layer) of shape
- `(batch_size, num_heads, sequence_length, sequence_length)`.
- Attentions weights after the attention softmax, used to compute
- the weighted average in the self-attention heads.
- """
-
- text_embeds: Optional[torch.FloatTensor] = None
- last_hidden_state: torch.FloatTensor = None
- hidden_states: Optional[Tuple[torch.FloatTensor]] = None
- attentions: Optional[Tuple[torch.FloatTensor]] = None
-
-
-@dataclass
-# Copied from transformers.models.clip.modeling_clip.CLIPOutput with
-# CLIP->Siglip
-class SiglipOutput(ModelOutput):
- """
- Args:
- loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when
- `return_loss` is `True`):
- Contrastive loss for image-text similarity.
- logits_per_image:(`torch.FloatTensor` of shape `(image_batch_size,
- text_batch_size)`):
- The scaled dot product scores between `image_embeds` and
- `text_embeds`. This represents the image-text similarity scores.
- logits_per_text:(`torch.FloatTensor` of shape `(text_batch_size,
- image_batch_size)`):
- The scaled dot product scores between `text_embeds` and
- `image_embeds`. This represents the text-image similarity scores.
- text_embeds(`torch.FloatTensor` of shape `(batch_size, output_dim`):
- The text embeddings obtained by applying the projection layer to
- the pooled output of [`SiglipTextModel`].
- image_embeds(`torch.FloatTensor` of shape `(batch_size, output_dim`):
- The image embeddings obtained by applying the projection layer to
- the pooled output of [`SiglipVisionModel`].
- text_model_output(`BaseModelOutputWithPooling`):
- The output of the [`SiglipTextModel`].
- vision_model_output(`BaseModelOutputWithPooling`):
- The output of the [`SiglipVisionModel`].
- """
-
- loss: Optional[torch.FloatTensor] = None
- logits_per_image: torch.FloatTensor = None
- logits_per_text: torch.FloatTensor = None
- text_embeds: torch.FloatTensor = None
- image_embeds: torch.FloatTensor = None
- text_model_output: BaseModelOutputWithPooling = None
- vision_model_output: BaseModelOutputWithPooling = None
-
- def to_tuple(self) -> Tuple[Any]:
- return tuple(
- self[k] if k not in ["text_model_output", "vision_model_output"
- ] else getattr(self, k).to_tuple()
- for k in self.keys())
-
-
-class SiglipVisionEmbeddings(nn.Module):
-
- def __init__(self, config: SiglipVisionConfig):
- super().__init__()
- self.config = config
- self.embed_dim = config.hidden_size
- self.image_size = config.image_size
- self.patch_size = config.patch_size
-
- self.patch_embedding = nn.Conv2d(
- in_channels=config.num_channels,
- out_channels=self.embed_dim,
- kernel_size=self.patch_size,
- stride=self.patch_size,
- padding="valid",
- )
-
- self.num_patches_per_side = self.image_size // self.patch_size
- self.num_patches = self.num_patches_per_side**2
- self.num_positions = self.num_patches
- self.position_embedding = nn.Embedding(self.num_positions,
- self.embed_dim)
-
- def forward(self, pixel_values: torch.FloatTensor,
- patch_attention_mask: torch.BoolTensor) -> torch.Tensor:
- batch_size = pixel_values.size(0)
-
- patch_embeds = self.patch_embedding(pixel_values)
- embeddings = patch_embeds.flatten(2).transpose(1, 2)
-
- max_im_h, max_im_w = pixel_values.size(2), pixel_values.size(3)
- max_nb_patches_h, max_nb_patches_w = max_im_h // self.patch_size, \
- max_im_w // self.patch_size
- boundaries = torch.arange(1 / self.num_patches_per_side, 1.0,
- 1 / self.num_patches_per_side)
- position_ids = torch.full(
- size=(
- batch_size,
- max_nb_patches_h * max_nb_patches_w,
- ),
- fill_value=0,
- )
-
- for batch_idx, p_attn_mask in enumerate(patch_attention_mask):
- nb_patches_h = p_attn_mask[:, 0].sum()
- nb_patches_w = p_attn_mask[0].sum()
-
- fractional_coords_h = torch.linspace(0, 1 - 1 / nb_patches_h,
- nb_patches_h)
- fractional_coords_w = torch.linspace(0, 1 - 1 / nb_patches_w,
- nb_patches_w)
-
- bucket_coords_h = torch.bucketize(fractional_coords_h,
- boundaries,
- right=True)
- bucket_coords_w = torch.bucketize(fractional_coords_w,
- boundaries,
- right=True)
-
- pos_ids = (bucket_coords_h[:, None] * self.num_patches_per_side +
- bucket_coords_w).flatten()
- position_ids[batch_idx][p_attn_mask.view(-1).cpu()] = pos_ids
-
- position_ids = position_ids.to(self.position_embedding.weight.device)
-
- embeddings = embeddings + self.position_embedding(position_ids)
- return embeddings
-
-
-# Copied from transformers.models.clip.modeling_clip.CLIPTextEmbeddings with
-# CLIP->Siglip
-class SiglipTextEmbeddings(nn.Module):
-
- def __init__(self, config: SiglipTextConfig):
- super().__init__()
- embed_dim = config.hidden_size
-
- self.token_embedding = nn.Embedding(config.vocab_size, embed_dim)
- self.position_embedding = nn.Embedding(config.max_position_embeddings,
- embed_dim)
-
- # position_ids (1, len position emb) is contiguous in memory and
- # exported when serialized
- self.register_buffer(
- "position_ids",
- torch.arange(config.max_position_embeddings).expand((1, -1)),
- persistent=False)
-
- def forward(
- self,
- input_ids: Optional[torch.LongTensor] = None,
- position_ids: Optional[torch.LongTensor] = None,
- inputs_embeds: Optional[torch.FloatTensor] = None,
- ) -> torch.Tensor:
- seq_length = input_ids.shape[
- -1] if input_ids is not None else inputs_embeds.shape[-2]
-
- if position_ids is None:
- position_ids = self.position_ids[:, :seq_length]
-
- if inputs_embeds is None:
- inputs_embeds = self.token_embedding(input_ids)
-
- position_embeddings = self.position_embedding(position_ids)
- embeddings = inputs_embeds + position_embeddings
-
- return embeddings
-
-
-class SiglipAttention(nn.Module):
- """Multi-headed attention from 'Attention Is All You Need' paper"""
-
- # Copied from transformers.models.clip.modeling_clip.CLIPAttention.__init__
- def __init__(self, config):
- super().__init__()
- self.config = config
- self.embed_dim = config.hidden_size
- self.num_heads = config.num_attention_heads
- self.head_dim = self.embed_dim // self.num_heads
- if self.head_dim * self.num_heads != self.embed_dim:
- raise ValueError(
- f"embed_dim must be divisible by num_heads (got `embed_dim`:"
- f" {self.embed_dim} and `num_heads`: {self.num_heads}).")
- self.scale = self.head_dim**-0.5
- self.dropout = config.attention_dropout
-
- self.k_proj = nn.Linear(self.embed_dim, self.embed_dim)
- self.v_proj = nn.Linear(self.embed_dim, self.embed_dim)
- self.q_proj = nn.Linear(self.embed_dim, self.embed_dim)
- self.out_proj = nn.Linear(self.embed_dim, self.embed_dim)
-
- def forward(
- self,
- hidden_states: torch.Tensor,
- attention_mask: Optional[torch.Tensor] = None,
- output_attentions: Optional[bool] = False,
- ) -> Tuple[torch.Tensor, Optional[torch.Tensor],
- Optional[Tuple[torch.Tensor]]]:
- """Input shape: Batch x Time x Channel"""
-
- batch_size, q_len, _ = hidden_states.size()
-
- query_states = self.q_proj(hidden_states)
- key_states = self.k_proj(hidden_states)
- value_states = self.v_proj(hidden_states)
-
- query_states = query_states.view(batch_size, q_len, self.num_heads,
- self.head_dim).transpose(1, 2)
- key_states = key_states.view(batch_size, q_len, self.num_heads,
- self.head_dim).transpose(1, 2)
- value_states = value_states.view(batch_size, q_len, self.num_heads,
- self.head_dim).transpose(1, 2)
-
- k_v_seq_len = key_states.shape[-2]
- attn_weights = torch.matmul(query_states, key_states.transpose(
- 2, 3)) * self.scale
-
- if attn_weights.size() != (batch_size, self.num_heads, q_len,
- k_v_seq_len):
- raise ValueError(
- f"Attention weights should be of size "
- f"{(batch_size, self.num_heads, q_len, k_v_seq_len)}, but is"
- f" {attn_weights.size()}")
-
- if attention_mask is not None:
- if attention_mask.size() != (batch_size, 1, q_len, k_v_seq_len):
- raise ValueError(f"Attention mask should be of size "
- f"{(batch_size, 1, q_len, k_v_seq_len)}, "
- f"but is {attention_mask.size()}")
- attn_weights = attn_weights + attention_mask
-
- # upcast attention to fp32
- attn_weights = nn.functional.softmax(attn_weights,
- dim=-1,
- dtype=torch.float32).to(
- query_states.dtype)
- attn_weights = nn.functional.dropout(attn_weights,
- p=self.dropout,
- training=self.training)
- attn_output = torch.matmul(attn_weights, value_states)
-
- if attn_output.size() != (batch_size, self.num_heads, q_len,
- self.head_dim):
- raise ValueError(
- f"`attn_output` should be of size "
- f"{(batch_size, self.num_heads, q_len, self.head_dim)}, but is"
- f" {attn_output.size()}")
-
- attn_output = attn_output.transpose(1, 2).contiguous()
- attn_output = attn_output.reshape(batch_size, q_len, self.embed_dim)
-
- attn_output = self.out_proj(attn_output)
-
- return attn_output, attn_weights
-
-
-class SiglipFlashAttention2(SiglipAttention):
- """
- Llama flash attention module. This module inherits from `LlamaAttention` as
- the weights of the module stays untouched. The only required change would
- be on the forward pass where it needs to correctly call the public API of
- flash attention and deal with padding tokens in case the input contains any
- of them.
- """
-
- def __init__(self, *args, **kwargs):
- super().__init__(*args, **kwargs)
- self.is_causal = False # Hack to make sure we don't use a causal mask
-
- def forward(
- self,
- hidden_states: torch.Tensor,
- attention_mask: Optional[torch.LongTensor] = None,
- position_ids: Optional[torch.LongTensor] = None,
- past_key_value: Optional[Tuple[torch.Tensor]] = None,
- output_attentions: bool = False,
- use_cache: bool = False,
- **kwargs,
- ) -> Tuple[torch.Tensor, Optional[torch.Tensor],
- Optional[Tuple[torch.Tensor]]]:
- output_attentions = False
-
- bsz, q_len, _ = hidden_states.size()
-
- query_states = self.q_proj(hidden_states)
- key_states = self.k_proj(hidden_states)
- value_states = self.v_proj(hidden_states)
-
- # Flash attention requires the input to have the shape
- # batch_size x seq_length x head_dim x hidden_dim
- # therefore we just need to keep the original shape
- query_states = query_states.view(bsz, q_len, self.num_heads,
- self.head_dim).transpose(1, 2)
- key_states = key_states.view(bsz, q_len, self.num_heads,
- self.head_dim).transpose(1, 2)
- value_states = value_states.view(bsz, q_len, self.num_heads,
- self.head_dim).transpose(1, 2)
-
- kv_seq_len = key_states.shape[-2]
- if past_key_value is not None:
- kv_seq_len += past_key_value.get_usable_length(
- kv_seq_len, self.layer_idx)
-
- # TODO: These transpose are quite inefficient but Flash Attention
- # requires the layout [batch_size, sequence_length, num_heads,
- # head_dim]. We would need to refactor the KV cache
- # to be able to avoid many of these transpose/reshape/view.
- query_states = query_states.transpose(1, 2)
- key_states = key_states.transpose(1, 2)
- value_states = value_states.transpose(1, 2)
-
- dropout_rate = self.dropout if self.training else 0.0
-
- # In PEFT, usually we cast the layer norms in float32 for training
- # stability reasons therefore the input hidden states gets silently
- # casted in float32. Hence, we need cast them back in the correct
- # dtype just to be sure everything works as expected.
- # This might slowdown training & inference so it is recommended to
- # not cast the LayerNorms in fp32. (LlamaRMSNorm handles it correctly)
-
- input_dtype = query_states.dtype
- if input_dtype == torch.float32:
- if torch.is_autocast_enabled():
- target_dtype = torch.get_autocast_gpu_dtype()
- # Handle the case where the model is quantized
- elif hasattr(self.config, "_pre_quantization_dtype"):
- target_dtype = self.config._pre_quantization_dtype
- else:
- target_dtype = self.q_proj.weight.dtype
-
- logger.warning_once(
- "The input hidden states seems to be silently casted in "
- "float32, this might be related to the fact you have upcasted "
- "embedding or layer norm layers in float32. We will cast "
- f"back the input in {target_dtype}.")
-
- query_states = query_states.to(target_dtype)
- key_states = key_states.to(target_dtype)
- value_states = value_states.to(target_dtype)
-
- attn_output = self._flash_attention_forward(query_states,
- key_states,
- value_states,
- attention_mask,
- q_len,
- dropout=dropout_rate)
-
- attn_output = attn_output.reshape(bsz, q_len,
- self.embed_dim).contiguous()
- attn_output = self.out_proj(attn_output)
-
- if not output_attentions:
- attn_weights = None
-
- return attn_output, attn_weights
-
- def _flash_attention_forward(self,
- query_states,
- key_states,
- value_states,
- attention_mask,
- query_length,
- dropout=0.0,
- softmax_scale=None):
- """
- Calls the forward method of Flash Attention - if the input hidden
- states contain at least one padding token first unpad the input,
- then computes the attention scores and pad the final attention
- scores.
- Args:
- query_states (`torch.Tensor`):
- Input query states to be passed to Flash Attention API
- key_states (`torch.Tensor`):
- Input key states to be passed to Flash Attention API
- value_states (`torch.Tensor`):
- Input value states to be passed to Flash Attention API
- attention_mask (`torch.Tensor`):
- The padding mask - corresponds to a tensor of size
- `(batch_size, seq_len)` where 0 stands for the position
- of padding tokens and 1 for the position of non-padding
- tokens.
- dropout (`int`, *optional*):
- Attention dropout
- softmax_scale (`float`, *optional*):
- The scaling of QK^T before applying softmax. Default to 1 /
- sqrt(head_dim)
- """
- from flash_attn import flash_attn_func, flash_attn_varlen_func
- from flash_attn.bert_padding import pad_input # noqa
-
- # TODO: Remove the `query_length != 1` check once Flash Attention for
- # RoCm is bumped to 2.1. For details, please see the comment in
- # LlamaFlashAttention2 __init__.
- causal = self.is_causal and query_length != 1
-
- # Contains at least one padding token in the sequence
- if attention_mask is not None:
- batch_size = query_states.shape[0]
- query_states, key_states, value_states, indices_q, cu_seq_lens, \
- max_seq_lens = self._upad_input(
- query_states, key_states, value_states, attention_mask,
- query_length)
-
- cu_seqlens_q, cu_seqlens_k = cu_seq_lens
- max_seqlen_in_batch_q, max_seqlen_in_batch_k = max_seq_lens
-
- attn_output_unpad = flash_attn_varlen_func(
- query_states,
- key_states,
- value_states,
- cu_seqlens_q=cu_seqlens_q,
- cu_seqlens_k=cu_seqlens_k,
- max_seqlen_q=max_seqlen_in_batch_q,
- max_seqlen_k=max_seqlen_in_batch_k,
- dropout_p=dropout,
- softmax_scale=softmax_scale,
- causal=causal,
- )
-
- attn_output = pad_input(attn_output_unpad, indices_q, batch_size,
- query_length)
- else:
- attn_output = flash_attn_func(query_states,
- key_states,
- value_states,
- dropout,
- softmax_scale=softmax_scale,
- causal=causal)
-
- return attn_output
-
- def _upad_input(self, query_layer, key_layer, value_layer, attention_mask,
- query_length):
- from flash_attn.bert_padding import index_first_axis, unpad_input
- indices_k, cu_seqlens_k, max_seqlen_in_batch_k = _get_unpad_data(
- attention_mask)
- batch_size, kv_seq_len, num_key_value_heads, head_dim = key_layer.shape
-
- key_layer = index_first_axis(
- key_layer.reshape(batch_size * kv_seq_len, num_key_value_heads,
- head_dim), indices_k)
- value_layer = index_first_axis(
- value_layer.reshape(batch_size * kv_seq_len, num_key_value_heads,
- head_dim), indices_k)
- if query_length == kv_seq_len:
- query_layer = index_first_axis(
- query_layer.reshape(batch_size * kv_seq_len, self.num_heads,
- head_dim), indices_k)
- cu_seqlens_q = cu_seqlens_k
- max_seqlen_in_batch_q = max_seqlen_in_batch_k
- indices_q = indices_k
- elif query_length == 1:
- max_seqlen_in_batch_q = 1
- cu_seqlens_q = torch.arange(
- batch_size + 1, dtype=torch.int32, device=query_layer.device
- ) # There is a memcpy here, that is very bad.
- indices_q = cu_seqlens_q[:-1]
- query_layer = query_layer.squeeze(1)
- else:
- # The -q_len: slice assumes left padding.
- attention_mask = attention_mask[:, -query_length:]
- query_layer, indices_q, cu_seqlens_q, max_seqlen_in_batch_q = \
- unpad_input(query_layer, attention_mask)
-
- return (
- query_layer,
- key_layer,
- value_layer,
- indices_q,
- (cu_seqlens_q, cu_seqlens_k),
- (max_seqlen_in_batch_q, max_seqlen_in_batch_k),
- )
-
-
-# Copied from transformers.models.clip.modeling_clip.CLIPMLP with CLIP->Siglip
-class SiglipMLP(nn.Module):
-
- def __init__(self, config):
- super().__init__()
- self.config = config
- self.activation_fn = ACT2FN[config.hidden_act]
- self.fc1 = nn.Linear(config.hidden_size, config.intermediate_size)
- self.fc2 = nn.Linear(config.intermediate_size, config.hidden_size)
-
- def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
- hidden_states = self.fc1(hidden_states)
- hidden_states = self.activation_fn(hidden_states)
- hidden_states = self.fc2(hidden_states)
- return hidden_states
-
-
-# Copied from transformers.models.clip.modeling_clip.CLIPEncoderLayer with
-# CLIP->Siglip
-class SiglipEncoderLayer(nn.Module):
-
- def __init__(self, config: SiglipConfig):
- super().__init__()
- self.embed_dim = config.hidden_size
- self.self_attn = (SiglipAttention(config) if
- not getattr(config, "_flash_attn_2_enabled", False)
- else SiglipFlashAttention2(config))
- self.layer_norm1 = nn.LayerNorm(self.embed_dim,
- eps=config.layer_norm_eps)
- self.mlp = SiglipMLP(config)
- self.layer_norm2 = nn.LayerNorm(self.embed_dim,
- eps=config.layer_norm_eps)
-
- def forward(
- self,
- hidden_states: torch.Tensor,
- attention_mask: torch.Tensor,
- output_attentions: Optional[bool] = False,
- ) -> Tuple[torch.FloatTensor]:
- """
- Args:
- hidden_states (`torch.FloatTensor`):
- Input to the layer of shape `(batch, seq_len, embed_dim)`.
- attention_mask (`torch.FloatTensor`):
- Attention mask of shape `(batch, 1, q_len, k_v_seq_len)` where
- padding elements are indicated by very large negative values.
- output_attentions (`bool`, *optional*, defaults to `False`):
- Whether or not to return the attentions tensors of all
- attention layers. See `attentions` under returned tensors for
- more detail.
- """
- residual = hidden_states
-
- hidden_states = self.layer_norm1(hidden_states)
- hidden_states, attn_weights = self.self_attn(
- hidden_states=hidden_states,
- attention_mask=attention_mask,
- output_attentions=output_attentions,
- )
- hidden_states = residual + hidden_states
-
- residual = hidden_states
- hidden_states = self.layer_norm2(hidden_states)
- hidden_states = self.mlp(hidden_states)
- hidden_states = residual + hidden_states
-
- outputs = (hidden_states, )
-
- if output_attentions:
- outputs += (attn_weights, )
-
- return outputs
-
-
-class SiglipPreTrainedModel(PreTrainedModel):
- """
- An abstract class to handle weights initialization and a simple interface
- for downloading and loading pretrained models.
- """
-
- config_class = SiglipConfig
- base_model_prefix = "siglip"
- supports_gradient_checkpointing = True
-
- def _init_weights(self, module):
- """Initialize the weights"""
-
- if isinstance(module, SiglipVisionEmbeddings):
- width = (self.config.vision_config.hidden_size if isinstance(
- self.config, SiglipConfig) else self.config.hidden_size)
- nn.init.normal_(module.position_embedding.weight,
- std=1 / np.sqrt(width))
- elif isinstance(module, nn.Embedding):
- default_flax_embed_init(module.weight)
- elif isinstance(module, SiglipAttention):
- nn.init.normal_(module.q_proj.weight)
- nn.init.normal_(module.k_proj.weight)
- nn.init.normal_(module.v_proj.weight)
- nn.init.normal_(module.out_proj.weight)
- nn.init.zeros_(module.q_proj.bias)
- nn.init.zeros_(module.k_proj.bias)
- nn.init.zeros_(module.v_proj.bias)
- nn.init.zeros_(module.out_proj.bias)
- elif isinstance(module, SiglipMLP):
- nn.init.normal_(module.fc1.weight)
- nn.init.normal_(module.fc2.weight)
- nn.init.normal_(module.fc1.bias, std=1e-6)
- nn.init.normal_(module.fc2.bias, std=1e-6)
- elif isinstance(module, SiglipMultiheadAttentionPoolingHead):
- nn.init.normal_(module.probe.data)
- nn.init.normal_(module.attention.in_proj_weight.data)
- nn.init.zeros_(module.attention.in_proj_bias.data)
- elif isinstance(module, SiglipModel):
- logit_scale_init = torch.tensor(0.0)
- module.logit_scale.data.fill_(logit_scale_init)
- module.logit_bias.data.zero_()
- elif isinstance(module, (nn.Linear, nn.Conv2d)):
- lecun_normal_(module.weight)
- if module.bias is not None:
- nn.init.zeros_(module.bias)
- elif isinstance(module, nn.LayerNorm):
- module.bias.data.zero_()
- module.weight.data.fill_(1.0)
-
-
-SIGLIP_START_DOCSTRING = r"""
- This model inherits from [`PreTrainedModel`]. Check the superclass
- documentation for the generic methods the library implements for all
- its model (such as downloading or saving, resizing the input embeddings,
- pruning heads etc.)
- This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/
- stable/nn.html#torch.nn.Module) subclass.
- Use it as a regular PyTorch Module and refer to the PyTorch documentation
- for all matter related to general usage and behavior.
- Parameters:
- config ([`SiglipConfig`]): Model configuration class with all the
- parameters of the model.
- Initializing with a config file does not load the weights
- associated with the model, only the configuration. Check out
- the [`~PreTrainedModel.from_pretrained`] method to load the
- model weights.
-"""
-
-SIGLIP_TEXT_INPUTS_DOCSTRING = r"""
- Args:
- input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)
- `):
- Indices of input sequence tokens in the vocabulary. Padding will
- be ignored by default should you provide it.
- Indices can be obtained using [`AutoTokenizer`]. See
- [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`]
- for details. [What are input IDs?](../glossary#input-ids)
- attention_mask (`torch.Tensor` of shape `(batch_size,
- sequence_length)`, *optional*):
- Mask to avoid performing attention on padding token indices. Mask
- values selected in `[0, 1]`:
- - 1 for tokens that are **not masked**,
- - 0 for tokens that are **masked**.
- [What are attention masks?](../glossary#attention-mask)
- position_ids (`torch.LongTensor` of shape `(batch_size,
- sequence_length)`, *optional*):
- Indices of positions of each input sequence tokens in the position
- embeddings. Selected in the range `[0,
- config.max_position_embeddings - 1]`.
- [What are position IDs?](../glossary#position-ids)
- output_attentions (`bool`, *optional*):
- Whether or not to return the attentions tensors of all attention
- layers. See `attentions` under returned tensors for more detail.
- output_hidden_states (`bool`, *optional*):
- Whether or not to return the hidden states of all layers. See
- `hidden_states` under returned tensors for more detail.
- return_dict (`bool`, *optional*):
- Whether or not to return a [`~utils.ModelOutput`] instead of a
- plain tuple.
-"""
-
-SIGLIP_VISION_INPUTS_DOCSTRING = r"""
- Args:
- pixel_values (`torch.FloatTensor` of shape `(batch_size,
- num_channels, height, width)`):
- Pixel values. Padding will be ignored by default should you
- provide it. Pixel values can be obtained using
- [`AutoImageProcessor`]. See [`CLIPImageProcessor.__call__`]
- for details.
- output_attentions (`bool`, *optional*):
- Whether or not to return the attentions tensors of all attention
- layers. See `attentions` under returned
- tensors for more detail.
- output_hidden_states (`bool`, *optional*):
- Whether or not to return the hidden states of all layers. See
- `hidden_states` under returned tensors for more detail.
- return_dict (`bool`, *optional*):
- Whether or not to return a [`~utils.ModelOutput`] instead of a
- plain tuple.
-"""
-
-SIGLIP_INPUTS_DOCSTRING = r"""
- Args:
- input_ids (`torch.LongTensor` of shape `(batch_size,
- sequence_length)`):
- Indices of input sequence tokens in the vocabulary. Padding
- will be ignored by default should you provide it.
- Indices can be obtained using [`AutoTokenizer`]. See
- [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`]
- for details. [What are input IDs?](../glossary#input-ids)
- attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`
- , *optional*):
- Mask to avoid performing attention on padding token indices. Mask
- values selected in `[0, 1]`:
- - 1 for tokens that are **not masked**,
- - 0 for tokens that are **masked**.
- [What are attention masks?](../glossary#attention-mask)
- position_ids (`torch.LongTensor` of shape `(batch_size,
- sequence_length)`, *optional*):
- Indices of positions of each input sequence tokens in the position
- embeddings. Selected in the range `[0,
- config.max_position_embeddings - 1]`.
- [What are position IDs?](../glossary#position-ids)
- pixel_values (`torch.FloatTensor` of shape `(batch_size,
- num_channels, height, width)`):
- Pixel values. Padding will be ignored by default should you
- provide it. Pixel values can be obtained using
- [`AutoImageProcessor`]. See [`CLIPImageProcessor.__call__`]
- for details.
- return_loss (`bool`, *optional*):
- Whether or not to return the contrastive loss.
- output_attentions (`bool`, *optional*):
- Whether or not to return the attentions tensors of all attention
- layers. See `attentions` under returned tensors for more detail.
- output_hidden_states (`bool`, *optional*):
- Whether or not to return the hidden states of all layers. See
- `hidden_states` under returned tensors for
- more detail.
- return_dict (`bool`, *optional*):
- Whether or not to return a [`~utils.ModelOutput`] instead of a
- plain tuple.
-"""
-
-
-# Copied from transformers.models.clip.modeling_clip.CLIPEncoder with
-# CLIP->Siglip
-class SiglipEncoder(nn.Module):
- """
- Transformer encoder consisting of `config.num_hidden_layers`
- self attention layers. Each layer is a [`SiglipEncoderLayer`].
- Args:
- config: SiglipConfig
- """
-
- def __init__(self, config: SiglipConfig):
- super().__init__()
- self.config = config
- self.layers = nn.ModuleList([
- SiglipEncoderLayer(config) for _ in range(config.num_hidden_layers)
- ])
- self.gradient_checkpointing = False
-
- # Ignore copy
- def forward(
- self,
- inputs_embeds,
- attention_mask: Optional[torch.Tensor] = None,
- output_attentions: Optional[bool] = None,
- output_hidden_states: Optional[bool] = None,
- return_dict: Optional[bool] = None,
- ) -> Union[Tuple, BaseModelOutput]:
- r"""
- Args:
- inputs_embeds (`torch.FloatTensor` of shape `(batch_size,
- sequence_length, hidden_size)`):
- Optionally, instead of passing `input_ids` you can choose to
- directly pass an embedded representation.
- This is useful if you want more control over how to convert
- `input_ids` indices into associated vectors
- than the model's internal embedding lookup matrix.
- attention_mask (`torch.Tensor` of shape `(batch_size,
- sequence_length)`, *optional*):
- Mask to avoid performing attention on padding token indices.
- Mask values selected in `[0, 1]`:
- - 1 for tokens that are **not masked**,
- - 0 for tokens that are **masked**.
- [What are attention masks?](../glossary#attention-mask)
- output_attentions (`bool`, *optional*):
- Whether or not to return the attentions tensors of all
- attention layers. See `attentions` under returned tensors for
- more detail.
- output_hidden_states (`bool`, *optional*):
- Whether or not to return the hidden states of all layers. See
- `hidden_states` under returned tensors for more detail.
- return_dict (`bool`, *optional*):
- Whether or not to return a [`~utils.ModelOutput`] instead of a
- plain tuple.
- """
- output_attentions = output_attentions if output_attentions \
- is not None else self.config.output_attentions
- output_hidden_states = (output_hidden_states
- if output_hidden_states is not None else
- self.config.output_hidden_states)
- return_dict = return_dict if return_dict is not None else \
- self.config.use_return_dict
-
- encoder_states = () if output_hidden_states else None
- all_attentions = () if output_attentions else None
-
- hidden_states = inputs_embeds
- for encoder_layer in self.layers:
- if output_hidden_states:
- encoder_states = encoder_states + (hidden_states, )
- if self.gradient_checkpointing and self.training:
- layer_outputs = self._gradient_checkpointing_func(
- encoder_layer.__call__,
- hidden_states,
- attention_mask,
- output_attentions,
- )
- else:
- layer_outputs = encoder_layer(
- hidden_states,
- attention_mask,
- output_attentions=output_attentions,
- )
-
- hidden_states = layer_outputs[0]
-
- if output_attentions:
- all_attentions = all_attentions + (layer_outputs[1], )
-
- if output_hidden_states:
- encoder_states = encoder_states + (hidden_states, )
-
- if not return_dict:
- return tuple(
- v for v in [hidden_states, encoder_states, all_attentions]
- if v is not None)
- return BaseModelOutput(last_hidden_state=hidden_states,
- hidden_states=encoder_states,
- attentions=all_attentions)
-
-
-class SiglipTextTransformer(nn.Module):
-
- def __init__(self, config: SiglipTextConfig):
- super().__init__()
- self.config = config
- embed_dim = config.hidden_size
- self.embeddings = SiglipTextEmbeddings(config)
- self.encoder = SiglipEncoder(config)
- self.final_layer_norm = nn.LayerNorm(embed_dim,
- eps=config.layer_norm_eps)
-
- self.head = nn.Linear(embed_dim, embed_dim)
-
- @add_start_docstrings_to_model_forward(SIGLIP_TEXT_INPUTS_DOCSTRING)
- @replace_return_docstrings(output_type=BaseModelOutputWithPooling,
- config_class=SiglipTextConfig)
- def forward(
- self,
- input_ids: Optional[torch.Tensor] = None,
- attention_mask: Optional[torch.Tensor] = None,
- position_ids: Optional[torch.Tensor] = None,
- output_attentions: Optional[bool] = None,
- output_hidden_states: Optional[bool] = None,
- return_dict: Optional[bool] = None,
- ) -> Union[Tuple, BaseModelOutputWithPooling]:
- r"""
- Returns:
- """
- output_attentions = output_attentions if output_attentions \
- is not None else self.config.output_attentions
- output_hidden_states = (output_hidden_states
- if output_hidden_states \
- is not None else
- self.config.output_hidden_states)
- return_dict = return_dict if return_dict is not None else \
- self.config.use_return_dict
-
- if input_ids is None:
- raise ValueError("You have to specify input_ids")
-
- input_shape = input_ids.size()
- input_ids = input_ids.view(-1, input_shape[-1])
-
- hidden_states = self.embeddings(input_ids=input_ids,
- position_ids=position_ids)
-
- # note: SigLIP's text model does not use a causal mask, unlike the
- # original CLIP model.
- # expand attention_mask
- if attention_mask is not None:
- # [batch_size, seq_len] ->
- # [batch_size, 1, tgt_seq_len, src_seq_len]
- attention_mask = _prepare_4d_attention_mask(
- attention_mask, hidden_states.dtype)
-
- encoder_outputs = self.encoder(
- inputs_embeds=hidden_states,
- attention_mask=attention_mask,
- output_attentions=output_attentions,
- output_hidden_states=output_hidden_states,
- return_dict=return_dict,
- )
-
- last_hidden_state = encoder_outputs[0]
- last_hidden_state = self.final_layer_norm(last_hidden_state)
-
- # Assuming "sticky" EOS tokenization, last token is always EOS.
- pooled_output = last_hidden_state[:, -1, :]
- pooled_output = self.head(pooled_output)
-
- if not return_dict:
- return (last_hidden_state, pooled_output) + encoder_outputs[1:]
-
- return BaseModelOutputWithPooling(
- last_hidden_state=last_hidden_state,
- pooler_output=pooled_output,
- hidden_states=encoder_outputs.hidden_states,
- attentions=encoder_outputs.attentions,
- )
-
-
-@add_start_docstrings(
- """The text model from SigLIP without any head or projection on top.""",
- SIGLIP_START_DOCSTRING,
-)
-class SiglipTextModel(SiglipPreTrainedModel):
- config_class = SiglipTextConfig
-
- _no_split_modules = ["SiglipTextEmbeddings", "SiglipEncoderLayer"]
-
- def __init__(self, config: SiglipTextConfig):
- super().__init__(config)
- self.text_model = SiglipTextTransformer(config)
- # Initialize weights and apply final processing
- self.post_init()
-
- def get_input_embeddings(self) -> nn.Module:
- return self.text_model.embeddings.token_embedding
-
- def set_input_embeddings(self, value):
- self.text_model.embeddings.token_embedding = value
-
- @add_start_docstrings_to_model_forward(SIGLIP_TEXT_INPUTS_DOCSTRING)
- @replace_return_docstrings(output_type=BaseModelOutputWithPooling,
- config_class=SiglipTextConfig)
- def forward(
- self,
- input_ids: Optional[torch.Tensor] = None,
- attention_mask: Optional[torch.Tensor] = None,
- position_ids: Optional[torch.Tensor] = None,
- output_attentions: Optional[bool] = None,
- output_hidden_states: Optional[bool] = None,
- return_dict: Optional[bool] = None,
- ) -> Union[Tuple, BaseModelOutputWithPooling]:
- r"""
- Returns:
- Examples:
- ```python
- >>> from transformers import AutoTokenizer, SiglipTextModel
- >>> model = SiglipTextModel.
- from_pretrained("google/siglip-base-patch16-224")
- >>> tokenizer = AutoTokenizer.
- from_pretrained("google/siglip-base-patch16-224")
- >>> # important: make sure to set padding="max_length"
- as that's how the model was trained
- >>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"],
- padding="max_length", return_tensors="pt")
- >>> outputs = model(**inputs)
- >>> last_hidden_state = outputs.last_hidden_state
- >>> pooled_output = outputs.pooler_output # pooled (EOS token)
- states
- ```"""
- return_dict = return_dict if return_dict is not None else \
- self.config.use_return_dict
-
- return self.text_model(
- input_ids=input_ids,
- attention_mask=attention_mask,
- position_ids=position_ids,
- output_attentions=output_attentions,
- output_hidden_states=output_hidden_states,
- return_dict=return_dict,
- )
-
-
-class SiglipVisionTransformer(nn.Module):
-
- def __init__(self, config: SiglipVisionConfig):
- super().__init__()
- self.config = config
- embed_dim = config.hidden_size
-
- self.embeddings = SiglipVisionEmbeddings(config)
- self.encoder = SiglipEncoder(config)
- self.post_layernorm = nn.LayerNorm(embed_dim,
- eps=config.layer_norm_eps)
- self.head = SiglipMultiheadAttentionPoolingHead(config)
-
- @add_start_docstrings_to_model_forward(SIGLIP_VISION_INPUTS_DOCSTRING)
- @replace_return_docstrings(output_type=BaseModelOutputWithPooling,
- config_class=SiglipVisionConfig)
- def forward(
- self,
- pixel_values,
- patch_attention_mask: Optional[torch.BoolTensor] = None,
- output_attentions: Optional[bool] = None,
- output_hidden_states: Optional[bool] = None,
- return_dict: Optional[bool] = None,
- ) -> Union[Tuple, BaseModelOutputWithPooling]:
- r"""
- Returns:
- """
- output_attentions = output_attentions if output_attentions is not None\
- else self.config.output_attentions
- output_hidden_states = (output_hidden_states
- if output_hidden_states is not None else
- self.config.output_hidden_states)
- return_dict = return_dict if return_dict is not None \
- else self.config.use_return_dict
-
- batch_size = pixel_values.size(0)
- if patch_attention_mask is None:
- patch_attention_mask = torch.ones(
- size=(
- batch_size,
- pixel_values.size(2) // self.config.patch_size,
- pixel_values.size(3) // self.config.patch_size,
- ),
- dtype=torch.bool,
- device=pixel_values.device,
- )
-
- hidden_states = self.embeddings(
- pixel_values=pixel_values,
- patch_attention_mask=patch_attention_mask)
-
- patch_attention_mask = patch_attention_mask.view(batch_size, -1)
- # The call to `_upad_input` in `_flash_attention_forward` is expensive
- # So when the `patch_attention_mask` is full of 1s (i.e. attending
- # to the whole sequence), avoiding passing the attention_mask, which
- # is equivalent to attending to the full sequence
- if not torch.any(~patch_attention_mask):
- attention_mask = None
- else:
- attention_mask = (_prepare_4d_attention_mask(
- patch_attention_mask, hidden_states.dtype)
- if not self.config._flash_attn_2_enabled else
- patch_attention_mask)
-
- encoder_outputs = self.encoder(
- inputs_embeds=hidden_states,
- attention_mask=attention_mask,
- output_attentions=output_attentions,
- output_hidden_states=output_hidden_states,
- return_dict=return_dict,
- )
-
- last_hidden_state = encoder_outputs[0]
- last_hidden_state = self.post_layernorm(last_hidden_state)
-
- pooled_output = self.head(
- hidden_state=last_hidden_state,
- attention_mask=patch_attention_mask,
- )
-
- if not return_dict:
- return (last_hidden_state, pooled_output) + encoder_outputs[1:]
-
- return BaseModelOutputWithPooling(
- last_hidden_state=last_hidden_state,
- pooler_output=pooled_output,
- hidden_states=encoder_outputs.hidden_states,
- attentions=encoder_outputs.attentions,
- )
-
-
-class SiglipMultiheadAttentionPoolingHead(nn.Module):
- """Multihead Attention Pooling."""
-
- def __init__(self, config: SiglipVisionConfig):
- super().__init__()
-
- self.probe = nn.Parameter(torch.randn(1, 1, config.hidden_size))
- self.attention = torch.nn.MultiheadAttention(
- config.hidden_size, config.num_attention_heads, batch_first=True)
- self.layernorm = nn.LayerNorm(config.hidden_size,
- eps=config.layer_norm_eps)
- self.mlp = SiglipMLP(config)
-
- def forward(self, hidden_state, attention_mask):
- batch_size = hidden_state.shape[0]
- probe = self.probe.repeat(batch_size, 1, 1)
-
- hidden_state = self.attention(query=probe,
- key=hidden_state,
- value=hidden_state,
- key_padding_mask=~attention_mask)[0]
-
- residual = hidden_state
- hidden_state = self.layernorm(hidden_state)
- hidden_state = residual + self.mlp(hidden_state)
-
- return hidden_state[:, 0]
-
-
-@add_start_docstrings(
- """The vision model from SigLIP without any head or projection on top.""",
- SIGLIP_START_DOCSTRING,
-)
-class SiglipVisionModel(SiglipPreTrainedModel):
- config_class = SiglipVisionConfig
- main_input_name = "pixel_values"
-
- def __init__(self, config: SiglipVisionConfig):
- super().__init__(config)
-
- self.vision_model = SiglipVisionTransformer(config)
-
- # Initialize weights and apply final processing
- self.post_init()
-
- def get_input_embeddings(self) -> nn.Module:
- return self.vision_model.embeddings.patch_embedding
-
- @add_start_docstrings_to_model_forward(SIGLIP_VISION_INPUTS_DOCSTRING)
- @replace_return_docstrings(output_type=BaseModelOutputWithPooling,
- config_class=SiglipVisionConfig)
- def forward(
- self,
- pixel_values,
- patch_attention_mask: Optional[torch.BoolTensor] = None,
- output_attentions: Optional[bool] = None,
- output_hidden_states: Optional[bool] = None,
- return_dict: Optional[bool] = None,
- ) -> Union[Tuple, BaseModelOutputWithPooling]:
- r"""
- Returns:
- Examples:
- ```python
- >>> from PIL import Image
- >>> import requests
- >>> from transformers import AutoProcessor, SiglipVisionModel
- >>> model = SiglipVisionModel.from_pretrained(
- "google/siglip-base-patch16-224")
- >>> processor = AutoProcessor.from_pretrained(
- "google/siglip-base-patch16-224")
- >>> url =
- "http://images.cocodataset.org/val2017/000000039769.jpg"
- >>> image = Image.open(requests.get(url, stream=True).raw)
- >>> inputs = processor(images=image, return_tensors="pt")
- >>> outputs = model(**inputs)
- >>> last_hidden_state = outputs.last_hidden_state
- >>> pooled_output = outputs.pooler_output # pooled features
- ```"""
- return_dict = return_dict if return_dict is not None \
- else self.config.use_return_dict
-
- return self.vision_model(
- pixel_values=pixel_values,
- patch_attention_mask=patch_attention_mask,
- output_attentions=output_attentions,
- output_hidden_states=output_hidden_states,
- return_dict=return_dict,
- )
-
-
-@add_start_docstrings(SIGLIP_START_DOCSTRING)
-class SiglipModel(SiglipPreTrainedModel):
- config_class = SiglipConfig
-
- def __init__(self, config: SiglipConfig):
- super().__init__(config)
-
- if not isinstance(config.text_config, SiglipTextConfig):
- raise ValueError("config.text_config is expected to be of type "
- f"SiglipTextConfig but is of type"
- f" {type(config.text_config)}.")
-
- if not isinstance(config.vision_config, SiglipVisionConfig):
- raise ValueError("config.vision_config is expected to be of type "
- "SiglipVisionConfig but is of type"
- f" {type(config.vision_config)}.")
-
- text_config = config.text_config
- vision_config = config.vision_config
-
- self.text_model = SiglipTextTransformer(text_config)
- self.vision_model = SiglipVisionTransformer(vision_config)
-
- self.logit_scale = nn.Parameter(torch.randn(1))
- self.logit_bias = nn.Parameter(torch.randn(1))
-
- # Initialize weights and apply final processing
- self.post_init()
-
- @add_start_docstrings_to_model_forward(SIGLIP_TEXT_INPUTS_DOCSTRING)
- def get_text_features(
- self,
- input_ids: Optional[torch.Tensor] = None,
- attention_mask: Optional[torch.Tensor] = None,
- position_ids: Optional[torch.Tensor] = None,
- output_attentions: Optional[bool] = None,
- output_hidden_states: Optional[bool] = None,
- return_dict: Optional[bool] = None,
- ) -> torch.FloatTensor:
- r"""
- Returns:
- text_features (`torch.FloatTensor` of shape `(batch_size,
- output_dim`): The text embeddings obtained by
- applying the projection layer to the pooled output
- of [`SiglipTextModel`].
- Examples:
- ```python
- >>> from transformers import AutoTokenizer, AutoModel
- >>> import torch
- >>> model = AutoModel.from_pretrained(
- "google/siglip-base-patch16-224")
- >>> tokenizer = AutoTokenizer.from_pretrained(
- "google/siglip-base-patch16-224")
- >>> # important: make sure to set padding="max_length" as that's
- how the model was trained
- >>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"],
- padding="max_length", return_tensors="pt")
- >>> with torch.no_grad():
- ... text_features = model.get_text_features(**inputs)
- ```"""
- # Use SigLIP model's config for some fields (if specified) instead
- # of those of vision & text components.
- output_attentions = output_attentions if output_attentions is not None\
- else self.config.output_attentions
- output_hidden_states = (output_hidden_states
- if output_hidden_states is not None else
- self.config.output_hidden_states)
- return_dict = return_dict if return_dict is not None \
- else self.config.use_return_dict
-
- text_outputs = self.text_model(
- input_ids=input_ids,
- attention_mask=attention_mask,
- position_ids=position_ids,
- output_attentions=output_attentions,
- output_hidden_states=output_hidden_states,
- return_dict=return_dict,
- )
-
- pooled_output = text_outputs[1]
-
- return pooled_output
-
- @add_start_docstrings_to_model_forward(SIGLIP_VISION_INPUTS_DOCSTRING)
- def get_image_features(
- self,
- pixel_values: Optional[torch.FloatTensor] = None,
- output_attentions: Optional[bool] = None,
- output_hidden_states: Optional[bool] = None,
- return_dict: Optional[bool] = None,
- ) -> torch.FloatTensor:
- r"""
- Returns:
- image_features (`torch.FloatTensor` of shape `(batch_size,
- output_dim`): The image embeddings obtained by applying the
- projection layer to the pooled output of [`SiglipVisionModel`].
- Examples:
- ```python
- >>> from PIL import Image
- >>> import requests
- >>> from transformers import AutoProcessor, AutoModel
- >>> import torch
- >>> model = AutoModel.from_pretrained("google/siglip-base-patch16-224")
- >>> processor = AutoProcessor.from_pretrained(
- "google/siglip-base-patch16-224")
- >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
- >>> image = Image.open(requests.get(url, stream=True).raw)
- >>> inputs = processor(images=image, return_tensors="pt")
- >>> with torch.no_grad():
- ... image_features = model.get_image_features(**inputs)
- ```"""
- # Use SiglipModel's config for some fields (if specified) instead
- # of those of vision & text components.
- output_attentions = output_attentions if output_attentions \
- is not None else self.config.output_attentions
- output_hidden_states = (output_hidden_states
- if output_hidden_states is not None else
- self.config.output_hidden_states)
- return_dict = return_dict if return_dict is not None else \
- self.config.use_return_dict
-
- vision_outputs = self.vision_model(
- pixel_values=pixel_values,
- output_attentions=output_attentions,
- output_hidden_states=output_hidden_states,
- return_dict=return_dict,
- )
-
- pooled_output = vision_outputs[1]
-
- return pooled_output
-
- @add_start_docstrings_to_model_forward(SIGLIP_INPUTS_DOCSTRING)
- @replace_return_docstrings(output_type=SiglipOutput,
- config_class=SiglipConfig)
- def forward(
- self,
- input_ids: Optional[torch.LongTensor] = None,
- pixel_values: Optional[torch.FloatTensor] = None,
- attention_mask: Optional[torch.Tensor] = None,
- position_ids: Optional[torch.LongTensor] = None,
- return_loss: Optional[bool] = None,
- output_attentions: Optional[bool] = None,
- output_hidden_states: Optional[bool] = None,
- return_dict: Optional[bool] = None,
- ) -> Union[Tuple, SiglipOutput]:
- r"""
- Returns:
- Examples:
- ```python
- >>> from PIL import Image
- >>> import requests
- >>> from transformers import AutoProcessor, AutoModel
- >>> import torch
- >>> model = AutoModel.from_pretrained("google/siglip-base-patch16-224")
- >>> processor = AutoProcessor.from_pretrained(
- "google/siglip-base-patch16-224")
- >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
- >>> image = Image.open(requests.get(url, stream=True).raw)
- >>> texts = ["a photo of 2 cats", "a photo of 2 dogs"]
- >>> # important: we pass `padding=max_length` since the model was
- trained with this
- >>> inputs = processor(text=texts, images=image,
- padding="max_length", return_tensors="pt")
- >>> with torch.no_grad():
- ... outputs = model(**inputs)
- >>> logits_per_image = outputs.logits_per_image
- >>> probs = torch.sigmoid(logits_per_image) # these are the
- probabilities
- >>> print(f"{probs[0][0]:.1%} that image 0 is '{texts[0]}'")
- 31.9% that image 0 is 'a photo of 2 cats'
- ```"""
- # Use SigLIP model's config for some fields (if specified) instead of
- # those of vision & text components.
- output_attentions = output_attentions if output_attentions \
- is not None else self.config.output_attentions
- output_hidden_states = (output_hidden_states
- if output_hidden_states is not None else
- self.config.output_hidden_states)
- return_dict = return_dict if return_dict is not None else \
- self.config.use_return_dict
-
- vision_outputs = self.vision_model(
- pixel_values=pixel_values,
- output_attentions=output_attentions,
- output_hidden_states=output_hidden_states,
- return_dict=return_dict,
- )
-
- text_outputs = self.text_model(
- input_ids=input_ids,
- attention_mask=attention_mask,
- position_ids=position_ids,
- output_attentions=output_attentions,
- output_hidden_states=output_hidden_states,
- return_dict=return_dict,
- )
-
- image_embeds = vision_outputs[1]
- text_embeds = text_outputs[1]
-
- # normalized features
- image_embeds = image_embeds / image_embeds.norm(
- p=2, dim=-1, keepdim=True)
- text_embeds = text_embeds / text_embeds.norm(p=2, dim=-1, keepdim=True)
-
- # cosine similarity as logits
- logits_per_text = torch.matmul(text_embeds, image_embeds.t(
- )) * self.logit_scale.exp() + self.logit_bias
- logits_per_image = logits_per_text.t()
-
- loss = None
- if return_loss:
- raise NotImplementedError("SigLIP loss to be implemented")
-
- if not return_dict:
- output = (logits_per_image, logits_per_text, text_embeds,
- image_embeds, text_outputs, vision_outputs)
- return ((loss, ) + output) if loss is not None else output
-
- return SiglipOutput(
- loss=loss,
- logits_per_image=logits_per_image,
- logits_per_text=logits_per_text,
- text_embeds=text_embeds,
- image_embeds=image_embeds,
- text_model_output=text_outputs,
- vision_model_output=vision_outputs,
- )
-
-
-def get_siglip_vision_model(_flash_attn_2_enabled=True, **kwargs):
- siglip_vision_config = {
- "hidden_size": 1152,
- "image_size": 448,
- "intermediate_size": 4304,
- "model_type": "siglip_vision_model",
- "num_attention_heads": 16,
- "num_hidden_layers": 27,
- "patch_size": 14,
- }
-
- # Detect attention implementation.
- attn_backend: _Backend = get_vit_attn_backend(support_fa=True)
- if attn_backend != _Backend.FLASH_ATTN:
- _flash_attn_2_enabled = False
-
- model_config = SiglipVisionConfig(
- **siglip_vision_config,
- _flash_attn_2_enabled=_flash_attn_2_enabled,
- **kwargs)
-
- vision_model = SiglipVisionModel(model_config).vision_model
-
- return vision_model
From b30c75dda4f6c5e0d8b3d2b39134da38b72ea96e Mon Sep 17 00:00:00 2001
From: Roger Wang <136131678+ywang96@users.noreply.github.com>
Date: Sat, 15 Mar 2025 20:21:11 -0700
Subject: [PATCH 03/34] [V1] Remove V0 fallback for mistral-tokenizer (#14873)
Signed-off-by: Roger Wang
---
vllm/engine/arg_utils.py | 7 -------
1 file changed, 7 deletions(-)
diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py
index 31d567de0efa5..4e695da4ef765 100644
--- a/vllm/engine/arg_utils.py
+++ b/vllm/engine/arg_utils.py
@@ -1487,13 +1487,6 @@ class EngineArgs:
recommend_to_remove=False)
return False
- # No MistralTokenizer support so far (not compatible
- # with xgrammar)
- if model_config.tokenizer_mode == "mistral":
- _raise_or_fallback(feature_name="--tokenizer-mode mistral",
- recommend_to_remove=False)
- return False
-
# No CPU offloading yet.
if self.cpu_offload_gb != EngineArgs.cpu_offload_gb:
_raise_or_fallback(feature_name="--cpu-offload-gb",
From 71c1e0710783e1b0427610ba9e32bed7724fa36f Mon Sep 17 00:00:00 2001
From: Simon Mo
Date: Sat, 15 Mar 2025 20:25:03 -0700
Subject: [PATCH 04/34] [Kernel] Add more tuned configs (#14877)
Signed-off-by: simon-mo
---
...192,device_name=NVIDIA_A800-SXM4-80GB.json | 146 ++++++++++++++++++
...dtype=int8_w8a8,block_shape=[128,128].json | 146 ++++++++++++++++++
...NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8.json | 146 ++++++++++++++++++
...dtype=int8_w8a8,block_shape=[128,128].json | 146 ++++++++++++++++++
...NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json | 146 ++++++++++++++++++
...,dtype=fp8_w8a8,block_shape=[128,128].json | 146 ++++++++++++++++++
...,dtype=fp8_w8a8,block_shape=[128,128].json | 146 ++++++++++++++++++
...dtype=int8_w8a8,block_shape=[128,128].json | 146 ++++++++++++++++++
...=64,device_name=NVIDIA_A800-SXM4-80GB.json | 146 ++++++++++++++++++
...280,device_name=NVIDIA_A800-SXM4-80GB.json | 146 ++++++++++++++++++
...=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json | 146 ++++++++++++++++++
...evice_name=NVIDIA_H200,dtype=fp8_w8a8.json | 146 ++++++++++++++++++
.../E=64,N=1280,device_name=NVIDIA_H200.json | 146 ++++++++++++++++++
...=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json | 146 ++++++++++++++++++
...evice_name=NVIDIA_H200,dtype=fp8_w8a8.json | 146 ++++++++++++++++++
.../E=64,N=2560,device_name=NVIDIA_H200.json | 146 ++++++++++++++++++
...=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json | 146 ++++++++++++++++++
...320,device_name=NVIDIA_H100_80GB_HBM3.json | 146 ++++++++++++++++++
...evice_name=NVIDIA_H200,dtype=fp8_w8a8.json | 146 ++++++++++++++++++
.../E=64,N=320,device_name=NVIDIA_H200.json | 146 ++++++++++++++++++
...640,device_name=NVIDIA_A800-SXM4-80GB.json | 146 ++++++++++++++++++
...VIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json | 146 ++++++++++++++++++
...=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json | 146 ++++++++++++++++++
...evice_name=NVIDIA_H200,dtype=fp8_w8a8.json | 146 ++++++++++++++++++
.../E=64,N=640,device_name=NVIDIA_H200.json | 146 ++++++++++++++++++
...evice_name=NVIDIA_H200,dtype=fp8_w8a8.json | 146 ++++++++++++++++++
.../E=8,N=14336,device_name=NVIDIA_H200.json | 146 ++++++++++++++++++
...evice_name=NVIDIA_H200,dtype=fp8_w8a8.json | 146 ++++++++++++++++++
.../E=8,N=1792,device_name=NVIDIA_H200.json | 146 ++++++++++++++++++
...evice_name=NVIDIA_H200,dtype=fp8_w8a8.json | 146 ++++++++++++++++++
.../E=8,N=2048,device_name=NVIDIA_H200.json | 146 ++++++++++++++++++
...VIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json | 146 ++++++++++++++++++
...evice_name=NVIDIA_H200,dtype=fp8_w8a8.json | 146 ++++++++++++++++++
.../E=8,N=3584,device_name=NVIDIA_H200.json | 146 ++++++++++++++++++
...evice_name=NVIDIA_H200,dtype=fp8_w8a8.json | 146 ++++++++++++++++++
.../E=8,N=4096,device_name=NVIDIA_H200.json | 146 ++++++++++++++++++
...evice_name=NVIDIA_H200,dtype=fp8_w8a8.json | 146 ++++++++++++++++++
.../E=8,N=7168,device_name=NVIDIA_H200.json | 146 ++++++++++++++++++
...evice_name=NVIDIA_H200,dtype=fp8_w8a8.json | 146 ++++++++++++++++++
.../layers/fused_moe/configs/README | 3 +
...dtype=int8_w8a8,block_shape=[128,128].json | 146 ++++++++++++++++++
...dtype=int8_w8a8,block_shape=[128,128].json | 146 ++++++++++++++++++
...,dtype=fp8_w8a8,block_shape=[128,128].json | 146 ++++++++++++++++++
...,dtype=fp8_w8a8,block_shape=[128,128].json | 26 ++++
...dtype=int8_w8a8,block_shape=[128,128].json | 146 ++++++++++++++++++
...dtype=int8_w8a8,block_shape=[128,128].json | 146 ++++++++++++++++++
...,dtype=fp8_w8a8,block_shape=[128,128].json | 146 ++++++++++++++++++
...,dtype=fp8_w8a8,block_shape=[128,128].json | 26 ++++
...dtype=int8_w8a8,block_shape=[128,128].json | 146 ++++++++++++++++++
...dtype=int8_w8a8,block_shape=[128,128].json | 146 ++++++++++++++++++
...,dtype=fp8_w8a8,block_shape=[128,128].json | 146 ++++++++++++++++++
...,dtype=fp8_w8a8,block_shape=[128,128].json | 26 ++++
...dtype=int8_w8a8,block_shape=[128,128].json | 146 ++++++++++++++++++
...dtype=int8_w8a8,block_shape=[128,128].json | 146 ++++++++++++++++++
...,dtype=fp8_w8a8,block_shape=[128,128].json | 146 ++++++++++++++++++
...,dtype=fp8_w8a8,block_shape=[128,128].json | 26 ++++
...dtype=int8_w8a8,block_shape=[128,128].json | 146 ++++++++++++++++++
...dtype=int8_w8a8,block_shape=[128,128].json | 146 ++++++++++++++++++
...,dtype=fp8_w8a8,block_shape=[128,128].json | 146 ++++++++++++++++++
...dtype=int8_w8a8,block_shape=[128,128].json | 146 ++++++++++++++++++
...,dtype=fp8_w8a8,block_shape=[128,128].json | 26 ++++
...dtype=int8_w8a8,block_shape=[128,128].json | 146 ++++++++++++++++++
...dtype=int8_w8a8,block_shape=[128,128].json | 146 ++++++++++++++++++
...,dtype=fp8_w8a8,block_shape=[128,128].json | 146 ++++++++++++++++++
...,dtype=fp8_w8a8,block_shape=[128,128].json | 26 ++++
...dtype=int8_w8a8,block_shape=[128,128].json | 146 ++++++++++++++++++
...dtype=int8_w8a8,block_shape=[128,128].json | 146 ++++++++++++++++++
...dtype=int8_w8a8,block_shape=[128,128].json | 146 ++++++++++++++++++
...dtype=int8_w8a8,block_shape=[128,128].json | 146 ++++++++++++++++++
...,dtype=fp8_w8a8,block_shape=[128,128].json | 146 ++++++++++++++++++
...dtype=int8_w8a8,block_shape=[128,128].json | 146 ++++++++++++++++++
...,dtype=fp8_w8a8,block_shape=[128,128].json | 26 ++++
...dtype=int8_w8a8,block_shape=[128,128].json | 146 ++++++++++++++++++
...dtype=int8_w8a8,block_shape=[128,128].json | 146 ++++++++++++++++++
...dtype=int8_w8a8,block_shape=[128,128].json | 146 ++++++++++++++++++
...dtype=int8_w8a8,block_shape=[128,128].json | 146 ++++++++++++++++++
...dtype=int8_w8a8,block_shape=[128,128].json | 146 ++++++++++++++++++
...,dtype=fp8_w8a8,block_shape=[128,128].json | 146 ++++++++++++++++++
...dtype=int8_w8a8,block_shape=[128,128].json | 146 ++++++++++++++++++
...,dtype=fp8_w8a8,block_shape=[128,128].json | 26 ++++
...dtype=int8_w8a8,block_shape=[128,128].json | 146 ++++++++++++++++++
...dtype=int8_w8a8,block_shape=[128,128].json | 146 ++++++++++++++++++
...,dtype=fp8_w8a8,block_shape=[128,128].json | 146 ++++++++++++++++++
...,dtype=fp8_w8a8,block_shape=[128,128].json | 26 ++++
...dtype=int8_w8a8,block_shape=[128,128].json | 146 ++++++++++++++++++
...dtype=int8_w8a8,block_shape=[128,128].json | 146 ++++++++++++++++++
...,dtype=fp8_w8a8,block_shape=[128,128].json | 146 ++++++++++++++++++
...,dtype=fp8_w8a8,block_shape=[128,128].json | 26 ++++
...dtype=int8_w8a8,block_shape=[128,128].json | 146 ++++++++++++++++++
...dtype=int8_w8a8,block_shape=[128,128].json | 146 ++++++++++++++++++
...,dtype=fp8_w8a8,block_shape=[128,128].json | 146 ++++++++++++++++++
...,dtype=fp8_w8a8,block_shape=[128,128].json | 26 ++++
...dtype=int8_w8a8,block_shape=[128,128].json | 146 ++++++++++++++++++
...dtype=int8_w8a8,block_shape=[128,128].json | 146 ++++++++++++++++++
...,dtype=fp8_w8a8,block_shape=[128,128].json | 146 ++++++++++++++++++
...dtype=int8_w8a8,block_shape=[128,128].json | 146 ++++++++++++++++++
...,dtype=fp8_w8a8,block_shape=[128,128].json | 26 ++++
...dtype=int8_w8a8,block_shape=[128,128].json | 146 ++++++++++++++++++
...dtype=int8_w8a8,block_shape=[128,128].json | 146 ++++++++++++++++++
...,dtype=fp8_w8a8,block_shape=[128,128].json | 146 ++++++++++++++++++
...dtype=int8_w8a8,block_shape=[128,128].json | 146 ++++++++++++++++++
...,dtype=fp8_w8a8,block_shape=[128,128].json | 26 ++++
...dtype=int8_w8a8,block_shape=[128,128].json | 146 ++++++++++++++++++
...dtype=int8_w8a8,block_shape=[128,128].json | 146 ++++++++++++++++++
...dtype=int8_w8a8,block_shape=[128,128].json | 146 ++++++++++++++++++
105 files changed, 13627 insertions(+)
create mode 100644 vllm/model_executor/layers/fused_moe/configs/E=160,N=192,device_name=NVIDIA_A800-SXM4-80GB.json
create mode 100644 vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json
create mode 100644 vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8.json
create mode 100644 vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json
create mode 100644 vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json
create mode 100644 vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json
create mode 100644 vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json
create mode 100644 vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json
create mode 100644 vllm/model_executor/layers/fused_moe/configs/E=256,N=64,device_name=NVIDIA_A800-SXM4-80GB.json
create mode 100644 vllm/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_A800-SXM4-80GB.json
create mode 100644 vllm/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json
create mode 100644 vllm/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_H200,dtype=fp8_w8a8.json
create mode 100644 vllm/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_H200.json
create mode 100644 vllm/model_executor/layers/fused_moe/configs/E=64,N=2560,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json
create mode 100644 vllm/model_executor/layers/fused_moe/configs/E=64,N=2560,device_name=NVIDIA_H200,dtype=fp8_w8a8.json
create mode 100644 vllm/model_executor/layers/fused_moe/configs/E=64,N=2560,device_name=NVIDIA_H200.json
create mode 100644 vllm/model_executor/layers/fused_moe/configs/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json
create mode 100644 vllm/model_executor/layers/fused_moe/configs/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3.json
create mode 100644 vllm/model_executor/layers/fused_moe/configs/E=64,N=320,device_name=NVIDIA_H200,dtype=fp8_w8a8.json
create mode 100644 vllm/model_executor/layers/fused_moe/configs/E=64,N=320,device_name=NVIDIA_H200.json
create mode 100644 vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_A800-SXM4-80GB.json
create mode 100644 vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json
create mode 100644 vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json
create mode 100644 vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8.json
create mode 100644 vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_H200.json
create mode 100644 vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=NVIDIA_H200,dtype=fp8_w8a8.json
create mode 100644 vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=NVIDIA_H200.json
create mode 100644 vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_H200,dtype=fp8_w8a8.json
create mode 100644 vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_H200.json
create mode 100644 vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8.json
create mode 100644 vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_H200.json
create mode 100644 vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json
create mode 100644 vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_H200,dtype=fp8_w8a8.json
create mode 100644 vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_H200.json
create mode 100644 vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_H200,dtype=fp8_w8a8.json
create mode 100644 vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_H200.json
create mode 100644 vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8.json
create mode 100644 vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_H200.json
create mode 100644 vllm/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json
create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json
create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json
create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json
create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json
create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json
create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json
create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json
create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json
create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json
create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json
create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json
create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json
create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json
create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json
create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json
create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json
create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json
create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json
create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json
create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json
create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json
create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json
create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json
create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json
create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json
create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json
create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json
create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json
create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json
create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json
create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json
create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json
create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json
create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json
create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json
create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json
create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json
create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json
create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json
create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json
create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json
create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json
create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json
create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json
create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json
create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json
create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json
create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json
create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json
create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json
create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json
create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json
create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json
create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json
create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json
create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json
create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json
create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json
create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json
create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json
create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json
create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json
create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json
create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json
create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json
diff --git a/vllm/model_executor/layers/fused_moe/configs/E=160,N=192,device_name=NVIDIA_A800-SXM4-80GB.json b/vllm/model_executor/layers/fused_moe/configs/E=160,N=192,device_name=NVIDIA_A800-SXM4-80GB.json
new file mode 100644
index 0000000000000..0611620eb3362
--- /dev/null
+++ b/vllm/model_executor/layers/fused_moe/configs/E=160,N=192,device_name=NVIDIA_A800-SXM4-80GB.json
@@ -0,0 +1,146 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "2": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "4": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "8": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "16": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "24": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "32": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "48": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "64": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "96": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "128": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "256": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "512": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 4
+ }
+}
\ No newline at end of file
diff --git a/vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json
new file mode 100644
index 0000000000000..4dd00d110e486
--- /dev/null
+++ b/vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json
@@ -0,0 +1,146 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "2": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "4": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "8": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "16": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "24": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "32": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "48": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "64": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "96": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "128": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "256": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "512": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ }
+}
\ No newline at end of file
diff --git a/vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8.json b/vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8.json
new file mode 100644
index 0000000000000..48f9697af2639
--- /dev/null
+++ b/vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8.json
@@ -0,0 +1,146 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "2": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "4": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "8": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "16": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "24": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "32": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "48": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "64": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "96": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "128": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "256": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "512": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ }
+}
\ No newline at end of file
diff --git a/vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json
new file mode 100644
index 0000000000000..a8c05712ba587
--- /dev/null
+++ b/vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json
@@ -0,0 +1,146 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "2": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "4": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "8": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "16": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "24": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "32": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "48": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "64": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "96": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "128": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "256": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "512": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ }
+}
\ No newline at end of file
diff --git a/vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json b/vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json
new file mode 100644
index 0000000000000..f1244c61efb01
--- /dev/null
+++ b/vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json
@@ -0,0 +1,146 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "2": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "4": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "8": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "16": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "24": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "32": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "48": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "64": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "96": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "128": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "256": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "512": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ }
+}
\ No newline at end of file
diff --git a/vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json
new file mode 100644
index 0000000000000..a2ee05da1d7c6
--- /dev/null
+++ b/vllm/model_executor/layers/fused_moe/configs/E=256,N=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json
@@ -0,0 +1,146 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "2": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "4": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "8": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "16": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "24": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "32": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "48": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "64": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "96": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "128": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "256": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "512": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ }
+}
\ No newline at end of file
diff --git a/vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json
new file mode 100644
index 0000000000000..fc573cd6e8561
--- /dev/null
+++ b/vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128,128].json
@@ -0,0 +1,146 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "2": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "4": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "8": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "16": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "24": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "32": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "48": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "64": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "96": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "128": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "256": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "512": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ }
+}
\ No newline at end of file
diff --git a/vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json
new file mode 100644
index 0000000000000..c6d7e96c7f0ae
--- /dev/null
+++ b/vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json
@@ -0,0 +1,146 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "2": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "4": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "8": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "16": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "24": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "32": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "48": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "64": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "96": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "128": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "256": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "512": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ }
+}
\ No newline at end of file
diff --git a/vllm/model_executor/layers/fused_moe/configs/E=256,N=64,device_name=NVIDIA_A800-SXM4-80GB.json b/vllm/model_executor/layers/fused_moe/configs/E=256,N=64,device_name=NVIDIA_A800-SXM4-80GB.json
new file mode 100644
index 0000000000000..21f60229ff875
--- /dev/null
+++ b/vllm/model_executor/layers/fused_moe/configs/E=256,N=64,device_name=NVIDIA_A800-SXM4-80GB.json
@@ -0,0 +1,146 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "2": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "4": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "8": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "16": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "24": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "32": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "48": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "64": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "96": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "128": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "256": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "512": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 2
+ }
+}
\ No newline at end of file
diff --git a/vllm/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_A800-SXM4-80GB.json b/vllm/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_A800-SXM4-80GB.json
new file mode 100644
index 0000000000000..39a9912fa4bdd
--- /dev/null
+++ b/vllm/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_A800-SXM4-80GB.json
@@ -0,0 +1,146 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "2": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 8,
+ "num_stages": 5
+ },
+ "4": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "8": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "16": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "24": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "32": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "48": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "64": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "96": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "128": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "256": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "512": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ }
+}
\ No newline at end of file
diff --git a/vllm/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json b/vllm/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json
new file mode 100644
index 0000000000000..05b54639d234e
--- /dev/null
+++ b/vllm/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json
@@ -0,0 +1,146 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "2": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "4": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "8": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "16": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "24": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "32": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "48": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "64": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "96": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "128": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "256": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "512": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 8,
+ "num_stages": 4
+ }
+}
\ No newline at end of file
diff --git a/vllm/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_H200,dtype=fp8_w8a8.json b/vllm/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_H200,dtype=fp8_w8a8.json
new file mode 100644
index 0000000000000..c17a4ec346915
--- /dev/null
+++ b/vllm/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_H200,dtype=fp8_w8a8.json
@@ -0,0 +1,146 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "2": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "4": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "8": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "16": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "24": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "32": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "48": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "64": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "96": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "128": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "256": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "512": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 8,
+ "num_stages": 4
+ }
+}
\ No newline at end of file
diff --git a/vllm/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_H200.json b/vllm/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_H200.json
new file mode 100644
index 0000000000000..170ae7f3fff1d
--- /dev/null
+++ b/vllm/model_executor/layers/fused_moe/configs/E=64,N=1280,device_name=NVIDIA_H200.json
@@ -0,0 +1,146 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "2": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "4": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "8": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "16": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "24": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "32": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "48": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "64": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "96": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "128": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "256": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "512": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 8,
+ "num_stages": 4
+ }
+}
\ No newline at end of file
diff --git a/vllm/model_executor/layers/fused_moe/configs/E=64,N=2560,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json b/vllm/model_executor/layers/fused_moe/configs/E=64,N=2560,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json
new file mode 100644
index 0000000000000..1d9d352edebc3
--- /dev/null
+++ b/vllm/model_executor/layers/fused_moe/configs/E=64,N=2560,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json
@@ -0,0 +1,146 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "2": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "4": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "8": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "16": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "24": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "32": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "48": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "64": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "96": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "128": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "256": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "512": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 8,
+ "num_stages": 4
+ }
+}
\ No newline at end of file
diff --git a/vllm/model_executor/layers/fused_moe/configs/E=64,N=2560,device_name=NVIDIA_H200,dtype=fp8_w8a8.json b/vllm/model_executor/layers/fused_moe/configs/E=64,N=2560,device_name=NVIDIA_H200,dtype=fp8_w8a8.json
new file mode 100644
index 0000000000000..9ad5b31675005
--- /dev/null
+++ b/vllm/model_executor/layers/fused_moe/configs/E=64,N=2560,device_name=NVIDIA_H200,dtype=fp8_w8a8.json
@@ -0,0 +1,146 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "2": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "4": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "8": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "16": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "24": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "32": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "48": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "64": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "96": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "128": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "256": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "512": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 4
+ }
+}
\ No newline at end of file
diff --git a/vllm/model_executor/layers/fused_moe/configs/E=64,N=2560,device_name=NVIDIA_H200.json b/vllm/model_executor/layers/fused_moe/configs/E=64,N=2560,device_name=NVIDIA_H200.json
new file mode 100644
index 0000000000000..2883dfd11e7f3
--- /dev/null
+++ b/vllm/model_executor/layers/fused_moe/configs/E=64,N=2560,device_name=NVIDIA_H200.json
@@ -0,0 +1,146 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "2": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "4": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "8": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "16": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "24": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "32": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "48": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "64": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "96": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "128": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "256": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "512": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 8,
+ "num_stages": 4
+ }
+}
\ No newline at end of file
diff --git a/vllm/model_executor/layers/fused_moe/configs/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json b/vllm/model_executor/layers/fused_moe/configs/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json
new file mode 100644
index 0000000000000..8abfd84a776b7
--- /dev/null
+++ b/vllm/model_executor/layers/fused_moe/configs/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json
@@ -0,0 +1,146 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "2": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "4": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "8": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "16": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "24": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "32": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "48": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "64": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "96": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "128": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "256": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "512": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 3
+ }
+}
\ No newline at end of file
diff --git a/vllm/model_executor/layers/fused_moe/configs/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3.json b/vllm/model_executor/layers/fused_moe/configs/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3.json
new file mode 100644
index 0000000000000..2fc18a5e43d29
--- /dev/null
+++ b/vllm/model_executor/layers/fused_moe/configs/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3.json
@@ -0,0 +1,146 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "2": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "4": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "8": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "16": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "24": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "32": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "48": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "64": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "96": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "128": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "256": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "512": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 8,
+ "num_stages": 4
+ }
+}
\ No newline at end of file
diff --git a/vllm/model_executor/layers/fused_moe/configs/E=64,N=320,device_name=NVIDIA_H200,dtype=fp8_w8a8.json b/vllm/model_executor/layers/fused_moe/configs/E=64,N=320,device_name=NVIDIA_H200,dtype=fp8_w8a8.json
new file mode 100644
index 0000000000000..be8d4a7fd23d9
--- /dev/null
+++ b/vllm/model_executor/layers/fused_moe/configs/E=64,N=320,device_name=NVIDIA_H200,dtype=fp8_w8a8.json
@@ -0,0 +1,146 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "2": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "4": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "8": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "16": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "24": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "32": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "48": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "64": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "96": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "128": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "256": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "512": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 3
+ }
+}
\ No newline at end of file
diff --git a/vllm/model_executor/layers/fused_moe/configs/E=64,N=320,device_name=NVIDIA_H200.json b/vllm/model_executor/layers/fused_moe/configs/E=64,N=320,device_name=NVIDIA_H200.json
new file mode 100644
index 0000000000000..71fdd88643c6f
--- /dev/null
+++ b/vllm/model_executor/layers/fused_moe/configs/E=64,N=320,device_name=NVIDIA_H200.json
@@ -0,0 +1,146 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "2": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "4": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "8": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "16": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "24": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "32": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "48": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "64": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "96": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "128": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "256": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "512": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 8,
+ "num_stages": 3
+ }
+}
\ No newline at end of file
diff --git a/vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_A800-SXM4-80GB.json b/vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_A800-SXM4-80GB.json
new file mode 100644
index 0000000000000..c02de2f628b71
--- /dev/null
+++ b/vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_A800-SXM4-80GB.json
@@ -0,0 +1,146 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "2": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "4": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "8": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 5
+ },
+ "16": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "24": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "32": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "48": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "64": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "96": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "128": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "256": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "512": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ }
+}
\ No newline at end of file
diff --git a/vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json b/vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json
new file mode 100644
index 0000000000000..3e0bc75ff87c4
--- /dev/null
+++ b/vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json
@@ -0,0 +1,146 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "2": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "4": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "8": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "16": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "24": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "32": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "48": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 8,
+ "num_stages": 2
+ },
+ "64": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "96": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "128": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 8,
+ "num_stages": 2
+ },
+ "256": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "512": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 8,
+ "num_stages": 2
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 3
+ }
+}
\ No newline at end of file
diff --git a/vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json b/vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json
new file mode 100644
index 0000000000000..9f7ed6726f44e
--- /dev/null
+++ b/vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json
@@ -0,0 +1,146 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "2": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "4": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "8": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "16": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "24": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "32": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "48": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "64": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "96": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "128": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "256": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "512": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 4
+ }
+}
\ No newline at end of file
diff --git a/vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8.json b/vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8.json
new file mode 100644
index 0000000000000..21b72557e365d
--- /dev/null
+++ b/vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8.json
@@ -0,0 +1,146 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "2": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "4": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "8": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "16": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "24": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "32": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "48": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "64": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "96": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "128": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "256": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "512": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 8,
+ "num_stages": 4
+ }
+}
\ No newline at end of file
diff --git a/vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_H200.json b/vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_H200.json
new file mode 100644
index 0000000000000..eaf32f6d76c0a
--- /dev/null
+++ b/vllm/model_executor/layers/fused_moe/configs/E=64,N=640,device_name=NVIDIA_H200.json
@@ -0,0 +1,146 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "2": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "4": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "8": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "16": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "24": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "32": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "48": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "64": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "96": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "128": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "256": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "512": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 4
+ }
+}
\ No newline at end of file
diff --git a/vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=NVIDIA_H200,dtype=fp8_w8a8.json b/vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=NVIDIA_H200,dtype=fp8_w8a8.json
new file mode 100644
index 0000000000000..841044a4fc6e2
--- /dev/null
+++ b/vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=NVIDIA_H200,dtype=fp8_w8a8.json
@@ -0,0 +1,146 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "2": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "4": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "8": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "16": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "24": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "32": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "48": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "64": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "96": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "128": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "256": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "512": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 8,
+ "num_stages": 3
+ }
+}
\ No newline at end of file
diff --git a/vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=NVIDIA_H200.json b/vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=NVIDIA_H200.json
new file mode 100644
index 0000000000000..59be497fc4287
--- /dev/null
+++ b/vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=NVIDIA_H200.json
@@ -0,0 +1,146 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "2": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "4": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "8": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "16": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "24": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "32": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "48": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "64": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "96": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "128": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "256": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "512": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 8,
+ "num_stages": 4
+ }
+}
\ No newline at end of file
diff --git a/vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_H200,dtype=fp8_w8a8.json b/vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_H200,dtype=fp8_w8a8.json
new file mode 100644
index 0000000000000..e4110a5d2e70f
--- /dev/null
+++ b/vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_H200,dtype=fp8_w8a8.json
@@ -0,0 +1,146 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "2": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "4": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 8,
+ "num_stages": 5
+ },
+ "8": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 8,
+ "num_stages": 5
+ },
+ "16": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "24": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "32": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "48": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "64": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "96": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "128": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "256": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "512": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 8,
+ "num_stages": 4
+ }
+}
\ No newline at end of file
diff --git a/vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_H200.json b/vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_H200.json
new file mode 100644
index 0000000000000..0883ef40582ea
--- /dev/null
+++ b/vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_H200.json
@@ -0,0 +1,146 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "2": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "4": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "8": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "16": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "24": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "32": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "48": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "64": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "96": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "128": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "256": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "512": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 4
+ }
+}
\ No newline at end of file
diff --git a/vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8.json b/vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8.json
new file mode 100644
index 0000000000000..1a0aa33193329
--- /dev/null
+++ b/vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8.json
@@ -0,0 +1,146 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "2": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "4": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "8": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "16": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "24": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "32": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "48": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "64": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "96": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "128": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "256": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "512": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 8,
+ "num_stages": 4
+ }
+}
\ No newline at end of file
diff --git a/vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_H200.json b/vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_H200.json
new file mode 100644
index 0000000000000..9952be6ba4abe
--- /dev/null
+++ b/vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_H200.json
@@ -0,0 +1,146 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "2": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "4": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "8": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "16": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 8,
+ "num_stages": 2
+ },
+ "24": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "32": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "48": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "64": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "96": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "128": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "256": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "512": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 8,
+ "num_stages": 4
+ }
+}
\ No newline at end of file
diff --git a/vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json b/vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json
new file mode 100644
index 0000000000000..32bbadbb9eae8
--- /dev/null
+++ b/vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json
@@ -0,0 +1,146 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "2": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "4": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "8": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 8,
+ "num_stages": 5
+ },
+ "16": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "24": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "32": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "48": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "64": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "96": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "128": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "256": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "512": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 8,
+ "num_stages": 2
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 3
+ }
+}
\ No newline at end of file
diff --git a/vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_H200,dtype=fp8_w8a8.json b/vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_H200,dtype=fp8_w8a8.json
new file mode 100644
index 0000000000000..e6f753cdba35b
--- /dev/null
+++ b/vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_H200,dtype=fp8_w8a8.json
@@ -0,0 +1,146 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "2": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "4": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "8": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "16": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "24": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "32": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "48": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "64": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "96": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "128": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "256": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "512": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 8,
+ "num_stages": 4
+ }
+}
\ No newline at end of file
diff --git a/vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_H200.json b/vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_H200.json
new file mode 100644
index 0000000000000..53f3394693f06
--- /dev/null
+++ b/vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_H200.json
@@ -0,0 +1,146 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "2": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "4": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "8": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "16": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "24": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "32": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "48": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "64": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "96": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "128": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "256": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "512": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 8,
+ "num_stages": 4
+ }
+}
\ No newline at end of file
diff --git a/vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_H200,dtype=fp8_w8a8.json b/vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_H200,dtype=fp8_w8a8.json
new file mode 100644
index 0000000000000..4dd475c02a19b
--- /dev/null
+++ b/vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_H200,dtype=fp8_w8a8.json
@@ -0,0 +1,146 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "2": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "4": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "8": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "16": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 5
+ },
+ "24": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "32": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "48": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "64": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "96": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "128": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "256": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "512": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 8,
+ "num_stages": 4
+ }
+}
\ No newline at end of file
diff --git a/vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_H200.json b/vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_H200.json
new file mode 100644
index 0000000000000..2ed15f30fe603
--- /dev/null
+++ b/vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_H200.json
@@ -0,0 +1,146 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "2": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "4": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "8": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "16": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 8,
+ "num_stages": 2
+ },
+ "24": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "32": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "48": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "64": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "96": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "128": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "256": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "512": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 8,
+ "num_stages": 4
+ }
+}
\ No newline at end of file
diff --git a/vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8.json b/vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8.json
new file mode 100644
index 0000000000000..eb817268d4120
--- /dev/null
+++ b/vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8.json
@@ -0,0 +1,146 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "2": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "4": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 8,
+ "num_stages": 5
+ },
+ "8": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "16": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "24": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "32": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "48": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "64": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "96": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "128": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "256": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "512": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 8,
+ "num_stages": 3
+ }
+}
\ No newline at end of file
diff --git a/vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_H200.json b/vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_H200.json
new file mode 100644
index 0000000000000..0c7062aea6c4e
--- /dev/null
+++ b/vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_H200.json
@@ -0,0 +1,146 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "2": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "4": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "8": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "16": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "24": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "32": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "48": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "64": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "96": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "128": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "256": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "512": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 8,
+ "num_stages": 3
+ }
+}
\ No newline at end of file
diff --git a/vllm/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json b/vllm/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json
new file mode 100644
index 0000000000000..96cbc111c7fff
--- /dev/null
+++ b/vllm/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json
@@ -0,0 +1,146 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "2": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "4": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "8": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "16": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "24": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "32": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "48": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "64": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "96": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "128": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "256": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "512": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 8,
+ "num_stages": 4
+ }
+}
\ No newline at end of file
diff --git a/vllm/model_executor/layers/fused_moe/configs/README b/vllm/model_executor/layers/fused_moe/configs/README
index 45d40cbfb1a2e..787bd06116646 100644
--- a/vllm/model_executor/layers/fused_moe/configs/README
+++ b/vllm/model_executor/layers/fused_moe/configs/README
@@ -8,3 +8,6 @@ the JSON file contains a mapping from M (batch size) to the chosen configuration
The example configurations provided are for the Mixtral model for TP2 on H100
and TP4 on A100. Mixtral has intermediate size N = 14336, i.e. for TP2 we have
N = 7168 and for TP4 we have N = 3584.
+
+Please feel free to tune the configurations using the script `benchmarks/kernels/benchmark_moe.py`.
+Some of the configuration files were copied from the SGLang repository. Thank you!
diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json
new file mode 100644
index 0000000000000..3e8ebf3f7301c
--- /dev/null
+++ b/vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json
@@ -0,0 +1,146 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "2": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "4": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "8": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "16": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "24": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "32": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "48": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "64": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "96": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "128": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "256": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "512": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 8,
+ "num_stages": 5
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ }
+}
\ No newline at end of file
diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json
new file mode 100644
index 0000000000000..2bb5b457d774a
--- /dev/null
+++ b/vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json
@@ -0,0 +1,146 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "2": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "4": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "8": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "16": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "24": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "32": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "48": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "64": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "96": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "128": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "256": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "512": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ }
+}
\ No newline at end of file
diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json
new file mode 100644
index 0000000000000..6e2aeee9b75c2
--- /dev/null
+++ b/vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json
@@ -0,0 +1,146 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "2": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "4": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "8": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "16": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "24": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "32": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 8,
+ "num_stages": 5
+ },
+ "48": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "64": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "96": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "128": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "256": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "512": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ }
+}
\ No newline at end of file
diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json
new file mode 100644
index 0000000000000..b0f9442a6aaa8
--- /dev/null
+++ b/vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json
@@ -0,0 +1,26 @@
+{
+ "2048": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ }
+}
\ No newline at end of file
diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json
new file mode 100644
index 0000000000000..bee8d03ba47cf
--- /dev/null
+++ b/vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json
@@ -0,0 +1,146 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "2": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "4": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "8": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 8,
+ "num_stages": 5
+ },
+ "16": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "24": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "32": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "48": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "64": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "96": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "128": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "256": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "512": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ }
+}
\ No newline at end of file
diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json
new file mode 100644
index 0000000000000..9da876d3ccb43
--- /dev/null
+++ b/vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json
@@ -0,0 +1,146 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "2": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "4": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "8": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "16": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "24": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "32": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "48": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "64": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "96": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "128": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "256": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "512": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ }
+}
\ No newline at end of file
diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json
new file mode 100644
index 0000000000000..0a1a252a5e032
--- /dev/null
+++ b/vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json
@@ -0,0 +1,146 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "2": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "4": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "8": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "16": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "24": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "32": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "48": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "64": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "96": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "128": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "256": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "512": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ }
+}
\ No newline at end of file
diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json
new file mode 100644
index 0000000000000..d6279a1e37b6f
--- /dev/null
+++ b/vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json
@@ -0,0 +1,26 @@
+{
+ "2048": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ }
+}
\ No newline at end of file
diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json
new file mode 100644
index 0000000000000..3bc003647cda8
--- /dev/null
+++ b/vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json
@@ -0,0 +1,146 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "2": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "4": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "8": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "16": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "24": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "32": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "48": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "64": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "96": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "128": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "256": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "512": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ }
+}
\ No newline at end of file
diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json
new file mode 100644
index 0000000000000..310dff4635c28
--- /dev/null
+++ b/vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json
@@ -0,0 +1,146 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 2
+ },
+ "2": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "4": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "8": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "16": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "24": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 8,
+ "num_stages": 5
+ },
+ "32": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "48": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "64": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 8,
+ "num_stages": 2
+ },
+ "96": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 8,
+ "num_stages": 5
+ },
+ "128": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "256": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "512": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ }
+}
\ No newline at end of file
diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json
new file mode 100644
index 0000000000000..206c8a2bac667
--- /dev/null
+++ b/vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json
@@ -0,0 +1,146 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "2": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "4": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "8": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "16": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "24": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "32": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "48": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 8,
+ "num_stages": 5
+ },
+ "64": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 8,
+ "num_stages": 2
+ },
+ "96": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "128": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "256": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 8,
+ "num_stages": 5
+ },
+ "512": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 2
+ }
+}
\ No newline at end of file
diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json
new file mode 100644
index 0000000000000..edc23530ea745
--- /dev/null
+++ b/vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json
@@ -0,0 +1,26 @@
+{
+ "2048": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ }
+}
\ No newline at end of file
diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json
new file mode 100644
index 0000000000000..43b5bdbdff5db
--- /dev/null
+++ b/vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json
@@ -0,0 +1,146 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "2": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "4": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "8": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "16": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "24": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "32": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "48": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "64": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "96": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "128": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "256": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "512": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ }
+}
\ No newline at end of file
diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json
new file mode 100644
index 0000000000000..bffa749724ad3
--- /dev/null
+++ b/vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json
@@ -0,0 +1,146 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "2": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "4": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "8": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "16": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "24": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "32": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "48": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "64": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "96": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "128": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "256": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "512": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ }
+}
\ No newline at end of file
diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json
new file mode 100644
index 0000000000000..f96f12787f6fb
--- /dev/null
+++ b/vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json
@@ -0,0 +1,146 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "2": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "4": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "8": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "16": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "24": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "32": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "48": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "64": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "96": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "128": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "256": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "512": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ }
+}
\ No newline at end of file
diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json
new file mode 100644
index 0000000000000..fe3e18cf01aa1
--- /dev/null
+++ b/vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json
@@ -0,0 +1,26 @@
+{
+ "2048": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ }
+}
\ No newline at end of file
diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json
new file mode 100644
index 0000000000000..e4d5b2dd02a8c
--- /dev/null
+++ b/vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json
@@ -0,0 +1,146 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "2": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "4": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "8": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 8,
+ "num_stages": 5
+ },
+ "16": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "24": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 8,
+ "num_stages": 5
+ },
+ "32": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "48": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "64": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "96": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "128": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "256": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "512": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ }
+}
\ No newline at end of file
diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json
new file mode 100644
index 0000000000000..137b9ddaca305
--- /dev/null
+++ b/vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json
@@ -0,0 +1,146 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "2": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "4": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "8": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "16": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "24": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "32": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "48": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "64": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "96": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "128": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "256": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "512": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ }
+}
\ No newline at end of file
diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json
new file mode 100644
index 0000000000000..38cac4690a8a6
--- /dev/null
+++ b/vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json
@@ -0,0 +1,146 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 8,
+ "num_stages": 5
+ },
+ "2": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 8,
+ "num_stages": 5
+ },
+ "4": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "8": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "16": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "24": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "32": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "48": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "64": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "96": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "128": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "256": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "512": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ }
+}
\ No newline at end of file
diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json
new file mode 100644
index 0000000000000..8e6ebe21fc3c6
--- /dev/null
+++ b/vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json
@@ -0,0 +1,146 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "2": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "4": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "8": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "16": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "24": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "32": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "48": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "64": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "96": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "128": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "256": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "512": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ }
+}
\ No newline at end of file
diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json
new file mode 100644
index 0000000000000..1225d847b7d5e
--- /dev/null
+++ b/vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json
@@ -0,0 +1,26 @@
+{
+ "2048": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ }
+}
\ No newline at end of file
diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json
new file mode 100644
index 0000000000000..d44e38438c9f6
--- /dev/null
+++ b/vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json
@@ -0,0 +1,146 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "2": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "4": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "8": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "16": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "24": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "32": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "48": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "64": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "96": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "128": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "256": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "512": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 2
+ }
+}
\ No newline at end of file
diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json
new file mode 100644
index 0000000000000..c559a69a77eed
--- /dev/null
+++ b/vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json
@@ -0,0 +1,146 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "2": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "4": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "8": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "16": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "24": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "32": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "48": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "64": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "96": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "128": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "256": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "512": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ }
+}
\ No newline at end of file
diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json
new file mode 100644
index 0000000000000..8ec2005f02e88
--- /dev/null
+++ b/vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json
@@ -0,0 +1,146 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "2": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "4": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "8": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "16": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "24": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "32": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "48": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "64": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "96": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "128": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "256": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "512": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ }
+}
\ No newline at end of file
diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json
new file mode 100644
index 0000000000000..65840aa538bc6
--- /dev/null
+++ b/vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json
@@ -0,0 +1,26 @@
+{
+ "2048": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 5
+ }
+}
\ No newline at end of file
diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json
new file mode 100644
index 0000000000000..4e120d6d08432
--- /dev/null
+++ b/vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json
@@ -0,0 +1,146 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "2": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "4": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "8": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "16": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "24": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "32": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "48": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "64": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "96": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "128": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "256": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "512": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ }
+}
\ No newline at end of file
diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json
new file mode 100644
index 0000000000000..5c298746788d9
--- /dev/null
+++ b/vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json
@@ -0,0 +1,146 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "2": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "4": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "8": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "16": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "24": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "32": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "48": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "64": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "96": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "128": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "256": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "512": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ }
+}
\ No newline at end of file
diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json
new file mode 100644
index 0000000000000..4990268b2a9eb
--- /dev/null
+++ b/vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json
@@ -0,0 +1,146 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "2": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 2
+ },
+ "4": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 8,
+ "num_stages": 5
+ },
+ "8": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "16": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "24": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "32": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "48": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "64": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "96": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "128": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "256": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "512": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ }
+}
\ No newline at end of file
diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json
new file mode 100644
index 0000000000000..18afdd96fbfb2
--- /dev/null
+++ b/vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json
@@ -0,0 +1,146 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "2": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "4": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "8": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "16": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 8,
+ "num_stages": 2
+ },
+ "24": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "32": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "48": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "64": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "96": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "128": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "256": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "512": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ }
+}
\ No newline at end of file
diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json
new file mode 100644
index 0000000000000..51d10bb0ee1a4
--- /dev/null
+++ b/vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json
@@ -0,0 +1,146 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "2": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "4": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "8": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "16": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "24": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "32": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "48": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "64": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "96": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "128": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "256": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "512": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ }
+}
\ No newline at end of file
diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json
new file mode 100644
index 0000000000000..1480e09293213
--- /dev/null
+++ b/vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json
@@ -0,0 +1,146 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "2": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 8,
+ "num_stages": 2
+ },
+ "4": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "8": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "16": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "24": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "32": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "48": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "64": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "96": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "128": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "256": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "512": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ }
+}
\ No newline at end of file
diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json
new file mode 100644
index 0000000000000..6bd350c388972
--- /dev/null
+++ b/vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json
@@ -0,0 +1,26 @@
+{
+ "2048": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ }
+}
\ No newline at end of file
diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json
new file mode 100644
index 0000000000000..2b9f0d1ec64ed
--- /dev/null
+++ b/vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json
@@ -0,0 +1,146 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "2": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "4": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "8": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "16": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "24": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "32": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "48": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "64": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "96": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "128": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "256": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "512": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ }
+}
\ No newline at end of file
diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json
new file mode 100644
index 0000000000000..d979c6b66d048
--- /dev/null
+++ b/vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json
@@ -0,0 +1,146 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "2": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "4": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "8": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "16": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "24": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "32": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "48": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "64": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "96": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "128": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "256": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "512": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ }
+}
\ No newline at end of file
diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json
new file mode 100644
index 0000000000000..6eb22deb8dd2b
--- /dev/null
+++ b/vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json
@@ -0,0 +1,146 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "2": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "4": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "8": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "16": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "24": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "32": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "48": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "64": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "96": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "128": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "256": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "512": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ }
+}
\ No newline at end of file
diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json
new file mode 100644
index 0000000000000..c746e7080522d
--- /dev/null
+++ b/vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json
@@ -0,0 +1,146 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "2": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 8,
+ "num_stages": 5
+ },
+ "4": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "8": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "16": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "24": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 8,
+ "num_stages": 2
+ },
+ "32": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "48": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "64": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 8,
+ "num_stages": 2
+ },
+ "96": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "128": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "256": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "512": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ }
+}
\ No newline at end of file
diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json
new file mode 100644
index 0000000000000..0b4746ceeb61d
--- /dev/null
+++ b/vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json
@@ -0,0 +1,146 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "2": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "4": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "8": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "16": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "24": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "32": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "48": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "64": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "96": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "128": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "256": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "512": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ }
+}
\ No newline at end of file
diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json
new file mode 100644
index 0000000000000..8ec2005f02e88
--- /dev/null
+++ b/vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json
@@ -0,0 +1,146 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "2": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "4": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "8": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "16": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "24": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "32": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "48": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "64": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "96": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "128": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "256": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "512": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ }
+}
\ No newline at end of file
diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json
new file mode 100644
index 0000000000000..202acf23f8ca7
--- /dev/null
+++ b/vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json
@@ -0,0 +1,146 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "2": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "4": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "8": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "16": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "24": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "32": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "48": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "64": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "96": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "128": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "256": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "512": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ }
+}
\ No newline at end of file
diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json
new file mode 100644
index 0000000000000..11a9bceb77c85
--- /dev/null
+++ b/vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json
@@ -0,0 +1,26 @@
+{
+ "2048": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ }
+}
\ No newline at end of file
diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json
new file mode 100644
index 0000000000000..386ee59beae38
--- /dev/null
+++ b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json
@@ -0,0 +1,146 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "2": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "4": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "8": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "16": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "24": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "32": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "48": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "64": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "96": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "128": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "256": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "512": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ }
+}
\ No newline at end of file
diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json
new file mode 100644
index 0000000000000..60df5e33eed5d
--- /dev/null
+++ b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json
@@ -0,0 +1,146 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "2": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "4": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "8": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "16": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "24": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "32": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "48": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "64": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "96": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "128": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "256": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "512": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ }
+}
\ No newline at end of file
diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json
new file mode 100644
index 0000000000000..4f1747b81f58e
--- /dev/null
+++ b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json
@@ -0,0 +1,146 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "2": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "4": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "8": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "16": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "24": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "32": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "48": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "64": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "96": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "128": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "256": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "512": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ }
+}
\ No newline at end of file
diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json
new file mode 100644
index 0000000000000..53bbaca407af6
--- /dev/null
+++ b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json
@@ -0,0 +1,26 @@
+{
+ "2048": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2
+ }
+}
\ No newline at end of file
diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json
new file mode 100644
index 0000000000000..ffe67dcf48c23
--- /dev/null
+++ b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json
@@ -0,0 +1,146 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "2": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "4": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "8": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "16": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "24": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "32": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "48": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "64": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "96": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "128": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "256": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "512": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ }
+}
\ No newline at end of file
diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json
new file mode 100644
index 0000000000000..2a17e164e9ec7
--- /dev/null
+++ b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json
@@ -0,0 +1,146 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "2": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "4": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "8": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "16": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "24": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "32": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "48": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "64": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "96": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "128": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "256": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "512": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ }
+}
\ No newline at end of file
diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json
new file mode 100644
index 0000000000000..b259993b617c3
--- /dev/null
+++ b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json
@@ -0,0 +1,146 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "2": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "4": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "8": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "16": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "24": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "32": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "48": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "64": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "96": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "128": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "256": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "512": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ }
+}
\ No newline at end of file
diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json
new file mode 100644
index 0000000000000..a71ab88d43c1e
--- /dev/null
+++ b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json
@@ -0,0 +1,26 @@
+{
+ "2048": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ }
+}
\ No newline at end of file
diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json
new file mode 100644
index 0000000000000..eda96e76cb6d9
--- /dev/null
+++ b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json
@@ -0,0 +1,146 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "2": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "4": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "8": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "16": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "24": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "32": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "48": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 8,
+ "num_stages": 5
+ },
+ "64": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "96": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "128": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "256": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "512": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 2
+ }
+}
\ No newline at end of file
diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json
new file mode 100644
index 0000000000000..bd0767b5ef66f
--- /dev/null
+++ b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json
@@ -0,0 +1,146 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 8,
+ "num_stages": 2
+ },
+ "2": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "4": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "8": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 8,
+ "num_stages": 2
+ },
+ "16": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "24": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 2
+ },
+ "32": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 2
+ },
+ "48": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "64": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "96": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 8,
+ "num_stages": 2
+ },
+ "128": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 8,
+ "num_stages": 5
+ },
+ "256": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "512": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 2
+ }
+}
\ No newline at end of file
diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json
new file mode 100644
index 0000000000000..29f7651876940
--- /dev/null
+++ b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json
@@ -0,0 +1,146 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 2
+ },
+ "2": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "4": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 8,
+ "num_stages": 5
+ },
+ "8": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 8,
+ "num_stages": 2
+ },
+ "16": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "24": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 8,
+ "num_stages": 3
+ },
+ "32": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "48": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "64": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "96": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "128": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "256": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "512": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 2
+ }
+}
\ No newline at end of file
diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json
new file mode 100644
index 0000000000000..6db13852c9d4e
--- /dev/null
+++ b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json
@@ -0,0 +1,26 @@
+{
+ "2048": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2
+ }
+}
\ No newline at end of file
diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json
new file mode 100644
index 0000000000000..1a47cae9e17bd
--- /dev/null
+++ b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json
@@ -0,0 +1,146 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "2": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "4": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "8": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "16": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "24": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "32": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "48": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "64": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "96": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "128": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "256": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "512": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ }
+}
\ No newline at end of file
diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json
new file mode 100644
index 0000000000000..8dd5ae5c49715
--- /dev/null
+++ b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json
@@ -0,0 +1,146 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "2": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "4": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "8": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "16": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "24": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "32": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "48": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "64": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "96": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "128": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "256": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "512": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ }
+}
\ No newline at end of file
diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json
new file mode 100644
index 0000000000000..6d1a8b56a2831
--- /dev/null
+++ b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json
@@ -0,0 +1,146 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "2": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "4": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "8": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "16": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "24": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "32": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "48": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "64": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "96": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "128": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "256": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "512": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ }
+}
\ No newline at end of file
diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json
new file mode 100644
index 0000000000000..e77abaf396831
--- /dev/null
+++ b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json
@@ -0,0 +1,146 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "2": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "4": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "8": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "16": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "24": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "32": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "48": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "64": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "96": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "128": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "256": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "512": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ }
+}
\ No newline at end of file
diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json
new file mode 100644
index 0000000000000..01327b2c4f907
--- /dev/null
+++ b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json
@@ -0,0 +1,26 @@
+{
+ "2048": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ }
+}
\ No newline at end of file
diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json
new file mode 100644
index 0000000000000..12eea5fb6687a
--- /dev/null
+++ b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json
@@ -0,0 +1,146 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "2": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "4": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "8": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "16": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "24": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "32": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 8,
+ "num_stages": 5
+ },
+ "48": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "64": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "96": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "128": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "256": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "512": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ }
+}
\ No newline at end of file
diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json
new file mode 100644
index 0000000000000..9db9daece8c18
--- /dev/null
+++ b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128,128].json
@@ -0,0 +1,146 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "2": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "4": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "8": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "16": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "24": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "32": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "48": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "64": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "96": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "128": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "256": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "512": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ }
+}
\ No newline at end of file
diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json
new file mode 100644
index 0000000000000..365f8d0d8abc0
--- /dev/null
+++ b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json
@@ -0,0 +1,146 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "2": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "4": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "8": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "16": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "24": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "32": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "48": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "64": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "96": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "128": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "256": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "512": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ }
+}
\ No newline at end of file
diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json
new file mode 100644
index 0000000000000..f080ea5da7dd1
--- /dev/null
+++ b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json
@@ -0,0 +1,146 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "2": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "4": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "8": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "16": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "24": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "32": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "48": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "64": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "96": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "128": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "256": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "512": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ }
+}
\ No newline at end of file
diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json
new file mode 100644
index 0000000000000..e9bf04442a91f
--- /dev/null
+++ b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128,128].json
@@ -0,0 +1,26 @@
+{
+ "2048": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ }
+}
\ No newline at end of file
diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json
new file mode 100644
index 0000000000000..c37aced26e8d5
--- /dev/null
+++ b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json
@@ -0,0 +1,146 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "2": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "4": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "8": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "16": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "24": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "32": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "48": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "64": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "96": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "128": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "256": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "512": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ }
+}
\ No newline at end of file
diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json
new file mode 100644
index 0000000000000..d6bef7f60c614
--- /dev/null
+++ b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json
@@ -0,0 +1,146 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "2": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "4": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "8": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "16": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "24": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "32": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "48": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "64": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "96": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "128": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "256": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "512": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 3
+ }
+}
\ No newline at end of file
diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json
new file mode 100644
index 0000000000000..8df6e4b6e5dc8
--- /dev/null
+++ b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128,128].json
@@ -0,0 +1,146 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 8,
+ "num_stages": 5
+ },
+ "2": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 8,
+ "num_stages": 5
+ },
+ "4": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "8": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "16": {
+ "BLOCK_SIZE_M": 32,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "24": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 5
+ },
+ "32": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "48": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "64": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 8,
+ "num_stages": 2
+ },
+ "96": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "128": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "256": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "512": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 4,
+ "num_stages": 3
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 2
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 3
+ }
+}
\ No newline at end of file
From b82662d9523d9aa1386d8d1de410426781a1fa3b Mon Sep 17 00:00:00 2001
From: Nick Hill
Date: Sat, 15 Mar 2025 20:26:19 -0700
Subject: [PATCH 05/34] [BugFix] Fix torch distributed stateless PG backend
init (#14870)
Signed-off-by: Nick Hill
---
examples/offline_inference/data_parallel.py | 5 +++++
vllm/distributed/utils.py | 6 +++---
2 files changed, 8 insertions(+), 3 deletions(-)
diff --git a/examples/offline_inference/data_parallel.py b/examples/offline_inference/data_parallel.py
index b00519314d8bd..b73770ce382cf 100644
--- a/examples/offline_inference/data_parallel.py
+++ b/examples/offline_inference/data_parallel.py
@@ -76,5 +76,10 @@ if __name__ == "__main__":
GPUs_per_dp_rank))
proc.start()
procs.append(proc)
+ exit_code = 0
for proc in procs:
proc.join()
+ if proc.exitcode:
+ exit_code = proc.exitcode
+
+ exit(exit_code)
diff --git a/vllm/distributed/utils.py b/vllm/distributed/utils.py
index 25202062e9757..84899358a6d66 100644
--- a/vllm/distributed/utils.py
+++ b/vllm/distributed/utils.py
@@ -299,13 +299,10 @@ def stateless_init_torch_distributed_process_group(
# different systems (e.g. RPC) in case the store is multi-tenant.
prefix_store = PrefixStore(init_method, store)
- pg_options = ProcessGroup.Options(backend=backend, timeout=timeout)
-
pg: ProcessGroup = ProcessGroup(
prefix_store,
group_rank,
group_size,
- pg_options,
)
if backend == "gloo":
@@ -327,7 +324,10 @@ def stateless_init_torch_distributed_process_group(
backend_options)
backend_type = ProcessGroup.BackendType.NCCL
device = torch.device("cuda")
+ else:
+ raise RuntimeError(f"Unsupported torch distributed backend: {backend}")
+ pg._set_default_backend(backend_type)
backend_class._set_sequence_number_for_group()
pg._register_backend(device, backend_type, backend_class)
From d1ad2a57af72fb4c9bb4b6c7cfc58e0159693fc6 Mon Sep 17 00:00:00 2001
From: Lily Liu
Date: Sun, 16 Mar 2025 00:29:22 -0700
Subject: [PATCH 06/34] [V1] [Spec Decode] Fix ngram tests (#14878)
---
tests/v1/spec_decode/test_ngram.py | 55 ++++++++++++++++--------------
1 file changed, 30 insertions(+), 25 deletions(-)
diff --git a/tests/v1/spec_decode/test_ngram.py b/tests/v1/spec_decode/test_ngram.py
index ec663c84d0d2a..2c2e125ade48c 100644
--- a/tests/v1/spec_decode/test_ngram.py
+++ b/tests/v1/spec_decode/test_ngram.py
@@ -1,32 +1,37 @@
# SPDX-License-Identifier: Apache-2.0
-import pytest
-from vllm.v1.spec_decode.ngram_proposer import NgramProposer
-from vllm.v1.utils import ConstantList
+import numpy as np
+
+from vllm.v1.spec_decode.ngram_proposer import (_find_subarray_kmp,
+ _kmp_lps_array)
-@pytest.fixture
-def proposer():
- return NgramProposer()
+def test_kmp_lps_array():
+ np.testing.assert_array_equal(_kmp_lps_array(np.array([])), np.array([]))
+ np.testing.assert_array_equal(_kmp_lps_array(np.array([1])), np.array([0]))
+ np.testing.assert_array_equal(_kmp_lps_array(np.array([1, 1, 1])),
+ np.array([0, 1, 2]))
+ np.testing.assert_array_equal(_kmp_lps_array(np.array([1, 2, 3, 4])),
+ np.array([0, 0, 0, 0]))
+ np.testing.assert_array_equal(_kmp_lps_array(np.array([1, 2, 1, 2, 3])),
+ np.array([0, 0, 1, 2, 0]))
-def test_kmp_lps_array(proposer):
- assert proposer._kmp_lps_array([]) == []
- assert proposer._kmp_lps_array([1]) == [0]
- assert proposer._kmp_lps_array([1, 1, 1]) == [0, 1, 2]
- assert proposer._kmp_lps_array([1, 2, 3, 4]) == [0, 0, 0, 0]
- assert proposer._kmp_lps_array([1, 2, 1, 2, 3]) == [0, 0, 1, 2, 0]
-
-
-def test_find_subarray_kmp(proposer):
- X = ConstantList([1, 2, 3, 4, 1, 2, 3, 5, 6])
- assert proposer._find_subarray_kmp(X, 2, 2) is None
- X = ConstantList([1, 2, 3, 4, 1, 2, 3])
- assert proposer._find_subarray_kmp(X, 2, 3) == [4, 1, 2]
- assert proposer._find_subarray_kmp(X, 2, 2) == [4, 1]
- assert proposer._find_subarray_kmp(X, 1, 3) == [4, 1, 2]
- assert proposer._find_subarray_kmp(X, 1, 2) == [4, 1]
- X = ConstantList([1, 3, 6, 2, 3, 4, 1, 2, 3])
- assert proposer._find_subarray_kmp(X, 2, 3) == [4, 1, 2]
+def test_find_subarray_kmp():
+ X = np.array([1, 2, 3, 4, 1, 2, 3, 5, 6])
+ assert _find_subarray_kmp(X, 2, 2) is None
+ X = np.array([1, 2, 3, 4, 1, 2, 3])
+ np.testing.assert_array_equal(_find_subarray_kmp(X, 2, 3),
+ np.array([4, 1, 2]))
+ np.testing.assert_array_equal(_find_subarray_kmp(X, 2, 2), np.array([4,
+ 1]))
+ np.testing.assert_array_equal(_find_subarray_kmp(X, 1, 3),
+ np.array([4, 1, 2]))
+ np.testing.assert_array_equal(_find_subarray_kmp(X, 1, 2), np.array([4,
+ 1]))
+ X = np.array([1, 3, 6, 2, 3, 4, 1, 2, 3])
+ np.testing.assert_array_equal(_find_subarray_kmp(X, 2, 3),
+ np.array([4, 1, 2]))
# Return on the first match
- assert proposer._find_subarray_kmp(X, 1, 3) == [6, 2, 3]
\ No newline at end of file
+ np.testing.assert_array_equal(_find_subarray_kmp(X, 1, 3),
+ np.array([6, 2, 3]))
From d30aa7e9e6afd6147865c8c9fae8cd21f5ddce3d Mon Sep 17 00:00:00 2001
From: Kyle Sayers
Date: Sun, 16 Mar 2025 10:44:19 -0400
Subject: [PATCH 07/34] [Bugfix] Limit profiling run sequence length by
max_model_len (#14785)
Signed-off-by: Kyle Sayers
---
vllm/inputs/registry.py | 5 +++++
vllm/worker/enc_dec_model_runner.py | 1 +
vllm/worker/model_runner.py | 1 +
vllm/worker/openvino_model_runner.py | 1 +
vllm/worker/xpu_model_runner.py | 1 +
5 files changed, 9 insertions(+)
diff --git a/vllm/inputs/registry.py b/vllm/inputs/registry.py
index b6ceb5fb82d70..24980833864b0 100644
--- a/vllm/inputs/registry.py
+++ b/vllm/inputs/registry.py
@@ -330,6 +330,11 @@ class InputRegistry:
from vllm.multimodal import MultiModalKwargs
from vllm.multimodal.profiling import MultiModalProfiler
+ if seq_len > model_config.max_model_len:
+ raise AssertionError(
+ f"Profiling attempted with sequence length ({seq_len}) "
+ f"greater than model length ({model_config.max_model_len})")
+
if mm_registry.has_processor(model_config):
tokenizer = cached_tokenizer_from_config(model_config)
processor = mm_registry.create_processor(model_config,
diff --git a/vllm/worker/enc_dec_model_runner.py b/vllm/worker/enc_dec_model_runner.py
index 5f39f2fa4947c..f34597ac05db4 100644
--- a/vllm/worker/enc_dec_model_runner.py
+++ b/vllm/worker/enc_dec_model_runner.py
@@ -281,6 +281,7 @@ class EncoderDecoderModelRunner(GPUModelRunnerBase[EncoderDecoderModelInput]):
for group_id in range(max_num_seqs):
seq_len = (max_num_batched_tokens // max_num_seqs +
(group_id < max_num_batched_tokens % max_num_seqs))
+ seq_len = min(seq_len, self.model_config.max_model_len)
batch_size += seq_len
decoder_dummy_data = self.input_registry \
diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py
index 473bd901b5b23..3181483fe8390 100644
--- a/vllm/worker/model_runner.py
+++ b/vllm/worker/model_runner.py
@@ -1302,6 +1302,7 @@ class GPUModelRunnerBase(ModelRunnerBase[TModelInputForGPU]):
for group_id in range(max_num_seqs):
seq_len = (max_num_batched_tokens // max_num_seqs +
(group_id < max_num_batched_tokens % max_num_seqs))
+ seq_len = min(seq_len, self.model_config.max_model_len)
batch_size += seq_len
dummy_data = self.input_registry \
diff --git a/vllm/worker/openvino_model_runner.py b/vllm/worker/openvino_model_runner.py
index aa1d2cbb2df29..9b484a9f543fe 100644
--- a/vllm/worker/openvino_model_runner.py
+++ b/vllm/worker/openvino_model_runner.py
@@ -148,6 +148,7 @@ class OpenVINOModelRunner(ModelRunnerBase):
seq_len = min(
seq_data.get_len(),
computed_len + seq_group_metadata.token_chunk_size,
+ self.model_config.max_model_len,
)
if is_prompt:
tokens = seq_data.get_token_ids()[computed_len:seq_len]
diff --git a/vllm/worker/xpu_model_runner.py b/vllm/worker/xpu_model_runner.py
index 39957e661c474..2103260d8900c 100644
--- a/vllm/worker/xpu_model_runner.py
+++ b/vllm/worker/xpu_model_runner.py
@@ -466,6 +466,7 @@ class XPUModelRunner(ModelRunnerBase[ModelInputForXPUWithSamplingMetadata]):
for group_id in range(max_num_seqs):
seq_len = (max_num_batched_tokens // max_num_seqs +
(group_id < max_num_batched_tokens % max_num_seqs))
+ seq_len = min(seq_len, self.model_config.max_model_len)
batch_size += seq_len
dummy_data = self.input_registry \
From e53b1350f289d65011d9251fd826646c169018df Mon Sep 17 00:00:00 2001
From: Cyrus Leung
Date: Mon, 17 Mar 2025 00:05:40 +0800
Subject: [PATCH 08/34] [Bugfix] Explicitly disable Phi-4-multimodal in V1
(#14889)
Signed-off-by: DarkLight1337
---
vllm/model_executor/models/phi4mm.py | 5 +++--
1 file changed, 3 insertions(+), 2 deletions(-)
diff --git a/vllm/model_executor/models/phi4mm.py b/vllm/model_executor/models/phi4mm.py
index 7250aaba557eb..3d4505d556e2c 100644
--- a/vllm/model_executor/models/phi4mm.py
+++ b/vllm/model_executor/models/phi4mm.py
@@ -33,7 +33,7 @@ from vllm.sequence import IntermediateTensors, SequenceData
from vllm.transformers_utils.tokenizer import cached_tokenizer_from_config
from .idefics2_vision_model import Idefics2VisionTransformer
-from .interfaces import SupportsLoRA, SupportsMultiModal
+from .interfaces import SupportsLoRA, SupportsMultiModal, SupportsV0Only
from .phi4mm_audio import AudioEmbedding
from .utils import AutoWeightsLoader, WeightsMapper, maybe_prefix
@@ -1433,7 +1433,8 @@ def cat_with_pad(tensors, dim, padding_value=0):
"image", get_max_phi4mm_image_tokens)
@INPUT_REGISTRY.register_dummy_data(dummy_data_for_phi4mm)
@INPUT_REGISTRY.register_input_processor(input_processor_for_phi4mm)
-class Phi4MMForCausalLM(nn.Module, SupportsLoRA, SupportsMultiModal):
+class Phi4MMForCausalLM(nn.Module, SupportsLoRA, SupportsMultiModal,
+ SupportsV0Only):
"""
Implements the Phi-4-multimodal-instruct model in vLLM.
"""
From f6137adbcbbdea8b5023a66480de921b558bef83 Mon Sep 17 00:00:00 2001
From: Cyrus Leung
Date: Mon, 17 Mar 2025 00:13:46 +0800
Subject: [PATCH 09/34] Revert "[Bugfix] Limit profiling run sequence length by
 max_model_len (#14785)" (#14892)
Signed-off-by: DarkLight1337
---
vllm/inputs/registry.py | 5 -----
vllm/worker/enc_dec_model_runner.py | 1 -
vllm/worker/model_runner.py | 1 -
vllm/worker/openvino_model_runner.py | 1 -
vllm/worker/xpu_model_runner.py | 1 -
5 files changed, 9 deletions(-)
diff --git a/vllm/inputs/registry.py b/vllm/inputs/registry.py
index 24980833864b0..b6ceb5fb82d70 100644
--- a/vllm/inputs/registry.py
+++ b/vllm/inputs/registry.py
@@ -330,11 +330,6 @@ class InputRegistry:
from vllm.multimodal import MultiModalKwargs
from vllm.multimodal.profiling import MultiModalProfiler
- if seq_len > model_config.max_model_len:
- raise AssertionError(
- f"Profiling attempted with sequence length ({seq_len}) "
- f"greater than model length ({model_config.max_model_len})")
-
if mm_registry.has_processor(model_config):
tokenizer = cached_tokenizer_from_config(model_config)
processor = mm_registry.create_processor(model_config,
diff --git a/vllm/worker/enc_dec_model_runner.py b/vllm/worker/enc_dec_model_runner.py
index f34597ac05db4..5f39f2fa4947c 100644
--- a/vllm/worker/enc_dec_model_runner.py
+++ b/vllm/worker/enc_dec_model_runner.py
@@ -281,7 +281,6 @@ class EncoderDecoderModelRunner(GPUModelRunnerBase[EncoderDecoderModelInput]):
for group_id in range(max_num_seqs):
seq_len = (max_num_batched_tokens // max_num_seqs +
(group_id < max_num_batched_tokens % max_num_seqs))
- seq_len = min(seq_len, self.model_config.max_model_len)
batch_size += seq_len
decoder_dummy_data = self.input_registry \
diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py
index 3181483fe8390..473bd901b5b23 100644
--- a/vllm/worker/model_runner.py
+++ b/vllm/worker/model_runner.py
@@ -1302,7 +1302,6 @@ class GPUModelRunnerBase(ModelRunnerBase[TModelInputForGPU]):
for group_id in range(max_num_seqs):
seq_len = (max_num_batched_tokens // max_num_seqs +
(group_id < max_num_batched_tokens % max_num_seqs))
- seq_len = min(seq_len, self.model_config.max_model_len)
batch_size += seq_len
dummy_data = self.input_registry \
diff --git a/vllm/worker/openvino_model_runner.py b/vllm/worker/openvino_model_runner.py
index 9b484a9f543fe..aa1d2cbb2df29 100644
--- a/vllm/worker/openvino_model_runner.py
+++ b/vllm/worker/openvino_model_runner.py
@@ -148,7 +148,6 @@ class OpenVINOModelRunner(ModelRunnerBase):
seq_len = min(
seq_data.get_len(),
computed_len + seq_group_metadata.token_chunk_size,
- self.model_config.max_model_len,
)
if is_prompt:
tokens = seq_data.get_token_ids()[computed_len:seq_len]
diff --git a/vllm/worker/xpu_model_runner.py b/vllm/worker/xpu_model_runner.py
index 2103260d8900c..39957e661c474 100644
--- a/vllm/worker/xpu_model_runner.py
+++ b/vllm/worker/xpu_model_runner.py
@@ -466,7 +466,6 @@ class XPUModelRunner(ModelRunnerBase[ModelInputForXPUWithSamplingMetadata]):
for group_id in range(max_num_seqs):
seq_len = (max_num_batched_tokens // max_num_seqs +
(group_id < max_num_batched_tokens % max_num_seqs))
- seq_len = min(seq_len, self.model_config.max_model_len)
batch_size += seq_len
dummy_data = self.input_registry \
From fc1f67715d95f24885288b75c736cc1fc1be0103 Mon Sep 17 00:00:00 2001
From: Nick Hill
Date: Sun, 16 Mar 2025 14:53:34 -0700
Subject: [PATCH 10/34] [BugFix][V1] Fix overhead related to bad_words sampling
when not in use (#14894)
Signed-off-by: Nick Hill
---
tests/v1/worker/test_gpu_input_batch.py | 5 +++--
vllm/sampling_params.py | 7 ++++---
vllm/v1/worker/gpu_input_batch.py | 5 +++--
3 files changed, 10 insertions(+), 7 deletions(-)
diff --git a/tests/v1/worker/test_gpu_input_batch.py b/tests/v1/worker/test_gpu_input_batch.py
index 192ddefe102d2..2486c26c6071a 100644
--- a/tests/v1/worker/test_gpu_input_batch.py
+++ b/tests/v1/worker/test_gpu_input_batch.py
@@ -124,8 +124,9 @@ def _construct_expected_sampling_metadata(
if req.sampling_params.allowed_token_ids:
allowed_token_ids_mask[index_in_input_batch][
req.sampling_params.allowed_token_ids] = True
- bad_words_token_ids[
- index_in_input_batch] = req.sampling_params.bad_words_token_ids
+ if req.sampling_params.bad_words_token_ids:
+ bad_words_token_ids[
+ index_in_input_batch] = req.sampling_params.bad_words_token_ids
return SamplingMetadata(
temperature=torch.tensor(temperature, dtype=torch.float,
diff --git a/vllm/sampling_params.py b/vllm/sampling_params.py
index b0a5777cc8d56..9b474a37b96b6 100644
--- a/vllm/sampling_params.py
+++ b/vllm/sampling_params.py
@@ -235,7 +235,7 @@ class SamplingParams(
# Fields used for bad words
bad_words: Optional[list[str]] = None
- _bad_words_token_ids: list[list[int]] = msgspec.field(default_factory=list)
+ _bad_words_token_ids: Optional[list[list[int]]] = None
@staticmethod
def from_optional(
@@ -464,8 +464,9 @@ class SamplingParams(
self.stop_token_ids = list(eos_ids)
def update_from_tokenizer(self, tokenizer: AnyTokenizer) -> None:
- if self.bad_words is None:
+ if not self.bad_words:
return
+ self._bad_words_token_ids = []
for bad_word in self.bad_words:
# To prohibit words both at the beginning
# and in the middle of text
@@ -516,7 +517,7 @@ class SamplingParams(
return self._all_stop_token_ids
@property
- def bad_words_token_ids(self) -> list[list[int]]:
+ def bad_words_token_ids(self) -> Optional[list[list[int]]]:
# For internal use only. Backward compatibility not guaranteed
return self._bad_words_token_ids
diff --git a/vllm/v1/worker/gpu_input_batch.py b/vllm/v1/worker/gpu_input_batch.py
index 9707cb5774cd0..55d5429a8935d 100644
--- a/vllm/v1/worker/gpu_input_batch.py
+++ b/vllm/v1/worker/gpu_input_batch.py
@@ -324,8 +324,9 @@ class InputBatch:
self.allowed_token_ids_mask_cpu_tensor[req_index][
sampling_params.allowed_token_ids] = False
- self.bad_words_token_ids[
- req_index] = sampling_params.bad_words_token_ids
+ if sampling_params.bad_words_token_ids:
+ self.bad_words_token_ids[
+ req_index] = sampling_params.bad_words_token_ids
# Add request lora ID
if request.lora_request:
From 31060b2757fb19ec67894b7c441383ceec9f1272 Mon Sep 17 00:00:00 2001
From: Woosuk Kwon
Date: Sun, 16 Mar 2025 14:53:53 -0700
Subject: [PATCH 11/34] [V1][BugFix] Detect interleaved sliding window
attention (#14896)
Signed-off-by: Woosuk Kwon
---
vllm/v1/worker/gpu_model_runner.py | 11 +++++++++--
1 file changed, 9 insertions(+), 2 deletions(-)
diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py
index c2a976108e4d4..8dd7521ff49a2 100644
--- a/vllm/v1/worker/gpu_model_runner.py
+++ b/vllm/v1/worker/gpu_model_runner.py
@@ -82,8 +82,15 @@ class GPUModelRunner(LoRAModelRunnerMixin):
self.kv_cache_dtype = STR_DTYPE_TO_TORCH_DTYPE[
cache_config.cache_dtype]
- self.is_multimodal_model = model_config.is_multimodal_model
+ # NOTE(woosuk): sliding_window is None for models with interleaved
+ # attention. Use interleaved_sliding_window instead.
self.sliding_window = model_config.get_sliding_window()
+ self.interleaved_sliding_window = getattr(
+ model_config.hf_text_config, "interleaved_sliding_window", None)
+ self.window_size = (self.sliding_window
+ or self.interleaved_sliding_window)
+
+ self.is_multimodal_model = model_config.is_multimodal_model
self.block_size = cache_config.block_size
self.max_model_len = model_config.max_model_len
self.max_num_blocks_per_req = cdiv(self.max_model_len, self.block_size)
@@ -674,7 +681,7 @@ class GPUModelRunner(LoRAModelRunnerMixin):
num_query_heads=self.num_query_heads,
num_kv_heads=self.num_kv_heads,
use_alibi=False, # FIXME
- use_sliding_window=self.sliding_window is not None,
+ use_sliding_window=self.window_size is not None,
num_sms=self.num_sms,
)
return common_prefix_len if use_cascade else 0
From b9b5bdfc7d5cd0f8610a4de7a79327d10a09dfab Mon Sep 17 00:00:00 2001
From: Rui Qiao <161574667+ruisearch42@users.noreply.github.com>
Date: Sun, 16 Mar 2025 15:46:42 -0700
Subject: [PATCH 12/34] [Misc] Catching Ray Compiled Graph PP test failures for
V1 (#14847)
---
tests/distributed/test_pipeline_parallel.py | 20 +++++++++++++++-----
1 file changed, 15 insertions(+), 5 deletions(-)
diff --git a/tests/distributed/test_pipeline_parallel.py b/tests/distributed/test_pipeline_parallel.py
index 05b6ba40506a2..4d3306509c8f2 100644
--- a/tests/distributed/test_pipeline_parallel.py
+++ b/tests/distributed/test_pipeline_parallel.py
@@ -350,6 +350,10 @@ def _compare_tp(
else:
pp_env = None
+ tp_env = {
+ "VLLM_USE_V1": vllm_major_version,
+ }
+
pp_args = [
*common_args,
"--pipeline-parallel-size",
@@ -374,14 +378,20 @@ def _compare_tp(
]
try:
- compare_two_settings(model_id, pp_args, tp_args, pp_env, method=method)
+ compare_two_settings(model_id,
+ pp_args,
+ tp_args,
+ pp_env,
+ tp_env,
+ method=method)
except Exception:
- if pp_env is None:
- raise
- else:
- # Ray Compiled Graph tests are flaky,
+ testing_ray_compiled_graph = pp_env is not None
+ if testing_ray_compiled_graph and vllm_major_version == "0":
+ # Ray Compiled Graph tests are flaky for V0,
# so we don't want to fail the test
logger.exception("Ray Compiled Graph tests failed")
+ else:
+ raise
@pytest.mark.parametrize(
From 90df7f23aadad4aafc509fa950bd9b967a996e84 Mon Sep 17 00:00:00 2001
From: Vadim Gimpelson <156319763+vadiklyutiy@users.noreply.github.com>
Date: Mon, 17 Mar 2025 03:10:04 +0400
Subject: [PATCH 13/34] [Doc] Add guidance for using `ccache` with `pip install
-e .` in doc (#14901)
---
docs/source/getting_started/installation/gpu/cuda.inc.md | 2 ++
1 file changed, 2 insertions(+)
diff --git a/docs/source/getting_started/installation/gpu/cuda.inc.md b/docs/source/getting_started/installation/gpu/cuda.inc.md
index 7e3b884c2ab1e..d3e375aec10cb 100644
--- a/docs/source/getting_started/installation/gpu/cuda.inc.md
+++ b/docs/source/getting_started/installation/gpu/cuda.inc.md
@@ -131,6 +131,8 @@ Building from source requires a lot of compilation. If you are building from sou
For example, you can install [ccache](https://github.com/ccache/ccache) using `conda install ccache` or `apt install ccache` .
As long as `which ccache` command can find the `ccache` binary, it will be used automatically by the build system. After the first build, subsequent builds will be much faster.
+When using `ccache` with `pip install -e .`, you should run `CCACHE_NOHASHDIR="true" pip install --no-build-isolation -e .`. This is because `pip` creates a new folder with a random name for each build, preventing `ccache` from recognizing that the same files are being built.
+
[sccache](https://github.com/mozilla/sccache) works similarly to `ccache`, but has the capability to utilize caching in remote storage environments.
The following environment variables can be set to configure the vLLM `sccache` remote: `SCCACHE_BUCKET=vllm-build-sccache SCCACHE_REGION=us-west-2 SCCACHE_S3_NO_CREDENTIALS=1`. We also recommend setting `SCCACHE_IDLE_TIMEOUT=0`.
:::
From aecc780dba30db6b503754926564642374cb2c2e Mon Sep 17 00:00:00 2001
From: Robert Shaw <114415538+robertgshaw2-redhat@users.noreply.github.com>
Date: Sun, 16 Mar 2025 20:56:16 -0400
Subject: [PATCH 14/34] [V1] Enable Entrypoints Tests (#14903)
---
.buildkite/test-pipeline.yaml | 1 +
tests/v1/entrypoints/llm/test_struct_output_generate.py | 3 +++
2 files changed, 4 insertions(+)
diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml
index 93ac8a29c676c..a6616d7b41480 100644
--- a/.buildkite/test-pipeline.yaml
+++ b/.buildkite/test-pipeline.yaml
@@ -198,6 +198,7 @@ steps:
commands:
# split the test to avoid interference
- pytest -v -s v1/core
+ - pytest -v -s v1/entrypoints
- pytest -v -s v1/engine
- pytest -v -s v1/sample
- pytest -v -s v1/worker
diff --git a/tests/v1/entrypoints/llm/test_struct_output_generate.py b/tests/v1/entrypoints/llm/test_struct_output_generate.py
index b4eb475c23baa..98983fa05b83f 100644
--- a/tests/v1/entrypoints/llm/test_struct_output_generate.py
+++ b/tests/v1/entrypoints/llm/test_struct_output_generate.py
@@ -18,6 +18,9 @@ MODELS_TO_TEST = [
"Qwen/Qwen2.5-1.5B-Instruct", "mistralai/Ministral-8B-Instruct-2410"
]
+# Undo after https://github.com/vllm-project/vllm/pull/14868
+pytest.skip(allow_module_level=True)
+
@pytest.mark.skip_global_cleanup
@pytest.mark.parametrize("guided_decoding_backend",
From bb3aeddfaf338a9bbac10e3c75027b7f8c5c08e0 Mon Sep 17 00:00:00 2001
From: Robert Shaw <114415538+robertgshaw2-redhat@users.noreply.github.com>
Date: Sun, 16 Mar 2025 22:06:43 -0400
Subject: [PATCH 15/34] [CI] Nightly Tests (#14898)
Signed-off-by: rshaw@neuralmagic.com
Signed-off-by: rshaw@neuralmagic.com
Co-authored-by: rshaw@neuralmagic.com
---
.../models/decoder_only/language/test_mistral.py | 1 +
tests/tool_use/utils.py | 15 +++++++++++++--
2 files changed, 14 insertions(+), 2 deletions(-)
diff --git a/tests/models/decoder_only/language/test_mistral.py b/tests/models/decoder_only/language/test_mistral.py
index 7e1337b7d4876..4c2055361d445 100644
--- a/tests/models/decoder_only/language/test_mistral.py
+++ b/tests/models/decoder_only/language/test_mistral.py
@@ -201,6 +201,7 @@ def test_models(
)
+@pytest.mark.skip("RE-ENABLE: test is currently failing on main.")
@pytest.mark.parametrize("model", MISTRAL_FORMAT_MODELS)
@pytest.mark.parametrize("dtype", ["bfloat16"])
@pytest.mark.parametrize("max_tokens", [64])
diff --git a/tests/tool_use/utils.py b/tests/tool_use/utils.py
index aad37eb9b8f3a..df117b96cd07b 100644
--- a/tests/tool_use/utils.py
+++ b/tests/tool_use/utils.py
@@ -46,6 +46,7 @@ CONFIGS: dict[str, ServerConfig] = {
"model":
"NousResearch/Hermes-3-Llama-3.1-8B",
"arguments": [
+ "--enforce-eager", "--no-enable-prefix-caching",
"--tool-call-parser", "hermes", "--chat-template",
str(VLLM_PATH / "examples/tool_chat_template_hermes.jinja")
],
@@ -60,6 +61,7 @@ CONFIGS: dict[str, ServerConfig] = {
"model":
"meta-llama/Meta-Llama-3.1-8B-Instruct",
"arguments": [
+ "--enforce-eager", "--no-enable-prefix-caching",
"--tool-call-parser", "llama3_json", "--chat-template",
str(VLLM_PATH / "examples/tool_chat_template_llama3.1_json.jinja")
],
@@ -70,6 +72,7 @@ CONFIGS: dict[str, ServerConfig] = {
"model":
"meta-llama/Llama-3.2-3B-Instruct",
"arguments": [
+ "--enforce-eager", "--no-enable-prefix-caching",
"--tool-call-parser", "llama3_json", "--chat-template",
str(VLLM_PATH / "examples/tool_chat_template_llama3.2_json.jinja")
],
@@ -80,6 +83,7 @@ CONFIGS: dict[str, ServerConfig] = {
"model":
"mistralai/Mistral-7B-Instruct-v0.3",
"arguments": [
+ "--enforce-eager", "--no-enable-prefix-caching",
"--tool-call-parser", "mistral", "--chat-template",
str(VLLM_PATH / "examples/tool_chat_template_mistral.jinja"),
"--ignore-patterns=\"consolidated.safetensors\""
@@ -111,22 +115,28 @@ CONFIGS: dict[str, ServerConfig] = {
"model":
"ibm-granite/granite-3.0-8b-instruct",
"arguments": [
+ "--enforce-eager", "--no-enable-prefix-caching",
"--tool-call-parser", "granite", "--chat-template",
str(VLLM_PATH / "examples/tool_chat_template_granite.jinja")
],
},
"granite-3.1-8b": {
- "model": "ibm-granite/granite-3.1-8b-instruct",
+ "model":
+ "ibm-granite/granite-3.1-8b-instruct",
"arguments": [
+ "--enforce-eager",
+ "--no-enable-prefix-caching",
"--tool-call-parser",
"granite",
],
- "supports_parallel": True,
+ "supports_parallel":
+ True,
},
"internlm": {
"model":
"internlm/internlm2_5-7b-chat",
"arguments": [
+ "--enforce-eager", "--no-enable-prefix-caching",
"--tool-call-parser", "internlm", "--chat-template",
str(VLLM_PATH /
"examples/tool_chat_template_internlm2_tool.jinja"),
@@ -139,6 +149,7 @@ CONFIGS: dict[str, ServerConfig] = {
"model":
"Team-ACE/ToolACE-8B",
"arguments": [
+ "--enforce-eager", "--no-enable-prefix-caching",
"--tool-call-parser", "pythonic", "--chat-template",
str(VLLM_PATH / "examples/tool_chat_template_toolace.jinja")
],
From 8a5a9b70d702feb17e79691870c638b0f1e71192 Mon Sep 17 00:00:00 2001
From: Cyrus Leung
Date: Mon, 17 Mar 2025 10:38:15 +0800
Subject: [PATCH 16/34] [CI/Build] Update defaults for test reproducibility
(#14893)
Signed-off-by: DarkLight1337
---
tests/conftest.py | 18 ++++++++++++++++--
1 file changed, 16 insertions(+), 2 deletions(-)
diff --git a/tests/conftest.py b/tests/conftest.py
index 4716ca2e315b7..41c0e62ce14f3 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -681,6 +681,17 @@ def hf_runner():
class VllmRunner:
+ """
+ The default value of some arguments have been modified from
+ :class:`~vllm.LLM` as follows:
+ - `trust_remote_code`: Set to `True` instead of `False` for convenience.
+ - `seed`: Set to `0` instead of `None` for test reproducibility.
+ - `max_model_len`: Set to `1024` instead of `None` to reduce memory usage.
+ - `block_size`: Set to `16` instead of `None` to reduce memory usage.
+ - `enable_chunked_prefill`: Set to `False` instead of `None` for
+ test reproducibility.
+ - `enforce_eager`: Set to `False` instead of `None` to test CUDA graph.
+ """
def __init__(
self,
@@ -688,6 +699,8 @@ class VllmRunner:
task: TaskOption = "auto",
tokenizer_name: Optional[str] = None,
tokenizer_mode: str = "auto",
+ trust_remote_code: bool = True,
+ seed: Optional[int] = 0,
# Use smaller max model length, otherwise bigger model cannot run due
# to kv cache size limit.
max_model_len: int = 1024,
@@ -695,7 +708,7 @@ class VllmRunner:
disable_log_stats: bool = True,
tensor_parallel_size: int = 1,
block_size: int = 16,
- enable_chunked_prefill: bool = False,
+ enable_chunked_prefill: Optional[bool] = False,
swap_space: int = 4,
enforce_eager: Optional[bool] = False,
**kwargs,
@@ -705,8 +718,9 @@ class VllmRunner:
task=task,
tokenizer=tokenizer_name,
tokenizer_mode=tokenizer_mode,
- trust_remote_code=True,
+ trust_remote_code=trust_remote_code,
dtype=dtype,
+ seed=seed,
swap_space=swap_space,
enforce_eager=enforce_eager,
disable_log_stats=disable_log_stats,
From faa02757307583f2c5557ff23cb41f1db4f1f29c Mon Sep 17 00:00:00 2001
From: Woosuk Kwon
Date: Sun, 16 Mar 2025 20:19:30 -0700
Subject: [PATCH 17/34] [V1] Optimize the overhead of rewinding (#14905)
Signed-off-by: Woosuk Kwon
---
vllm/v1/worker/gpu_model_runner.py | 11 +++++------
1 file changed, 5 insertions(+), 6 deletions(-)
diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py
index 8dd7521ff49a2..4059d5b17b71b 100644
--- a/vllm/v1/worker/gpu_model_runner.py
+++ b/vllm/v1/worker/gpu_model_runner.py
@@ -1032,17 +1032,16 @@ class GPUModelRunner(LoRAModelRunnerMixin):
# TODO(woosuk): The following loop can be slow since it iterates over
# the requests one by one. Optimize.
- for i, req_id in enumerate(self.input_batch.req_ids):
+ for i, generator in self.input_batch.generators.items():
+ req_id = self.input_batch.req_ids[i]
req_state = self.requests[req_id]
seq_len = (req_state.num_computed_tokens +
scheduler_output.num_scheduled_tokens[req_id])
if seq_len < req_state.num_tokens:
- # Ignore the sampled token.
+ # Ignore the sampled token for partial prefills.
# Rewind the generator state as if the token was not sampled.
- generator = self.input_batch.generators.get(i)
- if generator is not None:
- # This relies on cuda-specific torch-internal impl details
- generator.set_offset(generator.get_offset() - 4)
+ # This relies on cuda-specific torch-internal impl details
+ generator.set_offset(generator.get_offset() - 4)
# NOTE: GPU -> CPU Sync happens here.
# Move as many CPU operations as possible before this sync point.
From 7f6c5ee06c4861ae1310f4ea5caaa2104efb4d22 Mon Sep 17 00:00:00 2001
From: Woosuk Kwon
Date: Sun, 16 Mar 2025 20:20:15 -0700
Subject: [PATCH 18/34] [V1][Minor] Add __repr__ to ConstantList (#14907)
Signed-off-by: Woosuk Kwon
---
vllm/v1/utils.py | 3 +++
1 file changed, 3 insertions(+)
diff --git a/vllm/v1/utils.py b/vllm/v1/utils.py
index 8e1fb18cca05b..6c01ed3de52d7 100644
--- a/vllm/v1/utils.py
+++ b/vllm/v1/utils.py
@@ -86,6 +86,9 @@ class ConstantList(Generic[T], Sequence):
def __len__(self):
return len(self._x)
+ def __repr__(self):
+ return f"ConstantList({self._x})"
+
class BackgroundProcHandle:
"""
From 1e799b7ec1b1c61952d2ae24c85ecf3fcb0f6de3 Mon Sep 17 00:00:00 2001
From: Lucas Wilkinson
Date: Sun, 16 Mar 2025 23:35:37 -0400
Subject: [PATCH 19/34] [BugFix] Fix MLA + V1 + TP==1 causing reinitialization
of cuda context (#14910)
---
vllm/platforms/cuda.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/vllm/platforms/cuda.py b/vllm/platforms/cuda.py
index 3897584307e91..8a53337ebc087 100644
--- a/vllm/platforms/cuda.py
+++ b/vllm/platforms/cuda.py
@@ -152,7 +152,7 @@ class CudaPlatformBase(Platform):
# here
use_flashmla = (envs.VLLM_ATTENTION_BACKEND is None \
or envs.VLLM_ATTENTION_BACKEND == "FLASHMLA")
- from vllm.attention.backends.flashmla import is_flashmla_supported
+ from vllm.attention.ops.flashmla import is_flashmla_supported
if use_flashmla and is_flashmla_supported()[0] \
and cache_config.block_size != 64:
cache_config.block_size = 64
From a73e183e36a818ea95f442ae1751bc66cf4f135d Mon Sep 17 00:00:00 2001
From: Sibi <85477603+t-sibiraj@users.noreply.github.com>
Date: Mon, 17 Mar 2025 11:35:57 +0800
Subject: [PATCH 20/34] [Misc] Replace os environ to monkeypatch in test suite
(#14516)
Signed-off-by: sibi <85477603+t-sibiraj@users.noreply.github.com>
Signed-off-by: Aaron Pham
Co-authored-by: Cyrus Leung
Co-authored-by: Aaron Pham
---
.buildkite/test-pipeline.yaml | 2 +-
.../test_basic_correctness.py | 105 +++---
.../basic_correctness/test_chunked_prefill.py | 168 +++++-----
tests/basic_correctness/test_cumem.py | 62 ++--
tests/compile/test_basic_correctness.py | 207 ++++++------
tests/compile/test_full_graph.py | 115 ++++++-
tests/compile/utils.py | 93 ------
tests/conftest.py | 2 +-
tests/distributed/test_comm_ops.py | 85 +++--
tests/distributed/test_custom_all_reduce.py | 173 +++++-----
tests/distributed/test_pipeline_partition.py | 60 ++--
tests/distributed/test_pp_cudagraph.py | 38 ++-
tests/entrypoints/llm/test_accuracy.py | 4 +-
.../offline_mode/test_offline_mode.py | 49 +--
.../openai/correctness/test_lmeval.py | 5 +-
tests/kernels/test_attention_selector.py | 129 +++++---
tests/kernels/test_awq.py | 60 ++--
tests/kernels/test_rocm_attention_selector.py | 18 +-
tests/kernels/utils.py | 64 ++--
.../{disagg_test.py => test_disagg.py} | 0
.../{module_test.py => test_module.py} | 0
.../models/decoder_only/language/test_fp8.py | 120 +++----
.../models/embedding/language/test_gritlm.py | 96 +++---
tests/models/test_oot_registration.py | 130 ++++----
tests/mq_llm_engine/test_error_handling.py | 31 +-
.../multi_step/test_correctness_async_llm.py | 202 ++++++------
tests/multi_step/test_correctness_llm.py | 299 ++++++++---------
tests/neuron/1_core/test_block_table.py | 80 ++---
tests/neuron/1_core/test_prefix_prefill.py | 306 +++++++++---------
tests/plugins_tests/test_platform_plugins.py | 13 +-
tests/plugins_tests/test_scheduler_plugins.py | 62 ++--
tests/prefix_caching/test_prefix_caching.py | 111 ++++---
tests/test_regression.py | 16 +-
tests/test_utils.py | 63 ++--
tests/tpu/test_custom_dispatcher.py | 25 +-
tests/tracing/test_tracing.py | 277 ++++++++--------
tests/utils.py | 11 +-
tests/v1/e2e/test_ngram_spec_decode.py | 11 +-
tests/v1/engine/test_async_llm.py | 11 +-
tests/v1/engine/test_engine_core.py | 10 +-
tests/v1/engine/test_engine_core_client.py | 5 +-
tests/v1/sample/test_logprobs.py | 224 +++++++------
tests/v1/tpu/test_basic.py | 16 +-
43 files changed, 1900 insertions(+), 1658 deletions(-)
delete mode 100644 tests/compile/utils.py
rename tests/kv_transfer/{disagg_test.py => test_disagg.py} (100%)
rename tests/kv_transfer/{module_test.py => test_module.py} (100%)
diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml
index a6616d7b41480..f85572e7c234c 100644
--- a/.buildkite/test-pipeline.yaml
+++ b/.buildkite/test-pipeline.yaml
@@ -522,7 +522,7 @@ steps:
# TODO: investigate and fix
# - pytest -v -s spec_decode/e2e/test_integration_dist_tp2.py
- VLLM_USE_V1=0 CUDA_VISIBLE_DEVICES=0,1 pytest -v -s test_sharded_state_loader.py
- - VLLM_USE_V1=0 CUDA_VISIBLE_DEVICES=0,1 pytest -v -s kv_transfer/disagg_test.py
+ - VLLM_USE_V1=0 CUDA_VISIBLE_DEVICES=0,1 pytest -v -s kv_transfer/test_disagg.py
- label: Plugin Tests (2 GPUs) # 40min
working_dir: "/vllm-workspace/tests"
diff --git a/tests/basic_correctness/test_basic_correctness.py b/tests/basic_correctness/test_basic_correctness.py
index 0cb3b739b7245..1458f0893a93c 100644
--- a/tests/basic_correctness/test_basic_correctness.py
+++ b/tests/basic_correctness/test_basic_correctness.py
@@ -47,6 +47,7 @@ def test_vllm_gc_ed():
@pytest.mark.parametrize("max_tokens", [5])
@pytest.mark.parametrize("enforce_eager", [False])
def test_models(
+ monkeypatch: pytest.MonkeyPatch,
hf_runner,
model: str,
backend: str,
@@ -63,31 +64,33 @@ def test_models(
pytest.skip(
f"{backend} does not support gemma2 with full context length.")
- os.environ["VLLM_ATTENTION_BACKEND"] = backend
+ with monkeypatch.context() as m:
+ m.setenv("VLLM_ATTENTION_BACKEND", backend)
- # 5042 tokens for gemma2
- # gemma2 has alternating sliding window size of 4096
- # we need a prompt with more than 4096 tokens to test the sliding window
- prompt = "The following numbers of the sequence " + ", ".join(
- str(i) for i in range(1024)) + " are:"
- example_prompts = [prompt]
+ # 5042 tokens for gemma2
+ # gemma2 has alternating sliding window size of 4096
+ # we need a prompt with more than 4096 tokens to test the sliding window
+ prompt = "The following numbers of the sequence " + ", ".join(
+ str(i) for i in range(1024)) + " are:"
+ example_prompts = [prompt]
- with hf_runner(model, dtype=dtype) as hf_model:
- hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
+ with hf_runner(model, dtype=dtype) as hf_model:
+ hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
- with VllmRunner(model,
- max_model_len=8192,
- dtype=dtype,
- enforce_eager=enforce_eager,
- gpu_memory_utilization=0.7) as vllm_model:
- vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
+ with VllmRunner(model,
+ max_model_len=8192,
+ dtype=dtype,
+ enforce_eager=enforce_eager,
+ gpu_memory_utilization=0.7) as vllm_model:
+ vllm_outputs = vllm_model.generate_greedy(example_prompts,
+ max_tokens)
- check_outputs_equal(
- outputs_0_lst=hf_outputs,
- outputs_1_lst=vllm_outputs,
- name_0="hf",
- name_1="vllm",
- )
+ check_outputs_equal(
+ outputs_0_lst=hf_outputs,
+ outputs_1_lst=vllm_outputs,
+ name_0="hf",
+ name_1="vllm",
+ )
@multi_gpu_test(num_gpus=2)
@@ -104,6 +107,7 @@ def test_models(
("meta-llama/Meta-Llama-3-8B", "ray", "FLASHINFER", "A100"),
])
def test_models_distributed(
+ monkeypatch: pytest.MonkeyPatch,
hf_runner,
vllm_runner,
example_prompts,
@@ -116,34 +120,41 @@ def test_models_distributed(
if test_suite != TARGET_TEST_SUITE:
pytest.skip(f"Skip test for {test_suite}")
- if model == "meta-llama/Llama-3.2-1B-Instruct" and distributed_executor_backend == "ray" and attention_backend == "" and test_suite == "L4": # noqa
- # test Ray Compiled Graph
- os.environ['VLLM_USE_RAY_SPMD_WORKER'] = "1"
- os.environ['VLLM_USE_RAY_COMPILED_DAG'] = "1"
+ with monkeypatch.context() as monkeypatch_context:
+ if model == "meta-llama/Llama-3.2-1B-Instruct" and distributed_executor_backend == "ray" and attention_backend == "" and test_suite == "L4": # noqa
+ # test Ray Compiled Graph
+ monkeypatch_context.setenv("VLLM_USE_RAY_SPMD_WORKER", "1")
+ monkeypatch_context.setenv("VLLM_USE_RAY_COMPILED_DAG", "1")
- if attention_backend:
- os.environ["VLLM_ATTENTION_BACKEND"] = attention_backend
+ if attention_backend:
+ monkeypatch_context.setenv(
+ "VLLM_ATTENTION_BACKEND",
+ attention_backend,
+ )
- dtype = "half"
- max_tokens = 5
+ dtype = "half"
+ max_tokens = 5
- # NOTE: take care of the order. run vLLM first, and then run HF.
- # vLLM needs a fresh new process without cuda initialization.
- # if we run HF first, the cuda initialization will be done and it
- # will hurt multiprocessing backend with fork method (the default method).
- with vllm_runner(model,
- dtype=dtype,
- tensor_parallel_size=2,
- distributed_executor_backend=distributed_executor_backend
- ) as vllm_model:
- vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
+ # NOTE: take care of the order. run vLLM first, and then run HF.
+ # vLLM needs a fresh new process without cuda initialization.
+ # if we run HF first, the cuda initialization will be done and it
+ # will hurt multiprocessing backend with fork method
+ # (the default method).
+ with vllm_runner(
+ model,
+ dtype=dtype,
+ tensor_parallel_size=2,
+ distributed_executor_backend=distributed_executor_backend,
+ ) as vllm_model:
+ vllm_outputs = vllm_model.generate_greedy(example_prompts,
+ max_tokens)
- with hf_runner(model, dtype=dtype) as hf_model:
- hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
+ with hf_runner(model, dtype=dtype) as hf_model:
+ hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
- check_outputs_equal(
- outputs_0_lst=hf_outputs,
- outputs_1_lst=vllm_outputs,
- name_0="hf",
- name_1="vllm",
- )
+ check_outputs_equal(
+ outputs_0_lst=hf_outputs,
+ outputs_1_lst=vllm_outputs,
+ name_0="hf",
+ name_1="vllm",
+ )
diff --git a/tests/basic_correctness/test_chunked_prefill.py b/tests/basic_correctness/test_chunked_prefill.py
index be007de321c8a..06c9e25ed8dd8 100644
--- a/tests/basic_correctness/test_chunked_prefill.py
+++ b/tests/basic_correctness/test_chunked_prefill.py
@@ -7,16 +7,22 @@ prefill requests are chunked.
Run `pytest tests/models/test_chunked_prefill.py`.
"""
-import os
+
+from __future__ import annotations
+
+from typing import TYPE_CHECKING
import pytest
-from tests.kernels.utils import override_backend_env_variable
from vllm.platforms import current_platform
+from vllm.utils import STR_BACKEND_ENV_VAR
from ..models.utils import check_logprobs_close, check_outputs_equal
from ..utils import multi_gpu_test
+if TYPE_CHECKING:
+ from .conftest import HfRunner, VllmRunner
+
MODELS = [
"facebook/opt-125m",
"meta-llama/Llama-3.2-1B-Instruct",
@@ -24,12 +30,14 @@ MODELS = [
@pytest.fixture(scope="function", autouse=True)
-def use_v0_only(monkeypatch):
+def use_v0_only(monkeypatch: pytest.MonkeyPatch):
"""
Since this module is V0 only, set VLLM_USE_V1=0 for
all tests in the file.
"""
- monkeypatch.setenv('VLLM_USE_V1', '0')
+ with monkeypatch.context() as m:
+ m.setenv('VLLM_USE_V1', '0')
+ yield
@pytest.mark.parametrize("model", MODELS)
@@ -42,8 +50,8 @@ def use_v0_only(monkeypatch):
@pytest.mark.parametrize("tensor_parallel_size", [1])
@pytest.mark.parametrize("attention_backend", ["FLASHINFER", "FLASH_ATTN"])
def test_models(
- hf_runner,
- vllm_runner,
+ hf_runner: HfRunner,
+ vllm_runner: VllmRunner,
example_prompts,
model: str,
dtype: str,
@@ -52,37 +60,39 @@ def test_models(
enforce_eager: bool,
tensor_parallel_size: int,
attention_backend: str,
- monkeypatch,
+ monkeypatch: pytest.MonkeyPatch,
) -> None:
"""
Checks exact match decode between huggingface model and vllm runner with
chunked prefill.
"""
- override_backend_env_variable(monkeypatch, attention_backend)
+ with monkeypatch.context() as m:
+ m.setenv(STR_BACKEND_ENV_VAR, attention_backend)
- max_num_seqs = chunked_prefill_token_size
- max_num_batched_tokens = chunked_prefill_token_size
+ max_num_seqs = chunked_prefill_token_size
+ max_num_batched_tokens = chunked_prefill_token_size
- with hf_runner(model, dtype=dtype) as hf_model:
- hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
+ with hf_runner(model, dtype=dtype) as hf_model:
+ hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
- with vllm_runner(
- model,
- dtype=dtype,
- max_num_batched_tokens=max_num_batched_tokens,
- enable_chunked_prefill=True,
- tensor_parallel_size=tensor_parallel_size,
- enforce_eager=enforce_eager,
- max_num_seqs=max_num_seqs,
- ) as vllm_model:
- vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
+ with vllm_runner(
+ model,
+ dtype=dtype,
+ max_num_batched_tokens=max_num_batched_tokens,
+ enable_chunked_prefill=True,
+ tensor_parallel_size=tensor_parallel_size,
+ enforce_eager=enforce_eager,
+ max_num_seqs=max_num_seqs,
+ ) as vllm_model:
+ vllm_outputs = vllm_model.generate_greedy(example_prompts,
+ max_tokens)
- check_outputs_equal(
- outputs_0_lst=hf_outputs,
- outputs_1_lst=vllm_outputs,
- name_0="hf",
- name_1="vllm",
- )
+ check_outputs_equal(
+ outputs_0_lst=hf_outputs,
+ outputs_1_lst=vllm_outputs,
+ name_0="hf",
+ name_1="vllm",
+ )
@multi_gpu_test(num_gpus=2)
@@ -90,57 +100,61 @@ def test_models(
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("attention_backend", ["FLASHINFER", "FLASH_ATTN"])
def test_models_distributed(
- hf_runner,
- vllm_runner,
+ hf_runner: HfRunner,
+ vllm_runner: VllmRunner,
example_prompts,
model: str,
distributed_executor_backend: str,
attention_backend: str,
- monkeypatch,
+ monkeypatch: pytest.MonkeyPatch,
) -> None:
- override_backend_env_variable(monkeypatch, attention_backend)
+ with monkeypatch.context() as m:
+ m.setenv(STR_BACKEND_ENV_VAR, attention_backend)
+ if (model == "meta-llama/Llama-3.2-1B-Instruct"
+ and distributed_executor_backend == "ray"):
+ # test Ray Compiled Graph
+ m.setenv("VLLM_USE_RAY_SPMD_WORKER", "1")
+ m.setenv("VLLM_USE_RAY_COMPILED_DAG", "1")
- if (model == "meta-llama/Llama-3.2-1B-Instruct"
- and distributed_executor_backend == "ray"):
- # test Ray Compiled Graph
- os.environ['VLLM_USE_RAY_SPMD_WORKER'] = "1"
- os.environ['VLLM_USE_RAY_COMPILED_DAG'] = "1"
+ dtype = "half"
+ max_tokens = 5
+ chunked_prefill_token_size = 16
- dtype = "half"
- max_tokens = 5
- chunked_prefill_token_size = 16
+ # Add a chunked prefill config.
+ max_num_seqs = min(chunked_prefill_token_size, 256)
+ assert chunked_prefill_token_size != -1
+ enable_chunked_prefill = True
+ max_num_batched_tokens = chunked_prefill_token_size
- # Add a chunked prefill config.
- max_num_seqs = min(chunked_prefill_token_size, 256)
- assert chunked_prefill_token_size != -1
- enable_chunked_prefill = True
- max_num_batched_tokens = chunked_prefill_token_size
+ # NOTE: take care of the order. run vLLM first, and then run HF.
+ # vLLM needs a fresh new process without cuda initialization.
+ # if we run HF first, the cuda initialization will be done and it
+ # will hurt multiprocessing backend with
+ # fork method (the default method).
- # NOTE: take care of the order. run vLLM first, and then run HF.
- # vLLM needs a fresh new process without cuda initialization.
- # if we run HF first, the cuda initialization will be done and it
- # will hurt multiprocessing backend with fork method (the default method).
+ with vllm_runner(
+ model,
+ dtype=dtype,
+ tensor_parallel_size=2,
+ max_num_seqs=max_num_seqs,
+ enable_chunked_prefill=enable_chunked_prefill,
+ max_num_batched_tokens=max_num_batched_tokens,
+ distributed_executor_backend=distributed_executor_backend,
+ ) as vllm_model:
+ vllm_outputs = vllm_model.generate_greedy(
+ example_prompts,
+ max_tokens,
+ )
- with vllm_runner(
- model,
- dtype=dtype,
- tensor_parallel_size=2,
- max_num_seqs=max_num_seqs,
- enable_chunked_prefill=enable_chunked_prefill,
- max_num_batched_tokens=max_num_batched_tokens,
- distributed_executor_backend=distributed_executor_backend,
- ) as vllm_model:
- vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
+ with hf_runner(model, dtype=dtype) as hf_model:
+ hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
- with hf_runner(model, dtype=dtype) as hf_model:
- hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
-
- check_outputs_equal(
- outputs_0_lst=hf_outputs,
- outputs_1_lst=vllm_outputs,
- name_0="hf",
- name_1="vllm",
- )
+ check_outputs_equal(
+ outputs_0_lst=hf_outputs,
+ outputs_1_lst=vllm_outputs,
+ name_0="hf",
+ name_1="vllm",
+ )
@pytest.mark.parametrize(
@@ -158,7 +172,7 @@ def test_models_distributed(
# the async postprocessor
@pytest.mark.parametrize("disable_async_output_proc", [True])
def test_models_with_fp8_kv_cache(
- vllm_runner,
+ vllm_runner: VllmRunner,
example_prompts,
kv_cache_dtype: str,
model: str,
@@ -218,7 +232,7 @@ def test_models_with_fp8_kv_cache(
@pytest.mark.parametrize("tensor_parallel_size", [1])
@pytest.mark.parametrize("dtype", ["half"])
def test_with_prefix_caching(
- vllm_runner,
+ vllm_runner: VllmRunner,
max_tokens: int,
enforce_eager: bool,
chunk_size: int,
@@ -254,8 +268,10 @@ def test_with_prefix_caching(
) as vllm_model:
outputs[enable] = []
for prompt in full_prompts:
- outputs[enable] += vllm_model.generate_greedy([prompt],
- max_tokens)
+ outputs[enable] += vllm_model.generate_greedy(
+ [prompt],
+ max_tokens,
+ )
check_outputs_equal(
outputs_0_lst=outputs[False],
@@ -274,8 +290,8 @@ def test_with_prefix_caching(
@pytest.mark.cpu_model
@pytest.mark.skipif(not current_platform.is_cpu(), reason="CPU only")
def test_models_cpu(
- hf_runner,
- vllm_runner,
+ hf_runner: HfRunner,
+ vllm_runner: VllmRunner,
example_prompts,
model: str,
dtype: str,
@@ -283,7 +299,7 @@ def test_models_cpu(
chunked_prefill_token_size: int,
enforce_eager: bool,
attention_backend: str,
- monkeypatch,
+ monkeypatch: pytest.MonkeyPatch,
) -> None:
test_models(
hf_runner,
@@ -307,7 +323,7 @@ def test_models_cpu(
@pytest.mark.cpu_model
@pytest.mark.skipif(not current_platform.is_cpu(), reason="CPU only")
def test_with_prefix_caching_cpu(
- vllm_runner,
+ vllm_runner: VllmRunner,
max_tokens: int,
enforce_eager: bool,
chunk_size: int,
diff --git a/tests/basic_correctness/test_cumem.py b/tests/basic_correctness/test_cumem.py
index ba81f2bb79d11..f5ee469fb00a9 100644
--- a/tests/basic_correctness/test_cumem.py
+++ b/tests/basic_correctness/test_cumem.py
@@ -123,40 +123,38 @@ def test_cumem_with_cudagraph():
# sleep mode with pytorch checkpoint
("facebook/opt-125m", False),
])
-def test_end_to_end(model: str, use_v1: bool):
- import os
- os.environ["VLLM_USE_V1"] = "1" if use_v1 else "0"
- free, total = torch.cuda.mem_get_info()
- used_bytes_baseline = total - free # in case other process is running
- llm = LLM(model, enable_sleep_mode=True)
- prompt = "How are you?"
- sampling_params = SamplingParams(temperature=0, max_tokens=10)
- output = llm.generate(prompt, sampling_params)
+def test_end_to_end(monkeypatch: pytest.MonkeyPatch, model: str, use_v1: bool):
+ with monkeypatch.context() as m:
+ m.setenv("VLLM_USE_V1", "1" if use_v1 else "0")
+ free, total = torch.cuda.mem_get_info()
+ used_bytes_baseline = total - free # in case other process is running
+ llm = LLM(model, enable_sleep_mode=True)
+ prompt = "How are you?"
+ sampling_params = SamplingParams(temperature=0, max_tokens=10)
+ output = llm.generate(prompt, sampling_params)
- # the benefit of `llm.sleep(level=2)` is mainly CPU memory usage,
- # which is difficult to measure in the test. therefore, we only
- # test sleep level 1 here.
- llm.sleep(level=1)
+ # the benefit of `llm.sleep(level=2)` is mainly CPU memory usage,
+ # which is difficult to measure in the test. therefore, we only
+ # test sleep level 1 here.
+ llm.sleep(level=1)
- free_gpu_bytes_after_sleep, total = torch.cuda.mem_get_info()
- used_bytes = total - free_gpu_bytes_after_sleep - used_bytes_baseline
- # now the memory usage is mostly cudagraph memory pool,
- # and it should be less than the model weights (1B model, 2GiB weights)
+ free_gpu_bytes_after_sleep, total = torch.cuda.mem_get_info()
+ used_bytes = total - free_gpu_bytes_after_sleep - used_bytes_baseline
+ # now the memory usage is mostly cudagraph memory pool,
+ # and it should be less than the model weights (1B model, 2GiB weights)
- # NOTE: In V1, the memory buffer for logits (max_num_reqs x vocab_size)
- # is captured but cannot be releasesd from PyTorch due to a known bug,
- # therefore high memory usage after `llm.sleep` is called is expected.
- # FIXME(youkaichao & ywang96): Fix memory buffer issue with sleep mode
- # in V1.
- if use_v1:
- assert used_bytes < 7 * GiB_bytes
- else:
- assert used_bytes < 2 * GiB_bytes
+ # NOTE: In V1, the memory buffer for logits (max_num_reqs x vocab_size)
+ # is captured but cannot be releasesd from PyTorch due to a known bug,
+ # therefore high memory usage after `llm.sleep` is called is expected.
+ # FIXME(youkaichao & ywang96): Fix memory buffer issue with sleep mode
+ # in V1.
+ if use_v1:
+ assert used_bytes < 7 * GiB_bytes
+ else:
+ assert used_bytes < 2 * GiB_bytes
- llm.wake_up()
- output2 = llm.generate(prompt, sampling_params)
+ llm.wake_up()
+ output2 = llm.generate(prompt, sampling_params)
- # cmp output
- assert output[0].outputs[0].text == output2[0].outputs[0].text
-
- del os.environ["VLLM_USE_V1"]
+ # cmp output
+ assert output[0].outputs[0].text == output2[0].outputs[0].text
diff --git a/tests/compile/test_basic_correctness.py b/tests/compile/test_basic_correctness.py
index 48323b21a8c42..b639fd719ca0a 100644
--- a/tests/compile/test_basic_correctness.py
+++ b/tests/compile/test_basic_correctness.py
@@ -1,7 +1,7 @@
# SPDX-License-Identifier: Apache-2.0
+from __future__ import annotations
import dataclasses
-from typing import Optional
import pytest
@@ -22,75 +22,76 @@ class TestSetting:
fullgraph: bool
-# representative settings for testing
-test_settings = [
- # basic llama model
- TestSetting(
- model="meta-llama/Llama-3.2-1B-Instruct",
- model_args=[],
- pp_size=2,
- tp_size=2,
- attn_backend="FLASHINFER",
- method="generate",
- fullgraph=True,
- ),
- # llama model with quantization
- TestSetting(
- model="TheBloke/TinyLlama-1.1B-Chat-v0.3-GPTQ",
- model_args=["--quantization", "gptq"],
- pp_size=1,
- tp_size=1,
- attn_backend="FLASH_ATTN",
- method="generate",
- fullgraph=True,
- ),
- # MoE model
- TestSetting(
- model="ibm/PowerMoE-3b",
- model_args=[],
- pp_size=1,
- tp_size=2,
- attn_backend="FLASH_ATTN",
- method="generate",
- fullgraph=True,
- ),
- # embedding model
- TestSetting(
- model="BAAI/bge-multilingual-gemma2",
- model_args=["--task", "embed"],
- pp_size=1,
- tp_size=1,
- attn_backend="FLASH_ATTN",
- method="encode",
- fullgraph=True,
- ),
- # encoder-based embedding model (BERT)
- TestSetting(
- model="BAAI/bge-base-en-v1.5",
- model_args=["--task", "embed"],
- pp_size=1,
- tp_size=1,
- attn_backend="XFORMERS",
- method="encode",
- fullgraph=True,
- ),
- # vision language model
- TestSetting(
- model="microsoft/Phi-3.5-vision-instruct",
- model_args=["--trust-remote-code", "--max-model-len", "2048"],
- pp_size=2,
- tp_size=1,
- attn_backend="FLASH_ATTN",
- method="generate_with_image",
- fullgraph=False,
- ),
-]
-
-
# we cannot afford testing the full Catesian product
# of all models and all levels
-@pytest.mark.parametrize("test_setting", test_settings)
-def test_compile_correctness(test_setting: TestSetting):
+@pytest.mark.parametrize(
+ "test_setting",
+ [
+ # basic llama model
+ TestSetting(
+ model="meta-llama/Llama-3.2-1B-Instruct",
+ model_args=[],
+ pp_size=2,
+ tp_size=2,
+ attn_backend="FLASHINFER",
+ method="generate",
+ fullgraph=True,
+ ),
+ # llama model with quantization
+ TestSetting(
+ model="TheBloke/TinyLlama-1.1B-Chat-v0.3-GPTQ",
+ model_args=["--quantization", "gptq"],
+ pp_size=1,
+ tp_size=1,
+ attn_backend="FLASH_ATTN",
+ method="generate",
+ fullgraph=True,
+ ),
+ # MoE model
+ TestSetting(
+ model="ibm/PowerMoE-3b",
+ model_args=[],
+ pp_size=1,
+ tp_size=2,
+ attn_backend="FLASH_ATTN",
+ method="generate",
+ fullgraph=True,
+ ),
+ # embedding model
+ TestSetting(
+ model="BAAI/bge-multilingual-gemma2",
+ model_args=["--task", "embed"],
+ pp_size=1,
+ tp_size=1,
+ attn_backend="FLASH_ATTN",
+ method="encode",
+ fullgraph=True,
+ ),
+ # encoder-based embedding model (BERT)
+ TestSetting(
+ model="BAAI/bge-base-en-v1.5",
+ model_args=["--task", "embed"],
+ pp_size=1,
+ tp_size=1,
+ attn_backend="XFORMERS",
+ method="encode",
+ fullgraph=True,
+ ),
+ # vision language model
+ TestSetting(
+ model="microsoft/Phi-3.5-vision-instruct",
+ model_args=["--trust-remote-code", "--max-model-len", "2048"],
+ pp_size=2,
+ tp_size=1,
+ attn_backend="FLASH_ATTN",
+ method="generate_with_image",
+ fullgraph=False,
+ ),
+ ])
+def test_compile_correctness(
+ monkeypatch: pytest.MonkeyPatch,
+ test_setting: TestSetting,
+):
# this test is run under multiple suits, with different GPUs.
# make sure we only run the test with correct CUDA devices.
# don't use "<", as it will duplicate the tests.
@@ -103,41 +104,45 @@ def test_compile_correctness(test_setting: TestSetting):
fullgraph = test_setting.fullgraph
if cuda_device_count_stateless() != pp_size * tp_size:
pytest.skip("Not correct CUDA devices for the test.")
- import os
- os.environ["VLLM_ATTENTION_BACKEND"] = attn_backend
- final_args = ["--enforce-eager"] + model_args + ["-pp", str(pp_size)] + \
- ["-tp", str(tp_size)]
- all_args: list[list[str]] = []
- all_envs: list[Optional[dict[str, str]]] = []
+ with monkeypatch.context() as m:
+ m.setenv("VLLM_ATTENTION_BACKEND", attn_backend)
+ final_args = [
+ "--enforce-eager", *model_args, "-pp",
+ str(pp_size), "-tp",
+ str(tp_size)
+ ]
- for level in [
- CompilationLevel.NO_COMPILATION,
- CompilationLevel.PIECEWISE,
- ]:
- all_args.append(final_args + [f"-O{level}"])
- all_envs.append({})
+ all_args: list[list[str]] = []
+ all_envs: list[dict[str, str] | None] = []
- # inductor will change the output, so we only compare if the output
- # is close, not exactly the same.
- compare_all_settings(
- model,
- all_args,
- all_envs,
- method=method if method != "generate" else "generate_close")
- all_envs.clear()
- all_args.clear()
+ for level in [
+ CompilationLevel.NO_COMPILATION,
+ CompilationLevel.PIECEWISE,
+ ]:
+ all_args.append(final_args + [f"-O{level}"])
+ all_envs.append({})
- for level in [
- CompilationLevel.NO_COMPILATION,
- CompilationLevel.DYNAMO_AS_IS,
- CompilationLevel.DYNAMO_ONCE,
- ]:
- all_args.append(final_args + [f"-O{level}"])
- all_envs.append({})
- if level != CompilationLevel.DYNAMO_ONCE and not fullgraph:
- # "DYNAMO_ONCE" will always use fullgraph
- all_envs[-1][
- "VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE"] = "0" # type: ignore
+ # inductor will change the output, so we only compare if the output
+ # is close, not exactly the same.
+ compare_all_settings(
+ model,
+ all_args,
+ all_envs,
+ method=method if method != "generate" else "generate_close")
+ all_envs.clear()
+ all_args.clear()
- compare_all_settings(model, all_args * 3, all_envs, method=method)
+ for level in [
+ CompilationLevel.NO_COMPILATION,
+ CompilationLevel.DYNAMO_AS_IS,
+ CompilationLevel.DYNAMO_ONCE,
+ ]:
+ all_args.append(final_args + [f"-O{level}"])
+ all_envs.append({})
+ if level != CompilationLevel.DYNAMO_ONCE and not fullgraph:
+ # "DYNAMO_ONCE" will always use fullgraph
+ all_envs[-1][
+ "VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE"] = "0" # type: ignore
+
+ compare_all_settings(model, all_args * 3, all_envs, method=method)
diff --git a/tests/compile/test_full_graph.py b/tests/compile/test_full_graph.py
index 6e83fa36881e4..cf463f3e75254 100644
--- a/tests/compile/test_full_graph.py
+++ b/tests/compile/test_full_graph.py
@@ -1,22 +1,115 @@
# SPDX-License-Identifier: Apache-2.0
-import pytest
+from __future__ import annotations
+from typing import Any
+
+import pytest
+import torch
+
+from tests.quantization.utils import is_quant_method_supported
+from vllm import LLM, SamplingParams
from vllm.config import CompilationLevel
+from vllm.platforms import current_platform
from ..utils import fork_new_process_for_each_test
-from .utils import TEST_MODELS, check_full_graph_support
-@pytest.mark.parametrize("model_info", TEST_MODELS)
+@pytest.fixture(params=None, name="model_info")
+def models_list_fixture(request):
+ TEST_MODELS: list[tuple[str, dict[str, Any]]] = [
+ ("facebook/opt-125m", {}),
+ ("nm-testing/tinyllama-oneshot-w8w8-test-static-shape-change", {
+ "dtype": torch.float16,
+ "quantization": "compressed-tensors"
+ }),
+ ("neuralmagic/Llama-3.2-1B-Instruct-FP8-dynamic", {
+ "dtype": torch.float16,
+ "quantization": "compressed-tensors"
+ }),
+ ("neuralmagic/Llama-3.2-1B-Instruct-quantized.w8a8", {
+ "quantization": "compressed-tensors"
+ }),
+ ("meta-llama/Llama-3.2-1B-Instruct", {}),
+ ]
+
+ if is_quant_method_supported("aqlm"):
+ TEST_MODELS.append(("ISTA-DASLab/Llama-2-7b-AQLM-2Bit-1x16-hf", {
+ "quantization": "aqlm"
+ }))
+
+ # TODO: figure out why this fails.
+ if False and is_quant_method_supported("gguf"): # noqa: SIM223
+ TEST_MODELS.append(("TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF", {
+ "quantization": "gguf"
+ }))
+
+ if is_quant_method_supported("gptq"):
+ TEST_MODELS.append(("TheBloke/TinyLlama-1.1B-Chat-v0.3-GPTQ", {
+ "quantization": "gptq"
+ }))
+
+ if is_quant_method_supported("gptq_marlin"):
+ TEST_MODELS.append(("TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ", {
+ "quantization": "gptq_marlin"
+ }))
+
+ if is_quant_method_supported("gptq_marlin_24"):
+ TEST_MODELS.append(("alexm-nm/tinyllama-24-marlin24-4bit-g128", {
+ "quantization": "gptq_marlin_24"
+ }))
+
+ if is_quant_method_supported("marlin"):
+ TEST_MODELS.append(
+ ("robertgshaw2/TinyLlama-1.1B-Chat-v1.0-g128-marlin", {
+ "quantization": "marlin"
+ }))
+
+ if not current_platform.is_rocm() and is_quant_method_supported("awq"):
+ TEST_MODELS.append(("TheBloke/TinyLlama-1.1B-Chat-v0.3-AWQ", {
+ "quantization": "AWQ"
+ }))
+
+ return TEST_MODELS
+
+
@pytest.mark.parametrize(
"optimization_level",
- [CompilationLevel.DYNAMO_ONCE, CompilationLevel.PIECEWISE])
+ [CompilationLevel.DYNAMO_ONCE, CompilationLevel.PIECEWISE],
+)
+@pytest.mark.parametrize("model_info", "", indirect=True)
@fork_new_process_for_each_test
-def test_full_graph(model_info, optimization_level):
- model = model_info[0]
- model_kwargs = model_info[1]
- check_full_graph_support(model,
- model_kwargs,
- optimization_level,
- tp_size=1)
+def test_full_graph(
+ monkeypatch: pytest.MonkeyPatch,
+ model_info: tuple[str, dict[str, Any]],
+ optimization_level: int,
+):
+ model, model_kwargs = model_info
+
+ with monkeypatch.context() as m:
+ # make sure these models can be captured in full graph mode
+ m.setenv("VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE", "1")
+ print(f"MODEL={model}")
+
+ prompts = [
+ "Hello, my name is",
+ "The president of the United States is",
+ "The capital of France is",
+ "The future of AI is",
+ ]
+ sampling_params = SamplingParams(temperature=0)
+ llm = LLM(
+ model=model,
+ enforce_eager=True,
+ tensor_parallel_size=1,
+ disable_custom_all_reduce=True,
+ compilation_config=optimization_level,
+ **model_kwargs,
+ )
+ outputs = llm.generate(prompts, sampling_params)
+
+ # Print the outputs.
+ for output in outputs:
+ prompt = output.prompt
+ generated_text = output.outputs[0].text
+ print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
diff --git a/tests/compile/utils.py b/tests/compile/utils.py
deleted file mode 100644
index fb8270c26b1b0..0000000000000
--- a/tests/compile/utils.py
+++ /dev/null
@@ -1,93 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-
-import os
-
-import torch
-
-from tests.quantization.utils import is_quant_method_supported
-from vllm import LLM, SamplingParams
-from vllm.platforms import current_platform
-
-TEST_MODELS = [
- ("facebook/opt-125m", {}),
- ("nm-testing/tinyllama-oneshot-w8w8-test-static-shape-change", {
- "dtype": torch.float16,
- "quantization": "compressed-tensors"
- }),
- ("neuralmagic/Llama-3.2-1B-Instruct-FP8-dynamic", {
- "dtype": torch.float16,
- "quantization": "compressed-tensors"
- }),
- ("neuralmagic/Llama-3.2-1B-Instruct-quantized.w8a8", {
- "quantization": "compressed-tensors"
- }),
- ("meta-llama/Llama-3.2-1B-Instruct", {}),
-]
-
-if is_quant_method_supported("aqlm"):
- TEST_MODELS.append(("ISTA-DASLab/Llama-2-7b-AQLM-2Bit-1x16-hf", {
- "quantization": "aqlm"
- }))
-
-# TODO: figure out why this fails.
-if False and is_quant_method_supported("gguf"): # noqa: SIM223
- TEST_MODELS.append(("TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF", {
- "quantization": "gguf"
- }))
-
-if is_quant_method_supported("gptq"):
- TEST_MODELS.append(("TheBloke/TinyLlama-1.1B-Chat-v0.3-GPTQ", {
- "quantization": "gptq"
- }))
-
-if is_quant_method_supported("gptq_marlin"):
- TEST_MODELS.append(("TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ", {
- "quantization": "gptq_marlin"
- }))
-
-if is_quant_method_supported("gptq_marlin_24"):
- TEST_MODELS.append(("alexm-nm/tinyllama-24-marlin24-4bit-g128", {
- "quantization": "gptq_marlin_24"
- }))
-
-if is_quant_method_supported("marlin"):
- TEST_MODELS.append(("robertgshaw2/TinyLlama-1.1B-Chat-v1.0-g128-marlin", {
- "quantization": "marlin"
- }))
-
-if not current_platform.is_rocm() and is_quant_method_supported("awq"):
- TEST_MODELS.append(("TheBloke/TinyLlama-1.1B-Chat-v0.3-AWQ", {
- "quantization": "AWQ"
- }))
-
-
-def check_full_graph_support(model,
- model_kwargs,
- optimization_level,
- tp_size=1):
- # make sure these models can be captured in full graph mode
- os.environ["VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE"] = "1"
-
- print(f"MODEL={model}")
-
- prompts = [
- "Hello, my name is",
- "The president of the United States is",
- "The capital of France is",
- "The future of AI is",
- ]
- sampling_params = SamplingParams(temperature=0)
- llm = LLM(model=model,
- enforce_eager=True,
- tensor_parallel_size=tp_size,
- disable_custom_all_reduce=True,
- compilation_config=optimization_level,
- **model_kwargs)
-
- outputs = llm.generate(prompts, sampling_params)
-
- # Print the outputs.
- for output in outputs:
- prompt = output.prompt
- generated_text = output.outputs[0].text
- print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
diff --git a/tests/conftest.py b/tests/conftest.py
index 41c0e62ce14f3..30e5ca2eb137a 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -1110,4 +1110,4 @@ def pytest_collection_modifyitems(config, items):
skip_optional = pytest.mark.skip(reason="need --optional option to run")
for item in items:
if "optional" in item.keywords:
- item.add_marker(skip_optional)
+ item.add_marker(skip_optional)
\ No newline at end of file
diff --git a/tests/distributed/test_comm_ops.py b/tests/distributed/test_comm_ops.py
index 7b0346b8ab50f..ac6d6aae30063 100644
--- a/tests/distributed/test_comm_ops.py
+++ b/tests/distributed/test_comm_ops.py
@@ -3,7 +3,10 @@
Run `pytest tests/distributed/test_comm_ops.py`.
"""
-import os
+
+from __future__ import annotations
+
+from typing import Any, Callable
import pytest
import ray
@@ -17,12 +20,18 @@ from ..utils import init_test_distributed_environment, multi_process_parallel
@ray.remote(num_gpus=1, max_calls=1)
-def all_reduce_test_worker(tp_size: int, pp_size: int, rank: int,
- distributed_init_port: str):
+def all_reduce_test_worker(
+ monkeypatch: pytest.MonkeyPatch,
+ tp_size: int,
+ pp_size: int,
+ rank: int,
+ distributed_init_port: str,
+):
# it is important to delete the CUDA_VISIBLE_DEVICES environment variable
# so that each worker can see all the GPUs
# they will be able to set the device to the correct GPU
- os.environ.pop("CUDA_VISIBLE_DEVICES", None)
+ monkeypatch.delenv("CUDA_VISIBLE_DEVICES", raising=False)
+
device = torch.device(f"cuda:{rank}")
torch.cuda.set_device(device)
init_test_distributed_environment(tp_size, pp_size, rank,
@@ -39,12 +48,17 @@ def all_reduce_test_worker(tp_size: int, pp_size: int, rank: int,
@ray.remote(num_gpus=1, max_calls=1)
-def all_gather_test_worker(tp_size: int, pp_size: int, rank: int,
- distributed_init_port: str):
+def all_gather_test_worker(
+ monkeypatch: pytest.MonkeyPatch,
+ tp_size: int,
+ pp_size: int,
+ rank: int,
+ distributed_init_port: str,
+):
# it is important to delete the CUDA_VISIBLE_DEVICES environment variable
# so that each worker can see all the GPUs
# they will be able to set the device to the correct GPU
- os.environ.pop("CUDA_VISIBLE_DEVICES", None)
+ monkeypatch.delenv("CUDA_VISIBLE_DEVICES", raising=False)
device = torch.device(f"cuda:{rank}")
torch.cuda.set_device(device)
init_test_distributed_environment(tp_size, pp_size, rank,
@@ -67,12 +81,17 @@ def all_gather_test_worker(tp_size: int, pp_size: int, rank: int,
@ray.remote(num_gpus=1, max_calls=1)
-def broadcast_tensor_dict_test_worker(tp_size: int, pp_size: int, rank: int,
- distributed_init_port: str):
+def broadcast_tensor_dict_test_worker(
+ monkeypatch: pytest.MonkeyPatch,
+ tp_size: int,
+ pp_size: int,
+ rank: int,
+ distributed_init_port: str,
+):
# it is important to delete the CUDA_VISIBLE_DEVICES environment variable
# so that each worker can see all the GPUs
# they will be able to set the device to the correct GPU
- os.environ.pop("CUDA_VISIBLE_DEVICES", None)
+ monkeypatch.delenv("CUDA_VISIBLE_DEVICES", raising=False)
device = torch.device(f"cuda:{rank}")
torch.cuda.set_device(device)
init_test_distributed_environment(tp_size, pp_size, rank,
@@ -106,9 +125,14 @@ def broadcast_tensor_dict_test_worker(tp_size: int, pp_size: int, rank: int,
@ray.remote(num_gpus=1, max_calls=1)
-def send_recv_tensor_dict_test_worker(tp_size: int, pp_size: int, rank: int,
- distributed_init_port: str):
- os.environ.pop("CUDA_VISIBLE_DEVICES", None)
+def send_recv_tensor_dict_test_worker(
+ monkeypatch: pytest.MonkeyPatch,
+ tp_size: int,
+ pp_size: int,
+ rank: int,
+ distributed_init_port: str,
+):
+ monkeypatch.delenv("CUDA_VISIBLE_DEVICES", raising=False)
device = torch.device(f"cuda:{rank}")
torch.cuda.set_device(device)
init_test_distributed_environment(tp_size, pp_size, rank,
@@ -146,9 +170,14 @@ def send_recv_tensor_dict_test_worker(tp_size: int, pp_size: int, rank: int,
@ray.remote(num_gpus=1, max_calls=1)
-def send_recv_test_worker(tp_size: int, pp_size: int, rank: int,
- distributed_init_port: str):
- os.environ.pop("CUDA_VISIBLE_DEVICES", None)
+def send_recv_test_worker(
+ monkeypatch: pytest.MonkeyPatch,
+ tp_size: int,
+ pp_size: int,
+ rank: int,
+ distributed_init_port: str,
+):
+ monkeypatch.delenv("CUDA_VISIBLE_DEVICES", raising=False)
device = torch.device(f"cuda:{rank}")
torch.cuda.set_device(device)
init_test_distributed_environment(tp_size, pp_size, rank,
@@ -174,8 +203,12 @@ def send_recv_test_worker(tp_size: int, pp_size: int, rank: int,
all_reduce_test_worker, all_gather_test_worker,
broadcast_tensor_dict_test_worker
])
-def test_multi_process_tensor_parallel(tp_size, test_target):
- multi_process_parallel(tp_size, 1, test_target)
+def test_multi_process_tensor_parallel(
+ monkeypatch: pytest.MonkeyPatch,
+ tp_size: int,
+ test_target: Callable[..., Any],
+):
+ multi_process_parallel(monkeypatch, tp_size, 1, test_target)
@pytest.mark.skipif(torch.cuda.device_count() < 2,
@@ -183,8 +216,12 @@ def test_multi_process_tensor_parallel(tp_size, test_target):
@pytest.mark.parametrize("pp_size", [2])
@pytest.mark.parametrize(
"test_target", [send_recv_test_worker, send_recv_tensor_dict_test_worker])
-def test_multi_process_pipeline_parallel(pp_size, test_target):
- multi_process_parallel(1, pp_size, test_target)
+def test_multi_process_pipeline_parallel(
+ monkeypatch: pytest.MonkeyPatch,
+ pp_size: int,
+ test_target: Callable[..., Any],
+):
+ multi_process_parallel(monkeypatch, 1, pp_size, test_target)
@pytest.mark.skipif(torch.cuda.device_count() < 4,
@@ -197,5 +234,9 @@ def test_multi_process_pipeline_parallel(pp_size, test_target):
broadcast_tensor_dict_test_worker
])
def test_multi_process_tensor_parallel_pipeline_parallel(
- tp_size, pp_size, test_target):
- multi_process_parallel(tp_size, pp_size, test_target)
+ tp_size: int,
+ pp_size: int,
+ test_target: Callable[..., Any],
+ monkeypatch: pytest.MonkeyPatch,
+):
+ multi_process_parallel(monkeypatch, tp_size, pp_size, test_target)
diff --git a/tests/distributed/test_custom_all_reduce.py b/tests/distributed/test_custom_all_reduce.py
index 4928690bebb07..bfa7d06c4d075 100644
--- a/tests/distributed/test_custom_all_reduce.py
+++ b/tests/distributed/test_custom_all_reduce.py
@@ -1,6 +1,5 @@
# SPDX-License-Identifier: Apache-2.0
-import os
import random
import pytest
@@ -23,95 +22,115 @@ for i, v in enumerate(test_sizes):
@ray.remote(num_gpus=1, max_calls=1)
-def graph_allreduce(tp_size, pp_size, rank, distributed_init_port):
- os.environ.pop("CUDA_VISIBLE_DEVICES", None)
- device = torch.device(f"cuda:{rank}")
- torch.cuda.set_device(device)
- init_test_distributed_environment(tp_size, pp_size, rank,
- distributed_init_port)
- ensure_model_parallel_initialized(tp_size, pp_size)
- group = get_tensor_model_parallel_group().device_group
+def graph_allreduce(
+ monkeypatch: pytest.MonkeyPatch,
+ tp_size,
+ pp_size,
+ rank,
+ distributed_init_port,
+):
+ with monkeypatch.context() as m:
+ m.delenv("CUDA_VISIBLE_DEVICES", raising=False)
+ device = torch.device(f"cuda:{rank}")
+ torch.cuda.set_device(device)
+ init_test_distributed_environment(tp_size, pp_size, rank,
+ distributed_init_port)
+ ensure_model_parallel_initialized(tp_size, pp_size)
+ group = get_tensor_model_parallel_group().device_group
- # A small all_reduce for warmup.
- # this is needed because device communicators might be created lazily
- # (e.g. NCCL). This will ensure that the communicator is initialized
- # before any communication happens, so that this group can be used for
- # graph capture immediately.
- data = torch.zeros(1)
- data = data.to(device=device)
- torch.distributed.all_reduce(data, group=group)
- torch.cuda.synchronize()
- del data
+ # A small all_reduce for warmup.
+ # this is needed because device communicators might be created lazily
+ # (e.g. NCCL). This will ensure that the communicator is initialized
+ # before any communication happens, so that this group can be used for
+ # graph capture immediately.
+ data = torch.zeros(1)
+ data = data.to(device=device)
+ torch.distributed.all_reduce(data, group=group)
+ torch.cuda.synchronize()
+ del data
- # we use the first group to communicate once
- # and the second group to communicate twice
- # and so on
- # this is used to demonstrate that each group can
- # communicate independently
- num_communication = rank // tp_size + 1
+ # we use the first group to communicate once
+ # and the second group to communicate twice
+ # and so on
+ # this is used to demonstrate that each group can
+ # communicate independently
+ num_communication = rank // tp_size + 1
- for sz in test_sizes:
- for dtype in [torch.float32, torch.float16, torch.bfloat16]:
- with graph_capture(device=device) as graph_capture_context:
- # use integers so result matches NCCL exactly
- inp1 = torch.randint(1,
- 16, (sz, ),
- dtype=dtype,
- device=torch.cuda.current_device())
- inp2 = torch.randint(1,
- 16, (sz, ),
- dtype=dtype,
- device=torch.cuda.current_device())
- torch.cuda.synchronize()
- graph = torch.cuda.CUDAGraph()
- with torch.cuda.graph(graph,
- stream=graph_capture_context.stream):
- for i in range(num_communication):
- out1 = tensor_model_parallel_all_reduce(inp1)
- # the input buffer is immediately modified to test
- # synchronization
- dist.all_reduce(inp1, group=group)
- out2 = tensor_model_parallel_all_reduce(inp2)
- dist.all_reduce(inp2, group=group)
- graph.replay()
- torch.testing.assert_close(out1, inp1)
- torch.testing.assert_close(out2, inp2)
+ for sz in test_sizes:
+ for dtype in [torch.float32, torch.float16, torch.bfloat16]:
+ with graph_capture(device=device) as graph_capture_context:
+ # use integers so result matches NCCL exactly
+ inp1 = torch.randint(1,
+ 16, (sz, ),
+ dtype=dtype,
+ device=torch.cuda.current_device())
+ inp2 = torch.randint(1,
+ 16, (sz, ),
+ dtype=dtype,
+ device=torch.cuda.current_device())
+ torch.cuda.synchronize()
+ graph = torch.cuda.CUDAGraph()
+ with torch.cuda.graph(graph,
+ stream=graph_capture_context.stream):
+ for i in range(num_communication):
+ out1 = tensor_model_parallel_all_reduce(inp1)
+ # the input buffer is immediately modified to test
+ # synchronization
+ dist.all_reduce(inp1, group=group)
+ out2 = tensor_model_parallel_all_reduce(inp2)
+ dist.all_reduce(inp2, group=group)
+ graph.replay()
+ torch.testing.assert_close(out1, inp1)
+ torch.testing.assert_close(out2, inp2)
@ray.remote(num_gpus=1, max_calls=1)
-def eager_allreduce(tp_size, pp_size, rank, distributed_init_port):
- os.environ.pop("CUDA_VISIBLE_DEVICES", None)
- device = torch.device(f"cuda:{rank}")
- torch.cuda.set_device(device)
- init_test_distributed_environment(tp_size, pp_size, rank,
- distributed_init_port)
+def eager_allreduce(
+ monkeypatch: pytest.MonkeyPatch,
+ tp_size,
+ pp_size,
+ rank,
+ distributed_init_port,
+):
+ with monkeypatch.context() as m:
+ m.delenv("CUDA_VISIBLE_DEVICES", raising=False)
+ device = torch.device(f"cuda:{rank}")
+ torch.cuda.set_device(device)
+ init_test_distributed_environment(tp_size, pp_size, rank,
+ distributed_init_port)
- # we use the first group to communicate once
- # and the second group to communicate twice
- # and so on
- # this is used to demonstrate that each group can
- # communicate independently
- num_communication = rank // tp_size + 1
- sz = 1024
- fa = get_tp_group().ca_comm
- inp = torch.ones(sz, dtype=torch.float32, device=device)
- out = inp
- for _ in range(num_communication):
- out = fa.all_reduce(out, registered=False)
- torch.testing.assert_close(out, inp * (tp_size**num_communication))
+ # we use the first group to communicate once
+ # and the second group to communicate twice
+ # and so on
+ # this is used to demonstrate that each group can
+ # communicate independently
+ num_communication = rank // tp_size + 1
+ sz = 1024
+ fa = get_tp_group().ca_comm
+ inp = torch.ones(sz, dtype=torch.float32, device=device)
+ out = inp
+ for _ in range(num_communication):
+ out = fa.all_reduce(out, registered=False)
+ torch.testing.assert_close(out, inp * (tp_size**num_communication))
- inp = torch.ones(sz * 4, dtype=torch.bfloat16, device=device)
- out = inp
- for _ in range(num_communication):
- out = fa.all_reduce(out, registered=False)
- torch.testing.assert_close(out, inp * (tp_size**num_communication))
+ inp = torch.ones(sz * 4, dtype=torch.bfloat16, device=device)
+ out = inp
+ for _ in range(num_communication):
+ out = fa.all_reduce(out, registered=False)
+ torch.testing.assert_close(out, inp * (tp_size**num_communication))
@pytest.mark.parametrize("tp_size", [2])
@pytest.mark.parametrize("pipeline_parallel_size", [1, 2])
@pytest.mark.parametrize("test_target", [eager_allreduce, graph_allreduce])
-def test_custom_allreduce(tp_size, pipeline_parallel_size, test_target):
+def test_custom_allreduce(
+ monkeypatch: pytest.MonkeyPatch,
+ tp_size,
+ pipeline_parallel_size,
+ test_target,
+):
world_size = tp_size * pipeline_parallel_size
if world_size > torch.cuda.device_count():
pytest.skip("Not enough GPUs to run the test.")
- multi_process_parallel(tp_size, pipeline_parallel_size, test_target)
+ multi_process_parallel(monkeypatch, tp_size, pipeline_parallel_size,
+ test_target)
diff --git a/tests/distributed/test_pipeline_partition.py b/tests/distributed/test_pipeline_partition.py
index 18c5be29c5ce1..7bf93f270148b 100644
--- a/tests/distributed/test_pipeline_partition.py
+++ b/tests/distributed/test_pipeline_partition.py
@@ -7,33 +7,35 @@ import pytest
from vllm.distributed.utils import get_pp_indices
-def test_custom_layer_partition():
+def test_custom_layer_partition(monkeypatch: pytest.MonkeyPatch):
- def _verify(partition_str, num_layers, pp_size, goldens):
- bak = os.environ.get("VLLM_PP_LAYER_PARTITION", None)
- os.environ["VLLM_PP_LAYER_PARTITION"] = partition_str
- for pp_rank, golden in enumerate(goldens):
- assert get_pp_indices(num_layers, pp_rank, pp_size) == golden
- if bak is not None:
- os.environ["VLLM_PP_LAYER_PARTITION"] = bak
+ with monkeypatch.context() as m:
- # Even partition
- _verify("5,5,5,5", 20, 4, [(0, 5), (5, 10), (10, 15), (15, 20)])
- # Balanced partition
- _verify("4,6,6,4", 20, 4, [(0, 4), (4, 10), (10, 16), (16, 20)])
- # Put reminder somewhere
- _verify("5,6,5,6", 22, 4, [(0, 5), (5, 11), (11, 16), (16, 22)])
- # Invalid partition strings
- with pytest.raises(ValueError):
- _verify("5,5,5,5,", 20, 4, [(0, 5), (5, 10), (10, 15), (15, 20)])
- with pytest.raises(ValueError):
- _verify("5,5,5,a", 20, 4, [(0, 5), (5, 10), (10, 15), (15, 20)])
- # Wrong number of partitions
- with pytest.raises(ValueError):
- _verify("5,5,5", 20, 4, [(0, 5), (5, 10), (10, 15), (15, 20)])
- # Wrong number of layers
- with pytest.raises(ValueError):
- _verify("5,5,5,5", 21, 4, [(0, 5), (5, 10), (10, 15), (15, 20)])
+ def _verify(partition_str, num_layers, pp_size, goldens):
+ bak = os.environ.get("VLLM_PP_LAYER_PARTITION", None)
+ m.setenv("VLLM_PP_LAYER_PARTITION", partition_str)
+ for pp_rank, golden in enumerate(goldens):
+ assert get_pp_indices(num_layers, pp_rank, pp_size) == golden
+ if bak is not None:
+ m.setenv("VLLM_PP_LAYER_PARTITION", bak)
+
+ # Even partition
+ _verify("5,5,5,5", 20, 4, [(0, 5), (5, 10), (10, 15), (15, 20)])
+ # Balanced partition
+ _verify("4,6,6,4", 20, 4, [(0, 4), (4, 10), (10, 16), (16, 20)])
+        # Put remainder somewhere
+ _verify("5,6,5,6", 22, 4, [(0, 5), (5, 11), (11, 16), (16, 22)])
+ # Invalid partition strings
+ with pytest.raises(ValueError):
+ _verify("5,5,5,5,", 20, 4, [(0, 5), (5, 10), (10, 15), (15, 20)])
+ with pytest.raises(ValueError):
+ _verify("5,5,5,a", 20, 4, [(0, 5), (5, 10), (10, 15), (15, 20)])
+ # Wrong number of partitions
+ with pytest.raises(ValueError):
+ _verify("5,5,5", 20, 4, [(0, 5), (5, 10), (10, 15), (15, 20)])
+ # Wrong number of layers
+ with pytest.raises(ValueError):
+ _verify("5,5,5,5", 21, 4, [(0, 5), (5, 10), (10, 15), (15, 20)])
@pytest.mark.parametrize(
@@ -55,6 +57,10 @@ def test_custom_layer_partition():
(5, 3, 1, (2, 4)),
(5, 3, 2, (4, 5)),
])
-def test_uneven_auto_partition(num_hidden_layers: int, pp_size: int,
- pp_rank: int, indices: tuple[int, int]):
+def test_uneven_auto_partition(
+ num_hidden_layers: int,
+ pp_size: int,
+ pp_rank: int,
+ indices: tuple[int, int],
+):
assert indices == get_pp_indices(num_hidden_layers, pp_rank, pp_size)
diff --git a/tests/distributed/test_pp_cudagraph.py b/tests/distributed/test_pp_cudagraph.py
index 3bc85b05e7d15..19414971f2b46 100644
--- a/tests/distributed/test_pp_cudagraph.py
+++ b/tests/distributed/test_pp_cudagraph.py
@@ -1,11 +1,15 @@
# SPDX-License-Identifier: Apache-2.0
+from __future__ import annotations
-import os
+from typing import TYPE_CHECKING
import pytest
from ..utils import compare_two_settings, fork_new_process_for_each_test
+if TYPE_CHECKING:
+ from typing_extensions import LiteralString
+
@pytest.mark.parametrize("PP_SIZE, MODEL_NAME", [
(2, "JackFram/llama-160m"),
@@ -15,18 +19,24 @@ from ..utils import compare_two_settings, fork_new_process_for_each_test
"FLASHINFER",
])
@fork_new_process_for_each_test
-def test_pp_cudagraph(PP_SIZE, MODEL_NAME, ATTN_BACKEND):
- cudagraph_args = [
- # use half precision for speed and memory savings in CI environment
- "--dtype",
- "float16",
- "--pipeline-parallel-size",
- str(PP_SIZE),
- "--distributed-executor-backend",
- "mp",
- ]
- os.environ["VLLM_ATTENTION_BACKEND"] = ATTN_BACKEND
+def test_pp_cudagraph(
+ monkeypatch: pytest.MonkeyPatch,
+ PP_SIZE: int,
+ MODEL_NAME: str,
+ ATTN_BACKEND: LiteralString,
+):
+ with monkeypatch.context() as m:
+ cudagraph_args = [
+ # use half precision for speed and memory savings in CI environment
+ "--dtype",
+ "float16",
+ "--pipeline-parallel-size",
+ str(PP_SIZE),
+ "--distributed-executor-backend",
+ "mp",
+ ]
+ m.setenv("VLLM_ATTENTION_BACKEND", ATTN_BACKEND)
- eager_args = cudagraph_args + ["--enforce-eager"]
+ eager_args = cudagraph_args + ["--enforce-eager"]
- compare_two_settings(MODEL_NAME, eager_args, cudagraph_args)
+ compare_two_settings(MODEL_NAME, eager_args, cudagraph_args)
diff --git a/tests/entrypoints/llm/test_accuracy.py b/tests/entrypoints/llm/test_accuracy.py
index 3ebc5a44d80c6..77fbb5827da9e 100644
--- a/tests/entrypoints/llm/test_accuracy.py
+++ b/tests/entrypoints/llm/test_accuracy.py
@@ -49,7 +49,7 @@ TPU_TP_TEST_STR = "" #"tensor_parallel_size=4"
@pytest.mark.skipif(not current_platform.is_cuda()
and not current_platform.is_tpu(),
reason="V1 is currently only supported on CUDA and TPU")
-def test_lm_eval_accuracy_v1_engine(monkeypatch):
+def test_lm_eval_accuracy_v1_engine(monkeypatch: pytest.MonkeyPatch):
"""Run with the V1 Engine."""
with monkeypatch.context() as m:
@@ -67,7 +67,7 @@ def test_lm_eval_accuracy_v1_engine(monkeypatch):
run_test(more_args)
-def test_lm_eval_accuracy_v0_engine(monkeypatch):
+def test_lm_eval_accuracy_v0_engine(monkeypatch: pytest.MonkeyPatch):
"""Run with the V0 Engine."""
with monkeypatch.context() as m:
diff --git a/tests/entrypoints/offline_mode/test_offline_mode.py b/tests/entrypoints/offline_mode/test_offline_mode.py
index 85156d6931c8c..23fd72f4ebbb9 100644
--- a/tests/entrypoints/offline_mode/test_offline_mode.py
+++ b/tests/entrypoints/offline_mode/test_offline_mode.py
@@ -53,32 +53,37 @@ def cache_models():
@pytest.mark.skip_global_cleanup
@pytest.mark.usefixtures("cache_models")
-def test_offline_mode(monkeypatch):
+def test_offline_mode(monkeypatch: pytest.MonkeyPatch):
# Set HF to offline mode and ensure we can still construct an LLM
- try:
- monkeypatch.setenv("HF_HUB_OFFLINE", "1")
- monkeypatch.setenv("VLLM_NO_USAGE_STATS", "1")
+ with monkeypatch.context() as m:
+ try:
+ m.setenv("HF_HUB_OFFLINE", "1")
+ m.setenv("VLLM_NO_USAGE_STATS", "1")
- def disable_connect(*args, **kwargs):
- raise RuntimeError("No http calls allowed")
+ def disable_connect(*args, **kwargs):
+ raise RuntimeError("No http calls allowed")
- monkeypatch.setattr(urllib3.connection.HTTPConnection, "connect",
- disable_connect)
- monkeypatch.setattr(urllib3.connection.HTTPSConnection, "connect",
- disable_connect)
+ m.setattr(
+ urllib3.connection.HTTPConnection,
+ "connect",
+ disable_connect,
+ )
+ m.setattr(
+ urllib3.connection.HTTPSConnection,
+ "connect",
+ disable_connect,
+ )
- # Need to re-import huggingface_hub and friends to setup offline mode
- _re_import_modules()
- # Cached model files should be used in offline mode
- for model_config in MODEL_CONFIGS:
- LLM(**model_config)
- finally:
- # Reset the environment after the test
- # NB: Assuming tests are run in online mode
- monkeypatch.delenv("HF_HUB_OFFLINE")
- monkeypatch.delenv("VLLM_NO_USAGE_STATS")
- _re_import_modules()
- pass
+ # Need to re-import huggingface_hub
+            # and friends to set up offline mode
+ _re_import_modules()
+ # Cached model files should be used in offline mode
+ for model_config in MODEL_CONFIGS:
+ LLM(**model_config)
+ finally:
+ # Reset the environment after the test
+ # NB: Assuming tests are run in online mode
+ _re_import_modules()
def _re_import_modules():
diff --git a/tests/entrypoints/openai/correctness/test_lmeval.py b/tests/entrypoints/openai/correctness/test_lmeval.py
index e4c087db3d4f0..d3948e2ed575e 100644
--- a/tests/entrypoints/openai/correctness/test_lmeval.py
+++ b/tests/entrypoints/openai/correctness/test_lmeval.py
@@ -70,7 +70,7 @@ def run_test(more_args):
@pytest.mark.skipif(not current_platform.is_cuda()
and not current_platform.is_tpu(),
reason="V1 currently only supported on CUDA and TPU")
-def test_lm_eval_accuracy_v1_engine(monkeypatch):
+def test_lm_eval_accuracy_v1_engine(monkeypatch: pytest.MonkeyPatch):
"""Run with the V1 Engine."""
with monkeypatch.context() as m:
@@ -85,7 +85,8 @@ def test_lm_eval_accuracy_v1_engine(monkeypatch):
@pytest.mark.parametrize("more_args", MORE_ARGS_LIST)
-def test_lm_eval_accuracy_v0_engine(monkeypatch, more_args):
+def test_lm_eval_accuracy_v0_engine(monkeypatch: pytest.MonkeyPatch,
+ more_args):
"""Run with the V0 Engine."""
with monkeypatch.context() as m:
diff --git a/tests/kernels/test_attention_selector.py b/tests/kernels/test_attention_selector.py
index 570e643e0364d..66db7509cc474 100644
--- a/tests/kernels/test_attention_selector.py
+++ b/tests/kernels/test_attention_selector.py
@@ -5,13 +5,12 @@ from unittest.mock import Mock, patch
import pytest
import torch
-from tests.kernels.utils import override_backend_env_variable
from vllm.attention.selector import _cached_get_attn_backend, get_attn_backend
from vllm.platforms.cpu import CpuPlatform
from vllm.platforms.cuda import CudaPlatform
from vllm.platforms.openvino import OpenVinoPlatform
from vllm.platforms.rocm import RocmPlatform
-from vllm.utils import STR_FLASH_ATTN_VAL, STR_INVALID_VAL
+from vllm.utils import STR_BACKEND_ENV_VAR, STR_FLASH_ATTN_VAL, STR_INVALID_VAL
@pytest.fixture(autouse=True)
@@ -25,87 +24,111 @@ def clear_cache():
"name", ["TORCH_SDPA", "ROCM_FLASH", "XFORMERS", "FLASHINFER", "OPENVINO"])
@pytest.mark.parametrize("use_v1", [True, False])
@pytest.mark.parametrize("device", ["cpu", "openvino", "hip", "cuda"])
-def test_env(name: str, use_v1: bool, device: str, monkeypatch):
+def test_env(
+ name: str,
+ use_v1: bool,
+ device: str,
+ monkeypatch: pytest.MonkeyPatch,
+):
"""Test that the attention selector can be set via environment variable.
Note that we do not test FlashAttn because it is the default backend.
"""
- monkeypatch.setenv("VLLM_USE_V1", "1" if use_v1 else "0")
- override_backend_env_variable(monkeypatch, name)
+ with monkeypatch.context() as m:
+ m.setenv("VLLM_USE_V1", "1" if use_v1 else "0")
+ m.setenv(STR_BACKEND_ENV_VAR, name)
- if device == "cpu":
- with patch("vllm.attention.selector.current_platform", CpuPlatform()):
- backend = get_attn_backend(16, torch.float16, torch.float16, 16,
- False)
- assert backend.get_name() == "TORCH_SDPA"
- elif device == "hip":
- with patch("vllm.attention.selector.current_platform", RocmPlatform()):
- backend = get_attn_backend(16, torch.float16, torch.float16, 16,
- False)
- EXPECTED = "ROCM_ATTN_VLLM_V1" if use_v1 else "ROCM_FLASH"
- assert backend.get_name() == EXPECTED
- elif device == "openvino":
- with patch("vllm.attention.selector.current_platform",
- OpenVinoPlatform()), patch.dict('sys.modules',
- {'openvino': Mock()}):
- backend = get_attn_backend(16, torch.float16, torch.float16, 16,
- False)
- assert backend.get_name() == "OPENVINO"
- else:
- if name in ["XFORMERS", "FLASHINFER"]:
+ if device == "cpu":
with patch("vllm.attention.selector.current_platform",
- CudaPlatform()):
+ CpuPlatform()):
backend = get_attn_backend(16, torch.float16, torch.float16,
16, False)
- EXPECTED = "FLASH_ATTN_VLLM_V1" if use_v1 else name
+ assert backend.get_name() == "TORCH_SDPA"
+ elif device == "hip":
+ with patch("vllm.attention.selector.current_platform",
+ RocmPlatform()):
+ backend = get_attn_backend(16, torch.float16, torch.float16,
+ 16, False)
+ EXPECTED = "ROCM_ATTN_VLLM_V1" if use_v1 else "ROCM_FLASH"
assert backend.get_name() == EXPECTED
+ elif device == "openvino":
+ with patch("vllm.attention.selector.current_platform",
+ OpenVinoPlatform()), patch.dict('sys.modules',
+ {'openvino': Mock()}):
+ backend = get_attn_backend(16, torch.float16, torch.float16,
+ 16, False)
+ assert backend.get_name() == "OPENVINO"
+ else:
+ if name in ["XFORMERS", "FLASHINFER"]:
+ with patch("vllm.attention.selector.current_platform",
+ CudaPlatform()):
+ backend = get_attn_backend(16, torch.float16,
+ torch.float16, 16, False)
+ EXPECTED = "FLASH_ATTN_VLLM_V1" if use_v1 else name
+ assert backend.get_name() == EXPECTED
-def test_flash_attn(monkeypatch):
+def test_flash_attn(monkeypatch: pytest.MonkeyPatch):
"""Test FlashAttn validation."""
# TODO: When testing for v1, pipe in `use_v1` as an argument to
# get_attn_backend
- override_backend_env_variable(monkeypatch, STR_FLASH_ATTN_VAL)
+ with monkeypatch.context() as m:
+ m.setenv(STR_BACKEND_ENV_VAR, STR_FLASH_ATTN_VAL)
- # Unsupported CUDA arch
- with patch("torch.cuda.get_device_capability", return_value=(7, 5)):
+ # Unsupported CUDA arch
+ monkeypatch.setattr(torch.cuda, "get_device_capability", lambda:
+ (7, 5))
backend = get_attn_backend(16, torch.float16, None, 16, False)
assert backend.get_name() != STR_FLASH_ATTN_VAL
- # Unsupported data type
- backend = get_attn_backend(16, torch.float8_e4m3fn, None, 16, False)
- assert backend.get_name() != STR_FLASH_ATTN_VAL
+ # Reset the monkeypatch for subsequent tests
+ monkeypatch.undo()
- # Unsupported kv cache data type
- backend = get_attn_backend(16, torch.float16, "fp8", 16, False)
- assert backend.get_name() != STR_FLASH_ATTN_VAL
+ # Unsupported data type
+ backend = get_attn_backend(16, torch.float8_e4m3fn, None, 16, False)
+ assert backend.get_name() != STR_FLASH_ATTN_VAL
- # Unsupported block size
- backend = get_attn_backend(16, torch.float16, None, 8, False)
- assert backend.get_name() != STR_FLASH_ATTN_VAL
+ # Unsupported kv cache data type
+ backend = get_attn_backend(16, torch.float16, "fp8", 16, False)
+ assert backend.get_name() != STR_FLASH_ATTN_VAL
- # flash-attn is not installed
- with patch.dict('sys.modules', {'vllm_flash_attn': None}):
+ # Unsupported block size
+ backend = get_attn_backend(16, torch.float16, None, 8, False)
+ assert backend.get_name() != STR_FLASH_ATTN_VAL
+
+ # flash-attn is not installed
+ import sys
+ original_module = sys.modules.get('vllm_flash_attn')
+ monkeypatch.setitem(sys.modules, 'vllm_flash_attn', None)
backend = get_attn_backend(16, torch.float16, None, 16, False)
assert backend.get_name() != STR_FLASH_ATTN_VAL
- # Unsupported head size
- backend = get_attn_backend(17, torch.float16, None, 16, False)
- assert backend.get_name() != STR_FLASH_ATTN_VAL
+ # Restore the original module if it existed
+ if original_module is not None:
+ monkeypatch.setitem(sys.modules, 'vllm_flash_attn',
+ original_module)
+ else:
+ monkeypatch.delitem(sys.modules, 'vllm_flash_attn', raising=False)
- # Attention-free models should bypass env and use PlaceholderAttention
- backend = get_attn_backend(16, torch.float16, torch.float16, 16, True)
- assert backend.get_name() != STR_FLASH_ATTN_VAL
+ # Unsupported head size
+ backend = get_attn_backend(17, torch.float16, None, 16, False)
+ assert backend.get_name() != STR_FLASH_ATTN_VAL
+
+ # Attention-free models should bypass env and use PlaceholderAttention
+ backend = get_attn_backend(16, torch.float16, torch.float16, 16, True)
+ assert backend.get_name() != STR_FLASH_ATTN_VAL
@pytest.mark.parametrize("use_v1", [True, False])
-def test_invalid_env(use_v1: bool, monkeypatch):
- """Ignore the invalid env variable if it is set."""
- monkeypatch.setenv("VLLM_USE_V1", "1" if use_v1 else "0")
- override_backend_env_variable(monkeypatch, STR_INVALID_VAL)
+def test_invalid_env(use_v1: bool, monkeypatch: pytest.MonkeyPatch):
- with patch("vllm.attention.selector.current_platform", CudaPlatform()):
+ with monkeypatch.context() as m, patch(
+ "vllm.attention.selector.current_platform", CudaPlatform()):
+ m.setenv("VLLM_USE_V1", "1" if use_v1 else "0")
+ m.setenv(STR_BACKEND_ENV_VAR, STR_INVALID_VAL)
+
+ # Test with head size 32
backend = get_attn_backend(32, torch.float16, None, 16, False)
EXPECTED = "FLASH_ATTN_VLLM_V1" if use_v1 else "FLASH_ATTN"
assert backend.get_name() == EXPECTED
diff --git a/tests/kernels/test_awq.py b/tests/kernels/test_awq.py
index 37ce00c74030a..248b294e546b3 100644
--- a/tests/kernels/test_awq.py
+++ b/tests/kernels/test_awq.py
@@ -1,7 +1,5 @@
# SPDX-License-Identifier: Apache-2.0
-import os
-
import pytest
import torch
@@ -11,36 +9,38 @@ from vllm import _custom_ops as ops # noqa: F401
@pytest.mark.skipif(not hasattr(torch.ops._C, "awq_dequantize"),
reason="AWQ is not supported on this GPU type.")
-def test_awq_dequantize_opcheck():
- os.environ["VLLM_USE_TRITON_AWQ"] = "0"
- qweight = torch.randint(-2000000000,
- 2000000000, (8192, 256),
- device='cuda',
- dtype=torch.int32)
- scales = torch.rand((64, 2048), device='cuda', dtype=torch.float16)
- zeros = torch.empty((64, 256), device='cuda', dtype=torch.int32)
- split_k_iters = 0
- thx = 0
- thy = 0
- opcheck(torch.ops._C.awq_dequantize,
- (qweight, scales, zeros, split_k_iters, thx, thy))
+def test_awq_dequantize_opcheck(monkeypatch: pytest.MonkeyPatch):
+ with monkeypatch.context() as m:
+ m.setenv("VLLM_USE_TRITON_AWQ", "0")
+ qweight = torch.randint(-2000000000,
+ 2000000000, (8192, 256),
+ device='cuda',
+ dtype=torch.int32)
+ scales = torch.rand((64, 2048), device='cuda', dtype=torch.float16)
+ zeros = torch.empty((64, 256), device='cuda', dtype=torch.int32)
+ split_k_iters = 0
+ thx = 0
+ thy = 0
+ opcheck(torch.ops._C.awq_dequantize,
+ (qweight, scales, zeros, split_k_iters, thx, thy))
@pytest.mark.skip(reason="Not working; needs investigation.")
@pytest.mark.skipif(not hasattr(torch.ops._C, "awq_gemm"),
reason="AWQ is not supported on this GPU type.")
-def test_awq_gemm_opcheck():
- os.environ["VLLM_USE_TRITON_AWQ"] = "0"
- input = torch.rand((2, 8192), device='cuda', dtype=torch.float16)
- qweight = torch.randint(-2000000000,
- 2000000000, (8192, 256),
- device='cuda',
- dtype=torch.int32)
- scales = torch.randint(-2000000000,
- 2000000000, (64, 256),
- device='cuda',
- dtype=torch.int32)
- qzeros = torch.empty((64, 2048), device='cuda', dtype=torch.float16)
- split_k_iters = 8
- opcheck(torch.ops._C.awq_gemm,
- (input, qweight, qzeros, scales, split_k_iters))
+def test_awq_gemm_opcheck(monkeypatch: pytest.MonkeyPatch):
+ with monkeypatch.context() as m:
+ m.setenv("VLLM_USE_TRITON_AWQ", "0")
+ input = torch.rand((2, 8192), device='cuda', dtype=torch.float16)
+ qweight = torch.randint(-2000000000,
+ 2000000000, (8192, 256),
+ device='cuda',
+ dtype=torch.int32)
+ scales = torch.randint(-2000000000,
+ 2000000000, (64, 256),
+ device='cuda',
+ dtype=torch.int32)
+ qzeros = torch.empty((64, 2048), device='cuda', dtype=torch.float16)
+ split_k_iters = 8
+ opcheck(torch.ops._C.awq_gemm,
+ (input, qweight, qzeros, scales, split_k_iters))
diff --git a/tests/kernels/test_rocm_attention_selector.py b/tests/kernels/test_rocm_attention_selector.py
index 7cd6082486605..724f0af283f70 100644
--- a/tests/kernels/test_rocm_attention_selector.py
+++ b/tests/kernels/test_rocm_attention_selector.py
@@ -1,13 +1,11 @@
# SPDX-License-Identifier: Apache-2.0
-from unittest.mock import patch
-
import pytest
import torch
-from tests.kernels.utils import override_backend_env_variable
from vllm.attention.selector import _cached_get_attn_backend, get_attn_backend
from vllm.platforms.rocm import RocmPlatform
+from vllm.utils import STR_BACKEND_ENV_VAR
@pytest.fixture(autouse=True)
@@ -17,15 +15,19 @@ def clear_cache():
_cached_get_attn_backend.cache_clear()
-def test_selector(monkeypatch):
- """Test that the attention selector for ROCm.
- """
- override_backend_env_variable(monkeypatch, "ROCM_FLASH")
+def test_selector(monkeypatch: pytest.MonkeyPatch):
+ with monkeypatch.context() as m:
+ m.setenv(STR_BACKEND_ENV_VAR, "ROCM_FLASH")
- with patch("vllm.attention.selector.current_platform", RocmPlatform()):
+ # Set the current platform to ROCm using monkeypatch
+ monkeypatch.setattr("vllm.attention.selector.current_platform",
+ RocmPlatform())
+
+ # Test standard ROCm attention
backend = get_attn_backend(16, torch.float16, torch.float16, 16, False)
assert (backend.get_name() == "ROCM_FLASH"
or backend.get_name() == "ROCM_ATTN_VLLM_V1")
+
# mla test for deepseek related
backend = get_attn_backend(576, torch.bfloat16, "auto", 16, False,
False, True)
diff --git a/tests/kernels/utils.py b/tests/kernels/utils.py
index 010974076ba8f..22b3d7c2be7a5 100644
--- a/tests/kernels/utils.py
+++ b/tests/kernels/utils.py
@@ -36,12 +36,12 @@ ALL_OPCHECK_TEST_UTILS: tuple[str, ...] = (
class QKVInputs(NamedTuple):
'''
- Data structure for representing unpacked attention inputs,
+ Data structure for representing unpacked attention inputs,
query/key/values and their sequence lengths.
Attributes:
- * {query,key,value}: unpacked (batch_size x padded_seq_len x
+ * {query,key,value}: unpacked (batch_size x padded_seq_len x
num_heads x head_size) attention inputs
* q_seq_lens: query sequence lengths list
* kv_seq_lens: shared key/value sequence lengths list
@@ -56,14 +56,14 @@ class QKVInputs(NamedTuple):
class QKVO(NamedTuple):
'''
- Data structure for representing unpacked attention inputs,
+ Data structure for representing unpacked attention inputs,
alongside unpacked known-correct attention output
Attributes:
- * qkv: unpacked (batch_size x padded_seq_len x
+ * qkv: unpacked (batch_size x padded_seq_len x
num_heads x head_size) attention inputs
- * ideal_output: unpacked (batch_size x padded_seq_len x
+ * ideal_output: unpacked (batch_size x padded_seq_len x
num_heads x head_size) known-correct attention output
'''
@@ -77,7 +77,7 @@ class PackedQKVInputs(NamedTuple):
Attributes:
- * {query,key,value}: packed (number_of_tokens x num_heads
+ * {query,key,value}: packed (number_of_tokens x num_heads
x head_size) attention inputs
* q_start_loc_list: list of query start locations within packed tensor
* kv_start_loc_list: shared list of key/value start locations within
@@ -97,14 +97,14 @@ class PackedQKVInputs(NamedTuple):
class PackedQKVO(NamedTuple):
'''
- Data structure for representing packed attention inputs,
+ Data structure for representing packed attention inputs,
alongside packed known-correct attention output
Attributes:
- * packed_qkv: packed (number_of_tokens x num_heads
+ * packed_qkv: packed (number_of_tokens x num_heads
x head_size) attention inputs
- * ideal_output: packed (number_of_tokens x num_heads
+ * ideal_output: packed (number_of_tokens x num_heads
x head_size) known-correct attention output
'''
@@ -134,7 +134,7 @@ class PhaseTestParameters(NamedTuple):
Attributes:
- * packed_qkvo: packed (number_of_tokens x num_heads
+ * packed_qkvo: packed (number_of_tokens x num_heads
x head_size) attention inputs & known-correct
output
* kv_mmap: KV cache memory mapping, specific to this test phase &
@@ -195,7 +195,7 @@ def make_causal_mask(
Create a q_max_seq_len x kv_max_seq_len causal mask
Arguments:
-
+
* q_max_seq_len: query max seq len
* kv_max_seq_len: key/value max seq len
@@ -320,9 +320,9 @@ def make_qkv(
* max_kv_seq_len: max key/value seq len
* num_heads
* head_size
- * is_encoder_decoder_attn: if True, query seqlen may differ from
- key/value seqlen (as is often the case for cross-attention);
- o/w, query/key/value seqlens match at each batch index
+ * is_encoder_decoder_attn: if True, query seqlen may differ from
+ key/value seqlen (as is often the case for cross-attention);
+ o/w, query/key/value seqlens match at each batch index
(max_kv_seq_len is unused)
* force_kv_seq_lens: if not None, overrides kv sequence lengths
* attn_type: encoder, decoder self, or enc/dec cross attention
@@ -469,7 +469,7 @@ def pack_qkv(qkv: QKVInputs, device: Union[torch.device,
Individually pack each of Q, K and V, each with dimensions batch_size x
padded_seq_len x num_heads x head_size, into respective number_of_tokens x
num_heads x head_size tensors.
-
+
For Q, number_of_tokens = sum(q_seq_lens).
For K and V, number_of_tokens = sum(kv_seq_lens)
@@ -619,9 +619,9 @@ def make_kv_cache(num_blocks: int,
Returns:
* kv_cache: 2 x num_blocks x (block_size * num_heads * head_size)
- * for backend 'XFORMERS'
+ * for backend 'XFORMERS'
* kv_cache: 2 x num_blocks x block_size x num_heads x head_size
- * for backend 'FLASH_ATTN'
+ * for backend 'FLASH_ATTN'
'''
if backend == 'XFORMERS':
kv_cache = torch.rand(
@@ -662,20 +662,20 @@ def split_slot_mapping(slot_mapping_list: torch.Tensor, seq_lens: list[int],
Context:
* Your goal is to test (1) prefill of N prompts, with prompt-lengths
{K_i \\forall i \\in [0,N)}, followed by (2) decoding of a single token
- for all N prompts (N tokens total); the resultant sequence lengths
+ for all N prompts (N tokens total); the resultant sequence lengths
after decode would be {K_i + 1 for i \\in [0,N)}
- * The test you want to do requires (1) having the prefill slot mapping
- for all tokens present during prefill, the number of which is
- M = \\sum_i{K_i}, and (2) having the decode slot mapping for all N
+ * The test you want to do requires (1) having the prefill slot mapping
+ for all tokens present during prefill, the number of which is
+ M = \\sum_i{K_i}, and (2) having the decode slot mapping for all N
decoded tokens
-
- This function consumes a single 1D slot mapping, which is the
+
+ This function consumes a single 1D slot mapping, which is the
concatenation of N slot mappings each of length K_i + 1 (corresponding
to the sequence lengths after decode), with a total length of
P = \\sum_i{K_i + 1} = M + N
The prefill-phase slot mapping results from excising the (K_i + 1)-th entry
- from each of the N subsequences in the slot mapping (i.e. omitting the
+ from each of the N subsequences in the slot mapping (i.e. omitting the
decoded token's mapping.)
The N excised entries are appended to obtain the decode-phase slot mapping
@@ -684,15 +684,15 @@ def split_slot_mapping(slot_mapping_list: torch.Tensor, seq_lens: list[int],
* slot_mapping_list: Length-P 1D slot mapping (as list) reflecting all N
post-decode sequences
- * seq_lens: list of N post-decode sequence lengths (K_i + 1 in the
+ * seq_lens: list of N post-decode sequence lengths (K_i + 1 in the
description above)
* device: cuda, cpu, etc.
Returns:
- * prefill_slot_mapping: Length-M 1D slot mapping (as Tensor)
+ * prefill_slot_mapping: Length-M 1D slot mapping (as Tensor)
reflecting all N prefill prompts
- * decode_slot_mapping: Length-N 1D slot mapping (as Tensor) reflecting
+ * decode_slot_mapping: Length-N 1D slot mapping (as Tensor) reflecting
all N decoded tokens
'''
@@ -725,7 +725,7 @@ def make_block_tables_slot_mapping(
Then the minimum KV cache size in blocks is
- total_cache_blocks = sum(num_blocks for all seqs)
+ total_cache_blocks = sum(num_blocks for all seqs)
Then, the blocktable mapping counts downward from
@@ -734,7 +734,7 @@ def make_block_tables_slot_mapping(
to
block_base_addr
-
+
The constructed block-tables and slot-mapping are sized to the
lengths of the sequences in their entirety (as reflected by seq_lens),
@@ -749,7 +749,7 @@ def make_block_tables_slot_mapping(
Return:
- * block_tables_tensor: block table for sequence
+ * block_tables_tensor: block table for sequence
* slot_mapping_list: slot mapping for sequence
* max_block_idx: the highest block address within this block table
'''
@@ -807,7 +807,7 @@ def make_test_metadata(
encoder_test_params and cross_test_params arguments allow encoder
attention and enc/dec cross-attention (respectively) to use distinct
metadata values from decoder self-attention (decoder_test_params.)
-
+
if encoder_test_params and cross_test_params are None, the attention
metadata will support decoder-only scenario.
@@ -820,7 +820,7 @@ def make_test_metadata(
* attn_backend_name: Backend for sourcing attention kernels
* is_prompt: prefill if True, o/w decode
* seq_lens: list of token counts for each sequence
- * decoder_test_params: decoder self-attention test params;
+ * decoder_test_params: decoder self-attention test params;
this function requires
kv_mmap (memory mapping) field
* device: CPU or CUDA device
diff --git a/tests/kv_transfer/disagg_test.py b/tests/kv_transfer/test_disagg.py
similarity index 100%
rename from tests/kv_transfer/disagg_test.py
rename to tests/kv_transfer/test_disagg.py
diff --git a/tests/kv_transfer/module_test.py b/tests/kv_transfer/test_module.py
similarity index 100%
rename from tests/kv_transfer/module_test.py
rename to tests/kv_transfer/test_module.py
diff --git a/tests/models/decoder_only/language/test_fp8.py b/tests/models/decoder_only/language/test_fp8.py
index faca7a566e79c..51abcb7172cb7 100644
--- a/tests/models/decoder_only/language/test_fp8.py
+++ b/tests/models/decoder_only/language/test_fp8.py
@@ -12,11 +12,10 @@ import pytest
from tests.kernels.utils import override_backend_env_variable
from tests.quantization.utils import is_quant_method_supported
from vllm.platforms import current_platform
+from vllm.utils import STR_BACKEND_ENV_VAR
from ...utils import check_logprobs_close
-os.environ["TOKENIZERS_PARALLELISM"] = "true"
-
@pytest.mark.quant_model
@pytest.mark.skipif(not is_quant_method_supported("fp8"),
@@ -55,45 +54,47 @@ def test_models(
backend: str,
tensor_parallel_size: int,
disable_async_output_proc: bool,
- monkeypatch,
+ monkeypatch: pytest.MonkeyPatch,
) -> None:
"""
Only checks log probs match to cover the discrepancy in
numerical sensitive kernels.
"""
- override_backend_env_variable(monkeypatch, backend)
+ with monkeypatch.context() as m:
+ m.setenv("TOKENIZERS_PARALLELISM", 'true')
+ m.setenv(STR_BACKEND_ENV_VAR, backend)
- MAX_MODEL_LEN = 1024
- NUM_LOG_PROBS = 8
+ MAX_MODEL_LEN = 1024
+ NUM_LOG_PROBS = 8
- with vllm_runner(
- base_model,
- max_model_len=MAX_MODEL_LEN,
- tensor_parallel_size=tensor_parallel_size,
- enforce_eager=enforce_eager,
- kv_cache_dtype="auto",
- disable_async_output_proc=disable_async_output_proc,
- ) as vllm_model:
- baseline_outputs = vllm_model.generate_greedy_logprobs(
- example_prompts, max_tokens, NUM_LOG_PROBS)
+ with vllm_runner(
+ base_model,
+ max_model_len=MAX_MODEL_LEN,
+ tensor_parallel_size=tensor_parallel_size,
+ enforce_eager=enforce_eager,
+ kv_cache_dtype="auto",
+ disable_async_output_proc=disable_async_output_proc,
+ ) as vllm_model:
+ baseline_outputs = vllm_model.generate_greedy_logprobs(
+ example_prompts, max_tokens, NUM_LOG_PROBS)
- with vllm_runner(
- test_model,
- max_model_len=MAX_MODEL_LEN,
- tensor_parallel_size=tensor_parallel_size,
- enforce_eager=enforce_eager,
- kv_cache_dtype=kv_cache_dtype,
- disable_async_output_proc=disable_async_output_proc,
- ) as vllm_model:
- test_outputs = vllm_model.generate_greedy_logprobs(
- example_prompts, max_tokens, NUM_LOG_PROBS)
+ with vllm_runner(
+ test_model,
+ max_model_len=MAX_MODEL_LEN,
+ tensor_parallel_size=tensor_parallel_size,
+ enforce_eager=enforce_eager,
+ kv_cache_dtype=kv_cache_dtype,
+ disable_async_output_proc=disable_async_output_proc,
+ ) as vllm_model:
+ test_outputs = vllm_model.generate_greedy_logprobs(
+ example_prompts, max_tokens, NUM_LOG_PROBS)
- check_logprobs_close(
- outputs_0_lst=baseline_outputs,
- outputs_1_lst=test_outputs,
- name_0="fp16_kv_cache",
- name_1="fp8_kv_cache",
- )
+ check_logprobs_close(
+ outputs_0_lst=baseline_outputs,
+ outputs_1_lst=test_outputs,
+ name_0="fp16_kv_cache",
+ name_1="fp8_kv_cache",
+ )
@pytest.mark.cpu_model
@@ -119,38 +120,41 @@ def test_cpu_models(
test_model: str,
max_tokens: int,
disable_async_output_proc: bool,
+ monkeypatch: pytest.MonkeyPatch,
) -> None:
"""
Only checks log probs match to cover the discrepancy in
numerical sensitive kernels.
"""
+ with monkeypatch.context() as m:
+ m.setenv("TOKENIZERS_PARALLELISM", 'true')
- MAX_MODEL_LEN = 1024
- NUM_LOG_PROBS = 8
+ MAX_MODEL_LEN = 1024
+ NUM_LOG_PROBS = 8
- with vllm_runner(
- base_model,
- max_model_len=MAX_MODEL_LEN,
- dtype="bfloat16",
- kv_cache_dtype="auto",
- disable_async_output_proc=disable_async_output_proc,
- ) as vllm_model:
- baseline_outputs = vllm_model.generate_greedy_logprobs(
- example_prompts, max_tokens, NUM_LOG_PROBS)
+ with vllm_runner(
+ base_model,
+ max_model_len=MAX_MODEL_LEN,
+ dtype="bfloat16",
+ kv_cache_dtype="auto",
+ disable_async_output_proc=disable_async_output_proc,
+ ) as vllm_model:
+ baseline_outputs = vllm_model.generate_greedy_logprobs(
+ example_prompts, max_tokens, NUM_LOG_PROBS)
- with vllm_runner(
- test_model,
- max_model_len=MAX_MODEL_LEN,
- dtype="bfloat16",
- kv_cache_dtype=kv_cache_dtype,
- disable_async_output_proc=disable_async_output_proc,
- ) as vllm_model:
- test_outputs = vllm_model.generate_greedy_logprobs(
- example_prompts, max_tokens, NUM_LOG_PROBS)
+ with vllm_runner(
+ test_model,
+ max_model_len=MAX_MODEL_LEN,
+ dtype="bfloat16",
+ kv_cache_dtype=kv_cache_dtype,
+ disable_async_output_proc=disable_async_output_proc,
+ ) as vllm_model:
+ test_outputs = vllm_model.generate_greedy_logprobs(
+ example_prompts, max_tokens, NUM_LOG_PROBS)
- check_logprobs_close(
- outputs_0_lst=baseline_outputs,
- outputs_1_lst=test_outputs,
- name_0="bf16_kv_cache",
- name_1="fp8_kv_cache",
- )
+ check_logprobs_close(
+ outputs_0_lst=baseline_outputs,
+ outputs_1_lst=test_outputs,
+ name_0="bf16_kv_cache",
+ name_1="fp8_kv_cache",
+ )
diff --git a/tests/models/embedding/language/test_gritlm.py b/tests/models/embedding/language/test_gritlm.py
index cae3e1a5c6244..d6bf7d2706397 100644
--- a/tests/models/embedding/language/test_gritlm.py
+++ b/tests/models/embedding/language/test_gritlm.py
@@ -1,4 +1,5 @@
# SPDX-License-Identifier: Apache-2.0
+from __future__ import annotations
import importlib.util
import math
@@ -11,6 +12,7 @@ from scipy.spatial.distance import cosine
import vllm
import vllm.config
+from vllm.utils import STR_BACKEND_ENV_VAR
from ....utils import RemoteOpenAIServer
@@ -29,36 +31,34 @@ def _arr(arr):
return array("i", arr)
-def test_find_array(monkeypatch):
+def test_find_array(monkeypatch: pytest.MonkeyPatch):
# GritLM embedding implementation is only supported by XFormers backend.
- monkeypatch.setenv("VLLM_ATTENTION_BACKEND", "XFORMERS")
+ with monkeypatch.context() as m:
+ m.setenv(STR_BACKEND_ENV_VAR, "XFORMERS")
- from vllm.model_executor.models.gritlm import GritLMPooler
+ from vllm.model_executor.models.gritlm import GritLMPooler
- # Create an LLM object to get the model config.
- llm = vllm.LLM(MODEL_NAME, task="embed", max_model_len=MAX_MODEL_LEN)
- pooler = GritLMPooler(model_config=llm.llm_engine.model_config)
+ # Create an LLM object to get the model config.
+ llm = vllm.LLM(MODEL_NAME, task="embed", max_model_len=MAX_MODEL_LEN)
+ pooler = GritLMPooler(model_config=llm.llm_engine.model_config)
- arr = _arr([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])
+ arr = _arr([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])
- assert pooler._find_array(arr, _arr([3, 4, 5]), start_idx=0) == 3
- assert pooler._find_array(arr, _arr([3, 4, 5]), start_idx=1) == 3
- assert pooler._find_array(arr, _arr([3, 4, 5]), start_idx=5) == -1
- assert pooler._find_array(arr, _arr([3, 5]), start_idx=0) == -1
+ assert pooler._find_array(arr, _arr([3, 4, 5]), start_idx=0) == 3
+ assert pooler._find_array(arr, _arr([3, 4, 5]), start_idx=1) == 3
+ assert pooler._find_array(arr, _arr([3, 4, 5]), start_idx=5) == -1
+ assert pooler._find_array(arr, _arr([3, 5]), start_idx=0) == -1
- with pytest.raises(ValueError):
- pooler._find_array(arr, _arr([3, 4, 5]), start_idx=-1)
+ with pytest.raises(ValueError):
+ pooler._find_array(arr, _arr([3, 4, 5]), start_idx=-1)
@pytest.fixture(scope="module")
def server_embedding():
# GritLM embedding implementation is only supported by XFormers backend.
- with pytest.MonkeyPatch.context() as mp:
- mp.setenv("VLLM_ATTENTION_BACKEND", "XFORMERS")
-
- args = ["--task", "embed", "--max_model_len", str(MAX_MODEL_LEN)]
- with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
- yield remote_server
+ args = ["--task", "embed", "--max_model_len", str(MAX_MODEL_LEN)]
+ with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
+ yield remote_server
@pytest.fixture(scope="module")
@@ -69,9 +69,12 @@ def server_generate():
@pytest_asyncio.fixture
-async def client_embedding(server_embedding: RemoteOpenAIServer):
- async with server_embedding.get_async_client() as async_client:
- yield async_client
+async def client_embedding(monkeypatch: pytest.MonkeyPatch,
+ server_embedding: RemoteOpenAIServer):
+ with monkeypatch.context() as m:
+ m.setenv("VLLM_ATTENTION_BACKEND", "XFORMERS")
+ async with server_embedding.get_async_client() as async_client:
+ yield async_client
@pytest_asyncio.fixture
@@ -80,14 +83,20 @@ async def client_generate(server_generate: RemoteOpenAIServer):
yield async_client
-def run_llm_encode(llm: vllm.LLM, queries: list[str],
- instruction: str) -> list[float]:
+def run_llm_encode(
+ llm: vllm.LLM,
+ queries: list[str],
+ instruction: str,
+) -> list[float]:
outputs = llm.encode([instruction + q for q in queries], )
return [output.outputs.embedding for output in outputs]
-async def run_client_embeddings(client: vllm.LLM, queries: list[str],
- instruction: str) -> list[float]:
+async def run_client_embeddings(
+ client: vllm.LLM,
+ queries: list[str],
+ instruction: str,
+) -> list[float]:
outputs = await client.embeddings.create(
model=MODEL_NAME,
input=[instruction + q for q in queries],
@@ -106,7 +115,7 @@ def get_test_data():
README.md in https://github.com/ContextualAI/gritlm
"""
q_instruction = gritlm_instruction(
- "Given a scientific paper title, retrieve the paper's abstract")
+ "Given a scientific paper title, retrieve the paper's abstract", )
queries = [
"Bitcoin: A Peer-to-Peer Electronic Cash System",
"Generative Representational Instruction Tuning",
@@ -136,31 +145,32 @@ def validate_embed_output(q_rep: list[float], d_rep: list[float]):
assert math.isclose(cosine_sim_q1_d1, 0.532, abs_tol=0.001)
-def test_gritlm_offline_embedding(monkeypatch):
+def test_gritlm_offline_embedding(monkeypatch: pytest.MonkeyPatch):
# GritLM embedding implementation is only supported by XFormers backend.
- monkeypatch.setenv("VLLM_ATTENTION_BACKEND", "XFORMERS")
+ with monkeypatch.context() as m:
+ m.setenv(STR_BACKEND_ENV_VAR, "XFORMERS")
- queries, q_instruction, documents, d_instruction = get_test_data()
+ queries, q_instruction, documents, d_instruction = get_test_data()
- llm = vllm.LLM(MODEL_NAME, task="embed", max_model_len=MAX_MODEL_LEN)
+ llm = vllm.LLM(MODEL_NAME, task="embed", max_model_len=MAX_MODEL_LEN)
- d_rep = run_llm_encode(
- llm,
- documents,
- d_instruction,
- )
- q_rep = run_llm_encode(
- llm,
- queries,
- q_instruction,
- )
+ d_rep = run_llm_encode(
+ llm,
+ documents,
+ d_instruction,
+ )
+ q_rep = run_llm_encode(
+ llm,
+ queries,
+ q_instruction,
+ )
- validate_embed_output(q_rep, d_rep)
+ validate_embed_output(q_rep, d_rep)
@pytest.mark.asyncio
async def test_gritlm_api_server_embedding(
- client_embedding: openai.AsyncOpenAI):
+ client_embedding: openai.AsyncOpenAI, ):
queries, q_instruction, documents, d_instruction = get_test_data()
d_rep = await run_client_embeddings(
diff --git a/tests/models/test_oot_registration.py b/tests/models/test_oot_registration.py
index d3d07d0d9acfc..465c496f4c0f3 100644
--- a/tests/models/test_oot_registration.py
+++ b/tests/models/test_oot_registration.py
@@ -1,7 +1,5 @@
# SPDX-License-Identifier: Apache-2.0
-import os
-
import pytest
from vllm import LLM, SamplingParams
@@ -11,76 +9,92 @@ from ..utils import fork_new_process_for_each_test
@fork_new_process_for_each_test
-def test_plugin(dummy_opt_path, monkeypatch):
+def test_plugin(
+ monkeypatch: pytest.MonkeyPatch,
+ dummy_opt_path: str,
+):
# V1 shuts down rather than raising an error here.
- monkeypatch.setenv("VLLM_USE_V1", "0")
- os.environ["VLLM_PLUGINS"] = ""
- with pytest.raises(Exception) as excinfo:
- LLM(model=dummy_opt_path, load_format="dummy")
- error_msg = "has no vLLM implementation and " \
- "the Transformers implementation is not compatible with vLLM"
- assert (error_msg in str(excinfo.value))
+ with monkeypatch.context() as m:
+ m.setenv("VLLM_USE_V1", "0")
+ m.setenv("VLLM_PLUGINS", "")
+
+ with pytest.raises(Exception) as excinfo:
+ LLM(model=dummy_opt_path, load_format="dummy")
+ error_msg = "has no vLLM implementation and the Transformers implementation is not compatible with vLLM" # noqa: E501
+ assert (error_msg in str(excinfo.value))
@fork_new_process_for_each_test
-def test_oot_registration_text_generation(dummy_opt_path):
- os.environ["VLLM_PLUGINS"] = "register_dummy_model"
- prompts = ["Hello, my name is", "The text does not matter"]
- sampling_params = SamplingParams(temperature=0)
- llm = LLM(model=dummy_opt_path, load_format="dummy")
- first_token = llm.get_tokenizer().decode(0)
- outputs = llm.generate(prompts, sampling_params)
+def test_oot_registration_text_generation(
+ monkeypatch: pytest.MonkeyPatch,
+ dummy_opt_path: str,
+):
+ with monkeypatch.context() as m:
+ m.setenv("VLLM_PLUGINS", "register_dummy_model")
+ prompts = ["Hello, my name is", "The text does not matter"]
+ sampling_params = SamplingParams(temperature=0)
+ llm = LLM(model=dummy_opt_path, load_format="dummy")
+ first_token = llm.get_tokenizer().decode(0)
+ outputs = llm.generate(prompts, sampling_params)
- for output in outputs:
- generated_text = output.outputs[0].text
- # make sure only the first token is generated
- rest = generated_text.replace(first_token, "")
- assert rest == ""
+ for output in outputs:
+ generated_text = output.outputs[0].text
+ # make sure only the first token is generated
+ rest = generated_text.replace(first_token, "")
+ assert rest == ""
@fork_new_process_for_each_test
-def test_oot_registration_embedding(dummy_gemma2_embedding_path):
- os.environ["VLLM_PLUGINS"] = "register_dummy_model"
- prompts = ["Hello, my name is", "The text does not matter"]
- llm = LLM(model=dummy_gemma2_embedding_path, load_format="dummy")
- outputs = llm.embed(prompts)
+def test_oot_registration_embedding(
+ monkeypatch: pytest.MonkeyPatch,
+ dummy_gemma2_embedding_path: str,
+):
+ with monkeypatch.context() as m:
+ m.setenv("VLLM_PLUGINS", "register_dummy_model")
+ prompts = ["Hello, my name is", "The text does not matter"]
+ llm = LLM(model=dummy_gemma2_embedding_path, load_format="dummy")
+ outputs = llm.embed(prompts)
- for output in outputs:
- assert all(v == 0 for v in output.outputs.embedding)
+ for output in outputs:
+ assert all(v == 0 for v in output.outputs.embedding)
image = ImageAsset("cherry_blossom").pil_image.convert("RGB")
@fork_new_process_for_each_test
-def test_oot_registration_multimodal(dummy_llava_path, monkeypatch):
- os.environ["VLLM_PLUGINS"] = "register_dummy_model"
- prompts = [{
- "prompt": "What's in the image?",
- "multi_modal_data": {
- "image": image
- },
- }, {
- "prompt": "Describe the image",
- "multi_modal_data": {
- "image": image
- },
- }]
+def test_oot_registration_multimodal(
+ monkeypatch: pytest.MonkeyPatch,
+ dummy_llava_path: str,
+):
+ with monkeypatch.context() as m:
+ m.setenv("VLLM_PLUGINS", "register_dummy_model")
+ prompts = [{
+ "prompt": "What's in the image?",
+ "multi_modal_data": {
+ "image": image
+ },
+ }, {
+ "prompt": "Describe the image",
+ "multi_modal_data": {
+ "image": image
+ },
+ }]
- sampling_params = SamplingParams(temperature=0)
- llm = LLM(model=dummy_llava_path,
- load_format="dummy",
- max_num_seqs=1,
- trust_remote_code=True,
- gpu_memory_utilization=0.98,
- max_model_len=4096,
- enforce_eager=True,
- limit_mm_per_prompt={"image": 1})
- first_token = llm.get_tokenizer().decode(0)
- outputs = llm.generate(prompts, sampling_params)
+ sampling_params = SamplingParams(temperature=0)
+ llm = LLM(model=dummy_llava_path,
+ load_format="dummy",
+ max_num_seqs=1,
+ trust_remote_code=True,
+ gpu_memory_utilization=0.98,
+ max_model_len=4096,
+ enforce_eager=True,
+ limit_mm_per_prompt={"image": 1})
+ first_token = llm.get_tokenizer().decode(0)
+ outputs = llm.generate(prompts, sampling_params)
- for output in outputs:
- generated_text = output.outputs[0].text
- # make sure only the first token is generated
- rest = generated_text.replace(first_token, "")
- assert rest == ""
+ for output in outputs:
+ generated_text = output.outputs[0].text
+ # make sure only the first token is generated
+ rest = generated_text.replace(first_token, "")
+ assert rest == ""
diff --git a/tests/mq_llm_engine/test_error_handling.py b/tests/mq_llm_engine/test_error_handling.py
index aad7fc5303c13..e617bd057f1f4 100644
--- a/tests/mq_llm_engine/test_error_handling.py
+++ b/tests/mq_llm_engine/test_error_handling.py
@@ -235,25 +235,28 @@ async def test_bad_request(tmp_socket):
@pytest.mark.asyncio
-async def test_mp_crash_detection(monkeypatch):
+async def test_mp_crash_detection(monkeypatch: pytest.MonkeyPatch):
+ with monkeypatch.context() as m:
- parser = FlexibleArgumentParser(description="vLLM's remote OpenAI server.")
- parser = make_arg_parser(parser)
- args = parser.parse_args([])
+ parser = FlexibleArgumentParser(
+ description="vLLM's remote OpenAI server.")
+ parser = make_arg_parser(parser)
+ args = parser.parse_args([])
- # When LLMEngine is loaded, it will crash.
- def mock_init():
- raise ValueError
+ # When LLMEngine is loaded, it will crash.
+ def mock_init():
+ raise ValueError
- monkeypatch.setattr(LLMEngine, "__init__", mock_init)
+ m.setattr(LLMEngine, "__init__", mock_init)
- start = time.perf_counter()
- async with build_async_engine_client(args):
- pass
- end = time.perf_counter()
+ start = time.perf_counter()
+ async with build_async_engine_client(args):
+ pass
+ end = time.perf_counter()
- assert end - start < 60, ("Expected vLLM to gracefully shutdown in <60s "
- "if there is an error in the startup.")
+ assert end - start < 60, (
+ "Expected vLLM to gracefully shutdown in <60s "
+ "if there is an error in the startup.")
@pytest.mark.asyncio
diff --git a/tests/multi_step/test_correctness_async_llm.py b/tests/multi_step/test_correctness_async_llm.py
index f925e42f46d37..ce716e6474cb4 100644
--- a/tests/multi_step/test_correctness_async_llm.py
+++ b/tests/multi_step/test_correctness_async_llm.py
@@ -5,7 +5,7 @@ from typing import Optional
import pytest
-from tests.kernels.utils import override_backend_env_variable
+from vllm.utils import STR_BACKEND_ENV_VAR
from ..models.utils import check_logprobs_close
from ..utils import (completions_with_server_args, get_client_text_generations,
@@ -52,7 +52,7 @@ async def test_multi_step(
num_logprobs: Optional[int],
attention_backend: str,
enable_chunked_prefill: bool,
- monkeypatch,
+ monkeypatch: pytest.MonkeyPatch,
) -> None:
"""Test vLLM engine with multi-step scheduling in an OpenAI-protocol
client/server environment.
@@ -82,67 +82,70 @@ async def test_multi_step(
pytest.skip("Multi-step with Chunked-Prefill only supports"
"PP=1 and FLASH_ATTN backend")
- override_backend_env_variable(monkeypatch, attention_backend)
+ with monkeypatch.context() as m:
+ m.setenv(STR_BACKEND_ENV_VAR, attention_backend)
- prompts = example_prompts
- if len(prompts) < num_prompts:
- prompts = prompts * ((num_prompts // len(prompts)) + 1)
- prompts = prompts[:num_prompts]
- assert len(prompts) == num_prompts
+ prompts = example_prompts
+ if len(prompts) < num_prompts:
+ prompts = prompts * ((num_prompts // len(prompts)) + 1)
+ prompts = prompts[:num_prompts]
+ assert len(prompts) == num_prompts
- server_args = DEFAULT_SERVER_ARGS + ["--enforce-eager"]
- ms_server_args = DEFAULT_SERVER_ARGS + \
- ["--num-scheduler-steps", f"{num_scheduler_steps}"]
+ server_args = DEFAULT_SERVER_ARGS + ["--enforce-eager"]
+ ms_server_args = DEFAULT_SERVER_ARGS + \
+ ["--num-scheduler-steps", f"{num_scheduler_steps}"]
- if not is_async:
- ms_server_args += ["--disable-async-output-proc"]
+ if not is_async:
+ ms_server_args += ["--disable-async-output-proc"]
- if eager_mode:
- ms_server_args.append("--enforce-eager")
+ if eager_mode:
+ ms_server_args.append("--enforce-eager")
- if enable_chunked_prefill:
- ms_server_args.append("--enable-chunked-prefill")
+ if enable_chunked_prefill:
+ ms_server_args.append("--enable-chunked-prefill")
- distributed_args = [
- "--tensor-parallel-size",
- str(tp_size),
- "--pipeline-parallel-size",
- str(pp_size),
- ]
+ distributed_args = [
+ "--tensor-parallel-size",
+ str(tp_size),
+ "--pipeline-parallel-size",
+ str(pp_size),
+ ]
- # Spin up client/server & issue completion API requests.
- # Default `max_wait_seconds` is 240 but was empirically
- # was raised 5x to 1200 *just for this test* due to
- # observed timeouts in GHA CI
- ref_completions = await completions_with_server_args(
- prompts,
- model,
- server_args + distributed_args,
- num_logprobs,
- max_wait_seconds=5 * 240)
- test_completions = await completions_with_server_args(
- prompts,
- model,
- ms_server_args + distributed_args,
- num_logprobs,
- max_wait_seconds=5 * 240)
+ # Spin up client/server & issue completion API requests.
+ # Default `max_wait_seconds` is 240 but was empirically
+ # was raised 5x to 1200 *just for this test* due to
+ # observed timeouts in GHA CI
+ ref_completions = await completions_with_server_args(
+ prompts,
+ model,
+ server_args + distributed_args,
+ num_logprobs,
+ max_wait_seconds=5 * 240)
+ test_completions = await completions_with_server_args(
+ prompts,
+ model,
+ ms_server_args + distributed_args,
+ num_logprobs,
+ max_wait_seconds=5 * 240)
- # Assert multi-step scheduling produces identical tokens
- # to single-step scheduling.
- ref_generations = get_client_text_generations(ref_completions)
- test_generations = get_client_text_generations(test_completions)
- assert ref_generations == test_generations
+ # Assert multi-step scheduling produces identical tokens
+ # to single-step scheduling.
+ ref_generations = get_client_text_generations(ref_completions)
+ test_generations = get_client_text_generations(test_completions)
+ assert ref_generations == test_generations
- # Assert multi-step scheduling produces nearly-identical logprobs
- # to single-step scheduling.
- ref_text_logprobs = get_client_text_logprob_generations(ref_completions)
- test_text_logprobs = get_client_text_logprob_generations(test_completions)
- check_logprobs_close(
- outputs_0_lst=ref_text_logprobs,
- outputs_1_lst=test_text_logprobs,
- name_0="hf",
- name_1="vllm",
- )
+ # Assert multi-step scheduling produces nearly-identical logprobs
+ # to single-step scheduling.
+ ref_text_logprobs = get_client_text_logprob_generations(
+ ref_completions)
+ test_text_logprobs = get_client_text_logprob_generations(
+ test_completions)
+ check_logprobs_close(
+ outputs_0_lst=ref_text_logprobs,
+ outputs_1_lst=test_text_logprobs,
+ name_0="hf",
+ name_1="vllm",
+ )
@pytest.mark.parametrize(("tp_size, pp_size"), [
@@ -152,7 +155,7 @@ async def test_multi_step(
async def test_multi_step_pp_smoke(
tp_size: int,
pp_size: int,
- monkeypatch,
+ monkeypatch: pytest.MonkeyPatch,
) -> None:
"""
Smoke test for the vLLM engine with multi-step scheduling in an
@@ -174,54 +177,55 @@ async def test_multi_step_pp_smoke(
attention_backend = "FLASH_ATTN"
max_num_seqs = 3
- override_backend_env_variable(monkeypatch, attention_backend)
+ with monkeypatch.context() as m:
+ m.setenv(STR_BACKEND_ENV_VAR, attention_backend)
- # Prompt from the ShareGPT dataset
- prompts = [
- "in the jtbd context whats a push?", # codespell:ignore
- "in the jtbd context whats a push?", # codespell:ignore
- "in the jtbd context whats a push?", # codespell:ignore
- "in the jtbd context whats a push?", # codespell:ignore
- ]
- # Use varying max_tokens to introduce scheduling randomness.
- max_tokens = [10 * i for i in range(1, len(prompts) + 1)]
- assert len(prompts) == len(max_tokens)
+ # Prompt from the ShareGPT dataset
+ prompts = [
+ "in the jtbd context whats a push?", # codespell:ignore
+ "in the jtbd context whats a push?", # codespell:ignore
+ "in the jtbd context whats a push?", # codespell:ignore
+ "in the jtbd context whats a push?", # codespell:ignore
+ ]
+ # Use varying max_tokens to introduce scheduling randomness.
+ max_tokens = [10 * i for i in range(1, len(prompts) + 1)]
+ assert len(prompts) == len(max_tokens)
- test_args = [
- "--tensor-parallel-size",
- str(tp_size), "--pipeline-parallel-size",
- str(pp_size), "--max-num-seqs",
- str(max_num_seqs)
- ]
+ test_args = [
+ "--tensor-parallel-size",
+ str(tp_size), "--pipeline-parallel-size",
+ str(pp_size), "--max-num-seqs",
+ str(max_num_seqs)
+ ]
- server_args = DEFAULT_SERVER_ARGS + test_args
- ms_server_args = DEFAULT_SERVER_ARGS + \
- ["--num-scheduler-steps", f"{num_scheduler_steps}"] + \
- test_args
+ server_args = DEFAULT_SERVER_ARGS + test_args
+ ms_server_args = DEFAULT_SERVER_ARGS + \
+ ["--num-scheduler-steps", f"{num_scheduler_steps}"] + \
+ test_args
- # Spin up client/server & issue completion API requests.
- # Default `max_wait_seconds` is 240 but was empirically
- # was raised 3x to 720 *just for this test* due to
- # observed timeouts in GHA CI
- ref_completions = await completions_with_server_args(
- prompts=prompts,
- model_name=model,
- server_cli_args=server_args,
- num_logprobs=None,
- max_wait_seconds=5 * 240,
- max_tokens=max_tokens)
+ # Spin up client/server & issue completion API requests.
+ # Default `max_wait_seconds` is 240 but was empirically
+ # was raised 3x to 720 *just for this test* due to
+ # observed timeouts in GHA CI
+ ref_completions = await completions_with_server_args(
+ prompts=prompts,
+ model_name=model,
+ server_cli_args=server_args,
+ num_logprobs=None,
+ max_wait_seconds=5 * 240,
+ max_tokens=max_tokens)
- test_completions = await completions_with_server_args(
- prompts=prompts,
- model_name=model,
- server_cli_args=ms_server_args,
- num_logprobs=None,
- max_wait_seconds=5 * 240,
- max_tokens=max_tokens)
+ test_completions = await completions_with_server_args(
+ prompts=prompts,
+ model_name=model,
+ server_cli_args=ms_server_args,
+ num_logprobs=None,
+ max_wait_seconds=5 * 240,
+ max_tokens=max_tokens)
- # Assert multi-step scheduling produces identical tokens
- # to single-step scheduling.
- ref_generations = get_client_text_generations(ref_completions)
- test_generations = get_client_text_generations(test_completions)
+ # Assert multi-step scheduling produces identical tokens
+ # to single-step scheduling.
+ ref_generations = get_client_text_generations(ref_completions)
+ test_generations = get_client_text_generations(test_completions)
- assert ref_generations == test_generations
+ assert ref_generations == test_generations
diff --git a/tests/multi_step/test_correctness_llm.py b/tests/multi_step/test_correctness_llm.py
index 29d5ffd4c9cb1..a823e484beab6 100644
--- a/tests/multi_step/test_correctness_llm.py
+++ b/tests/multi_step/test_correctness_llm.py
@@ -7,7 +7,7 @@ from typing import Optional
import pytest
-from tests.kernels.utils import override_backend_env_variable
+from vllm.utils import STR_BACKEND_ENV_VAR
from ..models.utils import check_logprobs_close, check_outputs_equal
@@ -42,7 +42,7 @@ def test_multi_step_llm(
num_prompts: int,
num_logprobs: Optional[int],
attention_backend: str,
- monkeypatch,
+ monkeypatch: pytest.MonkeyPatch,
) -> None:
"""Test vLLM engine with multi-step scheduling via sync LLM Engine.
@@ -70,48 +70,49 @@ def test_multi_step_llm(
num_logprobs: corresponds to the `logprobs` argument to the OpenAI
completions endpoint; `None` -> 1 logprob returned.
"""
- override_backend_env_variable(monkeypatch, attention_backend)
+ with monkeypatch.context() as m:
+ m.setenv(STR_BACKEND_ENV_VAR, attention_backend)
- prompts = example_prompts
- if len(prompts) < num_prompts:
- prompts = prompts * ((num_prompts // len(prompts)) + 1)
- prompts = prompts[:num_prompts]
- assert len(prompts) == num_prompts
+ prompts = example_prompts
+ if len(prompts) < num_prompts:
+ prompts = prompts * ((num_prompts // len(prompts)) + 1)
+ prompts = prompts[:num_prompts]
+ assert len(prompts) == num_prompts
- with vllm_runner(
- model,
- dtype=dtype,
- enforce_eager=enforce_eager,
- gpu_memory_utilization=0.7,
- tensor_parallel_size=tp_size,
- enable_chunked_prefill=enable_chunked_prefill,
- num_scheduler_steps=num_scheduler_steps,
- ) as vllm_model:
- vllm_outputs = (vllm_model.generate_greedy(prompts, max_tokens)
- if num_logprobs is None else
- vllm_model.generate_greedy_logprobs(
- prompts, max_tokens, num_logprobs))
+ with vllm_runner(
+ model,
+ dtype=dtype,
+ enforce_eager=enforce_eager,
+ gpu_memory_utilization=0.7,
+ tensor_parallel_size=tp_size,
+ enable_chunked_prefill=enable_chunked_prefill,
+ num_scheduler_steps=num_scheduler_steps,
+ ) as vllm_model:
+ vllm_outputs = (vllm_model.generate_greedy(prompts, max_tokens)
+ if num_logprobs is None else
+ vllm_model.generate_greedy_logprobs(
+ prompts, max_tokens, num_logprobs))
- with hf_runner(model, dtype=dtype) as hf_model:
- hf_outputs = (hf_model.generate_greedy(prompts, max_tokens)
- if num_logprobs is None else
- hf_model.generate_greedy_logprobs_limit(
- prompts, max_tokens, num_logprobs))
+ with hf_runner(model, dtype=dtype) as hf_model:
+ hf_outputs = (hf_model.generate_greedy(prompts, max_tokens)
+ if num_logprobs is None else
+ hf_model.generate_greedy_logprobs_limit(
+ prompts, max_tokens, num_logprobs))
- if num_logprobs is None:
- check_outputs_equal(
- outputs_0_lst=hf_outputs,
- outputs_1_lst=vllm_outputs,
- name_0="hf",
- name_1="vllm",
- )
- else:
- check_logprobs_close(
- outputs_0_lst=hf_outputs,
- outputs_1_lst=vllm_outputs,
- name_0="hf",
- name_1="vllm",
- )
+ if num_logprobs is None:
+ check_outputs_equal(
+ outputs_0_lst=hf_outputs,
+ outputs_1_lst=vllm_outputs,
+ name_0="hf",
+ name_1="vllm",
+ )
+ else:
+ check_logprobs_close(
+ outputs_0_lst=hf_outputs,
+ outputs_1_lst=vllm_outputs,
+ name_0="hf",
+ name_1="vllm",
+ )
@pytest.mark.parametrize("model", MODELS)
@@ -136,7 +137,7 @@ def test_multi_step_llm_w_prompt_logprobs(
num_logprobs: Optional[int],
num_prompt_logprobs: Optional[int],
attention_backend: str,
- monkeypatch,
+ monkeypatch: pytest.MonkeyPatch,
) -> None:
"""Test prompt logprobs with multi-step scheduling via sync LLM Engine.
@@ -166,47 +167,48 @@ def test_multi_step_llm_w_prompt_logprobs(
note that this argument is not supported by the
OpenAI completions endpoint.
"""
- override_backend_env_variable(monkeypatch, attention_backend)
+ with monkeypatch.context() as m:
+ m.setenv(STR_BACKEND_ENV_VAR, attention_backend)
- prompts = example_prompts
- if len(prompts) < num_prompts:
- prompts = prompts * ((num_prompts // len(prompts)) + 1)
- prompts = prompts[:num_prompts]
- assert len(prompts) == num_prompts
+ prompts = example_prompts
+ if len(prompts) < num_prompts:
+ prompts = prompts * ((num_prompts // len(prompts)) + 1)
+ prompts = prompts[:num_prompts]
+ assert len(prompts) == num_prompts
- with vllm_runner(
- model,
- dtype=dtype,
- enforce_eager=enforce_eager,
- gpu_memory_utilization=0.7,
- tensor_parallel_size=tp_size,
- num_scheduler_steps=num_scheduler_steps,
- ) as vllm_model:
- vllm_outputs = vllm_model.generate_greedy_logprobs(
- prompts,
- max_tokens,
- num_logprobs,
- num_prompt_logprobs=num_prompt_logprobs)
+ with vllm_runner(
+ model,
+ dtype=dtype,
+ enforce_eager=enforce_eager,
+ gpu_memory_utilization=0.7,
+ tensor_parallel_size=tp_size,
+ num_scheduler_steps=num_scheduler_steps,
+ ) as vllm_model:
+ vllm_outputs = vllm_model.generate_greedy_logprobs(
+ prompts,
+ max_tokens,
+ num_logprobs,
+ num_prompt_logprobs=num_prompt_logprobs)
- with vllm_runner(
- model,
- dtype=dtype,
- enforce_eager=enforce_eager,
- gpu_memory_utilization=0.7,
- tensor_parallel_size=tp_size,
- ) as vllm_model:
- single_step_vllm_outputs = vllm_model.generate_greedy_logprobs(
- prompts,
- max_tokens,
- num_logprobs,
- num_prompt_logprobs=num_prompt_logprobs)
+ with vllm_runner(
+ model,
+ dtype=dtype,
+ enforce_eager=enforce_eager,
+ gpu_memory_utilization=0.7,
+ tensor_parallel_size=tp_size,
+ ) as vllm_model:
+ single_step_vllm_outputs = vllm_model.generate_greedy_logprobs(
+ prompts,
+ max_tokens,
+ num_logprobs,
+ num_prompt_logprobs=num_prompt_logprobs)
- check_logprobs_close(
- outputs_0_lst=single_step_vllm_outputs,
- outputs_1_lst=vllm_outputs,
- name_0="hf",
- name_1="vllm",
- )
+ check_logprobs_close(
+ outputs_0_lst=single_step_vllm_outputs,
+ outputs_1_lst=vllm_outputs,
+ name_0="hf",
+ name_1="vllm",
+ )
@pytest.mark.parametrize("model", MODELS)
@@ -230,7 +232,7 @@ def test_multi_step_llm_chunked_prefill_prefix_cache(
num_prompts: int,
num_logprobs: Optional[int],
attention_backend: str,
- monkeypatch,
+ monkeypatch: pytest.MonkeyPatch,
) -> None:
"""Test vLLM engine with multi-step+"single-step chunked prefill"+APC.
@@ -293,77 +295,78 @@ def test_multi_step_llm_chunked_prefill_prefix_cache(
#
# The Incorrect scheduling behavior - if it occurs - will cause an exception
# in the model runner resulting from `do_sample=False`.
- override_backend_env_variable(monkeypatch, attention_backend)
+ with monkeypatch.context() as m:
+ m.setenv(STR_BACKEND_ENV_VAR, attention_backend)
- assert len(example_prompts) >= 2
- challenge_prompts = copy.deepcopy(example_prompts)
- challenge_prompts[0] = ('vLLM is a high-throughput and memory-efficient '
- 'inference and serving engine for LLMs.\n'
- ) # 24 tok
- challenge_prompts[1] = (
- 'Briefly describe the major milestones in the '
- 'development of artificial intelligence from 1950 to 2020.\n'
- ) # 30 tok
+ assert len(example_prompts) >= 2
+ challenge_prompts = copy.deepcopy(example_prompts)
+ challenge_prompts[0] = (
+ 'vLLM is a high-throughput and memory-efficient '
+ 'inference and serving engine for LLMs.\n') # 24 tok
+ challenge_prompts[1] = (
+ 'Briefly describe the major milestones in the '
+ 'development of artificial intelligence from 1950 to 2020.\n'
+ ) # 30 tok
- # If necessary, adjust the length of `challenge_prompts` to match
- # `num_prompts`
- if len(challenge_prompts) < num_prompts:
- challenge_prompts = (challenge_prompts *
- ((num_prompts // len(challenge_prompts)) + 1))
- challenge_prompts = challenge_prompts[:num_prompts]
- assert len(challenge_prompts) == num_prompts
+ # If necessary, adjust the length of `challenge_prompts` to match
+ # `num_prompts`
+ if len(challenge_prompts) < num_prompts:
+ challenge_prompts = (challenge_prompts *
+ ((num_prompts // len(challenge_prompts)) + 1))
+ challenge_prompts = challenge_prompts[:num_prompts]
+ assert len(challenge_prompts) == num_prompts
- # Single-step scheduler baseline
- with vllm_runner(
- model,
- dtype=dtype,
- enforce_eager=enforce_eager,
- gpu_memory_utilization=0.7,
- tensor_parallel_size=tp_size,
- num_scheduler_steps=num_scheduler_steps,
- max_model_len=48,
- max_num_batched_tokens=48,
- max_num_seqs=4,
- block_size=16,
- ) as vllm_model:
- outputs_baseline = (vllm_model.generate_greedy(
- challenge_prompts, max_tokens) if num_logprobs is None else
- vllm_model.generate_greedy_logprobs(
- challenge_prompts, max_tokens, num_logprobs))
+ # Single-step scheduler baseline
+ with vllm_runner(
+ model,
+ dtype=dtype,
+ enforce_eager=enforce_eager,
+ gpu_memory_utilization=0.7,
+ tensor_parallel_size=tp_size,
+ num_scheduler_steps=num_scheduler_steps,
+ max_model_len=48,
+ max_num_batched_tokens=48,
+ max_num_seqs=4,
+ block_size=16,
+ ) as vllm_model:
+ outputs_baseline = (
+ vllm_model.generate_greedy(challenge_prompts, max_tokens) if
+ num_logprobs is None else vllm_model.generate_greedy_logprobs(
+ challenge_prompts, max_tokens, num_logprobs))
- # multi-step+"single-step chunked prefill"+APC
- with vllm_runner(
- model,
- dtype=dtype,
- enforce_eager=enforce_eager,
- gpu_memory_utilization=0.7,
- tensor_parallel_size=tp_size,
- enable_chunked_prefill=True,
- enable_prefix_caching=True,
- num_scheduler_steps=num_scheduler_steps,
- max_model_len=48,
- max_num_batched_tokens=48,
- max_num_seqs=4,
- block_size=16,
- ) as vllm_model:
- outputs_w_features = (vllm_model.generate_greedy(
- challenge_prompts, max_tokens) if num_logprobs is None else
- vllm_model.generate_greedy_logprobs(
- challenge_prompts, max_tokens, num_logprobs))
+ # multi-step+"single-step chunked prefill"+APC
+ with vllm_runner(
+ model,
+ dtype=dtype,
+ enforce_eager=enforce_eager,
+ gpu_memory_utilization=0.7,
+ tensor_parallel_size=tp_size,
+ enable_chunked_prefill=True,
+ enable_prefix_caching=True,
+ num_scheduler_steps=num_scheduler_steps,
+ max_model_len=48,
+ max_num_batched_tokens=48,
+ max_num_seqs=4,
+ block_size=16,
+ ) as vllm_model:
+ outputs_w_features = (
+ vllm_model.generate_greedy(challenge_prompts, max_tokens) if
+ num_logprobs is None else vllm_model.generate_greedy_logprobs(
+ challenge_prompts, max_tokens, num_logprobs))
- if num_logprobs is None:
- # No-logprobs test
- check_outputs_equal(
- outputs_0_lst=outputs_baseline,
- outputs_1_lst=outputs_w_features,
- name_0="multi-step",
- name_1="multi-step+features",
- )
- else:
- # Yes-logprobs test
- check_logprobs_close(
- outputs_0_lst=outputs_baseline,
- outputs_1_lst=outputs_w_features,
- name_0="multi-step",
- name_1="multi-step+features",
- )
+ if num_logprobs is None:
+ # No-logprobs test
+ check_outputs_equal(
+ outputs_0_lst=outputs_baseline,
+ outputs_1_lst=outputs_w_features,
+ name_0="multi-step",
+ name_1="multi-step+features",
+ )
+ else:
+ # Yes-logprobs test
+ check_logprobs_close(
+ outputs_0_lst=outputs_baseline,
+ outputs_1_lst=outputs_w_features,
+ name_0="multi-step",
+ name_1="multi-step+features",
+ )
diff --git a/tests/neuron/1_core/test_block_table.py b/tests/neuron/1_core/test_block_table.py
index 30dcdd573edf3..033a36b4156b0 100644
--- a/tests/neuron/1_core/test_block_table.py
+++ b/tests/neuron/1_core/test_block_table.py
@@ -1,5 +1,4 @@
# SPDX-License-Identifier: Apache-2.0
-import os
import neuronxcc.nki.language as nl
import pytest
@@ -99,6 +98,7 @@ def ref_block_tables_transform(
)
@torch.inference_mode()
def test_load_and_transform_block_tables(
+ monkeypatch: pytest.MonkeyPatch,
num_tiles,
num_blocks_per_tile,
q_head_per_kv_head,
@@ -108,46 +108,46 @@ def test_load_and_transform_block_tables(
device = xm.xla_device()
- compiler_flags = [
+ compiler_flags_str = " ".join([
"-O1",
"--retry_failed_compilation",
- ]
- compiler_flags_str = " ".join(compiler_flags)
- os.environ["NEURON_CC_FLAGS"] = compiler_flags_str
+ ])
+ with monkeypatch.context() as m:
+ m.setenv("NEURON_CC_FLAGS", compiler_flags_str)
- torch.manual_seed(10000)
- torch.set_printoptions(sci_mode=False)
+ torch.manual_seed(10000)
+ torch.set_printoptions(sci_mode=False)
- # On Neuron, we need B_P_SIZE = 128 blocks to make DMA efficient
- B_P_SIZE = 128
- if num_blocks_per_tile < B_P_SIZE:
- assert B_P_SIZE % num_blocks_per_tile == 0
- block_size_tiling_factor = B_P_SIZE // num_blocks_per_tile
- else:
- block_size_tiling_factor = 1
- max_num_blocks = 100000
- block_tables = torch.randint(
- 0,
- max_num_blocks,
- (num_tiles * num_blocks_per_tile, ),
- dtype=torch.int32,
- )
- nki_out = nki.jit(nki_load_and_transform_block_tables)[1, 1](
- block_tables.to(device=device),
- num_tiles,
- num_blocks_per_tile,
- q_head_per_kv_head,
- head_id,
- block_size_tiling_factor,
- ).cpu()
- ref_out = ref_block_tables_transform(
- block_tables,
- num_tiles,
- num_blocks_per_tile,
- q_head_per_kv_head,
- head_id,
- block_size_tiling_factor,
- )
- assert (nki_out.shape == ref_out.shape
- ), f"{nki_out.shape=} != {ref_out.shape=}"
- assert torch.all(nki_out == ref_out)
+ # On Neuron, we need B_P_SIZE = 128 blocks to make DMA efficient
+ B_P_SIZE = 128
+ if num_blocks_per_tile < B_P_SIZE:
+ assert B_P_SIZE % num_blocks_per_tile == 0
+ block_size_tiling_factor = B_P_SIZE // num_blocks_per_tile
+ else:
+ block_size_tiling_factor = 1
+ max_num_blocks = 100000
+ block_tables = torch.randint(
+ 0,
+ max_num_blocks,
+ (num_tiles * num_blocks_per_tile, ),
+ dtype=torch.int32,
+ )
+ nki_out = nki.jit(nki_load_and_transform_block_tables)[1, 1](
+ block_tables.to(device=device),
+ num_tiles,
+ num_blocks_per_tile,
+ q_head_per_kv_head,
+ head_id,
+ block_size_tiling_factor,
+ ).cpu()
+ ref_out = ref_block_tables_transform(
+ block_tables,
+ num_tiles,
+ num_blocks_per_tile,
+ q_head_per_kv_head,
+ head_id,
+ block_size_tiling_factor,
+ )
+ assert (nki_out.shape == ref_out.shape
+ ), f"{nki_out.shape=} != {ref_out.shape=}"
+ assert torch.all(nki_out == ref_out)
diff --git a/tests/neuron/1_core/test_prefix_prefill.py b/tests/neuron/1_core/test_prefix_prefill.py
index 326a1f82e9b30..37d6679f8d55b 100644
--- a/tests/neuron/1_core/test_prefix_prefill.py
+++ b/tests/neuron/1_core/test_prefix_prefill.py
@@ -320,6 +320,7 @@ def get_active_block_tables(block_tables, query_lens, seq_lens, block_size,
])
@torch.inference_mode()
def test_contexted_kv_attention(
+ monkeypatch: pytest.MonkeyPatch,
prefill_batch_size: int,
decode_batch_size: int,
num_heads: int,
@@ -329,7 +330,6 @@ def test_contexted_kv_attention(
large_tile_size,
mixed_precision: bool,
) -> None:
- import os
import torch_xla.core.xla_model as xm
@@ -340,174 +340,178 @@ def test_contexted_kv_attention(
device = xm.xla_device()
- compiler_flags = [
+ compiler_flags_str = " ".join([
"-O1",
"--retry_failed_compilation",
- ]
- compiler_flags_str = " ".join(compiler_flags)
- os.environ["NEURON_CC_FLAGS"] = compiler_flags_str
+ ])
+ with monkeypatch.context() as m:
+ m.setenv("NEURON_CC_FLAGS", compiler_flags_str)
- torch.manual_seed(0)
- torch.set_printoptions(sci_mode=False)
- torch.set_default_device("cpu")
- dtype = torch.float32
+ torch.manual_seed(0)
+ torch.set_printoptions(sci_mode=False)
+ torch.set_default_device("cpu")
+ dtype = torch.float32
- min_ctx_len = 32
- max_ctx_len = 1024
- min_query_len = 16
- max_query_len = 512
- num_kv_heads = num_heads // num_queries_per_kv
- (
- query,
- k_active,
- v_active,
- k_cache,
- v_cache,
- block_table,
- key,
- value,
- query_lens,
- seq_lens,
- ) = sample_inputs(
- prefill_batch_size=prefill_batch_size,
- decode_batch_size=decode_batch_size,
- min_query_len=min_query_len,
- max_query_len=max_query_len,
- min_ctx_len=min_ctx_len,
- max_ctx_len=max_ctx_len,
- block_size=block_size,
- num_heads=num_heads,
- num_kv_heads=num_kv_heads,
- head_size=head_size,
- dtype=dtype,
- )
+ min_ctx_len = 32
+ max_ctx_len = 1024
+ min_query_len = 16
+ max_query_len = 512
+ num_kv_heads = num_heads // num_queries_per_kv
+ (
+ query,
+ k_active,
+ v_active,
+ k_cache,
+ v_cache,
+ block_table,
+ key,
+ value,
+ query_lens,
+ seq_lens,
+ ) = sample_inputs(
+ prefill_batch_size=prefill_batch_size,
+ decode_batch_size=decode_batch_size,
+ min_query_len=min_query_len,
+ max_query_len=max_query_len,
+ min_ctx_len=min_ctx_len,
+ max_ctx_len=max_ctx_len,
+ block_size=block_size,
+ num_heads=num_heads,
+ num_kv_heads=num_kv_heads,
+ head_size=head_size,
+ dtype=dtype,
+ )
- output_ref = ref_context_attention(
- query,
- key,
- value,
- query_lens,
- seq_lens,
- head_size,
- num_queries_per_kv,
- return_max_reduce=False,
- )
+ output_ref = ref_context_attention(
+ query,
+ key,
+ value,
+ query_lens,
+ seq_lens,
+ head_size,
+ num_queries_per_kv,
+ return_max_reduce=False,
+ )
- # build neuron program
- B_P_SIZE = 128
- assert (large_tile_size >= B_P_SIZE
- ), f"Expect {large_tile_size=} to be larger than {B_P_SIZE=}"
+ # build neuron program
+ B_P_SIZE = 128
+ assert (large_tile_size >= B_P_SIZE
+ ), f"Expect {large_tile_size=} to be larger than {B_P_SIZE=}"
- def ceil_div(a, b):
- return (a + b - 1) // b
+ def ceil_div(a, b):
+ return (a + b - 1) // b
- def pad_to_multiple(a, b):
- return ceil_div(a, b) * b
+ def pad_to_multiple(a, b):
+ return ceil_div(a, b) * b
- def pad_to_next_power_of_2(a):
- assert a > 0
- return 2**int(a - 1).bit_length()
+ def pad_to_next_power_of_2(a):
+ assert a > 0
+ return 2**int(a - 1).bit_length()
- # calculate input shapes
- max_num_queries = pad_to_next_power_of_2(sum(query_lens))
- context_lens = torch.tensor(seq_lens) - torch.tensor(query_lens)
- num_active_blocks = ceil_div(context_lens, block_size).sum().item()
- num_active_blocks = pad_to_multiple(num_active_blocks,
- large_tile_size // block_size)
- context_kv_len = num_active_blocks * block_size
- assert (context_kv_len %
+ # calculate input shapes
+ max_num_queries = pad_to_next_power_of_2(sum(query_lens))
+ context_lens = torch.tensor(seq_lens) - torch.tensor(query_lens)
+ num_active_blocks = ceil_div(context_lens, block_size).sum().item()
+ num_active_blocks = pad_to_multiple(num_active_blocks,
+ large_tile_size // block_size)
+ context_kv_len = num_active_blocks * block_size
+ assert (
+ context_kv_len %
large_tile_size == 0), f"invalid context_kv_len={context_kv_len}"
- # pad QKV tensors
- pad_dims = (
- 0,
- 0,
- 0,
- 0,
- 0,
- max_num_queries - query.shape[0],
- )
- query = F.pad(query, pad_dims, "constant", 0)
- k = F.pad(k_active, pad_dims, "constant", 0)
- v = F.pad(v_active, pad_dims, "constant", 0)
-
- # permute QKV tensors
- # query: (1, n_heads, d, seq_q)
- # key: (1, n_kv_heads, d, seq_k)
- # value: (1, n_kv_heads, seq_v, d)
- query = query.unsqueeze(0).permute(0, 2, 3, 1).contiguous()
- k = k.unsqueeze(0).permute(0, 2, 3, 1).contiguous()
- v = v.unsqueeze(0).permute(0, 2, 1, 3).contiguous()
- k_cache = k_cache.permute(0, 2, 1, 3).contiguous()
- v_cache = v_cache.permute(0, 2, 1, 3).contiguous()
-
- # transform block table
- active_block_table = get_active_block_tables(
- block_table.cpu(),
- torch.tensor(query_lens).cpu(),
- torch.tensor(seq_lens).cpu(),
- block_size,
- num_active_blocks,
- )
-
- # Build attention masks
- prior_mask, active_mask = (
- BlockDiagonalCausalFromBottomRightMask.from_seqlens(
- query_lens, seq_lens, block_size=block_size))
- prior_mask_padded = F.pad(
- prior_mask,
- (
+ # pad QKV tensors
+ pad_dims = (
0,
- context_kv_len - prior_mask.shape[1],
0,
- max_num_queries - prior_mask.shape[0],
- ),
- "constant",
- 0,
- ).bool()
- active_mask_padded = F.pad(
- active_mask,
- (
0,
- max_num_queries - active_mask.shape[1],
0,
- max_num_queries - active_mask.shape[0],
- ),
- "constant",
- 0,
- ).bool()
- attn_mask = torch.concat([prior_mask_padded, active_mask_padded], dim=1)
+ 0,
+ max_num_queries - query.shape[0],
+ )
+ query = F.pad(query, pad_dims, "constant", 0)
+ k = F.pad(k_active, pad_dims, "constant", 0)
+ v = F.pad(v_active, pad_dims, "constant", 0)
- attn_mask = reorder_context_mask(attn_mask, large_tile_size, block_size)
+ # permute QKV tensors
+ # query: (1, n_heads, d, seq_q)
+ # key: (1, n_kv_heads, d, seq_k)
+ # value: (1, n_kv_heads, seq_v, d)
+ query = query.unsqueeze(0).permute(0, 2, 3, 1).contiguous()
+ k = k.unsqueeze(0).permute(0, 2, 3, 1).contiguous()
+ v = v.unsqueeze(0).permute(0, 2, 1, 3).contiguous()
+ k_cache = k_cache.permute(0, 2, 1, 3).contiguous()
+ v_cache = v_cache.permute(0, 2, 1, 3).contiguous()
- input_args = (
- query.to(device=device),
- k.to(device=device),
- v.to(device=device),
- k_cache.to(device=device),
- v_cache.to(device=device),
- active_block_table.to(device=device),
- attn_mask.to(device=device),
- )
- input_kwargs = dict(
- n_kv_head=num_kv_heads,
- head_size=head_size,
- mixed_precision=mixed_precision,
- LARGE_TILE_SZ=large_tile_size,
- )
+ # transform block table
+ active_block_table = get_active_block_tables(
+ block_table.cpu(),
+ torch.tensor(query_lens).cpu(),
+ torch.tensor(seq_lens).cpu(),
+ block_size,
+ num_active_blocks,
+ )
- output_nki = flash_attn_varlen_nkifunc(*input_args, **input_kwargs)
+ # Build attention masks
+ prior_mask, active_mask = (
+ BlockDiagonalCausalFromBottomRightMask.from_seqlens(
+ query_lens, seq_lens, block_size=block_size))
+ prior_mask_padded = F.pad(
+ prior_mask,
+ (
+ 0,
+ context_kv_len - prior_mask.shape[1],
+ 0,
+ max_num_queries - prior_mask.shape[0],
+ ),
+ "constant",
+ 0,
+ ).bool()
+ active_mask_padded = F.pad(
+ active_mask,
+ (
+ 0,
+ max_num_queries - active_mask.shape[1],
+ 0,
+ max_num_queries - active_mask.shape[0],
+ ),
+ "constant",
+ 0,
+ ).bool()
+ attn_mask = torch.concat([prior_mask_padded, active_mask_padded],
+ dim=1)
- num_actual_tokens = sum(query_lens)
- # - o: shape (bs, n_heads, seq_q, d) -> (bs, seq_q, n_heads, d)
- output_nki = output_nki.cpu().permute(0, 2, 1, 3)
- output_nki = output_nki[0, :num_actual_tokens, :, :]
- output_ref_padded = F.pad(
- output_ref,
- (0, 0, 0, 0, 0, 0, 0, max_num_queries - output_ref.shape[0]),
- "constant",
- 0,
- )
- output_ref = output_ref_padded.transpose(0, 1)[0, :num_actual_tokens, :, :]
+ attn_mask = reorder_context_mask(attn_mask, large_tile_size,
+ block_size)
- torch.testing.assert_close(output_nki, output_ref, atol=1e-2, rtol=0)
+ input_args = (
+ query.to(device=device),
+ k.to(device=device),
+ v.to(device=device),
+ k_cache.to(device=device),
+ v_cache.to(device=device),
+ active_block_table.to(device=device),
+ attn_mask.to(device=device),
+ )
+ input_kwargs = dict(
+ n_kv_head=num_kv_heads,
+ head_size=head_size,
+ mixed_precision=mixed_precision,
+ LARGE_TILE_SZ=large_tile_size,
+ )
+
+ output_nki = flash_attn_varlen_nkifunc(*input_args, **input_kwargs)
+
+ num_actual_tokens = sum(query_lens)
+ # - o: shape (bs, n_heads, seq_q, d) -> (bs, seq_q, n_heads, d)
+ output_nki = output_nki.cpu().permute(0, 2, 1, 3)
+ output_nki = output_nki[0, :num_actual_tokens, :, :]
+ output_ref_padded = F.pad(
+ output_ref,
+ (0, 0, 0, 0, 0, 0, 0, max_num_queries - output_ref.shape[0]),
+ "constant",
+ 0,
+ )
+ output_ref = output_ref_padded.transpose(
+ 0, 1)[0, :num_actual_tokens, :, :]
+
+ torch.testing.assert_close(output_nki, output_ref, atol=1e-2, rtol=0)
diff --git a/tests/plugins_tests/test_platform_plugins.py b/tests/plugins_tests/test_platform_plugins.py
index 3be248f5aca45..9d6872e0e0772 100644
--- a/tests/plugins_tests/test_platform_plugins.py
+++ b/tests/plugins_tests/test_platform_plugins.py
@@ -1,10 +1,10 @@
# SPDX-License-Identifier: Apache-2.0
+import pytest
import torch
-from tests.kernels.utils import override_backend_env_variable
from vllm.attention.selector import get_attn_backend
-from vllm.utils import STR_INVALID_VAL
+from vllm.utils import STR_BACKEND_ENV_VAR, STR_INVALID_VAL
def test_platform_plugins():
@@ -25,8 +25,9 @@ def test_platform_plugins():
f" is loaded. The first import:\n{_init_trace}")
-def test_oot_attention_backend(monkeypatch):
+def test_oot_attention_backend(monkeypatch: pytest.MonkeyPatch):
# ignore the backend env variable if it is set
- override_backend_env_variable(monkeypatch, STR_INVALID_VAL)
- backend = get_attn_backend(16, torch.float16, torch.float16, 16, False)
- assert backend.get_name() == "Dummy_Backend"
+ with monkeypatch.context() as m:
+ m.setenv(STR_BACKEND_ENV_VAR, STR_INVALID_VAL)
+ backend = get_attn_backend(16, torch.float16, torch.float16, 16, False)
+ assert backend.get_name() == "Dummy_Backend"
diff --git a/tests/plugins_tests/test_scheduler_plugins.py b/tests/plugins_tests/test_scheduler_plugins.py
index 98981a81e909c..7abf5066a4133 100644
--- a/tests/plugins_tests/test_scheduler_plugins.py
+++ b/tests/plugins_tests/test_scheduler_plugins.py
@@ -22,43 +22,47 @@ class DummyV1Scheduler(V1Scheduler):
raise Exception("Exception raised by DummyV1Scheduler")
-def test_scheduler_plugins_v0(monkeypatch):
- monkeypatch.setenv("VLLM_USE_V1", "0")
- with pytest.raises(Exception) as exception_info:
+def test_scheduler_plugins_v0(monkeypatch: pytest.MonkeyPatch):
+ with monkeypatch.context() as m:
+ m.setenv("VLLM_USE_V1", "0")
+ with pytest.raises(Exception) as exception_info:
- engine_args = EngineArgs(
- model="facebook/opt-125m",
- enforce_eager=True, # reduce test time
- scheduler_cls=DummyV0Scheduler,
- )
+ engine_args = EngineArgs(
+ model="facebook/opt-125m",
+ enforce_eager=True, # reduce test time
+ scheduler_cls=DummyV0Scheduler,
+ )
- engine = LLMEngine.from_engine_args(engine_args=engine_args)
+ engine = LLMEngine.from_engine_args(engine_args=engine_args)
- sampling_params = SamplingParams(max_tokens=1)
- engine.add_request("0", "foo", sampling_params)
- engine.step()
+ sampling_params = SamplingParams(max_tokens=1)
+ engine.add_request("0", "foo", sampling_params)
+ engine.step()
- assert str(exception_info.value) == "Exception raised by DummyV0Scheduler"
+ assert str(
+ exception_info.value) == "Exception raised by DummyV0Scheduler"
-def test_scheduler_plugins_v1(monkeypatch):
- monkeypatch.setenv("VLLM_USE_V1", "1")
- # Explicitly turn off engine multiprocessing so that the scheduler runs in
- # this process
- monkeypatch.setenv("VLLM_ENABLE_V1_MULTIPROCESSING", "0")
+def test_scheduler_plugins_v1(monkeypatch: pytest.MonkeyPatch):
+ with monkeypatch.context() as m:
+ m.setenv("VLLM_USE_V1", "1")
+ # Explicitly turn off engine multiprocessing so
+ # that the scheduler runs in this process
+ m.setenv("VLLM_ENABLE_V1_MULTIPROCESSING", "0")
- with pytest.raises(Exception) as exception_info:
+ with pytest.raises(Exception) as exception_info:
- engine_args = EngineArgs(
- model="facebook/opt-125m",
- enforce_eager=True, # reduce test time
- scheduler_cls=DummyV1Scheduler,
- )
+ engine_args = EngineArgs(
+ model="facebook/opt-125m",
+ enforce_eager=True, # reduce test time
+ scheduler_cls=DummyV1Scheduler,
+ )
- engine = V1LLMEngine.from_engine_args(engine_args=engine_args)
+ engine = V1LLMEngine.from_engine_args(engine_args=engine_args)
- sampling_params = SamplingParams(max_tokens=1)
- engine.add_request("0", "foo", sampling_params)
- engine.step()
+ sampling_params = SamplingParams(max_tokens=1)
+ engine.add_request("0", "foo", sampling_params)
+ engine.step()
- assert str(exception_info.value) == "Exception raised by DummyV1Scheduler"
+ assert str(
+ exception_info.value) == "Exception raised by DummyV1Scheduler"
diff --git a/tests/prefix_caching/test_prefix_caching.py b/tests/prefix_caching/test_prefix_caching.py
index 7a4bc7aecc0f4..607b6c43e02e2 100644
--- a/tests/prefix_caching/test_prefix_caching.py
+++ b/tests/prefix_caching/test_prefix_caching.py
@@ -4,25 +4,29 @@
Run `pytest tests/prefix_caching/test_prefix_caching.py`.
"""
+from __future__ import annotations
+
import pytest
from tests.conftest import VllmRunner
from tests.core.utils import SchedulerProxy, create_dummy_prompt
-from tests.kernels.utils import override_backend_env_variable
from vllm import SamplingParams, TokensPrompt
from vllm.core.scheduler import Scheduler
from vllm.engine.llm_engine import LLMEngine
from vllm.platforms import current_platform
+from vllm.utils import STR_BACKEND_ENV_VAR
from ..models.utils import check_outputs_equal
@pytest.fixture(scope="function", autouse=True)
-def use_v0_only(monkeypatch):
+def use_v0_only(monkeypatch: pytest.MonkeyPatch):
"""
This module relies on V0 internals, so set VLLM_USE_V1=0.
"""
- monkeypatch.setenv('VLLM_USE_V1', '0')
+ with monkeypatch.context() as m:
+ m.setenv('VLLM_USE_V1', '0')
+ yield
MODELS = [
@@ -56,7 +60,7 @@ def test_mixed_requests(
cached_position: int,
enable_chunked_prefill: bool,
block_size: int,
- monkeypatch,
+ monkeypatch: pytest.MonkeyPatch,
) -> None:
"""
Test the case when some sequences have the prefix cache hit
@@ -67,72 +71,77 @@ def test_mixed_requests(
pytest.skip("Flashinfer does not support ROCm/HIP.")
if backend == "XFORMERS" and current_platform.is_rocm():
pytest.skip("Xformers does not support ROCm/HIP.")
- override_backend_env_variable(monkeypatch, backend)
+ with monkeypatch.context() as m:
+ m.setenv(STR_BACKEND_ENV_VAR, backend)
- with hf_runner(model, dtype=dtype) as hf_model:
- hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
+ with hf_runner(model, dtype=dtype) as hf_model:
+ hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
- cached_prompt = example_prompts[cached_position]
- with vllm_runner(
- model,
- dtype=dtype,
- enable_prefix_caching=True,
- enable_chunked_prefill=enable_chunked_prefill,
- block_size=block_size,
- ) as vllm_model:
- # Run the first prompt so the cache is populated
- vllm_outputs = vllm_model.generate_greedy([cached_prompt], max_tokens)
+ cached_prompt = example_prompts[cached_position]
+ with vllm_runner(
+ model,
+ dtype=dtype,
+ enable_prefix_caching=True,
+ enable_chunked_prefill=enable_chunked_prefill,
+ block_size=block_size,
+ ) as vllm_model:
+ # Run the first prompt so the cache is populated
+ vllm_outputs = vllm_model.generate_greedy([cached_prompt],
+ max_tokens)
- # Run all the promopts
- greedy_params = SamplingParams(temperature=0.0, max_tokens=max_tokens)
- req_outputs = vllm_model.model.generate(example_prompts, greedy_params)
+            # Run all the prompts
+ greedy_params = SamplingParams(temperature=0.0,
+ max_tokens=max_tokens)
+ req_outputs = vllm_model.model.generate(example_prompts,
+ greedy_params)
- # Verify number of cached tokens
- for i in range(len(req_outputs)):
- if i == cached_position:
- expected_num_cached_tokens = (
- len(req_outputs[i].prompt_token_ids) //
- block_size) * block_size
- else:
- expected_num_cached_tokens = 0
- assert (
- req_outputs[i].num_cached_tokens == expected_num_cached_tokens)
+ # Verify number of cached tokens
+ for i in range(len(req_outputs)):
+ if i == cached_position:
+ expected_num_cached_tokens = (
+ len(req_outputs[i].prompt_token_ids) //
+ block_size) * block_size
+ else:
+ expected_num_cached_tokens = 0
+ assert (req_outputs[i].num_cached_tokens ==
+ expected_num_cached_tokens)
- vllm_outputs = [(
- output.prompt_token_ids + list(output.outputs[0].token_ids),
- output.prompt + output.outputs[0].text,
- ) for output in req_outputs]
+ vllm_outputs = [(
+ output.prompt_token_ids + list(output.outputs[0].token_ids),
+ output.prompt + output.outputs[0].text,
+ ) for output in req_outputs]
- check_outputs_equal(
- outputs_0_lst=hf_outputs,
- outputs_1_lst=vllm_outputs,
- name_0="hf",
- name_1="vllm",
- )
+ check_outputs_equal(
+ outputs_0_lst=hf_outputs,
+ outputs_1_lst=vllm_outputs,
+ name_0="hf",
+ name_1="vllm",
+ )
@pytest.mark.parametrize("backend", ["FLASH_ATTN", "FLASHINFER", "XFORMERS"])
def test_unstable_prompt_sequence(
vllm_runner,
backend: str,
- monkeypatch,
+ monkeypatch: pytest.MonkeyPatch,
) -> None:
if backend == "FLASHINFER" and current_platform.is_rocm():
pytest.skip("Flashinfer does not support ROCm/HIP.")
if backend == "XFORMERS" and current_platform.is_rocm():
pytest.skip("Xformers does not support ROCm/HIP.")
- override_backend_env_variable(monkeypatch, backend)
+ with monkeypatch.context() as m:
+ m.setenv(STR_BACKEND_ENV_VAR, backend)
- with vllm_runner(
- "Qwen/Qwen2.5-0.5B-Instruct",
- enable_chunked_prefill=True,
- enable_prefix_caching=True,
- max_model_len=4096,
- ) as vllm_model:
- for prompt in UNSTABLE_PROMPT_SEQUENCE:
- vllm_model.generate(TokensPrompt(prompt_token_ids=prompt),
- SamplingParams(max_tokens=1))
+ with vllm_runner(
+ "Qwen/Qwen2.5-0.5B-Instruct",
+ enable_chunked_prefill=True,
+ enable_prefix_caching=True,
+ max_model_len=4096,
+ ) as vllm_model:
+ for prompt in UNSTABLE_PROMPT_SEQUENCE:
+ vllm_model.generate(TokensPrompt(prompt_token_ids=prompt),
+ SamplingParams(max_tokens=1))
@pytest.mark.parametrize("model", MODELS)
diff --git a/tests/test_regression.py b/tests/test_regression.py
index b54dc6af3e9a6..8c9d4a91c73be 100644
--- a/tests/test_regression.py
+++ b/tests/test_regression.py
@@ -56,12 +56,11 @@ def test_gc():
assert allocated < 50 * 1024 * 1024
-def test_model_from_modelscope(monkeypatch):
+def test_model_from_modelscope(monkeypatch: pytest.MonkeyPatch):
# model: https://modelscope.cn/models/qwen/Qwen1.5-0.5B-Chat/summary
- MODELSCOPE_MODEL_NAME = "qwen/Qwen1.5-0.5B-Chat"
- monkeypatch.setenv("VLLM_USE_MODELSCOPE", "True")
- try:
- llm = LLM(model=MODELSCOPE_MODEL_NAME)
+ with monkeypatch.context() as m:
+ m.setenv("VLLM_USE_MODELSCOPE", "True")
+ llm = LLM(model="qwen/Qwen1.5-0.5B-Chat")
prompts = [
"Hello, my name is",
@@ -73,10 +72,3 @@ def test_model_from_modelscope(monkeypatch):
outputs = llm.generate(prompts, sampling_params)
assert len(outputs) == 4
- finally:
- monkeypatch.delenv("VLLM_USE_MODELSCOPE", raising=False)
-
-
-if __name__ == "__main__":
- import pytest
- pytest.main([__file__])
diff --git a/tests/test_utils.py b/tests/test_utils.py
index dcca7d5965e9e..ae4fddd046d45 100644
--- a/tests/test_utils.py
+++ b/tests/test_utils.py
@@ -1,7 +1,7 @@
# SPDX-License-Identifier: Apache-2.0
+# ruff: noqa
import asyncio
-import os
import socket
from collections.abc import AsyncIterator
from unittest.mock import patch
@@ -112,16 +112,16 @@ def test_deprecate_kwargs_additional_message():
dummy(old_arg=1)
-def test_get_open_port():
- os.environ["VLLM_PORT"] = "5678"
- # make sure we can get multiple ports, even if the env var is set
- with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s1:
- s1.bind(("localhost", get_open_port()))
- with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s2:
- s2.bind(("localhost", get_open_port()))
- with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s3:
- s3.bind(("localhost", get_open_port()))
- os.environ.pop("VLLM_PORT")
+def test_get_open_port(monkeypatch: pytest.MonkeyPatch):
+ with monkeypatch.context() as m:
+ m.setenv("VLLM_PORT", "5678")
+ # make sure we can get multiple ports, even if the env var is set
+ with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s1:
+ s1.bind(("localhost", get_open_port()))
+ with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s2:
+ s2.bind(("localhost", get_open_port()))
+ with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s3:
+ s3.bind(("localhost", get_open_port()))
# Tests for FlexibleArgumentParser
@@ -366,31 +366,32 @@ def test_bind_kv_cache_non_attention():
assert ctx['model.layers.28.attn'].kv_cache[0] is kv_cache[1]
-def test_bind_kv_cache_encoder_decoder(monkeypatch):
+def test_bind_kv_cache_encoder_decoder(monkeypatch: pytest.MonkeyPatch):
# V1 TESTS: ENCODER_DECODER is not supported on V1 yet.
- monkeypatch.setenv("VLLM_USE_V1", "0")
+ with monkeypatch.context() as m:
+ m.setenv("VLLM_USE_V1", "0")
- from vllm.attention import Attention, AttentionType
+ from vllm.attention import Attention, AttentionType
- # example from bart
- ctx = {
- 'encoder.layers.0.self_attn.attn':
- Attention(32, 128, 0.1, attn_type=AttentionType.ENCODER),
- 'decoder.layers.0.encoder_attn.attn':
- Attention(32, 128, 0.1, attn_type=AttentionType.ENCODER_DECODER),
- 'decoder.layers.0.self_attn.attn':
- Attention(32, 128, 0.1, attn_type=AttentionType.DECODER),
- }
+ # example from bart
+ ctx = {
+ 'encoder.layers.0.self_attn.attn':
+ Attention(32, 128, 0.1, attn_type=AttentionType.ENCODER),
+ 'decoder.layers.0.encoder_attn.attn':
+ Attention(32, 128, 0.1, attn_type=AttentionType.ENCODER_DECODER),
+ 'decoder.layers.0.self_attn.attn':
+ Attention(32, 128, 0.1, attn_type=AttentionType.DECODER),
+ }
- kv_cache = [
- torch.zeros((1, )),
- ]
- encoder_kv_cache = ctx['encoder.layers.0.self_attn.attn'].kv_cache
+ kv_cache = [
+ torch.zeros((1, )),
+ ]
+ encoder_kv_cache = ctx['encoder.layers.0.self_attn.attn'].kv_cache
- bind_kv_cache(ctx, [kv_cache])
- assert ctx['encoder.layers.0.self_attn.attn'].kv_cache is encoder_kv_cache
- assert ctx['decoder.layers.0.encoder_attn.attn'].kv_cache[0] is kv_cache[0]
- assert ctx['decoder.layers.0.self_attn.attn'].kv_cache[0] is kv_cache[0]
+ bind_kv_cache(ctx, [kv_cache])
+ assert ctx['encoder.layers.0.self_attn.attn'].kv_cache is encoder_kv_cache
+ assert ctx['decoder.layers.0.encoder_attn.attn'].kv_cache[0] is kv_cache[0]
+ assert ctx['decoder.layers.0.self_attn.attn'].kv_cache[0] is kv_cache[0]
def test_bind_kv_cache_pp():
diff --git a/tests/tpu/test_custom_dispatcher.py b/tests/tpu/test_custom_dispatcher.py
index e94bbd2877225..f7a59f054b61b 100644
--- a/tests/tpu/test_custom_dispatcher.py
+++ b/tests/tpu/test_custom_dispatcher.py
@@ -1,6 +1,6 @@
# SPDX-License-Identifier: Apache-2.0
-import os
+import pytest
from vllm.config import CompilationLevel
@@ -9,16 +9,17 @@ from ..utils import compare_two_settings
# --enforce-eager on TPU causes graph compilation
# this times out default Health Check in the MQLLMEngine,
# so we set the timeout here to 30s
-os.environ["VLLM_RPC_TIMEOUT"] = "30000"
-def test_custom_dispatcher():
- compare_two_settings(
- "google/gemma-2b",
- arg1=[
- "--enforce-eager",
- f"-O{CompilationLevel.DYNAMO_ONCE}",
- ],
- arg2=["--enforce-eager", f"-O{CompilationLevel.DYNAMO_AS_IS}"],
- env1={},
- env2={})
+def test_custom_dispatcher(monkeypatch: pytest.MonkeyPatch):
+ with monkeypatch.context() as m:
+ m.setenv("VLLM_RPC_TIMEOUT", "30000")
+ compare_two_settings(
+ "google/gemma-2b",
+ arg1=[
+ "--enforce-eager",
+ f"-O{CompilationLevel.DYNAMO_ONCE}",
+ ],
+ arg2=["--enforce-eager", f"-O{CompilationLevel.DYNAMO_AS_IS}"],
+ env1={},
+ env2={})
diff --git a/tests/tracing/test_tracing.py b/tests/tracing/test_tracing.py
index 97149884497af..a781b8b563be1 100644
--- a/tests/tracing/test_tracing.py
+++ b/tests/tracing/test_tracing.py
@@ -1,10 +1,12 @@
# SPDX-License-Identifier: Apache-2.0
+# ruff: noqa
+# type: ignore
+from __future__ import annotations
-import os
import threading
from collections.abc import Iterable
from concurrent import futures
-from typing import Callable, Literal
+from typing import Callable, Generator, Literal
import grpc
import pytest
@@ -21,12 +23,14 @@ from vllm.tracing import SpanAttributes
@pytest.fixture(scope="function", autouse=True)
-def use_v0_only(monkeypatch):
+def use_v0_only(monkeypatch: pytest.MonkeyPatch):
"""
Since this module is V0 only, set VLLM_USE_V1=0 for
all tests in the module.
"""
- monkeypatch.setenv('VLLM_USE_V1', '0')
+ with monkeypatch.context() as m:
+ m.setenv('VLLM_USE_V1', '0')
+ yield
FAKE_TRACE_SERVER_ADDRESS = "localhost:4317"
@@ -67,7 +71,7 @@ class FakeTraceService(TraceServiceServicer):
@pytest.fixture
-def trace_service():
+def trace_service() -> Generator[FakeTraceService, None, None]:
"""Fixture to set up a fake gRPC trace service"""
server = grpc.server(futures.ThreadPoolExecutor(max_workers=1))
service = FakeTraceService()
@@ -80,136 +84,153 @@ def trace_service():
server.stop(None)
-def test_traces(trace_service):
- os.environ[OTEL_EXPORTER_OTLP_TRACES_INSECURE] = "true"
+def test_traces(
+ monkeypatch: pytest.MonkeyPatch,
+ trace_service: FakeTraceService,
+):
+ with monkeypatch.context() as m:
+ m.setenv(OTEL_EXPORTER_OTLP_TRACES_INSECURE, "true")
- sampling_params = SamplingParams(temperature=0.01,
- top_p=0.1,
- max_tokens=256)
- model = "facebook/opt-125m"
- llm = LLM(
- model=model,
- otlp_traces_endpoint=FAKE_TRACE_SERVER_ADDRESS,
- )
- prompts = ["This is a short prompt"]
- outputs = llm.generate(prompts, sampling_params=sampling_params)
+ sampling_params = SamplingParams(
+ temperature=0.01,
+ top_p=0.1,
+ max_tokens=256,
+ )
+ model = "facebook/opt-125m"
+ llm = LLM(
+ model=model,
+ otlp_traces_endpoint=FAKE_TRACE_SERVER_ADDRESS,
+ )
+ prompts = ["This is a short prompt"]
+ outputs = llm.generate(prompts, sampling_params=sampling_params)
- timeout = 5
- if not trace_service.evt.wait(timeout):
- raise TimeoutError(
- f"The fake trace service didn't receive a trace within "
- f"the {timeout} seconds timeout")
+ timeout = 5
+ if not trace_service.evt.wait(timeout):
+ raise TimeoutError(
+ f"The fake trace service didn't receive a trace within "
+ f"the {timeout} seconds timeout")
- request = trace_service.request
- assert len(request.resource_spans) == 1, (
- f"Expected 1 resource span, "
- f"but got {len(request.resource_spans)}")
- assert len(request.resource_spans[0].scope_spans) == 1, (
- f"Expected 1 scope span, "
- f"but got {len(request.resource_spans[0].scope_spans)}")
- assert len(request.resource_spans[0].scope_spans[0].spans) == 1, (
- f"Expected 1 span, "
- f"but got {len(request.resource_spans[0].scope_spans[0].spans)}")
+ request = trace_service.request
+ assert len(request.resource_spans) == 1, (
+ f"Expected 1 resource span, "
+ f"but got {len(request.resource_spans)}")
+ assert len(request.resource_spans[0].scope_spans) == 1, (
+ f"Expected 1 scope span, "
+ f"but got {len(request.resource_spans[0].scope_spans)}")
+ assert len(request.resource_spans[0].scope_spans[0].spans) == 1, (
+ f"Expected 1 span, "
+ f"but got {len(request.resource_spans[0].scope_spans[0].spans)}")
- attributes = decode_attributes(
- request.resource_spans[0].scope_spans[0].spans[0].attributes)
- assert attributes.get(SpanAttributes.GEN_AI_RESPONSE_MODEL) == model
- assert attributes.get(
- SpanAttributes.GEN_AI_REQUEST_ID) == outputs[0].request_id
- assert attributes.get(SpanAttributes.GEN_AI_REQUEST_TEMPERATURE
- ) == sampling_params.temperature
- assert attributes.get(
- SpanAttributes.GEN_AI_REQUEST_TOP_P) == sampling_params.top_p
- assert attributes.get(
- SpanAttributes.GEN_AI_REQUEST_MAX_TOKENS) == sampling_params.max_tokens
- assert attributes.get(SpanAttributes.GEN_AI_REQUEST_N) == sampling_params.n
- assert attributes.get(SpanAttributes.GEN_AI_USAGE_PROMPT_TOKENS) == len(
- outputs[0].prompt_token_ids)
- completion_tokens = sum(len(o.token_ids) for o in outputs[0].outputs)
- assert attributes.get(
- SpanAttributes.GEN_AI_USAGE_COMPLETION_TOKENS) == completion_tokens
- metrics = outputs[0].metrics
- assert attributes.get(
- SpanAttributes.GEN_AI_LATENCY_TIME_IN_QUEUE) == metrics.time_in_queue
- ttft = metrics.first_token_time - metrics.arrival_time
- assert attributes.get(
- SpanAttributes.GEN_AI_LATENCY_TIME_TO_FIRST_TOKEN) == ttft
- e2e_time = metrics.finished_time - metrics.arrival_time
- assert attributes.get(SpanAttributes.GEN_AI_LATENCY_E2E) == e2e_time
- assert metrics.scheduler_time > 0
- assert attributes.get(SpanAttributes.GEN_AI_LATENCY_TIME_IN_SCHEDULER
- ) == metrics.scheduler_time
- # Model forward and model execute should be none, since detailed traces is
- # not enabled.
- assert metrics.model_forward_time is None
- assert metrics.model_execute_time is None
+ attributes = decode_attributes(
+ request.resource_spans[0].scope_spans[0].spans[0].attributes)
+ assert attributes.get(SpanAttributes.GEN_AI_RESPONSE_MODEL) == model
+ assert attributes.get(
+ SpanAttributes.GEN_AI_REQUEST_ID) == outputs[0].request_id
+ assert attributes.get(SpanAttributes.GEN_AI_REQUEST_TEMPERATURE
+ ) == sampling_params.temperature
+ assert attributes.get(
+ SpanAttributes.GEN_AI_REQUEST_TOP_P) == sampling_params.top_p
+ assert attributes.get(SpanAttributes.GEN_AI_REQUEST_MAX_TOKENS
+ ) == sampling_params.max_tokens
+ assert attributes.get(
+ SpanAttributes.GEN_AI_REQUEST_N) == sampling_params.n
+ assert attributes.get(
+ SpanAttributes.GEN_AI_USAGE_PROMPT_TOKENS) == len(
+ outputs[0].prompt_token_ids)
+ completion_tokens = sum(len(o.token_ids) for o in outputs[0].outputs)
+ assert attributes.get(
+ SpanAttributes.GEN_AI_USAGE_COMPLETION_TOKENS) == completion_tokens
+ metrics = outputs[0].metrics
+ assert attributes.get(SpanAttributes.GEN_AI_LATENCY_TIME_IN_QUEUE
+ ) == metrics.time_in_queue
+ ttft = metrics.first_token_time - metrics.arrival_time
+ assert attributes.get(
+ SpanAttributes.GEN_AI_LATENCY_TIME_TO_FIRST_TOKEN) == ttft
+ e2e_time = metrics.finished_time - metrics.arrival_time
+ assert attributes.get(SpanAttributes.GEN_AI_LATENCY_E2E) == e2e_time
+ assert metrics.scheduler_time > 0
+ assert attributes.get(SpanAttributes.GEN_AI_LATENCY_TIME_IN_SCHEDULER
+ ) == metrics.scheduler_time
+        # Model forward and model execute should be None, since detailed
+        # tracing is not enabled.
+ assert metrics.model_forward_time is None
+ assert metrics.model_execute_time is None
-def test_traces_with_detailed_steps(trace_service):
- os.environ[OTEL_EXPORTER_OTLP_TRACES_INSECURE] = "true"
+def test_traces_with_detailed_steps(
+ monkeypatch: pytest.MonkeyPatch,
+ trace_service: FakeTraceService,
+):
+ with monkeypatch.context() as m:
+ m.setenv(OTEL_EXPORTER_OTLP_TRACES_INSECURE, "true")
- sampling_params = SamplingParams(temperature=0.01,
- top_p=0.1,
- max_tokens=256)
- model = "facebook/opt-125m"
- llm = LLM(
- model=model,
- otlp_traces_endpoint=FAKE_TRACE_SERVER_ADDRESS,
- collect_detailed_traces="all",
- )
- prompts = ["This is a short prompt"]
- outputs = llm.generate(prompts, sampling_params=sampling_params)
+ sampling_params = SamplingParams(
+ temperature=0.01,
+ top_p=0.1,
+ max_tokens=256,
+ )
+ model = "facebook/opt-125m"
+ llm = LLM(
+ model=model,
+ otlp_traces_endpoint=FAKE_TRACE_SERVER_ADDRESS,
+ collect_detailed_traces="all",
+ )
+ prompts = ["This is a short prompt"]
+ outputs = llm.generate(prompts, sampling_params=sampling_params)
- timeout = 5
- if not trace_service.evt.wait(timeout):
- raise TimeoutError(
- f"The fake trace service didn't receive a trace within "
- f"the {timeout} seconds timeout")
+ timeout = 5
+ if not trace_service.evt.wait(timeout):
+ raise TimeoutError(
+ f"The fake trace service didn't receive a trace within "
+ f"the {timeout} seconds timeout")
- request = trace_service.request
- assert len(request.resource_spans) == 1, (
- f"Expected 1 resource span, "
- f"but got {len(request.resource_spans)}")
- assert len(request.resource_spans[0].scope_spans) == 1, (
- f"Expected 1 scope span, "
- f"but got {len(request.resource_spans[0].scope_spans)}")
- assert len(request.resource_spans[0].scope_spans[0].spans) == 1, (
- f"Expected 1 span, "
- f"but got {len(request.resource_spans[0].scope_spans[0].spans)}")
+ request = trace_service.request
+ assert len(request.resource_spans) == 1, (
+ f"Expected 1 resource span, "
+ f"but got {len(request.resource_spans)}")
+ assert len(request.resource_spans[0].scope_spans) == 1, (
+ f"Expected 1 scope span, "
+ f"but got {len(request.resource_spans[0].scope_spans)}")
+ assert len(request.resource_spans[0].scope_spans[0].spans) == 1, (
+ f"Expected 1 span, "
+ f"but got {len(request.resource_spans[0].scope_spans[0].spans)}")
- attributes = decode_attributes(
- request.resource_spans[0].scope_spans[0].spans[0].attributes)
- assert attributes.get(SpanAttributes.GEN_AI_RESPONSE_MODEL) == model
- assert attributes.get(
- SpanAttributes.GEN_AI_REQUEST_ID) == outputs[0].request_id
- assert attributes.get(SpanAttributes.GEN_AI_REQUEST_TEMPERATURE
- ) == sampling_params.temperature
- assert attributes.get(
- SpanAttributes.GEN_AI_REQUEST_TOP_P) == sampling_params.top_p
- assert attributes.get(
- SpanAttributes.GEN_AI_REQUEST_MAX_TOKENS) == sampling_params.max_tokens
- assert attributes.get(SpanAttributes.GEN_AI_REQUEST_N) == sampling_params.n
- assert attributes.get(SpanAttributes.GEN_AI_USAGE_PROMPT_TOKENS) == len(
- outputs[0].prompt_token_ids)
- completion_tokens = sum(len(o.token_ids) for o in outputs[0].outputs)
- assert attributes.get(
- SpanAttributes.GEN_AI_USAGE_COMPLETION_TOKENS) == completion_tokens
- metrics = outputs[0].metrics
- assert attributes.get(
- SpanAttributes.GEN_AI_LATENCY_TIME_IN_QUEUE) == metrics.time_in_queue
- ttft = metrics.first_token_time - metrics.arrival_time
- assert attributes.get(
- SpanAttributes.GEN_AI_LATENCY_TIME_TO_FIRST_TOKEN) == ttft
- e2e_time = metrics.finished_time - metrics.arrival_time
- assert attributes.get(SpanAttributes.GEN_AI_LATENCY_E2E) == e2e_time
- assert metrics.scheduler_time > 0
- assert attributes.get(SpanAttributes.GEN_AI_LATENCY_TIME_IN_SCHEDULER
- ) == metrics.scheduler_time
- assert metrics.model_forward_time > 0
- assert attributes.get(
- SpanAttributes.GEN_AI_LATENCY_TIME_IN_MODEL_FORWARD) == pytest.approx(
- metrics.model_forward_time / 1000)
- assert metrics.model_execute_time > 0
- assert attributes.get(SpanAttributes.GEN_AI_LATENCY_TIME_IN_MODEL_EXECUTE
- ) == metrics.model_execute_time
- assert metrics.model_forward_time < 1000 * metrics.model_execute_time
+ attributes = decode_attributes(
+ request.resource_spans[0].scope_spans[0].spans[0].attributes)
+ assert attributes.get(SpanAttributes.GEN_AI_RESPONSE_MODEL) == model
+ assert attributes.get(
+ SpanAttributes.GEN_AI_REQUEST_ID) == outputs[0].request_id
+ assert attributes.get(SpanAttributes.GEN_AI_REQUEST_TEMPERATURE
+ ) == sampling_params.temperature
+ assert attributes.get(
+ SpanAttributes.GEN_AI_REQUEST_TOP_P) == sampling_params.top_p
+ assert attributes.get(SpanAttributes.GEN_AI_REQUEST_MAX_TOKENS
+ ) == sampling_params.max_tokens
+ assert attributes.get(
+ SpanAttributes.GEN_AI_REQUEST_N) == sampling_params.n
+ assert attributes.get(
+ SpanAttributes.GEN_AI_USAGE_PROMPT_TOKENS) == len(
+ outputs[0].prompt_token_ids)
+ completion_tokens = sum(len(o.token_ids) for o in outputs[0].outputs)
+ assert attributes.get(
+ SpanAttributes.GEN_AI_USAGE_COMPLETION_TOKENS) == completion_tokens
+ metrics = outputs[0].metrics
+ assert attributes.get(SpanAttributes.GEN_AI_LATENCY_TIME_IN_QUEUE
+ ) == metrics.time_in_queue
+ ttft = metrics.first_token_time - metrics.arrival_time
+ assert attributes.get(
+ SpanAttributes.GEN_AI_LATENCY_TIME_TO_FIRST_TOKEN) == ttft
+ e2e_time = metrics.finished_time - metrics.arrival_time
+ assert attributes.get(SpanAttributes.GEN_AI_LATENCY_E2E) == e2e_time
+ assert metrics.scheduler_time > 0
+ assert attributes.get(SpanAttributes.GEN_AI_LATENCY_TIME_IN_SCHEDULER
+ ) == metrics.scheduler_time
+ assert metrics.model_forward_time > 0
+ assert attributes.get(
+ SpanAttributes.GEN_AI_LATENCY_TIME_IN_MODEL_FORWARD
+ ) == pytest.approx(metrics.model_forward_time / 1000)
+ assert metrics.model_execute_time > 0
+ assert attributes.get(
+ SpanAttributes.GEN_AI_LATENCY_TIME_IN_MODEL_EXECUTE
+ ) == metrics.model_execute_time
+ assert metrics.model_forward_time < 1000 * metrics.model_execute_time
diff --git a/tests/utils.py b/tests/utils.py
index fc19c8d031b16..06ba8a2421c16 100644
--- a/tests/utils.py
+++ b/tests/utils.py
@@ -566,6 +566,7 @@ def init_test_distributed_environment(
def multi_process_parallel(
+ monkeypatch: pytest.MonkeyPatch,
tp_size: int,
pp_size: int,
test_target: Any,
@@ -582,7 +583,13 @@ def multi_process_parallel(
refs = []
for rank in range(tp_size * pp_size):
refs.append(
- test_target.remote(tp_size, pp_size, rank, distributed_init_port))
+ test_target.remote(
+ monkeypatch,
+ tp_size,
+ pp_size,
+ rank,
+ distributed_init_port,
+ ), )
ray.get(refs)
ray.shutdown()
@@ -700,7 +707,7 @@ def large_gpu_mark(min_gb: int) -> pytest.MarkDecorator:
"""
Get a pytest mark, which skips the test if the GPU doesn't meet
a minimum memory requirement in GB.
-
+
This can be leveraged via `@large_gpu_test` to skip tests in environments
without enough resources, or called when filtering tests to run directly.
"""
diff --git a/tests/v1/e2e/test_ngram_spec_decode.py b/tests/v1/e2e/test_ngram_spec_decode.py
index 519a74cab84bc..6cca324514565 100644
--- a/tests/v1/e2e/test_ngram_spec_decode.py
+++ b/tests/v1/e2e/test_ngram_spec_decode.py
@@ -1,5 +1,8 @@
# SPDX-License-Identifier: Apache-2.0
+from __future__ import annotations
+
import random
+from typing import Any
import pytest
@@ -50,8 +53,12 @@ def model_name():
return "meta-llama/Meta-Llama-3-8B-Instruct"
-def test_ngram_correctness(monkeypatch, test_prompts, sampling_config,
- model_name):
+def test_ngram_correctness(
+ monkeypatch: pytest.MonkeyPatch,
+ test_prompts: list[list[dict[str, Any]]],
+ sampling_config: SamplingParams,
+ model_name: str,
+):
'''
Compare the outputs of a original LLM and a speculative LLM
should be the same when using ngram speculative decoding.
diff --git a/tests/v1/engine/test_async_llm.py b/tests/v1/engine/test_async_llm.py
index 5b9725d59ddc5..0ff804976ada6 100644
--- a/tests/v1/engine/test_async_llm.py
+++ b/tests/v1/engine/test_async_llm.py
@@ -80,9 +80,11 @@ async def generate(engine: AsyncLLM,
[(TEXT_ENGINE_ARGS, TEXT_PROMPT),
(VISION_ENGINE_ARGS, VISION_PROMPT)])
@pytest.mark.asyncio
-async def test_load(monkeypatch, output_kind: RequestOutputKind,
- engine_args_and_prompt: tuple[AsyncEngineArgs,
- PromptType]):
+async def test_load(
+ monkeypatch: pytest.MonkeyPatch,
+ output_kind: RequestOutputKind,
+ engine_args_and_prompt: tuple[AsyncEngineArgs, PromptType],
+):
# TODO(rickyx): Remove monkeypatch once we have a better way to test V1
# so that in the future when we switch, we don't have to change all the
# tests.
@@ -126,7 +128,8 @@ async def test_load(monkeypatch, output_kind: RequestOutputKind,
[(TEXT_ENGINE_ARGS, TEXT_PROMPT),
(VISION_ENGINE_ARGS, VISION_PROMPT)])
@pytest.mark.asyncio
-async def test_abort(monkeypatch, output_kind: RequestOutputKind,
+async def test_abort(monkeypatch: pytest.MonkeyPatch,
+ output_kind: RequestOutputKind,
engine_args_and_prompt: tuple[AsyncEngineArgs,
PromptType]):
diff --git a/tests/v1/engine/test_engine_core.py b/tests/v1/engine/test_engine_core.py
index 5fdbcf5b99636..2ec4f7e034af8 100644
--- a/tests/v1/engine/test_engine_core.py
+++ b/tests/v1/engine/test_engine_core.py
@@ -45,7 +45,7 @@ def make_request() -> EngineCoreRequest:
@fork_new_process_for_each_test
-def test_engine_core(monkeypatch):
+def test_engine_core(monkeypatch: pytest.MonkeyPatch):
with monkeypatch.context() as m:
m.setenv("VLLM_USE_V1", "1")
@@ -159,10 +159,10 @@ def test_engine_core(monkeypatch):
@fork_new_process_for_each_test
-def test_engine_core_advanced_sampling(monkeypatch):
+def test_engine_core_advanced_sampling(monkeypatch: pytest.MonkeyPatch):
"""
- A basic end-to-end test to verify that the engine functions correctly
- when additional sampling parameters, such as top_p, min_tokens, and
+ A basic end-to-end test to verify that the engine functions correctly
+ when additional sampling parameters, such as top_p, min_tokens, and
presence_penalty, are set.
"""
with monkeypatch.context() as m:
@@ -209,7 +209,7 @@ def test_engine_core_advanced_sampling(monkeypatch):
@fork_new_process_for_each_test
-def test_engine_core_concurrent_batches(monkeypatch):
+def test_engine_core_concurrent_batches(monkeypatch: pytest.MonkeyPatch):
"""
Test that the engine can handle multiple concurrent batches.
"""
diff --git a/tests/v1/engine/test_engine_core_client.py b/tests/v1/engine/test_engine_core_client.py
index e646ccbd46030..004b4dc82f4d9 100644
--- a/tests/v1/engine/test_engine_core_client.py
+++ b/tests/v1/engine/test_engine_core_client.py
@@ -90,7 +90,8 @@ def echo(self, msg: str, err_msg: Optional[str] = None) -> str:
@fork_new_process_for_each_test
@pytest.mark.parametrize("multiprocessing_mode", [True, False])
-def test_engine_core_client(monkeypatch, multiprocessing_mode: bool):
+def test_engine_core_client(monkeypatch: pytest.MonkeyPatch,
+ multiprocessing_mode: bool):
with monkeypatch.context() as m:
m.setenv("VLLM_USE_V1", "1")
@@ -175,7 +176,7 @@ def test_engine_core_client(monkeypatch, multiprocessing_mode: bool):
@pytest.mark.asyncio(loop_scope="function")
-async def test_engine_core_client_asyncio(monkeypatch):
+async def test_engine_core_client_asyncio(monkeypatch: pytest.MonkeyPatch):
with monkeypatch.context() as m:
m.setenv("VLLM_USE_V1", "1")
diff --git a/tests/v1/sample/test_logprobs.py b/tests/v1/sample/test_logprobs.py
index e763aa2c86998..3800cb392fbad 100644
--- a/tests/v1/sample/test_logprobs.py
+++ b/tests/v1/sample/test_logprobs.py
@@ -57,7 +57,7 @@ def _repeat_logprob_config(
logprob_prompt_logprob_list: BatchLogprobsSpecType,
) -> BatchLogprobsSpecType:
"""Ensure each test prompt has a logprob config.
-
+
A logprob config specifies the optional (i.e.
may-be-`None`) number of sample logprobs and
the optional number of prompt logprobs.
@@ -80,7 +80,7 @@ def _repeat_logprob_config(
(optional num sample logprob,
optional num prompt logprob)
tuples
-
+
Returns:
list of
(optional num sample logprob,optional num prompt logprob)
@@ -255,14 +255,12 @@ def _run_and_validate(
[NONE, SAMPLE, PROMPT, SAMPLE_PROMPT])
@pytest.mark.parametrize("temperature", [0.0, 2.0])
def test_get_logprobs_and_prompt_logprobs(
- hf_model,
- vllm_model,
- batch_logprobs_composition: BatchLogprobsComposition,
- temperature: float,
- example_prompts,
-) -> None:
+ hf_model, vllm_model,
+ batch_logprobs_composition: BatchLogprobsComposition,
+ temperature: float, example_prompts: list[str],
+ monkeypatch: pytest.MonkeyPatch) -> None:
"""Test V1 Engine logprobs & prompt logprobs
-
+
Exercise a variety of combinations of `logprobs` and `prompt_logprobs`
settings and validate that
* The generated logprobs and prompt logprobs are consistent with the
@@ -279,7 +277,7 @@ def test_get_logprobs_and_prompt_logprobs(
To save time, only test one APC-enabled scenario
(sample & prompt logprobs enabled, temperature>0.0).
-
+
Args:
hf_model: HuggingFace reference model fixture
vllm_model: vLLM model fixture
@@ -287,128 +285,140 @@ def test_get_logprobs_and_prompt_logprobs(
temperature: "temperature" sampling parameter
example_prompts: example prompt fixture
"""
- do_apc = vllm_model.model.llm_engine.cache_config.enable_prefix_caching
- if do_apc and (temperature < 2.0
- or batch_logprobs_composition != SAMPLE_PROMPT):
- # Skip some test-cases to save time.
- pytest.skip()
- test_prompts = example_prompts
+ with monkeypatch.context() as m:
+ m.setenv("VLLM_USE_V1", "1")
+ do_apc = vllm_model.model.llm_engine.cache_config.enable_prefix_caching
+ if do_apc and (temperature < 2.0
+ or batch_logprobs_composition != SAMPLE_PROMPT):
+ # Skip some test-cases to save time.
+ pytest.skip()
+ test_prompts = example_prompts
- max_tokens = 5
- hf_outputs = hf_model.generate_greedy(
- test_prompts,
- max_tokens=max_tokens,
- )
- hf_logprobs = hf_model.generate_greedy_logprobs(
- test_prompts,
- max_tokens=max_tokens,
- )
-
- # Batch has mixed sample params
- # (different logprobs/prompt logprobs combos)
- logprob_prompt_logprob_list = get_test_batch(batch_logprobs_composition)
-
- # Ensure that each test prompt has a logprob config for testing
- logprob_prompt_logprob_list = _repeat_logprob_config(
- test_prompts, logprob_prompt_logprob_list)
- # Generate SamplingParams
- vllm_sampling_params = [
- SamplingParams(max_tokens=max_tokens,
- logprobs=num_lp,
- prompt_logprobs=num_plp,
- temperature=temperature,
- seed=1984)
- for num_lp, num_plp in logprob_prompt_logprob_list
- ]
- for _ in range(2 if do_apc else 1):
- _run_and_validate(
- vllm_model=vllm_model,
- test_prompts=test_prompts,
- vllm_sampling_params=vllm_sampling_params,
- hf_logprobs=hf_logprobs,
- hf_outputs=hf_outputs,
- logprob_prompt_logprob_list=logprob_prompt_logprob_list,
- temperature=temperature,
+ max_tokens = 5
+ hf_outputs = hf_model.generate_greedy(
+ test_prompts,
max_tokens=max_tokens,
- do_apc=do_apc)
+ )
+ hf_logprobs = hf_model.generate_greedy_logprobs(
+ test_prompts,
+ max_tokens=max_tokens,
+ )
+
+ # Batch has mixed sample params
+ # (different logprobs/prompt logprobs combos)
+ logprob_prompt_logprob_list = get_test_batch(
+ batch_logprobs_composition)
+
+ # Ensure that each test prompt has a logprob config for testing
+ logprob_prompt_logprob_list = _repeat_logprob_config(
+ test_prompts, logprob_prompt_logprob_list)
+ # Generate SamplingParams
+ vllm_sampling_params = [
+ SamplingParams(max_tokens=max_tokens,
+ logprobs=num_lp,
+ prompt_logprobs=num_plp,
+ temperature=temperature,
+ seed=1984)
+ for num_lp, num_plp in logprob_prompt_logprob_list
+ ]
+ for _ in range(2 if do_apc else 1):
+ _run_and_validate(
+ vllm_model=vllm_model,
+ test_prompts=test_prompts,
+ vllm_sampling_params=vllm_sampling_params,
+ hf_logprobs=hf_logprobs,
+ hf_outputs=hf_outputs,
+ logprob_prompt_logprob_list=logprob_prompt_logprob_list,
+ temperature=temperature,
+ max_tokens=max_tokens,
+ do_apc=do_apc)
-def test_max_logprobs():
+def test_max_logprobs(monkeypatch: pytest.MonkeyPatch):
"""vLLM v1 engine should fail a request with `logprobs > max_logprobs`
-
Should also fail for `prompt_logprobs > max_logprobs`
-
APC should not matter as this test checks basic request validation.
-
- Args:
- monkeypatch
"""
+ with monkeypatch.context() as m:
+ m.setenv("VLLM_USE_V1", "1")
- runner = VllmRunner("facebook/opt-125m",
- max_logprobs=1,
- enable_prefix_caching=False,
- max_model_len=256)
- vllm_sampling_params = SamplingParams(logprobs=1)
- # should pass
- runner.generate(["Hello world"], sampling_params=vllm_sampling_params)
+ runner = VllmRunner("facebook/opt-125m",
+ max_logprobs=1,
+ enable_prefix_caching=False,
+ max_model_len=256)
+ vllm_sampling_params = SamplingParams(logprobs=1)
+ # should pass
+ runner.generate(["Hello world"], sampling_params=vllm_sampling_params)
- bad_sampling_params = SamplingParams(logprobs=2)
- with pytest.raises(ValueError):
- runner.generate(["Hello world"], sampling_params=bad_sampling_params)
+ bad_sampling_params = SamplingParams(logprobs=2)
+ with pytest.raises(ValueError):
+ runner.generate(["Hello world"],
+ sampling_params=bad_sampling_params)
-def test_none_logprobs(vllm_model, example_prompts):
+def test_none_logprobs(vllm_model, example_prompts,
+ monkeypatch: pytest.MonkeyPatch):
"""Engine should return `logprobs` and `prompt_logprobs` as `None`
-
+
Args:
vllm_model: vLLM model fixture
example_prompts: list of example prompts (test fixture)
"""
- max_tokens = 5
+ with monkeypatch.context() as m:
+ m.setenv("VLLM_USE_V1", "1")
+ max_tokens = 5
- sampling_params_logprobs_none = SamplingParams(max_tokens=max_tokens,
- logprobs=None,
- prompt_logprobs=None,
- temperature=0.0)
- results_logprobs_none = vllm_model.model.generate(
- example_prompts, sampling_params=sampling_params_logprobs_none)
+ sampling_params_logprobs_none = SamplingParams(
+ max_tokens=max_tokens,
+ logprobs=None,
+ prompt_logprobs=None,
+ temperature=0.0,
+ )
+ results_logprobs_none = vllm_model.model.generate(
+ example_prompts,
+ sampling_params=sampling_params_logprobs_none,
+ )
- for i in range(len(results_logprobs_none)):
- # Check sample logprobs are None
- assert results_logprobs_none[i].outputs[0].logprobs is None
- assert results_logprobs_none[i].outputs[0].cumulative_logprob is None
- # Check prompt logprobs are None
- assert results_logprobs_none[i].prompt_logprobs is None
+ for i in range(len(results_logprobs_none)):
+ # Check sample logprobs are None
+ assert results_logprobs_none[i].outputs[0].logprobs is None
+ assert results_logprobs_none[i].outputs[
+ 0].cumulative_logprob is None
+ # Check prompt logprobs are None
+ assert results_logprobs_none[i].prompt_logprobs is None
-def test_zero_logprobs(vllm_model, example_prompts):
+def test_zero_logprobs(vllm_model, example_prompts,
+ monkeypatch: pytest.MonkeyPatch):
"""Engine should return sampled token and prompt token logprobs
-
+
Args:
vllm_model: vLLM model fixture
example_prompts: list of example prompts (test fixture)
"""
- max_tokens = 5
+ with monkeypatch.context() as m:
+ m.setenv("VLLM_USE_V1", "1")
+ max_tokens = 5
- sampling_params_logprobs_zero = SamplingParams(max_tokens=max_tokens,
- logprobs=0,
- prompt_logprobs=0,
- temperature=0.0)
- results_logprobs_zero = vllm_model.model.generate(
- example_prompts, sampling_params=sampling_params_logprobs_zero)
+ sampling_params_logprobs_zero = SamplingParams(max_tokens=max_tokens,
+ logprobs=0,
+ prompt_logprobs=0,
+ temperature=0.0)
+ results_logprobs_zero = vllm_model.model.generate(
+ example_prompts, sampling_params=sampling_params_logprobs_zero)
- for i in range(len(results_logprobs_zero)):
- # Check that there is one sample logprob dict for each
- # sample token
- logprobs = results_logprobs_zero[i].outputs[0].logprobs
- prompt_logprobs = results_logprobs_zero[i].prompt_logprobs
- sampled_token_ids = results_logprobs_zero[i].outputs[0].token_ids
- prompt_token_ids = results_logprobs_zero[i].prompt_token_ids
- assert logprobs is not None
- assert len(sampled_token_ids) == len(logprobs)
- assert results_logprobs_zero[i].outputs[
- 0].cumulative_logprob is not None
- # Check that there is one prompt logprob dict for each
- # prompt token
- assert prompt_logprobs is not None
- assert len(prompt_token_ids) == len(prompt_logprobs)
+ for i in range(len(results_logprobs_zero)):
+ # Check that there is one sample logprob dict for each
+ # sample token
+ logprobs = results_logprobs_zero[i].outputs[0].logprobs
+ prompt_logprobs = results_logprobs_zero[i].prompt_logprobs
+ sampled_token_ids = results_logprobs_zero[i].outputs[0].token_ids
+ prompt_token_ids = results_logprobs_zero[i].prompt_token_ids
+ assert logprobs is not None
+ assert len(sampled_token_ids) == len(logprobs)
+ assert results_logprobs_zero[i].outputs[
+ 0].cumulative_logprob is not None
+ # Check that there is one prompt logprob dict for each
+ # prompt token
+ assert prompt_logprobs is not None
+ assert len(prompt_token_ids) == len(prompt_logprobs)
diff --git a/tests/v1/tpu/test_basic.py b/tests/v1/tpu/test_basic.py
index 0309f545ea49e..241f49e4faea8 100644
--- a/tests/v1/tpu/test_basic.py
+++ b/tests/v1/tpu/test_basic.py
@@ -3,11 +3,16 @@
Run `pytest tests/v1/tpu/test_basic.py`.
"""
+from __future__ import annotations
+
+from typing import TYPE_CHECKING
+
import pytest
from vllm.platforms import current_platform
-from ...conftest import VllmRunner
+if TYPE_CHECKING:
+ from tests.conftest import VllmRunner
MODELS = [
# "Qwen/Qwen2-7B-Instruct",
@@ -28,7 +33,8 @@ TENSOR_PARALLEL_SIZES = [1]
@pytest.mark.parametrize("enforce_eager", [True])
@pytest.mark.parametrize("tensor_parallel_size", TENSOR_PARALLEL_SIZES)
def test_models(
- monkeypatch,
+ vllm_runner: type[VllmRunner],
+ monkeypatch: pytest.MonkeyPatch,
model: str,
max_tokens: int,
enforce_eager: bool,
@@ -41,7 +47,7 @@ def test_models(
with monkeypatch.context() as m:
m.setenv("VLLM_USE_V1", "1")
- with VllmRunner(
+ with vllm_runner(
model,
max_model_len=8192,
enforce_eager=enforce_eager,
@@ -50,5 +56,5 @@ def test_models(
tensor_parallel_size=tensor_parallel_size) as vllm_model:
vllm_outputs = vllm_model.generate_greedy(example_prompts,
max_tokens)
- output = vllm_outputs[0][1]
- assert "1024" in output
+ output = vllm_outputs[0][1]
+ assert "1024" in output
From 583a9778e0bc65b031bc3e430d8f13655f727ec7 Mon Sep 17 00:00:00 2001
From: Simon Mo
Date: Sun, 16 Mar 2025 21:48:11 -0700
Subject: [PATCH 21/34] [Benchmark] Do not save detailed info to json by
default (#14879)
Signed-off-by: simon-mo
---
benchmarks/backend_request_func.py | 5 ++++-
benchmarks/benchmark_serving.py | 15 +++++++++++++++
2 files changed, 19 insertions(+), 1 deletion(-)
diff --git a/benchmarks/backend_request_func.py b/benchmarks/backend_request_func.py
index 6a7db920b5b63..09c8e23ebb1c3 100644
--- a/benchmarks/backend_request_func.py
+++ b/benchmarks/backend_request_func.py
@@ -14,7 +14,8 @@ from tqdm.asyncio import tqdm
from transformers import (AutoTokenizer, PreTrainedTokenizer,
PreTrainedTokenizerFast)
-from vllm.model_executor.model_loader.weight_utils import get_lock
+# NOTE(simon): do not import vLLM here so the benchmark script
+# can run without vLLM installed.
AIOHTTP_TIMEOUT = aiohttp.ClientTimeout(total=6 * 60 * 60)
@@ -427,6 +428,8 @@ def get_model(pretrained_model_name_or_path: str) -> str:
if os.getenv('VLLM_USE_MODELSCOPE', 'False').lower() == 'true':
from modelscope import snapshot_download
+ from vllm.model_executor.model_loader.weight_utils import get_lock
+
# Use file lock to prevent multiple processes from
# downloading the same model weights at the same time.
with get_lock(pretrained_model_name_or_path):
diff --git a/benchmarks/benchmark_serving.py b/benchmarks/benchmark_serving.py
index 1dd01ca968678..47627126b6688 100644
--- a/benchmarks/benchmark_serving.py
+++ b/benchmarks/benchmark_serving.py
@@ -684,6 +684,15 @@ def main(args: argparse.Namespace):
"Invalid metadata format. Please use KEY=VALUE format."
)
+ if not args.save_detailed:
+ # Remove fields with too many data points
+ for field in [
+ "input_lens", "output_lens", "ttfts", "itls",
+ "generated_texts", "errors"
+ ]:
+ if field in result_json:
+ del result_json[field]
+
# Traffic
result_json["request_rate"] = (args.request_rate if args.request_rate
< float("inf") else "inf")
@@ -828,6 +837,12 @@ if __name__ == "__main__":
action="store_true",
help="Specify to save benchmark results to a json file",
)
+ parser.add_argument(
+ "--save-detailed",
+ action="store_true",
+ help="When saving the results, whether to include per request "
+ "information such as response, error, ttfs, tpots, etc.",
+ )
parser.add_argument(
"--metadata",
metavar="KEY=VALUE",
From 8d6cf89526ff983b7eb74aad3903138004ae95cd Mon Sep 17 00:00:00 2001
From: Lily Liu
Date: Sun, 16 Mar 2025 22:00:20 -0700
Subject: [PATCH 22/34] [V1] [Spec Decode] Support random sampling for spec
decode (#13933)
Co-authored-by: Woosuk Kwon
---
tests/v1/sample/test_rejection_sampler.py | 301 +++++++++++++---
vllm/v1/sample/rejection_sampler.py | 400 +++++++++++++++-------
vllm/v1/sample/sampler.py | 8 -
vllm/v1/spec_decode/utils.py | 22 ++
vllm/v1/worker/gpu_model_runner.py | 31 +-
5 files changed, 568 insertions(+), 194 deletions(-)
create mode 100644 vllm/v1/spec_decode/utils.py
diff --git a/tests/v1/sample/test_rejection_sampler.py b/tests/v1/sample/test_rejection_sampler.py
index 190927745f1fe..84139a40b544a 100644
--- a/tests/v1/sample/test_rejection_sampler.py
+++ b/tests/v1/sample/test_rejection_sampler.py
@@ -1,37 +1,51 @@
# SPDX-License-Identifier: Apache-2.0
+from typing import Any, Optional
import pytest
import torch
+import torch.nn.functional as F
from vllm.v1.sample.metadata import SamplingMetadata
from vllm.v1.sample.rejection_sampler import INVALID_TOKEN_ID, RejectionSampler
+DEVICE = "cpu"
+
@pytest.fixture
def sampler():
return RejectionSampler()
-def create_logits_tensor(token_ids: list[int],
+def create_logits_tensor(token_ids: list[list[int]],
vocab_size: int = 100) -> torch.Tensor:
"""Helper function to create logits tensor that
will produce desired token ids on argmax"""
- logits = torch.full((len(token_ids), vocab_size), -100.0).cuda()
- for i, token_id in enumerate(token_ids):
- logits[i, token_id] = 100.0
+ num_total_tokens = sum(len(tokens) for tokens in token_ids)
+ logits = torch.full((num_total_tokens, vocab_size), -100.0, device=DEVICE)
+ start_loc = 0
+ for tokens in token_ids:
+ for j, token_id in enumerate(tokens):
+ logits[start_loc + j, token_id] = 100.0
+ start_loc += len(tokens)
return logits
-def create_sampling_metadata(spec_tokens: list[list[int]]) -> SamplingMetadata:
- batch_size = len(spec_tokens)
+def create_sampling_metadata(
+ all_greedy: bool,
+ generators: Optional[dict[int, Any]] = None) -> SamplingMetadata:
+ """Create a v1 sampling metadata object with all_greedy set
+ to the given value. Either all greedy or all random sampling
+ is used.
+ """
+ generators = generators or {}
return SamplingMetadata(
temperature=torch.tensor([]),
- all_greedy=True,
- all_random=False,
+ all_greedy=all_greedy,
+ all_random=not all_greedy,
top_p=None,
top_k=None,
- min_p=torch.empty(batch_size, ),
- generators={},
+ min_p=torch.empty(1, ),
+ generators=generators,
max_num_logprobs=0,
no_penalties=False,
prompt_token_ids=None,
@@ -40,129 +54,310 @@ def create_sampling_metadata(spec_tokens: list[list[int]]) -> SamplingMetadata:
repetition_penalties=torch.tensor([]),
output_token_ids=[],
min_tokens={},
- logit_bias=[None] * batch_size,
+ logit_bias=[None],
allowed_token_ids_mask=None,
bad_words_token_ids={},
)
+########################### Tests for Greedy Sampling ###################
def test_perfect_match(sampler):
"""Test when output tokens perfectly match speculated tokens"""
spec_tokens = [[1, 2, 3]]
- output_tokens = [1, 2, 3, 4] # 4 is the bonus token
+ output_tokens = [[1, 2, 3, 4]] # 4 is the bonus token
- metadata = create_sampling_metadata(spec_tokens)
+ metadata = create_sampling_metadata(all_greedy=True)
logits = create_logits_tensor(output_tokens)
+ bonus_token_tensor = torch.tensor([output_tokens[0][-1]],
+ device=logits.device)
- output = sampler(spec_tokens, logits, metadata)
+ output = sampler(spec_tokens, None, bonus_token_tensor, logits, metadata)
expected = torch.tensor([[1, 2, 3, 4]],
dtype=torch.int,
device=logits.device)
- assert torch.equal(output.sampled_token_ids, expected)
+ assert torch.equal(output, expected)
def test_early_mismatch(sampler):
"""Test when there's an early mismatch in tokens"""
spec_tokens = [[1, 2, 3]]
- output_tokens = [1, 5, 3, 4] # Mismatch at position 1
+ output_tokens = [[1, 5, 3, 4]] # Mismatch at position 1
- metadata = create_sampling_metadata(spec_tokens)
+ metadata = create_sampling_metadata(all_greedy=True)
logits = create_logits_tensor(output_tokens)
+ bonus_token_tensor = torch.tensor([output_tokens[0][-1]],
+ device=logits.device)
- output = sampler(spec_tokens, logits, metadata)
+ output = sampler(spec_tokens, None, bonus_token_tensor, logits, metadata)
expected = torch.tensor([[1, 5, INVALID_TOKEN_ID, INVALID_TOKEN_ID]],
dtype=torch.int,
device=logits.device)
- assert torch.equal(output.sampled_token_ids, expected)
+ assert torch.equal(output, expected)
def test_multiple_sequences(sampler):
"""Test handling multiple sequences of speculated tokens"""
spec_tokens = [[1, 2], [3]]
- output_tokens = [1, 2, 5, 3, 4] # Two sequences with bonus tokens 5 and 4
+ output_tokens = [[1, 2, 5], [3,
+ 4]] # Two sequences with bonus tokens 5 and 4
- metadata = create_sampling_metadata(spec_tokens)
+ metadata = create_sampling_metadata(all_greedy=True)
logits = create_logits_tensor(output_tokens)
+ bonus_token_tensor = torch.tensor(
+ [output_tokens[0][-1], output_tokens[1][-1]], device=logits.device)
- output = sampler(spec_tokens, logits, metadata)
+ output = sampler(spec_tokens, None, bonus_token_tensor, logits, metadata)
expected = torch.tensor([[1, 2, 5], [3, 4, INVALID_TOKEN_ID]],
dtype=torch.int,
device=logits.device)
- assert torch.equal(output.sampled_token_ids, expected)
+ assert torch.equal(output, expected)
def test_single_token_sequence(sampler):
"""Test handling sequences with single token"""
spec_tokens = [[1]]
- output_tokens = [1, 2] # Single token with bonus token 2
+ output_tokens = [[1, 2]] # Single token with bonus token 2
- metadata = create_sampling_metadata(spec_tokens)
+ metadata = create_sampling_metadata(all_greedy=True)
logits = create_logits_tensor(output_tokens)
+ bonus_token_tensor = torch.tensor([output_tokens[0][-1]],
+ device=logits.device)
- output = sampler(spec_tokens, logits, metadata)
+ output = sampler(spec_tokens, None, bonus_token_tensor, logits, metadata)
expected = torch.tensor([[1, 2]], dtype=torch.int, device=logits.device)
- assert torch.equal(output.sampled_token_ids, expected)
+ assert torch.equal(output, expected)
def test_empty_sequence(sampler):
"""Test handling empty sequence of speculated tokens"""
spec_tokens: list[list[int]] = [[]]
- output_tokens = [5] # Just the bonus token
+ output_tokens = [[5]] # Just the bonus token
- metadata = create_sampling_metadata(spec_tokens)
+ metadata = create_sampling_metadata(all_greedy=True)
logits = create_logits_tensor(output_tokens)
+ bonus_token_tensor = torch.tensor([output_tokens[0][-1]],
+ device=logits.device)
- output = sampler(spec_tokens, logits, metadata)
+ output = sampler(spec_tokens, None, bonus_token_tensor, logits, metadata)
expected = torch.tensor([[5]], dtype=torch.int, device=logits.device)
- assert torch.equal(output.sampled_token_ids, expected)
+ assert torch.equal(output, expected)
def test_multiple_mismatches(sampler):
"""Test handling multiple sequences with mismatches"""
spec_tokens = [[1, 2, 3], [4, 5, 6]]
- output_tokens = [1, 2, 7, 6, 4, 8, 6, 9] # Mismatches in both sequences
+ output_tokens = [[1, 2, 7, 6], [4, 8, 6,
+ 9]] # Mismatches in both sequences
- metadata = create_sampling_metadata(spec_tokens)
+ metadata = create_sampling_metadata(all_greedy=True)
logits = create_logits_tensor(output_tokens)
+ bonus_token_tensor = torch.tensor(
+ [output_tokens[0][-1], output_tokens[1][-1]], device=logits.device)
- output = sampler(spec_tokens, logits, metadata)
+ output = sampler(spec_tokens, None, bonus_token_tensor, logits, metadata)
expected = torch.tensor([[1, 2, 7, INVALID_TOKEN_ID],
[4, 8, INVALID_TOKEN_ID, INVALID_TOKEN_ID]],
dtype=torch.int,
device=logits.device)
- assert torch.equal(output.sampled_token_ids, expected)
+ assert torch.equal(output, expected)
@pytest.mark.parametrize(
"spec_tokens,output_tokens,expected",
[
- ([[1, 2]], [1, 2, 3], [[1, 2, 3]]), # Perfect match with bonus
- ([[1]], [2, 3], [[2, INVALID_TOKEN_ID]]), # First mismatch
- ([[1, 2], [3, 4]], [1, 5, 6, 3, 4, 7], [[1, 5, INVALID_TOKEN_ID],
- [3, 4, 7]]), # Mixed matches
+ ([[1, 2]], [[1, 2, 3]], [[1, 2, 3]]), # Perfect match with bonus
+ ([[1]], [[2, 3]], [[2, INVALID_TOKEN_ID]]), # First mismatch
+ ([[1, 2], [3, 4]], [[1, 5, 6], [3, 4, 7]],
+ [[1, 5, INVALID_TOKEN_ID], [3, 4, 7]]), # Mixed matches
])
def test_parametrized_cases(sampler, spec_tokens, output_tokens, expected):
"""Parametrized test for various matching scenarios"""
- metadata = create_sampling_metadata(spec_tokens)
+ metadata = create_sampling_metadata(all_greedy=True)
logits = create_logits_tensor(output_tokens)
+ bonus_token_tensor = torch.tensor([tokens[-1] for tokens in output_tokens],
+ device=logits.device)
- output = sampler(spec_tokens, logits, metadata)
+ output = sampler(spec_tokens, None, bonus_token_tensor, logits, metadata)
expected_tensor = torch.tensor(expected,
dtype=torch.int,
device=logits.device)
- assert torch.equal(output.sampled_token_ids, expected_tensor)
+ assert torch.equal(output, expected_tensor)
-def test_logits_shape_handling(sampler):
- """Test handling of different logits tensor shapes"""
- spec_tokens = [[1, 2]]
- output_tokens = [1, 2, 3]
- vocab_size = 1000
+########################### Tests for Random Sampling ###################
+@pytest.mark.parametrize("k", [1, 3, 5])
+@pytest.mark.parametrize("vocab_size", [1000])
+@pytest.mark.parametrize("batch_size", [1, 4, 8])
+@pytest.mark.parametrize("frac_seeded", [0.0, 0.5])
+@pytest.mark.parametrize("n_rep", [20])
+def test_deterministic_when_seeded(sampler, k: int, vocab_size: int,
+ batch_size: int, frac_seeded: float,
+ n_rep: int):
+ draft_probs = torch.rand(batch_size, k, vocab_size, dtype=torch.float32)
+ target_probs = torch.rand(batch_size * (k + 1),
+ vocab_size,
+ dtype=torch.float32)
+ bonus_token_ids = torch.randint(low=0,
+ high=vocab_size,
+ size=(batch_size, 1),
+ dtype=torch.int64)
+ draft_token_ids = torch.randint(low=0,
+ high=vocab_size,
+ size=(batch_size, k),
+ dtype=torch.int64)
- metadata = create_sampling_metadata(spec_tokens)
- logits = create_logits_tensor(output_tokens, vocab_size)
+ seeded_mask = torch.rand(batch_size, dtype=torch.float32) <= frac_seeded
- output = sampler(spec_tokens, logits, metadata)
- expected = torch.tensor([[1, 2, 3]], dtype=torch.int, device=logits.device)
- assert torch.equal(output.sampled_token_ids, expected)
- assert logits.shape[-1] == vocab_size
+ results = []
+ for _ in range(n_rep):
+ seeded_seqs = {
+ i: torch.Generator(device=DEVICE).manual_seed(i)
+ for i in range(batch_size) if seeded_mask[i]
+ }
+
+ sampling_metadata = create_sampling_metadata(all_greedy=False,
+ generators=seeded_seqs)
+ rep_result = sampler(draft_token_ids.tolist(), draft_probs,
+ bonus_token_ids, target_probs, sampling_metadata)
+
+ results.append(rep_result)
+
+ for i in range(batch_size):
+ if seeded_mask[i]:
+ for j in range(1, n_rep):
+ assert torch.equal(results[j][i], results[0][i])
+
+
+def test_rejection_sampling_approximates_target_distribution():
+ """Verify rejection sampling approximates target distribution,
+ despite sampling from a potentially distinct draft distribution.
+
+ This is done by first creating a random target probability
+ distribution and a random draft probability distribution. We then
+ sample token ids from the rejection sampler using these draft
+ and target distributions. The samples are used to estimate
+ the output probability distribution, which we expect to approximate
+ the target distribution.
+
+ A basic distance metric is used to determine similarity between
+ distributions.
+
+ We expect that as we increase the number of samples,
+ the distance between the observed distribution and the target
+ distribution decreases. To measure this, we compare the distance
+ of the observed distribution against both the target distribution
+ and a uniform random distribution. We expect the distance between
+ the observed distribution and the target distribution to improve
+ much more than the distance improvement between the observed
+ distribution and the random distribution.
+ """
+ torch.set_default_device(DEVICE)
+ vocab_size = 10
+ k = 2
+ num_reference_probs = 100
+
+ # Prepare draft, target, and reference probability distributions
+ draft_probs, target_probs = (F.softmax(
+ torch.rand(vocab_size, dtype=torch.float32),
+ dim=-1,
+ ) for _ in range(2))
+ reference_probs = F.softmax(
+ torch.rand(num_reference_probs, vocab_size, dtype=torch.float32),
+ dim=-1,
+ )
+
+ sample_sizes = [10, 100, 1_000, 10_000, 100_000]
+ distance_wrt_reference: list[float] = []
+ distance_wrt_target: list[float] = []
+
+ for num_samples in sample_sizes:
+ # Sample using rejection sampling.
+ rej_sample_probs = estimate_rejection_sampling_pdf(
+ draft_probs, target_probs, k, vocab_size, num_samples)
+ rej_sample_probs = rej_sample_probs.to(DEVICE)
+
+ # Average distance from reference probs.
+ reference_vs_rejsample_dist = torch.dist(
+ reference_probs,
+ rej_sample_probs).item() / reference_probs.shape[0]
+ target_vs_rejsample_dist = torch.dist(target_probs,
+ rej_sample_probs).item()
+
+ distance_wrt_reference.append(reference_vs_rejsample_dist)
+ distance_wrt_target.append(target_vs_rejsample_dist)
+
+ relative_change_in_distance_wrt_target = get_ratio_first_to_last(
+ distance_wrt_target)
+ relative_change_in_distance_wrt_reference = get_ratio_first_to_last(
+ distance_wrt_reference)
+
+ print(f"{num_samples=} {target_vs_rejsample_dist=:.05f} "
+ f"{reference_vs_rejsample_dist=:.05f}")
+ print(f"{num_samples=} {relative_change_in_distance_wrt_target=:.02f} "
+ f"{relative_change_in_distance_wrt_reference=:.02f}")
+
+ relative_change_in_distance_wrt_target = get_ratio_first_to_last(
+ distance_wrt_target)
+ relative_change_in_distance_wrt_reference = get_ratio_first_to_last(
+ distance_wrt_reference)
+
+ expected_improvement_multiplier = 20
+ assert (relative_change_in_distance_wrt_target
+ > relative_change_in_distance_wrt_reference *
+ expected_improvement_multiplier)
+
+
+def get_ratio_first_to_last(elements: list[float]) -> float:
+ return elements[0] / elements[-1]
+
+
+def estimate_rejection_sampling_pdf(
+ draft_probs: torch.Tensor,
+ target_probs: torch.Tensor,
+ k: int,
+ vocab_size: int,
+ num_samples: int,
+) -> torch.Tensor:
+ """Estimate the probability distribution of the output tokens
+ using rejection sampling.
+
+ Args:
+ draft_probs: Draft probability distribution.
+ target_probs: Target probability distribution.
+ num_samples: Number of samples to draw.
+
+ Returns:
+ Estimated probability distribution of the output tokens.
+ """
+ sampler = RejectionSampler()
+ # Repeat draft probs num_samples times.
+ draft_probs = draft_probs.reshape(1, 1,
+ vocab_size).repeat(num_samples, k, 1)
+
+ # Repeat target probs num_samples * (k + 1) times.
+ target_probs = target_probs.reshape(1, 1, vocab_size).repeat(
+ num_samples, k + 1, 1).reshape(num_samples * (k + 1), vocab_size)
+
+ # Randomly sample draft token ids from draft probs.
+ draft_token_ids = torch.multinomial(draft_probs[:, 0, :],
+ num_samples=k,
+ replacement=True).reshape(
+ num_samples, k)
+
+ # Bonus tokens not used but required.
+ bonus_token_ids = torch.zeros((1, 1), dtype=torch.int64,
+ device=DEVICE).repeat(num_samples, 1)
+
+ sampling_metadata = create_sampling_metadata(all_greedy=False)
+ output_token_ids = sampler(draft_token_ids.tolist(), draft_probs,
+ bonus_token_ids, target_probs,
+ sampling_metadata)
+ output_token_ids = output_token_ids[:, :-1].flatten()
+
+ hist = torch.histogram(output_token_ids.to(dtype=torch.float,
+ device="cpu"),
+ bins=vocab_size,
+ range=(0, vocab_size),
+ density=True)
+
+ return hist.hist
diff --git a/vllm/v1/sample/rejection_sampler.py b/vllm/v1/sample/rejection_sampler.py
index ea7f3353c115f..5601c62e91fc0 100644
--- a/vllm/v1/sample/rejection_sampler.py
+++ b/vllm/v1/sample/rejection_sampler.py
@@ -1,87 +1,89 @@
# SPDX-License-Identifier: Apache-2.0
+from typing import Optional
import torch
import torch.nn as nn
from torch.nn.utils.rnn import pad_sequence
-from vllm import envs
from vllm.logger import init_logger
-from vllm.platforms import current_platform
-from vllm.v1.outputs import SamplerOutput
from vllm.v1.sample.metadata import SamplingMetadata
-
-try:
- import flashinfer.sampling as fs
- is_flashinfer_available = True
-except ImportError:
- is_flashinfer_available = False
+from vllm.v1.spec_decode.utils import random_sample
logger = init_logger(__name__)
INVALID_TOKEN_ID = -1
class RejectionSampler(nn.Module):
+ """
+ The implementation strictly follows the algorithm described in
+ https://arxiv.org/abs/2211.17192.
+ However, we want to clarify the terminology used in the implementation:
+ accepted tokens: tokens that are accepted based on the relationship
+ between the "raw" draft and target probabilities.
+ recovered tokens: tokens that are sampled based on the adjusted probability
+ distribution, which is derived from both the draft and target
+ probabilities.
+ bonus tokens:
+ If all proposed tokens are accepted, the bonus token is added to the
+ end of the sequence. The bonus token is only sampled from the target
+ probabilities. We pass in the bonus tokens instead of sampling them
+ in the rejection sampler to allow for more flexibility in the
+ sampling process. For example, we can use top_p, top_k sampling for
+ bonus tokens, while spec decode does not support these sampling
+ strategies.
+ output tokens:
+ Tokens are finally generated with the rejection sampler.
+ output tokens = accepted tokens + recovered tokens + bonus tokens
+ """
def __init__(self):
super().__init__()
- if current_platform.is_cuda():
- if is_flashinfer_available:
- if envs.VLLM_USE_FLASHINFER_SAMPLER is not False:
- # FIXME(woosuk): Currently, we have errors when using
- # FlashInfer for rejection sampling. As a workaround, we
- # disable FlashInfer for rejection sampling by default.
- logger.info("Currently, FlashInfer rejection sampler is "
- "disabled because of a bug. Falling back to "
- "the PyTorch-native implementation of "
- "rejection sampling.")
- self.forward_method = self.forward_native
- # NOTE(woosuk): The V0 sampler doesn't use FlashInfer for
- # sampling unless VLLM_USE_FLASHINFER_SAMPLER=1 (i.e., by
- # default it is unused). For backward compatibility, we set
- # `VLLM_USE_FLASHINFER_SAMPLER` as None by default and
- # interpret it differently in V0 and V1 samplers: In V0,
- # None means False, while in V1, None means True. This is
- # why we use the condition
- # `envs.VLLM_USE_FLASHINFER_SAMPLER is not False` here.
- # logger.info("Using FlashInfer for rejection sampling.")
- # self.forward_method = self.flashinfer_sample
- else:
- logger.warning(
- "FlashInfer is available, but it is not enabled. "
- "Falling back to the PyTorch-native implementation of "
- "rejection sampling. For the best performance, "
- "please set VLLM_USE_FLASHINFER_SAMPLER=1.")
- self.forward_method = self.forward_native
- else:
- logger.warning(
- "FlashInfer is not available. Falling back to the PyTorch-"
- "native implementation of rejection sampling. For the "
- "best performance, please install FlashInfer.")
- self.forward_method = self.forward_native
- else:
- self.forward_method = self.forward_native
-
- def forward(self, draft_token_ids: list[list[int]],
- target_probs: torch.Tensor,
- sampling_metadata: SamplingMetadata) -> SamplerOutput:
- if not sampling_metadata.all_greedy:
- raise NotImplementedError(
- "Currently, only greedy sampling is supported by "
- "rejection sampler.")
- return self.forward_method(draft_token_ids, target_probs,
- sampling_metadata)
-
- def flashinfer_sample(
+ def forward(
self,
draft_token_ids: list[list[int]],
- target_probs: torch.Tensor,
+ draft_probs: Optional[torch.Tensor],
+ bonus_token_ids_tensor: torch.Tensor, # [batch_size, 1]
+ target_probs: torch.Tensor, # [num_total_tokens, vocab_size]
sampling_metadata: SamplingMetadata,
- ) -> SamplerOutput:
+ ) -> torch.Tensor:
+ '''
+ Args:
+ draft_token_ids (List[List[int]]):
+ A 2D list of token IDs for each request in the batch.
+ Each request might have different number of draft tokens.
+ It may also contain empty lists for requests that have
+ no draft tokens.
+ draft_probs (Optional[torch.Tensor]):
+ Probability distribution for the draft tokens. Shape is
+ [batch_size, max_spec_len, vocab_size]. Can be None if
+ probabilities are not provided, which is the case for
+ ngram spec decode.
+ bonus_token_ids_tensor (torch.Tensor):
+ A tensor containing bonus tokens. Shape is [batch_size, 1].
+ Bonus tokens are added to the end of the sequence if all
+ proposed tokens are accepted. We generate the bonus tokens
+ outside of the rejection sampler with the default sampling
+ strategy. It allows for more flexibility in the sampling
+ process such as top_p, top_k sampling.
+ target_probs (torch.Tensor):
+ Target model probability distribution.
+ Shape is [num_total_tokens, vocab_size]. num_total_tokens
+ is the total number of tokens from all requests. Here,
+ probabilities from different requests are flattened into
+ a single tensor because this is the shape of the output
+ logits.
+ sampling_metadata (SamplingMetadata):
+ Additional metadata needed for sampling, such as temperature,
+ top-k/top-p parameters, or other relevant information.
+ Returns:
+ output_token_ids (torch.Tensor):
+ A tensor containing the final output token IDs.
+ '''
+
# NOTE: The following input preparationg can be moved
# to the model runner with a persistent manner for better
# performance.
- sample_lens = [len(x) + 1 for x in draft_token_ids]
# Convert draft token IDs to a tensor, split by sample_lens, then pad.
draft_token_ids = [
torch.tensor(x, dtype=int, device='cpu') for x in draft_token_ids
@@ -90,90 +92,171 @@ class RejectionSampler(nn.Module):
batch_first=True,
padding_value=INVALID_TOKEN_ID)
- if sampling_metadata.all_greedy:
- target_token_ids = target_probs.argmax(dim=-1).view(-1)
- target_token_ids = target_token_ids.split(sample_lens)
- target_token_ids = pad_sequence(target_token_ids,
- batch_first=True,
- padding_value=INVALID_TOKEN_ID)
+ # NOTE: CPU <-> GPU synchronization happens here.
+ draft_token_ids_tensor = draft_token_ids_tensor.to(target_probs.device)
+ # Create one-hot tensor for draft token ids.
+ # This is used for ngram where we don't have draft_probs.
+ if draft_probs is None and not sampling_metadata.all_greedy:
vocab_size = target_probs.size(-1)
- # NOTE: CPU <-> GPU synchronization happens here.
- draft_token_ids_tensor = draft_token_ids_tensor.to(
- target_probs.device)
draft_probs = _create_greedy_token_probs(draft_token_ids_tensor,
vocab_size,
target_probs.device)
- target_probs = _create_greedy_token_probs(target_token_ids,
- vocab_size,
- target_probs.device)
- uniform_samples = torch.zeros(draft_token_ids_tensor.size(0),
- draft_token_ids_tensor.size(1) + 1,
- device=target_probs.device)
- else:
- raise NotImplementedError(
- "Currently, only greedy sampling is supported by "
- "rejection sampler.")
+ sample_lens = [len(x) + 1 for x in draft_token_ids]
+ target_probs = _convert_2d_probs(target_probs, sample_lens)
- sampled_token_ids, _, _ = fs.chain_speculative_sampling(
- draft_probs,
- draft_token_ids_tensor,
- uniform_samples,
- target_probs,
- )
- return SamplerOutput(sampled_token_ids=sampled_token_ids,
- logprobs_tensors=None)
+ return self.forward_native(draft_token_ids_tensor, draft_probs,
+ bonus_token_ids_tensor, target_probs,
+ sampling_metadata)
# TODO: The following method can be optimized for better performance.
def forward_native(
self,
- draft_token_ids: list[list[int]],
+ draft_token_ids_tensor: torch.Tensor,
+ # [batch_size, max_spec_len, vocab_size]
+ draft_probs: Optional[torch.Tensor],
+ bonus_token_ids_tensor: torch.Tensor,
+ # [batch_size, max_spec_len + 1, vocab_size]
target_probs: torch.Tensor,
sampling_metadata: SamplingMetadata,
- ) -> SamplerOutput:
- sample_lens = [len(x) + 1 for x in draft_token_ids]
- # Convert draft token IDs to a tensor, split by sample_lens, then pad.
- draft_token_ids = [
- torch.tensor(x, dtype=int, device='cpu') for x in draft_token_ids
- ]
- draft_token_ids_tensor = pad_sequence(draft_token_ids,
- batch_first=True,
- padding_value=INVALID_TOKEN_ID)
- draft_token_ids_tensor = draft_token_ids_tensor.to(target_probs.device)
+ ) -> torch.Tensor:
# Add 1 to include the 'bonus' token.
if sampling_metadata.all_greedy:
- output_token_ids = target_probs.argmax(dim=-1).view(-1)
- output_token_ids = output_token_ids.split(sample_lens)
- output_token_ids = pad_sequence(output_token_ids,
- batch_first=True,
- padding_value=INVALID_TOKEN_ID)
# Produce a mask that remains 1 (True) until the first
# mismatch (cumprod turns 0 after a mismatch).
- accept_mask = (
- output_token_ids[:, :-1] == draft_token_ids_tensor).cumprod(
- dim=1)
- else:
- raise NotImplementedError(
- "Currently, only greedy sampling is supported by "
- "rejection sampler.")
- # Identify valid positions (non-padding).
- valid_mask = output_token_ids != INVALID_TOKEN_ID
- # Generate mask with bonus token.
- generate_mask = torch.cat([
- accept_mask,
- torch.zeros(accept_mask.size(0), 1, device=accept_mask.device)
- ],
- dim=1).to(torch.bool) & valid_mask
- zeros_mask = (generate_mask == 0)
- first_zero_idx = zeros_mask.float().argmax(dim=1)
- # Figure out which rows actually contain at least one zero.
- rows_with_zero = zeros_mask.any(dim=1)
- # Use indexing to set the first zero in each of those rows to 1.
- generate_mask[rows_with_zero, first_zero_idx[rows_with_zero]] = 1
+ target_token_ids_tensor = target_probs.argmax(dim=-1)
+ accept_mask = (target_token_ids_tensor[:, :-1] ==
+ draft_token_ids_tensor).cumprod(dim=1)
- output_token_ids[~generate_mask] = INVALID_TOKEN_ID
- return SamplerOutput(sampled_token_ids=output_token_ids,
- logprobs_tensors=None)
+ # Identify valid positions (non-padding).
+ valid_mask = target_token_ids_tensor != INVALID_TOKEN_ID
+ # Generate mask with bonus token.
+ generate_mask = torch.cat([
+ accept_mask,
+ torch.zeros(accept_mask.size(0), 1, device=accept_mask.device)
+ ],
+ dim=1).to(torch.bool) & valid_mask
+ zeros_mask = (generate_mask == 0)
+ first_zero_idx = zeros_mask.float().argmax(dim=1)
+ # Figure out which rows actually contain at least one zero.
+ rows_with_zero = zeros_mask.any(dim=1)
+ # Use indexing to set the first zero in each of those rows to 1.
+ generate_mask[rows_with_zero, first_zero_idx[rows_with_zero]] = 1
+
+ output_token_ids = target_token_ids_tensor
+ output_token_ids[~generate_mask] = INVALID_TOKEN_ID
+ else:
+ # Reference: https://arxiv.org/pdf/2211.17192
+ # 1. Extract the probabilities of the draft tokens.
+ # [batch_size, max_spec_len]
+ batch_size = draft_token_ids_tensor.size(0)
+ max_spec_len = draft_token_ids_tensor.size(1)
+ invalid_idx = draft_token_ids_tensor == INVALID_TOKEN_ID
+ draft_token_ids_tensor[invalid_idx] = 0
+ assert draft_probs is not None
+ draft_token_probs = draft_probs.gather(
+ dim=-1, index=draft_token_ids_tensor.unsqueeze(-1)).squeeze(-1)
+ target_token_probs = target_probs.gather(
+ dim=-1, index=draft_token_ids_tensor.unsqueeze(-1)).squeeze(-1)
+ # Force the probabilities of invalid tokens to inf
+ # so that they are not accepted.
+ draft_token_probs[invalid_idx] = float('inf')
+
+ # 2. Generate uniform samples.
+ # [batch_size, max_spec_len + 1]
+ uniform_samples = _create_uniform_samples(
+ sampling_metadata.generators, batch_size, max_spec_len,
+ target_probs.device)
+
+ # 3. Accept or reject the samples.
+ # [batch_size, max_spec_len]
+ # If the draft token probabilities are 0, set them to the smallest
+ # positive normal value representable by float32.
+ safe_draft_probs = torch.where(draft_token_probs > 0,
+ draft_token_probs,
+ torch.finfo(torch.float32).tiny)
+ accepted = uniform_samples <= target_token_probs / safe_draft_probs
+ accept_mask = accepted.cumprod(dim=1)
+ # Set the token ids to the draft token ids if accepted, otherwise
+ # set them to INVALID_TOKEN_ID.
+ accepted_token_ids = (draft_token_ids_tensor * accept_mask +
+ INVALID_TOKEN_ID * (1 - accept_mask))
+
+ # 4. Adjust the distribution for the recovered tokens.
+ # Clamp the bonus probabilities to the smallest positive normal
+ # value representable by float32.
+ bonus_prob = torch.clamp(target_probs[:, :-1, :] - draft_probs,
+ min=torch.finfo(torch.float32).tiny)
+ normalized_bonus_prob = bonus_prob / bonus_prob.sum(dim=-1,
+ keepdim=True)
+
+ # 5. Sample recovered token ids.
+ recovered_token_ids = random_sample(
+ normalized_bonus_prob,
+ sampling_metadata.generators).reshape(batch_size, max_spec_len)
+
+ # 6. Get the final output token ids.
+ # output_token_ids = accepted_token_ids +
+ # recovered_token_ids +
+ # bonus_token_id
+ recovered_bonus_token_ids = torch.cat(
+ [recovered_token_ids, bonus_token_ids_tensor], dim=1)
+ # Generate mask with bonus tokens.
+ generate_mask = torch.cat([
+ accept_mask,
+ torch.zeros(batch_size, 1, device=accept_mask.device)
+ ],
+ dim=1).to(torch.bool)
+ zeros_mask = (generate_mask == 0)
+ first_zero_idx = zeros_mask.float().argmax(dim=1)
+ output_token_ids = torch.cat([
+ accepted_token_ids,
+ torch.full((batch_size, 1),
+ fill_value=INVALID_TOKEN_ID,
+ device=accept_mask.device)
+ ],
+ dim=1)
+ output_token_ids[torch.arange(batch_size),
+ first_zero_idx] = recovered_bonus_token_ids[
+ torch.arange(batch_size), first_zero_idx]
+
+ return output_token_ids
+
+ def compute_probs(self, logits: torch.Tensor,
+ sampling_metadata: SamplingMetadata,
+ sample_lens: list[int]) -> torch.Tensor:
+ """
+ Compute probability distribution from logits based on sampling metadata.
+
+ This function applies temperature scaling to the logits and converts
+ them to probabilities using softmax. Note that division by
+ temperature is not performed inplace to preserve the original logits
+ tensor, which will be used by the original sampler to get bonus tokens.
+
+ Args:
+ logits: Input logits tensor to be converted to probabilities
+ sampling_metadata: Metadata containing sampling parameters such
+ as temperature and whether greedy sampling is used
+ sample_lens: List of sample lengths used for repeating
+ temperature values
+
+ Returns:
+ torch.Tensor: Probability distribution (softmax of scaled logits)
+ if non-greedy sampling is used, otherwise returns the
+ original logits
+ """
+ if sampling_metadata.all_greedy:
+ return logits
+ assert sampling_metadata.temperature is not None
+ # We should optimize the following code as
+ # it will cause CPU -> GPU synchronization.
+ temperature = torch.repeat_interleave(
+ sampling_metadata.temperature,
+ torch.tensor(sample_lens,
+ device=sampling_metadata.temperature.device))
+ temperature = temperature.unsqueeze(dim=1)
+ logits = logits / temperature
+ return logits.softmax(dim=-1, dtype=torch.float32)
def _create_greedy_token_probs(
@@ -199,3 +282,66 @@ def _create_greedy_token_probs(
src=valid_mask.unsqueeze(-1).float())
return token_probs
+
+
+def _convert_2d_probs(
+ probs: torch.Tensor, # [num_total_tokens, vocab_size]
+ sample_lens: list[int]) -> torch.Tensor:
+ """
+ Converts a 2D tensor of probabilities to a 3D tensor with padding.
+ [num_total_tokens, vocab_size] ->
+ [batch_size, max_spec_len + 1, vocab_size]
+ """
+ cumulative_lens = torch.cumsum(torch.tensor(sample_lens,
+ device=probs.device),
+ dim=0)
+ split_indices = cumulative_lens[:-1].tolist() # Exclude last index
+
+ # Split into chunks without loops
+ chunks = torch.tensor_split(probs, split_indices, dim=0)
+
+ # Pad all sequences to maximum length
+ padded_probs = pad_sequence(chunks, batch_first=True, padding_value=0.0)
+ return padded_probs
+
+
+def _create_uniform_samples(seeded_seqs: dict[int, torch.Generator],
+ batch_size: int, k: int,
+ device: torch.device) -> torch.Tensor:
+ """
+ Generates a batch of uniform random samples, with optional seeding
+ for specific sequences.
+
+ This method creates a tensor of shape `(batch_size, k)` filled
+ with uniform random values in the range [0, 1). If `seeded_seqs`
+ is provided, the sequences corresponding to specific indices
+ will be generated using the provided `torch.Generator` for
+ reproducibility. The other sequences will be generated without
+ a seed.
+
+ Args:
+ seeded_seqs : Optional[Dict[int, torch.Generator]]
+ A dictionary mapping indices in the batch to
+ `torch.Generator` objects.
+ batch_size : int
+ The number of sequences to generate.
+ k : int
+ The number of random samples per sequence.
+ device : torch.device
+ The device on which to allocate the tensor.
+
+ Returns:
+ uniform_rand : torch.Tensor
+ A tensor of shape `(batch_size, k)` containing uniform
+ random values in the range [0, 1).
+ """
+
+ uniform_rand = torch.rand(batch_size,
+ k,
+ dtype=torch.float32,
+ device=device)
+ # Apply seeded generators only where needed
+ if seeded_seqs:
+ for idx, generator in seeded_seqs.items():
+ uniform_rand[idx].uniform_(0, 1, generator=generator)
+ return uniform_rand
diff --git a/vllm/v1/sample/sampler.py b/vllm/v1/sample/sampler.py
index 96f6d807b10ce..d91c057083f31 100644
--- a/vllm/v1/sample/sampler.py
+++ b/vllm/v1/sample/sampler.py
@@ -119,14 +119,6 @@ class Sampler(nn.Module):
)
return sampled
- def compute_probs(self, logits: torch.Tensor,
- sampling_metadata: SamplingMetadata) -> torch.Tensor:
- if sampling_metadata.all_greedy:
- return logits
- # Apply temperature. This is an in-place op changing logits.
- logits = self.apply_temperature(logits, sampling_metadata.temperature)
- return logits.softmax(dim=-1, dtype=torch.float32)
-
def compute_logprobs(self, logits: torch.Tensor) -> torch.Tensor:
return logits.log_softmax(dim=-1, dtype=torch.float32)
diff --git a/vllm/v1/spec_decode/utils.py b/vllm/v1/spec_decode/utils.py
new file mode 100644
index 0000000000000..5841401367788
--- /dev/null
+++ b/vllm/v1/spec_decode/utils.py
@@ -0,0 +1,22 @@
+# SPDX-License-Identifier: Apache-2.0
+from vllm.v1.sample.ops.topk_topp_sampler import random_sample # noqa
+from vllm.v1.worker.gpu_input_batch import InputBatch
+
+
+def is_spec_decode_supported(req_id: str, input_batch: InputBatch) -> bool:
+ if req_id in input_batch.top_k_reqs or req_id in input_batch.top_p_reqs:
+ # Spec decode doesn't support top_p/top_k sampling.
+ return False
+ elif req_id in input_batch.min_p_reqs:
+ # Spec decode doesn't support min_p sampling.
+ return False
+ elif (req_id in input_batch.frequency_penalties_reqs
+ or req_id in input_batch.presence_penalties_reqs
+ or req_id in input_batch.repetition_penalties_reqs):
+ # Spec decode doesn't support penalties.
+ return False
+ elif req_id in input_batch.num_logprobs:
+ # Spec decode doesn't support logprobs.
+ return False
+
+ return True
diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py
index 4059d5b17b71b..2a98bea562dcb 100644
--- a/vllm/v1/worker/gpu_model_runner.py
+++ b/vllm/v1/worker/gpu_model_runner.py
@@ -37,6 +37,7 @@ from vllm.v1.outputs import (EMPTY_MODEL_RUNNER_OUTPUT, LogprobsTensors,
from vllm.v1.sample.metadata import SamplingMetadata
from vllm.v1.sample.rejection_sampler import INVALID_TOKEN_ID, RejectionSampler
from vllm.v1.spec_decode.ngram_proposer import NgramProposer
+from vllm.v1.spec_decode.utils import is_spec_decode_supported
from vllm.v1.utils import bind_kv_cache
from vllm.v1.worker.gpu_input_batch import CachedRequestState, InputBatch
from vllm.v1.worker.lora_model_runner_mixin import LoRAModelRunnerMixin
@@ -1020,15 +1021,26 @@ class GPUModelRunner(LoRAModelRunnerMixin):
sampling_metadata=sampling_metadata,
)
else:
- target_probs = self.model.sampler.compute_probs(
- logits, sampling_metadata)
draft_token_ids = [
scheduler_output.scheduled_spec_decode_tokens.get(req_id, [])
for req_id in self.input_batch.req_ids
]
- sampler_output = self.rejection_sampler(draft_token_ids,
- target_probs,
- sampling_metadata)
+ sample_lens = [len(tokens) + 1 for tokens in draft_token_ids]
+ recover_logits_idx = np.cumsum(sample_lens) - 1
+ target_probs = self.rejection_sampler.compute_probs(
+ logits, sampling_metadata, sample_lens)
+ sampler_output = self.model.sample(
+ logits=logits[recover_logits_idx, :],
+ sampling_metadata=sampling_metadata,
+ )
+ bonus_token_ids = sampler_output.sampled_token_ids
+ output_token_ids = self.rejection_sampler(
+ draft_token_ids,
+ None, # draft_probs
+ bonus_token_ids,
+ target_probs,
+ sampling_metadata)
+ sampler_output.sampled_token_ids = output_token_ids
# TODO(woosuk): The following loop can be slow since it iterates over
# the requests one by one. Optimize.
@@ -1075,7 +1087,7 @@ class GPUModelRunner(LoRAModelRunnerMixin):
spec_token_ids = None
else:
spec_token_ids = self.generate_draft_token_ids(
- valid_sampled_token_ids)
+ valid_sampled_token_ids, sampling_metadata)
return ModelRunnerOutput(
req_ids=self.input_batch.req_ids,
@@ -1089,6 +1101,7 @@ class GPUModelRunner(LoRAModelRunnerMixin):
def generate_draft_token_ids(
self,
sampled_token_ids: list[list[int]],
+ sampling_metadata: SamplingMetadata,
) -> list[list[int]]:
# TODO(woosuk): Optimize.
draft_token_ids: list[list[int]] = []
@@ -1099,6 +1112,12 @@ class GPUModelRunner(LoRAModelRunnerMixin):
draft_token_ids.append([])
continue
+ # Skip requests that require top-p, top-k, etc.
+ req_id = self.input_batch.req_ids[i]
+ if not is_spec_decode_supported(req_id, self.input_batch):
+ draft_token_ids.append([])
+ continue
+
# Add sampled_token_ids to token_ids_cpu.
start_idx = self.input_batch.num_tokens_no_spec[i]
end_idx = start_idx + num_sampled_ids
From b539222d4e81512e0cfa6cf56927a70c3aaca9d2 Mon Sep 17 00:00:00 2001
From: Cyrus Leung
Date: Mon, 17 Mar 2025 14:42:06 +0800
Subject: [PATCH 23/34] [V1] Remove input cache client (#14864)
Signed-off-by: DarkLight1337
Signed-off-by: Roger Wang
Co-authored-by: Roger Wang
---
vllm/inputs/preprocess.py | 6 ++
vllm/v1/engine/__init__.py | 2 +-
vllm/v1/engine/mm_input_cache.py | 122 +++--------------------------
vllm/v1/engine/processor.py | 80 ++++++-------------
vllm/v1/worker/gpu_model_runner.py | 39 ++-------
5 files changed, 48 insertions(+), 201 deletions(-)
diff --git a/vllm/inputs/preprocess.py b/vllm/inputs/preprocess.py
index f56cff292b68b..af35e43d825a2 100644
--- a/vllm/inputs/preprocess.py
+++ b/vllm/inputs/preprocess.py
@@ -379,6 +379,7 @@ class InputPreprocessor:
multi_modal_data,
mm_processor_kwargs,
lora_request=lora_request,
+ return_mm_hashes=return_mm_hashes,
)
prompt_token_ids = self._tokenize_prompt(
@@ -401,6 +402,7 @@ class InputPreprocessor:
prompt: SingletonPrompt,
request_id: str,
lora_request: Optional[LoRARequest] = None,
+ return_mm_hashes: bool = False,
) -> SingletonInputs:
"""Async version of :meth:`_extract_prompt_components`."""
parsed = parse_singleton_prompt(prompt)
@@ -431,6 +433,7 @@ class InputPreprocessor:
multi_modal_data,
mm_processor_kwargs,
lora_request=lora_request,
+ return_mm_hashes=return_mm_hashes,
)
return token_inputs(
@@ -452,6 +455,7 @@ class InputPreprocessor:
multi_modal_data,
mm_processor_kwargs,
lora_request=lora_request,
+ return_mm_hashes=return_mm_hashes,
)
prompt_token_ids = await self._tokenize_prompt_async(
@@ -726,6 +730,7 @@ class InputPreprocessor:
prompt,
request_id=request_id,
lora_request=lora_request,
+ return_mm_hashes=return_mm_hashes,
)
return self._build_decoder_only_llm_inputs(
@@ -746,6 +751,7 @@ class InputPreprocessor:
prompt,
request_id=request_id,
lora_request=lora_request,
+ return_mm_hashes=return_mm_hashes,
)
return self._build_decoder_only_llm_inputs(
diff --git a/vllm/v1/engine/__init__.py b/vllm/v1/engine/__init__.py
index cd29c2d7d57c0..3699779b3a0fe 100644
--- a/vllm/v1/engine/__init__.py
+++ b/vllm/v1/engine/__init__.py
@@ -52,7 +52,7 @@ class EngineCoreRequest(
# Detokenizer, but set to None when it is added to EngineCoreClient.
prompt: Optional[str]
prompt_token_ids: list[int]
- mm_inputs: Optional[list[Optional[MultiModalKwargs]]]
+ mm_inputs: Optional[list[MultiModalKwargs]]
mm_hashes: Optional[list[str]]
mm_placeholders: Optional[list[PlaceholderRange]]
sampling_params: SamplingParams
diff --git a/vllm/v1/engine/mm_input_cache.py b/vllm/v1/engine/mm_input_cache.py
index e2dda73ba4299..61a55d2499bd1 100644
--- a/vllm/v1/engine/mm_input_cache.py
+++ b/vllm/v1/engine/mm_input_cache.py
@@ -1,131 +1,30 @@
# SPDX-License-Identifier: Apache-2.0
-from typing import Any, Optional
-
-from vllm.config import ModelConfig
from vllm.envs import VLLM_MM_INPUT_CACHE_GIB
-from vllm.logger import init_logger
-from vllm.multimodal import (MULTIMODAL_REGISTRY, MultiModalDataDict,
- MultiModalKwargs, MultiModalRegistry)
+from vllm.multimodal import MultiModalKwargs
from vllm.multimodal.processing import ProcessingCache
-logger = init_logger(__name__)
-
# The idea of multimodal preprocessing caching is based on having a client and
# a server, where the client executes in the frontend process (=P0) and the
# server in the core process (=P1).
#
# -- Client:
-# - Apply legacy input_mapper (if one exists) to generate MultiModalKwargs.
-# - Perform caching of the generated MultiModalKwargs.
-# - This client can be deprecated once all mutimodal models migrate to use
-# merged preprocessor with built-in caching functionality.
+# - BaseMultiModalProcessor to process MultiModalData into MultiModalKwargs
+# with built-in caching functionality, with mm_hash as its identifier.
#
# -- Server:
-# - Perform caching of the received MultiModalKwargs.
+# - MMInputCacheServer to perform caching of the received MultiModalKwargs.
#
-# The caching for both client and server is mirrored/similar, and this allows us
+# The caching for both client and server is mirrored, and this allows us
# to avoid the serialization of "mm_inputs" (like pixel values) between
-# client (=P0) and server (=P1) processes.
+# client (=P0) and server (=P1) processes if the mm_hash is found in the client
+# cache.
# Both Client and Server must use the same cache size
# (to perform mirrored caching). This cache size is set by the environment
# variable VLLM_MM_INPUT_CACHE_GIB.
-# TODO(ywang96): Deprecate this class once all multimodal models migrate to use
-# merged preprocessor with built-in caching functionality.
-class MMInputCacheClient:
-
- def __init__(
- self,
- model_config: ModelConfig,
- mm_registry: MultiModalRegistry = MULTIMODAL_REGISTRY,
- ):
- self.model_config = model_config
- self.mm_registry = mm_registry
- self.multi_modal_input_mapper = mm_registry.create_input_mapper(
- model_config)
- self.mm_registry.init_mm_limits_per_prompt(model_config)
-
- # Init cache
- self.use_cache = not model_config.disable_mm_preprocessor_cache
- self.mm_cache = ProcessingCache.get_lru_cache(VLLM_MM_INPUT_CACHE_GIB,
- MultiModalKwargs)
-
- # DEBUG: Set to None to disable
- self.mm_debug_cache_hit_ratio_steps = None
- self.mm_debug_cache_hits = 0
- self.mm_debug_cache_total = 0
-
- def cache_hit_ratio(self, steps):
- total = self.mm_debug_cache_total
-
- if total > 0 and total % steps == 0:
- logger.debug("MMInputMapper: cache_hit_ratio = %.2f ",
- self.mm_debug_cache_hits / total)
-
- # NOTE: process_inputs only supports image inputs since all multimodal
- # models with other modalities have migrated to use merged preprocessor.
- def process_inputs(
- self,
- mm_data: MultiModalDataDict,
- mm_hashes: Optional[list[str]],
- mm_processor_kwargs: Optional[dict[str, Any]],
- precomputed_mm_inputs: Optional[list[MultiModalKwargs]],
- ) -> list[Optional[MultiModalKwargs]]:
- if precomputed_mm_inputs is None:
- image_inputs = mm_data["image"]
- if not isinstance(image_inputs, list):
- image_inputs = [image_inputs]
- num_inputs = len(image_inputs)
- else:
- num_inputs = len(precomputed_mm_inputs)
-
- # Sanity
- if self.use_cache:
- assert mm_hashes is not None
- assert num_inputs == len(mm_hashes)
-
- # Process each image input separately, so that later we can schedule
- # them in a fine-grained manner.
- # Apply caching (if enabled) and reuse precomputed inputs (if provided)
- ret_inputs: list[Optional[MultiModalKwargs]] = []
- for input_id in range(num_inputs):
- if self.mm_debug_cache_hit_ratio_steps is not None:
- self.cache_hit_ratio(self.mm_debug_cache_hit_ratio_steps)
-
- mm_input = None
- if self.use_cache:
- assert mm_hashes is not None
- mm_hash = mm_hashes[input_id]
- mm_input = self.mm_cache.get(mm_hash)
-
- self.mm_debug_cache_total += 1
- if mm_input is None:
- if precomputed_mm_inputs is not None:
- # Reuse precomputed input (for merged preprocessor)
- mm_input = precomputed_mm_inputs[input_id]
- else:
- # Apply legacy input_mapper
- mm_input = self.multi_modal_input_mapper(
- {"image": [image_inputs[input_id]]},
- mm_processor_kwargs=mm_processor_kwargs,
- )
-
- if self.use_cache:
- # Add to cache
- assert mm_hash is not None
- self.mm_cache[mm_hash] = mm_input
- else:
- self.mm_debug_cache_hits += 1
- mm_input = None # Avoids sending mm_input to Server
-
- ret_inputs.append(mm_input)
-
- return ret_inputs
-
-
class MMInputCacheServer:
def __init__(self, model_config):
@@ -135,9 +34,9 @@ class MMInputCacheServer:
def get_and_update(
self,
- mm_inputs: list[Optional[MultiModalKwargs]],
+ mm_inputs: list[MultiModalKwargs],
mm_hashes: list[str],
- ) -> list[Optional[MultiModalKwargs]]:
+ ) -> list[MultiModalKwargs]:
assert len(mm_inputs) == len(mm_hashes)
if not self.use_cache:
@@ -147,8 +46,7 @@ class MMInputCacheServer:
for mm_input, mm_hash in zip(mm_inputs, mm_hashes):
assert mm_hash is not None
if mm_input is None:
- mm_input = self.mm_cache.get(mm_hash)
- assert mm_input is not None
+ mm_input = self.mm_cache[mm_hash]
else:
self.mm_cache[mm_hash] = mm_input
diff --git a/vllm/v1/engine/processor.py b/vllm/v1/engine/processor.py
index 663e1e36f7561..4e9e5506bb587 100644
--- a/vllm/v1/engine/processor.py
+++ b/vllm/v1/engine/processor.py
@@ -11,15 +11,15 @@ from vllm.inputs import (INPUT_REGISTRY, InputRegistry, ProcessorInputs,
from vllm.inputs.parse import is_encoder_decoder_inputs
from vllm.inputs.preprocess import InputPreprocessor
from vllm.lora.request import LoRARequest
-from vllm.multimodal import (MULTIMODAL_REGISTRY, MultiModalHasher,
- MultiModalKwargs, MultiModalRegistry)
+from vllm.multimodal import (MULTIMODAL_REGISTRY, MultiModalKwargs,
+ MultiModalRegistry)
+from vllm.multimodal.inputs import PlaceholderRange
from vllm.multimodal.utils import merge_and_sort_multimodal_metadata
from vllm.pooling_params import PoolingParams
from vllm.prompt_adapter.request import PromptAdapterRequest
from vllm.sampling_params import SamplingParams
from vllm.transformers_utils.tokenizer_group import BaseTokenizerGroup
from vllm.v1.engine import EngineCoreRequest
-from vllm.v1.engine.mm_input_cache import MMInputCacheClient
from vllm.v1.structured_output.utils import validate_structured_output_request
@@ -45,11 +45,6 @@ class Processor:
self.input_preprocessor = InputPreprocessor(self.model_config,
self.tokenizer,
mm_registry)
- self.input_processor = input_registry.create_input_processor(
- self.model_config)
-
- # Multi-modal (huggingface) input mapper
- self.mm_input_cache_client = MMInputCacheClient(self.model_config)
# Multi-modal hasher (for images)
self.use_hash = (
@@ -171,7 +166,7 @@ class Processor:
# 2. For multimodal models with a merged preprocessor, preprocess
# multimodal data and expand prompt token ids accordingly.
# 3. Apply prompt adapter to prompt token ids if one exists.
- preprocessed_inputs = self.input_preprocessor.preprocess(
+ processed_inputs: ProcessorInputs = self.input_preprocessor.preprocess(
prompt,
request_id=request_id,
lora_request=lora_request,
@@ -180,10 +175,6 @@ class Processor:
)
eos_token_id = self.input_preprocessor.get_eos_token_id(lora_request)
- # Process prompt and prompt token ids.
- # Only applicable to multimodal models with legacy input processor.
- processed_inputs = self.input_processor(preprocessed_inputs)
-
self._validate_model_inputs(processed_inputs, lora_request)
if is_encoder_decoder_inputs(processed_inputs):
@@ -212,36 +203,22 @@ class Processor:
self.tokenizer.get_lora_tokenizer(lora_request))
# Multimodal related.
- # Compute MM hashes (if enabled)
- mm_hashes = None
- if self.use_hash:
- # Use mm_hashes from processed inputs if the model has merged
- # input processor.
- if decoder_inputs.multi_modal_hashes:
- mm_hashes = decoder_inputs.multi_modal_hashes
- # Fallback to using MultiModalHasher directly.
- else:
- mm_hashes = MultiModalHasher.hash_prompt_mm_data(prompt)
+ sorted_mm_inputs: Optional[list[MultiModalKwargs]] = None
+ sorted_mm_positions: Optional[list[PlaceholderRange]] = None
+ sorted_mm_hashes: Optional[list[str]] = None
+ if (decoder_mm_inputs := decoder_inputs.multi_modal_data):
+ assert isinstance(decoder_mm_inputs, MultiModalKwargs)
- # For merged preprocessor, mm_data is already mm_inputs
- precomputed_mm_inputs: Optional[list[MultiModalKwargs]] = None
- decoder_mm_data = decoder_inputs.multi_modal_data
- if isinstance(decoder_mm_data, MultiModalKwargs):
- # The output of merged multi-modal processor (`decoder_mm_data`)
+ # The output of merged multi-modal processor (`decoder_mm_inputs`)
# contains the kwargs for all items from all modalities.
# This code separates them so that there is one set of kwargs
# per item per modality.
- precomputed_mm_inputs = [
+ individual_mm_inputs = [
MultiModalKwargs.from_items([item])
- for modality in decoder_mm_data.modalities
- for item in decoder_mm_data.get_items(modality)
+ for modality in decoder_mm_inputs.modalities
+ for item in decoder_mm_inputs.get_items(modality)
]
- mm_positions = decoder_inputs.multi_modal_placeholders
-
- # Last-mile processing of multimodal metadata and inputs.
- if mm_positions:
-
# Merge and flatten multimodal placeholders, hashes and inputs
# from dictionaries to lists, and sort them by each item's position
# in the input sequence.
@@ -251,14 +228,13 @@ class Processor:
sorted_mm_positions,
sorted_mm_hashes,
) = merge_and_sort_multimodal_metadata(
- mm_positions,
- mm_hashes,
+ decoder_inputs.multi_modal_placeholders,
+ decoder_inputs.multi_modal_hashes if self.use_hash else None,
)
# NOTE: Sort multimodal inputs/kwargs ONLY IF there are multiple
- # modalities involved AND the model supports merged input processor.
- if len(sorted_modalities) > 1 and precomputed_mm_inputs:
-
+ # modalities involved.
+ if len(sorted_modalities) > 1:
modality_order_dict = {
modality: order
for order, modality in enumerate(sorted_modalities)
@@ -266,26 +242,16 @@ class Processor:
# Sanity check to make sure each multimodal input has only one
# modality key.
- for mm_input in precomputed_mm_inputs:
+ for mm_input in individual_mm_inputs:
assert len(mm_input.modalities) == 1
- # Sort MultiModalKwags to match sorted_mm_positions
- precomputed_mm_inputs = sorted(
- precomputed_mm_inputs,
+ # Sort MultiModalKwargs to match sorted_mm_positions
+ sorted_mm_inputs = sorted(
+ individual_mm_inputs,
key=lambda mm_input: modality_order_dict[list(
mm_input.modalities)[0]])
-
- # Apply mm input cache update and legacy input mapper if one exists.
- sorted_mm_inputs = self.mm_input_cache_client.process_inputs(
- mm_data=decoder_mm_data,
- mm_hashes=sorted_mm_hashes,
- mm_processor_kwargs=decoder_inputs.mm_processor_kwargs,
- precomputed_mm_inputs=precomputed_mm_inputs,
- )
- else:
- sorted_mm_inputs = None
- sorted_mm_hashes = None
- sorted_mm_positions = None
+ else:
+ sorted_mm_inputs = individual_mm_inputs
return EngineCoreRequest(
request_id=request_id,
diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py
index 2a98bea562dcb..66015382bfe85 100644
--- a/vllm/v1/worker/gpu_model_runner.py
+++ b/vllm/v1/worker/gpu_model_runner.py
@@ -29,7 +29,6 @@ from vllm.utils import (STR_DTYPE_TO_TORCH_DTYPE, DeviceMemoryProfiler,
is_pin_memory_available)
from vllm.v1.attention.backends.flash_attn import FlashAttentionMetadata
from vllm.v1.core.encoder_cache_manager import compute_encoder_budget
-from vllm.v1.engine.mm_input_cache import MMInputCacheClient
from vllm.v1.kv_cache_interface import (FullAttentionSpec, KVCacheConfig,
KVCacheSpec)
from vllm.v1.outputs import (EMPTY_MODEL_RUNNER_OUTPUT, LogprobsTensors,
@@ -133,14 +132,6 @@ class GPUModelRunner(LoRAModelRunnerMixin):
self.mm_registry = MULTIMODAL_REGISTRY
self.uses_mrope = model_config.uses_mrope
- if self.is_multimodal_model:
- # NOTE: Initialized client is only used for processing dummy
- # multimodal data into multimodal kwargs for GPU memory profiling.
- # Only applicable to multimodal models with legacy input mapper.
- self.mm_input_mapper_profiling = MMInputCacheClient(
- self.model_config)
- self.mm_input_mapper_profiling.use_cache = False
-
encoder_compute_budget, encoder_cache_size = compute_encoder_budget(
model_config=model_config,
scheduler_config=scheduler_config,
@@ -1376,32 +1367,18 @@ class GPUModelRunner(LoRAModelRunnerMixin):
mm_registry=self.mm_registry,
)
dummy_mm_data = dummy_request_data.multi_modal_data
+ if not isinstance(dummy_mm_data, MultiModalKwargs):
+ # TODO: Delete this check once input mapper is fully removed.
+ raise RuntimeError(
+ "Legacy input mapper is not supported in V1")
- # Dummy data definition in V0 may contain multiple multimodal items
+ # Dummy data definition may contain multiple multimodal items
# (e.g, multiple images) for a single request, therefore here we
# always replicate first item by max_num_mm_items times since in V1
# they are scheduled to be processed separately.
-
- # Case when models have a merged processor, their dummy data is
- # already batched `MultiModalKwargs`, therefore we take the first
- # `MultiModalKwargsItem` from the desired modality to profile on.
- if isinstance(dummy_mm_data, MultiModalKwargs):
- dummy_mm_item = dummy_mm_data.get_item(
- modality=dummy_data_modality, item_index=0)
- dummy_mm_kwargs = MultiModalKwargs.from_items([dummy_mm_item])
-
- # Case when models have dummy data explicitly defined as
- # `MultiModalDataDict`, so they need to be processed through input
- # mapper.
- # TODO (ywang96): deprecate this path once merged processor is
- # supported on all models.
- else:
- mm_kwargs_list = self.mm_input_mapper_profiling.process_inputs(
- mm_data=dummy_mm_data,
- mm_hashes=None,
- mm_processor_kwargs=None,
- precomputed_mm_inputs=None)
- dummy_mm_kwargs = mm_kwargs_list[0]
+ dummy_mm_item = dummy_mm_data.get_item(
+ modality=dummy_data_modality, item_index=0)
+ dummy_mm_kwargs = MultiModalKwargs.from_items([dummy_mm_item])
batched_dummy_mm_inputs = MultiModalKwargs.batch(
[dummy_mm_kwargs] * max_num_mm_items)
From 9b87a579aaf82338d5304219350932abae9b19ac Mon Sep 17 00:00:00 2001
From: Yan Ma
Date: Mon, 17 Mar 2025 16:22:14 +0800
Subject: [PATCH 24/34] [Misc][XPU] Use None as device capacity for XPU
(#14932)
Signed-off-by: yan ma
---
vllm/platforms/xpu.py | 9 +++++----
1 file changed, 5 insertions(+), 4 deletions(-)
diff --git a/vllm/platforms/xpu.py b/vllm/platforms/xpu.py
index d99d4ef3dac06..225e756cd7ce8 100644
--- a/vllm/platforms/xpu.py
+++ b/vllm/platforms/xpu.py
@@ -37,10 +37,11 @@ class XPUPlatform(Platform):
return "vllm.attention.backends.ipex_attn.IpexAttnBackend"
@staticmethod
- def get_device_capability(device_id: int = 0) -> DeviceCapability:
- major, minor, *_ = torch.xpu.get_device_capability(
- device_id)['version'].split('.')
- return DeviceCapability(major=int(major), minor=int(minor))
+ def get_device_capability(
+ device_id: int = 0) -> Optional[DeviceCapability]:
+ # capability format differs from cuda's and will cause unexpected
+ # failure, so use None directly
+ return None
@staticmethod
def get_device_name(device_id: int = 0) -> str:
From dd3b865854c21c99ebc5d1bd34c12936002174c2 Mon Sep 17 00:00:00 2001
From: Chen Zhang
Date: Mon, 17 Mar 2025 16:29:36 +0800
Subject: [PATCH 25/34] [Doc] Add vLLM Beijing meetup slide (#14938)
Signed-off-by: Chen Zhang
---
README.md | 11 +----------
1 file changed, 1 insertion(+), 10 deletions(-)
diff --git a/README.md b/README.md
index bfab7faf598b6..f61b4218e1824 100644
--- a/README.md
+++ b/README.md
@@ -13,18 +13,9 @@ Easy, fast, and cheap LLM serving for everyone
| Documentation | Blog | Paper | Twitter/X | Developer Slack |
----
-
-We’re excited to invite you to the first **vLLM China Meetup** on **March 16** in **Beijing**!
-
-Join us to connect with the **vLLM team** and explore how vLLM is leveraged in **post-training, fine-tuning, and deployment**, including [verl](https://github.com/volcengine/verl), [LLaMA-Factory](https://github.com/hiyouga/LLaMA-Factory), and [vllm-ascend](https://github.com/vllm-project/vllm-ascend).
-
-👉 **[Register Now](https://mp.weixin.qq.com/s/n77GibL2corAtQHtVEAzfg)** to be part of the discussion!
-
----
-
*Latest News* 🔥
+- [2025/03] We hosted [the first vLLM China Meetup](https://mp.weixin.qq.com/s/n77GibL2corAtQHtVEAzfg)! Please find the meetup slides from vLLM team [here](https://docs.google.com/presentation/d/1REHvfQMKGnvz6p3Fd23HhSO4c8j5WPGZV0bKYLwnHyQ/edit#slide=id.g33fb1ff286e_0_29).
- [2025/03] We hosted [the East Coast vLLM Meetup](https://lu.ma/7mu4k4xx)! Please find the meetup slides [here](https://docs.google.com/presentation/d/1NHiv8EUFF1NLd3fEYODm56nDmL26lEeXCaDgyDlTsRs/edit#slide=id.g31441846c39_0_0).
- [2025/02] We hosted [the ninth vLLM meetup](https://lu.ma/h7g3kuj9) with Meta! Please find the meetup slides from vLLM team [here](https://docs.google.com/presentation/d/1jzC_PZVXrVNSFVCW-V4cFXb6pn7zZ2CyP_Flwo05aqg/edit?usp=sharing) and AMD [here](https://drive.google.com/file/d/1Zk5qEJIkTmlQ2eQcXQZlljAx3m9s7nwn/view?usp=sharing). The slides from Meta will not be posted.
- [2025/01] We are excited to announce the alpha release of vLLM V1: A major architectural upgrade with 1.7x speedup! Clean code, optimized execution loop, zero-overhead prefix caching, enhanced multimodal support, and more. Please check out our blog post [here](https://blog.vllm.ai/2025/01/27/v1-alpha-release.html).
From 0a74bfce9cb9e51616c50b007e53400244cbc24a Mon Sep 17 00:00:00 2001
From: Russell Bryant
Date: Mon, 17 Mar 2025 04:37:42 -0400
Subject: [PATCH 26/34] setup.py: drop assumption about local `main` branch
(#14692)
Signed-off-by: Russell Bryant
---
setup.py | 30 ++++++++++++++++--------------
1 file changed, 16 insertions(+), 14 deletions(-)
diff --git a/setup.py b/setup.py
index d18fe53f12de1..d412f34b3e3dc 100755
--- a/setup.py
+++ b/setup.py
@@ -294,26 +294,28 @@ class repackage_wheel(build_ext):
]).decode("utf-8")
upstream_main_commit = json.loads(resp_json)["sha"]
- # Check if the local main branch is up-to-date. This is to ensure
- # the base commit we found is the most recent commit on the main
- # branch.
- local_main_commit = subprocess.check_output(
- ["git", "rev-parse", "main"]).decode("utf-8").strip()
- if local_main_commit != upstream_main_commit:
- raise ValueError(
- f"Local main branch ({local_main_commit}) is not "
- "up-to-date with upstream main branch "
- f"({upstream_main_commit}). Please pull the latest "
- "changes from upstream main branch first.")
+ # Check if the upstream_main_commit exists in the local repo
+ try:
+ subprocess.check_output(
+ ["git", "cat-file", "-e", f"{upstream_main_commit}"])
+ except subprocess.CalledProcessError:
+ # If not present, fetch it from the remote repository.
+ # Note that this does not update any local branches,
+ # but ensures that this commit ref and its history are
+ # available in our local repo.
+ subprocess.check_call([
+ "git", "fetch", "https://github.com/vllm-project/vllm",
+ "main"
+ ])
# Then get the commit hash of the current branch that is the same as
# the upstream main commit.
current_branch = subprocess.check_output(
["git", "branch", "--show-current"]).decode("utf-8").strip()
- base_commit = subprocess.check_output(
- ["git", "merge-base", "main",
- current_branch]).decode("utf-8").strip()
+ base_commit = subprocess.check_output([
+ "git", "merge-base", f"{upstream_main_commit}", current_branch
+ ]).decode("utf-8").strip()
return base_commit
except ValueError as err:
raise ValueError(err) from None
From cd0cd85102e4b5971dd44109776942df5cdca70f Mon Sep 17 00:00:00 2001
From: Lu Fang <30275821+houseroad@users.noreply.github.com>
Date: Mon, 17 Mar 2025 01:40:41 -0700
Subject: [PATCH 27/34] [MISC] More AMD unused var clean up (#14926)
Signed-off-by: Lu Fang
---
csrc/rocm/attention.cu | 8 ++++----
1 file changed, 4 insertions(+), 4 deletions(-)
diff --git a/csrc/rocm/attention.cu b/csrc/rocm/attention.cu
index 90f0b54d2f006..c500d00ea528e 100644
--- a/csrc/rocm/attention.cu
+++ b/csrc/rocm/attention.cu
@@ -127,7 +127,7 @@ __device__ __forceinline__ T from_float(const float& inp) {
template
__device__ __forceinline__ _B16x4 from_floatx4(const floatx4& inp) {
- union tmpcvt {
+ [[maybe_unused]] union tmpcvt {
uint16_t u;
_Float16 f;
__hip_bfloat16 b;
@@ -160,7 +160,7 @@ __device__ __forceinline__ _B16x4 from_floatx4(const floatx4& inp) {
template
__device__ __forceinline__ _B16x4 addx4(const _B16x4& inp1,
const _B16x4& inp2) {
- union tmpcvt {
+ [[maybe_unused]] union tmpcvt {
uint16_t u;
_Float16 f;
__hip_bfloat16 b;
@@ -1273,9 +1273,9 @@ __launch_bounds__(NUM_THREADS) void paged_attention_ll4mi_reduce_kernel(
const int seq_idx = blockIdx.y;
const int context_len = context_lens[seq_idx];
const int num_partitions = DIVIDE_ROUND_UP(context_len, PARTITION_SIZE);
- constexpr int NUM_WARPS = NUM_THREADS / WARP_SIZE;
+ [[maybe_unused]] constexpr int NUM_WARPS = NUM_THREADS / WARP_SIZE;
const int warpid = threadIdx.x / WARP_SIZE;
- const int laneid = threadIdx.x % WARP_SIZE;
+ [[maybe_unused]] const int laneid = threadIdx.x % WARP_SIZE;
__shared__ float shared_global_exp_sum;
// max num partitions supported is warp_size * NPAR_LOOPS
From 69698f257e3a329fd68276459e82e37cd5ae43f2 Mon Sep 17 00:00:00 2001
From: kushanam <42385577+kushanam@users.noreply.github.com>
Date: Mon, 17 Mar 2025 01:47:58 -0700
Subject: [PATCH 28/34] fix minor miscalled method (#14327)
From b4ad56c1bd2fd39028f64919a11a4c5af96bf0c5 Mon Sep 17 00:00:00 2001
From: iefgnoix
Date: Mon, 17 Mar 2025 01:48:28 -0700
Subject: [PATCH 29/34] [V1][TPU] Apply the ragged paged attention kernel fix
and remove the padding. (#14846)
Signed-off-by: Xiongfei Wei
---
requirements/tpu.txt | 12 ++++++------
vllm/v1/worker/tpu_model_runner.py | 7 ++-----
2 files changed, 8 insertions(+), 11 deletions(-)
diff --git a/requirements/tpu.txt b/requirements/tpu.txt
index 97a39bcd4a6d6..7246fc19bfa97 100644
--- a/requirements/tpu.txt
+++ b/requirements/tpu.txt
@@ -17,9 +17,9 @@ ray[data]
--find-links https://storage.googleapis.com/libtpu-releases/index.html
--find-links https://storage.googleapis.com/jax-releases/jax_nightly_releases.html
--find-links https://storage.googleapis.com/jax-releases/jaxlib_nightly_releases.html
-torch @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch-2.7.0.dev20250306%2Bcxx11-cp39-cp39-linux_x86_64.whl ; python_version == "3.9"
-torch @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch-2.7.0.dev20250306%2Bcxx11-cp310-cp310-linux_x86_64.whl ; python_version == "3.10"
-torch @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch-2.7.0.dev20250306%2Bcxx11-cp311-cp311-linux_x86_64.whl ; python_version == "3.11"
-torch_xla[tpu, pallas] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.7.0.dev20250306%2Bcxx11-cp39-cp39-linux_x86_64.whl ; python_version == "3.9"
-torch_xla[tpu, pallas] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.7.0.dev20250306%2Bcxx11-cp310-cp310-linux_x86_64.whl ; python_version == "3.10"
-torch_xla[tpu, pallas] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.7.0.dev20250306%2Bcxx11-cp311-cp311-linux_x86_64.whl ; python_version == "3.11"
+torch @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch-2.8.0.dev20250314%2Bcxx11-cp39-cp39-linux_x86_64.whl ; python_version == "3.9"
+torch @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch-2.8.0.dev20250314%2Bcxx11-cp310-cp310-linux_x86_64.whl ; python_version == "3.10"
+torch @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch-2.8.0.dev20250314%2Bcxx11-cp311-cp311-linux_x86_64.whl ; python_version == "3.11"
+torch_xla[tpu, pallas] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.8.0.dev20250314%2Bcxx11-cp39-cp39-linux_x86_64.whl ; python_version == "3.9"
+torch_xla[tpu, pallas] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.8.0.dev20250314%2Bcxx11-cp310-cp310-linux_x86_64.whl ; python_version == "3.10"
+torch_xla[tpu, pallas] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.8.0.dev20250314%2Bcxx11-cp311-cp311-linux_x86_64.whl ; python_version == "3.11"
diff --git a/vllm/v1/worker/tpu_model_runner.py b/vllm/v1/worker/tpu_model_runner.py
index effcac7e7bdef..00869467be341 100644
--- a/vllm/v1/worker/tpu_model_runner.py
+++ b/vllm/v1/worker/tpu_model_runner.py
@@ -23,8 +23,7 @@ from vllm.multimodal.utils import group_mm_inputs_by_modality
from vllm.sampling_params import SamplingType
from vllm.sequence import IntermediateTensors
from vllm.utils import LayerBlockType, cdiv, is_pin_memory_available
-from vllm.v1.attention.backends.pallas import (NUM_KV_PAGES_PER_BLOCK,
- PallasAttentionBackend,
+from vllm.v1.attention.backends.pallas import (PallasAttentionBackend,
PallasMetadata)
from vllm.v1.core.encoder_cache_manager import compute_encoder_budget
from vllm.v1.kv_cache_interface import (FullAttentionSpec, KVCacheConfig,
@@ -139,10 +138,8 @@ class TPUModelRunner:
device="cpu")
self.slot_mapping_np = self.slot_mapping_cpu.numpy()
- padded_max_num_blocks_per_req = _get_padded_number(
- self.max_num_blocks_per_req, NUM_KV_PAGES_PER_BLOCK)
self.block_table_cpu = torch.zeros(
- (self.max_num_tokens, padded_max_num_blocks_per_req),
+ (self.max_num_tokens, self.max_num_blocks_per_req),
dtype=self.input_batch.block_table.get_cpu_tensor().dtype,
device="cpu")
From 868a8c5b2c8c042fc869eb30bce29fb8e19d979e Mon Sep 17 00:00:00 2001
From: Cyrus Leung
Date: Mon, 17 Mar 2025 17:15:20 +0800
Subject: [PATCH 30/34] [Bugfix] Fix Ultravox on V1 (#14929)
Signed-off-by: DarkLight1337
---
vllm/model_executor/models/ultravox.py | 42 +++++++++++++++-----------
1 file changed, 25 insertions(+), 17 deletions(-)
diff --git a/vllm/model_executor/models/ultravox.py b/vllm/model_executor/models/ultravox.py
index d368c145d55f9..cb1e143838496 100644
--- a/vllm/model_executor/models/ultravox.py
+++ b/vllm/model_executor/models/ultravox.py
@@ -5,7 +5,7 @@
import math
from collections.abc import Iterable, Mapping, Sequence
from functools import cached_property
-from typing import Any, List, Literal, Optional, Set, Tuple, TypedDict, Union
+from typing import Any, Literal, Optional, Set, Tuple, TypedDict, Union
import torch
import torch.utils.checkpoint
@@ -36,7 +36,7 @@ from vllm.sequence import IntermediateTensors
from vllm.transformers_utils.configs.ultravox import UltravoxConfig
from .interfaces import (MultiModalEmbeddings, SupportsLoRA,
- SupportsMultiModal, SupportsPP, SupportsV0Only)
+ SupportsMultiModal, SupportsPP)
from .utils import (AutoWeightsLoader, WeightsMapper, flatten_bn,
init_vllm_registered_model, maybe_prefix,
merge_multimodal_embeddings,
@@ -50,14 +50,14 @@ _MAX_ENCODER_BATCH_SIZE = 16
class UltravoxAudioFeatureInputs(TypedDict):
type: Literal["audio_features"]
- data: NestedTensors
+ data: Union[torch.Tensor, list[torch.Tensor], list[list[torch.Tensor]]]
"""Shape: `(batch_size, num_chunks, 80, M)`"""
- lens: NestedTensors
+ lens: Union[torch.Tensor, list[torch.Tensor]]
"""
Length of the audio frames. Used for attention mask in WhisperEncoder.
Shape: `(batch_size, num_chunks)`
"""
- token_len: NestedTensors
+ token_len: Union[torch.Tensor, list[torch.Tensor]]
"""
Length of the audio tokens. Used for flattening the audio features.
Shape: `(batch_size, num_chunks)`
@@ -405,8 +405,7 @@ class ModifiedWhisperEncoder(WhisperEncoder):
UltravoxMultiModalProcessor,
info=UltravoxProcessingInfo,
dummy_inputs=UltravoxDummyInputsBuilder)
-class UltravoxModel(nn.Module, SupportsMultiModal, SupportsPP, SupportsLoRA,
- SupportsV0Only):
+class UltravoxModel(nn.Module, SupportsMultiModal, SupportsPP, SupportsLoRA):
packed_modules_mapping = {
"qkv_proj": ["q_proj", "k_proj", "v_proj"],
@@ -506,6 +505,12 @@ class UltravoxModel(nn.Module, SupportsMultiModal, SupportsPP, SupportsLoRA,
if not isinstance(audio_features, (torch.Tensor, list)):
raise ValueError("Incorrect type of audio features. "
f"Got type: {type(audio_features)}")
+ if not isinstance(audio_lens, (torch.Tensor, list)):
+ raise ValueError("Incorrect type of audio_lens. "
+ f"Got type: {type(audio_lens)}")
+ if not isinstance(audio_token_len, (torch.Tensor, list)):
+ raise ValueError("Incorrect type of audio_token_len. "
+ f"Got type: {type(audio_token_len)}")
return UltravoxAudioFeatureInputs(type="audio_features",
data=audio_features,
@@ -523,7 +528,9 @@ class UltravoxModel(nn.Module, SupportsMultiModal, SupportsPP, SupportsLoRA,
raise AssertionError("This line should be unreachable.")
def _process_audio_input(
- self, audio_input: UltravoxAudioInputs) -> NestedTensors:
+ self,
+ audio_input: UltravoxAudioInputs,
+ ) -> Union[NestedTensors, tuple[torch.Tensor, ...]]:
if audio_input["type"] == "audio_embeds":
return audio_input["data"]
@@ -531,13 +538,9 @@ class UltravoxModel(nn.Module, SupportsMultiModal, SupportsPP, SupportsLoRA,
# [[B1, 80, M1], [B2, 80, M2]] -> [B1+B2, 80, max(M1, M2)]
audio_features = pad_and_concat_to_dim3(audio_input["data"])
- if isinstance(audio_input['lens'], list):
- # [B1, B2] -> [B1+B2]
- audio_lens = torch.cat(audio_input['lens'])
- audio_token_len = torch.cat(audio_input['token_len'])
- else:
- audio_lens = flatten_bn(audio_input['lens'])
- audio_token_len = flatten_bn(audio_input['token_len'])
+ # [B1, B2] -> [B1+B2]
+ audio_lens = flatten_bn(audio_input['lens'], concat=True)
+ audio_token_len = flatten_bn(audio_input['token_len'], concat=True)
embeddings = self._audio_features_to_embeddings(
audio_features, audio_lens)
@@ -554,7 +557,12 @@ class UltravoxModel(nn.Module, SupportsMultiModal, SupportsPP, SupportsLoRA,
# Apply mask and flatten
flattened_embeddings = embeddings[mask]
- return flattened_embeddings
+ # Return one tensor per input audio
+ embed_lens = [
+ token_len_item.sum().item()
+ for token_len_item in audio_input['token_len']
+ ]
+ return flattened_embeddings.split(embed_lens)
def get_multimodal_embeddings(
self, **kwargs: object) -> Optional[MultiModalEmbeddings]:
@@ -646,7 +654,7 @@ class UltravoxModel(nn.Module, SupportsMultiModal, SupportsPP, SupportsLoRA,
def pad_and_concat_to_dim3(
- features: Union[torch.Tensor, List[torch.Tensor], List[List[torch.Tensor]]]
+ features: Union[torch.Tensor, list[torch.Tensor], list[list[torch.Tensor]]]
) -> torch.Tensor:
"""
Pad and concatenate a list of tensors.
From 6eaf1e5c52d5e72a577ad03d378a28b39f0e849e Mon Sep 17 00:00:00 2001
From: Cyrus Leung
Date: Mon, 17 Mar 2025 18:00:17 +0800
Subject: [PATCH 31/34] [Misc] Add `--seed` option to offline multi-modal
examples (#14934)
Signed-off-by: DarkLight1337
---
.buildkite/test-pipeline.yaml | 7 +-
examples/offline_inference/audio_language.py | 132 +++--
.../encoder_decoder_multimodal.py | 48 +-
examples/offline_inference/vision_language.py | 455 ++++++++++++------
.../vision_language_embedding.py | 31 +-
.../vision_language_multi_image.py | 179 ++++---
6 files changed, 537 insertions(+), 315 deletions(-)
diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml
index f85572e7c234c..f5be8dca05f1d 100644
--- a/.buildkite/test-pipeline.yaml
+++ b/.buildkite/test-pipeline.yaml
@@ -226,10 +226,13 @@ steps:
- python3 offline_inference/basic/chat.py
- python3 offline_inference/prefix_caching.py
- python3 offline_inference/llm_engine_example.py
- - python3 offline_inference/vision_language.py
- - python3 offline_inference/vision_language_multi_image.py
+ - python3 offline_inference/audio_language.py --seed 0
+ - python3 offline_inference/vision_language.py --seed 0
+ - python3 offline_inference/vision_language_embedding.py --seed 0
+ - python3 offline_inference/vision_language_multi_image.py --seed 0
- VLLM_USE_V1=0 python3 other/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 other/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
- python3 offline_inference/encoder_decoder.py
+ - python3 offline_inference/encoder_decoder_multimodal.py --model-type whisper --seed 0
- python3 offline_inference/basic/classify.py
- python3 offline_inference/basic/embed.py
- python3 offline_inference/basic/score.py
diff --git a/examples/offline_inference/audio_language.py b/examples/offline_inference/audio_language.py
index 293b9fddac89e..02dbdcb64232f 100644
--- a/examples/offline_inference/audio_language.py
+++ b/examples/offline_inference/audio_language.py
@@ -7,11 +7,13 @@ For most models, the prompt format should follow corresponding examples
on HuggingFace model repository.
"""
import os
+from dataclasses import asdict
+from typing import NamedTuple, Optional
from huggingface_hub import snapshot_download
from transformers import AutoTokenizer
-from vllm import LLM, SamplingParams
+from vllm import LLM, EngineArgs, SamplingParams
from vllm.assets.audio import AudioAsset
from vllm.lora.request import LoRARequest
from vllm.utils import FlexibleArgumentParser
@@ -23,21 +25,31 @@ question_per_audio_count = {
2: "What sport and what nursery rhyme are referenced?"
}
+
+class ModelRequestData(NamedTuple):
+ engine_args: EngineArgs
+ prompt: str
+ stop_token_ids: Optional[list[int]] = None
+ lora_requests: Optional[list[LoRARequest]] = None
+
+
# NOTE: The default `max_num_seqs` and `max_model_len` may result in OOM on
# lower-end GPUs.
# Unless specified, these settings have been tested to work on a single L4.
# MiniCPM-O
-def run_minicpmo(question: str, audio_count: int):
+def run_minicpmo(question: str, audio_count: int) -> ModelRequestData:
model_name = "openbmb/MiniCPM-o-2_6"
tokenizer = AutoTokenizer.from_pretrained(model_name,
trust_remote_code=True)
- llm = LLM(model=model_name,
- trust_remote_code=True,
- max_model_len=4096,
- max_num_seqs=5,
- limit_mm_per_prompt={"audio": audio_count})
+ engine_args = EngineArgs(
+ model=model_name,
+ trust_remote_code=True,
+ max_model_len=4096,
+ max_num_seqs=5,
+ limit_mm_per_prompt={"audio": audio_count},
+ )
stop_tokens = ['<|im_end|>', '<|endoftext|>']
stop_token_ids = [tokenizer.convert_tokens_to_ids(i) for i in stop_tokens]
@@ -52,11 +64,16 @@ def run_minicpmo(question: str, audio_count: int):
tokenize=False,
add_generation_prompt=True,
chat_template=audio_chat_template)
- return llm, prompt, stop_token_ids
+
+ return ModelRequestData(
+ engine_args=engine_args,
+ prompt=prompt,
+ stop_token_ids=stop_token_ids,
+ )
# Phi-4-multimodal-instruct
-def run_phi4mm(questions: str, audio_count: int):
+def run_phi4mm(question: str, audio_count: int) -> ModelRequestData:
"""
Phi-4-multimodal-instruct supports both image and audio inputs. Here, we
show how to process audio inputs.
@@ -67,9 +84,9 @@ def run_phi4mm(questions: str, audio_count: int):
speech_lora_path = os.path.join(model_path, "speech-lora")
placeholders = "".join([f"<|audio_{i+1}|>" for i in range(audio_count)])
- prompts = f"<|user|>{placeholders}{questions}<|end|><|assistant|>"
+ prompts = f"<|user|>{placeholders}{question}<|end|><|assistant|>"
- llm = LLM(
+ engine_args = EngineArgs(
model=model_path,
trust_remote_code=True,
max_model_len=4096,
@@ -79,24 +96,24 @@ def run_phi4mm(questions: str, audio_count: int):
lora_extra_vocab_size=0,
limit_mm_per_prompt={"audio": audio_count},
)
- lora_request = LoRARequest("speech", 1, speech_lora_path)
- # To maintain code compatibility in this script, we add LoRA here.
- llm.llm_engine.add_lora(lora_request=lora_request)
- # You can also add LoRA using:
- # llm.generate(prompts, lora_request=lora_request,...)
- stop_token_ids = None
- return llm, prompts, stop_token_ids
+ return ModelRequestData(
+ engine_args=engine_args,
+ prompt=prompts,
+ lora_requests=[LoRARequest("speech", 1, speech_lora_path)],
+ )
# Qwen2-Audio
-def run_qwen2_audio(question: str, audio_count: int):
+def run_qwen2_audio(question: str, audio_count: int) -> ModelRequestData:
model_name = "Qwen/Qwen2-Audio-7B-Instruct"
- llm = LLM(model=model_name,
- max_model_len=4096,
- max_num_seqs=5,
- limit_mm_per_prompt={"audio": audio_count})
+ engine_args = EngineArgs(
+ model=model_name,
+ max_model_len=4096,
+ max_num_seqs=5,
+ limit_mm_per_prompt={"audio": audio_count},
+ )
audio_in_prompt = "".join([
f"Audio {idx+1}: "
@@ -107,12 +124,15 @@ def run_qwen2_audio(question: str, audio_count: int):
"<|im_start|>user\n"
f"{audio_in_prompt}{question}<|im_end|>\n"
"<|im_start|>assistant\n")
- stop_token_ids = None
- return llm, prompt, stop_token_ids
+
+ return ModelRequestData(
+ engine_args=engine_args,
+ prompt=prompt,
+ )
# Ultravox 0.5-1B
-def run_ultravox(question: str, audio_count: int):
+def run_ultravox(question: str, audio_count: int) -> ModelRequestData:
model_name = "fixie-ai/ultravox-v0_5-llama-3_2-1b"
tokenizer = AutoTokenizer.from_pretrained(model_name)
@@ -124,29 +144,39 @@ def run_ultravox(question: str, audio_count: int):
tokenize=False,
add_generation_prompt=True)
- llm = LLM(model=model_name,
- max_model_len=4096,
- max_num_seqs=5,
- trust_remote_code=True,
- limit_mm_per_prompt={"audio": audio_count})
- stop_token_ids = None
- return llm, prompt, stop_token_ids
+ engine_args = EngineArgs(
+ model=model_name,
+ max_model_len=4096,
+ max_num_seqs=5,
+ trust_remote_code=True,
+ limit_mm_per_prompt={"audio": audio_count},
+ )
+
+ return ModelRequestData(
+ engine_args=engine_args,
+ prompt=prompt,
+ )
# Whisper
-def run_whisper(question: str, audio_count: int):
+def run_whisper(question: str, audio_count: int) -> ModelRequestData:
assert audio_count == 1, (
"Whisper only support single audio input per prompt")
model_name = "openai/whisper-large-v3-turbo"
prompt = "<|startoftranscript|>"
- llm = LLM(model=model_name,
- max_model_len=448,
- max_num_seqs=5,
- limit_mm_per_prompt={"audio": audio_count})
- stop_token_ids = None
- return llm, prompt, stop_token_ids
+ engine_args = EngineArgs(
+ model=model_name,
+ max_model_len=448,
+ max_num_seqs=5,
+ limit_mm_per_prompt={"audio": audio_count},
+ )
+
+ return ModelRequestData(
+ engine_args=engine_args,
+ prompt=prompt,
+ )
model_example_map = {
@@ -164,14 +194,24 @@ def main(args):
raise ValueError(f"Model type {model} is not supported.")
audio_count = args.num_audios
- llm, prompt, stop_token_ids = model_example_map[model](
- question_per_audio_count[audio_count], audio_count)
+ req_data = model_example_map[model](question_per_audio_count[audio_count],
+ audio_count)
+
+ engine_args = asdict(req_data.engine_args) | {"seed": args.seed}
+ llm = LLM(**engine_args)
+
+ # To maintain code compatibility in this script, we add LoRA here.
+ # You can also add LoRA using:
+ # llm.generate(prompts, lora_request=lora_request,...)
+ if req_data.lora_requests:
+ for lora_request in req_data.lora_requests:
+ llm.llm_engine.add_lora(lora_request=lora_request)
# We set temperature to 0.2 so that outputs can be different
# even when all prompts are identical when running batch inference.
sampling_params = SamplingParams(temperature=0.2,
max_tokens=64,
- stop_token_ids=stop_token_ids)
+ stop_token_ids=req_data.stop_token_ids)
mm_data = {}
if audio_count > 0:
@@ -183,7 +223,7 @@ def main(args):
}
assert args.num_prompts > 0
- inputs = {"prompt": prompt, "multi_modal_data": mm_data}
+ inputs = {"prompt": req_data.prompt, "multi_modal_data": mm_data}
if args.num_prompts > 1:
# Batch inference
inputs = [inputs] * args.num_prompts
@@ -214,6 +254,10 @@ if __name__ == "__main__":
default=1,
choices=[0, 1, 2],
help="Number of audio items per prompt.")
+ parser.add_argument("--seed",
+ type=int,
+ default=None,
+ help="Set the seed when initializing `vllm.LLM`.")
args = parser.parse_args()
main(args)
diff --git a/examples/offline_inference/encoder_decoder_multimodal.py b/examples/offline_inference/encoder_decoder_multimodal.py
index f44bc423658ec..6d0c3ac1ee09a 100644
--- a/examples/offline_inference/encoder_decoder_multimodal.py
+++ b/examples/offline_inference/encoder_decoder_multimodal.py
@@ -4,16 +4,23 @@ This example shows how to use vLLM for running offline inference with
the explicit/implicit prompt format on enc-dec LMMs for text generation.
"""
import time
+from collections.abc import Sequence
+from dataclasses import asdict
+from typing import NamedTuple
-from vllm import LLM, SamplingParams
+from vllm import LLM, EngineArgs, PromptType, SamplingParams
from vllm.assets.audio import AudioAsset
from vllm.assets.image import ImageAsset
from vllm.utils import FlexibleArgumentParser
+class ModelRequestData(NamedTuple):
+ engine_args: EngineArgs
+ prompts: Sequence[PromptType]
+
+
def run_florence2():
- # Create a Florence-2 encoder/decoder model instance
- llm = LLM(
+ engine_args = EngineArgs(
model="microsoft/Florence-2-large",
tokenizer="facebook/bart-large",
max_num_seqs=8,
@@ -39,12 +46,15 @@ def run_florence2():
"decoder_prompt": "",
},
]
- return llm, prompts
+
+ return ModelRequestData(
+ engine_args=engine_args,
+ prompts=prompts,
+ )
def run_mllama():
- # Create a Mllama encoder/decoder model instance
- llm = LLM(
+ engine_args = EngineArgs(
model="meta-llama/Llama-3.2-11B-Vision-Instruct",
max_model_len=4096,
max_num_seqs=2,
@@ -69,12 +79,15 @@ def run_mllama():
"decoder_prompt": "<|image|><|begin_of_text|>Please describe the image.", # noqa: E501
},
]
- return llm, prompts
+
+ return ModelRequestData(
+ engine_args=engine_args,
+ prompts=prompts,
+ )
def run_whisper():
- # Create a Whisper encoder/decoder model instance
- llm = LLM(
+ engine_args = EngineArgs(
model="openai/whisper-large-v3-turbo",
max_model_len=448,
max_num_seqs=16,
@@ -99,7 +112,11 @@ def run_whisper():
"decoder_prompt": "<|startoftranscript|>",
}
]
- return llm, prompts
+
+ return ModelRequestData(
+ engine_args=engine_args,
+ prompts=prompts,
+ )
model_example_map = {
@@ -114,7 +131,12 @@ def main(args):
if model not in model_example_map:
raise ValueError(f"Model type {model} is not supported.")
- llm, prompts = model_example_map[model]()
+ req_data = model_example_map[model]()
+
+ engine_args = asdict(req_data.engine_args) | {"seed": args.seed}
+ llm = LLM(**engine_args)
+
+ prompts = req_data.prompts
# Create a sampling params object.
sampling_params = SamplingParams(
@@ -153,6 +175,10 @@ if __name__ == "__main__":
default="mllama",
choices=model_example_map.keys(),
help='Huggingface "model_type".')
+ parser.add_argument("--seed",
+ type=int,
+ default=None,
+ help="Set the seed when initializing `vllm.LLM`.")
args = parser.parse_args()
main(args)
diff --git a/examples/offline_inference/vision_language.py b/examples/offline_inference/vision_language.py
index 432cda5e24396..58fd5e53bf8dc 100644
--- a/examples/offline_inference/vision_language.py
+++ b/examples/offline_inference/vision_language.py
@@ -8,122 +8,164 @@ on HuggingFace model repository.
"""
import os
import random
+from dataclasses import asdict
+from typing import NamedTuple, Optional
from huggingface_hub import snapshot_download
from transformers import AutoTokenizer
-from vllm import LLM, SamplingParams
+from vllm import LLM, EngineArgs, SamplingParams
from vllm.assets.image import ImageAsset
from vllm.assets.video import VideoAsset
from vllm.lora.request import LoRARequest
from vllm.utils import FlexibleArgumentParser
+
+class ModelRequestData(NamedTuple):
+ engine_args: EngineArgs
+ prompts: list[str]
+ stop_token_ids: Optional[list[int]] = None
+ lora_requests: Optional[list[LoRARequest]] = None
+
+
# NOTE: The default `max_num_seqs` and `max_model_len` may result in OOM on
# lower-end GPUs.
# Unless specified, these settings have been tested to work on a single L4.
# Aria
-def run_aria(questions: list[str], modality: str):
+def run_aria(questions: list[str], modality: str) -> ModelRequestData:
assert modality == "image"
model_name = "rhymes-ai/Aria"
# NOTE: Need L40 (or equivalent) to avoid OOM
- llm = LLM(model=model_name,
- max_model_len=4096,
- max_num_seqs=2,
- dtype="bfloat16",
- disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache)
+ engine_args = EngineArgs(
+ model=model_name,
+ max_model_len=4096,
+ max_num_seqs=2,
+ dtype="bfloat16",
+ disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
+ )
prompts = [(f"<|im_start|>user\n<|img|>{question}"
"<|im_end|>\n<|im_start|>assistant\n")
for question in questions]
stop_token_ids = [93532, 93653, 944, 93421, 1019, 93653, 93519]
- return llm, prompts, stop_token_ids
+
+ return ModelRequestData(
+ engine_args=engine_args,
+ prompts=prompts,
+ stop_token_ids=stop_token_ids,
+ )
# BLIP-2
-def run_blip2(questions: list[str], modality: str):
+def run_blip2(questions: list[str], modality: str) -> ModelRequestData:
assert modality == "image"
# BLIP-2 prompt format is inaccurate on HuggingFace model repository.
# See https://huggingface.co/Salesforce/blip2-opt-2.7b/discussions/15#64ff02f3f8cf9e4f5b038262 #noqa
prompts = [f"Question: {question} Answer:" for question in questions]
- llm = LLM(model="Salesforce/blip2-opt-2.7b",
- disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache)
- stop_token_ids = None
- return llm, prompts, stop_token_ids
+ engine_args = EngineArgs(
+ model="Salesforce/blip2-opt-2.7b",
+ disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
+ )
+
+ return ModelRequestData(
+ engine_args=engine_args,
+ prompts=prompts,
+ )
# Chameleon
-def run_chameleon(questions: list[str], modality: str):
+def run_chameleon(questions: list[str], modality: str) -> ModelRequestData:
assert modality == "image"
prompts = [f"{question}" for question in questions]
- llm = LLM(model="facebook/chameleon-7b",
- max_model_len=4096,
- max_num_seqs=2,
- disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache)
- stop_token_ids = None
- return llm, prompts, stop_token_ids
+ engine_args = EngineArgs(
+ model="facebook/chameleon-7b",
+ max_model_len=4096,
+ max_num_seqs=2,
+ disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
+ )
+
+ return ModelRequestData(
+ engine_args=engine_args,
+ prompts=prompts,
+ )
# Deepseek-VL2
-def run_deepseek_vl2(questions: list[str], modality: str):
+def run_deepseek_vl2(questions: list[str], modality: str) -> ModelRequestData:
assert modality == "image"
model_name = "deepseek-ai/deepseek-vl2-tiny"
- llm = LLM(model=model_name,
- max_model_len=4096,
- max_num_seqs=2,
- disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
- hf_overrides={"architectures": ["DeepseekVLV2ForCausalLM"]})
+ engine_args = EngineArgs(
+ model=model_name,
+ max_model_len=4096,
+ max_num_seqs=2,
+ disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
+ hf_overrides={"architectures": ["DeepseekVLV2ForCausalLM"]},
+ )
prompts = [
f"<|User|>: \n{question}\n\n<|Assistant|>:"
for question in questions
]
- stop_token_ids = None
- return llm, prompts, stop_token_ids
+
+ return ModelRequestData(
+ engine_args=engine_args,
+ prompts=prompts,
+ )
# Florence2
-def run_florence2(question: str, modality: str):
+def run_florence2(questions: list[str], modality: str) -> ModelRequestData:
assert modality == "image"
- llm = LLM(model="microsoft/Florence-2-large",
- tokenizer="facebook/bart-large",
- max_num_seqs=8,
- trust_remote_code=True,
- dtype="bfloat16",
- disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache)
+ engine_args = EngineArgs(
+ model="microsoft/Florence-2-large",
+ tokenizer="facebook/bart-large",
+ max_num_seqs=8,
+ trust_remote_code=True,
+ dtype="bfloat16",
+ disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
+ )
- prompt = ""
- stop_token_ids = None
- return llm, prompt, stop_token_ids
+ prompts = ["" for _ in questions]
+
+ return ModelRequestData(
+ engine_args=engine_args,
+ prompts=prompts,
+ )
# Fuyu
-def run_fuyu(questions: list[str], modality: str):
+def run_fuyu(questions: list[str], modality: str) -> ModelRequestData:
assert modality == "image"
prompts = [f"{question}\n" for question in questions]
- llm = LLM(model="adept/fuyu-8b",
- max_model_len=2048,
- max_num_seqs=2,
- disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache)
- stop_token_ids = None
- return llm, prompts, stop_token_ids
+ engine_args = EngineArgs(
+ model="adept/fuyu-8b",
+ max_model_len=2048,
+ max_num_seqs=2,
+ disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
+ )
+
+ return ModelRequestData(
+ engine_args=engine_args,
+ prompts=prompts,
+ )
# Gemma 3
-def run_gemma3(questions: list[str], modality: str):
+def run_gemma3(questions: list[str], modality: str) -> ModelRequestData:
assert modality == "image"
model_name = "google/gemma-3-4b-it"
- llm = LLM(
+ engine_args = EngineArgs(
model=model_name,
max_model_len=2048,
max_num_seqs=2,
@@ -135,22 +177,27 @@ def run_gemma3(questions: list[str], modality: str):
prompts = [("user\n"
f"{question}\n"
"model\n") for question in questions]
- stop_token_ids = None
- return llm, prompts, stop_token_ids
+
+ return ModelRequestData(
+ engine_args=engine_args,
+ prompts=prompts,
+ )
# GLM-4v
-def run_glm4v(questions: list[str], modality: str):
+def run_glm4v(questions: list[str], modality: str) -> ModelRequestData:
assert modality == "image"
model_name = "THUDM/glm-4v-9b"
- llm = LLM(model=model_name,
- max_model_len=2048,
- max_num_seqs=2,
- trust_remote_code=True,
- enforce_eager=True,
- hf_overrides={"architectures": ["GLM4VForCausalLM"]},
- disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache)
+ engine_args = EngineArgs(
+ model=model_name,
+ max_model_len=2048,
+ max_num_seqs=2,
+ trust_remote_code=True,
+ enforce_eager=True,
+ hf_overrides={"architectures": ["GLM4VForCausalLM"]},
+ disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
+ )
prompts = [
f"<|user|>\n<|begin_of_image|><|endoftext|><|end_of_image|>\
@@ -158,16 +205,21 @@ def run_glm4v(questions: list[str], modality: str):
]
stop_token_ids = [151329, 151336, 151338]
- return llm, prompts, stop_token_ids
+
+ return ModelRequestData(
+ engine_args=engine_args,
+ prompts=prompts,
+ stop_token_ids=stop_token_ids,
+ )
# H2OVL-Mississippi
-def run_h2ovl(questions: list[str], modality: str):
+def run_h2ovl(questions: list[str], modality: str) -> ModelRequestData:
assert modality == "image"
model_name = "h2oai/h2ovl-mississippi-800m"
- llm = LLM(
+ engine_args = EngineArgs(
model=model_name,
trust_remote_code=True,
max_model_len=8192,
@@ -187,15 +239,20 @@ def run_h2ovl(questions: list[str], modality: str):
# Stop tokens for H2OVL-Mississippi
# https://huggingface.co/h2oai/h2ovl-mississippi-800m
stop_token_ids = [tokenizer.eos_token_id]
- return llm, prompts, stop_token_ids
+
+ return ModelRequestData(
+ engine_args=engine_args,
+ prompts=prompts,
+ stop_token_ids=stop_token_ids,
+ )
# Idefics3-8B-Llama3
-def run_idefics3(questions: list[str], modality: str):
+def run_idefics3(questions: list[str], modality: str) -> ModelRequestData:
assert modality == "image"
model_name = "HuggingFaceM4/Idefics3-8B-Llama3"
- llm = LLM(
+ engine_args = EngineArgs(
model=model_name,
max_model_len=8192,
max_num_seqs=2,
@@ -212,17 +269,20 @@ def run_idefics3(questions: list[str], modality: str):
prompts = [(
f"<|begin_of_text|>User:{question}\nAssistant:"
) for question in questions]
- stop_token_ids = None
- return llm, prompts, stop_token_ids
+
+ return ModelRequestData(
+ engine_args=engine_args,
+ prompts=prompts,
+ )
# InternVL
-def run_internvl(questions: list[str], modality: str):
+def run_internvl(questions: list[str], modality: str) -> ModelRequestData:
assert modality == "image"
model_name = "OpenGVLab/InternVL2-2B"
- llm = LLM(
+ engine_args = EngineArgs(
model=model_name,
trust_remote_code=True,
max_model_len=4096,
@@ -245,53 +305,75 @@ def run_internvl(questions: list[str], modality: str):
# https://huggingface.co/OpenGVLab/InternVL2-2B/blob/main/conversation.py
stop_tokens = ["<|endoftext|>", "<|im_start|>", "<|im_end|>", "<|end|>"]
stop_token_ids = [tokenizer.convert_tokens_to_ids(i) for i in stop_tokens]
- return llm, prompts, stop_token_ids
+
+ return ModelRequestData(
+ engine_args=engine_args,
+ prompts=prompts,
+ stop_token_ids=stop_token_ids,
+ )
# LLaVA-1.5
-def run_llava(questions: list[str], modality: str):
+def run_llava(questions: list[str], modality: str) -> ModelRequestData:
assert modality == "image"
prompts = [
f"USER: \n{question}\nASSISTANT:" for question in questions
]
- llm = LLM(model="llava-hf/llava-1.5-7b-hf",
- max_model_len=4096,
- disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache)
- stop_token_ids = None
- return llm, prompts, stop_token_ids
+ engine_args = EngineArgs(
+ model="llava-hf/llava-1.5-7b-hf",
+ max_model_len=4096,
+ disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
+ )
+
+ return ModelRequestData(
+ engine_args=engine_args,
+ prompts=prompts,
+ )
# LLaVA-1.6/LLaVA-NeXT
-def run_llava_next(questions: list[str], modality: str):
+def run_llava_next(questions: list[str], modality: str) -> ModelRequestData:
assert modality == "image"
prompts = [f"[INST] \n{question} [/INST]" for question in questions]
- llm = LLM(model="llava-hf/llava-v1.6-mistral-7b-hf",
- max_model_len=8192,
- disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache)
- stop_token_ids = None
- return llm, prompts, stop_token_ids
+ engine_args = EngineArgs(
+ model="llava-hf/llava-v1.6-mistral-7b-hf",
+ max_model_len=8192,
+ disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
+ )
+
+ return ModelRequestData(
+ engine_args=engine_args,
+ prompts=prompts,
+ )
# LlaVA-NeXT-Video
# Currently only support for video input
-def run_llava_next_video(questions: list[str], modality: str):
+def run_llava_next_video(questions: list[str],
+ modality: str) -> ModelRequestData:
assert modality == "video"
prompts = [
f"USER: